diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,287552 @@ +{ + "best_global_step": 80811, + "best_metric": 0.07710938155651093, + "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_hellaswag_123_1760637745/checkpoint-80811", + "epoch": 20.0, + "eval_steps": 8979, + "global_step": 179580, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005568548836173293, + "grad_norm": 3.6115634441375732, + "learning_rate": 1.1137097672346587e-08, + "loss": 0.8318, + "num_input_tokens_seen": 5568, + "step": 5 + }, + { + "epoch": 0.0011137097672346587, + "grad_norm": 4.444170951843262, + "learning_rate": 2.505846976277982e-08, + "loss": 1.1354, + "num_input_tokens_seen": 11712, + "step": 10 + }, + { + "epoch": 0.001670564650851988, + "grad_norm": 4.315086841583252, + "learning_rate": 3.897984185321305e-08, + "loss": 1.1161, + "num_input_tokens_seen": 17568, + "step": 15 + }, + { + "epoch": 0.0022274195344693173, + "grad_norm": 3.772920608520508, + "learning_rate": 5.290121394364629e-08, + "loss": 1.1547, + "num_input_tokens_seen": 23936, + "step": 20 + }, + { + "epoch": 0.0027842744180866467, + "grad_norm": 3.995605230331421, + "learning_rate": 6.682258603407952e-08, + "loss": 1.0458, + "num_input_tokens_seen": 29600, + "step": 25 + }, + { + "epoch": 0.003341129301703976, + "grad_norm": 4.391587734222412, + "learning_rate": 8.074395812451275e-08, + "loss": 1.1183, + "num_input_tokens_seen": 35904, + "step": 30 + }, + { + "epoch": 0.0038979841853213053, + "grad_norm": 4.2609639167785645, + "learning_rate": 9.466533021494599e-08, + "loss": 1.3308, + "num_input_tokens_seen": 41792, + "step": 35 + }, + { + "epoch": 0.004454839068938635, + "grad_norm": 4.759876728057861, + "learning_rate": 1.0858670230537921e-07, + "loss": 1.1197, + "num_input_tokens_seen": 48192, + "step": 40 + }, + { + "epoch": 0.005011693952555964, + "grad_norm": 4.299136161804199, + "learning_rate": 1.2250807439581244e-07, + "loss": 1.1992, + "num_input_tokens_seen": 54624, + "step": 45 + }, + { + "epoch": 0.005568548836173293, + "grad_norm": 4.586293697357178, + "learning_rate": 1.3642944648624568e-07, + "loss": 1.0876, + "num_input_tokens_seen": 61088, + "step": 50 + }, + { + "epoch": 0.006125403719790623, + "grad_norm": 4.253786087036133, + "learning_rate": 1.5035081857667893e-07, + "loss": 0.9748, + "num_input_tokens_seen": 66624, + "step": 55 + }, + { + "epoch": 0.006682258603407952, + "grad_norm": 5.053655624389648, + "learning_rate": 1.6427219066711214e-07, + "loss": 1.1689, + "num_input_tokens_seen": 72640, + "step": 60 + }, + { + "epoch": 0.007239113487025281, + "grad_norm": 4.5483078956604, + "learning_rate": 1.7819356275754539e-07, + "loss": 1.0467, + "num_input_tokens_seen": 78880, + "step": 65 + }, + { + "epoch": 0.007795968370642611, + "grad_norm": 3.8423850536346436, + "learning_rate": 1.9211493484797863e-07, + "loss": 0.9806, + "num_input_tokens_seen": 85088, + "step": 70 + }, + { + "epoch": 0.00835282325425994, + "grad_norm": 4.6526780128479, + "learning_rate": 2.0603630693841187e-07, + "loss": 0.8412, + "num_input_tokens_seen": 91392, + "step": 75 + }, + { + "epoch": 0.00890967813787727, + "grad_norm": 4.002804279327393, + "learning_rate": 2.1995767902884511e-07, + "loss": 1.1276, + "num_input_tokens_seen": 97472, + "step": 80 + }, + { + "epoch": 0.009466533021494599, + "grad_norm": 5.084099292755127, + "learning_rate": 2.3387905111927833e-07, + "loss": 1.2278, + "num_input_tokens_seen": 103840, + "step": 85 + }, + { + "epoch": 0.010023387905111928, + "grad_norm": 3.928600311279297, + "learning_rate": 2.478004232097116e-07, + "loss": 1.0851, + "num_input_tokens_seen": 109984, + "step": 90 + }, + { + "epoch": 0.010580242788729257, + "grad_norm": 4.400847434997559, + "learning_rate": 2.617217953001448e-07, + "loss": 1.0056, + "num_input_tokens_seen": 116000, + "step": 95 + }, + { + "epoch": 0.011137097672346587, + "grad_norm": 5.761290073394775, + "learning_rate": 2.7564316739057806e-07, + "loss": 0.9123, + "num_input_tokens_seen": 121664, + "step": 100 + }, + { + "epoch": 0.011693952555963916, + "grad_norm": 4.792648792266846, + "learning_rate": 2.895645394810113e-07, + "loss": 1.1836, + "num_input_tokens_seen": 128096, + "step": 105 + }, + { + "epoch": 0.012250807439581245, + "grad_norm": 5.405680179595947, + "learning_rate": 3.034859115714445e-07, + "loss": 0.9849, + "num_input_tokens_seen": 133952, + "step": 110 + }, + { + "epoch": 0.012807662323198575, + "grad_norm": 4.3053436279296875, + "learning_rate": 3.1740728366187776e-07, + "loss": 1.0337, + "num_input_tokens_seen": 140192, + "step": 115 + }, + { + "epoch": 0.013364517206815904, + "grad_norm": 4.852023124694824, + "learning_rate": 3.31328655752311e-07, + "loss": 1.0193, + "num_input_tokens_seen": 146528, + "step": 120 + }, + { + "epoch": 0.013921372090433233, + "grad_norm": 4.852121353149414, + "learning_rate": 3.452500278427442e-07, + "loss": 1.0821, + "num_input_tokens_seen": 152192, + "step": 125 + }, + { + "epoch": 0.014478226974050563, + "grad_norm": 4.5115556716918945, + "learning_rate": 3.5917139993317747e-07, + "loss": 1.0408, + "num_input_tokens_seen": 158432, + "step": 130 + }, + { + "epoch": 0.015035081857667892, + "grad_norm": 4.542613506317139, + "learning_rate": 3.730927720236107e-07, + "loss": 1.0441, + "num_input_tokens_seen": 164416, + "step": 135 + }, + { + "epoch": 0.015591936741285221, + "grad_norm": 3.848565101623535, + "learning_rate": 3.870141441140439e-07, + "loss": 1.2402, + "num_input_tokens_seen": 170624, + "step": 140 + }, + { + "epoch": 0.01614879162490255, + "grad_norm": 4.914551258087158, + "learning_rate": 4.009355162044771e-07, + "loss": 1.2169, + "num_input_tokens_seen": 176640, + "step": 145 + }, + { + "epoch": 0.01670564650851988, + "grad_norm": 4.614715099334717, + "learning_rate": 4.148568882949104e-07, + "loss": 1.0648, + "num_input_tokens_seen": 182560, + "step": 150 + }, + { + "epoch": 0.017262501392137208, + "grad_norm": 4.414526462554932, + "learning_rate": 4.2877826038534355e-07, + "loss": 1.1075, + "num_input_tokens_seen": 188832, + "step": 155 + }, + { + "epoch": 0.01781935627575454, + "grad_norm": 4.275631427764893, + "learning_rate": 4.4269963247577687e-07, + "loss": 1.0564, + "num_input_tokens_seen": 195104, + "step": 160 + }, + { + "epoch": 0.018376211159371866, + "grad_norm": 4.5389509201049805, + "learning_rate": 4.5662100456621004e-07, + "loss": 1.1325, + "num_input_tokens_seen": 201056, + "step": 165 + }, + { + "epoch": 0.018933066042989197, + "grad_norm": 4.367937088012695, + "learning_rate": 4.705423766566433e-07, + "loss": 0.9384, + "num_input_tokens_seen": 207168, + "step": 170 + }, + { + "epoch": 0.019489920926606525, + "grad_norm": 4.472005844116211, + "learning_rate": 4.844637487470765e-07, + "loss": 0.9964, + "num_input_tokens_seen": 213408, + "step": 175 + }, + { + "epoch": 0.020046775810223856, + "grad_norm": 4.5133819580078125, + "learning_rate": 4.983851208375097e-07, + "loss": 1.0757, + "num_input_tokens_seen": 219584, + "step": 180 + }, + { + "epoch": 0.020603630693841184, + "grad_norm": 5.588822364807129, + "learning_rate": 5.12306492927943e-07, + "loss": 1.3252, + "num_input_tokens_seen": 225600, + "step": 185 + }, + { + "epoch": 0.021160485577458515, + "grad_norm": 5.160659313201904, + "learning_rate": 5.262278650183763e-07, + "loss": 1.1059, + "num_input_tokens_seen": 232096, + "step": 190 + }, + { + "epoch": 0.021717340461075842, + "grad_norm": 4.579878807067871, + "learning_rate": 5.401492371088094e-07, + "loss": 1.071, + "num_input_tokens_seen": 238432, + "step": 195 + }, + { + "epoch": 0.022274195344693173, + "grad_norm": 4.256295204162598, + "learning_rate": 5.540706091992427e-07, + "loss": 1.1545, + "num_input_tokens_seen": 244640, + "step": 200 + }, + { + "epoch": 0.0228310502283105, + "grad_norm": 4.304007530212402, + "learning_rate": 5.679919812896759e-07, + "loss": 0.9725, + "num_input_tokens_seen": 250528, + "step": 205 + }, + { + "epoch": 0.023387905111927832, + "grad_norm": 4.957269191741943, + "learning_rate": 5.819133533801091e-07, + "loss": 1.092, + "num_input_tokens_seen": 256960, + "step": 210 + }, + { + "epoch": 0.02394475999554516, + "grad_norm": 5.166626453399658, + "learning_rate": 5.958347254705424e-07, + "loss": 1.2164, + "num_input_tokens_seen": 263200, + "step": 215 + }, + { + "epoch": 0.02450161487916249, + "grad_norm": 4.297743797302246, + "learning_rate": 6.097560975609757e-07, + "loss": 0.9382, + "num_input_tokens_seen": 269440, + "step": 220 + }, + { + "epoch": 0.02505846976277982, + "grad_norm": 5.584685802459717, + "learning_rate": 6.236774696514088e-07, + "loss": 1.2335, + "num_input_tokens_seen": 275424, + "step": 225 + }, + { + "epoch": 0.02561532464639715, + "grad_norm": 4.982387065887451, + "learning_rate": 6.375988417418421e-07, + "loss": 0.954, + "num_input_tokens_seen": 281888, + "step": 230 + }, + { + "epoch": 0.026172179530014477, + "grad_norm": 4.392430782318115, + "learning_rate": 6.515202138322753e-07, + "loss": 1.1002, + "num_input_tokens_seen": 287296, + "step": 235 + }, + { + "epoch": 0.026729034413631808, + "grad_norm": 4.071572780609131, + "learning_rate": 6.654415859227086e-07, + "loss": 1.0977, + "num_input_tokens_seen": 293600, + "step": 240 + }, + { + "epoch": 0.027285889297249136, + "grad_norm": 4.074090480804443, + "learning_rate": 6.793629580131418e-07, + "loss": 0.9896, + "num_input_tokens_seen": 299776, + "step": 245 + }, + { + "epoch": 0.027842744180866467, + "grad_norm": 4.956945896148682, + "learning_rate": 6.932843301035751e-07, + "loss": 1.1586, + "num_input_tokens_seen": 306080, + "step": 250 + }, + { + "epoch": 0.028399599064483794, + "grad_norm": 4.2536163330078125, + "learning_rate": 7.072057021940083e-07, + "loss": 1.0317, + "num_input_tokens_seen": 311968, + "step": 255 + }, + { + "epoch": 0.028956453948101125, + "grad_norm": 4.234081745147705, + "learning_rate": 7.211270742844415e-07, + "loss": 0.9848, + "num_input_tokens_seen": 318368, + "step": 260 + }, + { + "epoch": 0.029513308831718453, + "grad_norm": 4.710058212280273, + "learning_rate": 7.350484463748747e-07, + "loss": 0.9667, + "num_input_tokens_seen": 324672, + "step": 265 + }, + { + "epoch": 0.030070163715335784, + "grad_norm": 4.363619327545166, + "learning_rate": 7.48969818465308e-07, + "loss": 1.0205, + "num_input_tokens_seen": 330336, + "step": 270 + }, + { + "epoch": 0.03062701859895311, + "grad_norm": 4.742355823516846, + "learning_rate": 7.628911905557412e-07, + "loss": 1.3045, + "num_input_tokens_seen": 336096, + "step": 275 + }, + { + "epoch": 0.031183873482570443, + "grad_norm": 3.9827377796173096, + "learning_rate": 7.768125626461745e-07, + "loss": 1.1382, + "num_input_tokens_seen": 342528, + "step": 280 + }, + { + "epoch": 0.03174072836618777, + "grad_norm": 4.361292362213135, + "learning_rate": 7.907339347366077e-07, + "loss": 1.0932, + "num_input_tokens_seen": 348704, + "step": 285 + }, + { + "epoch": 0.0322975832498051, + "grad_norm": 4.780867576599121, + "learning_rate": 8.046553068270408e-07, + "loss": 1.2194, + "num_input_tokens_seen": 354720, + "step": 290 + }, + { + "epoch": 0.03285443813342243, + "grad_norm": 5.671310901641846, + "learning_rate": 8.185766789174742e-07, + "loss": 1.1577, + "num_input_tokens_seen": 361152, + "step": 295 + }, + { + "epoch": 0.03341129301703976, + "grad_norm": 4.9904584884643555, + "learning_rate": 8.324980510079074e-07, + "loss": 0.9648, + "num_input_tokens_seen": 367104, + "step": 300 + }, + { + "epoch": 0.03396814790065709, + "grad_norm": 5.011760711669922, + "learning_rate": 8.464194230983406e-07, + "loss": 1.2782, + "num_input_tokens_seen": 373440, + "step": 305 + }, + { + "epoch": 0.034525002784274415, + "grad_norm": 4.254681587219238, + "learning_rate": 8.603407951887738e-07, + "loss": 0.9992, + "num_input_tokens_seen": 379584, + "step": 310 + }, + { + "epoch": 0.03508185766789175, + "grad_norm": 4.119410991668701, + "learning_rate": 8.742621672792072e-07, + "loss": 1.1739, + "num_input_tokens_seen": 385952, + "step": 315 + }, + { + "epoch": 0.03563871255150908, + "grad_norm": 4.164834499359131, + "learning_rate": 8.881835393696403e-07, + "loss": 1.3828, + "num_input_tokens_seen": 392032, + "step": 320 + }, + { + "epoch": 0.036195567435126405, + "grad_norm": 4.019003868103027, + "learning_rate": 9.021049114600735e-07, + "loss": 0.9624, + "num_input_tokens_seen": 398304, + "step": 325 + }, + { + "epoch": 0.03675242231874373, + "grad_norm": 4.281070232391357, + "learning_rate": 9.160262835505068e-07, + "loss": 0.9696, + "num_input_tokens_seen": 404128, + "step": 330 + }, + { + "epoch": 0.03730927720236107, + "grad_norm": 4.3383636474609375, + "learning_rate": 9.2994765564094e-07, + "loss": 0.9918, + "num_input_tokens_seen": 410240, + "step": 335 + }, + { + "epoch": 0.037866132085978395, + "grad_norm": 4.674637317657471, + "learning_rate": 9.438690277313733e-07, + "loss": 0.8052, + "num_input_tokens_seen": 416480, + "step": 340 + }, + { + "epoch": 0.03842298696959572, + "grad_norm": 4.327814102172852, + "learning_rate": 9.577903998218064e-07, + "loss": 1.2185, + "num_input_tokens_seen": 422528, + "step": 345 + }, + { + "epoch": 0.03897984185321305, + "grad_norm": 3.938828468322754, + "learning_rate": 9.717117719122396e-07, + "loss": 1.1212, + "num_input_tokens_seen": 428288, + "step": 350 + }, + { + "epoch": 0.039536696736830385, + "grad_norm": 4.165746688842773, + "learning_rate": 9.856331440026731e-07, + "loss": 1.1558, + "num_input_tokens_seen": 434688, + "step": 355 + }, + { + "epoch": 0.04009355162044771, + "grad_norm": 4.705662250518799, + "learning_rate": 9.995545160931062e-07, + "loss": 1.1384, + "num_input_tokens_seen": 440672, + "step": 360 + }, + { + "epoch": 0.04065040650406504, + "grad_norm": 5.757937431335449, + "learning_rate": 1.0134758881835394e-06, + "loss": 0.9096, + "num_input_tokens_seen": 446816, + "step": 365 + }, + { + "epoch": 0.04120726138768237, + "grad_norm": 4.013961315155029, + "learning_rate": 1.0273972602739725e-06, + "loss": 1.0531, + "num_input_tokens_seen": 452896, + "step": 370 + }, + { + "epoch": 0.0417641162712997, + "grad_norm": 4.949216842651367, + "learning_rate": 1.041318632364406e-06, + "loss": 1.0353, + "num_input_tokens_seen": 459104, + "step": 375 + }, + { + "epoch": 0.04232097115491703, + "grad_norm": 3.9257307052612305, + "learning_rate": 1.0552400044548393e-06, + "loss": 1.2786, + "num_input_tokens_seen": 465376, + "step": 380 + }, + { + "epoch": 0.04287782603853436, + "grad_norm": 4.810807704925537, + "learning_rate": 1.0691613765452723e-06, + "loss": 0.9737, + "num_input_tokens_seen": 471456, + "step": 385 + }, + { + "epoch": 0.043434680922151685, + "grad_norm": 4.342674732208252, + "learning_rate": 1.0830827486357056e-06, + "loss": 1.0758, + "num_input_tokens_seen": 477600, + "step": 390 + }, + { + "epoch": 0.04399153580576902, + "grad_norm": 4.9923858642578125, + "learning_rate": 1.0970041207261389e-06, + "loss": 1.1186, + "num_input_tokens_seen": 483488, + "step": 395 + }, + { + "epoch": 0.04454839068938635, + "grad_norm": 4.112658977508545, + "learning_rate": 1.1109254928165721e-06, + "loss": 1.0204, + "num_input_tokens_seen": 489632, + "step": 400 + }, + { + "epoch": 0.045105245573003674, + "grad_norm": 4.891449451446533, + "learning_rate": 1.1248468649070052e-06, + "loss": 1.1879, + "num_input_tokens_seen": 495744, + "step": 405 + }, + { + "epoch": 0.045662100456621, + "grad_norm": 4.248114585876465, + "learning_rate": 1.1387682369974384e-06, + "loss": 1.1631, + "num_input_tokens_seen": 501984, + "step": 410 + }, + { + "epoch": 0.04621895534023834, + "grad_norm": 4.343608856201172, + "learning_rate": 1.152689609087872e-06, + "loss": 1.1732, + "num_input_tokens_seen": 508128, + "step": 415 + }, + { + "epoch": 0.046775810223855664, + "grad_norm": 4.414834499359131, + "learning_rate": 1.166610981178305e-06, + "loss": 1.0212, + "num_input_tokens_seen": 514144, + "step": 420 + }, + { + "epoch": 0.04733266510747299, + "grad_norm": 4.298830509185791, + "learning_rate": 1.1805323532687383e-06, + "loss": 1.0914, + "num_input_tokens_seen": 520320, + "step": 425 + }, + { + "epoch": 0.04788951999109032, + "grad_norm": 4.330338001251221, + "learning_rate": 1.1944537253591713e-06, + "loss": 1.092, + "num_input_tokens_seen": 526592, + "step": 430 + }, + { + "epoch": 0.048446374874707654, + "grad_norm": 5.037263870239258, + "learning_rate": 1.2083750974496048e-06, + "loss": 1.188, + "num_input_tokens_seen": 533152, + "step": 435 + }, + { + "epoch": 0.04900322975832498, + "grad_norm": 3.957827568054199, + "learning_rate": 1.2222964695400379e-06, + "loss": 1.0808, + "num_input_tokens_seen": 539200, + "step": 440 + }, + { + "epoch": 0.04956008464194231, + "grad_norm": 4.136734485626221, + "learning_rate": 1.2362178416304711e-06, + "loss": 0.9234, + "num_input_tokens_seen": 545248, + "step": 445 + }, + { + "epoch": 0.05011693952555964, + "grad_norm": 6.534049034118652, + "learning_rate": 1.2501392137209044e-06, + "loss": 1.1584, + "num_input_tokens_seen": 550144, + "step": 450 + }, + { + "epoch": 0.05067379440917697, + "grad_norm": 4.719611644744873, + "learning_rate": 1.2640605858113377e-06, + "loss": 1.1143, + "num_input_tokens_seen": 556512, + "step": 455 + }, + { + "epoch": 0.0512306492927943, + "grad_norm": 5.160732269287109, + "learning_rate": 1.277981957901771e-06, + "loss": 1.2427, + "num_input_tokens_seen": 562528, + "step": 460 + }, + { + "epoch": 0.051787504176411626, + "grad_norm": 4.602361679077148, + "learning_rate": 1.291903329992204e-06, + "loss": 1.1332, + "num_input_tokens_seen": 569152, + "step": 465 + }, + { + "epoch": 0.052344359060028954, + "grad_norm": 4.538396835327148, + "learning_rate": 1.3058247020826373e-06, + "loss": 1.1753, + "num_input_tokens_seen": 575168, + "step": 470 + }, + { + "epoch": 0.05290121394364629, + "grad_norm": 4.073441028594971, + "learning_rate": 1.3197460741730707e-06, + "loss": 1.085, + "num_input_tokens_seen": 581312, + "step": 475 + }, + { + "epoch": 0.053458068827263616, + "grad_norm": 5.2257304191589355, + "learning_rate": 1.3336674462635038e-06, + "loss": 1.0464, + "num_input_tokens_seen": 587616, + "step": 480 + }, + { + "epoch": 0.054014923710880944, + "grad_norm": 4.098769664764404, + "learning_rate": 1.347588818353937e-06, + "loss": 0.9421, + "num_input_tokens_seen": 593632, + "step": 485 + }, + { + "epoch": 0.05457177859449827, + "grad_norm": 4.49718713760376, + "learning_rate": 1.3615101904443701e-06, + "loss": 0.9928, + "num_input_tokens_seen": 599360, + "step": 490 + }, + { + "epoch": 0.055128633478115606, + "grad_norm": 4.5978827476501465, + "learning_rate": 1.3754315625348036e-06, + "loss": 1.0638, + "num_input_tokens_seen": 605504, + "step": 495 + }, + { + "epoch": 0.055685488361732934, + "grad_norm": 4.249790191650391, + "learning_rate": 1.3893529346252367e-06, + "loss": 1.1672, + "num_input_tokens_seen": 611648, + "step": 500 + }, + { + "epoch": 0.05624234324535026, + "grad_norm": 4.296840667724609, + "learning_rate": 1.40327430671567e-06, + "loss": 1.2417, + "num_input_tokens_seen": 617856, + "step": 505 + }, + { + "epoch": 0.05679919812896759, + "grad_norm": 4.107887268066406, + "learning_rate": 1.4171956788061032e-06, + "loss": 1.0426, + "num_input_tokens_seen": 623968, + "step": 510 + }, + { + "epoch": 0.05735605301258492, + "grad_norm": 4.679174423217773, + "learning_rate": 1.4311170508965365e-06, + "loss": 1.3244, + "num_input_tokens_seen": 629888, + "step": 515 + }, + { + "epoch": 0.05791290789620225, + "grad_norm": 5.088805675506592, + "learning_rate": 1.4450384229869697e-06, + "loss": 0.9795, + "num_input_tokens_seen": 635808, + "step": 520 + }, + { + "epoch": 0.05846976277981958, + "grad_norm": 4.328815460205078, + "learning_rate": 1.4589597950774028e-06, + "loss": 1.2212, + "num_input_tokens_seen": 641376, + "step": 525 + }, + { + "epoch": 0.059026617663436906, + "grad_norm": 4.519065856933594, + "learning_rate": 1.472881167167836e-06, + "loss": 1.1411, + "num_input_tokens_seen": 647392, + "step": 530 + }, + { + "epoch": 0.05958347254705424, + "grad_norm": 4.766757488250732, + "learning_rate": 1.4868025392582693e-06, + "loss": 1.2156, + "num_input_tokens_seen": 653600, + "step": 535 + }, + { + "epoch": 0.06014032743067157, + "grad_norm": 4.455078601837158, + "learning_rate": 1.5007239113487026e-06, + "loss": 1.2132, + "num_input_tokens_seen": 660128, + "step": 540 + }, + { + "epoch": 0.060697182314288896, + "grad_norm": 4.273447036743164, + "learning_rate": 1.5146452834391359e-06, + "loss": 1.1318, + "num_input_tokens_seen": 665888, + "step": 545 + }, + { + "epoch": 0.06125403719790622, + "grad_norm": 4.1126017570495605, + "learning_rate": 1.528566655529569e-06, + "loss": 1.0766, + "num_input_tokens_seen": 672128, + "step": 550 + }, + { + "epoch": 0.06181089208152356, + "grad_norm": 4.3490071296691895, + "learning_rate": 1.5424880276200024e-06, + "loss": 1.3743, + "num_input_tokens_seen": 677696, + "step": 555 + }, + { + "epoch": 0.062367746965140886, + "grad_norm": 4.5155439376831055, + "learning_rate": 1.5564093997104355e-06, + "loss": 1.1487, + "num_input_tokens_seen": 684064, + "step": 560 + }, + { + "epoch": 0.06292460184875821, + "grad_norm": 4.1850152015686035, + "learning_rate": 1.5703307718008687e-06, + "loss": 1.0237, + "num_input_tokens_seen": 690272, + "step": 565 + }, + { + "epoch": 0.06348145673237554, + "grad_norm": 4.272155284881592, + "learning_rate": 1.584252143891302e-06, + "loss": 1.0703, + "num_input_tokens_seen": 695968, + "step": 570 + }, + { + "epoch": 0.06403831161599287, + "grad_norm": 6.5319719314575195, + "learning_rate": 1.598173515981735e-06, + "loss": 1.239, + "num_input_tokens_seen": 702304, + "step": 575 + }, + { + "epoch": 0.0645951664996102, + "grad_norm": 4.173793315887451, + "learning_rate": 1.6120948880721683e-06, + "loss": 0.9902, + "num_input_tokens_seen": 708512, + "step": 580 + }, + { + "epoch": 0.06515202138322754, + "grad_norm": 5.0915303230285645, + "learning_rate": 1.6260162601626018e-06, + "loss": 1.2604, + "num_input_tokens_seen": 714912, + "step": 585 + }, + { + "epoch": 0.06570887626684487, + "grad_norm": 6.899030685424805, + "learning_rate": 1.639937632253035e-06, + "loss": 1.3001, + "num_input_tokens_seen": 721024, + "step": 590 + }, + { + "epoch": 0.06626573115046219, + "grad_norm": 4.198339462280273, + "learning_rate": 1.6538590043434682e-06, + "loss": 1.1155, + "num_input_tokens_seen": 727168, + "step": 595 + }, + { + "epoch": 0.06682258603407952, + "grad_norm": 4.550741195678711, + "learning_rate": 1.6677803764339014e-06, + "loss": 1.2763, + "num_input_tokens_seen": 733152, + "step": 600 + }, + { + "epoch": 0.06737944091769685, + "grad_norm": 4.947117328643799, + "learning_rate": 1.6817017485243347e-06, + "loss": 1.0431, + "num_input_tokens_seen": 739328, + "step": 605 + }, + { + "epoch": 0.06793629580131418, + "grad_norm": 4.56787109375, + "learning_rate": 1.6956231206147678e-06, + "loss": 1.0579, + "num_input_tokens_seen": 745568, + "step": 610 + }, + { + "epoch": 0.0684931506849315, + "grad_norm": 5.06978702545166, + "learning_rate": 1.709544492705201e-06, + "loss": 1.0363, + "num_input_tokens_seen": 751872, + "step": 615 + }, + { + "epoch": 0.06905000556854883, + "grad_norm": 4.907290935516357, + "learning_rate": 1.7234658647956343e-06, + "loss": 1.2397, + "num_input_tokens_seen": 757824, + "step": 620 + }, + { + "epoch": 0.06960686045216617, + "grad_norm": 5.858518600463867, + "learning_rate": 1.7373872368860673e-06, + "loss": 1.0931, + "num_input_tokens_seen": 764320, + "step": 625 + }, + { + "epoch": 0.0701637153357835, + "grad_norm": 5.2378363609313965, + "learning_rate": 1.751308608976501e-06, + "loss": 1.2674, + "num_input_tokens_seen": 770592, + "step": 630 + }, + { + "epoch": 0.07072057021940083, + "grad_norm": 4.37540864944458, + "learning_rate": 1.765229981066934e-06, + "loss": 1.0855, + "num_input_tokens_seen": 775968, + "step": 635 + }, + { + "epoch": 0.07127742510301815, + "grad_norm": 4.6276936531066895, + "learning_rate": 1.7791513531573674e-06, + "loss": 1.1887, + "num_input_tokens_seen": 782144, + "step": 640 + }, + { + "epoch": 0.07183427998663548, + "grad_norm": 3.9098360538482666, + "learning_rate": 1.7930727252478004e-06, + "loss": 1.089, + "num_input_tokens_seen": 788352, + "step": 645 + }, + { + "epoch": 0.07239113487025281, + "grad_norm": 3.88321852684021, + "learning_rate": 1.8069940973382337e-06, + "loss": 1.1123, + "num_input_tokens_seen": 794464, + "step": 650 + }, + { + "epoch": 0.07294798975387014, + "grad_norm": 5.539846420288086, + "learning_rate": 1.820915469428667e-06, + "loss": 1.363, + "num_input_tokens_seen": 800608, + "step": 655 + }, + { + "epoch": 0.07350484463748747, + "grad_norm": 4.600950241088867, + "learning_rate": 1.8348368415191e-06, + "loss": 1.1472, + "num_input_tokens_seen": 806400, + "step": 660 + }, + { + "epoch": 0.0740616995211048, + "grad_norm": 4.622992038726807, + "learning_rate": 1.8487582136095333e-06, + "loss": 1.1022, + "num_input_tokens_seen": 812448, + "step": 665 + }, + { + "epoch": 0.07461855440472213, + "grad_norm": 4.707732677459717, + "learning_rate": 1.8626795856999668e-06, + "loss": 1.2521, + "num_input_tokens_seen": 817984, + "step": 670 + }, + { + "epoch": 0.07517540928833946, + "grad_norm": 4.845822334289551, + "learning_rate": 1.8766009577904e-06, + "loss": 1.0763, + "num_input_tokens_seen": 823776, + "step": 675 + }, + { + "epoch": 0.07573226417195679, + "grad_norm": 3.948136568069458, + "learning_rate": 1.890522329880833e-06, + "loss": 1.0381, + "num_input_tokens_seen": 830080, + "step": 680 + }, + { + "epoch": 0.07628911905557412, + "grad_norm": 4.6855316162109375, + "learning_rate": 1.9044437019712664e-06, + "loss": 0.9918, + "num_input_tokens_seen": 836288, + "step": 685 + }, + { + "epoch": 0.07684597393919144, + "grad_norm": 4.8415021896362305, + "learning_rate": 1.9183650740616994e-06, + "loss": 1.2199, + "num_input_tokens_seen": 842496, + "step": 690 + }, + { + "epoch": 0.07740282882280877, + "grad_norm": 4.376083850860596, + "learning_rate": 1.932286446152133e-06, + "loss": 0.9891, + "num_input_tokens_seen": 848256, + "step": 695 + }, + { + "epoch": 0.0779596837064261, + "grad_norm": 4.49850606918335, + "learning_rate": 1.946207818242566e-06, + "loss": 0.98, + "num_input_tokens_seen": 854048, + "step": 700 + }, + { + "epoch": 0.07851653859004344, + "grad_norm": 4.168995380401611, + "learning_rate": 1.960129190332999e-06, + "loss": 1.0428, + "num_input_tokens_seen": 860064, + "step": 705 + }, + { + "epoch": 0.07907339347366077, + "grad_norm": 4.053037166595459, + "learning_rate": 1.9740505624234325e-06, + "loss": 1.1893, + "num_input_tokens_seen": 866208, + "step": 710 + }, + { + "epoch": 0.0796302483572781, + "grad_norm": 4.586674213409424, + "learning_rate": 1.987971934513866e-06, + "loss": 1.3148, + "num_input_tokens_seen": 872416, + "step": 715 + }, + { + "epoch": 0.08018710324089542, + "grad_norm": 3.962557554244995, + "learning_rate": 2.001893306604299e-06, + "loss": 0.9405, + "num_input_tokens_seen": 878624, + "step": 720 + }, + { + "epoch": 0.08074395812451275, + "grad_norm": 4.634491920471191, + "learning_rate": 2.015814678694732e-06, + "loss": 1.244, + "num_input_tokens_seen": 884832, + "step": 725 + }, + { + "epoch": 0.08130081300813008, + "grad_norm": 4.7655792236328125, + "learning_rate": 2.0297360507851656e-06, + "loss": 1.161, + "num_input_tokens_seen": 891008, + "step": 730 + }, + { + "epoch": 0.08185766789174741, + "grad_norm": 4.93132209777832, + "learning_rate": 2.0436574228755986e-06, + "loss": 1.222, + "num_input_tokens_seen": 897056, + "step": 735 + }, + { + "epoch": 0.08241452277536473, + "grad_norm": 5.130275249481201, + "learning_rate": 2.0575787949660317e-06, + "loss": 1.3422, + "num_input_tokens_seen": 902464, + "step": 740 + }, + { + "epoch": 0.08297137765898208, + "grad_norm": 4.85706090927124, + "learning_rate": 2.071500167056465e-06, + "loss": 1.1227, + "num_input_tokens_seen": 908896, + "step": 745 + }, + { + "epoch": 0.0835282325425994, + "grad_norm": 4.178387641906738, + "learning_rate": 2.0854215391468987e-06, + "loss": 1.292, + "num_input_tokens_seen": 914944, + "step": 750 + }, + { + "epoch": 0.08408508742621673, + "grad_norm": 4.692017078399658, + "learning_rate": 2.0993429112373317e-06, + "loss": 1.3656, + "num_input_tokens_seen": 921120, + "step": 755 + }, + { + "epoch": 0.08464194230983406, + "grad_norm": 4.192712306976318, + "learning_rate": 2.1132642833277648e-06, + "loss": 1.0275, + "num_input_tokens_seen": 927168, + "step": 760 + }, + { + "epoch": 0.08519879719345139, + "grad_norm": 4.400692939758301, + "learning_rate": 2.1271856554181983e-06, + "loss": 1.0787, + "num_input_tokens_seen": 933216, + "step": 765 + }, + { + "epoch": 0.08575565207706871, + "grad_norm": 4.479741096496582, + "learning_rate": 2.1411070275086313e-06, + "loss": 1.1377, + "num_input_tokens_seen": 939136, + "step": 770 + }, + { + "epoch": 0.08631250696068604, + "grad_norm": 4.36796236038208, + "learning_rate": 2.1550283995990644e-06, + "loss": 0.8717, + "num_input_tokens_seen": 945152, + "step": 775 + }, + { + "epoch": 0.08686936184430337, + "grad_norm": 4.3757123947143555, + "learning_rate": 2.168949771689498e-06, + "loss": 1.0807, + "num_input_tokens_seen": 951072, + "step": 780 + }, + { + "epoch": 0.0874262167279207, + "grad_norm": 4.360332012176514, + "learning_rate": 2.182871143779931e-06, + "loss": 1.1092, + "num_input_tokens_seen": 956928, + "step": 785 + }, + { + "epoch": 0.08798307161153804, + "grad_norm": 4.28585147857666, + "learning_rate": 2.1967925158703644e-06, + "loss": 1.2791, + "num_input_tokens_seen": 963360, + "step": 790 + }, + { + "epoch": 0.08853992649515537, + "grad_norm": 4.332316875457764, + "learning_rate": 2.2107138879607975e-06, + "loss": 1.0768, + "num_input_tokens_seen": 969664, + "step": 795 + }, + { + "epoch": 0.0890967813787727, + "grad_norm": 6.795017242431641, + "learning_rate": 2.224635260051231e-06, + "loss": 1.409, + "num_input_tokens_seen": 976064, + "step": 800 + }, + { + "epoch": 0.08965363626239002, + "grad_norm": 4.949966907501221, + "learning_rate": 2.238556632141664e-06, + "loss": 1.0975, + "num_input_tokens_seen": 982336, + "step": 805 + }, + { + "epoch": 0.09021049114600735, + "grad_norm": 5.150527477264404, + "learning_rate": 2.252478004232097e-06, + "loss": 1.2597, + "num_input_tokens_seen": 987680, + "step": 810 + }, + { + "epoch": 0.09076734602962468, + "grad_norm": 4.747499942779541, + "learning_rate": 2.2663993763225305e-06, + "loss": 1.2637, + "num_input_tokens_seen": 994144, + "step": 815 + }, + { + "epoch": 0.091324200913242, + "grad_norm": 4.087789535522461, + "learning_rate": 2.2803207484129636e-06, + "loss": 1.1318, + "num_input_tokens_seen": 1000192, + "step": 820 + }, + { + "epoch": 0.09188105579685933, + "grad_norm": 5.314230442047119, + "learning_rate": 2.2942421205033967e-06, + "loss": 1.2271, + "num_input_tokens_seen": 1006304, + "step": 825 + }, + { + "epoch": 0.09243791068047667, + "grad_norm": 4.578556537628174, + "learning_rate": 2.30816349259383e-06, + "loss": 1.2139, + "num_input_tokens_seen": 1012512, + "step": 830 + }, + { + "epoch": 0.092994765564094, + "grad_norm": 4.862993240356445, + "learning_rate": 2.3220848646842636e-06, + "loss": 1.056, + "num_input_tokens_seen": 1018720, + "step": 835 + }, + { + "epoch": 0.09355162044771133, + "grad_norm": 3.9062399864196777, + "learning_rate": 2.3360062367746967e-06, + "loss": 1.1158, + "num_input_tokens_seen": 1024256, + "step": 840 + }, + { + "epoch": 0.09410847533132866, + "grad_norm": 4.596052646636963, + "learning_rate": 2.3499276088651297e-06, + "loss": 1.0849, + "num_input_tokens_seen": 1030464, + "step": 845 + }, + { + "epoch": 0.09466533021494598, + "grad_norm": 4.829099178314209, + "learning_rate": 2.3638489809555632e-06, + "loss": 1.1233, + "num_input_tokens_seen": 1036320, + "step": 850 + }, + { + "epoch": 0.09522218509856331, + "grad_norm": 4.770541191101074, + "learning_rate": 2.3777703530459963e-06, + "loss": 0.9331, + "num_input_tokens_seen": 1042560, + "step": 855 + }, + { + "epoch": 0.09577903998218064, + "grad_norm": 4.684576511383057, + "learning_rate": 2.3916917251364293e-06, + "loss": 1.3178, + "num_input_tokens_seen": 1048832, + "step": 860 + }, + { + "epoch": 0.09633589486579797, + "grad_norm": 4.413801193237305, + "learning_rate": 2.405613097226863e-06, + "loss": 1.3257, + "num_input_tokens_seen": 1054944, + "step": 865 + }, + { + "epoch": 0.09689274974941531, + "grad_norm": 4.4523234367370605, + "learning_rate": 2.4195344693172963e-06, + "loss": 1.0747, + "num_input_tokens_seen": 1061056, + "step": 870 + }, + { + "epoch": 0.09744960463303264, + "grad_norm": 4.032668590545654, + "learning_rate": 2.4334558414077293e-06, + "loss": 0.8992, + "num_input_tokens_seen": 1066816, + "step": 875 + }, + { + "epoch": 0.09800645951664996, + "grad_norm": 3.873157501220703, + "learning_rate": 2.4473772134981624e-06, + "loss": 0.9768, + "num_input_tokens_seen": 1072736, + "step": 880 + }, + { + "epoch": 0.09856331440026729, + "grad_norm": 4.384052276611328, + "learning_rate": 2.461298585588596e-06, + "loss": 1.1802, + "num_input_tokens_seen": 1079072, + "step": 885 + }, + { + "epoch": 0.09912016928388462, + "grad_norm": 4.38204288482666, + "learning_rate": 2.475219957679029e-06, + "loss": 1.3935, + "num_input_tokens_seen": 1085280, + "step": 890 + }, + { + "epoch": 0.09967702416750195, + "grad_norm": 5.532227993011475, + "learning_rate": 2.489141329769462e-06, + "loss": 1.0257, + "num_input_tokens_seen": 1091488, + "step": 895 + }, + { + "epoch": 0.10023387905111927, + "grad_norm": 5.414539337158203, + "learning_rate": 2.5030627018598955e-06, + "loss": 1.3654, + "num_input_tokens_seen": 1096992, + "step": 900 + }, + { + "epoch": 0.1007907339347366, + "grad_norm": 4.088347434997559, + "learning_rate": 2.5169840739503285e-06, + "loss": 1.1658, + "num_input_tokens_seen": 1103168, + "step": 905 + }, + { + "epoch": 0.10134758881835394, + "grad_norm": 4.8382182121276855, + "learning_rate": 2.530905446040762e-06, + "loss": 1.2801, + "num_input_tokens_seen": 1109344, + "step": 910 + }, + { + "epoch": 0.10190444370197127, + "grad_norm": 4.704990386962891, + "learning_rate": 2.544826818131195e-06, + "loss": 1.1251, + "num_input_tokens_seen": 1115680, + "step": 915 + }, + { + "epoch": 0.1024612985855886, + "grad_norm": 4.577752590179443, + "learning_rate": 2.5587481902216286e-06, + "loss": 1.0953, + "num_input_tokens_seen": 1121760, + "step": 920 + }, + { + "epoch": 0.10301815346920593, + "grad_norm": 4.914770603179932, + "learning_rate": 2.5726695623120616e-06, + "loss": 1.1253, + "num_input_tokens_seen": 1127936, + "step": 925 + }, + { + "epoch": 0.10357500835282325, + "grad_norm": 4.3093976974487305, + "learning_rate": 2.5865909344024947e-06, + "loss": 1.0166, + "num_input_tokens_seen": 1134272, + "step": 930 + }, + { + "epoch": 0.10413186323644058, + "grad_norm": 6.129045009613037, + "learning_rate": 2.600512306492928e-06, + "loss": 1.1149, + "num_input_tokens_seen": 1139776, + "step": 935 + }, + { + "epoch": 0.10468871812005791, + "grad_norm": 4.7541327476501465, + "learning_rate": 2.6144336785833612e-06, + "loss": 0.9964, + "num_input_tokens_seen": 1145792, + "step": 940 + }, + { + "epoch": 0.10524557300367524, + "grad_norm": 4.535073757171631, + "learning_rate": 2.6283550506737943e-06, + "loss": 1.0894, + "num_input_tokens_seen": 1152032, + "step": 945 + }, + { + "epoch": 0.10580242788729258, + "grad_norm": 4.722495079040527, + "learning_rate": 2.6422764227642278e-06, + "loss": 0.9707, + "num_input_tokens_seen": 1158304, + "step": 950 + }, + { + "epoch": 0.1063592827709099, + "grad_norm": 4.554954528808594, + "learning_rate": 2.6561977948546612e-06, + "loss": 1.0918, + "num_input_tokens_seen": 1164352, + "step": 955 + }, + { + "epoch": 0.10691613765452723, + "grad_norm": 4.376375675201416, + "learning_rate": 2.6701191669450943e-06, + "loss": 1.3314, + "num_input_tokens_seen": 1170592, + "step": 960 + }, + { + "epoch": 0.10747299253814456, + "grad_norm": 4.082557201385498, + "learning_rate": 2.6840405390355274e-06, + "loss": 0.9679, + "num_input_tokens_seen": 1177024, + "step": 965 + }, + { + "epoch": 0.10802984742176189, + "grad_norm": 4.454309463500977, + "learning_rate": 2.697961911125961e-06, + "loss": 1.2934, + "num_input_tokens_seen": 1183264, + "step": 970 + }, + { + "epoch": 0.10858670230537922, + "grad_norm": 4.20889949798584, + "learning_rate": 2.711883283216394e-06, + "loss": 1.0349, + "num_input_tokens_seen": 1189376, + "step": 975 + }, + { + "epoch": 0.10914355718899654, + "grad_norm": 4.6555681228637695, + "learning_rate": 2.725804655306827e-06, + "loss": 1.1246, + "num_input_tokens_seen": 1195808, + "step": 980 + }, + { + "epoch": 0.10970041207261387, + "grad_norm": 4.022549152374268, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.9264, + "num_input_tokens_seen": 1201344, + "step": 985 + }, + { + "epoch": 0.11025726695623121, + "grad_norm": 4.3350958824157715, + "learning_rate": 2.753647399487694e-06, + "loss": 1.0556, + "num_input_tokens_seen": 1207392, + "step": 990 + }, + { + "epoch": 0.11081412183984854, + "grad_norm": 5.006669521331787, + "learning_rate": 2.767568771578127e-06, + "loss": 0.9249, + "num_input_tokens_seen": 1213632, + "step": 995 + }, + { + "epoch": 0.11137097672346587, + "grad_norm": 5.876039981842041, + "learning_rate": 2.78149014366856e-06, + "loss": 1.1363, + "num_input_tokens_seen": 1219744, + "step": 1000 + }, + { + "epoch": 0.1119278316070832, + "grad_norm": 4.284298419952393, + "learning_rate": 2.7954115157589935e-06, + "loss": 1.0306, + "num_input_tokens_seen": 1226272, + "step": 1005 + }, + { + "epoch": 0.11248468649070052, + "grad_norm": 4.125556468963623, + "learning_rate": 2.8093328878494266e-06, + "loss": 1.2631, + "num_input_tokens_seen": 1231776, + "step": 1010 + }, + { + "epoch": 0.11304154137431785, + "grad_norm": 4.750244140625, + "learning_rate": 2.8232542599398596e-06, + "loss": 1.3833, + "num_input_tokens_seen": 1237568, + "step": 1015 + }, + { + "epoch": 0.11359839625793518, + "grad_norm": 5.581284046173096, + "learning_rate": 2.837175632030293e-06, + "loss": 1.0084, + "num_input_tokens_seen": 1243872, + "step": 1020 + }, + { + "epoch": 0.1141552511415525, + "grad_norm": 3.963442087173462, + "learning_rate": 2.851097004120726e-06, + "loss": 1.0141, + "num_input_tokens_seen": 1250304, + "step": 1025 + }, + { + "epoch": 0.11471210602516985, + "grad_norm": 4.716802597045898, + "learning_rate": 2.8650183762111596e-06, + "loss": 1.121, + "num_input_tokens_seen": 1256064, + "step": 1030 + }, + { + "epoch": 0.11526896090878717, + "grad_norm": 4.7411651611328125, + "learning_rate": 2.8789397483015927e-06, + "loss": 1.1759, + "num_input_tokens_seen": 1262432, + "step": 1035 + }, + { + "epoch": 0.1158258157924045, + "grad_norm": 4.496252536773682, + "learning_rate": 2.892861120392026e-06, + "loss": 0.8641, + "num_input_tokens_seen": 1268608, + "step": 1040 + }, + { + "epoch": 0.11638267067602183, + "grad_norm": 5.084023475646973, + "learning_rate": 2.9067824924824592e-06, + "loss": 0.9659, + "num_input_tokens_seen": 1274496, + "step": 1045 + }, + { + "epoch": 0.11693952555963916, + "grad_norm": 4.5303544998168945, + "learning_rate": 2.9207038645728923e-06, + "loss": 0.9391, + "num_input_tokens_seen": 1280096, + "step": 1050 + }, + { + "epoch": 0.11749638044325648, + "grad_norm": 4.497148036956787, + "learning_rate": 2.9346252366633258e-06, + "loss": 1.1985, + "num_input_tokens_seen": 1286272, + "step": 1055 + }, + { + "epoch": 0.11805323532687381, + "grad_norm": 4.745471477508545, + "learning_rate": 2.948546608753759e-06, + "loss": 1.0626, + "num_input_tokens_seen": 1292256, + "step": 1060 + }, + { + "epoch": 0.11861009021049114, + "grad_norm": 4.505265235900879, + "learning_rate": 2.962467980844192e-06, + "loss": 1.082, + "num_input_tokens_seen": 1298048, + "step": 1065 + }, + { + "epoch": 0.11916694509410848, + "grad_norm": 4.534062385559082, + "learning_rate": 2.9763893529346254e-06, + "loss": 1.056, + "num_input_tokens_seen": 1304256, + "step": 1070 + }, + { + "epoch": 0.11972379997772581, + "grad_norm": 4.313847541809082, + "learning_rate": 2.990310725025059e-06, + "loss": 1.0467, + "num_input_tokens_seen": 1310848, + "step": 1075 + }, + { + "epoch": 0.12028065486134314, + "grad_norm": 4.949687480926514, + "learning_rate": 3.004232097115492e-06, + "loss": 1.0704, + "num_input_tokens_seen": 1316288, + "step": 1080 + }, + { + "epoch": 0.12083750974496046, + "grad_norm": 4.894528388977051, + "learning_rate": 3.018153469205925e-06, + "loss": 1.0888, + "num_input_tokens_seen": 1322656, + "step": 1085 + }, + { + "epoch": 0.12139436462857779, + "grad_norm": 4.900559425354004, + "learning_rate": 3.0320748412963585e-06, + "loss": 1.0922, + "num_input_tokens_seen": 1328704, + "step": 1090 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 5.129476070404053, + "learning_rate": 3.0459962133867915e-06, + "loss": 1.2838, + "num_input_tokens_seen": 1334816, + "step": 1095 + }, + { + "epoch": 0.12250807439581245, + "grad_norm": 5.253417015075684, + "learning_rate": 3.0599175854772246e-06, + "loss": 1.0887, + "num_input_tokens_seen": 1340928, + "step": 1100 + }, + { + "epoch": 0.12306492927942977, + "grad_norm": 3.834029197692871, + "learning_rate": 3.073838957567658e-06, + "loss": 0.919, + "num_input_tokens_seen": 1346848, + "step": 1105 + }, + { + "epoch": 0.12362178416304712, + "grad_norm": 4.778772830963135, + "learning_rate": 3.0877603296580915e-06, + "loss": 0.985, + "num_input_tokens_seen": 1353152, + "step": 1110 + }, + { + "epoch": 0.12417863904666444, + "grad_norm": 5.388144016265869, + "learning_rate": 3.1016817017485246e-06, + "loss": 1.0243, + "num_input_tokens_seen": 1359456, + "step": 1115 + }, + { + "epoch": 0.12473549393028177, + "grad_norm": 4.574638843536377, + "learning_rate": 3.1156030738389577e-06, + "loss": 1.2815, + "num_input_tokens_seen": 1365408, + "step": 1120 + }, + { + "epoch": 0.12529234881389908, + "grad_norm": 4.272572040557861, + "learning_rate": 3.129524445929391e-06, + "loss": 1.0013, + "num_input_tokens_seen": 1371680, + "step": 1125 + }, + { + "epoch": 0.12584920369751643, + "grad_norm": 5.253408432006836, + "learning_rate": 3.1434458180198246e-06, + "loss": 1.1276, + "num_input_tokens_seen": 1378016, + "step": 1130 + }, + { + "epoch": 0.12640605858113377, + "grad_norm": 5.029556751251221, + "learning_rate": 3.1573671901102573e-06, + "loss": 1.3121, + "num_input_tokens_seen": 1384096, + "step": 1135 + }, + { + "epoch": 0.12696291346475108, + "grad_norm": 5.714774131774902, + "learning_rate": 3.1712885622006907e-06, + "loss": 1.1884, + "num_input_tokens_seen": 1390208, + "step": 1140 + }, + { + "epoch": 0.12751976834836842, + "grad_norm": 4.242864608764648, + "learning_rate": 3.185209934291124e-06, + "loss": 1.2022, + "num_input_tokens_seen": 1396320, + "step": 1145 + }, + { + "epoch": 0.12807662323198574, + "grad_norm": 4.038809776306152, + "learning_rate": 3.1991313063815573e-06, + "loss": 1.3862, + "num_input_tokens_seen": 1402624, + "step": 1150 + }, + { + "epoch": 0.12863347811560308, + "grad_norm": 4.400712490081787, + "learning_rate": 3.2130526784719903e-06, + "loss": 1.1531, + "num_input_tokens_seen": 1408832, + "step": 1155 + }, + { + "epoch": 0.1291903329992204, + "grad_norm": 4.218389511108398, + "learning_rate": 3.226974050562424e-06, + "loss": 1.0271, + "num_input_tokens_seen": 1415168, + "step": 1160 + }, + { + "epoch": 0.12974718788283773, + "grad_norm": 4.443569183349609, + "learning_rate": 3.2408954226528564e-06, + "loss": 1.0536, + "num_input_tokens_seen": 1421056, + "step": 1165 + }, + { + "epoch": 0.13030404276645507, + "grad_norm": 4.362643241882324, + "learning_rate": 3.25481679474329e-06, + "loss": 1.0806, + "num_input_tokens_seen": 1426368, + "step": 1170 + }, + { + "epoch": 0.1308608976500724, + "grad_norm": 5.0072407722473145, + "learning_rate": 3.2687381668337234e-06, + "loss": 1.1828, + "num_input_tokens_seen": 1432224, + "step": 1175 + }, + { + "epoch": 0.13141775253368973, + "grad_norm": 4.219447612762451, + "learning_rate": 3.2826595389241565e-06, + "loss": 1.1127, + "num_input_tokens_seen": 1438528, + "step": 1180 + }, + { + "epoch": 0.13197460741730704, + "grad_norm": 6.141009330749512, + "learning_rate": 3.29658091101459e-06, + "loss": 1.0669, + "num_input_tokens_seen": 1444288, + "step": 1185 + }, + { + "epoch": 0.13253146230092439, + "grad_norm": 4.72769021987915, + "learning_rate": 3.310502283105023e-06, + "loss": 1.3338, + "num_input_tokens_seen": 1450688, + "step": 1190 + }, + { + "epoch": 0.1330883171845417, + "grad_norm": 4.937537670135498, + "learning_rate": 3.3244236551954565e-06, + "loss": 1.174, + "num_input_tokens_seen": 1456736, + "step": 1195 + }, + { + "epoch": 0.13364517206815904, + "grad_norm": 4.8798041343688965, + "learning_rate": 3.338345027285889e-06, + "loss": 1.094, + "num_input_tokens_seen": 1462976, + "step": 1200 + }, + { + "epoch": 0.13420202695177635, + "grad_norm": 4.783101558685303, + "learning_rate": 3.3522663993763226e-06, + "loss": 1.1936, + "num_input_tokens_seen": 1468512, + "step": 1205 + }, + { + "epoch": 0.1347588818353937, + "grad_norm": 4.616835594177246, + "learning_rate": 3.366187771466756e-06, + "loss": 1.1879, + "num_input_tokens_seen": 1474624, + "step": 1210 + }, + { + "epoch": 0.13531573671901104, + "grad_norm": 4.116428852081299, + "learning_rate": 3.380109143557189e-06, + "loss": 1.0681, + "num_input_tokens_seen": 1481184, + "step": 1215 + }, + { + "epoch": 0.13587259160262835, + "grad_norm": 4.593533992767334, + "learning_rate": 3.3940305156476226e-06, + "loss": 1.0111, + "num_input_tokens_seen": 1487328, + "step": 1220 + }, + { + "epoch": 0.1364294464862457, + "grad_norm": 4.686479568481445, + "learning_rate": 3.4079518877380557e-06, + "loss": 1.0403, + "num_input_tokens_seen": 1493568, + "step": 1225 + }, + { + "epoch": 0.136986301369863, + "grad_norm": 4.579375267028809, + "learning_rate": 3.421873259828489e-06, + "loss": 1.1796, + "num_input_tokens_seen": 1499744, + "step": 1230 + }, + { + "epoch": 0.13754315625348035, + "grad_norm": 4.159205436706543, + "learning_rate": 3.435794631918922e-06, + "loss": 0.9163, + "num_input_tokens_seen": 1506112, + "step": 1235 + }, + { + "epoch": 0.13810001113709766, + "grad_norm": 4.049727439880371, + "learning_rate": 3.4497160040093553e-06, + "loss": 0.9197, + "num_input_tokens_seen": 1512288, + "step": 1240 + }, + { + "epoch": 0.138656866020715, + "grad_norm": 4.222021579742432, + "learning_rate": 3.4636373760997883e-06, + "loss": 1.0641, + "num_input_tokens_seen": 1518464, + "step": 1245 + }, + { + "epoch": 0.13921372090433234, + "grad_norm": 5.365772247314453, + "learning_rate": 3.477558748190222e-06, + "loss": 0.9883, + "num_input_tokens_seen": 1524512, + "step": 1250 + }, + { + "epoch": 0.13977057578794966, + "grad_norm": 5.13266658782959, + "learning_rate": 3.4914801202806553e-06, + "loss": 0.9632, + "num_input_tokens_seen": 1530848, + "step": 1255 + }, + { + "epoch": 0.140327430671567, + "grad_norm": 4.035582542419434, + "learning_rate": 3.5054014923710884e-06, + "loss": 0.762, + "num_input_tokens_seen": 1537056, + "step": 1260 + }, + { + "epoch": 0.1408842855551843, + "grad_norm": 4.856882572174072, + "learning_rate": 3.519322864461522e-06, + "loss": 1.1646, + "num_input_tokens_seen": 1542880, + "step": 1265 + }, + { + "epoch": 0.14144114043880165, + "grad_norm": 3.7850074768066406, + "learning_rate": 3.5332442365519545e-06, + "loss": 0.7351, + "num_input_tokens_seen": 1549216, + "step": 1270 + }, + { + "epoch": 0.14199799532241897, + "grad_norm": 3.5763144493103027, + "learning_rate": 3.547165608642388e-06, + "loss": 0.9456, + "num_input_tokens_seen": 1555200, + "step": 1275 + }, + { + "epoch": 0.1425548502060363, + "grad_norm": 4.42571496963501, + "learning_rate": 3.561086980732821e-06, + "loss": 0.7953, + "num_input_tokens_seen": 1561248, + "step": 1280 + }, + { + "epoch": 0.14311170508965362, + "grad_norm": 3.61651873588562, + "learning_rate": 3.5750083528232545e-06, + "loss": 0.7831, + "num_input_tokens_seen": 1567360, + "step": 1285 + }, + { + "epoch": 0.14366855997327097, + "grad_norm": 3.685879945755005, + "learning_rate": 3.588929724913688e-06, + "loss": 0.7957, + "num_input_tokens_seen": 1573472, + "step": 1290 + }, + { + "epoch": 0.1442254148568883, + "grad_norm": 4.501856803894043, + "learning_rate": 3.602851097004121e-06, + "loss": 0.9654, + "num_input_tokens_seen": 1579072, + "step": 1295 + }, + { + "epoch": 0.14478226974050562, + "grad_norm": 4.1594414710998535, + "learning_rate": 3.6167724690945545e-06, + "loss": 0.8836, + "num_input_tokens_seen": 1585088, + "step": 1300 + }, + { + "epoch": 0.14533912462412296, + "grad_norm": 5.126284599304199, + "learning_rate": 3.630693841184987e-06, + "loss": 0.5754, + "num_input_tokens_seen": 1591232, + "step": 1305 + }, + { + "epoch": 0.14589597950774028, + "grad_norm": 3.452998161315918, + "learning_rate": 3.6446152132754206e-06, + "loss": 0.9072, + "num_input_tokens_seen": 1596736, + "step": 1310 + }, + { + "epoch": 0.14645283439135762, + "grad_norm": 4.194852352142334, + "learning_rate": 3.6585365853658537e-06, + "loss": 0.9047, + "num_input_tokens_seen": 1603264, + "step": 1315 + }, + { + "epoch": 0.14700968927497493, + "grad_norm": 2.8529610633850098, + "learning_rate": 3.672457957456287e-06, + "loss": 0.6811, + "num_input_tokens_seen": 1608640, + "step": 1320 + }, + { + "epoch": 0.14756654415859227, + "grad_norm": 2.966900587081909, + "learning_rate": 3.68637932954672e-06, + "loss": 0.6591, + "num_input_tokens_seen": 1614176, + "step": 1325 + }, + { + "epoch": 0.1481233990422096, + "grad_norm": 3.4120333194732666, + "learning_rate": 3.7003007016371537e-06, + "loss": 0.7481, + "num_input_tokens_seen": 1620416, + "step": 1330 + }, + { + "epoch": 0.14868025392582693, + "grad_norm": 3.264315128326416, + "learning_rate": 3.714222073727587e-06, + "loss": 0.7295, + "num_input_tokens_seen": 1626624, + "step": 1335 + }, + { + "epoch": 0.14923710880944427, + "grad_norm": 4.320507526397705, + "learning_rate": 3.72814344581802e-06, + "loss": 0.7115, + "num_input_tokens_seen": 1632928, + "step": 1340 + }, + { + "epoch": 0.14979396369306158, + "grad_norm": 3.2679386138916016, + "learning_rate": 3.7420648179084533e-06, + "loss": 0.6846, + "num_input_tokens_seen": 1639008, + "step": 1345 + }, + { + "epoch": 0.15035081857667892, + "grad_norm": 2.4279849529266357, + "learning_rate": 3.7559861899988864e-06, + "loss": 0.3983, + "num_input_tokens_seen": 1645056, + "step": 1350 + }, + { + "epoch": 0.15090767346029624, + "grad_norm": 2.47406005859375, + "learning_rate": 3.76990756208932e-06, + "loss": 0.4649, + "num_input_tokens_seen": 1651136, + "step": 1355 + }, + { + "epoch": 0.15146452834391358, + "grad_norm": 3.168919086456299, + "learning_rate": 3.7838289341797525e-06, + "loss": 0.9049, + "num_input_tokens_seen": 1657440, + "step": 1360 + }, + { + "epoch": 0.1520213832275309, + "grad_norm": 2.5601322650909424, + "learning_rate": 3.7977503062701864e-06, + "loss": 0.6586, + "num_input_tokens_seen": 1664000, + "step": 1365 + }, + { + "epoch": 0.15257823811114823, + "grad_norm": 2.3212687969207764, + "learning_rate": 3.81167167836062e-06, + "loss": 0.8186, + "num_input_tokens_seen": 1669824, + "step": 1370 + }, + { + "epoch": 0.15313509299476558, + "grad_norm": 1.9759979248046875, + "learning_rate": 3.825593050451053e-06, + "loss": 0.4668, + "num_input_tokens_seen": 1675744, + "step": 1375 + }, + { + "epoch": 0.1536919478783829, + "grad_norm": 2.26216721534729, + "learning_rate": 3.839514422541486e-06, + "loss": 0.6046, + "num_input_tokens_seen": 1681856, + "step": 1380 + }, + { + "epoch": 0.15424880276200023, + "grad_norm": 3.1887547969818115, + "learning_rate": 3.853435794631919e-06, + "loss": 0.7284, + "num_input_tokens_seen": 1688448, + "step": 1385 + }, + { + "epoch": 0.15480565764561754, + "grad_norm": 2.691831350326538, + "learning_rate": 3.8673571667223525e-06, + "loss": 0.7467, + "num_input_tokens_seen": 1694304, + "step": 1390 + }, + { + "epoch": 0.1553625125292349, + "grad_norm": 2.392800807952881, + "learning_rate": 3.881278538812785e-06, + "loss": 0.6433, + "num_input_tokens_seen": 1700416, + "step": 1395 + }, + { + "epoch": 0.1559193674128522, + "grad_norm": 3.657330274581909, + "learning_rate": 3.895199910903219e-06, + "loss": 0.4551, + "num_input_tokens_seen": 1706624, + "step": 1400 + }, + { + "epoch": 0.15647622229646954, + "grad_norm": 2.7961606979370117, + "learning_rate": 3.909121282993652e-06, + "loss": 0.4186, + "num_input_tokens_seen": 1712736, + "step": 1405 + }, + { + "epoch": 0.15703307718008688, + "grad_norm": 1.816717267036438, + "learning_rate": 3.923042655084086e-06, + "loss": 0.503, + "num_input_tokens_seen": 1718976, + "step": 1410 + }, + { + "epoch": 0.1575899320637042, + "grad_norm": 3.4210848808288574, + "learning_rate": 3.936964027174519e-06, + "loss": 0.611, + "num_input_tokens_seen": 1724704, + "step": 1415 + }, + { + "epoch": 0.15814678694732154, + "grad_norm": 2.591505289077759, + "learning_rate": 3.950885399264952e-06, + "loss": 0.5907, + "num_input_tokens_seen": 1730752, + "step": 1420 + }, + { + "epoch": 0.15870364183093885, + "grad_norm": 5.60207986831665, + "learning_rate": 3.964806771355385e-06, + "loss": 0.6027, + "num_input_tokens_seen": 1736864, + "step": 1425 + }, + { + "epoch": 0.1592604967145562, + "grad_norm": 1.8507306575775146, + "learning_rate": 3.978728143445818e-06, + "loss": 0.7508, + "num_input_tokens_seen": 1742496, + "step": 1430 + }, + { + "epoch": 0.1598173515981735, + "grad_norm": 2.654751777648926, + "learning_rate": 3.992649515536251e-06, + "loss": 0.4867, + "num_input_tokens_seen": 1748480, + "step": 1435 + }, + { + "epoch": 0.16037420648179085, + "grad_norm": 1.9548935890197754, + "learning_rate": 4.006570887626685e-06, + "loss": 0.4428, + "num_input_tokens_seen": 1754528, + "step": 1440 + }, + { + "epoch": 0.16093106136540816, + "grad_norm": 3.0305562019348145, + "learning_rate": 4.020492259717118e-06, + "loss": 0.5508, + "num_input_tokens_seen": 1760896, + "step": 1445 + }, + { + "epoch": 0.1614879162490255, + "grad_norm": 1.7903122901916504, + "learning_rate": 4.034413631807551e-06, + "loss": 0.4112, + "num_input_tokens_seen": 1766656, + "step": 1450 + }, + { + "epoch": 0.16204477113264285, + "grad_norm": 2.2407407760620117, + "learning_rate": 4.048335003897984e-06, + "loss": 0.2568, + "num_input_tokens_seen": 1772640, + "step": 1455 + }, + { + "epoch": 0.16260162601626016, + "grad_norm": 1.4841406345367432, + "learning_rate": 4.062256375988418e-06, + "loss": 0.5076, + "num_input_tokens_seen": 1778560, + "step": 1460 + }, + { + "epoch": 0.1631584808998775, + "grad_norm": 2.692340612411499, + "learning_rate": 4.0761777480788505e-06, + "loss": 0.5197, + "num_input_tokens_seen": 1784608, + "step": 1465 + }, + { + "epoch": 0.16371533578349481, + "grad_norm": 1.4846718311309814, + "learning_rate": 4.090099120169284e-06, + "loss": 0.2842, + "num_input_tokens_seen": 1790592, + "step": 1470 + }, + { + "epoch": 0.16427219066711216, + "grad_norm": 1.8314740657806396, + "learning_rate": 4.1040204922597175e-06, + "loss": 0.3889, + "num_input_tokens_seen": 1796576, + "step": 1475 + }, + { + "epoch": 0.16482904555072947, + "grad_norm": 1.3686485290527344, + "learning_rate": 4.117941864350151e-06, + "loss": 0.5041, + "num_input_tokens_seen": 1802688, + "step": 1480 + }, + { + "epoch": 0.1653859004343468, + "grad_norm": 2.141038179397583, + "learning_rate": 4.131863236440584e-06, + "loss": 0.3398, + "num_input_tokens_seen": 1809056, + "step": 1485 + }, + { + "epoch": 0.16594275531796415, + "grad_norm": 1.4957995414733887, + "learning_rate": 4.145784608531017e-06, + "loss": 0.5017, + "num_input_tokens_seen": 1815040, + "step": 1490 + }, + { + "epoch": 0.16649961020158147, + "grad_norm": 3.7264676094055176, + "learning_rate": 4.1597059806214505e-06, + "loss": 0.6191, + "num_input_tokens_seen": 1821184, + "step": 1495 + }, + { + "epoch": 0.1670564650851988, + "grad_norm": 1.2168136835098267, + "learning_rate": 4.173627352711883e-06, + "loss": 0.5421, + "num_input_tokens_seen": 1827360, + "step": 1500 + }, + { + "epoch": 0.16761331996881612, + "grad_norm": 2.294086217880249, + "learning_rate": 4.187548724802317e-06, + "loss": 0.6814, + "num_input_tokens_seen": 1833472, + "step": 1505 + }, + { + "epoch": 0.16817017485243346, + "grad_norm": 1.0981647968292236, + "learning_rate": 4.20147009689275e-06, + "loss": 0.3186, + "num_input_tokens_seen": 1839264, + "step": 1510 + }, + { + "epoch": 0.16872702973605078, + "grad_norm": 1.3301753997802734, + "learning_rate": 4.215391468983184e-06, + "loss": 0.4695, + "num_input_tokens_seen": 1845600, + "step": 1515 + }, + { + "epoch": 0.16928388461966812, + "grad_norm": 1.4120237827301025, + "learning_rate": 4.229312841073616e-06, + "loss": 0.4132, + "num_input_tokens_seen": 1852256, + "step": 1520 + }, + { + "epoch": 0.16984073950328543, + "grad_norm": 5.434198379516602, + "learning_rate": 4.24323421316405e-06, + "loss": 0.4069, + "num_input_tokens_seen": 1858400, + "step": 1525 + }, + { + "epoch": 0.17039759438690277, + "grad_norm": 1.0660725831985474, + "learning_rate": 4.257155585254482e-06, + "loss": 0.5903, + "num_input_tokens_seen": 1863936, + "step": 1530 + }, + { + "epoch": 0.17095444927052011, + "grad_norm": 1.897925615310669, + "learning_rate": 4.271076957344916e-06, + "loss": 0.2138, + "num_input_tokens_seen": 1869792, + "step": 1535 + }, + { + "epoch": 0.17151130415413743, + "grad_norm": 2.1005399227142334, + "learning_rate": 4.284998329435349e-06, + "loss": 0.1948, + "num_input_tokens_seen": 1875936, + "step": 1540 + }, + { + "epoch": 0.17206815903775477, + "grad_norm": 1.6097058057785034, + "learning_rate": 4.298919701525783e-06, + "loss": 0.3159, + "num_input_tokens_seen": 1881920, + "step": 1545 + }, + { + "epoch": 0.17262501392137208, + "grad_norm": 2.1649508476257324, + "learning_rate": 4.312841073616216e-06, + "loss": 0.3217, + "num_input_tokens_seen": 1888160, + "step": 1550 + }, + { + "epoch": 0.17318186880498943, + "grad_norm": 1.0434622764587402, + "learning_rate": 4.326762445706649e-06, + "loss": 0.2416, + "num_input_tokens_seen": 1894208, + "step": 1555 + }, + { + "epoch": 0.17373872368860674, + "grad_norm": 0.6899924278259277, + "learning_rate": 4.340683817797082e-06, + "loss": 0.3791, + "num_input_tokens_seen": 1900448, + "step": 1560 + }, + { + "epoch": 0.17429557857222408, + "grad_norm": 3.969374895095825, + "learning_rate": 4.354605189887515e-06, + "loss": 0.6201, + "num_input_tokens_seen": 1906624, + "step": 1565 + }, + { + "epoch": 0.1748524334558414, + "grad_norm": 2.3943490982055664, + "learning_rate": 4.3685265619779485e-06, + "loss": 0.4133, + "num_input_tokens_seen": 1912672, + "step": 1570 + }, + { + "epoch": 0.17540928833945874, + "grad_norm": 1.601441502571106, + "learning_rate": 4.382447934068382e-06, + "loss": 0.5123, + "num_input_tokens_seen": 1918880, + "step": 1575 + }, + { + "epoch": 0.17596614322307608, + "grad_norm": 0.8297561407089233, + "learning_rate": 4.3963693061588155e-06, + "loss": 0.3219, + "num_input_tokens_seen": 1925120, + "step": 1580 + }, + { + "epoch": 0.1765229981066934, + "grad_norm": 1.8939285278320312, + "learning_rate": 4.410290678249249e-06, + "loss": 0.3749, + "num_input_tokens_seen": 1931040, + "step": 1585 + }, + { + "epoch": 0.17707985299031073, + "grad_norm": 1.628227949142456, + "learning_rate": 4.424212050339682e-06, + "loss": 0.2577, + "num_input_tokens_seen": 1937280, + "step": 1590 + }, + { + "epoch": 0.17763670787392805, + "grad_norm": 1.9118647575378418, + "learning_rate": 4.438133422430115e-06, + "loss": 0.6866, + "num_input_tokens_seen": 1943424, + "step": 1595 + }, + { + "epoch": 0.1781935627575454, + "grad_norm": 1.2692575454711914, + "learning_rate": 4.452054794520548e-06, + "loss": 0.2686, + "num_input_tokens_seen": 1949536, + "step": 1600 + }, + { + "epoch": 0.1787504176411627, + "grad_norm": 1.7796849012374878, + "learning_rate": 4.465976166610981e-06, + "loss": 0.3697, + "num_input_tokens_seen": 1955520, + "step": 1605 + }, + { + "epoch": 0.17930727252478004, + "grad_norm": 1.2701336145401, + "learning_rate": 4.479897538701415e-06, + "loss": 0.2921, + "num_input_tokens_seen": 1961088, + "step": 1610 + }, + { + "epoch": 0.17986412740839738, + "grad_norm": 2.037193536758423, + "learning_rate": 4.493818910791848e-06, + "loss": 0.2674, + "num_input_tokens_seen": 1967328, + "step": 1615 + }, + { + "epoch": 0.1804209822920147, + "grad_norm": 2.0171291828155518, + "learning_rate": 4.507740282882282e-06, + "loss": 0.502, + "num_input_tokens_seen": 1973216, + "step": 1620 + }, + { + "epoch": 0.18097783717563204, + "grad_norm": 2.293116331100464, + "learning_rate": 4.521661654972714e-06, + "loss": 0.2848, + "num_input_tokens_seen": 1978784, + "step": 1625 + }, + { + "epoch": 0.18153469205924935, + "grad_norm": 1.7863420248031616, + "learning_rate": 4.535583027063148e-06, + "loss": 0.4877, + "num_input_tokens_seen": 1985088, + "step": 1630 + }, + { + "epoch": 0.1820915469428667, + "grad_norm": 1.8558881282806396, + "learning_rate": 4.54950439915358e-06, + "loss": 0.3934, + "num_input_tokens_seen": 1991264, + "step": 1635 + }, + { + "epoch": 0.182648401826484, + "grad_norm": 1.5357017517089844, + "learning_rate": 4.563425771244014e-06, + "loss": 0.4358, + "num_input_tokens_seen": 1997216, + "step": 1640 + }, + { + "epoch": 0.18320525671010135, + "grad_norm": 1.8711628913879395, + "learning_rate": 4.577347143334447e-06, + "loss": 0.2377, + "num_input_tokens_seen": 2002912, + "step": 1645 + }, + { + "epoch": 0.18376211159371866, + "grad_norm": 0.8597718477249146, + "learning_rate": 4.591268515424881e-06, + "loss": 0.3776, + "num_input_tokens_seen": 2009280, + "step": 1650 + }, + { + "epoch": 0.184318966477336, + "grad_norm": 0.5548314452171326, + "learning_rate": 4.605189887515314e-06, + "loss": 0.336, + "num_input_tokens_seen": 2015296, + "step": 1655 + }, + { + "epoch": 0.18487582136095335, + "grad_norm": 2.3944666385650635, + "learning_rate": 4.619111259605747e-06, + "loss": 0.2888, + "num_input_tokens_seen": 2021632, + "step": 1660 + }, + { + "epoch": 0.18543267624457066, + "grad_norm": 2.9417052268981934, + "learning_rate": 4.6330326316961804e-06, + "loss": 0.3431, + "num_input_tokens_seen": 2027808, + "step": 1665 + }, + { + "epoch": 0.185989531128188, + "grad_norm": 2.376530408859253, + "learning_rate": 4.646954003786613e-06, + "loss": 0.3771, + "num_input_tokens_seen": 2033984, + "step": 1670 + }, + { + "epoch": 0.18654638601180532, + "grad_norm": 0.9138212203979492, + "learning_rate": 4.6608753758770466e-06, + "loss": 0.3564, + "num_input_tokens_seen": 2039776, + "step": 1675 + }, + { + "epoch": 0.18710324089542266, + "grad_norm": 1.3329193592071533, + "learning_rate": 4.67479674796748e-06, + "loss": 0.4029, + "num_input_tokens_seen": 2045888, + "step": 1680 + }, + { + "epoch": 0.18766009577903997, + "grad_norm": 0.4897187054157257, + "learning_rate": 4.6887181200579135e-06, + "loss": 0.3496, + "num_input_tokens_seen": 2051552, + "step": 1685 + }, + { + "epoch": 0.1882169506626573, + "grad_norm": 2.2591490745544434, + "learning_rate": 4.702639492148346e-06, + "loss": 0.3618, + "num_input_tokens_seen": 2057696, + "step": 1690 + }, + { + "epoch": 0.18877380554627465, + "grad_norm": 2.847201347351074, + "learning_rate": 4.71656086423878e-06, + "loss": 0.5559, + "num_input_tokens_seen": 2063648, + "step": 1695 + }, + { + "epoch": 0.18933066042989197, + "grad_norm": 1.6808046102523804, + "learning_rate": 4.730482236329213e-06, + "loss": 0.3409, + "num_input_tokens_seen": 2069888, + "step": 1700 + }, + { + "epoch": 0.1898875153135093, + "grad_norm": 2.7919180393218994, + "learning_rate": 4.744403608419646e-06, + "loss": 0.3511, + "num_input_tokens_seen": 2076032, + "step": 1705 + }, + { + "epoch": 0.19044437019712662, + "grad_norm": 1.6452677249908447, + "learning_rate": 4.758324980510079e-06, + "loss": 0.3835, + "num_input_tokens_seen": 2081632, + "step": 1710 + }, + { + "epoch": 0.19100122508074396, + "grad_norm": 1.1252981424331665, + "learning_rate": 4.772246352600513e-06, + "loss": 0.22, + "num_input_tokens_seen": 2087744, + "step": 1715 + }, + { + "epoch": 0.19155807996436128, + "grad_norm": 1.7853981256484985, + "learning_rate": 4.786167724690946e-06, + "loss": 0.3326, + "num_input_tokens_seen": 2093824, + "step": 1720 + }, + { + "epoch": 0.19211493484797862, + "grad_norm": 2.5153186321258545, + "learning_rate": 4.800089096781379e-06, + "loss": 0.5653, + "num_input_tokens_seen": 2099936, + "step": 1725 + }, + { + "epoch": 0.19267178973159593, + "grad_norm": 1.1389131546020508, + "learning_rate": 4.814010468871812e-06, + "loss": 0.5927, + "num_input_tokens_seen": 2106048, + "step": 1730 + }, + { + "epoch": 0.19322864461521327, + "grad_norm": 1.7555913925170898, + "learning_rate": 4.827931840962246e-06, + "loss": 0.4022, + "num_input_tokens_seen": 2112096, + "step": 1735 + }, + { + "epoch": 0.19378549949883062, + "grad_norm": 2.7944562435150146, + "learning_rate": 4.8418532130526784e-06, + "loss": 0.248, + "num_input_tokens_seen": 2118112, + "step": 1740 + }, + { + "epoch": 0.19434235438244793, + "grad_norm": 1.623072624206543, + "learning_rate": 4.855774585143112e-06, + "loss": 0.1881, + "num_input_tokens_seen": 2124448, + "step": 1745 + }, + { + "epoch": 0.19489920926606527, + "grad_norm": 0.9694177508354187, + "learning_rate": 4.869695957233545e-06, + "loss": 0.2378, + "num_input_tokens_seen": 2130272, + "step": 1750 + }, + { + "epoch": 0.19545606414968258, + "grad_norm": 1.2099922895431519, + "learning_rate": 4.883617329323979e-06, + "loss": 0.2634, + "num_input_tokens_seen": 2136448, + "step": 1755 + }, + { + "epoch": 0.19601291903329993, + "grad_norm": 2.1871495246887207, + "learning_rate": 4.8975387014144115e-06, + "loss": 0.2084, + "num_input_tokens_seen": 2142400, + "step": 1760 + }, + { + "epoch": 0.19656977391691724, + "grad_norm": 1.6051756143569946, + "learning_rate": 4.911460073504845e-06, + "loss": 0.2534, + "num_input_tokens_seen": 2148864, + "step": 1765 + }, + { + "epoch": 0.19712662880053458, + "grad_norm": 0.6360865235328674, + "learning_rate": 4.925381445595278e-06, + "loss": 0.2862, + "num_input_tokens_seen": 2154944, + "step": 1770 + }, + { + "epoch": 0.19768348368415192, + "grad_norm": 2.9602835178375244, + "learning_rate": 4.939302817685711e-06, + "loss": 0.4655, + "num_input_tokens_seen": 2161024, + "step": 1775 + }, + { + "epoch": 0.19824033856776924, + "grad_norm": 1.1649446487426758, + "learning_rate": 4.953224189776145e-06, + "loss": 0.3978, + "num_input_tokens_seen": 2167296, + "step": 1780 + }, + { + "epoch": 0.19879719345138658, + "grad_norm": 2.355618476867676, + "learning_rate": 4.967145561866578e-06, + "loss": 0.5431, + "num_input_tokens_seen": 2173664, + "step": 1785 + }, + { + "epoch": 0.1993540483350039, + "grad_norm": 2.2084836959838867, + "learning_rate": 4.9810669339570116e-06, + "loss": 0.3384, + "num_input_tokens_seen": 2179616, + "step": 1790 + }, + { + "epoch": 0.19991090321862123, + "grad_norm": 1.4919029474258423, + "learning_rate": 4.994988306047444e-06, + "loss": 0.4324, + "num_input_tokens_seen": 2185632, + "step": 1795 + }, + { + "epoch": 0.20046775810223855, + "grad_norm": 3.7182281017303467, + "learning_rate": 5.008909678137878e-06, + "loss": 0.401, + "num_input_tokens_seen": 2191552, + "step": 1800 + }, + { + "epoch": 0.2010246129858559, + "grad_norm": 1.0128874778747559, + "learning_rate": 5.02283105022831e-06, + "loss": 0.2739, + "num_input_tokens_seen": 2197280, + "step": 1805 + }, + { + "epoch": 0.2015814678694732, + "grad_norm": 1.5548657178878784, + "learning_rate": 5.036752422318744e-06, + "loss": 0.3497, + "num_input_tokens_seen": 2203712, + "step": 1810 + }, + { + "epoch": 0.20213832275309054, + "grad_norm": 0.6027396321296692, + "learning_rate": 5.050673794409177e-06, + "loss": 0.2038, + "num_input_tokens_seen": 2209920, + "step": 1815 + }, + { + "epoch": 0.20269517763670789, + "grad_norm": 4.069998264312744, + "learning_rate": 5.064595166499611e-06, + "loss": 0.3487, + "num_input_tokens_seen": 2215808, + "step": 1820 + }, + { + "epoch": 0.2032520325203252, + "grad_norm": 1.9106853008270264, + "learning_rate": 5.078516538590044e-06, + "loss": 0.3292, + "num_input_tokens_seen": 2222240, + "step": 1825 + }, + { + "epoch": 0.20380888740394254, + "grad_norm": 1.0705865621566772, + "learning_rate": 5.092437910680477e-06, + "loss": 0.1464, + "num_input_tokens_seen": 2228192, + "step": 1830 + }, + { + "epoch": 0.20436574228755985, + "grad_norm": 1.8440192937850952, + "learning_rate": 5.10635928277091e-06, + "loss": 0.5794, + "num_input_tokens_seen": 2234112, + "step": 1835 + }, + { + "epoch": 0.2049225971711772, + "grad_norm": 3.7459065914154053, + "learning_rate": 5.120280654861343e-06, + "loss": 0.324, + "num_input_tokens_seen": 2240288, + "step": 1840 + }, + { + "epoch": 0.2054794520547945, + "grad_norm": 2.847233295440674, + "learning_rate": 5.1342020269517765e-06, + "loss": 0.3048, + "num_input_tokens_seen": 2246336, + "step": 1845 + }, + { + "epoch": 0.20603630693841185, + "grad_norm": 2.8086326122283936, + "learning_rate": 5.14812339904221e-06, + "loss": 0.2549, + "num_input_tokens_seen": 2252704, + "step": 1850 + }, + { + "epoch": 0.2065931618220292, + "grad_norm": 1.2097676992416382, + "learning_rate": 5.162044771132643e-06, + "loss": 0.3557, + "num_input_tokens_seen": 2258944, + "step": 1855 + }, + { + "epoch": 0.2071500167056465, + "grad_norm": 1.8226523399353027, + "learning_rate": 5.175966143223077e-06, + "loss": 0.4276, + "num_input_tokens_seen": 2264224, + "step": 1860 + }, + { + "epoch": 0.20770687158926385, + "grad_norm": 1.471382737159729, + "learning_rate": 5.1898875153135095e-06, + "loss": 0.148, + "num_input_tokens_seen": 2270464, + "step": 1865 + }, + { + "epoch": 0.20826372647288116, + "grad_norm": 2.506028175354004, + "learning_rate": 5.203808887403943e-06, + "loss": 0.5185, + "num_input_tokens_seen": 2276352, + "step": 1870 + }, + { + "epoch": 0.2088205813564985, + "grad_norm": 1.5448545217514038, + "learning_rate": 5.217730259494376e-06, + "loss": 0.3756, + "num_input_tokens_seen": 2282272, + "step": 1875 + }, + { + "epoch": 0.20937743624011582, + "grad_norm": 1.4627342224121094, + "learning_rate": 5.231651631584809e-06, + "loss": 0.4594, + "num_input_tokens_seen": 2288576, + "step": 1880 + }, + { + "epoch": 0.20993429112373316, + "grad_norm": 1.878933072090149, + "learning_rate": 5.245573003675243e-06, + "loss": 0.6639, + "num_input_tokens_seen": 2294528, + "step": 1885 + }, + { + "epoch": 0.21049114600735047, + "grad_norm": 0.966867983341217, + "learning_rate": 5.259494375765676e-06, + "loss": 0.4249, + "num_input_tokens_seen": 2300864, + "step": 1890 + }, + { + "epoch": 0.2110480008909678, + "grad_norm": 2.7238450050354004, + "learning_rate": 5.27341574785611e-06, + "loss": 0.3734, + "num_input_tokens_seen": 2307072, + "step": 1895 + }, + { + "epoch": 0.21160485577458515, + "grad_norm": 0.7273203730583191, + "learning_rate": 5.287337119946542e-06, + "loss": 0.2148, + "num_input_tokens_seen": 2313024, + "step": 1900 + }, + { + "epoch": 0.21216171065820247, + "grad_norm": 0.6712373495101929, + "learning_rate": 5.301258492036976e-06, + "loss": 0.2993, + "num_input_tokens_seen": 2319200, + "step": 1905 + }, + { + "epoch": 0.2127185655418198, + "grad_norm": 1.6466087102890015, + "learning_rate": 5.315179864127408e-06, + "loss": 0.281, + "num_input_tokens_seen": 2325312, + "step": 1910 + }, + { + "epoch": 0.21327542042543712, + "grad_norm": 1.654220700263977, + "learning_rate": 5.329101236217842e-06, + "loss": 0.401, + "num_input_tokens_seen": 2330784, + "step": 1915 + }, + { + "epoch": 0.21383227530905446, + "grad_norm": 4.152066707611084, + "learning_rate": 5.343022608308275e-06, + "loss": 0.496, + "num_input_tokens_seen": 2336992, + "step": 1920 + }, + { + "epoch": 0.21438913019267178, + "grad_norm": 3.136435031890869, + "learning_rate": 5.356943980398709e-06, + "loss": 0.3068, + "num_input_tokens_seen": 2343360, + "step": 1925 + }, + { + "epoch": 0.21494598507628912, + "grad_norm": 2.7355759143829346, + "learning_rate": 5.370865352489141e-06, + "loss": 0.4074, + "num_input_tokens_seen": 2349728, + "step": 1930 + }, + { + "epoch": 0.21550283995990646, + "grad_norm": 1.575333833694458, + "learning_rate": 5.384786724579575e-06, + "loss": 0.3294, + "num_input_tokens_seen": 2355680, + "step": 1935 + }, + { + "epoch": 0.21605969484352378, + "grad_norm": 1.5301464796066284, + "learning_rate": 5.398708096670008e-06, + "loss": 0.4343, + "num_input_tokens_seen": 2361568, + "step": 1940 + }, + { + "epoch": 0.21661654972714112, + "grad_norm": 2.6357622146606445, + "learning_rate": 5.412629468760441e-06, + "loss": 0.3027, + "num_input_tokens_seen": 2367904, + "step": 1945 + }, + { + "epoch": 0.21717340461075843, + "grad_norm": 2.3818628787994385, + "learning_rate": 5.4265508408508745e-06, + "loss": 0.2703, + "num_input_tokens_seen": 2374496, + "step": 1950 + }, + { + "epoch": 0.21773025949437577, + "grad_norm": 0.7511159777641296, + "learning_rate": 5.440472212941308e-06, + "loss": 0.3407, + "num_input_tokens_seen": 2380416, + "step": 1955 + }, + { + "epoch": 0.21828711437799309, + "grad_norm": 1.825392246246338, + "learning_rate": 5.4543935850317414e-06, + "loss": 0.2759, + "num_input_tokens_seen": 2386080, + "step": 1960 + }, + { + "epoch": 0.21884396926161043, + "grad_norm": 1.213257908821106, + "learning_rate": 5.468314957122174e-06, + "loss": 0.2211, + "num_input_tokens_seen": 2392352, + "step": 1965 + }, + { + "epoch": 0.21940082414522774, + "grad_norm": 1.0549057722091675, + "learning_rate": 5.4822363292126076e-06, + "loss": 0.3332, + "num_input_tokens_seen": 2398528, + "step": 1970 + }, + { + "epoch": 0.21995767902884508, + "grad_norm": 0.19382329285144806, + "learning_rate": 5.496157701303041e-06, + "loss": 0.1552, + "num_input_tokens_seen": 2404672, + "step": 1975 + }, + { + "epoch": 0.22051453391246242, + "grad_norm": 1.5162739753723145, + "learning_rate": 5.510079073393474e-06, + "loss": 0.4573, + "num_input_tokens_seen": 2410880, + "step": 1980 + }, + { + "epoch": 0.22107138879607974, + "grad_norm": 1.3292226791381836, + "learning_rate": 5.524000445483907e-06, + "loss": 0.3347, + "num_input_tokens_seen": 2416864, + "step": 1985 + }, + { + "epoch": 0.22162824367969708, + "grad_norm": 0.8309261202812195, + "learning_rate": 5.537921817574341e-06, + "loss": 0.2228, + "num_input_tokens_seen": 2422784, + "step": 1990 + }, + { + "epoch": 0.2221850985633144, + "grad_norm": 0.7279968857765198, + "learning_rate": 5.551843189664774e-06, + "loss": 0.2129, + "num_input_tokens_seen": 2428576, + "step": 1995 + }, + { + "epoch": 0.22274195344693173, + "grad_norm": 1.2570210695266724, + "learning_rate": 5.565764561755207e-06, + "loss": 0.1566, + "num_input_tokens_seen": 2434624, + "step": 2000 + }, + { + "epoch": 0.22329880833054905, + "grad_norm": 1.8598361015319824, + "learning_rate": 5.57968593384564e-06, + "loss": 0.4309, + "num_input_tokens_seen": 2440768, + "step": 2005 + }, + { + "epoch": 0.2238556632141664, + "grad_norm": 0.7089745402336121, + "learning_rate": 5.593607305936073e-06, + "loss": 0.2892, + "num_input_tokens_seen": 2446720, + "step": 2010 + }, + { + "epoch": 0.22441251809778373, + "grad_norm": 0.38951221108436584, + "learning_rate": 5.607528678026506e-06, + "loss": 0.3683, + "num_input_tokens_seen": 2452352, + "step": 2015 + }, + { + "epoch": 0.22496937298140104, + "grad_norm": 2.9293599128723145, + "learning_rate": 5.62145005011694e-06, + "loss": 0.4704, + "num_input_tokens_seen": 2458592, + "step": 2020 + }, + { + "epoch": 0.22552622786501839, + "grad_norm": 2.2337679862976074, + "learning_rate": 5.635371422207373e-06, + "loss": 0.4043, + "num_input_tokens_seen": 2464832, + "step": 2025 + }, + { + "epoch": 0.2260830827486357, + "grad_norm": 1.1283704042434692, + "learning_rate": 5.649292794297807e-06, + "loss": 0.2002, + "num_input_tokens_seen": 2471360, + "step": 2030 + }, + { + "epoch": 0.22663993763225304, + "grad_norm": 1.350443720817566, + "learning_rate": 5.6632141663882394e-06, + "loss": 0.3377, + "num_input_tokens_seen": 2477440, + "step": 2035 + }, + { + "epoch": 0.22719679251587035, + "grad_norm": 2.0165112018585205, + "learning_rate": 5.677135538478673e-06, + "loss": 0.2518, + "num_input_tokens_seen": 2483616, + "step": 2040 + }, + { + "epoch": 0.2277536473994877, + "grad_norm": 3.2057249546051025, + "learning_rate": 5.6910569105691056e-06, + "loss": 0.1658, + "num_input_tokens_seen": 2489632, + "step": 2045 + }, + { + "epoch": 0.228310502283105, + "grad_norm": 0.3499944806098938, + "learning_rate": 5.704978282659539e-06, + "loss": 0.3498, + "num_input_tokens_seen": 2495232, + "step": 2050 + }, + { + "epoch": 0.22886735716672235, + "grad_norm": 1.3328890800476074, + "learning_rate": 5.7188996547499725e-06, + "loss": 0.1879, + "num_input_tokens_seen": 2501568, + "step": 2055 + }, + { + "epoch": 0.2294242120503397, + "grad_norm": 0.9030905365943909, + "learning_rate": 5.732821026840406e-06, + "loss": 0.2731, + "num_input_tokens_seen": 2506976, + "step": 2060 + }, + { + "epoch": 0.229981066933957, + "grad_norm": 0.908555805683136, + "learning_rate": 5.7467423989308395e-06, + "loss": 0.2028, + "num_input_tokens_seen": 2512832, + "step": 2065 + }, + { + "epoch": 0.23053792181757435, + "grad_norm": 1.7594202756881714, + "learning_rate": 5.760663771021272e-06, + "loss": 0.2895, + "num_input_tokens_seen": 2519200, + "step": 2070 + }, + { + "epoch": 0.23109477670119166, + "grad_norm": 0.17011401057243347, + "learning_rate": 5.774585143111706e-06, + "loss": 0.2187, + "num_input_tokens_seen": 2525344, + "step": 2075 + }, + { + "epoch": 0.231651631584809, + "grad_norm": 3.186101198196411, + "learning_rate": 5.788506515202138e-06, + "loss": 0.2337, + "num_input_tokens_seen": 2531744, + "step": 2080 + }, + { + "epoch": 0.23220848646842632, + "grad_norm": 2.5286998748779297, + "learning_rate": 5.802427887292572e-06, + "loss": 0.2034, + "num_input_tokens_seen": 2537568, + "step": 2085 + }, + { + "epoch": 0.23276534135204366, + "grad_norm": 1.648959994316101, + "learning_rate": 5.816349259383004e-06, + "loss": 0.2277, + "num_input_tokens_seen": 2543456, + "step": 2090 + }, + { + "epoch": 0.233322196235661, + "grad_norm": 0.740899920463562, + "learning_rate": 5.830270631473439e-06, + "loss": 0.5482, + "num_input_tokens_seen": 2549632, + "step": 2095 + }, + { + "epoch": 0.23387905111927831, + "grad_norm": 0.8836609125137329, + "learning_rate": 5.844192003563872e-06, + "loss": 0.2857, + "num_input_tokens_seen": 2555840, + "step": 2100 + }, + { + "epoch": 0.23443590600289566, + "grad_norm": 0.33603760600090027, + "learning_rate": 5.858113375654305e-06, + "loss": 0.1948, + "num_input_tokens_seen": 2561792, + "step": 2105 + }, + { + "epoch": 0.23499276088651297, + "grad_norm": 1.9549570083618164, + "learning_rate": 5.872034747744738e-06, + "loss": 0.4464, + "num_input_tokens_seen": 2567712, + "step": 2110 + }, + { + "epoch": 0.2355496157701303, + "grad_norm": 0.809601366519928, + "learning_rate": 5.885956119835171e-06, + "loss": 0.2498, + "num_input_tokens_seen": 2573664, + "step": 2115 + }, + { + "epoch": 0.23610647065374762, + "grad_norm": 1.262994408607483, + "learning_rate": 5.899877491925604e-06, + "loss": 0.3289, + "num_input_tokens_seen": 2580096, + "step": 2120 + }, + { + "epoch": 0.23666332553736497, + "grad_norm": 0.4227832555770874, + "learning_rate": 5.913798864016038e-06, + "loss": 0.222, + "num_input_tokens_seen": 2586528, + "step": 2125 + }, + { + "epoch": 0.23722018042098228, + "grad_norm": 0.1252857893705368, + "learning_rate": 5.927720236106471e-06, + "loss": 0.1453, + "num_input_tokens_seen": 2592704, + "step": 2130 + }, + { + "epoch": 0.23777703530459962, + "grad_norm": 0.7215713262557983, + "learning_rate": 5.941641608196904e-06, + "loss": 0.2377, + "num_input_tokens_seen": 2598464, + "step": 2135 + }, + { + "epoch": 0.23833389018821696, + "grad_norm": 0.7730120420455933, + "learning_rate": 5.9555629802873375e-06, + "loss": 0.156, + "num_input_tokens_seen": 2604672, + "step": 2140 + }, + { + "epoch": 0.23889074507183428, + "grad_norm": 2.628406047821045, + "learning_rate": 5.969484352377771e-06, + "loss": 0.307, + "num_input_tokens_seen": 2610752, + "step": 2145 + }, + { + "epoch": 0.23944759995545162, + "grad_norm": 2.9289562702178955, + "learning_rate": 5.983405724468204e-06, + "loss": 0.414, + "num_input_tokens_seen": 2616608, + "step": 2150 + }, + { + "epoch": 0.24000445483906893, + "grad_norm": 1.7956265211105347, + "learning_rate": 5.997327096558637e-06, + "loss": 0.33, + "num_input_tokens_seen": 2622592, + "step": 2155 + }, + { + "epoch": 0.24056130972268627, + "grad_norm": 1.233469843864441, + "learning_rate": 6.0112484686490705e-06, + "loss": 0.1804, + "num_input_tokens_seen": 2628704, + "step": 2160 + }, + { + "epoch": 0.2411181646063036, + "grad_norm": 1.1720426082611084, + "learning_rate": 6.025169840739504e-06, + "loss": 0.2862, + "num_input_tokens_seen": 2634752, + "step": 2165 + }, + { + "epoch": 0.24167501948992093, + "grad_norm": 3.4720051288604736, + "learning_rate": 6.039091212829937e-06, + "loss": 0.4216, + "num_input_tokens_seen": 2639936, + "step": 2170 + }, + { + "epoch": 0.24223187437353827, + "grad_norm": 2.0458550453186035, + "learning_rate": 6.05301258492037e-06, + "loss": 0.2962, + "num_input_tokens_seen": 2646048, + "step": 2175 + }, + { + "epoch": 0.24278872925715558, + "grad_norm": 1.4956796169281006, + "learning_rate": 6.066933957010804e-06, + "loss": 0.3452, + "num_input_tokens_seen": 2652128, + "step": 2180 + }, + { + "epoch": 0.24334558414077292, + "grad_norm": 1.5349714756011963, + "learning_rate": 6.080855329101236e-06, + "loss": 0.4808, + "num_input_tokens_seen": 2658112, + "step": 2185 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 2.7815611362457275, + "learning_rate": 6.09477670119167e-06, + "loss": 0.3357, + "num_input_tokens_seen": 2664416, + "step": 2190 + }, + { + "epoch": 0.24445929390800758, + "grad_norm": 1.2132139205932617, + "learning_rate": 6.108698073282103e-06, + "loss": 0.2747, + "num_input_tokens_seen": 2670656, + "step": 2195 + }, + { + "epoch": 0.2450161487916249, + "grad_norm": 0.4514075815677643, + "learning_rate": 6.122619445372537e-06, + "loss": 0.3743, + "num_input_tokens_seen": 2676800, + "step": 2200 + }, + { + "epoch": 0.24557300367524224, + "grad_norm": 1.1132675409317017, + "learning_rate": 6.136540817462969e-06, + "loss": 0.4194, + "num_input_tokens_seen": 2682688, + "step": 2205 + }, + { + "epoch": 0.24612985855885955, + "grad_norm": 2.0165979862213135, + "learning_rate": 6.150462189553403e-06, + "loss": 0.224, + "num_input_tokens_seen": 2688928, + "step": 2210 + }, + { + "epoch": 0.2466867134424769, + "grad_norm": 0.6878224611282349, + "learning_rate": 6.1643835616438354e-06, + "loss": 0.3794, + "num_input_tokens_seen": 2694816, + "step": 2215 + }, + { + "epoch": 0.24724356832609423, + "grad_norm": 0.6265125274658203, + "learning_rate": 6.178304933734269e-06, + "loss": 0.4577, + "num_input_tokens_seen": 2700704, + "step": 2220 + }, + { + "epoch": 0.24780042320971155, + "grad_norm": 1.6023638248443604, + "learning_rate": 6.192226305824702e-06, + "loss": 0.2409, + "num_input_tokens_seen": 2707040, + "step": 2225 + }, + { + "epoch": 0.2483572780933289, + "grad_norm": 1.1873646974563599, + "learning_rate": 6.206147677915136e-06, + "loss": 0.4514, + "num_input_tokens_seen": 2713152, + "step": 2230 + }, + { + "epoch": 0.2489141329769462, + "grad_norm": 2.3736989498138428, + "learning_rate": 6.220069050005569e-06, + "loss": 0.4519, + "num_input_tokens_seen": 2719392, + "step": 2235 + }, + { + "epoch": 0.24947098786056354, + "grad_norm": 0.5313743948936462, + "learning_rate": 6.233990422096002e-06, + "loss": 0.2282, + "num_input_tokens_seen": 2725408, + "step": 2240 + }, + { + "epoch": 0.2500278427441809, + "grad_norm": 0.3209305703639984, + "learning_rate": 6.2479117941864355e-06, + "loss": 0.142, + "num_input_tokens_seen": 2731424, + "step": 2245 + }, + { + "epoch": 0.25058469762779817, + "grad_norm": 1.044787049293518, + "learning_rate": 6.261833166276869e-06, + "loss": 0.3288, + "num_input_tokens_seen": 2737536, + "step": 2250 + }, + { + "epoch": 0.2511415525114155, + "grad_norm": 1.7929795980453491, + "learning_rate": 6.275754538367301e-06, + "loss": 0.2527, + "num_input_tokens_seen": 2743808, + "step": 2255 + }, + { + "epoch": 0.25169840739503285, + "grad_norm": 1.01334547996521, + "learning_rate": 6.289675910457734e-06, + "loss": 0.3567, + "num_input_tokens_seen": 2749952, + "step": 2260 + }, + { + "epoch": 0.2522552622786502, + "grad_norm": 0.8981134295463562, + "learning_rate": 6.3035972825481686e-06, + "loss": 0.3048, + "num_input_tokens_seen": 2755968, + "step": 2265 + }, + { + "epoch": 0.25281211716226754, + "grad_norm": 0.6669492125511169, + "learning_rate": 6.317518654638602e-06, + "loss": 0.2404, + "num_input_tokens_seen": 2762304, + "step": 2270 + }, + { + "epoch": 0.2533689720458848, + "grad_norm": 0.19226616621017456, + "learning_rate": 6.3314400267290355e-06, + "loss": 0.2313, + "num_input_tokens_seen": 2768288, + "step": 2275 + }, + { + "epoch": 0.25392582692950216, + "grad_norm": 1.4508994817733765, + "learning_rate": 6.345361398819467e-06, + "loss": 0.2445, + "num_input_tokens_seen": 2774528, + "step": 2280 + }, + { + "epoch": 0.2544826818131195, + "grad_norm": 1.8482166528701782, + "learning_rate": 6.359282770909901e-06, + "loss": 0.1795, + "num_input_tokens_seen": 2780608, + "step": 2285 + }, + { + "epoch": 0.25503953669673685, + "grad_norm": 1.410054326057434, + "learning_rate": 6.373204143000334e-06, + "loss": 0.16, + "num_input_tokens_seen": 2786944, + "step": 2290 + }, + { + "epoch": 0.25559639158035413, + "grad_norm": 2.009888172149658, + "learning_rate": 6.387125515090768e-06, + "loss": 0.262, + "num_input_tokens_seen": 2793088, + "step": 2295 + }, + { + "epoch": 0.2561532464639715, + "grad_norm": 1.0398377180099487, + "learning_rate": 6.401046887181201e-06, + "loss": 0.31, + "num_input_tokens_seen": 2799264, + "step": 2300 + }, + { + "epoch": 0.2567101013475888, + "grad_norm": 0.31136998534202576, + "learning_rate": 6.414968259271634e-06, + "loss": 0.4499, + "num_input_tokens_seen": 2805376, + "step": 2305 + }, + { + "epoch": 0.25726695623120616, + "grad_norm": 3.273339033126831, + "learning_rate": 6.428889631362067e-06, + "loss": 0.5246, + "num_input_tokens_seen": 2810784, + "step": 2310 + }, + { + "epoch": 0.2578238111148235, + "grad_norm": 1.0906280279159546, + "learning_rate": 6.442811003452501e-06, + "loss": 0.4321, + "num_input_tokens_seen": 2816896, + "step": 2315 + }, + { + "epoch": 0.2583806659984408, + "grad_norm": 1.9256640672683716, + "learning_rate": 6.456732375542934e-06, + "loss": 0.2758, + "num_input_tokens_seen": 2823328, + "step": 2320 + }, + { + "epoch": 0.2589375208820581, + "grad_norm": 1.1120234727859497, + "learning_rate": 6.470653747633366e-06, + "loss": 0.2443, + "num_input_tokens_seen": 2829504, + "step": 2325 + }, + { + "epoch": 0.25949437576567547, + "grad_norm": 1.2909839153289795, + "learning_rate": 6.4845751197238e-06, + "loss": 0.2656, + "num_input_tokens_seen": 2835744, + "step": 2330 + }, + { + "epoch": 0.2600512306492928, + "grad_norm": 0.7562479376792908, + "learning_rate": 6.498496491814234e-06, + "loss": 0.3716, + "num_input_tokens_seen": 2841856, + "step": 2335 + }, + { + "epoch": 0.26060808553291015, + "grad_norm": 1.8445113897323608, + "learning_rate": 6.512417863904667e-06, + "loss": 0.3479, + "num_input_tokens_seen": 2848032, + "step": 2340 + }, + { + "epoch": 0.26116494041652744, + "grad_norm": 2.0285747051239014, + "learning_rate": 6.526339235995101e-06, + "loss": 0.2981, + "num_input_tokens_seen": 2854176, + "step": 2345 + }, + { + "epoch": 0.2617217953001448, + "grad_norm": 0.722957968711853, + "learning_rate": 6.540260608085533e-06, + "loss": 0.1601, + "num_input_tokens_seen": 2860448, + "step": 2350 + }, + { + "epoch": 0.2622786501837621, + "grad_norm": 3.032172203063965, + "learning_rate": 6.554181980175966e-06, + "loss": 0.4224, + "num_input_tokens_seen": 2866688, + "step": 2355 + }, + { + "epoch": 0.26283550506737946, + "grad_norm": 4.072173118591309, + "learning_rate": 6.5681033522664e-06, + "loss": 0.5447, + "num_input_tokens_seen": 2872928, + "step": 2360 + }, + { + "epoch": 0.26339235995099675, + "grad_norm": 2.4889578819274902, + "learning_rate": 6.582024724356833e-06, + "loss": 0.6743, + "num_input_tokens_seen": 2879136, + "step": 2365 + }, + { + "epoch": 0.2639492148346141, + "grad_norm": 1.3339427709579468, + "learning_rate": 6.595946096447266e-06, + "loss": 0.1885, + "num_input_tokens_seen": 2885152, + "step": 2370 + }, + { + "epoch": 0.26450606971823143, + "grad_norm": 1.8284415006637573, + "learning_rate": 6.609867468537699e-06, + "loss": 0.2115, + "num_input_tokens_seen": 2891296, + "step": 2375 + }, + { + "epoch": 0.26506292460184877, + "grad_norm": 0.23118634521961212, + "learning_rate": 6.623788840628133e-06, + "loss": 0.2728, + "num_input_tokens_seen": 2897696, + "step": 2380 + }, + { + "epoch": 0.2656197794854661, + "grad_norm": 0.4288792312145233, + "learning_rate": 6.637710212718566e-06, + "loss": 0.3382, + "num_input_tokens_seen": 2903456, + "step": 2385 + }, + { + "epoch": 0.2661766343690834, + "grad_norm": 2.2011122703552246, + "learning_rate": 6.651631584809e-06, + "loss": 0.429, + "num_input_tokens_seen": 2909632, + "step": 2390 + }, + { + "epoch": 0.26673348925270074, + "grad_norm": 2.389209270477295, + "learning_rate": 6.6655529568994315e-06, + "loss": 0.4282, + "num_input_tokens_seen": 2915872, + "step": 2395 + }, + { + "epoch": 0.2672903441363181, + "grad_norm": 3.1367506980895996, + "learning_rate": 6.679474328989865e-06, + "loss": 0.4122, + "num_input_tokens_seen": 2921920, + "step": 2400 + }, + { + "epoch": 0.2678471990199354, + "grad_norm": 1.1286373138427734, + "learning_rate": 6.693395701080299e-06, + "loss": 0.0986, + "num_input_tokens_seen": 2928544, + "step": 2405 + }, + { + "epoch": 0.2684040539035527, + "grad_norm": 2.718824625015259, + "learning_rate": 6.707317073170733e-06, + "loss": 0.2573, + "num_input_tokens_seen": 2934720, + "step": 2410 + }, + { + "epoch": 0.26896090878717005, + "grad_norm": 1.8296968936920166, + "learning_rate": 6.7212384452611645e-06, + "loss": 0.5394, + "num_input_tokens_seen": 2940768, + "step": 2415 + }, + { + "epoch": 0.2695177636707874, + "grad_norm": 0.7156379818916321, + "learning_rate": 6.735159817351598e-06, + "loss": 0.1876, + "num_input_tokens_seen": 2946784, + "step": 2420 + }, + { + "epoch": 0.27007461855440473, + "grad_norm": 1.240393042564392, + "learning_rate": 6.7490811894420315e-06, + "loss": 0.425, + "num_input_tokens_seen": 2952640, + "step": 2425 + }, + { + "epoch": 0.2706314734380221, + "grad_norm": 1.4177473783493042, + "learning_rate": 6.763002561532465e-06, + "loss": 0.2493, + "num_input_tokens_seen": 2958528, + "step": 2430 + }, + { + "epoch": 0.27118832832163936, + "grad_norm": 0.6223466396331787, + "learning_rate": 6.7769239336228985e-06, + "loss": 0.232, + "num_input_tokens_seen": 2964224, + "step": 2435 + }, + { + "epoch": 0.2717451832052567, + "grad_norm": 0.9834815263748169, + "learning_rate": 6.790845305713331e-06, + "loss": 0.2404, + "num_input_tokens_seen": 2970624, + "step": 2440 + }, + { + "epoch": 0.27230203808887404, + "grad_norm": 2.6498169898986816, + "learning_rate": 6.804766677803765e-06, + "loss": 0.2974, + "num_input_tokens_seen": 2976512, + "step": 2445 + }, + { + "epoch": 0.2728588929724914, + "grad_norm": 0.5572649240493774, + "learning_rate": 6.818688049894198e-06, + "loss": 0.2187, + "num_input_tokens_seen": 2982656, + "step": 2450 + }, + { + "epoch": 0.27341574785610867, + "grad_norm": 0.42733249068260193, + "learning_rate": 6.8326094219846315e-06, + "loss": 0.3449, + "num_input_tokens_seen": 2988672, + "step": 2455 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 1.5393145084381104, + "learning_rate": 6.846530794075065e-06, + "loss": 0.2355, + "num_input_tokens_seen": 2994752, + "step": 2460 + }, + { + "epoch": 0.27452945762334335, + "grad_norm": 2.199643850326538, + "learning_rate": 6.860452166165497e-06, + "loss": 0.3406, + "num_input_tokens_seen": 3000768, + "step": 2465 + }, + { + "epoch": 0.2750863125069607, + "grad_norm": 1.2526769638061523, + "learning_rate": 6.874373538255931e-06, + "loss": 0.2966, + "num_input_tokens_seen": 3006912, + "step": 2470 + }, + { + "epoch": 0.27564316739057804, + "grad_norm": 1.551451563835144, + "learning_rate": 6.888294910346365e-06, + "loss": 0.3812, + "num_input_tokens_seen": 3012800, + "step": 2475 + }, + { + "epoch": 0.2762000222741953, + "grad_norm": 0.5357726812362671, + "learning_rate": 6.902216282436798e-06, + "loss": 0.2706, + "num_input_tokens_seen": 3019104, + "step": 2480 + }, + { + "epoch": 0.27675687715781266, + "grad_norm": 0.5878213047981262, + "learning_rate": 6.91613765452723e-06, + "loss": 0.1591, + "num_input_tokens_seen": 3025440, + "step": 2485 + }, + { + "epoch": 0.27731373204143, + "grad_norm": 1.3035850524902344, + "learning_rate": 6.930059026617663e-06, + "loss": 0.2215, + "num_input_tokens_seen": 3031456, + "step": 2490 + }, + { + "epoch": 0.27787058692504735, + "grad_norm": 0.5769200921058655, + "learning_rate": 6.943980398708097e-06, + "loss": 0.1187, + "num_input_tokens_seen": 3037664, + "step": 2495 + }, + { + "epoch": 0.2784274418086647, + "grad_norm": 0.48183852434158325, + "learning_rate": 6.95790177079853e-06, + "loss": 0.1668, + "num_input_tokens_seen": 3043968, + "step": 2500 + }, + { + "epoch": 0.278984296692282, + "grad_norm": 1.7592926025390625, + "learning_rate": 6.971823142888964e-06, + "loss": 0.2758, + "num_input_tokens_seen": 3049888, + "step": 2505 + }, + { + "epoch": 0.2795411515758993, + "grad_norm": 2.4037277698516846, + "learning_rate": 6.9857445149793965e-06, + "loss": 0.2687, + "num_input_tokens_seen": 3056096, + "step": 2510 + }, + { + "epoch": 0.28009800645951666, + "grad_norm": 0.7769395709037781, + "learning_rate": 6.99966588706983e-06, + "loss": 0.4114, + "num_input_tokens_seen": 3062464, + "step": 2515 + }, + { + "epoch": 0.280654861343134, + "grad_norm": 2.2154951095581055, + "learning_rate": 7.013587259160263e-06, + "loss": 0.2778, + "num_input_tokens_seen": 3068736, + "step": 2520 + }, + { + "epoch": 0.2812117162267513, + "grad_norm": 2.7151055335998535, + "learning_rate": 7.027508631250697e-06, + "loss": 0.3461, + "num_input_tokens_seen": 3075008, + "step": 2525 + }, + { + "epoch": 0.2817685711103686, + "grad_norm": 1.5332056283950806, + "learning_rate": 7.041430003341129e-06, + "loss": 0.1142, + "num_input_tokens_seen": 3081088, + "step": 2530 + }, + { + "epoch": 0.28232542599398597, + "grad_norm": 1.5105167627334595, + "learning_rate": 7.055351375431562e-06, + "loss": 0.2062, + "num_input_tokens_seen": 3086432, + "step": 2535 + }, + { + "epoch": 0.2828822808776033, + "grad_norm": 0.3379823863506317, + "learning_rate": 7.0692727475219965e-06, + "loss": 0.1361, + "num_input_tokens_seen": 3092576, + "step": 2540 + }, + { + "epoch": 0.28343913576122065, + "grad_norm": 2.3745064735412598, + "learning_rate": 7.08319411961243e-06, + "loss": 0.27, + "num_input_tokens_seen": 3098560, + "step": 2545 + }, + { + "epoch": 0.28399599064483794, + "grad_norm": 2.3583462238311768, + "learning_rate": 7.0971154917028635e-06, + "loss": 0.1745, + "num_input_tokens_seen": 3104672, + "step": 2550 + }, + { + "epoch": 0.2845528455284553, + "grad_norm": 1.3057531118392944, + "learning_rate": 7.111036863793295e-06, + "loss": 0.3052, + "num_input_tokens_seen": 3110656, + "step": 2555 + }, + { + "epoch": 0.2851097004120726, + "grad_norm": 1.1146557331085205, + "learning_rate": 7.124958235883729e-06, + "loss": 0.2145, + "num_input_tokens_seen": 3116832, + "step": 2560 + }, + { + "epoch": 0.28566655529568996, + "grad_norm": 1.298332929611206, + "learning_rate": 7.138879607974162e-06, + "loss": 0.2492, + "num_input_tokens_seen": 3122208, + "step": 2565 + }, + { + "epoch": 0.28622341017930725, + "grad_norm": 0.8222976326942444, + "learning_rate": 7.152800980064596e-06, + "loss": 0.2462, + "num_input_tokens_seen": 3128672, + "step": 2570 + }, + { + "epoch": 0.2867802650629246, + "grad_norm": 1.1198025941848755, + "learning_rate": 7.166722352155028e-06, + "loss": 0.1866, + "num_input_tokens_seen": 3134880, + "step": 2575 + }, + { + "epoch": 0.28733711994654193, + "grad_norm": 0.786170482635498, + "learning_rate": 7.180643724245462e-06, + "loss": 0.2711, + "num_input_tokens_seen": 3140864, + "step": 2580 + }, + { + "epoch": 0.28789397483015927, + "grad_norm": 1.9203596115112305, + "learning_rate": 7.194565096335895e-06, + "loss": 0.3834, + "num_input_tokens_seen": 3146688, + "step": 2585 + }, + { + "epoch": 0.2884508297137766, + "grad_norm": 2.396845579147339, + "learning_rate": 7.208486468426329e-06, + "loss": 0.4031, + "num_input_tokens_seen": 3152832, + "step": 2590 + }, + { + "epoch": 0.2890076845973939, + "grad_norm": 0.22957177460193634, + "learning_rate": 7.222407840516762e-06, + "loss": 0.2274, + "num_input_tokens_seen": 3158752, + "step": 2595 + }, + { + "epoch": 0.28956453948101124, + "grad_norm": 0.7112821936607361, + "learning_rate": 7.236329212607194e-06, + "loss": 0.36, + "num_input_tokens_seen": 3164544, + "step": 2600 + }, + { + "epoch": 0.2901213943646286, + "grad_norm": 0.3911186754703522, + "learning_rate": 7.2502505846976275e-06, + "loss": 0.1954, + "num_input_tokens_seen": 3170784, + "step": 2605 + }, + { + "epoch": 0.2906782492482459, + "grad_norm": 1.80836820602417, + "learning_rate": 7.264171956788062e-06, + "loss": 0.188, + "num_input_tokens_seen": 3176768, + "step": 2610 + }, + { + "epoch": 0.2912351041318632, + "grad_norm": 1.3407224416732788, + "learning_rate": 7.278093328878495e-06, + "loss": 0.2819, + "num_input_tokens_seen": 3183072, + "step": 2615 + }, + { + "epoch": 0.29179195901548055, + "grad_norm": 0.9769819378852844, + "learning_rate": 7.292014700968929e-06, + "loss": 0.2872, + "num_input_tokens_seen": 3189280, + "step": 2620 + }, + { + "epoch": 0.2923488138990979, + "grad_norm": 2.2077622413635254, + "learning_rate": 7.305936073059361e-06, + "loss": 0.1567, + "num_input_tokens_seen": 3195424, + "step": 2625 + }, + { + "epoch": 0.29290566878271523, + "grad_norm": 2.18583345413208, + "learning_rate": 7.319857445149794e-06, + "loss": 0.3243, + "num_input_tokens_seen": 3201568, + "step": 2630 + }, + { + "epoch": 0.2934625236663326, + "grad_norm": 1.749873399734497, + "learning_rate": 7.3337788172402276e-06, + "loss": 0.2543, + "num_input_tokens_seen": 3207680, + "step": 2635 + }, + { + "epoch": 0.29401937854994986, + "grad_norm": 2.590536594390869, + "learning_rate": 7.347700189330661e-06, + "loss": 0.3406, + "num_input_tokens_seen": 3213824, + "step": 2640 + }, + { + "epoch": 0.2945762334335672, + "grad_norm": 0.22425922751426697, + "learning_rate": 7.361621561421094e-06, + "loss": 0.3308, + "num_input_tokens_seen": 3219904, + "step": 2645 + }, + { + "epoch": 0.29513308831718454, + "grad_norm": 1.3067256212234497, + "learning_rate": 7.375542933511527e-06, + "loss": 0.3364, + "num_input_tokens_seen": 3226240, + "step": 2650 + }, + { + "epoch": 0.2956899432008019, + "grad_norm": 1.0845035314559937, + "learning_rate": 7.389464305601961e-06, + "loss": 0.308, + "num_input_tokens_seen": 3232352, + "step": 2655 + }, + { + "epoch": 0.2962467980844192, + "grad_norm": 1.9145153760910034, + "learning_rate": 7.403385677692394e-06, + "loss": 0.2424, + "num_input_tokens_seen": 3238368, + "step": 2660 + }, + { + "epoch": 0.2968036529680365, + "grad_norm": 2.9116218090057373, + "learning_rate": 7.417307049782828e-06, + "loss": 0.5633, + "num_input_tokens_seen": 3244384, + "step": 2665 + }, + { + "epoch": 0.29736050785165385, + "grad_norm": 0.10074030607938766, + "learning_rate": 7.431228421873259e-06, + "loss": 0.2428, + "num_input_tokens_seen": 3250656, + "step": 2670 + }, + { + "epoch": 0.2979173627352712, + "grad_norm": 0.9165312647819519, + "learning_rate": 7.445149793963693e-06, + "loss": 0.3951, + "num_input_tokens_seen": 3256416, + "step": 2675 + }, + { + "epoch": 0.29847421761888854, + "grad_norm": 1.7415646314620972, + "learning_rate": 7.459071166054127e-06, + "loss": 0.2854, + "num_input_tokens_seen": 3262304, + "step": 2680 + }, + { + "epoch": 0.2990310725025058, + "grad_norm": 0.5864146947860718, + "learning_rate": 7.472992538144561e-06, + "loss": 0.2799, + "num_input_tokens_seen": 3268448, + "step": 2685 + }, + { + "epoch": 0.29958792738612317, + "grad_norm": 2.032588005065918, + "learning_rate": 7.4869139102349925e-06, + "loss": 0.5888, + "num_input_tokens_seen": 3274144, + "step": 2690 + }, + { + "epoch": 0.3001447822697405, + "grad_norm": 0.5951268076896667, + "learning_rate": 7.500835282325426e-06, + "loss": 0.265, + "num_input_tokens_seen": 3280288, + "step": 2695 + }, + { + "epoch": 0.30070163715335785, + "grad_norm": 1.5689976215362549, + "learning_rate": 7.5147566544158594e-06, + "loss": 0.1398, + "num_input_tokens_seen": 3286592, + "step": 2700 + }, + { + "epoch": 0.3012584920369752, + "grad_norm": 1.5205535888671875, + "learning_rate": 7.528678026506293e-06, + "loss": 0.172, + "num_input_tokens_seen": 3292224, + "step": 2705 + }, + { + "epoch": 0.3018153469205925, + "grad_norm": 2.3981435298919678, + "learning_rate": 7.542599398596726e-06, + "loss": 0.3109, + "num_input_tokens_seen": 3298208, + "step": 2710 + }, + { + "epoch": 0.3023722018042098, + "grad_norm": 0.46218377351760864, + "learning_rate": 7.556520770687159e-06, + "loss": 0.3498, + "num_input_tokens_seen": 3304256, + "step": 2715 + }, + { + "epoch": 0.30292905668782716, + "grad_norm": 1.4197802543640137, + "learning_rate": 7.5704421427775925e-06, + "loss": 0.3067, + "num_input_tokens_seen": 3310112, + "step": 2720 + }, + { + "epoch": 0.3034859115714445, + "grad_norm": 1.8295069932937622, + "learning_rate": 7.584363514868026e-06, + "loss": 0.2908, + "num_input_tokens_seen": 3316448, + "step": 2725 + }, + { + "epoch": 0.3040427664550618, + "grad_norm": 1.8302932977676392, + "learning_rate": 7.5982848869584595e-06, + "loss": 0.2722, + "num_input_tokens_seen": 3322400, + "step": 2730 + }, + { + "epoch": 0.3045996213386791, + "grad_norm": 1.8409534692764282, + "learning_rate": 7.612206259048891e-06, + "loss": 0.3382, + "num_input_tokens_seen": 3328288, + "step": 2735 + }, + { + "epoch": 0.30515647622229647, + "grad_norm": 3.228813409805298, + "learning_rate": 7.626127631139325e-06, + "loss": 0.4465, + "num_input_tokens_seen": 3334368, + "step": 2740 + }, + { + "epoch": 0.3057133311059138, + "grad_norm": 0.256478488445282, + "learning_rate": 7.640049003229759e-06, + "loss": 0.1152, + "num_input_tokens_seen": 3340288, + "step": 2745 + }, + { + "epoch": 0.30627018598953115, + "grad_norm": 0.9436787962913513, + "learning_rate": 7.653970375320192e-06, + "loss": 0.2432, + "num_input_tokens_seen": 3346624, + "step": 2750 + }, + { + "epoch": 0.30682704087314844, + "grad_norm": 1.2897807359695435, + "learning_rate": 7.667891747410626e-06, + "loss": 0.2729, + "num_input_tokens_seen": 3352384, + "step": 2755 + }, + { + "epoch": 0.3073838957567658, + "grad_norm": 1.3238365650177002, + "learning_rate": 7.681813119501059e-06, + "loss": 0.4201, + "num_input_tokens_seen": 3358464, + "step": 2760 + }, + { + "epoch": 0.3079407506403831, + "grad_norm": 0.7841054797172546, + "learning_rate": 7.695734491591491e-06, + "loss": 0.3038, + "num_input_tokens_seen": 3364544, + "step": 2765 + }, + { + "epoch": 0.30849760552400046, + "grad_norm": 0.7098439931869507, + "learning_rate": 7.709655863681926e-06, + "loss": 0.1337, + "num_input_tokens_seen": 3370912, + "step": 2770 + }, + { + "epoch": 0.30905446040761775, + "grad_norm": 1.7491772174835205, + "learning_rate": 7.723577235772358e-06, + "loss": 0.2166, + "num_input_tokens_seen": 3376960, + "step": 2775 + }, + { + "epoch": 0.3096113152912351, + "grad_norm": 1.8499698638916016, + "learning_rate": 7.737498607862793e-06, + "loss": 0.1899, + "num_input_tokens_seen": 3383328, + "step": 2780 + }, + { + "epoch": 0.31016817017485243, + "grad_norm": 1.237229585647583, + "learning_rate": 7.751419979953224e-06, + "loss": 0.1623, + "num_input_tokens_seen": 3389376, + "step": 2785 + }, + { + "epoch": 0.3107250250584698, + "grad_norm": 2.8543827533721924, + "learning_rate": 7.765341352043658e-06, + "loss": 0.5435, + "num_input_tokens_seen": 3395616, + "step": 2790 + }, + { + "epoch": 0.3112818799420871, + "grad_norm": 0.2752024233341217, + "learning_rate": 7.77926272413409e-06, + "loss": 0.3354, + "num_input_tokens_seen": 3401184, + "step": 2795 + }, + { + "epoch": 0.3118387348257044, + "grad_norm": 0.24956931173801422, + "learning_rate": 7.793184096224525e-06, + "loss": 0.2133, + "num_input_tokens_seen": 3407584, + "step": 2800 + }, + { + "epoch": 0.31239558970932174, + "grad_norm": 0.7161419987678528, + "learning_rate": 7.807105468314957e-06, + "loss": 0.2072, + "num_input_tokens_seen": 3414016, + "step": 2805 + }, + { + "epoch": 0.3129524445929391, + "grad_norm": 0.3505074381828308, + "learning_rate": 7.82102684040539e-06, + "loss": 0.247, + "num_input_tokens_seen": 3419424, + "step": 2810 + }, + { + "epoch": 0.3135092994765564, + "grad_norm": 1.6389437913894653, + "learning_rate": 7.834948212495824e-06, + "loss": 0.2684, + "num_input_tokens_seen": 3425312, + "step": 2815 + }, + { + "epoch": 0.31406615436017377, + "grad_norm": 0.9832606315612793, + "learning_rate": 7.848869584586257e-06, + "loss": 0.1868, + "num_input_tokens_seen": 3431392, + "step": 2820 + }, + { + "epoch": 0.31462300924379105, + "grad_norm": 1.9220595359802246, + "learning_rate": 7.862790956676691e-06, + "loss": 0.379, + "num_input_tokens_seen": 3437856, + "step": 2825 + }, + { + "epoch": 0.3151798641274084, + "grad_norm": 0.07609865069389343, + "learning_rate": 7.876712328767124e-06, + "loss": 0.3115, + "num_input_tokens_seen": 3444032, + "step": 2830 + }, + { + "epoch": 0.31573671901102573, + "grad_norm": 1.401044249534607, + "learning_rate": 7.890633700857557e-06, + "loss": 0.2421, + "num_input_tokens_seen": 3450112, + "step": 2835 + }, + { + "epoch": 0.3162935738946431, + "grad_norm": 1.4166446924209595, + "learning_rate": 7.904555072947991e-06, + "loss": 0.1707, + "num_input_tokens_seen": 3456352, + "step": 2840 + }, + { + "epoch": 0.31685042877826036, + "grad_norm": 0.6069604754447937, + "learning_rate": 7.918476445038424e-06, + "loss": 0.4818, + "num_input_tokens_seen": 3462528, + "step": 2845 + }, + { + "epoch": 0.3174072836618777, + "grad_norm": 1.1037057638168335, + "learning_rate": 7.932397817128856e-06, + "loss": 0.3631, + "num_input_tokens_seen": 3468800, + "step": 2850 + }, + { + "epoch": 0.31796413854549505, + "grad_norm": 1.803320288658142, + "learning_rate": 7.946319189219289e-06, + "loss": 0.3282, + "num_input_tokens_seen": 3475040, + "step": 2855 + }, + { + "epoch": 0.3185209934291124, + "grad_norm": 1.5379985570907593, + "learning_rate": 7.960240561309723e-06, + "loss": 0.2156, + "num_input_tokens_seen": 3481472, + "step": 2860 + }, + { + "epoch": 0.31907784831272973, + "grad_norm": 1.3155170679092407, + "learning_rate": 7.974161933400156e-06, + "loss": 0.3753, + "num_input_tokens_seen": 3487424, + "step": 2865 + }, + { + "epoch": 0.319634703196347, + "grad_norm": 1.1949843168258667, + "learning_rate": 7.98808330549059e-06, + "loss": 0.2551, + "num_input_tokens_seen": 3493568, + "step": 2870 + }, + { + "epoch": 0.32019155807996436, + "grad_norm": 2.1334147453308105, + "learning_rate": 8.002004677581023e-06, + "loss": 0.3504, + "num_input_tokens_seen": 3499744, + "step": 2875 + }, + { + "epoch": 0.3207484129635817, + "grad_norm": 1.2058050632476807, + "learning_rate": 8.015926049671455e-06, + "loss": 0.2441, + "num_input_tokens_seen": 3506016, + "step": 2880 + }, + { + "epoch": 0.32130526784719904, + "grad_norm": 2.1410932540893555, + "learning_rate": 8.02984742176189e-06, + "loss": 0.1696, + "num_input_tokens_seen": 3512096, + "step": 2885 + }, + { + "epoch": 0.3218621227308163, + "grad_norm": 3.751163959503174, + "learning_rate": 8.043768793852322e-06, + "loss": 0.4023, + "num_input_tokens_seen": 3518080, + "step": 2890 + }, + { + "epoch": 0.32241897761443367, + "grad_norm": 0.38438671827316284, + "learning_rate": 8.057690165942755e-06, + "loss": 0.1712, + "num_input_tokens_seen": 3524064, + "step": 2895 + }, + { + "epoch": 0.322975832498051, + "grad_norm": 1.3394964933395386, + "learning_rate": 8.07161153803319e-06, + "loss": 0.2008, + "num_input_tokens_seen": 3529984, + "step": 2900 + }, + { + "epoch": 0.32353268738166835, + "grad_norm": 1.66836416721344, + "learning_rate": 8.085532910123622e-06, + "loss": 0.3096, + "num_input_tokens_seen": 3536128, + "step": 2905 + }, + { + "epoch": 0.3240895422652857, + "grad_norm": 3.527578115463257, + "learning_rate": 8.099454282214056e-06, + "loss": 0.4592, + "num_input_tokens_seen": 3542208, + "step": 2910 + }, + { + "epoch": 0.324646397148903, + "grad_norm": 2.6074869632720947, + "learning_rate": 8.113375654304489e-06, + "loss": 0.468, + "num_input_tokens_seen": 3548032, + "step": 2915 + }, + { + "epoch": 0.3252032520325203, + "grad_norm": 0.6735188364982605, + "learning_rate": 8.127297026394922e-06, + "loss": 0.1414, + "num_input_tokens_seen": 3553952, + "step": 2920 + }, + { + "epoch": 0.32576010691613766, + "grad_norm": 1.7335025072097778, + "learning_rate": 8.141218398485354e-06, + "loss": 0.3593, + "num_input_tokens_seen": 3560096, + "step": 2925 + }, + { + "epoch": 0.326316961799755, + "grad_norm": 0.9691980481147766, + "learning_rate": 8.155139770575789e-06, + "loss": 0.3994, + "num_input_tokens_seen": 3565728, + "step": 2930 + }, + { + "epoch": 0.3268738166833723, + "grad_norm": 0.5928093194961548, + "learning_rate": 8.169061142666221e-06, + "loss": 0.5558, + "num_input_tokens_seen": 3571712, + "step": 2935 + }, + { + "epoch": 0.32743067156698963, + "grad_norm": 0.21612459421157837, + "learning_rate": 8.182982514756654e-06, + "loss": 0.2708, + "num_input_tokens_seen": 3578016, + "step": 2940 + }, + { + "epoch": 0.32798752645060697, + "grad_norm": 1.0559505224227905, + "learning_rate": 8.196903886847088e-06, + "loss": 0.249, + "num_input_tokens_seen": 3584128, + "step": 2945 + }, + { + "epoch": 0.3285443813342243, + "grad_norm": 1.2666972875595093, + "learning_rate": 8.21082525893752e-06, + "loss": 0.1339, + "num_input_tokens_seen": 3590208, + "step": 2950 + }, + { + "epoch": 0.32910123621784165, + "grad_norm": 0.053379882127046585, + "learning_rate": 8.224746631027955e-06, + "loss": 0.5016, + "num_input_tokens_seen": 3596256, + "step": 2955 + }, + { + "epoch": 0.32965809110145894, + "grad_norm": 0.5231803059577942, + "learning_rate": 8.238668003118388e-06, + "loss": 0.2236, + "num_input_tokens_seen": 3601824, + "step": 2960 + }, + { + "epoch": 0.3302149459850763, + "grad_norm": 0.6615619659423828, + "learning_rate": 8.25258937520882e-06, + "loss": 0.2805, + "num_input_tokens_seen": 3607648, + "step": 2965 + }, + { + "epoch": 0.3307718008686936, + "grad_norm": 3.315159320831299, + "learning_rate": 8.266510747299255e-06, + "loss": 0.3487, + "num_input_tokens_seen": 3613408, + "step": 2970 + }, + { + "epoch": 0.33132865575231096, + "grad_norm": 1.9256283044815063, + "learning_rate": 8.280432119389687e-06, + "loss": 0.2066, + "num_input_tokens_seen": 3619520, + "step": 2975 + }, + { + "epoch": 0.3318855106359283, + "grad_norm": 3.389054775238037, + "learning_rate": 8.294353491480122e-06, + "loss": 0.2673, + "num_input_tokens_seen": 3625408, + "step": 2980 + }, + { + "epoch": 0.3324423655195456, + "grad_norm": 1.125272512435913, + "learning_rate": 8.308274863570554e-06, + "loss": 0.222, + "num_input_tokens_seen": 3631648, + "step": 2985 + }, + { + "epoch": 0.33299922040316293, + "grad_norm": 2.0870280265808105, + "learning_rate": 8.322196235660987e-06, + "loss": 0.2399, + "num_input_tokens_seen": 3637792, + "step": 2990 + }, + { + "epoch": 0.3335560752867803, + "grad_norm": 2.7922213077545166, + "learning_rate": 8.33611760775142e-06, + "loss": 0.3932, + "num_input_tokens_seen": 3643584, + "step": 2995 + }, + { + "epoch": 0.3341129301703976, + "grad_norm": 1.0679261684417725, + "learning_rate": 8.350038979841854e-06, + "loss": 0.3213, + "num_input_tokens_seen": 3649728, + "step": 3000 + }, + { + "epoch": 0.3346697850540149, + "grad_norm": 1.0313762426376343, + "learning_rate": 8.363960351932287e-06, + "loss": 0.1328, + "num_input_tokens_seen": 3655840, + "step": 3005 + }, + { + "epoch": 0.33522663993763224, + "grad_norm": 2.667285203933716, + "learning_rate": 8.37788172402272e-06, + "loss": 0.3751, + "num_input_tokens_seen": 3662304, + "step": 3010 + }, + { + "epoch": 0.3357834948212496, + "grad_norm": 0.08005350828170776, + "learning_rate": 8.391803096113154e-06, + "loss": 0.1231, + "num_input_tokens_seen": 3668192, + "step": 3015 + }, + { + "epoch": 0.3363403497048669, + "grad_norm": 2.760439157485962, + "learning_rate": 8.405724468203586e-06, + "loss": 0.2123, + "num_input_tokens_seen": 3674368, + "step": 3020 + }, + { + "epoch": 0.33689720458848427, + "grad_norm": 1.5933624505996704, + "learning_rate": 8.41964584029402e-06, + "loss": 0.4288, + "num_input_tokens_seen": 3680352, + "step": 3025 + }, + { + "epoch": 0.33745405947210155, + "grad_norm": 0.5474730134010315, + "learning_rate": 8.433567212384453e-06, + "loss": 0.1718, + "num_input_tokens_seen": 3686464, + "step": 3030 + }, + { + "epoch": 0.3380109143557189, + "grad_norm": 2.012233018875122, + "learning_rate": 8.447488584474886e-06, + "loss": 0.3923, + "num_input_tokens_seen": 3692640, + "step": 3035 + }, + { + "epoch": 0.33856776923933624, + "grad_norm": 3.634716033935547, + "learning_rate": 8.46140995656532e-06, + "loss": 0.4503, + "num_input_tokens_seen": 3698048, + "step": 3040 + }, + { + "epoch": 0.3391246241229536, + "grad_norm": 1.3372794389724731, + "learning_rate": 8.475331328655753e-06, + "loss": 0.1652, + "num_input_tokens_seen": 3704128, + "step": 3045 + }, + { + "epoch": 0.33968147900657086, + "grad_norm": 1.0612248182296753, + "learning_rate": 8.489252700746187e-06, + "loss": 0.3817, + "num_input_tokens_seen": 3710688, + "step": 3050 + }, + { + "epoch": 0.3402383338901882, + "grad_norm": 0.8430193662643433, + "learning_rate": 8.503174072836618e-06, + "loss": 0.4675, + "num_input_tokens_seen": 3716736, + "step": 3055 + }, + { + "epoch": 0.34079518877380555, + "grad_norm": 2.4698867797851562, + "learning_rate": 8.517095444927052e-06, + "loss": 0.3691, + "num_input_tokens_seen": 3722496, + "step": 3060 + }, + { + "epoch": 0.3413520436574229, + "grad_norm": 1.44895339012146, + "learning_rate": 8.531016817017485e-06, + "loss": 0.298, + "num_input_tokens_seen": 3728608, + "step": 3065 + }, + { + "epoch": 0.34190889854104023, + "grad_norm": 1.657004475593567, + "learning_rate": 8.54493818910792e-06, + "loss": 0.1788, + "num_input_tokens_seen": 3734688, + "step": 3070 + }, + { + "epoch": 0.3424657534246575, + "grad_norm": 0.2694631814956665, + "learning_rate": 8.558859561198352e-06, + "loss": 0.2933, + "num_input_tokens_seen": 3741056, + "step": 3075 + }, + { + "epoch": 0.34302260830827486, + "grad_norm": 1.4989259243011475, + "learning_rate": 8.572780933288785e-06, + "loss": 0.1969, + "num_input_tokens_seen": 3747168, + "step": 3080 + }, + { + "epoch": 0.3435794631918922, + "grad_norm": 1.121235728263855, + "learning_rate": 8.586702305379219e-06, + "loss": 0.2419, + "num_input_tokens_seen": 3753440, + "step": 3085 + }, + { + "epoch": 0.34413631807550954, + "grad_norm": 1.1034941673278809, + "learning_rate": 8.600623677469651e-06, + "loss": 0.193, + "num_input_tokens_seen": 3759584, + "step": 3090 + }, + { + "epoch": 0.3446931729591268, + "grad_norm": 1.73140287399292, + "learning_rate": 8.614545049560086e-06, + "loss": 0.1318, + "num_input_tokens_seen": 3765856, + "step": 3095 + }, + { + "epoch": 0.34525002784274417, + "grad_norm": 1.1794005632400513, + "learning_rate": 8.628466421650517e-06, + "loss": 0.451, + "num_input_tokens_seen": 3772224, + "step": 3100 + }, + { + "epoch": 0.3458068827263615, + "grad_norm": 4.398324012756348, + "learning_rate": 8.642387793740951e-06, + "loss": 0.3598, + "num_input_tokens_seen": 3778400, + "step": 3105 + }, + { + "epoch": 0.34636373760997885, + "grad_norm": 0.5415234565734863, + "learning_rate": 8.656309165831385e-06, + "loss": 0.1356, + "num_input_tokens_seen": 3784608, + "step": 3110 + }, + { + "epoch": 0.3469205924935962, + "grad_norm": 0.972968339920044, + "learning_rate": 8.670230537921818e-06, + "loss": 0.1591, + "num_input_tokens_seen": 3790688, + "step": 3115 + }, + { + "epoch": 0.3474774473772135, + "grad_norm": 1.4655177593231201, + "learning_rate": 8.684151910012252e-06, + "loss": 0.2666, + "num_input_tokens_seen": 3796864, + "step": 3120 + }, + { + "epoch": 0.3480343022608308, + "grad_norm": 2.27813458442688, + "learning_rate": 8.698073282102683e-06, + "loss": 0.343, + "num_input_tokens_seen": 3803008, + "step": 3125 + }, + { + "epoch": 0.34859115714444816, + "grad_norm": 1.7977368831634521, + "learning_rate": 8.711994654193118e-06, + "loss": 0.2344, + "num_input_tokens_seen": 3809152, + "step": 3130 + }, + { + "epoch": 0.3491480120280655, + "grad_norm": 1.1797422170639038, + "learning_rate": 8.72591602628355e-06, + "loss": 0.2363, + "num_input_tokens_seen": 3815296, + "step": 3135 + }, + { + "epoch": 0.3497048669116828, + "grad_norm": 2.64890456199646, + "learning_rate": 8.739837398373985e-06, + "loss": 0.1991, + "num_input_tokens_seen": 3821312, + "step": 3140 + }, + { + "epoch": 0.35026172179530013, + "grad_norm": 0.2256774604320526, + "learning_rate": 8.753758770464417e-06, + "loss": 0.1545, + "num_input_tokens_seen": 3827328, + "step": 3145 + }, + { + "epoch": 0.35081857667891747, + "grad_norm": 2.5463171005249023, + "learning_rate": 8.76768014255485e-06, + "loss": 0.2242, + "num_input_tokens_seen": 3833312, + "step": 3150 + }, + { + "epoch": 0.3513754315625348, + "grad_norm": 2.9979724884033203, + "learning_rate": 8.781601514645284e-06, + "loss": 0.2047, + "num_input_tokens_seen": 3839456, + "step": 3155 + }, + { + "epoch": 0.35193228644615215, + "grad_norm": 1.1875543594360352, + "learning_rate": 8.795522886735717e-06, + "loss": 0.1833, + "num_input_tokens_seen": 3845568, + "step": 3160 + }, + { + "epoch": 0.35248914132976944, + "grad_norm": 1.6575465202331543, + "learning_rate": 8.809444258826151e-06, + "loss": 0.2448, + "num_input_tokens_seen": 3851680, + "step": 3165 + }, + { + "epoch": 0.3530459962133868, + "grad_norm": 1.073224663734436, + "learning_rate": 8.823365630916584e-06, + "loss": 0.255, + "num_input_tokens_seen": 3858144, + "step": 3170 + }, + { + "epoch": 0.3536028510970041, + "grad_norm": 1.8733527660369873, + "learning_rate": 8.837287003007016e-06, + "loss": 0.2324, + "num_input_tokens_seen": 3864128, + "step": 3175 + }, + { + "epoch": 0.35415970598062146, + "grad_norm": 1.3237124681472778, + "learning_rate": 8.85120837509745e-06, + "loss": 0.5013, + "num_input_tokens_seen": 3870528, + "step": 3180 + }, + { + "epoch": 0.3547165608642388, + "grad_norm": 0.1785822957754135, + "learning_rate": 8.865129747187883e-06, + "loss": 0.1702, + "num_input_tokens_seen": 3876640, + "step": 3185 + }, + { + "epoch": 0.3552734157478561, + "grad_norm": 2.6423451900482178, + "learning_rate": 8.879051119278318e-06, + "loss": 0.3162, + "num_input_tokens_seen": 3882656, + "step": 3190 + }, + { + "epoch": 0.35583027063147343, + "grad_norm": 1.1263824701309204, + "learning_rate": 8.892972491368749e-06, + "loss": 0.2349, + "num_input_tokens_seen": 3888544, + "step": 3195 + }, + { + "epoch": 0.3563871255150908, + "grad_norm": 1.2921017408370972, + "learning_rate": 8.906893863459183e-06, + "loss": 0.4129, + "num_input_tokens_seen": 3894720, + "step": 3200 + }, + { + "epoch": 0.3569439803987081, + "grad_norm": 1.3854928016662598, + "learning_rate": 8.920815235549616e-06, + "loss": 0.222, + "num_input_tokens_seen": 3900928, + "step": 3205 + }, + { + "epoch": 0.3575008352823254, + "grad_norm": 2.76162052154541, + "learning_rate": 8.93473660764005e-06, + "loss": 0.2121, + "num_input_tokens_seen": 3907008, + "step": 3210 + }, + { + "epoch": 0.35805769016594274, + "grad_norm": 0.4959102272987366, + "learning_rate": 8.948657979730483e-06, + "loss": 0.0736, + "num_input_tokens_seen": 3912896, + "step": 3215 + }, + { + "epoch": 0.3586145450495601, + "grad_norm": 3.722766637802124, + "learning_rate": 8.962579351820915e-06, + "loss": 0.3034, + "num_input_tokens_seen": 3918848, + "step": 3220 + }, + { + "epoch": 0.3591713999331774, + "grad_norm": 2.430013418197632, + "learning_rate": 8.97650072391135e-06, + "loss": 0.1483, + "num_input_tokens_seen": 3924832, + "step": 3225 + }, + { + "epoch": 0.35972825481679477, + "grad_norm": 1.4251220226287842, + "learning_rate": 8.990422096001782e-06, + "loss": 0.1182, + "num_input_tokens_seen": 3931232, + "step": 3230 + }, + { + "epoch": 0.36028510970041205, + "grad_norm": 1.1877520084381104, + "learning_rate": 9.004343468092217e-06, + "loss": 0.3291, + "num_input_tokens_seen": 3937472, + "step": 3235 + }, + { + "epoch": 0.3608419645840294, + "grad_norm": 2.3543496131896973, + "learning_rate": 9.01826484018265e-06, + "loss": 0.3421, + "num_input_tokens_seen": 3943552, + "step": 3240 + }, + { + "epoch": 0.36139881946764674, + "grad_norm": 0.8882214426994324, + "learning_rate": 9.032186212273082e-06, + "loss": 0.1619, + "num_input_tokens_seen": 3949760, + "step": 3245 + }, + { + "epoch": 0.3619556743512641, + "grad_norm": 0.3702546954154968, + "learning_rate": 9.046107584363516e-06, + "loss": 0.2848, + "num_input_tokens_seen": 3956320, + "step": 3250 + }, + { + "epoch": 0.36251252923488136, + "grad_norm": 1.2071088552474976, + "learning_rate": 9.060028956453949e-06, + "loss": 0.4252, + "num_input_tokens_seen": 3962400, + "step": 3255 + }, + { + "epoch": 0.3630693841184987, + "grad_norm": 1.3257269859313965, + "learning_rate": 9.073950328544381e-06, + "loss": 0.206, + "num_input_tokens_seen": 3968448, + "step": 3260 + }, + { + "epoch": 0.36362623900211605, + "grad_norm": 0.7772386074066162, + "learning_rate": 9.087871700634814e-06, + "loss": 0.1139, + "num_input_tokens_seen": 3974720, + "step": 3265 + }, + { + "epoch": 0.3641830938857334, + "grad_norm": 1.1363327503204346, + "learning_rate": 9.101793072725248e-06, + "loss": 0.2482, + "num_input_tokens_seen": 3980576, + "step": 3270 + }, + { + "epoch": 0.36473994876935073, + "grad_norm": 0.6917766332626343, + "learning_rate": 9.115714444815681e-06, + "loss": 0.1673, + "num_input_tokens_seen": 3986272, + "step": 3275 + }, + { + "epoch": 0.365296803652968, + "grad_norm": 0.5232757925987244, + "learning_rate": 9.129635816906115e-06, + "loss": 0.2197, + "num_input_tokens_seen": 3992480, + "step": 3280 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 1.3303303718566895, + "learning_rate": 9.143557188996548e-06, + "loss": 0.2888, + "num_input_tokens_seen": 3998240, + "step": 3285 + }, + { + "epoch": 0.3664105134202027, + "grad_norm": 0.7755256295204163, + "learning_rate": 9.15747856108698e-06, + "loss": 0.2409, + "num_input_tokens_seen": 4004160, + "step": 3290 + }, + { + "epoch": 0.36696736830382004, + "grad_norm": 0.6738230586051941, + "learning_rate": 9.171399933177415e-06, + "loss": 0.2156, + "num_input_tokens_seen": 4009920, + "step": 3295 + }, + { + "epoch": 0.3675242231874373, + "grad_norm": 2.7754437923431396, + "learning_rate": 9.185321305267848e-06, + "loss": 0.5073, + "num_input_tokens_seen": 4015328, + "step": 3300 + }, + { + "epoch": 0.36808107807105467, + "grad_norm": 0.8624547719955444, + "learning_rate": 9.199242677358282e-06, + "loss": 0.2822, + "num_input_tokens_seen": 4021280, + "step": 3305 + }, + { + "epoch": 0.368637932954672, + "grad_norm": 1.901232361793518, + "learning_rate": 9.213164049448715e-06, + "loss": 0.3274, + "num_input_tokens_seen": 4027392, + "step": 3310 + }, + { + "epoch": 0.36919478783828935, + "grad_norm": 0.19374725222587585, + "learning_rate": 9.227085421539147e-06, + "loss": 0.2438, + "num_input_tokens_seen": 4033664, + "step": 3315 + }, + { + "epoch": 0.3697516427219067, + "grad_norm": 1.698517084121704, + "learning_rate": 9.241006793629581e-06, + "loss": 0.5064, + "num_input_tokens_seen": 4039424, + "step": 3320 + }, + { + "epoch": 0.370308497605524, + "grad_norm": 0.7697085738182068, + "learning_rate": 9.254928165720014e-06, + "loss": 0.2136, + "num_input_tokens_seen": 4045600, + "step": 3325 + }, + { + "epoch": 0.3708653524891413, + "grad_norm": 0.24246804416179657, + "learning_rate": 9.268849537810447e-06, + "loss": 0.2004, + "num_input_tokens_seen": 4051456, + "step": 3330 + }, + { + "epoch": 0.37142220737275866, + "grad_norm": 2.1832172870635986, + "learning_rate": 9.28277090990088e-06, + "loss": 0.2892, + "num_input_tokens_seen": 4057664, + "step": 3335 + }, + { + "epoch": 0.371979062256376, + "grad_norm": 1.1532679796218872, + "learning_rate": 9.296692281991314e-06, + "loss": 0.2791, + "num_input_tokens_seen": 4063936, + "step": 3340 + }, + { + "epoch": 0.37253591713999334, + "grad_norm": 1.6682072877883911, + "learning_rate": 9.310613654081746e-06, + "loss": 0.3732, + "num_input_tokens_seen": 4070112, + "step": 3345 + }, + { + "epoch": 0.37309277202361063, + "grad_norm": 1.4399755001068115, + "learning_rate": 9.32453502617218e-06, + "loss": 0.2903, + "num_input_tokens_seen": 4076320, + "step": 3350 + }, + { + "epoch": 0.37364962690722797, + "grad_norm": 1.1452373266220093, + "learning_rate": 9.338456398262613e-06, + "loss": 0.2686, + "num_input_tokens_seen": 4082432, + "step": 3355 + }, + { + "epoch": 0.3742064817908453, + "grad_norm": 1.8832024335861206, + "learning_rate": 9.352377770353046e-06, + "loss": 0.3095, + "num_input_tokens_seen": 4088224, + "step": 3360 + }, + { + "epoch": 0.37476333667446265, + "grad_norm": 2.149290084838867, + "learning_rate": 9.36629914244348e-06, + "loss": 0.3495, + "num_input_tokens_seen": 4094208, + "step": 3365 + }, + { + "epoch": 0.37532019155807994, + "grad_norm": 3.0450172424316406, + "learning_rate": 9.380220514533913e-06, + "loss": 0.273, + "num_input_tokens_seen": 4100736, + "step": 3370 + }, + { + "epoch": 0.3758770464416973, + "grad_norm": 1.0481369495391846, + "learning_rate": 9.394141886624346e-06, + "loss": 0.3134, + "num_input_tokens_seen": 4107072, + "step": 3375 + }, + { + "epoch": 0.3764339013253146, + "grad_norm": 1.4286859035491943, + "learning_rate": 9.40806325871478e-06, + "loss": 0.28, + "num_input_tokens_seen": 4113184, + "step": 3380 + }, + { + "epoch": 0.37699075620893197, + "grad_norm": 3.1112060546875, + "learning_rate": 9.421984630805212e-06, + "loss": 0.2292, + "num_input_tokens_seen": 4119488, + "step": 3385 + }, + { + "epoch": 0.3775476110925493, + "grad_norm": 2.847360134124756, + "learning_rate": 9.435906002895647e-06, + "loss": 0.3668, + "num_input_tokens_seen": 4125760, + "step": 3390 + }, + { + "epoch": 0.3781044659761666, + "grad_norm": 1.1978132724761963, + "learning_rate": 9.44982737498608e-06, + "loss": 0.2175, + "num_input_tokens_seen": 4132064, + "step": 3395 + }, + { + "epoch": 0.37866132085978393, + "grad_norm": 0.5769366025924683, + "learning_rate": 9.463748747076512e-06, + "loss": 0.2412, + "num_input_tokens_seen": 4137440, + "step": 3400 + }, + { + "epoch": 0.3792181757434013, + "grad_norm": 1.5219639539718628, + "learning_rate": 9.477670119166945e-06, + "loss": 0.1771, + "num_input_tokens_seen": 4143392, + "step": 3405 + }, + { + "epoch": 0.3797750306270186, + "grad_norm": 0.6299421787261963, + "learning_rate": 9.491591491257379e-06, + "loss": 0.1547, + "num_input_tokens_seen": 4149632, + "step": 3410 + }, + { + "epoch": 0.3803318855106359, + "grad_norm": 1.2774344682693481, + "learning_rate": 9.505512863347812e-06, + "loss": 0.3364, + "num_input_tokens_seen": 4156096, + "step": 3415 + }, + { + "epoch": 0.38088874039425324, + "grad_norm": 1.1825839281082153, + "learning_rate": 9.519434235438244e-06, + "loss": 0.207, + "num_input_tokens_seen": 4162336, + "step": 3420 + }, + { + "epoch": 0.3814455952778706, + "grad_norm": 1.4923423528671265, + "learning_rate": 9.533355607528679e-06, + "loss": 0.4118, + "num_input_tokens_seen": 4168544, + "step": 3425 + }, + { + "epoch": 0.3820024501614879, + "grad_norm": 0.9785728454589844, + "learning_rate": 9.547276979619111e-06, + "loss": 0.2629, + "num_input_tokens_seen": 4174752, + "step": 3430 + }, + { + "epoch": 0.38255930504510527, + "grad_norm": 3.1093742847442627, + "learning_rate": 9.561198351709546e-06, + "loss": 0.2597, + "num_input_tokens_seen": 4180576, + "step": 3435 + }, + { + "epoch": 0.38311615992872255, + "grad_norm": 1.1551463603973389, + "learning_rate": 9.575119723799978e-06, + "loss": 0.2889, + "num_input_tokens_seen": 4186688, + "step": 3440 + }, + { + "epoch": 0.3836730148123399, + "grad_norm": 0.19218255579471588, + "learning_rate": 9.589041095890411e-06, + "loss": 0.1541, + "num_input_tokens_seen": 4192320, + "step": 3445 + }, + { + "epoch": 0.38422986969595724, + "grad_norm": 1.348653793334961, + "learning_rate": 9.602962467980845e-06, + "loss": 0.3386, + "num_input_tokens_seen": 4198432, + "step": 3450 + }, + { + "epoch": 0.3847867245795746, + "grad_norm": 0.4956481456756592, + "learning_rate": 9.616883840071278e-06, + "loss": 0.2001, + "num_input_tokens_seen": 4204736, + "step": 3455 + }, + { + "epoch": 0.38534357946319187, + "grad_norm": 0.8835581541061401, + "learning_rate": 9.630805212161712e-06, + "loss": 0.1673, + "num_input_tokens_seen": 4210624, + "step": 3460 + }, + { + "epoch": 0.3859004343468092, + "grad_norm": 1.5401277542114258, + "learning_rate": 9.644726584252145e-06, + "loss": 0.4365, + "num_input_tokens_seen": 4216352, + "step": 3465 + }, + { + "epoch": 0.38645728923042655, + "grad_norm": 2.898993730545044, + "learning_rate": 9.658647956342577e-06, + "loss": 0.4055, + "num_input_tokens_seen": 4222464, + "step": 3470 + }, + { + "epoch": 0.3870141441140439, + "grad_norm": 0.8187147378921509, + "learning_rate": 9.67256932843301e-06, + "loss": 0.3179, + "num_input_tokens_seen": 4228768, + "step": 3475 + }, + { + "epoch": 0.38757099899766123, + "grad_norm": 1.5005308389663696, + "learning_rate": 9.686490700523444e-06, + "loss": 0.3242, + "num_input_tokens_seen": 4235040, + "step": 3480 + }, + { + "epoch": 0.3881278538812785, + "grad_norm": 0.6941857933998108, + "learning_rate": 9.700412072613877e-06, + "loss": 0.3308, + "num_input_tokens_seen": 4241312, + "step": 3485 + }, + { + "epoch": 0.38868470876489586, + "grad_norm": 2.0572361946105957, + "learning_rate": 9.71433344470431e-06, + "loss": 0.2245, + "num_input_tokens_seen": 4247648, + "step": 3490 + }, + { + "epoch": 0.3892415636485132, + "grad_norm": 0.4208579361438751, + "learning_rate": 9.728254816794744e-06, + "loss": 0.4676, + "num_input_tokens_seen": 4253664, + "step": 3495 + }, + { + "epoch": 0.38979841853213054, + "grad_norm": 1.3026344776153564, + "learning_rate": 9.742176188885177e-06, + "loss": 0.4109, + "num_input_tokens_seen": 4259840, + "step": 3500 + }, + { + "epoch": 0.3903552734157479, + "grad_norm": 1.7586692571640015, + "learning_rate": 9.756097560975611e-06, + "loss": 0.2555, + "num_input_tokens_seen": 4265696, + "step": 3505 + }, + { + "epoch": 0.39091212829936517, + "grad_norm": 0.15289027988910675, + "learning_rate": 9.770018933066044e-06, + "loss": 0.2892, + "num_input_tokens_seen": 4271520, + "step": 3510 + }, + { + "epoch": 0.3914689831829825, + "grad_norm": 1.5776631832122803, + "learning_rate": 9.783940305156476e-06, + "loss": 0.3615, + "num_input_tokens_seen": 4277792, + "step": 3515 + }, + { + "epoch": 0.39202583806659985, + "grad_norm": 1.4178516864776611, + "learning_rate": 9.79786167724691e-06, + "loss": 0.1515, + "num_input_tokens_seen": 4284160, + "step": 3520 + }, + { + "epoch": 0.3925826929502172, + "grad_norm": 1.3994282484054565, + "learning_rate": 9.811783049337343e-06, + "loss": 0.3245, + "num_input_tokens_seen": 4289952, + "step": 3525 + }, + { + "epoch": 0.3931395478338345, + "grad_norm": 0.9382721781730652, + "learning_rate": 9.825704421427778e-06, + "loss": 0.2043, + "num_input_tokens_seen": 4296160, + "step": 3530 + }, + { + "epoch": 0.3936964027174518, + "grad_norm": 0.8957577347755432, + "learning_rate": 9.839625793518208e-06, + "loss": 0.232, + "num_input_tokens_seen": 4302240, + "step": 3535 + }, + { + "epoch": 0.39425325760106916, + "grad_norm": 1.751229166984558, + "learning_rate": 9.853547165608643e-06, + "loss": 0.3132, + "num_input_tokens_seen": 4308544, + "step": 3540 + }, + { + "epoch": 0.3948101124846865, + "grad_norm": 0.547620415687561, + "learning_rate": 9.867468537699075e-06, + "loss": 0.3235, + "num_input_tokens_seen": 4314624, + "step": 3545 + }, + { + "epoch": 0.39536696736830385, + "grad_norm": 2.5304839611053467, + "learning_rate": 9.88138990978951e-06, + "loss": 0.2028, + "num_input_tokens_seen": 4320704, + "step": 3550 + }, + { + "epoch": 0.39592382225192113, + "grad_norm": 0.5601527094841003, + "learning_rate": 9.895311281879942e-06, + "loss": 0.2806, + "num_input_tokens_seen": 4326752, + "step": 3555 + }, + { + "epoch": 0.3964806771355385, + "grad_norm": 3.244166851043701, + "learning_rate": 9.909232653970375e-06, + "loss": 0.3294, + "num_input_tokens_seen": 4333248, + "step": 3560 + }, + { + "epoch": 0.3970375320191558, + "grad_norm": 1.7035636901855469, + "learning_rate": 9.92315402606081e-06, + "loss": 0.3784, + "num_input_tokens_seen": 4339360, + "step": 3565 + }, + { + "epoch": 0.39759438690277316, + "grad_norm": 0.6101680397987366, + "learning_rate": 9.937075398151242e-06, + "loss": 0.2632, + "num_input_tokens_seen": 4345568, + "step": 3570 + }, + { + "epoch": 0.39815124178639044, + "grad_norm": 2.8279483318328857, + "learning_rate": 9.950996770241676e-06, + "loss": 0.2886, + "num_input_tokens_seen": 4351840, + "step": 3575 + }, + { + "epoch": 0.3987080966700078, + "grad_norm": 1.7364156246185303, + "learning_rate": 9.964918142332107e-06, + "loss": 0.3477, + "num_input_tokens_seen": 4357824, + "step": 3580 + }, + { + "epoch": 0.3992649515536251, + "grad_norm": 2.640620708465576, + "learning_rate": 9.978839514422542e-06, + "loss": 0.2821, + "num_input_tokens_seen": 4364000, + "step": 3585 + }, + { + "epoch": 0.39982180643724247, + "grad_norm": 1.682661533355713, + "learning_rate": 9.992760886512976e-06, + "loss": 0.367, + "num_input_tokens_seen": 4370272, + "step": 3590 + }, + { + "epoch": 0.4003786613208598, + "grad_norm": 2.5878846645355225, + "learning_rate": 1.0006682258603409e-05, + "loss": 0.2392, + "num_input_tokens_seen": 4375840, + "step": 3595 + }, + { + "epoch": 0.4009355162044771, + "grad_norm": 0.7607879638671875, + "learning_rate": 1.0020603630693843e-05, + "loss": 0.2406, + "num_input_tokens_seen": 4382432, + "step": 3600 + }, + { + "epoch": 0.40149237108809444, + "grad_norm": 1.4934968948364258, + "learning_rate": 1.0034525002784274e-05, + "loss": 0.1032, + "num_input_tokens_seen": 4388512, + "step": 3605 + }, + { + "epoch": 0.4020492259717118, + "grad_norm": 0.8182069063186646, + "learning_rate": 1.0048446374874708e-05, + "loss": 0.3647, + "num_input_tokens_seen": 4394144, + "step": 3610 + }, + { + "epoch": 0.4026060808553291, + "grad_norm": 1.1750590801239014, + "learning_rate": 1.006236774696514e-05, + "loss": 0.2033, + "num_input_tokens_seen": 4399776, + "step": 3615 + }, + { + "epoch": 0.4031629357389464, + "grad_norm": 0.23093865811824799, + "learning_rate": 1.0076289119055575e-05, + "loss": 0.1539, + "num_input_tokens_seen": 4405472, + "step": 3620 + }, + { + "epoch": 0.40371979062256375, + "grad_norm": 1.9510995149612427, + "learning_rate": 1.0090210491146008e-05, + "loss": 0.2648, + "num_input_tokens_seen": 4411936, + "step": 3625 + }, + { + "epoch": 0.4042766455061811, + "grad_norm": 2.7108917236328125, + "learning_rate": 1.010413186323644e-05, + "loss": 0.3254, + "num_input_tokens_seen": 4418304, + "step": 3630 + }, + { + "epoch": 0.40483350038979843, + "grad_norm": 0.6963976621627808, + "learning_rate": 1.0118053235326875e-05, + "loss": 0.1429, + "num_input_tokens_seen": 4424320, + "step": 3635 + }, + { + "epoch": 0.40539035527341577, + "grad_norm": 2.0876195430755615, + "learning_rate": 1.0131974607417307e-05, + "loss": 0.4855, + "num_input_tokens_seen": 4430752, + "step": 3640 + }, + { + "epoch": 0.40594721015703306, + "grad_norm": 0.4632662534713745, + "learning_rate": 1.0145895979507742e-05, + "loss": 0.2047, + "num_input_tokens_seen": 4437088, + "step": 3645 + }, + { + "epoch": 0.4065040650406504, + "grad_norm": 0.8240542411804199, + "learning_rate": 1.0159817351598173e-05, + "loss": 0.1724, + "num_input_tokens_seen": 4442976, + "step": 3650 + }, + { + "epoch": 0.40706091992426774, + "grad_norm": 1.1535561084747314, + "learning_rate": 1.0173738723688607e-05, + "loss": 0.2123, + "num_input_tokens_seen": 4449024, + "step": 3655 + }, + { + "epoch": 0.4076177748078851, + "grad_norm": 2.4642512798309326, + "learning_rate": 1.0187660095779041e-05, + "loss": 0.2398, + "num_input_tokens_seen": 4455040, + "step": 3660 + }, + { + "epoch": 0.4081746296915024, + "grad_norm": 0.2961001396179199, + "learning_rate": 1.0201581467869474e-05, + "loss": 0.2482, + "num_input_tokens_seen": 4461088, + "step": 3665 + }, + { + "epoch": 0.4087314845751197, + "grad_norm": 1.204410195350647, + "learning_rate": 1.0215502839959908e-05, + "loss": 0.2614, + "num_input_tokens_seen": 4467520, + "step": 3670 + }, + { + "epoch": 0.40928833945873705, + "grad_norm": 1.394749641418457, + "learning_rate": 1.022942421205034e-05, + "loss": 0.238, + "num_input_tokens_seen": 4473440, + "step": 3675 + }, + { + "epoch": 0.4098451943423544, + "grad_norm": 2.598324775695801, + "learning_rate": 1.0243345584140773e-05, + "loss": 0.1707, + "num_input_tokens_seen": 4479904, + "step": 3680 + }, + { + "epoch": 0.41040204922597173, + "grad_norm": 0.6357415914535522, + "learning_rate": 1.0257266956231206e-05, + "loss": 0.4047, + "num_input_tokens_seen": 4485856, + "step": 3685 + }, + { + "epoch": 0.410958904109589, + "grad_norm": 1.5943069458007812, + "learning_rate": 1.027118832832164e-05, + "loss": 0.2353, + "num_input_tokens_seen": 4492032, + "step": 3690 + }, + { + "epoch": 0.41151575899320636, + "grad_norm": 1.0350383520126343, + "learning_rate": 1.0285109700412073e-05, + "loss": 0.2677, + "num_input_tokens_seen": 4498080, + "step": 3695 + }, + { + "epoch": 0.4120726138768237, + "grad_norm": 0.9128947854042053, + "learning_rate": 1.0299031072502506e-05, + "loss": 0.2811, + "num_input_tokens_seen": 4504256, + "step": 3700 + }, + { + "epoch": 0.41262946876044104, + "grad_norm": 0.2957676649093628, + "learning_rate": 1.031295244459294e-05, + "loss": 0.2685, + "num_input_tokens_seen": 4510432, + "step": 3705 + }, + { + "epoch": 0.4131863236440584, + "grad_norm": 1.5170692205429077, + "learning_rate": 1.0326873816683373e-05, + "loss": 0.2767, + "num_input_tokens_seen": 4516928, + "step": 3710 + }, + { + "epoch": 0.41374317852767567, + "grad_norm": 0.6939054131507874, + "learning_rate": 1.0340795188773807e-05, + "loss": 0.3305, + "num_input_tokens_seen": 4523104, + "step": 3715 + }, + { + "epoch": 0.414300033411293, + "grad_norm": 0.7993552684783936, + "learning_rate": 1.0354716560864238e-05, + "loss": 0.2465, + "num_input_tokens_seen": 4529696, + "step": 3720 + }, + { + "epoch": 0.41485688829491035, + "grad_norm": 2.111543655395508, + "learning_rate": 1.0368637932954672e-05, + "loss": 0.3976, + "num_input_tokens_seen": 4535392, + "step": 3725 + }, + { + "epoch": 0.4154137431785277, + "grad_norm": 0.7634919285774231, + "learning_rate": 1.0382559305045107e-05, + "loss": 0.1716, + "num_input_tokens_seen": 4541504, + "step": 3730 + }, + { + "epoch": 0.415970598062145, + "grad_norm": 2.172314405441284, + "learning_rate": 1.039648067713554e-05, + "loss": 0.4936, + "num_input_tokens_seen": 4547616, + "step": 3735 + }, + { + "epoch": 0.4165274529457623, + "grad_norm": 0.940574586391449, + "learning_rate": 1.0410402049225972e-05, + "loss": 0.0939, + "num_input_tokens_seen": 4553856, + "step": 3740 + }, + { + "epoch": 0.41708430782937966, + "grad_norm": 3.769268751144409, + "learning_rate": 1.0424323421316405e-05, + "loss": 0.365, + "num_input_tokens_seen": 4559872, + "step": 3745 + }, + { + "epoch": 0.417641162712997, + "grad_norm": 1.5953912734985352, + "learning_rate": 1.0438244793406839e-05, + "loss": 0.1881, + "num_input_tokens_seen": 4566144, + "step": 3750 + }, + { + "epoch": 0.41819801759661435, + "grad_norm": 0.7478357553482056, + "learning_rate": 1.0452166165497271e-05, + "loss": 0.2507, + "num_input_tokens_seen": 4572320, + "step": 3755 + }, + { + "epoch": 0.41875487248023163, + "grad_norm": 0.3365839719772339, + "learning_rate": 1.0466087537587706e-05, + "loss": 0.0467, + "num_input_tokens_seen": 4578560, + "step": 3760 + }, + { + "epoch": 0.419311727363849, + "grad_norm": 1.137332558631897, + "learning_rate": 1.0480008909678138e-05, + "loss": 0.4241, + "num_input_tokens_seen": 4584416, + "step": 3765 + }, + { + "epoch": 0.4198685822474663, + "grad_norm": 0.8516455292701721, + "learning_rate": 1.0493930281768571e-05, + "loss": 0.178, + "num_input_tokens_seen": 4590336, + "step": 3770 + }, + { + "epoch": 0.42042543713108366, + "grad_norm": 1.9188836812973022, + "learning_rate": 1.0507851653859005e-05, + "loss": 0.276, + "num_input_tokens_seen": 4596480, + "step": 3775 + }, + { + "epoch": 0.42098229201470094, + "grad_norm": 0.8956512808799744, + "learning_rate": 1.0521773025949438e-05, + "loss": 0.2409, + "num_input_tokens_seen": 4602496, + "step": 3780 + }, + { + "epoch": 0.4215391468983183, + "grad_norm": 0.5711193084716797, + "learning_rate": 1.053569439803987e-05, + "loss": 0.2159, + "num_input_tokens_seen": 4608672, + "step": 3785 + }, + { + "epoch": 0.4220960017819356, + "grad_norm": 0.3989723324775696, + "learning_rate": 1.0549615770130305e-05, + "loss": 0.1032, + "num_input_tokens_seen": 4614560, + "step": 3790 + }, + { + "epoch": 0.42265285666555297, + "grad_norm": 1.619056224822998, + "learning_rate": 1.0563537142220738e-05, + "loss": 0.2612, + "num_input_tokens_seen": 4620832, + "step": 3795 + }, + { + "epoch": 0.4232097115491703, + "grad_norm": 3.894597053527832, + "learning_rate": 1.0577458514311172e-05, + "loss": 0.3052, + "num_input_tokens_seen": 4627168, + "step": 3800 + }, + { + "epoch": 0.4237665664327876, + "grad_norm": 0.9200706481933594, + "learning_rate": 1.0591379886401605e-05, + "loss": 0.2741, + "num_input_tokens_seen": 4633440, + "step": 3805 + }, + { + "epoch": 0.42432342131640494, + "grad_norm": 1.2786146402359009, + "learning_rate": 1.0605301258492037e-05, + "loss": 0.5407, + "num_input_tokens_seen": 4639520, + "step": 3810 + }, + { + "epoch": 0.4248802762000223, + "grad_norm": 1.0377633571624756, + "learning_rate": 1.061922263058247e-05, + "loss": 0.1965, + "num_input_tokens_seen": 4645600, + "step": 3815 + }, + { + "epoch": 0.4254371310836396, + "grad_norm": 1.4837576150894165, + "learning_rate": 1.0633144002672904e-05, + "loss": 0.4194, + "num_input_tokens_seen": 4651552, + "step": 3820 + }, + { + "epoch": 0.4259939859672569, + "grad_norm": 1.318058729171753, + "learning_rate": 1.0647065374763337e-05, + "loss": 0.1931, + "num_input_tokens_seen": 4657504, + "step": 3825 + }, + { + "epoch": 0.42655084085087425, + "grad_norm": 0.8719730973243713, + "learning_rate": 1.0660986746853771e-05, + "loss": 0.1493, + "num_input_tokens_seen": 4663616, + "step": 3830 + }, + { + "epoch": 0.4271076957344916, + "grad_norm": 2.2089741230010986, + "learning_rate": 1.0674908118944204e-05, + "loss": 0.223, + "num_input_tokens_seen": 4669504, + "step": 3835 + }, + { + "epoch": 0.42766455061810893, + "grad_norm": 1.2132296562194824, + "learning_rate": 1.0688829491034636e-05, + "loss": 0.2214, + "num_input_tokens_seen": 4675136, + "step": 3840 + }, + { + "epoch": 0.42822140550172627, + "grad_norm": 1.6814155578613281, + "learning_rate": 1.070275086312507e-05, + "loss": 0.3096, + "num_input_tokens_seen": 4681184, + "step": 3845 + }, + { + "epoch": 0.42877826038534356, + "grad_norm": 3.2620465755462646, + "learning_rate": 1.0716672235215503e-05, + "loss": 0.3149, + "num_input_tokens_seen": 4687200, + "step": 3850 + }, + { + "epoch": 0.4293351152689609, + "grad_norm": 0.7578563094139099, + "learning_rate": 1.0730593607305936e-05, + "loss": 0.1225, + "num_input_tokens_seen": 4693248, + "step": 3855 + }, + { + "epoch": 0.42989197015257824, + "grad_norm": 0.7856800556182861, + "learning_rate": 1.074451497939637e-05, + "loss": 0.2075, + "num_input_tokens_seen": 4699296, + "step": 3860 + }, + { + "epoch": 0.4304488250361956, + "grad_norm": 1.0215846300125122, + "learning_rate": 1.0758436351486803e-05, + "loss": 0.1414, + "num_input_tokens_seen": 4705216, + "step": 3865 + }, + { + "epoch": 0.4310056799198129, + "grad_norm": 0.2827739715576172, + "learning_rate": 1.0772357723577237e-05, + "loss": 0.2018, + "num_input_tokens_seen": 4711264, + "step": 3870 + }, + { + "epoch": 0.4315625348034302, + "grad_norm": 1.016075849533081, + "learning_rate": 1.078627909566767e-05, + "loss": 0.3243, + "num_input_tokens_seen": 4717728, + "step": 3875 + }, + { + "epoch": 0.43211938968704755, + "grad_norm": 1.6314270496368408, + "learning_rate": 1.0800200467758103e-05, + "loss": 0.3273, + "num_input_tokens_seen": 4723936, + "step": 3880 + }, + { + "epoch": 0.4326762445706649, + "grad_norm": 1.4025275707244873, + "learning_rate": 1.0814121839848535e-05, + "loss": 0.2744, + "num_input_tokens_seen": 4730368, + "step": 3885 + }, + { + "epoch": 0.43323309945428223, + "grad_norm": 0.7665657997131348, + "learning_rate": 1.082804321193897e-05, + "loss": 0.0902, + "num_input_tokens_seen": 4735936, + "step": 3890 + }, + { + "epoch": 0.4337899543378995, + "grad_norm": 1.4508748054504395, + "learning_rate": 1.0841964584029402e-05, + "loss": 0.2725, + "num_input_tokens_seen": 4741824, + "step": 3895 + }, + { + "epoch": 0.43434680922151686, + "grad_norm": 1.4025098085403442, + "learning_rate": 1.0855885956119835e-05, + "loss": 0.3214, + "num_input_tokens_seen": 4748128, + "step": 3900 + }, + { + "epoch": 0.4349036641051342, + "grad_norm": 0.08730136603116989, + "learning_rate": 1.0869807328210269e-05, + "loss": 0.2283, + "num_input_tokens_seen": 4754208, + "step": 3905 + }, + { + "epoch": 0.43546051898875154, + "grad_norm": 1.442206859588623, + "learning_rate": 1.0883728700300702e-05, + "loss": 0.1447, + "num_input_tokens_seen": 4760320, + "step": 3910 + }, + { + "epoch": 0.4360173738723689, + "grad_norm": 0.6879591941833496, + "learning_rate": 1.0897650072391136e-05, + "loss": 0.2359, + "num_input_tokens_seen": 4766656, + "step": 3915 + }, + { + "epoch": 0.43657422875598617, + "grad_norm": 3.14302396774292, + "learning_rate": 1.0911571444481569e-05, + "loss": 0.1818, + "num_input_tokens_seen": 4772768, + "step": 3920 + }, + { + "epoch": 0.4371310836396035, + "grad_norm": 0.43644365668296814, + "learning_rate": 1.0925492816572001e-05, + "loss": 0.2875, + "num_input_tokens_seen": 4778464, + "step": 3925 + }, + { + "epoch": 0.43768793852322085, + "grad_norm": 0.87889564037323, + "learning_rate": 1.0939414188662436e-05, + "loss": 0.2309, + "num_input_tokens_seen": 4784896, + "step": 3930 + }, + { + "epoch": 0.4382447934068382, + "grad_norm": 0.37287095189094543, + "learning_rate": 1.0953335560752868e-05, + "loss": 0.2215, + "num_input_tokens_seen": 4790624, + "step": 3935 + }, + { + "epoch": 0.4388016482904555, + "grad_norm": 0.9064532518386841, + "learning_rate": 1.0967256932843303e-05, + "loss": 0.3465, + "num_input_tokens_seen": 4796704, + "step": 3940 + }, + { + "epoch": 0.4393585031740728, + "grad_norm": 2.6619598865509033, + "learning_rate": 1.0981178304933734e-05, + "loss": 0.3316, + "num_input_tokens_seen": 4802848, + "step": 3945 + }, + { + "epoch": 0.43991535805769016, + "grad_norm": 1.2452633380889893, + "learning_rate": 1.0995099677024168e-05, + "loss": 0.334, + "num_input_tokens_seen": 4808672, + "step": 3950 + }, + { + "epoch": 0.4404722129413075, + "grad_norm": 1.9399304389953613, + "learning_rate": 1.10090210491146e-05, + "loss": 0.3684, + "num_input_tokens_seen": 4814720, + "step": 3955 + }, + { + "epoch": 0.44102906782492485, + "grad_norm": 1.0688621997833252, + "learning_rate": 1.1022942421205035e-05, + "loss": 0.3079, + "num_input_tokens_seen": 4820896, + "step": 3960 + }, + { + "epoch": 0.44158592270854213, + "grad_norm": 1.261550784111023, + "learning_rate": 1.1036863793295468e-05, + "loss": 0.2952, + "num_input_tokens_seen": 4826976, + "step": 3965 + }, + { + "epoch": 0.4421427775921595, + "grad_norm": 1.3650141954421997, + "learning_rate": 1.10507851653859e-05, + "loss": 0.2037, + "num_input_tokens_seen": 4832896, + "step": 3970 + }, + { + "epoch": 0.4426996324757768, + "grad_norm": 1.1242681741714478, + "learning_rate": 1.1064706537476335e-05, + "loss": 0.217, + "num_input_tokens_seen": 4839200, + "step": 3975 + }, + { + "epoch": 0.44325648735939416, + "grad_norm": 0.791551411151886, + "learning_rate": 1.1078627909566767e-05, + "loss": 0.0733, + "num_input_tokens_seen": 4845376, + "step": 3980 + }, + { + "epoch": 0.44381334224301144, + "grad_norm": 0.3733033835887909, + "learning_rate": 1.1092549281657201e-05, + "loss": 0.1804, + "num_input_tokens_seen": 4851168, + "step": 3985 + }, + { + "epoch": 0.4443701971266288, + "grad_norm": 1.1937190294265747, + "learning_rate": 1.1106470653747634e-05, + "loss": 0.2313, + "num_input_tokens_seen": 4856608, + "step": 3990 + }, + { + "epoch": 0.4449270520102461, + "grad_norm": 0.2081182599067688, + "learning_rate": 1.1120392025838067e-05, + "loss": 0.1877, + "num_input_tokens_seen": 4862176, + "step": 3995 + }, + { + "epoch": 0.44548390689386347, + "grad_norm": 0.9715025424957275, + "learning_rate": 1.1134313397928501e-05, + "loss": 0.149, + "num_input_tokens_seen": 4868192, + "step": 4000 + }, + { + "epoch": 0.4460407617774808, + "grad_norm": 0.6689687371253967, + "learning_rate": 1.1148234770018934e-05, + "loss": 0.2756, + "num_input_tokens_seen": 4874400, + "step": 4005 + }, + { + "epoch": 0.4465976166610981, + "grad_norm": 2.0072014331817627, + "learning_rate": 1.1162156142109368e-05, + "loss": 0.2676, + "num_input_tokens_seen": 4880416, + "step": 4010 + }, + { + "epoch": 0.44715447154471544, + "grad_norm": 3.156337022781372, + "learning_rate": 1.1176077514199799e-05, + "loss": 0.2529, + "num_input_tokens_seen": 4886368, + "step": 4015 + }, + { + "epoch": 0.4477113264283328, + "grad_norm": 0.3660562336444855, + "learning_rate": 1.1189998886290233e-05, + "loss": 0.1835, + "num_input_tokens_seen": 4892608, + "step": 4020 + }, + { + "epoch": 0.4482681813119501, + "grad_norm": 0.49814730882644653, + "learning_rate": 1.1203920258380666e-05, + "loss": 0.1814, + "num_input_tokens_seen": 4898752, + "step": 4025 + }, + { + "epoch": 0.44882503619556746, + "grad_norm": 0.0750192254781723, + "learning_rate": 1.12178416304711e-05, + "loss": 0.2948, + "num_input_tokens_seen": 4904576, + "step": 4030 + }, + { + "epoch": 0.44938189107918475, + "grad_norm": 0.24874962866306305, + "learning_rate": 1.1231763002561533e-05, + "loss": 0.1492, + "num_input_tokens_seen": 4910592, + "step": 4035 + }, + { + "epoch": 0.4499387459628021, + "grad_norm": 2.028871536254883, + "learning_rate": 1.1245684374651966e-05, + "loss": 0.4213, + "num_input_tokens_seen": 4916512, + "step": 4040 + }, + { + "epoch": 0.45049560084641943, + "grad_norm": 1.4150971174240112, + "learning_rate": 1.12596057467424e-05, + "loss": 0.4181, + "num_input_tokens_seen": 4922720, + "step": 4045 + }, + { + "epoch": 0.45105245573003677, + "grad_norm": 0.796746551990509, + "learning_rate": 1.1273527118832832e-05, + "loss": 0.136, + "num_input_tokens_seen": 4928800, + "step": 4050 + }, + { + "epoch": 0.45160931061365406, + "grad_norm": 2.813380718231201, + "learning_rate": 1.1287448490923267e-05, + "loss": 0.4609, + "num_input_tokens_seen": 4934816, + "step": 4055 + }, + { + "epoch": 0.4521661654972714, + "grad_norm": 0.6599982380867004, + "learning_rate": 1.1301369863013698e-05, + "loss": 0.2122, + "num_input_tokens_seen": 4940768, + "step": 4060 + }, + { + "epoch": 0.45272302038088874, + "grad_norm": 0.7604705691337585, + "learning_rate": 1.1315291235104132e-05, + "loss": 0.2914, + "num_input_tokens_seen": 4946816, + "step": 4065 + }, + { + "epoch": 0.4532798752645061, + "grad_norm": 0.5870413780212402, + "learning_rate": 1.1329212607194566e-05, + "loss": 0.1229, + "num_input_tokens_seen": 4953152, + "step": 4070 + }, + { + "epoch": 0.4538367301481234, + "grad_norm": 1.0904297828674316, + "learning_rate": 1.1343133979284999e-05, + "loss": 0.4184, + "num_input_tokens_seen": 4959296, + "step": 4075 + }, + { + "epoch": 0.4543935850317407, + "grad_norm": 0.7939707636833191, + "learning_rate": 1.1357055351375433e-05, + "loss": 0.1288, + "num_input_tokens_seen": 4965600, + "step": 4080 + }, + { + "epoch": 0.45495043991535805, + "grad_norm": 1.308683156967163, + "learning_rate": 1.1370976723465864e-05, + "loss": 0.3127, + "num_input_tokens_seen": 4971776, + "step": 4085 + }, + { + "epoch": 0.4555072947989754, + "grad_norm": 1.105220913887024, + "learning_rate": 1.1384898095556299e-05, + "loss": 0.3326, + "num_input_tokens_seen": 4977920, + "step": 4090 + }, + { + "epoch": 0.45606414968259273, + "grad_norm": 1.4993841648101807, + "learning_rate": 1.1398819467646731e-05, + "loss": 0.2133, + "num_input_tokens_seen": 4984128, + "step": 4095 + }, + { + "epoch": 0.45662100456621, + "grad_norm": 0.24112018942832947, + "learning_rate": 1.1412740839737166e-05, + "loss": 0.1112, + "num_input_tokens_seen": 4990496, + "step": 4100 + }, + { + "epoch": 0.45717785944982736, + "grad_norm": 2.2499687671661377, + "learning_rate": 1.1426662211827598e-05, + "loss": 0.2156, + "num_input_tokens_seen": 4996608, + "step": 4105 + }, + { + "epoch": 0.4577347143334447, + "grad_norm": 1.9237626791000366, + "learning_rate": 1.1440583583918031e-05, + "loss": 0.2838, + "num_input_tokens_seen": 5003040, + "step": 4110 + }, + { + "epoch": 0.45829156921706204, + "grad_norm": 0.5716122388839722, + "learning_rate": 1.1454504956008465e-05, + "loss": 0.2333, + "num_input_tokens_seen": 5009344, + "step": 4115 + }, + { + "epoch": 0.4588484241006794, + "grad_norm": 1.1091485023498535, + "learning_rate": 1.1468426328098898e-05, + "loss": 0.3028, + "num_input_tokens_seen": 5015424, + "step": 4120 + }, + { + "epoch": 0.45940527898429667, + "grad_norm": 0.5574259757995605, + "learning_rate": 1.1482347700189332e-05, + "loss": 0.1096, + "num_input_tokens_seen": 5021568, + "step": 4125 + }, + { + "epoch": 0.459962133867914, + "grad_norm": 0.21375282108783722, + "learning_rate": 1.1496269072279763e-05, + "loss": 0.2788, + "num_input_tokens_seen": 5027648, + "step": 4130 + }, + { + "epoch": 0.46051898875153136, + "grad_norm": 1.5820773839950562, + "learning_rate": 1.1510190444370197e-05, + "loss": 0.1881, + "num_input_tokens_seen": 5033920, + "step": 4135 + }, + { + "epoch": 0.4610758436351487, + "grad_norm": 0.5878670811653137, + "learning_rate": 1.1524111816460632e-05, + "loss": 0.2803, + "num_input_tokens_seen": 5039776, + "step": 4140 + }, + { + "epoch": 0.461632698518766, + "grad_norm": 0.9822213649749756, + "learning_rate": 1.1538033188551064e-05, + "loss": 0.1198, + "num_input_tokens_seen": 5045920, + "step": 4145 + }, + { + "epoch": 0.4621895534023833, + "grad_norm": 1.9434914588928223, + "learning_rate": 1.1551954560641499e-05, + "loss": 0.2158, + "num_input_tokens_seen": 5052480, + "step": 4150 + }, + { + "epoch": 0.46274640828600067, + "grad_norm": 0.5051631927490234, + "learning_rate": 1.156587593273193e-05, + "loss": 0.2657, + "num_input_tokens_seen": 5058944, + "step": 4155 + }, + { + "epoch": 0.463303263169618, + "grad_norm": 1.8224941492080688, + "learning_rate": 1.1579797304822364e-05, + "loss": 0.3213, + "num_input_tokens_seen": 5065024, + "step": 4160 + }, + { + "epoch": 0.46386011805323535, + "grad_norm": 2.3940482139587402, + "learning_rate": 1.1593718676912797e-05, + "loss": 0.3873, + "num_input_tokens_seen": 5071232, + "step": 4165 + }, + { + "epoch": 0.46441697293685263, + "grad_norm": 1.3210116624832153, + "learning_rate": 1.1607640049003231e-05, + "loss": 0.2101, + "num_input_tokens_seen": 5077760, + "step": 4170 + }, + { + "epoch": 0.46497382782047, + "grad_norm": 2.3220152854919434, + "learning_rate": 1.1621561421093664e-05, + "loss": 0.2815, + "num_input_tokens_seen": 5083392, + "step": 4175 + }, + { + "epoch": 0.4655306827040873, + "grad_norm": 1.2836190462112427, + "learning_rate": 1.1635482793184096e-05, + "loss": 0.2434, + "num_input_tokens_seen": 5089536, + "step": 4180 + }, + { + "epoch": 0.46608753758770466, + "grad_norm": 1.5524497032165527, + "learning_rate": 1.164940416527453e-05, + "loss": 0.4052, + "num_input_tokens_seen": 5095680, + "step": 4185 + }, + { + "epoch": 0.466644392471322, + "grad_norm": 3.816647529602051, + "learning_rate": 1.1663325537364963e-05, + "loss": 0.3094, + "num_input_tokens_seen": 5102016, + "step": 4190 + }, + { + "epoch": 0.4672012473549393, + "grad_norm": 0.3976850211620331, + "learning_rate": 1.1677246909455398e-05, + "loss": 0.2389, + "num_input_tokens_seen": 5108448, + "step": 4195 + }, + { + "epoch": 0.46775810223855663, + "grad_norm": 2.5251386165618896, + "learning_rate": 1.1691168281545828e-05, + "loss": 0.3028, + "num_input_tokens_seen": 5114624, + "step": 4200 + }, + { + "epoch": 0.46831495712217397, + "grad_norm": 1.0947957038879395, + "learning_rate": 1.1705089653636263e-05, + "loss": 0.3461, + "num_input_tokens_seen": 5120736, + "step": 4205 + }, + { + "epoch": 0.4688718120057913, + "grad_norm": 1.396579623222351, + "learning_rate": 1.1719011025726697e-05, + "loss": 0.2601, + "num_input_tokens_seen": 5126656, + "step": 4210 + }, + { + "epoch": 0.4694286668894086, + "grad_norm": 1.0067546367645264, + "learning_rate": 1.173293239781713e-05, + "loss": 0.2805, + "num_input_tokens_seen": 5132640, + "step": 4215 + }, + { + "epoch": 0.46998552177302594, + "grad_norm": 1.1471251249313354, + "learning_rate": 1.1746853769907562e-05, + "loss": 0.2894, + "num_input_tokens_seen": 5138560, + "step": 4220 + }, + { + "epoch": 0.4705423766566433, + "grad_norm": 0.9582952260971069, + "learning_rate": 1.1760775141997995e-05, + "loss": 0.4063, + "num_input_tokens_seen": 5144704, + "step": 4225 + }, + { + "epoch": 0.4710992315402606, + "grad_norm": 0.6474908590316772, + "learning_rate": 1.177469651408843e-05, + "loss": 0.1866, + "num_input_tokens_seen": 5150656, + "step": 4230 + }, + { + "epoch": 0.47165608642387796, + "grad_norm": 3.453951120376587, + "learning_rate": 1.1788617886178862e-05, + "loss": 0.2361, + "num_input_tokens_seen": 5156960, + "step": 4235 + }, + { + "epoch": 0.47221294130749525, + "grad_norm": 2.177077293395996, + "learning_rate": 1.1802539258269296e-05, + "loss": 0.2444, + "num_input_tokens_seen": 5162880, + "step": 4240 + }, + { + "epoch": 0.4727697961911126, + "grad_norm": 0.6871076822280884, + "learning_rate": 1.1816460630359729e-05, + "loss": 0.4832, + "num_input_tokens_seen": 5169216, + "step": 4245 + }, + { + "epoch": 0.47332665107472993, + "grad_norm": 0.6685916185379028, + "learning_rate": 1.1830382002450162e-05, + "loss": 0.2215, + "num_input_tokens_seen": 5175200, + "step": 4250 + }, + { + "epoch": 0.4738835059583473, + "grad_norm": 1.445045828819275, + "learning_rate": 1.1844303374540596e-05, + "loss": 0.2079, + "num_input_tokens_seen": 5181248, + "step": 4255 + }, + { + "epoch": 0.47444036084196456, + "grad_norm": 1.8477338552474976, + "learning_rate": 1.1858224746631029e-05, + "loss": 0.327, + "num_input_tokens_seen": 5187680, + "step": 4260 + }, + { + "epoch": 0.4749972157255819, + "grad_norm": 0.37056735157966614, + "learning_rate": 1.1872146118721461e-05, + "loss": 0.2996, + "num_input_tokens_seen": 5194176, + "step": 4265 + }, + { + "epoch": 0.47555407060919924, + "grad_norm": 0.9679704308509827, + "learning_rate": 1.1886067490811894e-05, + "loss": 0.2809, + "num_input_tokens_seen": 5200352, + "step": 4270 + }, + { + "epoch": 0.4761109254928166, + "grad_norm": 1.7546368837356567, + "learning_rate": 1.1899988862902328e-05, + "loss": 0.2134, + "num_input_tokens_seen": 5206176, + "step": 4275 + }, + { + "epoch": 0.4766677803764339, + "grad_norm": 1.2647221088409424, + "learning_rate": 1.1913910234992762e-05, + "loss": 0.3709, + "num_input_tokens_seen": 5212608, + "step": 4280 + }, + { + "epoch": 0.4772246352600512, + "grad_norm": 1.6488040685653687, + "learning_rate": 1.1927831607083195e-05, + "loss": 0.2234, + "num_input_tokens_seen": 5218656, + "step": 4285 + }, + { + "epoch": 0.47778149014366855, + "grad_norm": 0.35533469915390015, + "learning_rate": 1.1941752979173628e-05, + "loss": 0.1728, + "num_input_tokens_seen": 5224928, + "step": 4290 + }, + { + "epoch": 0.4783383450272859, + "grad_norm": 1.2247416973114014, + "learning_rate": 1.195567435126406e-05, + "loss": 0.2852, + "num_input_tokens_seen": 5230880, + "step": 4295 + }, + { + "epoch": 0.47889519991090324, + "grad_norm": 0.0828959047794342, + "learning_rate": 1.1969595723354495e-05, + "loss": 0.4728, + "num_input_tokens_seen": 5236736, + "step": 4300 + }, + { + "epoch": 0.4794520547945205, + "grad_norm": 1.6954433917999268, + "learning_rate": 1.1983517095444927e-05, + "loss": 0.3962, + "num_input_tokens_seen": 5242912, + "step": 4305 + }, + { + "epoch": 0.48000890967813786, + "grad_norm": 1.1810729503631592, + "learning_rate": 1.1997438467535362e-05, + "loss": 0.2259, + "num_input_tokens_seen": 5249056, + "step": 4310 + }, + { + "epoch": 0.4805657645617552, + "grad_norm": 0.9238909482955933, + "learning_rate": 1.2011359839625794e-05, + "loss": 0.248, + "num_input_tokens_seen": 5254976, + "step": 4315 + }, + { + "epoch": 0.48112261944537255, + "grad_norm": 0.35942333936691284, + "learning_rate": 1.2025281211716227e-05, + "loss": 0.4667, + "num_input_tokens_seen": 5261088, + "step": 4320 + }, + { + "epoch": 0.4816794743289899, + "grad_norm": 0.7348432540893555, + "learning_rate": 1.2039202583806661e-05, + "loss": 0.1348, + "num_input_tokens_seen": 5267104, + "step": 4325 + }, + { + "epoch": 0.4822363292126072, + "grad_norm": 1.3150851726531982, + "learning_rate": 1.2053123955897094e-05, + "loss": 0.1098, + "num_input_tokens_seen": 5273120, + "step": 4330 + }, + { + "epoch": 0.4827931840962245, + "grad_norm": 1.0834139585494995, + "learning_rate": 1.2067045327987527e-05, + "loss": 0.2137, + "num_input_tokens_seen": 5278240, + "step": 4335 + }, + { + "epoch": 0.48335003897984186, + "grad_norm": 0.9422652125358582, + "learning_rate": 1.208096670007796e-05, + "loss": 0.1889, + "num_input_tokens_seen": 5284512, + "step": 4340 + }, + { + "epoch": 0.4839068938634592, + "grad_norm": 0.5122734904289246, + "learning_rate": 1.2094888072168393e-05, + "loss": 0.2551, + "num_input_tokens_seen": 5290432, + "step": 4345 + }, + { + "epoch": 0.48446374874707654, + "grad_norm": 0.8499854803085327, + "learning_rate": 1.2108809444258828e-05, + "loss": 0.242, + "num_input_tokens_seen": 5296640, + "step": 4350 + }, + { + "epoch": 0.4850206036306938, + "grad_norm": 1.0118120908737183, + "learning_rate": 1.212273081634926e-05, + "loss": 0.2512, + "num_input_tokens_seen": 5302720, + "step": 4355 + }, + { + "epoch": 0.48557745851431117, + "grad_norm": 0.5553609132766724, + "learning_rate": 1.2136652188439693e-05, + "loss": 0.2476, + "num_input_tokens_seen": 5308864, + "step": 4360 + }, + { + "epoch": 0.4861343133979285, + "grad_norm": 4.705817222595215, + "learning_rate": 1.2150573560530126e-05, + "loss": 0.2881, + "num_input_tokens_seen": 5314432, + "step": 4365 + }, + { + "epoch": 0.48669116828154585, + "grad_norm": 0.8433491587638855, + "learning_rate": 1.216449493262056e-05, + "loss": 0.2108, + "num_input_tokens_seen": 5319968, + "step": 4370 + }, + { + "epoch": 0.48724802316516314, + "grad_norm": 2.2707791328430176, + "learning_rate": 1.2178416304710993e-05, + "loss": 0.241, + "num_input_tokens_seen": 5325568, + "step": 4375 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.40999817848205566, + "learning_rate": 1.2192337676801425e-05, + "loss": 0.0916, + "num_input_tokens_seen": 5331520, + "step": 4380 + }, + { + "epoch": 0.4883617329323978, + "grad_norm": 1.830949306488037, + "learning_rate": 1.220625904889186e-05, + "loss": 0.1921, + "num_input_tokens_seen": 5338144, + "step": 4385 + }, + { + "epoch": 0.48891858781601516, + "grad_norm": 0.5513787269592285, + "learning_rate": 1.2220180420982292e-05, + "loss": 0.1592, + "num_input_tokens_seen": 5344224, + "step": 4390 + }, + { + "epoch": 0.4894754426996325, + "grad_norm": 1.4812700748443604, + "learning_rate": 1.2234101793072727e-05, + "loss": 0.1313, + "num_input_tokens_seen": 5350208, + "step": 4395 + }, + { + "epoch": 0.4900322975832498, + "grad_norm": 1.2384973764419556, + "learning_rate": 1.224802316516316e-05, + "loss": 0.1981, + "num_input_tokens_seen": 5356672, + "step": 4400 + }, + { + "epoch": 0.49058915246686713, + "grad_norm": 0.7120057940483093, + "learning_rate": 1.2261944537253592e-05, + "loss": 0.1135, + "num_input_tokens_seen": 5362688, + "step": 4405 + }, + { + "epoch": 0.49114600735048447, + "grad_norm": 0.6757912635803223, + "learning_rate": 1.2275865909344026e-05, + "loss": 0.2493, + "num_input_tokens_seen": 5368896, + "step": 4410 + }, + { + "epoch": 0.4917028622341018, + "grad_norm": 0.770695686340332, + "learning_rate": 1.2289787281434459e-05, + "loss": 0.1949, + "num_input_tokens_seen": 5375040, + "step": 4415 + }, + { + "epoch": 0.4922597171177191, + "grad_norm": 0.6375377774238586, + "learning_rate": 1.2303708653524893e-05, + "loss": 0.2871, + "num_input_tokens_seen": 5381408, + "step": 4420 + }, + { + "epoch": 0.49281657200133644, + "grad_norm": 1.0270838737487793, + "learning_rate": 1.2317630025615324e-05, + "loss": 0.1384, + "num_input_tokens_seen": 5387744, + "step": 4425 + }, + { + "epoch": 0.4933734268849538, + "grad_norm": 1.675126075744629, + "learning_rate": 1.2331551397705758e-05, + "loss": 0.2453, + "num_input_tokens_seen": 5394016, + "step": 4430 + }, + { + "epoch": 0.4939302817685711, + "grad_norm": 0.6015958189964294, + "learning_rate": 1.2345472769796191e-05, + "loss": 0.1153, + "num_input_tokens_seen": 5400608, + "step": 4435 + }, + { + "epoch": 0.49448713665218846, + "grad_norm": 1.360052227973938, + "learning_rate": 1.2359394141886625e-05, + "loss": 0.1692, + "num_input_tokens_seen": 5406912, + "step": 4440 + }, + { + "epoch": 0.49504399153580575, + "grad_norm": 1.8953007459640503, + "learning_rate": 1.2373315513977058e-05, + "loss": 0.3193, + "num_input_tokens_seen": 5412864, + "step": 4445 + }, + { + "epoch": 0.4956008464194231, + "grad_norm": 1.5506843328475952, + "learning_rate": 1.238723688606749e-05, + "loss": 0.2505, + "num_input_tokens_seen": 5418944, + "step": 4450 + }, + { + "epoch": 0.49615770130304043, + "grad_norm": 1.6009780168533325, + "learning_rate": 1.2401158258157925e-05, + "loss": 0.2253, + "num_input_tokens_seen": 5424992, + "step": 4455 + }, + { + "epoch": 0.4967145561866578, + "grad_norm": 0.4164126515388489, + "learning_rate": 1.2415079630248358e-05, + "loss": 0.2373, + "num_input_tokens_seen": 5431008, + "step": 4460 + }, + { + "epoch": 0.49727141107027506, + "grad_norm": 0.312496155500412, + "learning_rate": 1.2429001002338792e-05, + "loss": 0.1427, + "num_input_tokens_seen": 5436992, + "step": 4465 + }, + { + "epoch": 0.4978282659538924, + "grad_norm": 0.359754741191864, + "learning_rate": 1.2442922374429223e-05, + "loss": 0.1671, + "num_input_tokens_seen": 5443104, + "step": 4470 + }, + { + "epoch": 0.49838512083750974, + "grad_norm": 1.3601094484329224, + "learning_rate": 1.2456843746519657e-05, + "loss": 0.2712, + "num_input_tokens_seen": 5449248, + "step": 4475 + }, + { + "epoch": 0.4989419757211271, + "grad_norm": 0.5300992727279663, + "learning_rate": 1.2470765118610092e-05, + "loss": 0.1994, + "num_input_tokens_seen": 5455392, + "step": 4480 + }, + { + "epoch": 0.4994988306047444, + "grad_norm": 0.11406666040420532, + "learning_rate": 1.2484686490700524e-05, + "loss": 0.2247, + "num_input_tokens_seen": 5461504, + "step": 4485 + }, + { + "epoch": 0.5000556854883618, + "grad_norm": 0.5812989473342896, + "learning_rate": 1.2498607862790959e-05, + "loss": 0.2464, + "num_input_tokens_seen": 5467520, + "step": 4490 + }, + { + "epoch": 0.5006125403719791, + "grad_norm": 1.4195780754089355, + "learning_rate": 1.251252923488139e-05, + "loss": 0.3309, + "num_input_tokens_seen": 5473632, + "step": 4495 + }, + { + "epoch": 0.5011693952555963, + "grad_norm": 1.9830992221832275, + "learning_rate": 1.2526450606971824e-05, + "loss": 0.2412, + "num_input_tokens_seen": 5479072, + "step": 4500 + }, + { + "epoch": 0.5017262501392137, + "grad_norm": 2.2748281955718994, + "learning_rate": 1.2540371979062256e-05, + "loss": 0.4685, + "num_input_tokens_seen": 5485056, + "step": 4505 + }, + { + "epoch": 0.502283105022831, + "grad_norm": 1.0786515474319458, + "learning_rate": 1.255429335115269e-05, + "loss": 0.1619, + "num_input_tokens_seen": 5491456, + "step": 4510 + }, + { + "epoch": 0.5028399599064484, + "grad_norm": 1.2269765138626099, + "learning_rate": 1.2568214723243123e-05, + "loss": 0.2366, + "num_input_tokens_seen": 5497696, + "step": 4515 + }, + { + "epoch": 0.5033968147900657, + "grad_norm": 0.5149310827255249, + "learning_rate": 1.2582136095333558e-05, + "loss": 0.2611, + "num_input_tokens_seen": 5503520, + "step": 4520 + }, + { + "epoch": 0.503953669673683, + "grad_norm": 1.1044467687606812, + "learning_rate": 1.259605746742399e-05, + "loss": 0.27, + "num_input_tokens_seen": 5509280, + "step": 4525 + }, + { + "epoch": 0.5045105245573004, + "grad_norm": 1.9482921361923218, + "learning_rate": 1.2609978839514421e-05, + "loss": 0.3434, + "num_input_tokens_seen": 5515584, + "step": 4530 + }, + { + "epoch": 0.5050673794409177, + "grad_norm": 1.4189693927764893, + "learning_rate": 1.2623900211604856e-05, + "loss": 0.2319, + "num_input_tokens_seen": 5521760, + "step": 4535 + }, + { + "epoch": 0.5056242343245351, + "grad_norm": 1.2681708335876465, + "learning_rate": 1.2637821583695288e-05, + "loss": 0.1012, + "num_input_tokens_seen": 5528128, + "step": 4540 + }, + { + "epoch": 0.5061810892081523, + "grad_norm": 1.8889600038528442, + "learning_rate": 1.2651742955785723e-05, + "loss": 0.175, + "num_input_tokens_seen": 5534176, + "step": 4545 + }, + { + "epoch": 0.5067379440917696, + "grad_norm": 0.38776925206184387, + "learning_rate": 1.2665664327876157e-05, + "loss": 0.2959, + "num_input_tokens_seen": 5540256, + "step": 4550 + }, + { + "epoch": 0.507294798975387, + "grad_norm": 0.3609693944454193, + "learning_rate": 1.267958569996659e-05, + "loss": 0.0806, + "num_input_tokens_seen": 5546528, + "step": 4555 + }, + { + "epoch": 0.5078516538590043, + "grad_norm": 1.848738670349121, + "learning_rate": 1.2693507072057024e-05, + "loss": 0.2245, + "num_input_tokens_seen": 5552224, + "step": 4560 + }, + { + "epoch": 0.5084085087426217, + "grad_norm": 1.3426786661148071, + "learning_rate": 1.2707428444147457e-05, + "loss": 0.2021, + "num_input_tokens_seen": 5558752, + "step": 4565 + }, + { + "epoch": 0.508965363626239, + "grad_norm": 1.085437297821045, + "learning_rate": 1.272134981623789e-05, + "loss": 0.1603, + "num_input_tokens_seen": 5564800, + "step": 4570 + }, + { + "epoch": 0.5095222185098564, + "grad_norm": 1.147874355316162, + "learning_rate": 1.2735271188328322e-05, + "loss": 0.2824, + "num_input_tokens_seen": 5570144, + "step": 4575 + }, + { + "epoch": 0.5100790733934737, + "grad_norm": 0.7792468667030334, + "learning_rate": 1.2749192560418754e-05, + "loss": 0.3118, + "num_input_tokens_seen": 5576352, + "step": 4580 + }, + { + "epoch": 0.510635928277091, + "grad_norm": 3.148611068725586, + "learning_rate": 1.2763113932509189e-05, + "loss": 0.3781, + "num_input_tokens_seen": 5582464, + "step": 4585 + }, + { + "epoch": 0.5111927831607083, + "grad_norm": 1.737334966659546, + "learning_rate": 1.2777035304599621e-05, + "loss": 0.2983, + "num_input_tokens_seen": 5588288, + "step": 4590 + }, + { + "epoch": 0.5117496380443256, + "grad_norm": 0.6265900135040283, + "learning_rate": 1.2790956676690056e-05, + "loss": 0.2704, + "num_input_tokens_seen": 5594752, + "step": 4595 + }, + { + "epoch": 0.512306492927943, + "grad_norm": 1.60422682762146, + "learning_rate": 1.2804878048780488e-05, + "loss": 0.2528, + "num_input_tokens_seen": 5601152, + "step": 4600 + }, + { + "epoch": 0.5128633478115603, + "grad_norm": 0.6167590618133545, + "learning_rate": 1.2818799420870923e-05, + "loss": 0.2216, + "num_input_tokens_seen": 5607392, + "step": 4605 + }, + { + "epoch": 0.5134202026951776, + "grad_norm": 1.644441843032837, + "learning_rate": 1.2832720792961355e-05, + "loss": 0.2744, + "num_input_tokens_seen": 5613696, + "step": 4610 + }, + { + "epoch": 0.513977057578795, + "grad_norm": 1.2345470190048218, + "learning_rate": 1.284664216505179e-05, + "loss": 0.2055, + "num_input_tokens_seen": 5619872, + "step": 4615 + }, + { + "epoch": 0.5145339124624123, + "grad_norm": 0.15860667824745178, + "learning_rate": 1.286056353714222e-05, + "loss": 0.0959, + "num_input_tokens_seen": 5626048, + "step": 4620 + }, + { + "epoch": 0.5150907673460297, + "grad_norm": 1.0725221633911133, + "learning_rate": 1.2874484909232653e-05, + "loss": 0.1693, + "num_input_tokens_seen": 5632224, + "step": 4625 + }, + { + "epoch": 0.515647622229647, + "grad_norm": 2.0848779678344727, + "learning_rate": 1.2888406281323088e-05, + "loss": 0.2768, + "num_input_tokens_seen": 5638016, + "step": 4630 + }, + { + "epoch": 0.5162044771132643, + "grad_norm": 5.009835720062256, + "learning_rate": 1.290232765341352e-05, + "loss": 0.5102, + "num_input_tokens_seen": 5644160, + "step": 4635 + }, + { + "epoch": 0.5167613319968816, + "grad_norm": 0.6896377205848694, + "learning_rate": 1.2916249025503954e-05, + "loss": 0.1287, + "num_input_tokens_seen": 5650272, + "step": 4640 + }, + { + "epoch": 0.5173181868804989, + "grad_norm": 1.101624846458435, + "learning_rate": 1.2930170397594387e-05, + "loss": 0.2107, + "num_input_tokens_seen": 5656576, + "step": 4645 + }, + { + "epoch": 0.5178750417641163, + "grad_norm": 0.49695929884910583, + "learning_rate": 1.2944091769684821e-05, + "loss": 0.2563, + "num_input_tokens_seen": 5662592, + "step": 4650 + }, + { + "epoch": 0.5184318966477336, + "grad_norm": 1.1161242723464966, + "learning_rate": 1.2958013141775254e-05, + "loss": 0.27, + "num_input_tokens_seen": 5668736, + "step": 4655 + }, + { + "epoch": 0.5189887515313509, + "grad_norm": 0.43300238251686096, + "learning_rate": 1.2971934513865688e-05, + "loss": 0.1874, + "num_input_tokens_seen": 5674880, + "step": 4660 + }, + { + "epoch": 0.5195456064149683, + "grad_norm": 1.5415953397750854, + "learning_rate": 1.298585588595612e-05, + "loss": 0.2794, + "num_input_tokens_seen": 5681056, + "step": 4665 + }, + { + "epoch": 0.5201024612985856, + "grad_norm": 0.03481674939393997, + "learning_rate": 1.2999777258046552e-05, + "loss": 0.1578, + "num_input_tokens_seen": 5687136, + "step": 4670 + }, + { + "epoch": 0.520659316182203, + "grad_norm": 0.7792909145355225, + "learning_rate": 1.3013698630136986e-05, + "loss": 0.1269, + "num_input_tokens_seen": 5693216, + "step": 4675 + }, + { + "epoch": 0.5212161710658203, + "grad_norm": 0.25394269824028015, + "learning_rate": 1.3027620002227419e-05, + "loss": 0.1936, + "num_input_tokens_seen": 5699232, + "step": 4680 + }, + { + "epoch": 0.5217730259494375, + "grad_norm": 3.011901617050171, + "learning_rate": 1.3041541374317853e-05, + "loss": 0.3716, + "num_input_tokens_seen": 5705280, + "step": 4685 + }, + { + "epoch": 0.5223298808330549, + "grad_norm": 2.007561683654785, + "learning_rate": 1.3055462746408288e-05, + "loss": 0.222, + "num_input_tokens_seen": 5711456, + "step": 4690 + }, + { + "epoch": 0.5228867357166722, + "grad_norm": 2.7484958171844482, + "learning_rate": 1.306938411849872e-05, + "loss": 0.2699, + "num_input_tokens_seen": 5717440, + "step": 4695 + }, + { + "epoch": 0.5234435906002896, + "grad_norm": 0.8015807271003723, + "learning_rate": 1.3083305490589155e-05, + "loss": 0.1872, + "num_input_tokens_seen": 5723456, + "step": 4700 + }, + { + "epoch": 0.5240004454839069, + "grad_norm": 0.7708243131637573, + "learning_rate": 1.3097226862679587e-05, + "loss": 0.1182, + "num_input_tokens_seen": 5729632, + "step": 4705 + }, + { + "epoch": 0.5245573003675242, + "grad_norm": 1.5337055921554565, + "learning_rate": 1.3111148234770018e-05, + "loss": 0.3185, + "num_input_tokens_seen": 5736000, + "step": 4710 + }, + { + "epoch": 0.5251141552511416, + "grad_norm": 0.1081574410200119, + "learning_rate": 1.3125069606860452e-05, + "loss": 0.1295, + "num_input_tokens_seen": 5741952, + "step": 4715 + }, + { + "epoch": 0.5256710101347589, + "grad_norm": 1.0986663103103638, + "learning_rate": 1.3138990978950885e-05, + "loss": 0.288, + "num_input_tokens_seen": 5748160, + "step": 4720 + }, + { + "epoch": 0.5262278650183763, + "grad_norm": 0.3138372302055359, + "learning_rate": 1.315291235104132e-05, + "loss": 0.1955, + "num_input_tokens_seen": 5754176, + "step": 4725 + }, + { + "epoch": 0.5267847199019935, + "grad_norm": 2.5580334663391113, + "learning_rate": 1.3166833723131752e-05, + "loss": 0.1279, + "num_input_tokens_seen": 5760256, + "step": 4730 + }, + { + "epoch": 0.5273415747856108, + "grad_norm": 1.2708059549331665, + "learning_rate": 1.3180755095222186e-05, + "loss": 0.122, + "num_input_tokens_seen": 5766336, + "step": 4735 + }, + { + "epoch": 0.5278984296692282, + "grad_norm": 0.7918967008590698, + "learning_rate": 1.3194676467312619e-05, + "loss": 0.1014, + "num_input_tokens_seen": 5772736, + "step": 4740 + }, + { + "epoch": 0.5284552845528455, + "grad_norm": 0.3823913335800171, + "learning_rate": 1.3208597839403053e-05, + "loss": 0.3115, + "num_input_tokens_seen": 5778784, + "step": 4745 + }, + { + "epoch": 0.5290121394364629, + "grad_norm": 0.6331570148468018, + "learning_rate": 1.3222519211493486e-05, + "loss": 0.3534, + "num_input_tokens_seen": 5784992, + "step": 4750 + }, + { + "epoch": 0.5295689943200802, + "grad_norm": 2.178738832473755, + "learning_rate": 1.323644058358392e-05, + "loss": 0.3354, + "num_input_tokens_seen": 5791072, + "step": 4755 + }, + { + "epoch": 0.5301258492036975, + "grad_norm": 0.5439645051956177, + "learning_rate": 1.3250361955674351e-05, + "loss": 0.118, + "num_input_tokens_seen": 5797248, + "step": 4760 + }, + { + "epoch": 0.5306827040873149, + "grad_norm": 0.34649649262428284, + "learning_rate": 1.3264283327764784e-05, + "loss": 0.2154, + "num_input_tokens_seen": 5803648, + "step": 4765 + }, + { + "epoch": 0.5312395589709322, + "grad_norm": 1.797161340713501, + "learning_rate": 1.3278204699855218e-05, + "loss": 0.343, + "num_input_tokens_seen": 5809760, + "step": 4770 + }, + { + "epoch": 0.5317964138545495, + "grad_norm": 1.8231070041656494, + "learning_rate": 1.3292126071945651e-05, + "loss": 0.4062, + "num_input_tokens_seen": 5815744, + "step": 4775 + }, + { + "epoch": 0.5323532687381668, + "grad_norm": 1.1227123737335205, + "learning_rate": 1.3306047444036085e-05, + "loss": 0.1787, + "num_input_tokens_seen": 5821984, + "step": 4780 + }, + { + "epoch": 0.5329101236217841, + "grad_norm": 0.48520177602767944, + "learning_rate": 1.3319968816126518e-05, + "loss": 0.1936, + "num_input_tokens_seen": 5828128, + "step": 4785 + }, + { + "epoch": 0.5334669785054015, + "grad_norm": 0.550279974937439, + "learning_rate": 1.3333890188216952e-05, + "loss": 0.309, + "num_input_tokens_seen": 5834016, + "step": 4790 + }, + { + "epoch": 0.5340238333890188, + "grad_norm": 1.2341604232788086, + "learning_rate": 1.3347811560307385e-05, + "loss": 0.1421, + "num_input_tokens_seen": 5840256, + "step": 4795 + }, + { + "epoch": 0.5345806882726362, + "grad_norm": 0.6324247717857361, + "learning_rate": 1.3361732932397819e-05, + "loss": 0.2883, + "num_input_tokens_seen": 5846112, + "step": 4800 + }, + { + "epoch": 0.5351375431562535, + "grad_norm": 1.9175447225570679, + "learning_rate": 1.337565430448825e-05, + "loss": 0.1884, + "num_input_tokens_seen": 5852032, + "step": 4805 + }, + { + "epoch": 0.5356943980398708, + "grad_norm": 1.6896309852600098, + "learning_rate": 1.3389575676578683e-05, + "loss": 0.225, + "num_input_tokens_seen": 5857408, + "step": 4810 + }, + { + "epoch": 0.5362512529234882, + "grad_norm": 1.3510181903839111, + "learning_rate": 1.3403497048669117e-05, + "loss": 0.2627, + "num_input_tokens_seen": 5863584, + "step": 4815 + }, + { + "epoch": 0.5368081078071054, + "grad_norm": 2.2814877033233643, + "learning_rate": 1.341741842075955e-05, + "loss": 0.1871, + "num_input_tokens_seen": 5869664, + "step": 4820 + }, + { + "epoch": 0.5373649626907228, + "grad_norm": 1.1968953609466553, + "learning_rate": 1.3431339792849984e-05, + "loss": 0.2633, + "num_input_tokens_seen": 5875840, + "step": 4825 + }, + { + "epoch": 0.5379218175743401, + "grad_norm": 2.289290428161621, + "learning_rate": 1.3445261164940418e-05, + "loss": 0.2619, + "num_input_tokens_seen": 5882048, + "step": 4830 + }, + { + "epoch": 0.5384786724579574, + "grad_norm": 1.0515533685684204, + "learning_rate": 1.3459182537030851e-05, + "loss": 0.1477, + "num_input_tokens_seen": 5888256, + "step": 4835 + }, + { + "epoch": 0.5390355273415748, + "grad_norm": 2.257871150970459, + "learning_rate": 1.3473103909121285e-05, + "loss": 0.2666, + "num_input_tokens_seen": 5894368, + "step": 4840 + }, + { + "epoch": 0.5395923822251921, + "grad_norm": 0.4793892502784729, + "learning_rate": 1.3487025281211718e-05, + "loss": 0.1345, + "num_input_tokens_seen": 5900000, + "step": 4845 + }, + { + "epoch": 0.5401492371088095, + "grad_norm": 0.2865378260612488, + "learning_rate": 1.3500946653302149e-05, + "loss": 0.1397, + "num_input_tokens_seen": 5906048, + "step": 4850 + }, + { + "epoch": 0.5407060919924268, + "grad_norm": 0.40610387921333313, + "learning_rate": 1.3514868025392583e-05, + "loss": 0.1686, + "num_input_tokens_seen": 5912448, + "step": 4855 + }, + { + "epoch": 0.5412629468760441, + "grad_norm": 0.6827104091644287, + "learning_rate": 1.3528789397483016e-05, + "loss": 0.1144, + "num_input_tokens_seen": 5918656, + "step": 4860 + }, + { + "epoch": 0.5418198017596614, + "grad_norm": 1.4913005828857422, + "learning_rate": 1.354271076957345e-05, + "loss": 0.2748, + "num_input_tokens_seen": 5924672, + "step": 4865 + }, + { + "epoch": 0.5423766566432787, + "grad_norm": 0.7168650031089783, + "learning_rate": 1.3556632141663883e-05, + "loss": 0.1148, + "num_input_tokens_seen": 5930688, + "step": 4870 + }, + { + "epoch": 0.5429335115268961, + "grad_norm": 0.8291113376617432, + "learning_rate": 1.3570553513754317e-05, + "loss": 0.1322, + "num_input_tokens_seen": 5936800, + "step": 4875 + }, + { + "epoch": 0.5434903664105134, + "grad_norm": 0.7123365998268127, + "learning_rate": 1.358447488584475e-05, + "loss": 0.0587, + "num_input_tokens_seen": 5943072, + "step": 4880 + }, + { + "epoch": 0.5440472212941307, + "grad_norm": 2.463165044784546, + "learning_rate": 1.3598396257935184e-05, + "loss": 0.2369, + "num_input_tokens_seen": 5949312, + "step": 4885 + }, + { + "epoch": 0.5446040761777481, + "grad_norm": 1.5197365283966064, + "learning_rate": 1.3612317630025617e-05, + "loss": 0.1595, + "num_input_tokens_seen": 5955712, + "step": 4890 + }, + { + "epoch": 0.5451609310613654, + "grad_norm": 0.7450543642044067, + "learning_rate": 1.3626239002116048e-05, + "loss": 0.422, + "num_input_tokens_seen": 5962432, + "step": 4895 + }, + { + "epoch": 0.5457177859449828, + "grad_norm": 1.3704146146774292, + "learning_rate": 1.3640160374206482e-05, + "loss": 0.1874, + "num_input_tokens_seen": 5968416, + "step": 4900 + }, + { + "epoch": 0.5462746408286001, + "grad_norm": 0.7857948541641235, + "learning_rate": 1.3654081746296915e-05, + "loss": 0.191, + "num_input_tokens_seen": 5974816, + "step": 4905 + }, + { + "epoch": 0.5468314957122173, + "grad_norm": 0.6980739831924438, + "learning_rate": 1.3668003118387349e-05, + "loss": 0.2073, + "num_input_tokens_seen": 5980864, + "step": 4910 + }, + { + "epoch": 0.5473883505958347, + "grad_norm": 1.75370454788208, + "learning_rate": 1.3681924490477782e-05, + "loss": 0.2179, + "num_input_tokens_seen": 5986144, + "step": 4915 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 0.6871148943901062, + "learning_rate": 1.3695845862568216e-05, + "loss": 0.0889, + "num_input_tokens_seen": 5992672, + "step": 4920 + }, + { + "epoch": 0.5485020603630694, + "grad_norm": 0.6961513757705688, + "learning_rate": 1.3709767234658649e-05, + "loss": 0.2517, + "num_input_tokens_seen": 5998176, + "step": 4925 + }, + { + "epoch": 0.5490589152466867, + "grad_norm": 0.8640778660774231, + "learning_rate": 1.3723688606749083e-05, + "loss": 0.1874, + "num_input_tokens_seen": 6004064, + "step": 4930 + }, + { + "epoch": 0.549615770130304, + "grad_norm": 1.9181057214736938, + "learning_rate": 1.3737609978839515e-05, + "loss": 0.2527, + "num_input_tokens_seen": 6010688, + "step": 4935 + }, + { + "epoch": 0.5501726250139214, + "grad_norm": 1.8293401002883911, + "learning_rate": 1.3751531350929946e-05, + "loss": 0.214, + "num_input_tokens_seen": 6016896, + "step": 4940 + }, + { + "epoch": 0.5507294798975387, + "grad_norm": 2.3319427967071533, + "learning_rate": 1.376545272302038e-05, + "loss": 0.2896, + "num_input_tokens_seen": 6022848, + "step": 4945 + }, + { + "epoch": 0.5512863347811561, + "grad_norm": 1.5141961574554443, + "learning_rate": 1.3779374095110813e-05, + "loss": 0.2286, + "num_input_tokens_seen": 6029216, + "step": 4950 + }, + { + "epoch": 0.5518431896647734, + "grad_norm": 0.33558690547943115, + "learning_rate": 1.3793295467201248e-05, + "loss": 0.115, + "num_input_tokens_seen": 6035360, + "step": 4955 + }, + { + "epoch": 0.5524000445483906, + "grad_norm": 0.4244135618209839, + "learning_rate": 1.380721683929168e-05, + "loss": 0.1743, + "num_input_tokens_seen": 6041632, + "step": 4960 + }, + { + "epoch": 0.552956899432008, + "grad_norm": 0.49917367100715637, + "learning_rate": 1.3821138211382115e-05, + "loss": 0.1619, + "num_input_tokens_seen": 6047712, + "step": 4965 + }, + { + "epoch": 0.5535137543156253, + "grad_norm": 1.9671260118484497, + "learning_rate": 1.3835059583472549e-05, + "loss": 0.2399, + "num_input_tokens_seen": 6053792, + "step": 4970 + }, + { + "epoch": 0.5540706091992427, + "grad_norm": 2.5626461505889893, + "learning_rate": 1.3848980955562982e-05, + "loss": 0.1381, + "num_input_tokens_seen": 6059904, + "step": 4975 + }, + { + "epoch": 0.55462746408286, + "grad_norm": 0.24784846603870392, + "learning_rate": 1.3862902327653416e-05, + "loss": 0.1904, + "num_input_tokens_seen": 6066336, + "step": 4980 + }, + { + "epoch": 0.5551843189664774, + "grad_norm": 0.4622545540332794, + "learning_rate": 1.3876823699743847e-05, + "loss": 0.0776, + "num_input_tokens_seen": 6072448, + "step": 4985 + }, + { + "epoch": 0.5557411738500947, + "grad_norm": 1.1094969511032104, + "learning_rate": 1.389074507183428e-05, + "loss": 0.1895, + "num_input_tokens_seen": 6078464, + "step": 4990 + }, + { + "epoch": 0.556298028733712, + "grad_norm": 0.7274879217147827, + "learning_rate": 1.3904666443924714e-05, + "loss": 0.5285, + "num_input_tokens_seen": 6084480, + "step": 4995 + }, + { + "epoch": 0.5568548836173294, + "grad_norm": 0.732981264591217, + "learning_rate": 1.3918587816015147e-05, + "loss": 0.4259, + "num_input_tokens_seen": 6090784, + "step": 5000 + }, + { + "epoch": 0.5574117385009466, + "grad_norm": 2.3373591899871826, + "learning_rate": 1.3932509188105581e-05, + "loss": 0.4717, + "num_input_tokens_seen": 6096608, + "step": 5005 + }, + { + "epoch": 0.557968593384564, + "grad_norm": 1.2733207941055298, + "learning_rate": 1.3946430560196013e-05, + "loss": 0.2554, + "num_input_tokens_seen": 6102784, + "step": 5010 + }, + { + "epoch": 0.5585254482681813, + "grad_norm": 2.0952670574188232, + "learning_rate": 1.3960351932286448e-05, + "loss": 0.3093, + "num_input_tokens_seen": 6108864, + "step": 5015 + }, + { + "epoch": 0.5590823031517986, + "grad_norm": 0.9409027099609375, + "learning_rate": 1.397427330437688e-05, + "loss": 0.1669, + "num_input_tokens_seen": 6114848, + "step": 5020 + }, + { + "epoch": 0.559639158035416, + "grad_norm": 1.1156350374221802, + "learning_rate": 1.3988194676467315e-05, + "loss": 0.1761, + "num_input_tokens_seen": 6120992, + "step": 5025 + }, + { + "epoch": 0.5601960129190333, + "grad_norm": 0.377748966217041, + "learning_rate": 1.4002116048557746e-05, + "loss": 0.1655, + "num_input_tokens_seen": 6126432, + "step": 5030 + }, + { + "epoch": 0.5607528678026507, + "grad_norm": 1.8507074117660522, + "learning_rate": 1.4016037420648178e-05, + "loss": 0.2022, + "num_input_tokens_seen": 6132512, + "step": 5035 + }, + { + "epoch": 0.561309722686268, + "grad_norm": 0.24234361946582794, + "learning_rate": 1.4029958792738613e-05, + "loss": 0.0552, + "num_input_tokens_seen": 6138464, + "step": 5040 + }, + { + "epoch": 0.5618665775698853, + "grad_norm": 0.5432570576667786, + "learning_rate": 1.4043880164829045e-05, + "loss": 0.2645, + "num_input_tokens_seen": 6144800, + "step": 5045 + }, + { + "epoch": 0.5624234324535026, + "grad_norm": 1.4847183227539062, + "learning_rate": 1.405780153691948e-05, + "loss": 0.3082, + "num_input_tokens_seen": 6150880, + "step": 5050 + }, + { + "epoch": 0.5629802873371199, + "grad_norm": 1.7938261032104492, + "learning_rate": 1.4071722909009912e-05, + "loss": 0.2837, + "num_input_tokens_seen": 6156832, + "step": 5055 + }, + { + "epoch": 0.5635371422207373, + "grad_norm": 1.0145204067230225, + "learning_rate": 1.4085644281100347e-05, + "loss": 0.1771, + "num_input_tokens_seen": 6162720, + "step": 5060 + }, + { + "epoch": 0.5640939971043546, + "grad_norm": 3.3943684101104736, + "learning_rate": 1.409956565319078e-05, + "loss": 0.5084, + "num_input_tokens_seen": 6168640, + "step": 5065 + }, + { + "epoch": 0.5646508519879719, + "grad_norm": 1.6145521402359009, + "learning_rate": 1.4113487025281214e-05, + "loss": 0.2277, + "num_input_tokens_seen": 6174816, + "step": 5070 + }, + { + "epoch": 0.5652077068715893, + "grad_norm": 1.072461485862732, + "learning_rate": 1.4127408397371646e-05, + "loss": 0.3782, + "num_input_tokens_seen": 6180448, + "step": 5075 + }, + { + "epoch": 0.5657645617552066, + "grad_norm": 2.391047716140747, + "learning_rate": 1.4141329769462077e-05, + "loss": 0.3557, + "num_input_tokens_seen": 6186336, + "step": 5080 + }, + { + "epoch": 0.566321416638824, + "grad_norm": 0.9250662922859192, + "learning_rate": 1.4155251141552511e-05, + "loss": 0.2869, + "num_input_tokens_seen": 6192256, + "step": 5085 + }, + { + "epoch": 0.5668782715224413, + "grad_norm": 0.10793192684650421, + "learning_rate": 1.4169172513642944e-05, + "loss": 0.1907, + "num_input_tokens_seen": 6197600, + "step": 5090 + }, + { + "epoch": 0.5674351264060585, + "grad_norm": 1.7742289304733276, + "learning_rate": 1.4183093885733378e-05, + "loss": 0.099, + "num_input_tokens_seen": 6203776, + "step": 5095 + }, + { + "epoch": 0.5679919812896759, + "grad_norm": 2.0475997924804688, + "learning_rate": 1.4197015257823813e-05, + "loss": 0.4431, + "num_input_tokens_seen": 6209408, + "step": 5100 + }, + { + "epoch": 0.5685488361732932, + "grad_norm": 1.4109702110290527, + "learning_rate": 1.4210936629914245e-05, + "loss": 0.1079, + "num_input_tokens_seen": 6215360, + "step": 5105 + }, + { + "epoch": 0.5691056910569106, + "grad_norm": 1.1738030910491943, + "learning_rate": 1.422485800200468e-05, + "loss": 0.3418, + "num_input_tokens_seen": 6220960, + "step": 5110 + }, + { + "epoch": 0.5696625459405279, + "grad_norm": 1.1624099016189575, + "learning_rate": 1.4238779374095112e-05, + "loss": 0.2479, + "num_input_tokens_seen": 6227168, + "step": 5115 + }, + { + "epoch": 0.5702194008241452, + "grad_norm": 0.9608790278434753, + "learning_rate": 1.4252700746185547e-05, + "loss": 0.3591, + "num_input_tokens_seen": 6233024, + "step": 5120 + }, + { + "epoch": 0.5707762557077626, + "grad_norm": 0.06945519894361496, + "learning_rate": 1.4266622118275978e-05, + "loss": 0.0983, + "num_input_tokens_seen": 6238880, + "step": 5125 + }, + { + "epoch": 0.5713331105913799, + "grad_norm": 0.938219428062439, + "learning_rate": 1.428054349036641e-05, + "loss": 0.2874, + "num_input_tokens_seen": 6245056, + "step": 5130 + }, + { + "epoch": 0.5718899654749973, + "grad_norm": 0.21999430656433105, + "learning_rate": 1.4294464862456845e-05, + "loss": 0.0548, + "num_input_tokens_seen": 6251520, + "step": 5135 + }, + { + "epoch": 0.5724468203586145, + "grad_norm": 1.299635410308838, + "learning_rate": 1.4308386234547277e-05, + "loss": 0.2952, + "num_input_tokens_seen": 6257632, + "step": 5140 + }, + { + "epoch": 0.5730036752422318, + "grad_norm": 1.751299262046814, + "learning_rate": 1.4322307606637712e-05, + "loss": 0.2333, + "num_input_tokens_seen": 6263904, + "step": 5145 + }, + { + "epoch": 0.5735605301258492, + "grad_norm": 2.4469399452209473, + "learning_rate": 1.4336228978728144e-05, + "loss": 0.2228, + "num_input_tokens_seen": 6269728, + "step": 5150 + }, + { + "epoch": 0.5741173850094665, + "grad_norm": 1.0912131071090698, + "learning_rate": 1.4350150350818579e-05, + "loss": 0.1578, + "num_input_tokens_seen": 6275744, + "step": 5155 + }, + { + "epoch": 0.5746742398930839, + "grad_norm": 0.3003332316875458, + "learning_rate": 1.4364071722909011e-05, + "loss": 0.0987, + "num_input_tokens_seen": 6281696, + "step": 5160 + }, + { + "epoch": 0.5752310947767012, + "grad_norm": 1.8004815578460693, + "learning_rate": 1.4377993094999445e-05, + "loss": 0.3018, + "num_input_tokens_seen": 6287840, + "step": 5165 + }, + { + "epoch": 0.5757879496603185, + "grad_norm": 1.8020594120025635, + "learning_rate": 1.4391914467089876e-05, + "loss": 0.1505, + "num_input_tokens_seen": 6293472, + "step": 5170 + }, + { + "epoch": 0.5763448045439359, + "grad_norm": 1.5013813972473145, + "learning_rate": 1.4405835839180309e-05, + "loss": 0.2122, + "num_input_tokens_seen": 6299456, + "step": 5175 + }, + { + "epoch": 0.5769016594275532, + "grad_norm": 0.3664199709892273, + "learning_rate": 1.4419757211270743e-05, + "loss": 0.1378, + "num_input_tokens_seen": 6305472, + "step": 5180 + }, + { + "epoch": 0.5774585143111705, + "grad_norm": 1.7464159727096558, + "learning_rate": 1.4433678583361176e-05, + "loss": 0.2175, + "num_input_tokens_seen": 6311616, + "step": 5185 + }, + { + "epoch": 0.5780153691947878, + "grad_norm": 0.46047160029411316, + "learning_rate": 1.444759995545161e-05, + "loss": 0.3394, + "num_input_tokens_seen": 6317760, + "step": 5190 + }, + { + "epoch": 0.5785722240784051, + "grad_norm": 1.721217393875122, + "learning_rate": 1.4461521327542043e-05, + "loss": 0.4676, + "num_input_tokens_seen": 6324032, + "step": 5195 + }, + { + "epoch": 0.5791290789620225, + "grad_norm": 2.255868673324585, + "learning_rate": 1.4475442699632477e-05, + "loss": 0.3574, + "num_input_tokens_seen": 6330176, + "step": 5200 + }, + { + "epoch": 0.5796859338456398, + "grad_norm": 2.1362898349761963, + "learning_rate": 1.448936407172291e-05, + "loss": 0.2399, + "num_input_tokens_seen": 6336000, + "step": 5205 + }, + { + "epoch": 0.5802427887292572, + "grad_norm": 1.0375475883483887, + "learning_rate": 1.4503285443813344e-05, + "loss": 0.2099, + "num_input_tokens_seen": 6341536, + "step": 5210 + }, + { + "epoch": 0.5807996436128745, + "grad_norm": 1.9864230155944824, + "learning_rate": 1.4517206815903775e-05, + "loss": 0.311, + "num_input_tokens_seen": 6347136, + "step": 5215 + }, + { + "epoch": 0.5813564984964918, + "grad_norm": 2.1536073684692383, + "learning_rate": 1.4531128187994208e-05, + "loss": 0.2362, + "num_input_tokens_seen": 6352736, + "step": 5220 + }, + { + "epoch": 0.5819133533801092, + "grad_norm": 0.79313063621521, + "learning_rate": 1.4545049560084642e-05, + "loss": 0.1379, + "num_input_tokens_seen": 6358912, + "step": 5225 + }, + { + "epoch": 0.5824702082637264, + "grad_norm": 0.9944854974746704, + "learning_rate": 1.4558970932175075e-05, + "loss": 0.1348, + "num_input_tokens_seen": 6364928, + "step": 5230 + }, + { + "epoch": 0.5830270631473438, + "grad_norm": 0.1617928445339203, + "learning_rate": 1.4572892304265509e-05, + "loss": 0.1541, + "num_input_tokens_seen": 6370912, + "step": 5235 + }, + { + "epoch": 0.5835839180309611, + "grad_norm": 0.5259851813316345, + "learning_rate": 1.4586813676355943e-05, + "loss": 0.1596, + "num_input_tokens_seen": 6377088, + "step": 5240 + }, + { + "epoch": 0.5841407729145784, + "grad_norm": 0.6278572082519531, + "learning_rate": 1.4600735048446376e-05, + "loss": 0.1164, + "num_input_tokens_seen": 6383104, + "step": 5245 + }, + { + "epoch": 0.5846976277981958, + "grad_norm": 1.392622947692871, + "learning_rate": 1.461465642053681e-05, + "loss": 0.1416, + "num_input_tokens_seen": 6389504, + "step": 5250 + }, + { + "epoch": 0.5852544826818131, + "grad_norm": 2.6578009128570557, + "learning_rate": 1.4628577792627243e-05, + "loss": 0.3934, + "num_input_tokens_seen": 6395552, + "step": 5255 + }, + { + "epoch": 0.5858113375654305, + "grad_norm": 0.7914688587188721, + "learning_rate": 1.4642499164717674e-05, + "loss": 0.3132, + "num_input_tokens_seen": 6401856, + "step": 5260 + }, + { + "epoch": 0.5863681924490478, + "grad_norm": 1.3025623559951782, + "learning_rate": 1.4656420536808108e-05, + "loss": 0.3303, + "num_input_tokens_seen": 6407840, + "step": 5265 + }, + { + "epoch": 0.5869250473326652, + "grad_norm": 0.9062146544456482, + "learning_rate": 1.4670341908898541e-05, + "loss": 0.1314, + "num_input_tokens_seen": 6413920, + "step": 5270 + }, + { + "epoch": 0.5874819022162825, + "grad_norm": 0.6282882690429688, + "learning_rate": 1.4684263280988975e-05, + "loss": 0.2552, + "num_input_tokens_seen": 6419872, + "step": 5275 + }, + { + "epoch": 0.5880387570998997, + "grad_norm": 1.1397219896316528, + "learning_rate": 1.4698184653079408e-05, + "loss": 0.1864, + "num_input_tokens_seen": 6425472, + "step": 5280 + }, + { + "epoch": 0.5885956119835171, + "grad_norm": 1.0999399423599243, + "learning_rate": 1.4712106025169842e-05, + "loss": 0.2364, + "num_input_tokens_seen": 6431680, + "step": 5285 + }, + { + "epoch": 0.5891524668671344, + "grad_norm": 0.8741183280944824, + "learning_rate": 1.4726027397260275e-05, + "loss": 0.3065, + "num_input_tokens_seen": 6437824, + "step": 5290 + }, + { + "epoch": 0.5897093217507517, + "grad_norm": 0.7077872157096863, + "learning_rate": 1.473994876935071e-05, + "loss": 0.2259, + "num_input_tokens_seen": 6444096, + "step": 5295 + }, + { + "epoch": 0.5902661766343691, + "grad_norm": 0.427985817193985, + "learning_rate": 1.4753870141441142e-05, + "loss": 0.083, + "num_input_tokens_seen": 6450176, + "step": 5300 + }, + { + "epoch": 0.5908230315179864, + "grad_norm": 0.37862148880958557, + "learning_rate": 1.4767791513531573e-05, + "loss": 0.1776, + "num_input_tokens_seen": 6455712, + "step": 5305 + }, + { + "epoch": 0.5913798864016038, + "grad_norm": 0.6435206532478333, + "learning_rate": 1.4781712885622007e-05, + "loss": 0.2819, + "num_input_tokens_seen": 6461696, + "step": 5310 + }, + { + "epoch": 0.5919367412852211, + "grad_norm": 1.0449899435043335, + "learning_rate": 1.479563425771244e-05, + "loss": 0.2434, + "num_input_tokens_seen": 6467328, + "step": 5315 + }, + { + "epoch": 0.5924935961688385, + "grad_norm": 1.5535755157470703, + "learning_rate": 1.4809555629802874e-05, + "loss": 0.3102, + "num_input_tokens_seen": 6473696, + "step": 5320 + }, + { + "epoch": 0.5930504510524557, + "grad_norm": 0.5042328834533691, + "learning_rate": 1.4823477001893307e-05, + "loss": 0.1395, + "num_input_tokens_seen": 6480000, + "step": 5325 + }, + { + "epoch": 0.593607305936073, + "grad_norm": 2.016237497329712, + "learning_rate": 1.4837398373983741e-05, + "loss": 0.2183, + "num_input_tokens_seen": 6485696, + "step": 5330 + }, + { + "epoch": 0.5941641608196904, + "grad_norm": 0.48718222975730896, + "learning_rate": 1.4851319746074174e-05, + "loss": 0.277, + "num_input_tokens_seen": 6491616, + "step": 5335 + }, + { + "epoch": 0.5947210157033077, + "grad_norm": 0.6780077815055847, + "learning_rate": 1.4865241118164608e-05, + "loss": 0.1561, + "num_input_tokens_seen": 6497728, + "step": 5340 + }, + { + "epoch": 0.595277870586925, + "grad_norm": 0.9836495518684387, + "learning_rate": 1.487916249025504e-05, + "loss": 0.1769, + "num_input_tokens_seen": 6504064, + "step": 5345 + }, + { + "epoch": 0.5958347254705424, + "grad_norm": 0.8919978737831116, + "learning_rate": 1.4893083862345472e-05, + "loss": 0.3347, + "num_input_tokens_seen": 6509952, + "step": 5350 + }, + { + "epoch": 0.5963915803541597, + "grad_norm": 1.5179448127746582, + "learning_rate": 1.4907005234435906e-05, + "loss": 0.3109, + "num_input_tokens_seen": 6515968, + "step": 5355 + }, + { + "epoch": 0.5969484352377771, + "grad_norm": 0.8139241933822632, + "learning_rate": 1.4920926606526339e-05, + "loss": 0.1169, + "num_input_tokens_seen": 6521664, + "step": 5360 + }, + { + "epoch": 0.5975052901213944, + "grad_norm": 1.2201013565063477, + "learning_rate": 1.4934847978616773e-05, + "loss": 0.2429, + "num_input_tokens_seen": 6528032, + "step": 5365 + }, + { + "epoch": 0.5980621450050116, + "grad_norm": 1.1611697673797607, + "learning_rate": 1.4948769350707206e-05, + "loss": 0.1402, + "num_input_tokens_seen": 6534208, + "step": 5370 + }, + { + "epoch": 0.598618999888629, + "grad_norm": 0.2044166475534439, + "learning_rate": 1.496269072279764e-05, + "loss": 0.1212, + "num_input_tokens_seen": 6540672, + "step": 5375 + }, + { + "epoch": 0.5991758547722463, + "grad_norm": 2.9624216556549072, + "learning_rate": 1.4976612094888074e-05, + "loss": 0.4075, + "num_input_tokens_seen": 6546784, + "step": 5380 + }, + { + "epoch": 0.5997327096558637, + "grad_norm": 0.765896737575531, + "learning_rate": 1.4990533466978507e-05, + "loss": 0.2873, + "num_input_tokens_seen": 6553248, + "step": 5385 + }, + { + "epoch": 0.600289564539481, + "grad_norm": 1.681243896484375, + "learning_rate": 1.5004454839068941e-05, + "loss": 0.1493, + "num_input_tokens_seen": 6559360, + "step": 5390 + }, + { + "epoch": 0.6008464194230984, + "grad_norm": 0.438841849565506, + "learning_rate": 1.501837621115937e-05, + "loss": 0.1452, + "num_input_tokens_seen": 6565312, + "step": 5395 + }, + { + "epoch": 0.6014032743067157, + "grad_norm": 1.6467514038085938, + "learning_rate": 1.5032297583249805e-05, + "loss": 0.1445, + "num_input_tokens_seen": 6571328, + "step": 5400 + }, + { + "epoch": 0.601960129190333, + "grad_norm": 2.4646975994110107, + "learning_rate": 1.5046218955340239e-05, + "loss": 0.3237, + "num_input_tokens_seen": 6577248, + "step": 5405 + }, + { + "epoch": 0.6025169840739504, + "grad_norm": 0.8153958320617676, + "learning_rate": 1.5060140327430672e-05, + "loss": 0.1747, + "num_input_tokens_seen": 6583648, + "step": 5410 + }, + { + "epoch": 0.6030738389575676, + "grad_norm": 0.2995830774307251, + "learning_rate": 1.5074061699521106e-05, + "loss": 0.1716, + "num_input_tokens_seen": 6589504, + "step": 5415 + }, + { + "epoch": 0.603630693841185, + "grad_norm": 2.0906128883361816, + "learning_rate": 1.5087983071611539e-05, + "loss": 0.2437, + "num_input_tokens_seen": 6595776, + "step": 5420 + }, + { + "epoch": 0.6041875487248023, + "grad_norm": 1.2160166501998901, + "learning_rate": 1.5101904443701973e-05, + "loss": 0.1971, + "num_input_tokens_seen": 6601856, + "step": 5425 + }, + { + "epoch": 0.6047444036084196, + "grad_norm": 3.0475194454193115, + "learning_rate": 1.5115825815792406e-05, + "loss": 0.4618, + "num_input_tokens_seen": 6607328, + "step": 5430 + }, + { + "epoch": 0.605301258492037, + "grad_norm": 4.129725933074951, + "learning_rate": 1.512974718788284e-05, + "loss": 0.294, + "num_input_tokens_seen": 6613344, + "step": 5435 + }, + { + "epoch": 0.6058581133756543, + "grad_norm": 1.3633759021759033, + "learning_rate": 1.5143668559973273e-05, + "loss": 0.1533, + "num_input_tokens_seen": 6619552, + "step": 5440 + }, + { + "epoch": 0.6064149682592717, + "grad_norm": 2.4415781497955322, + "learning_rate": 1.5157589932063703e-05, + "loss": 0.3468, + "num_input_tokens_seen": 6625024, + "step": 5445 + }, + { + "epoch": 0.606971823142889, + "grad_norm": 0.06648126989603043, + "learning_rate": 1.5171511304154138e-05, + "loss": 0.1379, + "num_input_tokens_seen": 6631232, + "step": 5450 + }, + { + "epoch": 0.6075286780265063, + "grad_norm": 1.126000165939331, + "learning_rate": 1.518543267624457e-05, + "loss": 0.2837, + "num_input_tokens_seen": 6637408, + "step": 5455 + }, + { + "epoch": 0.6080855329101236, + "grad_norm": 0.3511005640029907, + "learning_rate": 1.5199354048335005e-05, + "loss": 0.3118, + "num_input_tokens_seen": 6643552, + "step": 5460 + }, + { + "epoch": 0.6086423877937409, + "grad_norm": 0.6565269827842712, + "learning_rate": 1.5213275420425437e-05, + "loss": 0.2986, + "num_input_tokens_seen": 6649728, + "step": 5465 + }, + { + "epoch": 0.6091992426773583, + "grad_norm": 0.6957889795303345, + "learning_rate": 1.5227196792515872e-05, + "loss": 0.2747, + "num_input_tokens_seen": 6655680, + "step": 5470 + }, + { + "epoch": 0.6097560975609756, + "grad_norm": 1.3404513597488403, + "learning_rate": 1.5241118164606304e-05, + "loss": 0.2696, + "num_input_tokens_seen": 6661664, + "step": 5475 + }, + { + "epoch": 0.6103129524445929, + "grad_norm": 1.2083677053451538, + "learning_rate": 1.5255039536696739e-05, + "loss": 0.1454, + "num_input_tokens_seen": 6667968, + "step": 5480 + }, + { + "epoch": 0.6108698073282103, + "grad_norm": 0.5391320586204529, + "learning_rate": 1.526896090878717e-05, + "loss": 0.2302, + "num_input_tokens_seen": 6674016, + "step": 5485 + }, + { + "epoch": 0.6114266622118276, + "grad_norm": 1.058450698852539, + "learning_rate": 1.5282882280877602e-05, + "loss": 0.3052, + "num_input_tokens_seen": 6680192, + "step": 5490 + }, + { + "epoch": 0.611983517095445, + "grad_norm": 0.3314778506755829, + "learning_rate": 1.5296803652968037e-05, + "loss": 0.1958, + "num_input_tokens_seen": 6686784, + "step": 5495 + }, + { + "epoch": 0.6125403719790623, + "grad_norm": 0.159693643450737, + "learning_rate": 1.531072502505847e-05, + "loss": 0.1603, + "num_input_tokens_seen": 6692960, + "step": 5500 + }, + { + "epoch": 0.6130972268626795, + "grad_norm": 1.2129799127578735, + "learning_rate": 1.5324646397148902e-05, + "loss": 0.2884, + "num_input_tokens_seen": 6698784, + "step": 5505 + }, + { + "epoch": 0.6136540817462969, + "grad_norm": 0.998309314250946, + "learning_rate": 1.5338567769239336e-05, + "loss": 0.18, + "num_input_tokens_seen": 6705216, + "step": 5510 + }, + { + "epoch": 0.6142109366299142, + "grad_norm": 1.770436406135559, + "learning_rate": 1.535248914132977e-05, + "loss": 0.1954, + "num_input_tokens_seen": 6711360, + "step": 5515 + }, + { + "epoch": 0.6147677915135316, + "grad_norm": 2.0995852947235107, + "learning_rate": 1.5366410513420205e-05, + "loss": 0.1894, + "num_input_tokens_seen": 6717728, + "step": 5520 + }, + { + "epoch": 0.6153246463971489, + "grad_norm": 0.5941336750984192, + "learning_rate": 1.538033188551064e-05, + "loss": 0.2553, + "num_input_tokens_seen": 6724000, + "step": 5525 + }, + { + "epoch": 0.6158815012807662, + "grad_norm": 0.7406835556030273, + "learning_rate": 1.539425325760107e-05, + "loss": 0.2482, + "num_input_tokens_seen": 6730304, + "step": 5530 + }, + { + "epoch": 0.6164383561643836, + "grad_norm": 0.538866400718689, + "learning_rate": 1.54081746296915e-05, + "loss": 0.2096, + "num_input_tokens_seen": 6736192, + "step": 5535 + }, + { + "epoch": 0.6169952110480009, + "grad_norm": 2.8993852138519287, + "learning_rate": 1.5422096001781935e-05, + "loss": 0.4417, + "num_input_tokens_seen": 6741952, + "step": 5540 + }, + { + "epoch": 0.6175520659316183, + "grad_norm": 0.4322991967201233, + "learning_rate": 1.543601737387237e-05, + "loss": 0.2122, + "num_input_tokens_seen": 6748256, + "step": 5545 + }, + { + "epoch": 0.6181089208152355, + "grad_norm": 0.42991939187049866, + "learning_rate": 1.5449938745962804e-05, + "loss": 0.1944, + "num_input_tokens_seen": 6754336, + "step": 5550 + }, + { + "epoch": 0.6186657756988528, + "grad_norm": 0.7431523203849792, + "learning_rate": 1.5463860118053235e-05, + "loss": 0.2377, + "num_input_tokens_seen": 6760512, + "step": 5555 + }, + { + "epoch": 0.6192226305824702, + "grad_norm": 1.4438132047653198, + "learning_rate": 1.547778149014367e-05, + "loss": 0.2082, + "num_input_tokens_seen": 6766656, + "step": 5560 + }, + { + "epoch": 0.6197794854660875, + "grad_norm": 2.453258514404297, + "learning_rate": 1.5491702862234104e-05, + "loss": 0.2356, + "num_input_tokens_seen": 6773120, + "step": 5565 + }, + { + "epoch": 0.6203363403497049, + "grad_norm": 1.0896607637405396, + "learning_rate": 1.5505624234324538e-05, + "loss": 0.1889, + "num_input_tokens_seen": 6779392, + "step": 5570 + }, + { + "epoch": 0.6208931952333222, + "grad_norm": 1.22590970993042, + "learning_rate": 1.551954560641497e-05, + "loss": 0.2516, + "num_input_tokens_seen": 6785600, + "step": 5575 + }, + { + "epoch": 0.6214500501169395, + "grad_norm": 0.6333708167076111, + "learning_rate": 1.55334669785054e-05, + "loss": 0.0842, + "num_input_tokens_seen": 6791680, + "step": 5580 + }, + { + "epoch": 0.6220069050005569, + "grad_norm": 2.4714314937591553, + "learning_rate": 1.5547388350595834e-05, + "loss": 0.3593, + "num_input_tokens_seen": 6797664, + "step": 5585 + }, + { + "epoch": 0.6225637598841742, + "grad_norm": 0.922593355178833, + "learning_rate": 1.556130972268627e-05, + "loss": 0.2304, + "num_input_tokens_seen": 6803968, + "step": 5590 + }, + { + "epoch": 0.6231206147677915, + "grad_norm": 0.25847598910331726, + "learning_rate": 1.5575231094776703e-05, + "loss": 0.1724, + "num_input_tokens_seen": 6810144, + "step": 5595 + }, + { + "epoch": 0.6236774696514088, + "grad_norm": 0.24280527234077454, + "learning_rate": 1.5589152466867134e-05, + "loss": 0.3818, + "num_input_tokens_seen": 6816448, + "step": 5600 + }, + { + "epoch": 0.6242343245350261, + "grad_norm": 0.9604713320732117, + "learning_rate": 1.5603073838957568e-05, + "loss": 0.2663, + "num_input_tokens_seen": 6822432, + "step": 5605 + }, + { + "epoch": 0.6247911794186435, + "grad_norm": 1.1924649477005005, + "learning_rate": 1.5616995211048002e-05, + "loss": 0.2441, + "num_input_tokens_seen": 6827744, + "step": 5610 + }, + { + "epoch": 0.6253480343022608, + "grad_norm": 0.11503510177135468, + "learning_rate": 1.5630916583138437e-05, + "loss": 0.0956, + "num_input_tokens_seen": 6834080, + "step": 5615 + }, + { + "epoch": 0.6259048891858782, + "grad_norm": 0.30223262310028076, + "learning_rate": 1.5644837955228868e-05, + "loss": 0.1747, + "num_input_tokens_seen": 6839872, + "step": 5620 + }, + { + "epoch": 0.6264617440694955, + "grad_norm": 1.1658778190612793, + "learning_rate": 1.56587593273193e-05, + "loss": 0.2856, + "num_input_tokens_seen": 6845888, + "step": 5625 + }, + { + "epoch": 0.6270185989531128, + "grad_norm": 0.16959567368030548, + "learning_rate": 1.5672680699409733e-05, + "loss": 0.0992, + "num_input_tokens_seen": 6851648, + "step": 5630 + }, + { + "epoch": 0.6275754538367302, + "grad_norm": 0.7829920649528503, + "learning_rate": 1.5686602071500167e-05, + "loss": 0.1186, + "num_input_tokens_seen": 6857920, + "step": 5635 + }, + { + "epoch": 0.6281323087203475, + "grad_norm": 3.0702102184295654, + "learning_rate": 1.57005234435906e-05, + "loss": 0.315, + "num_input_tokens_seen": 6863840, + "step": 5640 + }, + { + "epoch": 0.6286891636039648, + "grad_norm": 0.31490153074264526, + "learning_rate": 1.5714444815681033e-05, + "loss": 0.1182, + "num_input_tokens_seen": 6870176, + "step": 5645 + }, + { + "epoch": 0.6292460184875821, + "grad_norm": 0.30647721886634827, + "learning_rate": 1.5728366187771467e-05, + "loss": 0.1392, + "num_input_tokens_seen": 6876192, + "step": 5650 + }, + { + "epoch": 0.6298028733711994, + "grad_norm": 0.9428191184997559, + "learning_rate": 1.57422875598619e-05, + "loss": 0.1955, + "num_input_tokens_seen": 6882336, + "step": 5655 + }, + { + "epoch": 0.6303597282548168, + "grad_norm": 1.3865736722946167, + "learning_rate": 1.5756208931952336e-05, + "loss": 0.1509, + "num_input_tokens_seen": 6888608, + "step": 5660 + }, + { + "epoch": 0.6309165831384341, + "grad_norm": 1.7450779676437378, + "learning_rate": 1.577013030404277e-05, + "loss": 0.1942, + "num_input_tokens_seen": 6894944, + "step": 5665 + }, + { + "epoch": 0.6314734380220515, + "grad_norm": 1.4893195629119873, + "learning_rate": 1.5784051676133197e-05, + "loss": 0.1793, + "num_input_tokens_seen": 6900960, + "step": 5670 + }, + { + "epoch": 0.6320302929056688, + "grad_norm": 0.14811141788959503, + "learning_rate": 1.5797973048223632e-05, + "loss": 0.1219, + "num_input_tokens_seen": 6906880, + "step": 5675 + }, + { + "epoch": 0.6325871477892862, + "grad_norm": 0.581623911857605, + "learning_rate": 1.5811894420314066e-05, + "loss": 0.2421, + "num_input_tokens_seen": 6913280, + "step": 5680 + }, + { + "epoch": 0.6331440026729035, + "grad_norm": 0.6517337560653687, + "learning_rate": 1.58258157924045e-05, + "loss": 0.1399, + "num_input_tokens_seen": 6919520, + "step": 5685 + }, + { + "epoch": 0.6337008575565207, + "grad_norm": 0.7928338646888733, + "learning_rate": 1.5839737164494935e-05, + "loss": 0.2319, + "num_input_tokens_seen": 6925728, + "step": 5690 + }, + { + "epoch": 0.6342577124401381, + "grad_norm": 0.2570113241672516, + "learning_rate": 1.5853658536585366e-05, + "loss": 0.1231, + "num_input_tokens_seen": 6931808, + "step": 5695 + }, + { + "epoch": 0.6348145673237554, + "grad_norm": 1.509216070175171, + "learning_rate": 1.58675799086758e-05, + "loss": 0.1776, + "num_input_tokens_seen": 6938048, + "step": 5700 + }, + { + "epoch": 0.6353714222073727, + "grad_norm": 2.3795838356018066, + "learning_rate": 1.5881501280766234e-05, + "loss": 0.2214, + "num_input_tokens_seen": 6943712, + "step": 5705 + }, + { + "epoch": 0.6359282770909901, + "grad_norm": 1.3313487768173218, + "learning_rate": 1.589542265285667e-05, + "loss": 0.1354, + "num_input_tokens_seen": 6949888, + "step": 5710 + }, + { + "epoch": 0.6364851319746074, + "grad_norm": 1.8142306804656982, + "learning_rate": 1.59093440249471e-05, + "loss": 0.2091, + "num_input_tokens_seen": 6956192, + "step": 5715 + }, + { + "epoch": 0.6370419868582248, + "grad_norm": 1.3707239627838135, + "learning_rate": 1.592326539703753e-05, + "loss": 0.244, + "num_input_tokens_seen": 6962624, + "step": 5720 + }, + { + "epoch": 0.6375988417418421, + "grad_norm": 2.432300329208374, + "learning_rate": 1.5937186769127965e-05, + "loss": 0.2377, + "num_input_tokens_seen": 6967904, + "step": 5725 + }, + { + "epoch": 0.6381556966254595, + "grad_norm": 0.19769534468650818, + "learning_rate": 1.59511081412184e-05, + "loss": 0.1178, + "num_input_tokens_seen": 6974080, + "step": 5730 + }, + { + "epoch": 0.6387125515090767, + "grad_norm": 0.22019845247268677, + "learning_rate": 1.5965029513308834e-05, + "loss": 0.1976, + "num_input_tokens_seen": 6980416, + "step": 5735 + }, + { + "epoch": 0.639269406392694, + "grad_norm": 0.9518765211105347, + "learning_rate": 1.5978950885399264e-05, + "loss": 0.1656, + "num_input_tokens_seen": 6986560, + "step": 5740 + }, + { + "epoch": 0.6398262612763114, + "grad_norm": 1.0297694206237793, + "learning_rate": 1.59928722574897e-05, + "loss": 0.1058, + "num_input_tokens_seen": 6992704, + "step": 5745 + }, + { + "epoch": 0.6403831161599287, + "grad_norm": 1.7364246845245361, + "learning_rate": 1.6006793629580133e-05, + "loss": 0.1538, + "num_input_tokens_seen": 6999296, + "step": 5750 + }, + { + "epoch": 0.640939971043546, + "grad_norm": 1.1002436876296997, + "learning_rate": 1.6020715001670567e-05, + "loss": 0.5618, + "num_input_tokens_seen": 7005440, + "step": 5755 + }, + { + "epoch": 0.6414968259271634, + "grad_norm": 1.4958922863006592, + "learning_rate": 1.6034636373761e-05, + "loss": 0.1539, + "num_input_tokens_seen": 7011616, + "step": 5760 + }, + { + "epoch": 0.6420536808107807, + "grad_norm": 1.0220431089401245, + "learning_rate": 1.604855774585143e-05, + "loss": 0.3539, + "num_input_tokens_seen": 7017184, + "step": 5765 + }, + { + "epoch": 0.6426105356943981, + "grad_norm": 0.7533059120178223, + "learning_rate": 1.6062479117941864e-05, + "loss": 0.3693, + "num_input_tokens_seen": 7023104, + "step": 5770 + }, + { + "epoch": 0.6431673905780154, + "grad_norm": 1.2902827262878418, + "learning_rate": 1.6076400490032298e-05, + "loss": 0.1275, + "num_input_tokens_seen": 7029248, + "step": 5775 + }, + { + "epoch": 0.6437242454616326, + "grad_norm": 0.3807288110256195, + "learning_rate": 1.6090321862122732e-05, + "loss": 0.1337, + "num_input_tokens_seen": 7035296, + "step": 5780 + }, + { + "epoch": 0.64428110034525, + "grad_norm": 1.1578989028930664, + "learning_rate": 1.6104243234213163e-05, + "loss": 0.2312, + "num_input_tokens_seen": 7041536, + "step": 5785 + }, + { + "epoch": 0.6448379552288673, + "grad_norm": 0.8587301969528198, + "learning_rate": 1.6118164606303598e-05, + "loss": 0.2187, + "num_input_tokens_seen": 7047584, + "step": 5790 + }, + { + "epoch": 0.6453948101124847, + "grad_norm": 0.6343014240264893, + "learning_rate": 1.6132085978394032e-05, + "loss": 0.0901, + "num_input_tokens_seen": 7053920, + "step": 5795 + }, + { + "epoch": 0.645951664996102, + "grad_norm": 0.8475937247276306, + "learning_rate": 1.6146007350484466e-05, + "loss": 0.1631, + "num_input_tokens_seen": 7060032, + "step": 5800 + }, + { + "epoch": 0.6465085198797194, + "grad_norm": 0.9072160124778748, + "learning_rate": 1.61599287225749e-05, + "loss": 0.1023, + "num_input_tokens_seen": 7066464, + "step": 5805 + }, + { + "epoch": 0.6470653747633367, + "grad_norm": 1.487064003944397, + "learning_rate": 1.617385009466533e-05, + "loss": 0.1825, + "num_input_tokens_seen": 7072480, + "step": 5810 + }, + { + "epoch": 0.647622229646954, + "grad_norm": 2.4773943424224854, + "learning_rate": 1.6187771466755762e-05, + "loss": 0.2745, + "num_input_tokens_seen": 7078432, + "step": 5815 + }, + { + "epoch": 0.6481790845305714, + "grad_norm": 1.668529748916626, + "learning_rate": 1.6201692838846197e-05, + "loss": 0.2596, + "num_input_tokens_seen": 7084928, + "step": 5820 + }, + { + "epoch": 0.6487359394141886, + "grad_norm": 1.5824775695800781, + "learning_rate": 1.621561421093663e-05, + "loss": 0.318, + "num_input_tokens_seen": 7090848, + "step": 5825 + }, + { + "epoch": 0.649292794297806, + "grad_norm": 0.81565922498703, + "learning_rate": 1.6229535583027065e-05, + "loss": 0.203, + "num_input_tokens_seen": 7096928, + "step": 5830 + }, + { + "epoch": 0.6498496491814233, + "grad_norm": 0.04896242171525955, + "learning_rate": 1.6243456955117496e-05, + "loss": 0.0226, + "num_input_tokens_seen": 7102976, + "step": 5835 + }, + { + "epoch": 0.6504065040650406, + "grad_norm": 1.3097009658813477, + "learning_rate": 1.625737832720793e-05, + "loss": 0.3038, + "num_input_tokens_seen": 7109408, + "step": 5840 + }, + { + "epoch": 0.650963358948658, + "grad_norm": 0.45901918411254883, + "learning_rate": 1.6271299699298365e-05, + "loss": 0.2686, + "num_input_tokens_seen": 7115328, + "step": 5845 + }, + { + "epoch": 0.6515202138322753, + "grad_norm": 0.8411445021629333, + "learning_rate": 1.62852210713888e-05, + "loss": 0.2144, + "num_input_tokens_seen": 7121536, + "step": 5850 + }, + { + "epoch": 0.6520770687158927, + "grad_norm": 3.0732665061950684, + "learning_rate": 1.629914244347923e-05, + "loss": 0.1984, + "num_input_tokens_seen": 7127616, + "step": 5855 + }, + { + "epoch": 0.65263392359951, + "grad_norm": 0.33750370144844055, + "learning_rate": 1.631306381556966e-05, + "loss": 0.1442, + "num_input_tokens_seen": 7133760, + "step": 5860 + }, + { + "epoch": 0.6531907784831273, + "grad_norm": 1.1648023128509521, + "learning_rate": 1.6326985187660096e-05, + "loss": 0.142, + "num_input_tokens_seen": 7139968, + "step": 5865 + }, + { + "epoch": 0.6537476333667446, + "grad_norm": 0.1782255917787552, + "learning_rate": 1.634090655975053e-05, + "loss": 0.1528, + "num_input_tokens_seen": 7146144, + "step": 5870 + }, + { + "epoch": 0.6543044882503619, + "grad_norm": 0.09999411553144455, + "learning_rate": 1.6354827931840964e-05, + "loss": 0.1006, + "num_input_tokens_seen": 7152384, + "step": 5875 + }, + { + "epoch": 0.6548613431339793, + "grad_norm": 1.5044472217559814, + "learning_rate": 1.6368749303931395e-05, + "loss": 0.2766, + "num_input_tokens_seen": 7158400, + "step": 5880 + }, + { + "epoch": 0.6554181980175966, + "grad_norm": 0.8288733959197998, + "learning_rate": 1.638267067602183e-05, + "loss": 0.2568, + "num_input_tokens_seen": 7164544, + "step": 5885 + }, + { + "epoch": 0.6559750529012139, + "grad_norm": 0.7528876662254333, + "learning_rate": 1.6396592048112264e-05, + "loss": 0.1677, + "num_input_tokens_seen": 7170848, + "step": 5890 + }, + { + "epoch": 0.6565319077848313, + "grad_norm": 2.1768548488616943, + "learning_rate": 1.6410513420202698e-05, + "loss": 0.2737, + "num_input_tokens_seen": 7176736, + "step": 5895 + }, + { + "epoch": 0.6570887626684486, + "grad_norm": 0.930819034576416, + "learning_rate": 1.642443479229313e-05, + "loss": 0.1713, + "num_input_tokens_seen": 7183040, + "step": 5900 + }, + { + "epoch": 0.657645617552066, + "grad_norm": 1.8633322715759277, + "learning_rate": 1.643835616438356e-05, + "loss": 0.2272, + "num_input_tokens_seen": 7189248, + "step": 5905 + }, + { + "epoch": 0.6582024724356833, + "grad_norm": 1.5913790464401245, + "learning_rate": 1.6452277536473994e-05, + "loss": 0.202, + "num_input_tokens_seen": 7195392, + "step": 5910 + }, + { + "epoch": 0.6587593273193005, + "grad_norm": 0.6622933149337769, + "learning_rate": 1.646619890856443e-05, + "loss": 0.1655, + "num_input_tokens_seen": 7201504, + "step": 5915 + }, + { + "epoch": 0.6593161822029179, + "grad_norm": 0.8970844149589539, + "learning_rate": 1.6480120280654863e-05, + "loss": 0.1656, + "num_input_tokens_seen": 7207904, + "step": 5920 + }, + { + "epoch": 0.6598730370865352, + "grad_norm": 0.12825226783752441, + "learning_rate": 1.6494041652745294e-05, + "loss": 0.0671, + "num_input_tokens_seen": 7214368, + "step": 5925 + }, + { + "epoch": 0.6604298919701526, + "grad_norm": 0.8209102749824524, + "learning_rate": 1.650796302483573e-05, + "loss": 0.1583, + "num_input_tokens_seen": 7220480, + "step": 5930 + }, + { + "epoch": 0.6609867468537699, + "grad_norm": 2.1362874507904053, + "learning_rate": 1.6521884396926163e-05, + "loss": 0.1904, + "num_input_tokens_seen": 7226688, + "step": 5935 + }, + { + "epoch": 0.6615436017373872, + "grad_norm": 2.168952226638794, + "learning_rate": 1.6535805769016597e-05, + "loss": 0.1747, + "num_input_tokens_seen": 7233152, + "step": 5940 + }, + { + "epoch": 0.6621004566210046, + "grad_norm": 1.1291509866714478, + "learning_rate": 1.6549727141107028e-05, + "loss": 0.2533, + "num_input_tokens_seen": 7239040, + "step": 5945 + }, + { + "epoch": 0.6626573115046219, + "grad_norm": 0.09910190850496292, + "learning_rate": 1.6563648513197462e-05, + "loss": 0.1423, + "num_input_tokens_seen": 7245280, + "step": 5950 + }, + { + "epoch": 0.6632141663882393, + "grad_norm": 1.3766664266586304, + "learning_rate": 1.6577569885287893e-05, + "loss": 0.1833, + "num_input_tokens_seen": 7251168, + "step": 5955 + }, + { + "epoch": 0.6637710212718566, + "grad_norm": 2.1590869426727295, + "learning_rate": 1.6591491257378328e-05, + "loss": 0.3361, + "num_input_tokens_seen": 7256608, + "step": 5960 + }, + { + "epoch": 0.6643278761554738, + "grad_norm": 0.8778948783874512, + "learning_rate": 1.6605412629468762e-05, + "loss": 0.222, + "num_input_tokens_seen": 7262624, + "step": 5965 + }, + { + "epoch": 0.6648847310390912, + "grad_norm": 1.0181576013565063, + "learning_rate": 1.6619334001559196e-05, + "loss": 0.1265, + "num_input_tokens_seen": 7268640, + "step": 5970 + }, + { + "epoch": 0.6654415859227085, + "grad_norm": 0.4750426411628723, + "learning_rate": 1.6633255373649627e-05, + "loss": 0.1087, + "num_input_tokens_seen": 7274976, + "step": 5975 + }, + { + "epoch": 0.6659984408063259, + "grad_norm": 0.44439664483070374, + "learning_rate": 1.664717674574006e-05, + "loss": 0.2288, + "num_input_tokens_seen": 7281216, + "step": 5980 + }, + { + "epoch": 0.6665552956899432, + "grad_norm": 0.8047117590904236, + "learning_rate": 1.6661098117830496e-05, + "loss": 0.2447, + "num_input_tokens_seen": 7287552, + "step": 5985 + }, + { + "epoch": 0.6671121505735605, + "grad_norm": 1.1920002698898315, + "learning_rate": 1.6675019489920927e-05, + "loss": 0.4444, + "num_input_tokens_seen": 7293792, + "step": 5990 + }, + { + "epoch": 0.6676690054571779, + "grad_norm": 0.8958007097244263, + "learning_rate": 1.668894086201136e-05, + "loss": 0.1082, + "num_input_tokens_seen": 7299936, + "step": 5995 + }, + { + "epoch": 0.6682258603407952, + "grad_norm": 0.3102874755859375, + "learning_rate": 1.6702862234101792e-05, + "loss": 0.1985, + "num_input_tokens_seen": 7306240, + "step": 6000 + }, + { + "epoch": 0.6687827152244126, + "grad_norm": 0.47674939036369324, + "learning_rate": 1.6716783606192226e-05, + "loss": 0.2618, + "num_input_tokens_seen": 7312320, + "step": 6005 + }, + { + "epoch": 0.6693395701080298, + "grad_norm": 1.7007803916931152, + "learning_rate": 1.673070497828266e-05, + "loss": 0.1995, + "num_input_tokens_seen": 7318784, + "step": 6010 + }, + { + "epoch": 0.6698964249916471, + "grad_norm": 1.790461540222168, + "learning_rate": 1.6744626350373095e-05, + "loss": 0.2266, + "num_input_tokens_seen": 7325312, + "step": 6015 + }, + { + "epoch": 0.6704532798752645, + "grad_norm": 0.763918936252594, + "learning_rate": 1.6758547722463526e-05, + "loss": 0.2731, + "num_input_tokens_seen": 7331328, + "step": 6020 + }, + { + "epoch": 0.6710101347588818, + "grad_norm": 1.3454958200454712, + "learning_rate": 1.677246909455396e-05, + "loss": 0.1844, + "num_input_tokens_seen": 7337216, + "step": 6025 + }, + { + "epoch": 0.6715669896424992, + "grad_norm": 0.7113502621650696, + "learning_rate": 1.6786390466644395e-05, + "loss": 0.1553, + "num_input_tokens_seen": 7343232, + "step": 6030 + }, + { + "epoch": 0.6721238445261165, + "grad_norm": 0.28293508291244507, + "learning_rate": 1.6800311838734825e-05, + "loss": 0.1973, + "num_input_tokens_seen": 7349568, + "step": 6035 + }, + { + "epoch": 0.6726806994097339, + "grad_norm": 0.6867997646331787, + "learning_rate": 1.681423321082526e-05, + "loss": 0.1229, + "num_input_tokens_seen": 7355776, + "step": 6040 + }, + { + "epoch": 0.6732375542933512, + "grad_norm": 1.4340745210647583, + "learning_rate": 1.682815458291569e-05, + "loss": 0.2731, + "num_input_tokens_seen": 7361920, + "step": 6045 + }, + { + "epoch": 0.6737944091769685, + "grad_norm": 1.9355542659759521, + "learning_rate": 1.6842075955006125e-05, + "loss": 0.4222, + "num_input_tokens_seen": 7368288, + "step": 6050 + }, + { + "epoch": 0.6743512640605858, + "grad_norm": 1.1858631372451782, + "learning_rate": 1.685599732709656e-05, + "loss": 0.2081, + "num_input_tokens_seen": 7374240, + "step": 6055 + }, + { + "epoch": 0.6749081189442031, + "grad_norm": 1.3198031187057495, + "learning_rate": 1.6869918699186994e-05, + "loss": 0.1422, + "num_input_tokens_seen": 7380352, + "step": 6060 + }, + { + "epoch": 0.6754649738278204, + "grad_norm": 1.3808411359786987, + "learning_rate": 1.6883840071277428e-05, + "loss": 0.212, + "num_input_tokens_seen": 7386592, + "step": 6065 + }, + { + "epoch": 0.6760218287114378, + "grad_norm": 2.2011942863464355, + "learning_rate": 1.689776144336786e-05, + "loss": 0.3724, + "num_input_tokens_seen": 7392704, + "step": 6070 + }, + { + "epoch": 0.6765786835950551, + "grad_norm": 0.8558923602104187, + "learning_rate": 1.6911682815458293e-05, + "loss": 0.1156, + "num_input_tokens_seen": 7398912, + "step": 6075 + }, + { + "epoch": 0.6771355384786725, + "grad_norm": 0.7563077807426453, + "learning_rate": 1.6925604187548724e-05, + "loss": 0.1853, + "num_input_tokens_seen": 7405056, + "step": 6080 + }, + { + "epoch": 0.6776923933622898, + "grad_norm": 0.30908024311065674, + "learning_rate": 1.693952555963916e-05, + "loss": 0.1621, + "num_input_tokens_seen": 7411200, + "step": 6085 + }, + { + "epoch": 0.6782492482459072, + "grad_norm": 0.5999177098274231, + "learning_rate": 1.6953446931729593e-05, + "loss": 0.235, + "num_input_tokens_seen": 7417152, + "step": 6090 + }, + { + "epoch": 0.6788061031295245, + "grad_norm": 0.9278191328048706, + "learning_rate": 1.6967368303820024e-05, + "loss": 0.1909, + "num_input_tokens_seen": 7423360, + "step": 6095 + }, + { + "epoch": 0.6793629580131417, + "grad_norm": 1.239612102508545, + "learning_rate": 1.6981289675910458e-05, + "loss": 0.3314, + "num_input_tokens_seen": 7429344, + "step": 6100 + }, + { + "epoch": 0.6799198128967591, + "grad_norm": 0.9208769202232361, + "learning_rate": 1.6995211048000893e-05, + "loss": 0.1341, + "num_input_tokens_seen": 7435616, + "step": 6105 + }, + { + "epoch": 0.6804766677803764, + "grad_norm": 1.3821154832839966, + "learning_rate": 1.7009132420091327e-05, + "loss": 0.2578, + "num_input_tokens_seen": 7441824, + "step": 6110 + }, + { + "epoch": 0.6810335226639938, + "grad_norm": 0.445444792509079, + "learning_rate": 1.7023053792181758e-05, + "loss": 0.1168, + "num_input_tokens_seen": 7447808, + "step": 6115 + }, + { + "epoch": 0.6815903775476111, + "grad_norm": 0.12826471030712128, + "learning_rate": 1.7036975164272192e-05, + "loss": 0.0923, + "num_input_tokens_seen": 7453632, + "step": 6120 + }, + { + "epoch": 0.6821472324312284, + "grad_norm": 1.0530794858932495, + "learning_rate": 1.7050896536362626e-05, + "loss": 0.1902, + "num_input_tokens_seen": 7459648, + "step": 6125 + }, + { + "epoch": 0.6827040873148458, + "grad_norm": 0.9324491620063782, + "learning_rate": 1.7064817908453057e-05, + "loss": 0.167, + "num_input_tokens_seen": 7465856, + "step": 6130 + }, + { + "epoch": 0.6832609421984631, + "grad_norm": 1.9635212421417236, + "learning_rate": 1.7078739280543492e-05, + "loss": 0.3628, + "num_input_tokens_seen": 7471680, + "step": 6135 + }, + { + "epoch": 0.6838177970820805, + "grad_norm": 0.4026939868927002, + "learning_rate": 1.7092660652633923e-05, + "loss": 0.1338, + "num_input_tokens_seen": 7477696, + "step": 6140 + }, + { + "epoch": 0.6843746519656977, + "grad_norm": 1.6240394115447998, + "learning_rate": 1.7106582024724357e-05, + "loss": 0.1121, + "num_input_tokens_seen": 7483776, + "step": 6145 + }, + { + "epoch": 0.684931506849315, + "grad_norm": 1.516653299331665, + "learning_rate": 1.712050339681479e-05, + "loss": 0.3635, + "num_input_tokens_seen": 7489856, + "step": 6150 + }, + { + "epoch": 0.6854883617329324, + "grad_norm": 1.2815085649490356, + "learning_rate": 1.7134424768905226e-05, + "loss": 0.2577, + "num_input_tokens_seen": 7495968, + "step": 6155 + }, + { + "epoch": 0.6860452166165497, + "grad_norm": 1.3835844993591309, + "learning_rate": 1.7148346140995657e-05, + "loss": 0.1995, + "num_input_tokens_seen": 7501536, + "step": 6160 + }, + { + "epoch": 0.686602071500167, + "grad_norm": 0.7373533844947815, + "learning_rate": 1.716226751308609e-05, + "loss": 0.1576, + "num_input_tokens_seen": 7507808, + "step": 6165 + }, + { + "epoch": 0.6871589263837844, + "grad_norm": 0.44439128041267395, + "learning_rate": 1.7176188885176525e-05, + "loss": 0.1215, + "num_input_tokens_seen": 7513920, + "step": 6170 + }, + { + "epoch": 0.6877157812674017, + "grad_norm": 0.8560948371887207, + "learning_rate": 1.7190110257266956e-05, + "loss": 0.4492, + "num_input_tokens_seen": 7519424, + "step": 6175 + }, + { + "epoch": 0.6882726361510191, + "grad_norm": 0.19184941053390503, + "learning_rate": 1.720403162935739e-05, + "loss": 0.1512, + "num_input_tokens_seen": 7525504, + "step": 6180 + }, + { + "epoch": 0.6888294910346364, + "grad_norm": 1.235478401184082, + "learning_rate": 1.721795300144782e-05, + "loss": 0.2054, + "num_input_tokens_seen": 7531680, + "step": 6185 + }, + { + "epoch": 0.6893863459182537, + "grad_norm": 0.3399341404438019, + "learning_rate": 1.7231874373538256e-05, + "loss": 0.1134, + "num_input_tokens_seen": 7537952, + "step": 6190 + }, + { + "epoch": 0.689943200801871, + "grad_norm": 0.4749133884906769, + "learning_rate": 1.724579574562869e-05, + "loss": 0.1219, + "num_input_tokens_seen": 7544000, + "step": 6195 + }, + { + "epoch": 0.6905000556854883, + "grad_norm": 1.3224565982818604, + "learning_rate": 1.7259717117719124e-05, + "loss": 0.2172, + "num_input_tokens_seen": 7550240, + "step": 6200 + }, + { + "epoch": 0.6910569105691057, + "grad_norm": 0.384635865688324, + "learning_rate": 1.727363848980956e-05, + "loss": 0.2832, + "num_input_tokens_seen": 7556896, + "step": 6205 + }, + { + "epoch": 0.691613765452723, + "grad_norm": 1.221282720565796, + "learning_rate": 1.728755986189999e-05, + "loss": 0.1866, + "num_input_tokens_seen": 7563040, + "step": 6210 + }, + { + "epoch": 0.6921706203363404, + "grad_norm": 0.16875329613685608, + "learning_rate": 1.7301481233990424e-05, + "loss": 0.0839, + "num_input_tokens_seen": 7569088, + "step": 6215 + }, + { + "epoch": 0.6927274752199577, + "grad_norm": 0.9138569831848145, + "learning_rate": 1.7315402606080855e-05, + "loss": 0.1153, + "num_input_tokens_seen": 7575392, + "step": 6220 + }, + { + "epoch": 0.693284330103575, + "grad_norm": 0.4136374592781067, + "learning_rate": 1.732932397817129e-05, + "loss": 0.1084, + "num_input_tokens_seen": 7581184, + "step": 6225 + }, + { + "epoch": 0.6938411849871924, + "grad_norm": 1.246899127960205, + "learning_rate": 1.7343245350261724e-05, + "loss": 0.2464, + "num_input_tokens_seen": 7587136, + "step": 6230 + }, + { + "epoch": 0.6943980398708096, + "grad_norm": 1.3048824071884155, + "learning_rate": 1.7357166722352155e-05, + "loss": 0.1825, + "num_input_tokens_seen": 7593184, + "step": 6235 + }, + { + "epoch": 0.694954894754427, + "grad_norm": 0.7663057446479797, + "learning_rate": 1.737108809444259e-05, + "loss": 0.1725, + "num_input_tokens_seen": 7599456, + "step": 6240 + }, + { + "epoch": 0.6955117496380443, + "grad_norm": 1.1953233480453491, + "learning_rate": 1.7385009466533023e-05, + "loss": 0.2601, + "num_input_tokens_seen": 7604736, + "step": 6245 + }, + { + "epoch": 0.6960686045216616, + "grad_norm": 1.3685349225997925, + "learning_rate": 1.7398930838623458e-05, + "loss": 0.1817, + "num_input_tokens_seen": 7611104, + "step": 6250 + }, + { + "epoch": 0.696625459405279, + "grad_norm": 0.37422215938568115, + "learning_rate": 1.741285221071389e-05, + "loss": 0.1258, + "num_input_tokens_seen": 7617664, + "step": 6255 + }, + { + "epoch": 0.6971823142888963, + "grad_norm": 1.6521737575531006, + "learning_rate": 1.7426773582804323e-05, + "loss": 0.2446, + "num_input_tokens_seen": 7624160, + "step": 6260 + }, + { + "epoch": 0.6977391691725137, + "grad_norm": 1.5090856552124023, + "learning_rate": 1.7440694954894754e-05, + "loss": 0.2722, + "num_input_tokens_seen": 7630240, + "step": 6265 + }, + { + "epoch": 0.698296024056131, + "grad_norm": 2.9808900356292725, + "learning_rate": 1.7454616326985188e-05, + "loss": 0.2891, + "num_input_tokens_seen": 7636544, + "step": 6270 + }, + { + "epoch": 0.6988528789397483, + "grad_norm": 0.5632281303405762, + "learning_rate": 1.7468537699075622e-05, + "loss": 0.174, + "num_input_tokens_seen": 7643040, + "step": 6275 + }, + { + "epoch": 0.6994097338233656, + "grad_norm": 0.6811168193817139, + "learning_rate": 1.7482459071166053e-05, + "loss": 0.2243, + "num_input_tokens_seen": 7649344, + "step": 6280 + }, + { + "epoch": 0.6999665887069829, + "grad_norm": 1.9579551219940186, + "learning_rate": 1.7496380443256488e-05, + "loss": 0.2318, + "num_input_tokens_seen": 7655584, + "step": 6285 + }, + { + "epoch": 0.7005234435906003, + "grad_norm": 1.827696442604065, + "learning_rate": 1.7510301815346922e-05, + "loss": 0.2505, + "num_input_tokens_seen": 7661920, + "step": 6290 + }, + { + "epoch": 0.7010802984742176, + "grad_norm": 1.0036017894744873, + "learning_rate": 1.7524223187437356e-05, + "loss": 0.3568, + "num_input_tokens_seen": 7668000, + "step": 6295 + }, + { + "epoch": 0.7016371533578349, + "grad_norm": 0.2521788775920868, + "learning_rate": 1.7538144559527787e-05, + "loss": 0.1609, + "num_input_tokens_seen": 7673792, + "step": 6300 + }, + { + "epoch": 0.7021940082414523, + "grad_norm": 1.3361161947250366, + "learning_rate": 1.755206593161822e-05, + "loss": 0.2697, + "num_input_tokens_seen": 7679744, + "step": 6305 + }, + { + "epoch": 0.7027508631250696, + "grad_norm": 0.849789559841156, + "learning_rate": 1.7565987303708653e-05, + "loss": 0.2468, + "num_input_tokens_seen": 7685792, + "step": 6310 + }, + { + "epoch": 0.703307718008687, + "grad_norm": 1.212764859199524, + "learning_rate": 1.7579908675799087e-05, + "loss": 0.2369, + "num_input_tokens_seen": 7692032, + "step": 6315 + }, + { + "epoch": 0.7038645728923043, + "grad_norm": 0.3246811628341675, + "learning_rate": 1.759383004788952e-05, + "loss": 0.1116, + "num_input_tokens_seen": 7697792, + "step": 6320 + }, + { + "epoch": 0.7044214277759216, + "grad_norm": 0.21226833760738373, + "learning_rate": 1.7607751419979952e-05, + "loss": 0.1153, + "num_input_tokens_seen": 7704160, + "step": 6325 + }, + { + "epoch": 0.7049782826595389, + "grad_norm": 1.6181837320327759, + "learning_rate": 1.7621672792070387e-05, + "loss": 0.2803, + "num_input_tokens_seen": 7710304, + "step": 6330 + }, + { + "epoch": 0.7055351375431562, + "grad_norm": 0.7306963801383972, + "learning_rate": 1.763559416416082e-05, + "loss": 0.1766, + "num_input_tokens_seen": 7716320, + "step": 6335 + }, + { + "epoch": 0.7060919924267736, + "grad_norm": 1.487632155418396, + "learning_rate": 1.7649515536251255e-05, + "loss": 0.1344, + "num_input_tokens_seen": 7722592, + "step": 6340 + }, + { + "epoch": 0.7066488473103909, + "grad_norm": 2.064737558364868, + "learning_rate": 1.766343690834169e-05, + "loss": 0.225, + "num_input_tokens_seen": 7728832, + "step": 6345 + }, + { + "epoch": 0.7072057021940082, + "grad_norm": 1.0300028324127197, + "learning_rate": 1.767735828043212e-05, + "loss": 0.1358, + "num_input_tokens_seen": 7735136, + "step": 6350 + }, + { + "epoch": 0.7077625570776256, + "grad_norm": 1.538465976715088, + "learning_rate": 1.769127965252255e-05, + "loss": 0.0742, + "num_input_tokens_seen": 7741472, + "step": 6355 + }, + { + "epoch": 0.7083194119612429, + "grad_norm": 0.7034310698509216, + "learning_rate": 1.7705201024612986e-05, + "loss": 0.0774, + "num_input_tokens_seen": 7747680, + "step": 6360 + }, + { + "epoch": 0.7088762668448603, + "grad_norm": 0.5202513933181763, + "learning_rate": 1.771912239670342e-05, + "loss": 0.15, + "num_input_tokens_seen": 7753856, + "step": 6365 + }, + { + "epoch": 0.7094331217284776, + "grad_norm": 1.5208570957183838, + "learning_rate": 1.7733043768793854e-05, + "loss": 0.2496, + "num_input_tokens_seen": 7759872, + "step": 6370 + }, + { + "epoch": 0.7099899766120948, + "grad_norm": 0.5856565833091736, + "learning_rate": 1.7746965140884285e-05, + "loss": 0.384, + "num_input_tokens_seen": 7765920, + "step": 6375 + }, + { + "epoch": 0.7105468314957122, + "grad_norm": 0.33216819167137146, + "learning_rate": 1.776088651297472e-05, + "loss": 0.2062, + "num_input_tokens_seen": 7772096, + "step": 6380 + }, + { + "epoch": 0.7111036863793295, + "grad_norm": 0.6605873703956604, + "learning_rate": 1.7774807885065154e-05, + "loss": 0.325, + "num_input_tokens_seen": 7778112, + "step": 6385 + }, + { + "epoch": 0.7116605412629469, + "grad_norm": 0.8035803437232971, + "learning_rate": 1.7788729257155588e-05, + "loss": 0.2671, + "num_input_tokens_seen": 7784128, + "step": 6390 + }, + { + "epoch": 0.7122173961465642, + "grad_norm": 0.2134469449520111, + "learning_rate": 1.780265062924602e-05, + "loss": 0.2494, + "num_input_tokens_seen": 7789888, + "step": 6395 + }, + { + "epoch": 0.7127742510301815, + "grad_norm": 1.05836021900177, + "learning_rate": 1.781657200133645e-05, + "loss": 0.1723, + "num_input_tokens_seen": 7795968, + "step": 6400 + }, + { + "epoch": 0.7133311059137989, + "grad_norm": 1.1503450870513916, + "learning_rate": 1.7830493373426884e-05, + "loss": 0.3804, + "num_input_tokens_seen": 7801920, + "step": 6405 + }, + { + "epoch": 0.7138879607974162, + "grad_norm": 0.2765839099884033, + "learning_rate": 1.784441474551732e-05, + "loss": 0.2118, + "num_input_tokens_seen": 7808320, + "step": 6410 + }, + { + "epoch": 0.7144448156810336, + "grad_norm": 0.9356546998023987, + "learning_rate": 1.7858336117607753e-05, + "loss": 0.2461, + "num_input_tokens_seen": 7814656, + "step": 6415 + }, + { + "epoch": 0.7150016705646508, + "grad_norm": 0.5861429572105408, + "learning_rate": 1.7872257489698184e-05, + "loss": 0.209, + "num_input_tokens_seen": 7821024, + "step": 6420 + }, + { + "epoch": 0.7155585254482681, + "grad_norm": 0.8244715929031372, + "learning_rate": 1.788617886178862e-05, + "loss": 0.0877, + "num_input_tokens_seen": 7827296, + "step": 6425 + }, + { + "epoch": 0.7161153803318855, + "grad_norm": 0.8772768378257751, + "learning_rate": 1.7900100233879053e-05, + "loss": 0.1941, + "num_input_tokens_seen": 7833856, + "step": 6430 + }, + { + "epoch": 0.7166722352155028, + "grad_norm": 0.8708745837211609, + "learning_rate": 1.7914021605969487e-05, + "loss": 0.2169, + "num_input_tokens_seen": 7840128, + "step": 6435 + }, + { + "epoch": 0.7172290900991202, + "grad_norm": 0.999735414981842, + "learning_rate": 1.7927942978059918e-05, + "loss": 0.0964, + "num_input_tokens_seen": 7846208, + "step": 6440 + }, + { + "epoch": 0.7177859449827375, + "grad_norm": 1.5478636026382446, + "learning_rate": 1.7941864350150352e-05, + "loss": 0.3389, + "num_input_tokens_seen": 7852160, + "step": 6445 + }, + { + "epoch": 0.7183427998663549, + "grad_norm": 0.8200066685676575, + "learning_rate": 1.7955785722240783e-05, + "loss": 0.1043, + "num_input_tokens_seen": 7858464, + "step": 6450 + }, + { + "epoch": 0.7188996547499722, + "grad_norm": 0.26005280017852783, + "learning_rate": 1.7969707094331218e-05, + "loss": 0.1962, + "num_input_tokens_seen": 7864640, + "step": 6455 + }, + { + "epoch": 0.7194565096335895, + "grad_norm": 1.2391184568405151, + "learning_rate": 1.7983628466421652e-05, + "loss": 0.2932, + "num_input_tokens_seen": 7871232, + "step": 6460 + }, + { + "epoch": 0.7200133645172068, + "grad_norm": 0.21594728529453278, + "learning_rate": 1.7997549838512083e-05, + "loss": 0.1306, + "num_input_tokens_seen": 7877312, + "step": 6465 + }, + { + "epoch": 0.7205702194008241, + "grad_norm": 1.4558461904525757, + "learning_rate": 1.8011471210602517e-05, + "loss": 0.2154, + "num_input_tokens_seen": 7883488, + "step": 6470 + }, + { + "epoch": 0.7211270742844414, + "grad_norm": 1.0612704753875732, + "learning_rate": 1.802539258269295e-05, + "loss": 0.2824, + "num_input_tokens_seen": 7889760, + "step": 6475 + }, + { + "epoch": 0.7216839291680588, + "grad_norm": 0.24836936593055725, + "learning_rate": 1.8039313954783386e-05, + "loss": 0.1607, + "num_input_tokens_seen": 7895744, + "step": 6480 + }, + { + "epoch": 0.7222407840516761, + "grad_norm": 0.10989200323820114, + "learning_rate": 1.805323532687382e-05, + "loss": 0.1803, + "num_input_tokens_seen": 7902144, + "step": 6485 + }, + { + "epoch": 0.7227976389352935, + "grad_norm": 1.377251148223877, + "learning_rate": 1.806715669896425e-05, + "loss": 0.1298, + "num_input_tokens_seen": 7908608, + "step": 6490 + }, + { + "epoch": 0.7233544938189108, + "grad_norm": 0.20700141787528992, + "learning_rate": 1.8081078071054682e-05, + "loss": 0.0694, + "num_input_tokens_seen": 7914752, + "step": 6495 + }, + { + "epoch": 0.7239113487025282, + "grad_norm": 0.4354921877384186, + "learning_rate": 1.8094999443145116e-05, + "loss": 0.1319, + "num_input_tokens_seen": 7920576, + "step": 6500 + }, + { + "epoch": 0.7244682035861455, + "grad_norm": 0.7009885907173157, + "learning_rate": 1.810892081523555e-05, + "loss": 0.2307, + "num_input_tokens_seen": 7926560, + "step": 6505 + }, + { + "epoch": 0.7250250584697627, + "grad_norm": 0.8960058093070984, + "learning_rate": 1.8122842187325985e-05, + "loss": 0.1174, + "num_input_tokens_seen": 7932416, + "step": 6510 + }, + { + "epoch": 0.7255819133533801, + "grad_norm": 0.3146961033344269, + "learning_rate": 1.8136763559416416e-05, + "loss": 0.1885, + "num_input_tokens_seen": 7938656, + "step": 6515 + }, + { + "epoch": 0.7261387682369974, + "grad_norm": 0.3543579578399658, + "learning_rate": 1.815068493150685e-05, + "loss": 0.1502, + "num_input_tokens_seen": 7944768, + "step": 6520 + }, + { + "epoch": 0.7266956231206148, + "grad_norm": 1.5624371767044067, + "learning_rate": 1.8164606303597285e-05, + "loss": 0.3084, + "num_input_tokens_seen": 7950592, + "step": 6525 + }, + { + "epoch": 0.7272524780042321, + "grad_norm": 1.3219311237335205, + "learning_rate": 1.817852767568772e-05, + "loss": 0.1637, + "num_input_tokens_seen": 7956736, + "step": 6530 + }, + { + "epoch": 0.7278093328878494, + "grad_norm": 1.0802485942840576, + "learning_rate": 1.819244904777815e-05, + "loss": 0.1523, + "num_input_tokens_seen": 7962944, + "step": 6535 + }, + { + "epoch": 0.7283661877714668, + "grad_norm": 1.6879311800003052, + "learning_rate": 1.820637041986858e-05, + "loss": 0.1837, + "num_input_tokens_seen": 7968864, + "step": 6540 + }, + { + "epoch": 0.7289230426550841, + "grad_norm": 0.9341915249824524, + "learning_rate": 1.8220291791959015e-05, + "loss": 0.3234, + "num_input_tokens_seen": 7975072, + "step": 6545 + }, + { + "epoch": 0.7294798975387015, + "grad_norm": 1.8456443548202515, + "learning_rate": 1.823421316404945e-05, + "loss": 0.3038, + "num_input_tokens_seen": 7981536, + "step": 6550 + }, + { + "epoch": 0.7300367524223187, + "grad_norm": 0.9561784267425537, + "learning_rate": 1.8248134536139884e-05, + "loss": 0.1867, + "num_input_tokens_seen": 7987840, + "step": 6555 + }, + { + "epoch": 0.730593607305936, + "grad_norm": 1.0116533041000366, + "learning_rate": 1.8262055908230315e-05, + "loss": 0.1753, + "num_input_tokens_seen": 7994112, + "step": 6560 + }, + { + "epoch": 0.7311504621895534, + "grad_norm": 0.7982893586158752, + "learning_rate": 1.827597728032075e-05, + "loss": 0.2755, + "num_input_tokens_seen": 8000384, + "step": 6565 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 0.6185986399650574, + "learning_rate": 1.8289898652411183e-05, + "loss": 0.1216, + "num_input_tokens_seen": 8006656, + "step": 6570 + }, + { + "epoch": 0.7322641719567881, + "grad_norm": 0.6774430274963379, + "learning_rate": 1.8303820024501618e-05, + "loss": 0.0486, + "num_input_tokens_seen": 8012864, + "step": 6575 + }, + { + "epoch": 0.7328210268404054, + "grad_norm": 0.8410897254943848, + "learning_rate": 1.831774139659205e-05, + "loss": 0.1939, + "num_input_tokens_seen": 8018880, + "step": 6580 + }, + { + "epoch": 0.7333778817240227, + "grad_norm": 0.9531610012054443, + "learning_rate": 1.833166276868248e-05, + "loss": 0.115, + "num_input_tokens_seen": 8025024, + "step": 6585 + }, + { + "epoch": 0.7339347366076401, + "grad_norm": 0.5827532410621643, + "learning_rate": 1.8345584140772914e-05, + "loss": 0.1191, + "num_input_tokens_seen": 8031072, + "step": 6590 + }, + { + "epoch": 0.7344915914912574, + "grad_norm": 1.7516133785247803, + "learning_rate": 1.835950551286335e-05, + "loss": 0.2371, + "num_input_tokens_seen": 8037184, + "step": 6595 + }, + { + "epoch": 0.7350484463748747, + "grad_norm": 1.4539709091186523, + "learning_rate": 1.8373426884953783e-05, + "loss": 0.3163, + "num_input_tokens_seen": 8043360, + "step": 6600 + }, + { + "epoch": 0.735605301258492, + "grad_norm": 1.0444490909576416, + "learning_rate": 1.8387348257044214e-05, + "loss": 0.111, + "num_input_tokens_seen": 8049440, + "step": 6605 + }, + { + "epoch": 0.7361621561421093, + "grad_norm": 0.5754314661026001, + "learning_rate": 1.8401269629134648e-05, + "loss": 0.3732, + "num_input_tokens_seen": 8055392, + "step": 6610 + }, + { + "epoch": 0.7367190110257267, + "grad_norm": 1.2720741033554077, + "learning_rate": 1.8415191001225082e-05, + "loss": 0.209, + "num_input_tokens_seen": 8061376, + "step": 6615 + }, + { + "epoch": 0.737275865909344, + "grad_norm": 0.7648025155067444, + "learning_rate": 1.8429112373315517e-05, + "loss": 0.1918, + "num_input_tokens_seen": 8067744, + "step": 6620 + }, + { + "epoch": 0.7378327207929614, + "grad_norm": 0.8849483132362366, + "learning_rate": 1.844303374540595e-05, + "loss": 0.1112, + "num_input_tokens_seen": 8073696, + "step": 6625 + }, + { + "epoch": 0.7383895756765787, + "grad_norm": 0.402527391910553, + "learning_rate": 1.845695511749638e-05, + "loss": 0.2386, + "num_input_tokens_seen": 8079136, + "step": 6630 + }, + { + "epoch": 0.738946430560196, + "grad_norm": 1.2996392250061035, + "learning_rate": 1.8470876489586813e-05, + "loss": 0.4441, + "num_input_tokens_seen": 8085344, + "step": 6635 + }, + { + "epoch": 0.7395032854438134, + "grad_norm": 1.2841790914535522, + "learning_rate": 1.8484797861677247e-05, + "loss": 0.0804, + "num_input_tokens_seen": 8091520, + "step": 6640 + }, + { + "epoch": 0.7400601403274307, + "grad_norm": 1.3003084659576416, + "learning_rate": 1.849871923376768e-05, + "loss": 0.2958, + "num_input_tokens_seen": 8097600, + "step": 6645 + }, + { + "epoch": 0.740616995211048, + "grad_norm": 0.9076826572418213, + "learning_rate": 1.8512640605858116e-05, + "loss": 0.148, + "num_input_tokens_seen": 8103968, + "step": 6650 + }, + { + "epoch": 0.7411738500946653, + "grad_norm": 1.038632869720459, + "learning_rate": 1.8526561977948547e-05, + "loss": 0.2777, + "num_input_tokens_seen": 8109664, + "step": 6655 + }, + { + "epoch": 0.7417307049782826, + "grad_norm": 0.3678628206253052, + "learning_rate": 1.854048335003898e-05, + "loss": 0.2114, + "num_input_tokens_seen": 8115552, + "step": 6660 + }, + { + "epoch": 0.7422875598619, + "grad_norm": 0.29279690980911255, + "learning_rate": 1.8554404722129415e-05, + "loss": 0.1688, + "num_input_tokens_seen": 8121504, + "step": 6665 + }, + { + "epoch": 0.7428444147455173, + "grad_norm": 1.0359314680099487, + "learning_rate": 1.856832609421985e-05, + "loss": 0.0761, + "num_input_tokens_seen": 8127776, + "step": 6670 + }, + { + "epoch": 0.7434012696291347, + "grad_norm": 2.218797445297241, + "learning_rate": 1.858224746631028e-05, + "loss": 0.2317, + "num_input_tokens_seen": 8133952, + "step": 6675 + }, + { + "epoch": 0.743958124512752, + "grad_norm": 1.2621989250183105, + "learning_rate": 1.859616883840071e-05, + "loss": 0.2279, + "num_input_tokens_seen": 8140416, + "step": 6680 + }, + { + "epoch": 0.7445149793963693, + "grad_norm": 2.1364386081695557, + "learning_rate": 1.8610090210491146e-05, + "loss": 0.1504, + "num_input_tokens_seen": 8146688, + "step": 6685 + }, + { + "epoch": 0.7450718342799867, + "grad_norm": 0.5143260955810547, + "learning_rate": 1.862401158258158e-05, + "loss": 0.1498, + "num_input_tokens_seen": 8152160, + "step": 6690 + }, + { + "epoch": 0.7456286891636039, + "grad_norm": 0.3909805715084076, + "learning_rate": 1.8637932954672015e-05, + "loss": 0.1434, + "num_input_tokens_seen": 8158176, + "step": 6695 + }, + { + "epoch": 0.7461855440472213, + "grad_norm": 3.267704963684082, + "learning_rate": 1.8651854326762445e-05, + "loss": 0.3427, + "num_input_tokens_seen": 8164512, + "step": 6700 + }, + { + "epoch": 0.7467423989308386, + "grad_norm": 2.6402218341827393, + "learning_rate": 1.866577569885288e-05, + "loss": 0.2609, + "num_input_tokens_seen": 8170720, + "step": 6705 + }, + { + "epoch": 0.7472992538144559, + "grad_norm": 0.3998452425003052, + "learning_rate": 1.8679697070943314e-05, + "loss": 0.2468, + "num_input_tokens_seen": 8176672, + "step": 6710 + }, + { + "epoch": 0.7478561086980733, + "grad_norm": 1.0670967102050781, + "learning_rate": 1.869361844303375e-05, + "loss": 0.2891, + "num_input_tokens_seen": 8183168, + "step": 6715 + }, + { + "epoch": 0.7484129635816906, + "grad_norm": 1.3817639350891113, + "learning_rate": 1.870753981512418e-05, + "loss": 0.2447, + "num_input_tokens_seen": 8189024, + "step": 6720 + }, + { + "epoch": 0.748969818465308, + "grad_norm": 0.9514822959899902, + "learning_rate": 1.872146118721461e-05, + "loss": 0.3662, + "num_input_tokens_seen": 8195264, + "step": 6725 + }, + { + "epoch": 0.7495266733489253, + "grad_norm": 1.868791103363037, + "learning_rate": 1.8735382559305045e-05, + "loss": 0.1134, + "num_input_tokens_seen": 8201472, + "step": 6730 + }, + { + "epoch": 0.7500835282325427, + "grad_norm": 0.5062668323516846, + "learning_rate": 1.874930393139548e-05, + "loss": 0.1643, + "num_input_tokens_seen": 8207520, + "step": 6735 + }, + { + "epoch": 0.7506403831161599, + "grad_norm": 0.4950350224971771, + "learning_rate": 1.8763225303485913e-05, + "loss": 0.2786, + "num_input_tokens_seen": 8213440, + "step": 6740 + }, + { + "epoch": 0.7511972379997772, + "grad_norm": 0.8543272614479065, + "learning_rate": 1.8777146675576344e-05, + "loss": 0.1887, + "num_input_tokens_seen": 8219520, + "step": 6745 + }, + { + "epoch": 0.7517540928833946, + "grad_norm": 2.8302297592163086, + "learning_rate": 1.879106804766678e-05, + "loss": 0.242, + "num_input_tokens_seen": 8225600, + "step": 6750 + }, + { + "epoch": 0.7523109477670119, + "grad_norm": 1.0109108686447144, + "learning_rate": 1.8804989419757213e-05, + "loss": 0.1691, + "num_input_tokens_seen": 8232000, + "step": 6755 + }, + { + "epoch": 0.7528678026506292, + "grad_norm": 0.5666096210479736, + "learning_rate": 1.8818910791847647e-05, + "loss": 0.2117, + "num_input_tokens_seen": 8238208, + "step": 6760 + }, + { + "epoch": 0.7534246575342466, + "grad_norm": 0.6281962394714355, + "learning_rate": 1.8832832163938078e-05, + "loss": 0.2135, + "num_input_tokens_seen": 8243840, + "step": 6765 + }, + { + "epoch": 0.7539815124178639, + "grad_norm": 0.7136801481246948, + "learning_rate": 1.884675353602851e-05, + "loss": 0.2572, + "num_input_tokens_seen": 8250144, + "step": 6770 + }, + { + "epoch": 0.7545383673014813, + "grad_norm": 2.0754852294921875, + "learning_rate": 1.8860674908118943e-05, + "loss": 0.2518, + "num_input_tokens_seen": 8256320, + "step": 6775 + }, + { + "epoch": 0.7550952221850986, + "grad_norm": 1.2927088737487793, + "learning_rate": 1.8874596280209378e-05, + "loss": 0.1617, + "num_input_tokens_seen": 8262464, + "step": 6780 + }, + { + "epoch": 0.7556520770687158, + "grad_norm": 1.9026962518692017, + "learning_rate": 1.8888517652299812e-05, + "loss": 0.2781, + "num_input_tokens_seen": 8268672, + "step": 6785 + }, + { + "epoch": 0.7562089319523332, + "grad_norm": 0.7844789028167725, + "learning_rate": 1.8902439024390246e-05, + "loss": 0.2987, + "num_input_tokens_seen": 8274528, + "step": 6790 + }, + { + "epoch": 0.7567657868359505, + "grad_norm": 1.006653070449829, + "learning_rate": 1.8916360396480677e-05, + "loss": 0.2154, + "num_input_tokens_seen": 8280736, + "step": 6795 + }, + { + "epoch": 0.7573226417195679, + "grad_norm": 1.6713330745697021, + "learning_rate": 1.8930281768571112e-05, + "loss": 0.2794, + "num_input_tokens_seen": 8286368, + "step": 6800 + }, + { + "epoch": 0.7578794966031852, + "grad_norm": 0.870313286781311, + "learning_rate": 1.8944203140661546e-05, + "loss": 0.1468, + "num_input_tokens_seen": 8292480, + "step": 6805 + }, + { + "epoch": 0.7584363514868026, + "grad_norm": 0.5892230868339539, + "learning_rate": 1.895812451275198e-05, + "loss": 0.16, + "num_input_tokens_seen": 8298528, + "step": 6810 + }, + { + "epoch": 0.7589932063704199, + "grad_norm": 1.2716058492660522, + "learning_rate": 1.897204588484241e-05, + "loss": 0.1357, + "num_input_tokens_seen": 8304416, + "step": 6815 + }, + { + "epoch": 0.7595500612540372, + "grad_norm": 1.423291802406311, + "learning_rate": 1.8985967256932842e-05, + "loss": 0.1165, + "num_input_tokens_seen": 8310432, + "step": 6820 + }, + { + "epoch": 0.7601069161376546, + "grad_norm": 2.030465841293335, + "learning_rate": 1.8999888629023277e-05, + "loss": 0.2769, + "num_input_tokens_seen": 8316096, + "step": 6825 + }, + { + "epoch": 0.7606637710212718, + "grad_norm": 1.4794186353683472, + "learning_rate": 1.901381000111371e-05, + "loss": 0.2653, + "num_input_tokens_seen": 8322432, + "step": 6830 + }, + { + "epoch": 0.7612206259048891, + "grad_norm": 0.6034063696861267, + "learning_rate": 1.9027731373204145e-05, + "loss": 0.0979, + "num_input_tokens_seen": 8328640, + "step": 6835 + }, + { + "epoch": 0.7617774807885065, + "grad_norm": 0.16761310398578644, + "learning_rate": 1.9041652745294576e-05, + "loss": 0.1586, + "num_input_tokens_seen": 8334752, + "step": 6840 + }, + { + "epoch": 0.7623343356721238, + "grad_norm": 1.1386466026306152, + "learning_rate": 1.905557411738501e-05, + "loss": 0.2888, + "num_input_tokens_seen": 8340800, + "step": 6845 + }, + { + "epoch": 0.7628911905557412, + "grad_norm": 3.333989381790161, + "learning_rate": 1.9069495489475445e-05, + "loss": 0.1739, + "num_input_tokens_seen": 8346464, + "step": 6850 + }, + { + "epoch": 0.7634480454393585, + "grad_norm": 0.7898080348968506, + "learning_rate": 1.908341686156588e-05, + "loss": 0.1396, + "num_input_tokens_seen": 8352224, + "step": 6855 + }, + { + "epoch": 0.7640049003229759, + "grad_norm": 0.5529003739356995, + "learning_rate": 1.909733823365631e-05, + "loss": 0.2171, + "num_input_tokens_seen": 8358304, + "step": 6860 + }, + { + "epoch": 0.7645617552065932, + "grad_norm": 1.0846084356307983, + "learning_rate": 1.911125960574674e-05, + "loss": 0.162, + "num_input_tokens_seen": 8364384, + "step": 6865 + }, + { + "epoch": 0.7651186100902105, + "grad_norm": 1.483453392982483, + "learning_rate": 1.9125180977837175e-05, + "loss": 0.2074, + "num_input_tokens_seen": 8370112, + "step": 6870 + }, + { + "epoch": 0.7656754649738278, + "grad_norm": 0.8829191327095032, + "learning_rate": 1.913910234992761e-05, + "loss": 0.2524, + "num_input_tokens_seen": 8376384, + "step": 6875 + }, + { + "epoch": 0.7662323198574451, + "grad_norm": 3.000873327255249, + "learning_rate": 1.9153023722018044e-05, + "loss": 0.2413, + "num_input_tokens_seen": 8382304, + "step": 6880 + }, + { + "epoch": 0.7667891747410625, + "grad_norm": 0.2816523611545563, + "learning_rate": 1.9166945094108475e-05, + "loss": 0.0735, + "num_input_tokens_seen": 8388448, + "step": 6885 + }, + { + "epoch": 0.7673460296246798, + "grad_norm": 1.5387412309646606, + "learning_rate": 1.918086646619891e-05, + "loss": 0.3039, + "num_input_tokens_seen": 8394560, + "step": 6890 + }, + { + "epoch": 0.7679028845082971, + "grad_norm": 2.2413744926452637, + "learning_rate": 1.9194787838289344e-05, + "loss": 0.2664, + "num_input_tokens_seen": 8400032, + "step": 6895 + }, + { + "epoch": 0.7684597393919145, + "grad_norm": 0.5437954664230347, + "learning_rate": 1.9208709210379778e-05, + "loss": 0.1136, + "num_input_tokens_seen": 8406080, + "step": 6900 + }, + { + "epoch": 0.7690165942755318, + "grad_norm": 1.4196455478668213, + "learning_rate": 1.922263058247021e-05, + "loss": 0.1186, + "num_input_tokens_seen": 8412256, + "step": 6905 + }, + { + "epoch": 0.7695734491591492, + "grad_norm": 2.11722993850708, + "learning_rate": 1.923655195456064e-05, + "loss": 0.1547, + "num_input_tokens_seen": 8418272, + "step": 6910 + }, + { + "epoch": 0.7701303040427665, + "grad_norm": 1.6488349437713623, + "learning_rate": 1.9250473326651074e-05, + "loss": 0.3193, + "num_input_tokens_seen": 8424064, + "step": 6915 + }, + { + "epoch": 0.7706871589263837, + "grad_norm": 0.03713208809494972, + "learning_rate": 1.926439469874151e-05, + "loss": 0.1761, + "num_input_tokens_seen": 8430208, + "step": 6920 + }, + { + "epoch": 0.7712440138100011, + "grad_norm": 1.541242003440857, + "learning_rate": 1.9278316070831943e-05, + "loss": 0.1573, + "num_input_tokens_seen": 8436064, + "step": 6925 + }, + { + "epoch": 0.7718008686936184, + "grad_norm": 2.414405345916748, + "learning_rate": 1.9292237442922377e-05, + "loss": 0.2985, + "num_input_tokens_seen": 8441984, + "step": 6930 + }, + { + "epoch": 0.7723577235772358, + "grad_norm": 1.1885132789611816, + "learning_rate": 1.9306158815012808e-05, + "loss": 0.1246, + "num_input_tokens_seen": 8448256, + "step": 6935 + }, + { + "epoch": 0.7729145784608531, + "grad_norm": 0.863318681716919, + "learning_rate": 1.9320080187103242e-05, + "loss": 0.129, + "num_input_tokens_seen": 8454432, + "step": 6940 + }, + { + "epoch": 0.7734714333444704, + "grad_norm": 1.0598338842391968, + "learning_rate": 1.9334001559193677e-05, + "loss": 0.1258, + "num_input_tokens_seen": 8460192, + "step": 6945 + }, + { + "epoch": 0.7740282882280878, + "grad_norm": 1.3109161853790283, + "learning_rate": 1.9347922931284108e-05, + "loss": 0.1882, + "num_input_tokens_seen": 8466432, + "step": 6950 + }, + { + "epoch": 0.7745851431117051, + "grad_norm": 0.49802106618881226, + "learning_rate": 1.9361844303374542e-05, + "loss": 0.1323, + "num_input_tokens_seen": 8472704, + "step": 6955 + }, + { + "epoch": 0.7751419979953225, + "grad_norm": 1.4641296863555908, + "learning_rate": 1.9375765675464973e-05, + "loss": 0.0913, + "num_input_tokens_seen": 8478656, + "step": 6960 + }, + { + "epoch": 0.7756988528789397, + "grad_norm": 0.8538838028907776, + "learning_rate": 1.9389687047555407e-05, + "loss": 0.3371, + "num_input_tokens_seen": 8484832, + "step": 6965 + }, + { + "epoch": 0.776255707762557, + "grad_norm": 0.2968672811985016, + "learning_rate": 1.940360841964584e-05, + "loss": 0.1026, + "num_input_tokens_seen": 8490880, + "step": 6970 + }, + { + "epoch": 0.7768125626461744, + "grad_norm": 1.3591063022613525, + "learning_rate": 1.9417529791736276e-05, + "loss": 0.2832, + "num_input_tokens_seen": 8496992, + "step": 6975 + }, + { + "epoch": 0.7773694175297917, + "grad_norm": 0.9499943852424622, + "learning_rate": 1.9431451163826707e-05, + "loss": 0.2018, + "num_input_tokens_seen": 8503200, + "step": 6980 + }, + { + "epoch": 0.7779262724134091, + "grad_norm": 1.874187707901001, + "learning_rate": 1.944537253591714e-05, + "loss": 0.241, + "num_input_tokens_seen": 8509504, + "step": 6985 + }, + { + "epoch": 0.7784831272970264, + "grad_norm": 1.085898756980896, + "learning_rate": 1.9459293908007576e-05, + "loss": 0.2594, + "num_input_tokens_seen": 8516000, + "step": 6990 + }, + { + "epoch": 0.7790399821806437, + "grad_norm": 2.4427759647369385, + "learning_rate": 1.9473215280098006e-05, + "loss": 0.3243, + "num_input_tokens_seen": 8522272, + "step": 6995 + }, + { + "epoch": 0.7795968370642611, + "grad_norm": 0.7027262449264526, + "learning_rate": 1.948713665218844e-05, + "loss": 0.157, + "num_input_tokens_seen": 8528000, + "step": 7000 + }, + { + "epoch": 0.7801536919478784, + "grad_norm": 0.7428576350212097, + "learning_rate": 1.9501058024278872e-05, + "loss": 0.0936, + "num_input_tokens_seen": 8533792, + "step": 7005 + }, + { + "epoch": 0.7807105468314958, + "grad_norm": 1.1147456169128418, + "learning_rate": 1.9514979396369306e-05, + "loss": 0.1857, + "num_input_tokens_seen": 8540480, + "step": 7010 + }, + { + "epoch": 0.781267401715113, + "grad_norm": 0.12523624300956726, + "learning_rate": 1.952890076845974e-05, + "loss": 0.338, + "num_input_tokens_seen": 8546592, + "step": 7015 + }, + { + "epoch": 0.7818242565987303, + "grad_norm": 0.22013717889785767, + "learning_rate": 1.9542822140550175e-05, + "loss": 0.2258, + "num_input_tokens_seen": 8552576, + "step": 7020 + }, + { + "epoch": 0.7823811114823477, + "grad_norm": 1.3460735082626343, + "learning_rate": 1.9556743512640606e-05, + "loss": 0.2772, + "num_input_tokens_seen": 8558656, + "step": 7025 + }, + { + "epoch": 0.782937966365965, + "grad_norm": 1.1584811210632324, + "learning_rate": 1.957066488473104e-05, + "loss": 0.2256, + "num_input_tokens_seen": 8564736, + "step": 7030 + }, + { + "epoch": 0.7834948212495824, + "grad_norm": 1.1689971685409546, + "learning_rate": 1.9584586256821474e-05, + "loss": 0.1622, + "num_input_tokens_seen": 8571008, + "step": 7035 + }, + { + "epoch": 0.7840516761331997, + "grad_norm": 1.205058217048645, + "learning_rate": 1.9598507628911905e-05, + "loss": 0.1717, + "num_input_tokens_seen": 8577312, + "step": 7040 + }, + { + "epoch": 0.784608531016817, + "grad_norm": 1.097424864768982, + "learning_rate": 1.961242900100234e-05, + "loss": 0.1853, + "num_input_tokens_seen": 8583616, + "step": 7045 + }, + { + "epoch": 0.7851653859004344, + "grad_norm": 2.524976968765259, + "learning_rate": 1.962635037309277e-05, + "loss": 0.2177, + "num_input_tokens_seen": 8589536, + "step": 7050 + }, + { + "epoch": 0.7857222407840517, + "grad_norm": 1.1712993383407593, + "learning_rate": 1.9640271745183205e-05, + "loss": 0.2142, + "num_input_tokens_seen": 8595424, + "step": 7055 + }, + { + "epoch": 0.786279095667669, + "grad_norm": 1.1905944347381592, + "learning_rate": 1.965419311727364e-05, + "loss": 0.2405, + "num_input_tokens_seen": 8601280, + "step": 7060 + }, + { + "epoch": 0.7868359505512863, + "grad_norm": 1.377095103263855, + "learning_rate": 1.9668114489364074e-05, + "loss": 0.2006, + "num_input_tokens_seen": 8607424, + "step": 7065 + }, + { + "epoch": 0.7873928054349036, + "grad_norm": 0.984490156173706, + "learning_rate": 1.9682035861454508e-05, + "loss": 0.1396, + "num_input_tokens_seen": 8613568, + "step": 7070 + }, + { + "epoch": 0.787949660318521, + "grad_norm": 0.07719984650611877, + "learning_rate": 1.969595723354494e-05, + "loss": 0.1282, + "num_input_tokens_seen": 8619520, + "step": 7075 + }, + { + "epoch": 0.7885065152021383, + "grad_norm": 0.30944785475730896, + "learning_rate": 1.9709878605635373e-05, + "loss": 0.0968, + "num_input_tokens_seen": 8625792, + "step": 7080 + }, + { + "epoch": 0.7890633700857557, + "grad_norm": 2.5953617095947266, + "learning_rate": 1.9723799977725804e-05, + "loss": 0.1621, + "num_input_tokens_seen": 8632160, + "step": 7085 + }, + { + "epoch": 0.789620224969373, + "grad_norm": 0.2861834466457367, + "learning_rate": 1.973772134981624e-05, + "loss": 0.2742, + "num_input_tokens_seen": 8638432, + "step": 7090 + }, + { + "epoch": 0.7901770798529903, + "grad_norm": 2.254263401031494, + "learning_rate": 1.9751642721906673e-05, + "loss": 0.1814, + "num_input_tokens_seen": 8643936, + "step": 7095 + }, + { + "epoch": 0.7907339347366077, + "grad_norm": 2.0500571727752686, + "learning_rate": 1.9765564093997104e-05, + "loss": 0.2169, + "num_input_tokens_seen": 8649952, + "step": 7100 + }, + { + "epoch": 0.7912907896202249, + "grad_norm": 1.2336770296096802, + "learning_rate": 1.9779485466087538e-05, + "loss": 0.1238, + "num_input_tokens_seen": 8656000, + "step": 7105 + }, + { + "epoch": 0.7918476445038423, + "grad_norm": 1.2725788354873657, + "learning_rate": 1.9793406838177972e-05, + "loss": 0.1691, + "num_input_tokens_seen": 8662304, + "step": 7110 + }, + { + "epoch": 0.7924044993874596, + "grad_norm": 0.8854738473892212, + "learning_rate": 1.9807328210268407e-05, + "loss": 0.2291, + "num_input_tokens_seen": 8668736, + "step": 7115 + }, + { + "epoch": 0.792961354271077, + "grad_norm": 1.1221109628677368, + "learning_rate": 1.9821249582358838e-05, + "loss": 0.1388, + "num_input_tokens_seen": 8675008, + "step": 7120 + }, + { + "epoch": 0.7935182091546943, + "grad_norm": 0.9675194621086121, + "learning_rate": 1.9835170954449272e-05, + "loss": 0.1255, + "num_input_tokens_seen": 8680960, + "step": 7125 + }, + { + "epoch": 0.7940750640383116, + "grad_norm": 0.7423879504203796, + "learning_rate": 1.9849092326539706e-05, + "loss": 0.236, + "num_input_tokens_seen": 8687136, + "step": 7130 + }, + { + "epoch": 0.794631918921929, + "grad_norm": 1.5988578796386719, + "learning_rate": 1.9863013698630137e-05, + "loss": 0.3053, + "num_input_tokens_seen": 8693120, + "step": 7135 + }, + { + "epoch": 0.7951887738055463, + "grad_norm": 0.7841442823410034, + "learning_rate": 1.987693507072057e-05, + "loss": 0.1594, + "num_input_tokens_seen": 8699296, + "step": 7140 + }, + { + "epoch": 0.7957456286891637, + "grad_norm": 1.5402898788452148, + "learning_rate": 1.9890856442811002e-05, + "loss": 0.2144, + "num_input_tokens_seen": 8705184, + "step": 7145 + }, + { + "epoch": 0.7963024835727809, + "grad_norm": 0.7218888401985168, + "learning_rate": 1.9904777814901437e-05, + "loss": 0.2448, + "num_input_tokens_seen": 8711232, + "step": 7150 + }, + { + "epoch": 0.7968593384563982, + "grad_norm": 0.8209134340286255, + "learning_rate": 1.991869918699187e-05, + "loss": 0.1454, + "num_input_tokens_seen": 8717184, + "step": 7155 + }, + { + "epoch": 0.7974161933400156, + "grad_norm": 1.6549304723739624, + "learning_rate": 1.9932620559082305e-05, + "loss": 0.2974, + "num_input_tokens_seen": 8723360, + "step": 7160 + }, + { + "epoch": 0.7979730482236329, + "grad_norm": 0.026673218235373497, + "learning_rate": 1.9946541931172736e-05, + "loss": 0.1301, + "num_input_tokens_seen": 8729472, + "step": 7165 + }, + { + "epoch": 0.7985299031072502, + "grad_norm": 1.03033447265625, + "learning_rate": 1.996046330326317e-05, + "loss": 0.3452, + "num_input_tokens_seen": 8734944, + "step": 7170 + }, + { + "epoch": 0.7990867579908676, + "grad_norm": 0.6045578122138977, + "learning_rate": 1.9974384675353605e-05, + "loss": 0.1414, + "num_input_tokens_seen": 8741056, + "step": 7175 + }, + { + "epoch": 0.7996436128744849, + "grad_norm": 2.1417009830474854, + "learning_rate": 1.9988306047444036e-05, + "loss": 0.2889, + "num_input_tokens_seen": 8747040, + "step": 7180 + }, + { + "epoch": 0.8002004677581023, + "grad_norm": 0.7500271201133728, + "learning_rate": 2.000222741953447e-05, + "loss": 0.1608, + "num_input_tokens_seen": 8753280, + "step": 7185 + }, + { + "epoch": 0.8007573226417196, + "grad_norm": 0.9714496731758118, + "learning_rate": 2.0016148791624905e-05, + "loss": 0.1914, + "num_input_tokens_seen": 8759552, + "step": 7190 + }, + { + "epoch": 0.8013141775253368, + "grad_norm": 2.6341745853424072, + "learning_rate": 2.0030070163715336e-05, + "loss": 0.3664, + "num_input_tokens_seen": 8765760, + "step": 7195 + }, + { + "epoch": 0.8018710324089542, + "grad_norm": 1.3965508937835693, + "learning_rate": 2.004399153580577e-05, + "loss": 0.311, + "num_input_tokens_seen": 8771776, + "step": 7200 + }, + { + "epoch": 0.8024278872925715, + "grad_norm": 0.9090884923934937, + "learning_rate": 2.0057912907896204e-05, + "loss": 0.1665, + "num_input_tokens_seen": 8778144, + "step": 7205 + }, + { + "epoch": 0.8029847421761889, + "grad_norm": 0.9472143650054932, + "learning_rate": 2.007183427998664e-05, + "loss": 0.0615, + "num_input_tokens_seen": 8784160, + "step": 7210 + }, + { + "epoch": 0.8035415970598062, + "grad_norm": 0.5543638467788696, + "learning_rate": 2.008575565207707e-05, + "loss": 0.2004, + "num_input_tokens_seen": 8790176, + "step": 7215 + }, + { + "epoch": 0.8040984519434236, + "grad_norm": 1.2334179878234863, + "learning_rate": 2.0099677024167504e-05, + "loss": 0.2365, + "num_input_tokens_seen": 8796448, + "step": 7220 + }, + { + "epoch": 0.8046553068270409, + "grad_norm": 0.06548963487148285, + "learning_rate": 2.0113598396257935e-05, + "loss": 0.1044, + "num_input_tokens_seen": 8802560, + "step": 7225 + }, + { + "epoch": 0.8052121617106582, + "grad_norm": 0.11786920577287674, + "learning_rate": 2.012751976834837e-05, + "loss": 0.0781, + "num_input_tokens_seen": 8808608, + "step": 7230 + }, + { + "epoch": 0.8057690165942756, + "grad_norm": 0.06946415454149246, + "learning_rate": 2.0141441140438803e-05, + "loss": 0.202, + "num_input_tokens_seen": 8814304, + "step": 7235 + }, + { + "epoch": 0.8063258714778928, + "grad_norm": 0.47965988516807556, + "learning_rate": 2.0155362512529234e-05, + "loss": 0.0809, + "num_input_tokens_seen": 8820384, + "step": 7240 + }, + { + "epoch": 0.8068827263615101, + "grad_norm": 1.6461430788040161, + "learning_rate": 2.016928388461967e-05, + "loss": 0.3742, + "num_input_tokens_seen": 8826368, + "step": 7245 + }, + { + "epoch": 0.8074395812451275, + "grad_norm": 1.238863229751587, + "learning_rate": 2.0183205256710103e-05, + "loss": 0.2364, + "num_input_tokens_seen": 8832384, + "step": 7250 + }, + { + "epoch": 0.8079964361287448, + "grad_norm": 0.8485961556434631, + "learning_rate": 2.0197126628800537e-05, + "loss": 0.12, + "num_input_tokens_seen": 8838464, + "step": 7255 + }, + { + "epoch": 0.8085532910123622, + "grad_norm": 0.33811306953430176, + "learning_rate": 2.0211048000890968e-05, + "loss": 0.1667, + "num_input_tokens_seen": 8844032, + "step": 7260 + }, + { + "epoch": 0.8091101458959795, + "grad_norm": 0.9103360176086426, + "learning_rate": 2.0224969372981403e-05, + "loss": 0.1763, + "num_input_tokens_seen": 8850272, + "step": 7265 + }, + { + "epoch": 0.8096670007795969, + "grad_norm": 1.416707158088684, + "learning_rate": 2.0238890745071834e-05, + "loss": 0.1179, + "num_input_tokens_seen": 8855936, + "step": 7270 + }, + { + "epoch": 0.8102238556632142, + "grad_norm": 1.9092522859573364, + "learning_rate": 2.0252812117162268e-05, + "loss": 0.2935, + "num_input_tokens_seen": 8862272, + "step": 7275 + }, + { + "epoch": 0.8107807105468315, + "grad_norm": 0.27578938007354736, + "learning_rate": 2.0266733489252702e-05, + "loss": 0.2535, + "num_input_tokens_seen": 8868416, + "step": 7280 + }, + { + "epoch": 0.8113375654304488, + "grad_norm": 0.40970003604888916, + "learning_rate": 2.0280654861343133e-05, + "loss": 0.228, + "num_input_tokens_seen": 8874496, + "step": 7285 + }, + { + "epoch": 0.8118944203140661, + "grad_norm": 1.2048430442810059, + "learning_rate": 2.0294576233433567e-05, + "loss": 0.1612, + "num_input_tokens_seen": 8880672, + "step": 7290 + }, + { + "epoch": 0.8124512751976835, + "grad_norm": 1.658448338508606, + "learning_rate": 2.0308497605524002e-05, + "loss": 0.3092, + "num_input_tokens_seen": 8886656, + "step": 7295 + }, + { + "epoch": 0.8130081300813008, + "grad_norm": 1.4108128547668457, + "learning_rate": 2.0322418977614436e-05, + "loss": 0.1386, + "num_input_tokens_seen": 8892640, + "step": 7300 + }, + { + "epoch": 0.8135649849649181, + "grad_norm": 1.718407392501831, + "learning_rate": 2.033634034970487e-05, + "loss": 0.2097, + "num_input_tokens_seen": 8898816, + "step": 7305 + }, + { + "epoch": 0.8141218398485355, + "grad_norm": 0.6220503449440002, + "learning_rate": 2.03502617217953e-05, + "loss": 0.1531, + "num_input_tokens_seen": 8904736, + "step": 7310 + }, + { + "epoch": 0.8146786947321528, + "grad_norm": 0.5294393301010132, + "learning_rate": 2.0364183093885732e-05, + "loss": 0.2184, + "num_input_tokens_seen": 8910688, + "step": 7315 + }, + { + "epoch": 0.8152355496157702, + "grad_norm": 0.39240795373916626, + "learning_rate": 2.0378104465976167e-05, + "loss": 0.1561, + "num_input_tokens_seen": 8916992, + "step": 7320 + }, + { + "epoch": 0.8157924044993875, + "grad_norm": 0.49715232849121094, + "learning_rate": 2.03920258380666e-05, + "loss": 0.1486, + "num_input_tokens_seen": 8922240, + "step": 7325 + }, + { + "epoch": 0.8163492593830048, + "grad_norm": 1.3667107820510864, + "learning_rate": 2.0405947210157035e-05, + "loss": 0.2212, + "num_input_tokens_seen": 8928000, + "step": 7330 + }, + { + "epoch": 0.8169061142666221, + "grad_norm": 1.0533171892166138, + "learning_rate": 2.0419868582247466e-05, + "loss": 0.2771, + "num_input_tokens_seen": 8934016, + "step": 7335 + }, + { + "epoch": 0.8174629691502394, + "grad_norm": 2.052461862564087, + "learning_rate": 2.04337899543379e-05, + "loss": 0.3886, + "num_input_tokens_seen": 8940192, + "step": 7340 + }, + { + "epoch": 0.8180198240338568, + "grad_norm": 0.5691341161727905, + "learning_rate": 2.0447711326428335e-05, + "loss": 0.2112, + "num_input_tokens_seen": 8946592, + "step": 7345 + }, + { + "epoch": 0.8185766789174741, + "grad_norm": 1.2654699087142944, + "learning_rate": 2.046163269851877e-05, + "loss": 0.2207, + "num_input_tokens_seen": 8952448, + "step": 7350 + }, + { + "epoch": 0.8191335338010914, + "grad_norm": 1.6893494129180908, + "learning_rate": 2.04755540706092e-05, + "loss": 0.2625, + "num_input_tokens_seen": 8958336, + "step": 7355 + }, + { + "epoch": 0.8196903886847088, + "grad_norm": 0.888055145740509, + "learning_rate": 2.048947544269963e-05, + "loss": 0.1882, + "num_input_tokens_seen": 8964448, + "step": 7360 + }, + { + "epoch": 0.8202472435683261, + "grad_norm": 0.8008314371109009, + "learning_rate": 2.0503396814790065e-05, + "loss": 0.1898, + "num_input_tokens_seen": 8970048, + "step": 7365 + }, + { + "epoch": 0.8208040984519435, + "grad_norm": 0.8429785966873169, + "learning_rate": 2.05173181868805e-05, + "loss": 0.1083, + "num_input_tokens_seen": 8975968, + "step": 7370 + }, + { + "epoch": 0.8213609533355608, + "grad_norm": 0.8382023572921753, + "learning_rate": 2.0531239558970934e-05, + "loss": 0.1673, + "num_input_tokens_seen": 8982240, + "step": 7375 + }, + { + "epoch": 0.821917808219178, + "grad_norm": 1.0239118337631226, + "learning_rate": 2.0545160931061365e-05, + "loss": 0.1593, + "num_input_tokens_seen": 8988256, + "step": 7380 + }, + { + "epoch": 0.8224746631027954, + "grad_norm": 1.4484363794326782, + "learning_rate": 2.05590823031518e-05, + "loss": 0.2373, + "num_input_tokens_seen": 8994528, + "step": 7385 + }, + { + "epoch": 0.8230315179864127, + "grad_norm": 1.0931698083877563, + "learning_rate": 2.0573003675242234e-05, + "loss": 0.2565, + "num_input_tokens_seen": 9000576, + "step": 7390 + }, + { + "epoch": 0.8235883728700301, + "grad_norm": 1.327697515487671, + "learning_rate": 2.0586925047332668e-05, + "loss": 0.2172, + "num_input_tokens_seen": 9006688, + "step": 7395 + }, + { + "epoch": 0.8241452277536474, + "grad_norm": 0.9808008670806885, + "learning_rate": 2.06008464194231e-05, + "loss": 0.4001, + "num_input_tokens_seen": 9012864, + "step": 7400 + }, + { + "epoch": 0.8247020826372647, + "grad_norm": 1.6595075130462646, + "learning_rate": 2.061476779151353e-05, + "loss": 0.2127, + "num_input_tokens_seen": 9019072, + "step": 7405 + }, + { + "epoch": 0.8252589375208821, + "grad_norm": 0.18792548775672913, + "learning_rate": 2.0628689163603964e-05, + "loss": 0.097, + "num_input_tokens_seen": 9025280, + "step": 7410 + }, + { + "epoch": 0.8258157924044994, + "grad_norm": 1.4728162288665771, + "learning_rate": 2.06426105356944e-05, + "loss": 0.2799, + "num_input_tokens_seen": 9031200, + "step": 7415 + }, + { + "epoch": 0.8263726472881168, + "grad_norm": 1.1427099704742432, + "learning_rate": 2.0656531907784833e-05, + "loss": 0.1223, + "num_input_tokens_seen": 9036704, + "step": 7420 + }, + { + "epoch": 0.826929502171734, + "grad_norm": 0.3323042690753937, + "learning_rate": 2.0670453279875264e-05, + "loss": 0.128, + "num_input_tokens_seen": 9043040, + "step": 7425 + }, + { + "epoch": 0.8274863570553513, + "grad_norm": 1.7646584510803223, + "learning_rate": 2.0684374651965698e-05, + "loss": 0.2697, + "num_input_tokens_seen": 9048864, + "step": 7430 + }, + { + "epoch": 0.8280432119389687, + "grad_norm": 1.2233121395111084, + "learning_rate": 2.0698296024056133e-05, + "loss": 0.1074, + "num_input_tokens_seen": 9055168, + "step": 7435 + }, + { + "epoch": 0.828600066822586, + "grad_norm": 2.432018995285034, + "learning_rate": 2.0712217396146567e-05, + "loss": 0.1686, + "num_input_tokens_seen": 9061312, + "step": 7440 + }, + { + "epoch": 0.8291569217062034, + "grad_norm": 2.1572353839874268, + "learning_rate": 2.0726138768237e-05, + "loss": 0.1211, + "num_input_tokens_seen": 9067392, + "step": 7445 + }, + { + "epoch": 0.8297137765898207, + "grad_norm": 0.3449375629425049, + "learning_rate": 2.0740060140327432e-05, + "loss": 0.0732, + "num_input_tokens_seen": 9073440, + "step": 7450 + }, + { + "epoch": 0.830270631473438, + "grad_norm": 1.5976934432983398, + "learning_rate": 2.0753981512417863e-05, + "loss": 0.208, + "num_input_tokens_seen": 9079616, + "step": 7455 + }, + { + "epoch": 0.8308274863570554, + "grad_norm": 0.5766518712043762, + "learning_rate": 2.0767902884508297e-05, + "loss": 0.1556, + "num_input_tokens_seen": 9085120, + "step": 7460 + }, + { + "epoch": 0.8313843412406727, + "grad_norm": 0.35568302869796753, + "learning_rate": 2.0781824256598732e-05, + "loss": 0.1136, + "num_input_tokens_seen": 9091360, + "step": 7465 + }, + { + "epoch": 0.83194119612429, + "grad_norm": 0.5506699681282043, + "learning_rate": 2.0795745628689166e-05, + "loss": 0.0884, + "num_input_tokens_seen": 9097440, + "step": 7470 + }, + { + "epoch": 0.8324980510079073, + "grad_norm": 0.4635283052921295, + "learning_rate": 2.0809667000779597e-05, + "loss": 0.1957, + "num_input_tokens_seen": 9103680, + "step": 7475 + }, + { + "epoch": 0.8330549058915246, + "grad_norm": 2.437946081161499, + "learning_rate": 2.082358837287003e-05, + "loss": 0.2979, + "num_input_tokens_seen": 9109632, + "step": 7480 + }, + { + "epoch": 0.833611760775142, + "grad_norm": 0.22178786993026733, + "learning_rate": 2.0837509744960466e-05, + "loss": 0.1764, + "num_input_tokens_seen": 9115712, + "step": 7485 + }, + { + "epoch": 0.8341686156587593, + "grad_norm": 1.0925296545028687, + "learning_rate": 2.08514311170509e-05, + "loss": 0.1786, + "num_input_tokens_seen": 9121920, + "step": 7490 + }, + { + "epoch": 0.8347254705423767, + "grad_norm": 1.5778173208236694, + "learning_rate": 2.086535248914133e-05, + "loss": 0.2104, + "num_input_tokens_seen": 9127936, + "step": 7495 + }, + { + "epoch": 0.835282325425994, + "grad_norm": 1.203887939453125, + "learning_rate": 2.0879273861231762e-05, + "loss": 0.2645, + "num_input_tokens_seen": 9133952, + "step": 7500 + }, + { + "epoch": 0.8358391803096114, + "grad_norm": 0.7993137836456299, + "learning_rate": 2.0893195233322196e-05, + "loss": 0.2745, + "num_input_tokens_seen": 9140032, + "step": 7505 + }, + { + "epoch": 0.8363960351932287, + "grad_norm": 2.0133988857269287, + "learning_rate": 2.090711660541263e-05, + "loss": 0.2653, + "num_input_tokens_seen": 9145504, + "step": 7510 + }, + { + "epoch": 0.8369528900768459, + "grad_norm": 1.1692441701889038, + "learning_rate": 2.0921037977503065e-05, + "loss": 0.2727, + "num_input_tokens_seen": 9151776, + "step": 7515 + }, + { + "epoch": 0.8375097449604633, + "grad_norm": 1.2239412069320679, + "learning_rate": 2.0934959349593496e-05, + "loss": 0.2034, + "num_input_tokens_seen": 9158144, + "step": 7520 + }, + { + "epoch": 0.8380665998440806, + "grad_norm": 0.9631525874137878, + "learning_rate": 2.094888072168393e-05, + "loss": 0.3384, + "num_input_tokens_seen": 9164000, + "step": 7525 + }, + { + "epoch": 0.838623454727698, + "grad_norm": 0.8602821826934814, + "learning_rate": 2.0962802093774364e-05, + "loss": 0.2383, + "num_input_tokens_seen": 9170016, + "step": 7530 + }, + { + "epoch": 0.8391803096113153, + "grad_norm": 0.9591982364654541, + "learning_rate": 2.09767234658648e-05, + "loss": 0.2426, + "num_input_tokens_seen": 9176128, + "step": 7535 + }, + { + "epoch": 0.8397371644949326, + "grad_norm": 1.1095664501190186, + "learning_rate": 2.099064483795523e-05, + "loss": 0.271, + "num_input_tokens_seen": 9182464, + "step": 7540 + }, + { + "epoch": 0.84029401937855, + "grad_norm": 0.7734997868537903, + "learning_rate": 2.100456621004566e-05, + "loss": 0.1439, + "num_input_tokens_seen": 9188416, + "step": 7545 + }, + { + "epoch": 0.8408508742621673, + "grad_norm": 0.506401002407074, + "learning_rate": 2.1018487582136095e-05, + "loss": 0.0776, + "num_input_tokens_seen": 9194272, + "step": 7550 + }, + { + "epoch": 0.8414077291457847, + "grad_norm": 0.39438945055007935, + "learning_rate": 2.103240895422653e-05, + "loss": 0.2866, + "num_input_tokens_seen": 9200416, + "step": 7555 + }, + { + "epoch": 0.8419645840294019, + "grad_norm": 0.258868545293808, + "learning_rate": 2.1046330326316964e-05, + "loss": 0.1603, + "num_input_tokens_seen": 9206880, + "step": 7560 + }, + { + "epoch": 0.8425214389130192, + "grad_norm": 0.4962333142757416, + "learning_rate": 2.1060251698407395e-05, + "loss": 0.0958, + "num_input_tokens_seen": 9212864, + "step": 7565 + }, + { + "epoch": 0.8430782937966366, + "grad_norm": 1.146802306175232, + "learning_rate": 2.107417307049783e-05, + "loss": 0.2083, + "num_input_tokens_seen": 9217376, + "step": 7570 + }, + { + "epoch": 0.8436351486802539, + "grad_norm": 1.8241050243377686, + "learning_rate": 2.1088094442588263e-05, + "loss": 0.2269, + "num_input_tokens_seen": 9223168, + "step": 7575 + }, + { + "epoch": 0.8441920035638713, + "grad_norm": 1.064924955368042, + "learning_rate": 2.1102015814678698e-05, + "loss": 0.3361, + "num_input_tokens_seen": 9229184, + "step": 7580 + }, + { + "epoch": 0.8447488584474886, + "grad_norm": 0.922523021697998, + "learning_rate": 2.1115937186769132e-05, + "loss": 0.1561, + "num_input_tokens_seen": 9235168, + "step": 7585 + }, + { + "epoch": 0.8453057133311059, + "grad_norm": 1.4029263257980347, + "learning_rate": 2.112985855885956e-05, + "loss": 0.1415, + "num_input_tokens_seen": 9241504, + "step": 7590 + }, + { + "epoch": 0.8458625682147233, + "grad_norm": 0.6346423029899597, + "learning_rate": 2.1143779930949994e-05, + "loss": 0.2274, + "num_input_tokens_seen": 9247552, + "step": 7595 + }, + { + "epoch": 0.8464194230983406, + "grad_norm": 0.7439924478530884, + "learning_rate": 2.1157701303040428e-05, + "loss": 0.195, + "num_input_tokens_seen": 9253888, + "step": 7600 + }, + { + "epoch": 0.8469762779819578, + "grad_norm": 1.1723068952560425, + "learning_rate": 2.1171622675130862e-05, + "loss": 0.2606, + "num_input_tokens_seen": 9260256, + "step": 7605 + }, + { + "epoch": 0.8475331328655752, + "grad_norm": 1.109169840812683, + "learning_rate": 2.1185544047221297e-05, + "loss": 0.2446, + "num_input_tokens_seen": 9266656, + "step": 7610 + }, + { + "epoch": 0.8480899877491925, + "grad_norm": 1.0733566284179688, + "learning_rate": 2.1199465419311728e-05, + "loss": 0.0695, + "num_input_tokens_seen": 9272768, + "step": 7615 + }, + { + "epoch": 0.8486468426328099, + "grad_norm": 0.5418310761451721, + "learning_rate": 2.1213386791402162e-05, + "loss": 0.0888, + "num_input_tokens_seen": 9278880, + "step": 7620 + }, + { + "epoch": 0.8492036975164272, + "grad_norm": 0.4845014810562134, + "learning_rate": 2.1227308163492596e-05, + "loss": 0.1366, + "num_input_tokens_seen": 9284896, + "step": 7625 + }, + { + "epoch": 0.8497605524000446, + "grad_norm": 0.5504873991012573, + "learning_rate": 2.124122953558303e-05, + "loss": 0.3228, + "num_input_tokens_seen": 9290720, + "step": 7630 + }, + { + "epoch": 0.8503174072836619, + "grad_norm": 1.3932807445526123, + "learning_rate": 2.125515090767346e-05, + "loss": 0.1261, + "num_input_tokens_seen": 9296704, + "step": 7635 + }, + { + "epoch": 0.8508742621672792, + "grad_norm": 0.5467883944511414, + "learning_rate": 2.1269072279763893e-05, + "loss": 0.2429, + "num_input_tokens_seen": 9303136, + "step": 7640 + }, + { + "epoch": 0.8514311170508966, + "grad_norm": 1.1193634271621704, + "learning_rate": 2.1282993651854327e-05, + "loss": 0.204, + "num_input_tokens_seen": 9309024, + "step": 7645 + }, + { + "epoch": 0.8519879719345138, + "grad_norm": 1.1751694679260254, + "learning_rate": 2.129691502394476e-05, + "loss": 0.0856, + "num_input_tokens_seen": 9315072, + "step": 7650 + }, + { + "epoch": 0.8525448268181312, + "grad_norm": 1.758052945137024, + "learning_rate": 2.1310836396035196e-05, + "loss": 0.1791, + "num_input_tokens_seen": 9321216, + "step": 7655 + }, + { + "epoch": 0.8531016817017485, + "grad_norm": 0.7468615770339966, + "learning_rate": 2.1324757768125626e-05, + "loss": 0.1498, + "num_input_tokens_seen": 9327200, + "step": 7660 + }, + { + "epoch": 0.8536585365853658, + "grad_norm": 0.33773329854011536, + "learning_rate": 2.133867914021606e-05, + "loss": 0.2484, + "num_input_tokens_seen": 9333056, + "step": 7665 + }, + { + "epoch": 0.8542153914689832, + "grad_norm": 0.7686031460762024, + "learning_rate": 2.1352600512306495e-05, + "loss": 0.3066, + "num_input_tokens_seen": 9339424, + "step": 7670 + }, + { + "epoch": 0.8547722463526005, + "grad_norm": 0.793566882610321, + "learning_rate": 2.136652188439693e-05, + "loss": 0.1595, + "num_input_tokens_seen": 9345376, + "step": 7675 + }, + { + "epoch": 0.8553291012362179, + "grad_norm": 0.651665210723877, + "learning_rate": 2.138044325648736e-05, + "loss": 0.1998, + "num_input_tokens_seen": 9351808, + "step": 7680 + }, + { + "epoch": 0.8558859561198352, + "grad_norm": 1.0045068264007568, + "learning_rate": 2.139436462857779e-05, + "loss": 0.1834, + "num_input_tokens_seen": 9357696, + "step": 7685 + }, + { + "epoch": 0.8564428110034525, + "grad_norm": 0.6681436896324158, + "learning_rate": 2.1408286000668226e-05, + "loss": 0.2438, + "num_input_tokens_seen": 9363712, + "step": 7690 + }, + { + "epoch": 0.8569996658870699, + "grad_norm": 0.08117666095495224, + "learning_rate": 2.142220737275866e-05, + "loss": 0.1457, + "num_input_tokens_seen": 9369888, + "step": 7695 + }, + { + "epoch": 0.8575565207706871, + "grad_norm": 0.2567492723464966, + "learning_rate": 2.1436128744849094e-05, + "loss": 0.1926, + "num_input_tokens_seen": 9375840, + "step": 7700 + }, + { + "epoch": 0.8581133756543045, + "grad_norm": 0.7593568563461304, + "learning_rate": 2.1450050116939525e-05, + "loss": 0.1774, + "num_input_tokens_seen": 9381600, + "step": 7705 + }, + { + "epoch": 0.8586702305379218, + "grad_norm": 0.8115570545196533, + "learning_rate": 2.146397148902996e-05, + "loss": 0.1107, + "num_input_tokens_seen": 9387936, + "step": 7710 + }, + { + "epoch": 0.8592270854215391, + "grad_norm": 1.393341064453125, + "learning_rate": 2.1477892861120394e-05, + "loss": 0.2454, + "num_input_tokens_seen": 9394016, + "step": 7715 + }, + { + "epoch": 0.8597839403051565, + "grad_norm": 0.719292402267456, + "learning_rate": 2.1491814233210828e-05, + "loss": 0.0937, + "num_input_tokens_seen": 9399904, + "step": 7720 + }, + { + "epoch": 0.8603407951887738, + "grad_norm": 1.1747157573699951, + "learning_rate": 2.150573560530126e-05, + "loss": 0.2323, + "num_input_tokens_seen": 9406400, + "step": 7725 + }, + { + "epoch": 0.8608976500723912, + "grad_norm": 0.1864500194787979, + "learning_rate": 2.151965697739169e-05, + "loss": 0.1312, + "num_input_tokens_seen": 9412768, + "step": 7730 + }, + { + "epoch": 0.8614545049560085, + "grad_norm": 0.7743856906890869, + "learning_rate": 2.1533578349482124e-05, + "loss": 0.1857, + "num_input_tokens_seen": 9418752, + "step": 7735 + }, + { + "epoch": 0.8620113598396258, + "grad_norm": 0.6758248805999756, + "learning_rate": 2.154749972157256e-05, + "loss": 0.2294, + "num_input_tokens_seen": 9424352, + "step": 7740 + }, + { + "epoch": 0.8625682147232431, + "grad_norm": 0.5026268362998962, + "learning_rate": 2.1561421093662993e-05, + "loss": 0.1931, + "num_input_tokens_seen": 9430464, + "step": 7745 + }, + { + "epoch": 0.8631250696068604, + "grad_norm": 1.4081079959869385, + "learning_rate": 2.1575342465753427e-05, + "loss": 0.2546, + "num_input_tokens_seen": 9436448, + "step": 7750 + }, + { + "epoch": 0.8636819244904778, + "grad_norm": 0.4624461233615875, + "learning_rate": 2.158926383784386e-05, + "loss": 0.1353, + "num_input_tokens_seen": 9442656, + "step": 7755 + }, + { + "epoch": 0.8642387793740951, + "grad_norm": 1.4525306224822998, + "learning_rate": 2.1603185209934293e-05, + "loss": 0.2065, + "num_input_tokens_seen": 9448480, + "step": 7760 + }, + { + "epoch": 0.8647956342577124, + "grad_norm": 0.6211286187171936, + "learning_rate": 2.1617106582024727e-05, + "loss": 0.169, + "num_input_tokens_seen": 9454336, + "step": 7765 + }, + { + "epoch": 0.8653524891413298, + "grad_norm": 0.5402584671974182, + "learning_rate": 2.1631027954115158e-05, + "loss": 0.1917, + "num_input_tokens_seen": 9460896, + "step": 7770 + }, + { + "epoch": 0.8659093440249471, + "grad_norm": 1.636159896850586, + "learning_rate": 2.1644949326205592e-05, + "loss": 0.2518, + "num_input_tokens_seen": 9466688, + "step": 7775 + }, + { + "epoch": 0.8664661989085645, + "grad_norm": 0.3908396363258362, + "learning_rate": 2.1658870698296023e-05, + "loss": 0.0614, + "num_input_tokens_seen": 9473024, + "step": 7780 + }, + { + "epoch": 0.8670230537921818, + "grad_norm": 0.32140621542930603, + "learning_rate": 2.1672792070386458e-05, + "loss": 0.169, + "num_input_tokens_seen": 9479104, + "step": 7785 + }, + { + "epoch": 0.867579908675799, + "grad_norm": 0.32169029116630554, + "learning_rate": 2.1686713442476892e-05, + "loss": 0.203, + "num_input_tokens_seen": 9485504, + "step": 7790 + }, + { + "epoch": 0.8681367635594164, + "grad_norm": 1.3107900619506836, + "learning_rate": 2.1700634814567326e-05, + "loss": 0.1064, + "num_input_tokens_seen": 9491968, + "step": 7795 + }, + { + "epoch": 0.8686936184430337, + "grad_norm": 1.0624841451644897, + "learning_rate": 2.1714556186657757e-05, + "loss": 0.19, + "num_input_tokens_seen": 9497888, + "step": 7800 + }, + { + "epoch": 0.8692504733266511, + "grad_norm": 0.4191210865974426, + "learning_rate": 2.172847755874819e-05, + "loss": 0.2014, + "num_input_tokens_seen": 9503872, + "step": 7805 + }, + { + "epoch": 0.8698073282102684, + "grad_norm": 0.2967255413532257, + "learning_rate": 2.1742398930838626e-05, + "loss": 0.1946, + "num_input_tokens_seen": 9509568, + "step": 7810 + }, + { + "epoch": 0.8703641830938857, + "grad_norm": 0.5320481657981873, + "learning_rate": 2.175632030292906e-05, + "loss": 0.1147, + "num_input_tokens_seen": 9515584, + "step": 7815 + }, + { + "epoch": 0.8709210379775031, + "grad_norm": 0.7225697040557861, + "learning_rate": 2.177024167501949e-05, + "loss": 0.1651, + "num_input_tokens_seen": 9521248, + "step": 7820 + }, + { + "epoch": 0.8714778928611204, + "grad_norm": 0.8230639100074768, + "learning_rate": 2.1784163047109922e-05, + "loss": 0.1401, + "num_input_tokens_seen": 9527680, + "step": 7825 + }, + { + "epoch": 0.8720347477447378, + "grad_norm": 1.549275279045105, + "learning_rate": 2.1798084419200356e-05, + "loss": 0.1687, + "num_input_tokens_seen": 9533632, + "step": 7830 + }, + { + "epoch": 0.872591602628355, + "grad_norm": 0.5012232661247253, + "learning_rate": 2.181200579129079e-05, + "loss": 0.1458, + "num_input_tokens_seen": 9539968, + "step": 7835 + }, + { + "epoch": 0.8731484575119723, + "grad_norm": 1.7651023864746094, + "learning_rate": 2.1825927163381225e-05, + "loss": 0.2578, + "num_input_tokens_seen": 9545984, + "step": 7840 + }, + { + "epoch": 0.8737053123955897, + "grad_norm": 0.6727228760719299, + "learning_rate": 2.1839848535471656e-05, + "loss": 0.2032, + "num_input_tokens_seen": 9552256, + "step": 7845 + }, + { + "epoch": 0.874262167279207, + "grad_norm": 0.7708010673522949, + "learning_rate": 2.185376990756209e-05, + "loss": 0.2748, + "num_input_tokens_seen": 9558464, + "step": 7850 + }, + { + "epoch": 0.8748190221628244, + "grad_norm": 1.2174922227859497, + "learning_rate": 2.1867691279652525e-05, + "loss": 0.2523, + "num_input_tokens_seen": 9564480, + "step": 7855 + }, + { + "epoch": 0.8753758770464417, + "grad_norm": 1.591671109199524, + "learning_rate": 2.188161265174296e-05, + "loss": 0.1652, + "num_input_tokens_seen": 9570656, + "step": 7860 + }, + { + "epoch": 0.875932731930059, + "grad_norm": 0.4889470338821411, + "learning_rate": 2.189553402383339e-05, + "loss": 0.2187, + "num_input_tokens_seen": 9576544, + "step": 7865 + }, + { + "epoch": 0.8764895868136764, + "grad_norm": 0.7815585732460022, + "learning_rate": 2.190945539592382e-05, + "loss": 0.1388, + "num_input_tokens_seen": 9582784, + "step": 7870 + }, + { + "epoch": 0.8770464416972937, + "grad_norm": 0.0629911869764328, + "learning_rate": 2.1923376768014255e-05, + "loss": 0.192, + "num_input_tokens_seen": 9589440, + "step": 7875 + }, + { + "epoch": 0.877603296580911, + "grad_norm": 2.4574549198150635, + "learning_rate": 2.193729814010469e-05, + "loss": 0.2669, + "num_input_tokens_seen": 9595552, + "step": 7880 + }, + { + "epoch": 0.8781601514645283, + "grad_norm": 0.720374584197998, + "learning_rate": 2.1951219512195124e-05, + "loss": 0.329, + "num_input_tokens_seen": 9601792, + "step": 7885 + }, + { + "epoch": 0.8787170063481456, + "grad_norm": 1.6838324069976807, + "learning_rate": 2.1965140884285558e-05, + "loss": 0.1253, + "num_input_tokens_seen": 9607296, + "step": 7890 + }, + { + "epoch": 0.879273861231763, + "grad_norm": 0.3453942537307739, + "learning_rate": 2.197906225637599e-05, + "loss": 0.1562, + "num_input_tokens_seen": 9613504, + "step": 7895 + }, + { + "epoch": 0.8798307161153803, + "grad_norm": 0.05907998979091644, + "learning_rate": 2.1992983628466423e-05, + "loss": 0.1098, + "num_input_tokens_seen": 9619936, + "step": 7900 + }, + { + "epoch": 0.8803875709989977, + "grad_norm": 1.0522615909576416, + "learning_rate": 2.2006905000556858e-05, + "loss": 0.1721, + "num_input_tokens_seen": 9626080, + "step": 7905 + }, + { + "epoch": 0.880944425882615, + "grad_norm": 0.727945864200592, + "learning_rate": 2.202082637264729e-05, + "loss": 0.2215, + "num_input_tokens_seen": 9632352, + "step": 7910 + }, + { + "epoch": 0.8815012807662324, + "grad_norm": 1.7107629776000977, + "learning_rate": 2.2034747744737723e-05, + "loss": 0.2808, + "num_input_tokens_seen": 9638240, + "step": 7915 + }, + { + "epoch": 0.8820581356498497, + "grad_norm": 0.7061022520065308, + "learning_rate": 2.2048669116828154e-05, + "loss": 0.2137, + "num_input_tokens_seen": 9644640, + "step": 7920 + }, + { + "epoch": 0.8826149905334669, + "grad_norm": 0.6940363049507141, + "learning_rate": 2.2062590488918588e-05, + "loss": 0.2544, + "num_input_tokens_seen": 9650720, + "step": 7925 + }, + { + "epoch": 0.8831718454170843, + "grad_norm": 0.8593979477882385, + "learning_rate": 2.2076511861009023e-05, + "loss": 0.0984, + "num_input_tokens_seen": 9656544, + "step": 7930 + }, + { + "epoch": 0.8837287003007016, + "grad_norm": 0.7459100484848022, + "learning_rate": 2.2090433233099457e-05, + "loss": 0.2044, + "num_input_tokens_seen": 9662752, + "step": 7935 + }, + { + "epoch": 0.884285555184319, + "grad_norm": 1.3836334943771362, + "learning_rate": 2.2104354605189888e-05, + "loss": 0.2288, + "num_input_tokens_seen": 9668320, + "step": 7940 + }, + { + "epoch": 0.8848424100679363, + "grad_norm": 0.6567296981811523, + "learning_rate": 2.2118275977280322e-05, + "loss": 0.0991, + "num_input_tokens_seen": 9674432, + "step": 7945 + }, + { + "epoch": 0.8853992649515536, + "grad_norm": 0.8033948540687561, + "learning_rate": 2.2132197349370757e-05, + "loss": 0.1489, + "num_input_tokens_seen": 9680640, + "step": 7950 + }, + { + "epoch": 0.885956119835171, + "grad_norm": 0.5974475145339966, + "learning_rate": 2.2146118721461187e-05, + "loss": 0.1078, + "num_input_tokens_seen": 9686880, + "step": 7955 + }, + { + "epoch": 0.8865129747187883, + "grad_norm": 0.30192437767982483, + "learning_rate": 2.2160040093551622e-05, + "loss": 0.0899, + "num_input_tokens_seen": 9692928, + "step": 7960 + }, + { + "epoch": 0.8870698296024057, + "grad_norm": 0.43105608224868774, + "learning_rate": 2.2173961465642053e-05, + "loss": 0.0996, + "num_input_tokens_seen": 9698944, + "step": 7965 + }, + { + "epoch": 0.8876266844860229, + "grad_norm": 1.4070560932159424, + "learning_rate": 2.2187882837732487e-05, + "loss": 0.2382, + "num_input_tokens_seen": 9705088, + "step": 7970 + }, + { + "epoch": 0.8881835393696402, + "grad_norm": 0.21332581341266632, + "learning_rate": 2.220180420982292e-05, + "loss": 0.1053, + "num_input_tokens_seen": 9711072, + "step": 7975 + }, + { + "epoch": 0.8887403942532576, + "grad_norm": 1.5854002237319946, + "learning_rate": 2.2215725581913356e-05, + "loss": 0.1143, + "num_input_tokens_seen": 9717408, + "step": 7980 + }, + { + "epoch": 0.8892972491368749, + "grad_norm": 0.6030420660972595, + "learning_rate": 2.2229646954003787e-05, + "loss": 0.1792, + "num_input_tokens_seen": 9723584, + "step": 7985 + }, + { + "epoch": 0.8898541040204923, + "grad_norm": 1.2949268817901611, + "learning_rate": 2.224356832609422e-05, + "loss": 0.2341, + "num_input_tokens_seen": 9729760, + "step": 7990 + }, + { + "epoch": 0.8904109589041096, + "grad_norm": 0.6556268930435181, + "learning_rate": 2.2257489698184655e-05, + "loss": 0.1784, + "num_input_tokens_seen": 9735776, + "step": 7995 + }, + { + "epoch": 0.8909678137877269, + "grad_norm": 0.7159271240234375, + "learning_rate": 2.2271411070275086e-05, + "loss": 0.0901, + "num_input_tokens_seen": 9741824, + "step": 8000 + }, + { + "epoch": 0.8915246686713443, + "grad_norm": 0.9075763821601868, + "learning_rate": 2.228533244236552e-05, + "loss": 0.0689, + "num_input_tokens_seen": 9747392, + "step": 8005 + }, + { + "epoch": 0.8920815235549616, + "grad_norm": 1.2828056812286377, + "learning_rate": 2.229925381445595e-05, + "loss": 0.198, + "num_input_tokens_seen": 9753696, + "step": 8010 + }, + { + "epoch": 0.892638378438579, + "grad_norm": 1.1982547044754028, + "learning_rate": 2.2313175186546386e-05, + "loss": 0.1949, + "num_input_tokens_seen": 9759904, + "step": 8015 + }, + { + "epoch": 0.8931952333221962, + "grad_norm": 0.7006415128707886, + "learning_rate": 2.232709655863682e-05, + "loss": 0.156, + "num_input_tokens_seen": 9765984, + "step": 8020 + }, + { + "epoch": 0.8937520882058135, + "grad_norm": 1.6612350940704346, + "learning_rate": 2.2341017930727255e-05, + "loss": 0.2565, + "num_input_tokens_seen": 9772256, + "step": 8025 + }, + { + "epoch": 0.8943089430894309, + "grad_norm": 1.7939879894256592, + "learning_rate": 2.235493930281769e-05, + "loss": 0.2046, + "num_input_tokens_seen": 9778144, + "step": 8030 + }, + { + "epoch": 0.8948657979730482, + "grad_norm": 0.16140617430210114, + "learning_rate": 2.236886067490812e-05, + "loss": 0.1476, + "num_input_tokens_seen": 9784704, + "step": 8035 + }, + { + "epoch": 0.8954226528566656, + "grad_norm": 2.013782024383545, + "learning_rate": 2.2382782046998554e-05, + "loss": 0.1403, + "num_input_tokens_seen": 9790912, + "step": 8040 + }, + { + "epoch": 0.8959795077402829, + "grad_norm": 0.6743290424346924, + "learning_rate": 2.2396703419088985e-05, + "loss": 0.2259, + "num_input_tokens_seen": 9796928, + "step": 8045 + }, + { + "epoch": 0.8965363626239002, + "grad_norm": 0.6324414014816284, + "learning_rate": 2.241062479117942e-05, + "loss": 0.3391, + "num_input_tokens_seen": 9803008, + "step": 8050 + }, + { + "epoch": 0.8970932175075176, + "grad_norm": 0.36676767468452454, + "learning_rate": 2.2424546163269854e-05, + "loss": 0.182, + "num_input_tokens_seen": 9809280, + "step": 8055 + }, + { + "epoch": 0.8976500723911349, + "grad_norm": 0.4803946614265442, + "learning_rate": 2.2438467535360285e-05, + "loss": 0.1692, + "num_input_tokens_seen": 9815008, + "step": 8060 + }, + { + "epoch": 0.8982069272747522, + "grad_norm": 1.450804352760315, + "learning_rate": 2.245238890745072e-05, + "loss": 0.1405, + "num_input_tokens_seen": 9821120, + "step": 8065 + }, + { + "epoch": 0.8987637821583695, + "grad_norm": 0.2632442116737366, + "learning_rate": 2.2466310279541153e-05, + "loss": 0.1773, + "num_input_tokens_seen": 9827232, + "step": 8070 + }, + { + "epoch": 0.8993206370419868, + "grad_norm": 1.3496325016021729, + "learning_rate": 2.2480231651631588e-05, + "loss": 0.2858, + "num_input_tokens_seen": 9833376, + "step": 8075 + }, + { + "epoch": 0.8998774919256042, + "grad_norm": 0.6851080060005188, + "learning_rate": 2.249415302372202e-05, + "loss": 0.0998, + "num_input_tokens_seen": 9839936, + "step": 8080 + }, + { + "epoch": 0.9004343468092215, + "grad_norm": 1.1366920471191406, + "learning_rate": 2.2508074395812453e-05, + "loss": 0.2102, + "num_input_tokens_seen": 9846496, + "step": 8085 + }, + { + "epoch": 0.9009912016928389, + "grad_norm": 1.6473946571350098, + "learning_rate": 2.2521995767902884e-05, + "loss": 0.1513, + "num_input_tokens_seen": 9852736, + "step": 8090 + }, + { + "epoch": 0.9015480565764562, + "grad_norm": 0.10565096884965897, + "learning_rate": 2.2535917139993318e-05, + "loss": 0.1608, + "num_input_tokens_seen": 9859456, + "step": 8095 + }, + { + "epoch": 0.9021049114600735, + "grad_norm": 1.0236647129058838, + "learning_rate": 2.2549838512083753e-05, + "loss": 0.212, + "num_input_tokens_seen": 9865152, + "step": 8100 + }, + { + "epoch": 0.9026617663436909, + "grad_norm": 0.3443857729434967, + "learning_rate": 2.2563759884174183e-05, + "loss": 0.0806, + "num_input_tokens_seen": 9871200, + "step": 8105 + }, + { + "epoch": 0.9032186212273081, + "grad_norm": 0.8990414142608643, + "learning_rate": 2.2577681256264618e-05, + "loss": 0.119, + "num_input_tokens_seen": 9877664, + "step": 8110 + }, + { + "epoch": 0.9037754761109255, + "grad_norm": 1.2460358142852783, + "learning_rate": 2.2591602628355052e-05, + "loss": 0.1636, + "num_input_tokens_seen": 9883936, + "step": 8115 + }, + { + "epoch": 0.9043323309945428, + "grad_norm": 1.9398859739303589, + "learning_rate": 2.2605524000445486e-05, + "loss": 0.1461, + "num_input_tokens_seen": 9890144, + "step": 8120 + }, + { + "epoch": 0.9048891858781601, + "grad_norm": 0.4641742706298828, + "learning_rate": 2.2619445372535917e-05, + "loss": 0.1114, + "num_input_tokens_seen": 9895968, + "step": 8125 + }, + { + "epoch": 0.9054460407617775, + "grad_norm": 0.44261786341667175, + "learning_rate": 2.2633366744626352e-05, + "loss": 0.2894, + "num_input_tokens_seen": 9901984, + "step": 8130 + }, + { + "epoch": 0.9060028956453948, + "grad_norm": 1.3714303970336914, + "learning_rate": 2.2647288116716786e-05, + "loss": 0.1387, + "num_input_tokens_seen": 9908128, + "step": 8135 + }, + { + "epoch": 0.9065597505290122, + "grad_norm": 0.29496294260025024, + "learning_rate": 2.2661209488807217e-05, + "loss": 0.1479, + "num_input_tokens_seen": 9914208, + "step": 8140 + }, + { + "epoch": 0.9071166054126295, + "grad_norm": 0.6359155774116516, + "learning_rate": 2.267513086089765e-05, + "loss": 0.2305, + "num_input_tokens_seen": 9920480, + "step": 8145 + }, + { + "epoch": 0.9076734602962468, + "grad_norm": 2.2752232551574707, + "learning_rate": 2.2689052232988082e-05, + "loss": 0.246, + "num_input_tokens_seen": 9926816, + "step": 8150 + }, + { + "epoch": 0.9082303151798641, + "grad_norm": 0.9088738560676575, + "learning_rate": 2.2702973605078517e-05, + "loss": 0.2182, + "num_input_tokens_seen": 9933120, + "step": 8155 + }, + { + "epoch": 0.9087871700634814, + "grad_norm": 0.4220268130302429, + "learning_rate": 2.271689497716895e-05, + "loss": 0.1144, + "num_input_tokens_seen": 9939072, + "step": 8160 + }, + { + "epoch": 0.9093440249470988, + "grad_norm": 1.7907131910324097, + "learning_rate": 2.2730816349259385e-05, + "loss": 0.2408, + "num_input_tokens_seen": 9945024, + "step": 8165 + }, + { + "epoch": 0.9099008798307161, + "grad_norm": 1.151279330253601, + "learning_rate": 2.274473772134982e-05, + "loss": 0.1364, + "num_input_tokens_seen": 9951104, + "step": 8170 + }, + { + "epoch": 0.9104577347143334, + "grad_norm": 0.9938017725944519, + "learning_rate": 2.275865909344025e-05, + "loss": 0.2056, + "num_input_tokens_seen": 9957088, + "step": 8175 + }, + { + "epoch": 0.9110145895979508, + "grad_norm": 0.7608786821365356, + "learning_rate": 2.2772580465530685e-05, + "loss": 0.1714, + "num_input_tokens_seen": 9963072, + "step": 8180 + }, + { + "epoch": 0.9115714444815681, + "grad_norm": 0.7695382833480835, + "learning_rate": 2.2786501837621116e-05, + "loss": 0.3199, + "num_input_tokens_seen": 9969280, + "step": 8185 + }, + { + "epoch": 0.9121282993651855, + "grad_norm": 1.2750340700149536, + "learning_rate": 2.280042320971155e-05, + "loss": 0.2734, + "num_input_tokens_seen": 9975296, + "step": 8190 + }, + { + "epoch": 0.9126851542488028, + "grad_norm": 4.313830852508545, + "learning_rate": 2.2814344581801984e-05, + "loss": 0.2752, + "num_input_tokens_seen": 9981376, + "step": 8195 + }, + { + "epoch": 0.91324200913242, + "grad_norm": 0.6373096108436584, + "learning_rate": 2.2828265953892415e-05, + "loss": 0.1705, + "num_input_tokens_seen": 9987392, + "step": 8200 + }, + { + "epoch": 0.9137988640160374, + "grad_norm": 1.304025650024414, + "learning_rate": 2.284218732598285e-05, + "loss": 0.214, + "num_input_tokens_seen": 9993344, + "step": 8205 + }, + { + "epoch": 0.9143557188996547, + "grad_norm": 0.9494308233261108, + "learning_rate": 2.2856108698073284e-05, + "loss": 0.1333, + "num_input_tokens_seen": 9999520, + "step": 8210 + }, + { + "epoch": 0.9149125737832721, + "grad_norm": 0.16276344656944275, + "learning_rate": 2.287003007016372e-05, + "loss": 0.058, + "num_input_tokens_seen": 10005376, + "step": 8215 + }, + { + "epoch": 0.9154694286668894, + "grad_norm": 1.4064751863479614, + "learning_rate": 2.288395144225415e-05, + "loss": 0.0832, + "num_input_tokens_seen": 10011424, + "step": 8220 + }, + { + "epoch": 0.9160262835505067, + "grad_norm": 0.6294333338737488, + "learning_rate": 2.2897872814344584e-05, + "loss": 0.124, + "num_input_tokens_seen": 10017600, + "step": 8225 + }, + { + "epoch": 0.9165831384341241, + "grad_norm": 1.919399380683899, + "learning_rate": 2.2911794186435015e-05, + "loss": 0.2088, + "num_input_tokens_seen": 10023264, + "step": 8230 + }, + { + "epoch": 0.9171399933177414, + "grad_norm": 1.265061616897583, + "learning_rate": 2.292571555852545e-05, + "loss": 0.1766, + "num_input_tokens_seen": 10029312, + "step": 8235 + }, + { + "epoch": 0.9176968482013588, + "grad_norm": 0.3929349482059479, + "learning_rate": 2.2939636930615883e-05, + "loss": 0.0494, + "num_input_tokens_seen": 10035776, + "step": 8240 + }, + { + "epoch": 0.918253703084976, + "grad_norm": 0.8011038303375244, + "learning_rate": 2.2953558302706314e-05, + "loss": 0.1329, + "num_input_tokens_seen": 10041632, + "step": 8245 + }, + { + "epoch": 0.9188105579685933, + "grad_norm": 0.03189976140856743, + "learning_rate": 2.296747967479675e-05, + "loss": 0.1464, + "num_input_tokens_seen": 10047584, + "step": 8250 + }, + { + "epoch": 0.9193674128522107, + "grad_norm": 0.5972334146499634, + "learning_rate": 2.2981401046887183e-05, + "loss": 0.0983, + "num_input_tokens_seen": 10053760, + "step": 8255 + }, + { + "epoch": 0.919924267735828, + "grad_norm": 0.0397288054227829, + "learning_rate": 2.2995322418977617e-05, + "loss": 0.1087, + "num_input_tokens_seen": 10059680, + "step": 8260 + }, + { + "epoch": 0.9204811226194454, + "grad_norm": 1.0670257806777954, + "learning_rate": 2.3009243791068048e-05, + "loss": 0.2489, + "num_input_tokens_seen": 10065760, + "step": 8265 + }, + { + "epoch": 0.9210379775030627, + "grad_norm": 1.6118217706680298, + "learning_rate": 2.3023165163158482e-05, + "loss": 0.2122, + "num_input_tokens_seen": 10071584, + "step": 8270 + }, + { + "epoch": 0.92159483238668, + "grad_norm": 0.9269004464149475, + "learning_rate": 2.3037086535248913e-05, + "loss": 0.0768, + "num_input_tokens_seen": 10077664, + "step": 8275 + }, + { + "epoch": 0.9221516872702974, + "grad_norm": 2.189117193222046, + "learning_rate": 2.3051007907339348e-05, + "loss": 0.2047, + "num_input_tokens_seen": 10084192, + "step": 8280 + }, + { + "epoch": 0.9227085421539147, + "grad_norm": 0.6589037775993347, + "learning_rate": 2.3064929279429782e-05, + "loss": 0.1375, + "num_input_tokens_seen": 10090336, + "step": 8285 + }, + { + "epoch": 0.923265397037532, + "grad_norm": 0.3830779194831848, + "learning_rate": 2.3078850651520213e-05, + "loss": 0.2865, + "num_input_tokens_seen": 10096640, + "step": 8290 + }, + { + "epoch": 0.9238222519211493, + "grad_norm": 1.2747631072998047, + "learning_rate": 2.3092772023610647e-05, + "loss": 0.1754, + "num_input_tokens_seen": 10102528, + "step": 8295 + }, + { + "epoch": 0.9243791068047666, + "grad_norm": 1.3444747924804688, + "learning_rate": 2.310669339570108e-05, + "loss": 0.168, + "num_input_tokens_seen": 10108416, + "step": 8300 + }, + { + "epoch": 0.924935961688384, + "grad_norm": 1.4385733604431152, + "learning_rate": 2.3120614767791516e-05, + "loss": 0.0849, + "num_input_tokens_seen": 10114624, + "step": 8305 + }, + { + "epoch": 0.9254928165720013, + "grad_norm": 0.25052863359451294, + "learning_rate": 2.313453613988195e-05, + "loss": 0.1013, + "num_input_tokens_seen": 10120640, + "step": 8310 + }, + { + "epoch": 0.9260496714556187, + "grad_norm": 0.10994714498519897, + "learning_rate": 2.314845751197238e-05, + "loss": 0.0621, + "num_input_tokens_seen": 10126976, + "step": 8315 + }, + { + "epoch": 0.926606526339236, + "grad_norm": 0.9303894639015198, + "learning_rate": 2.3162378884062812e-05, + "loss": 0.1465, + "num_input_tokens_seen": 10133248, + "step": 8320 + }, + { + "epoch": 0.9271633812228534, + "grad_norm": 0.3109555244445801, + "learning_rate": 2.3176300256153246e-05, + "loss": 0.1929, + "num_input_tokens_seen": 10139584, + "step": 8325 + }, + { + "epoch": 0.9277202361064707, + "grad_norm": 0.2972775101661682, + "learning_rate": 2.319022162824368e-05, + "loss": 0.0477, + "num_input_tokens_seen": 10145376, + "step": 8330 + }, + { + "epoch": 0.928277090990088, + "grad_norm": 0.7648248076438904, + "learning_rate": 2.3204143000334115e-05, + "loss": 0.1576, + "num_input_tokens_seen": 10151744, + "step": 8335 + }, + { + "epoch": 0.9288339458737053, + "grad_norm": 0.16619297862052917, + "learning_rate": 2.3218064372424546e-05, + "loss": 0.1795, + "num_input_tokens_seen": 10157664, + "step": 8340 + }, + { + "epoch": 0.9293908007573226, + "grad_norm": 0.951530396938324, + "learning_rate": 2.323198574451498e-05, + "loss": 0.1167, + "num_input_tokens_seen": 10163712, + "step": 8345 + }, + { + "epoch": 0.92994765564094, + "grad_norm": 0.49546584486961365, + "learning_rate": 2.3245907116605415e-05, + "loss": 0.0944, + "num_input_tokens_seen": 10169728, + "step": 8350 + }, + { + "epoch": 0.9305045105245573, + "grad_norm": 2.576122283935547, + "learning_rate": 2.325982848869585e-05, + "loss": 0.3045, + "num_input_tokens_seen": 10175680, + "step": 8355 + }, + { + "epoch": 0.9310613654081746, + "grad_norm": 1.5585230588912964, + "learning_rate": 2.327374986078628e-05, + "loss": 0.1002, + "num_input_tokens_seen": 10181728, + "step": 8360 + }, + { + "epoch": 0.931618220291792, + "grad_norm": 0.7140898108482361, + "learning_rate": 2.328767123287671e-05, + "loss": 0.1505, + "num_input_tokens_seen": 10187808, + "step": 8365 + }, + { + "epoch": 0.9321750751754093, + "grad_norm": 3.2055745124816895, + "learning_rate": 2.3301592604967145e-05, + "loss": 0.2292, + "num_input_tokens_seen": 10193856, + "step": 8370 + }, + { + "epoch": 0.9327319300590267, + "grad_norm": 0.5237979292869568, + "learning_rate": 2.331551397705758e-05, + "loss": 0.2063, + "num_input_tokens_seen": 10199968, + "step": 8375 + }, + { + "epoch": 0.933288784942644, + "grad_norm": 1.1641942262649536, + "learning_rate": 2.3329435349148014e-05, + "loss": 0.1524, + "num_input_tokens_seen": 10205600, + "step": 8380 + }, + { + "epoch": 0.9338456398262612, + "grad_norm": 1.1524468660354614, + "learning_rate": 2.3343356721238445e-05, + "loss": 0.1909, + "num_input_tokens_seen": 10211680, + "step": 8385 + }, + { + "epoch": 0.9344024947098786, + "grad_norm": 1.3409428596496582, + "learning_rate": 2.335727809332888e-05, + "loss": 0.2277, + "num_input_tokens_seen": 10218048, + "step": 8390 + }, + { + "epoch": 0.9349593495934959, + "grad_norm": 0.3613893687725067, + "learning_rate": 2.3371199465419314e-05, + "loss": 0.1153, + "num_input_tokens_seen": 10224160, + "step": 8395 + }, + { + "epoch": 0.9355162044771133, + "grad_norm": 0.6367754340171814, + "learning_rate": 2.3385120837509748e-05, + "loss": 0.0776, + "num_input_tokens_seen": 10230016, + "step": 8400 + }, + { + "epoch": 0.9360730593607306, + "grad_norm": 0.4508684277534485, + "learning_rate": 2.339904220960018e-05, + "loss": 0.0795, + "num_input_tokens_seen": 10236128, + "step": 8405 + }, + { + "epoch": 0.9366299142443479, + "grad_norm": 2.1901888847351074, + "learning_rate": 2.341296358169061e-05, + "loss": 0.1114, + "num_input_tokens_seen": 10242304, + "step": 8410 + }, + { + "epoch": 0.9371867691279653, + "grad_norm": 0.19242900609970093, + "learning_rate": 2.3426884953781044e-05, + "loss": 0.2902, + "num_input_tokens_seen": 10248096, + "step": 8415 + }, + { + "epoch": 0.9377436240115826, + "grad_norm": 1.1955746412277222, + "learning_rate": 2.344080632587148e-05, + "loss": 0.2129, + "num_input_tokens_seen": 10253792, + "step": 8420 + }, + { + "epoch": 0.9383004788952, + "grad_norm": 0.8384407758712769, + "learning_rate": 2.3454727697961913e-05, + "loss": 0.1146, + "num_input_tokens_seen": 10259936, + "step": 8425 + }, + { + "epoch": 0.9388573337788172, + "grad_norm": 2.1649227142333984, + "learning_rate": 2.3468649070052347e-05, + "loss": 0.3063, + "num_input_tokens_seen": 10265408, + "step": 8430 + }, + { + "epoch": 0.9394141886624345, + "grad_norm": 0.2804138958454132, + "learning_rate": 2.3482570442142778e-05, + "loss": 0.117, + "num_input_tokens_seen": 10271840, + "step": 8435 + }, + { + "epoch": 0.9399710435460519, + "grad_norm": 1.1046457290649414, + "learning_rate": 2.3496491814233212e-05, + "loss": 0.3051, + "num_input_tokens_seen": 10277664, + "step": 8440 + }, + { + "epoch": 0.9405278984296692, + "grad_norm": 1.2376418113708496, + "learning_rate": 2.3510413186323647e-05, + "loss": 0.1729, + "num_input_tokens_seen": 10284000, + "step": 8445 + }, + { + "epoch": 0.9410847533132866, + "grad_norm": 0.9857825636863708, + "learning_rate": 2.352433455841408e-05, + "loss": 0.1878, + "num_input_tokens_seen": 10290208, + "step": 8450 + }, + { + "epoch": 0.9416416081969039, + "grad_norm": 1.441224217414856, + "learning_rate": 2.3538255930504512e-05, + "loss": 0.1655, + "num_input_tokens_seen": 10296320, + "step": 8455 + }, + { + "epoch": 0.9421984630805212, + "grad_norm": 1.3226975202560425, + "learning_rate": 2.3552177302594943e-05, + "loss": 0.1338, + "num_input_tokens_seen": 10302688, + "step": 8460 + }, + { + "epoch": 0.9427553179641386, + "grad_norm": 0.8624377250671387, + "learning_rate": 2.3566098674685377e-05, + "loss": 0.144, + "num_input_tokens_seen": 10308928, + "step": 8465 + }, + { + "epoch": 0.9433121728477559, + "grad_norm": 0.9649918675422668, + "learning_rate": 2.358002004677581e-05, + "loss": 0.1239, + "num_input_tokens_seen": 10315008, + "step": 8470 + }, + { + "epoch": 0.9438690277313732, + "grad_norm": 2.933828830718994, + "learning_rate": 2.3593941418866246e-05, + "loss": 0.2772, + "num_input_tokens_seen": 10320384, + "step": 8475 + }, + { + "epoch": 0.9444258826149905, + "grad_norm": 0.39034590125083923, + "learning_rate": 2.3607862790956677e-05, + "loss": 0.2217, + "num_input_tokens_seen": 10326464, + "step": 8480 + }, + { + "epoch": 0.9449827374986078, + "grad_norm": 0.8845910429954529, + "learning_rate": 2.362178416304711e-05, + "loss": 0.0868, + "num_input_tokens_seen": 10332640, + "step": 8485 + }, + { + "epoch": 0.9455395923822252, + "grad_norm": 1.062172770500183, + "learning_rate": 2.3635705535137545e-05, + "loss": 0.1422, + "num_input_tokens_seen": 10338560, + "step": 8490 + }, + { + "epoch": 0.9460964472658425, + "grad_norm": 1.1207096576690674, + "learning_rate": 2.364962690722798e-05, + "loss": 0.0788, + "num_input_tokens_seen": 10345088, + "step": 8495 + }, + { + "epoch": 0.9466533021494599, + "grad_norm": 0.4846084713935852, + "learning_rate": 2.366354827931841e-05, + "loss": 0.0743, + "num_input_tokens_seen": 10351296, + "step": 8500 + }, + { + "epoch": 0.9472101570330772, + "grad_norm": 2.242342233657837, + "learning_rate": 2.367746965140884e-05, + "loss": 0.5171, + "num_input_tokens_seen": 10357248, + "step": 8505 + }, + { + "epoch": 0.9477670119166945, + "grad_norm": 0.17556002736091614, + "learning_rate": 2.3691391023499276e-05, + "loss": 0.0495, + "num_input_tokens_seen": 10363488, + "step": 8510 + }, + { + "epoch": 0.9483238668003119, + "grad_norm": 0.7312512397766113, + "learning_rate": 2.370531239558971e-05, + "loss": 0.1179, + "num_input_tokens_seen": 10369440, + "step": 8515 + }, + { + "epoch": 0.9488807216839291, + "grad_norm": 1.1463385820388794, + "learning_rate": 2.3719233767680145e-05, + "loss": 0.1628, + "num_input_tokens_seen": 10375648, + "step": 8520 + }, + { + "epoch": 0.9494375765675465, + "grad_norm": 1.0961167812347412, + "learning_rate": 2.3733155139770576e-05, + "loss": 0.1765, + "num_input_tokens_seen": 10382144, + "step": 8525 + }, + { + "epoch": 0.9499944314511638, + "grad_norm": 1.1347336769104004, + "learning_rate": 2.374707651186101e-05, + "loss": 0.1486, + "num_input_tokens_seen": 10388480, + "step": 8530 + }, + { + "epoch": 0.9505512863347811, + "grad_norm": 1.1748464107513428, + "learning_rate": 2.3760997883951444e-05, + "loss": 0.0948, + "num_input_tokens_seen": 10394624, + "step": 8535 + }, + { + "epoch": 0.9511081412183985, + "grad_norm": 0.6270872354507446, + "learning_rate": 2.377491925604188e-05, + "loss": 0.1501, + "num_input_tokens_seen": 10401152, + "step": 8540 + }, + { + "epoch": 0.9516649961020158, + "grad_norm": 0.6761946678161621, + "learning_rate": 2.378884062813231e-05, + "loss": 0.2319, + "num_input_tokens_seen": 10407200, + "step": 8545 + }, + { + "epoch": 0.9522218509856332, + "grad_norm": 0.774450421333313, + "learning_rate": 2.380276200022274e-05, + "loss": 0.1872, + "num_input_tokens_seen": 10412960, + "step": 8550 + }, + { + "epoch": 0.9527787058692505, + "grad_norm": 0.5313449501991272, + "learning_rate": 2.3816683372313175e-05, + "loss": 0.1685, + "num_input_tokens_seen": 10419424, + "step": 8555 + }, + { + "epoch": 0.9533355607528678, + "grad_norm": 0.23928558826446533, + "learning_rate": 2.383060474440361e-05, + "loss": 0.2944, + "num_input_tokens_seen": 10425888, + "step": 8560 + }, + { + "epoch": 0.9538924156364851, + "grad_norm": 1.7700450420379639, + "learning_rate": 2.3844526116494043e-05, + "loss": 0.2067, + "num_input_tokens_seen": 10432224, + "step": 8565 + }, + { + "epoch": 0.9544492705201024, + "grad_norm": 1.1981126070022583, + "learning_rate": 2.3858447488584478e-05, + "loss": 0.1899, + "num_input_tokens_seen": 10438208, + "step": 8570 + }, + { + "epoch": 0.9550061254037198, + "grad_norm": 0.6170264482498169, + "learning_rate": 2.387236886067491e-05, + "loss": 0.0898, + "num_input_tokens_seen": 10444352, + "step": 8575 + }, + { + "epoch": 0.9555629802873371, + "grad_norm": 0.28980275988578796, + "learning_rate": 2.3886290232765343e-05, + "loss": 0.1421, + "num_input_tokens_seen": 10450528, + "step": 8580 + }, + { + "epoch": 0.9561198351709544, + "grad_norm": 0.4616442918777466, + "learning_rate": 2.3900211604855777e-05, + "loss": 0.1847, + "num_input_tokens_seen": 10456864, + "step": 8585 + }, + { + "epoch": 0.9566766900545718, + "grad_norm": 0.8817617297172546, + "learning_rate": 2.391413297694621e-05, + "loss": 0.2051, + "num_input_tokens_seen": 10462880, + "step": 8590 + }, + { + "epoch": 0.9572335449381891, + "grad_norm": 1.3197821378707886, + "learning_rate": 2.3928054349036643e-05, + "loss": 0.1508, + "num_input_tokens_seen": 10468672, + "step": 8595 + }, + { + "epoch": 0.9577903998218065, + "grad_norm": 1.1255351305007935, + "learning_rate": 2.3941975721127074e-05, + "loss": 0.1962, + "num_input_tokens_seen": 10474784, + "step": 8600 + }, + { + "epoch": 0.9583472547054238, + "grad_norm": 0.10883361846208572, + "learning_rate": 2.3955897093217508e-05, + "loss": 0.0402, + "num_input_tokens_seen": 10480960, + "step": 8605 + }, + { + "epoch": 0.958904109589041, + "grad_norm": 1.1658918857574463, + "learning_rate": 2.3969818465307942e-05, + "loss": 0.1439, + "num_input_tokens_seen": 10487360, + "step": 8610 + }, + { + "epoch": 0.9594609644726584, + "grad_norm": 0.40267109870910645, + "learning_rate": 2.3983739837398377e-05, + "loss": 0.0839, + "num_input_tokens_seen": 10493472, + "step": 8615 + }, + { + "epoch": 0.9600178193562757, + "grad_norm": 0.5633034706115723, + "learning_rate": 2.3997661209488807e-05, + "loss": 0.196, + "num_input_tokens_seen": 10499424, + "step": 8620 + }, + { + "epoch": 0.9605746742398931, + "grad_norm": 0.5549514293670654, + "learning_rate": 2.4011582581579242e-05, + "loss": 0.1261, + "num_input_tokens_seen": 10505536, + "step": 8625 + }, + { + "epoch": 0.9611315291235104, + "grad_norm": 0.5388513207435608, + "learning_rate": 2.4025503953669676e-05, + "loss": 0.1285, + "num_input_tokens_seen": 10511616, + "step": 8630 + }, + { + "epoch": 0.9616883840071278, + "grad_norm": 1.0615785121917725, + "learning_rate": 2.403942532576011e-05, + "loss": 0.1094, + "num_input_tokens_seen": 10517344, + "step": 8635 + }, + { + "epoch": 0.9622452388907451, + "grad_norm": 1.3653498888015747, + "learning_rate": 2.405334669785054e-05, + "loss": 0.2096, + "num_input_tokens_seen": 10523488, + "step": 8640 + }, + { + "epoch": 0.9628020937743624, + "grad_norm": 0.07657527923583984, + "learning_rate": 2.4067268069940972e-05, + "loss": 0.0931, + "num_input_tokens_seen": 10529568, + "step": 8645 + }, + { + "epoch": 0.9633589486579798, + "grad_norm": 1.2785550355911255, + "learning_rate": 2.4081189442031407e-05, + "loss": 0.2176, + "num_input_tokens_seen": 10536160, + "step": 8650 + }, + { + "epoch": 0.963915803541597, + "grad_norm": 1.2264025211334229, + "learning_rate": 2.409511081412184e-05, + "loss": 0.1623, + "num_input_tokens_seen": 10542080, + "step": 8655 + }, + { + "epoch": 0.9644726584252143, + "grad_norm": 0.21297647058963776, + "learning_rate": 2.4109032186212275e-05, + "loss": 0.0764, + "num_input_tokens_seen": 10547776, + "step": 8660 + }, + { + "epoch": 0.9650295133088317, + "grad_norm": 1.3206044435501099, + "learning_rate": 2.4122953558302706e-05, + "loss": 0.1692, + "num_input_tokens_seen": 10553888, + "step": 8665 + }, + { + "epoch": 0.965586368192449, + "grad_norm": 1.5392431020736694, + "learning_rate": 2.413687493039314e-05, + "loss": 0.2313, + "num_input_tokens_seen": 10560000, + "step": 8670 + }, + { + "epoch": 0.9661432230760664, + "grad_norm": 0.7528294324874878, + "learning_rate": 2.4150796302483575e-05, + "loss": 0.1382, + "num_input_tokens_seen": 10566560, + "step": 8675 + }, + { + "epoch": 0.9667000779596837, + "grad_norm": 0.3061215281486511, + "learning_rate": 2.416471767457401e-05, + "loss": 0.1173, + "num_input_tokens_seen": 10572896, + "step": 8680 + }, + { + "epoch": 0.967256932843301, + "grad_norm": 1.826363205909729, + "learning_rate": 2.417863904666444e-05, + "loss": 0.2627, + "num_input_tokens_seen": 10579264, + "step": 8685 + }, + { + "epoch": 0.9678137877269184, + "grad_norm": 0.40458157658576965, + "learning_rate": 2.419256041875487e-05, + "loss": 0.1868, + "num_input_tokens_seen": 10584928, + "step": 8690 + }, + { + "epoch": 0.9683706426105357, + "grad_norm": 1.573800802230835, + "learning_rate": 2.4206481790845305e-05, + "loss": 0.1949, + "num_input_tokens_seen": 10591104, + "step": 8695 + }, + { + "epoch": 0.9689274974941531, + "grad_norm": 2.5203309059143066, + "learning_rate": 2.422040316293574e-05, + "loss": 0.2754, + "num_input_tokens_seen": 10597152, + "step": 8700 + }, + { + "epoch": 0.9694843523777703, + "grad_norm": 0.6250830292701721, + "learning_rate": 2.4234324535026174e-05, + "loss": 0.1991, + "num_input_tokens_seen": 10603392, + "step": 8705 + }, + { + "epoch": 0.9700412072613877, + "grad_norm": 0.7574318051338196, + "learning_rate": 2.424824590711661e-05, + "loss": 0.1109, + "num_input_tokens_seen": 10608800, + "step": 8710 + }, + { + "epoch": 0.970598062145005, + "grad_norm": 0.829542338848114, + "learning_rate": 2.426216727920704e-05, + "loss": 0.1243, + "num_input_tokens_seen": 10615072, + "step": 8715 + }, + { + "epoch": 0.9711549170286223, + "grad_norm": 0.904918909072876, + "learning_rate": 2.4276088651297474e-05, + "loss": 0.0888, + "num_input_tokens_seen": 10620864, + "step": 8720 + }, + { + "epoch": 0.9717117719122397, + "grad_norm": 1.2968626022338867, + "learning_rate": 2.4290010023387908e-05, + "loss": 0.1699, + "num_input_tokens_seen": 10626336, + "step": 8725 + }, + { + "epoch": 0.972268626795857, + "grad_norm": 0.8835611939430237, + "learning_rate": 2.430393139547834e-05, + "loss": 0.2186, + "num_input_tokens_seen": 10632448, + "step": 8730 + }, + { + "epoch": 0.9728254816794744, + "grad_norm": 0.6569027304649353, + "learning_rate": 2.4317852767568773e-05, + "loss": 0.0874, + "num_input_tokens_seen": 10638624, + "step": 8735 + }, + { + "epoch": 0.9733823365630917, + "grad_norm": 0.5355091094970703, + "learning_rate": 2.4331774139659204e-05, + "loss": 0.1426, + "num_input_tokens_seen": 10644608, + "step": 8740 + }, + { + "epoch": 0.973939191446709, + "grad_norm": 0.35960671305656433, + "learning_rate": 2.434569551174964e-05, + "loss": 0.1238, + "num_input_tokens_seen": 10650368, + "step": 8745 + }, + { + "epoch": 0.9744960463303263, + "grad_norm": 0.827858567237854, + "learning_rate": 2.4359616883840073e-05, + "loss": 0.263, + "num_input_tokens_seen": 10656032, + "step": 8750 + }, + { + "epoch": 0.9750529012139436, + "grad_norm": 1.0317397117614746, + "learning_rate": 2.4373538255930507e-05, + "loss": 0.1984, + "num_input_tokens_seen": 10662144, + "step": 8755 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 1.2543905973434448, + "learning_rate": 2.4387459628020938e-05, + "loss": 0.2282, + "num_input_tokens_seen": 10668352, + "step": 8760 + }, + { + "epoch": 0.9761666109811783, + "grad_norm": 0.44250068068504333, + "learning_rate": 2.4401381000111373e-05, + "loss": 0.1833, + "num_input_tokens_seen": 10674080, + "step": 8765 + }, + { + "epoch": 0.9767234658647956, + "grad_norm": 2.513700485229492, + "learning_rate": 2.4415302372201807e-05, + "loss": 0.1231, + "num_input_tokens_seen": 10680320, + "step": 8770 + }, + { + "epoch": 0.977280320748413, + "grad_norm": 0.16809725761413574, + "learning_rate": 2.4429223744292238e-05, + "loss": 0.1322, + "num_input_tokens_seen": 10685824, + "step": 8775 + }, + { + "epoch": 0.9778371756320303, + "grad_norm": 0.6562129259109497, + "learning_rate": 2.4443145116382672e-05, + "loss": 0.1615, + "num_input_tokens_seen": 10691584, + "step": 8780 + }, + { + "epoch": 0.9783940305156477, + "grad_norm": 0.6020663380622864, + "learning_rate": 2.4457066488473103e-05, + "loss": 0.2132, + "num_input_tokens_seen": 10697312, + "step": 8785 + }, + { + "epoch": 0.978950885399265, + "grad_norm": 1.3104748725891113, + "learning_rate": 2.4470987860563537e-05, + "loss": 0.1839, + "num_input_tokens_seen": 10703328, + "step": 8790 + }, + { + "epoch": 0.9795077402828822, + "grad_norm": 0.39526212215423584, + "learning_rate": 2.4484909232653972e-05, + "loss": 0.2643, + "num_input_tokens_seen": 10709664, + "step": 8795 + }, + { + "epoch": 0.9800645951664996, + "grad_norm": 1.6515182256698608, + "learning_rate": 2.4498830604744406e-05, + "loss": 0.2051, + "num_input_tokens_seen": 10715648, + "step": 8800 + }, + { + "epoch": 0.9806214500501169, + "grad_norm": 0.9713642001152039, + "learning_rate": 2.4512751976834837e-05, + "loss": 0.1825, + "num_input_tokens_seen": 10721632, + "step": 8805 + }, + { + "epoch": 0.9811783049337343, + "grad_norm": 0.8762457966804504, + "learning_rate": 2.452667334892527e-05, + "loss": 0.207, + "num_input_tokens_seen": 10727008, + "step": 8810 + }, + { + "epoch": 0.9817351598173516, + "grad_norm": 1.7836925983428955, + "learning_rate": 2.4540594721015706e-05, + "loss": 0.2259, + "num_input_tokens_seen": 10732800, + "step": 8815 + }, + { + "epoch": 0.9822920147009689, + "grad_norm": 1.5149763822555542, + "learning_rate": 2.455451609310614e-05, + "loss": 0.2483, + "num_input_tokens_seen": 10738848, + "step": 8820 + }, + { + "epoch": 0.9828488695845863, + "grad_norm": 0.06746307015419006, + "learning_rate": 2.456843746519657e-05, + "loss": 0.0957, + "num_input_tokens_seen": 10745216, + "step": 8825 + }, + { + "epoch": 0.9834057244682036, + "grad_norm": 0.6856257915496826, + "learning_rate": 2.4582358837287002e-05, + "loss": 0.1159, + "num_input_tokens_seen": 10751552, + "step": 8830 + }, + { + "epoch": 0.983962579351821, + "grad_norm": 0.4914136230945587, + "learning_rate": 2.4596280209377436e-05, + "loss": 0.1194, + "num_input_tokens_seen": 10757888, + "step": 8835 + }, + { + "epoch": 0.9845194342354382, + "grad_norm": 0.6461710929870605, + "learning_rate": 2.461020158146787e-05, + "loss": 0.1758, + "num_input_tokens_seen": 10764128, + "step": 8840 + }, + { + "epoch": 0.9850762891190555, + "grad_norm": 0.21524550020694733, + "learning_rate": 2.4624122953558305e-05, + "loss": 0.0636, + "num_input_tokens_seen": 10770336, + "step": 8845 + }, + { + "epoch": 0.9856331440026729, + "grad_norm": 0.8558119535446167, + "learning_rate": 2.463804432564874e-05, + "loss": 0.1447, + "num_input_tokens_seen": 10776256, + "step": 8850 + }, + { + "epoch": 0.9861899988862902, + "grad_norm": 2.5642476081848145, + "learning_rate": 2.465196569773917e-05, + "loss": 0.2783, + "num_input_tokens_seen": 10782272, + "step": 8855 + }, + { + "epoch": 0.9867468537699076, + "grad_norm": 1.4949003458023071, + "learning_rate": 2.4665887069829604e-05, + "loss": 0.1361, + "num_input_tokens_seen": 10788256, + "step": 8860 + }, + { + "epoch": 0.9873037086535249, + "grad_norm": 0.43709370493888855, + "learning_rate": 2.467980844192004e-05, + "loss": 0.0791, + "num_input_tokens_seen": 10794496, + "step": 8865 + }, + { + "epoch": 0.9878605635371422, + "grad_norm": 1.0241940021514893, + "learning_rate": 2.469372981401047e-05, + "loss": 0.21, + "num_input_tokens_seen": 10800416, + "step": 8870 + }, + { + "epoch": 0.9884174184207596, + "grad_norm": 0.6512049436569214, + "learning_rate": 2.4707651186100904e-05, + "loss": 0.1939, + "num_input_tokens_seen": 10806496, + "step": 8875 + }, + { + "epoch": 0.9889742733043769, + "grad_norm": 0.5323039889335632, + "learning_rate": 2.4721572558191335e-05, + "loss": 0.2455, + "num_input_tokens_seen": 10812448, + "step": 8880 + }, + { + "epoch": 0.9895311281879942, + "grad_norm": 1.4991464614868164, + "learning_rate": 2.473549393028177e-05, + "loss": 0.1867, + "num_input_tokens_seen": 10818752, + "step": 8885 + }, + { + "epoch": 0.9900879830716115, + "grad_norm": 0.9565499424934387, + "learning_rate": 2.4749415302372204e-05, + "loss": 0.0892, + "num_input_tokens_seen": 10824896, + "step": 8890 + }, + { + "epoch": 0.9906448379552288, + "grad_norm": 0.938307523727417, + "learning_rate": 2.4763336674462638e-05, + "loss": 0.2366, + "num_input_tokens_seen": 10830976, + "step": 8895 + }, + { + "epoch": 0.9912016928388462, + "grad_norm": 1.5062907934188843, + "learning_rate": 2.477725804655307e-05, + "loss": 0.1083, + "num_input_tokens_seen": 10837504, + "step": 8900 + }, + { + "epoch": 0.9917585477224635, + "grad_norm": 1.0857172012329102, + "learning_rate": 2.4791179418643503e-05, + "loss": 0.1044, + "num_input_tokens_seen": 10843552, + "step": 8905 + }, + { + "epoch": 0.9923154026060809, + "grad_norm": 0.4719983637332916, + "learning_rate": 2.4805100790733938e-05, + "loss": 0.0897, + "num_input_tokens_seen": 10849184, + "step": 8910 + }, + { + "epoch": 0.9928722574896982, + "grad_norm": 0.5819359421730042, + "learning_rate": 2.481902216282437e-05, + "loss": 0.1428, + "num_input_tokens_seen": 10855200, + "step": 8915 + }, + { + "epoch": 0.9934291123733155, + "grad_norm": 1.6672313213348389, + "learning_rate": 2.4832943534914803e-05, + "loss": 0.255, + "num_input_tokens_seen": 10861408, + "step": 8920 + }, + { + "epoch": 0.9939859672569329, + "grad_norm": 0.39103180170059204, + "learning_rate": 2.4846864907005234e-05, + "loss": 0.2275, + "num_input_tokens_seen": 10867456, + "step": 8925 + }, + { + "epoch": 0.9945428221405501, + "grad_norm": 0.19639332592487335, + "learning_rate": 2.4860786279095668e-05, + "loss": 0.1806, + "num_input_tokens_seen": 10873728, + "step": 8930 + }, + { + "epoch": 0.9950996770241675, + "grad_norm": 0.3830409348011017, + "learning_rate": 2.4874707651186102e-05, + "loss": 0.1201, + "num_input_tokens_seen": 10880224, + "step": 8935 + }, + { + "epoch": 0.9956565319077848, + "grad_norm": 2.1860270500183105, + "learning_rate": 2.4888629023276537e-05, + "loss": 0.2718, + "num_input_tokens_seen": 10886240, + "step": 8940 + }, + { + "epoch": 0.9962133867914021, + "grad_norm": 1.3139277696609497, + "learning_rate": 2.4902550395366968e-05, + "loss": 0.12, + "num_input_tokens_seen": 10892352, + "step": 8945 + }, + { + "epoch": 0.9967702416750195, + "grad_norm": 0.7763372659683228, + "learning_rate": 2.4916471767457402e-05, + "loss": 0.1014, + "num_input_tokens_seen": 10898592, + "step": 8950 + }, + { + "epoch": 0.9973270965586368, + "grad_norm": 0.1725425273180008, + "learning_rate": 2.4930393139547836e-05, + "loss": 0.1661, + "num_input_tokens_seen": 10904384, + "step": 8955 + }, + { + "epoch": 0.9978839514422542, + "grad_norm": 0.5598978400230408, + "learning_rate": 2.4944314511638267e-05, + "loss": 0.0867, + "num_input_tokens_seen": 10910496, + "step": 8960 + }, + { + "epoch": 0.9984408063258715, + "grad_norm": 2.7526748180389404, + "learning_rate": 2.49582358837287e-05, + "loss": 0.2204, + "num_input_tokens_seen": 10916608, + "step": 8965 + }, + { + "epoch": 0.9989976612094889, + "grad_norm": 2.5230767726898193, + "learning_rate": 2.4972157255819133e-05, + "loss": 0.153, + "num_input_tokens_seen": 10922624, + "step": 8970 + }, + { + "epoch": 0.9995545160931061, + "grad_norm": 0.8958519101142883, + "learning_rate": 2.4986078627909567e-05, + "loss": 0.1942, + "num_input_tokens_seen": 10928640, + "step": 8975 + }, + { + "epoch": 1.0, + "eval_loss": 0.1737435758113861, + "eval_runtime": 112.3128, + "eval_samples_per_second": 35.535, + "eval_steps_per_second": 8.886, + "num_input_tokens_seen": 10932896, + "step": 8979 + }, + { + "epoch": 1.0001113709767235, + "grad_norm": 1.3111757040023804, + "learning_rate": 2.5e-05, + "loss": 0.1613, + "num_input_tokens_seen": 10934144, + "step": 8980 + }, + { + "epoch": 1.0006682258603408, + "grad_norm": 0.9962360262870789, + "learning_rate": 2.5013921372090432e-05, + "loss": 0.2357, + "num_input_tokens_seen": 10940256, + "step": 8985 + }, + { + "epoch": 1.0012250807439582, + "grad_norm": 0.060963377356529236, + "learning_rate": 2.502784274418087e-05, + "loss": 0.0445, + "num_input_tokens_seen": 10946528, + "step": 8990 + }, + { + "epoch": 1.0017819356275754, + "grad_norm": 1.6010503768920898, + "learning_rate": 2.5041764116271297e-05, + "loss": 0.2129, + "num_input_tokens_seen": 10952544, + "step": 8995 + }, + { + "epoch": 1.0023387905111927, + "grad_norm": 1.1640188694000244, + "learning_rate": 2.5055685488361735e-05, + "loss": 0.2138, + "num_input_tokens_seen": 10958592, + "step": 9000 + }, + { + "epoch": 1.0028956453948101, + "grad_norm": 0.7141361832618713, + "learning_rate": 2.5069606860452166e-05, + "loss": 0.1815, + "num_input_tokens_seen": 10964256, + "step": 9005 + }, + { + "epoch": 1.0034525002784274, + "grad_norm": 0.1759420931339264, + "learning_rate": 2.5083528232542604e-05, + "loss": 0.1801, + "num_input_tokens_seen": 10970208, + "step": 9010 + }, + { + "epoch": 1.0040093551620448, + "grad_norm": 1.8082587718963623, + "learning_rate": 2.5097449604633035e-05, + "loss": 0.1342, + "num_input_tokens_seen": 10976320, + "step": 9015 + }, + { + "epoch": 1.004566210045662, + "grad_norm": 1.0304715633392334, + "learning_rate": 2.511137097672347e-05, + "loss": 0.1354, + "num_input_tokens_seen": 10982432, + "step": 9020 + }, + { + "epoch": 1.0051230649292795, + "grad_norm": 0.3705896735191345, + "learning_rate": 2.51252923488139e-05, + "loss": 0.0986, + "num_input_tokens_seen": 10988640, + "step": 9025 + }, + { + "epoch": 1.0056799198128967, + "grad_norm": 0.7680332064628601, + "learning_rate": 2.513921372090433e-05, + "loss": 0.0905, + "num_input_tokens_seen": 10994656, + "step": 9030 + }, + { + "epoch": 1.0062367746965142, + "grad_norm": 0.9847506284713745, + "learning_rate": 2.515313509299477e-05, + "loss": 0.1404, + "num_input_tokens_seen": 11000992, + "step": 9035 + }, + { + "epoch": 1.0067936295801314, + "grad_norm": 0.2093275785446167, + "learning_rate": 2.51670564650852e-05, + "loss": 0.0408, + "num_input_tokens_seen": 11007232, + "step": 9040 + }, + { + "epoch": 1.0073504844637486, + "grad_norm": 0.7125298976898193, + "learning_rate": 2.5180977837175634e-05, + "loss": 0.1076, + "num_input_tokens_seen": 11013408, + "step": 9045 + }, + { + "epoch": 1.007907339347366, + "grad_norm": 1.1484277248382568, + "learning_rate": 2.5194899209266065e-05, + "loss": 0.0955, + "num_input_tokens_seen": 11019456, + "step": 9050 + }, + { + "epoch": 1.0084641942309833, + "grad_norm": 0.05621737986803055, + "learning_rate": 2.5208820581356503e-05, + "loss": 0.1567, + "num_input_tokens_seen": 11025184, + "step": 9055 + }, + { + "epoch": 1.0090210491146008, + "grad_norm": 0.6184742450714111, + "learning_rate": 2.5222741953446934e-05, + "loss": 0.2293, + "num_input_tokens_seen": 11031520, + "step": 9060 + }, + { + "epoch": 1.009577903998218, + "grad_norm": 0.12821350991725922, + "learning_rate": 2.5236663325537368e-05, + "loss": 0.04, + "num_input_tokens_seen": 11037728, + "step": 9065 + }, + { + "epoch": 1.0101347588818355, + "grad_norm": 1.0912638902664185, + "learning_rate": 2.52505846976278e-05, + "loss": 0.1764, + "num_input_tokens_seen": 11043744, + "step": 9070 + }, + { + "epoch": 1.0106916137654527, + "grad_norm": 0.4782302975654602, + "learning_rate": 2.526450606971823e-05, + "loss": 0.1104, + "num_input_tokens_seen": 11049920, + "step": 9075 + }, + { + "epoch": 1.0112484686490701, + "grad_norm": 1.6962412595748901, + "learning_rate": 2.5278427441808667e-05, + "loss": 0.1714, + "num_input_tokens_seen": 11056480, + "step": 9080 + }, + { + "epoch": 1.0118053235326874, + "grad_norm": 1.0663682222366333, + "learning_rate": 2.52923488138991e-05, + "loss": 0.1264, + "num_input_tokens_seen": 11062496, + "step": 9085 + }, + { + "epoch": 1.0123621784163046, + "grad_norm": 0.32841989398002625, + "learning_rate": 2.5306270185989533e-05, + "loss": 0.1338, + "num_input_tokens_seen": 11068576, + "step": 9090 + }, + { + "epoch": 1.012919033299922, + "grad_norm": 1.1638370752334595, + "learning_rate": 2.5320191558079964e-05, + "loss": 0.2816, + "num_input_tokens_seen": 11074688, + "step": 9095 + }, + { + "epoch": 1.0134758881835393, + "grad_norm": 0.9419611692428589, + "learning_rate": 2.53341129301704e-05, + "loss": 0.2157, + "num_input_tokens_seen": 11080768, + "step": 9100 + }, + { + "epoch": 1.0140327430671567, + "grad_norm": 1.3060321807861328, + "learning_rate": 2.5348034302260832e-05, + "loss": 0.1965, + "num_input_tokens_seen": 11086528, + "step": 9105 + }, + { + "epoch": 1.014589597950774, + "grad_norm": 1.5561425685882568, + "learning_rate": 2.5361955674351267e-05, + "loss": 0.1127, + "num_input_tokens_seen": 11092288, + "step": 9110 + }, + { + "epoch": 1.0151464528343914, + "grad_norm": 1.210463285446167, + "learning_rate": 2.5375877046441698e-05, + "loss": 0.1357, + "num_input_tokens_seen": 11098784, + "step": 9115 + }, + { + "epoch": 1.0157033077180087, + "grad_norm": 1.0490020513534546, + "learning_rate": 2.538979841853213e-05, + "loss": 0.1069, + "num_input_tokens_seen": 11104736, + "step": 9120 + }, + { + "epoch": 1.016260162601626, + "grad_norm": 0.22060662508010864, + "learning_rate": 2.5403719790622566e-05, + "loss": 0.1706, + "num_input_tokens_seen": 11110528, + "step": 9125 + }, + { + "epoch": 1.0168170174852433, + "grad_norm": 0.5542005896568298, + "learning_rate": 2.5417641162712997e-05, + "loss": 0.1726, + "num_input_tokens_seen": 11116352, + "step": 9130 + }, + { + "epoch": 1.0173738723688606, + "grad_norm": 0.3108653724193573, + "learning_rate": 2.543156253480343e-05, + "loss": 0.1011, + "num_input_tokens_seen": 11122688, + "step": 9135 + }, + { + "epoch": 1.017930727252478, + "grad_norm": 1.6250526905059814, + "learning_rate": 2.5445483906893862e-05, + "loss": 0.3323, + "num_input_tokens_seen": 11128864, + "step": 9140 + }, + { + "epoch": 1.0184875821360952, + "grad_norm": 0.06753882765769958, + "learning_rate": 2.54594052789843e-05, + "loss": 0.1018, + "num_input_tokens_seen": 11135072, + "step": 9145 + }, + { + "epoch": 1.0190444370197127, + "grad_norm": 0.6772701740264893, + "learning_rate": 2.547332665107473e-05, + "loss": 0.078, + "num_input_tokens_seen": 11141248, + "step": 9150 + }, + { + "epoch": 1.01960129190333, + "grad_norm": 1.8875285387039185, + "learning_rate": 2.5487248023165165e-05, + "loss": 0.1649, + "num_input_tokens_seen": 11147072, + "step": 9155 + }, + { + "epoch": 1.0201581467869474, + "grad_norm": 1.245195746421814, + "learning_rate": 2.5501169395255596e-05, + "loss": 0.155, + "num_input_tokens_seen": 11152608, + "step": 9160 + }, + { + "epoch": 1.0207150016705646, + "grad_norm": 0.056575924158096313, + "learning_rate": 2.5515090767346027e-05, + "loss": 0.0949, + "num_input_tokens_seen": 11158752, + "step": 9165 + }, + { + "epoch": 1.021271856554182, + "grad_norm": 0.5501588582992554, + "learning_rate": 2.5529012139436465e-05, + "loss": 0.0699, + "num_input_tokens_seen": 11164672, + "step": 9170 + }, + { + "epoch": 1.0218287114377993, + "grad_norm": 1.950537919998169, + "learning_rate": 2.5542933511526896e-05, + "loss": 0.1794, + "num_input_tokens_seen": 11170784, + "step": 9175 + }, + { + "epoch": 1.0223855663214167, + "grad_norm": 1.0679713487625122, + "learning_rate": 2.555685488361733e-05, + "loss": 0.2755, + "num_input_tokens_seen": 11177024, + "step": 9180 + }, + { + "epoch": 1.022942421205034, + "grad_norm": 1.320223093032837, + "learning_rate": 2.557077625570776e-05, + "loss": 0.1615, + "num_input_tokens_seen": 11182944, + "step": 9185 + }, + { + "epoch": 1.0234992760886512, + "grad_norm": 0.29183775186538696, + "learning_rate": 2.55846976277982e-05, + "loss": 0.1841, + "num_input_tokens_seen": 11188896, + "step": 9190 + }, + { + "epoch": 1.0240561309722687, + "grad_norm": 0.19761821627616882, + "learning_rate": 2.559861899988863e-05, + "loss": 0.0914, + "num_input_tokens_seen": 11195104, + "step": 9195 + }, + { + "epoch": 1.024612985855886, + "grad_norm": 1.9157378673553467, + "learning_rate": 2.5612540371979064e-05, + "loss": 0.208, + "num_input_tokens_seen": 11200960, + "step": 9200 + }, + { + "epoch": 1.0251698407395033, + "grad_norm": 1.0934302806854248, + "learning_rate": 2.5626461744069495e-05, + "loss": 0.1984, + "num_input_tokens_seen": 11206752, + "step": 9205 + }, + { + "epoch": 1.0257266956231206, + "grad_norm": 0.8453556895256042, + "learning_rate": 2.5640383116159926e-05, + "loss": 0.081, + "num_input_tokens_seen": 11213088, + "step": 9210 + }, + { + "epoch": 1.026283550506738, + "grad_norm": 1.3029972314834595, + "learning_rate": 2.5654304488250364e-05, + "loss": 0.2866, + "num_input_tokens_seen": 11219328, + "step": 9215 + }, + { + "epoch": 1.0268404053903553, + "grad_norm": 1.5785454511642456, + "learning_rate": 2.5668225860340795e-05, + "loss": 0.2787, + "num_input_tokens_seen": 11225088, + "step": 9220 + }, + { + "epoch": 1.0273972602739727, + "grad_norm": 0.6835338473320007, + "learning_rate": 2.568214723243123e-05, + "loss": 0.0493, + "num_input_tokens_seen": 11231296, + "step": 9225 + }, + { + "epoch": 1.02795411515759, + "grad_norm": 0.5353710651397705, + "learning_rate": 2.569606860452166e-05, + "loss": 0.2661, + "num_input_tokens_seen": 11237664, + "step": 9230 + }, + { + "epoch": 1.0285109700412072, + "grad_norm": 1.1956287622451782, + "learning_rate": 2.5709989976612098e-05, + "loss": 0.2291, + "num_input_tokens_seen": 11243680, + "step": 9235 + }, + { + "epoch": 1.0290678249248246, + "grad_norm": 1.892791748046875, + "learning_rate": 2.572391134870253e-05, + "loss": 0.3063, + "num_input_tokens_seen": 11249760, + "step": 9240 + }, + { + "epoch": 1.0296246798084419, + "grad_norm": 1.0086954832077026, + "learning_rate": 2.5737832720792966e-05, + "loss": 0.1359, + "num_input_tokens_seen": 11255744, + "step": 9245 + }, + { + "epoch": 1.0301815346920593, + "grad_norm": 1.1082385778427124, + "learning_rate": 2.5751754092883394e-05, + "loss": 0.0951, + "num_input_tokens_seen": 11261856, + "step": 9250 + }, + { + "epoch": 1.0307383895756765, + "grad_norm": 0.22538922727108002, + "learning_rate": 2.5765675464973825e-05, + "loss": 0.128, + "num_input_tokens_seen": 11268288, + "step": 9255 + }, + { + "epoch": 1.031295244459294, + "grad_norm": 1.6034002304077148, + "learning_rate": 2.5779596837064263e-05, + "loss": 0.1325, + "num_input_tokens_seen": 11274432, + "step": 9260 + }, + { + "epoch": 1.0318520993429112, + "grad_norm": 1.5860133171081543, + "learning_rate": 2.5793518209154694e-05, + "loss": 0.3087, + "num_input_tokens_seen": 11279744, + "step": 9265 + }, + { + "epoch": 1.0324089542265287, + "grad_norm": 0.9060791730880737, + "learning_rate": 2.580743958124513e-05, + "loss": 0.1991, + "num_input_tokens_seen": 11285536, + "step": 9270 + }, + { + "epoch": 1.032965809110146, + "grad_norm": 0.9970847368240356, + "learning_rate": 2.582136095333556e-05, + "loss": 0.149, + "num_input_tokens_seen": 11291648, + "step": 9275 + }, + { + "epoch": 1.0335226639937631, + "grad_norm": 0.8241659998893738, + "learning_rate": 2.5835282325425997e-05, + "loss": 0.1932, + "num_input_tokens_seen": 11297888, + "step": 9280 + }, + { + "epoch": 1.0340795188773806, + "grad_norm": 0.582112193107605, + "learning_rate": 2.5849203697516427e-05, + "loss": 0.1552, + "num_input_tokens_seen": 11303968, + "step": 9285 + }, + { + "epoch": 1.0346363737609978, + "grad_norm": 0.37207064032554626, + "learning_rate": 2.5863125069606865e-05, + "loss": 0.151, + "num_input_tokens_seen": 11310016, + "step": 9290 + }, + { + "epoch": 1.0351932286446153, + "grad_norm": 0.3818701505661011, + "learning_rate": 2.5877046441697296e-05, + "loss": 0.2979, + "num_input_tokens_seen": 11316096, + "step": 9295 + }, + { + "epoch": 1.0357500835282325, + "grad_norm": 1.1788866519927979, + "learning_rate": 2.5890967813787727e-05, + "loss": 0.3005, + "num_input_tokens_seen": 11322240, + "step": 9300 + }, + { + "epoch": 1.03630693841185, + "grad_norm": 0.06775099039077759, + "learning_rate": 2.590488918587816e-05, + "loss": 0.203, + "num_input_tokens_seen": 11328544, + "step": 9305 + }, + { + "epoch": 1.0368637932954672, + "grad_norm": 0.10909786075353622, + "learning_rate": 2.5918810557968592e-05, + "loss": 0.0569, + "num_input_tokens_seen": 11334368, + "step": 9310 + }, + { + "epoch": 1.0374206481790846, + "grad_norm": 0.052785009145736694, + "learning_rate": 2.593273193005903e-05, + "loss": 0.1131, + "num_input_tokens_seen": 11340288, + "step": 9315 + }, + { + "epoch": 1.0379775030627019, + "grad_norm": 0.7154438495635986, + "learning_rate": 2.594665330214946e-05, + "loss": 0.1974, + "num_input_tokens_seen": 11346240, + "step": 9320 + }, + { + "epoch": 1.038534357946319, + "grad_norm": 0.39223480224609375, + "learning_rate": 2.5960574674239895e-05, + "loss": 0.1027, + "num_input_tokens_seen": 11352608, + "step": 9325 + }, + { + "epoch": 1.0390912128299366, + "grad_norm": 0.7988489270210266, + "learning_rate": 2.5974496046330326e-05, + "loss": 0.1145, + "num_input_tokens_seen": 11358784, + "step": 9330 + }, + { + "epoch": 1.0396480677135538, + "grad_norm": 1.850825309753418, + "learning_rate": 2.5988417418420764e-05, + "loss": 0.1964, + "num_input_tokens_seen": 11364544, + "step": 9335 + }, + { + "epoch": 1.0402049225971712, + "grad_norm": 0.051172975450754166, + "learning_rate": 2.6002338790511195e-05, + "loss": 0.0444, + "num_input_tokens_seen": 11370688, + "step": 9340 + }, + { + "epoch": 1.0407617774807885, + "grad_norm": 1.4693156480789185, + "learning_rate": 2.601626016260163e-05, + "loss": 0.1823, + "num_input_tokens_seen": 11377088, + "step": 9345 + }, + { + "epoch": 1.041318632364406, + "grad_norm": 0.428691029548645, + "learning_rate": 2.603018153469206e-05, + "loss": 0.0906, + "num_input_tokens_seen": 11383072, + "step": 9350 + }, + { + "epoch": 1.0418754872480231, + "grad_norm": 1.3127590417861938, + "learning_rate": 2.604410290678249e-05, + "loss": 0.2257, + "num_input_tokens_seen": 11389184, + "step": 9355 + }, + { + "epoch": 1.0424323421316406, + "grad_norm": 0.7089748978614807, + "learning_rate": 2.605802427887293e-05, + "loss": 0.1552, + "num_input_tokens_seen": 11395456, + "step": 9360 + }, + { + "epoch": 1.0429891970152578, + "grad_norm": 0.008971837349236012, + "learning_rate": 2.607194565096336e-05, + "loss": 0.1865, + "num_input_tokens_seen": 11401920, + "step": 9365 + }, + { + "epoch": 1.043546051898875, + "grad_norm": 0.3371591866016388, + "learning_rate": 2.6085867023053794e-05, + "loss": 0.076, + "num_input_tokens_seen": 11408224, + "step": 9370 + }, + { + "epoch": 1.0441029067824925, + "grad_norm": 0.8741312623023987, + "learning_rate": 2.6099788395144225e-05, + "loss": 0.1164, + "num_input_tokens_seen": 11414176, + "step": 9375 + }, + { + "epoch": 1.0446597616661097, + "grad_norm": 1.6821112632751465, + "learning_rate": 2.6113709767234663e-05, + "loss": 0.2237, + "num_input_tokens_seen": 11420320, + "step": 9380 + }, + { + "epoch": 1.0452166165497272, + "grad_norm": 0.5854402780532837, + "learning_rate": 2.6127631139325094e-05, + "loss": 0.1798, + "num_input_tokens_seen": 11426432, + "step": 9385 + }, + { + "epoch": 1.0457734714333444, + "grad_norm": 0.26497822999954224, + "learning_rate": 2.6141552511415528e-05, + "loss": 0.0684, + "num_input_tokens_seen": 11432640, + "step": 9390 + }, + { + "epoch": 1.0463303263169619, + "grad_norm": 0.6560778021812439, + "learning_rate": 2.615547388350596e-05, + "loss": 0.1196, + "num_input_tokens_seen": 11438880, + "step": 9395 + }, + { + "epoch": 1.046887181200579, + "grad_norm": 1.379807949066162, + "learning_rate": 2.616939525559639e-05, + "loss": 0.2026, + "num_input_tokens_seen": 11445056, + "step": 9400 + }, + { + "epoch": 1.0474440360841966, + "grad_norm": 1.4440698623657227, + "learning_rate": 2.6183316627686828e-05, + "loss": 0.2709, + "num_input_tokens_seen": 11450880, + "step": 9405 + }, + { + "epoch": 1.0480008909678138, + "grad_norm": 0.30832234025001526, + "learning_rate": 2.619723799977726e-05, + "loss": 0.0842, + "num_input_tokens_seen": 11456864, + "step": 9410 + }, + { + "epoch": 1.048557745851431, + "grad_norm": 0.25828036665916443, + "learning_rate": 2.6211159371867693e-05, + "loss": 0.1417, + "num_input_tokens_seen": 11463168, + "step": 9415 + }, + { + "epoch": 1.0491146007350485, + "grad_norm": 0.6320565938949585, + "learning_rate": 2.6225080743958124e-05, + "loss": 0.186, + "num_input_tokens_seen": 11469344, + "step": 9420 + }, + { + "epoch": 1.0496714556186657, + "grad_norm": 0.8240415453910828, + "learning_rate": 2.623900211604856e-05, + "loss": 0.107, + "num_input_tokens_seen": 11475616, + "step": 9425 + }, + { + "epoch": 1.0502283105022832, + "grad_norm": 0.7188096046447754, + "learning_rate": 2.6252923488138993e-05, + "loss": 0.2664, + "num_input_tokens_seen": 11481408, + "step": 9430 + }, + { + "epoch": 1.0507851653859004, + "grad_norm": 0.714453935623169, + "learning_rate": 2.6266844860229427e-05, + "loss": 0.1615, + "num_input_tokens_seen": 11487616, + "step": 9435 + }, + { + "epoch": 1.0513420202695178, + "grad_norm": 1.4293147325515747, + "learning_rate": 2.6280766232319858e-05, + "loss": 0.201, + "num_input_tokens_seen": 11493600, + "step": 9440 + }, + { + "epoch": 1.051898875153135, + "grad_norm": 0.352900892496109, + "learning_rate": 2.629468760441029e-05, + "loss": 0.1053, + "num_input_tokens_seen": 11499936, + "step": 9445 + }, + { + "epoch": 1.0524557300367525, + "grad_norm": 0.31866052746772766, + "learning_rate": 2.6308608976500726e-05, + "loss": 0.1221, + "num_input_tokens_seen": 11506144, + "step": 9450 + }, + { + "epoch": 1.0530125849203698, + "grad_norm": 1.0828956365585327, + "learning_rate": 2.6322530348591157e-05, + "loss": 0.0825, + "num_input_tokens_seen": 11512544, + "step": 9455 + }, + { + "epoch": 1.053569439803987, + "grad_norm": 1.0265454053878784, + "learning_rate": 2.6336451720681592e-05, + "loss": 0.2867, + "num_input_tokens_seen": 11518112, + "step": 9460 + }, + { + "epoch": 1.0541262946876044, + "grad_norm": 0.6925121545791626, + "learning_rate": 2.6350373092772023e-05, + "loss": 0.1689, + "num_input_tokens_seen": 11524480, + "step": 9465 + }, + { + "epoch": 1.0546831495712217, + "grad_norm": 0.9047210812568665, + "learning_rate": 2.636429446486246e-05, + "loss": 0.3522, + "num_input_tokens_seen": 11530464, + "step": 9470 + }, + { + "epoch": 1.0552400044548391, + "grad_norm": 0.27758756279945374, + "learning_rate": 2.637821583695289e-05, + "loss": 0.0803, + "num_input_tokens_seen": 11536544, + "step": 9475 + }, + { + "epoch": 1.0557968593384564, + "grad_norm": 1.0481085777282715, + "learning_rate": 2.6392137209043326e-05, + "loss": 0.1628, + "num_input_tokens_seen": 11542848, + "step": 9480 + }, + { + "epoch": 1.0563537142220738, + "grad_norm": 0.7293245792388916, + "learning_rate": 2.6406058581133757e-05, + "loss": 0.1838, + "num_input_tokens_seen": 11549056, + "step": 9485 + }, + { + "epoch": 1.056910569105691, + "grad_norm": 0.6197203993797302, + "learning_rate": 2.6419979953224188e-05, + "loss": 0.158, + "num_input_tokens_seen": 11555232, + "step": 9490 + }, + { + "epoch": 1.0574674239893085, + "grad_norm": 0.9264791011810303, + "learning_rate": 2.6433901325314625e-05, + "loss": 0.0534, + "num_input_tokens_seen": 11561536, + "step": 9495 + }, + { + "epoch": 1.0580242788729257, + "grad_norm": 1.1841179132461548, + "learning_rate": 2.6447822697405056e-05, + "loss": 0.2533, + "num_input_tokens_seen": 11567744, + "step": 9500 + }, + { + "epoch": 1.058581133756543, + "grad_norm": 0.5807974338531494, + "learning_rate": 2.646174406949549e-05, + "loss": 0.1783, + "num_input_tokens_seen": 11573920, + "step": 9505 + }, + { + "epoch": 1.0591379886401604, + "grad_norm": 0.7301587462425232, + "learning_rate": 2.647566544158592e-05, + "loss": 0.0801, + "num_input_tokens_seen": 11580096, + "step": 9510 + }, + { + "epoch": 1.0596948435237776, + "grad_norm": 0.08023087680339813, + "learning_rate": 2.648958681367636e-05, + "loss": 0.1508, + "num_input_tokens_seen": 11586240, + "step": 9515 + }, + { + "epoch": 1.060251698407395, + "grad_norm": 1.3455588817596436, + "learning_rate": 2.650350818576679e-05, + "loss": 0.1573, + "num_input_tokens_seen": 11592512, + "step": 9520 + }, + { + "epoch": 1.0608085532910123, + "grad_norm": 0.6988961100578308, + "learning_rate": 2.6517429557857228e-05, + "loss": 0.1324, + "num_input_tokens_seen": 11598752, + "step": 9525 + }, + { + "epoch": 1.0613654081746298, + "grad_norm": 0.5289499759674072, + "learning_rate": 2.6531350929947655e-05, + "loss": 0.2244, + "num_input_tokens_seen": 11604672, + "step": 9530 + }, + { + "epoch": 1.061922263058247, + "grad_norm": 0.9352350831031799, + "learning_rate": 2.6545272302038086e-05, + "loss": 0.3419, + "num_input_tokens_seen": 11610848, + "step": 9535 + }, + { + "epoch": 1.0624791179418644, + "grad_norm": 1.1130372285842896, + "learning_rate": 2.6559193674128524e-05, + "loss": 0.2502, + "num_input_tokens_seen": 11616256, + "step": 9540 + }, + { + "epoch": 1.0630359728254817, + "grad_norm": 0.310201495885849, + "learning_rate": 2.6573115046218955e-05, + "loss": 0.2457, + "num_input_tokens_seen": 11622144, + "step": 9545 + }, + { + "epoch": 1.063592827709099, + "grad_norm": 0.8088942170143127, + "learning_rate": 2.6587036418309393e-05, + "loss": 0.1163, + "num_input_tokens_seen": 11628480, + "step": 9550 + }, + { + "epoch": 1.0641496825927164, + "grad_norm": 0.5643314123153687, + "learning_rate": 2.6600957790399824e-05, + "loss": 0.133, + "num_input_tokens_seen": 11634496, + "step": 9555 + }, + { + "epoch": 1.0647065374763336, + "grad_norm": 0.2369096428155899, + "learning_rate": 2.6614879162490258e-05, + "loss": 0.0898, + "num_input_tokens_seen": 11640768, + "step": 9560 + }, + { + "epoch": 1.065263392359951, + "grad_norm": 0.5679417252540588, + "learning_rate": 2.662880053458069e-05, + "loss": 0.2284, + "num_input_tokens_seen": 11646720, + "step": 9565 + }, + { + "epoch": 1.0658202472435683, + "grad_norm": 1.7829259634017944, + "learning_rate": 2.6642721906671127e-05, + "loss": 0.1857, + "num_input_tokens_seen": 11652672, + "step": 9570 + }, + { + "epoch": 1.0663771021271857, + "grad_norm": 0.46898728609085083, + "learning_rate": 2.6656643278761558e-05, + "loss": 0.0606, + "num_input_tokens_seen": 11659104, + "step": 9575 + }, + { + "epoch": 1.066933957010803, + "grad_norm": 1.0185835361480713, + "learning_rate": 2.667056465085199e-05, + "loss": 0.0584, + "num_input_tokens_seen": 11665120, + "step": 9580 + }, + { + "epoch": 1.0674908118944204, + "grad_norm": 0.4871823787689209, + "learning_rate": 2.6684486022942423e-05, + "loss": 0.1913, + "num_input_tokens_seen": 11671104, + "step": 9585 + }, + { + "epoch": 1.0680476667780376, + "grad_norm": 0.8035956025123596, + "learning_rate": 2.6698407395032854e-05, + "loss": 0.1439, + "num_input_tokens_seen": 11676576, + "step": 9590 + }, + { + "epoch": 1.0686045216616549, + "grad_norm": 1.6470108032226562, + "learning_rate": 2.671232876712329e-05, + "loss": 0.2676, + "num_input_tokens_seen": 11682848, + "step": 9595 + }, + { + "epoch": 1.0691613765452723, + "grad_norm": 1.0789111852645874, + "learning_rate": 2.6726250139213722e-05, + "loss": 0.1207, + "num_input_tokens_seen": 11688928, + "step": 9600 + }, + { + "epoch": 1.0697182314288896, + "grad_norm": 0.7177366018295288, + "learning_rate": 2.6740171511304157e-05, + "loss": 0.1482, + "num_input_tokens_seen": 11694816, + "step": 9605 + }, + { + "epoch": 1.070275086312507, + "grad_norm": 0.4951251447200775, + "learning_rate": 2.6754092883394588e-05, + "loss": 0.1449, + "num_input_tokens_seen": 11701280, + "step": 9610 + }, + { + "epoch": 1.0708319411961242, + "grad_norm": 0.6445391178131104, + "learning_rate": 2.6768014255485025e-05, + "loss": 0.1052, + "num_input_tokens_seen": 11707456, + "step": 9615 + }, + { + "epoch": 1.0713887960797417, + "grad_norm": 2.986537218093872, + "learning_rate": 2.6781935627575456e-05, + "loss": 0.16, + "num_input_tokens_seen": 11713280, + "step": 9620 + }, + { + "epoch": 1.071945650963359, + "grad_norm": 0.8738337159156799, + "learning_rate": 2.6795856999665887e-05, + "loss": 0.1509, + "num_input_tokens_seen": 11719264, + "step": 9625 + }, + { + "epoch": 1.0725025058469764, + "grad_norm": 0.8715239763259888, + "learning_rate": 2.680977837175632e-05, + "loss": 0.1337, + "num_input_tokens_seen": 11725472, + "step": 9630 + }, + { + "epoch": 1.0730593607305936, + "grad_norm": 0.6855936050415039, + "learning_rate": 2.6823699743846753e-05, + "loss": 0.0518, + "num_input_tokens_seen": 11731456, + "step": 9635 + }, + { + "epoch": 1.0736162156142108, + "grad_norm": 0.6432687640190125, + "learning_rate": 2.683762111593719e-05, + "loss": 0.1976, + "num_input_tokens_seen": 11737632, + "step": 9640 + }, + { + "epoch": 1.0741730704978283, + "grad_norm": 0.6358098983764648, + "learning_rate": 2.685154248802762e-05, + "loss": 0.1336, + "num_input_tokens_seen": 11743776, + "step": 9645 + }, + { + "epoch": 1.0747299253814455, + "grad_norm": 0.5269756317138672, + "learning_rate": 2.6865463860118056e-05, + "loss": 0.195, + "num_input_tokens_seen": 11749632, + "step": 9650 + }, + { + "epoch": 1.075286780265063, + "grad_norm": 2.1515204906463623, + "learning_rate": 2.6879385232208486e-05, + "loss": 0.2625, + "num_input_tokens_seen": 11755040, + "step": 9655 + }, + { + "epoch": 1.0758436351486802, + "grad_norm": 0.4192754626274109, + "learning_rate": 2.6893306604298924e-05, + "loss": 0.1245, + "num_input_tokens_seen": 11761472, + "step": 9660 + }, + { + "epoch": 1.0764004900322977, + "grad_norm": 0.11340510845184326, + "learning_rate": 2.6907227976389355e-05, + "loss": 0.1405, + "num_input_tokens_seen": 11767616, + "step": 9665 + }, + { + "epoch": 1.0769573449159149, + "grad_norm": 1.2139594554901123, + "learning_rate": 2.6921149348479786e-05, + "loss": 0.2659, + "num_input_tokens_seen": 11773888, + "step": 9670 + }, + { + "epoch": 1.0775141997995323, + "grad_norm": 0.6402862668037415, + "learning_rate": 2.693507072057022e-05, + "loss": 0.1373, + "num_input_tokens_seen": 11779872, + "step": 9675 + }, + { + "epoch": 1.0780710546831496, + "grad_norm": 0.26124486327171326, + "learning_rate": 2.694899209266065e-05, + "loss": 0.1027, + "num_input_tokens_seen": 11786176, + "step": 9680 + }, + { + "epoch": 1.0786279095667668, + "grad_norm": 1.185481071472168, + "learning_rate": 2.696291346475109e-05, + "loss": 0.1059, + "num_input_tokens_seen": 11792448, + "step": 9685 + }, + { + "epoch": 1.0791847644503842, + "grad_norm": 0.13354381918907166, + "learning_rate": 2.697683483684152e-05, + "loss": 0.0687, + "num_input_tokens_seen": 11798592, + "step": 9690 + }, + { + "epoch": 1.0797416193340015, + "grad_norm": 0.3253818452358246, + "learning_rate": 2.6990756208931954e-05, + "loss": 0.0912, + "num_input_tokens_seen": 11804800, + "step": 9695 + }, + { + "epoch": 1.080298474217619, + "grad_norm": 0.9115231037139893, + "learning_rate": 2.7004677581022385e-05, + "loss": 0.1497, + "num_input_tokens_seen": 11810944, + "step": 9700 + }, + { + "epoch": 1.0808553291012362, + "grad_norm": 0.3718288540840149, + "learning_rate": 2.7018598953112823e-05, + "loss": 0.1206, + "num_input_tokens_seen": 11817312, + "step": 9705 + }, + { + "epoch": 1.0814121839848536, + "grad_norm": 0.500187873840332, + "learning_rate": 2.7032520325203254e-05, + "loss": 0.1592, + "num_input_tokens_seen": 11823424, + "step": 9710 + }, + { + "epoch": 1.0819690388684708, + "grad_norm": 0.0999925285577774, + "learning_rate": 2.7046441697293685e-05, + "loss": 0.1864, + "num_input_tokens_seen": 11829536, + "step": 9715 + }, + { + "epoch": 1.0825258937520883, + "grad_norm": 2.470156192779541, + "learning_rate": 2.706036306938412e-05, + "loss": 0.1662, + "num_input_tokens_seen": 11835744, + "step": 9720 + }, + { + "epoch": 1.0830827486357055, + "grad_norm": 0.6931014060974121, + "learning_rate": 2.707428444147455e-05, + "loss": 0.0964, + "num_input_tokens_seen": 11841856, + "step": 9725 + }, + { + "epoch": 1.0836396035193228, + "grad_norm": 1.3042887449264526, + "learning_rate": 2.7088205813564988e-05, + "loss": 0.2204, + "num_input_tokens_seen": 11848064, + "step": 9730 + }, + { + "epoch": 1.0841964584029402, + "grad_norm": 1.1072649955749512, + "learning_rate": 2.710212718565542e-05, + "loss": 0.1839, + "num_input_tokens_seen": 11854016, + "step": 9735 + }, + { + "epoch": 1.0847533132865574, + "grad_norm": 1.1036772727966309, + "learning_rate": 2.7116048557745853e-05, + "loss": 0.132, + "num_input_tokens_seen": 11860128, + "step": 9740 + }, + { + "epoch": 1.085310168170175, + "grad_norm": 0.1043771505355835, + "learning_rate": 2.7129969929836284e-05, + "loss": 0.1121, + "num_input_tokens_seen": 11866336, + "step": 9745 + }, + { + "epoch": 1.0858670230537921, + "grad_norm": 0.18882405757904053, + "learning_rate": 2.7143891301926722e-05, + "loss": 0.1922, + "num_input_tokens_seen": 11872224, + "step": 9750 + }, + { + "epoch": 1.0864238779374096, + "grad_norm": 1.3638771772384644, + "learning_rate": 2.7157812674017153e-05, + "loss": 0.2469, + "num_input_tokens_seen": 11878688, + "step": 9755 + }, + { + "epoch": 1.0869807328210268, + "grad_norm": 0.6541118025779724, + "learning_rate": 2.7171734046107584e-05, + "loss": 0.1671, + "num_input_tokens_seen": 11884416, + "step": 9760 + }, + { + "epoch": 1.0875375877046443, + "grad_norm": 0.15122956037521362, + "learning_rate": 2.7185655418198018e-05, + "loss": 0.1016, + "num_input_tokens_seen": 11890528, + "step": 9765 + }, + { + "epoch": 1.0880944425882615, + "grad_norm": 0.1671692281961441, + "learning_rate": 2.719957679028845e-05, + "loss": 0.167, + "num_input_tokens_seen": 11896896, + "step": 9770 + }, + { + "epoch": 1.0886512974718787, + "grad_norm": 2.4634501934051514, + "learning_rate": 2.7213498162378887e-05, + "loss": 0.3289, + "num_input_tokens_seen": 11902912, + "step": 9775 + }, + { + "epoch": 1.0892081523554962, + "grad_norm": 0.696542501449585, + "learning_rate": 2.7227419534469318e-05, + "loss": 0.1832, + "num_input_tokens_seen": 11909152, + "step": 9780 + }, + { + "epoch": 1.0897650072391134, + "grad_norm": 1.2079343795776367, + "learning_rate": 2.7241340906559752e-05, + "loss": 0.3124, + "num_input_tokens_seen": 11914752, + "step": 9785 + }, + { + "epoch": 1.0903218621227309, + "grad_norm": 1.0223239660263062, + "learning_rate": 2.7255262278650183e-05, + "loss": 0.151, + "num_input_tokens_seen": 11920768, + "step": 9790 + }, + { + "epoch": 1.090878717006348, + "grad_norm": 2.138392686843872, + "learning_rate": 2.726918365074062e-05, + "loss": 0.2762, + "num_input_tokens_seen": 11926688, + "step": 9795 + }, + { + "epoch": 1.0914355718899655, + "grad_norm": 0.5683856010437012, + "learning_rate": 2.728310502283105e-05, + "loss": 0.1382, + "num_input_tokens_seen": 11932928, + "step": 9800 + }, + { + "epoch": 1.0919924267735828, + "grad_norm": 0.4166925549507141, + "learning_rate": 2.7297026394921482e-05, + "loss": 0.1175, + "num_input_tokens_seen": 11938944, + "step": 9805 + }, + { + "epoch": 1.0925492816572002, + "grad_norm": 1.220180630683899, + "learning_rate": 2.731094776701192e-05, + "loss": 0.1808, + "num_input_tokens_seen": 11945344, + "step": 9810 + }, + { + "epoch": 1.0931061365408175, + "grad_norm": 0.8652098774909973, + "learning_rate": 2.7324869139102348e-05, + "loss": 0.2151, + "num_input_tokens_seen": 11951584, + "step": 9815 + }, + { + "epoch": 1.0936629914244347, + "grad_norm": 0.35111793875694275, + "learning_rate": 2.7338790511192785e-05, + "loss": 0.1712, + "num_input_tokens_seen": 11957984, + "step": 9820 + }, + { + "epoch": 1.0942198463080521, + "grad_norm": 0.9261323809623718, + "learning_rate": 2.7352711883283216e-05, + "loss": 0.1076, + "num_input_tokens_seen": 11964032, + "step": 9825 + }, + { + "epoch": 1.0947767011916694, + "grad_norm": 0.5084980130195618, + "learning_rate": 2.7366633255373654e-05, + "loss": 0.1497, + "num_input_tokens_seen": 11970016, + "step": 9830 + }, + { + "epoch": 1.0953335560752868, + "grad_norm": 0.5304277539253235, + "learning_rate": 2.7380554627464085e-05, + "loss": 0.11, + "num_input_tokens_seen": 11976160, + "step": 9835 + }, + { + "epoch": 1.095890410958904, + "grad_norm": 0.05055266246199608, + "learning_rate": 2.739447599955452e-05, + "loss": 0.1081, + "num_input_tokens_seen": 11982272, + "step": 9840 + }, + { + "epoch": 1.0964472658425215, + "grad_norm": 1.2175921201705933, + "learning_rate": 2.740839737164495e-05, + "loss": 0.1619, + "num_input_tokens_seen": 11988480, + "step": 9845 + }, + { + "epoch": 1.0970041207261387, + "grad_norm": 1.39902925491333, + "learning_rate": 2.742231874373538e-05, + "loss": 0.2535, + "num_input_tokens_seen": 11994144, + "step": 9850 + }, + { + "epoch": 1.0975609756097562, + "grad_norm": 0.3859539031982422, + "learning_rate": 2.743624011582582e-05, + "loss": 0.1084, + "num_input_tokens_seen": 12000096, + "step": 9855 + }, + { + "epoch": 1.0981178304933734, + "grad_norm": 2.1498332023620605, + "learning_rate": 2.745016148791625e-05, + "loss": 0.4096, + "num_input_tokens_seen": 12006208, + "step": 9860 + }, + { + "epoch": 1.0986746853769906, + "grad_norm": 0.8866593837738037, + "learning_rate": 2.7464082860006684e-05, + "loss": 0.1609, + "num_input_tokens_seen": 12012352, + "step": 9865 + }, + { + "epoch": 1.099231540260608, + "grad_norm": 1.4093395471572876, + "learning_rate": 2.7478004232097115e-05, + "loss": 0.218, + "num_input_tokens_seen": 12018432, + "step": 9870 + }, + { + "epoch": 1.0997883951442253, + "grad_norm": 0.8460603356361389, + "learning_rate": 2.7491925604187553e-05, + "loss": 0.289, + "num_input_tokens_seen": 12023648, + "step": 9875 + }, + { + "epoch": 1.1003452500278428, + "grad_norm": 0.5294076204299927, + "learning_rate": 2.7505846976277984e-05, + "loss": 0.1161, + "num_input_tokens_seen": 12029184, + "step": 9880 + }, + { + "epoch": 1.10090210491146, + "grad_norm": 0.42142391204833984, + "learning_rate": 2.7519768348368418e-05, + "loss": 0.1083, + "num_input_tokens_seen": 12035392, + "step": 9885 + }, + { + "epoch": 1.1014589597950775, + "grad_norm": 0.9091790318489075, + "learning_rate": 2.753368972045885e-05, + "loss": 0.1628, + "num_input_tokens_seen": 12041440, + "step": 9890 + }, + { + "epoch": 1.1020158146786947, + "grad_norm": 0.9605005979537964, + "learning_rate": 2.754761109254928e-05, + "loss": 0.1535, + "num_input_tokens_seen": 12047680, + "step": 9895 + }, + { + "epoch": 1.1025726695623121, + "grad_norm": 1.426230549812317, + "learning_rate": 2.7561532464639718e-05, + "loss": 0.103, + "num_input_tokens_seen": 12054048, + "step": 9900 + }, + { + "epoch": 1.1031295244459294, + "grad_norm": 0.7569586634635925, + "learning_rate": 2.757545383673015e-05, + "loss": 0.1475, + "num_input_tokens_seen": 12060512, + "step": 9905 + }, + { + "epoch": 1.1036863793295466, + "grad_norm": 0.5256431698799133, + "learning_rate": 2.7589375208820583e-05, + "loss": 0.0469, + "num_input_tokens_seen": 12066656, + "step": 9910 + }, + { + "epoch": 1.104243234213164, + "grad_norm": 1.0723603963851929, + "learning_rate": 2.7603296580911014e-05, + "loss": 0.1085, + "num_input_tokens_seen": 12073024, + "step": 9915 + }, + { + "epoch": 1.1048000890967813, + "grad_norm": 0.5043888092041016, + "learning_rate": 2.761721795300145e-05, + "loss": 0.0517, + "num_input_tokens_seen": 12079168, + "step": 9920 + }, + { + "epoch": 1.1053569439803987, + "grad_norm": 0.5772845149040222, + "learning_rate": 2.7631139325091883e-05, + "loss": 0.2147, + "num_input_tokens_seen": 12084704, + "step": 9925 + }, + { + "epoch": 1.105913798864016, + "grad_norm": 0.49230560660362244, + "learning_rate": 2.7645060697182317e-05, + "loss": 0.1362, + "num_input_tokens_seen": 12090720, + "step": 9930 + }, + { + "epoch": 1.1064706537476334, + "grad_norm": 0.4700410068035126, + "learning_rate": 2.7658982069272748e-05, + "loss": 0.0585, + "num_input_tokens_seen": 12096672, + "step": 9935 + }, + { + "epoch": 1.1070275086312507, + "grad_norm": 2.5317749977111816, + "learning_rate": 2.767290344136318e-05, + "loss": 0.2922, + "num_input_tokens_seen": 12102816, + "step": 9940 + }, + { + "epoch": 1.107584363514868, + "grad_norm": 0.24352626502513885, + "learning_rate": 2.7686824813453617e-05, + "loss": 0.1476, + "num_input_tokens_seen": 12108960, + "step": 9945 + }, + { + "epoch": 1.1081412183984853, + "grad_norm": 0.20175282657146454, + "learning_rate": 2.7700746185544047e-05, + "loss": 0.0931, + "num_input_tokens_seen": 12115136, + "step": 9950 + }, + { + "epoch": 1.1086980732821026, + "grad_norm": 0.7026132345199585, + "learning_rate": 2.7714667557634482e-05, + "loss": 0.2333, + "num_input_tokens_seen": 12121120, + "step": 9955 + }, + { + "epoch": 1.10925492816572, + "grad_norm": 1.0096567869186401, + "learning_rate": 2.7728588929724913e-05, + "loss": 0.1304, + "num_input_tokens_seen": 12127136, + "step": 9960 + }, + { + "epoch": 1.1098117830493373, + "grad_norm": 1.4612808227539062, + "learning_rate": 2.774251030181535e-05, + "loss": 0.179, + "num_input_tokens_seen": 12133184, + "step": 9965 + }, + { + "epoch": 1.1103686379329547, + "grad_norm": 0.06735344231128693, + "learning_rate": 2.775643167390578e-05, + "loss": 0.3261, + "num_input_tokens_seen": 12139104, + "step": 9970 + }, + { + "epoch": 1.110925492816572, + "grad_norm": 0.01633385568857193, + "learning_rate": 2.7770353045996216e-05, + "loss": 0.1634, + "num_input_tokens_seen": 12145280, + "step": 9975 + }, + { + "epoch": 1.1114823477001894, + "grad_norm": 1.429574966430664, + "learning_rate": 2.7784274418086647e-05, + "loss": 0.1663, + "num_input_tokens_seen": 12151264, + "step": 9980 + }, + { + "epoch": 1.1120392025838066, + "grad_norm": 0.2518957257270813, + "learning_rate": 2.7798195790177078e-05, + "loss": 0.2247, + "num_input_tokens_seen": 12157504, + "step": 9985 + }, + { + "epoch": 1.112596057467424, + "grad_norm": 0.659820556640625, + "learning_rate": 2.7812117162267515e-05, + "loss": 0.278, + "num_input_tokens_seen": 12163488, + "step": 9990 + }, + { + "epoch": 1.1131529123510413, + "grad_norm": 1.6796568632125854, + "learning_rate": 2.7826038534357946e-05, + "loss": 0.1671, + "num_input_tokens_seen": 12169760, + "step": 9995 + }, + { + "epoch": 1.1137097672346585, + "grad_norm": 1.0726124048233032, + "learning_rate": 2.783995990644838e-05, + "loss": 0.1603, + "num_input_tokens_seen": 12175776, + "step": 10000 + }, + { + "epoch": 1.114266622118276, + "grad_norm": 0.2981066405773163, + "learning_rate": 2.785388127853881e-05, + "loss": 0.2353, + "num_input_tokens_seen": 12181760, + "step": 10005 + }, + { + "epoch": 1.1148234770018932, + "grad_norm": 0.02924284338951111, + "learning_rate": 2.786780265062925e-05, + "loss": 0.1402, + "num_input_tokens_seen": 12187776, + "step": 10010 + }, + { + "epoch": 1.1153803318855107, + "grad_norm": 0.6389594674110413, + "learning_rate": 2.788172402271968e-05, + "loss": 0.1826, + "num_input_tokens_seen": 12193920, + "step": 10015 + }, + { + "epoch": 1.115937186769128, + "grad_norm": 1.120758056640625, + "learning_rate": 2.7895645394810115e-05, + "loss": 0.3337, + "num_input_tokens_seen": 12199872, + "step": 10020 + }, + { + "epoch": 1.1164940416527454, + "grad_norm": 0.23553825914859772, + "learning_rate": 2.7909566766900545e-05, + "loss": 0.1254, + "num_input_tokens_seen": 12206016, + "step": 10025 + }, + { + "epoch": 1.1170508965363626, + "grad_norm": 1.3294011354446411, + "learning_rate": 2.7923488138990983e-05, + "loss": 0.231, + "num_input_tokens_seen": 12211904, + "step": 10030 + }, + { + "epoch": 1.11760775141998, + "grad_norm": 1.0060505867004395, + "learning_rate": 2.7937409511081414e-05, + "loss": 0.2095, + "num_input_tokens_seen": 12218112, + "step": 10035 + }, + { + "epoch": 1.1181646063035973, + "grad_norm": 2.287687063217163, + "learning_rate": 2.7951330883171845e-05, + "loss": 0.2282, + "num_input_tokens_seen": 12224288, + "step": 10040 + }, + { + "epoch": 1.1187214611872145, + "grad_norm": 0.3134075701236725, + "learning_rate": 2.796525225526228e-05, + "loss": 0.1415, + "num_input_tokens_seen": 12230560, + "step": 10045 + }, + { + "epoch": 1.119278316070832, + "grad_norm": 0.5176141262054443, + "learning_rate": 2.797917362735271e-05, + "loss": 0.0324, + "num_input_tokens_seen": 12236608, + "step": 10050 + }, + { + "epoch": 1.1198351709544492, + "grad_norm": 0.2525392174720764, + "learning_rate": 2.7993094999443148e-05, + "loss": 0.0931, + "num_input_tokens_seen": 12242880, + "step": 10055 + }, + { + "epoch": 1.1203920258380666, + "grad_norm": 0.5832995176315308, + "learning_rate": 2.800701637153358e-05, + "loss": 0.232, + "num_input_tokens_seen": 12249088, + "step": 10060 + }, + { + "epoch": 1.1209488807216839, + "grad_norm": 0.3403690755367279, + "learning_rate": 2.8020937743624017e-05, + "loss": 0.1631, + "num_input_tokens_seen": 12255296, + "step": 10065 + }, + { + "epoch": 1.1215057356053013, + "grad_norm": 0.622937798500061, + "learning_rate": 2.8034859115714444e-05, + "loss": 0.1579, + "num_input_tokens_seen": 12261312, + "step": 10070 + }, + { + "epoch": 1.1220625904889185, + "grad_norm": 1.3122398853302002, + "learning_rate": 2.8048780487804882e-05, + "loss": 0.157, + "num_input_tokens_seen": 12267424, + "step": 10075 + }, + { + "epoch": 1.122619445372536, + "grad_norm": 2.0826637744903564, + "learning_rate": 2.8062701859895313e-05, + "loss": 0.1822, + "num_input_tokens_seen": 12273216, + "step": 10080 + }, + { + "epoch": 1.1231763002561532, + "grad_norm": 0.10824296623468399, + "learning_rate": 2.8076623231985744e-05, + "loss": 0.1319, + "num_input_tokens_seen": 12279232, + "step": 10085 + }, + { + "epoch": 1.1237331551397707, + "grad_norm": 0.9662073254585266, + "learning_rate": 2.809054460407618e-05, + "loss": 0.2708, + "num_input_tokens_seen": 12285504, + "step": 10090 + }, + { + "epoch": 1.124290010023388, + "grad_norm": 0.5443354845046997, + "learning_rate": 2.810446597616661e-05, + "loss": 0.0831, + "num_input_tokens_seen": 12291488, + "step": 10095 + }, + { + "epoch": 1.1248468649070051, + "grad_norm": 1.3393336534500122, + "learning_rate": 2.8118387348257047e-05, + "loss": 0.1403, + "num_input_tokens_seen": 12297536, + "step": 10100 + }, + { + "epoch": 1.1254037197906226, + "grad_norm": 0.7561311721801758, + "learning_rate": 2.8132308720347478e-05, + "loss": 0.0717, + "num_input_tokens_seen": 12303808, + "step": 10105 + }, + { + "epoch": 1.1259605746742398, + "grad_norm": 0.5939314961433411, + "learning_rate": 2.8146230092437915e-05, + "loss": 0.1451, + "num_input_tokens_seen": 12310144, + "step": 10110 + }, + { + "epoch": 1.1265174295578573, + "grad_norm": 1.0416780710220337, + "learning_rate": 2.8160151464528346e-05, + "loss": 0.1962, + "num_input_tokens_seen": 12316096, + "step": 10115 + }, + { + "epoch": 1.1270742844414745, + "grad_norm": 0.8634747862815857, + "learning_rate": 2.817407283661878e-05, + "loss": 0.0966, + "num_input_tokens_seen": 12322496, + "step": 10120 + }, + { + "epoch": 1.127631139325092, + "grad_norm": 0.18871742486953735, + "learning_rate": 2.818799420870921e-05, + "loss": 0.1277, + "num_input_tokens_seen": 12328576, + "step": 10125 + }, + { + "epoch": 1.1281879942087092, + "grad_norm": 1.3496649265289307, + "learning_rate": 2.8201915580799643e-05, + "loss": 0.1617, + "num_input_tokens_seen": 12334816, + "step": 10130 + }, + { + "epoch": 1.1287448490923264, + "grad_norm": 3.159237861633301, + "learning_rate": 2.821583695289008e-05, + "loss": 0.1255, + "num_input_tokens_seen": 12340448, + "step": 10135 + }, + { + "epoch": 1.1293017039759439, + "grad_norm": 1.1857423782348633, + "learning_rate": 2.822975832498051e-05, + "loss": 0.0858, + "num_input_tokens_seen": 12346592, + "step": 10140 + }, + { + "epoch": 1.129858558859561, + "grad_norm": 1.2995434999465942, + "learning_rate": 2.8243679697070946e-05, + "loss": 0.1002, + "num_input_tokens_seen": 12352832, + "step": 10145 + }, + { + "epoch": 1.1304154137431786, + "grad_norm": 0.2469072937965393, + "learning_rate": 2.8257601069161377e-05, + "loss": 0.0607, + "num_input_tokens_seen": 12358944, + "step": 10150 + }, + { + "epoch": 1.1309722686267958, + "grad_norm": 2.6972951889038086, + "learning_rate": 2.8271522441251814e-05, + "loss": 0.2052, + "num_input_tokens_seen": 12364832, + "step": 10155 + }, + { + "epoch": 1.1315291235104132, + "grad_norm": 1.1711310148239136, + "learning_rate": 2.8285443813342245e-05, + "loss": 0.1497, + "num_input_tokens_seen": 12370720, + "step": 10160 + }, + { + "epoch": 1.1320859783940305, + "grad_norm": 1.111283302307129, + "learning_rate": 2.829936518543268e-05, + "loss": 0.1726, + "num_input_tokens_seen": 12376768, + "step": 10165 + }, + { + "epoch": 1.132642833277648, + "grad_norm": 0.3989301025867462, + "learning_rate": 2.831328655752311e-05, + "loss": 0.2139, + "num_input_tokens_seen": 12381792, + "step": 10170 + }, + { + "epoch": 1.1331996881612652, + "grad_norm": 0.6494684815406799, + "learning_rate": 2.832720792961354e-05, + "loss": 0.1691, + "num_input_tokens_seen": 12388288, + "step": 10175 + }, + { + "epoch": 1.1337565430448824, + "grad_norm": 0.40851977467536926, + "learning_rate": 2.834112930170398e-05, + "loss": 0.072, + "num_input_tokens_seen": 12394400, + "step": 10180 + }, + { + "epoch": 1.1343133979284998, + "grad_norm": 0.043581437319517136, + "learning_rate": 2.835505067379441e-05, + "loss": 0.1071, + "num_input_tokens_seen": 12400640, + "step": 10185 + }, + { + "epoch": 1.1348702528121173, + "grad_norm": 0.5332591533660889, + "learning_rate": 2.8368972045884844e-05, + "loss": 0.0474, + "num_input_tokens_seen": 12406784, + "step": 10190 + }, + { + "epoch": 1.1354271076957345, + "grad_norm": 0.3630179464817047, + "learning_rate": 2.8382893417975275e-05, + "loss": 0.0921, + "num_input_tokens_seen": 12413120, + "step": 10195 + }, + { + "epoch": 1.1359839625793517, + "grad_norm": 0.9947589039802551, + "learning_rate": 2.8396814790065713e-05, + "loss": 0.2337, + "num_input_tokens_seen": 12418464, + "step": 10200 + }, + { + "epoch": 1.1365408174629692, + "grad_norm": 0.5945553779602051, + "learning_rate": 2.8410736162156144e-05, + "loss": 0.0499, + "num_input_tokens_seen": 12424544, + "step": 10205 + }, + { + "epoch": 1.1370976723465864, + "grad_norm": 0.4933774173259735, + "learning_rate": 2.842465753424658e-05, + "loss": 0.1664, + "num_input_tokens_seen": 12430400, + "step": 10210 + }, + { + "epoch": 1.1376545272302039, + "grad_norm": 0.20103320479393005, + "learning_rate": 2.843857890633701e-05, + "loss": 0.1681, + "num_input_tokens_seen": 12436704, + "step": 10215 + }, + { + "epoch": 1.1382113821138211, + "grad_norm": 0.8651533126831055, + "learning_rate": 2.845250027842744e-05, + "loss": 0.0502, + "num_input_tokens_seen": 12442720, + "step": 10220 + }, + { + "epoch": 1.1387682369974383, + "grad_norm": 1.0663999319076538, + "learning_rate": 2.8466421650517878e-05, + "loss": 0.2107, + "num_input_tokens_seen": 12448832, + "step": 10225 + }, + { + "epoch": 1.1393250918810558, + "grad_norm": 1.6797605752944946, + "learning_rate": 2.848034302260831e-05, + "loss": 0.235, + "num_input_tokens_seen": 12454976, + "step": 10230 + }, + { + "epoch": 1.1398819467646732, + "grad_norm": 0.6673113107681274, + "learning_rate": 2.8494264394698743e-05, + "loss": 0.1098, + "num_input_tokens_seen": 12460864, + "step": 10235 + }, + { + "epoch": 1.1404388016482905, + "grad_norm": 0.02373494952917099, + "learning_rate": 2.8508185766789174e-05, + "loss": 0.2012, + "num_input_tokens_seen": 12466784, + "step": 10240 + }, + { + "epoch": 1.1409956565319077, + "grad_norm": 0.3441801965236664, + "learning_rate": 2.8522107138879612e-05, + "loss": 0.1833, + "num_input_tokens_seen": 12472640, + "step": 10245 + }, + { + "epoch": 1.1415525114155252, + "grad_norm": 0.020122984424233437, + "learning_rate": 2.8536028510970043e-05, + "loss": 0.1521, + "num_input_tokens_seen": 12478720, + "step": 10250 + }, + { + "epoch": 1.1421093662991424, + "grad_norm": 0.9098536968231201, + "learning_rate": 2.8549949883060477e-05, + "loss": 0.0888, + "num_input_tokens_seen": 12485088, + "step": 10255 + }, + { + "epoch": 1.1426662211827598, + "grad_norm": 1.3107575178146362, + "learning_rate": 2.8563871255150908e-05, + "loss": 0.2093, + "num_input_tokens_seen": 12490912, + "step": 10260 + }, + { + "epoch": 1.143223076066377, + "grad_norm": 0.3755905032157898, + "learning_rate": 2.857779262724134e-05, + "loss": 0.0539, + "num_input_tokens_seen": 12497536, + "step": 10265 + }, + { + "epoch": 1.1437799309499945, + "grad_norm": 0.925727367401123, + "learning_rate": 2.8591713999331777e-05, + "loss": 0.1862, + "num_input_tokens_seen": 12503328, + "step": 10270 + }, + { + "epoch": 1.1443367858336118, + "grad_norm": 0.13181358575820923, + "learning_rate": 2.8605635371422208e-05, + "loss": 0.247, + "num_input_tokens_seen": 12509376, + "step": 10275 + }, + { + "epoch": 1.1448936407172292, + "grad_norm": 1.778127670288086, + "learning_rate": 2.8619556743512642e-05, + "loss": 0.1153, + "num_input_tokens_seen": 12515808, + "step": 10280 + }, + { + "epoch": 1.1454504956008464, + "grad_norm": 1.11405611038208, + "learning_rate": 2.8633478115603073e-05, + "loss": 0.1547, + "num_input_tokens_seen": 12522080, + "step": 10285 + }, + { + "epoch": 1.1460073504844637, + "grad_norm": 0.33431276679039, + "learning_rate": 2.864739948769351e-05, + "loss": 0.3238, + "num_input_tokens_seen": 12528352, + "step": 10290 + }, + { + "epoch": 1.1465642053680811, + "grad_norm": 1.7451674938201904, + "learning_rate": 2.866132085978394e-05, + "loss": 0.1401, + "num_input_tokens_seen": 12534496, + "step": 10295 + }, + { + "epoch": 1.1471210602516984, + "grad_norm": 1.053695559501648, + "learning_rate": 2.8675242231874376e-05, + "loss": 0.1662, + "num_input_tokens_seen": 12540576, + "step": 10300 + }, + { + "epoch": 1.1476779151353158, + "grad_norm": 0.8738763332366943, + "learning_rate": 2.8689163603964807e-05, + "loss": 0.2406, + "num_input_tokens_seen": 12546528, + "step": 10305 + }, + { + "epoch": 1.148234770018933, + "grad_norm": 2.3654050827026367, + "learning_rate": 2.8703084976055238e-05, + "loss": 0.1775, + "num_input_tokens_seen": 12552672, + "step": 10310 + }, + { + "epoch": 1.1487916249025505, + "grad_norm": 1.57909095287323, + "learning_rate": 2.8717006348145676e-05, + "loss": 0.2082, + "num_input_tokens_seen": 12558816, + "step": 10315 + }, + { + "epoch": 1.1493484797861677, + "grad_norm": 1.1915076971054077, + "learning_rate": 2.8730927720236106e-05, + "loss": 0.1748, + "num_input_tokens_seen": 12564608, + "step": 10320 + }, + { + "epoch": 1.1499053346697852, + "grad_norm": 1.1826107501983643, + "learning_rate": 2.874484909232654e-05, + "loss": 0.1553, + "num_input_tokens_seen": 12570368, + "step": 10325 + }, + { + "epoch": 1.1504621895534024, + "grad_norm": 0.28985872864723206, + "learning_rate": 2.8758770464416972e-05, + "loss": 0.1568, + "num_input_tokens_seen": 12576512, + "step": 10330 + }, + { + "epoch": 1.1510190444370196, + "grad_norm": 0.11829528957605362, + "learning_rate": 2.877269183650741e-05, + "loss": 0.363, + "num_input_tokens_seen": 12582560, + "step": 10335 + }, + { + "epoch": 1.151575899320637, + "grad_norm": 0.7801473140716553, + "learning_rate": 2.878661320859784e-05, + "loss": 0.0978, + "num_input_tokens_seen": 12589120, + "step": 10340 + }, + { + "epoch": 1.1521327542042543, + "grad_norm": 1.5532041788101196, + "learning_rate": 2.8800534580688278e-05, + "loss": 0.1128, + "num_input_tokens_seen": 12595328, + "step": 10345 + }, + { + "epoch": 1.1526896090878718, + "grad_norm": 0.46891912817955017, + "learning_rate": 2.8814455952778706e-05, + "loss": 0.1129, + "num_input_tokens_seen": 12601632, + "step": 10350 + }, + { + "epoch": 1.153246463971489, + "grad_norm": 1.0679572820663452, + "learning_rate": 2.8828377324869137e-05, + "loss": 0.1156, + "num_input_tokens_seen": 12607904, + "step": 10355 + }, + { + "epoch": 1.1538033188551065, + "grad_norm": 0.1894179880619049, + "learning_rate": 2.8842298696959574e-05, + "loss": 0.3039, + "num_input_tokens_seen": 12614272, + "step": 10360 + }, + { + "epoch": 1.1543601737387237, + "grad_norm": 0.7272555232048035, + "learning_rate": 2.8856220069050005e-05, + "loss": 0.1254, + "num_input_tokens_seen": 12620192, + "step": 10365 + }, + { + "epoch": 1.1549170286223411, + "grad_norm": 1.3260713815689087, + "learning_rate": 2.8870141441140443e-05, + "loss": 0.1279, + "num_input_tokens_seen": 12626272, + "step": 10370 + }, + { + "epoch": 1.1554738835059584, + "grad_norm": 0.03651243448257446, + "learning_rate": 2.888406281323087e-05, + "loss": 0.1966, + "num_input_tokens_seen": 12632288, + "step": 10375 + }, + { + "epoch": 1.1560307383895756, + "grad_norm": 0.738094687461853, + "learning_rate": 2.8897984185321308e-05, + "loss": 0.133, + "num_input_tokens_seen": 12637856, + "step": 10380 + }, + { + "epoch": 1.156587593273193, + "grad_norm": 0.3348749876022339, + "learning_rate": 2.891190555741174e-05, + "loss": 0.1459, + "num_input_tokens_seen": 12643872, + "step": 10385 + }, + { + "epoch": 1.1571444481568103, + "grad_norm": 0.8768423199653625, + "learning_rate": 2.8925826929502177e-05, + "loss": 0.2472, + "num_input_tokens_seen": 12649664, + "step": 10390 + }, + { + "epoch": 1.1577013030404277, + "grad_norm": 0.8857418894767761, + "learning_rate": 2.8939748301592608e-05, + "loss": 0.1143, + "num_input_tokens_seen": 12655872, + "step": 10395 + }, + { + "epoch": 1.158258157924045, + "grad_norm": 0.11580786108970642, + "learning_rate": 2.8953669673683035e-05, + "loss": 0.1965, + "num_input_tokens_seen": 12661792, + "step": 10400 + }, + { + "epoch": 1.1588150128076624, + "grad_norm": 0.05995440110564232, + "learning_rate": 2.8967591045773473e-05, + "loss": 0.1038, + "num_input_tokens_seen": 12668128, + "step": 10405 + }, + { + "epoch": 1.1593718676912796, + "grad_norm": 0.24246680736541748, + "learning_rate": 2.8981512417863904e-05, + "loss": 0.1991, + "num_input_tokens_seen": 12674336, + "step": 10410 + }, + { + "epoch": 1.159928722574897, + "grad_norm": 1.2907838821411133, + "learning_rate": 2.8995433789954342e-05, + "loss": 0.0886, + "num_input_tokens_seen": 12680480, + "step": 10415 + }, + { + "epoch": 1.1604855774585143, + "grad_norm": 2.4855401515960693, + "learning_rate": 2.9009355162044773e-05, + "loss": 0.3202, + "num_input_tokens_seen": 12686496, + "step": 10420 + }, + { + "epoch": 1.1610424323421316, + "grad_norm": 0.6680050492286682, + "learning_rate": 2.9023276534135207e-05, + "loss": 0.0801, + "num_input_tokens_seen": 12692544, + "step": 10425 + }, + { + "epoch": 1.161599287225749, + "grad_norm": 1.4825475215911865, + "learning_rate": 2.9037197906225638e-05, + "loss": 0.1628, + "num_input_tokens_seen": 12698784, + "step": 10430 + }, + { + "epoch": 1.1621561421093662, + "grad_norm": 1.3700119256973267, + "learning_rate": 2.9051119278316076e-05, + "loss": 0.0683, + "num_input_tokens_seen": 12704832, + "step": 10435 + }, + { + "epoch": 1.1627129969929837, + "grad_norm": 1.343968152999878, + "learning_rate": 2.9065040650406507e-05, + "loss": 0.1093, + "num_input_tokens_seen": 12710880, + "step": 10440 + }, + { + "epoch": 1.163269851876601, + "grad_norm": 0.14282456040382385, + "learning_rate": 2.9078962022496938e-05, + "loss": 0.1522, + "num_input_tokens_seen": 12716416, + "step": 10445 + }, + { + "epoch": 1.1638267067602184, + "grad_norm": 0.45144277811050415, + "learning_rate": 2.9092883394587372e-05, + "loss": 0.1097, + "num_input_tokens_seen": 12722368, + "step": 10450 + }, + { + "epoch": 1.1643835616438356, + "grad_norm": 0.020302901044487953, + "learning_rate": 2.9106804766677803e-05, + "loss": 0.2839, + "num_input_tokens_seen": 12728576, + "step": 10455 + }, + { + "epoch": 1.164940416527453, + "grad_norm": 0.07980433851480484, + "learning_rate": 2.912072613876824e-05, + "loss": 0.0968, + "num_input_tokens_seen": 12734880, + "step": 10460 + }, + { + "epoch": 1.1654972714110703, + "grad_norm": 0.7363187074661255, + "learning_rate": 2.913464751085867e-05, + "loss": 0.0983, + "num_input_tokens_seen": 12741120, + "step": 10465 + }, + { + "epoch": 1.1660541262946875, + "grad_norm": 0.351031094789505, + "learning_rate": 2.9148568882949106e-05, + "loss": 0.0807, + "num_input_tokens_seen": 12747424, + "step": 10470 + }, + { + "epoch": 1.166610981178305, + "grad_norm": 0.8948748111724854, + "learning_rate": 2.9162490255039537e-05, + "loss": 0.0854, + "num_input_tokens_seen": 12753792, + "step": 10475 + }, + { + "epoch": 1.1671678360619222, + "grad_norm": 0.39772921800613403, + "learning_rate": 2.9176411627129974e-05, + "loss": 0.1587, + "num_input_tokens_seen": 12760192, + "step": 10480 + }, + { + "epoch": 1.1677246909455397, + "grad_norm": 1.4849098920822144, + "learning_rate": 2.9190332999220405e-05, + "loss": 0.1296, + "num_input_tokens_seen": 12766208, + "step": 10485 + }, + { + "epoch": 1.1682815458291569, + "grad_norm": 0.33363720774650574, + "learning_rate": 2.9204254371310836e-05, + "loss": 0.0961, + "num_input_tokens_seen": 12772288, + "step": 10490 + }, + { + "epoch": 1.1688384007127743, + "grad_norm": 0.6130156517028809, + "learning_rate": 2.921817574340127e-05, + "loss": 0.0932, + "num_input_tokens_seen": 12778432, + "step": 10495 + }, + { + "epoch": 1.1693952555963916, + "grad_norm": 2.0576019287109375, + "learning_rate": 2.92320971154917e-05, + "loss": 0.2079, + "num_input_tokens_seen": 12784832, + "step": 10500 + }, + { + "epoch": 1.169952110480009, + "grad_norm": 1.171205997467041, + "learning_rate": 2.924601848758214e-05, + "loss": 0.1433, + "num_input_tokens_seen": 12790880, + "step": 10505 + }, + { + "epoch": 1.1705089653636263, + "grad_norm": 0.27796655893325806, + "learning_rate": 2.925993985967257e-05, + "loss": 0.1258, + "num_input_tokens_seen": 12796768, + "step": 10510 + }, + { + "epoch": 1.1710658202472435, + "grad_norm": 1.1630905866622925, + "learning_rate": 2.9273861231763005e-05, + "loss": 0.178, + "num_input_tokens_seen": 12802912, + "step": 10515 + }, + { + "epoch": 1.171622675130861, + "grad_norm": 0.9769444465637207, + "learning_rate": 2.9287782603853436e-05, + "loss": 0.1429, + "num_input_tokens_seen": 12809152, + "step": 10520 + }, + { + "epoch": 1.1721795300144782, + "grad_norm": 0.9530826210975647, + "learning_rate": 2.9301703975943873e-05, + "loss": 0.1215, + "num_input_tokens_seen": 12815424, + "step": 10525 + }, + { + "epoch": 1.1727363848980956, + "grad_norm": 0.04434185102581978, + "learning_rate": 2.9315625348034304e-05, + "loss": 0.0639, + "num_input_tokens_seen": 12821536, + "step": 10530 + }, + { + "epoch": 1.1732932397817128, + "grad_norm": 1.67789888381958, + "learning_rate": 2.9329546720124735e-05, + "loss": 0.1728, + "num_input_tokens_seen": 12827520, + "step": 10535 + }, + { + "epoch": 1.1738500946653303, + "grad_norm": 1.1945849657058716, + "learning_rate": 2.934346809221517e-05, + "loss": 0.2471, + "num_input_tokens_seen": 12833504, + "step": 10540 + }, + { + "epoch": 1.1744069495489475, + "grad_norm": 0.16261084377765656, + "learning_rate": 2.93573894643056e-05, + "loss": 0.1507, + "num_input_tokens_seen": 12839488, + "step": 10545 + }, + { + "epoch": 1.174963804432565, + "grad_norm": 1.311678171157837, + "learning_rate": 2.9371310836396038e-05, + "loss": 0.0733, + "num_input_tokens_seen": 12845856, + "step": 10550 + }, + { + "epoch": 1.1755206593161822, + "grad_norm": 0.8432148694992065, + "learning_rate": 2.938523220848647e-05, + "loss": 0.1815, + "num_input_tokens_seen": 12851552, + "step": 10555 + }, + { + "epoch": 1.1760775141997994, + "grad_norm": 0.3634023368358612, + "learning_rate": 2.9399153580576903e-05, + "loss": 0.1194, + "num_input_tokens_seen": 12857824, + "step": 10560 + }, + { + "epoch": 1.176634369083417, + "grad_norm": 0.8697482347488403, + "learning_rate": 2.9413074952667334e-05, + "loss": 0.1903, + "num_input_tokens_seen": 12863392, + "step": 10565 + }, + { + "epoch": 1.1771912239670341, + "grad_norm": 0.1254614293575287, + "learning_rate": 2.9426996324757772e-05, + "loss": 0.1339, + "num_input_tokens_seen": 12869760, + "step": 10570 + }, + { + "epoch": 1.1777480788506516, + "grad_norm": 0.4375044107437134, + "learning_rate": 2.9440917696848203e-05, + "loss": 0.1658, + "num_input_tokens_seen": 12875776, + "step": 10575 + }, + { + "epoch": 1.1783049337342688, + "grad_norm": 1.8149017095565796, + "learning_rate": 2.9454839068938634e-05, + "loss": 0.1749, + "num_input_tokens_seen": 12881696, + "step": 10580 + }, + { + "epoch": 1.1788617886178863, + "grad_norm": 0.5699126124382019, + "learning_rate": 2.9468760441029068e-05, + "loss": 0.0714, + "num_input_tokens_seen": 12887456, + "step": 10585 + }, + { + "epoch": 1.1794186435015035, + "grad_norm": 1.0465364456176758, + "learning_rate": 2.94826818131195e-05, + "loss": 0.2753, + "num_input_tokens_seen": 12893632, + "step": 10590 + }, + { + "epoch": 1.179975498385121, + "grad_norm": 0.7232337594032288, + "learning_rate": 2.9496603185209937e-05, + "loss": 0.0581, + "num_input_tokens_seen": 12899904, + "step": 10595 + }, + { + "epoch": 1.1805323532687382, + "grad_norm": 1.0577995777130127, + "learning_rate": 2.9510524557300368e-05, + "loss": 0.1713, + "num_input_tokens_seen": 12906112, + "step": 10600 + }, + { + "epoch": 1.1810892081523554, + "grad_norm": 0.7377578020095825, + "learning_rate": 2.9524445929390802e-05, + "loss": 0.2801, + "num_input_tokens_seen": 12912160, + "step": 10605 + }, + { + "epoch": 1.1816460630359729, + "grad_norm": 0.5348569750785828, + "learning_rate": 2.9538367301481233e-05, + "loss": 0.1348, + "num_input_tokens_seen": 12918496, + "step": 10610 + }, + { + "epoch": 1.18220291791959, + "grad_norm": 0.5675197839736938, + "learning_rate": 2.955228867357167e-05, + "loss": 0.0951, + "num_input_tokens_seen": 12924672, + "step": 10615 + }, + { + "epoch": 1.1827597728032075, + "grad_norm": 0.2532382011413574, + "learning_rate": 2.9566210045662102e-05, + "loss": 0.1898, + "num_input_tokens_seen": 12930944, + "step": 10620 + }, + { + "epoch": 1.1833166276868248, + "grad_norm": 1.2581210136413574, + "learning_rate": 2.9580131417752533e-05, + "loss": 0.2085, + "num_input_tokens_seen": 12936992, + "step": 10625 + }, + { + "epoch": 1.1838734825704422, + "grad_norm": 0.5649065971374512, + "learning_rate": 2.9594052789842967e-05, + "loss": 0.0938, + "num_input_tokens_seen": 12943488, + "step": 10630 + }, + { + "epoch": 1.1844303374540595, + "grad_norm": 0.6170865297317505, + "learning_rate": 2.9607974161933398e-05, + "loss": 0.1758, + "num_input_tokens_seen": 12949632, + "step": 10635 + }, + { + "epoch": 1.184987192337677, + "grad_norm": 0.7477676272392273, + "learning_rate": 2.9621895534023836e-05, + "loss": 0.1361, + "num_input_tokens_seen": 12955808, + "step": 10640 + }, + { + "epoch": 1.1855440472212941, + "grad_norm": 0.3941863775253296, + "learning_rate": 2.9635816906114267e-05, + "loss": 0.1881, + "num_input_tokens_seen": 12961696, + "step": 10645 + }, + { + "epoch": 1.1861009021049114, + "grad_norm": 0.8520174622535706, + "learning_rate": 2.9649738278204704e-05, + "loss": 0.3047, + "num_input_tokens_seen": 12967840, + "step": 10650 + }, + { + "epoch": 1.1866577569885288, + "grad_norm": 0.8160400390625, + "learning_rate": 2.9663659650295132e-05, + "loss": 0.191, + "num_input_tokens_seen": 12973376, + "step": 10655 + }, + { + "epoch": 1.187214611872146, + "grad_norm": 1.4971635341644287, + "learning_rate": 2.967758102238557e-05, + "loss": 0.1813, + "num_input_tokens_seen": 12979584, + "step": 10660 + }, + { + "epoch": 1.1877714667557635, + "grad_norm": 1.02728271484375, + "learning_rate": 2.9691502394476e-05, + "loss": 0.233, + "num_input_tokens_seen": 12985888, + "step": 10665 + }, + { + "epoch": 1.1883283216393807, + "grad_norm": 1.0627743005752563, + "learning_rate": 2.970542376656643e-05, + "loss": 0.1846, + "num_input_tokens_seen": 12991968, + "step": 10670 + }, + { + "epoch": 1.1888851765229982, + "grad_norm": 1.0522857904434204, + "learning_rate": 2.971934513865687e-05, + "loss": 0.3206, + "num_input_tokens_seen": 12998368, + "step": 10675 + }, + { + "epoch": 1.1894420314066154, + "grad_norm": 0.029093820601701736, + "learning_rate": 2.97332665107473e-05, + "loss": 0.0621, + "num_input_tokens_seen": 13004704, + "step": 10680 + }, + { + "epoch": 1.1899988862902329, + "grad_norm": 0.6345466375350952, + "learning_rate": 2.9747187882837734e-05, + "loss": 0.1117, + "num_input_tokens_seen": 13010752, + "step": 10685 + }, + { + "epoch": 1.19055574117385, + "grad_norm": 1.5300462245941162, + "learning_rate": 2.9761109254928165e-05, + "loss": 0.241, + "num_input_tokens_seen": 13016832, + "step": 10690 + }, + { + "epoch": 1.1911125960574673, + "grad_norm": 1.809888243675232, + "learning_rate": 2.9775030627018603e-05, + "loss": 0.1483, + "num_input_tokens_seen": 13023360, + "step": 10695 + }, + { + "epoch": 1.1916694509410848, + "grad_norm": 1.6357121467590332, + "learning_rate": 2.9788951999109034e-05, + "loss": 0.1782, + "num_input_tokens_seen": 13028832, + "step": 10700 + }, + { + "epoch": 1.192226305824702, + "grad_norm": 3.7758498191833496, + "learning_rate": 2.980287337119947e-05, + "loss": 0.0875, + "num_input_tokens_seen": 13035104, + "step": 10705 + }, + { + "epoch": 1.1927831607083195, + "grad_norm": 1.0020372867584229, + "learning_rate": 2.98167947432899e-05, + "loss": 0.181, + "num_input_tokens_seen": 13040768, + "step": 10710 + }, + { + "epoch": 1.1933400155919367, + "grad_norm": 0.16012205183506012, + "learning_rate": 2.9830716115380337e-05, + "loss": 0.1709, + "num_input_tokens_seen": 13047008, + "step": 10715 + }, + { + "epoch": 1.1938968704755542, + "grad_norm": 0.9192321300506592, + "learning_rate": 2.9844637487470768e-05, + "loss": 0.1612, + "num_input_tokens_seen": 13053152, + "step": 10720 + }, + { + "epoch": 1.1944537253591714, + "grad_norm": 0.5294702053070068, + "learning_rate": 2.98585588595612e-05, + "loss": 0.0743, + "num_input_tokens_seen": 13059520, + "step": 10725 + }, + { + "epoch": 1.1950105802427888, + "grad_norm": 0.2569234371185303, + "learning_rate": 2.9872480231651633e-05, + "loss": 0.1104, + "num_input_tokens_seen": 13066080, + "step": 10730 + }, + { + "epoch": 1.195567435126406, + "grad_norm": 0.7018679976463318, + "learning_rate": 2.9886401603742064e-05, + "loss": 0.182, + "num_input_tokens_seen": 13072096, + "step": 10735 + }, + { + "epoch": 1.1961242900100233, + "grad_norm": 0.04837426170706749, + "learning_rate": 2.9900322975832502e-05, + "loss": 0.1396, + "num_input_tokens_seen": 13078336, + "step": 10740 + }, + { + "epoch": 1.1966811448936407, + "grad_norm": 1.3588374853134155, + "learning_rate": 2.9914244347922933e-05, + "loss": 0.2256, + "num_input_tokens_seen": 13083904, + "step": 10745 + }, + { + "epoch": 1.197237999777258, + "grad_norm": 1.4254220724105835, + "learning_rate": 2.9928165720013367e-05, + "loss": 0.107, + "num_input_tokens_seen": 13090048, + "step": 10750 + }, + { + "epoch": 1.1977948546608754, + "grad_norm": 1.0118014812469482, + "learning_rate": 2.9942087092103798e-05, + "loss": 0.1425, + "num_input_tokens_seen": 13096352, + "step": 10755 + }, + { + "epoch": 1.1983517095444927, + "grad_norm": 0.5257288217544556, + "learning_rate": 2.9956008464194236e-05, + "loss": 0.2436, + "num_input_tokens_seen": 13102336, + "step": 10760 + }, + { + "epoch": 1.1989085644281101, + "grad_norm": 0.2299557328224182, + "learning_rate": 2.9969929836284667e-05, + "loss": 0.1924, + "num_input_tokens_seen": 13108320, + "step": 10765 + }, + { + "epoch": 1.1994654193117273, + "grad_norm": 0.8212246298789978, + "learning_rate": 2.9983851208375098e-05, + "loss": 0.1855, + "num_input_tokens_seen": 13114368, + "step": 10770 + }, + { + "epoch": 1.2000222741953448, + "grad_norm": 0.7684857249259949, + "learning_rate": 2.9997772580465532e-05, + "loss": 0.2026, + "num_input_tokens_seen": 13120064, + "step": 10775 + }, + { + "epoch": 1.200579129078962, + "grad_norm": 1.0143282413482666, + "learning_rate": 3.0011693952555963e-05, + "loss": 0.2584, + "num_input_tokens_seen": 13126112, + "step": 10780 + }, + { + "epoch": 1.2011359839625793, + "grad_norm": 1.047127366065979, + "learning_rate": 3.00256153246464e-05, + "loss": 0.1733, + "num_input_tokens_seen": 13131936, + "step": 10785 + }, + { + "epoch": 1.2016928388461967, + "grad_norm": 1.2867435216903687, + "learning_rate": 3.003953669673683e-05, + "loss": 0.2319, + "num_input_tokens_seen": 13138240, + "step": 10790 + }, + { + "epoch": 1.202249693729814, + "grad_norm": 0.8688098192214966, + "learning_rate": 3.0053458068827266e-05, + "loss": 0.1156, + "num_input_tokens_seen": 13144128, + "step": 10795 + }, + { + "epoch": 1.2028065486134314, + "grad_norm": 1.282662272453308, + "learning_rate": 3.0067379440917697e-05, + "loss": 0.1255, + "num_input_tokens_seen": 13150144, + "step": 10800 + }, + { + "epoch": 1.2033634034970486, + "grad_norm": 3.002190351486206, + "learning_rate": 3.0081300813008135e-05, + "loss": 0.2666, + "num_input_tokens_seen": 13156288, + "step": 10805 + }, + { + "epoch": 1.203920258380666, + "grad_norm": 0.9858518242835999, + "learning_rate": 3.0095222185098566e-05, + "loss": 0.2159, + "num_input_tokens_seen": 13162528, + "step": 10810 + }, + { + "epoch": 1.2044771132642833, + "grad_norm": 2.9916157722473145, + "learning_rate": 3.0109143557188997e-05, + "loss": 0.1432, + "num_input_tokens_seen": 13168384, + "step": 10815 + }, + { + "epoch": 1.2050339681479008, + "grad_norm": 2.4867284297943115, + "learning_rate": 3.012306492927943e-05, + "loss": 0.219, + "num_input_tokens_seen": 13174208, + "step": 10820 + }, + { + "epoch": 1.205590823031518, + "grad_norm": 0.387638658285141, + "learning_rate": 3.0136986301369862e-05, + "loss": 0.1698, + "num_input_tokens_seen": 13180480, + "step": 10825 + }, + { + "epoch": 1.2061476779151352, + "grad_norm": 0.8664297461509705, + "learning_rate": 3.01509076734603e-05, + "loss": 0.1601, + "num_input_tokens_seen": 13186272, + "step": 10830 + }, + { + "epoch": 1.2067045327987527, + "grad_norm": 0.47376298904418945, + "learning_rate": 3.016482904555073e-05, + "loss": 0.0644, + "num_input_tokens_seen": 13192288, + "step": 10835 + }, + { + "epoch": 1.20726138768237, + "grad_norm": 1.11573326587677, + "learning_rate": 3.0178750417641165e-05, + "loss": 0.2593, + "num_input_tokens_seen": 13198144, + "step": 10840 + }, + { + "epoch": 1.2078182425659874, + "grad_norm": 0.8968383073806763, + "learning_rate": 3.0192671789731596e-05, + "loss": 0.0639, + "num_input_tokens_seen": 13204576, + "step": 10845 + }, + { + "epoch": 1.2083750974496046, + "grad_norm": 0.6341099143028259, + "learning_rate": 3.0206593161822033e-05, + "loss": 0.2189, + "num_input_tokens_seen": 13210560, + "step": 10850 + }, + { + "epoch": 1.208931952333222, + "grad_norm": 0.32412227988243103, + "learning_rate": 3.0220514533912464e-05, + "loss": 0.1278, + "num_input_tokens_seen": 13216800, + "step": 10855 + }, + { + "epoch": 1.2094888072168393, + "grad_norm": 0.2583045959472656, + "learning_rate": 3.0234435906002895e-05, + "loss": 0.2092, + "num_input_tokens_seen": 13223136, + "step": 10860 + }, + { + "epoch": 1.2100456621004567, + "grad_norm": 3.4596548080444336, + "learning_rate": 3.024835727809333e-05, + "loss": 0.1641, + "num_input_tokens_seen": 13229312, + "step": 10865 + }, + { + "epoch": 1.210602516984074, + "grad_norm": 2.1931440830230713, + "learning_rate": 3.026227865018376e-05, + "loss": 0.1722, + "num_input_tokens_seen": 13235616, + "step": 10870 + }, + { + "epoch": 1.2111593718676912, + "grad_norm": 0.7042754292488098, + "learning_rate": 3.02762000222742e-05, + "loss": 0.1621, + "num_input_tokens_seen": 13241696, + "step": 10875 + }, + { + "epoch": 1.2117162267513086, + "grad_norm": 1.7082210779190063, + "learning_rate": 3.029012139436463e-05, + "loss": 0.1891, + "num_input_tokens_seen": 13247872, + "step": 10880 + }, + { + "epoch": 1.2122730816349259, + "grad_norm": 0.8167304396629333, + "learning_rate": 3.0304042766455064e-05, + "loss": 0.1298, + "num_input_tokens_seen": 13254016, + "step": 10885 + }, + { + "epoch": 1.2128299365185433, + "grad_norm": 0.5215151309967041, + "learning_rate": 3.0317964138545495e-05, + "loss": 0.1583, + "num_input_tokens_seen": 13260032, + "step": 10890 + }, + { + "epoch": 1.2133867914021605, + "grad_norm": 0.3813779652118683, + "learning_rate": 3.0331885510635932e-05, + "loss": 0.0534, + "num_input_tokens_seen": 13266080, + "step": 10895 + }, + { + "epoch": 1.213943646285778, + "grad_norm": 1.901391863822937, + "learning_rate": 3.0345806882726363e-05, + "loss": 0.1539, + "num_input_tokens_seen": 13272704, + "step": 10900 + }, + { + "epoch": 1.2145005011693952, + "grad_norm": 0.4250619411468506, + "learning_rate": 3.0359728254816794e-05, + "loss": 0.0728, + "num_input_tokens_seen": 13279168, + "step": 10905 + }, + { + "epoch": 1.2150573560530127, + "grad_norm": 0.15485695004463196, + "learning_rate": 3.0373649626907232e-05, + "loss": 0.1271, + "num_input_tokens_seen": 13285344, + "step": 10910 + }, + { + "epoch": 1.21561421093663, + "grad_norm": 0.9842609763145447, + "learning_rate": 3.038757099899766e-05, + "loss": 0.1195, + "num_input_tokens_seen": 13291264, + "step": 10915 + }, + { + "epoch": 1.2161710658202471, + "grad_norm": 0.5282835364341736, + "learning_rate": 3.0401492371088097e-05, + "loss": 0.1385, + "num_input_tokens_seen": 13297024, + "step": 10920 + }, + { + "epoch": 1.2167279207038646, + "grad_norm": 0.4698047935962677, + "learning_rate": 3.0415413743178528e-05, + "loss": 0.3452, + "num_input_tokens_seen": 13302784, + "step": 10925 + }, + { + "epoch": 1.2172847755874818, + "grad_norm": 0.427817702293396, + "learning_rate": 3.0429335115268966e-05, + "loss": 0.2797, + "num_input_tokens_seen": 13308800, + "step": 10930 + }, + { + "epoch": 1.2178416304710993, + "grad_norm": 1.3348263502120972, + "learning_rate": 3.0443256487359397e-05, + "loss": 0.1525, + "num_input_tokens_seen": 13314656, + "step": 10935 + }, + { + "epoch": 1.2183984853547165, + "grad_norm": 1.6770727634429932, + "learning_rate": 3.045717785944983e-05, + "loss": 0.2103, + "num_input_tokens_seen": 13320128, + "step": 10940 + }, + { + "epoch": 1.218955340238334, + "grad_norm": 0.8858080506324768, + "learning_rate": 3.0471099231540262e-05, + "loss": 0.1263, + "num_input_tokens_seen": 13326304, + "step": 10945 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 0.6681308746337891, + "learning_rate": 3.0485020603630693e-05, + "loss": 0.1683, + "num_input_tokens_seen": 13332608, + "step": 10950 + }, + { + "epoch": 1.2200690500055686, + "grad_norm": 1.2259418964385986, + "learning_rate": 3.049894197572113e-05, + "loss": 0.161, + "num_input_tokens_seen": 13338560, + "step": 10955 + }, + { + "epoch": 1.2206259048891859, + "grad_norm": 1.2824811935424805, + "learning_rate": 3.051286334781156e-05, + "loss": 0.2649, + "num_input_tokens_seen": 13344096, + "step": 10960 + }, + { + "epoch": 1.221182759772803, + "grad_norm": 1.291749119758606, + "learning_rate": 3.0526784719902e-05, + "loss": 0.2377, + "num_input_tokens_seen": 13350496, + "step": 10965 + }, + { + "epoch": 1.2217396146564206, + "grad_norm": 0.07909068465232849, + "learning_rate": 3.054070609199243e-05, + "loss": 0.0983, + "num_input_tokens_seen": 13356704, + "step": 10970 + }, + { + "epoch": 1.2222964695400378, + "grad_norm": 0.7435467839241028, + "learning_rate": 3.055462746408286e-05, + "loss": 0.1654, + "num_input_tokens_seen": 13361984, + "step": 10975 + }, + { + "epoch": 1.2228533244236552, + "grad_norm": 0.9810774326324463, + "learning_rate": 3.0568548836173296e-05, + "loss": 0.1626, + "num_input_tokens_seen": 13367840, + "step": 10980 + }, + { + "epoch": 1.2234101793072725, + "grad_norm": 0.24422089755535126, + "learning_rate": 3.058247020826373e-05, + "loss": 0.2145, + "num_input_tokens_seen": 13373952, + "step": 10985 + }, + { + "epoch": 1.22396703419089, + "grad_norm": 0.3581104278564453, + "learning_rate": 3.0596391580354164e-05, + "loss": 0.1184, + "num_input_tokens_seen": 13378976, + "step": 10990 + }, + { + "epoch": 1.2245238890745072, + "grad_norm": 0.5192462801933289, + "learning_rate": 3.061031295244459e-05, + "loss": 0.1106, + "num_input_tokens_seen": 13384864, + "step": 10995 + }, + { + "epoch": 1.2250807439581246, + "grad_norm": 1.4339717626571655, + "learning_rate": 3.0624234324535026e-05, + "loss": 0.1665, + "num_input_tokens_seen": 13391328, + "step": 11000 + }, + { + "epoch": 1.2256375988417418, + "grad_norm": 0.6427861452102661, + "learning_rate": 3.063815569662546e-05, + "loss": 0.1545, + "num_input_tokens_seen": 13397408, + "step": 11005 + }, + { + "epoch": 1.226194453725359, + "grad_norm": 0.4099884033203125, + "learning_rate": 3.0652077068715895e-05, + "loss": 0.0693, + "num_input_tokens_seen": 13403552, + "step": 11010 + }, + { + "epoch": 1.2267513086089765, + "grad_norm": 0.7548840045928955, + "learning_rate": 3.066599844080633e-05, + "loss": 0.1277, + "num_input_tokens_seen": 13409792, + "step": 11015 + }, + { + "epoch": 1.2273081634925938, + "grad_norm": 1.023712396621704, + "learning_rate": 3.067991981289676e-05, + "loss": 0.0979, + "num_input_tokens_seen": 13416224, + "step": 11020 + }, + { + "epoch": 1.2278650183762112, + "grad_norm": 0.9068801999092102, + "learning_rate": 3.069384118498719e-05, + "loss": 0.2114, + "num_input_tokens_seen": 13422272, + "step": 11025 + }, + { + "epoch": 1.2284218732598284, + "grad_norm": 0.4805375635623932, + "learning_rate": 3.070776255707763e-05, + "loss": 0.228, + "num_input_tokens_seen": 13428224, + "step": 11030 + }, + { + "epoch": 1.2289787281434459, + "grad_norm": 0.34207883477211, + "learning_rate": 3.072168392916806e-05, + "loss": 0.1618, + "num_input_tokens_seen": 13433856, + "step": 11035 + }, + { + "epoch": 1.2295355830270631, + "grad_norm": 0.6879488229751587, + "learning_rate": 3.0735605301258494e-05, + "loss": 0.1556, + "num_input_tokens_seen": 13439808, + "step": 11040 + }, + { + "epoch": 1.2300924379106806, + "grad_norm": 0.626532793045044, + "learning_rate": 3.074952667334893e-05, + "loss": 0.1495, + "num_input_tokens_seen": 13446080, + "step": 11045 + }, + { + "epoch": 1.2306492927942978, + "grad_norm": 1.2032833099365234, + "learning_rate": 3.0763448045439356e-05, + "loss": 0.0675, + "num_input_tokens_seen": 13452192, + "step": 11050 + }, + { + "epoch": 1.231206147677915, + "grad_norm": 0.6204188466072083, + "learning_rate": 3.07773694175298e-05, + "loss": 0.0768, + "num_input_tokens_seen": 13458304, + "step": 11055 + }, + { + "epoch": 1.2317630025615325, + "grad_norm": 0.5981226563453674, + "learning_rate": 3.0791290789620224e-05, + "loss": 0.1231, + "num_input_tokens_seen": 13464480, + "step": 11060 + }, + { + "epoch": 1.2323198574451497, + "grad_norm": 0.12699079513549805, + "learning_rate": 3.080521216171066e-05, + "loss": 0.1589, + "num_input_tokens_seen": 13470656, + "step": 11065 + }, + { + "epoch": 1.2328767123287672, + "grad_norm": 1.0216331481933594, + "learning_rate": 3.081913353380109e-05, + "loss": 0.1173, + "num_input_tokens_seen": 13476672, + "step": 11070 + }, + { + "epoch": 1.2334335672123844, + "grad_norm": 0.932922899723053, + "learning_rate": 3.083305490589153e-05, + "loss": 0.1235, + "num_input_tokens_seen": 13482816, + "step": 11075 + }, + { + "epoch": 1.2339904220960018, + "grad_norm": 1.3360613584518433, + "learning_rate": 3.084697627798196e-05, + "loss": 0.0782, + "num_input_tokens_seen": 13488896, + "step": 11080 + }, + { + "epoch": 1.234547276979619, + "grad_norm": 0.11567474901676178, + "learning_rate": 3.086089765007239e-05, + "loss": 0.1495, + "num_input_tokens_seen": 13495264, + "step": 11085 + }, + { + "epoch": 1.2351041318632365, + "grad_norm": 0.4301244914531708, + "learning_rate": 3.0874819022162824e-05, + "loss": 0.2244, + "num_input_tokens_seen": 13501184, + "step": 11090 + }, + { + "epoch": 1.2356609867468538, + "grad_norm": 0.4890909790992737, + "learning_rate": 3.088874039425326e-05, + "loss": 0.0878, + "num_input_tokens_seen": 13507232, + "step": 11095 + }, + { + "epoch": 1.236217841630471, + "grad_norm": 0.5996005535125732, + "learning_rate": 3.090266176634369e-05, + "loss": 0.2314, + "num_input_tokens_seen": 13513216, + "step": 11100 + }, + { + "epoch": 1.2367746965140884, + "grad_norm": 1.2533841133117676, + "learning_rate": 3.0916583138434127e-05, + "loss": 0.1596, + "num_input_tokens_seen": 13519680, + "step": 11105 + }, + { + "epoch": 1.2373315513977057, + "grad_norm": 0.04988943785429001, + "learning_rate": 3.093050451052456e-05, + "loss": 0.0584, + "num_input_tokens_seen": 13525824, + "step": 11110 + }, + { + "epoch": 1.2378884062813231, + "grad_norm": 0.6291336417198181, + "learning_rate": 3.094442588261499e-05, + "loss": 0.1109, + "num_input_tokens_seen": 13531936, + "step": 11115 + }, + { + "epoch": 1.2384452611649404, + "grad_norm": 0.8186628818511963, + "learning_rate": 3.095834725470543e-05, + "loss": 0.2369, + "num_input_tokens_seen": 13538048, + "step": 11120 + }, + { + "epoch": 1.2390021160485578, + "grad_norm": 0.06308270245790482, + "learning_rate": 3.097226862679586e-05, + "loss": 0.1047, + "num_input_tokens_seen": 13544480, + "step": 11125 + }, + { + "epoch": 1.239558970932175, + "grad_norm": 1.1466318368911743, + "learning_rate": 3.098618999888629e-05, + "loss": 0.0983, + "num_input_tokens_seen": 13550656, + "step": 11130 + }, + { + "epoch": 1.2401158258157925, + "grad_norm": 0.7658427357673645, + "learning_rate": 3.1000111370976726e-05, + "loss": 0.1378, + "num_input_tokens_seen": 13556640, + "step": 11135 + }, + { + "epoch": 1.2406726806994097, + "grad_norm": 0.19902941584587097, + "learning_rate": 3.101403274306715e-05, + "loss": 0.0724, + "num_input_tokens_seen": 13562656, + "step": 11140 + }, + { + "epoch": 1.241229535583027, + "grad_norm": 0.5580900311470032, + "learning_rate": 3.1027954115157594e-05, + "loss": 0.0836, + "num_input_tokens_seen": 13568640, + "step": 11145 + }, + { + "epoch": 1.2417863904666444, + "grad_norm": 1.048759937286377, + "learning_rate": 3.104187548724802e-05, + "loss": 0.1392, + "num_input_tokens_seen": 13574816, + "step": 11150 + }, + { + "epoch": 1.2423432453502616, + "grad_norm": 1.0499378442764282, + "learning_rate": 3.1055796859338456e-05, + "loss": 0.2874, + "num_input_tokens_seen": 13580800, + "step": 11155 + }, + { + "epoch": 1.242900100233879, + "grad_norm": 0.04719575121998787, + "learning_rate": 3.106971823142889e-05, + "loss": 0.1014, + "num_input_tokens_seen": 13587040, + "step": 11160 + }, + { + "epoch": 1.2434569551174963, + "grad_norm": 0.3390818238258362, + "learning_rate": 3.1083639603519325e-05, + "loss": 0.0468, + "num_input_tokens_seen": 13593248, + "step": 11165 + }, + { + "epoch": 1.2440138100011138, + "grad_norm": 0.6935365796089172, + "learning_rate": 3.109756097560976e-05, + "loss": 0.1762, + "num_input_tokens_seen": 13599360, + "step": 11170 + }, + { + "epoch": 1.244570664884731, + "grad_norm": 0.6416206359863281, + "learning_rate": 3.111148234770019e-05, + "loss": 0.1516, + "num_input_tokens_seen": 13605344, + "step": 11175 + }, + { + "epoch": 1.2451275197683485, + "grad_norm": 0.9086179733276367, + "learning_rate": 3.112540371979062e-05, + "loss": 0.1395, + "num_input_tokens_seen": 13611520, + "step": 11180 + }, + { + "epoch": 1.2456843746519657, + "grad_norm": 0.7592298984527588, + "learning_rate": 3.1139325091881056e-05, + "loss": 0.1403, + "num_input_tokens_seen": 13617760, + "step": 11185 + }, + { + "epoch": 1.246241229535583, + "grad_norm": 0.9918700456619263, + "learning_rate": 3.115324646397149e-05, + "loss": 0.1754, + "num_input_tokens_seen": 13623904, + "step": 11190 + }, + { + "epoch": 1.2467980844192004, + "grad_norm": 0.7925569415092468, + "learning_rate": 3.1167167836061924e-05, + "loss": 0.1223, + "num_input_tokens_seen": 13629984, + "step": 11195 + }, + { + "epoch": 1.2473549393028176, + "grad_norm": 0.0976843386888504, + "learning_rate": 3.118108920815236e-05, + "loss": 0.1192, + "num_input_tokens_seen": 13636288, + "step": 11200 + }, + { + "epoch": 1.247911794186435, + "grad_norm": 1.1264327764511108, + "learning_rate": 3.1195010580242786e-05, + "loss": 0.1181, + "num_input_tokens_seen": 13642400, + "step": 11205 + }, + { + "epoch": 1.2484686490700523, + "grad_norm": 1.3969268798828125, + "learning_rate": 3.120893195233323e-05, + "loss": 0.0943, + "num_input_tokens_seen": 13648640, + "step": 11210 + }, + { + "epoch": 1.2490255039536697, + "grad_norm": 0.16290611028671265, + "learning_rate": 3.1222853324423655e-05, + "loss": 0.14, + "num_input_tokens_seen": 13654912, + "step": 11215 + }, + { + "epoch": 1.249582358837287, + "grad_norm": 0.17338454723358154, + "learning_rate": 3.123677469651409e-05, + "loss": 0.1386, + "num_input_tokens_seen": 13661056, + "step": 11220 + }, + { + "epoch": 1.2501392137209044, + "grad_norm": 0.501753032207489, + "learning_rate": 3.125069606860452e-05, + "loss": 0.1058, + "num_input_tokens_seen": 13667136, + "step": 11225 + }, + { + "epoch": 1.2506960686045216, + "grad_norm": 0.1190083846449852, + "learning_rate": 3.126461744069496e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13673600, + "step": 11230 + }, + { + "epoch": 1.2512529234881389, + "grad_norm": 0.364446222782135, + "learning_rate": 3.127853881278539e-05, + "loss": 0.1367, + "num_input_tokens_seen": 13679872, + "step": 11235 + }, + { + "epoch": 1.2518097783717563, + "grad_norm": 0.44334468245506287, + "learning_rate": 3.129246018487582e-05, + "loss": 0.0561, + "num_input_tokens_seen": 13685376, + "step": 11240 + }, + { + "epoch": 1.2523666332553738, + "grad_norm": 1.510420322418213, + "learning_rate": 3.130638155696626e-05, + "loss": 0.1386, + "num_input_tokens_seen": 13691488, + "step": 11245 + }, + { + "epoch": 1.252923488138991, + "grad_norm": 0.4507812261581421, + "learning_rate": 3.132030292905669e-05, + "loss": 0.1039, + "num_input_tokens_seen": 13697600, + "step": 11250 + }, + { + "epoch": 1.2534803430226082, + "grad_norm": 0.06228237599134445, + "learning_rate": 3.133422430114712e-05, + "loss": 0.254, + "num_input_tokens_seen": 13703968, + "step": 11255 + }, + { + "epoch": 1.2540371979062257, + "grad_norm": 1.0700459480285645, + "learning_rate": 3.134814567323756e-05, + "loss": 0.0975, + "num_input_tokens_seen": 13710240, + "step": 11260 + }, + { + "epoch": 1.254594052789843, + "grad_norm": 0.17940004169940948, + "learning_rate": 3.1362067045327984e-05, + "loss": 0.1209, + "num_input_tokens_seen": 13715584, + "step": 11265 + }, + { + "epoch": 1.2551509076734604, + "grad_norm": 0.847852885723114, + "learning_rate": 3.1375988417418426e-05, + "loss": 0.0686, + "num_input_tokens_seen": 13721952, + "step": 11270 + }, + { + "epoch": 1.2557077625570776, + "grad_norm": 1.574103593826294, + "learning_rate": 3.138990978950885e-05, + "loss": 0.2392, + "num_input_tokens_seen": 13727872, + "step": 11275 + }, + { + "epoch": 1.2562646174406948, + "grad_norm": 0.816518247127533, + "learning_rate": 3.140383116159929e-05, + "loss": 0.0668, + "num_input_tokens_seen": 13734080, + "step": 11280 + }, + { + "epoch": 1.2568214723243123, + "grad_norm": 0.9557921290397644, + "learning_rate": 3.141775253368972e-05, + "loss": 0.2353, + "num_input_tokens_seen": 13740032, + "step": 11285 + }, + { + "epoch": 1.2573783272079297, + "grad_norm": 0.8612655401229858, + "learning_rate": 3.1431673905780156e-05, + "loss": 0.2303, + "num_input_tokens_seen": 13745632, + "step": 11290 + }, + { + "epoch": 1.257935182091547, + "grad_norm": 0.5442182421684265, + "learning_rate": 3.144559527787059e-05, + "loss": 0.0869, + "num_input_tokens_seen": 13751616, + "step": 11295 + }, + { + "epoch": 1.2584920369751642, + "grad_norm": 0.6604971885681152, + "learning_rate": 3.1459516649961025e-05, + "loss": 0.2078, + "num_input_tokens_seen": 13757184, + "step": 11300 + }, + { + "epoch": 1.2590488918587817, + "grad_norm": 0.21808169782161713, + "learning_rate": 3.147343802205145e-05, + "loss": 0.2263, + "num_input_tokens_seen": 13763328, + "step": 11305 + }, + { + "epoch": 1.259605746742399, + "grad_norm": 0.9076587557792664, + "learning_rate": 3.148735939414189e-05, + "loss": 0.188, + "num_input_tokens_seen": 13769632, + "step": 11310 + }, + { + "epoch": 1.2601626016260163, + "grad_norm": 0.609083354473114, + "learning_rate": 3.150128076623232e-05, + "loss": 0.1617, + "num_input_tokens_seen": 13775712, + "step": 11315 + }, + { + "epoch": 1.2607194565096336, + "grad_norm": 0.20732732117176056, + "learning_rate": 3.1515202138322755e-05, + "loss": 0.1455, + "num_input_tokens_seen": 13782144, + "step": 11320 + }, + { + "epoch": 1.2612763113932508, + "grad_norm": 2.753848075866699, + "learning_rate": 3.152912351041319e-05, + "loss": 0.0911, + "num_input_tokens_seen": 13788160, + "step": 11325 + }, + { + "epoch": 1.2618331662768683, + "grad_norm": 0.06579532474279404, + "learning_rate": 3.154304488250362e-05, + "loss": 0.1371, + "num_input_tokens_seen": 13794240, + "step": 11330 + }, + { + "epoch": 1.2623900211604857, + "grad_norm": 1.08268141746521, + "learning_rate": 3.155696625459406e-05, + "loss": 0.1287, + "num_input_tokens_seen": 13800032, + "step": 11335 + }, + { + "epoch": 1.262946876044103, + "grad_norm": 0.503293514251709, + "learning_rate": 3.1570887626684486e-05, + "loss": 0.1326, + "num_input_tokens_seen": 13805664, + "step": 11340 + }, + { + "epoch": 1.2635037309277202, + "grad_norm": 1.235629916191101, + "learning_rate": 3.158480899877492e-05, + "loss": 0.1676, + "num_input_tokens_seen": 13811712, + "step": 11345 + }, + { + "epoch": 1.2640605858113376, + "grad_norm": 0.583513081073761, + "learning_rate": 3.1598730370865354e-05, + "loss": 0.1795, + "num_input_tokens_seen": 13817760, + "step": 11350 + }, + { + "epoch": 1.2646174406949549, + "grad_norm": 0.23850885033607483, + "learning_rate": 3.161265174295578e-05, + "loss": 0.1657, + "num_input_tokens_seen": 13823776, + "step": 11355 + }, + { + "epoch": 1.2651742955785723, + "grad_norm": 0.24703344702720642, + "learning_rate": 3.162657311504622e-05, + "loss": 0.1136, + "num_input_tokens_seen": 13829760, + "step": 11360 + }, + { + "epoch": 1.2657311504621895, + "grad_norm": 1.0297120809555054, + "learning_rate": 3.164049448713665e-05, + "loss": 0.1639, + "num_input_tokens_seen": 13835744, + "step": 11365 + }, + { + "epoch": 1.2662880053458068, + "grad_norm": 0.17787429690361023, + "learning_rate": 3.1654415859227085e-05, + "loss": 0.2082, + "num_input_tokens_seen": 13841632, + "step": 11370 + }, + { + "epoch": 1.2668448602294242, + "grad_norm": 0.839145839214325, + "learning_rate": 3.166833723131752e-05, + "loss": 0.1323, + "num_input_tokens_seen": 13847840, + "step": 11375 + }, + { + "epoch": 1.2674017151130417, + "grad_norm": 0.22810088098049164, + "learning_rate": 3.1682258603407954e-05, + "loss": 0.1184, + "num_input_tokens_seen": 13854080, + "step": 11380 + }, + { + "epoch": 1.267958569996659, + "grad_norm": 0.5805832147598267, + "learning_rate": 3.169617997549839e-05, + "loss": 0.075, + "num_input_tokens_seen": 13860224, + "step": 11385 + }, + { + "epoch": 1.2685154248802761, + "grad_norm": 0.21078407764434814, + "learning_rate": 3.171010134758882e-05, + "loss": 0.1401, + "num_input_tokens_seen": 13866432, + "step": 11390 + }, + { + "epoch": 1.2690722797638936, + "grad_norm": 0.9902284145355225, + "learning_rate": 3.172402271967925e-05, + "loss": 0.0944, + "num_input_tokens_seen": 13872672, + "step": 11395 + }, + { + "epoch": 1.2696291346475108, + "grad_norm": 1.0965358018875122, + "learning_rate": 3.173794409176969e-05, + "loss": 0.1309, + "num_input_tokens_seen": 13878752, + "step": 11400 + }, + { + "epoch": 1.2701859895311283, + "grad_norm": 0.3036738932132721, + "learning_rate": 3.175186546386012e-05, + "loss": 0.0459, + "num_input_tokens_seen": 13885152, + "step": 11405 + }, + { + "epoch": 1.2707428444147455, + "grad_norm": 0.9689066410064697, + "learning_rate": 3.176578683595055e-05, + "loss": 0.112, + "num_input_tokens_seen": 13891200, + "step": 11410 + }, + { + "epoch": 1.2712996992983627, + "grad_norm": 0.9481145739555359, + "learning_rate": 3.177970820804099e-05, + "loss": 0.2189, + "num_input_tokens_seen": 13897088, + "step": 11415 + }, + { + "epoch": 1.2718565541819802, + "grad_norm": 0.196010023355484, + "learning_rate": 3.1793629580131415e-05, + "loss": 0.0932, + "num_input_tokens_seen": 13903040, + "step": 11420 + }, + { + "epoch": 1.2724134090655976, + "grad_norm": 1.0927929878234863, + "learning_rate": 3.1807550952221856e-05, + "loss": 0.1395, + "num_input_tokens_seen": 13909152, + "step": 11425 + }, + { + "epoch": 1.2729702639492149, + "grad_norm": 0.3599050045013428, + "learning_rate": 3.1821472324312283e-05, + "loss": 0.0752, + "num_input_tokens_seen": 13915296, + "step": 11430 + }, + { + "epoch": 1.273527118832832, + "grad_norm": 0.03693339601159096, + "learning_rate": 3.183539369640272e-05, + "loss": 0.1066, + "num_input_tokens_seen": 13921568, + "step": 11435 + }, + { + "epoch": 1.2740839737164495, + "grad_norm": 1.9903227090835571, + "learning_rate": 3.184931506849315e-05, + "loss": 0.2084, + "num_input_tokens_seen": 13927936, + "step": 11440 + }, + { + "epoch": 1.2746408286000668, + "grad_norm": 0.7181577086448669, + "learning_rate": 3.1863236440583586e-05, + "loss": 0.0945, + "num_input_tokens_seen": 13934112, + "step": 11445 + }, + { + "epoch": 1.2751976834836842, + "grad_norm": 0.02159799076616764, + "learning_rate": 3.187715781267402e-05, + "loss": 0.0792, + "num_input_tokens_seen": 13940064, + "step": 11450 + }, + { + "epoch": 1.2757545383673015, + "grad_norm": 0.3940768837928772, + "learning_rate": 3.189107918476445e-05, + "loss": 0.1387, + "num_input_tokens_seen": 13946272, + "step": 11455 + }, + { + "epoch": 1.2763113932509187, + "grad_norm": 1.6858102083206177, + "learning_rate": 3.190500055685488e-05, + "loss": 0.1207, + "num_input_tokens_seen": 13952672, + "step": 11460 + }, + { + "epoch": 1.2768682481345361, + "grad_norm": 1.3738843202590942, + "learning_rate": 3.191892192894532e-05, + "loss": 0.1837, + "num_input_tokens_seen": 13958848, + "step": 11465 + }, + { + "epoch": 1.2774251030181536, + "grad_norm": 1.0542359352111816, + "learning_rate": 3.193284330103575e-05, + "loss": 0.1978, + "num_input_tokens_seen": 13965312, + "step": 11470 + }, + { + "epoch": 1.2779819579017708, + "grad_norm": 0.7129787802696228, + "learning_rate": 3.1946764673126186e-05, + "loss": 0.1155, + "num_input_tokens_seen": 13971360, + "step": 11475 + }, + { + "epoch": 1.278538812785388, + "grad_norm": 1.470139980316162, + "learning_rate": 3.196068604521662e-05, + "loss": 0.1818, + "num_input_tokens_seen": 13977344, + "step": 11480 + }, + { + "epoch": 1.2790956676690055, + "grad_norm": 0.625373899936676, + "learning_rate": 3.1974607417307054e-05, + "loss": 0.161, + "num_input_tokens_seen": 13983424, + "step": 11485 + }, + { + "epoch": 1.2796525225526227, + "grad_norm": 0.6402484774589539, + "learning_rate": 3.198852878939749e-05, + "loss": 0.097, + "num_input_tokens_seen": 13989632, + "step": 11490 + }, + { + "epoch": 1.2802093774362402, + "grad_norm": 0.0408964566886425, + "learning_rate": 3.2002450161487916e-05, + "loss": 0.1446, + "num_input_tokens_seen": 13996096, + "step": 11495 + }, + { + "epoch": 1.2807662323198574, + "grad_norm": 2.0264832973480225, + "learning_rate": 3.201637153357835e-05, + "loss": 0.1578, + "num_input_tokens_seen": 14002112, + "step": 11500 + }, + { + "epoch": 1.2813230872034747, + "grad_norm": 0.07946907728910446, + "learning_rate": 3.2030292905668785e-05, + "loss": 0.1149, + "num_input_tokens_seen": 14008320, + "step": 11505 + }, + { + "epoch": 1.281879942087092, + "grad_norm": 1.5599061250686646, + "learning_rate": 3.204421427775922e-05, + "loss": 0.2484, + "num_input_tokens_seen": 14014560, + "step": 11510 + }, + { + "epoch": 1.2824367969707096, + "grad_norm": 0.22555196285247803, + "learning_rate": 3.2058135649849653e-05, + "loss": 0.1422, + "num_input_tokens_seen": 14020480, + "step": 11515 + }, + { + "epoch": 1.2829936518543268, + "grad_norm": 1.266768217086792, + "learning_rate": 3.207205702194008e-05, + "loss": 0.2268, + "num_input_tokens_seen": 14026496, + "step": 11520 + }, + { + "epoch": 1.283550506737944, + "grad_norm": 1.0862138271331787, + "learning_rate": 3.208597839403052e-05, + "loss": 0.2279, + "num_input_tokens_seen": 14032384, + "step": 11525 + }, + { + "epoch": 1.2841073616215615, + "grad_norm": 0.28840190172195435, + "learning_rate": 3.209989976612095e-05, + "loss": 0.0626, + "num_input_tokens_seen": 14038336, + "step": 11530 + }, + { + "epoch": 1.2846642165051787, + "grad_norm": 0.9362396001815796, + "learning_rate": 3.2113821138211384e-05, + "loss": 0.0661, + "num_input_tokens_seen": 14045024, + "step": 11535 + }, + { + "epoch": 1.2852210713887962, + "grad_norm": 1.2595446109771729, + "learning_rate": 3.212774251030182e-05, + "loss": 0.0382, + "num_input_tokens_seen": 14051296, + "step": 11540 + }, + { + "epoch": 1.2857779262724134, + "grad_norm": 0.13083265721797943, + "learning_rate": 3.2141663882392246e-05, + "loss": 0.1563, + "num_input_tokens_seen": 14057280, + "step": 11545 + }, + { + "epoch": 1.2863347811560306, + "grad_norm": 1.395690679550171, + "learning_rate": 3.215558525448269e-05, + "loss": 0.2313, + "num_input_tokens_seen": 14062912, + "step": 11550 + }, + { + "epoch": 1.286891636039648, + "grad_norm": 0.34451767802238464, + "learning_rate": 3.2169506626573115e-05, + "loss": 0.0764, + "num_input_tokens_seen": 14069248, + "step": 11555 + }, + { + "epoch": 1.2874484909232655, + "grad_norm": 0.7943507432937622, + "learning_rate": 3.218342799866355e-05, + "loss": 0.1312, + "num_input_tokens_seen": 14075360, + "step": 11560 + }, + { + "epoch": 1.2880053458068828, + "grad_norm": 0.3730575442314148, + "learning_rate": 3.219734937075398e-05, + "loss": 0.1082, + "num_input_tokens_seen": 14081568, + "step": 11565 + }, + { + "epoch": 1.2885622006905, + "grad_norm": 1.1518183946609497, + "learning_rate": 3.221127074284442e-05, + "loss": 0.1876, + "num_input_tokens_seen": 14087680, + "step": 11570 + }, + { + "epoch": 1.2891190555741174, + "grad_norm": 0.8526890277862549, + "learning_rate": 3.222519211493485e-05, + "loss": 0.2152, + "num_input_tokens_seen": 14093664, + "step": 11575 + }, + { + "epoch": 1.2896759104577347, + "grad_norm": 0.38908883929252625, + "learning_rate": 3.2239113487025286e-05, + "loss": 0.0624, + "num_input_tokens_seen": 14099744, + "step": 11580 + }, + { + "epoch": 1.2902327653413521, + "grad_norm": 0.29162904620170593, + "learning_rate": 3.2253034859115714e-05, + "loss": 0.1273, + "num_input_tokens_seen": 14105696, + "step": 11585 + }, + { + "epoch": 1.2907896202249693, + "grad_norm": 0.7272166013717651, + "learning_rate": 3.226695623120615e-05, + "loss": 0.2018, + "num_input_tokens_seen": 14111200, + "step": 11590 + }, + { + "epoch": 1.2913464751085866, + "grad_norm": 1.2080272436141968, + "learning_rate": 3.228087760329658e-05, + "loss": 0.1737, + "num_input_tokens_seen": 14117184, + "step": 11595 + }, + { + "epoch": 1.291903329992204, + "grad_norm": 0.016958335414528847, + "learning_rate": 3.229479897538702e-05, + "loss": 0.0586, + "num_input_tokens_seen": 14123520, + "step": 11600 + }, + { + "epoch": 1.2924601848758215, + "grad_norm": 0.3014504909515381, + "learning_rate": 3.230872034747745e-05, + "loss": 0.2095, + "num_input_tokens_seen": 14129376, + "step": 11605 + }, + { + "epoch": 1.2930170397594387, + "grad_norm": 0.7920318841934204, + "learning_rate": 3.232264171956788e-05, + "loss": 0.1107, + "num_input_tokens_seen": 14135296, + "step": 11610 + }, + { + "epoch": 1.293573894643056, + "grad_norm": 1.3642330169677734, + "learning_rate": 3.233656309165832e-05, + "loss": 0.1836, + "num_input_tokens_seen": 14140768, + "step": 11615 + }, + { + "epoch": 1.2941307495266734, + "grad_norm": 0.765457808971405, + "learning_rate": 3.235048446374875e-05, + "loss": 0.0977, + "num_input_tokens_seen": 14147136, + "step": 11620 + }, + { + "epoch": 1.2946876044102906, + "grad_norm": 1.3473432064056396, + "learning_rate": 3.236440583583918e-05, + "loss": 0.1388, + "num_input_tokens_seen": 14153568, + "step": 11625 + }, + { + "epoch": 1.295244459293908, + "grad_norm": 0.4304943382740021, + "learning_rate": 3.2378327207929616e-05, + "loss": 0.169, + "num_input_tokens_seen": 14159680, + "step": 11630 + }, + { + "epoch": 1.2958013141775253, + "grad_norm": 0.010057604871690273, + "learning_rate": 3.2392248580020043e-05, + "loss": 0.1363, + "num_input_tokens_seen": 14165952, + "step": 11635 + }, + { + "epoch": 1.2963581690611425, + "grad_norm": 0.8671753406524658, + "learning_rate": 3.2406169952110485e-05, + "loss": 0.0643, + "num_input_tokens_seen": 14172384, + "step": 11640 + }, + { + "epoch": 1.29691502394476, + "grad_norm": 0.9899182915687561, + "learning_rate": 3.242009132420091e-05, + "loss": 0.0974, + "num_input_tokens_seen": 14178400, + "step": 11645 + }, + { + "epoch": 1.2974718788283774, + "grad_norm": 0.28384533524513245, + "learning_rate": 3.2434012696291346e-05, + "loss": 0.1749, + "num_input_tokens_seen": 14184256, + "step": 11650 + }, + { + "epoch": 1.2980287337119947, + "grad_norm": 0.5931205749511719, + "learning_rate": 3.244793406838178e-05, + "loss": 0.0848, + "num_input_tokens_seen": 14190336, + "step": 11655 + }, + { + "epoch": 1.298585588595612, + "grad_norm": 0.9289055466651917, + "learning_rate": 3.2461855440472215e-05, + "loss": 0.1596, + "num_input_tokens_seen": 14196192, + "step": 11660 + }, + { + "epoch": 1.2991424434792294, + "grad_norm": 2.386700391769409, + "learning_rate": 3.247577681256265e-05, + "loss": 0.2238, + "num_input_tokens_seen": 14202080, + "step": 11665 + }, + { + "epoch": 1.2996992983628466, + "grad_norm": 0.9058008193969727, + "learning_rate": 3.2489698184653084e-05, + "loss": 0.0842, + "num_input_tokens_seen": 14208096, + "step": 11670 + }, + { + "epoch": 1.300256153246464, + "grad_norm": 0.058446288108825684, + "learning_rate": 3.250361955674351e-05, + "loss": 0.1415, + "num_input_tokens_seen": 14214336, + "step": 11675 + }, + { + "epoch": 1.3008130081300813, + "grad_norm": 0.706709623336792, + "learning_rate": 3.2517540928833946e-05, + "loss": 0.0626, + "num_input_tokens_seen": 14220576, + "step": 11680 + }, + { + "epoch": 1.3013698630136985, + "grad_norm": 2.3856866359710693, + "learning_rate": 3.253146230092438e-05, + "loss": 0.13, + "num_input_tokens_seen": 14226624, + "step": 11685 + }, + { + "epoch": 1.301926717897316, + "grad_norm": 0.8118784427642822, + "learning_rate": 3.2545383673014814e-05, + "loss": 0.2866, + "num_input_tokens_seen": 14232256, + "step": 11690 + }, + { + "epoch": 1.3024835727809334, + "grad_norm": 0.4097164571285248, + "learning_rate": 3.255930504510525e-05, + "loss": 0.157, + "num_input_tokens_seen": 14238528, + "step": 11695 + }, + { + "epoch": 1.3030404276645506, + "grad_norm": 1.8042012453079224, + "learning_rate": 3.2573226417195676e-05, + "loss": 0.2334, + "num_input_tokens_seen": 14243712, + "step": 11700 + }, + { + "epoch": 1.3035972825481679, + "grad_norm": 1.5187264680862427, + "learning_rate": 3.258714778928612e-05, + "loss": 0.0934, + "num_input_tokens_seen": 14249664, + "step": 11705 + }, + { + "epoch": 1.3041541374317853, + "grad_norm": 0.6799335479736328, + "learning_rate": 3.2601069161376545e-05, + "loss": 0.1348, + "num_input_tokens_seen": 14255680, + "step": 11710 + }, + { + "epoch": 1.3047109923154026, + "grad_norm": 0.731019139289856, + "learning_rate": 3.261499053346698e-05, + "loss": 0.1235, + "num_input_tokens_seen": 14261760, + "step": 11715 + }, + { + "epoch": 1.30526784719902, + "grad_norm": 1.3987253904342651, + "learning_rate": 3.2628911905557413e-05, + "loss": 0.2094, + "num_input_tokens_seen": 14267456, + "step": 11720 + }, + { + "epoch": 1.3058247020826372, + "grad_norm": 0.7943562865257263, + "learning_rate": 3.264283327764784e-05, + "loss": 0.203, + "num_input_tokens_seen": 14273312, + "step": 11725 + }, + { + "epoch": 1.3063815569662545, + "grad_norm": 0.8394778966903687, + "learning_rate": 3.265675464973828e-05, + "loss": 0.1482, + "num_input_tokens_seen": 14279232, + "step": 11730 + }, + { + "epoch": 1.306938411849872, + "grad_norm": 1.2507333755493164, + "learning_rate": 3.267067602182871e-05, + "loss": 0.1334, + "num_input_tokens_seen": 14284960, + "step": 11735 + }, + { + "epoch": 1.3074952667334894, + "grad_norm": 0.17236420512199402, + "learning_rate": 3.268459739391915e-05, + "loss": 0.1825, + "num_input_tokens_seen": 14290976, + "step": 11740 + }, + { + "epoch": 1.3080521216171066, + "grad_norm": 0.1629222333431244, + "learning_rate": 3.269851876600958e-05, + "loss": 0.0662, + "num_input_tokens_seen": 14297312, + "step": 11745 + }, + { + "epoch": 1.3086089765007238, + "grad_norm": 0.6377294063568115, + "learning_rate": 3.271244013810001e-05, + "loss": 0.1145, + "num_input_tokens_seen": 14303264, + "step": 11750 + }, + { + "epoch": 1.3091658313843413, + "grad_norm": 0.9740136861801147, + "learning_rate": 3.272636151019045e-05, + "loss": 0.1231, + "num_input_tokens_seen": 14309440, + "step": 11755 + }, + { + "epoch": 1.3097226862679585, + "grad_norm": 0.6414777040481567, + "learning_rate": 3.274028288228088e-05, + "loss": 0.0623, + "num_input_tokens_seen": 14315424, + "step": 11760 + }, + { + "epoch": 1.310279541151576, + "grad_norm": 0.49492397904396057, + "learning_rate": 3.2754204254371316e-05, + "loss": 0.1131, + "num_input_tokens_seen": 14321600, + "step": 11765 + }, + { + "epoch": 1.3108363960351932, + "grad_norm": 0.6850976943969727, + "learning_rate": 3.276812562646174e-05, + "loss": 0.1259, + "num_input_tokens_seen": 14327328, + "step": 11770 + }, + { + "epoch": 1.3113932509188104, + "grad_norm": 0.4407142400741577, + "learning_rate": 3.278204699855218e-05, + "loss": 0.1626, + "num_input_tokens_seen": 14332544, + "step": 11775 + }, + { + "epoch": 1.3119501058024279, + "grad_norm": 1.2084901332855225, + "learning_rate": 3.279596837064261e-05, + "loss": 0.1257, + "num_input_tokens_seen": 14338624, + "step": 11780 + }, + { + "epoch": 1.3125069606860453, + "grad_norm": 0.23055194318294525, + "learning_rate": 3.2809889742733046e-05, + "loss": 0.0594, + "num_input_tokens_seen": 14344448, + "step": 11785 + }, + { + "epoch": 1.3130638155696626, + "grad_norm": 0.5326517224311829, + "learning_rate": 3.282381111482348e-05, + "loss": 0.2572, + "num_input_tokens_seen": 14350528, + "step": 11790 + }, + { + "epoch": 1.3136206704532798, + "grad_norm": 1.0085456371307373, + "learning_rate": 3.2837732486913915e-05, + "loss": 0.1074, + "num_input_tokens_seen": 14356736, + "step": 11795 + }, + { + "epoch": 1.3141775253368972, + "grad_norm": 0.7730490565299988, + "learning_rate": 3.285165385900434e-05, + "loss": 0.083, + "num_input_tokens_seen": 14362912, + "step": 11800 + }, + { + "epoch": 1.3147343802205145, + "grad_norm": 1.1014338731765747, + "learning_rate": 3.2865575231094784e-05, + "loss": 0.1427, + "num_input_tokens_seen": 14368928, + "step": 11805 + }, + { + "epoch": 1.315291235104132, + "grad_norm": 0.034417860209941864, + "learning_rate": 3.287949660318521e-05, + "loss": 0.2018, + "num_input_tokens_seen": 14374688, + "step": 11810 + }, + { + "epoch": 1.3158480899877492, + "grad_norm": 0.45511510968208313, + "learning_rate": 3.2893417975275645e-05, + "loss": 0.1306, + "num_input_tokens_seen": 14380960, + "step": 11815 + }, + { + "epoch": 1.3164049448713664, + "grad_norm": 0.6441144943237305, + "learning_rate": 3.290733934736608e-05, + "loss": 0.1185, + "num_input_tokens_seen": 14387168, + "step": 11820 + }, + { + "epoch": 1.3169617997549838, + "grad_norm": 2.1816911697387695, + "learning_rate": 3.292126071945651e-05, + "loss": 0.2139, + "num_input_tokens_seen": 14393152, + "step": 11825 + }, + { + "epoch": 1.3175186546386013, + "grad_norm": 0.5311364531517029, + "learning_rate": 3.293518209154695e-05, + "loss": 0.2003, + "num_input_tokens_seen": 14398976, + "step": 11830 + }, + { + "epoch": 1.3180755095222185, + "grad_norm": 0.6130179166793823, + "learning_rate": 3.2949103463637376e-05, + "loss": 0.2107, + "num_input_tokens_seen": 14405024, + "step": 11835 + }, + { + "epoch": 1.3186323644058358, + "grad_norm": 0.755919337272644, + "learning_rate": 3.296302483572781e-05, + "loss": 0.224, + "num_input_tokens_seen": 14410976, + "step": 11840 + }, + { + "epoch": 1.3191892192894532, + "grad_norm": 0.9788738489151001, + "learning_rate": 3.2976946207818245e-05, + "loss": 0.2247, + "num_input_tokens_seen": 14417120, + "step": 11845 + }, + { + "epoch": 1.3197460741730704, + "grad_norm": 0.5146324634552002, + "learning_rate": 3.299086757990868e-05, + "loss": 0.1475, + "num_input_tokens_seen": 14423168, + "step": 11850 + }, + { + "epoch": 1.320302929056688, + "grad_norm": 0.29394352436065674, + "learning_rate": 3.300478895199911e-05, + "loss": 0.0222, + "num_input_tokens_seen": 14429408, + "step": 11855 + }, + { + "epoch": 1.3208597839403051, + "grad_norm": 0.5729764103889465, + "learning_rate": 3.301871032408954e-05, + "loss": 0.071, + "num_input_tokens_seen": 14435456, + "step": 11860 + }, + { + "epoch": 1.3214166388239224, + "grad_norm": 1.4233909845352173, + "learning_rate": 3.3032631696179975e-05, + "loss": 0.1836, + "num_input_tokens_seen": 14441632, + "step": 11865 + }, + { + "epoch": 1.3219734937075398, + "grad_norm": 0.7460857033729553, + "learning_rate": 3.304655306827041e-05, + "loss": 0.2049, + "num_input_tokens_seen": 14447456, + "step": 11870 + }, + { + "epoch": 1.3225303485911573, + "grad_norm": 0.8191669583320618, + "learning_rate": 3.3060474440360844e-05, + "loss": 0.135, + "num_input_tokens_seen": 14453472, + "step": 11875 + }, + { + "epoch": 1.3230872034747745, + "grad_norm": 0.6263427734375, + "learning_rate": 3.307439581245128e-05, + "loss": 0.1344, + "num_input_tokens_seen": 14459040, + "step": 11880 + }, + { + "epoch": 1.3236440583583917, + "grad_norm": 0.9719507694244385, + "learning_rate": 3.308831718454171e-05, + "loss": 0.1794, + "num_input_tokens_seen": 14465280, + "step": 11885 + }, + { + "epoch": 1.3242009132420092, + "grad_norm": 0.7317732572555542, + "learning_rate": 3.310223855663214e-05, + "loss": 0.1293, + "num_input_tokens_seen": 14471616, + "step": 11890 + }, + { + "epoch": 1.3247577681256264, + "grad_norm": 0.07881452143192291, + "learning_rate": 3.311615992872258e-05, + "loss": 0.2574, + "num_input_tokens_seen": 14477824, + "step": 11895 + }, + { + "epoch": 1.3253146230092439, + "grad_norm": 0.0954989492893219, + "learning_rate": 3.313008130081301e-05, + "loss": 0.0682, + "num_input_tokens_seen": 14483904, + "step": 11900 + }, + { + "epoch": 1.325871477892861, + "grad_norm": 1.0745784044265747, + "learning_rate": 3.314400267290344e-05, + "loss": 0.2279, + "num_input_tokens_seen": 14489920, + "step": 11905 + }, + { + "epoch": 1.3264283327764783, + "grad_norm": 0.38703981041908264, + "learning_rate": 3.315792404499388e-05, + "loss": 0.1162, + "num_input_tokens_seen": 14496128, + "step": 11910 + }, + { + "epoch": 1.3269851876600958, + "grad_norm": 0.390418142080307, + "learning_rate": 3.3171845417084305e-05, + "loss": 0.1585, + "num_input_tokens_seen": 14501632, + "step": 11915 + }, + { + "epoch": 1.3275420425437132, + "grad_norm": 0.8062316179275513, + "learning_rate": 3.3185766789174746e-05, + "loss": 0.2654, + "num_input_tokens_seen": 14507808, + "step": 11920 + }, + { + "epoch": 1.3280988974273304, + "grad_norm": 0.8810397982597351, + "learning_rate": 3.3199688161265174e-05, + "loss": 0.1331, + "num_input_tokens_seen": 14513824, + "step": 11925 + }, + { + "epoch": 1.3286557523109477, + "grad_norm": 0.4911481440067291, + "learning_rate": 3.321360953335561e-05, + "loss": 0.0794, + "num_input_tokens_seen": 14519680, + "step": 11930 + }, + { + "epoch": 1.3292126071945651, + "grad_norm": 0.9585636854171753, + "learning_rate": 3.322753090544604e-05, + "loss": 0.0909, + "num_input_tokens_seen": 14525984, + "step": 11935 + }, + { + "epoch": 1.3297694620781824, + "grad_norm": 1.346065878868103, + "learning_rate": 3.3241452277536476e-05, + "loss": 0.1494, + "num_input_tokens_seen": 14531936, + "step": 11940 + }, + { + "epoch": 1.3303263169617998, + "grad_norm": 0.3009191155433655, + "learning_rate": 3.325537364962691e-05, + "loss": 0.1428, + "num_input_tokens_seen": 14537952, + "step": 11945 + }, + { + "epoch": 1.330883171845417, + "grad_norm": 0.8869218826293945, + "learning_rate": 3.326929502171734e-05, + "loss": 0.0653, + "num_input_tokens_seen": 14544160, + "step": 11950 + }, + { + "epoch": 1.3314400267290343, + "grad_norm": 0.8931643962860107, + "learning_rate": 3.328321639380777e-05, + "loss": 0.1369, + "num_input_tokens_seen": 14550272, + "step": 11955 + }, + { + "epoch": 1.3319968816126517, + "grad_norm": 1.7516229152679443, + "learning_rate": 3.329713776589821e-05, + "loss": 0.2241, + "num_input_tokens_seen": 14556832, + "step": 11960 + }, + { + "epoch": 1.3325537364962692, + "grad_norm": 1.6969799995422363, + "learning_rate": 3.331105913798864e-05, + "loss": 0.2113, + "num_input_tokens_seen": 14563200, + "step": 11965 + }, + { + "epoch": 1.3331105913798864, + "grad_norm": 0.3923173248767853, + "learning_rate": 3.3324980510079076e-05, + "loss": 0.1181, + "num_input_tokens_seen": 14569376, + "step": 11970 + }, + { + "epoch": 1.3336674462635036, + "grad_norm": 0.3368174135684967, + "learning_rate": 3.333890188216951e-05, + "loss": 0.0661, + "num_input_tokens_seen": 14575136, + "step": 11975 + }, + { + "epoch": 1.334224301147121, + "grad_norm": 0.6951747536659241, + "learning_rate": 3.335282325425994e-05, + "loss": 0.0911, + "num_input_tokens_seen": 14581088, + "step": 11980 + }, + { + "epoch": 1.3347811560307383, + "grad_norm": 1.0206351280212402, + "learning_rate": 3.336674462635038e-05, + "loss": 0.097, + "num_input_tokens_seen": 14587360, + "step": 11985 + }, + { + "epoch": 1.3353380109143558, + "grad_norm": 0.2098807394504547, + "learning_rate": 3.3380665998440806e-05, + "loss": 0.0945, + "num_input_tokens_seen": 14593376, + "step": 11990 + }, + { + "epoch": 1.335894865797973, + "grad_norm": 1.3066805601119995, + "learning_rate": 3.339458737053124e-05, + "loss": 0.1313, + "num_input_tokens_seen": 14599488, + "step": 11995 + }, + { + "epoch": 1.3364517206815905, + "grad_norm": 1.7132989168167114, + "learning_rate": 3.3408508742621675e-05, + "loss": 0.188, + "num_input_tokens_seen": 14605600, + "step": 12000 + }, + { + "epoch": 1.3370085755652077, + "grad_norm": 0.265500545501709, + "learning_rate": 3.34224301147121e-05, + "loss": 0.0792, + "num_input_tokens_seen": 14611840, + "step": 12005 + }, + { + "epoch": 1.3375654304488251, + "grad_norm": 1.0646519660949707, + "learning_rate": 3.3436351486802544e-05, + "loss": 0.1409, + "num_input_tokens_seen": 14618016, + "step": 12010 + }, + { + "epoch": 1.3381222853324424, + "grad_norm": 2.2018094062805176, + "learning_rate": 3.345027285889297e-05, + "loss": 0.1435, + "num_input_tokens_seen": 14624032, + "step": 12015 + }, + { + "epoch": 1.3386791402160596, + "grad_norm": 0.6335297226905823, + "learning_rate": 3.346419423098341e-05, + "loss": 0.1633, + "num_input_tokens_seen": 14629536, + "step": 12020 + }, + { + "epoch": 1.339235995099677, + "grad_norm": 1.462481141090393, + "learning_rate": 3.347811560307384e-05, + "loss": 0.0609, + "num_input_tokens_seen": 14635616, + "step": 12025 + }, + { + "epoch": 1.3397928499832943, + "grad_norm": 2.413853168487549, + "learning_rate": 3.3492036975164274e-05, + "loss": 0.0869, + "num_input_tokens_seen": 14641664, + "step": 12030 + }, + { + "epoch": 1.3403497048669117, + "grad_norm": 1.2521510124206543, + "learning_rate": 3.350595834725471e-05, + "loss": 0.142, + "num_input_tokens_seen": 14647936, + "step": 12035 + }, + { + "epoch": 1.340906559750529, + "grad_norm": 1.083209753036499, + "learning_rate": 3.3519879719345136e-05, + "loss": 0.1602, + "num_input_tokens_seen": 14654016, + "step": 12040 + }, + { + "epoch": 1.3414634146341464, + "grad_norm": 0.21877224743366241, + "learning_rate": 3.353380109143558e-05, + "loss": 0.0669, + "num_input_tokens_seen": 14660480, + "step": 12045 + }, + { + "epoch": 1.3420202695177637, + "grad_norm": 0.9951229691505432, + "learning_rate": 3.3547722463526005e-05, + "loss": 0.139, + "num_input_tokens_seen": 14666432, + "step": 12050 + }, + { + "epoch": 1.342577124401381, + "grad_norm": 1.7416177988052368, + "learning_rate": 3.356164383561644e-05, + "loss": 0.1928, + "num_input_tokens_seen": 14672288, + "step": 12055 + }, + { + "epoch": 1.3431339792849983, + "grad_norm": 0.7219870090484619, + "learning_rate": 3.357556520770687e-05, + "loss": 0.2005, + "num_input_tokens_seen": 14677600, + "step": 12060 + }, + { + "epoch": 1.3436908341686156, + "grad_norm": 2.105156660079956, + "learning_rate": 3.358948657979731e-05, + "loss": 0.2854, + "num_input_tokens_seen": 14683904, + "step": 12065 + }, + { + "epoch": 1.344247689052233, + "grad_norm": 0.10766412317752838, + "learning_rate": 3.360340795188774e-05, + "loss": 0.1484, + "num_input_tokens_seen": 14689888, + "step": 12070 + }, + { + "epoch": 1.3448045439358502, + "grad_norm": 0.005211135372519493, + "learning_rate": 3.3617329323978176e-05, + "loss": 0.1928, + "num_input_tokens_seen": 14696224, + "step": 12075 + }, + { + "epoch": 1.3453613988194677, + "grad_norm": 0.7918464541435242, + "learning_rate": 3.3631250696068604e-05, + "loss": 0.3025, + "num_input_tokens_seen": 14702208, + "step": 12080 + }, + { + "epoch": 1.345918253703085, + "grad_norm": 0.4773259460926056, + "learning_rate": 3.3645172068159045e-05, + "loss": 0.1928, + "num_input_tokens_seen": 14708256, + "step": 12085 + }, + { + "epoch": 1.3464751085867024, + "grad_norm": 0.09165851771831512, + "learning_rate": 3.365909344024947e-05, + "loss": 0.0632, + "num_input_tokens_seen": 14714304, + "step": 12090 + }, + { + "epoch": 1.3470319634703196, + "grad_norm": 0.966568648815155, + "learning_rate": 3.367301481233991e-05, + "loss": 0.148, + "num_input_tokens_seen": 14720288, + "step": 12095 + }, + { + "epoch": 1.347588818353937, + "grad_norm": 1.037524700164795, + "learning_rate": 3.368693618443034e-05, + "loss": 0.1879, + "num_input_tokens_seen": 14726368, + "step": 12100 + }, + { + "epoch": 1.3481456732375543, + "grad_norm": 0.6305306553840637, + "learning_rate": 3.370085755652077e-05, + "loss": 0.0891, + "num_input_tokens_seen": 14732608, + "step": 12105 + }, + { + "epoch": 1.3487025281211715, + "grad_norm": 0.1662144958972931, + "learning_rate": 3.371477892861121e-05, + "loss": 0.1585, + "num_input_tokens_seen": 14738752, + "step": 12110 + }, + { + "epoch": 1.349259383004789, + "grad_norm": 0.13797792792320251, + "learning_rate": 3.372870030070164e-05, + "loss": 0.1265, + "num_input_tokens_seen": 14745216, + "step": 12115 + }, + { + "epoch": 1.3498162378884062, + "grad_norm": 0.639472484588623, + "learning_rate": 3.374262167279207e-05, + "loss": 0.0982, + "num_input_tokens_seen": 14750912, + "step": 12120 + }, + { + "epoch": 1.3503730927720237, + "grad_norm": 0.9550604820251465, + "learning_rate": 3.3756543044882506e-05, + "loss": 0.0864, + "num_input_tokens_seen": 14756992, + "step": 12125 + }, + { + "epoch": 1.350929947655641, + "grad_norm": 0.07442932575941086, + "learning_rate": 3.377046441697294e-05, + "loss": 0.0866, + "num_input_tokens_seen": 14763392, + "step": 12130 + }, + { + "epoch": 1.3514868025392583, + "grad_norm": 0.5075360536575317, + "learning_rate": 3.3784385789063375e-05, + "loss": 0.2567, + "num_input_tokens_seen": 14769120, + "step": 12135 + }, + { + "epoch": 1.3520436574228756, + "grad_norm": 1.7541041374206543, + "learning_rate": 3.37983071611538e-05, + "loss": 0.1678, + "num_input_tokens_seen": 14775264, + "step": 12140 + }, + { + "epoch": 1.352600512306493, + "grad_norm": 0.5950473546981812, + "learning_rate": 3.3812228533244237e-05, + "loss": 0.2026, + "num_input_tokens_seen": 14781408, + "step": 12145 + }, + { + "epoch": 1.3531573671901103, + "grad_norm": 0.27478334307670593, + "learning_rate": 3.382614990533467e-05, + "loss": 0.0561, + "num_input_tokens_seen": 14787552, + "step": 12150 + }, + { + "epoch": 1.3537142220737275, + "grad_norm": 1.0464390516281128, + "learning_rate": 3.3840071277425105e-05, + "loss": 0.1071, + "num_input_tokens_seen": 14793728, + "step": 12155 + }, + { + "epoch": 1.354271076957345, + "grad_norm": 0.38821941614151, + "learning_rate": 3.385399264951554e-05, + "loss": 0.3094, + "num_input_tokens_seen": 14799552, + "step": 12160 + }, + { + "epoch": 1.3548279318409622, + "grad_norm": 0.9034696221351624, + "learning_rate": 3.3867914021605974e-05, + "loss": 0.0875, + "num_input_tokens_seen": 14805696, + "step": 12165 + }, + { + "epoch": 1.3553847867245796, + "grad_norm": 0.8963009119033813, + "learning_rate": 3.38818353936964e-05, + "loss": 0.1283, + "num_input_tokens_seen": 14811328, + "step": 12170 + }, + { + "epoch": 1.3559416416081969, + "grad_norm": 1.2677489519119263, + "learning_rate": 3.389575676578684e-05, + "loss": 0.2029, + "num_input_tokens_seen": 14817248, + "step": 12175 + }, + { + "epoch": 1.3564984964918143, + "grad_norm": 2.2963626384735107, + "learning_rate": 3.390967813787727e-05, + "loss": 0.168, + "num_input_tokens_seen": 14823424, + "step": 12180 + }, + { + "epoch": 1.3570553513754315, + "grad_norm": 0.07225704193115234, + "learning_rate": 3.3923599509967704e-05, + "loss": 0.2459, + "num_input_tokens_seen": 14829600, + "step": 12185 + }, + { + "epoch": 1.357612206259049, + "grad_norm": 0.5813011527061462, + "learning_rate": 3.393752088205814e-05, + "loss": 0.1051, + "num_input_tokens_seen": 14835744, + "step": 12190 + }, + { + "epoch": 1.3581690611426662, + "grad_norm": 0.6807673573493958, + "learning_rate": 3.3951442254148566e-05, + "loss": 0.0569, + "num_input_tokens_seen": 14841984, + "step": 12195 + }, + { + "epoch": 1.3587259160262835, + "grad_norm": 1.3813271522521973, + "learning_rate": 3.396536362623901e-05, + "loss": 0.1349, + "num_input_tokens_seen": 14848160, + "step": 12200 + }, + { + "epoch": 1.359282770909901, + "grad_norm": 0.6792354583740234, + "learning_rate": 3.3979284998329435e-05, + "loss": 0.2191, + "num_input_tokens_seen": 14854336, + "step": 12205 + }, + { + "epoch": 1.3598396257935181, + "grad_norm": 0.4028901159763336, + "learning_rate": 3.399320637041987e-05, + "loss": 0.1038, + "num_input_tokens_seen": 14860320, + "step": 12210 + }, + { + "epoch": 1.3603964806771356, + "grad_norm": 1.0474388599395752, + "learning_rate": 3.4007127742510304e-05, + "loss": 0.1659, + "num_input_tokens_seen": 14865536, + "step": 12215 + }, + { + "epoch": 1.3609533355607528, + "grad_norm": 0.15092790126800537, + "learning_rate": 3.402104911460074e-05, + "loss": 0.1619, + "num_input_tokens_seen": 14871488, + "step": 12220 + }, + { + "epoch": 1.3615101904443703, + "grad_norm": 1.0669970512390137, + "learning_rate": 3.403497048669117e-05, + "loss": 0.2133, + "num_input_tokens_seen": 14877600, + "step": 12225 + }, + { + "epoch": 1.3620670453279875, + "grad_norm": 0.5737857818603516, + "learning_rate": 3.40488918587816e-05, + "loss": 0.0954, + "num_input_tokens_seen": 14883872, + "step": 12230 + }, + { + "epoch": 1.362623900211605, + "grad_norm": 1.2108116149902344, + "learning_rate": 3.4062813230872034e-05, + "loss": 0.078, + "num_input_tokens_seen": 14889856, + "step": 12235 + }, + { + "epoch": 1.3631807550952222, + "grad_norm": 2.4567477703094482, + "learning_rate": 3.407673460296247e-05, + "loss": 0.1691, + "num_input_tokens_seen": 14895936, + "step": 12240 + }, + { + "epoch": 1.3637376099788394, + "grad_norm": 0.48186227679252625, + "learning_rate": 3.40906559750529e-05, + "loss": 0.2105, + "num_input_tokens_seen": 14902176, + "step": 12245 + }, + { + "epoch": 1.3642944648624569, + "grad_norm": 0.23619814217090607, + "learning_rate": 3.410457734714334e-05, + "loss": 0.1223, + "num_input_tokens_seen": 14908096, + "step": 12250 + }, + { + "epoch": 1.364851319746074, + "grad_norm": 1.3578639030456543, + "learning_rate": 3.411849871923377e-05, + "loss": 0.228, + "num_input_tokens_seen": 14914048, + "step": 12255 + }, + { + "epoch": 1.3654081746296916, + "grad_norm": 0.6388468146324158, + "learning_rate": 3.41324200913242e-05, + "loss": 0.1738, + "num_input_tokens_seen": 14920224, + "step": 12260 + }, + { + "epoch": 1.3659650295133088, + "grad_norm": 0.7239686250686646, + "learning_rate": 3.414634146341464e-05, + "loss": 0.0863, + "num_input_tokens_seen": 14926240, + "step": 12265 + }, + { + "epoch": 1.3665218843969262, + "grad_norm": 0.9130647778511047, + "learning_rate": 3.416026283550507e-05, + "loss": 0.1534, + "num_input_tokens_seen": 14932160, + "step": 12270 + }, + { + "epoch": 1.3670787392805435, + "grad_norm": 1.139121174812317, + "learning_rate": 3.41741842075955e-05, + "loss": 0.1041, + "num_input_tokens_seen": 14938400, + "step": 12275 + }, + { + "epoch": 1.367635594164161, + "grad_norm": 0.19437682628631592, + "learning_rate": 3.4188105579685936e-05, + "loss": 0.1413, + "num_input_tokens_seen": 14943936, + "step": 12280 + }, + { + "epoch": 1.3681924490477781, + "grad_norm": 1.471463680267334, + "learning_rate": 3.4202026951776364e-05, + "loss": 0.2619, + "num_input_tokens_seen": 14950432, + "step": 12285 + }, + { + "epoch": 1.3687493039313954, + "grad_norm": 0.8195834755897522, + "learning_rate": 3.4215948323866805e-05, + "loss": 0.1029, + "num_input_tokens_seen": 14956608, + "step": 12290 + }, + { + "epoch": 1.3693061588150128, + "grad_norm": 1.8135422468185425, + "learning_rate": 3.422986969595723e-05, + "loss": 0.296, + "num_input_tokens_seen": 14961856, + "step": 12295 + }, + { + "epoch": 1.36986301369863, + "grad_norm": 0.01703563705086708, + "learning_rate": 3.4243791068047674e-05, + "loss": 0.0688, + "num_input_tokens_seen": 14968000, + "step": 12300 + }, + { + "epoch": 1.3704198685822475, + "grad_norm": 0.648771345615387, + "learning_rate": 3.42577124401381e-05, + "loss": 0.1407, + "num_input_tokens_seen": 14974208, + "step": 12305 + }, + { + "epoch": 1.3709767234658647, + "grad_norm": 0.006484887097030878, + "learning_rate": 3.4271633812228535e-05, + "loss": 0.1336, + "num_input_tokens_seen": 14980224, + "step": 12310 + }, + { + "epoch": 1.3715335783494822, + "grad_norm": 0.10135503858327866, + "learning_rate": 3.428555518431897e-05, + "loss": 0.1942, + "num_input_tokens_seen": 14986144, + "step": 12315 + }, + { + "epoch": 1.3720904332330994, + "grad_norm": 0.8154076933860779, + "learning_rate": 3.42994765564094e-05, + "loss": 0.2138, + "num_input_tokens_seen": 14991488, + "step": 12320 + }, + { + "epoch": 1.3726472881167169, + "grad_norm": 0.9531069397926331, + "learning_rate": 3.431339792849984e-05, + "loss": 0.0978, + "num_input_tokens_seen": 14997568, + "step": 12325 + }, + { + "epoch": 1.373204143000334, + "grad_norm": 0.05925427004694939, + "learning_rate": 3.4327319300590266e-05, + "loss": 0.1035, + "num_input_tokens_seen": 15003904, + "step": 12330 + }, + { + "epoch": 1.3737609978839513, + "grad_norm": 1.6387014389038086, + "learning_rate": 3.43412406726807e-05, + "loss": 0.1674, + "num_input_tokens_seen": 15010016, + "step": 12335 + }, + { + "epoch": 1.3743178527675688, + "grad_norm": 0.6609817147254944, + "learning_rate": 3.4355162044771135e-05, + "loss": 0.2361, + "num_input_tokens_seen": 15016160, + "step": 12340 + }, + { + "epoch": 1.374874707651186, + "grad_norm": 0.45978960394859314, + "learning_rate": 3.436908341686157e-05, + "loss": 0.1822, + "num_input_tokens_seen": 15022272, + "step": 12345 + }, + { + "epoch": 1.3754315625348035, + "grad_norm": 3.7724664211273193, + "learning_rate": 3.4383004788952e-05, + "loss": 0.2911, + "num_input_tokens_seen": 15028416, + "step": 12350 + }, + { + "epoch": 1.3759884174184207, + "grad_norm": 0.018153788521885872, + "learning_rate": 3.439692616104244e-05, + "loss": 0.1623, + "num_input_tokens_seen": 15034464, + "step": 12355 + }, + { + "epoch": 1.3765452723020382, + "grad_norm": 1.8210185766220093, + "learning_rate": 3.4410847533132865e-05, + "loss": 0.2146, + "num_input_tokens_seen": 15039808, + "step": 12360 + }, + { + "epoch": 1.3771021271856554, + "grad_norm": 0.32388386130332947, + "learning_rate": 3.44247689052233e-05, + "loss": 0.072, + "num_input_tokens_seen": 15045984, + "step": 12365 + }, + { + "epoch": 1.3776589820692728, + "grad_norm": 3.4146652221679688, + "learning_rate": 3.4438690277313734e-05, + "loss": 0.1025, + "num_input_tokens_seen": 15052256, + "step": 12370 + }, + { + "epoch": 1.37821583695289, + "grad_norm": 0.8050941824913025, + "learning_rate": 3.445261164940417e-05, + "loss": 0.07, + "num_input_tokens_seen": 15058496, + "step": 12375 + }, + { + "epoch": 1.3787726918365073, + "grad_norm": 0.4323500990867615, + "learning_rate": 3.44665330214946e-05, + "loss": 0.0732, + "num_input_tokens_seen": 15064416, + "step": 12380 + }, + { + "epoch": 1.3793295467201248, + "grad_norm": 1.061187505722046, + "learning_rate": 3.448045439358503e-05, + "loss": 0.105, + "num_input_tokens_seen": 15070656, + "step": 12385 + }, + { + "epoch": 1.379886401603742, + "grad_norm": 1.8916733264923096, + "learning_rate": 3.449437576567547e-05, + "loss": 0.357, + "num_input_tokens_seen": 15076352, + "step": 12390 + }, + { + "epoch": 1.3804432564873594, + "grad_norm": 0.06531421095132828, + "learning_rate": 3.45082971377659e-05, + "loss": 0.1144, + "num_input_tokens_seen": 15082592, + "step": 12395 + }, + { + "epoch": 1.3810001113709767, + "grad_norm": 0.6073611378669739, + "learning_rate": 3.452221850985633e-05, + "loss": 0.1428, + "num_input_tokens_seen": 15088800, + "step": 12400 + }, + { + "epoch": 1.3815569662545941, + "grad_norm": 0.6457718014717102, + "learning_rate": 3.453613988194677e-05, + "loss": 0.1711, + "num_input_tokens_seen": 15094496, + "step": 12405 + }, + { + "epoch": 1.3821138211382114, + "grad_norm": 0.47518038749694824, + "learning_rate": 3.4550061254037195e-05, + "loss": 0.0794, + "num_input_tokens_seen": 15100672, + "step": 12410 + }, + { + "epoch": 1.3826706760218288, + "grad_norm": 0.3904787302017212, + "learning_rate": 3.4563982626127636e-05, + "loss": 0.2342, + "num_input_tokens_seen": 15106496, + "step": 12415 + }, + { + "epoch": 1.383227530905446, + "grad_norm": 0.43202894926071167, + "learning_rate": 3.4577903998218064e-05, + "loss": 0.1158, + "num_input_tokens_seen": 15112640, + "step": 12420 + }, + { + "epoch": 1.3837843857890633, + "grad_norm": 0.5384976267814636, + "learning_rate": 3.45918253703085e-05, + "loss": 0.1445, + "num_input_tokens_seen": 15118624, + "step": 12425 + }, + { + "epoch": 1.3843412406726807, + "grad_norm": 0.5129441618919373, + "learning_rate": 3.460574674239893e-05, + "loss": 0.1261, + "num_input_tokens_seen": 15124352, + "step": 12430 + }, + { + "epoch": 1.384898095556298, + "grad_norm": 1.18551766872406, + "learning_rate": 3.4619668114489367e-05, + "loss": 0.1174, + "num_input_tokens_seen": 15130304, + "step": 12435 + }, + { + "epoch": 1.3854549504399154, + "grad_norm": 0.46116384863853455, + "learning_rate": 3.46335894865798e-05, + "loss": 0.1445, + "num_input_tokens_seen": 15136608, + "step": 12440 + }, + { + "epoch": 1.3860118053235326, + "grad_norm": 1.4513866901397705, + "learning_rate": 3.4647510858670235e-05, + "loss": 0.1641, + "num_input_tokens_seen": 15142656, + "step": 12445 + }, + { + "epoch": 1.38656866020715, + "grad_norm": 0.21156275272369385, + "learning_rate": 3.466143223076066e-05, + "loss": 0.0852, + "num_input_tokens_seen": 15148704, + "step": 12450 + }, + { + "epoch": 1.3871255150907673, + "grad_norm": 0.9366897940635681, + "learning_rate": 3.46753536028511e-05, + "loss": 0.1515, + "num_input_tokens_seen": 15154752, + "step": 12455 + }, + { + "epoch": 1.3876823699743848, + "grad_norm": 0.7644907832145691, + "learning_rate": 3.468927497494153e-05, + "loss": 0.1683, + "num_input_tokens_seen": 15160960, + "step": 12460 + }, + { + "epoch": 1.388239224858002, + "grad_norm": 0.5370027422904968, + "learning_rate": 3.4703196347031966e-05, + "loss": 0.0605, + "num_input_tokens_seen": 15167200, + "step": 12465 + }, + { + "epoch": 1.3887960797416192, + "grad_norm": 0.9789283275604248, + "learning_rate": 3.47171177191224e-05, + "loss": 0.0903, + "num_input_tokens_seen": 15172992, + "step": 12470 + }, + { + "epoch": 1.3893529346252367, + "grad_norm": 0.42731809616088867, + "learning_rate": 3.473103909121283e-05, + "loss": 0.1661, + "num_input_tokens_seen": 15178912, + "step": 12475 + }, + { + "epoch": 1.389909789508854, + "grad_norm": 1.0840736627578735, + "learning_rate": 3.474496046330327e-05, + "loss": 0.1355, + "num_input_tokens_seen": 15185408, + "step": 12480 + }, + { + "epoch": 1.3904666443924714, + "grad_norm": 1.3860961198806763, + "learning_rate": 3.4758881835393696e-05, + "loss": 0.1134, + "num_input_tokens_seen": 15191488, + "step": 12485 + }, + { + "epoch": 1.3910234992760886, + "grad_norm": 1.1140860319137573, + "learning_rate": 3.477280320748413e-05, + "loss": 0.1297, + "num_input_tokens_seen": 15197632, + "step": 12490 + }, + { + "epoch": 1.391580354159706, + "grad_norm": 2.5202581882476807, + "learning_rate": 3.4786724579574565e-05, + "loss": 0.2933, + "num_input_tokens_seen": 15204096, + "step": 12495 + }, + { + "epoch": 1.3921372090433233, + "grad_norm": 2.080564498901367, + "learning_rate": 3.480064595166499e-05, + "loss": 0.1992, + "num_input_tokens_seen": 15210080, + "step": 12500 + }, + { + "epoch": 1.3926940639269407, + "grad_norm": 0.5770233869552612, + "learning_rate": 3.4814567323755434e-05, + "loss": 0.0636, + "num_input_tokens_seen": 15215584, + "step": 12505 + }, + { + "epoch": 1.393250918810558, + "grad_norm": 2.646221160888672, + "learning_rate": 3.482848869584586e-05, + "loss": 0.269, + "num_input_tokens_seen": 15221504, + "step": 12510 + }, + { + "epoch": 1.3938077736941752, + "grad_norm": 1.2166087627410889, + "learning_rate": 3.4842410067936296e-05, + "loss": 0.1186, + "num_input_tokens_seen": 15227808, + "step": 12515 + }, + { + "epoch": 1.3943646285777926, + "grad_norm": 0.09890724718570709, + "learning_rate": 3.485633144002673e-05, + "loss": 0.1148, + "num_input_tokens_seen": 15233920, + "step": 12520 + }, + { + "epoch": 1.39492148346141, + "grad_norm": 0.06899519264698029, + "learning_rate": 3.4870252812117164e-05, + "loss": 0.0333, + "num_input_tokens_seen": 15240224, + "step": 12525 + }, + { + "epoch": 1.3954783383450273, + "grad_norm": 0.2952411472797394, + "learning_rate": 3.48841741842076e-05, + "loss": 0.0881, + "num_input_tokens_seen": 15246336, + "step": 12530 + }, + { + "epoch": 1.3960351932286446, + "grad_norm": 3.1303012371063232, + "learning_rate": 3.489809555629803e-05, + "loss": 0.3899, + "num_input_tokens_seen": 15252480, + "step": 12535 + }, + { + "epoch": 1.396592048112262, + "grad_norm": 0.11921894550323486, + "learning_rate": 3.491201692838846e-05, + "loss": 0.1429, + "num_input_tokens_seen": 15257984, + "step": 12540 + }, + { + "epoch": 1.3971489029958792, + "grad_norm": 0.05062161386013031, + "learning_rate": 3.4925938300478895e-05, + "loss": 0.2174, + "num_input_tokens_seen": 15263872, + "step": 12545 + }, + { + "epoch": 1.3977057578794967, + "grad_norm": 0.9694837927818298, + "learning_rate": 3.493985967256933e-05, + "loss": 0.1497, + "num_input_tokens_seen": 15269984, + "step": 12550 + }, + { + "epoch": 1.398262612763114, + "grad_norm": 0.4902922511100769, + "learning_rate": 3.495378104465976e-05, + "loss": 0.1753, + "num_input_tokens_seen": 15276032, + "step": 12555 + }, + { + "epoch": 1.3988194676467312, + "grad_norm": 0.2687428593635559, + "learning_rate": 3.49677024167502e-05, + "loss": 0.0997, + "num_input_tokens_seen": 15282688, + "step": 12560 + }, + { + "epoch": 1.3993763225303486, + "grad_norm": 0.5204976797103882, + "learning_rate": 3.4981623788840625e-05, + "loss": 0.1175, + "num_input_tokens_seen": 15288800, + "step": 12565 + }, + { + "epoch": 1.399933177413966, + "grad_norm": 1.2455590963363647, + "learning_rate": 3.4995545160931066e-05, + "loss": 0.1515, + "num_input_tokens_seen": 15295136, + "step": 12570 + }, + { + "epoch": 1.4004900322975833, + "grad_norm": 0.8291925191879272, + "learning_rate": 3.5009466533021494e-05, + "loss": 0.0507, + "num_input_tokens_seen": 15301472, + "step": 12575 + }, + { + "epoch": 1.4010468871812005, + "grad_norm": 1.8581411838531494, + "learning_rate": 3.5023387905111935e-05, + "loss": 0.1896, + "num_input_tokens_seen": 15307584, + "step": 12580 + }, + { + "epoch": 1.401603742064818, + "grad_norm": 0.1683465838432312, + "learning_rate": 3.503730927720236e-05, + "loss": 0.1267, + "num_input_tokens_seen": 15313536, + "step": 12585 + }, + { + "epoch": 1.4021605969484352, + "grad_norm": 0.044059012085199356, + "learning_rate": 3.505123064929279e-05, + "loss": 0.0766, + "num_input_tokens_seen": 15319744, + "step": 12590 + }, + { + "epoch": 1.4027174518320527, + "grad_norm": 0.5412081480026245, + "learning_rate": 3.506515202138323e-05, + "loss": 0.0281, + "num_input_tokens_seen": 15325696, + "step": 12595 + }, + { + "epoch": 1.4032743067156699, + "grad_norm": 1.1092714071273804, + "learning_rate": 3.507907339347366e-05, + "loss": 0.1511, + "num_input_tokens_seen": 15331840, + "step": 12600 + }, + { + "epoch": 1.4038311615992871, + "grad_norm": 1.065584421157837, + "learning_rate": 3.50929947655641e-05, + "loss": 0.1314, + "num_input_tokens_seen": 15337952, + "step": 12605 + }, + { + "epoch": 1.4043880164829046, + "grad_norm": 0.4998261034488678, + "learning_rate": 3.510691613765453e-05, + "loss": 0.1457, + "num_input_tokens_seen": 15344000, + "step": 12610 + }, + { + "epoch": 1.404944871366522, + "grad_norm": 0.7903715372085571, + "learning_rate": 3.512083750974496e-05, + "loss": 0.1499, + "num_input_tokens_seen": 15349824, + "step": 12615 + }, + { + "epoch": 1.4055017262501392, + "grad_norm": 0.5133092999458313, + "learning_rate": 3.5134758881835396e-05, + "loss": 0.1071, + "num_input_tokens_seen": 15355456, + "step": 12620 + }, + { + "epoch": 1.4060585811337565, + "grad_norm": 0.26684504747390747, + "learning_rate": 3.514868025392583e-05, + "loss": 0.13, + "num_input_tokens_seen": 15361408, + "step": 12625 + }, + { + "epoch": 1.406615436017374, + "grad_norm": 0.4376451373100281, + "learning_rate": 3.5162601626016265e-05, + "loss": 0.0675, + "num_input_tokens_seen": 15367360, + "step": 12630 + }, + { + "epoch": 1.4071722909009912, + "grad_norm": 0.2981870472431183, + "learning_rate": 3.517652299810669e-05, + "loss": 0.1369, + "num_input_tokens_seen": 15373536, + "step": 12635 + }, + { + "epoch": 1.4077291457846086, + "grad_norm": 0.8412713408470154, + "learning_rate": 3.519044437019713e-05, + "loss": 0.053, + "num_input_tokens_seen": 15379680, + "step": 12640 + }, + { + "epoch": 1.4082860006682258, + "grad_norm": 0.8292065858840942, + "learning_rate": 3.520436574228756e-05, + "loss": 0.067, + "num_input_tokens_seen": 15385952, + "step": 12645 + }, + { + "epoch": 1.408842855551843, + "grad_norm": 0.14533190429210663, + "learning_rate": 3.5218287114377995e-05, + "loss": 0.1069, + "num_input_tokens_seen": 15392256, + "step": 12650 + }, + { + "epoch": 1.4093997104354605, + "grad_norm": 0.7697882652282715, + "learning_rate": 3.523220848646843e-05, + "loss": 0.1864, + "num_input_tokens_seen": 15398016, + "step": 12655 + }, + { + "epoch": 1.409956565319078, + "grad_norm": 0.7404921054840088, + "learning_rate": 3.5246129858558864e-05, + "loss": 0.2674, + "num_input_tokens_seen": 15404032, + "step": 12660 + }, + { + "epoch": 1.4105134202026952, + "grad_norm": 0.44729167222976685, + "learning_rate": 3.526005123064929e-05, + "loss": 0.0965, + "num_input_tokens_seen": 15409248, + "step": 12665 + }, + { + "epoch": 1.4110702750863124, + "grad_norm": 0.6710172295570374, + "learning_rate": 3.527397260273973e-05, + "loss": 0.1587, + "num_input_tokens_seen": 15415456, + "step": 12670 + }, + { + "epoch": 1.41162712996993, + "grad_norm": 0.9623320698738098, + "learning_rate": 3.528789397483016e-05, + "loss": 0.2718, + "num_input_tokens_seen": 15421664, + "step": 12675 + }, + { + "epoch": 1.4121839848535471, + "grad_norm": 1.3754950761795044, + "learning_rate": 3.5301815346920594e-05, + "loss": 0.1, + "num_input_tokens_seen": 15427968, + "step": 12680 + }, + { + "epoch": 1.4127408397371646, + "grad_norm": 0.03290114179253578, + "learning_rate": 3.531573671901103e-05, + "loss": 0.062, + "num_input_tokens_seen": 15434272, + "step": 12685 + }, + { + "epoch": 1.4132976946207818, + "grad_norm": 0.1569998860359192, + "learning_rate": 3.5329658091101456e-05, + "loss": 0.0396, + "num_input_tokens_seen": 15440000, + "step": 12690 + }, + { + "epoch": 1.413854549504399, + "grad_norm": 2.6006901264190674, + "learning_rate": 3.53435794631919e-05, + "loss": 0.2633, + "num_input_tokens_seen": 15445952, + "step": 12695 + }, + { + "epoch": 1.4144114043880165, + "grad_norm": 1.7116414308547974, + "learning_rate": 3.5357500835282325e-05, + "loss": 0.2555, + "num_input_tokens_seen": 15452224, + "step": 12700 + }, + { + "epoch": 1.414968259271634, + "grad_norm": 1.114418625831604, + "learning_rate": 3.537142220737276e-05, + "loss": 0.2552, + "num_input_tokens_seen": 15458336, + "step": 12705 + }, + { + "epoch": 1.4155251141552512, + "grad_norm": 0.7334887385368347, + "learning_rate": 3.5385343579463194e-05, + "loss": 0.2137, + "num_input_tokens_seen": 15464704, + "step": 12710 + }, + { + "epoch": 1.4160819690388684, + "grad_norm": 0.7175560593605042, + "learning_rate": 3.539926495155363e-05, + "loss": 0.1048, + "num_input_tokens_seen": 15470752, + "step": 12715 + }, + { + "epoch": 1.4166388239224859, + "grad_norm": 0.3395783603191376, + "learning_rate": 3.541318632364406e-05, + "loss": 0.0661, + "num_input_tokens_seen": 15476928, + "step": 12720 + }, + { + "epoch": 1.417195678806103, + "grad_norm": 2.7254154682159424, + "learning_rate": 3.54271076957345e-05, + "loss": 0.1169, + "num_input_tokens_seen": 15482880, + "step": 12725 + }, + { + "epoch": 1.4177525336897205, + "grad_norm": 0.836045503616333, + "learning_rate": 3.5441029067824924e-05, + "loss": 0.1247, + "num_input_tokens_seen": 15489280, + "step": 12730 + }, + { + "epoch": 1.4183093885733378, + "grad_norm": 1.3306219577789307, + "learning_rate": 3.545495043991536e-05, + "loss": 0.138, + "num_input_tokens_seen": 15495520, + "step": 12735 + }, + { + "epoch": 1.418866243456955, + "grad_norm": 0.5262572765350342, + "learning_rate": 3.546887181200579e-05, + "loss": 0.0888, + "num_input_tokens_seen": 15501632, + "step": 12740 + }, + { + "epoch": 1.4194230983405725, + "grad_norm": 1.011008620262146, + "learning_rate": 3.548279318409623e-05, + "loss": 0.0989, + "num_input_tokens_seen": 15507776, + "step": 12745 + }, + { + "epoch": 1.41997995322419, + "grad_norm": 1.3495423793792725, + "learning_rate": 3.549671455618666e-05, + "loss": 0.152, + "num_input_tokens_seen": 15513728, + "step": 12750 + }, + { + "epoch": 1.4205368081078071, + "grad_norm": 0.3983905613422394, + "learning_rate": 3.551063592827709e-05, + "loss": 0.1273, + "num_input_tokens_seen": 15520032, + "step": 12755 + }, + { + "epoch": 1.4210936629914244, + "grad_norm": 0.7252240777015686, + "learning_rate": 3.552455730036753e-05, + "loss": 0.0661, + "num_input_tokens_seen": 15526272, + "step": 12760 + }, + { + "epoch": 1.4216505178750418, + "grad_norm": 0.4944974482059479, + "learning_rate": 3.553847867245796e-05, + "loss": 0.1954, + "num_input_tokens_seen": 15532288, + "step": 12765 + }, + { + "epoch": 1.422207372758659, + "grad_norm": 2.0562829971313477, + "learning_rate": 3.555240004454839e-05, + "loss": 0.1915, + "num_input_tokens_seen": 15538272, + "step": 12770 + }, + { + "epoch": 1.4227642276422765, + "grad_norm": 1.1942058801651, + "learning_rate": 3.5566321416638826e-05, + "loss": 0.0606, + "num_input_tokens_seen": 15544448, + "step": 12775 + }, + { + "epoch": 1.4233210825258937, + "grad_norm": 0.8662906289100647, + "learning_rate": 3.5580242788729254e-05, + "loss": 0.1958, + "num_input_tokens_seen": 15549920, + "step": 12780 + }, + { + "epoch": 1.423877937409511, + "grad_norm": 1.1130870580673218, + "learning_rate": 3.5594164160819695e-05, + "loss": 0.0711, + "num_input_tokens_seen": 15556288, + "step": 12785 + }, + { + "epoch": 1.4244347922931284, + "grad_norm": 1.1077351570129395, + "learning_rate": 3.560808553291012e-05, + "loss": 0.0914, + "num_input_tokens_seen": 15562432, + "step": 12790 + }, + { + "epoch": 1.4249916471767459, + "grad_norm": 0.10258408635854721, + "learning_rate": 3.562200690500056e-05, + "loss": 0.2486, + "num_input_tokens_seen": 15568608, + "step": 12795 + }, + { + "epoch": 1.425548502060363, + "grad_norm": 1.5480774641036987, + "learning_rate": 3.563592827709099e-05, + "loss": 0.1019, + "num_input_tokens_seen": 15575072, + "step": 12800 + }, + { + "epoch": 1.4261053569439803, + "grad_norm": 0.23599113523960114, + "learning_rate": 3.5649849649181426e-05, + "loss": 0.1648, + "num_input_tokens_seen": 15581088, + "step": 12805 + }, + { + "epoch": 1.4266622118275978, + "grad_norm": 2.243654727935791, + "learning_rate": 3.566377102127186e-05, + "loss": 0.2522, + "num_input_tokens_seen": 15586848, + "step": 12810 + }, + { + "epoch": 1.427219066711215, + "grad_norm": 0.6630223393440247, + "learning_rate": 3.5677692393362294e-05, + "loss": 0.1513, + "num_input_tokens_seen": 15593312, + "step": 12815 + }, + { + "epoch": 1.4277759215948325, + "grad_norm": 1.1887569427490234, + "learning_rate": 3.569161376545272e-05, + "loss": 0.1438, + "num_input_tokens_seen": 15599296, + "step": 12820 + }, + { + "epoch": 1.4283327764784497, + "grad_norm": 1.468045711517334, + "learning_rate": 3.5705535137543156e-05, + "loss": 0.1443, + "num_input_tokens_seen": 15605600, + "step": 12825 + }, + { + "epoch": 1.428889631362067, + "grad_norm": 0.00694076344370842, + "learning_rate": 3.571945650963359e-05, + "loss": 0.0917, + "num_input_tokens_seen": 15611840, + "step": 12830 + }, + { + "epoch": 1.4294464862456844, + "grad_norm": 0.32159918546676636, + "learning_rate": 3.5733377881724025e-05, + "loss": 0.0823, + "num_input_tokens_seen": 15617664, + "step": 12835 + }, + { + "epoch": 1.4300033411293018, + "grad_norm": 0.31981056928634644, + "learning_rate": 3.574729925381446e-05, + "loss": 0.1552, + "num_input_tokens_seen": 15623680, + "step": 12840 + }, + { + "epoch": 1.430560196012919, + "grad_norm": 0.301403284072876, + "learning_rate": 3.576122062590489e-05, + "loss": 0.0547, + "num_input_tokens_seen": 15629312, + "step": 12845 + }, + { + "epoch": 1.4311170508965363, + "grad_norm": 0.9803117513656616, + "learning_rate": 3.577514199799533e-05, + "loss": 0.1263, + "num_input_tokens_seen": 15635616, + "step": 12850 + }, + { + "epoch": 1.4316739057801537, + "grad_norm": 0.09952536225318909, + "learning_rate": 3.5789063370085755e-05, + "loss": 0.0305, + "num_input_tokens_seen": 15641504, + "step": 12855 + }, + { + "epoch": 1.432230760663771, + "grad_norm": 1.0794061422348022, + "learning_rate": 3.5802984742176196e-05, + "loss": 0.0796, + "num_input_tokens_seen": 15647328, + "step": 12860 + }, + { + "epoch": 1.4327876155473884, + "grad_norm": 0.1451895385980606, + "learning_rate": 3.5816906114266624e-05, + "loss": 0.115, + "num_input_tokens_seen": 15653376, + "step": 12865 + }, + { + "epoch": 1.4333444704310057, + "grad_norm": 1.4850504398345947, + "learning_rate": 3.583082748635705e-05, + "loss": 0.1368, + "num_input_tokens_seen": 15659712, + "step": 12870 + }, + { + "epoch": 1.4339013253146229, + "grad_norm": 0.09945783764123917, + "learning_rate": 3.584474885844749e-05, + "loss": 0.128, + "num_input_tokens_seen": 15665536, + "step": 12875 + }, + { + "epoch": 1.4344581801982403, + "grad_norm": 0.6346538662910461, + "learning_rate": 3.585867023053792e-05, + "loss": 0.1474, + "num_input_tokens_seen": 15671840, + "step": 12880 + }, + { + "epoch": 1.4350150350818578, + "grad_norm": 2.3390142917633057, + "learning_rate": 3.587259160262836e-05, + "loss": 0.2148, + "num_input_tokens_seen": 15678080, + "step": 12885 + }, + { + "epoch": 1.435571889965475, + "grad_norm": 2.243438243865967, + "learning_rate": 3.588651297471879e-05, + "loss": 0.1047, + "num_input_tokens_seen": 15684128, + "step": 12890 + }, + { + "epoch": 1.4361287448490923, + "grad_norm": 1.1911344528198242, + "learning_rate": 3.590043434680922e-05, + "loss": 0.0916, + "num_input_tokens_seen": 15690336, + "step": 12895 + }, + { + "epoch": 1.4366855997327097, + "grad_norm": 0.04561017081141472, + "learning_rate": 3.591435571889966e-05, + "loss": 0.0635, + "num_input_tokens_seen": 15696672, + "step": 12900 + }, + { + "epoch": 1.437242454616327, + "grad_norm": 1.6535218954086304, + "learning_rate": 3.592827709099009e-05, + "loss": 0.1643, + "num_input_tokens_seen": 15702752, + "step": 12905 + }, + { + "epoch": 1.4377993094999444, + "grad_norm": 0.38414451479911804, + "learning_rate": 3.5942198463080526e-05, + "loss": 0.1904, + "num_input_tokens_seen": 15708896, + "step": 12910 + }, + { + "epoch": 1.4383561643835616, + "grad_norm": 0.3343403935432434, + "learning_rate": 3.5956119835170954e-05, + "loss": 0.0652, + "num_input_tokens_seen": 15715104, + "step": 12915 + }, + { + "epoch": 1.4389130192671788, + "grad_norm": 0.24260976910591125, + "learning_rate": 3.597004120726139e-05, + "loss": 0.1208, + "num_input_tokens_seen": 15721216, + "step": 12920 + }, + { + "epoch": 1.4394698741507963, + "grad_norm": 1.0492351055145264, + "learning_rate": 3.598396257935182e-05, + "loss": 0.1144, + "num_input_tokens_seen": 15727232, + "step": 12925 + }, + { + "epoch": 1.4400267290344138, + "grad_norm": 0.07397622615098953, + "learning_rate": 3.599788395144226e-05, + "loss": 0.1139, + "num_input_tokens_seen": 15733376, + "step": 12930 + }, + { + "epoch": 1.440583583918031, + "grad_norm": 1.0553905963897705, + "learning_rate": 3.601180532353269e-05, + "loss": 0.2745, + "num_input_tokens_seen": 15739392, + "step": 12935 + }, + { + "epoch": 1.4411404388016482, + "grad_norm": 0.36781349778175354, + "learning_rate": 3.6025726695623125e-05, + "loss": 0.1572, + "num_input_tokens_seen": 15745312, + "step": 12940 + }, + { + "epoch": 1.4416972936852657, + "grad_norm": 0.5493220090866089, + "learning_rate": 3.603964806771355e-05, + "loss": 0.0751, + "num_input_tokens_seen": 15751584, + "step": 12945 + }, + { + "epoch": 1.442254148568883, + "grad_norm": 0.26732516288757324, + "learning_rate": 3.6053569439803994e-05, + "loss": 0.2184, + "num_input_tokens_seen": 15758176, + "step": 12950 + }, + { + "epoch": 1.4428110034525004, + "grad_norm": 1.2433706521987915, + "learning_rate": 3.606749081189442e-05, + "loss": 0.1683, + "num_input_tokens_seen": 15764608, + "step": 12955 + }, + { + "epoch": 1.4433678583361176, + "grad_norm": 0.1135256290435791, + "learning_rate": 3.6081412183984856e-05, + "loss": 0.0731, + "num_input_tokens_seen": 15770688, + "step": 12960 + }, + { + "epoch": 1.4439247132197348, + "grad_norm": 1.480631947517395, + "learning_rate": 3.609533355607529e-05, + "loss": 0.248, + "num_input_tokens_seen": 15776352, + "step": 12965 + }, + { + "epoch": 1.4444815681033523, + "grad_norm": 1.8289003372192383, + "learning_rate": 3.610925492816572e-05, + "loss": 0.1158, + "num_input_tokens_seen": 15782560, + "step": 12970 + }, + { + "epoch": 1.4450384229869697, + "grad_norm": 1.468855381011963, + "learning_rate": 3.612317630025616e-05, + "loss": 0.1842, + "num_input_tokens_seen": 15788736, + "step": 12975 + }, + { + "epoch": 1.445595277870587, + "grad_norm": 0.09640523046255112, + "learning_rate": 3.6137097672346586e-05, + "loss": 0.0966, + "num_input_tokens_seen": 15794880, + "step": 12980 + }, + { + "epoch": 1.4461521327542042, + "grad_norm": 0.7809175252914429, + "learning_rate": 3.615101904443702e-05, + "loss": 0.1239, + "num_input_tokens_seen": 15801184, + "step": 12985 + }, + { + "epoch": 1.4467089876378216, + "grad_norm": 0.04460011050105095, + "learning_rate": 3.6164940416527455e-05, + "loss": 0.0372, + "num_input_tokens_seen": 15807552, + "step": 12990 + }, + { + "epoch": 1.4472658425214389, + "grad_norm": 1.8124089241027832, + "learning_rate": 3.617886178861789e-05, + "loss": 0.1904, + "num_input_tokens_seen": 15813888, + "step": 12995 + }, + { + "epoch": 1.4478226974050563, + "grad_norm": 0.01569020189344883, + "learning_rate": 3.6192783160708324e-05, + "loss": 0.0461, + "num_input_tokens_seen": 15819904, + "step": 13000 + }, + { + "epoch": 1.4483795522886735, + "grad_norm": 0.8170429468154907, + "learning_rate": 3.620670453279875e-05, + "loss": 0.3104, + "num_input_tokens_seen": 15825824, + "step": 13005 + }, + { + "epoch": 1.4489364071722908, + "grad_norm": 1.0280799865722656, + "learning_rate": 3.6220625904889186e-05, + "loss": 0.2067, + "num_input_tokens_seen": 15832064, + "step": 13010 + }, + { + "epoch": 1.4494932620559082, + "grad_norm": 0.8989758491516113, + "learning_rate": 3.623454727697962e-05, + "loss": 0.1494, + "num_input_tokens_seen": 15838144, + "step": 13015 + }, + { + "epoch": 1.4500501169395257, + "grad_norm": 1.3798553943634033, + "learning_rate": 3.6248468649070054e-05, + "loss": 0.2487, + "num_input_tokens_seen": 15844000, + "step": 13020 + }, + { + "epoch": 1.450606971823143, + "grad_norm": 0.545330286026001, + "learning_rate": 3.626239002116049e-05, + "loss": 0.1277, + "num_input_tokens_seen": 15850080, + "step": 13025 + }, + { + "epoch": 1.4511638267067601, + "grad_norm": 0.7609875202178955, + "learning_rate": 3.627631139325092e-05, + "loss": 0.1022, + "num_input_tokens_seen": 15856544, + "step": 13030 + }, + { + "epoch": 1.4517206815903776, + "grad_norm": 1.580399990081787, + "learning_rate": 3.629023276534135e-05, + "loss": 0.2747, + "num_input_tokens_seen": 15862784, + "step": 13035 + }, + { + "epoch": 1.4522775364739948, + "grad_norm": 0.3922373354434967, + "learning_rate": 3.630415413743179e-05, + "loss": 0.1166, + "num_input_tokens_seen": 15868544, + "step": 13040 + }, + { + "epoch": 1.4528343913576123, + "grad_norm": 1.0694553852081299, + "learning_rate": 3.631807550952222e-05, + "loss": 0.1097, + "num_input_tokens_seen": 15874944, + "step": 13045 + }, + { + "epoch": 1.4533912462412295, + "grad_norm": 1.4994357824325562, + "learning_rate": 3.6331996881612653e-05, + "loss": 0.2404, + "num_input_tokens_seen": 15881184, + "step": 13050 + }, + { + "epoch": 1.4539481011248467, + "grad_norm": 0.4301772713661194, + "learning_rate": 3.634591825370309e-05, + "loss": 0.0728, + "num_input_tokens_seen": 15887424, + "step": 13055 + }, + { + "epoch": 1.4545049560084642, + "grad_norm": 0.5989920496940613, + "learning_rate": 3.6359839625793515e-05, + "loss": 0.1342, + "num_input_tokens_seen": 15893248, + "step": 13060 + }, + { + "epoch": 1.4550618108920816, + "grad_norm": 1.5322109460830688, + "learning_rate": 3.6373760997883956e-05, + "loss": 0.1121, + "num_input_tokens_seen": 15899168, + "step": 13065 + }, + { + "epoch": 1.4556186657756989, + "grad_norm": 0.8048309087753296, + "learning_rate": 3.6387682369974384e-05, + "loss": 0.0978, + "num_input_tokens_seen": 15905248, + "step": 13070 + }, + { + "epoch": 1.456175520659316, + "grad_norm": 0.11462926119565964, + "learning_rate": 3.640160374206482e-05, + "loss": 0.1084, + "num_input_tokens_seen": 15911104, + "step": 13075 + }, + { + "epoch": 1.4567323755429336, + "grad_norm": 0.8787391781806946, + "learning_rate": 3.641552511415525e-05, + "loss": 0.1964, + "num_input_tokens_seen": 15917344, + "step": 13080 + }, + { + "epoch": 1.4572892304265508, + "grad_norm": 1.6010140180587769, + "learning_rate": 3.642944648624569e-05, + "loss": 0.182, + "num_input_tokens_seen": 15923360, + "step": 13085 + }, + { + "epoch": 1.4578460853101682, + "grad_norm": 0.16981972754001617, + "learning_rate": 3.644336785833612e-05, + "loss": 0.1503, + "num_input_tokens_seen": 15929568, + "step": 13090 + }, + { + "epoch": 1.4584029401937855, + "grad_norm": 0.5948046445846558, + "learning_rate": 3.645728923042655e-05, + "loss": 0.145, + "num_input_tokens_seen": 15935840, + "step": 13095 + }, + { + "epoch": 1.4589597950774027, + "grad_norm": 0.09348947554826736, + "learning_rate": 3.647121060251698e-05, + "loss": 0.0478, + "num_input_tokens_seen": 15942112, + "step": 13100 + }, + { + "epoch": 1.4595166499610202, + "grad_norm": 0.40231794118881226, + "learning_rate": 3.648513197460742e-05, + "loss": 0.1237, + "num_input_tokens_seen": 15948512, + "step": 13105 + }, + { + "epoch": 1.4600735048446376, + "grad_norm": 0.4087540805339813, + "learning_rate": 3.649905334669785e-05, + "loss": 0.11, + "num_input_tokens_seen": 15954560, + "step": 13110 + }, + { + "epoch": 1.4606303597282548, + "grad_norm": 0.34797045588493347, + "learning_rate": 3.6512974718788286e-05, + "loss": 0.0569, + "num_input_tokens_seen": 15960672, + "step": 13115 + }, + { + "epoch": 1.461187214611872, + "grad_norm": 0.48091667890548706, + "learning_rate": 3.652689609087872e-05, + "loss": 0.0849, + "num_input_tokens_seen": 15966688, + "step": 13120 + }, + { + "epoch": 1.4617440694954895, + "grad_norm": 1.47335946559906, + "learning_rate": 3.654081746296915e-05, + "loss": 0.0967, + "num_input_tokens_seen": 15972416, + "step": 13125 + }, + { + "epoch": 1.4623009243791067, + "grad_norm": 1.0112727880477905, + "learning_rate": 3.655473883505959e-05, + "loss": 0.1153, + "num_input_tokens_seen": 15978144, + "step": 13130 + }, + { + "epoch": 1.4628577792627242, + "grad_norm": 0.7366154193878174, + "learning_rate": 3.656866020715002e-05, + "loss": 0.182, + "num_input_tokens_seen": 15984320, + "step": 13135 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 0.8442792892456055, + "learning_rate": 3.658258157924045e-05, + "loss": 0.2058, + "num_input_tokens_seen": 15990560, + "step": 13140 + }, + { + "epoch": 1.4639714890299587, + "grad_norm": 0.582643449306488, + "learning_rate": 3.6596502951330885e-05, + "loss": 0.0312, + "num_input_tokens_seen": 15996480, + "step": 13145 + }, + { + "epoch": 1.4645283439135761, + "grad_norm": 0.8264581561088562, + "learning_rate": 3.661042432342131e-05, + "loss": 0.1182, + "num_input_tokens_seen": 16002240, + "step": 13150 + }, + { + "epoch": 1.4650851987971936, + "grad_norm": 0.7520602941513062, + "learning_rate": 3.6624345695511754e-05, + "loss": 0.21, + "num_input_tokens_seen": 16008320, + "step": 13155 + }, + { + "epoch": 1.4656420536808108, + "grad_norm": 0.06539395451545715, + "learning_rate": 3.663826706760218e-05, + "loss": 0.1335, + "num_input_tokens_seen": 16014560, + "step": 13160 + }, + { + "epoch": 1.466198908564428, + "grad_norm": 1.335884690284729, + "learning_rate": 3.665218843969262e-05, + "loss": 0.116, + "num_input_tokens_seen": 16020736, + "step": 13165 + }, + { + "epoch": 1.4667557634480455, + "grad_norm": 0.47022297978401184, + "learning_rate": 3.666610981178305e-05, + "loss": 0.0839, + "num_input_tokens_seen": 16026944, + "step": 13170 + }, + { + "epoch": 1.4673126183316627, + "grad_norm": 0.729362964630127, + "learning_rate": 3.6680031183873485e-05, + "loss": 0.0671, + "num_input_tokens_seen": 16033184, + "step": 13175 + }, + { + "epoch": 1.4678694732152802, + "grad_norm": 0.6362404823303223, + "learning_rate": 3.669395255596392e-05, + "loss": 0.0675, + "num_input_tokens_seen": 16039296, + "step": 13180 + }, + { + "epoch": 1.4684263280988974, + "grad_norm": 0.11497701704502106, + "learning_rate": 3.6707873928054346e-05, + "loss": 0.1166, + "num_input_tokens_seen": 16045440, + "step": 13185 + }, + { + "epoch": 1.4689831829825146, + "grad_norm": 0.00912399310618639, + "learning_rate": 3.672179530014479e-05, + "loss": 0.1357, + "num_input_tokens_seen": 16051904, + "step": 13190 + }, + { + "epoch": 1.469540037866132, + "grad_norm": 0.03045635111629963, + "learning_rate": 3.6735716672235215e-05, + "loss": 0.0412, + "num_input_tokens_seen": 16058112, + "step": 13195 + }, + { + "epoch": 1.4700968927497495, + "grad_norm": 0.500604510307312, + "learning_rate": 3.674963804432565e-05, + "loss": 0.1545, + "num_input_tokens_seen": 16064576, + "step": 13200 + }, + { + "epoch": 1.4706537476333668, + "grad_norm": 0.36141544580459595, + "learning_rate": 3.6763559416416084e-05, + "loss": 0.1234, + "num_input_tokens_seen": 16070624, + "step": 13205 + }, + { + "epoch": 1.471210602516984, + "grad_norm": 0.08225597441196442, + "learning_rate": 3.677748078850652e-05, + "loss": 0.0973, + "num_input_tokens_seen": 16076736, + "step": 13210 + }, + { + "epoch": 1.4717674574006014, + "grad_norm": 0.5662436485290527, + "learning_rate": 3.679140216059695e-05, + "loss": 0.108, + "num_input_tokens_seen": 16082912, + "step": 13215 + }, + { + "epoch": 1.4723243122842187, + "grad_norm": 0.7964692711830139, + "learning_rate": 3.680532353268739e-05, + "loss": 0.1503, + "num_input_tokens_seen": 16088544, + "step": 13220 + }, + { + "epoch": 1.4728811671678361, + "grad_norm": 2.14228892326355, + "learning_rate": 3.6819244904777814e-05, + "loss": 0.1141, + "num_input_tokens_seen": 16094784, + "step": 13225 + }, + { + "epoch": 1.4734380220514534, + "grad_norm": 0.2860046923160553, + "learning_rate": 3.683316627686825e-05, + "loss": 0.1511, + "num_input_tokens_seen": 16100800, + "step": 13230 + }, + { + "epoch": 1.4739948769350706, + "grad_norm": 0.2956373393535614, + "learning_rate": 3.684708764895868e-05, + "loss": 0.074, + "num_input_tokens_seen": 16107072, + "step": 13235 + }, + { + "epoch": 1.474551731818688, + "grad_norm": 1.3848133087158203, + "learning_rate": 3.686100902104912e-05, + "loss": 0.0799, + "num_input_tokens_seen": 16113216, + "step": 13240 + }, + { + "epoch": 1.4751085867023055, + "grad_norm": 0.9834355711936951, + "learning_rate": 3.687493039313955e-05, + "loss": 0.1928, + "num_input_tokens_seen": 16118752, + "step": 13245 + }, + { + "epoch": 1.4756654415859227, + "grad_norm": 0.3261396586894989, + "learning_rate": 3.688885176522998e-05, + "loss": 0.0325, + "num_input_tokens_seen": 16124672, + "step": 13250 + }, + { + "epoch": 1.47622229646954, + "grad_norm": 0.554797887802124, + "learning_rate": 3.690277313732042e-05, + "loss": 0.1149, + "num_input_tokens_seen": 16130688, + "step": 13255 + }, + { + "epoch": 1.4767791513531574, + "grad_norm": 0.6777223944664001, + "learning_rate": 3.691669450941085e-05, + "loss": 0.124, + "num_input_tokens_seen": 16136800, + "step": 13260 + }, + { + "epoch": 1.4773360062367746, + "grad_norm": 2.2869062423706055, + "learning_rate": 3.693061588150128e-05, + "loss": 0.3307, + "num_input_tokens_seen": 16142336, + "step": 13265 + }, + { + "epoch": 1.477892861120392, + "grad_norm": 0.19325445592403412, + "learning_rate": 3.6944537253591716e-05, + "loss": 0.0379, + "num_input_tokens_seen": 16148256, + "step": 13270 + }, + { + "epoch": 1.4784497160040093, + "grad_norm": 1.025839924812317, + "learning_rate": 3.6958458625682144e-05, + "loss": 0.1844, + "num_input_tokens_seen": 16154368, + "step": 13275 + }, + { + "epoch": 1.4790065708876265, + "grad_norm": 1.44358229637146, + "learning_rate": 3.6972379997772585e-05, + "loss": 0.1837, + "num_input_tokens_seen": 16160320, + "step": 13280 + }, + { + "epoch": 1.479563425771244, + "grad_norm": 0.045875366777181625, + "learning_rate": 3.698630136986301e-05, + "loss": 0.1702, + "num_input_tokens_seen": 16166016, + "step": 13285 + }, + { + "epoch": 1.4801202806548615, + "grad_norm": 1.8264719247817993, + "learning_rate": 3.700022274195345e-05, + "loss": 0.2161, + "num_input_tokens_seen": 16172288, + "step": 13290 + }, + { + "epoch": 1.4806771355384787, + "grad_norm": 0.18456552922725677, + "learning_rate": 3.701414411404388e-05, + "loss": 0.1161, + "num_input_tokens_seen": 16178528, + "step": 13295 + }, + { + "epoch": 1.481233990422096, + "grad_norm": 0.8072639107704163, + "learning_rate": 3.7028065486134316e-05, + "loss": 0.1644, + "num_input_tokens_seen": 16184800, + "step": 13300 + }, + { + "epoch": 1.4817908453057134, + "grad_norm": 0.3005400598049164, + "learning_rate": 3.704198685822475e-05, + "loss": 0.0345, + "num_input_tokens_seen": 16191008, + "step": 13305 + }, + { + "epoch": 1.4823477001893306, + "grad_norm": 0.08858933299779892, + "learning_rate": 3.7055908230315184e-05, + "loss": 0.1303, + "num_input_tokens_seen": 16197472, + "step": 13310 + }, + { + "epoch": 1.482904555072948, + "grad_norm": 0.15031836926937103, + "learning_rate": 3.706982960240561e-05, + "loss": 0.0372, + "num_input_tokens_seen": 16203648, + "step": 13315 + }, + { + "epoch": 1.4834614099565653, + "grad_norm": 1.3237051963806152, + "learning_rate": 3.7083750974496046e-05, + "loss": 0.0829, + "num_input_tokens_seen": 16209792, + "step": 13320 + }, + { + "epoch": 1.4840182648401825, + "grad_norm": 0.48485347628593445, + "learning_rate": 3.709767234658648e-05, + "loss": 0.0861, + "num_input_tokens_seen": 16215936, + "step": 13325 + }, + { + "epoch": 1.4845751197238, + "grad_norm": 0.7899525165557861, + "learning_rate": 3.7111593718676915e-05, + "loss": 0.1847, + "num_input_tokens_seen": 16222112, + "step": 13330 + }, + { + "epoch": 1.4851319746074174, + "grad_norm": 0.2943260371685028, + "learning_rate": 3.712551509076735e-05, + "loss": 0.1501, + "num_input_tokens_seen": 16228192, + "step": 13335 + }, + { + "epoch": 1.4856888294910346, + "grad_norm": 0.14099690318107605, + "learning_rate": 3.713943646285778e-05, + "loss": 0.1173, + "num_input_tokens_seen": 16234304, + "step": 13340 + }, + { + "epoch": 1.4862456843746519, + "grad_norm": 1.0446399450302124, + "learning_rate": 3.715335783494822e-05, + "loss": 0.0887, + "num_input_tokens_seen": 16240512, + "step": 13345 + }, + { + "epoch": 1.4868025392582693, + "grad_norm": 0.29097944498062134, + "learning_rate": 3.7167279207038645e-05, + "loss": 0.136, + "num_input_tokens_seen": 16246816, + "step": 13350 + }, + { + "epoch": 1.4873593941418866, + "grad_norm": 0.08792471885681152, + "learning_rate": 3.718120057912908e-05, + "loss": 0.0578, + "num_input_tokens_seen": 16253056, + "step": 13355 + }, + { + "epoch": 1.487916249025504, + "grad_norm": 0.17259126901626587, + "learning_rate": 3.7195121951219514e-05, + "loss": 0.0254, + "num_input_tokens_seen": 16259296, + "step": 13360 + }, + { + "epoch": 1.4884731039091212, + "grad_norm": 0.40128111839294434, + "learning_rate": 3.720904332330994e-05, + "loss": 0.1059, + "num_input_tokens_seen": 16264704, + "step": 13365 + }, + { + "epoch": 1.4890299587927387, + "grad_norm": 1.5735524892807007, + "learning_rate": 3.722296469540038e-05, + "loss": 0.1437, + "num_input_tokens_seen": 16271040, + "step": 13370 + }, + { + "epoch": 1.489586813676356, + "grad_norm": 0.23785096406936646, + "learning_rate": 3.723688606749081e-05, + "loss": 0.2051, + "num_input_tokens_seen": 16277024, + "step": 13375 + }, + { + "epoch": 1.4901436685599734, + "grad_norm": 0.717751681804657, + "learning_rate": 3.7250807439581245e-05, + "loss": 0.1093, + "num_input_tokens_seen": 16283360, + "step": 13380 + }, + { + "epoch": 1.4907005234435906, + "grad_norm": 0.19258585572242737, + "learning_rate": 3.726472881167168e-05, + "loss": 0.118, + "num_input_tokens_seen": 16288256, + "step": 13385 + }, + { + "epoch": 1.4912573783272078, + "grad_norm": 2.423279285430908, + "learning_rate": 3.727865018376211e-05, + "loss": 0.077, + "num_input_tokens_seen": 16294304, + "step": 13390 + }, + { + "epoch": 1.4918142332108253, + "grad_norm": 1.5521620512008667, + "learning_rate": 3.729257155585255e-05, + "loss": 0.1714, + "num_input_tokens_seen": 16300288, + "step": 13395 + }, + { + "epoch": 1.4923710880944425, + "grad_norm": 0.9244535565376282, + "learning_rate": 3.730649292794298e-05, + "loss": 0.0761, + "num_input_tokens_seen": 16306688, + "step": 13400 + }, + { + "epoch": 1.49292794297806, + "grad_norm": 0.33263099193573, + "learning_rate": 3.732041430003341e-05, + "loss": 0.0805, + "num_input_tokens_seen": 16312736, + "step": 13405 + }, + { + "epoch": 1.4934847978616772, + "grad_norm": 0.24894586205482483, + "learning_rate": 3.733433567212385e-05, + "loss": 0.1733, + "num_input_tokens_seen": 16318752, + "step": 13410 + }, + { + "epoch": 1.4940416527452947, + "grad_norm": 0.2120126187801361, + "learning_rate": 3.734825704421428e-05, + "loss": 0.0718, + "num_input_tokens_seen": 16324544, + "step": 13415 + }, + { + "epoch": 1.4945985076289119, + "grad_norm": 1.0811632871627808, + "learning_rate": 3.736217841630471e-05, + "loss": 0.0863, + "num_input_tokens_seen": 16330400, + "step": 13420 + }, + { + "epoch": 1.4951553625125293, + "grad_norm": 0.09616976231336594, + "learning_rate": 3.737609978839515e-05, + "loss": 0.0654, + "num_input_tokens_seen": 16336768, + "step": 13425 + }, + { + "epoch": 1.4957122173961466, + "grad_norm": 2.1449427604675293, + "learning_rate": 3.7390021160485574e-05, + "loss": 0.2797, + "num_input_tokens_seen": 16342816, + "step": 13430 + }, + { + "epoch": 1.4962690722797638, + "grad_norm": 0.9781050682067871, + "learning_rate": 3.7403942532576015e-05, + "loss": 0.1676, + "num_input_tokens_seen": 16348416, + "step": 13435 + }, + { + "epoch": 1.4968259271633813, + "grad_norm": 0.42138200998306274, + "learning_rate": 3.741786390466644e-05, + "loss": 0.1244, + "num_input_tokens_seen": 16353920, + "step": 13440 + }, + { + "epoch": 1.4973827820469985, + "grad_norm": 0.2706557512283325, + "learning_rate": 3.7431785276756884e-05, + "loss": 0.0651, + "num_input_tokens_seen": 16360512, + "step": 13445 + }, + { + "epoch": 1.497939636930616, + "grad_norm": 1.3726818561553955, + "learning_rate": 3.744570664884731e-05, + "loss": 0.1363, + "num_input_tokens_seen": 16366464, + "step": 13450 + }, + { + "epoch": 1.4984964918142332, + "grad_norm": 0.057623788714408875, + "learning_rate": 3.7459628020937746e-05, + "loss": 0.1503, + "num_input_tokens_seen": 16372672, + "step": 13455 + }, + { + "epoch": 1.4990533466978506, + "grad_norm": 0.7692272663116455, + "learning_rate": 3.747354939302818e-05, + "loss": 0.2223, + "num_input_tokens_seen": 16378496, + "step": 13460 + }, + { + "epoch": 1.4996102015814678, + "grad_norm": 2.6367039680480957, + "learning_rate": 3.748747076511861e-05, + "loss": 0.3334, + "num_input_tokens_seen": 16384896, + "step": 13465 + }, + { + "epoch": 1.5001670564650853, + "grad_norm": 0.5182158350944519, + "learning_rate": 3.750139213720905e-05, + "loss": 0.0722, + "num_input_tokens_seen": 16391040, + "step": 13470 + }, + { + "epoch": 1.5007239113487025, + "grad_norm": 0.9409258961677551, + "learning_rate": 3.7515313509299477e-05, + "loss": 0.0989, + "num_input_tokens_seen": 16396928, + "step": 13475 + }, + { + "epoch": 1.5012807662323198, + "grad_norm": 0.1523229330778122, + "learning_rate": 3.752923488138991e-05, + "loss": 0.1693, + "num_input_tokens_seen": 16403040, + "step": 13480 + }, + { + "epoch": 1.5018376211159372, + "grad_norm": 1.782155990600586, + "learning_rate": 3.7543156253480345e-05, + "loss": 0.11, + "num_input_tokens_seen": 16408832, + "step": 13485 + }, + { + "epoch": 1.5023944759995547, + "grad_norm": 0.743585467338562, + "learning_rate": 3.755707762557078e-05, + "loss": 0.1748, + "num_input_tokens_seen": 16415008, + "step": 13490 + }, + { + "epoch": 1.502951330883172, + "grad_norm": 0.6615666747093201, + "learning_rate": 3.7570998997661214e-05, + "loss": 0.0782, + "num_input_tokens_seen": 16421312, + "step": 13495 + }, + { + "epoch": 1.5035081857667891, + "grad_norm": 0.1400112509727478, + "learning_rate": 3.758492036975165e-05, + "loss": 0.0817, + "num_input_tokens_seen": 16427360, + "step": 13500 + }, + { + "epoch": 1.5040650406504064, + "grad_norm": 1.652994155883789, + "learning_rate": 3.7598841741842076e-05, + "loss": 0.1729, + "num_input_tokens_seen": 16433728, + "step": 13505 + }, + { + "epoch": 1.5046218955340238, + "grad_norm": 0.46949583292007446, + "learning_rate": 3.761276311393251e-05, + "loss": 0.0279, + "num_input_tokens_seen": 16439904, + "step": 13510 + }, + { + "epoch": 1.5051787504176413, + "grad_norm": 0.20376534759998322, + "learning_rate": 3.7626684486022944e-05, + "loss": 0.1732, + "num_input_tokens_seen": 16445504, + "step": 13515 + }, + { + "epoch": 1.5057356053012585, + "grad_norm": 0.531666100025177, + "learning_rate": 3.764060585811338e-05, + "loss": 0.2188, + "num_input_tokens_seen": 16451520, + "step": 13520 + }, + { + "epoch": 1.5062924601848757, + "grad_norm": 0.31600189208984375, + "learning_rate": 3.765452723020381e-05, + "loss": 0.0938, + "num_input_tokens_seen": 16458272, + "step": 13525 + }, + { + "epoch": 1.5068493150684932, + "grad_norm": 0.951927125453949, + "learning_rate": 3.766844860229424e-05, + "loss": 0.1826, + "num_input_tokens_seen": 16463712, + "step": 13530 + }, + { + "epoch": 1.5074061699521106, + "grad_norm": 0.6017723679542542, + "learning_rate": 3.768236997438468e-05, + "loss": 0.2444, + "num_input_tokens_seen": 16469984, + "step": 13535 + }, + { + "epoch": 1.5079630248357279, + "grad_norm": 1.0784351825714111, + "learning_rate": 3.769629134647511e-05, + "loss": 0.0981, + "num_input_tokens_seen": 16476384, + "step": 13540 + }, + { + "epoch": 1.508519879719345, + "grad_norm": 0.01435894425958395, + "learning_rate": 3.7710212718565544e-05, + "loss": 0.0544, + "num_input_tokens_seen": 16482752, + "step": 13545 + }, + { + "epoch": 1.5090767346029623, + "grad_norm": 0.668915867805481, + "learning_rate": 3.772413409065598e-05, + "loss": 0.1995, + "num_input_tokens_seen": 16488224, + "step": 13550 + }, + { + "epoch": 1.5096335894865798, + "grad_norm": 0.6604841351509094, + "learning_rate": 3.7738055462746405e-05, + "loss": 0.1464, + "num_input_tokens_seen": 16494464, + "step": 13555 + }, + { + "epoch": 1.5101904443701972, + "grad_norm": 0.6890557408332825, + "learning_rate": 3.7751976834836847e-05, + "loss": 0.1042, + "num_input_tokens_seen": 16500480, + "step": 13560 + }, + { + "epoch": 1.5107472992538145, + "grad_norm": 0.877357006072998, + "learning_rate": 3.7765898206927274e-05, + "loss": 0.1634, + "num_input_tokens_seen": 16506304, + "step": 13565 + }, + { + "epoch": 1.5113041541374317, + "grad_norm": 0.1704031229019165, + "learning_rate": 3.777981957901771e-05, + "loss": 0.0773, + "num_input_tokens_seen": 16512608, + "step": 13570 + }, + { + "epoch": 1.5118610090210491, + "grad_norm": 0.6891864538192749, + "learning_rate": 3.779374095110814e-05, + "loss": 0.3113, + "num_input_tokens_seen": 16519296, + "step": 13575 + }, + { + "epoch": 1.5124178639046666, + "grad_norm": 1.479729413986206, + "learning_rate": 3.780766232319858e-05, + "loss": 0.1003, + "num_input_tokens_seen": 16525664, + "step": 13580 + }, + { + "epoch": 1.5129747187882838, + "grad_norm": 1.237374186515808, + "learning_rate": 3.782158369528901e-05, + "loss": 0.0964, + "num_input_tokens_seen": 16531936, + "step": 13585 + }, + { + "epoch": 1.513531573671901, + "grad_norm": 0.9136253595352173, + "learning_rate": 3.7835505067379446e-05, + "loss": 0.0601, + "num_input_tokens_seen": 16538368, + "step": 13590 + }, + { + "epoch": 1.5140884285555183, + "grad_norm": 1.7702659368515015, + "learning_rate": 3.784942643946987e-05, + "loss": 0.1662, + "num_input_tokens_seen": 16544768, + "step": 13595 + }, + { + "epoch": 1.5146452834391357, + "grad_norm": 0.033374615013599396, + "learning_rate": 3.786334781156031e-05, + "loss": 0.0777, + "num_input_tokens_seen": 16550944, + "step": 13600 + }, + { + "epoch": 1.5152021383227532, + "grad_norm": 0.9305567145347595, + "learning_rate": 3.787726918365074e-05, + "loss": 0.1229, + "num_input_tokens_seen": 16556864, + "step": 13605 + }, + { + "epoch": 1.5157589932063704, + "grad_norm": 1.8261473178863525, + "learning_rate": 3.7891190555741176e-05, + "loss": 0.1793, + "num_input_tokens_seen": 16562880, + "step": 13610 + }, + { + "epoch": 1.5163158480899877, + "grad_norm": 0.8772442936897278, + "learning_rate": 3.790511192783161e-05, + "loss": 0.2142, + "num_input_tokens_seen": 16569088, + "step": 13615 + }, + { + "epoch": 1.516872702973605, + "grad_norm": 0.3633541464805603, + "learning_rate": 3.791903329992204e-05, + "loss": 0.1446, + "num_input_tokens_seen": 16575360, + "step": 13620 + }, + { + "epoch": 1.5174295578572226, + "grad_norm": 0.7830557227134705, + "learning_rate": 3.793295467201248e-05, + "loss": 0.1993, + "num_input_tokens_seen": 16581376, + "step": 13625 + }, + { + "epoch": 1.5179864127408398, + "grad_norm": 0.22402562201023102, + "learning_rate": 3.794687604410291e-05, + "loss": 0.107, + "num_input_tokens_seen": 16587552, + "step": 13630 + }, + { + "epoch": 1.518543267624457, + "grad_norm": 1.005935549736023, + "learning_rate": 3.796079741619334e-05, + "loss": 0.1803, + "num_input_tokens_seen": 16593792, + "step": 13635 + }, + { + "epoch": 1.5191001225080742, + "grad_norm": 1.1921169757843018, + "learning_rate": 3.7974718788283775e-05, + "loss": 0.2406, + "num_input_tokens_seen": 16600032, + "step": 13640 + }, + { + "epoch": 1.5196569773916917, + "grad_norm": 0.5533566474914551, + "learning_rate": 3.79886401603742e-05, + "loss": 0.2239, + "num_input_tokens_seen": 16605984, + "step": 13645 + }, + { + "epoch": 1.5202138322753092, + "grad_norm": 0.2914443612098694, + "learning_rate": 3.8002561532464644e-05, + "loss": 0.0369, + "num_input_tokens_seen": 16612064, + "step": 13650 + }, + { + "epoch": 1.5207706871589264, + "grad_norm": 1.1298890113830566, + "learning_rate": 3.801648290455507e-05, + "loss": 0.1551, + "num_input_tokens_seen": 16618112, + "step": 13655 + }, + { + "epoch": 1.5213275420425436, + "grad_norm": 0.4668244421482086, + "learning_rate": 3.8030404276645506e-05, + "loss": 0.1674, + "num_input_tokens_seen": 16623872, + "step": 13660 + }, + { + "epoch": 1.521884396926161, + "grad_norm": 1.7957186698913574, + "learning_rate": 3.804432564873594e-05, + "loss": 0.1114, + "num_input_tokens_seen": 16629856, + "step": 13665 + }, + { + "epoch": 1.5224412518097785, + "grad_norm": 0.6765850782394409, + "learning_rate": 3.8058247020826375e-05, + "loss": 0.138, + "num_input_tokens_seen": 16636192, + "step": 13670 + }, + { + "epoch": 1.5229981066933957, + "grad_norm": 0.35764527320861816, + "learning_rate": 3.807216839291681e-05, + "loss": 0.1035, + "num_input_tokens_seen": 16642432, + "step": 13675 + }, + { + "epoch": 1.523554961577013, + "grad_norm": 0.08851943165063858, + "learning_rate": 3.808608976500724e-05, + "loss": 0.0975, + "num_input_tokens_seen": 16648768, + "step": 13680 + }, + { + "epoch": 1.5241118164606302, + "grad_norm": 1.1637505292892456, + "learning_rate": 3.810001113709767e-05, + "loss": 0.1709, + "num_input_tokens_seen": 16654944, + "step": 13685 + }, + { + "epoch": 1.5246686713442477, + "grad_norm": 0.7441772818565369, + "learning_rate": 3.8113932509188105e-05, + "loss": 0.2895, + "num_input_tokens_seen": 16661184, + "step": 13690 + }, + { + "epoch": 1.5252255262278651, + "grad_norm": 0.46273958683013916, + "learning_rate": 3.812785388127854e-05, + "loss": 0.0844, + "num_input_tokens_seen": 16667072, + "step": 13695 + }, + { + "epoch": 1.5257823811114823, + "grad_norm": 0.7129637598991394, + "learning_rate": 3.8141775253368974e-05, + "loss": 0.0752, + "num_input_tokens_seen": 16673216, + "step": 13700 + }, + { + "epoch": 1.5263392359950996, + "grad_norm": 1.1835213899612427, + "learning_rate": 3.815569662545941e-05, + "loss": 0.1014, + "num_input_tokens_seen": 16679072, + "step": 13705 + }, + { + "epoch": 1.526896090878717, + "grad_norm": 0.11393173784017563, + "learning_rate": 3.816961799754984e-05, + "loss": 0.167, + "num_input_tokens_seen": 16685440, + "step": 13710 + }, + { + "epoch": 1.5274529457623345, + "grad_norm": 0.37932735681533813, + "learning_rate": 3.818353936964028e-05, + "loss": 0.1112, + "num_input_tokens_seen": 16691584, + "step": 13715 + }, + { + "epoch": 1.5280098006459517, + "grad_norm": 1.4576164484024048, + "learning_rate": 3.8197460741730704e-05, + "loss": 0.1422, + "num_input_tokens_seen": 16696800, + "step": 13720 + }, + { + "epoch": 1.528566655529569, + "grad_norm": 1.3180598020553589, + "learning_rate": 3.8211382113821145e-05, + "loss": 0.2483, + "num_input_tokens_seen": 16702656, + "step": 13725 + }, + { + "epoch": 1.5291235104131862, + "grad_norm": 0.3457951247692108, + "learning_rate": 3.822530348591157e-05, + "loss": 0.1421, + "num_input_tokens_seen": 16708832, + "step": 13730 + }, + { + "epoch": 1.5296803652968036, + "grad_norm": 0.5482712984085083, + "learning_rate": 3.823922485800201e-05, + "loss": 0.2005, + "num_input_tokens_seen": 16715104, + "step": 13735 + }, + { + "epoch": 1.530237220180421, + "grad_norm": 0.19606377184391022, + "learning_rate": 3.825314623009244e-05, + "loss": 0.1129, + "num_input_tokens_seen": 16721248, + "step": 13740 + }, + { + "epoch": 1.5307940750640383, + "grad_norm": 1.6726155281066895, + "learning_rate": 3.826706760218287e-05, + "loss": 0.118, + "num_input_tokens_seen": 16727264, + "step": 13745 + }, + { + "epoch": 1.5313509299476555, + "grad_norm": 1.3626707792282104, + "learning_rate": 3.828098897427331e-05, + "loss": 0.2415, + "num_input_tokens_seen": 16733504, + "step": 13750 + }, + { + "epoch": 1.531907784831273, + "grad_norm": 0.07497505098581314, + "learning_rate": 3.829491034636374e-05, + "loss": 0.122, + "num_input_tokens_seen": 16739744, + "step": 13755 + }, + { + "epoch": 1.5324646397148904, + "grad_norm": 0.8822043538093567, + "learning_rate": 3.830883171845417e-05, + "loss": 0.1818, + "num_input_tokens_seen": 16745536, + "step": 13760 + }, + { + "epoch": 1.5330214945985077, + "grad_norm": 0.6961380243301392, + "learning_rate": 3.8322753090544607e-05, + "loss": 0.1748, + "num_input_tokens_seen": 16751648, + "step": 13765 + }, + { + "epoch": 1.533578349482125, + "grad_norm": 0.018167488276958466, + "learning_rate": 3.833667446263504e-05, + "loss": 0.1316, + "num_input_tokens_seen": 16757504, + "step": 13770 + }, + { + "epoch": 1.5341352043657421, + "grad_norm": 1.0993856191635132, + "learning_rate": 3.8350595834725475e-05, + "loss": 0.1616, + "num_input_tokens_seen": 16763488, + "step": 13775 + }, + { + "epoch": 1.5346920592493596, + "grad_norm": 0.16796612739562988, + "learning_rate": 3.83645172068159e-05, + "loss": 0.1358, + "num_input_tokens_seen": 16769408, + "step": 13780 + }, + { + "epoch": 1.535248914132977, + "grad_norm": 0.12083049863576889, + "learning_rate": 3.837843857890634e-05, + "loss": 0.1136, + "num_input_tokens_seen": 16774976, + "step": 13785 + }, + { + "epoch": 1.5358057690165943, + "grad_norm": 1.2012202739715576, + "learning_rate": 3.839235995099677e-05, + "loss": 0.0918, + "num_input_tokens_seen": 16781024, + "step": 13790 + }, + { + "epoch": 1.5363626239002115, + "grad_norm": 0.0691332072019577, + "learning_rate": 3.8406281323087206e-05, + "loss": 0.1793, + "num_input_tokens_seen": 16787040, + "step": 13795 + }, + { + "epoch": 1.536919478783829, + "grad_norm": 1.437679409980774, + "learning_rate": 3.842020269517764e-05, + "loss": 0.1951, + "num_input_tokens_seen": 16792448, + "step": 13800 + }, + { + "epoch": 1.5374763336674464, + "grad_norm": 0.28257468342781067, + "learning_rate": 3.8434124067268074e-05, + "loss": 0.1297, + "num_input_tokens_seen": 16798432, + "step": 13805 + }, + { + "epoch": 1.5380331885510636, + "grad_norm": 1.0632675886154175, + "learning_rate": 3.84480454393585e-05, + "loss": 0.2963, + "num_input_tokens_seen": 16804320, + "step": 13810 + }, + { + "epoch": 1.5385900434346809, + "grad_norm": 0.7117316722869873, + "learning_rate": 3.846196681144894e-05, + "loss": 0.1402, + "num_input_tokens_seen": 16809952, + "step": 13815 + }, + { + "epoch": 1.539146898318298, + "grad_norm": 0.24615557491779327, + "learning_rate": 3.847588818353937e-05, + "loss": 0.0351, + "num_input_tokens_seen": 16815648, + "step": 13820 + }, + { + "epoch": 1.5397037532019155, + "grad_norm": 0.008611378259956837, + "learning_rate": 3.8489809555629805e-05, + "loss": 0.0614, + "num_input_tokens_seen": 16822016, + "step": 13825 + }, + { + "epoch": 1.540260608085533, + "grad_norm": 1.4479156732559204, + "learning_rate": 3.850373092772024e-05, + "loss": 0.266, + "num_input_tokens_seen": 16827616, + "step": 13830 + }, + { + "epoch": 1.5408174629691502, + "grad_norm": 0.17471200227737427, + "learning_rate": 3.851765229981067e-05, + "loss": 0.2193, + "num_input_tokens_seen": 16833824, + "step": 13835 + }, + { + "epoch": 1.5413743178527675, + "grad_norm": 0.35743269324302673, + "learning_rate": 3.853157367190111e-05, + "loss": 0.0594, + "num_input_tokens_seen": 16839968, + "step": 13840 + }, + { + "epoch": 1.541931172736385, + "grad_norm": 1.305135726928711, + "learning_rate": 3.8545495043991535e-05, + "loss": 0.1098, + "num_input_tokens_seen": 16845920, + "step": 13845 + }, + { + "epoch": 1.5424880276200024, + "grad_norm": 0.3873080313205719, + "learning_rate": 3.855941641608197e-05, + "loss": 0.2133, + "num_input_tokens_seen": 16852352, + "step": 13850 + }, + { + "epoch": 1.5430448825036196, + "grad_norm": 0.5535135865211487, + "learning_rate": 3.8573337788172404e-05, + "loss": 0.2681, + "num_input_tokens_seen": 16858432, + "step": 13855 + }, + { + "epoch": 1.5436017373872368, + "grad_norm": 0.44325903058052063, + "learning_rate": 3.858725916026284e-05, + "loss": 0.0637, + "num_input_tokens_seen": 16864704, + "step": 13860 + }, + { + "epoch": 1.544158592270854, + "grad_norm": 0.06885535269975662, + "learning_rate": 3.860118053235327e-05, + "loss": 0.1622, + "num_input_tokens_seen": 16870464, + "step": 13865 + }, + { + "epoch": 1.5447154471544715, + "grad_norm": 0.8090590238571167, + "learning_rate": 3.86151019044437e-05, + "loss": 0.2375, + "num_input_tokens_seen": 16876864, + "step": 13870 + }, + { + "epoch": 1.545272302038089, + "grad_norm": 1.1834832429885864, + "learning_rate": 3.8629023276534135e-05, + "loss": 0.0855, + "num_input_tokens_seen": 16883104, + "step": 13875 + }, + { + "epoch": 1.5458291569217062, + "grad_norm": 0.8325049877166748, + "learning_rate": 3.864294464862457e-05, + "loss": 0.1248, + "num_input_tokens_seen": 16889152, + "step": 13880 + }, + { + "epoch": 1.5463860118053234, + "grad_norm": 0.7234463095664978, + "learning_rate": 3.8656866020715e-05, + "loss": 0.1077, + "num_input_tokens_seen": 16895040, + "step": 13885 + }, + { + "epoch": 1.5469428666889409, + "grad_norm": 0.40238407254219055, + "learning_rate": 3.867078739280544e-05, + "loss": 0.0717, + "num_input_tokens_seen": 16901440, + "step": 13890 + }, + { + "epoch": 1.5474997215725583, + "grad_norm": 0.8203144669532776, + "learning_rate": 3.868470876489587e-05, + "loss": 0.1507, + "num_input_tokens_seen": 16906816, + "step": 13895 + }, + { + "epoch": 1.5480565764561756, + "grad_norm": 2.8143680095672607, + "learning_rate": 3.86986301369863e-05, + "loss": 0.2116, + "num_input_tokens_seen": 16912800, + "step": 13900 + }, + { + "epoch": 1.5486134313397928, + "grad_norm": 1.2132173776626587, + "learning_rate": 3.871255150907674e-05, + "loss": 0.1535, + "num_input_tokens_seen": 16918912, + "step": 13905 + }, + { + "epoch": 1.54917028622341, + "grad_norm": 1.5531110763549805, + "learning_rate": 3.872647288116717e-05, + "loss": 0.0708, + "num_input_tokens_seen": 16924672, + "step": 13910 + }, + { + "epoch": 1.5497271411070275, + "grad_norm": 1.0690752267837524, + "learning_rate": 3.87403942532576e-05, + "loss": 0.0961, + "num_input_tokens_seen": 16930432, + "step": 13915 + }, + { + "epoch": 1.550283995990645, + "grad_norm": 0.35507214069366455, + "learning_rate": 3.875431562534804e-05, + "loss": 0.2394, + "num_input_tokens_seen": 16935232, + "step": 13920 + }, + { + "epoch": 1.5508408508742622, + "grad_norm": 0.5272219777107239, + "learning_rate": 3.8768236997438464e-05, + "loss": 0.0649, + "num_input_tokens_seen": 16941376, + "step": 13925 + }, + { + "epoch": 1.5513977057578794, + "grad_norm": 1.5337011814117432, + "learning_rate": 3.8782158369528906e-05, + "loss": 0.1292, + "num_input_tokens_seen": 16947584, + "step": 13930 + }, + { + "epoch": 1.5519545606414968, + "grad_norm": 1.9872794151306152, + "learning_rate": 3.879607974161933e-05, + "loss": 0.1962, + "num_input_tokens_seen": 16953856, + "step": 13935 + }, + { + "epoch": 1.5525114155251143, + "grad_norm": 0.7744536399841309, + "learning_rate": 3.881000111370977e-05, + "loss": 0.2863, + "num_input_tokens_seen": 16959776, + "step": 13940 + }, + { + "epoch": 1.5530682704087315, + "grad_norm": 0.3359231948852539, + "learning_rate": 3.88239224858002e-05, + "loss": 0.0601, + "num_input_tokens_seen": 16966016, + "step": 13945 + }, + { + "epoch": 1.5536251252923488, + "grad_norm": 0.10364490747451782, + "learning_rate": 3.8837843857890636e-05, + "loss": 0.1158, + "num_input_tokens_seen": 16971616, + "step": 13950 + }, + { + "epoch": 1.554181980175966, + "grad_norm": 0.7570679187774658, + "learning_rate": 3.885176522998107e-05, + "loss": 0.0891, + "num_input_tokens_seen": 16977888, + "step": 13955 + }, + { + "epoch": 1.5547388350595834, + "grad_norm": 0.1768026053905487, + "learning_rate": 3.88656866020715e-05, + "loss": 0.1337, + "num_input_tokens_seen": 16983968, + "step": 13960 + }, + { + "epoch": 1.5552956899432009, + "grad_norm": 0.651485800743103, + "learning_rate": 3.887960797416194e-05, + "loss": 0.1283, + "num_input_tokens_seen": 16990240, + "step": 13965 + }, + { + "epoch": 1.5558525448268181, + "grad_norm": 0.4252316355705261, + "learning_rate": 3.8893529346252367e-05, + "loss": 0.0747, + "num_input_tokens_seen": 16996352, + "step": 13970 + }, + { + "epoch": 1.5564093997104353, + "grad_norm": 1.1536190509796143, + "learning_rate": 3.89074507183428e-05, + "loss": 0.2019, + "num_input_tokens_seen": 17002272, + "step": 13975 + }, + { + "epoch": 1.5569662545940528, + "grad_norm": 0.2691521644592285, + "learning_rate": 3.8921372090433235e-05, + "loss": 0.1046, + "num_input_tokens_seen": 17008128, + "step": 13980 + }, + { + "epoch": 1.5575231094776703, + "grad_norm": 1.021052598953247, + "learning_rate": 3.893529346252367e-05, + "loss": 0.101, + "num_input_tokens_seen": 17014016, + "step": 13985 + }, + { + "epoch": 1.5580799643612875, + "grad_norm": 0.9615932106971741, + "learning_rate": 3.8949214834614104e-05, + "loss": 0.0893, + "num_input_tokens_seen": 17020352, + "step": 13990 + }, + { + "epoch": 1.5586368192449047, + "grad_norm": 0.1474171280860901, + "learning_rate": 3.896313620670454e-05, + "loss": 0.0837, + "num_input_tokens_seen": 17026688, + "step": 13995 + }, + { + "epoch": 1.559193674128522, + "grad_norm": 1.2098180055618286, + "learning_rate": 3.8977057578794966e-05, + "loss": 0.063, + "num_input_tokens_seen": 17033088, + "step": 14000 + }, + { + "epoch": 1.5597505290121394, + "grad_norm": 0.9785323143005371, + "learning_rate": 3.89909789508854e-05, + "loss": 0.1429, + "num_input_tokens_seen": 17038944, + "step": 14005 + }, + { + "epoch": 1.5603073838957568, + "grad_norm": 1.2391490936279297, + "learning_rate": 3.9004900322975834e-05, + "loss": 0.1948, + "num_input_tokens_seen": 17045376, + "step": 14010 + }, + { + "epoch": 1.560864238779374, + "grad_norm": 0.7506659030914307, + "learning_rate": 3.901882169506627e-05, + "loss": 0.1689, + "num_input_tokens_seen": 17051488, + "step": 14015 + }, + { + "epoch": 1.5614210936629913, + "grad_norm": 0.5527480840682983, + "learning_rate": 3.90327430671567e-05, + "loss": 0.0527, + "num_input_tokens_seen": 17057472, + "step": 14020 + }, + { + "epoch": 1.5619779485466088, + "grad_norm": 0.3366561233997345, + "learning_rate": 3.904666443924713e-05, + "loss": 0.1242, + "num_input_tokens_seen": 17063456, + "step": 14025 + }, + { + "epoch": 1.5625348034302262, + "grad_norm": 0.4639371931552887, + "learning_rate": 3.906058581133757e-05, + "loss": 0.0922, + "num_input_tokens_seen": 17069472, + "step": 14030 + }, + { + "epoch": 1.5630916583138434, + "grad_norm": 0.5714733600616455, + "learning_rate": 3.9074507183428e-05, + "loss": 0.0841, + "num_input_tokens_seen": 17075552, + "step": 14035 + }, + { + "epoch": 1.5636485131974607, + "grad_norm": 1.066046953201294, + "learning_rate": 3.9088428555518434e-05, + "loss": 0.1217, + "num_input_tokens_seen": 17081856, + "step": 14040 + }, + { + "epoch": 1.564205368081078, + "grad_norm": 1.405439019203186, + "learning_rate": 3.910234992760887e-05, + "loss": 0.2302, + "num_input_tokens_seen": 17087392, + "step": 14045 + }, + { + "epoch": 1.5647622229646954, + "grad_norm": 0.6418622732162476, + "learning_rate": 3.9116271299699296e-05, + "loss": 0.1281, + "num_input_tokens_seen": 17093696, + "step": 14050 + }, + { + "epoch": 1.5653190778483128, + "grad_norm": 0.008704051375389099, + "learning_rate": 3.913019267178974e-05, + "loss": 0.0302, + "num_input_tokens_seen": 17100160, + "step": 14055 + }, + { + "epoch": 1.56587593273193, + "grad_norm": 0.006182339042425156, + "learning_rate": 3.9144114043880164e-05, + "loss": 0.0846, + "num_input_tokens_seen": 17106432, + "step": 14060 + }, + { + "epoch": 1.5664327876155473, + "grad_norm": 0.9780097007751465, + "learning_rate": 3.91580354159706e-05, + "loss": 0.0956, + "num_input_tokens_seen": 17112512, + "step": 14065 + }, + { + "epoch": 1.5669896424991647, + "grad_norm": 0.5546268224716187, + "learning_rate": 3.917195678806103e-05, + "loss": 0.1413, + "num_input_tokens_seen": 17118720, + "step": 14070 + }, + { + "epoch": 1.5675464973827822, + "grad_norm": 0.031442053616046906, + "learning_rate": 3.918587816015147e-05, + "loss": 0.0753, + "num_input_tokens_seen": 17124896, + "step": 14075 + }, + { + "epoch": 1.5681033522663994, + "grad_norm": 0.8291966915130615, + "learning_rate": 3.91997995322419e-05, + "loss": 0.0818, + "num_input_tokens_seen": 17130816, + "step": 14080 + }, + { + "epoch": 1.5686602071500166, + "grad_norm": 0.7566413283348083, + "learning_rate": 3.9213720904332336e-05, + "loss": 0.1082, + "num_input_tokens_seen": 17136768, + "step": 14085 + }, + { + "epoch": 1.5692170620336339, + "grad_norm": 0.653221845626831, + "learning_rate": 3.922764227642276e-05, + "loss": 0.1528, + "num_input_tokens_seen": 17142272, + "step": 14090 + }, + { + "epoch": 1.5697739169172513, + "grad_norm": 0.37835800647735596, + "learning_rate": 3.9241563648513204e-05, + "loss": 0.061, + "num_input_tokens_seen": 17148256, + "step": 14095 + }, + { + "epoch": 1.5703307718008688, + "grad_norm": 0.3313100337982178, + "learning_rate": 3.925548502060363e-05, + "loss": 0.1513, + "num_input_tokens_seen": 17154336, + "step": 14100 + }, + { + "epoch": 1.570887626684486, + "grad_norm": 0.6364753842353821, + "learning_rate": 3.9269406392694066e-05, + "loss": 0.0593, + "num_input_tokens_seen": 17159840, + "step": 14105 + }, + { + "epoch": 1.5714444815681032, + "grad_norm": 0.5816864967346191, + "learning_rate": 3.92833277647845e-05, + "loss": 0.0584, + "num_input_tokens_seen": 17166048, + "step": 14110 + }, + { + "epoch": 1.5720013364517207, + "grad_norm": 0.03422582894563675, + "learning_rate": 3.929724913687493e-05, + "loss": 0.0951, + "num_input_tokens_seen": 17172064, + "step": 14115 + }, + { + "epoch": 1.5725581913353381, + "grad_norm": 1.2845102548599243, + "learning_rate": 3.931117050896537e-05, + "loss": 0.1045, + "num_input_tokens_seen": 17178368, + "step": 14120 + }, + { + "epoch": 1.5731150462189554, + "grad_norm": 0.9869353175163269, + "learning_rate": 3.93250918810558e-05, + "loss": 0.2129, + "num_input_tokens_seen": 17184672, + "step": 14125 + }, + { + "epoch": 1.5736719011025726, + "grad_norm": 0.728188157081604, + "learning_rate": 3.933901325314623e-05, + "loss": 0.0897, + "num_input_tokens_seen": 17190656, + "step": 14130 + }, + { + "epoch": 1.57422875598619, + "grad_norm": 0.08250271528959274, + "learning_rate": 3.9352934625236666e-05, + "loss": 0.1087, + "num_input_tokens_seen": 17196544, + "step": 14135 + }, + { + "epoch": 1.5747856108698073, + "grad_norm": 0.25673654675483704, + "learning_rate": 3.93668559973271e-05, + "loss": 0.038, + "num_input_tokens_seen": 17202368, + "step": 14140 + }, + { + "epoch": 1.5753424657534247, + "grad_norm": 0.25162947177886963, + "learning_rate": 3.9380777369417534e-05, + "loss": 0.0797, + "num_input_tokens_seen": 17208480, + "step": 14145 + }, + { + "epoch": 1.575899320637042, + "grad_norm": 0.1590648591518402, + "learning_rate": 3.939469874150796e-05, + "loss": 0.1076, + "num_input_tokens_seen": 17214752, + "step": 14150 + }, + { + "epoch": 1.5764561755206592, + "grad_norm": 0.1845017671585083, + "learning_rate": 3.9408620113598396e-05, + "loss": 0.2316, + "num_input_tokens_seen": 17220896, + "step": 14155 + }, + { + "epoch": 1.5770130304042767, + "grad_norm": 0.9835301041603088, + "learning_rate": 3.942254148568883e-05, + "loss": 0.0605, + "num_input_tokens_seen": 17227136, + "step": 14160 + }, + { + "epoch": 1.577569885287894, + "grad_norm": 0.5579898953437805, + "learning_rate": 3.9436462857779265e-05, + "loss": 0.0298, + "num_input_tokens_seen": 17233216, + "step": 14165 + }, + { + "epoch": 1.5781267401715113, + "grad_norm": 0.6889331340789795, + "learning_rate": 3.94503842298697e-05, + "loss": 0.0982, + "num_input_tokens_seen": 17239136, + "step": 14170 + }, + { + "epoch": 1.5786835950551286, + "grad_norm": 0.6617001295089722, + "learning_rate": 3.9464305601960133e-05, + "loss": 0.0754, + "num_input_tokens_seen": 17245344, + "step": 14175 + }, + { + "epoch": 1.579240449938746, + "grad_norm": 0.21655422449111938, + "learning_rate": 3.947822697405056e-05, + "loss": 0.1559, + "num_input_tokens_seen": 17251552, + "step": 14180 + }, + { + "epoch": 1.5797973048223632, + "grad_norm": 0.04270494729280472, + "learning_rate": 3.9492148346141e-05, + "loss": 0.0732, + "num_input_tokens_seen": 17257728, + "step": 14185 + }, + { + "epoch": 1.5803541597059807, + "grad_norm": 0.047078050673007965, + "learning_rate": 3.950606971823143e-05, + "loss": 0.124, + "num_input_tokens_seen": 17263904, + "step": 14190 + }, + { + "epoch": 1.580911014589598, + "grad_norm": 1.6516386270523071, + "learning_rate": 3.9519991090321864e-05, + "loss": 0.1811, + "num_input_tokens_seen": 17270272, + "step": 14195 + }, + { + "epoch": 1.5814678694732152, + "grad_norm": 1.0277130603790283, + "learning_rate": 3.95339124624123e-05, + "loss": 0.2233, + "num_input_tokens_seen": 17276576, + "step": 14200 + }, + { + "epoch": 1.5820247243568326, + "grad_norm": 0.11260882019996643, + "learning_rate": 3.9547833834502726e-05, + "loss": 0.075, + "num_input_tokens_seen": 17282400, + "step": 14205 + }, + { + "epoch": 1.58258157924045, + "grad_norm": 0.28576213121414185, + "learning_rate": 3.956175520659317e-05, + "loss": 0.1893, + "num_input_tokens_seen": 17288224, + "step": 14210 + }, + { + "epoch": 1.5831384341240673, + "grad_norm": 0.033586591482162476, + "learning_rate": 3.9575676578683594e-05, + "loss": 0.0954, + "num_input_tokens_seen": 17294176, + "step": 14215 + }, + { + "epoch": 1.5836952890076845, + "grad_norm": 0.6332893967628479, + "learning_rate": 3.9589597950774036e-05, + "loss": 0.1448, + "num_input_tokens_seen": 17300448, + "step": 14220 + }, + { + "epoch": 1.584252143891302, + "grad_norm": 0.05634545907378197, + "learning_rate": 3.960351932286446e-05, + "loss": 0.1665, + "num_input_tokens_seen": 17306880, + "step": 14225 + }, + { + "epoch": 1.5848089987749192, + "grad_norm": 0.561713457107544, + "learning_rate": 3.96174406949549e-05, + "loss": 0.1417, + "num_input_tokens_seen": 17313120, + "step": 14230 + }, + { + "epoch": 1.5853658536585367, + "grad_norm": 3.016400098800659, + "learning_rate": 3.963136206704533e-05, + "loss": 0.2116, + "num_input_tokens_seen": 17319232, + "step": 14235 + }, + { + "epoch": 1.585922708542154, + "grad_norm": 0.8873316049575806, + "learning_rate": 3.964528343913576e-05, + "loss": 0.1598, + "num_input_tokens_seen": 17324896, + "step": 14240 + }, + { + "epoch": 1.5864795634257711, + "grad_norm": 0.015564576722681522, + "learning_rate": 3.96592048112262e-05, + "loss": 0.1189, + "num_input_tokens_seen": 17330912, + "step": 14245 + }, + { + "epoch": 1.5870364183093886, + "grad_norm": 1.139634132385254, + "learning_rate": 3.967312618331663e-05, + "loss": 0.1073, + "num_input_tokens_seen": 17337024, + "step": 14250 + }, + { + "epoch": 1.587593273193006, + "grad_norm": 0.7032648921012878, + "learning_rate": 3.968704755540706e-05, + "loss": 0.1971, + "num_input_tokens_seen": 17343040, + "step": 14255 + }, + { + "epoch": 1.5881501280766233, + "grad_norm": 0.4262610077857971, + "learning_rate": 3.97009689274975e-05, + "loss": 0.2649, + "num_input_tokens_seen": 17349152, + "step": 14260 + }, + { + "epoch": 1.5887069829602405, + "grad_norm": 0.23523983359336853, + "learning_rate": 3.971489029958793e-05, + "loss": 0.1487, + "num_input_tokens_seen": 17355264, + "step": 14265 + }, + { + "epoch": 1.589263837843858, + "grad_norm": 0.010585237294435501, + "learning_rate": 3.9728811671678365e-05, + "loss": 0.0723, + "num_input_tokens_seen": 17361504, + "step": 14270 + }, + { + "epoch": 1.5898206927274752, + "grad_norm": 0.8496782779693604, + "learning_rate": 3.97427330437688e-05, + "loss": 0.084, + "num_input_tokens_seen": 17367456, + "step": 14275 + }, + { + "epoch": 1.5903775476110926, + "grad_norm": 0.7484897375106812, + "learning_rate": 3.975665441585923e-05, + "loss": 0.0902, + "num_input_tokens_seen": 17373408, + "step": 14280 + }, + { + "epoch": 1.5909344024947099, + "grad_norm": 1.8023449182510376, + "learning_rate": 3.977057578794966e-05, + "loss": 0.3071, + "num_input_tokens_seen": 17379392, + "step": 14285 + }, + { + "epoch": 1.591491257378327, + "grad_norm": 0.3241589069366455, + "learning_rate": 3.9784497160040096e-05, + "loss": 0.1549, + "num_input_tokens_seen": 17385344, + "step": 14290 + }, + { + "epoch": 1.5920481122619445, + "grad_norm": 0.007211899384856224, + "learning_rate": 3.979841853213053e-05, + "loss": 0.0819, + "num_input_tokens_seen": 17391392, + "step": 14295 + }, + { + "epoch": 1.592604967145562, + "grad_norm": 0.7024438977241516, + "learning_rate": 3.9812339904220965e-05, + "loss": 0.243, + "num_input_tokens_seen": 17397216, + "step": 14300 + }, + { + "epoch": 1.5931618220291792, + "grad_norm": 0.636789858341217, + "learning_rate": 3.982626127631139e-05, + "loss": 0.0427, + "num_input_tokens_seen": 17403488, + "step": 14305 + }, + { + "epoch": 1.5937186769127965, + "grad_norm": 0.04494200274348259, + "learning_rate": 3.984018264840183e-05, + "loss": 0.1087, + "num_input_tokens_seen": 17408960, + "step": 14310 + }, + { + "epoch": 1.594275531796414, + "grad_norm": 0.2034130096435547, + "learning_rate": 3.985410402049226e-05, + "loss": 0.2136, + "num_input_tokens_seen": 17415136, + "step": 14315 + }, + { + "epoch": 1.5948323866800311, + "grad_norm": 0.8537586331367493, + "learning_rate": 3.9868025392582695e-05, + "loss": 0.0908, + "num_input_tokens_seen": 17421696, + "step": 14320 + }, + { + "epoch": 1.5953892415636486, + "grad_norm": 0.4970850348472595, + "learning_rate": 3.988194676467313e-05, + "loss": 0.1268, + "num_input_tokens_seen": 17427840, + "step": 14325 + }, + { + "epoch": 1.5959460964472658, + "grad_norm": 0.3994119465351105, + "learning_rate": 3.989586813676356e-05, + "loss": 0.1487, + "num_input_tokens_seen": 17434016, + "step": 14330 + }, + { + "epoch": 1.596502951330883, + "grad_norm": 0.14322122931480408, + "learning_rate": 3.9909789508854e-05, + "loss": 0.1762, + "num_input_tokens_seen": 17439776, + "step": 14335 + }, + { + "epoch": 1.5970598062145005, + "grad_norm": 0.6771246194839478, + "learning_rate": 3.9923710880944426e-05, + "loss": 0.144, + "num_input_tokens_seen": 17445696, + "step": 14340 + }, + { + "epoch": 1.597616661098118, + "grad_norm": 0.047614142298698425, + "learning_rate": 3.993763225303486e-05, + "loss": 0.0653, + "num_input_tokens_seen": 17451744, + "step": 14345 + }, + { + "epoch": 1.5981735159817352, + "grad_norm": 0.16855767369270325, + "learning_rate": 3.9951553625125294e-05, + "loss": 0.0907, + "num_input_tokens_seen": 17457984, + "step": 14350 + }, + { + "epoch": 1.5987303708653524, + "grad_norm": 0.007414667401462793, + "learning_rate": 3.996547499721573e-05, + "loss": 0.1854, + "num_input_tokens_seen": 17464288, + "step": 14355 + }, + { + "epoch": 1.5992872257489699, + "grad_norm": 0.8061937093734741, + "learning_rate": 3.997939636930616e-05, + "loss": 0.1305, + "num_input_tokens_seen": 17470080, + "step": 14360 + }, + { + "epoch": 1.599844080632587, + "grad_norm": 0.2311103641986847, + "learning_rate": 3.99933177413966e-05, + "loss": 0.0444, + "num_input_tokens_seen": 17475744, + "step": 14365 + }, + { + "epoch": 1.6004009355162045, + "grad_norm": 0.6788085103034973, + "learning_rate": 4.0007239113487025e-05, + "loss": 0.0755, + "num_input_tokens_seen": 17482016, + "step": 14370 + }, + { + "epoch": 1.6009577903998218, + "grad_norm": 1.3079049587249756, + "learning_rate": 4.002116048557746e-05, + "loss": 0.0678, + "num_input_tokens_seen": 17488160, + "step": 14375 + }, + { + "epoch": 1.601514645283439, + "grad_norm": 1.111575722694397, + "learning_rate": 4.0035081857667893e-05, + "loss": 0.0682, + "num_input_tokens_seen": 17494304, + "step": 14380 + }, + { + "epoch": 1.6020715001670565, + "grad_norm": 1.5980154275894165, + "learning_rate": 4.004900322975833e-05, + "loss": 0.2969, + "num_input_tokens_seen": 17500320, + "step": 14385 + }, + { + "epoch": 1.602628355050674, + "grad_norm": 1.3470664024353027, + "learning_rate": 4.006292460184876e-05, + "loss": 0.138, + "num_input_tokens_seen": 17506176, + "step": 14390 + }, + { + "epoch": 1.6031852099342911, + "grad_norm": 1.20785391330719, + "learning_rate": 4.007684597393919e-05, + "loss": 0.1594, + "num_input_tokens_seen": 17512160, + "step": 14395 + }, + { + "epoch": 1.6037420648179084, + "grad_norm": 0.590015709400177, + "learning_rate": 4.009076734602963e-05, + "loss": 0.1006, + "num_input_tokens_seen": 17517824, + "step": 14400 + }, + { + "epoch": 1.6042989197015258, + "grad_norm": 0.583666980266571, + "learning_rate": 4.010468871812006e-05, + "loss": 0.053, + "num_input_tokens_seen": 17523936, + "step": 14405 + }, + { + "epoch": 1.604855774585143, + "grad_norm": 0.7751873731613159, + "learning_rate": 4.011861009021049e-05, + "loss": 0.1669, + "num_input_tokens_seen": 17529920, + "step": 14410 + }, + { + "epoch": 1.6054126294687605, + "grad_norm": 1.110662579536438, + "learning_rate": 4.013253146230093e-05, + "loss": 0.1657, + "num_input_tokens_seen": 17535968, + "step": 14415 + }, + { + "epoch": 1.6059694843523777, + "grad_norm": 1.5150913000106812, + "learning_rate": 4.0146452834391355e-05, + "loss": 0.1558, + "num_input_tokens_seen": 17541920, + "step": 14420 + }, + { + "epoch": 1.606526339235995, + "grad_norm": 0.9368552565574646, + "learning_rate": 4.0160374206481796e-05, + "loss": 0.0768, + "num_input_tokens_seen": 17548416, + "step": 14425 + }, + { + "epoch": 1.6070831941196124, + "grad_norm": 0.9988397359848022, + "learning_rate": 4.017429557857222e-05, + "loss": 0.0679, + "num_input_tokens_seen": 17554560, + "step": 14430 + }, + { + "epoch": 1.6076400490032299, + "grad_norm": 0.5831665396690369, + "learning_rate": 4.018821695066266e-05, + "loss": 0.091, + "num_input_tokens_seen": 17560832, + "step": 14435 + }, + { + "epoch": 1.608196903886847, + "grad_norm": 1.6138468980789185, + "learning_rate": 4.020213832275309e-05, + "loss": 0.1178, + "num_input_tokens_seen": 17566912, + "step": 14440 + }, + { + "epoch": 1.6087537587704643, + "grad_norm": 0.5070197582244873, + "learning_rate": 4.0216059694843526e-05, + "loss": 0.2842, + "num_input_tokens_seen": 17572672, + "step": 14445 + }, + { + "epoch": 1.6093106136540818, + "grad_norm": 0.3794766366481781, + "learning_rate": 4.022998106693396e-05, + "loss": 0.081, + "num_input_tokens_seen": 17578624, + "step": 14450 + }, + { + "epoch": 1.6098674685376992, + "grad_norm": 1.6809580326080322, + "learning_rate": 4.0243902439024395e-05, + "loss": 0.1531, + "num_input_tokens_seen": 17584224, + "step": 14455 + }, + { + "epoch": 1.6104243234213165, + "grad_norm": 1.5784908533096313, + "learning_rate": 4.025782381111482e-05, + "loss": 0.0822, + "num_input_tokens_seen": 17590112, + "step": 14460 + }, + { + "epoch": 1.6109811783049337, + "grad_norm": 1.468113660812378, + "learning_rate": 4.027174518320526e-05, + "loss": 0.1918, + "num_input_tokens_seen": 17596448, + "step": 14465 + }, + { + "epoch": 1.611538033188551, + "grad_norm": 0.658600389957428, + "learning_rate": 4.028566655529569e-05, + "loss": 0.0841, + "num_input_tokens_seen": 17602560, + "step": 14470 + }, + { + "epoch": 1.6120948880721684, + "grad_norm": 1.6186383962631226, + "learning_rate": 4.0299587927386125e-05, + "loss": 0.1892, + "num_input_tokens_seen": 17608608, + "step": 14475 + }, + { + "epoch": 1.6126517429557858, + "grad_norm": 0.9114622473716736, + "learning_rate": 4.031350929947656e-05, + "loss": 0.2308, + "num_input_tokens_seen": 17614528, + "step": 14480 + }, + { + "epoch": 1.613208597839403, + "grad_norm": 0.262681245803833, + "learning_rate": 4.032743067156699e-05, + "loss": 0.1421, + "num_input_tokens_seen": 17620640, + "step": 14485 + }, + { + "epoch": 1.6137654527230203, + "grad_norm": 1.1970860958099365, + "learning_rate": 4.034135204365743e-05, + "loss": 0.089, + "num_input_tokens_seen": 17626880, + "step": 14490 + }, + { + "epoch": 1.6143223076066378, + "grad_norm": 0.19595976173877716, + "learning_rate": 4.0355273415747856e-05, + "loss": 0.1038, + "num_input_tokens_seen": 17633216, + "step": 14495 + }, + { + "epoch": 1.6148791624902552, + "grad_norm": 2.0982093811035156, + "learning_rate": 4.03691947878383e-05, + "loss": 0.1386, + "num_input_tokens_seen": 17639584, + "step": 14500 + }, + { + "epoch": 1.6154360173738724, + "grad_norm": 0.22810332477092743, + "learning_rate": 4.0383116159928725e-05, + "loss": 0.0931, + "num_input_tokens_seen": 17645568, + "step": 14505 + }, + { + "epoch": 1.6159928722574897, + "grad_norm": 0.28093594312667847, + "learning_rate": 4.039703753201915e-05, + "loss": 0.1724, + "num_input_tokens_seen": 17652000, + "step": 14510 + }, + { + "epoch": 1.616549727141107, + "grad_norm": 0.6800674200057983, + "learning_rate": 4.041095890410959e-05, + "loss": 0.2115, + "num_input_tokens_seen": 17658048, + "step": 14515 + }, + { + "epoch": 1.6171065820247243, + "grad_norm": 0.12507446110248566, + "learning_rate": 4.042488027620002e-05, + "loss": 0.1483, + "num_input_tokens_seen": 17664096, + "step": 14520 + }, + { + "epoch": 1.6176634369083418, + "grad_norm": 0.43572235107421875, + "learning_rate": 4.043880164829046e-05, + "loss": 0.084, + "num_input_tokens_seen": 17670272, + "step": 14525 + }, + { + "epoch": 1.618220291791959, + "grad_norm": 0.006847120355814695, + "learning_rate": 4.045272302038089e-05, + "loss": 0.1675, + "num_input_tokens_seen": 17676352, + "step": 14530 + }, + { + "epoch": 1.6187771466755763, + "grad_norm": 0.002409314503893256, + "learning_rate": 4.0466644392471324e-05, + "loss": 0.1396, + "num_input_tokens_seen": 17682624, + "step": 14535 + }, + { + "epoch": 1.6193340015591937, + "grad_norm": 1.0506879091262817, + "learning_rate": 4.048056576456176e-05, + "loss": 0.0569, + "num_input_tokens_seen": 17688608, + "step": 14540 + }, + { + "epoch": 1.6198908564428112, + "grad_norm": 0.021264424547553062, + "learning_rate": 4.049448713665219e-05, + "loss": 0.1805, + "num_input_tokens_seen": 17694592, + "step": 14545 + }, + { + "epoch": 1.6204477113264284, + "grad_norm": 0.008931479416787624, + "learning_rate": 4.050840850874263e-05, + "loss": 0.0678, + "num_input_tokens_seen": 17700768, + "step": 14550 + }, + { + "epoch": 1.6210045662100456, + "grad_norm": 0.8659631609916687, + "learning_rate": 4.0522329880833054e-05, + "loss": 0.1572, + "num_input_tokens_seen": 17706496, + "step": 14555 + }, + { + "epoch": 1.6215614210936629, + "grad_norm": 0.011003655381500721, + "learning_rate": 4.053625125292349e-05, + "loss": 0.0344, + "num_input_tokens_seen": 17712768, + "step": 14560 + }, + { + "epoch": 1.6221182759772803, + "grad_norm": 0.4117024838924408, + "learning_rate": 4.055017262501392e-05, + "loss": 0.0661, + "num_input_tokens_seen": 17718560, + "step": 14565 + }, + { + "epoch": 1.6226751308608978, + "grad_norm": 1.6106338500976562, + "learning_rate": 4.056409399710436e-05, + "loss": 0.2853, + "num_input_tokens_seen": 17724928, + "step": 14570 + }, + { + "epoch": 1.623231985744515, + "grad_norm": 0.8675395846366882, + "learning_rate": 4.057801536919479e-05, + "loss": 0.1293, + "num_input_tokens_seen": 17731072, + "step": 14575 + }, + { + "epoch": 1.6237888406281322, + "grad_norm": 1.0503824949264526, + "learning_rate": 4.0591936741285226e-05, + "loss": 0.1127, + "num_input_tokens_seen": 17737120, + "step": 14580 + }, + { + "epoch": 1.6243456955117497, + "grad_norm": 0.5768168568611145, + "learning_rate": 4.0605858113375653e-05, + "loss": 0.0577, + "num_input_tokens_seen": 17743488, + "step": 14585 + }, + { + "epoch": 1.6249025503953671, + "grad_norm": 0.07507356256246567, + "learning_rate": 4.0619779485466095e-05, + "loss": 0.0373, + "num_input_tokens_seen": 17749376, + "step": 14590 + }, + { + "epoch": 1.6254594052789844, + "grad_norm": 0.032619111239910126, + "learning_rate": 4.063370085755652e-05, + "loss": 0.0561, + "num_input_tokens_seen": 17755776, + "step": 14595 + }, + { + "epoch": 1.6260162601626016, + "grad_norm": 1.416872501373291, + "learning_rate": 4.0647622229646956e-05, + "loss": 0.1071, + "num_input_tokens_seen": 17762272, + "step": 14600 + }, + { + "epoch": 1.6265731150462188, + "grad_norm": 1.1034858226776123, + "learning_rate": 4.066154360173739e-05, + "loss": 0.09, + "num_input_tokens_seen": 17768512, + "step": 14605 + }, + { + "epoch": 1.6271299699298363, + "grad_norm": 0.671726644039154, + "learning_rate": 4.067546497382782e-05, + "loss": 0.1448, + "num_input_tokens_seen": 17774784, + "step": 14610 + }, + { + "epoch": 1.6276868248134537, + "grad_norm": 1.1536269187927246, + "learning_rate": 4.068938634591826e-05, + "loss": 0.1548, + "num_input_tokens_seen": 17781056, + "step": 14615 + }, + { + "epoch": 1.628243679697071, + "grad_norm": 1.176841378211975, + "learning_rate": 4.070330771800869e-05, + "loss": 0.1127, + "num_input_tokens_seen": 17787296, + "step": 14620 + }, + { + "epoch": 1.6288005345806882, + "grad_norm": 0.8603613376617432, + "learning_rate": 4.071722909009912e-05, + "loss": 0.0972, + "num_input_tokens_seen": 17793376, + "step": 14625 + }, + { + "epoch": 1.6293573894643056, + "grad_norm": 0.9109642505645752, + "learning_rate": 4.0731150462189556e-05, + "loss": 0.2585, + "num_input_tokens_seen": 17799488, + "step": 14630 + }, + { + "epoch": 1.629914244347923, + "grad_norm": 1.0240594148635864, + "learning_rate": 4.074507183427999e-05, + "loss": 0.1862, + "num_input_tokens_seen": 17805376, + "step": 14635 + }, + { + "epoch": 1.6304710992315403, + "grad_norm": 1.3661681413650513, + "learning_rate": 4.0758993206370424e-05, + "loss": 0.2124, + "num_input_tokens_seen": 17811520, + "step": 14640 + }, + { + "epoch": 1.6310279541151576, + "grad_norm": 0.3137829899787903, + "learning_rate": 4.077291457846085e-05, + "loss": 0.117, + "num_input_tokens_seen": 17817792, + "step": 14645 + }, + { + "epoch": 1.6315848089987748, + "grad_norm": 1.0954780578613281, + "learning_rate": 4.0786835950551286e-05, + "loss": 0.1872, + "num_input_tokens_seen": 17823936, + "step": 14650 + }, + { + "epoch": 1.6321416638823922, + "grad_norm": 1.1658848524093628, + "learning_rate": 4.080075732264172e-05, + "loss": 0.1204, + "num_input_tokens_seen": 17830304, + "step": 14655 + }, + { + "epoch": 1.6326985187660097, + "grad_norm": 2.0721194744110107, + "learning_rate": 4.0814678694732155e-05, + "loss": 0.1461, + "num_input_tokens_seen": 17836416, + "step": 14660 + }, + { + "epoch": 1.633255373649627, + "grad_norm": 0.5713366866111755, + "learning_rate": 4.082860006682259e-05, + "loss": 0.0599, + "num_input_tokens_seen": 17842752, + "step": 14665 + }, + { + "epoch": 1.6338122285332441, + "grad_norm": 0.42525628209114075, + "learning_rate": 4.0842521438913024e-05, + "loss": 0.0985, + "num_input_tokens_seen": 17849056, + "step": 14670 + }, + { + "epoch": 1.6343690834168616, + "grad_norm": 0.19733144342899323, + "learning_rate": 4.085644281100345e-05, + "loss": 0.0271, + "num_input_tokens_seen": 17855296, + "step": 14675 + }, + { + "epoch": 1.634925938300479, + "grad_norm": 1.8430429697036743, + "learning_rate": 4.087036418309389e-05, + "loss": 0.2768, + "num_input_tokens_seen": 17860800, + "step": 14680 + }, + { + "epoch": 1.6354827931840963, + "grad_norm": 1.6219778060913086, + "learning_rate": 4.088428555518432e-05, + "loss": 0.1507, + "num_input_tokens_seen": 17867328, + "step": 14685 + }, + { + "epoch": 1.6360396480677135, + "grad_norm": 0.025320101529359818, + "learning_rate": 4.0898206927274754e-05, + "loss": 0.114, + "num_input_tokens_seen": 17873408, + "step": 14690 + }, + { + "epoch": 1.6365965029513307, + "grad_norm": 0.15648548305034637, + "learning_rate": 4.091212829936519e-05, + "loss": 0.1534, + "num_input_tokens_seen": 17879744, + "step": 14695 + }, + { + "epoch": 1.6371533578349482, + "grad_norm": 0.04540744051337242, + "learning_rate": 4.0926049671455616e-05, + "loss": 0.1368, + "num_input_tokens_seen": 17885600, + "step": 14700 + }, + { + "epoch": 1.6377102127185656, + "grad_norm": 0.41765904426574707, + "learning_rate": 4.093997104354606e-05, + "loss": 0.0318, + "num_input_tokens_seen": 17892064, + "step": 14705 + }, + { + "epoch": 1.6382670676021829, + "grad_norm": 2.0663399696350098, + "learning_rate": 4.0953892415636485e-05, + "loss": 0.3908, + "num_input_tokens_seen": 17898304, + "step": 14710 + }, + { + "epoch": 1.6388239224858, + "grad_norm": 0.6113240122795105, + "learning_rate": 4.096781378772692e-05, + "loss": 0.095, + "num_input_tokens_seen": 17904192, + "step": 14715 + }, + { + "epoch": 1.6393807773694176, + "grad_norm": 1.4883638620376587, + "learning_rate": 4.098173515981735e-05, + "loss": 0.138, + "num_input_tokens_seen": 17910368, + "step": 14720 + }, + { + "epoch": 1.639937632253035, + "grad_norm": 0.04263295978307724, + "learning_rate": 4.099565653190779e-05, + "loss": 0.1047, + "num_input_tokens_seen": 17916384, + "step": 14725 + }, + { + "epoch": 1.6404944871366522, + "grad_norm": 0.2693231701850891, + "learning_rate": 4.100957790399822e-05, + "loss": 0.1442, + "num_input_tokens_seen": 17922528, + "step": 14730 + }, + { + "epoch": 1.6410513420202695, + "grad_norm": 1.4016215801239014, + "learning_rate": 4.102349927608865e-05, + "loss": 0.1963, + "num_input_tokens_seen": 17928736, + "step": 14735 + }, + { + "epoch": 1.6416081969038867, + "grad_norm": 0.9182106852531433, + "learning_rate": 4.1037420648179084e-05, + "loss": 0.1964, + "num_input_tokens_seen": 17935040, + "step": 14740 + }, + { + "epoch": 1.6421650517875042, + "grad_norm": 0.3129471242427826, + "learning_rate": 4.105134202026952e-05, + "loss": 0.1192, + "num_input_tokens_seen": 17941248, + "step": 14745 + }, + { + "epoch": 1.6427219066711216, + "grad_norm": 0.027610115706920624, + "learning_rate": 4.106526339235995e-05, + "loss": 0.1489, + "num_input_tokens_seen": 17947584, + "step": 14750 + }, + { + "epoch": 1.6432787615547388, + "grad_norm": 0.15047504007816315, + "learning_rate": 4.107918476445039e-05, + "loss": 0.0626, + "num_input_tokens_seen": 17954016, + "step": 14755 + }, + { + "epoch": 1.643835616438356, + "grad_norm": 0.5781774520874023, + "learning_rate": 4.109310613654082e-05, + "loss": 0.1932, + "num_input_tokens_seen": 17959840, + "step": 14760 + }, + { + "epoch": 1.6443924713219735, + "grad_norm": 0.7096369862556458, + "learning_rate": 4.110702750863125e-05, + "loss": 0.1317, + "num_input_tokens_seen": 17965472, + "step": 14765 + }, + { + "epoch": 1.644949326205591, + "grad_norm": 1.8903495073318481, + "learning_rate": 4.112094888072169e-05, + "loss": 0.1366, + "num_input_tokens_seen": 17971200, + "step": 14770 + }, + { + "epoch": 1.6455061810892082, + "grad_norm": 1.5130910873413086, + "learning_rate": 4.113487025281212e-05, + "loss": 0.137, + "num_input_tokens_seen": 17977536, + "step": 14775 + }, + { + "epoch": 1.6460630359728254, + "grad_norm": 0.8400015830993652, + "learning_rate": 4.114879162490256e-05, + "loss": 0.1075, + "num_input_tokens_seen": 17982976, + "step": 14780 + }, + { + "epoch": 1.6466198908564427, + "grad_norm": 0.2168012410402298, + "learning_rate": 4.1162712996992986e-05, + "loss": 0.1987, + "num_input_tokens_seen": 17989088, + "step": 14785 + }, + { + "epoch": 1.6471767457400601, + "grad_norm": 0.5159151554107666, + "learning_rate": 4.1176634369083414e-05, + "loss": 0.1859, + "num_input_tokens_seen": 17995264, + "step": 14790 + }, + { + "epoch": 1.6477336006236776, + "grad_norm": 0.650995671749115, + "learning_rate": 4.1190555741173855e-05, + "loss": 0.1266, + "num_input_tokens_seen": 18001248, + "step": 14795 + }, + { + "epoch": 1.6482904555072948, + "grad_norm": 0.583251416683197, + "learning_rate": 4.120447711326428e-05, + "loss": 0.0938, + "num_input_tokens_seen": 18007424, + "step": 14800 + }, + { + "epoch": 1.648847310390912, + "grad_norm": 0.1300990730524063, + "learning_rate": 4.121839848535472e-05, + "loss": 0.1673, + "num_input_tokens_seen": 18013504, + "step": 14805 + }, + { + "epoch": 1.6494041652745295, + "grad_norm": 0.1829552799463272, + "learning_rate": 4.123231985744515e-05, + "loss": 0.0404, + "num_input_tokens_seen": 18019584, + "step": 14810 + }, + { + "epoch": 1.649961020158147, + "grad_norm": 0.8270105719566345, + "learning_rate": 4.1246241229535585e-05, + "loss": 0.1599, + "num_input_tokens_seen": 18025632, + "step": 14815 + }, + { + "epoch": 1.6505178750417642, + "grad_norm": 0.33513343334198, + "learning_rate": 4.126016260162602e-05, + "loss": 0.0718, + "num_input_tokens_seen": 18031648, + "step": 14820 + }, + { + "epoch": 1.6510747299253814, + "grad_norm": 0.4289020895957947, + "learning_rate": 4.1274083973716454e-05, + "loss": 0.1602, + "num_input_tokens_seen": 18037888, + "step": 14825 + }, + { + "epoch": 1.6516315848089986, + "grad_norm": 0.23947729170322418, + "learning_rate": 4.128800534580689e-05, + "loss": 0.1777, + "num_input_tokens_seen": 18044064, + "step": 14830 + }, + { + "epoch": 1.652188439692616, + "grad_norm": 1.3410848379135132, + "learning_rate": 4.1301926717897316e-05, + "loss": 0.266, + "num_input_tokens_seen": 18049984, + "step": 14835 + }, + { + "epoch": 1.6527452945762335, + "grad_norm": 0.53839111328125, + "learning_rate": 4.131584808998775e-05, + "loss": 0.1118, + "num_input_tokens_seen": 18056416, + "step": 14840 + }, + { + "epoch": 1.6533021494598508, + "grad_norm": 0.3907468616962433, + "learning_rate": 4.1329769462078184e-05, + "loss": 0.0965, + "num_input_tokens_seen": 18063008, + "step": 14845 + }, + { + "epoch": 1.653859004343468, + "grad_norm": 0.30120986700057983, + "learning_rate": 4.134369083416862e-05, + "loss": 0.1159, + "num_input_tokens_seen": 18069024, + "step": 14850 + }, + { + "epoch": 1.6544158592270855, + "grad_norm": 0.27600541710853577, + "learning_rate": 4.135761220625905e-05, + "loss": 0.0955, + "num_input_tokens_seen": 18075200, + "step": 14855 + }, + { + "epoch": 1.654972714110703, + "grad_norm": 0.7617404460906982, + "learning_rate": 4.137153357834949e-05, + "loss": 0.1585, + "num_input_tokens_seen": 18081440, + "step": 14860 + }, + { + "epoch": 1.6555295689943201, + "grad_norm": 1.7104580402374268, + "learning_rate": 4.1385454950439915e-05, + "loss": 0.1165, + "num_input_tokens_seen": 18087776, + "step": 14865 + }, + { + "epoch": 1.6560864238779374, + "grad_norm": 2.3173844814300537, + "learning_rate": 4.1399376322530356e-05, + "loss": 0.1817, + "num_input_tokens_seen": 18093952, + "step": 14870 + }, + { + "epoch": 1.6566432787615546, + "grad_norm": 0.35606637597084045, + "learning_rate": 4.1413297694620784e-05, + "loss": 0.1465, + "num_input_tokens_seen": 18099904, + "step": 14875 + }, + { + "epoch": 1.657200133645172, + "grad_norm": 0.13781201839447021, + "learning_rate": 4.142721906671122e-05, + "loss": 0.1583, + "num_input_tokens_seen": 18106336, + "step": 14880 + }, + { + "epoch": 1.6577569885287895, + "grad_norm": 0.6811777353286743, + "learning_rate": 4.144114043880165e-05, + "loss": 0.1439, + "num_input_tokens_seen": 18112224, + "step": 14885 + }, + { + "epoch": 1.6583138434124067, + "grad_norm": 0.9902075529098511, + "learning_rate": 4.145506181089208e-05, + "loss": 0.1113, + "num_input_tokens_seen": 18118528, + "step": 14890 + }, + { + "epoch": 1.658870698296024, + "grad_norm": 0.4544312357902527, + "learning_rate": 4.146898318298252e-05, + "loss": 0.1471, + "num_input_tokens_seen": 18124224, + "step": 14895 + }, + { + "epoch": 1.6594275531796414, + "grad_norm": 0.31990501284599304, + "learning_rate": 4.148290455507295e-05, + "loss": 0.0815, + "num_input_tokens_seen": 18130656, + "step": 14900 + }, + { + "epoch": 1.6599844080632589, + "grad_norm": 0.6775155067443848, + "learning_rate": 4.149682592716338e-05, + "loss": 0.1008, + "num_input_tokens_seen": 18136672, + "step": 14905 + }, + { + "epoch": 1.660541262946876, + "grad_norm": 1.581993818283081, + "learning_rate": 4.151074729925382e-05, + "loss": 0.1921, + "num_input_tokens_seen": 18142432, + "step": 14910 + }, + { + "epoch": 1.6610981178304933, + "grad_norm": 1.247249960899353, + "learning_rate": 4.152466867134425e-05, + "loss": 0.1775, + "num_input_tokens_seen": 18148832, + "step": 14915 + }, + { + "epoch": 1.6616549727141106, + "grad_norm": 0.6161497831344604, + "learning_rate": 4.1538590043434686e-05, + "loss": 0.0575, + "num_input_tokens_seen": 18155040, + "step": 14920 + }, + { + "epoch": 1.662211827597728, + "grad_norm": 1.1322400569915771, + "learning_rate": 4.155251141552511e-05, + "loss": 0.1414, + "num_input_tokens_seen": 18161344, + "step": 14925 + }, + { + "epoch": 1.6627686824813455, + "grad_norm": 0.7902520298957825, + "learning_rate": 4.156643278761555e-05, + "loss": 0.1732, + "num_input_tokens_seen": 18167488, + "step": 14930 + }, + { + "epoch": 1.6633255373649627, + "grad_norm": 2.415343999862671, + "learning_rate": 4.158035415970598e-05, + "loss": 0.2346, + "num_input_tokens_seen": 18173216, + "step": 14935 + }, + { + "epoch": 1.66388239224858, + "grad_norm": 0.359758198261261, + "learning_rate": 4.1594275531796416e-05, + "loss": 0.0544, + "num_input_tokens_seen": 18179296, + "step": 14940 + }, + { + "epoch": 1.6644392471321974, + "grad_norm": 0.10650750249624252, + "learning_rate": 4.160819690388685e-05, + "loss": 0.1325, + "num_input_tokens_seen": 18185632, + "step": 14945 + }, + { + "epoch": 1.6649961020158148, + "grad_norm": 1.0719830989837646, + "learning_rate": 4.1622118275977285e-05, + "loss": 0.1761, + "num_input_tokens_seen": 18191008, + "step": 14950 + }, + { + "epoch": 1.665552956899432, + "grad_norm": 0.1905529648065567, + "learning_rate": 4.163603964806771e-05, + "loss": 0.0756, + "num_input_tokens_seen": 18197248, + "step": 14955 + }, + { + "epoch": 1.6661098117830493, + "grad_norm": 1.918051838874817, + "learning_rate": 4.1649961020158154e-05, + "loss": 0.0849, + "num_input_tokens_seen": 18203104, + "step": 14960 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.573275089263916, + "learning_rate": 4.166388239224858e-05, + "loss": 0.1739, + "num_input_tokens_seen": 18209312, + "step": 14965 + }, + { + "epoch": 1.667223521550284, + "grad_norm": 0.2733665704727173, + "learning_rate": 4.1677803764339015e-05, + "loss": 0.1923, + "num_input_tokens_seen": 18215232, + "step": 14970 + }, + { + "epoch": 1.6677803764339014, + "grad_norm": 0.9426243305206299, + "learning_rate": 4.169172513642945e-05, + "loss": 0.147, + "num_input_tokens_seen": 18221792, + "step": 14975 + }, + { + "epoch": 1.6683372313175187, + "grad_norm": 0.03850453346967697, + "learning_rate": 4.170564650851988e-05, + "loss": 0.0474, + "num_input_tokens_seen": 18228000, + "step": 14980 + }, + { + "epoch": 1.6688940862011359, + "grad_norm": 1.06267249584198, + "learning_rate": 4.171956788061032e-05, + "loss": 0.0888, + "num_input_tokens_seen": 18233984, + "step": 14985 + }, + { + "epoch": 1.6694509410847533, + "grad_norm": 1.2492194175720215, + "learning_rate": 4.1733489252700746e-05, + "loss": 0.0684, + "num_input_tokens_seen": 18240128, + "step": 14990 + }, + { + "epoch": 1.6700077959683708, + "grad_norm": 0.04165786877274513, + "learning_rate": 4.174741062479118e-05, + "loss": 0.0798, + "num_input_tokens_seen": 18246336, + "step": 14995 + }, + { + "epoch": 1.670564650851988, + "grad_norm": 0.3495279848575592, + "learning_rate": 4.1761331996881615e-05, + "loss": 0.0633, + "num_input_tokens_seen": 18252480, + "step": 15000 + }, + { + "epoch": 1.6711215057356053, + "grad_norm": 1.0910577774047852, + "learning_rate": 4.177525336897205e-05, + "loss": 0.1418, + "num_input_tokens_seen": 18258720, + "step": 15005 + }, + { + "epoch": 1.6716783606192225, + "grad_norm": 0.5516780614852905, + "learning_rate": 4.178917474106248e-05, + "loss": 0.105, + "num_input_tokens_seen": 18264640, + "step": 15010 + }, + { + "epoch": 1.67223521550284, + "grad_norm": 0.013357928022742271, + "learning_rate": 4.180309611315291e-05, + "loss": 0.0846, + "num_input_tokens_seen": 18270880, + "step": 15015 + }, + { + "epoch": 1.6727920703864574, + "grad_norm": 0.13282977044582367, + "learning_rate": 4.1817017485243345e-05, + "loss": 0.0924, + "num_input_tokens_seen": 18277024, + "step": 15020 + }, + { + "epoch": 1.6733489252700746, + "grad_norm": 0.6392345428466797, + "learning_rate": 4.183093885733378e-05, + "loss": 0.1494, + "num_input_tokens_seen": 18282592, + "step": 15025 + }, + { + "epoch": 1.6739057801536918, + "grad_norm": 0.24457283318042755, + "learning_rate": 4.1844860229424214e-05, + "loss": 0.1643, + "num_input_tokens_seen": 18288448, + "step": 15030 + }, + { + "epoch": 1.6744626350373093, + "grad_norm": 0.0466226264834404, + "learning_rate": 4.185878160151465e-05, + "loss": 0.0737, + "num_input_tokens_seen": 18294752, + "step": 15035 + }, + { + "epoch": 1.6750194899209268, + "grad_norm": 1.474968433380127, + "learning_rate": 4.187270297360508e-05, + "loss": 0.204, + "num_input_tokens_seen": 18300416, + "step": 15040 + }, + { + "epoch": 1.675576344804544, + "grad_norm": 0.01835002563893795, + "learning_rate": 4.188662434569551e-05, + "loss": 0.0849, + "num_input_tokens_seen": 18306816, + "step": 15045 + }, + { + "epoch": 1.6761331996881612, + "grad_norm": 1.0022914409637451, + "learning_rate": 4.190054571778595e-05, + "loss": 0.1886, + "num_input_tokens_seen": 18312576, + "step": 15050 + }, + { + "epoch": 1.6766900545717784, + "grad_norm": 0.6365463733673096, + "learning_rate": 4.191446708987638e-05, + "loss": 0.1047, + "num_input_tokens_seen": 18318496, + "step": 15055 + }, + { + "epoch": 1.677246909455396, + "grad_norm": 0.05098551884293556, + "learning_rate": 4.192838846196681e-05, + "loss": 0.1951, + "num_input_tokens_seen": 18323968, + "step": 15060 + }, + { + "epoch": 1.6778037643390133, + "grad_norm": 0.977553129196167, + "learning_rate": 4.194230983405725e-05, + "loss": 0.0759, + "num_input_tokens_seen": 18329856, + "step": 15065 + }, + { + "epoch": 1.6783606192226306, + "grad_norm": 0.7346006631851196, + "learning_rate": 4.1956231206147675e-05, + "loss": 0.1747, + "num_input_tokens_seen": 18335872, + "step": 15070 + }, + { + "epoch": 1.6789174741062478, + "grad_norm": 1.4183323383331299, + "learning_rate": 4.1970152578238116e-05, + "loss": 0.1667, + "num_input_tokens_seen": 18341920, + "step": 15075 + }, + { + "epoch": 1.6794743289898653, + "grad_norm": 0.4519192576408386, + "learning_rate": 4.1984073950328544e-05, + "loss": 0.1379, + "num_input_tokens_seen": 18347872, + "step": 15080 + }, + { + "epoch": 1.6800311838734827, + "grad_norm": 0.11251924932003021, + "learning_rate": 4.1997995322418985e-05, + "loss": 0.0177, + "num_input_tokens_seen": 18354144, + "step": 15085 + }, + { + "epoch": 1.6805880387571, + "grad_norm": 1.251470685005188, + "learning_rate": 4.201191669450941e-05, + "loss": 0.1357, + "num_input_tokens_seen": 18360512, + "step": 15090 + }, + { + "epoch": 1.6811448936407172, + "grad_norm": 0.6655911803245544, + "learning_rate": 4.2025838066599847e-05, + "loss": 0.1431, + "num_input_tokens_seen": 18366848, + "step": 15095 + }, + { + "epoch": 1.6817017485243344, + "grad_norm": 0.4792628586292267, + "learning_rate": 4.203975943869028e-05, + "loss": 0.0917, + "num_input_tokens_seen": 18372928, + "step": 15100 + }, + { + "epoch": 1.6822586034079519, + "grad_norm": 0.8877645134925842, + "learning_rate": 4.205368081078071e-05, + "loss": 0.1262, + "num_input_tokens_seen": 18378848, + "step": 15105 + }, + { + "epoch": 1.6828154582915693, + "grad_norm": 0.59061598777771, + "learning_rate": 4.206760218287115e-05, + "loss": 0.0667, + "num_input_tokens_seen": 18385312, + "step": 15110 + }, + { + "epoch": 1.6833723131751865, + "grad_norm": 0.4729306399822235, + "learning_rate": 4.208152355496158e-05, + "loss": 0.1282, + "num_input_tokens_seen": 18391424, + "step": 15115 + }, + { + "epoch": 1.6839291680588038, + "grad_norm": 0.6815294027328491, + "learning_rate": 4.209544492705201e-05, + "loss": 0.1085, + "num_input_tokens_seen": 18397536, + "step": 15120 + }, + { + "epoch": 1.6844860229424212, + "grad_norm": 0.4506398141384125, + "learning_rate": 4.2109366299142446e-05, + "loss": 0.1836, + "num_input_tokens_seen": 18403840, + "step": 15125 + }, + { + "epoch": 1.6850428778260387, + "grad_norm": 0.2570110261440277, + "learning_rate": 4.212328767123288e-05, + "loss": 0.0801, + "num_input_tokens_seen": 18409920, + "step": 15130 + }, + { + "epoch": 1.685599732709656, + "grad_norm": 0.15626728534698486, + "learning_rate": 4.2137209043323314e-05, + "loss": 0.133, + "num_input_tokens_seen": 18416256, + "step": 15135 + }, + { + "epoch": 1.6861565875932731, + "grad_norm": 0.6782206296920776, + "learning_rate": 4.215113041541375e-05, + "loss": 0.0562, + "num_input_tokens_seen": 18422368, + "step": 15140 + }, + { + "epoch": 1.6867134424768904, + "grad_norm": 0.8732416033744812, + "learning_rate": 4.2165051787504176e-05, + "loss": 0.143, + "num_input_tokens_seen": 18428512, + "step": 15145 + }, + { + "epoch": 1.6872702973605078, + "grad_norm": 1.2391138076782227, + "learning_rate": 4.217897315959461e-05, + "loss": 0.1199, + "num_input_tokens_seen": 18434208, + "step": 15150 + }, + { + "epoch": 1.6878271522441253, + "grad_norm": 1.437292456626892, + "learning_rate": 4.2192894531685045e-05, + "loss": 0.0695, + "num_input_tokens_seen": 18440160, + "step": 15155 + }, + { + "epoch": 1.6883840071277425, + "grad_norm": 0.08121297508478165, + "learning_rate": 4.220681590377548e-05, + "loss": 0.1086, + "num_input_tokens_seen": 18446400, + "step": 15160 + }, + { + "epoch": 1.6889408620113597, + "grad_norm": 0.016485653817653656, + "learning_rate": 4.2220737275865914e-05, + "loss": 0.0432, + "num_input_tokens_seen": 18452288, + "step": 15165 + }, + { + "epoch": 1.6894977168949772, + "grad_norm": 0.9151877164840698, + "learning_rate": 4.223465864795634e-05, + "loss": 0.1155, + "num_input_tokens_seen": 18458560, + "step": 15170 + }, + { + "epoch": 1.6900545717785946, + "grad_norm": 0.0906132385134697, + "learning_rate": 4.224858002004678e-05, + "loss": 0.2141, + "num_input_tokens_seen": 18464640, + "step": 15175 + }, + { + "epoch": 1.6906114266622119, + "grad_norm": 0.8785070776939392, + "learning_rate": 4.226250139213721e-05, + "loss": 0.2594, + "num_input_tokens_seen": 18470240, + "step": 15180 + }, + { + "epoch": 1.691168281545829, + "grad_norm": 1.9308550357818604, + "learning_rate": 4.2276422764227644e-05, + "loss": 0.2105, + "num_input_tokens_seen": 18476544, + "step": 15185 + }, + { + "epoch": 1.6917251364294463, + "grad_norm": 0.1490689218044281, + "learning_rate": 4.229034413631808e-05, + "loss": 0.1596, + "num_input_tokens_seen": 18482560, + "step": 15190 + }, + { + "epoch": 1.6922819913130638, + "grad_norm": 0.7088754773139954, + "learning_rate": 4.2304265508408506e-05, + "loss": 0.1531, + "num_input_tokens_seen": 18488800, + "step": 15195 + }, + { + "epoch": 1.6928388461966812, + "grad_norm": 1.5265387296676636, + "learning_rate": 4.231818688049895e-05, + "loss": 0.1426, + "num_input_tokens_seen": 18494848, + "step": 15200 + }, + { + "epoch": 1.6933957010802985, + "grad_norm": 0.06767202913761139, + "learning_rate": 4.2332108252589375e-05, + "loss": 0.1749, + "num_input_tokens_seen": 18500832, + "step": 15205 + }, + { + "epoch": 1.6939525559639157, + "grad_norm": 0.7046456933021545, + "learning_rate": 4.234602962467981e-05, + "loss": 0.0981, + "num_input_tokens_seen": 18506848, + "step": 15210 + }, + { + "epoch": 1.6945094108475331, + "grad_norm": 0.23281951248645782, + "learning_rate": 4.235995099677024e-05, + "loss": 0.0631, + "num_input_tokens_seen": 18512768, + "step": 15215 + }, + { + "epoch": 1.6950662657311506, + "grad_norm": 0.8559421896934509, + "learning_rate": 4.237387236886068e-05, + "loss": 0.0909, + "num_input_tokens_seen": 18518944, + "step": 15220 + }, + { + "epoch": 1.6956231206147678, + "grad_norm": 0.5759780406951904, + "learning_rate": 4.238779374095111e-05, + "loss": 0.186, + "num_input_tokens_seen": 18524896, + "step": 15225 + }, + { + "epoch": 1.696179975498385, + "grad_norm": 1.7024402618408203, + "learning_rate": 4.2401715113041546e-05, + "loss": 0.1908, + "num_input_tokens_seen": 18531104, + "step": 15230 + }, + { + "epoch": 1.6967368303820023, + "grad_norm": 1.3864867687225342, + "learning_rate": 4.2415636485131974e-05, + "loss": 0.1449, + "num_input_tokens_seen": 18537440, + "step": 15235 + }, + { + "epoch": 1.6972936852656197, + "grad_norm": 0.39231690764427185, + "learning_rate": 4.242955785722241e-05, + "loss": 0.1685, + "num_input_tokens_seen": 18543392, + "step": 15240 + }, + { + "epoch": 1.6978505401492372, + "grad_norm": 1.86426842212677, + "learning_rate": 4.244347922931284e-05, + "loss": 0.2234, + "num_input_tokens_seen": 18549568, + "step": 15245 + }, + { + "epoch": 1.6984073950328544, + "grad_norm": 0.9825167059898376, + "learning_rate": 4.245740060140328e-05, + "loss": 0.2216, + "num_input_tokens_seen": 18555520, + "step": 15250 + }, + { + "epoch": 1.6989642499164717, + "grad_norm": 1.6709777116775513, + "learning_rate": 4.247132197349371e-05, + "loss": 0.1965, + "num_input_tokens_seen": 18561824, + "step": 15255 + }, + { + "epoch": 1.699521104800089, + "grad_norm": 0.02777111530303955, + "learning_rate": 4.248524334558414e-05, + "loss": 0.1027, + "num_input_tokens_seen": 18568064, + "step": 15260 + }, + { + "epoch": 1.7000779596837066, + "grad_norm": 0.30821484327316284, + "learning_rate": 4.249916471767458e-05, + "loss": 0.1363, + "num_input_tokens_seen": 18574368, + "step": 15265 + }, + { + "epoch": 1.7006348145673238, + "grad_norm": 0.9706100821495056, + "learning_rate": 4.251308608976501e-05, + "loss": 0.0649, + "num_input_tokens_seen": 18580576, + "step": 15270 + }, + { + "epoch": 1.701191669450941, + "grad_norm": 0.6120163202285767, + "learning_rate": 4.252700746185544e-05, + "loss": 0.1591, + "num_input_tokens_seen": 18586464, + "step": 15275 + }, + { + "epoch": 1.7017485243345583, + "grad_norm": 0.38487938046455383, + "learning_rate": 4.2540928833945876e-05, + "loss": 0.135, + "num_input_tokens_seen": 18592224, + "step": 15280 + }, + { + "epoch": 1.7023053792181757, + "grad_norm": 0.9588046669960022, + "learning_rate": 4.2554850206036304e-05, + "loss": 0.1355, + "num_input_tokens_seen": 18598400, + "step": 15285 + }, + { + "epoch": 1.7028622341017932, + "grad_norm": 0.36208710074424744, + "learning_rate": 4.2568771578126745e-05, + "loss": 0.0554, + "num_input_tokens_seen": 18604896, + "step": 15290 + }, + { + "epoch": 1.7034190889854104, + "grad_norm": 0.011891092173755169, + "learning_rate": 4.258269295021717e-05, + "loss": 0.0941, + "num_input_tokens_seen": 18611104, + "step": 15295 + }, + { + "epoch": 1.7039759438690276, + "grad_norm": 0.5595443844795227, + "learning_rate": 4.2596614322307607e-05, + "loss": 0.1645, + "num_input_tokens_seen": 18617056, + "step": 15300 + }, + { + "epoch": 1.704532798752645, + "grad_norm": 0.6979488134384155, + "learning_rate": 4.261053569439804e-05, + "loss": 0.1275, + "num_input_tokens_seen": 18623360, + "step": 15305 + }, + { + "epoch": 1.7050896536362625, + "grad_norm": 1.985844612121582, + "learning_rate": 4.2624457066488475e-05, + "loss": 0.0754, + "num_input_tokens_seen": 18629312, + "step": 15310 + }, + { + "epoch": 1.7056465085198798, + "grad_norm": 0.39780986309051514, + "learning_rate": 4.263837843857891e-05, + "loss": 0.1039, + "num_input_tokens_seen": 18635456, + "step": 15315 + }, + { + "epoch": 1.706203363403497, + "grad_norm": 0.5134301781654358, + "learning_rate": 4.2652299810669344e-05, + "loss": 0.0452, + "num_input_tokens_seen": 18641536, + "step": 15320 + }, + { + "epoch": 1.7067602182871142, + "grad_norm": 0.6299194097518921, + "learning_rate": 4.266622118275977e-05, + "loss": 0.1096, + "num_input_tokens_seen": 18647904, + "step": 15325 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 0.4550389349460602, + "learning_rate": 4.2680142554850206e-05, + "loss": 0.1079, + "num_input_tokens_seen": 18654048, + "step": 15330 + }, + { + "epoch": 1.7078739280543491, + "grad_norm": 0.22904743254184723, + "learning_rate": 4.269406392694064e-05, + "loss": 0.1448, + "num_input_tokens_seen": 18660160, + "step": 15335 + }, + { + "epoch": 1.7084307829379664, + "grad_norm": 1.3670819997787476, + "learning_rate": 4.2707985299031074e-05, + "loss": 0.1939, + "num_input_tokens_seen": 18666336, + "step": 15340 + }, + { + "epoch": 1.7089876378215836, + "grad_norm": 0.515259861946106, + "learning_rate": 4.272190667112151e-05, + "loss": 0.2424, + "num_input_tokens_seen": 18672384, + "step": 15345 + }, + { + "epoch": 1.709544492705201, + "grad_norm": 0.18137945234775543, + "learning_rate": 4.2735828043211936e-05, + "loss": 0.0868, + "num_input_tokens_seen": 18678496, + "step": 15350 + }, + { + "epoch": 1.7101013475888185, + "grad_norm": 1.0485974550247192, + "learning_rate": 4.274974941530238e-05, + "loss": 0.2006, + "num_input_tokens_seen": 18684352, + "step": 15355 + }, + { + "epoch": 1.7106582024724357, + "grad_norm": 1.0111284255981445, + "learning_rate": 4.2763670787392805e-05, + "loss": 0.3692, + "num_input_tokens_seen": 18689920, + "step": 15360 + }, + { + "epoch": 1.711215057356053, + "grad_norm": 0.08757123351097107, + "learning_rate": 4.2777592159483246e-05, + "loss": 0.0801, + "num_input_tokens_seen": 18696192, + "step": 15365 + }, + { + "epoch": 1.7117719122396702, + "grad_norm": 0.6932209730148315, + "learning_rate": 4.2791513531573674e-05, + "loss": 0.0554, + "num_input_tokens_seen": 18702400, + "step": 15370 + }, + { + "epoch": 1.7123287671232876, + "grad_norm": 0.6001473665237427, + "learning_rate": 4.28054349036641e-05, + "loss": 0.0848, + "num_input_tokens_seen": 18708672, + "step": 15375 + }, + { + "epoch": 1.712885622006905, + "grad_norm": 0.4330282211303711, + "learning_rate": 4.281935627575454e-05, + "loss": 0.0627, + "num_input_tokens_seen": 18714880, + "step": 15380 + }, + { + "epoch": 1.7134424768905223, + "grad_norm": 0.16287630796432495, + "learning_rate": 4.283327764784497e-05, + "loss": 0.0446, + "num_input_tokens_seen": 18720992, + "step": 15385 + }, + { + "epoch": 1.7139993317741395, + "grad_norm": 0.5592854022979736, + "learning_rate": 4.284719901993541e-05, + "loss": 0.2059, + "num_input_tokens_seen": 18727200, + "step": 15390 + }, + { + "epoch": 1.714556186657757, + "grad_norm": 0.11598757654428482, + "learning_rate": 4.286112039202584e-05, + "loss": 0.0806, + "num_input_tokens_seen": 18733216, + "step": 15395 + }, + { + "epoch": 1.7151130415413745, + "grad_norm": 0.5385206341743469, + "learning_rate": 4.287504176411627e-05, + "loss": 0.0663, + "num_input_tokens_seen": 18739328, + "step": 15400 + }, + { + "epoch": 1.7156698964249917, + "grad_norm": 0.07668454945087433, + "learning_rate": 4.288896313620671e-05, + "loss": 0.1078, + "num_input_tokens_seen": 18745696, + "step": 15405 + }, + { + "epoch": 1.716226751308609, + "grad_norm": 1.0038894414901733, + "learning_rate": 4.290288450829714e-05, + "loss": 0.0716, + "num_input_tokens_seen": 18751904, + "step": 15410 + }, + { + "epoch": 1.7167836061922261, + "grad_norm": 1.2553467750549316, + "learning_rate": 4.2916805880387576e-05, + "loss": 0.1404, + "num_input_tokens_seen": 18757952, + "step": 15415 + }, + { + "epoch": 1.7173404610758436, + "grad_norm": 0.18651270866394043, + "learning_rate": 4.2930727252478e-05, + "loss": 0.05, + "num_input_tokens_seen": 18763872, + "step": 15420 + }, + { + "epoch": 1.717897315959461, + "grad_norm": 0.45368367433547974, + "learning_rate": 4.294464862456844e-05, + "loss": 0.13, + "num_input_tokens_seen": 18769728, + "step": 15425 + }, + { + "epoch": 1.7184541708430783, + "grad_norm": 0.17630015313625336, + "learning_rate": 4.295856999665887e-05, + "loss": 0.0881, + "num_input_tokens_seen": 18775776, + "step": 15430 + }, + { + "epoch": 1.7190110257266955, + "grad_norm": 0.2906000316143036, + "learning_rate": 4.2972491368749306e-05, + "loss": 0.0726, + "num_input_tokens_seen": 18782048, + "step": 15435 + }, + { + "epoch": 1.719567880610313, + "grad_norm": 0.39597198367118835, + "learning_rate": 4.298641274083974e-05, + "loss": 0.1609, + "num_input_tokens_seen": 18788032, + "step": 15440 + }, + { + "epoch": 1.7201247354939304, + "grad_norm": 0.18240241706371307, + "learning_rate": 4.3000334112930175e-05, + "loss": 0.0889, + "num_input_tokens_seen": 18794176, + "step": 15445 + }, + { + "epoch": 1.7206815903775476, + "grad_norm": 1.0138323307037354, + "learning_rate": 4.30142554850206e-05, + "loss": 0.1789, + "num_input_tokens_seen": 18800128, + "step": 15450 + }, + { + "epoch": 1.7212384452611649, + "grad_norm": 1.092605710029602, + "learning_rate": 4.3028176857111044e-05, + "loss": 0.1402, + "num_input_tokens_seen": 18806048, + "step": 15455 + }, + { + "epoch": 1.721795300144782, + "grad_norm": 0.6915703415870667, + "learning_rate": 4.304209822920147e-05, + "loss": 0.1044, + "num_input_tokens_seen": 18812352, + "step": 15460 + }, + { + "epoch": 1.7223521550283996, + "grad_norm": 0.31944897770881653, + "learning_rate": 4.3056019601291906e-05, + "loss": 0.1576, + "num_input_tokens_seen": 18818528, + "step": 15465 + }, + { + "epoch": 1.722909009912017, + "grad_norm": 0.44491299986839294, + "learning_rate": 4.306994097338234e-05, + "loss": 0.0435, + "num_input_tokens_seen": 18824480, + "step": 15470 + }, + { + "epoch": 1.7234658647956342, + "grad_norm": 0.4447910785675049, + "learning_rate": 4.308386234547277e-05, + "loss": 0.3167, + "num_input_tokens_seen": 18830560, + "step": 15475 + }, + { + "epoch": 1.7240227196792515, + "grad_norm": 0.13785670697689056, + "learning_rate": 4.309778371756321e-05, + "loss": 0.035, + "num_input_tokens_seen": 18836736, + "step": 15480 + }, + { + "epoch": 1.724579574562869, + "grad_norm": 0.12341038882732391, + "learning_rate": 4.3111705089653636e-05, + "loss": 0.2171, + "num_input_tokens_seen": 18842944, + "step": 15485 + }, + { + "epoch": 1.7251364294464864, + "grad_norm": 1.111636757850647, + "learning_rate": 4.312562646174407e-05, + "loss": 0.0987, + "num_input_tokens_seen": 18848480, + "step": 15490 + }, + { + "epoch": 1.7256932843301036, + "grad_norm": 0.28024163842201233, + "learning_rate": 4.3139547833834505e-05, + "loss": 0.0719, + "num_input_tokens_seen": 18854272, + "step": 15495 + }, + { + "epoch": 1.7262501392137208, + "grad_norm": 0.06344744563102722, + "learning_rate": 4.315346920592494e-05, + "loss": 0.1043, + "num_input_tokens_seen": 18860256, + "step": 15500 + }, + { + "epoch": 1.7268069940973383, + "grad_norm": 0.6674091219902039, + "learning_rate": 4.3167390578015373e-05, + "loss": 0.0576, + "num_input_tokens_seen": 18866400, + "step": 15505 + }, + { + "epoch": 1.7273638489809555, + "grad_norm": 1.2445452213287354, + "learning_rate": 4.318131195010581e-05, + "loss": 0.1421, + "num_input_tokens_seen": 18872704, + "step": 15510 + }, + { + "epoch": 1.727920703864573, + "grad_norm": 0.6998615860939026, + "learning_rate": 4.3195233322196235e-05, + "loss": 0.1211, + "num_input_tokens_seen": 18878688, + "step": 15515 + }, + { + "epoch": 1.7284775587481902, + "grad_norm": 0.7034018635749817, + "learning_rate": 4.320915469428667e-05, + "loss": 0.1107, + "num_input_tokens_seen": 18884928, + "step": 15520 + }, + { + "epoch": 1.7290344136318074, + "grad_norm": 0.5908929705619812, + "learning_rate": 4.3223076066377104e-05, + "loss": 0.194, + "num_input_tokens_seen": 18891200, + "step": 15525 + }, + { + "epoch": 1.7295912685154249, + "grad_norm": 1.3219720125198364, + "learning_rate": 4.323699743846754e-05, + "loss": 0.0496, + "num_input_tokens_seen": 18897152, + "step": 15530 + }, + { + "epoch": 1.7301481233990423, + "grad_norm": 0.6159481406211853, + "learning_rate": 4.325091881055797e-05, + "loss": 0.1096, + "num_input_tokens_seen": 18903328, + "step": 15535 + }, + { + "epoch": 1.7307049782826596, + "grad_norm": 1.4815746545791626, + "learning_rate": 4.32648401826484e-05, + "loss": 0.0897, + "num_input_tokens_seen": 18908832, + "step": 15540 + }, + { + "epoch": 1.7312618331662768, + "grad_norm": 0.6511753797531128, + "learning_rate": 4.327876155473884e-05, + "loss": 0.1693, + "num_input_tokens_seen": 18915104, + "step": 15545 + }, + { + "epoch": 1.7318186880498943, + "grad_norm": 0.09127218276262283, + "learning_rate": 4.329268292682927e-05, + "loss": 0.074, + "num_input_tokens_seen": 18921440, + "step": 15550 + }, + { + "epoch": 1.7323755429335115, + "grad_norm": 0.5181125402450562, + "learning_rate": 4.33066042989197e-05, + "loss": 0.1329, + "num_input_tokens_seen": 18927904, + "step": 15555 + }, + { + "epoch": 1.732932397817129, + "grad_norm": 0.5134049654006958, + "learning_rate": 4.332052567101014e-05, + "loss": 0.1952, + "num_input_tokens_seen": 18933600, + "step": 15560 + }, + { + "epoch": 1.7334892527007462, + "grad_norm": 1.461991548538208, + "learning_rate": 4.3334447043100565e-05, + "loss": 0.1512, + "num_input_tokens_seen": 18939328, + "step": 15565 + }, + { + "epoch": 1.7340461075843634, + "grad_norm": 1.8916535377502441, + "learning_rate": 4.3348368415191006e-05, + "loss": 0.1356, + "num_input_tokens_seen": 18945312, + "step": 15570 + }, + { + "epoch": 1.7346029624679808, + "grad_norm": 0.09134828299283981, + "learning_rate": 4.3362289787281434e-05, + "loss": 0.0553, + "num_input_tokens_seen": 18950624, + "step": 15575 + }, + { + "epoch": 1.7351598173515983, + "grad_norm": 0.7300796508789062, + "learning_rate": 4.337621115937187e-05, + "loss": 0.1668, + "num_input_tokens_seen": 18956864, + "step": 15580 + }, + { + "epoch": 1.7357166722352155, + "grad_norm": 0.2517707049846649, + "learning_rate": 4.33901325314623e-05, + "loss": 0.0525, + "num_input_tokens_seen": 18962880, + "step": 15585 + }, + { + "epoch": 1.7362735271188328, + "grad_norm": 0.5003886222839355, + "learning_rate": 4.340405390355274e-05, + "loss": 0.0377, + "num_input_tokens_seen": 18968896, + "step": 15590 + }, + { + "epoch": 1.7368303820024502, + "grad_norm": 0.21181273460388184, + "learning_rate": 4.341797527564317e-05, + "loss": 0.052, + "num_input_tokens_seen": 18975136, + "step": 15595 + }, + { + "epoch": 1.7373872368860674, + "grad_norm": 2.018404006958008, + "learning_rate": 4.3431896647733605e-05, + "loss": 0.1457, + "num_input_tokens_seen": 18981408, + "step": 15600 + }, + { + "epoch": 1.737944091769685, + "grad_norm": 0.1551431566476822, + "learning_rate": 4.344581801982403e-05, + "loss": 0.0668, + "num_input_tokens_seen": 18987200, + "step": 15605 + }, + { + "epoch": 1.7385009466533021, + "grad_norm": 0.35097038745880127, + "learning_rate": 4.345973939191447e-05, + "loss": 0.0809, + "num_input_tokens_seen": 18993344, + "step": 15610 + }, + { + "epoch": 1.7390578015369194, + "grad_norm": 0.16330905258655548, + "learning_rate": 4.34736607640049e-05, + "loss": 0.1514, + "num_input_tokens_seen": 18999360, + "step": 15615 + }, + { + "epoch": 1.7396146564205368, + "grad_norm": 0.6460041999816895, + "learning_rate": 4.3487582136095336e-05, + "loss": 0.1078, + "num_input_tokens_seen": 19005632, + "step": 15620 + }, + { + "epoch": 1.7401715113041543, + "grad_norm": 0.10363862663507462, + "learning_rate": 4.350150350818577e-05, + "loss": 0.2442, + "num_input_tokens_seen": 19011520, + "step": 15625 + }, + { + "epoch": 1.7407283661877715, + "grad_norm": 0.5249636173248291, + "learning_rate": 4.35154248802762e-05, + "loss": 0.1792, + "num_input_tokens_seen": 19017632, + "step": 15630 + }, + { + "epoch": 1.7412852210713887, + "grad_norm": 0.07285290211439133, + "learning_rate": 4.352934625236664e-05, + "loss": 0.0839, + "num_input_tokens_seen": 19023616, + "step": 15635 + }, + { + "epoch": 1.7418420759550062, + "grad_norm": 0.9441299438476562, + "learning_rate": 4.3543267624457066e-05, + "loss": 0.1949, + "num_input_tokens_seen": 19029792, + "step": 15640 + }, + { + "epoch": 1.7423989308386234, + "grad_norm": 0.8876815438270569, + "learning_rate": 4.355718899654751e-05, + "loss": 0.0706, + "num_input_tokens_seen": 19035936, + "step": 15645 + }, + { + "epoch": 1.7429557857222409, + "grad_norm": 1.1066534519195557, + "learning_rate": 4.3571110368637935e-05, + "loss": 0.124, + "num_input_tokens_seen": 19041952, + "step": 15650 + }, + { + "epoch": 1.743512640605858, + "grad_norm": 0.5195097327232361, + "learning_rate": 4.358503174072836e-05, + "loss": 0.0375, + "num_input_tokens_seen": 19047936, + "step": 15655 + }, + { + "epoch": 1.7440694954894753, + "grad_norm": 0.8763102293014526, + "learning_rate": 4.3598953112818804e-05, + "loss": 0.1147, + "num_input_tokens_seen": 19054208, + "step": 15660 + }, + { + "epoch": 1.7446263503730928, + "grad_norm": 0.02757718786597252, + "learning_rate": 4.361287448490923e-05, + "loss": 0.1435, + "num_input_tokens_seen": 19060480, + "step": 15665 + }, + { + "epoch": 1.7451832052567102, + "grad_norm": 0.12988197803497314, + "learning_rate": 4.362679585699967e-05, + "loss": 0.1172, + "num_input_tokens_seen": 19066624, + "step": 15670 + }, + { + "epoch": 1.7457400601403275, + "grad_norm": 0.341409295797348, + "learning_rate": 4.36407172290901e-05, + "loss": 0.0697, + "num_input_tokens_seen": 19072768, + "step": 15675 + }, + { + "epoch": 1.7462969150239447, + "grad_norm": 1.0349137783050537, + "learning_rate": 4.3654638601180534e-05, + "loss": 0.088, + "num_input_tokens_seen": 19078720, + "step": 15680 + }, + { + "epoch": 1.7468537699075621, + "grad_norm": 0.3441688120365143, + "learning_rate": 4.366855997327097e-05, + "loss": 0.1403, + "num_input_tokens_seen": 19084896, + "step": 15685 + }, + { + "epoch": 1.7474106247911794, + "grad_norm": 1.3957105875015259, + "learning_rate": 4.36824813453614e-05, + "loss": 0.1974, + "num_input_tokens_seen": 19090848, + "step": 15690 + }, + { + "epoch": 1.7479674796747968, + "grad_norm": 0.18807348608970642, + "learning_rate": 4.369640271745184e-05, + "loss": 0.0828, + "num_input_tokens_seen": 19097024, + "step": 15695 + }, + { + "epoch": 1.748524334558414, + "grad_norm": 0.3609565198421478, + "learning_rate": 4.3710324089542265e-05, + "loss": 0.1032, + "num_input_tokens_seen": 19103456, + "step": 15700 + }, + { + "epoch": 1.7490811894420313, + "grad_norm": 0.1922638863325119, + "learning_rate": 4.37242454616327e-05, + "loss": 0.0665, + "num_input_tokens_seen": 19109504, + "step": 15705 + }, + { + "epoch": 1.7496380443256487, + "grad_norm": 0.40244972705841064, + "learning_rate": 4.3738166833723133e-05, + "loss": 0.0935, + "num_input_tokens_seen": 19115616, + "step": 15710 + }, + { + "epoch": 1.7501948992092662, + "grad_norm": 0.0960109531879425, + "learning_rate": 4.375208820581357e-05, + "loss": 0.0371, + "num_input_tokens_seen": 19122144, + "step": 15715 + }, + { + "epoch": 1.7507517540928834, + "grad_norm": 0.2857667803764343, + "learning_rate": 4.3766009577904e-05, + "loss": 0.1108, + "num_input_tokens_seen": 19127808, + "step": 15720 + }, + { + "epoch": 1.7513086089765006, + "grad_norm": 0.6180700659751892, + "learning_rate": 4.3779930949994436e-05, + "loss": 0.2175, + "num_input_tokens_seen": 19133952, + "step": 15725 + }, + { + "epoch": 1.751865463860118, + "grad_norm": 1.3267638683319092, + "learning_rate": 4.3793852322084864e-05, + "loss": 0.125, + "num_input_tokens_seen": 19140160, + "step": 15730 + }, + { + "epoch": 1.7524223187437353, + "grad_norm": 0.3617127239704132, + "learning_rate": 4.3807773694175305e-05, + "loss": 0.1342, + "num_input_tokens_seen": 19146368, + "step": 15735 + }, + { + "epoch": 1.7529791736273528, + "grad_norm": 0.44867759943008423, + "learning_rate": 4.382169506626573e-05, + "loss": 0.0892, + "num_input_tokens_seen": 19152320, + "step": 15740 + }, + { + "epoch": 1.75353602851097, + "grad_norm": 1.256529688835144, + "learning_rate": 4.383561643835617e-05, + "loss": 0.1789, + "num_input_tokens_seen": 19158112, + "step": 15745 + }, + { + "epoch": 1.7540928833945872, + "grad_norm": 0.46356087923049927, + "learning_rate": 4.38495378104466e-05, + "loss": 0.0786, + "num_input_tokens_seen": 19164320, + "step": 15750 + }, + { + "epoch": 1.7546497382782047, + "grad_norm": 0.4462626278400421, + "learning_rate": 4.386345918253703e-05, + "loss": 0.1394, + "num_input_tokens_seen": 19170432, + "step": 15755 + }, + { + "epoch": 1.7552065931618221, + "grad_norm": 0.23617422580718994, + "learning_rate": 4.387738055462747e-05, + "loss": 0.1453, + "num_input_tokens_seen": 19176448, + "step": 15760 + }, + { + "epoch": 1.7557634480454394, + "grad_norm": 0.5721272826194763, + "learning_rate": 4.38913019267179e-05, + "loss": 0.1196, + "num_input_tokens_seen": 19182528, + "step": 15765 + }, + { + "epoch": 1.7563203029290566, + "grad_norm": 1.1287215948104858, + "learning_rate": 4.390522329880833e-05, + "loss": 0.1586, + "num_input_tokens_seen": 19188512, + "step": 15770 + }, + { + "epoch": 1.756877157812674, + "grad_norm": 0.06429536640644073, + "learning_rate": 4.3919144670898766e-05, + "loss": 0.0759, + "num_input_tokens_seen": 19194656, + "step": 15775 + }, + { + "epoch": 1.7574340126962913, + "grad_norm": 0.5739113688468933, + "learning_rate": 4.39330660429892e-05, + "loss": 0.2006, + "num_input_tokens_seen": 19200736, + "step": 15780 + }, + { + "epoch": 1.7579908675799087, + "grad_norm": 0.6002602577209473, + "learning_rate": 4.3946987415079635e-05, + "loss": 0.1731, + "num_input_tokens_seen": 19206784, + "step": 15785 + }, + { + "epoch": 1.758547722463526, + "grad_norm": 0.3341212272644043, + "learning_rate": 4.396090878717006e-05, + "loss": 0.1116, + "num_input_tokens_seen": 19212896, + "step": 15790 + }, + { + "epoch": 1.7591045773471432, + "grad_norm": 0.7338804006576538, + "learning_rate": 4.39748301592605e-05, + "loss": 0.1016, + "num_input_tokens_seen": 19218432, + "step": 15795 + }, + { + "epoch": 1.7596614322307607, + "grad_norm": 0.35627424716949463, + "learning_rate": 4.398875153135093e-05, + "loss": 0.128, + "num_input_tokens_seen": 19224544, + "step": 15800 + }, + { + "epoch": 1.760218287114378, + "grad_norm": 1.9285632371902466, + "learning_rate": 4.4002672903441365e-05, + "loss": 0.2193, + "num_input_tokens_seen": 19230784, + "step": 15805 + }, + { + "epoch": 1.7607751419979953, + "grad_norm": 0.8425408005714417, + "learning_rate": 4.40165942755318e-05, + "loss": 0.0441, + "num_input_tokens_seen": 19236960, + "step": 15810 + }, + { + "epoch": 1.7613319968816126, + "grad_norm": 0.0541839636862278, + "learning_rate": 4.4030515647622234e-05, + "loss": 0.074, + "num_input_tokens_seen": 19243296, + "step": 15815 + }, + { + "epoch": 1.76188885176523, + "grad_norm": 1.1406530141830444, + "learning_rate": 4.404443701971266e-05, + "loss": 0.1239, + "num_input_tokens_seen": 19249536, + "step": 15820 + }, + { + "epoch": 1.7624457066488475, + "grad_norm": 0.03624442592263222, + "learning_rate": 4.40583583918031e-05, + "loss": 0.0929, + "num_input_tokens_seen": 19255840, + "step": 15825 + }, + { + "epoch": 1.7630025615324647, + "grad_norm": 0.2935195565223694, + "learning_rate": 4.407227976389353e-05, + "loss": 0.1364, + "num_input_tokens_seen": 19262112, + "step": 15830 + }, + { + "epoch": 1.763559416416082, + "grad_norm": 0.9527688026428223, + "learning_rate": 4.4086201135983965e-05, + "loss": 0.0894, + "num_input_tokens_seen": 19268416, + "step": 15835 + }, + { + "epoch": 1.7641162712996992, + "grad_norm": 0.6369028091430664, + "learning_rate": 4.41001225080744e-05, + "loss": 0.0716, + "num_input_tokens_seen": 19274272, + "step": 15840 + }, + { + "epoch": 1.7646731261833166, + "grad_norm": 0.2866136133670807, + "learning_rate": 4.4114043880164826e-05, + "loss": 0.1734, + "num_input_tokens_seen": 19280416, + "step": 15845 + }, + { + "epoch": 1.765229981066934, + "grad_norm": 0.1545330137014389, + "learning_rate": 4.412796525225527e-05, + "loss": 0.1926, + "num_input_tokens_seen": 19286560, + "step": 15850 + }, + { + "epoch": 1.7657868359505513, + "grad_norm": 0.4326992928981781, + "learning_rate": 4.4141886624345695e-05, + "loss": 0.1657, + "num_input_tokens_seen": 19292768, + "step": 15855 + }, + { + "epoch": 1.7663436908341685, + "grad_norm": 0.5750066637992859, + "learning_rate": 4.415580799643613e-05, + "loss": 0.164, + "num_input_tokens_seen": 19298688, + "step": 15860 + }, + { + "epoch": 1.766900545717786, + "grad_norm": 0.596770703792572, + "learning_rate": 4.4169729368526564e-05, + "loss": 0.1334, + "num_input_tokens_seen": 19304320, + "step": 15865 + }, + { + "epoch": 1.7674574006014034, + "grad_norm": 0.08817245066165924, + "learning_rate": 4.4183650740617e-05, + "loss": 0.0434, + "num_input_tokens_seen": 19310464, + "step": 15870 + }, + { + "epoch": 1.7680142554850207, + "grad_norm": 1.8358007669448853, + "learning_rate": 4.419757211270743e-05, + "loss": 0.2466, + "num_input_tokens_seen": 19316256, + "step": 15875 + }, + { + "epoch": 1.768571110368638, + "grad_norm": 0.07573774456977844, + "learning_rate": 4.421149348479786e-05, + "loss": 0.1071, + "num_input_tokens_seen": 19322304, + "step": 15880 + }, + { + "epoch": 1.7691279652522551, + "grad_norm": 0.5937919020652771, + "learning_rate": 4.4225414856888294e-05, + "loss": 0.1108, + "num_input_tokens_seen": 19328416, + "step": 15885 + }, + { + "epoch": 1.7696848201358726, + "grad_norm": 1.8437564373016357, + "learning_rate": 4.423933622897873e-05, + "loss": 0.1238, + "num_input_tokens_seen": 19334336, + "step": 15890 + }, + { + "epoch": 1.77024167501949, + "grad_norm": 0.0930170863866806, + "learning_rate": 4.425325760106916e-05, + "loss": 0.1245, + "num_input_tokens_seen": 19340416, + "step": 15895 + }, + { + "epoch": 1.7707985299031073, + "grad_norm": 0.5357744097709656, + "learning_rate": 4.42671789731596e-05, + "loss": 0.1882, + "num_input_tokens_seen": 19346272, + "step": 15900 + }, + { + "epoch": 1.7713553847867245, + "grad_norm": 0.5472536087036133, + "learning_rate": 4.428110034525003e-05, + "loss": 0.1047, + "num_input_tokens_seen": 19352480, + "step": 15905 + }, + { + "epoch": 1.771912239670342, + "grad_norm": 1.2984185218811035, + "learning_rate": 4.429502171734046e-05, + "loss": 0.1286, + "num_input_tokens_seen": 19358592, + "step": 15910 + }, + { + "epoch": 1.7724690945539594, + "grad_norm": 0.5577032566070557, + "learning_rate": 4.43089430894309e-05, + "loss": 0.1009, + "num_input_tokens_seen": 19364800, + "step": 15915 + }, + { + "epoch": 1.7730259494375766, + "grad_norm": 0.31600141525268555, + "learning_rate": 4.432286446152133e-05, + "loss": 0.066, + "num_input_tokens_seen": 19371072, + "step": 15920 + }, + { + "epoch": 1.7735828043211939, + "grad_norm": 0.2585841119289398, + "learning_rate": 4.433678583361176e-05, + "loss": 0.0673, + "num_input_tokens_seen": 19377440, + "step": 15925 + }, + { + "epoch": 1.774139659204811, + "grad_norm": 0.6218186020851135, + "learning_rate": 4.4350707205702196e-05, + "loss": 0.121, + "num_input_tokens_seen": 19384000, + "step": 15930 + }, + { + "epoch": 1.7746965140884285, + "grad_norm": 1.2618755102157593, + "learning_rate": 4.4364628577792624e-05, + "loss": 0.1694, + "num_input_tokens_seen": 19390176, + "step": 15935 + }, + { + "epoch": 1.775253368972046, + "grad_norm": 1.1886354684829712, + "learning_rate": 4.4378549949883065e-05, + "loss": 0.1834, + "num_input_tokens_seen": 19396064, + "step": 15940 + }, + { + "epoch": 1.7758102238556632, + "grad_norm": 0.41752466559410095, + "learning_rate": 4.439247132197349e-05, + "loss": 0.1443, + "num_input_tokens_seen": 19402400, + "step": 15945 + }, + { + "epoch": 1.7763670787392805, + "grad_norm": 0.11399069428443909, + "learning_rate": 4.4406392694063934e-05, + "loss": 0.0497, + "num_input_tokens_seen": 19408768, + "step": 15950 + }, + { + "epoch": 1.776923933622898, + "grad_norm": 0.10219629108905792, + "learning_rate": 4.442031406615436e-05, + "loss": 0.0717, + "num_input_tokens_seen": 19415168, + "step": 15955 + }, + { + "epoch": 1.7774807885065154, + "grad_norm": 1.3077540397644043, + "learning_rate": 4.4434235438244796e-05, + "loss": 0.197, + "num_input_tokens_seen": 19421184, + "step": 15960 + }, + { + "epoch": 1.7780376433901326, + "grad_norm": 0.9311196208000183, + "learning_rate": 4.444815681033523e-05, + "loss": 0.1521, + "num_input_tokens_seen": 19427328, + "step": 15965 + }, + { + "epoch": 1.7785944982737498, + "grad_norm": 0.014945870265364647, + "learning_rate": 4.446207818242566e-05, + "loss": 0.1276, + "num_input_tokens_seen": 19433728, + "step": 15970 + }, + { + "epoch": 1.779151353157367, + "grad_norm": 1.4737142324447632, + "learning_rate": 4.44759995545161e-05, + "loss": 0.0953, + "num_input_tokens_seen": 19439776, + "step": 15975 + }, + { + "epoch": 1.7797082080409845, + "grad_norm": 0.10939957946538925, + "learning_rate": 4.4489920926606526e-05, + "loss": 0.0933, + "num_input_tokens_seen": 19445856, + "step": 15980 + }, + { + "epoch": 1.780265062924602, + "grad_norm": 0.3372056186199188, + "learning_rate": 4.450384229869696e-05, + "loss": 0.0865, + "num_input_tokens_seen": 19452064, + "step": 15985 + }, + { + "epoch": 1.7808219178082192, + "grad_norm": 0.004843650385737419, + "learning_rate": 4.4517763670787395e-05, + "loss": 0.0941, + "num_input_tokens_seen": 19458432, + "step": 15990 + }, + { + "epoch": 1.7813787726918364, + "grad_norm": 0.013738983310759068, + "learning_rate": 4.453168504287783e-05, + "loss": 0.1386, + "num_input_tokens_seen": 19464448, + "step": 15995 + }, + { + "epoch": 1.7819356275754539, + "grad_norm": 0.5057507157325745, + "learning_rate": 4.4545606414968263e-05, + "loss": 0.0679, + "num_input_tokens_seen": 19470112, + "step": 16000 + }, + { + "epoch": 1.7824924824590713, + "grad_norm": 0.37004971504211426, + "learning_rate": 4.45595277870587e-05, + "loss": 0.0999, + "num_input_tokens_seen": 19476608, + "step": 16005 + }, + { + "epoch": 1.7830493373426886, + "grad_norm": 0.27266088128089905, + "learning_rate": 4.4573449159149125e-05, + "loss": 0.0544, + "num_input_tokens_seen": 19482816, + "step": 16010 + }, + { + "epoch": 1.7836061922263058, + "grad_norm": 0.11633927375078201, + "learning_rate": 4.458737053123956e-05, + "loss": 0.1299, + "num_input_tokens_seen": 19488896, + "step": 16015 + }, + { + "epoch": 1.784163047109923, + "grad_norm": 0.6258735060691833, + "learning_rate": 4.4601291903329994e-05, + "loss": 0.1024, + "num_input_tokens_seen": 19495168, + "step": 16020 + }, + { + "epoch": 1.7847199019935405, + "grad_norm": 0.5034978985786438, + "learning_rate": 4.461521327542043e-05, + "loss": 0.041, + "num_input_tokens_seen": 19501824, + "step": 16025 + }, + { + "epoch": 1.785276756877158, + "grad_norm": 0.2501086890697479, + "learning_rate": 4.462913464751086e-05, + "loss": 0.0676, + "num_input_tokens_seen": 19507968, + "step": 16030 + }, + { + "epoch": 1.7858336117607752, + "grad_norm": 0.8466212749481201, + "learning_rate": 4.464305601960129e-05, + "loss": 0.0724, + "num_input_tokens_seen": 19514208, + "step": 16035 + }, + { + "epoch": 1.7863904666443924, + "grad_norm": 1.1025334596633911, + "learning_rate": 4.465697739169173e-05, + "loss": 0.1358, + "num_input_tokens_seen": 19520288, + "step": 16040 + }, + { + "epoch": 1.7869473215280098, + "grad_norm": 1.03672456741333, + "learning_rate": 4.467089876378216e-05, + "loss": 0.1364, + "num_input_tokens_seen": 19526240, + "step": 16045 + }, + { + "epoch": 1.7875041764116273, + "grad_norm": 0.3978976309299469, + "learning_rate": 4.468482013587259e-05, + "loss": 0.0931, + "num_input_tokens_seen": 19532256, + "step": 16050 + }, + { + "epoch": 1.7880610312952445, + "grad_norm": 0.3788553774356842, + "learning_rate": 4.469874150796303e-05, + "loss": 0.1477, + "num_input_tokens_seen": 19538400, + "step": 16055 + }, + { + "epoch": 1.7886178861788617, + "grad_norm": 0.6251480579376221, + "learning_rate": 4.4712662880053455e-05, + "loss": 0.1465, + "num_input_tokens_seen": 19544704, + "step": 16060 + }, + { + "epoch": 1.789174741062479, + "grad_norm": 1.006311058998108, + "learning_rate": 4.4726584252143896e-05, + "loss": 0.1211, + "num_input_tokens_seen": 19551040, + "step": 16065 + }, + { + "epoch": 1.7897315959460964, + "grad_norm": 0.5499122738838196, + "learning_rate": 4.4740505624234324e-05, + "loss": 0.0755, + "num_input_tokens_seen": 19557472, + "step": 16070 + }, + { + "epoch": 1.7902884508297139, + "grad_norm": 0.4355844259262085, + "learning_rate": 4.475442699632476e-05, + "loss": 0.206, + "num_input_tokens_seen": 19563520, + "step": 16075 + }, + { + "epoch": 1.7908453057133311, + "grad_norm": 0.5774704217910767, + "learning_rate": 4.476834836841519e-05, + "loss": 0.1737, + "num_input_tokens_seen": 19568992, + "step": 16080 + }, + { + "epoch": 1.7914021605969483, + "grad_norm": 0.3715621829032898, + "learning_rate": 4.478226974050563e-05, + "loss": 0.0522, + "num_input_tokens_seen": 19575008, + "step": 16085 + }, + { + "epoch": 1.7919590154805658, + "grad_norm": 0.3771083652973175, + "learning_rate": 4.479619111259606e-05, + "loss": 0.1539, + "num_input_tokens_seen": 19581312, + "step": 16090 + }, + { + "epoch": 1.7925158703641833, + "grad_norm": 0.4256909191608429, + "learning_rate": 4.4810112484686495e-05, + "loss": 0.0676, + "num_input_tokens_seen": 19587808, + "step": 16095 + }, + { + "epoch": 1.7930727252478005, + "grad_norm": 2.7443416118621826, + "learning_rate": 4.482403385677692e-05, + "loss": 0.1592, + "num_input_tokens_seen": 19593856, + "step": 16100 + }, + { + "epoch": 1.7936295801314177, + "grad_norm": 1.0341861248016357, + "learning_rate": 4.483795522886736e-05, + "loss": 0.1306, + "num_input_tokens_seen": 19600256, + "step": 16105 + }, + { + "epoch": 1.794186435015035, + "grad_norm": 0.8792572021484375, + "learning_rate": 4.485187660095779e-05, + "loss": 0.0969, + "num_input_tokens_seen": 19606176, + "step": 16110 + }, + { + "epoch": 1.7947432898986524, + "grad_norm": 0.251089870929718, + "learning_rate": 4.4865797973048226e-05, + "loss": 0.0281, + "num_input_tokens_seen": 19612288, + "step": 16115 + }, + { + "epoch": 1.7953001447822698, + "grad_norm": 0.5890914797782898, + "learning_rate": 4.487971934513866e-05, + "loss": 0.0579, + "num_input_tokens_seen": 19618112, + "step": 16120 + }, + { + "epoch": 1.795856999665887, + "grad_norm": 0.17685401439666748, + "learning_rate": 4.489364071722909e-05, + "loss": 0.1279, + "num_input_tokens_seen": 19624416, + "step": 16125 + }, + { + "epoch": 1.7964138545495043, + "grad_norm": 0.5030546188354492, + "learning_rate": 4.490756208931953e-05, + "loss": 0.0441, + "num_input_tokens_seen": 19630656, + "step": 16130 + }, + { + "epoch": 1.7969707094331218, + "grad_norm": 0.9196567535400391, + "learning_rate": 4.4921483461409956e-05, + "loss": 0.2136, + "num_input_tokens_seen": 19636736, + "step": 16135 + }, + { + "epoch": 1.7975275643167392, + "grad_norm": 0.9750614166259766, + "learning_rate": 4.493540483350039e-05, + "loss": 0.1023, + "num_input_tokens_seen": 19642752, + "step": 16140 + }, + { + "epoch": 1.7980844192003564, + "grad_norm": 0.814760684967041, + "learning_rate": 4.4949326205590825e-05, + "loss": 0.1462, + "num_input_tokens_seen": 19648704, + "step": 16145 + }, + { + "epoch": 1.7986412740839737, + "grad_norm": 0.2466564178466797, + "learning_rate": 4.496324757768126e-05, + "loss": 0.1722, + "num_input_tokens_seen": 19654560, + "step": 16150 + }, + { + "epoch": 1.799198128967591, + "grad_norm": 1.018267273902893, + "learning_rate": 4.4977168949771694e-05, + "loss": 0.1726, + "num_input_tokens_seen": 19660320, + "step": 16155 + }, + { + "epoch": 1.7997549838512084, + "grad_norm": 1.2544200420379639, + "learning_rate": 4.499109032186212e-05, + "loss": 0.0883, + "num_input_tokens_seen": 19666208, + "step": 16160 + }, + { + "epoch": 1.8003118387348258, + "grad_norm": 0.004594671539962292, + "learning_rate": 4.5005011693952556e-05, + "loss": 0.0978, + "num_input_tokens_seen": 19672096, + "step": 16165 + }, + { + "epoch": 1.800868693618443, + "grad_norm": 1.0806063413619995, + "learning_rate": 4.501893306604299e-05, + "loss": 0.238, + "num_input_tokens_seen": 19678048, + "step": 16170 + }, + { + "epoch": 1.8014255485020603, + "grad_norm": 1.0616984367370605, + "learning_rate": 4.5032854438133424e-05, + "loss": 0.1454, + "num_input_tokens_seen": 19684384, + "step": 16175 + }, + { + "epoch": 1.8019824033856777, + "grad_norm": 0.8671673536300659, + "learning_rate": 4.504677581022386e-05, + "loss": 0.2957, + "num_input_tokens_seen": 19690208, + "step": 16180 + }, + { + "epoch": 1.8025392582692952, + "grad_norm": 1.0453835725784302, + "learning_rate": 4.506069718231429e-05, + "loss": 0.2484, + "num_input_tokens_seen": 19696032, + "step": 16185 + }, + { + "epoch": 1.8030961131529124, + "grad_norm": 0.26858046650886536, + "learning_rate": 4.507461855440472e-05, + "loss": 0.1019, + "num_input_tokens_seen": 19702176, + "step": 16190 + }, + { + "epoch": 1.8036529680365296, + "grad_norm": 2.2321412563323975, + "learning_rate": 4.508853992649516e-05, + "loss": 0.1286, + "num_input_tokens_seen": 19707968, + "step": 16195 + }, + { + "epoch": 1.8042098229201469, + "grad_norm": 0.6121971607208252, + "learning_rate": 4.510246129858559e-05, + "loss": 0.1275, + "num_input_tokens_seen": 19713792, + "step": 16200 + }, + { + "epoch": 1.8047666778037643, + "grad_norm": 0.5183918476104736, + "learning_rate": 4.5116382670676024e-05, + "loss": 0.2439, + "num_input_tokens_seen": 19719776, + "step": 16205 + }, + { + "epoch": 1.8053235326873818, + "grad_norm": 0.9980742335319519, + "learning_rate": 4.513030404276646e-05, + "loss": 0.3095, + "num_input_tokens_seen": 19725856, + "step": 16210 + }, + { + "epoch": 1.805880387570999, + "grad_norm": 0.10718056559562683, + "learning_rate": 4.514422541485689e-05, + "loss": 0.0253, + "num_input_tokens_seen": 19731712, + "step": 16215 + }, + { + "epoch": 1.8064372424546162, + "grad_norm": 0.2593459188938141, + "learning_rate": 4.5158146786947327e-05, + "loss": 0.0654, + "num_input_tokens_seen": 19738336, + "step": 16220 + }, + { + "epoch": 1.8069940973382337, + "grad_norm": 0.409147709608078, + "learning_rate": 4.5172068159037754e-05, + "loss": 0.0557, + "num_input_tokens_seen": 19744256, + "step": 16225 + }, + { + "epoch": 1.8075509522218511, + "grad_norm": 1.712742567062378, + "learning_rate": 4.5185989531128195e-05, + "loss": 0.1666, + "num_input_tokens_seen": 19750432, + "step": 16230 + }, + { + "epoch": 1.8081078071054684, + "grad_norm": 0.20031115412712097, + "learning_rate": 4.519991090321862e-05, + "loss": 0.128, + "num_input_tokens_seen": 19756512, + "step": 16235 + }, + { + "epoch": 1.8086646619890856, + "grad_norm": 0.6189295053482056, + "learning_rate": 4.521383227530906e-05, + "loss": 0.1106, + "num_input_tokens_seen": 19762336, + "step": 16240 + }, + { + "epoch": 1.8092215168727028, + "grad_norm": 1.029457449913025, + "learning_rate": 4.522775364739949e-05, + "loss": 0.0757, + "num_input_tokens_seen": 19768928, + "step": 16245 + }, + { + "epoch": 1.8097783717563203, + "grad_norm": 0.9627676010131836, + "learning_rate": 4.524167501948992e-05, + "loss": 0.1421, + "num_input_tokens_seen": 19775136, + "step": 16250 + }, + { + "epoch": 1.8103352266399377, + "grad_norm": 0.5679406523704529, + "learning_rate": 4.525559639158036e-05, + "loss": 0.0934, + "num_input_tokens_seen": 19781024, + "step": 16255 + }, + { + "epoch": 1.810892081523555, + "grad_norm": 0.16095133125782013, + "learning_rate": 4.526951776367079e-05, + "loss": 0.0432, + "num_input_tokens_seen": 19787584, + "step": 16260 + }, + { + "epoch": 1.8114489364071722, + "grad_norm": 1.3180961608886719, + "learning_rate": 4.528343913576122e-05, + "loss": 0.0923, + "num_input_tokens_seen": 19793536, + "step": 16265 + }, + { + "epoch": 1.8120057912907896, + "grad_norm": 0.7991659641265869, + "learning_rate": 4.5297360507851656e-05, + "loss": 0.0682, + "num_input_tokens_seen": 19799968, + "step": 16270 + }, + { + "epoch": 1.812562646174407, + "grad_norm": 0.17289745807647705, + "learning_rate": 4.531128187994209e-05, + "loss": 0.1086, + "num_input_tokens_seen": 19806208, + "step": 16275 + }, + { + "epoch": 1.8131195010580243, + "grad_norm": 1.2097833156585693, + "learning_rate": 4.5325203252032525e-05, + "loss": 0.2178, + "num_input_tokens_seen": 19812320, + "step": 16280 + }, + { + "epoch": 1.8136763559416416, + "grad_norm": 0.22549381852149963, + "learning_rate": 4.533912462412296e-05, + "loss": 0.0418, + "num_input_tokens_seen": 19818688, + "step": 16285 + }, + { + "epoch": 1.8142332108252588, + "grad_norm": 0.05605443939566612, + "learning_rate": 4.535304599621339e-05, + "loss": 0.1014, + "num_input_tokens_seen": 19824928, + "step": 16290 + }, + { + "epoch": 1.8147900657088762, + "grad_norm": 0.6239849925041199, + "learning_rate": 4.536696736830382e-05, + "loss": 0.1216, + "num_input_tokens_seen": 19831104, + "step": 16295 + }, + { + "epoch": 1.8153469205924937, + "grad_norm": 1.3402328491210938, + "learning_rate": 4.5380888740394255e-05, + "loss": 0.1267, + "num_input_tokens_seen": 19837152, + "step": 16300 + }, + { + "epoch": 1.815903775476111, + "grad_norm": 0.6231488585472107, + "learning_rate": 4.539481011248469e-05, + "loss": 0.0904, + "num_input_tokens_seen": 19843456, + "step": 16305 + }, + { + "epoch": 1.8164606303597282, + "grad_norm": 0.3790552318096161, + "learning_rate": 4.5408731484575124e-05, + "loss": 0.2274, + "num_input_tokens_seen": 19848832, + "step": 16310 + }, + { + "epoch": 1.8170174852433456, + "grad_norm": 0.7045077085494995, + "learning_rate": 4.542265285666555e-05, + "loss": 0.0723, + "num_input_tokens_seen": 19854976, + "step": 16315 + }, + { + "epoch": 1.817574340126963, + "grad_norm": 0.5945013165473938, + "learning_rate": 4.543657422875599e-05, + "loss": 0.0978, + "num_input_tokens_seen": 19861216, + "step": 16320 + }, + { + "epoch": 1.8181311950105803, + "grad_norm": 0.04465155303478241, + "learning_rate": 4.545049560084642e-05, + "loss": 0.0081, + "num_input_tokens_seen": 19867488, + "step": 16325 + }, + { + "epoch": 1.8186880498941975, + "grad_norm": 0.09361328929662704, + "learning_rate": 4.5464416972936855e-05, + "loss": 0.0771, + "num_input_tokens_seen": 19873600, + "step": 16330 + }, + { + "epoch": 1.8192449047778148, + "grad_norm": 0.34713807702064514, + "learning_rate": 4.547833834502729e-05, + "loss": 0.1258, + "num_input_tokens_seen": 19879584, + "step": 16335 + }, + { + "epoch": 1.8198017596614322, + "grad_norm": 1.147483468055725, + "learning_rate": 4.5492259717117717e-05, + "loss": 0.1436, + "num_input_tokens_seen": 19885728, + "step": 16340 + }, + { + "epoch": 1.8203586145450497, + "grad_norm": 0.20243245363235474, + "learning_rate": 4.550618108920816e-05, + "loss": 0.049, + "num_input_tokens_seen": 19891840, + "step": 16345 + }, + { + "epoch": 1.8209154694286669, + "grad_norm": 1.7498871088027954, + "learning_rate": 4.5520102461298585e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19897984, + "step": 16350 + }, + { + "epoch": 1.8214723243122841, + "grad_norm": 1.5880485773086548, + "learning_rate": 4.553402383338902e-05, + "loss": 0.1496, + "num_input_tokens_seen": 19904352, + "step": 16355 + }, + { + "epoch": 1.8220291791959016, + "grad_norm": 0.7440773248672485, + "learning_rate": 4.5547945205479454e-05, + "loss": 0.12, + "num_input_tokens_seen": 19910208, + "step": 16360 + }, + { + "epoch": 1.822586034079519, + "grad_norm": 0.747539222240448, + "learning_rate": 4.556186657756989e-05, + "loss": 0.1857, + "num_input_tokens_seen": 19916352, + "step": 16365 + }, + { + "epoch": 1.8231428889631363, + "grad_norm": 0.6793246269226074, + "learning_rate": 4.557578794966032e-05, + "loss": 0.0908, + "num_input_tokens_seen": 19922432, + "step": 16370 + }, + { + "epoch": 1.8236997438467535, + "grad_norm": 0.25849536061286926, + "learning_rate": 4.558970932175076e-05, + "loss": 0.0931, + "num_input_tokens_seen": 19928480, + "step": 16375 + }, + { + "epoch": 1.8242565987303707, + "grad_norm": 2.7763566970825195, + "learning_rate": 4.5603630693841184e-05, + "loss": 0.2062, + "num_input_tokens_seen": 19934400, + "step": 16380 + }, + { + "epoch": 1.8248134536139882, + "grad_norm": 1.6416070461273193, + "learning_rate": 4.561755206593162e-05, + "loss": 0.1464, + "num_input_tokens_seen": 19940608, + "step": 16385 + }, + { + "epoch": 1.8253703084976056, + "grad_norm": 0.5906521081924438, + "learning_rate": 4.563147343802205e-05, + "loss": 0.1722, + "num_input_tokens_seen": 19946848, + "step": 16390 + }, + { + "epoch": 1.8259271633812229, + "grad_norm": 0.13710153102874756, + "learning_rate": 4.564539481011249e-05, + "loss": 0.1115, + "num_input_tokens_seen": 19952480, + "step": 16395 + }, + { + "epoch": 1.82648401826484, + "grad_norm": 0.7855330109596252, + "learning_rate": 4.565931618220292e-05, + "loss": 0.0525, + "num_input_tokens_seen": 19958528, + "step": 16400 + }, + { + "epoch": 1.8270408731484575, + "grad_norm": 2.660027265548706, + "learning_rate": 4.567323755429335e-05, + "loss": 0.1213, + "num_input_tokens_seen": 19964320, + "step": 16405 + }, + { + "epoch": 1.827597728032075, + "grad_norm": 0.5759193301200867, + "learning_rate": 4.568715892638379e-05, + "loss": 0.1151, + "num_input_tokens_seen": 19970912, + "step": 16410 + }, + { + "epoch": 1.8281545829156922, + "grad_norm": 0.8445032238960266, + "learning_rate": 4.570108029847422e-05, + "loss": 0.1559, + "num_input_tokens_seen": 19977408, + "step": 16415 + }, + { + "epoch": 1.8287114377993094, + "grad_norm": 0.6892715692520142, + "learning_rate": 4.571500167056465e-05, + "loss": 0.1712, + "num_input_tokens_seen": 19982752, + "step": 16420 + }, + { + "epoch": 1.8292682926829267, + "grad_norm": 0.29959988594055176, + "learning_rate": 4.5728923042655087e-05, + "loss": 0.0431, + "num_input_tokens_seen": 19989152, + "step": 16425 + }, + { + "epoch": 1.8298251475665441, + "grad_norm": 1.9797918796539307, + "learning_rate": 4.5742844414745514e-05, + "loss": 0.299, + "num_input_tokens_seen": 19994976, + "step": 16430 + }, + { + "epoch": 1.8303820024501616, + "grad_norm": 0.4183412194252014, + "learning_rate": 4.5756765786835955e-05, + "loss": 0.0817, + "num_input_tokens_seen": 20000384, + "step": 16435 + }, + { + "epoch": 1.8309388573337788, + "grad_norm": 1.2197500467300415, + "learning_rate": 4.577068715892638e-05, + "loss": 0.1214, + "num_input_tokens_seen": 20006560, + "step": 16440 + }, + { + "epoch": 1.831495712217396, + "grad_norm": 0.1781550794839859, + "learning_rate": 4.578460853101682e-05, + "loss": 0.1121, + "num_input_tokens_seen": 20012832, + "step": 16445 + }, + { + "epoch": 1.8320525671010135, + "grad_norm": 0.10214725136756897, + "learning_rate": 4.579852990310725e-05, + "loss": 0.1073, + "num_input_tokens_seen": 20018976, + "step": 16450 + }, + { + "epoch": 1.832609421984631, + "grad_norm": 0.8276961445808411, + "learning_rate": 4.5812451275197686e-05, + "loss": 0.1486, + "num_input_tokens_seen": 20024928, + "step": 16455 + }, + { + "epoch": 1.8331662768682482, + "grad_norm": 0.0678059384226799, + "learning_rate": 4.582637264728812e-05, + "loss": 0.2188, + "num_input_tokens_seen": 20030400, + "step": 16460 + }, + { + "epoch": 1.8337231317518654, + "grad_norm": 0.5109464526176453, + "learning_rate": 4.5840294019378554e-05, + "loss": 0.3214, + "num_input_tokens_seen": 20036320, + "step": 16465 + }, + { + "epoch": 1.8342799866354826, + "grad_norm": 0.5335940718650818, + "learning_rate": 4.585421539146899e-05, + "loss": 0.0776, + "num_input_tokens_seen": 20042656, + "step": 16470 + }, + { + "epoch": 1.8348368415191, + "grad_norm": 0.06699260324239731, + "learning_rate": 4.5868136763559416e-05, + "loss": 0.0419, + "num_input_tokens_seen": 20048832, + "step": 16475 + }, + { + "epoch": 1.8353936964027175, + "grad_norm": 1.266980767250061, + "learning_rate": 4.588205813564985e-05, + "loss": 0.1121, + "num_input_tokens_seen": 20054976, + "step": 16480 + }, + { + "epoch": 1.8359505512863348, + "grad_norm": 0.14155778288841248, + "learning_rate": 4.5895979507740285e-05, + "loss": 0.0228, + "num_input_tokens_seen": 20061280, + "step": 16485 + }, + { + "epoch": 1.836507406169952, + "grad_norm": 1.108013391494751, + "learning_rate": 4.590990087983072e-05, + "loss": 0.164, + "num_input_tokens_seen": 20067264, + "step": 16490 + }, + { + "epoch": 1.8370642610535695, + "grad_norm": 0.15625987946987152, + "learning_rate": 4.5923822251921154e-05, + "loss": 0.1587, + "num_input_tokens_seen": 20073664, + "step": 16495 + }, + { + "epoch": 1.837621115937187, + "grad_norm": 1.798425316810608, + "learning_rate": 4.593774362401159e-05, + "loss": 0.1515, + "num_input_tokens_seen": 20080064, + "step": 16500 + }, + { + "epoch": 1.8381779708208041, + "grad_norm": 0.18885022401809692, + "learning_rate": 4.5951664996102015e-05, + "loss": 0.2223, + "num_input_tokens_seen": 20086400, + "step": 16505 + }, + { + "epoch": 1.8387348257044214, + "grad_norm": 0.212848499417305, + "learning_rate": 4.5965586368192457e-05, + "loss": 0.0677, + "num_input_tokens_seen": 20092384, + "step": 16510 + }, + { + "epoch": 1.8392916805880386, + "grad_norm": 0.5024674534797668, + "learning_rate": 4.5979507740282884e-05, + "loss": 0.1097, + "num_input_tokens_seen": 20098304, + "step": 16515 + }, + { + "epoch": 1.839848535471656, + "grad_norm": 0.12781192362308502, + "learning_rate": 4.599342911237332e-05, + "loss": 0.0608, + "num_input_tokens_seen": 20104416, + "step": 16520 + }, + { + "epoch": 1.8404053903552735, + "grad_norm": 0.20301038026809692, + "learning_rate": 4.600735048446375e-05, + "loss": 0.069, + "num_input_tokens_seen": 20110496, + "step": 16525 + }, + { + "epoch": 1.8409622452388907, + "grad_norm": 0.2748459279537201, + "learning_rate": 4.602127185655418e-05, + "loss": 0.1555, + "num_input_tokens_seen": 20116704, + "step": 16530 + }, + { + "epoch": 1.841519100122508, + "grad_norm": 0.7974340915679932, + "learning_rate": 4.603519322864462e-05, + "loss": 0.0634, + "num_input_tokens_seen": 20123008, + "step": 16535 + }, + { + "epoch": 1.8420759550061254, + "grad_norm": 0.5112535357475281, + "learning_rate": 4.604911460073505e-05, + "loss": 0.0797, + "num_input_tokens_seen": 20129312, + "step": 16540 + }, + { + "epoch": 1.8426328098897429, + "grad_norm": 1.6831358671188354, + "learning_rate": 4.606303597282548e-05, + "loss": 0.2904, + "num_input_tokens_seen": 20135392, + "step": 16545 + }, + { + "epoch": 1.84318966477336, + "grad_norm": 0.47442010045051575, + "learning_rate": 4.607695734491592e-05, + "loss": 0.0564, + "num_input_tokens_seen": 20141824, + "step": 16550 + }, + { + "epoch": 1.8437465196569773, + "grad_norm": 1.522140383720398, + "learning_rate": 4.609087871700635e-05, + "loss": 0.2094, + "num_input_tokens_seen": 20147712, + "step": 16555 + }, + { + "epoch": 1.8443033745405946, + "grad_norm": 0.754464328289032, + "learning_rate": 4.6104800089096786e-05, + "loss": 0.0956, + "num_input_tokens_seen": 20154144, + "step": 16560 + }, + { + "epoch": 1.844860229424212, + "grad_norm": 0.02830428257584572, + "learning_rate": 4.6118721461187214e-05, + "loss": 0.0513, + "num_input_tokens_seen": 20160416, + "step": 16565 + }, + { + "epoch": 1.8454170843078295, + "grad_norm": 0.08736565709114075, + "learning_rate": 4.613264283327765e-05, + "loss": 0.0751, + "num_input_tokens_seen": 20166688, + "step": 16570 + }, + { + "epoch": 1.8459739391914467, + "grad_norm": 0.8772808313369751, + "learning_rate": 4.614656420536808e-05, + "loss": 0.2202, + "num_input_tokens_seen": 20172256, + "step": 16575 + }, + { + "epoch": 1.846530794075064, + "grad_norm": 1.4609960317611694, + "learning_rate": 4.616048557745852e-05, + "loss": 0.139, + "num_input_tokens_seen": 20178016, + "step": 16580 + }, + { + "epoch": 1.8470876489586814, + "grad_norm": 0.4771650433540344, + "learning_rate": 4.617440694954895e-05, + "loss": 0.1205, + "num_input_tokens_seen": 20184032, + "step": 16585 + }, + { + "epoch": 1.8476445038422988, + "grad_norm": 0.01906915195286274, + "learning_rate": 4.6188328321639385e-05, + "loss": 0.0689, + "num_input_tokens_seen": 20190048, + "step": 16590 + }, + { + "epoch": 1.848201358725916, + "grad_norm": 0.07005305588245392, + "learning_rate": 4.620224969372981e-05, + "loss": 0.1681, + "num_input_tokens_seen": 20195616, + "step": 16595 + }, + { + "epoch": 1.8487582136095333, + "grad_norm": 0.11665718257427216, + "learning_rate": 4.6216171065820254e-05, + "loss": 0.2506, + "num_input_tokens_seen": 20201696, + "step": 16600 + }, + { + "epoch": 1.8493150684931505, + "grad_norm": 0.9011217355728149, + "learning_rate": 4.623009243791068e-05, + "loss": 0.1141, + "num_input_tokens_seen": 20207744, + "step": 16605 + }, + { + "epoch": 1.849871923376768, + "grad_norm": 0.5365315675735474, + "learning_rate": 4.6244013810001116e-05, + "loss": 0.1533, + "num_input_tokens_seen": 20214048, + "step": 16610 + }, + { + "epoch": 1.8504287782603854, + "grad_norm": 0.9600639343261719, + "learning_rate": 4.625793518209155e-05, + "loss": 0.1511, + "num_input_tokens_seen": 20219552, + "step": 16615 + }, + { + "epoch": 1.8509856331440027, + "grad_norm": 0.17933189868927002, + "learning_rate": 4.627185655418198e-05, + "loss": 0.0435, + "num_input_tokens_seen": 20225888, + "step": 16620 + }, + { + "epoch": 1.85154248802762, + "grad_norm": 1.4116125106811523, + "learning_rate": 4.628577792627242e-05, + "loss": 0.1782, + "num_input_tokens_seen": 20231968, + "step": 16625 + }, + { + "epoch": 1.8520993429112373, + "grad_norm": 0.007633406203240156, + "learning_rate": 4.6299699298362847e-05, + "loss": 0.0736, + "num_input_tokens_seen": 20238048, + "step": 16630 + }, + { + "epoch": 1.8526561977948548, + "grad_norm": 0.38293948769569397, + "learning_rate": 4.631362067045328e-05, + "loss": 0.1466, + "num_input_tokens_seen": 20244032, + "step": 16635 + }, + { + "epoch": 1.853213052678472, + "grad_norm": 0.547943115234375, + "learning_rate": 4.6327542042543715e-05, + "loss": 0.0872, + "num_input_tokens_seen": 20250080, + "step": 16640 + }, + { + "epoch": 1.8537699075620893, + "grad_norm": 0.1862129271030426, + "learning_rate": 4.634146341463415e-05, + "loss": 0.0928, + "num_input_tokens_seen": 20256192, + "step": 16645 + }, + { + "epoch": 1.8543267624457065, + "grad_norm": 0.2836395800113678, + "learning_rate": 4.6355384786724584e-05, + "loss": 0.062, + "num_input_tokens_seen": 20262400, + "step": 16650 + }, + { + "epoch": 1.854883617329324, + "grad_norm": 0.28773894906044006, + "learning_rate": 4.636930615881501e-05, + "loss": 0.0825, + "num_input_tokens_seen": 20268512, + "step": 16655 + }, + { + "epoch": 1.8554404722129414, + "grad_norm": 0.32645106315612793, + "learning_rate": 4.6383227530905446e-05, + "loss": 0.1698, + "num_input_tokens_seen": 20274816, + "step": 16660 + }, + { + "epoch": 1.8559973270965586, + "grad_norm": 0.4692760109901428, + "learning_rate": 4.639714890299588e-05, + "loss": 0.044, + "num_input_tokens_seen": 20280768, + "step": 16665 + }, + { + "epoch": 1.8565541819801759, + "grad_norm": 1.134973168373108, + "learning_rate": 4.6411070275086314e-05, + "loss": 0.0934, + "num_input_tokens_seen": 20286880, + "step": 16670 + }, + { + "epoch": 1.8571110368637933, + "grad_norm": 0.1878475546836853, + "learning_rate": 4.642499164717675e-05, + "loss": 0.1181, + "num_input_tokens_seen": 20293344, + "step": 16675 + }, + { + "epoch": 1.8576678917474108, + "grad_norm": 0.7736659049987793, + "learning_rate": 4.643891301926718e-05, + "loss": 0.1408, + "num_input_tokens_seen": 20299328, + "step": 16680 + }, + { + "epoch": 1.858224746631028, + "grad_norm": 0.3169881999492645, + "learning_rate": 4.645283439135761e-05, + "loss": 0.0731, + "num_input_tokens_seen": 20305408, + "step": 16685 + }, + { + "epoch": 1.8587816015146452, + "grad_norm": 0.01250145398080349, + "learning_rate": 4.646675576344805e-05, + "loss": 0.144, + "num_input_tokens_seen": 20311200, + "step": 16690 + }, + { + "epoch": 1.8593384563982625, + "grad_norm": 0.5864185690879822, + "learning_rate": 4.648067713553848e-05, + "loss": 0.2181, + "num_input_tokens_seen": 20317504, + "step": 16695 + }, + { + "epoch": 1.85989531128188, + "grad_norm": 0.012199889868497849, + "learning_rate": 4.6494598507628914e-05, + "loss": 0.0328, + "num_input_tokens_seen": 20323552, + "step": 16700 + }, + { + "epoch": 1.8604521661654974, + "grad_norm": 0.19045665860176086, + "learning_rate": 4.650851987971935e-05, + "loss": 0.1039, + "num_input_tokens_seen": 20329696, + "step": 16705 + }, + { + "epoch": 1.8610090210491146, + "grad_norm": 0.2836117744445801, + "learning_rate": 4.6522441251809775e-05, + "loss": 0.064, + "num_input_tokens_seen": 20335616, + "step": 16710 + }, + { + "epoch": 1.8615658759327318, + "grad_norm": 0.48527005314826965, + "learning_rate": 4.6536362623900217e-05, + "loss": 0.1365, + "num_input_tokens_seen": 20341888, + "step": 16715 + }, + { + "epoch": 1.8621227308163493, + "grad_norm": 0.8450993299484253, + "learning_rate": 4.6550283995990644e-05, + "loss": 0.1574, + "num_input_tokens_seen": 20348128, + "step": 16720 + }, + { + "epoch": 1.8626795856999667, + "grad_norm": 0.03579701855778694, + "learning_rate": 4.6564205368081085e-05, + "loss": 0.0772, + "num_input_tokens_seen": 20354496, + "step": 16725 + }, + { + "epoch": 1.863236440583584, + "grad_norm": 1.5290899276733398, + "learning_rate": 4.657812674017151e-05, + "loss": 0.0576, + "num_input_tokens_seen": 20360704, + "step": 16730 + }, + { + "epoch": 1.8637932954672012, + "grad_norm": 0.24435976147651672, + "learning_rate": 4.659204811226195e-05, + "loss": 0.0329, + "num_input_tokens_seen": 20367072, + "step": 16735 + }, + { + "epoch": 1.8643501503508184, + "grad_norm": 0.5576940774917603, + "learning_rate": 4.660596948435238e-05, + "loss": 0.1, + "num_input_tokens_seen": 20373312, + "step": 16740 + }, + { + "epoch": 1.8649070052344359, + "grad_norm": 0.02256399765610695, + "learning_rate": 4.661989085644281e-05, + "loss": 0.0524, + "num_input_tokens_seen": 20379712, + "step": 16745 + }, + { + "epoch": 1.8654638601180533, + "grad_norm": 0.7657848596572876, + "learning_rate": 4.663381222853325e-05, + "loss": 0.0943, + "num_input_tokens_seen": 20385856, + "step": 16750 + }, + { + "epoch": 1.8660207150016705, + "grad_norm": 1.0247738361358643, + "learning_rate": 4.664773360062368e-05, + "loss": 0.078, + "num_input_tokens_seen": 20392128, + "step": 16755 + }, + { + "epoch": 1.8665775698852878, + "grad_norm": 0.6669758558273315, + "learning_rate": 4.666165497271411e-05, + "loss": 0.136, + "num_input_tokens_seen": 20398208, + "step": 16760 + }, + { + "epoch": 1.8671344247689052, + "grad_norm": 0.47127026319503784, + "learning_rate": 4.6675576344804546e-05, + "loss": 0.243, + "num_input_tokens_seen": 20404192, + "step": 16765 + }, + { + "epoch": 1.8676912796525227, + "grad_norm": 1.1716597080230713, + "learning_rate": 4.668949771689498e-05, + "loss": 0.2032, + "num_input_tokens_seen": 20410784, + "step": 16770 + }, + { + "epoch": 1.86824813453614, + "grad_norm": 0.03026314452290535, + "learning_rate": 4.6703419088985415e-05, + "loss": 0.0567, + "num_input_tokens_seen": 20417184, + "step": 16775 + }, + { + "epoch": 1.8688049894197571, + "grad_norm": 0.20757633447647095, + "learning_rate": 4.671734046107585e-05, + "loss": 0.2586, + "num_input_tokens_seen": 20423296, + "step": 16780 + }, + { + "epoch": 1.8693618443033744, + "grad_norm": 0.5891596078872681, + "learning_rate": 4.673126183316628e-05, + "loss": 0.121, + "num_input_tokens_seen": 20428704, + "step": 16785 + }, + { + "epoch": 1.8699186991869918, + "grad_norm": 1.712658405303955, + "learning_rate": 4.674518320525671e-05, + "loss": 0.1402, + "num_input_tokens_seen": 20434496, + "step": 16790 + }, + { + "epoch": 1.8704755540706093, + "grad_norm": 1.4141480922698975, + "learning_rate": 4.6759104577347146e-05, + "loss": 0.087, + "num_input_tokens_seen": 20440576, + "step": 16795 + }, + { + "epoch": 1.8710324089542265, + "grad_norm": 1.131595253944397, + "learning_rate": 4.677302594943758e-05, + "loss": 0.0678, + "num_input_tokens_seen": 20446752, + "step": 16800 + }, + { + "epoch": 1.8715892638378437, + "grad_norm": 0.7050909399986267, + "learning_rate": 4.6786947321528014e-05, + "loss": 0.2139, + "num_input_tokens_seen": 20452672, + "step": 16805 + }, + { + "epoch": 1.8721461187214612, + "grad_norm": 1.105912685394287, + "learning_rate": 4.680086869361844e-05, + "loss": 0.1187, + "num_input_tokens_seen": 20458784, + "step": 16810 + }, + { + "epoch": 1.8727029736050786, + "grad_norm": 0.006954746786504984, + "learning_rate": 4.681479006570888e-05, + "loss": 0.1186, + "num_input_tokens_seen": 20465152, + "step": 16815 + }, + { + "epoch": 1.8732598284886959, + "grad_norm": 0.0996263399720192, + "learning_rate": 4.682871143779931e-05, + "loss": 0.1388, + "num_input_tokens_seen": 20471296, + "step": 16820 + }, + { + "epoch": 1.873816683372313, + "grad_norm": 1.2347527742385864, + "learning_rate": 4.6842632809889745e-05, + "loss": 0.1737, + "num_input_tokens_seen": 20477376, + "step": 16825 + }, + { + "epoch": 1.8743735382559303, + "grad_norm": 0.4252563416957855, + "learning_rate": 4.685655418198018e-05, + "loss": 0.0598, + "num_input_tokens_seen": 20483840, + "step": 16830 + }, + { + "epoch": 1.8749303931395478, + "grad_norm": 0.5915695428848267, + "learning_rate": 4.687047555407061e-05, + "loss": 0.1115, + "num_input_tokens_seen": 20490048, + "step": 16835 + }, + { + "epoch": 1.8754872480231652, + "grad_norm": 0.9458092451095581, + "learning_rate": 4.688439692616105e-05, + "loss": 0.0791, + "num_input_tokens_seen": 20495680, + "step": 16840 + }, + { + "epoch": 1.8760441029067825, + "grad_norm": 0.0032277717255055904, + "learning_rate": 4.6898318298251475e-05, + "loss": 0.1498, + "num_input_tokens_seen": 20502080, + "step": 16845 + }, + { + "epoch": 1.8766009577903997, + "grad_norm": 1.4078149795532227, + "learning_rate": 4.691223967034191e-05, + "loss": 0.1234, + "num_input_tokens_seen": 20508064, + "step": 16850 + }, + { + "epoch": 1.8771578126740172, + "grad_norm": 0.4704437851905823, + "learning_rate": 4.6926161042432344e-05, + "loss": 0.1257, + "num_input_tokens_seen": 20514368, + "step": 16855 + }, + { + "epoch": 1.8777146675576346, + "grad_norm": 0.45042189955711365, + "learning_rate": 4.694008241452278e-05, + "loss": 0.1501, + "num_input_tokens_seen": 20520640, + "step": 16860 + }, + { + "epoch": 1.8782715224412518, + "grad_norm": 0.012579329311847687, + "learning_rate": 4.695400378661321e-05, + "loss": 0.0733, + "num_input_tokens_seen": 20527008, + "step": 16865 + }, + { + "epoch": 1.878828377324869, + "grad_norm": 0.6087126135826111, + "learning_rate": 4.696792515870365e-05, + "loss": 0.1482, + "num_input_tokens_seen": 20533088, + "step": 16870 + }, + { + "epoch": 1.8793852322084865, + "grad_norm": 0.823168158531189, + "learning_rate": 4.6981846530794074e-05, + "loss": 0.1229, + "num_input_tokens_seen": 20538400, + "step": 16875 + }, + { + "epoch": 1.8799420870921038, + "grad_norm": 1.8573217391967773, + "learning_rate": 4.6995767902884516e-05, + "loss": 0.2472, + "num_input_tokens_seen": 20544128, + "step": 16880 + }, + { + "epoch": 1.8804989419757212, + "grad_norm": 0.4085913896560669, + "learning_rate": 4.700968927497494e-05, + "loss": 0.0376, + "num_input_tokens_seen": 20550336, + "step": 16885 + }, + { + "epoch": 1.8810557968593384, + "grad_norm": 0.21210877597332, + "learning_rate": 4.702361064706538e-05, + "loss": 0.0769, + "num_input_tokens_seen": 20556576, + "step": 16890 + }, + { + "epoch": 1.8816126517429557, + "grad_norm": 0.27895277738571167, + "learning_rate": 4.703753201915581e-05, + "loss": 0.1513, + "num_input_tokens_seen": 20562848, + "step": 16895 + }, + { + "epoch": 1.8821695066265731, + "grad_norm": 0.1837446540594101, + "learning_rate": 4.705145339124624e-05, + "loss": 0.0302, + "num_input_tokens_seen": 20568896, + "step": 16900 + }, + { + "epoch": 1.8827263615101906, + "grad_norm": 0.7432642579078674, + "learning_rate": 4.706537476333668e-05, + "loss": 0.065, + "num_input_tokens_seen": 20575072, + "step": 16905 + }, + { + "epoch": 1.8832832163938078, + "grad_norm": 0.41937679052352905, + "learning_rate": 4.707929613542711e-05, + "loss": 0.0539, + "num_input_tokens_seen": 20581440, + "step": 16910 + }, + { + "epoch": 1.883840071277425, + "grad_norm": 0.5399029850959778, + "learning_rate": 4.709321750751754e-05, + "loss": 0.0968, + "num_input_tokens_seen": 20587648, + "step": 16915 + }, + { + "epoch": 1.8843969261610425, + "grad_norm": 0.02828231453895569, + "learning_rate": 4.710713887960798e-05, + "loss": 0.1108, + "num_input_tokens_seen": 20593728, + "step": 16920 + }, + { + "epoch": 1.8849537810446597, + "grad_norm": 1.3209339380264282, + "learning_rate": 4.712106025169841e-05, + "loss": 0.1157, + "num_input_tokens_seen": 20600032, + "step": 16925 + }, + { + "epoch": 1.8855106359282772, + "grad_norm": 0.5100467205047607, + "learning_rate": 4.7134981623788845e-05, + "loss": 0.1901, + "num_input_tokens_seen": 20606080, + "step": 16930 + }, + { + "epoch": 1.8860674908118944, + "grad_norm": 0.04833408445119858, + "learning_rate": 4.714890299587927e-05, + "loss": 0.1007, + "num_input_tokens_seen": 20611616, + "step": 16935 + }, + { + "epoch": 1.8866243456955116, + "grad_norm": 0.0729886144399643, + "learning_rate": 4.716282436796971e-05, + "loss": 0.1546, + "num_input_tokens_seen": 20617696, + "step": 16940 + }, + { + "epoch": 1.887181200579129, + "grad_norm": 0.48041877150535583, + "learning_rate": 4.717674574006014e-05, + "loss": 0.0713, + "num_input_tokens_seen": 20623680, + "step": 16945 + }, + { + "epoch": 1.8877380554627465, + "grad_norm": 0.4045594036579132, + "learning_rate": 4.7190667112150576e-05, + "loss": 0.1507, + "num_input_tokens_seen": 20629312, + "step": 16950 + }, + { + "epoch": 1.8882949103463638, + "grad_norm": 0.7672168612480164, + "learning_rate": 4.720458848424101e-05, + "loss": 0.1651, + "num_input_tokens_seen": 20635360, + "step": 16955 + }, + { + "epoch": 1.888851765229981, + "grad_norm": 1.0244041681289673, + "learning_rate": 4.7218509856331444e-05, + "loss": 0.2165, + "num_input_tokens_seen": 20641664, + "step": 16960 + }, + { + "epoch": 1.8894086201135984, + "grad_norm": 0.503510057926178, + "learning_rate": 4.723243122842187e-05, + "loss": 0.1036, + "num_input_tokens_seen": 20647744, + "step": 16965 + }, + { + "epoch": 1.8899654749972157, + "grad_norm": 0.37983083724975586, + "learning_rate": 4.724635260051231e-05, + "loss": 0.0448, + "num_input_tokens_seen": 20653792, + "step": 16970 + }, + { + "epoch": 1.8905223298808331, + "grad_norm": 0.021980905905365944, + "learning_rate": 4.726027397260274e-05, + "loss": 0.0859, + "num_input_tokens_seen": 20660096, + "step": 16975 + }, + { + "epoch": 1.8910791847644504, + "grad_norm": 0.27823883295059204, + "learning_rate": 4.7274195344693175e-05, + "loss": 0.1949, + "num_input_tokens_seen": 20666144, + "step": 16980 + }, + { + "epoch": 1.8916360396480676, + "grad_norm": 1.2777502536773682, + "learning_rate": 4.728811671678361e-05, + "loss": 0.1535, + "num_input_tokens_seen": 20672480, + "step": 16985 + }, + { + "epoch": 1.892192894531685, + "grad_norm": 0.2697676718235016, + "learning_rate": 4.730203808887404e-05, + "loss": 0.1409, + "num_input_tokens_seen": 20678176, + "step": 16990 + }, + { + "epoch": 1.8927497494153025, + "grad_norm": 0.1878265142440796, + "learning_rate": 4.731595946096448e-05, + "loss": 0.029, + "num_input_tokens_seen": 20683904, + "step": 16995 + }, + { + "epoch": 1.8933066042989197, + "grad_norm": 0.5883525609970093, + "learning_rate": 4.7329880833054906e-05, + "loss": 0.1699, + "num_input_tokens_seen": 20689824, + "step": 17000 + }, + { + "epoch": 1.893863459182537, + "grad_norm": 1.0036896467208862, + "learning_rate": 4.734380220514535e-05, + "loss": 0.0465, + "num_input_tokens_seen": 20696256, + "step": 17005 + }, + { + "epoch": 1.8944203140661544, + "grad_norm": 1.4943639039993286, + "learning_rate": 4.7357723577235774e-05, + "loss": 0.1598, + "num_input_tokens_seen": 20702272, + "step": 17010 + }, + { + "epoch": 1.8949771689497716, + "grad_norm": 1.019657015800476, + "learning_rate": 4.737164494932621e-05, + "loss": 0.1044, + "num_input_tokens_seen": 20708576, + "step": 17015 + }, + { + "epoch": 1.895534023833389, + "grad_norm": 0.26641198992729187, + "learning_rate": 4.738556632141664e-05, + "loss": 0.0769, + "num_input_tokens_seen": 20714560, + "step": 17020 + }, + { + "epoch": 1.8960908787170063, + "grad_norm": 1.509433388710022, + "learning_rate": 4.739948769350707e-05, + "loss": 0.176, + "num_input_tokens_seen": 20720672, + "step": 17025 + }, + { + "epoch": 1.8966477336006236, + "grad_norm": 0.47596681118011475, + "learning_rate": 4.741340906559751e-05, + "loss": 0.0816, + "num_input_tokens_seen": 20726976, + "step": 17030 + }, + { + "epoch": 1.897204588484241, + "grad_norm": 1.1423126459121704, + "learning_rate": 4.742733043768794e-05, + "loss": 0.107, + "num_input_tokens_seen": 20733088, + "step": 17035 + }, + { + "epoch": 1.8977614433678585, + "grad_norm": 0.35477620363235474, + "learning_rate": 4.7441251809778373e-05, + "loss": 0.1378, + "num_input_tokens_seen": 20739264, + "step": 17040 + }, + { + "epoch": 1.8983182982514757, + "grad_norm": 0.7038474082946777, + "learning_rate": 4.745517318186881e-05, + "loss": 0.1343, + "num_input_tokens_seen": 20744896, + "step": 17045 + }, + { + "epoch": 1.898875153135093, + "grad_norm": 0.8421375155448914, + "learning_rate": 4.746909455395924e-05, + "loss": 0.1069, + "num_input_tokens_seen": 20751168, + "step": 17050 + }, + { + "epoch": 1.8994320080187104, + "grad_norm": 0.5132733583450317, + "learning_rate": 4.7483015926049676e-05, + "loss": 0.1876, + "num_input_tokens_seen": 20756704, + "step": 17055 + }, + { + "epoch": 1.8999888629023276, + "grad_norm": 0.37849926948547363, + "learning_rate": 4.749693729814011e-05, + "loss": 0.0911, + "num_input_tokens_seen": 20763008, + "step": 17060 + }, + { + "epoch": 1.900545717785945, + "grad_norm": 0.32857778668403625, + "learning_rate": 4.751085867023054e-05, + "loss": 0.0888, + "num_input_tokens_seen": 20769216, + "step": 17065 + }, + { + "epoch": 1.9011025726695623, + "grad_norm": 0.5585547089576721, + "learning_rate": 4.752478004232097e-05, + "loss": 0.2032, + "num_input_tokens_seen": 20775872, + "step": 17070 + }, + { + "epoch": 1.9016594275531795, + "grad_norm": 0.9041782021522522, + "learning_rate": 4.753870141441141e-05, + "loss": 0.2601, + "num_input_tokens_seen": 20782048, + "step": 17075 + }, + { + "epoch": 1.902216282436797, + "grad_norm": 0.20734941959381104, + "learning_rate": 4.755262278650184e-05, + "loss": 0.0713, + "num_input_tokens_seen": 20788192, + "step": 17080 + }, + { + "epoch": 1.9027731373204144, + "grad_norm": 0.7167047262191772, + "learning_rate": 4.7566544158592276e-05, + "loss": 0.2057, + "num_input_tokens_seen": 20794336, + "step": 17085 + }, + { + "epoch": 1.9033299922040317, + "grad_norm": 0.03392431512475014, + "learning_rate": 4.75804655306827e-05, + "loss": 0.0589, + "num_input_tokens_seen": 20800416, + "step": 17090 + }, + { + "epoch": 1.9038868470876489, + "grad_norm": 1.2138429880142212, + "learning_rate": 4.7594386902773144e-05, + "loss": 0.2121, + "num_input_tokens_seen": 20806432, + "step": 17095 + }, + { + "epoch": 1.9044437019712663, + "grad_norm": 0.7153932452201843, + "learning_rate": 4.760830827486357e-05, + "loss": 0.0971, + "num_input_tokens_seen": 20812576, + "step": 17100 + }, + { + "epoch": 1.9050005568548836, + "grad_norm": 0.056966908276081085, + "learning_rate": 4.7622229646954006e-05, + "loss": 0.0269, + "num_input_tokens_seen": 20818816, + "step": 17105 + }, + { + "epoch": 1.905557411738501, + "grad_norm": 1.8685401678085327, + "learning_rate": 4.763615101904444e-05, + "loss": 0.1332, + "num_input_tokens_seen": 20825024, + "step": 17110 + }, + { + "epoch": 1.9061142666221182, + "grad_norm": 1.3857686519622803, + "learning_rate": 4.765007239113487e-05, + "loss": 0.1281, + "num_input_tokens_seen": 20831200, + "step": 17115 + }, + { + "epoch": 1.9066711215057355, + "grad_norm": 0.10996946692466736, + "learning_rate": 4.766399376322531e-05, + "loss": 0.0771, + "num_input_tokens_seen": 20837568, + "step": 17120 + }, + { + "epoch": 1.907227976389353, + "grad_norm": 0.6707080602645874, + "learning_rate": 4.767791513531574e-05, + "loss": 0.0824, + "num_input_tokens_seen": 20843520, + "step": 17125 + }, + { + "epoch": 1.9077848312729704, + "grad_norm": 0.18437831103801727, + "learning_rate": 4.769183650740617e-05, + "loss": 0.1587, + "num_input_tokens_seen": 20849760, + "step": 17130 + }, + { + "epoch": 1.9083416861565876, + "grad_norm": 1.359028935432434, + "learning_rate": 4.7705757879496605e-05, + "loss": 0.1589, + "num_input_tokens_seen": 20855872, + "step": 17135 + }, + { + "epoch": 1.9088985410402048, + "grad_norm": 0.3294473886489868, + "learning_rate": 4.771967925158704e-05, + "loss": 0.2338, + "num_input_tokens_seen": 20861696, + "step": 17140 + }, + { + "epoch": 1.9094553959238223, + "grad_norm": 1.1320550441741943, + "learning_rate": 4.7733600623677474e-05, + "loss": 0.1548, + "num_input_tokens_seen": 20867488, + "step": 17145 + }, + { + "epoch": 1.9100122508074395, + "grad_norm": 0.6189675331115723, + "learning_rate": 4.774752199576791e-05, + "loss": 0.1271, + "num_input_tokens_seen": 20873472, + "step": 17150 + }, + { + "epoch": 1.910569105691057, + "grad_norm": 1.3863792419433594, + "learning_rate": 4.7761443367858336e-05, + "loss": 0.1533, + "num_input_tokens_seen": 20879520, + "step": 17155 + }, + { + "epoch": 1.9111259605746742, + "grad_norm": 1.1384998559951782, + "learning_rate": 4.777536473994877e-05, + "loss": 0.1453, + "num_input_tokens_seen": 20885664, + "step": 17160 + }, + { + "epoch": 1.9116828154582914, + "grad_norm": 0.3665907084941864, + "learning_rate": 4.7789286112039205e-05, + "loss": 0.09, + "num_input_tokens_seen": 20891744, + "step": 17165 + }, + { + "epoch": 1.912239670341909, + "grad_norm": 0.38713422417640686, + "learning_rate": 4.780320748412964e-05, + "loss": 0.2169, + "num_input_tokens_seen": 20897920, + "step": 17170 + }, + { + "epoch": 1.9127965252255263, + "grad_norm": 0.5035258531570435, + "learning_rate": 4.781712885622007e-05, + "loss": 0.101, + "num_input_tokens_seen": 20903904, + "step": 17175 + }, + { + "epoch": 1.9133533801091436, + "grad_norm": 0.7770029306411743, + "learning_rate": 4.78310502283105e-05, + "loss": 0.1108, + "num_input_tokens_seen": 20910208, + "step": 17180 + }, + { + "epoch": 1.9139102349927608, + "grad_norm": 0.955045223236084, + "learning_rate": 4.784497160040094e-05, + "loss": 0.1341, + "num_input_tokens_seen": 20916032, + "step": 17185 + }, + { + "epoch": 1.9144670898763783, + "grad_norm": 1.444151759147644, + "learning_rate": 4.785889297249137e-05, + "loss": 0.1038, + "num_input_tokens_seen": 20922208, + "step": 17190 + }, + { + "epoch": 1.9150239447599957, + "grad_norm": 0.028840795159339905, + "learning_rate": 4.7872814344581804e-05, + "loss": 0.0549, + "num_input_tokens_seen": 20928160, + "step": 17195 + }, + { + "epoch": 1.915580799643613, + "grad_norm": 0.22844652831554413, + "learning_rate": 4.788673571667224e-05, + "loss": 0.0773, + "num_input_tokens_seen": 20934208, + "step": 17200 + }, + { + "epoch": 1.9161376545272302, + "grad_norm": 0.491254597902298, + "learning_rate": 4.7900657088762666e-05, + "loss": 0.0604, + "num_input_tokens_seen": 20940480, + "step": 17205 + }, + { + "epoch": 1.9166945094108474, + "grad_norm": 0.6861525774002075, + "learning_rate": 4.791457846085311e-05, + "loss": 0.1597, + "num_input_tokens_seen": 20947040, + "step": 17210 + }, + { + "epoch": 1.9172513642944649, + "grad_norm": 0.2794489562511444, + "learning_rate": 4.7928499832943534e-05, + "loss": 0.128, + "num_input_tokens_seen": 20953248, + "step": 17215 + }, + { + "epoch": 1.9178082191780823, + "grad_norm": 0.5310275554656982, + "learning_rate": 4.794242120503397e-05, + "loss": 0.164, + "num_input_tokens_seen": 20959616, + "step": 17220 + }, + { + "epoch": 1.9183650740616995, + "grad_norm": 0.6760208606719971, + "learning_rate": 4.79563425771244e-05, + "loss": 0.0879, + "num_input_tokens_seen": 20965728, + "step": 17225 + }, + { + "epoch": 1.9189219289453168, + "grad_norm": 0.6344884634017944, + "learning_rate": 4.797026394921484e-05, + "loss": 0.1284, + "num_input_tokens_seen": 20971360, + "step": 17230 + }, + { + "epoch": 1.9194787838289342, + "grad_norm": 0.003453258890658617, + "learning_rate": 4.798418532130527e-05, + "loss": 0.2039, + "num_input_tokens_seen": 20977696, + "step": 17235 + }, + { + "epoch": 1.9200356387125517, + "grad_norm": 0.6534112095832825, + "learning_rate": 4.7998106693395706e-05, + "loss": 0.0786, + "num_input_tokens_seen": 20983872, + "step": 17240 + }, + { + "epoch": 1.920592493596169, + "grad_norm": 0.16658391058444977, + "learning_rate": 4.8012028065486133e-05, + "loss": 0.0549, + "num_input_tokens_seen": 20990208, + "step": 17245 + }, + { + "epoch": 1.9211493484797861, + "grad_norm": 0.014861335046589375, + "learning_rate": 4.802594943757657e-05, + "loss": 0.132, + "num_input_tokens_seen": 20995520, + "step": 17250 + }, + { + "epoch": 1.9217062033634034, + "grad_norm": 0.605479896068573, + "learning_rate": 4.8039870809667e-05, + "loss": 0.1479, + "num_input_tokens_seen": 21001856, + "step": 17255 + }, + { + "epoch": 1.9222630582470208, + "grad_norm": 0.06101740896701813, + "learning_rate": 4.8053792181757436e-05, + "loss": 0.2394, + "num_input_tokens_seen": 21007008, + "step": 17260 + }, + { + "epoch": 1.9228199131306383, + "grad_norm": 0.8731966614723206, + "learning_rate": 4.806771355384787e-05, + "loss": 0.1785, + "num_input_tokens_seen": 21013248, + "step": 17265 + }, + { + "epoch": 1.9233767680142555, + "grad_norm": 0.9653014540672302, + "learning_rate": 4.80816349259383e-05, + "loss": 0.1696, + "num_input_tokens_seen": 21018816, + "step": 17270 + }, + { + "epoch": 1.9239336228978727, + "grad_norm": 1.7213478088378906, + "learning_rate": 4.809555629802874e-05, + "loss": 0.2012, + "num_input_tokens_seen": 21025376, + "step": 17275 + }, + { + "epoch": 1.9244904777814902, + "grad_norm": 0.28825896978378296, + "learning_rate": 4.810947767011917e-05, + "loss": 0.1195, + "num_input_tokens_seen": 21031328, + "step": 17280 + }, + { + "epoch": 1.9250473326651076, + "grad_norm": 0.05940018221735954, + "learning_rate": 4.812339904220961e-05, + "loss": 0.0485, + "num_input_tokens_seen": 21037600, + "step": 17285 + }, + { + "epoch": 1.9256041875487249, + "grad_norm": 0.01912136748433113, + "learning_rate": 4.8137320414300036e-05, + "loss": 0.1789, + "num_input_tokens_seen": 21043264, + "step": 17290 + }, + { + "epoch": 1.926161042432342, + "grad_norm": 0.6643596887588501, + "learning_rate": 4.815124178639046e-05, + "loss": 0.1536, + "num_input_tokens_seen": 21049184, + "step": 17295 + }, + { + "epoch": 1.9267178973159593, + "grad_norm": 1.2866835594177246, + "learning_rate": 4.8165163158480904e-05, + "loss": 0.0928, + "num_input_tokens_seen": 21055392, + "step": 17300 + }, + { + "epoch": 1.9272747521995768, + "grad_norm": 0.17727451026439667, + "learning_rate": 4.817908453057133e-05, + "loss": 0.0891, + "num_input_tokens_seen": 21061728, + "step": 17305 + }, + { + "epoch": 1.9278316070831942, + "grad_norm": 0.6693006753921509, + "learning_rate": 4.819300590266177e-05, + "loss": 0.2838, + "num_input_tokens_seen": 21067712, + "step": 17310 + }, + { + "epoch": 1.9283884619668115, + "grad_norm": 0.7574144601821899, + "learning_rate": 4.82069272747522e-05, + "loss": 0.1709, + "num_input_tokens_seen": 21073760, + "step": 17315 + }, + { + "epoch": 1.9289453168504287, + "grad_norm": 0.7127963304519653, + "learning_rate": 4.8220848646842635e-05, + "loss": 0.0652, + "num_input_tokens_seen": 21079744, + "step": 17320 + }, + { + "epoch": 1.9295021717340461, + "grad_norm": 0.46172693371772766, + "learning_rate": 4.823477001893307e-05, + "loss": 0.1358, + "num_input_tokens_seen": 21085600, + "step": 17325 + }, + { + "epoch": 1.9300590266176636, + "grad_norm": 0.6366307735443115, + "learning_rate": 4.8248691391023503e-05, + "loss": 0.1964, + "num_input_tokens_seen": 21091968, + "step": 17330 + }, + { + "epoch": 1.9306158815012808, + "grad_norm": 0.4578987956047058, + "learning_rate": 4.826261276311394e-05, + "loss": 0.1124, + "num_input_tokens_seen": 21098016, + "step": 17335 + }, + { + "epoch": 1.931172736384898, + "grad_norm": 0.06521646678447723, + "learning_rate": 4.8276534135204365e-05, + "loss": 0.0999, + "num_input_tokens_seen": 21104096, + "step": 17340 + }, + { + "epoch": 1.9317295912685153, + "grad_norm": 1.4771796464920044, + "learning_rate": 4.82904555072948e-05, + "loss": 0.1044, + "num_input_tokens_seen": 21110112, + "step": 17345 + }, + { + "epoch": 1.9322864461521327, + "grad_norm": 0.18984021246433258, + "learning_rate": 4.8304376879385234e-05, + "loss": 0.1055, + "num_input_tokens_seen": 21115360, + "step": 17350 + }, + { + "epoch": 1.9328433010357502, + "grad_norm": 0.05552424117922783, + "learning_rate": 4.831829825147567e-05, + "loss": 0.0769, + "num_input_tokens_seen": 21120992, + "step": 17355 + }, + { + "epoch": 1.9334001559193674, + "grad_norm": 0.5575118064880371, + "learning_rate": 4.83322196235661e-05, + "loss": 0.0405, + "num_input_tokens_seen": 21126944, + "step": 17360 + }, + { + "epoch": 1.9339570108029847, + "grad_norm": 0.6531519293785095, + "learning_rate": 4.834614099565654e-05, + "loss": 0.0696, + "num_input_tokens_seen": 21132736, + "step": 17365 + }, + { + "epoch": 1.934513865686602, + "grad_norm": 1.476465106010437, + "learning_rate": 4.8360062367746965e-05, + "loss": 0.1818, + "num_input_tokens_seen": 21138880, + "step": 17370 + }, + { + "epoch": 1.9350707205702196, + "grad_norm": 0.7690767049789429, + "learning_rate": 4.8373983739837406e-05, + "loss": 0.1297, + "num_input_tokens_seen": 21145024, + "step": 17375 + }, + { + "epoch": 1.9356275754538368, + "grad_norm": 0.3558387756347656, + "learning_rate": 4.838790511192783e-05, + "loss": 0.1192, + "num_input_tokens_seen": 21151232, + "step": 17380 + }, + { + "epoch": 1.936184430337454, + "grad_norm": 0.1708386242389679, + "learning_rate": 4.840182648401827e-05, + "loss": 0.0983, + "num_input_tokens_seen": 21157600, + "step": 17385 + }, + { + "epoch": 1.9367412852210713, + "grad_norm": 0.3434734642505646, + "learning_rate": 4.84157478561087e-05, + "loss": 0.2131, + "num_input_tokens_seen": 21163744, + "step": 17390 + }, + { + "epoch": 1.9372981401046887, + "grad_norm": 0.3733697235584259, + "learning_rate": 4.842966922819913e-05, + "loss": 0.1262, + "num_input_tokens_seen": 21170112, + "step": 17395 + }, + { + "epoch": 1.9378549949883062, + "grad_norm": 0.8422450423240662, + "learning_rate": 4.844359060028957e-05, + "loss": 0.2509, + "num_input_tokens_seen": 21175712, + "step": 17400 + }, + { + "epoch": 1.9384118498719234, + "grad_norm": 0.41819125413894653, + "learning_rate": 4.845751197238e-05, + "loss": 0.0224, + "num_input_tokens_seen": 21182080, + "step": 17405 + }, + { + "epoch": 1.9389687047555406, + "grad_norm": 0.5640524625778198, + "learning_rate": 4.847143334447043e-05, + "loss": 0.1083, + "num_input_tokens_seen": 21187776, + "step": 17410 + }, + { + "epoch": 1.939525559639158, + "grad_norm": 0.6787318587303162, + "learning_rate": 4.848535471656087e-05, + "loss": 0.118, + "num_input_tokens_seen": 21193952, + "step": 17415 + }, + { + "epoch": 1.9400824145227755, + "grad_norm": 0.8164771199226379, + "learning_rate": 4.84992760886513e-05, + "loss": 0.1173, + "num_input_tokens_seen": 21199936, + "step": 17420 + }, + { + "epoch": 1.9406392694063928, + "grad_norm": 0.64776211977005, + "learning_rate": 4.8513197460741735e-05, + "loss": 0.0953, + "num_input_tokens_seen": 21206304, + "step": 17425 + }, + { + "epoch": 1.94119612429001, + "grad_norm": 1.8359501361846924, + "learning_rate": 4.852711883283216e-05, + "loss": 0.1228, + "num_input_tokens_seen": 21212544, + "step": 17430 + }, + { + "epoch": 1.9417529791736272, + "grad_norm": 0.5708510279655457, + "learning_rate": 4.85410402049226e-05, + "loss": 0.0169, + "num_input_tokens_seen": 21218784, + "step": 17435 + }, + { + "epoch": 1.9423098340572447, + "grad_norm": 0.430976539850235, + "learning_rate": 4.855496157701303e-05, + "loss": 0.0904, + "num_input_tokens_seen": 21225088, + "step": 17440 + }, + { + "epoch": 1.9428666889408621, + "grad_norm": 0.43020525574684143, + "learning_rate": 4.8568882949103466e-05, + "loss": 0.0932, + "num_input_tokens_seen": 21231296, + "step": 17445 + }, + { + "epoch": 1.9434235438244793, + "grad_norm": 0.993881106376648, + "learning_rate": 4.85828043211939e-05, + "loss": 0.1494, + "num_input_tokens_seen": 21237472, + "step": 17450 + }, + { + "epoch": 1.9439803987080966, + "grad_norm": 0.17315514385700226, + "learning_rate": 4.8596725693284335e-05, + "loss": 0.108, + "num_input_tokens_seen": 21243264, + "step": 17455 + }, + { + "epoch": 1.944537253591714, + "grad_norm": 0.530881404876709, + "learning_rate": 4.861064706537476e-05, + "loss": 0.0738, + "num_input_tokens_seen": 21249568, + "step": 17460 + }, + { + "epoch": 1.9450941084753315, + "grad_norm": 0.9669278860092163, + "learning_rate": 4.86245684374652e-05, + "loss": 0.1331, + "num_input_tokens_seen": 21255712, + "step": 17465 + }, + { + "epoch": 1.9456509633589487, + "grad_norm": 1.179918646812439, + "learning_rate": 4.863848980955563e-05, + "loss": 0.1534, + "num_input_tokens_seen": 21261952, + "step": 17470 + }, + { + "epoch": 1.946207818242566, + "grad_norm": 0.4948525130748749, + "learning_rate": 4.8652411181646065e-05, + "loss": 0.1439, + "num_input_tokens_seen": 21268288, + "step": 17475 + }, + { + "epoch": 1.9467646731261832, + "grad_norm": 0.708516538143158, + "learning_rate": 4.86663325537365e-05, + "loss": 0.0554, + "num_input_tokens_seen": 21274272, + "step": 17480 + }, + { + "epoch": 1.9473215280098006, + "grad_norm": 0.9947460293769836, + "learning_rate": 4.868025392582693e-05, + "loss": 0.1238, + "num_input_tokens_seen": 21280352, + "step": 17485 + }, + { + "epoch": 1.947878382893418, + "grad_norm": 0.01031988114118576, + "learning_rate": 4.869417529791737e-05, + "loss": 0.1185, + "num_input_tokens_seen": 21286592, + "step": 17490 + }, + { + "epoch": 1.9484352377770353, + "grad_norm": 0.2224106788635254, + "learning_rate": 4.8708096670007796e-05, + "loss": 0.1163, + "num_input_tokens_seen": 21292384, + "step": 17495 + }, + { + "epoch": 1.9489920926606525, + "grad_norm": 0.17334946990013123, + "learning_rate": 4.872201804209823e-05, + "loss": 0.0779, + "num_input_tokens_seen": 21298016, + "step": 17500 + }, + { + "epoch": 1.94954894754427, + "grad_norm": 0.13613703846931458, + "learning_rate": 4.8735939414188664e-05, + "loss": 0.056, + "num_input_tokens_seen": 21304096, + "step": 17505 + }, + { + "epoch": 1.9501058024278874, + "grad_norm": 0.7489537000656128, + "learning_rate": 4.87498607862791e-05, + "loss": 0.0648, + "num_input_tokens_seen": 21310400, + "step": 17510 + }, + { + "epoch": 1.9506626573115047, + "grad_norm": 0.31491899490356445, + "learning_rate": 4.876378215836953e-05, + "loss": 0.0894, + "num_input_tokens_seen": 21316416, + "step": 17515 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.6172335743904114, + "learning_rate": 4.877770353045997e-05, + "loss": 0.2303, + "num_input_tokens_seen": 21322304, + "step": 17520 + }, + { + "epoch": 1.9517763670787391, + "grad_norm": 0.759324848651886, + "learning_rate": 4.8791624902550395e-05, + "loss": 0.0274, + "num_input_tokens_seen": 21328704, + "step": 17525 + }, + { + "epoch": 1.9523332219623566, + "grad_norm": 1.4580292701721191, + "learning_rate": 4.880554627464083e-05, + "loss": 0.1082, + "num_input_tokens_seen": 21334432, + "step": 17530 + }, + { + "epoch": 1.952890076845974, + "grad_norm": 0.2735118269920349, + "learning_rate": 4.8819467646731264e-05, + "loss": 0.0769, + "num_input_tokens_seen": 21340512, + "step": 17535 + }, + { + "epoch": 1.9534469317295913, + "grad_norm": 1.1705981492996216, + "learning_rate": 4.88333890188217e-05, + "loss": 0.1585, + "num_input_tokens_seen": 21346656, + "step": 17540 + }, + { + "epoch": 1.9540037866132085, + "grad_norm": 0.7320494055747986, + "learning_rate": 4.884731039091213e-05, + "loss": 0.1099, + "num_input_tokens_seen": 21352896, + "step": 17545 + }, + { + "epoch": 1.954560641496826, + "grad_norm": 0.25590723752975464, + "learning_rate": 4.886123176300256e-05, + "loss": 0.0858, + "num_input_tokens_seen": 21359040, + "step": 17550 + }, + { + "epoch": 1.9551174963804434, + "grad_norm": 2.5393288135528564, + "learning_rate": 4.8875153135093e-05, + "loss": 0.2104, + "num_input_tokens_seen": 21365088, + "step": 17555 + }, + { + "epoch": 1.9556743512640606, + "grad_norm": 0.4597039520740509, + "learning_rate": 4.888907450718343e-05, + "loss": 0.1072, + "num_input_tokens_seen": 21371296, + "step": 17560 + }, + { + "epoch": 1.9562312061476779, + "grad_norm": 0.13932068645954132, + "learning_rate": 4.890299587927387e-05, + "loss": 0.3264, + "num_input_tokens_seen": 21377312, + "step": 17565 + }, + { + "epoch": 1.956788061031295, + "grad_norm": 0.07112862169742584, + "learning_rate": 4.89169172513643e-05, + "loss": 0.0568, + "num_input_tokens_seen": 21383488, + "step": 17570 + }, + { + "epoch": 1.9573449159149126, + "grad_norm": 0.39921319484710693, + "learning_rate": 4.8930838623454725e-05, + "loss": 0.1158, + "num_input_tokens_seen": 21389760, + "step": 17575 + }, + { + "epoch": 1.95790177079853, + "grad_norm": 0.8113719820976257, + "learning_rate": 4.8944759995545166e-05, + "loss": 0.0869, + "num_input_tokens_seen": 21395904, + "step": 17580 + }, + { + "epoch": 1.9584586256821472, + "grad_norm": 2.1540989875793457, + "learning_rate": 4.895868136763559e-05, + "loss": 0.2863, + "num_input_tokens_seen": 21401440, + "step": 17585 + }, + { + "epoch": 1.9590154805657645, + "grad_norm": 0.003732096403837204, + "learning_rate": 4.8972602739726034e-05, + "loss": 0.0472, + "num_input_tokens_seen": 21407712, + "step": 17590 + }, + { + "epoch": 1.959572335449382, + "grad_norm": 0.09047062695026398, + "learning_rate": 4.898652411181646e-05, + "loss": 0.051, + "num_input_tokens_seen": 21414240, + "step": 17595 + }, + { + "epoch": 1.9601291903329994, + "grad_norm": 0.9601837396621704, + "learning_rate": 4.9000445483906896e-05, + "loss": 0.113, + "num_input_tokens_seen": 21420576, + "step": 17600 + }, + { + "epoch": 1.9606860452166166, + "grad_norm": 1.2776129245758057, + "learning_rate": 4.901436685599733e-05, + "loss": 0.2365, + "num_input_tokens_seen": 21426272, + "step": 17605 + }, + { + "epoch": 1.9612429001002338, + "grad_norm": 0.39861828088760376, + "learning_rate": 4.9028288228087765e-05, + "loss": 0.0463, + "num_input_tokens_seen": 21432192, + "step": 17610 + }, + { + "epoch": 1.961799754983851, + "grad_norm": 0.10804452002048492, + "learning_rate": 4.90422096001782e-05, + "loss": 0.1324, + "num_input_tokens_seen": 21438464, + "step": 17615 + }, + { + "epoch": 1.9623566098674685, + "grad_norm": 0.05426230654120445, + "learning_rate": 4.905613097226863e-05, + "loss": 0.1356, + "num_input_tokens_seen": 21444288, + "step": 17620 + }, + { + "epoch": 1.962913464751086, + "grad_norm": 0.514838695526123, + "learning_rate": 4.907005234435906e-05, + "loss": 0.2195, + "num_input_tokens_seen": 21450656, + "step": 17625 + }, + { + "epoch": 1.9634703196347032, + "grad_norm": 0.20321784913539886, + "learning_rate": 4.9083973716449495e-05, + "loss": 0.0718, + "num_input_tokens_seen": 21456736, + "step": 17630 + }, + { + "epoch": 1.9640271745183204, + "grad_norm": 0.873789370059967, + "learning_rate": 4.909789508853993e-05, + "loss": 0.0504, + "num_input_tokens_seen": 21462304, + "step": 17635 + }, + { + "epoch": 1.9645840294019379, + "grad_norm": 0.46537792682647705, + "learning_rate": 4.9111816460630364e-05, + "loss": 0.0342, + "num_input_tokens_seen": 21468352, + "step": 17640 + }, + { + "epoch": 1.9651408842855553, + "grad_norm": 0.5222129821777344, + "learning_rate": 4.91257378327208e-05, + "loss": 0.1195, + "num_input_tokens_seen": 21474336, + "step": 17645 + }, + { + "epoch": 1.9656977391691726, + "grad_norm": 0.1547536998987198, + "learning_rate": 4.9139659204811226e-05, + "loss": 0.129, + "num_input_tokens_seen": 21480608, + "step": 17650 + }, + { + "epoch": 1.9662545940527898, + "grad_norm": 1.593985915184021, + "learning_rate": 4.915358057690167e-05, + "loss": 0.1197, + "num_input_tokens_seen": 21486912, + "step": 17655 + }, + { + "epoch": 1.966811448936407, + "grad_norm": 0.016984399408102036, + "learning_rate": 4.9167501948992095e-05, + "loss": 0.085, + "num_input_tokens_seen": 21492992, + "step": 17660 + }, + { + "epoch": 1.9673683038200245, + "grad_norm": 0.7716768980026245, + "learning_rate": 4.918142332108253e-05, + "loss": 0.0931, + "num_input_tokens_seen": 21499296, + "step": 17665 + }, + { + "epoch": 1.967925158703642, + "grad_norm": 0.20993171632289886, + "learning_rate": 4.919534469317296e-05, + "loss": 0.0473, + "num_input_tokens_seen": 21505568, + "step": 17670 + }, + { + "epoch": 1.9684820135872592, + "grad_norm": 0.02332788147032261, + "learning_rate": 4.920926606526339e-05, + "loss": 0.0454, + "num_input_tokens_seen": 21511872, + "step": 17675 + }, + { + "epoch": 1.9690388684708764, + "grad_norm": 0.09650295972824097, + "learning_rate": 4.922318743735383e-05, + "loss": 0.1525, + "num_input_tokens_seen": 21518240, + "step": 17680 + }, + { + "epoch": 1.9695957233544938, + "grad_norm": 0.27478480339050293, + "learning_rate": 4.923710880944426e-05, + "loss": 0.1724, + "num_input_tokens_seen": 21524352, + "step": 17685 + }, + { + "epoch": 1.9701525782381113, + "grad_norm": 0.33798712491989136, + "learning_rate": 4.9251030181534694e-05, + "loss": 0.0332, + "num_input_tokens_seen": 21530560, + "step": 17690 + }, + { + "epoch": 1.9707094331217285, + "grad_norm": 0.3989527225494385, + "learning_rate": 4.926495155362513e-05, + "loss": 0.1959, + "num_input_tokens_seen": 21535552, + "step": 17695 + }, + { + "epoch": 1.9712662880053458, + "grad_norm": 0.04586048051714897, + "learning_rate": 4.927887292571556e-05, + "loss": 0.1061, + "num_input_tokens_seen": 21541856, + "step": 17700 + }, + { + "epoch": 1.971823142888963, + "grad_norm": 3.6659040451049805, + "learning_rate": 4.9292794297806e-05, + "loss": 0.2295, + "num_input_tokens_seen": 21548160, + "step": 17705 + }, + { + "epoch": 1.9723799977725804, + "grad_norm": 0.8167448043823242, + "learning_rate": 4.9306715669896424e-05, + "loss": 0.0645, + "num_input_tokens_seen": 21554208, + "step": 17710 + }, + { + "epoch": 1.972936852656198, + "grad_norm": 0.3560085892677307, + "learning_rate": 4.932063704198686e-05, + "loss": 0.114, + "num_input_tokens_seen": 21560288, + "step": 17715 + }, + { + "epoch": 1.9734937075398151, + "grad_norm": 1.1063461303710938, + "learning_rate": 4.933455841407729e-05, + "loss": 0.1001, + "num_input_tokens_seen": 21566304, + "step": 17720 + }, + { + "epoch": 1.9740505624234324, + "grad_norm": 0.6551936268806458, + "learning_rate": 4.934847978616773e-05, + "loss": 0.0914, + "num_input_tokens_seen": 21572160, + "step": 17725 + }, + { + "epoch": 1.9746074173070498, + "grad_norm": 0.298710435628891, + "learning_rate": 4.936240115825816e-05, + "loss": 0.1228, + "num_input_tokens_seen": 21578272, + "step": 17730 + }, + { + "epoch": 1.9751642721906673, + "grad_norm": 0.4503615200519562, + "learning_rate": 4.9376322530348596e-05, + "loss": 0.0734, + "num_input_tokens_seen": 21584384, + "step": 17735 + }, + { + "epoch": 1.9757211270742845, + "grad_norm": 1.075130581855774, + "learning_rate": 4.9390243902439024e-05, + "loss": 0.1004, + "num_input_tokens_seen": 21590368, + "step": 17740 + }, + { + "epoch": 1.9762779819579017, + "grad_norm": 0.12042846530675888, + "learning_rate": 4.9404165274529465e-05, + "loss": 0.061, + "num_input_tokens_seen": 21596512, + "step": 17745 + }, + { + "epoch": 1.976834836841519, + "grad_norm": 0.36126211285591125, + "learning_rate": 4.941808664661989e-05, + "loss": 0.1758, + "num_input_tokens_seen": 21602624, + "step": 17750 + }, + { + "epoch": 1.9773916917251364, + "grad_norm": 0.2941085696220398, + "learning_rate": 4.9432008018710327e-05, + "loss": 0.129, + "num_input_tokens_seen": 21608704, + "step": 17755 + }, + { + "epoch": 1.9779485466087539, + "grad_norm": 0.7707021236419678, + "learning_rate": 4.944592939080076e-05, + "loss": 0.1365, + "num_input_tokens_seen": 21614976, + "step": 17760 + }, + { + "epoch": 1.978505401492371, + "grad_norm": 0.32397380471229553, + "learning_rate": 4.945985076289119e-05, + "loss": 0.1731, + "num_input_tokens_seen": 21621056, + "step": 17765 + }, + { + "epoch": 1.9790622563759883, + "grad_norm": 0.47191062569618225, + "learning_rate": 4.947377213498163e-05, + "loss": 0.0903, + "num_input_tokens_seen": 21627200, + "step": 17770 + }, + { + "epoch": 1.9796191112596058, + "grad_norm": 0.2525136470794678, + "learning_rate": 4.948769350707206e-05, + "loss": 0.0949, + "num_input_tokens_seen": 21633216, + "step": 17775 + }, + { + "epoch": 1.9801759661432232, + "grad_norm": 0.697769045829773, + "learning_rate": 4.950161487916249e-05, + "loss": 0.115, + "num_input_tokens_seen": 21639680, + "step": 17780 + }, + { + "epoch": 1.9807328210268405, + "grad_norm": 0.5277285575866699, + "learning_rate": 4.9515536251252926e-05, + "loss": 0.1054, + "num_input_tokens_seen": 21645856, + "step": 17785 + }, + { + "epoch": 1.9812896759104577, + "grad_norm": 0.11209364980459213, + "learning_rate": 4.952945762334336e-05, + "loss": 0.0716, + "num_input_tokens_seen": 21652512, + "step": 17790 + }, + { + "epoch": 1.981846530794075, + "grad_norm": 0.801024317741394, + "learning_rate": 4.9543378995433794e-05, + "loss": 0.1491, + "num_input_tokens_seen": 21658784, + "step": 17795 + }, + { + "epoch": 1.9824033856776924, + "grad_norm": 0.5622514486312866, + "learning_rate": 4.955730036752422e-05, + "loss": 0.0911, + "num_input_tokens_seen": 21664512, + "step": 17800 + }, + { + "epoch": 1.9829602405613098, + "grad_norm": 0.25514471530914307, + "learning_rate": 4.9571221739614656e-05, + "loss": 0.1666, + "num_input_tokens_seen": 21671168, + "step": 17805 + }, + { + "epoch": 1.983517095444927, + "grad_norm": 0.905651330947876, + "learning_rate": 4.958514311170509e-05, + "loss": 0.1479, + "num_input_tokens_seen": 21676960, + "step": 17810 + }, + { + "epoch": 1.9840739503285443, + "grad_norm": 0.6264880895614624, + "learning_rate": 4.9599064483795525e-05, + "loss": 0.0924, + "num_input_tokens_seen": 21682912, + "step": 17815 + }, + { + "epoch": 1.9846308052121617, + "grad_norm": 0.33108964562416077, + "learning_rate": 4.961298585588596e-05, + "loss": 0.0795, + "num_input_tokens_seen": 21689344, + "step": 17820 + }, + { + "epoch": 1.9851876600957792, + "grad_norm": 1.2397639751434326, + "learning_rate": 4.9626907227976394e-05, + "loss": 0.243, + "num_input_tokens_seen": 21695232, + "step": 17825 + }, + { + "epoch": 1.9857445149793964, + "grad_norm": 0.6751238107681274, + "learning_rate": 4.964082860006682e-05, + "loss": 0.1201, + "num_input_tokens_seen": 21701280, + "step": 17830 + }, + { + "epoch": 1.9863013698630136, + "grad_norm": 1.401792049407959, + "learning_rate": 4.965474997215726e-05, + "loss": 0.1466, + "num_input_tokens_seen": 21706720, + "step": 17835 + }, + { + "epoch": 1.9868582247466309, + "grad_norm": 0.009716755710542202, + "learning_rate": 4.966867134424769e-05, + "loss": 0.1063, + "num_input_tokens_seen": 21713088, + "step": 17840 + }, + { + "epoch": 1.9874150796302483, + "grad_norm": 0.012030445970594883, + "learning_rate": 4.9682592716338124e-05, + "loss": 0.0139, + "num_input_tokens_seen": 21719552, + "step": 17845 + }, + { + "epoch": 1.9879719345138658, + "grad_norm": 1.3692631721496582, + "learning_rate": 4.969651408842856e-05, + "loss": 0.1261, + "num_input_tokens_seen": 21725856, + "step": 17850 + }, + { + "epoch": 1.988528789397483, + "grad_norm": 0.7865995168685913, + "learning_rate": 4.9710435460518986e-05, + "loss": 0.2355, + "num_input_tokens_seen": 21732160, + "step": 17855 + }, + { + "epoch": 1.9890856442811002, + "grad_norm": 0.3186444342136383, + "learning_rate": 4.972435683260943e-05, + "loss": 0.1332, + "num_input_tokens_seen": 21737824, + "step": 17860 + }, + { + "epoch": 1.9896424991647177, + "grad_norm": 0.6254075169563293, + "learning_rate": 4.9738278204699855e-05, + "loss": 0.1816, + "num_input_tokens_seen": 21743680, + "step": 17865 + }, + { + "epoch": 1.9901993540483351, + "grad_norm": 0.46637406945228577, + "learning_rate": 4.9752199576790296e-05, + "loss": 0.1464, + "num_input_tokens_seen": 21750048, + "step": 17870 + }, + { + "epoch": 1.9907562089319524, + "grad_norm": 0.7837616205215454, + "learning_rate": 4.976612094888072e-05, + "loss": 0.2857, + "num_input_tokens_seen": 21756128, + "step": 17875 + }, + { + "epoch": 1.9913130638155696, + "grad_norm": 1.1473411321640015, + "learning_rate": 4.978004232097116e-05, + "loss": 0.0444, + "num_input_tokens_seen": 21762560, + "step": 17880 + }, + { + "epoch": 1.9918699186991868, + "grad_norm": 1.5976487398147583, + "learning_rate": 4.979396369306159e-05, + "loss": 0.0837, + "num_input_tokens_seen": 21769120, + "step": 17885 + }, + { + "epoch": 1.9924267735828043, + "grad_norm": 0.121733658015728, + "learning_rate": 4.980788506515202e-05, + "loss": 0.07, + "num_input_tokens_seen": 21775168, + "step": 17890 + }, + { + "epoch": 1.9929836284664217, + "grad_norm": 0.17598998546600342, + "learning_rate": 4.982180643724246e-05, + "loss": 0.2677, + "num_input_tokens_seen": 21780640, + "step": 17895 + }, + { + "epoch": 1.993540483350039, + "grad_norm": 2.5737078189849854, + "learning_rate": 4.983572780933289e-05, + "loss": 0.1218, + "num_input_tokens_seen": 21786720, + "step": 17900 + }, + { + "epoch": 1.9940973382336562, + "grad_norm": 0.7029927372932434, + "learning_rate": 4.984964918142332e-05, + "loss": 0.1305, + "num_input_tokens_seen": 21792672, + "step": 17905 + }, + { + "epoch": 1.9946541931172737, + "grad_norm": 0.4962322413921356, + "learning_rate": 4.986357055351376e-05, + "loss": 0.0833, + "num_input_tokens_seen": 21798752, + "step": 17910 + }, + { + "epoch": 1.995211048000891, + "grad_norm": 0.6702220439910889, + "learning_rate": 4.987749192560419e-05, + "loss": 0.0492, + "num_input_tokens_seen": 21804704, + "step": 17915 + }, + { + "epoch": 1.9957679028845083, + "grad_norm": 0.4004109501838684, + "learning_rate": 4.9891413297694625e-05, + "loss": 0.0872, + "num_input_tokens_seen": 21810976, + "step": 17920 + }, + { + "epoch": 1.9963247577681256, + "grad_norm": 0.3117329478263855, + "learning_rate": 4.990533466978506e-05, + "loss": 0.0345, + "num_input_tokens_seen": 21817056, + "step": 17925 + }, + { + "epoch": 1.9968816126517428, + "grad_norm": 0.01739474944770336, + "learning_rate": 4.991925604187549e-05, + "loss": 0.0866, + "num_input_tokens_seen": 21823168, + "step": 17930 + }, + { + "epoch": 1.9974384675353603, + "grad_norm": 0.34806931018829346, + "learning_rate": 4.993317741396592e-05, + "loss": 0.0298, + "num_input_tokens_seen": 21829504, + "step": 17935 + }, + { + "epoch": 1.9979953224189777, + "grad_norm": 0.47716256976127625, + "learning_rate": 4.9947098786056356e-05, + "loss": 0.07, + "num_input_tokens_seen": 21834944, + "step": 17940 + }, + { + "epoch": 1.998552177302595, + "grad_norm": 0.35352298617362976, + "learning_rate": 4.996102015814679e-05, + "loss": 0.0645, + "num_input_tokens_seen": 21841248, + "step": 17945 + }, + { + "epoch": 1.9991090321862122, + "grad_norm": 1.5327788591384888, + "learning_rate": 4.9974941530237225e-05, + "loss": 0.0662, + "num_input_tokens_seen": 21847008, + "step": 17950 + }, + { + "epoch": 1.9996658870698296, + "grad_norm": 0.10525394231081009, + "learning_rate": 4.998886290232765e-05, + "loss": 0.026, + "num_input_tokens_seen": 21853280, + "step": 17955 + }, + { + "epoch": 2.0, + "eval_loss": 0.12035691738128662, + "eval_runtime": 112.3959, + "eval_samples_per_second": 35.508, + "eval_steps_per_second": 8.879, + "num_input_tokens_seen": 21856400, + "step": 17958 + }, + { + "epoch": 2.000222741953447, + "grad_norm": 1.9690754413604736, + "learning_rate": 4.99999999952771e-05, + "loss": 0.1541, + "num_input_tokens_seen": 21859024, + "step": 17960 + }, + { + "epoch": 2.0007795968370643, + "grad_norm": 0.09932699799537659, + "learning_rate": 4.999999982997558e-05, + "loss": 0.0979, + "num_input_tokens_seen": 21865104, + "step": 17965 + }, + { + "epoch": 2.0013364517206815, + "grad_norm": 0.22938178479671478, + "learning_rate": 4.999999942852903e-05, + "loss": 0.0924, + "num_input_tokens_seen": 21871440, + "step": 17970 + }, + { + "epoch": 2.0018933066042988, + "grad_norm": 0.02581789903342724, + "learning_rate": 4.999999879093746e-05, + "loss": 0.047, + "num_input_tokens_seen": 21877648, + "step": 17975 + }, + { + "epoch": 2.0024501614879164, + "grad_norm": 1.2160234451293945, + "learning_rate": 4.9999997917200866e-05, + "loss": 0.2167, + "num_input_tokens_seen": 21883856, + "step": 17980 + }, + { + "epoch": 2.0030070163715337, + "grad_norm": 1.0735504627227783, + "learning_rate": 4.9999996807319263e-05, + "loss": 0.0638, + "num_input_tokens_seen": 21889968, + "step": 17985 + }, + { + "epoch": 2.003563871255151, + "grad_norm": 0.025684718042612076, + "learning_rate": 4.999999546129267e-05, + "loss": 0.0956, + "num_input_tokens_seen": 21896112, + "step": 17990 + }, + { + "epoch": 2.004120726138768, + "grad_norm": 0.20803603529930115, + "learning_rate": 4.999999387912108e-05, + "loss": 0.0349, + "num_input_tokens_seen": 21902032, + "step": 17995 + }, + { + "epoch": 2.0046775810223854, + "grad_norm": 0.06963679939508438, + "learning_rate": 4.999999206080452e-05, + "loss": 0.0532, + "num_input_tokens_seen": 21907952, + "step": 18000 + }, + { + "epoch": 2.005234435906003, + "grad_norm": 0.030488910153508186, + "learning_rate": 4.9999990006343005e-05, + "loss": 0.0345, + "num_input_tokens_seen": 21914256, + "step": 18005 + }, + { + "epoch": 2.0057912907896203, + "grad_norm": 0.5388779044151306, + "learning_rate": 4.999998771573655e-05, + "loss": 0.057, + "num_input_tokens_seen": 21920560, + "step": 18010 + }, + { + "epoch": 2.0063481456732375, + "grad_norm": 0.029220739379525185, + "learning_rate": 4.9999985188985195e-05, + "loss": 0.0731, + "num_input_tokens_seen": 21926320, + "step": 18015 + }, + { + "epoch": 2.0069050005568547, + "grad_norm": 0.749505341053009, + "learning_rate": 4.999998242608894e-05, + "loss": 0.1429, + "num_input_tokens_seen": 21932368, + "step": 18020 + }, + { + "epoch": 2.0074618554404724, + "grad_norm": 1.5745981931686401, + "learning_rate": 4.999997942704783e-05, + "loss": 0.1638, + "num_input_tokens_seen": 21938512, + "step": 18025 + }, + { + "epoch": 2.0080187103240896, + "grad_norm": 2.2758219242095947, + "learning_rate": 4.999997619186188e-05, + "loss": 0.2784, + "num_input_tokens_seen": 21944432, + "step": 18030 + }, + { + "epoch": 2.008575565207707, + "grad_norm": 0.6423492431640625, + "learning_rate": 4.999997272053112e-05, + "loss": 0.1346, + "num_input_tokens_seen": 21950800, + "step": 18035 + }, + { + "epoch": 2.009132420091324, + "grad_norm": 0.08568305522203445, + "learning_rate": 4.999996901305559e-05, + "loss": 0.2372, + "num_input_tokens_seen": 21956688, + "step": 18040 + }, + { + "epoch": 2.0096892749749413, + "grad_norm": 0.6780980229377747, + "learning_rate": 4.9999965069435316e-05, + "loss": 0.0834, + "num_input_tokens_seen": 21962704, + "step": 18045 + }, + { + "epoch": 2.010246129858559, + "grad_norm": 1.025593876838684, + "learning_rate": 4.9999960889670356e-05, + "loss": 0.1566, + "num_input_tokens_seen": 21968848, + "step": 18050 + }, + { + "epoch": 2.0108029847421762, + "grad_norm": 0.8497124910354614, + "learning_rate": 4.9999956473760735e-05, + "loss": 0.1939, + "num_input_tokens_seen": 21974928, + "step": 18055 + }, + { + "epoch": 2.0113598396257935, + "grad_norm": 0.01311552245169878, + "learning_rate": 4.999995182170649e-05, + "loss": 0.1027, + "num_input_tokens_seen": 21980976, + "step": 18060 + }, + { + "epoch": 2.0119166945094107, + "grad_norm": 1.065332055091858, + "learning_rate": 4.999994693350767e-05, + "loss": 0.282, + "num_input_tokens_seen": 21986832, + "step": 18065 + }, + { + "epoch": 2.0124735493930284, + "grad_norm": 1.7222241163253784, + "learning_rate": 4.999994180916432e-05, + "loss": 0.1164, + "num_input_tokens_seen": 21992816, + "step": 18070 + }, + { + "epoch": 2.0130304042766456, + "grad_norm": 0.8749215006828308, + "learning_rate": 4.999993644867649e-05, + "loss": 0.0837, + "num_input_tokens_seen": 21998992, + "step": 18075 + }, + { + "epoch": 2.013587259160263, + "grad_norm": 1.318967342376709, + "learning_rate": 4.999993085204424e-05, + "loss": 0.1444, + "num_input_tokens_seen": 22005200, + "step": 18080 + }, + { + "epoch": 2.01414411404388, + "grad_norm": 0.9967339634895325, + "learning_rate": 4.9999925019267605e-05, + "loss": 0.1911, + "num_input_tokens_seen": 22010992, + "step": 18085 + }, + { + "epoch": 2.0147009689274973, + "grad_norm": 0.24184542894363403, + "learning_rate": 4.9999918950346645e-05, + "loss": 0.1191, + "num_input_tokens_seen": 22017072, + "step": 18090 + }, + { + "epoch": 2.015257823811115, + "grad_norm": 1.6997946500778198, + "learning_rate": 4.999991264528143e-05, + "loss": 0.1045, + "num_input_tokens_seen": 22023696, + "step": 18095 + }, + { + "epoch": 2.015814678694732, + "grad_norm": 1.0068062543869019, + "learning_rate": 4.999990610407201e-05, + "loss": 0.0977, + "num_input_tokens_seen": 22029744, + "step": 18100 + }, + { + "epoch": 2.0163715335783494, + "grad_norm": 0.24414905905723572, + "learning_rate": 4.999989932671845e-05, + "loss": 0.1187, + "num_input_tokens_seen": 22035280, + "step": 18105 + }, + { + "epoch": 2.0169283884619666, + "grad_norm": 0.7479932904243469, + "learning_rate": 4.99998923132208e-05, + "loss": 0.0619, + "num_input_tokens_seen": 22041456, + "step": 18110 + }, + { + "epoch": 2.0174852433455843, + "grad_norm": 0.13545691967010498, + "learning_rate": 4.999988506357914e-05, + "loss": 0.0863, + "num_input_tokens_seen": 22047440, + "step": 18115 + }, + { + "epoch": 2.0180420982292016, + "grad_norm": 1.0381194353103638, + "learning_rate": 4.999987757779354e-05, + "loss": 0.0673, + "num_input_tokens_seen": 22053424, + "step": 18120 + }, + { + "epoch": 2.018598953112819, + "grad_norm": 0.2974492311477661, + "learning_rate": 4.999986985586407e-05, + "loss": 0.1012, + "num_input_tokens_seen": 22059536, + "step": 18125 + }, + { + "epoch": 2.019155807996436, + "grad_norm": 1.2031224966049194, + "learning_rate": 4.999986189779079e-05, + "loss": 0.1084, + "num_input_tokens_seen": 22065968, + "step": 18130 + }, + { + "epoch": 2.0197126628800532, + "grad_norm": 0.061847615987062454, + "learning_rate": 4.9999853703573796e-05, + "loss": 0.0248, + "num_input_tokens_seen": 22072112, + "step": 18135 + }, + { + "epoch": 2.020269517763671, + "grad_norm": 0.8765165209770203, + "learning_rate": 4.999984527321314e-05, + "loss": 0.141, + "num_input_tokens_seen": 22078128, + "step": 18140 + }, + { + "epoch": 2.020826372647288, + "grad_norm": 1.116872787475586, + "learning_rate": 4.9999836606708925e-05, + "loss": 0.1007, + "num_input_tokens_seen": 22084336, + "step": 18145 + }, + { + "epoch": 2.0213832275309054, + "grad_norm": 0.13534323871135712, + "learning_rate": 4.999982770406123e-05, + "loss": 0.0441, + "num_input_tokens_seen": 22090704, + "step": 18150 + }, + { + "epoch": 2.0219400824145226, + "grad_norm": 0.26096946001052856, + "learning_rate": 4.9999818565270125e-05, + "loss": 0.2308, + "num_input_tokens_seen": 22096816, + "step": 18155 + }, + { + "epoch": 2.0224969372981403, + "grad_norm": 1.4533692598342896, + "learning_rate": 4.99998091903357e-05, + "loss": 0.1249, + "num_input_tokens_seen": 22103152, + "step": 18160 + }, + { + "epoch": 2.0230537921817575, + "grad_norm": 0.06848601251840591, + "learning_rate": 4.9999799579258056e-05, + "loss": 0.1118, + "num_input_tokens_seen": 22109264, + "step": 18165 + }, + { + "epoch": 2.0236106470653747, + "grad_norm": 1.3755155801773071, + "learning_rate": 4.999978973203727e-05, + "loss": 0.1488, + "num_input_tokens_seen": 22115088, + "step": 18170 + }, + { + "epoch": 2.024167501948992, + "grad_norm": 0.954181432723999, + "learning_rate": 4.999977964867345e-05, + "loss": 0.0865, + "num_input_tokens_seen": 22121648, + "step": 18175 + }, + { + "epoch": 2.024724356832609, + "grad_norm": 0.12285993993282318, + "learning_rate": 4.999976932916667e-05, + "loss": 0.0343, + "num_input_tokens_seen": 22127984, + "step": 18180 + }, + { + "epoch": 2.025281211716227, + "grad_norm": 0.31589293479919434, + "learning_rate": 4.999975877351705e-05, + "loss": 0.038, + "num_input_tokens_seen": 22134224, + "step": 18185 + }, + { + "epoch": 2.025838066599844, + "grad_norm": 0.0421956442296505, + "learning_rate": 4.999974798172467e-05, + "loss": 0.0815, + "num_input_tokens_seen": 22140240, + "step": 18190 + }, + { + "epoch": 2.0263949214834613, + "grad_norm": 0.5083713531494141, + "learning_rate": 4.999973695378964e-05, + "loss": 0.1315, + "num_input_tokens_seen": 22145904, + "step": 18195 + }, + { + "epoch": 2.0269517763670786, + "grad_norm": 0.3561135530471802, + "learning_rate": 4.999972568971207e-05, + "loss": 0.155, + "num_input_tokens_seen": 22151088, + "step": 18200 + }, + { + "epoch": 2.0275086312506962, + "grad_norm": 0.4018169641494751, + "learning_rate": 4.999971418949206e-05, + "loss": 0.1688, + "num_input_tokens_seen": 22157264, + "step": 18205 + }, + { + "epoch": 2.0280654861343135, + "grad_norm": 0.3240397274494171, + "learning_rate": 4.9999702453129715e-05, + "loss": 0.1068, + "num_input_tokens_seen": 22163184, + "step": 18210 + }, + { + "epoch": 2.0286223410179307, + "grad_norm": 0.42436203360557556, + "learning_rate": 4.9999690480625164e-05, + "loss": 0.0824, + "num_input_tokens_seen": 22168368, + "step": 18215 + }, + { + "epoch": 2.029179195901548, + "grad_norm": 0.2669413387775421, + "learning_rate": 4.9999678271978486e-05, + "loss": 0.1861, + "num_input_tokens_seen": 22174608, + "step": 18220 + }, + { + "epoch": 2.029736050785165, + "grad_norm": 0.08228358626365662, + "learning_rate": 4.999966582718984e-05, + "loss": 0.1434, + "num_input_tokens_seen": 22180720, + "step": 18225 + }, + { + "epoch": 2.030292905668783, + "grad_norm": 1.5276511907577515, + "learning_rate": 4.9999653146259307e-05, + "loss": 0.183, + "num_input_tokens_seen": 22186576, + "step": 18230 + }, + { + "epoch": 2.0308497605524, + "grad_norm": 0.05176559463143349, + "learning_rate": 4.999964022918703e-05, + "loss": 0.0267, + "num_input_tokens_seen": 22192560, + "step": 18235 + }, + { + "epoch": 2.0314066154360173, + "grad_norm": 1.4436885118484497, + "learning_rate": 4.999962707597311e-05, + "loss": 0.3197, + "num_input_tokens_seen": 22198672, + "step": 18240 + }, + { + "epoch": 2.0319634703196345, + "grad_norm": 0.31201136112213135, + "learning_rate": 4.999961368661769e-05, + "loss": 0.0571, + "num_input_tokens_seen": 22204656, + "step": 18245 + }, + { + "epoch": 2.032520325203252, + "grad_norm": 0.4350714087486267, + "learning_rate": 4.999960006112089e-05, + "loss": 0.1301, + "num_input_tokens_seen": 22210608, + "step": 18250 + }, + { + "epoch": 2.0330771800868694, + "grad_norm": 0.7356971502304077, + "learning_rate": 4.999958619948284e-05, + "loss": 0.0473, + "num_input_tokens_seen": 22216752, + "step": 18255 + }, + { + "epoch": 2.0336340349704867, + "grad_norm": 1.0700525045394897, + "learning_rate": 4.9999572101703664e-05, + "loss": 0.1251, + "num_input_tokens_seen": 22222864, + "step": 18260 + }, + { + "epoch": 2.034190889854104, + "grad_norm": 1.2000235319137573, + "learning_rate": 4.99995577677835e-05, + "loss": 0.0899, + "num_input_tokens_seen": 22228944, + "step": 18265 + }, + { + "epoch": 2.034747744737721, + "grad_norm": 0.13859105110168457, + "learning_rate": 4.9999543197722486e-05, + "loss": 0.0469, + "num_input_tokens_seen": 22235088, + "step": 18270 + }, + { + "epoch": 2.035304599621339, + "grad_norm": 1.1682156324386597, + "learning_rate": 4.999952839152076e-05, + "loss": 0.1624, + "num_input_tokens_seen": 22240976, + "step": 18275 + }, + { + "epoch": 2.035861454504956, + "grad_norm": 0.8341971635818481, + "learning_rate": 4.9999513349178453e-05, + "loss": 0.1181, + "num_input_tokens_seen": 22247184, + "step": 18280 + }, + { + "epoch": 2.0364183093885733, + "grad_norm": 0.9890335202217102, + "learning_rate": 4.999949807069572e-05, + "loss": 0.137, + "num_input_tokens_seen": 22253328, + "step": 18285 + }, + { + "epoch": 2.0369751642721905, + "grad_norm": 1.2665284872055054, + "learning_rate": 4.999948255607268e-05, + "loss": 0.1348, + "num_input_tokens_seen": 22259344, + "step": 18290 + }, + { + "epoch": 2.037532019155808, + "grad_norm": 0.8681978583335876, + "learning_rate": 4.999946680530952e-05, + "loss": 0.2792, + "num_input_tokens_seen": 22265168, + "step": 18295 + }, + { + "epoch": 2.0380888740394254, + "grad_norm": 1.077160120010376, + "learning_rate": 4.9999450818406355e-05, + "loss": 0.1559, + "num_input_tokens_seen": 22271280, + "step": 18300 + }, + { + "epoch": 2.0386457289230426, + "grad_norm": 0.3804257810115814, + "learning_rate": 4.999943459536336e-05, + "loss": 0.0398, + "num_input_tokens_seen": 22277808, + "step": 18305 + }, + { + "epoch": 2.03920258380666, + "grad_norm": 0.49299386143684387, + "learning_rate": 4.999941813618066e-05, + "loss": 0.171, + "num_input_tokens_seen": 22284112, + "step": 18310 + }, + { + "epoch": 2.039759438690277, + "grad_norm": 0.6601404547691345, + "learning_rate": 4.999940144085844e-05, + "loss": 0.108, + "num_input_tokens_seen": 22290384, + "step": 18315 + }, + { + "epoch": 2.0403162935738948, + "grad_norm": 1.6550811529159546, + "learning_rate": 4.999938450939684e-05, + "loss": 0.1171, + "num_input_tokens_seen": 22296688, + "step": 18320 + }, + { + "epoch": 2.040873148457512, + "grad_norm": 0.03794488683342934, + "learning_rate": 4.999936734179602e-05, + "loss": 0.0973, + "num_input_tokens_seen": 22302928, + "step": 18325 + }, + { + "epoch": 2.0414300033411292, + "grad_norm": 1.3106887340545654, + "learning_rate": 4.999934993805615e-05, + "loss": 0.0991, + "num_input_tokens_seen": 22308848, + "step": 18330 + }, + { + "epoch": 2.0419868582247465, + "grad_norm": 0.7923256754875183, + "learning_rate": 4.999933229817739e-05, + "loss": 0.0655, + "num_input_tokens_seen": 22314800, + "step": 18335 + }, + { + "epoch": 2.042543713108364, + "grad_norm": 1.726208209991455, + "learning_rate": 4.9999314422159905e-05, + "loss": 0.0881, + "num_input_tokens_seen": 22320848, + "step": 18340 + }, + { + "epoch": 2.0431005679919814, + "grad_norm": 1.3894680738449097, + "learning_rate": 4.999929631000387e-05, + "loss": 0.2034, + "num_input_tokens_seen": 22326704, + "step": 18345 + }, + { + "epoch": 2.0436574228755986, + "grad_norm": 0.19433574378490448, + "learning_rate": 4.999927796170944e-05, + "loss": 0.078, + "num_input_tokens_seen": 22332752, + "step": 18350 + }, + { + "epoch": 2.044214277759216, + "grad_norm": 1.53804349899292, + "learning_rate": 4.999925937727682e-05, + "loss": 0.1689, + "num_input_tokens_seen": 22338640, + "step": 18355 + }, + { + "epoch": 2.0447711326428335, + "grad_norm": 0.09769263863563538, + "learning_rate": 4.9999240556706154e-05, + "loss": 0.119, + "num_input_tokens_seen": 22344720, + "step": 18360 + }, + { + "epoch": 2.0453279875264507, + "grad_norm": 0.7061609625816345, + "learning_rate": 4.999922149999764e-05, + "loss": 0.2385, + "num_input_tokens_seen": 22350800, + "step": 18365 + }, + { + "epoch": 2.045884842410068, + "grad_norm": 0.9578084349632263, + "learning_rate": 4.999920220715144e-05, + "loss": 0.1557, + "num_input_tokens_seen": 22357136, + "step": 18370 + }, + { + "epoch": 2.046441697293685, + "grad_norm": 1.0474966764450073, + "learning_rate": 4.999918267816775e-05, + "loss": 0.2135, + "num_input_tokens_seen": 22363120, + "step": 18375 + }, + { + "epoch": 2.0469985521773024, + "grad_norm": 0.5877757668495178, + "learning_rate": 4.9999162913046755e-05, + "loss": 0.0925, + "num_input_tokens_seen": 22369328, + "step": 18380 + }, + { + "epoch": 2.04755540706092, + "grad_norm": 0.16553851962089539, + "learning_rate": 4.999914291178863e-05, + "loss": 0.0203, + "num_input_tokens_seen": 22375632, + "step": 18385 + }, + { + "epoch": 2.0481122619445373, + "grad_norm": 0.1180514544248581, + "learning_rate": 4.999912267439358e-05, + "loss": 0.0551, + "num_input_tokens_seen": 22381872, + "step": 18390 + }, + { + "epoch": 2.0486691168281546, + "grad_norm": 1.4093507528305054, + "learning_rate": 4.999910220086178e-05, + "loss": 0.2504, + "num_input_tokens_seen": 22388112, + "step": 18395 + }, + { + "epoch": 2.049225971711772, + "grad_norm": 1.0452642440795898, + "learning_rate": 4.999908149119343e-05, + "loss": 0.2076, + "num_input_tokens_seen": 22394096, + "step": 18400 + }, + { + "epoch": 2.049782826595389, + "grad_norm": 0.5709975361824036, + "learning_rate": 4.999906054538873e-05, + "loss": 0.0861, + "num_input_tokens_seen": 22400368, + "step": 18405 + }, + { + "epoch": 2.0503396814790067, + "grad_norm": 0.2475237250328064, + "learning_rate": 4.999903936344787e-05, + "loss": 0.0535, + "num_input_tokens_seen": 22406416, + "step": 18410 + }, + { + "epoch": 2.050896536362624, + "grad_norm": 0.9342204928398132, + "learning_rate": 4.999901794537106e-05, + "loss": 0.1232, + "num_input_tokens_seen": 22412752, + "step": 18415 + }, + { + "epoch": 2.051453391246241, + "grad_norm": 1.0982694625854492, + "learning_rate": 4.99989962911585e-05, + "loss": 0.0604, + "num_input_tokens_seen": 22419248, + "step": 18420 + }, + { + "epoch": 2.0520102461298584, + "grad_norm": 0.25861549377441406, + "learning_rate": 4.999897440081038e-05, + "loss": 0.0838, + "num_input_tokens_seen": 22425072, + "step": 18425 + }, + { + "epoch": 2.052567101013476, + "grad_norm": 0.5112177729606628, + "learning_rate": 4.999895227432693e-05, + "loss": 0.1824, + "num_input_tokens_seen": 22431248, + "step": 18430 + }, + { + "epoch": 2.0531239558970933, + "grad_norm": 0.004171446897089481, + "learning_rate": 4.9998929911708344e-05, + "loss": 0.1288, + "num_input_tokens_seen": 22437296, + "step": 18435 + }, + { + "epoch": 2.0536808107807105, + "grad_norm": 0.24087315797805786, + "learning_rate": 4.9998907312954834e-05, + "loss": 0.0479, + "num_input_tokens_seen": 22443248, + "step": 18440 + }, + { + "epoch": 2.0542376656643277, + "grad_norm": 0.3783644437789917, + "learning_rate": 4.999888447806661e-05, + "loss": 0.0369, + "num_input_tokens_seen": 22449456, + "step": 18445 + }, + { + "epoch": 2.0547945205479454, + "grad_norm": 0.32942983508110046, + "learning_rate": 4.99988614070439e-05, + "loss": 0.1035, + "num_input_tokens_seen": 22455760, + "step": 18450 + }, + { + "epoch": 2.0553513754315627, + "grad_norm": 0.28117087483406067, + "learning_rate": 4.999883809988691e-05, + "loss": 0.0385, + "num_input_tokens_seen": 22461776, + "step": 18455 + }, + { + "epoch": 2.05590823031518, + "grad_norm": 0.6127769947052002, + "learning_rate": 4.999881455659587e-05, + "loss": 0.0816, + "num_input_tokens_seen": 22468272, + "step": 18460 + }, + { + "epoch": 2.056465085198797, + "grad_norm": 1.307996153831482, + "learning_rate": 4.9998790777171004e-05, + "loss": 0.0877, + "num_input_tokens_seen": 22474128, + "step": 18465 + }, + { + "epoch": 2.0570219400824143, + "grad_norm": 1.8332841396331787, + "learning_rate": 4.9998766761612514e-05, + "loss": 0.1654, + "num_input_tokens_seen": 22480464, + "step": 18470 + }, + { + "epoch": 2.057578794966032, + "grad_norm": 0.652682363986969, + "learning_rate": 4.999874250992065e-05, + "loss": 0.1125, + "num_input_tokens_seen": 22486576, + "step": 18475 + }, + { + "epoch": 2.0581356498496493, + "grad_norm": 0.011424972675740719, + "learning_rate": 4.999871802209564e-05, + "loss": 0.1231, + "num_input_tokens_seen": 22492912, + "step": 18480 + }, + { + "epoch": 2.0586925047332665, + "grad_norm": 1.0838719606399536, + "learning_rate": 4.999869329813771e-05, + "loss": 0.0832, + "num_input_tokens_seen": 22499152, + "step": 18485 + }, + { + "epoch": 2.0592493596168837, + "grad_norm": 0.9473181962966919, + "learning_rate": 4.999866833804708e-05, + "loss": 0.1674, + "num_input_tokens_seen": 22505424, + "step": 18490 + }, + { + "epoch": 2.0598062145005014, + "grad_norm": 0.02682134509086609, + "learning_rate": 4.9998643141824016e-05, + "loss": 0.1121, + "num_input_tokens_seen": 22511472, + "step": 18495 + }, + { + "epoch": 2.0603630693841186, + "grad_norm": 0.12777061760425568, + "learning_rate": 4.999861770946873e-05, + "loss": 0.0458, + "num_input_tokens_seen": 22517488, + "step": 18500 + }, + { + "epoch": 2.060919924267736, + "grad_norm": 0.5257683396339417, + "learning_rate": 4.999859204098147e-05, + "loss": 0.0634, + "num_input_tokens_seen": 22523056, + "step": 18505 + }, + { + "epoch": 2.061476779151353, + "grad_norm": 0.6676977872848511, + "learning_rate": 4.9998566136362485e-05, + "loss": 0.1103, + "num_input_tokens_seen": 22529072, + "step": 18510 + }, + { + "epoch": 2.0620336340349703, + "grad_norm": 0.024053772911429405, + "learning_rate": 4.999853999561201e-05, + "loss": 0.0812, + "num_input_tokens_seen": 22535568, + "step": 18515 + }, + { + "epoch": 2.062590488918588, + "grad_norm": 0.13626503944396973, + "learning_rate": 4.9998513618730295e-05, + "loss": 0.0912, + "num_input_tokens_seen": 22541520, + "step": 18520 + }, + { + "epoch": 2.063147343802205, + "grad_norm": 1.0269407033920288, + "learning_rate": 4.99984870057176e-05, + "loss": 0.1042, + "num_input_tokens_seen": 22546896, + "step": 18525 + }, + { + "epoch": 2.0637041986858224, + "grad_norm": 0.6291787028312683, + "learning_rate": 4.999846015657416e-05, + "loss": 0.0873, + "num_input_tokens_seen": 22552880, + "step": 18530 + }, + { + "epoch": 2.0642610535694397, + "grad_norm": 0.4346601963043213, + "learning_rate": 4.9998433071300234e-05, + "loss": 0.2541, + "num_input_tokens_seen": 22558832, + "step": 18535 + }, + { + "epoch": 2.0648179084530573, + "grad_norm": 0.162944957613945, + "learning_rate": 4.9998405749896075e-05, + "loss": 0.176, + "num_input_tokens_seen": 22564976, + "step": 18540 + }, + { + "epoch": 2.0653747633366746, + "grad_norm": 0.031031310558319092, + "learning_rate": 4.999837819236195e-05, + "loss": 0.095, + "num_input_tokens_seen": 22571184, + "step": 18545 + }, + { + "epoch": 2.065931618220292, + "grad_norm": 0.1289636194705963, + "learning_rate": 4.999835039869812e-05, + "loss": 0.068, + "num_input_tokens_seen": 22577328, + "step": 18550 + }, + { + "epoch": 2.066488473103909, + "grad_norm": 1.0091115236282349, + "learning_rate": 4.9998322368904836e-05, + "loss": 0.1433, + "num_input_tokens_seen": 22583728, + "step": 18555 + }, + { + "epoch": 2.0670453279875263, + "grad_norm": 0.026283226907253265, + "learning_rate": 4.999829410298237e-05, + "loss": 0.0267, + "num_input_tokens_seen": 22590000, + "step": 18560 + }, + { + "epoch": 2.067602182871144, + "grad_norm": 0.5255677103996277, + "learning_rate": 4.999826560093099e-05, + "loss": 0.0349, + "num_input_tokens_seen": 22596048, + "step": 18565 + }, + { + "epoch": 2.068159037754761, + "grad_norm": 0.09514112025499344, + "learning_rate": 4.9998236862750955e-05, + "loss": 0.1402, + "num_input_tokens_seen": 22602320, + "step": 18570 + }, + { + "epoch": 2.0687158926383784, + "grad_norm": 0.8215955495834351, + "learning_rate": 4.9998207888442556e-05, + "loss": 0.0778, + "num_input_tokens_seen": 22608528, + "step": 18575 + }, + { + "epoch": 2.0692727475219956, + "grad_norm": 0.4630650281906128, + "learning_rate": 4.9998178678006044e-05, + "loss": 0.0621, + "num_input_tokens_seen": 22614704, + "step": 18580 + }, + { + "epoch": 2.0698296024056133, + "grad_norm": 0.4547440707683563, + "learning_rate": 4.9998149231441716e-05, + "loss": 0.1012, + "num_input_tokens_seen": 22620784, + "step": 18585 + }, + { + "epoch": 2.0703864572892305, + "grad_norm": 0.3578895926475525, + "learning_rate": 4.999811954874984e-05, + "loss": 0.0772, + "num_input_tokens_seen": 22627152, + "step": 18590 + }, + { + "epoch": 2.0709433121728478, + "grad_norm": 0.5491964817047119, + "learning_rate": 4.9998089629930686e-05, + "loss": 0.0991, + "num_input_tokens_seen": 22633008, + "step": 18595 + }, + { + "epoch": 2.071500167056465, + "grad_norm": 0.7999553680419922, + "learning_rate": 4.999805947498456e-05, + "loss": 0.1338, + "num_input_tokens_seen": 22639376, + "step": 18600 + }, + { + "epoch": 2.0720570219400822, + "grad_norm": 1.3884390592575073, + "learning_rate": 4.999802908391173e-05, + "loss": 0.2149, + "num_input_tokens_seen": 22645360, + "step": 18605 + }, + { + "epoch": 2.0726138768237, + "grad_norm": 0.09814275801181793, + "learning_rate": 4.999799845671249e-05, + "loss": 0.1404, + "num_input_tokens_seen": 22651696, + "step": 18610 + }, + { + "epoch": 2.073170731707317, + "grad_norm": 0.01968209072947502, + "learning_rate": 4.9997967593387116e-05, + "loss": 0.0551, + "num_input_tokens_seen": 22657936, + "step": 18615 + }, + { + "epoch": 2.0737275865909344, + "grad_norm": 0.19206975400447845, + "learning_rate": 4.9997936493935916e-05, + "loss": 0.0406, + "num_input_tokens_seen": 22664048, + "step": 18620 + }, + { + "epoch": 2.0742844414745516, + "grad_norm": 0.22205951809883118, + "learning_rate": 4.9997905158359184e-05, + "loss": 0.0443, + "num_input_tokens_seen": 22670160, + "step": 18625 + }, + { + "epoch": 2.0748412963581693, + "grad_norm": 1.019608736038208, + "learning_rate": 4.99978735866572e-05, + "loss": 0.0263, + "num_input_tokens_seen": 22676272, + "step": 18630 + }, + { + "epoch": 2.0753981512417865, + "grad_norm": 0.5549283027648926, + "learning_rate": 4.999784177883028e-05, + "loss": 0.0634, + "num_input_tokens_seen": 22682480, + "step": 18635 + }, + { + "epoch": 2.0759550061254037, + "grad_norm": 0.3435461223125458, + "learning_rate": 4.9997809734878706e-05, + "loss": 0.2072, + "num_input_tokens_seen": 22688688, + "step": 18640 + }, + { + "epoch": 2.076511861009021, + "grad_norm": 0.1351381540298462, + "learning_rate": 4.99977774548028e-05, + "loss": 0.0495, + "num_input_tokens_seen": 22694768, + "step": 18645 + }, + { + "epoch": 2.077068715892638, + "grad_norm": 0.16312721371650696, + "learning_rate": 4.999774493860286e-05, + "loss": 0.1084, + "num_input_tokens_seen": 22701200, + "step": 18650 + }, + { + "epoch": 2.077625570776256, + "grad_norm": 0.6381673216819763, + "learning_rate": 4.9997712186279184e-05, + "loss": 0.1543, + "num_input_tokens_seen": 22707312, + "step": 18655 + }, + { + "epoch": 2.078182425659873, + "grad_norm": 0.7105260491371155, + "learning_rate": 4.9997679197832094e-05, + "loss": 0.1359, + "num_input_tokens_seen": 22713360, + "step": 18660 + }, + { + "epoch": 2.0787392805434903, + "grad_norm": 0.06540465354919434, + "learning_rate": 4.999764597326189e-05, + "loss": 0.0483, + "num_input_tokens_seen": 22719696, + "step": 18665 + }, + { + "epoch": 2.0792961354271076, + "grad_norm": 1.0484004020690918, + "learning_rate": 4.99976125125689e-05, + "loss": 0.2126, + "num_input_tokens_seen": 22725872, + "step": 18670 + }, + { + "epoch": 2.0798529903107252, + "grad_norm": 0.7300972938537598, + "learning_rate": 4.999757881575343e-05, + "loss": 0.1664, + "num_input_tokens_seen": 22731920, + "step": 18675 + }, + { + "epoch": 2.0804098451943425, + "grad_norm": 0.1983991265296936, + "learning_rate": 4.99975448828158e-05, + "loss": 0.0409, + "num_input_tokens_seen": 22738032, + "step": 18680 + }, + { + "epoch": 2.0809667000779597, + "grad_norm": 0.20898933708667755, + "learning_rate": 4.999751071375632e-05, + "loss": 0.0963, + "num_input_tokens_seen": 22743952, + "step": 18685 + }, + { + "epoch": 2.081523554961577, + "grad_norm": 0.9047152996063232, + "learning_rate": 4.9997476308575334e-05, + "loss": 0.165, + "num_input_tokens_seen": 22750160, + "step": 18690 + }, + { + "epoch": 2.082080409845194, + "grad_norm": 1.4513074159622192, + "learning_rate": 4.999744166727316e-05, + "loss": 0.0625, + "num_input_tokens_seen": 22756080, + "step": 18695 + }, + { + "epoch": 2.082637264728812, + "grad_norm": 0.08893629908561707, + "learning_rate": 4.999740678985011e-05, + "loss": 0.0193, + "num_input_tokens_seen": 22762640, + "step": 18700 + }, + { + "epoch": 2.083194119612429, + "grad_norm": 0.516727089881897, + "learning_rate": 4.9997371676306536e-05, + "loss": 0.1481, + "num_input_tokens_seen": 22768688, + "step": 18705 + }, + { + "epoch": 2.0837509744960463, + "grad_norm": 0.61643385887146, + "learning_rate": 4.9997336326642754e-05, + "loss": 0.1235, + "num_input_tokens_seen": 22774608, + "step": 18710 + }, + { + "epoch": 2.0843078293796635, + "grad_norm": 0.6193421483039856, + "learning_rate": 4.999730074085911e-05, + "loss": 0.1139, + "num_input_tokens_seen": 22780784, + "step": 18715 + }, + { + "epoch": 2.084864684263281, + "grad_norm": 0.9725481271743774, + "learning_rate": 4.999726491895592e-05, + "loss": 0.1419, + "num_input_tokens_seen": 22786960, + "step": 18720 + }, + { + "epoch": 2.0854215391468984, + "grad_norm": 0.5851902365684509, + "learning_rate": 4.9997228860933544e-05, + "loss": 0.0541, + "num_input_tokens_seen": 22793008, + "step": 18725 + }, + { + "epoch": 2.0859783940305157, + "grad_norm": 1.6923351287841797, + "learning_rate": 4.9997192566792315e-05, + "loss": 0.1273, + "num_input_tokens_seen": 22799440, + "step": 18730 + }, + { + "epoch": 2.086535248914133, + "grad_norm": 0.42285963892936707, + "learning_rate": 4.9997156036532574e-05, + "loss": 0.1638, + "num_input_tokens_seen": 22805360, + "step": 18735 + }, + { + "epoch": 2.08709210379775, + "grad_norm": 0.7316593527793884, + "learning_rate": 4.999711927015466e-05, + "loss": 0.1044, + "num_input_tokens_seen": 22811728, + "step": 18740 + }, + { + "epoch": 2.087648958681368, + "grad_norm": 0.5729684233665466, + "learning_rate": 4.9997082267658935e-05, + "loss": 0.0767, + "num_input_tokens_seen": 22817904, + "step": 18745 + }, + { + "epoch": 2.088205813564985, + "grad_norm": 1.2505897283554077, + "learning_rate": 4.999704502904574e-05, + "loss": 0.1085, + "num_input_tokens_seen": 22824336, + "step": 18750 + }, + { + "epoch": 2.0887626684486023, + "grad_norm": 0.21027767658233643, + "learning_rate": 4.9997007554315425e-05, + "loss": 0.0798, + "num_input_tokens_seen": 22830416, + "step": 18755 + }, + { + "epoch": 2.0893195233322195, + "grad_norm": 1.3221421241760254, + "learning_rate": 4.9996969843468346e-05, + "loss": 0.1538, + "num_input_tokens_seen": 22836592, + "step": 18760 + }, + { + "epoch": 2.089876378215837, + "grad_norm": 0.9247229099273682, + "learning_rate": 4.999693189650486e-05, + "loss": 0.0509, + "num_input_tokens_seen": 22842672, + "step": 18765 + }, + { + "epoch": 2.0904332330994544, + "grad_norm": 0.5275936126708984, + "learning_rate": 4.999689371342533e-05, + "loss": 0.0732, + "num_input_tokens_seen": 22848912, + "step": 18770 + }, + { + "epoch": 2.0909900879830716, + "grad_norm": 1.289866328239441, + "learning_rate": 4.999685529423011e-05, + "loss": 0.2552, + "num_input_tokens_seen": 22854960, + "step": 18775 + }, + { + "epoch": 2.091546942866689, + "grad_norm": 0.32415008544921875, + "learning_rate": 4.9996816638919553e-05, + "loss": 0.1387, + "num_input_tokens_seen": 22861040, + "step": 18780 + }, + { + "epoch": 2.092103797750306, + "grad_norm": 0.38104376196861267, + "learning_rate": 4.999677774749405e-05, + "loss": 0.1038, + "num_input_tokens_seen": 22867024, + "step": 18785 + }, + { + "epoch": 2.0926606526339238, + "grad_norm": 0.48607778549194336, + "learning_rate": 4.9996738619953944e-05, + "loss": 0.0642, + "num_input_tokens_seen": 22873008, + "step": 18790 + }, + { + "epoch": 2.093217507517541, + "grad_norm": 0.02434789575636387, + "learning_rate": 4.999669925629962e-05, + "loss": 0.0318, + "num_input_tokens_seen": 22879120, + "step": 18795 + }, + { + "epoch": 2.093774362401158, + "grad_norm": 0.6241127848625183, + "learning_rate": 4.999665965653144e-05, + "loss": 0.2412, + "num_input_tokens_seen": 22885200, + "step": 18800 + }, + { + "epoch": 2.0943312172847754, + "grad_norm": 0.019590776413679123, + "learning_rate": 4.9996619820649796e-05, + "loss": 0.0705, + "num_input_tokens_seen": 22891248, + "step": 18805 + }, + { + "epoch": 2.094888072168393, + "grad_norm": 0.08640008419752121, + "learning_rate": 4.9996579748655035e-05, + "loss": 0.1048, + "num_input_tokens_seen": 22897456, + "step": 18810 + }, + { + "epoch": 2.0954449270520104, + "grad_norm": 1.4118691682815552, + "learning_rate": 4.9996539440547557e-05, + "loss": 0.1124, + "num_input_tokens_seen": 22904048, + "step": 18815 + }, + { + "epoch": 2.0960017819356276, + "grad_norm": 0.6996514797210693, + "learning_rate": 4.999649889632774e-05, + "loss": 0.0628, + "num_input_tokens_seen": 22910128, + "step": 18820 + }, + { + "epoch": 2.096558636819245, + "grad_norm": 0.854868471622467, + "learning_rate": 4.999645811599596e-05, + "loss": 0.0922, + "num_input_tokens_seen": 22916240, + "step": 18825 + }, + { + "epoch": 2.097115491702862, + "grad_norm": 1.223453402519226, + "learning_rate": 4.99964170995526e-05, + "loss": 0.0895, + "num_input_tokens_seen": 22922384, + "step": 18830 + }, + { + "epoch": 2.0976723465864797, + "grad_norm": 0.3630443215370178, + "learning_rate": 4.999637584699807e-05, + "loss": 0.1414, + "num_input_tokens_seen": 22928432, + "step": 18835 + }, + { + "epoch": 2.098229201470097, + "grad_norm": 0.4364042580127716, + "learning_rate": 4.9996334358332735e-05, + "loss": 0.1696, + "num_input_tokens_seen": 22934512, + "step": 18840 + }, + { + "epoch": 2.098786056353714, + "grad_norm": 0.5863174796104431, + "learning_rate": 4.9996292633556995e-05, + "loss": 0.1758, + "num_input_tokens_seen": 22940624, + "step": 18845 + }, + { + "epoch": 2.0993429112373314, + "grad_norm": 1.3903342485427856, + "learning_rate": 4.999625067267124e-05, + "loss": 0.1127, + "num_input_tokens_seen": 22946736, + "step": 18850 + }, + { + "epoch": 2.099899766120949, + "grad_norm": 1.9876153469085693, + "learning_rate": 4.999620847567588e-05, + "loss": 0.325, + "num_input_tokens_seen": 22952528, + "step": 18855 + }, + { + "epoch": 2.1004566210045663, + "grad_norm": 0.4637790322303772, + "learning_rate": 4.99961660425713e-05, + "loss": 0.0995, + "num_input_tokens_seen": 22958864, + "step": 18860 + }, + { + "epoch": 2.1010134758881835, + "grad_norm": 0.25518593192100525, + "learning_rate": 4.99961233733579e-05, + "loss": 0.0527, + "num_input_tokens_seen": 22965296, + "step": 18865 + }, + { + "epoch": 2.1015703307718008, + "grad_norm": 0.1135898306965828, + "learning_rate": 4.99960804680361e-05, + "loss": 0.1004, + "num_input_tokens_seen": 22971408, + "step": 18870 + }, + { + "epoch": 2.102127185655418, + "grad_norm": 1.1020374298095703, + "learning_rate": 4.9996037326606284e-05, + "loss": 0.0824, + "num_input_tokens_seen": 22977264, + "step": 18875 + }, + { + "epoch": 2.1026840405390357, + "grad_norm": 0.7789465188980103, + "learning_rate": 4.999599394906887e-05, + "loss": 0.0665, + "num_input_tokens_seen": 22983472, + "step": 18880 + }, + { + "epoch": 2.103240895422653, + "grad_norm": 1.173224687576294, + "learning_rate": 4.999595033542427e-05, + "loss": 0.1826, + "num_input_tokens_seen": 22989584, + "step": 18885 + }, + { + "epoch": 2.10379775030627, + "grad_norm": 0.9720991849899292, + "learning_rate": 4.9995906485672886e-05, + "loss": 0.1227, + "num_input_tokens_seen": 22995824, + "step": 18890 + }, + { + "epoch": 2.1043546051898874, + "grad_norm": 0.832436740398407, + "learning_rate": 4.9995862399815146e-05, + "loss": 0.1311, + "num_input_tokens_seen": 23001904, + "step": 18895 + }, + { + "epoch": 2.104911460073505, + "grad_norm": 1.0304632186889648, + "learning_rate": 4.999581807785146e-05, + "loss": 0.1524, + "num_input_tokens_seen": 23007952, + "step": 18900 + }, + { + "epoch": 2.1054683149571223, + "grad_norm": 0.7849881052970886, + "learning_rate": 4.999577351978224e-05, + "loss": 0.0864, + "num_input_tokens_seen": 23014288, + "step": 18905 + }, + { + "epoch": 2.1060251698407395, + "grad_norm": 2.776569366455078, + "learning_rate": 4.999572872560792e-05, + "loss": 0.1453, + "num_input_tokens_seen": 23020048, + "step": 18910 + }, + { + "epoch": 2.1065820247243567, + "grad_norm": 0.12710420787334442, + "learning_rate": 4.999568369532891e-05, + "loss": 0.0499, + "num_input_tokens_seen": 23026160, + "step": 18915 + }, + { + "epoch": 2.107138879607974, + "grad_norm": 1.905263066291809, + "learning_rate": 4.999563842894564e-05, + "loss": 0.1516, + "num_input_tokens_seen": 23032464, + "step": 18920 + }, + { + "epoch": 2.1076957344915916, + "grad_norm": 0.621452808380127, + "learning_rate": 4.999559292645855e-05, + "loss": 0.086, + "num_input_tokens_seen": 23038672, + "step": 18925 + }, + { + "epoch": 2.108252589375209, + "grad_norm": 0.10441955924034119, + "learning_rate": 4.999554718786804e-05, + "loss": 0.0406, + "num_input_tokens_seen": 23044880, + "step": 18930 + }, + { + "epoch": 2.108809444258826, + "grad_norm": 0.8812189698219299, + "learning_rate": 4.999550121317458e-05, + "loss": 0.0815, + "num_input_tokens_seen": 23051376, + "step": 18935 + }, + { + "epoch": 2.1093662991424433, + "grad_norm": 0.024207917973399162, + "learning_rate": 4.999545500237857e-05, + "loss": 0.0191, + "num_input_tokens_seen": 23057488, + "step": 18940 + }, + { + "epoch": 2.109923154026061, + "grad_norm": 0.09203946590423584, + "learning_rate": 4.9995408555480474e-05, + "loss": 0.0768, + "num_input_tokens_seen": 23063568, + "step": 18945 + }, + { + "epoch": 2.1104800089096782, + "grad_norm": 0.6179001331329346, + "learning_rate": 4.99953618724807e-05, + "loss": 0.0637, + "num_input_tokens_seen": 23069776, + "step": 18950 + }, + { + "epoch": 2.1110368637932955, + "grad_norm": 0.1455579698085785, + "learning_rate": 4.999531495337973e-05, + "loss": 0.0874, + "num_input_tokens_seen": 23076048, + "step": 18955 + }, + { + "epoch": 2.1115937186769127, + "grad_norm": 0.2589050531387329, + "learning_rate": 4.999526779817797e-05, + "loss": 0.0998, + "num_input_tokens_seen": 23082096, + "step": 18960 + }, + { + "epoch": 2.11215057356053, + "grad_norm": 0.6756677627563477, + "learning_rate": 4.999522040687588e-05, + "loss": 0.0614, + "num_input_tokens_seen": 23087856, + "step": 18965 + }, + { + "epoch": 2.1127074284441476, + "grad_norm": 1.0397762060165405, + "learning_rate": 4.9995172779473906e-05, + "loss": 0.2144, + "num_input_tokens_seen": 23093296, + "step": 18970 + }, + { + "epoch": 2.113264283327765, + "grad_norm": 0.12264879792928696, + "learning_rate": 4.9995124915972516e-05, + "loss": 0.062, + "num_input_tokens_seen": 23099504, + "step": 18975 + }, + { + "epoch": 2.113821138211382, + "grad_norm": 0.2918151021003723, + "learning_rate": 4.999507681637213e-05, + "loss": 0.0682, + "num_input_tokens_seen": 23105648, + "step": 18980 + }, + { + "epoch": 2.1143779930949993, + "grad_norm": 2.3436710834503174, + "learning_rate": 4.9995028480673215e-05, + "loss": 0.1749, + "num_input_tokens_seen": 23111248, + "step": 18985 + }, + { + "epoch": 2.114934847978617, + "grad_norm": 1.4035005569458008, + "learning_rate": 4.999497990887624e-05, + "loss": 0.1481, + "num_input_tokens_seen": 23117168, + "step": 18990 + }, + { + "epoch": 2.115491702862234, + "grad_norm": 0.9183539748191833, + "learning_rate": 4.999493110098165e-05, + "loss": 0.059, + "num_input_tokens_seen": 23123312, + "step": 18995 + }, + { + "epoch": 2.1160485577458514, + "grad_norm": 1.2790822982788086, + "learning_rate": 4.999488205698991e-05, + "loss": 0.1281, + "num_input_tokens_seen": 23129392, + "step": 19000 + }, + { + "epoch": 2.1166054126294687, + "grad_norm": 0.21855105459690094, + "learning_rate": 4.9994832776901484e-05, + "loss": 0.0105, + "num_input_tokens_seen": 23135472, + "step": 19005 + }, + { + "epoch": 2.117162267513086, + "grad_norm": 0.660524845123291, + "learning_rate": 4.999478326071684e-05, + "loss": 0.1145, + "num_input_tokens_seen": 23141776, + "step": 19010 + }, + { + "epoch": 2.1177191223967036, + "grad_norm": 0.012275099754333496, + "learning_rate": 4.9994733508436434e-05, + "loss": 0.0901, + "num_input_tokens_seen": 23147856, + "step": 19015 + }, + { + "epoch": 2.118275977280321, + "grad_norm": 0.13316425681114197, + "learning_rate": 4.999468352006075e-05, + "loss": 0.069, + "num_input_tokens_seen": 23153904, + "step": 19020 + }, + { + "epoch": 2.118832832163938, + "grad_norm": 0.8241076469421387, + "learning_rate": 4.9994633295590254e-05, + "loss": 0.0822, + "num_input_tokens_seen": 23160016, + "step": 19025 + }, + { + "epoch": 2.1193896870475553, + "grad_norm": 0.0038948585279285908, + "learning_rate": 4.999458283502543e-05, + "loss": 0.0337, + "num_input_tokens_seen": 23165680, + "step": 19030 + }, + { + "epoch": 2.119946541931173, + "grad_norm": 1.1878132820129395, + "learning_rate": 4.999453213836673e-05, + "loss": 0.0814, + "num_input_tokens_seen": 23171728, + "step": 19035 + }, + { + "epoch": 2.12050339681479, + "grad_norm": 0.3081096112728119, + "learning_rate": 4.9994481205614665e-05, + "loss": 0.0291, + "num_input_tokens_seen": 23178032, + "step": 19040 + }, + { + "epoch": 2.1210602516984074, + "grad_norm": 0.7065942883491516, + "learning_rate": 4.9994430036769686e-05, + "loss": 0.1405, + "num_input_tokens_seen": 23184016, + "step": 19045 + }, + { + "epoch": 2.1216171065820246, + "grad_norm": 0.5687143206596375, + "learning_rate": 4.99943786318323e-05, + "loss": 0.1356, + "num_input_tokens_seen": 23190288, + "step": 19050 + }, + { + "epoch": 2.122173961465642, + "grad_norm": 0.36152902245521545, + "learning_rate": 4.9994326990802974e-05, + "loss": 0.1174, + "num_input_tokens_seen": 23196752, + "step": 19055 + }, + { + "epoch": 2.1227308163492595, + "grad_norm": 0.35938653349876404, + "learning_rate": 4.999427511368221e-05, + "loss": 0.1388, + "num_input_tokens_seen": 23202864, + "step": 19060 + }, + { + "epoch": 2.1232876712328768, + "grad_norm": 1.3831827640533447, + "learning_rate": 4.999422300047049e-05, + "loss": 0.2175, + "num_input_tokens_seen": 23209008, + "step": 19065 + }, + { + "epoch": 2.123844526116494, + "grad_norm": 0.8693690896034241, + "learning_rate": 4.99941706511683e-05, + "loss": 0.0488, + "num_input_tokens_seen": 23215376, + "step": 19070 + }, + { + "epoch": 2.124401381000111, + "grad_norm": 0.6919363737106323, + "learning_rate": 4.9994118065776166e-05, + "loss": 0.1048, + "num_input_tokens_seen": 23221584, + "step": 19075 + }, + { + "epoch": 2.124958235883729, + "grad_norm": 1.1070772409439087, + "learning_rate": 4.999406524429454e-05, + "loss": 0.2998, + "num_input_tokens_seen": 23227760, + "step": 19080 + }, + { + "epoch": 2.125515090767346, + "grad_norm": 0.35737866163253784, + "learning_rate": 4.999401218672396e-05, + "loss": 0.069, + "num_input_tokens_seen": 23234000, + "step": 19085 + }, + { + "epoch": 2.1260719456509634, + "grad_norm": 0.4120480418205261, + "learning_rate": 4.999395889306489e-05, + "loss": 0.0794, + "num_input_tokens_seen": 23240336, + "step": 19090 + }, + { + "epoch": 2.1266288005345806, + "grad_norm": 0.568077027797699, + "learning_rate": 4.999390536331787e-05, + "loss": 0.0995, + "num_input_tokens_seen": 23246480, + "step": 19095 + }, + { + "epoch": 2.127185655418198, + "grad_norm": 1.253029465675354, + "learning_rate": 4.999385159748339e-05, + "loss": 0.1251, + "num_input_tokens_seen": 23252592, + "step": 19100 + }, + { + "epoch": 2.1277425103018155, + "grad_norm": 0.6500698924064636, + "learning_rate": 4.9993797595561944e-05, + "loss": 0.1372, + "num_input_tokens_seen": 23258384, + "step": 19105 + }, + { + "epoch": 2.1282993651854327, + "grad_norm": 1.5171504020690918, + "learning_rate": 4.999374335755407e-05, + "loss": 0.1854, + "num_input_tokens_seen": 23264464, + "step": 19110 + }, + { + "epoch": 2.12885622006905, + "grad_norm": 0.7322160601615906, + "learning_rate": 4.999368888346025e-05, + "loss": 0.1648, + "num_input_tokens_seen": 23270448, + "step": 19115 + }, + { + "epoch": 2.129413074952667, + "grad_norm": 0.051888369023799896, + "learning_rate": 4.999363417328102e-05, + "loss": 0.0685, + "num_input_tokens_seen": 23276368, + "step": 19120 + }, + { + "epoch": 2.129969929836285, + "grad_norm": 0.14714741706848145, + "learning_rate": 4.9993579227016896e-05, + "loss": 0.0174, + "num_input_tokens_seen": 23282672, + "step": 19125 + }, + { + "epoch": 2.130526784719902, + "grad_norm": 1.0304783582687378, + "learning_rate": 4.9993524044668385e-05, + "loss": 0.1289, + "num_input_tokens_seen": 23288848, + "step": 19130 + }, + { + "epoch": 2.1310836396035193, + "grad_norm": 2.184527635574341, + "learning_rate": 4.9993468626236016e-05, + "loss": 0.0876, + "num_input_tokens_seen": 23295088, + "step": 19135 + }, + { + "epoch": 2.1316404944871366, + "grad_norm": 0.6486743688583374, + "learning_rate": 4.999341297172032e-05, + "loss": 0.0879, + "num_input_tokens_seen": 23301104, + "step": 19140 + }, + { + "epoch": 2.132197349370754, + "grad_norm": 0.012992396019399166, + "learning_rate": 4.9993357081121806e-05, + "loss": 0.036, + "num_input_tokens_seen": 23307248, + "step": 19145 + }, + { + "epoch": 2.1327542042543715, + "grad_norm": 0.3611292243003845, + "learning_rate": 4.999330095444101e-05, + "loss": 0.0578, + "num_input_tokens_seen": 23312976, + "step": 19150 + }, + { + "epoch": 2.1333110591379887, + "grad_norm": 0.6971071362495422, + "learning_rate": 4.999324459167846e-05, + "loss": 0.1063, + "num_input_tokens_seen": 23319184, + "step": 19155 + }, + { + "epoch": 2.133867914021606, + "grad_norm": 0.6501970291137695, + "learning_rate": 4.999318799283469e-05, + "loss": 0.1115, + "num_input_tokens_seen": 23325328, + "step": 19160 + }, + { + "epoch": 2.134424768905223, + "grad_norm": 0.9651912450790405, + "learning_rate": 4.9993131157910244e-05, + "loss": 0.108, + "num_input_tokens_seen": 23331312, + "step": 19165 + }, + { + "epoch": 2.134981623788841, + "grad_norm": 0.7149854302406311, + "learning_rate": 4.9993074086905644e-05, + "loss": 0.2431, + "num_input_tokens_seen": 23337296, + "step": 19170 + }, + { + "epoch": 2.135538478672458, + "grad_norm": 1.0599905252456665, + "learning_rate": 4.9993016779821436e-05, + "loss": 0.0978, + "num_input_tokens_seen": 23343376, + "step": 19175 + }, + { + "epoch": 2.1360953335560753, + "grad_norm": 0.2627021372318268, + "learning_rate": 4.999295923665817e-05, + "loss": 0.0199, + "num_input_tokens_seen": 23349744, + "step": 19180 + }, + { + "epoch": 2.1366521884396925, + "grad_norm": 0.7256113290786743, + "learning_rate": 4.999290145741636e-05, + "loss": 0.0867, + "num_input_tokens_seen": 23356304, + "step": 19185 + }, + { + "epoch": 2.1372090433233097, + "grad_norm": 0.0029689553193747997, + "learning_rate": 4.999284344209658e-05, + "loss": 0.0495, + "num_input_tokens_seen": 23362320, + "step": 19190 + }, + { + "epoch": 2.1377658982069274, + "grad_norm": 0.42443856596946716, + "learning_rate": 4.999278519069938e-05, + "loss": 0.1577, + "num_input_tokens_seen": 23368368, + "step": 19195 + }, + { + "epoch": 2.1383227530905446, + "grad_norm": 1.4263150691986084, + "learning_rate": 4.999272670322529e-05, + "loss": 0.1796, + "num_input_tokens_seen": 23374320, + "step": 19200 + }, + { + "epoch": 2.138879607974162, + "grad_norm": 1.032880187034607, + "learning_rate": 4.9992667979674874e-05, + "loss": 0.147, + "num_input_tokens_seen": 23380240, + "step": 19205 + }, + { + "epoch": 2.139436462857779, + "grad_norm": 0.007735844235867262, + "learning_rate": 4.9992609020048685e-05, + "loss": 0.0932, + "num_input_tokens_seen": 23386288, + "step": 19210 + }, + { + "epoch": 2.139993317741397, + "grad_norm": 0.3295755088329315, + "learning_rate": 4.999254982434728e-05, + "loss": 0.1, + "num_input_tokens_seen": 23392176, + "step": 19215 + }, + { + "epoch": 2.140550172625014, + "grad_norm": 0.17364336550235748, + "learning_rate": 4.999249039257122e-05, + "loss": 0.0894, + "num_input_tokens_seen": 23398256, + "step": 19220 + }, + { + "epoch": 2.1411070275086312, + "grad_norm": 0.032086461782455444, + "learning_rate": 4.999243072472106e-05, + "loss": 0.1336, + "num_input_tokens_seen": 23404560, + "step": 19225 + }, + { + "epoch": 2.1416638823922485, + "grad_norm": 0.28048503398895264, + "learning_rate": 4.999237082079737e-05, + "loss": 0.0475, + "num_input_tokens_seen": 23410736, + "step": 19230 + }, + { + "epoch": 2.1422207372758657, + "grad_norm": 0.21233811974525452, + "learning_rate": 4.9992310680800725e-05, + "loss": 0.1113, + "num_input_tokens_seen": 23416752, + "step": 19235 + }, + { + "epoch": 2.1427775921594834, + "grad_norm": 0.6981341242790222, + "learning_rate": 4.999225030473167e-05, + "loss": 0.2291, + "num_input_tokens_seen": 23422704, + "step": 19240 + }, + { + "epoch": 2.1433344470431006, + "grad_norm": 0.020614031702280045, + "learning_rate": 4.999218969259078e-05, + "loss": 0.0739, + "num_input_tokens_seen": 23429008, + "step": 19245 + }, + { + "epoch": 2.143891301926718, + "grad_norm": 0.9642356038093567, + "learning_rate": 4.999212884437865e-05, + "loss": 0.1076, + "num_input_tokens_seen": 23435216, + "step": 19250 + }, + { + "epoch": 2.144448156810335, + "grad_norm": 0.09507819265127182, + "learning_rate": 4.999206776009584e-05, + "loss": 0.0489, + "num_input_tokens_seen": 23441520, + "step": 19255 + }, + { + "epoch": 2.1450050116939527, + "grad_norm": 0.07862278819084167, + "learning_rate": 4.999200643974292e-05, + "loss": 0.103, + "num_input_tokens_seen": 23447952, + "step": 19260 + }, + { + "epoch": 2.14556186657757, + "grad_norm": 0.7730852365493774, + "learning_rate": 4.999194488332048e-05, + "loss": 0.0585, + "num_input_tokens_seen": 23454512, + "step": 19265 + }, + { + "epoch": 2.146118721461187, + "grad_norm": 0.49076420068740845, + "learning_rate": 4.9991883090829096e-05, + "loss": 0.1347, + "num_input_tokens_seen": 23460048, + "step": 19270 + }, + { + "epoch": 2.1466755763448044, + "grad_norm": 0.08732887357473373, + "learning_rate": 4.999182106226935e-05, + "loss": 0.0516, + "num_input_tokens_seen": 23465968, + "step": 19275 + }, + { + "epoch": 2.1472324312284217, + "grad_norm": 0.5754145979881287, + "learning_rate": 4.999175879764183e-05, + "loss": 0.0779, + "num_input_tokens_seen": 23472176, + "step": 19280 + }, + { + "epoch": 2.1477892861120393, + "grad_norm": 2.0552878379821777, + "learning_rate": 4.999169629694713e-05, + "loss": 0.1803, + "num_input_tokens_seen": 23478224, + "step": 19285 + }, + { + "epoch": 2.1483461409956566, + "grad_norm": 0.3371334373950958, + "learning_rate": 4.999163356018584e-05, + "loss": 0.17, + "num_input_tokens_seen": 23484208, + "step": 19290 + }, + { + "epoch": 2.148902995879274, + "grad_norm": 0.6126899123191833, + "learning_rate": 4.999157058735854e-05, + "loss": 0.1322, + "num_input_tokens_seen": 23489936, + "step": 19295 + }, + { + "epoch": 2.149459850762891, + "grad_norm": 2.3215224742889404, + "learning_rate": 4.999150737846583e-05, + "loss": 0.1381, + "num_input_tokens_seen": 23495824, + "step": 19300 + }, + { + "epoch": 2.1500167056465087, + "grad_norm": 0.7356218695640564, + "learning_rate": 4.999144393350831e-05, + "loss": 0.0765, + "num_input_tokens_seen": 23501360, + "step": 19305 + }, + { + "epoch": 2.150573560530126, + "grad_norm": 0.18004699051380157, + "learning_rate": 4.9991380252486585e-05, + "loss": 0.0587, + "num_input_tokens_seen": 23507504, + "step": 19310 + }, + { + "epoch": 2.151130415413743, + "grad_norm": 0.08403555303812027, + "learning_rate": 4.999131633540125e-05, + "loss": 0.1379, + "num_input_tokens_seen": 23513776, + "step": 19315 + }, + { + "epoch": 2.1516872702973604, + "grad_norm": 0.6090229153633118, + "learning_rate": 4.9991252182252914e-05, + "loss": 0.1417, + "num_input_tokens_seen": 23519952, + "step": 19320 + }, + { + "epoch": 2.1522441251809776, + "grad_norm": 0.08248423784971237, + "learning_rate": 4.9991187793042174e-05, + "loss": 0.2116, + "num_input_tokens_seen": 23526000, + "step": 19325 + }, + { + "epoch": 2.1528009800645953, + "grad_norm": 0.6807165741920471, + "learning_rate": 4.999112316776964e-05, + "loss": 0.1693, + "num_input_tokens_seen": 23532208, + "step": 19330 + }, + { + "epoch": 2.1533578349482125, + "grad_norm": 0.5035737156867981, + "learning_rate": 4.999105830643592e-05, + "loss": 0.0485, + "num_input_tokens_seen": 23538032, + "step": 19335 + }, + { + "epoch": 2.1539146898318298, + "grad_norm": 1.5263309478759766, + "learning_rate": 4.999099320904165e-05, + "loss": 0.0984, + "num_input_tokens_seen": 23544528, + "step": 19340 + }, + { + "epoch": 2.154471544715447, + "grad_norm": 0.839713454246521, + "learning_rate": 4.9990927875587414e-05, + "loss": 0.1753, + "num_input_tokens_seen": 23550864, + "step": 19345 + }, + { + "epoch": 2.1550283995990647, + "grad_norm": 0.037031251937150955, + "learning_rate": 4.9990862306073836e-05, + "loss": 0.0897, + "num_input_tokens_seen": 23557040, + "step": 19350 + }, + { + "epoch": 2.155585254482682, + "grad_norm": 0.0672701895236969, + "learning_rate": 4.9990796500501555e-05, + "loss": 0.0498, + "num_input_tokens_seen": 23563376, + "step": 19355 + }, + { + "epoch": 2.156142109366299, + "grad_norm": 0.7699568867683411, + "learning_rate": 4.999073045887117e-05, + "loss": 0.074, + "num_input_tokens_seen": 23569712, + "step": 19360 + }, + { + "epoch": 2.1566989642499164, + "grad_norm": 0.5398870706558228, + "learning_rate": 4.999066418118332e-05, + "loss": 0.0612, + "num_input_tokens_seen": 23575760, + "step": 19365 + }, + { + "epoch": 2.1572558191335336, + "grad_norm": 0.34169942140579224, + "learning_rate": 4.999059766743862e-05, + "loss": 0.1057, + "num_input_tokens_seen": 23581616, + "step": 19370 + }, + { + "epoch": 2.1578126740171513, + "grad_norm": 0.02570878341794014, + "learning_rate": 4.99905309176377e-05, + "loss": 0.1379, + "num_input_tokens_seen": 23587888, + "step": 19375 + }, + { + "epoch": 2.1583695289007685, + "grad_norm": 0.6887845396995544, + "learning_rate": 4.9990463931781196e-05, + "loss": 0.0799, + "num_input_tokens_seen": 23594256, + "step": 19380 + }, + { + "epoch": 2.1589263837843857, + "grad_norm": 0.10324607044458389, + "learning_rate": 4.999039670986974e-05, + "loss": 0.0211, + "num_input_tokens_seen": 23600400, + "step": 19385 + }, + { + "epoch": 2.159483238668003, + "grad_norm": 0.31005170941352844, + "learning_rate": 4.999032925190397e-05, + "loss": 0.1458, + "num_input_tokens_seen": 23606608, + "step": 19390 + }, + { + "epoch": 2.1600400935516206, + "grad_norm": 1.277263879776001, + "learning_rate": 4.999026155788451e-05, + "loss": 0.18, + "num_input_tokens_seen": 23612592, + "step": 19395 + }, + { + "epoch": 2.160596948435238, + "grad_norm": 0.07518824189901352, + "learning_rate": 4.999019362781201e-05, + "loss": 0.0479, + "num_input_tokens_seen": 23618768, + "step": 19400 + }, + { + "epoch": 2.161153803318855, + "grad_norm": 0.3616437613964081, + "learning_rate": 4.999012546168711e-05, + "loss": 0.0764, + "num_input_tokens_seen": 23624784, + "step": 19405 + }, + { + "epoch": 2.1617106582024723, + "grad_norm": 0.4547424614429474, + "learning_rate": 4.999005705951045e-05, + "loss": 0.1146, + "num_input_tokens_seen": 23630736, + "step": 19410 + }, + { + "epoch": 2.16226751308609, + "grad_norm": 1.0346652269363403, + "learning_rate": 4.9989988421282686e-05, + "loss": 0.1886, + "num_input_tokens_seen": 23636752, + "step": 19415 + }, + { + "epoch": 2.1628243679697072, + "grad_norm": 1.3521008491516113, + "learning_rate": 4.998991954700445e-05, + "loss": 0.2067, + "num_input_tokens_seen": 23642480, + "step": 19420 + }, + { + "epoch": 2.1633812228533245, + "grad_norm": 0.7545886635780334, + "learning_rate": 4.998985043667641e-05, + "loss": 0.2732, + "num_input_tokens_seen": 23648112, + "step": 19425 + }, + { + "epoch": 2.1639380777369417, + "grad_norm": 0.9726218581199646, + "learning_rate": 4.998978109029921e-05, + "loss": 0.067, + "num_input_tokens_seen": 23654352, + "step": 19430 + }, + { + "epoch": 2.164494932620559, + "grad_norm": 0.060950178653001785, + "learning_rate": 4.9989711507873505e-05, + "loss": 0.0732, + "num_input_tokens_seen": 23660816, + "step": 19435 + }, + { + "epoch": 2.1650517875041766, + "grad_norm": 0.645648181438446, + "learning_rate": 4.998964168939995e-05, + "loss": 0.1337, + "num_input_tokens_seen": 23666896, + "step": 19440 + }, + { + "epoch": 2.165608642387794, + "grad_norm": 0.7017048001289368, + "learning_rate": 4.9989571634879214e-05, + "loss": 0.1166, + "num_input_tokens_seen": 23672880, + "step": 19445 + }, + { + "epoch": 2.166165497271411, + "grad_norm": 0.06727106869220734, + "learning_rate": 4.998950134431195e-05, + "loss": 0.0509, + "num_input_tokens_seen": 23679120, + "step": 19450 + }, + { + "epoch": 2.1667223521550283, + "grad_norm": 0.021385017782449722, + "learning_rate": 4.998943081769882e-05, + "loss": 0.0592, + "num_input_tokens_seen": 23685328, + "step": 19455 + }, + { + "epoch": 2.1672792070386455, + "grad_norm": 0.7645747065544128, + "learning_rate": 4.99893600550405e-05, + "loss": 0.1857, + "num_input_tokens_seen": 23691216, + "step": 19460 + }, + { + "epoch": 2.167836061922263, + "grad_norm": 0.24888670444488525, + "learning_rate": 4.9989289056337655e-05, + "loss": 0.1584, + "num_input_tokens_seen": 23697328, + "step": 19465 + }, + { + "epoch": 2.1683929168058804, + "grad_norm": 0.2577127516269684, + "learning_rate": 4.9989217821590956e-05, + "loss": 0.1096, + "num_input_tokens_seen": 23703600, + "step": 19470 + }, + { + "epoch": 2.1689497716894977, + "grad_norm": 0.3055659532546997, + "learning_rate": 4.9989146350801065e-05, + "loss": 0.0636, + "num_input_tokens_seen": 23709488, + "step": 19475 + }, + { + "epoch": 2.169506626573115, + "grad_norm": 0.3129875063896179, + "learning_rate": 4.998907464396867e-05, + "loss": 0.0845, + "num_input_tokens_seen": 23714928, + "step": 19480 + }, + { + "epoch": 2.1700634814567326, + "grad_norm": 0.2485097348690033, + "learning_rate": 4.9989002701094447e-05, + "loss": 0.0706, + "num_input_tokens_seen": 23721360, + "step": 19485 + }, + { + "epoch": 2.17062033634035, + "grad_norm": 1.3774535655975342, + "learning_rate": 4.998893052217907e-05, + "loss": 0.2315, + "num_input_tokens_seen": 23726896, + "step": 19490 + }, + { + "epoch": 2.171177191223967, + "grad_norm": 0.8948656916618347, + "learning_rate": 4.998885810722322e-05, + "loss": 0.0939, + "num_input_tokens_seen": 23733008, + "step": 19495 + }, + { + "epoch": 2.1717340461075842, + "grad_norm": 0.3356379270553589, + "learning_rate": 4.9988785456227596e-05, + "loss": 0.0758, + "num_input_tokens_seen": 23739088, + "step": 19500 + }, + { + "epoch": 2.172290900991202, + "grad_norm": 0.2753540873527527, + "learning_rate": 4.9988712569192857e-05, + "loss": 0.0491, + "num_input_tokens_seen": 23745264, + "step": 19505 + }, + { + "epoch": 2.172847755874819, + "grad_norm": 0.03672749549150467, + "learning_rate": 4.9988639446119715e-05, + "loss": 0.0623, + "num_input_tokens_seen": 23751344, + "step": 19510 + }, + { + "epoch": 2.1734046107584364, + "grad_norm": 0.082453154027462, + "learning_rate": 4.9988566087008855e-05, + "loss": 0.0709, + "num_input_tokens_seen": 23757296, + "step": 19515 + }, + { + "epoch": 2.1739614656420536, + "grad_norm": 0.16644950211048126, + "learning_rate": 4.998849249186096e-05, + "loss": 0.0473, + "num_input_tokens_seen": 23763184, + "step": 19520 + }, + { + "epoch": 2.174518320525671, + "grad_norm": 1.7340458631515503, + "learning_rate": 4.998841866067674e-05, + "loss": 0.1367, + "num_input_tokens_seen": 23769296, + "step": 19525 + }, + { + "epoch": 2.1750751754092885, + "grad_norm": 0.5349230170249939, + "learning_rate": 4.998834459345688e-05, + "loss": 0.1222, + "num_input_tokens_seen": 23775312, + "step": 19530 + }, + { + "epoch": 2.1756320302929057, + "grad_norm": 0.2346167415380478, + "learning_rate": 4.998827029020209e-05, + "loss": 0.1268, + "num_input_tokens_seen": 23780912, + "step": 19535 + }, + { + "epoch": 2.176188885176523, + "grad_norm": 0.5686939358711243, + "learning_rate": 4.998819575091307e-05, + "loss": 0.1387, + "num_input_tokens_seen": 23786896, + "step": 19540 + }, + { + "epoch": 2.17674574006014, + "grad_norm": 0.5579424500465393, + "learning_rate": 4.998812097559051e-05, + "loss": 0.0713, + "num_input_tokens_seen": 23793040, + "step": 19545 + }, + { + "epoch": 2.1773025949437574, + "grad_norm": 0.5229225754737854, + "learning_rate": 4.9988045964235134e-05, + "loss": 0.0702, + "num_input_tokens_seen": 23799312, + "step": 19550 + }, + { + "epoch": 2.177859449827375, + "grad_norm": 0.9229505658149719, + "learning_rate": 4.9987970716847644e-05, + "loss": 0.1856, + "num_input_tokens_seen": 23805456, + "step": 19555 + }, + { + "epoch": 2.1784163047109923, + "grad_norm": 0.779632031917572, + "learning_rate": 4.998789523342875e-05, + "loss": 0.0803, + "num_input_tokens_seen": 23811184, + "step": 19560 + }, + { + "epoch": 2.1789731595946096, + "grad_norm": 0.009639741852879524, + "learning_rate": 4.998781951397917e-05, + "loss": 0.0392, + "num_input_tokens_seen": 23817360, + "step": 19565 + }, + { + "epoch": 2.179530014478227, + "grad_norm": 0.3473644554615021, + "learning_rate": 4.9987743558499604e-05, + "loss": 0.0957, + "num_input_tokens_seen": 23823472, + "step": 19570 + }, + { + "epoch": 2.1800868693618445, + "grad_norm": 0.3923041820526123, + "learning_rate": 4.9987667366990786e-05, + "loss": 0.1269, + "num_input_tokens_seen": 23829744, + "step": 19575 + }, + { + "epoch": 2.1806437242454617, + "grad_norm": 0.06514366716146469, + "learning_rate": 4.998759093945343e-05, + "loss": 0.0283, + "num_input_tokens_seen": 23835984, + "step": 19580 + }, + { + "epoch": 2.181200579129079, + "grad_norm": 0.5688267946243286, + "learning_rate": 4.998751427588826e-05, + "loss": 0.0555, + "num_input_tokens_seen": 23842448, + "step": 19585 + }, + { + "epoch": 2.181757434012696, + "grad_norm": 0.0032031633891165257, + "learning_rate": 4.9987437376295996e-05, + "loss": 0.0882, + "num_input_tokens_seen": 23848624, + "step": 19590 + }, + { + "epoch": 2.182314288896314, + "grad_norm": 0.6941012740135193, + "learning_rate": 4.9987360240677364e-05, + "loss": 0.1033, + "num_input_tokens_seen": 23854704, + "step": 19595 + }, + { + "epoch": 2.182871143779931, + "grad_norm": 0.7373862266540527, + "learning_rate": 4.99872828690331e-05, + "loss": 0.1204, + "num_input_tokens_seen": 23860656, + "step": 19600 + }, + { + "epoch": 2.1834279986635483, + "grad_norm": 0.7243894934654236, + "learning_rate": 4.9987205261363924e-05, + "loss": 0.0554, + "num_input_tokens_seen": 23866672, + "step": 19605 + }, + { + "epoch": 2.1839848535471655, + "grad_norm": 2.4409472942352295, + "learning_rate": 4.998712741767058e-05, + "loss": 0.1864, + "num_input_tokens_seen": 23872752, + "step": 19610 + }, + { + "epoch": 2.1845417084307828, + "grad_norm": 0.06113755702972412, + "learning_rate": 4.9987049337953795e-05, + "loss": 0.0355, + "num_input_tokens_seen": 23879152, + "step": 19615 + }, + { + "epoch": 2.1850985633144004, + "grad_norm": 0.8606618642807007, + "learning_rate": 4.9986971022214315e-05, + "loss": 0.1549, + "num_input_tokens_seen": 23885328, + "step": 19620 + }, + { + "epoch": 2.1856554181980177, + "grad_norm": 1.233971357345581, + "learning_rate": 4.9986892470452865e-05, + "loss": 0.2003, + "num_input_tokens_seen": 23891664, + "step": 19625 + }, + { + "epoch": 2.186212273081635, + "grad_norm": 3.0858752727508545, + "learning_rate": 4.998681368267021e-05, + "loss": 0.1945, + "num_input_tokens_seen": 23897904, + "step": 19630 + }, + { + "epoch": 2.186769127965252, + "grad_norm": 0.8570969104766846, + "learning_rate": 4.9986734658867065e-05, + "loss": 0.0822, + "num_input_tokens_seen": 23903760, + "step": 19635 + }, + { + "epoch": 2.1873259828488694, + "grad_norm": 0.043332669883966446, + "learning_rate": 4.9986655399044205e-05, + "loss": 0.0785, + "num_input_tokens_seen": 23909552, + "step": 19640 + }, + { + "epoch": 2.187882837732487, + "grad_norm": 0.045605938881635666, + "learning_rate": 4.998657590320236e-05, + "loss": 0.0898, + "num_input_tokens_seen": 23915696, + "step": 19645 + }, + { + "epoch": 2.1884396926161043, + "grad_norm": 0.17505277693271637, + "learning_rate": 4.9986496171342286e-05, + "loss": 0.0334, + "num_input_tokens_seen": 23922128, + "step": 19650 + }, + { + "epoch": 2.1889965474997215, + "grad_norm": 0.6788631081581116, + "learning_rate": 4.998641620346474e-05, + "loss": 0.1517, + "num_input_tokens_seen": 23928272, + "step": 19655 + }, + { + "epoch": 2.1895534023833387, + "grad_norm": 1.5195025205612183, + "learning_rate": 4.9986335999570464e-05, + "loss": 0.1281, + "num_input_tokens_seen": 23934416, + "step": 19660 + }, + { + "epoch": 2.1901102572669564, + "grad_norm": 0.07752861082553864, + "learning_rate": 4.998625555966024e-05, + "loss": 0.0768, + "num_input_tokens_seen": 23940592, + "step": 19665 + }, + { + "epoch": 2.1906671121505736, + "grad_norm": 1.0002944469451904, + "learning_rate": 4.9986174883734805e-05, + "loss": 0.0994, + "num_input_tokens_seen": 23946384, + "step": 19670 + }, + { + "epoch": 2.191223967034191, + "grad_norm": 0.06527955830097198, + "learning_rate": 4.998609397179494e-05, + "loss": 0.0454, + "num_input_tokens_seen": 23952432, + "step": 19675 + }, + { + "epoch": 2.191780821917808, + "grad_norm": 0.0028768053743988276, + "learning_rate": 4.998601282384139e-05, + "loss": 0.0788, + "num_input_tokens_seen": 23958608, + "step": 19680 + }, + { + "epoch": 2.1923376768014258, + "grad_norm": 1.111298680305481, + "learning_rate": 4.998593143987492e-05, + "loss": 0.0419, + "num_input_tokens_seen": 23964432, + "step": 19685 + }, + { + "epoch": 2.192894531685043, + "grad_norm": 0.36629819869995117, + "learning_rate": 4.998584981989632e-05, + "loss": 0.0395, + "num_input_tokens_seen": 23970544, + "step": 19690 + }, + { + "epoch": 2.1934513865686602, + "grad_norm": 0.008782626129686832, + "learning_rate": 4.998576796390636e-05, + "loss": 0.0731, + "num_input_tokens_seen": 23976592, + "step": 19695 + }, + { + "epoch": 2.1940082414522775, + "grad_norm": 0.11563733220100403, + "learning_rate": 4.998568587190579e-05, + "loss": 0.0869, + "num_input_tokens_seen": 23982704, + "step": 19700 + }, + { + "epoch": 2.1945650963358947, + "grad_norm": 3.130432605743408, + "learning_rate": 4.9985603543895406e-05, + "loss": 0.1363, + "num_input_tokens_seen": 23989008, + "step": 19705 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 1.3579329252243042, + "learning_rate": 4.9985520979875976e-05, + "loss": 0.1299, + "num_input_tokens_seen": 23995120, + "step": 19710 + }, + { + "epoch": 2.1956788061031296, + "grad_norm": 0.6488685607910156, + "learning_rate": 4.998543817984828e-05, + "loss": 0.188, + "num_input_tokens_seen": 24001200, + "step": 19715 + }, + { + "epoch": 2.196235660986747, + "grad_norm": 0.2252630591392517, + "learning_rate": 4.9985355143813104e-05, + "loss": 0.1097, + "num_input_tokens_seen": 24007504, + "step": 19720 + }, + { + "epoch": 2.196792515870364, + "grad_norm": 0.2295675426721573, + "learning_rate": 4.9985271871771234e-05, + "loss": 0.156, + "num_input_tokens_seen": 24012560, + "step": 19725 + }, + { + "epoch": 2.1973493707539813, + "grad_norm": 1.3053944110870361, + "learning_rate": 4.9985188363723446e-05, + "loss": 0.2171, + "num_input_tokens_seen": 24018928, + "step": 19730 + }, + { + "epoch": 2.197906225637599, + "grad_norm": 1.5987704992294312, + "learning_rate": 4.998510461967054e-05, + "loss": 0.1875, + "num_input_tokens_seen": 24024784, + "step": 19735 + }, + { + "epoch": 2.198463080521216, + "grad_norm": 0.022090958431363106, + "learning_rate": 4.99850206396133e-05, + "loss": 0.1957, + "num_input_tokens_seen": 24030544, + "step": 19740 + }, + { + "epoch": 2.1990199354048334, + "grad_norm": 0.5376138091087341, + "learning_rate": 4.998493642355253e-05, + "loss": 0.1424, + "num_input_tokens_seen": 24036656, + "step": 19745 + }, + { + "epoch": 2.1995767902884507, + "grad_norm": 0.41459622979164124, + "learning_rate": 4.998485197148901e-05, + "loss": 0.0764, + "num_input_tokens_seen": 24043056, + "step": 19750 + }, + { + "epoch": 2.2001336451720683, + "grad_norm": 0.0541267916560173, + "learning_rate": 4.998476728342355e-05, + "loss": 0.209, + "num_input_tokens_seen": 24049808, + "step": 19755 + }, + { + "epoch": 2.2006905000556856, + "grad_norm": 0.04069974645972252, + "learning_rate": 4.998468235935695e-05, + "loss": 0.0575, + "num_input_tokens_seen": 24055920, + "step": 19760 + }, + { + "epoch": 2.201247354939303, + "grad_norm": 0.701450765132904, + "learning_rate": 4.9984597199289994e-05, + "loss": 0.0677, + "num_input_tokens_seen": 24062064, + "step": 19765 + }, + { + "epoch": 2.20180420982292, + "grad_norm": 0.5137426257133484, + "learning_rate": 4.998451180322351e-05, + "loss": 0.0872, + "num_input_tokens_seen": 24067984, + "step": 19770 + }, + { + "epoch": 2.2023610647065377, + "grad_norm": 0.20096828043460846, + "learning_rate": 4.9984426171158294e-05, + "loss": 0.1201, + "num_input_tokens_seen": 24074032, + "step": 19775 + }, + { + "epoch": 2.202917919590155, + "grad_norm": 0.028403395786881447, + "learning_rate": 4.998434030309516e-05, + "loss": 0.2267, + "num_input_tokens_seen": 24080368, + "step": 19780 + }, + { + "epoch": 2.203474774473772, + "grad_norm": 2.9861578941345215, + "learning_rate": 4.998425419903491e-05, + "loss": 0.1612, + "num_input_tokens_seen": 24086224, + "step": 19785 + }, + { + "epoch": 2.2040316293573894, + "grad_norm": 1.7575852870941162, + "learning_rate": 4.998416785897836e-05, + "loss": 0.2387, + "num_input_tokens_seen": 24090832, + "step": 19790 + }, + { + "epoch": 2.2045884842410066, + "grad_norm": 0.2304023951292038, + "learning_rate": 4.998408128292633e-05, + "loss": 0.1078, + "num_input_tokens_seen": 24096720, + "step": 19795 + }, + { + "epoch": 2.2051453391246243, + "grad_norm": 0.8920521140098572, + "learning_rate": 4.9983994470879634e-05, + "loss": 0.1382, + "num_input_tokens_seen": 24102864, + "step": 19800 + }, + { + "epoch": 2.2057021940082415, + "grad_norm": 0.40282171964645386, + "learning_rate": 4.998390742283909e-05, + "loss": 0.0807, + "num_input_tokens_seen": 24108976, + "step": 19805 + }, + { + "epoch": 2.2062590488918588, + "grad_norm": 0.2512061595916748, + "learning_rate": 4.998382013880553e-05, + "loss": 0.1376, + "num_input_tokens_seen": 24114768, + "step": 19810 + }, + { + "epoch": 2.206815903775476, + "grad_norm": 0.9358846545219421, + "learning_rate": 4.998373261877977e-05, + "loss": 0.1185, + "num_input_tokens_seen": 24121104, + "step": 19815 + }, + { + "epoch": 2.207372758659093, + "grad_norm": 0.5020219683647156, + "learning_rate": 4.9983644862762634e-05, + "loss": 0.1426, + "num_input_tokens_seen": 24127248, + "step": 19820 + }, + { + "epoch": 2.207929613542711, + "grad_norm": 0.576604425907135, + "learning_rate": 4.998355687075496e-05, + "loss": 0.0587, + "num_input_tokens_seen": 24133520, + "step": 19825 + }, + { + "epoch": 2.208486468426328, + "grad_norm": 1.3408637046813965, + "learning_rate": 4.9983468642757575e-05, + "loss": 0.0926, + "num_input_tokens_seen": 24139728, + "step": 19830 + }, + { + "epoch": 2.2090433233099454, + "grad_norm": 0.6106126308441162, + "learning_rate": 4.998338017877131e-05, + "loss": 0.1623, + "num_input_tokens_seen": 24145680, + "step": 19835 + }, + { + "epoch": 2.2096001781935626, + "grad_norm": 2.549043655395508, + "learning_rate": 4.998329147879701e-05, + "loss": 0.1245, + "num_input_tokens_seen": 24151920, + "step": 19840 + }, + { + "epoch": 2.2101570330771803, + "grad_norm": 0.6665663719177246, + "learning_rate": 4.99832025428355e-05, + "loss": 0.1022, + "num_input_tokens_seen": 24157840, + "step": 19845 + }, + { + "epoch": 2.2107138879607975, + "grad_norm": 0.5063521265983582, + "learning_rate": 4.998311337088762e-05, + "loss": 0.1497, + "num_input_tokens_seen": 24163824, + "step": 19850 + }, + { + "epoch": 2.2112707428444147, + "grad_norm": 0.6084859371185303, + "learning_rate": 4.9983023962954226e-05, + "loss": 0.1622, + "num_input_tokens_seen": 24170064, + "step": 19855 + }, + { + "epoch": 2.211827597728032, + "grad_norm": 0.6219589710235596, + "learning_rate": 4.998293431903616e-05, + "loss": 0.1011, + "num_input_tokens_seen": 24176272, + "step": 19860 + }, + { + "epoch": 2.2123844526116496, + "grad_norm": 0.04433892294764519, + "learning_rate": 4.9982844439134256e-05, + "loss": 0.0963, + "num_input_tokens_seen": 24182384, + "step": 19865 + }, + { + "epoch": 2.212941307495267, + "grad_norm": 0.13948091864585876, + "learning_rate": 4.998275432324937e-05, + "loss": 0.1058, + "num_input_tokens_seen": 24188400, + "step": 19870 + }, + { + "epoch": 2.213498162378884, + "grad_norm": 2.3132498264312744, + "learning_rate": 4.9982663971382355e-05, + "loss": 0.0906, + "num_input_tokens_seen": 24194448, + "step": 19875 + }, + { + "epoch": 2.2140550172625013, + "grad_norm": 0.5889950394630432, + "learning_rate": 4.9982573383534056e-05, + "loss": 0.1654, + "num_input_tokens_seen": 24200368, + "step": 19880 + }, + { + "epoch": 2.2146118721461185, + "grad_norm": 0.026936309412121773, + "learning_rate": 4.998248255970535e-05, + "loss": 0.2092, + "num_input_tokens_seen": 24206448, + "step": 19885 + }, + { + "epoch": 2.215168727029736, + "grad_norm": 0.06618215888738632, + "learning_rate": 4.9982391499897066e-05, + "loss": 0.031, + "num_input_tokens_seen": 24212976, + "step": 19890 + }, + { + "epoch": 2.2157255819133534, + "grad_norm": 0.9263511896133423, + "learning_rate": 4.998230020411009e-05, + "loss": 0.0863, + "num_input_tokens_seen": 24219344, + "step": 19895 + }, + { + "epoch": 2.2162824367969707, + "grad_norm": 0.7270582914352417, + "learning_rate": 4.998220867234526e-05, + "loss": 0.0681, + "num_input_tokens_seen": 24225296, + "step": 19900 + }, + { + "epoch": 2.216839291680588, + "grad_norm": 1.1351109743118286, + "learning_rate": 4.9982116904603474e-05, + "loss": 0.1304, + "num_input_tokens_seen": 24231280, + "step": 19905 + }, + { + "epoch": 2.217396146564205, + "grad_norm": 0.37550732493400574, + "learning_rate": 4.998202490088556e-05, + "loss": 0.0719, + "num_input_tokens_seen": 24237136, + "step": 19910 + }, + { + "epoch": 2.217953001447823, + "grad_norm": 0.9634771943092346, + "learning_rate": 4.998193266119242e-05, + "loss": 0.1005, + "num_input_tokens_seen": 24242640, + "step": 19915 + }, + { + "epoch": 2.21850985633144, + "grad_norm": 0.1302693784236908, + "learning_rate": 4.99818401855249e-05, + "loss": 0.0651, + "num_input_tokens_seen": 24248784, + "step": 19920 + }, + { + "epoch": 2.2190667112150573, + "grad_norm": 0.28689560294151306, + "learning_rate": 4.998174747388389e-05, + "loss": 0.0946, + "num_input_tokens_seen": 24254896, + "step": 19925 + }, + { + "epoch": 2.2196235660986745, + "grad_norm": 0.14958368241786957, + "learning_rate": 4.998165452627025e-05, + "loss": 0.1293, + "num_input_tokens_seen": 24261360, + "step": 19930 + }, + { + "epoch": 2.220180420982292, + "grad_norm": 1.3162128925323486, + "learning_rate": 4.998156134268488e-05, + "loss": 0.1281, + "num_input_tokens_seen": 24267216, + "step": 19935 + }, + { + "epoch": 2.2207372758659094, + "grad_norm": 0.4945046901702881, + "learning_rate": 4.9981467923128645e-05, + "loss": 0.0496, + "num_input_tokens_seen": 24273424, + "step": 19940 + }, + { + "epoch": 2.2212941307495266, + "grad_norm": 1.4015787839889526, + "learning_rate": 4.9981374267602426e-05, + "loss": 0.0948, + "num_input_tokens_seen": 24279440, + "step": 19945 + }, + { + "epoch": 2.221850985633144, + "grad_norm": 0.16209395229816437, + "learning_rate": 4.998128037610712e-05, + "loss": 0.1006, + "num_input_tokens_seen": 24285360, + "step": 19950 + }, + { + "epoch": 2.2224078405167615, + "grad_norm": 1.7429169416427612, + "learning_rate": 4.99811862486436e-05, + "loss": 0.332, + "num_input_tokens_seen": 24291344, + "step": 19955 + }, + { + "epoch": 2.2229646954003788, + "grad_norm": 0.4960055649280548, + "learning_rate": 4.998109188521276e-05, + "loss": 0.0764, + "num_input_tokens_seen": 24297424, + "step": 19960 + }, + { + "epoch": 2.223521550283996, + "grad_norm": 0.9727937579154968, + "learning_rate": 4.99809972858155e-05, + "loss": 0.1497, + "num_input_tokens_seen": 24303472, + "step": 19965 + }, + { + "epoch": 2.2240784051676132, + "grad_norm": 0.29567453265190125, + "learning_rate": 4.99809024504527e-05, + "loss": 0.0591, + "num_input_tokens_seen": 24309680, + "step": 19970 + }, + { + "epoch": 2.2246352600512305, + "grad_norm": 1.093309760093689, + "learning_rate": 4.998080737912526e-05, + "loss": 0.1172, + "num_input_tokens_seen": 24315568, + "step": 19975 + }, + { + "epoch": 2.225192114934848, + "grad_norm": 0.00704479543492198, + "learning_rate": 4.998071207183409e-05, + "loss": 0.1665, + "num_input_tokens_seen": 24321680, + "step": 19980 + }, + { + "epoch": 2.2257489698184654, + "grad_norm": 0.5528188943862915, + "learning_rate": 4.998061652858007e-05, + "loss": 0.0662, + "num_input_tokens_seen": 24327664, + "step": 19985 + }, + { + "epoch": 2.2263058247020826, + "grad_norm": 0.5160256028175354, + "learning_rate": 4.998052074936412e-05, + "loss": 0.0901, + "num_input_tokens_seen": 24333712, + "step": 19990 + }, + { + "epoch": 2.2268626795857, + "grad_norm": 1.2125499248504639, + "learning_rate": 4.998042473418714e-05, + "loss": 0.1426, + "num_input_tokens_seen": 24339856, + "step": 19995 + }, + { + "epoch": 2.227419534469317, + "grad_norm": 0.09718145430088043, + "learning_rate": 4.998032848305002e-05, + "loss": 0.0309, + "num_input_tokens_seen": 24345968, + "step": 20000 + }, + { + "epoch": 2.2279763893529347, + "grad_norm": 0.05595550313591957, + "learning_rate": 4.99802319959537e-05, + "loss": 0.0446, + "num_input_tokens_seen": 24352080, + "step": 20005 + }, + { + "epoch": 2.228533244236552, + "grad_norm": 1.2011518478393555, + "learning_rate": 4.998013527289906e-05, + "loss": 0.1169, + "num_input_tokens_seen": 24358320, + "step": 20010 + }, + { + "epoch": 2.229090099120169, + "grad_norm": 0.6784024238586426, + "learning_rate": 4.9980038313887035e-05, + "loss": 0.0653, + "num_input_tokens_seen": 24364464, + "step": 20015 + }, + { + "epoch": 2.2296469540037864, + "grad_norm": 0.8307245969772339, + "learning_rate": 4.997994111891854e-05, + "loss": 0.1408, + "num_input_tokens_seen": 24370544, + "step": 20020 + }, + { + "epoch": 2.230203808887404, + "grad_norm": 0.5839716196060181, + "learning_rate": 4.9979843687994485e-05, + "loss": 0.1009, + "num_input_tokens_seen": 24376912, + "step": 20025 + }, + { + "epoch": 2.2307606637710213, + "grad_norm": 0.13408099114894867, + "learning_rate": 4.9979746021115784e-05, + "loss": 0.0165, + "num_input_tokens_seen": 24383248, + "step": 20030 + }, + { + "epoch": 2.2313175186546386, + "grad_norm": 0.07762368768453598, + "learning_rate": 4.997964811828338e-05, + "loss": 0.1261, + "num_input_tokens_seen": 24389648, + "step": 20035 + }, + { + "epoch": 2.231874373538256, + "grad_norm": 0.4883717894554138, + "learning_rate": 4.997954997949818e-05, + "loss": 0.111, + "num_input_tokens_seen": 24395696, + "step": 20040 + }, + { + "epoch": 2.2324312284218735, + "grad_norm": 2.8395230770111084, + "learning_rate": 4.997945160476112e-05, + "loss": 0.255, + "num_input_tokens_seen": 24402032, + "step": 20045 + }, + { + "epoch": 2.2329880833054907, + "grad_norm": 1.8725446462631226, + "learning_rate": 4.9979352994073123e-05, + "loss": 0.0522, + "num_input_tokens_seen": 24407984, + "step": 20050 + }, + { + "epoch": 2.233544938189108, + "grad_norm": 0.44739243388175964, + "learning_rate": 4.997925414743513e-05, + "loss": 0.0623, + "num_input_tokens_seen": 24414128, + "step": 20055 + }, + { + "epoch": 2.234101793072725, + "grad_norm": 0.824397623538971, + "learning_rate": 4.997915506484806e-05, + "loss": 0.1873, + "num_input_tokens_seen": 24419888, + "step": 20060 + }, + { + "epoch": 2.2346586479563424, + "grad_norm": 0.46044808626174927, + "learning_rate": 4.9979055746312863e-05, + "loss": 0.0828, + "num_input_tokens_seen": 24426128, + "step": 20065 + }, + { + "epoch": 2.23521550283996, + "grad_norm": 0.10615598410367966, + "learning_rate": 4.9978956191830476e-05, + "loss": 0.0612, + "num_input_tokens_seen": 24432336, + "step": 20070 + }, + { + "epoch": 2.2357723577235773, + "grad_norm": 0.8963614106178284, + "learning_rate": 4.997885640140182e-05, + "loss": 0.2003, + "num_input_tokens_seen": 24438352, + "step": 20075 + }, + { + "epoch": 2.2363292126071945, + "grad_norm": 0.8698864579200745, + "learning_rate": 4.9978756375027865e-05, + "loss": 0.1044, + "num_input_tokens_seen": 24444368, + "step": 20080 + }, + { + "epoch": 2.2368860674908118, + "grad_norm": 1.7351627349853516, + "learning_rate": 4.997865611270954e-05, + "loss": 0.0917, + "num_input_tokens_seen": 24450128, + "step": 20085 + }, + { + "epoch": 2.237442922374429, + "grad_norm": 0.28423386812210083, + "learning_rate": 4.9978555614447796e-05, + "loss": 0.0601, + "num_input_tokens_seen": 24456432, + "step": 20090 + }, + { + "epoch": 2.2379997772580467, + "grad_norm": 0.7426119446754456, + "learning_rate": 4.997845488024359e-05, + "loss": 0.1272, + "num_input_tokens_seen": 24462256, + "step": 20095 + }, + { + "epoch": 2.238556632141664, + "grad_norm": 0.1886018067598343, + "learning_rate": 4.997835391009785e-05, + "loss": 0.0556, + "num_input_tokens_seen": 24468336, + "step": 20100 + }, + { + "epoch": 2.239113487025281, + "grad_norm": 0.7608005404472351, + "learning_rate": 4.9978252704011554e-05, + "loss": 0.0911, + "num_input_tokens_seen": 24474448, + "step": 20105 + }, + { + "epoch": 2.2396703419088984, + "grad_norm": 0.3675571084022522, + "learning_rate": 4.997815126198564e-05, + "loss": 0.0998, + "num_input_tokens_seen": 24480048, + "step": 20110 + }, + { + "epoch": 2.240227196792516, + "grad_norm": 0.5253908634185791, + "learning_rate": 4.997804958402109e-05, + "loss": 0.1609, + "num_input_tokens_seen": 24486416, + "step": 20115 + }, + { + "epoch": 2.2407840516761333, + "grad_norm": 0.8553987741470337, + "learning_rate": 4.9977947670118844e-05, + "loss": 0.0431, + "num_input_tokens_seen": 24492464, + "step": 20120 + }, + { + "epoch": 2.2413409065597505, + "grad_norm": 0.1939583569765091, + "learning_rate": 4.997784552027986e-05, + "loss": 0.0842, + "num_input_tokens_seen": 24498640, + "step": 20125 + }, + { + "epoch": 2.2418977614433677, + "grad_norm": 0.4673714339733124, + "learning_rate": 4.9977743134505124e-05, + "loss": 0.0256, + "num_input_tokens_seen": 24504528, + "step": 20130 + }, + { + "epoch": 2.2424546163269854, + "grad_norm": 2.3451013565063477, + "learning_rate": 4.9977640512795586e-05, + "loss": 0.2623, + "num_input_tokens_seen": 24510576, + "step": 20135 + }, + { + "epoch": 2.2430114712106026, + "grad_norm": 0.02155059203505516, + "learning_rate": 4.997753765515223e-05, + "loss": 0.1612, + "num_input_tokens_seen": 24516496, + "step": 20140 + }, + { + "epoch": 2.24356832609422, + "grad_norm": 1.304511547088623, + "learning_rate": 4.997743456157601e-05, + "loss": 0.1371, + "num_input_tokens_seen": 24522704, + "step": 20145 + }, + { + "epoch": 2.244125180977837, + "grad_norm": 0.03244199976325035, + "learning_rate": 4.997733123206792e-05, + "loss": 0.01, + "num_input_tokens_seen": 24528752, + "step": 20150 + }, + { + "epoch": 2.2446820358614543, + "grad_norm": 0.6031619310379028, + "learning_rate": 4.997722766662891e-05, + "loss": 0.1204, + "num_input_tokens_seen": 24534640, + "step": 20155 + }, + { + "epoch": 2.245238890745072, + "grad_norm": 0.12596581876277924, + "learning_rate": 4.997712386525998e-05, + "loss": 0.1218, + "num_input_tokens_seen": 24540624, + "step": 20160 + }, + { + "epoch": 2.245795745628689, + "grad_norm": 0.18388676643371582, + "learning_rate": 4.997701982796211e-05, + "loss": 0.0975, + "num_input_tokens_seen": 24546416, + "step": 20165 + }, + { + "epoch": 2.2463526005123065, + "grad_norm": 1.0449321269989014, + "learning_rate": 4.997691555473627e-05, + "loss": 0.08, + "num_input_tokens_seen": 24552304, + "step": 20170 + }, + { + "epoch": 2.2469094553959237, + "grad_norm": 0.37527623772621155, + "learning_rate": 4.9976811045583446e-05, + "loss": 0.1301, + "num_input_tokens_seen": 24558928, + "step": 20175 + }, + { + "epoch": 2.2474663102795414, + "grad_norm": 0.11360544711351395, + "learning_rate": 4.997670630050464e-05, + "loss": 0.0671, + "num_input_tokens_seen": 24565136, + "step": 20180 + }, + { + "epoch": 2.2480231651631586, + "grad_norm": 0.6998298168182373, + "learning_rate": 4.997660131950083e-05, + "loss": 0.0456, + "num_input_tokens_seen": 24571152, + "step": 20185 + }, + { + "epoch": 2.248580020046776, + "grad_norm": 0.24356620013713837, + "learning_rate": 4.9976496102573014e-05, + "loss": 0.0834, + "num_input_tokens_seen": 24577264, + "step": 20190 + }, + { + "epoch": 2.249136874930393, + "grad_norm": 0.0032015482429414988, + "learning_rate": 4.997639064972217e-05, + "loss": 0.07, + "num_input_tokens_seen": 24583568, + "step": 20195 + }, + { + "epoch": 2.2496937298140103, + "grad_norm": 1.3836873769760132, + "learning_rate": 4.997628496094932e-05, + "loss": 0.1074, + "num_input_tokens_seen": 24589808, + "step": 20200 + }, + { + "epoch": 2.250250584697628, + "grad_norm": 0.6708815097808838, + "learning_rate": 4.997617903625543e-05, + "loss": 0.0766, + "num_input_tokens_seen": 24595952, + "step": 20205 + }, + { + "epoch": 2.250807439581245, + "grad_norm": 0.7935079336166382, + "learning_rate": 4.997607287564153e-05, + "loss": 0.1753, + "num_input_tokens_seen": 24601680, + "step": 20210 + }, + { + "epoch": 2.2513642944648624, + "grad_norm": 0.6236294507980347, + "learning_rate": 4.997596647910862e-05, + "loss": 0.0921, + "num_input_tokens_seen": 24607472, + "step": 20215 + }, + { + "epoch": 2.2519211493484796, + "grad_norm": 1.8690050840377808, + "learning_rate": 4.997585984665768e-05, + "loss": 0.2355, + "num_input_tokens_seen": 24613552, + "step": 20220 + }, + { + "epoch": 2.2524780042320973, + "grad_norm": 1.4406112432479858, + "learning_rate": 4.9975752978289734e-05, + "loss": 0.193, + "num_input_tokens_seen": 24619536, + "step": 20225 + }, + { + "epoch": 2.2530348591157145, + "grad_norm": 0.3689027428627014, + "learning_rate": 4.99756458740058e-05, + "loss": 0.1051, + "num_input_tokens_seen": 24625776, + "step": 20230 + }, + { + "epoch": 2.253591713999332, + "grad_norm": 0.09235336631536484, + "learning_rate": 4.997553853380688e-05, + "loss": 0.0535, + "num_input_tokens_seen": 24631600, + "step": 20235 + }, + { + "epoch": 2.254148568882949, + "grad_norm": 0.1773129254579544, + "learning_rate": 4.997543095769398e-05, + "loss": 0.0895, + "num_input_tokens_seen": 24637136, + "step": 20240 + }, + { + "epoch": 2.2547054237665662, + "grad_norm": 0.15076887607574463, + "learning_rate": 4.997532314566813e-05, + "loss": 0.1131, + "num_input_tokens_seen": 24643376, + "step": 20245 + }, + { + "epoch": 2.255262278650184, + "grad_norm": 0.6484940648078918, + "learning_rate": 4.9975215097730346e-05, + "loss": 0.075, + "num_input_tokens_seen": 24649520, + "step": 20250 + }, + { + "epoch": 2.255819133533801, + "grad_norm": 0.8870704174041748, + "learning_rate": 4.997510681388164e-05, + "loss": 0.162, + "num_input_tokens_seen": 24655760, + "step": 20255 + }, + { + "epoch": 2.2563759884174184, + "grad_norm": 0.8054043054580688, + "learning_rate": 4.9974998294123046e-05, + "loss": 0.1652, + "num_input_tokens_seen": 24661424, + "step": 20260 + }, + { + "epoch": 2.2569328433010356, + "grad_norm": 1.7026993036270142, + "learning_rate": 4.9974889538455576e-05, + "loss": 0.103, + "num_input_tokens_seen": 24667536, + "step": 20265 + }, + { + "epoch": 2.257489698184653, + "grad_norm": 0.44299712777137756, + "learning_rate": 4.9974780546880276e-05, + "loss": 0.1039, + "num_input_tokens_seen": 24673328, + "step": 20270 + }, + { + "epoch": 2.2580465530682705, + "grad_norm": 0.7776549458503723, + "learning_rate": 4.9974671319398156e-05, + "loss": 0.0647, + "num_input_tokens_seen": 24679472, + "step": 20275 + }, + { + "epoch": 2.2586034079518877, + "grad_norm": 1.9575810432434082, + "learning_rate": 4.997456185601026e-05, + "loss": 0.1536, + "num_input_tokens_seen": 24685520, + "step": 20280 + }, + { + "epoch": 2.259160262835505, + "grad_norm": 0.7002701759338379, + "learning_rate": 4.997445215671761e-05, + "loss": 0.1154, + "num_input_tokens_seen": 24691504, + "step": 20285 + }, + { + "epoch": 2.259717117719122, + "grad_norm": 0.6078969836235046, + "learning_rate": 4.9974342221521256e-05, + "loss": 0.0974, + "num_input_tokens_seen": 24697424, + "step": 20290 + }, + { + "epoch": 2.26027397260274, + "grad_norm": 0.6005603075027466, + "learning_rate": 4.997423205042223e-05, + "loss": 0.1742, + "num_input_tokens_seen": 24703568, + "step": 20295 + }, + { + "epoch": 2.260830827486357, + "grad_norm": 0.007540685124695301, + "learning_rate": 4.9974121643421565e-05, + "loss": 0.0738, + "num_input_tokens_seen": 24709296, + "step": 20300 + }, + { + "epoch": 2.2613876823699743, + "grad_norm": 0.19218333065509796, + "learning_rate": 4.9974011000520325e-05, + "loss": 0.0576, + "num_input_tokens_seen": 24715664, + "step": 20305 + }, + { + "epoch": 2.2619445372535916, + "grad_norm": 0.6580577492713928, + "learning_rate": 4.997390012171954e-05, + "loss": 0.0361, + "num_input_tokens_seen": 24721552, + "step": 20310 + }, + { + "epoch": 2.2625013921372092, + "grad_norm": 0.5693164467811584, + "learning_rate": 4.997378900702025e-05, + "loss": 0.0699, + "num_input_tokens_seen": 24727952, + "step": 20315 + }, + { + "epoch": 2.2630582470208265, + "grad_norm": 1.4514095783233643, + "learning_rate": 4.997367765642353e-05, + "loss": 0.1693, + "num_input_tokens_seen": 24734064, + "step": 20320 + }, + { + "epoch": 2.2636151019044437, + "grad_norm": 0.3549116253852844, + "learning_rate": 4.99735660699304e-05, + "loss": 0.0464, + "num_input_tokens_seen": 24740400, + "step": 20325 + }, + { + "epoch": 2.264171956788061, + "grad_norm": 0.21280451118946075, + "learning_rate": 4.997345424754194e-05, + "loss": 0.0271, + "num_input_tokens_seen": 24746352, + "step": 20330 + }, + { + "epoch": 2.264728811671678, + "grad_norm": 0.019435161724686623, + "learning_rate": 4.99733421892592e-05, + "loss": 0.019, + "num_input_tokens_seen": 24752400, + "step": 20335 + }, + { + "epoch": 2.265285666555296, + "grad_norm": 0.31352677941322327, + "learning_rate": 4.9973229895083226e-05, + "loss": 0.0976, + "num_input_tokens_seen": 24758544, + "step": 20340 + }, + { + "epoch": 2.265842521438913, + "grad_norm": 0.20153065025806427, + "learning_rate": 4.997311736501509e-05, + "loss": 0.1581, + "num_input_tokens_seen": 24764336, + "step": 20345 + }, + { + "epoch": 2.2663993763225303, + "grad_norm": 0.410199373960495, + "learning_rate": 4.997300459905585e-05, + "loss": 0.0445, + "num_input_tokens_seen": 24770384, + "step": 20350 + }, + { + "epoch": 2.2669562312061475, + "grad_norm": 0.011337283998727798, + "learning_rate": 4.9972891597206576e-05, + "loss": 0.1256, + "num_input_tokens_seen": 24776560, + "step": 20355 + }, + { + "epoch": 2.2675130860897648, + "grad_norm": 0.5536414980888367, + "learning_rate": 4.997277835946833e-05, + "loss": 0.0432, + "num_input_tokens_seen": 24782672, + "step": 20360 + }, + { + "epoch": 2.2680699409733824, + "grad_norm": 0.5432679653167725, + "learning_rate": 4.997266488584219e-05, + "loss": 0.1045, + "num_input_tokens_seen": 24788560, + "step": 20365 + }, + { + "epoch": 2.2686267958569997, + "grad_norm": 0.4840089678764343, + "learning_rate": 4.9972551176329216e-05, + "loss": 0.1654, + "num_input_tokens_seen": 24794480, + "step": 20370 + }, + { + "epoch": 2.269183650740617, + "grad_norm": 0.8636415004730225, + "learning_rate": 4.99724372309305e-05, + "loss": 0.1016, + "num_input_tokens_seen": 24800784, + "step": 20375 + }, + { + "epoch": 2.2697405056242346, + "grad_norm": 0.07453729212284088, + "learning_rate": 4.9972323049647094e-05, + "loss": 0.0484, + "num_input_tokens_seen": 24807280, + "step": 20380 + }, + { + "epoch": 2.270297360507852, + "grad_norm": 0.4044331908226013, + "learning_rate": 4.9972208632480096e-05, + "loss": 0.1477, + "num_input_tokens_seen": 24813520, + "step": 20385 + }, + { + "epoch": 2.270854215391469, + "grad_norm": 0.30987614393234253, + "learning_rate": 4.9972093979430576e-05, + "loss": 0.0665, + "num_input_tokens_seen": 24819664, + "step": 20390 + }, + { + "epoch": 2.2714110702750863, + "grad_norm": 1.0961980819702148, + "learning_rate": 4.997197909049962e-05, + "loss": 0.1406, + "num_input_tokens_seen": 24825616, + "step": 20395 + }, + { + "epoch": 2.2719679251587035, + "grad_norm": 0.5400973558425903, + "learning_rate": 4.997186396568832e-05, + "loss": 0.1068, + "num_input_tokens_seen": 24831568, + "step": 20400 + }, + { + "epoch": 2.272524780042321, + "grad_norm": 4.042295455932617, + "learning_rate": 4.997174860499776e-05, + "loss": 0.1363, + "num_input_tokens_seen": 24837136, + "step": 20405 + }, + { + "epoch": 2.2730816349259384, + "grad_norm": 0.24834226071834564, + "learning_rate": 4.997163300842902e-05, + "loss": 0.0812, + "num_input_tokens_seen": 24842928, + "step": 20410 + }, + { + "epoch": 2.2736384898095556, + "grad_norm": 1.01483952999115, + "learning_rate": 4.99715171759832e-05, + "loss": 0.2223, + "num_input_tokens_seen": 24847728, + "step": 20415 + }, + { + "epoch": 2.274195344693173, + "grad_norm": 0.1685711294412613, + "learning_rate": 4.9971401107661394e-05, + "loss": 0.0237, + "num_input_tokens_seen": 24853840, + "step": 20420 + }, + { + "epoch": 2.27475219957679, + "grad_norm": 1.3697043657302856, + "learning_rate": 4.99712848034647e-05, + "loss": 0.103, + "num_input_tokens_seen": 24859888, + "step": 20425 + }, + { + "epoch": 2.2753090544604078, + "grad_norm": 0.7374454736709595, + "learning_rate": 4.997116826339422e-05, + "loss": 0.1774, + "num_input_tokens_seen": 24865648, + "step": 20430 + }, + { + "epoch": 2.275865909344025, + "grad_norm": 0.8927590847015381, + "learning_rate": 4.9971051487451035e-05, + "loss": 0.0472, + "num_input_tokens_seen": 24871952, + "step": 20435 + }, + { + "epoch": 2.2764227642276422, + "grad_norm": 1.1660032272338867, + "learning_rate": 4.9970934475636274e-05, + "loss": 0.1062, + "num_input_tokens_seen": 24877904, + "step": 20440 + }, + { + "epoch": 2.2769796191112595, + "grad_norm": 0.7473206520080566, + "learning_rate": 4.9970817227951026e-05, + "loss": 0.1416, + "num_input_tokens_seen": 24883760, + "step": 20445 + }, + { + "epoch": 2.2775364739948767, + "grad_norm": 1.4338332414627075, + "learning_rate": 4.9970699744396396e-05, + "loss": 0.1739, + "num_input_tokens_seen": 24890064, + "step": 20450 + }, + { + "epoch": 2.2780933288784944, + "grad_norm": 0.21794022619724274, + "learning_rate": 4.997058202497351e-05, + "loss": 0.0841, + "num_input_tokens_seen": 24896304, + "step": 20455 + }, + { + "epoch": 2.2786501837621116, + "grad_norm": 0.005208068061619997, + "learning_rate": 4.9970464069683476e-05, + "loss": 0.0811, + "num_input_tokens_seen": 24902672, + "step": 20460 + }, + { + "epoch": 2.279207038645729, + "grad_norm": 1.5023118257522583, + "learning_rate": 4.997034587852739e-05, + "loss": 0.0942, + "num_input_tokens_seen": 24909136, + "step": 20465 + }, + { + "epoch": 2.2797638935293465, + "grad_norm": 0.796498715877533, + "learning_rate": 4.997022745150639e-05, + "loss": 0.1339, + "num_input_tokens_seen": 24915312, + "step": 20470 + }, + { + "epoch": 2.2803207484129637, + "grad_norm": 0.5632055401802063, + "learning_rate": 4.997010878862158e-05, + "loss": 0.1012, + "num_input_tokens_seen": 24921392, + "step": 20475 + }, + { + "epoch": 2.280877603296581, + "grad_norm": 0.40894368290901184, + "learning_rate": 4.996998988987409e-05, + "loss": 0.0411, + "num_input_tokens_seen": 24927632, + "step": 20480 + }, + { + "epoch": 2.281434458180198, + "grad_norm": 0.4840998649597168, + "learning_rate": 4.996987075526504e-05, + "loss": 0.0705, + "num_input_tokens_seen": 24933328, + "step": 20485 + }, + { + "epoch": 2.2819913130638154, + "grad_norm": 0.7394428849220276, + "learning_rate": 4.996975138479556e-05, + "loss": 0.2878, + "num_input_tokens_seen": 24939472, + "step": 20490 + }, + { + "epoch": 2.282548167947433, + "grad_norm": 0.5564558506011963, + "learning_rate": 4.9969631778466763e-05, + "loss": 0.1579, + "num_input_tokens_seen": 24945168, + "step": 20495 + }, + { + "epoch": 2.2831050228310503, + "grad_norm": 0.4409244656562805, + "learning_rate": 4.996951193627979e-05, + "loss": 0.1023, + "num_input_tokens_seen": 24951376, + "step": 20500 + }, + { + "epoch": 2.2836618777146676, + "grad_norm": 0.0835028663277626, + "learning_rate": 4.996939185823578e-05, + "loss": 0.1235, + "num_input_tokens_seen": 24957264, + "step": 20505 + }, + { + "epoch": 2.284218732598285, + "grad_norm": 0.17684458196163177, + "learning_rate": 4.996927154433585e-05, + "loss": 0.1223, + "num_input_tokens_seen": 24963216, + "step": 20510 + }, + { + "epoch": 2.284775587481902, + "grad_norm": 0.6491788029670715, + "learning_rate": 4.9969150994581146e-05, + "loss": 0.1425, + "num_input_tokens_seen": 24969712, + "step": 20515 + }, + { + "epoch": 2.2853324423655197, + "grad_norm": 0.056806985288858414, + "learning_rate": 4.996903020897281e-05, + "loss": 0.124, + "num_input_tokens_seen": 24976048, + "step": 20520 + }, + { + "epoch": 2.285889297249137, + "grad_norm": 0.14808081090450287, + "learning_rate": 4.996890918751198e-05, + "loss": 0.0758, + "num_input_tokens_seen": 24982320, + "step": 20525 + }, + { + "epoch": 2.286446152132754, + "grad_norm": 0.5751668810844421, + "learning_rate": 4.996878793019979e-05, + "loss": 0.1246, + "num_input_tokens_seen": 24988272, + "step": 20530 + }, + { + "epoch": 2.2870030070163714, + "grad_norm": 1.7562896013259888, + "learning_rate": 4.9968666437037405e-05, + "loss": 0.2, + "num_input_tokens_seen": 24994320, + "step": 20535 + }, + { + "epoch": 2.287559861899989, + "grad_norm": 0.035419758409261703, + "learning_rate": 4.996854470802596e-05, + "loss": 0.0592, + "num_input_tokens_seen": 25000176, + "step": 20540 + }, + { + "epoch": 2.2881167167836063, + "grad_norm": 1.021292805671692, + "learning_rate": 4.996842274316659e-05, + "loss": 0.1486, + "num_input_tokens_seen": 25006096, + "step": 20545 + }, + { + "epoch": 2.2886735716672235, + "grad_norm": 0.29919174313545227, + "learning_rate": 4.996830054246048e-05, + "loss": 0.1198, + "num_input_tokens_seen": 25012176, + "step": 20550 + }, + { + "epoch": 2.2892304265508407, + "grad_norm": 0.3489430248737335, + "learning_rate": 4.996817810590876e-05, + "loss": 0.0237, + "num_input_tokens_seen": 25018672, + "step": 20555 + }, + { + "epoch": 2.2897872814344584, + "grad_norm": 0.04266885668039322, + "learning_rate": 4.996805543351259e-05, + "loss": 0.122, + "num_input_tokens_seen": 25025072, + "step": 20560 + }, + { + "epoch": 2.2903441363180757, + "grad_norm": 0.26107653975486755, + "learning_rate": 4.996793252527313e-05, + "loss": 0.0863, + "num_input_tokens_seen": 25031248, + "step": 20565 + }, + { + "epoch": 2.290900991201693, + "grad_norm": 0.007776158396154642, + "learning_rate": 4.9967809381191556e-05, + "loss": 0.0586, + "num_input_tokens_seen": 25037200, + "step": 20570 + }, + { + "epoch": 2.29145784608531, + "grad_norm": 1.7050504684448242, + "learning_rate": 4.996768600126901e-05, + "loss": 0.1642, + "num_input_tokens_seen": 25042256, + "step": 20575 + }, + { + "epoch": 2.2920147009689273, + "grad_norm": 1.5563445091247559, + "learning_rate": 4.9967562385506665e-05, + "loss": 0.1655, + "num_input_tokens_seen": 25048368, + "step": 20580 + }, + { + "epoch": 2.292571555852545, + "grad_norm": 0.7936336398124695, + "learning_rate": 4.99674385339057e-05, + "loss": 0.0849, + "num_input_tokens_seen": 25054672, + "step": 20585 + }, + { + "epoch": 2.2931284107361622, + "grad_norm": 0.4440929591655731, + "learning_rate": 4.996731444646726e-05, + "loss": 0.1482, + "num_input_tokens_seen": 25060944, + "step": 20590 + }, + { + "epoch": 2.2936852656197795, + "grad_norm": 0.7516568303108215, + "learning_rate": 4.9967190123192544e-05, + "loss": 0.1721, + "num_input_tokens_seen": 25067184, + "step": 20595 + }, + { + "epoch": 2.2942421205033967, + "grad_norm": 0.006368795409798622, + "learning_rate": 4.9967065564082705e-05, + "loss": 0.0692, + "num_input_tokens_seen": 25073424, + "step": 20600 + }, + { + "epoch": 2.294798975387014, + "grad_norm": 0.6571681499481201, + "learning_rate": 4.996694076913893e-05, + "loss": 0.1731, + "num_input_tokens_seen": 25079536, + "step": 20605 + }, + { + "epoch": 2.2953558302706316, + "grad_norm": 1.2626805305480957, + "learning_rate": 4.99668157383624e-05, + "loss": 0.1086, + "num_input_tokens_seen": 25085904, + "step": 20610 + }, + { + "epoch": 2.295912685154249, + "grad_norm": 0.005034437403082848, + "learning_rate": 4.996669047175429e-05, + "loss": 0.0333, + "num_input_tokens_seen": 25092016, + "step": 20615 + }, + { + "epoch": 2.296469540037866, + "grad_norm": 0.4679504632949829, + "learning_rate": 4.9966564969315786e-05, + "loss": 0.035, + "num_input_tokens_seen": 25098032, + "step": 20620 + }, + { + "epoch": 2.2970263949214833, + "grad_norm": 1.4976564645767212, + "learning_rate": 4.9966439231048074e-05, + "loss": 0.2104, + "num_input_tokens_seen": 25103792, + "step": 20625 + }, + { + "epoch": 2.297583249805101, + "grad_norm": 2.603943347930908, + "learning_rate": 4.9966313256952336e-05, + "loss": 0.1813, + "num_input_tokens_seen": 25109680, + "step": 20630 + }, + { + "epoch": 2.298140104688718, + "grad_norm": 0.3071497976779938, + "learning_rate": 4.996618704702977e-05, + "loss": 0.169, + "num_input_tokens_seen": 25115984, + "step": 20635 + }, + { + "epoch": 2.2986969595723354, + "grad_norm": 0.1369807869195938, + "learning_rate": 4.9966060601281564e-05, + "loss": 0.0543, + "num_input_tokens_seen": 25122192, + "step": 20640 + }, + { + "epoch": 2.2992538144559527, + "grad_norm": 0.013268272392451763, + "learning_rate": 4.9965933919708915e-05, + "loss": 0.0706, + "num_input_tokens_seen": 25127728, + "step": 20645 + }, + { + "epoch": 2.2998106693395703, + "grad_norm": 1.3887649774551392, + "learning_rate": 4.996580700231302e-05, + "loss": 0.169, + "num_input_tokens_seen": 25133840, + "step": 20650 + }, + { + "epoch": 2.3003675242231876, + "grad_norm": 0.6710311770439148, + "learning_rate": 4.996567984909507e-05, + "loss": 0.2063, + "num_input_tokens_seen": 25139856, + "step": 20655 + }, + { + "epoch": 2.300924379106805, + "grad_norm": 0.9716962575912476, + "learning_rate": 4.996555246005627e-05, + "loss": 0.1532, + "num_input_tokens_seen": 25145296, + "step": 20660 + }, + { + "epoch": 2.301481233990422, + "grad_norm": 0.18473811447620392, + "learning_rate": 4.996542483519784e-05, + "loss": 0.0943, + "num_input_tokens_seen": 25151280, + "step": 20665 + }, + { + "epoch": 2.3020380888740393, + "grad_norm": 0.5175012946128845, + "learning_rate": 4.996529697452095e-05, + "loss": 0.0817, + "num_input_tokens_seen": 25157264, + "step": 20670 + }, + { + "epoch": 2.302594943757657, + "grad_norm": 0.3196984827518463, + "learning_rate": 4.996516887802684e-05, + "loss": 0.0574, + "num_input_tokens_seen": 25162928, + "step": 20675 + }, + { + "epoch": 2.303151798641274, + "grad_norm": 1.0382981300354004, + "learning_rate": 4.996504054571671e-05, + "loss": 0.0728, + "num_input_tokens_seen": 25169296, + "step": 20680 + }, + { + "epoch": 2.3037086535248914, + "grad_norm": 1.2451926469802856, + "learning_rate": 4.996491197759176e-05, + "loss": 0.1386, + "num_input_tokens_seen": 25175536, + "step": 20685 + }, + { + "epoch": 2.3042655084085086, + "grad_norm": 0.40880924463272095, + "learning_rate": 4.9964783173653226e-05, + "loss": 0.1095, + "num_input_tokens_seen": 25181680, + "step": 20690 + }, + { + "epoch": 2.304822363292126, + "grad_norm": 0.8768694400787354, + "learning_rate": 4.996465413390231e-05, + "loss": 0.2428, + "num_input_tokens_seen": 25187568, + "step": 20695 + }, + { + "epoch": 2.3053792181757435, + "grad_norm": 0.40375208854675293, + "learning_rate": 4.996452485834023e-05, + "loss": 0.0834, + "num_input_tokens_seen": 25193808, + "step": 20700 + }, + { + "epoch": 2.3059360730593608, + "grad_norm": 0.25655585527420044, + "learning_rate": 4.996439534696822e-05, + "loss": 0.0891, + "num_input_tokens_seen": 25200240, + "step": 20705 + }, + { + "epoch": 2.306492927942978, + "grad_norm": 1.0570621490478516, + "learning_rate": 4.996426559978748e-05, + "loss": 0.0947, + "num_input_tokens_seen": 25206224, + "step": 20710 + }, + { + "epoch": 2.3070497828265952, + "grad_norm": 0.6168206930160522, + "learning_rate": 4.996413561679926e-05, + "loss": 0.083, + "num_input_tokens_seen": 25212304, + "step": 20715 + }, + { + "epoch": 2.307606637710213, + "grad_norm": 0.46573930978775024, + "learning_rate": 4.996400539800478e-05, + "loss": 0.0438, + "num_input_tokens_seen": 25218224, + "step": 20720 + }, + { + "epoch": 2.30816349259383, + "grad_norm": 0.27066656947135925, + "learning_rate": 4.9963874943405266e-05, + "loss": 0.0748, + "num_input_tokens_seen": 25224304, + "step": 20725 + }, + { + "epoch": 2.3087203474774474, + "grad_norm": 0.01608739234507084, + "learning_rate": 4.9963744253001956e-05, + "loss": 0.0988, + "num_input_tokens_seen": 25230384, + "step": 20730 + }, + { + "epoch": 2.3092772023610646, + "grad_norm": 0.26681703329086304, + "learning_rate": 4.996361332679608e-05, + "loss": 0.0625, + "num_input_tokens_seen": 25236464, + "step": 20735 + }, + { + "epoch": 2.3098340572446823, + "grad_norm": 0.0012318049557507038, + "learning_rate": 4.9963482164788865e-05, + "loss": 0.139, + "num_input_tokens_seen": 25242544, + "step": 20740 + }, + { + "epoch": 2.3103909121282995, + "grad_norm": 0.0037856223061680794, + "learning_rate": 4.996335076698157e-05, + "loss": 0.0703, + "num_input_tokens_seen": 25248656, + "step": 20745 + }, + { + "epoch": 2.3109477670119167, + "grad_norm": 0.2866390347480774, + "learning_rate": 4.996321913337543e-05, + "loss": 0.0239, + "num_input_tokens_seen": 25254768, + "step": 20750 + }, + { + "epoch": 2.311504621895534, + "grad_norm": 0.16267472505569458, + "learning_rate": 4.996308726397167e-05, + "loss": 0.0843, + "num_input_tokens_seen": 25261296, + "step": 20755 + }, + { + "epoch": 2.312061476779151, + "grad_norm": 0.16292081773281097, + "learning_rate": 4.996295515877157e-05, + "loss": 0.0735, + "num_input_tokens_seen": 25267088, + "step": 20760 + }, + { + "epoch": 2.312618331662769, + "grad_norm": 0.7920369505882263, + "learning_rate": 4.9962822817776343e-05, + "loss": 0.0659, + "num_input_tokens_seen": 25273264, + "step": 20765 + }, + { + "epoch": 2.313175186546386, + "grad_norm": 0.33308660984039307, + "learning_rate": 4.9962690240987265e-05, + "loss": 0.0729, + "num_input_tokens_seen": 25279184, + "step": 20770 + }, + { + "epoch": 2.3137320414300033, + "grad_norm": 0.898301362991333, + "learning_rate": 4.996255742840558e-05, + "loss": 0.1284, + "num_input_tokens_seen": 25285104, + "step": 20775 + }, + { + "epoch": 2.3142888963136206, + "grad_norm": 0.4554801285266876, + "learning_rate": 4.9962424380032526e-05, + "loss": 0.0231, + "num_input_tokens_seen": 25291312, + "step": 20780 + }, + { + "epoch": 2.314845751197238, + "grad_norm": 1.5650509595870972, + "learning_rate": 4.996229109586939e-05, + "loss": 0.0718, + "num_input_tokens_seen": 25297616, + "step": 20785 + }, + { + "epoch": 2.3154026060808555, + "grad_norm": 1.6484791040420532, + "learning_rate": 4.99621575759174e-05, + "loss": 0.133, + "num_input_tokens_seen": 25304016, + "step": 20790 + }, + { + "epoch": 2.3159594609644727, + "grad_norm": 0.5401063561439514, + "learning_rate": 4.996202382017784e-05, + "loss": 0.2854, + "num_input_tokens_seen": 25310000, + "step": 20795 + }, + { + "epoch": 2.31651631584809, + "grad_norm": 0.3820330500602722, + "learning_rate": 4.996188982865197e-05, + "loss": 0.0831, + "num_input_tokens_seen": 25316304, + "step": 20800 + }, + { + "epoch": 2.317073170731707, + "grad_norm": 0.11382567137479782, + "learning_rate": 4.996175560134104e-05, + "loss": 0.1414, + "num_input_tokens_seen": 25322800, + "step": 20805 + }, + { + "epoch": 2.317630025615325, + "grad_norm": 0.32935014367103577, + "learning_rate": 4.996162113824634e-05, + "loss": 0.2193, + "num_input_tokens_seen": 25328752, + "step": 20810 + }, + { + "epoch": 2.318186880498942, + "grad_norm": 0.11410994827747345, + "learning_rate": 4.996148643936913e-05, + "loss": 0.0529, + "num_input_tokens_seen": 25335216, + "step": 20815 + }, + { + "epoch": 2.3187437353825593, + "grad_norm": 1.991039514541626, + "learning_rate": 4.996135150471067e-05, + "loss": 0.185, + "num_input_tokens_seen": 25341424, + "step": 20820 + }, + { + "epoch": 2.3193005902661765, + "grad_norm": 0.13156305253505707, + "learning_rate": 4.996121633427226e-05, + "loss": 0.0501, + "num_input_tokens_seen": 25347600, + "step": 20825 + }, + { + "epoch": 2.319857445149794, + "grad_norm": 0.18854236602783203, + "learning_rate": 4.996108092805516e-05, + "loss": 0.0364, + "num_input_tokens_seen": 25353520, + "step": 20830 + }, + { + "epoch": 2.3204143000334114, + "grad_norm": 0.32315146923065186, + "learning_rate": 4.9960945286060646e-05, + "loss": 0.1545, + "num_input_tokens_seen": 25359728, + "step": 20835 + }, + { + "epoch": 2.3209711549170287, + "grad_norm": 1.2927641868591309, + "learning_rate": 4.996080940829001e-05, + "loss": 0.1135, + "num_input_tokens_seen": 25365744, + "step": 20840 + }, + { + "epoch": 2.321528009800646, + "grad_norm": 1.0978775024414062, + "learning_rate": 4.996067329474453e-05, + "loss": 0.1872, + "num_input_tokens_seen": 25371248, + "step": 20845 + }, + { + "epoch": 2.322084864684263, + "grad_norm": 0.23283737897872925, + "learning_rate": 4.9960536945425496e-05, + "loss": 0.0975, + "num_input_tokens_seen": 25377392, + "step": 20850 + }, + { + "epoch": 2.322641719567881, + "grad_norm": 1.2234265804290771, + "learning_rate": 4.996040036033418e-05, + "loss": 0.1309, + "num_input_tokens_seen": 25383632, + "step": 20855 + }, + { + "epoch": 2.323198574451498, + "grad_norm": 0.5719560384750366, + "learning_rate": 4.99602635394719e-05, + "loss": 0.2157, + "num_input_tokens_seen": 25390064, + "step": 20860 + }, + { + "epoch": 2.3237554293351153, + "grad_norm": 1.0794802904129028, + "learning_rate": 4.9960126482839924e-05, + "loss": 0.1687, + "num_input_tokens_seen": 25396112, + "step": 20865 + }, + { + "epoch": 2.3243122842187325, + "grad_norm": 0.2157234102487564, + "learning_rate": 4.995998919043956e-05, + "loss": 0.018, + "num_input_tokens_seen": 25402192, + "step": 20870 + }, + { + "epoch": 2.3248691391023497, + "grad_norm": 2.1398606300354004, + "learning_rate": 4.995985166227209e-05, + "loss": 0.0783, + "num_input_tokens_seen": 25408528, + "step": 20875 + }, + { + "epoch": 2.3254259939859674, + "grad_norm": 1.120782732963562, + "learning_rate": 4.995971389833884e-05, + "loss": 0.1629, + "num_input_tokens_seen": 25414736, + "step": 20880 + }, + { + "epoch": 2.3259828488695846, + "grad_norm": 1.3816375732421875, + "learning_rate": 4.9959575898641086e-05, + "loss": 0.0745, + "num_input_tokens_seen": 25421104, + "step": 20885 + }, + { + "epoch": 2.326539703753202, + "grad_norm": 0.04020407050848007, + "learning_rate": 4.995943766318014e-05, + "loss": 0.111, + "num_input_tokens_seen": 25427472, + "step": 20890 + }, + { + "epoch": 2.327096558636819, + "grad_norm": 0.3033672273159027, + "learning_rate": 4.995929919195731e-05, + "loss": 0.128, + "num_input_tokens_seen": 25433456, + "step": 20895 + }, + { + "epoch": 2.3276534135204368, + "grad_norm": 0.20630289614200592, + "learning_rate": 4.9959160484973896e-05, + "loss": 0.1318, + "num_input_tokens_seen": 25438928, + "step": 20900 + }, + { + "epoch": 2.328210268404054, + "grad_norm": 0.5628702044487, + "learning_rate": 4.995902154223123e-05, + "loss": 0.0924, + "num_input_tokens_seen": 25444880, + "step": 20905 + }, + { + "epoch": 2.328767123287671, + "grad_norm": 0.9203236699104309, + "learning_rate": 4.9958882363730596e-05, + "loss": 0.1389, + "num_input_tokens_seen": 25451088, + "step": 20910 + }, + { + "epoch": 2.3293239781712884, + "grad_norm": 0.3881564438343048, + "learning_rate": 4.9958742949473323e-05, + "loss": 0.1042, + "num_input_tokens_seen": 25456912, + "step": 20915 + }, + { + "epoch": 2.329880833054906, + "grad_norm": 0.17446701228618622, + "learning_rate": 4.995860329946073e-05, + "loss": 0.0871, + "num_input_tokens_seen": 25462992, + "step": 20920 + }, + { + "epoch": 2.3304376879385234, + "grad_norm": 0.0163619052618742, + "learning_rate": 4.995846341369412e-05, + "loss": 0.0766, + "num_input_tokens_seen": 25468528, + "step": 20925 + }, + { + "epoch": 2.3309945428221406, + "grad_norm": 0.0629936084151268, + "learning_rate": 4.995832329217484e-05, + "loss": 0.067, + "num_input_tokens_seen": 25474832, + "step": 20930 + }, + { + "epoch": 2.331551397705758, + "grad_norm": 0.21023298799991608, + "learning_rate": 4.9958182934904196e-05, + "loss": 0.0694, + "num_input_tokens_seen": 25480944, + "step": 20935 + }, + { + "epoch": 2.332108252589375, + "grad_norm": 0.2479902058839798, + "learning_rate": 4.995804234188352e-05, + "loss": 0.0791, + "num_input_tokens_seen": 25487184, + "step": 20940 + }, + { + "epoch": 2.3326651074729927, + "grad_norm": 1.674198865890503, + "learning_rate": 4.9957901513114136e-05, + "loss": 0.1116, + "num_input_tokens_seen": 25493072, + "step": 20945 + }, + { + "epoch": 2.33322196235661, + "grad_norm": 0.6366572976112366, + "learning_rate": 4.995776044859738e-05, + "loss": 0.0496, + "num_input_tokens_seen": 25499312, + "step": 20950 + }, + { + "epoch": 2.333778817240227, + "grad_norm": 1.3277398347854614, + "learning_rate": 4.995761914833458e-05, + "loss": 0.1479, + "num_input_tokens_seen": 25505424, + "step": 20955 + }, + { + "epoch": 2.3343356721238444, + "grad_norm": 0.3769768476486206, + "learning_rate": 4.9957477612327064e-05, + "loss": 0.0792, + "num_input_tokens_seen": 25511376, + "step": 20960 + }, + { + "epoch": 2.3348925270074616, + "grad_norm": 0.17356546223163605, + "learning_rate": 4.9957335840576184e-05, + "loss": 0.2129, + "num_input_tokens_seen": 25517392, + "step": 20965 + }, + { + "epoch": 2.3354493818910793, + "grad_norm": 0.12032147496938705, + "learning_rate": 4.995719383308327e-05, + "loss": 0.0769, + "num_input_tokens_seen": 25523632, + "step": 20970 + }, + { + "epoch": 2.3360062367746965, + "grad_norm": 0.9899340271949768, + "learning_rate": 4.995705158984966e-05, + "loss": 0.1496, + "num_input_tokens_seen": 25530288, + "step": 20975 + }, + { + "epoch": 2.3365630916583138, + "grad_norm": 0.557723879814148, + "learning_rate": 4.99569091108767e-05, + "loss": 0.1139, + "num_input_tokens_seen": 25535824, + "step": 20980 + }, + { + "epoch": 2.337119946541931, + "grad_norm": 0.4628877639770508, + "learning_rate": 4.995676639616575e-05, + "loss": 0.1401, + "num_input_tokens_seen": 25541840, + "step": 20985 + }, + { + "epoch": 2.3376768014255487, + "grad_norm": 0.09045995026826859, + "learning_rate": 4.995662344571814e-05, + "loss": 0.1048, + "num_input_tokens_seen": 25547568, + "step": 20990 + }, + { + "epoch": 2.338233656309166, + "grad_norm": 1.6916054487228394, + "learning_rate": 4.995648025953523e-05, + "loss": 0.2763, + "num_input_tokens_seen": 25553808, + "step": 20995 + }, + { + "epoch": 2.338790511192783, + "grad_norm": 0.26319199800491333, + "learning_rate": 4.9956336837618366e-05, + "loss": 0.0541, + "num_input_tokens_seen": 25560016, + "step": 21000 + }, + { + "epoch": 2.3393473660764004, + "grad_norm": 1.4254077672958374, + "learning_rate": 4.99561931799689e-05, + "loss": 0.1405, + "num_input_tokens_seen": 25566160, + "step": 21005 + }, + { + "epoch": 2.339904220960018, + "grad_norm": 0.1082635447382927, + "learning_rate": 4.9956049286588205e-05, + "loss": 0.176, + "num_input_tokens_seen": 25572144, + "step": 21010 + }, + { + "epoch": 2.3404610758436353, + "grad_norm": 0.14581231772899628, + "learning_rate": 4.995590515747763e-05, + "loss": 0.0848, + "num_input_tokens_seen": 25578256, + "step": 21015 + }, + { + "epoch": 2.3410179307272525, + "grad_norm": 0.003435274586081505, + "learning_rate": 4.995576079263853e-05, + "loss": 0.0883, + "num_input_tokens_seen": 25584528, + "step": 21020 + }, + { + "epoch": 2.3415747856108697, + "grad_norm": 1.4109169244766235, + "learning_rate": 4.995561619207227e-05, + "loss": 0.0595, + "num_input_tokens_seen": 25590608, + "step": 21025 + }, + { + "epoch": 2.342131640494487, + "grad_norm": 0.1712409108877182, + "learning_rate": 4.995547135578024e-05, + "loss": 0.0686, + "num_input_tokens_seen": 25596912, + "step": 21030 + }, + { + "epoch": 2.3426884953781046, + "grad_norm": 0.2897813320159912, + "learning_rate": 4.995532628376377e-05, + "loss": 0.1395, + "num_input_tokens_seen": 25602960, + "step": 21035 + }, + { + "epoch": 2.343245350261722, + "grad_norm": 0.24145090579986572, + "learning_rate": 4.9955180976024265e-05, + "loss": 0.1139, + "num_input_tokens_seen": 25609232, + "step": 21040 + }, + { + "epoch": 2.343802205145339, + "grad_norm": 1.0114127397537231, + "learning_rate": 4.995503543256307e-05, + "loss": 0.1171, + "num_input_tokens_seen": 25615504, + "step": 21045 + }, + { + "epoch": 2.3443590600289563, + "grad_norm": 0.8478174805641174, + "learning_rate": 4.995488965338157e-05, + "loss": 0.0684, + "num_input_tokens_seen": 25621488, + "step": 21050 + }, + { + "epoch": 2.3449159149125736, + "grad_norm": 0.8246568441390991, + "learning_rate": 4.995474363848115e-05, + "loss": 0.1627, + "num_input_tokens_seen": 25627536, + "step": 21055 + }, + { + "epoch": 2.3454727697961912, + "grad_norm": 1.0059880018234253, + "learning_rate": 4.9954597387863186e-05, + "loss": 0.0793, + "num_input_tokens_seen": 25633680, + "step": 21060 + }, + { + "epoch": 2.3460296246798085, + "grad_norm": 0.345119446516037, + "learning_rate": 4.9954450901529056e-05, + "loss": 0.2234, + "num_input_tokens_seen": 25638640, + "step": 21065 + }, + { + "epoch": 2.3465864795634257, + "grad_norm": 0.34158754348754883, + "learning_rate": 4.995430417948014e-05, + "loss": 0.0554, + "num_input_tokens_seen": 25644656, + "step": 21070 + }, + { + "epoch": 2.347143334447043, + "grad_norm": 0.045772626996040344, + "learning_rate": 4.9954157221717826e-05, + "loss": 0.1237, + "num_input_tokens_seen": 25650512, + "step": 21075 + }, + { + "epoch": 2.3477001893306606, + "grad_norm": 0.031241856515407562, + "learning_rate": 4.995401002824351e-05, + "loss": 0.1027, + "num_input_tokens_seen": 25656720, + "step": 21080 + }, + { + "epoch": 2.348257044214278, + "grad_norm": 0.1301165074110031, + "learning_rate": 4.9953862599058576e-05, + "loss": 0.0423, + "num_input_tokens_seen": 25662832, + "step": 21085 + }, + { + "epoch": 2.348813899097895, + "grad_norm": 0.060706838965415955, + "learning_rate": 4.995371493416441e-05, + "loss": 0.0659, + "num_input_tokens_seen": 25668816, + "step": 21090 + }, + { + "epoch": 2.3493707539815123, + "grad_norm": 1.5821800231933594, + "learning_rate": 4.995356703356242e-05, + "loss": 0.151, + "num_input_tokens_seen": 25674832, + "step": 21095 + }, + { + "epoch": 2.34992760886513, + "grad_norm": 0.16114607453346252, + "learning_rate": 4.9953418897253996e-05, + "loss": 0.0986, + "num_input_tokens_seen": 25680848, + "step": 21100 + }, + { + "epoch": 2.350484463748747, + "grad_norm": 0.8171042203903198, + "learning_rate": 4.9953270525240536e-05, + "loss": 0.1574, + "num_input_tokens_seen": 25687152, + "step": 21105 + }, + { + "epoch": 2.3510413186323644, + "grad_norm": 0.3936867415904999, + "learning_rate": 4.995312191752345e-05, + "loss": 0.0213, + "num_input_tokens_seen": 25693488, + "step": 21110 + }, + { + "epoch": 2.3515981735159817, + "grad_norm": 0.09059832990169525, + "learning_rate": 4.995297307410413e-05, + "loss": 0.0346, + "num_input_tokens_seen": 25699632, + "step": 21115 + }, + { + "epoch": 2.352155028399599, + "grad_norm": 0.4356575906276703, + "learning_rate": 4.9952823994983986e-05, + "loss": 0.106, + "num_input_tokens_seen": 25705648, + "step": 21120 + }, + { + "epoch": 2.3527118832832166, + "grad_norm": 0.4287669062614441, + "learning_rate": 4.9952674680164423e-05, + "loss": 0.0731, + "num_input_tokens_seen": 25711760, + "step": 21125 + }, + { + "epoch": 2.353268738166834, + "grad_norm": 0.08192090690135956, + "learning_rate": 4.9952525129646865e-05, + "loss": 0.2067, + "num_input_tokens_seen": 25717360, + "step": 21130 + }, + { + "epoch": 2.353825593050451, + "grad_norm": 0.6485437750816345, + "learning_rate": 4.995237534343271e-05, + "loss": 0.083, + "num_input_tokens_seen": 25723344, + "step": 21135 + }, + { + "epoch": 2.3543824479340683, + "grad_norm": 0.9107353687286377, + "learning_rate": 4.995222532152338e-05, + "loss": 0.0937, + "num_input_tokens_seen": 25729232, + "step": 21140 + }, + { + "epoch": 2.3549393028176855, + "grad_norm": 0.4966922998428345, + "learning_rate": 4.99520750639203e-05, + "loss": 0.0508, + "num_input_tokens_seen": 25735248, + "step": 21145 + }, + { + "epoch": 2.355496157701303, + "grad_norm": 0.30225419998168945, + "learning_rate": 4.9951924570624864e-05, + "loss": 0.0781, + "num_input_tokens_seen": 25741488, + "step": 21150 + }, + { + "epoch": 2.3560530125849204, + "grad_norm": 0.05181578919291496, + "learning_rate": 4.9951773841638524e-05, + "loss": 0.1329, + "num_input_tokens_seen": 25747408, + "step": 21155 + }, + { + "epoch": 2.3566098674685376, + "grad_norm": 0.6410030722618103, + "learning_rate": 4.995162287696268e-05, + "loss": 0.0985, + "num_input_tokens_seen": 25753808, + "step": 21160 + }, + { + "epoch": 2.357166722352155, + "grad_norm": 0.38177162408828735, + "learning_rate": 4.995147167659876e-05, + "loss": 0.0436, + "num_input_tokens_seen": 25759984, + "step": 21165 + }, + { + "epoch": 2.3577235772357725, + "grad_norm": 0.7870770692825317, + "learning_rate": 4.9951320240548214e-05, + "loss": 0.0648, + "num_input_tokens_seen": 25766160, + "step": 21170 + }, + { + "epoch": 2.3582804321193898, + "grad_norm": 0.4186163544654846, + "learning_rate": 4.995116856881245e-05, + "loss": 0.0463, + "num_input_tokens_seen": 25772336, + "step": 21175 + }, + { + "epoch": 2.358837287003007, + "grad_norm": 1.1679970026016235, + "learning_rate": 4.995101666139291e-05, + "loss": 0.1538, + "num_input_tokens_seen": 25778064, + "step": 21180 + }, + { + "epoch": 2.359394141886624, + "grad_norm": 0.7591636776924133, + "learning_rate": 4.995086451829103e-05, + "loss": 0.0866, + "num_input_tokens_seen": 25783984, + "step": 21185 + }, + { + "epoch": 2.359950996770242, + "grad_norm": 1.352813482284546, + "learning_rate": 4.995071213950824e-05, + "loss": 0.2007, + "num_input_tokens_seen": 25790032, + "step": 21190 + }, + { + "epoch": 2.360507851653859, + "grad_norm": 1.383787989616394, + "learning_rate": 4.995055952504598e-05, + "loss": 0.0509, + "num_input_tokens_seen": 25796112, + "step": 21195 + }, + { + "epoch": 2.3610647065374764, + "grad_norm": 0.0010250085033476353, + "learning_rate": 4.99504066749057e-05, + "loss": 0.0862, + "num_input_tokens_seen": 25801552, + "step": 21200 + }, + { + "epoch": 2.3616215614210936, + "grad_norm": 0.5635865330696106, + "learning_rate": 4.995025358908885e-05, + "loss": 0.1126, + "num_input_tokens_seen": 25807600, + "step": 21205 + }, + { + "epoch": 2.362178416304711, + "grad_norm": 0.5589331388473511, + "learning_rate": 4.995010026759685e-05, + "loss": 0.1355, + "num_input_tokens_seen": 25814000, + "step": 21210 + }, + { + "epoch": 2.3627352711883285, + "grad_norm": 1.7533419132232666, + "learning_rate": 4.9949946710431165e-05, + "loss": 0.1728, + "num_input_tokens_seen": 25820048, + "step": 21215 + }, + { + "epoch": 2.3632921260719457, + "grad_norm": 0.037644851952791214, + "learning_rate": 4.9949792917593244e-05, + "loss": 0.0909, + "num_input_tokens_seen": 25826256, + "step": 21220 + }, + { + "epoch": 2.363848980955563, + "grad_norm": 0.0031172751914709806, + "learning_rate": 4.9949638889084546e-05, + "loss": 0.1711, + "num_input_tokens_seen": 25832144, + "step": 21225 + }, + { + "epoch": 2.36440583583918, + "grad_norm": 0.12235799431800842, + "learning_rate": 4.9949484624906515e-05, + "loss": 0.0485, + "num_input_tokens_seen": 25838512, + "step": 21230 + }, + { + "epoch": 2.3649626907227974, + "grad_norm": 0.48406606912612915, + "learning_rate": 4.9949330125060615e-05, + "loss": 0.0936, + "num_input_tokens_seen": 25844400, + "step": 21235 + }, + { + "epoch": 2.365519545606415, + "grad_norm": 0.47625672817230225, + "learning_rate": 4.9949175389548295e-05, + "loss": 0.0608, + "num_input_tokens_seen": 25850384, + "step": 21240 + }, + { + "epoch": 2.3660764004900323, + "grad_norm": 0.9061087965965271, + "learning_rate": 4.9949020418371033e-05, + "loss": 0.1065, + "num_input_tokens_seen": 25856400, + "step": 21245 + }, + { + "epoch": 2.3666332553736495, + "grad_norm": 0.09976835548877716, + "learning_rate": 4.994886521153028e-05, + "loss": 0.0577, + "num_input_tokens_seen": 25862384, + "step": 21250 + }, + { + "epoch": 2.3671901102572668, + "grad_norm": 0.13538455963134766, + "learning_rate": 4.994870976902751e-05, + "loss": 0.0658, + "num_input_tokens_seen": 25868688, + "step": 21255 + }, + { + "epoch": 2.3677469651408845, + "grad_norm": 0.11614612489938736, + "learning_rate": 4.9948554090864184e-05, + "loss": 0.1129, + "num_input_tokens_seen": 25874992, + "step": 21260 + }, + { + "epoch": 2.3683038200245017, + "grad_norm": 0.13352443277835846, + "learning_rate": 4.994839817704178e-05, + "loss": 0.095, + "num_input_tokens_seen": 25880784, + "step": 21265 + }, + { + "epoch": 2.368860674908119, + "grad_norm": 0.49921828508377075, + "learning_rate": 4.9948242027561767e-05, + "loss": 0.0884, + "num_input_tokens_seen": 25887184, + "step": 21270 + }, + { + "epoch": 2.369417529791736, + "grad_norm": 0.22029267251491547, + "learning_rate": 4.9948085642425616e-05, + "loss": 0.0315, + "num_input_tokens_seen": 25892944, + "step": 21275 + }, + { + "epoch": 2.369974384675354, + "grad_norm": 0.47585588693618774, + "learning_rate": 4.994792902163481e-05, + "loss": 0.0478, + "num_input_tokens_seen": 25898864, + "step": 21280 + }, + { + "epoch": 2.370531239558971, + "grad_norm": 1.7713773250579834, + "learning_rate": 4.994777216519082e-05, + "loss": 0.0977, + "num_input_tokens_seen": 25904944, + "step": 21285 + }, + { + "epoch": 2.3710880944425883, + "grad_norm": 0.016250556334853172, + "learning_rate": 4.9947615073095146e-05, + "loss": 0.0262, + "num_input_tokens_seen": 25911120, + "step": 21290 + }, + { + "epoch": 2.3716449493262055, + "grad_norm": 0.4379364550113678, + "learning_rate": 4.994745774534925e-05, + "loss": 0.1106, + "num_input_tokens_seen": 25916976, + "step": 21295 + }, + { + "epoch": 2.3722018042098227, + "grad_norm": 0.491169273853302, + "learning_rate": 4.994730018195463e-05, + "loss": 0.0821, + "num_input_tokens_seen": 25923408, + "step": 21300 + }, + { + "epoch": 2.3727586590934404, + "grad_norm": 1.3621490001678467, + "learning_rate": 4.9947142382912773e-05, + "loss": 0.1666, + "num_input_tokens_seen": 25929392, + "step": 21305 + }, + { + "epoch": 2.3733155139770576, + "grad_norm": 0.32609015703201294, + "learning_rate": 4.9946984348225176e-05, + "loss": 0.1061, + "num_input_tokens_seen": 25935600, + "step": 21310 + }, + { + "epoch": 2.373872368860675, + "grad_norm": 0.4854219853878021, + "learning_rate": 4.994682607789332e-05, + "loss": 0.1997, + "num_input_tokens_seen": 25941040, + "step": 21315 + }, + { + "epoch": 2.374429223744292, + "grad_norm": 0.31727829575538635, + "learning_rate": 4.99466675719187e-05, + "loss": 0.0479, + "num_input_tokens_seen": 25947344, + "step": 21320 + }, + { + "epoch": 2.3749860786279093, + "grad_norm": 0.8946711421012878, + "learning_rate": 4.9946508830302815e-05, + "loss": 0.1685, + "num_input_tokens_seen": 25953328, + "step": 21325 + }, + { + "epoch": 2.375542933511527, + "grad_norm": 0.0039070420898497105, + "learning_rate": 4.994634985304718e-05, + "loss": 0.0758, + "num_input_tokens_seen": 25959664, + "step": 21330 + }, + { + "epoch": 2.3760997883951442, + "grad_norm": 2.013396978378296, + "learning_rate": 4.994619064015328e-05, + "loss": 0.2267, + "num_input_tokens_seen": 25965744, + "step": 21335 + }, + { + "epoch": 2.3766566432787615, + "grad_norm": 0.37285274267196655, + "learning_rate": 4.9946031191622614e-05, + "loss": 0.0925, + "num_input_tokens_seen": 25971440, + "step": 21340 + }, + { + "epoch": 2.3772134981623787, + "grad_norm": 1.8188213109970093, + "learning_rate": 4.9945871507456707e-05, + "loss": 0.1974, + "num_input_tokens_seen": 25977776, + "step": 21345 + }, + { + "epoch": 2.3777703530459964, + "grad_norm": 0.5341346263885498, + "learning_rate": 4.9945711587657054e-05, + "loss": 0.0591, + "num_input_tokens_seen": 25984080, + "step": 21350 + }, + { + "epoch": 2.3783272079296136, + "grad_norm": 0.10830549895763397, + "learning_rate": 4.994555143222517e-05, + "loss": 0.0577, + "num_input_tokens_seen": 25990480, + "step": 21355 + }, + { + "epoch": 2.378884062813231, + "grad_norm": 0.2673701345920563, + "learning_rate": 4.994539104116256e-05, + "loss": 0.0271, + "num_input_tokens_seen": 25996528, + "step": 21360 + }, + { + "epoch": 2.379440917696848, + "grad_norm": 0.539717435836792, + "learning_rate": 4.994523041447076e-05, + "loss": 0.2151, + "num_input_tokens_seen": 26002800, + "step": 21365 + }, + { + "epoch": 2.3799977725804657, + "grad_norm": 0.6376281976699829, + "learning_rate": 4.994506955215126e-05, + "loss": 0.1335, + "num_input_tokens_seen": 26008944, + "step": 21370 + }, + { + "epoch": 2.380554627464083, + "grad_norm": 0.5215865969657898, + "learning_rate": 4.9944908454205594e-05, + "loss": 0.1804, + "num_input_tokens_seen": 26015120, + "step": 21375 + }, + { + "epoch": 2.3811114823477, + "grad_norm": 0.6503283977508545, + "learning_rate": 4.9944747120635284e-05, + "loss": 0.0997, + "num_input_tokens_seen": 26021328, + "step": 21380 + }, + { + "epoch": 2.3816683372313174, + "grad_norm": 0.8362957835197449, + "learning_rate": 4.9944585551441856e-05, + "loss": 0.0599, + "num_input_tokens_seen": 26026832, + "step": 21385 + }, + { + "epoch": 2.3822251921149347, + "grad_norm": 0.0018130108946934342, + "learning_rate": 4.9944423746626826e-05, + "loss": 0.0186, + "num_input_tokens_seen": 26033200, + "step": 21390 + }, + { + "epoch": 2.3827820469985523, + "grad_norm": 0.011831626296043396, + "learning_rate": 4.994426170619173e-05, + "loss": 0.1185, + "num_input_tokens_seen": 26038992, + "step": 21395 + }, + { + "epoch": 2.3833389018821696, + "grad_norm": 0.5877845287322998, + "learning_rate": 4.99440994301381e-05, + "loss": 0.1804, + "num_input_tokens_seen": 26045232, + "step": 21400 + }, + { + "epoch": 2.383895756765787, + "grad_norm": 0.6346167325973511, + "learning_rate": 4.994393691846746e-05, + "loss": 0.1459, + "num_input_tokens_seen": 26051440, + "step": 21405 + }, + { + "epoch": 2.384452611649404, + "grad_norm": 0.45445308089256287, + "learning_rate": 4.994377417118136e-05, + "loss": 0.0553, + "num_input_tokens_seen": 26057392, + "step": 21410 + }, + { + "epoch": 2.3850094665330213, + "grad_norm": 0.06171264499425888, + "learning_rate": 4.994361118828133e-05, + "loss": 0.091, + "num_input_tokens_seen": 26063504, + "step": 21415 + }, + { + "epoch": 2.385566321416639, + "grad_norm": 0.15331338346004486, + "learning_rate": 4.99434479697689e-05, + "loss": 0.0987, + "num_input_tokens_seen": 26069584, + "step": 21420 + }, + { + "epoch": 2.386123176300256, + "grad_norm": 0.10074764490127563, + "learning_rate": 4.9943284515645614e-05, + "loss": 0.0166, + "num_input_tokens_seen": 26075568, + "step": 21425 + }, + { + "epoch": 2.3866800311838734, + "grad_norm": 0.8728166222572327, + "learning_rate": 4.994312082591303e-05, + "loss": 0.1231, + "num_input_tokens_seen": 26081840, + "step": 21430 + }, + { + "epoch": 2.3872368860674906, + "grad_norm": 0.6556500196456909, + "learning_rate": 4.9942956900572686e-05, + "loss": 0.0574, + "num_input_tokens_seen": 26087952, + "step": 21435 + }, + { + "epoch": 2.3877937409511083, + "grad_norm": 0.10708178579807281, + "learning_rate": 4.9942792739626124e-05, + "loss": 0.0839, + "num_input_tokens_seen": 26094288, + "step": 21440 + }, + { + "epoch": 2.3883505958347255, + "grad_norm": 2.0336763858795166, + "learning_rate": 4.99426283430749e-05, + "loss": 0.1814, + "num_input_tokens_seen": 26100592, + "step": 21445 + }, + { + "epoch": 2.3889074507183428, + "grad_norm": 0.009092465974390507, + "learning_rate": 4.9942463710920584e-05, + "loss": 0.1089, + "num_input_tokens_seen": 26106992, + "step": 21450 + }, + { + "epoch": 2.38946430560196, + "grad_norm": 0.8329080939292908, + "learning_rate": 4.99422988431647e-05, + "loss": 0.0507, + "num_input_tokens_seen": 26112848, + "step": 21455 + }, + { + "epoch": 2.3900211604855777, + "grad_norm": 0.10643252730369568, + "learning_rate": 4.994213373980883e-05, + "loss": 0.0529, + "num_input_tokens_seen": 26118480, + "step": 21460 + }, + { + "epoch": 2.390578015369195, + "grad_norm": 0.06267467886209488, + "learning_rate": 4.994196840085451e-05, + "loss": 0.1327, + "num_input_tokens_seen": 26124272, + "step": 21465 + }, + { + "epoch": 2.391134870252812, + "grad_norm": 1.3559210300445557, + "learning_rate": 4.994180282630332e-05, + "loss": 0.0755, + "num_input_tokens_seen": 26130416, + "step": 21470 + }, + { + "epoch": 2.3916917251364294, + "grad_norm": 1.4256223440170288, + "learning_rate": 4.9941637016156826e-05, + "loss": 0.2018, + "num_input_tokens_seen": 26136176, + "step": 21475 + }, + { + "epoch": 2.3922485800200466, + "grad_norm": 0.3413713872432709, + "learning_rate": 4.9941470970416585e-05, + "loss": 0.1111, + "num_input_tokens_seen": 26141968, + "step": 21480 + }, + { + "epoch": 2.3928054349036643, + "grad_norm": 0.19755423069000244, + "learning_rate": 4.994130468908416e-05, + "loss": 0.0973, + "num_input_tokens_seen": 26147920, + "step": 21485 + }, + { + "epoch": 2.3933622897872815, + "grad_norm": 1.653416395187378, + "learning_rate": 4.994113817216114e-05, + "loss": 0.1283, + "num_input_tokens_seen": 26154096, + "step": 21490 + }, + { + "epoch": 2.3939191446708987, + "grad_norm": 0.32590141892433167, + "learning_rate": 4.9940971419649086e-05, + "loss": 0.0648, + "num_input_tokens_seen": 26160208, + "step": 21495 + }, + { + "epoch": 2.394475999554516, + "grad_norm": 0.45446646213531494, + "learning_rate": 4.994080443154957e-05, + "loss": 0.1445, + "num_input_tokens_seen": 26165808, + "step": 21500 + }, + { + "epoch": 2.395032854438133, + "grad_norm": 0.7447226047515869, + "learning_rate": 4.994063720786417e-05, + "loss": 0.177, + "num_input_tokens_seen": 26172080, + "step": 21505 + }, + { + "epoch": 2.395589709321751, + "grad_norm": 0.713303804397583, + "learning_rate": 4.9940469748594474e-05, + "loss": 0.1464, + "num_input_tokens_seen": 26178256, + "step": 21510 + }, + { + "epoch": 2.396146564205368, + "grad_norm": 0.14328868687152863, + "learning_rate": 4.994030205374206e-05, + "loss": 0.0878, + "num_input_tokens_seen": 26183664, + "step": 21515 + }, + { + "epoch": 2.3967034190889853, + "grad_norm": 0.2797495424747467, + "learning_rate": 4.9940134123308515e-05, + "loss": 0.159, + "num_input_tokens_seen": 26189424, + "step": 21520 + }, + { + "epoch": 2.3972602739726026, + "grad_norm": 0.7581200003623962, + "learning_rate": 4.993996595729542e-05, + "loss": 0.0691, + "num_input_tokens_seen": 26195376, + "step": 21525 + }, + { + "epoch": 2.3978171288562202, + "grad_norm": 0.47535568475723267, + "learning_rate": 4.993979755570436e-05, + "loss": 0.1556, + "num_input_tokens_seen": 26201584, + "step": 21530 + }, + { + "epoch": 2.3983739837398375, + "grad_norm": 0.9890551567077637, + "learning_rate": 4.9939628918536936e-05, + "loss": 0.1262, + "num_input_tokens_seen": 26207984, + "step": 21535 + }, + { + "epoch": 2.3989308386234547, + "grad_norm": 0.7766334414482117, + "learning_rate": 4.993946004579473e-05, + "loss": 0.1351, + "num_input_tokens_seen": 26213968, + "step": 21540 + }, + { + "epoch": 2.399487693507072, + "grad_norm": 0.405247300863266, + "learning_rate": 4.9939290937479346e-05, + "loss": 0.2019, + "num_input_tokens_seen": 26220080, + "step": 21545 + }, + { + "epoch": 2.4000445483906896, + "grad_norm": 0.011279375292360783, + "learning_rate": 4.9939121593592384e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26226608, + "step": 21550 + }, + { + "epoch": 2.400601403274307, + "grad_norm": 0.6090007424354553, + "learning_rate": 4.993895201413543e-05, + "loss": 0.1563, + "num_input_tokens_seen": 26233392, + "step": 21555 + }, + { + "epoch": 2.401158258157924, + "grad_norm": 0.2664254307746887, + "learning_rate": 4.993878219911009e-05, + "loss": 0.026, + "num_input_tokens_seen": 26239568, + "step": 21560 + }, + { + "epoch": 2.4017151130415413, + "grad_norm": 0.010922400280833244, + "learning_rate": 4.993861214851798e-05, + "loss": 0.0823, + "num_input_tokens_seen": 26245488, + "step": 21565 + }, + { + "epoch": 2.4022719679251585, + "grad_norm": 0.8204051852226257, + "learning_rate": 4.9938441862360694e-05, + "loss": 0.0695, + "num_input_tokens_seen": 26251856, + "step": 21570 + }, + { + "epoch": 2.402828822808776, + "grad_norm": 0.6945724487304688, + "learning_rate": 4.993827134063984e-05, + "loss": 0.1658, + "num_input_tokens_seen": 26257936, + "step": 21575 + }, + { + "epoch": 2.4033856776923934, + "grad_norm": 1.1571274995803833, + "learning_rate": 4.993810058335704e-05, + "loss": 0.0535, + "num_input_tokens_seen": 26264176, + "step": 21580 + }, + { + "epoch": 2.4039425325760106, + "grad_norm": 0.46638286113739014, + "learning_rate": 4.99379295905139e-05, + "loss": 0.1013, + "num_input_tokens_seen": 26270000, + "step": 21585 + }, + { + "epoch": 2.404499387459628, + "grad_norm": 0.5093395709991455, + "learning_rate": 4.993775836211203e-05, + "loss": 0.1543, + "num_input_tokens_seen": 26276496, + "step": 21590 + }, + { + "epoch": 2.405056242343245, + "grad_norm": 0.6845768690109253, + "learning_rate": 4.9937586898153055e-05, + "loss": 0.0187, + "num_input_tokens_seen": 26282992, + "step": 21595 + }, + { + "epoch": 2.405613097226863, + "grad_norm": 1.19951593875885, + "learning_rate": 4.993741519863859e-05, + "loss": 0.1074, + "num_input_tokens_seen": 26289008, + "step": 21600 + }, + { + "epoch": 2.40616995211048, + "grad_norm": 1.669128656387329, + "learning_rate": 4.9937243263570264e-05, + "loss": 0.0794, + "num_input_tokens_seen": 26295216, + "step": 21605 + }, + { + "epoch": 2.4067268069940972, + "grad_norm": 0.3137910068035126, + "learning_rate": 4.9937071092949696e-05, + "loss": 0.0717, + "num_input_tokens_seen": 26301456, + "step": 21610 + }, + { + "epoch": 2.4072836618777145, + "grad_norm": 0.12057789415121078, + "learning_rate": 4.993689868677851e-05, + "loss": 0.0446, + "num_input_tokens_seen": 26307728, + "step": 21615 + }, + { + "epoch": 2.407840516761332, + "grad_norm": 3.040754795074463, + "learning_rate": 4.9936726045058335e-05, + "loss": 0.1411, + "num_input_tokens_seen": 26313968, + "step": 21620 + }, + { + "epoch": 2.4083973716449494, + "grad_norm": 0.38074803352355957, + "learning_rate": 4.99365531677908e-05, + "loss": 0.1258, + "num_input_tokens_seen": 26320176, + "step": 21625 + }, + { + "epoch": 2.4089542265285666, + "grad_norm": 0.3760179281234741, + "learning_rate": 4.993638005497755e-05, + "loss": 0.0947, + "num_input_tokens_seen": 26326256, + "step": 21630 + }, + { + "epoch": 2.409511081412184, + "grad_norm": 0.20315444469451904, + "learning_rate": 4.99362067066202e-05, + "loss": 0.085, + "num_input_tokens_seen": 26332496, + "step": 21635 + }, + { + "epoch": 2.4100679362958015, + "grad_norm": 0.640313982963562, + "learning_rate": 4.993603312272042e-05, + "loss": 0.0914, + "num_input_tokens_seen": 26338544, + "step": 21640 + }, + { + "epoch": 2.4106247911794187, + "grad_norm": 0.16940762102603912, + "learning_rate": 4.9935859303279807e-05, + "loss": 0.0731, + "num_input_tokens_seen": 26344496, + "step": 21645 + }, + { + "epoch": 2.411181646063036, + "grad_norm": 1.38886296749115, + "learning_rate": 4.9935685248300034e-05, + "loss": 0.2141, + "num_input_tokens_seen": 26350864, + "step": 21650 + }, + { + "epoch": 2.411738500946653, + "grad_norm": 0.1136648952960968, + "learning_rate": 4.993551095778274e-05, + "loss": 0.0443, + "num_input_tokens_seen": 26356912, + "step": 21655 + }, + { + "epoch": 2.4122953558302704, + "grad_norm": 0.022468548268079758, + "learning_rate": 4.993533643172956e-05, + "loss": 0.1371, + "num_input_tokens_seen": 26362928, + "step": 21660 + }, + { + "epoch": 2.412852210713888, + "grad_norm": 1.0445746183395386, + "learning_rate": 4.993516167014215e-05, + "loss": 0.1394, + "num_input_tokens_seen": 26369040, + "step": 21665 + }, + { + "epoch": 2.4134090655975053, + "grad_norm": 0.1482420563697815, + "learning_rate": 4.993498667302216e-05, + "loss": 0.0783, + "num_input_tokens_seen": 26375184, + "step": 21670 + }, + { + "epoch": 2.4139659204811226, + "grad_norm": 0.004739050287753344, + "learning_rate": 4.993481144037124e-05, + "loss": 0.1413, + "num_input_tokens_seen": 26381584, + "step": 21675 + }, + { + "epoch": 2.41452277536474, + "grad_norm": 0.8926354050636292, + "learning_rate": 4.9934635972191054e-05, + "loss": 0.1041, + "num_input_tokens_seen": 26387600, + "step": 21680 + }, + { + "epoch": 2.415079630248357, + "grad_norm": 0.04598228260874748, + "learning_rate": 4.9934460268483266e-05, + "loss": 0.0408, + "num_input_tokens_seen": 26393808, + "step": 21685 + }, + { + "epoch": 2.4156364851319747, + "grad_norm": 0.1750587671995163, + "learning_rate": 4.99342843292495e-05, + "loss": 0.121, + "num_input_tokens_seen": 26399760, + "step": 21690 + }, + { + "epoch": 2.416193340015592, + "grad_norm": 0.9754608273506165, + "learning_rate": 4.993410815449145e-05, + "loss": 0.1035, + "num_input_tokens_seen": 26406128, + "step": 21695 + }, + { + "epoch": 2.416750194899209, + "grad_norm": 0.06714020669460297, + "learning_rate": 4.993393174421078e-05, + "loss": 0.1389, + "num_input_tokens_seen": 26412368, + "step": 21700 + }, + { + "epoch": 2.417307049782827, + "grad_norm": 0.8509841561317444, + "learning_rate": 4.993375509840914e-05, + "loss": 0.1511, + "num_input_tokens_seen": 26418448, + "step": 21705 + }, + { + "epoch": 2.417863904666444, + "grad_norm": 0.01523298304527998, + "learning_rate": 4.9933578217088214e-05, + "loss": 0.051, + "num_input_tokens_seen": 26424720, + "step": 21710 + }, + { + "epoch": 2.4184207595500613, + "grad_norm": 0.3517632484436035, + "learning_rate": 4.993340110024966e-05, + "loss": 0.0428, + "num_input_tokens_seen": 26430832, + "step": 21715 + }, + { + "epoch": 2.4189776144336785, + "grad_norm": 1.1803975105285645, + "learning_rate": 4.9933223747895155e-05, + "loss": 0.1052, + "num_input_tokens_seen": 26436368, + "step": 21720 + }, + { + "epoch": 2.4195344693172958, + "grad_norm": 0.09320402890443802, + "learning_rate": 4.9933046160026374e-05, + "loss": 0.0519, + "num_input_tokens_seen": 26442416, + "step": 21725 + }, + { + "epoch": 2.4200913242009134, + "grad_norm": 0.3909740149974823, + "learning_rate": 4.9932868336645e-05, + "loss": 0.1082, + "num_input_tokens_seen": 26448656, + "step": 21730 + }, + { + "epoch": 2.4206481790845307, + "grad_norm": 0.7012697458267212, + "learning_rate": 4.993269027775271e-05, + "loss": 0.1398, + "num_input_tokens_seen": 26454544, + "step": 21735 + }, + { + "epoch": 2.421205033968148, + "grad_norm": 0.1603822112083435, + "learning_rate": 4.9932511983351184e-05, + "loss": 0.0734, + "num_input_tokens_seen": 26460656, + "step": 21740 + }, + { + "epoch": 2.421761888851765, + "grad_norm": 0.12287385016679764, + "learning_rate": 4.993233345344211e-05, + "loss": 0.0737, + "num_input_tokens_seen": 26466768, + "step": 21745 + }, + { + "epoch": 2.4223187437353824, + "grad_norm": 0.6246352791786194, + "learning_rate": 4.9932154688027154e-05, + "loss": 0.1441, + "num_input_tokens_seen": 26473456, + "step": 21750 + }, + { + "epoch": 2.422875598619, + "grad_norm": 1.0855516195297241, + "learning_rate": 4.993197568710803e-05, + "loss": 0.1845, + "num_input_tokens_seen": 26479600, + "step": 21755 + }, + { + "epoch": 2.4234324535026173, + "grad_norm": 1.524106740951538, + "learning_rate": 4.993179645068643e-05, + "loss": 0.2659, + "num_input_tokens_seen": 26485552, + "step": 21760 + }, + { + "epoch": 2.4239893083862345, + "grad_norm": 0.0796184092760086, + "learning_rate": 4.993161697876403e-05, + "loss": 0.0864, + "num_input_tokens_seen": 26491312, + "step": 21765 + }, + { + "epoch": 2.4245461632698517, + "grad_norm": 0.1845092922449112, + "learning_rate": 4.993143727134254e-05, + "loss": 0.0491, + "num_input_tokens_seen": 26497680, + "step": 21770 + }, + { + "epoch": 2.425103018153469, + "grad_norm": 0.030195781961083412, + "learning_rate": 4.993125732842364e-05, + "loss": 0.0842, + "num_input_tokens_seen": 26503728, + "step": 21775 + }, + { + "epoch": 2.4256598730370866, + "grad_norm": 0.062212783843278885, + "learning_rate": 4.993107715000905e-05, + "loss": 0.1165, + "num_input_tokens_seen": 26509872, + "step": 21780 + }, + { + "epoch": 2.426216727920704, + "grad_norm": 0.14682380855083466, + "learning_rate": 4.993089673610045e-05, + "loss": 0.0507, + "num_input_tokens_seen": 26516176, + "step": 21785 + }, + { + "epoch": 2.426773582804321, + "grad_norm": 1.0013126134872437, + "learning_rate": 4.993071608669957e-05, + "loss": 0.0848, + "num_input_tokens_seen": 26522384, + "step": 21790 + }, + { + "epoch": 2.4273304376879388, + "grad_norm": 0.7712774276733398, + "learning_rate": 4.9930535201808095e-05, + "loss": 0.123, + "num_input_tokens_seen": 26528272, + "step": 21795 + }, + { + "epoch": 2.427887292571556, + "grad_norm": 0.20911645889282227, + "learning_rate": 4.993035408142773e-05, + "loss": 0.0502, + "num_input_tokens_seen": 26534512, + "step": 21800 + }, + { + "epoch": 2.4284441474551732, + "grad_norm": 0.5562732219696045, + "learning_rate": 4.993017272556021e-05, + "loss": 0.0613, + "num_input_tokens_seen": 26540560, + "step": 21805 + }, + { + "epoch": 2.4290010023387905, + "grad_norm": 0.2635814845561981, + "learning_rate": 4.992999113420724e-05, + "loss": 0.0744, + "num_input_tokens_seen": 26546544, + "step": 21810 + }, + { + "epoch": 2.4295578572224077, + "grad_norm": 0.8388010859489441, + "learning_rate": 4.9929809307370525e-05, + "loss": 0.0529, + "num_input_tokens_seen": 26552496, + "step": 21815 + }, + { + "epoch": 2.4301147121060254, + "grad_norm": 0.19112971425056458, + "learning_rate": 4.992962724505178e-05, + "loss": 0.0648, + "num_input_tokens_seen": 26558480, + "step": 21820 + }, + { + "epoch": 2.4306715669896426, + "grad_norm": 1.7179633378982544, + "learning_rate": 4.992944494725274e-05, + "loss": 0.1252, + "num_input_tokens_seen": 26564304, + "step": 21825 + }, + { + "epoch": 2.43122842187326, + "grad_norm": 0.9014731049537659, + "learning_rate": 4.9929262413975114e-05, + "loss": 0.1552, + "num_input_tokens_seen": 26570064, + "step": 21830 + }, + { + "epoch": 2.431785276756877, + "grad_norm": 0.08476664870977402, + "learning_rate": 4.992907964522063e-05, + "loss": 0.1541, + "num_input_tokens_seen": 26576208, + "step": 21835 + }, + { + "epoch": 2.4323421316404943, + "grad_norm": 0.4105674624443054, + "learning_rate": 4.992889664099103e-05, + "loss": 0.1096, + "num_input_tokens_seen": 26582384, + "step": 21840 + }, + { + "epoch": 2.432898986524112, + "grad_norm": 1.8181203603744507, + "learning_rate": 4.9928713401288016e-05, + "loss": 0.098, + "num_input_tokens_seen": 26588240, + "step": 21845 + }, + { + "epoch": 2.433455841407729, + "grad_norm": 0.026895763352513313, + "learning_rate": 4.992852992611333e-05, + "loss": 0.069, + "num_input_tokens_seen": 26594224, + "step": 21850 + }, + { + "epoch": 2.4340126962913464, + "grad_norm": 0.0038272507954388857, + "learning_rate": 4.9928346215468716e-05, + "loss": 0.0275, + "num_input_tokens_seen": 26600592, + "step": 21855 + }, + { + "epoch": 2.4345695511749637, + "grad_norm": 0.9413275718688965, + "learning_rate": 4.992816226935589e-05, + "loss": 0.1356, + "num_input_tokens_seen": 26606512, + "step": 21860 + }, + { + "epoch": 2.4351264060585813, + "grad_norm": 0.7766362428665161, + "learning_rate": 4.992797808777661e-05, + "loss": 0.1936, + "num_input_tokens_seen": 26612304, + "step": 21865 + }, + { + "epoch": 2.4356832609421986, + "grad_norm": 0.29102808237075806, + "learning_rate": 4.9927793670732595e-05, + "loss": 0.0362, + "num_input_tokens_seen": 26618608, + "step": 21870 + }, + { + "epoch": 2.436240115825816, + "grad_norm": 0.2754632234573364, + "learning_rate": 4.99276090182256e-05, + "loss": 0.0545, + "num_input_tokens_seen": 26624656, + "step": 21875 + }, + { + "epoch": 2.436796970709433, + "grad_norm": 1.625484824180603, + "learning_rate": 4.992742413025737e-05, + "loss": 0.1899, + "num_input_tokens_seen": 26630832, + "step": 21880 + }, + { + "epoch": 2.4373538255930507, + "grad_norm": 0.07813554257154465, + "learning_rate": 4.992723900682964e-05, + "loss": 0.0377, + "num_input_tokens_seen": 26637040, + "step": 21885 + }, + { + "epoch": 2.437910680476668, + "grad_norm": 0.77944016456604, + "learning_rate": 4.992705364794417e-05, + "loss": 0.0902, + "num_input_tokens_seen": 26643152, + "step": 21890 + }, + { + "epoch": 2.438467535360285, + "grad_norm": 0.1904945820569992, + "learning_rate": 4.992686805360271e-05, + "loss": 0.1587, + "num_input_tokens_seen": 26649680, + "step": 21895 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 0.3073793947696686, + "learning_rate": 4.992668222380701e-05, + "loss": 0.0376, + "num_input_tokens_seen": 26655152, + "step": 21900 + }, + { + "epoch": 2.4395812451275196, + "grad_norm": 1.3693710565567017, + "learning_rate": 4.992649615855882e-05, + "loss": 0.1238, + "num_input_tokens_seen": 26660656, + "step": 21905 + }, + { + "epoch": 2.4401381000111373, + "grad_norm": 0.5036861300468445, + "learning_rate": 4.992630985785991e-05, + "loss": 0.051, + "num_input_tokens_seen": 26666448, + "step": 21910 + }, + { + "epoch": 2.4406949548947545, + "grad_norm": 0.8807055950164795, + "learning_rate": 4.992612332171202e-05, + "loss": 0.1147, + "num_input_tokens_seen": 26672592, + "step": 21915 + }, + { + "epoch": 2.4412518097783718, + "grad_norm": 1.8776699304580688, + "learning_rate": 4.992593655011694e-05, + "loss": 0.078, + "num_input_tokens_seen": 26678768, + "step": 21920 + }, + { + "epoch": 2.441808664661989, + "grad_norm": 0.10028815269470215, + "learning_rate": 4.992574954307642e-05, + "loss": 0.0867, + "num_input_tokens_seen": 26684752, + "step": 21925 + }, + { + "epoch": 2.442365519545606, + "grad_norm": 0.018729988485574722, + "learning_rate": 4.992556230059221e-05, + "loss": 0.0149, + "num_input_tokens_seen": 26691152, + "step": 21930 + }, + { + "epoch": 2.442922374429224, + "grad_norm": 0.3214181065559387, + "learning_rate": 4.9925374822666103e-05, + "loss": 0.0319, + "num_input_tokens_seen": 26697456, + "step": 21935 + }, + { + "epoch": 2.443479229312841, + "grad_norm": 1.4133639335632324, + "learning_rate": 4.992518710929986e-05, + "loss": 0.0408, + "num_input_tokens_seen": 26703792, + "step": 21940 + }, + { + "epoch": 2.4440360841964583, + "grad_norm": 0.8561100959777832, + "learning_rate": 4.992499916049526e-05, + "loss": 0.0743, + "num_input_tokens_seen": 26710128, + "step": 21945 + }, + { + "epoch": 2.4445929390800756, + "grad_norm": 0.21632744371891022, + "learning_rate": 4.9924810976254065e-05, + "loss": 0.1035, + "num_input_tokens_seen": 26716208, + "step": 21950 + }, + { + "epoch": 2.4451497939636933, + "grad_norm": 0.0955447256565094, + "learning_rate": 4.9924622556578065e-05, + "loss": 0.1031, + "num_input_tokens_seen": 26722608, + "step": 21955 + }, + { + "epoch": 2.4457066488473105, + "grad_norm": 0.011309538036584854, + "learning_rate": 4.9924433901469034e-05, + "loss": 0.1211, + "num_input_tokens_seen": 26728272, + "step": 21960 + }, + { + "epoch": 2.4462635037309277, + "grad_norm": 0.08618491142988205, + "learning_rate": 4.992424501092876e-05, + "loss": 0.0741, + "num_input_tokens_seen": 26734352, + "step": 21965 + }, + { + "epoch": 2.446820358614545, + "grad_norm": 1.1245007514953613, + "learning_rate": 4.992405588495902e-05, + "loss": 0.1191, + "num_input_tokens_seen": 26740656, + "step": 21970 + }, + { + "epoch": 2.4473772134981626, + "grad_norm": 0.5520121455192566, + "learning_rate": 4.99238665235616e-05, + "loss": 0.0863, + "num_input_tokens_seen": 26746640, + "step": 21975 + }, + { + "epoch": 2.44793406838178, + "grad_norm": 1.8620868921279907, + "learning_rate": 4.992367692673829e-05, + "loss": 0.2706, + "num_input_tokens_seen": 26752720, + "step": 21980 + }, + { + "epoch": 2.448490923265397, + "grad_norm": 0.18224437534809113, + "learning_rate": 4.99234870944909e-05, + "loss": 0.0301, + "num_input_tokens_seen": 26759280, + "step": 21985 + }, + { + "epoch": 2.4490477781490143, + "grad_norm": 1.7971822023391724, + "learning_rate": 4.992329702682119e-05, + "loss": 0.2004, + "num_input_tokens_seen": 26765232, + "step": 21990 + }, + { + "epoch": 2.4496046330326315, + "grad_norm": 1.7454503774642944, + "learning_rate": 4.992310672373097e-05, + "loss": 0.0938, + "num_input_tokens_seen": 26771120, + "step": 21995 + }, + { + "epoch": 2.450161487916249, + "grad_norm": 1.6930009126663208, + "learning_rate": 4.992291618522204e-05, + "loss": 0.2942, + "num_input_tokens_seen": 26777136, + "step": 22000 + }, + { + "epoch": 2.4507183427998664, + "grad_norm": 0.11972764134407043, + "learning_rate": 4.992272541129621e-05, + "loss": 0.0835, + "num_input_tokens_seen": 26782832, + "step": 22005 + }, + { + "epoch": 2.4512751976834837, + "grad_norm": 0.9694086909294128, + "learning_rate": 4.9922534401955265e-05, + "loss": 0.0834, + "num_input_tokens_seen": 26789040, + "step": 22010 + }, + { + "epoch": 2.451832052567101, + "grad_norm": 2.4053196907043457, + "learning_rate": 4.992234315720101e-05, + "loss": 0.2114, + "num_input_tokens_seen": 26794768, + "step": 22015 + }, + { + "epoch": 2.452388907450718, + "grad_norm": 0.9535958766937256, + "learning_rate": 4.9922151677035265e-05, + "loss": 0.0648, + "num_input_tokens_seen": 26800816, + "step": 22020 + }, + { + "epoch": 2.452945762334336, + "grad_norm": 0.26059016585350037, + "learning_rate": 4.992195996145982e-05, + "loss": 0.1055, + "num_input_tokens_seen": 26807184, + "step": 22025 + }, + { + "epoch": 2.453502617217953, + "grad_norm": 0.6282969117164612, + "learning_rate": 4.992176801047651e-05, + "loss": 0.1041, + "num_input_tokens_seen": 26813488, + "step": 22030 + }, + { + "epoch": 2.4540594721015703, + "grad_norm": 0.07568880915641785, + "learning_rate": 4.992157582408712e-05, + "loss": 0.0639, + "num_input_tokens_seen": 26819600, + "step": 22035 + }, + { + "epoch": 2.4546163269851875, + "grad_norm": 1.1668579578399658, + "learning_rate": 4.992138340229349e-05, + "loss": 0.0905, + "num_input_tokens_seen": 26825808, + "step": 22040 + }, + { + "epoch": 2.455173181868805, + "grad_norm": 0.2874809801578522, + "learning_rate": 4.992119074509742e-05, + "loss": 0.1394, + "num_input_tokens_seen": 26831920, + "step": 22045 + }, + { + "epoch": 2.4557300367524224, + "grad_norm": 0.4018261730670929, + "learning_rate": 4.992099785250074e-05, + "loss": 0.1017, + "num_input_tokens_seen": 26838192, + "step": 22050 + }, + { + "epoch": 2.4562868916360396, + "grad_norm": 0.7883277535438538, + "learning_rate": 4.992080472450526e-05, + "loss": 0.0928, + "num_input_tokens_seen": 26844048, + "step": 22055 + }, + { + "epoch": 2.456843746519657, + "grad_norm": 0.20946452021598816, + "learning_rate": 4.992061136111283e-05, + "loss": 0.1108, + "num_input_tokens_seen": 26850416, + "step": 22060 + }, + { + "epoch": 2.4574006014032745, + "grad_norm": 0.014814150519669056, + "learning_rate": 4.992041776232525e-05, + "loss": 0.0826, + "num_input_tokens_seen": 26856336, + "step": 22065 + }, + { + "epoch": 2.4579574562868918, + "grad_norm": 0.08575788885354996, + "learning_rate": 4.992022392814436e-05, + "loss": 0.078, + "num_input_tokens_seen": 26862448, + "step": 22070 + }, + { + "epoch": 2.458514311170509, + "grad_norm": 0.5758737325668335, + "learning_rate": 4.9920029858571985e-05, + "loss": 0.2604, + "num_input_tokens_seen": 26868016, + "step": 22075 + }, + { + "epoch": 2.4590711660541262, + "grad_norm": 0.9591152667999268, + "learning_rate": 4.9919835553609965e-05, + "loss": 0.1358, + "num_input_tokens_seen": 26873744, + "step": 22080 + }, + { + "epoch": 2.4596280209377435, + "grad_norm": 0.10942506790161133, + "learning_rate": 4.991964101326013e-05, + "loss": 0.0705, + "num_input_tokens_seen": 26879760, + "step": 22085 + }, + { + "epoch": 2.460184875821361, + "grad_norm": 0.02389751374721527, + "learning_rate": 4.991944623752432e-05, + "loss": 0.0535, + "num_input_tokens_seen": 26886096, + "step": 22090 + }, + { + "epoch": 2.4607417307049784, + "grad_norm": 0.4984317123889923, + "learning_rate": 4.9919251226404386e-05, + "loss": 0.0807, + "num_input_tokens_seen": 26892048, + "step": 22095 + }, + { + "epoch": 2.4612985855885956, + "grad_norm": 1.317285180091858, + "learning_rate": 4.991905597990215e-05, + "loss": 0.0582, + "num_input_tokens_seen": 26898032, + "step": 22100 + }, + { + "epoch": 2.461855440472213, + "grad_norm": 0.6916038990020752, + "learning_rate": 4.991886049801947e-05, + "loss": 0.1153, + "num_input_tokens_seen": 26904336, + "step": 22105 + }, + { + "epoch": 2.46241229535583, + "grad_norm": 0.0707123652100563, + "learning_rate": 4.9918664780758184e-05, + "loss": 0.1018, + "num_input_tokens_seen": 26910224, + "step": 22110 + }, + { + "epoch": 2.4629691502394477, + "grad_norm": 0.07613743841648102, + "learning_rate": 4.9918468828120144e-05, + "loss": 0.0626, + "num_input_tokens_seen": 26916784, + "step": 22115 + }, + { + "epoch": 2.463526005123065, + "grad_norm": 1.943077802658081, + "learning_rate": 4.991827264010721e-05, + "loss": 0.2682, + "num_input_tokens_seen": 26922672, + "step": 22120 + }, + { + "epoch": 2.464082860006682, + "grad_norm": 0.025413773953914642, + "learning_rate": 4.991807621672122e-05, + "loss": 0.0732, + "num_input_tokens_seen": 26928688, + "step": 22125 + }, + { + "epoch": 2.4646397148902994, + "grad_norm": 0.6115701198577881, + "learning_rate": 4.9917879557964036e-05, + "loss": 0.0343, + "num_input_tokens_seen": 26935184, + "step": 22130 + }, + { + "epoch": 2.465196569773917, + "grad_norm": 0.5082683563232422, + "learning_rate": 4.991768266383752e-05, + "loss": 0.071, + "num_input_tokens_seen": 26941168, + "step": 22135 + }, + { + "epoch": 2.4657534246575343, + "grad_norm": 0.1622924506664276, + "learning_rate": 4.991748553434352e-05, + "loss": 0.0389, + "num_input_tokens_seen": 26947440, + "step": 22140 + }, + { + "epoch": 2.4663102795411516, + "grad_norm": 0.01433936320245266, + "learning_rate": 4.991728816948391e-05, + "loss": 0.0154, + "num_input_tokens_seen": 26954064, + "step": 22145 + }, + { + "epoch": 2.466867134424769, + "grad_norm": 0.2581445872783661, + "learning_rate": 4.991709056926055e-05, + "loss": 0.0186, + "num_input_tokens_seen": 26960400, + "step": 22150 + }, + { + "epoch": 2.4674239893083865, + "grad_norm": 0.2644546627998352, + "learning_rate": 4.99168927336753e-05, + "loss": 0.116, + "num_input_tokens_seen": 26966608, + "step": 22155 + }, + { + "epoch": 2.4679808441920037, + "grad_norm": 0.05691863223910332, + "learning_rate": 4.991669466273004e-05, + "loss": 0.0497, + "num_input_tokens_seen": 26972848, + "step": 22160 + }, + { + "epoch": 2.468537699075621, + "grad_norm": 0.8352113366127014, + "learning_rate": 4.9916496356426644e-05, + "loss": 0.0918, + "num_input_tokens_seen": 26979056, + "step": 22165 + }, + { + "epoch": 2.469094553959238, + "grad_norm": 0.14373454451560974, + "learning_rate": 4.991629781476697e-05, + "loss": 0.1843, + "num_input_tokens_seen": 26984976, + "step": 22170 + }, + { + "epoch": 2.4696514088428554, + "grad_norm": 0.6587885618209839, + "learning_rate": 4.9916099037752894e-05, + "loss": 0.0886, + "num_input_tokens_seen": 26991120, + "step": 22175 + }, + { + "epoch": 2.470208263726473, + "grad_norm": 0.8025740385055542, + "learning_rate": 4.991590002538631e-05, + "loss": 0.116, + "num_input_tokens_seen": 26997424, + "step": 22180 + }, + { + "epoch": 2.4707651186100903, + "grad_norm": 0.09958766400814056, + "learning_rate": 4.991570077766908e-05, + "loss": 0.0832, + "num_input_tokens_seen": 27003248, + "step": 22185 + }, + { + "epoch": 2.4713219734937075, + "grad_norm": 0.3149507939815521, + "learning_rate": 4.9915501294603103e-05, + "loss": 0.09, + "num_input_tokens_seen": 27008912, + "step": 22190 + }, + { + "epoch": 2.4718788283773248, + "grad_norm": 0.3943941295146942, + "learning_rate": 4.9915301576190255e-05, + "loss": 0.1048, + "num_input_tokens_seen": 27015056, + "step": 22195 + }, + { + "epoch": 2.472435683260942, + "grad_norm": 1.154579520225525, + "learning_rate": 4.991510162243241e-05, + "loss": 0.0465, + "num_input_tokens_seen": 27021040, + "step": 22200 + }, + { + "epoch": 2.4729925381445597, + "grad_norm": 0.18023811280727386, + "learning_rate": 4.991490143333147e-05, + "loss": 0.0294, + "num_input_tokens_seen": 27027248, + "step": 22205 + }, + { + "epoch": 2.473549393028177, + "grad_norm": 0.5479745268821716, + "learning_rate": 4.9914701008889334e-05, + "loss": 0.069, + "num_input_tokens_seen": 27033584, + "step": 22210 + }, + { + "epoch": 2.474106247911794, + "grad_norm": 0.10846550762653351, + "learning_rate": 4.9914500349107886e-05, + "loss": 0.0568, + "num_input_tokens_seen": 27039568, + "step": 22215 + }, + { + "epoch": 2.4746631027954114, + "grad_norm": 1.8344275951385498, + "learning_rate": 4.9914299453989014e-05, + "loss": 0.092, + "num_input_tokens_seen": 27045872, + "step": 22220 + }, + { + "epoch": 2.475219957679029, + "grad_norm": 0.03865371271967888, + "learning_rate": 4.991409832353463e-05, + "loss": 0.1617, + "num_input_tokens_seen": 27051984, + "step": 22225 + }, + { + "epoch": 2.4757768125626463, + "grad_norm": 0.028492363169789314, + "learning_rate": 4.991389695774662e-05, + "loss": 0.0461, + "num_input_tokens_seen": 27058096, + "step": 22230 + }, + { + "epoch": 2.4763336674462635, + "grad_norm": 0.14760807156562805, + "learning_rate": 4.991369535662689e-05, + "loss": 0.1266, + "num_input_tokens_seen": 27064208, + "step": 22235 + }, + { + "epoch": 2.4768905223298807, + "grad_norm": 0.11370089650154114, + "learning_rate": 4.991349352017735e-05, + "loss": 0.151, + "num_input_tokens_seen": 27070256, + "step": 22240 + }, + { + "epoch": 2.4774473772134984, + "grad_norm": 0.3835414946079254, + "learning_rate": 4.99132914483999e-05, + "loss": 0.1064, + "num_input_tokens_seen": 27076240, + "step": 22245 + }, + { + "epoch": 2.4780042320971156, + "grad_norm": 1.4570270776748657, + "learning_rate": 4.9913089141296464e-05, + "loss": 0.1899, + "num_input_tokens_seen": 27082032, + "step": 22250 + }, + { + "epoch": 2.478561086980733, + "grad_norm": 1.243919014930725, + "learning_rate": 4.991288659886893e-05, + "loss": 0.0573, + "num_input_tokens_seen": 27087952, + "step": 22255 + }, + { + "epoch": 2.47911794186435, + "grad_norm": 0.1782841980457306, + "learning_rate": 4.991268382111923e-05, + "loss": 0.1175, + "num_input_tokens_seen": 27094096, + "step": 22260 + }, + { + "epoch": 2.4796747967479673, + "grad_norm": 1.028947114944458, + "learning_rate": 4.9912480808049264e-05, + "loss": 0.0613, + "num_input_tokens_seen": 27099952, + "step": 22265 + }, + { + "epoch": 2.480231651631585, + "grad_norm": 1.5430773496627808, + "learning_rate": 4.991227755966096e-05, + "loss": 0.1173, + "num_input_tokens_seen": 27105936, + "step": 22270 + }, + { + "epoch": 2.480788506515202, + "grad_norm": 0.18419234454631805, + "learning_rate": 4.991207407595623e-05, + "loss": 0.0303, + "num_input_tokens_seen": 27112016, + "step": 22275 + }, + { + "epoch": 2.4813453613988194, + "grad_norm": 0.13292504847049713, + "learning_rate": 4.9911870356937004e-05, + "loss": 0.0758, + "num_input_tokens_seen": 27117904, + "step": 22280 + }, + { + "epoch": 2.4819022162824367, + "grad_norm": 0.6859164834022522, + "learning_rate": 4.9911666402605214e-05, + "loss": 0.0693, + "num_input_tokens_seen": 27124048, + "step": 22285 + }, + { + "epoch": 2.482459071166054, + "grad_norm": 0.9522926211357117, + "learning_rate": 4.9911462212962766e-05, + "loss": 0.1157, + "num_input_tokens_seen": 27130256, + "step": 22290 + }, + { + "epoch": 2.4830159260496716, + "grad_norm": 1.5820661783218384, + "learning_rate": 4.9911257788011603e-05, + "loss": 0.124, + "num_input_tokens_seen": 27136368, + "step": 22295 + }, + { + "epoch": 2.483572780933289, + "grad_norm": 0.7875621914863586, + "learning_rate": 4.991105312775365e-05, + "loss": 0.0941, + "num_input_tokens_seen": 27142480, + "step": 22300 + }, + { + "epoch": 2.484129635816906, + "grad_norm": 0.6705824732780457, + "learning_rate": 4.9910848232190834e-05, + "loss": 0.0858, + "num_input_tokens_seen": 27148592, + "step": 22305 + }, + { + "epoch": 2.4846864907005233, + "grad_norm": 0.5216919779777527, + "learning_rate": 4.991064310132511e-05, + "loss": 0.1007, + "num_input_tokens_seen": 27154640, + "step": 22310 + }, + { + "epoch": 2.485243345584141, + "grad_norm": 0.14258243143558502, + "learning_rate": 4.99104377351584e-05, + "loss": 0.0477, + "num_input_tokens_seen": 27160496, + "step": 22315 + }, + { + "epoch": 2.485800200467758, + "grad_norm": 0.7485730648040771, + "learning_rate": 4.991023213369265e-05, + "loss": 0.061, + "num_input_tokens_seen": 27166800, + "step": 22320 + }, + { + "epoch": 2.4863570553513754, + "grad_norm": 0.8855988383293152, + "learning_rate": 4.991002629692979e-05, + "loss": 0.1135, + "num_input_tokens_seen": 27172848, + "step": 22325 + }, + { + "epoch": 2.4869139102349926, + "grad_norm": 0.2776215970516205, + "learning_rate": 4.9909820224871787e-05, + "loss": 0.0443, + "num_input_tokens_seen": 27178832, + "step": 22330 + }, + { + "epoch": 2.4874707651186103, + "grad_norm": 1.396081566810608, + "learning_rate": 4.990961391752056e-05, + "loss": 0.1186, + "num_input_tokens_seen": 27185200, + "step": 22335 + }, + { + "epoch": 2.4880276200022275, + "grad_norm": 0.6174371242523193, + "learning_rate": 4.990940737487808e-05, + "loss": 0.1068, + "num_input_tokens_seen": 27191440, + "step": 22340 + }, + { + "epoch": 2.4885844748858448, + "grad_norm": 0.8374453783035278, + "learning_rate": 4.990920059694629e-05, + "loss": 0.0352, + "num_input_tokens_seen": 27197584, + "step": 22345 + }, + { + "epoch": 2.489141329769462, + "grad_norm": 0.11056523770093918, + "learning_rate": 4.9908993583727145e-05, + "loss": 0.0129, + "num_input_tokens_seen": 27203600, + "step": 22350 + }, + { + "epoch": 2.4896981846530792, + "grad_norm": 0.3962629437446594, + "learning_rate": 4.99087863352226e-05, + "loss": 0.0346, + "num_input_tokens_seen": 27209808, + "step": 22355 + }, + { + "epoch": 2.490255039536697, + "grad_norm": 0.3111841082572937, + "learning_rate": 4.99085788514346e-05, + "loss": 0.017, + "num_input_tokens_seen": 27215984, + "step": 22360 + }, + { + "epoch": 2.490811894420314, + "grad_norm": 1.0062289237976074, + "learning_rate": 4.990837113236514e-05, + "loss": 0.1689, + "num_input_tokens_seen": 27221968, + "step": 22365 + }, + { + "epoch": 2.4913687493039314, + "grad_norm": 2.187751054763794, + "learning_rate": 4.990816317801614e-05, + "loss": 0.1448, + "num_input_tokens_seen": 27227472, + "step": 22370 + }, + { + "epoch": 2.4919256041875486, + "grad_norm": 0.1315116435289383, + "learning_rate": 4.9907954988389585e-05, + "loss": 0.08, + "num_input_tokens_seen": 27233808, + "step": 22375 + }, + { + "epoch": 2.492482459071166, + "grad_norm": 0.1603914201259613, + "learning_rate": 4.9907746563487444e-05, + "loss": 0.0348, + "num_input_tokens_seen": 27239824, + "step": 22380 + }, + { + "epoch": 2.4930393139547835, + "grad_norm": 0.40070268511772156, + "learning_rate": 4.990753790331168e-05, + "loss": 0.1099, + "num_input_tokens_seen": 27245872, + "step": 22385 + }, + { + "epoch": 2.4935961688384007, + "grad_norm": 1.0708872079849243, + "learning_rate": 4.9907329007864255e-05, + "loss": 0.2067, + "num_input_tokens_seen": 27252304, + "step": 22390 + }, + { + "epoch": 2.494153023722018, + "grad_norm": 0.9904220104217529, + "learning_rate": 4.9907119877147165e-05, + "loss": 0.1167, + "num_input_tokens_seen": 27258160, + "step": 22395 + }, + { + "epoch": 2.494709878605635, + "grad_norm": 0.011807426810264587, + "learning_rate": 4.990691051116236e-05, + "loss": 0.1093, + "num_input_tokens_seen": 27264304, + "step": 22400 + }, + { + "epoch": 2.495266733489253, + "grad_norm": 0.0017467120196670294, + "learning_rate": 4.990670090991184e-05, + "loss": 0.0466, + "num_input_tokens_seen": 27270128, + "step": 22405 + }, + { + "epoch": 2.49582358837287, + "grad_norm": 0.5110424160957336, + "learning_rate": 4.9906491073397576e-05, + "loss": 0.191, + "num_input_tokens_seen": 27275664, + "step": 22410 + }, + { + "epoch": 2.4963804432564873, + "grad_norm": 0.6507238745689392, + "learning_rate": 4.990628100162155e-05, + "loss": 0.0389, + "num_input_tokens_seen": 27281936, + "step": 22415 + }, + { + "epoch": 2.4969372981401046, + "grad_norm": 1.1203218698501587, + "learning_rate": 4.990607069458574e-05, + "loss": 0.0381, + "num_input_tokens_seen": 27288304, + "step": 22420 + }, + { + "epoch": 2.4974941530237222, + "grad_norm": 0.12448607385158539, + "learning_rate": 4.9905860152292136e-05, + "loss": 0.0363, + "num_input_tokens_seen": 27294224, + "step": 22425 + }, + { + "epoch": 2.4980510079073395, + "grad_norm": 0.4053685963153839, + "learning_rate": 4.990564937474273e-05, + "loss": 0.0928, + "num_input_tokens_seen": 27300272, + "step": 22430 + }, + { + "epoch": 2.4986078627909567, + "grad_norm": 0.6782663464546204, + "learning_rate": 4.990543836193952e-05, + "loss": 0.0903, + "num_input_tokens_seen": 27306352, + "step": 22435 + }, + { + "epoch": 2.499164717674574, + "grad_norm": 0.08556551486253738, + "learning_rate": 4.990522711388448e-05, + "loss": 0.1436, + "num_input_tokens_seen": 27312432, + "step": 22440 + }, + { + "epoch": 2.499721572558191, + "grad_norm": 0.9874436259269714, + "learning_rate": 4.990501563057962e-05, + "loss": 0.06, + "num_input_tokens_seen": 27318224, + "step": 22445 + }, + { + "epoch": 2.500278427441809, + "grad_norm": 2.2671120166778564, + "learning_rate": 4.990480391202693e-05, + "loss": 0.1408, + "num_input_tokens_seen": 27324304, + "step": 22450 + }, + { + "epoch": 2.500835282325426, + "grad_norm": 0.40904682874679565, + "learning_rate": 4.990459195822842e-05, + "loss": 0.1236, + "num_input_tokens_seen": 27330448, + "step": 22455 + }, + { + "epoch": 2.5013921372090433, + "grad_norm": 0.40598058700561523, + "learning_rate": 4.9904379769186085e-05, + "loss": 0.061, + "num_input_tokens_seen": 27336496, + "step": 22460 + }, + { + "epoch": 2.5019489920926605, + "grad_norm": 1.7684462070465088, + "learning_rate": 4.990416734490193e-05, + "loss": 0.2163, + "num_input_tokens_seen": 27342704, + "step": 22465 + }, + { + "epoch": 2.5025058469762778, + "grad_norm": 1.4594372510910034, + "learning_rate": 4.990395468537795e-05, + "loss": 0.181, + "num_input_tokens_seen": 27349040, + "step": 22470 + }, + { + "epoch": 2.5030627018598954, + "grad_norm": 0.5285419821739197, + "learning_rate": 4.990374179061618e-05, + "loss": 0.0645, + "num_input_tokens_seen": 27355088, + "step": 22475 + }, + { + "epoch": 2.5036195567435127, + "grad_norm": 0.00758148031309247, + "learning_rate": 4.990352866061862e-05, + "loss": 0.1636, + "num_input_tokens_seen": 27361232, + "step": 22480 + }, + { + "epoch": 2.50417641162713, + "grad_norm": 0.42360448837280273, + "learning_rate": 4.9903315295387265e-05, + "loss": 0.0386, + "num_input_tokens_seen": 27367760, + "step": 22485 + }, + { + "epoch": 2.5047332665107476, + "grad_norm": 0.2934570014476776, + "learning_rate": 4.990310169492415e-05, + "loss": 0.0312, + "num_input_tokens_seen": 27373808, + "step": 22490 + }, + { + "epoch": 2.5052901213943644, + "grad_norm": 0.1992076188325882, + "learning_rate": 4.990288785923128e-05, + "loss": 0.0256, + "num_input_tokens_seen": 27379856, + "step": 22495 + }, + { + "epoch": 2.505846976277982, + "grad_norm": 0.014079853892326355, + "learning_rate": 4.990267378831069e-05, + "loss": 0.2061, + "num_input_tokens_seen": 27385808, + "step": 22500 + }, + { + "epoch": 2.5064038311615993, + "grad_norm": 1.5862233638763428, + "learning_rate": 4.99024594821644e-05, + "loss": 0.0682, + "num_input_tokens_seen": 27391696, + "step": 22505 + }, + { + "epoch": 2.5069606860452165, + "grad_norm": 0.18432669341564178, + "learning_rate": 4.9902244940794424e-05, + "loss": 0.0308, + "num_input_tokens_seen": 27397936, + "step": 22510 + }, + { + "epoch": 2.507517540928834, + "grad_norm": 0.5610091090202332, + "learning_rate": 4.99020301642028e-05, + "loss": 0.055, + "num_input_tokens_seen": 27404080, + "step": 22515 + }, + { + "epoch": 2.5080743958124514, + "grad_norm": 0.1279040426015854, + "learning_rate": 4.990181515239153e-05, + "loss": 0.0208, + "num_input_tokens_seen": 27410448, + "step": 22520 + }, + { + "epoch": 2.5086312506960686, + "grad_norm": 0.2139294445514679, + "learning_rate": 4.9901599905362686e-05, + "loss": 0.0174, + "num_input_tokens_seen": 27416560, + "step": 22525 + }, + { + "epoch": 2.509188105579686, + "grad_norm": 0.42132365703582764, + "learning_rate": 4.990138442311827e-05, + "loss": 0.1614, + "num_input_tokens_seen": 27422704, + "step": 22530 + }, + { + "epoch": 2.509744960463303, + "grad_norm": 0.8561932444572449, + "learning_rate": 4.990116870566033e-05, + "loss": 0.1819, + "num_input_tokens_seen": 27428912, + "step": 22535 + }, + { + "epoch": 2.5103018153469208, + "grad_norm": 0.002083142986521125, + "learning_rate": 4.9900952752990895e-05, + "loss": 0.0288, + "num_input_tokens_seen": 27434672, + "step": 22540 + }, + { + "epoch": 2.510858670230538, + "grad_norm": 0.011129438877105713, + "learning_rate": 4.990073656511202e-05, + "loss": 0.0985, + "num_input_tokens_seen": 27440944, + "step": 22545 + }, + { + "epoch": 2.5114155251141552, + "grad_norm": 0.5657509565353394, + "learning_rate": 4.990052014202573e-05, + "loss": 0.16, + "num_input_tokens_seen": 27447344, + "step": 22550 + }, + { + "epoch": 2.5119723799977725, + "grad_norm": 0.6682488918304443, + "learning_rate": 4.990030348373409e-05, + "loss": 0.1133, + "num_input_tokens_seen": 27452752, + "step": 22555 + }, + { + "epoch": 2.5125292348813897, + "grad_norm": 1.70272958278656, + "learning_rate": 4.9900086590239116e-05, + "loss": 0.0956, + "num_input_tokens_seen": 27458864, + "step": 22560 + }, + { + "epoch": 2.5130860897650074, + "grad_norm": 0.752365231513977, + "learning_rate": 4.989986946154289e-05, + "loss": 0.0859, + "num_input_tokens_seen": 27465296, + "step": 22565 + }, + { + "epoch": 2.5136429446486246, + "grad_norm": 0.41471925377845764, + "learning_rate": 4.989965209764744e-05, + "loss": 0.1504, + "num_input_tokens_seen": 27471728, + "step": 22570 + }, + { + "epoch": 2.514199799532242, + "grad_norm": 0.6480679512023926, + "learning_rate": 4.989943449855482e-05, + "loss": 0.0753, + "num_input_tokens_seen": 27478224, + "step": 22575 + }, + { + "epoch": 2.5147566544158595, + "grad_norm": 1.0757830142974854, + "learning_rate": 4.98992166642671e-05, + "loss": 0.1679, + "num_input_tokens_seen": 27484400, + "step": 22580 + }, + { + "epoch": 2.5153135092994763, + "grad_norm": 0.07471882551908493, + "learning_rate": 4.989899859478633e-05, + "loss": 0.0449, + "num_input_tokens_seen": 27490448, + "step": 22585 + }, + { + "epoch": 2.515870364183094, + "grad_norm": 1.8542451858520508, + "learning_rate": 4.9898780290114574e-05, + "loss": 0.191, + "num_input_tokens_seen": 27496560, + "step": 22590 + }, + { + "epoch": 2.516427219066711, + "grad_norm": 0.6700359582901001, + "learning_rate": 4.989856175025388e-05, + "loss": 0.137, + "num_input_tokens_seen": 27502480, + "step": 22595 + }, + { + "epoch": 2.5169840739503284, + "grad_norm": 0.1076112911105156, + "learning_rate": 4.989834297520633e-05, + "loss": 0.2322, + "num_input_tokens_seen": 27508656, + "step": 22600 + }, + { + "epoch": 2.517540928833946, + "grad_norm": 0.6699445843696594, + "learning_rate": 4.9898123964973976e-05, + "loss": 0.1347, + "num_input_tokens_seen": 27514960, + "step": 22605 + }, + { + "epoch": 2.5180977837175633, + "grad_norm": 0.4114170968532562, + "learning_rate": 4.98979047195589e-05, + "loss": 0.0635, + "num_input_tokens_seen": 27521200, + "step": 22610 + }, + { + "epoch": 2.5186546386011806, + "grad_norm": 0.03179395943880081, + "learning_rate": 4.989768523896316e-05, + "loss": 0.0509, + "num_input_tokens_seen": 27527536, + "step": 22615 + }, + { + "epoch": 2.519211493484798, + "grad_norm": 0.006882747635245323, + "learning_rate": 4.989746552318884e-05, + "loss": 0.1426, + "num_input_tokens_seen": 27533840, + "step": 22620 + }, + { + "epoch": 2.519768348368415, + "grad_norm": 0.922548234462738, + "learning_rate": 4.989724557223801e-05, + "loss": 0.0347, + "num_input_tokens_seen": 27540048, + "step": 22625 + }, + { + "epoch": 2.5203252032520327, + "grad_norm": 0.14616301655769348, + "learning_rate": 4.989702538611274e-05, + "loss": 0.0768, + "num_input_tokens_seen": 27546320, + "step": 22630 + }, + { + "epoch": 2.52088205813565, + "grad_norm": 0.7061462998390198, + "learning_rate": 4.9896804964815126e-05, + "loss": 0.0227, + "num_input_tokens_seen": 27552496, + "step": 22635 + }, + { + "epoch": 2.521438913019267, + "grad_norm": 0.009876895695924759, + "learning_rate": 4.9896584308347236e-05, + "loss": 0.146, + "num_input_tokens_seen": 27558800, + "step": 22640 + }, + { + "epoch": 2.5219957679028844, + "grad_norm": 0.03000621497631073, + "learning_rate": 4.9896363416711165e-05, + "loss": 0.0245, + "num_input_tokens_seen": 27564944, + "step": 22645 + }, + { + "epoch": 2.5225526227865016, + "grad_norm": 0.37150436639785767, + "learning_rate": 4.9896142289909e-05, + "loss": 0.0614, + "num_input_tokens_seen": 27571024, + "step": 22650 + }, + { + "epoch": 2.5231094776701193, + "grad_norm": 0.10346072167158127, + "learning_rate": 4.989592092794282e-05, + "loss": 0.2122, + "num_input_tokens_seen": 27577008, + "step": 22655 + }, + { + "epoch": 2.5236663325537365, + "grad_norm": 0.8188828825950623, + "learning_rate": 4.9895699330814716e-05, + "loss": 0.1397, + "num_input_tokens_seen": 27583088, + "step": 22660 + }, + { + "epoch": 2.5242231874373537, + "grad_norm": 0.08742091804742813, + "learning_rate": 4.9895477498526785e-05, + "loss": 0.0885, + "num_input_tokens_seen": 27589264, + "step": 22665 + }, + { + "epoch": 2.5247800423209714, + "grad_norm": 0.1816837042570114, + "learning_rate": 4.9895255431081135e-05, + "loss": 0.0695, + "num_input_tokens_seen": 27595312, + "step": 22670 + }, + { + "epoch": 2.5253368972045886, + "grad_norm": 0.2246709018945694, + "learning_rate": 4.989503312847984e-05, + "loss": 0.1002, + "num_input_tokens_seen": 27600976, + "step": 22675 + }, + { + "epoch": 2.525893752088206, + "grad_norm": 0.6841935515403748, + "learning_rate": 4.9894810590725015e-05, + "loss": 0.114, + "num_input_tokens_seen": 27607504, + "step": 22680 + }, + { + "epoch": 2.526450606971823, + "grad_norm": 0.6248807311058044, + "learning_rate": 4.989458781781876e-05, + "loss": 0.1727, + "num_input_tokens_seen": 27613616, + "step": 22685 + }, + { + "epoch": 2.5270074618554403, + "grad_norm": 0.7263498306274414, + "learning_rate": 4.989436480976318e-05, + "loss": 0.059, + "num_input_tokens_seen": 27619984, + "step": 22690 + }, + { + "epoch": 2.527564316739058, + "grad_norm": 0.4878164529800415, + "learning_rate": 4.9894141566560375e-05, + "loss": 0.0781, + "num_input_tokens_seen": 27625808, + "step": 22695 + }, + { + "epoch": 2.5281211716226752, + "grad_norm": 0.0008241515606641769, + "learning_rate": 4.989391808821247e-05, + "loss": 0.0868, + "num_input_tokens_seen": 27631696, + "step": 22700 + }, + { + "epoch": 2.5286780265062925, + "grad_norm": 0.14862965047359467, + "learning_rate": 4.9893694374721545e-05, + "loss": 0.0194, + "num_input_tokens_seen": 27637552, + "step": 22705 + }, + { + "epoch": 2.5292348813899097, + "grad_norm": 1.430484652519226, + "learning_rate": 4.9893470426089737e-05, + "loss": 0.3154, + "num_input_tokens_seen": 27643664, + "step": 22710 + }, + { + "epoch": 2.529791736273527, + "grad_norm": 0.27607837319374084, + "learning_rate": 4.989324624231916e-05, + "loss": 0.0802, + "num_input_tokens_seen": 27649616, + "step": 22715 + }, + { + "epoch": 2.5303485911571446, + "grad_norm": 0.5873315930366516, + "learning_rate": 4.989302182341193e-05, + "loss": 0.214, + "num_input_tokens_seen": 27655632, + "step": 22720 + }, + { + "epoch": 2.530905446040762, + "grad_norm": 0.3370307683944702, + "learning_rate": 4.989279716937016e-05, + "loss": 0.1076, + "num_input_tokens_seen": 27661744, + "step": 22725 + }, + { + "epoch": 2.531462300924379, + "grad_norm": 0.5707840919494629, + "learning_rate": 4.9892572280195986e-05, + "loss": 0.2492, + "num_input_tokens_seen": 27667952, + "step": 22730 + }, + { + "epoch": 2.5320191558079963, + "grad_norm": 0.8942787647247314, + "learning_rate": 4.989234715589152e-05, + "loss": 0.1324, + "num_input_tokens_seen": 27674000, + "step": 22735 + }, + { + "epoch": 2.5325760106916135, + "grad_norm": 1.0056827068328857, + "learning_rate": 4.989212179645889e-05, + "loss": 0.0515, + "num_input_tokens_seen": 27680240, + "step": 22740 + }, + { + "epoch": 2.533132865575231, + "grad_norm": 0.7384499311447144, + "learning_rate": 4.989189620190022e-05, + "loss": 0.0734, + "num_input_tokens_seen": 27686288, + "step": 22745 + }, + { + "epoch": 2.5336897204588484, + "grad_norm": 0.7829557657241821, + "learning_rate": 4.989167037221766e-05, + "loss": 0.1218, + "num_input_tokens_seen": 27692688, + "step": 22750 + }, + { + "epoch": 2.5342465753424657, + "grad_norm": 0.009633481502532959, + "learning_rate": 4.989144430741332e-05, + "loss": 0.0178, + "num_input_tokens_seen": 27698800, + "step": 22755 + }, + { + "epoch": 2.5348034302260833, + "grad_norm": 0.058833952993154526, + "learning_rate": 4.989121800748935e-05, + "loss": 0.0137, + "num_input_tokens_seen": 27705168, + "step": 22760 + }, + { + "epoch": 2.5353602851097006, + "grad_norm": 0.16945543885231018, + "learning_rate": 4.9890991472447876e-05, + "loss": 0.0292, + "num_input_tokens_seen": 27711280, + "step": 22765 + }, + { + "epoch": 2.535917139993318, + "grad_norm": 0.08749298006296158, + "learning_rate": 4.989076470229106e-05, + "loss": 0.0144, + "num_input_tokens_seen": 27717360, + "step": 22770 + }, + { + "epoch": 2.536473994876935, + "grad_norm": 0.4996459186077118, + "learning_rate": 4.9890537697021014e-05, + "loss": 0.0764, + "num_input_tokens_seen": 27723664, + "step": 22775 + }, + { + "epoch": 2.5370308497605523, + "grad_norm": 0.6288267970085144, + "learning_rate": 4.9890310456639914e-05, + "loss": 0.1024, + "num_input_tokens_seen": 27729520, + "step": 22780 + }, + { + "epoch": 2.53758770464417, + "grad_norm": 1.3137836456298828, + "learning_rate": 4.989008298114988e-05, + "loss": 0.1685, + "num_input_tokens_seen": 27735760, + "step": 22785 + }, + { + "epoch": 2.538144559527787, + "grad_norm": 1.1068203449249268, + "learning_rate": 4.9889855270553066e-05, + "loss": 0.1015, + "num_input_tokens_seen": 27742160, + "step": 22790 + }, + { + "epoch": 2.5387014144114044, + "grad_norm": 0.05979050323367119, + "learning_rate": 4.988962732485163e-05, + "loss": 0.1145, + "num_input_tokens_seen": 27748208, + "step": 22795 + }, + { + "epoch": 2.5392582692950216, + "grad_norm": 0.524316132068634, + "learning_rate": 4.9889399144047725e-05, + "loss": 0.0994, + "num_input_tokens_seen": 27754672, + "step": 22800 + }, + { + "epoch": 2.539815124178639, + "grad_norm": 1.5792232751846313, + "learning_rate": 4.9889170728143506e-05, + "loss": 0.0905, + "num_input_tokens_seen": 27760816, + "step": 22805 + }, + { + "epoch": 2.5403719790622565, + "grad_norm": 0.4076269268989563, + "learning_rate": 4.9888942077141124e-05, + "loss": 0.1025, + "num_input_tokens_seen": 27766320, + "step": 22810 + }, + { + "epoch": 2.5409288339458738, + "grad_norm": 1.2144851684570312, + "learning_rate": 4.988871319104275e-05, + "loss": 0.1119, + "num_input_tokens_seen": 27772272, + "step": 22815 + }, + { + "epoch": 2.541485688829491, + "grad_norm": 0.5795597434043884, + "learning_rate": 4.9888484069850536e-05, + "loss": 0.0202, + "num_input_tokens_seen": 27778608, + "step": 22820 + }, + { + "epoch": 2.5420425437131082, + "grad_norm": 0.1969538927078247, + "learning_rate": 4.988825471356665e-05, + "loss": 0.1295, + "num_input_tokens_seen": 27784752, + "step": 22825 + }, + { + "epoch": 2.5425993985967255, + "grad_norm": 0.49638301134109497, + "learning_rate": 4.988802512219325e-05, + "loss": 0.0446, + "num_input_tokens_seen": 27790480, + "step": 22830 + }, + { + "epoch": 2.543156253480343, + "grad_norm": 0.3224252462387085, + "learning_rate": 4.988779529573253e-05, + "loss": 0.0622, + "num_input_tokens_seen": 27796592, + "step": 22835 + }, + { + "epoch": 2.5437131083639604, + "grad_norm": 0.8547857403755188, + "learning_rate": 4.988756523418663e-05, + "loss": 0.06, + "num_input_tokens_seen": 27802768, + "step": 22840 + }, + { + "epoch": 2.5442699632475776, + "grad_norm": 0.3185785412788391, + "learning_rate": 4.988733493755774e-05, + "loss": 0.1419, + "num_input_tokens_seen": 27808560, + "step": 22845 + }, + { + "epoch": 2.5448268181311953, + "grad_norm": 2.4530227184295654, + "learning_rate": 4.9887104405848034e-05, + "loss": 0.1387, + "num_input_tokens_seen": 27814704, + "step": 22850 + }, + { + "epoch": 2.5453836730148125, + "grad_norm": 1.7808382511138916, + "learning_rate": 4.9886873639059685e-05, + "loss": 0.1723, + "num_input_tokens_seen": 27820848, + "step": 22855 + }, + { + "epoch": 2.5459405278984297, + "grad_norm": 1.2949544191360474, + "learning_rate": 4.988664263719488e-05, + "loss": 0.1797, + "num_input_tokens_seen": 27825840, + "step": 22860 + }, + { + "epoch": 2.546497382782047, + "grad_norm": 0.991986870765686, + "learning_rate": 4.98864114002558e-05, + "loss": 0.1966, + "num_input_tokens_seen": 27831856, + "step": 22865 + }, + { + "epoch": 2.547054237665664, + "grad_norm": 0.05468921735882759, + "learning_rate": 4.9886179928244616e-05, + "loss": 0.1638, + "num_input_tokens_seen": 27838160, + "step": 22870 + }, + { + "epoch": 2.547611092549282, + "grad_norm": 1.2452077865600586, + "learning_rate": 4.988594822116352e-05, + "loss": 0.1786, + "num_input_tokens_seen": 27844304, + "step": 22875 + }, + { + "epoch": 2.548167947432899, + "grad_norm": 0.570922315120697, + "learning_rate": 4.988571627901472e-05, + "loss": 0.1016, + "num_input_tokens_seen": 27850512, + "step": 22880 + }, + { + "epoch": 2.5487248023165163, + "grad_norm": 0.17857782542705536, + "learning_rate": 4.9885484101800375e-05, + "loss": 0.1351, + "num_input_tokens_seen": 27856688, + "step": 22885 + }, + { + "epoch": 2.5492816572001336, + "grad_norm": 0.021676411852240562, + "learning_rate": 4.9885251689522706e-05, + "loss": 0.0462, + "num_input_tokens_seen": 27863248, + "step": 22890 + }, + { + "epoch": 2.549838512083751, + "grad_norm": 0.3848669230937958, + "learning_rate": 4.9885019042183894e-05, + "loss": 0.0727, + "num_input_tokens_seen": 27869072, + "step": 22895 + }, + { + "epoch": 2.5503953669673685, + "grad_norm": 0.5227898359298706, + "learning_rate": 4.988478615978614e-05, + "loss": 0.1256, + "num_input_tokens_seen": 27875568, + "step": 22900 + }, + { + "epoch": 2.5509522218509857, + "grad_norm": 0.019627505913376808, + "learning_rate": 4.988455304233164e-05, + "loss": 0.0261, + "num_input_tokens_seen": 27881680, + "step": 22905 + }, + { + "epoch": 2.551509076734603, + "grad_norm": 1.1219955682754517, + "learning_rate": 4.988431968982261e-05, + "loss": 0.1482, + "num_input_tokens_seen": 27887728, + "step": 22910 + }, + { + "epoch": 2.55206593161822, + "grad_norm": 0.25934574007987976, + "learning_rate": 4.988408610226123e-05, + "loss": 0.1809, + "num_input_tokens_seen": 27894160, + "step": 22915 + }, + { + "epoch": 2.5526227865018374, + "grad_norm": 0.2700725495815277, + "learning_rate": 4.988385227964973e-05, + "loss": 0.2216, + "num_input_tokens_seen": 27900176, + "step": 22920 + }, + { + "epoch": 2.553179641385455, + "grad_norm": 0.6772702932357788, + "learning_rate": 4.98836182219903e-05, + "loss": 0.142, + "num_input_tokens_seen": 27905904, + "step": 22925 + }, + { + "epoch": 2.5537364962690723, + "grad_norm": 0.8264525532722473, + "learning_rate": 4.9883383929285163e-05, + "loss": 0.0966, + "num_input_tokens_seen": 27911824, + "step": 22930 + }, + { + "epoch": 2.5542933511526895, + "grad_norm": 1.6795246601104736, + "learning_rate": 4.9883149401536535e-05, + "loss": 0.126, + "num_input_tokens_seen": 27917744, + "step": 22935 + }, + { + "epoch": 2.554850206036307, + "grad_norm": 0.40766188502311707, + "learning_rate": 4.988291463874662e-05, + "loss": 0.0983, + "num_input_tokens_seen": 27923760, + "step": 22940 + }, + { + "epoch": 2.5554070609199244, + "grad_norm": 0.6503118276596069, + "learning_rate": 4.988267964091764e-05, + "loss": 0.185, + "num_input_tokens_seen": 27930032, + "step": 22945 + }, + { + "epoch": 2.5559639158035417, + "grad_norm": 1.4437371492385864, + "learning_rate": 4.988244440805181e-05, + "loss": 0.111, + "num_input_tokens_seen": 27936240, + "step": 22950 + }, + { + "epoch": 2.556520770687159, + "grad_norm": 0.6382843852043152, + "learning_rate": 4.988220894015136e-05, + "loss": 0.0556, + "num_input_tokens_seen": 27942160, + "step": 22955 + }, + { + "epoch": 2.557077625570776, + "grad_norm": 0.16521325707435608, + "learning_rate": 4.9881973237218516e-05, + "loss": 0.0847, + "num_input_tokens_seen": 27948016, + "step": 22960 + }, + { + "epoch": 2.557634480454394, + "grad_norm": 0.74191814661026, + "learning_rate": 4.98817372992555e-05, + "loss": 0.2061, + "num_input_tokens_seen": 27954224, + "step": 22965 + }, + { + "epoch": 2.558191335338011, + "grad_norm": 0.14154894649982452, + "learning_rate": 4.988150112626454e-05, + "loss": 0.142, + "num_input_tokens_seen": 27960496, + "step": 22970 + }, + { + "epoch": 2.5587481902216282, + "grad_norm": 1.1113694906234741, + "learning_rate": 4.9881264718247864e-05, + "loss": 0.0581, + "num_input_tokens_seen": 27966928, + "step": 22975 + }, + { + "epoch": 2.5593050451052455, + "grad_norm": 0.9094530940055847, + "learning_rate": 4.9881028075207705e-05, + "loss": 0.0833, + "num_input_tokens_seen": 27972880, + "step": 22980 + }, + { + "epoch": 2.5598618999888627, + "grad_norm": 0.5009612441062927, + "learning_rate": 4.98807911971463e-05, + "loss": 0.2085, + "num_input_tokens_seen": 27979184, + "step": 22985 + }, + { + "epoch": 2.5604187548724804, + "grad_norm": 0.24707704782485962, + "learning_rate": 4.98805540840659e-05, + "loss": 0.1568, + "num_input_tokens_seen": 27985040, + "step": 22990 + }, + { + "epoch": 2.5609756097560976, + "grad_norm": 0.5504627227783203, + "learning_rate": 4.988031673596872e-05, + "loss": 0.0398, + "num_input_tokens_seen": 27991024, + "step": 22995 + }, + { + "epoch": 2.561532464639715, + "grad_norm": 0.035515204071998596, + "learning_rate": 4.988007915285703e-05, + "loss": 0.1756, + "num_input_tokens_seen": 27996592, + "step": 23000 + }, + { + "epoch": 2.562089319523332, + "grad_norm": 0.22746334969997406, + "learning_rate": 4.9879841334733043e-05, + "loss": 0.1744, + "num_input_tokens_seen": 28002672, + "step": 23005 + }, + { + "epoch": 2.5626461744069493, + "grad_norm": 1.4160804748535156, + "learning_rate": 4.987960328159903e-05, + "loss": 0.1149, + "num_input_tokens_seen": 28008400, + "step": 23010 + }, + { + "epoch": 2.563203029290567, + "grad_norm": 0.004725661128759384, + "learning_rate": 4.987936499345723e-05, + "loss": 0.1044, + "num_input_tokens_seen": 28014384, + "step": 23015 + }, + { + "epoch": 2.563759884174184, + "grad_norm": 0.05282289907336235, + "learning_rate": 4.9879126470309887e-05, + "loss": 0.0716, + "num_input_tokens_seen": 28020560, + "step": 23020 + }, + { + "epoch": 2.5643167390578014, + "grad_norm": 1.721479058265686, + "learning_rate": 4.987888771215927e-05, + "loss": 0.2088, + "num_input_tokens_seen": 28026384, + "step": 23025 + }, + { + "epoch": 2.564873593941419, + "grad_norm": 0.7150125503540039, + "learning_rate": 4.987864871900763e-05, + "loss": 0.1363, + "num_input_tokens_seen": 28032784, + "step": 23030 + }, + { + "epoch": 2.5654304488250363, + "grad_norm": 0.05128251761198044, + "learning_rate": 4.987840949085722e-05, + "loss": 0.1346, + "num_input_tokens_seen": 28039248, + "step": 23035 + }, + { + "epoch": 2.5659873037086536, + "grad_norm": 1.0240402221679688, + "learning_rate": 4.987817002771029e-05, + "loss": 0.0506, + "num_input_tokens_seen": 28045360, + "step": 23040 + }, + { + "epoch": 2.566544158592271, + "grad_norm": 0.10180971771478653, + "learning_rate": 4.987793032956911e-05, + "loss": 0.1082, + "num_input_tokens_seen": 28050992, + "step": 23045 + }, + { + "epoch": 2.567101013475888, + "grad_norm": 0.06949301809072495, + "learning_rate": 4.9877690396435954e-05, + "loss": 0.14, + "num_input_tokens_seen": 28057040, + "step": 23050 + }, + { + "epoch": 2.5676578683595057, + "grad_norm": 0.41392284631729126, + "learning_rate": 4.9877450228313084e-05, + "loss": 0.1317, + "num_input_tokens_seen": 28063120, + "step": 23055 + }, + { + "epoch": 2.568214723243123, + "grad_norm": 0.14263932406902313, + "learning_rate": 4.9877209825202755e-05, + "loss": 0.0286, + "num_input_tokens_seen": 28069392, + "step": 23060 + }, + { + "epoch": 2.56877157812674, + "grad_norm": 0.3804110586643219, + "learning_rate": 4.987696918710725e-05, + "loss": 0.0687, + "num_input_tokens_seen": 28075728, + "step": 23065 + }, + { + "epoch": 2.5693284330103574, + "grad_norm": 0.3073863685131073, + "learning_rate": 4.9876728314028845e-05, + "loss": 0.0543, + "num_input_tokens_seen": 28081456, + "step": 23070 + }, + { + "epoch": 2.5698852878939746, + "grad_norm": 0.19710403680801392, + "learning_rate": 4.987648720596981e-05, + "loss": 0.1349, + "num_input_tokens_seen": 28087472, + "step": 23075 + }, + { + "epoch": 2.5704421427775923, + "grad_norm": 0.3099794387817383, + "learning_rate": 4.987624586293242e-05, + "loss": 0.073, + "num_input_tokens_seen": 28093552, + "step": 23080 + }, + { + "epoch": 2.5709989976612095, + "grad_norm": 0.3366909921169281, + "learning_rate": 4.987600428491895e-05, + "loss": 0.033, + "num_input_tokens_seen": 28099792, + "step": 23085 + }, + { + "epoch": 2.5715558525448268, + "grad_norm": 0.6223500967025757, + "learning_rate": 4.987576247193171e-05, + "loss": 0.0837, + "num_input_tokens_seen": 28105808, + "step": 23090 + }, + { + "epoch": 2.572112707428444, + "grad_norm": 2.3642656803131104, + "learning_rate": 4.9875520423972945e-05, + "loss": 0.2837, + "num_input_tokens_seen": 28111792, + "step": 23095 + }, + { + "epoch": 2.5726695623120612, + "grad_norm": 0.5750578045845032, + "learning_rate": 4.9875278141044965e-05, + "loss": 0.0687, + "num_input_tokens_seen": 28118000, + "step": 23100 + }, + { + "epoch": 2.573226417195679, + "grad_norm": 1.8906439542770386, + "learning_rate": 4.987503562315006e-05, + "loss": 0.121, + "num_input_tokens_seen": 28124016, + "step": 23105 + }, + { + "epoch": 2.573783272079296, + "grad_norm": 0.2665184736251831, + "learning_rate": 4.98747928702905e-05, + "loss": 0.1169, + "num_input_tokens_seen": 28130384, + "step": 23110 + }, + { + "epoch": 2.5743401269629134, + "grad_norm": 0.8907784819602966, + "learning_rate": 4.9874549882468603e-05, + "loss": 0.0981, + "num_input_tokens_seen": 28136400, + "step": 23115 + }, + { + "epoch": 2.574896981846531, + "grad_norm": 1.9943703413009644, + "learning_rate": 4.987430665968665e-05, + "loss": 0.1702, + "num_input_tokens_seen": 28142416, + "step": 23120 + }, + { + "epoch": 2.5754538367301483, + "grad_norm": 0.8318965435028076, + "learning_rate": 4.987406320194694e-05, + "loss": 0.0976, + "num_input_tokens_seen": 28148432, + "step": 23125 + }, + { + "epoch": 2.5760106916137655, + "grad_norm": 0.3908247947692871, + "learning_rate": 4.9873819509251775e-05, + "loss": 0.0743, + "num_input_tokens_seen": 28154640, + "step": 23130 + }, + { + "epoch": 2.5765675464973827, + "grad_norm": 1.5571540594100952, + "learning_rate": 4.987357558160345e-05, + "loss": 0.1158, + "num_input_tokens_seen": 28160912, + "step": 23135 + }, + { + "epoch": 2.577124401381, + "grad_norm": 0.061741527169942856, + "learning_rate": 4.987333141900429e-05, + "loss": 0.0351, + "num_input_tokens_seen": 28167248, + "step": 23140 + }, + { + "epoch": 2.5776812562646176, + "grad_norm": 0.30260762572288513, + "learning_rate": 4.987308702145658e-05, + "loss": 0.0671, + "num_input_tokens_seen": 28173584, + "step": 23145 + }, + { + "epoch": 2.578238111148235, + "grad_norm": 0.9430816173553467, + "learning_rate": 4.987284238896263e-05, + "loss": 0.0678, + "num_input_tokens_seen": 28179728, + "step": 23150 + }, + { + "epoch": 2.578794966031852, + "grad_norm": 1.0098124742507935, + "learning_rate": 4.987259752152476e-05, + "loss": 0.1317, + "num_input_tokens_seen": 28185776, + "step": 23155 + }, + { + "epoch": 2.5793518209154693, + "grad_norm": 0.7970178723335266, + "learning_rate": 4.987235241914527e-05, + "loss": 0.2434, + "num_input_tokens_seen": 28191888, + "step": 23160 + }, + { + "epoch": 2.5799086757990866, + "grad_norm": 1.087020993232727, + "learning_rate": 4.9872107081826505e-05, + "loss": 0.1078, + "num_input_tokens_seen": 28198096, + "step": 23165 + }, + { + "epoch": 2.5804655306827042, + "grad_norm": 0.40514302253723145, + "learning_rate": 4.9871861509570745e-05, + "loss": 0.1403, + "num_input_tokens_seen": 28204240, + "step": 23170 + }, + { + "epoch": 2.5810223855663215, + "grad_norm": 0.2797664701938629, + "learning_rate": 4.9871615702380326e-05, + "loss": 0.0514, + "num_input_tokens_seen": 28210704, + "step": 23175 + }, + { + "epoch": 2.5815792404499387, + "grad_norm": 1.8727293014526367, + "learning_rate": 4.9871369660257575e-05, + "loss": 0.0896, + "num_input_tokens_seen": 28216784, + "step": 23180 + }, + { + "epoch": 2.582136095333556, + "grad_norm": 0.07638075202703476, + "learning_rate": 4.987112338320481e-05, + "loss": 0.0602, + "num_input_tokens_seen": 28223152, + "step": 23185 + }, + { + "epoch": 2.582692950217173, + "grad_norm": 0.5307329893112183, + "learning_rate": 4.987087687122436e-05, + "loss": 0.0681, + "num_input_tokens_seen": 28229264, + "step": 23190 + }, + { + "epoch": 2.583249805100791, + "grad_norm": 0.2692291736602783, + "learning_rate": 4.987063012431854e-05, + "loss": 0.1358, + "num_input_tokens_seen": 28235472, + "step": 23195 + }, + { + "epoch": 2.583806659984408, + "grad_norm": 1.6916552782058716, + "learning_rate": 4.987038314248971e-05, + "loss": 0.0769, + "num_input_tokens_seen": 28241648, + "step": 23200 + }, + { + "epoch": 2.5843635148680253, + "grad_norm": 0.4588363468647003, + "learning_rate": 4.987013592574018e-05, + "loss": 0.0685, + "num_input_tokens_seen": 28247984, + "step": 23205 + }, + { + "epoch": 2.584920369751643, + "grad_norm": 1.5085663795471191, + "learning_rate": 4.986988847407229e-05, + "loss": 0.1447, + "num_input_tokens_seen": 28253904, + "step": 23210 + }, + { + "epoch": 2.58547722463526, + "grad_norm": 0.2512945830821991, + "learning_rate": 4.986964078748837e-05, + "loss": 0.0616, + "num_input_tokens_seen": 28260016, + "step": 23215 + }, + { + "epoch": 2.5860340795188774, + "grad_norm": 0.02444656379520893, + "learning_rate": 4.986939286599077e-05, + "loss": 0.0782, + "num_input_tokens_seen": 28266384, + "step": 23220 + }, + { + "epoch": 2.5865909344024947, + "grad_norm": 0.7132163643836975, + "learning_rate": 4.986914470958184e-05, + "loss": 0.2585, + "num_input_tokens_seen": 28272592, + "step": 23225 + }, + { + "epoch": 2.587147789286112, + "grad_norm": 0.7378360033035278, + "learning_rate": 4.9868896318263904e-05, + "loss": 0.1564, + "num_input_tokens_seen": 28278896, + "step": 23230 + }, + { + "epoch": 2.5877046441697296, + "grad_norm": 0.17585612833499908, + "learning_rate": 4.9868647692039315e-05, + "loss": 0.0699, + "num_input_tokens_seen": 28285040, + "step": 23235 + }, + { + "epoch": 2.588261499053347, + "grad_norm": 2.4665720462799072, + "learning_rate": 4.9868398830910434e-05, + "loss": 0.0663, + "num_input_tokens_seen": 28291280, + "step": 23240 + }, + { + "epoch": 2.588818353936964, + "grad_norm": 1.0916023254394531, + "learning_rate": 4.98681497348796e-05, + "loss": 0.1376, + "num_input_tokens_seen": 28297360, + "step": 23245 + }, + { + "epoch": 2.5893752088205813, + "grad_norm": 1.1092292070388794, + "learning_rate": 4.9867900403949156e-05, + "loss": 0.0588, + "num_input_tokens_seen": 28303248, + "step": 23250 + }, + { + "epoch": 2.5899320637041985, + "grad_norm": 1.5333861112594604, + "learning_rate": 4.986765083812148e-05, + "loss": 0.3171, + "num_input_tokens_seen": 28309264, + "step": 23255 + }, + { + "epoch": 2.590488918587816, + "grad_norm": 1.4766809940338135, + "learning_rate": 4.986740103739892e-05, + "loss": 0.1031, + "num_input_tokens_seen": 28315504, + "step": 23260 + }, + { + "epoch": 2.5910457734714334, + "grad_norm": 1.7397574186325073, + "learning_rate": 4.9867151001783826e-05, + "loss": 0.1089, + "num_input_tokens_seen": 28321584, + "step": 23265 + }, + { + "epoch": 2.5916026283550506, + "grad_norm": 0.5052441358566284, + "learning_rate": 4.986690073127857e-05, + "loss": 0.1641, + "num_input_tokens_seen": 28327600, + "step": 23270 + }, + { + "epoch": 2.592159483238668, + "grad_norm": 0.13282406330108643, + "learning_rate": 4.986665022588551e-05, + "loss": 0.084, + "num_input_tokens_seen": 28333776, + "step": 23275 + }, + { + "epoch": 2.592716338122285, + "grad_norm": 0.8748583793640137, + "learning_rate": 4.986639948560702e-05, + "loss": 0.0939, + "num_input_tokens_seen": 28339952, + "step": 23280 + }, + { + "epoch": 2.5932731930059028, + "grad_norm": 0.047129105776548386, + "learning_rate": 4.986614851044547e-05, + "loss": 0.0147, + "num_input_tokens_seen": 28346224, + "step": 23285 + }, + { + "epoch": 2.59383004788952, + "grad_norm": 0.4805133044719696, + "learning_rate": 4.986589730040322e-05, + "loss": 0.0614, + "num_input_tokens_seen": 28352432, + "step": 23290 + }, + { + "epoch": 2.594386902773137, + "grad_norm": 1.0526238679885864, + "learning_rate": 4.9865645855482645e-05, + "loss": 0.0857, + "num_input_tokens_seen": 28358384, + "step": 23295 + }, + { + "epoch": 2.594943757656755, + "grad_norm": 1.0982228517532349, + "learning_rate": 4.986539417568613e-05, + "loss": 0.1147, + "num_input_tokens_seen": 28364208, + "step": 23300 + }, + { + "epoch": 2.595500612540372, + "grad_norm": 0.028148218989372253, + "learning_rate": 4.986514226101604e-05, + "loss": 0.05, + "num_input_tokens_seen": 28370544, + "step": 23305 + }, + { + "epoch": 2.5960574674239894, + "grad_norm": 2.510066270828247, + "learning_rate": 4.986489011147476e-05, + "loss": 0.3021, + "num_input_tokens_seen": 28376720, + "step": 23310 + }, + { + "epoch": 2.5966143223076066, + "grad_norm": 0.4382331967353821, + "learning_rate": 4.986463772706467e-05, + "loss": 0.0854, + "num_input_tokens_seen": 28382704, + "step": 23315 + }, + { + "epoch": 2.597171177191224, + "grad_norm": 0.00862811878323555, + "learning_rate": 4.986438510778815e-05, + "loss": 0.0174, + "num_input_tokens_seen": 28388816, + "step": 23320 + }, + { + "epoch": 2.5977280320748415, + "grad_norm": 0.2482384592294693, + "learning_rate": 4.98641322536476e-05, + "loss": 0.0243, + "num_input_tokens_seen": 28394896, + "step": 23325 + }, + { + "epoch": 2.5982848869584587, + "grad_norm": 1.1968116760253906, + "learning_rate": 4.98638791646454e-05, + "loss": 0.1781, + "num_input_tokens_seen": 28400304, + "step": 23330 + }, + { + "epoch": 2.598841741842076, + "grad_norm": 0.10644134134054184, + "learning_rate": 4.986362584078394e-05, + "loss": 0.1071, + "num_input_tokens_seen": 28406288, + "step": 23335 + }, + { + "epoch": 2.599398596725693, + "grad_norm": 0.0063653565011918545, + "learning_rate": 4.9863372282065615e-05, + "loss": 0.0702, + "num_input_tokens_seen": 28412496, + "step": 23340 + }, + { + "epoch": 2.5999554516093104, + "grad_norm": 0.6698089838027954, + "learning_rate": 4.986311848849281e-05, + "loss": 0.0416, + "num_input_tokens_seen": 28418448, + "step": 23345 + }, + { + "epoch": 2.600512306492928, + "grad_norm": 0.19770728051662445, + "learning_rate": 4.986286446006794e-05, + "loss": 0.1025, + "num_input_tokens_seen": 28424752, + "step": 23350 + }, + { + "epoch": 2.6010691613765453, + "grad_norm": 0.2280069887638092, + "learning_rate": 4.9862610196793394e-05, + "loss": 0.2084, + "num_input_tokens_seen": 28430896, + "step": 23355 + }, + { + "epoch": 2.6016260162601625, + "grad_norm": 1.9175491333007812, + "learning_rate": 4.986235569867157e-05, + "loss": 0.2905, + "num_input_tokens_seen": 28436528, + "step": 23360 + }, + { + "epoch": 2.6021828711437798, + "grad_norm": 1.076725959777832, + "learning_rate": 4.9862100965704884e-05, + "loss": 0.1219, + "num_input_tokens_seen": 28442544, + "step": 23365 + }, + { + "epoch": 2.602739726027397, + "grad_norm": 0.23492732644081116, + "learning_rate": 4.986184599789573e-05, + "loss": 0.046, + "num_input_tokens_seen": 28448688, + "step": 23370 + }, + { + "epoch": 2.6032965809110147, + "grad_norm": 0.2812250852584839, + "learning_rate": 4.986159079524653e-05, + "loss": 0.0988, + "num_input_tokens_seen": 28454544, + "step": 23375 + }, + { + "epoch": 2.603853435794632, + "grad_norm": 0.05945182964205742, + "learning_rate": 4.986133535775968e-05, + "loss": 0.2294, + "num_input_tokens_seen": 28460240, + "step": 23380 + }, + { + "epoch": 2.604410290678249, + "grad_norm": 0.7017935514450073, + "learning_rate": 4.986107968543759e-05, + "loss": 0.0677, + "num_input_tokens_seen": 28466512, + "step": 23385 + }, + { + "epoch": 2.604967145561867, + "grad_norm": 0.09046381711959839, + "learning_rate": 4.9860823778282696e-05, + "loss": 0.0425, + "num_input_tokens_seen": 28472592, + "step": 23390 + }, + { + "epoch": 2.605524000445484, + "grad_norm": 0.5433107018470764, + "learning_rate": 4.98605676362974e-05, + "loss": 0.0858, + "num_input_tokens_seen": 28478448, + "step": 23395 + }, + { + "epoch": 2.6060808553291013, + "grad_norm": 0.10447710752487183, + "learning_rate": 4.986031125948413e-05, + "loss": 0.0471, + "num_input_tokens_seen": 28484848, + "step": 23400 + }, + { + "epoch": 2.6066377102127185, + "grad_norm": 0.005570992361754179, + "learning_rate": 4.986005464784529e-05, + "loss": 0.0661, + "num_input_tokens_seen": 28491024, + "step": 23405 + }, + { + "epoch": 2.6071945650963357, + "grad_norm": 1.0551031827926636, + "learning_rate": 4.9859797801383325e-05, + "loss": 0.2202, + "num_input_tokens_seen": 28497360, + "step": 23410 + }, + { + "epoch": 2.6077514199799534, + "grad_norm": 0.6242750287055969, + "learning_rate": 4.985954072010065e-05, + "loss": 0.1052, + "num_input_tokens_seen": 28502960, + "step": 23415 + }, + { + "epoch": 2.6083082748635706, + "grad_norm": 0.7479208707809448, + "learning_rate": 4.98592834039997e-05, + "loss": 0.0344, + "num_input_tokens_seen": 28509360, + "step": 23420 + }, + { + "epoch": 2.608865129747188, + "grad_norm": 1.4573928117752075, + "learning_rate": 4.98590258530829e-05, + "loss": 0.1449, + "num_input_tokens_seen": 28515344, + "step": 23425 + }, + { + "epoch": 2.609421984630805, + "grad_norm": 1.0329055786132812, + "learning_rate": 4.985876806735268e-05, + "loss": 0.0742, + "num_input_tokens_seen": 28521360, + "step": 23430 + }, + { + "epoch": 2.6099788395144223, + "grad_norm": 0.025795044377446175, + "learning_rate": 4.985851004681148e-05, + "loss": 0.0946, + "num_input_tokens_seen": 28527440, + "step": 23435 + }, + { + "epoch": 2.61053569439804, + "grad_norm": 0.3100113570690155, + "learning_rate": 4.9858251791461734e-05, + "loss": 0.0946, + "num_input_tokens_seen": 28533360, + "step": 23440 + }, + { + "epoch": 2.6110925492816572, + "grad_norm": 1.7862322330474854, + "learning_rate": 4.9857993301305886e-05, + "loss": 0.0939, + "num_input_tokens_seen": 28539760, + "step": 23445 + }, + { + "epoch": 2.6116494041652745, + "grad_norm": 0.9794039726257324, + "learning_rate": 4.985773457634638e-05, + "loss": 0.0702, + "num_input_tokens_seen": 28545680, + "step": 23450 + }, + { + "epoch": 2.6122062590488917, + "grad_norm": 0.7197328805923462, + "learning_rate": 4.985747561658565e-05, + "loss": 0.1016, + "num_input_tokens_seen": 28551536, + "step": 23455 + }, + { + "epoch": 2.612763113932509, + "grad_norm": 0.625133216381073, + "learning_rate": 4.9857216422026154e-05, + "loss": 0.0806, + "num_input_tokens_seen": 28557424, + "step": 23460 + }, + { + "epoch": 2.6133199688161266, + "grad_norm": 0.2701059877872467, + "learning_rate": 4.985695699267032e-05, + "loss": 0.0566, + "num_input_tokens_seen": 28563568, + "step": 23465 + }, + { + "epoch": 2.613876823699744, + "grad_norm": 1.7984224557876587, + "learning_rate": 4.985669732852063e-05, + "loss": 0.1022, + "num_input_tokens_seen": 28569808, + "step": 23470 + }, + { + "epoch": 2.614433678583361, + "grad_norm": 1.1299771070480347, + "learning_rate": 4.985643742957951e-05, + "loss": 0.1151, + "num_input_tokens_seen": 28576240, + "step": 23475 + }, + { + "epoch": 2.6149905334669787, + "grad_norm": 0.2835704982280731, + "learning_rate": 4.9856177295849414e-05, + "loss": 0.0999, + "num_input_tokens_seen": 28582576, + "step": 23480 + }, + { + "epoch": 2.615547388350596, + "grad_norm": 0.17555749416351318, + "learning_rate": 4.9855916927332825e-05, + "loss": 0.0288, + "num_input_tokens_seen": 28588848, + "step": 23485 + }, + { + "epoch": 2.616104243234213, + "grad_norm": 0.06908442825078964, + "learning_rate": 4.9855656324032173e-05, + "loss": 0.1542, + "num_input_tokens_seen": 28594800, + "step": 23490 + }, + { + "epoch": 2.6166610981178304, + "grad_norm": 0.5665582418441772, + "learning_rate": 4.985539548594995e-05, + "loss": 0.1227, + "num_input_tokens_seen": 28600944, + "step": 23495 + }, + { + "epoch": 2.6172179530014477, + "grad_norm": 1.0125253200531006, + "learning_rate": 4.9855134413088586e-05, + "loss": 0.1001, + "num_input_tokens_seen": 28607152, + "step": 23500 + }, + { + "epoch": 2.6177748078850653, + "grad_norm": 0.69083172082901, + "learning_rate": 4.985487310545057e-05, + "loss": 0.1029, + "num_input_tokens_seen": 28613136, + "step": 23505 + }, + { + "epoch": 2.6183316627686826, + "grad_norm": 0.6764017939567566, + "learning_rate": 4.9854611563038364e-05, + "loss": 0.0978, + "num_input_tokens_seen": 28619024, + "step": 23510 + }, + { + "epoch": 2.6188885176523, + "grad_norm": 0.018967054784297943, + "learning_rate": 4.985434978585444e-05, + "loss": 0.0306, + "num_input_tokens_seen": 28625264, + "step": 23515 + }, + { + "epoch": 2.619445372535917, + "grad_norm": 1.6881158351898193, + "learning_rate": 4.985408777390127e-05, + "loss": 0.1203, + "num_input_tokens_seen": 28631472, + "step": 23520 + }, + { + "epoch": 2.6200022274195343, + "grad_norm": 0.7128158807754517, + "learning_rate": 4.985382552718133e-05, + "loss": 0.0784, + "num_input_tokens_seen": 28637648, + "step": 23525 + }, + { + "epoch": 2.620559082303152, + "grad_norm": 0.011210326105356216, + "learning_rate": 4.9853563045697094e-05, + "loss": 0.0654, + "num_input_tokens_seen": 28643600, + "step": 23530 + }, + { + "epoch": 2.621115937186769, + "grad_norm": 0.18042121827602386, + "learning_rate": 4.985330032945104e-05, + "loss": 0.1325, + "num_input_tokens_seen": 28649584, + "step": 23535 + }, + { + "epoch": 2.6216727920703864, + "grad_norm": 0.010008521378040314, + "learning_rate": 4.985303737844565e-05, + "loss": 0.0872, + "num_input_tokens_seen": 28655696, + "step": 23540 + }, + { + "epoch": 2.6222296469540036, + "grad_norm": 1.235983967781067, + "learning_rate": 4.9852774192683414e-05, + "loss": 0.2177, + "num_input_tokens_seen": 28661744, + "step": 23545 + }, + { + "epoch": 2.622786501837621, + "grad_norm": 0.0777941569685936, + "learning_rate": 4.9852510772166814e-05, + "loss": 0.064, + "num_input_tokens_seen": 28668144, + "step": 23550 + }, + { + "epoch": 2.6233433567212385, + "grad_norm": 0.26876434683799744, + "learning_rate": 4.985224711689833e-05, + "loss": 0.0604, + "num_input_tokens_seen": 28674160, + "step": 23555 + }, + { + "epoch": 2.6239002116048558, + "grad_norm": 0.4104275703430176, + "learning_rate": 4.9851983226880475e-05, + "loss": 0.0259, + "num_input_tokens_seen": 28680432, + "step": 23560 + }, + { + "epoch": 2.624457066488473, + "grad_norm": 0.4090050756931305, + "learning_rate": 4.985171910211572e-05, + "loss": 0.0297, + "num_input_tokens_seen": 28686896, + "step": 23565 + }, + { + "epoch": 2.6250139213720907, + "grad_norm": 0.602644681930542, + "learning_rate": 4.985145474260656e-05, + "loss": 0.1035, + "num_input_tokens_seen": 28693328, + "step": 23570 + }, + { + "epoch": 2.625570776255708, + "grad_norm": 1.4940690994262695, + "learning_rate": 4.985119014835552e-05, + "loss": 0.1751, + "num_input_tokens_seen": 28699440, + "step": 23575 + }, + { + "epoch": 2.626127631139325, + "grad_norm": 0.03879046067595482, + "learning_rate": 4.985092531936506e-05, + "loss": 0.0858, + "num_input_tokens_seen": 28705712, + "step": 23580 + }, + { + "epoch": 2.6266844860229424, + "grad_norm": 0.732416033744812, + "learning_rate": 4.9850660255637705e-05, + "loss": 0.1042, + "num_input_tokens_seen": 28711536, + "step": 23585 + }, + { + "epoch": 2.6272413409065596, + "grad_norm": 0.1427593231201172, + "learning_rate": 4.985039495717596e-05, + "loss": 0.1124, + "num_input_tokens_seen": 28717424, + "step": 23590 + }, + { + "epoch": 2.6277981957901773, + "grad_norm": 0.8979087471961975, + "learning_rate": 4.985012942398232e-05, + "loss": 0.1923, + "num_input_tokens_seen": 28723472, + "step": 23595 + }, + { + "epoch": 2.6283550506737945, + "grad_norm": 0.494891494512558, + "learning_rate": 4.984986365605929e-05, + "loss": 0.078, + "num_input_tokens_seen": 28729008, + "step": 23600 + }, + { + "epoch": 2.6289119055574117, + "grad_norm": 1.5697740316390991, + "learning_rate": 4.98495976534094e-05, + "loss": 0.1377, + "num_input_tokens_seen": 28734896, + "step": 23605 + }, + { + "epoch": 2.629468760441029, + "grad_norm": 0.1642770618200302, + "learning_rate": 4.984933141603514e-05, + "loss": 0.0681, + "num_input_tokens_seen": 28740976, + "step": 23610 + }, + { + "epoch": 2.630025615324646, + "grad_norm": 0.05090080946683884, + "learning_rate": 4.984906494393905e-05, + "loss": 0.0827, + "num_input_tokens_seen": 28746864, + "step": 23615 + }, + { + "epoch": 2.630582470208264, + "grad_norm": 1.1466716527938843, + "learning_rate": 4.9848798237123625e-05, + "loss": 0.09, + "num_input_tokens_seen": 28752944, + "step": 23620 + }, + { + "epoch": 2.631139325091881, + "grad_norm": 0.016422301530838013, + "learning_rate": 4.984853129559139e-05, + "loss": 0.1812, + "num_input_tokens_seen": 28759088, + "step": 23625 + }, + { + "epoch": 2.6316961799754983, + "grad_norm": 0.037412501871585846, + "learning_rate": 4.9848264119344865e-05, + "loss": 0.1122, + "num_input_tokens_seen": 28765296, + "step": 23630 + }, + { + "epoch": 2.6322530348591155, + "grad_norm": 0.16409796476364136, + "learning_rate": 4.984799670838659e-05, + "loss": 0.1621, + "num_input_tokens_seen": 28771120, + "step": 23635 + }, + { + "epoch": 2.632809889742733, + "grad_norm": 0.0815841555595398, + "learning_rate": 4.9847729062719076e-05, + "loss": 0.1667, + "num_input_tokens_seen": 28777392, + "step": 23640 + }, + { + "epoch": 2.6333667446263505, + "grad_norm": 0.0046185641549527645, + "learning_rate": 4.984746118234485e-05, + "loss": 0.0565, + "num_input_tokens_seen": 28783632, + "step": 23645 + }, + { + "epoch": 2.6339235995099677, + "grad_norm": 1.1312477588653564, + "learning_rate": 4.984719306726644e-05, + "loss": 0.0746, + "num_input_tokens_seen": 28789808, + "step": 23650 + }, + { + "epoch": 2.634480454393585, + "grad_norm": 0.016221841797232628, + "learning_rate": 4.9846924717486384e-05, + "loss": 0.0204, + "num_input_tokens_seen": 28796016, + "step": 23655 + }, + { + "epoch": 2.6350373092772026, + "grad_norm": 0.5456297397613525, + "learning_rate": 4.984665613300723e-05, + "loss": 0.03, + "num_input_tokens_seen": 28802320, + "step": 23660 + }, + { + "epoch": 2.63559416416082, + "grad_norm": 0.35462838411331177, + "learning_rate": 4.984638731383149e-05, + "loss": 0.0543, + "num_input_tokens_seen": 28808336, + "step": 23665 + }, + { + "epoch": 2.636151019044437, + "grad_norm": 0.3266569972038269, + "learning_rate": 4.9846118259961716e-05, + "loss": 0.1407, + "num_input_tokens_seen": 28814672, + "step": 23670 + }, + { + "epoch": 2.6367078739280543, + "grad_norm": 0.27598410844802856, + "learning_rate": 4.984584897140046e-05, + "loss": 0.0181, + "num_input_tokens_seen": 28820816, + "step": 23675 + }, + { + "epoch": 2.6372647288116715, + "grad_norm": 1.9885175228118896, + "learning_rate": 4.9845579448150243e-05, + "loss": 0.1825, + "num_input_tokens_seen": 28827184, + "step": 23680 + }, + { + "epoch": 2.637821583695289, + "grad_norm": 0.5231056809425354, + "learning_rate": 4.9845309690213626e-05, + "loss": 0.069, + "num_input_tokens_seen": 28833392, + "step": 23685 + }, + { + "epoch": 2.6383784385789064, + "grad_norm": 0.5110419988632202, + "learning_rate": 4.9845039697593155e-05, + "loss": 0.1285, + "num_input_tokens_seen": 28839376, + "step": 23690 + }, + { + "epoch": 2.6389352934625236, + "grad_norm": 0.2764446437358856, + "learning_rate": 4.984476947029138e-05, + "loss": 0.1534, + "num_input_tokens_seen": 28845712, + "step": 23695 + }, + { + "epoch": 2.639492148346141, + "grad_norm": 0.45284050703048706, + "learning_rate": 4.984449900831084e-05, + "loss": 0.0991, + "num_input_tokens_seen": 28851664, + "step": 23700 + }, + { + "epoch": 2.640049003229758, + "grad_norm": 0.06254582852125168, + "learning_rate": 4.984422831165411e-05, + "loss": 0.2833, + "num_input_tokens_seen": 28858128, + "step": 23705 + }, + { + "epoch": 2.640605858113376, + "grad_norm": 0.17894764244556427, + "learning_rate": 4.984395738032374e-05, + "loss": 0.0231, + "num_input_tokens_seen": 28864112, + "step": 23710 + }, + { + "epoch": 2.641162712996993, + "grad_norm": 0.010018392466008663, + "learning_rate": 4.984368621432228e-05, + "loss": 0.111, + "num_input_tokens_seen": 28870448, + "step": 23715 + }, + { + "epoch": 2.6417195678806102, + "grad_norm": 0.25152787566185, + "learning_rate": 4.984341481365231e-05, + "loss": 0.1534, + "num_input_tokens_seen": 28876880, + "step": 23720 + }, + { + "epoch": 2.642276422764228, + "grad_norm": 0.26392316818237305, + "learning_rate": 4.9843143178316375e-05, + "loss": 0.146, + "num_input_tokens_seen": 28883248, + "step": 23725 + }, + { + "epoch": 2.6428332776478447, + "grad_norm": 0.3437086343765259, + "learning_rate": 4.9842871308317056e-05, + "loss": 0.0479, + "num_input_tokens_seen": 28889680, + "step": 23730 + }, + { + "epoch": 2.6433901325314624, + "grad_norm": 0.09395751357078552, + "learning_rate": 4.9842599203656916e-05, + "loss": 0.0623, + "num_input_tokens_seen": 28895760, + "step": 23735 + }, + { + "epoch": 2.6439469874150796, + "grad_norm": 1.051164984703064, + "learning_rate": 4.9842326864338515e-05, + "loss": 0.0907, + "num_input_tokens_seen": 28901872, + "step": 23740 + }, + { + "epoch": 2.644503842298697, + "grad_norm": 0.004564544185996056, + "learning_rate": 4.9842054290364435e-05, + "loss": 0.0505, + "num_input_tokens_seen": 28908240, + "step": 23745 + }, + { + "epoch": 2.6450606971823145, + "grad_norm": 1.139481782913208, + "learning_rate": 4.984178148173725e-05, + "loss": 0.1464, + "num_input_tokens_seen": 28914448, + "step": 23750 + }, + { + "epoch": 2.6456175520659317, + "grad_norm": 0.1893082559108734, + "learning_rate": 4.984150843845953e-05, + "loss": 0.1494, + "num_input_tokens_seen": 28920240, + "step": 23755 + }, + { + "epoch": 2.646174406949549, + "grad_norm": 0.6734842658042908, + "learning_rate": 4.9841235160533874e-05, + "loss": 0.0534, + "num_input_tokens_seen": 28926352, + "step": 23760 + }, + { + "epoch": 2.646731261833166, + "grad_norm": 0.004374116193503141, + "learning_rate": 4.9840961647962836e-05, + "loss": 0.058, + "num_input_tokens_seen": 28932720, + "step": 23765 + }, + { + "epoch": 2.6472881167167834, + "grad_norm": 0.11299530416727066, + "learning_rate": 4.9840687900749015e-05, + "loss": 0.04, + "num_input_tokens_seen": 28938896, + "step": 23770 + }, + { + "epoch": 2.647844971600401, + "grad_norm": 0.9526160359382629, + "learning_rate": 4.9840413918895e-05, + "loss": 0.0891, + "num_input_tokens_seen": 28945264, + "step": 23775 + }, + { + "epoch": 2.6484018264840183, + "grad_norm": 1.3821897506713867, + "learning_rate": 4.984013970240338e-05, + "loss": 0.2614, + "num_input_tokens_seen": 28950896, + "step": 23780 + }, + { + "epoch": 2.6489586813676356, + "grad_norm": 2.4972522258758545, + "learning_rate": 4.983986525127672e-05, + "loss": 0.267, + "num_input_tokens_seen": 28956752, + "step": 23785 + }, + { + "epoch": 2.649515536251253, + "grad_norm": 1.1746834516525269, + "learning_rate": 4.9839590565517646e-05, + "loss": 0.082, + "num_input_tokens_seen": 28962640, + "step": 23790 + }, + { + "epoch": 2.65007239113487, + "grad_norm": 0.12577186524868011, + "learning_rate": 4.9839315645128736e-05, + "loss": 0.036, + "num_input_tokens_seen": 28968560, + "step": 23795 + }, + { + "epoch": 2.6506292460184877, + "grad_norm": 0.6700949668884277, + "learning_rate": 4.983904049011259e-05, + "loss": 0.0696, + "num_input_tokens_seen": 28974512, + "step": 23800 + }, + { + "epoch": 2.651186100902105, + "grad_norm": 0.001532660098746419, + "learning_rate": 4.9838765100471794e-05, + "loss": 0.1568, + "num_input_tokens_seen": 28980304, + "step": 23805 + }, + { + "epoch": 2.651742955785722, + "grad_norm": 0.3325542211532593, + "learning_rate": 4.9838489476208974e-05, + "loss": 0.0712, + "num_input_tokens_seen": 28986448, + "step": 23810 + }, + { + "epoch": 2.65229981066934, + "grad_norm": 0.17443935573101044, + "learning_rate": 4.9838213617326715e-05, + "loss": 0.1079, + "num_input_tokens_seen": 28992720, + "step": 23815 + }, + { + "epoch": 2.6528566655529566, + "grad_norm": 1.7381640672683716, + "learning_rate": 4.9837937523827625e-05, + "loss": 0.1749, + "num_input_tokens_seen": 28998864, + "step": 23820 + }, + { + "epoch": 2.6534135204365743, + "grad_norm": 1.6066210269927979, + "learning_rate": 4.983766119571433e-05, + "loss": 0.1529, + "num_input_tokens_seen": 29004944, + "step": 23825 + }, + { + "epoch": 2.6539703753201915, + "grad_norm": 0.6449276208877563, + "learning_rate": 4.983738463298941e-05, + "loss": 0.1236, + "num_input_tokens_seen": 29010544, + "step": 23830 + }, + { + "epoch": 2.6545272302038088, + "grad_norm": 0.5093927383422852, + "learning_rate": 4.9837107835655496e-05, + "loss": 0.0997, + "num_input_tokens_seen": 29016816, + "step": 23835 + }, + { + "epoch": 2.6550840850874264, + "grad_norm": 0.14046762883663177, + "learning_rate": 4.983683080371521e-05, + "loss": 0.0549, + "num_input_tokens_seen": 29023088, + "step": 23840 + }, + { + "epoch": 2.6556409399710437, + "grad_norm": 0.29822835326194763, + "learning_rate": 4.9836553537171146e-05, + "loss": 0.0495, + "num_input_tokens_seen": 29028816, + "step": 23845 + }, + { + "epoch": 2.656197794854661, + "grad_norm": 1.8376970291137695, + "learning_rate": 4.9836276036025934e-05, + "loss": 0.0857, + "num_input_tokens_seen": 29034832, + "step": 23850 + }, + { + "epoch": 2.656754649738278, + "grad_norm": 1.6977473497390747, + "learning_rate": 4.98359983002822e-05, + "loss": 0.182, + "num_input_tokens_seen": 29041168, + "step": 23855 + }, + { + "epoch": 2.6573115046218954, + "grad_norm": 0.04383415728807449, + "learning_rate": 4.983572032994257e-05, + "loss": 0.1247, + "num_input_tokens_seen": 29047152, + "step": 23860 + }, + { + "epoch": 2.657868359505513, + "grad_norm": 0.8928799629211426, + "learning_rate": 4.983544212500966e-05, + "loss": 0.0422, + "num_input_tokens_seen": 29053456, + "step": 23865 + }, + { + "epoch": 2.6584252143891303, + "grad_norm": 0.39244070649147034, + "learning_rate": 4.98351636854861e-05, + "loss": 0.0768, + "num_input_tokens_seen": 29059216, + "step": 23870 + }, + { + "epoch": 2.6589820692727475, + "grad_norm": 0.5701925158500671, + "learning_rate": 4.983488501137451e-05, + "loss": 0.1562, + "num_input_tokens_seen": 29065168, + "step": 23875 + }, + { + "epoch": 2.6595389241563647, + "grad_norm": 0.5873367190361023, + "learning_rate": 4.983460610267755e-05, + "loss": 0.0974, + "num_input_tokens_seen": 29071536, + "step": 23880 + }, + { + "epoch": 2.660095779039982, + "grad_norm": 0.33301645517349243, + "learning_rate": 4.9834326959397834e-05, + "loss": 0.044, + "num_input_tokens_seen": 29077712, + "step": 23885 + }, + { + "epoch": 2.6606526339235996, + "grad_norm": 0.14451326429843903, + "learning_rate": 4.9834047581538005e-05, + "loss": 0.1139, + "num_input_tokens_seen": 29083888, + "step": 23890 + }, + { + "epoch": 2.661209488807217, + "grad_norm": 0.13691748678684235, + "learning_rate": 4.9833767969100695e-05, + "loss": 0.1194, + "num_input_tokens_seen": 29090128, + "step": 23895 + }, + { + "epoch": 2.661766343690834, + "grad_norm": 0.8676692843437195, + "learning_rate": 4.983348812208855e-05, + "loss": 0.1295, + "num_input_tokens_seen": 29096304, + "step": 23900 + }, + { + "epoch": 2.6623231985744518, + "grad_norm": 0.059043776243925095, + "learning_rate": 4.983320804050421e-05, + "loss": 0.0729, + "num_input_tokens_seen": 29102512, + "step": 23905 + }, + { + "epoch": 2.6628800534580686, + "grad_norm": 0.42684048414230347, + "learning_rate": 4.983292772435033e-05, + "loss": 0.0574, + "num_input_tokens_seen": 29108496, + "step": 23910 + }, + { + "epoch": 2.6634369083416862, + "grad_norm": 0.09235090762376785, + "learning_rate": 4.983264717362955e-05, + "loss": 0.0196, + "num_input_tokens_seen": 29114864, + "step": 23915 + }, + { + "epoch": 2.6639937632253035, + "grad_norm": 0.7398672699928284, + "learning_rate": 4.983236638834453e-05, + "loss": 0.0693, + "num_input_tokens_seen": 29120784, + "step": 23920 + }, + { + "epoch": 2.6645506181089207, + "grad_norm": 0.03521677479147911, + "learning_rate": 4.98320853684979e-05, + "loss": 0.0535, + "num_input_tokens_seen": 29127056, + "step": 23925 + }, + { + "epoch": 2.6651074729925384, + "grad_norm": 0.2453521341085434, + "learning_rate": 4.983180411409234e-05, + "loss": 0.0812, + "num_input_tokens_seen": 29133072, + "step": 23930 + }, + { + "epoch": 2.6656643278761556, + "grad_norm": 0.36725476384162903, + "learning_rate": 4.983152262513049e-05, + "loss": 0.1172, + "num_input_tokens_seen": 29139536, + "step": 23935 + }, + { + "epoch": 2.666221182759773, + "grad_norm": 0.03562864288687706, + "learning_rate": 4.983124090161502e-05, + "loss": 0.0407, + "num_input_tokens_seen": 29146064, + "step": 23940 + }, + { + "epoch": 2.66677803764339, + "grad_norm": 0.9547216892242432, + "learning_rate": 4.983095894354858e-05, + "loss": 0.1078, + "num_input_tokens_seen": 29152368, + "step": 23945 + }, + { + "epoch": 2.6673348925270073, + "grad_norm": 0.6943970322608948, + "learning_rate": 4.983067675093384e-05, + "loss": 0.0692, + "num_input_tokens_seen": 29157232, + "step": 23950 + }, + { + "epoch": 2.667891747410625, + "grad_norm": 0.378300279378891, + "learning_rate": 4.983039432377345e-05, + "loss": 0.0665, + "num_input_tokens_seen": 29163312, + "step": 23955 + }, + { + "epoch": 2.668448602294242, + "grad_norm": 0.23669366538524628, + "learning_rate": 4.983011166207011e-05, + "loss": 0.0846, + "num_input_tokens_seen": 29169552, + "step": 23960 + }, + { + "epoch": 2.6690054571778594, + "grad_norm": 0.5067025423049927, + "learning_rate": 4.982982876582647e-05, + "loss": 0.1254, + "num_input_tokens_seen": 29175728, + "step": 23965 + }, + { + "epoch": 2.6695623120614766, + "grad_norm": 0.4755827486515045, + "learning_rate": 4.98295456350452e-05, + "loss": 0.2109, + "num_input_tokens_seen": 29181680, + "step": 23970 + }, + { + "epoch": 2.670119166945094, + "grad_norm": 0.03470516949892044, + "learning_rate": 4.9829262269728986e-05, + "loss": 0.1274, + "num_input_tokens_seen": 29187856, + "step": 23975 + }, + { + "epoch": 2.6706760218287116, + "grad_norm": 1.3240876197814941, + "learning_rate": 4.9828978669880485e-05, + "loss": 0.13, + "num_input_tokens_seen": 29193936, + "step": 23980 + }, + { + "epoch": 2.671232876712329, + "grad_norm": 0.2711569368839264, + "learning_rate": 4.9828694835502386e-05, + "loss": 0.0673, + "num_input_tokens_seen": 29200176, + "step": 23985 + }, + { + "epoch": 2.671789731595946, + "grad_norm": 0.0036017135716974735, + "learning_rate": 4.9828410766597384e-05, + "loss": 0.177, + "num_input_tokens_seen": 29206352, + "step": 23990 + }, + { + "epoch": 2.6723465864795637, + "grad_norm": 0.5993356108665466, + "learning_rate": 4.982812646316815e-05, + "loss": 0.1032, + "num_input_tokens_seen": 29212656, + "step": 23995 + }, + { + "epoch": 2.672903441363181, + "grad_norm": 1.1062692403793335, + "learning_rate": 4.982784192521736e-05, + "loss": 0.081, + "num_input_tokens_seen": 29218928, + "step": 24000 + }, + { + "epoch": 2.673460296246798, + "grad_norm": 0.136323943734169, + "learning_rate": 4.9827557152747714e-05, + "loss": 0.0836, + "num_input_tokens_seen": 29224752, + "step": 24005 + }, + { + "epoch": 2.6740171511304154, + "grad_norm": 0.5717238783836365, + "learning_rate": 4.98272721457619e-05, + "loss": 0.1469, + "num_input_tokens_seen": 29230800, + "step": 24010 + }, + { + "epoch": 2.6745740060140326, + "grad_norm": 0.3074701130390167, + "learning_rate": 4.9826986904262604e-05, + "loss": 0.0405, + "num_input_tokens_seen": 29236976, + "step": 24015 + }, + { + "epoch": 2.6751308608976503, + "grad_norm": 0.022987887263298035, + "learning_rate": 4.982670142825254e-05, + "loss": 0.1175, + "num_input_tokens_seen": 29243472, + "step": 24020 + }, + { + "epoch": 2.6756877157812675, + "grad_norm": 0.855227530002594, + "learning_rate": 4.982641571773437e-05, + "loss": 0.1183, + "num_input_tokens_seen": 29249072, + "step": 24025 + }, + { + "epoch": 2.6762445706648847, + "grad_norm": 0.012447788380086422, + "learning_rate": 4.9826129772710834e-05, + "loss": 0.1385, + "num_input_tokens_seen": 29254928, + "step": 24030 + }, + { + "epoch": 2.676801425548502, + "grad_norm": 0.4123072326183319, + "learning_rate": 4.9825843593184604e-05, + "loss": 0.0992, + "num_input_tokens_seen": 29260912, + "step": 24035 + }, + { + "epoch": 2.677358280432119, + "grad_norm": 0.6832284927368164, + "learning_rate": 4.982555717915839e-05, + "loss": 0.1432, + "num_input_tokens_seen": 29266768, + "step": 24040 + }, + { + "epoch": 2.677915135315737, + "grad_norm": 0.4287363886833191, + "learning_rate": 4.982527053063489e-05, + "loss": 0.0418, + "num_input_tokens_seen": 29273168, + "step": 24045 + }, + { + "epoch": 2.678471990199354, + "grad_norm": 0.8160415887832642, + "learning_rate": 4.982498364761683e-05, + "loss": 0.1149, + "num_input_tokens_seen": 29279024, + "step": 24050 + }, + { + "epoch": 2.6790288450829713, + "grad_norm": 1.4145545959472656, + "learning_rate": 4.982469653010691e-05, + "loss": 0.1609, + "num_input_tokens_seen": 29284976, + "step": 24055 + }, + { + "epoch": 2.6795856999665886, + "grad_norm": 1.4420599937438965, + "learning_rate": 4.982440917810784e-05, + "loss": 0.0892, + "num_input_tokens_seen": 29291280, + "step": 24060 + }, + { + "epoch": 2.680142554850206, + "grad_norm": 1.112210750579834, + "learning_rate": 4.982412159162234e-05, + "loss": 0.1389, + "num_input_tokens_seen": 29297200, + "step": 24065 + }, + { + "epoch": 2.6806994097338235, + "grad_norm": 1.0541818141937256, + "learning_rate": 4.982383377065312e-05, + "loss": 0.284, + "num_input_tokens_seen": 29302960, + "step": 24070 + }, + { + "epoch": 2.6812562646174407, + "grad_norm": 0.572910726070404, + "learning_rate": 4.98235457152029e-05, + "loss": 0.0487, + "num_input_tokens_seen": 29308784, + "step": 24075 + }, + { + "epoch": 2.681813119501058, + "grad_norm": 0.05326814204454422, + "learning_rate": 4.98232574252744e-05, + "loss": 0.0538, + "num_input_tokens_seen": 29314800, + "step": 24080 + }, + { + "epoch": 2.6823699743846756, + "grad_norm": 0.2420923113822937, + "learning_rate": 4.9822968900870354e-05, + "loss": 0.0562, + "num_input_tokens_seen": 29320976, + "step": 24085 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 1.4427343606948853, + "learning_rate": 4.982268014199347e-05, + "loss": 0.0797, + "num_input_tokens_seen": 29326992, + "step": 24090 + }, + { + "epoch": 2.68348368415191, + "grad_norm": 1.79766845703125, + "learning_rate": 4.982239114864649e-05, + "loss": 0.1683, + "num_input_tokens_seen": 29333040, + "step": 24095 + }, + { + "epoch": 2.6840405390355273, + "grad_norm": 0.6663380861282349, + "learning_rate": 4.982210192083214e-05, + "loss": 0.0486, + "num_input_tokens_seen": 29339184, + "step": 24100 + }, + { + "epoch": 2.6845973939191445, + "grad_norm": 0.8859108686447144, + "learning_rate": 4.982181245855314e-05, + "loss": 0.2315, + "num_input_tokens_seen": 29344784, + "step": 24105 + }, + { + "epoch": 2.685154248802762, + "grad_norm": 0.5713661909103394, + "learning_rate": 4.982152276181224e-05, + "loss": 0.0772, + "num_input_tokens_seen": 29350800, + "step": 24110 + }, + { + "epoch": 2.6857111036863794, + "grad_norm": 0.7112533450126648, + "learning_rate": 4.9821232830612174e-05, + "loss": 0.1967, + "num_input_tokens_seen": 29356784, + "step": 24115 + }, + { + "epoch": 2.6862679585699967, + "grad_norm": 0.055657971650362015, + "learning_rate": 4.9820942664955684e-05, + "loss": 0.044, + "num_input_tokens_seen": 29363152, + "step": 24120 + }, + { + "epoch": 2.686824813453614, + "grad_norm": 0.3067282438278198, + "learning_rate": 4.982065226484549e-05, + "loss": 0.1304, + "num_input_tokens_seen": 29369488, + "step": 24125 + }, + { + "epoch": 2.687381668337231, + "grad_norm": 0.3434737026691437, + "learning_rate": 4.982036163028436e-05, + "loss": 0.1051, + "num_input_tokens_seen": 29375312, + "step": 24130 + }, + { + "epoch": 2.687938523220849, + "grad_norm": 1.857350468635559, + "learning_rate": 4.982007076127502e-05, + "loss": 0.1141, + "num_input_tokens_seen": 29380752, + "step": 24135 + }, + { + "epoch": 2.688495378104466, + "grad_norm": 0.4161742329597473, + "learning_rate": 4.981977965782023e-05, + "loss": 0.0741, + "num_input_tokens_seen": 29387120, + "step": 24140 + }, + { + "epoch": 2.6890522329880833, + "grad_norm": 0.8411408066749573, + "learning_rate": 4.981948831992274e-05, + "loss": 0.0953, + "num_input_tokens_seen": 29393200, + "step": 24145 + }, + { + "epoch": 2.6896090878717005, + "grad_norm": 0.7563372850418091, + "learning_rate": 4.98191967475853e-05, + "loss": 0.1145, + "num_input_tokens_seen": 29398992, + "step": 24150 + }, + { + "epoch": 2.6901659427553177, + "grad_norm": 0.8636277914047241, + "learning_rate": 4.981890494081065e-05, + "loss": 0.0634, + "num_input_tokens_seen": 29405040, + "step": 24155 + }, + { + "epoch": 2.6907227976389354, + "grad_norm": 2.019897699356079, + "learning_rate": 4.981861289960156e-05, + "loss": 0.2172, + "num_input_tokens_seen": 29411152, + "step": 24160 + }, + { + "epoch": 2.6912796525225526, + "grad_norm": 0.7289682030677795, + "learning_rate": 4.981832062396079e-05, + "loss": 0.0999, + "num_input_tokens_seen": 29416880, + "step": 24165 + }, + { + "epoch": 2.69183650740617, + "grad_norm": 1.0754082202911377, + "learning_rate": 4.98180281138911e-05, + "loss": 0.115, + "num_input_tokens_seen": 29422832, + "step": 24170 + }, + { + "epoch": 2.6923933622897875, + "grad_norm": 0.4059898853302002, + "learning_rate": 4.981773536939525e-05, + "loss": 0.0821, + "num_input_tokens_seen": 29429328, + "step": 24175 + }, + { + "epoch": 2.6929502171734048, + "grad_norm": 0.7189980149269104, + "learning_rate": 4.9817442390476005e-05, + "loss": 0.0709, + "num_input_tokens_seen": 29435312, + "step": 24180 + }, + { + "epoch": 2.693507072057022, + "grad_norm": 1.69776451587677, + "learning_rate": 4.981714917713613e-05, + "loss": 0.2198, + "num_input_tokens_seen": 29441040, + "step": 24185 + }, + { + "epoch": 2.6940639269406392, + "grad_norm": 0.005746201612055302, + "learning_rate": 4.98168557293784e-05, + "loss": 0.055, + "num_input_tokens_seen": 29447120, + "step": 24190 + }, + { + "epoch": 2.6946207818242565, + "grad_norm": 0.052161432802677155, + "learning_rate": 4.981656204720559e-05, + "loss": 0.0126, + "num_input_tokens_seen": 29453296, + "step": 24195 + }, + { + "epoch": 2.695177636707874, + "grad_norm": 0.6369561553001404, + "learning_rate": 4.981626813062046e-05, + "loss": 0.0889, + "num_input_tokens_seen": 29459664, + "step": 24200 + }, + { + "epoch": 2.6957344915914914, + "grad_norm": 1.2063896656036377, + "learning_rate": 4.98159739796258e-05, + "loss": 0.1315, + "num_input_tokens_seen": 29465776, + "step": 24205 + }, + { + "epoch": 2.6962913464751086, + "grad_norm": 0.002989136381074786, + "learning_rate": 4.9815679594224384e-05, + "loss": 0.0592, + "num_input_tokens_seen": 29471920, + "step": 24210 + }, + { + "epoch": 2.696848201358726, + "grad_norm": 0.8592159152030945, + "learning_rate": 4.981538497441899e-05, + "loss": 0.123, + "num_input_tokens_seen": 29478128, + "step": 24215 + }, + { + "epoch": 2.697405056242343, + "grad_norm": 1.0801291465759277, + "learning_rate": 4.98150901202124e-05, + "loss": 0.1895, + "num_input_tokens_seen": 29484176, + "step": 24220 + }, + { + "epoch": 2.6979619111259607, + "grad_norm": 1.2614929676055908, + "learning_rate": 4.9814795031607416e-05, + "loss": 0.1295, + "num_input_tokens_seen": 29490352, + "step": 24225 + }, + { + "epoch": 2.698518766009578, + "grad_norm": 0.4330761134624481, + "learning_rate": 4.98144997086068e-05, + "loss": 0.1304, + "num_input_tokens_seen": 29496112, + "step": 24230 + }, + { + "epoch": 2.699075620893195, + "grad_norm": 0.26993921399116516, + "learning_rate": 4.981420415121336e-05, + "loss": 0.0617, + "num_input_tokens_seen": 29502192, + "step": 24235 + }, + { + "epoch": 2.6996324757768124, + "grad_norm": 1.1255053281784058, + "learning_rate": 4.9813908359429876e-05, + "loss": 0.1245, + "num_input_tokens_seen": 29508240, + "step": 24240 + }, + { + "epoch": 2.7001893306604297, + "grad_norm": 0.5455582141876221, + "learning_rate": 4.981361233325914e-05, + "loss": 0.1783, + "num_input_tokens_seen": 29513584, + "step": 24245 + }, + { + "epoch": 2.7007461855440473, + "grad_norm": 1.6906863451004028, + "learning_rate": 4.9813316072703965e-05, + "loss": 0.2035, + "num_input_tokens_seen": 29519632, + "step": 24250 + }, + { + "epoch": 2.7013030404276646, + "grad_norm": 0.24295152723789215, + "learning_rate": 4.981301957776714e-05, + "loss": 0.0174, + "num_input_tokens_seen": 29525808, + "step": 24255 + }, + { + "epoch": 2.701859895311282, + "grad_norm": 0.6599222421646118, + "learning_rate": 4.981272284845146e-05, + "loss": 0.0988, + "num_input_tokens_seen": 29532144, + "step": 24260 + }, + { + "epoch": 2.7024167501948995, + "grad_norm": 1.2214733362197876, + "learning_rate": 4.981242588475974e-05, + "loss": 0.1126, + "num_input_tokens_seen": 29538032, + "step": 24265 + }, + { + "epoch": 2.7029736050785167, + "grad_norm": 0.15741309523582458, + "learning_rate": 4.981212868669477e-05, + "loss": 0.0982, + "num_input_tokens_seen": 29544368, + "step": 24270 + }, + { + "epoch": 2.703530459962134, + "grad_norm": 0.06383676826953888, + "learning_rate": 4.981183125425937e-05, + "loss": 0.077, + "num_input_tokens_seen": 29550832, + "step": 24275 + }, + { + "epoch": 2.704087314845751, + "grad_norm": 0.2396208494901657, + "learning_rate": 4.9811533587456346e-05, + "loss": 0.0913, + "num_input_tokens_seen": 29556336, + "step": 24280 + }, + { + "epoch": 2.7046441697293684, + "grad_norm": 0.7219934463500977, + "learning_rate": 4.981123568628851e-05, + "loss": 0.0909, + "num_input_tokens_seen": 29562608, + "step": 24285 + }, + { + "epoch": 2.705201024612986, + "grad_norm": 0.9908111095428467, + "learning_rate": 4.981093755075866e-05, + "loss": 0.1777, + "num_input_tokens_seen": 29568368, + "step": 24290 + }, + { + "epoch": 2.7057578794966033, + "grad_norm": 1.6336287260055542, + "learning_rate": 4.981063918086964e-05, + "loss": 0.1224, + "num_input_tokens_seen": 29574704, + "step": 24295 + }, + { + "epoch": 2.7063147343802205, + "grad_norm": 1.5304073095321655, + "learning_rate": 4.9810340576624254e-05, + "loss": 0.1209, + "num_input_tokens_seen": 29580976, + "step": 24300 + }, + { + "epoch": 2.7068715892638378, + "grad_norm": 0.7731834650039673, + "learning_rate": 4.981004173802533e-05, + "loss": 0.0733, + "num_input_tokens_seen": 29587344, + "step": 24305 + }, + { + "epoch": 2.707428444147455, + "grad_norm": 0.055717118084430695, + "learning_rate": 4.980974266507567e-05, + "loss": 0.0907, + "num_input_tokens_seen": 29592816, + "step": 24310 + }, + { + "epoch": 2.7079852990310727, + "grad_norm": 0.6736350655555725, + "learning_rate": 4.980944335777812e-05, + "loss": 0.1124, + "num_input_tokens_seen": 29598832, + "step": 24315 + }, + { + "epoch": 2.70854215391469, + "grad_norm": 0.23411497473716736, + "learning_rate": 4.98091438161355e-05, + "loss": 0.0324, + "num_input_tokens_seen": 29605200, + "step": 24320 + }, + { + "epoch": 2.709099008798307, + "grad_norm": 1.799728512763977, + "learning_rate": 4.980884404015064e-05, + "loss": 0.1543, + "num_input_tokens_seen": 29611536, + "step": 24325 + }, + { + "epoch": 2.7096558636819243, + "grad_norm": 0.010771290399134159, + "learning_rate": 4.980854402982637e-05, + "loss": 0.0413, + "num_input_tokens_seen": 29617712, + "step": 24330 + }, + { + "epoch": 2.7102127185655416, + "grad_norm": 0.7418227791786194, + "learning_rate": 4.980824378516553e-05, + "loss": 0.0986, + "num_input_tokens_seen": 29623728, + "step": 24335 + }, + { + "epoch": 2.7107695734491593, + "grad_norm": 0.5319810509681702, + "learning_rate": 4.980794330617095e-05, + "loss": 0.0811, + "num_input_tokens_seen": 29629776, + "step": 24340 + }, + { + "epoch": 2.7113264283327765, + "grad_norm": 1.231873631477356, + "learning_rate": 4.9807642592845464e-05, + "loss": 0.1036, + "num_input_tokens_seen": 29635856, + "step": 24345 + }, + { + "epoch": 2.7118832832163937, + "grad_norm": 0.6167256236076355, + "learning_rate": 4.980734164519193e-05, + "loss": 0.0697, + "num_input_tokens_seen": 29641808, + "step": 24350 + }, + { + "epoch": 2.7124401381000114, + "grad_norm": 0.5069079399108887, + "learning_rate": 4.980704046321316e-05, + "loss": 0.051, + "num_input_tokens_seen": 29647856, + "step": 24355 + }, + { + "epoch": 2.7129969929836286, + "grad_norm": 0.39853641390800476, + "learning_rate": 4.980673904691203e-05, + "loss": 0.0285, + "num_input_tokens_seen": 29654256, + "step": 24360 + }, + { + "epoch": 2.713553847867246, + "grad_norm": 0.15089289844036102, + "learning_rate": 4.980643739629138e-05, + "loss": 0.1414, + "num_input_tokens_seen": 29660720, + "step": 24365 + }, + { + "epoch": 2.714110702750863, + "grad_norm": 0.4654122292995453, + "learning_rate": 4.980613551135405e-05, + "loss": 0.0847, + "num_input_tokens_seen": 29666576, + "step": 24370 + }, + { + "epoch": 2.7146675576344803, + "grad_norm": 1.3596580028533936, + "learning_rate": 4.980583339210289e-05, + "loss": 0.1023, + "num_input_tokens_seen": 29672848, + "step": 24375 + }, + { + "epoch": 2.715224412518098, + "grad_norm": 1.2063591480255127, + "learning_rate": 4.9805531038540766e-05, + "loss": 0.1092, + "num_input_tokens_seen": 29678896, + "step": 24380 + }, + { + "epoch": 2.715781267401715, + "grad_norm": 1.0658730268478394, + "learning_rate": 4.980522845067052e-05, + "loss": 0.1491, + "num_input_tokens_seen": 29685040, + "step": 24385 + }, + { + "epoch": 2.7163381222853324, + "grad_norm": 0.6722075939178467, + "learning_rate": 4.980492562849503e-05, + "loss": 0.081, + "num_input_tokens_seen": 29691344, + "step": 24390 + }, + { + "epoch": 2.7168949771689497, + "grad_norm": 0.2583771347999573, + "learning_rate": 4.980462257201713e-05, + "loss": 0.1554, + "num_input_tokens_seen": 29697264, + "step": 24395 + }, + { + "epoch": 2.717451832052567, + "grad_norm": 1.4315154552459717, + "learning_rate": 4.9804319281239705e-05, + "loss": 0.0838, + "num_input_tokens_seen": 29703280, + "step": 24400 + }, + { + "epoch": 2.7180086869361846, + "grad_norm": 0.014401412568986416, + "learning_rate": 4.980401575616561e-05, + "loss": 0.0537, + "num_input_tokens_seen": 29709040, + "step": 24405 + }, + { + "epoch": 2.718565541819802, + "grad_norm": 0.12666893005371094, + "learning_rate": 4.9803711996797706e-05, + "loss": 0.1302, + "num_input_tokens_seen": 29715312, + "step": 24410 + }, + { + "epoch": 2.719122396703419, + "grad_norm": 0.29321426153182983, + "learning_rate": 4.980340800313889e-05, + "loss": 0.0759, + "num_input_tokens_seen": 29721424, + "step": 24415 + }, + { + "epoch": 2.7196792515870363, + "grad_norm": 0.9044064879417419, + "learning_rate": 4.9803103775191996e-05, + "loss": 0.1723, + "num_input_tokens_seen": 29727856, + "step": 24420 + }, + { + "epoch": 2.7202361064706535, + "grad_norm": 2.2533042430877686, + "learning_rate": 4.980279931295991e-05, + "loss": 0.0901, + "num_input_tokens_seen": 29733808, + "step": 24425 + }, + { + "epoch": 2.720792961354271, + "grad_norm": 0.6810059547424316, + "learning_rate": 4.980249461644553e-05, + "loss": 0.1606, + "num_input_tokens_seen": 29740048, + "step": 24430 + }, + { + "epoch": 2.7213498162378884, + "grad_norm": 0.5344521403312683, + "learning_rate": 4.980218968565171e-05, + "loss": 0.0527, + "num_input_tokens_seen": 29746032, + "step": 24435 + }, + { + "epoch": 2.7219066711215056, + "grad_norm": 0.3330152630805969, + "learning_rate": 4.980188452058133e-05, + "loss": 0.0486, + "num_input_tokens_seen": 29752464, + "step": 24440 + }, + { + "epoch": 2.7224635260051233, + "grad_norm": 0.7303518652915955, + "learning_rate": 4.980157912123729e-05, + "loss": 0.1569, + "num_input_tokens_seen": 29758512, + "step": 24445 + }, + { + "epoch": 2.7230203808887405, + "grad_norm": 0.4236398935317993, + "learning_rate": 4.9801273487622454e-05, + "loss": 0.0989, + "num_input_tokens_seen": 29764752, + "step": 24450 + }, + { + "epoch": 2.7235772357723578, + "grad_norm": 1.2494053840637207, + "learning_rate": 4.9800967619739736e-05, + "loss": 0.1868, + "num_input_tokens_seen": 29770928, + "step": 24455 + }, + { + "epoch": 2.724134090655975, + "grad_norm": 0.9869111180305481, + "learning_rate": 4.9800661517592e-05, + "loss": 0.1573, + "num_input_tokens_seen": 29777328, + "step": 24460 + }, + { + "epoch": 2.7246909455395922, + "grad_norm": 0.00796604435890913, + "learning_rate": 4.980035518118214e-05, + "loss": 0.1394, + "num_input_tokens_seen": 29783600, + "step": 24465 + }, + { + "epoch": 2.72524780042321, + "grad_norm": 0.16218605637550354, + "learning_rate": 4.980004861051306e-05, + "loss": 0.0633, + "num_input_tokens_seen": 29789648, + "step": 24470 + }, + { + "epoch": 2.725804655306827, + "grad_norm": 1.3862541913986206, + "learning_rate": 4.9799741805587655e-05, + "loss": 0.1161, + "num_input_tokens_seen": 29795600, + "step": 24475 + }, + { + "epoch": 2.7263615101904444, + "grad_norm": 0.672927737236023, + "learning_rate": 4.979943476640882e-05, + "loss": 0.3187, + "num_input_tokens_seen": 29801136, + "step": 24480 + }, + { + "epoch": 2.7269183650740616, + "grad_norm": 0.9120215177536011, + "learning_rate": 4.979912749297944e-05, + "loss": 0.0877, + "num_input_tokens_seen": 29807024, + "step": 24485 + }, + { + "epoch": 2.727475219957679, + "grad_norm": 0.01935892552137375, + "learning_rate": 4.979881998530245e-05, + "loss": 0.0865, + "num_input_tokens_seen": 29813040, + "step": 24490 + }, + { + "epoch": 2.7280320748412965, + "grad_norm": 0.9090334177017212, + "learning_rate": 4.979851224338072e-05, + "loss": 0.0708, + "num_input_tokens_seen": 29819376, + "step": 24495 + }, + { + "epoch": 2.7285889297249137, + "grad_norm": 0.14076612889766693, + "learning_rate": 4.979820426721719e-05, + "loss": 0.1044, + "num_input_tokens_seen": 29825392, + "step": 24500 + }, + { + "epoch": 2.729145784608531, + "grad_norm": 0.5172731876373291, + "learning_rate": 4.9797896056814744e-05, + "loss": 0.0856, + "num_input_tokens_seen": 29830960, + "step": 24505 + }, + { + "epoch": 2.729702639492148, + "grad_norm": 0.04160720482468605, + "learning_rate": 4.97975876121763e-05, + "loss": 0.0588, + "num_input_tokens_seen": 29837424, + "step": 24510 + }, + { + "epoch": 2.7302594943757654, + "grad_norm": 1.4971027374267578, + "learning_rate": 4.979727893330478e-05, + "loss": 0.0875, + "num_input_tokens_seen": 29843376, + "step": 24515 + }, + { + "epoch": 2.730816349259383, + "grad_norm": 0.002735974732786417, + "learning_rate": 4.9796970020203093e-05, + "loss": 0.0603, + "num_input_tokens_seen": 29849520, + "step": 24520 + }, + { + "epoch": 2.7313732041430003, + "grad_norm": 0.9909808039665222, + "learning_rate": 4.9796660872874155e-05, + "loss": 0.0862, + "num_input_tokens_seen": 29855216, + "step": 24525 + }, + { + "epoch": 2.7319300590266176, + "grad_norm": 0.8233124613761902, + "learning_rate": 4.979635149132089e-05, + "loss": 0.1156, + "num_input_tokens_seen": 29861744, + "step": 24530 + }, + { + "epoch": 2.7324869139102352, + "grad_norm": 0.09734316915273666, + "learning_rate": 4.979604187554621e-05, + "loss": 0.0641, + "num_input_tokens_seen": 29867920, + "step": 24535 + }, + { + "epoch": 2.7330437687938525, + "grad_norm": 0.18178905546665192, + "learning_rate": 4.9795732025553055e-05, + "loss": 0.0956, + "num_input_tokens_seen": 29873904, + "step": 24540 + }, + { + "epoch": 2.7336006236774697, + "grad_norm": 1.931111216545105, + "learning_rate": 4.9795421941344345e-05, + "loss": 0.1557, + "num_input_tokens_seen": 29880240, + "step": 24545 + }, + { + "epoch": 2.734157478561087, + "grad_norm": 0.3847997486591339, + "learning_rate": 4.979511162292301e-05, + "loss": 0.0386, + "num_input_tokens_seen": 29886384, + "step": 24550 + }, + { + "epoch": 2.734714333444704, + "grad_norm": 1.7128384113311768, + "learning_rate": 4.979480107029198e-05, + "loss": 0.2127, + "num_input_tokens_seen": 29892048, + "step": 24555 + }, + { + "epoch": 2.735271188328322, + "grad_norm": 0.6527377367019653, + "learning_rate": 4.979449028345419e-05, + "loss": 0.1633, + "num_input_tokens_seen": 29898064, + "step": 24560 + }, + { + "epoch": 2.735828043211939, + "grad_norm": 0.7528226971626282, + "learning_rate": 4.979417926241257e-05, + "loss": 0.2, + "num_input_tokens_seen": 29904464, + "step": 24565 + }, + { + "epoch": 2.7363848980955563, + "grad_norm": 1.8332934379577637, + "learning_rate": 4.979386800717006e-05, + "loss": 0.1211, + "num_input_tokens_seen": 29910960, + "step": 24570 + }, + { + "epoch": 2.7369417529791735, + "grad_norm": 0.13041247427463531, + "learning_rate": 4.9793556517729614e-05, + "loss": 0.0914, + "num_input_tokens_seen": 29916688, + "step": 24575 + }, + { + "epoch": 2.7374986078627908, + "grad_norm": 0.45740050077438354, + "learning_rate": 4.979324479409415e-05, + "loss": 0.2598, + "num_input_tokens_seen": 29922832, + "step": 24580 + }, + { + "epoch": 2.7380554627464084, + "grad_norm": 1.273545265197754, + "learning_rate": 4.979293283626663e-05, + "loss": 0.143, + "num_input_tokens_seen": 29928976, + "step": 24585 + }, + { + "epoch": 2.7386123176300257, + "grad_norm": 0.07943660020828247, + "learning_rate": 4.9792620644249997e-05, + "loss": 0.0749, + "num_input_tokens_seen": 29935088, + "step": 24590 + }, + { + "epoch": 2.739169172513643, + "grad_norm": 0.2774450480937958, + "learning_rate": 4.9792308218047195e-05, + "loss": 0.0467, + "num_input_tokens_seen": 29941360, + "step": 24595 + }, + { + "epoch": 2.73972602739726, + "grad_norm": 0.9015700221061707, + "learning_rate": 4.979199555766118e-05, + "loss": 0.0701, + "num_input_tokens_seen": 29947888, + "step": 24600 + }, + { + "epoch": 2.7402828822808774, + "grad_norm": 0.15152905881404877, + "learning_rate": 4.979168266309491e-05, + "loss": 0.0578, + "num_input_tokens_seen": 29954064, + "step": 24605 + }, + { + "epoch": 2.740839737164495, + "grad_norm": 0.7845818996429443, + "learning_rate": 4.9791369534351325e-05, + "loss": 0.1264, + "num_input_tokens_seen": 29960112, + "step": 24610 + }, + { + "epoch": 2.7413965920481123, + "grad_norm": 0.5017778873443604, + "learning_rate": 4.9791056171433395e-05, + "loss": 0.1009, + "num_input_tokens_seen": 29966192, + "step": 24615 + }, + { + "epoch": 2.7419534469317295, + "grad_norm": 0.25506672263145447, + "learning_rate": 4.979074257434408e-05, + "loss": 0.0332, + "num_input_tokens_seen": 29972272, + "step": 24620 + }, + { + "epoch": 2.742510301815347, + "grad_norm": 0.24155420064926147, + "learning_rate": 4.979042874308634e-05, + "loss": 0.0548, + "num_input_tokens_seen": 29978608, + "step": 24625 + }, + { + "epoch": 2.7430671566989644, + "grad_norm": 0.09393534064292908, + "learning_rate": 4.9790114677663134e-05, + "loss": 0.0664, + "num_input_tokens_seen": 29984560, + "step": 24630 + }, + { + "epoch": 2.7436240115825816, + "grad_norm": 0.3409527540206909, + "learning_rate": 4.9789800378077434e-05, + "loss": 0.0876, + "num_input_tokens_seen": 29990576, + "step": 24635 + }, + { + "epoch": 2.744180866466199, + "grad_norm": 0.013877296820282936, + "learning_rate": 4.978948584433221e-05, + "loss": 0.075, + "num_input_tokens_seen": 29996656, + "step": 24640 + }, + { + "epoch": 2.744737721349816, + "grad_norm": 0.493752658367157, + "learning_rate": 4.978917107643043e-05, + "loss": 0.0358, + "num_input_tokens_seen": 30002992, + "step": 24645 + }, + { + "epoch": 2.7452945762334338, + "grad_norm": 1.0839197635650635, + "learning_rate": 4.978885607437507e-05, + "loss": 0.1315, + "num_input_tokens_seen": 30009008, + "step": 24650 + }, + { + "epoch": 2.745851431117051, + "grad_norm": 1.6200056076049805, + "learning_rate": 4.978854083816911e-05, + "loss": 0.093, + "num_input_tokens_seen": 30015184, + "step": 24655 + }, + { + "epoch": 2.746408286000668, + "grad_norm": 0.42990607023239136, + "learning_rate": 4.978822536781551e-05, + "loss": 0.0369, + "num_input_tokens_seen": 30021776, + "step": 24660 + }, + { + "epoch": 2.7469651408842855, + "grad_norm": 1.8714643716812134, + "learning_rate": 4.978790966331727e-05, + "loss": 0.1536, + "num_input_tokens_seen": 30027344, + "step": 24665 + }, + { + "epoch": 2.7475219957679027, + "grad_norm": 1.479516625404358, + "learning_rate": 4.978759372467735e-05, + "loss": 0.1463, + "num_input_tokens_seen": 30033232, + "step": 24670 + }, + { + "epoch": 2.7480788506515204, + "grad_norm": 0.35857903957366943, + "learning_rate": 4.978727755189876e-05, + "loss": 0.0556, + "num_input_tokens_seen": 30039248, + "step": 24675 + }, + { + "epoch": 2.7486357055351376, + "grad_norm": 0.37582525610923767, + "learning_rate": 4.978696114498447e-05, + "loss": 0.018, + "num_input_tokens_seen": 30045488, + "step": 24680 + }, + { + "epoch": 2.749192560418755, + "grad_norm": 0.24603919684886932, + "learning_rate": 4.978664450393748e-05, + "loss": 0.1189, + "num_input_tokens_seen": 30051888, + "step": 24685 + }, + { + "epoch": 2.749749415302372, + "grad_norm": 0.3893771767616272, + "learning_rate": 4.9786327628760765e-05, + "loss": 0.1101, + "num_input_tokens_seen": 30057712, + "step": 24690 + }, + { + "epoch": 2.7503062701859893, + "grad_norm": 0.39216750860214233, + "learning_rate": 4.9786010519457336e-05, + "loss": 0.0998, + "num_input_tokens_seen": 30064112, + "step": 24695 + }, + { + "epoch": 2.750863125069607, + "grad_norm": 0.17528437077999115, + "learning_rate": 4.978569317603017e-05, + "loss": 0.0245, + "num_input_tokens_seen": 30070384, + "step": 24700 + }, + { + "epoch": 2.751419979953224, + "grad_norm": 0.04012668877840042, + "learning_rate": 4.978537559848228e-05, + "loss": 0.0222, + "num_input_tokens_seen": 30076912, + "step": 24705 + }, + { + "epoch": 2.7519768348368414, + "grad_norm": 1.0762031078338623, + "learning_rate": 4.978505778681666e-05, + "loss": 0.1057, + "num_input_tokens_seen": 30083216, + "step": 24710 + }, + { + "epoch": 2.752533689720459, + "grad_norm": 0.11392568051815033, + "learning_rate": 4.9784739741036306e-05, + "loss": 0.1023, + "num_input_tokens_seen": 30089296, + "step": 24715 + }, + { + "epoch": 2.7530905446040763, + "grad_norm": 0.4051455855369568, + "learning_rate": 4.978442146114424e-05, + "loss": 0.125, + "num_input_tokens_seen": 30095248, + "step": 24720 + }, + { + "epoch": 2.7536473994876935, + "grad_norm": 0.5896056294441223, + "learning_rate": 4.978410294714344e-05, + "loss": 0.1792, + "num_input_tokens_seen": 30101360, + "step": 24725 + }, + { + "epoch": 2.754204254371311, + "grad_norm": 1.1203384399414062, + "learning_rate": 4.978378419903694e-05, + "loss": 0.0734, + "num_input_tokens_seen": 30107568, + "step": 24730 + }, + { + "epoch": 2.754761109254928, + "grad_norm": 0.7433817386627197, + "learning_rate": 4.978346521682774e-05, + "loss": 0.1342, + "num_input_tokens_seen": 30113584, + "step": 24735 + }, + { + "epoch": 2.7553179641385457, + "grad_norm": 0.2853846251964569, + "learning_rate": 4.978314600051885e-05, + "loss": 0.0375, + "num_input_tokens_seen": 30119568, + "step": 24740 + }, + { + "epoch": 2.755874819022163, + "grad_norm": 0.06719546765089035, + "learning_rate": 4.9782826550113305e-05, + "loss": 0.1657, + "num_input_tokens_seen": 30125936, + "step": 24745 + }, + { + "epoch": 2.75643167390578, + "grad_norm": 0.5446068048477173, + "learning_rate": 4.9782506865614095e-05, + "loss": 0.1117, + "num_input_tokens_seen": 30131920, + "step": 24750 + }, + { + "epoch": 2.7569885287893974, + "grad_norm": 0.25941964983940125, + "learning_rate": 4.978218694702426e-05, + "loss": 0.0592, + "num_input_tokens_seen": 30138448, + "step": 24755 + }, + { + "epoch": 2.7575453836730146, + "grad_norm": 0.011776470579206944, + "learning_rate": 4.978186679434681e-05, + "loss": 0.0464, + "num_input_tokens_seen": 30144720, + "step": 24760 + }, + { + "epoch": 2.7581022385566323, + "grad_norm": 0.4014146625995636, + "learning_rate": 4.978154640758477e-05, + "loss": 0.0954, + "num_input_tokens_seen": 30151216, + "step": 24765 + }, + { + "epoch": 2.7586590934402495, + "grad_norm": 0.3049580454826355, + "learning_rate": 4.978122578674117e-05, + "loss": 0.1364, + "num_input_tokens_seen": 30157200, + "step": 24770 + }, + { + "epoch": 2.7592159483238667, + "grad_norm": 0.04168538749217987, + "learning_rate": 4.978090493181904e-05, + "loss": 0.071, + "num_input_tokens_seen": 30163312, + "step": 24775 + }, + { + "epoch": 2.759772803207484, + "grad_norm": 0.43298450112342834, + "learning_rate": 4.9780583842821414e-05, + "loss": 0.1235, + "num_input_tokens_seen": 30169488, + "step": 24780 + }, + { + "epoch": 2.760329658091101, + "grad_norm": 2.0094594955444336, + "learning_rate": 4.978026251975131e-05, + "loss": 0.218, + "num_input_tokens_seen": 30175632, + "step": 24785 + }, + { + "epoch": 2.760886512974719, + "grad_norm": 0.044193100184202194, + "learning_rate": 4.977994096261178e-05, + "loss": 0.0602, + "num_input_tokens_seen": 30181904, + "step": 24790 + }, + { + "epoch": 2.761443367858336, + "grad_norm": 0.3160291612148285, + "learning_rate": 4.977961917140586e-05, + "loss": 0.1865, + "num_input_tokens_seen": 30188112, + "step": 24795 + }, + { + "epoch": 2.7620002227419533, + "grad_norm": 1.4079070091247559, + "learning_rate": 4.977929714613657e-05, + "loss": 0.162, + "num_input_tokens_seen": 30194032, + "step": 24800 + }, + { + "epoch": 2.762557077625571, + "grad_norm": 0.1942765712738037, + "learning_rate": 4.977897488680697e-05, + "loss": 0.0526, + "num_input_tokens_seen": 30200592, + "step": 24805 + }, + { + "epoch": 2.7631139325091882, + "grad_norm": 1.3834943771362305, + "learning_rate": 4.97786523934201e-05, + "loss": 0.0819, + "num_input_tokens_seen": 30206672, + "step": 24810 + }, + { + "epoch": 2.7636707873928055, + "grad_norm": 0.36304688453674316, + "learning_rate": 4.977832966597901e-05, + "loss": 0.2534, + "num_input_tokens_seen": 30212368, + "step": 24815 + }, + { + "epoch": 2.7642276422764227, + "grad_norm": 0.04386238381266594, + "learning_rate": 4.977800670448674e-05, + "loss": 0.1055, + "num_input_tokens_seen": 30218160, + "step": 24820 + }, + { + "epoch": 2.76478449716004, + "grad_norm": 1.0425665378570557, + "learning_rate": 4.977768350894635e-05, + "loss": 0.0632, + "num_input_tokens_seen": 30224112, + "step": 24825 + }, + { + "epoch": 2.7653413520436576, + "grad_norm": 0.43625667691230774, + "learning_rate": 4.977736007936088e-05, + "loss": 0.0947, + "num_input_tokens_seen": 30230224, + "step": 24830 + }, + { + "epoch": 2.765898206927275, + "grad_norm": 0.498362272977829, + "learning_rate": 4.97770364157334e-05, + "loss": 0.0787, + "num_input_tokens_seen": 30236176, + "step": 24835 + }, + { + "epoch": 2.766455061810892, + "grad_norm": 0.453615665435791, + "learning_rate": 4.9776712518066953e-05, + "loss": 0.1525, + "num_input_tokens_seen": 30242256, + "step": 24840 + }, + { + "epoch": 2.7670119166945093, + "grad_norm": 2.493886947631836, + "learning_rate": 4.9776388386364606e-05, + "loss": 0.0824, + "num_input_tokens_seen": 30248464, + "step": 24845 + }, + { + "epoch": 2.7675687715781265, + "grad_norm": 1.279821753501892, + "learning_rate": 4.977606402062943e-05, + "loss": 0.0988, + "num_input_tokens_seen": 30254928, + "step": 24850 + }, + { + "epoch": 2.768125626461744, + "grad_norm": 0.11216359585523605, + "learning_rate": 4.977573942086447e-05, + "loss": 0.1339, + "num_input_tokens_seen": 30261232, + "step": 24855 + }, + { + "epoch": 2.7686824813453614, + "grad_norm": 0.3449386954307556, + "learning_rate": 4.97754145870728e-05, + "loss": 0.1384, + "num_input_tokens_seen": 30267216, + "step": 24860 + }, + { + "epoch": 2.7692393362289787, + "grad_norm": 0.6952743530273438, + "learning_rate": 4.9775089519257496e-05, + "loss": 0.0752, + "num_input_tokens_seen": 30273392, + "step": 24865 + }, + { + "epoch": 2.769796191112596, + "grad_norm": 0.5497209429740906, + "learning_rate": 4.977476421742162e-05, + "loss": 0.0626, + "num_input_tokens_seen": 30279600, + "step": 24870 + }, + { + "epoch": 2.770353045996213, + "grad_norm": 1.0362097024917603, + "learning_rate": 4.9774438681568245e-05, + "loss": 0.0678, + "num_input_tokens_seen": 30285456, + "step": 24875 + }, + { + "epoch": 2.770909900879831, + "grad_norm": 1.0153075456619263, + "learning_rate": 4.977411291170045e-05, + "loss": 0.1771, + "num_input_tokens_seen": 30290832, + "step": 24880 + }, + { + "epoch": 2.771466755763448, + "grad_norm": 0.3827663064002991, + "learning_rate": 4.9773786907821306e-05, + "loss": 0.0441, + "num_input_tokens_seen": 30296368, + "step": 24885 + }, + { + "epoch": 2.7720236106470653, + "grad_norm": 0.7676447629928589, + "learning_rate": 4.97734606699339e-05, + "loss": 0.1886, + "num_input_tokens_seen": 30302448, + "step": 24890 + }, + { + "epoch": 2.772580465530683, + "grad_norm": 0.018826179206371307, + "learning_rate": 4.97731341980413e-05, + "loss": 0.1576, + "num_input_tokens_seen": 30308304, + "step": 24895 + }, + { + "epoch": 2.7731373204143, + "grad_norm": 0.212462916970253, + "learning_rate": 4.977280749214662e-05, + "loss": 0.1201, + "num_input_tokens_seen": 30314384, + "step": 24900 + }, + { + "epoch": 2.7736941752979174, + "grad_norm": 0.48723524808883667, + "learning_rate": 4.977248055225291e-05, + "loss": 0.0264, + "num_input_tokens_seen": 30320752, + "step": 24905 + }, + { + "epoch": 2.7742510301815346, + "grad_norm": 0.06034556031227112, + "learning_rate": 4.977215337836327e-05, + "loss": 0.0633, + "num_input_tokens_seen": 30326512, + "step": 24910 + }, + { + "epoch": 2.774807885065152, + "grad_norm": 0.009355058893561363, + "learning_rate": 4.9771825970480815e-05, + "loss": 0.0714, + "num_input_tokens_seen": 30332816, + "step": 24915 + }, + { + "epoch": 2.7753647399487695, + "grad_norm": 0.3816310465335846, + "learning_rate": 4.9771498328608604e-05, + "loss": 0.064, + "num_input_tokens_seen": 30339216, + "step": 24920 + }, + { + "epoch": 2.7759215948323868, + "grad_norm": 0.8853989839553833, + "learning_rate": 4.9771170452749736e-05, + "loss": 0.036, + "num_input_tokens_seen": 30345520, + "step": 24925 + }, + { + "epoch": 2.776478449716004, + "grad_norm": 0.44448474049568176, + "learning_rate": 4.9770842342907326e-05, + "loss": 0.1123, + "num_input_tokens_seen": 30351312, + "step": 24930 + }, + { + "epoch": 2.7770353045996212, + "grad_norm": 0.241390660405159, + "learning_rate": 4.9770513999084465e-05, + "loss": 0.0284, + "num_input_tokens_seen": 30357264, + "step": 24935 + }, + { + "epoch": 2.7775921594832385, + "grad_norm": 0.701543927192688, + "learning_rate": 4.977018542128425e-05, + "loss": 0.1737, + "num_input_tokens_seen": 30363280, + "step": 24940 + }, + { + "epoch": 2.778149014366856, + "grad_norm": 0.025677338242530823, + "learning_rate": 4.976985660950979e-05, + "loss": 0.1074, + "num_input_tokens_seen": 30369328, + "step": 24945 + }, + { + "epoch": 2.7787058692504734, + "grad_norm": 0.07133986800909042, + "learning_rate": 4.976952756376418e-05, + "loss": 0.0563, + "num_input_tokens_seen": 30375632, + "step": 24950 + }, + { + "epoch": 2.7792627241340906, + "grad_norm": 0.35474175214767456, + "learning_rate": 4.976919828405055e-05, + "loss": 0.0214, + "num_input_tokens_seen": 30382064, + "step": 24955 + }, + { + "epoch": 2.779819579017708, + "grad_norm": 0.3254077732563019, + "learning_rate": 4.9768868770371996e-05, + "loss": 0.0426, + "num_input_tokens_seen": 30388176, + "step": 24960 + }, + { + "epoch": 2.780376433901325, + "grad_norm": 0.37273529171943665, + "learning_rate": 4.976853902273163e-05, + "loss": 0.1365, + "num_input_tokens_seen": 30393968, + "step": 24965 + }, + { + "epoch": 2.7809332887849427, + "grad_norm": 0.43845024704933167, + "learning_rate": 4.976820904113257e-05, + "loss": 0.079, + "num_input_tokens_seen": 30399952, + "step": 24970 + }, + { + "epoch": 2.78149014366856, + "grad_norm": 0.09933681786060333, + "learning_rate": 4.976787882557793e-05, + "loss": 0.0549, + "num_input_tokens_seen": 30406480, + "step": 24975 + }, + { + "epoch": 2.782046998552177, + "grad_norm": 0.08409880846738815, + "learning_rate": 4.976754837607083e-05, + "loss": 0.049, + "num_input_tokens_seen": 30412560, + "step": 24980 + }, + { + "epoch": 2.782603853435795, + "grad_norm": 0.8626718521118164, + "learning_rate": 4.976721769261439e-05, + "loss": 0.1101, + "num_input_tokens_seen": 30418608, + "step": 24985 + }, + { + "epoch": 2.783160708319412, + "grad_norm": 0.046288251876831055, + "learning_rate": 4.976688677521174e-05, + "loss": 0.0407, + "num_input_tokens_seen": 30424528, + "step": 24990 + }, + { + "epoch": 2.7837175632030293, + "grad_norm": 0.7707536816596985, + "learning_rate": 4.9766555623866e-05, + "loss": 0.1015, + "num_input_tokens_seen": 30430704, + "step": 24995 + }, + { + "epoch": 2.7842744180866466, + "grad_norm": 0.6557891964912415, + "learning_rate": 4.97662242385803e-05, + "loss": 0.1425, + "num_input_tokens_seen": 30436912, + "step": 25000 + }, + { + "epoch": 2.784831272970264, + "grad_norm": 1.0920435190200806, + "learning_rate": 4.976589261935777e-05, + "loss": 0.1953, + "num_input_tokens_seen": 30443056, + "step": 25005 + }, + { + "epoch": 2.7853881278538815, + "grad_norm": 0.11783576756715775, + "learning_rate": 4.9765560766201536e-05, + "loss": 0.1161, + "num_input_tokens_seen": 30449232, + "step": 25010 + }, + { + "epoch": 2.7859449827374987, + "grad_norm": 0.2889471650123596, + "learning_rate": 4.976522867911474e-05, + "loss": 0.0553, + "num_input_tokens_seen": 30455248, + "step": 25015 + }, + { + "epoch": 2.786501837621116, + "grad_norm": 1.132270097732544, + "learning_rate": 4.976489635810053e-05, + "loss": 0.0848, + "num_input_tokens_seen": 30461360, + "step": 25020 + }, + { + "epoch": 2.787058692504733, + "grad_norm": 0.10031679272651672, + "learning_rate": 4.976456380316202e-05, + "loss": 0.091, + "num_input_tokens_seen": 30467760, + "step": 25025 + }, + { + "epoch": 2.7876155473883504, + "grad_norm": 0.22946368157863617, + "learning_rate": 4.9764231014302367e-05, + "loss": 0.0632, + "num_input_tokens_seen": 30473072, + "step": 25030 + }, + { + "epoch": 2.788172402271968, + "grad_norm": 1.060822606086731, + "learning_rate": 4.976389799152471e-05, + "loss": 0.0825, + "num_input_tokens_seen": 30479120, + "step": 25035 + }, + { + "epoch": 2.7887292571555853, + "grad_norm": 0.2217940092086792, + "learning_rate": 4.97635647348322e-05, + "loss": 0.0646, + "num_input_tokens_seen": 30485040, + "step": 25040 + }, + { + "epoch": 2.7892861120392025, + "grad_norm": 0.33727961778640747, + "learning_rate": 4.976323124422798e-05, + "loss": 0.0744, + "num_input_tokens_seen": 30491248, + "step": 25045 + }, + { + "epoch": 2.78984296692282, + "grad_norm": 0.20391203463077545, + "learning_rate": 4.97628975197152e-05, + "loss": 0.0747, + "num_input_tokens_seen": 30497136, + "step": 25050 + }, + { + "epoch": 2.790399821806437, + "grad_norm": 0.23832900822162628, + "learning_rate": 4.976256356129702e-05, + "loss": 0.1053, + "num_input_tokens_seen": 30503184, + "step": 25055 + }, + { + "epoch": 2.7909566766900546, + "grad_norm": 0.571532666683197, + "learning_rate": 4.976222936897657e-05, + "loss": 0.058, + "num_input_tokens_seen": 30509264, + "step": 25060 + }, + { + "epoch": 2.791513531573672, + "grad_norm": 0.2736981511116028, + "learning_rate": 4.9761894942757034e-05, + "loss": 0.0757, + "num_input_tokens_seen": 30515504, + "step": 25065 + }, + { + "epoch": 2.792070386457289, + "grad_norm": 1.9963105916976929, + "learning_rate": 4.9761560282641564e-05, + "loss": 0.2009, + "num_input_tokens_seen": 30521424, + "step": 25070 + }, + { + "epoch": 2.792627241340907, + "grad_norm": 0.10361745208501816, + "learning_rate": 4.976122538863332e-05, + "loss": 0.0904, + "num_input_tokens_seen": 30527760, + "step": 25075 + }, + { + "epoch": 2.793184096224524, + "grad_norm": 0.21434880793094635, + "learning_rate": 4.976089026073546e-05, + "loss": 0.039, + "num_input_tokens_seen": 30533872, + "step": 25080 + }, + { + "epoch": 2.7937409511081412, + "grad_norm": 0.4278530180454254, + "learning_rate": 4.9760554898951154e-05, + "loss": 0.0785, + "num_input_tokens_seen": 30539824, + "step": 25085 + }, + { + "epoch": 2.7942978059917585, + "grad_norm": 0.0006949263042770326, + "learning_rate": 4.976021930328357e-05, + "loss": 0.1137, + "num_input_tokens_seen": 30545904, + "step": 25090 + }, + { + "epoch": 2.7948546608753757, + "grad_norm": 0.9641397595405579, + "learning_rate": 4.975988347373588e-05, + "loss": 0.1179, + "num_input_tokens_seen": 30551824, + "step": 25095 + }, + { + "epoch": 2.7954115157589934, + "grad_norm": 0.009426096454262733, + "learning_rate": 4.975954741031125e-05, + "loss": 0.0449, + "num_input_tokens_seen": 30557936, + "step": 25100 + }, + { + "epoch": 2.7959683706426106, + "grad_norm": 1.4797821044921875, + "learning_rate": 4.9759211113012863e-05, + "loss": 0.221, + "num_input_tokens_seen": 30563728, + "step": 25105 + }, + { + "epoch": 2.796525225526228, + "grad_norm": 0.06991811841726303, + "learning_rate": 4.975887458184388e-05, + "loss": 0.1048, + "num_input_tokens_seen": 30569648, + "step": 25110 + }, + { + "epoch": 2.797082080409845, + "grad_norm": 0.15244688093662262, + "learning_rate": 4.9758537816807494e-05, + "loss": 0.1139, + "num_input_tokens_seen": 30575824, + "step": 25115 + }, + { + "epoch": 2.7976389352934623, + "grad_norm": 1.123264193534851, + "learning_rate": 4.975820081790689e-05, + "loss": 0.1418, + "num_input_tokens_seen": 30581200, + "step": 25120 + }, + { + "epoch": 2.79819579017708, + "grad_norm": 0.8704322576522827, + "learning_rate": 4.9757863585145226e-05, + "loss": 0.0654, + "num_input_tokens_seen": 30586928, + "step": 25125 + }, + { + "epoch": 2.798752645060697, + "grad_norm": 0.9757177233695984, + "learning_rate": 4.9757526118525724e-05, + "loss": 0.1295, + "num_input_tokens_seen": 30592976, + "step": 25130 + }, + { + "epoch": 2.7993094999443144, + "grad_norm": 0.058070797473192215, + "learning_rate": 4.975718841805154e-05, + "loss": 0.1022, + "num_input_tokens_seen": 30599184, + "step": 25135 + }, + { + "epoch": 2.799866354827932, + "grad_norm": 0.5126028060913086, + "learning_rate": 4.975685048372588e-05, + "loss": 0.0625, + "num_input_tokens_seen": 30604592, + "step": 25140 + }, + { + "epoch": 2.800423209711549, + "grad_norm": 0.6773502826690674, + "learning_rate": 4.975651231555193e-05, + "loss": 0.0221, + "num_input_tokens_seen": 30610736, + "step": 25145 + }, + { + "epoch": 2.8009800645951666, + "grad_norm": 0.5694398283958435, + "learning_rate": 4.975617391353289e-05, + "loss": 0.0664, + "num_input_tokens_seen": 30616976, + "step": 25150 + }, + { + "epoch": 2.801536919478784, + "grad_norm": 1.8549442291259766, + "learning_rate": 4.975583527767195e-05, + "loss": 0.1684, + "num_input_tokens_seen": 30623472, + "step": 25155 + }, + { + "epoch": 2.802093774362401, + "grad_norm": 2.6662163734436035, + "learning_rate": 4.975549640797231e-05, + "loss": 0.1104, + "num_input_tokens_seen": 30629904, + "step": 25160 + }, + { + "epoch": 2.8026506292460187, + "grad_norm": 0.8744105696678162, + "learning_rate": 4.9755157304437184e-05, + "loss": 0.2199, + "num_input_tokens_seen": 30635696, + "step": 25165 + }, + { + "epoch": 2.803207484129636, + "grad_norm": 0.3656046390533447, + "learning_rate": 4.9754817967069754e-05, + "loss": 0.1393, + "num_input_tokens_seen": 30641808, + "step": 25170 + }, + { + "epoch": 2.803764339013253, + "grad_norm": 0.026876220479607582, + "learning_rate": 4.975447839587324e-05, + "loss": 0.061, + "num_input_tokens_seen": 30647920, + "step": 25175 + }, + { + "epoch": 2.8043211938968704, + "grad_norm": 0.06536837667226791, + "learning_rate": 4.9754138590850844e-05, + "loss": 0.0599, + "num_input_tokens_seen": 30654288, + "step": 25180 + }, + { + "epoch": 2.8048780487804876, + "grad_norm": 0.09120508283376694, + "learning_rate": 4.9753798552005774e-05, + "loss": 0.0109, + "num_input_tokens_seen": 30660528, + "step": 25185 + }, + { + "epoch": 2.8054349036641053, + "grad_norm": 0.3237643539905548, + "learning_rate": 4.9753458279341236e-05, + "loss": 0.0208, + "num_input_tokens_seen": 30666352, + "step": 25190 + }, + { + "epoch": 2.8059917585477225, + "grad_norm": 0.06949540972709656, + "learning_rate": 4.975311777286046e-05, + "loss": 0.0798, + "num_input_tokens_seen": 30672496, + "step": 25195 + }, + { + "epoch": 2.8065486134313398, + "grad_norm": 0.0669947937130928, + "learning_rate": 4.9752777032566654e-05, + "loss": 0.0328, + "num_input_tokens_seen": 30678480, + "step": 25200 + }, + { + "epoch": 2.807105468314957, + "grad_norm": 0.10487911850214005, + "learning_rate": 4.975243605846304e-05, + "loss": 0.0683, + "num_input_tokens_seen": 30684656, + "step": 25205 + }, + { + "epoch": 2.8076623231985742, + "grad_norm": 0.9996651411056519, + "learning_rate": 4.9752094850552835e-05, + "loss": 0.0703, + "num_input_tokens_seen": 30690704, + "step": 25210 + }, + { + "epoch": 2.808219178082192, + "grad_norm": 1.308311104774475, + "learning_rate": 4.975175340883926e-05, + "loss": 0.1522, + "num_input_tokens_seen": 30696912, + "step": 25215 + }, + { + "epoch": 2.808776032965809, + "grad_norm": 0.2126321941614151, + "learning_rate": 4.9751411733325546e-05, + "loss": 0.0862, + "num_input_tokens_seen": 30703280, + "step": 25220 + }, + { + "epoch": 2.8093328878494264, + "grad_norm": 1.7061874866485596, + "learning_rate": 4.975106982401492e-05, + "loss": 0.1059, + "num_input_tokens_seen": 30709616, + "step": 25225 + }, + { + "epoch": 2.809889742733044, + "grad_norm": 0.07700633257627487, + "learning_rate": 4.9750727680910615e-05, + "loss": 0.1648, + "num_input_tokens_seen": 30715632, + "step": 25230 + }, + { + "epoch": 2.810446597616661, + "grad_norm": 1.2416642904281616, + "learning_rate": 4.975038530401584e-05, + "loss": 0.1298, + "num_input_tokens_seen": 30721520, + "step": 25235 + }, + { + "epoch": 2.8110034525002785, + "grad_norm": 0.18157345056533813, + "learning_rate": 4.975004269333386e-05, + "loss": 0.0766, + "num_input_tokens_seen": 30727632, + "step": 25240 + }, + { + "epoch": 2.8115603073838957, + "grad_norm": 0.46778959035873413, + "learning_rate": 4.974969984886789e-05, + "loss": 0.0829, + "num_input_tokens_seen": 30733680, + "step": 25245 + }, + { + "epoch": 2.812117162267513, + "grad_norm": 0.047673195600509644, + "learning_rate": 4.974935677062118e-05, + "loss": 0.1028, + "num_input_tokens_seen": 30739728, + "step": 25250 + }, + { + "epoch": 2.8126740171511306, + "grad_norm": 0.059793516993522644, + "learning_rate": 4.974901345859696e-05, + "loss": 0.0096, + "num_input_tokens_seen": 30745776, + "step": 25255 + }, + { + "epoch": 2.813230872034748, + "grad_norm": 0.010738510638475418, + "learning_rate": 4.974866991279849e-05, + "loss": 0.0594, + "num_input_tokens_seen": 30752016, + "step": 25260 + }, + { + "epoch": 2.813787726918365, + "grad_norm": 0.18008233606815338, + "learning_rate": 4.9748326133229e-05, + "loss": 0.0668, + "num_input_tokens_seen": 30757584, + "step": 25265 + }, + { + "epoch": 2.8143445818019823, + "grad_norm": 0.157833993434906, + "learning_rate": 4.9747982119891736e-05, + "loss": 0.0184, + "num_input_tokens_seen": 30763792, + "step": 25270 + }, + { + "epoch": 2.8149014366855996, + "grad_norm": 0.0435425229370594, + "learning_rate": 4.9747637872789965e-05, + "loss": 0.0906, + "num_input_tokens_seen": 30769392, + "step": 25275 + }, + { + "epoch": 2.8154582915692172, + "grad_norm": 0.17669543623924255, + "learning_rate": 4.974729339192692e-05, + "loss": 0.1673, + "num_input_tokens_seen": 30775824, + "step": 25280 + }, + { + "epoch": 2.8160151464528345, + "grad_norm": 0.08999036997556686, + "learning_rate": 4.974694867730586e-05, + "loss": 0.0728, + "num_input_tokens_seen": 30781968, + "step": 25285 + }, + { + "epoch": 2.8165720013364517, + "grad_norm": 0.6007608771324158, + "learning_rate": 4.974660372893004e-05, + "loss": 0.0484, + "num_input_tokens_seen": 30788368, + "step": 25290 + }, + { + "epoch": 2.817128856220069, + "grad_norm": 0.4794633090496063, + "learning_rate": 4.974625854680273e-05, + "loss": 0.1102, + "num_input_tokens_seen": 30794288, + "step": 25295 + }, + { + "epoch": 2.817685711103686, + "grad_norm": 0.29261311888694763, + "learning_rate": 4.9745913130927167e-05, + "loss": 0.0479, + "num_input_tokens_seen": 30800400, + "step": 25300 + }, + { + "epoch": 2.818242565987304, + "grad_norm": 0.058667443692684174, + "learning_rate": 4.974556748130664e-05, + "loss": 0.0168, + "num_input_tokens_seen": 30806480, + "step": 25305 + }, + { + "epoch": 2.818799420870921, + "grad_norm": 0.004948657006025314, + "learning_rate": 4.97452215979444e-05, + "loss": 0.078, + "num_input_tokens_seen": 30811792, + "step": 25310 + }, + { + "epoch": 2.8193562757545383, + "grad_norm": 0.007602003403007984, + "learning_rate": 4.974487548084372e-05, + "loss": 0.0293, + "num_input_tokens_seen": 30818128, + "step": 25315 + }, + { + "epoch": 2.819913130638156, + "grad_norm": 0.6347076892852783, + "learning_rate": 4.9744529130007865e-05, + "loss": 0.0442, + "num_input_tokens_seen": 30824496, + "step": 25320 + }, + { + "epoch": 2.8204699855217727, + "grad_norm": 0.00797413382679224, + "learning_rate": 4.97441825454401e-05, + "loss": 0.1111, + "num_input_tokens_seen": 30830768, + "step": 25325 + }, + { + "epoch": 2.8210268404053904, + "grad_norm": 0.005680765490978956, + "learning_rate": 4.974383572714372e-05, + "loss": 0.1336, + "num_input_tokens_seen": 30837168, + "step": 25330 + }, + { + "epoch": 2.8215836952890077, + "grad_norm": 0.5150864124298096, + "learning_rate": 4.9743488675121976e-05, + "loss": 0.037, + "num_input_tokens_seen": 30843248, + "step": 25335 + }, + { + "epoch": 2.822140550172625, + "grad_norm": 0.9480615854263306, + "learning_rate": 4.974314138937816e-05, + "loss": 0.1407, + "num_input_tokens_seen": 30849488, + "step": 25340 + }, + { + "epoch": 2.8226974050562426, + "grad_norm": 0.29725828766822815, + "learning_rate": 4.974279386991555e-05, + "loss": 0.1553, + "num_input_tokens_seen": 30855664, + "step": 25345 + }, + { + "epoch": 2.82325425993986, + "grad_norm": 0.3374366760253906, + "learning_rate": 4.974244611673742e-05, + "loss": 0.0649, + "num_input_tokens_seen": 30861552, + "step": 25350 + }, + { + "epoch": 2.823811114823477, + "grad_norm": 0.8801257610321045, + "learning_rate": 4.974209812984707e-05, + "loss": 0.0937, + "num_input_tokens_seen": 30867664, + "step": 25355 + }, + { + "epoch": 2.8243679697070943, + "grad_norm": 0.03382663428783417, + "learning_rate": 4.974174990924778e-05, + "loss": 0.0349, + "num_input_tokens_seen": 30873808, + "step": 25360 + }, + { + "epoch": 2.8249248245907115, + "grad_norm": 0.21851244568824768, + "learning_rate": 4.974140145494285e-05, + "loss": 0.1041, + "num_input_tokens_seen": 30879824, + "step": 25365 + }, + { + "epoch": 2.825481679474329, + "grad_norm": 0.033419571816921234, + "learning_rate": 4.9741052766935546e-05, + "loss": 0.0779, + "num_input_tokens_seen": 30885808, + "step": 25370 + }, + { + "epoch": 2.8260385343579464, + "grad_norm": 0.41753724217414856, + "learning_rate": 4.974070384522918e-05, + "loss": 0.1575, + "num_input_tokens_seen": 30892560, + "step": 25375 + }, + { + "epoch": 2.8265953892415636, + "grad_norm": 0.7168877720832825, + "learning_rate": 4.9740354689827044e-05, + "loss": 0.1991, + "num_input_tokens_seen": 30897456, + "step": 25380 + }, + { + "epoch": 2.827152244125181, + "grad_norm": 0.19427232444286346, + "learning_rate": 4.974000530073244e-05, + "loss": 0.1194, + "num_input_tokens_seen": 30903536, + "step": 25385 + }, + { + "epoch": 2.827709099008798, + "grad_norm": 0.13290168344974518, + "learning_rate": 4.973965567794866e-05, + "loss": 0.1486, + "num_input_tokens_seen": 30909168, + "step": 25390 + }, + { + "epoch": 2.8282659538924158, + "grad_norm": 0.0007756438571959734, + "learning_rate": 4.9739305821479014e-05, + "loss": 0.0355, + "num_input_tokens_seen": 30915728, + "step": 25395 + }, + { + "epoch": 2.828822808776033, + "grad_norm": 0.7247325778007507, + "learning_rate": 4.9738955731326806e-05, + "loss": 0.0358, + "num_input_tokens_seen": 30921744, + "step": 25400 + }, + { + "epoch": 2.82937966365965, + "grad_norm": 0.014493348076939583, + "learning_rate": 4.973860540749534e-05, + "loss": 0.0791, + "num_input_tokens_seen": 30927888, + "step": 25405 + }, + { + "epoch": 2.829936518543268, + "grad_norm": 0.40041980147361755, + "learning_rate": 4.973825484998792e-05, + "loss": 0.051, + "num_input_tokens_seen": 30933968, + "step": 25410 + }, + { + "epoch": 2.830493373426885, + "grad_norm": 0.19327965378761292, + "learning_rate": 4.973790405880787e-05, + "loss": 0.0412, + "num_input_tokens_seen": 30939920, + "step": 25415 + }, + { + "epoch": 2.8310502283105023, + "grad_norm": 0.060083530843257904, + "learning_rate": 4.9737553033958494e-05, + "loss": 0.1203, + "num_input_tokens_seen": 30945840, + "step": 25420 + }, + { + "epoch": 2.8316070831941196, + "grad_norm": 0.7577130198478699, + "learning_rate": 4.973720177544311e-05, + "loss": 0.1004, + "num_input_tokens_seen": 30951984, + "step": 25425 + }, + { + "epoch": 2.832163938077737, + "grad_norm": 0.09651380032300949, + "learning_rate": 4.9736850283265034e-05, + "loss": 0.0549, + "num_input_tokens_seen": 30957808, + "step": 25430 + }, + { + "epoch": 2.8327207929613545, + "grad_norm": 1.5384485721588135, + "learning_rate": 4.9736498557427594e-05, + "loss": 0.0989, + "num_input_tokens_seen": 30963728, + "step": 25435 + }, + { + "epoch": 2.8332776478449717, + "grad_norm": 0.7702617645263672, + "learning_rate": 4.9736146597934095e-05, + "loss": 0.1687, + "num_input_tokens_seen": 30969776, + "step": 25440 + }, + { + "epoch": 2.833834502728589, + "grad_norm": 2.3848609924316406, + "learning_rate": 4.973579440478788e-05, + "loss": 0.2207, + "num_input_tokens_seen": 30975792, + "step": 25445 + }, + { + "epoch": 2.834391357612206, + "grad_norm": 0.9700685143470764, + "learning_rate": 4.973544197799227e-05, + "loss": 0.0954, + "num_input_tokens_seen": 30982000, + "step": 25450 + }, + { + "epoch": 2.8349482124958234, + "grad_norm": 0.11050499230623245, + "learning_rate": 4.973508931755059e-05, + "loss": 0.0358, + "num_input_tokens_seen": 30988272, + "step": 25455 + }, + { + "epoch": 2.835505067379441, + "grad_norm": 0.6368626952171326, + "learning_rate": 4.9734736423466175e-05, + "loss": 0.093, + "num_input_tokens_seen": 30993648, + "step": 25460 + }, + { + "epoch": 2.8360619222630583, + "grad_norm": 0.7474290132522583, + "learning_rate": 4.9734383295742356e-05, + "loss": 0.0646, + "num_input_tokens_seen": 30999824, + "step": 25465 + }, + { + "epoch": 2.8366187771466755, + "grad_norm": 0.4189327657222748, + "learning_rate": 4.9734029934382476e-05, + "loss": 0.0937, + "num_input_tokens_seen": 31005712, + "step": 25470 + }, + { + "epoch": 2.8371756320302928, + "grad_norm": 0.09912397712469101, + "learning_rate": 4.973367633938987e-05, + "loss": 0.0333, + "num_input_tokens_seen": 31011728, + "step": 25475 + }, + { + "epoch": 2.83773248691391, + "grad_norm": 0.7034899592399597, + "learning_rate": 4.973332251076786e-05, + "loss": 0.1434, + "num_input_tokens_seen": 31018000, + "step": 25480 + }, + { + "epoch": 2.8382893417975277, + "grad_norm": 1.2962623834609985, + "learning_rate": 4.9732968448519814e-05, + "loss": 0.1249, + "num_input_tokens_seen": 31024528, + "step": 25485 + }, + { + "epoch": 2.838846196681145, + "grad_norm": 1.0934507846832275, + "learning_rate": 4.973261415264906e-05, + "loss": 0.0851, + "num_input_tokens_seen": 31030032, + "step": 25490 + }, + { + "epoch": 2.839403051564762, + "grad_norm": 0.0034949693363159895, + "learning_rate": 4.973225962315895e-05, + "loss": 0.0667, + "num_input_tokens_seen": 31035856, + "step": 25495 + }, + { + "epoch": 2.83995990644838, + "grad_norm": 0.31207141280174255, + "learning_rate": 4.9731904860052835e-05, + "loss": 0.0527, + "num_input_tokens_seen": 31041584, + "step": 25500 + }, + { + "epoch": 2.840516761331997, + "grad_norm": 1.1978169679641724, + "learning_rate": 4.973154986333406e-05, + "loss": 0.0482, + "num_input_tokens_seen": 31047152, + "step": 25505 + }, + { + "epoch": 2.8410736162156143, + "grad_norm": 0.1414673626422882, + "learning_rate": 4.973119463300599e-05, + "loss": 0.176, + "num_input_tokens_seen": 31053616, + "step": 25510 + }, + { + "epoch": 2.8416304710992315, + "grad_norm": 0.08771771937608719, + "learning_rate": 4.9730839169071966e-05, + "loss": 0.1756, + "num_input_tokens_seen": 31060144, + "step": 25515 + }, + { + "epoch": 2.8421873259828487, + "grad_norm": 0.1810067892074585, + "learning_rate": 4.973048347153535e-05, + "loss": 0.0344, + "num_input_tokens_seen": 31066096, + "step": 25520 + }, + { + "epoch": 2.8427441808664664, + "grad_norm": 0.4746268391609192, + "learning_rate": 4.9730127540399506e-05, + "loss": 0.1526, + "num_input_tokens_seen": 31072144, + "step": 25525 + }, + { + "epoch": 2.8433010357500836, + "grad_norm": 1.558544635772705, + "learning_rate": 4.97297713756678e-05, + "loss": 0.2182, + "num_input_tokens_seen": 31078224, + "step": 25530 + }, + { + "epoch": 2.843857890633701, + "grad_norm": 0.012620998546481133, + "learning_rate": 4.972941497734358e-05, + "loss": 0.1024, + "num_input_tokens_seen": 31084624, + "step": 25535 + }, + { + "epoch": 2.844414745517318, + "grad_norm": 1.5708850622177124, + "learning_rate": 4.972905834543024e-05, + "loss": 0.2266, + "num_input_tokens_seen": 31090576, + "step": 25540 + }, + { + "epoch": 2.8449716004009353, + "grad_norm": 0.18572449684143066, + "learning_rate": 4.972870147993111e-05, + "loss": 0.0688, + "num_input_tokens_seen": 31096688, + "step": 25545 + }, + { + "epoch": 2.845528455284553, + "grad_norm": 0.14939874410629272, + "learning_rate": 4.97283443808496e-05, + "loss": 0.0716, + "num_input_tokens_seen": 31103024, + "step": 25550 + }, + { + "epoch": 2.8460853101681702, + "grad_norm": 0.29722651839256287, + "learning_rate": 4.972798704818905e-05, + "loss": 0.072, + "num_input_tokens_seen": 31109168, + "step": 25555 + }, + { + "epoch": 2.8466421650517875, + "grad_norm": 1.0595399141311646, + "learning_rate": 4.972762948195286e-05, + "loss": 0.2008, + "num_input_tokens_seen": 31115216, + "step": 25560 + }, + { + "epoch": 2.8471990199354047, + "grad_norm": 0.7967730164527893, + "learning_rate": 4.972727168214439e-05, + "loss": 0.3151, + "num_input_tokens_seen": 31121232, + "step": 25565 + }, + { + "epoch": 2.847755874819022, + "grad_norm": 1.0124273300170898, + "learning_rate": 4.972691364876704e-05, + "loss": 0.2142, + "num_input_tokens_seen": 31127440, + "step": 25570 + }, + { + "epoch": 2.8483127297026396, + "grad_norm": 0.1329076588153839, + "learning_rate": 4.9726555381824166e-05, + "loss": 0.1465, + "num_input_tokens_seen": 31133392, + "step": 25575 + }, + { + "epoch": 2.848869584586257, + "grad_norm": 0.5786724090576172, + "learning_rate": 4.9726196881319175e-05, + "loss": 0.2237, + "num_input_tokens_seen": 31139408, + "step": 25580 + }, + { + "epoch": 2.849426439469874, + "grad_norm": 0.079708531498909, + "learning_rate": 4.9725838147255446e-05, + "loss": 0.0081, + "num_input_tokens_seen": 31145648, + "step": 25585 + }, + { + "epoch": 2.8499832943534917, + "grad_norm": 0.7209720015525818, + "learning_rate": 4.972547917963636e-05, + "loss": 0.1377, + "num_input_tokens_seen": 31151664, + "step": 25590 + }, + { + "epoch": 2.850540149237109, + "grad_norm": 0.1051800474524498, + "learning_rate": 4.9725119978465316e-05, + "loss": 0.0686, + "num_input_tokens_seen": 31157808, + "step": 25595 + }, + { + "epoch": 2.851097004120726, + "grad_norm": 0.003809830639511347, + "learning_rate": 4.9724760543745705e-05, + "loss": 0.0126, + "num_input_tokens_seen": 31164176, + "step": 25600 + }, + { + "epoch": 2.8516538590043434, + "grad_norm": 0.10301650315523148, + "learning_rate": 4.9724400875480916e-05, + "loss": 0.13, + "num_input_tokens_seen": 31170448, + "step": 25605 + }, + { + "epoch": 2.8522107138879607, + "grad_norm": 0.10168533772230148, + "learning_rate": 4.972404097367436e-05, + "loss": 0.1387, + "num_input_tokens_seen": 31176464, + "step": 25610 + }, + { + "epoch": 2.8527675687715783, + "grad_norm": 0.8664208054542542, + "learning_rate": 4.9723680838329424e-05, + "loss": 0.164, + "num_input_tokens_seen": 31183088, + "step": 25615 + }, + { + "epoch": 2.8533244236551956, + "grad_norm": 0.32682669162750244, + "learning_rate": 4.972332046944951e-05, + "loss": 0.0615, + "num_input_tokens_seen": 31189232, + "step": 25620 + }, + { + "epoch": 2.853881278538813, + "grad_norm": 0.26461905241012573, + "learning_rate": 4.9722959867038035e-05, + "loss": 0.1452, + "num_input_tokens_seen": 31194704, + "step": 25625 + }, + { + "epoch": 2.85443813342243, + "grad_norm": 0.01125249732285738, + "learning_rate": 4.9722599031098396e-05, + "loss": 0.0807, + "num_input_tokens_seen": 31201264, + "step": 25630 + }, + { + "epoch": 2.8549949883060473, + "grad_norm": 0.23168769478797913, + "learning_rate": 4.9722237961633995e-05, + "loss": 0.1791, + "num_input_tokens_seen": 31207600, + "step": 25635 + }, + { + "epoch": 2.855551843189665, + "grad_norm": 0.19365431368350983, + "learning_rate": 4.972187665864825e-05, + "loss": 0.0084, + "num_input_tokens_seen": 31213776, + "step": 25640 + }, + { + "epoch": 2.856108698073282, + "grad_norm": 0.6901265978813171, + "learning_rate": 4.972151512214458e-05, + "loss": 0.0542, + "num_input_tokens_seen": 31219344, + "step": 25645 + }, + { + "epoch": 2.8566655529568994, + "grad_norm": 0.7669908404350281, + "learning_rate": 4.972115335212638e-05, + "loss": 0.1759, + "num_input_tokens_seen": 31225680, + "step": 25650 + }, + { + "epoch": 2.8572224078405166, + "grad_norm": 0.16885755956172943, + "learning_rate": 4.9720791348597096e-05, + "loss": 0.0136, + "num_input_tokens_seen": 31231728, + "step": 25655 + }, + { + "epoch": 2.857779262724134, + "grad_norm": 0.03902557119727135, + "learning_rate": 4.972042911156012e-05, + "loss": 0.0506, + "num_input_tokens_seen": 31237872, + "step": 25660 + }, + { + "epoch": 2.8583361176077515, + "grad_norm": 1.8089847564697266, + "learning_rate": 4.9720066641018894e-05, + "loss": 0.1679, + "num_input_tokens_seen": 31243952, + "step": 25665 + }, + { + "epoch": 2.8588929724913688, + "grad_norm": 0.04561644047498703, + "learning_rate": 4.971970393697683e-05, + "loss": 0.1031, + "num_input_tokens_seen": 31250160, + "step": 25670 + }, + { + "epoch": 2.859449827374986, + "grad_norm": 0.035619426518678665, + "learning_rate": 4.9719340999437356e-05, + "loss": 0.0114, + "num_input_tokens_seen": 31256304, + "step": 25675 + }, + { + "epoch": 2.8600066822586037, + "grad_norm": 0.8494591116905212, + "learning_rate": 4.97189778284039e-05, + "loss": 0.1255, + "num_input_tokens_seen": 31262320, + "step": 25680 + }, + { + "epoch": 2.860563537142221, + "grad_norm": 0.7138398289680481, + "learning_rate": 4.971861442387989e-05, + "loss": 0.1747, + "num_input_tokens_seen": 31268240, + "step": 25685 + }, + { + "epoch": 2.861120392025838, + "grad_norm": 0.5731020569801331, + "learning_rate": 4.971825078586877e-05, + "loss": 0.0668, + "num_input_tokens_seen": 31274288, + "step": 25690 + }, + { + "epoch": 2.8616772469094554, + "grad_norm": 0.027294965460896492, + "learning_rate": 4.9717886914373966e-05, + "loss": 0.0116, + "num_input_tokens_seen": 31280368, + "step": 25695 + }, + { + "epoch": 2.8622341017930726, + "grad_norm": 0.247349813580513, + "learning_rate": 4.971752280939892e-05, + "loss": 0.0623, + "num_input_tokens_seen": 31286640, + "step": 25700 + }, + { + "epoch": 2.8627909566766903, + "grad_norm": 0.027330420911312103, + "learning_rate": 4.9717158470947063e-05, + "loss": 0.0882, + "num_input_tokens_seen": 31292752, + "step": 25705 + }, + { + "epoch": 2.8633478115603075, + "grad_norm": 0.10388566553592682, + "learning_rate": 4.971679389902184e-05, + "loss": 0.0706, + "num_input_tokens_seen": 31298640, + "step": 25710 + }, + { + "epoch": 2.8639046664439247, + "grad_norm": 0.009723331779241562, + "learning_rate": 4.9716429093626695e-05, + "loss": 0.1209, + "num_input_tokens_seen": 31304976, + "step": 25715 + }, + { + "epoch": 2.864461521327542, + "grad_norm": 0.46237799525260925, + "learning_rate": 4.971606405476508e-05, + "loss": 0.1147, + "num_input_tokens_seen": 31311312, + "step": 25720 + }, + { + "epoch": 2.865018376211159, + "grad_norm": 0.6281974911689758, + "learning_rate": 4.9715698782440434e-05, + "loss": 0.0854, + "num_input_tokens_seen": 31317488, + "step": 25725 + }, + { + "epoch": 2.865575231094777, + "grad_norm": 0.0773361399769783, + "learning_rate": 4.971533327665622e-05, + "loss": 0.0966, + "num_input_tokens_seen": 31323536, + "step": 25730 + }, + { + "epoch": 2.866132085978394, + "grad_norm": 0.8729897737503052, + "learning_rate": 4.9714967537415866e-05, + "loss": 0.0604, + "num_input_tokens_seen": 31329488, + "step": 25735 + }, + { + "epoch": 2.8666889408620113, + "grad_norm": 1.2113029956817627, + "learning_rate": 4.971460156472285e-05, + "loss": 0.136, + "num_input_tokens_seen": 31335728, + "step": 25740 + }, + { + "epoch": 2.8672457957456285, + "grad_norm": 0.005666246637701988, + "learning_rate": 4.9714235358580626e-05, + "loss": 0.0576, + "num_input_tokens_seen": 31342064, + "step": 25745 + }, + { + "epoch": 2.8678026506292458, + "grad_norm": 0.1167159304022789, + "learning_rate": 4.971386891899264e-05, + "loss": 0.109, + "num_input_tokens_seen": 31347472, + "step": 25750 + }, + { + "epoch": 2.8683595055128634, + "grad_norm": 2.2928383350372314, + "learning_rate": 4.9713502245962366e-05, + "loss": 0.1816, + "num_input_tokens_seen": 31353872, + "step": 25755 + }, + { + "epoch": 2.8689163603964807, + "grad_norm": 0.5033034086227417, + "learning_rate": 4.9713135339493264e-05, + "loss": 0.1088, + "num_input_tokens_seen": 31360144, + "step": 25760 + }, + { + "epoch": 2.869473215280098, + "grad_norm": 0.5103476643562317, + "learning_rate": 4.97127681995888e-05, + "loss": 0.0828, + "num_input_tokens_seen": 31366224, + "step": 25765 + }, + { + "epoch": 2.8700300701637156, + "grad_norm": 2.0400960445404053, + "learning_rate": 4.971240082625244e-05, + "loss": 0.1427, + "num_input_tokens_seen": 31372272, + "step": 25770 + }, + { + "epoch": 2.870586925047333, + "grad_norm": 0.16954587399959564, + "learning_rate": 4.9712033219487654e-05, + "loss": 0.0884, + "num_input_tokens_seen": 31378480, + "step": 25775 + }, + { + "epoch": 2.87114377993095, + "grad_norm": 0.21456852555274963, + "learning_rate": 4.971166537929791e-05, + "loss": 0.0865, + "num_input_tokens_seen": 31384848, + "step": 25780 + }, + { + "epoch": 2.8717006348145673, + "grad_norm": 0.6966578960418701, + "learning_rate": 4.9711297305686694e-05, + "loss": 0.0961, + "num_input_tokens_seen": 31390896, + "step": 25785 + }, + { + "epoch": 2.8722574896981845, + "grad_norm": 0.27402645349502563, + "learning_rate": 4.971092899865747e-05, + "loss": 0.0534, + "num_input_tokens_seen": 31396784, + "step": 25790 + }, + { + "epoch": 2.872814344581802, + "grad_norm": 0.30238428711891174, + "learning_rate": 4.971056045821374e-05, + "loss": 0.0352, + "num_input_tokens_seen": 31402736, + "step": 25795 + }, + { + "epoch": 2.8733711994654194, + "grad_norm": 0.1791643351316452, + "learning_rate": 4.9710191684358954e-05, + "loss": 0.0087, + "num_input_tokens_seen": 31409008, + "step": 25800 + }, + { + "epoch": 2.8739280543490366, + "grad_norm": 0.5269036293029785, + "learning_rate": 4.9709822677096606e-05, + "loss": 0.0458, + "num_input_tokens_seen": 31415088, + "step": 25805 + }, + { + "epoch": 2.874484909232654, + "grad_norm": 1.0995519161224365, + "learning_rate": 4.9709453436430196e-05, + "loss": 0.18, + "num_input_tokens_seen": 31421040, + "step": 25810 + }, + { + "epoch": 2.875041764116271, + "grad_norm": 0.5637664794921875, + "learning_rate": 4.97090839623632e-05, + "loss": 0.0839, + "num_input_tokens_seen": 31427344, + "step": 25815 + }, + { + "epoch": 2.875598618999889, + "grad_norm": 0.8994446992874146, + "learning_rate": 4.970871425489911e-05, + "loss": 0.1096, + "num_input_tokens_seen": 31433104, + "step": 25820 + }, + { + "epoch": 2.876155473883506, + "grad_norm": 0.15717272460460663, + "learning_rate": 4.970834431404141e-05, + "loss": 0.0615, + "num_input_tokens_seen": 31439120, + "step": 25825 + }, + { + "epoch": 2.8767123287671232, + "grad_norm": 0.8024091124534607, + "learning_rate": 4.9707974139793614e-05, + "loss": 0.0566, + "num_input_tokens_seen": 31445296, + "step": 25830 + }, + { + "epoch": 2.8772691836507405, + "grad_norm": 0.44323280453681946, + "learning_rate": 4.97076037321592e-05, + "loss": 0.0767, + "num_input_tokens_seen": 31451632, + "step": 25835 + }, + { + "epoch": 2.8778260385343577, + "grad_norm": 1.5247057676315308, + "learning_rate": 4.970723309114167e-05, + "loss": 0.1477, + "num_input_tokens_seen": 31457680, + "step": 25840 + }, + { + "epoch": 2.8783828934179754, + "grad_norm": 0.5176655650138855, + "learning_rate": 4.970686221674453e-05, + "loss": 0.1106, + "num_input_tokens_seen": 31463568, + "step": 25845 + }, + { + "epoch": 2.8789397483015926, + "grad_norm": 0.20151448249816895, + "learning_rate": 4.970649110897129e-05, + "loss": 0.0754, + "num_input_tokens_seen": 31469616, + "step": 25850 + }, + { + "epoch": 2.87949660318521, + "grad_norm": 0.553112804889679, + "learning_rate": 4.970611976782543e-05, + "loss": 0.1152, + "num_input_tokens_seen": 31475216, + "step": 25855 + }, + { + "epoch": 2.8800534580688275, + "grad_norm": 0.30970731377601624, + "learning_rate": 4.970574819331049e-05, + "loss": 0.109, + "num_input_tokens_seen": 31481488, + "step": 25860 + }, + { + "epoch": 2.8806103129524447, + "grad_norm": 1.919298768043518, + "learning_rate": 4.970537638542996e-05, + "loss": 0.1789, + "num_input_tokens_seen": 31487280, + "step": 25865 + }, + { + "epoch": 2.881167167836062, + "grad_norm": 0.07464291155338287, + "learning_rate": 4.9705004344187356e-05, + "loss": 0.0386, + "num_input_tokens_seen": 31493392, + "step": 25870 + }, + { + "epoch": 2.881724022719679, + "grad_norm": 0.2708209455013275, + "learning_rate": 4.970463206958619e-05, + "loss": 0.0142, + "num_input_tokens_seen": 31499504, + "step": 25875 + }, + { + "epoch": 2.8822808776032964, + "grad_norm": 0.004502189811319113, + "learning_rate": 4.9704259561629985e-05, + "loss": 0.0671, + "num_input_tokens_seen": 31505360, + "step": 25880 + }, + { + "epoch": 2.882837732486914, + "grad_norm": 0.5765049457550049, + "learning_rate": 4.9703886820322257e-05, + "loss": 0.1272, + "num_input_tokens_seen": 31511536, + "step": 25885 + }, + { + "epoch": 2.8833945873705313, + "grad_norm": 0.1193399503827095, + "learning_rate": 4.970351384566652e-05, + "loss": 0.0886, + "num_input_tokens_seen": 31518000, + "step": 25890 + }, + { + "epoch": 2.8839514422541486, + "grad_norm": 0.511016845703125, + "learning_rate": 4.97031406376663e-05, + "loss": 0.0934, + "num_input_tokens_seen": 31523728, + "step": 25895 + }, + { + "epoch": 2.884508297137766, + "grad_norm": 0.011397605761885643, + "learning_rate": 4.970276719632513e-05, + "loss": 0.066, + "num_input_tokens_seen": 31529936, + "step": 25900 + }, + { + "epoch": 2.885065152021383, + "grad_norm": 1.4940797090530396, + "learning_rate": 4.9702393521646536e-05, + "loss": 0.1848, + "num_input_tokens_seen": 31536048, + "step": 25905 + }, + { + "epoch": 2.8856220069050007, + "grad_norm": 1.1471686363220215, + "learning_rate": 4.970201961363404e-05, + "loss": 0.0654, + "num_input_tokens_seen": 31542192, + "step": 25910 + }, + { + "epoch": 2.886178861788618, + "grad_norm": 1.1843730211257935, + "learning_rate": 4.970164547229118e-05, + "loss": 0.1195, + "num_input_tokens_seen": 31548400, + "step": 25915 + }, + { + "epoch": 2.886735716672235, + "grad_norm": 0.02705392614006996, + "learning_rate": 4.970127109762148e-05, + "loss": 0.032, + "num_input_tokens_seen": 31554576, + "step": 25920 + }, + { + "epoch": 2.8872925715558524, + "grad_norm": 0.26416024565696716, + "learning_rate": 4.970089648962849e-05, + "loss": 0.0493, + "num_input_tokens_seen": 31560144, + "step": 25925 + }, + { + "epoch": 2.8878494264394696, + "grad_norm": 0.151222363114357, + "learning_rate": 4.9700521648315745e-05, + "loss": 0.0664, + "num_input_tokens_seen": 31566160, + "step": 25930 + }, + { + "epoch": 2.8884062813230873, + "grad_norm": 0.046593111008405685, + "learning_rate": 4.970014657368678e-05, + "loss": 0.0131, + "num_input_tokens_seen": 31572464, + "step": 25935 + }, + { + "epoch": 2.8889631362067045, + "grad_norm": 0.7302428483963013, + "learning_rate": 4.9699771265745144e-05, + "loss": 0.1352, + "num_input_tokens_seen": 31578576, + "step": 25940 + }, + { + "epoch": 2.8895199910903218, + "grad_norm": 0.29028305411338806, + "learning_rate": 4.969939572449438e-05, + "loss": 0.1696, + "num_input_tokens_seen": 31584752, + "step": 25945 + }, + { + "epoch": 2.8900768459739394, + "grad_norm": 1.3960357904434204, + "learning_rate": 4.969901994993803e-05, + "loss": 0.1112, + "num_input_tokens_seen": 31590704, + "step": 25950 + }, + { + "epoch": 2.8906337008575567, + "grad_norm": 0.7419565916061401, + "learning_rate": 4.969864394207965e-05, + "loss": 0.134, + "num_input_tokens_seen": 31596976, + "step": 25955 + }, + { + "epoch": 2.891190555741174, + "grad_norm": 0.04627368599176407, + "learning_rate": 4.969826770092279e-05, + "loss": 0.1162, + "num_input_tokens_seen": 31603440, + "step": 25960 + }, + { + "epoch": 2.891747410624791, + "grad_norm": 0.0725218802690506, + "learning_rate": 4.9697891226471e-05, + "loss": 0.0961, + "num_input_tokens_seen": 31609648, + "step": 25965 + }, + { + "epoch": 2.8923042655084084, + "grad_norm": 0.20614564418792725, + "learning_rate": 4.969751451872785e-05, + "loss": 0.0485, + "num_input_tokens_seen": 31615696, + "step": 25970 + }, + { + "epoch": 2.892861120392026, + "grad_norm": 0.10414847731590271, + "learning_rate": 4.969713757769688e-05, + "loss": 0.0381, + "num_input_tokens_seen": 31621200, + "step": 25975 + }, + { + "epoch": 2.8934179752756433, + "grad_norm": 0.5404050946235657, + "learning_rate": 4.969676040338166e-05, + "loss": 0.0735, + "num_input_tokens_seen": 31627408, + "step": 25980 + }, + { + "epoch": 2.8939748301592605, + "grad_norm": 0.7583516836166382, + "learning_rate": 4.969638299578575e-05, + "loss": 0.0938, + "num_input_tokens_seen": 31633456, + "step": 25985 + }, + { + "epoch": 2.8945316850428777, + "grad_norm": 0.2922256886959076, + "learning_rate": 4.9696005354912714e-05, + "loss": 0.0861, + "num_input_tokens_seen": 31639568, + "step": 25990 + }, + { + "epoch": 2.895088539926495, + "grad_norm": 1.3064024448394775, + "learning_rate": 4.969562748076613e-05, + "loss": 0.1281, + "num_input_tokens_seen": 31645840, + "step": 25995 + }, + { + "epoch": 2.8956453948101126, + "grad_norm": 0.2501140534877777, + "learning_rate": 4.969524937334955e-05, + "loss": 0.0369, + "num_input_tokens_seen": 31651952, + "step": 26000 + }, + { + "epoch": 2.89620224969373, + "grad_norm": 0.5953064560890198, + "learning_rate": 4.9694871032666556e-05, + "loss": 0.077, + "num_input_tokens_seen": 31658064, + "step": 26005 + }, + { + "epoch": 2.896759104577347, + "grad_norm": 1.1060377359390259, + "learning_rate": 4.969449245872072e-05, + "loss": 0.0216, + "num_input_tokens_seen": 31664080, + "step": 26010 + }, + { + "epoch": 2.8973159594609643, + "grad_norm": 0.29888230562210083, + "learning_rate": 4.969411365151562e-05, + "loss": 0.1099, + "num_input_tokens_seen": 31670224, + "step": 26015 + }, + { + "epoch": 2.8978728143445815, + "grad_norm": 1.5253020524978638, + "learning_rate": 4.9693734611054835e-05, + "loss": 0.2015, + "num_input_tokens_seen": 31676688, + "step": 26020 + }, + { + "epoch": 2.8984296692281992, + "grad_norm": 0.31788355112075806, + "learning_rate": 4.969335533734194e-05, + "loss": 0.0473, + "num_input_tokens_seen": 31682832, + "step": 26025 + }, + { + "epoch": 2.8989865241118165, + "grad_norm": 0.8727210164070129, + "learning_rate": 4.9692975830380515e-05, + "loss": 0.0762, + "num_input_tokens_seen": 31688976, + "step": 26030 + }, + { + "epoch": 2.8995433789954337, + "grad_norm": 0.1616658717393875, + "learning_rate": 4.9692596090174153e-05, + "loss": 0.1804, + "num_input_tokens_seen": 31694864, + "step": 26035 + }, + { + "epoch": 2.9001002338790514, + "grad_norm": 0.651487410068512, + "learning_rate": 4.9692216116726435e-05, + "loss": 0.1123, + "num_input_tokens_seen": 31700720, + "step": 26040 + }, + { + "epoch": 2.9006570887626686, + "grad_norm": 0.4457370936870575, + "learning_rate": 4.9691835910040957e-05, + "loss": 0.0869, + "num_input_tokens_seen": 31706768, + "step": 26045 + }, + { + "epoch": 2.901213943646286, + "grad_norm": 0.1394427865743637, + "learning_rate": 4.9691455470121304e-05, + "loss": 0.092, + "num_input_tokens_seen": 31712880, + "step": 26050 + }, + { + "epoch": 2.901770798529903, + "grad_norm": 0.06904073059558868, + "learning_rate": 4.969107479697107e-05, + "loss": 0.031, + "num_input_tokens_seen": 31718864, + "step": 26055 + }, + { + "epoch": 2.9023276534135203, + "grad_norm": 0.4443265199661255, + "learning_rate": 4.9690693890593855e-05, + "loss": 0.0421, + "num_input_tokens_seen": 31725040, + "step": 26060 + }, + { + "epoch": 2.902884508297138, + "grad_norm": 0.15358348190784454, + "learning_rate": 4.969031275099325e-05, + "loss": 0.0335, + "num_input_tokens_seen": 31731088, + "step": 26065 + }, + { + "epoch": 2.903441363180755, + "grad_norm": 0.07650594413280487, + "learning_rate": 4.9689931378172874e-05, + "loss": 0.0379, + "num_input_tokens_seen": 31737296, + "step": 26070 + }, + { + "epoch": 2.9039982180643724, + "grad_norm": 0.03376106172800064, + "learning_rate": 4.96895497721363e-05, + "loss": 0.176, + "num_input_tokens_seen": 31743760, + "step": 26075 + }, + { + "epoch": 2.9045550729479896, + "grad_norm": 0.8892009854316711, + "learning_rate": 4.968916793288715e-05, + "loss": 0.1525, + "num_input_tokens_seen": 31749872, + "step": 26080 + }, + { + "epoch": 2.905111927831607, + "grad_norm": 0.5771512985229492, + "learning_rate": 4.9688785860429034e-05, + "loss": 0.1692, + "num_input_tokens_seen": 31756016, + "step": 26085 + }, + { + "epoch": 2.9056687827152246, + "grad_norm": 0.058692995458841324, + "learning_rate": 4.968840355476554e-05, + "loss": 0.0953, + "num_input_tokens_seen": 31762352, + "step": 26090 + }, + { + "epoch": 2.906225637598842, + "grad_norm": 1.3336766958236694, + "learning_rate": 4.968802101590031e-05, + "loss": 0.1501, + "num_input_tokens_seen": 31768080, + "step": 26095 + }, + { + "epoch": 2.906782492482459, + "grad_norm": 0.3286411166191101, + "learning_rate": 4.968763824383694e-05, + "loss": 0.0961, + "num_input_tokens_seen": 31774000, + "step": 26100 + }, + { + "epoch": 2.9073393473660762, + "grad_norm": 1.0709718465805054, + "learning_rate": 4.9687255238579045e-05, + "loss": 0.2666, + "num_input_tokens_seen": 31779824, + "step": 26105 + }, + { + "epoch": 2.9078962022496935, + "grad_norm": 0.005696948152035475, + "learning_rate": 4.9686872000130244e-05, + "loss": 0.0992, + "num_input_tokens_seen": 31786160, + "step": 26110 + }, + { + "epoch": 2.908453057133311, + "grad_norm": 0.19977638125419617, + "learning_rate": 4.968648852849416e-05, + "loss": 0.0855, + "num_input_tokens_seen": 31792400, + "step": 26115 + }, + { + "epoch": 2.9090099120169284, + "grad_norm": 0.45717284083366394, + "learning_rate": 4.9686104823674404e-05, + "loss": 0.123, + "num_input_tokens_seen": 31797808, + "step": 26120 + }, + { + "epoch": 2.9095667669005456, + "grad_norm": 0.3196859359741211, + "learning_rate": 4.968572088567462e-05, + "loss": 0.135, + "num_input_tokens_seen": 31803792, + "step": 26125 + }, + { + "epoch": 2.9101236217841633, + "grad_norm": 0.013887663371860981, + "learning_rate": 4.968533671449843e-05, + "loss": 0.0341, + "num_input_tokens_seen": 31809840, + "step": 26130 + }, + { + "epoch": 2.9106804766677805, + "grad_norm": 0.12863564491271973, + "learning_rate": 4.9684952310149447e-05, + "loss": 0.1398, + "num_input_tokens_seen": 31815952, + "step": 26135 + }, + { + "epoch": 2.9112373315513977, + "grad_norm": 0.6811627745628357, + "learning_rate": 4.968456767263131e-05, + "loss": 0.1234, + "num_input_tokens_seen": 31822224, + "step": 26140 + }, + { + "epoch": 2.911794186435015, + "grad_norm": 0.012886643409729004, + "learning_rate": 4.9684182801947666e-05, + "loss": 0.054, + "num_input_tokens_seen": 31827728, + "step": 26145 + }, + { + "epoch": 2.912351041318632, + "grad_norm": 0.9937536716461182, + "learning_rate": 4.968379769810213e-05, + "loss": 0.064, + "num_input_tokens_seen": 31833936, + "step": 26150 + }, + { + "epoch": 2.91290789620225, + "grad_norm": 0.8671215176582336, + "learning_rate": 4.968341236109835e-05, + "loss": 0.0825, + "num_input_tokens_seen": 31839568, + "step": 26155 + }, + { + "epoch": 2.913464751085867, + "grad_norm": 0.19188852608203888, + "learning_rate": 4.968302679093996e-05, + "loss": 0.1342, + "num_input_tokens_seen": 31846064, + "step": 26160 + }, + { + "epoch": 2.9140216059694843, + "grad_norm": 2.310589075088501, + "learning_rate": 4.968264098763061e-05, + "loss": 0.1389, + "num_input_tokens_seen": 31852272, + "step": 26165 + }, + { + "epoch": 2.9145784608531016, + "grad_norm": 0.3766412138938904, + "learning_rate": 4.9682254951173945e-05, + "loss": 0.1509, + "num_input_tokens_seen": 31858032, + "step": 26170 + }, + { + "epoch": 2.915135315736719, + "grad_norm": 0.18102934956550598, + "learning_rate": 4.96818686815736e-05, + "loss": 0.183, + "num_input_tokens_seen": 31863696, + "step": 26175 + }, + { + "epoch": 2.9156921706203365, + "grad_norm": 0.754966676235199, + "learning_rate": 4.968148217883324e-05, + "loss": 0.0604, + "num_input_tokens_seen": 31869552, + "step": 26180 + }, + { + "epoch": 2.9162490255039537, + "grad_norm": 0.06721550971269608, + "learning_rate": 4.968109544295649e-05, + "loss": 0.0816, + "num_input_tokens_seen": 31875472, + "step": 26185 + }, + { + "epoch": 2.916805880387571, + "grad_norm": 0.7878251075744629, + "learning_rate": 4.9680708473947035e-05, + "loss": 0.0584, + "num_input_tokens_seen": 31881712, + "step": 26190 + }, + { + "epoch": 2.917362735271188, + "grad_norm": 1.197792649269104, + "learning_rate": 4.9680321271808506e-05, + "loss": 0.0716, + "num_input_tokens_seen": 31887888, + "step": 26195 + }, + { + "epoch": 2.9179195901548054, + "grad_norm": 0.003999346401542425, + "learning_rate": 4.967993383654458e-05, + "loss": 0.0681, + "num_input_tokens_seen": 31894096, + "step": 26200 + }, + { + "epoch": 2.918476445038423, + "grad_norm": 1.0387530326843262, + "learning_rate": 4.96795461681589e-05, + "loss": 0.215, + "num_input_tokens_seen": 31900144, + "step": 26205 + }, + { + "epoch": 2.9190332999220403, + "grad_norm": 0.2441045045852661, + "learning_rate": 4.967915826665512e-05, + "loss": 0.1729, + "num_input_tokens_seen": 31906416, + "step": 26210 + }, + { + "epoch": 2.9195901548056575, + "grad_norm": 1.4850895404815674, + "learning_rate": 4.967877013203693e-05, + "loss": 0.085, + "num_input_tokens_seen": 31912752, + "step": 26215 + }, + { + "epoch": 2.920147009689275, + "grad_norm": 0.31551337242126465, + "learning_rate": 4.9678381764307986e-05, + "loss": 0.0996, + "num_input_tokens_seen": 31919184, + "step": 26220 + }, + { + "epoch": 2.9207038645728924, + "grad_norm": 0.4169149696826935, + "learning_rate": 4.9677993163471947e-05, + "loss": 0.0844, + "num_input_tokens_seen": 31925488, + "step": 26225 + }, + { + "epoch": 2.9212607194565097, + "grad_norm": 0.3109196424484253, + "learning_rate": 4.96776043295325e-05, + "loss": 0.1336, + "num_input_tokens_seen": 31931280, + "step": 26230 + }, + { + "epoch": 2.921817574340127, + "grad_norm": 0.7898166179656982, + "learning_rate": 4.96772152624933e-05, + "loss": 0.116, + "num_input_tokens_seen": 31937552, + "step": 26235 + }, + { + "epoch": 2.922374429223744, + "grad_norm": 0.4010288417339325, + "learning_rate": 4.9676825962358035e-05, + "loss": 0.1026, + "num_input_tokens_seen": 31943728, + "step": 26240 + }, + { + "epoch": 2.922931284107362, + "grad_norm": 0.18840596079826355, + "learning_rate": 4.967643642913038e-05, + "loss": 0.0396, + "num_input_tokens_seen": 31949968, + "step": 26245 + }, + { + "epoch": 2.923488138990979, + "grad_norm": 0.13793815672397614, + "learning_rate": 4.967604666281401e-05, + "loss": 0.0252, + "num_input_tokens_seen": 31955792, + "step": 26250 + }, + { + "epoch": 2.9240449938745963, + "grad_norm": 0.3114777207374573, + "learning_rate": 4.9675656663412605e-05, + "loss": 0.112, + "num_input_tokens_seen": 31962224, + "step": 26255 + }, + { + "epoch": 2.9246018487582135, + "grad_norm": 0.39394745230674744, + "learning_rate": 4.967526643092986e-05, + "loss": 0.0649, + "num_input_tokens_seen": 31968144, + "step": 26260 + }, + { + "epoch": 2.9251587036418307, + "grad_norm": 0.8495294451713562, + "learning_rate": 4.9674875965369446e-05, + "loss": 0.0875, + "num_input_tokens_seen": 31974064, + "step": 26265 + }, + { + "epoch": 2.9257155585254484, + "grad_norm": 0.0007350453524850309, + "learning_rate": 4.967448526673507e-05, + "loss": 0.0408, + "num_input_tokens_seen": 31980176, + "step": 26270 + }, + { + "epoch": 2.9262724134090656, + "grad_norm": 1.073089838027954, + "learning_rate": 4.967409433503041e-05, + "loss": 0.1143, + "num_input_tokens_seen": 31986288, + "step": 26275 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 1.429856300354004, + "learning_rate": 4.967370317025915e-05, + "loss": 0.0838, + "num_input_tokens_seen": 31992496, + "step": 26280 + }, + { + "epoch": 2.9273861231763, + "grad_norm": 0.6187422275543213, + "learning_rate": 4.9673311772425e-05, + "loss": 0.1012, + "num_input_tokens_seen": 31998736, + "step": 26285 + }, + { + "epoch": 2.9279429780599173, + "grad_norm": 1.2589036226272583, + "learning_rate": 4.9672920141531655e-05, + "loss": 0.1184, + "num_input_tokens_seen": 32004848, + "step": 26290 + }, + { + "epoch": 2.928499832943535, + "grad_norm": 0.14921881258487701, + "learning_rate": 4.9672528277582806e-05, + "loss": 0.0495, + "num_input_tokens_seen": 32010768, + "step": 26295 + }, + { + "epoch": 2.9290566878271522, + "grad_norm": 0.7844094038009644, + "learning_rate": 4.967213618058217e-05, + "loss": 0.0602, + "num_input_tokens_seen": 32016848, + "step": 26300 + }, + { + "epoch": 2.9296135427107695, + "grad_norm": 0.8905608654022217, + "learning_rate": 4.9671743850533435e-05, + "loss": 0.0756, + "num_input_tokens_seen": 32022992, + "step": 26305 + }, + { + "epoch": 2.930170397594387, + "grad_norm": 0.8033682107925415, + "learning_rate": 4.967135128744032e-05, + "loss": 0.1783, + "num_input_tokens_seen": 32029264, + "step": 26310 + }, + { + "epoch": 2.9307272524780044, + "grad_norm": 0.26919716596603394, + "learning_rate": 4.967095849130652e-05, + "loss": 0.0438, + "num_input_tokens_seen": 32035152, + "step": 26315 + }, + { + "epoch": 2.9312841073616216, + "grad_norm": 0.1324589103460312, + "learning_rate": 4.9670565462135744e-05, + "loss": 0.2547, + "num_input_tokens_seen": 32041712, + "step": 26320 + }, + { + "epoch": 2.931840962245239, + "grad_norm": 0.26666152477264404, + "learning_rate": 4.967017219993172e-05, + "loss": 0.0941, + "num_input_tokens_seen": 32047792, + "step": 26325 + }, + { + "epoch": 2.932397817128856, + "grad_norm": 0.6038082242012024, + "learning_rate": 4.966977870469815e-05, + "loss": 0.1156, + "num_input_tokens_seen": 32054320, + "step": 26330 + }, + { + "epoch": 2.9329546720124737, + "grad_norm": 0.06970732659101486, + "learning_rate": 4.966938497643876e-05, + "loss": 0.0359, + "num_input_tokens_seen": 32060560, + "step": 26335 + }, + { + "epoch": 2.933511526896091, + "grad_norm": 0.5413913130760193, + "learning_rate": 4.966899101515726e-05, + "loss": 0.0757, + "num_input_tokens_seen": 32066800, + "step": 26340 + }, + { + "epoch": 2.934068381779708, + "grad_norm": 0.11294280737638474, + "learning_rate": 4.966859682085737e-05, + "loss": 0.0918, + "num_input_tokens_seen": 32072976, + "step": 26345 + }, + { + "epoch": 2.9346252366633254, + "grad_norm": 0.3453350067138672, + "learning_rate": 4.966820239354283e-05, + "loss": 0.0769, + "num_input_tokens_seen": 32079024, + "step": 26350 + }, + { + "epoch": 2.9351820915469427, + "grad_norm": 0.806965708732605, + "learning_rate": 4.966780773321735e-05, + "loss": 0.132, + "num_input_tokens_seen": 32085264, + "step": 26355 + }, + { + "epoch": 2.9357389464305603, + "grad_norm": 0.009179460816085339, + "learning_rate": 4.9667412839884664e-05, + "loss": 0.0476, + "num_input_tokens_seen": 32091664, + "step": 26360 + }, + { + "epoch": 2.9362958013141776, + "grad_norm": 0.3061203956604004, + "learning_rate": 4.96670177135485e-05, + "loss": 0.1034, + "num_input_tokens_seen": 32097744, + "step": 26365 + }, + { + "epoch": 2.936852656197795, + "grad_norm": 1.1529536247253418, + "learning_rate": 4.966662235421259e-05, + "loss": 0.1456, + "num_input_tokens_seen": 32103760, + "step": 26370 + }, + { + "epoch": 2.9374095110814125, + "grad_norm": 0.736734926700592, + "learning_rate": 4.9666226761880664e-05, + "loss": 0.0687, + "num_input_tokens_seen": 32110032, + "step": 26375 + }, + { + "epoch": 2.9379663659650292, + "grad_norm": 0.017517853528261185, + "learning_rate": 4.9665830936556466e-05, + "loss": 0.0477, + "num_input_tokens_seen": 32115664, + "step": 26380 + }, + { + "epoch": 2.938523220848647, + "grad_norm": 2.6106197834014893, + "learning_rate": 4.966543487824375e-05, + "loss": 0.1184, + "num_input_tokens_seen": 32121424, + "step": 26385 + }, + { + "epoch": 2.939080075732264, + "grad_norm": 0.38176101446151733, + "learning_rate": 4.966503858694622e-05, + "loss": 0.1242, + "num_input_tokens_seen": 32127280, + "step": 26390 + }, + { + "epoch": 2.9396369306158814, + "grad_norm": 0.20655599236488342, + "learning_rate": 4.9664642062667645e-05, + "loss": 0.0594, + "num_input_tokens_seen": 32133424, + "step": 26395 + }, + { + "epoch": 2.940193785499499, + "grad_norm": 1.1831951141357422, + "learning_rate": 4.9664245305411764e-05, + "loss": 0.06, + "num_input_tokens_seen": 32139696, + "step": 26400 + }, + { + "epoch": 2.9407506403831163, + "grad_norm": 0.6160606741905212, + "learning_rate": 4.9663848315182323e-05, + "loss": 0.1297, + "num_input_tokens_seen": 32146000, + "step": 26405 + }, + { + "epoch": 2.9413074952667335, + "grad_norm": 0.9932460188865662, + "learning_rate": 4.966345109198308e-05, + "loss": 0.1338, + "num_input_tokens_seen": 32152176, + "step": 26410 + }, + { + "epoch": 2.9418643501503507, + "grad_norm": 0.44801270961761475, + "learning_rate": 4.9663053635817774e-05, + "loss": 0.0546, + "num_input_tokens_seen": 32158288, + "step": 26415 + }, + { + "epoch": 2.942421205033968, + "grad_norm": 0.41634589433670044, + "learning_rate": 4.966265594669017e-05, + "loss": 0.1864, + "num_input_tokens_seen": 32164496, + "step": 26420 + }, + { + "epoch": 2.9429780599175857, + "grad_norm": 0.2343510389328003, + "learning_rate": 4.966225802460402e-05, + "loss": 0.0421, + "num_input_tokens_seen": 32170736, + "step": 26425 + }, + { + "epoch": 2.943534914801203, + "grad_norm": 0.6989211440086365, + "learning_rate": 4.9661859869563096e-05, + "loss": 0.0824, + "num_input_tokens_seen": 32176816, + "step": 26430 + }, + { + "epoch": 2.94409176968482, + "grad_norm": 0.20471978187561035, + "learning_rate": 4.9661461481571135e-05, + "loss": 0.1732, + "num_input_tokens_seen": 32183152, + "step": 26435 + }, + { + "epoch": 2.9446486245684373, + "grad_norm": 0.10251415520906448, + "learning_rate": 4.966106286063191e-05, + "loss": 0.0321, + "num_input_tokens_seen": 32189456, + "step": 26440 + }, + { + "epoch": 2.9452054794520546, + "grad_norm": 0.17547273635864258, + "learning_rate": 4.96606640067492e-05, + "loss": 0.0457, + "num_input_tokens_seen": 32195760, + "step": 26445 + }, + { + "epoch": 2.9457623343356723, + "grad_norm": 1.2800147533416748, + "learning_rate": 4.9660264919926744e-05, + "loss": 0.1286, + "num_input_tokens_seen": 32201712, + "step": 26450 + }, + { + "epoch": 2.9463191892192895, + "grad_norm": 0.0016562967794016004, + "learning_rate": 4.9659865600168345e-05, + "loss": 0.0083, + "num_input_tokens_seen": 32208112, + "step": 26455 + }, + { + "epoch": 2.9468760441029067, + "grad_norm": 0.7065065503120422, + "learning_rate": 4.965946604747775e-05, + "loss": 0.0953, + "num_input_tokens_seen": 32214256, + "step": 26460 + }, + { + "epoch": 2.9474328989865244, + "grad_norm": 0.08596830815076828, + "learning_rate": 4.965906626185874e-05, + "loss": 0.0341, + "num_input_tokens_seen": 32220112, + "step": 26465 + }, + { + "epoch": 2.947989753870141, + "grad_norm": 0.3066970705986023, + "learning_rate": 4.965866624331509e-05, + "loss": 0.0672, + "num_input_tokens_seen": 32226480, + "step": 26470 + }, + { + "epoch": 2.948546608753759, + "grad_norm": 0.7447596788406372, + "learning_rate": 4.965826599185059e-05, + "loss": 0.1711, + "num_input_tokens_seen": 32232400, + "step": 26475 + }, + { + "epoch": 2.949103463637376, + "grad_norm": 0.07893125712871552, + "learning_rate": 4.965786550746901e-05, + "loss": 0.2469, + "num_input_tokens_seen": 32238928, + "step": 26480 + }, + { + "epoch": 2.9496603185209933, + "grad_norm": 0.25517600774765015, + "learning_rate": 4.965746479017413e-05, + "loss": 0.0644, + "num_input_tokens_seen": 32245168, + "step": 26485 + }, + { + "epoch": 2.950217173404611, + "grad_norm": 0.2806936800479889, + "learning_rate": 4.9657063839969744e-05, + "loss": 0.1421, + "num_input_tokens_seen": 32251024, + "step": 26490 + }, + { + "epoch": 2.950774028288228, + "grad_norm": 0.7496413588523865, + "learning_rate": 4.965666265685963e-05, + "loss": 0.0568, + "num_input_tokens_seen": 32257296, + "step": 26495 + }, + { + "epoch": 2.9513308831718454, + "grad_norm": 0.2869361937046051, + "learning_rate": 4.965626124084759e-05, + "loss": 0.0328, + "num_input_tokens_seen": 32263440, + "step": 26500 + }, + { + "epoch": 2.9518877380554627, + "grad_norm": 0.5030607581138611, + "learning_rate": 4.96558595919374e-05, + "loss": 0.0537, + "num_input_tokens_seen": 32269680, + "step": 26505 + }, + { + "epoch": 2.95244459293908, + "grad_norm": 0.5387782454490662, + "learning_rate": 4.965545771013287e-05, + "loss": 0.1124, + "num_input_tokens_seen": 32275696, + "step": 26510 + }, + { + "epoch": 2.9530014478226976, + "grad_norm": 0.4137417674064636, + "learning_rate": 4.9655055595437784e-05, + "loss": 0.0401, + "num_input_tokens_seen": 32281616, + "step": 26515 + }, + { + "epoch": 2.953558302706315, + "grad_norm": 0.28552690148353577, + "learning_rate": 4.9654653247855944e-05, + "loss": 0.031, + "num_input_tokens_seen": 32287792, + "step": 26520 + }, + { + "epoch": 2.954115157589932, + "grad_norm": 0.308052122592926, + "learning_rate": 4.965425066739116e-05, + "loss": 0.0269, + "num_input_tokens_seen": 32293808, + "step": 26525 + }, + { + "epoch": 2.9546720124735493, + "grad_norm": 0.43101972341537476, + "learning_rate": 4.965384785404721e-05, + "loss": 0.1318, + "num_input_tokens_seen": 32299920, + "step": 26530 + }, + { + "epoch": 2.9552288673571665, + "grad_norm": 1.4014554023742676, + "learning_rate": 4.965344480782793e-05, + "loss": 0.1558, + "num_input_tokens_seen": 32305808, + "step": 26535 + }, + { + "epoch": 2.955785722240784, + "grad_norm": 0.14969168603420258, + "learning_rate": 4.96530415287371e-05, + "loss": 0.0956, + "num_input_tokens_seen": 32311664, + "step": 26540 + }, + { + "epoch": 2.9563425771244014, + "grad_norm": 0.5325706005096436, + "learning_rate": 4.965263801677855e-05, + "loss": 0.0829, + "num_input_tokens_seen": 32317616, + "step": 26545 + }, + { + "epoch": 2.9568994320080186, + "grad_norm": 0.07623197883367538, + "learning_rate": 4.965223427195608e-05, + "loss": 0.0392, + "num_input_tokens_seen": 32323888, + "step": 26550 + }, + { + "epoch": 2.9574562868916363, + "grad_norm": 0.09833548218011856, + "learning_rate": 4.9651830294273496e-05, + "loss": 0.045, + "num_input_tokens_seen": 32330160, + "step": 26555 + }, + { + "epoch": 2.958013141775253, + "grad_norm": 0.25325217843055725, + "learning_rate": 4.965142608373463e-05, + "loss": 0.0263, + "num_input_tokens_seen": 32336368, + "step": 26560 + }, + { + "epoch": 2.9585699966588708, + "grad_norm": 0.7904902696609497, + "learning_rate": 4.96510216403433e-05, + "loss": 0.0342, + "num_input_tokens_seen": 32342576, + "step": 26565 + }, + { + "epoch": 2.959126851542488, + "grad_norm": 0.46221017837524414, + "learning_rate": 4.965061696410332e-05, + "loss": 0.1183, + "num_input_tokens_seen": 32348464, + "step": 26570 + }, + { + "epoch": 2.9596837064261052, + "grad_norm": 1.4072976112365723, + "learning_rate": 4.965021205501851e-05, + "loss": 0.1263, + "num_input_tokens_seen": 32354896, + "step": 26575 + }, + { + "epoch": 2.960240561309723, + "grad_norm": 0.09040118008852005, + "learning_rate": 4.96498069130927e-05, + "loss": 0.1615, + "num_input_tokens_seen": 32360400, + "step": 26580 + }, + { + "epoch": 2.96079741619334, + "grad_norm": 0.12172283977270126, + "learning_rate": 4.964940153832971e-05, + "loss": 0.1345, + "num_input_tokens_seen": 32366160, + "step": 26585 + }, + { + "epoch": 2.9613542710769574, + "grad_norm": 0.1146525889635086, + "learning_rate": 4.964899593073338e-05, + "loss": 0.0362, + "num_input_tokens_seen": 32372464, + "step": 26590 + }, + { + "epoch": 2.9619111259605746, + "grad_norm": 0.2257019579410553, + "learning_rate": 4.964859009030753e-05, + "loss": 0.0314, + "num_input_tokens_seen": 32377840, + "step": 26595 + }, + { + "epoch": 2.962467980844192, + "grad_norm": 1.660302996635437, + "learning_rate": 4.9648184017056e-05, + "loss": 0.3178, + "num_input_tokens_seen": 32384080, + "step": 26600 + }, + { + "epoch": 2.9630248357278095, + "grad_norm": 0.19686949253082275, + "learning_rate": 4.964777771098262e-05, + "loss": 0.1272, + "num_input_tokens_seen": 32389968, + "step": 26605 + }, + { + "epoch": 2.9635816906114267, + "grad_norm": 0.03494355082511902, + "learning_rate": 4.964737117209124e-05, + "loss": 0.0821, + "num_input_tokens_seen": 32395952, + "step": 26610 + }, + { + "epoch": 2.964138545495044, + "grad_norm": 1.3848706483840942, + "learning_rate": 4.964696440038569e-05, + "loss": 0.1139, + "num_input_tokens_seen": 32402000, + "step": 26615 + }, + { + "epoch": 2.964695400378661, + "grad_norm": 0.7469200491905212, + "learning_rate": 4.964655739586981e-05, + "loss": 0.1404, + "num_input_tokens_seen": 32408304, + "step": 26620 + }, + { + "epoch": 2.9652522552622784, + "grad_norm": 0.7027459144592285, + "learning_rate": 4.964615015854745e-05, + "loss": 0.0252, + "num_input_tokens_seen": 32414544, + "step": 26625 + }, + { + "epoch": 2.965809110145896, + "grad_norm": 0.3927076458930969, + "learning_rate": 4.9645742688422456e-05, + "loss": 0.0353, + "num_input_tokens_seen": 32420656, + "step": 26630 + }, + { + "epoch": 2.9663659650295133, + "grad_norm": 0.359668493270874, + "learning_rate": 4.964533498549868e-05, + "loss": 0.0908, + "num_input_tokens_seen": 32426704, + "step": 26635 + }, + { + "epoch": 2.9669228199131306, + "grad_norm": 0.6278908252716064, + "learning_rate": 4.9644927049779974e-05, + "loss": 0.0873, + "num_input_tokens_seen": 32433136, + "step": 26640 + }, + { + "epoch": 2.9674796747967482, + "grad_norm": 0.005711282137781382, + "learning_rate": 4.964451888127017e-05, + "loss": 0.0559, + "num_input_tokens_seen": 32439600, + "step": 26645 + }, + { + "epoch": 2.968036529680365, + "grad_norm": 0.2310723215341568, + "learning_rate": 4.964411047997316e-05, + "loss": 0.1357, + "num_input_tokens_seen": 32445296, + "step": 26650 + }, + { + "epoch": 2.9685933845639827, + "grad_norm": 0.01784202829003334, + "learning_rate": 4.964370184589277e-05, + "loss": 0.0731, + "num_input_tokens_seen": 32451440, + "step": 26655 + }, + { + "epoch": 2.9691502394476, + "grad_norm": 0.07992304861545563, + "learning_rate": 4.964329297903287e-05, + "loss": 0.0337, + "num_input_tokens_seen": 32457584, + "step": 26660 + }, + { + "epoch": 2.969707094331217, + "grad_norm": 0.07173321396112442, + "learning_rate": 4.9642883879397336e-05, + "loss": 0.0736, + "num_input_tokens_seen": 32463568, + "step": 26665 + }, + { + "epoch": 2.970263949214835, + "grad_norm": 1.6211520433425903, + "learning_rate": 4.964247454699001e-05, + "loss": 0.1811, + "num_input_tokens_seen": 32469872, + "step": 26670 + }, + { + "epoch": 2.970820804098452, + "grad_norm": 1.4162023067474365, + "learning_rate": 4.964206498181477e-05, + "loss": 0.0877, + "num_input_tokens_seen": 32475728, + "step": 26675 + }, + { + "epoch": 2.9713776589820693, + "grad_norm": 0.036426033824682236, + "learning_rate": 4.9641655183875484e-05, + "loss": 0.0698, + "num_input_tokens_seen": 32481904, + "step": 26680 + }, + { + "epoch": 2.9719345138656865, + "grad_norm": 0.04340111464262009, + "learning_rate": 4.964124515317603e-05, + "loss": 0.0733, + "num_input_tokens_seen": 32488240, + "step": 26685 + }, + { + "epoch": 2.9724913687493038, + "grad_norm": 0.4681667387485504, + "learning_rate": 4.964083488972026e-05, + "loss": 0.0239, + "num_input_tokens_seen": 32494096, + "step": 26690 + }, + { + "epoch": 2.9730482236329214, + "grad_norm": 0.04602726548910141, + "learning_rate": 4.964042439351207e-05, + "loss": 0.0941, + "num_input_tokens_seen": 32500208, + "step": 26695 + }, + { + "epoch": 2.9736050785165387, + "grad_norm": 1.0338044166564941, + "learning_rate": 4.9640013664555326e-05, + "loss": 0.1264, + "num_input_tokens_seen": 32506640, + "step": 26700 + }, + { + "epoch": 2.974161933400156, + "grad_norm": 0.6597902774810791, + "learning_rate": 4.9639602702853917e-05, + "loss": 0.0781, + "num_input_tokens_seen": 32512816, + "step": 26705 + }, + { + "epoch": 2.974718788283773, + "grad_norm": 0.9069920778274536, + "learning_rate": 4.963919150841171e-05, + "loss": 0.1062, + "num_input_tokens_seen": 32518672, + "step": 26710 + }, + { + "epoch": 2.9752756431673903, + "grad_norm": 0.26524659991264343, + "learning_rate": 4.963878008123261e-05, + "loss": 0.0988, + "num_input_tokens_seen": 32524656, + "step": 26715 + }, + { + "epoch": 2.975832498051008, + "grad_norm": 0.1986953169107437, + "learning_rate": 4.963836842132049e-05, + "loss": 0.0422, + "num_input_tokens_seen": 32530608, + "step": 26720 + }, + { + "epoch": 2.9763893529346253, + "grad_norm": 0.49664661288261414, + "learning_rate": 4.9637956528679234e-05, + "loss": 0.0754, + "num_input_tokens_seen": 32536912, + "step": 26725 + }, + { + "epoch": 2.9769462078182425, + "grad_norm": 0.6995514631271362, + "learning_rate": 4.963754440331274e-05, + "loss": 0.2162, + "num_input_tokens_seen": 32543312, + "step": 26730 + }, + { + "epoch": 2.97750306270186, + "grad_norm": 0.3704002797603607, + "learning_rate": 4.963713204522491e-05, + "loss": 0.0655, + "num_input_tokens_seen": 32549456, + "step": 26735 + }, + { + "epoch": 2.9780599175854774, + "grad_norm": 0.38528677821159363, + "learning_rate": 4.963671945441962e-05, + "loss": 0.0509, + "num_input_tokens_seen": 32555568, + "step": 26740 + }, + { + "epoch": 2.9786167724690946, + "grad_norm": 0.1363503336906433, + "learning_rate": 4.9636306630900775e-05, + "loss": 0.0821, + "num_input_tokens_seen": 32561744, + "step": 26745 + }, + { + "epoch": 2.979173627352712, + "grad_norm": 0.7043859362602234, + "learning_rate": 4.963589357467228e-05, + "loss": 0.0757, + "num_input_tokens_seen": 32567824, + "step": 26750 + }, + { + "epoch": 2.979730482236329, + "grad_norm": 1.7567471265792847, + "learning_rate": 4.963548028573803e-05, + "loss": 0.192, + "num_input_tokens_seen": 32574224, + "step": 26755 + }, + { + "epoch": 2.9802873371199468, + "grad_norm": 0.02731965109705925, + "learning_rate": 4.963506676410193e-05, + "loss": 0.1051, + "num_input_tokens_seen": 32580816, + "step": 26760 + }, + { + "epoch": 2.980844192003564, + "grad_norm": 2.607187032699585, + "learning_rate": 4.963465300976789e-05, + "loss": 0.1941, + "num_input_tokens_seen": 32587120, + "step": 26765 + }, + { + "epoch": 2.981401046887181, + "grad_norm": 0.16261403262615204, + "learning_rate": 4.963423902273981e-05, + "loss": 0.0134, + "num_input_tokens_seen": 32593488, + "step": 26770 + }, + { + "epoch": 2.9819579017707984, + "grad_norm": 0.180800199508667, + "learning_rate": 4.963382480302161e-05, + "loss": 0.0945, + "num_input_tokens_seen": 32599696, + "step": 26775 + }, + { + "epoch": 2.9825147566544157, + "grad_norm": 0.5514781475067139, + "learning_rate": 4.9633410350617205e-05, + "loss": 0.1281, + "num_input_tokens_seen": 32606064, + "step": 26780 + }, + { + "epoch": 2.9830716115380334, + "grad_norm": 1.157850980758667, + "learning_rate": 4.9632995665530494e-05, + "loss": 0.0904, + "num_input_tokens_seen": 32612208, + "step": 26785 + }, + { + "epoch": 2.9836284664216506, + "grad_norm": 1.1904488801956177, + "learning_rate": 4.9632580747765404e-05, + "loss": 0.076, + "num_input_tokens_seen": 32618320, + "step": 26790 + }, + { + "epoch": 2.984185321305268, + "grad_norm": 0.04107905924320221, + "learning_rate": 4.963216559732585e-05, + "loss": 0.04, + "num_input_tokens_seen": 32624560, + "step": 26795 + }, + { + "epoch": 2.984742176188885, + "grad_norm": 0.19934605062007904, + "learning_rate": 4.963175021421577e-05, + "loss": 0.0408, + "num_input_tokens_seen": 32630448, + "step": 26800 + }, + { + "epoch": 2.9852990310725023, + "grad_norm": 0.18823711574077606, + "learning_rate": 4.9631334598439064e-05, + "loss": 0.0173, + "num_input_tokens_seen": 32636592, + "step": 26805 + }, + { + "epoch": 2.98585588595612, + "grad_norm": 0.008715414442121983, + "learning_rate": 4.9630918749999674e-05, + "loss": 0.0696, + "num_input_tokens_seen": 32642704, + "step": 26810 + }, + { + "epoch": 2.986412740839737, + "grad_norm": 0.8444534540176392, + "learning_rate": 4.9630502668901516e-05, + "loss": 0.0934, + "num_input_tokens_seen": 32648656, + "step": 26815 + }, + { + "epoch": 2.9869695957233544, + "grad_norm": 0.0070344991981983185, + "learning_rate": 4.9630086355148534e-05, + "loss": 0.01, + "num_input_tokens_seen": 32654704, + "step": 26820 + }, + { + "epoch": 2.987526450606972, + "grad_norm": 0.7549980878829956, + "learning_rate": 4.962966980874465e-05, + "loss": 0.041, + "num_input_tokens_seen": 32661200, + "step": 26825 + }, + { + "epoch": 2.9880833054905893, + "grad_norm": 0.10403501987457275, + "learning_rate": 4.96292530296938e-05, + "loss": 0.1077, + "num_input_tokens_seen": 32666768, + "step": 26830 + }, + { + "epoch": 2.9886401603742065, + "grad_norm": 0.9668408036231995, + "learning_rate": 4.9628836017999925e-05, + "loss": 0.2898, + "num_input_tokens_seen": 32673072, + "step": 26835 + }, + { + "epoch": 2.9891970152578238, + "grad_norm": 0.10550790280103683, + "learning_rate": 4.962841877366696e-05, + "loss": 0.1113, + "num_input_tokens_seen": 32679248, + "step": 26840 + }, + { + "epoch": 2.989753870141441, + "grad_norm": 1.0265306234359741, + "learning_rate": 4.9628001296698846e-05, + "loss": 0.0611, + "num_input_tokens_seen": 32685296, + "step": 26845 + }, + { + "epoch": 2.9903107250250587, + "grad_norm": 0.009512027725577354, + "learning_rate": 4.962758358709953e-05, + "loss": 0.1106, + "num_input_tokens_seen": 32691440, + "step": 26850 + }, + { + "epoch": 2.990867579908676, + "grad_norm": 0.12596368789672852, + "learning_rate": 4.962716564487295e-05, + "loss": 0.181, + "num_input_tokens_seen": 32697392, + "step": 26855 + }, + { + "epoch": 2.991424434792293, + "grad_norm": 0.5314089059829712, + "learning_rate": 4.9626747470023074e-05, + "loss": 0.0373, + "num_input_tokens_seen": 32703888, + "step": 26860 + }, + { + "epoch": 2.9919812896759104, + "grad_norm": 0.8053138852119446, + "learning_rate": 4.9626329062553826e-05, + "loss": 0.1809, + "num_input_tokens_seen": 32709776, + "step": 26865 + }, + { + "epoch": 2.9925381445595276, + "grad_norm": 0.6071574091911316, + "learning_rate": 4.962591042246917e-05, + "loss": 0.1124, + "num_input_tokens_seen": 32715856, + "step": 26870 + }, + { + "epoch": 2.9930949994431453, + "grad_norm": 1.237396240234375, + "learning_rate": 4.962549154977306e-05, + "loss": 0.2036, + "num_input_tokens_seen": 32722064, + "step": 26875 + }, + { + "epoch": 2.9936518543267625, + "grad_norm": 0.1980985701084137, + "learning_rate": 4.9625072444469464e-05, + "loss": 0.0895, + "num_input_tokens_seen": 32728144, + "step": 26880 + }, + { + "epoch": 2.9942087092103797, + "grad_norm": 0.2028367519378662, + "learning_rate": 4.962465310656232e-05, + "loss": 0.0768, + "num_input_tokens_seen": 32734384, + "step": 26885 + }, + { + "epoch": 2.994765564093997, + "grad_norm": 0.3479212820529938, + "learning_rate": 4.96242335360556e-05, + "loss": 0.0948, + "num_input_tokens_seen": 32740752, + "step": 26890 + }, + { + "epoch": 2.995322418977614, + "grad_norm": 0.2368408441543579, + "learning_rate": 4.962381373295326e-05, + "loss": 0.0507, + "num_input_tokens_seen": 32746800, + "step": 26895 + }, + { + "epoch": 2.995879273861232, + "grad_norm": 1.0644490718841553, + "learning_rate": 4.962339369725928e-05, + "loss": 0.1151, + "num_input_tokens_seen": 32752880, + "step": 26900 + }, + { + "epoch": 2.996436128744849, + "grad_norm": 0.19067491590976715, + "learning_rate": 4.9622973428977615e-05, + "loss": 0.1847, + "num_input_tokens_seen": 32759216, + "step": 26905 + }, + { + "epoch": 2.9969929836284663, + "grad_norm": 0.4886007606983185, + "learning_rate": 4.962255292811224e-05, + "loss": 0.1109, + "num_input_tokens_seen": 32764720, + "step": 26910 + }, + { + "epoch": 2.997549838512084, + "grad_norm": 0.04249968007206917, + "learning_rate": 4.962213219466712e-05, + "loss": 0.0495, + "num_input_tokens_seen": 32770864, + "step": 26915 + }, + { + "epoch": 2.9981066933957012, + "grad_norm": 1.1599664688110352, + "learning_rate": 4.962171122864624e-05, + "loss": 0.1852, + "num_input_tokens_seen": 32777200, + "step": 26920 + }, + { + "epoch": 2.9986635482793185, + "grad_norm": 0.33954334259033203, + "learning_rate": 4.962129003005357e-05, + "loss": 0.1243, + "num_input_tokens_seen": 32783440, + "step": 26925 + }, + { + "epoch": 2.9992204031629357, + "grad_norm": 0.05180013179779053, + "learning_rate": 4.9620868598893084e-05, + "loss": 0.0594, + "num_input_tokens_seen": 32789584, + "step": 26930 + }, + { + "epoch": 2.999777258046553, + "grad_norm": 1.1456927061080933, + "learning_rate": 4.9620446935168775e-05, + "loss": 0.1132, + "num_input_tokens_seen": 32795952, + "step": 26935 + }, + { + "epoch": 3.0, + "eval_loss": 0.09943488240242004, + "eval_runtime": 112.4285, + "eval_samples_per_second": 35.498, + "eval_steps_per_second": 8.877, + "num_input_tokens_seen": 32797696, + "step": 26937 + }, + { + "epoch": 3.0003341129301706, + "grad_norm": 0.6807231307029724, + "learning_rate": 4.962002503888461e-05, + "loss": 0.0475, + "num_input_tokens_seen": 32801152, + "step": 26940 + }, + { + "epoch": 3.000890967813788, + "grad_norm": 0.04323466867208481, + "learning_rate": 4.9619602910044596e-05, + "loss": 0.0469, + "num_input_tokens_seen": 32807200, + "step": 26945 + }, + { + "epoch": 3.001447822697405, + "grad_norm": 0.025996673852205276, + "learning_rate": 4.96191805486527e-05, + "loss": 0.1349, + "num_input_tokens_seen": 32813312, + "step": 26950 + }, + { + "epoch": 3.0020046775810223, + "grad_norm": 2.132129192352295, + "learning_rate": 4.961875795471292e-05, + "loss": 0.146, + "num_input_tokens_seen": 32819968, + "step": 26955 + }, + { + "epoch": 3.0025615324646395, + "grad_norm": 0.25672319531440735, + "learning_rate": 4.961833512822924e-05, + "loss": 0.1232, + "num_input_tokens_seen": 32826144, + "step": 26960 + }, + { + "epoch": 3.003118387348257, + "grad_norm": 0.009630915708839893, + "learning_rate": 4.961791206920567e-05, + "loss": 0.0696, + "num_input_tokens_seen": 32831904, + "step": 26965 + }, + { + "epoch": 3.0036752422318744, + "grad_norm": 0.025450877845287323, + "learning_rate": 4.96174887776462e-05, + "loss": 0.0771, + "num_input_tokens_seen": 32838144, + "step": 26970 + }, + { + "epoch": 3.0042320971154917, + "grad_norm": 1.2323857545852661, + "learning_rate": 4.961706525355482e-05, + "loss": 0.1407, + "num_input_tokens_seen": 32844032, + "step": 26975 + }, + { + "epoch": 3.004788951999109, + "grad_norm": 1.3959555625915527, + "learning_rate": 4.9616641496935535e-05, + "loss": 0.1523, + "num_input_tokens_seen": 32850144, + "step": 26980 + }, + { + "epoch": 3.0053458068827266, + "grad_norm": 0.1424614042043686, + "learning_rate": 4.961621750779235e-05, + "loss": 0.0538, + "num_input_tokens_seen": 32856192, + "step": 26985 + }, + { + "epoch": 3.005902661766344, + "grad_norm": 0.4321027100086212, + "learning_rate": 4.961579328612927e-05, + "loss": 0.0861, + "num_input_tokens_seen": 32862432, + "step": 26990 + }, + { + "epoch": 3.006459516649961, + "grad_norm": 0.2219657450914383, + "learning_rate": 4.96153688319503e-05, + "loss": 0.0134, + "num_input_tokens_seen": 32868960, + "step": 26995 + }, + { + "epoch": 3.0070163715335783, + "grad_norm": 0.29403144121170044, + "learning_rate": 4.961494414525945e-05, + "loss": 0.1482, + "num_input_tokens_seen": 32875104, + "step": 27000 + }, + { + "epoch": 3.0075732264171955, + "grad_norm": 0.03951242193579674, + "learning_rate": 4.961451922606073e-05, + "loss": 0.0911, + "num_input_tokens_seen": 32881088, + "step": 27005 + }, + { + "epoch": 3.008130081300813, + "grad_norm": 2.408207893371582, + "learning_rate": 4.961409407435815e-05, + "loss": 0.1628, + "num_input_tokens_seen": 32887328, + "step": 27010 + }, + { + "epoch": 3.0086869361844304, + "grad_norm": 2.2140674591064453, + "learning_rate": 4.961366869015573e-05, + "loss": 0.1877, + "num_input_tokens_seen": 32893440, + "step": 27015 + }, + { + "epoch": 3.0092437910680476, + "grad_norm": 0.92658931016922, + "learning_rate": 4.961324307345751e-05, + "loss": 0.1154, + "num_input_tokens_seen": 32899360, + "step": 27020 + }, + { + "epoch": 3.009800645951665, + "grad_norm": 0.6950349807739258, + "learning_rate": 4.961281722426747e-05, + "loss": 0.1244, + "num_input_tokens_seen": 32904928, + "step": 27025 + }, + { + "epoch": 3.0103575008352825, + "grad_norm": 0.11914312839508057, + "learning_rate": 4.961239114258966e-05, + "loss": 0.0469, + "num_input_tokens_seen": 32911200, + "step": 27030 + }, + { + "epoch": 3.0109143557188998, + "grad_norm": 0.005510753486305475, + "learning_rate": 4.9611964828428086e-05, + "loss": 0.0546, + "num_input_tokens_seen": 32917088, + "step": 27035 + }, + { + "epoch": 3.011471210602517, + "grad_norm": 0.6288120746612549, + "learning_rate": 4.9611538281786796e-05, + "loss": 0.1304, + "num_input_tokens_seen": 32922592, + "step": 27040 + }, + { + "epoch": 3.012028065486134, + "grad_norm": 0.004485876765102148, + "learning_rate": 4.9611111502669805e-05, + "loss": 0.0146, + "num_input_tokens_seen": 32929088, + "step": 27045 + }, + { + "epoch": 3.0125849203697515, + "grad_norm": 0.31908559799194336, + "learning_rate": 4.9610684491081146e-05, + "loss": 0.088, + "num_input_tokens_seen": 32934112, + "step": 27050 + }, + { + "epoch": 3.013141775253369, + "grad_norm": 0.7120285034179688, + "learning_rate": 4.961025724702486e-05, + "loss": 0.0476, + "num_input_tokens_seen": 32940064, + "step": 27055 + }, + { + "epoch": 3.0136986301369864, + "grad_norm": 2.5094687938690186, + "learning_rate": 4.960982977050497e-05, + "loss": 0.2797, + "num_input_tokens_seen": 32946144, + "step": 27060 + }, + { + "epoch": 3.0142554850206036, + "grad_norm": 0.045339249074459076, + "learning_rate": 4.9609402061525524e-05, + "loss": 0.1181, + "num_input_tokens_seen": 32952288, + "step": 27065 + }, + { + "epoch": 3.014812339904221, + "grad_norm": 0.5269622206687927, + "learning_rate": 4.960897412009056e-05, + "loss": 0.0978, + "num_input_tokens_seen": 32958592, + "step": 27070 + }, + { + "epoch": 3.0153691947878385, + "grad_norm": 0.12657351791858673, + "learning_rate": 4.960854594620411e-05, + "loss": 0.1201, + "num_input_tokens_seen": 32964960, + "step": 27075 + }, + { + "epoch": 3.0159260496714557, + "grad_norm": 0.42905524373054504, + "learning_rate": 4.9608117539870235e-05, + "loss": 0.0605, + "num_input_tokens_seen": 32971104, + "step": 27080 + }, + { + "epoch": 3.016482904555073, + "grad_norm": 0.024456458166241646, + "learning_rate": 4.960768890109297e-05, + "loss": 0.0291, + "num_input_tokens_seen": 32977088, + "step": 27085 + }, + { + "epoch": 3.01703975943869, + "grad_norm": 0.33775949478149414, + "learning_rate": 4.9607260029876376e-05, + "loss": 0.061, + "num_input_tokens_seen": 32983264, + "step": 27090 + }, + { + "epoch": 3.0175966143223074, + "grad_norm": 0.011676881462335587, + "learning_rate": 4.960683092622449e-05, + "loss": 0.121, + "num_input_tokens_seen": 32989408, + "step": 27095 + }, + { + "epoch": 3.018153469205925, + "grad_norm": 0.05438065156340599, + "learning_rate": 4.960640159014138e-05, + "loss": 0.0601, + "num_input_tokens_seen": 32995552, + "step": 27100 + }, + { + "epoch": 3.0187103240895423, + "grad_norm": 0.03358602896332741, + "learning_rate": 4.960597202163109e-05, + "loss": 0.1556, + "num_input_tokens_seen": 33002016, + "step": 27105 + }, + { + "epoch": 3.0192671789731595, + "grad_norm": 0.0797041580080986, + "learning_rate": 4.960554222069767e-05, + "loss": 0.0774, + "num_input_tokens_seen": 33008128, + "step": 27110 + }, + { + "epoch": 3.019824033856777, + "grad_norm": 0.604069173336029, + "learning_rate": 4.960511218734519e-05, + "loss": 0.0285, + "num_input_tokens_seen": 33014080, + "step": 27115 + }, + { + "epoch": 3.0203808887403945, + "grad_norm": 0.7596750259399414, + "learning_rate": 4.960468192157772e-05, + "loss": 0.0704, + "num_input_tokens_seen": 33020192, + "step": 27120 + }, + { + "epoch": 3.0209377436240117, + "grad_norm": 0.18697889149188995, + "learning_rate": 4.960425142339932e-05, + "loss": 0.0605, + "num_input_tokens_seen": 33026240, + "step": 27125 + }, + { + "epoch": 3.021494598507629, + "grad_norm": 0.11351748555898666, + "learning_rate": 4.9603820692814054e-05, + "loss": 0.1157, + "num_input_tokens_seen": 33031648, + "step": 27130 + }, + { + "epoch": 3.022051453391246, + "grad_norm": 0.7922021150588989, + "learning_rate": 4.960338972982598e-05, + "loss": 0.1645, + "num_input_tokens_seen": 33038048, + "step": 27135 + }, + { + "epoch": 3.0226083082748634, + "grad_norm": 0.3820313513278961, + "learning_rate": 4.9602958534439176e-05, + "loss": 0.0472, + "num_input_tokens_seen": 33043776, + "step": 27140 + }, + { + "epoch": 3.023165163158481, + "grad_norm": 1.2210742235183716, + "learning_rate": 4.960252710665772e-05, + "loss": 0.0794, + "num_input_tokens_seen": 33050112, + "step": 27145 + }, + { + "epoch": 3.0237220180420983, + "grad_norm": 0.06159316375851631, + "learning_rate": 4.9602095446485687e-05, + "loss": 0.0567, + "num_input_tokens_seen": 33056192, + "step": 27150 + }, + { + "epoch": 3.0242788729257155, + "grad_norm": 0.6677621603012085, + "learning_rate": 4.960166355392715e-05, + "loss": 0.0846, + "num_input_tokens_seen": 33061632, + "step": 27155 + }, + { + "epoch": 3.0248357278093327, + "grad_norm": 0.5775237083435059, + "learning_rate": 4.960123142898619e-05, + "loss": 0.177, + "num_input_tokens_seen": 33067776, + "step": 27160 + }, + { + "epoch": 3.0253925826929504, + "grad_norm": 2.368126392364502, + "learning_rate": 4.9600799071666894e-05, + "loss": 0.1399, + "num_input_tokens_seen": 33073920, + "step": 27165 + }, + { + "epoch": 3.0259494375765676, + "grad_norm": 0.3707484304904938, + "learning_rate": 4.9600366481973335e-05, + "loss": 0.0541, + "num_input_tokens_seen": 33080320, + "step": 27170 + }, + { + "epoch": 3.026506292460185, + "grad_norm": 0.6743701696395874, + "learning_rate": 4.959993365990961e-05, + "loss": 0.0412, + "num_input_tokens_seen": 33086592, + "step": 27175 + }, + { + "epoch": 3.027063147343802, + "grad_norm": 0.17158161103725433, + "learning_rate": 4.9599500605479796e-05, + "loss": 0.1372, + "num_input_tokens_seen": 33092640, + "step": 27180 + }, + { + "epoch": 3.0276200022274193, + "grad_norm": 0.2859877645969391, + "learning_rate": 4.9599067318687995e-05, + "loss": 0.0832, + "num_input_tokens_seen": 33098720, + "step": 27185 + }, + { + "epoch": 3.028176857111037, + "grad_norm": 0.4849948585033417, + "learning_rate": 4.95986337995383e-05, + "loss": 0.0687, + "num_input_tokens_seen": 33104960, + "step": 27190 + }, + { + "epoch": 3.0287337119946542, + "grad_norm": 0.050362419337034225, + "learning_rate": 4.959820004803479e-05, + "loss": 0.0058, + "num_input_tokens_seen": 33111264, + "step": 27195 + }, + { + "epoch": 3.0292905668782715, + "grad_norm": 0.1067977100610733, + "learning_rate": 4.959776606418157e-05, + "loss": 0.0242, + "num_input_tokens_seen": 33117568, + "step": 27200 + }, + { + "epoch": 3.0298474217618887, + "grad_norm": 0.012418546713888645, + "learning_rate": 4.9597331847982754e-05, + "loss": 0.0581, + "num_input_tokens_seen": 33123232, + "step": 27205 + }, + { + "epoch": 3.0304042766455064, + "grad_norm": 0.21831239759922028, + "learning_rate": 4.959689739944242e-05, + "loss": 0.0242, + "num_input_tokens_seen": 33129280, + "step": 27210 + }, + { + "epoch": 3.0309611315291236, + "grad_norm": 0.010559115558862686, + "learning_rate": 4.9596462718564695e-05, + "loss": 0.0999, + "num_input_tokens_seen": 33135616, + "step": 27215 + }, + { + "epoch": 3.031517986412741, + "grad_norm": 0.003849245375022292, + "learning_rate": 4.9596027805353666e-05, + "loss": 0.0523, + "num_input_tokens_seen": 33141952, + "step": 27220 + }, + { + "epoch": 3.032074841296358, + "grad_norm": 0.0036576823331415653, + "learning_rate": 4.959559265981345e-05, + "loss": 0.0852, + "num_input_tokens_seen": 33148032, + "step": 27225 + }, + { + "epoch": 3.0326316961799753, + "grad_norm": 0.004088937770575285, + "learning_rate": 4.9595157281948155e-05, + "loss": 0.1223, + "num_input_tokens_seen": 33154144, + "step": 27230 + }, + { + "epoch": 3.033188551063593, + "grad_norm": 0.008653596043586731, + "learning_rate": 4.95947216717619e-05, + "loss": 0.0309, + "num_input_tokens_seen": 33160576, + "step": 27235 + }, + { + "epoch": 3.03374540594721, + "grad_norm": 0.050742220133543015, + "learning_rate": 4.9594285829258794e-05, + "loss": 0.0835, + "num_input_tokens_seen": 33166240, + "step": 27240 + }, + { + "epoch": 3.0343022608308274, + "grad_norm": 0.2022992968559265, + "learning_rate": 4.959384975444294e-05, + "loss": 0.1028, + "num_input_tokens_seen": 33172256, + "step": 27245 + }, + { + "epoch": 3.0348591157144447, + "grad_norm": 0.03520641475915909, + "learning_rate": 4.959341344731848e-05, + "loss": 0.0791, + "num_input_tokens_seen": 33178336, + "step": 27250 + }, + { + "epoch": 3.0354159705980623, + "grad_norm": 0.25385525822639465, + "learning_rate": 4.9592976907889526e-05, + "loss": 0.055, + "num_input_tokens_seen": 33183680, + "step": 27255 + }, + { + "epoch": 3.0359728254816796, + "grad_norm": 0.5926734209060669, + "learning_rate": 4.959254013616021e-05, + "loss": 0.1565, + "num_input_tokens_seen": 33189952, + "step": 27260 + }, + { + "epoch": 3.036529680365297, + "grad_norm": 0.8476154208183289, + "learning_rate": 4.959210313213463e-05, + "loss": 0.1005, + "num_input_tokens_seen": 33195968, + "step": 27265 + }, + { + "epoch": 3.037086535248914, + "grad_norm": 0.3618316352367401, + "learning_rate": 4.959166589581695e-05, + "loss": 0.2174, + "num_input_tokens_seen": 33202272, + "step": 27270 + }, + { + "epoch": 3.0376433901325313, + "grad_norm": 0.16866637766361237, + "learning_rate": 4.9591228427211276e-05, + "loss": 0.0239, + "num_input_tokens_seen": 33208512, + "step": 27275 + }, + { + "epoch": 3.038200245016149, + "grad_norm": 1.3878430128097534, + "learning_rate": 4.9590790726321746e-05, + "loss": 0.0621, + "num_input_tokens_seen": 33214432, + "step": 27280 + }, + { + "epoch": 3.038757099899766, + "grad_norm": 1.2503376007080078, + "learning_rate": 4.95903527931525e-05, + "loss": 0.1888, + "num_input_tokens_seen": 33220608, + "step": 27285 + }, + { + "epoch": 3.0393139547833834, + "grad_norm": 1.562050700187683, + "learning_rate": 4.9589914627707666e-05, + "loss": 0.1251, + "num_input_tokens_seen": 33226784, + "step": 27290 + }, + { + "epoch": 3.0398708096670006, + "grad_norm": 1.165481686592102, + "learning_rate": 4.958947622999139e-05, + "loss": 0.0399, + "num_input_tokens_seen": 33232992, + "step": 27295 + }, + { + "epoch": 3.0404276645506183, + "grad_norm": 0.15056279301643372, + "learning_rate": 4.9589037600007806e-05, + "loss": 0.0512, + "num_input_tokens_seen": 33239200, + "step": 27300 + }, + { + "epoch": 3.0409845194342355, + "grad_norm": 1.9045395851135254, + "learning_rate": 4.9588598737761065e-05, + "loss": 0.2467, + "num_input_tokens_seen": 33245344, + "step": 27305 + }, + { + "epoch": 3.0415413743178528, + "grad_norm": 0.7900893092155457, + "learning_rate": 4.958815964325531e-05, + "loss": 0.0398, + "num_input_tokens_seen": 33251520, + "step": 27310 + }, + { + "epoch": 3.04209822920147, + "grad_norm": 0.9204455614089966, + "learning_rate": 4.958772031649469e-05, + "loss": 0.1912, + "num_input_tokens_seen": 33257056, + "step": 27315 + }, + { + "epoch": 3.0426550840850872, + "grad_norm": 1.7076321840286255, + "learning_rate": 4.958728075748335e-05, + "loss": 0.1745, + "num_input_tokens_seen": 33262752, + "step": 27320 + }, + { + "epoch": 3.043211938968705, + "grad_norm": 0.29108738899230957, + "learning_rate": 4.958684096622544e-05, + "loss": 0.0737, + "num_input_tokens_seen": 33268832, + "step": 27325 + }, + { + "epoch": 3.043768793852322, + "grad_norm": 0.07822010666131973, + "learning_rate": 4.958640094272512e-05, + "loss": 0.1261, + "num_input_tokens_seen": 33274560, + "step": 27330 + }, + { + "epoch": 3.0443256487359394, + "grad_norm": 1.0799700021743774, + "learning_rate": 4.9585960686986546e-05, + "loss": 0.0765, + "num_input_tokens_seen": 33280864, + "step": 27335 + }, + { + "epoch": 3.0448825036195566, + "grad_norm": 0.010270226746797562, + "learning_rate": 4.958552019901388e-05, + "loss": 0.0516, + "num_input_tokens_seen": 33287008, + "step": 27340 + }, + { + "epoch": 3.0454393585031743, + "grad_norm": 0.18496586382389069, + "learning_rate": 4.958507947881127e-05, + "loss": 0.0668, + "num_input_tokens_seen": 33293216, + "step": 27345 + }, + { + "epoch": 3.0459962133867915, + "grad_norm": 0.0191847775131464, + "learning_rate": 4.958463852638289e-05, + "loss": 0.0753, + "num_input_tokens_seen": 33299712, + "step": 27350 + }, + { + "epoch": 3.0465530682704087, + "grad_norm": 0.1548917293548584, + "learning_rate": 4.9584197341732905e-05, + "loss": 0.038, + "num_input_tokens_seen": 33305824, + "step": 27355 + }, + { + "epoch": 3.047109923154026, + "grad_norm": 0.4169257879257202, + "learning_rate": 4.958375592486547e-05, + "loss": 0.1231, + "num_input_tokens_seen": 33312000, + "step": 27360 + }, + { + "epoch": 3.047666778037643, + "grad_norm": 0.7230256795883179, + "learning_rate": 4.9583314275784775e-05, + "loss": 0.1781, + "num_input_tokens_seen": 33317504, + "step": 27365 + }, + { + "epoch": 3.048223632921261, + "grad_norm": 1.4395840167999268, + "learning_rate": 4.9582872394494976e-05, + "loss": 0.0841, + "num_input_tokens_seen": 33323072, + "step": 27370 + }, + { + "epoch": 3.048780487804878, + "grad_norm": 0.3753931224346161, + "learning_rate": 4.9582430281000257e-05, + "loss": 0.0751, + "num_input_tokens_seen": 33329376, + "step": 27375 + }, + { + "epoch": 3.0493373426884953, + "grad_norm": 0.01061855535954237, + "learning_rate": 4.958198793530478e-05, + "loss": 0.026, + "num_input_tokens_seen": 33335584, + "step": 27380 + }, + { + "epoch": 3.0498941975721126, + "grad_norm": 0.07405778020620346, + "learning_rate": 4.958154535741274e-05, + "loss": 0.0638, + "num_input_tokens_seen": 33341312, + "step": 27385 + }, + { + "epoch": 3.0504510524557302, + "grad_norm": 0.015160974115133286, + "learning_rate": 4.958110254732831e-05, + "loss": 0.1116, + "num_input_tokens_seen": 33346720, + "step": 27390 + }, + { + "epoch": 3.0510079073393475, + "grad_norm": 0.05210521072149277, + "learning_rate": 4.958065950505567e-05, + "loss": 0.0373, + "num_input_tokens_seen": 33352864, + "step": 27395 + }, + { + "epoch": 3.0515647622229647, + "grad_norm": 0.10226840525865555, + "learning_rate": 4.958021623059901e-05, + "loss": 0.0092, + "num_input_tokens_seen": 33359168, + "step": 27400 + }, + { + "epoch": 3.052121617106582, + "grad_norm": 0.6350712180137634, + "learning_rate": 4.95797727239625e-05, + "loss": 0.036, + "num_input_tokens_seen": 33365472, + "step": 27405 + }, + { + "epoch": 3.052678471990199, + "grad_norm": 0.12855637073516846, + "learning_rate": 4.957932898515036e-05, + "loss": 0.1303, + "num_input_tokens_seen": 33371840, + "step": 27410 + }, + { + "epoch": 3.053235326873817, + "grad_norm": 0.009712169878184795, + "learning_rate": 4.957888501416676e-05, + "loss": 0.2039, + "num_input_tokens_seen": 33378016, + "step": 27415 + }, + { + "epoch": 3.053792181757434, + "grad_norm": 0.22173136472702026, + "learning_rate": 4.9578440811015896e-05, + "loss": 0.1795, + "num_input_tokens_seen": 33384160, + "step": 27420 + }, + { + "epoch": 3.0543490366410513, + "grad_norm": 0.02635124884545803, + "learning_rate": 4.957799637570197e-05, + "loss": 0.0251, + "num_input_tokens_seen": 33390560, + "step": 27425 + }, + { + "epoch": 3.0549058915246685, + "grad_norm": 0.3642299473285675, + "learning_rate": 4.9577551708229174e-05, + "loss": 0.031, + "num_input_tokens_seen": 33396512, + "step": 27430 + }, + { + "epoch": 3.055462746408286, + "grad_norm": 0.25448790192604065, + "learning_rate": 4.9577106808601714e-05, + "loss": 0.073, + "num_input_tokens_seen": 33402464, + "step": 27435 + }, + { + "epoch": 3.0560196012919034, + "grad_norm": 0.8541176915168762, + "learning_rate": 4.9576661676823786e-05, + "loss": 0.0487, + "num_input_tokens_seen": 33408608, + "step": 27440 + }, + { + "epoch": 3.0565764561755207, + "grad_norm": 0.29842403531074524, + "learning_rate": 4.957621631289961e-05, + "loss": 0.0858, + "num_input_tokens_seen": 33414848, + "step": 27445 + }, + { + "epoch": 3.057133311059138, + "grad_norm": 0.37963035702705383, + "learning_rate": 4.957577071683336e-05, + "loss": 0.0687, + "num_input_tokens_seen": 33421024, + "step": 27450 + }, + { + "epoch": 3.057690165942755, + "grad_norm": 0.2885740101337433, + "learning_rate": 4.9575324888629284e-05, + "loss": 0.1725, + "num_input_tokens_seen": 33426784, + "step": 27455 + }, + { + "epoch": 3.058247020826373, + "grad_norm": 0.12755872309207916, + "learning_rate": 4.957487882829156e-05, + "loss": 0.1185, + "num_input_tokens_seen": 33432960, + "step": 27460 + }, + { + "epoch": 3.05880387570999, + "grad_norm": 0.0019948750268667936, + "learning_rate": 4.957443253582443e-05, + "loss": 0.0489, + "num_input_tokens_seen": 33439200, + "step": 27465 + }, + { + "epoch": 3.0593607305936072, + "grad_norm": 2.265543222427368, + "learning_rate": 4.957398601123209e-05, + "loss": 0.158, + "num_input_tokens_seen": 33445248, + "step": 27470 + }, + { + "epoch": 3.0599175854772245, + "grad_norm": 0.2065024971961975, + "learning_rate": 4.9573539254518766e-05, + "loss": 0.1993, + "num_input_tokens_seen": 33451360, + "step": 27475 + }, + { + "epoch": 3.060474440360842, + "grad_norm": 0.11465948075056076, + "learning_rate": 4.957309226568867e-05, + "loss": 0.1455, + "num_input_tokens_seen": 33457856, + "step": 27480 + }, + { + "epoch": 3.0610312952444594, + "grad_norm": 0.8679394125938416, + "learning_rate": 4.957264504474604e-05, + "loss": 0.085, + "num_input_tokens_seen": 33464192, + "step": 27485 + }, + { + "epoch": 3.0615881501280766, + "grad_norm": 1.513372540473938, + "learning_rate": 4.957219759169508e-05, + "loss": 0.2934, + "num_input_tokens_seen": 33470368, + "step": 27490 + }, + { + "epoch": 3.062145005011694, + "grad_norm": 0.12192486226558685, + "learning_rate": 4.9571749906540026e-05, + "loss": 0.035, + "num_input_tokens_seen": 33476608, + "step": 27495 + }, + { + "epoch": 3.062701859895311, + "grad_norm": 1.043316125869751, + "learning_rate": 4.957130198928511e-05, + "loss": 0.1137, + "num_input_tokens_seen": 33482560, + "step": 27500 + }, + { + "epoch": 3.0632587147789287, + "grad_norm": 0.03180031105875969, + "learning_rate": 4.957085383993457e-05, + "loss": 0.0037, + "num_input_tokens_seen": 33488800, + "step": 27505 + }, + { + "epoch": 3.063815569662546, + "grad_norm": 1.358911395072937, + "learning_rate": 4.957040545849262e-05, + "loss": 0.1739, + "num_input_tokens_seen": 33494784, + "step": 27510 + }, + { + "epoch": 3.064372424546163, + "grad_norm": 0.5326030254364014, + "learning_rate": 4.9569956844963505e-05, + "loss": 0.0388, + "num_input_tokens_seen": 33501280, + "step": 27515 + }, + { + "epoch": 3.0649292794297804, + "grad_norm": 1.5793004035949707, + "learning_rate": 4.9569507999351466e-05, + "loss": 0.1213, + "num_input_tokens_seen": 33507040, + "step": 27520 + }, + { + "epoch": 3.065486134313398, + "grad_norm": 0.42137569189071655, + "learning_rate": 4.9569058921660736e-05, + "loss": 0.0402, + "num_input_tokens_seen": 33512928, + "step": 27525 + }, + { + "epoch": 3.0660429891970153, + "grad_norm": 0.4072466492652893, + "learning_rate": 4.956860961189557e-05, + "loss": 0.0632, + "num_input_tokens_seen": 33519584, + "step": 27530 + }, + { + "epoch": 3.0665998440806326, + "grad_norm": 0.14840461313724518, + "learning_rate": 4.956816007006019e-05, + "loss": 0.0355, + "num_input_tokens_seen": 33525824, + "step": 27535 + }, + { + "epoch": 3.06715669896425, + "grad_norm": 2.2410221099853516, + "learning_rate": 4.956771029615885e-05, + "loss": 0.2721, + "num_input_tokens_seen": 33531840, + "step": 27540 + }, + { + "epoch": 3.067713553847867, + "grad_norm": 0.9156807065010071, + "learning_rate": 4.956726029019582e-05, + "loss": 0.1456, + "num_input_tokens_seen": 33538048, + "step": 27545 + }, + { + "epoch": 3.0682704087314847, + "grad_norm": 0.4811880588531494, + "learning_rate": 4.956681005217533e-05, + "loss": 0.0367, + "num_input_tokens_seen": 33544096, + "step": 27550 + }, + { + "epoch": 3.068827263615102, + "grad_norm": 0.8790661692619324, + "learning_rate": 4.956635958210163e-05, + "loss": 0.1027, + "num_input_tokens_seen": 33550176, + "step": 27555 + }, + { + "epoch": 3.069384118498719, + "grad_norm": 0.4864215850830078, + "learning_rate": 4.956590887997898e-05, + "loss": 0.174, + "num_input_tokens_seen": 33556224, + "step": 27560 + }, + { + "epoch": 3.0699409733823364, + "grad_norm": 0.21222007274627686, + "learning_rate": 4.956545794581165e-05, + "loss": 0.0653, + "num_input_tokens_seen": 33562592, + "step": 27565 + }, + { + "epoch": 3.070497828265954, + "grad_norm": 0.01589234545826912, + "learning_rate": 4.9565006779603873e-05, + "loss": 0.0429, + "num_input_tokens_seen": 33568864, + "step": 27570 + }, + { + "epoch": 3.0710546831495713, + "grad_norm": 0.011617991141974926, + "learning_rate": 4.9564555381359935e-05, + "loss": 0.0194, + "num_input_tokens_seen": 33575040, + "step": 27575 + }, + { + "epoch": 3.0716115380331885, + "grad_norm": 0.5351259112358093, + "learning_rate": 4.956410375108409e-05, + "loss": 0.1191, + "num_input_tokens_seen": 33580896, + "step": 27580 + }, + { + "epoch": 3.0721683929168058, + "grad_norm": 0.09548639506101608, + "learning_rate": 4.956365188878059e-05, + "loss": 0.1522, + "num_input_tokens_seen": 33586528, + "step": 27585 + }, + { + "epoch": 3.072725247800423, + "grad_norm": 0.08070652931928635, + "learning_rate": 4.956319979445374e-05, + "loss": 0.1581, + "num_input_tokens_seen": 33592608, + "step": 27590 + }, + { + "epoch": 3.0732821026840407, + "grad_norm": 0.2427646964788437, + "learning_rate": 4.956274746810777e-05, + "loss": 0.0877, + "num_input_tokens_seen": 33598400, + "step": 27595 + }, + { + "epoch": 3.073838957567658, + "grad_norm": 0.7850040793418884, + "learning_rate": 4.9562294909746984e-05, + "loss": 0.0929, + "num_input_tokens_seen": 33604896, + "step": 27600 + }, + { + "epoch": 3.074395812451275, + "grad_norm": 0.001668079406954348, + "learning_rate": 4.9561842119375645e-05, + "loss": 0.0316, + "num_input_tokens_seen": 33611200, + "step": 27605 + }, + { + "epoch": 3.0749526673348924, + "grad_norm": 0.9970601797103882, + "learning_rate": 4.9561389096998025e-05, + "loss": 0.0714, + "num_input_tokens_seen": 33617408, + "step": 27610 + }, + { + "epoch": 3.07550952221851, + "grad_norm": 1.5028343200683594, + "learning_rate": 4.95609358426184e-05, + "loss": 0.027, + "num_input_tokens_seen": 33623936, + "step": 27615 + }, + { + "epoch": 3.0760663771021273, + "grad_norm": 0.21710945665836334, + "learning_rate": 4.956048235624107e-05, + "loss": 0.0805, + "num_input_tokens_seen": 33630112, + "step": 27620 + }, + { + "epoch": 3.0766232319857445, + "grad_norm": 0.02695394493639469, + "learning_rate": 4.9560028637870294e-05, + "loss": 0.1091, + "num_input_tokens_seen": 33636288, + "step": 27625 + }, + { + "epoch": 3.0771800868693617, + "grad_norm": 0.33394408226013184, + "learning_rate": 4.955957468751037e-05, + "loss": 0.141, + "num_input_tokens_seen": 33642496, + "step": 27630 + }, + { + "epoch": 3.077736941752979, + "grad_norm": 0.10620196163654327, + "learning_rate": 4.9559120505165604e-05, + "loss": 0.0225, + "num_input_tokens_seen": 33648576, + "step": 27635 + }, + { + "epoch": 3.0782937966365966, + "grad_norm": 0.12062511593103409, + "learning_rate": 4.955866609084025e-05, + "loss": 0.2141, + "num_input_tokens_seen": 33654592, + "step": 27640 + }, + { + "epoch": 3.078850651520214, + "grad_norm": 0.6181296110153198, + "learning_rate": 4.9558211444538625e-05, + "loss": 0.1697, + "num_input_tokens_seen": 33660512, + "step": 27645 + }, + { + "epoch": 3.079407506403831, + "grad_norm": 0.13066886365413666, + "learning_rate": 4.955775656626502e-05, + "loss": 0.0629, + "num_input_tokens_seen": 33666496, + "step": 27650 + }, + { + "epoch": 3.0799643612874483, + "grad_norm": 0.09382151067256927, + "learning_rate": 4.9557301456023725e-05, + "loss": 0.1014, + "num_input_tokens_seen": 33672896, + "step": 27655 + }, + { + "epoch": 3.080521216171066, + "grad_norm": 0.3813905417919159, + "learning_rate": 4.955684611381904e-05, + "loss": 0.1943, + "num_input_tokens_seen": 33678496, + "step": 27660 + }, + { + "epoch": 3.0810780710546832, + "grad_norm": 0.08621197193861008, + "learning_rate": 4.955639053965527e-05, + "loss": 0.0745, + "num_input_tokens_seen": 33684672, + "step": 27665 + }, + { + "epoch": 3.0816349259383005, + "grad_norm": 0.0045444597490131855, + "learning_rate": 4.955593473353672e-05, + "loss": 0.1475, + "num_input_tokens_seen": 33691008, + "step": 27670 + }, + { + "epoch": 3.0821917808219177, + "grad_norm": 0.002037788275629282, + "learning_rate": 4.95554786954677e-05, + "loss": 0.1574, + "num_input_tokens_seen": 33697408, + "step": 27675 + }, + { + "epoch": 3.082748635705535, + "grad_norm": 0.6717821955680847, + "learning_rate": 4.95550224254525e-05, + "loss": 0.2219, + "num_input_tokens_seen": 33703296, + "step": 27680 + }, + { + "epoch": 3.0833054905891526, + "grad_norm": 0.7998072504997253, + "learning_rate": 4.9554565923495444e-05, + "loss": 0.1193, + "num_input_tokens_seen": 33709088, + "step": 27685 + }, + { + "epoch": 3.08386234547277, + "grad_norm": 0.6247497200965881, + "learning_rate": 4.9554109189600836e-05, + "loss": 0.0648, + "num_input_tokens_seen": 33715232, + "step": 27690 + }, + { + "epoch": 3.084419200356387, + "grad_norm": 0.2040608525276184, + "learning_rate": 4.9553652223773e-05, + "loss": 0.0107, + "num_input_tokens_seen": 33721440, + "step": 27695 + }, + { + "epoch": 3.0849760552400043, + "grad_norm": 0.013266063295304775, + "learning_rate": 4.955319502601624e-05, + "loss": 0.1049, + "num_input_tokens_seen": 33727520, + "step": 27700 + }, + { + "epoch": 3.085532910123622, + "grad_norm": 0.05304087698459625, + "learning_rate": 4.955273759633488e-05, + "loss": 0.0496, + "num_input_tokens_seen": 33733664, + "step": 27705 + }, + { + "epoch": 3.086089765007239, + "grad_norm": 0.5530469417572021, + "learning_rate": 4.955227993473326e-05, + "loss": 0.1867, + "num_input_tokens_seen": 33739136, + "step": 27710 + }, + { + "epoch": 3.0866466198908564, + "grad_norm": 1.1958822011947632, + "learning_rate": 4.955182204121567e-05, + "loss": 0.0841, + "num_input_tokens_seen": 33745216, + "step": 27715 + }, + { + "epoch": 3.0872034747744737, + "grad_norm": 0.8098995685577393, + "learning_rate": 4.9551363915786456e-05, + "loss": 0.1531, + "num_input_tokens_seen": 33751424, + "step": 27720 + }, + { + "epoch": 3.0877603296580913, + "grad_norm": 0.021106228232383728, + "learning_rate": 4.9550905558449934e-05, + "loss": 0.0885, + "num_input_tokens_seen": 33757664, + "step": 27725 + }, + { + "epoch": 3.0883171845417086, + "grad_norm": 0.07541981339454651, + "learning_rate": 4.955044696921044e-05, + "loss": 0.0531, + "num_input_tokens_seen": 33763712, + "step": 27730 + }, + { + "epoch": 3.088874039425326, + "grad_norm": 0.08521560579538345, + "learning_rate": 4.954998814807231e-05, + "loss": 0.1486, + "num_input_tokens_seen": 33769792, + "step": 27735 + }, + { + "epoch": 3.089430894308943, + "grad_norm": 0.607837975025177, + "learning_rate": 4.9549529095039865e-05, + "loss": 0.1009, + "num_input_tokens_seen": 33776224, + "step": 27740 + }, + { + "epoch": 3.0899877491925603, + "grad_norm": 0.1376909613609314, + "learning_rate": 4.9549069810117454e-05, + "loss": 0.051, + "num_input_tokens_seen": 33782112, + "step": 27745 + }, + { + "epoch": 3.090544604076178, + "grad_norm": 1.1578750610351562, + "learning_rate": 4.9548610293309406e-05, + "loss": 0.1172, + "num_input_tokens_seen": 33788192, + "step": 27750 + }, + { + "epoch": 3.091101458959795, + "grad_norm": 1.0489649772644043, + "learning_rate": 4.954815054462007e-05, + "loss": 0.0677, + "num_input_tokens_seen": 33794368, + "step": 27755 + }, + { + "epoch": 3.0916583138434124, + "grad_norm": 0.018458575010299683, + "learning_rate": 4.954769056405378e-05, + "loss": 0.078, + "num_input_tokens_seen": 33800672, + "step": 27760 + }, + { + "epoch": 3.0922151687270296, + "grad_norm": 0.28598475456237793, + "learning_rate": 4.954723035161489e-05, + "loss": 0.0258, + "num_input_tokens_seen": 33806880, + "step": 27765 + }, + { + "epoch": 3.092772023610647, + "grad_norm": 0.3970455229282379, + "learning_rate": 4.9546769907307744e-05, + "loss": 0.0585, + "num_input_tokens_seen": 33813024, + "step": 27770 + }, + { + "epoch": 3.0933288784942645, + "grad_norm": 0.22869379818439484, + "learning_rate": 4.954630923113669e-05, + "loss": 0.0962, + "num_input_tokens_seen": 33819040, + "step": 27775 + }, + { + "epoch": 3.0938857333778818, + "grad_norm": 0.030182018876075745, + "learning_rate": 4.954584832310607e-05, + "loss": 0.0871, + "num_input_tokens_seen": 33825088, + "step": 27780 + }, + { + "epoch": 3.094442588261499, + "grad_norm": 2.165980100631714, + "learning_rate": 4.954538718322026e-05, + "loss": 0.2204, + "num_input_tokens_seen": 33830912, + "step": 27785 + }, + { + "epoch": 3.094999443145116, + "grad_norm": 0.4834381341934204, + "learning_rate": 4.954492581148359e-05, + "loss": 0.115, + "num_input_tokens_seen": 33836608, + "step": 27790 + }, + { + "epoch": 3.095556298028734, + "grad_norm": 0.5683470368385315, + "learning_rate": 4.954446420790044e-05, + "loss": 0.0773, + "num_input_tokens_seen": 33842688, + "step": 27795 + }, + { + "epoch": 3.096113152912351, + "grad_norm": 0.8283392190933228, + "learning_rate": 4.954400237247515e-05, + "loss": 0.0986, + "num_input_tokens_seen": 33848640, + "step": 27800 + }, + { + "epoch": 3.0966700077959683, + "grad_norm": 0.41718390583992004, + "learning_rate": 4.954354030521211e-05, + "loss": 0.0896, + "num_input_tokens_seen": 33854368, + "step": 27805 + }, + { + "epoch": 3.0972268626795856, + "grad_norm": 0.025079287588596344, + "learning_rate": 4.954307800611565e-05, + "loss": 0.037, + "num_input_tokens_seen": 33860512, + "step": 27810 + }, + { + "epoch": 3.0977837175632033, + "grad_norm": 0.800671398639679, + "learning_rate": 4.954261547519017e-05, + "loss": 0.0663, + "num_input_tokens_seen": 33866720, + "step": 27815 + }, + { + "epoch": 3.0983405724468205, + "grad_norm": 1.2219789028167725, + "learning_rate": 4.954215271244002e-05, + "loss": 0.0654, + "num_input_tokens_seen": 33872704, + "step": 27820 + }, + { + "epoch": 3.0988974273304377, + "grad_norm": 0.6377710103988647, + "learning_rate": 4.954168971786957e-05, + "loss": 0.0895, + "num_input_tokens_seen": 33878656, + "step": 27825 + }, + { + "epoch": 3.099454282214055, + "grad_norm": 0.41375935077667236, + "learning_rate": 4.9541226491483194e-05, + "loss": 0.1114, + "num_input_tokens_seen": 33884416, + "step": 27830 + }, + { + "epoch": 3.100011137097672, + "grad_norm": 0.39071667194366455, + "learning_rate": 4.9540763033285275e-05, + "loss": 0.057, + "num_input_tokens_seen": 33890560, + "step": 27835 + }, + { + "epoch": 3.10056799198129, + "grad_norm": 0.41581931710243225, + "learning_rate": 4.954029934328019e-05, + "loss": 0.1025, + "num_input_tokens_seen": 33896544, + "step": 27840 + }, + { + "epoch": 3.101124846864907, + "grad_norm": 0.43264102935791016, + "learning_rate": 4.953983542147231e-05, + "loss": 0.1696, + "num_input_tokens_seen": 33901856, + "step": 27845 + }, + { + "epoch": 3.1016817017485243, + "grad_norm": 0.41917797923088074, + "learning_rate": 4.953937126786603e-05, + "loss": 0.0294, + "num_input_tokens_seen": 33907808, + "step": 27850 + }, + { + "epoch": 3.1022385566321415, + "grad_norm": 0.06142646446824074, + "learning_rate": 4.953890688246573e-05, + "loss": 0.1584, + "num_input_tokens_seen": 33913696, + "step": 27855 + }, + { + "epoch": 3.1027954115157588, + "grad_norm": 0.36757194995880127, + "learning_rate": 4.953844226527579e-05, + "loss": 0.031, + "num_input_tokens_seen": 33919776, + "step": 27860 + }, + { + "epoch": 3.1033522663993764, + "grad_norm": 0.18728919327259064, + "learning_rate": 4.953797741630061e-05, + "loss": 0.0206, + "num_input_tokens_seen": 33925952, + "step": 27865 + }, + { + "epoch": 3.1039091212829937, + "grad_norm": 0.11005126684904099, + "learning_rate": 4.9537512335544564e-05, + "loss": 0.043, + "num_input_tokens_seen": 33931968, + "step": 27870 + }, + { + "epoch": 3.104465976166611, + "grad_norm": 0.23614929616451263, + "learning_rate": 4.953704702301206e-05, + "loss": 0.1378, + "num_input_tokens_seen": 33937728, + "step": 27875 + }, + { + "epoch": 3.105022831050228, + "grad_norm": 0.03294840082526207, + "learning_rate": 4.953658147870749e-05, + "loss": 0.084, + "num_input_tokens_seen": 33943872, + "step": 27880 + }, + { + "epoch": 3.105579685933846, + "grad_norm": 0.6786644458770752, + "learning_rate": 4.9536115702635245e-05, + "loss": 0.0904, + "num_input_tokens_seen": 33949888, + "step": 27885 + }, + { + "epoch": 3.106136540817463, + "grad_norm": 1.4982099533081055, + "learning_rate": 4.953564969479972e-05, + "loss": 0.0514, + "num_input_tokens_seen": 33956000, + "step": 27890 + }, + { + "epoch": 3.1066933957010803, + "grad_norm": 0.18159087002277374, + "learning_rate": 4.9535183455205345e-05, + "loss": 0.1482, + "num_input_tokens_seen": 33962400, + "step": 27895 + }, + { + "epoch": 3.1072502505846975, + "grad_norm": 0.07827091217041016, + "learning_rate": 4.95347169838565e-05, + "loss": 0.0056, + "num_input_tokens_seen": 33968544, + "step": 27900 + }, + { + "epoch": 3.107807105468315, + "grad_norm": 0.27342280745506287, + "learning_rate": 4.953425028075759e-05, + "loss": 0.051, + "num_input_tokens_seen": 33974688, + "step": 27905 + }, + { + "epoch": 3.1083639603519324, + "grad_norm": 1.4632179737091064, + "learning_rate": 4.953378334591303e-05, + "loss": 0.1436, + "num_input_tokens_seen": 33980672, + "step": 27910 + }, + { + "epoch": 3.1089208152355496, + "grad_norm": 0.11229285597801208, + "learning_rate": 4.9533316179327235e-05, + "loss": 0.0583, + "num_input_tokens_seen": 33986784, + "step": 27915 + }, + { + "epoch": 3.109477670119167, + "grad_norm": 1.9461965560913086, + "learning_rate": 4.953284878100461e-05, + "loss": 0.1231, + "num_input_tokens_seen": 33992736, + "step": 27920 + }, + { + "epoch": 3.110034525002784, + "grad_norm": 0.5429057478904724, + "learning_rate": 4.953238115094957e-05, + "loss": 0.092, + "num_input_tokens_seen": 33998176, + "step": 27925 + }, + { + "epoch": 3.1105913798864018, + "grad_norm": 0.3926500082015991, + "learning_rate": 4.953191328916654e-05, + "loss": 0.0626, + "num_input_tokens_seen": 34003680, + "step": 27930 + }, + { + "epoch": 3.111148234770019, + "grad_norm": 1.1754437685012817, + "learning_rate": 4.953144519565993e-05, + "loss": 0.0832, + "num_input_tokens_seen": 34009824, + "step": 27935 + }, + { + "epoch": 3.1117050896536362, + "grad_norm": 0.07891648262739182, + "learning_rate": 4.953097687043417e-05, + "loss": 0.0744, + "num_input_tokens_seen": 34015712, + "step": 27940 + }, + { + "epoch": 3.1122619445372535, + "grad_norm": 0.21270811557769775, + "learning_rate": 4.953050831349368e-05, + "loss": 0.0378, + "num_input_tokens_seen": 34021696, + "step": 27945 + }, + { + "epoch": 3.1128187994208707, + "grad_norm": 0.032332003116607666, + "learning_rate": 4.953003952484289e-05, + "loss": 0.0937, + "num_input_tokens_seen": 34027904, + "step": 27950 + }, + { + "epoch": 3.1133756543044884, + "grad_norm": 0.43899455666542053, + "learning_rate": 4.952957050448621e-05, + "loss": 0.1597, + "num_input_tokens_seen": 34034240, + "step": 27955 + }, + { + "epoch": 3.1139325091881056, + "grad_norm": 0.518489420413971, + "learning_rate": 4.952910125242809e-05, + "loss": 0.0865, + "num_input_tokens_seen": 34040384, + "step": 27960 + }, + { + "epoch": 3.114489364071723, + "grad_norm": 1.2417151927947998, + "learning_rate": 4.9528631768672964e-05, + "loss": 0.0907, + "num_input_tokens_seen": 34046592, + "step": 27965 + }, + { + "epoch": 3.11504621895534, + "grad_norm": 0.6450989246368408, + "learning_rate": 4.952816205322525e-05, + "loss": 0.2106, + "num_input_tokens_seen": 34053280, + "step": 27970 + }, + { + "epoch": 3.1156030738389577, + "grad_norm": 0.06875568628311157, + "learning_rate": 4.9527692106089394e-05, + "loss": 0.0705, + "num_input_tokens_seen": 34059232, + "step": 27975 + }, + { + "epoch": 3.116159928722575, + "grad_norm": 0.11045821756124496, + "learning_rate": 4.952722192726984e-05, + "loss": 0.1527, + "num_input_tokens_seen": 34065440, + "step": 27980 + }, + { + "epoch": 3.116716783606192, + "grad_norm": 0.4623975455760956, + "learning_rate": 4.952675151677102e-05, + "loss": 0.0789, + "num_input_tokens_seen": 34071424, + "step": 27985 + }, + { + "epoch": 3.1172736384898094, + "grad_norm": 0.36799144744873047, + "learning_rate": 4.952628087459738e-05, + "loss": 0.0936, + "num_input_tokens_seen": 34077504, + "step": 27990 + }, + { + "epoch": 3.117830493373427, + "grad_norm": 0.4366967976093292, + "learning_rate": 4.952581000075337e-05, + "loss": 0.0375, + "num_input_tokens_seen": 34083744, + "step": 27995 + }, + { + "epoch": 3.1183873482570443, + "grad_norm": 0.03456490859389305, + "learning_rate": 4.9525338895243436e-05, + "loss": 0.0456, + "num_input_tokens_seen": 34090016, + "step": 28000 + }, + { + "epoch": 3.1189442031406616, + "grad_norm": 0.01774917170405388, + "learning_rate": 4.952486755807202e-05, + "loss": 0.1742, + "num_input_tokens_seen": 34095872, + "step": 28005 + }, + { + "epoch": 3.119501058024279, + "grad_norm": 0.006802470423281193, + "learning_rate": 4.952439598924359e-05, + "loss": 0.0777, + "num_input_tokens_seen": 34102048, + "step": 28010 + }, + { + "epoch": 3.120057912907896, + "grad_norm": 0.8283630013465881, + "learning_rate": 4.952392418876258e-05, + "loss": 0.1669, + "num_input_tokens_seen": 34107584, + "step": 28015 + }, + { + "epoch": 3.1206147677915137, + "grad_norm": 0.25191667675971985, + "learning_rate": 4.9523452156633465e-05, + "loss": 0.0357, + "num_input_tokens_seen": 34113728, + "step": 28020 + }, + { + "epoch": 3.121171622675131, + "grad_norm": 0.92848801612854, + "learning_rate": 4.95229798928607e-05, + "loss": 0.1204, + "num_input_tokens_seen": 34119872, + "step": 28025 + }, + { + "epoch": 3.121728477558748, + "grad_norm": 1.0253735780715942, + "learning_rate": 4.9522507397448735e-05, + "loss": 0.1472, + "num_input_tokens_seen": 34126336, + "step": 28030 + }, + { + "epoch": 3.1222853324423654, + "grad_norm": 0.607475757598877, + "learning_rate": 4.9522034670402045e-05, + "loss": 0.0538, + "num_input_tokens_seen": 34132288, + "step": 28035 + }, + { + "epoch": 3.1228421873259826, + "grad_norm": 0.19292078912258148, + "learning_rate": 4.9521561711725096e-05, + "loss": 0.0308, + "num_input_tokens_seen": 34138624, + "step": 28040 + }, + { + "epoch": 3.1233990422096003, + "grad_norm": 0.19811667501926422, + "learning_rate": 4.9521088521422345e-05, + "loss": 0.0941, + "num_input_tokens_seen": 34144512, + "step": 28045 + }, + { + "epoch": 3.1239558970932175, + "grad_norm": 1.8793504238128662, + "learning_rate": 4.952061509949826e-05, + "loss": 0.1956, + "num_input_tokens_seen": 34150624, + "step": 28050 + }, + { + "epoch": 3.1245127519768348, + "grad_norm": 0.6845636367797852, + "learning_rate": 4.952014144595732e-05, + "loss": 0.0595, + "num_input_tokens_seen": 34156832, + "step": 28055 + }, + { + "epoch": 3.125069606860452, + "grad_norm": 0.004607074428349733, + "learning_rate": 4.951966756080401e-05, + "loss": 0.0243, + "num_input_tokens_seen": 34163008, + "step": 28060 + }, + { + "epoch": 3.1256264617440697, + "grad_norm": 2.008929491043091, + "learning_rate": 4.951919344404279e-05, + "loss": 0.1663, + "num_input_tokens_seen": 34169088, + "step": 28065 + }, + { + "epoch": 3.126183316627687, + "grad_norm": 0.0032738882582634687, + "learning_rate": 4.951871909567815e-05, + "loss": 0.0949, + "num_input_tokens_seen": 34175136, + "step": 28070 + }, + { + "epoch": 3.126740171511304, + "grad_norm": 0.27200278639793396, + "learning_rate": 4.951824451571455e-05, + "loss": 0.1826, + "num_input_tokens_seen": 34181088, + "step": 28075 + }, + { + "epoch": 3.1272970263949214, + "grad_norm": 0.3498266637325287, + "learning_rate": 4.95177697041565e-05, + "loss": 0.0761, + "num_input_tokens_seen": 34187456, + "step": 28080 + }, + { + "epoch": 3.127853881278539, + "grad_norm": 0.5164651870727539, + "learning_rate": 4.9517294661008464e-05, + "loss": 0.0233, + "num_input_tokens_seen": 34193536, + "step": 28085 + }, + { + "epoch": 3.1284107361621563, + "grad_norm": 0.17010954022407532, + "learning_rate": 4.951681938627494e-05, + "loss": 0.1153, + "num_input_tokens_seen": 34199360, + "step": 28090 + }, + { + "epoch": 3.1289675910457735, + "grad_norm": 0.8718324899673462, + "learning_rate": 4.9516343879960414e-05, + "loss": 0.1217, + "num_input_tokens_seen": 34205184, + "step": 28095 + }, + { + "epoch": 3.1295244459293907, + "grad_norm": 0.2488013505935669, + "learning_rate": 4.951586814206938e-05, + "loss": 0.0782, + "num_input_tokens_seen": 34210848, + "step": 28100 + }, + { + "epoch": 3.130081300813008, + "grad_norm": 0.13633756339550018, + "learning_rate": 4.951539217260632e-05, + "loss": 0.1208, + "num_input_tokens_seen": 34216672, + "step": 28105 + }, + { + "epoch": 3.1306381556966256, + "grad_norm": 0.005838354583829641, + "learning_rate": 4.951491597157575e-05, + "loss": 0.0556, + "num_input_tokens_seen": 34222784, + "step": 28110 + }, + { + "epoch": 3.131195010580243, + "grad_norm": 0.1809234470129013, + "learning_rate": 4.951443953898215e-05, + "loss": 0.1448, + "num_input_tokens_seen": 34228448, + "step": 28115 + }, + { + "epoch": 3.13175186546386, + "grad_norm": 0.11169760674238205, + "learning_rate": 4.951396287483003e-05, + "loss": 0.0186, + "num_input_tokens_seen": 34234336, + "step": 28120 + }, + { + "epoch": 3.1323087203474773, + "grad_norm": 0.11909159272909164, + "learning_rate": 4.95134859791239e-05, + "loss": 0.015, + "num_input_tokens_seen": 34240512, + "step": 28125 + }, + { + "epoch": 3.1328655752310945, + "grad_norm": 0.08001697063446045, + "learning_rate": 4.9513008851868245e-05, + "loss": 0.0916, + "num_input_tokens_seen": 34246560, + "step": 28130 + }, + { + "epoch": 3.133422430114712, + "grad_norm": 0.01798734813928604, + "learning_rate": 4.9512531493067584e-05, + "loss": 0.0837, + "num_input_tokens_seen": 34252736, + "step": 28135 + }, + { + "epoch": 3.1339792849983295, + "grad_norm": 0.7323899865150452, + "learning_rate": 4.951205390272642e-05, + "loss": 0.1232, + "num_input_tokens_seen": 34258592, + "step": 28140 + }, + { + "epoch": 3.1345361398819467, + "grad_norm": 0.23596781492233276, + "learning_rate": 4.951157608084928e-05, + "loss": 0.0908, + "num_input_tokens_seen": 34264672, + "step": 28145 + }, + { + "epoch": 3.135092994765564, + "grad_norm": 0.16375435888767242, + "learning_rate": 4.951109802744066e-05, + "loss": 0.0235, + "num_input_tokens_seen": 34270816, + "step": 28150 + }, + { + "epoch": 3.1356498496491816, + "grad_norm": 0.21132761240005493, + "learning_rate": 4.951061974250507e-05, + "loss": 0.1478, + "num_input_tokens_seen": 34277248, + "step": 28155 + }, + { + "epoch": 3.136206704532799, + "grad_norm": 0.20029976963996887, + "learning_rate": 4.951014122604705e-05, + "loss": 0.1495, + "num_input_tokens_seen": 34283456, + "step": 28160 + }, + { + "epoch": 3.136763559416416, + "grad_norm": 1.0486063957214355, + "learning_rate": 4.950966247807111e-05, + "loss": 0.1431, + "num_input_tokens_seen": 34288896, + "step": 28165 + }, + { + "epoch": 3.1373204143000333, + "grad_norm": 0.010234811343252659, + "learning_rate": 4.950918349858177e-05, + "loss": 0.0239, + "num_input_tokens_seen": 34294976, + "step": 28170 + }, + { + "epoch": 3.137877269183651, + "grad_norm": 0.9465296864509583, + "learning_rate": 4.950870428758355e-05, + "loss": 0.0933, + "num_input_tokens_seen": 34300800, + "step": 28175 + }, + { + "epoch": 3.138434124067268, + "grad_norm": 0.32099977135658264, + "learning_rate": 4.9508224845080984e-05, + "loss": 0.1116, + "num_input_tokens_seen": 34306944, + "step": 28180 + }, + { + "epoch": 3.1389909789508854, + "grad_norm": 0.05524606257677078, + "learning_rate": 4.95077451710786e-05, + "loss": 0.0389, + "num_input_tokens_seen": 34312704, + "step": 28185 + }, + { + "epoch": 3.1395478338345026, + "grad_norm": 0.26526176929473877, + "learning_rate": 4.950726526558093e-05, + "loss": 0.0706, + "num_input_tokens_seen": 34318816, + "step": 28190 + }, + { + "epoch": 3.14010468871812, + "grad_norm": 0.12689124047756195, + "learning_rate": 4.95067851285925e-05, + "loss": 0.0776, + "num_input_tokens_seen": 34324896, + "step": 28195 + }, + { + "epoch": 3.1406615436017375, + "grad_norm": 0.1649261862039566, + "learning_rate": 4.9506304760117855e-05, + "loss": 0.1101, + "num_input_tokens_seen": 34331040, + "step": 28200 + }, + { + "epoch": 3.141218398485355, + "grad_norm": 0.053389355540275574, + "learning_rate": 4.950582416016153e-05, + "loss": 0.0588, + "num_input_tokens_seen": 34337184, + "step": 28205 + }, + { + "epoch": 3.141775253368972, + "grad_norm": 0.4483468532562256, + "learning_rate": 4.950534332872805e-05, + "loss": 0.0575, + "num_input_tokens_seen": 34343552, + "step": 28210 + }, + { + "epoch": 3.1423321082525892, + "grad_norm": 2.8718490600585938, + "learning_rate": 4.9504862265821975e-05, + "loss": 0.0819, + "num_input_tokens_seen": 34349632, + "step": 28215 + }, + { + "epoch": 3.1428889631362065, + "grad_norm": 0.1833098977804184, + "learning_rate": 4.950438097144785e-05, + "loss": 0.0659, + "num_input_tokens_seen": 34355712, + "step": 28220 + }, + { + "epoch": 3.143445818019824, + "grad_norm": 0.37010154128074646, + "learning_rate": 4.95038994456102e-05, + "loss": 0.036, + "num_input_tokens_seen": 34361536, + "step": 28225 + }, + { + "epoch": 3.1440026729034414, + "grad_norm": 1.7232214212417603, + "learning_rate": 4.95034176883136e-05, + "loss": 0.2728, + "num_input_tokens_seen": 34367488, + "step": 28230 + }, + { + "epoch": 3.1445595277870586, + "grad_norm": 0.2860882580280304, + "learning_rate": 4.950293569956258e-05, + "loss": 0.1392, + "num_input_tokens_seen": 34373568, + "step": 28235 + }, + { + "epoch": 3.145116382670676, + "grad_norm": 0.5004854798316956, + "learning_rate": 4.950245347936171e-05, + "loss": 0.0584, + "num_input_tokens_seen": 34379840, + "step": 28240 + }, + { + "epoch": 3.1456732375542935, + "grad_norm": 0.300533264875412, + "learning_rate": 4.950197102771553e-05, + "loss": 0.112, + "num_input_tokens_seen": 34385952, + "step": 28245 + }, + { + "epoch": 3.1462300924379107, + "grad_norm": 0.7737550139427185, + "learning_rate": 4.9501488344628596e-05, + "loss": 0.0511, + "num_input_tokens_seen": 34391968, + "step": 28250 + }, + { + "epoch": 3.146786947321528, + "grad_norm": 0.26345235109329224, + "learning_rate": 4.950100543010548e-05, + "loss": 0.0603, + "num_input_tokens_seen": 34397632, + "step": 28255 + }, + { + "epoch": 3.147343802205145, + "grad_norm": 0.6843783855438232, + "learning_rate": 4.9500522284150746e-05, + "loss": 0.0925, + "num_input_tokens_seen": 34403872, + "step": 28260 + }, + { + "epoch": 3.147900657088763, + "grad_norm": 0.2030305713415146, + "learning_rate": 4.9500038906768944e-05, + "loss": 0.0597, + "num_input_tokens_seen": 34410112, + "step": 28265 + }, + { + "epoch": 3.14845751197238, + "grad_norm": 0.18766385316848755, + "learning_rate": 4.949955529796464e-05, + "loss": 0.0197, + "num_input_tokens_seen": 34416224, + "step": 28270 + }, + { + "epoch": 3.1490143668559973, + "grad_norm": 0.387113094329834, + "learning_rate": 4.949907145774242e-05, + "loss": 0.0196, + "num_input_tokens_seen": 34422176, + "step": 28275 + }, + { + "epoch": 3.1495712217396146, + "grad_norm": 0.2756085693836212, + "learning_rate": 4.949858738610683e-05, + "loss": 0.0459, + "num_input_tokens_seen": 34428448, + "step": 28280 + }, + { + "epoch": 3.150128076623232, + "grad_norm": 0.322762131690979, + "learning_rate": 4.949810308306246e-05, + "loss": 0.0308, + "num_input_tokens_seen": 34434528, + "step": 28285 + }, + { + "epoch": 3.1506849315068495, + "grad_norm": 1.0118716955184937, + "learning_rate": 4.9497618548613876e-05, + "loss": 0.1217, + "num_input_tokens_seen": 34440608, + "step": 28290 + }, + { + "epoch": 3.1512417863904667, + "grad_norm": 0.5124029517173767, + "learning_rate": 4.949713378276566e-05, + "loss": 0.0177, + "num_input_tokens_seen": 34446528, + "step": 28295 + }, + { + "epoch": 3.151798641274084, + "grad_norm": 0.47787272930145264, + "learning_rate": 4.9496648785522385e-05, + "loss": 0.0923, + "num_input_tokens_seen": 34452544, + "step": 28300 + }, + { + "epoch": 3.152355496157701, + "grad_norm": 0.13759571313858032, + "learning_rate": 4.9496163556888636e-05, + "loss": 0.1149, + "num_input_tokens_seen": 34458592, + "step": 28305 + }, + { + "epoch": 3.152912351041319, + "grad_norm": 0.032291702926158905, + "learning_rate": 4.9495678096869e-05, + "loss": 0.0244, + "num_input_tokens_seen": 34464480, + "step": 28310 + }, + { + "epoch": 3.153469205924936, + "grad_norm": 0.23895102739334106, + "learning_rate": 4.9495192405468056e-05, + "loss": 0.0859, + "num_input_tokens_seen": 34470496, + "step": 28315 + }, + { + "epoch": 3.1540260608085533, + "grad_norm": 0.28035688400268555, + "learning_rate": 4.9494706482690394e-05, + "loss": 0.1622, + "num_input_tokens_seen": 34476224, + "step": 28320 + }, + { + "epoch": 3.1545829156921705, + "grad_norm": 0.071234792470932, + "learning_rate": 4.9494220328540607e-05, + "loss": 0.1282, + "num_input_tokens_seen": 34482176, + "step": 28325 + }, + { + "epoch": 3.1551397705757878, + "grad_norm": 0.9997285008430481, + "learning_rate": 4.949373394302328e-05, + "loss": 0.2316, + "num_input_tokens_seen": 34488384, + "step": 28330 + }, + { + "epoch": 3.1556966254594054, + "grad_norm": 0.6092976331710815, + "learning_rate": 4.9493247326143014e-05, + "loss": 0.0673, + "num_input_tokens_seen": 34494464, + "step": 28335 + }, + { + "epoch": 3.1562534803430227, + "grad_norm": 0.008383302949368954, + "learning_rate": 4.949276047790441e-05, + "loss": 0.0357, + "num_input_tokens_seen": 34500736, + "step": 28340 + }, + { + "epoch": 3.15681033522664, + "grad_norm": 0.091441810131073, + "learning_rate": 4.949227339831205e-05, + "loss": 0.058, + "num_input_tokens_seen": 34506976, + "step": 28345 + }, + { + "epoch": 3.157367190110257, + "grad_norm": 0.20484137535095215, + "learning_rate": 4.949178608737055e-05, + "loss": 0.0732, + "num_input_tokens_seen": 34513312, + "step": 28350 + }, + { + "epoch": 3.157924044993875, + "grad_norm": 0.3112584352493286, + "learning_rate": 4.9491298545084505e-05, + "loss": 0.0572, + "num_input_tokens_seen": 34519488, + "step": 28355 + }, + { + "epoch": 3.158480899877492, + "grad_norm": 0.6539413332939148, + "learning_rate": 4.949081077145853e-05, + "loss": 0.0663, + "num_input_tokens_seen": 34525664, + "step": 28360 + }, + { + "epoch": 3.1590377547611093, + "grad_norm": 0.5524054169654846, + "learning_rate": 4.949032276649722e-05, + "loss": 0.0317, + "num_input_tokens_seen": 34531520, + "step": 28365 + }, + { + "epoch": 3.1595946096447265, + "grad_norm": 0.9672690033912659, + "learning_rate": 4.9489834530205194e-05, + "loss": 0.1271, + "num_input_tokens_seen": 34537920, + "step": 28370 + }, + { + "epoch": 3.1601514645283437, + "grad_norm": 1.7130488157272339, + "learning_rate": 4.9489346062587054e-05, + "loss": 0.0438, + "num_input_tokens_seen": 34544224, + "step": 28375 + }, + { + "epoch": 3.1607083194119614, + "grad_norm": 0.9823734760284424, + "learning_rate": 4.948885736364742e-05, + "loss": 0.0614, + "num_input_tokens_seen": 34550272, + "step": 28380 + }, + { + "epoch": 3.1612651742955786, + "grad_norm": 0.11600115895271301, + "learning_rate": 4.948836843339091e-05, + "loss": 0.0665, + "num_input_tokens_seen": 34556448, + "step": 28385 + }, + { + "epoch": 3.161822029179196, + "grad_norm": 1.1028460264205933, + "learning_rate": 4.948787927182214e-05, + "loss": 0.1019, + "num_input_tokens_seen": 34562976, + "step": 28390 + }, + { + "epoch": 3.162378884062813, + "grad_norm": 0.5676161646842957, + "learning_rate": 4.948738987894574e-05, + "loss": 0.0547, + "num_input_tokens_seen": 34569152, + "step": 28395 + }, + { + "epoch": 3.1629357389464308, + "grad_norm": 1.5131394863128662, + "learning_rate": 4.948690025476631e-05, + "loss": 0.2813, + "num_input_tokens_seen": 34575392, + "step": 28400 + }, + { + "epoch": 3.163492593830048, + "grad_norm": 3.169018507003784, + "learning_rate": 4.9486410399288494e-05, + "loss": 0.212, + "num_input_tokens_seen": 34581504, + "step": 28405 + }, + { + "epoch": 3.1640494487136652, + "grad_norm": 0.6204661726951599, + "learning_rate": 4.948592031251692e-05, + "loss": 0.1508, + "num_input_tokens_seen": 34587680, + "step": 28410 + }, + { + "epoch": 3.1646063035972825, + "grad_norm": 0.12400985509157181, + "learning_rate": 4.94854299944562e-05, + "loss": 0.0337, + "num_input_tokens_seen": 34593984, + "step": 28415 + }, + { + "epoch": 3.1651631584808997, + "grad_norm": 0.03769733011722565, + "learning_rate": 4.948493944511099e-05, + "loss": 0.0431, + "num_input_tokens_seen": 34599872, + "step": 28420 + }, + { + "epoch": 3.1657200133645174, + "grad_norm": 0.796919584274292, + "learning_rate": 4.94844486644859e-05, + "loss": 0.1751, + "num_input_tokens_seen": 34605472, + "step": 28425 + }, + { + "epoch": 3.1662768682481346, + "grad_norm": 0.5715917348861694, + "learning_rate": 4.9483957652585575e-05, + "loss": 0.0581, + "num_input_tokens_seen": 34611520, + "step": 28430 + }, + { + "epoch": 3.166833723131752, + "grad_norm": 0.39724764227867126, + "learning_rate": 4.948346640941465e-05, + "loss": 0.052, + "num_input_tokens_seen": 34617568, + "step": 28435 + }, + { + "epoch": 3.167390578015369, + "grad_norm": 0.1445440798997879, + "learning_rate": 4.948297493497778e-05, + "loss": 0.0358, + "num_input_tokens_seen": 34623328, + "step": 28440 + }, + { + "epoch": 3.1679474328989867, + "grad_norm": 0.05621585622429848, + "learning_rate": 4.948248322927959e-05, + "loss": 0.0988, + "num_input_tokens_seen": 34629440, + "step": 28445 + }, + { + "epoch": 3.168504287782604, + "grad_norm": 0.5657156109809875, + "learning_rate": 4.948199129232473e-05, + "loss": 0.0883, + "num_input_tokens_seen": 34634848, + "step": 28450 + }, + { + "epoch": 3.169061142666221, + "grad_norm": 0.13329903781414032, + "learning_rate": 4.9481499124117846e-05, + "loss": 0.0227, + "num_input_tokens_seen": 34641248, + "step": 28455 + }, + { + "epoch": 3.1696179975498384, + "grad_norm": 1.4219809770584106, + "learning_rate": 4.9481006724663594e-05, + "loss": 0.1749, + "num_input_tokens_seen": 34647296, + "step": 28460 + }, + { + "epoch": 3.1701748524334556, + "grad_norm": 3.0617055892944336, + "learning_rate": 4.948051409396662e-05, + "loss": 0.164, + "num_input_tokens_seen": 34653440, + "step": 28465 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 1.0459144115447998, + "learning_rate": 4.948002123203157e-05, + "loss": 0.1145, + "num_input_tokens_seen": 34659328, + "step": 28470 + }, + { + "epoch": 3.1712885622006906, + "grad_norm": 0.7147595882415771, + "learning_rate": 4.947952813886312e-05, + "loss": 0.0944, + "num_input_tokens_seen": 34665440, + "step": 28475 + }, + { + "epoch": 3.171845417084308, + "grad_norm": 1.3051093816757202, + "learning_rate": 4.94790348144659e-05, + "loss": 0.0475, + "num_input_tokens_seen": 34671584, + "step": 28480 + }, + { + "epoch": 3.172402271967925, + "grad_norm": 0.18973910808563232, + "learning_rate": 4.947854125884459e-05, + "loss": 0.1957, + "num_input_tokens_seen": 34677184, + "step": 28485 + }, + { + "epoch": 3.1729591268515427, + "grad_norm": 0.9042817950248718, + "learning_rate": 4.947804747200384e-05, + "loss": 0.1844, + "num_input_tokens_seen": 34683424, + "step": 28490 + }, + { + "epoch": 3.17351598173516, + "grad_norm": 0.35598456859588623, + "learning_rate": 4.947755345394833e-05, + "loss": 0.107, + "num_input_tokens_seen": 34689440, + "step": 28495 + }, + { + "epoch": 3.174072836618777, + "grad_norm": 0.5541809797286987, + "learning_rate": 4.947705920468271e-05, + "loss": 0.08, + "num_input_tokens_seen": 34695648, + "step": 28500 + }, + { + "epoch": 3.1746296915023944, + "grad_norm": 1.228678584098816, + "learning_rate": 4.9476564724211653e-05, + "loss": 0.0527, + "num_input_tokens_seen": 34701952, + "step": 28505 + }, + { + "epoch": 3.1751865463860116, + "grad_norm": 0.2758055329322815, + "learning_rate": 4.947607001253984e-05, + "loss": 0.0589, + "num_input_tokens_seen": 34708352, + "step": 28510 + }, + { + "epoch": 3.1757434012696293, + "grad_norm": 0.06235576421022415, + "learning_rate": 4.947557506967193e-05, + "loss": 0.0442, + "num_input_tokens_seen": 34714464, + "step": 28515 + }, + { + "epoch": 3.1763002561532465, + "grad_norm": 1.5475438833236694, + "learning_rate": 4.947507989561261e-05, + "loss": 0.211, + "num_input_tokens_seen": 34719264, + "step": 28520 + }, + { + "epoch": 3.1768571110368637, + "grad_norm": 0.09436244517564774, + "learning_rate": 4.9474584490366535e-05, + "loss": 0.0265, + "num_input_tokens_seen": 34725440, + "step": 28525 + }, + { + "epoch": 3.177413965920481, + "grad_norm": 1.3581392765045166, + "learning_rate": 4.9474088853938416e-05, + "loss": 0.1276, + "num_input_tokens_seen": 34731520, + "step": 28530 + }, + { + "epoch": 3.1779708208040987, + "grad_norm": 0.39297541975975037, + "learning_rate": 4.9473592986332914e-05, + "loss": 0.0488, + "num_input_tokens_seen": 34736992, + "step": 28535 + }, + { + "epoch": 3.178527675687716, + "grad_norm": 0.25599876046180725, + "learning_rate": 4.947309688755473e-05, + "loss": 0.0306, + "num_input_tokens_seen": 34743360, + "step": 28540 + }, + { + "epoch": 3.179084530571333, + "grad_norm": 0.8140971064567566, + "learning_rate": 4.947260055760852e-05, + "loss": 0.11, + "num_input_tokens_seen": 34749792, + "step": 28545 + }, + { + "epoch": 3.1796413854549503, + "grad_norm": 0.11877179890871048, + "learning_rate": 4.9472103996499e-05, + "loss": 0.0779, + "num_input_tokens_seen": 34756000, + "step": 28550 + }, + { + "epoch": 3.1801982403385676, + "grad_norm": 0.010095072910189629, + "learning_rate": 4.947160720423085e-05, + "loss": 0.0552, + "num_input_tokens_seen": 34762144, + "step": 28555 + }, + { + "epoch": 3.1807550952221852, + "grad_norm": 0.7237492799758911, + "learning_rate": 4.9471110180808766e-05, + "loss": 0.1076, + "num_input_tokens_seen": 34768000, + "step": 28560 + }, + { + "epoch": 3.1813119501058025, + "grad_norm": 1.7091710567474365, + "learning_rate": 4.947061292623744e-05, + "loss": 0.1073, + "num_input_tokens_seen": 34774208, + "step": 28565 + }, + { + "epoch": 3.1818688049894197, + "grad_norm": 0.15407362580299377, + "learning_rate": 4.947011544052156e-05, + "loss": 0.0263, + "num_input_tokens_seen": 34780704, + "step": 28570 + }, + { + "epoch": 3.182425659873037, + "grad_norm": 1.0148403644561768, + "learning_rate": 4.946961772366585e-05, + "loss": 0.0587, + "num_input_tokens_seen": 34786240, + "step": 28575 + }, + { + "epoch": 3.1829825147566546, + "grad_norm": 1.9906437397003174, + "learning_rate": 4.946911977567499e-05, + "loss": 0.0813, + "num_input_tokens_seen": 34792096, + "step": 28580 + }, + { + "epoch": 3.183539369640272, + "grad_norm": 0.45868995785713196, + "learning_rate": 4.9468621596553684e-05, + "loss": 0.0772, + "num_input_tokens_seen": 34798240, + "step": 28585 + }, + { + "epoch": 3.184096224523889, + "grad_norm": 1.042007565498352, + "learning_rate": 4.946812318630665e-05, + "loss": 0.1194, + "num_input_tokens_seen": 34804256, + "step": 28590 + }, + { + "epoch": 3.1846530794075063, + "grad_norm": 0.11969156563282013, + "learning_rate": 4.946762454493858e-05, + "loss": 0.0435, + "num_input_tokens_seen": 34810496, + "step": 28595 + }, + { + "epoch": 3.1852099342911235, + "grad_norm": 0.01250187773257494, + "learning_rate": 4.946712567245419e-05, + "loss": 0.1252, + "num_input_tokens_seen": 34816544, + "step": 28600 + }, + { + "epoch": 3.185766789174741, + "grad_norm": 1.3049025535583496, + "learning_rate": 4.946662656885821e-05, + "loss": 0.0926, + "num_input_tokens_seen": 34822560, + "step": 28605 + }, + { + "epoch": 3.1863236440583584, + "grad_norm": 0.011977647431194782, + "learning_rate": 4.946612723415534e-05, + "loss": 0.1159, + "num_input_tokens_seen": 34828768, + "step": 28610 + }, + { + "epoch": 3.1868804989419757, + "grad_norm": 0.9632483720779419, + "learning_rate": 4.9465627668350287e-05, + "loss": 0.1551, + "num_input_tokens_seen": 34834976, + "step": 28615 + }, + { + "epoch": 3.187437353825593, + "grad_norm": 0.612234354019165, + "learning_rate": 4.946512787144778e-05, + "loss": 0.0551, + "num_input_tokens_seen": 34840896, + "step": 28620 + }, + { + "epoch": 3.1879942087092106, + "grad_norm": 1.718070149421692, + "learning_rate": 4.946462784345254e-05, + "loss": 0.15, + "num_input_tokens_seen": 34846496, + "step": 28625 + }, + { + "epoch": 3.188551063592828, + "grad_norm": 0.11124041676521301, + "learning_rate": 4.946412758436929e-05, + "loss": 0.0822, + "num_input_tokens_seen": 34852480, + "step": 28630 + }, + { + "epoch": 3.189107918476445, + "grad_norm": 0.02595754899084568, + "learning_rate": 4.9463627094202755e-05, + "loss": 0.0284, + "num_input_tokens_seen": 34858752, + "step": 28635 + }, + { + "epoch": 3.1896647733600623, + "grad_norm": 0.01576949656009674, + "learning_rate": 4.946312637295766e-05, + "loss": 0.0295, + "num_input_tokens_seen": 34864768, + "step": 28640 + }, + { + "epoch": 3.1902216282436795, + "grad_norm": 0.7088651657104492, + "learning_rate": 4.946262542063874e-05, + "loss": 0.0428, + "num_input_tokens_seen": 34870912, + "step": 28645 + }, + { + "epoch": 3.190778483127297, + "grad_norm": 0.5686805248260498, + "learning_rate": 4.946212423725073e-05, + "loss": 0.0816, + "num_input_tokens_seen": 34877344, + "step": 28650 + }, + { + "epoch": 3.1913353380109144, + "grad_norm": 0.661798357963562, + "learning_rate": 4.9461622822798346e-05, + "loss": 0.1584, + "num_input_tokens_seen": 34883104, + "step": 28655 + }, + { + "epoch": 3.1918921928945316, + "grad_norm": 0.17897425591945648, + "learning_rate": 4.946112117728634e-05, + "loss": 0.064, + "num_input_tokens_seen": 34888960, + "step": 28660 + }, + { + "epoch": 3.192449047778149, + "grad_norm": 0.18677382171154022, + "learning_rate": 4.946061930071945e-05, + "loss": 0.0868, + "num_input_tokens_seen": 34894912, + "step": 28665 + }, + { + "epoch": 3.1930059026617665, + "grad_norm": 0.11435181647539139, + "learning_rate": 4.946011719310241e-05, + "loss": 0.0456, + "num_input_tokens_seen": 34900960, + "step": 28670 + }, + { + "epoch": 3.1935627575453838, + "grad_norm": 0.017721282318234444, + "learning_rate": 4.945961485443996e-05, + "loss": 0.121, + "num_input_tokens_seen": 34906880, + "step": 28675 + }, + { + "epoch": 3.194119612429001, + "grad_norm": 0.07897596806287766, + "learning_rate": 4.945911228473686e-05, + "loss": 0.0464, + "num_input_tokens_seen": 34913056, + "step": 28680 + }, + { + "epoch": 3.1946764673126182, + "grad_norm": 0.3939361572265625, + "learning_rate": 4.945860948399785e-05, + "loss": 0.1106, + "num_input_tokens_seen": 34918880, + "step": 28685 + }, + { + "epoch": 3.1952333221962355, + "grad_norm": 0.5433433055877686, + "learning_rate": 4.945810645222767e-05, + "loss": 0.0878, + "num_input_tokens_seen": 34924480, + "step": 28690 + }, + { + "epoch": 3.195790177079853, + "grad_norm": 0.022653143852949142, + "learning_rate": 4.945760318943108e-05, + "loss": 0.0024, + "num_input_tokens_seen": 34930976, + "step": 28695 + }, + { + "epoch": 3.1963470319634704, + "grad_norm": 0.027674859389662743, + "learning_rate": 4.945709969561284e-05, + "loss": 0.165, + "num_input_tokens_seen": 34937344, + "step": 28700 + }, + { + "epoch": 3.1969038868470876, + "grad_norm": 0.7750877141952515, + "learning_rate": 4.9456595970777695e-05, + "loss": 0.073, + "num_input_tokens_seen": 34943360, + "step": 28705 + }, + { + "epoch": 3.197460741730705, + "grad_norm": 0.6477213501930237, + "learning_rate": 4.94560920149304e-05, + "loss": 0.143, + "num_input_tokens_seen": 34949376, + "step": 28710 + }, + { + "epoch": 3.1980175966143225, + "grad_norm": 0.04283295199275017, + "learning_rate": 4.9455587828075726e-05, + "loss": 0.1009, + "num_input_tokens_seen": 34955680, + "step": 28715 + }, + { + "epoch": 3.1985744514979397, + "grad_norm": 5.989884853363037, + "learning_rate": 4.9455083410218436e-05, + "loss": 0.073, + "num_input_tokens_seen": 34961728, + "step": 28720 + }, + { + "epoch": 3.199131306381557, + "grad_norm": 0.8147254586219788, + "learning_rate": 4.945457876136328e-05, + "loss": 0.0939, + "num_input_tokens_seen": 34968064, + "step": 28725 + }, + { + "epoch": 3.199688161265174, + "grad_norm": 0.12697181105613708, + "learning_rate": 4.945407388151505e-05, + "loss": 0.0481, + "num_input_tokens_seen": 34973920, + "step": 28730 + }, + { + "epoch": 3.2002450161487914, + "grad_norm": 0.00611946452409029, + "learning_rate": 4.945356877067849e-05, + "loss": 0.0206, + "num_input_tokens_seen": 34980288, + "step": 28735 + }, + { + "epoch": 3.200801871032409, + "grad_norm": 1.4227561950683594, + "learning_rate": 4.945306342885838e-05, + "loss": 0.0853, + "num_input_tokens_seen": 34986496, + "step": 28740 + }, + { + "epoch": 3.2013587259160263, + "grad_norm": 0.22726160287857056, + "learning_rate": 4.9452557856059503e-05, + "loss": 0.1444, + "num_input_tokens_seen": 34992608, + "step": 28745 + }, + { + "epoch": 3.2019155807996436, + "grad_norm": 0.06841637194156647, + "learning_rate": 4.945205205228662e-05, + "loss": 0.0681, + "num_input_tokens_seen": 34998880, + "step": 28750 + }, + { + "epoch": 3.202472435683261, + "grad_norm": 0.6074411869049072, + "learning_rate": 4.945154601754452e-05, + "loss": 0.0995, + "num_input_tokens_seen": 35004992, + "step": 28755 + }, + { + "epoch": 3.2030292905668785, + "grad_norm": 0.9996064901351929, + "learning_rate": 4.945103975183797e-05, + "loss": 0.0902, + "num_input_tokens_seen": 35010688, + "step": 28760 + }, + { + "epoch": 3.2035861454504957, + "grad_norm": 0.04134172573685646, + "learning_rate": 4.945053325517176e-05, + "loss": 0.0082, + "num_input_tokens_seen": 35016896, + "step": 28765 + }, + { + "epoch": 3.204143000334113, + "grad_norm": 0.8002395629882812, + "learning_rate": 4.945002652755067e-05, + "loss": 0.045, + "num_input_tokens_seen": 35022944, + "step": 28770 + }, + { + "epoch": 3.20469985521773, + "grad_norm": 0.07891730219125748, + "learning_rate": 4.9449519568979495e-05, + "loss": 0.0641, + "num_input_tokens_seen": 35028768, + "step": 28775 + }, + { + "epoch": 3.2052567101013474, + "grad_norm": 0.07173503935337067, + "learning_rate": 4.944901237946302e-05, + "loss": 0.0658, + "num_input_tokens_seen": 35035104, + "step": 28780 + }, + { + "epoch": 3.205813564984965, + "grad_norm": 0.5380929708480835, + "learning_rate": 4.9448504959006044e-05, + "loss": 0.1217, + "num_input_tokens_seen": 35040736, + "step": 28785 + }, + { + "epoch": 3.2063704198685823, + "grad_norm": 1.3350898027420044, + "learning_rate": 4.9447997307613334e-05, + "loss": 0.156, + "num_input_tokens_seen": 35046912, + "step": 28790 + }, + { + "epoch": 3.2069272747521995, + "grad_norm": 0.8518776893615723, + "learning_rate": 4.9447489425289714e-05, + "loss": 0.0316, + "num_input_tokens_seen": 35053120, + "step": 28795 + }, + { + "epoch": 3.2074841296358167, + "grad_norm": 0.5858270525932312, + "learning_rate": 4.944698131203997e-05, + "loss": 0.1355, + "num_input_tokens_seen": 35058592, + "step": 28800 + }, + { + "epoch": 3.2080409845194344, + "grad_norm": 0.7235375642776489, + "learning_rate": 4.94464729678689e-05, + "loss": 0.0645, + "num_input_tokens_seen": 35064736, + "step": 28805 + }, + { + "epoch": 3.2085978394030517, + "grad_norm": 0.903508722782135, + "learning_rate": 4.9445964392781296e-05, + "loss": 0.0729, + "num_input_tokens_seen": 35071008, + "step": 28810 + }, + { + "epoch": 3.209154694286669, + "grad_norm": 0.20802977681159973, + "learning_rate": 4.944545558678198e-05, + "loss": 0.0124, + "num_input_tokens_seen": 35077248, + "step": 28815 + }, + { + "epoch": 3.209711549170286, + "grad_norm": 1.1105631589889526, + "learning_rate": 4.9444946549875755e-05, + "loss": 0.1252, + "num_input_tokens_seen": 35083104, + "step": 28820 + }, + { + "epoch": 3.2102684040539033, + "grad_norm": 0.57924884557724, + "learning_rate": 4.944443728206742e-05, + "loss": 0.0605, + "num_input_tokens_seen": 35089312, + "step": 28825 + }, + { + "epoch": 3.210825258937521, + "grad_norm": 0.009135703556239605, + "learning_rate": 4.944392778336179e-05, + "loss": 0.1316, + "num_input_tokens_seen": 35095584, + "step": 28830 + }, + { + "epoch": 3.2113821138211383, + "grad_norm": 0.006399558857083321, + "learning_rate": 4.944341805376368e-05, + "loss": 0.0321, + "num_input_tokens_seen": 35101664, + "step": 28835 + }, + { + "epoch": 3.2119389687047555, + "grad_norm": 0.22794023156166077, + "learning_rate": 4.944290809327789e-05, + "loss": 0.1377, + "num_input_tokens_seen": 35107104, + "step": 28840 + }, + { + "epoch": 3.2124958235883727, + "grad_norm": 0.12788082659244537, + "learning_rate": 4.944239790190927e-05, + "loss": 0.0599, + "num_input_tokens_seen": 35112768, + "step": 28845 + }, + { + "epoch": 3.2130526784719904, + "grad_norm": 0.07047952711582184, + "learning_rate": 4.9441887479662604e-05, + "loss": 0.1105, + "num_input_tokens_seen": 35119072, + "step": 28850 + }, + { + "epoch": 3.2136095333556076, + "grad_norm": 0.902420163154602, + "learning_rate": 4.944137682654274e-05, + "loss": 0.2231, + "num_input_tokens_seen": 35124128, + "step": 28855 + }, + { + "epoch": 3.214166388239225, + "grad_norm": 0.007070986554026604, + "learning_rate": 4.944086594255448e-05, + "loss": 0.1436, + "num_input_tokens_seen": 35130304, + "step": 28860 + }, + { + "epoch": 3.214723243122842, + "grad_norm": 1.075177550315857, + "learning_rate": 4.944035482770267e-05, + "loss": 0.1453, + "num_input_tokens_seen": 35136576, + "step": 28865 + }, + { + "epoch": 3.2152800980064598, + "grad_norm": 0.3851703405380249, + "learning_rate": 4.943984348199212e-05, + "loss": 0.0384, + "num_input_tokens_seen": 35142560, + "step": 28870 + }, + { + "epoch": 3.215836952890077, + "grad_norm": 0.061517808586359024, + "learning_rate": 4.943933190542767e-05, + "loss": 0.0471, + "num_input_tokens_seen": 35148960, + "step": 28875 + }, + { + "epoch": 3.216393807773694, + "grad_norm": 0.20801179111003876, + "learning_rate": 4.9438820098014146e-05, + "loss": 0.1579, + "num_input_tokens_seen": 35155264, + "step": 28880 + }, + { + "epoch": 3.2169506626573114, + "grad_norm": 0.0031776991672813892, + "learning_rate": 4.943830805975639e-05, + "loss": 0.1162, + "num_input_tokens_seen": 35161280, + "step": 28885 + }, + { + "epoch": 3.2175075175409287, + "grad_norm": 0.003966271877288818, + "learning_rate": 4.943779579065923e-05, + "loss": 0.1043, + "num_input_tokens_seen": 35167616, + "step": 28890 + }, + { + "epoch": 3.2180643724245463, + "grad_norm": 0.07911743968725204, + "learning_rate": 4.943728329072751e-05, + "loss": 0.2732, + "num_input_tokens_seen": 35173760, + "step": 28895 + }, + { + "epoch": 3.2186212273081636, + "grad_norm": 2.3454411029815674, + "learning_rate": 4.9436770559966074e-05, + "loss": 0.2843, + "num_input_tokens_seen": 35179456, + "step": 28900 + }, + { + "epoch": 3.219178082191781, + "grad_norm": 0.00617457227781415, + "learning_rate": 4.9436257598379767e-05, + "loss": 0.0408, + "num_input_tokens_seen": 35185344, + "step": 28905 + }, + { + "epoch": 3.219734937075398, + "grad_norm": 0.03072710894048214, + "learning_rate": 4.943574440597342e-05, + "loss": 0.0315, + "num_input_tokens_seen": 35191392, + "step": 28910 + }, + { + "epoch": 3.2202917919590153, + "grad_norm": 0.513480007648468, + "learning_rate": 4.943523098275189e-05, + "loss": 0.1125, + "num_input_tokens_seen": 35197216, + "step": 28915 + }, + { + "epoch": 3.220848646842633, + "grad_norm": 1.0231362581253052, + "learning_rate": 4.9434717328720025e-05, + "loss": 0.1636, + "num_input_tokens_seen": 35202656, + "step": 28920 + }, + { + "epoch": 3.22140550172625, + "grad_norm": 0.5625559687614441, + "learning_rate": 4.943420344388268e-05, + "loss": 0.0736, + "num_input_tokens_seen": 35208896, + "step": 28925 + }, + { + "epoch": 3.2219623566098674, + "grad_norm": 0.9808470606803894, + "learning_rate": 4.943368932824471e-05, + "loss": 0.1249, + "num_input_tokens_seen": 35214944, + "step": 28930 + }, + { + "epoch": 3.2225192114934846, + "grad_norm": 0.08313131332397461, + "learning_rate": 4.943317498181097e-05, + "loss": 0.0759, + "num_input_tokens_seen": 35220736, + "step": 28935 + }, + { + "epoch": 3.2230760663771023, + "grad_norm": 0.38655886054039, + "learning_rate": 4.943266040458631e-05, + "loss": 0.0456, + "num_input_tokens_seen": 35227040, + "step": 28940 + }, + { + "epoch": 3.2236329212607195, + "grad_norm": 0.024820860475301743, + "learning_rate": 4.9432145596575605e-05, + "loss": 0.0707, + "num_input_tokens_seen": 35233280, + "step": 28945 + }, + { + "epoch": 3.2241897761443368, + "grad_norm": 0.05442195385694504, + "learning_rate": 4.94316305577837e-05, + "loss": 0.0863, + "num_input_tokens_seen": 35239264, + "step": 28950 + }, + { + "epoch": 3.224746631027954, + "grad_norm": 1.1783112287521362, + "learning_rate": 4.943111528821548e-05, + "loss": 0.0163, + "num_input_tokens_seen": 35245344, + "step": 28955 + }, + { + "epoch": 3.2253034859115717, + "grad_norm": 0.020339930430054665, + "learning_rate": 4.94305997878758e-05, + "loss": 0.1009, + "num_input_tokens_seen": 35251616, + "step": 28960 + }, + { + "epoch": 3.225860340795189, + "grad_norm": 0.9709073305130005, + "learning_rate": 4.9430084056769526e-05, + "loss": 0.097, + "num_input_tokens_seen": 35257472, + "step": 28965 + }, + { + "epoch": 3.226417195678806, + "grad_norm": 0.735620379447937, + "learning_rate": 4.942956809490154e-05, + "loss": 0.1502, + "num_input_tokens_seen": 35263680, + "step": 28970 + }, + { + "epoch": 3.2269740505624234, + "grad_norm": 0.3128524422645569, + "learning_rate": 4.942905190227671e-05, + "loss": 0.0573, + "num_input_tokens_seen": 35269760, + "step": 28975 + }, + { + "epoch": 3.2275309054460406, + "grad_norm": 0.2756434977054596, + "learning_rate": 4.942853547889991e-05, + "loss": 0.0364, + "num_input_tokens_seen": 35275776, + "step": 28980 + }, + { + "epoch": 3.2280877603296583, + "grad_norm": 0.7861158847808838, + "learning_rate": 4.942801882477602e-05, + "loss": 0.1372, + "num_input_tokens_seen": 35281664, + "step": 28985 + }, + { + "epoch": 3.2286446152132755, + "grad_norm": 0.2253454625606537, + "learning_rate": 4.9427501939909924e-05, + "loss": 0.0513, + "num_input_tokens_seen": 35287392, + "step": 28990 + }, + { + "epoch": 3.2292014700968927, + "grad_norm": 0.05408366397023201, + "learning_rate": 4.942698482430651e-05, + "loss": 0.1787, + "num_input_tokens_seen": 35293824, + "step": 28995 + }, + { + "epoch": 3.22975832498051, + "grad_norm": 0.0036593724507838488, + "learning_rate": 4.942646747797064e-05, + "loss": 0.0329, + "num_input_tokens_seen": 35300288, + "step": 29000 + }, + { + "epoch": 3.230315179864127, + "grad_norm": 0.04204697534441948, + "learning_rate": 4.942594990090722e-05, + "loss": 0.0133, + "num_input_tokens_seen": 35306464, + "step": 29005 + }, + { + "epoch": 3.230872034747745, + "grad_norm": 0.0674113854765892, + "learning_rate": 4.9425432093121125e-05, + "loss": 0.0255, + "num_input_tokens_seen": 35312576, + "step": 29010 + }, + { + "epoch": 3.231428889631362, + "grad_norm": 0.8662371635437012, + "learning_rate": 4.942491405461727e-05, + "loss": 0.0764, + "num_input_tokens_seen": 35318688, + "step": 29015 + }, + { + "epoch": 3.2319857445149793, + "grad_norm": 1.4515609741210938, + "learning_rate": 4.9424395785400526e-05, + "loss": 0.165, + "num_input_tokens_seen": 35324544, + "step": 29020 + }, + { + "epoch": 3.2325425993985966, + "grad_norm": 0.5148858428001404, + "learning_rate": 4.942387728547579e-05, + "loss": 0.0467, + "num_input_tokens_seen": 35330368, + "step": 29025 + }, + { + "epoch": 3.2330994542822142, + "grad_norm": 0.1784917116165161, + "learning_rate": 4.942335855484797e-05, + "loss": 0.0985, + "num_input_tokens_seen": 35336512, + "step": 29030 + }, + { + "epoch": 3.2336563091658315, + "grad_norm": 0.2516707181930542, + "learning_rate": 4.942283959352196e-05, + "loss": 0.0883, + "num_input_tokens_seen": 35342752, + "step": 29035 + }, + { + "epoch": 3.2342131640494487, + "grad_norm": 0.3806476891040802, + "learning_rate": 4.942232040150267e-05, + "loss": 0.0576, + "num_input_tokens_seen": 35348640, + "step": 29040 + }, + { + "epoch": 3.234770018933066, + "grad_norm": 0.20062536001205444, + "learning_rate": 4.942180097879498e-05, + "loss": 0.0735, + "num_input_tokens_seen": 35354912, + "step": 29045 + }, + { + "epoch": 3.2353268738166836, + "grad_norm": 0.3108023405075073, + "learning_rate": 4.942128132540382e-05, + "loss": 0.0843, + "num_input_tokens_seen": 35360800, + "step": 29050 + }, + { + "epoch": 3.235883728700301, + "grad_norm": 1.3142690658569336, + "learning_rate": 4.9420761441334096e-05, + "loss": 0.0544, + "num_input_tokens_seen": 35367200, + "step": 29055 + }, + { + "epoch": 3.236440583583918, + "grad_norm": 0.020318996161222458, + "learning_rate": 4.9420241326590714e-05, + "loss": 0.0898, + "num_input_tokens_seen": 35373184, + "step": 29060 + }, + { + "epoch": 3.2369974384675353, + "grad_norm": 0.5822032690048218, + "learning_rate": 4.9419720981178584e-05, + "loss": 0.1912, + "num_input_tokens_seen": 35378784, + "step": 29065 + }, + { + "epoch": 3.2375542933511525, + "grad_norm": 0.009065970778465271, + "learning_rate": 4.941920040510263e-05, + "loss": 0.0538, + "num_input_tokens_seen": 35384768, + "step": 29070 + }, + { + "epoch": 3.23811114823477, + "grad_norm": 0.3992127478122711, + "learning_rate": 4.941867959836776e-05, + "loss": 0.0958, + "num_input_tokens_seen": 35390880, + "step": 29075 + }, + { + "epoch": 3.2386680031183874, + "grad_norm": 1.131650686264038, + "learning_rate": 4.94181585609789e-05, + "loss": 0.1623, + "num_input_tokens_seen": 35397088, + "step": 29080 + }, + { + "epoch": 3.2392248580020047, + "grad_norm": 0.6426228284835815, + "learning_rate": 4.9417637292940965e-05, + "loss": 0.0975, + "num_input_tokens_seen": 35403104, + "step": 29085 + }, + { + "epoch": 3.239781712885622, + "grad_norm": 1.457330346107483, + "learning_rate": 4.941711579425889e-05, + "loss": 0.1016, + "num_input_tokens_seen": 35409344, + "step": 29090 + }, + { + "epoch": 3.240338567769239, + "grad_norm": 0.23566442728042603, + "learning_rate": 4.9416594064937585e-05, + "loss": 0.0314, + "num_input_tokens_seen": 35415584, + "step": 29095 + }, + { + "epoch": 3.240895422652857, + "grad_norm": 0.5909842252731323, + "learning_rate": 4.9416072104981995e-05, + "loss": 0.176, + "num_input_tokens_seen": 35421728, + "step": 29100 + }, + { + "epoch": 3.241452277536474, + "grad_norm": 0.6029388904571533, + "learning_rate": 4.941554991439703e-05, + "loss": 0.1619, + "num_input_tokens_seen": 35428064, + "step": 29105 + }, + { + "epoch": 3.2420091324200913, + "grad_norm": 0.6204710006713867, + "learning_rate": 4.941502749318765e-05, + "loss": 0.0683, + "num_input_tokens_seen": 35434240, + "step": 29110 + }, + { + "epoch": 3.2425659873037085, + "grad_norm": 0.23337340354919434, + "learning_rate": 4.9414504841358765e-05, + "loss": 0.1215, + "num_input_tokens_seen": 35440256, + "step": 29115 + }, + { + "epoch": 3.243122842187326, + "grad_norm": 0.8828007578849792, + "learning_rate": 4.941398195891532e-05, + "loss": 0.1384, + "num_input_tokens_seen": 35446336, + "step": 29120 + }, + { + "epoch": 3.2436796970709434, + "grad_norm": 0.5939276814460754, + "learning_rate": 4.941345884586226e-05, + "loss": 0.1106, + "num_input_tokens_seen": 35452640, + "step": 29125 + }, + { + "epoch": 3.2442365519545606, + "grad_norm": 0.21086670458316803, + "learning_rate": 4.941293550220452e-05, + "loss": 0.0418, + "num_input_tokens_seen": 35458464, + "step": 29130 + }, + { + "epoch": 3.244793406838178, + "grad_norm": 0.12831605970859528, + "learning_rate": 4.941241192794704e-05, + "loss": 0.0536, + "num_input_tokens_seen": 35464640, + "step": 29135 + }, + { + "epoch": 3.2453502617217955, + "grad_norm": 1.396746277809143, + "learning_rate": 4.941188812309478e-05, + "loss": 0.2292, + "num_input_tokens_seen": 35470848, + "step": 29140 + }, + { + "epoch": 3.2459071166054128, + "grad_norm": 1.5579017400741577, + "learning_rate": 4.941136408765267e-05, + "loss": 0.1336, + "num_input_tokens_seen": 35476896, + "step": 29145 + }, + { + "epoch": 3.24646397148903, + "grad_norm": 0.251608282327652, + "learning_rate": 4.941083982162568e-05, + "loss": 0.0848, + "num_input_tokens_seen": 35483104, + "step": 29150 + }, + { + "epoch": 3.247020826372647, + "grad_norm": 0.4802912473678589, + "learning_rate": 4.941031532501874e-05, + "loss": 0.0431, + "num_input_tokens_seen": 35489216, + "step": 29155 + }, + { + "epoch": 3.2475776812562644, + "grad_norm": 0.8551483750343323, + "learning_rate": 4.940979059783681e-05, + "loss": 0.1874, + "num_input_tokens_seen": 35495456, + "step": 29160 + }, + { + "epoch": 3.248134536139882, + "grad_norm": 0.9416577816009521, + "learning_rate": 4.9409265640084854e-05, + "loss": 0.1506, + "num_input_tokens_seen": 35501824, + "step": 29165 + }, + { + "epoch": 3.2486913910234994, + "grad_norm": 1.2082512378692627, + "learning_rate": 4.940874045176783e-05, + "loss": 0.1177, + "num_input_tokens_seen": 35508064, + "step": 29170 + }, + { + "epoch": 3.2492482459071166, + "grad_norm": 1.2422471046447754, + "learning_rate": 4.94082150328907e-05, + "loss": 0.0451, + "num_input_tokens_seen": 35514208, + "step": 29175 + }, + { + "epoch": 3.249805100790734, + "grad_norm": 0.12721119821071625, + "learning_rate": 4.940768938345842e-05, + "loss": 0.0972, + "num_input_tokens_seen": 35519840, + "step": 29180 + }, + { + "epoch": 3.250361955674351, + "grad_norm": 1.5720571279525757, + "learning_rate": 4.940716350347596e-05, + "loss": 0.1379, + "num_input_tokens_seen": 35525792, + "step": 29185 + }, + { + "epoch": 3.2509188105579687, + "grad_norm": 0.26438209414482117, + "learning_rate": 4.9406637392948285e-05, + "loss": 0.0822, + "num_input_tokens_seen": 35531904, + "step": 29190 + }, + { + "epoch": 3.251475665441586, + "grad_norm": 0.36177417635917664, + "learning_rate": 4.9406111051880366e-05, + "loss": 0.0657, + "num_input_tokens_seen": 35538016, + "step": 29195 + }, + { + "epoch": 3.252032520325203, + "grad_norm": 0.07958605885505676, + "learning_rate": 4.940558448027718e-05, + "loss": 0.0321, + "num_input_tokens_seen": 35544096, + "step": 29200 + }, + { + "epoch": 3.2525893752088204, + "grad_norm": 1.4750362634658813, + "learning_rate": 4.9405057678143686e-05, + "loss": 0.1068, + "num_input_tokens_seen": 35550400, + "step": 29205 + }, + { + "epoch": 3.253146230092438, + "grad_norm": 0.35572561621665955, + "learning_rate": 4.940453064548487e-05, + "loss": 0.0424, + "num_input_tokens_seen": 35556416, + "step": 29210 + }, + { + "epoch": 3.2537030849760553, + "grad_norm": 0.5334131121635437, + "learning_rate": 4.940400338230572e-05, + "loss": 0.0675, + "num_input_tokens_seen": 35562624, + "step": 29215 + }, + { + "epoch": 3.2542599398596725, + "grad_norm": 0.07572027295827866, + "learning_rate": 4.940347588861119e-05, + "loss": 0.0243, + "num_input_tokens_seen": 35568832, + "step": 29220 + }, + { + "epoch": 3.2548167947432898, + "grad_norm": 0.8471910357475281, + "learning_rate": 4.940294816440629e-05, + "loss": 0.2456, + "num_input_tokens_seen": 35574784, + "step": 29225 + }, + { + "epoch": 3.2553736496269075, + "grad_norm": 1.0774946212768555, + "learning_rate": 4.940242020969599e-05, + "loss": 0.1101, + "num_input_tokens_seen": 35580608, + "step": 29230 + }, + { + "epoch": 3.2559305045105247, + "grad_norm": 0.20820479094982147, + "learning_rate": 4.940189202448527e-05, + "loss": 0.1354, + "num_input_tokens_seen": 35587104, + "step": 29235 + }, + { + "epoch": 3.256487359394142, + "grad_norm": 0.6045440435409546, + "learning_rate": 4.940136360877915e-05, + "loss": 0.1165, + "num_input_tokens_seen": 35593376, + "step": 29240 + }, + { + "epoch": 3.257044214277759, + "grad_norm": 1.484262228012085, + "learning_rate": 4.940083496258259e-05, + "loss": 0.0825, + "num_input_tokens_seen": 35599616, + "step": 29245 + }, + { + "epoch": 3.2576010691613764, + "grad_norm": 0.07902014255523682, + "learning_rate": 4.940030608590059e-05, + "loss": 0.0327, + "num_input_tokens_seen": 35605760, + "step": 29250 + }, + { + "epoch": 3.258157924044994, + "grad_norm": 1.118454933166504, + "learning_rate": 4.9399776978738156e-05, + "loss": 0.1486, + "num_input_tokens_seen": 35611904, + "step": 29255 + }, + { + "epoch": 3.2587147789286113, + "grad_norm": 1.3572556972503662, + "learning_rate": 4.939924764110028e-05, + "loss": 0.0478, + "num_input_tokens_seen": 35618112, + "step": 29260 + }, + { + "epoch": 3.2592716338122285, + "grad_norm": 0.032859016209840775, + "learning_rate": 4.9398718072991954e-05, + "loss": 0.1386, + "num_input_tokens_seen": 35624096, + "step": 29265 + }, + { + "epoch": 3.2598284886958457, + "grad_norm": 0.015606650151312351, + "learning_rate": 4.9398188274418195e-05, + "loss": 0.0475, + "num_input_tokens_seen": 35630368, + "step": 29270 + }, + { + "epoch": 3.260385343579463, + "grad_norm": 0.061544809490442276, + "learning_rate": 4.9397658245383996e-05, + "loss": 0.1244, + "num_input_tokens_seen": 35636384, + "step": 29275 + }, + { + "epoch": 3.2609421984630806, + "grad_norm": 0.5511353611946106, + "learning_rate": 4.939712798589437e-05, + "loss": 0.1093, + "num_input_tokens_seen": 35642528, + "step": 29280 + }, + { + "epoch": 3.261499053346698, + "grad_norm": 0.09140639007091522, + "learning_rate": 4.9396597495954324e-05, + "loss": 0.0685, + "num_input_tokens_seen": 35648832, + "step": 29285 + }, + { + "epoch": 3.262055908230315, + "grad_norm": 0.48547494411468506, + "learning_rate": 4.939606677556887e-05, + "loss": 0.0511, + "num_input_tokens_seen": 35655072, + "step": 29290 + }, + { + "epoch": 3.2626127631139323, + "grad_norm": 0.3591330945491791, + "learning_rate": 4.939553582474302e-05, + "loss": 0.041, + "num_input_tokens_seen": 35661696, + "step": 29295 + }, + { + "epoch": 3.26316961799755, + "grad_norm": 0.002721804194152355, + "learning_rate": 4.939500464348178e-05, + "loss": 0.036, + "num_input_tokens_seen": 35667808, + "step": 29300 + }, + { + "epoch": 3.2637264728811672, + "grad_norm": 0.08998967707157135, + "learning_rate": 4.939447323179018e-05, + "loss": 0.0563, + "num_input_tokens_seen": 35673760, + "step": 29305 + }, + { + "epoch": 3.2642833277647845, + "grad_norm": 0.31355905532836914, + "learning_rate": 4.939394158967324e-05, + "loss": 0.1473, + "num_input_tokens_seen": 35680096, + "step": 29310 + }, + { + "epoch": 3.2648401826484017, + "grad_norm": 0.13197216391563416, + "learning_rate": 4.939340971713598e-05, + "loss": 0.0335, + "num_input_tokens_seen": 35686080, + "step": 29315 + }, + { + "epoch": 3.2653970375320194, + "grad_norm": 0.341926246881485, + "learning_rate": 4.939287761418342e-05, + "loss": 0.0935, + "num_input_tokens_seen": 35692320, + "step": 29320 + }, + { + "epoch": 3.2659538924156366, + "grad_norm": 0.036359772086143494, + "learning_rate": 4.939234528082058e-05, + "loss": 0.0588, + "num_input_tokens_seen": 35698368, + "step": 29325 + }, + { + "epoch": 3.266510747299254, + "grad_norm": 0.24258892238140106, + "learning_rate": 4.93918127170525e-05, + "loss": 0.0566, + "num_input_tokens_seen": 35704768, + "step": 29330 + }, + { + "epoch": 3.267067602182871, + "grad_norm": 0.08858084678649902, + "learning_rate": 4.939127992288421e-05, + "loss": 0.0714, + "num_input_tokens_seen": 35711040, + "step": 29335 + }, + { + "epoch": 3.2676244570664883, + "grad_norm": 1.9530417919158936, + "learning_rate": 4.939074689832074e-05, + "loss": 0.1216, + "num_input_tokens_seen": 35716896, + "step": 29340 + }, + { + "epoch": 3.268181311950106, + "grad_norm": 1.8156391382217407, + "learning_rate": 4.9390213643367117e-05, + "loss": 0.1435, + "num_input_tokens_seen": 35722816, + "step": 29345 + }, + { + "epoch": 3.268738166833723, + "grad_norm": 0.06676772236824036, + "learning_rate": 4.938968015802839e-05, + "loss": 0.0348, + "num_input_tokens_seen": 35728992, + "step": 29350 + }, + { + "epoch": 3.2692950217173404, + "grad_norm": 0.013083959929645061, + "learning_rate": 4.938914644230959e-05, + "loss": 0.0299, + "num_input_tokens_seen": 35734976, + "step": 29355 + }, + { + "epoch": 3.2698518766009577, + "grad_norm": 0.600185215473175, + "learning_rate": 4.938861249621577e-05, + "loss": 0.0566, + "num_input_tokens_seen": 35741248, + "step": 29360 + }, + { + "epoch": 3.270408731484575, + "grad_norm": 0.035742636770009995, + "learning_rate": 4.938807831975195e-05, + "loss": 0.177, + "num_input_tokens_seen": 35747424, + "step": 29365 + }, + { + "epoch": 3.2709655863681926, + "grad_norm": 0.16455799341201782, + "learning_rate": 4.9387543912923205e-05, + "loss": 0.0576, + "num_input_tokens_seen": 35753376, + "step": 29370 + }, + { + "epoch": 3.27152244125181, + "grad_norm": 0.23208580911159515, + "learning_rate": 4.9387009275734565e-05, + "loss": 0.0731, + "num_input_tokens_seen": 35759584, + "step": 29375 + }, + { + "epoch": 3.272079296135427, + "grad_norm": 0.3748997449874878, + "learning_rate": 4.938647440819108e-05, + "loss": 0.1383, + "num_input_tokens_seen": 35765696, + "step": 29380 + }, + { + "epoch": 3.2726361510190443, + "grad_norm": 1.163749098777771, + "learning_rate": 4.9385939310297806e-05, + "loss": 0.1001, + "num_input_tokens_seen": 35772224, + "step": 29385 + }, + { + "epoch": 3.273193005902662, + "grad_norm": 1.3057361841201782, + "learning_rate": 4.93854039820598e-05, + "loss": 0.0769, + "num_input_tokens_seen": 35778240, + "step": 29390 + }, + { + "epoch": 3.273749860786279, + "grad_norm": 0.24304042756557465, + "learning_rate": 4.938486842348212e-05, + "loss": 0.0935, + "num_input_tokens_seen": 35784544, + "step": 29395 + }, + { + "epoch": 3.2743067156698964, + "grad_norm": 0.7988641858100891, + "learning_rate": 4.938433263456982e-05, + "loss": 0.0209, + "num_input_tokens_seen": 35790464, + "step": 29400 + }, + { + "epoch": 3.2748635705535136, + "grad_norm": 0.16161388158798218, + "learning_rate": 4.9383796615327954e-05, + "loss": 0.0163, + "num_input_tokens_seen": 35796096, + "step": 29405 + }, + { + "epoch": 3.2754204254371313, + "grad_norm": 0.4277043640613556, + "learning_rate": 4.9383260365761596e-05, + "loss": 0.0714, + "num_input_tokens_seen": 35802304, + "step": 29410 + }, + { + "epoch": 3.2759772803207485, + "grad_norm": 0.9768722057342529, + "learning_rate": 4.938272388587581e-05, + "loss": 0.0627, + "num_input_tokens_seen": 35808608, + "step": 29415 + }, + { + "epoch": 3.2765341352043658, + "grad_norm": 0.4796980619430542, + "learning_rate": 4.9382187175675664e-05, + "loss": 0.0456, + "num_input_tokens_seen": 35814688, + "step": 29420 + }, + { + "epoch": 3.277090990087983, + "grad_norm": 0.8497686386108398, + "learning_rate": 4.938165023516622e-05, + "loss": 0.1536, + "num_input_tokens_seen": 35820768, + "step": 29425 + }, + { + "epoch": 3.2776478449716, + "grad_norm": 1.6504665613174438, + "learning_rate": 4.938111306435256e-05, + "loss": 0.132, + "num_input_tokens_seen": 35826720, + "step": 29430 + }, + { + "epoch": 3.278204699855218, + "grad_norm": 0.15818339586257935, + "learning_rate": 4.938057566323975e-05, + "loss": 0.0791, + "num_input_tokens_seen": 35833280, + "step": 29435 + }, + { + "epoch": 3.278761554738835, + "grad_norm": 0.07953301072120667, + "learning_rate": 4.9380038031832876e-05, + "loss": 0.0236, + "num_input_tokens_seen": 35839200, + "step": 29440 + }, + { + "epoch": 3.2793184096224524, + "grad_norm": 0.8012165427207947, + "learning_rate": 4.937950017013701e-05, + "loss": 0.1049, + "num_input_tokens_seen": 35844704, + "step": 29445 + }, + { + "epoch": 3.2798752645060696, + "grad_norm": 0.014737647958099842, + "learning_rate": 4.937896207815722e-05, + "loss": 0.0326, + "num_input_tokens_seen": 35850880, + "step": 29450 + }, + { + "epoch": 3.280432119389687, + "grad_norm": 0.8474319577217102, + "learning_rate": 4.93784237558986e-05, + "loss": 0.0756, + "num_input_tokens_seen": 35857216, + "step": 29455 + }, + { + "epoch": 3.2809889742733045, + "grad_norm": 1.2337590456008911, + "learning_rate": 4.9377885203366254e-05, + "loss": 0.1799, + "num_input_tokens_seen": 35863488, + "step": 29460 + }, + { + "epoch": 3.2815458291569217, + "grad_norm": 0.5248954892158508, + "learning_rate": 4.937734642056524e-05, + "loss": 0.0843, + "num_input_tokens_seen": 35869504, + "step": 29465 + }, + { + "epoch": 3.282102684040539, + "grad_norm": 0.08345163613557816, + "learning_rate": 4.9376807407500657e-05, + "loss": 0.1697, + "num_input_tokens_seen": 35875552, + "step": 29470 + }, + { + "epoch": 3.282659538924156, + "grad_norm": 0.4997955858707428, + "learning_rate": 4.937626816417761e-05, + "loss": 0.0509, + "num_input_tokens_seen": 35881920, + "step": 29475 + }, + { + "epoch": 3.283216393807774, + "grad_norm": 0.06303024291992188, + "learning_rate": 4.937572869060117e-05, + "loss": 0.2936, + "num_input_tokens_seen": 35888320, + "step": 29480 + }, + { + "epoch": 3.283773248691391, + "grad_norm": 0.9090748429298401, + "learning_rate": 4.937518898677644e-05, + "loss": 0.091, + "num_input_tokens_seen": 35894432, + "step": 29485 + }, + { + "epoch": 3.2843301035750083, + "grad_norm": 1.5523707866668701, + "learning_rate": 4.937464905270852e-05, + "loss": 0.1978, + "num_input_tokens_seen": 35900736, + "step": 29490 + }, + { + "epoch": 3.2848869584586255, + "grad_norm": 1.486901044845581, + "learning_rate": 4.937410888840252e-05, + "loss": 0.0505, + "num_input_tokens_seen": 35906880, + "step": 29495 + }, + { + "epoch": 3.2854438133422432, + "grad_norm": 0.24812303483486176, + "learning_rate": 4.937356849386353e-05, + "loss": 0.0167, + "num_input_tokens_seen": 35913024, + "step": 29500 + }, + { + "epoch": 3.2860006682258605, + "grad_norm": 0.042700935155153275, + "learning_rate": 4.9373027869096655e-05, + "loss": 0.0988, + "num_input_tokens_seen": 35918912, + "step": 29505 + }, + { + "epoch": 3.2865575231094777, + "grad_norm": 0.6634278893470764, + "learning_rate": 4.937248701410701e-05, + "loss": 0.0648, + "num_input_tokens_seen": 35924960, + "step": 29510 + }, + { + "epoch": 3.287114377993095, + "grad_norm": 0.04362965002655983, + "learning_rate": 4.937194592889969e-05, + "loss": 0.0937, + "num_input_tokens_seen": 35930464, + "step": 29515 + }, + { + "epoch": 3.287671232876712, + "grad_norm": 0.5328559875488281, + "learning_rate": 4.937140461347982e-05, + "loss": 0.1169, + "num_input_tokens_seen": 35935776, + "step": 29520 + }, + { + "epoch": 3.28822808776033, + "grad_norm": 0.6269246935844421, + "learning_rate": 4.937086306785251e-05, + "loss": 0.1275, + "num_input_tokens_seen": 35941824, + "step": 29525 + }, + { + "epoch": 3.288784942643947, + "grad_norm": 1.769071102142334, + "learning_rate": 4.9370321292022863e-05, + "loss": 0.163, + "num_input_tokens_seen": 35947456, + "step": 29530 + }, + { + "epoch": 3.2893417975275643, + "grad_norm": 0.9633088707923889, + "learning_rate": 4.936977928599602e-05, + "loss": 0.1981, + "num_input_tokens_seen": 35952864, + "step": 29535 + }, + { + "epoch": 3.2898986524111815, + "grad_norm": 0.05973171815276146, + "learning_rate": 4.936923704977707e-05, + "loss": 0.0546, + "num_input_tokens_seen": 35958816, + "step": 29540 + }, + { + "epoch": 3.2904555072947987, + "grad_norm": 0.23524169623851776, + "learning_rate": 4.9368694583371165e-05, + "loss": 0.0251, + "num_input_tokens_seen": 35964832, + "step": 29545 + }, + { + "epoch": 3.2910123621784164, + "grad_norm": 0.26022323966026306, + "learning_rate": 4.936815188678341e-05, + "loss": 0.0187, + "num_input_tokens_seen": 35970816, + "step": 29550 + }, + { + "epoch": 3.2915692170620336, + "grad_norm": 0.3143937289714813, + "learning_rate": 4.936760896001894e-05, + "loss": 0.0535, + "num_input_tokens_seen": 35976576, + "step": 29555 + }, + { + "epoch": 3.292126071945651, + "grad_norm": 0.1250522881746292, + "learning_rate": 4.936706580308288e-05, + "loss": 0.018, + "num_input_tokens_seen": 35982752, + "step": 29560 + }, + { + "epoch": 3.292682926829268, + "grad_norm": 0.05569629371166229, + "learning_rate": 4.9366522415980356e-05, + "loss": 0.0047, + "num_input_tokens_seen": 35989312, + "step": 29565 + }, + { + "epoch": 3.293239781712886, + "grad_norm": 0.00991098117083311, + "learning_rate": 4.936597879871651e-05, + "loss": 0.1724, + "num_input_tokens_seen": 35995712, + "step": 29570 + }, + { + "epoch": 3.293796636596503, + "grad_norm": 0.8344874382019043, + "learning_rate": 4.9365434951296475e-05, + "loss": 0.1152, + "num_input_tokens_seen": 36001600, + "step": 29575 + }, + { + "epoch": 3.2943534914801202, + "grad_norm": 0.9590213894844055, + "learning_rate": 4.936489087372538e-05, + "loss": 0.1019, + "num_input_tokens_seen": 36007456, + "step": 29580 + }, + { + "epoch": 3.2949103463637375, + "grad_norm": 0.0029974502976983786, + "learning_rate": 4.936434656600837e-05, + "loss": 0.0768, + "num_input_tokens_seen": 36013600, + "step": 29585 + }, + { + "epoch": 3.295467201247355, + "grad_norm": 0.003991435281932354, + "learning_rate": 4.936380202815059e-05, + "loss": 0.0418, + "num_input_tokens_seen": 36019744, + "step": 29590 + }, + { + "epoch": 3.2960240561309724, + "grad_norm": 0.5177038311958313, + "learning_rate": 4.936325726015718e-05, + "loss": 0.2267, + "num_input_tokens_seen": 36025888, + "step": 29595 + }, + { + "epoch": 3.2965809110145896, + "grad_norm": 1.1159683465957642, + "learning_rate": 4.936271226203328e-05, + "loss": 0.1139, + "num_input_tokens_seen": 36032096, + "step": 29600 + }, + { + "epoch": 3.297137765898207, + "grad_norm": 0.00762277701869607, + "learning_rate": 4.9362167033784054e-05, + "loss": 0.0959, + "num_input_tokens_seen": 36038528, + "step": 29605 + }, + { + "epoch": 3.297694620781824, + "grad_norm": 0.5453372597694397, + "learning_rate": 4.936162157541464e-05, + "loss": 0.1704, + "num_input_tokens_seen": 36044192, + "step": 29610 + }, + { + "epoch": 3.2982514756654417, + "grad_norm": 0.4894438683986664, + "learning_rate": 4.936107588693019e-05, + "loss": 0.1461, + "num_input_tokens_seen": 36050016, + "step": 29615 + }, + { + "epoch": 3.298808330549059, + "grad_norm": 0.014640948735177517, + "learning_rate": 4.9360529968335853e-05, + "loss": 0.1379, + "num_input_tokens_seen": 36056480, + "step": 29620 + }, + { + "epoch": 3.299365185432676, + "grad_norm": 1.1730868816375732, + "learning_rate": 4.935998381963679e-05, + "loss": 0.1618, + "num_input_tokens_seen": 36062816, + "step": 29625 + }, + { + "epoch": 3.2999220403162934, + "grad_norm": 0.1749853640794754, + "learning_rate": 4.935943744083818e-05, + "loss": 0.0303, + "num_input_tokens_seen": 36069120, + "step": 29630 + }, + { + "epoch": 3.3004788951999107, + "grad_norm": 0.17658470571041107, + "learning_rate": 4.935889083194516e-05, + "loss": 0.0503, + "num_input_tokens_seen": 36075008, + "step": 29635 + }, + { + "epoch": 3.3010357500835283, + "grad_norm": 1.0218486785888672, + "learning_rate": 4.9358343992962896e-05, + "loss": 0.0801, + "num_input_tokens_seen": 36081216, + "step": 29640 + }, + { + "epoch": 3.3015926049671456, + "grad_norm": 0.4360475242137909, + "learning_rate": 4.935779692389656e-05, + "loss": 0.0357, + "num_input_tokens_seen": 36087552, + "step": 29645 + }, + { + "epoch": 3.302149459850763, + "grad_norm": 0.04481235891580582, + "learning_rate": 4.935724962475131e-05, + "loss": 0.0781, + "num_input_tokens_seen": 36093760, + "step": 29650 + }, + { + "epoch": 3.30270631473438, + "grad_norm": 0.4219720959663391, + "learning_rate": 4.935670209553234e-05, + "loss": 0.0981, + "num_input_tokens_seen": 36099552, + "step": 29655 + }, + { + "epoch": 3.3032631696179977, + "grad_norm": 0.6905175447463989, + "learning_rate": 4.9356154336244786e-05, + "loss": 0.0917, + "num_input_tokens_seen": 36105952, + "step": 29660 + }, + { + "epoch": 3.303820024501615, + "grad_norm": 0.2593996524810791, + "learning_rate": 4.935560634689385e-05, + "loss": 0.0278, + "num_input_tokens_seen": 36112224, + "step": 29665 + }, + { + "epoch": 3.304376879385232, + "grad_norm": 0.0025560660287737846, + "learning_rate": 4.9355058127484696e-05, + "loss": 0.1536, + "num_input_tokens_seen": 36118464, + "step": 29670 + }, + { + "epoch": 3.3049337342688494, + "grad_norm": 0.6648044586181641, + "learning_rate": 4.9354509678022506e-05, + "loss": 0.0775, + "num_input_tokens_seen": 36124896, + "step": 29675 + }, + { + "epoch": 3.305490589152467, + "grad_norm": 0.32246455550193787, + "learning_rate": 4.935396099851246e-05, + "loss": 0.048, + "num_input_tokens_seen": 36131488, + "step": 29680 + }, + { + "epoch": 3.3060474440360843, + "grad_norm": 0.04235413670539856, + "learning_rate": 4.935341208895974e-05, + "loss": 0.1449, + "num_input_tokens_seen": 36137568, + "step": 29685 + }, + { + "epoch": 3.3066042989197015, + "grad_norm": 1.4596757888793945, + "learning_rate": 4.9352862949369526e-05, + "loss": 0.1306, + "num_input_tokens_seen": 36143584, + "step": 29690 + }, + { + "epoch": 3.3071611538033188, + "grad_norm": 0.18962439894676208, + "learning_rate": 4.935231357974702e-05, + "loss": 0.1294, + "num_input_tokens_seen": 36150080, + "step": 29695 + }, + { + "epoch": 3.307718008686936, + "grad_norm": 0.0035782752092927694, + "learning_rate": 4.93517639800974e-05, + "loss": 0.0161, + "num_input_tokens_seen": 36156192, + "step": 29700 + }, + { + "epoch": 3.3082748635705537, + "grad_norm": 0.3738623559474945, + "learning_rate": 4.935121415042585e-05, + "loss": 0.0432, + "num_input_tokens_seen": 36162400, + "step": 29705 + }, + { + "epoch": 3.308831718454171, + "grad_norm": 0.5053384900093079, + "learning_rate": 4.9350664090737574e-05, + "loss": 0.0311, + "num_input_tokens_seen": 36168288, + "step": 29710 + }, + { + "epoch": 3.309388573337788, + "grad_norm": 4.036724090576172, + "learning_rate": 4.935011380103777e-05, + "loss": 0.1356, + "num_input_tokens_seen": 36174656, + "step": 29715 + }, + { + "epoch": 3.3099454282214054, + "grad_norm": 0.027738723903894424, + "learning_rate": 4.934956328133164e-05, + "loss": 0.0485, + "num_input_tokens_seen": 36180896, + "step": 29720 + }, + { + "epoch": 3.3105022831050226, + "grad_norm": 0.15056546032428741, + "learning_rate": 4.9349012531624364e-05, + "loss": 0.046, + "num_input_tokens_seen": 36186688, + "step": 29725 + }, + { + "epoch": 3.3110591379886403, + "grad_norm": 0.46992921829223633, + "learning_rate": 4.934846155192116e-05, + "loss": 0.1335, + "num_input_tokens_seen": 36193344, + "step": 29730 + }, + { + "epoch": 3.3116159928722575, + "grad_norm": 0.8210551738739014, + "learning_rate": 4.934791034222723e-05, + "loss": 0.0532, + "num_input_tokens_seen": 36199424, + "step": 29735 + }, + { + "epoch": 3.3121728477558747, + "grad_norm": 0.0235662292689085, + "learning_rate": 4.934735890254778e-05, + "loss": 0.1135, + "num_input_tokens_seen": 36205504, + "step": 29740 + }, + { + "epoch": 3.312729702639492, + "grad_norm": 0.12259582430124283, + "learning_rate": 4.934680723288802e-05, + "loss": 0.0699, + "num_input_tokens_seen": 36211776, + "step": 29745 + }, + { + "epoch": 3.3132865575231096, + "grad_norm": 1.1504902839660645, + "learning_rate": 4.9346255333253155e-05, + "loss": 0.0737, + "num_input_tokens_seen": 36217952, + "step": 29750 + }, + { + "epoch": 3.313843412406727, + "grad_norm": 0.641762375831604, + "learning_rate": 4.934570320364841e-05, + "loss": 0.0783, + "num_input_tokens_seen": 36224032, + "step": 29755 + }, + { + "epoch": 3.314400267290344, + "grad_norm": 0.9522316455841064, + "learning_rate": 4.9345150844078984e-05, + "loss": 0.0732, + "num_input_tokens_seen": 36230336, + "step": 29760 + }, + { + "epoch": 3.3149571221739613, + "grad_norm": 0.003667940618470311, + "learning_rate": 4.9344598254550114e-05, + "loss": 0.1141, + "num_input_tokens_seen": 36236544, + "step": 29765 + }, + { + "epoch": 3.315513977057579, + "grad_norm": 0.891899585723877, + "learning_rate": 4.9344045435067e-05, + "loss": 0.1236, + "num_input_tokens_seen": 36242592, + "step": 29770 + }, + { + "epoch": 3.3160708319411962, + "grad_norm": 1.155795693397522, + "learning_rate": 4.934349238563487e-05, + "loss": 0.1184, + "num_input_tokens_seen": 36248736, + "step": 29775 + }, + { + "epoch": 3.3166276868248135, + "grad_norm": 1.3283058404922485, + "learning_rate": 4.934293910625895e-05, + "loss": 0.0931, + "num_input_tokens_seen": 36254848, + "step": 29780 + }, + { + "epoch": 3.3171845417084307, + "grad_norm": 1.8664777278900146, + "learning_rate": 4.934238559694448e-05, + "loss": 0.2054, + "num_input_tokens_seen": 36260928, + "step": 29785 + }, + { + "epoch": 3.317741396592048, + "grad_norm": 0.0014465475687757134, + "learning_rate": 4.9341831857696666e-05, + "loss": 0.0363, + "num_input_tokens_seen": 36267552, + "step": 29790 + }, + { + "epoch": 3.3182982514756656, + "grad_norm": 0.25476232171058655, + "learning_rate": 4.934127788852075e-05, + "loss": 0.0961, + "num_input_tokens_seen": 36273696, + "step": 29795 + }, + { + "epoch": 3.318855106359283, + "grad_norm": 0.35452306270599365, + "learning_rate": 4.9340723689421965e-05, + "loss": 0.024, + "num_input_tokens_seen": 36279808, + "step": 29800 + }, + { + "epoch": 3.3194119612429, + "grad_norm": 0.22618120908737183, + "learning_rate": 4.9340169260405535e-05, + "loss": 0.1064, + "num_input_tokens_seen": 36285696, + "step": 29805 + }, + { + "epoch": 3.3199688161265173, + "grad_norm": 0.04896138608455658, + "learning_rate": 4.9339614601476716e-05, + "loss": 0.0242, + "num_input_tokens_seen": 36291744, + "step": 29810 + }, + { + "epoch": 3.3205256710101345, + "grad_norm": 0.13850167393684387, + "learning_rate": 4.9339059712640726e-05, + "loss": 0.0864, + "num_input_tokens_seen": 36297664, + "step": 29815 + }, + { + "epoch": 3.321082525893752, + "grad_norm": 1.3317804336547852, + "learning_rate": 4.933850459390282e-05, + "loss": 0.0478, + "num_input_tokens_seen": 36303712, + "step": 29820 + }, + { + "epoch": 3.3216393807773694, + "grad_norm": 0.7153336405754089, + "learning_rate": 4.9337949245268244e-05, + "loss": 0.1646, + "num_input_tokens_seen": 36309984, + "step": 29825 + }, + { + "epoch": 3.3221962356609867, + "grad_norm": 1.2809772491455078, + "learning_rate": 4.933739366674223e-05, + "loss": 0.1884, + "num_input_tokens_seen": 36316064, + "step": 29830 + }, + { + "epoch": 3.322753090544604, + "grad_norm": 0.37884998321533203, + "learning_rate": 4.933683785833004e-05, + "loss": 0.0485, + "num_input_tokens_seen": 36322368, + "step": 29835 + }, + { + "epoch": 3.3233099454282216, + "grad_norm": 0.6936231255531311, + "learning_rate": 4.9336281820036915e-05, + "loss": 0.0957, + "num_input_tokens_seen": 36328448, + "step": 29840 + }, + { + "epoch": 3.323866800311839, + "grad_norm": 0.007484115660190582, + "learning_rate": 4.933572555186812e-05, + "loss": 0.0386, + "num_input_tokens_seen": 36334848, + "step": 29845 + }, + { + "epoch": 3.324423655195456, + "grad_norm": 0.6349025368690491, + "learning_rate": 4.9335169053828886e-05, + "loss": 0.1147, + "num_input_tokens_seen": 36341280, + "step": 29850 + }, + { + "epoch": 3.3249805100790732, + "grad_norm": 0.9277727007865906, + "learning_rate": 4.933461232592449e-05, + "loss": 0.0603, + "num_input_tokens_seen": 36347296, + "step": 29855 + }, + { + "epoch": 3.325537364962691, + "grad_norm": 0.08435854315757751, + "learning_rate": 4.933405536816018e-05, + "loss": 0.056, + "num_input_tokens_seen": 36353472, + "step": 29860 + }, + { + "epoch": 3.326094219846308, + "grad_norm": 1.467161774635315, + "learning_rate": 4.933349818054123e-05, + "loss": 0.1033, + "num_input_tokens_seen": 36359680, + "step": 29865 + }, + { + "epoch": 3.3266510747299254, + "grad_norm": 1.3487316370010376, + "learning_rate": 4.933294076307288e-05, + "loss": 0.1337, + "num_input_tokens_seen": 36365184, + "step": 29870 + }, + { + "epoch": 3.3272079296135426, + "grad_norm": 0.4175805449485779, + "learning_rate": 4.933238311576042e-05, + "loss": 0.0642, + "num_input_tokens_seen": 36371072, + "step": 29875 + }, + { + "epoch": 3.32776478449716, + "grad_norm": 0.09566663205623627, + "learning_rate": 4.93318252386091e-05, + "loss": 0.1027, + "num_input_tokens_seen": 36377312, + "step": 29880 + }, + { + "epoch": 3.3283216393807775, + "grad_norm": 0.06678848713636398, + "learning_rate": 4.933126713162421e-05, + "loss": 0.0316, + "num_input_tokens_seen": 36383424, + "step": 29885 + }, + { + "epoch": 3.3288784942643947, + "grad_norm": 0.5203629732131958, + "learning_rate": 4.933070879481099e-05, + "loss": 0.1183, + "num_input_tokens_seen": 36389344, + "step": 29890 + }, + { + "epoch": 3.329435349148012, + "grad_norm": 0.5650519132614136, + "learning_rate": 4.9330150228174746e-05, + "loss": 0.1533, + "num_input_tokens_seen": 36395328, + "step": 29895 + }, + { + "epoch": 3.329992204031629, + "grad_norm": 0.6478967666625977, + "learning_rate": 4.932959143172073e-05, + "loss": 0.2957, + "num_input_tokens_seen": 36401152, + "step": 29900 + }, + { + "epoch": 3.3305490589152464, + "grad_norm": 0.5473718047142029, + "learning_rate": 4.932903240545424e-05, + "loss": 0.0589, + "num_input_tokens_seen": 36407360, + "step": 29905 + }, + { + "epoch": 3.331105913798864, + "grad_norm": 0.3418862521648407, + "learning_rate": 4.9328473149380535e-05, + "loss": 0.076, + "num_input_tokens_seen": 36413408, + "step": 29910 + }, + { + "epoch": 3.3316627686824813, + "grad_norm": 0.9285635948181152, + "learning_rate": 4.932791366350492e-05, + "loss": 0.0862, + "num_input_tokens_seen": 36419584, + "step": 29915 + }, + { + "epoch": 3.3322196235660986, + "grad_norm": 0.04072117805480957, + "learning_rate": 4.932735394783266e-05, + "loss": 0.0547, + "num_input_tokens_seen": 36425600, + "step": 29920 + }, + { + "epoch": 3.3327764784497163, + "grad_norm": 0.4159318804740906, + "learning_rate": 4.932679400236906e-05, + "loss": 0.1221, + "num_input_tokens_seen": 36431456, + "step": 29925 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.4448363482952118, + "learning_rate": 4.932623382711939e-05, + "loss": 0.152, + "num_input_tokens_seen": 36437376, + "step": 29930 + }, + { + "epoch": 3.3338901882169507, + "grad_norm": 3.0295398235321045, + "learning_rate": 4.932567342208896e-05, + "loss": 0.1007, + "num_input_tokens_seen": 36443712, + "step": 29935 + }, + { + "epoch": 3.334447043100568, + "grad_norm": 0.6855589747428894, + "learning_rate": 4.932511278728306e-05, + "loss": 0.1128, + "num_input_tokens_seen": 36449536, + "step": 29940 + }, + { + "epoch": 3.335003897984185, + "grad_norm": 0.2674548625946045, + "learning_rate": 4.932455192270697e-05, + "loss": 0.0257, + "num_input_tokens_seen": 36455872, + "step": 29945 + }, + { + "epoch": 3.335560752867803, + "grad_norm": 0.004050259944051504, + "learning_rate": 4.9323990828366e-05, + "loss": 0.0415, + "num_input_tokens_seen": 36462080, + "step": 29950 + }, + { + "epoch": 3.33611760775142, + "grad_norm": 0.8371367454528809, + "learning_rate": 4.932342950426545e-05, + "loss": 0.0933, + "num_input_tokens_seen": 36468160, + "step": 29955 + }, + { + "epoch": 3.3366744626350373, + "grad_norm": 0.5839406847953796, + "learning_rate": 4.932286795041062e-05, + "loss": 0.0467, + "num_input_tokens_seen": 36474112, + "step": 29960 + }, + { + "epoch": 3.3372313175186545, + "grad_norm": 0.24066828191280365, + "learning_rate": 4.932230616680682e-05, + "loss": 0.0431, + "num_input_tokens_seen": 36480352, + "step": 29965 + }, + { + "epoch": 3.3377881724022718, + "grad_norm": 0.015836860984563828, + "learning_rate": 4.932174415345935e-05, + "loss": 0.0223, + "num_input_tokens_seen": 36486368, + "step": 29970 + }, + { + "epoch": 3.3383450272858894, + "grad_norm": 0.0734027773141861, + "learning_rate": 4.9321181910373515e-05, + "loss": 0.0935, + "num_input_tokens_seen": 36492480, + "step": 29975 + }, + { + "epoch": 3.3389018821695067, + "grad_norm": 0.26936331391334534, + "learning_rate": 4.932061943755464e-05, + "loss": 0.1002, + "num_input_tokens_seen": 36498912, + "step": 29980 + }, + { + "epoch": 3.339458737053124, + "grad_norm": 0.0036746517289429903, + "learning_rate": 4.9320056735008024e-05, + "loss": 0.0578, + "num_input_tokens_seen": 36504448, + "step": 29985 + }, + { + "epoch": 3.340015591936741, + "grad_norm": 0.45627284049987793, + "learning_rate": 4.931949380273899e-05, + "loss": 0.1242, + "num_input_tokens_seen": 36510816, + "step": 29990 + }, + { + "epoch": 3.3405724468203584, + "grad_norm": 0.39076027274131775, + "learning_rate": 4.931893064075285e-05, + "loss": 0.1136, + "num_input_tokens_seen": 36517248, + "step": 29995 + }, + { + "epoch": 3.341129301703976, + "grad_norm": 0.0799548402428627, + "learning_rate": 4.931836724905492e-05, + "loss": 0.031, + "num_input_tokens_seen": 36523488, + "step": 30000 + }, + { + "epoch": 3.3416861565875933, + "grad_norm": 0.49860334396362305, + "learning_rate": 4.931780362765053e-05, + "loss": 0.0475, + "num_input_tokens_seen": 36529408, + "step": 30005 + }, + { + "epoch": 3.3422430114712105, + "grad_norm": 0.27588939666748047, + "learning_rate": 4.931723977654501e-05, + "loss": 0.0851, + "num_input_tokens_seen": 36535392, + "step": 30010 + }, + { + "epoch": 3.342799866354828, + "grad_norm": 1.2075176239013672, + "learning_rate": 4.9316675695743674e-05, + "loss": 0.071, + "num_input_tokens_seen": 36541088, + "step": 30015 + }, + { + "epoch": 3.3433567212384454, + "grad_norm": 0.025522857904434204, + "learning_rate": 4.931611138525185e-05, + "loss": 0.0922, + "num_input_tokens_seen": 36547040, + "step": 30020 + }, + { + "epoch": 3.3439135761220626, + "grad_norm": 0.4401102066040039, + "learning_rate": 4.9315546845074876e-05, + "loss": 0.0877, + "num_input_tokens_seen": 36553472, + "step": 30025 + }, + { + "epoch": 3.34447043100568, + "grad_norm": 0.05144791305065155, + "learning_rate": 4.931498207521808e-05, + "loss": 0.0787, + "num_input_tokens_seen": 36558976, + "step": 30030 + }, + { + "epoch": 3.345027285889297, + "grad_norm": 0.615835964679718, + "learning_rate": 4.9314417075686795e-05, + "loss": 0.1211, + "num_input_tokens_seen": 36565024, + "step": 30035 + }, + { + "epoch": 3.3455841407729148, + "grad_norm": 0.2551099359989166, + "learning_rate": 4.931385184648636e-05, + "loss": 0.1276, + "num_input_tokens_seen": 36571040, + "step": 30040 + }, + { + "epoch": 3.346140995656532, + "grad_norm": 0.052096396684646606, + "learning_rate": 4.931328638762212e-05, + "loss": 0.044, + "num_input_tokens_seen": 36577024, + "step": 30045 + }, + { + "epoch": 3.3466978505401492, + "grad_norm": 0.6670276522636414, + "learning_rate": 4.931272069909941e-05, + "loss": 0.179, + "num_input_tokens_seen": 36583296, + "step": 30050 + }, + { + "epoch": 3.3472547054237665, + "grad_norm": 1.218182921409607, + "learning_rate": 4.931215478092357e-05, + "loss": 0.0968, + "num_input_tokens_seen": 36589536, + "step": 30055 + }, + { + "epoch": 3.3478115603073837, + "grad_norm": 1.6059706211090088, + "learning_rate": 4.931158863309995e-05, + "loss": 0.1734, + "num_input_tokens_seen": 36595520, + "step": 30060 + }, + { + "epoch": 3.3483684151910014, + "grad_norm": 2.326599597930908, + "learning_rate": 4.9311022255633896e-05, + "loss": 0.1719, + "num_input_tokens_seen": 36601312, + "step": 30065 + }, + { + "epoch": 3.3489252700746186, + "grad_norm": 0.3886299431324005, + "learning_rate": 4.931045564853076e-05, + "loss": 0.0626, + "num_input_tokens_seen": 36607168, + "step": 30070 + }, + { + "epoch": 3.349482124958236, + "grad_norm": 0.6736785173416138, + "learning_rate": 4.93098888117959e-05, + "loss": 0.1395, + "num_input_tokens_seen": 36612896, + "step": 30075 + }, + { + "epoch": 3.350038979841853, + "grad_norm": 0.7462093234062195, + "learning_rate": 4.9309321745434655e-05, + "loss": 0.1104, + "num_input_tokens_seen": 36618880, + "step": 30080 + }, + { + "epoch": 3.3505958347254707, + "grad_norm": 1.4999159574508667, + "learning_rate": 4.93087544494524e-05, + "loss": 0.1924, + "num_input_tokens_seen": 36624800, + "step": 30085 + }, + { + "epoch": 3.351152689609088, + "grad_norm": 0.014579000882804394, + "learning_rate": 4.930818692385447e-05, + "loss": 0.0845, + "num_input_tokens_seen": 36630496, + "step": 30090 + }, + { + "epoch": 3.351709544492705, + "grad_norm": 0.05861121043562889, + "learning_rate": 4.9307619168646246e-05, + "loss": 0.1263, + "num_input_tokens_seen": 36636384, + "step": 30095 + }, + { + "epoch": 3.3522663993763224, + "grad_norm": 0.005427639931440353, + "learning_rate": 4.9307051183833085e-05, + "loss": 0.0509, + "num_input_tokens_seen": 36642624, + "step": 30100 + }, + { + "epoch": 3.35282325425994, + "grad_norm": 0.7239711284637451, + "learning_rate": 4.930648296942035e-05, + "loss": 0.173, + "num_input_tokens_seen": 36648480, + "step": 30105 + }, + { + "epoch": 3.3533801091435573, + "grad_norm": 0.030624985694885254, + "learning_rate": 4.930591452541341e-05, + "loss": 0.1055, + "num_input_tokens_seen": 36654496, + "step": 30110 + }, + { + "epoch": 3.3539369640271746, + "grad_norm": 0.9003844261169434, + "learning_rate": 4.930534585181763e-05, + "loss": 0.0712, + "num_input_tokens_seen": 36660480, + "step": 30115 + }, + { + "epoch": 3.354493818910792, + "grad_norm": 0.8278446197509766, + "learning_rate": 4.930477694863839e-05, + "loss": 0.1084, + "num_input_tokens_seen": 36666688, + "step": 30120 + }, + { + "epoch": 3.355050673794409, + "grad_norm": 0.9179643988609314, + "learning_rate": 4.9304207815881055e-05, + "loss": 0.096, + "num_input_tokens_seen": 36672800, + "step": 30125 + }, + { + "epoch": 3.3556075286780267, + "grad_norm": 0.31440725922584534, + "learning_rate": 4.9303638453551006e-05, + "loss": 0.1112, + "num_input_tokens_seen": 36679008, + "step": 30130 + }, + { + "epoch": 3.356164383561644, + "grad_norm": 0.6967577934265137, + "learning_rate": 4.930306886165362e-05, + "loss": 0.1043, + "num_input_tokens_seen": 36684384, + "step": 30135 + }, + { + "epoch": 3.356721238445261, + "grad_norm": 0.2648237347602844, + "learning_rate": 4.9302499040194276e-05, + "loss": 0.169, + "num_input_tokens_seen": 36690816, + "step": 30140 + }, + { + "epoch": 3.3572780933288784, + "grad_norm": 0.06729285418987274, + "learning_rate": 4.930192898917836e-05, + "loss": 0.0812, + "num_input_tokens_seen": 36696768, + "step": 30145 + }, + { + "epoch": 3.3578349482124956, + "grad_norm": 0.7221624255180359, + "learning_rate": 4.930135870861126e-05, + "loss": 0.1421, + "num_input_tokens_seen": 36702528, + "step": 30150 + }, + { + "epoch": 3.3583918030961133, + "grad_norm": 0.5324991941452026, + "learning_rate": 4.930078819849835e-05, + "loss": 0.0254, + "num_input_tokens_seen": 36708672, + "step": 30155 + }, + { + "epoch": 3.3589486579797305, + "grad_norm": 0.5373893976211548, + "learning_rate": 4.9300217458845024e-05, + "loss": 0.1133, + "num_input_tokens_seen": 36714432, + "step": 30160 + }, + { + "epoch": 3.3595055128633478, + "grad_norm": 0.5051231384277344, + "learning_rate": 4.929964648965668e-05, + "loss": 0.1042, + "num_input_tokens_seen": 36720480, + "step": 30165 + }, + { + "epoch": 3.360062367746965, + "grad_norm": 1.3474071025848389, + "learning_rate": 4.92990752909387e-05, + "loss": 0.1463, + "num_input_tokens_seen": 36726624, + "step": 30170 + }, + { + "epoch": 3.3606192226305827, + "grad_norm": 2.196509599685669, + "learning_rate": 4.9298503862696495e-05, + "loss": 0.0668, + "num_input_tokens_seen": 36732768, + "step": 30175 + }, + { + "epoch": 3.3611760775142, + "grad_norm": 0.9219834804534912, + "learning_rate": 4.929793220493545e-05, + "loss": 0.1205, + "num_input_tokens_seen": 36738784, + "step": 30180 + }, + { + "epoch": 3.361732932397817, + "grad_norm": 0.2745446264743805, + "learning_rate": 4.929736031766097e-05, + "loss": 0.0194, + "num_input_tokens_seen": 36745152, + "step": 30185 + }, + { + "epoch": 3.3622897872814344, + "grad_norm": 0.7697990536689758, + "learning_rate": 4.9296788200878443e-05, + "loss": 0.0528, + "num_input_tokens_seen": 36751360, + "step": 30190 + }, + { + "epoch": 3.362846642165052, + "grad_norm": 0.9106770753860474, + "learning_rate": 4.92962158545933e-05, + "loss": 0.1037, + "num_input_tokens_seen": 36757344, + "step": 30195 + }, + { + "epoch": 3.3634034970486693, + "grad_norm": 0.21110372245311737, + "learning_rate": 4.9295643278810924e-05, + "loss": 0.0581, + "num_input_tokens_seen": 36763648, + "step": 30200 + }, + { + "epoch": 3.3639603519322865, + "grad_norm": 0.1931799054145813, + "learning_rate": 4.929507047353673e-05, + "loss": 0.0981, + "num_input_tokens_seen": 36769312, + "step": 30205 + }, + { + "epoch": 3.3645172068159037, + "grad_norm": 0.0539434552192688, + "learning_rate": 4.9294497438776144e-05, + "loss": 0.0788, + "num_input_tokens_seen": 36775040, + "step": 30210 + }, + { + "epoch": 3.365074061699521, + "grad_norm": 0.15672734379768372, + "learning_rate": 4.929392417453456e-05, + "loss": 0.0547, + "num_input_tokens_seen": 36780768, + "step": 30215 + }, + { + "epoch": 3.3656309165831386, + "grad_norm": 0.878157913684845, + "learning_rate": 4.929335068081738e-05, + "loss": 0.2369, + "num_input_tokens_seen": 36786208, + "step": 30220 + }, + { + "epoch": 3.366187771466756, + "grad_norm": 0.5885860323905945, + "learning_rate": 4.929277695763006e-05, + "loss": 0.0142, + "num_input_tokens_seen": 36792608, + "step": 30225 + }, + { + "epoch": 3.366744626350373, + "grad_norm": 0.4362727999687195, + "learning_rate": 4.9292203004977996e-05, + "loss": 0.0536, + "num_input_tokens_seen": 36798528, + "step": 30230 + }, + { + "epoch": 3.3673014812339903, + "grad_norm": 0.5852758884429932, + "learning_rate": 4.929162882286661e-05, + "loss": 0.1049, + "num_input_tokens_seen": 36804512, + "step": 30235 + }, + { + "epoch": 3.3678583361176075, + "grad_norm": 1.6045724153518677, + "learning_rate": 4.929105441130132e-05, + "loss": 0.2125, + "num_input_tokens_seen": 36810944, + "step": 30240 + }, + { + "epoch": 3.368415191001225, + "grad_norm": 0.0071565438993275166, + "learning_rate": 4.929047977028757e-05, + "loss": 0.1097, + "num_input_tokens_seen": 36817280, + "step": 30245 + }, + { + "epoch": 3.3689720458848424, + "grad_norm": 0.5313379764556885, + "learning_rate": 4.928990489983077e-05, + "loss": 0.1455, + "num_input_tokens_seen": 36823168, + "step": 30250 + }, + { + "epoch": 3.3695289007684597, + "grad_norm": 0.2931033968925476, + "learning_rate": 4.928932979993636e-05, + "loss": 0.1504, + "num_input_tokens_seen": 36828928, + "step": 30255 + }, + { + "epoch": 3.370085755652077, + "grad_norm": 1.0519835948944092, + "learning_rate": 4.9288754470609766e-05, + "loss": 0.0958, + "num_input_tokens_seen": 36835072, + "step": 30260 + }, + { + "epoch": 3.3706426105356946, + "grad_norm": 0.47760429978370667, + "learning_rate": 4.928817891185643e-05, + "loss": 0.0313, + "num_input_tokens_seen": 36841088, + "step": 30265 + }, + { + "epoch": 3.371199465419312, + "grad_norm": 1.0213407278060913, + "learning_rate": 4.928760312368179e-05, + "loss": 0.1536, + "num_input_tokens_seen": 36847072, + "step": 30270 + }, + { + "epoch": 3.371756320302929, + "grad_norm": 0.3256371319293976, + "learning_rate": 4.9287027106091275e-05, + "loss": 0.0846, + "num_input_tokens_seen": 36853376, + "step": 30275 + }, + { + "epoch": 3.3723131751865463, + "grad_norm": 1.124855875968933, + "learning_rate": 4.9286450859090325e-05, + "loss": 0.0984, + "num_input_tokens_seen": 36859648, + "step": 30280 + }, + { + "epoch": 3.372870030070164, + "grad_norm": 0.22609707713127136, + "learning_rate": 4.92858743826844e-05, + "loss": 0.0796, + "num_input_tokens_seen": 36866048, + "step": 30285 + }, + { + "epoch": 3.373426884953781, + "grad_norm": 0.2327718883752823, + "learning_rate": 4.928529767687893e-05, + "loss": 0.1706, + "num_input_tokens_seen": 36872032, + "step": 30290 + }, + { + "epoch": 3.3739837398373984, + "grad_norm": 0.6947866082191467, + "learning_rate": 4.9284720741679356e-05, + "loss": 0.0749, + "num_input_tokens_seen": 36877824, + "step": 30295 + }, + { + "epoch": 3.3745405947210156, + "grad_norm": 1.0272507667541504, + "learning_rate": 4.9284143577091145e-05, + "loss": 0.0431, + "num_input_tokens_seen": 36884064, + "step": 30300 + }, + { + "epoch": 3.375097449604633, + "grad_norm": 0.21547843515872955, + "learning_rate": 4.9283566183119745e-05, + "loss": 0.0256, + "num_input_tokens_seen": 36890400, + "step": 30305 + }, + { + "epoch": 3.3756543044882505, + "grad_norm": 0.6517410278320312, + "learning_rate": 4.92829885597706e-05, + "loss": 0.0651, + "num_input_tokens_seen": 36896704, + "step": 30310 + }, + { + "epoch": 3.3762111593718678, + "grad_norm": 0.6745983958244324, + "learning_rate": 4.928241070704919e-05, + "loss": 0.0741, + "num_input_tokens_seen": 36902624, + "step": 30315 + }, + { + "epoch": 3.376768014255485, + "grad_norm": 0.05149121582508087, + "learning_rate": 4.928183262496094e-05, + "loss": 0.12, + "num_input_tokens_seen": 36908832, + "step": 30320 + }, + { + "epoch": 3.3773248691391022, + "grad_norm": 0.372562438249588, + "learning_rate": 4.928125431351133e-05, + "loss": 0.0736, + "num_input_tokens_seen": 36914688, + "step": 30325 + }, + { + "epoch": 3.3778817240227195, + "grad_norm": 0.09473861753940582, + "learning_rate": 4.928067577270582e-05, + "loss": 0.1026, + "num_input_tokens_seen": 36920736, + "step": 30330 + }, + { + "epoch": 3.378438578906337, + "grad_norm": 0.2283375859260559, + "learning_rate": 4.9280097002549875e-05, + "loss": 0.0424, + "num_input_tokens_seen": 36926944, + "step": 30335 + }, + { + "epoch": 3.3789954337899544, + "grad_norm": 0.17881253361701965, + "learning_rate": 4.927951800304896e-05, + "loss": 0.1914, + "num_input_tokens_seen": 36933024, + "step": 30340 + }, + { + "epoch": 3.3795522886735716, + "grad_norm": 0.40354546904563904, + "learning_rate": 4.927893877420854e-05, + "loss": 0.089, + "num_input_tokens_seen": 36938976, + "step": 30345 + }, + { + "epoch": 3.380109143557189, + "grad_norm": 0.227498859167099, + "learning_rate": 4.92783593160341e-05, + "loss": 0.1135, + "num_input_tokens_seen": 36945216, + "step": 30350 + }, + { + "epoch": 3.3806659984408065, + "grad_norm": 0.06945262849330902, + "learning_rate": 4.9277779628531095e-05, + "loss": 0.0372, + "num_input_tokens_seen": 36951264, + "step": 30355 + }, + { + "epoch": 3.3812228533244237, + "grad_norm": 0.02391575276851654, + "learning_rate": 4.927719971170502e-05, + "loss": 0.0969, + "num_input_tokens_seen": 36957376, + "step": 30360 + }, + { + "epoch": 3.381779708208041, + "grad_norm": 0.8081635236740112, + "learning_rate": 4.927661956556134e-05, + "loss": 0.1194, + "num_input_tokens_seen": 36962976, + "step": 30365 + }, + { + "epoch": 3.382336563091658, + "grad_norm": 0.11696044355630875, + "learning_rate": 4.927603919010554e-05, + "loss": 0.0532, + "num_input_tokens_seen": 36969184, + "step": 30370 + }, + { + "epoch": 3.382893417975276, + "grad_norm": 0.2156975120306015, + "learning_rate": 4.927545858534309e-05, + "loss": 0.0413, + "num_input_tokens_seen": 36975648, + "step": 30375 + }, + { + "epoch": 3.383450272858893, + "grad_norm": 0.015872053802013397, + "learning_rate": 4.927487775127949e-05, + "loss": 0.0675, + "num_input_tokens_seen": 36981856, + "step": 30380 + }, + { + "epoch": 3.3840071277425103, + "grad_norm": 0.2480953484773636, + "learning_rate": 4.9274296687920226e-05, + "loss": 0.1474, + "num_input_tokens_seen": 36987104, + "step": 30385 + }, + { + "epoch": 3.3845639826261276, + "grad_norm": 0.5611339807510376, + "learning_rate": 4.927371539527078e-05, + "loss": 0.0898, + "num_input_tokens_seen": 36993152, + "step": 30390 + }, + { + "epoch": 3.385120837509745, + "grad_norm": 0.7356531620025635, + "learning_rate": 4.927313387333664e-05, + "loss": 0.1069, + "num_input_tokens_seen": 36998912, + "step": 30395 + }, + { + "epoch": 3.3856776923933625, + "grad_norm": 1.43009614944458, + "learning_rate": 4.927255212212331e-05, + "loss": 0.1725, + "num_input_tokens_seen": 37005440, + "step": 30400 + }, + { + "epoch": 3.3862345472769797, + "grad_norm": 0.018616776913404465, + "learning_rate": 4.927197014163627e-05, + "loss": 0.0642, + "num_input_tokens_seen": 37011648, + "step": 30405 + }, + { + "epoch": 3.386791402160597, + "grad_norm": 0.1609695851802826, + "learning_rate": 4.927138793188103e-05, + "loss": 0.1058, + "num_input_tokens_seen": 37018048, + "step": 30410 + }, + { + "epoch": 3.387348257044214, + "grad_norm": 0.6866763830184937, + "learning_rate": 4.9270805492863084e-05, + "loss": 0.1176, + "num_input_tokens_seen": 37023968, + "step": 30415 + }, + { + "epoch": 3.3879051119278314, + "grad_norm": 0.033926159143447876, + "learning_rate": 4.9270222824587944e-05, + "loss": 0.1088, + "num_input_tokens_seen": 37029632, + "step": 30420 + }, + { + "epoch": 3.388461966811449, + "grad_norm": 1.240126132965088, + "learning_rate": 4.926963992706109e-05, + "loss": 0.1232, + "num_input_tokens_seen": 37035424, + "step": 30425 + }, + { + "epoch": 3.3890188216950663, + "grad_norm": 0.11562566459178925, + "learning_rate": 4.926905680028805e-05, + "loss": 0.0543, + "num_input_tokens_seen": 37041568, + "step": 30430 + }, + { + "epoch": 3.3895756765786835, + "grad_norm": 0.775926947593689, + "learning_rate": 4.926847344427432e-05, + "loss": 0.1695, + "num_input_tokens_seen": 37047744, + "step": 30435 + }, + { + "epoch": 3.3901325314623008, + "grad_norm": 1.8812819719314575, + "learning_rate": 4.9267889859025416e-05, + "loss": 0.1234, + "num_input_tokens_seen": 37053888, + "step": 30440 + }, + { + "epoch": 3.3906893863459184, + "grad_norm": 0.404819130897522, + "learning_rate": 4.926730604454686e-05, + "loss": 0.2643, + "num_input_tokens_seen": 37059904, + "step": 30445 + }, + { + "epoch": 3.3912462412295357, + "grad_norm": 0.09197075664997101, + "learning_rate": 4.926672200084414e-05, + "loss": 0.0384, + "num_input_tokens_seen": 37066208, + "step": 30450 + }, + { + "epoch": 3.391803096113153, + "grad_norm": 0.2691945731639862, + "learning_rate": 4.9266137727922795e-05, + "loss": 0.0732, + "num_input_tokens_seen": 37072416, + "step": 30455 + }, + { + "epoch": 3.39235995099677, + "grad_norm": 0.5049517154693604, + "learning_rate": 4.9265553225788344e-05, + "loss": 0.103, + "num_input_tokens_seen": 37078688, + "step": 30460 + }, + { + "epoch": 3.392916805880388, + "grad_norm": 0.009986210614442825, + "learning_rate": 4.926496849444629e-05, + "loss": 0.0436, + "num_input_tokens_seen": 37084992, + "step": 30465 + }, + { + "epoch": 3.393473660764005, + "grad_norm": 0.0617523267865181, + "learning_rate": 4.926438353390217e-05, + "loss": 0.0338, + "num_input_tokens_seen": 37091136, + "step": 30470 + }, + { + "epoch": 3.3940305156476223, + "grad_norm": 1.341517448425293, + "learning_rate": 4.9263798344161516e-05, + "loss": 0.1957, + "num_input_tokens_seen": 37097472, + "step": 30475 + }, + { + "epoch": 3.3945873705312395, + "grad_norm": 0.4498688876628876, + "learning_rate": 4.926321292522984e-05, + "loss": 0.1251, + "num_input_tokens_seen": 37103424, + "step": 30480 + }, + { + "epoch": 3.3951442254148567, + "grad_norm": 0.10516726970672607, + "learning_rate": 4.9262627277112675e-05, + "loss": 0.0241, + "num_input_tokens_seen": 37109344, + "step": 30485 + }, + { + "epoch": 3.3957010802984744, + "grad_norm": 0.17239074409008026, + "learning_rate": 4.926204139981556e-05, + "loss": 0.025, + "num_input_tokens_seen": 37115744, + "step": 30490 + }, + { + "epoch": 3.3962579351820916, + "grad_norm": 0.8195309042930603, + "learning_rate": 4.926145529334403e-05, + "loss": 0.0869, + "num_input_tokens_seen": 37121952, + "step": 30495 + }, + { + "epoch": 3.396814790065709, + "grad_norm": 0.3438425660133362, + "learning_rate": 4.926086895770361e-05, + "loss": 0.0586, + "num_input_tokens_seen": 37128224, + "step": 30500 + }, + { + "epoch": 3.397371644949326, + "grad_norm": 0.9320237040519714, + "learning_rate": 4.926028239289985e-05, + "loss": 0.1268, + "num_input_tokens_seen": 37134304, + "step": 30505 + }, + { + "epoch": 3.3979284998329433, + "grad_norm": 0.45086297392845154, + "learning_rate": 4.925969559893828e-05, + "loss": 0.1584, + "num_input_tokens_seen": 37140768, + "step": 30510 + }, + { + "epoch": 3.398485354716561, + "grad_norm": 0.7857404947280884, + "learning_rate": 4.9259108575824455e-05, + "loss": 0.0974, + "num_input_tokens_seen": 37147136, + "step": 30515 + }, + { + "epoch": 3.399042209600178, + "grad_norm": 0.9137835502624512, + "learning_rate": 4.925852132356391e-05, + "loss": 0.0305, + "num_input_tokens_seen": 37153056, + "step": 30520 + }, + { + "epoch": 3.3995990644837955, + "grad_norm": 1.311936855316162, + "learning_rate": 4.92579338421622e-05, + "loss": 0.1871, + "num_input_tokens_seen": 37159040, + "step": 30525 + }, + { + "epoch": 3.4001559193674127, + "grad_norm": 0.3256229758262634, + "learning_rate": 4.9257346131624874e-05, + "loss": 0.0278, + "num_input_tokens_seen": 37165440, + "step": 30530 + }, + { + "epoch": 3.4007127742510304, + "grad_norm": 0.16493485867977142, + "learning_rate": 4.9256758191957464e-05, + "loss": 0.056, + "num_input_tokens_seen": 37171648, + "step": 30535 + }, + { + "epoch": 3.4012696291346476, + "grad_norm": 0.6093107461929321, + "learning_rate": 4.925617002316555e-05, + "loss": 0.1233, + "num_input_tokens_seen": 37177568, + "step": 30540 + }, + { + "epoch": 3.401826484018265, + "grad_norm": 0.9998424649238586, + "learning_rate": 4.925558162525467e-05, + "loss": 0.1873, + "num_input_tokens_seen": 37183584, + "step": 30545 + }, + { + "epoch": 3.402383338901882, + "grad_norm": 0.7881739735603333, + "learning_rate": 4.925499299823039e-05, + "loss": 0.1764, + "num_input_tokens_seen": 37189792, + "step": 30550 + }, + { + "epoch": 3.4029401937854997, + "grad_norm": 1.3562464714050293, + "learning_rate": 4.925440414209827e-05, + "loss": 0.1067, + "num_input_tokens_seen": 37196128, + "step": 30555 + }, + { + "epoch": 3.403497048669117, + "grad_norm": 0.4847549498081207, + "learning_rate": 4.925381505686387e-05, + "loss": 0.0371, + "num_input_tokens_seen": 37202432, + "step": 30560 + }, + { + "epoch": 3.404053903552734, + "grad_norm": 0.8055010437965393, + "learning_rate": 4.925322574253276e-05, + "loss": 0.2356, + "num_input_tokens_seen": 37207936, + "step": 30565 + }, + { + "epoch": 3.4046107584363514, + "grad_norm": 0.023580282926559448, + "learning_rate": 4.925263619911049e-05, + "loss": 0.0476, + "num_input_tokens_seen": 37213856, + "step": 30570 + }, + { + "epoch": 3.4051676133199686, + "grad_norm": 0.031047377735376358, + "learning_rate": 4.925204642660265e-05, + "loss": 0.0294, + "num_input_tokens_seen": 37220128, + "step": 30575 + }, + { + "epoch": 3.4057244682035863, + "grad_norm": 0.0450633242726326, + "learning_rate": 4.9251456425014794e-05, + "loss": 0.1346, + "num_input_tokens_seen": 37226400, + "step": 30580 + }, + { + "epoch": 3.4062813230872035, + "grad_norm": 1.7161157131195068, + "learning_rate": 4.925086619435251e-05, + "loss": 0.1512, + "num_input_tokens_seen": 37232672, + "step": 30585 + }, + { + "epoch": 3.406838177970821, + "grad_norm": 0.007978379726409912, + "learning_rate": 4.925027573462136e-05, + "loss": 0.0268, + "num_input_tokens_seen": 37238688, + "step": 30590 + }, + { + "epoch": 3.407395032854438, + "grad_norm": 0.8446102142333984, + "learning_rate": 4.9249685045826925e-05, + "loss": 0.0297, + "num_input_tokens_seen": 37244992, + "step": 30595 + }, + { + "epoch": 3.4079518877380552, + "grad_norm": 0.013130987994372845, + "learning_rate": 4.924909412797479e-05, + "loss": 0.1333, + "num_input_tokens_seen": 37250912, + "step": 30600 + }, + { + "epoch": 3.408508742621673, + "grad_norm": 0.36251360177993774, + "learning_rate": 4.924850298107053e-05, + "loss": 0.1301, + "num_input_tokens_seen": 37257088, + "step": 30605 + }, + { + "epoch": 3.40906559750529, + "grad_norm": 0.47447091341018677, + "learning_rate": 4.924791160511973e-05, + "loss": 0.0974, + "num_input_tokens_seen": 37263296, + "step": 30610 + }, + { + "epoch": 3.4096224523889074, + "grad_norm": 0.07264840602874756, + "learning_rate": 4.924732000012799e-05, + "loss": 0.0258, + "num_input_tokens_seen": 37269376, + "step": 30615 + }, + { + "epoch": 3.4101793072725246, + "grad_norm": 0.17272216081619263, + "learning_rate": 4.924672816610088e-05, + "loss": 0.1746, + "num_input_tokens_seen": 37275552, + "step": 30620 + }, + { + "epoch": 3.4107361621561423, + "grad_norm": 0.028714073821902275, + "learning_rate": 4.9246136103043985e-05, + "loss": 0.0442, + "num_input_tokens_seen": 37281920, + "step": 30625 + }, + { + "epoch": 3.4112930170397595, + "grad_norm": 0.709976851940155, + "learning_rate": 4.924554381096292e-05, + "loss": 0.0712, + "num_input_tokens_seen": 37288288, + "step": 30630 + }, + { + "epoch": 3.4118498719233767, + "grad_norm": 0.2914276421070099, + "learning_rate": 4.924495128986327e-05, + "loss": 0.0723, + "num_input_tokens_seen": 37294336, + "step": 30635 + }, + { + "epoch": 3.412406726806994, + "grad_norm": 0.5901317000389099, + "learning_rate": 4.924435853975062e-05, + "loss": 0.0533, + "num_input_tokens_seen": 37300320, + "step": 30640 + }, + { + "epoch": 3.4129635816906116, + "grad_norm": 0.5351224541664124, + "learning_rate": 4.92437655606306e-05, + "loss": 0.1622, + "num_input_tokens_seen": 37306112, + "step": 30645 + }, + { + "epoch": 3.413520436574229, + "grad_norm": 1.1504982709884644, + "learning_rate": 4.924317235250877e-05, + "loss": 0.0751, + "num_input_tokens_seen": 37312128, + "step": 30650 + }, + { + "epoch": 3.414077291457846, + "grad_norm": 0.5141434669494629, + "learning_rate": 4.924257891539076e-05, + "loss": 0.0828, + "num_input_tokens_seen": 37317632, + "step": 30655 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 0.4342433512210846, + "learning_rate": 4.924198524928216e-05, + "loss": 0.0472, + "num_input_tokens_seen": 37323712, + "step": 30660 + }, + { + "epoch": 3.4151910012250806, + "grad_norm": 0.1687760055065155, + "learning_rate": 4.9241391354188604e-05, + "loss": 0.0967, + "num_input_tokens_seen": 37329728, + "step": 30665 + }, + { + "epoch": 3.4157478561086982, + "grad_norm": 1.4071462154388428, + "learning_rate": 4.924079723011567e-05, + "loss": 0.1014, + "num_input_tokens_seen": 37335744, + "step": 30670 + }, + { + "epoch": 3.4163047109923155, + "grad_norm": 1.7409725189208984, + "learning_rate": 4.9240202877068995e-05, + "loss": 0.1125, + "num_input_tokens_seen": 37341408, + "step": 30675 + }, + { + "epoch": 3.4168615658759327, + "grad_norm": 1.8350600004196167, + "learning_rate": 4.923960829505419e-05, + "loss": 0.1889, + "num_input_tokens_seen": 37347328, + "step": 30680 + }, + { + "epoch": 3.41741842075955, + "grad_norm": 0.6553136706352234, + "learning_rate": 4.9239013484076845e-05, + "loss": 0.0618, + "num_input_tokens_seen": 37353280, + "step": 30685 + }, + { + "epoch": 3.417975275643167, + "grad_norm": 1.3509904146194458, + "learning_rate": 4.923841844414261e-05, + "loss": 0.2329, + "num_input_tokens_seen": 37358912, + "step": 30690 + }, + { + "epoch": 3.418532130526785, + "grad_norm": 0.007182369939982891, + "learning_rate": 4.9237823175257094e-05, + "loss": 0.0236, + "num_input_tokens_seen": 37365536, + "step": 30695 + }, + { + "epoch": 3.419088985410402, + "grad_norm": 0.39955267310142517, + "learning_rate": 4.923722767742591e-05, + "loss": 0.02, + "num_input_tokens_seen": 37371744, + "step": 30700 + }, + { + "epoch": 3.4196458402940193, + "grad_norm": 0.14124968647956848, + "learning_rate": 4.9236631950654696e-05, + "loss": 0.033, + "num_input_tokens_seen": 37377728, + "step": 30705 + }, + { + "epoch": 3.4202026951776365, + "grad_norm": 1.8111908435821533, + "learning_rate": 4.923603599494908e-05, + "loss": 0.1982, + "num_input_tokens_seen": 37384032, + "step": 30710 + }, + { + "epoch": 3.420759550061254, + "grad_norm": 0.27671536803245544, + "learning_rate": 4.923543981031468e-05, + "loss": 0.0743, + "num_input_tokens_seen": 37390048, + "step": 30715 + }, + { + "epoch": 3.4213164049448714, + "grad_norm": 0.1940920203924179, + "learning_rate": 4.923484339675713e-05, + "loss": 0.0668, + "num_input_tokens_seen": 37396128, + "step": 30720 + }, + { + "epoch": 3.4218732598284887, + "grad_norm": 0.719168484210968, + "learning_rate": 4.9234246754282083e-05, + "loss": 0.1481, + "num_input_tokens_seen": 37401792, + "step": 30725 + }, + { + "epoch": 3.422430114712106, + "grad_norm": 0.3479703366756439, + "learning_rate": 4.923364988289515e-05, + "loss": 0.0764, + "num_input_tokens_seen": 37407776, + "step": 30730 + }, + { + "epoch": 3.4229869695957236, + "grad_norm": 0.6287475824356079, + "learning_rate": 4.923305278260197e-05, + "loss": 0.0471, + "num_input_tokens_seen": 37414368, + "step": 30735 + }, + { + "epoch": 3.423543824479341, + "grad_norm": 0.1404280662536621, + "learning_rate": 4.923245545340821e-05, + "loss": 0.1049, + "num_input_tokens_seen": 37420736, + "step": 30740 + }, + { + "epoch": 3.424100679362958, + "grad_norm": 0.10688341408967972, + "learning_rate": 4.923185789531948e-05, + "loss": 0.0399, + "num_input_tokens_seen": 37426816, + "step": 30745 + }, + { + "epoch": 3.4246575342465753, + "grad_norm": 0.2580030858516693, + "learning_rate": 4.9231260108341445e-05, + "loss": 0.0978, + "num_input_tokens_seen": 37432928, + "step": 30750 + }, + { + "epoch": 3.4252143891301925, + "grad_norm": 0.2200678288936615, + "learning_rate": 4.923066209247974e-05, + "loss": 0.0478, + "num_input_tokens_seen": 37438944, + "step": 30755 + }, + { + "epoch": 3.42577124401381, + "grad_norm": 0.23087656497955322, + "learning_rate": 4.923006384774002e-05, + "loss": 0.075, + "num_input_tokens_seen": 37444832, + "step": 30760 + }, + { + "epoch": 3.4263280988974274, + "grad_norm": 0.32142162322998047, + "learning_rate": 4.9229465374127925e-05, + "loss": 0.1594, + "num_input_tokens_seen": 37451040, + "step": 30765 + }, + { + "epoch": 3.4268849537810446, + "grad_norm": 0.25744760036468506, + "learning_rate": 4.922886667164913e-05, + "loss": 0.0682, + "num_input_tokens_seen": 37456608, + "step": 30770 + }, + { + "epoch": 3.427441808664662, + "grad_norm": 1.968166708946228, + "learning_rate": 4.922826774030928e-05, + "loss": 0.1413, + "num_input_tokens_seen": 37462720, + "step": 30775 + }, + { + "epoch": 3.427998663548279, + "grad_norm": 0.1178552508354187, + "learning_rate": 4.9227668580114016e-05, + "loss": 0.0512, + "num_input_tokens_seen": 37468896, + "step": 30780 + }, + { + "epoch": 3.4285555184318968, + "grad_norm": 1.1396713256835938, + "learning_rate": 4.922706919106902e-05, + "loss": 0.0495, + "num_input_tokens_seen": 37474912, + "step": 30785 + }, + { + "epoch": 3.429112373315514, + "grad_norm": 0.7885523438453674, + "learning_rate": 4.922646957317994e-05, + "loss": 0.1766, + "num_input_tokens_seen": 37480992, + "step": 30790 + }, + { + "epoch": 3.4296692281991312, + "grad_norm": 1.5600192546844482, + "learning_rate": 4.922586972645245e-05, + "loss": 0.2346, + "num_input_tokens_seen": 37487168, + "step": 30795 + }, + { + "epoch": 3.4302260830827485, + "grad_norm": 0.15751051902770996, + "learning_rate": 4.922526965089221e-05, + "loss": 0.0207, + "num_input_tokens_seen": 37493280, + "step": 30800 + }, + { + "epoch": 3.430782937966366, + "grad_norm": 0.2809763550758362, + "learning_rate": 4.922466934650489e-05, + "loss": 0.0485, + "num_input_tokens_seen": 37499808, + "step": 30805 + }, + { + "epoch": 3.4313397928499834, + "grad_norm": 0.38376492261886597, + "learning_rate": 4.9224068813296155e-05, + "loss": 0.0714, + "num_input_tokens_seen": 37505920, + "step": 30810 + }, + { + "epoch": 3.4318966477336006, + "grad_norm": 0.6534329056739807, + "learning_rate": 4.922346805127168e-05, + "loss": 0.0727, + "num_input_tokens_seen": 37511904, + "step": 30815 + }, + { + "epoch": 3.432453502617218, + "grad_norm": 1.2271828651428223, + "learning_rate": 4.922286706043715e-05, + "loss": 0.0758, + "num_input_tokens_seen": 37517792, + "step": 30820 + }, + { + "epoch": 3.4330103575008355, + "grad_norm": 0.16314852237701416, + "learning_rate": 4.922226584079823e-05, + "loss": 0.0133, + "num_input_tokens_seen": 37524000, + "step": 30825 + }, + { + "epoch": 3.4335672123844527, + "grad_norm": 0.9232980608940125, + "learning_rate": 4.92216643923606e-05, + "loss": 0.1222, + "num_input_tokens_seen": 37529760, + "step": 30830 + }, + { + "epoch": 3.43412406726807, + "grad_norm": 0.20799390971660614, + "learning_rate": 4.922106271512995e-05, + "loss": 0.0382, + "num_input_tokens_seen": 37536032, + "step": 30835 + }, + { + "epoch": 3.434680922151687, + "grad_norm": 0.6804742813110352, + "learning_rate": 4.922046080911196e-05, + "loss": 0.1231, + "num_input_tokens_seen": 37542112, + "step": 30840 + }, + { + "epoch": 3.4352377770353044, + "grad_norm": 0.009174053557217121, + "learning_rate": 4.92198586743123e-05, + "loss": 0.0544, + "num_input_tokens_seen": 37548416, + "step": 30845 + }, + { + "epoch": 3.435794631918922, + "grad_norm": 0.8326523303985596, + "learning_rate": 4.921925631073667e-05, + "loss": 0.0821, + "num_input_tokens_seen": 37554496, + "step": 30850 + }, + { + "epoch": 3.4363514868025393, + "grad_norm": 0.5425325036048889, + "learning_rate": 4.9218653718390776e-05, + "loss": 0.0302, + "num_input_tokens_seen": 37560544, + "step": 30855 + }, + { + "epoch": 3.4369083416861566, + "grad_norm": 0.4034292995929718, + "learning_rate": 4.921805089728028e-05, + "loss": 0.0765, + "num_input_tokens_seen": 37566624, + "step": 30860 + }, + { + "epoch": 3.437465196569774, + "grad_norm": 0.947514533996582, + "learning_rate": 4.921744784741089e-05, + "loss": 0.0349, + "num_input_tokens_seen": 37572800, + "step": 30865 + }, + { + "epoch": 3.438022051453391, + "grad_norm": 1.2981090545654297, + "learning_rate": 4.921684456878831e-05, + "loss": 0.0781, + "num_input_tokens_seen": 37578112, + "step": 30870 + }, + { + "epoch": 3.4385789063370087, + "grad_norm": 0.0010894332081079483, + "learning_rate": 4.9216241061418234e-05, + "loss": 0.0236, + "num_input_tokens_seen": 37584416, + "step": 30875 + }, + { + "epoch": 3.439135761220626, + "grad_norm": 2.064176321029663, + "learning_rate": 4.921563732530635e-05, + "loss": 0.1003, + "num_input_tokens_seen": 37590560, + "step": 30880 + }, + { + "epoch": 3.439692616104243, + "grad_norm": 0.055766310542821884, + "learning_rate": 4.921503336045837e-05, + "loss": 0.0653, + "num_input_tokens_seen": 37596320, + "step": 30885 + }, + { + "epoch": 3.4402494709878604, + "grad_norm": 0.11072994768619537, + "learning_rate": 4.9214429166880006e-05, + "loss": 0.1349, + "num_input_tokens_seen": 37602112, + "step": 30890 + }, + { + "epoch": 3.440806325871478, + "grad_norm": 0.32057517766952515, + "learning_rate": 4.921382474457695e-05, + "loss": 0.0859, + "num_input_tokens_seen": 37608384, + "step": 30895 + }, + { + "epoch": 3.4413631807550953, + "grad_norm": 0.1987529993057251, + "learning_rate": 4.9213220093554924e-05, + "loss": 0.0643, + "num_input_tokens_seen": 37614656, + "step": 30900 + }, + { + "epoch": 3.4419200356387125, + "grad_norm": 0.03977986052632332, + "learning_rate": 4.9212615213819635e-05, + "loss": 0.0503, + "num_input_tokens_seen": 37620768, + "step": 30905 + }, + { + "epoch": 3.4424768905223297, + "grad_norm": 0.2497834414243698, + "learning_rate": 4.92120101053768e-05, + "loss": 0.0379, + "num_input_tokens_seen": 37627040, + "step": 30910 + }, + { + "epoch": 3.4430337454059474, + "grad_norm": 0.29771140217781067, + "learning_rate": 4.921140476823213e-05, + "loss": 0.0323, + "num_input_tokens_seen": 37633344, + "step": 30915 + }, + { + "epoch": 3.4435906002895647, + "grad_norm": 0.3638733923435211, + "learning_rate": 4.921079920239134e-05, + "loss": 0.1188, + "num_input_tokens_seen": 37639584, + "step": 30920 + }, + { + "epoch": 3.444147455173182, + "grad_norm": 0.03916201367974281, + "learning_rate": 4.921019340786015e-05, + "loss": 0.1642, + "num_input_tokens_seen": 37645728, + "step": 30925 + }, + { + "epoch": 3.444704310056799, + "grad_norm": 1.1003903150558472, + "learning_rate": 4.92095873846443e-05, + "loss": 0.0762, + "num_input_tokens_seen": 37652000, + "step": 30930 + }, + { + "epoch": 3.4452611649404163, + "grad_norm": 0.050959181040525436, + "learning_rate": 4.920898113274949e-05, + "loss": 0.0756, + "num_input_tokens_seen": 37657728, + "step": 30935 + }, + { + "epoch": 3.445818019824034, + "grad_norm": 0.3658289313316345, + "learning_rate": 4.920837465218146e-05, + "loss": 0.0474, + "num_input_tokens_seen": 37664064, + "step": 30940 + }, + { + "epoch": 3.4463748747076512, + "grad_norm": 0.025002535432577133, + "learning_rate": 4.920776794294594e-05, + "loss": 0.0455, + "num_input_tokens_seen": 37670368, + "step": 30945 + }, + { + "epoch": 3.4469317295912685, + "grad_norm": 1.4539884328842163, + "learning_rate": 4.9207161005048654e-05, + "loss": 0.0819, + "num_input_tokens_seen": 37676384, + "step": 30950 + }, + { + "epoch": 3.4474885844748857, + "grad_norm": 1.0809786319732666, + "learning_rate": 4.920655383849533e-05, + "loss": 0.0558, + "num_input_tokens_seen": 37682432, + "step": 30955 + }, + { + "epoch": 3.448045439358503, + "grad_norm": 0.7386239767074585, + "learning_rate": 4.920594644329172e-05, + "loss": 0.0491, + "num_input_tokens_seen": 37688128, + "step": 30960 + }, + { + "epoch": 3.4486022942421206, + "grad_norm": 0.010268809273838997, + "learning_rate": 4.920533881944355e-05, + "loss": 0.036, + "num_input_tokens_seen": 37693888, + "step": 30965 + }, + { + "epoch": 3.449159149125738, + "grad_norm": 0.10368960350751877, + "learning_rate": 4.920473096695657e-05, + "loss": 0.1, + "num_input_tokens_seen": 37700096, + "step": 30970 + }, + { + "epoch": 3.449716004009355, + "grad_norm": 0.5201529860496521, + "learning_rate": 4.92041228858365e-05, + "loss": 0.0957, + "num_input_tokens_seen": 37705952, + "step": 30975 + }, + { + "epoch": 3.4502728588929723, + "grad_norm": 1.6496795415878296, + "learning_rate": 4.9203514576089106e-05, + "loss": 0.1794, + "num_input_tokens_seen": 37711648, + "step": 30980 + }, + { + "epoch": 3.45082971377659, + "grad_norm": 0.37905722856521606, + "learning_rate": 4.920290603772012e-05, + "loss": 0.0087, + "num_input_tokens_seen": 37717664, + "step": 30985 + }, + { + "epoch": 3.451386568660207, + "grad_norm": 1.897210955619812, + "learning_rate": 4.92022972707353e-05, + "loss": 0.2052, + "num_input_tokens_seen": 37723904, + "step": 30990 + }, + { + "epoch": 3.4519434235438244, + "grad_norm": 1.7362470626831055, + "learning_rate": 4.920168827514039e-05, + "loss": 0.1268, + "num_input_tokens_seen": 37730112, + "step": 30995 + }, + { + "epoch": 3.4525002784274417, + "grad_norm": 0.8002042770385742, + "learning_rate": 4.9201079050941146e-05, + "loss": 0.0725, + "num_input_tokens_seen": 37736192, + "step": 31000 + }, + { + "epoch": 3.4530571333110593, + "grad_norm": 0.5313451290130615, + "learning_rate": 4.920046959814332e-05, + "loss": 0.0398, + "num_input_tokens_seen": 37742432, + "step": 31005 + }, + { + "epoch": 3.4536139881946766, + "grad_norm": 0.3858013451099396, + "learning_rate": 4.919985991675267e-05, + "loss": 0.0677, + "num_input_tokens_seen": 37748448, + "step": 31010 + }, + { + "epoch": 3.454170843078294, + "grad_norm": 0.08006488531827927, + "learning_rate": 4.919925000677495e-05, + "loss": 0.0218, + "num_input_tokens_seen": 37754784, + "step": 31015 + }, + { + "epoch": 3.454727697961911, + "grad_norm": 0.1893884390592575, + "learning_rate": 4.919863986821592e-05, + "loss": 0.1245, + "num_input_tokens_seen": 37760992, + "step": 31020 + }, + { + "epoch": 3.4552845528455283, + "grad_norm": 0.26542097330093384, + "learning_rate": 4.919802950108136e-05, + "loss": 0.1506, + "num_input_tokens_seen": 37766848, + "step": 31025 + }, + { + "epoch": 3.455841407729146, + "grad_norm": 0.32818111777305603, + "learning_rate": 4.9197418905377024e-05, + "loss": 0.0253, + "num_input_tokens_seen": 37773152, + "step": 31030 + }, + { + "epoch": 3.456398262612763, + "grad_norm": 0.26391279697418213, + "learning_rate": 4.9196808081108675e-05, + "loss": 0.034, + "num_input_tokens_seen": 37779296, + "step": 31035 + }, + { + "epoch": 3.4569551174963804, + "grad_norm": 0.0033644915092736483, + "learning_rate": 4.9196197028282085e-05, + "loss": 0.0969, + "num_input_tokens_seen": 37785440, + "step": 31040 + }, + { + "epoch": 3.4575119723799976, + "grad_norm": 0.5743622183799744, + "learning_rate": 4.919558574690304e-05, + "loss": 0.0883, + "num_input_tokens_seen": 37791168, + "step": 31045 + }, + { + "epoch": 3.458068827263615, + "grad_norm": 1.2368541955947876, + "learning_rate": 4.9194974236977296e-05, + "loss": 0.077, + "num_input_tokens_seen": 37797184, + "step": 31050 + }, + { + "epoch": 3.4586256821472325, + "grad_norm": 0.07190436869859695, + "learning_rate": 4.919436249851063e-05, + "loss": 0.0229, + "num_input_tokens_seen": 37803712, + "step": 31055 + }, + { + "epoch": 3.4591825370308498, + "grad_norm": 0.37854981422424316, + "learning_rate": 4.919375053150883e-05, + "loss": 0.041, + "num_input_tokens_seen": 37809952, + "step": 31060 + }, + { + "epoch": 3.459739391914467, + "grad_norm": 0.26887959241867065, + "learning_rate": 4.919313833597768e-05, + "loss": 0.0844, + "num_input_tokens_seen": 37816128, + "step": 31065 + }, + { + "epoch": 3.4602962467980842, + "grad_norm": 2.056101083755493, + "learning_rate": 4.919252591192295e-05, + "loss": 0.2424, + "num_input_tokens_seen": 37822208, + "step": 31070 + }, + { + "epoch": 3.460853101681702, + "grad_norm": 0.25676968693733215, + "learning_rate": 4.9191913259350424e-05, + "loss": 0.0146, + "num_input_tokens_seen": 37828512, + "step": 31075 + }, + { + "epoch": 3.461409956565319, + "grad_norm": 0.41221514344215393, + "learning_rate": 4.9191300378265905e-05, + "loss": 0.1084, + "num_input_tokens_seen": 37834528, + "step": 31080 + }, + { + "epoch": 3.4619668114489364, + "grad_norm": 0.808795690536499, + "learning_rate": 4.919068726867516e-05, + "loss": 0.1268, + "num_input_tokens_seen": 37840544, + "step": 31085 + }, + { + "epoch": 3.4625236663325536, + "grad_norm": 0.3480266332626343, + "learning_rate": 4.9190073930584e-05, + "loss": 0.1158, + "num_input_tokens_seen": 37846496, + "step": 31090 + }, + { + "epoch": 3.4630805212161713, + "grad_norm": 0.6422951221466064, + "learning_rate": 4.918946036399821e-05, + "loss": 0.0293, + "num_input_tokens_seen": 37852640, + "step": 31095 + }, + { + "epoch": 3.4636373760997885, + "grad_norm": 0.002297814469784498, + "learning_rate": 4.918884656892359e-05, + "loss": 0.0344, + "num_input_tokens_seen": 37858688, + "step": 31100 + }, + { + "epoch": 3.4641942309834057, + "grad_norm": 1.3078693151474, + "learning_rate": 4.918823254536593e-05, + "loss": 0.1936, + "num_input_tokens_seen": 37864704, + "step": 31105 + }, + { + "epoch": 3.464751085867023, + "grad_norm": 0.16293959319591522, + "learning_rate": 4.918761829333104e-05, + "loss": 0.0587, + "num_input_tokens_seen": 37870944, + "step": 31110 + }, + { + "epoch": 3.46530794075064, + "grad_norm": 0.12718556821346283, + "learning_rate": 4.9187003812824705e-05, + "loss": 0.0606, + "num_input_tokens_seen": 37877216, + "step": 31115 + }, + { + "epoch": 3.465864795634258, + "grad_norm": 0.036411769688129425, + "learning_rate": 4.9186389103852755e-05, + "loss": 0.0761, + "num_input_tokens_seen": 37883328, + "step": 31120 + }, + { + "epoch": 3.466421650517875, + "grad_norm": 0.06033066287636757, + "learning_rate": 4.918577416642097e-05, + "loss": 0.0408, + "num_input_tokens_seen": 37889504, + "step": 31125 + }, + { + "epoch": 3.4669785054014923, + "grad_norm": 0.6623417139053345, + "learning_rate": 4.9185159000535175e-05, + "loss": 0.0844, + "num_input_tokens_seen": 37895552, + "step": 31130 + }, + { + "epoch": 3.4675353602851096, + "grad_norm": 0.04047625511884689, + "learning_rate": 4.9184543606201186e-05, + "loss": 0.0522, + "num_input_tokens_seen": 37901440, + "step": 31135 + }, + { + "epoch": 3.468092215168727, + "grad_norm": 0.1677514612674713, + "learning_rate": 4.918392798342479e-05, + "loss": 0.1091, + "num_input_tokens_seen": 37907584, + "step": 31140 + }, + { + "epoch": 3.4686490700523445, + "grad_norm": 0.04247434064745903, + "learning_rate": 4.918331213221183e-05, + "loss": 0.0506, + "num_input_tokens_seen": 37913792, + "step": 31145 + }, + { + "epoch": 3.4692059249359617, + "grad_norm": 0.7937276363372803, + "learning_rate": 4.9182696052568106e-05, + "loss": 0.0775, + "num_input_tokens_seen": 37919808, + "step": 31150 + }, + { + "epoch": 3.469762779819579, + "grad_norm": 0.8057924509048462, + "learning_rate": 4.918207974449944e-05, + "loss": 0.1602, + "num_input_tokens_seen": 37925344, + "step": 31155 + }, + { + "epoch": 3.470319634703196, + "grad_norm": 0.020899474620819092, + "learning_rate": 4.918146320801166e-05, + "loss": 0.0374, + "num_input_tokens_seen": 37931232, + "step": 31160 + }, + { + "epoch": 3.470876489586814, + "grad_norm": 0.9506351351737976, + "learning_rate": 4.918084644311059e-05, + "loss": 0.1689, + "num_input_tokens_seen": 37936928, + "step": 31165 + }, + { + "epoch": 3.471433344470431, + "grad_norm": 0.16388161480426788, + "learning_rate": 4.9180229449802054e-05, + "loss": 0.0208, + "num_input_tokens_seen": 37942496, + "step": 31170 + }, + { + "epoch": 3.4719901993540483, + "grad_norm": 0.032766085118055344, + "learning_rate": 4.917961222809186e-05, + "loss": 0.0459, + "num_input_tokens_seen": 37948576, + "step": 31175 + }, + { + "epoch": 3.4725470542376655, + "grad_norm": 1.4795910120010376, + "learning_rate": 4.917899477798588e-05, + "loss": 0.0507, + "num_input_tokens_seen": 37954816, + "step": 31180 + }, + { + "epoch": 3.473103909121283, + "grad_norm": 0.2558412551879883, + "learning_rate": 4.917837709948991e-05, + "loss": 0.0628, + "num_input_tokens_seen": 37961088, + "step": 31185 + }, + { + "epoch": 3.4736607640049004, + "grad_norm": 0.2782507836818695, + "learning_rate": 4.917775919260979e-05, + "loss": 0.0408, + "num_input_tokens_seen": 37967360, + "step": 31190 + }, + { + "epoch": 3.4742176188885177, + "grad_norm": 0.5540297627449036, + "learning_rate": 4.917714105735138e-05, + "loss": 0.0493, + "num_input_tokens_seen": 37973344, + "step": 31195 + }, + { + "epoch": 3.474774473772135, + "grad_norm": 3.10626220703125, + "learning_rate": 4.91765226937205e-05, + "loss": 0.1356, + "num_input_tokens_seen": 37979328, + "step": 31200 + }, + { + "epoch": 3.475331328655752, + "grad_norm": 1.60918128490448, + "learning_rate": 4.917590410172298e-05, + "loss": 0.0777, + "num_input_tokens_seen": 37985568, + "step": 31205 + }, + { + "epoch": 3.47588818353937, + "grad_norm": 0.8845716714859009, + "learning_rate": 4.917528528136468e-05, + "loss": 0.1282, + "num_input_tokens_seen": 37991584, + "step": 31210 + }, + { + "epoch": 3.476445038422987, + "grad_norm": 0.4561998248100281, + "learning_rate": 4.9174666232651445e-05, + "loss": 0.0152, + "num_input_tokens_seen": 37998016, + "step": 31215 + }, + { + "epoch": 3.4770018933066043, + "grad_norm": 0.6229650974273682, + "learning_rate": 4.917404695558912e-05, + "loss": 0.0478, + "num_input_tokens_seen": 38004032, + "step": 31220 + }, + { + "epoch": 3.4775587481902215, + "grad_norm": 0.005740882828831673, + "learning_rate": 4.917342745018356e-05, + "loss": 0.0657, + "num_input_tokens_seen": 38010336, + "step": 31225 + }, + { + "epoch": 3.4781156030738387, + "grad_norm": 0.02605867199599743, + "learning_rate": 4.91728077164406e-05, + "loss": 0.0715, + "num_input_tokens_seen": 38016800, + "step": 31230 + }, + { + "epoch": 3.4786724579574564, + "grad_norm": 0.03297775611281395, + "learning_rate": 4.917218775436611e-05, + "loss": 0.0723, + "num_input_tokens_seen": 38022880, + "step": 31235 + }, + { + "epoch": 3.4792293128410736, + "grad_norm": 0.09125450998544693, + "learning_rate": 4.917156756396594e-05, + "loss": 0.0697, + "num_input_tokens_seen": 38028992, + "step": 31240 + }, + { + "epoch": 3.479786167724691, + "grad_norm": 0.13849468529224396, + "learning_rate": 4.917094714524594e-05, + "loss": 0.0283, + "num_input_tokens_seen": 38035296, + "step": 31245 + }, + { + "epoch": 3.4803430226083085, + "grad_norm": 0.8142822980880737, + "learning_rate": 4.917032649821198e-05, + "loss": 0.0511, + "num_input_tokens_seen": 38041504, + "step": 31250 + }, + { + "epoch": 3.4808998774919258, + "grad_norm": 0.09087592363357544, + "learning_rate": 4.916970562286993e-05, + "loss": 0.0056, + "num_input_tokens_seen": 38047680, + "step": 31255 + }, + { + "epoch": 3.481456732375543, + "grad_norm": 0.7287088632583618, + "learning_rate": 4.916908451922564e-05, + "loss": 0.0219, + "num_input_tokens_seen": 38053600, + "step": 31260 + }, + { + "epoch": 3.48201358725916, + "grad_norm": 0.11567942053079605, + "learning_rate": 4.916846318728498e-05, + "loss": 0.048, + "num_input_tokens_seen": 38059616, + "step": 31265 + }, + { + "epoch": 3.4825704421427774, + "grad_norm": 0.19327756762504578, + "learning_rate": 4.9167841627053835e-05, + "loss": 0.0169, + "num_input_tokens_seen": 38065888, + "step": 31270 + }, + { + "epoch": 3.483127297026395, + "grad_norm": 0.0026423775125294924, + "learning_rate": 4.916721983853805e-05, + "loss": 0.0505, + "num_input_tokens_seen": 38072256, + "step": 31275 + }, + { + "epoch": 3.4836841519100123, + "grad_norm": 0.40598276257514954, + "learning_rate": 4.916659782174352e-05, + "loss": 0.0592, + "num_input_tokens_seen": 38078528, + "step": 31280 + }, + { + "epoch": 3.4842410067936296, + "grad_norm": 0.873897135257721, + "learning_rate": 4.9165975576676105e-05, + "loss": 0.1589, + "num_input_tokens_seen": 38084928, + "step": 31285 + }, + { + "epoch": 3.484797861677247, + "grad_norm": 0.5306804180145264, + "learning_rate": 4.916535310334169e-05, + "loss": 0.0728, + "num_input_tokens_seen": 38091072, + "step": 31290 + }, + { + "epoch": 3.485354716560864, + "grad_norm": 1.5090214014053345, + "learning_rate": 4.916473040174616e-05, + "loss": 0.1162, + "num_input_tokens_seen": 38097216, + "step": 31295 + }, + { + "epoch": 3.4859115714444817, + "grad_norm": 0.27112066745758057, + "learning_rate": 4.916410747189538e-05, + "loss": 0.0685, + "num_input_tokens_seen": 38103232, + "step": 31300 + }, + { + "epoch": 3.486468426328099, + "grad_norm": 0.30867069959640503, + "learning_rate": 4.9163484313795255e-05, + "loss": 0.0284, + "num_input_tokens_seen": 38109312, + "step": 31305 + }, + { + "epoch": 3.487025281211716, + "grad_norm": 0.4832819998264313, + "learning_rate": 4.916286092745166e-05, + "loss": 0.0338, + "num_input_tokens_seen": 38115200, + "step": 31310 + }, + { + "epoch": 3.4875821360953334, + "grad_norm": 0.06418111175298691, + "learning_rate": 4.916223731287048e-05, + "loss": 0.0285, + "num_input_tokens_seen": 38121440, + "step": 31315 + }, + { + "epoch": 3.4881389909789506, + "grad_norm": 0.32637664675712585, + "learning_rate": 4.916161347005761e-05, + "loss": 0.0535, + "num_input_tokens_seen": 38127648, + "step": 31320 + }, + { + "epoch": 3.4886958458625683, + "grad_norm": 1.905711054801941, + "learning_rate": 4.916098939901895e-05, + "loss": 0.1499, + "num_input_tokens_seen": 38133696, + "step": 31325 + }, + { + "epoch": 3.4892527007461855, + "grad_norm": 0.3568711280822754, + "learning_rate": 4.916036509976038e-05, + "loss": 0.0798, + "num_input_tokens_seen": 38140032, + "step": 31330 + }, + { + "epoch": 3.4898095556298028, + "grad_norm": 1.019662618637085, + "learning_rate": 4.91597405722878e-05, + "loss": 0.1121, + "num_input_tokens_seen": 38145792, + "step": 31335 + }, + { + "epoch": 3.4903664105134204, + "grad_norm": 1.8249467611312866, + "learning_rate": 4.915911581660713e-05, + "loss": 0.059, + "num_input_tokens_seen": 38151936, + "step": 31340 + }, + { + "epoch": 3.4909232653970377, + "grad_norm": 0.9514544606208801, + "learning_rate": 4.915849083272425e-05, + "loss": 0.1165, + "num_input_tokens_seen": 38158080, + "step": 31345 + }, + { + "epoch": 3.491480120280655, + "grad_norm": 0.6172600984573364, + "learning_rate": 4.915786562064506e-05, + "loss": 0.0278, + "num_input_tokens_seen": 38164224, + "step": 31350 + }, + { + "epoch": 3.492036975164272, + "grad_norm": 0.0012451228685677052, + "learning_rate": 4.915724018037548e-05, + "loss": 0.0363, + "num_input_tokens_seen": 38170400, + "step": 31355 + }, + { + "epoch": 3.4925938300478894, + "grad_norm": 0.30871397256851196, + "learning_rate": 4.9156614511921405e-05, + "loss": 0.0134, + "num_input_tokens_seen": 38176480, + "step": 31360 + }, + { + "epoch": 3.493150684931507, + "grad_norm": 1.0752792358398438, + "learning_rate": 4.915598861528876e-05, + "loss": 0.1238, + "num_input_tokens_seen": 38182592, + "step": 31365 + }, + { + "epoch": 3.4937075398151243, + "grad_norm": 2.1113171577453613, + "learning_rate": 4.915536249048345e-05, + "loss": 0.1548, + "num_input_tokens_seen": 38188864, + "step": 31370 + }, + { + "epoch": 3.4942643946987415, + "grad_norm": 0.38782763481140137, + "learning_rate": 4.915473613751138e-05, + "loss": 0.0405, + "num_input_tokens_seen": 38194976, + "step": 31375 + }, + { + "epoch": 3.4948212495823587, + "grad_norm": 0.3995746374130249, + "learning_rate": 4.9154109556378486e-05, + "loss": 0.0115, + "num_input_tokens_seen": 38201024, + "step": 31380 + }, + { + "epoch": 3.495378104465976, + "grad_norm": 0.008897259831428528, + "learning_rate": 4.915348274709067e-05, + "loss": 0.0079, + "num_input_tokens_seen": 38206496, + "step": 31385 + }, + { + "epoch": 3.4959349593495936, + "grad_norm": 0.6942322254180908, + "learning_rate": 4.915285570965386e-05, + "loss": 0.0879, + "num_input_tokens_seen": 38212480, + "step": 31390 + }, + { + "epoch": 3.496491814233211, + "grad_norm": 1.1345064640045166, + "learning_rate": 4.9152228444073973e-05, + "loss": 0.1154, + "num_input_tokens_seen": 38218624, + "step": 31395 + }, + { + "epoch": 3.497048669116828, + "grad_norm": 0.776215136051178, + "learning_rate": 4.915160095035693e-05, + "loss": 0.1151, + "num_input_tokens_seen": 38224736, + "step": 31400 + }, + { + "epoch": 3.4976055240004453, + "grad_norm": 0.24090898036956787, + "learning_rate": 4.915097322850868e-05, + "loss": 0.1585, + "num_input_tokens_seen": 38230784, + "step": 31405 + }, + { + "epoch": 3.498162378884063, + "grad_norm": 0.04504408314824104, + "learning_rate": 4.9150345278535135e-05, + "loss": 0.1319, + "num_input_tokens_seen": 38236672, + "step": 31410 + }, + { + "epoch": 3.4987192337676802, + "grad_norm": 0.9474640488624573, + "learning_rate": 4.9149717100442225e-05, + "loss": 0.0949, + "num_input_tokens_seen": 38242912, + "step": 31415 + }, + { + "epoch": 3.4992760886512975, + "grad_norm": 0.06495624035596848, + "learning_rate": 4.91490886942359e-05, + "loss": 0.0501, + "num_input_tokens_seen": 38249024, + "step": 31420 + }, + { + "epoch": 3.4998329435349147, + "grad_norm": 0.544784665107727, + "learning_rate": 4.9148460059922075e-05, + "loss": 0.0379, + "num_input_tokens_seen": 38255136, + "step": 31425 + }, + { + "epoch": 3.5003897984185324, + "grad_norm": 0.02094339020550251, + "learning_rate": 4.91478311975067e-05, + "loss": 0.1496, + "num_input_tokens_seen": 38261568, + "step": 31430 + }, + { + "epoch": 3.5009466533021496, + "grad_norm": 1.1687231063842773, + "learning_rate": 4.914720210699571e-05, + "loss": 0.0462, + "num_input_tokens_seen": 38267648, + "step": 31435 + }, + { + "epoch": 3.501503508185767, + "grad_norm": 0.06700296700000763, + "learning_rate": 4.914657278839505e-05, + "loss": 0.0359, + "num_input_tokens_seen": 38273504, + "step": 31440 + }, + { + "epoch": 3.502060363069384, + "grad_norm": 0.20891660451889038, + "learning_rate": 4.914594324171067e-05, + "loss": 0.0664, + "num_input_tokens_seen": 38279776, + "step": 31445 + }, + { + "epoch": 3.5026172179530013, + "grad_norm": 0.008049978874623775, + "learning_rate": 4.914531346694851e-05, + "loss": 0.0707, + "num_input_tokens_seen": 38286080, + "step": 31450 + }, + { + "epoch": 3.503174072836619, + "grad_norm": 1.0091928243637085, + "learning_rate": 4.91446834641145e-05, + "loss": 0.0536, + "num_input_tokens_seen": 38292384, + "step": 31455 + }, + { + "epoch": 3.503730927720236, + "grad_norm": 0.0016345484182238579, + "learning_rate": 4.914405323321463e-05, + "loss": 0.094, + "num_input_tokens_seen": 38298560, + "step": 31460 + }, + { + "epoch": 3.5042877826038534, + "grad_norm": 0.4581582546234131, + "learning_rate": 4.9143422774254834e-05, + "loss": 0.0235, + "num_input_tokens_seen": 38304864, + "step": 31465 + }, + { + "epoch": 3.5048446374874707, + "grad_norm": 0.04682718217372894, + "learning_rate": 4.9142792087241064e-05, + "loss": 0.0246, + "num_input_tokens_seen": 38310816, + "step": 31470 + }, + { + "epoch": 3.505401492371088, + "grad_norm": 0.6034243106842041, + "learning_rate": 4.914216117217927e-05, + "loss": 0.0701, + "num_input_tokens_seen": 38316992, + "step": 31475 + }, + { + "epoch": 3.5059583472547056, + "grad_norm": 1.6802265644073486, + "learning_rate": 4.9141530029075435e-05, + "loss": 0.0528, + "num_input_tokens_seen": 38322976, + "step": 31480 + }, + { + "epoch": 3.506515202138323, + "grad_norm": 0.5373810529708862, + "learning_rate": 4.9140898657935495e-05, + "loss": 0.0648, + "num_input_tokens_seen": 38329248, + "step": 31485 + }, + { + "epoch": 3.50707205702194, + "grad_norm": 0.5829864144325256, + "learning_rate": 4.9140267058765436e-05, + "loss": 0.1419, + "num_input_tokens_seen": 38335232, + "step": 31490 + }, + { + "epoch": 3.5076289119055573, + "grad_norm": 0.3219808340072632, + "learning_rate": 4.913963523157121e-05, + "loss": 0.0143, + "num_input_tokens_seen": 38341376, + "step": 31495 + }, + { + "epoch": 3.5081857667891745, + "grad_norm": 0.5056755542755127, + "learning_rate": 4.9139003176358785e-05, + "loss": 0.2101, + "num_input_tokens_seen": 38347648, + "step": 31500 + }, + { + "epoch": 3.508742621672792, + "grad_norm": 0.4817626178264618, + "learning_rate": 4.913837089313414e-05, + "loss": 0.0856, + "num_input_tokens_seen": 38353344, + "step": 31505 + }, + { + "epoch": 3.5092994765564094, + "grad_norm": 1.8452825546264648, + "learning_rate": 4.913773838190324e-05, + "loss": 0.2128, + "num_input_tokens_seen": 38359488, + "step": 31510 + }, + { + "epoch": 3.5098563314400266, + "grad_norm": 1.051047682762146, + "learning_rate": 4.913710564267207e-05, + "loss": 0.0756, + "num_input_tokens_seen": 38365600, + "step": 31515 + }, + { + "epoch": 3.5104131863236443, + "grad_norm": 0.684781014919281, + "learning_rate": 4.9136472675446586e-05, + "loss": 0.083, + "num_input_tokens_seen": 38371552, + "step": 31520 + }, + { + "epoch": 3.5109700412072615, + "grad_norm": 0.14320634305477142, + "learning_rate": 4.913583948023278e-05, + "loss": 0.0785, + "num_input_tokens_seen": 38377632, + "step": 31525 + }, + { + "epoch": 3.5115268960908788, + "grad_norm": 0.04995099827647209, + "learning_rate": 4.9135206057036644e-05, + "loss": 0.0467, + "num_input_tokens_seen": 38383456, + "step": 31530 + }, + { + "epoch": 3.512083750974496, + "grad_norm": 0.14152854681015015, + "learning_rate": 4.913457240586414e-05, + "loss": 0.0479, + "num_input_tokens_seen": 38389280, + "step": 31535 + }, + { + "epoch": 3.512640605858113, + "grad_norm": 0.5457988381385803, + "learning_rate": 4.913393852672127e-05, + "loss": 0.0734, + "num_input_tokens_seen": 38395328, + "step": 31540 + }, + { + "epoch": 3.513197460741731, + "grad_norm": 0.016223059967160225, + "learning_rate": 4.9133304419614014e-05, + "loss": 0.035, + "num_input_tokens_seen": 38401504, + "step": 31545 + }, + { + "epoch": 3.513754315625348, + "grad_norm": 0.37107065320014954, + "learning_rate": 4.913267008454836e-05, + "loss": 0.1071, + "num_input_tokens_seen": 38407328, + "step": 31550 + }, + { + "epoch": 3.5143111705089654, + "grad_norm": 0.27627283334732056, + "learning_rate": 4.913203552153031e-05, + "loss": 0.0289, + "num_input_tokens_seen": 38413408, + "step": 31555 + }, + { + "epoch": 3.5148680253925826, + "grad_norm": 0.029537398368120193, + "learning_rate": 4.913140073056584e-05, + "loss": 0.0316, + "num_input_tokens_seen": 38419584, + "step": 31560 + }, + { + "epoch": 3.5154248802762, + "grad_norm": 0.6164605021476746, + "learning_rate": 4.913076571166095e-05, + "loss": 0.1178, + "num_input_tokens_seen": 38425088, + "step": 31565 + }, + { + "epoch": 3.5159817351598175, + "grad_norm": 0.39787569642066956, + "learning_rate": 4.9130130464821664e-05, + "loss": 0.0248, + "num_input_tokens_seen": 38430848, + "step": 31570 + }, + { + "epoch": 3.5165385900434347, + "grad_norm": 0.6543484926223755, + "learning_rate": 4.912949499005395e-05, + "loss": 0.1256, + "num_input_tokens_seen": 38436960, + "step": 31575 + }, + { + "epoch": 3.517095444927052, + "grad_norm": 0.4264598488807678, + "learning_rate": 4.9128859287363826e-05, + "loss": 0.0964, + "num_input_tokens_seen": 38443008, + "step": 31580 + }, + { + "epoch": 3.517652299810669, + "grad_norm": 0.7242000699043274, + "learning_rate": 4.91282233567573e-05, + "loss": 0.046, + "num_input_tokens_seen": 38449184, + "step": 31585 + }, + { + "epoch": 3.5182091546942864, + "grad_norm": 2.1289734840393066, + "learning_rate": 4.912758719824037e-05, + "loss": 0.187, + "num_input_tokens_seen": 38455680, + "step": 31590 + }, + { + "epoch": 3.518766009577904, + "grad_norm": 0.515237033367157, + "learning_rate": 4.912695081181904e-05, + "loss": 0.0469, + "num_input_tokens_seen": 38461536, + "step": 31595 + }, + { + "epoch": 3.5193228644615213, + "grad_norm": 2.4583959579467773, + "learning_rate": 4.9126314197499334e-05, + "loss": 0.1423, + "num_input_tokens_seen": 38467616, + "step": 31600 + }, + { + "epoch": 3.5198797193451385, + "grad_norm": 0.09207851439714432, + "learning_rate": 4.912567735528727e-05, + "loss": 0.1302, + "num_input_tokens_seen": 38473568, + "step": 31605 + }, + { + "epoch": 3.520436574228756, + "grad_norm": 0.7329589128494263, + "learning_rate": 4.912504028518884e-05, + "loss": 0.1758, + "num_input_tokens_seen": 38479232, + "step": 31610 + }, + { + "epoch": 3.5209934291123735, + "grad_norm": 0.142819344997406, + "learning_rate": 4.912440298721008e-05, + "loss": 0.1156, + "num_input_tokens_seen": 38485216, + "step": 31615 + }, + { + "epoch": 3.5215502839959907, + "grad_norm": 0.5758790373802185, + "learning_rate": 4.9123765461357016e-05, + "loss": 0.0302, + "num_input_tokens_seen": 38491584, + "step": 31620 + }, + { + "epoch": 3.522107138879608, + "grad_norm": 0.0032700528390705585, + "learning_rate": 4.9123127707635656e-05, + "loss": 0.1256, + "num_input_tokens_seen": 38497632, + "step": 31625 + }, + { + "epoch": 3.522663993763225, + "grad_norm": 0.4950457811355591, + "learning_rate": 4.9122489726052023e-05, + "loss": 0.1139, + "num_input_tokens_seen": 38503936, + "step": 31630 + }, + { + "epoch": 3.523220848646843, + "grad_norm": 0.7391352653503418, + "learning_rate": 4.912185151661215e-05, + "loss": 0.082, + "num_input_tokens_seen": 38510240, + "step": 31635 + }, + { + "epoch": 3.52377770353046, + "grad_norm": 0.017406610772013664, + "learning_rate": 4.9121213079322056e-05, + "loss": 0.0291, + "num_input_tokens_seen": 38516288, + "step": 31640 + }, + { + "epoch": 3.5243345584140773, + "grad_norm": 0.19549672305583954, + "learning_rate": 4.912057441418779e-05, + "loss": 0.1189, + "num_input_tokens_seen": 38522240, + "step": 31645 + }, + { + "epoch": 3.5248914132976945, + "grad_norm": 0.4191148281097412, + "learning_rate": 4.911993552121537e-05, + "loss": 0.0488, + "num_input_tokens_seen": 38528352, + "step": 31650 + }, + { + "epoch": 3.5254482681813117, + "grad_norm": 0.08223851025104523, + "learning_rate": 4.911929640041083e-05, + "loss": 0.04, + "num_input_tokens_seen": 38534784, + "step": 31655 + }, + { + "epoch": 3.5260051230649294, + "grad_norm": 1.782952070236206, + "learning_rate": 4.911865705178021e-05, + "loss": 0.0822, + "num_input_tokens_seen": 38541088, + "step": 31660 + }, + { + "epoch": 3.5265619779485466, + "grad_norm": 0.010058107785880566, + "learning_rate": 4.911801747532956e-05, + "loss": 0.0116, + "num_input_tokens_seen": 38547328, + "step": 31665 + }, + { + "epoch": 3.527118832832164, + "grad_norm": 0.9015686511993408, + "learning_rate": 4.9117377671064904e-05, + "loss": 0.0956, + "num_input_tokens_seen": 38553280, + "step": 31670 + }, + { + "epoch": 3.527675687715781, + "grad_norm": 0.7322624325752258, + "learning_rate": 4.9116737638992295e-05, + "loss": 0.1508, + "num_input_tokens_seen": 38559424, + "step": 31675 + }, + { + "epoch": 3.5282325425993983, + "grad_norm": 0.0007715141400694847, + "learning_rate": 4.911609737911778e-05, + "loss": 0.0613, + "num_input_tokens_seen": 38565408, + "step": 31680 + }, + { + "epoch": 3.528789397483016, + "grad_norm": 2.824812173843384, + "learning_rate": 4.91154568914474e-05, + "loss": 0.0519, + "num_input_tokens_seen": 38571840, + "step": 31685 + }, + { + "epoch": 3.5293462523666332, + "grad_norm": 0.4865117073059082, + "learning_rate": 4.911481617598721e-05, + "loss": 0.1135, + "num_input_tokens_seen": 38578048, + "step": 31690 + }, + { + "epoch": 3.5299031072502505, + "grad_norm": 0.068705715239048, + "learning_rate": 4.9114175232743264e-05, + "loss": 0.0335, + "num_input_tokens_seen": 38584192, + "step": 31695 + }, + { + "epoch": 3.530459962133868, + "grad_norm": 1.3271374702453613, + "learning_rate": 4.911353406172161e-05, + "loss": 0.1189, + "num_input_tokens_seen": 38590272, + "step": 31700 + }, + { + "epoch": 3.5310168170174854, + "grad_norm": 1.0061149597167969, + "learning_rate": 4.911289266292831e-05, + "loss": 0.1121, + "num_input_tokens_seen": 38596064, + "step": 31705 + }, + { + "epoch": 3.5315736719011026, + "grad_norm": 0.020487403497099876, + "learning_rate": 4.911225103636942e-05, + "loss": 0.0507, + "num_input_tokens_seen": 38602048, + "step": 31710 + }, + { + "epoch": 3.53213052678472, + "grad_norm": 0.01716918684542179, + "learning_rate": 4.911160918205099e-05, + "loss": 0.0413, + "num_input_tokens_seen": 38607936, + "step": 31715 + }, + { + "epoch": 3.532687381668337, + "grad_norm": 0.007314016111195087, + "learning_rate": 4.911096709997911e-05, + "loss": 0.032, + "num_input_tokens_seen": 38613984, + "step": 31720 + }, + { + "epoch": 3.5332442365519547, + "grad_norm": 0.4626096189022064, + "learning_rate": 4.9110324790159817e-05, + "loss": 0.0775, + "num_input_tokens_seen": 38620096, + "step": 31725 + }, + { + "epoch": 3.533801091435572, + "grad_norm": 0.0009851327631622553, + "learning_rate": 4.910968225259919e-05, + "loss": 0.035, + "num_input_tokens_seen": 38626368, + "step": 31730 + }, + { + "epoch": 3.534357946319189, + "grad_norm": 0.24550969898700714, + "learning_rate": 4.91090394873033e-05, + "loss": 0.0724, + "num_input_tokens_seen": 38632224, + "step": 31735 + }, + { + "epoch": 3.5349148012028064, + "grad_norm": 1.2542527914047241, + "learning_rate": 4.910839649427822e-05, + "loss": 0.0936, + "num_input_tokens_seen": 38638112, + "step": 31740 + }, + { + "epoch": 3.5354716560864237, + "grad_norm": 0.4704251289367676, + "learning_rate": 4.910775327353001e-05, + "loss": 0.0684, + "num_input_tokens_seen": 38644384, + "step": 31745 + }, + { + "epoch": 3.5360285109700413, + "grad_norm": 0.9813368320465088, + "learning_rate": 4.910710982506477e-05, + "loss": 0.0828, + "num_input_tokens_seen": 38650528, + "step": 31750 + }, + { + "epoch": 3.5365853658536586, + "grad_norm": 0.11018648743629456, + "learning_rate": 4.910646614888855e-05, + "loss": 0.0475, + "num_input_tokens_seen": 38656512, + "step": 31755 + }, + { + "epoch": 3.537142220737276, + "grad_norm": 0.0004925660323351622, + "learning_rate": 4.910582224500745e-05, + "loss": 0.1236, + "num_input_tokens_seen": 38662720, + "step": 31760 + }, + { + "epoch": 3.537699075620893, + "grad_norm": 1.2769696712493896, + "learning_rate": 4.910517811342754e-05, + "loss": 0.094, + "num_input_tokens_seen": 38668896, + "step": 31765 + }, + { + "epoch": 3.5382559305045103, + "grad_norm": 1.924939751625061, + "learning_rate": 4.910453375415492e-05, + "loss": 0.0547, + "num_input_tokens_seen": 38675168, + "step": 31770 + }, + { + "epoch": 3.538812785388128, + "grad_norm": 1.0518724918365479, + "learning_rate": 4.910388916719566e-05, + "loss": 0.0836, + "num_input_tokens_seen": 38681568, + "step": 31775 + }, + { + "epoch": 3.539369640271745, + "grad_norm": 0.5227946639060974, + "learning_rate": 4.9103244352555856e-05, + "loss": 0.2001, + "num_input_tokens_seen": 38687936, + "step": 31780 + }, + { + "epoch": 3.5399264951553624, + "grad_norm": 1.6993954181671143, + "learning_rate": 4.9102599310241596e-05, + "loss": 0.1143, + "num_input_tokens_seen": 38694176, + "step": 31785 + }, + { + "epoch": 3.54048335003898, + "grad_norm": 0.8238567113876343, + "learning_rate": 4.910195404025898e-05, + "loss": 0.1504, + "num_input_tokens_seen": 38700224, + "step": 31790 + }, + { + "epoch": 3.5410402049225973, + "grad_norm": 0.11602012068033218, + "learning_rate": 4.910130854261409e-05, + "loss": 0.094, + "num_input_tokens_seen": 38706464, + "step": 31795 + }, + { + "epoch": 3.5415970598062145, + "grad_norm": 0.041192296892404556, + "learning_rate": 4.910066281731304e-05, + "loss": 0.1049, + "num_input_tokens_seen": 38712032, + "step": 31800 + }, + { + "epoch": 3.5421539146898318, + "grad_norm": 0.0019369623623788357, + "learning_rate": 4.910001686436191e-05, + "loss": 0.0149, + "num_input_tokens_seen": 38718336, + "step": 31805 + }, + { + "epoch": 3.542710769573449, + "grad_norm": 0.6638302803039551, + "learning_rate": 4.909937068376682e-05, + "loss": 0.1586, + "num_input_tokens_seen": 38724288, + "step": 31810 + }, + { + "epoch": 3.5432676244570667, + "grad_norm": 0.0640367865562439, + "learning_rate": 4.9098724275533865e-05, + "loss": 0.0078, + "num_input_tokens_seen": 38730304, + "step": 31815 + }, + { + "epoch": 3.543824479340684, + "grad_norm": 0.0051776147447526455, + "learning_rate": 4.909807763966915e-05, + "loss": 0.0528, + "num_input_tokens_seen": 38736512, + "step": 31820 + }, + { + "epoch": 3.544381334224301, + "grad_norm": 1.148022174835205, + "learning_rate": 4.909743077617879e-05, + "loss": 0.208, + "num_input_tokens_seen": 38742560, + "step": 31825 + }, + { + "epoch": 3.5449381891079184, + "grad_norm": 1.089622974395752, + "learning_rate": 4.909678368506888e-05, + "loss": 0.1, + "num_input_tokens_seen": 38747904, + "step": 31830 + }, + { + "epoch": 3.5454950439915356, + "grad_norm": 1.1809301376342773, + "learning_rate": 4.909613636634555e-05, + "loss": 0.0648, + "num_input_tokens_seen": 38754048, + "step": 31835 + }, + { + "epoch": 3.5460518988751533, + "grad_norm": 0.13772962987422943, + "learning_rate": 4.90954888200149e-05, + "loss": 0.1194, + "num_input_tokens_seen": 38760128, + "step": 31840 + }, + { + "epoch": 3.5466087537587705, + "grad_norm": 0.6823747158050537, + "learning_rate": 4.909484104608306e-05, + "loss": 0.0322, + "num_input_tokens_seen": 38766496, + "step": 31845 + }, + { + "epoch": 3.5471656086423877, + "grad_norm": 0.008191798813641071, + "learning_rate": 4.909419304455614e-05, + "loss": 0.1441, + "num_input_tokens_seen": 38772352, + "step": 31850 + }, + { + "epoch": 3.5477224635260054, + "grad_norm": 0.07429228723049164, + "learning_rate": 4.9093544815440265e-05, + "loss": 0.0705, + "num_input_tokens_seen": 38778112, + "step": 31855 + }, + { + "epoch": 3.548279318409622, + "grad_norm": 0.7093315720558167, + "learning_rate": 4.909289635874155e-05, + "loss": 0.0426, + "num_input_tokens_seen": 38783520, + "step": 31860 + }, + { + "epoch": 3.54883617329324, + "grad_norm": 0.12518513202667236, + "learning_rate": 4.9092247674466125e-05, + "loss": 0.092, + "num_input_tokens_seen": 38789728, + "step": 31865 + }, + { + "epoch": 3.549393028176857, + "grad_norm": 0.4223494827747345, + "learning_rate": 4.9091598762620114e-05, + "loss": 0.1082, + "num_input_tokens_seen": 38795840, + "step": 31870 + }, + { + "epoch": 3.5499498830604743, + "grad_norm": 0.07914884388446808, + "learning_rate": 4.909094962320966e-05, + "loss": 0.0214, + "num_input_tokens_seen": 38802048, + "step": 31875 + }, + { + "epoch": 3.550506737944092, + "grad_norm": 0.10776752233505249, + "learning_rate": 4.909030025624089e-05, + "loss": 0.0655, + "num_input_tokens_seen": 38808352, + "step": 31880 + }, + { + "epoch": 3.5510635928277092, + "grad_norm": 0.008659894578158855, + "learning_rate": 4.908965066171993e-05, + "loss": 0.0992, + "num_input_tokens_seen": 38814688, + "step": 31885 + }, + { + "epoch": 3.5516204477113265, + "grad_norm": 0.7977935075759888, + "learning_rate": 4.908900083965291e-05, + "loss": 0.0395, + "num_input_tokens_seen": 38821088, + "step": 31890 + }, + { + "epoch": 3.5521773025949437, + "grad_norm": 0.20932817459106445, + "learning_rate": 4.908835079004599e-05, + "loss": 0.0658, + "num_input_tokens_seen": 38826912, + "step": 31895 + }, + { + "epoch": 3.552734157478561, + "grad_norm": 0.8311988115310669, + "learning_rate": 4.908770051290529e-05, + "loss": 0.107, + "num_input_tokens_seen": 38832352, + "step": 31900 + }, + { + "epoch": 3.5532910123621786, + "grad_norm": 0.8177805542945862, + "learning_rate": 4.908705000823696e-05, + "loss": 0.0742, + "num_input_tokens_seen": 38837920, + "step": 31905 + }, + { + "epoch": 3.553847867245796, + "grad_norm": 0.02163434959948063, + "learning_rate": 4.9086399276047145e-05, + "loss": 0.0096, + "num_input_tokens_seen": 38844224, + "step": 31910 + }, + { + "epoch": 3.554404722129413, + "grad_norm": 2.0045127868652344, + "learning_rate": 4.908574831634199e-05, + "loss": 0.1756, + "num_input_tokens_seen": 38850272, + "step": 31915 + }, + { + "epoch": 3.5549615770130303, + "grad_norm": 0.5384777784347534, + "learning_rate": 4.9085097129127646e-05, + "loss": 0.1125, + "num_input_tokens_seen": 38856384, + "step": 31920 + }, + { + "epoch": 3.5555184318966475, + "grad_norm": 0.13417133688926697, + "learning_rate": 4.9084445714410265e-05, + "loss": 0.1159, + "num_input_tokens_seen": 38862400, + "step": 31925 + }, + { + "epoch": 3.556075286780265, + "grad_norm": 0.09673865884542465, + "learning_rate": 4.9083794072195996e-05, + "loss": 0.1322, + "num_input_tokens_seen": 38868096, + "step": 31930 + }, + { + "epoch": 3.5566321416638824, + "grad_norm": 0.6749112606048584, + "learning_rate": 4.9083142202491e-05, + "loss": 0.0871, + "num_input_tokens_seen": 38873824, + "step": 31935 + }, + { + "epoch": 3.5571889965474996, + "grad_norm": 0.9986381530761719, + "learning_rate": 4.9082490105301424e-05, + "loss": 0.1395, + "num_input_tokens_seen": 38879904, + "step": 31940 + }, + { + "epoch": 3.5577458514311173, + "grad_norm": 0.008203547447919846, + "learning_rate": 4.908183778063344e-05, + "loss": 0.0457, + "num_input_tokens_seen": 38886272, + "step": 31945 + }, + { + "epoch": 3.558302706314734, + "grad_norm": 1.5673350095748901, + "learning_rate": 4.90811852284932e-05, + "loss": 0.1155, + "num_input_tokens_seen": 38892096, + "step": 31950 + }, + { + "epoch": 3.558859561198352, + "grad_norm": 0.31954431533813477, + "learning_rate": 4.908053244888687e-05, + "loss": 0.0134, + "num_input_tokens_seen": 38898592, + "step": 31955 + }, + { + "epoch": 3.559416416081969, + "grad_norm": 0.42733731865882874, + "learning_rate": 4.9079879441820625e-05, + "loss": 0.0856, + "num_input_tokens_seen": 38904992, + "step": 31960 + }, + { + "epoch": 3.5599732709655862, + "grad_norm": 0.6621891856193542, + "learning_rate": 4.907922620730062e-05, + "loss": 0.0405, + "num_input_tokens_seen": 38911168, + "step": 31965 + }, + { + "epoch": 3.560530125849204, + "grad_norm": 0.13382385671138763, + "learning_rate": 4.907857274533304e-05, + "loss": 0.099, + "num_input_tokens_seen": 38917248, + "step": 31970 + }, + { + "epoch": 3.561086980732821, + "grad_norm": 1.1478803157806396, + "learning_rate": 4.907791905592404e-05, + "loss": 0.0469, + "num_input_tokens_seen": 38923296, + "step": 31975 + }, + { + "epoch": 3.5616438356164384, + "grad_norm": 1.0140866041183472, + "learning_rate": 4.907726513907981e-05, + "loss": 0.1478, + "num_input_tokens_seen": 38929728, + "step": 31980 + }, + { + "epoch": 3.5622006905000556, + "grad_norm": 0.0012961260508745909, + "learning_rate": 4.9076610994806516e-05, + "loss": 0.0412, + "num_input_tokens_seen": 38936192, + "step": 31985 + }, + { + "epoch": 3.562757545383673, + "grad_norm": 1.7356295585632324, + "learning_rate": 4.907595662311035e-05, + "loss": 0.1061, + "num_input_tokens_seen": 38941792, + "step": 31990 + }, + { + "epoch": 3.5633144002672905, + "grad_norm": 2.7550389766693115, + "learning_rate": 4.907530202399747e-05, + "loss": 0.1584, + "num_input_tokens_seen": 38947488, + "step": 31995 + }, + { + "epoch": 3.5638712551509077, + "grad_norm": 1.346107840538025, + "learning_rate": 4.907464719747409e-05, + "loss": 0.0985, + "num_input_tokens_seen": 38952512, + "step": 32000 + }, + { + "epoch": 3.564428110034525, + "grad_norm": 1.3669706583023071, + "learning_rate": 4.9073992143546365e-05, + "loss": 0.148, + "num_input_tokens_seen": 38958912, + "step": 32005 + }, + { + "epoch": 3.564984964918142, + "grad_norm": 0.004154587630182505, + "learning_rate": 4.90733368622205e-05, + "loss": 0.0763, + "num_input_tokens_seen": 38964960, + "step": 32010 + }, + { + "epoch": 3.5655418198017594, + "grad_norm": 0.8633262515068054, + "learning_rate": 4.907268135350268e-05, + "loss": 0.0618, + "num_input_tokens_seen": 38971168, + "step": 32015 + }, + { + "epoch": 3.566098674685377, + "grad_norm": 0.5892894268035889, + "learning_rate": 4.9072025617399105e-05, + "loss": 0.1206, + "num_input_tokens_seen": 38977472, + "step": 32020 + }, + { + "epoch": 3.5666555295689943, + "grad_norm": 0.958455502986908, + "learning_rate": 4.9071369653915955e-05, + "loss": 0.1693, + "num_input_tokens_seen": 38983424, + "step": 32025 + }, + { + "epoch": 3.5672123844526116, + "grad_norm": 0.20404663681983948, + "learning_rate": 4.9070713463059434e-05, + "loss": 0.1268, + "num_input_tokens_seen": 38989280, + "step": 32030 + }, + { + "epoch": 3.5677692393362292, + "grad_norm": 0.06279750913381577, + "learning_rate": 4.907005704483574e-05, + "loss": 0.0294, + "num_input_tokens_seen": 38995424, + "step": 32035 + }, + { + "epoch": 3.568326094219846, + "grad_norm": 0.813174843788147, + "learning_rate": 4.9069400399251075e-05, + "loss": 0.1246, + "num_input_tokens_seen": 39000896, + "step": 32040 + }, + { + "epoch": 3.5688829491034637, + "grad_norm": 0.027844486758112907, + "learning_rate": 4.906874352631164e-05, + "loss": 0.0808, + "num_input_tokens_seen": 39006912, + "step": 32045 + }, + { + "epoch": 3.569439803987081, + "grad_norm": 0.12262772023677826, + "learning_rate": 4.906808642602364e-05, + "loss": 0.1102, + "num_input_tokens_seen": 39012960, + "step": 32050 + }, + { + "epoch": 3.569996658870698, + "grad_norm": 1.3148471117019653, + "learning_rate": 4.906742909839327e-05, + "loss": 0.109, + "num_input_tokens_seen": 39019008, + "step": 32055 + }, + { + "epoch": 3.570553513754316, + "grad_norm": 0.16321797668933868, + "learning_rate": 4.906677154342676e-05, + "loss": 0.0346, + "num_input_tokens_seen": 39025184, + "step": 32060 + }, + { + "epoch": 3.571110368637933, + "grad_norm": 2.301035165786743, + "learning_rate": 4.9066113761130305e-05, + "loss": 0.0623, + "num_input_tokens_seen": 39031200, + "step": 32065 + }, + { + "epoch": 3.5716672235215503, + "grad_norm": 0.22523273527622223, + "learning_rate": 4.9065455751510125e-05, + "loss": 0.1252, + "num_input_tokens_seen": 39036448, + "step": 32070 + }, + { + "epoch": 3.5722240784051675, + "grad_norm": 0.10012457519769669, + "learning_rate": 4.906479751457244e-05, + "loss": 0.1627, + "num_input_tokens_seen": 39042560, + "step": 32075 + }, + { + "epoch": 3.5727809332887848, + "grad_norm": 0.2038896232843399, + "learning_rate": 4.906413905032346e-05, + "loss": 0.1406, + "num_input_tokens_seen": 39048480, + "step": 32080 + }, + { + "epoch": 3.5733377881724024, + "grad_norm": 0.8104190826416016, + "learning_rate": 4.90634803587694e-05, + "loss": 0.1267, + "num_input_tokens_seen": 39054848, + "step": 32085 + }, + { + "epoch": 3.5738946430560197, + "grad_norm": 0.3410761058330536, + "learning_rate": 4.906282143991649e-05, + "loss": 0.1365, + "num_input_tokens_seen": 39060832, + "step": 32090 + }, + { + "epoch": 3.574451497939637, + "grad_norm": 0.030150875449180603, + "learning_rate": 4.9062162293770964e-05, + "loss": 0.0579, + "num_input_tokens_seen": 39066944, + "step": 32095 + }, + { + "epoch": 3.575008352823254, + "grad_norm": 1.931702733039856, + "learning_rate": 4.9061502920339024e-05, + "loss": 0.1852, + "num_input_tokens_seen": 39073120, + "step": 32100 + }, + { + "epoch": 3.5755652077068714, + "grad_norm": 0.10672120749950409, + "learning_rate": 4.9060843319626914e-05, + "loss": 0.0739, + "num_input_tokens_seen": 39078688, + "step": 32105 + }, + { + "epoch": 3.576122062590489, + "grad_norm": 1.757364273071289, + "learning_rate": 4.906018349164086e-05, + "loss": 0.1258, + "num_input_tokens_seen": 39084960, + "step": 32110 + }, + { + "epoch": 3.5766789174741063, + "grad_norm": 0.33460405468940735, + "learning_rate": 4.905952343638711e-05, + "loss": 0.0533, + "num_input_tokens_seen": 39090880, + "step": 32115 + }, + { + "epoch": 3.5772357723577235, + "grad_norm": 0.5132150650024414, + "learning_rate": 4.905886315387187e-05, + "loss": 0.0623, + "num_input_tokens_seen": 39096960, + "step": 32120 + }, + { + "epoch": 3.577792627241341, + "grad_norm": 0.05837952718138695, + "learning_rate": 4.9058202644101406e-05, + "loss": 0.0419, + "num_input_tokens_seen": 39103296, + "step": 32125 + }, + { + "epoch": 3.578349482124958, + "grad_norm": 2.401567220687866, + "learning_rate": 4.9057541907081926e-05, + "loss": 0.1599, + "num_input_tokens_seen": 39109728, + "step": 32130 + }, + { + "epoch": 3.5789063370085756, + "grad_norm": 0.15021680295467377, + "learning_rate": 4.9056880942819695e-05, + "loss": 0.0509, + "num_input_tokens_seen": 39115840, + "step": 32135 + }, + { + "epoch": 3.579463191892193, + "grad_norm": 1.3029181957244873, + "learning_rate": 4.905621975132095e-05, + "loss": 0.0905, + "num_input_tokens_seen": 39121888, + "step": 32140 + }, + { + "epoch": 3.58002004677581, + "grad_norm": 0.3006789982318878, + "learning_rate": 4.9055558332591936e-05, + "loss": 0.2925, + "num_input_tokens_seen": 39128096, + "step": 32145 + }, + { + "epoch": 3.5805769016594278, + "grad_norm": 0.5442284345626831, + "learning_rate": 4.905489668663891e-05, + "loss": 0.0673, + "num_input_tokens_seen": 39134272, + "step": 32150 + }, + { + "epoch": 3.581133756543045, + "grad_norm": 0.6601938605308533, + "learning_rate": 4.9054234813468096e-05, + "loss": 0.0471, + "num_input_tokens_seen": 39140384, + "step": 32155 + }, + { + "epoch": 3.5816906114266622, + "grad_norm": 0.2400812953710556, + "learning_rate": 4.905357271308577e-05, + "loss": 0.0724, + "num_input_tokens_seen": 39146432, + "step": 32160 + }, + { + "epoch": 3.5822474663102795, + "grad_norm": 0.5477350354194641, + "learning_rate": 4.905291038549817e-05, + "loss": 0.0406, + "num_input_tokens_seen": 39152448, + "step": 32165 + }, + { + "epoch": 3.5828043211938967, + "grad_norm": 0.004083313047885895, + "learning_rate": 4.905224783071157e-05, + "loss": 0.047, + "num_input_tokens_seen": 39158528, + "step": 32170 + }, + { + "epoch": 3.5833611760775144, + "grad_norm": 0.8618637919425964, + "learning_rate": 4.905158504873223e-05, + "loss": 0.2205, + "num_input_tokens_seen": 39164032, + "step": 32175 + }, + { + "epoch": 3.5839180309611316, + "grad_norm": 0.4590235948562622, + "learning_rate": 4.905092203956638e-05, + "loss": 0.0191, + "num_input_tokens_seen": 39170112, + "step": 32180 + }, + { + "epoch": 3.584474885844749, + "grad_norm": 0.5841129422187805, + "learning_rate": 4.905025880322031e-05, + "loss": 0.1337, + "num_input_tokens_seen": 39176032, + "step": 32185 + }, + { + "epoch": 3.585031740728366, + "grad_norm": 1.085555911064148, + "learning_rate": 4.904959533970027e-05, + "loss": 0.1456, + "num_input_tokens_seen": 39181824, + "step": 32190 + }, + { + "epoch": 3.5855885956119833, + "grad_norm": 0.0006896163104102015, + "learning_rate": 4.9048931649012543e-05, + "loss": 0.0727, + "num_input_tokens_seen": 39187424, + "step": 32195 + }, + { + "epoch": 3.586145450495601, + "grad_norm": 0.3582378625869751, + "learning_rate": 4.9048267731163386e-05, + "loss": 0.0963, + "num_input_tokens_seen": 39193600, + "step": 32200 + }, + { + "epoch": 3.586702305379218, + "grad_norm": 1.0742192268371582, + "learning_rate": 4.9047603586159074e-05, + "loss": 0.1113, + "num_input_tokens_seen": 39199552, + "step": 32205 + }, + { + "epoch": 3.5872591602628354, + "grad_norm": 1.458678960800171, + "learning_rate": 4.904693921400587e-05, + "loss": 0.2347, + "num_input_tokens_seen": 39205856, + "step": 32210 + }, + { + "epoch": 3.587816015146453, + "grad_norm": 0.607506513595581, + "learning_rate": 4.904627461471007e-05, + "loss": 0.1074, + "num_input_tokens_seen": 39211776, + "step": 32215 + }, + { + "epoch": 3.5883728700300703, + "grad_norm": 0.4807887375354767, + "learning_rate": 4.904560978827794e-05, + "loss": 0.0517, + "num_input_tokens_seen": 39217504, + "step": 32220 + }, + { + "epoch": 3.5889297249136876, + "grad_norm": 0.5983077883720398, + "learning_rate": 4.904494473471576e-05, + "loss": 0.0903, + "num_input_tokens_seen": 39223744, + "step": 32225 + }, + { + "epoch": 3.589486579797305, + "grad_norm": 0.10840853303670883, + "learning_rate": 4.904427945402981e-05, + "loss": 0.1548, + "num_input_tokens_seen": 39229888, + "step": 32230 + }, + { + "epoch": 3.590043434680922, + "grad_norm": 0.27151134610176086, + "learning_rate": 4.9043613946226375e-05, + "loss": 0.1181, + "num_input_tokens_seen": 39235872, + "step": 32235 + }, + { + "epoch": 3.5906002895645397, + "grad_norm": 0.45029085874557495, + "learning_rate": 4.9042948211311744e-05, + "loss": 0.0376, + "num_input_tokens_seen": 39242048, + "step": 32240 + }, + { + "epoch": 3.591157144448157, + "grad_norm": 0.24101081490516663, + "learning_rate": 4.9042282249292205e-05, + "loss": 0.0964, + "num_input_tokens_seen": 39247840, + "step": 32245 + }, + { + "epoch": 3.591713999331774, + "grad_norm": 1.0967751741409302, + "learning_rate": 4.904161606017405e-05, + "loss": 0.1365, + "num_input_tokens_seen": 39253856, + "step": 32250 + }, + { + "epoch": 3.5922708542153914, + "grad_norm": 0.9560286402702332, + "learning_rate": 4.904094964396357e-05, + "loss": 0.1435, + "num_input_tokens_seen": 39259520, + "step": 32255 + }, + { + "epoch": 3.5928277090990086, + "grad_norm": 0.1307843178510666, + "learning_rate": 4.9040283000667054e-05, + "loss": 0.0867, + "num_input_tokens_seen": 39265920, + "step": 32260 + }, + { + "epoch": 3.5933845639826263, + "grad_norm": 0.5834044218063354, + "learning_rate": 4.903961613029081e-05, + "loss": 0.0318, + "num_input_tokens_seen": 39272192, + "step": 32265 + }, + { + "epoch": 3.5939414188662435, + "grad_norm": 0.6331356763839722, + "learning_rate": 4.9038949032841124e-05, + "loss": 0.2081, + "num_input_tokens_seen": 39278112, + "step": 32270 + }, + { + "epoch": 3.5944982737498608, + "grad_norm": 0.010970879346132278, + "learning_rate": 4.9038281708324305e-05, + "loss": 0.0209, + "num_input_tokens_seen": 39284352, + "step": 32275 + }, + { + "epoch": 3.595055128633478, + "grad_norm": 0.3789184093475342, + "learning_rate": 4.903761415674667e-05, + "loss": 0.1352, + "num_input_tokens_seen": 39290112, + "step": 32280 + }, + { + "epoch": 3.595611983517095, + "grad_norm": 1.0256034135818481, + "learning_rate": 4.90369463781145e-05, + "loss": 0.1326, + "num_input_tokens_seen": 39296224, + "step": 32285 + }, + { + "epoch": 3.596168838400713, + "grad_norm": 0.5204481482505798, + "learning_rate": 4.9036278372434115e-05, + "loss": 0.1689, + "num_input_tokens_seen": 39302240, + "step": 32290 + }, + { + "epoch": 3.59672569328433, + "grad_norm": 0.011292077600955963, + "learning_rate": 4.903561013971182e-05, + "loss": 0.0967, + "num_input_tokens_seen": 39308576, + "step": 32295 + }, + { + "epoch": 3.5972825481679473, + "grad_norm": 0.6353954076766968, + "learning_rate": 4.9034941679953936e-05, + "loss": 0.0778, + "num_input_tokens_seen": 39314176, + "step": 32300 + }, + { + "epoch": 3.597839403051565, + "grad_norm": 0.6608734130859375, + "learning_rate": 4.903427299316676e-05, + "loss": 0.0794, + "num_input_tokens_seen": 39320480, + "step": 32305 + }, + { + "epoch": 3.5983962579351823, + "grad_norm": 0.5058675408363342, + "learning_rate": 4.9033604079356635e-05, + "loss": 0.1268, + "num_input_tokens_seen": 39326816, + "step": 32310 + }, + { + "epoch": 3.5989531128187995, + "grad_norm": 0.6914377808570862, + "learning_rate": 4.9032934938529855e-05, + "loss": 0.0908, + "num_input_tokens_seen": 39332928, + "step": 32315 + }, + { + "epoch": 3.5995099677024167, + "grad_norm": 1.4204707145690918, + "learning_rate": 4.903226557069275e-05, + "loss": 0.2118, + "num_input_tokens_seen": 39339328, + "step": 32320 + }, + { + "epoch": 3.600066822586034, + "grad_norm": 2.0352373123168945, + "learning_rate": 4.903159597585165e-05, + "loss": 0.0571, + "num_input_tokens_seen": 39345600, + "step": 32325 + }, + { + "epoch": 3.6006236774696516, + "grad_norm": 0.6266332864761353, + "learning_rate": 4.903092615401286e-05, + "loss": 0.0431, + "num_input_tokens_seen": 39351840, + "step": 32330 + }, + { + "epoch": 3.601180532353269, + "grad_norm": 0.2961577773094177, + "learning_rate": 4.9030256105182725e-05, + "loss": 0.0543, + "num_input_tokens_seen": 39357792, + "step": 32335 + }, + { + "epoch": 3.601737387236886, + "grad_norm": 0.012707501649856567, + "learning_rate": 4.9029585829367575e-05, + "loss": 0.0219, + "num_input_tokens_seen": 39364160, + "step": 32340 + }, + { + "epoch": 3.6022942421205033, + "grad_norm": 0.07168848067522049, + "learning_rate": 4.9028915326573724e-05, + "loss": 0.0099, + "num_input_tokens_seen": 39370528, + "step": 32345 + }, + { + "epoch": 3.6028510970041205, + "grad_norm": 0.2352292686700821, + "learning_rate": 4.902824459680752e-05, + "loss": 0.0793, + "num_input_tokens_seen": 39376192, + "step": 32350 + }, + { + "epoch": 3.603407951887738, + "grad_norm": 0.03820084407925606, + "learning_rate": 4.90275736400753e-05, + "loss": 0.0579, + "num_input_tokens_seen": 39382336, + "step": 32355 + }, + { + "epoch": 3.6039648067713554, + "grad_norm": 0.23563475906848907, + "learning_rate": 4.902690245638339e-05, + "loss": 0.0714, + "num_input_tokens_seen": 39388576, + "step": 32360 + }, + { + "epoch": 3.6045216616549727, + "grad_norm": 0.419982373714447, + "learning_rate": 4.902623104573814e-05, + "loss": 0.1467, + "num_input_tokens_seen": 39394688, + "step": 32365 + }, + { + "epoch": 3.60507851653859, + "grad_norm": 0.1230822503566742, + "learning_rate": 4.902555940814588e-05, + "loss": 0.0043, + "num_input_tokens_seen": 39400768, + "step": 32370 + }, + { + "epoch": 3.605635371422207, + "grad_norm": 0.00046260879025794566, + "learning_rate": 4.9024887543612976e-05, + "loss": 0.1023, + "num_input_tokens_seen": 39406720, + "step": 32375 + }, + { + "epoch": 3.606192226305825, + "grad_norm": 0.1825477033853531, + "learning_rate": 4.902421545214575e-05, + "loss": 0.0659, + "num_input_tokens_seen": 39412992, + "step": 32380 + }, + { + "epoch": 3.606749081189442, + "grad_norm": 0.6312333941459656, + "learning_rate": 4.902354313375056e-05, + "loss": 0.1, + "num_input_tokens_seen": 39418336, + "step": 32385 + }, + { + "epoch": 3.6073059360730593, + "grad_norm": 0.0008561829454265535, + "learning_rate": 4.902287058843377e-05, + "loss": 0.0855, + "num_input_tokens_seen": 39424704, + "step": 32390 + }, + { + "epoch": 3.607862790956677, + "grad_norm": 0.029156966134905815, + "learning_rate": 4.902219781620171e-05, + "loss": 0.0474, + "num_input_tokens_seen": 39430944, + "step": 32395 + }, + { + "epoch": 3.608419645840294, + "grad_norm": 0.030717015266418457, + "learning_rate": 4.902152481706075e-05, + "loss": 0.0662, + "num_input_tokens_seen": 39437248, + "step": 32400 + }, + { + "epoch": 3.6089765007239114, + "grad_norm": 0.006537401583045721, + "learning_rate": 4.9020851591017235e-05, + "loss": 0.0336, + "num_input_tokens_seen": 39443264, + "step": 32405 + }, + { + "epoch": 3.6095333556075286, + "grad_norm": 0.9355095028877258, + "learning_rate": 4.902017813807754e-05, + "loss": 0.0962, + "num_input_tokens_seen": 39449312, + "step": 32410 + }, + { + "epoch": 3.610090210491146, + "grad_norm": 0.27996936440467834, + "learning_rate": 4.9019504458248014e-05, + "loss": 0.0417, + "num_input_tokens_seen": 39455136, + "step": 32415 + }, + { + "epoch": 3.6106470653747635, + "grad_norm": 0.00048602273454889655, + "learning_rate": 4.901883055153502e-05, + "loss": 0.0094, + "num_input_tokens_seen": 39461184, + "step": 32420 + }, + { + "epoch": 3.6112039202583808, + "grad_norm": 0.7385186553001404, + "learning_rate": 4.901815641794494e-05, + "loss": 0.108, + "num_input_tokens_seen": 39467136, + "step": 32425 + }, + { + "epoch": 3.611760775141998, + "grad_norm": 0.06905362010002136, + "learning_rate": 4.901748205748412e-05, + "loss": 0.048, + "num_input_tokens_seen": 39472832, + "step": 32430 + }, + { + "epoch": 3.6123176300256152, + "grad_norm": 0.4051370620727539, + "learning_rate": 4.901680747015894e-05, + "loss": 0.0779, + "num_input_tokens_seen": 39478976, + "step": 32435 + }, + { + "epoch": 3.6128744849092325, + "grad_norm": 0.011227482929825783, + "learning_rate": 4.9016132655975776e-05, + "loss": 0.0931, + "num_input_tokens_seen": 39485056, + "step": 32440 + }, + { + "epoch": 3.61343133979285, + "grad_norm": 0.052718933671712875, + "learning_rate": 4.9015457614940994e-05, + "loss": 0.0737, + "num_input_tokens_seen": 39490880, + "step": 32445 + }, + { + "epoch": 3.6139881946764674, + "grad_norm": 0.7743541598320007, + "learning_rate": 4.901478234706097e-05, + "loss": 0.0627, + "num_input_tokens_seen": 39497120, + "step": 32450 + }, + { + "epoch": 3.6145450495600846, + "grad_norm": 1.0083200931549072, + "learning_rate": 4.90141068523421e-05, + "loss": 0.0524, + "num_input_tokens_seen": 39503328, + "step": 32455 + }, + { + "epoch": 3.615101904443702, + "grad_norm": 0.029539819806814194, + "learning_rate": 4.901343113079074e-05, + "loss": 0.0148, + "num_input_tokens_seen": 39509472, + "step": 32460 + }, + { + "epoch": 3.615658759327319, + "grad_norm": 0.7723373770713806, + "learning_rate": 4.9012755182413285e-05, + "loss": 0.0797, + "num_input_tokens_seen": 39515968, + "step": 32465 + }, + { + "epoch": 3.6162156142109367, + "grad_norm": 1.956118106842041, + "learning_rate": 4.9012079007216125e-05, + "loss": 0.1365, + "num_input_tokens_seen": 39521952, + "step": 32470 + }, + { + "epoch": 3.616772469094554, + "grad_norm": 0.20416100323200226, + "learning_rate": 4.901140260520564e-05, + "loss": 0.0795, + "num_input_tokens_seen": 39527520, + "step": 32475 + }, + { + "epoch": 3.617329323978171, + "grad_norm": 1.5093340873718262, + "learning_rate": 4.9010725976388204e-05, + "loss": 0.1175, + "num_input_tokens_seen": 39533536, + "step": 32480 + }, + { + "epoch": 3.617886178861789, + "grad_norm": 1.1653207540512085, + "learning_rate": 4.901004912077024e-05, + "loss": 0.0903, + "num_input_tokens_seen": 39539360, + "step": 32485 + }, + { + "epoch": 3.618443033745406, + "grad_norm": 3.7849678993225098, + "learning_rate": 4.900937203835812e-05, + "loss": 0.1989, + "num_input_tokens_seen": 39545536, + "step": 32490 + }, + { + "epoch": 3.6189998886290233, + "grad_norm": 0.3853336572647095, + "learning_rate": 4.9008694729158244e-05, + "loss": 0.0705, + "num_input_tokens_seen": 39551552, + "step": 32495 + }, + { + "epoch": 3.6195567435126406, + "grad_norm": 0.09617165476083755, + "learning_rate": 4.900801719317701e-05, + "loss": 0.0914, + "num_input_tokens_seen": 39557600, + "step": 32500 + }, + { + "epoch": 3.620113598396258, + "grad_norm": 0.1684655100107193, + "learning_rate": 4.900733943042083e-05, + "loss": 0.1561, + "num_input_tokens_seen": 39563744, + "step": 32505 + }, + { + "epoch": 3.6206704532798755, + "grad_norm": 0.020712677389383316, + "learning_rate": 4.9006661440896085e-05, + "loss": 0.015, + "num_input_tokens_seen": 39569952, + "step": 32510 + }, + { + "epoch": 3.6212273081634927, + "grad_norm": 0.013931610621511936, + "learning_rate": 4.900598322460919e-05, + "loss": 0.0735, + "num_input_tokens_seen": 39575840, + "step": 32515 + }, + { + "epoch": 3.62178416304711, + "grad_norm": 1.136214017868042, + "learning_rate": 4.900530478156655e-05, + "loss": 0.0631, + "num_input_tokens_seen": 39582240, + "step": 32520 + }, + { + "epoch": 3.622341017930727, + "grad_norm": 0.027431178838014603, + "learning_rate": 4.9004626111774576e-05, + "loss": 0.1906, + "num_input_tokens_seen": 39588576, + "step": 32525 + }, + { + "epoch": 3.6228978728143444, + "grad_norm": 1.0400357246398926, + "learning_rate": 4.900394721523967e-05, + "loss": 0.1022, + "num_input_tokens_seen": 39594368, + "step": 32530 + }, + { + "epoch": 3.623454727697962, + "grad_norm": 0.726553738117218, + "learning_rate": 4.900326809196826e-05, + "loss": 0.0512, + "num_input_tokens_seen": 39600352, + "step": 32535 + }, + { + "epoch": 3.6240115825815793, + "grad_norm": 0.9324137568473816, + "learning_rate": 4.900258874196674e-05, + "loss": 0.2815, + "num_input_tokens_seen": 39606336, + "step": 32540 + }, + { + "epoch": 3.6245684374651965, + "grad_norm": 0.3221869468688965, + "learning_rate": 4.900190916524155e-05, + "loss": 0.0311, + "num_input_tokens_seen": 39612288, + "step": 32545 + }, + { + "epoch": 3.6251252923488138, + "grad_norm": 0.49354103207588196, + "learning_rate": 4.900122936179909e-05, + "loss": 0.0926, + "num_input_tokens_seen": 39618400, + "step": 32550 + }, + { + "epoch": 3.625682147232431, + "grad_norm": 0.7640476226806641, + "learning_rate": 4.9000549331645796e-05, + "loss": 0.1739, + "num_input_tokens_seen": 39624416, + "step": 32555 + }, + { + "epoch": 3.6262390021160487, + "grad_norm": 1.1046656370162964, + "learning_rate": 4.899986907478808e-05, + "loss": 0.1261, + "num_input_tokens_seen": 39630720, + "step": 32560 + }, + { + "epoch": 3.626795856999666, + "grad_norm": 1.5021655559539795, + "learning_rate": 4.8999188591232376e-05, + "loss": 0.0974, + "num_input_tokens_seen": 39636736, + "step": 32565 + }, + { + "epoch": 3.627352711883283, + "grad_norm": 0.02252158895134926, + "learning_rate": 4.89985078809851e-05, + "loss": 0.0464, + "num_input_tokens_seen": 39642912, + "step": 32570 + }, + { + "epoch": 3.627909566766901, + "grad_norm": 0.08176158368587494, + "learning_rate": 4.89978269440527e-05, + "loss": 0.0429, + "num_input_tokens_seen": 39648992, + "step": 32575 + }, + { + "epoch": 3.628466421650518, + "grad_norm": 0.30066585540771484, + "learning_rate": 4.899714578044159e-05, + "loss": 0.0393, + "num_input_tokens_seen": 39655136, + "step": 32580 + }, + { + "epoch": 3.6290232765341353, + "grad_norm": 2.6807544231414795, + "learning_rate": 4.8996464390158215e-05, + "loss": 0.1166, + "num_input_tokens_seen": 39661120, + "step": 32585 + }, + { + "epoch": 3.6295801314177525, + "grad_norm": 0.06777582317590714, + "learning_rate": 4.899578277320901e-05, + "loss": 0.0477, + "num_input_tokens_seen": 39667136, + "step": 32590 + }, + { + "epoch": 3.6301369863013697, + "grad_norm": 0.5203010439872742, + "learning_rate": 4.899510092960041e-05, + "loss": 0.0954, + "num_input_tokens_seen": 39673632, + "step": 32595 + }, + { + "epoch": 3.6306938411849874, + "grad_norm": 0.3783743381500244, + "learning_rate": 4.899441885933886e-05, + "loss": 0.1421, + "num_input_tokens_seen": 39679328, + "step": 32600 + }, + { + "epoch": 3.6312506960686046, + "grad_norm": 1.494417667388916, + "learning_rate": 4.8993736562430795e-05, + "loss": 0.0538, + "num_input_tokens_seen": 39685440, + "step": 32605 + }, + { + "epoch": 3.631807550952222, + "grad_norm": 0.347064346075058, + "learning_rate": 4.8993054038882666e-05, + "loss": 0.0556, + "num_input_tokens_seen": 39691328, + "step": 32610 + }, + { + "epoch": 3.632364405835839, + "grad_norm": 0.5845589637756348, + "learning_rate": 4.8992371288700924e-05, + "loss": 0.1327, + "num_input_tokens_seen": 39697216, + "step": 32615 + }, + { + "epoch": 3.6329212607194563, + "grad_norm": 1.8997400999069214, + "learning_rate": 4.8991688311892006e-05, + "loss": 0.2095, + "num_input_tokens_seen": 39703296, + "step": 32620 + }, + { + "epoch": 3.633478115603074, + "grad_norm": 0.07000728696584702, + "learning_rate": 4.899100510846237e-05, + "loss": 0.0746, + "num_input_tokens_seen": 39709056, + "step": 32625 + }, + { + "epoch": 3.634034970486691, + "grad_norm": 0.1790507435798645, + "learning_rate": 4.899032167841847e-05, + "loss": 0.0636, + "num_input_tokens_seen": 39714880, + "step": 32630 + }, + { + "epoch": 3.6345918253703084, + "grad_norm": 0.5238017439842224, + "learning_rate": 4.898963802176677e-05, + "loss": 0.0642, + "num_input_tokens_seen": 39720832, + "step": 32635 + }, + { + "epoch": 3.6351486802539257, + "grad_norm": 0.08079712837934494, + "learning_rate": 4.898895413851371e-05, + "loss": 0.0182, + "num_input_tokens_seen": 39726912, + "step": 32640 + }, + { + "epoch": 3.635705535137543, + "grad_norm": 0.15971267223358154, + "learning_rate": 4.8988270028665754e-05, + "loss": 0.022, + "num_input_tokens_seen": 39732800, + "step": 32645 + }, + { + "epoch": 3.6362623900211606, + "grad_norm": 0.4750746786594391, + "learning_rate": 4.898758569222938e-05, + "loss": 0.1312, + "num_input_tokens_seen": 39738624, + "step": 32650 + }, + { + "epoch": 3.636819244904778, + "grad_norm": 0.1811450868844986, + "learning_rate": 4.8986901129211034e-05, + "loss": 0.0531, + "num_input_tokens_seen": 39744544, + "step": 32655 + }, + { + "epoch": 3.637376099788395, + "grad_norm": 0.026835234835743904, + "learning_rate": 4.898621633961719e-05, + "loss": 0.1013, + "num_input_tokens_seen": 39750336, + "step": 32660 + }, + { + "epoch": 3.6379329546720127, + "grad_norm": 0.505428671836853, + "learning_rate": 4.8985531323454315e-05, + "loss": 0.112, + "num_input_tokens_seen": 39756352, + "step": 32665 + }, + { + "epoch": 3.63848980955563, + "grad_norm": 0.7345198392868042, + "learning_rate": 4.898484608072887e-05, + "loss": 0.0981, + "num_input_tokens_seen": 39762528, + "step": 32670 + }, + { + "epoch": 3.639046664439247, + "grad_norm": 1.5813449621200562, + "learning_rate": 4.898416061144736e-05, + "loss": 0.12, + "num_input_tokens_seen": 39768512, + "step": 32675 + }, + { + "epoch": 3.6396035193228644, + "grad_norm": 1.4710826873779297, + "learning_rate": 4.898347491561622e-05, + "loss": 0.2163, + "num_input_tokens_seen": 39774112, + "step": 32680 + }, + { + "epoch": 3.6401603742064816, + "grad_norm": 0.5041201114654541, + "learning_rate": 4.898278899324195e-05, + "loss": 0.0857, + "num_input_tokens_seen": 39779872, + "step": 32685 + }, + { + "epoch": 3.6407172290900993, + "grad_norm": 0.37093695998191833, + "learning_rate": 4.898210284433102e-05, + "loss": 0.1301, + "num_input_tokens_seen": 39786080, + "step": 32690 + }, + { + "epoch": 3.6412740839737165, + "grad_norm": 0.7402286529541016, + "learning_rate": 4.8981416468889917e-05, + "loss": 0.1574, + "num_input_tokens_seen": 39791680, + "step": 32695 + }, + { + "epoch": 3.6418309388573338, + "grad_norm": 0.017387162894010544, + "learning_rate": 4.8980729866925126e-05, + "loss": 0.0499, + "num_input_tokens_seen": 39797952, + "step": 32700 + }, + { + "epoch": 3.642387793740951, + "grad_norm": 0.6214701533317566, + "learning_rate": 4.898004303844312e-05, + "loss": 0.1923, + "num_input_tokens_seen": 39804160, + "step": 32705 + }, + { + "epoch": 3.6429446486245682, + "grad_norm": 0.04203370586037636, + "learning_rate": 4.89793559834504e-05, + "loss": 0.0825, + "num_input_tokens_seen": 39810432, + "step": 32710 + }, + { + "epoch": 3.643501503508186, + "grad_norm": 0.027829375118017197, + "learning_rate": 4.897866870195345e-05, + "loss": 0.0567, + "num_input_tokens_seen": 39816480, + "step": 32715 + }, + { + "epoch": 3.644058358391803, + "grad_norm": 0.2315218299627304, + "learning_rate": 4.897798119395875e-05, + "loss": 0.0623, + "num_input_tokens_seen": 39822560, + "step": 32720 + }, + { + "epoch": 3.6446152132754204, + "grad_norm": 0.5097536444664001, + "learning_rate": 4.897729345947283e-05, + "loss": 0.0784, + "num_input_tokens_seen": 39828416, + "step": 32725 + }, + { + "epoch": 3.6451720681590376, + "grad_norm": 0.08914750069379807, + "learning_rate": 4.897660549850215e-05, + "loss": 0.0883, + "num_input_tokens_seen": 39834496, + "step": 32730 + }, + { + "epoch": 3.645728923042655, + "grad_norm": 0.6352952122688293, + "learning_rate": 4.897591731105322e-05, + "loss": 0.0441, + "num_input_tokens_seen": 39840448, + "step": 32735 + }, + { + "epoch": 3.6462857779262725, + "grad_norm": 0.8777053952217102, + "learning_rate": 4.897522889713255e-05, + "loss": 0.0649, + "num_input_tokens_seen": 39846496, + "step": 32740 + }, + { + "epoch": 3.6468426328098897, + "grad_norm": 0.2575824558734894, + "learning_rate": 4.897454025674662e-05, + "loss": 0.0812, + "num_input_tokens_seen": 39852736, + "step": 32745 + }, + { + "epoch": 3.647399487693507, + "grad_norm": 1.2623673677444458, + "learning_rate": 4.897385138990197e-05, + "loss": 0.2644, + "num_input_tokens_seen": 39858336, + "step": 32750 + }, + { + "epoch": 3.6479563425771246, + "grad_norm": 1.4474818706512451, + "learning_rate": 4.897316229660507e-05, + "loss": 0.0995, + "num_input_tokens_seen": 39863808, + "step": 32755 + }, + { + "epoch": 3.648513197460742, + "grad_norm": 0.1518193632364273, + "learning_rate": 4.8972472976862447e-05, + "loss": 0.0493, + "num_input_tokens_seen": 39869952, + "step": 32760 + }, + { + "epoch": 3.649070052344359, + "grad_norm": 0.7903139591217041, + "learning_rate": 4.8971783430680615e-05, + "loss": 0.0743, + "num_input_tokens_seen": 39876256, + "step": 32765 + }, + { + "epoch": 3.6496269072279763, + "grad_norm": 1.0439947843551636, + "learning_rate": 4.897109365806608e-05, + "loss": 0.1711, + "num_input_tokens_seen": 39882432, + "step": 32770 + }, + { + "epoch": 3.6501837621115936, + "grad_norm": 0.11300040781497955, + "learning_rate": 4.897040365902537e-05, + "loss": 0.0636, + "num_input_tokens_seen": 39888352, + "step": 32775 + }, + { + "epoch": 3.6507406169952112, + "grad_norm": 0.1695348620414734, + "learning_rate": 4.8969713433564977e-05, + "loss": 0.1061, + "num_input_tokens_seen": 39894560, + "step": 32780 + }, + { + "epoch": 3.6512974718788285, + "grad_norm": 0.6908153295516968, + "learning_rate": 4.8969022981691445e-05, + "loss": 0.1189, + "num_input_tokens_seen": 39900832, + "step": 32785 + }, + { + "epoch": 3.6518543267624457, + "grad_norm": 0.27504053711891174, + "learning_rate": 4.8968332303411285e-05, + "loss": 0.1206, + "num_input_tokens_seen": 39907200, + "step": 32790 + }, + { + "epoch": 3.652411181646063, + "grad_norm": 0.014642571099102497, + "learning_rate": 4.896764139873102e-05, + "loss": 0.0552, + "num_input_tokens_seen": 39913248, + "step": 32795 + }, + { + "epoch": 3.65296803652968, + "grad_norm": 0.26248180866241455, + "learning_rate": 4.8966950267657184e-05, + "loss": 0.0892, + "num_input_tokens_seen": 39919488, + "step": 32800 + }, + { + "epoch": 3.653524891413298, + "grad_norm": 1.161937952041626, + "learning_rate": 4.896625891019631e-05, + "loss": 0.1059, + "num_input_tokens_seen": 39925856, + "step": 32805 + }, + { + "epoch": 3.654081746296915, + "grad_norm": 0.7655027508735657, + "learning_rate": 4.89655673263549e-05, + "loss": 0.0638, + "num_input_tokens_seen": 39931552, + "step": 32810 + }, + { + "epoch": 3.6546386011805323, + "grad_norm": 0.31858694553375244, + "learning_rate": 4.896487551613952e-05, + "loss": 0.1735, + "num_input_tokens_seen": 39937440, + "step": 32815 + }, + { + "epoch": 3.6551954560641495, + "grad_norm": 0.4357472062110901, + "learning_rate": 4.896418347955668e-05, + "loss": 0.0519, + "num_input_tokens_seen": 39943200, + "step": 32820 + }, + { + "epoch": 3.6557523109477668, + "grad_norm": 0.03161635249853134, + "learning_rate": 4.896349121661293e-05, + "loss": 0.0232, + "num_input_tokens_seen": 39949600, + "step": 32825 + }, + { + "epoch": 3.6563091658313844, + "grad_norm": 0.02909010276198387, + "learning_rate": 4.8962798727314814e-05, + "loss": 0.1026, + "num_input_tokens_seen": 39955744, + "step": 32830 + }, + { + "epoch": 3.6568660207150017, + "grad_norm": 0.28309330344200134, + "learning_rate": 4.8962106011668854e-05, + "loss": 0.0365, + "num_input_tokens_seen": 39962112, + "step": 32835 + }, + { + "epoch": 3.657422875598619, + "grad_norm": 0.2469671666622162, + "learning_rate": 4.896141306968162e-05, + "loss": 0.1386, + "num_input_tokens_seen": 39968320, + "step": 32840 + }, + { + "epoch": 3.6579797304822366, + "grad_norm": 0.007007175590842962, + "learning_rate": 4.896071990135963e-05, + "loss": 0.0191, + "num_input_tokens_seen": 39974368, + "step": 32845 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 0.6142757534980774, + "learning_rate": 4.8960026506709444e-05, + "loss": 0.0878, + "num_input_tokens_seen": 39980256, + "step": 32850 + }, + { + "epoch": 3.659093440249471, + "grad_norm": 0.11014155298471451, + "learning_rate": 4.895933288573761e-05, + "loss": 0.0176, + "num_input_tokens_seen": 39986368, + "step": 32855 + }, + { + "epoch": 3.6596502951330883, + "grad_norm": 0.12349897623062134, + "learning_rate": 4.8958639038450684e-05, + "loss": 0.059, + "num_input_tokens_seen": 39992544, + "step": 32860 + }, + { + "epoch": 3.6602071500167055, + "grad_norm": 0.30938223004341125, + "learning_rate": 4.895794496485522e-05, + "loss": 0.0772, + "num_input_tokens_seen": 39998496, + "step": 32865 + }, + { + "epoch": 3.660764004900323, + "grad_norm": 0.20475159585475922, + "learning_rate": 4.895725066495776e-05, + "loss": 0.1372, + "num_input_tokens_seen": 40004608, + "step": 32870 + }, + { + "epoch": 3.6613208597839404, + "grad_norm": 0.3036706745624542, + "learning_rate": 4.8956556138764886e-05, + "loss": 0.0197, + "num_input_tokens_seen": 40010848, + "step": 32875 + }, + { + "epoch": 3.6618777146675576, + "grad_norm": 0.08148343116044998, + "learning_rate": 4.8955861386283145e-05, + "loss": 0.0237, + "num_input_tokens_seen": 40016640, + "step": 32880 + }, + { + "epoch": 3.662434569551175, + "grad_norm": 0.23675765097141266, + "learning_rate": 4.895516640751909e-05, + "loss": 0.015, + "num_input_tokens_seen": 40022624, + "step": 32885 + }, + { + "epoch": 3.662991424434792, + "grad_norm": 0.0918349027633667, + "learning_rate": 4.895447120247931e-05, + "loss": 0.0478, + "num_input_tokens_seen": 40028896, + "step": 32890 + }, + { + "epoch": 3.6635482793184098, + "grad_norm": 1.225992202758789, + "learning_rate": 4.895377577117035e-05, + "loss": 0.237, + "num_input_tokens_seen": 40034496, + "step": 32895 + }, + { + "epoch": 3.664105134202027, + "grad_norm": 0.8179536461830139, + "learning_rate": 4.895308011359878e-05, + "loss": 0.0515, + "num_input_tokens_seen": 40040896, + "step": 32900 + }, + { + "epoch": 3.6646619890856442, + "grad_norm": 0.04517531022429466, + "learning_rate": 4.8952384229771184e-05, + "loss": 0.0911, + "num_input_tokens_seen": 40047104, + "step": 32905 + }, + { + "epoch": 3.6652188439692615, + "grad_norm": 0.7164111733436584, + "learning_rate": 4.8951688119694126e-05, + "loss": 0.0988, + "num_input_tokens_seen": 40053344, + "step": 32910 + }, + { + "epoch": 3.6657756988528787, + "grad_norm": 0.16798186302185059, + "learning_rate": 4.895099178337419e-05, + "loss": 0.0259, + "num_input_tokens_seen": 40059584, + "step": 32915 + }, + { + "epoch": 3.6663325537364964, + "grad_norm": 0.8228784799575806, + "learning_rate": 4.895029522081794e-05, + "loss": 0.0571, + "num_input_tokens_seen": 40065952, + "step": 32920 + }, + { + "epoch": 3.6668894086201136, + "grad_norm": 0.5709065198898315, + "learning_rate": 4.8949598432031964e-05, + "loss": 0.0525, + "num_input_tokens_seen": 40071872, + "step": 32925 + }, + { + "epoch": 3.667446263503731, + "grad_norm": 0.4506608247756958, + "learning_rate": 4.8948901417022846e-05, + "loss": 0.0483, + "num_input_tokens_seen": 40077984, + "step": 32930 + }, + { + "epoch": 3.6680031183873485, + "grad_norm": 0.0025092647410929203, + "learning_rate": 4.8948204175797166e-05, + "loss": 0.0204, + "num_input_tokens_seen": 40084448, + "step": 32935 + }, + { + "epoch": 3.6685599732709657, + "grad_norm": 0.7294307947158813, + "learning_rate": 4.894750670836151e-05, + "loss": 0.1584, + "num_input_tokens_seen": 40090496, + "step": 32940 + }, + { + "epoch": 3.669116828154583, + "grad_norm": 0.46772849559783936, + "learning_rate": 4.8946809014722464e-05, + "loss": 0.0706, + "num_input_tokens_seen": 40096352, + "step": 32945 + }, + { + "epoch": 3.6696736830382, + "grad_norm": 0.004701599013060331, + "learning_rate": 4.894611109488663e-05, + "loss": 0.1904, + "num_input_tokens_seen": 40102688, + "step": 32950 + }, + { + "epoch": 3.6702305379218174, + "grad_norm": 0.2745392322540283, + "learning_rate": 4.894541294886058e-05, + "loss": 0.0893, + "num_input_tokens_seen": 40108928, + "step": 32955 + }, + { + "epoch": 3.670787392805435, + "grad_norm": 1.3862495422363281, + "learning_rate": 4.894471457665093e-05, + "loss": 0.056, + "num_input_tokens_seen": 40115072, + "step": 32960 + }, + { + "epoch": 3.6713442476890523, + "grad_norm": 0.08533302694559097, + "learning_rate": 4.8944015978264255e-05, + "loss": 0.0642, + "num_input_tokens_seen": 40120992, + "step": 32965 + }, + { + "epoch": 3.6719011025726696, + "grad_norm": 0.6110008955001831, + "learning_rate": 4.894331715370718e-05, + "loss": 0.083, + "num_input_tokens_seen": 40126784, + "step": 32970 + }, + { + "epoch": 3.672457957456287, + "grad_norm": 1.2628933191299438, + "learning_rate": 4.894261810298628e-05, + "loss": 0.0725, + "num_input_tokens_seen": 40132672, + "step": 32975 + }, + { + "epoch": 3.673014812339904, + "grad_norm": 1.5774708986282349, + "learning_rate": 4.894191882610817e-05, + "loss": 0.1102, + "num_input_tokens_seen": 40138464, + "step": 32980 + }, + { + "epoch": 3.6735716672235217, + "grad_norm": 0.057095978409051895, + "learning_rate": 4.894121932307946e-05, + "loss": 0.0133, + "num_input_tokens_seen": 40144640, + "step": 32985 + }, + { + "epoch": 3.674128522107139, + "grad_norm": 0.4153250753879547, + "learning_rate": 4.8940519593906755e-05, + "loss": 0.0945, + "num_input_tokens_seen": 40150656, + "step": 32990 + }, + { + "epoch": 3.674685376990756, + "grad_norm": 0.08330505341291428, + "learning_rate": 4.893981963859665e-05, + "loss": 0.0862, + "num_input_tokens_seen": 40156544, + "step": 32995 + }, + { + "epoch": 3.6752422318743734, + "grad_norm": 1.4268810749053955, + "learning_rate": 4.893911945715578e-05, + "loss": 0.0723, + "num_input_tokens_seen": 40162848, + "step": 33000 + }, + { + "epoch": 3.6757990867579906, + "grad_norm": 0.10163956880569458, + "learning_rate": 4.893841904959074e-05, + "loss": 0.0994, + "num_input_tokens_seen": 40168832, + "step": 33005 + }, + { + "epoch": 3.6763559416416083, + "grad_norm": 0.0011053503258153796, + "learning_rate": 4.893771841590815e-05, + "loss": 0.1576, + "num_input_tokens_seen": 40175072, + "step": 33010 + }, + { + "epoch": 3.6769127965252255, + "grad_norm": 0.015031068585813046, + "learning_rate": 4.893701755611464e-05, + "loss": 0.1123, + "num_input_tokens_seen": 40181120, + "step": 33015 + }, + { + "epoch": 3.6774696514088427, + "grad_norm": 0.13617785274982452, + "learning_rate": 4.893631647021681e-05, + "loss": 0.1591, + "num_input_tokens_seen": 40187168, + "step": 33020 + }, + { + "epoch": 3.6780265062924604, + "grad_norm": 1.573510766029358, + "learning_rate": 4.89356151582213e-05, + "loss": 0.1633, + "num_input_tokens_seen": 40193312, + "step": 33025 + }, + { + "epoch": 3.6785833611760776, + "grad_norm": 0.409294068813324, + "learning_rate": 4.8934913620134735e-05, + "loss": 0.0651, + "num_input_tokens_seen": 40199328, + "step": 33030 + }, + { + "epoch": 3.679140216059695, + "grad_norm": 1.281661868095398, + "learning_rate": 4.893421185596373e-05, + "loss": 0.087, + "num_input_tokens_seen": 40205184, + "step": 33035 + }, + { + "epoch": 3.679697070943312, + "grad_norm": 0.2549959123134613, + "learning_rate": 4.893350986571491e-05, + "loss": 0.0347, + "num_input_tokens_seen": 40211136, + "step": 33040 + }, + { + "epoch": 3.6802539258269293, + "grad_norm": 0.2948945164680481, + "learning_rate": 4.8932807649394925e-05, + "loss": 0.0744, + "num_input_tokens_seen": 40217344, + "step": 33045 + }, + { + "epoch": 3.680810780710547, + "grad_norm": 3.8989310264587402, + "learning_rate": 4.893210520701039e-05, + "loss": 0.1587, + "num_input_tokens_seen": 40223232, + "step": 33050 + }, + { + "epoch": 3.6813676355941642, + "grad_norm": 0.1506642997264862, + "learning_rate": 4.893140253856795e-05, + "loss": 0.0074, + "num_input_tokens_seen": 40229216, + "step": 33055 + }, + { + "epoch": 3.6819244904777815, + "grad_norm": 0.3340361714363098, + "learning_rate": 4.893069964407424e-05, + "loss": 0.006, + "num_input_tokens_seen": 40235456, + "step": 33060 + }, + { + "epoch": 3.6824813453613987, + "grad_norm": 0.293267160654068, + "learning_rate": 4.8929996523535896e-05, + "loss": 0.0835, + "num_input_tokens_seen": 40241792, + "step": 33065 + }, + { + "epoch": 3.683038200245016, + "grad_norm": 0.9473000168800354, + "learning_rate": 4.892929317695957e-05, + "loss": 0.1172, + "num_input_tokens_seen": 40248032, + "step": 33070 + }, + { + "epoch": 3.6835950551286336, + "grad_norm": 0.013646257109940052, + "learning_rate": 4.892858960435189e-05, + "loss": 0.0175, + "num_input_tokens_seen": 40254016, + "step": 33075 + }, + { + "epoch": 3.684151910012251, + "grad_norm": 0.3837977349758148, + "learning_rate": 4.892788580571951e-05, + "loss": 0.0649, + "num_input_tokens_seen": 40260064, + "step": 33080 + }, + { + "epoch": 3.684708764895868, + "grad_norm": 1.1398727893829346, + "learning_rate": 4.892718178106908e-05, + "loss": 0.0786, + "num_input_tokens_seen": 40266432, + "step": 33085 + }, + { + "epoch": 3.6852656197794853, + "grad_norm": 1.0918097496032715, + "learning_rate": 4.892647753040725e-05, + "loss": 0.0495, + "num_input_tokens_seen": 40272128, + "step": 33090 + }, + { + "epoch": 3.6858224746631025, + "grad_norm": 0.1728736162185669, + "learning_rate": 4.892577305374067e-05, + "loss": 0.0413, + "num_input_tokens_seen": 40278400, + "step": 33095 + }, + { + "epoch": 3.68637932954672, + "grad_norm": 0.09309621155261993, + "learning_rate": 4.892506835107599e-05, + "loss": 0.0713, + "num_input_tokens_seen": 40284352, + "step": 33100 + }, + { + "epoch": 3.6869361844303374, + "grad_norm": 0.32897886633872986, + "learning_rate": 4.892436342241987e-05, + "loss": 0.1289, + "num_input_tokens_seen": 40289888, + "step": 33105 + }, + { + "epoch": 3.6874930393139547, + "grad_norm": 1.0197739601135254, + "learning_rate": 4.8923658267778976e-05, + "loss": 0.1386, + "num_input_tokens_seen": 40295840, + "step": 33110 + }, + { + "epoch": 3.6880498941975723, + "grad_norm": 1.5408145189285278, + "learning_rate": 4.892295288715996e-05, + "loss": 0.109, + "num_input_tokens_seen": 40301792, + "step": 33115 + }, + { + "epoch": 3.6886067490811896, + "grad_norm": 1.0939149856567383, + "learning_rate": 4.892224728056949e-05, + "loss": 0.0762, + "num_input_tokens_seen": 40308064, + "step": 33120 + }, + { + "epoch": 3.689163603964807, + "grad_norm": 1.0147130489349365, + "learning_rate": 4.8921541448014226e-05, + "loss": 0.1471, + "num_input_tokens_seen": 40314048, + "step": 33125 + }, + { + "epoch": 3.689720458848424, + "grad_norm": 0.5991386771202087, + "learning_rate": 4.892083538950083e-05, + "loss": 0.19, + "num_input_tokens_seen": 40320320, + "step": 33130 + }, + { + "epoch": 3.6902773137320413, + "grad_norm": 0.001898612710647285, + "learning_rate": 4.892012910503599e-05, + "loss": 0.0722, + "num_input_tokens_seen": 40326496, + "step": 33135 + }, + { + "epoch": 3.690834168615659, + "grad_norm": 0.031286682933568954, + "learning_rate": 4.891942259462636e-05, + "loss": 0.0582, + "num_input_tokens_seen": 40332640, + "step": 33140 + }, + { + "epoch": 3.691391023499276, + "grad_norm": 0.17036943137645721, + "learning_rate": 4.891871585827862e-05, + "loss": 0.043, + "num_input_tokens_seen": 40338784, + "step": 33145 + }, + { + "epoch": 3.6919478783828934, + "grad_norm": 1.1811084747314453, + "learning_rate": 4.8918008895999444e-05, + "loss": 0.0748, + "num_input_tokens_seen": 40344448, + "step": 33150 + }, + { + "epoch": 3.6925047332665106, + "grad_norm": 0.0387316569685936, + "learning_rate": 4.891730170779551e-05, + "loss": 0.1386, + "num_input_tokens_seen": 40350560, + "step": 33155 + }, + { + "epoch": 3.693061588150128, + "grad_norm": 1.4625853300094604, + "learning_rate": 4.8916594293673515e-05, + "loss": 0.1814, + "num_input_tokens_seen": 40356128, + "step": 33160 + }, + { + "epoch": 3.6936184430337455, + "grad_norm": 0.02265195921063423, + "learning_rate": 4.891588665364011e-05, + "loss": 0.0334, + "num_input_tokens_seen": 40361728, + "step": 33165 + }, + { + "epoch": 3.6941752979173628, + "grad_norm": 0.08461055904626846, + "learning_rate": 4.8915178787702e-05, + "loss": 0.0345, + "num_input_tokens_seen": 40368000, + "step": 33170 + }, + { + "epoch": 3.69473215280098, + "grad_norm": 0.12777820229530334, + "learning_rate": 4.891447069586586e-05, + "loss": 0.0491, + "num_input_tokens_seen": 40373824, + "step": 33175 + }, + { + "epoch": 3.6952890076845977, + "grad_norm": 1.1368483304977417, + "learning_rate": 4.8913762378138386e-05, + "loss": 0.1227, + "num_input_tokens_seen": 40379904, + "step": 33180 + }, + { + "epoch": 3.6958458625682145, + "grad_norm": 0.49962106347084045, + "learning_rate": 4.891305383452627e-05, + "loss": 0.1041, + "num_input_tokens_seen": 40386176, + "step": 33185 + }, + { + "epoch": 3.696402717451832, + "grad_norm": 0.22996287047863007, + "learning_rate": 4.8912345065036205e-05, + "loss": 0.0762, + "num_input_tokens_seen": 40392736, + "step": 33190 + }, + { + "epoch": 3.6969595723354494, + "grad_norm": 0.0030509759671986103, + "learning_rate": 4.891163606967487e-05, + "loss": 0.0187, + "num_input_tokens_seen": 40399168, + "step": 33195 + }, + { + "epoch": 3.6975164272190666, + "grad_norm": 1.002673625946045, + "learning_rate": 4.891092684844899e-05, + "loss": 0.1131, + "num_input_tokens_seen": 40405120, + "step": 33200 + }, + { + "epoch": 3.6980732821026843, + "grad_norm": 0.26282811164855957, + "learning_rate": 4.891021740136524e-05, + "loss": 0.0638, + "num_input_tokens_seen": 40411456, + "step": 33205 + }, + { + "epoch": 3.6986301369863015, + "grad_norm": 0.023272046819329262, + "learning_rate": 4.8909507728430326e-05, + "loss": 0.063, + "num_input_tokens_seen": 40417472, + "step": 33210 + }, + { + "epoch": 3.6991869918699187, + "grad_norm": 0.04489812254905701, + "learning_rate": 4.8908797829650964e-05, + "loss": 0.0496, + "num_input_tokens_seen": 40423712, + "step": 33215 + }, + { + "epoch": 3.699743846753536, + "grad_norm": 0.2844012677669525, + "learning_rate": 4.890808770503384e-05, + "loss": 0.1487, + "num_input_tokens_seen": 40429344, + "step": 33220 + }, + { + "epoch": 3.700300701637153, + "grad_norm": 0.0947086289525032, + "learning_rate": 4.890737735458569e-05, + "loss": 0.0912, + "num_input_tokens_seen": 40435072, + "step": 33225 + }, + { + "epoch": 3.700857556520771, + "grad_norm": 1.342922329902649, + "learning_rate": 4.8906666778313196e-05, + "loss": 0.0914, + "num_input_tokens_seen": 40440800, + "step": 33230 + }, + { + "epoch": 3.701414411404388, + "grad_norm": 1.285767674446106, + "learning_rate": 4.890595597622308e-05, + "loss": 0.1238, + "num_input_tokens_seen": 40447040, + "step": 33235 + }, + { + "epoch": 3.7019712662880053, + "grad_norm": 1.0321428775787354, + "learning_rate": 4.890524494832206e-05, + "loss": 0.0776, + "num_input_tokens_seen": 40453216, + "step": 33240 + }, + { + "epoch": 3.7025281211716226, + "grad_norm": 0.113554947078228, + "learning_rate": 4.890453369461685e-05, + "loss": 0.0225, + "num_input_tokens_seen": 40459264, + "step": 33245 + }, + { + "epoch": 3.70308497605524, + "grad_norm": 0.24308538436889648, + "learning_rate": 4.8903822215114166e-05, + "loss": 0.0419, + "num_input_tokens_seen": 40465440, + "step": 33250 + }, + { + "epoch": 3.7036418309388575, + "grad_norm": 1.1395502090454102, + "learning_rate": 4.8903110509820725e-05, + "loss": 0.0827, + "num_input_tokens_seen": 40471680, + "step": 33255 + }, + { + "epoch": 3.7041986858224747, + "grad_norm": 0.32557085156440735, + "learning_rate": 4.890239857874326e-05, + "loss": 0.1685, + "num_input_tokens_seen": 40477664, + "step": 33260 + }, + { + "epoch": 3.704755540706092, + "grad_norm": 0.003609293606132269, + "learning_rate": 4.8901686421888485e-05, + "loss": 0.1416, + "num_input_tokens_seen": 40483712, + "step": 33265 + }, + { + "epoch": 3.7053123955897096, + "grad_norm": 0.13367149233818054, + "learning_rate": 4.8900974039263135e-05, + "loss": 0.0329, + "num_input_tokens_seen": 40489824, + "step": 33270 + }, + { + "epoch": 3.7058692504733264, + "grad_norm": 0.39700424671173096, + "learning_rate": 4.890026143087394e-05, + "loss": 0.0445, + "num_input_tokens_seen": 40495776, + "step": 33275 + }, + { + "epoch": 3.706426105356944, + "grad_norm": 0.5155273079872131, + "learning_rate": 4.889954859672762e-05, + "loss": 0.0735, + "num_input_tokens_seen": 40502048, + "step": 33280 + }, + { + "epoch": 3.7069829602405613, + "grad_norm": 2.088038444519043, + "learning_rate": 4.8898835536830924e-05, + "loss": 0.0892, + "num_input_tokens_seen": 40508160, + "step": 33285 + }, + { + "epoch": 3.7075398151241785, + "grad_norm": 0.0899307131767273, + "learning_rate": 4.889812225119057e-05, + "loss": 0.0343, + "num_input_tokens_seen": 40513696, + "step": 33290 + }, + { + "epoch": 3.708096670007796, + "grad_norm": 0.6184965968132019, + "learning_rate": 4.8897408739813313e-05, + "loss": 0.0423, + "num_input_tokens_seen": 40519840, + "step": 33295 + }, + { + "epoch": 3.7086535248914134, + "grad_norm": 0.2743433713912964, + "learning_rate": 4.889669500270588e-05, + "loss": 0.0936, + "num_input_tokens_seen": 40526112, + "step": 33300 + }, + { + "epoch": 3.7092103797750307, + "grad_norm": 0.14152106642723083, + "learning_rate": 4.889598103987501e-05, + "loss": 0.057, + "num_input_tokens_seen": 40532416, + "step": 33305 + }, + { + "epoch": 3.709767234658648, + "grad_norm": 1.2114044427871704, + "learning_rate": 4.8895266851327465e-05, + "loss": 0.1416, + "num_input_tokens_seen": 40538048, + "step": 33310 + }, + { + "epoch": 3.710324089542265, + "grad_norm": 0.5424038171768188, + "learning_rate": 4.8894552437069976e-05, + "loss": 0.1727, + "num_input_tokens_seen": 40543872, + "step": 33315 + }, + { + "epoch": 3.710880944425883, + "grad_norm": 1.071993112564087, + "learning_rate": 4.889383779710929e-05, + "loss": 0.2226, + "num_input_tokens_seen": 40549664, + "step": 33320 + }, + { + "epoch": 3.7114377993095, + "grad_norm": 0.633731484413147, + "learning_rate": 4.8893122931452176e-05, + "loss": 0.1618, + "num_input_tokens_seen": 40555680, + "step": 33325 + }, + { + "epoch": 3.7119946541931172, + "grad_norm": 0.2365982085466385, + "learning_rate": 4.889240784010536e-05, + "loss": 0.0506, + "num_input_tokens_seen": 40562016, + "step": 33330 + }, + { + "epoch": 3.7125515090767345, + "grad_norm": 0.9258299469947815, + "learning_rate": 4.889169252307562e-05, + "loss": 0.0881, + "num_input_tokens_seen": 40568640, + "step": 33335 + }, + { + "epoch": 3.7131083639603517, + "grad_norm": 0.7659924626350403, + "learning_rate": 4.889097698036969e-05, + "loss": 0.0786, + "num_input_tokens_seen": 40574784, + "step": 33340 + }, + { + "epoch": 3.7136652188439694, + "grad_norm": 1.7293860912322998, + "learning_rate": 4.889026121199435e-05, + "loss": 0.0721, + "num_input_tokens_seen": 40581024, + "step": 33345 + }, + { + "epoch": 3.7142220737275866, + "grad_norm": 0.7952939867973328, + "learning_rate": 4.8889545217956346e-05, + "loss": 0.1049, + "num_input_tokens_seen": 40587200, + "step": 33350 + }, + { + "epoch": 3.714778928611204, + "grad_norm": 0.23992465436458588, + "learning_rate": 4.8888828998262455e-05, + "loss": 0.1029, + "num_input_tokens_seen": 40593248, + "step": 33355 + }, + { + "epoch": 3.7153357834948215, + "grad_norm": 0.04658392444252968, + "learning_rate": 4.888811255291943e-05, + "loss": 0.1891, + "num_input_tokens_seen": 40599296, + "step": 33360 + }, + { + "epoch": 3.7158926383784383, + "grad_norm": 0.02598268911242485, + "learning_rate": 4.888739588193404e-05, + "loss": 0.0659, + "num_input_tokens_seen": 40605504, + "step": 33365 + }, + { + "epoch": 3.716449493262056, + "grad_norm": 0.42533332109451294, + "learning_rate": 4.888667898531306e-05, + "loss": 0.104, + "num_input_tokens_seen": 40611456, + "step": 33370 + }, + { + "epoch": 3.717006348145673, + "grad_norm": 2.0957326889038086, + "learning_rate": 4.888596186306327e-05, + "loss": 0.095, + "num_input_tokens_seen": 40617344, + "step": 33375 + }, + { + "epoch": 3.7175632030292904, + "grad_norm": 1.2669899463653564, + "learning_rate": 4.8885244515191416e-05, + "loss": 0.1585, + "num_input_tokens_seen": 40623264, + "step": 33380 + }, + { + "epoch": 3.718120057912908, + "grad_norm": 0.32615965604782104, + "learning_rate": 4.88845269417043e-05, + "loss": 0.0614, + "num_input_tokens_seen": 40629344, + "step": 33385 + }, + { + "epoch": 3.7186769127965253, + "grad_norm": 1.4847925901412964, + "learning_rate": 4.8883809142608695e-05, + "loss": 0.0745, + "num_input_tokens_seen": 40635264, + "step": 33390 + }, + { + "epoch": 3.7192337676801426, + "grad_norm": 1.028881549835205, + "learning_rate": 4.888309111791137e-05, + "loss": 0.0594, + "num_input_tokens_seen": 40641312, + "step": 33395 + }, + { + "epoch": 3.71979062256376, + "grad_norm": 0.7577743530273438, + "learning_rate": 4.888237286761912e-05, + "loss": 0.066, + "num_input_tokens_seen": 40647264, + "step": 33400 + }, + { + "epoch": 3.720347477447377, + "grad_norm": 0.9059811234474182, + "learning_rate": 4.8881654391738715e-05, + "loss": 0.0533, + "num_input_tokens_seen": 40653376, + "step": 33405 + }, + { + "epoch": 3.7209043323309947, + "grad_norm": 0.14711791276931763, + "learning_rate": 4.888093569027696e-05, + "loss": 0.0669, + "num_input_tokens_seen": 40659648, + "step": 33410 + }, + { + "epoch": 3.721461187214612, + "grad_norm": 1.1297537088394165, + "learning_rate": 4.888021676324063e-05, + "loss": 0.1179, + "num_input_tokens_seen": 40665312, + "step": 33415 + }, + { + "epoch": 3.722018042098229, + "grad_norm": 0.10534871369600296, + "learning_rate": 4.8879497610636525e-05, + "loss": 0.0993, + "num_input_tokens_seen": 40671072, + "step": 33420 + }, + { + "epoch": 3.7225748969818464, + "grad_norm": 0.22972039878368378, + "learning_rate": 4.887877823247143e-05, + "loss": 0.1117, + "num_input_tokens_seen": 40677184, + "step": 33425 + }, + { + "epoch": 3.7231317518654636, + "grad_norm": 0.9477421641349792, + "learning_rate": 4.8878058628752144e-05, + "loss": 0.0673, + "num_input_tokens_seen": 40683168, + "step": 33430 + }, + { + "epoch": 3.7236886067490813, + "grad_norm": 0.03303741291165352, + "learning_rate": 4.887733879948546e-05, + "loss": 0.0899, + "num_input_tokens_seen": 40688960, + "step": 33435 + }, + { + "epoch": 3.7242454616326985, + "grad_norm": 0.30744436383247375, + "learning_rate": 4.8876618744678185e-05, + "loss": 0.1267, + "num_input_tokens_seen": 40694912, + "step": 33440 + }, + { + "epoch": 3.7248023165163158, + "grad_norm": 0.14682835340499878, + "learning_rate": 4.887589846433711e-05, + "loss": 0.1526, + "num_input_tokens_seen": 40700864, + "step": 33445 + }, + { + "epoch": 3.7253591713999334, + "grad_norm": 0.7595858573913574, + "learning_rate": 4.8875177958469055e-05, + "loss": 0.0683, + "num_input_tokens_seen": 40707328, + "step": 33450 + }, + { + "epoch": 3.7259160262835502, + "grad_norm": 2.421840190887451, + "learning_rate": 4.887445722708081e-05, + "loss": 0.0825, + "num_input_tokens_seen": 40713568, + "step": 33455 + }, + { + "epoch": 3.726472881167168, + "grad_norm": 0.14739160239696503, + "learning_rate": 4.887373627017918e-05, + "loss": 0.0328, + "num_input_tokens_seen": 40719552, + "step": 33460 + }, + { + "epoch": 3.727029736050785, + "grad_norm": 0.6914241313934326, + "learning_rate": 4.8873015087771e-05, + "loss": 0.0339, + "num_input_tokens_seen": 40725888, + "step": 33465 + }, + { + "epoch": 3.7275865909344024, + "grad_norm": 1.4869320392608643, + "learning_rate": 4.887229367986306e-05, + "loss": 0.0671, + "num_input_tokens_seen": 40732384, + "step": 33470 + }, + { + "epoch": 3.72814344581802, + "grad_norm": 0.47615185379981995, + "learning_rate": 4.8871572046462174e-05, + "loss": 0.0782, + "num_input_tokens_seen": 40738688, + "step": 33475 + }, + { + "epoch": 3.7287003007016373, + "grad_norm": 0.024335535243153572, + "learning_rate": 4.8870850187575165e-05, + "loss": 0.0103, + "num_input_tokens_seen": 40745056, + "step": 33480 + }, + { + "epoch": 3.7292571555852545, + "grad_norm": 0.020926516503095627, + "learning_rate": 4.887012810320886e-05, + "loss": 0.0253, + "num_input_tokens_seen": 40751328, + "step": 33485 + }, + { + "epoch": 3.7298140104688717, + "grad_norm": 0.11958533525466919, + "learning_rate": 4.886940579337006e-05, + "loss": 0.032, + "num_input_tokens_seen": 40757536, + "step": 33490 + }, + { + "epoch": 3.730370865352489, + "grad_norm": 0.3460087478160858, + "learning_rate": 4.8868683258065605e-05, + "loss": 0.1018, + "num_input_tokens_seen": 40763776, + "step": 33495 + }, + { + "epoch": 3.7309277202361066, + "grad_norm": 1.0794051885604858, + "learning_rate": 4.886796049730231e-05, + "loss": 0.1658, + "num_input_tokens_seen": 40769824, + "step": 33500 + }, + { + "epoch": 3.731484575119724, + "grad_norm": 0.12742199003696442, + "learning_rate": 4.886723751108701e-05, + "loss": 0.0438, + "num_input_tokens_seen": 40776128, + "step": 33505 + }, + { + "epoch": 3.732041430003341, + "grad_norm": 1.3470489978790283, + "learning_rate": 4.886651429942652e-05, + "loss": 0.108, + "num_input_tokens_seen": 40782272, + "step": 33510 + }, + { + "epoch": 3.7325982848869583, + "grad_norm": 0.04685201123356819, + "learning_rate": 4.88657908623277e-05, + "loss": 0.1034, + "num_input_tokens_seen": 40788384, + "step": 33515 + }, + { + "epoch": 3.7331551397705756, + "grad_norm": 0.03739296272397041, + "learning_rate": 4.886506719979735e-05, + "loss": 0.012, + "num_input_tokens_seen": 40794592, + "step": 33520 + }, + { + "epoch": 3.7337119946541932, + "grad_norm": 0.23256129026412964, + "learning_rate": 4.886434331184232e-05, + "loss": 0.0762, + "num_input_tokens_seen": 40800544, + "step": 33525 + }, + { + "epoch": 3.7342688495378105, + "grad_norm": 0.4753401577472687, + "learning_rate": 4.8863619198469445e-05, + "loss": 0.0582, + "num_input_tokens_seen": 40806592, + "step": 33530 + }, + { + "epoch": 3.7348257044214277, + "grad_norm": 0.0026931024622172117, + "learning_rate": 4.886289485968558e-05, + "loss": 0.0754, + "num_input_tokens_seen": 40812736, + "step": 33535 + }, + { + "epoch": 3.7353825593050454, + "grad_norm": 1.0414589643478394, + "learning_rate": 4.8862170295497546e-05, + "loss": 0.0595, + "num_input_tokens_seen": 40818208, + "step": 33540 + }, + { + "epoch": 3.7359394141886626, + "grad_norm": 0.1061306968331337, + "learning_rate": 4.8861445505912196e-05, + "loss": 0.0356, + "num_input_tokens_seen": 40824320, + "step": 33545 + }, + { + "epoch": 3.73649626907228, + "grad_norm": 0.0013639640528708696, + "learning_rate": 4.886072049093637e-05, + "loss": 0.0844, + "num_input_tokens_seen": 40830464, + "step": 33550 + }, + { + "epoch": 3.737053123955897, + "grad_norm": 0.1597547084093094, + "learning_rate": 4.8859995250576926e-05, + "loss": 0.1175, + "num_input_tokens_seen": 40836512, + "step": 33555 + }, + { + "epoch": 3.7376099788395143, + "grad_norm": 2.7625210285186768, + "learning_rate": 4.8859269784840715e-05, + "loss": 0.1402, + "num_input_tokens_seen": 40842432, + "step": 33560 + }, + { + "epoch": 3.738166833723132, + "grad_norm": 1.4065297842025757, + "learning_rate": 4.8858544093734584e-05, + "loss": 0.1737, + "num_input_tokens_seen": 40848672, + "step": 33565 + }, + { + "epoch": 3.738723688606749, + "grad_norm": 0.08102716505527496, + "learning_rate": 4.885781817726539e-05, + "loss": 0.0251, + "num_input_tokens_seen": 40854944, + "step": 33570 + }, + { + "epoch": 3.7392805434903664, + "grad_norm": 0.3818589448928833, + "learning_rate": 4.885709203543999e-05, + "loss": 0.0605, + "num_input_tokens_seen": 40860960, + "step": 33575 + }, + { + "epoch": 3.7398373983739837, + "grad_norm": 0.20986008644104004, + "learning_rate": 4.8856365668265234e-05, + "loss": 0.1423, + "num_input_tokens_seen": 40867296, + "step": 33580 + }, + { + "epoch": 3.740394253257601, + "grad_norm": 0.4699784219264984, + "learning_rate": 4.8855639075747995e-05, + "loss": 0.029, + "num_input_tokens_seen": 40873408, + "step": 33585 + }, + { + "epoch": 3.7409511081412186, + "grad_norm": 0.7785935401916504, + "learning_rate": 4.885491225789513e-05, + "loss": 0.0222, + "num_input_tokens_seen": 40879936, + "step": 33590 + }, + { + "epoch": 3.741507963024836, + "grad_norm": 0.75851970911026, + "learning_rate": 4.885418521471351e-05, + "loss": 0.0526, + "num_input_tokens_seen": 40886368, + "step": 33595 + }, + { + "epoch": 3.742064817908453, + "grad_norm": 0.7699452042579651, + "learning_rate": 4.8853457946209993e-05, + "loss": 0.0356, + "num_input_tokens_seen": 40892288, + "step": 33600 + }, + { + "epoch": 3.7426216727920703, + "grad_norm": 0.5679885745048523, + "learning_rate": 4.885273045239146e-05, + "loss": 0.0381, + "num_input_tokens_seen": 40898272, + "step": 33605 + }, + { + "epoch": 3.7431785276756875, + "grad_norm": 1.3562493324279785, + "learning_rate": 4.885200273326478e-05, + "loss": 0.1388, + "num_input_tokens_seen": 40904288, + "step": 33610 + }, + { + "epoch": 3.743735382559305, + "grad_norm": 0.333065003156662, + "learning_rate": 4.8851274788836823e-05, + "loss": 0.1187, + "num_input_tokens_seen": 40910176, + "step": 33615 + }, + { + "epoch": 3.7442922374429224, + "grad_norm": 1.3614269495010376, + "learning_rate": 4.8850546619114455e-05, + "loss": 0.1282, + "num_input_tokens_seen": 40916480, + "step": 33620 + }, + { + "epoch": 3.7448490923265396, + "grad_norm": 0.5382206439971924, + "learning_rate": 4.884981822410458e-05, + "loss": 0.0774, + "num_input_tokens_seen": 40922560, + "step": 33625 + }, + { + "epoch": 3.7454059472101573, + "grad_norm": 0.10016052424907684, + "learning_rate": 4.884908960381406e-05, + "loss": 0.0538, + "num_input_tokens_seen": 40928608, + "step": 33630 + }, + { + "epoch": 3.7459628020937745, + "grad_norm": 0.005937360227108002, + "learning_rate": 4.884836075824978e-05, + "loss": 0.0423, + "num_input_tokens_seen": 40934816, + "step": 33635 + }, + { + "epoch": 3.7465196569773918, + "grad_norm": 0.011143672280013561, + "learning_rate": 4.884763168741862e-05, + "loss": 0.0255, + "num_input_tokens_seen": 40940896, + "step": 33640 + }, + { + "epoch": 3.747076511861009, + "grad_norm": 1.257691502571106, + "learning_rate": 4.8846902391327474e-05, + "loss": 0.0694, + "num_input_tokens_seen": 40947136, + "step": 33645 + }, + { + "epoch": 3.747633366744626, + "grad_norm": 0.24336761236190796, + "learning_rate": 4.8846172869983234e-05, + "loss": 0.036, + "num_input_tokens_seen": 40952704, + "step": 33650 + }, + { + "epoch": 3.748190221628244, + "grad_norm": 0.07041438668966293, + "learning_rate": 4.884544312339279e-05, + "loss": 0.0176, + "num_input_tokens_seen": 40958176, + "step": 33655 + }, + { + "epoch": 3.748747076511861, + "grad_norm": 0.8584012389183044, + "learning_rate": 4.8844713151563026e-05, + "loss": 0.1256, + "num_input_tokens_seen": 40964544, + "step": 33660 + }, + { + "epoch": 3.7493039313954784, + "grad_norm": 0.32772377133369446, + "learning_rate": 4.884398295450084e-05, + "loss": 0.0932, + "num_input_tokens_seen": 40970752, + "step": 33665 + }, + { + "epoch": 3.7498607862790956, + "grad_norm": 1.0024080276489258, + "learning_rate": 4.884325253221314e-05, + "loss": 0.095, + "num_input_tokens_seen": 40976864, + "step": 33670 + }, + { + "epoch": 3.750417641162713, + "grad_norm": 0.0006873428355902433, + "learning_rate": 4.884252188470681e-05, + "loss": 0.1391, + "num_input_tokens_seen": 40982368, + "step": 33675 + }, + { + "epoch": 3.7509744960463305, + "grad_norm": 0.3249821066856384, + "learning_rate": 4.884179101198875e-05, + "loss": 0.0477, + "num_input_tokens_seen": 40988384, + "step": 33680 + }, + { + "epoch": 3.7515313509299477, + "grad_norm": 0.005811208859086037, + "learning_rate": 4.884105991406588e-05, + "loss": 0.0917, + "num_input_tokens_seen": 40994720, + "step": 33685 + }, + { + "epoch": 3.752088205813565, + "grad_norm": 0.10045918822288513, + "learning_rate": 4.88403285909451e-05, + "loss": 0.0324, + "num_input_tokens_seen": 41000864, + "step": 33690 + }, + { + "epoch": 3.752645060697182, + "grad_norm": 0.9464149475097656, + "learning_rate": 4.8839597042633314e-05, + "loss": 0.1275, + "num_input_tokens_seen": 41006720, + "step": 33695 + }, + { + "epoch": 3.7532019155807994, + "grad_norm": 0.10156286507844925, + "learning_rate": 4.8838865269137436e-05, + "loss": 0.0546, + "num_input_tokens_seen": 41012352, + "step": 33700 + }, + { + "epoch": 3.753758770464417, + "grad_norm": 0.07769004255533218, + "learning_rate": 4.883813327046437e-05, + "loss": 0.0966, + "num_input_tokens_seen": 41018848, + "step": 33705 + }, + { + "epoch": 3.7543156253480343, + "grad_norm": 0.48384547233581543, + "learning_rate": 4.883740104662104e-05, + "loss": 0.1433, + "num_input_tokens_seen": 41024704, + "step": 33710 + }, + { + "epoch": 3.7548724802316515, + "grad_norm": 0.017978066578507423, + "learning_rate": 4.8836668597614364e-05, + "loss": 0.0581, + "num_input_tokens_seen": 41030528, + "step": 33715 + }, + { + "epoch": 3.755429335115269, + "grad_norm": 0.6461074352264404, + "learning_rate": 4.883593592345126e-05, + "loss": 0.1914, + "num_input_tokens_seen": 41036544, + "step": 33720 + }, + { + "epoch": 3.7559861899988864, + "grad_norm": 0.005386491771787405, + "learning_rate": 4.8835203024138634e-05, + "loss": 0.0449, + "num_input_tokens_seen": 41042912, + "step": 33725 + }, + { + "epoch": 3.7565430448825037, + "grad_norm": 0.6512866616249084, + "learning_rate": 4.8834469899683414e-05, + "loss": 0.082, + "num_input_tokens_seen": 41049312, + "step": 33730 + }, + { + "epoch": 3.757099899766121, + "grad_norm": 0.0037285631988197565, + "learning_rate": 4.8833736550092535e-05, + "loss": 0.0331, + "num_input_tokens_seen": 41055488, + "step": 33735 + }, + { + "epoch": 3.757656754649738, + "grad_norm": 0.2904895842075348, + "learning_rate": 4.883300297537292e-05, + "loss": 0.0575, + "num_input_tokens_seen": 41061440, + "step": 33740 + }, + { + "epoch": 3.758213609533356, + "grad_norm": 0.3098065257072449, + "learning_rate": 4.88322691755315e-05, + "loss": 0.1049, + "num_input_tokens_seen": 41067424, + "step": 33745 + }, + { + "epoch": 3.758770464416973, + "grad_norm": 0.00041316408896818757, + "learning_rate": 4.88315351505752e-05, + "loss": 0.0724, + "num_input_tokens_seen": 41073696, + "step": 33750 + }, + { + "epoch": 3.7593273193005903, + "grad_norm": 0.12483888864517212, + "learning_rate": 4.883080090051096e-05, + "loss": 0.1564, + "num_input_tokens_seen": 41079808, + "step": 33755 + }, + { + "epoch": 3.7598841741842075, + "grad_norm": 0.09161446243524551, + "learning_rate": 4.883006642534571e-05, + "loss": 0.2071, + "num_input_tokens_seen": 41086208, + "step": 33760 + }, + { + "epoch": 3.7604410290678247, + "grad_norm": 0.0007916974718682468, + "learning_rate": 4.882933172508639e-05, + "loss": 0.0951, + "num_input_tokens_seen": 41092544, + "step": 33765 + }, + { + "epoch": 3.7609978839514424, + "grad_norm": 0.3799409866333008, + "learning_rate": 4.8828596799739945e-05, + "loss": 0.071, + "num_input_tokens_seen": 41098496, + "step": 33770 + }, + { + "epoch": 3.7615547388350596, + "grad_norm": 0.5823971629142761, + "learning_rate": 4.882786164931331e-05, + "loss": 0.1985, + "num_input_tokens_seen": 41104960, + "step": 33775 + }, + { + "epoch": 3.762111593718677, + "grad_norm": 0.7977815866470337, + "learning_rate": 4.8827126273813426e-05, + "loss": 0.0651, + "num_input_tokens_seen": 41111296, + "step": 33780 + }, + { + "epoch": 3.762668448602294, + "grad_norm": 1.1581759452819824, + "learning_rate": 4.882639067324725e-05, + "loss": 0.0828, + "num_input_tokens_seen": 41117472, + "step": 33785 + }, + { + "epoch": 3.7632253034859113, + "grad_norm": 1.12652587890625, + "learning_rate": 4.8825654847621727e-05, + "loss": 0.0963, + "num_input_tokens_seen": 41123424, + "step": 33790 + }, + { + "epoch": 3.763782158369529, + "grad_norm": 1.8419603109359741, + "learning_rate": 4.88249187969438e-05, + "loss": 0.1826, + "num_input_tokens_seen": 41129696, + "step": 33795 + }, + { + "epoch": 3.7643390132531462, + "grad_norm": 0.09839534759521484, + "learning_rate": 4.8824182521220426e-05, + "loss": 0.1638, + "num_input_tokens_seen": 41135808, + "step": 33800 + }, + { + "epoch": 3.7648958681367635, + "grad_norm": 0.4223189651966095, + "learning_rate": 4.8823446020458566e-05, + "loss": 0.0274, + "num_input_tokens_seen": 41141152, + "step": 33805 + }, + { + "epoch": 3.765452723020381, + "grad_norm": 0.042894843965768814, + "learning_rate": 4.882270929466516e-05, + "loss": 0.1232, + "num_input_tokens_seen": 41147392, + "step": 33810 + }, + { + "epoch": 3.7660095779039984, + "grad_norm": 0.0007793783443048596, + "learning_rate": 4.882197234384719e-05, + "loss": 0.0694, + "num_input_tokens_seen": 41153824, + "step": 33815 + }, + { + "epoch": 3.7665664327876156, + "grad_norm": 0.00649644061923027, + "learning_rate": 4.8821235168011596e-05, + "loss": 0.0371, + "num_input_tokens_seen": 41160192, + "step": 33820 + }, + { + "epoch": 3.767123287671233, + "grad_norm": 0.6297017931938171, + "learning_rate": 4.882049776716536e-05, + "loss": 0.0593, + "num_input_tokens_seen": 41166112, + "step": 33825 + }, + { + "epoch": 3.76768014255485, + "grad_norm": 0.4809893071651459, + "learning_rate": 4.8819760141315426e-05, + "loss": 0.0212, + "num_input_tokens_seen": 41172192, + "step": 33830 + }, + { + "epoch": 3.7682369974384677, + "grad_norm": 0.7316564917564392, + "learning_rate": 4.881902229046879e-05, + "loss": 0.1648, + "num_input_tokens_seen": 41178368, + "step": 33835 + }, + { + "epoch": 3.768793852322085, + "grad_norm": 0.539918065071106, + "learning_rate": 4.881828421463239e-05, + "loss": 0.0733, + "num_input_tokens_seen": 41184608, + "step": 33840 + }, + { + "epoch": 3.769350707205702, + "grad_norm": 1.4658393859863281, + "learning_rate": 4.881754591381322e-05, + "loss": 0.1259, + "num_input_tokens_seen": 41190976, + "step": 33845 + }, + { + "epoch": 3.7699075620893194, + "grad_norm": 0.18663986027240753, + "learning_rate": 4.881680738801825e-05, + "loss": 0.0717, + "num_input_tokens_seen": 41196512, + "step": 33850 + }, + { + "epoch": 3.7704644169729367, + "grad_norm": 0.612605094909668, + "learning_rate": 4.8816068637254444e-05, + "loss": 0.108, + "num_input_tokens_seen": 41202400, + "step": 33855 + }, + { + "epoch": 3.7710212718565543, + "grad_norm": 1.2811321020126343, + "learning_rate": 4.8815329661528794e-05, + "loss": 0.0714, + "num_input_tokens_seen": 41208704, + "step": 33860 + }, + { + "epoch": 3.7715781267401716, + "grad_norm": 0.4488596022129059, + "learning_rate": 4.8814590460848276e-05, + "loss": 0.0397, + "num_input_tokens_seen": 41214720, + "step": 33865 + }, + { + "epoch": 3.772134981623789, + "grad_norm": 0.5079705715179443, + "learning_rate": 4.8813851035219857e-05, + "loss": 0.0539, + "num_input_tokens_seen": 41220736, + "step": 33870 + }, + { + "epoch": 3.772691836507406, + "grad_norm": 0.39441800117492676, + "learning_rate": 4.8813111384650555e-05, + "loss": 0.0511, + "num_input_tokens_seen": 41226848, + "step": 33875 + }, + { + "epoch": 3.7732486913910233, + "grad_norm": 0.2780699133872986, + "learning_rate": 4.8812371509147325e-05, + "loss": 0.171, + "num_input_tokens_seen": 41233152, + "step": 33880 + }, + { + "epoch": 3.773805546274641, + "grad_norm": 0.001959853805601597, + "learning_rate": 4.881163140871717e-05, + "loss": 0.041, + "num_input_tokens_seen": 41239360, + "step": 33885 + }, + { + "epoch": 3.774362401158258, + "grad_norm": 0.19957010447978973, + "learning_rate": 4.881089108336708e-05, + "loss": 0.0779, + "num_input_tokens_seen": 41245248, + "step": 33890 + }, + { + "epoch": 3.7749192560418754, + "grad_norm": 3.109809637069702, + "learning_rate": 4.881015053310406e-05, + "loss": 0.1848, + "num_input_tokens_seen": 41251488, + "step": 33895 + }, + { + "epoch": 3.775476110925493, + "grad_norm": 0.41888391971588135, + "learning_rate": 4.8809409757935075e-05, + "loss": 0.0312, + "num_input_tokens_seen": 41257824, + "step": 33900 + }, + { + "epoch": 3.7760329658091103, + "grad_norm": 1.5787664651870728, + "learning_rate": 4.8808668757867136e-05, + "loss": 0.0801, + "num_input_tokens_seen": 41264032, + "step": 33905 + }, + { + "epoch": 3.7765898206927275, + "grad_norm": 0.3812815845012665, + "learning_rate": 4.880792753290726e-05, + "loss": 0.0475, + "num_input_tokens_seen": 41270240, + "step": 33910 + }, + { + "epoch": 3.7771466755763448, + "grad_norm": 1.5106092691421509, + "learning_rate": 4.880718608306243e-05, + "loss": 0.2587, + "num_input_tokens_seen": 41276608, + "step": 33915 + }, + { + "epoch": 3.777703530459962, + "grad_norm": 0.010356582701206207, + "learning_rate": 4.8806444408339643e-05, + "loss": 0.0217, + "num_input_tokens_seen": 41282848, + "step": 33920 + }, + { + "epoch": 3.7782603853435797, + "grad_norm": 0.6465928554534912, + "learning_rate": 4.880570250874592e-05, + "loss": 0.1463, + "num_input_tokens_seen": 41288224, + "step": 33925 + }, + { + "epoch": 3.778817240227197, + "grad_norm": 1.8075448274612427, + "learning_rate": 4.8804960384288265e-05, + "loss": 0.062, + "num_input_tokens_seen": 41294400, + "step": 33930 + }, + { + "epoch": 3.779374095110814, + "grad_norm": 0.025882726535201073, + "learning_rate": 4.880421803497369e-05, + "loss": 0.0291, + "num_input_tokens_seen": 41300640, + "step": 33935 + }, + { + "epoch": 3.7799309499944314, + "grad_norm": 0.44445857405662537, + "learning_rate": 4.8803475460809206e-05, + "loss": 0.0248, + "num_input_tokens_seen": 41306624, + "step": 33940 + }, + { + "epoch": 3.7804878048780486, + "grad_norm": 0.03272552043199539, + "learning_rate": 4.880273266180182e-05, + "loss": 0.0569, + "num_input_tokens_seen": 41312704, + "step": 33945 + }, + { + "epoch": 3.7810446597616663, + "grad_norm": 0.0006570511031895876, + "learning_rate": 4.880198963795856e-05, + "loss": 0.0402, + "num_input_tokens_seen": 41319232, + "step": 33950 + }, + { + "epoch": 3.7816015146452835, + "grad_norm": 0.0006830180645920336, + "learning_rate": 4.880124638928643e-05, + "loss": 0.0466, + "num_input_tokens_seen": 41325312, + "step": 33955 + }, + { + "epoch": 3.7821583695289007, + "grad_norm": 1.0388624668121338, + "learning_rate": 4.880050291579246e-05, + "loss": 0.138, + "num_input_tokens_seen": 41330944, + "step": 33960 + }, + { + "epoch": 3.782715224412518, + "grad_norm": 0.32242926955223083, + "learning_rate": 4.879975921748368e-05, + "loss": 0.09, + "num_input_tokens_seen": 41336704, + "step": 33965 + }, + { + "epoch": 3.783272079296135, + "grad_norm": 0.003451185068115592, + "learning_rate": 4.8799015294367096e-05, + "loss": 0.0046, + "num_input_tokens_seen": 41342752, + "step": 33970 + }, + { + "epoch": 3.783828934179753, + "grad_norm": 0.2735763490200043, + "learning_rate": 4.879827114644975e-05, + "loss": 0.0381, + "num_input_tokens_seen": 41348704, + "step": 33975 + }, + { + "epoch": 3.78438578906337, + "grad_norm": 1.329921841621399, + "learning_rate": 4.8797526773738664e-05, + "loss": 0.1228, + "num_input_tokens_seen": 41354784, + "step": 33980 + }, + { + "epoch": 3.7849426439469873, + "grad_norm": 0.017443865537643433, + "learning_rate": 4.879678217624087e-05, + "loss": 0.026, + "num_input_tokens_seen": 41360640, + "step": 33985 + }, + { + "epoch": 3.785499498830605, + "grad_norm": 0.27670302987098694, + "learning_rate": 4.8796037353963406e-05, + "loss": 0.0334, + "num_input_tokens_seen": 41366560, + "step": 33990 + }, + { + "epoch": 3.7860563537142222, + "grad_norm": 0.4017666280269623, + "learning_rate": 4.8795292306913296e-05, + "loss": 0.1329, + "num_input_tokens_seen": 41372768, + "step": 33995 + }, + { + "epoch": 3.7866132085978395, + "grad_norm": 0.04051775112748146, + "learning_rate": 4.879454703509759e-05, + "loss": 0.0589, + "num_input_tokens_seen": 41378656, + "step": 34000 + }, + { + "epoch": 3.7871700634814567, + "grad_norm": 0.023873871192336082, + "learning_rate": 4.879380153852333e-05, + "loss": 0.1366, + "num_input_tokens_seen": 41384352, + "step": 34005 + }, + { + "epoch": 3.787726918365074, + "grad_norm": 0.1650194376707077, + "learning_rate": 4.8793055817197556e-05, + "loss": 0.1291, + "num_input_tokens_seen": 41390656, + "step": 34010 + }, + { + "epoch": 3.7882837732486916, + "grad_norm": 0.1712084710597992, + "learning_rate": 4.8792309871127296e-05, + "loss": 0.1652, + "num_input_tokens_seen": 41396768, + "step": 34015 + }, + { + "epoch": 3.788840628132309, + "grad_norm": 0.199002206325531, + "learning_rate": 4.879156370031961e-05, + "loss": 0.0535, + "num_input_tokens_seen": 41402912, + "step": 34020 + }, + { + "epoch": 3.789397483015926, + "grad_norm": 0.4829520285129547, + "learning_rate": 4.879081730478154e-05, + "loss": 0.0198, + "num_input_tokens_seen": 41409024, + "step": 34025 + }, + { + "epoch": 3.7899543378995433, + "grad_norm": 0.8288648724555969, + "learning_rate": 4.879007068452014e-05, + "loss": 0.0797, + "num_input_tokens_seen": 41415264, + "step": 34030 + }, + { + "epoch": 3.7905111927831605, + "grad_norm": 0.8613840937614441, + "learning_rate": 4.8789323839542466e-05, + "loss": 0.1259, + "num_input_tokens_seen": 41421504, + "step": 34035 + }, + { + "epoch": 3.791068047666778, + "grad_norm": 2.6454410552978516, + "learning_rate": 4.8788576769855564e-05, + "loss": 0.1286, + "num_input_tokens_seen": 41427584, + "step": 34040 + }, + { + "epoch": 3.7916249025503954, + "grad_norm": 0.33995500206947327, + "learning_rate": 4.87878294754665e-05, + "loss": 0.0977, + "num_input_tokens_seen": 41433696, + "step": 34045 + }, + { + "epoch": 3.7921817574340126, + "grad_norm": 0.4830567240715027, + "learning_rate": 4.878708195638233e-05, + "loss": 0.095, + "num_input_tokens_seen": 41439936, + "step": 34050 + }, + { + "epoch": 3.79273861231763, + "grad_norm": 0.37029707431793213, + "learning_rate": 4.8786334212610105e-05, + "loss": 0.024, + "num_input_tokens_seen": 41446208, + "step": 34055 + }, + { + "epoch": 3.793295467201247, + "grad_norm": 0.14694960415363312, + "learning_rate": 4.87855862441569e-05, + "loss": 0.0818, + "num_input_tokens_seen": 41452128, + "step": 34060 + }, + { + "epoch": 3.793852322084865, + "grad_norm": 1.3480433225631714, + "learning_rate": 4.878483805102978e-05, + "loss": 0.1728, + "num_input_tokens_seen": 41457568, + "step": 34065 + }, + { + "epoch": 3.794409176968482, + "grad_norm": 0.0034614575561136007, + "learning_rate": 4.878408963323581e-05, + "loss": 0.0138, + "num_input_tokens_seen": 41463712, + "step": 34070 + }, + { + "epoch": 3.7949660318520992, + "grad_norm": 0.005664993543177843, + "learning_rate": 4.878334099078204e-05, + "loss": 0.0571, + "num_input_tokens_seen": 41469856, + "step": 34075 + }, + { + "epoch": 3.795522886735717, + "grad_norm": 0.8011394143104553, + "learning_rate": 4.878259212367558e-05, + "loss": 0.0316, + "num_input_tokens_seen": 41476128, + "step": 34080 + }, + { + "epoch": 3.796079741619334, + "grad_norm": 0.46657928824424744, + "learning_rate": 4.878184303192348e-05, + "loss": 0.1088, + "num_input_tokens_seen": 41482080, + "step": 34085 + }, + { + "epoch": 3.7966365965029514, + "grad_norm": 0.4883700907230377, + "learning_rate": 4.878109371553281e-05, + "loss": 0.1037, + "num_input_tokens_seen": 41487872, + "step": 34090 + }, + { + "epoch": 3.7971934513865686, + "grad_norm": 0.5706022381782532, + "learning_rate": 4.878034417451066e-05, + "loss": 0.1911, + "num_input_tokens_seen": 41494112, + "step": 34095 + }, + { + "epoch": 3.797750306270186, + "grad_norm": 0.030158087611198425, + "learning_rate": 4.877959440886411e-05, + "loss": 0.0476, + "num_input_tokens_seen": 41500384, + "step": 34100 + }, + { + "epoch": 3.7983071611538035, + "grad_norm": 0.19721508026123047, + "learning_rate": 4.8778844418600235e-05, + "loss": 0.1159, + "num_input_tokens_seen": 41506400, + "step": 34105 + }, + { + "epoch": 3.7988640160374207, + "grad_norm": 0.47379070520401, + "learning_rate": 4.8778094203726125e-05, + "loss": 0.0182, + "num_input_tokens_seen": 41512288, + "step": 34110 + }, + { + "epoch": 3.799420870921038, + "grad_norm": 0.4269229471683502, + "learning_rate": 4.877734376424887e-05, + "loss": 0.1151, + "num_input_tokens_seen": 41518880, + "step": 34115 + }, + { + "epoch": 3.799977725804655, + "grad_norm": 0.15457473695278168, + "learning_rate": 4.877659310017555e-05, + "loss": 0.0293, + "num_input_tokens_seen": 41525504, + "step": 34120 + }, + { + "epoch": 3.8005345806882724, + "grad_norm": 0.44764551520347595, + "learning_rate": 4.877584221151325e-05, + "loss": 0.1413, + "num_input_tokens_seen": 41531424, + "step": 34125 + }, + { + "epoch": 3.80109143557189, + "grad_norm": 0.15848439931869507, + "learning_rate": 4.877509109826908e-05, + "loss": 0.029, + "num_input_tokens_seen": 41536896, + "step": 34130 + }, + { + "epoch": 3.8016482904555073, + "grad_norm": 0.43518757820129395, + "learning_rate": 4.8774339760450125e-05, + "loss": 0.0974, + "num_input_tokens_seen": 41543200, + "step": 34135 + }, + { + "epoch": 3.8022051453391246, + "grad_norm": 0.6228029727935791, + "learning_rate": 4.877358819806348e-05, + "loss": 0.1424, + "num_input_tokens_seen": 41548480, + "step": 34140 + }, + { + "epoch": 3.802762000222742, + "grad_norm": 1.1442272663116455, + "learning_rate": 4.877283641111625e-05, + "loss": 0.1396, + "num_input_tokens_seen": 41554592, + "step": 34145 + }, + { + "epoch": 3.803318855106359, + "grad_norm": 0.36433103680610657, + "learning_rate": 4.877208439961554e-05, + "loss": 0.0585, + "num_input_tokens_seen": 41560704, + "step": 34150 + }, + { + "epoch": 3.8038757099899767, + "grad_norm": 0.022070497274398804, + "learning_rate": 4.877133216356844e-05, + "loss": 0.1635, + "num_input_tokens_seen": 41565696, + "step": 34155 + }, + { + "epoch": 3.804432564873594, + "grad_norm": 0.4998605251312256, + "learning_rate": 4.877057970298206e-05, + "loss": 0.0271, + "num_input_tokens_seen": 41571776, + "step": 34160 + }, + { + "epoch": 3.804989419757211, + "grad_norm": 0.8049934506416321, + "learning_rate": 4.8769827017863514e-05, + "loss": 0.1537, + "num_input_tokens_seen": 41577920, + "step": 34165 + }, + { + "epoch": 3.805546274640829, + "grad_norm": 0.5395233631134033, + "learning_rate": 4.87690741082199e-05, + "loss": 0.086, + "num_input_tokens_seen": 41583744, + "step": 34170 + }, + { + "epoch": 3.806103129524446, + "grad_norm": 0.276612251996994, + "learning_rate": 4.8768320974058345e-05, + "loss": 0.1331, + "num_input_tokens_seen": 41590336, + "step": 34175 + }, + { + "epoch": 3.8066599844080633, + "grad_norm": 0.8500767350196838, + "learning_rate": 4.876756761538596e-05, + "loss": 0.1022, + "num_input_tokens_seen": 41596032, + "step": 34180 + }, + { + "epoch": 3.8072168392916805, + "grad_norm": 1.280840277671814, + "learning_rate": 4.876681403220985e-05, + "loss": 0.1477, + "num_input_tokens_seen": 41602336, + "step": 34185 + }, + { + "epoch": 3.8077736941752978, + "grad_norm": 0.6366733312606812, + "learning_rate": 4.876606022453714e-05, + "loss": 0.0547, + "num_input_tokens_seen": 41608736, + "step": 34190 + }, + { + "epoch": 3.8083305490589154, + "grad_norm": 0.9819231629371643, + "learning_rate": 4.8765306192374954e-05, + "loss": 0.0774, + "num_input_tokens_seen": 41614688, + "step": 34195 + }, + { + "epoch": 3.8088874039425327, + "grad_norm": 1.7745476961135864, + "learning_rate": 4.8764551935730405e-05, + "loss": 0.1409, + "num_input_tokens_seen": 41621024, + "step": 34200 + }, + { + "epoch": 3.80944425882615, + "grad_norm": 0.5171442031860352, + "learning_rate": 4.876379745461063e-05, + "loss": 0.0229, + "num_input_tokens_seen": 41627072, + "step": 34205 + }, + { + "epoch": 3.810001113709767, + "grad_norm": 0.06306523084640503, + "learning_rate": 4.876304274902275e-05, + "loss": 0.0537, + "num_input_tokens_seen": 41633472, + "step": 34210 + }, + { + "epoch": 3.8105579685933844, + "grad_norm": 0.38638851046562195, + "learning_rate": 4.876228781897389e-05, + "loss": 0.0235, + "num_input_tokens_seen": 41640128, + "step": 34215 + }, + { + "epoch": 3.811114823477002, + "grad_norm": 0.0019406058127060533, + "learning_rate": 4.876153266447117e-05, + "loss": 0.0893, + "num_input_tokens_seen": 41646176, + "step": 34220 + }, + { + "epoch": 3.8116716783606193, + "grad_norm": 0.6953125596046448, + "learning_rate": 4.8760777285521755e-05, + "loss": 0.1406, + "num_input_tokens_seen": 41652160, + "step": 34225 + }, + { + "epoch": 3.8122285332442365, + "grad_norm": 1.3239781856536865, + "learning_rate": 4.876002168213275e-05, + "loss": 0.0798, + "num_input_tokens_seen": 41658208, + "step": 34230 + }, + { + "epoch": 3.8127853881278537, + "grad_norm": 1.472066879272461, + "learning_rate": 4.875926585431131e-05, + "loss": 0.1844, + "num_input_tokens_seen": 41664384, + "step": 34235 + }, + { + "epoch": 3.813342243011471, + "grad_norm": 0.061434537172317505, + "learning_rate": 4.8758509802064567e-05, + "loss": 0.1822, + "num_input_tokens_seen": 41670528, + "step": 34240 + }, + { + "epoch": 3.8138990978950886, + "grad_norm": 1.4073538780212402, + "learning_rate": 4.8757753525399664e-05, + "loss": 0.0917, + "num_input_tokens_seen": 41676512, + "step": 34245 + }, + { + "epoch": 3.814455952778706, + "grad_norm": 0.00076483772136271, + "learning_rate": 4.875699702432374e-05, + "loss": 0.0745, + "num_input_tokens_seen": 41683040, + "step": 34250 + }, + { + "epoch": 3.815012807662323, + "grad_norm": 0.0013437155866995454, + "learning_rate": 4.8756240298843946e-05, + "loss": 0.0585, + "num_input_tokens_seen": 41689408, + "step": 34255 + }, + { + "epoch": 3.8155696625459408, + "grad_norm": 0.03648259490728378, + "learning_rate": 4.8755483348967435e-05, + "loss": 0.083, + "num_input_tokens_seen": 41695872, + "step": 34260 + }, + { + "epoch": 3.816126517429558, + "grad_norm": 0.8647553324699402, + "learning_rate": 4.8754726174701345e-05, + "loss": 0.124, + "num_input_tokens_seen": 41702240, + "step": 34265 + }, + { + "epoch": 3.8166833723131752, + "grad_norm": 0.02014332450926304, + "learning_rate": 4.875396877605285e-05, + "loss": 0.0839, + "num_input_tokens_seen": 41708224, + "step": 34270 + }, + { + "epoch": 3.8172402271967925, + "grad_norm": 0.04662146791815758, + "learning_rate": 4.8753211153029075e-05, + "loss": 0.0848, + "num_input_tokens_seen": 41714240, + "step": 34275 + }, + { + "epoch": 3.8177970820804097, + "grad_norm": 0.46506062150001526, + "learning_rate": 4.875245330563719e-05, + "loss": 0.0636, + "num_input_tokens_seen": 41720384, + "step": 34280 + }, + { + "epoch": 3.8183539369640274, + "grad_norm": 0.24246692657470703, + "learning_rate": 4.875169523388435e-05, + "loss": 0.0471, + "num_input_tokens_seen": 41726304, + "step": 34285 + }, + { + "epoch": 3.8189107918476446, + "grad_norm": 0.16203224658966064, + "learning_rate": 4.875093693777773e-05, + "loss": 0.1814, + "num_input_tokens_seen": 41732416, + "step": 34290 + }, + { + "epoch": 3.819467646731262, + "grad_norm": 0.4066742956638336, + "learning_rate": 4.875017841732448e-05, + "loss": 0.1115, + "num_input_tokens_seen": 41738592, + "step": 34295 + }, + { + "epoch": 3.820024501614879, + "grad_norm": 0.3413991630077362, + "learning_rate": 4.874941967253176e-05, + "loss": 0.0393, + "num_input_tokens_seen": 41744512, + "step": 34300 + }, + { + "epoch": 3.8205813564984963, + "grad_norm": 0.223247691988945, + "learning_rate": 4.874866070340675e-05, + "loss": 0.1421, + "num_input_tokens_seen": 41750528, + "step": 34305 + }, + { + "epoch": 3.821138211382114, + "grad_norm": 1.3200111389160156, + "learning_rate": 4.87479015099566e-05, + "loss": 0.0773, + "num_input_tokens_seen": 41756480, + "step": 34310 + }, + { + "epoch": 3.821695066265731, + "grad_norm": 0.819670557975769, + "learning_rate": 4.8747142092188506e-05, + "loss": 0.1055, + "num_input_tokens_seen": 41762560, + "step": 34315 + }, + { + "epoch": 3.8222519211493484, + "grad_norm": 0.010109474882483482, + "learning_rate": 4.874638245010962e-05, + "loss": 0.025, + "num_input_tokens_seen": 41768576, + "step": 34320 + }, + { + "epoch": 3.8228087760329656, + "grad_norm": 0.1224743127822876, + "learning_rate": 4.8745622583727135e-05, + "loss": 0.052, + "num_input_tokens_seen": 41775136, + "step": 34325 + }, + { + "epoch": 3.823365630916583, + "grad_norm": 0.5054544806480408, + "learning_rate": 4.874486249304821e-05, + "loss": 0.0612, + "num_input_tokens_seen": 41781248, + "step": 34330 + }, + { + "epoch": 3.8239224858002006, + "grad_norm": 0.6495469212532043, + "learning_rate": 4.874410217808004e-05, + "loss": 0.0631, + "num_input_tokens_seen": 41787136, + "step": 34335 + }, + { + "epoch": 3.824479340683818, + "grad_norm": 0.6312963962554932, + "learning_rate": 4.8743341638829806e-05, + "loss": 0.034, + "num_input_tokens_seen": 41793344, + "step": 34340 + }, + { + "epoch": 3.825036195567435, + "grad_norm": 1.0415533781051636, + "learning_rate": 4.8742580875304686e-05, + "loss": 0.0793, + "num_input_tokens_seen": 41799296, + "step": 34345 + }, + { + "epoch": 3.8255930504510527, + "grad_norm": 1.7781046628952026, + "learning_rate": 4.8741819887511866e-05, + "loss": 0.0947, + "num_input_tokens_seen": 41805376, + "step": 34350 + }, + { + "epoch": 3.82614990533467, + "grad_norm": 0.011858262121677399, + "learning_rate": 4.8741058675458535e-05, + "loss": 0.086, + "num_input_tokens_seen": 41811712, + "step": 34355 + }, + { + "epoch": 3.826706760218287, + "grad_norm": 0.8380367159843445, + "learning_rate": 4.874029723915188e-05, + "loss": 0.1086, + "num_input_tokens_seen": 41818080, + "step": 34360 + }, + { + "epoch": 3.8272636151019044, + "grad_norm": 0.5832754969596863, + "learning_rate": 4.8739535578599105e-05, + "loss": 0.1114, + "num_input_tokens_seen": 41824416, + "step": 34365 + }, + { + "epoch": 3.8278204699855216, + "grad_norm": 0.003714545862749219, + "learning_rate": 4.873877369380739e-05, + "loss": 0.0399, + "num_input_tokens_seen": 41831008, + "step": 34370 + }, + { + "epoch": 3.8283773248691393, + "grad_norm": 0.5146629214286804, + "learning_rate": 4.873801158478394e-05, + "loss": 0.0572, + "num_input_tokens_seen": 41836928, + "step": 34375 + }, + { + "epoch": 3.8289341797527565, + "grad_norm": 1.2319196462631226, + "learning_rate": 4.873724925153595e-05, + "loss": 0.1589, + "num_input_tokens_seen": 41843264, + "step": 34380 + }, + { + "epoch": 3.8294910346363737, + "grad_norm": 0.7626959085464478, + "learning_rate": 4.8736486694070633e-05, + "loss": 0.0873, + "num_input_tokens_seen": 41849248, + "step": 34385 + }, + { + "epoch": 3.830047889519991, + "grad_norm": 0.8144989609718323, + "learning_rate": 4.873572391239517e-05, + "loss": 0.1667, + "num_input_tokens_seen": 41855264, + "step": 34390 + }, + { + "epoch": 3.830604744403608, + "grad_norm": 1.4050214290618896, + "learning_rate": 4.873496090651679e-05, + "loss": 0.0665, + "num_input_tokens_seen": 41861056, + "step": 34395 + }, + { + "epoch": 3.831161599287226, + "grad_norm": 0.42377573251724243, + "learning_rate": 4.873419767644268e-05, + "loss": 0.0563, + "num_input_tokens_seen": 41866880, + "step": 34400 + }, + { + "epoch": 3.831718454170843, + "grad_norm": 0.14267511665821075, + "learning_rate": 4.873343422218005e-05, + "loss": 0.0227, + "num_input_tokens_seen": 41872864, + "step": 34405 + }, + { + "epoch": 3.8322753090544603, + "grad_norm": 0.14019078016281128, + "learning_rate": 4.873267054373613e-05, + "loss": 0.0653, + "num_input_tokens_seen": 41878880, + "step": 34410 + }, + { + "epoch": 3.8328321639380776, + "grad_norm": 0.3723708689212799, + "learning_rate": 4.8731906641118116e-05, + "loss": 0.0262, + "num_input_tokens_seen": 41884544, + "step": 34415 + }, + { + "epoch": 3.833389018821695, + "grad_norm": 0.23236213624477386, + "learning_rate": 4.873114251433324e-05, + "loss": 0.0948, + "num_input_tokens_seen": 41890656, + "step": 34420 + }, + { + "epoch": 3.8339458737053125, + "grad_norm": 0.00632241228595376, + "learning_rate": 4.87303781633887e-05, + "loss": 0.0307, + "num_input_tokens_seen": 41896960, + "step": 34425 + }, + { + "epoch": 3.8345027285889297, + "grad_norm": 0.7640148401260376, + "learning_rate": 4.8729613588291735e-05, + "loss": 0.083, + "num_input_tokens_seen": 41903200, + "step": 34430 + }, + { + "epoch": 3.835059583472547, + "grad_norm": 0.948850691318512, + "learning_rate": 4.872884878904955e-05, + "loss": 0.0738, + "num_input_tokens_seen": 41909216, + "step": 34435 + }, + { + "epoch": 3.8356164383561646, + "grad_norm": 0.487281858921051, + "learning_rate": 4.872808376566937e-05, + "loss": 0.0812, + "num_input_tokens_seen": 41915040, + "step": 34440 + }, + { + "epoch": 3.836173293239782, + "grad_norm": 0.9947352409362793, + "learning_rate": 4.8727318518158446e-05, + "loss": 0.0109, + "num_input_tokens_seen": 41921440, + "step": 34445 + }, + { + "epoch": 3.836730148123399, + "grad_norm": 1.1148772239685059, + "learning_rate": 4.8726553046523976e-05, + "loss": 0.108, + "num_input_tokens_seen": 41927680, + "step": 34450 + }, + { + "epoch": 3.8372870030070163, + "grad_norm": 0.22776885330677032, + "learning_rate": 4.8725787350773214e-05, + "loss": 0.0546, + "num_input_tokens_seen": 41933280, + "step": 34455 + }, + { + "epoch": 3.8378438578906335, + "grad_norm": 0.16256679594516754, + "learning_rate": 4.8725021430913364e-05, + "loss": 0.0231, + "num_input_tokens_seen": 41939360, + "step": 34460 + }, + { + "epoch": 3.838400712774251, + "grad_norm": 0.05320892855525017, + "learning_rate": 4.872425528695169e-05, + "loss": 0.0171, + "num_input_tokens_seen": 41945376, + "step": 34465 + }, + { + "epoch": 3.8389575676578684, + "grad_norm": 1.73294997215271, + "learning_rate": 4.872348891889542e-05, + "loss": 0.0829, + "num_input_tokens_seen": 41951488, + "step": 34470 + }, + { + "epoch": 3.8395144225414857, + "grad_norm": 0.048286229372024536, + "learning_rate": 4.872272232675178e-05, + "loss": 0.0214, + "num_input_tokens_seen": 41957632, + "step": 34475 + }, + { + "epoch": 3.840071277425103, + "grad_norm": 1.977346658706665, + "learning_rate": 4.872195551052803e-05, + "loss": 0.1153, + "num_input_tokens_seen": 41963712, + "step": 34480 + }, + { + "epoch": 3.84062813230872, + "grad_norm": 0.11916716396808624, + "learning_rate": 4.87211884702314e-05, + "loss": 0.1286, + "num_input_tokens_seen": 41969888, + "step": 34485 + }, + { + "epoch": 3.841184987192338, + "grad_norm": 1.5338199138641357, + "learning_rate": 4.872042120586915e-05, + "loss": 0.1452, + "num_input_tokens_seen": 41975520, + "step": 34490 + }, + { + "epoch": 3.841741842075955, + "grad_norm": 0.31414052844047546, + "learning_rate": 4.871965371744851e-05, + "loss": 0.0635, + "num_input_tokens_seen": 41981760, + "step": 34495 + }, + { + "epoch": 3.8422986969595723, + "grad_norm": 0.40601596236228943, + "learning_rate": 4.871888600497673e-05, + "loss": 0.0744, + "num_input_tokens_seen": 41988064, + "step": 34500 + }, + { + "epoch": 3.8428555518431895, + "grad_norm": 0.5013412833213806, + "learning_rate": 4.871811806846108e-05, + "loss": 0.0756, + "num_input_tokens_seen": 41994080, + "step": 34505 + }, + { + "epoch": 3.8434124067268067, + "grad_norm": 1.0014145374298096, + "learning_rate": 4.8717349907908794e-05, + "loss": 0.0677, + "num_input_tokens_seen": 41999840, + "step": 34510 + }, + { + "epoch": 3.8439692616104244, + "grad_norm": 0.03893747925758362, + "learning_rate": 4.871658152332714e-05, + "loss": 0.0247, + "num_input_tokens_seen": 42005472, + "step": 34515 + }, + { + "epoch": 3.8445261164940416, + "grad_norm": 0.09496825188398361, + "learning_rate": 4.8715812914723367e-05, + "loss": 0.0194, + "num_input_tokens_seen": 42011648, + "step": 34520 + }, + { + "epoch": 3.845082971377659, + "grad_norm": 0.07336492091417313, + "learning_rate": 4.8715044082104744e-05, + "loss": 0.0222, + "num_input_tokens_seen": 42018048, + "step": 34525 + }, + { + "epoch": 3.8456398262612765, + "grad_norm": 0.39577656984329224, + "learning_rate": 4.871427502547853e-05, + "loss": 0.0485, + "num_input_tokens_seen": 42023680, + "step": 34530 + }, + { + "epoch": 3.8461966811448938, + "grad_norm": 1.11105477809906, + "learning_rate": 4.871350574485199e-05, + "loss": 0.0551, + "num_input_tokens_seen": 42030176, + "step": 34535 + }, + { + "epoch": 3.846753536028511, + "grad_norm": 0.15766257047653198, + "learning_rate": 4.8712736240232385e-05, + "loss": 0.1028, + "num_input_tokens_seen": 42036576, + "step": 34540 + }, + { + "epoch": 3.8473103909121282, + "grad_norm": 0.39290502667427063, + "learning_rate": 4.871196651162699e-05, + "loss": 0.115, + "num_input_tokens_seen": 42042592, + "step": 34545 + }, + { + "epoch": 3.8478672457957455, + "grad_norm": 0.02923681028187275, + "learning_rate": 4.871119655904308e-05, + "loss": 0.092, + "num_input_tokens_seen": 42048672, + "step": 34550 + }, + { + "epoch": 3.848424100679363, + "grad_norm": 0.235210120677948, + "learning_rate": 4.871042638248791e-05, + "loss": 0.0708, + "num_input_tokens_seen": 42054944, + "step": 34555 + }, + { + "epoch": 3.8489809555629804, + "grad_norm": 0.0025363184977322817, + "learning_rate": 4.8709655981968774e-05, + "loss": 0.0227, + "num_input_tokens_seen": 42061152, + "step": 34560 + }, + { + "epoch": 3.8495378104465976, + "grad_norm": 0.053167760372161865, + "learning_rate": 4.870888535749294e-05, + "loss": 0.0279, + "num_input_tokens_seen": 42066976, + "step": 34565 + }, + { + "epoch": 3.850094665330215, + "grad_norm": 0.4693783223628998, + "learning_rate": 4.870811450906768e-05, + "loss": 0.2054, + "num_input_tokens_seen": 42072928, + "step": 34570 + }, + { + "epoch": 3.850651520213832, + "grad_norm": 1.116554617881775, + "learning_rate": 4.8707343436700295e-05, + "loss": 0.0973, + "num_input_tokens_seen": 42079232, + "step": 34575 + }, + { + "epoch": 3.8512083750974497, + "grad_norm": 0.46352168917655945, + "learning_rate": 4.870657214039806e-05, + "loss": 0.1551, + "num_input_tokens_seen": 42084800, + "step": 34580 + }, + { + "epoch": 3.851765229981067, + "grad_norm": 0.5312322974205017, + "learning_rate": 4.870580062016825e-05, + "loss": 0.0368, + "num_input_tokens_seen": 42090976, + "step": 34585 + }, + { + "epoch": 3.852322084864684, + "grad_norm": 0.00517478259280324, + "learning_rate": 4.870502887601816e-05, + "loss": 0.0319, + "num_input_tokens_seen": 42097088, + "step": 34590 + }, + { + "epoch": 3.852878939748302, + "grad_norm": 0.2106008529663086, + "learning_rate": 4.870425690795508e-05, + "loss": 0.1979, + "num_input_tokens_seen": 42103136, + "step": 34595 + }, + { + "epoch": 3.8534357946319187, + "grad_norm": 0.2027244120836258, + "learning_rate": 4.87034847159863e-05, + "loss": 0.101, + "num_input_tokens_seen": 42109280, + "step": 34600 + }, + { + "epoch": 3.8539926495155363, + "grad_norm": 1.4317090511322021, + "learning_rate": 4.8702712300119125e-05, + "loss": 0.1004, + "num_input_tokens_seen": 42115328, + "step": 34605 + }, + { + "epoch": 3.8545495043991536, + "grad_norm": 0.2429162561893463, + "learning_rate": 4.870193966036084e-05, + "loss": 0.0413, + "num_input_tokens_seen": 42121696, + "step": 34610 + }, + { + "epoch": 3.855106359282771, + "grad_norm": 0.40212932229042053, + "learning_rate": 4.870116679671874e-05, + "loss": 0.1161, + "num_input_tokens_seen": 42128000, + "step": 34615 + }, + { + "epoch": 3.8556632141663885, + "grad_norm": 0.0022117779590189457, + "learning_rate": 4.8700393709200134e-05, + "loss": 0.1044, + "num_input_tokens_seen": 42134272, + "step": 34620 + }, + { + "epoch": 3.8562200690500057, + "grad_norm": 0.830202579498291, + "learning_rate": 4.8699620397812315e-05, + "loss": 0.1771, + "num_input_tokens_seen": 42140256, + "step": 34625 + }, + { + "epoch": 3.856776923933623, + "grad_norm": 0.03140651807188988, + "learning_rate": 4.86988468625626e-05, + "loss": 0.0656, + "num_input_tokens_seen": 42146432, + "step": 34630 + }, + { + "epoch": 3.85733377881724, + "grad_norm": 0.6396765112876892, + "learning_rate": 4.8698073103458285e-05, + "loss": 0.1144, + "num_input_tokens_seen": 42152992, + "step": 34635 + }, + { + "epoch": 3.8578906337008574, + "grad_norm": 0.7074165940284729, + "learning_rate": 4.869729912050669e-05, + "loss": 0.0273, + "num_input_tokens_seen": 42159552, + "step": 34640 + }, + { + "epoch": 3.858447488584475, + "grad_norm": 0.28398171067237854, + "learning_rate": 4.869652491371511e-05, + "loss": 0.0374, + "num_input_tokens_seen": 42165632, + "step": 34645 + }, + { + "epoch": 3.8590043434680923, + "grad_norm": 0.014726611785590649, + "learning_rate": 4.8695750483090875e-05, + "loss": 0.0961, + "num_input_tokens_seen": 42171968, + "step": 34650 + }, + { + "epoch": 3.8595611983517095, + "grad_norm": 0.2874503433704376, + "learning_rate": 4.8694975828641286e-05, + "loss": 0.014, + "num_input_tokens_seen": 42177952, + "step": 34655 + }, + { + "epoch": 3.8601180532353268, + "grad_norm": 0.012057134881615639, + "learning_rate": 4.869420095037367e-05, + "loss": 0.0351, + "num_input_tokens_seen": 42184224, + "step": 34660 + }, + { + "epoch": 3.860674908118944, + "grad_norm": 1.1917892694473267, + "learning_rate": 4.869342584829534e-05, + "loss": 0.0676, + "num_input_tokens_seen": 42190592, + "step": 34665 + }, + { + "epoch": 3.8612317630025617, + "grad_norm": 2.015939474105835, + "learning_rate": 4.869265052241362e-05, + "loss": 0.1779, + "num_input_tokens_seen": 42196640, + "step": 34670 + }, + { + "epoch": 3.861788617886179, + "grad_norm": 0.942103385925293, + "learning_rate": 4.869187497273584e-05, + "loss": 0.0775, + "num_input_tokens_seen": 42202784, + "step": 34675 + }, + { + "epoch": 3.862345472769796, + "grad_norm": 0.9154215455055237, + "learning_rate": 4.869109919926931e-05, + "loss": 0.1011, + "num_input_tokens_seen": 42208896, + "step": 34680 + }, + { + "epoch": 3.862902327653414, + "grad_norm": 0.4678095281124115, + "learning_rate": 4.869032320202137e-05, + "loss": 0.1016, + "num_input_tokens_seen": 42214880, + "step": 34685 + }, + { + "epoch": 3.8634591825370306, + "grad_norm": 0.06169731914997101, + "learning_rate": 4.868954698099935e-05, + "loss": 0.0288, + "num_input_tokens_seen": 42221184, + "step": 34690 + }, + { + "epoch": 3.8640160374206483, + "grad_norm": 0.42301222681999207, + "learning_rate": 4.8688770536210574e-05, + "loss": 0.108, + "num_input_tokens_seen": 42227328, + "step": 34695 + }, + { + "epoch": 3.8645728923042655, + "grad_norm": 1.2132441997528076, + "learning_rate": 4.868799386766239e-05, + "loss": 0.0756, + "num_input_tokens_seen": 42233280, + "step": 34700 + }, + { + "epoch": 3.8651297471878827, + "grad_norm": 0.2952318787574768, + "learning_rate": 4.868721697536211e-05, + "loss": 0.0561, + "num_input_tokens_seen": 42239648, + "step": 34705 + }, + { + "epoch": 3.8656866020715004, + "grad_norm": 0.1504868119955063, + "learning_rate": 4.86864398593171e-05, + "loss": 0.131, + "num_input_tokens_seen": 42245824, + "step": 34710 + }, + { + "epoch": 3.8662434569551176, + "grad_norm": 0.17295575141906738, + "learning_rate": 4.868566251953469e-05, + "loss": 0.0357, + "num_input_tokens_seen": 42251840, + "step": 34715 + }, + { + "epoch": 3.866800311838735, + "grad_norm": 1.6978050470352173, + "learning_rate": 4.8684884956022216e-05, + "loss": 0.0533, + "num_input_tokens_seen": 42257504, + "step": 34720 + }, + { + "epoch": 3.867357166722352, + "grad_norm": 0.38570547103881836, + "learning_rate": 4.8684107168787025e-05, + "loss": 0.1641, + "num_input_tokens_seen": 42262976, + "step": 34725 + }, + { + "epoch": 3.8679140216059693, + "grad_norm": 0.34738633036613464, + "learning_rate": 4.8683329157836466e-05, + "loss": 0.0548, + "num_input_tokens_seen": 42269248, + "step": 34730 + }, + { + "epoch": 3.868470876489587, + "grad_norm": 0.18651767075061798, + "learning_rate": 4.868255092317789e-05, + "loss": 0.0858, + "num_input_tokens_seen": 42275392, + "step": 34735 + }, + { + "epoch": 3.869027731373204, + "grad_norm": 0.699726939201355, + "learning_rate": 4.8681772464818656e-05, + "loss": 0.1628, + "num_input_tokens_seen": 42281120, + "step": 34740 + }, + { + "epoch": 3.8695845862568214, + "grad_norm": 0.02460881695151329, + "learning_rate": 4.8680993782766096e-05, + "loss": 0.0276, + "num_input_tokens_seen": 42287168, + "step": 34745 + }, + { + "epoch": 3.8701414411404387, + "grad_norm": 0.5521037578582764, + "learning_rate": 4.868021487702758e-05, + "loss": 0.0513, + "num_input_tokens_seen": 42293504, + "step": 34750 + }, + { + "epoch": 3.870698296024056, + "grad_norm": 0.0005439318483695388, + "learning_rate": 4.867943574761046e-05, + "loss": 0.0274, + "num_input_tokens_seen": 42299744, + "step": 34755 + }, + { + "epoch": 3.8712551509076736, + "grad_norm": 0.3266499936580658, + "learning_rate": 4.867865639452211e-05, + "loss": 0.0777, + "num_input_tokens_seen": 42305824, + "step": 34760 + }, + { + "epoch": 3.871812005791291, + "grad_norm": 0.5299777388572693, + "learning_rate": 4.867787681776986e-05, + "loss": 0.0765, + "num_input_tokens_seen": 42311136, + "step": 34765 + }, + { + "epoch": 3.872368860674908, + "grad_norm": 0.9658896327018738, + "learning_rate": 4.8677097017361106e-05, + "loss": 0.0611, + "num_input_tokens_seen": 42317216, + "step": 34770 + }, + { + "epoch": 3.8729257155585257, + "grad_norm": 0.009687925688922405, + "learning_rate": 4.8676316993303195e-05, + "loss": 0.1188, + "num_input_tokens_seen": 42323328, + "step": 34775 + }, + { + "epoch": 3.8734825704421425, + "grad_norm": 0.06773333996534348, + "learning_rate": 4.86755367456035e-05, + "loss": 0.032, + "num_input_tokens_seen": 42329408, + "step": 34780 + }, + { + "epoch": 3.87403942532576, + "grad_norm": 1.1577303409576416, + "learning_rate": 4.8674756274269394e-05, + "loss": 0.1925, + "num_input_tokens_seen": 42335424, + "step": 34785 + }, + { + "epoch": 3.8745962802093774, + "grad_norm": 0.5488534569740295, + "learning_rate": 4.867397557930825e-05, + "loss": 0.0151, + "num_input_tokens_seen": 42341856, + "step": 34790 + }, + { + "epoch": 3.8751531350929946, + "grad_norm": 1.0202256441116333, + "learning_rate": 4.867319466072744e-05, + "loss": 0.06, + "num_input_tokens_seen": 42348032, + "step": 34795 + }, + { + "epoch": 3.8757099899766123, + "grad_norm": 0.3052116334438324, + "learning_rate": 4.867241351853434e-05, + "loss": 0.098, + "num_input_tokens_seen": 42354112, + "step": 34800 + }, + { + "epoch": 3.8762668448602295, + "grad_norm": 0.10213976353406906, + "learning_rate": 4.867163215273632e-05, + "loss": 0.0511, + "num_input_tokens_seen": 42360384, + "step": 34805 + }, + { + "epoch": 3.8768236997438468, + "grad_norm": 0.051074445247650146, + "learning_rate": 4.8670850563340775e-05, + "loss": 0.0193, + "num_input_tokens_seen": 42366688, + "step": 34810 + }, + { + "epoch": 3.877380554627464, + "grad_norm": 0.3023832440376282, + "learning_rate": 4.8670068750355086e-05, + "loss": 0.0872, + "num_input_tokens_seen": 42372416, + "step": 34815 + }, + { + "epoch": 3.8779374095110812, + "grad_norm": 1.701625108718872, + "learning_rate": 4.866928671378663e-05, + "loss": 0.0902, + "num_input_tokens_seen": 42378336, + "step": 34820 + }, + { + "epoch": 3.878494264394699, + "grad_norm": 1.1013082265853882, + "learning_rate": 4.8668504453642794e-05, + "loss": 0.0748, + "num_input_tokens_seen": 42384416, + "step": 34825 + }, + { + "epoch": 3.879051119278316, + "grad_norm": 1.3559379577636719, + "learning_rate": 4.8667721969930976e-05, + "loss": 0.1532, + "num_input_tokens_seen": 42389952, + "step": 34830 + }, + { + "epoch": 3.8796079741619334, + "grad_norm": 0.6165803670883179, + "learning_rate": 4.8666939262658554e-05, + "loss": 0.0668, + "num_input_tokens_seen": 42396320, + "step": 34835 + }, + { + "epoch": 3.8801648290455506, + "grad_norm": 2.690727710723877, + "learning_rate": 4.8666156331832934e-05, + "loss": 0.1206, + "num_input_tokens_seen": 42402880, + "step": 34840 + }, + { + "epoch": 3.880721683929168, + "grad_norm": 0.7931370139122009, + "learning_rate": 4.86653731774615e-05, + "loss": 0.0651, + "num_input_tokens_seen": 42409056, + "step": 34845 + }, + { + "epoch": 3.8812785388127855, + "grad_norm": 0.46137142181396484, + "learning_rate": 4.8664589799551666e-05, + "loss": 0.0818, + "num_input_tokens_seen": 42414976, + "step": 34850 + }, + { + "epoch": 3.8818353936964027, + "grad_norm": 0.8126834630966187, + "learning_rate": 4.866380619811082e-05, + "loss": 0.1406, + "num_input_tokens_seen": 42421088, + "step": 34855 + }, + { + "epoch": 3.88239224858002, + "grad_norm": 0.0028972672298550606, + "learning_rate": 4.8663022373146363e-05, + "loss": 0.059, + "num_input_tokens_seen": 42427200, + "step": 34860 + }, + { + "epoch": 3.8829491034636376, + "grad_norm": 0.10961507260799408, + "learning_rate": 4.8662238324665696e-05, + "loss": 0.0636, + "num_input_tokens_seen": 42433152, + "step": 34865 + }, + { + "epoch": 3.883505958347255, + "grad_norm": 0.16836108267307281, + "learning_rate": 4.866145405267624e-05, + "loss": 0.0321, + "num_input_tokens_seen": 42439328, + "step": 34870 + }, + { + "epoch": 3.884062813230872, + "grad_norm": 0.8141340613365173, + "learning_rate": 4.866066955718539e-05, + "loss": 0.0696, + "num_input_tokens_seen": 42445696, + "step": 34875 + }, + { + "epoch": 3.8846196681144893, + "grad_norm": 2.013990640640259, + "learning_rate": 4.8659884838200556e-05, + "loss": 0.0331, + "num_input_tokens_seen": 42451968, + "step": 34880 + }, + { + "epoch": 3.8851765229981066, + "grad_norm": 0.458734393119812, + "learning_rate": 4.8659099895729156e-05, + "loss": 0.1066, + "num_input_tokens_seen": 42458176, + "step": 34885 + }, + { + "epoch": 3.8857333778817242, + "grad_norm": 0.7667382955551147, + "learning_rate": 4.86583147297786e-05, + "loss": 0.0793, + "num_input_tokens_seen": 42464640, + "step": 34890 + }, + { + "epoch": 3.8862902327653415, + "grad_norm": 0.49456459283828735, + "learning_rate": 4.865752934035631e-05, + "loss": 0.0114, + "num_input_tokens_seen": 42471008, + "step": 34895 + }, + { + "epoch": 3.8868470876489587, + "grad_norm": 1.620043158531189, + "learning_rate": 4.86567437274697e-05, + "loss": 0.1232, + "num_input_tokens_seen": 42477120, + "step": 34900 + }, + { + "epoch": 3.887403942532576, + "grad_norm": 0.00456258375197649, + "learning_rate": 4.8655957891126195e-05, + "loss": 0.144, + "num_input_tokens_seen": 42483360, + "step": 34905 + }, + { + "epoch": 3.887960797416193, + "grad_norm": 0.6475797295570374, + "learning_rate": 4.8655171831333214e-05, + "loss": 0.0557, + "num_input_tokens_seen": 42489472, + "step": 34910 + }, + { + "epoch": 3.888517652299811, + "grad_norm": 0.11942160129547119, + "learning_rate": 4.865438554809818e-05, + "loss": 0.1135, + "num_input_tokens_seen": 42495872, + "step": 34915 + }, + { + "epoch": 3.889074507183428, + "grad_norm": 0.7247549295425415, + "learning_rate": 4.865359904142852e-05, + "loss": 0.0474, + "num_input_tokens_seen": 42502112, + "step": 34920 + }, + { + "epoch": 3.8896313620670453, + "grad_norm": 0.0002829251461662352, + "learning_rate": 4.865281231133167e-05, + "loss": 0.1762, + "num_input_tokens_seen": 42508256, + "step": 34925 + }, + { + "epoch": 3.8901882169506625, + "grad_norm": 0.1725989431142807, + "learning_rate": 4.865202535781506e-05, + "loss": 0.0716, + "num_input_tokens_seen": 42514336, + "step": 34930 + }, + { + "epoch": 3.8907450718342798, + "grad_norm": 0.03500265255570412, + "learning_rate": 4.865123818088612e-05, + "loss": 0.0753, + "num_input_tokens_seen": 42520288, + "step": 34935 + }, + { + "epoch": 3.8913019267178974, + "grad_norm": 0.7693907022476196, + "learning_rate": 4.865045078055228e-05, + "loss": 0.2039, + "num_input_tokens_seen": 42526688, + "step": 34940 + }, + { + "epoch": 3.8918587816015147, + "grad_norm": 1.3731900453567505, + "learning_rate": 4.864966315682099e-05, + "loss": 0.1568, + "num_input_tokens_seen": 42532800, + "step": 34945 + }, + { + "epoch": 3.892415636485132, + "grad_norm": 0.2582285702228546, + "learning_rate": 4.864887530969968e-05, + "loss": 0.1151, + "num_input_tokens_seen": 42538560, + "step": 34950 + }, + { + "epoch": 3.8929724913687496, + "grad_norm": 0.11604483425617218, + "learning_rate": 4.864808723919579e-05, + "loss": 0.0254, + "num_input_tokens_seen": 42544640, + "step": 34955 + }, + { + "epoch": 3.893529346252367, + "grad_norm": 0.07202012091875076, + "learning_rate": 4.864729894531678e-05, + "loss": 0.1739, + "num_input_tokens_seen": 42550560, + "step": 34960 + }, + { + "epoch": 3.894086201135984, + "grad_norm": 0.6440120339393616, + "learning_rate": 4.8646510428070085e-05, + "loss": 0.0355, + "num_input_tokens_seen": 42556992, + "step": 34965 + }, + { + "epoch": 3.8946430560196013, + "grad_norm": 0.5763182044029236, + "learning_rate": 4.864572168746315e-05, + "loss": 0.1919, + "num_input_tokens_seen": 42563008, + "step": 34970 + }, + { + "epoch": 3.8951999109032185, + "grad_norm": 0.5685349702835083, + "learning_rate": 4.864493272350343e-05, + "loss": 0.0537, + "num_input_tokens_seen": 42568864, + "step": 34975 + }, + { + "epoch": 3.895756765786836, + "grad_norm": 0.040523216128349304, + "learning_rate": 4.864414353619838e-05, + "loss": 0.0444, + "num_input_tokens_seen": 42574720, + "step": 34980 + }, + { + "epoch": 3.8963136206704534, + "grad_norm": 0.12768934667110443, + "learning_rate": 4.864335412555544e-05, + "loss": 0.0501, + "num_input_tokens_seen": 42580832, + "step": 34985 + }, + { + "epoch": 3.8968704755540706, + "grad_norm": 0.08525655418634415, + "learning_rate": 4.8642564491582085e-05, + "loss": 0.0169, + "num_input_tokens_seen": 42586688, + "step": 34990 + }, + { + "epoch": 3.897427330437688, + "grad_norm": 1.293354868888855, + "learning_rate": 4.864177463428578e-05, + "loss": 0.1288, + "num_input_tokens_seen": 42592352, + "step": 34995 + }, + { + "epoch": 3.897984185321305, + "grad_norm": 0.16068725287914276, + "learning_rate": 4.864098455367395e-05, + "loss": 0.0572, + "num_input_tokens_seen": 42598144, + "step": 35000 + }, + { + "epoch": 3.8985410402049228, + "grad_norm": 0.005727703217417002, + "learning_rate": 4.864019424975409e-05, + "loss": 0.0294, + "num_input_tokens_seen": 42604416, + "step": 35005 + }, + { + "epoch": 3.89909789508854, + "grad_norm": 0.10516045987606049, + "learning_rate": 4.863940372253365e-05, + "loss": 0.1212, + "num_input_tokens_seen": 42610176, + "step": 35010 + }, + { + "epoch": 3.899654749972157, + "grad_norm": 0.28274863958358765, + "learning_rate": 4.8638612972020104e-05, + "loss": 0.0772, + "num_input_tokens_seen": 42615552, + "step": 35015 + }, + { + "epoch": 3.9002116048557744, + "grad_norm": 0.23751212656497955, + "learning_rate": 4.863782199822092e-05, + "loss": 0.1114, + "num_input_tokens_seen": 42621920, + "step": 35020 + }, + { + "epoch": 3.9007684597393917, + "grad_norm": 0.5296887755393982, + "learning_rate": 4.863703080114357e-05, + "loss": 0.0949, + "num_input_tokens_seen": 42628160, + "step": 35025 + }, + { + "epoch": 3.9013253146230094, + "grad_norm": 2.6529407501220703, + "learning_rate": 4.8636239380795534e-05, + "loss": 0.0386, + "num_input_tokens_seen": 42634528, + "step": 35030 + }, + { + "epoch": 3.9018821695066266, + "grad_norm": 0.7092593908309937, + "learning_rate": 4.863544773718427e-05, + "loss": 0.0949, + "num_input_tokens_seen": 42639904, + "step": 35035 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 0.6615833640098572, + "learning_rate": 4.863465587031727e-05, + "loss": 0.1042, + "num_input_tokens_seen": 42646048, + "step": 35040 + }, + { + "epoch": 3.9029958792738615, + "grad_norm": 0.023526472970843315, + "learning_rate": 4.863386378020201e-05, + "loss": 0.0074, + "num_input_tokens_seen": 42652160, + "step": 35045 + }, + { + "epoch": 3.9035527341574787, + "grad_norm": 0.38123518228530884, + "learning_rate": 4.8633071466845964e-05, + "loss": 0.0068, + "num_input_tokens_seen": 42658432, + "step": 35050 + }, + { + "epoch": 3.904109589041096, + "grad_norm": 0.8688910007476807, + "learning_rate": 4.863227893025663e-05, + "loss": 0.0869, + "num_input_tokens_seen": 42664864, + "step": 35055 + }, + { + "epoch": 3.904666443924713, + "grad_norm": 1.0454795360565186, + "learning_rate": 4.8631486170441486e-05, + "loss": 0.0886, + "num_input_tokens_seen": 42670784, + "step": 35060 + }, + { + "epoch": 3.9052232988083304, + "grad_norm": 1.2445943355560303, + "learning_rate": 4.863069318740802e-05, + "loss": 0.146, + "num_input_tokens_seen": 42676896, + "step": 35065 + }, + { + "epoch": 3.905780153691948, + "grad_norm": 0.18281148374080658, + "learning_rate": 4.862989998116373e-05, + "loss": 0.054, + "num_input_tokens_seen": 42683232, + "step": 35070 + }, + { + "epoch": 3.9063370085755653, + "grad_norm": 0.03859272971749306, + "learning_rate": 4.86291065517161e-05, + "loss": 0.083, + "num_input_tokens_seen": 42689472, + "step": 35075 + }, + { + "epoch": 3.9068938634591825, + "grad_norm": 0.03215143829584122, + "learning_rate": 4.862831289907263e-05, + "loss": 0.0116, + "num_input_tokens_seen": 42695392, + "step": 35080 + }, + { + "epoch": 3.9074507183428, + "grad_norm": 0.7346407771110535, + "learning_rate": 4.862751902324081e-05, + "loss": 0.0761, + "num_input_tokens_seen": 42701504, + "step": 35085 + }, + { + "epoch": 3.908007573226417, + "grad_norm": 0.0456942580640316, + "learning_rate": 4.862672492422814e-05, + "loss": 0.0493, + "num_input_tokens_seen": 42707520, + "step": 35090 + }, + { + "epoch": 3.9085644281100347, + "grad_norm": 0.605744481086731, + "learning_rate": 4.8625930602042125e-05, + "loss": 0.0243, + "num_input_tokens_seen": 42713920, + "step": 35095 + }, + { + "epoch": 3.909121282993652, + "grad_norm": 0.16339871287345886, + "learning_rate": 4.862513605669027e-05, + "loss": 0.1383, + "num_input_tokens_seen": 42720032, + "step": 35100 + }, + { + "epoch": 3.909678137877269, + "grad_norm": 0.046594422310590744, + "learning_rate": 4.862434128818008e-05, + "loss": 0.0708, + "num_input_tokens_seen": 42726208, + "step": 35105 + }, + { + "epoch": 3.9102349927608864, + "grad_norm": 0.03601747751235962, + "learning_rate": 4.8623546296519054e-05, + "loss": 0.0549, + "num_input_tokens_seen": 42732736, + "step": 35110 + }, + { + "epoch": 3.9107918476445036, + "grad_norm": 2.2495360374450684, + "learning_rate": 4.86227510817147e-05, + "loss": 0.2087, + "num_input_tokens_seen": 42738400, + "step": 35115 + }, + { + "epoch": 3.9113487025281213, + "grad_norm": 0.8118671178817749, + "learning_rate": 4.862195564377455e-05, + "loss": 0.0811, + "num_input_tokens_seen": 42744672, + "step": 35120 + }, + { + "epoch": 3.9119055574117385, + "grad_norm": 0.21583501994609833, + "learning_rate": 4.862115998270609e-05, + "loss": 0.0473, + "num_input_tokens_seen": 42749952, + "step": 35125 + }, + { + "epoch": 3.9124624122953557, + "grad_norm": 0.03678646311163902, + "learning_rate": 4.862036409851686e-05, + "loss": 0.0876, + "num_input_tokens_seen": 42755904, + "step": 35130 + }, + { + "epoch": 3.9130192671789734, + "grad_norm": 0.0856432169675827, + "learning_rate": 4.861956799121436e-05, + "loss": 0.0938, + "num_input_tokens_seen": 42762016, + "step": 35135 + }, + { + "epoch": 3.9135761220625906, + "grad_norm": 0.5609424114227295, + "learning_rate": 4.8618771660806117e-05, + "loss": 0.0159, + "num_input_tokens_seen": 42768256, + "step": 35140 + }, + { + "epoch": 3.914132976946208, + "grad_norm": 0.13169655203819275, + "learning_rate": 4.861797510729965e-05, + "loss": 0.0432, + "num_input_tokens_seen": 42774304, + "step": 35145 + }, + { + "epoch": 3.914689831829825, + "grad_norm": 0.9012969732284546, + "learning_rate": 4.861717833070249e-05, + "loss": 0.1176, + "num_input_tokens_seen": 42780288, + "step": 35150 + }, + { + "epoch": 3.9152466867134423, + "grad_norm": 0.15478196740150452, + "learning_rate": 4.861638133102216e-05, + "loss": 0.1395, + "num_input_tokens_seen": 42786304, + "step": 35155 + }, + { + "epoch": 3.91580354159706, + "grad_norm": 1.269740104675293, + "learning_rate": 4.8615584108266185e-05, + "loss": 0.072, + "num_input_tokens_seen": 42792512, + "step": 35160 + }, + { + "epoch": 3.9163603964806772, + "grad_norm": 0.318126916885376, + "learning_rate": 4.86147866624421e-05, + "loss": 0.0504, + "num_input_tokens_seen": 42798400, + "step": 35165 + }, + { + "epoch": 3.9169172513642945, + "grad_norm": 0.03916431963443756, + "learning_rate": 4.8613988993557436e-05, + "loss": 0.0296, + "num_input_tokens_seen": 42804896, + "step": 35170 + }, + { + "epoch": 3.9174741062479117, + "grad_norm": 1.0855509042739868, + "learning_rate": 4.861319110161973e-05, + "loss": 0.1056, + "num_input_tokens_seen": 42810816, + "step": 35175 + }, + { + "epoch": 3.918030961131529, + "grad_norm": 0.0028159271460026503, + "learning_rate": 4.86123929866365e-05, + "loss": 0.0217, + "num_input_tokens_seen": 42816768, + "step": 35180 + }, + { + "epoch": 3.9185878160151466, + "grad_norm": 0.15090490877628326, + "learning_rate": 4.8611594648615314e-05, + "loss": 0.1203, + "num_input_tokens_seen": 42822976, + "step": 35185 + }, + { + "epoch": 3.919144670898764, + "grad_norm": 0.26591047644615173, + "learning_rate": 4.861079608756369e-05, + "loss": 0.0557, + "num_input_tokens_seen": 42828416, + "step": 35190 + }, + { + "epoch": 3.919701525782381, + "grad_norm": 0.012393003329634666, + "learning_rate": 4.86099973034892e-05, + "loss": 0.0628, + "num_input_tokens_seen": 42834016, + "step": 35195 + }, + { + "epoch": 3.9202583806659983, + "grad_norm": 1.095436692237854, + "learning_rate": 4.860919829639935e-05, + "loss": 0.0832, + "num_input_tokens_seen": 42840192, + "step": 35200 + }, + { + "epoch": 3.9208152355496155, + "grad_norm": 0.144967719912529, + "learning_rate": 4.860839906630171e-05, + "loss": 0.0619, + "num_input_tokens_seen": 42845504, + "step": 35205 + }, + { + "epoch": 3.921372090433233, + "grad_norm": 0.6944732666015625, + "learning_rate": 4.8607599613203826e-05, + "loss": 0.0217, + "num_input_tokens_seen": 42851584, + "step": 35210 + }, + { + "epoch": 3.9219289453168504, + "grad_norm": 0.12626785039901733, + "learning_rate": 4.8606799937113255e-05, + "loss": 0.0268, + "num_input_tokens_seen": 42857920, + "step": 35215 + }, + { + "epoch": 3.9224858002004677, + "grad_norm": 0.2372485250234604, + "learning_rate": 4.860600003803754e-05, + "loss": 0.1202, + "num_input_tokens_seen": 42863520, + "step": 35220 + }, + { + "epoch": 3.9230426550840853, + "grad_norm": 0.19564376771450043, + "learning_rate": 4.8605199915984245e-05, + "loss": 0.1633, + "num_input_tokens_seen": 42869728, + "step": 35225 + }, + { + "epoch": 3.9235995099677026, + "grad_norm": 0.028715528547763824, + "learning_rate": 4.8604399570960924e-05, + "loss": 0.0675, + "num_input_tokens_seen": 42875712, + "step": 35230 + }, + { + "epoch": 3.92415636485132, + "grad_norm": 0.5215656757354736, + "learning_rate": 4.860359900297513e-05, + "loss": 0.1993, + "num_input_tokens_seen": 42881696, + "step": 35235 + }, + { + "epoch": 3.924713219734937, + "grad_norm": 0.6942839026451111, + "learning_rate": 4.860279821203445e-05, + "loss": 0.1122, + "num_input_tokens_seen": 42887872, + "step": 35240 + }, + { + "epoch": 3.9252700746185543, + "grad_norm": 0.7394533753395081, + "learning_rate": 4.860199719814641e-05, + "loss": 0.0802, + "num_input_tokens_seen": 42894176, + "step": 35245 + }, + { + "epoch": 3.925826929502172, + "grad_norm": 0.28148186206817627, + "learning_rate": 4.8601195961318615e-05, + "loss": 0.0224, + "num_input_tokens_seen": 42900096, + "step": 35250 + }, + { + "epoch": 3.926383784385789, + "grad_norm": 0.6650058627128601, + "learning_rate": 4.86003945015586e-05, + "loss": 0.0511, + "num_input_tokens_seen": 42906656, + "step": 35255 + }, + { + "epoch": 3.9269406392694064, + "grad_norm": 0.13513170182704926, + "learning_rate": 4.859959281887396e-05, + "loss": 0.0912, + "num_input_tokens_seen": 42912576, + "step": 35260 + }, + { + "epoch": 3.9274974941530236, + "grad_norm": 0.6116363406181335, + "learning_rate": 4.859879091327225e-05, + "loss": 0.08, + "num_input_tokens_seen": 42918976, + "step": 35265 + }, + { + "epoch": 3.928054349036641, + "grad_norm": 0.8788653612136841, + "learning_rate": 4.859798878476106e-05, + "loss": 0.0765, + "num_input_tokens_seen": 42925056, + "step": 35270 + }, + { + "epoch": 3.9286112039202585, + "grad_norm": 0.12665170431137085, + "learning_rate": 4.859718643334796e-05, + "loss": 0.0453, + "num_input_tokens_seen": 42931200, + "step": 35275 + }, + { + "epoch": 3.9291680588038758, + "grad_norm": 0.20330119132995605, + "learning_rate": 4.859638385904053e-05, + "loss": 0.022, + "num_input_tokens_seen": 42937280, + "step": 35280 + }, + { + "epoch": 3.929724913687493, + "grad_norm": 0.1330936998128891, + "learning_rate": 4.859558106184634e-05, + "loss": 0.0443, + "num_input_tokens_seen": 42943328, + "step": 35285 + }, + { + "epoch": 3.9302817685711102, + "grad_norm": 1.1579128503799438, + "learning_rate": 4.859477804177299e-05, + "loss": 0.0414, + "num_input_tokens_seen": 42949824, + "step": 35290 + }, + { + "epoch": 3.9308386234547275, + "grad_norm": 0.762878954410553, + "learning_rate": 4.8593974798828056e-05, + "loss": 0.1067, + "num_input_tokens_seen": 42955808, + "step": 35295 + }, + { + "epoch": 3.931395478338345, + "grad_norm": 1.6167460680007935, + "learning_rate": 4.859317133301913e-05, + "loss": 0.1249, + "num_input_tokens_seen": 42961728, + "step": 35300 + }, + { + "epoch": 3.9319523332219624, + "grad_norm": 0.0024356122594326735, + "learning_rate": 4.8592367644353795e-05, + "loss": 0.0298, + "num_input_tokens_seen": 42967648, + "step": 35305 + }, + { + "epoch": 3.9325091881055796, + "grad_norm": 0.33823534846305847, + "learning_rate": 4.859156373283964e-05, + "loss": 0.0076, + "num_input_tokens_seen": 42974016, + "step": 35310 + }, + { + "epoch": 3.9330660429891973, + "grad_norm": 1.24033522605896, + "learning_rate": 4.859075959848427e-05, + "loss": 0.0459, + "num_input_tokens_seen": 42980352, + "step": 35315 + }, + { + "epoch": 3.9336228978728145, + "grad_norm": 0.15691761672496796, + "learning_rate": 4.8589955241295276e-05, + "loss": 0.0864, + "num_input_tokens_seen": 42986368, + "step": 35320 + }, + { + "epoch": 3.9341797527564317, + "grad_norm": 0.003027265192940831, + "learning_rate": 4.858915066128026e-05, + "loss": 0.0397, + "num_input_tokens_seen": 42992672, + "step": 35325 + }, + { + "epoch": 3.934736607640049, + "grad_norm": 0.8163630366325378, + "learning_rate": 4.8588345858446804e-05, + "loss": 0.076, + "num_input_tokens_seen": 42998688, + "step": 35330 + }, + { + "epoch": 3.935293462523666, + "grad_norm": 1.0531514883041382, + "learning_rate": 4.858754083280253e-05, + "loss": 0.0484, + "num_input_tokens_seen": 43005376, + "step": 35335 + }, + { + "epoch": 3.935850317407284, + "grad_norm": 0.2908608019351959, + "learning_rate": 4.8586735584355036e-05, + "loss": 0.0833, + "num_input_tokens_seen": 43011552, + "step": 35340 + }, + { + "epoch": 3.936407172290901, + "grad_norm": 1.7935841083526611, + "learning_rate": 4.858593011311192e-05, + "loss": 0.0566, + "num_input_tokens_seen": 43017920, + "step": 35345 + }, + { + "epoch": 3.9369640271745183, + "grad_norm": 1.5292103290557861, + "learning_rate": 4.858512441908081e-05, + "loss": 0.0879, + "num_input_tokens_seen": 43024192, + "step": 35350 + }, + { + "epoch": 3.9375208820581356, + "grad_norm": 0.002246875548735261, + "learning_rate": 4.8584318502269285e-05, + "loss": 0.0457, + "num_input_tokens_seen": 43030624, + "step": 35355 + }, + { + "epoch": 3.938077736941753, + "grad_norm": 0.004021054599434137, + "learning_rate": 4.858351236268499e-05, + "loss": 0.1272, + "num_input_tokens_seen": 43036608, + "step": 35360 + }, + { + "epoch": 3.9386345918253705, + "grad_norm": 0.01872500777244568, + "learning_rate": 4.858270600033553e-05, + "loss": 0.0058, + "num_input_tokens_seen": 43042944, + "step": 35365 + }, + { + "epoch": 3.9391914467089877, + "grad_norm": 0.9259032607078552, + "learning_rate": 4.858189941522851e-05, + "loss": 0.1699, + "num_input_tokens_seen": 43048928, + "step": 35370 + }, + { + "epoch": 3.939748301592605, + "grad_norm": 0.03791196271777153, + "learning_rate": 4.8581092607371554e-05, + "loss": 0.1125, + "num_input_tokens_seen": 43054656, + "step": 35375 + }, + { + "epoch": 3.940305156476222, + "grad_norm": 0.6595130562782288, + "learning_rate": 4.858028557677229e-05, + "loss": 0.0763, + "num_input_tokens_seen": 43060832, + "step": 35380 + }, + { + "epoch": 3.9408620113598394, + "grad_norm": 0.0011818999191746116, + "learning_rate": 4.857947832343833e-05, + "loss": 0.0292, + "num_input_tokens_seen": 43067040, + "step": 35385 + }, + { + "epoch": 3.941418866243457, + "grad_norm": 1.4816560745239258, + "learning_rate": 4.857867084737732e-05, + "loss": 0.0907, + "num_input_tokens_seen": 43073024, + "step": 35390 + }, + { + "epoch": 3.9419757211270743, + "grad_norm": 2.2953789234161377, + "learning_rate": 4.857786314859686e-05, + "loss": 0.2515, + "num_input_tokens_seen": 43079392, + "step": 35395 + }, + { + "epoch": 3.9425325760106915, + "grad_norm": 0.0866457000374794, + "learning_rate": 4.857705522710459e-05, + "loss": 0.0089, + "num_input_tokens_seen": 43085760, + "step": 35400 + }, + { + "epoch": 3.943089430894309, + "grad_norm": 0.7660647034645081, + "learning_rate": 4.8576247082908154e-05, + "loss": 0.0797, + "num_input_tokens_seen": 43092128, + "step": 35405 + }, + { + "epoch": 3.9436462857779264, + "grad_norm": 0.016627488657832146, + "learning_rate": 4.857543871601518e-05, + "loss": 0.0699, + "num_input_tokens_seen": 43098624, + "step": 35410 + }, + { + "epoch": 3.9442031406615436, + "grad_norm": 0.9110377430915833, + "learning_rate": 4.8574630126433284e-05, + "loss": 0.1141, + "num_input_tokens_seen": 43104928, + "step": 35415 + }, + { + "epoch": 3.944759995545161, + "grad_norm": 0.43471404910087585, + "learning_rate": 4.857382131417012e-05, + "loss": 0.0419, + "num_input_tokens_seen": 43111136, + "step": 35420 + }, + { + "epoch": 3.945316850428778, + "grad_norm": 0.0950261652469635, + "learning_rate": 4.857301227923333e-05, + "loss": 0.0339, + "num_input_tokens_seen": 43117376, + "step": 35425 + }, + { + "epoch": 3.945873705312396, + "grad_norm": 0.06619282811880112, + "learning_rate": 4.8572203021630555e-05, + "loss": 0.028, + "num_input_tokens_seen": 43123776, + "step": 35430 + }, + { + "epoch": 3.946430560196013, + "grad_norm": 0.04731215536594391, + "learning_rate": 4.857139354136944e-05, + "loss": 0.1031, + "num_input_tokens_seen": 43129504, + "step": 35435 + }, + { + "epoch": 3.9469874150796302, + "grad_norm": 0.279401034116745, + "learning_rate": 4.8570583838457625e-05, + "loss": 0.0503, + "num_input_tokens_seen": 43135552, + "step": 35440 + }, + { + "epoch": 3.9475442699632475, + "grad_norm": 0.11551543325185776, + "learning_rate": 4.856977391290276e-05, + "loss": 0.0624, + "num_input_tokens_seen": 43141728, + "step": 35445 + }, + { + "epoch": 3.9481011248468647, + "grad_norm": 0.03931552171707153, + "learning_rate": 4.856896376471249e-05, + "loss": 0.0965, + "num_input_tokens_seen": 43147968, + "step": 35450 + }, + { + "epoch": 3.9486579797304824, + "grad_norm": 1.3437222242355347, + "learning_rate": 4.856815339389449e-05, + "loss": 0.1587, + "num_input_tokens_seen": 43154048, + "step": 35455 + }, + { + "epoch": 3.9492148346140996, + "grad_norm": 0.05650894343852997, + "learning_rate": 4.856734280045639e-05, + "loss": 0.0501, + "num_input_tokens_seen": 43160384, + "step": 35460 + }, + { + "epoch": 3.949771689497717, + "grad_norm": 0.9603904485702515, + "learning_rate": 4.856653198440585e-05, + "loss": 0.1729, + "num_input_tokens_seen": 43166464, + "step": 35465 + }, + { + "epoch": 3.950328544381334, + "grad_norm": 0.8163918852806091, + "learning_rate": 4.856572094575054e-05, + "loss": 0.1403, + "num_input_tokens_seen": 43172896, + "step": 35470 + }, + { + "epoch": 3.9508853992649513, + "grad_norm": 0.11503858864307404, + "learning_rate": 4.856490968449812e-05, + "loss": 0.0369, + "num_input_tokens_seen": 43178528, + "step": 35475 + }, + { + "epoch": 3.951442254148569, + "grad_norm": 3.324594259262085, + "learning_rate": 4.8564098200656236e-05, + "loss": 0.206, + "num_input_tokens_seen": 43184736, + "step": 35480 + }, + { + "epoch": 3.951999109032186, + "grad_norm": 0.25209277868270874, + "learning_rate": 4.8563286494232576e-05, + "loss": 0.064, + "num_input_tokens_seen": 43190272, + "step": 35485 + }, + { + "epoch": 3.9525559639158034, + "grad_norm": 1.5278407335281372, + "learning_rate": 4.856247456523479e-05, + "loss": 0.0868, + "num_input_tokens_seen": 43196448, + "step": 35490 + }, + { + "epoch": 3.953112818799421, + "grad_norm": 1.4791654348373413, + "learning_rate": 4.856166241367056e-05, + "loss": 0.2522, + "num_input_tokens_seen": 43202496, + "step": 35495 + }, + { + "epoch": 3.9536696736830383, + "grad_norm": 0.41962242126464844, + "learning_rate": 4.856085003954754e-05, + "loss": 0.0312, + "num_input_tokens_seen": 43208544, + "step": 35500 + }, + { + "epoch": 3.9542265285666556, + "grad_norm": 0.6020514965057373, + "learning_rate": 4.856003744287343e-05, + "loss": 0.0531, + "num_input_tokens_seen": 43213568, + "step": 35505 + }, + { + "epoch": 3.954783383450273, + "grad_norm": 0.22253766655921936, + "learning_rate": 4.855922462365587e-05, + "loss": 0.0306, + "num_input_tokens_seen": 43219840, + "step": 35510 + }, + { + "epoch": 3.95534023833389, + "grad_norm": 1.2859169244766235, + "learning_rate": 4.855841158190258e-05, + "loss": 0.1603, + "num_input_tokens_seen": 43225408, + "step": 35515 + }, + { + "epoch": 3.9558970932175077, + "grad_norm": 0.3137779235839844, + "learning_rate": 4.855759831762121e-05, + "loss": 0.0775, + "num_input_tokens_seen": 43231648, + "step": 35520 + }, + { + "epoch": 3.956453948101125, + "grad_norm": 0.5480266213417053, + "learning_rate": 4.855678483081945e-05, + "loss": 0.0687, + "num_input_tokens_seen": 43237504, + "step": 35525 + }, + { + "epoch": 3.957010802984742, + "grad_norm": 0.08819963783025742, + "learning_rate": 4.855597112150498e-05, + "loss": 0.1386, + "num_input_tokens_seen": 43243264, + "step": 35530 + }, + { + "epoch": 3.9575676578683594, + "grad_norm": 0.3599812686443329, + "learning_rate": 4.855515718968549e-05, + "loss": 0.0178, + "num_input_tokens_seen": 43249376, + "step": 35535 + }, + { + "epoch": 3.9581245127519766, + "grad_norm": 0.1313801109790802, + "learning_rate": 4.855434303536867e-05, + "loss": 0.0105, + "num_input_tokens_seen": 43255328, + "step": 35540 + }, + { + "epoch": 3.9586813676355943, + "grad_norm": 0.008477779105305672, + "learning_rate": 4.8553528658562206e-05, + "loss": 0.0073, + "num_input_tokens_seen": 43261568, + "step": 35545 + }, + { + "epoch": 3.9592382225192115, + "grad_norm": 1.7101621627807617, + "learning_rate": 4.85527140592738e-05, + "loss": 0.1026, + "num_input_tokens_seen": 43267232, + "step": 35550 + }, + { + "epoch": 3.9597950774028288, + "grad_norm": 0.022285547107458115, + "learning_rate": 4.855189923751113e-05, + "loss": 0.0331, + "num_input_tokens_seen": 43273536, + "step": 35555 + }, + { + "epoch": 3.960351932286446, + "grad_norm": 0.009175005368888378, + "learning_rate": 4.8551084193281914e-05, + "loss": 0.0573, + "num_input_tokens_seen": 43279808, + "step": 35560 + }, + { + "epoch": 3.9609087871700632, + "grad_norm": 0.06289181113243103, + "learning_rate": 4.855026892659383e-05, + "loss": 0.0141, + "num_input_tokens_seen": 43285920, + "step": 35565 + }, + { + "epoch": 3.961465642053681, + "grad_norm": 0.00525851733982563, + "learning_rate": 4.8549453437454595e-05, + "loss": 0.1187, + "num_input_tokens_seen": 43292096, + "step": 35570 + }, + { + "epoch": 3.962022496937298, + "grad_norm": 0.9502178430557251, + "learning_rate": 4.85486377258719e-05, + "loss": 0.1105, + "num_input_tokens_seen": 43297888, + "step": 35575 + }, + { + "epoch": 3.9625793518209154, + "grad_norm": 0.011999806389212608, + "learning_rate": 4.8547821791853454e-05, + "loss": 0.0325, + "num_input_tokens_seen": 43304128, + "step": 35580 + }, + { + "epoch": 3.963136206704533, + "grad_norm": 0.49511754512786865, + "learning_rate": 4.8547005635406964e-05, + "loss": 0.0304, + "num_input_tokens_seen": 43309984, + "step": 35585 + }, + { + "epoch": 3.9636930615881503, + "grad_norm": 0.0625857263803482, + "learning_rate": 4.854618925654014e-05, + "loss": 0.0579, + "num_input_tokens_seen": 43316352, + "step": 35590 + }, + { + "epoch": 3.9642499164717675, + "grad_norm": 1.1867560148239136, + "learning_rate": 4.85453726552607e-05, + "loss": 0.0815, + "num_input_tokens_seen": 43321984, + "step": 35595 + }, + { + "epoch": 3.9648067713553847, + "grad_norm": 1.0616097450256348, + "learning_rate": 4.8544555831576344e-05, + "loss": 0.0574, + "num_input_tokens_seen": 43328384, + "step": 35600 + }, + { + "epoch": 3.965363626239002, + "grad_norm": 0.13333146274089813, + "learning_rate": 4.85437387854948e-05, + "loss": 0.0147, + "num_input_tokens_seen": 43334880, + "step": 35605 + }, + { + "epoch": 3.9659204811226196, + "grad_norm": 0.2943527400493622, + "learning_rate": 4.854292151702378e-05, + "loss": 0.0339, + "num_input_tokens_seen": 43340736, + "step": 35610 + }, + { + "epoch": 3.966477336006237, + "grad_norm": 0.010081271640956402, + "learning_rate": 4.8542104026170995e-05, + "loss": 0.1214, + "num_input_tokens_seen": 43346880, + "step": 35615 + }, + { + "epoch": 3.967034190889854, + "grad_norm": 0.2418440282344818, + "learning_rate": 4.854128631294419e-05, + "loss": 0.1385, + "num_input_tokens_seen": 43353152, + "step": 35620 + }, + { + "epoch": 3.9675910457734713, + "grad_norm": 0.07134594023227692, + "learning_rate": 4.854046837735107e-05, + "loss": 0.0597, + "num_input_tokens_seen": 43359008, + "step": 35625 + }, + { + "epoch": 3.9681479006570886, + "grad_norm": 0.04679439589381218, + "learning_rate": 4.853965021939936e-05, + "loss": 0.1128, + "num_input_tokens_seen": 43365280, + "step": 35630 + }, + { + "epoch": 3.9687047555407062, + "grad_norm": 0.07466736435890198, + "learning_rate": 4.85388318390968e-05, + "loss": 0.103, + "num_input_tokens_seen": 43371360, + "step": 35635 + }, + { + "epoch": 3.9692616104243235, + "grad_norm": 0.8213221430778503, + "learning_rate": 4.853801323645111e-05, + "loss": 0.0958, + "num_input_tokens_seen": 43377344, + "step": 35640 + }, + { + "epoch": 3.9698184653079407, + "grad_norm": 0.182328000664711, + "learning_rate": 4.853719441147003e-05, + "loss": 0.0045, + "num_input_tokens_seen": 43383712, + "step": 35645 + }, + { + "epoch": 3.970375320191558, + "grad_norm": 1.5123724937438965, + "learning_rate": 4.8536375364161294e-05, + "loss": 0.1088, + "num_input_tokens_seen": 43389664, + "step": 35650 + }, + { + "epoch": 3.970932175075175, + "grad_norm": 1.645615577697754, + "learning_rate": 4.853555609453263e-05, + "loss": 0.0997, + "num_input_tokens_seen": 43395808, + "step": 35655 + }, + { + "epoch": 3.971489029958793, + "grad_norm": 0.2035927027463913, + "learning_rate": 4.853473660259178e-05, + "loss": 0.1708, + "num_input_tokens_seen": 43401696, + "step": 35660 + }, + { + "epoch": 3.97204588484241, + "grad_norm": 0.7247897386550903, + "learning_rate": 4.853391688834649e-05, + "loss": 0.0868, + "num_input_tokens_seen": 43407712, + "step": 35665 + }, + { + "epoch": 3.9726027397260273, + "grad_norm": 0.05672822892665863, + "learning_rate": 4.85330969518045e-05, + "loss": 0.0247, + "num_input_tokens_seen": 43413984, + "step": 35670 + }, + { + "epoch": 3.973159594609645, + "grad_norm": 0.37783804535865784, + "learning_rate": 4.8532276792973553e-05, + "loss": 0.1261, + "num_input_tokens_seen": 43420128, + "step": 35675 + }, + { + "epoch": 3.973716449493262, + "grad_norm": 0.33261802792549133, + "learning_rate": 4.8531456411861396e-05, + "loss": 0.0789, + "num_input_tokens_seen": 43426688, + "step": 35680 + }, + { + "epoch": 3.9742733043768794, + "grad_norm": 0.004522602539509535, + "learning_rate": 4.8530635808475785e-05, + "loss": 0.0555, + "num_input_tokens_seen": 43432928, + "step": 35685 + }, + { + "epoch": 3.9748301592604967, + "grad_norm": 0.6983644366264343, + "learning_rate": 4.8529814982824474e-05, + "loss": 0.0653, + "num_input_tokens_seen": 43438624, + "step": 35690 + }, + { + "epoch": 3.975387014144114, + "grad_norm": 0.5433529019355774, + "learning_rate": 4.85289939349152e-05, + "loss": 0.0427, + "num_input_tokens_seen": 43444544, + "step": 35695 + }, + { + "epoch": 3.9759438690277316, + "grad_norm": 0.07070261985063553, + "learning_rate": 4.8528172664755723e-05, + "loss": 0.0605, + "num_input_tokens_seen": 43450464, + "step": 35700 + }, + { + "epoch": 3.976500723911349, + "grad_norm": 0.5988150238990784, + "learning_rate": 4.852735117235381e-05, + "loss": 0.0643, + "num_input_tokens_seen": 43456672, + "step": 35705 + }, + { + "epoch": 3.977057578794966, + "grad_norm": 0.5630416870117188, + "learning_rate": 4.852652945771722e-05, + "loss": 0.026, + "num_input_tokens_seen": 43462624, + "step": 35710 + }, + { + "epoch": 3.9776144336785833, + "grad_norm": 0.007654526270925999, + "learning_rate": 4.85257075208537e-05, + "loss": 0.0403, + "num_input_tokens_seen": 43469056, + "step": 35715 + }, + { + "epoch": 3.9781712885622005, + "grad_norm": 0.5646136403083801, + "learning_rate": 4.852488536177103e-05, + "loss": 0.0374, + "num_input_tokens_seen": 43475232, + "step": 35720 + }, + { + "epoch": 3.978728143445818, + "grad_norm": 1.0180306434631348, + "learning_rate": 4.852406298047697e-05, + "loss": 0.1832, + "num_input_tokens_seen": 43481248, + "step": 35725 + }, + { + "epoch": 3.9792849983294354, + "grad_norm": 1.2071638107299805, + "learning_rate": 4.852324037697928e-05, + "loss": 0.1559, + "num_input_tokens_seen": 43487072, + "step": 35730 + }, + { + "epoch": 3.9798418532130526, + "grad_norm": 0.5513473749160767, + "learning_rate": 4.852241755128575e-05, + "loss": 0.131, + "num_input_tokens_seen": 43493024, + "step": 35735 + }, + { + "epoch": 3.98039870809667, + "grad_norm": 0.29665425419807434, + "learning_rate": 4.852159450340413e-05, + "loss": 0.0587, + "num_input_tokens_seen": 43499168, + "step": 35740 + }, + { + "epoch": 3.980955562980287, + "grad_norm": 1.2672199010849, + "learning_rate": 4.8520771233342214e-05, + "loss": 0.0638, + "num_input_tokens_seen": 43505440, + "step": 35745 + }, + { + "epoch": 3.9815124178639048, + "grad_norm": 0.1940140277147293, + "learning_rate": 4.851994774110777e-05, + "loss": 0.0574, + "num_input_tokens_seen": 43511168, + "step": 35750 + }, + { + "epoch": 3.982069272747522, + "grad_norm": 1.084282398223877, + "learning_rate": 4.851912402670857e-05, + "loss": 0.2028, + "num_input_tokens_seen": 43517408, + "step": 35755 + }, + { + "epoch": 3.982626127631139, + "grad_norm": 2.1275634765625, + "learning_rate": 4.85183000901524e-05, + "loss": 0.0889, + "num_input_tokens_seen": 43523552, + "step": 35760 + }, + { + "epoch": 3.983182982514757, + "grad_norm": 0.24494260549545288, + "learning_rate": 4.851747593144704e-05, + "loss": 0.1093, + "num_input_tokens_seen": 43529760, + "step": 35765 + }, + { + "epoch": 3.983739837398374, + "grad_norm": 0.0990905836224556, + "learning_rate": 4.851665155060029e-05, + "loss": 0.0381, + "num_input_tokens_seen": 43536096, + "step": 35770 + }, + { + "epoch": 3.9842966922819913, + "grad_norm": 0.22963379323482513, + "learning_rate": 4.851582694761991e-05, + "loss": 0.0253, + "num_input_tokens_seen": 43542400, + "step": 35775 + }, + { + "epoch": 3.9848535471656086, + "grad_norm": 0.659602165222168, + "learning_rate": 4.851500212251371e-05, + "loss": 0.0258, + "num_input_tokens_seen": 43548640, + "step": 35780 + }, + { + "epoch": 3.985410402049226, + "grad_norm": 0.579847514629364, + "learning_rate": 4.851417707528948e-05, + "loss": 0.1145, + "num_input_tokens_seen": 43554592, + "step": 35785 + }, + { + "epoch": 3.9859672569328435, + "grad_norm": 0.0254280474036932, + "learning_rate": 4.851335180595501e-05, + "loss": 0.0359, + "num_input_tokens_seen": 43560896, + "step": 35790 + }, + { + "epoch": 3.9865241118164607, + "grad_norm": 2.5660243034362793, + "learning_rate": 4.851252631451808e-05, + "loss": 0.1015, + "num_input_tokens_seen": 43567232, + "step": 35795 + }, + { + "epoch": 3.987080966700078, + "grad_norm": 1.752064824104309, + "learning_rate": 4.851170060098651e-05, + "loss": 0.0907, + "num_input_tokens_seen": 43573152, + "step": 35800 + }, + { + "epoch": 3.987637821583695, + "grad_norm": 0.5007255673408508, + "learning_rate": 4.851087466536809e-05, + "loss": 0.1134, + "num_input_tokens_seen": 43579392, + "step": 35805 + }, + { + "epoch": 3.9881946764673124, + "grad_norm": 0.7119680047035217, + "learning_rate": 4.8510048507670626e-05, + "loss": 0.099, + "num_input_tokens_seen": 43585600, + "step": 35810 + }, + { + "epoch": 3.98875153135093, + "grad_norm": 0.40487128496170044, + "learning_rate": 4.850922212790191e-05, + "loss": 0.0621, + "num_input_tokens_seen": 43591808, + "step": 35815 + }, + { + "epoch": 3.9893083862345473, + "grad_norm": 0.014519504271447659, + "learning_rate": 4.850839552606976e-05, + "loss": 0.0818, + "num_input_tokens_seen": 43597824, + "step": 35820 + }, + { + "epoch": 3.9898652411181645, + "grad_norm": 0.008303942158818245, + "learning_rate": 4.850756870218198e-05, + "loss": 0.0915, + "num_input_tokens_seen": 43603968, + "step": 35825 + }, + { + "epoch": 3.9904220960017818, + "grad_norm": 0.05580233782529831, + "learning_rate": 4.850674165624638e-05, + "loss": 0.0842, + "num_input_tokens_seen": 43610080, + "step": 35830 + }, + { + "epoch": 3.990978950885399, + "grad_norm": 0.16581450402736664, + "learning_rate": 4.8505914388270766e-05, + "loss": 0.0367, + "num_input_tokens_seen": 43616416, + "step": 35835 + }, + { + "epoch": 3.9915358057690167, + "grad_norm": 0.0035303202457726, + "learning_rate": 4.850508689826296e-05, + "loss": 0.0243, + "num_input_tokens_seen": 43622560, + "step": 35840 + }, + { + "epoch": 3.992092660652634, + "grad_norm": 0.1472548395395279, + "learning_rate": 4.850425918623078e-05, + "loss": 0.0339, + "num_input_tokens_seen": 43628576, + "step": 35845 + }, + { + "epoch": 3.992649515536251, + "grad_norm": 0.6987333297729492, + "learning_rate": 4.850343125218204e-05, + "loss": 0.1041, + "num_input_tokens_seen": 43634528, + "step": 35850 + }, + { + "epoch": 3.993206370419869, + "grad_norm": 0.4041367471218109, + "learning_rate": 4.8502603096124565e-05, + "loss": 0.0509, + "num_input_tokens_seen": 43640640, + "step": 35855 + }, + { + "epoch": 3.993763225303486, + "grad_norm": 0.03290898725390434, + "learning_rate": 4.850177471806617e-05, + "loss": 0.059, + "num_input_tokens_seen": 43647008, + "step": 35860 + }, + { + "epoch": 3.9943200801871033, + "grad_norm": 0.061566002666950226, + "learning_rate": 4.850094611801468e-05, + "loss": 0.0593, + "num_input_tokens_seen": 43653248, + "step": 35865 + }, + { + "epoch": 3.9948769350707205, + "grad_norm": 1.3851017951965332, + "learning_rate": 4.850011729597793e-05, + "loss": 0.048, + "num_input_tokens_seen": 43659424, + "step": 35870 + }, + { + "epoch": 3.9954337899543377, + "grad_norm": 0.029392555356025696, + "learning_rate": 4.8499288251963745e-05, + "loss": 0.0543, + "num_input_tokens_seen": 43665792, + "step": 35875 + }, + { + "epoch": 3.9959906448379554, + "grad_norm": 0.9251619577407837, + "learning_rate": 4.8498458985979956e-05, + "loss": 0.1153, + "num_input_tokens_seen": 43671840, + "step": 35880 + }, + { + "epoch": 3.9965474997215726, + "grad_norm": 0.07635101675987244, + "learning_rate": 4.849762949803439e-05, + "loss": 0.0586, + "num_input_tokens_seen": 43678112, + "step": 35885 + }, + { + "epoch": 3.99710435460519, + "grad_norm": 0.04244047403335571, + "learning_rate": 4.849679978813488e-05, + "loss": 0.0467, + "num_input_tokens_seen": 43684384, + "step": 35890 + }, + { + "epoch": 3.997661209488807, + "grad_norm": 0.8429402112960815, + "learning_rate": 4.849596985628929e-05, + "loss": 0.0677, + "num_input_tokens_seen": 43690848, + "step": 35895 + }, + { + "epoch": 3.9982180643724243, + "grad_norm": 0.7031766176223755, + "learning_rate": 4.8495139702505426e-05, + "loss": 0.0685, + "num_input_tokens_seen": 43697152, + "step": 35900 + }, + { + "epoch": 3.998774919256042, + "grad_norm": 0.27042368054389954, + "learning_rate": 4.849430932679115e-05, + "loss": 0.0579, + "num_input_tokens_seen": 43703264, + "step": 35905 + }, + { + "epoch": 3.9993317741396592, + "grad_norm": 0.0034151580184698105, + "learning_rate": 4.84934787291543e-05, + "loss": 0.0416, + "num_input_tokens_seen": 43708672, + "step": 35910 + }, + { + "epoch": 3.9998886290232765, + "grad_norm": 0.6675002574920654, + "learning_rate": 4.8492647909602714e-05, + "loss": 0.0637, + "num_input_tokens_seen": 43714976, + "step": 35915 + }, + { + "epoch": 4.0, + "eval_loss": 0.09023210406303406, + "eval_runtime": 112.711, + "eval_samples_per_second": 35.409, + "eval_steps_per_second": 8.855, + "num_input_tokens_seen": 43715520, + "step": 35916 + }, + { + "epoch": 4.000445483906894, + "grad_norm": 0.42153200507164, + "learning_rate": 4.8491816868144247e-05, + "loss": 0.0205, + "num_input_tokens_seen": 43720288, + "step": 35920 + }, + { + "epoch": 4.001002338790511, + "grad_norm": 0.0006667696870863438, + "learning_rate": 4.8490985604786755e-05, + "loss": 0.0901, + "num_input_tokens_seen": 43726656, + "step": 35925 + }, + { + "epoch": 4.001559193674129, + "grad_norm": 0.1927676498889923, + "learning_rate": 4.849015411953808e-05, + "loss": 0.1453, + "num_input_tokens_seen": 43732736, + "step": 35930 + }, + { + "epoch": 4.002116048557746, + "grad_norm": 0.3800605237483978, + "learning_rate": 4.8489322412406075e-05, + "loss": 0.0744, + "num_input_tokens_seen": 43739040, + "step": 35935 + }, + { + "epoch": 4.002672903441363, + "grad_norm": 1.1604756116867065, + "learning_rate": 4.848849048339861e-05, + "loss": 0.0935, + "num_input_tokens_seen": 43745216, + "step": 35940 + }, + { + "epoch": 4.003229758324981, + "grad_norm": 0.4375275671482086, + "learning_rate": 4.8487658332523524e-05, + "loss": 0.1051, + "num_input_tokens_seen": 43751040, + "step": 35945 + }, + { + "epoch": 4.0037866132085975, + "grad_norm": 1.1153672933578491, + "learning_rate": 4.848682595978869e-05, + "loss": 0.0881, + "num_input_tokens_seen": 43756832, + "step": 35950 + }, + { + "epoch": 4.004343468092215, + "grad_norm": 0.11082112044095993, + "learning_rate": 4.8485993365201966e-05, + "loss": 0.0153, + "num_input_tokens_seen": 43763008, + "step": 35955 + }, + { + "epoch": 4.004900322975833, + "grad_norm": 0.390911340713501, + "learning_rate": 4.8485160548771225e-05, + "loss": 0.1078, + "num_input_tokens_seen": 43768992, + "step": 35960 + }, + { + "epoch": 4.00545717785945, + "grad_norm": 0.7469655275344849, + "learning_rate": 4.848432751050432e-05, + "loss": 0.0522, + "num_input_tokens_seen": 43775360, + "step": 35965 + }, + { + "epoch": 4.006014032743067, + "grad_norm": 1.1838550567626953, + "learning_rate": 4.848349425040913e-05, + "loss": 0.1642, + "num_input_tokens_seen": 43781088, + "step": 35970 + }, + { + "epoch": 4.006570887626684, + "grad_norm": 0.56630939245224, + "learning_rate": 4.848266076849352e-05, + "loss": 0.0749, + "num_input_tokens_seen": 43787424, + "step": 35975 + }, + { + "epoch": 4.007127742510302, + "grad_norm": 1.0600826740264893, + "learning_rate": 4.848182706476536e-05, + "loss": 0.1117, + "num_input_tokens_seen": 43793632, + "step": 35980 + }, + { + "epoch": 4.0076845973939195, + "grad_norm": 0.01082548126578331, + "learning_rate": 4.848099313923254e-05, + "loss": 0.0103, + "num_input_tokens_seen": 43800064, + "step": 35985 + }, + { + "epoch": 4.008241452277536, + "grad_norm": 2.197067975997925, + "learning_rate": 4.8480158991902926e-05, + "loss": 0.1014, + "num_input_tokens_seen": 43806368, + "step": 35990 + }, + { + "epoch": 4.008798307161154, + "grad_norm": 1.0805132389068604, + "learning_rate": 4.84793246227844e-05, + "loss": 0.1324, + "num_input_tokens_seen": 43812512, + "step": 35995 + }, + { + "epoch": 4.009355162044771, + "grad_norm": 0.2842947244644165, + "learning_rate": 4.847849003188483e-05, + "loss": 0.131, + "num_input_tokens_seen": 43818592, + "step": 36000 + }, + { + "epoch": 4.009912016928388, + "grad_norm": 0.4947088956832886, + "learning_rate": 4.8477655219212115e-05, + "loss": 0.0262, + "num_input_tokens_seen": 43824736, + "step": 36005 + }, + { + "epoch": 4.010468871812006, + "grad_norm": 0.024390125647187233, + "learning_rate": 4.847682018477414e-05, + "loss": 0.0633, + "num_input_tokens_seen": 43831008, + "step": 36010 + }, + { + "epoch": 4.011025726695623, + "grad_norm": 1.1672507524490356, + "learning_rate": 4.8475984928578785e-05, + "loss": 0.0865, + "num_input_tokens_seen": 43837312, + "step": 36015 + }, + { + "epoch": 4.0115825815792405, + "grad_norm": 0.5127132534980774, + "learning_rate": 4.847514945063395e-05, + "loss": 0.0165, + "num_input_tokens_seen": 43843680, + "step": 36020 + }, + { + "epoch": 4.012139436462858, + "grad_norm": 0.0459764190018177, + "learning_rate": 4.847431375094752e-05, + "loss": 0.0188, + "num_input_tokens_seen": 43849984, + "step": 36025 + }, + { + "epoch": 4.012696291346475, + "grad_norm": 0.0011320742778480053, + "learning_rate": 4.847347782952738e-05, + "loss": 0.1003, + "num_input_tokens_seen": 43856192, + "step": 36030 + }, + { + "epoch": 4.013253146230093, + "grad_norm": 0.12489498406648636, + "learning_rate": 4.8472641686381446e-05, + "loss": 0.0542, + "num_input_tokens_seen": 43862272, + "step": 36035 + }, + { + "epoch": 4.0138100011137094, + "grad_norm": 0.4607888460159302, + "learning_rate": 4.8471805321517606e-05, + "loss": 0.0341, + "num_input_tokens_seen": 43868416, + "step": 36040 + }, + { + "epoch": 4.014366855997327, + "grad_norm": 0.10702625662088394, + "learning_rate": 4.847096873494375e-05, + "loss": 0.0774, + "num_input_tokens_seen": 43874656, + "step": 36045 + }, + { + "epoch": 4.014923710880945, + "grad_norm": 1.195277452468872, + "learning_rate": 4.8470131926667793e-05, + "loss": 0.1024, + "num_input_tokens_seen": 43880128, + "step": 36050 + }, + { + "epoch": 4.015480565764562, + "grad_norm": 0.05536893755197525, + "learning_rate": 4.846929489669764e-05, + "loss": 0.1169, + "num_input_tokens_seen": 43885984, + "step": 36055 + }, + { + "epoch": 4.016037420648179, + "grad_norm": 0.001126192044466734, + "learning_rate": 4.8468457645041184e-05, + "loss": 0.1027, + "num_input_tokens_seen": 43891808, + "step": 36060 + }, + { + "epoch": 4.016594275531796, + "grad_norm": 0.16451174020767212, + "learning_rate": 4.8467620171706356e-05, + "loss": 0.0607, + "num_input_tokens_seen": 43897856, + "step": 36065 + }, + { + "epoch": 4.017151130415414, + "grad_norm": 0.004056538920849562, + "learning_rate": 4.846678247670105e-05, + "loss": 0.1145, + "num_input_tokens_seen": 43903936, + "step": 36070 + }, + { + "epoch": 4.017707985299031, + "grad_norm": 0.44248589873313904, + "learning_rate": 4.846594456003318e-05, + "loss": 0.0502, + "num_input_tokens_seen": 43910208, + "step": 36075 + }, + { + "epoch": 4.018264840182648, + "grad_norm": 0.3675185441970825, + "learning_rate": 4.846510642171066e-05, + "loss": 0.1293, + "num_input_tokens_seen": 43916384, + "step": 36080 + }, + { + "epoch": 4.018821695066266, + "grad_norm": 0.5719856023788452, + "learning_rate": 4.846426806174141e-05, + "loss": 0.0332, + "num_input_tokens_seen": 43922496, + "step": 36085 + }, + { + "epoch": 4.019378549949883, + "grad_norm": 0.1729322075843811, + "learning_rate": 4.8463429480133355e-05, + "loss": 0.0868, + "num_input_tokens_seen": 43928256, + "step": 36090 + }, + { + "epoch": 4.0199354048335, + "grad_norm": 0.008561665192246437, + "learning_rate": 4.8462590676894405e-05, + "loss": 0.0231, + "num_input_tokens_seen": 43934720, + "step": 36095 + }, + { + "epoch": 4.020492259717118, + "grad_norm": 0.27431967854499817, + "learning_rate": 4.846175165203249e-05, + "loss": 0.0253, + "num_input_tokens_seen": 43941056, + "step": 36100 + }, + { + "epoch": 4.021049114600735, + "grad_norm": 0.0033590695820748806, + "learning_rate": 4.846091240555553e-05, + "loss": 0.0477, + "num_input_tokens_seen": 43947072, + "step": 36105 + }, + { + "epoch": 4.0216059694843524, + "grad_norm": 0.2271205335855484, + "learning_rate": 4.846007293747146e-05, + "loss": 0.1074, + "num_input_tokens_seen": 43953248, + "step": 36110 + }, + { + "epoch": 4.02216282436797, + "grad_norm": 0.5286818146705627, + "learning_rate": 4.84592332477882e-05, + "loss": 0.162, + "num_input_tokens_seen": 43959200, + "step": 36115 + }, + { + "epoch": 4.022719679251587, + "grad_norm": 0.4794813096523285, + "learning_rate": 4.84583933365137e-05, + "loss": 0.0463, + "num_input_tokens_seen": 43965376, + "step": 36120 + }, + { + "epoch": 4.023276534135205, + "grad_norm": 0.23099425435066223, + "learning_rate": 4.845755320365587e-05, + "loss": 0.0254, + "num_input_tokens_seen": 43971616, + "step": 36125 + }, + { + "epoch": 4.023833389018821, + "grad_norm": 0.028275376185774803, + "learning_rate": 4.845671284922265e-05, + "loss": 0.0931, + "num_input_tokens_seen": 43977888, + "step": 36130 + }, + { + "epoch": 4.024390243902439, + "grad_norm": 0.39287635684013367, + "learning_rate": 4.845587227322199e-05, + "loss": 0.1052, + "num_input_tokens_seen": 43983616, + "step": 36135 + }, + { + "epoch": 4.024947098786057, + "grad_norm": 0.40794625878334045, + "learning_rate": 4.845503147566183e-05, + "loss": 0.1522, + "num_input_tokens_seen": 43988704, + "step": 36140 + }, + { + "epoch": 4.0255039536696735, + "grad_norm": 0.02816479094326496, + "learning_rate": 4.8454190456550095e-05, + "loss": 0.0481, + "num_input_tokens_seen": 43994624, + "step": 36145 + }, + { + "epoch": 4.026060808553291, + "grad_norm": 0.22816704213619232, + "learning_rate": 4.845334921589475e-05, + "loss": 0.0734, + "num_input_tokens_seen": 44000672, + "step": 36150 + }, + { + "epoch": 4.026617663436908, + "grad_norm": 1.2800683975219727, + "learning_rate": 4.845250775370372e-05, + "loss": 0.0908, + "num_input_tokens_seen": 44006880, + "step": 36155 + }, + { + "epoch": 4.027174518320526, + "grad_norm": 0.3071293830871582, + "learning_rate": 4.845166606998498e-05, + "loss": 0.0306, + "num_input_tokens_seen": 44012768, + "step": 36160 + }, + { + "epoch": 4.027731373204143, + "grad_norm": 0.004417449701577425, + "learning_rate": 4.845082416474646e-05, + "loss": 0.0468, + "num_input_tokens_seen": 44018816, + "step": 36165 + }, + { + "epoch": 4.02828822808776, + "grad_norm": 0.0022098205517977476, + "learning_rate": 4.844998203799611e-05, + "loss": 0.0568, + "num_input_tokens_seen": 44024896, + "step": 36170 + }, + { + "epoch": 4.028845082971378, + "grad_norm": 0.4133656919002533, + "learning_rate": 4.8449139689741894e-05, + "loss": 0.0787, + "num_input_tokens_seen": 44031136, + "step": 36175 + }, + { + "epoch": 4.029401937854995, + "grad_norm": 0.42464980483055115, + "learning_rate": 4.844829711999177e-05, + "loss": 0.0892, + "num_input_tokens_seen": 44037792, + "step": 36180 + }, + { + "epoch": 4.029958792738612, + "grad_norm": 0.5956622362136841, + "learning_rate": 4.844745432875369e-05, + "loss": 0.0432, + "num_input_tokens_seen": 44043488, + "step": 36185 + }, + { + "epoch": 4.03051564762223, + "grad_norm": 0.0036015650257468224, + "learning_rate": 4.844661131603562e-05, + "loss": 0.0525, + "num_input_tokens_seen": 44049376, + "step": 36190 + }, + { + "epoch": 4.031072502505847, + "grad_norm": 0.10703952610492706, + "learning_rate": 4.8445768081845516e-05, + "loss": 0.0195, + "num_input_tokens_seen": 44055712, + "step": 36195 + }, + { + "epoch": 4.031629357389464, + "grad_norm": 0.9699454307556152, + "learning_rate": 4.844492462619136e-05, + "loss": 0.0603, + "num_input_tokens_seen": 44061888, + "step": 36200 + }, + { + "epoch": 4.032186212273082, + "grad_norm": 0.01925826445221901, + "learning_rate": 4.8444080949081096e-05, + "loss": 0.0182, + "num_input_tokens_seen": 44067424, + "step": 36205 + }, + { + "epoch": 4.032743067156699, + "grad_norm": 0.06006691977381706, + "learning_rate": 4.844323705052271e-05, + "loss": 0.0315, + "num_input_tokens_seen": 44073504, + "step": 36210 + }, + { + "epoch": 4.0332999220403165, + "grad_norm": 0.29918256402015686, + "learning_rate": 4.844239293052416e-05, + "loss": 0.0266, + "num_input_tokens_seen": 44079392, + "step": 36215 + }, + { + "epoch": 4.033856776923933, + "grad_norm": 2.5588724613189697, + "learning_rate": 4.8441548589093436e-05, + "loss": 0.1115, + "num_input_tokens_seen": 44085568, + "step": 36220 + }, + { + "epoch": 4.034413631807551, + "grad_norm": 0.3593757152557373, + "learning_rate": 4.8440704026238495e-05, + "loss": 0.0987, + "num_input_tokens_seen": 44091904, + "step": 36225 + }, + { + "epoch": 4.034970486691169, + "grad_norm": 0.39640331268310547, + "learning_rate": 4.843985924196733e-05, + "loss": 0.0298, + "num_input_tokens_seen": 44097952, + "step": 36230 + }, + { + "epoch": 4.035527341574785, + "grad_norm": 0.15241512656211853, + "learning_rate": 4.843901423628792e-05, + "loss": 0.0432, + "num_input_tokens_seen": 44103872, + "step": 36235 + }, + { + "epoch": 4.036084196458403, + "grad_norm": 0.08108283579349518, + "learning_rate": 4.843816900920823e-05, + "loss": 0.0491, + "num_input_tokens_seen": 44109728, + "step": 36240 + }, + { + "epoch": 4.03664105134202, + "grad_norm": 0.9468072652816772, + "learning_rate": 4.8437323560736266e-05, + "loss": 0.0451, + "num_input_tokens_seen": 44115936, + "step": 36245 + }, + { + "epoch": 4.037197906225638, + "grad_norm": 0.9844193458557129, + "learning_rate": 4.8436477890879994e-05, + "loss": 0.0403, + "num_input_tokens_seen": 44122240, + "step": 36250 + }, + { + "epoch": 4.037754761109255, + "grad_norm": 0.3030621409416199, + "learning_rate": 4.843563199964742e-05, + "loss": 0.044, + "num_input_tokens_seen": 44127808, + "step": 36255 + }, + { + "epoch": 4.038311615992872, + "grad_norm": 0.2719423174858093, + "learning_rate": 4.843478588704652e-05, + "loss": 0.0109, + "num_input_tokens_seen": 44133984, + "step": 36260 + }, + { + "epoch": 4.03886847087649, + "grad_norm": 0.24105866253376007, + "learning_rate": 4.843393955308529e-05, + "loss": 0.0103, + "num_input_tokens_seen": 44139968, + "step": 36265 + }, + { + "epoch": 4.0394253257601065, + "grad_norm": 0.9011566042900085, + "learning_rate": 4.843309299777174e-05, + "loss": 0.0176, + "num_input_tokens_seen": 44146336, + "step": 36270 + }, + { + "epoch": 4.039982180643724, + "grad_norm": 0.20802158117294312, + "learning_rate": 4.843224622111383e-05, + "loss": 0.0379, + "num_input_tokens_seen": 44152160, + "step": 36275 + }, + { + "epoch": 4.040539035527342, + "grad_norm": 0.8988546133041382, + "learning_rate": 4.84313992231196e-05, + "loss": 0.1748, + "num_input_tokens_seen": 44158080, + "step": 36280 + }, + { + "epoch": 4.041095890410959, + "grad_norm": 0.23462505638599396, + "learning_rate": 4.843055200379702e-05, + "loss": 0.0622, + "num_input_tokens_seen": 44163232, + "step": 36285 + }, + { + "epoch": 4.041652745294576, + "grad_norm": 0.14959166944026947, + "learning_rate": 4.842970456315411e-05, + "loss": 0.0385, + "num_input_tokens_seen": 44169056, + "step": 36290 + }, + { + "epoch": 4.042209600178194, + "grad_norm": 2.1337056159973145, + "learning_rate": 4.842885690119887e-05, + "loss": 0.0926, + "num_input_tokens_seen": 44175040, + "step": 36295 + }, + { + "epoch": 4.042766455061811, + "grad_norm": 0.15223388373851776, + "learning_rate": 4.84280090179393e-05, + "loss": 0.0398, + "num_input_tokens_seen": 44181184, + "step": 36300 + }, + { + "epoch": 4.043323309945428, + "grad_norm": 0.999108076095581, + "learning_rate": 4.8427160913383417e-05, + "loss": 0.0912, + "num_input_tokens_seen": 44187360, + "step": 36305 + }, + { + "epoch": 4.043880164829045, + "grad_norm": 0.0005074584623798728, + "learning_rate": 4.842631258753923e-05, + "loss": 0.1262, + "num_input_tokens_seen": 44193536, + "step": 36310 + }, + { + "epoch": 4.044437019712663, + "grad_norm": 0.17151939868927002, + "learning_rate": 4.842546404041475e-05, + "loss": 0.1304, + "num_input_tokens_seen": 44199456, + "step": 36315 + }, + { + "epoch": 4.044993874596281, + "grad_norm": 0.01697934977710247, + "learning_rate": 4.8424615272017995e-05, + "loss": 0.0842, + "num_input_tokens_seen": 44206016, + "step": 36320 + }, + { + "epoch": 4.045550729479897, + "grad_norm": 0.012687884271144867, + "learning_rate": 4.842376628235698e-05, + "loss": 0.0114, + "num_input_tokens_seen": 44212224, + "step": 36325 + }, + { + "epoch": 4.046107584363515, + "grad_norm": 1.5068296194076538, + "learning_rate": 4.842291707143973e-05, + "loss": 0.0824, + "num_input_tokens_seen": 44218272, + "step": 36330 + }, + { + "epoch": 4.046664439247132, + "grad_norm": 0.10392726957798004, + "learning_rate": 4.842206763927426e-05, + "loss": 0.0245, + "num_input_tokens_seen": 44224544, + "step": 36335 + }, + { + "epoch": 4.0472212941307495, + "grad_norm": 0.39477619528770447, + "learning_rate": 4.8421217985868596e-05, + "loss": 0.0208, + "num_input_tokens_seen": 44230784, + "step": 36340 + }, + { + "epoch": 4.047778149014367, + "grad_norm": 0.31452062726020813, + "learning_rate": 4.842036811123076e-05, + "loss": 0.0767, + "num_input_tokens_seen": 44236544, + "step": 36345 + }, + { + "epoch": 4.048335003897984, + "grad_norm": 1.4586676359176636, + "learning_rate": 4.841951801536878e-05, + "loss": 0.1829, + "num_input_tokens_seen": 44242752, + "step": 36350 + }, + { + "epoch": 4.048891858781602, + "grad_norm": 0.02792460098862648, + "learning_rate": 4.8418667698290696e-05, + "loss": 0.2065, + "num_input_tokens_seen": 44248832, + "step": 36355 + }, + { + "epoch": 4.049448713665218, + "grad_norm": 0.1088934987783432, + "learning_rate": 4.841781716000453e-05, + "loss": 0.0373, + "num_input_tokens_seen": 44254816, + "step": 36360 + }, + { + "epoch": 4.050005568548836, + "grad_norm": 1.1921924352645874, + "learning_rate": 4.8416966400518324e-05, + "loss": 0.1034, + "num_input_tokens_seen": 44260928, + "step": 36365 + }, + { + "epoch": 4.050562423432454, + "grad_norm": 0.016006140038371086, + "learning_rate": 4.84161154198401e-05, + "loss": 0.0897, + "num_input_tokens_seen": 44266656, + "step": 36370 + }, + { + "epoch": 4.0511192783160705, + "grad_norm": 1.9243348836898804, + "learning_rate": 4.841526421797792e-05, + "loss": 0.0708, + "num_input_tokens_seen": 44272864, + "step": 36375 + }, + { + "epoch": 4.051676133199688, + "grad_norm": 2.2299017906188965, + "learning_rate": 4.841441279493979e-05, + "loss": 0.1503, + "num_input_tokens_seen": 44279072, + "step": 36380 + }, + { + "epoch": 4.052232988083306, + "grad_norm": 0.00026245543267577887, + "learning_rate": 4.841356115073379e-05, + "loss": 0.0024, + "num_input_tokens_seen": 44285280, + "step": 36385 + }, + { + "epoch": 4.052789842966923, + "grad_norm": 1.4837651252746582, + "learning_rate": 4.841270928536794e-05, + "loss": 0.0557, + "num_input_tokens_seen": 44291296, + "step": 36390 + }, + { + "epoch": 4.05334669785054, + "grad_norm": 0.07469682395458221, + "learning_rate": 4.841185719885029e-05, + "loss": 0.0605, + "num_input_tokens_seen": 44297792, + "step": 36395 + }, + { + "epoch": 4.053903552734157, + "grad_norm": 0.587432861328125, + "learning_rate": 4.841100489118889e-05, + "loss": 0.0422, + "num_input_tokens_seen": 44304128, + "step": 36400 + }, + { + "epoch": 4.054460407617775, + "grad_norm": 0.16192352771759033, + "learning_rate": 4.841015236239179e-05, + "loss": 0.1149, + "num_input_tokens_seen": 44310496, + "step": 36405 + }, + { + "epoch": 4.0550172625013925, + "grad_norm": 0.057892296463251114, + "learning_rate": 4.840929961246705e-05, + "loss": 0.0235, + "num_input_tokens_seen": 44316800, + "step": 36410 + }, + { + "epoch": 4.055574117385009, + "grad_norm": 1.363113522529602, + "learning_rate": 4.840844664142272e-05, + "loss": 0.0882, + "num_input_tokens_seen": 44322880, + "step": 36415 + }, + { + "epoch": 4.056130972268627, + "grad_norm": 1.103337287902832, + "learning_rate": 4.8407593449266866e-05, + "loss": 0.1232, + "num_input_tokens_seen": 44329024, + "step": 36420 + }, + { + "epoch": 4.056687827152244, + "grad_norm": 1.143256425857544, + "learning_rate": 4.840674003600753e-05, + "loss": 0.1242, + "num_input_tokens_seen": 44334912, + "step": 36425 + }, + { + "epoch": 4.057244682035861, + "grad_norm": 0.17110168933868408, + "learning_rate": 4.840588640165277e-05, + "loss": 0.1135, + "num_input_tokens_seen": 44340896, + "step": 36430 + }, + { + "epoch": 4.057801536919479, + "grad_norm": 0.057306982576847076, + "learning_rate": 4.840503254621067e-05, + "loss": 0.0174, + "num_input_tokens_seen": 44347040, + "step": 36435 + }, + { + "epoch": 4.058358391803096, + "grad_norm": 0.1645432710647583, + "learning_rate": 4.840417846968929e-05, + "loss": 0.1333, + "num_input_tokens_seen": 44353152, + "step": 36440 + }, + { + "epoch": 4.0589152466867136, + "grad_norm": 0.0134885860607028, + "learning_rate": 4.840332417209669e-05, + "loss": 0.1744, + "num_input_tokens_seen": 44359552, + "step": 36445 + }, + { + "epoch": 4.05947210157033, + "grad_norm": 0.2644849419593811, + "learning_rate": 4.840246965344094e-05, + "loss": 0.032, + "num_input_tokens_seen": 44365824, + "step": 36450 + }, + { + "epoch": 4.060028956453948, + "grad_norm": 0.06395073235034943, + "learning_rate": 4.840161491373012e-05, + "loss": 0.0327, + "num_input_tokens_seen": 44372288, + "step": 36455 + }, + { + "epoch": 4.060585811337566, + "grad_norm": 0.4693470299243927, + "learning_rate": 4.8400759952972293e-05, + "loss": 0.0917, + "num_input_tokens_seen": 44378400, + "step": 36460 + }, + { + "epoch": 4.0611426662211825, + "grad_norm": 0.00037004207842983305, + "learning_rate": 4.8399904771175544e-05, + "loss": 0.0708, + "num_input_tokens_seen": 44384736, + "step": 36465 + }, + { + "epoch": 4.0616995211048, + "grad_norm": 0.1612672656774521, + "learning_rate": 4.839904936834794e-05, + "loss": 0.0657, + "num_input_tokens_seen": 44390816, + "step": 36470 + }, + { + "epoch": 4.062256375988418, + "grad_norm": 0.006566660478711128, + "learning_rate": 4.839819374449757e-05, + "loss": 0.0098, + "num_input_tokens_seen": 44397056, + "step": 36475 + }, + { + "epoch": 4.062813230872035, + "grad_norm": 0.5905294418334961, + "learning_rate": 4.8397337899632514e-05, + "loss": 0.0325, + "num_input_tokens_seen": 44402976, + "step": 36480 + }, + { + "epoch": 4.063370085755652, + "grad_norm": 0.0012340160319581628, + "learning_rate": 4.839648183376086e-05, + "loss": 0.123, + "num_input_tokens_seen": 44409024, + "step": 36485 + }, + { + "epoch": 4.063926940639269, + "grad_norm": 0.5281389355659485, + "learning_rate": 4.839562554689069e-05, + "loss": 0.0887, + "num_input_tokens_seen": 44414528, + "step": 36490 + }, + { + "epoch": 4.064483795522887, + "grad_norm": 2.0992205142974854, + "learning_rate": 4.839476903903009e-05, + "loss": 0.264, + "num_input_tokens_seen": 44419840, + "step": 36495 + }, + { + "epoch": 4.065040650406504, + "grad_norm": 0.042831901460886, + "learning_rate": 4.839391231018715e-05, + "loss": 0.0328, + "num_input_tokens_seen": 44426080, + "step": 36500 + }, + { + "epoch": 4.065597505290121, + "grad_norm": 0.08786392956972122, + "learning_rate": 4.8393055360369964e-05, + "loss": 0.1131, + "num_input_tokens_seen": 44432064, + "step": 36505 + }, + { + "epoch": 4.066154360173739, + "grad_norm": 0.011419348418712616, + "learning_rate": 4.8392198189586636e-05, + "loss": 0.0546, + "num_input_tokens_seen": 44437568, + "step": 36510 + }, + { + "epoch": 4.066711215057356, + "grad_norm": 0.14299027621746063, + "learning_rate": 4.839134079784525e-05, + "loss": 0.0275, + "num_input_tokens_seen": 44443936, + "step": 36515 + }, + { + "epoch": 4.067268069940973, + "grad_norm": 0.5268481969833374, + "learning_rate": 4.839048318515391e-05, + "loss": 0.0185, + "num_input_tokens_seen": 44449984, + "step": 36520 + }, + { + "epoch": 4.067824924824591, + "grad_norm": 0.0013397474540397525, + "learning_rate": 4.8389625351520716e-05, + "loss": 0.0622, + "num_input_tokens_seen": 44456448, + "step": 36525 + }, + { + "epoch": 4.068381779708208, + "grad_norm": 0.2684507668018341, + "learning_rate": 4.8388767296953776e-05, + "loss": 0.0215, + "num_input_tokens_seen": 44462336, + "step": 36530 + }, + { + "epoch": 4.0689386345918255, + "grad_norm": 0.36463186144828796, + "learning_rate": 4.838790902146118e-05, + "loss": 0.127, + "num_input_tokens_seen": 44467872, + "step": 36535 + }, + { + "epoch": 4.069495489475442, + "grad_norm": 0.7244943976402283, + "learning_rate": 4.838705052505105e-05, + "loss": 0.1371, + "num_input_tokens_seen": 44473664, + "step": 36540 + }, + { + "epoch": 4.07005234435906, + "grad_norm": 0.9488703012466431, + "learning_rate": 4.8386191807731496e-05, + "loss": 0.1412, + "num_input_tokens_seen": 44478720, + "step": 36545 + }, + { + "epoch": 4.070609199242678, + "grad_norm": 0.012090400792658329, + "learning_rate": 4.838533286951061e-05, + "loss": 0.0348, + "num_input_tokens_seen": 44484896, + "step": 36550 + }, + { + "epoch": 4.071166054126294, + "grad_norm": 0.0005993345403112471, + "learning_rate": 4.838447371039654e-05, + "loss": 0.0025, + "num_input_tokens_seen": 44491296, + "step": 36555 + }, + { + "epoch": 4.071722909009912, + "grad_norm": 1.228550672531128, + "learning_rate": 4.8383614330397365e-05, + "loss": 0.1753, + "num_input_tokens_seen": 44497152, + "step": 36560 + }, + { + "epoch": 4.07227976389353, + "grad_norm": 0.1055896133184433, + "learning_rate": 4.8382754729521215e-05, + "loss": 0.1791, + "num_input_tokens_seen": 44503584, + "step": 36565 + }, + { + "epoch": 4.0728366187771465, + "grad_norm": 0.5865176320075989, + "learning_rate": 4.838189490777622e-05, + "loss": 0.1058, + "num_input_tokens_seen": 44509216, + "step": 36570 + }, + { + "epoch": 4.073393473660764, + "grad_norm": 1.4528882503509521, + "learning_rate": 4.8381034865170494e-05, + "loss": 0.0479, + "num_input_tokens_seen": 44515264, + "step": 36575 + }, + { + "epoch": 4.073950328544381, + "grad_norm": 0.47807639837265015, + "learning_rate": 4.838017460171216e-05, + "loss": 0.028, + "num_input_tokens_seen": 44520896, + "step": 36580 + }, + { + "epoch": 4.074507183427999, + "grad_norm": 0.7297901511192322, + "learning_rate": 4.837931411740935e-05, + "loss": 0.0638, + "num_input_tokens_seen": 44526944, + "step": 36585 + }, + { + "epoch": 4.075064038311616, + "grad_norm": 0.9548487663269043, + "learning_rate": 4.837845341227018e-05, + "loss": 0.0778, + "num_input_tokens_seen": 44532736, + "step": 36590 + }, + { + "epoch": 4.075620893195233, + "grad_norm": 0.20674729347229004, + "learning_rate": 4.837759248630279e-05, + "loss": 0.0773, + "num_input_tokens_seen": 44538784, + "step": 36595 + }, + { + "epoch": 4.076177748078851, + "grad_norm": 0.7372714281082153, + "learning_rate": 4.837673133951531e-05, + "loss": 0.0791, + "num_input_tokens_seen": 44545184, + "step": 36600 + }, + { + "epoch": 4.076734602962468, + "grad_norm": 0.41151008009910583, + "learning_rate": 4.837586997191587e-05, + "loss": 0.1007, + "num_input_tokens_seen": 44551520, + "step": 36605 + }, + { + "epoch": 4.077291457846085, + "grad_norm": 0.32593652606010437, + "learning_rate": 4.837500838351261e-05, + "loss": 0.0645, + "num_input_tokens_seen": 44556960, + "step": 36610 + }, + { + "epoch": 4.077848312729703, + "grad_norm": 0.9838781356811523, + "learning_rate": 4.8374146574313675e-05, + "loss": 0.1391, + "num_input_tokens_seen": 44562816, + "step": 36615 + }, + { + "epoch": 4.07840516761332, + "grad_norm": 1.746034026145935, + "learning_rate": 4.837328454432719e-05, + "loss": 0.1123, + "num_input_tokens_seen": 44569024, + "step": 36620 + }, + { + "epoch": 4.078962022496937, + "grad_norm": 0.7732987999916077, + "learning_rate": 4.837242229356131e-05, + "loss": 0.1942, + "num_input_tokens_seen": 44574816, + "step": 36625 + }, + { + "epoch": 4.079518877380554, + "grad_norm": 1.8741172552108765, + "learning_rate": 4.837155982202417e-05, + "loss": 0.0923, + "num_input_tokens_seen": 44581184, + "step": 36630 + }, + { + "epoch": 4.080075732264172, + "grad_norm": 0.41076529026031494, + "learning_rate": 4.837069712972393e-05, + "loss": 0.1002, + "num_input_tokens_seen": 44587616, + "step": 36635 + }, + { + "epoch": 4.0806325871477895, + "grad_norm": 0.981839120388031, + "learning_rate": 4.836983421666873e-05, + "loss": 0.2963, + "num_input_tokens_seen": 44593568, + "step": 36640 + }, + { + "epoch": 4.081189442031406, + "grad_norm": 0.0013026453088968992, + "learning_rate": 4.836897108286672e-05, + "loss": 0.0122, + "num_input_tokens_seen": 44599616, + "step": 36645 + }, + { + "epoch": 4.081746296915024, + "grad_norm": 0.1028762236237526, + "learning_rate": 4.836810772832606e-05, + "loss": 0.035, + "num_input_tokens_seen": 44605504, + "step": 36650 + }, + { + "epoch": 4.082303151798642, + "grad_norm": 0.098167285323143, + "learning_rate": 4.836724415305489e-05, + "loss": 0.038, + "num_input_tokens_seen": 44611776, + "step": 36655 + }, + { + "epoch": 4.0828600066822585, + "grad_norm": 1.7638599872589111, + "learning_rate": 4.836638035706139e-05, + "loss": 0.0971, + "num_input_tokens_seen": 44618176, + "step": 36660 + }, + { + "epoch": 4.083416861565876, + "grad_norm": 0.7662173509597778, + "learning_rate": 4.83655163403537e-05, + "loss": 0.0641, + "num_input_tokens_seen": 44624480, + "step": 36665 + }, + { + "epoch": 4.083973716449493, + "grad_norm": 0.4912016689777374, + "learning_rate": 4.836465210293999e-05, + "loss": 0.0163, + "num_input_tokens_seen": 44630656, + "step": 36670 + }, + { + "epoch": 4.084530571333111, + "grad_norm": 1.5864181518554688, + "learning_rate": 4.836378764482842e-05, + "loss": 0.0506, + "num_input_tokens_seen": 44636864, + "step": 36675 + }, + { + "epoch": 4.085087426216728, + "grad_norm": 1.0110951662063599, + "learning_rate": 4.8362922966027155e-05, + "loss": 0.0465, + "num_input_tokens_seen": 44642752, + "step": 36680 + }, + { + "epoch": 4.085644281100345, + "grad_norm": 1.2220662832260132, + "learning_rate": 4.8362058066544366e-05, + "loss": 0.1214, + "num_input_tokens_seen": 44648992, + "step": 36685 + }, + { + "epoch": 4.086201135983963, + "grad_norm": 0.9893290996551514, + "learning_rate": 4.836119294638822e-05, + "loss": 0.0749, + "num_input_tokens_seen": 44654976, + "step": 36690 + }, + { + "epoch": 4.0867579908675795, + "grad_norm": 0.6701226830482483, + "learning_rate": 4.836032760556689e-05, + "loss": 0.1306, + "num_input_tokens_seen": 44661024, + "step": 36695 + }, + { + "epoch": 4.087314845751197, + "grad_norm": 0.5570301413536072, + "learning_rate": 4.835946204408855e-05, + "loss": 0.2267, + "num_input_tokens_seen": 44666496, + "step": 36700 + }, + { + "epoch": 4.087871700634815, + "grad_norm": 0.0037099882028996944, + "learning_rate": 4.835859626196139e-05, + "loss": 0.0039, + "num_input_tokens_seen": 44672928, + "step": 36705 + }, + { + "epoch": 4.088428555518432, + "grad_norm": 0.010483848862349987, + "learning_rate": 4.8357730259193554e-05, + "loss": 0.1625, + "num_input_tokens_seen": 44678944, + "step": 36710 + }, + { + "epoch": 4.088985410402049, + "grad_norm": 2.0809593200683594, + "learning_rate": 4.835686403579325e-05, + "loss": 0.0877, + "num_input_tokens_seen": 44685088, + "step": 36715 + }, + { + "epoch": 4.089542265285667, + "grad_norm": 0.4297131896018982, + "learning_rate": 4.8355997591768646e-05, + "loss": 0.1596, + "num_input_tokens_seen": 44690528, + "step": 36720 + }, + { + "epoch": 4.090099120169284, + "grad_norm": 0.08911224454641342, + "learning_rate": 4.835513092712794e-05, + "loss": 0.1745, + "num_input_tokens_seen": 44696256, + "step": 36725 + }, + { + "epoch": 4.0906559750529015, + "grad_norm": 0.14222639799118042, + "learning_rate": 4.83542640418793e-05, + "loss": 0.0871, + "num_input_tokens_seen": 44702432, + "step": 36730 + }, + { + "epoch": 4.091212829936518, + "grad_norm": 0.007806111127138138, + "learning_rate": 4.8353396936030935e-05, + "loss": 0.029, + "num_input_tokens_seen": 44708768, + "step": 36735 + }, + { + "epoch": 4.091769684820136, + "grad_norm": 0.14797568321228027, + "learning_rate": 4.8352529609591026e-05, + "loss": 0.1778, + "num_input_tokens_seen": 44714880, + "step": 36740 + }, + { + "epoch": 4.092326539703754, + "grad_norm": 0.15357765555381775, + "learning_rate": 4.8351662062567765e-05, + "loss": 0.0296, + "num_input_tokens_seen": 44720928, + "step": 36745 + }, + { + "epoch": 4.09288339458737, + "grad_norm": 0.0005122201982885599, + "learning_rate": 4.8350794294969346e-05, + "loss": 0.0957, + "num_input_tokens_seen": 44727200, + "step": 36750 + }, + { + "epoch": 4.093440249470988, + "grad_norm": 1.0316109657287598, + "learning_rate": 4.834992630680396e-05, + "loss": 0.0419, + "num_input_tokens_seen": 44732992, + "step": 36755 + }, + { + "epoch": 4.093997104354605, + "grad_norm": 0.5769863724708557, + "learning_rate": 4.834905809807982e-05, + "loss": 0.0485, + "num_input_tokens_seen": 44739456, + "step": 36760 + }, + { + "epoch": 4.0945539592382225, + "grad_norm": 0.6173349618911743, + "learning_rate": 4.8348189668805115e-05, + "loss": 0.1302, + "num_input_tokens_seen": 44745440, + "step": 36765 + }, + { + "epoch": 4.09511081412184, + "grad_norm": 0.09239068627357483, + "learning_rate": 4.8347321018988054e-05, + "loss": 0.1228, + "num_input_tokens_seen": 44751456, + "step": 36770 + }, + { + "epoch": 4.095667669005457, + "grad_norm": 0.9985966682434082, + "learning_rate": 4.834645214863684e-05, + "loss": 0.0651, + "num_input_tokens_seen": 44757760, + "step": 36775 + }, + { + "epoch": 4.096224523889075, + "grad_norm": 0.05176204442977905, + "learning_rate": 4.834558305775968e-05, + "loss": 0.0902, + "num_input_tokens_seen": 44763744, + "step": 36780 + }, + { + "epoch": 4.096781378772691, + "grad_norm": 1.3589593172073364, + "learning_rate": 4.834471374636478e-05, + "loss": 0.0818, + "num_input_tokens_seen": 44770144, + "step": 36785 + }, + { + "epoch": 4.097338233656309, + "grad_norm": 1.0897598266601562, + "learning_rate": 4.834384421446036e-05, + "loss": 0.1303, + "num_input_tokens_seen": 44776000, + "step": 36790 + }, + { + "epoch": 4.097895088539927, + "grad_norm": 0.061686303466558456, + "learning_rate": 4.834297446205463e-05, + "loss": 0.0594, + "num_input_tokens_seen": 44782304, + "step": 36795 + }, + { + "epoch": 4.098451943423544, + "grad_norm": 1.7960394620895386, + "learning_rate": 4.8342104489155805e-05, + "loss": 0.0536, + "num_input_tokens_seen": 44788352, + "step": 36800 + }, + { + "epoch": 4.099008798307161, + "grad_norm": 0.7316338419914246, + "learning_rate": 4.83412342957721e-05, + "loss": 0.056, + "num_input_tokens_seen": 44794176, + "step": 36805 + }, + { + "epoch": 4.099565653190778, + "grad_norm": 0.2229829877614975, + "learning_rate": 4.834036388191173e-05, + "loss": 0.0612, + "num_input_tokens_seen": 44800672, + "step": 36810 + }, + { + "epoch": 4.100122508074396, + "grad_norm": 0.062198787927627563, + "learning_rate": 4.8339493247582934e-05, + "loss": 0.0476, + "num_input_tokens_seen": 44806816, + "step": 36815 + }, + { + "epoch": 4.100679362958013, + "grad_norm": 0.5893780589103699, + "learning_rate": 4.833862239279392e-05, + "loss": 0.1199, + "num_input_tokens_seen": 44812800, + "step": 36820 + }, + { + "epoch": 4.10123621784163, + "grad_norm": 0.9150670766830444, + "learning_rate": 4.833775131755291e-05, + "loss": 0.0861, + "num_input_tokens_seen": 44818528, + "step": 36825 + }, + { + "epoch": 4.101793072725248, + "grad_norm": 0.005361089948564768, + "learning_rate": 4.833688002186816e-05, + "loss": 0.0982, + "num_input_tokens_seen": 44824768, + "step": 36830 + }, + { + "epoch": 4.1023499276088655, + "grad_norm": 6.004094123840332, + "learning_rate": 4.833600850574786e-05, + "loss": 0.0482, + "num_input_tokens_seen": 44830400, + "step": 36835 + }, + { + "epoch": 4.102906782492482, + "grad_norm": 0.14042551815509796, + "learning_rate": 4.833513676920028e-05, + "loss": 0.047, + "num_input_tokens_seen": 44836832, + "step": 36840 + }, + { + "epoch": 4.1034636373761, + "grad_norm": 0.06545104831457138, + "learning_rate": 4.833426481223363e-05, + "loss": 0.0571, + "num_input_tokens_seen": 44842688, + "step": 36845 + }, + { + "epoch": 4.104020492259717, + "grad_norm": 0.4245818853378296, + "learning_rate": 4.833339263485616e-05, + "loss": 0.0234, + "num_input_tokens_seen": 44848832, + "step": 36850 + }, + { + "epoch": 4.104577347143334, + "grad_norm": 0.15277916193008423, + "learning_rate": 4.833252023707609e-05, + "loss": 0.033, + "num_input_tokens_seen": 44855040, + "step": 36855 + }, + { + "epoch": 4.105134202026952, + "grad_norm": 1.0591835975646973, + "learning_rate": 4.8331647618901684e-05, + "loss": 0.1092, + "num_input_tokens_seen": 44861120, + "step": 36860 + }, + { + "epoch": 4.105691056910569, + "grad_norm": 1.2321580648422241, + "learning_rate": 4.833077478034117e-05, + "loss": 0.1651, + "num_input_tokens_seen": 44867360, + "step": 36865 + }, + { + "epoch": 4.106247911794187, + "grad_norm": 0.6960574984550476, + "learning_rate": 4.832990172140279e-05, + "loss": 0.0894, + "num_input_tokens_seen": 44873600, + "step": 36870 + }, + { + "epoch": 4.106804766677803, + "grad_norm": 0.3375241756439209, + "learning_rate": 4.83290284420948e-05, + "loss": 0.0238, + "num_input_tokens_seen": 44879680, + "step": 36875 + }, + { + "epoch": 4.107361621561421, + "grad_norm": 0.27654147148132324, + "learning_rate": 4.832815494242545e-05, + "loss": 0.0228, + "num_input_tokens_seen": 44885440, + "step": 36880 + }, + { + "epoch": 4.107918476445039, + "grad_norm": 0.6448673009872437, + "learning_rate": 4.832728122240298e-05, + "loss": 0.0641, + "num_input_tokens_seen": 44891744, + "step": 36885 + }, + { + "epoch": 4.1084753313286555, + "grad_norm": 0.6947757601737976, + "learning_rate": 4.832640728203566e-05, + "loss": 0.0797, + "num_input_tokens_seen": 44897760, + "step": 36890 + }, + { + "epoch": 4.109032186212273, + "grad_norm": 0.1601165384054184, + "learning_rate": 4.8325533121331724e-05, + "loss": 0.0945, + "num_input_tokens_seen": 44903936, + "step": 36895 + }, + { + "epoch": 4.109589041095891, + "grad_norm": 0.6993523836135864, + "learning_rate": 4.8324658740299444e-05, + "loss": 0.0946, + "num_input_tokens_seen": 44909504, + "step": 36900 + }, + { + "epoch": 4.110145895979508, + "grad_norm": 0.39085209369659424, + "learning_rate": 4.832378413894707e-05, + "loss": 0.1012, + "num_input_tokens_seen": 44915776, + "step": 36905 + }, + { + "epoch": 4.110702750863125, + "grad_norm": 1.4188580513000488, + "learning_rate": 4.832290931728287e-05, + "loss": 0.0968, + "num_input_tokens_seen": 44921408, + "step": 36910 + }, + { + "epoch": 4.111259605746742, + "grad_norm": 0.03776064142584801, + "learning_rate": 4.832203427531511e-05, + "loss": 0.0702, + "num_input_tokens_seen": 44927552, + "step": 36915 + }, + { + "epoch": 4.11181646063036, + "grad_norm": 0.06218299642205238, + "learning_rate": 4.832115901305204e-05, + "loss": 0.0793, + "num_input_tokens_seen": 44933888, + "step": 36920 + }, + { + "epoch": 4.1123733155139774, + "grad_norm": 0.680192232131958, + "learning_rate": 4.832028353050195e-05, + "loss": 0.0504, + "num_input_tokens_seen": 44940000, + "step": 36925 + }, + { + "epoch": 4.112930170397594, + "grad_norm": 0.9444493651390076, + "learning_rate": 4.83194078276731e-05, + "loss": 0.195, + "num_input_tokens_seen": 44946400, + "step": 36930 + }, + { + "epoch": 4.113487025281212, + "grad_norm": 0.7217598557472229, + "learning_rate": 4.831853190457375e-05, + "loss": 0.1212, + "num_input_tokens_seen": 44952416, + "step": 36935 + }, + { + "epoch": 4.114043880164829, + "grad_norm": 0.29324695467948914, + "learning_rate": 4.831765576121219e-05, + "loss": 0.0214, + "num_input_tokens_seen": 44958272, + "step": 36940 + }, + { + "epoch": 4.114600735048446, + "grad_norm": 0.41417819261550903, + "learning_rate": 4.831677939759669e-05, + "loss": 0.0754, + "num_input_tokens_seen": 44964128, + "step": 36945 + }, + { + "epoch": 4.115157589932064, + "grad_norm": 0.3093269169330597, + "learning_rate": 4.831590281373553e-05, + "loss": 0.0794, + "num_input_tokens_seen": 44970496, + "step": 36950 + }, + { + "epoch": 4.115714444815681, + "grad_norm": 0.19164004921913147, + "learning_rate": 4.831502600963698e-05, + "loss": 0.0546, + "num_input_tokens_seen": 44976704, + "step": 36955 + }, + { + "epoch": 4.1162712996992985, + "grad_norm": 0.05673901364207268, + "learning_rate": 4.8314148985309324e-05, + "loss": 0.1176, + "num_input_tokens_seen": 44982976, + "step": 36960 + }, + { + "epoch": 4.116828154582915, + "grad_norm": 0.2985348403453827, + "learning_rate": 4.8313271740760864e-05, + "loss": 0.052, + "num_input_tokens_seen": 44989184, + "step": 36965 + }, + { + "epoch": 4.117385009466533, + "grad_norm": 0.5765435695648193, + "learning_rate": 4.8312394275999864e-05, + "loss": 0.0242, + "num_input_tokens_seen": 44995648, + "step": 36970 + }, + { + "epoch": 4.117941864350151, + "grad_norm": 1.551533579826355, + "learning_rate": 4.831151659103463e-05, + "loss": 0.0249, + "num_input_tokens_seen": 45001824, + "step": 36975 + }, + { + "epoch": 4.118498719233767, + "grad_norm": 0.18118001520633698, + "learning_rate": 4.8310638685873445e-05, + "loss": 0.1029, + "num_input_tokens_seen": 45008128, + "step": 36980 + }, + { + "epoch": 4.119055574117385, + "grad_norm": 0.022207582369446754, + "learning_rate": 4.83097605605246e-05, + "loss": 0.0638, + "num_input_tokens_seen": 45014080, + "step": 36985 + }, + { + "epoch": 4.119612429001003, + "grad_norm": 0.7101651430130005, + "learning_rate": 4.8308882214996395e-05, + "loss": 0.0927, + "num_input_tokens_seen": 45020064, + "step": 36990 + }, + { + "epoch": 4.12016928388462, + "grad_norm": 0.22802941501140594, + "learning_rate": 4.830800364929712e-05, + "loss": 0.0819, + "num_input_tokens_seen": 45026208, + "step": 36995 + }, + { + "epoch": 4.120726138768237, + "grad_norm": 0.19178743660449982, + "learning_rate": 4.830712486343507e-05, + "loss": 0.0767, + "num_input_tokens_seen": 45032736, + "step": 37000 + }, + { + "epoch": 4.121282993651854, + "grad_norm": 0.5424962639808655, + "learning_rate": 4.830624585741856e-05, + "loss": 0.0562, + "num_input_tokens_seen": 45039168, + "step": 37005 + }, + { + "epoch": 4.121839848535472, + "grad_norm": 0.038922566920518875, + "learning_rate": 4.8305366631255885e-05, + "loss": 0.01, + "num_input_tokens_seen": 45045632, + "step": 37010 + }, + { + "epoch": 4.122396703419089, + "grad_norm": 0.8871174454689026, + "learning_rate": 4.8304487184955345e-05, + "loss": 0.0529, + "num_input_tokens_seen": 45051808, + "step": 37015 + }, + { + "epoch": 4.122953558302706, + "grad_norm": 0.46005693078041077, + "learning_rate": 4.8303607518525254e-05, + "loss": 0.1087, + "num_input_tokens_seen": 45058112, + "step": 37020 + }, + { + "epoch": 4.123510413186324, + "grad_norm": 0.5684094429016113, + "learning_rate": 4.830272763197392e-05, + "loss": 0.0245, + "num_input_tokens_seen": 45064352, + "step": 37025 + }, + { + "epoch": 4.124067268069941, + "grad_norm": 1.839614987373352, + "learning_rate": 4.8301847525309655e-05, + "loss": 0.1948, + "num_input_tokens_seen": 45070464, + "step": 37030 + }, + { + "epoch": 4.124624122953558, + "grad_norm": 0.3031090795993805, + "learning_rate": 4.830096719854077e-05, + "loss": 0.1749, + "num_input_tokens_seen": 45076544, + "step": 37035 + }, + { + "epoch": 4.125180977837176, + "grad_norm": 1.5826574563980103, + "learning_rate": 4.830008665167558e-05, + "loss": 0.0707, + "num_input_tokens_seen": 45082624, + "step": 37040 + }, + { + "epoch": 4.125737832720793, + "grad_norm": 0.03535730764269829, + "learning_rate": 4.829920588472241e-05, + "loss": 0.0426, + "num_input_tokens_seen": 45088704, + "step": 37045 + }, + { + "epoch": 4.12629468760441, + "grad_norm": 1.3777879476547241, + "learning_rate": 4.829832489768957e-05, + "loss": 0.1125, + "num_input_tokens_seen": 45094848, + "step": 37050 + }, + { + "epoch": 4.126851542488027, + "grad_norm": 0.20357829332351685, + "learning_rate": 4.8297443690585386e-05, + "loss": 0.0461, + "num_input_tokens_seen": 45101184, + "step": 37055 + }, + { + "epoch": 4.127408397371645, + "grad_norm": 0.35871583223342896, + "learning_rate": 4.829656226341818e-05, + "loss": 0.0195, + "num_input_tokens_seen": 45107136, + "step": 37060 + }, + { + "epoch": 4.127965252255263, + "grad_norm": 0.5181418657302856, + "learning_rate": 4.8295680616196274e-05, + "loss": 0.0471, + "num_input_tokens_seen": 45113184, + "step": 37065 + }, + { + "epoch": 4.128522107138879, + "grad_norm": 0.0987120270729065, + "learning_rate": 4.8294798748928004e-05, + "loss": 0.0218, + "num_input_tokens_seen": 45119520, + "step": 37070 + }, + { + "epoch": 4.129078962022497, + "grad_norm": 0.2525801956653595, + "learning_rate": 4.8293916661621696e-05, + "loss": 0.0744, + "num_input_tokens_seen": 45125216, + "step": 37075 + }, + { + "epoch": 4.129635816906115, + "grad_norm": 0.08109982311725616, + "learning_rate": 4.8293034354285685e-05, + "loss": 0.0373, + "num_input_tokens_seen": 45131392, + "step": 37080 + }, + { + "epoch": 4.1301926717897315, + "grad_norm": 0.8265120983123779, + "learning_rate": 4.82921518269283e-05, + "loss": 0.0588, + "num_input_tokens_seen": 45137472, + "step": 37085 + }, + { + "epoch": 4.130749526673349, + "grad_norm": 0.1987745761871338, + "learning_rate": 4.829126907955788e-05, + "loss": 0.0953, + "num_input_tokens_seen": 45143456, + "step": 37090 + }, + { + "epoch": 4.131306381556966, + "grad_norm": 0.3896600902080536, + "learning_rate": 4.829038611218276e-05, + "loss": 0.0489, + "num_input_tokens_seen": 45149280, + "step": 37095 + }, + { + "epoch": 4.131863236440584, + "grad_norm": 0.03836039826273918, + "learning_rate": 4.828950292481128e-05, + "loss": 0.0407, + "num_input_tokens_seen": 45155424, + "step": 37100 + }, + { + "epoch": 4.132420091324201, + "grad_norm": 0.5724108219146729, + "learning_rate": 4.828861951745179e-05, + "loss": 0.0758, + "num_input_tokens_seen": 45161408, + "step": 37105 + }, + { + "epoch": 4.132976946207818, + "grad_norm": 1.853255271911621, + "learning_rate": 4.828773589011264e-05, + "loss": 0.1141, + "num_input_tokens_seen": 45167744, + "step": 37110 + }, + { + "epoch": 4.133533801091436, + "grad_norm": 1.134458065032959, + "learning_rate": 4.8286852042802156e-05, + "loss": 0.1233, + "num_input_tokens_seen": 45173888, + "step": 37115 + }, + { + "epoch": 4.1340906559750525, + "grad_norm": 0.16076277196407318, + "learning_rate": 4.82859679755287e-05, + "loss": 0.0097, + "num_input_tokens_seen": 45180032, + "step": 37120 + }, + { + "epoch": 4.13464751085867, + "grad_norm": 1.4459840059280396, + "learning_rate": 4.8285083688300616e-05, + "loss": 0.1599, + "num_input_tokens_seen": 45186080, + "step": 37125 + }, + { + "epoch": 4.135204365742288, + "grad_norm": 0.04324543848633766, + "learning_rate": 4.8284199181126264e-05, + "loss": 0.1554, + "num_input_tokens_seen": 45192288, + "step": 37130 + }, + { + "epoch": 4.135761220625905, + "grad_norm": 1.2492330074310303, + "learning_rate": 4.8283314454014e-05, + "loss": 0.1429, + "num_input_tokens_seen": 45198368, + "step": 37135 + }, + { + "epoch": 4.136318075509522, + "grad_norm": 0.8787345290184021, + "learning_rate": 4.828242950697217e-05, + "loss": 0.0465, + "num_input_tokens_seen": 45204672, + "step": 37140 + }, + { + "epoch": 4.136874930393139, + "grad_norm": 0.2257905900478363, + "learning_rate": 4.8281544340009144e-05, + "loss": 0.0678, + "num_input_tokens_seen": 45210816, + "step": 37145 + }, + { + "epoch": 4.137431785276757, + "grad_norm": 1.396958351135254, + "learning_rate": 4.828065895313328e-05, + "loss": 0.147, + "num_input_tokens_seen": 45216448, + "step": 37150 + }, + { + "epoch": 4.1379886401603745, + "grad_norm": 0.02808145247399807, + "learning_rate": 4.8279773346352935e-05, + "loss": 0.0441, + "num_input_tokens_seen": 45222560, + "step": 37155 + }, + { + "epoch": 4.138545495043991, + "grad_norm": 0.8407610654830933, + "learning_rate": 4.8278887519676486e-05, + "loss": 0.1016, + "num_input_tokens_seen": 45227808, + "step": 37160 + }, + { + "epoch": 4.139102349927609, + "grad_norm": 0.24309590458869934, + "learning_rate": 4.827800147311229e-05, + "loss": 0.0564, + "num_input_tokens_seen": 45233664, + "step": 37165 + }, + { + "epoch": 4.139659204811227, + "grad_norm": 0.38776448369026184, + "learning_rate": 4.8277115206668714e-05, + "loss": 0.0154, + "num_input_tokens_seen": 45239712, + "step": 37170 + }, + { + "epoch": 4.140216059694843, + "grad_norm": 0.2109862118959427, + "learning_rate": 4.827622872035414e-05, + "loss": 0.0549, + "num_input_tokens_seen": 45245472, + "step": 37175 + }, + { + "epoch": 4.140772914578461, + "grad_norm": 1.0885778665542603, + "learning_rate": 4.8275342014176936e-05, + "loss": 0.1441, + "num_input_tokens_seen": 45251680, + "step": 37180 + }, + { + "epoch": 4.141329769462078, + "grad_norm": 0.1355358064174652, + "learning_rate": 4.8274455088145484e-05, + "loss": 0.0698, + "num_input_tokens_seen": 45257760, + "step": 37185 + }, + { + "epoch": 4.1418866243456955, + "grad_norm": 0.9074793457984924, + "learning_rate": 4.8273567942268156e-05, + "loss": 0.0599, + "num_input_tokens_seen": 45263680, + "step": 37190 + }, + { + "epoch": 4.142443479229313, + "grad_norm": 1.2941924333572388, + "learning_rate": 4.827268057655333e-05, + "loss": 0.075, + "num_input_tokens_seen": 45269664, + "step": 37195 + }, + { + "epoch": 4.14300033411293, + "grad_norm": 0.16889366507530212, + "learning_rate": 4.827179299100939e-05, + "loss": 0.0404, + "num_input_tokens_seen": 45276000, + "step": 37200 + }, + { + "epoch": 4.143557188996548, + "grad_norm": 0.10511301457881927, + "learning_rate": 4.827090518564472e-05, + "loss": 0.0874, + "num_input_tokens_seen": 45282112, + "step": 37205 + }, + { + "epoch": 4.1441140438801645, + "grad_norm": 0.014351483434438705, + "learning_rate": 4.8270017160467705e-05, + "loss": 0.024, + "num_input_tokens_seen": 45288544, + "step": 37210 + }, + { + "epoch": 4.144670898763782, + "grad_norm": 0.11324749141931534, + "learning_rate": 4.826912891548674e-05, + "loss": 0.0359, + "num_input_tokens_seen": 45294560, + "step": 37215 + }, + { + "epoch": 4.1452277536474, + "grad_norm": 0.42131364345550537, + "learning_rate": 4.82682404507102e-05, + "loss": 0.076, + "num_input_tokens_seen": 45300800, + "step": 37220 + }, + { + "epoch": 4.145784608531017, + "grad_norm": 0.6379210948944092, + "learning_rate": 4.8267351766146495e-05, + "loss": 0.0702, + "num_input_tokens_seen": 45306752, + "step": 37225 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 0.35335442423820496, + "learning_rate": 4.826646286180401e-05, + "loss": 0.0276, + "num_input_tokens_seen": 45312992, + "step": 37230 + }, + { + "epoch": 4.146898318298251, + "grad_norm": 0.27831390500068665, + "learning_rate": 4.826557373769114e-05, + "loss": 0.1484, + "num_input_tokens_seen": 45319296, + "step": 37235 + }, + { + "epoch": 4.147455173181869, + "grad_norm": 0.4676194190979004, + "learning_rate": 4.826468439381628e-05, + "loss": 0.0293, + "num_input_tokens_seen": 45324640, + "step": 37240 + }, + { + "epoch": 4.148012028065486, + "grad_norm": 0.04066858068108559, + "learning_rate": 4.826379483018785e-05, + "loss": 0.0598, + "num_input_tokens_seen": 45330656, + "step": 37245 + }, + { + "epoch": 4.148568882949103, + "grad_norm": 0.005292769055813551, + "learning_rate": 4.8262905046814226e-05, + "loss": 0.0389, + "num_input_tokens_seen": 45336928, + "step": 37250 + }, + { + "epoch": 4.149125737832721, + "grad_norm": 0.6111217737197876, + "learning_rate": 4.826201504370383e-05, + "loss": 0.0304, + "num_input_tokens_seen": 45342720, + "step": 37255 + }, + { + "epoch": 4.1496825927163385, + "grad_norm": 2.123166561126709, + "learning_rate": 4.826112482086507e-05, + "loss": 0.1256, + "num_input_tokens_seen": 45348672, + "step": 37260 + }, + { + "epoch": 4.150239447599955, + "grad_norm": 1.0984046459197998, + "learning_rate": 4.826023437830634e-05, + "loss": 0.0969, + "num_input_tokens_seen": 45354656, + "step": 37265 + }, + { + "epoch": 4.150796302483573, + "grad_norm": 1.2364100217819214, + "learning_rate": 4.8259343716036075e-05, + "loss": 0.0807, + "num_input_tokens_seen": 45361120, + "step": 37270 + }, + { + "epoch": 4.15135315736719, + "grad_norm": 0.26036906242370605, + "learning_rate": 4.8258452834062665e-05, + "loss": 0.1025, + "num_input_tokens_seen": 45367008, + "step": 37275 + }, + { + "epoch": 4.1519100122508075, + "grad_norm": 0.0032731916289776564, + "learning_rate": 4.825756173239453e-05, + "loss": 0.0162, + "num_input_tokens_seen": 45373152, + "step": 37280 + }, + { + "epoch": 4.152466867134425, + "grad_norm": 0.32970139384269714, + "learning_rate": 4.82566704110401e-05, + "loss": 0.1065, + "num_input_tokens_seen": 45379040, + "step": 37285 + }, + { + "epoch": 4.153023722018042, + "grad_norm": 0.25888440012931824, + "learning_rate": 4.8255778870007774e-05, + "loss": 0.1903, + "num_input_tokens_seen": 45384800, + "step": 37290 + }, + { + "epoch": 4.15358057690166, + "grad_norm": 0.4803549647331238, + "learning_rate": 4.825488710930599e-05, + "loss": 0.0124, + "num_input_tokens_seen": 45390752, + "step": 37295 + }, + { + "epoch": 4.154137431785276, + "grad_norm": 1.4839893579483032, + "learning_rate": 4.825399512894317e-05, + "loss": 0.1167, + "num_input_tokens_seen": 45396960, + "step": 37300 + }, + { + "epoch": 4.154694286668894, + "grad_norm": 1.1145117282867432, + "learning_rate": 4.825310292892773e-05, + "loss": 0.1222, + "num_input_tokens_seen": 45403008, + "step": 37305 + }, + { + "epoch": 4.155251141552512, + "grad_norm": 0.3890048861503601, + "learning_rate": 4.82522105092681e-05, + "loss": 0.0226, + "num_input_tokens_seen": 45409344, + "step": 37310 + }, + { + "epoch": 4.1558079964361285, + "grad_norm": 0.7855909466743469, + "learning_rate": 4.8251317869972724e-05, + "loss": 0.1437, + "num_input_tokens_seen": 45415648, + "step": 37315 + }, + { + "epoch": 4.156364851319746, + "grad_norm": 0.4175901710987091, + "learning_rate": 4.825042501105001e-05, + "loss": 0.0788, + "num_input_tokens_seen": 45421568, + "step": 37320 + }, + { + "epoch": 4.156921706203363, + "grad_norm": 0.29799884557724, + "learning_rate": 4.824953193250841e-05, + "loss": 0.0657, + "num_input_tokens_seen": 45428224, + "step": 37325 + }, + { + "epoch": 4.157478561086981, + "grad_norm": 0.14037717878818512, + "learning_rate": 4.8248638634356345e-05, + "loss": 0.1725, + "num_input_tokens_seen": 45434240, + "step": 37330 + }, + { + "epoch": 4.158035415970598, + "grad_norm": 1.0464704036712646, + "learning_rate": 4.824774511660227e-05, + "loss": 0.0508, + "num_input_tokens_seen": 45440448, + "step": 37335 + }, + { + "epoch": 4.158592270854215, + "grad_norm": 0.09613052010536194, + "learning_rate": 4.824685137925462e-05, + "loss": 0.0198, + "num_input_tokens_seen": 45446592, + "step": 37340 + }, + { + "epoch": 4.159149125737833, + "grad_norm": 1.0253350734710693, + "learning_rate": 4.824595742232183e-05, + "loss": 0.0631, + "num_input_tokens_seen": 45452544, + "step": 37345 + }, + { + "epoch": 4.1597059806214505, + "grad_norm": 1.8928279876708984, + "learning_rate": 4.8245063245812345e-05, + "loss": 0.1267, + "num_input_tokens_seen": 45458720, + "step": 37350 + }, + { + "epoch": 4.160262835505067, + "grad_norm": 0.6958268880844116, + "learning_rate": 4.824416884973462e-05, + "loss": 0.2087, + "num_input_tokens_seen": 45465152, + "step": 37355 + }, + { + "epoch": 4.160819690388685, + "grad_norm": 0.3760138154029846, + "learning_rate": 4.8243274234097086e-05, + "loss": 0.1087, + "num_input_tokens_seen": 45471168, + "step": 37360 + }, + { + "epoch": 4.161376545272302, + "grad_norm": 0.31692659854888916, + "learning_rate": 4.824237939890821e-05, + "loss": 0.1168, + "num_input_tokens_seen": 45477536, + "step": 37365 + }, + { + "epoch": 4.161933400155919, + "grad_norm": 0.1951303780078888, + "learning_rate": 4.824148434417645e-05, + "loss": 0.0379, + "num_input_tokens_seen": 45483936, + "step": 37370 + }, + { + "epoch": 4.162490255039537, + "grad_norm": 0.3839470446109772, + "learning_rate": 4.8240589069910234e-05, + "loss": 0.069, + "num_input_tokens_seen": 45490464, + "step": 37375 + }, + { + "epoch": 4.163047109923154, + "grad_norm": 0.06439410150051117, + "learning_rate": 4.823969357611804e-05, + "loss": 0.0397, + "num_input_tokens_seen": 45496544, + "step": 37380 + }, + { + "epoch": 4.1636039648067715, + "grad_norm": 0.0011617718264460564, + "learning_rate": 4.823879786280832e-05, + "loss": 0.0484, + "num_input_tokens_seen": 45502688, + "step": 37385 + }, + { + "epoch": 4.164160819690388, + "grad_norm": 0.13275498151779175, + "learning_rate": 4.8237901929989535e-05, + "loss": 0.0208, + "num_input_tokens_seen": 45509152, + "step": 37390 + }, + { + "epoch": 4.164717674574006, + "grad_norm": 0.36845913529396057, + "learning_rate": 4.823700577767015e-05, + "loss": 0.1498, + "num_input_tokens_seen": 45515008, + "step": 37395 + }, + { + "epoch": 4.165274529457624, + "grad_norm": 0.21647967398166656, + "learning_rate": 4.823610940585863e-05, + "loss": 0.0408, + "num_input_tokens_seen": 45521056, + "step": 37400 + }, + { + "epoch": 4.1658313843412405, + "grad_norm": 0.7578297853469849, + "learning_rate": 4.823521281456344e-05, + "loss": 0.0322, + "num_input_tokens_seen": 45527008, + "step": 37405 + }, + { + "epoch": 4.166388239224858, + "grad_norm": 0.47978416085243225, + "learning_rate": 4.8234316003793044e-05, + "loss": 0.0873, + "num_input_tokens_seen": 45533408, + "step": 37410 + }, + { + "epoch": 4.166945094108475, + "grad_norm": 0.19465170800685883, + "learning_rate": 4.823341897355592e-05, + "loss": 0.0409, + "num_input_tokens_seen": 45539776, + "step": 37415 + }, + { + "epoch": 4.167501948992093, + "grad_norm": 0.7817173600196838, + "learning_rate": 4.823252172386055e-05, + "loss": 0.0536, + "num_input_tokens_seen": 45546016, + "step": 37420 + }, + { + "epoch": 4.16805880387571, + "grad_norm": 0.630430281162262, + "learning_rate": 4.8231624254715384e-05, + "loss": 0.0159, + "num_input_tokens_seen": 45552224, + "step": 37425 + }, + { + "epoch": 4.168615658759327, + "grad_norm": 0.1892312467098236, + "learning_rate": 4.823072656612893e-05, + "loss": 0.0155, + "num_input_tokens_seen": 45558336, + "step": 37430 + }, + { + "epoch": 4.169172513642945, + "grad_norm": 0.5272559523582458, + "learning_rate": 4.8229828658109635e-05, + "loss": 0.1129, + "num_input_tokens_seen": 45563872, + "step": 37435 + }, + { + "epoch": 4.169729368526562, + "grad_norm": 1.4747207164764404, + "learning_rate": 4.8228930530666e-05, + "loss": 0.1243, + "num_input_tokens_seen": 45570112, + "step": 37440 + }, + { + "epoch": 4.170286223410179, + "grad_norm": 1.4729337692260742, + "learning_rate": 4.8228032183806516e-05, + "loss": 0.0803, + "num_input_tokens_seen": 45576192, + "step": 37445 + }, + { + "epoch": 4.170843078293797, + "grad_norm": 0.4090644419193268, + "learning_rate": 4.8227133617539644e-05, + "loss": 0.0551, + "num_input_tokens_seen": 45582144, + "step": 37450 + }, + { + "epoch": 4.171399933177414, + "grad_norm": 0.416328102350235, + "learning_rate": 4.822623483187389e-05, + "loss": 0.0823, + "num_input_tokens_seen": 45588064, + "step": 37455 + }, + { + "epoch": 4.171956788061031, + "grad_norm": 1.4629329442977905, + "learning_rate": 4.822533582681775e-05, + "loss": 0.1664, + "num_input_tokens_seen": 45593920, + "step": 37460 + }, + { + "epoch": 4.172513642944649, + "grad_norm": 1.9555739164352417, + "learning_rate": 4.8224436602379695e-05, + "loss": 0.0446, + "num_input_tokens_seen": 45599808, + "step": 37465 + }, + { + "epoch": 4.173070497828266, + "grad_norm": 0.0073537155985832214, + "learning_rate": 4.822353715856823e-05, + "loss": 0.0564, + "num_input_tokens_seen": 45605536, + "step": 37470 + }, + { + "epoch": 4.1736273527118835, + "grad_norm": 0.21377232670783997, + "learning_rate": 4.822263749539186e-05, + "loss": 0.0204, + "num_input_tokens_seen": 45612000, + "step": 37475 + }, + { + "epoch": 4.1741842075955, + "grad_norm": 1.2046473026275635, + "learning_rate": 4.822173761285906e-05, + "loss": 0.0589, + "num_input_tokens_seen": 45618144, + "step": 37480 + }, + { + "epoch": 4.174741062479118, + "grad_norm": 0.968183696269989, + "learning_rate": 4.822083751097834e-05, + "loss": 0.1146, + "num_input_tokens_seen": 45624160, + "step": 37485 + }, + { + "epoch": 4.175297917362736, + "grad_norm": 0.8644397258758545, + "learning_rate": 4.8219937189758226e-05, + "loss": 0.0711, + "num_input_tokens_seen": 45630272, + "step": 37490 + }, + { + "epoch": 4.175854772246352, + "grad_norm": 0.6839072704315186, + "learning_rate": 4.821903664920718e-05, + "loss": 0.1521, + "num_input_tokens_seen": 45636128, + "step": 37495 + }, + { + "epoch": 4.17641162712997, + "grad_norm": 0.20718243718147278, + "learning_rate": 4.8218135889333746e-05, + "loss": 0.0387, + "num_input_tokens_seen": 45642272, + "step": 37500 + }, + { + "epoch": 4.176968482013587, + "grad_norm": 0.019645817577838898, + "learning_rate": 4.821723491014641e-05, + "loss": 0.0209, + "num_input_tokens_seen": 45648640, + "step": 37505 + }, + { + "epoch": 4.1775253368972045, + "grad_norm": 0.26819196343421936, + "learning_rate": 4.821633371165369e-05, + "loss": 0.1395, + "num_input_tokens_seen": 45654944, + "step": 37510 + }, + { + "epoch": 4.178082191780822, + "grad_norm": 0.11413434892892838, + "learning_rate": 4.8215432293864095e-05, + "loss": 0.1091, + "num_input_tokens_seen": 45661120, + "step": 37515 + }, + { + "epoch": 4.178639046664439, + "grad_norm": 0.025718318298459053, + "learning_rate": 4.821453065678614e-05, + "loss": 0.008, + "num_input_tokens_seen": 45667392, + "step": 37520 + }, + { + "epoch": 4.179195901548057, + "grad_norm": 0.0016724669840186834, + "learning_rate": 4.821362880042836e-05, + "loss": 0.0412, + "num_input_tokens_seen": 45673664, + "step": 37525 + }, + { + "epoch": 4.179752756431674, + "grad_norm": 0.4356057047843933, + "learning_rate": 4.821272672479924e-05, + "loss": 0.1751, + "num_input_tokens_seen": 45679616, + "step": 37530 + }, + { + "epoch": 4.180309611315291, + "grad_norm": 1.2685585021972656, + "learning_rate": 4.821182442990732e-05, + "loss": 0.1436, + "num_input_tokens_seen": 45685952, + "step": 37535 + }, + { + "epoch": 4.180866466198909, + "grad_norm": 0.5585323572158813, + "learning_rate": 4.8210921915761126e-05, + "loss": 0.0836, + "num_input_tokens_seen": 45692320, + "step": 37540 + }, + { + "epoch": 4.181423321082526, + "grad_norm": 0.1461620330810547, + "learning_rate": 4.8210019182369175e-05, + "loss": 0.0564, + "num_input_tokens_seen": 45698624, + "step": 37545 + }, + { + "epoch": 4.181980175966143, + "grad_norm": 0.6721716523170471, + "learning_rate": 4.8209116229740004e-05, + "loss": 0.139, + "num_input_tokens_seen": 45704224, + "step": 37550 + }, + { + "epoch": 4.182537030849761, + "grad_norm": 0.9741517305374146, + "learning_rate": 4.8208213057882124e-05, + "loss": 0.0334, + "num_input_tokens_seen": 45710176, + "step": 37555 + }, + { + "epoch": 4.183093885733378, + "grad_norm": 0.39273738861083984, + "learning_rate": 4.820730966680409e-05, + "loss": 0.0464, + "num_input_tokens_seen": 45716160, + "step": 37560 + }, + { + "epoch": 4.183650740616995, + "grad_norm": 1.2741082906723022, + "learning_rate": 4.8206406056514414e-05, + "loss": 0.084, + "num_input_tokens_seen": 45722272, + "step": 37565 + }, + { + "epoch": 4.184207595500612, + "grad_norm": 0.518052339553833, + "learning_rate": 4.8205502227021645e-05, + "loss": 0.0909, + "num_input_tokens_seen": 45728352, + "step": 37570 + }, + { + "epoch": 4.18476445038423, + "grad_norm": 0.6139830946922302, + "learning_rate": 4.8204598178334314e-05, + "loss": 0.1987, + "num_input_tokens_seen": 45734368, + "step": 37575 + }, + { + "epoch": 4.1853213052678475, + "grad_norm": 0.03130335733294487, + "learning_rate": 4.820369391046096e-05, + "loss": 0.0289, + "num_input_tokens_seen": 45740192, + "step": 37580 + }, + { + "epoch": 4.185878160151464, + "grad_norm": 0.93216872215271, + "learning_rate": 4.820278942341013e-05, + "loss": 0.0906, + "num_input_tokens_seen": 45746048, + "step": 37585 + }, + { + "epoch": 4.186435015035082, + "grad_norm": 2.3290228843688965, + "learning_rate": 4.820188471719036e-05, + "loss": 0.1366, + "num_input_tokens_seen": 45752128, + "step": 37590 + }, + { + "epoch": 4.186991869918699, + "grad_norm": 0.44745054841041565, + "learning_rate": 4.82009797918102e-05, + "loss": 0.0298, + "num_input_tokens_seen": 45757792, + "step": 37595 + }, + { + "epoch": 4.187548724802316, + "grad_norm": 0.34035724401474, + "learning_rate": 4.8200074647278206e-05, + "loss": 0.0348, + "num_input_tokens_seen": 45763840, + "step": 37600 + }, + { + "epoch": 4.188105579685934, + "grad_norm": 2.2225801944732666, + "learning_rate": 4.819916928360291e-05, + "loss": 0.1067, + "num_input_tokens_seen": 45770048, + "step": 37605 + }, + { + "epoch": 4.188662434569551, + "grad_norm": 2.0282726287841797, + "learning_rate": 4.819826370079287e-05, + "loss": 0.1323, + "num_input_tokens_seen": 45775680, + "step": 37610 + }, + { + "epoch": 4.189219289453169, + "grad_norm": 0.8522171378135681, + "learning_rate": 4.8197357898856655e-05, + "loss": 0.0695, + "num_input_tokens_seen": 45781632, + "step": 37615 + }, + { + "epoch": 4.189776144336786, + "grad_norm": 0.007917128503322601, + "learning_rate": 4.81964518778028e-05, + "loss": 0.023, + "num_input_tokens_seen": 45787936, + "step": 37620 + }, + { + "epoch": 4.190332999220403, + "grad_norm": 0.7239906787872314, + "learning_rate": 4.8195545637639877e-05, + "loss": 0.0746, + "num_input_tokens_seen": 45793760, + "step": 37625 + }, + { + "epoch": 4.190889854104021, + "grad_norm": 0.955212414264679, + "learning_rate": 4.8194639178376446e-05, + "loss": 0.0755, + "num_input_tokens_seen": 45799872, + "step": 37630 + }, + { + "epoch": 4.1914467089876375, + "grad_norm": 0.508898138999939, + "learning_rate": 4.819373250002105e-05, + "loss": 0.1639, + "num_input_tokens_seen": 45805728, + "step": 37635 + }, + { + "epoch": 4.192003563871255, + "grad_norm": 0.2078908383846283, + "learning_rate": 4.819282560258228e-05, + "loss": 0.0206, + "num_input_tokens_seen": 45811680, + "step": 37640 + }, + { + "epoch": 4.192560418754873, + "grad_norm": 0.13871383666992188, + "learning_rate": 4.819191848606869e-05, + "loss": 0.0122, + "num_input_tokens_seen": 45817792, + "step": 37645 + }, + { + "epoch": 4.19311727363849, + "grad_norm": 0.19347697496414185, + "learning_rate": 4.8191011150488844e-05, + "loss": 0.0328, + "num_input_tokens_seen": 45823968, + "step": 37650 + }, + { + "epoch": 4.193674128522107, + "grad_norm": 0.7641322612762451, + "learning_rate": 4.819010359585132e-05, + "loss": 0.06, + "num_input_tokens_seen": 45829984, + "step": 37655 + }, + { + "epoch": 4.194230983405724, + "grad_norm": 1.3978232145309448, + "learning_rate": 4.818919582216469e-05, + "loss": 0.0462, + "num_input_tokens_seen": 45836160, + "step": 37660 + }, + { + "epoch": 4.194787838289342, + "grad_norm": 1.490767002105713, + "learning_rate": 4.8188287829437524e-05, + "loss": 0.1344, + "num_input_tokens_seen": 45842432, + "step": 37665 + }, + { + "epoch": 4.195344693172959, + "grad_norm": 0.07361598312854767, + "learning_rate": 4.8187379617678395e-05, + "loss": 0.1177, + "num_input_tokens_seen": 45848416, + "step": 37670 + }, + { + "epoch": 4.195901548056576, + "grad_norm": 0.9695754051208496, + "learning_rate": 4.81864711868959e-05, + "loss": 0.1308, + "num_input_tokens_seen": 45854592, + "step": 37675 + }, + { + "epoch": 4.196458402940194, + "grad_norm": 1.3749440908432007, + "learning_rate": 4.81855625370986e-05, + "loss": 0.0667, + "num_input_tokens_seen": 45860224, + "step": 37680 + }, + { + "epoch": 4.197015257823811, + "grad_norm": 0.06670751422643661, + "learning_rate": 4.818465366829509e-05, + "loss": 0.0954, + "num_input_tokens_seen": 45866272, + "step": 37685 + }, + { + "epoch": 4.197572112707428, + "grad_norm": 0.0019057670142501593, + "learning_rate": 4.818374458049395e-05, + "loss": 0.0551, + "num_input_tokens_seen": 45872704, + "step": 37690 + }, + { + "epoch": 4.198128967591046, + "grad_norm": 0.20562492311000824, + "learning_rate": 4.818283527370377e-05, + "loss": 0.1263, + "num_input_tokens_seen": 45878912, + "step": 37695 + }, + { + "epoch": 4.198685822474663, + "grad_norm": 0.06513217836618423, + "learning_rate": 4.818192574793313e-05, + "loss": 0.0682, + "num_input_tokens_seen": 45885440, + "step": 37700 + }, + { + "epoch": 4.1992426773582805, + "grad_norm": 0.014265455305576324, + "learning_rate": 4.818101600319064e-05, + "loss": 0.064, + "num_input_tokens_seen": 45891808, + "step": 37705 + }, + { + "epoch": 4.199799532241898, + "grad_norm": 1.1929306983947754, + "learning_rate": 4.818010603948487e-05, + "loss": 0.141, + "num_input_tokens_seen": 45897696, + "step": 37710 + }, + { + "epoch": 4.200356387125515, + "grad_norm": 1.5564578771591187, + "learning_rate": 4.817919585682443e-05, + "loss": 0.0805, + "num_input_tokens_seen": 45903840, + "step": 37715 + }, + { + "epoch": 4.200913242009133, + "grad_norm": 0.1960311383008957, + "learning_rate": 4.817828545521791e-05, + "loss": 0.0551, + "num_input_tokens_seen": 45909920, + "step": 37720 + }, + { + "epoch": 4.201470096892749, + "grad_norm": 0.04115118458867073, + "learning_rate": 4.817737483467393e-05, + "loss": 0.1203, + "num_input_tokens_seen": 45915968, + "step": 37725 + }, + { + "epoch": 4.202026951776367, + "grad_norm": 0.12850835919380188, + "learning_rate": 4.817646399520106e-05, + "loss": 0.0286, + "num_input_tokens_seen": 45922048, + "step": 37730 + }, + { + "epoch": 4.202583806659985, + "grad_norm": 0.45586293935775757, + "learning_rate": 4.8175552936807925e-05, + "loss": 0.0455, + "num_input_tokens_seen": 45927808, + "step": 37735 + }, + { + "epoch": 4.2031406615436016, + "grad_norm": 0.17856033146381378, + "learning_rate": 4.8174641659503116e-05, + "loss": 0.0372, + "num_input_tokens_seen": 45934112, + "step": 37740 + }, + { + "epoch": 4.203697516427219, + "grad_norm": 0.03234502300620079, + "learning_rate": 4.817373016329526e-05, + "loss": 0.0718, + "num_input_tokens_seen": 45940096, + "step": 37745 + }, + { + "epoch": 4.204254371310836, + "grad_norm": 0.5546534061431885, + "learning_rate": 4.817281844819295e-05, + "loss": 0.0838, + "num_input_tokens_seen": 45945984, + "step": 37750 + }, + { + "epoch": 4.204811226194454, + "grad_norm": 0.23255540430545807, + "learning_rate": 4.81719065142048e-05, + "loss": 0.0278, + "num_input_tokens_seen": 45952192, + "step": 37755 + }, + { + "epoch": 4.205368081078071, + "grad_norm": 0.6440708637237549, + "learning_rate": 4.817099436133944e-05, + "loss": 0.0232, + "num_input_tokens_seen": 45958368, + "step": 37760 + }, + { + "epoch": 4.205924935961688, + "grad_norm": 0.013973893597722054, + "learning_rate": 4.817008198960547e-05, + "loss": 0.0859, + "num_input_tokens_seen": 45964064, + "step": 37765 + }, + { + "epoch": 4.206481790845306, + "grad_norm": 0.01534954458475113, + "learning_rate": 4.816916939901151e-05, + "loss": 0.1759, + "num_input_tokens_seen": 45970336, + "step": 37770 + }, + { + "epoch": 4.2070386457289235, + "grad_norm": 0.9126020073890686, + "learning_rate": 4.816825658956619e-05, + "loss": 0.1373, + "num_input_tokens_seen": 45976448, + "step": 37775 + }, + { + "epoch": 4.20759550061254, + "grad_norm": 1.2643040418624878, + "learning_rate": 4.816734356127811e-05, + "loss": 0.214, + "num_input_tokens_seen": 45981888, + "step": 37780 + }, + { + "epoch": 4.208152355496158, + "grad_norm": 0.564709484577179, + "learning_rate": 4.8166430314155917e-05, + "loss": 0.0389, + "num_input_tokens_seen": 45988096, + "step": 37785 + }, + { + "epoch": 4.208709210379775, + "grad_norm": 0.43347057700157166, + "learning_rate": 4.8165516848208224e-05, + "loss": 0.0397, + "num_input_tokens_seen": 45994336, + "step": 37790 + }, + { + "epoch": 4.209266065263392, + "grad_norm": 2.2821595668792725, + "learning_rate": 4.8164603163443665e-05, + "loss": 0.107, + "num_input_tokens_seen": 46000512, + "step": 37795 + }, + { + "epoch": 4.20982292014701, + "grad_norm": 0.4522585868835449, + "learning_rate": 4.816368925987088e-05, + "loss": 0.1423, + "num_input_tokens_seen": 46006816, + "step": 37800 + }, + { + "epoch": 4.210379775030627, + "grad_norm": 0.0933685302734375, + "learning_rate": 4.816277513749848e-05, + "loss": 0.0381, + "num_input_tokens_seen": 46013088, + "step": 37805 + }, + { + "epoch": 4.2109366299142446, + "grad_norm": 0.1019745022058487, + "learning_rate": 4.816186079633512e-05, + "loss": 0.0219, + "num_input_tokens_seen": 46019200, + "step": 37810 + }, + { + "epoch": 4.211493484797861, + "grad_norm": 0.34156644344329834, + "learning_rate": 4.816094623638942e-05, + "loss": 0.0542, + "num_input_tokens_seen": 46025440, + "step": 37815 + }, + { + "epoch": 4.212050339681479, + "grad_norm": 0.6002677083015442, + "learning_rate": 4.816003145767003e-05, + "loss": 0.0236, + "num_input_tokens_seen": 46031584, + "step": 37820 + }, + { + "epoch": 4.212607194565097, + "grad_norm": 0.03527368977665901, + "learning_rate": 4.815911646018559e-05, + "loss": 0.0761, + "num_input_tokens_seen": 46037568, + "step": 37825 + }, + { + "epoch": 4.2131640494487135, + "grad_norm": 1.3298263549804688, + "learning_rate": 4.8158201243944735e-05, + "loss": 0.1198, + "num_input_tokens_seen": 46043488, + "step": 37830 + }, + { + "epoch": 4.213720904332331, + "grad_norm": 0.6930727362632751, + "learning_rate": 4.815728580895612e-05, + "loss": 0.1656, + "num_input_tokens_seen": 46049728, + "step": 37835 + }, + { + "epoch": 4.214277759215948, + "grad_norm": 0.5431931614875793, + "learning_rate": 4.815637015522838e-05, + "loss": 0.0473, + "num_input_tokens_seen": 46056032, + "step": 37840 + }, + { + "epoch": 4.214834614099566, + "grad_norm": 0.34392210841178894, + "learning_rate": 4.8155454282770177e-05, + "loss": 0.1263, + "num_input_tokens_seen": 46062048, + "step": 37845 + }, + { + "epoch": 4.215391468983183, + "grad_norm": 0.327737420797348, + "learning_rate": 4.815453819159016e-05, + "loss": 0.0673, + "num_input_tokens_seen": 46068544, + "step": 37850 + }, + { + "epoch": 4.2159483238668, + "grad_norm": 0.6581917405128479, + "learning_rate": 4.8153621881696974e-05, + "loss": 0.0852, + "num_input_tokens_seen": 46074368, + "step": 37855 + }, + { + "epoch": 4.216505178750418, + "grad_norm": 1.077151894569397, + "learning_rate": 4.815270535309928e-05, + "loss": 0.0266, + "num_input_tokens_seen": 46080640, + "step": 37860 + }, + { + "epoch": 4.2170620336340345, + "grad_norm": 1.7169941663742065, + "learning_rate": 4.815178860580573e-05, + "loss": 0.1367, + "num_input_tokens_seen": 46086048, + "step": 37865 + }, + { + "epoch": 4.217618888517652, + "grad_norm": 0.12521949410438538, + "learning_rate": 4.815087163982499e-05, + "loss": 0.1141, + "num_input_tokens_seen": 46091520, + "step": 37870 + }, + { + "epoch": 4.21817574340127, + "grad_norm": 0.27860382199287415, + "learning_rate": 4.814995445516572e-05, + "loss": 0.0418, + "num_input_tokens_seen": 46097568, + "step": 37875 + }, + { + "epoch": 4.218732598284887, + "grad_norm": 0.48826223611831665, + "learning_rate": 4.814903705183659e-05, + "loss": 0.1334, + "num_input_tokens_seen": 46103072, + "step": 37880 + }, + { + "epoch": 4.219289453168504, + "grad_norm": 0.15901675820350647, + "learning_rate": 4.814811942984625e-05, + "loss": 0.0252, + "num_input_tokens_seen": 46108608, + "step": 37885 + }, + { + "epoch": 4.219846308052122, + "grad_norm": 0.10663303732872009, + "learning_rate": 4.814720158920337e-05, + "loss": 0.0558, + "num_input_tokens_seen": 46114656, + "step": 37890 + }, + { + "epoch": 4.220403162935739, + "grad_norm": 0.14051465690135956, + "learning_rate": 4.8146283529916636e-05, + "loss": 0.0784, + "num_input_tokens_seen": 46121120, + "step": 37895 + }, + { + "epoch": 4.2209600178193565, + "grad_norm": 0.07129345089197159, + "learning_rate": 4.814536525199471e-05, + "loss": 0.0213, + "num_input_tokens_seen": 46127264, + "step": 37900 + }, + { + "epoch": 4.221516872702973, + "grad_norm": 0.29709577560424805, + "learning_rate": 4.814444675544626e-05, + "loss": 0.0196, + "num_input_tokens_seen": 46133408, + "step": 37905 + }, + { + "epoch": 4.222073727586591, + "grad_norm": 0.18949352204799652, + "learning_rate": 4.8143528040279975e-05, + "loss": 0.0351, + "num_input_tokens_seen": 46139584, + "step": 37910 + }, + { + "epoch": 4.222630582470209, + "grad_norm": 0.021761886775493622, + "learning_rate": 4.814260910650452e-05, + "loss": 0.0551, + "num_input_tokens_seen": 46145728, + "step": 37915 + }, + { + "epoch": 4.223187437353825, + "grad_norm": 0.17825783789157867, + "learning_rate": 4.814168995412858e-05, + "loss": 0.0599, + "num_input_tokens_seen": 46151776, + "step": 37920 + }, + { + "epoch": 4.223744292237443, + "grad_norm": 0.12082009762525558, + "learning_rate": 4.8140770583160835e-05, + "loss": 0.0272, + "num_input_tokens_seen": 46158016, + "step": 37925 + }, + { + "epoch": 4.22430114712106, + "grad_norm": 0.7492979764938354, + "learning_rate": 4.813985099360998e-05, + "loss": 0.0858, + "num_input_tokens_seen": 46163872, + "step": 37930 + }, + { + "epoch": 4.2248580020046775, + "grad_norm": 0.03772272542119026, + "learning_rate": 4.813893118548468e-05, + "loss": 0.0105, + "num_input_tokens_seen": 46170016, + "step": 37935 + }, + { + "epoch": 4.225414856888295, + "grad_norm": 0.1630227416753769, + "learning_rate": 4.813801115879365e-05, + "loss": 0.1088, + "num_input_tokens_seen": 46176608, + "step": 37940 + }, + { + "epoch": 4.225971711771912, + "grad_norm": 1.015783429145813, + "learning_rate": 4.8137090913545555e-05, + "loss": 0.1255, + "num_input_tokens_seen": 46182752, + "step": 37945 + }, + { + "epoch": 4.22652856665553, + "grad_norm": 0.13581766188144684, + "learning_rate": 4.8136170449749104e-05, + "loss": 0.0835, + "num_input_tokens_seen": 46188992, + "step": 37950 + }, + { + "epoch": 4.227085421539147, + "grad_norm": 1.3302377462387085, + "learning_rate": 4.8135249767412996e-05, + "loss": 0.1531, + "num_input_tokens_seen": 46195040, + "step": 37955 + }, + { + "epoch": 4.227642276422764, + "grad_norm": 1.9193801879882812, + "learning_rate": 4.813432886654591e-05, + "loss": 0.1239, + "num_input_tokens_seen": 46201056, + "step": 37960 + }, + { + "epoch": 4.228199131306382, + "grad_norm": 0.05166763812303543, + "learning_rate": 4.8133407747156556e-05, + "loss": 0.1163, + "num_input_tokens_seen": 46207392, + "step": 37965 + }, + { + "epoch": 4.228755986189999, + "grad_norm": 0.06091385707259178, + "learning_rate": 4.813248640925363e-05, + "loss": 0.0651, + "num_input_tokens_seen": 46213376, + "step": 37970 + }, + { + "epoch": 4.229312841073616, + "grad_norm": 0.056264616549015045, + "learning_rate": 4.8131564852845836e-05, + "loss": 0.0408, + "num_input_tokens_seen": 46219648, + "step": 37975 + }, + { + "epoch": 4.229869695957234, + "grad_norm": 0.005316336639225483, + "learning_rate": 4.813064307794187e-05, + "loss": 0.0736, + "num_input_tokens_seen": 46225824, + "step": 37980 + }, + { + "epoch": 4.230426550840851, + "grad_norm": 0.19917047023773193, + "learning_rate": 4.812972108455046e-05, + "loss": 0.1098, + "num_input_tokens_seen": 46232032, + "step": 37985 + }, + { + "epoch": 4.230983405724468, + "grad_norm": 0.6699984073638916, + "learning_rate": 4.8128798872680306e-05, + "loss": 0.1049, + "num_input_tokens_seen": 46237760, + "step": 37990 + }, + { + "epoch": 4.231540260608085, + "grad_norm": 0.999988317489624, + "learning_rate": 4.8127876442340105e-05, + "loss": 0.1593, + "num_input_tokens_seen": 46243808, + "step": 37995 + }, + { + "epoch": 4.232097115491703, + "grad_norm": 1.2955440282821655, + "learning_rate": 4.812695379353859e-05, + "loss": 0.1865, + "num_input_tokens_seen": 46250048, + "step": 38000 + }, + { + "epoch": 4.2326539703753205, + "grad_norm": 0.01191494707018137, + "learning_rate": 4.812603092628446e-05, + "loss": 0.0257, + "num_input_tokens_seen": 46256256, + "step": 38005 + }, + { + "epoch": 4.233210825258937, + "grad_norm": 1.1749076843261719, + "learning_rate": 4.812510784058644e-05, + "loss": 0.1532, + "num_input_tokens_seen": 46262464, + "step": 38010 + }, + { + "epoch": 4.233767680142555, + "grad_norm": 0.05392429605126381, + "learning_rate": 4.812418453645325e-05, + "loss": 0.0424, + "num_input_tokens_seen": 46268480, + "step": 38015 + }, + { + "epoch": 4.234324535026172, + "grad_norm": 0.6433127522468567, + "learning_rate": 4.812326101389362e-05, + "loss": 0.0931, + "num_input_tokens_seen": 46274848, + "step": 38020 + }, + { + "epoch": 4.2348813899097895, + "grad_norm": 1.2677159309387207, + "learning_rate": 4.812233727291625e-05, + "loss": 0.0811, + "num_input_tokens_seen": 46280768, + "step": 38025 + }, + { + "epoch": 4.235438244793407, + "grad_norm": 0.21060627698898315, + "learning_rate": 4.812141331352989e-05, + "loss": 0.0632, + "num_input_tokens_seen": 46286816, + "step": 38030 + }, + { + "epoch": 4.235995099677024, + "grad_norm": 2.1105780601501465, + "learning_rate": 4.8120489135743255e-05, + "loss": 0.2795, + "num_input_tokens_seen": 46292864, + "step": 38035 + }, + { + "epoch": 4.236551954560642, + "grad_norm": 0.03506507724523544, + "learning_rate": 4.8119564739565074e-05, + "loss": 0.0159, + "num_input_tokens_seen": 46298976, + "step": 38040 + }, + { + "epoch": 4.237108809444258, + "grad_norm": 0.033487848937511444, + "learning_rate": 4.811864012500408e-05, + "loss": 0.1242, + "num_input_tokens_seen": 46304672, + "step": 38045 + }, + { + "epoch": 4.237665664327876, + "grad_norm": 0.20995746552944183, + "learning_rate": 4.8117715292069004e-05, + "loss": 0.0725, + "num_input_tokens_seen": 46310976, + "step": 38050 + }, + { + "epoch": 4.238222519211494, + "grad_norm": 0.7521361708641052, + "learning_rate": 4.8116790240768586e-05, + "loss": 0.0853, + "num_input_tokens_seen": 46316768, + "step": 38055 + }, + { + "epoch": 4.2387793740951105, + "grad_norm": 2.192619800567627, + "learning_rate": 4.811586497111157e-05, + "loss": 0.0691, + "num_input_tokens_seen": 46322624, + "step": 38060 + }, + { + "epoch": 4.239336228978728, + "grad_norm": 0.8146328926086426, + "learning_rate": 4.811493948310669e-05, + "loss": 0.0833, + "num_input_tokens_seen": 46328352, + "step": 38065 + }, + { + "epoch": 4.239893083862346, + "grad_norm": 0.31563276052474976, + "learning_rate": 4.8114013776762677e-05, + "loss": 0.0581, + "num_input_tokens_seen": 46334688, + "step": 38070 + }, + { + "epoch": 4.240449938745963, + "grad_norm": 0.01140978280454874, + "learning_rate": 4.811308785208829e-05, + "loss": 0.057, + "num_input_tokens_seen": 46340576, + "step": 38075 + }, + { + "epoch": 4.24100679362958, + "grad_norm": 0.255811870098114, + "learning_rate": 4.811216170909227e-05, + "loss": 0.0596, + "num_input_tokens_seen": 46346272, + "step": 38080 + }, + { + "epoch": 4.241563648513197, + "grad_norm": 0.46413663029670715, + "learning_rate": 4.8111235347783377e-05, + "loss": 0.0742, + "num_input_tokens_seen": 46352160, + "step": 38085 + }, + { + "epoch": 4.242120503396815, + "grad_norm": 0.3209596872329712, + "learning_rate": 4.811030876817034e-05, + "loss": 0.1307, + "num_input_tokens_seen": 46358368, + "step": 38090 + }, + { + "epoch": 4.2426773582804325, + "grad_norm": 0.22989214956760406, + "learning_rate": 4.8109381970261915e-05, + "loss": 0.0374, + "num_input_tokens_seen": 46364480, + "step": 38095 + }, + { + "epoch": 4.243234213164049, + "grad_norm": 0.9846770167350769, + "learning_rate": 4.810845495406687e-05, + "loss": 0.0454, + "num_input_tokens_seen": 46370656, + "step": 38100 + }, + { + "epoch": 4.243791068047667, + "grad_norm": 0.332096129655838, + "learning_rate": 4.8107527719593954e-05, + "loss": 0.0544, + "num_input_tokens_seen": 46377184, + "step": 38105 + }, + { + "epoch": 4.244347922931284, + "grad_norm": 0.2813562750816345, + "learning_rate": 4.810660026685192e-05, + "loss": 0.1478, + "num_input_tokens_seen": 46383232, + "step": 38110 + }, + { + "epoch": 4.244904777814901, + "grad_norm": 0.01932750828564167, + "learning_rate": 4.810567259584954e-05, + "loss": 0.0639, + "num_input_tokens_seen": 46389664, + "step": 38115 + }, + { + "epoch": 4.245461632698519, + "grad_norm": 0.2756265103816986, + "learning_rate": 4.810474470659557e-05, + "loss": 0.0962, + "num_input_tokens_seen": 46395136, + "step": 38120 + }, + { + "epoch": 4.246018487582136, + "grad_norm": 0.9814038872718811, + "learning_rate": 4.810381659909877e-05, + "loss": 0.1357, + "num_input_tokens_seen": 46401568, + "step": 38125 + }, + { + "epoch": 4.2465753424657535, + "grad_norm": 0.14320841431617737, + "learning_rate": 4.8102888273367914e-05, + "loss": 0.0982, + "num_input_tokens_seen": 46407584, + "step": 38130 + }, + { + "epoch": 4.247132197349371, + "grad_norm": 0.18560931086540222, + "learning_rate": 4.8101959729411766e-05, + "loss": 0.0398, + "num_input_tokens_seen": 46413760, + "step": 38135 + }, + { + "epoch": 4.247689052232988, + "grad_norm": 0.013329904526472092, + "learning_rate": 4.81010309672391e-05, + "loss": 0.0199, + "num_input_tokens_seen": 46420096, + "step": 38140 + }, + { + "epoch": 4.248245907116606, + "grad_norm": 0.04436188563704491, + "learning_rate": 4.810010198685869e-05, + "loss": 0.0129, + "num_input_tokens_seen": 46426272, + "step": 38145 + }, + { + "epoch": 4.248802762000222, + "grad_norm": 0.2251884639263153, + "learning_rate": 4.809917278827931e-05, + "loss": 0.0149, + "num_input_tokens_seen": 46432608, + "step": 38150 + }, + { + "epoch": 4.24935961688384, + "grad_norm": 0.060427065938711166, + "learning_rate": 4.8098243371509746e-05, + "loss": 0.0646, + "num_input_tokens_seen": 46438624, + "step": 38155 + }, + { + "epoch": 4.249916471767458, + "grad_norm": 0.829677402973175, + "learning_rate": 4.809731373655875e-05, + "loss": 0.0776, + "num_input_tokens_seen": 46444736, + "step": 38160 + }, + { + "epoch": 4.250473326651075, + "grad_norm": 0.8510568737983704, + "learning_rate": 4.8096383883435126e-05, + "loss": 0.0575, + "num_input_tokens_seen": 46450656, + "step": 38165 + }, + { + "epoch": 4.251030181534692, + "grad_norm": 0.022850332781672478, + "learning_rate": 4.809545381214766e-05, + "loss": 0.06, + "num_input_tokens_seen": 46456576, + "step": 38170 + }, + { + "epoch": 4.251587036418309, + "grad_norm": 1.0402454137802124, + "learning_rate": 4.809452352270512e-05, + "loss": 0.0831, + "num_input_tokens_seen": 46462560, + "step": 38175 + }, + { + "epoch": 4.252143891301927, + "grad_norm": 0.16313111782073975, + "learning_rate": 4.80935930151163e-05, + "loss": 0.0625, + "num_input_tokens_seen": 46467744, + "step": 38180 + }, + { + "epoch": 4.252700746185544, + "grad_norm": 0.24543289840221405, + "learning_rate": 4.809266228939e-05, + "loss": 0.0964, + "num_input_tokens_seen": 46473760, + "step": 38185 + }, + { + "epoch": 4.253257601069161, + "grad_norm": 0.2698628306388855, + "learning_rate": 4.8091731345535e-05, + "loss": 0.1064, + "num_input_tokens_seen": 46480096, + "step": 38190 + }, + { + "epoch": 4.253814455952779, + "grad_norm": 1.0069165229797363, + "learning_rate": 4.80908001835601e-05, + "loss": 0.04, + "num_input_tokens_seen": 46486272, + "step": 38195 + }, + { + "epoch": 4.254371310836396, + "grad_norm": 0.6532930135726929, + "learning_rate": 4.808986880347409e-05, + "loss": 0.0564, + "num_input_tokens_seen": 46492448, + "step": 38200 + }, + { + "epoch": 4.254928165720013, + "grad_norm": 0.036134373396635056, + "learning_rate": 4.808893720528577e-05, + "loss": 0.0491, + "num_input_tokens_seen": 46498688, + "step": 38205 + }, + { + "epoch": 4.255485020603631, + "grad_norm": 0.15307104587554932, + "learning_rate": 4.808800538900393e-05, + "loss": 0.07, + "num_input_tokens_seen": 46505056, + "step": 38210 + }, + { + "epoch": 4.256041875487248, + "grad_norm": 0.19917930662631989, + "learning_rate": 4.808707335463739e-05, + "loss": 0.0301, + "num_input_tokens_seen": 46510816, + "step": 38215 + }, + { + "epoch": 4.2565987303708654, + "grad_norm": 0.5267328023910522, + "learning_rate": 4.808614110219495e-05, + "loss": 0.0713, + "num_input_tokens_seen": 46517088, + "step": 38220 + }, + { + "epoch": 4.257155585254482, + "grad_norm": 1.3672648668289185, + "learning_rate": 4.80852086316854e-05, + "loss": 0.0962, + "num_input_tokens_seen": 46523200, + "step": 38225 + }, + { + "epoch": 4.2577124401381, + "grad_norm": 0.003944865893572569, + "learning_rate": 4.808427594311756e-05, + "loss": 0.1162, + "num_input_tokens_seen": 46529536, + "step": 38230 + }, + { + "epoch": 4.258269295021718, + "grad_norm": 0.021481061354279518, + "learning_rate": 4.808334303650025e-05, + "loss": 0.074, + "num_input_tokens_seen": 46535424, + "step": 38235 + }, + { + "epoch": 4.258826149905334, + "grad_norm": 0.2120840847492218, + "learning_rate": 4.808240991184226e-05, + "loss": 0.0291, + "num_input_tokens_seen": 46541216, + "step": 38240 + }, + { + "epoch": 4.259383004788952, + "grad_norm": 0.03391275554895401, + "learning_rate": 4.808147656915242e-05, + "loss": 0.0599, + "num_input_tokens_seen": 46547488, + "step": 38245 + }, + { + "epoch": 4.25993985967257, + "grad_norm": 0.12366282939910889, + "learning_rate": 4.8080543008439544e-05, + "loss": 0.0661, + "num_input_tokens_seen": 46553856, + "step": 38250 + }, + { + "epoch": 4.2604967145561865, + "grad_norm": 1.430382251739502, + "learning_rate": 4.807960922971244e-05, + "loss": 0.0771, + "num_input_tokens_seen": 46559936, + "step": 38255 + }, + { + "epoch": 4.261053569439804, + "grad_norm": 0.11855270713567734, + "learning_rate": 4.807867523297994e-05, + "loss": 0.0366, + "num_input_tokens_seen": 46566080, + "step": 38260 + }, + { + "epoch": 4.261610424323421, + "grad_norm": 0.4799109399318695, + "learning_rate": 4.8077741018250864e-05, + "loss": 0.0433, + "num_input_tokens_seen": 46572384, + "step": 38265 + }, + { + "epoch": 4.262167279207039, + "grad_norm": 0.7821618914604187, + "learning_rate": 4.807680658553403e-05, + "loss": 0.068, + "num_input_tokens_seen": 46578816, + "step": 38270 + }, + { + "epoch": 4.262724134090656, + "grad_norm": 0.41262874007225037, + "learning_rate": 4.807587193483827e-05, + "loss": 0.0305, + "num_input_tokens_seen": 46585088, + "step": 38275 + }, + { + "epoch": 4.263280988974273, + "grad_norm": 0.0008255152497440577, + "learning_rate": 4.8074937066172413e-05, + "loss": 0.0404, + "num_input_tokens_seen": 46591104, + "step": 38280 + }, + { + "epoch": 4.263837843857891, + "grad_norm": 0.7485998272895813, + "learning_rate": 4.807400197954529e-05, + "loss": 0.0934, + "num_input_tokens_seen": 46597024, + "step": 38285 + }, + { + "epoch": 4.264394698741508, + "grad_norm": 0.002077328972518444, + "learning_rate": 4.8073066674965725e-05, + "loss": 0.0441, + "num_input_tokens_seen": 46603040, + "step": 38290 + }, + { + "epoch": 4.264951553625125, + "grad_norm": 0.9050670862197876, + "learning_rate": 4.807213115244257e-05, + "loss": 0.1402, + "num_input_tokens_seen": 46608832, + "step": 38295 + }, + { + "epoch": 4.265508408508743, + "grad_norm": 0.14204800128936768, + "learning_rate": 4.807119541198464e-05, + "loss": 0.0217, + "num_input_tokens_seen": 46614784, + "step": 38300 + }, + { + "epoch": 4.26606526339236, + "grad_norm": 2.9690897464752197, + "learning_rate": 4.8070259453600794e-05, + "loss": 0.106, + "num_input_tokens_seen": 46620928, + "step": 38305 + }, + { + "epoch": 4.266622118275977, + "grad_norm": 0.04265642538666725, + "learning_rate": 4.806932327729986e-05, + "loss": 0.01, + "num_input_tokens_seen": 46626880, + "step": 38310 + }, + { + "epoch": 4.267178973159595, + "grad_norm": 0.0373072512447834, + "learning_rate": 4.806838688309069e-05, + "loss": 0.0539, + "num_input_tokens_seen": 46633216, + "step": 38315 + }, + { + "epoch": 4.267735828043212, + "grad_norm": 1.6908904314041138, + "learning_rate": 4.806745027098212e-05, + "loss": 0.1521, + "num_input_tokens_seen": 46639136, + "step": 38320 + }, + { + "epoch": 4.2682926829268295, + "grad_norm": 0.19338199496269226, + "learning_rate": 4.8066513440983e-05, + "loss": 0.0226, + "num_input_tokens_seen": 46645120, + "step": 38325 + }, + { + "epoch": 4.268849537810446, + "grad_norm": 0.08478112518787384, + "learning_rate": 4.8065576393102174e-05, + "loss": 0.101, + "num_input_tokens_seen": 46651232, + "step": 38330 + }, + { + "epoch": 4.269406392694064, + "grad_norm": 0.5999385118484497, + "learning_rate": 4.8064639127348504e-05, + "loss": 0.0449, + "num_input_tokens_seen": 46657568, + "step": 38335 + }, + { + "epoch": 4.269963247577682, + "grad_norm": 0.8780233860015869, + "learning_rate": 4.806370164373084e-05, + "loss": 0.1186, + "num_input_tokens_seen": 46663936, + "step": 38340 + }, + { + "epoch": 4.270520102461298, + "grad_norm": 0.025823937729001045, + "learning_rate": 4.806276394225803e-05, + "loss": 0.0669, + "num_input_tokens_seen": 46669664, + "step": 38345 + }, + { + "epoch": 4.271076957344916, + "grad_norm": 0.44880083203315735, + "learning_rate": 4.8061826022938945e-05, + "loss": 0.1003, + "num_input_tokens_seen": 46676192, + "step": 38350 + }, + { + "epoch": 4.271633812228533, + "grad_norm": 1.0020966529846191, + "learning_rate": 4.806088788578244e-05, + "loss": 0.0358, + "num_input_tokens_seen": 46682144, + "step": 38355 + }, + { + "epoch": 4.272190667112151, + "grad_norm": 0.006051107309758663, + "learning_rate": 4.8059949530797355e-05, + "loss": 0.0578, + "num_input_tokens_seen": 46688352, + "step": 38360 + }, + { + "epoch": 4.272747521995768, + "grad_norm": 0.8385738134384155, + "learning_rate": 4.8059010957992585e-05, + "loss": 0.1076, + "num_input_tokens_seen": 46694368, + "step": 38365 + }, + { + "epoch": 4.273304376879385, + "grad_norm": 0.3004896342754364, + "learning_rate": 4.805807216737698e-05, + "loss": 0.1118, + "num_input_tokens_seen": 46700192, + "step": 38370 + }, + { + "epoch": 4.273861231763003, + "grad_norm": 0.41112151741981506, + "learning_rate": 4.805713315895941e-05, + "loss": 0.105, + "num_input_tokens_seen": 46706176, + "step": 38375 + }, + { + "epoch": 4.2744180866466195, + "grad_norm": 0.6756104230880737, + "learning_rate": 4.805619393274874e-05, + "loss": 0.1511, + "num_input_tokens_seen": 46712192, + "step": 38380 + }, + { + "epoch": 4.274974941530237, + "grad_norm": 0.0005640948074869812, + "learning_rate": 4.805525448875385e-05, + "loss": 0.0241, + "num_input_tokens_seen": 46718464, + "step": 38385 + }, + { + "epoch": 4.275531796413855, + "grad_norm": 0.9438830018043518, + "learning_rate": 4.8054314826983606e-05, + "loss": 0.0737, + "num_input_tokens_seen": 46724800, + "step": 38390 + }, + { + "epoch": 4.276088651297472, + "grad_norm": 0.0035611374769359827, + "learning_rate": 4.805337494744689e-05, + "loss": 0.0175, + "num_input_tokens_seen": 46730816, + "step": 38395 + }, + { + "epoch": 4.276645506181089, + "grad_norm": 0.002736555179581046, + "learning_rate": 4.8052434850152584e-05, + "loss": 0.0307, + "num_input_tokens_seen": 46736992, + "step": 38400 + }, + { + "epoch": 4.277202361064706, + "grad_norm": 1.296380877494812, + "learning_rate": 4.805149453510956e-05, + "loss": 0.0975, + "num_input_tokens_seen": 46743008, + "step": 38405 + }, + { + "epoch": 4.277759215948324, + "grad_norm": 0.00735858827829361, + "learning_rate": 4.805055400232669e-05, + "loss": 0.2585, + "num_input_tokens_seen": 46749152, + "step": 38410 + }, + { + "epoch": 4.278316070831941, + "grad_norm": 0.7667734622955322, + "learning_rate": 4.804961325181288e-05, + "loss": 0.2031, + "num_input_tokens_seen": 46755328, + "step": 38415 + }, + { + "epoch": 4.278872925715558, + "grad_norm": 0.09485483914613724, + "learning_rate": 4.8048672283577e-05, + "loss": 0.0372, + "num_input_tokens_seen": 46760928, + "step": 38420 + }, + { + "epoch": 4.279429780599176, + "grad_norm": 0.8618603348731995, + "learning_rate": 4.804773109762795e-05, + "loss": 0.0392, + "num_input_tokens_seen": 46767040, + "step": 38425 + }, + { + "epoch": 4.279986635482794, + "grad_norm": 1.021087408065796, + "learning_rate": 4.80467896939746e-05, + "loss": 0.1149, + "num_input_tokens_seen": 46773280, + "step": 38430 + }, + { + "epoch": 4.28054349036641, + "grad_norm": 0.265015184879303, + "learning_rate": 4.804584807262587e-05, + "loss": 0.1348, + "num_input_tokens_seen": 46779648, + "step": 38435 + }, + { + "epoch": 4.281100345250028, + "grad_norm": 0.06057964637875557, + "learning_rate": 4.8044906233590634e-05, + "loss": 0.0677, + "num_input_tokens_seen": 46786016, + "step": 38440 + }, + { + "epoch": 4.281657200133645, + "grad_norm": 0.06791790574789047, + "learning_rate": 4.804396417687781e-05, + "loss": 0.1248, + "num_input_tokens_seen": 46792352, + "step": 38445 + }, + { + "epoch": 4.2822140550172625, + "grad_norm": 0.3951103985309601, + "learning_rate": 4.804302190249626e-05, + "loss": 0.0238, + "num_input_tokens_seen": 46798656, + "step": 38450 + }, + { + "epoch": 4.28277090990088, + "grad_norm": 1.99754798412323, + "learning_rate": 4.804207941045493e-05, + "loss": 0.2175, + "num_input_tokens_seen": 46804480, + "step": 38455 + }, + { + "epoch": 4.283327764784497, + "grad_norm": 0.8249824643135071, + "learning_rate": 4.804113670076268e-05, + "loss": 0.108, + "num_input_tokens_seen": 46810592, + "step": 38460 + }, + { + "epoch": 4.283884619668115, + "grad_norm": 0.6039166450500488, + "learning_rate": 4.804019377342844e-05, + "loss": 0.0598, + "num_input_tokens_seen": 46816576, + "step": 38465 + }, + { + "epoch": 4.284441474551731, + "grad_norm": 0.5957629680633545, + "learning_rate": 4.803925062846111e-05, + "loss": 0.0156, + "num_input_tokens_seen": 46822656, + "step": 38470 + }, + { + "epoch": 4.284998329435349, + "grad_norm": 0.017992110922932625, + "learning_rate": 4.8038307265869605e-05, + "loss": 0.0208, + "num_input_tokens_seen": 46829088, + "step": 38475 + }, + { + "epoch": 4.285555184318967, + "grad_norm": 0.11206866055727005, + "learning_rate": 4.8037363685662824e-05, + "loss": 0.0965, + "num_input_tokens_seen": 46835072, + "step": 38480 + }, + { + "epoch": 4.2861120392025835, + "grad_norm": 1.202816367149353, + "learning_rate": 4.803641988784968e-05, + "loss": 0.1822, + "num_input_tokens_seen": 46841440, + "step": 38485 + }, + { + "epoch": 4.286668894086201, + "grad_norm": 2.43709397315979, + "learning_rate": 4.803547587243911e-05, + "loss": 0.1217, + "num_input_tokens_seen": 46847296, + "step": 38490 + }, + { + "epoch": 4.287225748969819, + "grad_norm": 0.13961859047412872, + "learning_rate": 4.803453163944e-05, + "loss": 0.0756, + "num_input_tokens_seen": 46853120, + "step": 38495 + }, + { + "epoch": 4.287782603853436, + "grad_norm": 0.1653628796339035, + "learning_rate": 4.803358718886128e-05, + "loss": 0.0805, + "num_input_tokens_seen": 46859168, + "step": 38500 + }, + { + "epoch": 4.288339458737053, + "grad_norm": 0.0358998142182827, + "learning_rate": 4.803264252071189e-05, + "loss": 0.098, + "num_input_tokens_seen": 46865248, + "step": 38505 + }, + { + "epoch": 4.28889631362067, + "grad_norm": 0.26340046525001526, + "learning_rate": 4.8031697635000734e-05, + "loss": 0.0921, + "num_input_tokens_seen": 46871168, + "step": 38510 + }, + { + "epoch": 4.289453168504288, + "grad_norm": 0.17515133321285248, + "learning_rate": 4.803075253173673e-05, + "loss": 0.0187, + "num_input_tokens_seen": 46877312, + "step": 38515 + }, + { + "epoch": 4.2900100233879055, + "grad_norm": 0.9818094372749329, + "learning_rate": 4.8029807210928834e-05, + "loss": 0.1512, + "num_input_tokens_seen": 46883616, + "step": 38520 + }, + { + "epoch": 4.290566878271522, + "grad_norm": 0.008203892968595028, + "learning_rate": 4.8028861672585946e-05, + "loss": 0.0278, + "num_input_tokens_seen": 46889632, + "step": 38525 + }, + { + "epoch": 4.29112373315514, + "grad_norm": 1.0549522638320923, + "learning_rate": 4.8027915916717015e-05, + "loss": 0.0777, + "num_input_tokens_seen": 46895872, + "step": 38530 + }, + { + "epoch": 4.291680588038757, + "grad_norm": 0.02060491219162941, + "learning_rate": 4.802696994333096e-05, + "loss": 0.0294, + "num_input_tokens_seen": 46902080, + "step": 38535 + }, + { + "epoch": 4.292237442922374, + "grad_norm": 0.953876256942749, + "learning_rate": 4.8026023752436735e-05, + "loss": 0.1212, + "num_input_tokens_seen": 46908096, + "step": 38540 + }, + { + "epoch": 4.292794297805992, + "grad_norm": 0.26645129919052124, + "learning_rate": 4.8025077344043254e-05, + "loss": 0.0388, + "num_input_tokens_seen": 46914080, + "step": 38545 + }, + { + "epoch": 4.293351152689609, + "grad_norm": 0.4347662627696991, + "learning_rate": 4.802413071815948e-05, + "loss": 0.0431, + "num_input_tokens_seen": 46920384, + "step": 38550 + }, + { + "epoch": 4.2939080075732265, + "grad_norm": 0.09475275129079819, + "learning_rate": 4.802318387479435e-05, + "loss": 0.1594, + "num_input_tokens_seen": 46926400, + "step": 38555 + }, + { + "epoch": 4.294464862456843, + "grad_norm": 0.3386205732822418, + "learning_rate": 4.8022236813956786e-05, + "loss": 0.0409, + "num_input_tokens_seen": 46932128, + "step": 38560 + }, + { + "epoch": 4.295021717340461, + "grad_norm": 0.010443238541483879, + "learning_rate": 4.8021289535655766e-05, + "loss": 0.0649, + "num_input_tokens_seen": 46938112, + "step": 38565 + }, + { + "epoch": 4.295578572224079, + "grad_norm": 0.6680684089660645, + "learning_rate": 4.802034203990021e-05, + "loss": 0.017, + "num_input_tokens_seen": 46944320, + "step": 38570 + }, + { + "epoch": 4.2961354271076955, + "grad_norm": 0.00513472780585289, + "learning_rate": 4.801939432669909e-05, + "loss": 0.0652, + "num_input_tokens_seen": 46950848, + "step": 38575 + }, + { + "epoch": 4.296692281991313, + "grad_norm": 0.8629980683326721, + "learning_rate": 4.8018446396061344e-05, + "loss": 0.0915, + "num_input_tokens_seen": 46957120, + "step": 38580 + }, + { + "epoch": 4.29724913687493, + "grad_norm": 0.6618045568466187, + "learning_rate": 4.8017498247995926e-05, + "loss": 0.2251, + "num_input_tokens_seen": 46962464, + "step": 38585 + }, + { + "epoch": 4.297805991758548, + "grad_norm": 0.18589216470718384, + "learning_rate": 4.801654988251181e-05, + "loss": 0.0626, + "num_input_tokens_seen": 46968416, + "step": 38590 + }, + { + "epoch": 4.298362846642165, + "grad_norm": 0.05535326525568962, + "learning_rate": 4.8015601299617926e-05, + "loss": 0.0664, + "num_input_tokens_seen": 46974432, + "step": 38595 + }, + { + "epoch": 4.298919701525782, + "grad_norm": 0.05003555491566658, + "learning_rate": 4.801465249932325e-05, + "loss": 0.0446, + "num_input_tokens_seen": 46980608, + "step": 38600 + }, + { + "epoch": 4.2994765564094, + "grad_norm": 0.22871516644954681, + "learning_rate": 4.801370348163674e-05, + "loss": 0.0621, + "num_input_tokens_seen": 46986752, + "step": 38605 + }, + { + "epoch": 4.300033411293017, + "grad_norm": 0.2214081883430481, + "learning_rate": 4.8012754246567364e-05, + "loss": 0.0983, + "num_input_tokens_seen": 46992960, + "step": 38610 + }, + { + "epoch": 4.300590266176634, + "grad_norm": 0.13934990763664246, + "learning_rate": 4.8011804794124096e-05, + "loss": 0.0355, + "num_input_tokens_seen": 46999168, + "step": 38615 + }, + { + "epoch": 4.301147121060252, + "grad_norm": 0.006304586306214333, + "learning_rate": 4.8010855124315876e-05, + "loss": 0.029, + "num_input_tokens_seen": 47004928, + "step": 38620 + }, + { + "epoch": 4.301703975943869, + "grad_norm": 0.04593060538172722, + "learning_rate": 4.800990523715171e-05, + "loss": 0.0171, + "num_input_tokens_seen": 47011104, + "step": 38625 + }, + { + "epoch": 4.302260830827486, + "grad_norm": 0.14315356314182281, + "learning_rate": 4.8008955132640546e-05, + "loss": 0.1015, + "num_input_tokens_seen": 47017152, + "step": 38630 + }, + { + "epoch": 4.302817685711104, + "grad_norm": 0.5009229183197021, + "learning_rate": 4.800800481079137e-05, + "loss": 0.1076, + "num_input_tokens_seen": 47023104, + "step": 38635 + }, + { + "epoch": 4.303374540594721, + "grad_norm": 0.0034442434553056955, + "learning_rate": 4.800705427161315e-05, + "loss": 0.0206, + "num_input_tokens_seen": 47029248, + "step": 38640 + }, + { + "epoch": 4.3039313954783385, + "grad_norm": 1.9111711978912354, + "learning_rate": 4.800610351511488e-05, + "loss": 0.0665, + "num_input_tokens_seen": 47035104, + "step": 38645 + }, + { + "epoch": 4.304488250361955, + "grad_norm": 0.06543108075857162, + "learning_rate": 4.800515254130552e-05, + "loss": 0.0537, + "num_input_tokens_seen": 47041152, + "step": 38650 + }, + { + "epoch": 4.305045105245573, + "grad_norm": 0.5009708404541016, + "learning_rate": 4.800420135019406e-05, + "loss": 0.0558, + "num_input_tokens_seen": 47047104, + "step": 38655 + }, + { + "epoch": 4.305601960129191, + "grad_norm": 0.25150150060653687, + "learning_rate": 4.8003249941789495e-05, + "loss": 0.009, + "num_input_tokens_seen": 47053440, + "step": 38660 + }, + { + "epoch": 4.306158815012807, + "grad_norm": 0.47982171177864075, + "learning_rate": 4.8002298316100804e-05, + "loss": 0.065, + "num_input_tokens_seen": 47059712, + "step": 38665 + }, + { + "epoch": 4.306715669896425, + "grad_norm": 1.2884222269058228, + "learning_rate": 4.8001346473136975e-05, + "loss": 0.1201, + "num_input_tokens_seen": 47065728, + "step": 38670 + }, + { + "epoch": 4.307272524780043, + "grad_norm": 0.9888564348220825, + "learning_rate": 4.8000394412907e-05, + "loss": 0.0824, + "num_input_tokens_seen": 47071808, + "step": 38675 + }, + { + "epoch": 4.3078293796636595, + "grad_norm": 0.1039036437869072, + "learning_rate": 4.799944213541987e-05, + "loss": 0.1105, + "num_input_tokens_seen": 47077600, + "step": 38680 + }, + { + "epoch": 4.308386234547277, + "grad_norm": 0.001238155527971685, + "learning_rate": 4.799848964068458e-05, + "loss": 0.0191, + "num_input_tokens_seen": 47083840, + "step": 38685 + }, + { + "epoch": 4.308943089430894, + "grad_norm": 0.38883504271507263, + "learning_rate": 4.799753692871014e-05, + "loss": 0.063, + "num_input_tokens_seen": 47090016, + "step": 38690 + }, + { + "epoch": 4.309499944314512, + "grad_norm": 0.007445171941071749, + "learning_rate": 4.7996583999505526e-05, + "loss": 0.013, + "num_input_tokens_seen": 47096352, + "step": 38695 + }, + { + "epoch": 4.310056799198129, + "grad_norm": 0.7941866517066956, + "learning_rate": 4.799563085307977e-05, + "loss": 0.068, + "num_input_tokens_seen": 47102208, + "step": 38700 + }, + { + "epoch": 4.310613654081746, + "grad_norm": 0.24552130699157715, + "learning_rate": 4.799467748944184e-05, + "loss": 0.1108, + "num_input_tokens_seen": 47108448, + "step": 38705 + }, + { + "epoch": 4.311170508965364, + "grad_norm": 0.6921929717063904, + "learning_rate": 4.799372390860076e-05, + "loss": 0.0494, + "num_input_tokens_seen": 47114464, + "step": 38710 + }, + { + "epoch": 4.311727363848981, + "grad_norm": 1.4148873090744019, + "learning_rate": 4.799277011056554e-05, + "loss": 0.1351, + "num_input_tokens_seen": 47120032, + "step": 38715 + }, + { + "epoch": 4.312284218732598, + "grad_norm": 0.010656720958650112, + "learning_rate": 4.799181609534518e-05, + "loss": 0.0359, + "num_input_tokens_seen": 47125632, + "step": 38720 + }, + { + "epoch": 4.312841073616216, + "grad_norm": 1.219349980354309, + "learning_rate": 4.79908618629487e-05, + "loss": 0.0549, + "num_input_tokens_seen": 47131808, + "step": 38725 + }, + { + "epoch": 4.313397928499833, + "grad_norm": 0.5295855402946472, + "learning_rate": 4.7989907413385104e-05, + "loss": 0.0291, + "num_input_tokens_seen": 47137984, + "step": 38730 + }, + { + "epoch": 4.31395478338345, + "grad_norm": 0.9998118281364441, + "learning_rate": 4.798895274666342e-05, + "loss": 0.0452, + "num_input_tokens_seen": 47144064, + "step": 38735 + }, + { + "epoch": 4.314511638267067, + "grad_norm": 2.1491310596466064, + "learning_rate": 4.798799786279265e-05, + "loss": 0.1547, + "num_input_tokens_seen": 47149952, + "step": 38740 + }, + { + "epoch": 4.315068493150685, + "grad_norm": 1.602575421333313, + "learning_rate": 4.798704276178184e-05, + "loss": 0.0635, + "num_input_tokens_seen": 47155904, + "step": 38745 + }, + { + "epoch": 4.3156253480343025, + "grad_norm": 0.43069955706596375, + "learning_rate": 4.798608744363998e-05, + "loss": 0.024, + "num_input_tokens_seen": 47162208, + "step": 38750 + }, + { + "epoch": 4.316182202917919, + "grad_norm": 0.12670497596263885, + "learning_rate": 4.798513190837611e-05, + "loss": 0.0118, + "num_input_tokens_seen": 47168512, + "step": 38755 + }, + { + "epoch": 4.316739057801537, + "grad_norm": 1.0389671325683594, + "learning_rate": 4.798417615599926e-05, + "loss": 0.0468, + "num_input_tokens_seen": 47174016, + "step": 38760 + }, + { + "epoch": 4.317295912685154, + "grad_norm": 0.8726932406425476, + "learning_rate": 4.7983220186518444e-05, + "loss": 0.1209, + "num_input_tokens_seen": 47180160, + "step": 38765 + }, + { + "epoch": 4.3178527675687715, + "grad_norm": 1.078837513923645, + "learning_rate": 4.79822639999427e-05, + "loss": 0.0885, + "num_input_tokens_seen": 47186240, + "step": 38770 + }, + { + "epoch": 4.318409622452389, + "grad_norm": 1.1897915601730347, + "learning_rate": 4.798130759628107e-05, + "loss": 0.0873, + "num_input_tokens_seen": 47192320, + "step": 38775 + }, + { + "epoch": 4.318966477336006, + "grad_norm": 1.0711663961410522, + "learning_rate": 4.7980350975542566e-05, + "loss": 0.0709, + "num_input_tokens_seen": 47198304, + "step": 38780 + }, + { + "epoch": 4.319523332219624, + "grad_norm": 0.005046428181231022, + "learning_rate": 4.797939413773623e-05, + "loss": 0.0371, + "num_input_tokens_seen": 47204704, + "step": 38785 + }, + { + "epoch": 4.320080187103241, + "grad_norm": 1.4536319971084595, + "learning_rate": 4.797843708287112e-05, + "loss": 0.3146, + "num_input_tokens_seen": 47210656, + "step": 38790 + }, + { + "epoch": 4.320637041986858, + "grad_norm": 0.15145909786224365, + "learning_rate": 4.797747981095626e-05, + "loss": 0.105, + "num_input_tokens_seen": 47216832, + "step": 38795 + }, + { + "epoch": 4.321193896870476, + "grad_norm": 2.8321311473846436, + "learning_rate": 4.7976522322000684e-05, + "loss": 0.1238, + "num_input_tokens_seen": 47223104, + "step": 38800 + }, + { + "epoch": 4.3217507517540925, + "grad_norm": 0.01193542592227459, + "learning_rate": 4.7975564616013456e-05, + "loss": 0.165, + "num_input_tokens_seen": 47229248, + "step": 38805 + }, + { + "epoch": 4.32230760663771, + "grad_norm": 0.10509254038333893, + "learning_rate": 4.797460669300361e-05, + "loss": 0.1109, + "num_input_tokens_seen": 47235232, + "step": 38810 + }, + { + "epoch": 4.322864461521328, + "grad_norm": 0.3299846053123474, + "learning_rate": 4.79736485529802e-05, + "loss": 0.037, + "num_input_tokens_seen": 47241376, + "step": 38815 + }, + { + "epoch": 4.323421316404945, + "grad_norm": 0.11259903758764267, + "learning_rate": 4.7972690195952265e-05, + "loss": 0.0661, + "num_input_tokens_seen": 47247520, + "step": 38820 + }, + { + "epoch": 4.323978171288562, + "grad_norm": 0.45796456933021545, + "learning_rate": 4.7971731621928864e-05, + "loss": 0.13, + "num_input_tokens_seen": 47253344, + "step": 38825 + }, + { + "epoch": 4.32453502617218, + "grad_norm": 0.11618350446224213, + "learning_rate": 4.797077283091906e-05, + "loss": 0.0287, + "num_input_tokens_seen": 47259296, + "step": 38830 + }, + { + "epoch": 4.325091881055797, + "grad_norm": 1.057198405265808, + "learning_rate": 4.7969813822931904e-05, + "loss": 0.0944, + "num_input_tokens_seen": 47265568, + "step": 38835 + }, + { + "epoch": 4.3256487359394145, + "grad_norm": 0.0025437993463128805, + "learning_rate": 4.796885459797645e-05, + "loss": 0.0814, + "num_input_tokens_seen": 47271968, + "step": 38840 + }, + { + "epoch": 4.326205590823031, + "grad_norm": 0.9348620772361755, + "learning_rate": 4.796789515606176e-05, + "loss": 0.0257, + "num_input_tokens_seen": 47278528, + "step": 38845 + }, + { + "epoch": 4.326762445706649, + "grad_norm": 0.5171984434127808, + "learning_rate": 4.79669354971969e-05, + "loss": 0.0567, + "num_input_tokens_seen": 47284256, + "step": 38850 + }, + { + "epoch": 4.327319300590267, + "grad_norm": 0.08142539858818054, + "learning_rate": 4.796597562139093e-05, + "loss": 0.1366, + "num_input_tokens_seen": 47289984, + "step": 38855 + }, + { + "epoch": 4.327876155473883, + "grad_norm": 0.13747629523277283, + "learning_rate": 4.7965015528652934e-05, + "loss": 0.0237, + "num_input_tokens_seen": 47296128, + "step": 38860 + }, + { + "epoch": 4.328433010357501, + "grad_norm": 0.006836466025561094, + "learning_rate": 4.796405521899195e-05, + "loss": 0.1144, + "num_input_tokens_seen": 47302240, + "step": 38865 + }, + { + "epoch": 4.328989865241118, + "grad_norm": 0.8583465218544006, + "learning_rate": 4.7963094692417076e-05, + "loss": 0.0344, + "num_input_tokens_seen": 47308288, + "step": 38870 + }, + { + "epoch": 4.3295467201247355, + "grad_norm": 1.695102572441101, + "learning_rate": 4.796213394893738e-05, + "loss": 0.0568, + "num_input_tokens_seen": 47314336, + "step": 38875 + }, + { + "epoch": 4.330103575008353, + "grad_norm": 0.0026584910228848457, + "learning_rate": 4.796117298856192e-05, + "loss": 0.0834, + "num_input_tokens_seen": 47320096, + "step": 38880 + }, + { + "epoch": 4.33066042989197, + "grad_norm": 0.7141575217247009, + "learning_rate": 4.796021181129979e-05, + "loss": 0.1124, + "num_input_tokens_seen": 47325952, + "step": 38885 + }, + { + "epoch": 4.331217284775588, + "grad_norm": 0.0009879631688818336, + "learning_rate": 4.7959250417160064e-05, + "loss": 0.0248, + "num_input_tokens_seen": 47332288, + "step": 38890 + }, + { + "epoch": 4.331774139659204, + "grad_norm": 0.0005592309753410518, + "learning_rate": 4.795828880615182e-05, + "loss": 0.0474, + "num_input_tokens_seen": 47338432, + "step": 38895 + }, + { + "epoch": 4.332330994542822, + "grad_norm": 2.2928504943847656, + "learning_rate": 4.795732697828415e-05, + "loss": 0.1429, + "num_input_tokens_seen": 47344864, + "step": 38900 + }, + { + "epoch": 4.33288784942644, + "grad_norm": 0.12371467053890228, + "learning_rate": 4.795636493356613e-05, + "loss": 0.0614, + "num_input_tokens_seen": 47350784, + "step": 38905 + }, + { + "epoch": 4.333444704310057, + "grad_norm": 0.04211636260151863, + "learning_rate": 4.7955402672006854e-05, + "loss": 0.2371, + "num_input_tokens_seen": 47356320, + "step": 38910 + }, + { + "epoch": 4.334001559193674, + "grad_norm": 0.9009504318237305, + "learning_rate": 4.79544401936154e-05, + "loss": 0.1402, + "num_input_tokens_seen": 47362432, + "step": 38915 + }, + { + "epoch": 4.334558414077291, + "grad_norm": 0.42716678977012634, + "learning_rate": 4.795347749840088e-05, + "loss": 0.1289, + "num_input_tokens_seen": 47368160, + "step": 38920 + }, + { + "epoch": 4.335115268960909, + "grad_norm": 0.005363022908568382, + "learning_rate": 4.7952514586372365e-05, + "loss": 0.095, + "num_input_tokens_seen": 47373504, + "step": 38925 + }, + { + "epoch": 4.335672123844526, + "grad_norm": 1.0551214218139648, + "learning_rate": 4.795155145753897e-05, + "loss": 0.1108, + "num_input_tokens_seen": 47379872, + "step": 38930 + }, + { + "epoch": 4.336228978728143, + "grad_norm": 0.14198945462703705, + "learning_rate": 4.795058811190977e-05, + "loss": 0.1221, + "num_input_tokens_seen": 47386144, + "step": 38935 + }, + { + "epoch": 4.336785833611761, + "grad_norm": 0.15129831433296204, + "learning_rate": 4.7949624549493886e-05, + "loss": 0.0501, + "num_input_tokens_seen": 47392192, + "step": 38940 + }, + { + "epoch": 4.3373426884953785, + "grad_norm": 0.0022806981578469276, + "learning_rate": 4.794866077030041e-05, + "loss": 0.084, + "num_input_tokens_seen": 47398144, + "step": 38945 + }, + { + "epoch": 4.337899543378995, + "grad_norm": 1.5841546058654785, + "learning_rate": 4.794769677433845e-05, + "loss": 0.1674, + "num_input_tokens_seen": 47404160, + "step": 38950 + }, + { + "epoch": 4.338456398262613, + "grad_norm": 0.18978454172611237, + "learning_rate": 4.7946732561617105e-05, + "loss": 0.026, + "num_input_tokens_seen": 47410144, + "step": 38955 + }, + { + "epoch": 4.33901325314623, + "grad_norm": 0.3621743619441986, + "learning_rate": 4.7945768132145485e-05, + "loss": 0.0161, + "num_input_tokens_seen": 47416288, + "step": 38960 + }, + { + "epoch": 4.339570108029847, + "grad_norm": 0.014172517694532871, + "learning_rate": 4.79448034859327e-05, + "loss": 0.0447, + "num_input_tokens_seen": 47422496, + "step": 38965 + }, + { + "epoch": 4.340126962913465, + "grad_norm": 0.11486057937145233, + "learning_rate": 4.794383862298787e-05, + "loss": 0.008, + "num_input_tokens_seen": 47428640, + "step": 38970 + }, + { + "epoch": 4.340683817797082, + "grad_norm": 0.5701643824577332, + "learning_rate": 4.794287354332009e-05, + "loss": 0.0748, + "num_input_tokens_seen": 47434848, + "step": 38975 + }, + { + "epoch": 4.3412406726807, + "grad_norm": 0.4872084856033325, + "learning_rate": 4.79419082469385e-05, + "loss": 0.0794, + "num_input_tokens_seen": 47441024, + "step": 38980 + }, + { + "epoch": 4.341797527564316, + "grad_norm": 2.0246193408966064, + "learning_rate": 4.7940942733852204e-05, + "loss": 0.1246, + "num_input_tokens_seen": 47446880, + "step": 38985 + }, + { + "epoch": 4.342354382447934, + "grad_norm": 0.5342463254928589, + "learning_rate": 4.793997700407032e-05, + "loss": 0.0544, + "num_input_tokens_seen": 47453152, + "step": 38990 + }, + { + "epoch": 4.342911237331552, + "grad_norm": 1.5178190469741821, + "learning_rate": 4.7939011057601974e-05, + "loss": 0.0938, + "num_input_tokens_seen": 47458752, + "step": 38995 + }, + { + "epoch": 4.3434680922151685, + "grad_norm": 0.21868641674518585, + "learning_rate": 4.793804489445629e-05, + "loss": 0.0644, + "num_input_tokens_seen": 47464608, + "step": 39000 + }, + { + "epoch": 4.344024947098786, + "grad_norm": 1.3938748836517334, + "learning_rate": 4.79370785146424e-05, + "loss": 0.0886, + "num_input_tokens_seen": 47470688, + "step": 39005 + }, + { + "epoch": 4.344581801982404, + "grad_norm": 0.8665058612823486, + "learning_rate": 4.793611191816942e-05, + "loss": 0.1088, + "num_input_tokens_seen": 47476768, + "step": 39010 + }, + { + "epoch": 4.345138656866021, + "grad_norm": 0.22250911593437195, + "learning_rate": 4.793514510504649e-05, + "loss": 0.0625, + "num_input_tokens_seen": 47482976, + "step": 39015 + }, + { + "epoch": 4.345695511749638, + "grad_norm": 1.0408364534378052, + "learning_rate": 4.7934178075282736e-05, + "loss": 0.2352, + "num_input_tokens_seen": 47488992, + "step": 39020 + }, + { + "epoch": 4.346252366633255, + "grad_norm": 1.615061640739441, + "learning_rate": 4.79332108288873e-05, + "loss": 0.0669, + "num_input_tokens_seen": 47495264, + "step": 39025 + }, + { + "epoch": 4.346809221516873, + "grad_norm": 1.0812817811965942, + "learning_rate": 4.7932243365869315e-05, + "loss": 0.1239, + "num_input_tokens_seen": 47501568, + "step": 39030 + }, + { + "epoch": 4.34736607640049, + "grad_norm": 1.325738549232483, + "learning_rate": 4.793127568623791e-05, + "loss": 0.088, + "num_input_tokens_seen": 47507648, + "step": 39035 + }, + { + "epoch": 4.347922931284107, + "grad_norm": 0.08581066876649857, + "learning_rate": 4.7930307790002236e-05, + "loss": 0.0271, + "num_input_tokens_seen": 47513792, + "step": 39040 + }, + { + "epoch": 4.348479786167725, + "grad_norm": 0.1067323237657547, + "learning_rate": 4.7929339677171435e-05, + "loss": 0.0595, + "num_input_tokens_seen": 47519776, + "step": 39045 + }, + { + "epoch": 4.349036641051342, + "grad_norm": 1.535326361656189, + "learning_rate": 4.792837134775465e-05, + "loss": 0.1013, + "num_input_tokens_seen": 47525696, + "step": 39050 + }, + { + "epoch": 4.349593495934959, + "grad_norm": 1.655460238456726, + "learning_rate": 4.792740280176103e-05, + "loss": 0.0811, + "num_input_tokens_seen": 47531552, + "step": 39055 + }, + { + "epoch": 4.350150350818577, + "grad_norm": 0.618278980255127, + "learning_rate": 4.7926434039199725e-05, + "loss": 0.0493, + "num_input_tokens_seen": 47537696, + "step": 39060 + }, + { + "epoch": 4.350707205702194, + "grad_norm": 0.39609724283218384, + "learning_rate": 4.7925465060079876e-05, + "loss": 0.0226, + "num_input_tokens_seen": 47544192, + "step": 39065 + }, + { + "epoch": 4.3512640605858115, + "grad_norm": 0.1874012053012848, + "learning_rate": 4.7924495864410646e-05, + "loss": 0.0649, + "num_input_tokens_seen": 47550400, + "step": 39070 + }, + { + "epoch": 4.351820915469428, + "grad_norm": 0.47506892681121826, + "learning_rate": 4.7923526452201184e-05, + "loss": 0.0658, + "num_input_tokens_seen": 47556480, + "step": 39075 + }, + { + "epoch": 4.352377770353046, + "grad_norm": 0.031966932117938995, + "learning_rate": 4.792255682346065e-05, + "loss": 0.0915, + "num_input_tokens_seen": 47562304, + "step": 39080 + }, + { + "epoch": 4.352934625236664, + "grad_norm": 2.391449213027954, + "learning_rate": 4.7921586978198195e-05, + "loss": 0.1624, + "num_input_tokens_seen": 47568288, + "step": 39085 + }, + { + "epoch": 4.35349148012028, + "grad_norm": 0.9850341081619263, + "learning_rate": 4.792061691642299e-05, + "loss": 0.0322, + "num_input_tokens_seen": 47574592, + "step": 39090 + }, + { + "epoch": 4.354048335003898, + "grad_norm": 0.00502919964492321, + "learning_rate": 4.79196466381442e-05, + "loss": 0.0173, + "num_input_tokens_seen": 47580672, + "step": 39095 + }, + { + "epoch": 4.354605189887515, + "grad_norm": 1.1969006061553955, + "learning_rate": 4.791867614337098e-05, + "loss": 0.1473, + "num_input_tokens_seen": 47586880, + "step": 39100 + }, + { + "epoch": 4.355162044771133, + "grad_norm": 0.008937056176364422, + "learning_rate": 4.79177054321125e-05, + "loss": 0.0074, + "num_input_tokens_seen": 47593120, + "step": 39105 + }, + { + "epoch": 4.35571889965475, + "grad_norm": 0.03968130052089691, + "learning_rate": 4.791673450437793e-05, + "loss": 0.0203, + "num_input_tokens_seen": 47599232, + "step": 39110 + }, + { + "epoch": 4.356275754538367, + "grad_norm": 0.6111027002334595, + "learning_rate": 4.7915763360176446e-05, + "loss": 0.163, + "num_input_tokens_seen": 47605056, + "step": 39115 + }, + { + "epoch": 4.356832609421985, + "grad_norm": 0.6945499777793884, + "learning_rate": 4.7914791999517214e-05, + "loss": 0.112, + "num_input_tokens_seen": 47611040, + "step": 39120 + }, + { + "epoch": 4.357389464305602, + "grad_norm": 0.1982194483280182, + "learning_rate": 4.791382042240942e-05, + "loss": 0.0961, + "num_input_tokens_seen": 47617440, + "step": 39125 + }, + { + "epoch": 4.357946319189219, + "grad_norm": 0.17837296426296234, + "learning_rate": 4.791284862886223e-05, + "loss": 0.023, + "num_input_tokens_seen": 47623392, + "step": 39130 + }, + { + "epoch": 4.358503174072837, + "grad_norm": 0.40677154064178467, + "learning_rate": 4.791187661888482e-05, + "loss": 0.0773, + "num_input_tokens_seen": 47629280, + "step": 39135 + }, + { + "epoch": 4.359060028956454, + "grad_norm": 1.0892653465270996, + "learning_rate": 4.791090439248639e-05, + "loss": 0.0958, + "num_input_tokens_seen": 47635072, + "step": 39140 + }, + { + "epoch": 4.359616883840071, + "grad_norm": 0.6835033893585205, + "learning_rate": 4.79099319496761e-05, + "loss": 0.0318, + "num_input_tokens_seen": 47641184, + "step": 39145 + }, + { + "epoch": 4.360173738723689, + "grad_norm": 1.1847635507583618, + "learning_rate": 4.790895929046316e-05, + "loss": 0.0939, + "num_input_tokens_seen": 47647584, + "step": 39150 + }, + { + "epoch": 4.360730593607306, + "grad_norm": 0.9448143839836121, + "learning_rate": 4.790798641485674e-05, + "loss": 0.1016, + "num_input_tokens_seen": 47653408, + "step": 39155 + }, + { + "epoch": 4.361287448490923, + "grad_norm": 0.3920401334762573, + "learning_rate": 4.790701332286603e-05, + "loss": 0.0719, + "num_input_tokens_seen": 47659616, + "step": 39160 + }, + { + "epoch": 4.36184430337454, + "grad_norm": 0.4464016258716583, + "learning_rate": 4.790604001450023e-05, + "loss": 0.0424, + "num_input_tokens_seen": 47665728, + "step": 39165 + }, + { + "epoch": 4.362401158258158, + "grad_norm": 0.07292278110980988, + "learning_rate": 4.7905066489768535e-05, + "loss": 0.0679, + "num_input_tokens_seen": 47671744, + "step": 39170 + }, + { + "epoch": 4.362958013141776, + "grad_norm": 0.347481906414032, + "learning_rate": 4.790409274868013e-05, + "loss": 0.0158, + "num_input_tokens_seen": 47678208, + "step": 39175 + }, + { + "epoch": 4.363514868025392, + "grad_norm": 0.06182071566581726, + "learning_rate": 4.7903118791244215e-05, + "loss": 0.1666, + "num_input_tokens_seen": 47684288, + "step": 39180 + }, + { + "epoch": 4.36407172290901, + "grad_norm": 0.3570987582206726, + "learning_rate": 4.790214461747e-05, + "loss": 0.0533, + "num_input_tokens_seen": 47690336, + "step": 39185 + }, + { + "epoch": 4.364628577792628, + "grad_norm": 0.012641938403248787, + "learning_rate": 4.7901170227366675e-05, + "loss": 0.0301, + "num_input_tokens_seen": 47696704, + "step": 39190 + }, + { + "epoch": 4.3651854326762445, + "grad_norm": 0.16320498287677765, + "learning_rate": 4.7900195620943454e-05, + "loss": 0.1479, + "num_input_tokens_seen": 47702560, + "step": 39195 + }, + { + "epoch": 4.365742287559862, + "grad_norm": 0.13940100371837616, + "learning_rate": 4.789922079820954e-05, + "loss": 0.029, + "num_input_tokens_seen": 47708928, + "step": 39200 + }, + { + "epoch": 4.366299142443479, + "grad_norm": 0.4977707862854004, + "learning_rate": 4.7898245759174134e-05, + "loss": 0.0346, + "num_input_tokens_seen": 47714976, + "step": 39205 + }, + { + "epoch": 4.366855997327097, + "grad_norm": 0.011856699362397194, + "learning_rate": 4.7897270503846453e-05, + "loss": 0.0326, + "num_input_tokens_seen": 47721376, + "step": 39210 + }, + { + "epoch": 4.367412852210714, + "grad_norm": 0.6603702306747437, + "learning_rate": 4.7896295032235706e-05, + "loss": 0.1253, + "num_input_tokens_seen": 47727648, + "step": 39215 + }, + { + "epoch": 4.367969707094331, + "grad_norm": 0.023462414741516113, + "learning_rate": 4.789531934435111e-05, + "loss": 0.0298, + "num_input_tokens_seen": 47733504, + "step": 39220 + }, + { + "epoch": 4.368526561977949, + "grad_norm": 0.12893734872341156, + "learning_rate": 4.7894343440201875e-05, + "loss": 0.0085, + "num_input_tokens_seen": 47739840, + "step": 39225 + }, + { + "epoch": 4.3690834168615655, + "grad_norm": 0.6503371596336365, + "learning_rate": 4.789336731979723e-05, + "loss": 0.0677, + "num_input_tokens_seen": 47746336, + "step": 39230 + }, + { + "epoch": 4.369640271745183, + "grad_norm": 0.14570963382720947, + "learning_rate": 4.789239098314639e-05, + "loss": 0.0549, + "num_input_tokens_seen": 47752544, + "step": 39235 + }, + { + "epoch": 4.370197126628801, + "grad_norm": 0.6604222059249878, + "learning_rate": 4.789141443025856e-05, + "loss": 0.0185, + "num_input_tokens_seen": 47758560, + "step": 39240 + }, + { + "epoch": 4.370753981512418, + "grad_norm": 0.030641475692391396, + "learning_rate": 4.7890437661142994e-05, + "loss": 0.0151, + "num_input_tokens_seen": 47764992, + "step": 39245 + }, + { + "epoch": 4.371310836396035, + "grad_norm": 0.0382852777838707, + "learning_rate": 4.78894606758089e-05, + "loss": 0.0941, + "num_input_tokens_seen": 47771104, + "step": 39250 + }, + { + "epoch": 4.371867691279652, + "grad_norm": 0.4996553957462311, + "learning_rate": 4.7888483474265525e-05, + "loss": 0.0712, + "num_input_tokens_seen": 47777216, + "step": 39255 + }, + { + "epoch": 4.37242454616327, + "grad_norm": 0.3298226296901703, + "learning_rate": 4.788750605652207e-05, + "loss": 0.0288, + "num_input_tokens_seen": 47783712, + "step": 39260 + }, + { + "epoch": 4.3729814010468875, + "grad_norm": 0.9928823113441467, + "learning_rate": 4.7886528422587786e-05, + "loss": 0.0888, + "num_input_tokens_seen": 47789984, + "step": 39265 + }, + { + "epoch": 4.373538255930504, + "grad_norm": 1.7306102514266968, + "learning_rate": 4.7885550572471904e-05, + "loss": 0.111, + "num_input_tokens_seen": 47795968, + "step": 39270 + }, + { + "epoch": 4.374095110814122, + "grad_norm": 0.2140379250049591, + "learning_rate": 4.7884572506183666e-05, + "loss": 0.118, + "num_input_tokens_seen": 47801088, + "step": 39275 + }, + { + "epoch": 4.374651965697739, + "grad_norm": 2.9862263202667236, + "learning_rate": 4.788359422373231e-05, + "loss": 0.2715, + "num_input_tokens_seen": 47807136, + "step": 39280 + }, + { + "epoch": 4.375208820581356, + "grad_norm": 1.539494276046753, + "learning_rate": 4.788261572512706e-05, + "loss": 0.1225, + "num_input_tokens_seen": 47813280, + "step": 39285 + }, + { + "epoch": 4.375765675464974, + "grad_norm": 0.5604822039604187, + "learning_rate": 4.788163701037718e-05, + "loss": 0.0749, + "num_input_tokens_seen": 47819456, + "step": 39290 + }, + { + "epoch": 4.376322530348591, + "grad_norm": 0.3010919690132141, + "learning_rate": 4.78806580794919e-05, + "loss": 0.1672, + "num_input_tokens_seen": 47825664, + "step": 39295 + }, + { + "epoch": 4.3768793852322085, + "grad_norm": 0.039186395704746246, + "learning_rate": 4.7879678932480474e-05, + "loss": 0.0628, + "num_input_tokens_seen": 47831616, + "step": 39300 + }, + { + "epoch": 4.377436240115826, + "grad_norm": 0.26623931527137756, + "learning_rate": 4.787869956935216e-05, + "loss": 0.0422, + "num_input_tokens_seen": 47837920, + "step": 39305 + }, + { + "epoch": 4.377993094999443, + "grad_norm": 1.1003774404525757, + "learning_rate": 4.787771999011619e-05, + "loss": 0.0654, + "num_input_tokens_seen": 47844320, + "step": 39310 + }, + { + "epoch": 4.378549949883061, + "grad_norm": 0.10083269327878952, + "learning_rate": 4.7876740194781824e-05, + "loss": 0.0721, + "num_input_tokens_seen": 47850240, + "step": 39315 + }, + { + "epoch": 4.3791068047666775, + "grad_norm": 0.9926713109016418, + "learning_rate": 4.787576018335832e-05, + "loss": 0.1509, + "num_input_tokens_seen": 47856000, + "step": 39320 + }, + { + "epoch": 4.379663659650295, + "grad_norm": 1.4475367069244385, + "learning_rate": 4.787477995585493e-05, + "loss": 0.1645, + "num_input_tokens_seen": 47861632, + "step": 39325 + }, + { + "epoch": 4.380220514533913, + "grad_norm": 0.02883574366569519, + "learning_rate": 4.787379951228091e-05, + "loss": 0.0034, + "num_input_tokens_seen": 47867904, + "step": 39330 + }, + { + "epoch": 4.38077736941753, + "grad_norm": 0.428008496761322, + "learning_rate": 4.7872818852645546e-05, + "loss": 0.0329, + "num_input_tokens_seen": 47873344, + "step": 39335 + }, + { + "epoch": 4.381334224301147, + "grad_norm": 0.9486835598945618, + "learning_rate": 4.787183797695807e-05, + "loss": 0.0544, + "num_input_tokens_seen": 47879776, + "step": 39340 + }, + { + "epoch": 4.381891079184764, + "grad_norm": 0.01816510409116745, + "learning_rate": 4.7870856885227766e-05, + "loss": 0.0939, + "num_input_tokens_seen": 47886016, + "step": 39345 + }, + { + "epoch": 4.382447934068382, + "grad_norm": 2.4507663249969482, + "learning_rate": 4.786987557746389e-05, + "loss": 0.1243, + "num_input_tokens_seen": 47892128, + "step": 39350 + }, + { + "epoch": 4.383004788951999, + "grad_norm": 0.07237593829631805, + "learning_rate": 4.7868894053675714e-05, + "loss": 0.0409, + "num_input_tokens_seen": 47898208, + "step": 39355 + }, + { + "epoch": 4.383561643835616, + "grad_norm": 0.2600758671760559, + "learning_rate": 4.786791231387252e-05, + "loss": 0.1434, + "num_input_tokens_seen": 47904288, + "step": 39360 + }, + { + "epoch": 4.384118498719234, + "grad_norm": 1.2867192029953003, + "learning_rate": 4.7866930358063564e-05, + "loss": 0.2068, + "num_input_tokens_seen": 47909856, + "step": 39365 + }, + { + "epoch": 4.3846753536028515, + "grad_norm": 1.1725448369979858, + "learning_rate": 4.786594818625814e-05, + "loss": 0.0428, + "num_input_tokens_seen": 47916320, + "step": 39370 + }, + { + "epoch": 4.385232208486468, + "grad_norm": 1.1640956401824951, + "learning_rate": 4.786496579846551e-05, + "loss": 0.1478, + "num_input_tokens_seen": 47922464, + "step": 39375 + }, + { + "epoch": 4.385789063370086, + "grad_norm": 0.23886793851852417, + "learning_rate": 4.786398319469496e-05, + "loss": 0.0564, + "num_input_tokens_seen": 47928000, + "step": 39380 + }, + { + "epoch": 4.386345918253703, + "grad_norm": 0.3300935924053192, + "learning_rate": 4.786300037495577e-05, + "loss": 0.0436, + "num_input_tokens_seen": 47933440, + "step": 39385 + }, + { + "epoch": 4.3869027731373205, + "grad_norm": 0.28734487295150757, + "learning_rate": 4.786201733925722e-05, + "loss": 0.0203, + "num_input_tokens_seen": 47939904, + "step": 39390 + }, + { + "epoch": 4.387459628020938, + "grad_norm": 0.8744091391563416, + "learning_rate": 4.7861034087608605e-05, + "loss": 0.0579, + "num_input_tokens_seen": 47946240, + "step": 39395 + }, + { + "epoch": 4.388016482904555, + "grad_norm": 0.005509570240974426, + "learning_rate": 4.78600506200192e-05, + "loss": 0.0115, + "num_input_tokens_seen": 47952672, + "step": 39400 + }, + { + "epoch": 4.388573337788173, + "grad_norm": 0.1679331660270691, + "learning_rate": 4.7859066936498306e-05, + "loss": 0.1054, + "num_input_tokens_seen": 47958688, + "step": 39405 + }, + { + "epoch": 4.389130192671789, + "grad_norm": 0.42724841833114624, + "learning_rate": 4.785808303705521e-05, + "loss": 0.0581, + "num_input_tokens_seen": 47965024, + "step": 39410 + }, + { + "epoch": 4.389687047555407, + "grad_norm": 0.7782766819000244, + "learning_rate": 4.785709892169921e-05, + "loss": 0.096, + "num_input_tokens_seen": 47971168, + "step": 39415 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.7786518335342407, + "learning_rate": 4.785611459043959e-05, + "loss": 0.0413, + "num_input_tokens_seen": 47977536, + "step": 39420 + }, + { + "epoch": 4.3908007573226415, + "grad_norm": 0.3536108136177063, + "learning_rate": 4.7855130043285665e-05, + "loss": 0.0211, + "num_input_tokens_seen": 47982976, + "step": 39425 + }, + { + "epoch": 4.391357612206259, + "grad_norm": 0.008911175653338432, + "learning_rate": 4.785414528024672e-05, + "loss": 0.0689, + "num_input_tokens_seen": 47989472, + "step": 39430 + }, + { + "epoch": 4.391914467089876, + "grad_norm": 0.6988017559051514, + "learning_rate": 4.785316030133207e-05, + "loss": 0.0725, + "num_input_tokens_seen": 47995680, + "step": 39435 + }, + { + "epoch": 4.392471321973494, + "grad_norm": 0.7524412274360657, + "learning_rate": 4.7852175106551e-05, + "loss": 0.0752, + "num_input_tokens_seen": 48001920, + "step": 39440 + }, + { + "epoch": 4.393028176857111, + "grad_norm": 0.27115151286125183, + "learning_rate": 4.785118969591283e-05, + "loss": 0.0367, + "num_input_tokens_seen": 48008064, + "step": 39445 + }, + { + "epoch": 4.393585031740728, + "grad_norm": 0.2149333357810974, + "learning_rate": 4.785020406942687e-05, + "loss": 0.1478, + "num_input_tokens_seen": 48013856, + "step": 39450 + }, + { + "epoch": 4.394141886624346, + "grad_norm": 0.4812946915626526, + "learning_rate": 4.784921822710242e-05, + "loss": 0.1303, + "num_input_tokens_seen": 48020224, + "step": 39455 + }, + { + "epoch": 4.394698741507963, + "grad_norm": 1.026706337928772, + "learning_rate": 4.784823216894881e-05, + "loss": 0.0787, + "num_input_tokens_seen": 48025888, + "step": 39460 + }, + { + "epoch": 4.39525559639158, + "grad_norm": 0.10256703197956085, + "learning_rate": 4.7847245894975325e-05, + "loss": 0.1526, + "num_input_tokens_seen": 48032384, + "step": 39465 + }, + { + "epoch": 4.395812451275198, + "grad_norm": 0.6934171915054321, + "learning_rate": 4.784625940519129e-05, + "loss": 0.1187, + "num_input_tokens_seen": 48038336, + "step": 39470 + }, + { + "epoch": 4.396369306158815, + "grad_norm": 1.098792314529419, + "learning_rate": 4.784527269960606e-05, + "loss": 0.1754, + "num_input_tokens_seen": 48044544, + "step": 39475 + }, + { + "epoch": 4.396926161042432, + "grad_norm": 0.18052665889263153, + "learning_rate": 4.7844285778228906e-05, + "loss": 0.0722, + "num_input_tokens_seen": 48050464, + "step": 39480 + }, + { + "epoch": 4.39748301592605, + "grad_norm": 0.4512299597263336, + "learning_rate": 4.784329864106918e-05, + "loss": 0.0104, + "num_input_tokens_seen": 48056768, + "step": 39485 + }, + { + "epoch": 4.398039870809667, + "grad_norm": 0.8356616497039795, + "learning_rate": 4.7842311288136185e-05, + "loss": 0.1106, + "num_input_tokens_seen": 48062848, + "step": 39490 + }, + { + "epoch": 4.3985967256932845, + "grad_norm": 0.6354327201843262, + "learning_rate": 4.784132371943927e-05, + "loss": 0.1634, + "num_input_tokens_seen": 48069216, + "step": 39495 + }, + { + "epoch": 4.399153580576901, + "grad_norm": 0.219725102186203, + "learning_rate": 4.784033593498774e-05, + "loss": 0.0415, + "num_input_tokens_seen": 48075488, + "step": 39500 + }, + { + "epoch": 4.399710435460519, + "grad_norm": 0.7797171473503113, + "learning_rate": 4.783934793479095e-05, + "loss": 0.0733, + "num_input_tokens_seen": 48081376, + "step": 39505 + }, + { + "epoch": 4.400267290344137, + "grad_norm": 0.0030513769015669823, + "learning_rate": 4.7838359718858214e-05, + "loss": 0.0464, + "num_input_tokens_seen": 48087328, + "step": 39510 + }, + { + "epoch": 4.4008241452277534, + "grad_norm": 0.08845258504152298, + "learning_rate": 4.783737128719887e-05, + "loss": 0.0223, + "num_input_tokens_seen": 48093376, + "step": 39515 + }, + { + "epoch": 4.401381000111371, + "grad_norm": 0.56548672914505, + "learning_rate": 4.783638263982227e-05, + "loss": 0.0768, + "num_input_tokens_seen": 48099648, + "step": 39520 + }, + { + "epoch": 4.401937854994988, + "grad_norm": 0.9203308820724487, + "learning_rate": 4.783539377673773e-05, + "loss": 0.08, + "num_input_tokens_seen": 48106208, + "step": 39525 + }, + { + "epoch": 4.402494709878606, + "grad_norm": 0.13854680955410004, + "learning_rate": 4.78344046979546e-05, + "loss": 0.0697, + "num_input_tokens_seen": 48111488, + "step": 39530 + }, + { + "epoch": 4.403051564762223, + "grad_norm": 0.00244156620465219, + "learning_rate": 4.783341540348223e-05, + "loss": 0.077, + "num_input_tokens_seen": 48117696, + "step": 39535 + }, + { + "epoch": 4.40360841964584, + "grad_norm": 0.3374566435813904, + "learning_rate": 4.783242589332995e-05, + "loss": 0.059, + "num_input_tokens_seen": 48123936, + "step": 39540 + }, + { + "epoch": 4.404165274529458, + "grad_norm": 1.6685147285461426, + "learning_rate": 4.783143616750712e-05, + "loss": 0.0315, + "num_input_tokens_seen": 48130080, + "step": 39545 + }, + { + "epoch": 4.404722129413075, + "grad_norm": 0.4735181927680969, + "learning_rate": 4.783044622602307e-05, + "loss": 0.0937, + "num_input_tokens_seen": 48136256, + "step": 39550 + }, + { + "epoch": 4.405278984296692, + "grad_norm": 0.44562670588493347, + "learning_rate": 4.782945606888718e-05, + "loss": 0.0856, + "num_input_tokens_seen": 48142560, + "step": 39555 + }, + { + "epoch": 4.40583583918031, + "grad_norm": 0.23276200890541077, + "learning_rate": 4.782846569610878e-05, + "loss": 0.1062, + "num_input_tokens_seen": 48148704, + "step": 39560 + }, + { + "epoch": 4.406392694063927, + "grad_norm": 0.6477102041244507, + "learning_rate": 4.782747510769723e-05, + "loss": 0.1111, + "num_input_tokens_seen": 48154848, + "step": 39565 + }, + { + "epoch": 4.406949548947544, + "grad_norm": 0.09272429347038269, + "learning_rate": 4.782648430366189e-05, + "loss": 0.0413, + "num_input_tokens_seen": 48161248, + "step": 39570 + }, + { + "epoch": 4.407506403831162, + "grad_norm": 0.7562232613563538, + "learning_rate": 4.7825493284012125e-05, + "loss": 0.0432, + "num_input_tokens_seen": 48167200, + "step": 39575 + }, + { + "epoch": 4.408063258714779, + "grad_norm": 0.15374915301799774, + "learning_rate": 4.782450204875728e-05, + "loss": 0.0361, + "num_input_tokens_seen": 48173120, + "step": 39580 + }, + { + "epoch": 4.4086201135983965, + "grad_norm": 0.041734375059604645, + "learning_rate": 4.782351059790673e-05, + "loss": 0.0545, + "num_input_tokens_seen": 48179008, + "step": 39585 + }, + { + "epoch": 4.409176968482013, + "grad_norm": 0.3627701699733734, + "learning_rate": 4.782251893146983e-05, + "loss": 0.0942, + "num_input_tokens_seen": 48185312, + "step": 39590 + }, + { + "epoch": 4.409733823365631, + "grad_norm": 0.03111475147306919, + "learning_rate": 4.782152704945596e-05, + "loss": 0.0537, + "num_input_tokens_seen": 48191136, + "step": 39595 + }, + { + "epoch": 4.410290678249249, + "grad_norm": 0.06412434577941895, + "learning_rate": 4.782053495187448e-05, + "loss": 0.0582, + "num_input_tokens_seen": 48196704, + "step": 39600 + }, + { + "epoch": 4.410847533132865, + "grad_norm": 0.07528890669345856, + "learning_rate": 4.7819542638734766e-05, + "loss": 0.045, + "num_input_tokens_seen": 48202976, + "step": 39605 + }, + { + "epoch": 4.411404388016483, + "grad_norm": 0.06196404993534088, + "learning_rate": 4.781855011004619e-05, + "loss": 0.0153, + "num_input_tokens_seen": 48209056, + "step": 39610 + }, + { + "epoch": 4.4119612429001, + "grad_norm": 0.7866287231445312, + "learning_rate": 4.7817557365818124e-05, + "loss": 0.0614, + "num_input_tokens_seen": 48215296, + "step": 39615 + }, + { + "epoch": 4.4125180977837175, + "grad_norm": 0.8768743276596069, + "learning_rate": 4.7816564406059945e-05, + "loss": 0.1276, + "num_input_tokens_seen": 48220608, + "step": 39620 + }, + { + "epoch": 4.413074952667335, + "grad_norm": 0.2816082239151001, + "learning_rate": 4.781557123078104e-05, + "loss": 0.13, + "num_input_tokens_seen": 48226816, + "step": 39625 + }, + { + "epoch": 4.413631807550952, + "grad_norm": 0.6892163753509521, + "learning_rate": 4.781457783999078e-05, + "loss": 0.1104, + "num_input_tokens_seen": 48232896, + "step": 39630 + }, + { + "epoch": 4.41418866243457, + "grad_norm": 0.4205056428909302, + "learning_rate": 4.781358423369856e-05, + "loss": 0.0751, + "num_input_tokens_seen": 48238944, + "step": 39635 + }, + { + "epoch": 4.414745517318186, + "grad_norm": 0.008790729567408562, + "learning_rate": 4.781259041191375e-05, + "loss": 0.0534, + "num_input_tokens_seen": 48245184, + "step": 39640 + }, + { + "epoch": 4.415302372201804, + "grad_norm": 0.702274739742279, + "learning_rate": 4.781159637464575e-05, + "loss": 0.0349, + "num_input_tokens_seen": 48251520, + "step": 39645 + }, + { + "epoch": 4.415859227085422, + "grad_norm": 0.009207688271999359, + "learning_rate": 4.781060212190395e-05, + "loss": 0.0236, + "num_input_tokens_seen": 48257664, + "step": 39650 + }, + { + "epoch": 4.416416081969039, + "grad_norm": 0.34096604585647583, + "learning_rate": 4.780960765369774e-05, + "loss": 0.0291, + "num_input_tokens_seen": 48263776, + "step": 39655 + }, + { + "epoch": 4.416972936852656, + "grad_norm": 0.5215433239936829, + "learning_rate": 4.7808612970036505e-05, + "loss": 0.2517, + "num_input_tokens_seen": 48269824, + "step": 39660 + }, + { + "epoch": 4.417529791736274, + "grad_norm": 0.9189239144325256, + "learning_rate": 4.780761807092965e-05, + "loss": 0.0799, + "num_input_tokens_seen": 48275968, + "step": 39665 + }, + { + "epoch": 4.418086646619891, + "grad_norm": 0.5792685151100159, + "learning_rate": 4.780662295638656e-05, + "loss": 0.051, + "num_input_tokens_seen": 48281920, + "step": 39670 + }, + { + "epoch": 4.418643501503508, + "grad_norm": 1.3382210731506348, + "learning_rate": 4.780562762641665e-05, + "loss": 0.1431, + "num_input_tokens_seen": 48288064, + "step": 39675 + }, + { + "epoch": 4.419200356387125, + "grad_norm": 0.0010149995796382427, + "learning_rate": 4.780463208102932e-05, + "loss": 0.0594, + "num_input_tokens_seen": 48294304, + "step": 39680 + }, + { + "epoch": 4.419757211270743, + "grad_norm": 1.201621651649475, + "learning_rate": 4.780363632023397e-05, + "loss": 0.0805, + "num_input_tokens_seen": 48300160, + "step": 39685 + }, + { + "epoch": 4.4203140661543605, + "grad_norm": 1.1189188957214355, + "learning_rate": 4.7802640344039994e-05, + "loss": 0.0876, + "num_input_tokens_seen": 48306208, + "step": 39690 + }, + { + "epoch": 4.420870921037977, + "grad_norm": 1.2964905500411987, + "learning_rate": 4.7801644152456823e-05, + "loss": 0.0782, + "num_input_tokens_seen": 48312256, + "step": 39695 + }, + { + "epoch": 4.421427775921595, + "grad_norm": 0.16074901819229126, + "learning_rate": 4.780064774549384e-05, + "loss": 0.0454, + "num_input_tokens_seen": 48318464, + "step": 39700 + }, + { + "epoch": 4.421984630805212, + "grad_norm": 1.2040817737579346, + "learning_rate": 4.779965112316048e-05, + "loss": 0.0492, + "num_input_tokens_seen": 48323872, + "step": 39705 + }, + { + "epoch": 4.422541485688829, + "grad_norm": 0.6850296258926392, + "learning_rate": 4.779865428546616e-05, + "loss": 0.1825, + "num_input_tokens_seen": 48330176, + "step": 39710 + }, + { + "epoch": 4.423098340572447, + "grad_norm": 0.47872817516326904, + "learning_rate": 4.7797657232420264e-05, + "loss": 0.0346, + "num_input_tokens_seen": 48336288, + "step": 39715 + }, + { + "epoch": 4.423655195456064, + "grad_norm": 0.3139517605304718, + "learning_rate": 4.779665996403224e-05, + "loss": 0.1004, + "num_input_tokens_seen": 48342528, + "step": 39720 + }, + { + "epoch": 4.424212050339682, + "grad_norm": 0.04449634626507759, + "learning_rate": 4.77956624803115e-05, + "loss": 0.0599, + "num_input_tokens_seen": 48348352, + "step": 39725 + }, + { + "epoch": 4.424768905223299, + "grad_norm": 0.2662597596645355, + "learning_rate": 4.779466478126746e-05, + "loss": 0.062, + "num_input_tokens_seen": 48354432, + "step": 39730 + }, + { + "epoch": 4.425325760106916, + "grad_norm": 0.7969327569007874, + "learning_rate": 4.779366686690955e-05, + "loss": 0.0701, + "num_input_tokens_seen": 48360448, + "step": 39735 + }, + { + "epoch": 4.425882614990534, + "grad_norm": 1.2226736545562744, + "learning_rate": 4.779266873724719e-05, + "loss": 0.1498, + "num_input_tokens_seen": 48366144, + "step": 39740 + }, + { + "epoch": 4.4264394698741505, + "grad_norm": 0.807304322719574, + "learning_rate": 4.779167039228982e-05, + "loss": 0.0436, + "num_input_tokens_seen": 48372224, + "step": 39745 + }, + { + "epoch": 4.426996324757768, + "grad_norm": 0.011974003165960312, + "learning_rate": 4.7790671832046865e-05, + "loss": 0.1959, + "num_input_tokens_seen": 48378080, + "step": 39750 + }, + { + "epoch": 4.427553179641386, + "grad_norm": 0.082880400121212, + "learning_rate": 4.7789673056527754e-05, + "loss": 0.0124, + "num_input_tokens_seen": 48384224, + "step": 39755 + }, + { + "epoch": 4.428110034525003, + "grad_norm": 1.56637704372406, + "learning_rate": 4.7788674065741914e-05, + "loss": 0.224, + "num_input_tokens_seen": 48390752, + "step": 39760 + }, + { + "epoch": 4.42866688940862, + "grad_norm": 0.027480849996209145, + "learning_rate": 4.778767485969879e-05, + "loss": 0.024, + "num_input_tokens_seen": 48396832, + "step": 39765 + }, + { + "epoch": 4.429223744292237, + "grad_norm": 1.298674464225769, + "learning_rate": 4.778667543840783e-05, + "loss": 0.0499, + "num_input_tokens_seen": 48402976, + "step": 39770 + }, + { + "epoch": 4.429780599175855, + "grad_norm": 0.023490319028496742, + "learning_rate": 4.778567580187845e-05, + "loss": 0.0203, + "num_input_tokens_seen": 48409344, + "step": 39775 + }, + { + "epoch": 4.430337454059472, + "grad_norm": 0.48666009306907654, + "learning_rate": 4.778467595012012e-05, + "loss": 0.0132, + "num_input_tokens_seen": 48415552, + "step": 39780 + }, + { + "epoch": 4.430894308943089, + "grad_norm": 0.13903199136257172, + "learning_rate": 4.778367588314226e-05, + "loss": 0.0259, + "num_input_tokens_seen": 48421568, + "step": 39785 + }, + { + "epoch": 4.431451163826707, + "grad_norm": 0.10126857459545135, + "learning_rate": 4.778267560095433e-05, + "loss": 0.0429, + "num_input_tokens_seen": 48427744, + "step": 39790 + }, + { + "epoch": 4.432008018710324, + "grad_norm": 0.9655834436416626, + "learning_rate": 4.778167510356578e-05, + "loss": 0.066, + "num_input_tokens_seen": 48434016, + "step": 39795 + }, + { + "epoch": 4.432564873593941, + "grad_norm": 1.0672566890716553, + "learning_rate": 4.7780674390986055e-05, + "loss": 0.1353, + "num_input_tokens_seen": 48440064, + "step": 39800 + }, + { + "epoch": 4.433121728477559, + "grad_norm": 0.014384646899998188, + "learning_rate": 4.7779673463224604e-05, + "loss": 0.0234, + "num_input_tokens_seen": 48446048, + "step": 39805 + }, + { + "epoch": 4.433678583361176, + "grad_norm": 0.7220395803451538, + "learning_rate": 4.777867232029089e-05, + "loss": 0.1023, + "num_input_tokens_seen": 48452128, + "step": 39810 + }, + { + "epoch": 4.4342354382447935, + "grad_norm": 0.30491018295288086, + "learning_rate": 4.777767096219437e-05, + "loss": 0.1586, + "num_input_tokens_seen": 48458144, + "step": 39815 + }, + { + "epoch": 4.43479229312841, + "grad_norm": 0.5643994808197021, + "learning_rate": 4.777666938894449e-05, + "loss": 0.0408, + "num_input_tokens_seen": 48463712, + "step": 39820 + }, + { + "epoch": 4.435349148012028, + "grad_norm": 0.8422644138336182, + "learning_rate": 4.777566760055073e-05, + "loss": 0.0839, + "num_input_tokens_seen": 48469472, + "step": 39825 + }, + { + "epoch": 4.435906002895646, + "grad_norm": 0.0044505298137664795, + "learning_rate": 4.777466559702253e-05, + "loss": 0.0677, + "num_input_tokens_seen": 48475648, + "step": 39830 + }, + { + "epoch": 4.436462857779262, + "grad_norm": 0.01325551513582468, + "learning_rate": 4.777366337836938e-05, + "loss": 0.0293, + "num_input_tokens_seen": 48481888, + "step": 39835 + }, + { + "epoch": 4.43701971266288, + "grad_norm": 0.0012324776034802198, + "learning_rate": 4.777266094460072e-05, + "loss": 0.0326, + "num_input_tokens_seen": 48488064, + "step": 39840 + }, + { + "epoch": 4.437576567546498, + "grad_norm": 1.8496428728103638, + "learning_rate": 4.777165829572604e-05, + "loss": 0.0786, + "num_input_tokens_seen": 48493920, + "step": 39845 + }, + { + "epoch": 4.4381334224301145, + "grad_norm": 0.0014838053612038493, + "learning_rate": 4.77706554317548e-05, + "loss": 0.0336, + "num_input_tokens_seen": 48500352, + "step": 39850 + }, + { + "epoch": 4.438690277313732, + "grad_norm": 0.5758336186408997, + "learning_rate": 4.776965235269648e-05, + "loss": 0.0356, + "num_input_tokens_seen": 48506208, + "step": 39855 + }, + { + "epoch": 4.439247132197349, + "grad_norm": 0.7030112147331238, + "learning_rate": 4.776864905856055e-05, + "loss": 0.1123, + "num_input_tokens_seen": 48512480, + "step": 39860 + }, + { + "epoch": 4.439803987080967, + "grad_norm": 2.572437047958374, + "learning_rate": 4.776764554935649e-05, + "loss": 0.1038, + "num_input_tokens_seen": 48518496, + "step": 39865 + }, + { + "epoch": 4.440360841964584, + "grad_norm": 0.032791852951049805, + "learning_rate": 4.776664182509377e-05, + "loss": 0.0108, + "num_input_tokens_seen": 48524736, + "step": 39870 + }, + { + "epoch": 4.440917696848201, + "grad_norm": 0.0004759200382977724, + "learning_rate": 4.7765637885781876e-05, + "loss": 0.0476, + "num_input_tokens_seen": 48530912, + "step": 39875 + }, + { + "epoch": 4.441474551731819, + "grad_norm": 0.06179254129528999, + "learning_rate": 4.7764633731430294e-05, + "loss": 0.1315, + "num_input_tokens_seen": 48536928, + "step": 39880 + }, + { + "epoch": 4.442031406615436, + "grad_norm": 0.05474121868610382, + "learning_rate": 4.776362936204851e-05, + "loss": 0.0745, + "num_input_tokens_seen": 48543008, + "step": 39885 + }, + { + "epoch": 4.442588261499053, + "grad_norm": 0.014321370050311089, + "learning_rate": 4.7762624777646015e-05, + "loss": 0.017, + "num_input_tokens_seen": 48549184, + "step": 39890 + }, + { + "epoch": 4.443145116382671, + "grad_norm": 0.655128538608551, + "learning_rate": 4.7761619978232285e-05, + "loss": 0.0649, + "num_input_tokens_seen": 48555360, + "step": 39895 + }, + { + "epoch": 4.443701971266288, + "grad_norm": 0.3474277853965759, + "learning_rate": 4.776061496381682e-05, + "loss": 0.066, + "num_input_tokens_seen": 48561216, + "step": 39900 + }, + { + "epoch": 4.444258826149905, + "grad_norm": 0.8596634864807129, + "learning_rate": 4.77596097344091e-05, + "loss": 0.1108, + "num_input_tokens_seen": 48567616, + "step": 39905 + }, + { + "epoch": 4.444815681033523, + "grad_norm": 1.0196216106414795, + "learning_rate": 4.775860429001865e-05, + "loss": 0.0463, + "num_input_tokens_seen": 48574112, + "step": 39910 + }, + { + "epoch": 4.44537253591714, + "grad_norm": 0.15094543993473053, + "learning_rate": 4.7757598630654945e-05, + "loss": 0.07, + "num_input_tokens_seen": 48580320, + "step": 39915 + }, + { + "epoch": 4.4459293908007576, + "grad_norm": 0.011829293332993984, + "learning_rate": 4.7756592756327476e-05, + "loss": 0.0324, + "num_input_tokens_seen": 48585824, + "step": 39920 + }, + { + "epoch": 4.446486245684374, + "grad_norm": 0.4133467972278595, + "learning_rate": 4.775558666704577e-05, + "loss": 0.0288, + "num_input_tokens_seen": 48591776, + "step": 39925 + }, + { + "epoch": 4.447043100567992, + "grad_norm": 0.028271406888961792, + "learning_rate": 4.775458036281931e-05, + "loss": 0.0145, + "num_input_tokens_seen": 48598016, + "step": 39930 + }, + { + "epoch": 4.44759995545161, + "grad_norm": 0.0065554180182516575, + "learning_rate": 4.7753573843657605e-05, + "loss": 0.0231, + "num_input_tokens_seen": 48604192, + "step": 39935 + }, + { + "epoch": 4.4481568103352265, + "grad_norm": 0.9250103235244751, + "learning_rate": 4.775256710957017e-05, + "loss": 0.0666, + "num_input_tokens_seen": 48609888, + "step": 39940 + }, + { + "epoch": 4.448713665218844, + "grad_norm": 1.647915244102478, + "learning_rate": 4.77515601605665e-05, + "loss": 0.0718, + "num_input_tokens_seen": 48615968, + "step": 39945 + }, + { + "epoch": 4.449270520102461, + "grad_norm": 0.0004113113973289728, + "learning_rate": 4.7750552996656125e-05, + "loss": 0.077, + "num_input_tokens_seen": 48621856, + "step": 39950 + }, + { + "epoch": 4.449827374986079, + "grad_norm": 0.35543978214263916, + "learning_rate": 4.774954561784854e-05, + "loss": 0.0414, + "num_input_tokens_seen": 48628160, + "step": 39955 + }, + { + "epoch": 4.450384229869696, + "grad_norm": 0.8223370313644409, + "learning_rate": 4.774853802415329e-05, + "loss": 0.0262, + "num_input_tokens_seen": 48634272, + "step": 39960 + }, + { + "epoch": 4.450941084753313, + "grad_norm": 1.0854170322418213, + "learning_rate": 4.774753021557986e-05, + "loss": 0.1688, + "num_input_tokens_seen": 48640224, + "step": 39965 + }, + { + "epoch": 4.451497939636931, + "grad_norm": 0.05279349535703659, + "learning_rate": 4.774652219213778e-05, + "loss": 0.0755, + "num_input_tokens_seen": 48646176, + "step": 39970 + }, + { + "epoch": 4.4520547945205475, + "grad_norm": 1.1315261125564575, + "learning_rate": 4.774551395383657e-05, + "loss": 0.0808, + "num_input_tokens_seen": 48652320, + "step": 39975 + }, + { + "epoch": 4.452611649404165, + "grad_norm": 0.12870129942893982, + "learning_rate": 4.7744505500685766e-05, + "loss": 0.0701, + "num_input_tokens_seen": 48658560, + "step": 39980 + }, + { + "epoch": 4.453168504287783, + "grad_norm": 0.11873114854097366, + "learning_rate": 4.7743496832694885e-05, + "loss": 0.1418, + "num_input_tokens_seen": 48664384, + "step": 39985 + }, + { + "epoch": 4.4537253591714, + "grad_norm": 0.018522951751947403, + "learning_rate": 4.774248794987345e-05, + "loss": 0.0406, + "num_input_tokens_seen": 48670656, + "step": 39990 + }, + { + "epoch": 4.454282214055017, + "grad_norm": 0.20835503935813904, + "learning_rate": 4.7741478852231e-05, + "loss": 0.0787, + "num_input_tokens_seen": 48676704, + "step": 39995 + }, + { + "epoch": 4.454839068938634, + "grad_norm": 0.020729903131723404, + "learning_rate": 4.7740469539777055e-05, + "loss": 0.1587, + "num_input_tokens_seen": 48682720, + "step": 40000 + }, + { + "epoch": 4.455395923822252, + "grad_norm": 0.2638343870639801, + "learning_rate": 4.773946001252116e-05, + "loss": 0.0255, + "num_input_tokens_seen": 48688960, + "step": 40005 + }, + { + "epoch": 4.4559527787058695, + "grad_norm": 0.34403544664382935, + "learning_rate": 4.7738450270472853e-05, + "loss": 0.0677, + "num_input_tokens_seen": 48695264, + "step": 40010 + }, + { + "epoch": 4.456509633589486, + "grad_norm": 0.012016413733363152, + "learning_rate": 4.7737440313641654e-05, + "loss": 0.0176, + "num_input_tokens_seen": 48701664, + "step": 40015 + }, + { + "epoch": 4.457066488473104, + "grad_norm": 0.014869125559926033, + "learning_rate": 4.773643014203713e-05, + "loss": 0.0607, + "num_input_tokens_seen": 48707616, + "step": 40020 + }, + { + "epoch": 4.457623343356722, + "grad_norm": 0.7730260491371155, + "learning_rate": 4.7735419755668784e-05, + "loss": 0.0669, + "num_input_tokens_seen": 48713504, + "step": 40025 + }, + { + "epoch": 4.458180198240338, + "grad_norm": 0.38398492336273193, + "learning_rate": 4.77344091545462e-05, + "loss": 0.0985, + "num_input_tokens_seen": 48719520, + "step": 40030 + }, + { + "epoch": 4.458737053123956, + "grad_norm": 0.3626895844936371, + "learning_rate": 4.77333983386789e-05, + "loss": 0.0549, + "num_input_tokens_seen": 48725856, + "step": 40035 + }, + { + "epoch": 4.459293908007573, + "grad_norm": 1.9997085332870483, + "learning_rate": 4.773238730807644e-05, + "loss": 0.1408, + "num_input_tokens_seen": 48731488, + "step": 40040 + }, + { + "epoch": 4.4598507628911905, + "grad_norm": 0.0073130433447659016, + "learning_rate": 4.773137606274838e-05, + "loss": 0.0865, + "num_input_tokens_seen": 48737504, + "step": 40045 + }, + { + "epoch": 4.460407617774808, + "grad_norm": 1.2301359176635742, + "learning_rate": 4.773036460270425e-05, + "loss": 0.1326, + "num_input_tokens_seen": 48743040, + "step": 40050 + }, + { + "epoch": 4.460964472658425, + "grad_norm": 0.4735363721847534, + "learning_rate": 4.7729352927953616e-05, + "loss": 0.1151, + "num_input_tokens_seen": 48749184, + "step": 40055 + }, + { + "epoch": 4.461521327542043, + "grad_norm": 0.08082776516675949, + "learning_rate": 4.772834103850603e-05, + "loss": 0.0681, + "num_input_tokens_seen": 48755456, + "step": 40060 + }, + { + "epoch": 4.46207818242566, + "grad_norm": 0.3998073935508728, + "learning_rate": 4.772732893437106e-05, + "loss": 0.0703, + "num_input_tokens_seen": 48761376, + "step": 40065 + }, + { + "epoch": 4.462635037309277, + "grad_norm": 2.7818551063537598, + "learning_rate": 4.772631661555826e-05, + "loss": 0.1657, + "num_input_tokens_seen": 48767712, + "step": 40070 + }, + { + "epoch": 4.463191892192895, + "grad_norm": 0.00022419265587814152, + "learning_rate": 4.772530408207718e-05, + "loss": 0.097, + "num_input_tokens_seen": 48773664, + "step": 40075 + }, + { + "epoch": 4.463748747076512, + "grad_norm": 0.04447208344936371, + "learning_rate": 4.772429133393741e-05, + "loss": 0.0531, + "num_input_tokens_seen": 48779712, + "step": 40080 + }, + { + "epoch": 4.464305601960129, + "grad_norm": 0.5166695713996887, + "learning_rate": 4.7723278371148496e-05, + "loss": 0.0397, + "num_input_tokens_seen": 48785792, + "step": 40085 + }, + { + "epoch": 4.464862456843747, + "grad_norm": 0.005422896705567837, + "learning_rate": 4.772226519372001e-05, + "loss": 0.0201, + "num_input_tokens_seen": 48792032, + "step": 40090 + }, + { + "epoch": 4.465419311727364, + "grad_norm": 0.04684578999876976, + "learning_rate": 4.7721251801661525e-05, + "loss": 0.0505, + "num_input_tokens_seen": 48798272, + "step": 40095 + }, + { + "epoch": 4.465976166610981, + "grad_norm": 0.7655813694000244, + "learning_rate": 4.772023819498262e-05, + "loss": 0.0962, + "num_input_tokens_seen": 48804384, + "step": 40100 + }, + { + "epoch": 4.466533021494598, + "grad_norm": 0.16478100419044495, + "learning_rate": 4.771922437369286e-05, + "loss": 0.1421, + "num_input_tokens_seen": 48810368, + "step": 40105 + }, + { + "epoch": 4.467089876378216, + "grad_norm": 2.054532289505005, + "learning_rate": 4.7718210337801815e-05, + "loss": 0.1158, + "num_input_tokens_seen": 48816768, + "step": 40110 + }, + { + "epoch": 4.4676467312618335, + "grad_norm": 0.0655234158039093, + "learning_rate": 4.7717196087319075e-05, + "loss": 0.0089, + "num_input_tokens_seen": 48822752, + "step": 40115 + }, + { + "epoch": 4.46820358614545, + "grad_norm": 0.08726263791322708, + "learning_rate": 4.771618162225422e-05, + "loss": 0.0228, + "num_input_tokens_seen": 48828480, + "step": 40120 + }, + { + "epoch": 4.468760441029068, + "grad_norm": 3.769890785217285, + "learning_rate": 4.771516694261683e-05, + "loss": 0.1476, + "num_input_tokens_seen": 48834656, + "step": 40125 + }, + { + "epoch": 4.469317295912685, + "grad_norm": 0.1053585335612297, + "learning_rate": 4.771415204841649e-05, + "loss": 0.0559, + "num_input_tokens_seen": 48840992, + "step": 40130 + }, + { + "epoch": 4.4698741507963025, + "grad_norm": 0.16844670474529266, + "learning_rate": 4.7713136939662784e-05, + "loss": 0.0831, + "num_input_tokens_seen": 48847040, + "step": 40135 + }, + { + "epoch": 4.47043100567992, + "grad_norm": 0.1315569430589676, + "learning_rate": 4.77121216163653e-05, + "loss": 0.0177, + "num_input_tokens_seen": 48852704, + "step": 40140 + }, + { + "epoch": 4.470987860563537, + "grad_norm": 0.10408858954906464, + "learning_rate": 4.771110607853363e-05, + "loss": 0.0439, + "num_input_tokens_seen": 48858816, + "step": 40145 + }, + { + "epoch": 4.471544715447155, + "grad_norm": 1.3006733655929565, + "learning_rate": 4.771009032617737e-05, + "loss": 0.1141, + "num_input_tokens_seen": 48865056, + "step": 40150 + }, + { + "epoch": 4.472101570330771, + "grad_norm": 0.05335268750786781, + "learning_rate": 4.7709074359306114e-05, + "loss": 0.1192, + "num_input_tokens_seen": 48870944, + "step": 40155 + }, + { + "epoch": 4.472658425214389, + "grad_norm": 0.016963595524430275, + "learning_rate": 4.770805817792945e-05, + "loss": 0.0563, + "num_input_tokens_seen": 48877312, + "step": 40160 + }, + { + "epoch": 4.473215280098007, + "grad_norm": 0.08967917412519455, + "learning_rate": 4.770704178205699e-05, + "loss": 0.1039, + "num_input_tokens_seen": 48882944, + "step": 40165 + }, + { + "epoch": 4.4737721349816235, + "grad_norm": 0.8325782418251038, + "learning_rate": 4.7706025171698324e-05, + "loss": 0.0423, + "num_input_tokens_seen": 48888832, + "step": 40170 + }, + { + "epoch": 4.474328989865241, + "grad_norm": 0.5029069781303406, + "learning_rate": 4.7705008346863055e-05, + "loss": 0.0338, + "num_input_tokens_seen": 48895136, + "step": 40175 + }, + { + "epoch": 4.474885844748858, + "grad_norm": 1.7757508754730225, + "learning_rate": 4.7703991307560805e-05, + "loss": 0.0584, + "num_input_tokens_seen": 48901088, + "step": 40180 + }, + { + "epoch": 4.475442699632476, + "grad_norm": 1.5204075574874878, + "learning_rate": 4.770297405380115e-05, + "loss": 0.1456, + "num_input_tokens_seen": 48906848, + "step": 40185 + }, + { + "epoch": 4.475999554516093, + "grad_norm": 2.5250096321105957, + "learning_rate": 4.7701956585593726e-05, + "loss": 0.0877, + "num_input_tokens_seen": 48912864, + "step": 40190 + }, + { + "epoch": 4.47655640939971, + "grad_norm": 0.4579535722732544, + "learning_rate": 4.770093890294813e-05, + "loss": 0.0568, + "num_input_tokens_seen": 48919168, + "step": 40195 + }, + { + "epoch": 4.477113264283328, + "grad_norm": 0.1815631091594696, + "learning_rate": 4.769992100587398e-05, + "loss": 0.0555, + "num_input_tokens_seen": 48925184, + "step": 40200 + }, + { + "epoch": 4.4776701191669455, + "grad_norm": 0.059435248374938965, + "learning_rate": 4.769890289438089e-05, + "loss": 0.1178, + "num_input_tokens_seen": 48931040, + "step": 40205 + }, + { + "epoch": 4.478226974050562, + "grad_norm": 1.1885972023010254, + "learning_rate": 4.769788456847847e-05, + "loss": 0.1135, + "num_input_tokens_seen": 48937216, + "step": 40210 + }, + { + "epoch": 4.47878382893418, + "grad_norm": 0.6955493092536926, + "learning_rate": 4.769686602817635e-05, + "loss": 0.0649, + "num_input_tokens_seen": 48943232, + "step": 40215 + }, + { + "epoch": 4.479340683817797, + "grad_norm": 1.7809849977493286, + "learning_rate": 4.7695847273484144e-05, + "loss": 0.051, + "num_input_tokens_seen": 48949280, + "step": 40220 + }, + { + "epoch": 4.479897538701414, + "grad_norm": 0.7751438617706299, + "learning_rate": 4.7694828304411484e-05, + "loss": 0.0725, + "num_input_tokens_seen": 48955584, + "step": 40225 + }, + { + "epoch": 4.480454393585032, + "grad_norm": 0.1805187612771988, + "learning_rate": 4.769380912096798e-05, + "loss": 0.0758, + "num_input_tokens_seen": 48961472, + "step": 40230 + }, + { + "epoch": 4.481011248468649, + "grad_norm": 2.3028554916381836, + "learning_rate": 4.7692789723163264e-05, + "loss": 0.1147, + "num_input_tokens_seen": 48967488, + "step": 40235 + }, + { + "epoch": 4.4815681033522665, + "grad_norm": 0.5582250952720642, + "learning_rate": 4.7691770111006976e-05, + "loss": 0.0204, + "num_input_tokens_seen": 48973632, + "step": 40240 + }, + { + "epoch": 4.482124958235884, + "grad_norm": 0.02736016735434532, + "learning_rate": 4.7690750284508735e-05, + "loss": 0.051, + "num_input_tokens_seen": 48979904, + "step": 40245 + }, + { + "epoch": 4.482681813119501, + "grad_norm": 0.5812529921531677, + "learning_rate": 4.768973024367818e-05, + "loss": 0.0236, + "num_input_tokens_seen": 48986336, + "step": 40250 + }, + { + "epoch": 4.483238668003119, + "grad_norm": 0.7838647961616516, + "learning_rate": 4.7688709988524943e-05, + "loss": 0.0369, + "num_input_tokens_seen": 48992640, + "step": 40255 + }, + { + "epoch": 4.483795522886735, + "grad_norm": 2.7376511096954346, + "learning_rate": 4.7687689519058664e-05, + "loss": 0.1237, + "num_input_tokens_seen": 48999104, + "step": 40260 + }, + { + "epoch": 4.484352377770353, + "grad_norm": 0.0390399768948555, + "learning_rate": 4.7686668835288976e-05, + "loss": 0.1826, + "num_input_tokens_seen": 49005088, + "step": 40265 + }, + { + "epoch": 4.484909232653971, + "grad_norm": 0.5769565105438232, + "learning_rate": 4.768564793722553e-05, + "loss": 0.1234, + "num_input_tokens_seen": 49010944, + "step": 40270 + }, + { + "epoch": 4.485466087537588, + "grad_norm": 0.1672264039516449, + "learning_rate": 4.7684626824877966e-05, + "loss": 0.0697, + "num_input_tokens_seen": 49016800, + "step": 40275 + }, + { + "epoch": 4.486022942421205, + "grad_norm": 0.19266821444034576, + "learning_rate": 4.7683605498255915e-05, + "loss": 0.0453, + "num_input_tokens_seen": 49022976, + "step": 40280 + }, + { + "epoch": 4.486579797304822, + "grad_norm": 0.01649715006351471, + "learning_rate": 4.768258395736904e-05, + "loss": 0.0296, + "num_input_tokens_seen": 49029472, + "step": 40285 + }, + { + "epoch": 4.48713665218844, + "grad_norm": 0.12451781332492828, + "learning_rate": 4.768156220222699e-05, + "loss": 0.0317, + "num_input_tokens_seen": 49035872, + "step": 40290 + }, + { + "epoch": 4.487693507072057, + "grad_norm": 0.010593386366963387, + "learning_rate": 4.7680540232839425e-05, + "loss": 0.0834, + "num_input_tokens_seen": 49042016, + "step": 40295 + }, + { + "epoch": 4.488250361955674, + "grad_norm": 0.0827578529715538, + "learning_rate": 4.767951804921597e-05, + "loss": 0.2028, + "num_input_tokens_seen": 49048448, + "step": 40300 + }, + { + "epoch": 4.488807216839292, + "grad_norm": 0.00917730014771223, + "learning_rate": 4.76784956513663e-05, + "loss": 0.0663, + "num_input_tokens_seen": 49054624, + "step": 40305 + }, + { + "epoch": 4.489364071722909, + "grad_norm": 3.0331597328186035, + "learning_rate": 4.767747303930007e-05, + "loss": 0.1602, + "num_input_tokens_seen": 49060608, + "step": 40310 + }, + { + "epoch": 4.489920926606526, + "grad_norm": 0.21081118285655975, + "learning_rate": 4.7676450213026936e-05, + "loss": 0.0558, + "num_input_tokens_seen": 49066752, + "step": 40315 + }, + { + "epoch": 4.490477781490144, + "grad_norm": 0.015703970566391945, + "learning_rate": 4.7675427172556564e-05, + "loss": 0.073, + "num_input_tokens_seen": 49072992, + "step": 40320 + }, + { + "epoch": 4.491034636373761, + "grad_norm": 2.9957234859466553, + "learning_rate": 4.767440391789861e-05, + "loss": 0.2599, + "num_input_tokens_seen": 49078880, + "step": 40325 + }, + { + "epoch": 4.491591491257378, + "grad_norm": 0.5033550262451172, + "learning_rate": 4.767338044906275e-05, + "loss": 0.0716, + "num_input_tokens_seen": 49085056, + "step": 40330 + }, + { + "epoch": 4.492148346140995, + "grad_norm": 0.38711631298065186, + "learning_rate": 4.7672356766058645e-05, + "loss": 0.0792, + "num_input_tokens_seen": 49091136, + "step": 40335 + }, + { + "epoch": 4.492705201024613, + "grad_norm": 1.235046625137329, + "learning_rate": 4.7671332868895965e-05, + "loss": 0.0835, + "num_input_tokens_seen": 49097312, + "step": 40340 + }, + { + "epoch": 4.493262055908231, + "grad_norm": 0.08014924079179764, + "learning_rate": 4.767030875758438e-05, + "loss": 0.0179, + "num_input_tokens_seen": 49102880, + "step": 40345 + }, + { + "epoch": 4.493818910791847, + "grad_norm": 0.27501600980758667, + "learning_rate": 4.7669284432133574e-05, + "loss": 0.105, + "num_input_tokens_seen": 49108800, + "step": 40350 + }, + { + "epoch": 4.494375765675465, + "grad_norm": 0.47172224521636963, + "learning_rate": 4.766825989255321e-05, + "loss": 0.0967, + "num_input_tokens_seen": 49114976, + "step": 40355 + }, + { + "epoch": 4.494932620559083, + "grad_norm": 0.9090375900268555, + "learning_rate": 4.7667235138852965e-05, + "loss": 0.033, + "num_input_tokens_seen": 49120608, + "step": 40360 + }, + { + "epoch": 4.4954894754426995, + "grad_norm": 0.8765841722488403, + "learning_rate": 4.7666210171042524e-05, + "loss": 0.1271, + "num_input_tokens_seen": 49126848, + "step": 40365 + }, + { + "epoch": 4.496046330326317, + "grad_norm": 1.0950236320495605, + "learning_rate": 4.7665184989131576e-05, + "loss": 0.1628, + "num_input_tokens_seen": 49132448, + "step": 40370 + }, + { + "epoch": 4.496603185209934, + "grad_norm": 0.002002416644245386, + "learning_rate": 4.7664159593129784e-05, + "loss": 0.0335, + "num_input_tokens_seen": 49138816, + "step": 40375 + }, + { + "epoch": 4.497160040093552, + "grad_norm": 0.34073758125305176, + "learning_rate": 4.766313398304686e-05, + "loss": 0.0307, + "num_input_tokens_seen": 49144896, + "step": 40380 + }, + { + "epoch": 4.497716894977169, + "grad_norm": 0.5002663135528564, + "learning_rate": 4.766210815889247e-05, + "loss": 0.0743, + "num_input_tokens_seen": 49150688, + "step": 40385 + }, + { + "epoch": 4.498273749860786, + "grad_norm": 0.3878486156463623, + "learning_rate": 4.766108212067632e-05, + "loss": 0.0562, + "num_input_tokens_seen": 49156416, + "step": 40390 + }, + { + "epoch": 4.498830604744404, + "grad_norm": 0.5329262018203735, + "learning_rate": 4.766005586840808e-05, + "loss": 0.057, + "num_input_tokens_seen": 49162560, + "step": 40395 + }, + { + "epoch": 4.499387459628021, + "grad_norm": 0.09501059353351593, + "learning_rate": 4.765902940209747e-05, + "loss": 0.034, + "num_input_tokens_seen": 49168768, + "step": 40400 + }, + { + "epoch": 4.499944314511638, + "grad_norm": 0.00035455217584967613, + "learning_rate": 4.765800272175417e-05, + "loss": 0.0675, + "num_input_tokens_seen": 49175136, + "step": 40405 + }, + { + "epoch": 4.500501169395256, + "grad_norm": 0.0025701296981424093, + "learning_rate": 4.7656975827387874e-05, + "loss": 0.0856, + "num_input_tokens_seen": 49181376, + "step": 40410 + }, + { + "epoch": 4.501058024278873, + "grad_norm": 2.923611640930176, + "learning_rate": 4.765594871900829e-05, + "loss": 0.0807, + "num_input_tokens_seen": 49187424, + "step": 40415 + }, + { + "epoch": 4.50161487916249, + "grad_norm": 0.2464958280324936, + "learning_rate": 4.765492139662513e-05, + "loss": 0.1087, + "num_input_tokens_seen": 49193760, + "step": 40420 + }, + { + "epoch": 4.502171734046108, + "grad_norm": 0.041345663368701935, + "learning_rate": 4.765389386024808e-05, + "loss": 0.072, + "num_input_tokens_seen": 49199232, + "step": 40425 + }, + { + "epoch": 4.502728588929725, + "grad_norm": 0.22549410164356232, + "learning_rate": 4.765286610988685e-05, + "loss": 0.0101, + "num_input_tokens_seen": 49205376, + "step": 40430 + }, + { + "epoch": 4.5032854438133425, + "grad_norm": 0.47667428851127625, + "learning_rate": 4.7651838145551154e-05, + "loss": 0.0288, + "num_input_tokens_seen": 49211968, + "step": 40435 + }, + { + "epoch": 4.503842298696959, + "grad_norm": 1.9164317846298218, + "learning_rate": 4.765080996725069e-05, + "loss": 0.0594, + "num_input_tokens_seen": 49218112, + "step": 40440 + }, + { + "epoch": 4.504399153580577, + "grad_norm": 0.10637425631284714, + "learning_rate": 4.764978157499519e-05, + "loss": 0.0904, + "num_input_tokens_seen": 49224416, + "step": 40445 + }, + { + "epoch": 4.504956008464195, + "grad_norm": 0.6741967797279358, + "learning_rate": 4.764875296879435e-05, + "loss": 0.0671, + "num_input_tokens_seen": 49230784, + "step": 40450 + }, + { + "epoch": 4.505512863347811, + "grad_norm": 1.7271852493286133, + "learning_rate": 4.764772414865789e-05, + "loss": 0.1244, + "num_input_tokens_seen": 49236736, + "step": 40455 + }, + { + "epoch": 4.506069718231429, + "grad_norm": 0.5030184984207153, + "learning_rate": 4.764669511459554e-05, + "loss": 0.0589, + "num_input_tokens_seen": 49242432, + "step": 40460 + }, + { + "epoch": 4.506626573115046, + "grad_norm": 0.12448441982269287, + "learning_rate": 4.7645665866617e-05, + "loss": 0.036, + "num_input_tokens_seen": 49248288, + "step": 40465 + }, + { + "epoch": 4.507183427998664, + "grad_norm": 1.236318588256836, + "learning_rate": 4.7644636404732007e-05, + "loss": 0.1132, + "num_input_tokens_seen": 49254080, + "step": 40470 + }, + { + "epoch": 4.507740282882281, + "grad_norm": 0.02433471381664276, + "learning_rate": 4.764360672895029e-05, + "loss": 0.0147, + "num_input_tokens_seen": 49260544, + "step": 40475 + }, + { + "epoch": 4.508297137765898, + "grad_norm": 0.09141001850366592, + "learning_rate": 4.764257683928155e-05, + "loss": 0.1023, + "num_input_tokens_seen": 49266912, + "step": 40480 + }, + { + "epoch": 4.508853992649516, + "grad_norm": 0.1119302362203598, + "learning_rate": 4.764154673573553e-05, + "loss": 0.1121, + "num_input_tokens_seen": 49272736, + "step": 40485 + }, + { + "epoch": 4.5094108475331325, + "grad_norm": 0.2925271987915039, + "learning_rate": 4.7640516418321976e-05, + "loss": 0.06, + "num_input_tokens_seen": 49279104, + "step": 40490 + }, + { + "epoch": 4.50996770241675, + "grad_norm": 0.47092437744140625, + "learning_rate": 4.7639485887050594e-05, + "loss": 0.048, + "num_input_tokens_seen": 49285184, + "step": 40495 + }, + { + "epoch": 4.510524557300368, + "grad_norm": 0.2850055396556854, + "learning_rate": 4.763845514193113e-05, + "loss": 0.0181, + "num_input_tokens_seen": 49290880, + "step": 40500 + }, + { + "epoch": 4.511081412183985, + "grad_norm": 0.0010837054578587413, + "learning_rate": 4.7637424182973324e-05, + "loss": 0.0096, + "num_input_tokens_seen": 49297088, + "step": 40505 + }, + { + "epoch": 4.511638267067602, + "grad_norm": 0.5420514345169067, + "learning_rate": 4.763639301018691e-05, + "loss": 0.1422, + "num_input_tokens_seen": 49302688, + "step": 40510 + }, + { + "epoch": 4.512195121951219, + "grad_norm": 0.0006333977216854692, + "learning_rate": 4.763536162358162e-05, + "loss": 0.0482, + "num_input_tokens_seen": 49308896, + "step": 40515 + }, + { + "epoch": 4.512751976834837, + "grad_norm": 1.2134768962860107, + "learning_rate": 4.763433002316722e-05, + "loss": 0.155, + "num_input_tokens_seen": 49314176, + "step": 40520 + }, + { + "epoch": 4.513308831718454, + "grad_norm": 0.013363518752157688, + "learning_rate": 4.7633298208953424e-05, + "loss": 0.1605, + "num_input_tokens_seen": 49319936, + "step": 40525 + }, + { + "epoch": 4.513865686602071, + "grad_norm": 0.32179147005081177, + "learning_rate": 4.763226618094999e-05, + "loss": 0.0407, + "num_input_tokens_seen": 49325696, + "step": 40530 + }, + { + "epoch": 4.514422541485689, + "grad_norm": 0.49087342619895935, + "learning_rate": 4.763123393916669e-05, + "loss": 0.0421, + "num_input_tokens_seen": 49332000, + "step": 40535 + }, + { + "epoch": 4.514979396369306, + "grad_norm": 0.0005774066667072475, + "learning_rate": 4.763020148361324e-05, + "loss": 0.0214, + "num_input_tokens_seen": 49338176, + "step": 40540 + }, + { + "epoch": 4.515536251252923, + "grad_norm": 0.8338189125061035, + "learning_rate": 4.762916881429939e-05, + "loss": 0.0479, + "num_input_tokens_seen": 49344480, + "step": 40545 + }, + { + "epoch": 4.516093106136541, + "grad_norm": 2.058091163635254, + "learning_rate": 4.7628135931234935e-05, + "loss": 0.0677, + "num_input_tokens_seen": 49350656, + "step": 40550 + }, + { + "epoch": 4.516649961020158, + "grad_norm": 0.017485065385699272, + "learning_rate": 4.76271028344296e-05, + "loss": 0.114, + "num_input_tokens_seen": 49356384, + "step": 40555 + }, + { + "epoch": 4.5172068159037755, + "grad_norm": 0.3172052204608917, + "learning_rate": 4.762606952389315e-05, + "loss": 0.0962, + "num_input_tokens_seen": 49361792, + "step": 40560 + }, + { + "epoch": 4.517763670787393, + "grad_norm": 0.4269684851169586, + "learning_rate": 4.762503599963534e-05, + "loss": 0.0999, + "num_input_tokens_seen": 49368064, + "step": 40565 + }, + { + "epoch": 4.51832052567101, + "grad_norm": 2.2707879543304443, + "learning_rate": 4.762400226166594e-05, + "loss": 0.3486, + "num_input_tokens_seen": 49374368, + "step": 40570 + }, + { + "epoch": 4.518877380554628, + "grad_norm": 0.014576650224626064, + "learning_rate": 4.762296830999472e-05, + "loss": 0.0409, + "num_input_tokens_seen": 49380640, + "step": 40575 + }, + { + "epoch": 4.519434235438244, + "grad_norm": 0.5628677606582642, + "learning_rate": 4.762193414463143e-05, + "loss": 0.0515, + "num_input_tokens_seen": 49386976, + "step": 40580 + }, + { + "epoch": 4.519991090321862, + "grad_norm": 0.8484219312667847, + "learning_rate": 4.762089976558586e-05, + "loss": 0.0682, + "num_input_tokens_seen": 49393184, + "step": 40585 + }, + { + "epoch": 4.52054794520548, + "grad_norm": 0.051051270216703415, + "learning_rate": 4.7619865172867755e-05, + "loss": 0.0546, + "num_input_tokens_seen": 49399296, + "step": 40590 + }, + { + "epoch": 4.5211048000890965, + "grad_norm": 0.1802593171596527, + "learning_rate": 4.7618830366486905e-05, + "loss": 0.0437, + "num_input_tokens_seen": 49405536, + "step": 40595 + }, + { + "epoch": 4.521661654972714, + "grad_norm": 0.18930017948150635, + "learning_rate": 4.761779534645308e-05, + "loss": 0.0355, + "num_input_tokens_seen": 49411776, + "step": 40600 + }, + { + "epoch": 4.522218509856332, + "grad_norm": 2.7291667461395264, + "learning_rate": 4.761676011277606e-05, + "loss": 0.1328, + "num_input_tokens_seen": 49417856, + "step": 40605 + }, + { + "epoch": 4.522775364739949, + "grad_norm": 0.22208642959594727, + "learning_rate": 4.761572466546562e-05, + "loss": 0.0763, + "num_input_tokens_seen": 49424384, + "step": 40610 + }, + { + "epoch": 4.523332219623566, + "grad_norm": 0.32487577199935913, + "learning_rate": 4.761468900453154e-05, + "loss": 0.0559, + "num_input_tokens_seen": 49429728, + "step": 40615 + }, + { + "epoch": 4.523889074507183, + "grad_norm": 0.12192051857709885, + "learning_rate": 4.76136531299836e-05, + "loss": 0.0304, + "num_input_tokens_seen": 49436032, + "step": 40620 + }, + { + "epoch": 4.524445929390801, + "grad_norm": 0.35413864254951477, + "learning_rate": 4.7612617041831595e-05, + "loss": 0.107, + "num_input_tokens_seen": 49441984, + "step": 40625 + }, + { + "epoch": 4.5250027842744185, + "grad_norm": 0.49220600724220276, + "learning_rate": 4.7611580740085295e-05, + "loss": 0.0625, + "num_input_tokens_seen": 49448256, + "step": 40630 + }, + { + "epoch": 4.525559639158035, + "grad_norm": 0.6536756753921509, + "learning_rate": 4.761054422475451e-05, + "loss": 0.1643, + "num_input_tokens_seen": 49453984, + "step": 40635 + }, + { + "epoch": 4.526116494041653, + "grad_norm": 0.2678682506084442, + "learning_rate": 4.760950749584901e-05, + "loss": 0.1064, + "num_input_tokens_seen": 49460384, + "step": 40640 + }, + { + "epoch": 4.52667334892527, + "grad_norm": 0.024191388860344887, + "learning_rate": 4.7608470553378606e-05, + "loss": 0.0232, + "num_input_tokens_seen": 49466336, + "step": 40645 + }, + { + "epoch": 4.527230203808887, + "grad_norm": 0.527860164642334, + "learning_rate": 4.760743339735309e-05, + "loss": 0.0379, + "num_input_tokens_seen": 49472384, + "step": 40650 + }, + { + "epoch": 4.527787058692505, + "grad_norm": 0.0003698971413541585, + "learning_rate": 4.7606396027782235e-05, + "loss": 0.0121, + "num_input_tokens_seen": 49478688, + "step": 40655 + }, + { + "epoch": 4.528343913576122, + "grad_norm": 0.19344916939735413, + "learning_rate": 4.760535844467586e-05, + "loss": 0.1615, + "num_input_tokens_seen": 49485056, + "step": 40660 + }, + { + "epoch": 4.5289007684597395, + "grad_norm": 0.6017681956291199, + "learning_rate": 4.7604320648043775e-05, + "loss": 0.0989, + "num_input_tokens_seen": 49491232, + "step": 40665 + }, + { + "epoch": 4.529457623343356, + "grad_norm": 0.12746331095695496, + "learning_rate": 4.7603282637895765e-05, + "loss": 0.0113, + "num_input_tokens_seen": 49497632, + "step": 40670 + }, + { + "epoch": 4.530014478226974, + "grad_norm": 0.011364881880581379, + "learning_rate": 4.7602244414241636e-05, + "loss": 0.0969, + "num_input_tokens_seen": 49503776, + "step": 40675 + }, + { + "epoch": 4.530571333110592, + "grad_norm": 2.5345301628112793, + "learning_rate": 4.760120597709121e-05, + "loss": 0.0906, + "num_input_tokens_seen": 49509600, + "step": 40680 + }, + { + "epoch": 4.5311281879942085, + "grad_norm": 0.5851573348045349, + "learning_rate": 4.760016732645428e-05, + "loss": 0.0599, + "num_input_tokens_seen": 49515232, + "step": 40685 + }, + { + "epoch": 4.531685042877826, + "grad_norm": 0.2987062931060791, + "learning_rate": 4.759912846234066e-05, + "loss": 0.0134, + "num_input_tokens_seen": 49521440, + "step": 40690 + }, + { + "epoch": 4.532241897761443, + "grad_norm": 1.4415582418441772, + "learning_rate": 4.7598089384760174e-05, + "loss": 0.1603, + "num_input_tokens_seen": 49527424, + "step": 40695 + }, + { + "epoch": 4.532798752645061, + "grad_norm": 0.11061186343431473, + "learning_rate": 4.7597050093722625e-05, + "loss": 0.0401, + "num_input_tokens_seen": 49533280, + "step": 40700 + }, + { + "epoch": 4.533355607528678, + "grad_norm": 0.15859046578407288, + "learning_rate": 4.759601058923783e-05, + "loss": 0.0348, + "num_input_tokens_seen": 49539808, + "step": 40705 + }, + { + "epoch": 4.533912462412295, + "grad_norm": 0.009999514557421207, + "learning_rate": 4.759497087131561e-05, + "loss": 0.0159, + "num_input_tokens_seen": 49545536, + "step": 40710 + }, + { + "epoch": 4.534469317295913, + "grad_norm": 0.3989695906639099, + "learning_rate": 4.759393093996579e-05, + "loss": 0.0591, + "num_input_tokens_seen": 49551808, + "step": 40715 + }, + { + "epoch": 4.5350261721795295, + "grad_norm": 0.2016609013080597, + "learning_rate": 4.75928907951982e-05, + "loss": 0.0691, + "num_input_tokens_seen": 49557920, + "step": 40720 + }, + { + "epoch": 4.535583027063147, + "grad_norm": 1.3407317399978638, + "learning_rate": 4.759185043702264e-05, + "loss": 0.1465, + "num_input_tokens_seen": 49564288, + "step": 40725 + }, + { + "epoch": 4.536139881946765, + "grad_norm": 0.36239078640937805, + "learning_rate": 4.759080986544896e-05, + "loss": 0.0661, + "num_input_tokens_seen": 49570400, + "step": 40730 + }, + { + "epoch": 4.536696736830382, + "grad_norm": 0.20399929583072662, + "learning_rate": 4.758976908048698e-05, + "loss": 0.0232, + "num_input_tokens_seen": 49576672, + "step": 40735 + }, + { + "epoch": 4.537253591713999, + "grad_norm": 0.0018243732629343867, + "learning_rate": 4.758872808214653e-05, + "loss": 0.1289, + "num_input_tokens_seen": 49582656, + "step": 40740 + }, + { + "epoch": 4.537810446597617, + "grad_norm": 0.04055290296673775, + "learning_rate": 4.758768687043745e-05, + "loss": 0.0137, + "num_input_tokens_seen": 49588544, + "step": 40745 + }, + { + "epoch": 4.538367301481234, + "grad_norm": 0.8881906867027283, + "learning_rate": 4.758664544536957e-05, + "loss": 0.0442, + "num_input_tokens_seen": 49594720, + "step": 40750 + }, + { + "epoch": 4.5389241563648515, + "grad_norm": 1.7623538970947266, + "learning_rate": 4.7585603806952726e-05, + "loss": 0.2069, + "num_input_tokens_seen": 49600864, + "step": 40755 + }, + { + "epoch": 4.539481011248469, + "grad_norm": 0.5359047651290894, + "learning_rate": 4.758456195519676e-05, + "loss": 0.0249, + "num_input_tokens_seen": 49607008, + "step": 40760 + }, + { + "epoch": 4.540037866132086, + "grad_norm": 0.7161443829536438, + "learning_rate": 4.758351989011151e-05, + "loss": 0.0784, + "num_input_tokens_seen": 49613120, + "step": 40765 + }, + { + "epoch": 4.540594721015704, + "grad_norm": 0.04974805563688278, + "learning_rate": 4.758247761170682e-05, + "loss": 0.0735, + "num_input_tokens_seen": 49619104, + "step": 40770 + }, + { + "epoch": 4.54115157589932, + "grad_norm": 0.12229003012180328, + "learning_rate": 4.758143511999254e-05, + "loss": 0.0912, + "num_input_tokens_seen": 49624800, + "step": 40775 + }, + { + "epoch": 4.541708430782938, + "grad_norm": 0.19809506833553314, + "learning_rate": 4.758039241497851e-05, + "loss": 0.0331, + "num_input_tokens_seen": 49631008, + "step": 40780 + }, + { + "epoch": 4.542265285666556, + "grad_norm": 0.3198229968547821, + "learning_rate": 4.757934949667459e-05, + "loss": 0.0524, + "num_input_tokens_seen": 49636896, + "step": 40785 + }, + { + "epoch": 4.5428221405501725, + "grad_norm": 0.01714603789150715, + "learning_rate": 4.7578306365090616e-05, + "loss": 0.0613, + "num_input_tokens_seen": 49642912, + "step": 40790 + }, + { + "epoch": 4.54337899543379, + "grad_norm": 0.6522954702377319, + "learning_rate": 4.757726302023645e-05, + "loss": 0.0141, + "num_input_tokens_seen": 49649504, + "step": 40795 + }, + { + "epoch": 4.543935850317407, + "grad_norm": 0.2269618660211563, + "learning_rate": 4.757621946212194e-05, + "loss": 0.0869, + "num_input_tokens_seen": 49655776, + "step": 40800 + }, + { + "epoch": 4.544492705201025, + "grad_norm": 0.3391750454902649, + "learning_rate": 4.757517569075696e-05, + "loss": 0.0635, + "num_input_tokens_seen": 49661376, + "step": 40805 + }, + { + "epoch": 4.545049560084642, + "grad_norm": 0.28576987981796265, + "learning_rate": 4.757413170615136e-05, + "loss": 0.0361, + "num_input_tokens_seen": 49667680, + "step": 40810 + }, + { + "epoch": 4.545606414968259, + "grad_norm": 0.06743880361318588, + "learning_rate": 4.7573087508314986e-05, + "loss": 0.0153, + "num_input_tokens_seen": 49673536, + "step": 40815 + }, + { + "epoch": 4.546163269851877, + "grad_norm": 0.8357781171798706, + "learning_rate": 4.757204309725773e-05, + "loss": 0.1069, + "num_input_tokens_seen": 49679936, + "step": 40820 + }, + { + "epoch": 4.546720124735494, + "grad_norm": 0.43091705441474915, + "learning_rate": 4.7570998472989436e-05, + "loss": 0.2113, + "num_input_tokens_seen": 49685760, + "step": 40825 + }, + { + "epoch": 4.547276979619111, + "grad_norm": 0.3037262558937073, + "learning_rate": 4.7569953635519976e-05, + "loss": 0.0856, + "num_input_tokens_seen": 49691520, + "step": 40830 + }, + { + "epoch": 4.547833834502729, + "grad_norm": 1.1659678220748901, + "learning_rate": 4.756890858485923e-05, + "loss": 0.1612, + "num_input_tokens_seen": 49697600, + "step": 40835 + }, + { + "epoch": 4.548390689386346, + "grad_norm": 0.6223902702331543, + "learning_rate": 4.7567863321017045e-05, + "loss": 0.049, + "num_input_tokens_seen": 49703808, + "step": 40840 + }, + { + "epoch": 4.548947544269963, + "grad_norm": 0.14368200302124023, + "learning_rate": 4.756681784400332e-05, + "loss": 0.0776, + "num_input_tokens_seen": 49709824, + "step": 40845 + }, + { + "epoch": 4.54950439915358, + "grad_norm": 0.114744633436203, + "learning_rate": 4.756577215382793e-05, + "loss": 0.0435, + "num_input_tokens_seen": 49715712, + "step": 40850 + }, + { + "epoch": 4.550061254037198, + "grad_norm": 0.05526216700673103, + "learning_rate": 4.7564726250500724e-05, + "loss": 0.0252, + "num_input_tokens_seen": 49722304, + "step": 40855 + }, + { + "epoch": 4.5506181089208155, + "grad_norm": 0.10814491659402847, + "learning_rate": 4.7563680134031605e-05, + "loss": 0.1493, + "num_input_tokens_seen": 49728160, + "step": 40860 + }, + { + "epoch": 4.551174963804432, + "grad_norm": 0.015141593292355537, + "learning_rate": 4.756263380443046e-05, + "loss": 0.0401, + "num_input_tokens_seen": 49734368, + "step": 40865 + }, + { + "epoch": 4.55173181868805, + "grad_norm": 1.3885668516159058, + "learning_rate": 4.756158726170715e-05, + "loss": 0.0801, + "num_input_tokens_seen": 49740768, + "step": 40870 + }, + { + "epoch": 4.552288673571667, + "grad_norm": 0.2739836573600769, + "learning_rate": 4.756054050587158e-05, + "loss": 0.2476, + "num_input_tokens_seen": 49746432, + "step": 40875 + }, + { + "epoch": 4.5528455284552845, + "grad_norm": 0.11364299803972244, + "learning_rate": 4.755949353693362e-05, + "loss": 0.0427, + "num_input_tokens_seen": 49752704, + "step": 40880 + }, + { + "epoch": 4.553402383338902, + "grad_norm": 0.03708082064986229, + "learning_rate": 4.7558446354903174e-05, + "loss": 0.0998, + "num_input_tokens_seen": 49759072, + "step": 40885 + }, + { + "epoch": 4.553959238222519, + "grad_norm": 0.015293258242309093, + "learning_rate": 4.755739895979014e-05, + "loss": 0.0274, + "num_input_tokens_seen": 49765056, + "step": 40890 + }, + { + "epoch": 4.554516093106137, + "grad_norm": 0.34446069598197937, + "learning_rate": 4.7556351351604376e-05, + "loss": 0.0964, + "num_input_tokens_seen": 49771200, + "step": 40895 + }, + { + "epoch": 4.555072947989753, + "grad_norm": 0.01991361193358898, + "learning_rate": 4.755530353035582e-05, + "loss": 0.0921, + "num_input_tokens_seen": 49776992, + "step": 40900 + }, + { + "epoch": 4.555629802873371, + "grad_norm": 1.2627683877944946, + "learning_rate": 4.7554255496054346e-05, + "loss": 0.1716, + "num_input_tokens_seen": 49783360, + "step": 40905 + }, + { + "epoch": 4.556186657756989, + "grad_norm": 0.013769120909273624, + "learning_rate": 4.755320724870986e-05, + "loss": 0.0663, + "num_input_tokens_seen": 49789536, + "step": 40910 + }, + { + "epoch": 4.5567435126406055, + "grad_norm": 1.162891149520874, + "learning_rate": 4.755215878833226e-05, + "loss": 0.0889, + "num_input_tokens_seen": 49796032, + "step": 40915 + }, + { + "epoch": 4.557300367524223, + "grad_norm": 0.006184946279972792, + "learning_rate": 4.7551110114931455e-05, + "loss": 0.0564, + "num_input_tokens_seen": 49802272, + "step": 40920 + }, + { + "epoch": 4.557857222407841, + "grad_norm": 0.12614156305789948, + "learning_rate": 4.755006122851735e-05, + "loss": 0.0459, + "num_input_tokens_seen": 49807744, + "step": 40925 + }, + { + "epoch": 4.558414077291458, + "grad_norm": 0.4559159278869629, + "learning_rate": 4.754901212909984e-05, + "loss": 0.0347, + "num_input_tokens_seen": 49813760, + "step": 40930 + }, + { + "epoch": 4.558970932175075, + "grad_norm": 0.3831210434436798, + "learning_rate": 4.7547962816688855e-05, + "loss": 0.1073, + "num_input_tokens_seen": 49819904, + "step": 40935 + }, + { + "epoch": 4.559527787058693, + "grad_norm": 0.8373396396636963, + "learning_rate": 4.754691329129429e-05, + "loss": 0.1473, + "num_input_tokens_seen": 49826112, + "step": 40940 + }, + { + "epoch": 4.56008464194231, + "grad_norm": 0.5979156494140625, + "learning_rate": 4.754586355292606e-05, + "loss": 0.0338, + "num_input_tokens_seen": 49832256, + "step": 40945 + }, + { + "epoch": 4.5606414968259275, + "grad_norm": 0.6795429587364197, + "learning_rate": 4.7544813601594093e-05, + "loss": 0.1138, + "num_input_tokens_seen": 49837984, + "step": 40950 + }, + { + "epoch": 4.561198351709544, + "grad_norm": 3.6975467205047607, + "learning_rate": 4.75437634373083e-05, + "loss": 0.2283, + "num_input_tokens_seen": 49843616, + "step": 40955 + }, + { + "epoch": 4.561755206593162, + "grad_norm": 0.6128683090209961, + "learning_rate": 4.75427130600786e-05, + "loss": 0.0616, + "num_input_tokens_seen": 49849728, + "step": 40960 + }, + { + "epoch": 4.56231206147678, + "grad_norm": 0.4440309703350067, + "learning_rate": 4.754166246991491e-05, + "loss": 0.0831, + "num_input_tokens_seen": 49855808, + "step": 40965 + }, + { + "epoch": 4.562868916360396, + "grad_norm": 0.08090963214635849, + "learning_rate": 4.7540611666827156e-05, + "loss": 0.0115, + "num_input_tokens_seen": 49861664, + "step": 40970 + }, + { + "epoch": 4.563425771244014, + "grad_norm": 0.08389244973659515, + "learning_rate": 4.7539560650825265e-05, + "loss": 0.0382, + "num_input_tokens_seen": 49867744, + "step": 40975 + }, + { + "epoch": 4.563982626127631, + "grad_norm": 0.6271763443946838, + "learning_rate": 4.7538509421919176e-05, + "loss": 0.1106, + "num_input_tokens_seen": 49874048, + "step": 40980 + }, + { + "epoch": 4.5645394810112485, + "grad_norm": 0.08784118294715881, + "learning_rate": 4.75374579801188e-05, + "loss": 0.012, + "num_input_tokens_seen": 49879584, + "step": 40985 + }, + { + "epoch": 4.565096335894866, + "grad_norm": 1.2390975952148438, + "learning_rate": 4.7536406325434074e-05, + "loss": 0.2179, + "num_input_tokens_seen": 49885088, + "step": 40990 + }, + { + "epoch": 4.565653190778483, + "grad_norm": 0.005848886910825968, + "learning_rate": 4.7535354457874935e-05, + "loss": 0.0045, + "num_input_tokens_seen": 49891264, + "step": 40995 + }, + { + "epoch": 4.566210045662101, + "grad_norm": 0.07762831449508667, + "learning_rate": 4.753430237745132e-05, + "loss": 0.0235, + "num_input_tokens_seen": 49897600, + "step": 41000 + }, + { + "epoch": 4.566766900545717, + "grad_norm": 0.155997633934021, + "learning_rate": 4.753325008417317e-05, + "loss": 0.0408, + "num_input_tokens_seen": 49903904, + "step": 41005 + }, + { + "epoch": 4.567323755429335, + "grad_norm": 0.2342444807291031, + "learning_rate": 4.7532197578050415e-05, + "loss": 0.0368, + "num_input_tokens_seen": 49910336, + "step": 41010 + }, + { + "epoch": 4.567880610312953, + "grad_norm": 0.6661573648452759, + "learning_rate": 4.7531144859093e-05, + "loss": 0.1692, + "num_input_tokens_seen": 49916064, + "step": 41015 + }, + { + "epoch": 4.56843746519657, + "grad_norm": 0.240866556763649, + "learning_rate": 4.753009192731087e-05, + "loss": 0.0409, + "num_input_tokens_seen": 49922528, + "step": 41020 + }, + { + "epoch": 4.568994320080187, + "grad_norm": 0.0006345135625451803, + "learning_rate": 4.752903878271398e-05, + "loss": 0.0755, + "num_input_tokens_seen": 49928192, + "step": 41025 + }, + { + "epoch": 4.569551174963804, + "grad_norm": 0.47112131118774414, + "learning_rate": 4.752798542531226e-05, + "loss": 0.067, + "num_input_tokens_seen": 49934176, + "step": 41030 + }, + { + "epoch": 4.570108029847422, + "grad_norm": 0.1682116985321045, + "learning_rate": 4.7526931855115666e-05, + "loss": 0.0215, + "num_input_tokens_seen": 49940480, + "step": 41035 + }, + { + "epoch": 4.570664884731039, + "grad_norm": 0.06799202412366867, + "learning_rate": 4.752587807213416e-05, + "loss": 0.0079, + "num_input_tokens_seen": 49946528, + "step": 41040 + }, + { + "epoch": 4.571221739614656, + "grad_norm": 0.6336263418197632, + "learning_rate": 4.752482407637768e-05, + "loss": 0.1042, + "num_input_tokens_seen": 49952736, + "step": 41045 + }, + { + "epoch": 4.571778594498274, + "grad_norm": 0.004095808137208223, + "learning_rate": 4.752376986785619e-05, + "loss": 0.0719, + "num_input_tokens_seen": 49958624, + "step": 41050 + }, + { + "epoch": 4.572335449381891, + "grad_norm": 1.4229553937911987, + "learning_rate": 4.7522715446579655e-05, + "loss": 0.1311, + "num_input_tokens_seen": 49964800, + "step": 41055 + }, + { + "epoch": 4.572892304265508, + "grad_norm": 1.1369348764419556, + "learning_rate": 4.752166081255803e-05, + "loss": 0.0727, + "num_input_tokens_seen": 49970208, + "step": 41060 + }, + { + "epoch": 4.573449159149126, + "grad_norm": 0.232020765542984, + "learning_rate": 4.7520605965801265e-05, + "loss": 0.0298, + "num_input_tokens_seen": 49976256, + "step": 41065 + }, + { + "epoch": 4.574006014032743, + "grad_norm": 0.004148364067077637, + "learning_rate": 4.7519550906319346e-05, + "loss": 0.0376, + "num_input_tokens_seen": 49982496, + "step": 41070 + }, + { + "epoch": 4.57456286891636, + "grad_norm": 0.017309093847870827, + "learning_rate": 4.751849563412221e-05, + "loss": 0.0264, + "num_input_tokens_seen": 49988608, + "step": 41075 + }, + { + "epoch": 4.575119723799978, + "grad_norm": 0.25570741295814514, + "learning_rate": 4.751744014921985e-05, + "loss": 0.1007, + "num_input_tokens_seen": 49994784, + "step": 41080 + }, + { + "epoch": 4.575676578683595, + "grad_norm": 0.32539647817611694, + "learning_rate": 4.7516384451622234e-05, + "loss": 0.1285, + "num_input_tokens_seen": 50000768, + "step": 41085 + }, + { + "epoch": 4.576233433567213, + "grad_norm": 0.5171836018562317, + "learning_rate": 4.751532854133932e-05, + "loss": 0.0242, + "num_input_tokens_seen": 50006752, + "step": 41090 + }, + { + "epoch": 4.576790288450829, + "grad_norm": 0.5980959534645081, + "learning_rate": 4.7514272418381086e-05, + "loss": 0.0905, + "num_input_tokens_seen": 50012864, + "step": 41095 + }, + { + "epoch": 4.577347143334447, + "grad_norm": 0.9044206738471985, + "learning_rate": 4.751321608275751e-05, + "loss": 0.0482, + "num_input_tokens_seen": 50018848, + "step": 41100 + }, + { + "epoch": 4.577903998218065, + "grad_norm": 0.00030559723381884396, + "learning_rate": 4.751215953447857e-05, + "loss": 0.1095, + "num_input_tokens_seen": 50025088, + "step": 41105 + }, + { + "epoch": 4.5784608531016815, + "grad_norm": 1.1061688661575317, + "learning_rate": 4.7511102773554254e-05, + "loss": 0.0778, + "num_input_tokens_seen": 50031360, + "step": 41110 + }, + { + "epoch": 4.579017707985299, + "grad_norm": 0.13174577057361603, + "learning_rate": 4.751004579999454e-05, + "loss": 0.0748, + "num_input_tokens_seen": 50037184, + "step": 41115 + }, + { + "epoch": 4.579574562868917, + "grad_norm": 0.3651305139064789, + "learning_rate": 4.75089886138094e-05, + "loss": 0.1604, + "num_input_tokens_seen": 50043520, + "step": 41120 + }, + { + "epoch": 4.580131417752534, + "grad_norm": 0.00177041778806597, + "learning_rate": 4.750793121500883e-05, + "loss": 0.0162, + "num_input_tokens_seen": 50049856, + "step": 41125 + }, + { + "epoch": 4.580688272636151, + "grad_norm": 0.039172735065221786, + "learning_rate": 4.750687360360282e-05, + "loss": 0.0624, + "num_input_tokens_seen": 50056384, + "step": 41130 + }, + { + "epoch": 4.581245127519768, + "grad_norm": 0.4566078186035156, + "learning_rate": 4.7505815779601355e-05, + "loss": 0.1249, + "num_input_tokens_seen": 50061568, + "step": 41135 + }, + { + "epoch": 4.581801982403386, + "grad_norm": 0.2699783444404602, + "learning_rate": 4.750475774301443e-05, + "loss": 0.0434, + "num_input_tokens_seen": 50067104, + "step": 41140 + }, + { + "epoch": 4.582358837287003, + "grad_norm": 0.0005340231582522392, + "learning_rate": 4.750369949385203e-05, + "loss": 0.187, + "num_input_tokens_seen": 50073120, + "step": 41145 + }, + { + "epoch": 4.58291569217062, + "grad_norm": 0.639413058757782, + "learning_rate": 4.7502641032124165e-05, + "loss": 0.0473, + "num_input_tokens_seen": 50079328, + "step": 41150 + }, + { + "epoch": 4.583472547054238, + "grad_norm": 0.37025904655456543, + "learning_rate": 4.750158235784082e-05, + "loss": 0.0302, + "num_input_tokens_seen": 50085632, + "step": 41155 + }, + { + "epoch": 4.584029401937855, + "grad_norm": 0.038066472858190536, + "learning_rate": 4.7500523471012016e-05, + "loss": 0.0272, + "num_input_tokens_seen": 50091840, + "step": 41160 + }, + { + "epoch": 4.584586256821472, + "grad_norm": 0.5460692048072815, + "learning_rate": 4.749946437164773e-05, + "loss": 0.0967, + "num_input_tokens_seen": 50097664, + "step": 41165 + }, + { + "epoch": 4.58514311170509, + "grad_norm": 0.3920883536338806, + "learning_rate": 4.749840505975798e-05, + "loss": 0.0203, + "num_input_tokens_seen": 50103840, + "step": 41170 + }, + { + "epoch": 4.585699966588707, + "grad_norm": 0.0003915815323125571, + "learning_rate": 4.749734553535277e-05, + "loss": 0.0884, + "num_input_tokens_seen": 50110016, + "step": 41175 + }, + { + "epoch": 4.5862568214723245, + "grad_norm": 0.40976136922836304, + "learning_rate": 4.7496285798442096e-05, + "loss": 0.2014, + "num_input_tokens_seen": 50116384, + "step": 41180 + }, + { + "epoch": 4.586813676355941, + "grad_norm": 0.8650336861610413, + "learning_rate": 4.7495225849036e-05, + "loss": 0.2058, + "num_input_tokens_seen": 50121824, + "step": 41185 + }, + { + "epoch": 4.587370531239559, + "grad_norm": 0.8705184459686279, + "learning_rate": 4.7494165687144454e-05, + "loss": 0.1126, + "num_input_tokens_seen": 50128160, + "step": 41190 + }, + { + "epoch": 4.587927386123177, + "grad_norm": 1.6268565654754639, + "learning_rate": 4.7493105312777495e-05, + "loss": 0.1011, + "num_input_tokens_seen": 50134336, + "step": 41195 + }, + { + "epoch": 4.588484241006793, + "grad_norm": 0.02435143291950226, + "learning_rate": 4.749204472594514e-05, + "loss": 0.0435, + "num_input_tokens_seen": 50140288, + "step": 41200 + }, + { + "epoch": 4.589041095890411, + "grad_norm": 0.20933154225349426, + "learning_rate": 4.7490983926657395e-05, + "loss": 0.1311, + "num_input_tokens_seen": 50146368, + "step": 41205 + }, + { + "epoch": 4.589597950774028, + "grad_norm": 0.5207667946815491, + "learning_rate": 4.74899229149243e-05, + "loss": 0.245, + "num_input_tokens_seen": 50152128, + "step": 41210 + }, + { + "epoch": 4.5901548056576456, + "grad_norm": 0.47577062249183655, + "learning_rate": 4.7488861690755855e-05, + "loss": 0.0829, + "num_input_tokens_seen": 50158240, + "step": 41215 + }, + { + "epoch": 4.590711660541263, + "grad_norm": 1.0456814765930176, + "learning_rate": 4.74878002541621e-05, + "loss": 0.1066, + "num_input_tokens_seen": 50164448, + "step": 41220 + }, + { + "epoch": 4.59126851542488, + "grad_norm": 0.39411070942878723, + "learning_rate": 4.7486738605153044e-05, + "loss": 0.0658, + "num_input_tokens_seen": 50170624, + "step": 41225 + }, + { + "epoch": 4.591825370308498, + "grad_norm": 0.14740164577960968, + "learning_rate": 4.748567674373873e-05, + "loss": 0.0565, + "num_input_tokens_seen": 50176896, + "step": 41230 + }, + { + "epoch": 4.5923822251921145, + "grad_norm": 0.10926930606365204, + "learning_rate": 4.748461466992918e-05, + "loss": 0.0378, + "num_input_tokens_seen": 50183008, + "step": 41235 + }, + { + "epoch": 4.592939080075732, + "grad_norm": 0.002253251848742366, + "learning_rate": 4.748355238373444e-05, + "loss": 0.0611, + "num_input_tokens_seen": 50188992, + "step": 41240 + }, + { + "epoch": 4.59349593495935, + "grad_norm": 0.07865574955940247, + "learning_rate": 4.7482489885164536e-05, + "loss": 0.0608, + "num_input_tokens_seen": 50194432, + "step": 41245 + }, + { + "epoch": 4.594052789842967, + "grad_norm": 2.4614369869232178, + "learning_rate": 4.748142717422949e-05, + "loss": 0.073, + "num_input_tokens_seen": 50200608, + "step": 41250 + }, + { + "epoch": 4.594609644726584, + "grad_norm": 1.1495710611343384, + "learning_rate": 4.748036425093936e-05, + "loss": 0.0948, + "num_input_tokens_seen": 50206592, + "step": 41255 + }, + { + "epoch": 4.595166499610202, + "grad_norm": 0.003468225011602044, + "learning_rate": 4.747930111530418e-05, + "loss": 0.0411, + "num_input_tokens_seen": 50212352, + "step": 41260 + }, + { + "epoch": 4.595723354493819, + "grad_norm": 0.2744416892528534, + "learning_rate": 4.747823776733399e-05, + "loss": 0.0526, + "num_input_tokens_seen": 50218720, + "step": 41265 + }, + { + "epoch": 4.596280209377436, + "grad_norm": 0.008282619528472424, + "learning_rate": 4.7477174207038836e-05, + "loss": 0.0678, + "num_input_tokens_seen": 50224672, + "step": 41270 + }, + { + "epoch": 4.596837064261053, + "grad_norm": 0.460273802280426, + "learning_rate": 4.747611043442876e-05, + "loss": 0.1306, + "num_input_tokens_seen": 50230624, + "step": 41275 + }, + { + "epoch": 4.597393919144671, + "grad_norm": 0.3128207325935364, + "learning_rate": 4.7475046449513807e-05, + "loss": 0.01, + "num_input_tokens_seen": 50236864, + "step": 41280 + }, + { + "epoch": 4.597950774028289, + "grad_norm": 1.4824576377868652, + "learning_rate": 4.747398225230404e-05, + "loss": 0.141, + "num_input_tokens_seen": 50242016, + "step": 41285 + }, + { + "epoch": 4.598507628911905, + "grad_norm": 0.32961586117744446, + "learning_rate": 4.74729178428095e-05, + "loss": 0.0157, + "num_input_tokens_seen": 50248192, + "step": 41290 + }, + { + "epoch": 4.599064483795523, + "grad_norm": 0.013876305893063545, + "learning_rate": 4.747185322104026e-05, + "loss": 0.1202, + "num_input_tokens_seen": 50254496, + "step": 41295 + }, + { + "epoch": 4.599621338679141, + "grad_norm": 0.7397966980934143, + "learning_rate": 4.747078838700635e-05, + "loss": 0.0678, + "num_input_tokens_seen": 50260416, + "step": 41300 + }, + { + "epoch": 4.6001781935627575, + "grad_norm": 0.13990803062915802, + "learning_rate": 4.7469723340717844e-05, + "loss": 0.0335, + "num_input_tokens_seen": 50266080, + "step": 41305 + }, + { + "epoch": 4.600735048446375, + "grad_norm": 1.4362916946411133, + "learning_rate": 4.74686580821848e-05, + "loss": 0.1041, + "num_input_tokens_seen": 50272032, + "step": 41310 + }, + { + "epoch": 4.601291903329992, + "grad_norm": 0.3804606795310974, + "learning_rate": 4.746759261141728e-05, + "loss": 0.1086, + "num_input_tokens_seen": 50278016, + "step": 41315 + }, + { + "epoch": 4.60184875821361, + "grad_norm": 0.0115408506244421, + "learning_rate": 4.746652692842534e-05, + "loss": 0.0092, + "num_input_tokens_seen": 50284160, + "step": 41320 + }, + { + "epoch": 4.602405613097227, + "grad_norm": 0.16778476536273956, + "learning_rate": 4.746546103321906e-05, + "loss": 0.1469, + "num_input_tokens_seen": 50290272, + "step": 41325 + }, + { + "epoch": 4.602962467980844, + "grad_norm": 0.2212495058774948, + "learning_rate": 4.74643949258085e-05, + "loss": 0.0554, + "num_input_tokens_seen": 50296448, + "step": 41330 + }, + { + "epoch": 4.603519322864462, + "grad_norm": 0.09001114219427109, + "learning_rate": 4.7463328606203727e-05, + "loss": 0.0862, + "num_input_tokens_seen": 50302784, + "step": 41335 + }, + { + "epoch": 4.6040761777480785, + "grad_norm": 0.9011178016662598, + "learning_rate": 4.746226207441482e-05, + "loss": 0.0455, + "num_input_tokens_seen": 50308992, + "step": 41340 + }, + { + "epoch": 4.604633032631696, + "grad_norm": 0.5411967635154724, + "learning_rate": 4.746119533045186e-05, + "loss": 0.1667, + "num_input_tokens_seen": 50315040, + "step": 41345 + }, + { + "epoch": 4.605189887515314, + "grad_norm": 0.2611207365989685, + "learning_rate": 4.7460128374324906e-05, + "loss": 0.1633, + "num_input_tokens_seen": 50321408, + "step": 41350 + }, + { + "epoch": 4.605746742398931, + "grad_norm": 0.001110354787670076, + "learning_rate": 4.7459061206044045e-05, + "loss": 0.0348, + "num_input_tokens_seen": 50327872, + "step": 41355 + }, + { + "epoch": 4.606303597282548, + "grad_norm": 0.046258341521024704, + "learning_rate": 4.7457993825619364e-05, + "loss": 0.0212, + "num_input_tokens_seen": 50334080, + "step": 41360 + }, + { + "epoch": 4.606860452166165, + "grad_norm": 0.0026710552629083395, + "learning_rate": 4.7456926233060926e-05, + "loss": 0.1253, + "num_input_tokens_seen": 50340352, + "step": 41365 + }, + { + "epoch": 4.607417307049783, + "grad_norm": 0.854693591594696, + "learning_rate": 4.7455858428378835e-05, + "loss": 0.1292, + "num_input_tokens_seen": 50345696, + "step": 41370 + }, + { + "epoch": 4.6079741619334005, + "grad_norm": 1.8064765930175781, + "learning_rate": 4.745479041158317e-05, + "loss": 0.0649, + "num_input_tokens_seen": 50352000, + "step": 41375 + }, + { + "epoch": 4.608531016817017, + "grad_norm": 0.050557222217321396, + "learning_rate": 4.745372218268402e-05, + "loss": 0.1209, + "num_input_tokens_seen": 50357248, + "step": 41380 + }, + { + "epoch": 4.609087871700635, + "grad_norm": 0.02571624517440796, + "learning_rate": 4.745265374169147e-05, + "loss": 0.1105, + "num_input_tokens_seen": 50363200, + "step": 41385 + }, + { + "epoch": 4.609644726584252, + "grad_norm": 0.26846423745155334, + "learning_rate": 4.745158508861562e-05, + "loss": 0.0492, + "num_input_tokens_seen": 50369376, + "step": 41390 + }, + { + "epoch": 4.610201581467869, + "grad_norm": 0.20899598300457, + "learning_rate": 4.7450516223466556e-05, + "loss": 0.0323, + "num_input_tokens_seen": 50375456, + "step": 41395 + }, + { + "epoch": 4.610758436351487, + "grad_norm": 2.7943339347839355, + "learning_rate": 4.744944714625439e-05, + "loss": 0.0972, + "num_input_tokens_seen": 50381536, + "step": 41400 + }, + { + "epoch": 4.611315291235104, + "grad_norm": 0.0394100546836853, + "learning_rate": 4.7448377856989205e-05, + "loss": 0.0589, + "num_input_tokens_seen": 50387648, + "step": 41405 + }, + { + "epoch": 4.6118721461187215, + "grad_norm": 0.8154076337814331, + "learning_rate": 4.74473083556811e-05, + "loss": 0.132, + "num_input_tokens_seen": 50393664, + "step": 41410 + }, + { + "epoch": 4.612429001002338, + "grad_norm": 0.6683099865913391, + "learning_rate": 4.744623864234018e-05, + "loss": 0.1149, + "num_input_tokens_seen": 50399968, + "step": 41415 + }, + { + "epoch": 4.612985855885956, + "grad_norm": 0.15019096434116364, + "learning_rate": 4.7445168716976564e-05, + "loss": 0.2352, + "num_input_tokens_seen": 50405792, + "step": 41420 + }, + { + "epoch": 4.613542710769574, + "grad_norm": 0.17707520723342896, + "learning_rate": 4.744409857960034e-05, + "loss": 0.0917, + "num_input_tokens_seen": 50411776, + "step": 41425 + }, + { + "epoch": 4.6140995656531905, + "grad_norm": 1.7005069255828857, + "learning_rate": 4.744302823022163e-05, + "loss": 0.2682, + "num_input_tokens_seen": 50417088, + "step": 41430 + }, + { + "epoch": 4.614656420536808, + "grad_norm": 0.9295014142990112, + "learning_rate": 4.744195766885053e-05, + "loss": 0.169, + "num_input_tokens_seen": 50423392, + "step": 41435 + }, + { + "epoch": 4.615213275420426, + "grad_norm": 0.6065666079521179, + "learning_rate": 4.744088689549716e-05, + "loss": 0.0413, + "num_input_tokens_seen": 50429696, + "step": 41440 + }, + { + "epoch": 4.615770130304043, + "grad_norm": 0.007206824142485857, + "learning_rate": 4.743981591017164e-05, + "loss": 0.0342, + "num_input_tokens_seen": 50436032, + "step": 41445 + }, + { + "epoch": 4.61632698518766, + "grad_norm": 0.43842369318008423, + "learning_rate": 4.7438744712884074e-05, + "loss": 0.0972, + "num_input_tokens_seen": 50442496, + "step": 41450 + }, + { + "epoch": 4.616883840071277, + "grad_norm": 0.017865128815174103, + "learning_rate": 4.743767330364459e-05, + "loss": 0.0892, + "num_input_tokens_seen": 50448736, + "step": 41455 + }, + { + "epoch": 4.617440694954895, + "grad_norm": 0.023497195914387703, + "learning_rate": 4.74366016824633e-05, + "loss": 0.0419, + "num_input_tokens_seen": 50454656, + "step": 41460 + }, + { + "epoch": 4.617997549838512, + "grad_norm": 0.09973467886447906, + "learning_rate": 4.743552984935034e-05, + "loss": 0.0179, + "num_input_tokens_seen": 50460800, + "step": 41465 + }, + { + "epoch": 4.618554404722129, + "grad_norm": 0.4367876350879669, + "learning_rate": 4.743445780431581e-05, + "loss": 0.0508, + "num_input_tokens_seen": 50467232, + "step": 41470 + }, + { + "epoch": 4.619111259605747, + "grad_norm": 0.0023541452828794718, + "learning_rate": 4.7433385547369866e-05, + "loss": 0.0155, + "num_input_tokens_seen": 50473216, + "step": 41475 + }, + { + "epoch": 4.6196681144893645, + "grad_norm": 1.168508768081665, + "learning_rate": 4.7432313078522616e-05, + "loss": 0.1588, + "num_input_tokens_seen": 50479424, + "step": 41480 + }, + { + "epoch": 4.620224969372981, + "grad_norm": 0.0053212339989840984, + "learning_rate": 4.74312403977842e-05, + "loss": 0.0646, + "num_input_tokens_seen": 50485568, + "step": 41485 + }, + { + "epoch": 4.620781824256599, + "grad_norm": 1.3347511291503906, + "learning_rate": 4.7430167505164746e-05, + "loss": 0.1576, + "num_input_tokens_seen": 50491584, + "step": 41490 + }, + { + "epoch": 4.621338679140216, + "grad_norm": 0.45730534195899963, + "learning_rate": 4.742909440067439e-05, + "loss": 0.0399, + "num_input_tokens_seen": 50497472, + "step": 41495 + }, + { + "epoch": 4.6218955340238335, + "grad_norm": 0.02687755972146988, + "learning_rate": 4.7428021084323266e-05, + "loss": 0.0797, + "num_input_tokens_seen": 50503008, + "step": 41500 + }, + { + "epoch": 4.622452388907451, + "grad_norm": 0.08031418919563293, + "learning_rate": 4.7426947556121515e-05, + "loss": 0.0877, + "num_input_tokens_seen": 50509120, + "step": 41505 + }, + { + "epoch": 4.623009243791068, + "grad_norm": 0.2938166856765747, + "learning_rate": 4.742587381607927e-05, + "loss": 0.0371, + "num_input_tokens_seen": 50515008, + "step": 41510 + }, + { + "epoch": 4.623566098674686, + "grad_norm": 0.029858654364943504, + "learning_rate": 4.742479986420669e-05, + "loss": 0.0498, + "num_input_tokens_seen": 50521216, + "step": 41515 + }, + { + "epoch": 4.624122953558302, + "grad_norm": 0.09846547245979309, + "learning_rate": 4.74237257005139e-05, + "loss": 0.0516, + "num_input_tokens_seen": 50527264, + "step": 41520 + }, + { + "epoch": 4.62467980844192, + "grad_norm": 0.5447275638580322, + "learning_rate": 4.742265132501106e-05, + "loss": 0.0553, + "num_input_tokens_seen": 50533376, + "step": 41525 + }, + { + "epoch": 4.625236663325538, + "grad_norm": 0.6398820281028748, + "learning_rate": 4.74215767377083e-05, + "loss": 0.0473, + "num_input_tokens_seen": 50539488, + "step": 41530 + }, + { + "epoch": 4.6257935182091545, + "grad_norm": 2.9662208557128906, + "learning_rate": 4.742050193861581e-05, + "loss": 0.2495, + "num_input_tokens_seen": 50545056, + "step": 41535 + }, + { + "epoch": 4.626350373092772, + "grad_norm": 0.04537445306777954, + "learning_rate": 4.74194269277437e-05, + "loss": 0.0491, + "num_input_tokens_seen": 50551520, + "step": 41540 + }, + { + "epoch": 4.626907227976389, + "grad_norm": 0.15122881531715393, + "learning_rate": 4.741835170510214e-05, + "loss": 0.0291, + "num_input_tokens_seen": 50557760, + "step": 41545 + }, + { + "epoch": 4.627464082860007, + "grad_norm": 1.046202301979065, + "learning_rate": 4.741727627070129e-05, + "loss": 0.1104, + "num_input_tokens_seen": 50563680, + "step": 41550 + }, + { + "epoch": 4.628020937743624, + "grad_norm": 0.44389307498931885, + "learning_rate": 4.74162006245513e-05, + "loss": 0.0684, + "num_input_tokens_seen": 50569824, + "step": 41555 + }, + { + "epoch": 4.628577792627241, + "grad_norm": 1.581783413887024, + "learning_rate": 4.7415124766662346e-05, + "loss": 0.1762, + "num_input_tokens_seen": 50576032, + "step": 41560 + }, + { + "epoch": 4.629134647510859, + "grad_norm": 0.6915684938430786, + "learning_rate": 4.7414048697044576e-05, + "loss": 0.0656, + "num_input_tokens_seen": 50581920, + "step": 41565 + }, + { + "epoch": 4.629691502394476, + "grad_norm": 0.0011368670966476202, + "learning_rate": 4.7412972415708156e-05, + "loss": 0.0208, + "num_input_tokens_seen": 50588128, + "step": 41570 + }, + { + "epoch": 4.630248357278093, + "grad_norm": 0.5451599359512329, + "learning_rate": 4.741189592266325e-05, + "loss": 0.0896, + "num_input_tokens_seen": 50594176, + "step": 41575 + }, + { + "epoch": 4.630805212161711, + "grad_norm": 0.12758441269397736, + "learning_rate": 4.741081921792004e-05, + "loss": 0.0281, + "num_input_tokens_seen": 50600160, + "step": 41580 + }, + { + "epoch": 4.631362067045328, + "grad_norm": 0.6522627472877502, + "learning_rate": 4.740974230148868e-05, + "loss": 0.0677, + "num_input_tokens_seen": 50606208, + "step": 41585 + }, + { + "epoch": 4.631918921928945, + "grad_norm": 0.11041900515556335, + "learning_rate": 4.7408665173379353e-05, + "loss": 0.0836, + "num_input_tokens_seen": 50612064, + "step": 41590 + }, + { + "epoch": 4.632475776812562, + "grad_norm": 0.5739631652832031, + "learning_rate": 4.740758783360223e-05, + "loss": 0.0735, + "num_input_tokens_seen": 50617856, + "step": 41595 + }, + { + "epoch": 4.63303263169618, + "grad_norm": 0.04866773635149002, + "learning_rate": 4.7406510282167486e-05, + "loss": 0.1128, + "num_input_tokens_seen": 50623936, + "step": 41600 + }, + { + "epoch": 4.6335894865797975, + "grad_norm": 0.029029300436377525, + "learning_rate": 4.74054325190853e-05, + "loss": 0.0197, + "num_input_tokens_seen": 50630336, + "step": 41605 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 0.017131999135017395, + "learning_rate": 4.740435454436586e-05, + "loss": 0.063, + "num_input_tokens_seen": 50636544, + "step": 41610 + }, + { + "epoch": 4.634703196347032, + "grad_norm": 0.0064665330573916435, + "learning_rate": 4.7403276358019334e-05, + "loss": 0.0331, + "num_input_tokens_seen": 50642752, + "step": 41615 + }, + { + "epoch": 4.63526005123065, + "grad_norm": 0.3164919316768646, + "learning_rate": 4.740219796005592e-05, + "loss": 0.0685, + "num_input_tokens_seen": 50648832, + "step": 41620 + }, + { + "epoch": 4.6358169061142664, + "grad_norm": 0.08005448430776596, + "learning_rate": 4.740111935048579e-05, + "loss": 0.0416, + "num_input_tokens_seen": 50654656, + "step": 41625 + }, + { + "epoch": 4.636373760997884, + "grad_norm": 0.6439313292503357, + "learning_rate": 4.740004052931914e-05, + "loss": 0.107, + "num_input_tokens_seen": 50660160, + "step": 41630 + }, + { + "epoch": 4.636930615881501, + "grad_norm": 0.26031216979026794, + "learning_rate": 4.7398961496566165e-05, + "loss": 0.0069, + "num_input_tokens_seen": 50666368, + "step": 41635 + }, + { + "epoch": 4.637487470765119, + "grad_norm": 0.9480453729629517, + "learning_rate": 4.739788225223705e-05, + "loss": 0.0618, + "num_input_tokens_seen": 50672640, + "step": 41640 + }, + { + "epoch": 4.638044325648736, + "grad_norm": 0.784076988697052, + "learning_rate": 4.7396802796342e-05, + "loss": 0.0702, + "num_input_tokens_seen": 50678336, + "step": 41645 + }, + { + "epoch": 4.638601180532353, + "grad_norm": 0.09156996011734009, + "learning_rate": 4.739572312889119e-05, + "loss": 0.0699, + "num_input_tokens_seen": 50684416, + "step": 41650 + }, + { + "epoch": 4.639158035415971, + "grad_norm": 0.15952087938785553, + "learning_rate": 4.7394643249894844e-05, + "loss": 0.0351, + "num_input_tokens_seen": 50690784, + "step": 41655 + }, + { + "epoch": 4.639714890299588, + "grad_norm": 0.7681717276573181, + "learning_rate": 4.739356315936314e-05, + "loss": 0.0913, + "num_input_tokens_seen": 50697088, + "step": 41660 + }, + { + "epoch": 4.640271745183205, + "grad_norm": 0.3232368528842926, + "learning_rate": 4.73924828573063e-05, + "loss": 0.0425, + "num_input_tokens_seen": 50703488, + "step": 41665 + }, + { + "epoch": 4.640828600066823, + "grad_norm": 2.0776989459991455, + "learning_rate": 4.7391402343734515e-05, + "loss": 0.161, + "num_input_tokens_seen": 50709440, + "step": 41670 + }, + { + "epoch": 4.64138545495044, + "grad_norm": 0.10069150477647781, + "learning_rate": 4.7390321618657994e-05, + "loss": 0.0419, + "num_input_tokens_seen": 50715776, + "step": 41675 + }, + { + "epoch": 4.641942309834057, + "grad_norm": 0.7832486629486084, + "learning_rate": 4.738924068208695e-05, + "loss": 0.1609, + "num_input_tokens_seen": 50721696, + "step": 41680 + }, + { + "epoch": 4.642499164717675, + "grad_norm": 0.3268754780292511, + "learning_rate": 4.738815953403158e-05, + "loss": 0.1304, + "num_input_tokens_seen": 50727808, + "step": 41685 + }, + { + "epoch": 4.643056019601292, + "grad_norm": 1.7450048923492432, + "learning_rate": 4.738707817450212e-05, + "loss": 0.0714, + "num_input_tokens_seen": 50733696, + "step": 41690 + }, + { + "epoch": 4.6436128744849094, + "grad_norm": 3.001732110977173, + "learning_rate": 4.7385996603508765e-05, + "loss": 0.0957, + "num_input_tokens_seen": 50739680, + "step": 41695 + }, + { + "epoch": 4.644169729368526, + "grad_norm": 0.006936698220670223, + "learning_rate": 4.738491482106173e-05, + "loss": 0.1188, + "num_input_tokens_seen": 50746176, + "step": 41700 + }, + { + "epoch": 4.644726584252144, + "grad_norm": 0.974943995475769, + "learning_rate": 4.738383282717125e-05, + "loss": 0.0905, + "num_input_tokens_seen": 50751872, + "step": 41705 + }, + { + "epoch": 4.645283439135762, + "grad_norm": 1.6690430641174316, + "learning_rate": 4.738275062184753e-05, + "loss": 0.2164, + "num_input_tokens_seen": 50757344, + "step": 41710 + }, + { + "epoch": 4.645840294019378, + "grad_norm": 0.11231369525194168, + "learning_rate": 4.73816682051008e-05, + "loss": 0.0125, + "num_input_tokens_seen": 50763488, + "step": 41715 + }, + { + "epoch": 4.646397148902996, + "grad_norm": 1.8466651439666748, + "learning_rate": 4.738058557694128e-05, + "loss": 0.112, + "num_input_tokens_seen": 50769696, + "step": 41720 + }, + { + "epoch": 4.646954003786613, + "grad_norm": 0.8719292283058167, + "learning_rate": 4.73795027373792e-05, + "loss": 0.0572, + "num_input_tokens_seen": 50775488, + "step": 41725 + }, + { + "epoch": 4.6475108586702305, + "grad_norm": 0.009138443507254124, + "learning_rate": 4.7378419686424786e-05, + "loss": 0.098, + "num_input_tokens_seen": 50781888, + "step": 41730 + }, + { + "epoch": 4.648067713553848, + "grad_norm": 1.1796495914459229, + "learning_rate": 4.737733642408827e-05, + "loss": 0.1186, + "num_input_tokens_seen": 50788000, + "step": 41735 + }, + { + "epoch": 4.648624568437465, + "grad_norm": 0.0373469740152359, + "learning_rate": 4.737625295037988e-05, + "loss": 0.0142, + "num_input_tokens_seen": 50794240, + "step": 41740 + }, + { + "epoch": 4.649181423321083, + "grad_norm": 0.5817003846168518, + "learning_rate": 4.737516926530986e-05, + "loss": 0.0511, + "num_input_tokens_seen": 50799552, + "step": 41745 + }, + { + "epoch": 4.649738278204699, + "grad_norm": 0.19753898680210114, + "learning_rate": 4.7374085368888436e-05, + "loss": 0.0321, + "num_input_tokens_seen": 50805632, + "step": 41750 + }, + { + "epoch": 4.650295133088317, + "grad_norm": 0.33729925751686096, + "learning_rate": 4.7373001261125836e-05, + "loss": 0.0162, + "num_input_tokens_seen": 50811648, + "step": 41755 + }, + { + "epoch": 4.650851987971935, + "grad_norm": 1.2652829885482788, + "learning_rate": 4.737191694203233e-05, + "loss": 0.1318, + "num_input_tokens_seen": 50817856, + "step": 41760 + }, + { + "epoch": 4.651408842855552, + "grad_norm": 0.7577183842658997, + "learning_rate": 4.737083241161814e-05, + "loss": 0.125, + "num_input_tokens_seen": 50823776, + "step": 41765 + }, + { + "epoch": 4.651965697739169, + "grad_norm": 0.5780395269393921, + "learning_rate": 4.736974766989351e-05, + "loss": 0.0951, + "num_input_tokens_seen": 50829856, + "step": 41770 + }, + { + "epoch": 4.652522552622786, + "grad_norm": 0.5099323391914368, + "learning_rate": 4.73686627168687e-05, + "loss": 0.1104, + "num_input_tokens_seen": 50836000, + "step": 41775 + }, + { + "epoch": 4.653079407506404, + "grad_norm": 0.018484298139810562, + "learning_rate": 4.736757755255394e-05, + "loss": 0.1162, + "num_input_tokens_seen": 50842080, + "step": 41780 + }, + { + "epoch": 4.653636262390021, + "grad_norm": 0.04014560952782631, + "learning_rate": 4.73664921769595e-05, + "loss": 0.0189, + "num_input_tokens_seen": 50848000, + "step": 41785 + }, + { + "epoch": 4.654193117273638, + "grad_norm": 0.18974488973617554, + "learning_rate": 4.7365406590095615e-05, + "loss": 0.106, + "num_input_tokens_seen": 50853984, + "step": 41790 + }, + { + "epoch": 4.654749972157256, + "grad_norm": 0.07059989869594574, + "learning_rate": 4.736432079197255e-05, + "loss": 0.0105, + "num_input_tokens_seen": 50860160, + "step": 41795 + }, + { + "epoch": 4.6553068270408735, + "grad_norm": 0.010137845762073994, + "learning_rate": 4.7363234782600554e-05, + "loss": 0.0421, + "num_input_tokens_seen": 50866368, + "step": 41800 + }, + { + "epoch": 4.65586368192449, + "grad_norm": 1.22166907787323, + "learning_rate": 4.736214856198989e-05, + "loss": 0.0638, + "num_input_tokens_seen": 50872128, + "step": 41805 + }, + { + "epoch": 4.656420536808108, + "grad_norm": 0.00817977823317051, + "learning_rate": 4.736106213015081e-05, + "loss": 0.017, + "num_input_tokens_seen": 50878432, + "step": 41810 + }, + { + "epoch": 4.656977391691725, + "grad_norm": 1.8388371467590332, + "learning_rate": 4.7359975487093586e-05, + "loss": 0.2286, + "num_input_tokens_seen": 50884256, + "step": 41815 + }, + { + "epoch": 4.657534246575342, + "grad_norm": 1.1440882682800293, + "learning_rate": 4.735888863282849e-05, + "loss": 0.0518, + "num_input_tokens_seen": 50890528, + "step": 41820 + }, + { + "epoch": 4.65809110145896, + "grad_norm": 0.019942529499530792, + "learning_rate": 4.735780156736577e-05, + "loss": 0.0319, + "num_input_tokens_seen": 50896384, + "step": 41825 + }, + { + "epoch": 4.658647956342577, + "grad_norm": 2.8485867977142334, + "learning_rate": 4.73567142907157e-05, + "loss": 0.0509, + "num_input_tokens_seen": 50902880, + "step": 41830 + }, + { + "epoch": 4.659204811226195, + "grad_norm": 1.0008291006088257, + "learning_rate": 4.735562680288855e-05, + "loss": 0.1364, + "num_input_tokens_seen": 50909024, + "step": 41835 + }, + { + "epoch": 4.659761666109812, + "grad_norm": 0.004691213369369507, + "learning_rate": 4.735453910389459e-05, + "loss": 0.1203, + "num_input_tokens_seen": 50915200, + "step": 41840 + }, + { + "epoch": 4.660318520993429, + "grad_norm": 1.4559508562088013, + "learning_rate": 4.73534511937441e-05, + "loss": 0.1204, + "num_input_tokens_seen": 50921280, + "step": 41845 + }, + { + "epoch": 4.660875375877047, + "grad_norm": 0.0036392463371157646, + "learning_rate": 4.735236307244736e-05, + "loss": 0.0067, + "num_input_tokens_seen": 50927776, + "step": 41850 + }, + { + "epoch": 4.6614322307606635, + "grad_norm": 0.0025599009823054075, + "learning_rate": 4.735127474001464e-05, + "loss": 0.1974, + "num_input_tokens_seen": 50933440, + "step": 41855 + }, + { + "epoch": 4.661989085644281, + "grad_norm": 0.9055846333503723, + "learning_rate": 4.735018619645623e-05, + "loss": 0.021, + "num_input_tokens_seen": 50939616, + "step": 41860 + }, + { + "epoch": 4.662545940527899, + "grad_norm": 0.16718438267707825, + "learning_rate": 4.734909744178239e-05, + "loss": 0.0309, + "num_input_tokens_seen": 50946240, + "step": 41865 + }, + { + "epoch": 4.663102795411516, + "grad_norm": 0.6066807508468628, + "learning_rate": 4.734800847600342e-05, + "loss": 0.0481, + "num_input_tokens_seen": 50952544, + "step": 41870 + }, + { + "epoch": 4.663659650295133, + "grad_norm": 0.14773757755756378, + "learning_rate": 4.734691929912962e-05, + "loss": 0.0745, + "num_input_tokens_seen": 50958944, + "step": 41875 + }, + { + "epoch": 4.66421650517875, + "grad_norm": 0.12291563302278519, + "learning_rate": 4.7345829911171254e-05, + "loss": 0.0421, + "num_input_tokens_seen": 50964992, + "step": 41880 + }, + { + "epoch": 4.664773360062368, + "grad_norm": 0.5725011825561523, + "learning_rate": 4.734474031213862e-05, + "loss": 0.0695, + "num_input_tokens_seen": 50971040, + "step": 41885 + }, + { + "epoch": 4.665330214945985, + "grad_norm": 1.087416172027588, + "learning_rate": 4.7343650502042013e-05, + "loss": 0.0738, + "num_input_tokens_seen": 50977280, + "step": 41890 + }, + { + "epoch": 4.665887069829602, + "grad_norm": 0.29048144817352295, + "learning_rate": 4.734256048089172e-05, + "loss": 0.0189, + "num_input_tokens_seen": 50983424, + "step": 41895 + }, + { + "epoch": 4.66644392471322, + "grad_norm": 0.12660743296146393, + "learning_rate": 4.734147024869805e-05, + "loss": 0.109, + "num_input_tokens_seen": 50989536, + "step": 41900 + }, + { + "epoch": 4.667000779596837, + "grad_norm": 0.7308066487312317, + "learning_rate": 4.734037980547129e-05, + "loss": 0.0485, + "num_input_tokens_seen": 50995584, + "step": 41905 + }, + { + "epoch": 4.667557634480454, + "grad_norm": 0.3454885482788086, + "learning_rate": 4.733928915122175e-05, + "loss": 0.1123, + "num_input_tokens_seen": 51001792, + "step": 41910 + }, + { + "epoch": 4.668114489364072, + "grad_norm": 0.1978663206100464, + "learning_rate": 4.733819828595972e-05, + "loss": 0.0608, + "num_input_tokens_seen": 51007936, + "step": 41915 + }, + { + "epoch": 4.668671344247689, + "grad_norm": 0.3342748284339905, + "learning_rate": 4.733710720969551e-05, + "loss": 0.1708, + "num_input_tokens_seen": 51014112, + "step": 41920 + }, + { + "epoch": 4.6692281991313065, + "grad_norm": 0.023697778582572937, + "learning_rate": 4.733601592243943e-05, + "loss": 0.0527, + "num_input_tokens_seen": 51020320, + "step": 41925 + }, + { + "epoch": 4.669785054014923, + "grad_norm": 0.985579252243042, + "learning_rate": 4.733492442420179e-05, + "loss": 0.1211, + "num_input_tokens_seen": 51026944, + "step": 41930 + }, + { + "epoch": 4.670341908898541, + "grad_norm": 0.4890253245830536, + "learning_rate": 4.733383271499288e-05, + "loss": 0.1076, + "num_input_tokens_seen": 51033216, + "step": 41935 + }, + { + "epoch": 4.670898763782159, + "grad_norm": 0.6831022500991821, + "learning_rate": 4.7332740794823033e-05, + "loss": 0.1883, + "num_input_tokens_seen": 51039264, + "step": 41940 + }, + { + "epoch": 4.671455618665775, + "grad_norm": 1.421508550643921, + "learning_rate": 4.7331648663702556e-05, + "loss": 0.0936, + "num_input_tokens_seen": 51045152, + "step": 41945 + }, + { + "epoch": 4.672012473549393, + "grad_norm": 0.7863033413887024, + "learning_rate": 4.733055632164177e-05, + "loss": 0.0398, + "num_input_tokens_seen": 51051200, + "step": 41950 + }, + { + "epoch": 4.67256932843301, + "grad_norm": 1.244002342224121, + "learning_rate": 4.7329463768650985e-05, + "loss": 0.1045, + "num_input_tokens_seen": 51056928, + "step": 41955 + }, + { + "epoch": 4.6731261833166275, + "grad_norm": 0.931011974811554, + "learning_rate": 4.7328371004740525e-05, + "loss": 0.0381, + "num_input_tokens_seen": 51062976, + "step": 41960 + }, + { + "epoch": 4.673683038200245, + "grad_norm": 0.8501803278923035, + "learning_rate": 4.732727802992071e-05, + "loss": 0.0273, + "num_input_tokens_seen": 51069184, + "step": 41965 + }, + { + "epoch": 4.674239893083862, + "grad_norm": 0.2844712436199188, + "learning_rate": 4.732618484420186e-05, + "loss": 0.082, + "num_input_tokens_seen": 51075296, + "step": 41970 + }, + { + "epoch": 4.67479674796748, + "grad_norm": 0.004165578167885542, + "learning_rate": 4.7325091447594314e-05, + "loss": 0.0852, + "num_input_tokens_seen": 51081344, + "step": 41975 + }, + { + "epoch": 4.675353602851097, + "grad_norm": 0.7946500778198242, + "learning_rate": 4.73239978401084e-05, + "loss": 0.0595, + "num_input_tokens_seen": 51086624, + "step": 41980 + }, + { + "epoch": 4.675910457734714, + "grad_norm": 0.5041038393974304, + "learning_rate": 4.732290402175443e-05, + "loss": 0.0908, + "num_input_tokens_seen": 51092352, + "step": 41985 + }, + { + "epoch": 4.676467312618332, + "grad_norm": 0.17355477809906006, + "learning_rate": 4.7321809992542755e-05, + "loss": 0.063, + "num_input_tokens_seen": 51098400, + "step": 41990 + }, + { + "epoch": 4.677024167501949, + "grad_norm": 0.14476843178272247, + "learning_rate": 4.732071575248369e-05, + "loss": 0.0411, + "num_input_tokens_seen": 51104640, + "step": 41995 + }, + { + "epoch": 4.677581022385566, + "grad_norm": 0.09690835326910019, + "learning_rate": 4.731962130158759e-05, + "loss": 0.0462, + "num_input_tokens_seen": 51110816, + "step": 42000 + }, + { + "epoch": 4.678137877269184, + "grad_norm": 0.28502070903778076, + "learning_rate": 4.731852663986478e-05, + "loss": 0.0522, + "num_input_tokens_seen": 51117120, + "step": 42005 + }, + { + "epoch": 4.678694732152801, + "grad_norm": 0.012438382022082806, + "learning_rate": 4.73174317673256e-05, + "loss": 0.0692, + "num_input_tokens_seen": 51123104, + "step": 42010 + }, + { + "epoch": 4.679251587036418, + "grad_norm": 0.023610709235072136, + "learning_rate": 4.731633668398041e-05, + "loss": 0.0297, + "num_input_tokens_seen": 51128960, + "step": 42015 + }, + { + "epoch": 4.679808441920036, + "grad_norm": 0.3715662956237793, + "learning_rate": 4.731524138983953e-05, + "loss": 0.0793, + "num_input_tokens_seen": 51135136, + "step": 42020 + }, + { + "epoch": 4.680365296803653, + "grad_norm": 0.14611856639385223, + "learning_rate": 4.7314145884913316e-05, + "loss": 0.0849, + "num_input_tokens_seen": 51141024, + "step": 42025 + }, + { + "epoch": 4.6809221516872705, + "grad_norm": 0.17488987743854523, + "learning_rate": 4.731305016921213e-05, + "loss": 0.0402, + "num_input_tokens_seen": 51147072, + "step": 42030 + }, + { + "epoch": 4.681479006570887, + "grad_norm": 0.3421548306941986, + "learning_rate": 4.731195424274629e-05, + "loss": 0.0303, + "num_input_tokens_seen": 51153152, + "step": 42035 + }, + { + "epoch": 4.682035861454505, + "grad_norm": 0.0026748960372060537, + "learning_rate": 4.7310858105526176e-05, + "loss": 0.0717, + "num_input_tokens_seen": 51159456, + "step": 42040 + }, + { + "epoch": 4.682592716338123, + "grad_norm": 0.2524394094944, + "learning_rate": 4.730976175756213e-05, + "loss": 0.0182, + "num_input_tokens_seen": 51165856, + "step": 42045 + }, + { + "epoch": 4.6831495712217395, + "grad_norm": 0.07118020206689835, + "learning_rate": 4.730866519886451e-05, + "loss": 0.0512, + "num_input_tokens_seen": 51171712, + "step": 42050 + }, + { + "epoch": 4.683706426105357, + "grad_norm": 0.013920044526457787, + "learning_rate": 4.730756842944368e-05, + "loss": 0.0634, + "num_input_tokens_seen": 51178080, + "step": 42055 + }, + { + "epoch": 4.684263280988974, + "grad_norm": 0.19416676461696625, + "learning_rate": 4.730647144930999e-05, + "loss": 0.1844, + "num_input_tokens_seen": 51183648, + "step": 42060 + }, + { + "epoch": 4.684820135872592, + "grad_norm": 0.4474233090877533, + "learning_rate": 4.73053742584738e-05, + "loss": 0.1312, + "num_input_tokens_seen": 51188928, + "step": 42065 + }, + { + "epoch": 4.685376990756209, + "grad_norm": 0.16343720257282257, + "learning_rate": 4.7304276856945484e-05, + "loss": 0.0347, + "num_input_tokens_seen": 51195232, + "step": 42070 + }, + { + "epoch": 4.685933845639826, + "grad_norm": 0.34388190507888794, + "learning_rate": 4.73031792447354e-05, + "loss": 0.1488, + "num_input_tokens_seen": 51201024, + "step": 42075 + }, + { + "epoch": 4.686490700523444, + "grad_norm": 0.3451080620288849, + "learning_rate": 4.7302081421853914e-05, + "loss": 0.0241, + "num_input_tokens_seen": 51207040, + "step": 42080 + }, + { + "epoch": 4.6870475554070605, + "grad_norm": 0.04903402924537659, + "learning_rate": 4.730098338831141e-05, + "loss": 0.1054, + "num_input_tokens_seen": 51213152, + "step": 42085 + }, + { + "epoch": 4.687604410290678, + "grad_norm": 0.16867811977863312, + "learning_rate": 4.729988514411825e-05, + "loss": 0.0287, + "num_input_tokens_seen": 51219328, + "step": 42090 + }, + { + "epoch": 4.688161265174296, + "grad_norm": 0.7589789032936096, + "learning_rate": 4.729878668928481e-05, + "loss": 0.0998, + "num_input_tokens_seen": 51225600, + "step": 42095 + }, + { + "epoch": 4.688718120057913, + "grad_norm": 0.17305941879749298, + "learning_rate": 4.729768802382146e-05, + "loss": 0.0631, + "num_input_tokens_seen": 51231552, + "step": 42100 + }, + { + "epoch": 4.68927497494153, + "grad_norm": 0.36899322271347046, + "learning_rate": 4.729658914773858e-05, + "loss": 0.0419, + "num_input_tokens_seen": 51237632, + "step": 42105 + }, + { + "epoch": 4.689831829825147, + "grad_norm": 0.26668256521224976, + "learning_rate": 4.729549006104655e-05, + "loss": 0.04, + "num_input_tokens_seen": 51243520, + "step": 42110 + }, + { + "epoch": 4.690388684708765, + "grad_norm": 0.5404312014579773, + "learning_rate": 4.729439076375576e-05, + "loss": 0.0527, + "num_input_tokens_seen": 51249856, + "step": 42115 + }, + { + "epoch": 4.6909455395923825, + "grad_norm": 0.10478606820106506, + "learning_rate": 4.729329125587659e-05, + "loss": 0.06, + "num_input_tokens_seen": 51256160, + "step": 42120 + }, + { + "epoch": 4.691502394475999, + "grad_norm": 0.17587442696094513, + "learning_rate": 4.7292191537419416e-05, + "loss": 0.0906, + "num_input_tokens_seen": 51262592, + "step": 42125 + }, + { + "epoch": 4.692059249359617, + "grad_norm": 0.14573942124843597, + "learning_rate": 4.7291091608394636e-05, + "loss": 0.0695, + "num_input_tokens_seen": 51268480, + "step": 42130 + }, + { + "epoch": 4.692616104243234, + "grad_norm": 1.1215442419052124, + "learning_rate": 4.7289991468812636e-05, + "loss": 0.095, + "num_input_tokens_seen": 51274720, + "step": 42135 + }, + { + "epoch": 4.693172959126851, + "grad_norm": 1.49215567111969, + "learning_rate": 4.7288891118683806e-05, + "loss": 0.1036, + "num_input_tokens_seen": 51280832, + "step": 42140 + }, + { + "epoch": 4.693729814010469, + "grad_norm": 0.611862063407898, + "learning_rate": 4.728779055801855e-05, + "loss": 0.0648, + "num_input_tokens_seen": 51286496, + "step": 42145 + }, + { + "epoch": 4.694286668894086, + "grad_norm": 0.008023451082408428, + "learning_rate": 4.728668978682725e-05, + "loss": 0.0493, + "num_input_tokens_seen": 51292704, + "step": 42150 + }, + { + "epoch": 4.6948435237777035, + "grad_norm": 0.2303771823644638, + "learning_rate": 4.728558880512031e-05, + "loss": 0.1313, + "num_input_tokens_seen": 51298848, + "step": 42155 + }, + { + "epoch": 4.695400378661321, + "grad_norm": 0.008177111856639385, + "learning_rate": 4.728448761290812e-05, + "loss": 0.0342, + "num_input_tokens_seen": 51305088, + "step": 42160 + }, + { + "epoch": 4.695957233544938, + "grad_norm": 0.04025191441178322, + "learning_rate": 4.7283386210201096e-05, + "loss": 0.0839, + "num_input_tokens_seen": 51311008, + "step": 42165 + }, + { + "epoch": 4.696514088428556, + "grad_norm": 0.6199379563331604, + "learning_rate": 4.728228459700964e-05, + "loss": 0.045, + "num_input_tokens_seen": 51316768, + "step": 42170 + }, + { + "epoch": 4.697070943312173, + "grad_norm": 0.016113199293613434, + "learning_rate": 4.728118277334416e-05, + "loss": 0.0227, + "num_input_tokens_seen": 51323104, + "step": 42175 + }, + { + "epoch": 4.69762779819579, + "grad_norm": 0.6210381388664246, + "learning_rate": 4.7280080739215045e-05, + "loss": 0.0341, + "num_input_tokens_seen": 51329184, + "step": 42180 + }, + { + "epoch": 4.698184653079408, + "grad_norm": 0.03741588070988655, + "learning_rate": 4.727897849463272e-05, + "loss": 0.0582, + "num_input_tokens_seen": 51335136, + "step": 42185 + }, + { + "epoch": 4.698741507963025, + "grad_norm": 0.03073565848171711, + "learning_rate": 4.727787603960759e-05, + "loss": 0.0836, + "num_input_tokens_seen": 51341024, + "step": 42190 + }, + { + "epoch": 4.699298362846642, + "grad_norm": 0.3211019039154053, + "learning_rate": 4.727677337415008e-05, + "loss": 0.085, + "num_input_tokens_seen": 51346400, + "step": 42195 + }, + { + "epoch": 4.69985521773026, + "grad_norm": 0.032001793384552, + "learning_rate": 4.72756704982706e-05, + "loss": 0.0781, + "num_input_tokens_seen": 51352576, + "step": 42200 + }, + { + "epoch": 4.700412072613877, + "grad_norm": 0.025579892098903656, + "learning_rate": 4.727456741197955e-05, + "loss": 0.0266, + "num_input_tokens_seen": 51358816, + "step": 42205 + }, + { + "epoch": 4.700968927497494, + "grad_norm": 0.03592054545879364, + "learning_rate": 4.727346411528737e-05, + "loss": 0.0465, + "num_input_tokens_seen": 51364768, + "step": 42210 + }, + { + "epoch": 4.701525782381111, + "grad_norm": 1.0697439908981323, + "learning_rate": 4.727236060820449e-05, + "loss": 0.0815, + "num_input_tokens_seen": 51370496, + "step": 42215 + }, + { + "epoch": 4.702082637264729, + "grad_norm": 1.1722288131713867, + "learning_rate": 4.7271256890741306e-05, + "loss": 0.1435, + "num_input_tokens_seen": 51376480, + "step": 42220 + }, + { + "epoch": 4.7026394921483465, + "grad_norm": 0.1733856499195099, + "learning_rate": 4.727015296290826e-05, + "loss": 0.0742, + "num_input_tokens_seen": 51382624, + "step": 42225 + }, + { + "epoch": 4.703196347031963, + "grad_norm": 0.027446618303656578, + "learning_rate": 4.726904882471578e-05, + "loss": 0.0652, + "num_input_tokens_seen": 51388576, + "step": 42230 + }, + { + "epoch": 4.703753201915581, + "grad_norm": 1.2219231128692627, + "learning_rate": 4.7267944476174285e-05, + "loss": 0.0751, + "num_input_tokens_seen": 51394400, + "step": 42235 + }, + { + "epoch": 4.704310056799198, + "grad_norm": 0.7987086772918701, + "learning_rate": 4.726683991729422e-05, + "loss": 0.1191, + "num_input_tokens_seen": 51400640, + "step": 42240 + }, + { + "epoch": 4.7048669116828155, + "grad_norm": 0.9476602077484131, + "learning_rate": 4.726573514808601e-05, + "loss": 0.0747, + "num_input_tokens_seen": 51406944, + "step": 42245 + }, + { + "epoch": 4.705423766566433, + "grad_norm": 1.1068549156188965, + "learning_rate": 4.7264630168560095e-05, + "loss": 0.0546, + "num_input_tokens_seen": 51413184, + "step": 42250 + }, + { + "epoch": 4.70598062145005, + "grad_norm": 0.0010163926053792238, + "learning_rate": 4.726352497872691e-05, + "loss": 0.1407, + "num_input_tokens_seen": 51419200, + "step": 42255 + }, + { + "epoch": 4.706537476333668, + "grad_norm": 0.5478881001472473, + "learning_rate": 4.726241957859689e-05, + "loss": 0.0383, + "num_input_tokens_seen": 51425312, + "step": 42260 + }, + { + "epoch": 4.707094331217284, + "grad_norm": 0.6438440680503845, + "learning_rate": 4.726131396818049e-05, + "loss": 0.024, + "num_input_tokens_seen": 51431616, + "step": 42265 + }, + { + "epoch": 4.707651186100902, + "grad_norm": 1.1897759437561035, + "learning_rate": 4.726020814748813e-05, + "loss": 0.0422, + "num_input_tokens_seen": 51437760, + "step": 42270 + }, + { + "epoch": 4.70820804098452, + "grad_norm": 0.04042166844010353, + "learning_rate": 4.7259102116530275e-05, + "loss": 0.0437, + "num_input_tokens_seen": 51443680, + "step": 42275 + }, + { + "epoch": 4.7087648958681365, + "grad_norm": 0.19780628383159637, + "learning_rate": 4.7257995875317377e-05, + "loss": 0.0646, + "num_input_tokens_seen": 51449760, + "step": 42280 + }, + { + "epoch": 4.709321750751754, + "grad_norm": 0.0015209036646410823, + "learning_rate": 4.725688942385986e-05, + "loss": 0.0773, + "num_input_tokens_seen": 51455968, + "step": 42285 + }, + { + "epoch": 4.709878605635371, + "grad_norm": 0.7474430799484253, + "learning_rate": 4.72557827621682e-05, + "loss": 0.071, + "num_input_tokens_seen": 51462336, + "step": 42290 + }, + { + "epoch": 4.710435460518989, + "grad_norm": 0.2994537949562073, + "learning_rate": 4.7254675890252836e-05, + "loss": 0.1021, + "num_input_tokens_seen": 51468640, + "step": 42295 + }, + { + "epoch": 4.710992315402606, + "grad_norm": 0.10859353095293045, + "learning_rate": 4.725356880812423e-05, + "loss": 0.0199, + "num_input_tokens_seen": 51474752, + "step": 42300 + }, + { + "epoch": 4.711549170286223, + "grad_norm": 0.20512716472148895, + "learning_rate": 4.7252461515792834e-05, + "loss": 0.0153, + "num_input_tokens_seen": 51480864, + "step": 42305 + }, + { + "epoch": 4.712106025169841, + "grad_norm": 0.08069781213998795, + "learning_rate": 4.725135401326912e-05, + "loss": 0.115, + "num_input_tokens_seen": 51486880, + "step": 42310 + }, + { + "epoch": 4.712662880053458, + "grad_norm": 0.8885907530784607, + "learning_rate": 4.7250246300563525e-05, + "loss": 0.0876, + "num_input_tokens_seen": 51492832, + "step": 42315 + }, + { + "epoch": 4.713219734937075, + "grad_norm": 0.01618180423974991, + "learning_rate": 4.724913837768654e-05, + "loss": 0.0731, + "num_input_tokens_seen": 51499296, + "step": 42320 + }, + { + "epoch": 4.713776589820693, + "grad_norm": 0.9158036708831787, + "learning_rate": 4.724803024464861e-05, + "loss": 0.1953, + "num_input_tokens_seen": 51505440, + "step": 42325 + }, + { + "epoch": 4.71433344470431, + "grad_norm": 0.5530790090560913, + "learning_rate": 4.7246921901460215e-05, + "loss": 0.0947, + "num_input_tokens_seen": 51511648, + "step": 42330 + }, + { + "epoch": 4.714890299587927, + "grad_norm": 1.6998096704483032, + "learning_rate": 4.724581334813182e-05, + "loss": 0.0798, + "num_input_tokens_seen": 51517920, + "step": 42335 + }, + { + "epoch": 4.715447154471545, + "grad_norm": 0.061137571930885315, + "learning_rate": 4.724470458467389e-05, + "loss": 0.038, + "num_input_tokens_seen": 51524640, + "step": 42340 + }, + { + "epoch": 4.716004009355162, + "grad_norm": 0.001481468789279461, + "learning_rate": 4.72435956110969e-05, + "loss": 0.102, + "num_input_tokens_seen": 51530624, + "step": 42345 + }, + { + "epoch": 4.7165608642387795, + "grad_norm": 0.3271067440509796, + "learning_rate": 4.7242486427411337e-05, + "loss": 0.0658, + "num_input_tokens_seen": 51536480, + "step": 42350 + }, + { + "epoch": 4.717117719122397, + "grad_norm": 0.4593895375728607, + "learning_rate": 4.7241377033627664e-05, + "loss": 0.0455, + "num_input_tokens_seen": 51541920, + "step": 42355 + }, + { + "epoch": 4.717674574006014, + "grad_norm": 0.3300146758556366, + "learning_rate": 4.724026742975637e-05, + "loss": 0.0296, + "num_input_tokens_seen": 51547936, + "step": 42360 + }, + { + "epoch": 4.718231428889632, + "grad_norm": 0.8923298716545105, + "learning_rate": 4.723915761580793e-05, + "loss": 0.2162, + "num_input_tokens_seen": 51553504, + "step": 42365 + }, + { + "epoch": 4.718788283773248, + "grad_norm": 0.3603648245334625, + "learning_rate": 4.723804759179282e-05, + "loss": 0.0784, + "num_input_tokens_seen": 51559744, + "step": 42370 + }, + { + "epoch": 4.719345138656866, + "grad_norm": 0.02998793125152588, + "learning_rate": 4.7236937357721546e-05, + "loss": 0.0212, + "num_input_tokens_seen": 51565824, + "step": 42375 + }, + { + "epoch": 4.719901993540484, + "grad_norm": 0.010555705055594444, + "learning_rate": 4.723582691360458e-05, + "loss": 0.009, + "num_input_tokens_seen": 51572224, + "step": 42380 + }, + { + "epoch": 4.720458848424101, + "grad_norm": 0.7612513899803162, + "learning_rate": 4.7234716259452406e-05, + "loss": 0.123, + "num_input_tokens_seen": 51577568, + "step": 42385 + }, + { + "epoch": 4.721015703307718, + "grad_norm": 0.9454023838043213, + "learning_rate": 4.723360539527553e-05, + "loss": 0.0937, + "num_input_tokens_seen": 51583712, + "step": 42390 + }, + { + "epoch": 4.721572558191335, + "grad_norm": 1.4716558456420898, + "learning_rate": 4.723249432108443e-05, + "loss": 0.0799, + "num_input_tokens_seen": 51590112, + "step": 42395 + }, + { + "epoch": 4.722129413074953, + "grad_norm": 1.7380609512329102, + "learning_rate": 4.723138303688961e-05, + "loss": 0.1806, + "num_input_tokens_seen": 51596320, + "step": 42400 + }, + { + "epoch": 4.72268626795857, + "grad_norm": 0.6096693277359009, + "learning_rate": 4.723027154270157e-05, + "loss": 0.0234, + "num_input_tokens_seen": 51602272, + "step": 42405 + }, + { + "epoch": 4.723243122842187, + "grad_norm": 0.7383288741111755, + "learning_rate": 4.722915983853081e-05, + "loss": 0.1578, + "num_input_tokens_seen": 51608576, + "step": 42410 + }, + { + "epoch": 4.723799977725805, + "grad_norm": 0.0010881322668865323, + "learning_rate": 4.722804792438782e-05, + "loss": 0.0415, + "num_input_tokens_seen": 51614720, + "step": 42415 + }, + { + "epoch": 4.724356832609422, + "grad_norm": 0.17320485413074493, + "learning_rate": 4.72269358002831e-05, + "loss": 0.0187, + "num_input_tokens_seen": 51621024, + "step": 42420 + }, + { + "epoch": 4.724913687493039, + "grad_norm": 0.04339317977428436, + "learning_rate": 4.7225823466227176e-05, + "loss": 0.032, + "num_input_tokens_seen": 51627296, + "step": 42425 + }, + { + "epoch": 4.725470542376657, + "grad_norm": 0.0023271830286830664, + "learning_rate": 4.722471092223054e-05, + "loss": 0.0644, + "num_input_tokens_seen": 51633280, + "step": 42430 + }, + { + "epoch": 4.726027397260274, + "grad_norm": 0.0014807234983891249, + "learning_rate": 4.722359816830369e-05, + "loss": 0.1288, + "num_input_tokens_seen": 51639296, + "step": 42435 + }, + { + "epoch": 4.726584252143891, + "grad_norm": 0.3457242250442505, + "learning_rate": 4.722248520445717e-05, + "loss": 0.0114, + "num_input_tokens_seen": 51644832, + "step": 42440 + }, + { + "epoch": 4.727141107027508, + "grad_norm": 0.0011037936201319098, + "learning_rate": 4.7221372030701466e-05, + "loss": 0.069, + "num_input_tokens_seen": 51651424, + "step": 42445 + }, + { + "epoch": 4.727697961911126, + "grad_norm": 0.017917629331350327, + "learning_rate": 4.72202586470471e-05, + "loss": 0.1381, + "num_input_tokens_seen": 51657888, + "step": 42450 + }, + { + "epoch": 4.728254816794744, + "grad_norm": 0.14899644255638123, + "learning_rate": 4.7219145053504584e-05, + "loss": 0.1362, + "num_input_tokens_seen": 51664000, + "step": 42455 + }, + { + "epoch": 4.72881167167836, + "grad_norm": 0.35766804218292236, + "learning_rate": 4.7218031250084444e-05, + "loss": 0.0327, + "num_input_tokens_seen": 51670016, + "step": 42460 + }, + { + "epoch": 4.729368526561978, + "grad_norm": 0.01763175241649151, + "learning_rate": 4.72169172367972e-05, + "loss": 0.0557, + "num_input_tokens_seen": 51675936, + "step": 42465 + }, + { + "epoch": 4.729925381445595, + "grad_norm": 0.0004547943826764822, + "learning_rate": 4.721580301365337e-05, + "loss": 0.0616, + "num_input_tokens_seen": 51682240, + "step": 42470 + }, + { + "epoch": 4.7304822363292125, + "grad_norm": 0.08289405703544617, + "learning_rate": 4.721468858066348e-05, + "loss": 0.0827, + "num_input_tokens_seen": 51688512, + "step": 42475 + }, + { + "epoch": 4.73103909121283, + "grad_norm": 0.3755168318748474, + "learning_rate": 4.721357393783806e-05, + "loss": 0.1185, + "num_input_tokens_seen": 51694496, + "step": 42480 + }, + { + "epoch": 4.731595946096447, + "grad_norm": 0.8543809056282043, + "learning_rate": 4.721245908518764e-05, + "loss": 0.0475, + "num_input_tokens_seen": 51700736, + "step": 42485 + }, + { + "epoch": 4.732152800980065, + "grad_norm": 0.6776341199874878, + "learning_rate": 4.721134402272274e-05, + "loss": 0.0679, + "num_input_tokens_seen": 51707008, + "step": 42490 + }, + { + "epoch": 4.732709655863682, + "grad_norm": 0.9649008512496948, + "learning_rate": 4.721022875045391e-05, + "loss": 0.0995, + "num_input_tokens_seen": 51713152, + "step": 42495 + }, + { + "epoch": 4.733266510747299, + "grad_norm": 1.573793888092041, + "learning_rate": 4.720911326839167e-05, + "loss": 0.0882, + "num_input_tokens_seen": 51719264, + "step": 42500 + }, + { + "epoch": 4.733823365630917, + "grad_norm": 0.015926674008369446, + "learning_rate": 4.720799757654656e-05, + "loss": 0.0957, + "num_input_tokens_seen": 51725888, + "step": 42505 + }, + { + "epoch": 4.7343802205145336, + "grad_norm": 1.0627797842025757, + "learning_rate": 4.720688167492912e-05, + "loss": 0.0402, + "num_input_tokens_seen": 51732000, + "step": 42510 + }, + { + "epoch": 4.734937075398151, + "grad_norm": 0.19760975241661072, + "learning_rate": 4.72057655635499e-05, + "loss": 0.022, + "num_input_tokens_seen": 51738272, + "step": 42515 + }, + { + "epoch": 4.735493930281769, + "grad_norm": 0.280289888381958, + "learning_rate": 4.720464924241942e-05, + "loss": 0.0191, + "num_input_tokens_seen": 51744544, + "step": 42520 + }, + { + "epoch": 4.736050785165386, + "grad_norm": 0.6340722441673279, + "learning_rate": 4.720353271154824e-05, + "loss": 0.0473, + "num_input_tokens_seen": 51750592, + "step": 42525 + }, + { + "epoch": 4.736607640049003, + "grad_norm": 0.41078615188598633, + "learning_rate": 4.720241597094691e-05, + "loss": 0.1378, + "num_input_tokens_seen": 51756640, + "step": 42530 + }, + { + "epoch": 4.737164494932621, + "grad_norm": 0.11922374367713928, + "learning_rate": 4.720129902062597e-05, + "loss": 0.1173, + "num_input_tokens_seen": 51762848, + "step": 42535 + }, + { + "epoch": 4.737721349816238, + "grad_norm": 0.8121181726455688, + "learning_rate": 4.7200181860595975e-05, + "loss": 0.0485, + "num_input_tokens_seen": 51769344, + "step": 42540 + }, + { + "epoch": 4.7382782046998555, + "grad_norm": 2.6968631744384766, + "learning_rate": 4.7199064490867473e-05, + "loss": 0.1065, + "num_input_tokens_seen": 51775424, + "step": 42545 + }, + { + "epoch": 4.738835059583472, + "grad_norm": 0.5206444263458252, + "learning_rate": 4.719794691145103e-05, + "loss": 0.0316, + "num_input_tokens_seen": 51781824, + "step": 42550 + }, + { + "epoch": 4.73939191446709, + "grad_norm": 0.10832163691520691, + "learning_rate": 4.719682912235718e-05, + "loss": 0.0214, + "num_input_tokens_seen": 51787840, + "step": 42555 + }, + { + "epoch": 4.739948769350708, + "grad_norm": 0.2214428037405014, + "learning_rate": 4.719571112359651e-05, + "loss": 0.093, + "num_input_tokens_seen": 51793952, + "step": 42560 + }, + { + "epoch": 4.740505624234324, + "grad_norm": 0.658343493938446, + "learning_rate": 4.7194592915179555e-05, + "loss": 0.0803, + "num_input_tokens_seen": 51800128, + "step": 42565 + }, + { + "epoch": 4.741062479117942, + "grad_norm": 0.1097501739859581, + "learning_rate": 4.719347449711689e-05, + "loss": 0.0257, + "num_input_tokens_seen": 51806336, + "step": 42570 + }, + { + "epoch": 4.741619334001559, + "grad_norm": 2.0130844116210938, + "learning_rate": 4.719235586941908e-05, + "loss": 0.0453, + "num_input_tokens_seen": 51812672, + "step": 42575 + }, + { + "epoch": 4.742176188885177, + "grad_norm": 1.8485475778579712, + "learning_rate": 4.7191237032096685e-05, + "loss": 0.1714, + "num_input_tokens_seen": 51818656, + "step": 42580 + }, + { + "epoch": 4.742733043768794, + "grad_norm": 0.05979285016655922, + "learning_rate": 4.719011798516028e-05, + "loss": 0.0853, + "num_input_tokens_seen": 51824640, + "step": 42585 + }, + { + "epoch": 4.743289898652411, + "grad_norm": 1.179791808128357, + "learning_rate": 4.7188998728620424e-05, + "loss": 0.1027, + "num_input_tokens_seen": 51830880, + "step": 42590 + }, + { + "epoch": 4.743846753536029, + "grad_norm": 0.016484957188367844, + "learning_rate": 4.718787926248771e-05, + "loss": 0.1124, + "num_input_tokens_seen": 51837248, + "step": 42595 + }, + { + "epoch": 4.7444036084196455, + "grad_norm": 0.006652201991528273, + "learning_rate": 4.718675958677269e-05, + "loss": 0.09, + "num_input_tokens_seen": 51843392, + "step": 42600 + }, + { + "epoch": 4.744960463303263, + "grad_norm": 0.1054287701845169, + "learning_rate": 4.718563970148596e-05, + "loss": 0.0551, + "num_input_tokens_seen": 51849568, + "step": 42605 + }, + { + "epoch": 4.745517318186881, + "grad_norm": 0.8762446641921997, + "learning_rate": 4.718451960663808e-05, + "loss": 0.1419, + "num_input_tokens_seen": 51855520, + "step": 42610 + }, + { + "epoch": 4.746074173070498, + "grad_norm": 0.3071182072162628, + "learning_rate": 4.718339930223964e-05, + "loss": 0.0681, + "num_input_tokens_seen": 51861728, + "step": 42615 + }, + { + "epoch": 4.746631027954115, + "grad_norm": 0.9311138987541199, + "learning_rate": 4.718227878830122e-05, + "loss": 0.1114, + "num_input_tokens_seen": 51867872, + "step": 42620 + }, + { + "epoch": 4.747187882837732, + "grad_norm": 1.5109937191009521, + "learning_rate": 4.7181158064833406e-05, + "loss": 0.1671, + "num_input_tokens_seen": 51874272, + "step": 42625 + }, + { + "epoch": 4.74774473772135, + "grad_norm": 0.7780439853668213, + "learning_rate": 4.7180037131846784e-05, + "loss": 0.0427, + "num_input_tokens_seen": 51880064, + "step": 42630 + }, + { + "epoch": 4.748301592604967, + "grad_norm": 1.5229926109313965, + "learning_rate": 4.7178915989351936e-05, + "loss": 0.1593, + "num_input_tokens_seen": 51886144, + "step": 42635 + }, + { + "epoch": 4.748858447488584, + "grad_norm": 0.20781424641609192, + "learning_rate": 4.717779463735946e-05, + "loss": 0.0557, + "num_input_tokens_seen": 51892640, + "step": 42640 + }, + { + "epoch": 4.749415302372202, + "grad_norm": 0.0919112041592598, + "learning_rate": 4.717667307587995e-05, + "loss": 0.0097, + "num_input_tokens_seen": 51899008, + "step": 42645 + }, + { + "epoch": 4.749972157255819, + "grad_norm": 0.5013104677200317, + "learning_rate": 4.717555130492399e-05, + "loss": 0.0534, + "num_input_tokens_seen": 51904992, + "step": 42650 + }, + { + "epoch": 4.750529012139436, + "grad_norm": 1.5169695615768433, + "learning_rate": 4.717442932450218e-05, + "loss": 0.1934, + "num_input_tokens_seen": 51911072, + "step": 42655 + }, + { + "epoch": 4.751085867023054, + "grad_norm": 0.019177475944161415, + "learning_rate": 4.717330713462512e-05, + "loss": 0.0375, + "num_input_tokens_seen": 51916704, + "step": 42660 + }, + { + "epoch": 4.751642721906671, + "grad_norm": 1.0574170351028442, + "learning_rate": 4.717218473530341e-05, + "loss": 0.0526, + "num_input_tokens_seen": 51922592, + "step": 42665 + }, + { + "epoch": 4.7521995767902885, + "grad_norm": 0.3767458200454712, + "learning_rate": 4.7171062126547646e-05, + "loss": 0.0866, + "num_input_tokens_seen": 51928160, + "step": 42670 + }, + { + "epoch": 4.752756431673906, + "grad_norm": 0.003091160673648119, + "learning_rate": 4.716993930836845e-05, + "loss": 0.0292, + "num_input_tokens_seen": 51934304, + "step": 42675 + }, + { + "epoch": 4.753313286557523, + "grad_norm": 0.3832166790962219, + "learning_rate": 4.7168816280776404e-05, + "loss": 0.1293, + "num_input_tokens_seen": 51940352, + "step": 42680 + }, + { + "epoch": 4.753870141441141, + "grad_norm": 0.38400596380233765, + "learning_rate": 4.716769304378214e-05, + "loss": 0.0778, + "num_input_tokens_seen": 51946400, + "step": 42685 + }, + { + "epoch": 4.754426996324757, + "grad_norm": 0.7494568228721619, + "learning_rate": 4.7166569597396236e-05, + "loss": 0.0411, + "num_input_tokens_seen": 51952480, + "step": 42690 + }, + { + "epoch": 4.754983851208375, + "grad_norm": 0.3741286098957062, + "learning_rate": 4.716544594162933e-05, + "loss": 0.059, + "num_input_tokens_seen": 51958400, + "step": 42695 + }, + { + "epoch": 4.755540706091993, + "grad_norm": 0.6046894192695618, + "learning_rate": 4.716432207649203e-05, + "loss": 0.0924, + "num_input_tokens_seen": 51964640, + "step": 42700 + }, + { + "epoch": 4.7560975609756095, + "grad_norm": 1.0812677145004272, + "learning_rate": 4.716319800199495e-05, + "loss": 0.1041, + "num_input_tokens_seen": 51970304, + "step": 42705 + }, + { + "epoch": 4.756654415859227, + "grad_norm": 0.28614434599876404, + "learning_rate": 4.716207371814871e-05, + "loss": 0.1133, + "num_input_tokens_seen": 51976128, + "step": 42710 + }, + { + "epoch": 4.757211270742845, + "grad_norm": 0.013406947255134583, + "learning_rate": 4.7160949224963926e-05, + "loss": 0.0527, + "num_input_tokens_seen": 51982560, + "step": 42715 + }, + { + "epoch": 4.757768125626462, + "grad_norm": 0.6469846963882446, + "learning_rate": 4.7159824522451224e-05, + "loss": 0.0283, + "num_input_tokens_seen": 51988672, + "step": 42720 + }, + { + "epoch": 4.758324980510079, + "grad_norm": 2.1268177032470703, + "learning_rate": 4.7158699610621224e-05, + "loss": 0.1148, + "num_input_tokens_seen": 51994464, + "step": 42725 + }, + { + "epoch": 4.758881835393696, + "grad_norm": 0.12540192902088165, + "learning_rate": 4.7157574489484544e-05, + "loss": 0.0262, + "num_input_tokens_seen": 52000768, + "step": 42730 + }, + { + "epoch": 4.759438690277314, + "grad_norm": 0.1460745930671692, + "learning_rate": 4.715644915905183e-05, + "loss": 0.1519, + "num_input_tokens_seen": 52007168, + "step": 42735 + }, + { + "epoch": 4.7599955451609315, + "grad_norm": 1.880521297454834, + "learning_rate": 4.71553236193337e-05, + "loss": 0.0629, + "num_input_tokens_seen": 52013184, + "step": 42740 + }, + { + "epoch": 4.760552400044548, + "grad_norm": 0.33328330516815186, + "learning_rate": 4.715419787034079e-05, + "loss": 0.0398, + "num_input_tokens_seen": 52019264, + "step": 42745 + }, + { + "epoch": 4.761109254928166, + "grad_norm": 0.19802726805210114, + "learning_rate": 4.715307191208374e-05, + "loss": 0.0174, + "num_input_tokens_seen": 52025536, + "step": 42750 + }, + { + "epoch": 4.761666109811783, + "grad_norm": 0.007331524044275284, + "learning_rate": 4.715194574457315e-05, + "loss": 0.0583, + "num_input_tokens_seen": 52031712, + "step": 42755 + }, + { + "epoch": 4.7622229646954, + "grad_norm": 0.12074816972017288, + "learning_rate": 4.715081936781971e-05, + "loss": 0.0366, + "num_input_tokens_seen": 52037824, + "step": 42760 + }, + { + "epoch": 4.762779819579018, + "grad_norm": 0.00031685997964814305, + "learning_rate": 4.714969278183403e-05, + "loss": 0.0445, + "num_input_tokens_seen": 52043840, + "step": 42765 + }, + { + "epoch": 4.763336674462635, + "grad_norm": 0.40880441665649414, + "learning_rate": 4.7148565986626744e-05, + "loss": 0.118, + "num_input_tokens_seen": 52049984, + "step": 42770 + }, + { + "epoch": 4.7638935293462525, + "grad_norm": 0.0012988990638405085, + "learning_rate": 4.7147438982208515e-05, + "loss": 0.037, + "num_input_tokens_seen": 52056128, + "step": 42775 + }, + { + "epoch": 4.764450384229869, + "grad_norm": 0.9594675898551941, + "learning_rate": 4.714631176858998e-05, + "loss": 0.0707, + "num_input_tokens_seen": 52061760, + "step": 42780 + }, + { + "epoch": 4.765007239113487, + "grad_norm": 0.015977974981069565, + "learning_rate": 4.714518434578179e-05, + "loss": 0.0574, + "num_input_tokens_seen": 52067840, + "step": 42785 + }, + { + "epoch": 4.765564093997105, + "grad_norm": 0.9328028559684753, + "learning_rate": 4.7144056713794584e-05, + "loss": 0.1424, + "num_input_tokens_seen": 52073920, + "step": 42790 + }, + { + "epoch": 4.7661209488807215, + "grad_norm": 0.16246764361858368, + "learning_rate": 4.7142928872639026e-05, + "loss": 0.0383, + "num_input_tokens_seen": 52080160, + "step": 42795 + }, + { + "epoch": 4.766677803764339, + "grad_norm": 1.0734386444091797, + "learning_rate": 4.7141800822325765e-05, + "loss": 0.133, + "num_input_tokens_seen": 52086176, + "step": 42800 + }, + { + "epoch": 4.767234658647956, + "grad_norm": 0.014380447566509247, + "learning_rate": 4.714067256286545e-05, + "loss": 0.0592, + "num_input_tokens_seen": 52092320, + "step": 42805 + }, + { + "epoch": 4.767791513531574, + "grad_norm": 0.602885365486145, + "learning_rate": 4.713954409426875e-05, + "loss": 0.0914, + "num_input_tokens_seen": 52098560, + "step": 42810 + }, + { + "epoch": 4.768348368415191, + "grad_norm": 0.62701016664505, + "learning_rate": 4.7138415416546324e-05, + "loss": 0.0898, + "num_input_tokens_seen": 52104864, + "step": 42815 + }, + { + "epoch": 4.768905223298808, + "grad_norm": 0.03174924477934837, + "learning_rate": 4.713728652970881e-05, + "loss": 0.1416, + "num_input_tokens_seen": 52111168, + "step": 42820 + }, + { + "epoch": 4.769462078182426, + "grad_norm": 0.001819481374695897, + "learning_rate": 4.71361574337669e-05, + "loss": 0.0897, + "num_input_tokens_seen": 52117376, + "step": 42825 + }, + { + "epoch": 4.7700189330660425, + "grad_norm": 0.6235867142677307, + "learning_rate": 4.7135028128731246e-05, + "loss": 0.023, + "num_input_tokens_seen": 52123616, + "step": 42830 + }, + { + "epoch": 4.77057578794966, + "grad_norm": 0.06409338116645813, + "learning_rate": 4.7133898614612515e-05, + "loss": 0.0792, + "num_input_tokens_seen": 52129952, + "step": 42835 + }, + { + "epoch": 4.771132642833278, + "grad_norm": 0.3849378228187561, + "learning_rate": 4.7132768891421387e-05, + "loss": 0.1452, + "num_input_tokens_seen": 52136064, + "step": 42840 + }, + { + "epoch": 4.771689497716895, + "grad_norm": 0.621201753616333, + "learning_rate": 4.7131638959168514e-05, + "loss": 0.0696, + "num_input_tokens_seen": 52142400, + "step": 42845 + }, + { + "epoch": 4.772246352600512, + "grad_norm": 0.01133787538856268, + "learning_rate": 4.713050881786458e-05, + "loss": 0.0069, + "num_input_tokens_seen": 52148736, + "step": 42850 + }, + { + "epoch": 4.77280320748413, + "grad_norm": 0.15364976227283478, + "learning_rate": 4.7129378467520265e-05, + "loss": 0.0334, + "num_input_tokens_seen": 52154688, + "step": 42855 + }, + { + "epoch": 4.773360062367747, + "grad_norm": 0.001299224910326302, + "learning_rate": 4.712824790814624e-05, + "loss": 0.0307, + "num_input_tokens_seen": 52161280, + "step": 42860 + }, + { + "epoch": 4.7739169172513645, + "grad_norm": 0.4719114601612091, + "learning_rate": 4.712711713975318e-05, + "loss": 0.0331, + "num_input_tokens_seen": 52167296, + "step": 42865 + }, + { + "epoch": 4.774473772134981, + "grad_norm": 0.9811342358589172, + "learning_rate": 4.712598616235178e-05, + "loss": 0.1032, + "num_input_tokens_seen": 52173536, + "step": 42870 + }, + { + "epoch": 4.775030627018599, + "grad_norm": 0.05963989719748497, + "learning_rate": 4.71248549759527e-05, + "loss": 0.0704, + "num_input_tokens_seen": 52179328, + "step": 42875 + }, + { + "epoch": 4.775587481902217, + "grad_norm": 0.09044302999973297, + "learning_rate": 4.712372358056665e-05, + "loss": 0.0372, + "num_input_tokens_seen": 52185376, + "step": 42880 + }, + { + "epoch": 4.776144336785833, + "grad_norm": 0.17730776965618134, + "learning_rate": 4.7122591976204297e-05, + "loss": 0.121, + "num_input_tokens_seen": 52191712, + "step": 42885 + }, + { + "epoch": 4.776701191669451, + "grad_norm": 0.04049370810389519, + "learning_rate": 4.712146016287634e-05, + "loss": 0.0325, + "num_input_tokens_seen": 52197920, + "step": 42890 + }, + { + "epoch": 4.777258046553069, + "grad_norm": 0.010814614593982697, + "learning_rate": 4.712032814059347e-05, + "loss": 0.0287, + "num_input_tokens_seen": 52204320, + "step": 42895 + }, + { + "epoch": 4.7778149014366855, + "grad_norm": 0.6644939184188843, + "learning_rate": 4.711919590936638e-05, + "loss": 0.0677, + "num_input_tokens_seen": 52210368, + "step": 42900 + }, + { + "epoch": 4.778371756320303, + "grad_norm": 0.2661958336830139, + "learning_rate": 4.711806346920577e-05, + "loss": 0.0836, + "num_input_tokens_seen": 52216384, + "step": 42905 + }, + { + "epoch": 4.77892861120392, + "grad_norm": 0.027510682120919228, + "learning_rate": 4.7116930820122316e-05, + "loss": 0.0989, + "num_input_tokens_seen": 52222240, + "step": 42910 + }, + { + "epoch": 4.779485466087538, + "grad_norm": 0.10299558192491531, + "learning_rate": 4.711579796212673e-05, + "loss": 0.0368, + "num_input_tokens_seen": 52228096, + "step": 42915 + }, + { + "epoch": 4.780042320971155, + "grad_norm": 0.423646479845047, + "learning_rate": 4.7114664895229725e-05, + "loss": 0.1271, + "num_input_tokens_seen": 52234336, + "step": 42920 + }, + { + "epoch": 4.780599175854772, + "grad_norm": 1.059563398361206, + "learning_rate": 4.7113531619441984e-05, + "loss": 0.0774, + "num_input_tokens_seen": 52240704, + "step": 42925 + }, + { + "epoch": 4.78115603073839, + "grad_norm": 0.3830001950263977, + "learning_rate": 4.7112398134774225e-05, + "loss": 0.1778, + "num_input_tokens_seen": 52246848, + "step": 42930 + }, + { + "epoch": 4.781712885622007, + "grad_norm": 1.0761326551437378, + "learning_rate": 4.711126444123715e-05, + "loss": 0.0403, + "num_input_tokens_seen": 52252896, + "step": 42935 + }, + { + "epoch": 4.782269740505624, + "grad_norm": 1.3908051252365112, + "learning_rate": 4.711013053884146e-05, + "loss": 0.0456, + "num_input_tokens_seen": 52258976, + "step": 42940 + }, + { + "epoch": 4.782826595389242, + "grad_norm": 0.6198556423187256, + "learning_rate": 4.710899642759788e-05, + "loss": 0.0946, + "num_input_tokens_seen": 52265184, + "step": 42945 + }, + { + "epoch": 4.783383450272859, + "grad_norm": 0.1997133493423462, + "learning_rate": 4.710786210751711e-05, + "loss": 0.012, + "num_input_tokens_seen": 52271552, + "step": 42950 + }, + { + "epoch": 4.783940305156476, + "grad_norm": 0.4691590964794159, + "learning_rate": 4.7106727578609875e-05, + "loss": 0.0237, + "num_input_tokens_seen": 52277888, + "step": 42955 + }, + { + "epoch": 4.784497160040093, + "grad_norm": 0.8245020508766174, + "learning_rate": 4.710559284088688e-05, + "loss": 0.1115, + "num_input_tokens_seen": 52284192, + "step": 42960 + }, + { + "epoch": 4.785054014923711, + "grad_norm": 1.030893087387085, + "learning_rate": 4.710445789435886e-05, + "loss": 0.0641, + "num_input_tokens_seen": 52290432, + "step": 42965 + }, + { + "epoch": 4.7856108698073285, + "grad_norm": 0.03229692578315735, + "learning_rate": 4.710332273903652e-05, + "loss": 0.1412, + "num_input_tokens_seen": 52296544, + "step": 42970 + }, + { + "epoch": 4.786167724690945, + "grad_norm": 1.6745644807815552, + "learning_rate": 4.7102187374930585e-05, + "loss": 0.128, + "num_input_tokens_seen": 52301824, + "step": 42975 + }, + { + "epoch": 4.786724579574563, + "grad_norm": 0.07608231157064438, + "learning_rate": 4.710105180205178e-05, + "loss": 0.0478, + "num_input_tokens_seen": 52308000, + "step": 42980 + }, + { + "epoch": 4.78728143445818, + "grad_norm": 1.872107982635498, + "learning_rate": 4.709991602041084e-05, + "loss": 0.0822, + "num_input_tokens_seen": 52314112, + "step": 42985 + }, + { + "epoch": 4.7878382893417974, + "grad_norm": 0.3968322277069092, + "learning_rate": 4.7098780030018484e-05, + "loss": 0.0325, + "num_input_tokens_seen": 52319808, + "step": 42990 + }, + { + "epoch": 4.788395144225415, + "grad_norm": 0.022329580038785934, + "learning_rate": 4.709764383088545e-05, + "loss": 0.0487, + "num_input_tokens_seen": 52326144, + "step": 42995 + }, + { + "epoch": 4.788951999109032, + "grad_norm": 0.009998020716011524, + "learning_rate": 4.7096507423022455e-05, + "loss": 0.0497, + "num_input_tokens_seen": 52332384, + "step": 43000 + }, + { + "epoch": 4.78950885399265, + "grad_norm": 0.2954805791378021, + "learning_rate": 4.7095370806440256e-05, + "loss": 0.0502, + "num_input_tokens_seen": 52338560, + "step": 43005 + }, + { + "epoch": 4.790065708876266, + "grad_norm": 0.041958894580602646, + "learning_rate": 4.709423398114957e-05, + "loss": 0.0255, + "num_input_tokens_seen": 52344992, + "step": 43010 + }, + { + "epoch": 4.790622563759884, + "grad_norm": 2.153992176055908, + "learning_rate": 4.709309694716114e-05, + "loss": 0.1031, + "num_input_tokens_seen": 52351200, + "step": 43015 + }, + { + "epoch": 4.791179418643502, + "grad_norm": 0.3215161859989166, + "learning_rate": 4.7091959704485715e-05, + "loss": 0.0172, + "num_input_tokens_seen": 52357312, + "step": 43020 + }, + { + "epoch": 4.7917362735271185, + "grad_norm": 0.2028866857290268, + "learning_rate": 4.7090822253134034e-05, + "loss": 0.1668, + "num_input_tokens_seen": 52363296, + "step": 43025 + }, + { + "epoch": 4.792293128410736, + "grad_norm": 0.10226744413375854, + "learning_rate": 4.708968459311683e-05, + "loss": 0.0076, + "num_input_tokens_seen": 52369760, + "step": 43030 + }, + { + "epoch": 4.792849983294354, + "grad_norm": 0.5537917017936707, + "learning_rate": 4.708854672444486e-05, + "loss": 0.0621, + "num_input_tokens_seen": 52375232, + "step": 43035 + }, + { + "epoch": 4.793406838177971, + "grad_norm": 0.09349673241376877, + "learning_rate": 4.708740864712887e-05, + "loss": 0.0836, + "num_input_tokens_seen": 52381184, + "step": 43040 + }, + { + "epoch": 4.793963693061588, + "grad_norm": 0.20883454382419586, + "learning_rate": 4.708627036117961e-05, + "loss": 0.156, + "num_input_tokens_seen": 52387200, + "step": 43045 + }, + { + "epoch": 4.794520547945205, + "grad_norm": 0.9773097634315491, + "learning_rate": 4.7085131866607823e-05, + "loss": 0.0955, + "num_input_tokens_seen": 52393728, + "step": 43050 + }, + { + "epoch": 4.795077402828823, + "grad_norm": 0.0145083824172616, + "learning_rate": 4.708399316342428e-05, + "loss": 0.0171, + "num_input_tokens_seen": 52400000, + "step": 43055 + }, + { + "epoch": 4.7956342577124405, + "grad_norm": 0.030419377610087395, + "learning_rate": 4.708285425163973e-05, + "loss": 0.0722, + "num_input_tokens_seen": 52406048, + "step": 43060 + }, + { + "epoch": 4.796191112596057, + "grad_norm": 0.01160106249153614, + "learning_rate": 4.708171513126492e-05, + "loss": 0.0324, + "num_input_tokens_seen": 52412192, + "step": 43065 + }, + { + "epoch": 4.796747967479675, + "grad_norm": 0.4454638659954071, + "learning_rate": 4.708057580231062e-05, + "loss": 0.0391, + "num_input_tokens_seen": 52418592, + "step": 43070 + }, + { + "epoch": 4.797304822363293, + "grad_norm": 0.039451438933610916, + "learning_rate": 4.70794362647876e-05, + "loss": 0.1014, + "num_input_tokens_seen": 52424000, + "step": 43075 + }, + { + "epoch": 4.797861677246909, + "grad_norm": 0.0019103419035673141, + "learning_rate": 4.707829651870661e-05, + "loss": 0.0209, + "num_input_tokens_seen": 52430336, + "step": 43080 + }, + { + "epoch": 4.798418532130527, + "grad_norm": 0.813538134098053, + "learning_rate": 4.707715656407842e-05, + "loss": 0.0879, + "num_input_tokens_seen": 52436032, + "step": 43085 + }, + { + "epoch": 4.798975387014144, + "grad_norm": 2.6400864124298096, + "learning_rate": 4.70760164009138e-05, + "loss": 0.1742, + "num_input_tokens_seen": 52442240, + "step": 43090 + }, + { + "epoch": 4.7995322418977615, + "grad_norm": 0.7938064932823181, + "learning_rate": 4.707487602922351e-05, + "loss": 0.0389, + "num_input_tokens_seen": 52448512, + "step": 43095 + }, + { + "epoch": 4.800089096781379, + "grad_norm": 0.5631454586982727, + "learning_rate": 4.7073735449018344e-05, + "loss": 0.0277, + "num_input_tokens_seen": 52454336, + "step": 43100 + }, + { + "epoch": 4.800645951664996, + "grad_norm": 0.44697120785713196, + "learning_rate": 4.707259466030905e-05, + "loss": 0.066, + "num_input_tokens_seen": 52460768, + "step": 43105 + }, + { + "epoch": 4.801202806548614, + "grad_norm": 0.6931934952735901, + "learning_rate": 4.707145366310642e-05, + "loss": 0.0807, + "num_input_tokens_seen": 52466944, + "step": 43110 + }, + { + "epoch": 4.80175966143223, + "grad_norm": 0.00609402172267437, + "learning_rate": 4.7070312457421226e-05, + "loss": 0.1435, + "num_input_tokens_seen": 52473120, + "step": 43115 + }, + { + "epoch": 4.802316516315848, + "grad_norm": 0.48075321316719055, + "learning_rate": 4.706917104326425e-05, + "loss": 0.0716, + "num_input_tokens_seen": 52479328, + "step": 43120 + }, + { + "epoch": 4.802873371199466, + "grad_norm": 0.010990140028297901, + "learning_rate": 4.706802942064626e-05, + "loss": 0.0332, + "num_input_tokens_seen": 52485408, + "step": 43125 + }, + { + "epoch": 4.803430226083083, + "grad_norm": 0.18740352988243103, + "learning_rate": 4.706688758957807e-05, + "loss": 0.1047, + "num_input_tokens_seen": 52490976, + "step": 43130 + }, + { + "epoch": 4.8039870809667, + "grad_norm": 0.14622633159160614, + "learning_rate": 4.706574555007044e-05, + "loss": 0.0078, + "num_input_tokens_seen": 52497184, + "step": 43135 + }, + { + "epoch": 4.804543935850317, + "grad_norm": 0.023147836327552795, + "learning_rate": 4.706460330213416e-05, + "loss": 0.1269, + "num_input_tokens_seen": 52503104, + "step": 43140 + }, + { + "epoch": 4.805100790733935, + "grad_norm": 0.010685148648917675, + "learning_rate": 4.706346084578003e-05, + "loss": 0.126, + "num_input_tokens_seen": 52508992, + "step": 43145 + }, + { + "epoch": 4.805657645617552, + "grad_norm": 0.16142134368419647, + "learning_rate": 4.706231818101883e-05, + "loss": 0.0338, + "num_input_tokens_seen": 52515104, + "step": 43150 + }, + { + "epoch": 4.806214500501169, + "grad_norm": 0.2773064374923706, + "learning_rate": 4.706117530786136e-05, + "loss": 0.1128, + "num_input_tokens_seen": 52521312, + "step": 43155 + }, + { + "epoch": 4.806771355384787, + "grad_norm": 0.0013125697150826454, + "learning_rate": 4.7060032226318416e-05, + "loss": 0.039, + "num_input_tokens_seen": 52527840, + "step": 43160 + }, + { + "epoch": 4.807328210268404, + "grad_norm": 0.021542660892009735, + "learning_rate": 4.705888893640079e-05, + "loss": 0.0042, + "num_input_tokens_seen": 52534432, + "step": 43165 + }, + { + "epoch": 4.807885065152021, + "grad_norm": 0.2636956572532654, + "learning_rate": 4.705774543811929e-05, + "loss": 0.0272, + "num_input_tokens_seen": 52540608, + "step": 43170 + }, + { + "epoch": 4.808441920035639, + "grad_norm": 0.25501129031181335, + "learning_rate": 4.7056601731484706e-05, + "loss": 0.0357, + "num_input_tokens_seen": 52546624, + "step": 43175 + }, + { + "epoch": 4.808998774919256, + "grad_norm": 0.0016166575951501727, + "learning_rate": 4.705545781650785e-05, + "loss": 0.0695, + "num_input_tokens_seen": 52553056, + "step": 43180 + }, + { + "epoch": 4.809555629802873, + "grad_norm": 0.20462259650230408, + "learning_rate": 4.705431369319953e-05, + "loss": 0.0585, + "num_input_tokens_seen": 52558976, + "step": 43185 + }, + { + "epoch": 4.81011248468649, + "grad_norm": 0.20725421607494354, + "learning_rate": 4.705316936157054e-05, + "loss": 0.1028, + "num_input_tokens_seen": 52565152, + "step": 43190 + }, + { + "epoch": 4.810669339570108, + "grad_norm": 0.9547629952430725, + "learning_rate": 4.7052024821631705e-05, + "loss": 0.2455, + "num_input_tokens_seen": 52570752, + "step": 43195 + }, + { + "epoch": 4.811226194453726, + "grad_norm": 0.7033770084381104, + "learning_rate": 4.705088007339382e-05, + "loss": 0.0704, + "num_input_tokens_seen": 52576864, + "step": 43200 + }, + { + "epoch": 4.811783049337342, + "grad_norm": 0.12339918315410614, + "learning_rate": 4.704973511686771e-05, + "loss": 0.1361, + "num_input_tokens_seen": 52583040, + "step": 43205 + }, + { + "epoch": 4.81233990422096, + "grad_norm": 2.022308349609375, + "learning_rate": 4.7048589952064184e-05, + "loss": 0.1108, + "num_input_tokens_seen": 52588448, + "step": 43210 + }, + { + "epoch": 4.812896759104578, + "grad_norm": 0.6410446763038635, + "learning_rate": 4.704744457899406e-05, + "loss": 0.0684, + "num_input_tokens_seen": 52594304, + "step": 43215 + }, + { + "epoch": 4.8134536139881945, + "grad_norm": 0.42599180340766907, + "learning_rate": 4.704629899766816e-05, + "loss": 0.0394, + "num_input_tokens_seen": 52599488, + "step": 43220 + }, + { + "epoch": 4.814010468871812, + "grad_norm": 0.16169355809688568, + "learning_rate": 4.704515320809729e-05, + "loss": 0.0385, + "num_input_tokens_seen": 52605632, + "step": 43225 + }, + { + "epoch": 4.814567323755429, + "grad_norm": 1.7611135244369507, + "learning_rate": 4.70440072102923e-05, + "loss": 0.1387, + "num_input_tokens_seen": 52611840, + "step": 43230 + }, + { + "epoch": 4.815124178639047, + "grad_norm": 2.040173292160034, + "learning_rate": 4.7042861004264e-05, + "loss": 0.0787, + "num_input_tokens_seen": 52618016, + "step": 43235 + }, + { + "epoch": 4.815681033522664, + "grad_norm": 0.3786611258983612, + "learning_rate": 4.7041714590023214e-05, + "loss": 0.1135, + "num_input_tokens_seen": 52623936, + "step": 43240 + }, + { + "epoch": 4.816237888406281, + "grad_norm": 1.5381336212158203, + "learning_rate": 4.7040567967580773e-05, + "loss": 0.1573, + "num_input_tokens_seen": 52629984, + "step": 43245 + }, + { + "epoch": 4.816794743289899, + "grad_norm": 0.33565956354141235, + "learning_rate": 4.7039421136947514e-05, + "loss": 0.063, + "num_input_tokens_seen": 52636096, + "step": 43250 + }, + { + "epoch": 4.817351598173516, + "grad_norm": 0.005686540622264147, + "learning_rate": 4.703827409813426e-05, + "loss": 0.0278, + "num_input_tokens_seen": 52642336, + "step": 43255 + }, + { + "epoch": 4.817908453057133, + "grad_norm": 0.020607853308320045, + "learning_rate": 4.7037126851151853e-05, + "loss": 0.0993, + "num_input_tokens_seen": 52648608, + "step": 43260 + }, + { + "epoch": 4.818465307940751, + "grad_norm": 0.015259907580912113, + "learning_rate": 4.703597939601113e-05, + "loss": 0.0021, + "num_input_tokens_seen": 52654816, + "step": 43265 + }, + { + "epoch": 4.819022162824368, + "grad_norm": 0.12348227947950363, + "learning_rate": 4.7034831732722914e-05, + "loss": 0.0305, + "num_input_tokens_seen": 52661056, + "step": 43270 + }, + { + "epoch": 4.819579017707985, + "grad_norm": 1.3912705183029175, + "learning_rate": 4.703368386129807e-05, + "loss": 0.1798, + "num_input_tokens_seen": 52666848, + "step": 43275 + }, + { + "epoch": 4.820135872591603, + "grad_norm": 1.6310555934906006, + "learning_rate": 4.7032535781747425e-05, + "loss": 0.0908, + "num_input_tokens_seen": 52672576, + "step": 43280 + }, + { + "epoch": 4.82069272747522, + "grad_norm": 0.40662682056427, + "learning_rate": 4.703138749408183e-05, + "loss": 0.04, + "num_input_tokens_seen": 52678752, + "step": 43285 + }, + { + "epoch": 4.8212495823588375, + "grad_norm": 0.29123979806900024, + "learning_rate": 4.703023899831212e-05, + "loss": 0.1353, + "num_input_tokens_seen": 52684608, + "step": 43290 + }, + { + "epoch": 4.821806437242454, + "grad_norm": 0.5244970321655273, + "learning_rate": 4.7029090294449164e-05, + "loss": 0.0502, + "num_input_tokens_seen": 52690720, + "step": 43295 + }, + { + "epoch": 4.822363292126072, + "grad_norm": 1.2338823080062866, + "learning_rate": 4.70279413825038e-05, + "loss": 0.2906, + "num_input_tokens_seen": 52696736, + "step": 43300 + }, + { + "epoch": 4.82292014700969, + "grad_norm": 0.15382930636405945, + "learning_rate": 4.702679226248688e-05, + "loss": 0.1446, + "num_input_tokens_seen": 52702976, + "step": 43305 + }, + { + "epoch": 4.823477001893306, + "grad_norm": 0.9583849310874939, + "learning_rate": 4.7025642934409255e-05, + "loss": 0.1203, + "num_input_tokens_seen": 52709472, + "step": 43310 + }, + { + "epoch": 4.824033856776924, + "grad_norm": 0.16993409395217896, + "learning_rate": 4.702449339828178e-05, + "loss": 0.0844, + "num_input_tokens_seen": 52715584, + "step": 43315 + }, + { + "epoch": 4.824590711660541, + "grad_norm": 0.02036176063120365, + "learning_rate": 4.702334365411533e-05, + "loss": 0.0403, + "num_input_tokens_seen": 52721696, + "step": 43320 + }, + { + "epoch": 4.8251475665441586, + "grad_norm": 1.3836489915847778, + "learning_rate": 4.702219370192075e-05, + "loss": 0.1788, + "num_input_tokens_seen": 52727552, + "step": 43325 + }, + { + "epoch": 4.825704421427776, + "grad_norm": 0.2366006076335907, + "learning_rate": 4.702104354170891e-05, + "loss": 0.196, + "num_input_tokens_seen": 52733472, + "step": 43330 + }, + { + "epoch": 4.826261276311393, + "grad_norm": 1.755947470664978, + "learning_rate": 4.701989317349067e-05, + "loss": 0.1469, + "num_input_tokens_seen": 52739456, + "step": 43335 + }, + { + "epoch": 4.826818131195011, + "grad_norm": 0.8426355123519897, + "learning_rate": 4.701874259727689e-05, + "loss": 0.026, + "num_input_tokens_seen": 52745568, + "step": 43340 + }, + { + "epoch": 4.8273749860786275, + "grad_norm": 0.16914093494415283, + "learning_rate": 4.7017591813078457e-05, + "loss": 0.0219, + "num_input_tokens_seen": 52751840, + "step": 43345 + }, + { + "epoch": 4.827931840962245, + "grad_norm": 0.8706310987472534, + "learning_rate": 4.701644082090622e-05, + "loss": 0.0853, + "num_input_tokens_seen": 52758240, + "step": 43350 + }, + { + "epoch": 4.828488695845863, + "grad_norm": 0.004307956900447607, + "learning_rate": 4.701528962077106e-05, + "loss": 0.0764, + "num_input_tokens_seen": 52764032, + "step": 43355 + }, + { + "epoch": 4.82904555072948, + "grad_norm": 2.0318150520324707, + "learning_rate": 4.701413821268386e-05, + "loss": 0.1395, + "num_input_tokens_seen": 52770208, + "step": 43360 + }, + { + "epoch": 4.829602405613097, + "grad_norm": 1.0319862365722656, + "learning_rate": 4.701298659665547e-05, + "loss": 0.0695, + "num_input_tokens_seen": 52776096, + "step": 43365 + }, + { + "epoch": 4.830159260496714, + "grad_norm": 0.42352405190467834, + "learning_rate": 4.70118347726968e-05, + "loss": 0.1121, + "num_input_tokens_seen": 52782336, + "step": 43370 + }, + { + "epoch": 4.830716115380332, + "grad_norm": 0.670997679233551, + "learning_rate": 4.701068274081871e-05, + "loss": 0.0545, + "num_input_tokens_seen": 52788480, + "step": 43375 + }, + { + "epoch": 4.831272970263949, + "grad_norm": 0.11907680332660675, + "learning_rate": 4.700953050103209e-05, + "loss": 0.1764, + "num_input_tokens_seen": 52794432, + "step": 43380 + }, + { + "epoch": 4.831829825147566, + "grad_norm": 0.273844450712204, + "learning_rate": 4.7008378053347824e-05, + "loss": 0.0259, + "num_input_tokens_seen": 52800608, + "step": 43385 + }, + { + "epoch": 4.832386680031184, + "grad_norm": 0.167039155960083, + "learning_rate": 4.700722539777679e-05, + "loss": 0.0482, + "num_input_tokens_seen": 52806016, + "step": 43390 + }, + { + "epoch": 4.8329435349148016, + "grad_norm": 1.5322551727294922, + "learning_rate": 4.700607253432988e-05, + "loss": 0.0826, + "num_input_tokens_seen": 52812096, + "step": 43395 + }, + { + "epoch": 4.833500389798418, + "grad_norm": 0.12604306638240814, + "learning_rate": 4.700491946301798e-05, + "loss": 0.0314, + "num_input_tokens_seen": 52818176, + "step": 43400 + }, + { + "epoch": 4.834057244682036, + "grad_norm": 0.2718127965927124, + "learning_rate": 4.700376618385198e-05, + "loss": 0.0975, + "num_input_tokens_seen": 52824192, + "step": 43405 + }, + { + "epoch": 4.834614099565654, + "grad_norm": 1.257484793663025, + "learning_rate": 4.7002612696842793e-05, + "loss": 0.183, + "num_input_tokens_seen": 52830336, + "step": 43410 + }, + { + "epoch": 4.8351709544492705, + "grad_norm": 0.1622556895017624, + "learning_rate": 4.70014590020013e-05, + "loss": 0.0577, + "num_input_tokens_seen": 52836384, + "step": 43415 + }, + { + "epoch": 4.835727809332888, + "grad_norm": 0.0016712920041754842, + "learning_rate": 4.7000305099338396e-05, + "loss": 0.0049, + "num_input_tokens_seen": 52842432, + "step": 43420 + }, + { + "epoch": 4.836284664216505, + "grad_norm": 0.3049187660217285, + "learning_rate": 4.699915098886498e-05, + "loss": 0.0289, + "num_input_tokens_seen": 52848608, + "step": 43425 + }, + { + "epoch": 4.836841519100123, + "grad_norm": 2.297459602355957, + "learning_rate": 4.699799667059196e-05, + "loss": 0.1144, + "num_input_tokens_seen": 52854336, + "step": 43430 + }, + { + "epoch": 4.83739837398374, + "grad_norm": 1.085774302482605, + "learning_rate": 4.699684214453024e-05, + "loss": 0.0822, + "num_input_tokens_seen": 52860704, + "step": 43435 + }, + { + "epoch": 4.837955228867357, + "grad_norm": 0.8361029028892517, + "learning_rate": 4.699568741069072e-05, + "loss": 0.117, + "num_input_tokens_seen": 52866752, + "step": 43440 + }, + { + "epoch": 4.838512083750975, + "grad_norm": 0.4389779567718506, + "learning_rate": 4.6994532469084305e-05, + "loss": 0.0806, + "num_input_tokens_seen": 52873216, + "step": 43445 + }, + { + "epoch": 4.8390689386345915, + "grad_norm": 0.5527767539024353, + "learning_rate": 4.6993377319721924e-05, + "loss": 0.0686, + "num_input_tokens_seen": 52879104, + "step": 43450 + }, + { + "epoch": 4.839625793518209, + "grad_norm": 0.46491366624832153, + "learning_rate": 4.699222196261446e-05, + "loss": 0.0429, + "num_input_tokens_seen": 52885472, + "step": 43455 + }, + { + "epoch": 4.840182648401827, + "grad_norm": 0.23381932079792023, + "learning_rate": 4.6991066397772844e-05, + "loss": 0.0101, + "num_input_tokens_seen": 52891616, + "step": 43460 + }, + { + "epoch": 4.840739503285444, + "grad_norm": 0.151760071516037, + "learning_rate": 4.6989910625207984e-05, + "loss": 0.057, + "num_input_tokens_seen": 52897472, + "step": 43465 + }, + { + "epoch": 4.841296358169061, + "grad_norm": 0.0037421435117721558, + "learning_rate": 4.69887546449308e-05, + "loss": 0.0334, + "num_input_tokens_seen": 52903872, + "step": 43470 + }, + { + "epoch": 4.841853213052678, + "grad_norm": 0.15426993370056152, + "learning_rate": 4.698759845695222e-05, + "loss": 0.0954, + "num_input_tokens_seen": 52909824, + "step": 43475 + }, + { + "epoch": 4.842410067936296, + "grad_norm": 0.3956606090068817, + "learning_rate": 4.698644206128314e-05, + "loss": 0.0688, + "num_input_tokens_seen": 52915904, + "step": 43480 + }, + { + "epoch": 4.8429669228199135, + "grad_norm": 0.12569591403007507, + "learning_rate": 4.698528545793452e-05, + "loss": 0.0417, + "num_input_tokens_seen": 52922048, + "step": 43485 + }, + { + "epoch": 4.84352377770353, + "grad_norm": 2.545382022857666, + "learning_rate": 4.6984128646917246e-05, + "loss": 0.1332, + "num_input_tokens_seen": 52928256, + "step": 43490 + }, + { + "epoch": 4.844080632587148, + "grad_norm": 0.46436071395874023, + "learning_rate": 4.698297162824227e-05, + "loss": 0.0613, + "num_input_tokens_seen": 52934368, + "step": 43495 + }, + { + "epoch": 4.844637487470765, + "grad_norm": 1.7414741516113281, + "learning_rate": 4.698181440192052e-05, + "loss": 0.2095, + "num_input_tokens_seen": 52940544, + "step": 43500 + }, + { + "epoch": 4.845194342354382, + "grad_norm": 2.034829616546631, + "learning_rate": 4.6980656967962915e-05, + "loss": 0.1159, + "num_input_tokens_seen": 52946784, + "step": 43505 + }, + { + "epoch": 4.845751197238, + "grad_norm": 0.3232804834842682, + "learning_rate": 4.697949932638039e-05, + "loss": 0.1716, + "num_input_tokens_seen": 52952960, + "step": 43510 + }, + { + "epoch": 4.846308052121617, + "grad_norm": 0.012171175330877304, + "learning_rate": 4.6978341477183894e-05, + "loss": 0.0054, + "num_input_tokens_seen": 52959072, + "step": 43515 + }, + { + "epoch": 4.8468649070052345, + "grad_norm": 0.00521768257021904, + "learning_rate": 4.697718342038435e-05, + "loss": 0.0543, + "num_input_tokens_seen": 52965312, + "step": 43520 + }, + { + "epoch": 4.847421761888851, + "grad_norm": 0.9834326505661011, + "learning_rate": 4.697602515599271e-05, + "loss": 0.1149, + "num_input_tokens_seen": 52971072, + "step": 43525 + }, + { + "epoch": 4.847978616772469, + "grad_norm": 0.1388006955385208, + "learning_rate": 4.6974866684019895e-05, + "loss": 0.0345, + "num_input_tokens_seen": 52977248, + "step": 43530 + }, + { + "epoch": 4.848535471656087, + "grad_norm": 0.0310935340821743, + "learning_rate": 4.6973708004476856e-05, + "loss": 0.0329, + "num_input_tokens_seen": 52983072, + "step": 43535 + }, + { + "epoch": 4.8490923265397035, + "grad_norm": 0.40334179997444153, + "learning_rate": 4.697254911737455e-05, + "loss": 0.031, + "num_input_tokens_seen": 52989152, + "step": 43540 + }, + { + "epoch": 4.849649181423321, + "grad_norm": 0.3966481685638428, + "learning_rate": 4.69713900227239e-05, + "loss": 0.0695, + "num_input_tokens_seen": 52995328, + "step": 43545 + }, + { + "epoch": 4.850206036306938, + "grad_norm": 0.22749967873096466, + "learning_rate": 4.6970230720535876e-05, + "loss": 0.029, + "num_input_tokens_seen": 53000736, + "step": 43550 + }, + { + "epoch": 4.850762891190556, + "grad_norm": 0.05804232880473137, + "learning_rate": 4.696907121082142e-05, + "loss": 0.0424, + "num_input_tokens_seen": 53006784, + "step": 43555 + }, + { + "epoch": 4.851319746074173, + "grad_norm": 0.731559693813324, + "learning_rate": 4.696791149359149e-05, + "loss": 0.1214, + "num_input_tokens_seen": 53012704, + "step": 43560 + }, + { + "epoch": 4.85187660095779, + "grad_norm": 0.0464947409927845, + "learning_rate": 4.696675156885703e-05, + "loss": 0.0143, + "num_input_tokens_seen": 53018720, + "step": 43565 + }, + { + "epoch": 4.852433455841408, + "grad_norm": 0.05089031532406807, + "learning_rate": 4.696559143662901e-05, + "loss": 0.013, + "num_input_tokens_seen": 53024672, + "step": 43570 + }, + { + "epoch": 4.852990310725025, + "grad_norm": 0.6139611601829529, + "learning_rate": 4.696443109691837e-05, + "loss": 0.0417, + "num_input_tokens_seen": 53030432, + "step": 43575 + }, + { + "epoch": 4.853547165608642, + "grad_norm": 1.006488561630249, + "learning_rate": 4.696327054973608e-05, + "loss": 0.09, + "num_input_tokens_seen": 53036768, + "step": 43580 + }, + { + "epoch": 4.85410402049226, + "grad_norm": 0.7701655030250549, + "learning_rate": 4.696210979509311e-05, + "loss": 0.1076, + "num_input_tokens_seen": 53042976, + "step": 43585 + }, + { + "epoch": 4.8546608753758775, + "grad_norm": 0.28223422169685364, + "learning_rate": 4.696094883300042e-05, + "loss": 0.1034, + "num_input_tokens_seen": 53049024, + "step": 43590 + }, + { + "epoch": 4.855217730259494, + "grad_norm": 1.3131436109542847, + "learning_rate": 4.695978766346896e-05, + "loss": 0.1259, + "num_input_tokens_seen": 53055424, + "step": 43595 + }, + { + "epoch": 4.855774585143112, + "grad_norm": 0.24355459213256836, + "learning_rate": 4.695862628650972e-05, + "loss": 0.0454, + "num_input_tokens_seen": 53061824, + "step": 43600 + }, + { + "epoch": 4.856331440026729, + "grad_norm": 0.7863143086433411, + "learning_rate": 4.6957464702133664e-05, + "loss": 0.1046, + "num_input_tokens_seen": 53067872, + "step": 43605 + }, + { + "epoch": 4.8568882949103465, + "grad_norm": 0.21754303574562073, + "learning_rate": 4.695630291035176e-05, + "loss": 0.1301, + "num_input_tokens_seen": 53074048, + "step": 43610 + }, + { + "epoch": 4.857445149793964, + "grad_norm": 0.4440484344959259, + "learning_rate": 4.6955140911174974e-05, + "loss": 0.0989, + "num_input_tokens_seen": 53080352, + "step": 43615 + }, + { + "epoch": 4.858002004677581, + "grad_norm": 0.0538448803126812, + "learning_rate": 4.695397870461431e-05, + "loss": 0.0632, + "num_input_tokens_seen": 53086944, + "step": 43620 + }, + { + "epoch": 4.858558859561199, + "grad_norm": 0.7053180932998657, + "learning_rate": 4.6952816290680714e-05, + "loss": 0.1164, + "num_input_tokens_seen": 53092480, + "step": 43625 + }, + { + "epoch": 4.859115714444815, + "grad_norm": 0.06831006705760956, + "learning_rate": 4.6951653669385186e-05, + "loss": 0.1009, + "num_input_tokens_seen": 53098496, + "step": 43630 + }, + { + "epoch": 4.859672569328433, + "grad_norm": 0.5783247351646423, + "learning_rate": 4.69504908407387e-05, + "loss": 0.0157, + "num_input_tokens_seen": 53104736, + "step": 43635 + }, + { + "epoch": 4.860229424212051, + "grad_norm": 0.05933959409594536, + "learning_rate": 4.694932780475224e-05, + "loss": 0.0927, + "num_input_tokens_seen": 53110976, + "step": 43640 + }, + { + "epoch": 4.8607862790956675, + "grad_norm": 0.01971895806491375, + "learning_rate": 4.6948164561436796e-05, + "loss": 0.0595, + "num_input_tokens_seen": 53117120, + "step": 43645 + }, + { + "epoch": 4.861343133979285, + "grad_norm": 0.500289797782898, + "learning_rate": 4.6947001110803354e-05, + "loss": 0.0871, + "num_input_tokens_seen": 53123232, + "step": 43650 + }, + { + "epoch": 4.861899988862902, + "grad_norm": 0.43644580245018005, + "learning_rate": 4.69458374528629e-05, + "loss": 0.1863, + "num_input_tokens_seen": 53128736, + "step": 43655 + }, + { + "epoch": 4.86245684374652, + "grad_norm": 1.1253029108047485, + "learning_rate": 4.694467358762643e-05, + "loss": 0.1839, + "num_input_tokens_seen": 53134976, + "step": 43660 + }, + { + "epoch": 4.863013698630137, + "grad_norm": 0.0026342258788645267, + "learning_rate": 4.694350951510493e-05, + "loss": 0.0659, + "num_input_tokens_seen": 53141184, + "step": 43665 + }, + { + "epoch": 4.863570553513754, + "grad_norm": 0.03886546939611435, + "learning_rate": 4.6942345235309415e-05, + "loss": 0.0535, + "num_input_tokens_seen": 53147168, + "step": 43670 + }, + { + "epoch": 4.864127408397372, + "grad_norm": 0.03681373968720436, + "learning_rate": 4.6941180748250856e-05, + "loss": 0.0544, + "num_input_tokens_seen": 53153440, + "step": 43675 + }, + { + "epoch": 4.864684263280989, + "grad_norm": 0.11753297597169876, + "learning_rate": 4.694001605394027e-05, + "loss": 0.097, + "num_input_tokens_seen": 53159680, + "step": 43680 + }, + { + "epoch": 4.865241118164606, + "grad_norm": 0.07538340985774994, + "learning_rate": 4.6938851152388666e-05, + "loss": 0.0625, + "num_input_tokens_seen": 53166016, + "step": 43685 + }, + { + "epoch": 4.865797973048224, + "grad_norm": 0.04196638986468315, + "learning_rate": 4.693768604360702e-05, + "loss": 0.1071, + "num_input_tokens_seen": 53172032, + "step": 43690 + }, + { + "epoch": 4.866354827931841, + "grad_norm": 0.008839701302349567, + "learning_rate": 4.693652072760636e-05, + "loss": 0.052, + "num_input_tokens_seen": 53178240, + "step": 43695 + }, + { + "epoch": 4.866911682815458, + "grad_norm": 0.16374045610427856, + "learning_rate": 4.693535520439769e-05, + "loss": 0.0141, + "num_input_tokens_seen": 53184192, + "step": 43700 + }, + { + "epoch": 4.867468537699075, + "grad_norm": 1.2105097770690918, + "learning_rate": 4.6934189473992006e-05, + "loss": 0.0357, + "num_input_tokens_seen": 53190304, + "step": 43705 + }, + { + "epoch": 4.868025392582693, + "grad_norm": 0.501092255115509, + "learning_rate": 4.693302353640033e-05, + "loss": 0.045, + "num_input_tokens_seen": 53196544, + "step": 43710 + }, + { + "epoch": 4.8685822474663105, + "grad_norm": 0.6251252889633179, + "learning_rate": 4.6931857391633685e-05, + "loss": 0.064, + "num_input_tokens_seen": 53202528, + "step": 43715 + }, + { + "epoch": 4.869139102349927, + "grad_norm": 0.9633876085281372, + "learning_rate": 4.693069103970307e-05, + "loss": 0.0652, + "num_input_tokens_seen": 53208864, + "step": 43720 + }, + { + "epoch": 4.869695957233545, + "grad_norm": 0.0008672417607158422, + "learning_rate": 4.69295244806195e-05, + "loss": 0.0448, + "num_input_tokens_seen": 53215328, + "step": 43725 + }, + { + "epoch": 4.870252812117163, + "grad_norm": 0.07632984220981598, + "learning_rate": 4.692835771439401e-05, + "loss": 0.0085, + "num_input_tokens_seen": 53221664, + "step": 43730 + }, + { + "epoch": 4.870809667000779, + "grad_norm": 1.2601304054260254, + "learning_rate": 4.6927190741037615e-05, + "loss": 0.1723, + "num_input_tokens_seen": 53226944, + "step": 43735 + }, + { + "epoch": 4.871366521884397, + "grad_norm": 0.01244655717164278, + "learning_rate": 4.692602356056133e-05, + "loss": 0.0169, + "num_input_tokens_seen": 53232960, + "step": 43740 + }, + { + "epoch": 4.871923376768014, + "grad_norm": 0.30958792567253113, + "learning_rate": 4.6924856172976184e-05, + "loss": 0.0453, + "num_input_tokens_seen": 53239136, + "step": 43745 + }, + { + "epoch": 4.872480231651632, + "grad_norm": 0.2537699043750763, + "learning_rate": 4.692368857829321e-05, + "loss": 0.0182, + "num_input_tokens_seen": 53245056, + "step": 43750 + }, + { + "epoch": 4.873037086535249, + "grad_norm": 0.49723169207572937, + "learning_rate": 4.6922520776523436e-05, + "loss": 0.0516, + "num_input_tokens_seen": 53251392, + "step": 43755 + }, + { + "epoch": 4.873593941418866, + "grad_norm": 0.015213332138955593, + "learning_rate": 4.692135276767788e-05, + "loss": 0.0288, + "num_input_tokens_seen": 53257440, + "step": 43760 + }, + { + "epoch": 4.874150796302484, + "grad_norm": 0.8171032667160034, + "learning_rate": 4.692018455176759e-05, + "loss": 0.1558, + "num_input_tokens_seen": 53263424, + "step": 43765 + }, + { + "epoch": 4.874707651186101, + "grad_norm": 0.5026096105575562, + "learning_rate": 4.69190161288036e-05, + "loss": 0.0455, + "num_input_tokens_seen": 53269696, + "step": 43770 + }, + { + "epoch": 4.875264506069718, + "grad_norm": 0.030830761417746544, + "learning_rate": 4.691784749879693e-05, + "loss": 0.0236, + "num_input_tokens_seen": 53275776, + "step": 43775 + }, + { + "epoch": 4.875821360953336, + "grad_norm": 0.04046694189310074, + "learning_rate": 4.6916678661758636e-05, + "loss": 0.1284, + "num_input_tokens_seen": 53282048, + "step": 43780 + }, + { + "epoch": 4.876378215836953, + "grad_norm": 0.3306434452533722, + "learning_rate": 4.691550961769975e-05, + "loss": 0.1509, + "num_input_tokens_seen": 53288192, + "step": 43785 + }, + { + "epoch": 4.87693507072057, + "grad_norm": 0.8880266547203064, + "learning_rate": 4.6914340366631315e-05, + "loss": 0.0379, + "num_input_tokens_seen": 53294496, + "step": 43790 + }, + { + "epoch": 4.877491925604188, + "grad_norm": 0.6352919340133667, + "learning_rate": 4.691317090856438e-05, + "loss": 0.0889, + "num_input_tokens_seen": 53300768, + "step": 43795 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 1.512143850326538, + "learning_rate": 4.6912001243509995e-05, + "loss": 0.0887, + "num_input_tokens_seen": 53306592, + "step": 43800 + }, + { + "epoch": 4.8786056353714224, + "grad_norm": 0.509440004825592, + "learning_rate": 4.6910831371479204e-05, + "loss": 0.053, + "num_input_tokens_seen": 53312416, + "step": 43805 + }, + { + "epoch": 4.879162490255039, + "grad_norm": 0.7028222680091858, + "learning_rate": 4.690966129248304e-05, + "loss": 0.0332, + "num_input_tokens_seen": 53318880, + "step": 43810 + }, + { + "epoch": 4.879719345138657, + "grad_norm": 0.08214723318815231, + "learning_rate": 4.6908491006532584e-05, + "loss": 0.0154, + "num_input_tokens_seen": 53324800, + "step": 43815 + }, + { + "epoch": 4.880276200022275, + "grad_norm": 0.055074747651815414, + "learning_rate": 4.6907320513638874e-05, + "loss": 0.0073, + "num_input_tokens_seen": 53331104, + "step": 43820 + }, + { + "epoch": 4.880833054905891, + "grad_norm": 0.19352708756923676, + "learning_rate": 4.6906149813812974e-05, + "loss": 0.0465, + "num_input_tokens_seen": 53337696, + "step": 43825 + }, + { + "epoch": 4.881389909789509, + "grad_norm": 1.2471740245819092, + "learning_rate": 4.690497890706593e-05, + "loss": 0.0436, + "num_input_tokens_seen": 53343808, + "step": 43830 + }, + { + "epoch": 4.881946764673126, + "grad_norm": 0.30574169754981995, + "learning_rate": 4.690380779340882e-05, + "loss": 0.1097, + "num_input_tokens_seen": 53350080, + "step": 43835 + }, + { + "epoch": 4.8825036195567435, + "grad_norm": 0.1639438420534134, + "learning_rate": 4.690263647285269e-05, + "loss": 0.1297, + "num_input_tokens_seen": 53355680, + "step": 43840 + }, + { + "epoch": 4.883060474440361, + "grad_norm": 0.6260196566581726, + "learning_rate": 4.6901464945408614e-05, + "loss": 0.1126, + "num_input_tokens_seen": 53361600, + "step": 43845 + }, + { + "epoch": 4.883617329323978, + "grad_norm": 0.15828992426395416, + "learning_rate": 4.6900293211087654e-05, + "loss": 0.1003, + "num_input_tokens_seen": 53367744, + "step": 43850 + }, + { + "epoch": 4.884174184207596, + "grad_norm": 0.5332304835319519, + "learning_rate": 4.689912126990088e-05, + "loss": 0.1341, + "num_input_tokens_seen": 53373792, + "step": 43855 + }, + { + "epoch": 4.884731039091212, + "grad_norm": 0.38925832509994507, + "learning_rate": 4.6897949121859355e-05, + "loss": 0.0447, + "num_input_tokens_seen": 53380000, + "step": 43860 + }, + { + "epoch": 4.88528789397483, + "grad_norm": 0.12517592310905457, + "learning_rate": 4.689677676697416e-05, + "loss": 0.0546, + "num_input_tokens_seen": 53385568, + "step": 43865 + }, + { + "epoch": 4.885844748858448, + "grad_norm": 1.8471845388412476, + "learning_rate": 4.689560420525636e-05, + "loss": 0.2177, + "num_input_tokens_seen": 53391648, + "step": 43870 + }, + { + "epoch": 4.886401603742065, + "grad_norm": 0.0023413612507283688, + "learning_rate": 4.689443143671704e-05, + "loss": 0.0144, + "num_input_tokens_seen": 53397728, + "step": 43875 + }, + { + "epoch": 4.886958458625682, + "grad_norm": 0.2758736312389374, + "learning_rate": 4.689325846136727e-05, + "loss": 0.1269, + "num_input_tokens_seen": 53403488, + "step": 43880 + }, + { + "epoch": 4.887515313509299, + "grad_norm": 1.071703553199768, + "learning_rate": 4.6892085279218147e-05, + "loss": 0.0888, + "num_input_tokens_seen": 53409440, + "step": 43885 + }, + { + "epoch": 4.888072168392917, + "grad_norm": 0.5507163405418396, + "learning_rate": 4.689091189028072e-05, + "loss": 0.1032, + "num_input_tokens_seen": 53415616, + "step": 43890 + }, + { + "epoch": 4.888629023276534, + "grad_norm": 0.040088556706905365, + "learning_rate": 4.68897382945661e-05, + "loss": 0.0908, + "num_input_tokens_seen": 53422080, + "step": 43895 + }, + { + "epoch": 4.889185878160151, + "grad_norm": 0.2131499946117401, + "learning_rate": 4.688856449208536e-05, + "loss": 0.0333, + "num_input_tokens_seen": 53428032, + "step": 43900 + }, + { + "epoch": 4.889742733043769, + "grad_norm": 2.1277174949645996, + "learning_rate": 4.68873904828496e-05, + "loss": 0.2243, + "num_input_tokens_seen": 53433952, + "step": 43905 + }, + { + "epoch": 4.8902995879273865, + "grad_norm": 0.057139430195093155, + "learning_rate": 4.6886216266869896e-05, + "loss": 0.0521, + "num_input_tokens_seen": 53439936, + "step": 43910 + }, + { + "epoch": 4.890856442811003, + "grad_norm": 0.8706321120262146, + "learning_rate": 4.6885041844157346e-05, + "loss": 0.0472, + "num_input_tokens_seen": 53446016, + "step": 43915 + }, + { + "epoch": 4.891413297694621, + "grad_norm": 0.3916419446468353, + "learning_rate": 4.6883867214723045e-05, + "loss": 0.0594, + "num_input_tokens_seen": 53451712, + "step": 43920 + }, + { + "epoch": 4.891970152578238, + "grad_norm": 0.5201829075813293, + "learning_rate": 4.688269237857807e-05, + "loss": 0.0846, + "num_input_tokens_seen": 53458080, + "step": 43925 + }, + { + "epoch": 4.892527007461855, + "grad_norm": 0.15632858872413635, + "learning_rate": 4.688151733573355e-05, + "loss": 0.0453, + "num_input_tokens_seen": 53464448, + "step": 43930 + }, + { + "epoch": 4.893083862345473, + "grad_norm": 0.29860028624534607, + "learning_rate": 4.6880342086200565e-05, + "loss": 0.0573, + "num_input_tokens_seen": 53470368, + "step": 43935 + }, + { + "epoch": 4.89364071722909, + "grad_norm": 0.32800349593162537, + "learning_rate": 4.687916662999021e-05, + "loss": 0.0864, + "num_input_tokens_seen": 53476512, + "step": 43940 + }, + { + "epoch": 4.894197572112708, + "grad_norm": 0.2594180703163147, + "learning_rate": 4.68779909671136e-05, + "loss": 0.0804, + "num_input_tokens_seen": 53482496, + "step": 43945 + }, + { + "epoch": 4.894754426996325, + "grad_norm": 0.3829355239868164, + "learning_rate": 4.687681509758185e-05, + "loss": 0.0217, + "num_input_tokens_seen": 53488576, + "step": 43950 + }, + { + "epoch": 4.895311281879942, + "grad_norm": 0.04743930697441101, + "learning_rate": 4.6875639021406036e-05, + "loss": 0.0558, + "num_input_tokens_seen": 53494432, + "step": 43955 + }, + { + "epoch": 4.89586813676356, + "grad_norm": 1.4369248151779175, + "learning_rate": 4.687446273859729e-05, + "loss": 0.1316, + "num_input_tokens_seen": 53500640, + "step": 43960 + }, + { + "epoch": 4.8964249916471765, + "grad_norm": 0.3357248604297638, + "learning_rate": 4.687328624916673e-05, + "loss": 0.147, + "num_input_tokens_seen": 53506912, + "step": 43965 + }, + { + "epoch": 4.896981846530794, + "grad_norm": 0.3471747040748596, + "learning_rate": 4.687210955312544e-05, + "loss": 0.0319, + "num_input_tokens_seen": 53513120, + "step": 43970 + }, + { + "epoch": 4.897538701414412, + "grad_norm": 1.021035075187683, + "learning_rate": 4.6870932650484555e-05, + "loss": 0.065, + "num_input_tokens_seen": 53519136, + "step": 43975 + }, + { + "epoch": 4.898095556298029, + "grad_norm": 0.0761985033750534, + "learning_rate": 4.686975554125519e-05, + "loss": 0.0772, + "num_input_tokens_seen": 53525184, + "step": 43980 + }, + { + "epoch": 4.898652411181646, + "grad_norm": 0.7338497042655945, + "learning_rate": 4.6868578225448465e-05, + "loss": 0.1335, + "num_input_tokens_seen": 53530656, + "step": 43985 + }, + { + "epoch": 4.899209266065263, + "grad_norm": 0.0697249248623848, + "learning_rate": 4.68674007030755e-05, + "loss": 0.0526, + "num_input_tokens_seen": 53537056, + "step": 43990 + }, + { + "epoch": 4.899766120948881, + "grad_norm": 0.12215625494718552, + "learning_rate": 4.6866222974147414e-05, + "loss": 0.01, + "num_input_tokens_seen": 53543072, + "step": 43995 + }, + { + "epoch": 4.900322975832498, + "grad_norm": 0.09693711251020432, + "learning_rate": 4.686504503867533e-05, + "loss": 0.1156, + "num_input_tokens_seen": 53549216, + "step": 44000 + }, + { + "epoch": 4.900879830716115, + "grad_norm": 0.7141861319541931, + "learning_rate": 4.686386689667038e-05, + "loss": 0.1036, + "num_input_tokens_seen": 53554560, + "step": 44005 + }, + { + "epoch": 4.901436685599733, + "grad_norm": 1.6511093378067017, + "learning_rate": 4.686268854814369e-05, + "loss": 0.0453, + "num_input_tokens_seen": 53560768, + "step": 44010 + }, + { + "epoch": 4.90199354048335, + "grad_norm": 0.1421407014131546, + "learning_rate": 4.686150999310639e-05, + "loss": 0.0116, + "num_input_tokens_seen": 53566784, + "step": 44015 + }, + { + "epoch": 4.902550395366967, + "grad_norm": 0.15264761447906494, + "learning_rate": 4.6860331231569606e-05, + "loss": 0.178, + "num_input_tokens_seen": 53572960, + "step": 44020 + }, + { + "epoch": 4.903107250250585, + "grad_norm": 0.49086469411849976, + "learning_rate": 4.685915226354449e-05, + "loss": 0.1032, + "num_input_tokens_seen": 53578976, + "step": 44025 + }, + { + "epoch": 4.903664105134202, + "grad_norm": 0.5875064134597778, + "learning_rate": 4.685797308904216e-05, + "loss": 0.0224, + "num_input_tokens_seen": 53585216, + "step": 44030 + }, + { + "epoch": 4.9042209600178195, + "grad_norm": 0.5028687715530396, + "learning_rate": 4.685679370807377e-05, + "loss": 0.0853, + "num_input_tokens_seen": 53591008, + "step": 44035 + }, + { + "epoch": 4.904777814901436, + "grad_norm": 0.7496280670166016, + "learning_rate": 4.685561412065045e-05, + "loss": 0.1101, + "num_input_tokens_seen": 53596736, + "step": 44040 + }, + { + "epoch": 4.905334669785054, + "grad_norm": 0.2956448495388031, + "learning_rate": 4.685443432678334e-05, + "loss": 0.1578, + "num_input_tokens_seen": 53602784, + "step": 44045 + }, + { + "epoch": 4.905891524668672, + "grad_norm": 0.19039560854434967, + "learning_rate": 4.6853254326483586e-05, + "loss": 0.0597, + "num_input_tokens_seen": 53608128, + "step": 44050 + }, + { + "epoch": 4.906448379552288, + "grad_norm": 0.5168957710266113, + "learning_rate": 4.685207411976235e-05, + "loss": 0.1421, + "num_input_tokens_seen": 53614080, + "step": 44055 + }, + { + "epoch": 4.907005234435906, + "grad_norm": 0.808876097202301, + "learning_rate": 4.685089370663075e-05, + "loss": 0.1247, + "num_input_tokens_seen": 53620064, + "step": 44060 + }, + { + "epoch": 4.907562089319523, + "grad_norm": 0.3909294605255127, + "learning_rate": 4.684971308709996e-05, + "loss": 0.0938, + "num_input_tokens_seen": 53626048, + "step": 44065 + }, + { + "epoch": 4.9081189442031405, + "grad_norm": 1.1082390546798706, + "learning_rate": 4.684853226118112e-05, + "loss": 0.038, + "num_input_tokens_seen": 53632448, + "step": 44070 + }, + { + "epoch": 4.908675799086758, + "grad_norm": 0.4119672477245331, + "learning_rate": 4.684735122888539e-05, + "loss": 0.0456, + "num_input_tokens_seen": 53638432, + "step": 44075 + }, + { + "epoch": 4.909232653970375, + "grad_norm": 0.40865251421928406, + "learning_rate": 4.684616999022393e-05, + "loss": 0.1488, + "num_input_tokens_seen": 53644256, + "step": 44080 + }, + { + "epoch": 4.909789508853993, + "grad_norm": 0.9703196287155151, + "learning_rate": 4.684498854520789e-05, + "loss": 0.0855, + "num_input_tokens_seen": 53649888, + "step": 44085 + }, + { + "epoch": 4.91034636373761, + "grad_norm": 1.6111212968826294, + "learning_rate": 4.684380689384842e-05, + "loss": 0.1408, + "num_input_tokens_seen": 53655872, + "step": 44090 + }, + { + "epoch": 4.910903218621227, + "grad_norm": 1.0032404661178589, + "learning_rate": 4.6842625036156705e-05, + "loss": 0.1012, + "num_input_tokens_seen": 53661952, + "step": 44095 + }, + { + "epoch": 4.911460073504845, + "grad_norm": 0.6828983426094055, + "learning_rate": 4.68414429721439e-05, + "loss": 0.0782, + "num_input_tokens_seen": 53668352, + "step": 44100 + }, + { + "epoch": 4.912016928388462, + "grad_norm": 0.4699232876300812, + "learning_rate": 4.684026070182115e-05, + "loss": 0.0835, + "num_input_tokens_seen": 53674400, + "step": 44105 + }, + { + "epoch": 4.912573783272079, + "grad_norm": 0.7226839065551758, + "learning_rate": 4.683907822519965e-05, + "loss": 0.1001, + "num_input_tokens_seen": 53680576, + "step": 44110 + }, + { + "epoch": 4.913130638155697, + "grad_norm": 0.0065321046859025955, + "learning_rate": 4.683789554229056e-05, + "loss": 0.0067, + "num_input_tokens_seen": 53686560, + "step": 44115 + }, + { + "epoch": 4.913687493039314, + "grad_norm": 0.11947840452194214, + "learning_rate": 4.683671265310505e-05, + "loss": 0.0299, + "num_input_tokens_seen": 53692832, + "step": 44120 + }, + { + "epoch": 4.914244347922931, + "grad_norm": 0.9938628673553467, + "learning_rate": 4.683552955765429e-05, + "loss": 0.0672, + "num_input_tokens_seen": 53698848, + "step": 44125 + }, + { + "epoch": 4.914801202806549, + "grad_norm": 0.7074129581451416, + "learning_rate": 4.683434625594946e-05, + "loss": 0.0346, + "num_input_tokens_seen": 53705280, + "step": 44130 + }, + { + "epoch": 4.915358057690166, + "grad_norm": 1.267420768737793, + "learning_rate": 4.683316274800174e-05, + "loss": 0.0791, + "num_input_tokens_seen": 53711168, + "step": 44135 + }, + { + "epoch": 4.9159149125737835, + "grad_norm": 0.5857805013656616, + "learning_rate": 4.68319790338223e-05, + "loss": 0.0767, + "num_input_tokens_seen": 53717376, + "step": 44140 + }, + { + "epoch": 4.9164717674574, + "grad_norm": 0.48358434438705444, + "learning_rate": 4.683079511342233e-05, + "loss": 0.0403, + "num_input_tokens_seen": 53723808, + "step": 44145 + }, + { + "epoch": 4.917028622341018, + "grad_norm": 0.7815857529640198, + "learning_rate": 4.682961098681301e-05, + "loss": 0.1072, + "num_input_tokens_seen": 53729920, + "step": 44150 + }, + { + "epoch": 4.917585477224636, + "grad_norm": 0.5612554550170898, + "learning_rate": 4.682842665400552e-05, + "loss": 0.088, + "num_input_tokens_seen": 53735808, + "step": 44155 + }, + { + "epoch": 4.9181423321082525, + "grad_norm": 0.5742343664169312, + "learning_rate": 4.682724211501106e-05, + "loss": 0.1001, + "num_input_tokens_seen": 53741984, + "step": 44160 + }, + { + "epoch": 4.91869918699187, + "grad_norm": 0.0532742440700531, + "learning_rate": 4.682605736984081e-05, + "loss": 0.0358, + "num_input_tokens_seen": 53748160, + "step": 44165 + }, + { + "epoch": 4.919256041875487, + "grad_norm": 1.2525560855865479, + "learning_rate": 4.682487241850595e-05, + "loss": 0.1141, + "num_input_tokens_seen": 53754272, + "step": 44170 + }, + { + "epoch": 4.919812896759105, + "grad_norm": 0.8506428599357605, + "learning_rate": 4.682368726101769e-05, + "loss": 0.0783, + "num_input_tokens_seen": 53760448, + "step": 44175 + }, + { + "epoch": 4.920369751642722, + "grad_norm": 0.7212662100791931, + "learning_rate": 4.6822501897387223e-05, + "loss": 0.0777, + "num_input_tokens_seen": 53766144, + "step": 44180 + }, + { + "epoch": 4.920926606526339, + "grad_norm": 0.49771618843078613, + "learning_rate": 4.682131632762574e-05, + "loss": 0.0547, + "num_input_tokens_seen": 53772256, + "step": 44185 + }, + { + "epoch": 4.921483461409957, + "grad_norm": 0.000714403809979558, + "learning_rate": 4.682013055174444e-05, + "loss": 0.0753, + "num_input_tokens_seen": 53778752, + "step": 44190 + }, + { + "epoch": 4.9220403162935735, + "grad_norm": 0.15341562032699585, + "learning_rate": 4.6818944569754524e-05, + "loss": 0.0926, + "num_input_tokens_seen": 53784672, + "step": 44195 + }, + { + "epoch": 4.922597171177191, + "grad_norm": 0.6820088028907776, + "learning_rate": 4.68177583816672e-05, + "loss": 0.0717, + "num_input_tokens_seen": 53790976, + "step": 44200 + }, + { + "epoch": 4.923154026060809, + "grad_norm": 1.2556400299072266, + "learning_rate": 4.6816571987493665e-05, + "loss": 0.1496, + "num_input_tokens_seen": 53797120, + "step": 44205 + }, + { + "epoch": 4.923710880944426, + "grad_norm": 0.04787212982773781, + "learning_rate": 4.6815385387245125e-05, + "loss": 0.1188, + "num_input_tokens_seen": 53803264, + "step": 44210 + }, + { + "epoch": 4.924267735828043, + "grad_norm": 0.012780167162418365, + "learning_rate": 4.6814198580932794e-05, + "loss": 0.0078, + "num_input_tokens_seen": 53809632, + "step": 44215 + }, + { + "epoch": 4.92482459071166, + "grad_norm": 0.13245221972465515, + "learning_rate": 4.681301156856788e-05, + "loss": 0.0651, + "num_input_tokens_seen": 53815584, + "step": 44220 + }, + { + "epoch": 4.925381445595278, + "grad_norm": 0.27219536900520325, + "learning_rate": 4.68118243501616e-05, + "loss": 0.0697, + "num_input_tokens_seen": 53821952, + "step": 44225 + }, + { + "epoch": 4.9259383004788955, + "grad_norm": 1.4613349437713623, + "learning_rate": 4.681063692572516e-05, + "loss": 0.0602, + "num_input_tokens_seen": 53828512, + "step": 44230 + }, + { + "epoch": 4.926495155362512, + "grad_norm": 0.20635028183460236, + "learning_rate": 4.6809449295269783e-05, + "loss": 0.0133, + "num_input_tokens_seen": 53834592, + "step": 44235 + }, + { + "epoch": 4.92705201024613, + "grad_norm": 1.9919838905334473, + "learning_rate": 4.680826145880668e-05, + "loss": 0.0695, + "num_input_tokens_seen": 53840640, + "step": 44240 + }, + { + "epoch": 4.927608865129747, + "grad_norm": 0.01619461551308632, + "learning_rate": 4.680707341634707e-05, + "loss": 0.147, + "num_input_tokens_seen": 53846944, + "step": 44245 + }, + { + "epoch": 4.928165720013364, + "grad_norm": 0.012039156630635262, + "learning_rate": 4.680588516790219e-05, + "loss": 0.003, + "num_input_tokens_seen": 53853024, + "step": 44250 + }, + { + "epoch": 4.928722574896982, + "grad_norm": 0.5612748265266418, + "learning_rate": 4.680469671348325e-05, + "loss": 0.0774, + "num_input_tokens_seen": 53859328, + "step": 44255 + }, + { + "epoch": 4.929279429780599, + "grad_norm": 1.1054508686065674, + "learning_rate": 4.680350805310148e-05, + "loss": 0.1376, + "num_input_tokens_seen": 53865536, + "step": 44260 + }, + { + "epoch": 4.9298362846642165, + "grad_norm": 0.1187925934791565, + "learning_rate": 4.6802319186768105e-05, + "loss": 0.049, + "num_input_tokens_seen": 53872000, + "step": 44265 + }, + { + "epoch": 4.930393139547834, + "grad_norm": 0.010274361819028854, + "learning_rate": 4.680113011449436e-05, + "loss": 0.0393, + "num_input_tokens_seen": 53877824, + "step": 44270 + }, + { + "epoch": 4.930949994431451, + "grad_norm": 0.3793221116065979, + "learning_rate": 4.679994083629148e-05, + "loss": 0.0383, + "num_input_tokens_seen": 53883168, + "step": 44275 + }, + { + "epoch": 4.931506849315069, + "grad_norm": 0.21211197972297668, + "learning_rate": 4.679875135217068e-05, + "loss": 0.0364, + "num_input_tokens_seen": 53889184, + "step": 44280 + }, + { + "epoch": 4.9320637041986854, + "grad_norm": 0.5615761876106262, + "learning_rate": 4.679756166214322e-05, + "loss": 0.0375, + "num_input_tokens_seen": 53895264, + "step": 44285 + }, + { + "epoch": 4.932620559082303, + "grad_norm": 0.3512169420719147, + "learning_rate": 4.6796371766220324e-05, + "loss": 0.1119, + "num_input_tokens_seen": 53901632, + "step": 44290 + }, + { + "epoch": 4.933177413965921, + "grad_norm": 0.2573259770870209, + "learning_rate": 4.679518166441324e-05, + "loss": 0.0915, + "num_input_tokens_seen": 53907360, + "step": 44295 + }, + { + "epoch": 4.933734268849538, + "grad_norm": 0.23160867393016815, + "learning_rate": 4.6793991356733186e-05, + "loss": 0.0094, + "num_input_tokens_seen": 53913856, + "step": 44300 + }, + { + "epoch": 4.934291123733155, + "grad_norm": 0.11596503853797913, + "learning_rate": 4.679280084319143e-05, + "loss": 0.0292, + "num_input_tokens_seen": 53920128, + "step": 44305 + }, + { + "epoch": 4.934847978616773, + "grad_norm": 0.7228561043739319, + "learning_rate": 4.6791610123799213e-05, + "loss": 0.0395, + "num_input_tokens_seen": 53926016, + "step": 44310 + }, + { + "epoch": 4.93540483350039, + "grad_norm": 0.007592706475406885, + "learning_rate": 4.679041919856778e-05, + "loss": 0.0092, + "num_input_tokens_seen": 53932416, + "step": 44315 + }, + { + "epoch": 4.935961688384007, + "grad_norm": 0.5358933210372925, + "learning_rate": 4.6789228067508376e-05, + "loss": 0.0512, + "num_input_tokens_seen": 53937408, + "step": 44320 + }, + { + "epoch": 4.936518543267624, + "grad_norm": 0.03563397750258446, + "learning_rate": 4.6788036730632255e-05, + "loss": 0.0086, + "num_input_tokens_seen": 53943328, + "step": 44325 + }, + { + "epoch": 4.937075398151242, + "grad_norm": 0.9770001769065857, + "learning_rate": 4.6786845187950675e-05, + "loss": 0.0305, + "num_input_tokens_seen": 53949664, + "step": 44330 + }, + { + "epoch": 4.9376322530348595, + "grad_norm": 1.0367451906204224, + "learning_rate": 4.6785653439474886e-05, + "loss": 0.121, + "num_input_tokens_seen": 53955968, + "step": 44335 + }, + { + "epoch": 4.938189107918476, + "grad_norm": 0.3893777132034302, + "learning_rate": 4.678446148521615e-05, + "loss": 0.2406, + "num_input_tokens_seen": 53961888, + "step": 44340 + }, + { + "epoch": 4.938745962802094, + "grad_norm": 0.45178651809692383, + "learning_rate": 4.6783269325185705e-05, + "loss": 0.0707, + "num_input_tokens_seen": 53967744, + "step": 44345 + }, + { + "epoch": 4.939302817685711, + "grad_norm": 0.012725072912871838, + "learning_rate": 4.678207695939485e-05, + "loss": 0.0233, + "num_input_tokens_seen": 53974176, + "step": 44350 + }, + { + "epoch": 4.9398596725693285, + "grad_norm": 0.7634017467498779, + "learning_rate": 4.6780884387854815e-05, + "loss": 0.1128, + "num_input_tokens_seen": 53980256, + "step": 44355 + }, + { + "epoch": 4.940416527452946, + "grad_norm": 0.9102902412414551, + "learning_rate": 4.6779691610576874e-05, + "loss": 0.1395, + "num_input_tokens_seen": 53986400, + "step": 44360 + }, + { + "epoch": 4.940973382336563, + "grad_norm": 0.20600536465644836, + "learning_rate": 4.6778498627572295e-05, + "loss": 0.0749, + "num_input_tokens_seen": 53992448, + "step": 44365 + }, + { + "epoch": 4.941530237220181, + "grad_norm": 0.01970081776380539, + "learning_rate": 4.677730543885236e-05, + "loss": 0.027, + "num_input_tokens_seen": 53998688, + "step": 44370 + }, + { + "epoch": 4.942087092103797, + "grad_norm": 0.0010021787602454424, + "learning_rate": 4.677611204442832e-05, + "loss": 0.088, + "num_input_tokens_seen": 54004800, + "step": 44375 + }, + { + "epoch": 4.942643946987415, + "grad_norm": 0.9776735901832581, + "learning_rate": 4.677491844431145e-05, + "loss": 0.1088, + "num_input_tokens_seen": 54011040, + "step": 44380 + }, + { + "epoch": 4.943200801871033, + "grad_norm": 0.024649543687701225, + "learning_rate": 4.677372463851304e-05, + "loss": 0.1544, + "num_input_tokens_seen": 54017152, + "step": 44385 + }, + { + "epoch": 4.9437576567546495, + "grad_norm": 1.5459837913513184, + "learning_rate": 4.677253062704435e-05, + "loss": 0.045, + "num_input_tokens_seen": 54023296, + "step": 44390 + }, + { + "epoch": 4.944314511638267, + "grad_norm": 0.5060530304908752, + "learning_rate": 4.6771336409916664e-05, + "loss": 0.0398, + "num_input_tokens_seen": 54029632, + "step": 44395 + }, + { + "epoch": 4.944871366521884, + "grad_norm": 0.2678850293159485, + "learning_rate": 4.677014198714127e-05, + "loss": 0.0472, + "num_input_tokens_seen": 54035872, + "step": 44400 + }, + { + "epoch": 4.945428221405502, + "grad_norm": 0.0036735665053129196, + "learning_rate": 4.676894735872944e-05, + "loss": 0.027, + "num_input_tokens_seen": 54042176, + "step": 44405 + }, + { + "epoch": 4.945985076289119, + "grad_norm": 0.8350379467010498, + "learning_rate": 4.676775252469246e-05, + "loss": 0.0643, + "num_input_tokens_seen": 54048384, + "step": 44410 + }, + { + "epoch": 4.946541931172736, + "grad_norm": 0.03581526502966881, + "learning_rate": 4.676655748504161e-05, + "loss": 0.1191, + "num_input_tokens_seen": 54054944, + "step": 44415 + }, + { + "epoch": 4.947098786056354, + "grad_norm": 0.3428368866443634, + "learning_rate": 4.676536223978819e-05, + "loss": 0.1002, + "num_input_tokens_seen": 54061024, + "step": 44420 + }, + { + "epoch": 4.947655640939971, + "grad_norm": 0.2442844808101654, + "learning_rate": 4.6764166788943484e-05, + "loss": 0.0362, + "num_input_tokens_seen": 54067200, + "step": 44425 + }, + { + "epoch": 4.948212495823588, + "grad_norm": 2.082157611846924, + "learning_rate": 4.676297113251879e-05, + "loss": 0.1959, + "num_input_tokens_seen": 54073440, + "step": 44430 + }, + { + "epoch": 4.948769350707206, + "grad_norm": 0.1472720354795456, + "learning_rate": 4.67617752705254e-05, + "loss": 0.0672, + "num_input_tokens_seen": 54079424, + "step": 44435 + }, + { + "epoch": 4.949326205590823, + "grad_norm": 0.09531111270189285, + "learning_rate": 4.676057920297461e-05, + "loss": 0.1226, + "num_input_tokens_seen": 54085792, + "step": 44440 + }, + { + "epoch": 4.94988306047444, + "grad_norm": 0.6804160475730896, + "learning_rate": 4.6759382929877705e-05, + "loss": 0.1245, + "num_input_tokens_seen": 54092000, + "step": 44445 + }, + { + "epoch": 4.950439915358058, + "grad_norm": 1.4520299434661865, + "learning_rate": 4.675818645124599e-05, + "loss": 0.076, + "num_input_tokens_seen": 54098080, + "step": 44450 + }, + { + "epoch": 4.950996770241675, + "grad_norm": 0.0015300324885174632, + "learning_rate": 4.675698976709079e-05, + "loss": 0.1114, + "num_input_tokens_seen": 54104000, + "step": 44455 + }, + { + "epoch": 4.9515536251252925, + "grad_norm": 0.6266718506813049, + "learning_rate": 4.675579287742337e-05, + "loss": 0.0383, + "num_input_tokens_seen": 54110400, + "step": 44460 + }, + { + "epoch": 4.952110480008909, + "grad_norm": 0.027937114238739014, + "learning_rate": 4.6754595782255074e-05, + "loss": 0.0224, + "num_input_tokens_seen": 54116384, + "step": 44465 + }, + { + "epoch": 4.952667334892527, + "grad_norm": 0.2138093113899231, + "learning_rate": 4.675339848159718e-05, + "loss": 0.037, + "num_input_tokens_seen": 54122240, + "step": 44470 + }, + { + "epoch": 4.953224189776145, + "grad_norm": 0.037711210548877716, + "learning_rate": 4.6752200975461014e-05, + "loss": 0.0237, + "num_input_tokens_seen": 54128544, + "step": 44475 + }, + { + "epoch": 4.953781044659761, + "grad_norm": 0.6318591237068176, + "learning_rate": 4.675100326385788e-05, + "loss": 0.0981, + "num_input_tokens_seen": 54134784, + "step": 44480 + }, + { + "epoch": 4.954337899543379, + "grad_norm": 0.1912403255701065, + "learning_rate": 4.674980534679909e-05, + "loss": 0.0783, + "num_input_tokens_seen": 54140992, + "step": 44485 + }, + { + "epoch": 4.954894754426997, + "grad_norm": 1.2531938552856445, + "learning_rate": 4.674860722429597e-05, + "loss": 0.0555, + "num_input_tokens_seen": 54147424, + "step": 44490 + }, + { + "epoch": 4.955451609310614, + "grad_norm": 0.08303207159042358, + "learning_rate": 4.6747408896359824e-05, + "loss": 0.0724, + "num_input_tokens_seen": 54153888, + "step": 44495 + }, + { + "epoch": 4.956008464194231, + "grad_norm": 0.6975898742675781, + "learning_rate": 4.6746210363001984e-05, + "loss": 0.2097, + "num_input_tokens_seen": 54159840, + "step": 44500 + }, + { + "epoch": 4.956565319077848, + "grad_norm": 0.05675272271037102, + "learning_rate": 4.674501162423376e-05, + "loss": 0.0368, + "num_input_tokens_seen": 54166208, + "step": 44505 + }, + { + "epoch": 4.957122173961466, + "grad_norm": 1.3950321674346924, + "learning_rate": 4.674381268006648e-05, + "loss": 0.1637, + "num_input_tokens_seen": 54172384, + "step": 44510 + }, + { + "epoch": 4.957679028845083, + "grad_norm": 1.6851320266723633, + "learning_rate": 4.6742613530511464e-05, + "loss": 0.0537, + "num_input_tokens_seen": 54178592, + "step": 44515 + }, + { + "epoch": 4.9582358837287, + "grad_norm": 0.3601852059364319, + "learning_rate": 4.674141417558005e-05, + "loss": 0.0149, + "num_input_tokens_seen": 54184800, + "step": 44520 + }, + { + "epoch": 4.958792738612318, + "grad_norm": 0.41790857911109924, + "learning_rate": 4.6740214615283556e-05, + "loss": 0.0438, + "num_input_tokens_seen": 54190912, + "step": 44525 + }, + { + "epoch": 4.959349593495935, + "grad_norm": 0.058423399925231934, + "learning_rate": 4.6739014849633324e-05, + "loss": 0.0427, + "num_input_tokens_seen": 54197376, + "step": 44530 + }, + { + "epoch": 4.959906448379552, + "grad_norm": 3.470182418823242, + "learning_rate": 4.6737814878640676e-05, + "loss": 0.1057, + "num_input_tokens_seen": 54203296, + "step": 44535 + }, + { + "epoch": 4.96046330326317, + "grad_norm": 1.0134557485580444, + "learning_rate": 4.673661470231694e-05, + "loss": 0.1171, + "num_input_tokens_seen": 54209760, + "step": 44540 + }, + { + "epoch": 4.961020158146787, + "grad_norm": 1.6956202983856201, + "learning_rate": 4.673541432067348e-05, + "loss": 0.1809, + "num_input_tokens_seen": 54215936, + "step": 44545 + }, + { + "epoch": 4.961577013030404, + "grad_norm": 0.08609170466661453, + "learning_rate": 4.67342137337216e-05, + "loss": 0.03, + "num_input_tokens_seen": 54221728, + "step": 44550 + }, + { + "epoch": 4.962133867914021, + "grad_norm": 1.520497441291809, + "learning_rate": 4.673301294147267e-05, + "loss": 0.0487, + "num_input_tokens_seen": 54227552, + "step": 44555 + }, + { + "epoch": 4.962690722797639, + "grad_norm": 0.2536754608154297, + "learning_rate": 4.673181194393802e-05, + "loss": 0.0813, + "num_input_tokens_seen": 54233184, + "step": 44560 + }, + { + "epoch": 4.963247577681257, + "grad_norm": 0.46840035915374756, + "learning_rate": 4.673061074112899e-05, + "loss": 0.0794, + "num_input_tokens_seen": 54239552, + "step": 44565 + }, + { + "epoch": 4.963804432564873, + "grad_norm": 0.24463187158107758, + "learning_rate": 4.672940933305694e-05, + "loss": 0.0538, + "num_input_tokens_seen": 54245440, + "step": 44570 + }, + { + "epoch": 4.964361287448491, + "grad_norm": 0.0007163711707107723, + "learning_rate": 4.67282077197332e-05, + "loss": 0.1719, + "num_input_tokens_seen": 54251808, + "step": 44575 + }, + { + "epoch": 4.964918142332108, + "grad_norm": 0.3839641213417053, + "learning_rate": 4.672700590116913e-05, + "loss": 0.1958, + "num_input_tokens_seen": 54256512, + "step": 44580 + }, + { + "epoch": 4.9654749972157255, + "grad_norm": 0.1630549132823944, + "learning_rate": 4.672580387737609e-05, + "loss": 0.0465, + "num_input_tokens_seen": 54262432, + "step": 44585 + }, + { + "epoch": 4.966031852099343, + "grad_norm": 1.911286473274231, + "learning_rate": 4.672460164836542e-05, + "loss": 0.0609, + "num_input_tokens_seen": 54268672, + "step": 44590 + }, + { + "epoch": 4.96658870698296, + "grad_norm": 0.061622750014066696, + "learning_rate": 4.672339921414848e-05, + "loss": 0.167, + "num_input_tokens_seen": 54274880, + "step": 44595 + }, + { + "epoch": 4.967145561866578, + "grad_norm": 0.010103528387844563, + "learning_rate": 4.6722196574736636e-05, + "loss": 0.0843, + "num_input_tokens_seen": 54280992, + "step": 44600 + }, + { + "epoch": 4.967702416750194, + "grad_norm": 2.061837673187256, + "learning_rate": 4.672099373014124e-05, + "loss": 0.1213, + "num_input_tokens_seen": 54287040, + "step": 44605 + }, + { + "epoch": 4.968259271633812, + "grad_norm": 0.019183143973350525, + "learning_rate": 4.671979068037366e-05, + "loss": 0.1448, + "num_input_tokens_seen": 54293184, + "step": 44610 + }, + { + "epoch": 4.96881612651743, + "grad_norm": 0.3300568163394928, + "learning_rate": 4.671858742544525e-05, + "loss": 0.1334, + "num_input_tokens_seen": 54299328, + "step": 44615 + }, + { + "epoch": 4.9693729814010466, + "grad_norm": 0.6202014088630676, + "learning_rate": 4.671738396536738e-05, + "loss": 0.1255, + "num_input_tokens_seen": 54305280, + "step": 44620 + }, + { + "epoch": 4.969929836284664, + "grad_norm": 0.046908874064683914, + "learning_rate": 4.671618030015142e-05, + "loss": 0.1134, + "num_input_tokens_seen": 54311424, + "step": 44625 + }, + { + "epoch": 4.970486691168282, + "grad_norm": 1.8689535856246948, + "learning_rate": 4.671497642980874e-05, + "loss": 0.1216, + "num_input_tokens_seen": 54317472, + "step": 44630 + }, + { + "epoch": 4.971043546051899, + "grad_norm": 0.013475432060658932, + "learning_rate": 4.671377235435071e-05, + "loss": 0.0806, + "num_input_tokens_seen": 54323872, + "step": 44635 + }, + { + "epoch": 4.971600400935516, + "grad_norm": 0.168732687830925, + "learning_rate": 4.6712568073788696e-05, + "loss": 0.0781, + "num_input_tokens_seen": 54330208, + "step": 44640 + }, + { + "epoch": 4.972157255819133, + "grad_norm": 0.697036623954773, + "learning_rate": 4.671136358813409e-05, + "loss": 0.0785, + "num_input_tokens_seen": 54335808, + "step": 44645 + }, + { + "epoch": 4.972714110702751, + "grad_norm": 0.06396253407001495, + "learning_rate": 4.6710158897398255e-05, + "loss": 0.0439, + "num_input_tokens_seen": 54342144, + "step": 44650 + }, + { + "epoch": 4.9732709655863685, + "grad_norm": 0.5660400986671448, + "learning_rate": 4.6708954001592575e-05, + "loss": 0.0828, + "num_input_tokens_seen": 54348416, + "step": 44655 + }, + { + "epoch": 4.973827820469985, + "grad_norm": 0.24324730038642883, + "learning_rate": 4.670774890072843e-05, + "loss": 0.041, + "num_input_tokens_seen": 54354240, + "step": 44660 + }, + { + "epoch": 4.974384675353603, + "grad_norm": 0.20715580880641937, + "learning_rate": 4.670654359481721e-05, + "loss": 0.0762, + "num_input_tokens_seen": 54360320, + "step": 44665 + }, + { + "epoch": 4.974941530237221, + "grad_norm": 0.19618278741836548, + "learning_rate": 4.67053380838703e-05, + "loss": 0.2006, + "num_input_tokens_seen": 54366400, + "step": 44670 + }, + { + "epoch": 4.975498385120837, + "grad_norm": 1.2062914371490479, + "learning_rate": 4.670413236789907e-05, + "loss": 0.0725, + "num_input_tokens_seen": 54372608, + "step": 44675 + }, + { + "epoch": 4.976055240004455, + "grad_norm": 0.1682758331298828, + "learning_rate": 4.6702926446914926e-05, + "loss": 0.0687, + "num_input_tokens_seen": 54378144, + "step": 44680 + }, + { + "epoch": 4.976612094888072, + "grad_norm": 0.08250664174556732, + "learning_rate": 4.670172032092925e-05, + "loss": 0.1757, + "num_input_tokens_seen": 54384256, + "step": 44685 + }, + { + "epoch": 4.9771689497716896, + "grad_norm": 0.029566800221800804, + "learning_rate": 4.670051398995344e-05, + "loss": 0.1076, + "num_input_tokens_seen": 54390400, + "step": 44690 + }, + { + "epoch": 4.977725804655307, + "grad_norm": 0.3574429452419281, + "learning_rate": 4.66993074539989e-05, + "loss": 0.1193, + "num_input_tokens_seen": 54396608, + "step": 44695 + }, + { + "epoch": 4.978282659538924, + "grad_norm": 0.29551804065704346, + "learning_rate": 4.6698100713077e-05, + "loss": 0.1231, + "num_input_tokens_seen": 54402688, + "step": 44700 + }, + { + "epoch": 4.978839514422542, + "grad_norm": 0.005309098865836859, + "learning_rate": 4.6696893767199154e-05, + "loss": 0.0651, + "num_input_tokens_seen": 54408992, + "step": 44705 + }, + { + "epoch": 4.9793963693061585, + "grad_norm": 0.011315583251416683, + "learning_rate": 4.669568661637678e-05, + "loss": 0.0323, + "num_input_tokens_seen": 54415104, + "step": 44710 + }, + { + "epoch": 4.979953224189776, + "grad_norm": 0.06392008811235428, + "learning_rate": 4.669447926062125e-05, + "loss": 0.077, + "num_input_tokens_seen": 54420928, + "step": 44715 + }, + { + "epoch": 4.980510079073394, + "grad_norm": 0.001863323850557208, + "learning_rate": 4.6693271699943985e-05, + "loss": 0.0894, + "num_input_tokens_seen": 54427040, + "step": 44720 + }, + { + "epoch": 4.981066933957011, + "grad_norm": 1.1377558708190918, + "learning_rate": 4.669206393435639e-05, + "loss": 0.098, + "num_input_tokens_seen": 54433024, + "step": 44725 + }, + { + "epoch": 4.981623788840628, + "grad_norm": 1.8803406953811646, + "learning_rate": 4.6690855963869874e-05, + "loss": 0.0288, + "num_input_tokens_seen": 54439200, + "step": 44730 + }, + { + "epoch": 4.982180643724245, + "grad_norm": 0.002613317919895053, + "learning_rate": 4.668964778849584e-05, + "loss": 0.0248, + "num_input_tokens_seen": 54445120, + "step": 44735 + }, + { + "epoch": 4.982737498607863, + "grad_norm": 0.547510027885437, + "learning_rate": 4.66884394082457e-05, + "loss": 0.1348, + "num_input_tokens_seen": 54451456, + "step": 44740 + }, + { + "epoch": 4.98329435349148, + "grad_norm": 0.15471425652503967, + "learning_rate": 4.668723082313089e-05, + "loss": 0.0842, + "num_input_tokens_seen": 54457760, + "step": 44745 + }, + { + "epoch": 4.983851208375097, + "grad_norm": 0.3268495500087738, + "learning_rate": 4.66860220331628e-05, + "loss": 0.0704, + "num_input_tokens_seen": 54463968, + "step": 44750 + }, + { + "epoch": 4.984408063258715, + "grad_norm": 0.013291840441524982, + "learning_rate": 4.668481303835285e-05, + "loss": 0.1307, + "num_input_tokens_seen": 54469856, + "step": 44755 + }, + { + "epoch": 4.984964918142332, + "grad_norm": 0.0032863537780940533, + "learning_rate": 4.668360383871248e-05, + "loss": 0.0674, + "num_input_tokens_seen": 54475744, + "step": 44760 + }, + { + "epoch": 4.985521773025949, + "grad_norm": 0.023379968479275703, + "learning_rate": 4.668239443425309e-05, + "loss": 0.0666, + "num_input_tokens_seen": 54481568, + "step": 44765 + }, + { + "epoch": 4.986078627909567, + "grad_norm": 0.0007368149817921221, + "learning_rate": 4.6681184824986115e-05, + "loss": 0.0598, + "num_input_tokens_seen": 54487680, + "step": 44770 + }, + { + "epoch": 4.986635482793184, + "grad_norm": 0.11046478897333145, + "learning_rate": 4.667997501092298e-05, + "loss": 0.0933, + "num_input_tokens_seen": 54493664, + "step": 44775 + }, + { + "epoch": 4.9871923376768015, + "grad_norm": 0.05437258630990982, + "learning_rate": 4.6678764992075114e-05, + "loss": 0.0185, + "num_input_tokens_seen": 54499936, + "step": 44780 + }, + { + "epoch": 4.987749192560418, + "grad_norm": 0.28342384099960327, + "learning_rate": 4.667755476845394e-05, + "loss": 0.0712, + "num_input_tokens_seen": 54505984, + "step": 44785 + }, + { + "epoch": 4.988306047444036, + "grad_norm": 1.7769321203231812, + "learning_rate": 4.66763443400709e-05, + "loss": 0.1033, + "num_input_tokens_seen": 54512160, + "step": 44790 + }, + { + "epoch": 4.988862902327654, + "grad_norm": 0.34558477997779846, + "learning_rate": 4.667513370693741e-05, + "loss": 0.0233, + "num_input_tokens_seen": 54518272, + "step": 44795 + }, + { + "epoch": 4.98941975721127, + "grad_norm": 0.5554163455963135, + "learning_rate": 4.6673922869064925e-05, + "loss": 0.0759, + "num_input_tokens_seen": 54524096, + "step": 44800 + }, + { + "epoch": 4.989976612094888, + "grad_norm": 1.0000832080841064, + "learning_rate": 4.667271182646487e-05, + "loss": 0.1589, + "num_input_tokens_seen": 54529856, + "step": 44805 + }, + { + "epoch": 4.990533466978506, + "grad_norm": 0.03430813178420067, + "learning_rate": 4.667150057914868e-05, + "loss": 0.1758, + "num_input_tokens_seen": 54536192, + "step": 44810 + }, + { + "epoch": 4.9910903218621225, + "grad_norm": 0.1884249448776245, + "learning_rate": 4.667028912712782e-05, + "loss": 0.074, + "num_input_tokens_seen": 54542144, + "step": 44815 + }, + { + "epoch": 4.99164717674574, + "grad_norm": 0.026467453688383102, + "learning_rate": 4.6669077470413714e-05, + "loss": 0.0675, + "num_input_tokens_seen": 54548192, + "step": 44820 + }, + { + "epoch": 4.992204031629358, + "grad_norm": 0.6662651896476746, + "learning_rate": 4.6667865609017806e-05, + "loss": 0.0983, + "num_input_tokens_seen": 54553696, + "step": 44825 + }, + { + "epoch": 4.992760886512975, + "grad_norm": 0.08381152898073196, + "learning_rate": 4.6666653542951544e-05, + "loss": 0.0663, + "num_input_tokens_seen": 54559808, + "step": 44830 + }, + { + "epoch": 4.993317741396592, + "grad_norm": 0.2925296425819397, + "learning_rate": 4.666544127222638e-05, + "loss": 0.0586, + "num_input_tokens_seen": 54565888, + "step": 44835 + }, + { + "epoch": 4.993874596280209, + "grad_norm": 0.21100133657455444, + "learning_rate": 4.666422879685377e-05, + "loss": 0.0823, + "num_input_tokens_seen": 54572064, + "step": 44840 + }, + { + "epoch": 4.994431451163827, + "grad_norm": 0.38550323247909546, + "learning_rate": 4.666301611684516e-05, + "loss": 0.1898, + "num_input_tokens_seen": 54577984, + "step": 44845 + }, + { + "epoch": 4.9949883060474445, + "grad_norm": 0.26491254568099976, + "learning_rate": 4.666180323221201e-05, + "loss": 0.0296, + "num_input_tokens_seen": 54584224, + "step": 44850 + }, + { + "epoch": 4.995545160931061, + "grad_norm": 0.3746231198310852, + "learning_rate": 4.666059014296577e-05, + "loss": 0.0457, + "num_input_tokens_seen": 54590464, + "step": 44855 + }, + { + "epoch": 4.996102015814679, + "grad_norm": 0.38971298933029175, + "learning_rate": 4.6659376849117906e-05, + "loss": 0.0725, + "num_input_tokens_seen": 54596672, + "step": 44860 + }, + { + "epoch": 4.996658870698296, + "grad_norm": 1.474569320678711, + "learning_rate": 4.6658163350679865e-05, + "loss": 0.0905, + "num_input_tokens_seen": 54603072, + "step": 44865 + }, + { + "epoch": 4.997215725581913, + "grad_norm": 0.1929636299610138, + "learning_rate": 4.665694964766313e-05, + "loss": 0.0526, + "num_input_tokens_seen": 54609344, + "step": 44870 + }, + { + "epoch": 4.997772580465531, + "grad_norm": 0.23131205141544342, + "learning_rate": 4.665573574007915e-05, + "loss": 0.0815, + "num_input_tokens_seen": 54615488, + "step": 44875 + }, + { + "epoch": 4.998329435349148, + "grad_norm": 0.2408783733844757, + "learning_rate": 4.6654521627939394e-05, + "loss": 0.042, + "num_input_tokens_seen": 54621696, + "step": 44880 + }, + { + "epoch": 4.9988862902327655, + "grad_norm": 0.19654814898967743, + "learning_rate": 4.6653307311255337e-05, + "loss": 0.0559, + "num_input_tokens_seen": 54627520, + "step": 44885 + }, + { + "epoch": 4.999443145116382, + "grad_norm": 0.7406058311462402, + "learning_rate": 4.6652092790038436e-05, + "loss": 0.0488, + "num_input_tokens_seen": 54633728, + "step": 44890 + }, + { + "epoch": 5.0, + "grad_norm": 0.9235504865646362, + "learning_rate": 4.665087806430017e-05, + "loss": 0.0597, + "num_input_tokens_seen": 54639040, + "step": 44895 + }, + { + "epoch": 5.0, + "eval_loss": 0.08369806408882141, + "eval_runtime": 112.6582, + "eval_samples_per_second": 35.426, + "eval_steps_per_second": 8.859, + "num_input_tokens_seen": 54639040, + "step": 44895 + }, + { + "epoch": 5.000556854883618, + "grad_norm": 0.2378118634223938, + "learning_rate": 4.6649663134052024e-05, + "loss": 0.1206, + "num_input_tokens_seen": 54645152, + "step": 44900 + }, + { + "epoch": 5.0011137097672345, + "grad_norm": 0.17302867770195007, + "learning_rate": 4.6648447999305464e-05, + "loss": 0.1433, + "num_input_tokens_seen": 54651264, + "step": 44905 + }, + { + "epoch": 5.001670564650852, + "grad_norm": 0.16424232721328735, + "learning_rate": 4.664723266007196e-05, + "loss": 0.0105, + "num_input_tokens_seen": 54657440, + "step": 44910 + }, + { + "epoch": 5.002227419534469, + "grad_norm": 0.6582285761833191, + "learning_rate": 4.664601711636301e-05, + "loss": 0.1432, + "num_input_tokens_seen": 54663616, + "step": 44915 + }, + { + "epoch": 5.002784274418087, + "grad_norm": 0.014911646023392677, + "learning_rate": 4.664480136819007e-05, + "loss": 0.01, + "num_input_tokens_seen": 54669824, + "step": 44920 + }, + { + "epoch": 5.003341129301704, + "grad_norm": 0.6952040195465088, + "learning_rate": 4.664358541556465e-05, + "loss": 0.0396, + "num_input_tokens_seen": 54675872, + "step": 44925 + }, + { + "epoch": 5.003897984185321, + "grad_norm": 0.8216918706893921, + "learning_rate": 4.664236925849823e-05, + "loss": 0.0509, + "num_input_tokens_seen": 54681952, + "step": 44930 + }, + { + "epoch": 5.004454839068939, + "grad_norm": 0.12157321721315384, + "learning_rate": 4.6641152897002286e-05, + "loss": 0.0506, + "num_input_tokens_seen": 54688224, + "step": 44935 + }, + { + "epoch": 5.005011693952556, + "grad_norm": 0.029465224593877792, + "learning_rate": 4.663993633108832e-05, + "loss": 0.135, + "num_input_tokens_seen": 54694048, + "step": 44940 + }, + { + "epoch": 5.005568548836173, + "grad_norm": 0.004767961334437132, + "learning_rate": 4.6638719560767805e-05, + "loss": 0.0355, + "num_input_tokens_seen": 54700192, + "step": 44945 + }, + { + "epoch": 5.006125403719791, + "grad_norm": 1.0801546573638916, + "learning_rate": 4.6637502586052265e-05, + "loss": 0.0945, + "num_input_tokens_seen": 54706336, + "step": 44950 + }, + { + "epoch": 5.006682258603408, + "grad_norm": 1.9246141910552979, + "learning_rate": 4.663628540695316e-05, + "loss": 0.11, + "num_input_tokens_seen": 54712448, + "step": 44955 + }, + { + "epoch": 5.007239113487025, + "grad_norm": 0.06004389747977257, + "learning_rate": 4.663506802348201e-05, + "loss": 0.0954, + "num_input_tokens_seen": 54718560, + "step": 44960 + }, + { + "epoch": 5.007795968370643, + "grad_norm": 0.6811844110488892, + "learning_rate": 4.663385043565032e-05, + "loss": 0.0696, + "num_input_tokens_seen": 54724704, + "step": 44965 + }, + { + "epoch": 5.00835282325426, + "grad_norm": 0.08847884833812714, + "learning_rate": 4.663263264346956e-05, + "loss": 0.0367, + "num_input_tokens_seen": 54730592, + "step": 44970 + }, + { + "epoch": 5.0089096781378775, + "grad_norm": 0.823615550994873, + "learning_rate": 4.663141464695127e-05, + "loss": 0.1185, + "num_input_tokens_seen": 54736384, + "step": 44975 + }, + { + "epoch": 5.009466533021494, + "grad_norm": 1.474109411239624, + "learning_rate": 4.663019644610693e-05, + "loss": 0.1238, + "num_input_tokens_seen": 54742464, + "step": 44980 + }, + { + "epoch": 5.010023387905112, + "grad_norm": 0.0019866400398314, + "learning_rate": 4.6628978040948056e-05, + "loss": 0.0368, + "num_input_tokens_seen": 54748672, + "step": 44985 + }, + { + "epoch": 5.01058024278873, + "grad_norm": 0.06308579444885254, + "learning_rate": 4.662775943148616e-05, + "loss": 0.0801, + "num_input_tokens_seen": 54754752, + "step": 44990 + }, + { + "epoch": 5.011137097672346, + "grad_norm": 0.00024807066074572504, + "learning_rate": 4.662654061773275e-05, + "loss": 0.0839, + "num_input_tokens_seen": 54761152, + "step": 44995 + }, + { + "epoch": 5.011693952555964, + "grad_norm": 0.26277709007263184, + "learning_rate": 4.6625321599699326e-05, + "loss": 0.031, + "num_input_tokens_seen": 54766848, + "step": 45000 + }, + { + "epoch": 5.012250807439581, + "grad_norm": 0.23031195998191833, + "learning_rate": 4.662410237739742e-05, + "loss": 0.0522, + "num_input_tokens_seen": 54773088, + "step": 45005 + }, + { + "epoch": 5.0128076623231985, + "grad_norm": 0.7788370847702026, + "learning_rate": 4.6622882950838544e-05, + "loss": 0.0553, + "num_input_tokens_seen": 54779296, + "step": 45010 + }, + { + "epoch": 5.013364517206816, + "grad_norm": 0.4818168580532074, + "learning_rate": 4.662166332003421e-05, + "loss": 0.0539, + "num_input_tokens_seen": 54785248, + "step": 45015 + }, + { + "epoch": 5.013921372090433, + "grad_norm": 1.3754557371139526, + "learning_rate": 4.6620443484995944e-05, + "loss": 0.0601, + "num_input_tokens_seen": 54791264, + "step": 45020 + }, + { + "epoch": 5.014478226974051, + "grad_norm": 0.002181309973821044, + "learning_rate": 4.661922344573527e-05, + "loss": 0.0073, + "num_input_tokens_seen": 54797504, + "step": 45025 + }, + { + "epoch": 5.015035081857668, + "grad_norm": 0.20179279148578644, + "learning_rate": 4.6618003202263704e-05, + "loss": 0.014, + "num_input_tokens_seen": 54803744, + "step": 45030 + }, + { + "epoch": 5.015591936741285, + "grad_norm": 0.14837627112865448, + "learning_rate": 4.661678275459279e-05, + "loss": 0.1093, + "num_input_tokens_seen": 54809888, + "step": 45035 + }, + { + "epoch": 5.016148791624903, + "grad_norm": 0.014663660898804665, + "learning_rate": 4.661556210273403e-05, + "loss": 0.1191, + "num_input_tokens_seen": 54816160, + "step": 45040 + }, + { + "epoch": 5.01670564650852, + "grad_norm": 0.11315859854221344, + "learning_rate": 4.6614341246698973e-05, + "loss": 0.013, + "num_input_tokens_seen": 54822560, + "step": 45045 + }, + { + "epoch": 5.017262501392137, + "grad_norm": 0.05435018986463547, + "learning_rate": 4.661312018649915e-05, + "loss": 0.0794, + "num_input_tokens_seen": 54828320, + "step": 45050 + }, + { + "epoch": 5.017819356275755, + "grad_norm": 0.010641902685165405, + "learning_rate": 4.661189892214608e-05, + "loss": 0.0051, + "num_input_tokens_seen": 54834656, + "step": 45055 + }, + { + "epoch": 5.018376211159372, + "grad_norm": 1.1601486206054688, + "learning_rate": 4.661067745365132e-05, + "loss": 0.0844, + "num_input_tokens_seen": 54840736, + "step": 45060 + }, + { + "epoch": 5.018933066042989, + "grad_norm": 0.02580336295068264, + "learning_rate": 4.660945578102639e-05, + "loss": 0.0604, + "num_input_tokens_seen": 54846688, + "step": 45065 + }, + { + "epoch": 5.019489920926606, + "grad_norm": 0.370515912771225, + "learning_rate": 4.6608233904282836e-05, + "loss": 0.0474, + "num_input_tokens_seen": 54852928, + "step": 45070 + }, + { + "epoch": 5.020046775810224, + "grad_norm": 0.5883588790893555, + "learning_rate": 4.660701182343221e-05, + "loss": 0.0594, + "num_input_tokens_seen": 54858656, + "step": 45075 + }, + { + "epoch": 5.0206036306938415, + "grad_norm": 0.1501230001449585, + "learning_rate": 4.660578953848604e-05, + "loss": 0.02, + "num_input_tokens_seen": 54864896, + "step": 45080 + }, + { + "epoch": 5.021160485577458, + "grad_norm": 0.12094159424304962, + "learning_rate": 4.660456704945588e-05, + "loss": 0.1577, + "num_input_tokens_seen": 54871264, + "step": 45085 + }, + { + "epoch": 5.021717340461076, + "grad_norm": 0.5871977210044861, + "learning_rate": 4.6603344356353275e-05, + "loss": 0.1273, + "num_input_tokens_seen": 54877152, + "step": 45090 + }, + { + "epoch": 5.022274195344693, + "grad_norm": 0.7067388892173767, + "learning_rate": 4.660212145918977e-05, + "loss": 0.0245, + "num_input_tokens_seen": 54883552, + "step": 45095 + }, + { + "epoch": 5.0228310502283104, + "grad_norm": 0.00798301212489605, + "learning_rate": 4.6600898357976924e-05, + "loss": 0.0554, + "num_input_tokens_seen": 54889344, + "step": 45100 + }, + { + "epoch": 5.023387905111928, + "grad_norm": 0.10050386190414429, + "learning_rate": 4.659967505272628e-05, + "loss": 0.0632, + "num_input_tokens_seen": 54895648, + "step": 45105 + }, + { + "epoch": 5.023944759995545, + "grad_norm": 0.011988387443125248, + "learning_rate": 4.6598451543449404e-05, + "loss": 0.054, + "num_input_tokens_seen": 54901856, + "step": 45110 + }, + { + "epoch": 5.024501614879163, + "grad_norm": 0.0002139130956493318, + "learning_rate": 4.659722783015785e-05, + "loss": 0.0597, + "num_input_tokens_seen": 54908096, + "step": 45115 + }, + { + "epoch": 5.02505846976278, + "grad_norm": 0.515155553817749, + "learning_rate": 4.659600391286318e-05, + "loss": 0.0712, + "num_input_tokens_seen": 54913920, + "step": 45120 + }, + { + "epoch": 5.025615324646397, + "grad_norm": 0.6778508424758911, + "learning_rate": 4.659477979157695e-05, + "loss": 0.0661, + "num_input_tokens_seen": 54920352, + "step": 45125 + }, + { + "epoch": 5.026172179530015, + "grad_norm": 0.11247935146093369, + "learning_rate": 4.659355546631072e-05, + "loss": 0.1135, + "num_input_tokens_seen": 54925792, + "step": 45130 + }, + { + "epoch": 5.0267290344136315, + "grad_norm": 0.06419382989406586, + "learning_rate": 4.659233093707606e-05, + "loss": 0.0841, + "num_input_tokens_seen": 54931136, + "step": 45135 + }, + { + "epoch": 5.027285889297249, + "grad_norm": 0.005326569080352783, + "learning_rate": 4.659110620388453e-05, + "loss": 0.0121, + "num_input_tokens_seen": 54937184, + "step": 45140 + }, + { + "epoch": 5.027842744180867, + "grad_norm": 1.1477222442626953, + "learning_rate": 4.6589881266747704e-05, + "loss": 0.221, + "num_input_tokens_seen": 54942912, + "step": 45145 + }, + { + "epoch": 5.028399599064484, + "grad_norm": 0.08317261934280396, + "learning_rate": 4.6588656125677155e-05, + "loss": 0.0231, + "num_input_tokens_seen": 54949248, + "step": 45150 + }, + { + "epoch": 5.028956453948101, + "grad_norm": 0.012075821869075298, + "learning_rate": 4.6587430780684446e-05, + "loss": 0.0217, + "num_input_tokens_seen": 54955584, + "step": 45155 + }, + { + "epoch": 5.029513308831718, + "grad_norm": 0.7870747447013855, + "learning_rate": 4.6586205231781166e-05, + "loss": 0.0892, + "num_input_tokens_seen": 54961664, + "step": 45160 + }, + { + "epoch": 5.030070163715336, + "grad_norm": 0.20219174027442932, + "learning_rate": 4.658497947897889e-05, + "loss": 0.1013, + "num_input_tokens_seen": 54967968, + "step": 45165 + }, + { + "epoch": 5.0306270185989534, + "grad_norm": 0.2760806083679199, + "learning_rate": 4.658375352228917e-05, + "loss": 0.0078, + "num_input_tokens_seen": 54973984, + "step": 45170 + }, + { + "epoch": 5.03118387348257, + "grad_norm": 0.02791755273938179, + "learning_rate": 4.6582527361723615e-05, + "loss": 0.1012, + "num_input_tokens_seen": 54980224, + "step": 45175 + }, + { + "epoch": 5.031740728366188, + "grad_norm": 0.045563988387584686, + "learning_rate": 4.6581300997293794e-05, + "loss": 0.0224, + "num_input_tokens_seen": 54986272, + "step": 45180 + }, + { + "epoch": 5.032297583249805, + "grad_norm": 0.371549516916275, + "learning_rate": 4.65800744290113e-05, + "loss": 0.0332, + "num_input_tokens_seen": 54992448, + "step": 45185 + }, + { + "epoch": 5.032854438133422, + "grad_norm": 0.6865014433860779, + "learning_rate": 4.6578847656887715e-05, + "loss": 0.0738, + "num_input_tokens_seen": 54997824, + "step": 45190 + }, + { + "epoch": 5.03341129301704, + "grad_norm": 0.10351234674453735, + "learning_rate": 4.657762068093462e-05, + "loss": 0.0076, + "num_input_tokens_seen": 55004064, + "step": 45195 + }, + { + "epoch": 5.033968147900657, + "grad_norm": 0.006886964663863182, + "learning_rate": 4.657639350116361e-05, + "loss": 0.0179, + "num_input_tokens_seen": 55010400, + "step": 45200 + }, + { + "epoch": 5.0345250027842745, + "grad_norm": 0.23217058181762695, + "learning_rate": 4.657516611758628e-05, + "loss": 0.057, + "num_input_tokens_seen": 55016544, + "step": 45205 + }, + { + "epoch": 5.035081857667892, + "grad_norm": 0.8624809384346008, + "learning_rate": 4.6573938530214214e-05, + "loss": 0.0581, + "num_input_tokens_seen": 55022624, + "step": 45210 + }, + { + "epoch": 5.035638712551509, + "grad_norm": 0.48269960284233093, + "learning_rate": 4.6572710739059024e-05, + "loss": 0.0706, + "num_input_tokens_seen": 55028064, + "step": 45215 + }, + { + "epoch": 5.036195567435127, + "grad_norm": 1.9831818342208862, + "learning_rate": 4.657148274413229e-05, + "loss": 0.0926, + "num_input_tokens_seen": 55034016, + "step": 45220 + }, + { + "epoch": 5.036752422318743, + "grad_norm": 2.506640672683716, + "learning_rate": 4.6570254545445624e-05, + "loss": 0.0586, + "num_input_tokens_seen": 55040384, + "step": 45225 + }, + { + "epoch": 5.037309277202361, + "grad_norm": 0.038244787603616714, + "learning_rate": 4.656902614301062e-05, + "loss": 0.0403, + "num_input_tokens_seen": 55046752, + "step": 45230 + }, + { + "epoch": 5.037866132085979, + "grad_norm": 1.0112316608428955, + "learning_rate": 4.6567797536838874e-05, + "loss": 0.0682, + "num_input_tokens_seen": 55052576, + "step": 45235 + }, + { + "epoch": 5.038422986969596, + "grad_norm": 0.040596600621938705, + "learning_rate": 4.6566568726942013e-05, + "loss": 0.019, + "num_input_tokens_seen": 55058656, + "step": 45240 + }, + { + "epoch": 5.038979841853213, + "grad_norm": 0.011377708055078983, + "learning_rate": 4.656533971333162e-05, + "loss": 0.0198, + "num_input_tokens_seen": 55064928, + "step": 45245 + }, + { + "epoch": 5.03953669673683, + "grad_norm": 0.6994600892066956, + "learning_rate": 4.656411049601933e-05, + "loss": 0.0561, + "num_input_tokens_seen": 55071200, + "step": 45250 + }, + { + "epoch": 5.040093551620448, + "grad_norm": 0.33010467886924744, + "learning_rate": 4.656288107501673e-05, + "loss": 0.0405, + "num_input_tokens_seen": 55077024, + "step": 45255 + }, + { + "epoch": 5.040650406504065, + "grad_norm": 0.08700372278690338, + "learning_rate": 4.656165145033544e-05, + "loss": 0.0552, + "num_input_tokens_seen": 55082720, + "step": 45260 + }, + { + "epoch": 5.041207261387682, + "grad_norm": 0.13934312760829926, + "learning_rate": 4.656042162198708e-05, + "loss": 0.0893, + "num_input_tokens_seen": 55088992, + "step": 45265 + }, + { + "epoch": 5.0417641162713, + "grad_norm": 0.17354102432727814, + "learning_rate": 4.6559191589983264e-05, + "loss": 0.0907, + "num_input_tokens_seen": 55094880, + "step": 45270 + }, + { + "epoch": 5.042320971154917, + "grad_norm": 0.024015678092837334, + "learning_rate": 4.655796135433561e-05, + "loss": 0.0046, + "num_input_tokens_seen": 55100800, + "step": 45275 + }, + { + "epoch": 5.042877826038534, + "grad_norm": 2.3858752250671387, + "learning_rate": 4.655673091505575e-05, + "loss": 0.0987, + "num_input_tokens_seen": 55106784, + "step": 45280 + }, + { + "epoch": 5.043434680922152, + "grad_norm": 0.7009181976318359, + "learning_rate": 4.655550027215528e-05, + "loss": 0.0558, + "num_input_tokens_seen": 55113120, + "step": 45285 + }, + { + "epoch": 5.043991535805769, + "grad_norm": 0.0422041155397892, + "learning_rate": 4.6554269425645844e-05, + "loss": 0.0458, + "num_input_tokens_seen": 55119200, + "step": 45290 + }, + { + "epoch": 5.044548390689386, + "grad_norm": 2.1118898391723633, + "learning_rate": 4.655303837553907e-05, + "loss": 0.0305, + "num_input_tokens_seen": 55124960, + "step": 45295 + }, + { + "epoch": 5.045105245573004, + "grad_norm": 0.04234067350625992, + "learning_rate": 4.6551807121846567e-05, + "loss": 0.0349, + "num_input_tokens_seen": 55131104, + "step": 45300 + }, + { + "epoch": 5.045662100456621, + "grad_norm": 1.1633046865463257, + "learning_rate": 4.655057566458e-05, + "loss": 0.1881, + "num_input_tokens_seen": 55137472, + "step": 45305 + }, + { + "epoch": 5.046218955340239, + "grad_norm": 0.5238215923309326, + "learning_rate": 4.654934400375096e-05, + "loss": 0.1476, + "num_input_tokens_seen": 55143520, + "step": 45310 + }, + { + "epoch": 5.046775810223855, + "grad_norm": 0.04501689597964287, + "learning_rate": 4.6548112139371115e-05, + "loss": 0.091, + "num_input_tokens_seen": 55149568, + "step": 45315 + }, + { + "epoch": 5.047332665107473, + "grad_norm": 0.0006942970212548971, + "learning_rate": 4.654688007145208e-05, + "loss": 0.1227, + "num_input_tokens_seen": 55155840, + "step": 45320 + }, + { + "epoch": 5.047889519991091, + "grad_norm": 0.47192880511283875, + "learning_rate": 4.65456478000055e-05, + "loss": 0.0926, + "num_input_tokens_seen": 55162016, + "step": 45325 + }, + { + "epoch": 5.0484463748747075, + "grad_norm": 0.15929558873176575, + "learning_rate": 4.6544415325043014e-05, + "loss": 0.0705, + "num_input_tokens_seen": 55168320, + "step": 45330 + }, + { + "epoch": 5.049003229758325, + "grad_norm": 0.022423692047595978, + "learning_rate": 4.654318264657627e-05, + "loss": 0.0759, + "num_input_tokens_seen": 55173792, + "step": 45335 + }, + { + "epoch": 5.049560084641942, + "grad_norm": 0.007018808275461197, + "learning_rate": 4.6541949764616895e-05, + "loss": 0.0808, + "num_input_tokens_seen": 55179936, + "step": 45340 + }, + { + "epoch": 5.05011693952556, + "grad_norm": 0.2346188873052597, + "learning_rate": 4.654071667917655e-05, + "loss": 0.0574, + "num_input_tokens_seen": 55186080, + "step": 45345 + }, + { + "epoch": 5.050673794409177, + "grad_norm": 0.5123080015182495, + "learning_rate": 4.653948339026688e-05, + "loss": 0.1008, + "num_input_tokens_seen": 55192704, + "step": 45350 + }, + { + "epoch": 5.051230649292794, + "grad_norm": 0.009946633130311966, + "learning_rate": 4.6538249897899534e-05, + "loss": 0.0114, + "num_input_tokens_seen": 55198112, + "step": 45355 + }, + { + "epoch": 5.051787504176412, + "grad_norm": 1.61802339553833, + "learning_rate": 4.653701620208615e-05, + "loss": 0.0935, + "num_input_tokens_seen": 55204192, + "step": 45360 + }, + { + "epoch": 5.0523443590600285, + "grad_norm": 0.007586758583784103, + "learning_rate": 4.65357823028384e-05, + "loss": 0.0852, + "num_input_tokens_seen": 55210400, + "step": 45365 + }, + { + "epoch": 5.052901213943646, + "grad_norm": 0.8964149355888367, + "learning_rate": 4.653454820016794e-05, + "loss": 0.1165, + "num_input_tokens_seen": 55216544, + "step": 45370 + }, + { + "epoch": 5.053458068827264, + "grad_norm": 1.314441442489624, + "learning_rate": 4.653331389408642e-05, + "loss": 0.0596, + "num_input_tokens_seen": 55222688, + "step": 45375 + }, + { + "epoch": 5.054014923710881, + "grad_norm": 0.4388805627822876, + "learning_rate": 4.6532079384605485e-05, + "loss": 0.0728, + "num_input_tokens_seen": 55228000, + "step": 45380 + }, + { + "epoch": 5.054571778594498, + "grad_norm": 0.3363698422908783, + "learning_rate": 4.6530844671736815e-05, + "loss": 0.0375, + "num_input_tokens_seen": 55234112, + "step": 45385 + }, + { + "epoch": 5.055128633478116, + "grad_norm": 0.007903920486569405, + "learning_rate": 4.6529609755492064e-05, + "loss": 0.0451, + "num_input_tokens_seen": 55240480, + "step": 45390 + }, + { + "epoch": 5.055685488361733, + "grad_norm": 1.72026789188385, + "learning_rate": 4.6528374635882896e-05, + "loss": 0.1349, + "num_input_tokens_seen": 55246656, + "step": 45395 + }, + { + "epoch": 5.0562423432453505, + "grad_norm": 0.06326708942651749, + "learning_rate": 4.652713931292099e-05, + "loss": 0.2121, + "num_input_tokens_seen": 55252672, + "step": 45400 + }, + { + "epoch": 5.056799198128967, + "grad_norm": 0.001403473550453782, + "learning_rate": 4.6525903786618007e-05, + "loss": 0.0145, + "num_input_tokens_seen": 55258528, + "step": 45405 + }, + { + "epoch": 5.057356053012585, + "grad_norm": 0.7230168581008911, + "learning_rate": 4.652466805698561e-05, + "loss": 0.0612, + "num_input_tokens_seen": 55264800, + "step": 45410 + }, + { + "epoch": 5.057912907896203, + "grad_norm": 1.3380953073501587, + "learning_rate": 4.652343212403548e-05, + "loss": 0.1571, + "num_input_tokens_seen": 55270944, + "step": 45415 + }, + { + "epoch": 5.058469762779819, + "grad_norm": 0.033049702644348145, + "learning_rate": 4.6522195987779296e-05, + "loss": 0.0888, + "num_input_tokens_seen": 55277088, + "step": 45420 + }, + { + "epoch": 5.059026617663437, + "grad_norm": 0.020570406690239906, + "learning_rate": 4.6520959648228716e-05, + "loss": 0.0416, + "num_input_tokens_seen": 55283520, + "step": 45425 + }, + { + "epoch": 5.059583472547054, + "grad_norm": 0.0394277423620224, + "learning_rate": 4.651972310539544e-05, + "loss": 0.0275, + "num_input_tokens_seen": 55289280, + "step": 45430 + }, + { + "epoch": 5.0601403274306715, + "grad_norm": 0.9431671500205994, + "learning_rate": 4.6518486359291136e-05, + "loss": 0.0802, + "num_input_tokens_seen": 55295392, + "step": 45435 + }, + { + "epoch": 5.060697182314289, + "grad_norm": 0.36751335859298706, + "learning_rate": 4.651724940992748e-05, + "loss": 0.0979, + "num_input_tokens_seen": 55301760, + "step": 45440 + }, + { + "epoch": 5.061254037197906, + "grad_norm": 0.07991614192724228, + "learning_rate": 4.6516012257316174e-05, + "loss": 0.066, + "num_input_tokens_seen": 55307712, + "step": 45445 + }, + { + "epoch": 5.061810892081524, + "grad_norm": 0.23878927528858185, + "learning_rate": 4.651477490146888e-05, + "loss": 0.0227, + "num_input_tokens_seen": 55313664, + "step": 45450 + }, + { + "epoch": 5.0623677469651405, + "grad_norm": 1.1150093078613281, + "learning_rate": 4.6513537342397316e-05, + "loss": 0.0415, + "num_input_tokens_seen": 55319968, + "step": 45455 + }, + { + "epoch": 5.062924601848758, + "grad_norm": 0.7863612174987793, + "learning_rate": 4.651229958011315e-05, + "loss": 0.0541, + "num_input_tokens_seen": 55326080, + "step": 45460 + }, + { + "epoch": 5.063481456732376, + "grad_norm": 1.7660576105117798, + "learning_rate": 4.6511061614628075e-05, + "loss": 0.0521, + "num_input_tokens_seen": 55332000, + "step": 45465 + }, + { + "epoch": 5.064038311615993, + "grad_norm": 0.0005353910964913666, + "learning_rate": 4.650982344595379e-05, + "loss": 0.2399, + "num_input_tokens_seen": 55338208, + "step": 45470 + }, + { + "epoch": 5.06459516649961, + "grad_norm": 0.0034382762387394905, + "learning_rate": 4.6508585074101996e-05, + "loss": 0.0576, + "num_input_tokens_seen": 55344416, + "step": 45475 + }, + { + "epoch": 5.065152021383228, + "grad_norm": 0.743748664855957, + "learning_rate": 4.650734649908437e-05, + "loss": 0.1672, + "num_input_tokens_seen": 55350304, + "step": 45480 + }, + { + "epoch": 5.065708876266845, + "grad_norm": 0.8909115195274353, + "learning_rate": 4.650610772091264e-05, + "loss": 0.0892, + "num_input_tokens_seen": 55355840, + "step": 45485 + }, + { + "epoch": 5.066265731150462, + "grad_norm": 0.2610120177268982, + "learning_rate": 4.650486873959848e-05, + "loss": 0.0288, + "num_input_tokens_seen": 55361952, + "step": 45490 + }, + { + "epoch": 5.066822586034079, + "grad_norm": 1.6280837059020996, + "learning_rate": 4.650362955515362e-05, + "loss": 0.0923, + "num_input_tokens_seen": 55368256, + "step": 45495 + }, + { + "epoch": 5.067379440917697, + "grad_norm": 0.17677457630634308, + "learning_rate": 4.650239016758974e-05, + "loss": 0.0068, + "num_input_tokens_seen": 55374432, + "step": 45500 + }, + { + "epoch": 5.0679362958013146, + "grad_norm": 0.31570565700531006, + "learning_rate": 4.650115057691855e-05, + "loss": 0.0376, + "num_input_tokens_seen": 55380704, + "step": 45505 + }, + { + "epoch": 5.068493150684931, + "grad_norm": 0.00031429066439159214, + "learning_rate": 4.649991078315178e-05, + "loss": 0.0883, + "num_input_tokens_seen": 55386720, + "step": 45510 + }, + { + "epoch": 5.069050005568549, + "grad_norm": 0.15756197273731232, + "learning_rate": 4.649867078630112e-05, + "loss": 0.0134, + "num_input_tokens_seen": 55393184, + "step": 45515 + }, + { + "epoch": 5.069606860452166, + "grad_norm": 0.1795673370361328, + "learning_rate": 4.6497430586378285e-05, + "loss": 0.0964, + "num_input_tokens_seen": 55399456, + "step": 45520 + }, + { + "epoch": 5.0701637153357835, + "grad_norm": 0.004507836420089006, + "learning_rate": 4.6496190183395e-05, + "loss": 0.0353, + "num_input_tokens_seen": 55405696, + "step": 45525 + }, + { + "epoch": 5.070720570219401, + "grad_norm": 0.09615515172481537, + "learning_rate": 4.649494957736298e-05, + "loss": 0.1028, + "num_input_tokens_seen": 55411680, + "step": 45530 + }, + { + "epoch": 5.071277425103018, + "grad_norm": 0.6621116995811462, + "learning_rate": 4.6493708768293944e-05, + "loss": 0.1117, + "num_input_tokens_seen": 55418080, + "step": 45535 + }, + { + "epoch": 5.071834279986636, + "grad_norm": 0.0031451501417905092, + "learning_rate": 4.64924677561996e-05, + "loss": 0.1315, + "num_input_tokens_seen": 55423712, + "step": 45540 + }, + { + "epoch": 5.072391134870252, + "grad_norm": 0.048143405467271805, + "learning_rate": 4.6491226541091685e-05, + "loss": 0.0609, + "num_input_tokens_seen": 55429792, + "step": 45545 + }, + { + "epoch": 5.07294798975387, + "grad_norm": 0.1927432119846344, + "learning_rate": 4.64899851229819e-05, + "loss": 0.0348, + "num_input_tokens_seen": 55436032, + "step": 45550 + }, + { + "epoch": 5.073504844637488, + "grad_norm": 0.7391871213912964, + "learning_rate": 4.6488743501882e-05, + "loss": 0.0787, + "num_input_tokens_seen": 55442432, + "step": 45555 + }, + { + "epoch": 5.0740616995211045, + "grad_norm": 0.9122852087020874, + "learning_rate": 4.648750167780371e-05, + "loss": 0.0776, + "num_input_tokens_seen": 55448448, + "step": 45560 + }, + { + "epoch": 5.074618554404722, + "grad_norm": 0.006121423095464706, + "learning_rate": 4.648625965075874e-05, + "loss": 0.0645, + "num_input_tokens_seen": 55454624, + "step": 45565 + }, + { + "epoch": 5.07517540928834, + "grad_norm": 0.036360420286655426, + "learning_rate": 4.648501742075884e-05, + "loss": 0.0213, + "num_input_tokens_seen": 55461184, + "step": 45570 + }, + { + "epoch": 5.075732264171957, + "grad_norm": 0.00841253437101841, + "learning_rate": 4.648377498781573e-05, + "loss": 0.062, + "num_input_tokens_seen": 55467360, + "step": 45575 + }, + { + "epoch": 5.076289119055574, + "grad_norm": 0.3801330327987671, + "learning_rate": 4.6482532351941155e-05, + "loss": 0.1008, + "num_input_tokens_seen": 55473440, + "step": 45580 + }, + { + "epoch": 5.076845973939191, + "grad_norm": 0.0004546324198599905, + "learning_rate": 4.648128951314685e-05, + "loss": 0.0065, + "num_input_tokens_seen": 55480064, + "step": 45585 + }, + { + "epoch": 5.077402828822809, + "grad_norm": 0.02542661502957344, + "learning_rate": 4.6480046471444554e-05, + "loss": 0.0393, + "num_input_tokens_seen": 55486304, + "step": 45590 + }, + { + "epoch": 5.0779596837064265, + "grad_norm": 0.22767217457294464, + "learning_rate": 4.6478803226846016e-05, + "loss": 0.0659, + "num_input_tokens_seen": 55492896, + "step": 45595 + }, + { + "epoch": 5.078516538590043, + "grad_norm": 0.078639455139637, + "learning_rate": 4.647755977936297e-05, + "loss": 0.0323, + "num_input_tokens_seen": 55499264, + "step": 45600 + }, + { + "epoch": 5.079073393473661, + "grad_norm": 1.2273426055908203, + "learning_rate": 4.647631612900716e-05, + "loss": 0.0703, + "num_input_tokens_seen": 55505568, + "step": 45605 + }, + { + "epoch": 5.079630248357278, + "grad_norm": 0.9335610270500183, + "learning_rate": 4.647507227579034e-05, + "loss": 0.0302, + "num_input_tokens_seen": 55512032, + "step": 45610 + }, + { + "epoch": 5.080187103240895, + "grad_norm": 0.9077879190444946, + "learning_rate": 4.6473828219724255e-05, + "loss": 0.0483, + "num_input_tokens_seen": 55518016, + "step": 45615 + }, + { + "epoch": 5.080743958124513, + "grad_norm": 0.3883957266807556, + "learning_rate": 4.6472583960820656e-05, + "loss": 0.0375, + "num_input_tokens_seen": 55523936, + "step": 45620 + }, + { + "epoch": 5.08130081300813, + "grad_norm": 0.17026779055595398, + "learning_rate": 4.6471339499091305e-05, + "loss": 0.0123, + "num_input_tokens_seen": 55530144, + "step": 45625 + }, + { + "epoch": 5.0818576678917475, + "grad_norm": 0.32653510570526123, + "learning_rate": 4.647009483454795e-05, + "loss": 0.0215, + "num_input_tokens_seen": 55536416, + "step": 45630 + }, + { + "epoch": 5.082414522775364, + "grad_norm": 0.7717971205711365, + "learning_rate": 4.6468849967202344e-05, + "loss": 0.0796, + "num_input_tokens_seen": 55542240, + "step": 45635 + }, + { + "epoch": 5.082971377658982, + "grad_norm": 0.22656182944774628, + "learning_rate": 4.646760489706625e-05, + "loss": 0.0467, + "num_input_tokens_seen": 55548480, + "step": 45640 + }, + { + "epoch": 5.0835282325426, + "grad_norm": 0.4981364607810974, + "learning_rate": 4.646635962415142e-05, + "loss": 0.0683, + "num_input_tokens_seen": 55554432, + "step": 45645 + }, + { + "epoch": 5.0840850874262165, + "grad_norm": 1.5569257736206055, + "learning_rate": 4.646511414846964e-05, + "loss": 0.0427, + "num_input_tokens_seen": 55560000, + "step": 45650 + }, + { + "epoch": 5.084641942309834, + "grad_norm": 0.3633243441581726, + "learning_rate": 4.646386847003265e-05, + "loss": 0.0514, + "num_input_tokens_seen": 55566080, + "step": 45655 + }, + { + "epoch": 5.085198797193452, + "grad_norm": 0.10391437262296677, + "learning_rate": 4.6462622588852234e-05, + "loss": 0.1078, + "num_input_tokens_seen": 55572096, + "step": 45660 + }, + { + "epoch": 5.085755652077069, + "grad_norm": 0.13928094506263733, + "learning_rate": 4.646137650494014e-05, + "loss": 0.1502, + "num_input_tokens_seen": 55578368, + "step": 45665 + }, + { + "epoch": 5.086312506960686, + "grad_norm": 0.7515103816986084, + "learning_rate": 4.6460130218308154e-05, + "loss": 0.0651, + "num_input_tokens_seen": 55584608, + "step": 45670 + }, + { + "epoch": 5.086869361844303, + "grad_norm": 1.4229868650436401, + "learning_rate": 4.645888372896805e-05, + "loss": 0.0297, + "num_input_tokens_seen": 55590848, + "step": 45675 + }, + { + "epoch": 5.087426216727921, + "grad_norm": 1.1461961269378662, + "learning_rate": 4.6457637036931594e-05, + "loss": 0.0958, + "num_input_tokens_seen": 55596512, + "step": 45680 + }, + { + "epoch": 5.087983071611538, + "grad_norm": 0.5046659111976624, + "learning_rate": 4.645639014221057e-05, + "loss": 0.0342, + "num_input_tokens_seen": 55602848, + "step": 45685 + }, + { + "epoch": 5.088539926495155, + "grad_norm": 0.15112194418907166, + "learning_rate": 4.645514304481674e-05, + "loss": 0.0533, + "num_input_tokens_seen": 55609184, + "step": 45690 + }, + { + "epoch": 5.089096781378773, + "grad_norm": 0.005193472374230623, + "learning_rate": 4.64538957447619e-05, + "loss": 0.2682, + "num_input_tokens_seen": 55614688, + "step": 45695 + }, + { + "epoch": 5.08965363626239, + "grad_norm": 0.4720799922943115, + "learning_rate": 4.645264824205782e-05, + "loss": 0.0991, + "num_input_tokens_seen": 55620544, + "step": 45700 + }, + { + "epoch": 5.090210491146007, + "grad_norm": 1.7258273363113403, + "learning_rate": 4.6451400536716295e-05, + "loss": 0.1154, + "num_input_tokens_seen": 55626432, + "step": 45705 + }, + { + "epoch": 5.090767346029625, + "grad_norm": 0.5746020674705505, + "learning_rate": 4.645015262874911e-05, + "loss": 0.047, + "num_input_tokens_seen": 55632576, + "step": 45710 + }, + { + "epoch": 5.091324200913242, + "grad_norm": 0.08516458421945572, + "learning_rate": 4.644890451816804e-05, + "loss": 0.064, + "num_input_tokens_seen": 55638368, + "step": 45715 + }, + { + "epoch": 5.0918810557968595, + "grad_norm": 0.005402255337685347, + "learning_rate": 4.644765620498489e-05, + "loss": 0.0922, + "num_input_tokens_seen": 55644416, + "step": 45720 + }, + { + "epoch": 5.092437910680476, + "grad_norm": 0.45696085691452026, + "learning_rate": 4.644640768921143e-05, + "loss": 0.0447, + "num_input_tokens_seen": 55650368, + "step": 45725 + }, + { + "epoch": 5.092994765564094, + "grad_norm": 0.0018127451185137033, + "learning_rate": 4.644515897085948e-05, + "loss": 0.0173, + "num_input_tokens_seen": 55656736, + "step": 45730 + }, + { + "epoch": 5.093551620447712, + "grad_norm": 0.36588290333747864, + "learning_rate": 4.644391004994082e-05, + "loss": 0.079, + "num_input_tokens_seen": 55662976, + "step": 45735 + }, + { + "epoch": 5.094108475331328, + "grad_norm": 0.15170663595199585, + "learning_rate": 4.644266092646725e-05, + "loss": 0.0263, + "num_input_tokens_seen": 55669024, + "step": 45740 + }, + { + "epoch": 5.094665330214946, + "grad_norm": 0.27818605303764343, + "learning_rate": 4.644141160045056e-05, + "loss": 0.0968, + "num_input_tokens_seen": 55675104, + "step": 45745 + }, + { + "epoch": 5.095222185098564, + "grad_norm": 0.3121112585067749, + "learning_rate": 4.6440162071902566e-05, + "loss": 0.0273, + "num_input_tokens_seen": 55681376, + "step": 45750 + }, + { + "epoch": 5.0957790399821805, + "grad_norm": 0.4562875032424927, + "learning_rate": 4.643891234083506e-05, + "loss": 0.0512, + "num_input_tokens_seen": 55687456, + "step": 45755 + }, + { + "epoch": 5.096335894865798, + "grad_norm": 0.10159773379564285, + "learning_rate": 4.643766240725985e-05, + "loss": 0.0631, + "num_input_tokens_seen": 55693728, + "step": 45760 + }, + { + "epoch": 5.096892749749415, + "grad_norm": 0.16546630859375, + "learning_rate": 4.6436412271188746e-05, + "loss": 0.0779, + "num_input_tokens_seen": 55700032, + "step": 45765 + }, + { + "epoch": 5.097449604633033, + "grad_norm": 0.37376469373703003, + "learning_rate": 4.6435161932633554e-05, + "loss": 0.1096, + "num_input_tokens_seen": 55706080, + "step": 45770 + }, + { + "epoch": 5.09800645951665, + "grad_norm": 0.2152329534292221, + "learning_rate": 4.643391139160608e-05, + "loss": 0.0893, + "num_input_tokens_seen": 55711392, + "step": 45775 + }, + { + "epoch": 5.098563314400267, + "grad_norm": 1.231136441230774, + "learning_rate": 4.643266064811814e-05, + "loss": 0.0534, + "num_input_tokens_seen": 55717504, + "step": 45780 + }, + { + "epoch": 5.099120169283885, + "grad_norm": 0.37400341033935547, + "learning_rate": 4.643140970218155e-05, + "loss": 0.0402, + "num_input_tokens_seen": 55723424, + "step": 45785 + }, + { + "epoch": 5.099677024167502, + "grad_norm": 0.10169554501771927, + "learning_rate": 4.643015855380813e-05, + "loss": 0.0192, + "num_input_tokens_seen": 55729760, + "step": 45790 + }, + { + "epoch": 5.100233879051119, + "grad_norm": 0.8257652521133423, + "learning_rate": 4.6428907203009685e-05, + "loss": 0.0379, + "num_input_tokens_seen": 55735808, + "step": 45795 + }, + { + "epoch": 5.100790733934737, + "grad_norm": 0.05278487503528595, + "learning_rate": 4.642765564979805e-05, + "loss": 0.065, + "num_input_tokens_seen": 55741344, + "step": 45800 + }, + { + "epoch": 5.101347588818354, + "grad_norm": 0.729441225528717, + "learning_rate": 4.642640389418503e-05, + "loss": 0.029, + "num_input_tokens_seen": 55747712, + "step": 45805 + }, + { + "epoch": 5.101904443701971, + "grad_norm": 0.07476206868886948, + "learning_rate": 4.642515193618247e-05, + "loss": 0.0727, + "num_input_tokens_seen": 55753280, + "step": 45810 + }, + { + "epoch": 5.102461298585588, + "grad_norm": 1.1783477067947388, + "learning_rate": 4.6423899775802184e-05, + "loss": 0.1136, + "num_input_tokens_seen": 55759328, + "step": 45815 + }, + { + "epoch": 5.103018153469206, + "grad_norm": 0.010736338794231415, + "learning_rate": 4.642264741305599e-05, + "loss": 0.0363, + "num_input_tokens_seen": 55765376, + "step": 45820 + }, + { + "epoch": 5.1035750083528235, + "grad_norm": 0.19467374682426453, + "learning_rate": 4.642139484795574e-05, + "loss": 0.0129, + "num_input_tokens_seen": 55771520, + "step": 45825 + }, + { + "epoch": 5.10413186323644, + "grad_norm": 0.2501971125602722, + "learning_rate": 4.642014208051324e-05, + "loss": 0.067, + "num_input_tokens_seen": 55777344, + "step": 45830 + }, + { + "epoch": 5.104688718120058, + "grad_norm": 0.2804088592529297, + "learning_rate": 4.641888911074034e-05, + "loss": 0.0407, + "num_input_tokens_seen": 55782848, + "step": 45835 + }, + { + "epoch": 5.105245573003676, + "grad_norm": 0.0006561152986250818, + "learning_rate": 4.641763593864888e-05, + "loss": 0.0307, + "num_input_tokens_seen": 55788960, + "step": 45840 + }, + { + "epoch": 5.105802427887292, + "grad_norm": 0.0016401089960709214, + "learning_rate": 4.641638256425068e-05, + "loss": 0.1206, + "num_input_tokens_seen": 55795232, + "step": 45845 + }, + { + "epoch": 5.10635928277091, + "grad_norm": 0.04465865343809128, + "learning_rate": 4.6415128987557596e-05, + "loss": 0.0924, + "num_input_tokens_seen": 55801152, + "step": 45850 + }, + { + "epoch": 5.106916137654527, + "grad_norm": 0.32581815123558044, + "learning_rate": 4.641387520858146e-05, + "loss": 0.0463, + "num_input_tokens_seen": 55806688, + "step": 45855 + }, + { + "epoch": 5.107472992538145, + "grad_norm": 0.0005207091453485191, + "learning_rate": 4.641262122733411e-05, + "loss": 0.075, + "num_input_tokens_seen": 55812992, + "step": 45860 + }, + { + "epoch": 5.108029847421762, + "grad_norm": 1.0376591682434082, + "learning_rate": 4.64113670438274e-05, + "loss": 0.0548, + "num_input_tokens_seen": 55819072, + "step": 45865 + }, + { + "epoch": 5.108586702305379, + "grad_norm": 0.5623506307601929, + "learning_rate": 4.641011265807318e-05, + "loss": 0.1734, + "num_input_tokens_seen": 55825440, + "step": 45870 + }, + { + "epoch": 5.109143557188997, + "grad_norm": 0.00564216636121273, + "learning_rate": 4.640885807008328e-05, + "loss": 0.0743, + "num_input_tokens_seen": 55831584, + "step": 45875 + }, + { + "epoch": 5.1097004120726135, + "grad_norm": 1.3411766290664673, + "learning_rate": 4.640760327986957e-05, + "loss": 0.0772, + "num_input_tokens_seen": 55837856, + "step": 45880 + }, + { + "epoch": 5.110257266956231, + "grad_norm": 0.01345907710492611, + "learning_rate": 4.64063482874439e-05, + "loss": 0.0238, + "num_input_tokens_seen": 55843968, + "step": 45885 + }, + { + "epoch": 5.110814121839849, + "grad_norm": 0.46419456601142883, + "learning_rate": 4.640509309281811e-05, + "loss": 0.2442, + "num_input_tokens_seen": 55850048, + "step": 45890 + }, + { + "epoch": 5.111370976723466, + "grad_norm": 0.23480574786663055, + "learning_rate": 4.640383769600407e-05, + "loss": 0.0419, + "num_input_tokens_seen": 55855520, + "step": 45895 + }, + { + "epoch": 5.111927831607083, + "grad_norm": 0.010380025953054428, + "learning_rate": 4.640258209701364e-05, + "loss": 0.0433, + "num_input_tokens_seen": 55861632, + "step": 45900 + }, + { + "epoch": 5.1124846864907, + "grad_norm": 0.45863935351371765, + "learning_rate": 4.640132629585867e-05, + "loss": 0.1189, + "num_input_tokens_seen": 55867104, + "step": 45905 + }, + { + "epoch": 5.113041541374318, + "grad_norm": 1.4878605604171753, + "learning_rate": 4.6400070292551025e-05, + "loss": 0.1347, + "num_input_tokens_seen": 55872736, + "step": 45910 + }, + { + "epoch": 5.113598396257935, + "grad_norm": 0.7427395582199097, + "learning_rate": 4.639881408710257e-05, + "loss": 0.0156, + "num_input_tokens_seen": 55878720, + "step": 45915 + }, + { + "epoch": 5.114155251141552, + "grad_norm": 0.0475965291261673, + "learning_rate": 4.6397557679525175e-05, + "loss": 0.0175, + "num_input_tokens_seen": 55884800, + "step": 45920 + }, + { + "epoch": 5.11471210602517, + "grad_norm": 0.10252239555120468, + "learning_rate": 4.639630106983071e-05, + "loss": 0.0934, + "num_input_tokens_seen": 55890720, + "step": 45925 + }, + { + "epoch": 5.115268960908788, + "grad_norm": 0.17624680697917938, + "learning_rate": 4.639504425803103e-05, + "loss": 0.01, + "num_input_tokens_seen": 55896928, + "step": 45930 + }, + { + "epoch": 5.115825815792404, + "grad_norm": 0.0009721518727019429, + "learning_rate": 4.6393787244138023e-05, + "loss": 0.0186, + "num_input_tokens_seen": 55903136, + "step": 45935 + }, + { + "epoch": 5.116382670676022, + "grad_norm": 1.0191905498504639, + "learning_rate": 4.639253002816354e-05, + "loss": 0.2003, + "num_input_tokens_seen": 55909280, + "step": 45940 + }, + { + "epoch": 5.116939525559639, + "grad_norm": 0.07600461691617966, + "learning_rate": 4.6391272610119486e-05, + "loss": 0.0041, + "num_input_tokens_seen": 55915552, + "step": 45945 + }, + { + "epoch": 5.1174963804432565, + "grad_norm": 0.4264678359031677, + "learning_rate": 4.639001499001772e-05, + "loss": 0.0763, + "num_input_tokens_seen": 55920800, + "step": 45950 + }, + { + "epoch": 5.118053235326874, + "grad_norm": 0.8093297481536865, + "learning_rate": 4.638875716787012e-05, + "loss": 0.0613, + "num_input_tokens_seen": 55926336, + "step": 45955 + }, + { + "epoch": 5.118610090210491, + "grad_norm": 0.03535928949713707, + "learning_rate": 4.638749914368858e-05, + "loss": 0.0396, + "num_input_tokens_seen": 55932320, + "step": 45960 + }, + { + "epoch": 5.119166945094109, + "grad_norm": 0.09408359229564667, + "learning_rate": 4.638624091748497e-05, + "loss": 0.0303, + "num_input_tokens_seen": 55938528, + "step": 45965 + }, + { + "epoch": 5.119723799977725, + "grad_norm": 0.1211244985461235, + "learning_rate": 4.638498248927118e-05, + "loss": 0.0167, + "num_input_tokens_seen": 55944704, + "step": 45970 + }, + { + "epoch": 5.120280654861343, + "grad_norm": 0.3518109917640686, + "learning_rate": 4.6383723859059105e-05, + "loss": 0.114, + "num_input_tokens_seen": 55950560, + "step": 45975 + }, + { + "epoch": 5.120837509744961, + "grad_norm": 1.2933989763259888, + "learning_rate": 4.638246502686062e-05, + "loss": 0.1775, + "num_input_tokens_seen": 55956416, + "step": 45980 + }, + { + "epoch": 5.1213943646285776, + "grad_norm": 0.12644587457180023, + "learning_rate": 4.638120599268762e-05, + "loss": 0.0436, + "num_input_tokens_seen": 55962912, + "step": 45985 + }, + { + "epoch": 5.121951219512195, + "grad_norm": 1.1381560564041138, + "learning_rate": 4.637994675655199e-05, + "loss": 0.0739, + "num_input_tokens_seen": 55968832, + "step": 45990 + }, + { + "epoch": 5.122508074395812, + "grad_norm": 3.2952849864959717, + "learning_rate": 4.637868731846565e-05, + "loss": 0.1348, + "num_input_tokens_seen": 55975200, + "step": 45995 + }, + { + "epoch": 5.12306492927943, + "grad_norm": 1.2805999517440796, + "learning_rate": 4.637742767844048e-05, + "loss": 0.0894, + "num_input_tokens_seen": 55981184, + "step": 46000 + }, + { + "epoch": 5.123621784163047, + "grad_norm": 0.37314125895500183, + "learning_rate": 4.637616783648837e-05, + "loss": 0.0207, + "num_input_tokens_seen": 55987520, + "step": 46005 + }, + { + "epoch": 5.124178639046664, + "grad_norm": 1.6827423572540283, + "learning_rate": 4.6374907792621226e-05, + "loss": 0.1736, + "num_input_tokens_seen": 55992992, + "step": 46010 + }, + { + "epoch": 5.124735493930282, + "grad_norm": 1.7492783069610596, + "learning_rate": 4.6373647546850964e-05, + "loss": 0.0417, + "num_input_tokens_seen": 55999424, + "step": 46015 + }, + { + "epoch": 5.1252923488138995, + "grad_norm": 0.2662264108657837, + "learning_rate": 4.6372387099189463e-05, + "loss": 0.0988, + "num_input_tokens_seen": 56005152, + "step": 46020 + }, + { + "epoch": 5.125849203697516, + "grad_norm": 0.3371937870979309, + "learning_rate": 4.637112644964865e-05, + "loss": 0.0419, + "num_input_tokens_seen": 56011360, + "step": 46025 + }, + { + "epoch": 5.126406058581134, + "grad_norm": 0.18352623283863068, + "learning_rate": 4.6369865598240427e-05, + "loss": 0.0553, + "num_input_tokens_seen": 56017472, + "step": 46030 + }, + { + "epoch": 5.126962913464751, + "grad_norm": 0.0044134873896837234, + "learning_rate": 4.63686045449767e-05, + "loss": 0.0133, + "num_input_tokens_seen": 56023520, + "step": 46035 + }, + { + "epoch": 5.127519768348368, + "grad_norm": 0.30300697684288025, + "learning_rate": 4.636734328986938e-05, + "loss": 0.0699, + "num_input_tokens_seen": 56029536, + "step": 46040 + }, + { + "epoch": 5.128076623231986, + "grad_norm": 0.02622184343636036, + "learning_rate": 4.636608183293039e-05, + "loss": 0.0178, + "num_input_tokens_seen": 56035840, + "step": 46045 + }, + { + "epoch": 5.128633478115603, + "grad_norm": 0.021465467289090157, + "learning_rate": 4.636482017417163e-05, + "loss": 0.0489, + "num_input_tokens_seen": 56041856, + "step": 46050 + }, + { + "epoch": 5.129190332999221, + "grad_norm": 0.9048935770988464, + "learning_rate": 4.636355831360504e-05, + "loss": 0.1419, + "num_input_tokens_seen": 56047808, + "step": 46055 + }, + { + "epoch": 5.129747187882837, + "grad_norm": 0.5140626430511475, + "learning_rate": 4.636229625124251e-05, + "loss": 0.1049, + "num_input_tokens_seen": 56053856, + "step": 46060 + }, + { + "epoch": 5.130304042766455, + "grad_norm": 0.11502959579229355, + "learning_rate": 4.636103398709599e-05, + "loss": 0.1151, + "num_input_tokens_seen": 56060032, + "step": 46065 + }, + { + "epoch": 5.130860897650073, + "grad_norm": 0.07724277675151825, + "learning_rate": 4.6359771521177384e-05, + "loss": 0.0138, + "num_input_tokens_seen": 56065952, + "step": 46070 + }, + { + "epoch": 5.1314177525336895, + "grad_norm": 0.86745285987854, + "learning_rate": 4.6358508853498616e-05, + "loss": 0.09, + "num_input_tokens_seen": 56072416, + "step": 46075 + }, + { + "epoch": 5.131974607417307, + "grad_norm": 0.2848966419696808, + "learning_rate": 4.635724598407163e-05, + "loss": 0.0158, + "num_input_tokens_seen": 56078560, + "step": 46080 + }, + { + "epoch": 5.132531462300925, + "grad_norm": 1.2955342531204224, + "learning_rate": 4.635598291290834e-05, + "loss": 0.0498, + "num_input_tokens_seen": 56084576, + "step": 46085 + }, + { + "epoch": 5.133088317184542, + "grad_norm": 0.477632611989975, + "learning_rate": 4.635471964002068e-05, + "loss": 0.1056, + "num_input_tokens_seen": 56090752, + "step": 46090 + }, + { + "epoch": 5.133645172068159, + "grad_norm": 0.12012448906898499, + "learning_rate": 4.635345616542059e-05, + "loss": 0.1172, + "num_input_tokens_seen": 56096192, + "step": 46095 + }, + { + "epoch": 5.134202026951776, + "grad_norm": 0.008833053521811962, + "learning_rate": 4.635219248911999e-05, + "loss": 0.1013, + "num_input_tokens_seen": 56102208, + "step": 46100 + }, + { + "epoch": 5.134758881835394, + "grad_norm": 0.022403644397854805, + "learning_rate": 4.635092861113083e-05, + "loss": 0.3154, + "num_input_tokens_seen": 56107776, + "step": 46105 + }, + { + "epoch": 5.135315736719011, + "grad_norm": 0.007686964236199856, + "learning_rate": 4.6349664531465045e-05, + "loss": 0.0654, + "num_input_tokens_seen": 56113824, + "step": 46110 + }, + { + "epoch": 5.135872591602628, + "grad_norm": 0.2866770327091217, + "learning_rate": 4.6348400250134574e-05, + "loss": 0.086, + "num_input_tokens_seen": 56120096, + "step": 46115 + }, + { + "epoch": 5.136429446486246, + "grad_norm": 0.7183984518051147, + "learning_rate": 4.634713576715135e-05, + "loss": 0.0566, + "num_input_tokens_seen": 56126080, + "step": 46120 + }, + { + "epoch": 5.136986301369863, + "grad_norm": 0.005588165018707514, + "learning_rate": 4.634587108252733e-05, + "loss": 0.039, + "num_input_tokens_seen": 56132416, + "step": 46125 + }, + { + "epoch": 5.13754315625348, + "grad_norm": 0.08146700263023376, + "learning_rate": 4.6344606196274454e-05, + "loss": 0.0113, + "num_input_tokens_seen": 56138752, + "step": 46130 + }, + { + "epoch": 5.138100011137098, + "grad_norm": 0.025846006348729134, + "learning_rate": 4.634334110840468e-05, + "loss": 0.0997, + "num_input_tokens_seen": 56144704, + "step": 46135 + }, + { + "epoch": 5.138656866020715, + "grad_norm": 0.24958135187625885, + "learning_rate": 4.634207581892994e-05, + "loss": 0.0456, + "num_input_tokens_seen": 56151040, + "step": 46140 + }, + { + "epoch": 5.1392137209043325, + "grad_norm": 0.3830382823944092, + "learning_rate": 4.63408103278622e-05, + "loss": 0.0436, + "num_input_tokens_seen": 56156672, + "step": 46145 + }, + { + "epoch": 5.139770575787949, + "grad_norm": 0.00170989113394171, + "learning_rate": 4.6339544635213405e-05, + "loss": 0.0319, + "num_input_tokens_seen": 56162496, + "step": 46150 + }, + { + "epoch": 5.140327430671567, + "grad_norm": 0.006827492732554674, + "learning_rate": 4.633827874099551e-05, + "loss": 0.0881, + "num_input_tokens_seen": 56168768, + "step": 46155 + }, + { + "epoch": 5.140884285555185, + "grad_norm": 0.46455830335617065, + "learning_rate": 4.633701264522049e-05, + "loss": 0.0559, + "num_input_tokens_seen": 56174912, + "step": 46160 + }, + { + "epoch": 5.141441140438801, + "grad_norm": 0.7153419852256775, + "learning_rate": 4.633574634790028e-05, + "loss": 0.1281, + "num_input_tokens_seen": 56180864, + "step": 46165 + }, + { + "epoch": 5.141997995322419, + "grad_norm": 0.2397920787334442, + "learning_rate": 4.633447984904685e-05, + "loss": 0.1749, + "num_input_tokens_seen": 56186560, + "step": 46170 + }, + { + "epoch": 5.142554850206036, + "grad_norm": 0.3092866539955139, + "learning_rate": 4.633321314867217e-05, + "loss": 0.087, + "num_input_tokens_seen": 56192544, + "step": 46175 + }, + { + "epoch": 5.1431117050896535, + "grad_norm": 0.5367985963821411, + "learning_rate": 4.63319462467882e-05, + "loss": 0.1198, + "num_input_tokens_seen": 56198752, + "step": 46180 + }, + { + "epoch": 5.143668559973271, + "grad_norm": 0.4258434474468231, + "learning_rate": 4.633067914340691e-05, + "loss": 0.0954, + "num_input_tokens_seen": 56204288, + "step": 46185 + }, + { + "epoch": 5.144225414856888, + "grad_norm": 0.7974323034286499, + "learning_rate": 4.632941183854026e-05, + "loss": 0.0925, + "num_input_tokens_seen": 56210496, + "step": 46190 + }, + { + "epoch": 5.144782269740506, + "grad_norm": 0.10869535058736801, + "learning_rate": 4.6328144332200225e-05, + "loss": 0.0938, + "num_input_tokens_seen": 56216992, + "step": 46195 + }, + { + "epoch": 5.145339124624123, + "grad_norm": 1.0274136066436768, + "learning_rate": 4.632687662439879e-05, + "loss": 0.0901, + "num_input_tokens_seen": 56223328, + "step": 46200 + }, + { + "epoch": 5.14589597950774, + "grad_norm": 0.028751160949468613, + "learning_rate": 4.63256087151479e-05, + "loss": 0.0946, + "num_input_tokens_seen": 56229440, + "step": 46205 + }, + { + "epoch": 5.146452834391358, + "grad_norm": 0.7421094179153442, + "learning_rate": 4.632434060445956e-05, + "loss": 0.1159, + "num_input_tokens_seen": 56235584, + "step": 46210 + }, + { + "epoch": 5.147009689274975, + "grad_norm": 1.90352463722229, + "learning_rate": 4.6323072292345745e-05, + "loss": 0.0766, + "num_input_tokens_seen": 56241792, + "step": 46215 + }, + { + "epoch": 5.147566544158592, + "grad_norm": 1.9102228879928589, + "learning_rate": 4.632180377881842e-05, + "loss": 0.1581, + "num_input_tokens_seen": 56247936, + "step": 46220 + }, + { + "epoch": 5.14812339904221, + "grad_norm": 0.26899319887161255, + "learning_rate": 4.6320535063889575e-05, + "loss": 0.0236, + "num_input_tokens_seen": 56254336, + "step": 46225 + }, + { + "epoch": 5.148680253925827, + "grad_norm": 0.3849297761917114, + "learning_rate": 4.6319266147571194e-05, + "loss": 0.0492, + "num_input_tokens_seen": 56259936, + "step": 46230 + }, + { + "epoch": 5.149237108809444, + "grad_norm": 0.5391473174095154, + "learning_rate": 4.631799702987527e-05, + "loss": 0.0412, + "num_input_tokens_seen": 56266016, + "step": 46235 + }, + { + "epoch": 5.149793963693061, + "grad_norm": 0.15511316061019897, + "learning_rate": 4.6316727710813777e-05, + "loss": 0.0215, + "num_input_tokens_seen": 56272192, + "step": 46240 + }, + { + "epoch": 5.150350818576679, + "grad_norm": 1.5454916954040527, + "learning_rate": 4.631545819039872e-05, + "loss": 0.0663, + "num_input_tokens_seen": 56278368, + "step": 46245 + }, + { + "epoch": 5.1509076734602965, + "grad_norm": 0.6233232021331787, + "learning_rate": 4.631418846864208e-05, + "loss": 0.1288, + "num_input_tokens_seen": 56284352, + "step": 46250 + }, + { + "epoch": 5.151464528343913, + "grad_norm": 0.34347692131996155, + "learning_rate": 4.631291854555585e-05, + "loss": 0.0336, + "num_input_tokens_seen": 56290656, + "step": 46255 + }, + { + "epoch": 5.152021383227531, + "grad_norm": 0.3753831684589386, + "learning_rate": 4.631164842115203e-05, + "loss": 0.0189, + "num_input_tokens_seen": 56296992, + "step": 46260 + }, + { + "epoch": 5.152578238111149, + "grad_norm": 0.6730281710624695, + "learning_rate": 4.631037809544262e-05, + "loss": 0.1171, + "num_input_tokens_seen": 56303232, + "step": 46265 + }, + { + "epoch": 5.1531350929947655, + "grad_norm": 0.04519296810030937, + "learning_rate": 4.6309107568439616e-05, + "loss": 0.0109, + "num_input_tokens_seen": 56309312, + "step": 46270 + }, + { + "epoch": 5.153691947878383, + "grad_norm": 0.024557041004300117, + "learning_rate": 4.630783684015501e-05, + "loss": 0.088, + "num_input_tokens_seen": 56315232, + "step": 46275 + }, + { + "epoch": 5.154248802762, + "grad_norm": 0.015812357887625694, + "learning_rate": 4.630656591060082e-05, + "loss": 0.0847, + "num_input_tokens_seen": 56321728, + "step": 46280 + }, + { + "epoch": 5.154805657645618, + "grad_norm": 1.9435629844665527, + "learning_rate": 4.6305294779789043e-05, + "loss": 0.1129, + "num_input_tokens_seen": 56327872, + "step": 46285 + }, + { + "epoch": 5.155362512529235, + "grad_norm": 0.24331450462341309, + "learning_rate": 4.630402344773168e-05, + "loss": 0.1116, + "num_input_tokens_seen": 56334144, + "step": 46290 + }, + { + "epoch": 5.155919367412852, + "grad_norm": 0.7094546556472778, + "learning_rate": 4.630275191444076e-05, + "loss": 0.0796, + "num_input_tokens_seen": 56340576, + "step": 46295 + }, + { + "epoch": 5.15647622229647, + "grad_norm": 0.26020562648773193, + "learning_rate": 4.630148017992827e-05, + "loss": 0.1324, + "num_input_tokens_seen": 56346592, + "step": 46300 + }, + { + "epoch": 5.1570330771800865, + "grad_norm": 0.027520261704921722, + "learning_rate": 4.630020824420624e-05, + "loss": 0.1439, + "num_input_tokens_seen": 56352544, + "step": 46305 + }, + { + "epoch": 5.157589932063704, + "grad_norm": 0.1083223894238472, + "learning_rate": 4.6298936107286674e-05, + "loss": 0.0377, + "num_input_tokens_seen": 56358528, + "step": 46310 + }, + { + "epoch": 5.158146786947322, + "grad_norm": 0.28462475538253784, + "learning_rate": 4.6297663769181594e-05, + "loss": 0.0629, + "num_input_tokens_seen": 56364928, + "step": 46315 + }, + { + "epoch": 5.158703641830939, + "grad_norm": 0.3788324296474457, + "learning_rate": 4.629639122990301e-05, + "loss": 0.0318, + "num_input_tokens_seen": 56370848, + "step": 46320 + }, + { + "epoch": 5.159260496714556, + "grad_norm": 1.2027555704116821, + "learning_rate": 4.629511848946296e-05, + "loss": 0.0355, + "num_input_tokens_seen": 56376992, + "step": 46325 + }, + { + "epoch": 5.159817351598173, + "grad_norm": 1.4515769481658936, + "learning_rate": 4.629384554787345e-05, + "loss": 0.1891, + "num_input_tokens_seen": 56382336, + "step": 46330 + }, + { + "epoch": 5.160374206481791, + "grad_norm": 0.006881551817059517, + "learning_rate": 4.6292572405146506e-05, + "loss": 0.0722, + "num_input_tokens_seen": 56388224, + "step": 46335 + }, + { + "epoch": 5.1609310613654085, + "grad_norm": 0.33109670877456665, + "learning_rate": 4.6291299061294156e-05, + "loss": 0.1278, + "num_input_tokens_seen": 56394336, + "step": 46340 + }, + { + "epoch": 5.161487916249025, + "grad_norm": 0.047860946506261826, + "learning_rate": 4.629002551632843e-05, + "loss": 0.0397, + "num_input_tokens_seen": 56400480, + "step": 46345 + }, + { + "epoch": 5.162044771132643, + "grad_norm": 0.889843225479126, + "learning_rate": 4.628875177026136e-05, + "loss": 0.0804, + "num_input_tokens_seen": 56406400, + "step": 46350 + }, + { + "epoch": 5.16260162601626, + "grad_norm": 1.1904218196868896, + "learning_rate": 4.628747782310496e-05, + "loss": 0.2077, + "num_input_tokens_seen": 56411936, + "step": 46355 + }, + { + "epoch": 5.163158480899877, + "grad_norm": 0.4244602918624878, + "learning_rate": 4.6286203674871284e-05, + "loss": 0.1569, + "num_input_tokens_seen": 56417664, + "step": 46360 + }, + { + "epoch": 5.163715335783495, + "grad_norm": 0.11056454479694366, + "learning_rate": 4.628492932557237e-05, + "loss": 0.0843, + "num_input_tokens_seen": 56424000, + "step": 46365 + }, + { + "epoch": 5.164272190667112, + "grad_norm": 0.06354351341724396, + "learning_rate": 4.628365477522023e-05, + "loss": 0.1266, + "num_input_tokens_seen": 56429888, + "step": 46370 + }, + { + "epoch": 5.1648290455507295, + "grad_norm": 0.012538176029920578, + "learning_rate": 4.628238002382693e-05, + "loss": 0.1358, + "num_input_tokens_seen": 56435872, + "step": 46375 + }, + { + "epoch": 5.165385900434347, + "grad_norm": 0.25082048773765564, + "learning_rate": 4.62811050714045e-05, + "loss": 0.0345, + "num_input_tokens_seen": 56441952, + "step": 46380 + }, + { + "epoch": 5.165942755317964, + "grad_norm": 0.2425793558359146, + "learning_rate": 4.6279829917964966e-05, + "loss": 0.0581, + "num_input_tokens_seen": 56448160, + "step": 46385 + }, + { + "epoch": 5.166499610201582, + "grad_norm": 0.006824453826993704, + "learning_rate": 4.62785545635204e-05, + "loss": 0.0236, + "num_input_tokens_seen": 56454208, + "step": 46390 + }, + { + "epoch": 5.1670564650851984, + "grad_norm": 0.11339107155799866, + "learning_rate": 4.627727900808284e-05, + "loss": 0.0639, + "num_input_tokens_seen": 56460256, + "step": 46395 + }, + { + "epoch": 5.167613319968816, + "grad_norm": 0.024337315931916237, + "learning_rate": 4.6276003251664334e-05, + "loss": 0.0666, + "num_input_tokens_seen": 56466336, + "step": 46400 + }, + { + "epoch": 5.168170174852434, + "grad_norm": 0.5998706817626953, + "learning_rate": 4.627472729427693e-05, + "loss": 0.0461, + "num_input_tokens_seen": 56472704, + "step": 46405 + }, + { + "epoch": 5.168727029736051, + "grad_norm": 0.15184183418750763, + "learning_rate": 4.627345113593268e-05, + "loss": 0.0404, + "num_input_tokens_seen": 56478592, + "step": 46410 + }, + { + "epoch": 5.169283884619668, + "grad_norm": 0.0053514884784817696, + "learning_rate": 4.627217477664364e-05, + "loss": 0.0453, + "num_input_tokens_seen": 56484768, + "step": 46415 + }, + { + "epoch": 5.169840739503285, + "grad_norm": 1.9530041217803955, + "learning_rate": 4.6270898216421864e-05, + "loss": 0.0707, + "num_input_tokens_seen": 56491008, + "step": 46420 + }, + { + "epoch": 5.170397594386903, + "grad_norm": 0.5299863815307617, + "learning_rate": 4.6269621455279415e-05, + "loss": 0.0924, + "num_input_tokens_seen": 56497152, + "step": 46425 + }, + { + "epoch": 5.17095444927052, + "grad_norm": 0.039507701992988586, + "learning_rate": 4.626834449322835e-05, + "loss": 0.0673, + "num_input_tokens_seen": 56503264, + "step": 46430 + }, + { + "epoch": 5.171511304154137, + "grad_norm": 0.0027445585001260042, + "learning_rate": 4.626706733028073e-05, + "loss": 0.0323, + "num_input_tokens_seen": 56509088, + "step": 46435 + }, + { + "epoch": 5.172068159037755, + "grad_norm": 0.09421062469482422, + "learning_rate": 4.6265789966448625e-05, + "loss": 0.0664, + "num_input_tokens_seen": 56515584, + "step": 46440 + }, + { + "epoch": 5.1726250139213725, + "grad_norm": 0.07808489352464676, + "learning_rate": 4.6264512401744085e-05, + "loss": 0.0822, + "num_input_tokens_seen": 56521920, + "step": 46445 + }, + { + "epoch": 5.173181868804989, + "grad_norm": 1.679005742073059, + "learning_rate": 4.62632346361792e-05, + "loss": 0.0692, + "num_input_tokens_seen": 56528384, + "step": 46450 + }, + { + "epoch": 5.173738723688607, + "grad_norm": 0.9994046688079834, + "learning_rate": 4.6261956669766026e-05, + "loss": 0.1694, + "num_input_tokens_seen": 56534048, + "step": 46455 + }, + { + "epoch": 5.174295578572224, + "grad_norm": 1.0132023096084595, + "learning_rate": 4.626067850251664e-05, + "loss": 0.0809, + "num_input_tokens_seen": 56540576, + "step": 46460 + }, + { + "epoch": 5.1748524334558414, + "grad_norm": 0.046097107231616974, + "learning_rate": 4.62594001344431e-05, + "loss": 0.1578, + "num_input_tokens_seen": 56546784, + "step": 46465 + }, + { + "epoch": 5.175409288339459, + "grad_norm": 0.0712299719452858, + "learning_rate": 4.6258121565557496e-05, + "loss": 0.0282, + "num_input_tokens_seen": 56552160, + "step": 46470 + }, + { + "epoch": 5.175966143223076, + "grad_norm": 0.000852039607707411, + "learning_rate": 4.6256842795871916e-05, + "loss": 0.0401, + "num_input_tokens_seen": 56558656, + "step": 46475 + }, + { + "epoch": 5.176522998106694, + "grad_norm": 0.10616319626569748, + "learning_rate": 4.625556382539841e-05, + "loss": 0.0269, + "num_input_tokens_seen": 56565024, + "step": 46480 + }, + { + "epoch": 5.17707985299031, + "grad_norm": 1.0696762800216675, + "learning_rate": 4.6254284654149076e-05, + "loss": 0.1107, + "num_input_tokens_seen": 56571456, + "step": 46485 + }, + { + "epoch": 5.177636707873928, + "grad_norm": 0.29974356293678284, + "learning_rate": 4.6253005282135995e-05, + "loss": 0.0271, + "num_input_tokens_seen": 56577344, + "step": 46490 + }, + { + "epoch": 5.178193562757546, + "grad_norm": 0.0689338892698288, + "learning_rate": 4.625172570937126e-05, + "loss": 0.099, + "num_input_tokens_seen": 56583360, + "step": 46495 + }, + { + "epoch": 5.1787504176411625, + "grad_norm": 0.17910505831241608, + "learning_rate": 4.625044593586694e-05, + "loss": 0.0561, + "num_input_tokens_seen": 56589280, + "step": 46500 + }, + { + "epoch": 5.17930727252478, + "grad_norm": 0.691062867641449, + "learning_rate": 4.624916596163513e-05, + "loss": 0.076, + "num_input_tokens_seen": 56595264, + "step": 46505 + }, + { + "epoch": 5.179864127408397, + "grad_norm": 0.009515407495200634, + "learning_rate": 4.6247885786687935e-05, + "loss": 0.1411, + "num_input_tokens_seen": 56601344, + "step": 46510 + }, + { + "epoch": 5.180420982292015, + "grad_norm": 0.6253634095191956, + "learning_rate": 4.624660541103743e-05, + "loss": 0.1274, + "num_input_tokens_seen": 56607488, + "step": 46515 + }, + { + "epoch": 5.180977837175632, + "grad_norm": 0.8437857627868652, + "learning_rate": 4.624532483469571e-05, + "loss": 0.0986, + "num_input_tokens_seen": 56613984, + "step": 46520 + }, + { + "epoch": 5.181534692059249, + "grad_norm": 0.3126598298549652, + "learning_rate": 4.624404405767488e-05, + "loss": 0.0766, + "num_input_tokens_seen": 56620416, + "step": 46525 + }, + { + "epoch": 5.182091546942867, + "grad_norm": 0.04956481233239174, + "learning_rate": 4.624276307998703e-05, + "loss": 0.115, + "num_input_tokens_seen": 56626752, + "step": 46530 + }, + { + "epoch": 5.182648401826484, + "grad_norm": 0.005896370392292738, + "learning_rate": 4.624148190164427e-05, + "loss": 0.0418, + "num_input_tokens_seen": 56632896, + "step": 46535 + }, + { + "epoch": 5.183205256710101, + "grad_norm": 0.19991618394851685, + "learning_rate": 4.624020052265868e-05, + "loss": 0.0364, + "num_input_tokens_seen": 56638368, + "step": 46540 + }, + { + "epoch": 5.183762111593719, + "grad_norm": 0.3097826838493347, + "learning_rate": 4.6238918943042395e-05, + "loss": 0.0384, + "num_input_tokens_seen": 56644480, + "step": 46545 + }, + { + "epoch": 5.184318966477336, + "grad_norm": 0.41755786538124084, + "learning_rate": 4.62376371628075e-05, + "loss": 0.0294, + "num_input_tokens_seen": 56649952, + "step": 46550 + }, + { + "epoch": 5.184875821360953, + "grad_norm": 2.0556516647338867, + "learning_rate": 4.623635518196611e-05, + "loss": 0.1589, + "num_input_tokens_seen": 56655904, + "step": 46555 + }, + { + "epoch": 5.185432676244571, + "grad_norm": 0.8561772108078003, + "learning_rate": 4.623507300053032e-05, + "loss": 0.1451, + "num_input_tokens_seen": 56661888, + "step": 46560 + }, + { + "epoch": 5.185989531128188, + "grad_norm": 0.11833490431308746, + "learning_rate": 4.623379061851226e-05, + "loss": 0.1101, + "num_input_tokens_seen": 56667968, + "step": 46565 + }, + { + "epoch": 5.1865463860118055, + "grad_norm": 0.09999540448188782, + "learning_rate": 4.6232508035924026e-05, + "loss": 0.1415, + "num_input_tokens_seen": 56674144, + "step": 46570 + }, + { + "epoch": 5.187103240895422, + "grad_norm": 0.4562191665172577, + "learning_rate": 4.623122525277775e-05, + "loss": 0.0812, + "num_input_tokens_seen": 56679680, + "step": 46575 + }, + { + "epoch": 5.18766009577904, + "grad_norm": 1.1349622011184692, + "learning_rate": 4.6229942269085546e-05, + "loss": 0.0231, + "num_input_tokens_seen": 56685856, + "step": 46580 + }, + { + "epoch": 5.188216950662658, + "grad_norm": 0.2821885049343109, + "learning_rate": 4.622865908485952e-05, + "loss": 0.0557, + "num_input_tokens_seen": 56691808, + "step": 46585 + }, + { + "epoch": 5.188773805546274, + "grad_norm": 1.3763176202774048, + "learning_rate": 4.6227375700111805e-05, + "loss": 0.2231, + "num_input_tokens_seen": 56697728, + "step": 46590 + }, + { + "epoch": 5.189330660429892, + "grad_norm": 0.9086796045303345, + "learning_rate": 4.622609211485452e-05, + "loss": 0.1399, + "num_input_tokens_seen": 56703488, + "step": 46595 + }, + { + "epoch": 5.189887515313509, + "grad_norm": 1.0341476202011108, + "learning_rate": 4.622480832909979e-05, + "loss": 0.0305, + "num_input_tokens_seen": 56709664, + "step": 46600 + }, + { + "epoch": 5.190444370197127, + "grad_norm": 0.04236166179180145, + "learning_rate": 4.6223524342859734e-05, + "loss": 0.028, + "num_input_tokens_seen": 56715776, + "step": 46605 + }, + { + "epoch": 5.191001225080744, + "grad_norm": 0.36294710636138916, + "learning_rate": 4.622224015614649e-05, + "loss": 0.0356, + "num_input_tokens_seen": 56721408, + "step": 46610 + }, + { + "epoch": 5.191558079964361, + "grad_norm": 0.3857528567314148, + "learning_rate": 4.622095576897219e-05, + "loss": 0.0795, + "num_input_tokens_seen": 56727104, + "step": 46615 + }, + { + "epoch": 5.192114934847979, + "grad_norm": 0.019663292914628983, + "learning_rate": 4.6219671181348956e-05, + "loss": 0.1117, + "num_input_tokens_seen": 56732832, + "step": 46620 + }, + { + "epoch": 5.192671789731596, + "grad_norm": 0.002814990933984518, + "learning_rate": 4.621838639328892e-05, + "loss": 0.0813, + "num_input_tokens_seen": 56738976, + "step": 46625 + }, + { + "epoch": 5.193228644615213, + "grad_norm": 0.05213063210248947, + "learning_rate": 4.621710140480423e-05, + "loss": 0.0412, + "num_input_tokens_seen": 56745312, + "step": 46630 + }, + { + "epoch": 5.193785499498831, + "grad_norm": 0.07323925942182541, + "learning_rate": 4.621581621590703e-05, + "loss": 0.0319, + "num_input_tokens_seen": 56751456, + "step": 46635 + }, + { + "epoch": 5.194342354382448, + "grad_norm": 0.04431445524096489, + "learning_rate": 4.621453082660943e-05, + "loss": 0.0055, + "num_input_tokens_seen": 56757888, + "step": 46640 + }, + { + "epoch": 5.194899209266065, + "grad_norm": 0.19105389714241028, + "learning_rate": 4.6213245236923596e-05, + "loss": 0.1425, + "num_input_tokens_seen": 56763872, + "step": 46645 + }, + { + "epoch": 5.195456064149683, + "grad_norm": 0.07947663962841034, + "learning_rate": 4.621195944686167e-05, + "loss": 0.0964, + "num_input_tokens_seen": 56770240, + "step": 46650 + }, + { + "epoch": 5.1960129190333, + "grad_norm": 0.11026839911937714, + "learning_rate": 4.6210673456435786e-05, + "loss": 0.1413, + "num_input_tokens_seen": 56776448, + "step": 46655 + }, + { + "epoch": 5.196569773916917, + "grad_norm": 0.25551819801330566, + "learning_rate": 4.62093872656581e-05, + "loss": 0.0833, + "num_input_tokens_seen": 56782368, + "step": 46660 + }, + { + "epoch": 5.197126628800534, + "grad_norm": 0.33983469009399414, + "learning_rate": 4.620810087454076e-05, + "loss": 0.0274, + "num_input_tokens_seen": 56788576, + "step": 46665 + }, + { + "epoch": 5.197683483684152, + "grad_norm": 0.0004490635183174163, + "learning_rate": 4.620681428309591e-05, + "loss": 0.1107, + "num_input_tokens_seen": 56795040, + "step": 46670 + }, + { + "epoch": 5.19824033856777, + "grad_norm": 0.24485857784748077, + "learning_rate": 4.620552749133572e-05, + "loss": 0.0331, + "num_input_tokens_seen": 56801024, + "step": 46675 + }, + { + "epoch": 5.198797193451386, + "grad_norm": 1.6494145393371582, + "learning_rate": 4.620424049927232e-05, + "loss": 0.11, + "num_input_tokens_seen": 56807264, + "step": 46680 + }, + { + "epoch": 5.199354048335004, + "grad_norm": 0.007603805046528578, + "learning_rate": 4.620295330691789e-05, + "loss": 0.0408, + "num_input_tokens_seen": 56813472, + "step": 46685 + }, + { + "epoch": 5.199910903218621, + "grad_norm": 0.5560573935508728, + "learning_rate": 4.620166591428458e-05, + "loss": 0.1119, + "num_input_tokens_seen": 56819936, + "step": 46690 + }, + { + "epoch": 5.2004677581022385, + "grad_norm": 0.6201584339141846, + "learning_rate": 4.620037832138454e-05, + "loss": 0.0462, + "num_input_tokens_seen": 56826016, + "step": 46695 + }, + { + "epoch": 5.201024612985856, + "grad_norm": 2.071042060852051, + "learning_rate": 4.6199090528229935e-05, + "loss": 0.126, + "num_input_tokens_seen": 56832224, + "step": 46700 + }, + { + "epoch": 5.201581467869473, + "grad_norm": 0.16177554428577423, + "learning_rate": 4.619780253483295e-05, + "loss": 0.0466, + "num_input_tokens_seen": 56838336, + "step": 46705 + }, + { + "epoch": 5.202138322753091, + "grad_norm": 0.06572239100933075, + "learning_rate": 4.619651434120573e-05, + "loss": 0.0211, + "num_input_tokens_seen": 56844320, + "step": 46710 + }, + { + "epoch": 5.202695177636708, + "grad_norm": 0.6991752982139587, + "learning_rate": 4.619522594736045e-05, + "loss": 0.0312, + "num_input_tokens_seen": 56850336, + "step": 46715 + }, + { + "epoch": 5.203252032520325, + "grad_norm": 0.10964130610227585, + "learning_rate": 4.619393735330929e-05, + "loss": 0.0218, + "num_input_tokens_seen": 56856512, + "step": 46720 + }, + { + "epoch": 5.203808887403943, + "grad_norm": 0.17490410804748535, + "learning_rate": 4.61926485590644e-05, + "loss": 0.0426, + "num_input_tokens_seen": 56862496, + "step": 46725 + }, + { + "epoch": 5.2043657422875595, + "grad_norm": 0.5086140632629395, + "learning_rate": 4.6191359564637964e-05, + "loss": 0.0565, + "num_input_tokens_seen": 56868416, + "step": 46730 + }, + { + "epoch": 5.204922597171177, + "grad_norm": 0.5437474250793457, + "learning_rate": 4.619007037004217e-05, + "loss": 0.099, + "num_input_tokens_seen": 56874592, + "step": 46735 + }, + { + "epoch": 5.205479452054795, + "grad_norm": 0.0006290414021350443, + "learning_rate": 4.618878097528917e-05, + "loss": 0.0838, + "num_input_tokens_seen": 56880864, + "step": 46740 + }, + { + "epoch": 5.206036306938412, + "grad_norm": 0.04034318029880524, + "learning_rate": 4.6187491380391167e-05, + "loss": 0.0376, + "num_input_tokens_seen": 56887104, + "step": 46745 + }, + { + "epoch": 5.206593161822029, + "grad_norm": 0.7665567398071289, + "learning_rate": 4.618620158536033e-05, + "loss": 0.0565, + "num_input_tokens_seen": 56893312, + "step": 46750 + }, + { + "epoch": 5.207150016705646, + "grad_norm": 1.730953335762024, + "learning_rate": 4.618491159020884e-05, + "loss": 0.0369, + "num_input_tokens_seen": 56899488, + "step": 46755 + }, + { + "epoch": 5.207706871589264, + "grad_norm": 2.34806489944458, + "learning_rate": 4.618362139494889e-05, + "loss": 0.1514, + "num_input_tokens_seen": 56905632, + "step": 46760 + }, + { + "epoch": 5.2082637264728815, + "grad_norm": 0.6355534195899963, + "learning_rate": 4.618233099959267e-05, + "loss": 0.0792, + "num_input_tokens_seen": 56911840, + "step": 46765 + }, + { + "epoch": 5.208820581356498, + "grad_norm": 0.007494363002479076, + "learning_rate": 4.618104040415235e-05, + "loss": 0.018, + "num_input_tokens_seen": 56918208, + "step": 46770 + }, + { + "epoch": 5.209377436240116, + "grad_norm": 0.20385341346263885, + "learning_rate": 4.617974960864015e-05, + "loss": 0.0852, + "num_input_tokens_seen": 56924448, + "step": 46775 + }, + { + "epoch": 5.209934291123733, + "grad_norm": 0.07048673182725906, + "learning_rate": 4.6178458613068234e-05, + "loss": 0.0733, + "num_input_tokens_seen": 56930784, + "step": 46780 + }, + { + "epoch": 5.21049114600735, + "grad_norm": 0.7157546281814575, + "learning_rate": 4.6177167417448816e-05, + "loss": 0.0446, + "num_input_tokens_seen": 56937120, + "step": 46785 + }, + { + "epoch": 5.211048000890968, + "grad_norm": 0.02630004659295082, + "learning_rate": 4.617587602179408e-05, + "loss": 0.0434, + "num_input_tokens_seen": 56943360, + "step": 46790 + }, + { + "epoch": 5.211604855774585, + "grad_norm": 0.002758665243163705, + "learning_rate": 4.617458442611623e-05, + "loss": 0.1706, + "num_input_tokens_seen": 56949792, + "step": 46795 + }, + { + "epoch": 5.2121617106582026, + "grad_norm": 0.2567793130874634, + "learning_rate": 4.6173292630427465e-05, + "loss": 0.0557, + "num_input_tokens_seen": 56956032, + "step": 46800 + }, + { + "epoch": 5.21271856554182, + "grad_norm": 0.15083946287631989, + "learning_rate": 4.6172000634739986e-05, + "loss": 0.152, + "num_input_tokens_seen": 56962240, + "step": 46805 + }, + { + "epoch": 5.213275420425437, + "grad_norm": 0.012567129917442799, + "learning_rate": 4.6170708439066e-05, + "loss": 0.0528, + "num_input_tokens_seen": 56968416, + "step": 46810 + }, + { + "epoch": 5.213832275309055, + "grad_norm": 1.3655287027359009, + "learning_rate": 4.616941604341771e-05, + "loss": 0.156, + "num_input_tokens_seen": 56974752, + "step": 46815 + }, + { + "epoch": 5.2143891301926715, + "grad_norm": 1.1893678903579712, + "learning_rate": 4.616812344780733e-05, + "loss": 0.153, + "num_input_tokens_seen": 56980608, + "step": 46820 + }, + { + "epoch": 5.214945985076289, + "grad_norm": 0.5085873603820801, + "learning_rate": 4.6166830652247064e-05, + "loss": 0.0767, + "num_input_tokens_seen": 56986304, + "step": 46825 + }, + { + "epoch": 5.215502839959907, + "grad_norm": 0.757777750492096, + "learning_rate": 4.6165537656749115e-05, + "loss": 0.0488, + "num_input_tokens_seen": 56992544, + "step": 46830 + }, + { + "epoch": 5.216059694843524, + "grad_norm": 0.6468660235404968, + "learning_rate": 4.6164244461325715e-05, + "loss": 0.1129, + "num_input_tokens_seen": 56997760, + "step": 46835 + }, + { + "epoch": 5.216616549727141, + "grad_norm": 0.7754359245300293, + "learning_rate": 4.616295106598906e-05, + "loss": 0.0242, + "num_input_tokens_seen": 57003840, + "step": 46840 + }, + { + "epoch": 5.217173404610758, + "grad_norm": 0.5698720216751099, + "learning_rate": 4.6161657470751386e-05, + "loss": 0.0752, + "num_input_tokens_seen": 57009920, + "step": 46845 + }, + { + "epoch": 5.217730259494376, + "grad_norm": 0.030572861433029175, + "learning_rate": 4.61603636756249e-05, + "loss": 0.0453, + "num_input_tokens_seen": 57015936, + "step": 46850 + }, + { + "epoch": 5.218287114377993, + "grad_norm": 0.00390816293656826, + "learning_rate": 4.615906968062182e-05, + "loss": 0.097, + "num_input_tokens_seen": 57021792, + "step": 46855 + }, + { + "epoch": 5.21884396926161, + "grad_norm": 0.0028538512997329235, + "learning_rate": 4.615777548575438e-05, + "loss": 0.1162, + "num_input_tokens_seen": 57027552, + "step": 46860 + }, + { + "epoch": 5.219400824145228, + "grad_norm": 0.035670679062604904, + "learning_rate": 4.61564810910348e-05, + "loss": 0.0201, + "num_input_tokens_seen": 57033888, + "step": 46865 + }, + { + "epoch": 5.219957679028845, + "grad_norm": 0.058239247649908066, + "learning_rate": 4.61551864964753e-05, + "loss": 0.0627, + "num_input_tokens_seen": 57040256, + "step": 46870 + }, + { + "epoch": 5.220514533912462, + "grad_norm": 0.6441155672073364, + "learning_rate": 4.615389170208812e-05, + "loss": 0.2154, + "num_input_tokens_seen": 57045568, + "step": 46875 + }, + { + "epoch": 5.22107138879608, + "grad_norm": 2.1432673931121826, + "learning_rate": 4.615259670788548e-05, + "loss": 0.0617, + "num_input_tokens_seen": 57051744, + "step": 46880 + }, + { + "epoch": 5.221628243679697, + "grad_norm": 0.03254375979304314, + "learning_rate": 4.615130151387962e-05, + "loss": 0.0655, + "num_input_tokens_seen": 57057312, + "step": 46885 + }, + { + "epoch": 5.2221850985633145, + "grad_norm": 0.25361740589141846, + "learning_rate": 4.615000612008277e-05, + "loss": 0.021, + "num_input_tokens_seen": 57063200, + "step": 46890 + }, + { + "epoch": 5.222741953446932, + "grad_norm": 0.6321223974227905, + "learning_rate": 4.614871052650717e-05, + "loss": 0.1111, + "num_input_tokens_seen": 57069280, + "step": 46895 + }, + { + "epoch": 5.223298808330549, + "grad_norm": 0.46575817465782166, + "learning_rate": 4.614741473316505e-05, + "loss": 0.0649, + "num_input_tokens_seen": 57075232, + "step": 46900 + }, + { + "epoch": 5.223855663214167, + "grad_norm": 0.015813177451491356, + "learning_rate": 4.614611874006866e-05, + "loss": 0.056, + "num_input_tokens_seen": 57081344, + "step": 46905 + }, + { + "epoch": 5.224412518097783, + "grad_norm": 0.21175634860992432, + "learning_rate": 4.6144822547230236e-05, + "loss": 0.0049, + "num_input_tokens_seen": 57087584, + "step": 46910 + }, + { + "epoch": 5.224969372981401, + "grad_norm": 0.2016505002975464, + "learning_rate": 4.6143526154662023e-05, + "loss": 0.1305, + "num_input_tokens_seen": 57093888, + "step": 46915 + }, + { + "epoch": 5.225526227865019, + "grad_norm": 0.5531964898109436, + "learning_rate": 4.614222956237626e-05, + "loss": 0.1209, + "num_input_tokens_seen": 57099904, + "step": 46920 + }, + { + "epoch": 5.2260830827486355, + "grad_norm": 0.14643685519695282, + "learning_rate": 4.6140932770385205e-05, + "loss": 0.0266, + "num_input_tokens_seen": 57106432, + "step": 46925 + }, + { + "epoch": 5.226639937632253, + "grad_norm": 0.6287384033203125, + "learning_rate": 4.6139635778701095e-05, + "loss": 0.0775, + "num_input_tokens_seen": 57112064, + "step": 46930 + }, + { + "epoch": 5.22719679251587, + "grad_norm": 0.10593700408935547, + "learning_rate": 4.613833858733619e-05, + "loss": 0.0468, + "num_input_tokens_seen": 57118304, + "step": 46935 + }, + { + "epoch": 5.227753647399488, + "grad_norm": 0.11009447276592255, + "learning_rate": 4.6137041196302746e-05, + "loss": 0.0083, + "num_input_tokens_seen": 57124288, + "step": 46940 + }, + { + "epoch": 5.228310502283105, + "grad_norm": 2.0367043018341064, + "learning_rate": 4.6135743605613016e-05, + "loss": 0.1046, + "num_input_tokens_seen": 57130144, + "step": 46945 + }, + { + "epoch": 5.228867357166722, + "grad_norm": 0.040158625692129135, + "learning_rate": 4.613444581527925e-05, + "loss": 0.0601, + "num_input_tokens_seen": 57136544, + "step": 46950 + }, + { + "epoch": 5.22942421205034, + "grad_norm": 0.22561649978160858, + "learning_rate": 4.6133147825313704e-05, + "loss": 0.1011, + "num_input_tokens_seen": 57142688, + "step": 46955 + }, + { + "epoch": 5.229981066933957, + "grad_norm": 0.09068717062473297, + "learning_rate": 4.613184963572866e-05, + "loss": 0.1084, + "num_input_tokens_seen": 57148864, + "step": 46960 + }, + { + "epoch": 5.230537921817574, + "grad_norm": 0.0376877635717392, + "learning_rate": 4.613055124653636e-05, + "loss": 0.0466, + "num_input_tokens_seen": 57154720, + "step": 46965 + }, + { + "epoch": 5.231094776701192, + "grad_norm": 0.3675321340560913, + "learning_rate": 4.6129252657749064e-05, + "loss": 0.0518, + "num_input_tokens_seen": 57161376, + "step": 46970 + }, + { + "epoch": 5.231651631584809, + "grad_norm": 0.08772512525320053, + "learning_rate": 4.612795386937905e-05, + "loss": 0.019, + "num_input_tokens_seen": 57167648, + "step": 46975 + }, + { + "epoch": 5.232208486468426, + "grad_norm": 0.568461000919342, + "learning_rate": 4.612665488143859e-05, + "loss": 0.1051, + "num_input_tokens_seen": 57173664, + "step": 46980 + }, + { + "epoch": 5.232765341352044, + "grad_norm": 0.04456103965640068, + "learning_rate": 4.6125355693939956e-05, + "loss": 0.1422, + "num_input_tokens_seen": 57179872, + "step": 46985 + }, + { + "epoch": 5.233322196235661, + "grad_norm": 0.42897066473960876, + "learning_rate": 4.61240563068954e-05, + "loss": 0.0767, + "num_input_tokens_seen": 57185984, + "step": 46990 + }, + { + "epoch": 5.2338790511192785, + "grad_norm": 0.0005376067128963768, + "learning_rate": 4.612275672031721e-05, + "loss": 0.07, + "num_input_tokens_seen": 57191872, + "step": 46995 + }, + { + "epoch": 5.234435906002895, + "grad_norm": 1.3410521745681763, + "learning_rate": 4.6121456934217664e-05, + "loss": 0.1168, + "num_input_tokens_seen": 57198208, + "step": 47000 + }, + { + "epoch": 5.234992760886513, + "grad_norm": 0.0019540388602763414, + "learning_rate": 4.612015694860903e-05, + "loss": 0.0591, + "num_input_tokens_seen": 57204608, + "step": 47005 + }, + { + "epoch": 5.235549615770131, + "grad_norm": 0.42200544476509094, + "learning_rate": 4.6118856763503596e-05, + "loss": 0.034, + "num_input_tokens_seen": 57210976, + "step": 47010 + }, + { + "epoch": 5.2361064706537475, + "grad_norm": 0.0030389633029699326, + "learning_rate": 4.611755637891364e-05, + "loss": 0.0613, + "num_input_tokens_seen": 57217184, + "step": 47015 + }, + { + "epoch": 5.236663325537365, + "grad_norm": 0.11743580549955368, + "learning_rate": 4.611625579485144e-05, + "loss": 0.1113, + "num_input_tokens_seen": 57223168, + "step": 47020 + }, + { + "epoch": 5.237220180420982, + "grad_norm": 0.030393121764063835, + "learning_rate": 4.6114955011329294e-05, + "loss": 0.0027, + "num_input_tokens_seen": 57229184, + "step": 47025 + }, + { + "epoch": 5.2377770353046, + "grad_norm": 0.12854595482349396, + "learning_rate": 4.6113654028359476e-05, + "loss": 0.0306, + "num_input_tokens_seen": 57235328, + "step": 47030 + }, + { + "epoch": 5.238333890188217, + "grad_norm": 0.097606360912323, + "learning_rate": 4.611235284595428e-05, + "loss": 0.0386, + "num_input_tokens_seen": 57241504, + "step": 47035 + }, + { + "epoch": 5.238890745071834, + "grad_norm": 0.061556652188301086, + "learning_rate": 4.611105146412599e-05, + "loss": 0.0136, + "num_input_tokens_seen": 57247136, + "step": 47040 + }, + { + "epoch": 5.239447599955452, + "grad_norm": 1.701094388961792, + "learning_rate": 4.6109749882886914e-05, + "loss": 0.0505, + "num_input_tokens_seen": 57253088, + "step": 47045 + }, + { + "epoch": 5.2400044548390685, + "grad_norm": 0.9688096642494202, + "learning_rate": 4.610844810224934e-05, + "loss": 0.0938, + "num_input_tokens_seen": 57259200, + "step": 47050 + }, + { + "epoch": 5.240561309722686, + "grad_norm": 0.5153496861457825, + "learning_rate": 4.610714612222555e-05, + "loss": 0.0555, + "num_input_tokens_seen": 57265408, + "step": 47055 + }, + { + "epoch": 5.241118164606304, + "grad_norm": 0.04576583206653595, + "learning_rate": 4.6105843942827867e-05, + "loss": 0.0458, + "num_input_tokens_seen": 57271488, + "step": 47060 + }, + { + "epoch": 5.241675019489921, + "grad_norm": 0.5518195033073425, + "learning_rate": 4.610454156406857e-05, + "loss": 0.1077, + "num_input_tokens_seen": 57277344, + "step": 47065 + }, + { + "epoch": 5.242231874373538, + "grad_norm": 0.0010009730467572808, + "learning_rate": 4.610323898595997e-05, + "loss": 0.0972, + "num_input_tokens_seen": 57283424, + "step": 47070 + }, + { + "epoch": 5.242788729257156, + "grad_norm": 1.507960557937622, + "learning_rate": 4.610193620851438e-05, + "loss": 0.0818, + "num_input_tokens_seen": 57289472, + "step": 47075 + }, + { + "epoch": 5.243345584140773, + "grad_norm": 0.39886757731437683, + "learning_rate": 4.6100633231744075e-05, + "loss": 0.037, + "num_input_tokens_seen": 57295296, + "step": 47080 + }, + { + "epoch": 5.2439024390243905, + "grad_norm": 0.43453389406204224, + "learning_rate": 4.60993300556614e-05, + "loss": 0.0241, + "num_input_tokens_seen": 57301664, + "step": 47085 + }, + { + "epoch": 5.244459293908007, + "grad_norm": 0.08653118461370468, + "learning_rate": 4.6098026680278644e-05, + "loss": 0.0283, + "num_input_tokens_seen": 57307808, + "step": 47090 + }, + { + "epoch": 5.245016148791625, + "grad_norm": 0.1730610728263855, + "learning_rate": 4.609672310560812e-05, + "loss": 0.0974, + "num_input_tokens_seen": 57313984, + "step": 47095 + }, + { + "epoch": 5.245573003675243, + "grad_norm": 0.033383529633283615, + "learning_rate": 4.609541933166215e-05, + "loss": 0.07, + "num_input_tokens_seen": 57319840, + "step": 47100 + }, + { + "epoch": 5.246129858558859, + "grad_norm": 0.0028116770554333925, + "learning_rate": 4.609411535845304e-05, + "loss": 0.0592, + "num_input_tokens_seen": 57325824, + "step": 47105 + }, + { + "epoch": 5.246686713442477, + "grad_norm": 0.07613606750965118, + "learning_rate": 4.609281118599311e-05, + "loss": 0.0151, + "num_input_tokens_seen": 57331680, + "step": 47110 + }, + { + "epoch": 5.247243568326094, + "grad_norm": 0.7785603404045105, + "learning_rate": 4.609150681429468e-05, + "loss": 0.0321, + "num_input_tokens_seen": 57337824, + "step": 47115 + }, + { + "epoch": 5.2478004232097115, + "grad_norm": 0.6484007239341736, + "learning_rate": 4.609020224337007e-05, + "loss": 0.061, + "num_input_tokens_seen": 57343168, + "step": 47120 + }, + { + "epoch": 5.248357278093329, + "grad_norm": 0.11761501431465149, + "learning_rate": 4.60888974732316e-05, + "loss": 0.0693, + "num_input_tokens_seen": 57349376, + "step": 47125 + }, + { + "epoch": 5.248914132976946, + "grad_norm": 1.6470035314559937, + "learning_rate": 4.60875925038916e-05, + "loss": 0.108, + "num_input_tokens_seen": 57355552, + "step": 47130 + }, + { + "epoch": 5.249470987860564, + "grad_norm": 0.022127600386738777, + "learning_rate": 4.608628733536239e-05, + "loss": 0.0179, + "num_input_tokens_seen": 57361440, + "step": 47135 + }, + { + "epoch": 5.250027842744181, + "grad_norm": 0.856033444404602, + "learning_rate": 4.6084981967656305e-05, + "loss": 0.0804, + "num_input_tokens_seen": 57367808, + "step": 47140 + }, + { + "epoch": 5.250584697627798, + "grad_norm": 0.20123718678951263, + "learning_rate": 4.608367640078567e-05, + "loss": 0.1082, + "num_input_tokens_seen": 57374080, + "step": 47145 + }, + { + "epoch": 5.251141552511416, + "grad_norm": 0.011828036978840828, + "learning_rate": 4.608237063476282e-05, + "loss": 0.0748, + "num_input_tokens_seen": 57379680, + "step": 47150 + }, + { + "epoch": 5.251698407395033, + "grad_norm": 1.1824254989624023, + "learning_rate": 4.608106466960009e-05, + "loss": 0.0626, + "num_input_tokens_seen": 57385888, + "step": 47155 + }, + { + "epoch": 5.25225526227865, + "grad_norm": 0.13782811164855957, + "learning_rate": 4.6079758505309814e-05, + "loss": 0.1519, + "num_input_tokens_seen": 57391904, + "step": 47160 + }, + { + "epoch": 5.252812117162268, + "grad_norm": 1.733178734779358, + "learning_rate": 4.607845214190433e-05, + "loss": 0.1161, + "num_input_tokens_seen": 57397984, + "step": 47165 + }, + { + "epoch": 5.253368972045885, + "grad_norm": 0.7070655822753906, + "learning_rate": 4.607714557939598e-05, + "loss": 0.1643, + "num_input_tokens_seen": 57404000, + "step": 47170 + }, + { + "epoch": 5.253925826929502, + "grad_norm": 0.5538560152053833, + "learning_rate": 4.60758388177971e-05, + "loss": 0.1088, + "num_input_tokens_seen": 57410112, + "step": 47175 + }, + { + "epoch": 5.254482681813119, + "grad_norm": 0.006886041723191738, + "learning_rate": 4.607453185712004e-05, + "loss": 0.0446, + "num_input_tokens_seen": 57416256, + "step": 47180 + }, + { + "epoch": 5.255039536696737, + "grad_norm": 0.36679551005363464, + "learning_rate": 4.607322469737714e-05, + "loss": 0.0557, + "num_input_tokens_seen": 57422208, + "step": 47185 + }, + { + "epoch": 5.2555963915803545, + "grad_norm": 0.019667718559503555, + "learning_rate": 4.607191733858074e-05, + "loss": 0.1236, + "num_input_tokens_seen": 57428576, + "step": 47190 + }, + { + "epoch": 5.256153246463971, + "grad_norm": 0.00028117664624005556, + "learning_rate": 4.607060978074321e-05, + "loss": 0.0385, + "num_input_tokens_seen": 57434848, + "step": 47195 + }, + { + "epoch": 5.256710101347589, + "grad_norm": 0.018589545041322708, + "learning_rate": 4.6069302023876885e-05, + "loss": 0.0734, + "num_input_tokens_seen": 57441056, + "step": 47200 + }, + { + "epoch": 5.257266956231206, + "grad_norm": 0.12063104659318924, + "learning_rate": 4.6067994067994123e-05, + "loss": 0.0784, + "num_input_tokens_seen": 57447072, + "step": 47205 + }, + { + "epoch": 5.257823811114823, + "grad_norm": 0.04168634116649628, + "learning_rate": 4.606668591310728e-05, + "loss": 0.0597, + "num_input_tokens_seen": 57453344, + "step": 47210 + }, + { + "epoch": 5.258380665998441, + "grad_norm": 0.14887407422065735, + "learning_rate": 4.606537755922871e-05, + "loss": 0.0315, + "num_input_tokens_seen": 57459392, + "step": 47215 + }, + { + "epoch": 5.258937520882058, + "grad_norm": 0.7320400476455688, + "learning_rate": 4.6064069006370765e-05, + "loss": 0.0867, + "num_input_tokens_seen": 57465728, + "step": 47220 + }, + { + "epoch": 5.259494375765676, + "grad_norm": 0.7319187521934509, + "learning_rate": 4.6062760254545814e-05, + "loss": 0.0324, + "num_input_tokens_seen": 57471488, + "step": 47225 + }, + { + "epoch": 5.260051230649292, + "grad_norm": 1.9133473634719849, + "learning_rate": 4.606145130376622e-05, + "loss": 0.1219, + "num_input_tokens_seen": 57477344, + "step": 47230 + }, + { + "epoch": 5.26060808553291, + "grad_norm": 0.0005529846530407667, + "learning_rate": 4.6060142154044344e-05, + "loss": 0.0522, + "num_input_tokens_seen": 57483488, + "step": 47235 + }, + { + "epoch": 5.261164940416528, + "grad_norm": 0.051328375935554504, + "learning_rate": 4.605883280539255e-05, + "loss": 0.0174, + "num_input_tokens_seen": 57489536, + "step": 47240 + }, + { + "epoch": 5.2617217953001445, + "grad_norm": 0.8178043365478516, + "learning_rate": 4.6057523257823216e-05, + "loss": 0.1151, + "num_input_tokens_seen": 57495744, + "step": 47245 + }, + { + "epoch": 5.262278650183762, + "grad_norm": 1.6115823984146118, + "learning_rate": 4.60562135113487e-05, + "loss": 0.3132, + "num_input_tokens_seen": 57501952, + "step": 47250 + }, + { + "epoch": 5.26283550506738, + "grad_norm": 0.6930043697357178, + "learning_rate": 4.605490356598137e-05, + "loss": 0.0636, + "num_input_tokens_seen": 57508256, + "step": 47255 + }, + { + "epoch": 5.263392359950997, + "grad_norm": 0.4669293761253357, + "learning_rate": 4.605359342173361e-05, + "loss": 0.0681, + "num_input_tokens_seen": 57513696, + "step": 47260 + }, + { + "epoch": 5.263949214834614, + "grad_norm": 0.6288672685623169, + "learning_rate": 4.6052283078617796e-05, + "loss": 0.0644, + "num_input_tokens_seen": 57519072, + "step": 47265 + }, + { + "epoch": 5.264506069718231, + "grad_norm": 0.09558318555355072, + "learning_rate": 4.60509725366463e-05, + "loss": 0.0333, + "num_input_tokens_seen": 57525152, + "step": 47270 + }, + { + "epoch": 5.265062924601849, + "grad_norm": 0.7814302444458008, + "learning_rate": 4.604966179583151e-05, + "loss": 0.0346, + "num_input_tokens_seen": 57531456, + "step": 47275 + }, + { + "epoch": 5.2656197794854664, + "grad_norm": 0.7237627506256104, + "learning_rate": 4.604835085618578e-05, + "loss": 0.1244, + "num_input_tokens_seen": 57537856, + "step": 47280 + }, + { + "epoch": 5.266176634369083, + "grad_norm": 0.08573194593191147, + "learning_rate": 4.604703971772153e-05, + "loss": 0.0867, + "num_input_tokens_seen": 57544000, + "step": 47285 + }, + { + "epoch": 5.266733489252701, + "grad_norm": 0.4533030390739441, + "learning_rate": 4.6045728380451125e-05, + "loss": 0.0375, + "num_input_tokens_seen": 57550240, + "step": 47290 + }, + { + "epoch": 5.267290344136318, + "grad_norm": 0.190927192568779, + "learning_rate": 4.604441684438695e-05, + "loss": 0.1466, + "num_input_tokens_seen": 57556480, + "step": 47295 + }, + { + "epoch": 5.267847199019935, + "grad_norm": 1.0109550952911377, + "learning_rate": 4.6043105109541404e-05, + "loss": 0.0504, + "num_input_tokens_seen": 57562240, + "step": 47300 + }, + { + "epoch": 5.268404053903553, + "grad_norm": 0.8615391850471497, + "learning_rate": 4.6041793175926864e-05, + "loss": 0.0388, + "num_input_tokens_seen": 57567904, + "step": 47305 + }, + { + "epoch": 5.26896090878717, + "grad_norm": 0.013049052096903324, + "learning_rate": 4.604048104355573e-05, + "loss": 0.0534, + "num_input_tokens_seen": 57574016, + "step": 47310 + }, + { + "epoch": 5.2695177636707875, + "grad_norm": 0.5265423655509949, + "learning_rate": 4.603916871244039e-05, + "loss": 0.0739, + "num_input_tokens_seen": 57580256, + "step": 47315 + }, + { + "epoch": 5.270074618554405, + "grad_norm": 1.049191951751709, + "learning_rate": 4.6037856182593254e-05, + "loss": 0.0637, + "num_input_tokens_seen": 57586176, + "step": 47320 + }, + { + "epoch": 5.270631473438022, + "grad_norm": 0.3279176354408264, + "learning_rate": 4.603654345402671e-05, + "loss": 0.0234, + "num_input_tokens_seen": 57591744, + "step": 47325 + }, + { + "epoch": 5.27118832832164, + "grad_norm": 0.9022879600524902, + "learning_rate": 4.603523052675316e-05, + "loss": 0.0611, + "num_input_tokens_seen": 57597760, + "step": 47330 + }, + { + "epoch": 5.271745183205256, + "grad_norm": 0.08961854875087738, + "learning_rate": 4.6033917400784996e-05, + "loss": 0.0511, + "num_input_tokens_seen": 57603744, + "step": 47335 + }, + { + "epoch": 5.272302038088874, + "grad_norm": 0.01122006680816412, + "learning_rate": 4.6032604076134636e-05, + "loss": 0.027, + "num_input_tokens_seen": 57609856, + "step": 47340 + }, + { + "epoch": 5.272858892972492, + "grad_norm": 0.43617668747901917, + "learning_rate": 4.6031290552814474e-05, + "loss": 0.169, + "num_input_tokens_seen": 57615776, + "step": 47345 + }, + { + "epoch": 5.273415747856109, + "grad_norm": 0.0011470094323158264, + "learning_rate": 4.602997683083693e-05, + "loss": 0.0525, + "num_input_tokens_seen": 57621696, + "step": 47350 + }, + { + "epoch": 5.273972602739726, + "grad_norm": 0.021543335169553757, + "learning_rate": 4.602866291021441e-05, + "loss": 0.098, + "num_input_tokens_seen": 57627968, + "step": 47355 + }, + { + "epoch": 5.274529457623343, + "grad_norm": 0.6871137619018555, + "learning_rate": 4.6027348790959316e-05, + "loss": 0.1185, + "num_input_tokens_seen": 57634304, + "step": 47360 + }, + { + "epoch": 5.275086312506961, + "grad_norm": 0.34038040041923523, + "learning_rate": 4.602603447308406e-05, + "loss": 0.058, + "num_input_tokens_seen": 57640576, + "step": 47365 + }, + { + "epoch": 5.275643167390578, + "grad_norm": 0.017300210893154144, + "learning_rate": 4.602471995660106e-05, + "loss": 0.0675, + "num_input_tokens_seen": 57647072, + "step": 47370 + }, + { + "epoch": 5.276200022274195, + "grad_norm": 0.032233305275440216, + "learning_rate": 4.602340524152274e-05, + "loss": 0.0259, + "num_input_tokens_seen": 57652128, + "step": 47375 + }, + { + "epoch": 5.276756877157813, + "grad_norm": 0.11923995614051819, + "learning_rate": 4.6022090327861524e-05, + "loss": 0.0498, + "num_input_tokens_seen": 57658016, + "step": 47380 + }, + { + "epoch": 5.27731373204143, + "grad_norm": 0.0019346094923093915, + "learning_rate": 4.602077521562981e-05, + "loss": 0.0974, + "num_input_tokens_seen": 57664096, + "step": 47385 + }, + { + "epoch": 5.277870586925047, + "grad_norm": 0.17098727822303772, + "learning_rate": 4.601945990484004e-05, + "loss": 0.1508, + "num_input_tokens_seen": 57670080, + "step": 47390 + }, + { + "epoch": 5.278427441808665, + "grad_norm": 0.0027609586250036955, + "learning_rate": 4.6018144395504626e-05, + "loss": 0.06, + "num_input_tokens_seen": 57676160, + "step": 47395 + }, + { + "epoch": 5.278984296692282, + "grad_norm": 0.09071951359510422, + "learning_rate": 4.6016828687636e-05, + "loss": 0.13, + "num_input_tokens_seen": 57682528, + "step": 47400 + }, + { + "epoch": 5.279541151575899, + "grad_norm": 0.0016603043768554926, + "learning_rate": 4.601551278124659e-05, + "loss": 0.0199, + "num_input_tokens_seen": 57688960, + "step": 47405 + }, + { + "epoch": 5.280098006459516, + "grad_norm": 1.5147784948349, + "learning_rate": 4.601419667634882e-05, + "loss": 0.0317, + "num_input_tokens_seen": 57695040, + "step": 47410 + }, + { + "epoch": 5.280654861343134, + "grad_norm": 0.02306005358695984, + "learning_rate": 4.601288037295514e-05, + "loss": 0.0679, + "num_input_tokens_seen": 57701408, + "step": 47415 + }, + { + "epoch": 5.281211716226752, + "grad_norm": 0.7278749942779541, + "learning_rate": 4.601156387107795e-05, + "loss": 0.104, + "num_input_tokens_seen": 57707744, + "step": 47420 + }, + { + "epoch": 5.281768571110368, + "grad_norm": 0.07670480757951736, + "learning_rate": 4.601024717072971e-05, + "loss": 0.0399, + "num_input_tokens_seen": 57713984, + "step": 47425 + }, + { + "epoch": 5.282325425993986, + "grad_norm": 0.24863116443157196, + "learning_rate": 4.600893027192286e-05, + "loss": 0.0051, + "num_input_tokens_seen": 57720256, + "step": 47430 + }, + { + "epoch": 5.282882280877604, + "grad_norm": 0.580366313457489, + "learning_rate": 4.600761317466983e-05, + "loss": 0.0682, + "num_input_tokens_seen": 57726528, + "step": 47435 + }, + { + "epoch": 5.2834391357612205, + "grad_norm": 0.05043807998299599, + "learning_rate": 4.600629587898306e-05, + "loss": 0.019, + "num_input_tokens_seen": 57732864, + "step": 47440 + }, + { + "epoch": 5.283995990644838, + "grad_norm": 0.01844542659819126, + "learning_rate": 4.6004978384875e-05, + "loss": 0.032, + "num_input_tokens_seen": 57739104, + "step": 47445 + }, + { + "epoch": 5.284552845528455, + "grad_norm": 0.007470349781215191, + "learning_rate": 4.600366069235808e-05, + "loss": 0.0223, + "num_input_tokens_seen": 57744640, + "step": 47450 + }, + { + "epoch": 5.285109700412073, + "grad_norm": 0.05139429122209549, + "learning_rate": 4.6002342801444767e-05, + "loss": 0.1801, + "num_input_tokens_seen": 57750688, + "step": 47455 + }, + { + "epoch": 5.28566655529569, + "grad_norm": 2.022634983062744, + "learning_rate": 4.600102471214749e-05, + "loss": 0.2239, + "num_input_tokens_seen": 57756832, + "step": 47460 + }, + { + "epoch": 5.286223410179307, + "grad_norm": 0.0037868518847972155, + "learning_rate": 4.599970642447872e-05, + "loss": 0.0557, + "num_input_tokens_seen": 57763136, + "step": 47465 + }, + { + "epoch": 5.286780265062925, + "grad_norm": 1.048248052597046, + "learning_rate": 4.599838793845089e-05, + "loss": 0.0625, + "num_input_tokens_seen": 57769024, + "step": 47470 + }, + { + "epoch": 5.2873371199465415, + "grad_norm": 0.001817986718378961, + "learning_rate": 4.5997069254076466e-05, + "loss": 0.0755, + "num_input_tokens_seen": 57774656, + "step": 47475 + }, + { + "epoch": 5.287893974830159, + "grad_norm": 0.53998202085495, + "learning_rate": 4.5995750371367895e-05, + "loss": 0.0551, + "num_input_tokens_seen": 57780864, + "step": 47480 + }, + { + "epoch": 5.288450829713777, + "grad_norm": 0.38522860407829285, + "learning_rate": 4.5994431290337645e-05, + "loss": 0.0114, + "num_input_tokens_seen": 57787040, + "step": 47485 + }, + { + "epoch": 5.289007684597394, + "grad_norm": 1.0995720624923706, + "learning_rate": 4.599311201099817e-05, + "loss": 0.0654, + "num_input_tokens_seen": 57793056, + "step": 47490 + }, + { + "epoch": 5.289564539481011, + "grad_norm": 0.018070723861455917, + "learning_rate": 4.599179253336193e-05, + "loss": 0.0353, + "num_input_tokens_seen": 57799232, + "step": 47495 + }, + { + "epoch": 5.290121394364629, + "grad_norm": 1.7023038864135742, + "learning_rate": 4.59904728574414e-05, + "loss": 0.058, + "num_input_tokens_seen": 57805632, + "step": 47500 + }, + { + "epoch": 5.290678249248246, + "grad_norm": 1.3084803819656372, + "learning_rate": 4.598915298324903e-05, + "loss": 0.1156, + "num_input_tokens_seen": 57811552, + "step": 47505 + }, + { + "epoch": 5.2912351041318635, + "grad_norm": 0.000947762920986861, + "learning_rate": 4.59878329107973e-05, + "loss": 0.0734, + "num_input_tokens_seen": 57817952, + "step": 47510 + }, + { + "epoch": 5.29179195901548, + "grad_norm": 0.09647481143474579, + "learning_rate": 4.598651264009866e-05, + "loss": 0.012, + "num_input_tokens_seen": 57823968, + "step": 47515 + }, + { + "epoch": 5.292348813899098, + "grad_norm": 0.7614671587944031, + "learning_rate": 4.5985192171165605e-05, + "loss": 0.0809, + "num_input_tokens_seen": 57829984, + "step": 47520 + }, + { + "epoch": 5.292905668782716, + "grad_norm": 0.22670038044452667, + "learning_rate": 4.598387150401059e-05, + "loss": 0.1633, + "num_input_tokens_seen": 57836096, + "step": 47525 + }, + { + "epoch": 5.293462523666332, + "grad_norm": 0.019702522084116936, + "learning_rate": 4.5982550638646104e-05, + "loss": 0.036, + "num_input_tokens_seen": 57842016, + "step": 47530 + }, + { + "epoch": 5.29401937854995, + "grad_norm": 0.18990039825439453, + "learning_rate": 4.598122957508461e-05, + "loss": 0.2159, + "num_input_tokens_seen": 57848000, + "step": 47535 + }, + { + "epoch": 5.294576233433567, + "grad_norm": 0.8912879228591919, + "learning_rate": 4.5979908313338594e-05, + "loss": 0.0614, + "num_input_tokens_seen": 57854272, + "step": 47540 + }, + { + "epoch": 5.2951330883171845, + "grad_norm": 0.06357435137033463, + "learning_rate": 4.5978586853420533e-05, + "loss": 0.023, + "num_input_tokens_seen": 57860512, + "step": 47545 + }, + { + "epoch": 5.295689943200802, + "grad_norm": 0.7921620011329651, + "learning_rate": 4.597726519534292e-05, + "loss": 0.0711, + "num_input_tokens_seen": 57866592, + "step": 47550 + }, + { + "epoch": 5.296246798084419, + "grad_norm": 0.015448926016688347, + "learning_rate": 4.597594333911822e-05, + "loss": 0.0173, + "num_input_tokens_seen": 57872352, + "step": 47555 + }, + { + "epoch": 5.296803652968037, + "grad_norm": 0.002063606632873416, + "learning_rate": 4.597462128475894e-05, + "loss": 0.023, + "num_input_tokens_seen": 57878592, + "step": 47560 + }, + { + "epoch": 5.2973605078516535, + "grad_norm": 0.1253587156534195, + "learning_rate": 4.597329903227755e-05, + "loss": 0.0274, + "num_input_tokens_seen": 57884352, + "step": 47565 + }, + { + "epoch": 5.297917362735271, + "grad_norm": 0.06177110970020294, + "learning_rate": 4.597197658168655e-05, + "loss": 0.0821, + "num_input_tokens_seen": 57890688, + "step": 47570 + }, + { + "epoch": 5.298474217618889, + "grad_norm": 0.5078690052032471, + "learning_rate": 4.597065393299843e-05, + "loss": 0.1247, + "num_input_tokens_seen": 57896832, + "step": 47575 + }, + { + "epoch": 5.299031072502506, + "grad_norm": 1.3080703020095825, + "learning_rate": 4.5969331086225676e-05, + "loss": 0.1016, + "num_input_tokens_seen": 57903168, + "step": 47580 + }, + { + "epoch": 5.299587927386123, + "grad_norm": 0.3216043710708618, + "learning_rate": 4.596800804138079e-05, + "loss": 0.0239, + "num_input_tokens_seen": 57909408, + "step": 47585 + }, + { + "epoch": 5.30014478226974, + "grad_norm": 0.2258651852607727, + "learning_rate": 4.596668479847628e-05, + "loss": 0.0497, + "num_input_tokens_seen": 57915456, + "step": 47590 + }, + { + "epoch": 5.300701637153358, + "grad_norm": 0.034763552248477936, + "learning_rate": 4.596536135752463e-05, + "loss": 0.0216, + "num_input_tokens_seen": 57921728, + "step": 47595 + }, + { + "epoch": 5.301258492036975, + "grad_norm": 1.5529510974884033, + "learning_rate": 4.596403771853833e-05, + "loss": 0.1363, + "num_input_tokens_seen": 57928128, + "step": 47600 + }, + { + "epoch": 5.301815346920592, + "grad_norm": 1.1510727405548096, + "learning_rate": 4.596271388152992e-05, + "loss": 0.139, + "num_input_tokens_seen": 57934240, + "step": 47605 + }, + { + "epoch": 5.30237220180421, + "grad_norm": 0.031335148960351944, + "learning_rate": 4.5961389846511866e-05, + "loss": 0.0688, + "num_input_tokens_seen": 57940288, + "step": 47610 + }, + { + "epoch": 5.3029290566878275, + "grad_norm": 0.10541512072086334, + "learning_rate": 4.596006561349669e-05, + "loss": 0.0127, + "num_input_tokens_seen": 57946560, + "step": 47615 + }, + { + "epoch": 5.303485911571444, + "grad_norm": 1.3470591306686401, + "learning_rate": 4.595874118249691e-05, + "loss": 0.0802, + "num_input_tokens_seen": 57952480, + "step": 47620 + }, + { + "epoch": 5.304042766455062, + "grad_norm": 0.160816490650177, + "learning_rate": 4.5957416553525025e-05, + "loss": 0.0373, + "num_input_tokens_seen": 57958560, + "step": 47625 + }, + { + "epoch": 5.304599621338679, + "grad_norm": 2.592144727706909, + "learning_rate": 4.595609172659355e-05, + "loss": 0.0762, + "num_input_tokens_seen": 57964704, + "step": 47630 + }, + { + "epoch": 5.3051564762222965, + "grad_norm": 1.1242570877075195, + "learning_rate": 4.5954766701715e-05, + "loss": 0.1014, + "num_input_tokens_seen": 57971040, + "step": 47635 + }, + { + "epoch": 5.305713331105914, + "grad_norm": 0.020149964839220047, + "learning_rate": 4.5953441478901896e-05, + "loss": 0.0983, + "num_input_tokens_seen": 57977120, + "step": 47640 + }, + { + "epoch": 5.306270185989531, + "grad_norm": 1.0440198183059692, + "learning_rate": 4.5952116058166736e-05, + "loss": 0.0487, + "num_input_tokens_seen": 57983168, + "step": 47645 + }, + { + "epoch": 5.306827040873149, + "grad_norm": 0.056547220796346664, + "learning_rate": 4.595079043952206e-05, + "loss": 0.0065, + "num_input_tokens_seen": 57989504, + "step": 47650 + }, + { + "epoch": 5.307383895756765, + "grad_norm": 0.5401089787483215, + "learning_rate": 4.594946462298038e-05, + "loss": 0.044, + "num_input_tokens_seen": 57995904, + "step": 47655 + }, + { + "epoch": 5.307940750640383, + "grad_norm": 0.0144258514046669, + "learning_rate": 4.594813860855423e-05, + "loss": 0.03, + "num_input_tokens_seen": 58002208, + "step": 47660 + }, + { + "epoch": 5.308497605524001, + "grad_norm": 0.7302517294883728, + "learning_rate": 4.594681239625612e-05, + "loss": 0.04, + "num_input_tokens_seen": 58008160, + "step": 47665 + }, + { + "epoch": 5.3090544604076175, + "grad_norm": 0.021707970649003983, + "learning_rate": 4.594548598609859e-05, + "loss": 0.0376, + "num_input_tokens_seen": 58014240, + "step": 47670 + }, + { + "epoch": 5.309611315291235, + "grad_norm": 0.13185197114944458, + "learning_rate": 4.5944159378094157e-05, + "loss": 0.1001, + "num_input_tokens_seen": 58020480, + "step": 47675 + }, + { + "epoch": 5.310168170174853, + "grad_norm": 0.5687019228935242, + "learning_rate": 4.5942832572255355e-05, + "loss": 0.0486, + "num_input_tokens_seen": 58026720, + "step": 47680 + }, + { + "epoch": 5.31072502505847, + "grad_norm": 1.8228484392166138, + "learning_rate": 4.594150556859473e-05, + "loss": 0.1234, + "num_input_tokens_seen": 58032640, + "step": 47685 + }, + { + "epoch": 5.311281879942087, + "grad_norm": 2.5173068046569824, + "learning_rate": 4.5940178367124805e-05, + "loss": 0.1066, + "num_input_tokens_seen": 58038912, + "step": 47690 + }, + { + "epoch": 5.311838734825704, + "grad_norm": 0.34517788887023926, + "learning_rate": 4.5938850967858116e-05, + "loss": 0.1419, + "num_input_tokens_seen": 58044896, + "step": 47695 + }, + { + "epoch": 5.312395589709322, + "grad_norm": 0.05743642896413803, + "learning_rate": 4.593752337080721e-05, + "loss": 0.0592, + "num_input_tokens_seen": 58051008, + "step": 47700 + }, + { + "epoch": 5.3129524445929395, + "grad_norm": 0.5640559792518616, + "learning_rate": 4.5936195575984615e-05, + "loss": 0.0178, + "num_input_tokens_seen": 58056896, + "step": 47705 + }, + { + "epoch": 5.313509299476556, + "grad_norm": 0.2886750400066376, + "learning_rate": 4.5934867583402876e-05, + "loss": 0.0261, + "num_input_tokens_seen": 58062912, + "step": 47710 + }, + { + "epoch": 5.314066154360174, + "grad_norm": 0.05590982735157013, + "learning_rate": 4.593353939307454e-05, + "loss": 0.0958, + "num_input_tokens_seen": 58069056, + "step": 47715 + }, + { + "epoch": 5.314623009243791, + "grad_norm": 0.10474268347024918, + "learning_rate": 4.593221100501216e-05, + "loss": 0.084, + "num_input_tokens_seen": 58075360, + "step": 47720 + }, + { + "epoch": 5.315179864127408, + "grad_norm": 0.08250205963850021, + "learning_rate": 4.593088241922827e-05, + "loss": 0.0164, + "num_input_tokens_seen": 58081184, + "step": 47725 + }, + { + "epoch": 5.315736719011026, + "grad_norm": 0.33216428756713867, + "learning_rate": 4.5929553635735435e-05, + "loss": 0.108, + "num_input_tokens_seen": 58087264, + "step": 47730 + }, + { + "epoch": 5.316293573894643, + "grad_norm": 0.4796077609062195, + "learning_rate": 4.592822465454619e-05, + "loss": 0.051, + "num_input_tokens_seen": 58093600, + "step": 47735 + }, + { + "epoch": 5.3168504287782605, + "grad_norm": 0.5723252296447754, + "learning_rate": 4.592689547567309e-05, + "loss": 0.0573, + "num_input_tokens_seen": 58099584, + "step": 47740 + }, + { + "epoch": 5.317407283661877, + "grad_norm": 0.12358163297176361, + "learning_rate": 4.5925566099128706e-05, + "loss": 0.0071, + "num_input_tokens_seen": 58105920, + "step": 47745 + }, + { + "epoch": 5.317964138545495, + "grad_norm": 0.005488618742674589, + "learning_rate": 4.5924236524925584e-05, + "loss": 0.0121, + "num_input_tokens_seen": 58112128, + "step": 47750 + }, + { + "epoch": 5.318520993429113, + "grad_norm": 0.7312461137771606, + "learning_rate": 4.5922906753076275e-05, + "loss": 0.0509, + "num_input_tokens_seen": 58118112, + "step": 47755 + }, + { + "epoch": 5.3190778483127295, + "grad_norm": 0.46642619371414185, + "learning_rate": 4.592157678359336e-05, + "loss": 0.11, + "num_input_tokens_seen": 58124608, + "step": 47760 + }, + { + "epoch": 5.319634703196347, + "grad_norm": 0.16424211859703064, + "learning_rate": 4.592024661648939e-05, + "loss": 0.0241, + "num_input_tokens_seen": 58130304, + "step": 47765 + }, + { + "epoch": 5.320191558079964, + "grad_norm": 0.0026947930455207825, + "learning_rate": 4.591891625177692e-05, + "loss": 0.0197, + "num_input_tokens_seen": 58136448, + "step": 47770 + }, + { + "epoch": 5.320748412963582, + "grad_norm": 0.3092697858810425, + "learning_rate": 4.591758568946854e-05, + "loss": 0.0913, + "num_input_tokens_seen": 58142624, + "step": 47775 + }, + { + "epoch": 5.321305267847199, + "grad_norm": 0.05400653928518295, + "learning_rate": 4.591625492957678e-05, + "loss": 0.0448, + "num_input_tokens_seen": 58148384, + "step": 47780 + }, + { + "epoch": 5.321862122730816, + "grad_norm": 1.434995412826538, + "learning_rate": 4.5914923972114255e-05, + "loss": 0.1319, + "num_input_tokens_seen": 58154912, + "step": 47785 + }, + { + "epoch": 5.322418977614434, + "grad_norm": 0.9845010042190552, + "learning_rate": 4.5913592817093516e-05, + "loss": 0.1403, + "num_input_tokens_seen": 58161056, + "step": 47790 + }, + { + "epoch": 5.322975832498051, + "grad_norm": 0.014415755867958069, + "learning_rate": 4.591226146452713e-05, + "loss": 0.0041, + "num_input_tokens_seen": 58167296, + "step": 47795 + }, + { + "epoch": 5.323532687381668, + "grad_norm": 0.009579316712915897, + "learning_rate": 4.591092991442768e-05, + "loss": 0.0509, + "num_input_tokens_seen": 58173760, + "step": 47800 + }, + { + "epoch": 5.324089542265286, + "grad_norm": 0.9172943830490112, + "learning_rate": 4.590959816680775e-05, + "loss": 0.0999, + "num_input_tokens_seen": 58180032, + "step": 47805 + }, + { + "epoch": 5.324646397148903, + "grad_norm": 1.1000807285308838, + "learning_rate": 4.590826622167991e-05, + "loss": 0.042, + "num_input_tokens_seen": 58186336, + "step": 47810 + }, + { + "epoch": 5.32520325203252, + "grad_norm": 1.3965271711349487, + "learning_rate": 4.590693407905674e-05, + "loss": 0.0659, + "num_input_tokens_seen": 58192480, + "step": 47815 + }, + { + "epoch": 5.325760106916138, + "grad_norm": 0.0008455115603283048, + "learning_rate": 4.5905601738950824e-05, + "loss": 0.0203, + "num_input_tokens_seen": 58197696, + "step": 47820 + }, + { + "epoch": 5.326316961799755, + "grad_norm": 0.0945863202214241, + "learning_rate": 4.590426920137476e-05, + "loss": 0.0583, + "num_input_tokens_seen": 58203712, + "step": 47825 + }, + { + "epoch": 5.3268738166833725, + "grad_norm": 0.05630261451005936, + "learning_rate": 4.590293646634112e-05, + "loss": 0.0865, + "num_input_tokens_seen": 58209920, + "step": 47830 + }, + { + "epoch": 5.327430671566989, + "grad_norm": 0.22187212109565735, + "learning_rate": 4.59016035338625e-05, + "loss": 0.1382, + "num_input_tokens_seen": 58216288, + "step": 47835 + }, + { + "epoch": 5.327987526450607, + "grad_norm": 5.078110218048096, + "learning_rate": 4.5900270403951486e-05, + "loss": 0.149, + "num_input_tokens_seen": 58222272, + "step": 47840 + }, + { + "epoch": 5.328544381334225, + "grad_norm": 1.500955581665039, + "learning_rate": 4.589893707662067e-05, + "loss": 0.0705, + "num_input_tokens_seen": 58228512, + "step": 47845 + }, + { + "epoch": 5.329101236217841, + "grad_norm": 0.34718719124794006, + "learning_rate": 4.5897603551882664e-05, + "loss": 0.0823, + "num_input_tokens_seen": 58234272, + "step": 47850 + }, + { + "epoch": 5.329658091101459, + "grad_norm": 0.24049487709999084, + "learning_rate": 4.589626982975004e-05, + "loss": 0.079, + "num_input_tokens_seen": 58240352, + "step": 47855 + }, + { + "epoch": 5.330214945985077, + "grad_norm": 0.7552046179771423, + "learning_rate": 4.5894935910235404e-05, + "loss": 0.0992, + "num_input_tokens_seen": 58246720, + "step": 47860 + }, + { + "epoch": 5.3307718008686935, + "grad_norm": 0.00961958896368742, + "learning_rate": 4.589360179335135e-05, + "loss": 0.0634, + "num_input_tokens_seen": 58252864, + "step": 47865 + }, + { + "epoch": 5.331328655752311, + "grad_norm": 0.006794638000428677, + "learning_rate": 4.58922674791105e-05, + "loss": 0.0516, + "num_input_tokens_seen": 58259008, + "step": 47870 + }, + { + "epoch": 5.331885510635928, + "grad_norm": 0.2902570366859436, + "learning_rate": 4.589093296752544e-05, + "loss": 0.0351, + "num_input_tokens_seen": 58265120, + "step": 47875 + }, + { + "epoch": 5.332442365519546, + "grad_norm": 0.0003247448767069727, + "learning_rate": 4.5889598258608784e-05, + "loss": 0.0536, + "num_input_tokens_seen": 58271360, + "step": 47880 + }, + { + "epoch": 5.332999220403163, + "grad_norm": 0.21468301117420197, + "learning_rate": 4.588826335237314e-05, + "loss": 0.1251, + "num_input_tokens_seen": 58277472, + "step": 47885 + }, + { + "epoch": 5.33355607528678, + "grad_norm": 0.8209104537963867, + "learning_rate": 4.58869282488311e-05, + "loss": 0.1009, + "num_input_tokens_seen": 58283616, + "step": 47890 + }, + { + "epoch": 5.334112930170398, + "grad_norm": 0.7047815322875977, + "learning_rate": 4.5885592947995295e-05, + "loss": 0.0268, + "num_input_tokens_seen": 58289600, + "step": 47895 + }, + { + "epoch": 5.334669785054015, + "grad_norm": 0.8487882614135742, + "learning_rate": 4.5884257449878334e-05, + "loss": 0.0499, + "num_input_tokens_seen": 58296000, + "step": 47900 + }, + { + "epoch": 5.335226639937632, + "grad_norm": 0.8830689787864685, + "learning_rate": 4.588292175449283e-05, + "loss": 0.0794, + "num_input_tokens_seen": 58301664, + "step": 47905 + }, + { + "epoch": 5.33578349482125, + "grad_norm": 0.7809455990791321, + "learning_rate": 4.588158586185139e-05, + "loss": 0.0798, + "num_input_tokens_seen": 58307712, + "step": 47910 + }, + { + "epoch": 5.336340349704867, + "grad_norm": 0.43904489278793335, + "learning_rate": 4.588024977196665e-05, + "loss": 0.0882, + "num_input_tokens_seen": 58313184, + "step": 47915 + }, + { + "epoch": 5.336897204588484, + "grad_norm": 0.0005904958816245198, + "learning_rate": 4.5878913484851215e-05, + "loss": 0.0128, + "num_input_tokens_seen": 58319360, + "step": 47920 + }, + { + "epoch": 5.337454059472101, + "grad_norm": 0.04344326630234718, + "learning_rate": 4.5877577000517727e-05, + "loss": 0.0482, + "num_input_tokens_seen": 58325600, + "step": 47925 + }, + { + "epoch": 5.338010914355719, + "grad_norm": 0.26592493057250977, + "learning_rate": 4.587624031897879e-05, + "loss": 0.0259, + "num_input_tokens_seen": 58331968, + "step": 47930 + }, + { + "epoch": 5.3385677692393365, + "grad_norm": 0.21129551529884338, + "learning_rate": 4.587490344024703e-05, + "loss": 0.0734, + "num_input_tokens_seen": 58338112, + "step": 47935 + }, + { + "epoch": 5.339124624122953, + "grad_norm": 0.22571954131126404, + "learning_rate": 4.587356636433508e-05, + "loss": 0.0218, + "num_input_tokens_seen": 58344096, + "step": 47940 + }, + { + "epoch": 5.339681479006571, + "grad_norm": 0.5035973191261292, + "learning_rate": 4.587222909125559e-05, + "loss": 0.0803, + "num_input_tokens_seen": 58350208, + "step": 47945 + }, + { + "epoch": 5.340238333890188, + "grad_norm": 0.03993422910571098, + "learning_rate": 4.587089162102116e-05, + "loss": 0.0579, + "num_input_tokens_seen": 58356256, + "step": 47950 + }, + { + "epoch": 5.340795188773805, + "grad_norm": 0.4897806644439697, + "learning_rate": 4.586955395364445e-05, + "loss": 0.0541, + "num_input_tokens_seen": 58362208, + "step": 47955 + }, + { + "epoch": 5.341352043657423, + "grad_norm": 0.33887770771980286, + "learning_rate": 4.5868216089138074e-05, + "loss": 0.0116, + "num_input_tokens_seen": 58368160, + "step": 47960 + }, + { + "epoch": 5.34190889854104, + "grad_norm": 1.0812801122665405, + "learning_rate": 4.586687802751467e-05, + "loss": 0.116, + "num_input_tokens_seen": 58374144, + "step": 47965 + }, + { + "epoch": 5.342465753424658, + "grad_norm": 0.2736797034740448, + "learning_rate": 4.586553976878689e-05, + "loss": 0.0241, + "num_input_tokens_seen": 58380096, + "step": 47970 + }, + { + "epoch": 5.343022608308275, + "grad_norm": 0.2291482537984848, + "learning_rate": 4.586420131296738e-05, + "loss": 0.0766, + "num_input_tokens_seen": 58386560, + "step": 47975 + }, + { + "epoch": 5.343579463191892, + "grad_norm": 0.26160386204719543, + "learning_rate": 4.586286266006876e-05, + "loss": 0.034, + "num_input_tokens_seen": 58392704, + "step": 47980 + }, + { + "epoch": 5.34413631807551, + "grad_norm": 0.0005442628753371537, + "learning_rate": 4.586152381010369e-05, + "loss": 0.0989, + "num_input_tokens_seen": 58398880, + "step": 47985 + }, + { + "epoch": 5.3446931729591265, + "grad_norm": 0.006518700160086155, + "learning_rate": 4.586018476308482e-05, + "loss": 0.0637, + "num_input_tokens_seen": 58405280, + "step": 47990 + }, + { + "epoch": 5.345250027842744, + "grad_norm": 0.16649788618087769, + "learning_rate": 4.5858845519024783e-05, + "loss": 0.0106, + "num_input_tokens_seen": 58411808, + "step": 47995 + }, + { + "epoch": 5.345806882726362, + "grad_norm": 0.15820075571537018, + "learning_rate": 4.585750607793625e-05, + "loss": 0.0499, + "num_input_tokens_seen": 58417920, + "step": 48000 + }, + { + "epoch": 5.346363737609979, + "grad_norm": 0.3272157311439514, + "learning_rate": 4.585616643983185e-05, + "loss": 0.0371, + "num_input_tokens_seen": 58424128, + "step": 48005 + }, + { + "epoch": 5.346920592493596, + "grad_norm": 0.010310731828212738, + "learning_rate": 4.5854826604724256e-05, + "loss": 0.139, + "num_input_tokens_seen": 58430432, + "step": 48010 + }, + { + "epoch": 5.347477447377213, + "grad_norm": 0.6812174916267395, + "learning_rate": 4.5853486572626106e-05, + "loss": 0.0356, + "num_input_tokens_seen": 58436416, + "step": 48015 + }, + { + "epoch": 5.348034302260831, + "grad_norm": 0.05182981118559837, + "learning_rate": 4.585214634355007e-05, + "loss": 0.0317, + "num_input_tokens_seen": 58442688, + "step": 48020 + }, + { + "epoch": 5.348591157144448, + "grad_norm": 1.4066698551177979, + "learning_rate": 4.585080591750882e-05, + "loss": 0.0624, + "num_input_tokens_seen": 58448576, + "step": 48025 + }, + { + "epoch": 5.349148012028065, + "grad_norm": 0.41119349002838135, + "learning_rate": 4.584946529451499e-05, + "loss": 0.0389, + "num_input_tokens_seen": 58454912, + "step": 48030 + }, + { + "epoch": 5.349704866911683, + "grad_norm": 0.12673264741897583, + "learning_rate": 4.5848124474581256e-05, + "loss": 0.0478, + "num_input_tokens_seen": 58461056, + "step": 48035 + }, + { + "epoch": 5.350261721795301, + "grad_norm": 0.26779845356941223, + "learning_rate": 4.584678345772029e-05, + "loss": 0.08, + "num_input_tokens_seen": 58467328, + "step": 48040 + }, + { + "epoch": 5.350818576678917, + "grad_norm": 0.11244216561317444, + "learning_rate": 4.5845442243944745e-05, + "loss": 0.012, + "num_input_tokens_seen": 58473472, + "step": 48045 + }, + { + "epoch": 5.351375431562535, + "grad_norm": 0.6948320865631104, + "learning_rate": 4.5844100833267304e-05, + "loss": 0.0423, + "num_input_tokens_seen": 58479776, + "step": 48050 + }, + { + "epoch": 5.351932286446152, + "grad_norm": 0.8070053458213806, + "learning_rate": 4.584275922570062e-05, + "loss": 0.0344, + "num_input_tokens_seen": 58486208, + "step": 48055 + }, + { + "epoch": 5.3524891413297695, + "grad_norm": 2.040790319442749, + "learning_rate": 4.584141742125738e-05, + "loss": 0.0619, + "num_input_tokens_seen": 58492512, + "step": 48060 + }, + { + "epoch": 5.353045996213387, + "grad_norm": 0.08463180065155029, + "learning_rate": 4.5840075419950255e-05, + "loss": 0.1364, + "num_input_tokens_seen": 58498336, + "step": 48065 + }, + { + "epoch": 5.353602851097004, + "grad_norm": 0.0423944815993309, + "learning_rate": 4.583873322179193e-05, + "loss": 0.0081, + "num_input_tokens_seen": 58504544, + "step": 48070 + }, + { + "epoch": 5.354159705980622, + "grad_norm": 0.0045420159585773945, + "learning_rate": 4.583739082679506e-05, + "loss": 0.0134, + "num_input_tokens_seen": 58510528, + "step": 48075 + }, + { + "epoch": 5.354716560864238, + "grad_norm": 0.3035390377044678, + "learning_rate": 4.583604823497235e-05, + "loss": 0.0093, + "num_input_tokens_seen": 58516768, + "step": 48080 + }, + { + "epoch": 5.355273415747856, + "grad_norm": 0.5705946683883667, + "learning_rate": 4.583470544633647e-05, + "loss": 0.1281, + "num_input_tokens_seen": 58523168, + "step": 48085 + }, + { + "epoch": 5.355830270631474, + "grad_norm": 0.23914805054664612, + "learning_rate": 4.58333624609001e-05, + "loss": 0.0126, + "num_input_tokens_seen": 58529376, + "step": 48090 + }, + { + "epoch": 5.3563871255150906, + "grad_norm": 0.9585753083229065, + "learning_rate": 4.583201927867592e-05, + "loss": 0.121, + "num_input_tokens_seen": 58535712, + "step": 48095 + }, + { + "epoch": 5.356943980398708, + "grad_norm": 1.2153868675231934, + "learning_rate": 4.583067589967665e-05, + "loss": 0.1034, + "num_input_tokens_seen": 58541792, + "step": 48100 + }, + { + "epoch": 5.357500835282325, + "grad_norm": 0.4481048583984375, + "learning_rate": 4.582933232391494e-05, + "loss": 0.1764, + "num_input_tokens_seen": 58547360, + "step": 48105 + }, + { + "epoch": 5.358057690165943, + "grad_norm": 0.5096724629402161, + "learning_rate": 4.58279885514035e-05, + "loss": 0.0701, + "num_input_tokens_seen": 58553440, + "step": 48110 + }, + { + "epoch": 5.35861454504956, + "grad_norm": 0.04278222471475601, + "learning_rate": 4.582664458215503e-05, + "loss": 0.1214, + "num_input_tokens_seen": 58559232, + "step": 48115 + }, + { + "epoch": 5.359171399933177, + "grad_norm": 0.24882516264915466, + "learning_rate": 4.582530041618221e-05, + "loss": 0.0401, + "num_input_tokens_seen": 58564864, + "step": 48120 + }, + { + "epoch": 5.359728254816795, + "grad_norm": 0.05977912247180939, + "learning_rate": 4.582395605349774e-05, + "loss": 0.1128, + "num_input_tokens_seen": 58570144, + "step": 48125 + }, + { + "epoch": 5.360285109700412, + "grad_norm": 1.056485891342163, + "learning_rate": 4.5822611494114326e-05, + "loss": 0.0821, + "num_input_tokens_seen": 58576128, + "step": 48130 + }, + { + "epoch": 5.360841964584029, + "grad_norm": 0.018144112080335617, + "learning_rate": 4.5821266738044664e-05, + "loss": 0.0955, + "num_input_tokens_seen": 58582144, + "step": 48135 + }, + { + "epoch": 5.361398819467647, + "grad_norm": 0.008820670656859875, + "learning_rate": 4.581992178530146e-05, + "loss": 0.014, + "num_input_tokens_seen": 58588256, + "step": 48140 + }, + { + "epoch": 5.361955674351264, + "grad_norm": 0.9082252383232117, + "learning_rate": 4.5818576635897405e-05, + "loss": 0.1385, + "num_input_tokens_seen": 58594464, + "step": 48145 + }, + { + "epoch": 5.362512529234881, + "grad_norm": 0.5967576503753662, + "learning_rate": 4.5817231289845205e-05, + "loss": 0.0233, + "num_input_tokens_seen": 58600672, + "step": 48150 + }, + { + "epoch": 5.363069384118499, + "grad_norm": 0.1898055374622345, + "learning_rate": 4.5815885747157593e-05, + "loss": 0.019, + "num_input_tokens_seen": 58606752, + "step": 48155 + }, + { + "epoch": 5.363626239002116, + "grad_norm": 0.052011433988809586, + "learning_rate": 4.581454000784726e-05, + "loss": 0.005, + "num_input_tokens_seen": 58612864, + "step": 48160 + }, + { + "epoch": 5.3641830938857336, + "grad_norm": 0.0812758058309555, + "learning_rate": 4.581319407192692e-05, + "loss": 0.0299, + "num_input_tokens_seen": 58618912, + "step": 48165 + }, + { + "epoch": 5.36473994876935, + "grad_norm": 2.0869500637054443, + "learning_rate": 4.5811847939409286e-05, + "loss": 0.1715, + "num_input_tokens_seen": 58625184, + "step": 48170 + }, + { + "epoch": 5.365296803652968, + "grad_norm": 1.5864200592041016, + "learning_rate": 4.581050161030708e-05, + "loss": 0.0891, + "num_input_tokens_seen": 58631488, + "step": 48175 + }, + { + "epoch": 5.365853658536586, + "grad_norm": 0.8889429569244385, + "learning_rate": 4.5809155084633e-05, + "loss": 0.0633, + "num_input_tokens_seen": 58637888, + "step": 48180 + }, + { + "epoch": 5.3664105134202025, + "grad_norm": 0.0057773590087890625, + "learning_rate": 4.580780836239979e-05, + "loss": 0.0708, + "num_input_tokens_seen": 58643744, + "step": 48185 + }, + { + "epoch": 5.36696736830382, + "grad_norm": 0.29350903630256653, + "learning_rate": 4.580646144362015e-05, + "loss": 0.0732, + "num_input_tokens_seen": 58650176, + "step": 48190 + }, + { + "epoch": 5.367524223187437, + "grad_norm": 0.01575646549463272, + "learning_rate": 4.580511432830682e-05, + "loss": 0.0433, + "num_input_tokens_seen": 58656096, + "step": 48195 + }, + { + "epoch": 5.368081078071055, + "grad_norm": 0.13830924034118652, + "learning_rate": 4.580376701647251e-05, + "loss": 0.0044, + "num_input_tokens_seen": 58662400, + "step": 48200 + }, + { + "epoch": 5.368637932954672, + "grad_norm": 0.215087890625, + "learning_rate": 4.5802419508129955e-05, + "loss": 0.0402, + "num_input_tokens_seen": 58668512, + "step": 48205 + }, + { + "epoch": 5.369194787838289, + "grad_norm": 0.26488009095191956, + "learning_rate": 4.5801071803291886e-05, + "loss": 0.0304, + "num_input_tokens_seen": 58674752, + "step": 48210 + }, + { + "epoch": 5.369751642721907, + "grad_norm": 0.23717531561851501, + "learning_rate": 4.579972390197102e-05, + "loss": 0.1204, + "num_input_tokens_seen": 58680480, + "step": 48215 + }, + { + "epoch": 5.370308497605524, + "grad_norm": 1.2561495304107666, + "learning_rate": 4.57983758041801e-05, + "loss": 0.1376, + "num_input_tokens_seen": 58686336, + "step": 48220 + }, + { + "epoch": 5.370865352489141, + "grad_norm": 0.07662954181432724, + "learning_rate": 4.5797027509931855e-05, + "loss": 0.0626, + "num_input_tokens_seen": 58692256, + "step": 48225 + }, + { + "epoch": 5.371422207372759, + "grad_norm": 0.07269654422998428, + "learning_rate": 4.579567901923903e-05, + "loss": 0.0326, + "num_input_tokens_seen": 58698464, + "step": 48230 + }, + { + "epoch": 5.371979062256376, + "grad_norm": 0.1180080696940422, + "learning_rate": 4.579433033211435e-05, + "loss": 0.0233, + "num_input_tokens_seen": 58704192, + "step": 48235 + }, + { + "epoch": 5.372535917139993, + "grad_norm": 0.36818215250968933, + "learning_rate": 4.579298144857056e-05, + "loss": 0.0483, + "num_input_tokens_seen": 58710112, + "step": 48240 + }, + { + "epoch": 5.373092772023611, + "grad_norm": 0.06892668455839157, + "learning_rate": 4.579163236862041e-05, + "loss": 0.0091, + "num_input_tokens_seen": 58716288, + "step": 48245 + }, + { + "epoch": 5.373649626907228, + "grad_norm": 0.068269282579422, + "learning_rate": 4.579028309227662e-05, + "loss": 0.0667, + "num_input_tokens_seen": 58722496, + "step": 48250 + }, + { + "epoch": 5.3742064817908455, + "grad_norm": 1.0129643678665161, + "learning_rate": 4.578893361955196e-05, + "loss": 0.0743, + "num_input_tokens_seen": 58728192, + "step": 48255 + }, + { + "epoch": 5.374763336674462, + "grad_norm": 0.04553033411502838, + "learning_rate": 4.5787583950459155e-05, + "loss": 0.0323, + "num_input_tokens_seen": 58734560, + "step": 48260 + }, + { + "epoch": 5.37532019155808, + "grad_norm": 0.16589811444282532, + "learning_rate": 4.5786234085010976e-05, + "loss": 0.0282, + "num_input_tokens_seen": 58740512, + "step": 48265 + }, + { + "epoch": 5.375877046441698, + "grad_norm": 0.7713229060173035, + "learning_rate": 4.578488402322016e-05, + "loss": 0.0129, + "num_input_tokens_seen": 58746464, + "step": 48270 + }, + { + "epoch": 5.376433901325314, + "grad_norm": 0.42716357111930847, + "learning_rate": 4.578353376509946e-05, + "loss": 0.1753, + "num_input_tokens_seen": 58752672, + "step": 48275 + }, + { + "epoch": 5.376990756208932, + "grad_norm": 0.08198585361242294, + "learning_rate": 4.578218331066163e-05, + "loss": 0.0233, + "num_input_tokens_seen": 58758976, + "step": 48280 + }, + { + "epoch": 5.377547611092549, + "grad_norm": 0.1442352831363678, + "learning_rate": 4.5780832659919434e-05, + "loss": 0.0455, + "num_input_tokens_seen": 58765312, + "step": 48285 + }, + { + "epoch": 5.3781044659761665, + "grad_norm": 0.00908520445227623, + "learning_rate": 4.577948181288562e-05, + "loss": 0.0068, + "num_input_tokens_seen": 58771488, + "step": 48290 + }, + { + "epoch": 5.378661320859784, + "grad_norm": 1.2762888669967651, + "learning_rate": 4.577813076957295e-05, + "loss": 0.0451, + "num_input_tokens_seen": 58777728, + "step": 48295 + }, + { + "epoch": 5.379218175743401, + "grad_norm": 1.2591969966888428, + "learning_rate": 4.5776779529994195e-05, + "loss": 0.1146, + "num_input_tokens_seen": 58784032, + "step": 48300 + }, + { + "epoch": 5.379775030627019, + "grad_norm": 3.153069019317627, + "learning_rate": 4.577542809416211e-05, + "loss": 0.1014, + "num_input_tokens_seen": 58790112, + "step": 48305 + }, + { + "epoch": 5.3803318855106355, + "grad_norm": 0.7867346405982971, + "learning_rate": 4.577407646208945e-05, + "loss": 0.0234, + "num_input_tokens_seen": 58796256, + "step": 48310 + }, + { + "epoch": 5.380888740394253, + "grad_norm": 1.6838330030441284, + "learning_rate": 4.577272463378901e-05, + "loss": 0.1464, + "num_input_tokens_seen": 58802464, + "step": 48315 + }, + { + "epoch": 5.381445595277871, + "grad_norm": 0.0430801622569561, + "learning_rate": 4.5771372609273533e-05, + "loss": 0.0511, + "num_input_tokens_seen": 58808608, + "step": 48320 + }, + { + "epoch": 5.382002450161488, + "grad_norm": 0.31893327832221985, + "learning_rate": 4.5770020388555804e-05, + "loss": 0.0429, + "num_input_tokens_seen": 58814336, + "step": 48325 + }, + { + "epoch": 5.382559305045105, + "grad_norm": 0.0008425365085713565, + "learning_rate": 4.5768667971648595e-05, + "loss": 0.082, + "num_input_tokens_seen": 58820512, + "step": 48330 + }, + { + "epoch": 5.383116159928723, + "grad_norm": 0.007699587382376194, + "learning_rate": 4.5767315358564665e-05, + "loss": 0.1413, + "num_input_tokens_seen": 58826688, + "step": 48335 + }, + { + "epoch": 5.38367301481234, + "grad_norm": 0.0011011287569999695, + "learning_rate": 4.576596254931681e-05, + "loss": 0.0237, + "num_input_tokens_seen": 58832928, + "step": 48340 + }, + { + "epoch": 5.384229869695957, + "grad_norm": 1.9377025365829468, + "learning_rate": 4.57646095439178e-05, + "loss": 0.0725, + "num_input_tokens_seen": 58838976, + "step": 48345 + }, + { + "epoch": 5.384786724579574, + "grad_norm": 0.81404709815979, + "learning_rate": 4.576325634238042e-05, + "loss": 0.0346, + "num_input_tokens_seen": 58845056, + "step": 48350 + }, + { + "epoch": 5.385343579463192, + "grad_norm": 0.1299637258052826, + "learning_rate": 4.576190294471745e-05, + "loss": 0.0461, + "num_input_tokens_seen": 58850816, + "step": 48355 + }, + { + "epoch": 5.3859004343468095, + "grad_norm": 0.20220352709293365, + "learning_rate": 4.576054935094167e-05, + "loss": 0.1722, + "num_input_tokens_seen": 58857152, + "step": 48360 + }, + { + "epoch": 5.386457289230426, + "grad_norm": 0.5899999737739563, + "learning_rate": 4.575919556106587e-05, + "loss": 0.0345, + "num_input_tokens_seen": 58863392, + "step": 48365 + }, + { + "epoch": 5.387014144114044, + "grad_norm": 0.21934212744235992, + "learning_rate": 4.5757841575102835e-05, + "loss": 0.1458, + "num_input_tokens_seen": 58869728, + "step": 48370 + }, + { + "epoch": 5.387570998997662, + "grad_norm": 0.0596204474568367, + "learning_rate": 4.5756487393065354e-05, + "loss": 0.0717, + "num_input_tokens_seen": 58875776, + "step": 48375 + }, + { + "epoch": 5.3881278538812785, + "grad_norm": 3.0147135257720947, + "learning_rate": 4.5755133014966214e-05, + "loss": 0.135, + "num_input_tokens_seen": 58881952, + "step": 48380 + }, + { + "epoch": 5.388684708764896, + "grad_norm": 0.22665531933307648, + "learning_rate": 4.5753778440818215e-05, + "loss": 0.0744, + "num_input_tokens_seen": 58887680, + "step": 48385 + }, + { + "epoch": 5.389241563648513, + "grad_norm": 0.012403295375406742, + "learning_rate": 4.575242367063416e-05, + "loss": 0.0061, + "num_input_tokens_seen": 58893504, + "step": 48390 + }, + { + "epoch": 5.389798418532131, + "grad_norm": 0.7510777711868286, + "learning_rate": 4.5751068704426835e-05, + "loss": 0.0319, + "num_input_tokens_seen": 58899616, + "step": 48395 + }, + { + "epoch": 5.390355273415748, + "grad_norm": 0.0076893409714102745, + "learning_rate": 4.574971354220904e-05, + "loss": 0.0976, + "num_input_tokens_seen": 58905440, + "step": 48400 + }, + { + "epoch": 5.390912128299365, + "grad_norm": 1.370975136756897, + "learning_rate": 4.574835818399357e-05, + "loss": 0.0866, + "num_input_tokens_seen": 58911712, + "step": 48405 + }, + { + "epoch": 5.391468983182983, + "grad_norm": 0.0022710401099175215, + "learning_rate": 4.574700262979324e-05, + "loss": 0.0429, + "num_input_tokens_seen": 58917632, + "step": 48410 + }, + { + "epoch": 5.3920258380665995, + "grad_norm": 0.007964340969920158, + "learning_rate": 4.574564687962084e-05, + "loss": 0.02, + "num_input_tokens_seen": 58924032, + "step": 48415 + }, + { + "epoch": 5.392582692950217, + "grad_norm": 0.0021682297810912132, + "learning_rate": 4.5744290933489195e-05, + "loss": 0.0517, + "num_input_tokens_seen": 58930368, + "step": 48420 + }, + { + "epoch": 5.393139547833835, + "grad_norm": 0.17514300346374512, + "learning_rate": 4.57429347914111e-05, + "loss": 0.0333, + "num_input_tokens_seen": 58936800, + "step": 48425 + }, + { + "epoch": 5.393696402717452, + "grad_norm": 0.19920752942562103, + "learning_rate": 4.5741578453399365e-05, + "loss": 0.0086, + "num_input_tokens_seen": 58943264, + "step": 48430 + }, + { + "epoch": 5.394253257601069, + "grad_norm": 0.30571889877319336, + "learning_rate": 4.57402219194668e-05, + "loss": 0.0379, + "num_input_tokens_seen": 58949440, + "step": 48435 + }, + { + "epoch": 5.394810112484686, + "grad_norm": 0.3769049644470215, + "learning_rate": 4.573886518962622e-05, + "loss": 0.0235, + "num_input_tokens_seen": 58955680, + "step": 48440 + }, + { + "epoch": 5.395366967368304, + "grad_norm": 1.0213828086853027, + "learning_rate": 4.5737508263890454e-05, + "loss": 0.1316, + "num_input_tokens_seen": 58962112, + "step": 48445 + }, + { + "epoch": 5.3959238222519215, + "grad_norm": 0.10788838565349579, + "learning_rate": 4.57361511422723e-05, + "loss": 0.0668, + "num_input_tokens_seen": 58968416, + "step": 48450 + }, + { + "epoch": 5.396480677135538, + "grad_norm": 0.884659469127655, + "learning_rate": 4.5734793824784586e-05, + "loss": 0.059, + "num_input_tokens_seen": 58974432, + "step": 48455 + }, + { + "epoch": 5.397037532019156, + "grad_norm": 0.0019131106091663241, + "learning_rate": 4.5733436311440134e-05, + "loss": 0.0706, + "num_input_tokens_seen": 58980672, + "step": 48460 + }, + { + "epoch": 5.397594386902773, + "grad_norm": 0.20130321383476257, + "learning_rate": 4.573207860225176e-05, + "loss": 0.0479, + "num_input_tokens_seen": 58986560, + "step": 48465 + }, + { + "epoch": 5.39815124178639, + "grad_norm": 0.029118159785866737, + "learning_rate": 4.57307206972323e-05, + "loss": 0.077, + "num_input_tokens_seen": 58992960, + "step": 48470 + }, + { + "epoch": 5.398708096670008, + "grad_norm": 0.0013867456000298262, + "learning_rate": 4.572936259639457e-05, + "loss": 0.0303, + "num_input_tokens_seen": 58999200, + "step": 48475 + }, + { + "epoch": 5.399264951553625, + "grad_norm": 0.8324337601661682, + "learning_rate": 4.5728004299751405e-05, + "loss": 0.1117, + "num_input_tokens_seen": 59004832, + "step": 48480 + }, + { + "epoch": 5.3998218064372425, + "grad_norm": 0.12327950447797775, + "learning_rate": 4.5726645807315627e-05, + "loss": 0.0222, + "num_input_tokens_seen": 59010880, + "step": 48485 + }, + { + "epoch": 5.40037866132086, + "grad_norm": 0.21492356061935425, + "learning_rate": 4.572528711910008e-05, + "loss": 0.0704, + "num_input_tokens_seen": 59017120, + "step": 48490 + }, + { + "epoch": 5.400935516204477, + "grad_norm": 0.012689611874520779, + "learning_rate": 4.5723928235117584e-05, + "loss": 0.0447, + "num_input_tokens_seen": 59023392, + "step": 48495 + }, + { + "epoch": 5.401492371088095, + "grad_norm": 0.6568596363067627, + "learning_rate": 4.572256915538098e-05, + "loss": 0.0352, + "num_input_tokens_seen": 59029536, + "step": 48500 + }, + { + "epoch": 5.402049225971711, + "grad_norm": 0.8890869617462158, + "learning_rate": 4.5721209879903114e-05, + "loss": 0.0666, + "num_input_tokens_seen": 59035616, + "step": 48505 + }, + { + "epoch": 5.402606080855329, + "grad_norm": 0.059218913316726685, + "learning_rate": 4.571985040869682e-05, + "loss": 0.1866, + "num_input_tokens_seen": 59041792, + "step": 48510 + }, + { + "epoch": 5.403162935738947, + "grad_norm": 0.8682202696800232, + "learning_rate": 4.571849074177494e-05, + "loss": 0.0918, + "num_input_tokens_seen": 59047680, + "step": 48515 + }, + { + "epoch": 5.403719790622564, + "grad_norm": 0.038164399564266205, + "learning_rate": 4.571713087915031e-05, + "loss": 0.1559, + "num_input_tokens_seen": 59053760, + "step": 48520 + }, + { + "epoch": 5.404276645506181, + "grad_norm": 0.2918972373008728, + "learning_rate": 4.571577082083578e-05, + "loss": 0.0251, + "num_input_tokens_seen": 59059872, + "step": 48525 + }, + { + "epoch": 5.404833500389798, + "grad_norm": 0.16080158948898315, + "learning_rate": 4.5714410566844204e-05, + "loss": 0.1011, + "num_input_tokens_seen": 59065856, + "step": 48530 + }, + { + "epoch": 5.405390355273416, + "grad_norm": 0.18745480477809906, + "learning_rate": 4.571305011718842e-05, + "loss": 0.0316, + "num_input_tokens_seen": 59071936, + "step": 48535 + }, + { + "epoch": 5.405947210157033, + "grad_norm": 1.2919723987579346, + "learning_rate": 4.5711689471881284e-05, + "loss": 0.0858, + "num_input_tokens_seen": 59077632, + "step": 48540 + }, + { + "epoch": 5.40650406504065, + "grad_norm": 0.843101441860199, + "learning_rate": 4.571032863093565e-05, + "loss": 0.0688, + "num_input_tokens_seen": 59083680, + "step": 48545 + }, + { + "epoch": 5.407060919924268, + "grad_norm": 1.1759055852890015, + "learning_rate": 4.570896759436436e-05, + "loss": 0.0535, + "num_input_tokens_seen": 59090016, + "step": 48550 + }, + { + "epoch": 5.4076177748078855, + "grad_norm": 0.05632631480693817, + "learning_rate": 4.570760636218029e-05, + "loss": 0.0208, + "num_input_tokens_seen": 59096256, + "step": 48555 + }, + { + "epoch": 5.408174629691502, + "grad_norm": 0.1211080551147461, + "learning_rate": 4.570624493439628e-05, + "loss": 0.1331, + "num_input_tokens_seen": 59102112, + "step": 48560 + }, + { + "epoch": 5.40873148457512, + "grad_norm": 0.12450235337018967, + "learning_rate": 4.5704883311025196e-05, + "loss": 0.042, + "num_input_tokens_seen": 59108448, + "step": 48565 + }, + { + "epoch": 5.409288339458737, + "grad_norm": 0.7867884635925293, + "learning_rate": 4.5703521492079906e-05, + "loss": 0.0435, + "num_input_tokens_seen": 59114656, + "step": 48570 + }, + { + "epoch": 5.4098451943423544, + "grad_norm": 1.195865273475647, + "learning_rate": 4.570215947757326e-05, + "loss": 0.1391, + "num_input_tokens_seen": 59121056, + "step": 48575 + }, + { + "epoch": 5.410402049225972, + "grad_norm": 1.4032906293869019, + "learning_rate": 4.5700797267518144e-05, + "loss": 0.0857, + "num_input_tokens_seen": 59127232, + "step": 48580 + }, + { + "epoch": 5.410958904109589, + "grad_norm": 0.015079149045050144, + "learning_rate": 4.56994348619274e-05, + "loss": 0.0921, + "num_input_tokens_seen": 59133888, + "step": 48585 + }, + { + "epoch": 5.411515758993207, + "grad_norm": 0.5232257843017578, + "learning_rate": 4.569807226081392e-05, + "loss": 0.0932, + "num_input_tokens_seen": 59140000, + "step": 48590 + }, + { + "epoch": 5.412072613876823, + "grad_norm": 0.0003235588374081999, + "learning_rate": 4.569670946419056e-05, + "loss": 0.0264, + "num_input_tokens_seen": 59146240, + "step": 48595 + }, + { + "epoch": 5.412629468760441, + "grad_norm": 0.00019691232591867447, + "learning_rate": 4.56953464720702e-05, + "loss": 0.0037, + "num_input_tokens_seen": 59152512, + "step": 48600 + }, + { + "epoch": 5.413186323644059, + "grad_norm": 0.322732537984848, + "learning_rate": 4.56939832844657e-05, + "loss": 0.0501, + "num_input_tokens_seen": 59158272, + "step": 48605 + }, + { + "epoch": 5.4137431785276755, + "grad_norm": 0.01039969827979803, + "learning_rate": 4.569261990138997e-05, + "loss": 0.0892, + "num_input_tokens_seen": 59164192, + "step": 48610 + }, + { + "epoch": 5.414300033411293, + "grad_norm": 1.2579302787780762, + "learning_rate": 4.5691256322855844e-05, + "loss": 0.1233, + "num_input_tokens_seen": 59170304, + "step": 48615 + }, + { + "epoch": 5.41485688829491, + "grad_norm": 0.29234009981155396, + "learning_rate": 4.5689892548876234e-05, + "loss": 0.1134, + "num_input_tokens_seen": 59176512, + "step": 48620 + }, + { + "epoch": 5.415413743178528, + "grad_norm": 0.00965945329517126, + "learning_rate": 4.568852857946401e-05, + "loss": 0.0954, + "num_input_tokens_seen": 59182592, + "step": 48625 + }, + { + "epoch": 5.415970598062145, + "grad_norm": 0.00489766662940383, + "learning_rate": 4.568716441463207e-05, + "loss": 0.037, + "num_input_tokens_seen": 59188608, + "step": 48630 + }, + { + "epoch": 5.416527452945762, + "grad_norm": 1.6661032438278198, + "learning_rate": 4.5685800054393267e-05, + "loss": 0.1312, + "num_input_tokens_seen": 59194752, + "step": 48635 + }, + { + "epoch": 5.41708430782938, + "grad_norm": 0.07646451890468597, + "learning_rate": 4.5684435498760526e-05, + "loss": 0.0308, + "num_input_tokens_seen": 59201088, + "step": 48640 + }, + { + "epoch": 5.417641162712997, + "grad_norm": 0.40095531940460205, + "learning_rate": 4.5683070747746714e-05, + "loss": 0.0133, + "num_input_tokens_seen": 59206880, + "step": 48645 + }, + { + "epoch": 5.418198017596614, + "grad_norm": 0.4841446876525879, + "learning_rate": 4.5681705801364726e-05, + "loss": 0.0239, + "num_input_tokens_seen": 59212992, + "step": 48650 + }, + { + "epoch": 5.418754872480232, + "grad_norm": 0.3044086694717407, + "learning_rate": 4.5680340659627456e-05, + "loss": 0.0526, + "num_input_tokens_seen": 59219296, + "step": 48655 + }, + { + "epoch": 5.419311727363849, + "grad_norm": 0.01889517530798912, + "learning_rate": 4.56789753225478e-05, + "loss": 0.0162, + "num_input_tokens_seen": 59225536, + "step": 48660 + }, + { + "epoch": 5.419868582247466, + "grad_norm": 0.0018473787931725383, + "learning_rate": 4.567760979013866e-05, + "loss": 0.0655, + "num_input_tokens_seen": 59231968, + "step": 48665 + }, + { + "epoch": 5.420425437131084, + "grad_norm": 0.450682133436203, + "learning_rate": 4.5676244062412933e-05, + "loss": 0.2058, + "num_input_tokens_seen": 59238528, + "step": 48670 + }, + { + "epoch": 5.420982292014701, + "grad_norm": 0.7435659170150757, + "learning_rate": 4.5674878139383505e-05, + "loss": 0.0469, + "num_input_tokens_seen": 59244736, + "step": 48675 + }, + { + "epoch": 5.4215391468983185, + "grad_norm": 0.791996955871582, + "learning_rate": 4.567351202106329e-05, + "loss": 0.0554, + "num_input_tokens_seen": 59250784, + "step": 48680 + }, + { + "epoch": 5.422096001781935, + "grad_norm": 0.473093718290329, + "learning_rate": 4.567214570746518e-05, + "loss": 0.0779, + "num_input_tokens_seen": 59256736, + "step": 48685 + }, + { + "epoch": 5.422652856665553, + "grad_norm": 0.31230926513671875, + "learning_rate": 4.567077919860211e-05, + "loss": 0.0586, + "num_input_tokens_seen": 59262848, + "step": 48690 + }, + { + "epoch": 5.423209711549171, + "grad_norm": 0.5137503147125244, + "learning_rate": 4.5669412494486965e-05, + "loss": 0.0941, + "num_input_tokens_seen": 59268832, + "step": 48695 + }, + { + "epoch": 5.423766566432787, + "grad_norm": 0.08282146602869034, + "learning_rate": 4.566804559513266e-05, + "loss": 0.0473, + "num_input_tokens_seen": 59274816, + "step": 48700 + }, + { + "epoch": 5.424323421316405, + "grad_norm": 0.15376323461532593, + "learning_rate": 4.56666785005521e-05, + "loss": 0.049, + "num_input_tokens_seen": 59280896, + "step": 48705 + }, + { + "epoch": 5.424880276200022, + "grad_norm": 0.8919669389724731, + "learning_rate": 4.566531121075821e-05, + "loss": 0.1207, + "num_input_tokens_seen": 59286976, + "step": 48710 + }, + { + "epoch": 5.42543713108364, + "grad_norm": 0.27922141551971436, + "learning_rate": 4.566394372576389e-05, + "loss": 0.075, + "num_input_tokens_seen": 59293088, + "step": 48715 + }, + { + "epoch": 5.425993985967257, + "grad_norm": 1.5440462827682495, + "learning_rate": 4.5662576045582075e-05, + "loss": 0.1568, + "num_input_tokens_seen": 59298816, + "step": 48720 + }, + { + "epoch": 5.426550840850874, + "grad_norm": 0.0689963549375534, + "learning_rate": 4.566120817022567e-05, + "loss": 0.0807, + "num_input_tokens_seen": 59304992, + "step": 48725 + }, + { + "epoch": 5.427107695734492, + "grad_norm": 0.08289897441864014, + "learning_rate": 4.56598400997076e-05, + "loss": 0.0899, + "num_input_tokens_seen": 59310912, + "step": 48730 + }, + { + "epoch": 5.427664550618109, + "grad_norm": 0.24299535155296326, + "learning_rate": 4.565847183404079e-05, + "loss": 0.0996, + "num_input_tokens_seen": 59316576, + "step": 48735 + }, + { + "epoch": 5.428221405501726, + "grad_norm": 0.002239115070551634, + "learning_rate": 4.565710337323816e-05, + "loss": 0.027, + "num_input_tokens_seen": 59322656, + "step": 48740 + }, + { + "epoch": 5.428778260385344, + "grad_norm": 0.15393291413784027, + "learning_rate": 4.565573471731264e-05, + "loss": 0.0071, + "num_input_tokens_seen": 59328736, + "step": 48745 + }, + { + "epoch": 5.429335115268961, + "grad_norm": 0.4441990256309509, + "learning_rate": 4.5654365866277155e-05, + "loss": 0.075, + "num_input_tokens_seen": 59335008, + "step": 48750 + }, + { + "epoch": 5.429891970152578, + "grad_norm": 0.6697094440460205, + "learning_rate": 4.565299682014463e-05, + "loss": 0.0875, + "num_input_tokens_seen": 59341056, + "step": 48755 + }, + { + "epoch": 5.430448825036196, + "grad_norm": 1.0309182405471802, + "learning_rate": 4.5651627578928014e-05, + "loss": 0.013, + "num_input_tokens_seen": 59347328, + "step": 48760 + }, + { + "epoch": 5.431005679919813, + "grad_norm": 0.11916855722665787, + "learning_rate": 4.565025814264023e-05, + "loss": 0.0913, + "num_input_tokens_seen": 59353280, + "step": 48765 + }, + { + "epoch": 5.43156253480343, + "grad_norm": 0.08161336183547974, + "learning_rate": 4.564888851129421e-05, + "loss": 0.0903, + "num_input_tokens_seen": 59358912, + "step": 48770 + }, + { + "epoch": 5.432119389687047, + "grad_norm": 0.06107725948095322, + "learning_rate": 4.564751868490289e-05, + "loss": 0.0833, + "num_input_tokens_seen": 59365024, + "step": 48775 + }, + { + "epoch": 5.432676244570665, + "grad_norm": 0.05117762088775635, + "learning_rate": 4.5646148663479215e-05, + "loss": 0.0261, + "num_input_tokens_seen": 59371168, + "step": 48780 + }, + { + "epoch": 5.433233099454283, + "grad_norm": 0.00847088173031807, + "learning_rate": 4.564477844703613e-05, + "loss": 0.0469, + "num_input_tokens_seen": 59377376, + "step": 48785 + }, + { + "epoch": 5.433789954337899, + "grad_norm": 1.9196091890335083, + "learning_rate": 4.5643408035586564e-05, + "loss": 0.134, + "num_input_tokens_seen": 59383296, + "step": 48790 + }, + { + "epoch": 5.434346809221517, + "grad_norm": 0.1737958937883377, + "learning_rate": 4.564203742914348e-05, + "loss": 0.1252, + "num_input_tokens_seen": 59389568, + "step": 48795 + }, + { + "epoch": 5.434903664105134, + "grad_norm": 2.2825827598571777, + "learning_rate": 4.5640666627719805e-05, + "loss": 0.1213, + "num_input_tokens_seen": 59395776, + "step": 48800 + }, + { + "epoch": 5.4354605189887515, + "grad_norm": 0.4691178500652313, + "learning_rate": 4.563929563132851e-05, + "loss": 0.061, + "num_input_tokens_seen": 59401760, + "step": 48805 + }, + { + "epoch": 5.436017373872369, + "grad_norm": 0.8801186084747314, + "learning_rate": 4.5637924439982514e-05, + "loss": 0.0264, + "num_input_tokens_seen": 59408160, + "step": 48810 + }, + { + "epoch": 5.436574228755986, + "grad_norm": 0.04539816826581955, + "learning_rate": 4.5636553053694795e-05, + "loss": 0.0352, + "num_input_tokens_seen": 59414272, + "step": 48815 + }, + { + "epoch": 5.437131083639604, + "grad_norm": 0.29645246267318726, + "learning_rate": 4.56351814724783e-05, + "loss": 0.0965, + "num_input_tokens_seen": 59420256, + "step": 48820 + }, + { + "epoch": 5.43768793852322, + "grad_norm": 0.31634366512298584, + "learning_rate": 4.5633809696345984e-05, + "loss": 0.0287, + "num_input_tokens_seen": 59426528, + "step": 48825 + }, + { + "epoch": 5.438244793406838, + "grad_norm": 0.0589999221265316, + "learning_rate": 4.56324377253108e-05, + "loss": 0.0319, + "num_input_tokens_seen": 59432768, + "step": 48830 + }, + { + "epoch": 5.438801648290456, + "grad_norm": 0.2025478482246399, + "learning_rate": 4.563106555938571e-05, + "loss": 0.0197, + "num_input_tokens_seen": 59438848, + "step": 48835 + }, + { + "epoch": 5.4393585031740725, + "grad_norm": 0.6766207814216614, + "learning_rate": 4.562969319858368e-05, + "loss": 0.1006, + "num_input_tokens_seen": 59444896, + "step": 48840 + }, + { + "epoch": 5.43991535805769, + "grad_norm": 0.7451356649398804, + "learning_rate": 4.562832064291767e-05, + "loss": 0.1093, + "num_input_tokens_seen": 59450816, + "step": 48845 + }, + { + "epoch": 5.440472212941308, + "grad_norm": 0.09998700022697449, + "learning_rate": 4.5626947892400646e-05, + "loss": 0.11, + "num_input_tokens_seen": 59456736, + "step": 48850 + }, + { + "epoch": 5.441029067824925, + "grad_norm": 0.06000343710184097, + "learning_rate": 4.562557494704557e-05, + "loss": 0.1007, + "num_input_tokens_seen": 59463104, + "step": 48855 + }, + { + "epoch": 5.441585922708542, + "grad_norm": 0.4175912141799927, + "learning_rate": 4.562420180686542e-05, + "loss": 0.0738, + "num_input_tokens_seen": 59469344, + "step": 48860 + }, + { + "epoch": 5.442142777592159, + "grad_norm": 0.038711924105882645, + "learning_rate": 4.562282847187315e-05, + "loss": 0.0398, + "num_input_tokens_seen": 59475520, + "step": 48865 + }, + { + "epoch": 5.442699632475777, + "grad_norm": 0.18564648926258087, + "learning_rate": 4.5621454942081744e-05, + "loss": 0.1565, + "num_input_tokens_seen": 59481792, + "step": 48870 + }, + { + "epoch": 5.4432564873593945, + "grad_norm": 0.0014441597741097212, + "learning_rate": 4.562008121750418e-05, + "loss": 0.013, + "num_input_tokens_seen": 59487904, + "step": 48875 + }, + { + "epoch": 5.443813342243011, + "grad_norm": 0.11369484663009644, + "learning_rate": 4.561870729815343e-05, + "loss": 0.0649, + "num_input_tokens_seen": 59493856, + "step": 48880 + }, + { + "epoch": 5.444370197126629, + "grad_norm": 1.629340410232544, + "learning_rate": 4.561733318404246e-05, + "loss": 0.1338, + "num_input_tokens_seen": 59499488, + "step": 48885 + }, + { + "epoch": 5.444927052010246, + "grad_norm": 2.221972942352295, + "learning_rate": 4.561595887518427e-05, + "loss": 0.0761, + "num_input_tokens_seen": 59505504, + "step": 48890 + }, + { + "epoch": 5.445483906893863, + "grad_norm": 1.036274790763855, + "learning_rate": 4.561458437159183e-05, + "loss": 0.0559, + "num_input_tokens_seen": 59511168, + "step": 48895 + }, + { + "epoch": 5.446040761777481, + "grad_norm": 0.07242754101753235, + "learning_rate": 4.5613209673278114e-05, + "loss": 0.0239, + "num_input_tokens_seen": 59517312, + "step": 48900 + }, + { + "epoch": 5.446597616661098, + "grad_norm": 0.0003790182527154684, + "learning_rate": 4.5611834780256125e-05, + "loss": 0.0553, + "num_input_tokens_seen": 59523520, + "step": 48905 + }, + { + "epoch": 5.4471544715447155, + "grad_norm": 0.038913995027542114, + "learning_rate": 4.561045969253884e-05, + "loss": 0.1038, + "num_input_tokens_seen": 59529056, + "step": 48910 + }, + { + "epoch": 5.447711326428333, + "grad_norm": 0.07723360508680344, + "learning_rate": 4.5609084410139255e-05, + "loss": 0.0189, + "num_input_tokens_seen": 59535136, + "step": 48915 + }, + { + "epoch": 5.44826818131195, + "grad_norm": 0.2076903134584427, + "learning_rate": 4.560770893307036e-05, + "loss": 0.0177, + "num_input_tokens_seen": 59540992, + "step": 48920 + }, + { + "epoch": 5.448825036195568, + "grad_norm": 0.6225215792655945, + "learning_rate": 4.560633326134513e-05, + "loss": 0.0783, + "num_input_tokens_seen": 59546432, + "step": 48925 + }, + { + "epoch": 5.4493818910791845, + "grad_norm": 0.5666085481643677, + "learning_rate": 4.560495739497659e-05, + "loss": 0.1193, + "num_input_tokens_seen": 59552576, + "step": 48930 + }, + { + "epoch": 5.449938745962802, + "grad_norm": 0.031082477420568466, + "learning_rate": 4.5603581333977705e-05, + "loss": 0.0214, + "num_input_tokens_seen": 59559008, + "step": 48935 + }, + { + "epoch": 5.45049560084642, + "grad_norm": 0.1750490814447403, + "learning_rate": 4.5602205078361494e-05, + "loss": 0.0274, + "num_input_tokens_seen": 59565536, + "step": 48940 + }, + { + "epoch": 5.451052455730037, + "grad_norm": 0.04657493159174919, + "learning_rate": 4.560082862814095e-05, + "loss": 0.0272, + "num_input_tokens_seen": 59571456, + "step": 48945 + }, + { + "epoch": 5.451609310613654, + "grad_norm": 0.09914004057645798, + "learning_rate": 4.559945198332907e-05, + "loss": 0.0261, + "num_input_tokens_seen": 59577600, + "step": 48950 + }, + { + "epoch": 5.452166165497271, + "grad_norm": 0.2896537184715271, + "learning_rate": 4.5598075143938855e-05, + "loss": 0.2128, + "num_input_tokens_seen": 59583520, + "step": 48955 + }, + { + "epoch": 5.452723020380889, + "grad_norm": 0.0468868613243103, + "learning_rate": 4.559669810998333e-05, + "loss": 0.0996, + "num_input_tokens_seen": 59589824, + "step": 48960 + }, + { + "epoch": 5.453279875264506, + "grad_norm": 1.7965723276138306, + "learning_rate": 4.5595320881475484e-05, + "loss": 0.072, + "num_input_tokens_seen": 59595680, + "step": 48965 + }, + { + "epoch": 5.453836730148123, + "grad_norm": 0.0758836567401886, + "learning_rate": 4.5593943458428334e-05, + "loss": 0.0514, + "num_input_tokens_seen": 59601664, + "step": 48970 + }, + { + "epoch": 5.454393585031741, + "grad_norm": 0.010555950924754143, + "learning_rate": 4.5592565840854884e-05, + "loss": 0.0311, + "num_input_tokens_seen": 59607904, + "step": 48975 + }, + { + "epoch": 5.454950439915358, + "grad_norm": 0.009611374698579311, + "learning_rate": 4.559118802876816e-05, + "loss": 0.0268, + "num_input_tokens_seen": 59614208, + "step": 48980 + }, + { + "epoch": 5.455507294798975, + "grad_norm": 0.011567238718271255, + "learning_rate": 4.558981002218116e-05, + "loss": 0.1372, + "num_input_tokens_seen": 59620512, + "step": 48985 + }, + { + "epoch": 5.456064149682593, + "grad_norm": 0.488294392824173, + "learning_rate": 4.558843182110691e-05, + "loss": 0.0495, + "num_input_tokens_seen": 59626528, + "step": 48990 + }, + { + "epoch": 5.45662100456621, + "grad_norm": 3.0013177394866943, + "learning_rate": 4.558705342555843e-05, + "loss": 0.0779, + "num_input_tokens_seen": 59632768, + "step": 48995 + }, + { + "epoch": 5.4571778594498275, + "grad_norm": 0.10430476814508438, + "learning_rate": 4.558567483554873e-05, + "loss": 0.123, + "num_input_tokens_seen": 59638976, + "step": 49000 + }, + { + "epoch": 5.457734714333444, + "grad_norm": 0.03805631026625633, + "learning_rate": 4.5584296051090844e-05, + "loss": 0.0423, + "num_input_tokens_seen": 59644992, + "step": 49005 + }, + { + "epoch": 5.458291569217062, + "grad_norm": 0.0005658043082803488, + "learning_rate": 4.558291707219778e-05, + "loss": 0.0811, + "num_input_tokens_seen": 59651232, + "step": 49010 + }, + { + "epoch": 5.45884842410068, + "grad_norm": 1.432328701019287, + "learning_rate": 4.558153789888259e-05, + "loss": 0.1496, + "num_input_tokens_seen": 59657216, + "step": 49015 + }, + { + "epoch": 5.459405278984296, + "grad_norm": 0.8742478489875793, + "learning_rate": 4.558015853115827e-05, + "loss": 0.0734, + "num_input_tokens_seen": 59663424, + "step": 49020 + }, + { + "epoch": 5.459962133867914, + "grad_norm": 0.24764028191566467, + "learning_rate": 4.557877896903787e-05, + "loss": 0.042, + "num_input_tokens_seen": 59669312, + "step": 49025 + }, + { + "epoch": 5.460518988751532, + "grad_norm": 0.6432569026947021, + "learning_rate": 4.5577399212534416e-05, + "loss": 0.0282, + "num_input_tokens_seen": 59675616, + "step": 49030 + }, + { + "epoch": 5.4610758436351485, + "grad_norm": 0.4419097304344177, + "learning_rate": 4.557601926166094e-05, + "loss": 0.0559, + "num_input_tokens_seen": 59681440, + "step": 49035 + }, + { + "epoch": 5.461632698518766, + "grad_norm": 0.16881127655506134, + "learning_rate": 4.5574639116430475e-05, + "loss": 0.0815, + "num_input_tokens_seen": 59687616, + "step": 49040 + }, + { + "epoch": 5.462189553402383, + "grad_norm": 0.4826715886592865, + "learning_rate": 4.557325877685606e-05, + "loss": 0.1033, + "num_input_tokens_seen": 59693536, + "step": 49045 + }, + { + "epoch": 5.462746408286001, + "grad_norm": 0.7505033612251282, + "learning_rate": 4.557187824295073e-05, + "loss": 0.0601, + "num_input_tokens_seen": 59699680, + "step": 49050 + }, + { + "epoch": 5.463303263169618, + "grad_norm": 0.021834461018443108, + "learning_rate": 4.5570497514727535e-05, + "loss": 0.143, + "num_input_tokens_seen": 59705824, + "step": 49055 + }, + { + "epoch": 5.463860118053235, + "grad_norm": 0.8281425833702087, + "learning_rate": 4.55691165921995e-05, + "loss": 0.0724, + "num_input_tokens_seen": 59711936, + "step": 49060 + }, + { + "epoch": 5.464416972936853, + "grad_norm": 0.6297422051429749, + "learning_rate": 4.5567735475379694e-05, + "loss": 0.1866, + "num_input_tokens_seen": 59717984, + "step": 49065 + }, + { + "epoch": 5.46497382782047, + "grad_norm": 0.18172769248485565, + "learning_rate": 4.556635416428113e-05, + "loss": 0.0678, + "num_input_tokens_seen": 59724032, + "step": 49070 + }, + { + "epoch": 5.465530682704087, + "grad_norm": 0.10620269179344177, + "learning_rate": 4.556497265891689e-05, + "loss": 0.0264, + "num_input_tokens_seen": 59730336, + "step": 49075 + }, + { + "epoch": 5.466087537587705, + "grad_norm": 0.4459696114063263, + "learning_rate": 4.556359095929999e-05, + "loss": 0.1137, + "num_input_tokens_seen": 59735680, + "step": 49080 + }, + { + "epoch": 5.466644392471322, + "grad_norm": 0.01896919496357441, + "learning_rate": 4.556220906544352e-05, + "loss": 0.0852, + "num_input_tokens_seen": 59741888, + "step": 49085 + }, + { + "epoch": 5.467201247354939, + "grad_norm": 0.007256154902279377, + "learning_rate": 4.5560826977360496e-05, + "loss": 0.0259, + "num_input_tokens_seen": 59747968, + "step": 49090 + }, + { + "epoch": 5.467758102238557, + "grad_norm": 0.5678809881210327, + "learning_rate": 4.555944469506399e-05, + "loss": 0.0516, + "num_input_tokens_seen": 59754016, + "step": 49095 + }, + { + "epoch": 5.468314957122174, + "grad_norm": 0.5742924213409424, + "learning_rate": 4.5558062218567056e-05, + "loss": 0.0705, + "num_input_tokens_seen": 59760032, + "step": 49100 + }, + { + "epoch": 5.4688718120057915, + "grad_norm": 0.028681326657533646, + "learning_rate": 4.5556679547882757e-05, + "loss": 0.0946, + "num_input_tokens_seen": 59766304, + "step": 49105 + }, + { + "epoch": 5.469428666889408, + "grad_norm": 0.8096608519554138, + "learning_rate": 4.555529668302415e-05, + "loss": 0.0598, + "num_input_tokens_seen": 59772480, + "step": 49110 + }, + { + "epoch": 5.469985521773026, + "grad_norm": 2.339320421218872, + "learning_rate": 4.555391362400429e-05, + "loss": 0.2306, + "num_input_tokens_seen": 59778784, + "step": 49115 + }, + { + "epoch": 5.470542376656644, + "grad_norm": 0.309925377368927, + "learning_rate": 4.555253037083625e-05, + "loss": 0.0766, + "num_input_tokens_seen": 59784992, + "step": 49120 + }, + { + "epoch": 5.4710992315402605, + "grad_norm": 0.0285649336874485, + "learning_rate": 4.5551146923533105e-05, + "loss": 0.0504, + "num_input_tokens_seen": 59790880, + "step": 49125 + }, + { + "epoch": 5.471656086423878, + "grad_norm": 0.22373422980308533, + "learning_rate": 4.55497632821079e-05, + "loss": 0.0309, + "num_input_tokens_seen": 59797280, + "step": 49130 + }, + { + "epoch": 5.472212941307495, + "grad_norm": 0.40701964497566223, + "learning_rate": 4.554837944657373e-05, + "loss": 0.0559, + "num_input_tokens_seen": 59803392, + "step": 49135 + }, + { + "epoch": 5.472769796191113, + "grad_norm": 0.5192598700523376, + "learning_rate": 4.554699541694364e-05, + "loss": 0.0417, + "num_input_tokens_seen": 59809664, + "step": 49140 + }, + { + "epoch": 5.47332665107473, + "grad_norm": 0.0028396183624863625, + "learning_rate": 4.554561119323072e-05, + "loss": 0.0145, + "num_input_tokens_seen": 59816000, + "step": 49145 + }, + { + "epoch": 5.473883505958347, + "grad_norm": 0.46896877884864807, + "learning_rate": 4.5544226775448046e-05, + "loss": 0.0567, + "num_input_tokens_seen": 59822208, + "step": 49150 + }, + { + "epoch": 5.474440360841965, + "grad_norm": 0.007666600402444601, + "learning_rate": 4.554284216360869e-05, + "loss": 0.0226, + "num_input_tokens_seen": 59828288, + "step": 49155 + }, + { + "epoch": 5.4749972157255815, + "grad_norm": 0.585334837436676, + "learning_rate": 4.5541457357725724e-05, + "loss": 0.2123, + "num_input_tokens_seen": 59834336, + "step": 49160 + }, + { + "epoch": 5.475554070609199, + "grad_norm": 0.9665658473968506, + "learning_rate": 4.554007235781224e-05, + "loss": 0.1083, + "num_input_tokens_seen": 59839808, + "step": 49165 + }, + { + "epoch": 5.476110925492817, + "grad_norm": 1.0445185899734497, + "learning_rate": 4.5538687163881315e-05, + "loss": 0.0336, + "num_input_tokens_seen": 59845856, + "step": 49170 + }, + { + "epoch": 5.476667780376434, + "grad_norm": 0.41717466711997986, + "learning_rate": 4.553730177594604e-05, + "loss": 0.0814, + "num_input_tokens_seen": 59851552, + "step": 49175 + }, + { + "epoch": 5.477224635260051, + "grad_norm": 0.03048195131123066, + "learning_rate": 4.553591619401949e-05, + "loss": 0.0119, + "num_input_tokens_seen": 59857856, + "step": 49180 + }, + { + "epoch": 5.477781490143668, + "grad_norm": 0.03669988736510277, + "learning_rate": 4.553453041811476e-05, + "loss": 0.0585, + "num_input_tokens_seen": 59863968, + "step": 49185 + }, + { + "epoch": 5.478338345027286, + "grad_norm": 0.08265294879674911, + "learning_rate": 4.5533144448244936e-05, + "loss": 0.0815, + "num_input_tokens_seen": 59869984, + "step": 49190 + }, + { + "epoch": 5.4788951999109035, + "grad_norm": 0.21467998623847961, + "learning_rate": 4.553175828442311e-05, + "loss": 0.2139, + "num_input_tokens_seen": 59875328, + "step": 49195 + }, + { + "epoch": 5.47945205479452, + "grad_norm": 0.022296136245131493, + "learning_rate": 4.553037192666239e-05, + "loss": 0.0225, + "num_input_tokens_seen": 59881440, + "step": 49200 + }, + { + "epoch": 5.480008909678138, + "grad_norm": 0.05399167537689209, + "learning_rate": 4.5528985374975844e-05, + "loss": 0.0855, + "num_input_tokens_seen": 59887456, + "step": 49205 + }, + { + "epoch": 5.480565764561756, + "grad_norm": 0.27467432618141174, + "learning_rate": 4.5527598629376585e-05, + "loss": 0.1721, + "num_input_tokens_seen": 59893312, + "step": 49210 + }, + { + "epoch": 5.481122619445372, + "grad_norm": 0.2818440794944763, + "learning_rate": 4.5526211689877715e-05, + "loss": 0.0625, + "num_input_tokens_seen": 59899040, + "step": 49215 + }, + { + "epoch": 5.48167947432899, + "grad_norm": 0.2161695510149002, + "learning_rate": 4.552482455649233e-05, + "loss": 0.1149, + "num_input_tokens_seen": 59904832, + "step": 49220 + }, + { + "epoch": 5.482236329212607, + "grad_norm": 0.001861476805061102, + "learning_rate": 4.552343722923353e-05, + "loss": 0.1177, + "num_input_tokens_seen": 59910944, + "step": 49225 + }, + { + "epoch": 5.4827931840962245, + "grad_norm": 1.4443798065185547, + "learning_rate": 4.5522049708114424e-05, + "loss": 0.0733, + "num_input_tokens_seen": 59916768, + "step": 49230 + }, + { + "epoch": 5.483350038979842, + "grad_norm": 0.20350906252861023, + "learning_rate": 4.5520661993148115e-05, + "loss": 0.0974, + "num_input_tokens_seen": 59922848, + "step": 49235 + }, + { + "epoch": 5.483906893863459, + "grad_norm": 0.4170413017272949, + "learning_rate": 4.5519274084347716e-05, + "loss": 0.0596, + "num_input_tokens_seen": 59929120, + "step": 49240 + }, + { + "epoch": 5.484463748747077, + "grad_norm": 0.0012537251459434628, + "learning_rate": 4.551788598172633e-05, + "loss": 0.0704, + "num_input_tokens_seen": 59935424, + "step": 49245 + }, + { + "epoch": 5.485020603630693, + "grad_norm": 0.7577468752861023, + "learning_rate": 4.551649768529708e-05, + "loss": 0.0508, + "num_input_tokens_seen": 59941376, + "step": 49250 + }, + { + "epoch": 5.485577458514311, + "grad_norm": 0.5710753798484802, + "learning_rate": 4.5515109195073055e-05, + "loss": 0.081, + "num_input_tokens_seen": 59947616, + "step": 49255 + }, + { + "epoch": 5.486134313397929, + "grad_norm": 0.6893746852874756, + "learning_rate": 4.5513720511067404e-05, + "loss": 0.0605, + "num_input_tokens_seen": 59953888, + "step": 49260 + }, + { + "epoch": 5.486691168281546, + "grad_norm": 1.2464765310287476, + "learning_rate": 4.551233163329323e-05, + "loss": 0.1338, + "num_input_tokens_seen": 59960160, + "step": 49265 + }, + { + "epoch": 5.487248023165163, + "grad_norm": 0.25273531675338745, + "learning_rate": 4.5510942561763635e-05, + "loss": 0.0615, + "num_input_tokens_seen": 59966112, + "step": 49270 + }, + { + "epoch": 5.487804878048781, + "grad_norm": 0.1277994066476822, + "learning_rate": 4.550955329649176e-05, + "loss": 0.1242, + "num_input_tokens_seen": 59972032, + "step": 49275 + }, + { + "epoch": 5.488361732932398, + "grad_norm": 0.8491992950439453, + "learning_rate": 4.5508163837490726e-05, + "loss": 0.0596, + "num_input_tokens_seen": 59978272, + "step": 49280 + }, + { + "epoch": 5.488918587816015, + "grad_norm": 0.013439621776342392, + "learning_rate": 4.550677418477365e-05, + "loss": 0.0932, + "num_input_tokens_seen": 59984256, + "step": 49285 + }, + { + "epoch": 5.489475442699632, + "grad_norm": 0.4036567211151123, + "learning_rate": 4.550538433835366e-05, + "loss": 0.1347, + "num_input_tokens_seen": 59990656, + "step": 49290 + }, + { + "epoch": 5.49003229758325, + "grad_norm": 1.8474459648132324, + "learning_rate": 4.550399429824389e-05, + "loss": 0.0851, + "num_input_tokens_seen": 59996832, + "step": 49295 + }, + { + "epoch": 5.4905891524668675, + "grad_norm": 0.19400940835475922, + "learning_rate": 4.5502604064457464e-05, + "loss": 0.0626, + "num_input_tokens_seen": 60002624, + "step": 49300 + }, + { + "epoch": 5.491146007350484, + "grad_norm": 0.11853104084730148, + "learning_rate": 4.550121363700751e-05, + "loss": 0.079, + "num_input_tokens_seen": 60008384, + "step": 49305 + }, + { + "epoch": 5.491702862234102, + "grad_norm": 0.024989524856209755, + "learning_rate": 4.549982301590718e-05, + "loss": 0.1601, + "num_input_tokens_seen": 60014112, + "step": 49310 + }, + { + "epoch": 5.492259717117719, + "grad_norm": 0.47024670243263245, + "learning_rate": 4.549843220116959e-05, + "loss": 0.1225, + "num_input_tokens_seen": 60020256, + "step": 49315 + }, + { + "epoch": 5.492816572001336, + "grad_norm": 0.0034935269504785538, + "learning_rate": 4.549704119280789e-05, + "loss": 0.1195, + "num_input_tokens_seen": 60026176, + "step": 49320 + }, + { + "epoch": 5.493373426884954, + "grad_norm": 0.1176174059510231, + "learning_rate": 4.549564999083521e-05, + "loss": 0.1458, + "num_input_tokens_seen": 60032000, + "step": 49325 + }, + { + "epoch": 5.493930281768571, + "grad_norm": 0.8102778792381287, + "learning_rate": 4.54942585952647e-05, + "loss": 0.1402, + "num_input_tokens_seen": 60038176, + "step": 49330 + }, + { + "epoch": 5.494487136652189, + "grad_norm": 0.08081761747598648, + "learning_rate": 4.5492867006109495e-05, + "loss": 0.0576, + "num_input_tokens_seen": 60044384, + "step": 49335 + }, + { + "epoch": 5.495043991535805, + "grad_norm": 1.1309804916381836, + "learning_rate": 4.549147522338274e-05, + "loss": 0.1031, + "num_input_tokens_seen": 60050528, + "step": 49340 + }, + { + "epoch": 5.495600846419423, + "grad_norm": 0.5974334478378296, + "learning_rate": 4.549008324709759e-05, + "loss": 0.0642, + "num_input_tokens_seen": 60056384, + "step": 49345 + }, + { + "epoch": 5.496157701303041, + "grad_norm": 0.031714726239442825, + "learning_rate": 4.5488691077267185e-05, + "loss": 0.1486, + "num_input_tokens_seen": 60062112, + "step": 49350 + }, + { + "epoch": 5.4967145561866575, + "grad_norm": 1.5868741273880005, + "learning_rate": 4.548729871390467e-05, + "loss": 0.0553, + "num_input_tokens_seen": 60068128, + "step": 49355 + }, + { + "epoch": 5.497271411070275, + "grad_norm": 0.22339022159576416, + "learning_rate": 4.548590615702321e-05, + "loss": 0.0125, + "num_input_tokens_seen": 60073824, + "step": 49360 + }, + { + "epoch": 5.497828265953892, + "grad_norm": 1.8656736612319946, + "learning_rate": 4.548451340663595e-05, + "loss": 0.1929, + "num_input_tokens_seen": 60080000, + "step": 49365 + }, + { + "epoch": 5.49838512083751, + "grad_norm": 0.399924635887146, + "learning_rate": 4.5483120462756054e-05, + "loss": 0.0527, + "num_input_tokens_seen": 60085568, + "step": 49370 + }, + { + "epoch": 5.498941975721127, + "grad_norm": 0.42516079545021057, + "learning_rate": 4.5481727325396674e-05, + "loss": 0.0551, + "num_input_tokens_seen": 60091680, + "step": 49375 + }, + { + "epoch": 5.499498830604744, + "grad_norm": 0.006448498461395502, + "learning_rate": 4.5480333994570965e-05, + "loss": 0.0466, + "num_input_tokens_seen": 60097920, + "step": 49380 + }, + { + "epoch": 5.500055685488362, + "grad_norm": 0.010306967422366142, + "learning_rate": 4.5478940470292105e-05, + "loss": 0.0668, + "num_input_tokens_seen": 60103936, + "step": 49385 + }, + { + "epoch": 5.500612540371979, + "grad_norm": 2.1883127689361572, + "learning_rate": 4.547754675257323e-05, + "loss": 0.0287, + "num_input_tokens_seen": 60110080, + "step": 49390 + }, + { + "epoch": 5.501169395255596, + "grad_norm": 0.19107261300086975, + "learning_rate": 4.5476152841427526e-05, + "loss": 0.1257, + "num_input_tokens_seen": 60116320, + "step": 49395 + }, + { + "epoch": 5.501726250139214, + "grad_norm": 0.26501354575157166, + "learning_rate": 4.547475873686815e-05, + "loss": 0.0627, + "num_input_tokens_seen": 60122080, + "step": 49400 + }, + { + "epoch": 5.502283105022831, + "grad_norm": 1.5282284021377563, + "learning_rate": 4.547336443890828e-05, + "loss": 0.0476, + "num_input_tokens_seen": 60128192, + "step": 49405 + }, + { + "epoch": 5.502839959906448, + "grad_norm": 0.11102389544248581, + "learning_rate": 4.547196994756108e-05, + "loss": 0.0968, + "num_input_tokens_seen": 60134176, + "step": 49410 + }, + { + "epoch": 5.503396814790066, + "grad_norm": 0.0058884588070213795, + "learning_rate": 4.547057526283972e-05, + "loss": 0.1016, + "num_input_tokens_seen": 60140640, + "step": 49415 + }, + { + "epoch": 5.503953669673683, + "grad_norm": 1.821616530418396, + "learning_rate": 4.546918038475737e-05, + "loss": 0.1321, + "num_input_tokens_seen": 60146720, + "step": 49420 + }, + { + "epoch": 5.5045105245573005, + "grad_norm": 0.8289872407913208, + "learning_rate": 4.5467785313327215e-05, + "loss": 0.0828, + "num_input_tokens_seen": 60152576, + "step": 49425 + }, + { + "epoch": 5.505067379440918, + "grad_norm": 0.3268345594406128, + "learning_rate": 4.546639004856243e-05, + "loss": 0.0205, + "num_input_tokens_seen": 60158528, + "step": 49430 + }, + { + "epoch": 5.505624234324535, + "grad_norm": 0.004604940302670002, + "learning_rate": 4.54649945904762e-05, + "loss": 0.0317, + "num_input_tokens_seen": 60164576, + "step": 49435 + }, + { + "epoch": 5.506181089208153, + "grad_norm": 0.11145128309726715, + "learning_rate": 4.546359893908169e-05, + "loss": 0.0894, + "num_input_tokens_seen": 60170752, + "step": 49440 + }, + { + "epoch": 5.506737944091769, + "grad_norm": 0.08728311210870743, + "learning_rate": 4.54622030943921e-05, + "loss": 0.0487, + "num_input_tokens_seen": 60177280, + "step": 49445 + }, + { + "epoch": 5.507294798975387, + "grad_norm": 1.3595678806304932, + "learning_rate": 4.54608070564206e-05, + "loss": 0.1528, + "num_input_tokens_seen": 60183552, + "step": 49450 + }, + { + "epoch": 5.507851653859005, + "grad_norm": 0.08364895731210709, + "learning_rate": 4.545941082518039e-05, + "loss": 0.0745, + "num_input_tokens_seen": 60189408, + "step": 49455 + }, + { + "epoch": 5.508408508742622, + "grad_norm": 0.07710986584424973, + "learning_rate": 4.545801440068465e-05, + "loss": 0.1013, + "num_input_tokens_seen": 60195616, + "step": 49460 + }, + { + "epoch": 5.508965363626239, + "grad_norm": 0.752314031124115, + "learning_rate": 4.545661778294658e-05, + "loss": 0.1217, + "num_input_tokens_seen": 60201696, + "step": 49465 + }, + { + "epoch": 5.509522218509856, + "grad_norm": 0.356454074382782, + "learning_rate": 4.545522097197936e-05, + "loss": 0.0309, + "num_input_tokens_seen": 60208192, + "step": 49470 + }, + { + "epoch": 5.510079073393474, + "grad_norm": 0.1619664877653122, + "learning_rate": 4.545382396779618e-05, + "loss": 0.0539, + "num_input_tokens_seen": 60214432, + "step": 49475 + }, + { + "epoch": 5.510635928277091, + "grad_norm": 0.2950770854949951, + "learning_rate": 4.545242677041026e-05, + "loss": 0.0788, + "num_input_tokens_seen": 60220736, + "step": 49480 + }, + { + "epoch": 5.511192783160708, + "grad_norm": 0.09344275295734406, + "learning_rate": 4.545102937983478e-05, + "loss": 0.1409, + "num_input_tokens_seen": 60226752, + "step": 49485 + }, + { + "epoch": 5.511749638044326, + "grad_norm": 0.9173327088356018, + "learning_rate": 4.544963179608294e-05, + "loss": 0.1276, + "num_input_tokens_seen": 60232896, + "step": 49490 + }, + { + "epoch": 5.512306492927943, + "grad_norm": 0.008235009387135506, + "learning_rate": 4.5448234019167945e-05, + "loss": 0.0014, + "num_input_tokens_seen": 60239232, + "step": 49495 + }, + { + "epoch": 5.51286334781156, + "grad_norm": 0.7025980949401855, + "learning_rate": 4.544683604910299e-05, + "loss": 0.0523, + "num_input_tokens_seen": 60245088, + "step": 49500 + }, + { + "epoch": 5.513420202695178, + "grad_norm": 0.049108680337667465, + "learning_rate": 4.54454378859013e-05, + "loss": 0.047, + "num_input_tokens_seen": 60250880, + "step": 49505 + }, + { + "epoch": 5.513977057578795, + "grad_norm": 0.2408539354801178, + "learning_rate": 4.544403952957606e-05, + "loss": 0.0771, + "num_input_tokens_seen": 60256928, + "step": 49510 + }, + { + "epoch": 5.514533912462412, + "grad_norm": 0.30947646498680115, + "learning_rate": 4.544264098014049e-05, + "loss": 0.0951, + "num_input_tokens_seen": 60263296, + "step": 49515 + }, + { + "epoch": 5.515090767346029, + "grad_norm": 0.4632565677165985, + "learning_rate": 4.5441242237607795e-05, + "loss": 0.1713, + "num_input_tokens_seen": 60269280, + "step": 49520 + }, + { + "epoch": 5.515647622229647, + "grad_norm": 0.09278999269008636, + "learning_rate": 4.543984330199119e-05, + "loss": 0.084, + "num_input_tokens_seen": 60275040, + "step": 49525 + }, + { + "epoch": 5.516204477113265, + "grad_norm": 0.07680419832468033, + "learning_rate": 4.543844417330389e-05, + "loss": 0.0497, + "num_input_tokens_seen": 60280832, + "step": 49530 + }, + { + "epoch": 5.516761331996881, + "grad_norm": 1.4865926504135132, + "learning_rate": 4.5437044851559104e-05, + "loss": 0.0669, + "num_input_tokens_seen": 60286624, + "step": 49535 + }, + { + "epoch": 5.517318186880499, + "grad_norm": 0.038360532373189926, + "learning_rate": 4.5435645336770067e-05, + "loss": 0.0425, + "num_input_tokens_seen": 60292640, + "step": 49540 + }, + { + "epoch": 5.517875041764116, + "grad_norm": 0.28461742401123047, + "learning_rate": 4.543424562894998e-05, + "loss": 0.0155, + "num_input_tokens_seen": 60298752, + "step": 49545 + }, + { + "epoch": 5.5184318966477335, + "grad_norm": 0.0006909332005307078, + "learning_rate": 4.5432845728112076e-05, + "loss": 0.0742, + "num_input_tokens_seen": 60305088, + "step": 49550 + }, + { + "epoch": 5.518988751531351, + "grad_norm": 0.5338481664657593, + "learning_rate": 4.5431445634269563e-05, + "loss": 0.0392, + "num_input_tokens_seen": 60311360, + "step": 49555 + }, + { + "epoch": 5.519545606414968, + "grad_norm": 0.004026917275041342, + "learning_rate": 4.543004534743569e-05, + "loss": 0.0232, + "num_input_tokens_seen": 60317856, + "step": 49560 + }, + { + "epoch": 5.520102461298586, + "grad_norm": 1.524854302406311, + "learning_rate": 4.542864486762366e-05, + "loss": 0.0753, + "num_input_tokens_seen": 60324192, + "step": 49565 + }, + { + "epoch": 5.520659316182203, + "grad_norm": 0.17876294255256653, + "learning_rate": 4.542724419484672e-05, + "loss": 0.171, + "num_input_tokens_seen": 60330368, + "step": 49570 + }, + { + "epoch": 5.52121617106582, + "grad_norm": 0.64909428358078, + "learning_rate": 4.542584332911809e-05, + "loss": 0.0727, + "num_input_tokens_seen": 60336640, + "step": 49575 + }, + { + "epoch": 5.521773025949438, + "grad_norm": 0.19883769750595093, + "learning_rate": 4.5424442270451e-05, + "loss": 0.0281, + "num_input_tokens_seen": 60342912, + "step": 49580 + }, + { + "epoch": 5.5223298808330545, + "grad_norm": 0.869468629360199, + "learning_rate": 4.5423041018858695e-05, + "loss": 0.053, + "num_input_tokens_seen": 60348928, + "step": 49585 + }, + { + "epoch": 5.522886735716672, + "grad_norm": 0.012000193819403648, + "learning_rate": 4.54216395743544e-05, + "loss": 0.0588, + "num_input_tokens_seen": 60354976, + "step": 49590 + }, + { + "epoch": 5.52344359060029, + "grad_norm": 0.17055538296699524, + "learning_rate": 4.542023793695136e-05, + "loss": 0.021, + "num_input_tokens_seen": 60361280, + "step": 49595 + }, + { + "epoch": 5.524000445483907, + "grad_norm": 0.052441343665122986, + "learning_rate": 4.541883610666281e-05, + "loss": 0.0506, + "num_input_tokens_seen": 60367520, + "step": 49600 + }, + { + "epoch": 5.524557300367524, + "grad_norm": 0.684576153755188, + "learning_rate": 4.5417434083501995e-05, + "loss": 0.1426, + "num_input_tokens_seen": 60373760, + "step": 49605 + }, + { + "epoch": 5.525114155251142, + "grad_norm": 0.07998841255903244, + "learning_rate": 4.5416031867482164e-05, + "loss": 0.0215, + "num_input_tokens_seen": 60379968, + "step": 49610 + }, + { + "epoch": 5.525671010134759, + "grad_norm": 0.0011996575631201267, + "learning_rate": 4.541462945861654e-05, + "loss": 0.0895, + "num_input_tokens_seen": 60386112, + "step": 49615 + }, + { + "epoch": 5.5262278650183765, + "grad_norm": 0.21306075155735016, + "learning_rate": 4.54132268569184e-05, + "loss": 0.0723, + "num_input_tokens_seen": 60392224, + "step": 49620 + }, + { + "epoch": 5.526784719901993, + "grad_norm": 0.8212469220161438, + "learning_rate": 4.541182406240097e-05, + "loss": 0.1119, + "num_input_tokens_seen": 60397952, + "step": 49625 + }, + { + "epoch": 5.527341574785611, + "grad_norm": 1.1409050226211548, + "learning_rate": 4.5410421075077516e-05, + "loss": 0.0852, + "num_input_tokens_seen": 60403328, + "step": 49630 + }, + { + "epoch": 5.527898429669229, + "grad_norm": 0.04369746148586273, + "learning_rate": 4.540901789496127e-05, + "loss": 0.1023, + "num_input_tokens_seen": 60409632, + "step": 49635 + }, + { + "epoch": 5.528455284552845, + "grad_norm": 0.3056721091270447, + "learning_rate": 4.5407614522065505e-05, + "loss": 0.0718, + "num_input_tokens_seen": 60415552, + "step": 49640 + }, + { + "epoch": 5.529012139436463, + "grad_norm": 1.8346493244171143, + "learning_rate": 4.5406210956403474e-05, + "loss": 0.2071, + "num_input_tokens_seen": 60421760, + "step": 49645 + }, + { + "epoch": 5.52956899432008, + "grad_norm": 0.04627938196063042, + "learning_rate": 4.540480719798842e-05, + "loss": 0.0191, + "num_input_tokens_seen": 60428256, + "step": 49650 + }, + { + "epoch": 5.5301258492036975, + "grad_norm": 0.16601426899433136, + "learning_rate": 4.540340324683362e-05, + "loss": 0.0311, + "num_input_tokens_seen": 60434240, + "step": 49655 + }, + { + "epoch": 5.530682704087315, + "grad_norm": 0.0034779144916683435, + "learning_rate": 4.540199910295233e-05, + "loss": 0.0362, + "num_input_tokens_seen": 60440448, + "step": 49660 + }, + { + "epoch": 5.531239558970932, + "grad_norm": 0.8838385343551636, + "learning_rate": 4.540059476635782e-05, + "loss": 0.078, + "num_input_tokens_seen": 60446592, + "step": 49665 + }, + { + "epoch": 5.53179641385455, + "grad_norm": 0.0989634320139885, + "learning_rate": 4.5399190237063336e-05, + "loss": 0.0217, + "num_input_tokens_seen": 60452544, + "step": 49670 + }, + { + "epoch": 5.5323532687381665, + "grad_norm": 0.2822490334510803, + "learning_rate": 4.539778551508216e-05, + "loss": 0.0505, + "num_input_tokens_seen": 60457952, + "step": 49675 + }, + { + "epoch": 5.532910123621784, + "grad_norm": 0.04682307690382004, + "learning_rate": 4.5396380600427555e-05, + "loss": 0.1283, + "num_input_tokens_seen": 60464064, + "step": 49680 + }, + { + "epoch": 5.533466978505402, + "grad_norm": 0.32705458998680115, + "learning_rate": 4.53949754931128e-05, + "loss": 0.1356, + "num_input_tokens_seen": 60470272, + "step": 49685 + }, + { + "epoch": 5.534023833389019, + "grad_norm": 0.008010037243366241, + "learning_rate": 4.539357019315116e-05, + "loss": 0.0864, + "num_input_tokens_seen": 60476352, + "step": 49690 + }, + { + "epoch": 5.534580688272636, + "grad_norm": 0.05329441279172897, + "learning_rate": 4.5392164700555916e-05, + "loss": 0.0596, + "num_input_tokens_seen": 60482432, + "step": 49695 + }, + { + "epoch": 5.535137543156253, + "grad_norm": 0.87955641746521, + "learning_rate": 4.539075901534033e-05, + "loss": 0.0345, + "num_input_tokens_seen": 60488448, + "step": 49700 + }, + { + "epoch": 5.535694398039871, + "grad_norm": 1.983641266822815, + "learning_rate": 4.538935313751769e-05, + "loss": 0.0546, + "num_input_tokens_seen": 60494592, + "step": 49705 + }, + { + "epoch": 5.536251252923488, + "grad_norm": 1.236163854598999, + "learning_rate": 4.5387947067101274e-05, + "loss": 0.0581, + "num_input_tokens_seen": 60500800, + "step": 49710 + }, + { + "epoch": 5.536808107807105, + "grad_norm": 0.5643103122711182, + "learning_rate": 4.538654080410436e-05, + "loss": 0.1906, + "num_input_tokens_seen": 60506816, + "step": 49715 + }, + { + "epoch": 5.537364962690723, + "grad_norm": 0.13216368854045868, + "learning_rate": 4.538513434854024e-05, + "loss": 0.1308, + "num_input_tokens_seen": 60512864, + "step": 49720 + }, + { + "epoch": 5.53792181757434, + "grad_norm": 0.01833583414554596, + "learning_rate": 4.5383727700422194e-05, + "loss": 0.0484, + "num_input_tokens_seen": 60518464, + "step": 49725 + }, + { + "epoch": 5.538478672457957, + "grad_norm": 0.8628519773483276, + "learning_rate": 4.53823208597635e-05, + "loss": 0.0834, + "num_input_tokens_seen": 60524672, + "step": 49730 + }, + { + "epoch": 5.539035527341575, + "grad_norm": 0.08697768300771713, + "learning_rate": 4.538091382657747e-05, + "loss": 0.037, + "num_input_tokens_seen": 60530752, + "step": 49735 + }, + { + "epoch": 5.539592382225192, + "grad_norm": 0.026886969804763794, + "learning_rate": 4.537950660087737e-05, + "loss": 0.0422, + "num_input_tokens_seen": 60536800, + "step": 49740 + }, + { + "epoch": 5.5401492371088095, + "grad_norm": 0.4391142427921295, + "learning_rate": 4.53780991826765e-05, + "loss": 0.0644, + "num_input_tokens_seen": 60542848, + "step": 49745 + }, + { + "epoch": 5.540706091992427, + "grad_norm": 0.3603880703449249, + "learning_rate": 4.537669157198817e-05, + "loss": 0.1673, + "num_input_tokens_seen": 60548640, + "step": 49750 + }, + { + "epoch": 5.541262946876044, + "grad_norm": 0.001773293362930417, + "learning_rate": 4.537528376882565e-05, + "loss": 0.1266, + "num_input_tokens_seen": 60554816, + "step": 49755 + }, + { + "epoch": 5.541819801759662, + "grad_norm": 0.05194086953997612, + "learning_rate": 4.5373875773202257e-05, + "loss": 0.0315, + "num_input_tokens_seen": 60560480, + "step": 49760 + }, + { + "epoch": 5.542376656643278, + "grad_norm": 1.545829176902771, + "learning_rate": 4.537246758513128e-05, + "loss": 0.0984, + "num_input_tokens_seen": 60566656, + "step": 49765 + }, + { + "epoch": 5.542933511526896, + "grad_norm": 0.11875096708536148, + "learning_rate": 4.537105920462603e-05, + "loss": 0.1953, + "num_input_tokens_seen": 60572640, + "step": 49770 + }, + { + "epoch": 5.543490366410514, + "grad_norm": 0.23955300450325012, + "learning_rate": 4.5369650631699795e-05, + "loss": 0.0371, + "num_input_tokens_seen": 60578688, + "step": 49775 + }, + { + "epoch": 5.5440472212941305, + "grad_norm": 0.9555273652076721, + "learning_rate": 4.5368241866365894e-05, + "loss": 0.1044, + "num_input_tokens_seen": 60583936, + "step": 49780 + }, + { + "epoch": 5.544604076177748, + "grad_norm": 1.5823543071746826, + "learning_rate": 4.5366832908637635e-05, + "loss": 0.1583, + "num_input_tokens_seen": 60590048, + "step": 49785 + }, + { + "epoch": 5.545160931061366, + "grad_norm": 0.09853435307741165, + "learning_rate": 4.536542375852831e-05, + "loss": 0.016, + "num_input_tokens_seen": 60596416, + "step": 49790 + }, + { + "epoch": 5.545717785944983, + "grad_norm": 0.06045256555080414, + "learning_rate": 4.536401441605126e-05, + "loss": 0.068, + "num_input_tokens_seen": 60602432, + "step": 49795 + }, + { + "epoch": 5.5462746408286, + "grad_norm": 0.6739324331283569, + "learning_rate": 4.536260488121976e-05, + "loss": 0.0254, + "num_input_tokens_seen": 60608544, + "step": 49800 + }, + { + "epoch": 5.546831495712217, + "grad_norm": 0.8379277586936951, + "learning_rate": 4.536119515404715e-05, + "loss": 0.0568, + "num_input_tokens_seen": 60614624, + "step": 49805 + }, + { + "epoch": 5.547388350595835, + "grad_norm": 1.2632734775543213, + "learning_rate": 4.535978523454674e-05, + "loss": 0.2202, + "num_input_tokens_seen": 60620288, + "step": 49810 + }, + { + "epoch": 5.5479452054794525, + "grad_norm": 0.5332010984420776, + "learning_rate": 4.535837512273184e-05, + "loss": 0.0847, + "num_input_tokens_seen": 60626528, + "step": 49815 + }, + { + "epoch": 5.548502060363069, + "grad_norm": 0.699108898639679, + "learning_rate": 4.5356964818615786e-05, + "loss": 0.0589, + "num_input_tokens_seen": 60632096, + "step": 49820 + }, + { + "epoch": 5.549058915246687, + "grad_norm": 0.002704306272789836, + "learning_rate": 4.535555432221189e-05, + "loss": 0.0097, + "num_input_tokens_seen": 60638784, + "step": 49825 + }, + { + "epoch": 5.549615770130304, + "grad_norm": 2.7871978282928467, + "learning_rate": 4.5354143633533466e-05, + "loss": 0.1306, + "num_input_tokens_seen": 60644768, + "step": 49830 + }, + { + "epoch": 5.550172625013921, + "grad_norm": 2.2115066051483154, + "learning_rate": 4.5352732752593854e-05, + "loss": 0.0673, + "num_input_tokens_seen": 60650912, + "step": 49835 + }, + { + "epoch": 5.550729479897539, + "grad_norm": 0.017899727448821068, + "learning_rate": 4.5351321679406365e-05, + "loss": 0.0966, + "num_input_tokens_seen": 60656640, + "step": 49840 + }, + { + "epoch": 5.551286334781156, + "grad_norm": 0.21204490959644318, + "learning_rate": 4.534991041398435e-05, + "loss": 0.0785, + "num_input_tokens_seen": 60662944, + "step": 49845 + }, + { + "epoch": 5.5518431896647735, + "grad_norm": 0.007529358379542828, + "learning_rate": 4.534849895634112e-05, + "loss": 0.0384, + "num_input_tokens_seen": 60668800, + "step": 49850 + }, + { + "epoch": 5.55240004454839, + "grad_norm": 0.25591617822647095, + "learning_rate": 4.534708730649002e-05, + "loss": 0.0912, + "num_input_tokens_seen": 60674624, + "step": 49855 + }, + { + "epoch": 5.552956899432008, + "grad_norm": 0.6959370374679565, + "learning_rate": 4.534567546444437e-05, + "loss": 0.0777, + "num_input_tokens_seen": 60680800, + "step": 49860 + }, + { + "epoch": 5.553513754315626, + "grad_norm": 0.003312746062874794, + "learning_rate": 4.534426343021752e-05, + "loss": 0.0102, + "num_input_tokens_seen": 60687200, + "step": 49865 + }, + { + "epoch": 5.5540706091992424, + "grad_norm": 0.3641018867492676, + "learning_rate": 4.53428512038228e-05, + "loss": 0.0856, + "num_input_tokens_seen": 60693248, + "step": 49870 + }, + { + "epoch": 5.55462746408286, + "grad_norm": 0.35538485646247864, + "learning_rate": 4.534143878527356e-05, + "loss": 0.1194, + "num_input_tokens_seen": 60699424, + "step": 49875 + }, + { + "epoch": 5.555184318966477, + "grad_norm": 0.14037524163722992, + "learning_rate": 4.534002617458313e-05, + "loss": 0.0217, + "num_input_tokens_seen": 60705440, + "step": 49880 + }, + { + "epoch": 5.555741173850095, + "grad_norm": 0.0327090322971344, + "learning_rate": 4.533861337176485e-05, + "loss": 0.1179, + "num_input_tokens_seen": 60711616, + "step": 49885 + }, + { + "epoch": 5.556298028733712, + "grad_norm": 0.6142009496688843, + "learning_rate": 4.533720037683207e-05, + "loss": 0.0377, + "num_input_tokens_seen": 60717824, + "step": 49890 + }, + { + "epoch": 5.556854883617329, + "grad_norm": 2.565100908279419, + "learning_rate": 4.533578718979815e-05, + "loss": 0.0923, + "num_input_tokens_seen": 60724224, + "step": 49895 + }, + { + "epoch": 5.557411738500947, + "grad_norm": 0.9274745583534241, + "learning_rate": 4.533437381067642e-05, + "loss": 0.0557, + "num_input_tokens_seen": 60730720, + "step": 49900 + }, + { + "epoch": 5.5579685933845635, + "grad_norm": 0.943785548210144, + "learning_rate": 4.5332960239480234e-05, + "loss": 0.1338, + "num_input_tokens_seen": 60736544, + "step": 49905 + }, + { + "epoch": 5.558525448268181, + "grad_norm": 0.2329898625612259, + "learning_rate": 4.5331546476222954e-05, + "loss": 0.0648, + "num_input_tokens_seen": 60742720, + "step": 49910 + }, + { + "epoch": 5.559082303151799, + "grad_norm": 1.250746488571167, + "learning_rate": 4.5330132520917926e-05, + "loss": 0.1409, + "num_input_tokens_seen": 60748416, + "step": 49915 + }, + { + "epoch": 5.559639158035416, + "grad_norm": 0.001349545200355351, + "learning_rate": 4.5328718373578516e-05, + "loss": 0.113, + "num_input_tokens_seen": 60754560, + "step": 49920 + }, + { + "epoch": 5.560196012919033, + "grad_norm": 0.11937348544597626, + "learning_rate": 4.5327304034218064e-05, + "loss": 0.1479, + "num_input_tokens_seen": 60760800, + "step": 49925 + }, + { + "epoch": 5.560752867802651, + "grad_norm": 0.6641610860824585, + "learning_rate": 4.532588950284994e-05, + "loss": 0.1727, + "num_input_tokens_seen": 60767040, + "step": 49930 + }, + { + "epoch": 5.561309722686268, + "grad_norm": 0.5313503742218018, + "learning_rate": 4.532447477948751e-05, + "loss": 0.0293, + "num_input_tokens_seen": 60773088, + "step": 49935 + }, + { + "epoch": 5.5618665775698855, + "grad_norm": 1.755806565284729, + "learning_rate": 4.532305986414413e-05, + "loss": 0.2691, + "num_input_tokens_seen": 60779424, + "step": 49940 + }, + { + "epoch": 5.562423432453502, + "grad_norm": 0.27460071444511414, + "learning_rate": 4.5321644756833165e-05, + "loss": 0.0755, + "num_input_tokens_seen": 60785824, + "step": 49945 + }, + { + "epoch": 5.56298028733712, + "grad_norm": 0.39508190751075745, + "learning_rate": 4.5320229457567984e-05, + "loss": 0.0703, + "num_input_tokens_seen": 60791744, + "step": 49950 + }, + { + "epoch": 5.563537142220738, + "grad_norm": 0.11939965933561325, + "learning_rate": 4.531881396636196e-05, + "loss": 0.0274, + "num_input_tokens_seen": 60798144, + "step": 49955 + }, + { + "epoch": 5.564093997104354, + "grad_norm": 0.5998091697692871, + "learning_rate": 4.531739828322845e-05, + "loss": 0.0515, + "num_input_tokens_seen": 60804064, + "step": 49960 + }, + { + "epoch": 5.564650851987972, + "grad_norm": 0.008680960163474083, + "learning_rate": 4.531598240818085e-05, + "loss": 0.0095, + "num_input_tokens_seen": 60810208, + "step": 49965 + }, + { + "epoch": 5.56520770687159, + "grad_norm": 0.21445120871067047, + "learning_rate": 4.53145663412325e-05, + "loss": 0.1165, + "num_input_tokens_seen": 60816640, + "step": 49970 + }, + { + "epoch": 5.5657645617552065, + "grad_norm": 0.8994342684745789, + "learning_rate": 4.531315008239682e-05, + "loss": 0.1548, + "num_input_tokens_seen": 60822944, + "step": 49975 + }, + { + "epoch": 5.566321416638824, + "grad_norm": 0.9550737738609314, + "learning_rate": 4.531173363168714e-05, + "loss": 0.0447, + "num_input_tokens_seen": 60829152, + "step": 49980 + }, + { + "epoch": 5.566878271522441, + "grad_norm": 0.17292070388793945, + "learning_rate": 4.531031698911687e-05, + "loss": 0.0964, + "num_input_tokens_seen": 60835200, + "step": 49985 + }, + { + "epoch": 5.567435126406059, + "grad_norm": 0.11305105686187744, + "learning_rate": 4.530890015469938e-05, + "loss": 0.0594, + "num_input_tokens_seen": 60841472, + "step": 49990 + }, + { + "epoch": 5.567991981289676, + "grad_norm": 0.012926539406180382, + "learning_rate": 4.530748312844807e-05, + "loss": 0.0085, + "num_input_tokens_seen": 60847584, + "step": 49995 + }, + { + "epoch": 5.568548836173293, + "grad_norm": 0.9999111890792847, + "learning_rate": 4.5306065910376294e-05, + "loss": 0.057, + "num_input_tokens_seen": 60853824, + "step": 50000 + }, + { + "epoch": 5.569105691056911, + "grad_norm": 0.01487733330577612, + "learning_rate": 4.530464850049747e-05, + "loss": 0.0284, + "num_input_tokens_seen": 60859872, + "step": 50005 + }, + { + "epoch": 5.569662545940528, + "grad_norm": 0.8414775729179382, + "learning_rate": 4.5303230898824965e-05, + "loss": 0.1109, + "num_input_tokens_seen": 60865888, + "step": 50010 + }, + { + "epoch": 5.570219400824145, + "grad_norm": 0.29513871669769287, + "learning_rate": 4.530181310537218e-05, + "loss": 0.0123, + "num_input_tokens_seen": 60872352, + "step": 50015 + }, + { + "epoch": 5.570776255707763, + "grad_norm": 0.5047575235366821, + "learning_rate": 4.530039512015251e-05, + "loss": 0.0994, + "num_input_tokens_seen": 60877664, + "step": 50020 + }, + { + "epoch": 5.57133311059138, + "grad_norm": 1.042857050895691, + "learning_rate": 4.529897694317934e-05, + "loss": 0.1362, + "num_input_tokens_seen": 60883872, + "step": 50025 + }, + { + "epoch": 5.571889965474997, + "grad_norm": 0.518814206123352, + "learning_rate": 4.529755857446607e-05, + "loss": 0.0558, + "num_input_tokens_seen": 60889952, + "step": 50030 + }, + { + "epoch": 5.572446820358614, + "grad_norm": 0.014578212983906269, + "learning_rate": 4.529614001402609e-05, + "loss": 0.055, + "num_input_tokens_seen": 60896192, + "step": 50035 + }, + { + "epoch": 5.573003675242232, + "grad_norm": 0.8966498970985413, + "learning_rate": 4.529472126187282e-05, + "loss": 0.0424, + "num_input_tokens_seen": 60902400, + "step": 50040 + }, + { + "epoch": 5.5735605301258495, + "grad_norm": 1.2659920454025269, + "learning_rate": 4.529330231801964e-05, + "loss": 0.0696, + "num_input_tokens_seen": 60908480, + "step": 50045 + }, + { + "epoch": 5.574117385009466, + "grad_norm": 0.0190842654556036, + "learning_rate": 4.529188318247995e-05, + "loss": 0.0447, + "num_input_tokens_seen": 60914496, + "step": 50050 + }, + { + "epoch": 5.574674239893084, + "grad_norm": 1.282332420349121, + "learning_rate": 4.5290463855267187e-05, + "loss": 0.0953, + "num_input_tokens_seen": 60920288, + "step": 50055 + }, + { + "epoch": 5.575231094776701, + "grad_norm": 0.1382412612438202, + "learning_rate": 4.5289044336394724e-05, + "loss": 0.1009, + "num_input_tokens_seen": 60926176, + "step": 50060 + }, + { + "epoch": 5.575787949660318, + "grad_norm": 0.21486292779445648, + "learning_rate": 4.528762462587598e-05, + "loss": 0.0943, + "num_input_tokens_seen": 60932128, + "step": 50065 + }, + { + "epoch": 5.576344804543936, + "grad_norm": 0.3150162398815155, + "learning_rate": 4.5286204723724375e-05, + "loss": 0.1368, + "num_input_tokens_seen": 60937728, + "step": 50070 + }, + { + "epoch": 5.576901659427553, + "grad_norm": 0.6116737127304077, + "learning_rate": 4.528478462995331e-05, + "loss": 0.1101, + "num_input_tokens_seen": 60943936, + "step": 50075 + }, + { + "epoch": 5.577458514311171, + "grad_norm": 0.45899373292922974, + "learning_rate": 4.52833643445762e-05, + "loss": 0.0486, + "num_input_tokens_seen": 60950208, + "step": 50080 + }, + { + "epoch": 5.578015369194787, + "grad_norm": 0.0337018184363842, + "learning_rate": 4.528194386760647e-05, + "loss": 0.0775, + "num_input_tokens_seen": 60956352, + "step": 50085 + }, + { + "epoch": 5.578572224078405, + "grad_norm": 0.0644797757267952, + "learning_rate": 4.5280523199057524e-05, + "loss": 0.0639, + "num_input_tokens_seen": 60962144, + "step": 50090 + }, + { + "epoch": 5.579129078962023, + "grad_norm": 0.04841356724500656, + "learning_rate": 4.5279102338942785e-05, + "loss": 0.0176, + "num_input_tokens_seen": 60968160, + "step": 50095 + }, + { + "epoch": 5.5796859338456395, + "grad_norm": 0.01515136007219553, + "learning_rate": 4.5277681287275686e-05, + "loss": 0.0435, + "num_input_tokens_seen": 60973632, + "step": 50100 + }, + { + "epoch": 5.580242788729257, + "grad_norm": 0.13002566993236542, + "learning_rate": 4.5276260044069636e-05, + "loss": 0.0332, + "num_input_tokens_seen": 60979232, + "step": 50105 + }, + { + "epoch": 5.580799643612875, + "grad_norm": 0.14532408118247986, + "learning_rate": 4.5274838609338066e-05, + "loss": 0.0345, + "num_input_tokens_seen": 60985536, + "step": 50110 + }, + { + "epoch": 5.581356498496492, + "grad_norm": 0.1335529088973999, + "learning_rate": 4.5273416983094394e-05, + "loss": 0.0304, + "num_input_tokens_seen": 60991424, + "step": 50115 + }, + { + "epoch": 5.581913353380109, + "grad_norm": 0.06412217020988464, + "learning_rate": 4.527199516535207e-05, + "loss": 0.1274, + "num_input_tokens_seen": 60997152, + "step": 50120 + }, + { + "epoch": 5.582470208263726, + "grad_norm": 0.8417519330978394, + "learning_rate": 4.5270573156124496e-05, + "loss": 0.0367, + "num_input_tokens_seen": 61003360, + "step": 50125 + }, + { + "epoch": 5.583027063147344, + "grad_norm": 0.029066238552331924, + "learning_rate": 4.5269150955425124e-05, + "loss": 0.0365, + "num_input_tokens_seen": 61009792, + "step": 50130 + }, + { + "epoch": 5.583583918030961, + "grad_norm": 0.16221947968006134, + "learning_rate": 4.5267728563267386e-05, + "loss": 0.0096, + "num_input_tokens_seen": 61016192, + "step": 50135 + }, + { + "epoch": 5.584140772914578, + "grad_norm": 0.04513296112418175, + "learning_rate": 4.526630597966471e-05, + "loss": 0.102, + "num_input_tokens_seen": 61022240, + "step": 50140 + }, + { + "epoch": 5.584697627798196, + "grad_norm": 1.461379885673523, + "learning_rate": 4.526488320463054e-05, + "loss": 0.1074, + "num_input_tokens_seen": 61028416, + "step": 50145 + }, + { + "epoch": 5.585254482681814, + "grad_norm": 0.6958707571029663, + "learning_rate": 4.526346023817831e-05, + "loss": 0.0693, + "num_input_tokens_seen": 61034464, + "step": 50150 + }, + { + "epoch": 5.58581133756543, + "grad_norm": 0.8096070289611816, + "learning_rate": 4.526203708032146e-05, + "loss": 0.0432, + "num_input_tokens_seen": 61040512, + "step": 50155 + }, + { + "epoch": 5.586368192449048, + "grad_norm": 0.004955758340656757, + "learning_rate": 4.526061373107344e-05, + "loss": 0.0236, + "num_input_tokens_seen": 61046464, + "step": 50160 + }, + { + "epoch": 5.586925047332665, + "grad_norm": 1.0161182880401611, + "learning_rate": 4.525919019044769e-05, + "loss": 0.0489, + "num_input_tokens_seen": 61051744, + "step": 50165 + }, + { + "epoch": 5.5874819022162825, + "grad_norm": 1.4571328163146973, + "learning_rate": 4.525776645845765e-05, + "loss": 0.0691, + "num_input_tokens_seen": 61057984, + "step": 50170 + }, + { + "epoch": 5.5880387570999, + "grad_norm": 0.2715049088001251, + "learning_rate": 4.525634253511679e-05, + "loss": 0.0235, + "num_input_tokens_seen": 61063424, + "step": 50175 + }, + { + "epoch": 5.588595611983517, + "grad_norm": 0.07870122045278549, + "learning_rate": 4.525491842043853e-05, + "loss": 0.0611, + "num_input_tokens_seen": 61070016, + "step": 50180 + }, + { + "epoch": 5.589152466867135, + "grad_norm": 0.5259836316108704, + "learning_rate": 4.5253494114436347e-05, + "loss": 0.0286, + "num_input_tokens_seen": 61075552, + "step": 50185 + }, + { + "epoch": 5.589709321750751, + "grad_norm": 0.15300516784191132, + "learning_rate": 4.5252069617123684e-05, + "loss": 0.0334, + "num_input_tokens_seen": 61081536, + "step": 50190 + }, + { + "epoch": 5.590266176634369, + "grad_norm": 0.27202415466308594, + "learning_rate": 4.5250644928514e-05, + "loss": 0.0474, + "num_input_tokens_seen": 61087328, + "step": 50195 + }, + { + "epoch": 5.590823031517987, + "grad_norm": 0.015467748045921326, + "learning_rate": 4.524922004862074e-05, + "loss": 0.0473, + "num_input_tokens_seen": 61093408, + "step": 50200 + }, + { + "epoch": 5.5913798864016035, + "grad_norm": 0.005211750045418739, + "learning_rate": 4.524779497745739e-05, + "loss": 0.0585, + "num_input_tokens_seen": 61099744, + "step": 50205 + }, + { + "epoch": 5.591936741285221, + "grad_norm": 1.1944316625595093, + "learning_rate": 4.524636971503739e-05, + "loss": 0.0525, + "num_input_tokens_seen": 61105952, + "step": 50210 + }, + { + "epoch": 5.592493596168838, + "grad_norm": 0.290816992521286, + "learning_rate": 4.524494426137419e-05, + "loss": 0.0376, + "num_input_tokens_seen": 61111840, + "step": 50215 + }, + { + "epoch": 5.593050451052456, + "grad_norm": 0.008827276527881622, + "learning_rate": 4.524351861648128e-05, + "loss": 0.0769, + "num_input_tokens_seen": 61117920, + "step": 50220 + }, + { + "epoch": 5.593607305936073, + "grad_norm": 1.1659941673278809, + "learning_rate": 4.524209278037213e-05, + "loss": 0.0501, + "num_input_tokens_seen": 61124000, + "step": 50225 + }, + { + "epoch": 5.59416416081969, + "grad_norm": 1.4256147146224976, + "learning_rate": 4.524066675306019e-05, + "loss": 0.1005, + "num_input_tokens_seen": 61129440, + "step": 50230 + }, + { + "epoch": 5.594721015703308, + "grad_norm": 0.32909679412841797, + "learning_rate": 4.5239240534558924e-05, + "loss": 0.1247, + "num_input_tokens_seen": 61135264, + "step": 50235 + }, + { + "epoch": 5.595277870586925, + "grad_norm": 1.1312226057052612, + "learning_rate": 4.523781412488183e-05, + "loss": 0.0178, + "num_input_tokens_seen": 61141600, + "step": 50240 + }, + { + "epoch": 5.595834725470542, + "grad_norm": 0.8799679279327393, + "learning_rate": 4.5236387524042355e-05, + "loss": 0.073, + "num_input_tokens_seen": 61146784, + "step": 50245 + }, + { + "epoch": 5.59639158035416, + "grad_norm": 0.017538825049996376, + "learning_rate": 4.5234960732054e-05, + "loss": 0.1011, + "num_input_tokens_seen": 61153120, + "step": 50250 + }, + { + "epoch": 5.596948435237777, + "grad_norm": 1.5562182664871216, + "learning_rate": 4.5233533748930225e-05, + "loss": 0.0696, + "num_input_tokens_seen": 61159072, + "step": 50255 + }, + { + "epoch": 5.597505290121394, + "grad_norm": 2.0273067951202393, + "learning_rate": 4.5232106574684506e-05, + "loss": 0.1066, + "num_input_tokens_seen": 61165120, + "step": 50260 + }, + { + "epoch": 5.598062145005011, + "grad_norm": 0.07778685539960861, + "learning_rate": 4.523067920933034e-05, + "loss": 0.0185, + "num_input_tokens_seen": 61171008, + "step": 50265 + }, + { + "epoch": 5.598618999888629, + "grad_norm": 0.1538296490907669, + "learning_rate": 4.522925165288119e-05, + "loss": 0.0583, + "num_input_tokens_seen": 61177504, + "step": 50270 + }, + { + "epoch": 5.5991758547722466, + "grad_norm": 0.8533948659896851, + "learning_rate": 4.522782390535056e-05, + "loss": 0.1273, + "num_input_tokens_seen": 61183808, + "step": 50275 + }, + { + "epoch": 5.599732709655863, + "grad_norm": 0.0223425030708313, + "learning_rate": 4.5226395966751924e-05, + "loss": 0.1567, + "num_input_tokens_seen": 61189696, + "step": 50280 + }, + { + "epoch": 5.600289564539481, + "grad_norm": 1.0526765584945679, + "learning_rate": 4.5224967837098767e-05, + "loss": 0.06, + "num_input_tokens_seen": 61195712, + "step": 50285 + }, + { + "epoch": 5.600846419423099, + "grad_norm": 0.8642917275428772, + "learning_rate": 4.522353951640459e-05, + "loss": 0.0573, + "num_input_tokens_seen": 61201632, + "step": 50290 + }, + { + "epoch": 5.6014032743067155, + "grad_norm": 1.9098719358444214, + "learning_rate": 4.5222111004682885e-05, + "loss": 0.2517, + "num_input_tokens_seen": 61207904, + "step": 50295 + }, + { + "epoch": 5.601960129190333, + "grad_norm": 0.011954255402088165, + "learning_rate": 4.522068230194713e-05, + "loss": 0.0804, + "num_input_tokens_seen": 61213952, + "step": 50300 + }, + { + "epoch": 5.602516984073951, + "grad_norm": 0.025946732610464096, + "learning_rate": 4.521925340821084e-05, + "loss": 0.0796, + "num_input_tokens_seen": 61219808, + "step": 50305 + }, + { + "epoch": 5.603073838957568, + "grad_norm": 0.1177930012345314, + "learning_rate": 4.521782432348749e-05, + "loss": 0.0595, + "num_input_tokens_seen": 61226016, + "step": 50310 + }, + { + "epoch": 5.603630693841185, + "grad_norm": 0.02531951107084751, + "learning_rate": 4.5216395047790604e-05, + "loss": 0.0257, + "num_input_tokens_seen": 61232128, + "step": 50315 + }, + { + "epoch": 5.604187548724802, + "grad_norm": 0.23480413854122162, + "learning_rate": 4.521496558113366e-05, + "loss": 0.0777, + "num_input_tokens_seen": 61238176, + "step": 50320 + }, + { + "epoch": 5.60474440360842, + "grad_norm": 0.021535605192184448, + "learning_rate": 4.521353592353017e-05, + "loss": 0.086, + "num_input_tokens_seen": 61244320, + "step": 50325 + }, + { + "epoch": 5.605301258492037, + "grad_norm": 0.005815457087010145, + "learning_rate": 4.5212106074993644e-05, + "loss": 0.0901, + "num_input_tokens_seen": 61250560, + "step": 50330 + }, + { + "epoch": 5.605858113375654, + "grad_norm": 1.7977544069290161, + "learning_rate": 4.521067603553758e-05, + "loss": 0.1675, + "num_input_tokens_seen": 61256576, + "step": 50335 + }, + { + "epoch": 5.606414968259272, + "grad_norm": 0.018522433936595917, + "learning_rate": 4.520924580517549e-05, + "loss": 0.0659, + "num_input_tokens_seen": 61262464, + "step": 50340 + }, + { + "epoch": 5.606971823142889, + "grad_norm": 0.37267547845840454, + "learning_rate": 4.520781538392088e-05, + "loss": 0.0647, + "num_input_tokens_seen": 61268416, + "step": 50345 + }, + { + "epoch": 5.607528678026506, + "grad_norm": 0.5999749898910522, + "learning_rate": 4.520638477178727e-05, + "loss": 0.1032, + "num_input_tokens_seen": 61274624, + "step": 50350 + }, + { + "epoch": 5.608085532910124, + "grad_norm": 0.6051329970359802, + "learning_rate": 4.5204953968788156e-05, + "loss": 0.1195, + "num_input_tokens_seen": 61280768, + "step": 50355 + }, + { + "epoch": 5.608642387793741, + "grad_norm": 0.5219611525535583, + "learning_rate": 4.5203522974937066e-05, + "loss": 0.0435, + "num_input_tokens_seen": 61286656, + "step": 50360 + }, + { + "epoch": 5.6091992426773585, + "grad_norm": 0.41168564558029175, + "learning_rate": 4.520209179024752e-05, + "loss": 0.1023, + "num_input_tokens_seen": 61292992, + "step": 50365 + }, + { + "epoch": 5.609756097560975, + "grad_norm": 0.7394859194755554, + "learning_rate": 4.520066041473303e-05, + "loss": 0.0853, + "num_input_tokens_seen": 61298912, + "step": 50370 + }, + { + "epoch": 5.610312952444593, + "grad_norm": 0.2600560784339905, + "learning_rate": 4.5199228848407115e-05, + "loss": 0.0067, + "num_input_tokens_seen": 61305216, + "step": 50375 + }, + { + "epoch": 5.610869807328211, + "grad_norm": 0.36076584458351135, + "learning_rate": 4.51977970912833e-05, + "loss": 0.0217, + "num_input_tokens_seen": 61311328, + "step": 50380 + }, + { + "epoch": 5.611426662211827, + "grad_norm": 0.0030369646847248077, + "learning_rate": 4.5196365143375116e-05, + "loss": 0.0602, + "num_input_tokens_seen": 61317248, + "step": 50385 + }, + { + "epoch": 5.611983517095445, + "grad_norm": 0.012160468846559525, + "learning_rate": 4.519493300469607e-05, + "loss": 0.0453, + "num_input_tokens_seen": 61323456, + "step": 50390 + }, + { + "epoch": 5.612540371979062, + "grad_norm": 1.1431440114974976, + "learning_rate": 4.5193500675259714e-05, + "loss": 0.1457, + "num_input_tokens_seen": 61329568, + "step": 50395 + }, + { + "epoch": 5.6130972268626795, + "grad_norm": 0.002249586395919323, + "learning_rate": 4.519206815507956e-05, + "loss": 0.0865, + "num_input_tokens_seen": 61335616, + "step": 50400 + }, + { + "epoch": 5.613654081746297, + "grad_norm": 2.4299557209014893, + "learning_rate": 4.519063544416915e-05, + "loss": 0.0726, + "num_input_tokens_seen": 61341760, + "step": 50405 + }, + { + "epoch": 5.614210936629914, + "grad_norm": 0.8711107969284058, + "learning_rate": 4.5189202542542e-05, + "loss": 0.1114, + "num_input_tokens_seen": 61347872, + "step": 50410 + }, + { + "epoch": 5.614767791513532, + "grad_norm": 0.000220402012928389, + "learning_rate": 4.518776945021167e-05, + "loss": 0.0722, + "num_input_tokens_seen": 61354112, + "step": 50415 + }, + { + "epoch": 5.6153246463971485, + "grad_norm": 0.20881198346614838, + "learning_rate": 4.5186336167191676e-05, + "loss": 0.0211, + "num_input_tokens_seen": 61360288, + "step": 50420 + }, + { + "epoch": 5.615881501280766, + "grad_norm": 0.13726425170898438, + "learning_rate": 4.518490269349556e-05, + "loss": 0.0277, + "num_input_tokens_seen": 61366336, + "step": 50425 + }, + { + "epoch": 5.616438356164384, + "grad_norm": 0.03689099848270416, + "learning_rate": 4.518346902913687e-05, + "loss": 0.0214, + "num_input_tokens_seen": 61372512, + "step": 50430 + }, + { + "epoch": 5.616995211048001, + "grad_norm": 1.0779399871826172, + "learning_rate": 4.518203517412915e-05, + "loss": 0.1047, + "num_input_tokens_seen": 61378464, + "step": 50435 + }, + { + "epoch": 5.617552065931618, + "grad_norm": 0.016088761389255524, + "learning_rate": 4.5180601128485935e-05, + "loss": 0.0357, + "num_input_tokens_seen": 61384640, + "step": 50440 + }, + { + "epoch": 5.618108920815235, + "grad_norm": 0.026861879974603653, + "learning_rate": 4.517916689222077e-05, + "loss": 0.0405, + "num_input_tokens_seen": 61390592, + "step": 50445 + }, + { + "epoch": 5.618665775698853, + "grad_norm": 0.20321348309516907, + "learning_rate": 4.517773246534721e-05, + "loss": 0.0101, + "num_input_tokens_seen": 61397024, + "step": 50450 + }, + { + "epoch": 5.61922263058247, + "grad_norm": 0.3545251190662384, + "learning_rate": 4.51762978478788e-05, + "loss": 0.0981, + "num_input_tokens_seen": 61403296, + "step": 50455 + }, + { + "epoch": 5.619779485466087, + "grad_norm": 0.029999610036611557, + "learning_rate": 4.5174863039829094e-05, + "loss": 0.061, + "num_input_tokens_seen": 61409248, + "step": 50460 + }, + { + "epoch": 5.620336340349705, + "grad_norm": 0.10553870350122452, + "learning_rate": 4.5173428041211636e-05, + "loss": 0.0045, + "num_input_tokens_seen": 61415584, + "step": 50465 + }, + { + "epoch": 5.6208931952333225, + "grad_norm": 0.11259573698043823, + "learning_rate": 4.517199285203999e-05, + "loss": 0.0714, + "num_input_tokens_seen": 61422016, + "step": 50470 + }, + { + "epoch": 5.621450050116939, + "grad_norm": 0.031211387366056442, + "learning_rate": 4.517055747232771e-05, + "loss": 0.0993, + "num_input_tokens_seen": 61428224, + "step": 50475 + }, + { + "epoch": 5.622006905000557, + "grad_norm": 0.003171045333147049, + "learning_rate": 4.5169121902088366e-05, + "loss": 0.0472, + "num_input_tokens_seen": 61434688, + "step": 50480 + }, + { + "epoch": 5.622563759884175, + "grad_norm": 0.08064981549978256, + "learning_rate": 4.5167686141335483e-05, + "loss": 0.0456, + "num_input_tokens_seen": 61440992, + "step": 50485 + }, + { + "epoch": 5.6231206147677915, + "grad_norm": 0.09949405491352081, + "learning_rate": 4.516625019008266e-05, + "loss": 0.0955, + "num_input_tokens_seen": 61447104, + "step": 50490 + }, + { + "epoch": 5.623677469651409, + "grad_norm": 0.36208125948905945, + "learning_rate": 4.516481404834345e-05, + "loss": 0.0109, + "num_input_tokens_seen": 61453280, + "step": 50495 + }, + { + "epoch": 5.624234324535026, + "grad_norm": 0.5070552229881287, + "learning_rate": 4.516337771613142e-05, + "loss": 0.0452, + "num_input_tokens_seen": 61459488, + "step": 50500 + }, + { + "epoch": 5.624791179418644, + "grad_norm": 0.9733737707138062, + "learning_rate": 4.516194119346012e-05, + "loss": 0.056, + "num_input_tokens_seen": 61465728, + "step": 50505 + }, + { + "epoch": 5.625348034302261, + "grad_norm": 0.24309146404266357, + "learning_rate": 4.5160504480343135e-05, + "loss": 0.02, + "num_input_tokens_seen": 61472160, + "step": 50510 + }, + { + "epoch": 5.625904889185878, + "grad_norm": 0.055746305733919144, + "learning_rate": 4.5159067576794034e-05, + "loss": 0.0241, + "num_input_tokens_seen": 61478240, + "step": 50515 + }, + { + "epoch": 5.626461744069496, + "grad_norm": 0.1835404336452484, + "learning_rate": 4.515763048282639e-05, + "loss": 0.0434, + "num_input_tokens_seen": 61484576, + "step": 50520 + }, + { + "epoch": 5.6270185989531125, + "grad_norm": 1.1721872091293335, + "learning_rate": 4.5156193198453776e-05, + "loss": 0.0468, + "num_input_tokens_seen": 61490624, + "step": 50525 + }, + { + "epoch": 5.62757545383673, + "grad_norm": 0.023117534816265106, + "learning_rate": 4.5154755723689765e-05, + "loss": 0.0244, + "num_input_tokens_seen": 61496672, + "step": 50530 + }, + { + "epoch": 5.628132308720348, + "grad_norm": 0.08947169035673141, + "learning_rate": 4.515331805854794e-05, + "loss": 0.0754, + "num_input_tokens_seen": 61502528, + "step": 50535 + }, + { + "epoch": 5.628689163603965, + "grad_norm": 0.35605624318122864, + "learning_rate": 4.5151880203041884e-05, + "loss": 0.0493, + "num_input_tokens_seen": 61508832, + "step": 50540 + }, + { + "epoch": 5.629246018487582, + "grad_norm": 0.3198385238647461, + "learning_rate": 4.515044215718517e-05, + "loss": 0.0137, + "num_input_tokens_seen": 61515008, + "step": 50545 + }, + { + "epoch": 5.629802873371199, + "grad_norm": 1.114570140838623, + "learning_rate": 4.514900392099139e-05, + "loss": 0.1585, + "num_input_tokens_seen": 61520768, + "step": 50550 + }, + { + "epoch": 5.630359728254817, + "grad_norm": 0.06509950011968613, + "learning_rate": 4.514756549447412e-05, + "loss": 0.0147, + "num_input_tokens_seen": 61526784, + "step": 50555 + }, + { + "epoch": 5.6309165831384345, + "grad_norm": 0.04030046612024307, + "learning_rate": 4.5146126877646957e-05, + "loss": 0.0707, + "num_input_tokens_seen": 61532864, + "step": 50560 + }, + { + "epoch": 5.631473438022051, + "grad_norm": 0.2230071872472763, + "learning_rate": 4.514468807052348e-05, + "loss": 0.0363, + "num_input_tokens_seen": 61539136, + "step": 50565 + }, + { + "epoch": 5.632030292905669, + "grad_norm": 0.8186943531036377, + "learning_rate": 4.5143249073117286e-05, + "loss": 0.073, + "num_input_tokens_seen": 61545248, + "step": 50570 + }, + { + "epoch": 5.632587147789286, + "grad_norm": 0.5322305560112, + "learning_rate": 4.514180988544197e-05, + "loss": 0.1011, + "num_input_tokens_seen": 61551488, + "step": 50575 + }, + { + "epoch": 5.633144002672903, + "grad_norm": 0.19422827661037445, + "learning_rate": 4.514037050751111e-05, + "loss": 0.0175, + "num_input_tokens_seen": 61557120, + "step": 50580 + }, + { + "epoch": 5.633700857556521, + "grad_norm": 0.4845195412635803, + "learning_rate": 4.513893093933832e-05, + "loss": 0.0836, + "num_input_tokens_seen": 61563328, + "step": 50585 + }, + { + "epoch": 5.634257712440138, + "grad_norm": 0.00542863504961133, + "learning_rate": 4.5137491180937196e-05, + "loss": 0.0474, + "num_input_tokens_seen": 61569408, + "step": 50590 + }, + { + "epoch": 5.6348145673237555, + "grad_norm": 0.17503777146339417, + "learning_rate": 4.513605123232133e-05, + "loss": 0.102, + "num_input_tokens_seen": 61575616, + "step": 50595 + }, + { + "epoch": 5.635371422207372, + "grad_norm": 0.029647285118699074, + "learning_rate": 4.513461109350433e-05, + "loss": 0.0357, + "num_input_tokens_seen": 61581760, + "step": 50600 + }, + { + "epoch": 5.63592827709099, + "grad_norm": 0.05503513291478157, + "learning_rate": 4.51331707644998e-05, + "loss": 0.1381, + "num_input_tokens_seen": 61587872, + "step": 50605 + }, + { + "epoch": 5.636485131974608, + "grad_norm": 0.20219144225120544, + "learning_rate": 4.513173024532134e-05, + "loss": 0.056, + "num_input_tokens_seen": 61594336, + "step": 50610 + }, + { + "epoch": 5.637041986858224, + "grad_norm": 0.7269322276115417, + "learning_rate": 4.513028953598255e-05, + "loss": 0.0702, + "num_input_tokens_seen": 61600480, + "step": 50615 + }, + { + "epoch": 5.637598841741842, + "grad_norm": 0.05552990362048149, + "learning_rate": 4.512884863649706e-05, + "loss": 0.0864, + "num_input_tokens_seen": 61607008, + "step": 50620 + }, + { + "epoch": 5.63815569662546, + "grad_norm": 1.0331305265426636, + "learning_rate": 4.512740754687846e-05, + "loss": 0.0668, + "num_input_tokens_seen": 61612736, + "step": 50625 + }, + { + "epoch": 5.638712551509077, + "grad_norm": 0.8169506192207336, + "learning_rate": 4.5125966267140376e-05, + "loss": 0.0288, + "num_input_tokens_seen": 61618784, + "step": 50630 + }, + { + "epoch": 5.639269406392694, + "grad_norm": 0.2396720051765442, + "learning_rate": 4.512452479729641e-05, + "loss": 0.031, + "num_input_tokens_seen": 61625120, + "step": 50635 + }, + { + "epoch": 5.639826261276311, + "grad_norm": 1.0134950876235962, + "learning_rate": 4.512308313736018e-05, + "loss": 0.0305, + "num_input_tokens_seen": 61631296, + "step": 50640 + }, + { + "epoch": 5.640383116159929, + "grad_norm": 0.8789495825767517, + "learning_rate": 4.512164128734531e-05, + "loss": 0.1107, + "num_input_tokens_seen": 61637152, + "step": 50645 + }, + { + "epoch": 5.640939971043546, + "grad_norm": 0.25194576382637024, + "learning_rate": 4.5120199247265424e-05, + "loss": 0.0726, + "num_input_tokens_seen": 61643168, + "step": 50650 + }, + { + "epoch": 5.641496825927163, + "grad_norm": 0.9074164628982544, + "learning_rate": 4.511875701713413e-05, + "loss": 0.1015, + "num_input_tokens_seen": 61649184, + "step": 50655 + }, + { + "epoch": 5.642053680810781, + "grad_norm": 0.7043842077255249, + "learning_rate": 4.511731459696506e-05, + "loss": 0.0569, + "num_input_tokens_seen": 61655264, + "step": 50660 + }, + { + "epoch": 5.6426105356943985, + "grad_norm": 1.3828601837158203, + "learning_rate": 4.5115871986771835e-05, + "loss": 0.087, + "num_input_tokens_seen": 61661536, + "step": 50665 + }, + { + "epoch": 5.643167390578015, + "grad_norm": 0.521852970123291, + "learning_rate": 4.511442918656808e-05, + "loss": 0.1838, + "num_input_tokens_seen": 61667360, + "step": 50670 + }, + { + "epoch": 5.643724245461633, + "grad_norm": 0.05125647038221359, + "learning_rate": 4.5112986196367426e-05, + "loss": 0.0349, + "num_input_tokens_seen": 61673472, + "step": 50675 + }, + { + "epoch": 5.64428110034525, + "grad_norm": 1.1040575504302979, + "learning_rate": 4.511154301618351e-05, + "loss": 0.1847, + "num_input_tokens_seen": 61679712, + "step": 50680 + }, + { + "epoch": 5.644837955228867, + "grad_norm": 0.3916035294532776, + "learning_rate": 4.5110099646029946e-05, + "loss": 0.1419, + "num_input_tokens_seen": 61685920, + "step": 50685 + }, + { + "epoch": 5.645394810112485, + "grad_norm": 0.009031925350427628, + "learning_rate": 4.510865608592039e-05, + "loss": 0.0567, + "num_input_tokens_seen": 61692096, + "step": 50690 + }, + { + "epoch": 5.645951664996102, + "grad_norm": 0.03779175877571106, + "learning_rate": 4.510721233586846e-05, + "loss": 0.0666, + "num_input_tokens_seen": 61697824, + "step": 50695 + }, + { + "epoch": 5.64650851987972, + "grad_norm": 0.6819330453872681, + "learning_rate": 4.510576839588781e-05, + "loss": 0.05, + "num_input_tokens_seen": 61703968, + "step": 50700 + }, + { + "epoch": 5.647065374763336, + "grad_norm": 0.15131402015686035, + "learning_rate": 4.510432426599205e-05, + "loss": 0.0065, + "num_input_tokens_seen": 61710336, + "step": 50705 + }, + { + "epoch": 5.647622229646954, + "grad_norm": 0.0012053536484017968, + "learning_rate": 4.510287994619485e-05, + "loss": 0.0176, + "num_input_tokens_seen": 61716320, + "step": 50710 + }, + { + "epoch": 5.648179084530572, + "grad_norm": 0.30959552526474, + "learning_rate": 4.510143543650984e-05, + "loss": 0.0852, + "num_input_tokens_seen": 61722176, + "step": 50715 + }, + { + "epoch": 5.6487359394141885, + "grad_norm": 0.9405427575111389, + "learning_rate": 4.509999073695067e-05, + "loss": 0.0742, + "num_input_tokens_seen": 61728064, + "step": 50720 + }, + { + "epoch": 5.649292794297806, + "grad_norm": 1.8774343729019165, + "learning_rate": 4.5098545847530994e-05, + "loss": 0.0158, + "num_input_tokens_seen": 61734304, + "step": 50725 + }, + { + "epoch": 5.649849649181423, + "grad_norm": 1.3707029819488525, + "learning_rate": 4.509710076826443e-05, + "loss": 0.0802, + "num_input_tokens_seen": 61740160, + "step": 50730 + }, + { + "epoch": 5.650406504065041, + "grad_norm": 0.3593297302722931, + "learning_rate": 4.509565549916466e-05, + "loss": 0.0262, + "num_input_tokens_seen": 61746592, + "step": 50735 + }, + { + "epoch": 5.650963358948658, + "grad_norm": 0.037717778235673904, + "learning_rate": 4.509421004024532e-05, + "loss": 0.0665, + "num_input_tokens_seen": 61753120, + "step": 50740 + }, + { + "epoch": 5.651520213832275, + "grad_norm": 0.07271313667297363, + "learning_rate": 4.509276439152007e-05, + "loss": 0.0565, + "num_input_tokens_seen": 61759264, + "step": 50745 + }, + { + "epoch": 5.652077068715893, + "grad_norm": 0.007655106019228697, + "learning_rate": 4.509131855300256e-05, + "loss": 0.0051, + "num_input_tokens_seen": 61764896, + "step": 50750 + }, + { + "epoch": 5.65263392359951, + "grad_norm": 0.736339271068573, + "learning_rate": 4.508987252470645e-05, + "loss": 0.1278, + "num_input_tokens_seen": 61770432, + "step": 50755 + }, + { + "epoch": 5.653190778483127, + "grad_norm": 0.0032871682196855545, + "learning_rate": 4.50884263066454e-05, + "loss": 0.0381, + "num_input_tokens_seen": 61776576, + "step": 50760 + }, + { + "epoch": 5.653747633366745, + "grad_norm": 1.1603480577468872, + "learning_rate": 4.5086979898833064e-05, + "loss": 0.0595, + "num_input_tokens_seen": 61782496, + "step": 50765 + }, + { + "epoch": 5.654304488250362, + "grad_norm": 0.463225394487381, + "learning_rate": 4.508553330128311e-05, + "loss": 0.0135, + "num_input_tokens_seen": 61789024, + "step": 50770 + }, + { + "epoch": 5.654861343133979, + "grad_norm": 1.3140190839767456, + "learning_rate": 4.5084086514009204e-05, + "loss": 0.1152, + "num_input_tokens_seen": 61795136, + "step": 50775 + }, + { + "epoch": 5.655418198017596, + "grad_norm": 0.029777811840176582, + "learning_rate": 4.5082639537025015e-05, + "loss": 0.0702, + "num_input_tokens_seen": 61801248, + "step": 50780 + }, + { + "epoch": 5.655975052901214, + "grad_norm": 1.5230332612991333, + "learning_rate": 4.50811923703442e-05, + "loss": 0.0513, + "num_input_tokens_seen": 61806976, + "step": 50785 + }, + { + "epoch": 5.6565319077848315, + "grad_norm": 0.275303453207016, + "learning_rate": 4.507974501398043e-05, + "loss": 0.0514, + "num_input_tokens_seen": 61813248, + "step": 50790 + }, + { + "epoch": 5.657088762668448, + "grad_norm": 0.6663154363632202, + "learning_rate": 4.507829746794739e-05, + "loss": 0.1, + "num_input_tokens_seen": 61819520, + "step": 50795 + }, + { + "epoch": 5.657645617552066, + "grad_norm": 0.08759993314743042, + "learning_rate": 4.507684973225874e-05, + "loss": 0.0049, + "num_input_tokens_seen": 61825600, + "step": 50800 + }, + { + "epoch": 5.658202472435684, + "grad_norm": 0.006581385154277086, + "learning_rate": 4.5075401806928155e-05, + "loss": 0.1158, + "num_input_tokens_seen": 61831648, + "step": 50805 + }, + { + "epoch": 5.6587593273193, + "grad_norm": 0.8776355981826782, + "learning_rate": 4.5073953691969316e-05, + "loss": 0.0834, + "num_input_tokens_seen": 61837568, + "step": 50810 + }, + { + "epoch": 5.659316182202918, + "grad_norm": 0.015333005227148533, + "learning_rate": 4.507250538739591e-05, + "loss": 0.1391, + "num_input_tokens_seen": 61843744, + "step": 50815 + }, + { + "epoch": 5.659873037086535, + "grad_norm": 0.7461385130882263, + "learning_rate": 4.50710568932216e-05, + "loss": 0.2212, + "num_input_tokens_seen": 61850048, + "step": 50820 + }, + { + "epoch": 5.660429891970153, + "grad_norm": 0.0343383364379406, + "learning_rate": 4.506960820946008e-05, + "loss": 0.0736, + "num_input_tokens_seen": 61856224, + "step": 50825 + }, + { + "epoch": 5.66098674685377, + "grad_norm": 0.20766077935695648, + "learning_rate": 4.506815933612503e-05, + "loss": 0.0558, + "num_input_tokens_seen": 61862400, + "step": 50830 + }, + { + "epoch": 5.661543601737387, + "grad_norm": 0.06622611731290817, + "learning_rate": 4.506671027323014e-05, + "loss": 0.1312, + "num_input_tokens_seen": 61868320, + "step": 50835 + }, + { + "epoch": 5.662100456621005, + "grad_norm": 0.20141083002090454, + "learning_rate": 4.506526102078909e-05, + "loss": 0.0488, + "num_input_tokens_seen": 61874496, + "step": 50840 + }, + { + "epoch": 5.662657311504622, + "grad_norm": 0.01280286256223917, + "learning_rate": 4.506381157881558e-05, + "loss": 0.0681, + "num_input_tokens_seen": 61880960, + "step": 50845 + }, + { + "epoch": 5.663214166388239, + "grad_norm": 0.3312922418117523, + "learning_rate": 4.506236194732329e-05, + "loss": 0.073, + "num_input_tokens_seen": 61887488, + "step": 50850 + }, + { + "epoch": 5.663771021271857, + "grad_norm": 1.5780938863754272, + "learning_rate": 4.506091212632592e-05, + "loss": 0.1328, + "num_input_tokens_seen": 61893344, + "step": 50855 + }, + { + "epoch": 5.664327876155474, + "grad_norm": 0.04964100942015648, + "learning_rate": 4.505946211583716e-05, + "loss": 0.029, + "num_input_tokens_seen": 61899680, + "step": 50860 + }, + { + "epoch": 5.664884731039091, + "grad_norm": 0.2992973327636719, + "learning_rate": 4.505801191587071e-05, + "loss": 0.1284, + "num_input_tokens_seen": 61905632, + "step": 50865 + }, + { + "epoch": 5.665441585922709, + "grad_norm": 0.8980727195739746, + "learning_rate": 4.5056561526440265e-05, + "loss": 0.0809, + "num_input_tokens_seen": 61911744, + "step": 50870 + }, + { + "epoch": 5.665998440806326, + "grad_norm": 0.049667973071336746, + "learning_rate": 4.505511094755953e-05, + "loss": 0.0322, + "num_input_tokens_seen": 61917920, + "step": 50875 + }, + { + "epoch": 5.666555295689943, + "grad_norm": 0.3221934139728546, + "learning_rate": 4.505366017924221e-05, + "loss": 0.2148, + "num_input_tokens_seen": 61923840, + "step": 50880 + }, + { + "epoch": 5.66711215057356, + "grad_norm": 0.8181070685386658, + "learning_rate": 4.505220922150199e-05, + "loss": 0.1425, + "num_input_tokens_seen": 61929600, + "step": 50885 + }, + { + "epoch": 5.667669005457178, + "grad_norm": 0.037298135459423065, + "learning_rate": 4.50507580743526e-05, + "loss": 0.0682, + "num_input_tokens_seen": 61935680, + "step": 50890 + }, + { + "epoch": 5.668225860340796, + "grad_norm": 0.03507382050156593, + "learning_rate": 4.504930673780773e-05, + "loss": 0.0883, + "num_input_tokens_seen": 61942144, + "step": 50895 + }, + { + "epoch": 5.668782715224412, + "grad_norm": 0.09193625301122665, + "learning_rate": 4.5047855211881094e-05, + "loss": 0.0222, + "num_input_tokens_seen": 61948640, + "step": 50900 + }, + { + "epoch": 5.66933957010803, + "grad_norm": 0.19628341495990753, + "learning_rate": 4.50464034965864e-05, + "loss": 0.1825, + "num_input_tokens_seen": 61954208, + "step": 50905 + }, + { + "epoch": 5.669896424991647, + "grad_norm": 0.13286903500556946, + "learning_rate": 4.5044951591937367e-05, + "loss": 0.0227, + "num_input_tokens_seen": 61960480, + "step": 50910 + }, + { + "epoch": 5.6704532798752645, + "grad_norm": 0.1417999565601349, + "learning_rate": 4.504349949794771e-05, + "loss": 0.018, + "num_input_tokens_seen": 61966912, + "step": 50915 + }, + { + "epoch": 5.671010134758882, + "grad_norm": 0.45723238587379456, + "learning_rate": 4.504204721463114e-05, + "loss": 0.0964, + "num_input_tokens_seen": 61972992, + "step": 50920 + }, + { + "epoch": 5.671566989642499, + "grad_norm": 0.46831825375556946, + "learning_rate": 4.504059474200138e-05, + "loss": 0.1077, + "num_input_tokens_seen": 61978976, + "step": 50925 + }, + { + "epoch": 5.672123844526117, + "grad_norm": 1.2052007913589478, + "learning_rate": 4.503914208007214e-05, + "loss": 0.065, + "num_input_tokens_seen": 61985280, + "step": 50930 + }, + { + "epoch": 5.672680699409733, + "grad_norm": 0.00502358702942729, + "learning_rate": 4.503768922885715e-05, + "loss": 0.0503, + "num_input_tokens_seen": 61991424, + "step": 50935 + }, + { + "epoch": 5.673237554293351, + "grad_norm": 1.627539038658142, + "learning_rate": 4.503623618837013e-05, + "loss": 0.1639, + "num_input_tokens_seen": 61998016, + "step": 50940 + }, + { + "epoch": 5.673794409176969, + "grad_norm": 0.5613120794296265, + "learning_rate": 4.503478295862481e-05, + "loss": 0.0241, + "num_input_tokens_seen": 62003488, + "step": 50945 + }, + { + "epoch": 5.6743512640605855, + "grad_norm": 0.0557996965944767, + "learning_rate": 4.503332953963491e-05, + "loss": 0.0488, + "num_input_tokens_seen": 62009152, + "step": 50950 + }, + { + "epoch": 5.674908118944203, + "grad_norm": 0.9900584816932678, + "learning_rate": 4.503187593141416e-05, + "loss": 0.2013, + "num_input_tokens_seen": 62015072, + "step": 50955 + }, + { + "epoch": 5.67546497382782, + "grad_norm": 1.0002057552337646, + "learning_rate": 4.50304221339763e-05, + "loss": 0.0358, + "num_input_tokens_seen": 62021152, + "step": 50960 + }, + { + "epoch": 5.676021828711438, + "grad_norm": 1.5570577383041382, + "learning_rate": 4.502896814733505e-05, + "loss": 0.1466, + "num_input_tokens_seen": 62027424, + "step": 50965 + }, + { + "epoch": 5.676578683595055, + "grad_norm": 0.016420388594269753, + "learning_rate": 4.502751397150415e-05, + "loss": 0.0073, + "num_input_tokens_seen": 62033824, + "step": 50970 + }, + { + "epoch": 5.677135538478672, + "grad_norm": 0.028848471119999886, + "learning_rate": 4.502605960649734e-05, + "loss": 0.0644, + "num_input_tokens_seen": 62040160, + "step": 50975 + }, + { + "epoch": 5.67769239336229, + "grad_norm": 0.4742637872695923, + "learning_rate": 4.502460505232834e-05, + "loss": 0.0714, + "num_input_tokens_seen": 62046240, + "step": 50980 + }, + { + "epoch": 5.6782492482459075, + "grad_norm": 0.6183800101280212, + "learning_rate": 4.502315030901091e-05, + "loss": 0.0261, + "num_input_tokens_seen": 62052000, + "step": 50985 + }, + { + "epoch": 5.678806103129524, + "grad_norm": 0.18985618650913239, + "learning_rate": 4.5021695376558786e-05, + "loss": 0.0216, + "num_input_tokens_seen": 62058112, + "step": 50990 + }, + { + "epoch": 5.679362958013142, + "grad_norm": 1.2393666505813599, + "learning_rate": 4.50202402549857e-05, + "loss": 0.0713, + "num_input_tokens_seen": 62064000, + "step": 50995 + }, + { + "epoch": 5.679919812896759, + "grad_norm": 0.4574613571166992, + "learning_rate": 4.501878494430542e-05, + "loss": 0.0331, + "num_input_tokens_seen": 62070112, + "step": 51000 + }, + { + "epoch": 5.680476667780376, + "grad_norm": 0.13422651588916779, + "learning_rate": 4.5017329444531665e-05, + "loss": 0.0591, + "num_input_tokens_seen": 62076224, + "step": 51005 + }, + { + "epoch": 5.681033522663994, + "grad_norm": 0.9103294610977173, + "learning_rate": 4.501587375567819e-05, + "loss": 0.0599, + "num_input_tokens_seen": 62082048, + "step": 51010 + }, + { + "epoch": 5.681590377547611, + "grad_norm": 0.10286315530538559, + "learning_rate": 4.5014417877758756e-05, + "loss": 0.0221, + "num_input_tokens_seen": 62088192, + "step": 51015 + }, + { + "epoch": 5.6821472324312285, + "grad_norm": 0.6091855764389038, + "learning_rate": 4.501296181078711e-05, + "loss": 0.0754, + "num_input_tokens_seen": 62094432, + "step": 51020 + }, + { + "epoch": 5.682704087314846, + "grad_norm": 0.1825820356607437, + "learning_rate": 4.501150555477701e-05, + "loss": 0.0436, + "num_input_tokens_seen": 62100320, + "step": 51025 + }, + { + "epoch": 5.683260942198463, + "grad_norm": 0.11800801753997803, + "learning_rate": 4.501004910974221e-05, + "loss": 0.0028, + "num_input_tokens_seen": 62106496, + "step": 51030 + }, + { + "epoch": 5.683817797082081, + "grad_norm": 1.0404647588729858, + "learning_rate": 4.5008592475696454e-05, + "loss": 0.0913, + "num_input_tokens_seen": 62112928, + "step": 51035 + }, + { + "epoch": 5.6843746519656975, + "grad_norm": 1.9513449668884277, + "learning_rate": 4.500713565265352e-05, + "loss": 0.0787, + "num_input_tokens_seen": 62119072, + "step": 51040 + }, + { + "epoch": 5.684931506849315, + "grad_norm": 2.034217357635498, + "learning_rate": 4.500567864062716e-05, + "loss": 0.3496, + "num_input_tokens_seen": 62125024, + "step": 51045 + }, + { + "epoch": 5.685488361732933, + "grad_norm": 0.005681327078491449, + "learning_rate": 4.500422143963113e-05, + "loss": 0.115, + "num_input_tokens_seen": 62131360, + "step": 51050 + }, + { + "epoch": 5.68604521661655, + "grad_norm": 0.32247135043144226, + "learning_rate": 4.5002764049679204e-05, + "loss": 0.0394, + "num_input_tokens_seen": 62136800, + "step": 51055 + }, + { + "epoch": 5.686602071500167, + "grad_norm": 1.134008765220642, + "learning_rate": 4.500130647078515e-05, + "loss": 0.1466, + "num_input_tokens_seen": 62142816, + "step": 51060 + }, + { + "epoch": 5.687158926383784, + "grad_norm": 0.9033743143081665, + "learning_rate": 4.4999848702962726e-05, + "loss": 0.0845, + "num_input_tokens_seen": 62148992, + "step": 51065 + }, + { + "epoch": 5.687715781267402, + "grad_norm": 0.004023372195661068, + "learning_rate": 4.499839074622571e-05, + "loss": 0.0097, + "num_input_tokens_seen": 62155488, + "step": 51070 + }, + { + "epoch": 5.688272636151019, + "grad_norm": 1.3235245943069458, + "learning_rate": 4.499693260058787e-05, + "loss": 0.1026, + "num_input_tokens_seen": 62161536, + "step": 51075 + }, + { + "epoch": 5.688829491034636, + "grad_norm": 0.0006109371897764504, + "learning_rate": 4.499547426606298e-05, + "loss": 0.0536, + "num_input_tokens_seen": 62167584, + "step": 51080 + }, + { + "epoch": 5.689386345918254, + "grad_norm": 0.03433110937476158, + "learning_rate": 4.499401574266482e-05, + "loss": 0.0179, + "num_input_tokens_seen": 62174080, + "step": 51085 + }, + { + "epoch": 5.689943200801871, + "grad_norm": 0.4296163022518158, + "learning_rate": 4.499255703040716e-05, + "loss": 0.0947, + "num_input_tokens_seen": 62179776, + "step": 51090 + }, + { + "epoch": 5.690500055685488, + "grad_norm": 0.0988864004611969, + "learning_rate": 4.499109812930378e-05, + "loss": 0.0609, + "num_input_tokens_seen": 62185728, + "step": 51095 + }, + { + "epoch": 5.691056910569106, + "grad_norm": 0.5288938283920288, + "learning_rate": 4.498963903936846e-05, + "loss": 0.0354, + "num_input_tokens_seen": 62191776, + "step": 51100 + }, + { + "epoch": 5.691613765452723, + "grad_norm": 0.559024453163147, + "learning_rate": 4.498817976061498e-05, + "loss": 0.1048, + "num_input_tokens_seen": 62197888, + "step": 51105 + }, + { + "epoch": 5.6921706203363405, + "grad_norm": 0.03791823983192444, + "learning_rate": 4.498672029305714e-05, + "loss": 0.0155, + "num_input_tokens_seen": 62204384, + "step": 51110 + }, + { + "epoch": 5.692727475219957, + "grad_norm": 0.43840551376342773, + "learning_rate": 4.4985260636708705e-05, + "loss": 0.09, + "num_input_tokens_seen": 62210144, + "step": 51115 + }, + { + "epoch": 5.693284330103575, + "grad_norm": 0.004188166931271553, + "learning_rate": 4.4983800791583475e-05, + "loss": 0.0377, + "num_input_tokens_seen": 62216256, + "step": 51120 + }, + { + "epoch": 5.693841184987193, + "grad_norm": 0.7049245238304138, + "learning_rate": 4.498234075769523e-05, + "loss": 0.1242, + "num_input_tokens_seen": 62222464, + "step": 51125 + }, + { + "epoch": 5.694398039870809, + "grad_norm": 0.1356782466173172, + "learning_rate": 4.498088053505777e-05, + "loss": 0.0825, + "num_input_tokens_seen": 62228480, + "step": 51130 + }, + { + "epoch": 5.694954894754427, + "grad_norm": 0.9287223219871521, + "learning_rate": 4.497942012368489e-05, + "loss": 0.0673, + "num_input_tokens_seen": 62234784, + "step": 51135 + }, + { + "epoch": 5.695511749638044, + "grad_norm": 0.06346432119607925, + "learning_rate": 4.497795952359038e-05, + "loss": 0.0053, + "num_input_tokens_seen": 62240800, + "step": 51140 + }, + { + "epoch": 5.6960686045216615, + "grad_norm": 0.005633282009512186, + "learning_rate": 4.4976498734788024e-05, + "loss": 0.0298, + "num_input_tokens_seen": 62247200, + "step": 51145 + }, + { + "epoch": 5.696625459405279, + "grad_norm": 0.9986828565597534, + "learning_rate": 4.497503775729164e-05, + "loss": 0.0156, + "num_input_tokens_seen": 62253312, + "step": 51150 + }, + { + "epoch": 5.697182314288896, + "grad_norm": 0.01673155650496483, + "learning_rate": 4.497357659111502e-05, + "loss": 0.1032, + "num_input_tokens_seen": 62259040, + "step": 51155 + }, + { + "epoch": 5.697739169172514, + "grad_norm": 0.07277677208185196, + "learning_rate": 4.497211523627197e-05, + "loss": 0.0737, + "num_input_tokens_seen": 62265504, + "step": 51160 + }, + { + "epoch": 5.698296024056131, + "grad_norm": 0.85370934009552, + "learning_rate": 4.4970653692776285e-05, + "loss": 0.0787, + "num_input_tokens_seen": 62271200, + "step": 51165 + }, + { + "epoch": 5.698852878939748, + "grad_norm": 1.1421676874160767, + "learning_rate": 4.4969191960641775e-05, + "loss": 0.083, + "num_input_tokens_seen": 62277408, + "step": 51170 + }, + { + "epoch": 5.699409733823366, + "grad_norm": 0.9203903675079346, + "learning_rate": 4.496773003988226e-05, + "loss": 0.124, + "num_input_tokens_seen": 62283584, + "step": 51175 + }, + { + "epoch": 5.699966588706983, + "grad_norm": 0.2671028971672058, + "learning_rate": 4.496626793051153e-05, + "loss": 0.0139, + "num_input_tokens_seen": 62289376, + "step": 51180 + }, + { + "epoch": 5.7005234435906, + "grad_norm": 0.7168101072311401, + "learning_rate": 4.4964805632543396e-05, + "loss": 0.1724, + "num_input_tokens_seen": 62295328, + "step": 51185 + }, + { + "epoch": 5.701080298474218, + "grad_norm": 0.0006879312568344176, + "learning_rate": 4.496334314599168e-05, + "loss": 0.0243, + "num_input_tokens_seen": 62301408, + "step": 51190 + }, + { + "epoch": 5.701637153357835, + "grad_norm": 0.0022197982762008905, + "learning_rate": 4.49618804708702e-05, + "loss": 0.0297, + "num_input_tokens_seen": 62307520, + "step": 51195 + }, + { + "epoch": 5.702194008241452, + "grad_norm": 0.3619121313095093, + "learning_rate": 4.496041760719276e-05, + "loss": 0.2154, + "num_input_tokens_seen": 62313696, + "step": 51200 + }, + { + "epoch": 5.70275086312507, + "grad_norm": 0.35365334153175354, + "learning_rate": 4.495895455497319e-05, + "loss": 0.0608, + "num_input_tokens_seen": 62319968, + "step": 51205 + }, + { + "epoch": 5.703307718008687, + "grad_norm": 0.15725885331630707, + "learning_rate": 4.4957491314225296e-05, + "loss": 0.018, + "num_input_tokens_seen": 62326272, + "step": 51210 + }, + { + "epoch": 5.7038645728923045, + "grad_norm": 0.4224478006362915, + "learning_rate": 4.495602788496291e-05, + "loss": 0.0521, + "num_input_tokens_seen": 62332544, + "step": 51215 + }, + { + "epoch": 5.704421427775921, + "grad_norm": 0.14739565551280975, + "learning_rate": 4.495456426719985e-05, + "loss": 0.0383, + "num_input_tokens_seen": 62338368, + "step": 51220 + }, + { + "epoch": 5.704978282659539, + "grad_norm": 0.5174353718757629, + "learning_rate": 4.495310046094995e-05, + "loss": 0.0447, + "num_input_tokens_seen": 62344384, + "step": 51225 + }, + { + "epoch": 5.705535137543157, + "grad_norm": 0.009239706210792065, + "learning_rate": 4.495163646622702e-05, + "loss": 0.0815, + "num_input_tokens_seen": 62350400, + "step": 51230 + }, + { + "epoch": 5.7060919924267735, + "grad_norm": 0.0006689979927614331, + "learning_rate": 4.49501722830449e-05, + "loss": 0.0351, + "num_input_tokens_seen": 62356608, + "step": 51235 + }, + { + "epoch": 5.706648847310391, + "grad_norm": 0.5814047455787659, + "learning_rate": 4.4948707911417424e-05, + "loss": 0.0628, + "num_input_tokens_seen": 62362080, + "step": 51240 + }, + { + "epoch": 5.707205702194008, + "grad_norm": 0.023702217265963554, + "learning_rate": 4.4947243351358414e-05, + "loss": 0.0487, + "num_input_tokens_seen": 62368160, + "step": 51245 + }, + { + "epoch": 5.707762557077626, + "grad_norm": 0.13175897300243378, + "learning_rate": 4.4945778602881717e-05, + "loss": 0.0105, + "num_input_tokens_seen": 62374144, + "step": 51250 + }, + { + "epoch": 5.708319411961243, + "grad_norm": 0.6298719048500061, + "learning_rate": 4.494431366600116e-05, + "loss": 0.0835, + "num_input_tokens_seen": 62380128, + "step": 51255 + }, + { + "epoch": 5.70887626684486, + "grad_norm": 0.006819414906203747, + "learning_rate": 4.494284854073058e-05, + "loss": 0.0413, + "num_input_tokens_seen": 62386208, + "step": 51260 + }, + { + "epoch": 5.709433121728478, + "grad_norm": 0.21384820342063904, + "learning_rate": 4.494138322708381e-05, + "loss": 0.0882, + "num_input_tokens_seen": 62392192, + "step": 51265 + }, + { + "epoch": 5.7099899766120945, + "grad_norm": 0.36552906036376953, + "learning_rate": 4.4939917725074704e-05, + "loss": 0.0407, + "num_input_tokens_seen": 62398560, + "step": 51270 + }, + { + "epoch": 5.710546831495712, + "grad_norm": 0.38910868763923645, + "learning_rate": 4.49384520347171e-05, + "loss": 0.1267, + "num_input_tokens_seen": 62404416, + "step": 51275 + }, + { + "epoch": 5.71110368637933, + "grad_norm": 1.6887311935424805, + "learning_rate": 4.493698615602484e-05, + "loss": 0.1991, + "num_input_tokens_seen": 62410560, + "step": 51280 + }, + { + "epoch": 5.711660541262947, + "grad_norm": 0.10841520130634308, + "learning_rate": 4.493552008901177e-05, + "loss": 0.0707, + "num_input_tokens_seen": 62416480, + "step": 51285 + }, + { + "epoch": 5.712217396146564, + "grad_norm": 0.3120012879371643, + "learning_rate": 4.493405383369175e-05, + "loss": 0.0532, + "num_input_tokens_seen": 62422560, + "step": 51290 + }, + { + "epoch": 5.712774251030181, + "grad_norm": 0.0035375480074435472, + "learning_rate": 4.493258739007861e-05, + "loss": 0.0554, + "num_input_tokens_seen": 62428384, + "step": 51295 + }, + { + "epoch": 5.713331105913799, + "grad_norm": 0.5521422624588013, + "learning_rate": 4.493112075818622e-05, + "loss": 0.054, + "num_input_tokens_seen": 62433920, + "step": 51300 + }, + { + "epoch": 5.7138879607974165, + "grad_norm": 0.14973534643650055, + "learning_rate": 4.4929653938028425e-05, + "loss": 0.0238, + "num_input_tokens_seen": 62440128, + "step": 51305 + }, + { + "epoch": 5.714444815681033, + "grad_norm": 0.00869049783796072, + "learning_rate": 4.4928186929619076e-05, + "loss": 0.0242, + "num_input_tokens_seen": 62446112, + "step": 51310 + }, + { + "epoch": 5.715001670564651, + "grad_norm": 0.34877726435661316, + "learning_rate": 4.492671973297204e-05, + "loss": 0.0559, + "num_input_tokens_seen": 62451904, + "step": 51315 + }, + { + "epoch": 5.715558525448268, + "grad_norm": 0.8479699492454529, + "learning_rate": 4.492525234810118e-05, + "loss": 0.1519, + "num_input_tokens_seen": 62458080, + "step": 51320 + }, + { + "epoch": 5.716115380331885, + "grad_norm": 0.18508775532245636, + "learning_rate": 4.492378477502033e-05, + "loss": 0.0758, + "num_input_tokens_seen": 62464192, + "step": 51325 + }, + { + "epoch": 5.716672235215503, + "grad_norm": 0.0004467492981348187, + "learning_rate": 4.4922317013743376e-05, + "loss": 0.0699, + "num_input_tokens_seen": 62470432, + "step": 51330 + }, + { + "epoch": 5.71722909009912, + "grad_norm": 0.03910555690526962, + "learning_rate": 4.492084906428418e-05, + "loss": 0.1274, + "num_input_tokens_seen": 62476192, + "step": 51335 + }, + { + "epoch": 5.7177859449827375, + "grad_norm": 0.20237576961517334, + "learning_rate": 4.4919380926656607e-05, + "loss": 0.0347, + "num_input_tokens_seen": 62482144, + "step": 51340 + }, + { + "epoch": 5.718342799866355, + "grad_norm": 0.006502849515527487, + "learning_rate": 4.491791260087451e-05, + "loss": 0.0122, + "num_input_tokens_seen": 62488320, + "step": 51345 + }, + { + "epoch": 5.718899654749972, + "grad_norm": 0.018186647444963455, + "learning_rate": 4.4916444086951784e-05, + "loss": 0.1748, + "num_input_tokens_seen": 62494336, + "step": 51350 + }, + { + "epoch": 5.71945650963359, + "grad_norm": 0.1627759039402008, + "learning_rate": 4.491497538490228e-05, + "loss": 0.0335, + "num_input_tokens_seen": 62499840, + "step": 51355 + }, + { + "epoch": 5.720013364517206, + "grad_norm": 1.4399534463882446, + "learning_rate": 4.4913506494739875e-05, + "loss": 0.0406, + "num_input_tokens_seen": 62506304, + "step": 51360 + }, + { + "epoch": 5.720570219400824, + "grad_norm": 1.241578221321106, + "learning_rate": 4.491203741647845e-05, + "loss": 0.0609, + "num_input_tokens_seen": 62512448, + "step": 51365 + }, + { + "epoch": 5.721127074284442, + "grad_norm": 0.1295437067747116, + "learning_rate": 4.491056815013188e-05, + "loss": 0.0838, + "num_input_tokens_seen": 62518144, + "step": 51370 + }, + { + "epoch": 5.721683929168059, + "grad_norm": 0.14613944292068481, + "learning_rate": 4.490909869571405e-05, + "loss": 0.1073, + "num_input_tokens_seen": 62524448, + "step": 51375 + }, + { + "epoch": 5.722240784051676, + "grad_norm": 0.4721205532550812, + "learning_rate": 4.490762905323882e-05, + "loss": 0.0791, + "num_input_tokens_seen": 62530432, + "step": 51380 + }, + { + "epoch": 5.722797638935294, + "grad_norm": 0.16401362419128418, + "learning_rate": 4.490615922272008e-05, + "loss": 0.0902, + "num_input_tokens_seen": 62536704, + "step": 51385 + }, + { + "epoch": 5.723354493818911, + "grad_norm": 0.13351479172706604, + "learning_rate": 4.490468920417172e-05, + "loss": 0.0068, + "num_input_tokens_seen": 62542912, + "step": 51390 + }, + { + "epoch": 5.723911348702528, + "grad_norm": 0.011369739659130573, + "learning_rate": 4.490321899760763e-05, + "loss": 0.0199, + "num_input_tokens_seen": 62549088, + "step": 51395 + }, + { + "epoch": 5.724468203586145, + "grad_norm": 0.0006554042920470238, + "learning_rate": 4.4901748603041694e-05, + "loss": 0.024, + "num_input_tokens_seen": 62555520, + "step": 51400 + }, + { + "epoch": 5.725025058469763, + "grad_norm": 0.5775182843208313, + "learning_rate": 4.490027802048778e-05, + "loss": 0.0988, + "num_input_tokens_seen": 62561632, + "step": 51405 + }, + { + "epoch": 5.7255819133533805, + "grad_norm": 1.7849243879318237, + "learning_rate": 4.489880724995982e-05, + "loss": 0.1593, + "num_input_tokens_seen": 62567360, + "step": 51410 + }, + { + "epoch": 5.726138768236997, + "grad_norm": 1.140058994293213, + "learning_rate": 4.489733629147167e-05, + "loss": 0.0614, + "num_input_tokens_seen": 62572864, + "step": 51415 + }, + { + "epoch": 5.726695623120615, + "grad_norm": 0.14685004949569702, + "learning_rate": 4.489586514503723e-05, + "loss": 0.1394, + "num_input_tokens_seen": 62578912, + "step": 51420 + }, + { + "epoch": 5.727252478004232, + "grad_norm": 0.10689961165189743, + "learning_rate": 4.489439381067041e-05, + "loss": 0.1479, + "num_input_tokens_seen": 62584576, + "step": 51425 + }, + { + "epoch": 5.727809332887849, + "grad_norm": 0.35531964898109436, + "learning_rate": 4.489292228838511e-05, + "loss": 0.0676, + "num_input_tokens_seen": 62590752, + "step": 51430 + }, + { + "epoch": 5.728366187771467, + "grad_norm": 0.2464156597852707, + "learning_rate": 4.489145057819521e-05, + "loss": 0.05, + "num_input_tokens_seen": 62596992, + "step": 51435 + }, + { + "epoch": 5.728923042655084, + "grad_norm": 0.08879171311855316, + "learning_rate": 4.488997868011463e-05, + "loss": 0.0243, + "num_input_tokens_seen": 62603264, + "step": 51440 + }, + { + "epoch": 5.729479897538702, + "grad_norm": 0.07367993146181107, + "learning_rate": 4.4888506594157256e-05, + "loss": 0.0394, + "num_input_tokens_seen": 62609408, + "step": 51445 + }, + { + "epoch": 5.730036752422318, + "grad_norm": 0.00018964169430546463, + "learning_rate": 4.4887034320337004e-05, + "loss": 0.0125, + "num_input_tokens_seen": 62615808, + "step": 51450 + }, + { + "epoch": 5.730593607305936, + "grad_norm": 0.003003395162522793, + "learning_rate": 4.488556185866779e-05, + "loss": 0.0027, + "num_input_tokens_seen": 62621952, + "step": 51455 + }, + { + "epoch": 5.731150462189554, + "grad_norm": 0.19412168860435486, + "learning_rate": 4.4884089209163507e-05, + "loss": 0.1236, + "num_input_tokens_seen": 62627648, + "step": 51460 + }, + { + "epoch": 5.7317073170731705, + "grad_norm": 0.3410039246082306, + "learning_rate": 4.4882616371838065e-05, + "loss": 0.0319, + "num_input_tokens_seen": 62633664, + "step": 51465 + }, + { + "epoch": 5.732264171956788, + "grad_norm": 1.27003812789917, + "learning_rate": 4.488114334670539e-05, + "loss": 0.1209, + "num_input_tokens_seen": 62639744, + "step": 51470 + }, + { + "epoch": 5.732821026840405, + "grad_norm": 0.0716952383518219, + "learning_rate": 4.487967013377938e-05, + "loss": 0.0199, + "num_input_tokens_seen": 62645792, + "step": 51475 + }, + { + "epoch": 5.733377881724023, + "grad_norm": 0.5996524095535278, + "learning_rate": 4.4878196733073964e-05, + "loss": 0.111, + "num_input_tokens_seen": 62651808, + "step": 51480 + }, + { + "epoch": 5.73393473660764, + "grad_norm": 0.24569997191429138, + "learning_rate": 4.487672314460305e-05, + "loss": 0.0092, + "num_input_tokens_seen": 62657856, + "step": 51485 + }, + { + "epoch": 5.734491591491257, + "grad_norm": 0.40373244881629944, + "learning_rate": 4.487524936838056e-05, + "loss": 0.037, + "num_input_tokens_seen": 62663904, + "step": 51490 + }, + { + "epoch": 5.735048446374875, + "grad_norm": 0.50782310962677, + "learning_rate": 4.487377540442042e-05, + "loss": 0.0363, + "num_input_tokens_seen": 62670112, + "step": 51495 + }, + { + "epoch": 5.7356053012584916, + "grad_norm": 0.18411855399608612, + "learning_rate": 4.487230125273655e-05, + "loss": 0.0209, + "num_input_tokens_seen": 62676224, + "step": 51500 + }, + { + "epoch": 5.736162156142109, + "grad_norm": 0.07365730404853821, + "learning_rate": 4.487082691334287e-05, + "loss": 0.0607, + "num_input_tokens_seen": 62682496, + "step": 51505 + }, + { + "epoch": 5.736719011025727, + "grad_norm": 0.05647601559758186, + "learning_rate": 4.48693523862533e-05, + "loss": 0.0247, + "num_input_tokens_seen": 62688640, + "step": 51510 + }, + { + "epoch": 5.737275865909344, + "grad_norm": 2.806081533432007, + "learning_rate": 4.486787767148179e-05, + "loss": 0.0706, + "num_input_tokens_seen": 62694848, + "step": 51515 + }, + { + "epoch": 5.737832720792961, + "grad_norm": 1.5216785669326782, + "learning_rate": 4.486640276904226e-05, + "loss": 0.0754, + "num_input_tokens_seen": 62701056, + "step": 51520 + }, + { + "epoch": 5.738389575676579, + "grad_norm": 0.07764051854610443, + "learning_rate": 4.4864927678948636e-05, + "loss": 0.1139, + "num_input_tokens_seen": 62706848, + "step": 51525 + }, + { + "epoch": 5.738946430560196, + "grad_norm": 1.2380433082580566, + "learning_rate": 4.486345240121486e-05, + "loss": 0.0993, + "num_input_tokens_seen": 62712544, + "step": 51530 + }, + { + "epoch": 5.7395032854438135, + "grad_norm": 0.2958157956600189, + "learning_rate": 4.486197693585485e-05, + "loss": 0.016, + "num_input_tokens_seen": 62718880, + "step": 51535 + }, + { + "epoch": 5.740060140327431, + "grad_norm": 0.004933968652039766, + "learning_rate": 4.486050128288256e-05, + "loss": 0.0786, + "num_input_tokens_seen": 62725344, + "step": 51540 + }, + { + "epoch": 5.740616995211048, + "grad_norm": 0.20186690986156464, + "learning_rate": 4.485902544231192e-05, + "loss": 0.0455, + "num_input_tokens_seen": 62731552, + "step": 51545 + }, + { + "epoch": 5.741173850094666, + "grad_norm": 0.0003257339703850448, + "learning_rate": 4.485754941415688e-05, + "loss": 0.0525, + "num_input_tokens_seen": 62737760, + "step": 51550 + }, + { + "epoch": 5.741730704978282, + "grad_norm": 0.6016049385070801, + "learning_rate": 4.485607319843137e-05, + "loss": 0.0205, + "num_input_tokens_seen": 62744160, + "step": 51555 + }, + { + "epoch": 5.7422875598619, + "grad_norm": 0.5363879799842834, + "learning_rate": 4.4854596795149345e-05, + "loss": 0.021, + "num_input_tokens_seen": 62750176, + "step": 51560 + }, + { + "epoch": 5.742844414745518, + "grad_norm": 0.06409399956464767, + "learning_rate": 4.4853120204324744e-05, + "loss": 0.0282, + "num_input_tokens_seen": 62756288, + "step": 51565 + }, + { + "epoch": 5.7434012696291346, + "grad_norm": 0.02720770239830017, + "learning_rate": 4.4851643425971514e-05, + "loss": 0.0714, + "num_input_tokens_seen": 62762816, + "step": 51570 + }, + { + "epoch": 5.743958124512752, + "grad_norm": 0.8105288743972778, + "learning_rate": 4.485016646010361e-05, + "loss": 0.0317, + "num_input_tokens_seen": 62769088, + "step": 51575 + }, + { + "epoch": 5.744514979396369, + "grad_norm": 0.006385784596204758, + "learning_rate": 4.4848689306734984e-05, + "loss": 0.0413, + "num_input_tokens_seen": 62775328, + "step": 51580 + }, + { + "epoch": 5.745071834279987, + "grad_norm": 0.024236395955085754, + "learning_rate": 4.4847211965879574e-05, + "loss": 0.0129, + "num_input_tokens_seen": 62781728, + "step": 51585 + }, + { + "epoch": 5.745628689163604, + "grad_norm": 0.0008992850198410451, + "learning_rate": 4.484573443755136e-05, + "loss": 0.0783, + "num_input_tokens_seen": 62787680, + "step": 51590 + }, + { + "epoch": 5.746185544047221, + "grad_norm": 0.2091936618089676, + "learning_rate": 4.4844256721764276e-05, + "loss": 0.0914, + "num_input_tokens_seen": 62793632, + "step": 51595 + }, + { + "epoch": 5.746742398930839, + "grad_norm": 0.523067057132721, + "learning_rate": 4.4842778818532284e-05, + "loss": 0.0623, + "num_input_tokens_seen": 62800032, + "step": 51600 + }, + { + "epoch": 5.747299253814456, + "grad_norm": 0.4655187427997589, + "learning_rate": 4.484130072786936e-05, + "loss": 0.1017, + "num_input_tokens_seen": 62806016, + "step": 51605 + }, + { + "epoch": 5.747856108698073, + "grad_norm": 1.2024288177490234, + "learning_rate": 4.483982244978944e-05, + "loss": 0.3868, + "num_input_tokens_seen": 62810720, + "step": 51610 + }, + { + "epoch": 5.748412963581691, + "grad_norm": 1.0050731897354126, + "learning_rate": 4.483834398430651e-05, + "loss": 0.0833, + "num_input_tokens_seen": 62816960, + "step": 51615 + }, + { + "epoch": 5.748969818465308, + "grad_norm": 0.0007185754948295653, + "learning_rate": 4.483686533143453e-05, + "loss": 0.0229, + "num_input_tokens_seen": 62823456, + "step": 51620 + }, + { + "epoch": 5.749526673348925, + "grad_norm": 1.1326264142990112, + "learning_rate": 4.4835386491187456e-05, + "loss": 0.0832, + "num_input_tokens_seen": 62829216, + "step": 51625 + }, + { + "epoch": 5.750083528232542, + "grad_norm": 0.06208432465791702, + "learning_rate": 4.483390746357927e-05, + "loss": 0.0216, + "num_input_tokens_seen": 62835328, + "step": 51630 + }, + { + "epoch": 5.75064038311616, + "grad_norm": 0.15829728543758392, + "learning_rate": 4.4832428248623934e-05, + "loss": 0.0555, + "num_input_tokens_seen": 62840864, + "step": 51635 + }, + { + "epoch": 5.751197237999778, + "grad_norm": 0.02197035774588585, + "learning_rate": 4.483094884633543e-05, + "loss": 0.1797, + "num_input_tokens_seen": 62847008, + "step": 51640 + }, + { + "epoch": 5.751754092883394, + "grad_norm": 0.028601378202438354, + "learning_rate": 4.482946925672772e-05, + "loss": 0.0622, + "num_input_tokens_seen": 62853120, + "step": 51645 + }, + { + "epoch": 5.752310947767012, + "grad_norm": 0.0003882792661897838, + "learning_rate": 4.4827989479814784e-05, + "loss": 0.1136, + "num_input_tokens_seen": 62859168, + "step": 51650 + }, + { + "epoch": 5.752867802650629, + "grad_norm": 0.05459725111722946, + "learning_rate": 4.4826509515610605e-05, + "loss": 0.0315, + "num_input_tokens_seen": 62865248, + "step": 51655 + }, + { + "epoch": 5.7534246575342465, + "grad_norm": 0.024956990033388138, + "learning_rate": 4.4825029364129155e-05, + "loss": 0.0144, + "num_input_tokens_seen": 62871520, + "step": 51660 + }, + { + "epoch": 5.753981512417864, + "grad_norm": 0.1088387593626976, + "learning_rate": 4.482354902538443e-05, + "loss": 0.0493, + "num_input_tokens_seen": 62877760, + "step": 51665 + }, + { + "epoch": 5.754538367301481, + "grad_norm": 0.5813900232315063, + "learning_rate": 4.482206849939039e-05, + "loss": 0.0382, + "num_input_tokens_seen": 62884000, + "step": 51670 + }, + { + "epoch": 5.755095222185099, + "grad_norm": 0.8138967156410217, + "learning_rate": 4.482058778616104e-05, + "loss": 0.1318, + "num_input_tokens_seen": 62889632, + "step": 51675 + }, + { + "epoch": 5.755652077068715, + "grad_norm": 1.3831233978271484, + "learning_rate": 4.481910688571035e-05, + "loss": 0.1504, + "num_input_tokens_seen": 62894560, + "step": 51680 + }, + { + "epoch": 5.756208931952333, + "grad_norm": 2.1216163635253906, + "learning_rate": 4.481762579805232e-05, + "loss": 0.0889, + "num_input_tokens_seen": 62900736, + "step": 51685 + }, + { + "epoch": 5.756765786835951, + "grad_norm": 1.660369634628296, + "learning_rate": 4.481614452320094e-05, + "loss": 0.0746, + "num_input_tokens_seen": 62906912, + "step": 51690 + }, + { + "epoch": 5.7573226417195675, + "grad_norm": 0.08210665732622147, + "learning_rate": 4.48146630611702e-05, + "loss": 0.1135, + "num_input_tokens_seen": 62912672, + "step": 51695 + }, + { + "epoch": 5.757879496603185, + "grad_norm": 0.0010837521404027939, + "learning_rate": 4.4813181411974086e-05, + "loss": 0.1089, + "num_input_tokens_seen": 62919136, + "step": 51700 + }, + { + "epoch": 5.758436351486803, + "grad_norm": 0.23387980461120605, + "learning_rate": 4.48116995756266e-05, + "loss": 0.0966, + "num_input_tokens_seen": 62924992, + "step": 51705 + }, + { + "epoch": 5.75899320637042, + "grad_norm": 0.00864939484745264, + "learning_rate": 4.481021755214174e-05, + "loss": 0.1439, + "num_input_tokens_seen": 62931424, + "step": 51710 + }, + { + "epoch": 5.759550061254037, + "grad_norm": 0.03585858270525932, + "learning_rate": 4.480873534153351e-05, + "loss": 0.05, + "num_input_tokens_seen": 62937312, + "step": 51715 + }, + { + "epoch": 5.760106916137655, + "grad_norm": 1.4096217155456543, + "learning_rate": 4.4807252943815897e-05, + "loss": 0.0856, + "num_input_tokens_seen": 62942848, + "step": 51720 + }, + { + "epoch": 5.760663771021272, + "grad_norm": 0.34464552998542786, + "learning_rate": 4.4805770359002916e-05, + "loss": 0.0873, + "num_input_tokens_seen": 62949024, + "step": 51725 + }, + { + "epoch": 5.7612206259048895, + "grad_norm": 0.05090057849884033, + "learning_rate": 4.480428758710856e-05, + "loss": 0.1497, + "num_input_tokens_seen": 62954912, + "step": 51730 + }, + { + "epoch": 5.761777480788506, + "grad_norm": 0.0928826555609703, + "learning_rate": 4.480280462814684e-05, + "loss": 0.0519, + "num_input_tokens_seen": 62961376, + "step": 51735 + }, + { + "epoch": 5.762334335672124, + "grad_norm": 0.005389162804931402, + "learning_rate": 4.480132148213177e-05, + "loss": 0.0627, + "num_input_tokens_seen": 62967552, + "step": 51740 + }, + { + "epoch": 5.762891190555742, + "grad_norm": 0.026732059195637703, + "learning_rate": 4.479983814907735e-05, + "loss": 0.0367, + "num_input_tokens_seen": 62973024, + "step": 51745 + }, + { + "epoch": 5.763448045439358, + "grad_norm": 0.27129480242729187, + "learning_rate": 4.4798354628997595e-05, + "loss": 0.0258, + "num_input_tokens_seen": 62979232, + "step": 51750 + }, + { + "epoch": 5.764004900322976, + "grad_norm": 0.15269730985164642, + "learning_rate": 4.479687092190652e-05, + "loss": 0.0595, + "num_input_tokens_seen": 62985088, + "step": 51755 + }, + { + "epoch": 5.764561755206593, + "grad_norm": 0.05319223925471306, + "learning_rate": 4.479538702781814e-05, + "loss": 0.0405, + "num_input_tokens_seen": 62991232, + "step": 51760 + }, + { + "epoch": 5.7651186100902105, + "grad_norm": 0.003738731611520052, + "learning_rate": 4.479390294674647e-05, + "loss": 0.0485, + "num_input_tokens_seen": 62997504, + "step": 51765 + }, + { + "epoch": 5.765675464973828, + "grad_norm": 0.008500959724187851, + "learning_rate": 4.479241867870553e-05, + "loss": 0.0532, + "num_input_tokens_seen": 63003552, + "step": 51770 + }, + { + "epoch": 5.766232319857445, + "grad_norm": 1.1219985485076904, + "learning_rate": 4.479093422370933e-05, + "loss": 0.1153, + "num_input_tokens_seen": 63009504, + "step": 51775 + }, + { + "epoch": 5.766789174741063, + "grad_norm": 1.2837053537368774, + "learning_rate": 4.4789449581771904e-05, + "loss": 0.0966, + "num_input_tokens_seen": 63015456, + "step": 51780 + }, + { + "epoch": 5.7673460296246795, + "grad_norm": 0.5156721472740173, + "learning_rate": 4.478796475290727e-05, + "loss": 0.0986, + "num_input_tokens_seen": 63021216, + "step": 51785 + }, + { + "epoch": 5.767902884508297, + "grad_norm": 0.0616828016936779, + "learning_rate": 4.478647973712946e-05, + "loss": 0.0925, + "num_input_tokens_seen": 63027264, + "step": 51790 + }, + { + "epoch": 5.768459739391915, + "grad_norm": 0.0022240697871893644, + "learning_rate": 4.4784994534452497e-05, + "loss": 0.0298, + "num_input_tokens_seen": 63033504, + "step": 51795 + }, + { + "epoch": 5.769016594275532, + "grad_norm": 0.2604124844074249, + "learning_rate": 4.47835091448904e-05, + "loss": 0.0562, + "num_input_tokens_seen": 63039872, + "step": 51800 + }, + { + "epoch": 5.769573449159149, + "grad_norm": 1.4756187200546265, + "learning_rate": 4.4782023568457216e-05, + "loss": 0.0757, + "num_input_tokens_seen": 63046368, + "step": 51805 + }, + { + "epoch": 5.770130304042766, + "grad_norm": 0.4417373538017273, + "learning_rate": 4.478053780516697e-05, + "loss": 0.0757, + "num_input_tokens_seen": 63051264, + "step": 51810 + }, + { + "epoch": 5.770687158926384, + "grad_norm": 0.009549727663397789, + "learning_rate": 4.4779051855033694e-05, + "loss": 0.0122, + "num_input_tokens_seen": 63057344, + "step": 51815 + }, + { + "epoch": 5.771244013810001, + "grad_norm": 0.10730023682117462, + "learning_rate": 4.477756571807143e-05, + "loss": 0.0148, + "num_input_tokens_seen": 63063328, + "step": 51820 + }, + { + "epoch": 5.771800868693618, + "grad_norm": 1.2005319595336914, + "learning_rate": 4.477607939429421e-05, + "loss": 0.1128, + "num_input_tokens_seen": 63069408, + "step": 51825 + }, + { + "epoch": 5.772357723577236, + "grad_norm": 0.8902818560600281, + "learning_rate": 4.477459288371607e-05, + "loss": 0.1192, + "num_input_tokens_seen": 63075328, + "step": 51830 + }, + { + "epoch": 5.772914578460853, + "grad_norm": 1.0651055574417114, + "learning_rate": 4.4773106186351067e-05, + "loss": 0.064, + "num_input_tokens_seen": 63081600, + "step": 51835 + }, + { + "epoch": 5.77347143334447, + "grad_norm": 0.25904029607772827, + "learning_rate": 4.477161930221323e-05, + "loss": 0.0404, + "num_input_tokens_seen": 63087264, + "step": 51840 + }, + { + "epoch": 5.774028288228088, + "grad_norm": 0.037078242748975754, + "learning_rate": 4.477013223131661e-05, + "loss": 0.0346, + "num_input_tokens_seen": 63092960, + "step": 51845 + }, + { + "epoch": 5.774585143111705, + "grad_norm": 0.2676848769187927, + "learning_rate": 4.4768644973675246e-05, + "loss": 0.0856, + "num_input_tokens_seen": 63099104, + "step": 51850 + }, + { + "epoch": 5.7751419979953225, + "grad_norm": 0.045729003846645355, + "learning_rate": 4.4767157529303194e-05, + "loss": 0.0868, + "num_input_tokens_seen": 63104800, + "step": 51855 + }, + { + "epoch": 5.775698852878939, + "grad_norm": 0.6550860404968262, + "learning_rate": 4.4765669898214506e-05, + "loss": 0.0657, + "num_input_tokens_seen": 63111040, + "step": 51860 + }, + { + "epoch": 5.776255707762557, + "grad_norm": 0.0006734233465977013, + "learning_rate": 4.476418208042323e-05, + "loss": 0.1172, + "num_input_tokens_seen": 63117216, + "step": 51865 + }, + { + "epoch": 5.776812562646175, + "grad_norm": 0.0013543241657316685, + "learning_rate": 4.476269407594341e-05, + "loss": 0.1031, + "num_input_tokens_seen": 63123200, + "step": 51870 + }, + { + "epoch": 5.777369417529791, + "grad_norm": 0.6042733788490295, + "learning_rate": 4.476120588478912e-05, + "loss": 0.0383, + "num_input_tokens_seen": 63129152, + "step": 51875 + }, + { + "epoch": 5.777926272413409, + "grad_norm": 0.6027972102165222, + "learning_rate": 4.475971750697441e-05, + "loss": 0.0714, + "num_input_tokens_seen": 63135136, + "step": 51880 + }, + { + "epoch": 5.778483127297027, + "grad_norm": 0.5625489354133606, + "learning_rate": 4.4758228942513324e-05, + "loss": 0.0975, + "num_input_tokens_seen": 63141664, + "step": 51885 + }, + { + "epoch": 5.7790399821806435, + "grad_norm": 1.3062751293182373, + "learning_rate": 4.4756740191419946e-05, + "loss": 0.183, + "num_input_tokens_seen": 63147680, + "step": 51890 + }, + { + "epoch": 5.779596837064261, + "grad_norm": 0.004475523717701435, + "learning_rate": 4.475525125370833e-05, + "loss": 0.0252, + "num_input_tokens_seen": 63154144, + "step": 51895 + }, + { + "epoch": 5.780153691947879, + "grad_norm": 0.7523747682571411, + "learning_rate": 4.475376212939253e-05, + "loss": 0.0239, + "num_input_tokens_seen": 63160608, + "step": 51900 + }, + { + "epoch": 5.780710546831496, + "grad_norm": 0.0310075543820858, + "learning_rate": 4.475227281848663e-05, + "loss": 0.0414, + "num_input_tokens_seen": 63166752, + "step": 51905 + }, + { + "epoch": 5.781267401715113, + "grad_norm": 0.006877352949231863, + "learning_rate": 4.475078332100468e-05, + "loss": 0.0436, + "num_input_tokens_seen": 63172960, + "step": 51910 + }, + { + "epoch": 5.78182425659873, + "grad_norm": 0.02126290835440159, + "learning_rate": 4.4749293636960756e-05, + "loss": 0.0138, + "num_input_tokens_seen": 63179296, + "step": 51915 + }, + { + "epoch": 5.782381111482348, + "grad_norm": 0.47529545426368713, + "learning_rate": 4.4747803766368936e-05, + "loss": 0.017, + "num_input_tokens_seen": 63185952, + "step": 51920 + }, + { + "epoch": 5.7829379663659655, + "grad_norm": 0.06657577306032181, + "learning_rate": 4.474631370924329e-05, + "loss": 0.0779, + "num_input_tokens_seen": 63192032, + "step": 51925 + }, + { + "epoch": 5.783494821249582, + "grad_norm": 0.3347049951553345, + "learning_rate": 4.4744823465597885e-05, + "loss": 0.0638, + "num_input_tokens_seen": 63197472, + "step": 51930 + }, + { + "epoch": 5.7840516761332, + "grad_norm": 0.5620086193084717, + "learning_rate": 4.4743333035446803e-05, + "loss": 0.0169, + "num_input_tokens_seen": 63203360, + "step": 51935 + }, + { + "epoch": 5.784608531016817, + "grad_norm": 0.06770025193691254, + "learning_rate": 4.4741842418804125e-05, + "loss": 0.0403, + "num_input_tokens_seen": 63209472, + "step": 51940 + }, + { + "epoch": 5.785165385900434, + "grad_norm": 0.07315821945667267, + "learning_rate": 4.474035161568393e-05, + "loss": 0.0818, + "num_input_tokens_seen": 63215456, + "step": 51945 + }, + { + "epoch": 5.785722240784052, + "grad_norm": 0.430549681186676, + "learning_rate": 4.473886062610029e-05, + "loss": 0.1054, + "num_input_tokens_seen": 63221536, + "step": 51950 + }, + { + "epoch": 5.786279095667669, + "grad_norm": 0.9512972235679626, + "learning_rate": 4.473736945006731e-05, + "loss": 0.0707, + "num_input_tokens_seen": 63227616, + "step": 51955 + }, + { + "epoch": 5.7868359505512865, + "grad_norm": 0.15597312152385712, + "learning_rate": 4.4735878087599056e-05, + "loss": 0.0265, + "num_input_tokens_seen": 63233856, + "step": 51960 + }, + { + "epoch": 5.787392805434903, + "grad_norm": 0.4528367519378662, + "learning_rate": 4.473438653870962e-05, + "loss": 0.0364, + "num_input_tokens_seen": 63240000, + "step": 51965 + }, + { + "epoch": 5.787949660318521, + "grad_norm": 1.2588375806808472, + "learning_rate": 4.473289480341309e-05, + "loss": 0.1433, + "num_input_tokens_seen": 63246080, + "step": 51970 + }, + { + "epoch": 5.788506515202139, + "grad_norm": 0.7008630633354187, + "learning_rate": 4.473140288172356e-05, + "loss": 0.0803, + "num_input_tokens_seen": 63252352, + "step": 51975 + }, + { + "epoch": 5.7890633700857554, + "grad_norm": 0.5388645529747009, + "learning_rate": 4.472991077365513e-05, + "loss": 0.0301, + "num_input_tokens_seen": 63257408, + "step": 51980 + }, + { + "epoch": 5.789620224969373, + "grad_norm": 0.18401001393795013, + "learning_rate": 4.472841847922187e-05, + "loss": 0.0745, + "num_input_tokens_seen": 63263648, + "step": 51985 + }, + { + "epoch": 5.79017707985299, + "grad_norm": 0.2851603925228119, + "learning_rate": 4.47269259984379e-05, + "loss": 0.0162, + "num_input_tokens_seen": 63269888, + "step": 51990 + }, + { + "epoch": 5.790733934736608, + "grad_norm": 0.025384170934557915, + "learning_rate": 4.472543333131731e-05, + "loss": 0.0498, + "num_input_tokens_seen": 63276096, + "step": 51995 + }, + { + "epoch": 5.791290789620225, + "grad_norm": 0.1573118418455124, + "learning_rate": 4.472394047787419e-05, + "loss": 0.0546, + "num_input_tokens_seen": 63281984, + "step": 52000 + }, + { + "epoch": 5.791847644503842, + "grad_norm": 0.6092864274978638, + "learning_rate": 4.472244743812266e-05, + "loss": 0.1002, + "num_input_tokens_seen": 63288256, + "step": 52005 + }, + { + "epoch": 5.79240449938746, + "grad_norm": 0.13728737831115723, + "learning_rate": 4.472095421207682e-05, + "loss": 0.102, + "num_input_tokens_seen": 63294176, + "step": 52010 + }, + { + "epoch": 5.7929613542710765, + "grad_norm": 0.003396307583898306, + "learning_rate": 4.471946079975075e-05, + "loss": 0.0511, + "num_input_tokens_seen": 63299872, + "step": 52015 + }, + { + "epoch": 5.793518209154694, + "grad_norm": 0.04521363601088524, + "learning_rate": 4.4717967201158586e-05, + "loss": 0.0197, + "num_input_tokens_seen": 63306016, + "step": 52020 + }, + { + "epoch": 5.794075064038312, + "grad_norm": 0.45687395334243774, + "learning_rate": 4.4716473416314414e-05, + "loss": 0.0275, + "num_input_tokens_seen": 63312224, + "step": 52025 + }, + { + "epoch": 5.794631918921929, + "grad_norm": 1.052965760231018, + "learning_rate": 4.4714979445232356e-05, + "loss": 0.1164, + "num_input_tokens_seen": 63318048, + "step": 52030 + }, + { + "epoch": 5.795188773805546, + "grad_norm": 0.3626534640789032, + "learning_rate": 4.4713485287926526e-05, + "loss": 0.0908, + "num_input_tokens_seen": 63323904, + "step": 52035 + }, + { + "epoch": 5.795745628689164, + "grad_norm": 0.14012272655963898, + "learning_rate": 4.4711990944411034e-05, + "loss": 0.0473, + "num_input_tokens_seen": 63329792, + "step": 52040 + }, + { + "epoch": 5.796302483572781, + "grad_norm": 0.6934146285057068, + "learning_rate": 4.4710496414699986e-05, + "loss": 0.087, + "num_input_tokens_seen": 63335840, + "step": 52045 + }, + { + "epoch": 5.7968593384563984, + "grad_norm": 3.157749891281128, + "learning_rate": 4.470900169880752e-05, + "loss": 0.1, + "num_input_tokens_seen": 63342048, + "step": 52050 + }, + { + "epoch": 5.797416193340015, + "grad_norm": 0.007488538045436144, + "learning_rate": 4.470750679674773e-05, + "loss": 0.0219, + "num_input_tokens_seen": 63348256, + "step": 52055 + }, + { + "epoch": 5.797973048223633, + "grad_norm": 2.0329833030700684, + "learning_rate": 4.470601170853476e-05, + "loss": 0.0893, + "num_input_tokens_seen": 63354560, + "step": 52060 + }, + { + "epoch": 5.798529903107251, + "grad_norm": 0.09145796298980713, + "learning_rate": 4.4704516434182715e-05, + "loss": 0.0103, + "num_input_tokens_seen": 63360736, + "step": 52065 + }, + { + "epoch": 5.799086757990867, + "grad_norm": 0.33945468068122864, + "learning_rate": 4.4703020973705725e-05, + "loss": 0.0282, + "num_input_tokens_seen": 63366720, + "step": 52070 + }, + { + "epoch": 5.799643612874485, + "grad_norm": 0.00036337197525426745, + "learning_rate": 4.470152532711792e-05, + "loss": 0.1232, + "num_input_tokens_seen": 63372832, + "step": 52075 + }, + { + "epoch": 5.800200467758103, + "grad_norm": 0.013208580203354359, + "learning_rate": 4.4700029494433414e-05, + "loss": 0.0439, + "num_input_tokens_seen": 63378848, + "step": 52080 + }, + { + "epoch": 5.8007573226417195, + "grad_norm": 0.14229042828083038, + "learning_rate": 4.469853347566636e-05, + "loss": 0.1025, + "num_input_tokens_seen": 63384800, + "step": 52085 + }, + { + "epoch": 5.801314177525337, + "grad_norm": 1.842086911201477, + "learning_rate": 4.469703727083087e-05, + "loss": 0.1459, + "num_input_tokens_seen": 63390080, + "step": 52090 + }, + { + "epoch": 5.801871032408954, + "grad_norm": 0.16057755053043365, + "learning_rate": 4.469554087994108e-05, + "loss": 0.0344, + "num_input_tokens_seen": 63396128, + "step": 52095 + }, + { + "epoch": 5.802427887292572, + "grad_norm": 0.0722496509552002, + "learning_rate": 4.469404430301112e-05, + "loss": 0.0211, + "num_input_tokens_seen": 63402400, + "step": 52100 + }, + { + "epoch": 5.802984742176189, + "grad_norm": 0.925521731376648, + "learning_rate": 4.4692547540055144e-05, + "loss": 0.0421, + "num_input_tokens_seen": 63408096, + "step": 52105 + }, + { + "epoch": 5.803541597059806, + "grad_norm": 0.04821297526359558, + "learning_rate": 4.469105059108727e-05, + "loss": 0.0022, + "num_input_tokens_seen": 63414208, + "step": 52110 + }, + { + "epoch": 5.804098451943424, + "grad_norm": 0.053123943507671356, + "learning_rate": 4.468955345612165e-05, + "loss": 0.0211, + "num_input_tokens_seen": 63420256, + "step": 52115 + }, + { + "epoch": 5.804655306827041, + "grad_norm": 0.026285886764526367, + "learning_rate": 4.468805613517243e-05, + "loss": 0.0685, + "num_input_tokens_seen": 63426080, + "step": 52120 + }, + { + "epoch": 5.805212161710658, + "grad_norm": 0.6369525194168091, + "learning_rate": 4.468655862825374e-05, + "loss": 0.0839, + "num_input_tokens_seen": 63432256, + "step": 52125 + }, + { + "epoch": 5.805769016594276, + "grad_norm": 0.11078684031963348, + "learning_rate": 4.468506093537973e-05, + "loss": 0.0103, + "num_input_tokens_seen": 63438240, + "step": 52130 + }, + { + "epoch": 5.806325871477893, + "grad_norm": 0.1182989701628685, + "learning_rate": 4.468356305656455e-05, + "loss": 0.0114, + "num_input_tokens_seen": 63444128, + "step": 52135 + }, + { + "epoch": 5.80688272636151, + "grad_norm": 0.7860532999038696, + "learning_rate": 4.4682064991822345e-05, + "loss": 0.1266, + "num_input_tokens_seen": 63450144, + "step": 52140 + }, + { + "epoch": 5.807439581245127, + "grad_norm": 0.11531460285186768, + "learning_rate": 4.468056674116727e-05, + "loss": 0.0495, + "num_input_tokens_seen": 63456640, + "step": 52145 + }, + { + "epoch": 5.807996436128745, + "grad_norm": 1.1647300720214844, + "learning_rate": 4.467906830461347e-05, + "loss": 0.1002, + "num_input_tokens_seen": 63462528, + "step": 52150 + }, + { + "epoch": 5.8085532910123625, + "grad_norm": 0.3402288854122162, + "learning_rate": 4.467756968217511e-05, + "loss": 0.1289, + "num_input_tokens_seen": 63468576, + "step": 52155 + }, + { + "epoch": 5.809110145895979, + "grad_norm": 0.014884073287248611, + "learning_rate": 4.467607087386633e-05, + "loss": 0.0445, + "num_input_tokens_seen": 63474752, + "step": 52160 + }, + { + "epoch": 5.809667000779597, + "grad_norm": 0.2561732530593872, + "learning_rate": 4.46745718797013e-05, + "loss": 0.0113, + "num_input_tokens_seen": 63480864, + "step": 52165 + }, + { + "epoch": 5.810223855663214, + "grad_norm": 0.17610055208206177, + "learning_rate": 4.467307269969418e-05, + "loss": 0.1342, + "num_input_tokens_seen": 63486848, + "step": 52170 + }, + { + "epoch": 5.810780710546831, + "grad_norm": 0.7153018116950989, + "learning_rate": 4.467157333385912e-05, + "loss": 0.0929, + "num_input_tokens_seen": 63492768, + "step": 52175 + }, + { + "epoch": 5.811337565430449, + "grad_norm": 0.4736456871032715, + "learning_rate": 4.46700737822103e-05, + "loss": 0.0152, + "num_input_tokens_seen": 63498816, + "step": 52180 + }, + { + "epoch": 5.811894420314066, + "grad_norm": 0.12800906598567963, + "learning_rate": 4.4668574044761866e-05, + "loss": 0.0207, + "num_input_tokens_seen": 63504576, + "step": 52185 + }, + { + "epoch": 5.812451275197684, + "grad_norm": 0.8266920447349548, + "learning_rate": 4.4667074121527985e-05, + "loss": 0.1014, + "num_input_tokens_seen": 63510464, + "step": 52190 + }, + { + "epoch": 5.8130081300813, + "grad_norm": 0.007061781361699104, + "learning_rate": 4.466557401252284e-05, + "loss": 0.0395, + "num_input_tokens_seen": 63516416, + "step": 52195 + }, + { + "epoch": 5.813564984964918, + "grad_norm": 0.5902774333953857, + "learning_rate": 4.466407371776059e-05, + "loss": 0.083, + "num_input_tokens_seen": 63522880, + "step": 52200 + }, + { + "epoch": 5.814121839848536, + "grad_norm": 0.002572204452008009, + "learning_rate": 4.4662573237255414e-05, + "loss": 0.0756, + "num_input_tokens_seen": 63528928, + "step": 52205 + }, + { + "epoch": 5.8146786947321525, + "grad_norm": 1.64277184009552, + "learning_rate": 4.4661072571021476e-05, + "loss": 0.099, + "num_input_tokens_seen": 63535264, + "step": 52210 + }, + { + "epoch": 5.81523554961577, + "grad_norm": 0.525321364402771, + "learning_rate": 4.4659571719072956e-05, + "loss": 0.0294, + "num_input_tokens_seen": 63541728, + "step": 52215 + }, + { + "epoch": 5.815792404499388, + "grad_norm": 0.04706986993551254, + "learning_rate": 4.465807068142404e-05, + "loss": 0.0196, + "num_input_tokens_seen": 63547872, + "step": 52220 + }, + { + "epoch": 5.816349259383005, + "grad_norm": 0.0212910957634449, + "learning_rate": 4.465656945808888e-05, + "loss": 0.0301, + "num_input_tokens_seen": 63553920, + "step": 52225 + }, + { + "epoch": 5.816906114266622, + "grad_norm": 0.5843411087989807, + "learning_rate": 4.465506804908168e-05, + "loss": 0.1344, + "num_input_tokens_seen": 63559552, + "step": 52230 + }, + { + "epoch": 5.817462969150239, + "grad_norm": 0.03081885352730751, + "learning_rate": 4.465356645441662e-05, + "loss": 0.0744, + "num_input_tokens_seen": 63565888, + "step": 52235 + }, + { + "epoch": 5.818019824033857, + "grad_norm": 0.7132847309112549, + "learning_rate": 4.465206467410787e-05, + "loss": 0.0936, + "num_input_tokens_seen": 63571936, + "step": 52240 + }, + { + "epoch": 5.818576678917474, + "grad_norm": 0.5929152369499207, + "learning_rate": 4.465056270816963e-05, + "loss": 0.0246, + "num_input_tokens_seen": 63577920, + "step": 52245 + }, + { + "epoch": 5.819133533801091, + "grad_norm": 0.007885197177529335, + "learning_rate": 4.464906055661608e-05, + "loss": 0.0391, + "num_input_tokens_seen": 63584288, + "step": 52250 + }, + { + "epoch": 5.819690388684709, + "grad_norm": 0.1890588402748108, + "learning_rate": 4.464755821946141e-05, + "loss": 0.055, + "num_input_tokens_seen": 63589728, + "step": 52255 + }, + { + "epoch": 5.820247243568327, + "grad_norm": 1.190614938735962, + "learning_rate": 4.464605569671981e-05, + "loss": 0.1583, + "num_input_tokens_seen": 63595744, + "step": 52260 + }, + { + "epoch": 5.820804098451943, + "grad_norm": 0.1607791632413864, + "learning_rate": 4.4644552988405475e-05, + "loss": 0.1075, + "num_input_tokens_seen": 63602016, + "step": 52265 + }, + { + "epoch": 5.821360953335561, + "grad_norm": 0.2364228069782257, + "learning_rate": 4.46430500945326e-05, + "loss": 0.1122, + "num_input_tokens_seen": 63608128, + "step": 52270 + }, + { + "epoch": 5.821917808219178, + "grad_norm": 0.00181493884883821, + "learning_rate": 4.464154701511538e-05, + "loss": 0.1021, + "num_input_tokens_seen": 63614368, + "step": 52275 + }, + { + "epoch": 5.8224746631027955, + "grad_norm": 1.180242896080017, + "learning_rate": 4.464004375016801e-05, + "loss": 0.0529, + "num_input_tokens_seen": 63620288, + "step": 52280 + }, + { + "epoch": 5.823031517986413, + "grad_norm": 1.400699257850647, + "learning_rate": 4.463854029970469e-05, + "loss": 0.138, + "num_input_tokens_seen": 63626560, + "step": 52285 + }, + { + "epoch": 5.82358837287003, + "grad_norm": 0.05659151077270508, + "learning_rate": 4.463703666373962e-05, + "loss": 0.1089, + "num_input_tokens_seen": 63632704, + "step": 52290 + }, + { + "epoch": 5.824145227753648, + "grad_norm": 0.009531798772513866, + "learning_rate": 4.463553284228701e-05, + "loss": 0.1356, + "num_input_tokens_seen": 63638912, + "step": 52295 + }, + { + "epoch": 5.824702082637264, + "grad_norm": 0.7584513425827026, + "learning_rate": 4.463402883536107e-05, + "loss": 0.1612, + "num_input_tokens_seen": 63645120, + "step": 52300 + }, + { + "epoch": 5.825258937520882, + "grad_norm": 0.06639573723077774, + "learning_rate": 4.4632524642975984e-05, + "loss": 0.1461, + "num_input_tokens_seen": 63651136, + "step": 52305 + }, + { + "epoch": 5.8258157924045, + "grad_norm": 0.061480261385440826, + "learning_rate": 4.463102026514597e-05, + "loss": 0.0652, + "num_input_tokens_seen": 63657184, + "step": 52310 + }, + { + "epoch": 5.8263726472881165, + "grad_norm": 0.008523140102624893, + "learning_rate": 4.462951570188525e-05, + "loss": 0.1382, + "num_input_tokens_seen": 63662528, + "step": 52315 + }, + { + "epoch": 5.826929502171734, + "grad_norm": 0.4895087778568268, + "learning_rate": 4.462801095320802e-05, + "loss": 0.0221, + "num_input_tokens_seen": 63668864, + "step": 52320 + }, + { + "epoch": 5.827486357055351, + "grad_norm": 0.8039354681968689, + "learning_rate": 4.462650601912851e-05, + "loss": 0.0712, + "num_input_tokens_seen": 63675040, + "step": 52325 + }, + { + "epoch": 5.828043211938969, + "grad_norm": 0.15684659779071808, + "learning_rate": 4.462500089966092e-05, + "loss": 0.0102, + "num_input_tokens_seen": 63681152, + "step": 52330 + }, + { + "epoch": 5.828600066822586, + "grad_norm": 0.9820468425750732, + "learning_rate": 4.462349559481948e-05, + "loss": 0.0391, + "num_input_tokens_seen": 63687264, + "step": 52335 + }, + { + "epoch": 5.829156921706203, + "grad_norm": 1.7104167938232422, + "learning_rate": 4.4621990104618395e-05, + "loss": 0.1195, + "num_input_tokens_seen": 63693472, + "step": 52340 + }, + { + "epoch": 5.829713776589821, + "grad_norm": 0.4260323941707611, + "learning_rate": 4.46204844290719e-05, + "loss": 0.08, + "num_input_tokens_seen": 63699360, + "step": 52345 + }, + { + "epoch": 5.830270631473438, + "grad_norm": 0.9622470736503601, + "learning_rate": 4.4618978568194194e-05, + "loss": 0.1711, + "num_input_tokens_seen": 63704256, + "step": 52350 + }, + { + "epoch": 5.830827486357055, + "grad_norm": 0.07822739332914352, + "learning_rate": 4.461747252199953e-05, + "loss": 0.0935, + "num_input_tokens_seen": 63710336, + "step": 52355 + }, + { + "epoch": 5.831384341240673, + "grad_norm": 0.009008640423417091, + "learning_rate": 4.461596629050212e-05, + "loss": 0.004, + "num_input_tokens_seen": 63716544, + "step": 52360 + }, + { + "epoch": 5.83194119612429, + "grad_norm": 0.8000408411026001, + "learning_rate": 4.461445987371619e-05, + "loss": 0.0678, + "num_input_tokens_seen": 63723232, + "step": 52365 + }, + { + "epoch": 5.832498051007907, + "grad_norm": 0.06615381687879562, + "learning_rate": 4.4612953271655975e-05, + "loss": 0.1325, + "num_input_tokens_seen": 63729440, + "step": 52370 + }, + { + "epoch": 5.833054905891524, + "grad_norm": 0.05935303121805191, + "learning_rate": 4.46114464843357e-05, + "loss": 0.0988, + "num_input_tokens_seen": 63735424, + "step": 52375 + }, + { + "epoch": 5.833611760775142, + "grad_norm": 1.7041186094284058, + "learning_rate": 4.46099395117696e-05, + "loss": 0.1057, + "num_input_tokens_seen": 63741664, + "step": 52380 + }, + { + "epoch": 5.8341686156587595, + "grad_norm": 0.06655482202768326, + "learning_rate": 4.460843235397191e-05, + "loss": 0.0243, + "num_input_tokens_seen": 63747904, + "step": 52385 + }, + { + "epoch": 5.834725470542376, + "grad_norm": 0.29359832406044006, + "learning_rate": 4.460692501095687e-05, + "loss": 0.0527, + "num_input_tokens_seen": 63753856, + "step": 52390 + }, + { + "epoch": 5.835282325425994, + "grad_norm": 0.3248271644115448, + "learning_rate": 4.460541748273871e-05, + "loss": 0.0085, + "num_input_tokens_seen": 63760128, + "step": 52395 + }, + { + "epoch": 5.835839180309612, + "grad_norm": 0.4496997594833374, + "learning_rate": 4.4603909769331674e-05, + "loss": 0.0235, + "num_input_tokens_seen": 63766240, + "step": 52400 + }, + { + "epoch": 5.8363960351932285, + "grad_norm": 2.181809902191162, + "learning_rate": 4.460240187075001e-05, + "loss": 0.1173, + "num_input_tokens_seen": 63772544, + "step": 52405 + }, + { + "epoch": 5.836952890076846, + "grad_norm": 0.12067049741744995, + "learning_rate": 4.460089378700795e-05, + "loss": 0.0463, + "num_input_tokens_seen": 63778848, + "step": 52410 + }, + { + "epoch": 5.837509744960463, + "grad_norm": 0.002906624460592866, + "learning_rate": 4.459938551811974e-05, + "loss": 0.0668, + "num_input_tokens_seen": 63785056, + "step": 52415 + }, + { + "epoch": 5.838066599844081, + "grad_norm": 0.1255532056093216, + "learning_rate": 4.4597877064099644e-05, + "loss": 0.0567, + "num_input_tokens_seen": 63791584, + "step": 52420 + }, + { + "epoch": 5.838623454727698, + "grad_norm": 0.797447144985199, + "learning_rate": 4.459636842496189e-05, + "loss": 0.0749, + "num_input_tokens_seen": 63797536, + "step": 52425 + }, + { + "epoch": 5.839180309611315, + "grad_norm": 0.28012996912002563, + "learning_rate": 4.459485960072074e-05, + "loss": 0.1646, + "num_input_tokens_seen": 63803840, + "step": 52430 + }, + { + "epoch": 5.839737164494933, + "grad_norm": 0.19077332317829132, + "learning_rate": 4.459335059139043e-05, + "loss": 0.0615, + "num_input_tokens_seen": 63809888, + "step": 52435 + }, + { + "epoch": 5.84029401937855, + "grad_norm": 0.1294073760509491, + "learning_rate": 4.4591841396985234e-05, + "loss": 0.0437, + "num_input_tokens_seen": 63816096, + "step": 52440 + }, + { + "epoch": 5.840850874262167, + "grad_norm": 0.440394788980484, + "learning_rate": 4.45903320175194e-05, + "loss": 0.0299, + "num_input_tokens_seen": 63822496, + "step": 52445 + }, + { + "epoch": 5.841407729145785, + "grad_norm": 0.16053521633148193, + "learning_rate": 4.458882245300718e-05, + "loss": 0.0945, + "num_input_tokens_seen": 63828768, + "step": 52450 + }, + { + "epoch": 5.841964584029402, + "grad_norm": 0.29690587520599365, + "learning_rate": 4.458731270346285e-05, + "loss": 0.0503, + "num_input_tokens_seen": 63834944, + "step": 52455 + }, + { + "epoch": 5.842521438913019, + "grad_norm": 1.5358120203018188, + "learning_rate": 4.4585802768900644e-05, + "loss": 0.0691, + "num_input_tokens_seen": 63841184, + "step": 52460 + }, + { + "epoch": 5.843078293796637, + "grad_norm": 0.25351229310035706, + "learning_rate": 4.4584292649334845e-05, + "loss": 0.0782, + "num_input_tokens_seen": 63847456, + "step": 52465 + }, + { + "epoch": 5.843635148680254, + "grad_norm": 0.7419000267982483, + "learning_rate": 4.4582782344779706e-05, + "loss": 0.092, + "num_input_tokens_seen": 63853312, + "step": 52470 + }, + { + "epoch": 5.8441920035638715, + "grad_norm": 0.4805644750595093, + "learning_rate": 4.4581271855249506e-05, + "loss": 0.0754, + "num_input_tokens_seen": 63859136, + "step": 52475 + }, + { + "epoch": 5.844748858447488, + "grad_norm": 0.07035502791404724, + "learning_rate": 4.45797611807585e-05, + "loss": 0.0102, + "num_input_tokens_seen": 63865440, + "step": 52480 + }, + { + "epoch": 5.845305713331106, + "grad_norm": 0.049257542937994, + "learning_rate": 4.457825032132097e-05, + "loss": 0.0382, + "num_input_tokens_seen": 63871296, + "step": 52485 + }, + { + "epoch": 5.845862568214724, + "grad_norm": 0.3681165277957916, + "learning_rate": 4.4576739276951174e-05, + "loss": 0.0648, + "num_input_tokens_seen": 63877376, + "step": 52490 + }, + { + "epoch": 5.84641942309834, + "grad_norm": 0.27415281534194946, + "learning_rate": 4.457522804766339e-05, + "loss": 0.0093, + "num_input_tokens_seen": 63883392, + "step": 52495 + }, + { + "epoch": 5.846976277981958, + "grad_norm": 0.0032739266753196716, + "learning_rate": 4.457371663347189e-05, + "loss": 0.0116, + "num_input_tokens_seen": 63889504, + "step": 52500 + }, + { + "epoch": 5.847533132865575, + "grad_norm": 1.3813890218734741, + "learning_rate": 4.4572205034390954e-05, + "loss": 0.1169, + "num_input_tokens_seen": 63895808, + "step": 52505 + }, + { + "epoch": 5.8480899877491925, + "grad_norm": 0.5229946970939636, + "learning_rate": 4.457069325043487e-05, + "loss": 0.0449, + "num_input_tokens_seen": 63901824, + "step": 52510 + }, + { + "epoch": 5.84864684263281, + "grad_norm": 0.7756249904632568, + "learning_rate": 4.456918128161791e-05, + "loss": 0.042, + "num_input_tokens_seen": 63907776, + "step": 52515 + }, + { + "epoch": 5.849203697516427, + "grad_norm": 0.2627706527709961, + "learning_rate": 4.456766912795435e-05, + "loss": 0.043, + "num_input_tokens_seen": 63913984, + "step": 52520 + }, + { + "epoch": 5.849760552400045, + "grad_norm": 0.09871329367160797, + "learning_rate": 4.456615678945847e-05, + "loss": 0.0517, + "num_input_tokens_seen": 63920064, + "step": 52525 + }, + { + "epoch": 5.8503174072836615, + "grad_norm": 0.1520533263683319, + "learning_rate": 4.456464426614457e-05, + "loss": 0.0171, + "num_input_tokens_seen": 63926176, + "step": 52530 + }, + { + "epoch": 5.850874262167279, + "grad_norm": 0.09900067001581192, + "learning_rate": 4.456313155802693e-05, + "loss": 0.0109, + "num_input_tokens_seen": 63932416, + "step": 52535 + }, + { + "epoch": 5.851431117050897, + "grad_norm": 0.0023284228518605232, + "learning_rate": 4.4561618665119835e-05, + "loss": 0.0004, + "num_input_tokens_seen": 63938784, + "step": 52540 + }, + { + "epoch": 5.851987971934514, + "grad_norm": 0.3241943418979645, + "learning_rate": 4.4560105587437584e-05, + "loss": 0.0133, + "num_input_tokens_seen": 63945088, + "step": 52545 + }, + { + "epoch": 5.852544826818131, + "grad_norm": 1.7189770936965942, + "learning_rate": 4.455859232499446e-05, + "loss": 0.2707, + "num_input_tokens_seen": 63950720, + "step": 52550 + }, + { + "epoch": 5.853101681701748, + "grad_norm": 0.11231488734483719, + "learning_rate": 4.455707887780477e-05, + "loss": 0.1099, + "num_input_tokens_seen": 63956992, + "step": 52555 + }, + { + "epoch": 5.853658536585366, + "grad_norm": 0.0036220871843397617, + "learning_rate": 4.455556524588279e-05, + "loss": 0.2846, + "num_input_tokens_seen": 63962720, + "step": 52560 + }, + { + "epoch": 5.854215391468983, + "grad_norm": 0.23835089802742004, + "learning_rate": 4.455405142924284e-05, + "loss": 0.0284, + "num_input_tokens_seen": 63969024, + "step": 52565 + }, + { + "epoch": 5.8547722463526, + "grad_norm": 0.0797123834490776, + "learning_rate": 4.45525374278992e-05, + "loss": 0.1388, + "num_input_tokens_seen": 63975104, + "step": 52570 + }, + { + "epoch": 5.855329101236218, + "grad_norm": 0.3917103707790375, + "learning_rate": 4.4551023241866176e-05, + "loss": 0.2119, + "num_input_tokens_seen": 63980736, + "step": 52575 + }, + { + "epoch": 5.8558859561198355, + "grad_norm": 0.02295125462114811, + "learning_rate": 4.454950887115807e-05, + "loss": 0.1754, + "num_input_tokens_seen": 63986816, + "step": 52580 + }, + { + "epoch": 5.856442811003452, + "grad_norm": 0.2715420424938202, + "learning_rate": 4.45479943157892e-05, + "loss": 0.1028, + "num_input_tokens_seen": 63992768, + "step": 52585 + }, + { + "epoch": 5.85699966588707, + "grad_norm": 0.5254079699516296, + "learning_rate": 4.4546479575773865e-05, + "loss": 0.0372, + "num_input_tokens_seen": 63999104, + "step": 52590 + }, + { + "epoch": 5.857556520770687, + "grad_norm": 0.010768501088023186, + "learning_rate": 4.4544964651126366e-05, + "loss": 0.0137, + "num_input_tokens_seen": 64005248, + "step": 52595 + }, + { + "epoch": 5.8581133756543045, + "grad_norm": 0.9732950925827026, + "learning_rate": 4.4543449541861015e-05, + "loss": 0.0862, + "num_input_tokens_seen": 64011520, + "step": 52600 + }, + { + "epoch": 5.858670230537922, + "grad_norm": 1.2880176305770874, + "learning_rate": 4.4541934247992125e-05, + "loss": 0.1104, + "num_input_tokens_seen": 64017728, + "step": 52605 + }, + { + "epoch": 5.859227085421539, + "grad_norm": 0.365896075963974, + "learning_rate": 4.454041876953401e-05, + "loss": 0.0434, + "num_input_tokens_seen": 64023776, + "step": 52610 + }, + { + "epoch": 5.859783940305157, + "grad_norm": 0.0014471161412075162, + "learning_rate": 4.4538903106500986e-05, + "loss": 0.0287, + "num_input_tokens_seen": 64029920, + "step": 52615 + }, + { + "epoch": 5.860340795188774, + "grad_norm": 0.10372352600097656, + "learning_rate": 4.453738725890736e-05, + "loss": 0.0983, + "num_input_tokens_seen": 64035936, + "step": 52620 + }, + { + "epoch": 5.860897650072391, + "grad_norm": 0.030546285212039948, + "learning_rate": 4.4535871226767466e-05, + "loss": 0.035, + "num_input_tokens_seen": 64042016, + "step": 52625 + }, + { + "epoch": 5.861454504956009, + "grad_norm": 0.09786278009414673, + "learning_rate": 4.453435501009561e-05, + "loss": 0.064, + "num_input_tokens_seen": 64048160, + "step": 52630 + }, + { + "epoch": 5.8620113598396255, + "grad_norm": 0.925403892993927, + "learning_rate": 4.453283860890613e-05, + "loss": 0.0382, + "num_input_tokens_seen": 64054432, + "step": 52635 + }, + { + "epoch": 5.862568214723243, + "grad_norm": 0.8641507029533386, + "learning_rate": 4.4531322023213336e-05, + "loss": 0.0819, + "num_input_tokens_seen": 64060832, + "step": 52640 + }, + { + "epoch": 5.863125069606861, + "grad_norm": 0.0045820786617696285, + "learning_rate": 4.452980525303155e-05, + "loss": 0.0281, + "num_input_tokens_seen": 64066848, + "step": 52645 + }, + { + "epoch": 5.863681924490478, + "grad_norm": 0.1595858931541443, + "learning_rate": 4.452828829837512e-05, + "loss": 0.0091, + "num_input_tokens_seen": 64073312, + "step": 52650 + }, + { + "epoch": 5.864238779374095, + "grad_norm": 0.0004656723758671433, + "learning_rate": 4.452677115925835e-05, + "loss": 0.0557, + "num_input_tokens_seen": 64079392, + "step": 52655 + }, + { + "epoch": 5.864795634257712, + "grad_norm": 0.13629120588302612, + "learning_rate": 4.4525253835695584e-05, + "loss": 0.0922, + "num_input_tokens_seen": 64085408, + "step": 52660 + }, + { + "epoch": 5.86535248914133, + "grad_norm": 1.1285340785980225, + "learning_rate": 4.452373632770116e-05, + "loss": 0.1346, + "num_input_tokens_seen": 64091680, + "step": 52665 + }, + { + "epoch": 5.8659093440249475, + "grad_norm": 0.4199497699737549, + "learning_rate": 4.452221863528939e-05, + "loss": 0.0251, + "num_input_tokens_seen": 64097600, + "step": 52670 + }, + { + "epoch": 5.866466198908564, + "grad_norm": 0.0020166311878710985, + "learning_rate": 4.452070075847463e-05, + "loss": 0.0911, + "num_input_tokens_seen": 64103744, + "step": 52675 + }, + { + "epoch": 5.867023053792182, + "grad_norm": 0.0015213672304525971, + "learning_rate": 4.4519182697271217e-05, + "loss": 0.0044, + "num_input_tokens_seen": 64110048, + "step": 52680 + }, + { + "epoch": 5.867579908675799, + "grad_norm": 0.013331994414329529, + "learning_rate": 4.451766445169348e-05, + "loss": 0.0592, + "num_input_tokens_seen": 64115968, + "step": 52685 + }, + { + "epoch": 5.868136763559416, + "grad_norm": 0.015997035428881645, + "learning_rate": 4.451614602175577e-05, + "loss": 0.0333, + "num_input_tokens_seen": 64122048, + "step": 52690 + }, + { + "epoch": 5.868693618443034, + "grad_norm": 0.6440024971961975, + "learning_rate": 4.451462740747242e-05, + "loss": 0.0421, + "num_input_tokens_seen": 64128096, + "step": 52695 + }, + { + "epoch": 5.869250473326651, + "grad_norm": 0.5328825116157532, + "learning_rate": 4.451310860885778e-05, + "loss": 0.1964, + "num_input_tokens_seen": 64134112, + "step": 52700 + }, + { + "epoch": 5.8698073282102685, + "grad_norm": 1.1966716051101685, + "learning_rate": 4.451158962592619e-05, + "loss": 0.0608, + "num_input_tokens_seen": 64140128, + "step": 52705 + }, + { + "epoch": 5.870364183093885, + "grad_norm": 1.0302720069885254, + "learning_rate": 4.451007045869201e-05, + "loss": 0.2086, + "num_input_tokens_seen": 64146272, + "step": 52710 + }, + { + "epoch": 5.870921037977503, + "grad_norm": 0.32561710476875305, + "learning_rate": 4.4508551107169585e-05, + "loss": 0.0502, + "num_input_tokens_seen": 64152352, + "step": 52715 + }, + { + "epoch": 5.871477892861121, + "grad_norm": 1.180757761001587, + "learning_rate": 4.450703157137326e-05, + "loss": 0.038, + "num_input_tokens_seen": 64158400, + "step": 52720 + }, + { + "epoch": 5.872034747744737, + "grad_norm": 0.29859423637390137, + "learning_rate": 4.4505511851317395e-05, + "loss": 0.0707, + "num_input_tokens_seen": 64164416, + "step": 52725 + }, + { + "epoch": 5.872591602628355, + "grad_norm": 0.4424380362033844, + "learning_rate": 4.4503991947016345e-05, + "loss": 0.1369, + "num_input_tokens_seen": 64170752, + "step": 52730 + }, + { + "epoch": 5.873148457511972, + "grad_norm": 0.26210907101631165, + "learning_rate": 4.4502471858484464e-05, + "loss": 0.031, + "num_input_tokens_seen": 64177184, + "step": 52735 + }, + { + "epoch": 5.87370531239559, + "grad_norm": 0.3705948293209076, + "learning_rate": 4.450095158573611e-05, + "loss": 0.018, + "num_input_tokens_seen": 64183488, + "step": 52740 + }, + { + "epoch": 5.874262167279207, + "grad_norm": 0.4712621569633484, + "learning_rate": 4.449943112878565e-05, + "loss": 0.0348, + "num_input_tokens_seen": 64188928, + "step": 52745 + }, + { + "epoch": 5.874819022162824, + "grad_norm": 0.9458960890769958, + "learning_rate": 4.449791048764743e-05, + "loss": 0.0991, + "num_input_tokens_seen": 64195072, + "step": 52750 + }, + { + "epoch": 5.875375877046442, + "grad_norm": 0.027196558192372322, + "learning_rate": 4.449638966233583e-05, + "loss": 0.0317, + "num_input_tokens_seen": 64201184, + "step": 52755 + }, + { + "epoch": 5.875932731930059, + "grad_norm": 1.0762231349945068, + "learning_rate": 4.4494868652865205e-05, + "loss": 0.0707, + "num_input_tokens_seen": 64207456, + "step": 52760 + }, + { + "epoch": 5.876489586813676, + "grad_norm": 0.18655966222286224, + "learning_rate": 4.449334745924993e-05, + "loss": 0.0323, + "num_input_tokens_seen": 64213824, + "step": 52765 + }, + { + "epoch": 5.877046441697294, + "grad_norm": 0.06330977380275726, + "learning_rate": 4.4491826081504374e-05, + "loss": 0.2291, + "num_input_tokens_seen": 64219808, + "step": 52770 + }, + { + "epoch": 5.877603296580911, + "grad_norm": 0.35282406210899353, + "learning_rate": 4.44903045196429e-05, + "loss": 0.0537, + "num_input_tokens_seen": 64226144, + "step": 52775 + }, + { + "epoch": 5.878160151464528, + "grad_norm": 0.022313997149467468, + "learning_rate": 4.4488782773679885e-05, + "loss": 0.0201, + "num_input_tokens_seen": 64232320, + "step": 52780 + }, + { + "epoch": 5.878717006348146, + "grad_norm": 0.6915353536605835, + "learning_rate": 4.44872608436297e-05, + "loss": 0.0479, + "num_input_tokens_seen": 64238688, + "step": 52785 + }, + { + "epoch": 5.879273861231763, + "grad_norm": 0.11070144176483154, + "learning_rate": 4.448573872950672e-05, + "loss": 0.0937, + "num_input_tokens_seen": 64244640, + "step": 52790 + }, + { + "epoch": 5.87983071611538, + "grad_norm": 0.3790154755115509, + "learning_rate": 4.448421643132534e-05, + "loss": 0.0201, + "num_input_tokens_seen": 64250816, + "step": 52795 + }, + { + "epoch": 5.880387570998998, + "grad_norm": 0.26542234420776367, + "learning_rate": 4.448269394909992e-05, + "loss": 0.0171, + "num_input_tokens_seen": 64257120, + "step": 52800 + }, + { + "epoch": 5.880944425882615, + "grad_norm": 1.0258985757827759, + "learning_rate": 4.448117128284484e-05, + "loss": 0.0576, + "num_input_tokens_seen": 64263168, + "step": 52805 + }, + { + "epoch": 5.881501280766233, + "grad_norm": 0.2704048752784729, + "learning_rate": 4.447964843257449e-05, + "loss": 0.0702, + "num_input_tokens_seen": 64269568, + "step": 52810 + }, + { + "epoch": 5.882058135649849, + "grad_norm": 0.08867430686950684, + "learning_rate": 4.447812539830326e-05, + "loss": 0.0228, + "num_input_tokens_seen": 64275712, + "step": 52815 + }, + { + "epoch": 5.882614990533467, + "grad_norm": 0.0977790430188179, + "learning_rate": 4.4476602180045526e-05, + "loss": 0.0678, + "num_input_tokens_seen": 64281696, + "step": 52820 + }, + { + "epoch": 5.883171845417085, + "grad_norm": 0.020967451855540276, + "learning_rate": 4.4475078777815675e-05, + "loss": 0.0698, + "num_input_tokens_seen": 64287904, + "step": 52825 + }, + { + "epoch": 5.8837287003007015, + "grad_norm": 0.38845160603523254, + "learning_rate": 4.447355519162812e-05, + "loss": 0.0289, + "num_input_tokens_seen": 64294080, + "step": 52830 + }, + { + "epoch": 5.884285555184319, + "grad_norm": 0.7000799179077148, + "learning_rate": 4.447203142149721e-05, + "loss": 0.0441, + "num_input_tokens_seen": 64300224, + "step": 52835 + }, + { + "epoch": 5.884842410067936, + "grad_norm": 0.0010129597503691912, + "learning_rate": 4.4470507467437375e-05, + "loss": 0.0825, + "num_input_tokens_seen": 64306624, + "step": 52840 + }, + { + "epoch": 5.885399264951554, + "grad_norm": 0.3104235529899597, + "learning_rate": 4.4468983329462996e-05, + "loss": 0.0852, + "num_input_tokens_seen": 64312544, + "step": 52845 + }, + { + "epoch": 5.885956119835171, + "grad_norm": 0.6379266977310181, + "learning_rate": 4.4467459007588475e-05, + "loss": 0.015, + "num_input_tokens_seen": 64318784, + "step": 52850 + }, + { + "epoch": 5.886512974718788, + "grad_norm": 0.5639986395835876, + "learning_rate": 4.44659345018282e-05, + "loss": 0.0426, + "num_input_tokens_seen": 64325056, + "step": 52855 + }, + { + "epoch": 5.887069829602406, + "grad_norm": 0.2507922947406769, + "learning_rate": 4.446440981219658e-05, + "loss": 0.0565, + "num_input_tokens_seen": 64330656, + "step": 52860 + }, + { + "epoch": 5.8876266844860226, + "grad_norm": 0.059553515166044235, + "learning_rate": 4.4462884938708015e-05, + "loss": 0.0497, + "num_input_tokens_seen": 64336320, + "step": 52865 + }, + { + "epoch": 5.88818353936964, + "grad_norm": 0.045285772532224655, + "learning_rate": 4.446135988137691e-05, + "loss": 0.0663, + "num_input_tokens_seen": 64342496, + "step": 52870 + }, + { + "epoch": 5.888740394253258, + "grad_norm": 0.10601040720939636, + "learning_rate": 4.4459834640217667e-05, + "loss": 0.0909, + "num_input_tokens_seen": 64348384, + "step": 52875 + }, + { + "epoch": 5.889297249136875, + "grad_norm": 0.1932552307844162, + "learning_rate": 4.44583092152447e-05, + "loss": 0.0149, + "num_input_tokens_seen": 64355008, + "step": 52880 + }, + { + "epoch": 5.889854104020492, + "grad_norm": 0.01396115217357874, + "learning_rate": 4.445678360647241e-05, + "loss": 0.0144, + "num_input_tokens_seen": 64361408, + "step": 52885 + }, + { + "epoch": 5.890410958904109, + "grad_norm": 0.09923572093248367, + "learning_rate": 4.445525781391521e-05, + "loss": 0.0299, + "num_input_tokens_seen": 64367680, + "step": 52890 + }, + { + "epoch": 5.890967813787727, + "grad_norm": 0.080547995865345, + "learning_rate": 4.445373183758751e-05, + "loss": 0.0767, + "num_input_tokens_seen": 64373952, + "step": 52895 + }, + { + "epoch": 5.8915246686713445, + "grad_norm": 0.2557930052280426, + "learning_rate": 4.445220567750372e-05, + "loss": 0.1786, + "num_input_tokens_seen": 64379776, + "step": 52900 + }, + { + "epoch": 5.892081523554961, + "grad_norm": 0.6537120938301086, + "learning_rate": 4.4450679333678276e-05, + "loss": 0.2304, + "num_input_tokens_seen": 64385984, + "step": 52905 + }, + { + "epoch": 5.892638378438579, + "grad_norm": 0.8296215534210205, + "learning_rate": 4.444915280612557e-05, + "loss": 0.1092, + "num_input_tokens_seen": 64392288, + "step": 52910 + }, + { + "epoch": 5.893195233322196, + "grad_norm": 3.1871910095214844, + "learning_rate": 4.444762609486004e-05, + "loss": 0.1161, + "num_input_tokens_seen": 64398528, + "step": 52915 + }, + { + "epoch": 5.893752088205813, + "grad_norm": 0.8493650555610657, + "learning_rate": 4.444609919989611e-05, + "loss": 0.0691, + "num_input_tokens_seen": 64404576, + "step": 52920 + }, + { + "epoch": 5.894308943089431, + "grad_norm": 0.04248112440109253, + "learning_rate": 4.444457212124818e-05, + "loss": 0.0718, + "num_input_tokens_seen": 64410752, + "step": 52925 + }, + { + "epoch": 5.894865797973048, + "grad_norm": 0.3660855293273926, + "learning_rate": 4.444304485893069e-05, + "loss": 0.0192, + "num_input_tokens_seen": 64416800, + "step": 52930 + }, + { + "epoch": 5.895422652856666, + "grad_norm": 0.08694750815629959, + "learning_rate": 4.444151741295807e-05, + "loss": 0.1033, + "num_input_tokens_seen": 64421920, + "step": 52935 + }, + { + "epoch": 5.895979507740283, + "grad_norm": 0.3068024218082428, + "learning_rate": 4.443998978334473e-05, + "loss": 0.0317, + "num_input_tokens_seen": 64427904, + "step": 52940 + }, + { + "epoch": 5.8965363626239, + "grad_norm": 0.15386046469211578, + "learning_rate": 4.443846197010512e-05, + "loss": 0.0295, + "num_input_tokens_seen": 64434112, + "step": 52945 + }, + { + "epoch": 5.897093217507518, + "grad_norm": 0.002161214826628566, + "learning_rate": 4.4436933973253655e-05, + "loss": 0.0861, + "num_input_tokens_seen": 64440160, + "step": 52950 + }, + { + "epoch": 5.897650072391135, + "grad_norm": 3.64086651802063, + "learning_rate": 4.4435405792804785e-05, + "loss": 0.1817, + "num_input_tokens_seen": 64446272, + "step": 52955 + }, + { + "epoch": 5.898206927274752, + "grad_norm": 0.028691204264760017, + "learning_rate": 4.443387742877293e-05, + "loss": 0.059, + "num_input_tokens_seen": 64452512, + "step": 52960 + }, + { + "epoch": 5.89876378215837, + "grad_norm": 0.0023798360489308834, + "learning_rate": 4.443234888117254e-05, + "loss": 0.0434, + "num_input_tokens_seen": 64458496, + "step": 52965 + }, + { + "epoch": 5.899320637041987, + "grad_norm": 0.010964030399918556, + "learning_rate": 4.4430820150018035e-05, + "loss": 0.0307, + "num_input_tokens_seen": 64464608, + "step": 52970 + }, + { + "epoch": 5.899877491925604, + "grad_norm": 1.2158160209655762, + "learning_rate": 4.442929123532388e-05, + "loss": 0.0955, + "num_input_tokens_seen": 64470592, + "step": 52975 + }, + { + "epoch": 5.900434346809222, + "grad_norm": 1.0901310443878174, + "learning_rate": 4.442776213710449e-05, + "loss": 0.1016, + "num_input_tokens_seen": 64476768, + "step": 52980 + }, + { + "epoch": 5.900991201692839, + "grad_norm": 0.007598627358675003, + "learning_rate": 4.442623285537433e-05, + "loss": 0.0082, + "num_input_tokens_seen": 64483104, + "step": 52985 + }, + { + "epoch": 5.901548056576456, + "grad_norm": 0.34501734375953674, + "learning_rate": 4.442470339014783e-05, + "loss": 0.0267, + "num_input_tokens_seen": 64489280, + "step": 52990 + }, + { + "epoch": 5.902104911460073, + "grad_norm": 0.09330949187278748, + "learning_rate": 4.442317374143945e-05, + "loss": 0.1475, + "num_input_tokens_seen": 64494848, + "step": 52995 + }, + { + "epoch": 5.902661766343691, + "grad_norm": 0.6860999464988708, + "learning_rate": 4.442164390926363e-05, + "loss": 0.0978, + "num_input_tokens_seen": 64501120, + "step": 53000 + }, + { + "epoch": 5.903218621227309, + "grad_norm": 0.49108025431632996, + "learning_rate": 4.442011389363482e-05, + "loss": 0.0401, + "num_input_tokens_seen": 64507232, + "step": 53005 + }, + { + "epoch": 5.903775476110925, + "grad_norm": 0.0011059679090976715, + "learning_rate": 4.441858369456747e-05, + "loss": 0.0964, + "num_input_tokens_seen": 64513376, + "step": 53010 + }, + { + "epoch": 5.904332330994543, + "grad_norm": 1.3839517831802368, + "learning_rate": 4.441705331207605e-05, + "loss": 0.0431, + "num_input_tokens_seen": 64519296, + "step": 53015 + }, + { + "epoch": 5.90488918587816, + "grad_norm": 0.001475121476687491, + "learning_rate": 4.4415522746174995e-05, + "loss": 0.012, + "num_input_tokens_seen": 64525440, + "step": 53020 + }, + { + "epoch": 5.9054460407617775, + "grad_norm": 0.6225030422210693, + "learning_rate": 4.441399199687878e-05, + "loss": 0.043, + "num_input_tokens_seen": 64532032, + "step": 53025 + }, + { + "epoch": 5.906002895645395, + "grad_norm": 0.11270467191934586, + "learning_rate": 4.441246106420186e-05, + "loss": 0.0415, + "num_input_tokens_seen": 64538464, + "step": 53030 + }, + { + "epoch": 5.906559750529012, + "grad_norm": 0.6826905608177185, + "learning_rate": 4.4410929948158684e-05, + "loss": 0.0566, + "num_input_tokens_seen": 64543680, + "step": 53035 + }, + { + "epoch": 5.90711660541263, + "grad_norm": 0.041594333946704865, + "learning_rate": 4.440939864876373e-05, + "loss": 0.0489, + "num_input_tokens_seen": 64550048, + "step": 53040 + }, + { + "epoch": 5.907673460296246, + "grad_norm": 0.5499979853630066, + "learning_rate": 4.440786716603145e-05, + "loss": 0.1509, + "num_input_tokens_seen": 64556128, + "step": 53045 + }, + { + "epoch": 5.908230315179864, + "grad_norm": 0.030266648158431053, + "learning_rate": 4.440633549997631e-05, + "loss": 0.0218, + "num_input_tokens_seen": 64562496, + "step": 53050 + }, + { + "epoch": 5.908787170063482, + "grad_norm": 1.1669431924819946, + "learning_rate": 4.440480365061279e-05, + "loss": 0.0556, + "num_input_tokens_seen": 64568672, + "step": 53055 + }, + { + "epoch": 5.9093440249470985, + "grad_norm": 0.17197208106517792, + "learning_rate": 4.4403271617955356e-05, + "loss": 0.0172, + "num_input_tokens_seen": 64574848, + "step": 53060 + }, + { + "epoch": 5.909900879830716, + "grad_norm": 0.006635593716055155, + "learning_rate": 4.440173940201847e-05, + "loss": 0.0209, + "num_input_tokens_seen": 64581344, + "step": 53065 + }, + { + "epoch": 5.910457734714333, + "grad_norm": 0.24807555973529816, + "learning_rate": 4.4400207002816616e-05, + "loss": 0.0228, + "num_input_tokens_seen": 64587488, + "step": 53070 + }, + { + "epoch": 5.911014589597951, + "grad_norm": 0.6638931035995483, + "learning_rate": 4.439867442036426e-05, + "loss": 0.0238, + "num_input_tokens_seen": 64593824, + "step": 53075 + }, + { + "epoch": 5.911571444481568, + "grad_norm": 0.129277765750885, + "learning_rate": 4.439714165467588e-05, + "loss": 0.0676, + "num_input_tokens_seen": 64599072, + "step": 53080 + }, + { + "epoch": 5.912128299365185, + "grad_norm": 0.11008351296186447, + "learning_rate": 4.439560870576596e-05, + "loss": 0.0061, + "num_input_tokens_seen": 64605216, + "step": 53085 + }, + { + "epoch": 5.912685154248803, + "grad_norm": 0.01531117781996727, + "learning_rate": 4.439407557364897e-05, + "loss": 0.0551, + "num_input_tokens_seen": 64611360, + "step": 53090 + }, + { + "epoch": 5.91324200913242, + "grad_norm": 1.0210962295532227, + "learning_rate": 4.439254225833941e-05, + "loss": 0.0777, + "num_input_tokens_seen": 64617344, + "step": 53095 + }, + { + "epoch": 5.913798864016037, + "grad_norm": 0.03928440809249878, + "learning_rate": 4.439100875985174e-05, + "loss": 0.024, + "num_input_tokens_seen": 64623104, + "step": 53100 + }, + { + "epoch": 5.914355718899655, + "grad_norm": 0.047674499452114105, + "learning_rate": 4.438947507820046e-05, + "loss": 0.062, + "num_input_tokens_seen": 64628992, + "step": 53105 + }, + { + "epoch": 5.914912573783272, + "grad_norm": 0.0018467726185917854, + "learning_rate": 4.438794121340006e-05, + "loss": 0.0168, + "num_input_tokens_seen": 64635328, + "step": 53110 + }, + { + "epoch": 5.915469428666889, + "grad_norm": 0.12250792980194092, + "learning_rate": 4.438640716546502e-05, + "loss": 0.0279, + "num_input_tokens_seen": 64641408, + "step": 53115 + }, + { + "epoch": 5.916026283550507, + "grad_norm": 0.03505168482661247, + "learning_rate": 4.438487293440982e-05, + "loss": 0.0312, + "num_input_tokens_seen": 64647808, + "step": 53120 + }, + { + "epoch": 5.916583138434124, + "grad_norm": 1.6057918071746826, + "learning_rate": 4.4383338520248974e-05, + "loss": 0.1089, + "num_input_tokens_seen": 64653728, + "step": 53125 + }, + { + "epoch": 5.9171399933177415, + "grad_norm": 0.3046475052833557, + "learning_rate": 4.438180392299697e-05, + "loss": 0.0312, + "num_input_tokens_seen": 64659808, + "step": 53130 + }, + { + "epoch": 5.917696848201359, + "grad_norm": 1.1222387552261353, + "learning_rate": 4.438026914266829e-05, + "loss": 0.0799, + "num_input_tokens_seen": 64666176, + "step": 53135 + }, + { + "epoch": 5.918253703084976, + "grad_norm": 1.7229076623916626, + "learning_rate": 4.437873417927744e-05, + "loss": 0.0877, + "num_input_tokens_seen": 64672480, + "step": 53140 + }, + { + "epoch": 5.918810557968594, + "grad_norm": 0.007457773666828871, + "learning_rate": 4.437719903283893e-05, + "loss": 0.0802, + "num_input_tokens_seen": 64678784, + "step": 53145 + }, + { + "epoch": 5.9193674128522105, + "grad_norm": 0.03235355764627457, + "learning_rate": 4.437566370336724e-05, + "loss": 0.0758, + "num_input_tokens_seen": 64685024, + "step": 53150 + }, + { + "epoch": 5.919924267735828, + "grad_norm": 1.1486064195632935, + "learning_rate": 4.437412819087689e-05, + "loss": 0.0379, + "num_input_tokens_seen": 64691104, + "step": 53155 + }, + { + "epoch": 5.920481122619446, + "grad_norm": 0.1111784428358078, + "learning_rate": 4.437259249538237e-05, + "loss": 0.0104, + "num_input_tokens_seen": 64696800, + "step": 53160 + }, + { + "epoch": 5.921037977503063, + "grad_norm": 0.15342848002910614, + "learning_rate": 4.43710566168982e-05, + "loss": 0.011, + "num_input_tokens_seen": 64702592, + "step": 53165 + }, + { + "epoch": 5.92159483238668, + "grad_norm": 0.46488386392593384, + "learning_rate": 4.4369520555438884e-05, + "loss": 0.0623, + "num_input_tokens_seen": 64708672, + "step": 53170 + }, + { + "epoch": 5.922151687270297, + "grad_norm": 0.02023196965456009, + "learning_rate": 4.436798431101892e-05, + "loss": 0.0237, + "num_input_tokens_seen": 64715104, + "step": 53175 + }, + { + "epoch": 5.922708542153915, + "grad_norm": 0.33829113841056824, + "learning_rate": 4.436644788365283e-05, + "loss": 0.0718, + "num_input_tokens_seen": 64721376, + "step": 53180 + }, + { + "epoch": 5.923265397037532, + "grad_norm": 0.18054844439029694, + "learning_rate": 4.436491127335511e-05, + "loss": 0.0253, + "num_input_tokens_seen": 64727520, + "step": 53185 + }, + { + "epoch": 5.923822251921149, + "grad_norm": 0.1015540286898613, + "learning_rate": 4.4363374480140306e-05, + "loss": 0.0394, + "num_input_tokens_seen": 64733792, + "step": 53190 + }, + { + "epoch": 5.924379106804767, + "grad_norm": 0.18330271542072296, + "learning_rate": 4.436183750402291e-05, + "loss": 0.0246, + "num_input_tokens_seen": 64740064, + "step": 53195 + }, + { + "epoch": 5.924935961688384, + "grad_norm": 0.02575405314564705, + "learning_rate": 4.436030034501745e-05, + "loss": 0.0153, + "num_input_tokens_seen": 64746368, + "step": 53200 + }, + { + "epoch": 5.925492816572001, + "grad_norm": 1.6729583740234375, + "learning_rate": 4.435876300313844e-05, + "loss": 0.0364, + "num_input_tokens_seen": 64752352, + "step": 53205 + }, + { + "epoch": 5.926049671455619, + "grad_norm": 0.9601722359657288, + "learning_rate": 4.43572254784004e-05, + "loss": 0.145, + "num_input_tokens_seen": 64758656, + "step": 53210 + }, + { + "epoch": 5.926606526339236, + "grad_norm": 0.1996845006942749, + "learning_rate": 4.4355687770817855e-05, + "loss": 0.09, + "num_input_tokens_seen": 64764704, + "step": 53215 + }, + { + "epoch": 5.9271633812228535, + "grad_norm": 0.03973991423845291, + "learning_rate": 4.435414988040534e-05, + "loss": 0.0293, + "num_input_tokens_seen": 64770848, + "step": 53220 + }, + { + "epoch": 5.92772023610647, + "grad_norm": 0.13286137580871582, + "learning_rate": 4.435261180717737e-05, + "loss": 0.097, + "num_input_tokens_seen": 64777152, + "step": 53225 + }, + { + "epoch": 5.928277090990088, + "grad_norm": 0.049384646117687225, + "learning_rate": 4.435107355114847e-05, + "loss": 0.1384, + "num_input_tokens_seen": 64782560, + "step": 53230 + }, + { + "epoch": 5.928833945873706, + "grad_norm": 0.20452149212360382, + "learning_rate": 4.4349535112333186e-05, + "loss": 0.0379, + "num_input_tokens_seen": 64788736, + "step": 53235 + }, + { + "epoch": 5.929390800757322, + "grad_norm": 1.6609686613082886, + "learning_rate": 4.434799649074603e-05, + "loss": 0.2026, + "num_input_tokens_seen": 64794976, + "step": 53240 + }, + { + "epoch": 5.92994765564094, + "grad_norm": 0.06197470426559448, + "learning_rate": 4.4346457686401553e-05, + "loss": 0.0292, + "num_input_tokens_seen": 64800864, + "step": 53245 + }, + { + "epoch": 5.930504510524557, + "grad_norm": 1.1978319883346558, + "learning_rate": 4.434491869931428e-05, + "loss": 0.15, + "num_input_tokens_seen": 64806720, + "step": 53250 + }, + { + "epoch": 5.9310613654081745, + "grad_norm": 0.44144392013549805, + "learning_rate": 4.434337952949875e-05, + "loss": 0.0134, + "num_input_tokens_seen": 64812928, + "step": 53255 + }, + { + "epoch": 5.931618220291792, + "grad_norm": 3.1460673809051514, + "learning_rate": 4.434184017696951e-05, + "loss": 0.1041, + "num_input_tokens_seen": 64818464, + "step": 53260 + }, + { + "epoch": 5.932175075175409, + "grad_norm": 0.11185954511165619, + "learning_rate": 4.434030064174108e-05, + "loss": 0.0146, + "num_input_tokens_seen": 64824512, + "step": 53265 + }, + { + "epoch": 5.932731930059027, + "grad_norm": 0.7819924354553223, + "learning_rate": 4.433876092382803e-05, + "loss": 0.0665, + "num_input_tokens_seen": 64830880, + "step": 53270 + }, + { + "epoch": 5.933288784942644, + "grad_norm": 0.9499984979629517, + "learning_rate": 4.4337221023244885e-05, + "loss": 0.054, + "num_input_tokens_seen": 64837152, + "step": 53275 + }, + { + "epoch": 5.933845639826261, + "grad_norm": 0.34264621138572693, + "learning_rate": 4.433568094000619e-05, + "loss": 0.0366, + "num_input_tokens_seen": 64843648, + "step": 53280 + }, + { + "epoch": 5.934402494709879, + "grad_norm": 0.005648025311529636, + "learning_rate": 4.43341406741265e-05, + "loss": 0.0175, + "num_input_tokens_seen": 64849856, + "step": 53285 + }, + { + "epoch": 5.934959349593496, + "grad_norm": 0.00028447111253626645, + "learning_rate": 4.433260022562036e-05, + "loss": 0.0866, + "num_input_tokens_seen": 64855904, + "step": 53290 + }, + { + "epoch": 5.935516204477113, + "grad_norm": 1.0067335367202759, + "learning_rate": 4.433105959450232e-05, + "loss": 0.073, + "num_input_tokens_seen": 64862304, + "step": 53295 + }, + { + "epoch": 5.936073059360731, + "grad_norm": 0.8883967995643616, + "learning_rate": 4.432951878078694e-05, + "loss": 0.2007, + "num_input_tokens_seen": 64867968, + "step": 53300 + }, + { + "epoch": 5.936629914244348, + "grad_norm": 0.021911613643169403, + "learning_rate": 4.432797778448876e-05, + "loss": 0.0078, + "num_input_tokens_seen": 64874368, + "step": 53305 + }, + { + "epoch": 5.937186769127965, + "grad_norm": 0.11862467229366302, + "learning_rate": 4.432643660562236e-05, + "loss": 0.1112, + "num_input_tokens_seen": 64880448, + "step": 53310 + }, + { + "epoch": 5.937743624011583, + "grad_norm": 0.0005000617820769548, + "learning_rate": 4.432489524420226e-05, + "loss": 0.1302, + "num_input_tokens_seen": 64886112, + "step": 53315 + }, + { + "epoch": 5.9383004788952, + "grad_norm": 0.001226339372806251, + "learning_rate": 4.432335370024306e-05, + "loss": 0.0283, + "num_input_tokens_seen": 64892320, + "step": 53320 + }, + { + "epoch": 5.9388573337788175, + "grad_norm": 0.052218805998563766, + "learning_rate": 4.43218119737593e-05, + "loss": 0.0171, + "num_input_tokens_seen": 64898336, + "step": 53325 + }, + { + "epoch": 5.939414188662434, + "grad_norm": 0.1344442516565323, + "learning_rate": 4.4320270064765536e-05, + "loss": 0.0449, + "num_input_tokens_seen": 64904416, + "step": 53330 + }, + { + "epoch": 5.939971043546052, + "grad_norm": 0.0869385153055191, + "learning_rate": 4.4318727973276344e-05, + "loss": 0.1546, + "num_input_tokens_seen": 64910528, + "step": 53335 + }, + { + "epoch": 5.94052789842967, + "grad_norm": 0.4779275059700012, + "learning_rate": 4.43171856993063e-05, + "loss": 0.0185, + "num_input_tokens_seen": 64916896, + "step": 53340 + }, + { + "epoch": 5.9410847533132864, + "grad_norm": 0.4076346158981323, + "learning_rate": 4.431564324286995e-05, + "loss": 0.1009, + "num_input_tokens_seen": 64922784, + "step": 53345 + }, + { + "epoch": 5.941641608196904, + "grad_norm": 1.105880618095398, + "learning_rate": 4.431410060398188e-05, + "loss": 0.0878, + "num_input_tokens_seen": 64928768, + "step": 53350 + }, + { + "epoch": 5.942198463080521, + "grad_norm": 0.2564460039138794, + "learning_rate": 4.431255778265665e-05, + "loss": 0.194, + "num_input_tokens_seen": 64935072, + "step": 53355 + }, + { + "epoch": 5.942755317964139, + "grad_norm": 0.1350819617509842, + "learning_rate": 4.431101477890884e-05, + "loss": 0.0731, + "num_input_tokens_seen": 64940544, + "step": 53360 + }, + { + "epoch": 5.943312172847756, + "grad_norm": 0.0019451340194791555, + "learning_rate": 4.430947159275303e-05, + "loss": 0.0844, + "num_input_tokens_seen": 64946624, + "step": 53365 + }, + { + "epoch": 5.943869027731373, + "grad_norm": 0.33127912878990173, + "learning_rate": 4.430792822420378e-05, + "loss": 0.108, + "num_input_tokens_seen": 64952032, + "step": 53370 + }, + { + "epoch": 5.944425882614991, + "grad_norm": 0.4311597943305969, + "learning_rate": 4.430638467327568e-05, + "loss": 0.0557, + "num_input_tokens_seen": 64958368, + "step": 53375 + }, + { + "epoch": 5.9449827374986075, + "grad_norm": 0.07348114997148514, + "learning_rate": 4.430484093998331e-05, + "loss": 0.1172, + "num_input_tokens_seen": 64964416, + "step": 53380 + }, + { + "epoch": 5.945539592382225, + "grad_norm": 0.0010332157835364342, + "learning_rate": 4.430329702434126e-05, + "loss": 0.079, + "num_input_tokens_seen": 64970272, + "step": 53385 + }, + { + "epoch": 5.946096447265843, + "grad_norm": 2.1473124027252197, + "learning_rate": 4.430175292636409e-05, + "loss": 0.1099, + "num_input_tokens_seen": 64976448, + "step": 53390 + }, + { + "epoch": 5.94665330214946, + "grad_norm": 0.027950137853622437, + "learning_rate": 4.430020864606641e-05, + "loss": 0.0225, + "num_input_tokens_seen": 64982752, + "step": 53395 + }, + { + "epoch": 5.947210157033077, + "grad_norm": 0.3693550229072571, + "learning_rate": 4.429866418346279e-05, + "loss": 0.0471, + "num_input_tokens_seen": 64988960, + "step": 53400 + }, + { + "epoch": 5.947767011916694, + "grad_norm": 0.1306765377521515, + "learning_rate": 4.429711953856783e-05, + "loss": 0.0674, + "num_input_tokens_seen": 64994848, + "step": 53405 + }, + { + "epoch": 5.948323866800312, + "grad_norm": 2.503516435623169, + "learning_rate": 4.429557471139612e-05, + "loss": 0.1078, + "num_input_tokens_seen": 65000864, + "step": 53410 + }, + { + "epoch": 5.9488807216839295, + "grad_norm": 1.66234290599823, + "learning_rate": 4.429402970196223e-05, + "loss": 0.1494, + "num_input_tokens_seen": 65006688, + "step": 53415 + }, + { + "epoch": 5.949437576567546, + "grad_norm": 1.1867921352386475, + "learning_rate": 4.429248451028078e-05, + "loss": 0.1052, + "num_input_tokens_seen": 65012640, + "step": 53420 + }, + { + "epoch": 5.949994431451164, + "grad_norm": 0.009887195192277431, + "learning_rate": 4.429093913636636e-05, + "loss": 0.0207, + "num_input_tokens_seen": 65018720, + "step": 53425 + }, + { + "epoch": 5.950551286334781, + "grad_norm": 1.1242475509643555, + "learning_rate": 4.4289393580233565e-05, + "loss": 0.1091, + "num_input_tokens_seen": 65024960, + "step": 53430 + }, + { + "epoch": 5.951108141218398, + "grad_norm": 0.22511020302772522, + "learning_rate": 4.428784784189699e-05, + "loss": 0.0485, + "num_input_tokens_seen": 65031296, + "step": 53435 + }, + { + "epoch": 5.951664996102016, + "grad_norm": 0.5737186670303345, + "learning_rate": 4.428630192137124e-05, + "loss": 0.024, + "num_input_tokens_seen": 65037536, + "step": 53440 + }, + { + "epoch": 5.952221850985633, + "grad_norm": 1.0231519937515259, + "learning_rate": 4.428475581867092e-05, + "loss": 0.1412, + "num_input_tokens_seen": 65043616, + "step": 53445 + }, + { + "epoch": 5.9527787058692505, + "grad_norm": 1.0826995372772217, + "learning_rate": 4.428320953381062e-05, + "loss": 0.1012, + "num_input_tokens_seen": 65049824, + "step": 53450 + }, + { + "epoch": 5.953335560752868, + "grad_norm": 0.4418751895427704, + "learning_rate": 4.4281663066804965e-05, + "loss": 0.0226, + "num_input_tokens_seen": 65055744, + "step": 53455 + }, + { + "epoch": 5.953892415636485, + "grad_norm": 0.004790933802723885, + "learning_rate": 4.428011641766856e-05, + "loss": 0.0472, + "num_input_tokens_seen": 65061760, + "step": 53460 + }, + { + "epoch": 5.954449270520103, + "grad_norm": 0.399011492729187, + "learning_rate": 4.4278569586415995e-05, + "loss": 0.12, + "num_input_tokens_seen": 65067456, + "step": 53465 + }, + { + "epoch": 5.955006125403719, + "grad_norm": 0.592397153377533, + "learning_rate": 4.4277022573061906e-05, + "loss": 0.0338, + "num_input_tokens_seen": 65073472, + "step": 53470 + }, + { + "epoch": 5.955562980287337, + "grad_norm": 0.22720904648303986, + "learning_rate": 4.427547537762089e-05, + "loss": 0.0749, + "num_input_tokens_seen": 65079552, + "step": 53475 + }, + { + "epoch": 5.956119835170955, + "grad_norm": 0.18345381319522858, + "learning_rate": 4.427392800010756e-05, + "loss": 0.0653, + "num_input_tokens_seen": 65085504, + "step": 53480 + }, + { + "epoch": 5.956676690054572, + "grad_norm": 0.735277533531189, + "learning_rate": 4.427238044053654e-05, + "loss": 0.0405, + "num_input_tokens_seen": 65091200, + "step": 53485 + }, + { + "epoch": 5.957233544938189, + "grad_norm": 1.6327515840530396, + "learning_rate": 4.427083269892246e-05, + "loss": 0.1487, + "num_input_tokens_seen": 65097248, + "step": 53490 + }, + { + "epoch": 5.957790399821807, + "grad_norm": 0.4975782334804535, + "learning_rate": 4.426928477527991e-05, + "loss": 0.0177, + "num_input_tokens_seen": 65103584, + "step": 53495 + }, + { + "epoch": 5.958347254705424, + "grad_norm": 0.5646389722824097, + "learning_rate": 4.4267736669623524e-05, + "loss": 0.0507, + "num_input_tokens_seen": 65109376, + "step": 53500 + }, + { + "epoch": 5.958904109589041, + "grad_norm": 0.30339476466178894, + "learning_rate": 4.426618838196794e-05, + "loss": 0.0478, + "num_input_tokens_seen": 65115456, + "step": 53505 + }, + { + "epoch": 5.959460964472658, + "grad_norm": 1.2097065448760986, + "learning_rate": 4.4264639912327766e-05, + "loss": 0.0177, + "num_input_tokens_seen": 65121856, + "step": 53510 + }, + { + "epoch": 5.960017819356276, + "grad_norm": 0.07473249733448029, + "learning_rate": 4.426309126071764e-05, + "loss": 0.0201, + "num_input_tokens_seen": 65127904, + "step": 53515 + }, + { + "epoch": 5.9605746742398935, + "grad_norm": 0.7697085738182068, + "learning_rate": 4.426154242715217e-05, + "loss": 0.123, + "num_input_tokens_seen": 65133984, + "step": 53520 + }, + { + "epoch": 5.96113152912351, + "grad_norm": 0.009230277501046658, + "learning_rate": 4.425999341164601e-05, + "loss": 0.0207, + "num_input_tokens_seen": 65140192, + "step": 53525 + }, + { + "epoch": 5.961688384007128, + "grad_norm": 0.07495982199907303, + "learning_rate": 4.425844421421378e-05, + "loss": 0.0643, + "num_input_tokens_seen": 65146400, + "step": 53530 + }, + { + "epoch": 5.962245238890745, + "grad_norm": 0.04320038855075836, + "learning_rate": 4.425689483487011e-05, + "loss": 0.0327, + "num_input_tokens_seen": 65152544, + "step": 53535 + }, + { + "epoch": 5.962802093774362, + "grad_norm": 1.0628286600112915, + "learning_rate": 4.425534527362964e-05, + "loss": 0.0727, + "num_input_tokens_seen": 65158432, + "step": 53540 + }, + { + "epoch": 5.96335894865798, + "grad_norm": 0.17052745819091797, + "learning_rate": 4.4253795530507014e-05, + "loss": 0.0762, + "num_input_tokens_seen": 65164704, + "step": 53545 + }, + { + "epoch": 5.963915803541597, + "grad_norm": 0.4768106937408447, + "learning_rate": 4.4252245605516863e-05, + "loss": 0.0288, + "num_input_tokens_seen": 65170880, + "step": 53550 + }, + { + "epoch": 5.964472658425215, + "grad_norm": 0.073350690305233, + "learning_rate": 4.425069549867382e-05, + "loss": 0.0131, + "num_input_tokens_seen": 65177344, + "step": 53555 + }, + { + "epoch": 5.965029513308831, + "grad_norm": 0.028443634510040283, + "learning_rate": 4.424914520999254e-05, + "loss": 0.1053, + "num_input_tokens_seen": 65183488, + "step": 53560 + }, + { + "epoch": 5.965586368192449, + "grad_norm": 0.06061045825481415, + "learning_rate": 4.4247594739487664e-05, + "loss": 0.0409, + "num_input_tokens_seen": 65189536, + "step": 53565 + }, + { + "epoch": 5.966143223076067, + "grad_norm": 0.21272628009319305, + "learning_rate": 4.424604408717383e-05, + "loss": 0.1302, + "num_input_tokens_seen": 65195904, + "step": 53570 + }, + { + "epoch": 5.9667000779596835, + "grad_norm": 0.32380443811416626, + "learning_rate": 4.424449325306569e-05, + "loss": 0.044, + "num_input_tokens_seen": 65201920, + "step": 53575 + }, + { + "epoch": 5.967256932843301, + "grad_norm": 1.0624150037765503, + "learning_rate": 4.424294223717789e-05, + "loss": 0.0709, + "num_input_tokens_seen": 65207968, + "step": 53580 + }, + { + "epoch": 5.967813787726918, + "grad_norm": 1.0063594579696655, + "learning_rate": 4.424139103952509e-05, + "loss": 0.0314, + "num_input_tokens_seen": 65214112, + "step": 53585 + }, + { + "epoch": 5.968370642610536, + "grad_norm": 0.0744466707110405, + "learning_rate": 4.423983966012193e-05, + "loss": 0.0703, + "num_input_tokens_seen": 65220096, + "step": 53590 + }, + { + "epoch": 5.968927497494153, + "grad_norm": 0.0035501555539667606, + "learning_rate": 4.423828809898307e-05, + "loss": 0.0383, + "num_input_tokens_seen": 65226432, + "step": 53595 + }, + { + "epoch": 5.96948435237777, + "grad_norm": 0.022369438782334328, + "learning_rate": 4.423673635612317e-05, + "loss": 0.0691, + "num_input_tokens_seen": 65232736, + "step": 53600 + }, + { + "epoch": 5.970041207261388, + "grad_norm": 0.3746347725391388, + "learning_rate": 4.423518443155688e-05, + "loss": 0.0988, + "num_input_tokens_seen": 65238592, + "step": 53605 + }, + { + "epoch": 5.9705980621450045, + "grad_norm": 0.9308717846870422, + "learning_rate": 4.423363232529887e-05, + "loss": 0.0405, + "num_input_tokens_seen": 65245024, + "step": 53610 + }, + { + "epoch": 5.971154917028622, + "grad_norm": 0.29408663511276245, + "learning_rate": 4.4232080037363773e-05, + "loss": 0.0747, + "num_input_tokens_seen": 65250976, + "step": 53615 + }, + { + "epoch": 5.97171177191224, + "grad_norm": 0.1484231948852539, + "learning_rate": 4.423052756776629e-05, + "loss": 0.0757, + "num_input_tokens_seen": 65257344, + "step": 53620 + }, + { + "epoch": 5.972268626795857, + "grad_norm": 0.4255244731903076, + "learning_rate": 4.422897491652106e-05, + "loss": 0.0143, + "num_input_tokens_seen": 65263552, + "step": 53625 + }, + { + "epoch": 5.972825481679474, + "grad_norm": 0.48034778237342834, + "learning_rate": 4.422742208364276e-05, + "loss": 0.2063, + "num_input_tokens_seen": 65269248, + "step": 53630 + }, + { + "epoch": 5.973382336563092, + "grad_norm": 0.018096154555678368, + "learning_rate": 4.422586906914605e-05, + "loss": 0.1403, + "num_input_tokens_seen": 65275264, + "step": 53635 + }, + { + "epoch": 5.973939191446709, + "grad_norm": 2.5948030948638916, + "learning_rate": 4.422431587304561e-05, + "loss": 0.1167, + "num_input_tokens_seen": 65281408, + "step": 53640 + }, + { + "epoch": 5.9744960463303265, + "grad_norm": 0.025019068270921707, + "learning_rate": 4.42227624953561e-05, + "loss": 0.0404, + "num_input_tokens_seen": 65287328, + "step": 53645 + }, + { + "epoch": 5.975052901213943, + "grad_norm": 0.57607501745224, + "learning_rate": 4.4221208936092185e-05, + "loss": 0.015, + "num_input_tokens_seen": 65293440, + "step": 53650 + }, + { + "epoch": 5.975609756097561, + "grad_norm": 0.42508450150489807, + "learning_rate": 4.421965519526856e-05, + "loss": 0.0215, + "num_input_tokens_seen": 65299648, + "step": 53655 + }, + { + "epoch": 5.976166610981179, + "grad_norm": 0.1961580216884613, + "learning_rate": 4.4218101272899906e-05, + "loss": 0.1256, + "num_input_tokens_seen": 65305664, + "step": 53660 + }, + { + "epoch": 5.976723465864795, + "grad_norm": 0.5204748511314392, + "learning_rate": 4.421654716900087e-05, + "loss": 0.0786, + "num_input_tokens_seen": 65310976, + "step": 53665 + }, + { + "epoch": 5.977280320748413, + "grad_norm": 0.713923454284668, + "learning_rate": 4.421499288358616e-05, + "loss": 0.0176, + "num_input_tokens_seen": 65317120, + "step": 53670 + }, + { + "epoch": 5.977837175632031, + "grad_norm": 1.5459593534469604, + "learning_rate": 4.4213438416670445e-05, + "loss": 0.0402, + "num_input_tokens_seen": 65323232, + "step": 53675 + }, + { + "epoch": 5.9783940305156476, + "grad_norm": 1.4340969324111938, + "learning_rate": 4.4211883768268413e-05, + "loss": 0.1018, + "num_input_tokens_seen": 65329728, + "step": 53680 + }, + { + "epoch": 5.978950885399265, + "grad_norm": 0.04387659579515457, + "learning_rate": 4.421032893839474e-05, + "loss": 0.0377, + "num_input_tokens_seen": 65335488, + "step": 53685 + }, + { + "epoch": 5.979507740282882, + "grad_norm": 0.83743816614151, + "learning_rate": 4.4208773927064126e-05, + "loss": 0.1566, + "num_input_tokens_seen": 65341280, + "step": 53690 + }, + { + "epoch": 5.9800645951665, + "grad_norm": 0.3913164436817169, + "learning_rate": 4.4207218734291244e-05, + "loss": 0.082, + "num_input_tokens_seen": 65347456, + "step": 53695 + }, + { + "epoch": 5.980621450050117, + "grad_norm": 0.0992853045463562, + "learning_rate": 4.42056633600908e-05, + "loss": 0.0255, + "num_input_tokens_seen": 65353536, + "step": 53700 + }, + { + "epoch": 5.981178304933734, + "grad_norm": 0.20531634986400604, + "learning_rate": 4.420410780447748e-05, + "loss": 0.0508, + "num_input_tokens_seen": 65359904, + "step": 53705 + }, + { + "epoch": 5.981735159817352, + "grad_norm": 0.3557063639163971, + "learning_rate": 4.4202552067465976e-05, + "loss": 0.0261, + "num_input_tokens_seen": 65366496, + "step": 53710 + }, + { + "epoch": 5.982292014700969, + "grad_norm": 1.021427869796753, + "learning_rate": 4.420099614907097e-05, + "loss": 0.1111, + "num_input_tokens_seen": 65372576, + "step": 53715 + }, + { + "epoch": 5.982848869584586, + "grad_norm": 0.006878682877868414, + "learning_rate": 4.419944004930718e-05, + "loss": 0.0353, + "num_input_tokens_seen": 65378816, + "step": 53720 + }, + { + "epoch": 5.983405724468204, + "grad_norm": 1.5610179901123047, + "learning_rate": 4.41978837681893e-05, + "loss": 0.0317, + "num_input_tokens_seen": 65385120, + "step": 53725 + }, + { + "epoch": 5.983962579351821, + "grad_norm": 0.7845550179481506, + "learning_rate": 4.419632730573202e-05, + "loss": 0.0546, + "num_input_tokens_seen": 65390912, + "step": 53730 + }, + { + "epoch": 5.984519434235438, + "grad_norm": 0.6700180768966675, + "learning_rate": 4.419477066195006e-05, + "loss": 0.0257, + "num_input_tokens_seen": 65396832, + "step": 53735 + }, + { + "epoch": 5.985076289119055, + "grad_norm": 0.6529561281204224, + "learning_rate": 4.419321383685811e-05, + "loss": 0.057, + "num_input_tokens_seen": 65402240, + "step": 53740 + }, + { + "epoch": 5.985633144002673, + "grad_norm": 0.14310461282730103, + "learning_rate": 4.419165683047086e-05, + "loss": 0.1772, + "num_input_tokens_seen": 65408576, + "step": 53745 + }, + { + "epoch": 5.9861899988862906, + "grad_norm": 0.007796938996762037, + "learning_rate": 4.419009964280305e-05, + "loss": 0.0506, + "num_input_tokens_seen": 65414624, + "step": 53750 + }, + { + "epoch": 5.986746853769907, + "grad_norm": 0.002368537476286292, + "learning_rate": 4.4188542273869374e-05, + "loss": 0.0062, + "num_input_tokens_seen": 65420480, + "step": 53755 + }, + { + "epoch": 5.987303708653525, + "grad_norm": 2.1735501289367676, + "learning_rate": 4.4186984723684545e-05, + "loss": 0.0964, + "num_input_tokens_seen": 65426368, + "step": 53760 + }, + { + "epoch": 5.987860563537142, + "grad_norm": 1.5644406080245972, + "learning_rate": 4.418542699226327e-05, + "loss": 0.049, + "num_input_tokens_seen": 65432512, + "step": 53765 + }, + { + "epoch": 5.9884174184207595, + "grad_norm": 0.02029215544462204, + "learning_rate": 4.418386907962026e-05, + "loss": 0.0815, + "num_input_tokens_seen": 65438016, + "step": 53770 + }, + { + "epoch": 5.988974273304377, + "grad_norm": 0.015686023980379105, + "learning_rate": 4.418231098577024e-05, + "loss": 0.02, + "num_input_tokens_seen": 65444352, + "step": 53775 + }, + { + "epoch": 5.989531128187994, + "grad_norm": 1.5411157608032227, + "learning_rate": 4.418075271072792e-05, + "loss": 0.0492, + "num_input_tokens_seen": 65450400, + "step": 53780 + }, + { + "epoch": 5.990087983071612, + "grad_norm": 1.4989558458328247, + "learning_rate": 4.417919425450804e-05, + "loss": 0.1257, + "num_input_tokens_seen": 65456224, + "step": 53785 + }, + { + "epoch": 5.990644837955228, + "grad_norm": 0.0353638157248497, + "learning_rate": 4.417763561712529e-05, + "loss": 0.0286, + "num_input_tokens_seen": 65462304, + "step": 53790 + }, + { + "epoch": 5.991201692838846, + "grad_norm": 0.00190907868091017, + "learning_rate": 4.417607679859441e-05, + "loss": 0.1008, + "num_input_tokens_seen": 65468224, + "step": 53795 + }, + { + "epoch": 5.991758547722464, + "grad_norm": 0.0004428283136803657, + "learning_rate": 4.417451779893012e-05, + "loss": 0.0152, + "num_input_tokens_seen": 65474624, + "step": 53800 + }, + { + "epoch": 5.9923154026060805, + "grad_norm": 0.8672805428504944, + "learning_rate": 4.4172958618147146e-05, + "loss": 0.0637, + "num_input_tokens_seen": 65480672, + "step": 53805 + }, + { + "epoch": 5.992872257489698, + "grad_norm": 0.2167329639196396, + "learning_rate": 4.4171399256260215e-05, + "loss": 0.0235, + "num_input_tokens_seen": 65486368, + "step": 53810 + }, + { + "epoch": 5.993429112373316, + "grad_norm": 0.9948461055755615, + "learning_rate": 4.4169839713284065e-05, + "loss": 0.1154, + "num_input_tokens_seen": 65492544, + "step": 53815 + }, + { + "epoch": 5.993985967256933, + "grad_norm": 0.8251876831054688, + "learning_rate": 4.416827998923342e-05, + "loss": 0.053, + "num_input_tokens_seen": 65498784, + "step": 53820 + }, + { + "epoch": 5.99454282214055, + "grad_norm": 0.6300324201583862, + "learning_rate": 4.4166720084123e-05, + "loss": 0.0826, + "num_input_tokens_seen": 65505120, + "step": 53825 + }, + { + "epoch": 5.995099677024167, + "grad_norm": 0.024169178679585457, + "learning_rate": 4.416515999796757e-05, + "loss": 0.0412, + "num_input_tokens_seen": 65511168, + "step": 53830 + }, + { + "epoch": 5.995656531907785, + "grad_norm": 0.009976807050406933, + "learning_rate": 4.4163599730781845e-05, + "loss": 0.0547, + "num_input_tokens_seen": 65517280, + "step": 53835 + }, + { + "epoch": 5.9962133867914025, + "grad_norm": 0.0011311530834063888, + "learning_rate": 4.416203928258056e-05, + "loss": 0.0151, + "num_input_tokens_seen": 65523264, + "step": 53840 + }, + { + "epoch": 5.996770241675019, + "grad_norm": 0.10206200927495956, + "learning_rate": 4.4160478653378466e-05, + "loss": 0.0555, + "num_input_tokens_seen": 65529024, + "step": 53845 + }, + { + "epoch": 5.997327096558637, + "grad_norm": 1.071640968322754, + "learning_rate": 4.415891784319029e-05, + "loss": 0.031, + "num_input_tokens_seen": 65534720, + "step": 53850 + }, + { + "epoch": 5.997883951442255, + "grad_norm": 1.5371081829071045, + "learning_rate": 4.4157356852030806e-05, + "loss": 0.079, + "num_input_tokens_seen": 65540896, + "step": 53855 + }, + { + "epoch": 5.998440806325871, + "grad_norm": 0.04704992100596428, + "learning_rate": 4.4155795679914724e-05, + "loss": 0.1771, + "num_input_tokens_seen": 65546848, + "step": 53860 + }, + { + "epoch": 5.998997661209489, + "grad_norm": 0.005539020989090204, + "learning_rate": 4.4154234326856805e-05, + "loss": 0.1387, + "num_input_tokens_seen": 65552768, + "step": 53865 + }, + { + "epoch": 5.999554516093106, + "grad_norm": 1.4804810285568237, + "learning_rate": 4.4152672792871795e-05, + "loss": 0.2529, + "num_input_tokens_seen": 65558208, + "step": 53870 + }, + { + "epoch": 6.0, + "eval_loss": 0.08121076971292496, + "eval_runtime": 112.4255, + "eval_samples_per_second": 35.499, + "eval_steps_per_second": 8.877, + "num_input_tokens_seen": 65562352, + "step": 53874 + }, + { + "epoch": 6.0001113709767235, + "grad_norm": 0.040426742285490036, + "learning_rate": 4.415111107797445e-05, + "loss": 0.0608, + "num_input_tokens_seen": 65563632, + "step": 53875 + }, + { + "epoch": 6.000668225860341, + "grad_norm": 0.032299499958753586, + "learning_rate": 4.4149549182179516e-05, + "loss": 0.1075, + "num_input_tokens_seen": 65569360, + "step": 53880 + }, + { + "epoch": 6.001225080743958, + "grad_norm": 0.009540836326777935, + "learning_rate": 4.414798710550175e-05, + "loss": 0.0293, + "num_input_tokens_seen": 65575472, + "step": 53885 + }, + { + "epoch": 6.001781935627576, + "grad_norm": 0.152180477976799, + "learning_rate": 4.41464248479559e-05, + "loss": 0.0098, + "num_input_tokens_seen": 65581200, + "step": 53890 + }, + { + "epoch": 6.0023387905111925, + "grad_norm": 0.43969210982322693, + "learning_rate": 4.414486240955673e-05, + "loss": 0.1057, + "num_input_tokens_seen": 65587472, + "step": 53895 + }, + { + "epoch": 6.00289564539481, + "grad_norm": 0.15147747099399567, + "learning_rate": 4.4143299790319e-05, + "loss": 0.0085, + "num_input_tokens_seen": 65593552, + "step": 53900 + }, + { + "epoch": 6.003452500278428, + "grad_norm": 0.24755915999412537, + "learning_rate": 4.414173699025747e-05, + "loss": 0.1323, + "num_input_tokens_seen": 65599536, + "step": 53905 + }, + { + "epoch": 6.004009355162045, + "grad_norm": 0.353122353553772, + "learning_rate": 4.4140174009386884e-05, + "loss": 0.0598, + "num_input_tokens_seen": 65605488, + "step": 53910 + }, + { + "epoch": 6.004566210045662, + "grad_norm": 0.07552359253168106, + "learning_rate": 4.4138610847722024e-05, + "loss": 0.0113, + "num_input_tokens_seen": 65611216, + "step": 53915 + }, + { + "epoch": 6.005123064929279, + "grad_norm": 0.11939352750778198, + "learning_rate": 4.413704750527765e-05, + "loss": 0.0287, + "num_input_tokens_seen": 65617456, + "step": 53920 + }, + { + "epoch": 6.005679919812897, + "grad_norm": 0.6053532361984253, + "learning_rate": 4.413548398206854e-05, + "loss": 0.0395, + "num_input_tokens_seen": 65623472, + "step": 53925 + }, + { + "epoch": 6.006236774696514, + "grad_norm": 1.674054503440857, + "learning_rate": 4.413392027810944e-05, + "loss": 0.1353, + "num_input_tokens_seen": 65629776, + "step": 53930 + }, + { + "epoch": 6.006793629580131, + "grad_norm": 0.07528097182512283, + "learning_rate": 4.4132356393415136e-05, + "loss": 0.0207, + "num_input_tokens_seen": 65635952, + "step": 53935 + }, + { + "epoch": 6.007350484463749, + "grad_norm": 2.2277824878692627, + "learning_rate": 4.41307923280004e-05, + "loss": 0.0397, + "num_input_tokens_seen": 65641968, + "step": 53940 + }, + { + "epoch": 6.007907339347366, + "grad_norm": 0.06235359236598015, + "learning_rate": 4.4129228081880006e-05, + "loss": 0.1084, + "num_input_tokens_seen": 65647312, + "step": 53945 + }, + { + "epoch": 6.008464194230983, + "grad_norm": 0.5817404985427856, + "learning_rate": 4.412766365506872e-05, + "loss": 0.026, + "num_input_tokens_seen": 65653552, + "step": 53950 + }, + { + "epoch": 6.009021049114601, + "grad_norm": 0.34070664644241333, + "learning_rate": 4.412609904758133e-05, + "loss": 0.02, + "num_input_tokens_seen": 65659664, + "step": 53955 + }, + { + "epoch": 6.009577903998218, + "grad_norm": 1.1588351726531982, + "learning_rate": 4.412453425943261e-05, + "loss": 0.083, + "num_input_tokens_seen": 65665552, + "step": 53960 + }, + { + "epoch": 6.0101347588818355, + "grad_norm": 0.18023832142353058, + "learning_rate": 4.412296929063734e-05, + "loss": 0.2184, + "num_input_tokens_seen": 65671344, + "step": 53965 + }, + { + "epoch": 6.010691613765453, + "grad_norm": 0.6978132724761963, + "learning_rate": 4.412140414121031e-05, + "loss": 0.0818, + "num_input_tokens_seen": 65677488, + "step": 53970 + }, + { + "epoch": 6.01124846864907, + "grad_norm": 0.004585896153002977, + "learning_rate": 4.4119838811166294e-05, + "loss": 0.0135, + "num_input_tokens_seen": 65683696, + "step": 53975 + }, + { + "epoch": 6.011805323532688, + "grad_norm": 0.3490786552429199, + "learning_rate": 4.411827330052008e-05, + "loss": 0.0856, + "num_input_tokens_seen": 65689488, + "step": 53980 + }, + { + "epoch": 6.012362178416304, + "grad_norm": 0.17443884909152985, + "learning_rate": 4.4116707609286455e-05, + "loss": 0.0976, + "num_input_tokens_seen": 65695024, + "step": 53985 + }, + { + "epoch": 6.012919033299922, + "grad_norm": 1.3588223457336426, + "learning_rate": 4.411514173748022e-05, + "loss": 0.1803, + "num_input_tokens_seen": 65701008, + "step": 53990 + }, + { + "epoch": 6.01347588818354, + "grad_norm": 0.009335853159427643, + "learning_rate": 4.411357568511615e-05, + "loss": 0.066, + "num_input_tokens_seen": 65707184, + "step": 53995 + }, + { + "epoch": 6.0140327430671565, + "grad_norm": 0.06913292407989502, + "learning_rate": 4.411200945220905e-05, + "loss": 0.0019, + "num_input_tokens_seen": 65712816, + "step": 54000 + }, + { + "epoch": 6.014589597950774, + "grad_norm": 0.40654510259628296, + "learning_rate": 4.4110443038773693e-05, + "loss": 0.096, + "num_input_tokens_seen": 65719024, + "step": 54005 + }, + { + "epoch": 6.015146452834391, + "grad_norm": 0.0819682776927948, + "learning_rate": 4.410887644482491e-05, + "loss": 0.0406, + "num_input_tokens_seen": 65725360, + "step": 54010 + }, + { + "epoch": 6.015703307718009, + "grad_norm": 0.5267168879508972, + "learning_rate": 4.410730967037747e-05, + "loss": 0.1025, + "num_input_tokens_seen": 65731472, + "step": 54015 + }, + { + "epoch": 6.016260162601626, + "grad_norm": 0.1393798142671585, + "learning_rate": 4.410574271544618e-05, + "loss": 0.036, + "num_input_tokens_seen": 65737392, + "step": 54020 + }, + { + "epoch": 6.016817017485243, + "grad_norm": 0.03580722585320473, + "learning_rate": 4.410417558004585e-05, + "loss": 0.0671, + "num_input_tokens_seen": 65743696, + "step": 54025 + }, + { + "epoch": 6.017373872368861, + "grad_norm": 0.15318052470684052, + "learning_rate": 4.4102608264191266e-05, + "loss": 0.1138, + "num_input_tokens_seen": 65749648, + "step": 54030 + }, + { + "epoch": 6.017930727252478, + "grad_norm": 0.11769168078899384, + "learning_rate": 4.410104076789725e-05, + "loss": 0.0184, + "num_input_tokens_seen": 65755824, + "step": 54035 + }, + { + "epoch": 6.018487582136095, + "grad_norm": 0.019029468297958374, + "learning_rate": 4.40994730911786e-05, + "loss": 0.0833, + "num_input_tokens_seen": 65761968, + "step": 54040 + }, + { + "epoch": 6.019044437019713, + "grad_norm": 2.4246349334716797, + "learning_rate": 4.409790523405012e-05, + "loss": 0.105, + "num_input_tokens_seen": 65768144, + "step": 54045 + }, + { + "epoch": 6.01960129190333, + "grad_norm": 0.7744999527931213, + "learning_rate": 4.409633719652662e-05, + "loss": 0.0863, + "num_input_tokens_seen": 65774096, + "step": 54050 + }, + { + "epoch": 6.020158146786947, + "grad_norm": 0.006215902976691723, + "learning_rate": 4.409476897862293e-05, + "loss": 0.0521, + "num_input_tokens_seen": 65779856, + "step": 54055 + }, + { + "epoch": 6.020715001670565, + "grad_norm": 0.3159044086933136, + "learning_rate": 4.409320058035383e-05, + "loss": 0.0966, + "num_input_tokens_seen": 65785488, + "step": 54060 + }, + { + "epoch": 6.021271856554182, + "grad_norm": 0.008895370177924633, + "learning_rate": 4.4091632001734165e-05, + "loss": 0.0305, + "num_input_tokens_seen": 65791984, + "step": 54065 + }, + { + "epoch": 6.0218287114377995, + "grad_norm": 0.0012848939513787627, + "learning_rate": 4.409006324277874e-05, + "loss": 0.0281, + "num_input_tokens_seen": 65798128, + "step": 54070 + }, + { + "epoch": 6.022385566321416, + "grad_norm": 0.18969979882240295, + "learning_rate": 4.408849430350237e-05, + "loss": 0.0639, + "num_input_tokens_seen": 65804432, + "step": 54075 + }, + { + "epoch": 6.022942421205034, + "grad_norm": 0.3289010226726532, + "learning_rate": 4.4086925183919884e-05, + "loss": 0.0382, + "num_input_tokens_seen": 65810672, + "step": 54080 + }, + { + "epoch": 6.023499276088652, + "grad_norm": 0.12890781462192535, + "learning_rate": 4.4085355884046085e-05, + "loss": 0.0333, + "num_input_tokens_seen": 65816400, + "step": 54085 + }, + { + "epoch": 6.024056130972268, + "grad_norm": 0.13673265278339386, + "learning_rate": 4.408378640389582e-05, + "loss": 0.0552, + "num_input_tokens_seen": 65822288, + "step": 54090 + }, + { + "epoch": 6.024612985855886, + "grad_norm": 0.05824728682637215, + "learning_rate": 4.4082216743483894e-05, + "loss": 0.0538, + "num_input_tokens_seen": 65828528, + "step": 54095 + }, + { + "epoch": 6.025169840739503, + "grad_norm": 0.4763050973415375, + "learning_rate": 4.408064690282515e-05, + "loss": 0.0459, + "num_input_tokens_seen": 65834864, + "step": 54100 + }, + { + "epoch": 6.025726695623121, + "grad_norm": 0.20145496726036072, + "learning_rate": 4.407907688193441e-05, + "loss": 0.0095, + "num_input_tokens_seen": 65841264, + "step": 54105 + }, + { + "epoch": 6.026283550506738, + "grad_norm": 0.05789875611662865, + "learning_rate": 4.40775066808265e-05, + "loss": 0.083, + "num_input_tokens_seen": 65847856, + "step": 54110 + }, + { + "epoch": 6.026840405390355, + "grad_norm": 0.002917017089203, + "learning_rate": 4.4075936299516245e-05, + "loss": 0.0807, + "num_input_tokens_seen": 65853936, + "step": 54115 + }, + { + "epoch": 6.027397260273973, + "grad_norm": 0.036829862743616104, + "learning_rate": 4.4074365738018496e-05, + "loss": 0.0208, + "num_input_tokens_seen": 65859984, + "step": 54120 + }, + { + "epoch": 6.0279541151575895, + "grad_norm": 0.9786571860313416, + "learning_rate": 4.4072794996348084e-05, + "loss": 0.0935, + "num_input_tokens_seen": 65865648, + "step": 54125 + }, + { + "epoch": 6.028510970041207, + "grad_norm": 0.1839831918478012, + "learning_rate": 4.4071224074519836e-05, + "loss": 0.0378, + "num_input_tokens_seen": 65871728, + "step": 54130 + }, + { + "epoch": 6.029067824924825, + "grad_norm": 1.1375093460083008, + "learning_rate": 4.4069652972548605e-05, + "loss": 0.0642, + "num_input_tokens_seen": 65877200, + "step": 54135 + }, + { + "epoch": 6.029624679808442, + "grad_norm": 0.007813585922122002, + "learning_rate": 4.406808169044922e-05, + "loss": 0.0153, + "num_input_tokens_seen": 65882832, + "step": 54140 + }, + { + "epoch": 6.030181534692059, + "grad_norm": 1.2393931150436401, + "learning_rate": 4.406651022823652e-05, + "loss": 0.0935, + "num_input_tokens_seen": 65889168, + "step": 54145 + }, + { + "epoch": 6.030738389575677, + "grad_norm": 0.20729485154151917, + "learning_rate": 4.406493858592536e-05, + "loss": 0.0473, + "num_input_tokens_seen": 65895088, + "step": 54150 + }, + { + "epoch": 6.031295244459294, + "grad_norm": 0.024009695276618004, + "learning_rate": 4.4063366763530585e-05, + "loss": 0.0028, + "num_input_tokens_seen": 65901360, + "step": 54155 + }, + { + "epoch": 6.031852099342911, + "grad_norm": 0.10763432830572128, + "learning_rate": 4.4061794761067034e-05, + "loss": 0.0773, + "num_input_tokens_seen": 65907728, + "step": 54160 + }, + { + "epoch": 6.032408954226528, + "grad_norm": 0.1164655089378357, + "learning_rate": 4.406022257854956e-05, + "loss": 0.0934, + "num_input_tokens_seen": 65914160, + "step": 54165 + }, + { + "epoch": 6.032965809110146, + "grad_norm": 0.0013234223006293178, + "learning_rate": 4.405865021599301e-05, + "loss": 0.0641, + "num_input_tokens_seen": 65919728, + "step": 54170 + }, + { + "epoch": 6.033522663993764, + "grad_norm": 0.294407457113266, + "learning_rate": 4.405707767341224e-05, + "loss": 0.0229, + "num_input_tokens_seen": 65926032, + "step": 54175 + }, + { + "epoch": 6.03407951887738, + "grad_norm": 0.002015679609030485, + "learning_rate": 4.40555049508221e-05, + "loss": 0.1212, + "num_input_tokens_seen": 65931760, + "step": 54180 + }, + { + "epoch": 6.034636373760998, + "grad_norm": 0.0007527742418460548, + "learning_rate": 4.405393204823746e-05, + "loss": 0.0151, + "num_input_tokens_seen": 65938032, + "step": 54185 + }, + { + "epoch": 6.035193228644615, + "grad_norm": 0.396950364112854, + "learning_rate": 4.4052358965673156e-05, + "loss": 0.0857, + "num_input_tokens_seen": 65944496, + "step": 54190 + }, + { + "epoch": 6.0357500835282325, + "grad_norm": 2.0299172401428223, + "learning_rate": 4.405078570314406e-05, + "loss": 0.1343, + "num_input_tokens_seen": 65950672, + "step": 54195 + }, + { + "epoch": 6.03630693841185, + "grad_norm": 0.2501583993434906, + "learning_rate": 4.404921226066503e-05, + "loss": 0.0517, + "num_input_tokens_seen": 65956784, + "step": 54200 + }, + { + "epoch": 6.036863793295467, + "grad_norm": 0.16749243438243866, + "learning_rate": 4.4047638638250926e-05, + "loss": 0.0259, + "num_input_tokens_seen": 65962736, + "step": 54205 + }, + { + "epoch": 6.037420648179085, + "grad_norm": 0.017935553565621376, + "learning_rate": 4.4046064835916615e-05, + "loss": 0.0053, + "num_input_tokens_seen": 65968880, + "step": 54210 + }, + { + "epoch": 6.037977503062701, + "grad_norm": 1.452034831047058, + "learning_rate": 4.4044490853676966e-05, + "loss": 0.0974, + "num_input_tokens_seen": 65975120, + "step": 54215 + }, + { + "epoch": 6.038534357946319, + "grad_norm": 0.06541603058576584, + "learning_rate": 4.4042916691546845e-05, + "loss": 0.0072, + "num_input_tokens_seen": 65981392, + "step": 54220 + }, + { + "epoch": 6.039091212829937, + "grad_norm": 0.0038178558461368084, + "learning_rate": 4.404134234954111e-05, + "loss": 0.0294, + "num_input_tokens_seen": 65987536, + "step": 54225 + }, + { + "epoch": 6.039648067713554, + "grad_norm": 0.01308795902878046, + "learning_rate": 4.403976782767464e-05, + "loss": 0.0014, + "num_input_tokens_seen": 65993968, + "step": 54230 + }, + { + "epoch": 6.040204922597171, + "grad_norm": 0.08959919959306717, + "learning_rate": 4.403819312596232e-05, + "loss": 0.0183, + "num_input_tokens_seen": 66000144, + "step": 54235 + }, + { + "epoch": 6.040761777480789, + "grad_norm": 0.7463956475257874, + "learning_rate": 4.4036618244419e-05, + "loss": 0.1017, + "num_input_tokens_seen": 66006352, + "step": 54240 + }, + { + "epoch": 6.041318632364406, + "grad_norm": 0.21048708260059357, + "learning_rate": 4.403504318305957e-05, + "loss": 0.0749, + "num_input_tokens_seen": 66012432, + "step": 54245 + }, + { + "epoch": 6.041875487248023, + "grad_norm": 0.07179755717515945, + "learning_rate": 4.403346794189892e-05, + "loss": 0.1772, + "num_input_tokens_seen": 66018288, + "step": 54250 + }, + { + "epoch": 6.04243234213164, + "grad_norm": 0.010631691664457321, + "learning_rate": 4.4031892520951906e-05, + "loss": 0.0392, + "num_input_tokens_seen": 66024624, + "step": 54255 + }, + { + "epoch": 6.042989197015258, + "grad_norm": 1.8408921957015991, + "learning_rate": 4.403031692023342e-05, + "loss": 0.1689, + "num_input_tokens_seen": 66030160, + "step": 54260 + }, + { + "epoch": 6.0435460518988755, + "grad_norm": 0.06798208504915237, + "learning_rate": 4.402874113975834e-05, + "loss": 0.0125, + "num_input_tokens_seen": 66036464, + "step": 54265 + }, + { + "epoch": 6.044102906782492, + "grad_norm": 0.09196379780769348, + "learning_rate": 4.4027165179541564e-05, + "loss": 0.0304, + "num_input_tokens_seen": 66042320, + "step": 54270 + }, + { + "epoch": 6.04465976166611, + "grad_norm": 0.3775033950805664, + "learning_rate": 4.402558903959796e-05, + "loss": 0.069, + "num_input_tokens_seen": 66048272, + "step": 54275 + }, + { + "epoch": 6.045216616549727, + "grad_norm": 0.2249356359243393, + "learning_rate": 4.402401271994243e-05, + "loss": 0.0368, + "num_input_tokens_seen": 66054448, + "step": 54280 + }, + { + "epoch": 6.045773471433344, + "grad_norm": 0.0009877192787826061, + "learning_rate": 4.4022436220589855e-05, + "loss": 0.0326, + "num_input_tokens_seen": 66060816, + "step": 54285 + }, + { + "epoch": 6.046330326316962, + "grad_norm": 0.4557206928730011, + "learning_rate": 4.4020859541555135e-05, + "loss": 0.0521, + "num_input_tokens_seen": 66067344, + "step": 54290 + }, + { + "epoch": 6.046887181200579, + "grad_norm": 0.0019464984070509672, + "learning_rate": 4.401928268285315e-05, + "loss": 0.0827, + "num_input_tokens_seen": 66073552, + "step": 54295 + }, + { + "epoch": 6.047444036084197, + "grad_norm": 0.7396776080131531, + "learning_rate": 4.401770564449881e-05, + "loss": 0.0875, + "num_input_tokens_seen": 66079728, + "step": 54300 + }, + { + "epoch": 6.048000890967813, + "grad_norm": 0.09205416589975357, + "learning_rate": 4.401612842650699e-05, + "loss": 0.0136, + "num_input_tokens_seen": 66085744, + "step": 54305 + }, + { + "epoch": 6.048557745851431, + "grad_norm": 1.1337761878967285, + "learning_rate": 4.4014551028892615e-05, + "loss": 0.1001, + "num_input_tokens_seen": 66091600, + "step": 54310 + }, + { + "epoch": 6.049114600735049, + "grad_norm": 1.2052161693572998, + "learning_rate": 4.401297345167056e-05, + "loss": 0.1883, + "num_input_tokens_seen": 66097936, + "step": 54315 + }, + { + "epoch": 6.0496714556186655, + "grad_norm": 0.014130774885416031, + "learning_rate": 4.401139569485575e-05, + "loss": 0.0235, + "num_input_tokens_seen": 66104432, + "step": 54320 + }, + { + "epoch": 6.050228310502283, + "grad_norm": 0.24471166729927063, + "learning_rate": 4.400981775846307e-05, + "loss": 0.2546, + "num_input_tokens_seen": 66110576, + "step": 54325 + }, + { + "epoch": 6.050785165385901, + "grad_norm": 0.3747207224369049, + "learning_rate": 4.400823964250743e-05, + "loss": 0.0271, + "num_input_tokens_seen": 66116720, + "step": 54330 + }, + { + "epoch": 6.051342020269518, + "grad_norm": 1.0116591453552246, + "learning_rate": 4.400666134700374e-05, + "loss": 0.0524, + "num_input_tokens_seen": 66122576, + "step": 54335 + }, + { + "epoch": 6.051898875153135, + "grad_norm": 0.08436072617769241, + "learning_rate": 4.4005082871966894e-05, + "loss": 0.0069, + "num_input_tokens_seen": 66128944, + "step": 54340 + }, + { + "epoch": 6.052455730036752, + "grad_norm": 0.054909173399209976, + "learning_rate": 4.400350421741183e-05, + "loss": 0.102, + "num_input_tokens_seen": 66134736, + "step": 54345 + }, + { + "epoch": 6.05301258492037, + "grad_norm": 0.2272258996963501, + "learning_rate": 4.4001925383353435e-05, + "loss": 0.0376, + "num_input_tokens_seen": 66141104, + "step": 54350 + }, + { + "epoch": 6.053569439803987, + "grad_norm": 0.008206913247704506, + "learning_rate": 4.4000346369806635e-05, + "loss": 0.1084, + "num_input_tokens_seen": 66146896, + "step": 54355 + }, + { + "epoch": 6.054126294687604, + "grad_norm": 0.024470418691635132, + "learning_rate": 4.399876717678634e-05, + "loss": 0.0448, + "num_input_tokens_seen": 66153072, + "step": 54360 + }, + { + "epoch": 6.054683149571222, + "grad_norm": 0.3054127097129822, + "learning_rate": 4.399718780430746e-05, + "loss": 0.0773, + "num_input_tokens_seen": 66159024, + "step": 54365 + }, + { + "epoch": 6.055240004454839, + "grad_norm": 0.5128718018531799, + "learning_rate": 4.399560825238492e-05, + "loss": 0.0538, + "num_input_tokens_seen": 66165264, + "step": 54370 + }, + { + "epoch": 6.055796859338456, + "grad_norm": 0.0012150034308433533, + "learning_rate": 4.399402852103365e-05, + "loss": 0.0483, + "num_input_tokens_seen": 66171536, + "step": 54375 + }, + { + "epoch": 6.056353714222074, + "grad_norm": 0.2705662250518799, + "learning_rate": 4.3992448610268564e-05, + "loss": 0.0559, + "num_input_tokens_seen": 66177520, + "step": 54380 + }, + { + "epoch": 6.056910569105691, + "grad_norm": 0.045867420732975006, + "learning_rate": 4.399086852010458e-05, + "loss": 0.0336, + "num_input_tokens_seen": 66183856, + "step": 54385 + }, + { + "epoch": 6.0574674239893085, + "grad_norm": 0.36357954144477844, + "learning_rate": 4.398928825055663e-05, + "loss": 0.0134, + "num_input_tokens_seen": 66190128, + "step": 54390 + }, + { + "epoch": 6.058024278872926, + "grad_norm": 1.061694860458374, + "learning_rate": 4.3987707801639637e-05, + "loss": 0.0612, + "num_input_tokens_seen": 66196624, + "step": 54395 + }, + { + "epoch": 6.058581133756543, + "grad_norm": 0.07191994041204453, + "learning_rate": 4.398612717336853e-05, + "loss": 0.0223, + "num_input_tokens_seen": 66202960, + "step": 54400 + }, + { + "epoch": 6.059137988640161, + "grad_norm": 0.22825933992862701, + "learning_rate": 4.398454636575824e-05, + "loss": 0.0105, + "num_input_tokens_seen": 66209104, + "step": 54405 + }, + { + "epoch": 6.059694843523777, + "grad_norm": 0.06019298732280731, + "learning_rate": 4.39829653788237e-05, + "loss": 0.0333, + "num_input_tokens_seen": 66214768, + "step": 54410 + }, + { + "epoch": 6.060251698407395, + "grad_norm": 0.0416177436709404, + "learning_rate": 4.398138421257985e-05, + "loss": 0.0558, + "num_input_tokens_seen": 66220784, + "step": 54415 + }, + { + "epoch": 6.060808553291013, + "grad_norm": 0.6863035559654236, + "learning_rate": 4.397980286704161e-05, + "loss": 0.0204, + "num_input_tokens_seen": 66226736, + "step": 54420 + }, + { + "epoch": 6.0613654081746295, + "grad_norm": 0.47104063630104065, + "learning_rate": 4.3978221342223926e-05, + "loss": 0.033, + "num_input_tokens_seen": 66232816, + "step": 54425 + }, + { + "epoch": 6.061922263058247, + "grad_norm": 2.214811325073242, + "learning_rate": 4.3976639638141736e-05, + "loss": 0.0565, + "num_input_tokens_seen": 66239216, + "step": 54430 + }, + { + "epoch": 6.062479117941864, + "grad_norm": 0.5633594989776611, + "learning_rate": 4.3975057754809986e-05, + "loss": 0.0901, + "num_input_tokens_seen": 66245328, + "step": 54435 + }, + { + "epoch": 6.063035972825482, + "grad_norm": 0.15385736525058746, + "learning_rate": 4.397347569224361e-05, + "loss": 0.028, + "num_input_tokens_seen": 66251440, + "step": 54440 + }, + { + "epoch": 6.063592827709099, + "grad_norm": 0.34692132472991943, + "learning_rate": 4.3971893450457555e-05, + "loss": 0.0964, + "num_input_tokens_seen": 66257584, + "step": 54445 + }, + { + "epoch": 6.064149682592716, + "grad_norm": 0.5742690563201904, + "learning_rate": 4.397031102946676e-05, + "loss": 0.0364, + "num_input_tokens_seen": 66263568, + "step": 54450 + }, + { + "epoch": 6.064706537476334, + "grad_norm": 1.424752950668335, + "learning_rate": 4.3968728429286186e-05, + "loss": 0.1489, + "num_input_tokens_seen": 66269360, + "step": 54455 + }, + { + "epoch": 6.065263392359951, + "grad_norm": 1.4648336172103882, + "learning_rate": 4.396714564993078e-05, + "loss": 0.1067, + "num_input_tokens_seen": 66275216, + "step": 54460 + }, + { + "epoch": 6.065820247243568, + "grad_norm": 0.8164190649986267, + "learning_rate": 4.396556269141547e-05, + "loss": 0.034, + "num_input_tokens_seen": 66280816, + "step": 54465 + }, + { + "epoch": 6.066377102127186, + "grad_norm": 1.20908784866333, + "learning_rate": 4.396397955375524e-05, + "loss": 0.2042, + "num_input_tokens_seen": 66286576, + "step": 54470 + }, + { + "epoch": 6.066933957010803, + "grad_norm": 0.9502331018447876, + "learning_rate": 4.396239623696503e-05, + "loss": 0.0893, + "num_input_tokens_seen": 66292848, + "step": 54475 + }, + { + "epoch": 6.06749081189442, + "grad_norm": 2.3916361331939697, + "learning_rate": 4.396081274105979e-05, + "loss": 0.1196, + "num_input_tokens_seen": 66298960, + "step": 54480 + }, + { + "epoch": 6.068047666778037, + "grad_norm": 0.18573345243930817, + "learning_rate": 4.3959229066054486e-05, + "loss": 0.0331, + "num_input_tokens_seen": 66305360, + "step": 54485 + }, + { + "epoch": 6.068604521661655, + "grad_norm": 0.9066591262817383, + "learning_rate": 4.395764521196406e-05, + "loss": 0.0835, + "num_input_tokens_seen": 66311568, + "step": 54490 + }, + { + "epoch": 6.0691613765452725, + "grad_norm": 0.5737116932868958, + "learning_rate": 4.3956061178803496e-05, + "loss": 0.146, + "num_input_tokens_seen": 66317584, + "step": 54495 + }, + { + "epoch": 6.069718231428889, + "grad_norm": 0.07693534344434738, + "learning_rate": 4.395447696658775e-05, + "loss": 0.1064, + "num_input_tokens_seen": 66323824, + "step": 54500 + }, + { + "epoch": 6.070275086312507, + "grad_norm": 0.859470784664154, + "learning_rate": 4.395289257533178e-05, + "loss": 0.0513, + "num_input_tokens_seen": 66330096, + "step": 54505 + }, + { + "epoch": 6.070831941196125, + "grad_norm": 0.000498181558214128, + "learning_rate": 4.395130800505056e-05, + "loss": 0.0535, + "num_input_tokens_seen": 66336560, + "step": 54510 + }, + { + "epoch": 6.0713887960797415, + "grad_norm": 0.7280912399291992, + "learning_rate": 4.3949723255759044e-05, + "loss": 0.0691, + "num_input_tokens_seen": 66342672, + "step": 54515 + }, + { + "epoch": 6.071945650963359, + "grad_norm": 0.13158704340457916, + "learning_rate": 4.394813832747222e-05, + "loss": 0.0757, + "num_input_tokens_seen": 66348592, + "step": 54520 + }, + { + "epoch": 6.072502505846976, + "grad_norm": 0.7711992859840393, + "learning_rate": 4.394655322020504e-05, + "loss": 0.0214, + "num_input_tokens_seen": 66355184, + "step": 54525 + }, + { + "epoch": 6.073059360730594, + "grad_norm": 1.482653021812439, + "learning_rate": 4.394496793397248e-05, + "loss": 0.1733, + "num_input_tokens_seen": 66361520, + "step": 54530 + }, + { + "epoch": 6.073616215614211, + "grad_norm": 0.646067202091217, + "learning_rate": 4.394338246878953e-05, + "loss": 0.1163, + "num_input_tokens_seen": 66367952, + "step": 54535 + }, + { + "epoch": 6.074173070497828, + "grad_norm": 0.03935972973704338, + "learning_rate": 4.394179682467116e-05, + "loss": 0.0176, + "num_input_tokens_seen": 66373936, + "step": 54540 + }, + { + "epoch": 6.074729925381446, + "grad_norm": 0.14039866626262665, + "learning_rate": 4.394021100163233e-05, + "loss": 0.0228, + "num_input_tokens_seen": 66379344, + "step": 54545 + }, + { + "epoch": 6.0752867802650625, + "grad_norm": 0.3939661383628845, + "learning_rate": 4.3938624999688036e-05, + "loss": 0.0485, + "num_input_tokens_seen": 66385520, + "step": 54550 + }, + { + "epoch": 6.07584363514868, + "grad_norm": 0.16766826808452606, + "learning_rate": 4.393703881885325e-05, + "loss": 0.1212, + "num_input_tokens_seen": 66391568, + "step": 54555 + }, + { + "epoch": 6.076400490032298, + "grad_norm": 1.185835361480713, + "learning_rate": 4.393545245914297e-05, + "loss": 0.0903, + "num_input_tokens_seen": 66397744, + "step": 54560 + }, + { + "epoch": 6.076957344915915, + "grad_norm": 0.05505778640508652, + "learning_rate": 4.393386592057217e-05, + "loss": 0.012, + "num_input_tokens_seen": 66404016, + "step": 54565 + }, + { + "epoch": 6.077514199799532, + "grad_norm": 0.11765125393867493, + "learning_rate": 4.393227920315583e-05, + "loss": 0.0069, + "num_input_tokens_seen": 66410320, + "step": 54570 + }, + { + "epoch": 6.07807105468315, + "grad_norm": 0.3983026444911957, + "learning_rate": 4.393069230690895e-05, + "loss": 0.0723, + "num_input_tokens_seen": 66416528, + "step": 54575 + }, + { + "epoch": 6.078627909566767, + "grad_norm": 0.012716825120151043, + "learning_rate": 4.392910523184652e-05, + "loss": 0.0297, + "num_input_tokens_seen": 66423024, + "step": 54580 + }, + { + "epoch": 6.0791847644503845, + "grad_norm": 1.1916249990463257, + "learning_rate": 4.392751797798351e-05, + "loss": 0.0278, + "num_input_tokens_seen": 66428848, + "step": 54585 + }, + { + "epoch": 6.079741619334001, + "grad_norm": 1.7287672758102417, + "learning_rate": 4.392593054533494e-05, + "loss": 0.1551, + "num_input_tokens_seen": 66434672, + "step": 54590 + }, + { + "epoch": 6.080298474217619, + "grad_norm": 0.8931295275688171, + "learning_rate": 4.392434293391579e-05, + "loss": 0.1167, + "num_input_tokens_seen": 66440976, + "step": 54595 + }, + { + "epoch": 6.080855329101237, + "grad_norm": 0.4398360550403595, + "learning_rate": 4.392275514374106e-05, + "loss": 0.0856, + "num_input_tokens_seen": 66447312, + "step": 54600 + }, + { + "epoch": 6.081412183984853, + "grad_norm": 0.16399824619293213, + "learning_rate": 4.392116717482574e-05, + "loss": 0.0038, + "num_input_tokens_seen": 66453424, + "step": 54605 + }, + { + "epoch": 6.081969038868471, + "grad_norm": 0.7143295407295227, + "learning_rate": 4.3919579027184846e-05, + "loss": 0.0375, + "num_input_tokens_seen": 66459120, + "step": 54610 + }, + { + "epoch": 6.082525893752088, + "grad_norm": 0.0003327345766592771, + "learning_rate": 4.391799070083337e-05, + "loss": 0.0294, + "num_input_tokens_seen": 66465552, + "step": 54615 + }, + { + "epoch": 6.0830827486357055, + "grad_norm": 2.90091609954834, + "learning_rate": 4.391640219578631e-05, + "loss": 0.1362, + "num_input_tokens_seen": 66471792, + "step": 54620 + }, + { + "epoch": 6.083639603519323, + "grad_norm": 0.22456593811511993, + "learning_rate": 4.3914813512058675e-05, + "loss": 0.0394, + "num_input_tokens_seen": 66477840, + "step": 54625 + }, + { + "epoch": 6.08419645840294, + "grad_norm": 0.283185750246048, + "learning_rate": 4.391322464966547e-05, + "loss": 0.0422, + "num_input_tokens_seen": 66484016, + "step": 54630 + }, + { + "epoch": 6.084753313286558, + "grad_norm": 0.011214624159038067, + "learning_rate": 4.391163560862172e-05, + "loss": 0.0891, + "num_input_tokens_seen": 66490160, + "step": 54635 + }, + { + "epoch": 6.0853101681701744, + "grad_norm": 0.005629456602036953, + "learning_rate": 4.39100463889424e-05, + "loss": 0.0936, + "num_input_tokens_seen": 66496304, + "step": 54640 + }, + { + "epoch": 6.085867023053792, + "grad_norm": 1.0394941568374634, + "learning_rate": 4.390845699064255e-05, + "loss": 0.0866, + "num_input_tokens_seen": 66502640, + "step": 54645 + }, + { + "epoch": 6.08642387793741, + "grad_norm": 0.004368246998637915, + "learning_rate": 4.3906867413737174e-05, + "loss": 0.0973, + "num_input_tokens_seen": 66508368, + "step": 54650 + }, + { + "epoch": 6.086980732821027, + "grad_norm": 0.13430581986904144, + "learning_rate": 4.3905277658241296e-05, + "loss": 0.1669, + "num_input_tokens_seen": 66514608, + "step": 54655 + }, + { + "epoch": 6.087537587704644, + "grad_norm": 0.00045407310244627297, + "learning_rate": 4.390368772416991e-05, + "loss": 0.0757, + "num_input_tokens_seen": 66520688, + "step": 54660 + }, + { + "epoch": 6.088094442588262, + "grad_norm": 0.4194932281970978, + "learning_rate": 4.3902097611538055e-05, + "loss": 0.0474, + "num_input_tokens_seen": 66526960, + "step": 54665 + }, + { + "epoch": 6.088651297471879, + "grad_norm": 0.0034574156161397696, + "learning_rate": 4.3900507320360746e-05, + "loss": 0.0802, + "num_input_tokens_seen": 66532848, + "step": 54670 + }, + { + "epoch": 6.089208152355496, + "grad_norm": 0.5993158221244812, + "learning_rate": 4.3898916850653e-05, + "loss": 0.123, + "num_input_tokens_seen": 66538384, + "step": 54675 + }, + { + "epoch": 6.089765007239113, + "grad_norm": 0.030454441905021667, + "learning_rate": 4.3897326202429844e-05, + "loss": 0.0314, + "num_input_tokens_seen": 66544304, + "step": 54680 + }, + { + "epoch": 6.090321862122731, + "grad_norm": 0.8742572069168091, + "learning_rate": 4.389573537570629e-05, + "loss": 0.1681, + "num_input_tokens_seen": 66550000, + "step": 54685 + }, + { + "epoch": 6.0908787170063485, + "grad_norm": 0.9998412728309631, + "learning_rate": 4.389414437049739e-05, + "loss": 0.0505, + "num_input_tokens_seen": 66556112, + "step": 54690 + }, + { + "epoch": 6.091435571889965, + "grad_norm": 0.008684387430548668, + "learning_rate": 4.3892553186818163e-05, + "loss": 0.0741, + "num_input_tokens_seen": 66562224, + "step": 54695 + }, + { + "epoch": 6.091992426773583, + "grad_norm": 0.20616520941257477, + "learning_rate": 4.389096182468363e-05, + "loss": 0.0103, + "num_input_tokens_seen": 66568336, + "step": 54700 + }, + { + "epoch": 6.0925492816572, + "grad_norm": 1.8096665143966675, + "learning_rate": 4.388937028410882e-05, + "loss": 0.0251, + "num_input_tokens_seen": 66574736, + "step": 54705 + }, + { + "epoch": 6.0931061365408175, + "grad_norm": 0.4157451391220093, + "learning_rate": 4.388777856510878e-05, + "loss": 0.0702, + "num_input_tokens_seen": 66581328, + "step": 54710 + }, + { + "epoch": 6.093662991424435, + "grad_norm": 0.00033037105458788574, + "learning_rate": 4.388618666769854e-05, + "loss": 0.089, + "num_input_tokens_seen": 66587568, + "step": 54715 + }, + { + "epoch": 6.094219846308052, + "grad_norm": 0.2687333822250366, + "learning_rate": 4.3884594591893125e-05, + "loss": 0.0713, + "num_input_tokens_seen": 66593424, + "step": 54720 + }, + { + "epoch": 6.09477670119167, + "grad_norm": 0.02601773291826248, + "learning_rate": 4.388300233770759e-05, + "loss": 0.0399, + "num_input_tokens_seen": 66599280, + "step": 54725 + }, + { + "epoch": 6.095333556075286, + "grad_norm": 0.962862491607666, + "learning_rate": 4.388140990515698e-05, + "loss": 0.0413, + "num_input_tokens_seen": 66605424, + "step": 54730 + }, + { + "epoch": 6.095890410958904, + "grad_norm": 1.2757567167282104, + "learning_rate": 4.387981729425631e-05, + "loss": 0.0785, + "num_input_tokens_seen": 66611376, + "step": 54735 + }, + { + "epoch": 6.096447265842522, + "grad_norm": 1.244632601737976, + "learning_rate": 4.387822450502065e-05, + "loss": 0.1465, + "num_input_tokens_seen": 66617168, + "step": 54740 + }, + { + "epoch": 6.0970041207261385, + "grad_norm": 0.9737571477890015, + "learning_rate": 4.387663153746503e-05, + "loss": 0.0834, + "num_input_tokens_seen": 66622960, + "step": 54745 + }, + { + "epoch": 6.097560975609756, + "grad_norm": 1.0898958444595337, + "learning_rate": 4.3875038391604494e-05, + "loss": 0.1694, + "num_input_tokens_seen": 66629136, + "step": 54750 + }, + { + "epoch": 6.098117830493374, + "grad_norm": 0.0717349499464035, + "learning_rate": 4.387344506745411e-05, + "loss": 0.0086, + "num_input_tokens_seen": 66635600, + "step": 54755 + }, + { + "epoch": 6.098674685376991, + "grad_norm": 1.6824278831481934, + "learning_rate": 4.387185156502891e-05, + "loss": 0.2274, + "num_input_tokens_seen": 66641840, + "step": 54760 + }, + { + "epoch": 6.099231540260608, + "grad_norm": 1.285698413848877, + "learning_rate": 4.387025788434396e-05, + "loss": 0.103, + "num_input_tokens_seen": 66648048, + "step": 54765 + }, + { + "epoch": 6.099788395144225, + "grad_norm": 0.5897122621536255, + "learning_rate": 4.38686640254143e-05, + "loss": 0.0792, + "num_input_tokens_seen": 66654032, + "step": 54770 + }, + { + "epoch": 6.100345250027843, + "grad_norm": 0.5040189027786255, + "learning_rate": 4.3867069988254984e-05, + "loss": 0.071, + "num_input_tokens_seen": 66660336, + "step": 54775 + }, + { + "epoch": 6.1009021049114605, + "grad_norm": 0.07023140043020248, + "learning_rate": 4.386547577288108e-05, + "loss": 0.032, + "num_input_tokens_seen": 66666352, + "step": 54780 + }, + { + "epoch": 6.101458959795077, + "grad_norm": 0.011944356374442577, + "learning_rate": 4.386388137930765e-05, + "loss": 0.1076, + "num_input_tokens_seen": 66672368, + "step": 54785 + }, + { + "epoch": 6.102015814678695, + "grad_norm": 1.4582297801971436, + "learning_rate": 4.386228680754974e-05, + "loss": 0.0536, + "num_input_tokens_seen": 66678288, + "step": 54790 + }, + { + "epoch": 6.102572669562312, + "grad_norm": 0.7386346459388733, + "learning_rate": 4.386069205762242e-05, + "loss": 0.0323, + "num_input_tokens_seen": 66683952, + "step": 54795 + }, + { + "epoch": 6.103129524445929, + "grad_norm": 0.0004170923784840852, + "learning_rate": 4.385909712954076e-05, + "loss": 0.0664, + "num_input_tokens_seen": 66690224, + "step": 54800 + }, + { + "epoch": 6.103686379329547, + "grad_norm": 2.00588321685791, + "learning_rate": 4.385750202331981e-05, + "loss": 0.1374, + "num_input_tokens_seen": 66696560, + "step": 54805 + }, + { + "epoch": 6.104243234213164, + "grad_norm": 1.6922640800476074, + "learning_rate": 4.385590673897465e-05, + "loss": 0.0728, + "num_input_tokens_seen": 66702320, + "step": 54810 + }, + { + "epoch": 6.1048000890967815, + "grad_norm": 0.054372940212488174, + "learning_rate": 4.385431127652033e-05, + "loss": 0.0267, + "num_input_tokens_seen": 66708400, + "step": 54815 + }, + { + "epoch": 6.105356943980398, + "grad_norm": 0.6407859921455383, + "learning_rate": 4.385271563597195e-05, + "loss": 0.1224, + "num_input_tokens_seen": 66714544, + "step": 54820 + }, + { + "epoch": 6.105913798864016, + "grad_norm": 1.3346312046051025, + "learning_rate": 4.385111981734457e-05, + "loss": 0.183, + "num_input_tokens_seen": 66720720, + "step": 54825 + }, + { + "epoch": 6.106470653747634, + "grad_norm": 1.2201734781265259, + "learning_rate": 4.384952382065324e-05, + "loss": 0.0745, + "num_input_tokens_seen": 66726992, + "step": 54830 + }, + { + "epoch": 6.10702750863125, + "grad_norm": 2.811824083328247, + "learning_rate": 4.384792764591307e-05, + "loss": 0.1254, + "num_input_tokens_seen": 66733296, + "step": 54835 + }, + { + "epoch": 6.107584363514868, + "grad_norm": 0.017016252502799034, + "learning_rate": 4.384633129313912e-05, + "loss": 0.0421, + "num_input_tokens_seen": 66739792, + "step": 54840 + }, + { + "epoch": 6.108141218398486, + "grad_norm": 0.6032174229621887, + "learning_rate": 4.384473476234647e-05, + "loss": 0.0666, + "num_input_tokens_seen": 66745680, + "step": 54845 + }, + { + "epoch": 6.108698073282103, + "grad_norm": 0.657368540763855, + "learning_rate": 4.384313805355021e-05, + "loss": 0.1213, + "num_input_tokens_seen": 66751664, + "step": 54850 + }, + { + "epoch": 6.10925492816572, + "grad_norm": 0.006448842119425535, + "learning_rate": 4.38415411667654e-05, + "loss": 0.0528, + "num_input_tokens_seen": 66757616, + "step": 54855 + }, + { + "epoch": 6.109811783049337, + "grad_norm": 0.009870910085737705, + "learning_rate": 4.383994410200715e-05, + "loss": 0.0193, + "num_input_tokens_seen": 66763952, + "step": 54860 + }, + { + "epoch": 6.110368637932955, + "grad_norm": 0.10052069276571274, + "learning_rate": 4.3838346859290526e-05, + "loss": 0.03, + "num_input_tokens_seen": 66770224, + "step": 54865 + }, + { + "epoch": 6.110925492816572, + "grad_norm": 0.061563264578580856, + "learning_rate": 4.383674943863062e-05, + "loss": 0.0165, + "num_input_tokens_seen": 66775952, + "step": 54870 + }, + { + "epoch": 6.111482347700189, + "grad_norm": 0.0002412142202956602, + "learning_rate": 4.383515184004253e-05, + "loss": 0.1425, + "num_input_tokens_seen": 66782192, + "step": 54875 + }, + { + "epoch": 6.112039202583807, + "grad_norm": 1.3703557252883911, + "learning_rate": 4.3833554063541336e-05, + "loss": 0.1695, + "num_input_tokens_seen": 66788624, + "step": 54880 + }, + { + "epoch": 6.112596057467424, + "grad_norm": 0.19688600301742554, + "learning_rate": 4.383195610914214e-05, + "loss": 0.0664, + "num_input_tokens_seen": 66794864, + "step": 54885 + }, + { + "epoch": 6.113152912351041, + "grad_norm": 0.5254037380218506, + "learning_rate": 4.3830357976860034e-05, + "loss": 0.0933, + "num_input_tokens_seen": 66801072, + "step": 54890 + }, + { + "epoch": 6.113709767234659, + "grad_norm": 0.07014599442481995, + "learning_rate": 4.3828759666710106e-05, + "loss": 0.0229, + "num_input_tokens_seen": 66806896, + "step": 54895 + }, + { + "epoch": 6.114266622118276, + "grad_norm": 0.18024085462093353, + "learning_rate": 4.382716117870745e-05, + "loss": 0.0239, + "num_input_tokens_seen": 66812944, + "step": 54900 + }, + { + "epoch": 6.114823477001893, + "grad_norm": 0.01144913025200367, + "learning_rate": 4.382556251286718e-05, + "loss": 0.0712, + "num_input_tokens_seen": 66819184, + "step": 54905 + }, + { + "epoch": 6.11538033188551, + "grad_norm": 0.8847227692604065, + "learning_rate": 4.3823963669204395e-05, + "loss": 0.105, + "num_input_tokens_seen": 66825616, + "step": 54910 + }, + { + "epoch": 6.115937186769128, + "grad_norm": 1.4017189741134644, + "learning_rate": 4.382236464773418e-05, + "loss": 0.1856, + "num_input_tokens_seen": 66832048, + "step": 54915 + }, + { + "epoch": 6.116494041652746, + "grad_norm": 0.1691678762435913, + "learning_rate": 4.382076544847166e-05, + "loss": 0.0868, + "num_input_tokens_seen": 66838320, + "step": 54920 + }, + { + "epoch": 6.117050896536362, + "grad_norm": 0.012739866971969604, + "learning_rate": 4.3819166071431924e-05, + "loss": 0.0542, + "num_input_tokens_seen": 66844400, + "step": 54925 + }, + { + "epoch": 6.11760775141998, + "grad_norm": 0.8440355658531189, + "learning_rate": 4.381756651663009e-05, + "loss": 0.0738, + "num_input_tokens_seen": 66850576, + "step": 54930 + }, + { + "epoch": 6.118164606303598, + "grad_norm": 0.0020367722027003765, + "learning_rate": 4.3815966784081264e-05, + "loss": 0.0543, + "num_input_tokens_seen": 66856784, + "step": 54935 + }, + { + "epoch": 6.1187214611872145, + "grad_norm": 0.20920011401176453, + "learning_rate": 4.381436687380056e-05, + "loss": 0.0691, + "num_input_tokens_seen": 66862704, + "step": 54940 + }, + { + "epoch": 6.119278316070832, + "grad_norm": 0.0017811681609600782, + "learning_rate": 4.3812766785803086e-05, + "loss": 0.0402, + "num_input_tokens_seen": 66868880, + "step": 54945 + }, + { + "epoch": 6.119835170954449, + "grad_norm": 0.21977712213993073, + "learning_rate": 4.381116652010395e-05, + "loss": 0.019, + "num_input_tokens_seen": 66874992, + "step": 54950 + }, + { + "epoch": 6.120392025838067, + "grad_norm": 0.9506456255912781, + "learning_rate": 4.3809566076718276e-05, + "loss": 0.0558, + "num_input_tokens_seen": 66881424, + "step": 54955 + }, + { + "epoch": 6.120948880721684, + "grad_norm": 0.06358352303504944, + "learning_rate": 4.3807965455661187e-05, + "loss": 0.0224, + "num_input_tokens_seen": 66887632, + "step": 54960 + }, + { + "epoch": 6.121505735605301, + "grad_norm": 0.07574258744716644, + "learning_rate": 4.380636465694779e-05, + "loss": 0.0444, + "num_input_tokens_seen": 66893648, + "step": 54965 + }, + { + "epoch": 6.122062590488919, + "grad_norm": 0.0009567233501002192, + "learning_rate": 4.380476368059322e-05, + "loss": 0.0271, + "num_input_tokens_seen": 66900112, + "step": 54970 + }, + { + "epoch": 6.1226194453725356, + "grad_norm": 1.4694374799728394, + "learning_rate": 4.3803162526612584e-05, + "loss": 0.1093, + "num_input_tokens_seen": 66906032, + "step": 54975 + }, + { + "epoch": 6.123176300256153, + "grad_norm": 0.17410129308700562, + "learning_rate": 4.380156119502101e-05, + "loss": 0.0045, + "num_input_tokens_seen": 66912336, + "step": 54980 + }, + { + "epoch": 6.123733155139771, + "grad_norm": 0.15952660143375397, + "learning_rate": 4.3799959685833635e-05, + "loss": 0.0466, + "num_input_tokens_seen": 66918096, + "step": 54985 + }, + { + "epoch": 6.124290010023388, + "grad_norm": 0.008875355124473572, + "learning_rate": 4.3798357999065576e-05, + "loss": 0.0089, + "num_input_tokens_seen": 66924336, + "step": 54990 + }, + { + "epoch": 6.124846864907005, + "grad_norm": 0.0010544630931690335, + "learning_rate": 4.379675613473196e-05, + "loss": 0.0191, + "num_input_tokens_seen": 66930512, + "step": 54995 + }, + { + "epoch": 6.125403719790622, + "grad_norm": 0.3359779715538025, + "learning_rate": 4.379515409284793e-05, + "loss": 0.03, + "num_input_tokens_seen": 66936688, + "step": 55000 + }, + { + "epoch": 6.12596057467424, + "grad_norm": 0.8769819140434265, + "learning_rate": 4.379355187342861e-05, + "loss": 0.0595, + "num_input_tokens_seen": 66942768, + "step": 55005 + }, + { + "epoch": 6.1265174295578575, + "grad_norm": 0.10912635177373886, + "learning_rate": 4.379194947648913e-05, + "loss": 0.0373, + "num_input_tokens_seen": 66949136, + "step": 55010 + }, + { + "epoch": 6.127074284441474, + "grad_norm": 1.579113245010376, + "learning_rate": 4.379034690204463e-05, + "loss": 0.0668, + "num_input_tokens_seen": 66955312, + "step": 55015 + }, + { + "epoch": 6.127631139325092, + "grad_norm": 0.7218320369720459, + "learning_rate": 4.3788744150110254e-05, + "loss": 0.0556, + "num_input_tokens_seen": 66960944, + "step": 55020 + }, + { + "epoch": 6.12818799420871, + "grad_norm": 0.2329818308353424, + "learning_rate": 4.3787141220701135e-05, + "loss": 0.0223, + "num_input_tokens_seen": 66966672, + "step": 55025 + }, + { + "epoch": 6.128744849092326, + "grad_norm": 0.09400872141122818, + "learning_rate": 4.378553811383241e-05, + "loss": 0.0713, + "num_input_tokens_seen": 66972944, + "step": 55030 + }, + { + "epoch": 6.129301703975944, + "grad_norm": 0.3476257920265198, + "learning_rate": 4.378393482951923e-05, + "loss": 0.1209, + "num_input_tokens_seen": 66979024, + "step": 55035 + }, + { + "epoch": 6.129858558859561, + "grad_norm": 0.020361758768558502, + "learning_rate": 4.3782331367776746e-05, + "loss": 0.0339, + "num_input_tokens_seen": 66984976, + "step": 55040 + }, + { + "epoch": 6.1304154137431786, + "grad_norm": 0.07976554334163666, + "learning_rate": 4.3780727728620085e-05, + "loss": 0.0104, + "num_input_tokens_seen": 66990960, + "step": 55045 + }, + { + "epoch": 6.130972268626796, + "grad_norm": 0.6579641103744507, + "learning_rate": 4.377912391206441e-05, + "loss": 0.0698, + "num_input_tokens_seen": 66997104, + "step": 55050 + }, + { + "epoch": 6.131529123510413, + "grad_norm": 0.4691678285598755, + "learning_rate": 4.3777519918124854e-05, + "loss": 0.0513, + "num_input_tokens_seen": 67003376, + "step": 55055 + }, + { + "epoch": 6.132085978394031, + "grad_norm": 0.07021673768758774, + "learning_rate": 4.3775915746816586e-05, + "loss": 0.0761, + "num_input_tokens_seen": 67009488, + "step": 55060 + }, + { + "epoch": 6.1326428332776475, + "grad_norm": 0.1807168871164322, + "learning_rate": 4.3774311398154744e-05, + "loss": 0.0077, + "num_input_tokens_seen": 67015568, + "step": 55065 + }, + { + "epoch": 6.133199688161265, + "grad_norm": 0.26424816250801086, + "learning_rate": 4.377270687215449e-05, + "loss": 0.0538, + "num_input_tokens_seen": 67021680, + "step": 55070 + }, + { + "epoch": 6.133756543044883, + "grad_norm": 0.03926029056310654, + "learning_rate": 4.377110216883099e-05, + "loss": 0.093, + "num_input_tokens_seen": 67027248, + "step": 55075 + }, + { + "epoch": 6.1343133979285, + "grad_norm": 0.009173283353447914, + "learning_rate": 4.376949728819938e-05, + "loss": 0.038, + "num_input_tokens_seen": 67033456, + "step": 55080 + }, + { + "epoch": 6.134870252812117, + "grad_norm": 0.1247507780790329, + "learning_rate": 4.3767892230274834e-05, + "loss": 0.0818, + "num_input_tokens_seen": 67039024, + "step": 55085 + }, + { + "epoch": 6.135427107695734, + "grad_norm": 0.11785496026277542, + "learning_rate": 4.376628699507251e-05, + "loss": 0.0071, + "num_input_tokens_seen": 67045296, + "step": 55090 + }, + { + "epoch": 6.135983962579352, + "grad_norm": 0.022894153371453285, + "learning_rate": 4.376468158260757e-05, + "loss": 0.0692, + "num_input_tokens_seen": 67051568, + "step": 55095 + }, + { + "epoch": 6.136540817462969, + "grad_norm": 1.702501654624939, + "learning_rate": 4.376307599289518e-05, + "loss": 0.0621, + "num_input_tokens_seen": 67057648, + "step": 55100 + }, + { + "epoch": 6.137097672346586, + "grad_norm": 0.4010167419910431, + "learning_rate": 4.376147022595049e-05, + "loss": 0.0389, + "num_input_tokens_seen": 67063920, + "step": 55105 + }, + { + "epoch": 6.137654527230204, + "grad_norm": 0.8389191031455994, + "learning_rate": 4.37598642817887e-05, + "loss": 0.0726, + "num_input_tokens_seen": 67070032, + "step": 55110 + }, + { + "epoch": 6.138211382113822, + "grad_norm": 0.1947595179080963, + "learning_rate": 4.375825816042496e-05, + "loss": 0.0069, + "num_input_tokens_seen": 67075984, + "step": 55115 + }, + { + "epoch": 6.138768236997438, + "grad_norm": 0.021121904253959656, + "learning_rate": 4.375665186187443e-05, + "loss": 0.0466, + "num_input_tokens_seen": 67082256, + "step": 55120 + }, + { + "epoch": 6.139325091881056, + "grad_norm": 0.4353356957435608, + "learning_rate": 4.3755045386152305e-05, + "loss": 0.0262, + "num_input_tokens_seen": 67088784, + "step": 55125 + }, + { + "epoch": 6.139881946764673, + "grad_norm": 0.26687824726104736, + "learning_rate": 4.375343873327376e-05, + "loss": 0.0205, + "num_input_tokens_seen": 67094992, + "step": 55130 + }, + { + "epoch": 6.1404388016482905, + "grad_norm": 0.690953254699707, + "learning_rate": 4.375183190325394e-05, + "loss": 0.0256, + "num_input_tokens_seen": 67101168, + "step": 55135 + }, + { + "epoch": 6.140995656531908, + "grad_norm": 0.36751100420951843, + "learning_rate": 4.375022489610806e-05, + "loss": 0.2085, + "num_input_tokens_seen": 67106832, + "step": 55140 + }, + { + "epoch": 6.141552511415525, + "grad_norm": 0.7728605270385742, + "learning_rate": 4.374861771185127e-05, + "loss": 0.0359, + "num_input_tokens_seen": 67113296, + "step": 55145 + }, + { + "epoch": 6.142109366299143, + "grad_norm": 0.13183294236660004, + "learning_rate": 4.374701035049877e-05, + "loss": 0.0142, + "num_input_tokens_seen": 67119472, + "step": 55150 + }, + { + "epoch": 6.142666221182759, + "grad_norm": 0.010015587322413921, + "learning_rate": 4.374540281206574e-05, + "loss": 0.0704, + "num_input_tokens_seen": 67125136, + "step": 55155 + }, + { + "epoch": 6.143223076066377, + "grad_norm": 0.007873550057411194, + "learning_rate": 4.3743795096567366e-05, + "loss": 0.0851, + "num_input_tokens_seen": 67131408, + "step": 55160 + }, + { + "epoch": 6.143779930949995, + "grad_norm": 0.23765520751476288, + "learning_rate": 4.374218720401882e-05, + "loss": 0.0493, + "num_input_tokens_seen": 67138064, + "step": 55165 + }, + { + "epoch": 6.1443367858336115, + "grad_norm": 1.2304446697235107, + "learning_rate": 4.374057913443531e-05, + "loss": 0.2018, + "num_input_tokens_seen": 67144304, + "step": 55170 + }, + { + "epoch": 6.144893640717229, + "grad_norm": 0.3298216462135315, + "learning_rate": 4.373897088783201e-05, + "loss": 0.0112, + "num_input_tokens_seen": 67150384, + "step": 55175 + }, + { + "epoch": 6.145450495600846, + "grad_norm": 0.11726997792720795, + "learning_rate": 4.373736246422412e-05, + "loss": 0.0953, + "num_input_tokens_seen": 67156656, + "step": 55180 + }, + { + "epoch": 6.146007350484464, + "grad_norm": 0.5435829162597656, + "learning_rate": 4.3735753863626825e-05, + "loss": 0.0812, + "num_input_tokens_seen": 67162896, + "step": 55185 + }, + { + "epoch": 6.146564205368081, + "grad_norm": 0.6671226024627686, + "learning_rate": 4.3734145086055324e-05, + "loss": 0.1096, + "num_input_tokens_seen": 67168624, + "step": 55190 + }, + { + "epoch": 6.147121060251698, + "grad_norm": 0.0047570266760885715, + "learning_rate": 4.3732536131524817e-05, + "loss": 0.1025, + "num_input_tokens_seen": 67174960, + "step": 55195 + }, + { + "epoch": 6.147677915135316, + "grad_norm": 0.003869502106681466, + "learning_rate": 4.3730927000050496e-05, + "loss": 0.0657, + "num_input_tokens_seen": 67181200, + "step": 55200 + }, + { + "epoch": 6.1482347700189335, + "grad_norm": 1.1282163858413696, + "learning_rate": 4.372931769164757e-05, + "loss": 0.099, + "num_input_tokens_seen": 67187312, + "step": 55205 + }, + { + "epoch": 6.14879162490255, + "grad_norm": 0.3537333607673645, + "learning_rate": 4.372770820633122e-05, + "loss": 0.0756, + "num_input_tokens_seen": 67193328, + "step": 55210 + }, + { + "epoch": 6.149348479786168, + "grad_norm": 0.11884200572967529, + "learning_rate": 4.372609854411666e-05, + "loss": 0.0594, + "num_input_tokens_seen": 67198832, + "step": 55215 + }, + { + "epoch": 6.149905334669785, + "grad_norm": 0.8003292679786682, + "learning_rate": 4.3724488705019104e-05, + "loss": 0.051, + "num_input_tokens_seen": 67205008, + "step": 55220 + }, + { + "epoch": 6.150462189553402, + "grad_norm": 0.006275114603340626, + "learning_rate": 4.372287868905375e-05, + "loss": 0.0488, + "num_input_tokens_seen": 67211184, + "step": 55225 + }, + { + "epoch": 6.15101904443702, + "grad_norm": 0.020101172849535942, + "learning_rate": 4.372126849623581e-05, + "loss": 0.0452, + "num_input_tokens_seen": 67217136, + "step": 55230 + }, + { + "epoch": 6.151575899320637, + "grad_norm": 1.4100315570831299, + "learning_rate": 4.371965812658048e-05, + "loss": 0.0895, + "num_input_tokens_seen": 67223536, + "step": 55235 + }, + { + "epoch": 6.1521327542042545, + "grad_norm": 0.5528529286384583, + "learning_rate": 4.371804758010298e-05, + "loss": 0.0964, + "num_input_tokens_seen": 67230192, + "step": 55240 + }, + { + "epoch": 6.152689609087871, + "grad_norm": 0.0960330218076706, + "learning_rate": 4.3716436856818535e-05, + "loss": 0.0324, + "num_input_tokens_seen": 67236400, + "step": 55245 + }, + { + "epoch": 6.153246463971489, + "grad_norm": 0.030284736305475235, + "learning_rate": 4.371482595674235e-05, + "loss": 0.0084, + "num_input_tokens_seen": 67242416, + "step": 55250 + }, + { + "epoch": 6.153803318855107, + "grad_norm": 1.2002651691436768, + "learning_rate": 4.371321487988963e-05, + "loss": 0.0843, + "num_input_tokens_seen": 67248624, + "step": 55255 + }, + { + "epoch": 6.1543601737387235, + "grad_norm": 0.013219235464930534, + "learning_rate": 4.371160362627561e-05, + "loss": 0.0272, + "num_input_tokens_seen": 67254736, + "step": 55260 + }, + { + "epoch": 6.154917028622341, + "grad_norm": 0.00979532953351736, + "learning_rate": 4.370999219591549e-05, + "loss": 0.0652, + "num_input_tokens_seen": 67261200, + "step": 55265 + }, + { + "epoch": 6.155473883505958, + "grad_norm": 0.4944058656692505, + "learning_rate": 4.3708380588824516e-05, + "loss": 0.0398, + "num_input_tokens_seen": 67267056, + "step": 55270 + }, + { + "epoch": 6.156030738389576, + "grad_norm": 0.2594932019710541, + "learning_rate": 4.3706768805017896e-05, + "loss": 0.0172, + "num_input_tokens_seen": 67273584, + "step": 55275 + }, + { + "epoch": 6.156587593273193, + "grad_norm": 0.029741864651441574, + "learning_rate": 4.370515684451085e-05, + "loss": 0.1097, + "num_input_tokens_seen": 67279696, + "step": 55280 + }, + { + "epoch": 6.15714444815681, + "grad_norm": 0.2553892135620117, + "learning_rate": 4.3703544707318616e-05, + "loss": 0.1763, + "num_input_tokens_seen": 67285456, + "step": 55285 + }, + { + "epoch": 6.157701303040428, + "grad_norm": 0.39261990785598755, + "learning_rate": 4.3701932393456416e-05, + "loss": 0.0787, + "num_input_tokens_seen": 67291024, + "step": 55290 + }, + { + "epoch": 6.158258157924045, + "grad_norm": 0.1217740848660469, + "learning_rate": 4.370031990293949e-05, + "loss": 0.0203, + "num_input_tokens_seen": 67297040, + "step": 55295 + }, + { + "epoch": 6.158815012807662, + "grad_norm": 0.019393762573599815, + "learning_rate": 4.369870723578305e-05, + "loss": 0.0175, + "num_input_tokens_seen": 67302928, + "step": 55300 + }, + { + "epoch": 6.15937186769128, + "grad_norm": 1.9809846878051758, + "learning_rate": 4.3697094392002344e-05, + "loss": 0.0381, + "num_input_tokens_seen": 67309456, + "step": 55305 + }, + { + "epoch": 6.159928722574897, + "grad_norm": 0.2209646850824356, + "learning_rate": 4.3695481371612595e-05, + "loss": 0.0314, + "num_input_tokens_seen": 67315440, + "step": 55310 + }, + { + "epoch": 6.160485577458514, + "grad_norm": 1.709427833557129, + "learning_rate": 4.369386817462905e-05, + "loss": 0.1175, + "num_input_tokens_seen": 67321584, + "step": 55315 + }, + { + "epoch": 6.161042432342132, + "grad_norm": 0.24538950622081757, + "learning_rate": 4.3692254801066945e-05, + "loss": 0.0465, + "num_input_tokens_seen": 67327408, + "step": 55320 + }, + { + "epoch": 6.161599287225749, + "grad_norm": 0.137171670794487, + "learning_rate": 4.369064125094152e-05, + "loss": 0.0473, + "num_input_tokens_seen": 67333552, + "step": 55325 + }, + { + "epoch": 6.1621561421093665, + "grad_norm": 0.5159033536911011, + "learning_rate": 4.3689027524268e-05, + "loss": 0.0974, + "num_input_tokens_seen": 67339312, + "step": 55330 + }, + { + "epoch": 6.162712996992983, + "grad_norm": 0.20318949222564697, + "learning_rate": 4.368741362106166e-05, + "loss": 0.0216, + "num_input_tokens_seen": 67345424, + "step": 55335 + }, + { + "epoch": 6.163269851876601, + "grad_norm": 0.05667758360505104, + "learning_rate": 4.368579954133771e-05, + "loss": 0.1209, + "num_input_tokens_seen": 67351632, + "step": 55340 + }, + { + "epoch": 6.163826706760219, + "grad_norm": 0.4246180057525635, + "learning_rate": 4.368418528511142e-05, + "loss": 0.0484, + "num_input_tokens_seen": 67358096, + "step": 55345 + }, + { + "epoch": 6.164383561643835, + "grad_norm": 0.1562812775373459, + "learning_rate": 4.368257085239803e-05, + "loss": 0.021, + "num_input_tokens_seen": 67364176, + "step": 55350 + }, + { + "epoch": 6.164940416527453, + "grad_norm": 0.08354179561138153, + "learning_rate": 4.368095624321279e-05, + "loss": 0.0524, + "num_input_tokens_seen": 67370544, + "step": 55355 + }, + { + "epoch": 6.16549727141107, + "grad_norm": 0.14256715774536133, + "learning_rate": 4.367934145757096e-05, + "loss": 0.0288, + "num_input_tokens_seen": 67376752, + "step": 55360 + }, + { + "epoch": 6.1660541262946875, + "grad_norm": 0.004074914380908012, + "learning_rate": 4.367772649548777e-05, + "loss": 0.023, + "num_input_tokens_seen": 67383216, + "step": 55365 + }, + { + "epoch": 6.166610981178305, + "grad_norm": 1.3315331935882568, + "learning_rate": 4.367611135697849e-05, + "loss": 0.1449, + "num_input_tokens_seen": 67389264, + "step": 55370 + }, + { + "epoch": 6.167167836061922, + "grad_norm": 1.2905049324035645, + "learning_rate": 4.367449604205838e-05, + "loss": 0.0381, + "num_input_tokens_seen": 67395440, + "step": 55375 + }, + { + "epoch": 6.16772469094554, + "grad_norm": 0.2933211326599121, + "learning_rate": 4.36728805507427e-05, + "loss": 0.012, + "num_input_tokens_seen": 67401840, + "step": 55380 + }, + { + "epoch": 6.168281545829157, + "grad_norm": 0.37226322293281555, + "learning_rate": 4.367126488304669e-05, + "loss": 0.1591, + "num_input_tokens_seen": 67408080, + "step": 55385 + }, + { + "epoch": 6.168838400712774, + "grad_norm": 0.7611067891120911, + "learning_rate": 4.366964903898563e-05, + "loss": 0.0706, + "num_input_tokens_seen": 67414032, + "step": 55390 + }, + { + "epoch": 6.169395255596392, + "grad_norm": 1.1636488437652588, + "learning_rate": 4.3668033018574775e-05, + "loss": 0.0854, + "num_input_tokens_seen": 67420272, + "step": 55395 + }, + { + "epoch": 6.169952110480009, + "grad_norm": 0.13804124295711517, + "learning_rate": 4.366641682182939e-05, + "loss": 0.2087, + "num_input_tokens_seen": 67426416, + "step": 55400 + }, + { + "epoch": 6.170508965363626, + "grad_norm": 1.2817825078964233, + "learning_rate": 4.366480044876475e-05, + "loss": 0.0781, + "num_input_tokens_seen": 67432496, + "step": 55405 + }, + { + "epoch": 6.171065820247244, + "grad_norm": 0.15005019307136536, + "learning_rate": 4.366318389939611e-05, + "loss": 0.02, + "num_input_tokens_seen": 67438928, + "step": 55410 + }, + { + "epoch": 6.171622675130861, + "grad_norm": 0.24902039766311646, + "learning_rate": 4.366156717373875e-05, + "loss": 0.0073, + "num_input_tokens_seen": 67445360, + "step": 55415 + }, + { + "epoch": 6.172179530014478, + "grad_norm": 0.059878263622522354, + "learning_rate": 4.3659950271807935e-05, + "loss": 0.0135, + "num_input_tokens_seen": 67451600, + "step": 55420 + }, + { + "epoch": 6.172736384898095, + "grad_norm": 0.050072185695171356, + "learning_rate": 4.365833319361893e-05, + "loss": 0.0181, + "num_input_tokens_seen": 67457264, + "step": 55425 + }, + { + "epoch": 6.173293239781713, + "grad_norm": 0.4076373279094696, + "learning_rate": 4.3656715939187034e-05, + "loss": 0.0702, + "num_input_tokens_seen": 67463280, + "step": 55430 + }, + { + "epoch": 6.1738500946653305, + "grad_norm": 0.40913981199264526, + "learning_rate": 4.36550985085275e-05, + "loss": 0.0091, + "num_input_tokens_seen": 67469616, + "step": 55435 + }, + { + "epoch": 6.174406949548947, + "grad_norm": 0.5365919470787048, + "learning_rate": 4.365348090165562e-05, + "loss": 0.1477, + "num_input_tokens_seen": 67475696, + "step": 55440 + }, + { + "epoch": 6.174963804432565, + "grad_norm": 0.04466438665986061, + "learning_rate": 4.365186311858666e-05, + "loss": 0.0721, + "num_input_tokens_seen": 67481744, + "step": 55445 + }, + { + "epoch": 6.175520659316183, + "grad_norm": 0.3424239456653595, + "learning_rate": 4.365024515933591e-05, + "loss": 0.0647, + "num_input_tokens_seen": 67487216, + "step": 55450 + }, + { + "epoch": 6.1760775141997994, + "grad_norm": 0.02286709100008011, + "learning_rate": 4.364862702391867e-05, + "loss": 0.0822, + "num_input_tokens_seen": 67493424, + "step": 55455 + }, + { + "epoch": 6.176634369083417, + "grad_norm": 0.000665948900859803, + "learning_rate": 4.364700871235018e-05, + "loss": 0.0763, + "num_input_tokens_seen": 67499760, + "step": 55460 + }, + { + "epoch": 6.177191223967034, + "grad_norm": 0.029606806114315987, + "learning_rate": 4.364539022464577e-05, + "loss": 0.0334, + "num_input_tokens_seen": 67505936, + "step": 55465 + }, + { + "epoch": 6.177748078850652, + "grad_norm": 0.3593533933162689, + "learning_rate": 4.36437715608207e-05, + "loss": 0.1325, + "num_input_tokens_seen": 67511888, + "step": 55470 + }, + { + "epoch": 6.178304933734269, + "grad_norm": 0.30743932723999023, + "learning_rate": 4.364215272089028e-05, + "loss": 0.1234, + "num_input_tokens_seen": 67518128, + "step": 55475 + }, + { + "epoch": 6.178861788617886, + "grad_norm": 2.3986799716949463, + "learning_rate": 4.364053370486979e-05, + "loss": 0.1093, + "num_input_tokens_seen": 67524432, + "step": 55480 + }, + { + "epoch": 6.179418643501504, + "grad_norm": 0.5916489362716675, + "learning_rate": 4.363891451277452e-05, + "loss": 0.0344, + "num_input_tokens_seen": 67530896, + "step": 55485 + }, + { + "epoch": 6.1799754983851205, + "grad_norm": 1.0469812154769897, + "learning_rate": 4.363729514461977e-05, + "loss": 0.0796, + "num_input_tokens_seen": 67536592, + "step": 55490 + }, + { + "epoch": 6.180532353268738, + "grad_norm": 0.27845168113708496, + "learning_rate": 4.363567560042085e-05, + "loss": 0.02, + "num_input_tokens_seen": 67542640, + "step": 55495 + }, + { + "epoch": 6.181089208152356, + "grad_norm": 0.08002622425556183, + "learning_rate": 4.3634055880193027e-05, + "loss": 0.0406, + "num_input_tokens_seen": 67548976, + "step": 55500 + }, + { + "epoch": 6.181646063035973, + "grad_norm": 0.9705420732498169, + "learning_rate": 4.363243598395162e-05, + "loss": 0.0735, + "num_input_tokens_seen": 67554832, + "step": 55505 + }, + { + "epoch": 6.18220291791959, + "grad_norm": 0.4439985454082489, + "learning_rate": 4.3630815911711926e-05, + "loss": 0.0364, + "num_input_tokens_seen": 67561136, + "step": 55510 + }, + { + "epoch": 6.182759772803207, + "grad_norm": 0.01428013201802969, + "learning_rate": 4.3629195663489255e-05, + "loss": 0.0272, + "num_input_tokens_seen": 67567536, + "step": 55515 + }, + { + "epoch": 6.183316627686825, + "grad_norm": 0.08782973140478134, + "learning_rate": 4.36275752392989e-05, + "loss": 0.0219, + "num_input_tokens_seen": 67573904, + "step": 55520 + }, + { + "epoch": 6.1838734825704424, + "grad_norm": 1.1444311141967773, + "learning_rate": 4.362595463915617e-05, + "loss": 0.0969, + "num_input_tokens_seen": 67579344, + "step": 55525 + }, + { + "epoch": 6.184430337454059, + "grad_norm": 1.3961095809936523, + "learning_rate": 4.362433386307638e-05, + "loss": 0.0661, + "num_input_tokens_seen": 67585392, + "step": 55530 + }, + { + "epoch": 6.184987192337677, + "grad_norm": 0.06274624168872833, + "learning_rate": 4.3622712911074836e-05, + "loss": 0.0162, + "num_input_tokens_seen": 67591600, + "step": 55535 + }, + { + "epoch": 6.185544047221294, + "grad_norm": 1.0061694383621216, + "learning_rate": 4.362109178316684e-05, + "loss": 0.0472, + "num_input_tokens_seen": 67597424, + "step": 55540 + }, + { + "epoch": 6.186100902104911, + "grad_norm": 1.6008864641189575, + "learning_rate": 4.361947047936772e-05, + "loss": 0.0929, + "num_input_tokens_seen": 67603472, + "step": 55545 + }, + { + "epoch": 6.186657756988529, + "grad_norm": 0.7944552302360535, + "learning_rate": 4.361784899969279e-05, + "loss": 0.1436, + "num_input_tokens_seen": 67609424, + "step": 55550 + }, + { + "epoch": 6.187214611872146, + "grad_norm": 0.2042067050933838, + "learning_rate": 4.361622734415735e-05, + "loss": 0.1425, + "num_input_tokens_seen": 67614704, + "step": 55555 + }, + { + "epoch": 6.1877714667557635, + "grad_norm": 0.22068703174591064, + "learning_rate": 4.361460551277673e-05, + "loss": 0.072, + "num_input_tokens_seen": 67620752, + "step": 55560 + }, + { + "epoch": 6.188328321639381, + "grad_norm": 0.027684615924954414, + "learning_rate": 4.361298350556625e-05, + "loss": 0.0673, + "num_input_tokens_seen": 67626704, + "step": 55565 + }, + { + "epoch": 6.188885176522998, + "grad_norm": 0.5820696353912354, + "learning_rate": 4.361136132254123e-05, + "loss": 0.0961, + "num_input_tokens_seen": 67632176, + "step": 55570 + }, + { + "epoch": 6.189442031406616, + "grad_norm": 0.12230586260557175, + "learning_rate": 4.360973896371698e-05, + "loss": 0.0749, + "num_input_tokens_seen": 67638224, + "step": 55575 + }, + { + "epoch": 6.189998886290232, + "grad_norm": 1.9421238899230957, + "learning_rate": 4.3608116429108847e-05, + "loss": 0.0716, + "num_input_tokens_seen": 67644368, + "step": 55580 + }, + { + "epoch": 6.19055574117385, + "grad_norm": 0.5576512813568115, + "learning_rate": 4.3606493718732146e-05, + "loss": 0.0414, + "num_input_tokens_seen": 67650256, + "step": 55585 + }, + { + "epoch": 6.191112596057468, + "grad_norm": 0.12124022096395493, + "learning_rate": 4.3604870832602194e-05, + "loss": 0.0489, + "num_input_tokens_seen": 67656336, + "step": 55590 + }, + { + "epoch": 6.191669450941085, + "grad_norm": 1.9099310636520386, + "learning_rate": 4.3603247770734345e-05, + "loss": 0.1248, + "num_input_tokens_seen": 67662512, + "step": 55595 + }, + { + "epoch": 6.192226305824702, + "grad_norm": 0.1555747389793396, + "learning_rate": 4.36016245331439e-05, + "loss": 0.0671, + "num_input_tokens_seen": 67668816, + "step": 55600 + }, + { + "epoch": 6.192783160708319, + "grad_norm": 0.5160337090492249, + "learning_rate": 4.360000111984622e-05, + "loss": 0.0556, + "num_input_tokens_seen": 67674128, + "step": 55605 + }, + { + "epoch": 6.193340015591937, + "grad_norm": 0.124751515686512, + "learning_rate": 4.3598377530856625e-05, + "loss": 0.112, + "num_input_tokens_seen": 67680304, + "step": 55610 + }, + { + "epoch": 6.193896870475554, + "grad_norm": 0.14154428243637085, + "learning_rate": 4.3596753766190456e-05, + "loss": 0.0091, + "num_input_tokens_seen": 67686512, + "step": 55615 + }, + { + "epoch": 6.194453725359171, + "grad_norm": 0.5825770497322083, + "learning_rate": 4.3595129825863044e-05, + "loss": 0.1048, + "num_input_tokens_seen": 67692112, + "step": 55620 + }, + { + "epoch": 6.195010580242789, + "grad_norm": 0.18771786987781525, + "learning_rate": 4.359350570988973e-05, + "loss": 0.0704, + "num_input_tokens_seen": 67698416, + "step": 55625 + }, + { + "epoch": 6.1955674351264065, + "grad_norm": 0.9955546855926514, + "learning_rate": 4.359188141828586e-05, + "loss": 0.0608, + "num_input_tokens_seen": 67704112, + "step": 55630 + }, + { + "epoch": 6.196124290010023, + "grad_norm": 0.02307816408574581, + "learning_rate": 4.3590256951066775e-05, + "loss": 0.0229, + "num_input_tokens_seen": 67709872, + "step": 55635 + }, + { + "epoch": 6.196681144893641, + "grad_norm": 2.2926535606384277, + "learning_rate": 4.3588632308247824e-05, + "loss": 0.0846, + "num_input_tokens_seen": 67715856, + "step": 55640 + }, + { + "epoch": 6.197237999777258, + "grad_norm": 0.12054400146007538, + "learning_rate": 4.3587007489844344e-05, + "loss": 0.0135, + "num_input_tokens_seen": 67721488, + "step": 55645 + }, + { + "epoch": 6.197794854660875, + "grad_norm": 0.12253779917955399, + "learning_rate": 4.358538249587168e-05, + "loss": 0.0577, + "num_input_tokens_seen": 67727344, + "step": 55650 + }, + { + "epoch": 6.198351709544493, + "grad_norm": 0.006756051443517208, + "learning_rate": 4.3583757326345196e-05, + "loss": 0.0465, + "num_input_tokens_seen": 67733328, + "step": 55655 + }, + { + "epoch": 6.19890856442811, + "grad_norm": 0.03128676116466522, + "learning_rate": 4.358213198128024e-05, + "loss": 0.0634, + "num_input_tokens_seen": 67739280, + "step": 55660 + }, + { + "epoch": 6.199465419311728, + "grad_norm": 2.7762677669525146, + "learning_rate": 4.358050646069215e-05, + "loss": 0.2313, + "num_input_tokens_seen": 67745200, + "step": 55665 + }, + { + "epoch": 6.200022274195344, + "grad_norm": 0.004021527245640755, + "learning_rate": 4.3578880764596295e-05, + "loss": 0.0271, + "num_input_tokens_seen": 67751024, + "step": 55670 + }, + { + "epoch": 6.200579129078962, + "grad_norm": 0.3169063329696655, + "learning_rate": 4.357725489300802e-05, + "loss": 0.0068, + "num_input_tokens_seen": 67757168, + "step": 55675 + }, + { + "epoch": 6.20113598396258, + "grad_norm": 0.0014560704585164785, + "learning_rate": 4.357562884594269e-05, + "loss": 0.0521, + "num_input_tokens_seen": 67763344, + "step": 55680 + }, + { + "epoch": 6.2016928388461965, + "grad_norm": 0.0026055823545902967, + "learning_rate": 4.3574002623415665e-05, + "loss": 0.0338, + "num_input_tokens_seen": 67769488, + "step": 55685 + }, + { + "epoch": 6.202249693729814, + "grad_norm": 2.3685479164123535, + "learning_rate": 4.35723762254423e-05, + "loss": 0.0423, + "num_input_tokens_seen": 67775568, + "step": 55690 + }, + { + "epoch": 6.202806548613431, + "grad_norm": 0.1570727825164795, + "learning_rate": 4.357074965203797e-05, + "loss": 0.0799, + "num_input_tokens_seen": 67780816, + "step": 55695 + }, + { + "epoch": 6.203363403497049, + "grad_norm": 0.33424413204193115, + "learning_rate": 4.356912290321803e-05, + "loss": 0.0743, + "num_input_tokens_seen": 67786960, + "step": 55700 + }, + { + "epoch": 6.203920258380666, + "grad_norm": 0.2023010551929474, + "learning_rate": 4.356749597899784e-05, + "loss": 0.0101, + "num_input_tokens_seen": 67793200, + "step": 55705 + }, + { + "epoch": 6.204477113264283, + "grad_norm": 0.014111220836639404, + "learning_rate": 4.356586887939278e-05, + "loss": 0.182, + "num_input_tokens_seen": 67799152, + "step": 55710 + }, + { + "epoch": 6.205033968147901, + "grad_norm": 0.013337317854166031, + "learning_rate": 4.356424160441821e-05, + "loss": 0.0805, + "num_input_tokens_seen": 67805360, + "step": 55715 + }, + { + "epoch": 6.2055908230315175, + "grad_norm": 1.1814908981323242, + "learning_rate": 4.356261415408951e-05, + "loss": 0.1061, + "num_input_tokens_seen": 67811664, + "step": 55720 + }, + { + "epoch": 6.206147677915135, + "grad_norm": 0.2125266045331955, + "learning_rate": 4.3560986528422046e-05, + "loss": 0.0127, + "num_input_tokens_seen": 67817968, + "step": 55725 + }, + { + "epoch": 6.206704532798753, + "grad_norm": 0.2670494616031647, + "learning_rate": 4.355935872743119e-05, + "loss": 0.0397, + "num_input_tokens_seen": 67823920, + "step": 55730 + }, + { + "epoch": 6.20726138768237, + "grad_norm": 0.057185471057891846, + "learning_rate": 4.355773075113232e-05, + "loss": 0.0067, + "num_input_tokens_seen": 67830320, + "step": 55735 + }, + { + "epoch": 6.207818242565987, + "grad_norm": 0.2375413030385971, + "learning_rate": 4.3556102599540816e-05, + "loss": 0.1114, + "num_input_tokens_seen": 67836624, + "step": 55740 + }, + { + "epoch": 6.208375097449605, + "grad_norm": 0.21752388775348663, + "learning_rate": 4.3554474272672056e-05, + "loss": 0.1055, + "num_input_tokens_seen": 67842672, + "step": 55745 + }, + { + "epoch": 6.208931952333222, + "grad_norm": 0.005625995807349682, + "learning_rate": 4.3552845770541424e-05, + "loss": 0.0577, + "num_input_tokens_seen": 67848688, + "step": 55750 + }, + { + "epoch": 6.2094888072168395, + "grad_norm": 0.89529949426651, + "learning_rate": 4.35512170931643e-05, + "loss": 0.1117, + "num_input_tokens_seen": 67854608, + "step": 55755 + }, + { + "epoch": 6.210045662100456, + "grad_norm": 0.3326527178287506, + "learning_rate": 4.3549588240556064e-05, + "loss": 0.0141, + "num_input_tokens_seen": 67860848, + "step": 55760 + }, + { + "epoch": 6.210602516984074, + "grad_norm": 0.001757077407091856, + "learning_rate": 4.3547959212732106e-05, + "loss": 0.0389, + "num_input_tokens_seen": 67866768, + "step": 55765 + }, + { + "epoch": 6.211159371867692, + "grad_norm": 0.037616066634655, + "learning_rate": 4.354633000970781e-05, + "loss": 0.0398, + "num_input_tokens_seen": 67872560, + "step": 55770 + }, + { + "epoch": 6.211716226751308, + "grad_norm": 0.05986449122428894, + "learning_rate": 4.3544700631498566e-05, + "loss": 0.0639, + "num_input_tokens_seen": 67878704, + "step": 55775 + }, + { + "epoch": 6.212273081634926, + "grad_norm": 0.09147357195615768, + "learning_rate": 4.354307107811978e-05, + "loss": 0.0212, + "num_input_tokens_seen": 67884912, + "step": 55780 + }, + { + "epoch": 6.212829936518543, + "grad_norm": 0.7075831890106201, + "learning_rate": 4.354144134958682e-05, + "loss": 0.1333, + "num_input_tokens_seen": 67891280, + "step": 55785 + }, + { + "epoch": 6.2133867914021605, + "grad_norm": 0.05720232054591179, + "learning_rate": 4.35398114459151e-05, + "loss": 0.0158, + "num_input_tokens_seen": 67897456, + "step": 55790 + }, + { + "epoch": 6.213943646285778, + "grad_norm": 0.36929237842559814, + "learning_rate": 4.353818136712e-05, + "loss": 0.0691, + "num_input_tokens_seen": 67903600, + "step": 55795 + }, + { + "epoch": 6.214500501169395, + "grad_norm": 0.9369739294052124, + "learning_rate": 4.353655111321692e-05, + "loss": 0.0197, + "num_input_tokens_seen": 67909680, + "step": 55800 + }, + { + "epoch": 6.215057356053013, + "grad_norm": 0.0009284956031478941, + "learning_rate": 4.353492068422127e-05, + "loss": 0.1161, + "num_input_tokens_seen": 67915504, + "step": 55805 + }, + { + "epoch": 6.21561421093663, + "grad_norm": 0.35481902956962585, + "learning_rate": 4.353329008014845e-05, + "loss": 0.0641, + "num_input_tokens_seen": 67921808, + "step": 55810 + }, + { + "epoch": 6.216171065820247, + "grad_norm": 0.24661526083946228, + "learning_rate": 4.353165930101385e-05, + "loss": 0.0224, + "num_input_tokens_seen": 67927696, + "step": 55815 + }, + { + "epoch": 6.216727920703865, + "grad_norm": 0.013258015736937523, + "learning_rate": 4.353002834683288e-05, + "loss": 0.0029, + "num_input_tokens_seen": 67934224, + "step": 55820 + }, + { + "epoch": 6.217284775587482, + "grad_norm": 0.0011048819869756699, + "learning_rate": 4.3528397217620945e-05, + "loss": 0.0409, + "num_input_tokens_seen": 67940560, + "step": 55825 + }, + { + "epoch": 6.217841630471099, + "grad_norm": 0.06788621097803116, + "learning_rate": 4.3526765913393454e-05, + "loss": 0.0291, + "num_input_tokens_seen": 67946864, + "step": 55830 + }, + { + "epoch": 6.218398485354717, + "grad_norm": 1.249030351638794, + "learning_rate": 4.352513443416581e-05, + "loss": 0.0684, + "num_input_tokens_seen": 67953072, + "step": 55835 + }, + { + "epoch": 6.218955340238334, + "grad_norm": 0.10158727318048477, + "learning_rate": 4.352350277995344e-05, + "loss": 0.0474, + "num_input_tokens_seen": 67959056, + "step": 55840 + }, + { + "epoch": 6.219512195121951, + "grad_norm": 1.032271146774292, + "learning_rate": 4.352187095077175e-05, + "loss": 0.0246, + "num_input_tokens_seen": 67965424, + "step": 55845 + }, + { + "epoch": 6.220069050005568, + "grad_norm": 0.09236866980791092, + "learning_rate": 4.3520238946636135e-05, + "loss": 0.0107, + "num_input_tokens_seen": 67971792, + "step": 55850 + }, + { + "epoch": 6.220625904889186, + "grad_norm": 0.03973972052335739, + "learning_rate": 4.3518606767562036e-05, + "loss": 0.1043, + "num_input_tokens_seen": 67977232, + "step": 55855 + }, + { + "epoch": 6.2211827597728035, + "grad_norm": 0.25662878155708313, + "learning_rate": 4.351697441356485e-05, + "loss": 0.0653, + "num_input_tokens_seen": 67982384, + "step": 55860 + }, + { + "epoch": 6.22173961465642, + "grad_norm": 0.22790579497814178, + "learning_rate": 4.351534188466001e-05, + "loss": 0.0159, + "num_input_tokens_seen": 67987792, + "step": 55865 + }, + { + "epoch": 6.222296469540038, + "grad_norm": 1.2373864650726318, + "learning_rate": 4.351370918086294e-05, + "loss": 0.1043, + "num_input_tokens_seen": 67993488, + "step": 55870 + }, + { + "epoch": 6.222853324423655, + "grad_norm": 0.0024712709710001945, + "learning_rate": 4.351207630218904e-05, + "loss": 0.0013, + "num_input_tokens_seen": 67999760, + "step": 55875 + }, + { + "epoch": 6.2234101793072725, + "grad_norm": 0.8172916769981384, + "learning_rate": 4.351044324865375e-05, + "loss": 0.1058, + "num_input_tokens_seen": 68005520, + "step": 55880 + }, + { + "epoch": 6.22396703419089, + "grad_norm": 1.073921799659729, + "learning_rate": 4.35088100202725e-05, + "loss": 0.0351, + "num_input_tokens_seen": 68011792, + "step": 55885 + }, + { + "epoch": 6.224523889074507, + "grad_norm": 0.7475077509880066, + "learning_rate": 4.350717661706071e-05, + "loss": 0.0328, + "num_input_tokens_seen": 68018128, + "step": 55890 + }, + { + "epoch": 6.225080743958125, + "grad_norm": 0.011168544180691242, + "learning_rate": 4.35055430390338e-05, + "loss": 0.1224, + "num_input_tokens_seen": 68024368, + "step": 55895 + }, + { + "epoch": 6.225637598841741, + "grad_norm": 0.4500451385974884, + "learning_rate": 4.3503909286207215e-05, + "loss": 0.0829, + "num_input_tokens_seen": 68030384, + "step": 55900 + }, + { + "epoch": 6.226194453725359, + "grad_norm": 0.05028024688363075, + "learning_rate": 4.3502275358596376e-05, + "loss": 0.02, + "num_input_tokens_seen": 68036752, + "step": 55905 + }, + { + "epoch": 6.226751308608977, + "grad_norm": 1.1253130435943604, + "learning_rate": 4.350064125621673e-05, + "loss": 0.067, + "num_input_tokens_seen": 68042736, + "step": 55910 + }, + { + "epoch": 6.2273081634925935, + "grad_norm": 0.023776212707161903, + "learning_rate": 4.349900697908371e-05, + "loss": 0.051, + "num_input_tokens_seen": 68049232, + "step": 55915 + }, + { + "epoch": 6.227865018376211, + "grad_norm": 0.06120999529957771, + "learning_rate": 4.3497372527212745e-05, + "loss": 0.0821, + "num_input_tokens_seen": 68055536, + "step": 55920 + }, + { + "epoch": 6.228421873259829, + "grad_norm": 0.39417508244514465, + "learning_rate": 4.349573790061927e-05, + "loss": 0.0553, + "num_input_tokens_seen": 68061712, + "step": 55925 + }, + { + "epoch": 6.228978728143446, + "grad_norm": 1.5973106622695923, + "learning_rate": 4.3494103099318735e-05, + "loss": 0.1005, + "num_input_tokens_seen": 68067792, + "step": 55930 + }, + { + "epoch": 6.229535583027063, + "grad_norm": 0.03384104743599892, + "learning_rate": 4.349246812332658e-05, + "loss": 0.206, + "num_input_tokens_seen": 68073648, + "step": 55935 + }, + { + "epoch": 6.23009243791068, + "grad_norm": 1.0241366624832153, + "learning_rate": 4.349083297265825e-05, + "loss": 0.0494, + "num_input_tokens_seen": 68079728, + "step": 55940 + }, + { + "epoch": 6.230649292794298, + "grad_norm": 1.3396202325820923, + "learning_rate": 4.348919764732918e-05, + "loss": 0.0591, + "num_input_tokens_seen": 68085872, + "step": 55945 + }, + { + "epoch": 6.2312061476779155, + "grad_norm": 0.3495001494884491, + "learning_rate": 4.348756214735483e-05, + "loss": 0.0789, + "num_input_tokens_seen": 68091408, + "step": 55950 + }, + { + "epoch": 6.231763002561532, + "grad_norm": 0.07760825753211975, + "learning_rate": 4.348592647275064e-05, + "loss": 0.0448, + "num_input_tokens_seen": 68097744, + "step": 55955 + }, + { + "epoch": 6.23231985744515, + "grad_norm": 0.00032476679189130664, + "learning_rate": 4.348429062353206e-05, + "loss": 0.0273, + "num_input_tokens_seen": 68103984, + "step": 55960 + }, + { + "epoch": 6.232876712328767, + "grad_norm": 0.041532814502716064, + "learning_rate": 4.348265459971456e-05, + "loss": 0.0376, + "num_input_tokens_seen": 68110192, + "step": 55965 + }, + { + "epoch": 6.233433567212384, + "grad_norm": 0.3690589368343353, + "learning_rate": 4.348101840131357e-05, + "loss": 0.0483, + "num_input_tokens_seen": 68116592, + "step": 55970 + }, + { + "epoch": 6.233990422096002, + "grad_norm": 1.2747856378555298, + "learning_rate": 4.3479382028344555e-05, + "loss": 0.0538, + "num_input_tokens_seen": 68122640, + "step": 55975 + }, + { + "epoch": 6.234547276979619, + "grad_norm": 0.23323431611061096, + "learning_rate": 4.347774548082297e-05, + "loss": 0.1841, + "num_input_tokens_seen": 68128720, + "step": 55980 + }, + { + "epoch": 6.2351041318632365, + "grad_norm": 0.10189928859472275, + "learning_rate": 4.347610875876428e-05, + "loss": 0.0452, + "num_input_tokens_seen": 68134896, + "step": 55985 + }, + { + "epoch": 6.235660986746854, + "grad_norm": 0.0005315415910445154, + "learning_rate": 4.347447186218393e-05, + "loss": 0.0114, + "num_input_tokens_seen": 68140816, + "step": 55990 + }, + { + "epoch": 6.236217841630471, + "grad_norm": 0.05971066653728485, + "learning_rate": 4.347283479109741e-05, + "loss": 0.0429, + "num_input_tokens_seen": 68147280, + "step": 55995 + }, + { + "epoch": 6.236774696514089, + "grad_norm": 0.10233386605978012, + "learning_rate": 4.347119754552015e-05, + "loss": 0.0418, + "num_input_tokens_seen": 68153680, + "step": 56000 + }, + { + "epoch": 6.2373315513977055, + "grad_norm": 0.8109517097473145, + "learning_rate": 4.3469560125467635e-05, + "loss": 0.0424, + "num_input_tokens_seen": 68159696, + "step": 56005 + }, + { + "epoch": 6.237888406281323, + "grad_norm": 0.7166703939437866, + "learning_rate": 4.346792253095533e-05, + "loss": 0.0767, + "num_input_tokens_seen": 68165328, + "step": 56010 + }, + { + "epoch": 6.238445261164941, + "grad_norm": 0.00014181372534949332, + "learning_rate": 4.346628476199869e-05, + "loss": 0.0535, + "num_input_tokens_seen": 68171408, + "step": 56015 + }, + { + "epoch": 6.239002116048558, + "grad_norm": 0.21185992658138275, + "learning_rate": 4.3464646818613206e-05, + "loss": 0.0262, + "num_input_tokens_seen": 68177296, + "step": 56020 + }, + { + "epoch": 6.239558970932175, + "grad_norm": 0.033564042299985886, + "learning_rate": 4.3463008700814334e-05, + "loss": 0.0859, + "num_input_tokens_seen": 68183408, + "step": 56025 + }, + { + "epoch": 6.240115825815792, + "grad_norm": 0.13207590579986572, + "learning_rate": 4.346137040861755e-05, + "loss": 0.0894, + "num_input_tokens_seen": 68189872, + "step": 56030 + }, + { + "epoch": 6.24067268069941, + "grad_norm": 0.7253009676933289, + "learning_rate": 4.345973194203834e-05, + "loss": 0.0575, + "num_input_tokens_seen": 68195824, + "step": 56035 + }, + { + "epoch": 6.241229535583027, + "grad_norm": 0.9627826809883118, + "learning_rate": 4.345809330109217e-05, + "loss": 0.0531, + "num_input_tokens_seen": 68202064, + "step": 56040 + }, + { + "epoch": 6.241786390466644, + "grad_norm": 0.0007687507895752788, + "learning_rate": 4.345645448579452e-05, + "loss": 0.0227, + "num_input_tokens_seen": 68208368, + "step": 56045 + }, + { + "epoch": 6.242343245350262, + "grad_norm": 0.0573883019387722, + "learning_rate": 4.345481549616086e-05, + "loss": 0.0229, + "num_input_tokens_seen": 68214160, + "step": 56050 + }, + { + "epoch": 6.242900100233879, + "grad_norm": 0.1173572838306427, + "learning_rate": 4.345317633220669e-05, + "loss": 0.0061, + "num_input_tokens_seen": 68220208, + "step": 56055 + }, + { + "epoch": 6.243456955117496, + "grad_norm": 0.027671493589878082, + "learning_rate": 4.3451536993947486e-05, + "loss": 0.0411, + "num_input_tokens_seen": 68226608, + "step": 56060 + }, + { + "epoch": 6.244013810001114, + "grad_norm": 0.2448500692844391, + "learning_rate": 4.344989748139873e-05, + "loss": 0.0726, + "num_input_tokens_seen": 68232624, + "step": 56065 + }, + { + "epoch": 6.244570664884731, + "grad_norm": 0.03980154171586037, + "learning_rate": 4.344825779457592e-05, + "loss": 0.0102, + "num_input_tokens_seen": 68238672, + "step": 56070 + }, + { + "epoch": 6.2451275197683485, + "grad_norm": 0.014372213743627071, + "learning_rate": 4.344661793349452e-05, + "loss": 0.025, + "num_input_tokens_seen": 68245040, + "step": 56075 + }, + { + "epoch": 6.245684374651965, + "grad_norm": 0.07431583106517792, + "learning_rate": 4.344497789817004e-05, + "loss": 0.0011, + "num_input_tokens_seen": 68251472, + "step": 56080 + }, + { + "epoch": 6.246241229535583, + "grad_norm": 0.04266296327114105, + "learning_rate": 4.344333768861797e-05, + "loss": 0.1166, + "num_input_tokens_seen": 68257840, + "step": 56085 + }, + { + "epoch": 6.246798084419201, + "grad_norm": 0.0012940469896420836, + "learning_rate": 4.344169730485379e-05, + "loss": 0.0639, + "num_input_tokens_seen": 68263632, + "step": 56090 + }, + { + "epoch": 6.247354939302817, + "grad_norm": 0.004363345447927713, + "learning_rate": 4.344005674689301e-05, + "loss": 0.0673, + "num_input_tokens_seen": 68269840, + "step": 56095 + }, + { + "epoch": 6.247911794186435, + "grad_norm": 0.3220835030078888, + "learning_rate": 4.3438416014751124e-05, + "loss": 0.011, + "num_input_tokens_seen": 68276496, + "step": 56100 + }, + { + "epoch": 6.248468649070053, + "grad_norm": 0.11923260241746902, + "learning_rate": 4.343677510844362e-05, + "loss": 0.0542, + "num_input_tokens_seen": 68282544, + "step": 56105 + }, + { + "epoch": 6.2490255039536695, + "grad_norm": 1.038543701171875, + "learning_rate": 4.343513402798601e-05, + "loss": 0.0863, + "num_input_tokens_seen": 68288752, + "step": 56110 + }, + { + "epoch": 6.249582358837287, + "grad_norm": 0.6927447319030762, + "learning_rate": 4.343349277339378e-05, + "loss": 0.1711, + "num_input_tokens_seen": 68295024, + "step": 56115 + }, + { + "epoch": 6.250139213720904, + "grad_norm": 0.7145163416862488, + "learning_rate": 4.343185134468245e-05, + "loss": 0.1001, + "num_input_tokens_seen": 68300944, + "step": 56120 + }, + { + "epoch": 6.250696068604522, + "grad_norm": 0.027701694518327713, + "learning_rate": 4.343020974186751e-05, + "loss": 0.0312, + "num_input_tokens_seen": 68306960, + "step": 56125 + }, + { + "epoch": 6.251252923488139, + "grad_norm": 0.011156933382153511, + "learning_rate": 4.342856796496448e-05, + "loss": 0.038, + "num_input_tokens_seen": 68312784, + "step": 56130 + }, + { + "epoch": 6.251809778371756, + "grad_norm": 0.4907577931880951, + "learning_rate": 4.342692601398886e-05, + "loss": 0.0115, + "num_input_tokens_seen": 68318960, + "step": 56135 + }, + { + "epoch": 6.252366633255374, + "grad_norm": 0.1414986103773117, + "learning_rate": 4.3425283888956144e-05, + "loss": 0.0231, + "num_input_tokens_seen": 68324848, + "step": 56140 + }, + { + "epoch": 6.252923488138991, + "grad_norm": 0.775676965713501, + "learning_rate": 4.3423641589881884e-05, + "loss": 0.0145, + "num_input_tokens_seen": 68331408, + "step": 56145 + }, + { + "epoch": 6.253480343022608, + "grad_norm": 0.22202517092227936, + "learning_rate": 4.342199911678155e-05, + "loss": 0.0198, + "num_input_tokens_seen": 68337616, + "step": 56150 + }, + { + "epoch": 6.254037197906226, + "grad_norm": 0.028851188719272614, + "learning_rate": 4.3420356469670684e-05, + "loss": 0.0024, + "num_input_tokens_seen": 68343728, + "step": 56155 + }, + { + "epoch": 6.254594052789843, + "grad_norm": 0.015118051320314407, + "learning_rate": 4.341871364856479e-05, + "loss": 0.0652, + "num_input_tokens_seen": 68349712, + "step": 56160 + }, + { + "epoch": 6.25515090767346, + "grad_norm": 0.03756766766309738, + "learning_rate": 4.34170706534794e-05, + "loss": 0.0017, + "num_input_tokens_seen": 68355984, + "step": 56165 + }, + { + "epoch": 6.255707762557078, + "grad_norm": 0.03393058106303215, + "learning_rate": 4.3415427484430006e-05, + "loss": 0.0865, + "num_input_tokens_seen": 68361872, + "step": 56170 + }, + { + "epoch": 6.256264617440695, + "grad_norm": 0.30909889936447144, + "learning_rate": 4.341378414143215e-05, + "loss": 0.011, + "num_input_tokens_seen": 68367888, + "step": 56175 + }, + { + "epoch": 6.2568214723243125, + "grad_norm": 1.051586627960205, + "learning_rate": 4.341214062450135e-05, + "loss": 0.1022, + "num_input_tokens_seen": 68373808, + "step": 56180 + }, + { + "epoch": 6.257378327207929, + "grad_norm": 0.01387704722583294, + "learning_rate": 4.3410496933653135e-05, + "loss": 0.0698, + "num_input_tokens_seen": 68379952, + "step": 56185 + }, + { + "epoch": 6.257935182091547, + "grad_norm": 0.0015893349191173911, + "learning_rate": 4.340885306890302e-05, + "loss": 0.1421, + "num_input_tokens_seen": 68386128, + "step": 56190 + }, + { + "epoch": 6.258492036975165, + "grad_norm": 0.3467777669429779, + "learning_rate": 4.340720903026655e-05, + "loss": 0.07, + "num_input_tokens_seen": 68392272, + "step": 56195 + }, + { + "epoch": 6.259048891858781, + "grad_norm": 0.22572793066501617, + "learning_rate": 4.340556481775923e-05, + "loss": 0.0278, + "num_input_tokens_seen": 68398512, + "step": 56200 + }, + { + "epoch": 6.259605746742399, + "grad_norm": 0.4396231770515442, + "learning_rate": 4.3403920431396605e-05, + "loss": 0.0585, + "num_input_tokens_seen": 68404688, + "step": 56205 + }, + { + "epoch": 6.260162601626016, + "grad_norm": 0.0015905698528513312, + "learning_rate": 4.340227587119421e-05, + "loss": 0.053, + "num_input_tokens_seen": 68410992, + "step": 56210 + }, + { + "epoch": 6.260719456509634, + "grad_norm": 0.6263272762298584, + "learning_rate": 4.340063113716758e-05, + "loss": 0.0955, + "num_input_tokens_seen": 68416816, + "step": 56215 + }, + { + "epoch": 6.261276311393251, + "grad_norm": 1.0496591329574585, + "learning_rate": 4.3398986229332237e-05, + "loss": 0.1576, + "num_input_tokens_seen": 68422736, + "step": 56220 + }, + { + "epoch": 6.261833166276868, + "grad_norm": 1.1390087604522705, + "learning_rate": 4.339734114770374e-05, + "loss": 0.0367, + "num_input_tokens_seen": 68428976, + "step": 56225 + }, + { + "epoch": 6.262390021160486, + "grad_norm": 0.00062122120289132, + "learning_rate": 4.339569589229761e-05, + "loss": 0.0905, + "num_input_tokens_seen": 68435376, + "step": 56230 + }, + { + "epoch": 6.2629468760441025, + "grad_norm": 0.21539337933063507, + "learning_rate": 4.339405046312939e-05, + "loss": 0.077, + "num_input_tokens_seen": 68441456, + "step": 56235 + }, + { + "epoch": 6.26350373092772, + "grad_norm": 1.1062071323394775, + "learning_rate": 4.339240486021463e-05, + "loss": 0.0946, + "num_input_tokens_seen": 68447504, + "step": 56240 + }, + { + "epoch": 6.264060585811338, + "grad_norm": 0.743888258934021, + "learning_rate": 4.339075908356887e-05, + "loss": 0.1466, + "num_input_tokens_seen": 68453616, + "step": 56245 + }, + { + "epoch": 6.264617440694955, + "grad_norm": 0.8404601216316223, + "learning_rate": 4.338911313320766e-05, + "loss": 0.1583, + "num_input_tokens_seen": 68458800, + "step": 56250 + }, + { + "epoch": 6.265174295578572, + "grad_norm": 0.00313106388784945, + "learning_rate": 4.338746700914654e-05, + "loss": 0.0035, + "num_input_tokens_seen": 68465104, + "step": 56255 + }, + { + "epoch": 6.265731150462189, + "grad_norm": 0.7821508646011353, + "learning_rate": 4.338582071140106e-05, + "loss": 0.0429, + "num_input_tokens_seen": 68470960, + "step": 56260 + }, + { + "epoch": 6.266288005345807, + "grad_norm": 0.1980813592672348, + "learning_rate": 4.3384174239986775e-05, + "loss": 0.0828, + "num_input_tokens_seen": 68476816, + "step": 56265 + }, + { + "epoch": 6.266844860229424, + "grad_norm": 0.7127882242202759, + "learning_rate": 4.3382527594919236e-05, + "loss": 0.0728, + "num_input_tokens_seen": 68483344, + "step": 56270 + }, + { + "epoch": 6.267401715113041, + "grad_norm": 0.010108399204909801, + "learning_rate": 4.3380880776213995e-05, + "loss": 0.0046, + "num_input_tokens_seen": 68489296, + "step": 56275 + }, + { + "epoch": 6.267958569996659, + "grad_norm": 0.037267521023750305, + "learning_rate": 4.337923378388661e-05, + "loss": 0.0085, + "num_input_tokens_seen": 68495536, + "step": 56280 + }, + { + "epoch": 6.268515424880277, + "grad_norm": 2.1306943893432617, + "learning_rate": 4.3377586617952634e-05, + "loss": 0.2606, + "num_input_tokens_seen": 68501744, + "step": 56285 + }, + { + "epoch": 6.269072279763893, + "grad_norm": 0.7788159251213074, + "learning_rate": 4.337593927842763e-05, + "loss": 0.0526, + "num_input_tokens_seen": 68507920, + "step": 56290 + }, + { + "epoch": 6.269629134647511, + "grad_norm": 0.02530459314584732, + "learning_rate": 4.337429176532716e-05, + "loss": 0.0217, + "num_input_tokens_seen": 68514000, + "step": 56295 + }, + { + "epoch": 6.270185989531128, + "grad_norm": 0.021335506811738014, + "learning_rate": 4.337264407866678e-05, + "loss": 0.0657, + "num_input_tokens_seen": 68519728, + "step": 56300 + }, + { + "epoch": 6.2707428444147455, + "grad_norm": 0.3691554665565491, + "learning_rate": 4.337099621846206e-05, + "loss": 0.0874, + "num_input_tokens_seen": 68525936, + "step": 56305 + }, + { + "epoch": 6.271299699298363, + "grad_norm": 0.12386738508939743, + "learning_rate": 4.336934818472855e-05, + "loss": 0.0283, + "num_input_tokens_seen": 68531664, + "step": 56310 + }, + { + "epoch": 6.27185655418198, + "grad_norm": 1.1431140899658203, + "learning_rate": 4.336769997748184e-05, + "loss": 0.1282, + "num_input_tokens_seen": 68537648, + "step": 56315 + }, + { + "epoch": 6.272413409065598, + "grad_norm": 1.4278115034103394, + "learning_rate": 4.336605159673749e-05, + "loss": 0.0322, + "num_input_tokens_seen": 68543984, + "step": 56320 + }, + { + "epoch": 6.272970263949214, + "grad_norm": 0.40483713150024414, + "learning_rate": 4.336440304251106e-05, + "loss": 0.0297, + "num_input_tokens_seen": 68549392, + "step": 56325 + }, + { + "epoch": 6.273527118832832, + "grad_norm": 0.0026762604247778654, + "learning_rate": 4.336275431481813e-05, + "loss": 0.0042, + "num_input_tokens_seen": 68555792, + "step": 56330 + }, + { + "epoch": 6.27408397371645, + "grad_norm": 0.038803890347480774, + "learning_rate": 4.3361105413674284e-05, + "loss": 0.0112, + "num_input_tokens_seen": 68561936, + "step": 56335 + }, + { + "epoch": 6.2746408286000666, + "grad_norm": 0.5483822226524353, + "learning_rate": 4.3359456339095075e-05, + "loss": 0.0402, + "num_input_tokens_seen": 68568240, + "step": 56340 + }, + { + "epoch": 6.275197683483684, + "grad_norm": 0.9250123500823975, + "learning_rate": 4.33578070910961e-05, + "loss": 0.0458, + "num_input_tokens_seen": 68573936, + "step": 56345 + }, + { + "epoch": 6.275754538367302, + "grad_norm": 0.0029275682754814625, + "learning_rate": 4.3356157669692924e-05, + "loss": 0.0164, + "num_input_tokens_seen": 68580080, + "step": 56350 + }, + { + "epoch": 6.276311393250919, + "grad_norm": 0.0003169671108480543, + "learning_rate": 4.335450807490113e-05, + "loss": 0.0112, + "num_input_tokens_seen": 68586448, + "step": 56355 + }, + { + "epoch": 6.276868248134536, + "grad_norm": 0.0005867498693987727, + "learning_rate": 4.3352858306736314e-05, + "loss": 0.0042, + "num_input_tokens_seen": 68592560, + "step": 56360 + }, + { + "epoch": 6.277425103018153, + "grad_norm": 0.8886396884918213, + "learning_rate": 4.335120836521404e-05, + "loss": 0.1584, + "num_input_tokens_seen": 68598672, + "step": 56365 + }, + { + "epoch": 6.277981957901771, + "grad_norm": 0.0017854288453236222, + "learning_rate": 4.33495582503499e-05, + "loss": 0.0099, + "num_input_tokens_seen": 68605232, + "step": 56370 + }, + { + "epoch": 6.2785388127853885, + "grad_norm": 0.00046494495472870767, + "learning_rate": 4.3347907962159475e-05, + "loss": 0.0125, + "num_input_tokens_seen": 68611184, + "step": 56375 + }, + { + "epoch": 6.279095667669005, + "grad_norm": 0.09001441299915314, + "learning_rate": 4.334625750065836e-05, + "loss": 0.0181, + "num_input_tokens_seen": 68617136, + "step": 56380 + }, + { + "epoch": 6.279652522552623, + "grad_norm": 0.8048348426818848, + "learning_rate": 4.3344606865862146e-05, + "loss": 0.0188, + "num_input_tokens_seen": 68623280, + "step": 56385 + }, + { + "epoch": 6.28020937743624, + "grad_norm": 0.3840579390525818, + "learning_rate": 4.3342956057786425e-05, + "loss": 0.0551, + "num_input_tokens_seen": 68629136, + "step": 56390 + }, + { + "epoch": 6.280766232319857, + "grad_norm": 0.7722741961479187, + "learning_rate": 4.3341305076446795e-05, + "loss": 0.0876, + "num_input_tokens_seen": 68635408, + "step": 56395 + }, + { + "epoch": 6.281323087203475, + "grad_norm": 0.9082796573638916, + "learning_rate": 4.3339653921858834e-05, + "loss": 0.0662, + "num_input_tokens_seen": 68641648, + "step": 56400 + }, + { + "epoch": 6.281879942087092, + "grad_norm": 0.2392777055501938, + "learning_rate": 4.3338002594038154e-05, + "loss": 0.1204, + "num_input_tokens_seen": 68647984, + "step": 56405 + }, + { + "epoch": 6.28243679697071, + "grad_norm": 3.1227359771728516, + "learning_rate": 4.3336351093000335e-05, + "loss": 0.224, + "num_input_tokens_seen": 68653392, + "step": 56410 + }, + { + "epoch": 6.282993651854326, + "grad_norm": 0.07895103096961975, + "learning_rate": 4.3334699418761e-05, + "loss": 0.0277, + "num_input_tokens_seen": 68659440, + "step": 56415 + }, + { + "epoch": 6.283550506737944, + "grad_norm": 1.0216214656829834, + "learning_rate": 4.333304757133574e-05, + "loss": 0.0771, + "num_input_tokens_seen": 68665904, + "step": 56420 + }, + { + "epoch": 6.284107361621562, + "grad_norm": 1.4634603261947632, + "learning_rate": 4.3331395550740154e-05, + "loss": 0.0736, + "num_input_tokens_seen": 68671856, + "step": 56425 + }, + { + "epoch": 6.2846642165051785, + "grad_norm": 0.015866218134760857, + "learning_rate": 4.332974335698985e-05, + "loss": 0.068, + "num_input_tokens_seen": 68677968, + "step": 56430 + }, + { + "epoch": 6.285221071388796, + "grad_norm": 0.5497813820838928, + "learning_rate": 4.332809099010043e-05, + "loss": 0.0767, + "num_input_tokens_seen": 68683952, + "step": 56435 + }, + { + "epoch": 6.285777926272413, + "grad_norm": 0.268940806388855, + "learning_rate": 4.332643845008752e-05, + "loss": 0.0073, + "num_input_tokens_seen": 68690064, + "step": 56440 + }, + { + "epoch": 6.286334781156031, + "grad_norm": 0.4650144875049591, + "learning_rate": 4.332478573696671e-05, + "loss": 0.0886, + "num_input_tokens_seen": 68695664, + "step": 56445 + }, + { + "epoch": 6.286891636039648, + "grad_norm": 0.29241621494293213, + "learning_rate": 4.332313285075361e-05, + "loss": 0.026, + "num_input_tokens_seen": 68701552, + "step": 56450 + }, + { + "epoch": 6.287448490923265, + "grad_norm": 0.05909296125173569, + "learning_rate": 4.332147979146385e-05, + "loss": 0.1068, + "num_input_tokens_seen": 68707760, + "step": 56455 + }, + { + "epoch": 6.288005345806883, + "grad_norm": 0.03440558537840843, + "learning_rate": 4.331982655911303e-05, + "loss": 0.022, + "num_input_tokens_seen": 68714064, + "step": 56460 + }, + { + "epoch": 6.2885622006905, + "grad_norm": 0.19677497446537018, + "learning_rate": 4.331817315371677e-05, + "loss": 0.0164, + "num_input_tokens_seen": 68720464, + "step": 56465 + }, + { + "epoch": 6.289119055574117, + "grad_norm": 0.40137186646461487, + "learning_rate": 4.3316519575290686e-05, + "loss": 0.0711, + "num_input_tokens_seen": 68726608, + "step": 56470 + }, + { + "epoch": 6.289675910457735, + "grad_norm": 1.3956944942474365, + "learning_rate": 4.3314865823850406e-05, + "loss": 0.021, + "num_input_tokens_seen": 68732560, + "step": 56475 + }, + { + "epoch": 6.290232765341352, + "grad_norm": 0.6132339835166931, + "learning_rate": 4.331321189941154e-05, + "loss": 0.1252, + "num_input_tokens_seen": 68738096, + "step": 56480 + }, + { + "epoch": 6.290789620224969, + "grad_norm": 0.5454249382019043, + "learning_rate": 4.331155780198971e-05, + "loss": 0.1383, + "num_input_tokens_seen": 68743792, + "step": 56485 + }, + { + "epoch": 6.291346475108587, + "grad_norm": 1.2253257036209106, + "learning_rate": 4.330990353160055e-05, + "loss": 0.04, + "num_input_tokens_seen": 68749776, + "step": 56490 + }, + { + "epoch": 6.291903329992204, + "grad_norm": 3.0952398777008057, + "learning_rate": 4.330824908825969e-05, + "loss": 0.1127, + "num_input_tokens_seen": 68756080, + "step": 56495 + }, + { + "epoch": 6.2924601848758215, + "grad_norm": 0.012552863918244839, + "learning_rate": 4.330659447198274e-05, + "loss": 0.0184, + "num_input_tokens_seen": 68761776, + "step": 56500 + }, + { + "epoch": 6.293017039759439, + "grad_norm": 0.1241237223148346, + "learning_rate": 4.330493968278534e-05, + "loss": 0.0097, + "num_input_tokens_seen": 68767888, + "step": 56505 + }, + { + "epoch": 6.293573894643056, + "grad_norm": 0.5243682265281677, + "learning_rate": 4.330328472068312e-05, + "loss": 0.1445, + "num_input_tokens_seen": 68774096, + "step": 56510 + }, + { + "epoch": 6.294130749526674, + "grad_norm": 0.9360673427581787, + "learning_rate": 4.3301629585691704e-05, + "loss": 0.1768, + "num_input_tokens_seen": 68780400, + "step": 56515 + }, + { + "epoch": 6.29468760441029, + "grad_norm": 0.012460693717002869, + "learning_rate": 4.329997427782675e-05, + "loss": 0.0679, + "num_input_tokens_seen": 68785968, + "step": 56520 + }, + { + "epoch": 6.295244459293908, + "grad_norm": 0.00037366943433880806, + "learning_rate": 4.3298318797103866e-05, + "loss": 0.0507, + "num_input_tokens_seen": 68792080, + "step": 56525 + }, + { + "epoch": 6.295801314177526, + "grad_norm": 0.03155302256345749, + "learning_rate": 4.32966631435387e-05, + "loss": 0.2192, + "num_input_tokens_seen": 68798128, + "step": 56530 + }, + { + "epoch": 6.2963581690611425, + "grad_norm": 0.8720575571060181, + "learning_rate": 4.329500731714689e-05, + "loss": 0.0457, + "num_input_tokens_seen": 68803792, + "step": 56535 + }, + { + "epoch": 6.29691502394476, + "grad_norm": 0.0008090610499493778, + "learning_rate": 4.329335131794408e-05, + "loss": 0.0158, + "num_input_tokens_seen": 68809808, + "step": 56540 + }, + { + "epoch": 6.297471878828377, + "grad_norm": 0.10321034491062164, + "learning_rate": 4.329169514594592e-05, + "loss": 0.008, + "num_input_tokens_seen": 68816112, + "step": 56545 + }, + { + "epoch": 6.298028733711995, + "grad_norm": 1.0612571239471436, + "learning_rate": 4.329003880116803e-05, + "loss": 0.1325, + "num_input_tokens_seen": 68821808, + "step": 56550 + }, + { + "epoch": 6.298585588595612, + "grad_norm": 0.2951580286026001, + "learning_rate": 4.328838228362608e-05, + "loss": 0.0555, + "num_input_tokens_seen": 68827728, + "step": 56555 + }, + { + "epoch": 6.299142443479229, + "grad_norm": 0.08370629698038101, + "learning_rate": 4.3286725593335706e-05, + "loss": 0.1241, + "num_input_tokens_seen": 68834128, + "step": 56560 + }, + { + "epoch": 6.299699298362847, + "grad_norm": 0.28364482522010803, + "learning_rate": 4.3285068730312555e-05, + "loss": 0.0134, + "num_input_tokens_seen": 68840336, + "step": 56565 + }, + { + "epoch": 6.300256153246464, + "grad_norm": 0.04742913320660591, + "learning_rate": 4.3283411694572285e-05, + "loss": 0.0178, + "num_input_tokens_seen": 68846288, + "step": 56570 + }, + { + "epoch": 6.300813008130081, + "grad_norm": 0.3967248499393463, + "learning_rate": 4.3281754486130535e-05, + "loss": 0.0818, + "num_input_tokens_seen": 68852176, + "step": 56575 + }, + { + "epoch": 6.301369863013699, + "grad_norm": 0.42538613080978394, + "learning_rate": 4.328009710500297e-05, + "loss": 0.0569, + "num_input_tokens_seen": 68858576, + "step": 56580 + }, + { + "epoch": 6.301926717897316, + "grad_norm": 1.3181791305541992, + "learning_rate": 4.327843955120524e-05, + "loss": 0.063, + "num_input_tokens_seen": 68864688, + "step": 56585 + }, + { + "epoch": 6.302483572780933, + "grad_norm": 0.029431482776999474, + "learning_rate": 4.327678182475301e-05, + "loss": 0.0127, + "num_input_tokens_seen": 68870864, + "step": 56590 + }, + { + "epoch": 6.30304042766455, + "grad_norm": 0.023580824956297874, + "learning_rate": 4.327512392566192e-05, + "loss": 0.0095, + "num_input_tokens_seen": 68877264, + "step": 56595 + }, + { + "epoch": 6.303597282548168, + "grad_norm": 0.36794474720954895, + "learning_rate": 4.327346585394766e-05, + "loss": 0.0602, + "num_input_tokens_seen": 68883312, + "step": 56600 + }, + { + "epoch": 6.3041541374317855, + "grad_norm": 0.8392344117164612, + "learning_rate": 4.3271807609625855e-05, + "loss": 0.0774, + "num_input_tokens_seen": 68889552, + "step": 56605 + }, + { + "epoch": 6.304710992315402, + "grad_norm": 0.5758239030838013, + "learning_rate": 4.3270149192712205e-05, + "loss": 0.0151, + "num_input_tokens_seen": 68895888, + "step": 56610 + }, + { + "epoch": 6.30526784719902, + "grad_norm": 1.1336408853530884, + "learning_rate": 4.3268490603222354e-05, + "loss": 0.0435, + "num_input_tokens_seen": 68901840, + "step": 56615 + }, + { + "epoch": 6.305824702082638, + "grad_norm": 1.0886056423187256, + "learning_rate": 4.3266831841171976e-05, + "loss": 0.105, + "num_input_tokens_seen": 68908016, + "step": 56620 + }, + { + "epoch": 6.3063815569662545, + "grad_norm": 0.09767906367778778, + "learning_rate": 4.3265172906576725e-05, + "loss": 0.0783, + "num_input_tokens_seen": 68913968, + "step": 56625 + }, + { + "epoch": 6.306938411849872, + "grad_norm": 1.9973305463790894, + "learning_rate": 4.326351379945229e-05, + "loss": 0.0724, + "num_input_tokens_seen": 68919696, + "step": 56630 + }, + { + "epoch": 6.307495266733489, + "grad_norm": 0.11632154881954193, + "learning_rate": 4.326185451981433e-05, + "loss": 0.0487, + "num_input_tokens_seen": 68925712, + "step": 56635 + }, + { + "epoch": 6.308052121617107, + "grad_norm": 0.0003255713963881135, + "learning_rate": 4.3260195067678525e-05, + "loss": 0.0181, + "num_input_tokens_seen": 68931824, + "step": 56640 + }, + { + "epoch": 6.308608976500724, + "grad_norm": 0.010423832572996616, + "learning_rate": 4.325853544306055e-05, + "loss": 0.0681, + "num_input_tokens_seen": 68937872, + "step": 56645 + }, + { + "epoch": 6.309165831384341, + "grad_norm": 0.27947214245796204, + "learning_rate": 4.325687564597608e-05, + "loss": 0.1079, + "num_input_tokens_seen": 68943888, + "step": 56650 + }, + { + "epoch": 6.309722686267959, + "grad_norm": 0.026097219437360764, + "learning_rate": 4.325521567644078e-05, + "loss": 0.0834, + "num_input_tokens_seen": 68949808, + "step": 56655 + }, + { + "epoch": 6.3102795411515755, + "grad_norm": 1.4798017740249634, + "learning_rate": 4.3253555534470355e-05, + "loss": 0.0647, + "num_input_tokens_seen": 68956112, + "step": 56660 + }, + { + "epoch": 6.310836396035193, + "grad_norm": 0.2131785750389099, + "learning_rate": 4.3251895220080476e-05, + "loss": 0.0433, + "num_input_tokens_seen": 68962320, + "step": 56665 + }, + { + "epoch": 6.311393250918811, + "grad_norm": 0.07293704152107239, + "learning_rate": 4.325023473328682e-05, + "loss": 0.0562, + "num_input_tokens_seen": 68968816, + "step": 56670 + }, + { + "epoch": 6.311950105802428, + "grad_norm": 1.6855193376541138, + "learning_rate": 4.324857407410507e-05, + "loss": 0.1872, + "num_input_tokens_seen": 68974832, + "step": 56675 + }, + { + "epoch": 6.312506960686045, + "grad_norm": 0.8520923256874084, + "learning_rate": 4.324691324255092e-05, + "loss": 0.0575, + "num_input_tokens_seen": 68981040, + "step": 56680 + }, + { + "epoch": 6.313063815569663, + "grad_norm": 0.9960837960243225, + "learning_rate": 4.324525223864005e-05, + "loss": 0.1413, + "num_input_tokens_seen": 68986032, + "step": 56685 + }, + { + "epoch": 6.31362067045328, + "grad_norm": 0.4491407871246338, + "learning_rate": 4.324359106238817e-05, + "loss": 0.0133, + "num_input_tokens_seen": 68992400, + "step": 56690 + }, + { + "epoch": 6.3141775253368975, + "grad_norm": 0.8682177662849426, + "learning_rate": 4.3241929713810944e-05, + "loss": 0.018, + "num_input_tokens_seen": 68998608, + "step": 56695 + }, + { + "epoch": 6.314734380220514, + "grad_norm": 0.0870392918586731, + "learning_rate": 4.324026819292408e-05, + "loss": 0.0205, + "num_input_tokens_seen": 69004656, + "step": 56700 + }, + { + "epoch": 6.315291235104132, + "grad_norm": 0.04128801450133324, + "learning_rate": 4.323860649974326e-05, + "loss": 0.0179, + "num_input_tokens_seen": 69010608, + "step": 56705 + }, + { + "epoch": 6.31584808998775, + "grad_norm": 0.0678086131811142, + "learning_rate": 4.32369446342842e-05, + "loss": 0.0079, + "num_input_tokens_seen": 69016784, + "step": 56710 + }, + { + "epoch": 6.316404944871366, + "grad_norm": 0.02774595469236374, + "learning_rate": 4.323528259656259e-05, + "loss": 0.0054, + "num_input_tokens_seen": 69022864, + "step": 56715 + }, + { + "epoch": 6.316961799754984, + "grad_norm": 0.0015861515421420336, + "learning_rate": 4.323362038659412e-05, + "loss": 0.0734, + "num_input_tokens_seen": 69028848, + "step": 56720 + }, + { + "epoch": 6.317518654638601, + "grad_norm": 0.012391116470098495, + "learning_rate": 4.323195800439449e-05, + "loss": 0.0064, + "num_input_tokens_seen": 69034416, + "step": 56725 + }, + { + "epoch": 6.3180755095222185, + "grad_norm": 0.7188901901245117, + "learning_rate": 4.323029544997942e-05, + "loss": 0.1289, + "num_input_tokens_seen": 69040208, + "step": 56730 + }, + { + "epoch": 6.318632364405836, + "grad_norm": 0.520232081413269, + "learning_rate": 4.32286327233646e-05, + "loss": 0.091, + "num_input_tokens_seen": 69046416, + "step": 56735 + }, + { + "epoch": 6.319189219289453, + "grad_norm": 1.0532194375991821, + "learning_rate": 4.322696982456574e-05, + "loss": 0.0559, + "num_input_tokens_seen": 69052368, + "step": 56740 + }, + { + "epoch": 6.319746074173071, + "grad_norm": 0.5496587753295898, + "learning_rate": 4.322530675359855e-05, + "loss": 0.0474, + "num_input_tokens_seen": 69058224, + "step": 56745 + }, + { + "epoch": 6.3203029290566874, + "grad_norm": 0.8017135858535767, + "learning_rate": 4.3223643510478726e-05, + "loss": 0.1096, + "num_input_tokens_seen": 69064528, + "step": 56750 + }, + { + "epoch": 6.320859783940305, + "grad_norm": 0.09237472712993622, + "learning_rate": 4.3221980095222e-05, + "loss": 0.008, + "num_input_tokens_seen": 69070960, + "step": 56755 + }, + { + "epoch": 6.321416638823923, + "grad_norm": 0.825340747833252, + "learning_rate": 4.322031650784406e-05, + "loss": 0.1479, + "num_input_tokens_seen": 69077072, + "step": 56760 + }, + { + "epoch": 6.32197349370754, + "grad_norm": 0.30306822061538696, + "learning_rate": 4.3218652748360645e-05, + "loss": 0.1164, + "num_input_tokens_seen": 69083120, + "step": 56765 + }, + { + "epoch": 6.322530348591157, + "grad_norm": 0.0003200952778570354, + "learning_rate": 4.321698881678745e-05, + "loss": 0.0463, + "num_input_tokens_seen": 69089424, + "step": 56770 + }, + { + "epoch": 6.323087203474774, + "grad_norm": 0.0940389558672905, + "learning_rate": 4.3215324713140205e-05, + "loss": 0.0225, + "num_input_tokens_seen": 69095952, + "step": 56775 + }, + { + "epoch": 6.323644058358392, + "grad_norm": 0.2089409977197647, + "learning_rate": 4.321366043743462e-05, + "loss": 0.1521, + "num_input_tokens_seen": 69101456, + "step": 56780 + }, + { + "epoch": 6.324200913242009, + "grad_norm": 0.41461509466171265, + "learning_rate": 4.3211995989686427e-05, + "loss": 0.1589, + "num_input_tokens_seen": 69107216, + "step": 56785 + }, + { + "epoch": 6.324757768125626, + "grad_norm": 3.8610482215881348, + "learning_rate": 4.3210331369911336e-05, + "loss": 0.0609, + "num_input_tokens_seen": 69113360, + "step": 56790 + }, + { + "epoch": 6.325314623009244, + "grad_norm": 0.2324003130197525, + "learning_rate": 4.320866657812507e-05, + "loss": 0.0442, + "num_input_tokens_seen": 69119280, + "step": 56795 + }, + { + "epoch": 6.3258714778928615, + "grad_norm": 0.08196037262678146, + "learning_rate": 4.3207001614343365e-05, + "loss": 0.0091, + "num_input_tokens_seen": 69125648, + "step": 56800 + }, + { + "epoch": 6.326428332776478, + "grad_norm": 0.9734481573104858, + "learning_rate": 4.3205336478581946e-05, + "loss": 0.0505, + "num_input_tokens_seen": 69131792, + "step": 56805 + }, + { + "epoch": 6.326985187660096, + "grad_norm": 0.07763094455003738, + "learning_rate": 4.3203671170856535e-05, + "loss": 0.0331, + "num_input_tokens_seen": 69137776, + "step": 56810 + }, + { + "epoch": 6.327542042543713, + "grad_norm": 0.7560543417930603, + "learning_rate": 4.320200569118287e-05, + "loss": 0.0968, + "num_input_tokens_seen": 69144112, + "step": 56815 + }, + { + "epoch": 6.3280988974273304, + "grad_norm": 0.13570627570152283, + "learning_rate": 4.320034003957667e-05, + "loss": 0.0365, + "num_input_tokens_seen": 69150224, + "step": 56820 + }, + { + "epoch": 6.328655752310948, + "grad_norm": 1.427976131439209, + "learning_rate": 4.3198674216053676e-05, + "loss": 0.143, + "num_input_tokens_seen": 69156080, + "step": 56825 + }, + { + "epoch": 6.329212607194565, + "grad_norm": 0.00484324898570776, + "learning_rate": 4.319700822062963e-05, + "loss": 0.0493, + "num_input_tokens_seen": 69162256, + "step": 56830 + }, + { + "epoch": 6.329769462078183, + "grad_norm": 0.8215541839599609, + "learning_rate": 4.3195342053320254e-05, + "loss": 0.0672, + "num_input_tokens_seen": 69168080, + "step": 56835 + }, + { + "epoch": 6.330326316961799, + "grad_norm": 0.020253948867321014, + "learning_rate": 4.31936757141413e-05, + "loss": 0.0745, + "num_input_tokens_seen": 69174064, + "step": 56840 + }, + { + "epoch": 6.330883171845417, + "grad_norm": 0.08086197078227997, + "learning_rate": 4.3192009203108506e-05, + "loss": 0.0446, + "num_input_tokens_seen": 69180432, + "step": 56845 + }, + { + "epoch": 6.331440026729035, + "grad_norm": 0.264968603849411, + "learning_rate": 4.31903425202376e-05, + "loss": 0.0374, + "num_input_tokens_seen": 69186768, + "step": 56850 + }, + { + "epoch": 6.3319968816126515, + "grad_norm": 0.015266980044543743, + "learning_rate": 4.318867566554434e-05, + "loss": 0.0196, + "num_input_tokens_seen": 69192784, + "step": 56855 + }, + { + "epoch": 6.332553736496269, + "grad_norm": 1.3569375276565552, + "learning_rate": 4.318700863904447e-05, + "loss": 0.1005, + "num_input_tokens_seen": 69198544, + "step": 56860 + }, + { + "epoch": 6.333110591379887, + "grad_norm": 0.008546972647309303, + "learning_rate": 4.318534144075373e-05, + "loss": 0.0009, + "num_input_tokens_seen": 69204752, + "step": 56865 + }, + { + "epoch": 6.333667446263504, + "grad_norm": 0.002912157215178013, + "learning_rate": 4.318367407068787e-05, + "loss": 0.0207, + "num_input_tokens_seen": 69211024, + "step": 56870 + }, + { + "epoch": 6.334224301147121, + "grad_norm": 0.9205387830734253, + "learning_rate": 4.318200652886264e-05, + "loss": 0.167, + "num_input_tokens_seen": 69217104, + "step": 56875 + }, + { + "epoch": 6.334781156030738, + "grad_norm": 0.3573179543018341, + "learning_rate": 4.3180338815293784e-05, + "loss": 0.1282, + "num_input_tokens_seen": 69222832, + "step": 56880 + }, + { + "epoch": 6.335338010914356, + "grad_norm": 0.7695455551147461, + "learning_rate": 4.317867092999707e-05, + "loss": 0.0715, + "num_input_tokens_seen": 69228976, + "step": 56885 + }, + { + "epoch": 6.3358948657979735, + "grad_norm": 0.13827180862426758, + "learning_rate": 4.317700287298825e-05, + "loss": 0.0819, + "num_input_tokens_seen": 69235248, + "step": 56890 + }, + { + "epoch": 6.33645172068159, + "grad_norm": 1.0848441123962402, + "learning_rate": 4.317533464428306e-05, + "loss": 0.0332, + "num_input_tokens_seen": 69241456, + "step": 56895 + }, + { + "epoch": 6.337008575565208, + "grad_norm": 0.07643219828605652, + "learning_rate": 4.317366624389728e-05, + "loss": 0.0154, + "num_input_tokens_seen": 69247408, + "step": 56900 + }, + { + "epoch": 6.337565430448825, + "grad_norm": 0.44043150544166565, + "learning_rate": 4.3171997671846664e-05, + "loss": 0.0268, + "num_input_tokens_seen": 69253424, + "step": 56905 + }, + { + "epoch": 6.338122285332442, + "grad_norm": 0.581341028213501, + "learning_rate": 4.317032892814697e-05, + "loss": 0.0319, + "num_input_tokens_seen": 69259408, + "step": 56910 + }, + { + "epoch": 6.33867914021606, + "grad_norm": 0.5447144508361816, + "learning_rate": 4.316866001281396e-05, + "loss": 0.0665, + "num_input_tokens_seen": 69265648, + "step": 56915 + }, + { + "epoch": 6.339235995099677, + "grad_norm": 1.4714417457580566, + "learning_rate": 4.316699092586339e-05, + "loss": 0.1166, + "num_input_tokens_seen": 69271760, + "step": 56920 + }, + { + "epoch": 6.3397928499832945, + "grad_norm": 0.0002919795806519687, + "learning_rate": 4.316532166731105e-05, + "loss": 0.0281, + "num_input_tokens_seen": 69277584, + "step": 56925 + }, + { + "epoch": 6.340349704866911, + "grad_norm": 0.0026667462661862373, + "learning_rate": 4.316365223717269e-05, + "loss": 0.0234, + "num_input_tokens_seen": 69283824, + "step": 56930 + }, + { + "epoch": 6.340906559750529, + "grad_norm": 0.004373368341475725, + "learning_rate": 4.316198263546408e-05, + "loss": 0.105, + "num_input_tokens_seen": 69289968, + "step": 56935 + }, + { + "epoch": 6.341463414634147, + "grad_norm": 1.0916752815246582, + "learning_rate": 4.316031286220099e-05, + "loss": 0.1173, + "num_input_tokens_seen": 69296016, + "step": 56940 + }, + { + "epoch": 6.342020269517763, + "grad_norm": 0.0020487108267843723, + "learning_rate": 4.3158642917399205e-05, + "loss": 0.033, + "num_input_tokens_seen": 69302512, + "step": 56945 + }, + { + "epoch": 6.342577124401381, + "grad_norm": 0.08288317918777466, + "learning_rate": 4.315697280107448e-05, + "loss": 0.1658, + "num_input_tokens_seen": 69308080, + "step": 56950 + }, + { + "epoch": 6.343133979284998, + "grad_norm": 0.039023566991090775, + "learning_rate": 4.31553025132426e-05, + "loss": 0.0386, + "num_input_tokens_seen": 69314288, + "step": 56955 + }, + { + "epoch": 6.343690834168616, + "grad_norm": 0.16561894118785858, + "learning_rate": 4.3153632053919346e-05, + "loss": 0.0669, + "num_input_tokens_seen": 69320464, + "step": 56960 + }, + { + "epoch": 6.344247689052233, + "grad_norm": 1.6165889501571655, + "learning_rate": 4.315196142312049e-05, + "loss": 0.1041, + "num_input_tokens_seen": 69326608, + "step": 56965 + }, + { + "epoch": 6.34480454393585, + "grad_norm": 0.4526687264442444, + "learning_rate": 4.315029062086182e-05, + "loss": 0.0391, + "num_input_tokens_seen": 69332784, + "step": 56970 + }, + { + "epoch": 6.345361398819468, + "grad_norm": 0.49524882435798645, + "learning_rate": 4.314861964715911e-05, + "loss": 0.0405, + "num_input_tokens_seen": 69338608, + "step": 56975 + }, + { + "epoch": 6.345918253703085, + "grad_norm": 0.028640516102313995, + "learning_rate": 4.314694850202815e-05, + "loss": 0.0364, + "num_input_tokens_seen": 69344816, + "step": 56980 + }, + { + "epoch": 6.346475108586702, + "grad_norm": 0.27252498269081116, + "learning_rate": 4.314527718548472e-05, + "loss": 0.0561, + "num_input_tokens_seen": 69350896, + "step": 56985 + }, + { + "epoch": 6.34703196347032, + "grad_norm": 0.44410449266433716, + "learning_rate": 4.31436056975446e-05, + "loss": 0.0629, + "num_input_tokens_seen": 69356400, + "step": 56990 + }, + { + "epoch": 6.347588818353937, + "grad_norm": 0.014632372185587883, + "learning_rate": 4.3141934038223596e-05, + "loss": 0.0387, + "num_input_tokens_seen": 69362256, + "step": 56995 + }, + { + "epoch": 6.348145673237554, + "grad_norm": 0.0004400857724249363, + "learning_rate": 4.31402622075375e-05, + "loss": 0.1556, + "num_input_tokens_seen": 69367824, + "step": 57000 + }, + { + "epoch": 6.348702528121172, + "grad_norm": 0.4974845051765442, + "learning_rate": 4.3138590205502085e-05, + "loss": 0.0181, + "num_input_tokens_seen": 69374320, + "step": 57005 + }, + { + "epoch": 6.349259383004789, + "grad_norm": 0.06450570374727249, + "learning_rate": 4.313691803213314e-05, + "loss": 0.0723, + "num_input_tokens_seen": 69380336, + "step": 57010 + }, + { + "epoch": 6.349816237888406, + "grad_norm": 0.007529123220592737, + "learning_rate": 4.313524568744649e-05, + "loss": 0.0693, + "num_input_tokens_seen": 69386224, + "step": 57015 + }, + { + "epoch": 6.350373092772023, + "grad_norm": 0.3216487765312195, + "learning_rate": 4.313357317145791e-05, + "loss": 0.0259, + "num_input_tokens_seen": 69391856, + "step": 57020 + }, + { + "epoch": 6.350929947655641, + "grad_norm": 0.005135785322636366, + "learning_rate": 4.31319004841832e-05, + "loss": 0.0439, + "num_input_tokens_seen": 69397872, + "step": 57025 + }, + { + "epoch": 6.351486802539259, + "grad_norm": 0.06596378237009048, + "learning_rate": 4.313022762563816e-05, + "loss": 0.0097, + "num_input_tokens_seen": 69404048, + "step": 57030 + }, + { + "epoch": 6.352043657422875, + "grad_norm": 1.0325076580047607, + "learning_rate": 4.312855459583861e-05, + "loss": 0.041, + "num_input_tokens_seen": 69409968, + "step": 57035 + }, + { + "epoch": 6.352600512306493, + "grad_norm": 0.19612844288349152, + "learning_rate": 4.3126881394800325e-05, + "loss": 0.0323, + "num_input_tokens_seen": 69416016, + "step": 57040 + }, + { + "epoch": 6.353157367190111, + "grad_norm": 0.06477908790111542, + "learning_rate": 4.312520802253912e-05, + "loss": 0.0274, + "num_input_tokens_seen": 69421840, + "step": 57045 + }, + { + "epoch": 6.3537142220737275, + "grad_norm": 1.0159175395965576, + "learning_rate": 4.312353447907082e-05, + "loss": 0.1498, + "num_input_tokens_seen": 69428208, + "step": 57050 + }, + { + "epoch": 6.354271076957345, + "grad_norm": 0.15824902057647705, + "learning_rate": 4.31218607644112e-05, + "loss": 0.0951, + "num_input_tokens_seen": 69434288, + "step": 57055 + }, + { + "epoch": 6.354827931840962, + "grad_norm": 0.08453046530485153, + "learning_rate": 4.3120186878576084e-05, + "loss": 0.006, + "num_input_tokens_seen": 69440656, + "step": 57060 + }, + { + "epoch": 6.35538478672458, + "grad_norm": 0.01054737251251936, + "learning_rate": 4.31185128215813e-05, + "loss": 0.0585, + "num_input_tokens_seen": 69446672, + "step": 57065 + }, + { + "epoch": 6.355941641608197, + "grad_norm": 0.6553971171379089, + "learning_rate": 4.311683859344263e-05, + "loss": 0.0549, + "num_input_tokens_seen": 69452720, + "step": 57070 + }, + { + "epoch": 6.356498496491814, + "grad_norm": 0.4966576397418976, + "learning_rate": 4.311516419417592e-05, + "loss": 0.136, + "num_input_tokens_seen": 69458800, + "step": 57075 + }, + { + "epoch": 6.357055351375432, + "grad_norm": 0.004849631804972887, + "learning_rate": 4.311348962379696e-05, + "loss": 0.0324, + "num_input_tokens_seen": 69464720, + "step": 57080 + }, + { + "epoch": 6.3576122062590485, + "grad_norm": 1.750913381576538, + "learning_rate": 4.311181488232158e-05, + "loss": 0.0959, + "num_input_tokens_seen": 69471120, + "step": 57085 + }, + { + "epoch": 6.358169061142666, + "grad_norm": 2.650381565093994, + "learning_rate": 4.311013996976561e-05, + "loss": 0.1833, + "num_input_tokens_seen": 69477296, + "step": 57090 + }, + { + "epoch": 6.358725916026284, + "grad_norm": 0.15375390648841858, + "learning_rate": 4.310846488614484e-05, + "loss": 0.0311, + "num_input_tokens_seen": 69483408, + "step": 57095 + }, + { + "epoch": 6.359282770909901, + "grad_norm": 0.5394793748855591, + "learning_rate": 4.310678963147512e-05, + "loss": 0.0567, + "num_input_tokens_seen": 69489712, + "step": 57100 + }, + { + "epoch": 6.359839625793518, + "grad_norm": 0.2485864907503128, + "learning_rate": 4.3105114205772255e-05, + "loss": 0.076, + "num_input_tokens_seen": 69495920, + "step": 57105 + }, + { + "epoch": 6.360396480677135, + "grad_norm": 0.10078341513872147, + "learning_rate": 4.310343860905209e-05, + "loss": 0.0193, + "num_input_tokens_seen": 69502032, + "step": 57110 + }, + { + "epoch": 6.360953335560753, + "grad_norm": 0.4062313735485077, + "learning_rate": 4.310176284133045e-05, + "loss": 0.0171, + "num_input_tokens_seen": 69508080, + "step": 57115 + }, + { + "epoch": 6.3615101904443705, + "grad_norm": 0.0664765015244484, + "learning_rate": 4.310008690262315e-05, + "loss": 0.0088, + "num_input_tokens_seen": 69514416, + "step": 57120 + }, + { + "epoch": 6.362067045327987, + "grad_norm": 0.3191969394683838, + "learning_rate": 4.309841079294602e-05, + "loss": 0.0204, + "num_input_tokens_seen": 69520400, + "step": 57125 + }, + { + "epoch": 6.362623900211605, + "grad_norm": 0.5658974647521973, + "learning_rate": 4.3096734512314905e-05, + "loss": 0.1018, + "num_input_tokens_seen": 69526064, + "step": 57130 + }, + { + "epoch": 6.363180755095222, + "grad_norm": 0.4852153956890106, + "learning_rate": 4.3095058060745644e-05, + "loss": 0.0405, + "num_input_tokens_seen": 69532080, + "step": 57135 + }, + { + "epoch": 6.363737609978839, + "grad_norm": 0.3044965863227844, + "learning_rate": 4.309338143825405e-05, + "loss": 0.131, + "num_input_tokens_seen": 69537584, + "step": 57140 + }, + { + "epoch": 6.364294464862457, + "grad_norm": 0.012241180054843426, + "learning_rate": 4.309170464485598e-05, + "loss": 0.015, + "num_input_tokens_seen": 69543664, + "step": 57145 + }, + { + "epoch": 6.364851319746074, + "grad_norm": 0.5130921602249146, + "learning_rate": 4.309002768056726e-05, + "loss": 0.0697, + "num_input_tokens_seen": 69549776, + "step": 57150 + }, + { + "epoch": 6.3654081746296916, + "grad_norm": 0.6413878798484802, + "learning_rate": 4.308835054540373e-05, + "loss": 0.1429, + "num_input_tokens_seen": 69556080, + "step": 57155 + }, + { + "epoch": 6.365965029513309, + "grad_norm": 0.10838412493467331, + "learning_rate": 4.308667323938125e-05, + "loss": 0.0827, + "num_input_tokens_seen": 69562000, + "step": 57160 + }, + { + "epoch": 6.366521884396926, + "grad_norm": 0.012853831984102726, + "learning_rate": 4.308499576251563e-05, + "loss": 0.0912, + "num_input_tokens_seen": 69568304, + "step": 57165 + }, + { + "epoch": 6.367078739280544, + "grad_norm": 0.7934443950653076, + "learning_rate": 4.308331811482276e-05, + "loss": 0.0624, + "num_input_tokens_seen": 69574512, + "step": 57170 + }, + { + "epoch": 6.3676355941641605, + "grad_norm": 0.4613068103790283, + "learning_rate": 4.308164029631845e-05, + "loss": 0.0377, + "num_input_tokens_seen": 69580304, + "step": 57175 + }, + { + "epoch": 6.368192449047778, + "grad_norm": 0.27610471844673157, + "learning_rate": 4.307996230701856e-05, + "loss": 0.0564, + "num_input_tokens_seen": 69586224, + "step": 57180 + }, + { + "epoch": 6.368749303931396, + "grad_norm": 0.0026331781409680843, + "learning_rate": 4.307828414693894e-05, + "loss": 0.0221, + "num_input_tokens_seen": 69592400, + "step": 57185 + }, + { + "epoch": 6.369306158815013, + "grad_norm": 1.3871347904205322, + "learning_rate": 4.307660581609545e-05, + "loss": 0.1225, + "num_input_tokens_seen": 69598672, + "step": 57190 + }, + { + "epoch": 6.36986301369863, + "grad_norm": 0.011272529140114784, + "learning_rate": 4.307492731450392e-05, + "loss": 0.0366, + "num_input_tokens_seen": 69604816, + "step": 57195 + }, + { + "epoch": 6.370419868582247, + "grad_norm": 1.109618902206421, + "learning_rate": 4.307324864218023e-05, + "loss": 0.0513, + "num_input_tokens_seen": 69610672, + "step": 57200 + }, + { + "epoch": 6.370976723465865, + "grad_norm": 0.06300625950098038, + "learning_rate": 4.307156979914023e-05, + "loss": 0.0091, + "num_input_tokens_seen": 69616848, + "step": 57205 + }, + { + "epoch": 6.371533578349482, + "grad_norm": 0.602999210357666, + "learning_rate": 4.306989078539977e-05, + "loss": 0.031, + "num_input_tokens_seen": 69623024, + "step": 57210 + }, + { + "epoch": 6.372090433233099, + "grad_norm": 2.276973009109497, + "learning_rate": 4.306821160097472e-05, + "loss": 0.0697, + "num_input_tokens_seen": 69629232, + "step": 57215 + }, + { + "epoch": 6.372647288116717, + "grad_norm": 0.9804204702377319, + "learning_rate": 4.306653224588093e-05, + "loss": 0.054, + "num_input_tokens_seen": 69635088, + "step": 57220 + }, + { + "epoch": 6.3732041430003346, + "grad_norm": 2.40275239944458, + "learning_rate": 4.306485272013426e-05, + "loss": 0.1165, + "num_input_tokens_seen": 69641360, + "step": 57225 + }, + { + "epoch": 6.373760997883951, + "grad_norm": 0.5021406412124634, + "learning_rate": 4.306317302375059e-05, + "loss": 0.0556, + "num_input_tokens_seen": 69647216, + "step": 57230 + }, + { + "epoch": 6.374317852767569, + "grad_norm": 1.2626420259475708, + "learning_rate": 4.3061493156745787e-05, + "loss": 0.1367, + "num_input_tokens_seen": 69653424, + "step": 57235 + }, + { + "epoch": 6.374874707651186, + "grad_norm": 0.13434675335884094, + "learning_rate": 4.3059813119135705e-05, + "loss": 0.0667, + "num_input_tokens_seen": 69659344, + "step": 57240 + }, + { + "epoch": 6.3754315625348035, + "grad_norm": 0.4498186409473419, + "learning_rate": 4.305813291093622e-05, + "loss": 0.071, + "num_input_tokens_seen": 69665328, + "step": 57245 + }, + { + "epoch": 6.375988417418421, + "grad_norm": 1.1033601760864258, + "learning_rate": 4.305645253216319e-05, + "loss": 0.0954, + "num_input_tokens_seen": 69671504, + "step": 57250 + }, + { + "epoch": 6.376545272302038, + "grad_norm": 0.8134946823120117, + "learning_rate": 4.3054771982832516e-05, + "loss": 0.0864, + "num_input_tokens_seen": 69677104, + "step": 57255 + }, + { + "epoch": 6.377102127185656, + "grad_norm": 1.1648287773132324, + "learning_rate": 4.305309126296004e-05, + "loss": 0.1045, + "num_input_tokens_seen": 69683152, + "step": 57260 + }, + { + "epoch": 6.377658982069272, + "grad_norm": 0.050225161015987396, + "learning_rate": 4.305141037256166e-05, + "loss": 0.0069, + "num_input_tokens_seen": 69689584, + "step": 57265 + }, + { + "epoch": 6.37821583695289, + "grad_norm": 0.014501445926725864, + "learning_rate": 4.304972931165325e-05, + "loss": 0.2319, + "num_input_tokens_seen": 69695728, + "step": 57270 + }, + { + "epoch": 6.378772691836508, + "grad_norm": 1.0891571044921875, + "learning_rate": 4.304804808025068e-05, + "loss": 0.1302, + "num_input_tokens_seen": 69701456, + "step": 57275 + }, + { + "epoch": 6.3793295467201245, + "grad_norm": 2.4207603931427, + "learning_rate": 4.3046366678369845e-05, + "loss": 0.0821, + "num_input_tokens_seen": 69707408, + "step": 57280 + }, + { + "epoch": 6.379886401603742, + "grad_norm": 0.006583503447473049, + "learning_rate": 4.3044685106026614e-05, + "loss": 0.0102, + "num_input_tokens_seen": 69713552, + "step": 57285 + }, + { + "epoch": 6.380443256487359, + "grad_norm": 0.6662850975990295, + "learning_rate": 4.3043003363236875e-05, + "loss": 0.1187, + "num_input_tokens_seen": 69719632, + "step": 57290 + }, + { + "epoch": 6.381000111370977, + "grad_norm": 0.011731546372175217, + "learning_rate": 4.3041321450016504e-05, + "loss": 0.0345, + "num_input_tokens_seen": 69725808, + "step": 57295 + }, + { + "epoch": 6.381556966254594, + "grad_norm": 0.17228533327579498, + "learning_rate": 4.3039639366381405e-05, + "loss": 0.012, + "num_input_tokens_seen": 69732016, + "step": 57300 + }, + { + "epoch": 6.382113821138211, + "grad_norm": 0.036108024418354034, + "learning_rate": 4.303795711234746e-05, + "loss": 0.0766, + "num_input_tokens_seen": 69738064, + "step": 57305 + }, + { + "epoch": 6.382670676021829, + "grad_norm": 1.487237811088562, + "learning_rate": 4.3036274687930556e-05, + "loss": 0.1494, + "num_input_tokens_seen": 69744400, + "step": 57310 + }, + { + "epoch": 6.383227530905446, + "grad_norm": 0.694490909576416, + "learning_rate": 4.3034592093146595e-05, + "loss": 0.0523, + "num_input_tokens_seen": 69750800, + "step": 57315 + }, + { + "epoch": 6.383784385789063, + "grad_norm": 0.5768300294876099, + "learning_rate": 4.303290932801145e-05, + "loss": 0.0664, + "num_input_tokens_seen": 69757168, + "step": 57320 + }, + { + "epoch": 6.384341240672681, + "grad_norm": 1.3805885314941406, + "learning_rate": 4.3031226392541034e-05, + "loss": 0.0863, + "num_input_tokens_seen": 69763088, + "step": 57325 + }, + { + "epoch": 6.384898095556298, + "grad_norm": 0.9389404058456421, + "learning_rate": 4.302954328675124e-05, + "loss": 0.0626, + "num_input_tokens_seen": 69768880, + "step": 57330 + }, + { + "epoch": 6.385454950439915, + "grad_norm": 0.27959200739860535, + "learning_rate": 4.3027860010657964e-05, + "loss": 0.0798, + "num_input_tokens_seen": 69774992, + "step": 57335 + }, + { + "epoch": 6.386011805323533, + "grad_norm": 0.7309237122535706, + "learning_rate": 4.302617656427711e-05, + "loss": 0.0365, + "num_input_tokens_seen": 69781040, + "step": 57340 + }, + { + "epoch": 6.38656866020715, + "grad_norm": 0.4202529191970825, + "learning_rate": 4.3024492947624574e-05, + "loss": 0.0426, + "num_input_tokens_seen": 69787280, + "step": 57345 + }, + { + "epoch": 6.3871255150907675, + "grad_norm": 0.0386490635573864, + "learning_rate": 4.302280916071626e-05, + "loss": 0.0498, + "num_input_tokens_seen": 69792816, + "step": 57350 + }, + { + "epoch": 6.387682369974384, + "grad_norm": 0.31052881479263306, + "learning_rate": 4.302112520356807e-05, + "loss": 0.0427, + "num_input_tokens_seen": 69798800, + "step": 57355 + }, + { + "epoch": 6.388239224858002, + "grad_norm": 0.877918004989624, + "learning_rate": 4.301944107619592e-05, + "loss": 0.0524, + "num_input_tokens_seen": 69805168, + "step": 57360 + }, + { + "epoch": 6.38879607974162, + "grad_norm": 0.11372510343790054, + "learning_rate": 4.301775677861571e-05, + "loss": 0.0543, + "num_input_tokens_seen": 69810736, + "step": 57365 + }, + { + "epoch": 6.3893529346252365, + "grad_norm": 0.24208146333694458, + "learning_rate": 4.3016072310843344e-05, + "loss": 0.0141, + "num_input_tokens_seen": 69816720, + "step": 57370 + }, + { + "epoch": 6.389909789508854, + "grad_norm": 1.1219183206558228, + "learning_rate": 4.3014387672894754e-05, + "loss": 0.1085, + "num_input_tokens_seen": 69822896, + "step": 57375 + }, + { + "epoch": 6.390466644392471, + "grad_norm": 0.057433608919382095, + "learning_rate": 4.301270286478583e-05, + "loss": 0.0248, + "num_input_tokens_seen": 69829040, + "step": 57380 + }, + { + "epoch": 6.391023499276089, + "grad_norm": 0.09935708343982697, + "learning_rate": 4.30110178865325e-05, + "loss": 0.0822, + "num_input_tokens_seen": 69835216, + "step": 57385 + }, + { + "epoch": 6.391580354159706, + "grad_norm": 0.008645814843475819, + "learning_rate": 4.300933273815067e-05, + "loss": 0.0256, + "num_input_tokens_seen": 69841584, + "step": 57390 + }, + { + "epoch": 6.392137209043323, + "grad_norm": 0.04060135781764984, + "learning_rate": 4.300764741965627e-05, + "loss": 0.0059, + "num_input_tokens_seen": 69847536, + "step": 57395 + }, + { + "epoch": 6.392694063926941, + "grad_norm": 0.20533634722232819, + "learning_rate": 4.300596193106522e-05, + "loss": 0.052, + "num_input_tokens_seen": 69853776, + "step": 57400 + }, + { + "epoch": 6.393250918810558, + "grad_norm": 1.1486077308654785, + "learning_rate": 4.300427627239342e-05, + "loss": 0.0785, + "num_input_tokens_seen": 69860080, + "step": 57405 + }, + { + "epoch": 6.393807773694175, + "grad_norm": 0.44656607508659363, + "learning_rate": 4.300259044365681e-05, + "loss": 0.0191, + "num_input_tokens_seen": 69866064, + "step": 57410 + }, + { + "epoch": 6.394364628577793, + "grad_norm": 0.012480014935135841, + "learning_rate": 4.3000904444871314e-05, + "loss": 0.0365, + "num_input_tokens_seen": 69872592, + "step": 57415 + }, + { + "epoch": 6.39492148346141, + "grad_norm": 0.0773712694644928, + "learning_rate": 4.2999218276052844e-05, + "loss": 0.0384, + "num_input_tokens_seen": 69878576, + "step": 57420 + }, + { + "epoch": 6.395478338345027, + "grad_norm": 0.4332459568977356, + "learning_rate": 4.299753193721735e-05, + "loss": 0.0895, + "num_input_tokens_seen": 69884912, + "step": 57425 + }, + { + "epoch": 6.396035193228645, + "grad_norm": 1.778598427772522, + "learning_rate": 4.2995845428380735e-05, + "loss": 0.1001, + "num_input_tokens_seen": 69890864, + "step": 57430 + }, + { + "epoch": 6.396592048112262, + "grad_norm": 0.18401488661766052, + "learning_rate": 4.299415874955895e-05, + "loss": 0.0913, + "num_input_tokens_seen": 69896816, + "step": 57435 + }, + { + "epoch": 6.3971489029958795, + "grad_norm": 0.00043778857798315585, + "learning_rate": 4.299247190076792e-05, + "loss": 0.0357, + "num_input_tokens_seen": 69903248, + "step": 57440 + }, + { + "epoch": 6.397705757879496, + "grad_norm": 2.1625401973724365, + "learning_rate": 4.2990784882023565e-05, + "loss": 0.038, + "num_input_tokens_seen": 69909488, + "step": 57445 + }, + { + "epoch": 6.398262612763114, + "grad_norm": 1.1059322357177734, + "learning_rate": 4.298909769334184e-05, + "loss": 0.0855, + "num_input_tokens_seen": 69915408, + "step": 57450 + }, + { + "epoch": 6.398819467646732, + "grad_norm": 1.1680258512496948, + "learning_rate": 4.2987410334738674e-05, + "loss": 0.1252, + "num_input_tokens_seen": 69921488, + "step": 57455 + }, + { + "epoch": 6.399376322530348, + "grad_norm": 0.6423860192298889, + "learning_rate": 4.298572280623001e-05, + "loss": 0.1015, + "num_input_tokens_seen": 69927600, + "step": 57460 + }, + { + "epoch": 6.399933177413966, + "grad_norm": 1.207489013671875, + "learning_rate": 4.298403510783179e-05, + "loss": 0.075, + "num_input_tokens_seen": 69933872, + "step": 57465 + }, + { + "epoch": 6.400490032297583, + "grad_norm": 0.433523029088974, + "learning_rate": 4.298234723955994e-05, + "loss": 0.1951, + "num_input_tokens_seen": 69939824, + "step": 57470 + }, + { + "epoch": 6.4010468871812005, + "grad_norm": 0.37971261143684387, + "learning_rate": 4.298065920143042e-05, + "loss": 0.017, + "num_input_tokens_seen": 69946000, + "step": 57475 + }, + { + "epoch": 6.401603742064818, + "grad_norm": 0.39933496713638306, + "learning_rate": 4.2978970993459154e-05, + "loss": 0.1253, + "num_input_tokens_seen": 69952016, + "step": 57480 + }, + { + "epoch": 6.402160596948435, + "grad_norm": 0.023176874965429306, + "learning_rate": 4.297728261566211e-05, + "loss": 0.083, + "num_input_tokens_seen": 69958032, + "step": 57485 + }, + { + "epoch": 6.402717451832053, + "grad_norm": 0.010646195150911808, + "learning_rate": 4.2975594068055234e-05, + "loss": 0.0211, + "num_input_tokens_seen": 69964304, + "step": 57490 + }, + { + "epoch": 6.403274306715669, + "grad_norm": 0.6120584607124329, + "learning_rate": 4.297390535065446e-05, + "loss": 0.0353, + "num_input_tokens_seen": 69970480, + "step": 57495 + }, + { + "epoch": 6.403831161599287, + "grad_norm": 0.001377199194394052, + "learning_rate": 4.297221646347576e-05, + "loss": 0.1002, + "num_input_tokens_seen": 69976592, + "step": 57500 + }, + { + "epoch": 6.404388016482905, + "grad_norm": 1.1374257802963257, + "learning_rate": 4.2970527406535074e-05, + "loss": 0.0754, + "num_input_tokens_seen": 69982864, + "step": 57505 + }, + { + "epoch": 6.404944871366522, + "grad_norm": 0.7772673964500427, + "learning_rate": 4.2968838179848356e-05, + "loss": 0.0588, + "num_input_tokens_seen": 69989200, + "step": 57510 + }, + { + "epoch": 6.405501726250139, + "grad_norm": 1.1761748790740967, + "learning_rate": 4.296714878343156e-05, + "loss": 0.103, + "num_input_tokens_seen": 69995536, + "step": 57515 + }, + { + "epoch": 6.406058581133757, + "grad_norm": 1.5554327964782715, + "learning_rate": 4.2965459217300667e-05, + "loss": 0.1955, + "num_input_tokens_seen": 69999888, + "step": 57520 + }, + { + "epoch": 6.406615436017374, + "grad_norm": 0.06714179366827011, + "learning_rate": 4.29637694814716e-05, + "loss": 0.0259, + "num_input_tokens_seen": 70006352, + "step": 57525 + }, + { + "epoch": 6.407172290900991, + "grad_norm": 0.27139201760292053, + "learning_rate": 4.296207957596034e-05, + "loss": 0.0451, + "num_input_tokens_seen": 70012816, + "step": 57530 + }, + { + "epoch": 6.407729145784608, + "grad_norm": 0.20168814063072205, + "learning_rate": 4.296038950078285e-05, + "loss": 0.113, + "num_input_tokens_seen": 70018448, + "step": 57535 + }, + { + "epoch": 6.408286000668226, + "grad_norm": 0.5072982311248779, + "learning_rate": 4.29586992559551e-05, + "loss": 0.1142, + "num_input_tokens_seen": 70024592, + "step": 57540 + }, + { + "epoch": 6.4088428555518435, + "grad_norm": 0.337383896112442, + "learning_rate": 4.295700884149304e-05, + "loss": 0.0101, + "num_input_tokens_seen": 70030352, + "step": 57545 + }, + { + "epoch": 6.40939971043546, + "grad_norm": 0.6518659591674805, + "learning_rate": 4.295531825741264e-05, + "loss": 0.0636, + "num_input_tokens_seen": 70036528, + "step": 57550 + }, + { + "epoch": 6.409956565319078, + "grad_norm": 2.0699713230133057, + "learning_rate": 4.295362750372988e-05, + "loss": 0.2037, + "num_input_tokens_seen": 70042480, + "step": 57555 + }, + { + "epoch": 6.410513420202695, + "grad_norm": 0.20972470939159393, + "learning_rate": 4.2951936580460725e-05, + "loss": 0.0069, + "num_input_tokens_seen": 70048784, + "step": 57560 + }, + { + "epoch": 6.411070275086312, + "grad_norm": 0.8017511963844299, + "learning_rate": 4.2950245487621144e-05, + "loss": 0.0471, + "num_input_tokens_seen": 70055024, + "step": 57565 + }, + { + "epoch": 6.41162712996993, + "grad_norm": 0.044577326625585556, + "learning_rate": 4.294855422522711e-05, + "loss": 0.0937, + "num_input_tokens_seen": 70061104, + "step": 57570 + }, + { + "epoch": 6.412183984853547, + "grad_norm": 0.016198445111513138, + "learning_rate": 4.29468627932946e-05, + "loss": 0.0867, + "num_input_tokens_seen": 70067792, + "step": 57575 + }, + { + "epoch": 6.412740839737165, + "grad_norm": 0.4056943356990814, + "learning_rate": 4.29451711918396e-05, + "loss": 0.1092, + "num_input_tokens_seen": 70074032, + "step": 57580 + }, + { + "epoch": 6.413297694620782, + "grad_norm": 0.024944093078374863, + "learning_rate": 4.294347942087808e-05, + "loss": 0.0741, + "num_input_tokens_seen": 70080464, + "step": 57585 + }, + { + "epoch": 6.413854549504399, + "grad_norm": 0.054764699190855026, + "learning_rate": 4.294178748042601e-05, + "loss": 0.1358, + "num_input_tokens_seen": 70086480, + "step": 57590 + }, + { + "epoch": 6.414411404388017, + "grad_norm": 0.03390638902783394, + "learning_rate": 4.2940095370499386e-05, + "loss": 0.0432, + "num_input_tokens_seen": 70092464, + "step": 57595 + }, + { + "epoch": 6.4149682592716335, + "grad_norm": 3.3152196407318115, + "learning_rate": 4.293840309111419e-05, + "loss": 0.2111, + "num_input_tokens_seen": 70097584, + "step": 57600 + }, + { + "epoch": 6.415525114155251, + "grad_norm": 1.5896884202957153, + "learning_rate": 4.293671064228641e-05, + "loss": 0.0664, + "num_input_tokens_seen": 70103824, + "step": 57605 + }, + { + "epoch": 6.416081969038869, + "grad_norm": 0.7887038588523865, + "learning_rate": 4.293501802403202e-05, + "loss": 0.0817, + "num_input_tokens_seen": 70110032, + "step": 57610 + }, + { + "epoch": 6.416638823922486, + "grad_norm": 0.005674298387020826, + "learning_rate": 4.2933325236367023e-05, + "loss": 0.0292, + "num_input_tokens_seen": 70116208, + "step": 57615 + }, + { + "epoch": 6.417195678806103, + "grad_norm": 0.7289348840713501, + "learning_rate": 4.29316322793074e-05, + "loss": 0.169, + "num_input_tokens_seen": 70121776, + "step": 57620 + }, + { + "epoch": 6.41775253368972, + "grad_norm": 0.17500153183937073, + "learning_rate": 4.2929939152869136e-05, + "loss": 0.0152, + "num_input_tokens_seen": 70127088, + "step": 57625 + }, + { + "epoch": 6.418309388573338, + "grad_norm": 0.30365392565727234, + "learning_rate": 4.292824585706824e-05, + "loss": 0.02, + "num_input_tokens_seen": 70133264, + "step": 57630 + }, + { + "epoch": 6.4188662434569554, + "grad_norm": 0.28131935000419617, + "learning_rate": 4.292655239192069e-05, + "loss": 0.0608, + "num_input_tokens_seen": 70139600, + "step": 57635 + }, + { + "epoch": 6.419423098340572, + "grad_norm": 0.22257953882217407, + "learning_rate": 4.29248587574425e-05, + "loss": 0.0818, + "num_input_tokens_seen": 70145456, + "step": 57640 + }, + { + "epoch": 6.41997995322419, + "grad_norm": 0.22846680879592896, + "learning_rate": 4.2923164953649646e-05, + "loss": 0.0848, + "num_input_tokens_seen": 70151824, + "step": 57645 + }, + { + "epoch": 6.420536808107807, + "grad_norm": 0.42040860652923584, + "learning_rate": 4.292147098055815e-05, + "loss": 0.0667, + "num_input_tokens_seen": 70158192, + "step": 57650 + }, + { + "epoch": 6.421093662991424, + "grad_norm": 1.0213037729263306, + "learning_rate": 4.2919776838184e-05, + "loss": 0.0588, + "num_input_tokens_seen": 70164272, + "step": 57655 + }, + { + "epoch": 6.421650517875042, + "grad_norm": 0.04881902039051056, + "learning_rate": 4.2918082526543194e-05, + "loss": 0.0782, + "num_input_tokens_seen": 70170352, + "step": 57660 + }, + { + "epoch": 6.422207372758659, + "grad_norm": 0.27464595437049866, + "learning_rate": 4.2916388045651744e-05, + "loss": 0.1489, + "num_input_tokens_seen": 70176528, + "step": 57665 + }, + { + "epoch": 6.4227642276422765, + "grad_norm": 0.17667919397354126, + "learning_rate": 4.2914693395525665e-05, + "loss": 0.0381, + "num_input_tokens_seen": 70182448, + "step": 57670 + }, + { + "epoch": 6.423321082525893, + "grad_norm": 0.0861133262515068, + "learning_rate": 4.2912998576180944e-05, + "loss": 0.141, + "num_input_tokens_seen": 70188816, + "step": 57675 + }, + { + "epoch": 6.423877937409511, + "grad_norm": 1.0423309803009033, + "learning_rate": 4.29113035876336e-05, + "loss": 0.1975, + "num_input_tokens_seen": 70194736, + "step": 57680 + }, + { + "epoch": 6.424434792293129, + "grad_norm": 0.09620095044374466, + "learning_rate": 4.290960842989965e-05, + "loss": 0.0013, + "num_input_tokens_seen": 70200816, + "step": 57685 + }, + { + "epoch": 6.424991647176745, + "grad_norm": 0.1076289713382721, + "learning_rate": 4.290791310299509e-05, + "loss": 0.0388, + "num_input_tokens_seen": 70207024, + "step": 57690 + }, + { + "epoch": 6.425548502060363, + "grad_norm": 0.5797268152236938, + "learning_rate": 4.290621760693594e-05, + "loss": 0.1591, + "num_input_tokens_seen": 70212720, + "step": 57695 + }, + { + "epoch": 6.426105356943981, + "grad_norm": 0.30180248618125916, + "learning_rate": 4.290452194173823e-05, + "loss": 0.0822, + "num_input_tokens_seen": 70218544, + "step": 57700 + }, + { + "epoch": 6.426662211827598, + "grad_norm": 1.0202573537826538, + "learning_rate": 4.2902826107417964e-05, + "loss": 0.2262, + "num_input_tokens_seen": 70224720, + "step": 57705 + }, + { + "epoch": 6.427219066711215, + "grad_norm": 0.5397900342941284, + "learning_rate": 4.290113010399116e-05, + "loss": 0.0377, + "num_input_tokens_seen": 70230768, + "step": 57710 + }, + { + "epoch": 6.427775921594832, + "grad_norm": 0.4408499300479889, + "learning_rate": 4.289943393147384e-05, + "loss": 0.0388, + "num_input_tokens_seen": 70236976, + "step": 57715 + }, + { + "epoch": 6.42833277647845, + "grad_norm": 0.15646204352378845, + "learning_rate": 4.289773758988203e-05, + "loss": 0.0075, + "num_input_tokens_seen": 70242928, + "step": 57720 + }, + { + "epoch": 6.428889631362067, + "grad_norm": 0.6259641647338867, + "learning_rate": 4.289604107923174e-05, + "loss": 0.0731, + "num_input_tokens_seen": 70249360, + "step": 57725 + }, + { + "epoch": 6.429446486245684, + "grad_norm": 0.15775366127490997, + "learning_rate": 4.289434439953901e-05, + "loss": 0.0998, + "num_input_tokens_seen": 70255536, + "step": 57730 + }, + { + "epoch": 6.430003341129302, + "grad_norm": 0.08438315987586975, + "learning_rate": 4.289264755081986e-05, + "loss": 0.0513, + "num_input_tokens_seen": 70261456, + "step": 57735 + }, + { + "epoch": 6.4305601960129195, + "grad_norm": 0.04732196778059006, + "learning_rate": 4.289095053309031e-05, + "loss": 0.0102, + "num_input_tokens_seen": 70267216, + "step": 57740 + }, + { + "epoch": 6.431117050896536, + "grad_norm": 0.6069580912590027, + "learning_rate": 4.288925334636641e-05, + "loss": 0.015, + "num_input_tokens_seen": 70273456, + "step": 57745 + }, + { + "epoch": 6.431673905780154, + "grad_norm": 0.398771733045578, + "learning_rate": 4.288755599066417e-05, + "loss": 0.0814, + "num_input_tokens_seen": 70279728, + "step": 57750 + }, + { + "epoch": 6.432230760663771, + "grad_norm": 0.3163885772228241, + "learning_rate": 4.288585846599964e-05, + "loss": 0.0188, + "num_input_tokens_seen": 70285808, + "step": 57755 + }, + { + "epoch": 6.432787615547388, + "grad_norm": 0.8322655558586121, + "learning_rate": 4.288416077238885e-05, + "loss": 0.0565, + "num_input_tokens_seen": 70292240, + "step": 57760 + }, + { + "epoch": 6.433344470431006, + "grad_norm": 0.15156058967113495, + "learning_rate": 4.2882462909847824e-05, + "loss": 0.0281, + "num_input_tokens_seen": 70298320, + "step": 57765 + }, + { + "epoch": 6.433901325314623, + "grad_norm": 0.4631434679031372, + "learning_rate": 4.2880764878392614e-05, + "loss": 0.03, + "num_input_tokens_seen": 70304176, + "step": 57770 + }, + { + "epoch": 6.434458180198241, + "grad_norm": 0.5706311464309692, + "learning_rate": 4.287906667803925e-05, + "loss": 0.0495, + "num_input_tokens_seen": 70310160, + "step": 57775 + }, + { + "epoch": 6.435015035081857, + "grad_norm": 0.5293024778366089, + "learning_rate": 4.287736830880378e-05, + "loss": 0.0777, + "num_input_tokens_seen": 70316400, + "step": 57780 + }, + { + "epoch": 6.435571889965475, + "grad_norm": 0.3140639662742615, + "learning_rate": 4.287566977070224e-05, + "loss": 0.0559, + "num_input_tokens_seen": 70322320, + "step": 57785 + }, + { + "epoch": 6.436128744849093, + "grad_norm": 0.028903650119900703, + "learning_rate": 4.2873971063750685e-05, + "loss": 0.0252, + "num_input_tokens_seen": 70328496, + "step": 57790 + }, + { + "epoch": 6.4366855997327095, + "grad_norm": 1.4856128692626953, + "learning_rate": 4.2872272187965145e-05, + "loss": 0.1002, + "num_input_tokens_seen": 70334736, + "step": 57795 + }, + { + "epoch": 6.437242454616327, + "grad_norm": 0.10775575041770935, + "learning_rate": 4.2870573143361684e-05, + "loss": 0.1189, + "num_input_tokens_seen": 70341008, + "step": 57800 + }, + { + "epoch": 6.437799309499944, + "grad_norm": 0.1115138828754425, + "learning_rate": 4.286887392995633e-05, + "loss": 0.0177, + "num_input_tokens_seen": 70347248, + "step": 57805 + }, + { + "epoch": 6.438356164383562, + "grad_norm": 1.0069172382354736, + "learning_rate": 4.286717454776515e-05, + "loss": 0.1007, + "num_input_tokens_seen": 70353296, + "step": 57810 + }, + { + "epoch": 6.438913019267179, + "grad_norm": 0.5280737280845642, + "learning_rate": 4.286547499680419e-05, + "loss": 0.0767, + "num_input_tokens_seen": 70359440, + "step": 57815 + }, + { + "epoch": 6.439469874150796, + "grad_norm": 0.9556171298027039, + "learning_rate": 4.286377527708951e-05, + "loss": 0.0692, + "num_input_tokens_seen": 70365648, + "step": 57820 + }, + { + "epoch": 6.440026729034414, + "grad_norm": 0.21638603508472443, + "learning_rate": 4.286207538863716e-05, + "loss": 0.1124, + "num_input_tokens_seen": 70371792, + "step": 57825 + }, + { + "epoch": 6.4405835839180305, + "grad_norm": 0.1129327192902565, + "learning_rate": 4.286037533146319e-05, + "loss": 0.0808, + "num_input_tokens_seen": 70377872, + "step": 57830 + }, + { + "epoch": 6.441140438801648, + "grad_norm": 1.0733604431152344, + "learning_rate": 4.285867510558367e-05, + "loss": 0.0693, + "num_input_tokens_seen": 70383920, + "step": 57835 + }, + { + "epoch": 6.441697293685266, + "grad_norm": 0.010578504763543606, + "learning_rate": 4.285697471101466e-05, + "loss": 0.0203, + "num_input_tokens_seen": 70390160, + "step": 57840 + }, + { + "epoch": 6.442254148568883, + "grad_norm": 0.6351863741874695, + "learning_rate": 4.2855274147772214e-05, + "loss": 0.0848, + "num_input_tokens_seen": 70395664, + "step": 57845 + }, + { + "epoch": 6.4428110034525, + "grad_norm": 0.46883681416511536, + "learning_rate": 4.285357341587239e-05, + "loss": 0.1389, + "num_input_tokens_seen": 70401424, + "step": 57850 + }, + { + "epoch": 6.443367858336117, + "grad_norm": 0.0017469626618549228, + "learning_rate": 4.285187251533127e-05, + "loss": 0.0687, + "num_input_tokens_seen": 70407504, + "step": 57855 + }, + { + "epoch": 6.443924713219735, + "grad_norm": 0.611267626285553, + "learning_rate": 4.285017144616491e-05, + "loss": 0.0356, + "num_input_tokens_seen": 70413680, + "step": 57860 + }, + { + "epoch": 6.4444815681033525, + "grad_norm": 0.13635236024856567, + "learning_rate": 4.284847020838938e-05, + "loss": 0.1441, + "num_input_tokens_seen": 70420144, + "step": 57865 + }, + { + "epoch": 6.445038422986969, + "grad_norm": 0.02105892449617386, + "learning_rate": 4.2846768802020746e-05, + "loss": 0.0561, + "num_input_tokens_seen": 70426416, + "step": 57870 + }, + { + "epoch": 6.445595277870587, + "grad_norm": 0.04499799385666847, + "learning_rate": 4.284506722707508e-05, + "loss": 0.0163, + "num_input_tokens_seen": 70432752, + "step": 57875 + }, + { + "epoch": 6.446152132754205, + "grad_norm": 0.04868081212043762, + "learning_rate": 4.284336548356847e-05, + "loss": 0.0131, + "num_input_tokens_seen": 70438608, + "step": 57880 + }, + { + "epoch": 6.446708987637821, + "grad_norm": 0.3926159143447876, + "learning_rate": 4.284166357151697e-05, + "loss": 0.0276, + "num_input_tokens_seen": 70445040, + "step": 57885 + }, + { + "epoch": 6.447265842521439, + "grad_norm": 0.10598326474428177, + "learning_rate": 4.2839961490936654e-05, + "loss": 0.0154, + "num_input_tokens_seen": 70451152, + "step": 57890 + }, + { + "epoch": 6.447822697405056, + "grad_norm": 0.2749343514442444, + "learning_rate": 4.2838259241843614e-05, + "loss": 0.0258, + "num_input_tokens_seen": 70457104, + "step": 57895 + }, + { + "epoch": 6.4483795522886735, + "grad_norm": 0.18191896378993988, + "learning_rate": 4.283655682425393e-05, + "loss": 0.1401, + "num_input_tokens_seen": 70462256, + "step": 57900 + }, + { + "epoch": 6.448936407172291, + "grad_norm": 0.7327922582626343, + "learning_rate": 4.283485423818367e-05, + "loss": 0.0342, + "num_input_tokens_seen": 70468656, + "step": 57905 + }, + { + "epoch": 6.449493262055908, + "grad_norm": 0.0003355654189363122, + "learning_rate": 4.283315148364892e-05, + "loss": 0.0555, + "num_input_tokens_seen": 70474736, + "step": 57910 + }, + { + "epoch": 6.450050116939526, + "grad_norm": 0.678925096988678, + "learning_rate": 4.2831448560665774e-05, + "loss": 0.0273, + "num_input_tokens_seen": 70480208, + "step": 57915 + }, + { + "epoch": 6.450606971823143, + "grad_norm": 0.1426583230495453, + "learning_rate": 4.282974546925031e-05, + "loss": 0.0869, + "num_input_tokens_seen": 70486384, + "step": 57920 + }, + { + "epoch": 6.45116382670676, + "grad_norm": 0.30470651388168335, + "learning_rate": 4.282804220941861e-05, + "loss": 0.0192, + "num_input_tokens_seen": 70492784, + "step": 57925 + }, + { + "epoch": 6.451720681590378, + "grad_norm": 1.469484806060791, + "learning_rate": 4.282633878118677e-05, + "loss": 0.0813, + "num_input_tokens_seen": 70498672, + "step": 57930 + }, + { + "epoch": 6.452277536473995, + "grad_norm": 0.13594600558280945, + "learning_rate": 4.282463518457087e-05, + "loss": 0.0302, + "num_input_tokens_seen": 70504912, + "step": 57935 + }, + { + "epoch": 6.452834391357612, + "grad_norm": 0.09491519629955292, + "learning_rate": 4.282293141958702e-05, + "loss": 0.0314, + "num_input_tokens_seen": 70510960, + "step": 57940 + }, + { + "epoch": 6.45339124624123, + "grad_norm": 1.520195484161377, + "learning_rate": 4.28212274862513e-05, + "loss": 0.158, + "num_input_tokens_seen": 70516560, + "step": 57945 + }, + { + "epoch": 6.453948101124847, + "grad_norm": 0.0033749265130609274, + "learning_rate": 4.281952338457981e-05, + "loss": 0.0843, + "num_input_tokens_seen": 70522576, + "step": 57950 + }, + { + "epoch": 6.454504956008464, + "grad_norm": 0.058413513004779816, + "learning_rate": 4.2817819114588644e-05, + "loss": 0.115, + "num_input_tokens_seen": 70528336, + "step": 57955 + }, + { + "epoch": 6.455061810892081, + "grad_norm": 0.022141333669424057, + "learning_rate": 4.28161146762939e-05, + "loss": 0.004, + "num_input_tokens_seen": 70534800, + "step": 57960 + }, + { + "epoch": 6.455618665775699, + "grad_norm": 2.4376492500305176, + "learning_rate": 4.281441006971168e-05, + "loss": 0.1207, + "num_input_tokens_seen": 70540880, + "step": 57965 + }, + { + "epoch": 6.4561755206593165, + "grad_norm": 0.015988605096936226, + "learning_rate": 4.281270529485808e-05, + "loss": 0.0298, + "num_input_tokens_seen": 70546864, + "step": 57970 + }, + { + "epoch": 6.456732375542933, + "grad_norm": 1.3270621299743652, + "learning_rate": 4.281100035174922e-05, + "loss": 0.1391, + "num_input_tokens_seen": 70552784, + "step": 57975 + }, + { + "epoch": 6.457289230426551, + "grad_norm": 0.22020211815834045, + "learning_rate": 4.280929524040118e-05, + "loss": 0.0225, + "num_input_tokens_seen": 70558640, + "step": 57980 + }, + { + "epoch": 6.457846085310168, + "grad_norm": 0.06564754247665405, + "learning_rate": 4.280758996083008e-05, + "loss": 0.0716, + "num_input_tokens_seen": 70564720, + "step": 57985 + }, + { + "epoch": 6.4584029401937855, + "grad_norm": 2.1018455028533936, + "learning_rate": 4.280588451305203e-05, + "loss": 0.2085, + "num_input_tokens_seen": 70569968, + "step": 57990 + }, + { + "epoch": 6.458959795077403, + "grad_norm": 0.3905212879180908, + "learning_rate": 4.280417889708313e-05, + "loss": 0.0467, + "num_input_tokens_seen": 70576080, + "step": 57995 + }, + { + "epoch": 6.45951664996102, + "grad_norm": 0.048469893634319305, + "learning_rate": 4.280247311293949e-05, + "loss": 0.0978, + "num_input_tokens_seen": 70582064, + "step": 58000 + }, + { + "epoch": 6.460073504844638, + "grad_norm": 0.6032636761665344, + "learning_rate": 4.280076716063724e-05, + "loss": 0.0329, + "num_input_tokens_seen": 70588304, + "step": 58005 + }, + { + "epoch": 6.460630359728254, + "grad_norm": 0.03375300019979477, + "learning_rate": 4.279906104019248e-05, + "loss": 0.0077, + "num_input_tokens_seen": 70594608, + "step": 58010 + }, + { + "epoch": 6.461187214611872, + "grad_norm": 0.35114380717277527, + "learning_rate": 4.279735475162132e-05, + "loss": 0.0145, + "num_input_tokens_seen": 70600560, + "step": 58015 + }, + { + "epoch": 6.46174406949549, + "grad_norm": 1.4561141729354858, + "learning_rate": 4.279564829493989e-05, + "loss": 0.1172, + "num_input_tokens_seen": 70606256, + "step": 58020 + }, + { + "epoch": 6.4623009243791065, + "grad_norm": 0.5715648531913757, + "learning_rate": 4.279394167016431e-05, + "loss": 0.0628, + "num_input_tokens_seen": 70612656, + "step": 58025 + }, + { + "epoch": 6.462857779262724, + "grad_norm": 0.1619231402873993, + "learning_rate": 4.2792234877310695e-05, + "loss": 0.0411, + "num_input_tokens_seen": 70618800, + "step": 58030 + }, + { + "epoch": 6.463414634146342, + "grad_norm": 0.6367105841636658, + "learning_rate": 4.279052791639516e-05, + "loss": 0.0871, + "num_input_tokens_seen": 70623984, + "step": 58035 + }, + { + "epoch": 6.463971489029959, + "grad_norm": 0.03493976220488548, + "learning_rate": 4.278882078743384e-05, + "loss": 0.0431, + "num_input_tokens_seen": 70630256, + "step": 58040 + }, + { + "epoch": 6.464528343913576, + "grad_norm": 1.5520106554031372, + "learning_rate": 4.278711349044285e-05, + "loss": 0.0697, + "num_input_tokens_seen": 70636592, + "step": 58045 + }, + { + "epoch": 6.465085198797193, + "grad_norm": 0.17905578017234802, + "learning_rate": 4.2785406025438326e-05, + "loss": 0.0756, + "num_input_tokens_seen": 70642832, + "step": 58050 + }, + { + "epoch": 6.465642053680811, + "grad_norm": 0.028169255703687668, + "learning_rate": 4.278369839243639e-05, + "loss": 0.0452, + "num_input_tokens_seen": 70649040, + "step": 58055 + }, + { + "epoch": 6.4661989085644285, + "grad_norm": 0.9855269193649292, + "learning_rate": 4.2781990591453174e-05, + "loss": 0.0933, + "num_input_tokens_seen": 70654800, + "step": 58060 + }, + { + "epoch": 6.466755763448045, + "grad_norm": 2.4551103115081787, + "learning_rate": 4.278028262250481e-05, + "loss": 0.1003, + "num_input_tokens_seen": 70660784, + "step": 58065 + }, + { + "epoch": 6.467312618331663, + "grad_norm": 1.7818713188171387, + "learning_rate": 4.277857448560744e-05, + "loss": 0.0846, + "num_input_tokens_seen": 70666768, + "step": 58070 + }, + { + "epoch": 6.46786947321528, + "grad_norm": 0.5643267631530762, + "learning_rate": 4.277686618077718e-05, + "loss": 0.0408, + "num_input_tokens_seen": 70672528, + "step": 58075 + }, + { + "epoch": 6.468426328098897, + "grad_norm": 0.9237773418426514, + "learning_rate": 4.277515770803018e-05, + "loss": 0.0329, + "num_input_tokens_seen": 70678544, + "step": 58080 + }, + { + "epoch": 6.468983182982515, + "grad_norm": 0.0717734694480896, + "learning_rate": 4.2773449067382576e-05, + "loss": 0.0819, + "num_input_tokens_seen": 70684688, + "step": 58085 + }, + { + "epoch": 6.469540037866132, + "grad_norm": 0.39133137464523315, + "learning_rate": 4.27717402588505e-05, + "loss": 0.1023, + "num_input_tokens_seen": 70691024, + "step": 58090 + }, + { + "epoch": 6.4700968927497495, + "grad_norm": 0.5594692230224609, + "learning_rate": 4.2770031282450106e-05, + "loss": 0.0213, + "num_input_tokens_seen": 70696624, + "step": 58095 + }, + { + "epoch": 6.470653747633367, + "grad_norm": 0.009608262218534946, + "learning_rate": 4.276832213819753e-05, + "loss": 0.0357, + "num_input_tokens_seen": 70702896, + "step": 58100 + }, + { + "epoch": 6.471210602516984, + "grad_norm": 0.44949033856391907, + "learning_rate": 4.276661282610891e-05, + "loss": 0.0681, + "num_input_tokens_seen": 70708912, + "step": 58105 + }, + { + "epoch": 6.471767457400602, + "grad_norm": 0.1703215092420578, + "learning_rate": 4.27649033462004e-05, + "loss": 0.0184, + "num_input_tokens_seen": 70715248, + "step": 58110 + }, + { + "epoch": 6.4723243122842185, + "grad_norm": 2.665531873703003, + "learning_rate": 4.276319369848815e-05, + "loss": 0.0684, + "num_input_tokens_seen": 70721328, + "step": 58115 + }, + { + "epoch": 6.472881167167836, + "grad_norm": 0.0006191724096424878, + "learning_rate": 4.276148388298829e-05, + "loss": 0.0006, + "num_input_tokens_seen": 70727728, + "step": 58120 + }, + { + "epoch": 6.473438022051454, + "grad_norm": 0.701738178730011, + "learning_rate": 4.275977389971699e-05, + "loss": 0.0613, + "num_input_tokens_seen": 70733488, + "step": 58125 + }, + { + "epoch": 6.473994876935071, + "grad_norm": 0.2671486735343933, + "learning_rate": 4.27580637486904e-05, + "loss": 0.0733, + "num_input_tokens_seen": 70739568, + "step": 58130 + }, + { + "epoch": 6.474551731818688, + "grad_norm": 0.020002156496047974, + "learning_rate": 4.275635342992467e-05, + "loss": 0.018, + "num_input_tokens_seen": 70745552, + "step": 58135 + }, + { + "epoch": 6.475108586702305, + "grad_norm": 0.04211195930838585, + "learning_rate": 4.2754642943435956e-05, + "loss": 0.0732, + "num_input_tokens_seen": 70751504, + "step": 58140 + }, + { + "epoch": 6.475665441585923, + "grad_norm": 0.04534634202718735, + "learning_rate": 4.2752932289240416e-05, + "loss": 0.0729, + "num_input_tokens_seen": 70757968, + "step": 58145 + }, + { + "epoch": 6.47622229646954, + "grad_norm": 0.001001309952698648, + "learning_rate": 4.2751221467354205e-05, + "loss": 0.1229, + "num_input_tokens_seen": 70764272, + "step": 58150 + }, + { + "epoch": 6.476779151353157, + "grad_norm": 0.43811967968940735, + "learning_rate": 4.2749510477793486e-05, + "loss": 0.0105, + "num_input_tokens_seen": 70770736, + "step": 58155 + }, + { + "epoch": 6.477336006236775, + "grad_norm": 0.3273507356643677, + "learning_rate": 4.274779932057442e-05, + "loss": 0.0606, + "num_input_tokens_seen": 70776816, + "step": 58160 + }, + { + "epoch": 6.477892861120392, + "grad_norm": 0.3652353882789612, + "learning_rate": 4.274608799571317e-05, + "loss": 0.0533, + "num_input_tokens_seen": 70782864, + "step": 58165 + }, + { + "epoch": 6.478449716004009, + "grad_norm": 0.049705520272254944, + "learning_rate": 4.27443765032259e-05, + "loss": 0.1369, + "num_input_tokens_seen": 70788400, + "step": 58170 + }, + { + "epoch": 6.479006570887627, + "grad_norm": 0.39436104893684387, + "learning_rate": 4.274266484312878e-05, + "loss": 0.0297, + "num_input_tokens_seen": 70794416, + "step": 58175 + }, + { + "epoch": 6.479563425771244, + "grad_norm": 0.04105762764811516, + "learning_rate": 4.274095301543797e-05, + "loss": 0.0602, + "num_input_tokens_seen": 70800432, + "step": 58180 + }, + { + "epoch": 6.4801202806548615, + "grad_norm": 0.11709525436162949, + "learning_rate": 4.273924102016965e-05, + "loss": 0.1344, + "num_input_tokens_seen": 70806576, + "step": 58185 + }, + { + "epoch": 6.480677135538478, + "grad_norm": 0.029664326459169388, + "learning_rate": 4.273752885733998e-05, + "loss": 0.0907, + "num_input_tokens_seen": 70812976, + "step": 58190 + }, + { + "epoch": 6.481233990422096, + "grad_norm": 1.0624449253082275, + "learning_rate": 4.2735816526965145e-05, + "loss": 0.0797, + "num_input_tokens_seen": 70819344, + "step": 58195 + }, + { + "epoch": 6.481790845305714, + "grad_norm": 0.2672244906425476, + "learning_rate": 4.2734104029061306e-05, + "loss": 0.111, + "num_input_tokens_seen": 70825680, + "step": 58200 + }, + { + "epoch": 6.48234770018933, + "grad_norm": 0.15844450891017914, + "learning_rate": 4.2732391363644654e-05, + "loss": 0.0998, + "num_input_tokens_seen": 70831504, + "step": 58205 + }, + { + "epoch": 6.482904555072948, + "grad_norm": 0.5535068511962891, + "learning_rate": 4.2730678530731363e-05, + "loss": 0.0974, + "num_input_tokens_seen": 70837680, + "step": 58210 + }, + { + "epoch": 6.483461409956566, + "grad_norm": 0.05810890346765518, + "learning_rate": 4.2728965530337596e-05, + "loss": 0.0539, + "num_input_tokens_seen": 70843888, + "step": 58215 + }, + { + "epoch": 6.4840182648401825, + "grad_norm": 0.13065215945243835, + "learning_rate": 4.2727252362479546e-05, + "loss": 0.0603, + "num_input_tokens_seen": 70849808, + "step": 58220 + }, + { + "epoch": 6.4845751197238, + "grad_norm": 0.14616139233112335, + "learning_rate": 4.27255390271734e-05, + "loss": 0.0933, + "num_input_tokens_seen": 70855888, + "step": 58225 + }, + { + "epoch": 6.485131974607417, + "grad_norm": 1.0323033332824707, + "learning_rate": 4.2723825524435334e-05, + "loss": 0.0397, + "num_input_tokens_seen": 70861936, + "step": 58230 + }, + { + "epoch": 6.485688829491035, + "grad_norm": 0.492051362991333, + "learning_rate": 4.2722111854281536e-05, + "loss": 0.165, + "num_input_tokens_seen": 70867856, + "step": 58235 + }, + { + "epoch": 6.486245684374652, + "grad_norm": 0.2058652639389038, + "learning_rate": 4.2720398016728196e-05, + "loss": 0.0151, + "num_input_tokens_seen": 70873968, + "step": 58240 + }, + { + "epoch": 6.486802539258269, + "grad_norm": 0.7240676879882812, + "learning_rate": 4.2718684011791496e-05, + "loss": 0.0742, + "num_input_tokens_seen": 70879824, + "step": 58245 + }, + { + "epoch": 6.487359394141887, + "grad_norm": 1.2339468002319336, + "learning_rate": 4.271696983948763e-05, + "loss": 0.0802, + "num_input_tokens_seen": 70885968, + "step": 58250 + }, + { + "epoch": 6.487916249025504, + "grad_norm": 0.619303822517395, + "learning_rate": 4.271525549983279e-05, + "loss": 0.1348, + "num_input_tokens_seen": 70892080, + "step": 58255 + }, + { + "epoch": 6.488473103909121, + "grad_norm": 0.002711081877350807, + "learning_rate": 4.271354099284317e-05, + "loss": 0.0158, + "num_input_tokens_seen": 70897808, + "step": 58260 + }, + { + "epoch": 6.489029958792739, + "grad_norm": 1.4900462627410889, + "learning_rate": 4.2711826318534964e-05, + "loss": 0.0625, + "num_input_tokens_seen": 70903792, + "step": 58265 + }, + { + "epoch": 6.489586813676356, + "grad_norm": 0.3620518743991852, + "learning_rate": 4.271011147692436e-05, + "loss": 0.0377, + "num_input_tokens_seen": 70909712, + "step": 58270 + }, + { + "epoch": 6.490143668559973, + "grad_norm": 1.4548311233520508, + "learning_rate": 4.2708396468027576e-05, + "loss": 0.1931, + "num_input_tokens_seen": 70915664, + "step": 58275 + }, + { + "epoch": 6.490700523443591, + "grad_norm": 0.1489633470773697, + "learning_rate": 4.270668129186079e-05, + "loss": 0.0034, + "num_input_tokens_seen": 70922064, + "step": 58280 + }, + { + "epoch": 6.491257378327208, + "grad_norm": 0.11937177926301956, + "learning_rate": 4.270496594844021e-05, + "loss": 0.1201, + "num_input_tokens_seen": 70928400, + "step": 58285 + }, + { + "epoch": 6.4918142332108255, + "grad_norm": 0.05451488494873047, + "learning_rate": 4.270325043778205e-05, + "loss": 0.0014, + "num_input_tokens_seen": 70934704, + "step": 58290 + }, + { + "epoch": 6.492371088094442, + "grad_norm": 0.9087763428688049, + "learning_rate": 4.270153475990251e-05, + "loss": 0.0512, + "num_input_tokens_seen": 70940848, + "step": 58295 + }, + { + "epoch": 6.49292794297806, + "grad_norm": 0.23537084460258484, + "learning_rate": 4.269981891481778e-05, + "loss": 0.041, + "num_input_tokens_seen": 70946800, + "step": 58300 + }, + { + "epoch": 6.493484797861678, + "grad_norm": 0.6485013365745544, + "learning_rate": 4.269810290254409e-05, + "loss": 0.025, + "num_input_tokens_seen": 70953008, + "step": 58305 + }, + { + "epoch": 6.494041652745294, + "grad_norm": 0.00040450325468555093, + "learning_rate": 4.2696386723097636e-05, + "loss": 0.0641, + "num_input_tokens_seen": 70959312, + "step": 58310 + }, + { + "epoch": 6.494598507628912, + "grad_norm": 0.9927257895469666, + "learning_rate": 4.2694670376494626e-05, + "loss": 0.0701, + "num_input_tokens_seen": 70965328, + "step": 58315 + }, + { + "epoch": 6.495155362512529, + "grad_norm": 0.9341456890106201, + "learning_rate": 4.269295386275128e-05, + "loss": 0.0961, + "num_input_tokens_seen": 70971696, + "step": 58320 + }, + { + "epoch": 6.495712217396147, + "grad_norm": 0.18417029082775116, + "learning_rate": 4.269123718188381e-05, + "loss": 0.0523, + "num_input_tokens_seen": 70977840, + "step": 58325 + }, + { + "epoch": 6.496269072279764, + "grad_norm": 0.01668153703212738, + "learning_rate": 4.268952033390843e-05, + "loss": 0.0497, + "num_input_tokens_seen": 70984368, + "step": 58330 + }, + { + "epoch": 6.496825927163381, + "grad_norm": 0.09618160873651505, + "learning_rate": 4.268780331884136e-05, + "loss": 0.0627, + "num_input_tokens_seen": 70990672, + "step": 58335 + }, + { + "epoch": 6.497382782046999, + "grad_norm": 0.001562005141749978, + "learning_rate": 4.268608613669882e-05, + "loss": 0.0115, + "num_input_tokens_seen": 70996784, + "step": 58340 + }, + { + "epoch": 6.4979396369306155, + "grad_norm": 0.003862646408379078, + "learning_rate": 4.268436878749702e-05, + "loss": 0.0331, + "num_input_tokens_seen": 71003056, + "step": 58345 + }, + { + "epoch": 6.498496491814233, + "grad_norm": 1.8918346166610718, + "learning_rate": 4.268265127125218e-05, + "loss": 0.0921, + "num_input_tokens_seen": 71009424, + "step": 58350 + }, + { + "epoch": 6.499053346697851, + "grad_norm": 0.05417568236589432, + "learning_rate": 4.268093358798055e-05, + "loss": 0.1161, + "num_input_tokens_seen": 71015440, + "step": 58355 + }, + { + "epoch": 6.499610201581468, + "grad_norm": 3.3270866870880127, + "learning_rate": 4.267921573769833e-05, + "loss": 0.1803, + "num_input_tokens_seen": 71021616, + "step": 58360 + }, + { + "epoch": 6.500167056465085, + "grad_norm": 0.0035439489874988794, + "learning_rate": 4.2677497720421746e-05, + "loss": 0.1043, + "num_input_tokens_seen": 71027408, + "step": 58365 + }, + { + "epoch": 6.500723911348702, + "grad_norm": 0.19167402386665344, + "learning_rate": 4.267577953616704e-05, + "loss": 0.059, + "num_input_tokens_seen": 71033488, + "step": 58370 + }, + { + "epoch": 6.50128076623232, + "grad_norm": 0.8697838187217712, + "learning_rate": 4.267406118495043e-05, + "loss": 0.0894, + "num_input_tokens_seen": 71039536, + "step": 58375 + }, + { + "epoch": 6.501837621115937, + "grad_norm": 0.7342137694358826, + "learning_rate": 4.2672342666788154e-05, + "loss": 0.04, + "num_input_tokens_seen": 71045552, + "step": 58380 + }, + { + "epoch": 6.502394475999554, + "grad_norm": 0.31102338433265686, + "learning_rate": 4.267062398169645e-05, + "loss": 0.126, + "num_input_tokens_seen": 71051536, + "step": 58385 + }, + { + "epoch": 6.502951330883172, + "grad_norm": 0.005757980979979038, + "learning_rate": 4.266890512969154e-05, + "loss": 0.0107, + "num_input_tokens_seen": 71057712, + "step": 58390 + }, + { + "epoch": 6.503508185766789, + "grad_norm": 0.030442029237747192, + "learning_rate": 4.266718611078966e-05, + "loss": 0.1244, + "num_input_tokens_seen": 71063984, + "step": 58395 + }, + { + "epoch": 6.504065040650406, + "grad_norm": 1.5549516677856445, + "learning_rate": 4.266546692500706e-05, + "loss": 0.0544, + "num_input_tokens_seen": 71070192, + "step": 58400 + }, + { + "epoch": 6.504621895534024, + "grad_norm": 0.18258196115493774, + "learning_rate": 4.266374757235997e-05, + "loss": 0.0068, + "num_input_tokens_seen": 71076240, + "step": 58405 + }, + { + "epoch": 6.505178750417641, + "grad_norm": 0.04435364902019501, + "learning_rate": 4.266202805286462e-05, + "loss": 0.036, + "num_input_tokens_seen": 71082448, + "step": 58410 + }, + { + "epoch": 6.5057356053012585, + "grad_norm": 1.8944180011749268, + "learning_rate": 4.266030836653728e-05, + "loss": 0.1225, + "num_input_tokens_seen": 71088368, + "step": 58415 + }, + { + "epoch": 6.506292460184876, + "grad_norm": 0.628020703792572, + "learning_rate": 4.265858851339417e-05, + "loss": 0.0187, + "num_input_tokens_seen": 71094896, + "step": 58420 + }, + { + "epoch": 6.506849315068493, + "grad_norm": 0.33205506205558777, + "learning_rate": 4.265686849345155e-05, + "loss": 0.0368, + "num_input_tokens_seen": 71100528, + "step": 58425 + }, + { + "epoch": 6.507406169952111, + "grad_norm": 0.7313682436943054, + "learning_rate": 4.2655148306725655e-05, + "loss": 0.0835, + "num_input_tokens_seen": 71106576, + "step": 58430 + }, + { + "epoch": 6.507963024835728, + "grad_norm": 0.02940955013036728, + "learning_rate": 4.265342795323274e-05, + "loss": 0.0485, + "num_input_tokens_seen": 71112432, + "step": 58435 + }, + { + "epoch": 6.508519879719345, + "grad_norm": 0.32415062189102173, + "learning_rate": 4.2651707432989056e-05, + "loss": 0.2158, + "num_input_tokens_seen": 71118832, + "step": 58440 + }, + { + "epoch": 6.509076734602963, + "grad_norm": 0.40031731128692627, + "learning_rate": 4.264998674601085e-05, + "loss": 0.083, + "num_input_tokens_seen": 71124944, + "step": 58445 + }, + { + "epoch": 6.5096335894865796, + "grad_norm": 0.048316530883312225, + "learning_rate": 4.264826589231439e-05, + "loss": 0.1198, + "num_input_tokens_seen": 71131024, + "step": 58450 + }, + { + "epoch": 6.510190444370197, + "grad_norm": 0.14357590675354004, + "learning_rate": 4.264654487191591e-05, + "loss": 0.0559, + "num_input_tokens_seen": 71137168, + "step": 58455 + }, + { + "epoch": 6.510747299253815, + "grad_norm": 0.0005536790122278035, + "learning_rate": 4.264482368483167e-05, + "loss": 0.0274, + "num_input_tokens_seen": 71143216, + "step": 58460 + }, + { + "epoch": 6.511304154137432, + "grad_norm": 0.4422689974308014, + "learning_rate": 4.2643102331077936e-05, + "loss": 0.075, + "num_input_tokens_seen": 71149328, + "step": 58465 + }, + { + "epoch": 6.511861009021049, + "grad_norm": 0.07278933376073837, + "learning_rate": 4.2641380810670975e-05, + "loss": 0.0045, + "num_input_tokens_seen": 71154864, + "step": 58470 + }, + { + "epoch": 6.512417863904666, + "grad_norm": 0.0006853895611129701, + "learning_rate": 4.2639659123627026e-05, + "loss": 0.0471, + "num_input_tokens_seen": 71161136, + "step": 58475 + }, + { + "epoch": 6.512974718788284, + "grad_norm": 0.04699871689081192, + "learning_rate": 4.263793726996237e-05, + "loss": 0.0769, + "num_input_tokens_seen": 71167280, + "step": 58480 + }, + { + "epoch": 6.5135315736719015, + "grad_norm": 0.4461974799633026, + "learning_rate": 4.263621524969326e-05, + "loss": 0.0245, + "num_input_tokens_seen": 71173424, + "step": 58485 + }, + { + "epoch": 6.514088428555518, + "grad_norm": 0.09216976165771484, + "learning_rate": 4.2634493062835974e-05, + "loss": 0.0376, + "num_input_tokens_seen": 71179376, + "step": 58490 + }, + { + "epoch": 6.514645283439136, + "grad_norm": 0.7868220806121826, + "learning_rate": 4.2632770709406764e-05, + "loss": 0.117, + "num_input_tokens_seen": 71185232, + "step": 58495 + }, + { + "epoch": 6.515202138322753, + "grad_norm": 0.19943572580814362, + "learning_rate": 4.263104818942192e-05, + "loss": 0.0976, + "num_input_tokens_seen": 71190384, + "step": 58500 + }, + { + "epoch": 6.51575899320637, + "grad_norm": 0.009513040073215961, + "learning_rate": 4.262932550289769e-05, + "loss": 0.015, + "num_input_tokens_seen": 71196688, + "step": 58505 + }, + { + "epoch": 6.516315848089988, + "grad_norm": 0.04775653034448624, + "learning_rate": 4.262760264985036e-05, + "loss": 0.0209, + "num_input_tokens_seen": 71202576, + "step": 58510 + }, + { + "epoch": 6.516872702973605, + "grad_norm": 0.8512222170829773, + "learning_rate": 4.26258796302962e-05, + "loss": 0.0593, + "num_input_tokens_seen": 71208656, + "step": 58515 + }, + { + "epoch": 6.5174295578572226, + "grad_norm": 0.600612461566925, + "learning_rate": 4.2624156444251485e-05, + "loss": 0.0412, + "num_input_tokens_seen": 71214320, + "step": 58520 + }, + { + "epoch": 6.517986412740839, + "grad_norm": 2.5533204078674316, + "learning_rate": 4.262243309173249e-05, + "loss": 0.0606, + "num_input_tokens_seen": 71220432, + "step": 58525 + }, + { + "epoch": 6.518543267624457, + "grad_norm": 0.22246873378753662, + "learning_rate": 4.26207095727555e-05, + "loss": 0.0162, + "num_input_tokens_seen": 71226896, + "step": 58530 + }, + { + "epoch": 6.519100122508075, + "grad_norm": 0.009407204575836658, + "learning_rate": 4.2618985887336786e-05, + "loss": 0.0897, + "num_input_tokens_seen": 71233104, + "step": 58535 + }, + { + "epoch": 6.5196569773916915, + "grad_norm": 0.15357501804828644, + "learning_rate": 4.261726203549263e-05, + "loss": 0.0259, + "num_input_tokens_seen": 71239728, + "step": 58540 + }, + { + "epoch": 6.520213832275309, + "grad_norm": 0.5186959505081177, + "learning_rate": 4.261553801723933e-05, + "loss": 0.0371, + "num_input_tokens_seen": 71245936, + "step": 58545 + }, + { + "epoch": 6.520770687158926, + "grad_norm": 0.10395600646734238, + "learning_rate": 4.261381383259316e-05, + "loss": 0.0413, + "num_input_tokens_seen": 71252080, + "step": 58550 + }, + { + "epoch": 6.521327542042544, + "grad_norm": 2.9030396938323975, + "learning_rate": 4.2612089481570406e-05, + "loss": 0.1968, + "num_input_tokens_seen": 71258096, + "step": 58555 + }, + { + "epoch": 6.521884396926161, + "grad_norm": 0.7363096475601196, + "learning_rate": 4.2610364964187344e-05, + "loss": 0.1003, + "num_input_tokens_seen": 71264336, + "step": 58560 + }, + { + "epoch": 6.522441251809778, + "grad_norm": 0.0015071411617100239, + "learning_rate": 4.2608640280460286e-05, + "loss": 0.0071, + "num_input_tokens_seen": 71270480, + "step": 58565 + }, + { + "epoch": 6.522998106693396, + "grad_norm": 0.17121194303035736, + "learning_rate": 4.2606915430405516e-05, + "loss": 0.0124, + "num_input_tokens_seen": 71276976, + "step": 58570 + }, + { + "epoch": 6.5235549615770125, + "grad_norm": 0.815303385257721, + "learning_rate": 4.2605190414039316e-05, + "loss": 0.089, + "num_input_tokens_seen": 71283120, + "step": 58575 + }, + { + "epoch": 6.52411181646063, + "grad_norm": 0.10041196644306183, + "learning_rate": 4.2603465231377984e-05, + "loss": 0.0385, + "num_input_tokens_seen": 71288880, + "step": 58580 + }, + { + "epoch": 6.524668671344248, + "grad_norm": 2.1341967582702637, + "learning_rate": 4.260173988243783e-05, + "loss": 0.2989, + "num_input_tokens_seen": 71294960, + "step": 58585 + }, + { + "epoch": 6.525225526227865, + "grad_norm": 1.053422451019287, + "learning_rate": 4.2600014367235136e-05, + "loss": 0.1427, + "num_input_tokens_seen": 71300976, + "step": 58590 + }, + { + "epoch": 6.525782381111482, + "grad_norm": 0.22071915864944458, + "learning_rate": 4.25982886857862e-05, + "loss": 0.1105, + "num_input_tokens_seen": 71307504, + "step": 58595 + }, + { + "epoch": 6.5263392359951, + "grad_norm": 0.35560646653175354, + "learning_rate": 4.2596562838107335e-05, + "loss": 0.0675, + "num_input_tokens_seen": 71313136, + "step": 58600 + }, + { + "epoch": 6.526896090878717, + "grad_norm": 0.008389093913137913, + "learning_rate": 4.2594836824214834e-05, + "loss": 0.0701, + "num_input_tokens_seen": 71319472, + "step": 58605 + }, + { + "epoch": 6.5274529457623345, + "grad_norm": 0.04103996977210045, + "learning_rate": 4.2593110644125e-05, + "loss": 0.0846, + "num_input_tokens_seen": 71325392, + "step": 58610 + }, + { + "epoch": 6.528009800645952, + "grad_norm": 0.9076735377311707, + "learning_rate": 4.2591384297854145e-05, + "loss": 0.2187, + "num_input_tokens_seen": 71331216, + "step": 58615 + }, + { + "epoch": 6.528566655529569, + "grad_norm": 0.005964519921690226, + "learning_rate": 4.258965778541857e-05, + "loss": 0.166, + "num_input_tokens_seen": 71337168, + "step": 58620 + }, + { + "epoch": 6.529123510413187, + "grad_norm": 0.29198840260505676, + "learning_rate": 4.258793110683458e-05, + "loss": 0.0369, + "num_input_tokens_seen": 71343600, + "step": 58625 + }, + { + "epoch": 6.529680365296803, + "grad_norm": 0.6275624632835388, + "learning_rate": 4.258620426211849e-05, + "loss": 0.0161, + "num_input_tokens_seen": 71349968, + "step": 58630 + }, + { + "epoch": 6.530237220180421, + "grad_norm": 0.030247755348682404, + "learning_rate": 4.258447725128662e-05, + "loss": 0.0307, + "num_input_tokens_seen": 71356016, + "step": 58635 + }, + { + "epoch": 6.530794075064039, + "grad_norm": 0.9896493554115295, + "learning_rate": 4.2582750074355274e-05, + "loss": 0.1232, + "num_input_tokens_seen": 71362096, + "step": 58640 + }, + { + "epoch": 6.5313509299476555, + "grad_norm": 0.9163772463798523, + "learning_rate": 4.2581022731340754e-05, + "loss": 0.078, + "num_input_tokens_seen": 71368240, + "step": 58645 + }, + { + "epoch": 6.531907784831273, + "grad_norm": 0.09683112800121307, + "learning_rate": 4.25792952222594e-05, + "loss": 0.095, + "num_input_tokens_seen": 71374736, + "step": 58650 + }, + { + "epoch": 6.53246463971489, + "grad_norm": 0.003165856469422579, + "learning_rate": 4.257756754712751e-05, + "loss": 0.1447, + "num_input_tokens_seen": 71380656, + "step": 58655 + }, + { + "epoch": 6.533021494598508, + "grad_norm": 0.31328195333480835, + "learning_rate": 4.257583970596142e-05, + "loss": 0.0267, + "num_input_tokens_seen": 71386960, + "step": 58660 + }, + { + "epoch": 6.533578349482125, + "grad_norm": 0.07907954603433609, + "learning_rate": 4.257411169877743e-05, + "loss": 0.0188, + "num_input_tokens_seen": 71392880, + "step": 58665 + }, + { + "epoch": 6.534135204365742, + "grad_norm": 1.037713885307312, + "learning_rate": 4.2572383525591885e-05, + "loss": 0.0984, + "num_input_tokens_seen": 71399120, + "step": 58670 + }, + { + "epoch": 6.53469205924936, + "grad_norm": 2.629322052001953, + "learning_rate": 4.2570655186421095e-05, + "loss": 0.0816, + "num_input_tokens_seen": 71405424, + "step": 58675 + }, + { + "epoch": 6.535248914132977, + "grad_norm": 0.05571878328919411, + "learning_rate": 4.256892668128139e-05, + "loss": 0.0952, + "num_input_tokens_seen": 71411632, + "step": 58680 + }, + { + "epoch": 6.535805769016594, + "grad_norm": 0.8329879641532898, + "learning_rate": 4.25671980101891e-05, + "loss": 0.1624, + "num_input_tokens_seen": 71417840, + "step": 58685 + }, + { + "epoch": 6.536362623900212, + "grad_norm": 0.05412755161523819, + "learning_rate": 4.2565469173160544e-05, + "loss": 0.0384, + "num_input_tokens_seen": 71424080, + "step": 58690 + }, + { + "epoch": 6.536919478783829, + "grad_norm": 0.00030407318263314664, + "learning_rate": 4.256374017021206e-05, + "loss": 0.0227, + "num_input_tokens_seen": 71430064, + "step": 58695 + }, + { + "epoch": 6.537476333667446, + "grad_norm": 1.0972704887390137, + "learning_rate": 4.256201100135998e-05, + "loss": 0.1145, + "num_input_tokens_seen": 71436400, + "step": 58700 + }, + { + "epoch": 6.538033188551063, + "grad_norm": 0.7825791239738464, + "learning_rate": 4.2560281666620636e-05, + "loss": 0.0272, + "num_input_tokens_seen": 71442736, + "step": 58705 + }, + { + "epoch": 6.538590043434681, + "grad_norm": 0.4910535514354706, + "learning_rate": 4.2558552166010365e-05, + "loss": 0.0969, + "num_input_tokens_seen": 71448784, + "step": 58710 + }, + { + "epoch": 6.5391468983182985, + "grad_norm": 0.02178039588034153, + "learning_rate": 4.25568224995455e-05, + "loss": 0.0507, + "num_input_tokens_seen": 71455056, + "step": 58715 + }, + { + "epoch": 6.539703753201915, + "grad_norm": 0.014111263677477837, + "learning_rate": 4.255509266724238e-05, + "loss": 0.0467, + "num_input_tokens_seen": 71460912, + "step": 58720 + }, + { + "epoch": 6.540260608085533, + "grad_norm": 0.2662433683872223, + "learning_rate": 4.255336266911734e-05, + "loss": 0.0823, + "num_input_tokens_seen": 71466960, + "step": 58725 + }, + { + "epoch": 6.54081746296915, + "grad_norm": 0.1991097629070282, + "learning_rate": 4.255163250518673e-05, + "loss": 0.0282, + "num_input_tokens_seen": 71472560, + "step": 58730 + }, + { + "epoch": 6.5413743178527675, + "grad_norm": 0.8786699771881104, + "learning_rate": 4.254990217546689e-05, + "loss": 0.1087, + "num_input_tokens_seen": 71478416, + "step": 58735 + }, + { + "epoch": 6.541931172736385, + "grad_norm": 0.6216719150543213, + "learning_rate": 4.254817167997416e-05, + "loss": 0.1139, + "num_input_tokens_seen": 71484432, + "step": 58740 + }, + { + "epoch": 6.542488027620002, + "grad_norm": 0.006123648956418037, + "learning_rate": 4.254644101872489e-05, + "loss": 0.0076, + "num_input_tokens_seen": 71490512, + "step": 58745 + }, + { + "epoch": 6.54304488250362, + "grad_norm": 0.018666844815015793, + "learning_rate": 4.254471019173543e-05, + "loss": 0.0408, + "num_input_tokens_seen": 71496592, + "step": 58750 + }, + { + "epoch": 6.543601737387237, + "grad_norm": 0.18141262233257294, + "learning_rate": 4.254297919902211e-05, + "loss": 0.0428, + "num_input_tokens_seen": 71502672, + "step": 58755 + }, + { + "epoch": 6.544158592270854, + "grad_norm": 0.1118532344698906, + "learning_rate": 4.2541248040601315e-05, + "loss": 0.0118, + "num_input_tokens_seen": 71508656, + "step": 58760 + }, + { + "epoch": 6.544715447154472, + "grad_norm": 0.9787999987602234, + "learning_rate": 4.253951671648937e-05, + "loss": 0.1278, + "num_input_tokens_seen": 71514000, + "step": 58765 + }, + { + "epoch": 6.5452723020380885, + "grad_norm": 0.010053460486233234, + "learning_rate": 4.253778522670264e-05, + "loss": 0.0491, + "num_input_tokens_seen": 71520240, + "step": 58770 + }, + { + "epoch": 6.545829156921706, + "grad_norm": 0.23538608849048615, + "learning_rate": 4.2536053571257484e-05, + "loss": 0.0242, + "num_input_tokens_seen": 71526448, + "step": 58775 + }, + { + "epoch": 6.546386011805324, + "grad_norm": 0.23109659552574158, + "learning_rate": 4.2534321750170245e-05, + "loss": 0.1493, + "num_input_tokens_seen": 71532496, + "step": 58780 + }, + { + "epoch": 6.546942866688941, + "grad_norm": 0.009055224247276783, + "learning_rate": 4.253258976345729e-05, + "loss": 0.0156, + "num_input_tokens_seen": 71538928, + "step": 58785 + }, + { + "epoch": 6.547499721572558, + "grad_norm": 1.2456568479537964, + "learning_rate": 4.2530857611134975e-05, + "loss": 0.0766, + "num_input_tokens_seen": 71544656, + "step": 58790 + }, + { + "epoch": 6.548056576456176, + "grad_norm": 0.03235703706741333, + "learning_rate": 4.2529125293219666e-05, + "loss": 0.0451, + "num_input_tokens_seen": 71550960, + "step": 58795 + }, + { + "epoch": 6.548613431339793, + "grad_norm": 0.05336684733629227, + "learning_rate": 4.2527392809727726e-05, + "loss": 0.0314, + "num_input_tokens_seen": 71557520, + "step": 58800 + }, + { + "epoch": 6.5491702862234105, + "grad_norm": 0.000377367454348132, + "learning_rate": 4.252566016067552e-05, + "loss": 0.0539, + "num_input_tokens_seen": 71563088, + "step": 58805 + }, + { + "epoch": 6.549727141107027, + "grad_norm": 2.0862419605255127, + "learning_rate": 4.252392734607941e-05, + "loss": 0.116, + "num_input_tokens_seen": 71568848, + "step": 58810 + }, + { + "epoch": 6.550283995990645, + "grad_norm": 0.15544793009757996, + "learning_rate": 4.252219436595576e-05, + "loss": 0.0273, + "num_input_tokens_seen": 71575088, + "step": 58815 + }, + { + "epoch": 6.550840850874263, + "grad_norm": 0.3035302758216858, + "learning_rate": 4.252046122032095e-05, + "loss": 0.06, + "num_input_tokens_seen": 71581360, + "step": 58820 + }, + { + "epoch": 6.551397705757879, + "grad_norm": 1.5907008647918701, + "learning_rate": 4.251872790919135e-05, + "loss": 0.0692, + "num_input_tokens_seen": 71586864, + "step": 58825 + }, + { + "epoch": 6.551954560641497, + "grad_norm": 1.873283863067627, + "learning_rate": 4.251699443258333e-05, + "loss": 0.0736, + "num_input_tokens_seen": 71592976, + "step": 58830 + }, + { + "epoch": 6.552511415525114, + "grad_norm": 0.5042201280593872, + "learning_rate": 4.2515260790513264e-05, + "loss": 0.0618, + "num_input_tokens_seen": 71599152, + "step": 58835 + }, + { + "epoch": 6.5530682704087315, + "grad_norm": 0.002110406756401062, + "learning_rate": 4.251352698299752e-05, + "loss": 0.0983, + "num_input_tokens_seen": 71605296, + "step": 58840 + }, + { + "epoch": 6.553625125292349, + "grad_norm": 0.002498550107702613, + "learning_rate": 4.251179301005248e-05, + "loss": 0.1182, + "num_input_tokens_seen": 71611408, + "step": 58845 + }, + { + "epoch": 6.554181980175966, + "grad_norm": 0.015384213998913765, + "learning_rate": 4.251005887169454e-05, + "loss": 0.0401, + "num_input_tokens_seen": 71617552, + "step": 58850 + }, + { + "epoch": 6.554738835059584, + "grad_norm": 0.12192438542842865, + "learning_rate": 4.250832456794005e-05, + "loss": 0.0043, + "num_input_tokens_seen": 71623824, + "step": 58855 + }, + { + "epoch": 6.5552956899432, + "grad_norm": 0.05346579849720001, + "learning_rate": 4.250659009880541e-05, + "loss": 0.0233, + "num_input_tokens_seen": 71630096, + "step": 58860 + }, + { + "epoch": 6.555852544826818, + "grad_norm": 0.43169957399368286, + "learning_rate": 4.2504855464307e-05, + "loss": 0.038, + "num_input_tokens_seen": 71635504, + "step": 58865 + }, + { + "epoch": 6.556409399710436, + "grad_norm": 1.3486067056655884, + "learning_rate": 4.2503120664461214e-05, + "loss": 0.2427, + "num_input_tokens_seen": 71641936, + "step": 58870 + }, + { + "epoch": 6.556966254594053, + "grad_norm": 0.6423922777175903, + "learning_rate": 4.2501385699284426e-05, + "loss": 0.051, + "num_input_tokens_seen": 71648048, + "step": 58875 + }, + { + "epoch": 6.55752310947767, + "grad_norm": 0.5253409743309021, + "learning_rate": 4.2499650568793025e-05, + "loss": 0.0232, + "num_input_tokens_seen": 71654480, + "step": 58880 + }, + { + "epoch": 6.558079964361287, + "grad_norm": 0.4929225444793701, + "learning_rate": 4.249791527300341e-05, + "loss": 0.0845, + "num_input_tokens_seen": 71660464, + "step": 58885 + }, + { + "epoch": 6.558636819244905, + "grad_norm": 0.14616814255714417, + "learning_rate": 4.249617981193196e-05, + "loss": 0.0259, + "num_input_tokens_seen": 71666576, + "step": 58890 + }, + { + "epoch": 6.559193674128522, + "grad_norm": 0.5846402645111084, + "learning_rate": 4.2494444185595074e-05, + "loss": 0.0284, + "num_input_tokens_seen": 71672560, + "step": 58895 + }, + { + "epoch": 6.559750529012139, + "grad_norm": 0.9486863613128662, + "learning_rate": 4.249270839400915e-05, + "loss": 0.0597, + "num_input_tokens_seen": 71678640, + "step": 58900 + }, + { + "epoch": 6.560307383895757, + "grad_norm": 0.014539428986608982, + "learning_rate": 4.249097243719058e-05, + "loss": 0.0575, + "num_input_tokens_seen": 71684304, + "step": 58905 + }, + { + "epoch": 6.560864238779374, + "grad_norm": 1.0521841049194336, + "learning_rate": 4.248923631515576e-05, + "loss": 0.0451, + "num_input_tokens_seen": 71690416, + "step": 58910 + }, + { + "epoch": 6.561421093662991, + "grad_norm": 0.5572735071182251, + "learning_rate": 4.248750002792108e-05, + "loss": 0.0297, + "num_input_tokens_seen": 71696688, + "step": 58915 + }, + { + "epoch": 6.561977948546609, + "grad_norm": 1.4527699947357178, + "learning_rate": 4.248576357550297e-05, + "loss": 0.123, + "num_input_tokens_seen": 71702512, + "step": 58920 + }, + { + "epoch": 6.562534803430226, + "grad_norm": 0.3136487901210785, + "learning_rate": 4.2484026957917806e-05, + "loss": 0.0508, + "num_input_tokens_seen": 71709072, + "step": 58925 + }, + { + "epoch": 6.5630916583138434, + "grad_norm": 0.4614875614643097, + "learning_rate": 4.2482290175181996e-05, + "loss": 0.043, + "num_input_tokens_seen": 71715216, + "step": 58930 + }, + { + "epoch": 6.563648513197461, + "grad_norm": 0.05367957055568695, + "learning_rate": 4.2480553227311956e-05, + "loss": 0.1378, + "num_input_tokens_seen": 71721296, + "step": 58935 + }, + { + "epoch": 6.564205368081078, + "grad_norm": 1.7812247276306152, + "learning_rate": 4.2478816114324085e-05, + "loss": 0.026, + "num_input_tokens_seen": 71727728, + "step": 58940 + }, + { + "epoch": 6.564762222964696, + "grad_norm": 0.7249225974082947, + "learning_rate": 4.247707883623478e-05, + "loss": 0.1017, + "num_input_tokens_seen": 71733648, + "step": 58945 + }, + { + "epoch": 6.565319077848312, + "grad_norm": 0.029481086879968643, + "learning_rate": 4.247534139306048e-05, + "loss": 0.0497, + "num_input_tokens_seen": 71740080, + "step": 58950 + }, + { + "epoch": 6.56587593273193, + "grad_norm": 0.2119215577840805, + "learning_rate": 4.2473603784817565e-05, + "loss": 0.0737, + "num_input_tokens_seen": 71746096, + "step": 58955 + }, + { + "epoch": 6.566432787615548, + "grad_norm": 0.6194769740104675, + "learning_rate": 4.247186601152247e-05, + "loss": 0.0798, + "num_input_tokens_seen": 71751920, + "step": 58960 + }, + { + "epoch": 6.5669896424991645, + "grad_norm": 0.0013240460539236665, + "learning_rate": 4.2470128073191604e-05, + "loss": 0.0108, + "num_input_tokens_seen": 71758032, + "step": 58965 + }, + { + "epoch": 6.567546497382782, + "grad_norm": 0.8446286916732788, + "learning_rate": 4.246838996984138e-05, + "loss": 0.076, + "num_input_tokens_seen": 71764112, + "step": 58970 + }, + { + "epoch": 6.5681033522664, + "grad_norm": 0.9201540350914001, + "learning_rate": 4.2466651701488215e-05, + "loss": 0.1224, + "num_input_tokens_seen": 71770288, + "step": 58975 + }, + { + "epoch": 6.568660207150017, + "grad_norm": 0.1768539696931839, + "learning_rate": 4.246491326814853e-05, + "loss": 0.1481, + "num_input_tokens_seen": 71776304, + "step": 58980 + }, + { + "epoch": 6.569217062033634, + "grad_norm": 0.0002969578199554235, + "learning_rate": 4.246317466983874e-05, + "loss": 0.0525, + "num_input_tokens_seen": 71782512, + "step": 58985 + }, + { + "epoch": 6.569773916917251, + "grad_norm": 0.9996088147163391, + "learning_rate": 4.2461435906575286e-05, + "loss": 0.0943, + "num_input_tokens_seen": 71788848, + "step": 58990 + }, + { + "epoch": 6.570330771800869, + "grad_norm": 0.4549092948436737, + "learning_rate": 4.245969697837458e-05, + "loss": 0.146, + "num_input_tokens_seen": 71795184, + "step": 58995 + }, + { + "epoch": 6.5708876266844864, + "grad_norm": 0.08527329564094543, + "learning_rate": 4.245795788525304e-05, + "loss": 0.0567, + "num_input_tokens_seen": 71800880, + "step": 59000 + }, + { + "epoch": 6.571444481568103, + "grad_norm": 0.5087187886238098, + "learning_rate": 4.245621862722711e-05, + "loss": 0.1245, + "num_input_tokens_seen": 71807248, + "step": 59005 + }, + { + "epoch": 6.572001336451721, + "grad_norm": 0.016074782237410545, + "learning_rate": 4.2454479204313204e-05, + "loss": 0.0083, + "num_input_tokens_seen": 71813328, + "step": 59010 + }, + { + "epoch": 6.572558191335338, + "grad_norm": 1.3777929544448853, + "learning_rate": 4.245273961652776e-05, + "loss": 0.0524, + "num_input_tokens_seen": 71819312, + "step": 59015 + }, + { + "epoch": 6.573115046218955, + "grad_norm": 0.9187594056129456, + "learning_rate": 4.2450999863887197e-05, + "loss": 0.1276, + "num_input_tokens_seen": 71825616, + "step": 59020 + }, + { + "epoch": 6.573671901102573, + "grad_norm": 0.5600265860557556, + "learning_rate": 4.244925994640797e-05, + "loss": 0.0819, + "num_input_tokens_seen": 71831824, + "step": 59025 + }, + { + "epoch": 6.57422875598619, + "grad_norm": 1.1753640174865723, + "learning_rate": 4.24475198641065e-05, + "loss": 0.1109, + "num_input_tokens_seen": 71838032, + "step": 59030 + }, + { + "epoch": 6.5747856108698075, + "grad_norm": 1.933948278427124, + "learning_rate": 4.2445779616999224e-05, + "loss": 0.2026, + "num_input_tokens_seen": 71844272, + "step": 59035 + }, + { + "epoch": 6.575342465753424, + "grad_norm": 0.5809181332588196, + "learning_rate": 4.244403920510258e-05, + "loss": 0.033, + "num_input_tokens_seen": 71850352, + "step": 59040 + }, + { + "epoch": 6.575899320637042, + "grad_norm": 0.465912401676178, + "learning_rate": 4.244229862843302e-05, + "loss": 0.1336, + "num_input_tokens_seen": 71856496, + "step": 59045 + }, + { + "epoch": 6.57645617552066, + "grad_norm": 0.8937426209449768, + "learning_rate": 4.2440557887006964e-05, + "loss": 0.0409, + "num_input_tokens_seen": 71862800, + "step": 59050 + }, + { + "epoch": 6.577013030404276, + "grad_norm": 1.2444168329238892, + "learning_rate": 4.243881698084087e-05, + "loss": 0.0638, + "num_input_tokens_seen": 71869008, + "step": 59055 + }, + { + "epoch": 6.577569885287894, + "grad_norm": 0.6834031343460083, + "learning_rate": 4.243707590995118e-05, + "loss": 0.0931, + "num_input_tokens_seen": 71874928, + "step": 59060 + }, + { + "epoch": 6.578126740171511, + "grad_norm": 0.7091704607009888, + "learning_rate": 4.243533467435434e-05, + "loss": 0.0926, + "num_input_tokens_seen": 71881488, + "step": 59065 + }, + { + "epoch": 6.578683595055129, + "grad_norm": 0.26476362347602844, + "learning_rate": 4.243359327406679e-05, + "loss": 0.0248, + "num_input_tokens_seen": 71887504, + "step": 59070 + }, + { + "epoch": 6.579240449938746, + "grad_norm": 0.08458632975816727, + "learning_rate": 4.243185170910498e-05, + "loss": 0.0097, + "num_input_tokens_seen": 71893680, + "step": 59075 + }, + { + "epoch": 6.579797304822363, + "grad_norm": 1.2688264846801758, + "learning_rate": 4.243010997948536e-05, + "loss": 0.088, + "num_input_tokens_seen": 71899888, + "step": 59080 + }, + { + "epoch": 6.580354159705981, + "grad_norm": 0.28818997740745544, + "learning_rate": 4.2428368085224404e-05, + "loss": 0.0726, + "num_input_tokens_seen": 71905872, + "step": 59085 + }, + { + "epoch": 6.5809110145895975, + "grad_norm": 0.06841213256120682, + "learning_rate": 4.2426626026338546e-05, + "loss": 0.004, + "num_input_tokens_seen": 71912336, + "step": 59090 + }, + { + "epoch": 6.581467869473215, + "grad_norm": 0.7144467234611511, + "learning_rate": 4.242488380284423e-05, + "loss": 0.0349, + "num_input_tokens_seen": 71918864, + "step": 59095 + }, + { + "epoch": 6.582024724356833, + "grad_norm": 0.08004701882600784, + "learning_rate": 4.242314141475793e-05, + "loss": 0.0204, + "num_input_tokens_seen": 71924752, + "step": 59100 + }, + { + "epoch": 6.58258157924045, + "grad_norm": 0.6260764002799988, + "learning_rate": 4.242139886209611e-05, + "loss": 0.0482, + "num_input_tokens_seen": 71930224, + "step": 59105 + }, + { + "epoch": 6.583138434124067, + "grad_norm": 0.413944274187088, + "learning_rate": 4.241965614487522e-05, + "loss": 0.0928, + "num_input_tokens_seen": 71936304, + "step": 59110 + }, + { + "epoch": 6.583695289007685, + "grad_norm": 0.05029116943478584, + "learning_rate": 4.241791326311171e-05, + "loss": 0.057, + "num_input_tokens_seen": 71942800, + "step": 59115 + }, + { + "epoch": 6.584252143891302, + "grad_norm": 0.2833293080329895, + "learning_rate": 4.241617021682206e-05, + "loss": 0.0328, + "num_input_tokens_seen": 71949136, + "step": 59120 + }, + { + "epoch": 6.584808998774919, + "grad_norm": 0.332220196723938, + "learning_rate": 4.241442700602272e-05, + "loss": 0.1512, + "num_input_tokens_seen": 71954992, + "step": 59125 + }, + { + "epoch": 6.585365853658536, + "grad_norm": 0.002518008928745985, + "learning_rate": 4.241268363073018e-05, + "loss": 0.0391, + "num_input_tokens_seen": 71961328, + "step": 59130 + }, + { + "epoch": 6.585922708542154, + "grad_norm": 0.9092869162559509, + "learning_rate": 4.2410940090960876e-05, + "loss": 0.0384, + "num_input_tokens_seen": 71967536, + "step": 59135 + }, + { + "epoch": 6.586479563425772, + "grad_norm": 0.20223131775856018, + "learning_rate": 4.2409196386731306e-05, + "loss": 0.124, + "num_input_tokens_seen": 71973776, + "step": 59140 + }, + { + "epoch": 6.587036418309388, + "grad_norm": 0.041504472494125366, + "learning_rate": 4.240745251805792e-05, + "loss": 0.0189, + "num_input_tokens_seen": 71980240, + "step": 59145 + }, + { + "epoch": 6.587593273193006, + "grad_norm": 0.1888362467288971, + "learning_rate": 4.24057084849572e-05, + "loss": 0.1057, + "num_input_tokens_seen": 71986064, + "step": 59150 + }, + { + "epoch": 6.588150128076624, + "grad_norm": 0.07773162424564362, + "learning_rate": 4.240396428744562e-05, + "loss": 0.0415, + "num_input_tokens_seen": 71992304, + "step": 59155 + }, + { + "epoch": 6.5887069829602405, + "grad_norm": 0.7398709058761597, + "learning_rate": 4.240221992553966e-05, + "loss": 0.1049, + "num_input_tokens_seen": 71998192, + "step": 59160 + }, + { + "epoch": 6.589263837843858, + "grad_norm": 1.2789194583892822, + "learning_rate": 4.2400475399255776e-05, + "loss": 0.0984, + "num_input_tokens_seen": 72004080, + "step": 59165 + }, + { + "epoch": 6.589820692727475, + "grad_norm": 0.20671063661575317, + "learning_rate": 4.239873070861047e-05, + "loss": 0.0061, + "num_input_tokens_seen": 72010448, + "step": 59170 + }, + { + "epoch": 6.590377547611093, + "grad_norm": 0.10199031978845596, + "learning_rate": 4.2396985853620214e-05, + "loss": 0.0621, + "num_input_tokens_seen": 72016688, + "step": 59175 + }, + { + "epoch": 6.59093440249471, + "grad_norm": 0.528086245059967, + "learning_rate": 4.2395240834301486e-05, + "loss": 0.0532, + "num_input_tokens_seen": 72022800, + "step": 59180 + }, + { + "epoch": 6.591491257378327, + "grad_norm": 0.9319228529930115, + "learning_rate": 4.239349565067077e-05, + "loss": 0.0467, + "num_input_tokens_seen": 72029456, + "step": 59185 + }, + { + "epoch": 6.592048112261945, + "grad_norm": 0.7558197379112244, + "learning_rate": 4.239175030274456e-05, + "loss": 0.1047, + "num_input_tokens_seen": 72035376, + "step": 59190 + }, + { + "epoch": 6.5926049671455615, + "grad_norm": 0.24385620653629303, + "learning_rate": 4.239000479053932e-05, + "loss": 0.0077, + "num_input_tokens_seen": 72041808, + "step": 59195 + }, + { + "epoch": 6.593161822029179, + "grad_norm": 0.35493919253349304, + "learning_rate": 4.238825911407156e-05, + "loss": 0.0644, + "num_input_tokens_seen": 72047440, + "step": 59200 + }, + { + "epoch": 6.593718676912797, + "grad_norm": 2.840472459793091, + "learning_rate": 4.2386513273357766e-05, + "loss": 0.0646, + "num_input_tokens_seen": 72053296, + "step": 59205 + }, + { + "epoch": 6.594275531796414, + "grad_norm": 0.2940767705440521, + "learning_rate": 4.238476726841442e-05, + "loss": 0.061, + "num_input_tokens_seen": 72059568, + "step": 59210 + }, + { + "epoch": 6.594832386680031, + "grad_norm": 0.12583394348621368, + "learning_rate": 4.238302109925801e-05, + "loss": 0.0663, + "num_input_tokens_seen": 72065104, + "step": 59215 + }, + { + "epoch": 6.595389241563648, + "grad_norm": 0.063819020986557, + "learning_rate": 4.2381274765905056e-05, + "loss": 0.1653, + "num_input_tokens_seen": 72070896, + "step": 59220 + }, + { + "epoch": 6.595946096447266, + "grad_norm": 0.10327645391225815, + "learning_rate": 4.237952826837203e-05, + "loss": 0.0354, + "num_input_tokens_seen": 72077168, + "step": 59225 + }, + { + "epoch": 6.5965029513308835, + "grad_norm": 0.044395215809345245, + "learning_rate": 4.237778160667542e-05, + "loss": 0.0822, + "num_input_tokens_seen": 72083152, + "step": 59230 + }, + { + "epoch": 6.5970598062145, + "grad_norm": 0.026568129658699036, + "learning_rate": 4.237603478083176e-05, + "loss": 0.0243, + "num_input_tokens_seen": 72089360, + "step": 59235 + }, + { + "epoch": 6.597616661098118, + "grad_norm": 0.06411311030387878, + "learning_rate": 4.237428779085753e-05, + "loss": 0.0199, + "num_input_tokens_seen": 72095408, + "step": 59240 + }, + { + "epoch": 6.598173515981735, + "grad_norm": 1.5807462930679321, + "learning_rate": 4.237254063676922e-05, + "loss": 0.1167, + "num_input_tokens_seen": 72101552, + "step": 59245 + }, + { + "epoch": 6.598730370865352, + "grad_norm": 1.3284064531326294, + "learning_rate": 4.237079331858335e-05, + "loss": 0.0985, + "num_input_tokens_seen": 72107600, + "step": 59250 + }, + { + "epoch": 6.59928722574897, + "grad_norm": 0.01109226606786251, + "learning_rate": 4.236904583631641e-05, + "loss": 0.0545, + "num_input_tokens_seen": 72113872, + "step": 59255 + }, + { + "epoch": 6.599844080632587, + "grad_norm": 0.0003220017533749342, + "learning_rate": 4.236729818998493e-05, + "loss": 0.0049, + "num_input_tokens_seen": 72119920, + "step": 59260 + }, + { + "epoch": 6.6004009355162045, + "grad_norm": 0.12966187298297882, + "learning_rate": 4.23655503796054e-05, + "loss": 0.0708, + "num_input_tokens_seen": 72125904, + "step": 59265 + }, + { + "epoch": 6.600957790399821, + "grad_norm": 0.08608569949865341, + "learning_rate": 4.236380240519433e-05, + "loss": 0.0255, + "num_input_tokens_seen": 72132112, + "step": 59270 + }, + { + "epoch": 6.601514645283439, + "grad_norm": 0.2722499966621399, + "learning_rate": 4.236205426676824e-05, + "loss": 0.0339, + "num_input_tokens_seen": 72138128, + "step": 59275 + }, + { + "epoch": 6.602071500167057, + "grad_norm": 0.00944725051522255, + "learning_rate": 4.236030596434364e-05, + "loss": 0.0021, + "num_input_tokens_seen": 72144560, + "step": 59280 + }, + { + "epoch": 6.6026283550506735, + "grad_norm": 0.2333015650510788, + "learning_rate": 4.235855749793703e-05, + "loss": 0.1408, + "num_input_tokens_seen": 72150896, + "step": 59285 + }, + { + "epoch": 6.603185209934291, + "grad_norm": 1.7649742364883423, + "learning_rate": 4.235680886756495e-05, + "loss": 0.086, + "num_input_tokens_seen": 72156720, + "step": 59290 + }, + { + "epoch": 6.603742064817909, + "grad_norm": 0.003049192950129509, + "learning_rate": 4.235506007324389e-05, + "loss": 0.0933, + "num_input_tokens_seen": 72162864, + "step": 59295 + }, + { + "epoch": 6.604298919701526, + "grad_norm": 0.0007809275994077325, + "learning_rate": 4.235331111499039e-05, + "loss": 0.0262, + "num_input_tokens_seen": 72169072, + "step": 59300 + }, + { + "epoch": 6.604855774585143, + "grad_norm": 0.14869514107704163, + "learning_rate": 4.235156199282097e-05, + "loss": 0.027, + "num_input_tokens_seen": 72175248, + "step": 59305 + }, + { + "epoch": 6.60541262946876, + "grad_norm": 0.010701624676585197, + "learning_rate": 4.234981270675213e-05, + "loss": 0.0515, + "num_input_tokens_seen": 72181392, + "step": 59310 + }, + { + "epoch": 6.605969484352378, + "grad_norm": 0.26571178436279297, + "learning_rate": 4.234806325680042e-05, + "loss": 0.0669, + "num_input_tokens_seen": 72187600, + "step": 59315 + }, + { + "epoch": 6.606526339235995, + "grad_norm": 0.0024642106145620346, + "learning_rate": 4.234631364298235e-05, + "loss": 0.0313, + "num_input_tokens_seen": 72193840, + "step": 59320 + }, + { + "epoch": 6.607083194119612, + "grad_norm": 0.31253859400749207, + "learning_rate": 4.234456386531446e-05, + "loss": 0.0383, + "num_input_tokens_seen": 72199984, + "step": 59325 + }, + { + "epoch": 6.60764004900323, + "grad_norm": 2.5767271518707275, + "learning_rate": 4.234281392381325e-05, + "loss": 0.0463, + "num_input_tokens_seen": 72205776, + "step": 59330 + }, + { + "epoch": 6.6081969038868476, + "grad_norm": 1.7058522701263428, + "learning_rate": 4.234106381849528e-05, + "loss": 0.0179, + "num_input_tokens_seen": 72212016, + "step": 59335 + }, + { + "epoch": 6.608753758770464, + "grad_norm": 0.8764970898628235, + "learning_rate": 4.233931354937707e-05, + "loss": 0.0789, + "num_input_tokens_seen": 72218288, + "step": 59340 + }, + { + "epoch": 6.609310613654082, + "grad_norm": 0.009833678603172302, + "learning_rate": 4.2337563116475146e-05, + "loss": 0.0956, + "num_input_tokens_seen": 72224272, + "step": 59345 + }, + { + "epoch": 6.609867468537699, + "grad_norm": 0.2894057333469391, + "learning_rate": 4.233581251980604e-05, + "loss": 0.096, + "num_input_tokens_seen": 72230128, + "step": 59350 + }, + { + "epoch": 6.6104243234213165, + "grad_norm": 0.1943863481283188, + "learning_rate": 4.233406175938631e-05, + "loss": 0.0152, + "num_input_tokens_seen": 72236464, + "step": 59355 + }, + { + "epoch": 6.610981178304934, + "grad_norm": 0.909062385559082, + "learning_rate": 4.233231083523247e-05, + "loss": 0.049, + "num_input_tokens_seen": 72242704, + "step": 59360 + }, + { + "epoch": 6.611538033188551, + "grad_norm": 1.3734464645385742, + "learning_rate": 4.2330559747361075e-05, + "loss": 0.0999, + "num_input_tokens_seen": 72248880, + "step": 59365 + }, + { + "epoch": 6.612094888072169, + "grad_norm": 0.03845975548028946, + "learning_rate": 4.2328808495788654e-05, + "loss": 0.0467, + "num_input_tokens_seen": 72255216, + "step": 59370 + }, + { + "epoch": 6.612651742955785, + "grad_norm": 0.21205896139144897, + "learning_rate": 4.232705708053175e-05, + "loss": 0.0412, + "num_input_tokens_seen": 72261072, + "step": 59375 + }, + { + "epoch": 6.613208597839403, + "grad_norm": 0.3169693946838379, + "learning_rate": 4.2325305501606914e-05, + "loss": 0.0694, + "num_input_tokens_seen": 72267120, + "step": 59380 + }, + { + "epoch": 6.613765452723021, + "grad_norm": 0.6574592590332031, + "learning_rate": 4.232355375903069e-05, + "loss": 0.0233, + "num_input_tokens_seen": 72272208, + "step": 59385 + }, + { + "epoch": 6.6143223076066375, + "grad_norm": 0.04525863379240036, + "learning_rate": 4.232180185281961e-05, + "loss": 0.0878, + "num_input_tokens_seen": 72278224, + "step": 59390 + }, + { + "epoch": 6.614879162490255, + "grad_norm": 0.02923581376671791, + "learning_rate": 4.2320049782990245e-05, + "loss": 0.0518, + "num_input_tokens_seen": 72284368, + "step": 59395 + }, + { + "epoch": 6.615436017373872, + "grad_norm": 0.2590116262435913, + "learning_rate": 4.2318297549559126e-05, + "loss": 0.0175, + "num_input_tokens_seen": 72290032, + "step": 59400 + }, + { + "epoch": 6.61599287225749, + "grad_norm": 1.6312958002090454, + "learning_rate": 4.231654515254282e-05, + "loss": 0.0328, + "num_input_tokens_seen": 72296336, + "step": 59405 + }, + { + "epoch": 6.616549727141107, + "grad_norm": 0.20321646332740784, + "learning_rate": 4.231479259195786e-05, + "loss": 0.0437, + "num_input_tokens_seen": 72302544, + "step": 59410 + }, + { + "epoch": 6.617106582024724, + "grad_norm": 0.19069740176200867, + "learning_rate": 4.2313039867820816e-05, + "loss": 0.1424, + "num_input_tokens_seen": 72308720, + "step": 59415 + }, + { + "epoch": 6.617663436908342, + "grad_norm": 0.4192667305469513, + "learning_rate": 4.231128698014824e-05, + "loss": 0.0281, + "num_input_tokens_seen": 72314768, + "step": 59420 + }, + { + "epoch": 6.618220291791959, + "grad_norm": 0.001886101788841188, + "learning_rate": 4.230953392895669e-05, + "loss": 0.0116, + "num_input_tokens_seen": 72320912, + "step": 59425 + }, + { + "epoch": 6.618777146675576, + "grad_norm": 0.35047492384910583, + "learning_rate": 4.230778071426272e-05, + "loss": 0.055, + "num_input_tokens_seen": 72326352, + "step": 59430 + }, + { + "epoch": 6.619334001559194, + "grad_norm": 0.005114071071147919, + "learning_rate": 4.23060273360829e-05, + "loss": 0.0762, + "num_input_tokens_seen": 72332464, + "step": 59435 + }, + { + "epoch": 6.619890856442811, + "grad_norm": 1.1003469228744507, + "learning_rate": 4.230427379443379e-05, + "loss": 0.2367, + "num_input_tokens_seen": 72338576, + "step": 59440 + }, + { + "epoch": 6.620447711326428, + "grad_norm": 1.1893576383590698, + "learning_rate": 4.230252008933194e-05, + "loss": 0.0435, + "num_input_tokens_seen": 72344720, + "step": 59445 + }, + { + "epoch": 6.621004566210045, + "grad_norm": 0.23218894004821777, + "learning_rate": 4.230076622079393e-05, + "loss": 0.0161, + "num_input_tokens_seen": 72350864, + "step": 59450 + }, + { + "epoch": 6.621561421093663, + "grad_norm": 0.027070796117186546, + "learning_rate": 4.2299012188836315e-05, + "loss": 0.0607, + "num_input_tokens_seen": 72357232, + "step": 59455 + }, + { + "epoch": 6.6221182759772805, + "grad_norm": 0.8254033327102661, + "learning_rate": 4.229725799347568e-05, + "loss": 0.0866, + "num_input_tokens_seen": 72363472, + "step": 59460 + }, + { + "epoch": 6.622675130860897, + "grad_norm": 0.43745702505111694, + "learning_rate": 4.229550363472858e-05, + "loss": 0.0348, + "num_input_tokens_seen": 72369616, + "step": 59465 + }, + { + "epoch": 6.623231985744515, + "grad_norm": 0.3968806862831116, + "learning_rate": 4.22937491126116e-05, + "loss": 0.0643, + "num_input_tokens_seen": 72375824, + "step": 59470 + }, + { + "epoch": 6.623788840628133, + "grad_norm": 0.05963638052344322, + "learning_rate": 4.229199442714129e-05, + "loss": 0.1375, + "num_input_tokens_seen": 72381936, + "step": 59475 + }, + { + "epoch": 6.6243456955117495, + "grad_norm": 0.19997934997081757, + "learning_rate": 4.2290239578334246e-05, + "loss": 0.0509, + "num_input_tokens_seen": 72388144, + "step": 59480 + }, + { + "epoch": 6.624902550395367, + "grad_norm": 0.5670962929725647, + "learning_rate": 4.228848456620704e-05, + "loss": 0.0573, + "num_input_tokens_seen": 72394480, + "step": 59485 + }, + { + "epoch": 6.625459405278984, + "grad_norm": 1.867156744003296, + "learning_rate": 4.228672939077623e-05, + "loss": 0.0614, + "num_input_tokens_seen": 72400432, + "step": 59490 + }, + { + "epoch": 6.626016260162602, + "grad_norm": 0.11013831943273544, + "learning_rate": 4.2284974052058436e-05, + "loss": 0.0282, + "num_input_tokens_seen": 72406448, + "step": 59495 + }, + { + "epoch": 6.626573115046219, + "grad_norm": 0.028299521654844284, + "learning_rate": 4.2283218550070194e-05, + "loss": 0.0112, + "num_input_tokens_seen": 72412496, + "step": 59500 + }, + { + "epoch": 6.627129969929836, + "grad_norm": 0.0090473098680377, + "learning_rate": 4.228146288482811e-05, + "loss": 0.1118, + "num_input_tokens_seen": 72418064, + "step": 59505 + }, + { + "epoch": 6.627686824813454, + "grad_norm": 0.43501177430152893, + "learning_rate": 4.2279707056348765e-05, + "loss": 0.0311, + "num_input_tokens_seen": 72424016, + "step": 59510 + }, + { + "epoch": 6.628243679697071, + "grad_norm": 1.2434719800949097, + "learning_rate": 4.227795106464875e-05, + "loss": 0.045, + "num_input_tokens_seen": 72430512, + "step": 59515 + }, + { + "epoch": 6.628800534580688, + "grad_norm": 0.10299879312515259, + "learning_rate": 4.2276194909744635e-05, + "loss": 0.0094, + "num_input_tokens_seen": 72436496, + "step": 59520 + }, + { + "epoch": 6.629357389464306, + "grad_norm": 0.032324958592653275, + "learning_rate": 4.227443859165302e-05, + "loss": 0.0851, + "num_input_tokens_seen": 72442448, + "step": 59525 + }, + { + "epoch": 6.629914244347923, + "grad_norm": 0.3504410982131958, + "learning_rate": 4.2272682110390494e-05, + "loss": 0.1334, + "num_input_tokens_seen": 72448752, + "step": 59530 + }, + { + "epoch": 6.63047109923154, + "grad_norm": 0.5342055559158325, + "learning_rate": 4.2270925465973645e-05, + "loss": 0.0433, + "num_input_tokens_seen": 72454832, + "step": 59535 + }, + { + "epoch": 6.631027954115158, + "grad_norm": 2.3530919551849365, + "learning_rate": 4.226916865841907e-05, + "loss": 0.0613, + "num_input_tokens_seen": 72461136, + "step": 59540 + }, + { + "epoch": 6.631584808998775, + "grad_norm": 0.002148867817595601, + "learning_rate": 4.226741168774335e-05, + "loss": 0.0509, + "num_input_tokens_seen": 72467536, + "step": 59545 + }, + { + "epoch": 6.6321416638823925, + "grad_norm": 0.18234692513942719, + "learning_rate": 4.226565455396311e-05, + "loss": 0.0094, + "num_input_tokens_seen": 72473552, + "step": 59550 + }, + { + "epoch": 6.632698518766009, + "grad_norm": 0.7323348522186279, + "learning_rate": 4.226389725709492e-05, + "loss": 0.0432, + "num_input_tokens_seen": 72479984, + "step": 59555 + }, + { + "epoch": 6.633255373649627, + "grad_norm": 0.18957290053367615, + "learning_rate": 4.226213979715539e-05, + "loss": 0.0318, + "num_input_tokens_seen": 72486224, + "step": 59560 + }, + { + "epoch": 6.633812228533245, + "grad_norm": 0.0148433493450284, + "learning_rate": 4.226038217416112e-05, + "loss": 0.0117, + "num_input_tokens_seen": 72492624, + "step": 59565 + }, + { + "epoch": 6.634369083416861, + "grad_norm": 0.34575292468070984, + "learning_rate": 4.225862438812871e-05, + "loss": 0.0109, + "num_input_tokens_seen": 72498576, + "step": 59570 + }, + { + "epoch": 6.634925938300479, + "grad_norm": 0.0014433061005547643, + "learning_rate": 4.225686643907476e-05, + "loss": 0.0497, + "num_input_tokens_seen": 72504752, + "step": 59575 + }, + { + "epoch": 6.635482793184096, + "grad_norm": 0.0029505244456231594, + "learning_rate": 4.225510832701589e-05, + "loss": 0.1429, + "num_input_tokens_seen": 72510640, + "step": 59580 + }, + { + "epoch": 6.6360396480677135, + "grad_norm": 0.000981746008619666, + "learning_rate": 4.225335005196869e-05, + "loss": 0.0556, + "num_input_tokens_seen": 72516848, + "step": 59585 + }, + { + "epoch": 6.636596502951331, + "grad_norm": 0.6898165941238403, + "learning_rate": 4.2251591613949784e-05, + "loss": 0.0335, + "num_input_tokens_seen": 72523088, + "step": 59590 + }, + { + "epoch": 6.637153357834948, + "grad_norm": 0.01903393119573593, + "learning_rate": 4.224983301297577e-05, + "loss": 0.046, + "num_input_tokens_seen": 72529040, + "step": 59595 + }, + { + "epoch": 6.637710212718566, + "grad_norm": 0.020551633089780807, + "learning_rate": 4.2248074249063264e-05, + "loss": 0.007, + "num_input_tokens_seen": 72535312, + "step": 59600 + }, + { + "epoch": 6.638267067602182, + "grad_norm": 0.015962354838848114, + "learning_rate": 4.224631532222887e-05, + "loss": 0.0653, + "num_input_tokens_seen": 72541616, + "step": 59605 + }, + { + "epoch": 6.6388239224858, + "grad_norm": 0.0004467620456125587, + "learning_rate": 4.224455623248922e-05, + "loss": 0.0822, + "num_input_tokens_seen": 72547984, + "step": 59610 + }, + { + "epoch": 6.639380777369418, + "grad_norm": 0.49099797010421753, + "learning_rate": 4.224279697986091e-05, + "loss": 0.0219, + "num_input_tokens_seen": 72554032, + "step": 59615 + }, + { + "epoch": 6.639937632253035, + "grad_norm": 0.1944100558757782, + "learning_rate": 4.2241037564360576e-05, + "loss": 0.0145, + "num_input_tokens_seen": 72560176, + "step": 59620 + }, + { + "epoch": 6.640494487136652, + "grad_norm": 2.123939037322998, + "learning_rate": 4.223927798600483e-05, + "loss": 0.0808, + "num_input_tokens_seen": 72566384, + "step": 59625 + }, + { + "epoch": 6.641051342020269, + "grad_norm": 0.02945731021463871, + "learning_rate": 4.223751824481028e-05, + "loss": 0.1447, + "num_input_tokens_seen": 72572432, + "step": 59630 + }, + { + "epoch": 6.641608196903887, + "grad_norm": 1.343741536140442, + "learning_rate": 4.2235758340793574e-05, + "loss": 0.0994, + "num_input_tokens_seen": 72578512, + "step": 59635 + }, + { + "epoch": 6.642165051787504, + "grad_norm": 0.8637478947639465, + "learning_rate": 4.223399827397131e-05, + "loss": 0.057, + "num_input_tokens_seen": 72584784, + "step": 59640 + }, + { + "epoch": 6.642721906671121, + "grad_norm": 0.9534991383552551, + "learning_rate": 4.2232238044360135e-05, + "loss": 0.0232, + "num_input_tokens_seen": 72590768, + "step": 59645 + }, + { + "epoch": 6.643278761554739, + "grad_norm": 0.36888930201530457, + "learning_rate": 4.223047765197666e-05, + "loss": 0.029, + "num_input_tokens_seen": 72596720, + "step": 59650 + }, + { + "epoch": 6.6438356164383565, + "grad_norm": 0.013949395157396793, + "learning_rate": 4.222871709683752e-05, + "loss": 0.0307, + "num_input_tokens_seen": 72602480, + "step": 59655 + }, + { + "epoch": 6.644392471321973, + "grad_norm": 0.2887846827507019, + "learning_rate": 4.222695637895934e-05, + "loss": 0.0193, + "num_input_tokens_seen": 72608592, + "step": 59660 + }, + { + "epoch": 6.644949326205591, + "grad_norm": 1.1478549242019653, + "learning_rate": 4.222519549835876e-05, + "loss": 0.1173, + "num_input_tokens_seen": 72614800, + "step": 59665 + }, + { + "epoch": 6.645506181089208, + "grad_norm": 0.6105329394340515, + "learning_rate": 4.222343445505241e-05, + "loss": 0.1279, + "num_input_tokens_seen": 72620336, + "step": 59670 + }, + { + "epoch": 6.646063035972825, + "grad_norm": 0.159589022397995, + "learning_rate": 4.2221673249056915e-05, + "loss": 0.0299, + "num_input_tokens_seen": 72626256, + "step": 59675 + }, + { + "epoch": 6.646619890856443, + "grad_norm": 0.07793143391609192, + "learning_rate": 4.221991188038892e-05, + "loss": 0.027, + "num_input_tokens_seen": 72632336, + "step": 59680 + }, + { + "epoch": 6.64717674574006, + "grad_norm": 0.02706211246550083, + "learning_rate": 4.221815034906506e-05, + "loss": 0.0066, + "num_input_tokens_seen": 72638736, + "step": 59685 + }, + { + "epoch": 6.647733600623678, + "grad_norm": 3.4096617698669434, + "learning_rate": 4.221638865510198e-05, + "loss": 0.1746, + "num_input_tokens_seen": 72644976, + "step": 59690 + }, + { + "epoch": 6.648290455507295, + "grad_norm": 0.0777563750743866, + "learning_rate": 4.221462679851631e-05, + "loss": 0.0116, + "num_input_tokens_seen": 72650928, + "step": 59695 + }, + { + "epoch": 6.648847310390912, + "grad_norm": 0.008844430558383465, + "learning_rate": 4.22128647793247e-05, + "loss": 0.0735, + "num_input_tokens_seen": 72657040, + "step": 59700 + }, + { + "epoch": 6.64940416527453, + "grad_norm": 0.9952767491340637, + "learning_rate": 4.2211102597543796e-05, + "loss": 0.0762, + "num_input_tokens_seen": 72662832, + "step": 59705 + }, + { + "epoch": 6.6499610201581465, + "grad_norm": 0.021476279944181442, + "learning_rate": 4.2209340253190235e-05, + "loss": 0.1918, + "num_input_tokens_seen": 72669040, + "step": 59710 + }, + { + "epoch": 6.650517875041764, + "grad_norm": 0.0009660322102718055, + "learning_rate": 4.220757774628067e-05, + "loss": 0.007, + "num_input_tokens_seen": 72675344, + "step": 59715 + }, + { + "epoch": 6.651074729925382, + "grad_norm": 0.10923359543085098, + "learning_rate": 4.2205815076831746e-05, + "loss": 0.0647, + "num_input_tokens_seen": 72681424, + "step": 59720 + }, + { + "epoch": 6.651631584808999, + "grad_norm": 0.1340087354183197, + "learning_rate": 4.22040522448601e-05, + "loss": 0.0794, + "num_input_tokens_seen": 72687344, + "step": 59725 + }, + { + "epoch": 6.652188439692616, + "grad_norm": 1.071463942527771, + "learning_rate": 4.2202289250382415e-05, + "loss": 0.0669, + "num_input_tokens_seen": 72693232, + "step": 59730 + }, + { + "epoch": 6.652745294576233, + "grad_norm": 0.28639453649520874, + "learning_rate": 4.220052609341532e-05, + "loss": 0.0156, + "num_input_tokens_seen": 72699120, + "step": 59735 + }, + { + "epoch": 6.653302149459851, + "grad_norm": 0.002286932896822691, + "learning_rate": 4.219876277397548e-05, + "loss": 0.0089, + "num_input_tokens_seen": 72705296, + "step": 59740 + }, + { + "epoch": 6.653859004343468, + "grad_norm": 0.08462433516979218, + "learning_rate": 4.219699929207954e-05, + "loss": 0.0806, + "num_input_tokens_seen": 72710992, + "step": 59745 + }, + { + "epoch": 6.654415859227085, + "grad_norm": 0.4049665927886963, + "learning_rate": 4.2195235647744155e-05, + "loss": 0.0187, + "num_input_tokens_seen": 72716976, + "step": 59750 + }, + { + "epoch": 6.654972714110703, + "grad_norm": 0.0073107467032969, + "learning_rate": 4.219347184098601e-05, + "loss": 0.0492, + "num_input_tokens_seen": 72723120, + "step": 59755 + }, + { + "epoch": 6.65552956899432, + "grad_norm": 0.0017568398034200072, + "learning_rate": 4.2191707871821736e-05, + "loss": 0.045, + "num_input_tokens_seen": 72729040, + "step": 59760 + }, + { + "epoch": 6.656086423877937, + "grad_norm": 0.1494813710451126, + "learning_rate": 4.2189943740268014e-05, + "loss": 0.066, + "num_input_tokens_seen": 72735088, + "step": 59765 + }, + { + "epoch": 6.656643278761555, + "grad_norm": 0.009493590332567692, + "learning_rate": 4.21881794463415e-05, + "loss": 0.1169, + "num_input_tokens_seen": 72741360, + "step": 59770 + }, + { + "epoch": 6.657200133645172, + "grad_norm": 0.049828462302684784, + "learning_rate": 4.2186414990058856e-05, + "loss": 0.0643, + "num_input_tokens_seen": 72747408, + "step": 59775 + }, + { + "epoch": 6.6577569885287895, + "grad_norm": 0.39166009426116943, + "learning_rate": 4.2184650371436754e-05, + "loss": 0.0377, + "num_input_tokens_seen": 72753264, + "step": 59780 + }, + { + "epoch": 6.658313843412406, + "grad_norm": 1.313020944595337, + "learning_rate": 4.2182885590491866e-05, + "loss": 0.1235, + "num_input_tokens_seen": 72758992, + "step": 59785 + }, + { + "epoch": 6.658870698296024, + "grad_norm": 0.002869085408747196, + "learning_rate": 4.2181120647240856e-05, + "loss": 0.0108, + "num_input_tokens_seen": 72765200, + "step": 59790 + }, + { + "epoch": 6.659427553179642, + "grad_norm": 1.236714482307434, + "learning_rate": 4.2179355541700394e-05, + "loss": 0.0489, + "num_input_tokens_seen": 72771312, + "step": 59795 + }, + { + "epoch": 6.659984408063258, + "grad_norm": 0.2639075219631195, + "learning_rate": 4.2177590273887155e-05, + "loss": 0.0151, + "num_input_tokens_seen": 72777328, + "step": 59800 + }, + { + "epoch": 6.660541262946876, + "grad_norm": 1.6667262315750122, + "learning_rate": 4.217582484381781e-05, + "loss": 0.1489, + "num_input_tokens_seen": 72782832, + "step": 59805 + }, + { + "epoch": 6.661098117830493, + "grad_norm": 0.42581015825271606, + "learning_rate": 4.217405925150905e-05, + "loss": 0.016, + "num_input_tokens_seen": 72788944, + "step": 59810 + }, + { + "epoch": 6.661654972714111, + "grad_norm": 0.8423818349838257, + "learning_rate": 4.2172293496977524e-05, + "loss": 0.0777, + "num_input_tokens_seen": 72794960, + "step": 59815 + }, + { + "epoch": 6.662211827597728, + "grad_norm": 0.9476747512817383, + "learning_rate": 4.217052758023994e-05, + "loss": 0.1396, + "num_input_tokens_seen": 72800976, + "step": 59820 + }, + { + "epoch": 6.662768682481345, + "grad_norm": 0.08557721972465515, + "learning_rate": 4.216876150131296e-05, + "loss": 0.0499, + "num_input_tokens_seen": 72806384, + "step": 59825 + }, + { + "epoch": 6.663325537364963, + "grad_norm": 0.1675439178943634, + "learning_rate": 4.216699526021327e-05, + "loss": 0.0926, + "num_input_tokens_seen": 72812688, + "step": 59830 + }, + { + "epoch": 6.66388239224858, + "grad_norm": 0.5080960988998413, + "learning_rate": 4.216522885695757e-05, + "loss": 0.0125, + "num_input_tokens_seen": 72818800, + "step": 59835 + }, + { + "epoch": 6.664439247132197, + "grad_norm": 0.004013333935290575, + "learning_rate": 4.2163462291562516e-05, + "loss": 0.0031, + "num_input_tokens_seen": 72825104, + "step": 59840 + }, + { + "epoch": 6.664996102015815, + "grad_norm": 0.004921067971736193, + "learning_rate": 4.216169556404481e-05, + "loss": 0.0232, + "num_input_tokens_seen": 72831088, + "step": 59845 + }, + { + "epoch": 6.6655529568994325, + "grad_norm": 0.8010846972465515, + "learning_rate": 4.215992867442115e-05, + "loss": 0.13, + "num_input_tokens_seen": 72837104, + "step": 59850 + }, + { + "epoch": 6.666109811783049, + "grad_norm": 0.4047389030456543, + "learning_rate": 4.215816162270822e-05, + "loss": 0.0441, + "num_input_tokens_seen": 72842832, + "step": 59855 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.0032721308525651693, + "learning_rate": 4.2156394408922684e-05, + "loss": 0.1452, + "num_input_tokens_seen": 72848368, + "step": 59860 + }, + { + "epoch": 6.667223521550284, + "grad_norm": 0.08968397229909897, + "learning_rate": 4.215462703308127e-05, + "loss": 0.1109, + "num_input_tokens_seen": 72854480, + "step": 59865 + }, + { + "epoch": 6.667780376433901, + "grad_norm": 0.0015588985988870263, + "learning_rate": 4.2152859495200664e-05, + "loss": 0.0277, + "num_input_tokens_seen": 72860560, + "step": 59870 + }, + { + "epoch": 6.668337231317519, + "grad_norm": 0.004598663654178381, + "learning_rate": 4.215109179529755e-05, + "loss": 0.0207, + "num_input_tokens_seen": 72866576, + "step": 59875 + }, + { + "epoch": 6.668894086201136, + "grad_norm": 0.2407314032316208, + "learning_rate": 4.214932393338864e-05, + "loss": 0.0727, + "num_input_tokens_seen": 72872208, + "step": 59880 + }, + { + "epoch": 6.669450941084754, + "grad_norm": 0.299060583114624, + "learning_rate": 4.214755590949062e-05, + "loss": 0.0298, + "num_input_tokens_seen": 72877808, + "step": 59885 + }, + { + "epoch": 6.67000779596837, + "grad_norm": 0.02937370166182518, + "learning_rate": 4.2145787723620196e-05, + "loss": 0.0085, + "num_input_tokens_seen": 72883824, + "step": 59890 + }, + { + "epoch": 6.670564650851988, + "grad_norm": 0.8038843870162964, + "learning_rate": 4.2144019375794075e-05, + "loss": 0.0716, + "num_input_tokens_seen": 72890000, + "step": 59895 + }, + { + "epoch": 6.671121505735606, + "grad_norm": 0.0780310332775116, + "learning_rate": 4.214225086602895e-05, + "loss": 0.0119, + "num_input_tokens_seen": 72896208, + "step": 59900 + }, + { + "epoch": 6.6716783606192225, + "grad_norm": 0.04919300973415375, + "learning_rate": 4.214048219434154e-05, + "loss": 0.0336, + "num_input_tokens_seen": 72902640, + "step": 59905 + }, + { + "epoch": 6.67223521550284, + "grad_norm": 0.21423572301864624, + "learning_rate": 4.2138713360748526e-05, + "loss": 0.035, + "num_input_tokens_seen": 72908880, + "step": 59910 + }, + { + "epoch": 6.672792070386457, + "grad_norm": 0.2863806188106537, + "learning_rate": 4.213694436526665e-05, + "loss": 0.0523, + "num_input_tokens_seen": 72915024, + "step": 59915 + }, + { + "epoch": 6.673348925270075, + "grad_norm": 0.940504252910614, + "learning_rate": 4.213517520791259e-05, + "loss": 0.0299, + "num_input_tokens_seen": 72920592, + "step": 59920 + }, + { + "epoch": 6.673905780153692, + "grad_norm": 0.6523565649986267, + "learning_rate": 4.2133405888703085e-05, + "loss": 0.1018, + "num_input_tokens_seen": 72926640, + "step": 59925 + }, + { + "epoch": 6.674462635037309, + "grad_norm": 0.0034768579062074423, + "learning_rate": 4.2131636407654826e-05, + "loss": 0.0106, + "num_input_tokens_seen": 72932656, + "step": 59930 + }, + { + "epoch": 6.675019489920927, + "grad_norm": 0.05996245518326759, + "learning_rate": 4.212986676478454e-05, + "loss": 0.0755, + "num_input_tokens_seen": 72938512, + "step": 59935 + }, + { + "epoch": 6.6755763448045435, + "grad_norm": 0.17215058207511902, + "learning_rate": 4.2128096960108935e-05, + "loss": 0.0229, + "num_input_tokens_seen": 72944464, + "step": 59940 + }, + { + "epoch": 6.676133199688161, + "grad_norm": 0.13486813008785248, + "learning_rate": 4.2126326993644736e-05, + "loss": 0.0847, + "num_input_tokens_seen": 72950672, + "step": 59945 + }, + { + "epoch": 6.676690054571779, + "grad_norm": 0.5501511693000793, + "learning_rate": 4.2124556865408656e-05, + "loss": 0.0572, + "num_input_tokens_seen": 72956944, + "step": 59950 + }, + { + "epoch": 6.677246909455396, + "grad_norm": 0.15524077415466309, + "learning_rate": 4.212278657541741e-05, + "loss": 0.1565, + "num_input_tokens_seen": 72962352, + "step": 59955 + }, + { + "epoch": 6.677803764339013, + "grad_norm": 0.0032588138710707426, + "learning_rate": 4.212101612368773e-05, + "loss": 0.0597, + "num_input_tokens_seen": 72968112, + "step": 59960 + }, + { + "epoch": 6.67836061922263, + "grad_norm": 0.4667164981365204, + "learning_rate": 4.2119245510236335e-05, + "loss": 0.2486, + "num_input_tokens_seen": 72973808, + "step": 59965 + }, + { + "epoch": 6.678917474106248, + "grad_norm": 0.0787000060081482, + "learning_rate": 4.211747473507995e-05, + "loss": 0.0886, + "num_input_tokens_seen": 72979344, + "step": 59970 + }, + { + "epoch": 6.6794743289898655, + "grad_norm": 0.015287057496607304, + "learning_rate": 4.211570379823531e-05, + "loss": 0.0468, + "num_input_tokens_seen": 72985424, + "step": 59975 + }, + { + "epoch": 6.680031183873482, + "grad_norm": 1.4680010080337524, + "learning_rate": 4.211393269971913e-05, + "loss": 0.1073, + "num_input_tokens_seen": 72991024, + "step": 59980 + }, + { + "epoch": 6.6805880387571, + "grad_norm": 1.5508067607879639, + "learning_rate": 4.211216143954814e-05, + "loss": 0.1433, + "num_input_tokens_seen": 72997008, + "step": 59985 + }, + { + "epoch": 6.681144893640717, + "grad_norm": 0.03569776192307472, + "learning_rate": 4.2110390017739074e-05, + "loss": 0.0041, + "num_input_tokens_seen": 73003216, + "step": 59990 + }, + { + "epoch": 6.681701748524334, + "grad_norm": 0.23740123212337494, + "learning_rate": 4.210861843430867e-05, + "loss": 0.0568, + "num_input_tokens_seen": 73009392, + "step": 59995 + }, + { + "epoch": 6.682258603407952, + "grad_norm": 1.1227641105651855, + "learning_rate": 4.210684668927366e-05, + "loss": 0.0812, + "num_input_tokens_seen": 73015408, + "step": 60000 + }, + { + "epoch": 6.682815458291569, + "grad_norm": 0.01490867044776678, + "learning_rate": 4.210507478265078e-05, + "loss": 0.1065, + "num_input_tokens_seen": 73021360, + "step": 60005 + }, + { + "epoch": 6.6833723131751865, + "grad_norm": 1.154287576675415, + "learning_rate": 4.210330271445675e-05, + "loss": 0.0701, + "num_input_tokens_seen": 73027344, + "step": 60010 + }, + { + "epoch": 6.683929168058804, + "grad_norm": 0.1591719686985016, + "learning_rate": 4.2101530484708335e-05, + "loss": 0.022, + "num_input_tokens_seen": 73033200, + "step": 60015 + }, + { + "epoch": 6.684486022942421, + "grad_norm": 0.29885491728782654, + "learning_rate": 4.209975809342226e-05, + "loss": 0.0812, + "num_input_tokens_seen": 73039216, + "step": 60020 + }, + { + "epoch": 6.685042877826039, + "grad_norm": 0.061570148915052414, + "learning_rate": 4.209798554061527e-05, + "loss": 0.0957, + "num_input_tokens_seen": 73045072, + "step": 60025 + }, + { + "epoch": 6.685599732709656, + "grad_norm": 0.0006105592474341393, + "learning_rate": 4.2096212826304104e-05, + "loss": 0.0979, + "num_input_tokens_seen": 73051408, + "step": 60030 + }, + { + "epoch": 6.686156587593273, + "grad_norm": 0.49715033173561096, + "learning_rate": 4.209443995050552e-05, + "loss": 0.0556, + "num_input_tokens_seen": 73057680, + "step": 60035 + }, + { + "epoch": 6.686713442476891, + "grad_norm": 0.2353569120168686, + "learning_rate": 4.209266691323625e-05, + "loss": 0.0087, + "num_input_tokens_seen": 73063632, + "step": 60040 + }, + { + "epoch": 6.687270297360508, + "grad_norm": 0.032630931586027145, + "learning_rate": 4.209089371451304e-05, + "loss": 0.1483, + "num_input_tokens_seen": 73069872, + "step": 60045 + }, + { + "epoch": 6.687827152244125, + "grad_norm": 0.815508246421814, + "learning_rate": 4.2089120354352654e-05, + "loss": 0.1464, + "num_input_tokens_seen": 73076016, + "step": 60050 + }, + { + "epoch": 6.688384007127743, + "grad_norm": 1.7144074440002441, + "learning_rate": 4.2087346832771825e-05, + "loss": 0.0505, + "num_input_tokens_seen": 73082416, + "step": 60055 + }, + { + "epoch": 6.68894086201136, + "grad_norm": 0.7657740116119385, + "learning_rate": 4.208557314978733e-05, + "loss": 0.0624, + "num_input_tokens_seen": 73088336, + "step": 60060 + }, + { + "epoch": 6.689497716894977, + "grad_norm": 0.033877044916152954, + "learning_rate": 4.208379930541589e-05, + "loss": 0.0429, + "num_input_tokens_seen": 73094320, + "step": 60065 + }, + { + "epoch": 6.690054571778594, + "grad_norm": 0.5690578818321228, + "learning_rate": 4.208202529967429e-05, + "loss": 0.1006, + "num_input_tokens_seen": 73100528, + "step": 60070 + }, + { + "epoch": 6.690611426662212, + "grad_norm": 0.06575646251440048, + "learning_rate": 4.2080251132579274e-05, + "loss": 0.0263, + "num_input_tokens_seen": 73106512, + "step": 60075 + }, + { + "epoch": 6.6911682815458295, + "grad_norm": 1.281134843826294, + "learning_rate": 4.20784768041476e-05, + "loss": 0.0537, + "num_input_tokens_seen": 73112656, + "step": 60080 + }, + { + "epoch": 6.691725136429446, + "grad_norm": 0.42044034600257874, + "learning_rate": 4.207670231439603e-05, + "loss": 0.1267, + "num_input_tokens_seen": 73118736, + "step": 60085 + }, + { + "epoch": 6.692281991313064, + "grad_norm": 1.971671462059021, + "learning_rate": 4.207492766334132e-05, + "loss": 0.1734, + "num_input_tokens_seen": 73124528, + "step": 60090 + }, + { + "epoch": 6.692838846196681, + "grad_norm": 1.782248616218567, + "learning_rate": 4.207315285100025e-05, + "loss": 0.0694, + "num_input_tokens_seen": 73130704, + "step": 60095 + }, + { + "epoch": 6.6933957010802985, + "grad_norm": 1.5060181617736816, + "learning_rate": 4.207137787738956e-05, + "loss": 0.116, + "num_input_tokens_seen": 73136976, + "step": 60100 + }, + { + "epoch": 6.693952555963916, + "grad_norm": 0.32431185245513916, + "learning_rate": 4.2069602742526036e-05, + "loss": 0.0519, + "num_input_tokens_seen": 73143088, + "step": 60105 + }, + { + "epoch": 6.694509410847533, + "grad_norm": 0.32664376497268677, + "learning_rate": 4.206782744642644e-05, + "loss": 0.0269, + "num_input_tokens_seen": 73148944, + "step": 60110 + }, + { + "epoch": 6.695066265731151, + "grad_norm": 0.009473818354308605, + "learning_rate": 4.206605198910754e-05, + "loss": 0.0138, + "num_input_tokens_seen": 73155184, + "step": 60115 + }, + { + "epoch": 6.695623120614767, + "grad_norm": 0.9309558272361755, + "learning_rate": 4.20642763705861e-05, + "loss": 0.0408, + "num_input_tokens_seen": 73161584, + "step": 60120 + }, + { + "epoch": 6.696179975498385, + "grad_norm": 0.0002134662790922448, + "learning_rate": 4.2062500590878894e-05, + "loss": 0.0217, + "num_input_tokens_seen": 73167760, + "step": 60125 + }, + { + "epoch": 6.696736830382003, + "grad_norm": 2.1685945987701416, + "learning_rate": 4.206072465000271e-05, + "loss": 0.2064, + "num_input_tokens_seen": 73174096, + "step": 60130 + }, + { + "epoch": 6.6972936852656195, + "grad_norm": 0.8383061289787292, + "learning_rate": 4.205894854797431e-05, + "loss": 0.1324, + "num_input_tokens_seen": 73180240, + "step": 60135 + }, + { + "epoch": 6.697850540149237, + "grad_norm": 0.41881418228149414, + "learning_rate": 4.205717228481047e-05, + "loss": 0.0955, + "num_input_tokens_seen": 73186576, + "step": 60140 + }, + { + "epoch": 6.698407395032854, + "grad_norm": 0.12303746491670609, + "learning_rate": 4.205539586052797e-05, + "loss": 0.0716, + "num_input_tokens_seen": 73192912, + "step": 60145 + }, + { + "epoch": 6.698964249916472, + "grad_norm": 0.27724018692970276, + "learning_rate": 4.2053619275143595e-05, + "loss": 0.2275, + "num_input_tokens_seen": 73198768, + "step": 60150 + }, + { + "epoch": 6.699521104800089, + "grad_norm": 0.07818038761615753, + "learning_rate": 4.205184252867412e-05, + "loss": 0.0696, + "num_input_tokens_seen": 73204400, + "step": 60155 + }, + { + "epoch": 6.700077959683706, + "grad_norm": 0.29052844643592834, + "learning_rate": 4.205006562113634e-05, + "loss": 0.0446, + "num_input_tokens_seen": 73210416, + "step": 60160 + }, + { + "epoch": 6.700634814567324, + "grad_norm": 0.580255925655365, + "learning_rate": 4.2048288552547024e-05, + "loss": 0.0593, + "num_input_tokens_seen": 73216592, + "step": 60165 + }, + { + "epoch": 6.7011916694509415, + "grad_norm": 0.06790001690387726, + "learning_rate": 4.204651132292296e-05, + "loss": 0.0667, + "num_input_tokens_seen": 73221872, + "step": 60170 + }, + { + "epoch": 6.701748524334558, + "grad_norm": 0.016467008739709854, + "learning_rate": 4.204473393228094e-05, + "loss": 0.0304, + "num_input_tokens_seen": 73227888, + "step": 60175 + }, + { + "epoch": 6.702305379218176, + "grad_norm": 0.6526655554771423, + "learning_rate": 4.204295638063775e-05, + "loss": 0.0245, + "num_input_tokens_seen": 73233840, + "step": 60180 + }, + { + "epoch": 6.702862234101793, + "grad_norm": 0.38871467113494873, + "learning_rate": 4.2041178668010196e-05, + "loss": 0.0121, + "num_input_tokens_seen": 73239824, + "step": 60185 + }, + { + "epoch": 6.70341908898541, + "grad_norm": 0.0011957156239077449, + "learning_rate": 4.203940079441504e-05, + "loss": 0.0759, + "num_input_tokens_seen": 73245488, + "step": 60190 + }, + { + "epoch": 6.703975943869028, + "grad_norm": 0.17945599555969238, + "learning_rate": 4.20376227598691e-05, + "loss": 0.0113, + "num_input_tokens_seen": 73251632, + "step": 60195 + }, + { + "epoch": 6.704532798752645, + "grad_norm": 1.3785473108291626, + "learning_rate": 4.203584456438917e-05, + "loss": 0.2079, + "num_input_tokens_seen": 73257712, + "step": 60200 + }, + { + "epoch": 6.7050896536362625, + "grad_norm": 0.0029432366136461496, + "learning_rate": 4.203406620799203e-05, + "loss": 0.0395, + "num_input_tokens_seen": 73263600, + "step": 60205 + }, + { + "epoch": 6.70564650851988, + "grad_norm": 0.7926346659660339, + "learning_rate": 4.203228769069448e-05, + "loss": 0.0726, + "num_input_tokens_seen": 73269904, + "step": 60210 + }, + { + "epoch": 6.706203363403497, + "grad_norm": 0.011824448592960835, + "learning_rate": 4.2030509012513334e-05, + "loss": 0.0862, + "num_input_tokens_seen": 73276240, + "step": 60215 + }, + { + "epoch": 6.706760218287115, + "grad_norm": 0.7335628867149353, + "learning_rate": 4.202873017346539e-05, + "loss": 0.1958, + "num_input_tokens_seen": 73282032, + "step": 60220 + }, + { + "epoch": 6.7073170731707314, + "grad_norm": 0.011681719683110714, + "learning_rate": 4.202695117356744e-05, + "loss": 0.0516, + "num_input_tokens_seen": 73288112, + "step": 60225 + }, + { + "epoch": 6.707873928054349, + "grad_norm": 0.03974241763353348, + "learning_rate": 4.20251720128363e-05, + "loss": 0.0433, + "num_input_tokens_seen": 73294320, + "step": 60230 + }, + { + "epoch": 6.708430782937967, + "grad_norm": 0.003936660010367632, + "learning_rate": 4.202339269128877e-05, + "loss": 0.0749, + "num_input_tokens_seen": 73300112, + "step": 60235 + }, + { + "epoch": 6.708987637821584, + "grad_norm": 1.2397866249084473, + "learning_rate": 4.202161320894165e-05, + "loss": 0.1184, + "num_input_tokens_seen": 73306032, + "step": 60240 + }, + { + "epoch": 6.709544492705201, + "grad_norm": 0.6798334717750549, + "learning_rate": 4.201983356581176e-05, + "loss": 0.0571, + "num_input_tokens_seen": 73312112, + "step": 60245 + }, + { + "epoch": 6.710101347588818, + "grad_norm": 0.1732158064842224, + "learning_rate": 4.201805376191591e-05, + "loss": 0.0323, + "num_input_tokens_seen": 73318448, + "step": 60250 + }, + { + "epoch": 6.710658202472436, + "grad_norm": 0.45246976613998413, + "learning_rate": 4.20162737972709e-05, + "loss": 0.097, + "num_input_tokens_seen": 73324368, + "step": 60255 + }, + { + "epoch": 6.711215057356053, + "grad_norm": 1.8049100637435913, + "learning_rate": 4.2014493671893554e-05, + "loss": 0.2014, + "num_input_tokens_seen": 73330608, + "step": 60260 + }, + { + "epoch": 6.71177191223967, + "grad_norm": 1.2772246599197388, + "learning_rate": 4.2012713385800686e-05, + "loss": 0.0887, + "num_input_tokens_seen": 73337136, + "step": 60265 + }, + { + "epoch": 6.712328767123288, + "grad_norm": 1.317560076713562, + "learning_rate": 4.2010932939009106e-05, + "loss": 0.1077, + "num_input_tokens_seen": 73343152, + "step": 60270 + }, + { + "epoch": 6.712885622006905, + "grad_norm": 0.08272285759449005, + "learning_rate": 4.200915233153564e-05, + "loss": 0.1074, + "num_input_tokens_seen": 73349328, + "step": 60275 + }, + { + "epoch": 6.713442476890522, + "grad_norm": 0.9383803009986877, + "learning_rate": 4.200737156339709e-05, + "loss": 0.0405, + "num_input_tokens_seen": 73355440, + "step": 60280 + }, + { + "epoch": 6.71399933177414, + "grad_norm": 0.004532305523753166, + "learning_rate": 4.20055906346103e-05, + "loss": 0.0046, + "num_input_tokens_seen": 73361840, + "step": 60285 + }, + { + "epoch": 6.714556186657757, + "grad_norm": 0.1187586560845375, + "learning_rate": 4.200380954519208e-05, + "loss": 0.0194, + "num_input_tokens_seen": 73367984, + "step": 60290 + }, + { + "epoch": 6.7151130415413745, + "grad_norm": 0.07810000330209732, + "learning_rate": 4.200202829515926e-05, + "loss": 0.0221, + "num_input_tokens_seen": 73373968, + "step": 60295 + }, + { + "epoch": 6.715669896424991, + "grad_norm": 0.028295716270804405, + "learning_rate": 4.200024688452866e-05, + "loss": 0.0049, + "num_input_tokens_seen": 73380208, + "step": 60300 + }, + { + "epoch": 6.716226751308609, + "grad_norm": 0.12710636854171753, + "learning_rate": 4.1998465313317106e-05, + "loss": 0.0167, + "num_input_tokens_seen": 73386576, + "step": 60305 + }, + { + "epoch": 6.716783606192227, + "grad_norm": 0.47637322545051575, + "learning_rate": 4.1996683581541425e-05, + "loss": 0.0634, + "num_input_tokens_seen": 73392592, + "step": 60310 + }, + { + "epoch": 6.717340461075843, + "grad_norm": 0.05717965215444565, + "learning_rate": 4.1994901689218454e-05, + "loss": 0.117, + "num_input_tokens_seen": 73398736, + "step": 60315 + }, + { + "epoch": 6.717897315959461, + "grad_norm": 0.671170175075531, + "learning_rate": 4.199311963636502e-05, + "loss": 0.068, + "num_input_tokens_seen": 73404912, + "step": 60320 + }, + { + "epoch": 6.718454170843078, + "grad_norm": 0.10424929112195969, + "learning_rate": 4.1991337422997954e-05, + "loss": 0.0454, + "num_input_tokens_seen": 73411472, + "step": 60325 + }, + { + "epoch": 6.7190110257266955, + "grad_norm": 0.10541824251413345, + "learning_rate": 4.1989555049134096e-05, + "loss": 0.0825, + "num_input_tokens_seen": 73417232, + "step": 60330 + }, + { + "epoch": 6.719567880610313, + "grad_norm": 0.09746172279119492, + "learning_rate": 4.198777251479027e-05, + "loss": 0.0765, + "num_input_tokens_seen": 73423408, + "step": 60335 + }, + { + "epoch": 6.72012473549393, + "grad_norm": 0.15773922204971313, + "learning_rate": 4.198598981998334e-05, + "loss": 0.0243, + "num_input_tokens_seen": 73429520, + "step": 60340 + }, + { + "epoch": 6.720681590377548, + "grad_norm": 1.4261500835418701, + "learning_rate": 4.198420696473011e-05, + "loss": 0.1518, + "num_input_tokens_seen": 73435664, + "step": 60345 + }, + { + "epoch": 6.721238445261165, + "grad_norm": 1.6712485551834106, + "learning_rate": 4.198242394904744e-05, + "loss": 0.0913, + "num_input_tokens_seen": 73441616, + "step": 60350 + }, + { + "epoch": 6.721795300144782, + "grad_norm": 1.882398009300232, + "learning_rate": 4.198064077295218e-05, + "loss": 0.154, + "num_input_tokens_seen": 73447536, + "step": 60355 + }, + { + "epoch": 6.7223521550284, + "grad_norm": 0.03620624169707298, + "learning_rate": 4.197885743646116e-05, + "loss": 0.0665, + "num_input_tokens_seen": 73453424, + "step": 60360 + }, + { + "epoch": 6.722909009912017, + "grad_norm": 0.13070723414421082, + "learning_rate": 4.197707393959122e-05, + "loss": 0.0219, + "num_input_tokens_seen": 73459408, + "step": 60365 + }, + { + "epoch": 6.723465864795634, + "grad_norm": 0.049247756600379944, + "learning_rate": 4.197529028235922e-05, + "loss": 0.0077, + "num_input_tokens_seen": 73465520, + "step": 60370 + }, + { + "epoch": 6.724022719679252, + "grad_norm": 0.8899509310722351, + "learning_rate": 4.197350646478201e-05, + "loss": 0.1549, + "num_input_tokens_seen": 73471312, + "step": 60375 + }, + { + "epoch": 6.724579574562869, + "grad_norm": 0.33644217252731323, + "learning_rate": 4.197172248687642e-05, + "loss": 0.0444, + "num_input_tokens_seen": 73477200, + "step": 60380 + }, + { + "epoch": 6.725136429446486, + "grad_norm": 0.7017821073532104, + "learning_rate": 4.1969938348659324e-05, + "loss": 0.0846, + "num_input_tokens_seen": 73483312, + "step": 60385 + }, + { + "epoch": 6.725693284330104, + "grad_norm": 0.4126347005367279, + "learning_rate": 4.196815405014756e-05, + "loss": 0.0418, + "num_input_tokens_seen": 73489456, + "step": 60390 + }, + { + "epoch": 6.726250139213721, + "grad_norm": 1.8057512044906616, + "learning_rate": 4.196636959135798e-05, + "loss": 0.0351, + "num_input_tokens_seen": 73495760, + "step": 60395 + }, + { + "epoch": 6.7268069940973385, + "grad_norm": 0.3547213673591614, + "learning_rate": 4.196458497230745e-05, + "loss": 0.0549, + "num_input_tokens_seen": 73501808, + "step": 60400 + }, + { + "epoch": 6.727363848980955, + "grad_norm": 0.5476040840148926, + "learning_rate": 4.196280019301283e-05, + "loss": 0.083, + "num_input_tokens_seen": 73508080, + "step": 60405 + }, + { + "epoch": 6.727920703864573, + "grad_norm": 1.5454974174499512, + "learning_rate": 4.196101525349096e-05, + "loss": 0.0697, + "num_input_tokens_seen": 73514256, + "step": 60410 + }, + { + "epoch": 6.728477558748191, + "grad_norm": 0.06278669834136963, + "learning_rate": 4.1959230153758725e-05, + "loss": 0.1157, + "num_input_tokens_seen": 73520272, + "step": 60415 + }, + { + "epoch": 6.729034413631807, + "grad_norm": 0.11875518411397934, + "learning_rate": 4.195744489383297e-05, + "loss": 0.0266, + "num_input_tokens_seen": 73526448, + "step": 60420 + }, + { + "epoch": 6.729591268515425, + "grad_norm": 0.5298284888267517, + "learning_rate": 4.1955659473730555e-05, + "loss": 0.0766, + "num_input_tokens_seen": 73532688, + "step": 60425 + }, + { + "epoch": 6.730148123399042, + "grad_norm": 0.40620312094688416, + "learning_rate": 4.1953873893468355e-05, + "loss": 0.0305, + "num_input_tokens_seen": 73538800, + "step": 60430 + }, + { + "epoch": 6.73070497828266, + "grad_norm": 1.201425313949585, + "learning_rate": 4.195208815306323e-05, + "loss": 0.0552, + "num_input_tokens_seen": 73544976, + "step": 60435 + }, + { + "epoch": 6.731261833166277, + "grad_norm": 0.013833236880600452, + "learning_rate": 4.195030225253206e-05, + "loss": 0.0577, + "num_input_tokens_seen": 73551312, + "step": 60440 + }, + { + "epoch": 6.731818688049894, + "grad_norm": 0.011729584075510502, + "learning_rate": 4.194851619189169e-05, + "loss": 0.0363, + "num_input_tokens_seen": 73557456, + "step": 60445 + }, + { + "epoch": 6.732375542933512, + "grad_norm": 1.8319119215011597, + "learning_rate": 4.194672997115902e-05, + "loss": 0.0929, + "num_input_tokens_seen": 73563280, + "step": 60450 + }, + { + "epoch": 6.7329323978171285, + "grad_norm": 0.1424119919538498, + "learning_rate": 4.1944943590350905e-05, + "loss": 0.2569, + "num_input_tokens_seen": 73569424, + "step": 60455 + }, + { + "epoch": 6.733489252700746, + "grad_norm": 0.8795657753944397, + "learning_rate": 4.194315704948422e-05, + "loss": 0.1055, + "num_input_tokens_seen": 73575696, + "step": 60460 + }, + { + "epoch": 6.734046107584364, + "grad_norm": 0.832224428653717, + "learning_rate": 4.194137034857584e-05, + "loss": 0.0829, + "num_input_tokens_seen": 73581968, + "step": 60465 + }, + { + "epoch": 6.734602962467981, + "grad_norm": 0.7916579842567444, + "learning_rate": 4.193958348764264e-05, + "loss": 0.035, + "num_input_tokens_seen": 73588144, + "step": 60470 + }, + { + "epoch": 6.735159817351598, + "grad_norm": 0.0036301175132393837, + "learning_rate": 4.193779646670151e-05, + "loss": 0.0155, + "num_input_tokens_seen": 73594192, + "step": 60475 + }, + { + "epoch": 6.735716672235215, + "grad_norm": 0.1378290206193924, + "learning_rate": 4.193600928576932e-05, + "loss": 0.0294, + "num_input_tokens_seen": 73600400, + "step": 60480 + }, + { + "epoch": 6.736273527118833, + "grad_norm": 0.5858573317527771, + "learning_rate": 4.1934221944862955e-05, + "loss": 0.0446, + "num_input_tokens_seen": 73606352, + "step": 60485 + }, + { + "epoch": 6.73683038200245, + "grad_norm": 0.9668448567390442, + "learning_rate": 4.1932434443999294e-05, + "loss": 0.0558, + "num_input_tokens_seen": 73612272, + "step": 60490 + }, + { + "epoch": 6.737387236886067, + "grad_norm": 0.4731861352920532, + "learning_rate": 4.193064678319522e-05, + "loss": 0.1084, + "num_input_tokens_seen": 73618288, + "step": 60495 + }, + { + "epoch": 6.737944091769685, + "grad_norm": 0.5140952467918396, + "learning_rate": 4.192885896246763e-05, + "loss": 0.1305, + "num_input_tokens_seen": 73624624, + "step": 60500 + }, + { + "epoch": 6.738500946653302, + "grad_norm": 0.9086169004440308, + "learning_rate": 4.19270709818334e-05, + "loss": 0.0451, + "num_input_tokens_seen": 73630864, + "step": 60505 + }, + { + "epoch": 6.739057801536919, + "grad_norm": 0.0002654317650012672, + "learning_rate": 4.1925282841309424e-05, + "loss": 0.0107, + "num_input_tokens_seen": 73636944, + "step": 60510 + }, + { + "epoch": 6.739614656420537, + "grad_norm": 0.6186100840568542, + "learning_rate": 4.192349454091259e-05, + "loss": 0.0278, + "num_input_tokens_seen": 73643120, + "step": 60515 + }, + { + "epoch": 6.740171511304154, + "grad_norm": 0.14995932579040527, + "learning_rate": 4.1921706080659795e-05, + "loss": 0.0097, + "num_input_tokens_seen": 73648944, + "step": 60520 + }, + { + "epoch": 6.7407283661877715, + "grad_norm": 0.0018887340556830168, + "learning_rate": 4.191991746056792e-05, + "loss": 0.0843, + "num_input_tokens_seen": 73654800, + "step": 60525 + }, + { + "epoch": 6.741285221071389, + "grad_norm": 0.7847327589988708, + "learning_rate": 4.1918128680653875e-05, + "loss": 0.0958, + "num_input_tokens_seen": 73660656, + "step": 60530 + }, + { + "epoch": 6.741842075955006, + "grad_norm": 0.0005883729318156838, + "learning_rate": 4.1916339740934546e-05, + "loss": 0.0231, + "num_input_tokens_seen": 73666544, + "step": 60535 + }, + { + "epoch": 6.742398930838624, + "grad_norm": 0.5126585364341736, + "learning_rate": 4.191455064142684e-05, + "loss": 0.1732, + "num_input_tokens_seen": 73672528, + "step": 60540 + }, + { + "epoch": 6.74295578572224, + "grad_norm": 0.028195291757583618, + "learning_rate": 4.1912761382147645e-05, + "loss": 0.0098, + "num_input_tokens_seen": 73678864, + "step": 60545 + }, + { + "epoch": 6.743512640605858, + "grad_norm": 0.08076053112745285, + "learning_rate": 4.191097196311388e-05, + "loss": 0.0532, + "num_input_tokens_seen": 73684944, + "step": 60550 + }, + { + "epoch": 6.744069495489476, + "grad_norm": 0.012149150483310223, + "learning_rate": 4.1909182384342426e-05, + "loss": 0.0748, + "num_input_tokens_seen": 73691152, + "step": 60555 + }, + { + "epoch": 6.7446263503730925, + "grad_norm": 0.07281852513551712, + "learning_rate": 4.19073926458502e-05, + "loss": 0.1412, + "num_input_tokens_seen": 73697264, + "step": 60560 + }, + { + "epoch": 6.74518320525671, + "grad_norm": 0.5646998286247253, + "learning_rate": 4.19056027476541e-05, + "loss": 0.0312, + "num_input_tokens_seen": 73703536, + "step": 60565 + }, + { + "epoch": 6.745740060140328, + "grad_norm": 0.0280381478369236, + "learning_rate": 4.1903812689771045e-05, + "loss": 0.0401, + "num_input_tokens_seen": 73709520, + "step": 60570 + }, + { + "epoch": 6.746296915023945, + "grad_norm": 0.3036043643951416, + "learning_rate": 4.190202247221793e-05, + "loss": 0.096, + "num_input_tokens_seen": 73715088, + "step": 60575 + }, + { + "epoch": 6.746853769907562, + "grad_norm": 0.21233487129211426, + "learning_rate": 4.1900232095011675e-05, + "loss": 0.0213, + "num_input_tokens_seen": 73721392, + "step": 60580 + }, + { + "epoch": 6.747410624791179, + "grad_norm": 0.003654641332104802, + "learning_rate": 4.189844155816919e-05, + "loss": 0.0764, + "num_input_tokens_seen": 73727312, + "step": 60585 + }, + { + "epoch": 6.747967479674797, + "grad_norm": 0.2962397038936615, + "learning_rate": 4.189665086170738e-05, + "loss": 0.0108, + "num_input_tokens_seen": 73733648, + "step": 60590 + }, + { + "epoch": 6.7485243345584145, + "grad_norm": 0.020917585119605064, + "learning_rate": 4.1894860005643165e-05, + "loss": 0.0825, + "num_input_tokens_seen": 73739792, + "step": 60595 + }, + { + "epoch": 6.749081189442031, + "grad_norm": 0.637866199016571, + "learning_rate": 4.189306898999347e-05, + "loss": 0.0931, + "num_input_tokens_seen": 73745712, + "step": 60600 + }, + { + "epoch": 6.749638044325649, + "grad_norm": 2.149277687072754, + "learning_rate": 4.1891277814775195e-05, + "loss": 0.1058, + "num_input_tokens_seen": 73751088, + "step": 60605 + }, + { + "epoch": 6.750194899209266, + "grad_norm": 0.20197467505931854, + "learning_rate": 4.188948648000527e-05, + "loss": 0.0333, + "num_input_tokens_seen": 73757424, + "step": 60610 + }, + { + "epoch": 6.750751754092883, + "grad_norm": 0.2619592845439911, + "learning_rate": 4.188769498570061e-05, + "loss": 0.0479, + "num_input_tokens_seen": 73763504, + "step": 60615 + }, + { + "epoch": 6.751308608976501, + "grad_norm": 0.18721090257167816, + "learning_rate": 4.188590333187815e-05, + "loss": 0.1369, + "num_input_tokens_seen": 73769808, + "step": 60620 + }, + { + "epoch": 6.751865463860118, + "grad_norm": 0.013688771985471249, + "learning_rate": 4.1884111518554795e-05, + "loss": 0.0762, + "num_input_tokens_seen": 73776080, + "step": 60625 + }, + { + "epoch": 6.7524223187437356, + "grad_norm": 0.5297878980636597, + "learning_rate": 4.1882319545747484e-05, + "loss": 0.0111, + "num_input_tokens_seen": 73781808, + "step": 60630 + }, + { + "epoch": 6.752979173627352, + "grad_norm": 0.10661892592906952, + "learning_rate": 4.188052741347314e-05, + "loss": 0.1217, + "num_input_tokens_seen": 73787952, + "step": 60635 + }, + { + "epoch": 6.75353602851097, + "grad_norm": 1.404969573020935, + "learning_rate": 4.1878735121748686e-05, + "loss": 0.0647, + "num_input_tokens_seen": 73794000, + "step": 60640 + }, + { + "epoch": 6.754092883394588, + "grad_norm": 0.0016385791823267937, + "learning_rate": 4.187694267059106e-05, + "loss": 0.0125, + "num_input_tokens_seen": 73800080, + "step": 60645 + }, + { + "epoch": 6.7546497382782045, + "grad_norm": 0.43662533164024353, + "learning_rate": 4.187515006001719e-05, + "loss": 0.0778, + "num_input_tokens_seen": 73805744, + "step": 60650 + }, + { + "epoch": 6.755206593161822, + "grad_norm": 0.4614941477775574, + "learning_rate": 4.1873357290044004e-05, + "loss": 0.086, + "num_input_tokens_seen": 73811792, + "step": 60655 + }, + { + "epoch": 6.755763448045439, + "grad_norm": 0.5079184174537659, + "learning_rate": 4.187156436068843e-05, + "loss": 0.0442, + "num_input_tokens_seen": 73818032, + "step": 60660 + }, + { + "epoch": 6.756320302929057, + "grad_norm": 0.4501284956932068, + "learning_rate": 4.186977127196743e-05, + "loss": 0.0441, + "num_input_tokens_seen": 73824336, + "step": 60665 + }, + { + "epoch": 6.756877157812674, + "grad_norm": 0.5232099890708923, + "learning_rate": 4.1867978023897916e-05, + "loss": 0.0179, + "num_input_tokens_seen": 73830224, + "step": 60670 + }, + { + "epoch": 6.757434012696291, + "grad_norm": 0.0003576598537620157, + "learning_rate": 4.186618461649684e-05, + "loss": 0.0077, + "num_input_tokens_seen": 73836592, + "step": 60675 + }, + { + "epoch": 6.757990867579909, + "grad_norm": 0.9411371946334839, + "learning_rate": 4.1864391049781137e-05, + "loss": 0.0865, + "num_input_tokens_seen": 73842896, + "step": 60680 + }, + { + "epoch": 6.7585477224635255, + "grad_norm": 1.197713851928711, + "learning_rate": 4.186259732376774e-05, + "loss": 0.1619, + "num_input_tokens_seen": 73849328, + "step": 60685 + }, + { + "epoch": 6.759104577347143, + "grad_norm": 0.013575490564107895, + "learning_rate": 4.1860803438473604e-05, + "loss": 0.0778, + "num_input_tokens_seen": 73855632, + "step": 60690 + }, + { + "epoch": 6.759661432230761, + "grad_norm": 0.9725555777549744, + "learning_rate": 4.1859009393915686e-05, + "loss": 0.0717, + "num_input_tokens_seen": 73861616, + "step": 60695 + }, + { + "epoch": 6.760218287114378, + "grad_norm": 0.6288174986839294, + "learning_rate": 4.18572151901109e-05, + "loss": 0.0659, + "num_input_tokens_seen": 73867888, + "step": 60700 + }, + { + "epoch": 6.760775141997995, + "grad_norm": 0.4103952944278717, + "learning_rate": 4.185542082707622e-05, + "loss": 0.0612, + "num_input_tokens_seen": 73873936, + "step": 60705 + }, + { + "epoch": 6.761331996881613, + "grad_norm": 0.0019216462969779968, + "learning_rate": 4.1853626304828584e-05, + "loss": 0.0304, + "num_input_tokens_seen": 73880016, + "step": 60710 + }, + { + "epoch": 6.76188885176523, + "grad_norm": 0.10521166026592255, + "learning_rate": 4.185183162338494e-05, + "loss": 0.1415, + "num_input_tokens_seen": 73886224, + "step": 60715 + }, + { + "epoch": 6.7624457066488475, + "grad_norm": 0.03671889007091522, + "learning_rate": 4.185003678276225e-05, + "loss": 0.0922, + "num_input_tokens_seen": 73891888, + "step": 60720 + }, + { + "epoch": 6.763002561532464, + "grad_norm": 0.048119187355041504, + "learning_rate": 4.184824178297746e-05, + "loss": 0.0161, + "num_input_tokens_seen": 73897904, + "step": 60725 + }, + { + "epoch": 6.763559416416082, + "grad_norm": 0.9724071025848389, + "learning_rate": 4.1846446624047525e-05, + "loss": 0.0841, + "num_input_tokens_seen": 73903600, + "step": 60730 + }, + { + "epoch": 6.7641162712997, + "grad_norm": 0.29294174909591675, + "learning_rate": 4.1844651305989414e-05, + "loss": 0.0129, + "num_input_tokens_seen": 73909872, + "step": 60735 + }, + { + "epoch": 6.764673126183316, + "grad_norm": 0.2952650189399719, + "learning_rate": 4.184285582882007e-05, + "loss": 0.0669, + "num_input_tokens_seen": 73915920, + "step": 60740 + }, + { + "epoch": 6.765229981066934, + "grad_norm": 0.026909640058875084, + "learning_rate": 4.184106019255645e-05, + "loss": 0.0206, + "num_input_tokens_seen": 73921776, + "step": 60745 + }, + { + "epoch": 6.765786835950552, + "grad_norm": 0.41927534341812134, + "learning_rate": 4.183926439721554e-05, + "loss": 0.0444, + "num_input_tokens_seen": 73928048, + "step": 60750 + }, + { + "epoch": 6.7663436908341685, + "grad_norm": 0.02209162898361683, + "learning_rate": 4.1837468442814276e-05, + "loss": 0.024, + "num_input_tokens_seen": 73934288, + "step": 60755 + }, + { + "epoch": 6.766900545717786, + "grad_norm": 2.2005462646484375, + "learning_rate": 4.1835672329369636e-05, + "loss": 0.0497, + "num_input_tokens_seen": 73940336, + "step": 60760 + }, + { + "epoch": 6.767457400601403, + "grad_norm": 0.006318815052509308, + "learning_rate": 4.183387605689858e-05, + "loss": 0.0678, + "num_input_tokens_seen": 73946608, + "step": 60765 + }, + { + "epoch": 6.768014255485021, + "grad_norm": 0.7600697875022888, + "learning_rate": 4.183207962541808e-05, + "loss": 0.0626, + "num_input_tokens_seen": 73952720, + "step": 60770 + }, + { + "epoch": 6.768571110368638, + "grad_norm": 0.11380241066217422, + "learning_rate": 4.1830283034945095e-05, + "loss": 0.0536, + "num_input_tokens_seen": 73958896, + "step": 60775 + }, + { + "epoch": 6.769127965252255, + "grad_norm": 0.9213221669197083, + "learning_rate": 4.182848628549661e-05, + "loss": 0.0345, + "num_input_tokens_seen": 73965136, + "step": 60780 + }, + { + "epoch": 6.769684820135873, + "grad_norm": 1.350919246673584, + "learning_rate": 4.182668937708959e-05, + "loss": 0.1164, + "num_input_tokens_seen": 73971312, + "step": 60785 + }, + { + "epoch": 6.77024167501949, + "grad_norm": 0.009775976650416851, + "learning_rate": 4.182489230974101e-05, + "loss": 0.0194, + "num_input_tokens_seen": 73977264, + "step": 60790 + }, + { + "epoch": 6.770798529903107, + "grad_norm": 0.2163275182247162, + "learning_rate": 4.1823095083467835e-05, + "loss": 0.032, + "num_input_tokens_seen": 73983472, + "step": 60795 + }, + { + "epoch": 6.771355384786725, + "grad_norm": 0.6966009140014648, + "learning_rate": 4.182129769828704e-05, + "loss": 0.0535, + "num_input_tokens_seen": 73989296, + "step": 60800 + }, + { + "epoch": 6.771912239670342, + "grad_norm": 0.042112186551094055, + "learning_rate": 4.181950015421563e-05, + "loss": 0.006, + "num_input_tokens_seen": 73995440, + "step": 60805 + }, + { + "epoch": 6.772469094553959, + "grad_norm": 0.3342476785182953, + "learning_rate": 4.1817702451270555e-05, + "loss": 0.0358, + "num_input_tokens_seen": 74001232, + "step": 60810 + }, + { + "epoch": 6.773025949437576, + "grad_norm": 0.25248080492019653, + "learning_rate": 4.1815904589468813e-05, + "loss": 0.016, + "num_input_tokens_seen": 74007312, + "step": 60815 + }, + { + "epoch": 6.773582804321194, + "grad_norm": 1.1262091398239136, + "learning_rate": 4.1814106568827374e-05, + "loss": 0.2625, + "num_input_tokens_seen": 74013520, + "step": 60820 + }, + { + "epoch": 6.7741396592048115, + "grad_norm": 2.098560333251953, + "learning_rate": 4.181230838936323e-05, + "loss": 0.1662, + "num_input_tokens_seen": 74019728, + "step": 60825 + }, + { + "epoch": 6.774696514088428, + "grad_norm": 1.2707663774490356, + "learning_rate": 4.1810510051093356e-05, + "loss": 0.077, + "num_input_tokens_seen": 74025872, + "step": 60830 + }, + { + "epoch": 6.775253368972046, + "grad_norm": 0.006590634118765593, + "learning_rate": 4.180871155403475e-05, + "loss": 0.0802, + "num_input_tokens_seen": 74032016, + "step": 60835 + }, + { + "epoch": 6.775810223855663, + "grad_norm": 0.21717076003551483, + "learning_rate": 4.1806912898204404e-05, + "loss": 0.0461, + "num_input_tokens_seen": 74038192, + "step": 60840 + }, + { + "epoch": 6.7763670787392805, + "grad_norm": 0.15982335805892944, + "learning_rate": 4.180511408361929e-05, + "loss": 0.0839, + "num_input_tokens_seen": 74044432, + "step": 60845 + }, + { + "epoch": 6.776923933622898, + "grad_norm": 0.02507418766617775, + "learning_rate": 4.180331511029642e-05, + "loss": 0.1446, + "num_input_tokens_seen": 74050288, + "step": 60850 + }, + { + "epoch": 6.777480788506515, + "grad_norm": 0.005807315930724144, + "learning_rate": 4.180151597825277e-05, + "loss": 0.0259, + "num_input_tokens_seen": 74056112, + "step": 60855 + }, + { + "epoch": 6.778037643390133, + "grad_norm": 0.5974114537239075, + "learning_rate": 4.179971668750534e-05, + "loss": 0.0482, + "num_input_tokens_seen": 74061680, + "step": 60860 + }, + { + "epoch": 6.778594498273749, + "grad_norm": 0.06994850933551788, + "learning_rate": 4.179791723807113e-05, + "loss": 0.083, + "num_input_tokens_seen": 74067920, + "step": 60865 + }, + { + "epoch": 6.779151353157367, + "grad_norm": 0.8623169660568237, + "learning_rate": 4.1796117629967125e-05, + "loss": 0.0428, + "num_input_tokens_seen": 74074160, + "step": 60870 + }, + { + "epoch": 6.779708208040985, + "grad_norm": 0.228424072265625, + "learning_rate": 4.179431786321034e-05, + "loss": 0.0762, + "num_input_tokens_seen": 74080304, + "step": 60875 + }, + { + "epoch": 6.7802650629246015, + "grad_norm": 0.7281994819641113, + "learning_rate": 4.1792517937817766e-05, + "loss": 0.0864, + "num_input_tokens_seen": 74086576, + "step": 60880 + }, + { + "epoch": 6.780821917808219, + "grad_norm": 0.003252302994951606, + "learning_rate": 4.1790717853806405e-05, + "loss": 0.0553, + "num_input_tokens_seen": 74092496, + "step": 60885 + }, + { + "epoch": 6.781378772691837, + "grad_norm": 0.01365680992603302, + "learning_rate": 4.178891761119326e-05, + "loss": 0.0888, + "num_input_tokens_seen": 74098384, + "step": 60890 + }, + { + "epoch": 6.781935627575454, + "grad_norm": 0.623530387878418, + "learning_rate": 4.178711720999534e-05, + "loss": 0.0177, + "num_input_tokens_seen": 74104432, + "step": 60895 + }, + { + "epoch": 6.782492482459071, + "grad_norm": 0.0011101547861471772, + "learning_rate": 4.1785316650229645e-05, + "loss": 0.0011, + "num_input_tokens_seen": 74110704, + "step": 60900 + }, + { + "epoch": 6.783049337342688, + "grad_norm": 0.003538233693689108, + "learning_rate": 4.178351593191319e-05, + "loss": 0.0208, + "num_input_tokens_seen": 74116848, + "step": 60905 + }, + { + "epoch": 6.783606192226306, + "grad_norm": 0.005163072608411312, + "learning_rate": 4.178171505506298e-05, + "loss": 0.1698, + "num_input_tokens_seen": 74122576, + "step": 60910 + }, + { + "epoch": 6.7841630471099235, + "grad_norm": 0.2430543750524521, + "learning_rate": 4.177991401969602e-05, + "loss": 0.0225, + "num_input_tokens_seen": 74128400, + "step": 60915 + }, + { + "epoch": 6.78471990199354, + "grad_norm": 0.02842027135193348, + "learning_rate": 4.177811282582933e-05, + "loss": 0.0077, + "num_input_tokens_seen": 74134608, + "step": 60920 + }, + { + "epoch": 6.785276756877158, + "grad_norm": 0.0011419022921472788, + "learning_rate": 4.177631147347993e-05, + "loss": 0.0743, + "num_input_tokens_seen": 74140720, + "step": 60925 + }, + { + "epoch": 6.785833611760776, + "grad_norm": 0.8751204609870911, + "learning_rate": 4.177450996266482e-05, + "loss": 0.0488, + "num_input_tokens_seen": 74147056, + "step": 60930 + }, + { + "epoch": 6.786390466644392, + "grad_norm": 1.2035863399505615, + "learning_rate": 4.177270829340103e-05, + "loss": 0.0422, + "num_input_tokens_seen": 74153328, + "step": 60935 + }, + { + "epoch": 6.78694732152801, + "grad_norm": 0.04920526221394539, + "learning_rate": 4.177090646570556e-05, + "loss": 0.0297, + "num_input_tokens_seen": 74159152, + "step": 60940 + }, + { + "epoch": 6.787504176411627, + "grad_norm": 0.38701102137565613, + "learning_rate": 4.176910447959545e-05, + "loss": 0.0718, + "num_input_tokens_seen": 74165296, + "step": 60945 + }, + { + "epoch": 6.7880610312952445, + "grad_norm": 0.00019192190666217357, + "learning_rate": 4.176730233508772e-05, + "loss": 0.046, + "num_input_tokens_seen": 74170960, + "step": 60950 + }, + { + "epoch": 6.788617886178862, + "grad_norm": 0.4136747419834137, + "learning_rate": 4.176550003219938e-05, + "loss": 0.0565, + "num_input_tokens_seen": 74177328, + "step": 60955 + }, + { + "epoch": 6.789174741062479, + "grad_norm": 0.6178697347640991, + "learning_rate": 4.176369757094745e-05, + "loss": 0.0348, + "num_input_tokens_seen": 74183632, + "step": 60960 + }, + { + "epoch": 6.789731595946097, + "grad_norm": 0.4520639181137085, + "learning_rate": 4.176189495134898e-05, + "loss": 0.0519, + "num_input_tokens_seen": 74189968, + "step": 60965 + }, + { + "epoch": 6.790288450829713, + "grad_norm": 0.01683005318045616, + "learning_rate": 4.1760092173420975e-05, + "loss": 0.0095, + "num_input_tokens_seen": 74196048, + "step": 60970 + }, + { + "epoch": 6.790845305713331, + "grad_norm": 0.019444700330495834, + "learning_rate": 4.175828923718047e-05, + "loss": 0.0539, + "num_input_tokens_seen": 74202256, + "step": 60975 + }, + { + "epoch": 6.791402160596949, + "grad_norm": 0.016332857310771942, + "learning_rate": 4.17564861426445e-05, + "loss": 0.0159, + "num_input_tokens_seen": 74208400, + "step": 60980 + }, + { + "epoch": 6.791959015480566, + "grad_norm": 0.009676122106611729, + "learning_rate": 4.175468288983009e-05, + "loss": 0.0302, + "num_input_tokens_seen": 74214576, + "step": 60985 + }, + { + "epoch": 6.792515870364183, + "grad_norm": 0.002221490954980254, + "learning_rate": 4.175287947875428e-05, + "loss": 0.0109, + "num_input_tokens_seen": 74220336, + "step": 60990 + }, + { + "epoch": 6.7930727252478, + "grad_norm": 1.3929522037506104, + "learning_rate": 4.17510759094341e-05, + "loss": 0.0601, + "num_input_tokens_seen": 74226384, + "step": 60995 + }, + { + "epoch": 6.793629580131418, + "grad_norm": 0.06052685156464577, + "learning_rate": 4.174927218188659e-05, + "loss": 0.0104, + "num_input_tokens_seen": 74232240, + "step": 61000 + }, + { + "epoch": 6.794186435015035, + "grad_norm": 0.1852354109287262, + "learning_rate": 4.174746829612878e-05, + "loss": 0.0165, + "num_input_tokens_seen": 74238384, + "step": 61005 + }, + { + "epoch": 6.794743289898652, + "grad_norm": 0.6672405004501343, + "learning_rate": 4.174566425217772e-05, + "loss": 0.0306, + "num_input_tokens_seen": 74244176, + "step": 61010 + }, + { + "epoch": 6.79530014478227, + "grad_norm": 0.06864180415868759, + "learning_rate": 4.174386005005044e-05, + "loss": 0.0637, + "num_input_tokens_seen": 74250160, + "step": 61015 + }, + { + "epoch": 6.795856999665887, + "grad_norm": 0.11156493425369263, + "learning_rate": 4.174205568976399e-05, + "loss": 0.1211, + "num_input_tokens_seen": 74256368, + "step": 61020 + }, + { + "epoch": 6.796413854549504, + "grad_norm": 1.2946466207504272, + "learning_rate": 4.174025117133541e-05, + "loss": 0.1734, + "num_input_tokens_seen": 74262736, + "step": 61025 + }, + { + "epoch": 6.796970709433122, + "grad_norm": 0.9082090258598328, + "learning_rate": 4.1738446494781746e-05, + "loss": 0.0422, + "num_input_tokens_seen": 74269040, + "step": 61030 + }, + { + "epoch": 6.797527564316739, + "grad_norm": 0.02293563447892666, + "learning_rate": 4.1736641660120045e-05, + "loss": 0.132, + "num_input_tokens_seen": 74274576, + "step": 61035 + }, + { + "epoch": 6.798084419200356, + "grad_norm": 0.0403277650475502, + "learning_rate": 4.173483666736735e-05, + "loss": 0.0653, + "num_input_tokens_seen": 74280784, + "step": 61040 + }, + { + "epoch": 6.798641274083973, + "grad_norm": 0.2808585464954376, + "learning_rate": 4.1733031516540715e-05, + "loss": 0.0768, + "num_input_tokens_seen": 74286064, + "step": 61045 + }, + { + "epoch": 6.799198128967591, + "grad_norm": 0.00019896613957826048, + "learning_rate": 4.173122620765719e-05, + "loss": 0.0234, + "num_input_tokens_seen": 74292112, + "step": 61050 + }, + { + "epoch": 6.799754983851209, + "grad_norm": 0.028118368238210678, + "learning_rate": 4.172942074073384e-05, + "loss": 0.0038, + "num_input_tokens_seen": 74298288, + "step": 61055 + }, + { + "epoch": 6.800311838734825, + "grad_norm": 1.2213096618652344, + "learning_rate": 4.17276151157877e-05, + "loss": 0.1006, + "num_input_tokens_seen": 74304624, + "step": 61060 + }, + { + "epoch": 6.800868693618443, + "grad_norm": 1.6956795454025269, + "learning_rate": 4.172580933283583e-05, + "loss": 0.0648, + "num_input_tokens_seen": 74310448, + "step": 61065 + }, + { + "epoch": 6.801425548502061, + "grad_norm": 0.23756474256515503, + "learning_rate": 4.1724003391895294e-05, + "loss": 0.0111, + "num_input_tokens_seen": 74316720, + "step": 61070 + }, + { + "epoch": 6.8019824033856775, + "grad_norm": 0.3447168469429016, + "learning_rate": 4.1722197292983144e-05, + "loss": 0.0714, + "num_input_tokens_seen": 74322896, + "step": 61075 + }, + { + "epoch": 6.802539258269295, + "grad_norm": 0.016995033249258995, + "learning_rate": 4.1720391036116445e-05, + "loss": 0.0309, + "num_input_tokens_seen": 74329232, + "step": 61080 + }, + { + "epoch": 6.803096113152913, + "grad_norm": 1.8250452280044556, + "learning_rate": 4.1718584621312253e-05, + "loss": 0.0561, + "num_input_tokens_seen": 74335312, + "step": 61085 + }, + { + "epoch": 6.80365296803653, + "grad_norm": 0.09410663694143295, + "learning_rate": 4.171677804858764e-05, + "loss": 0.1121, + "num_input_tokens_seen": 74341424, + "step": 61090 + }, + { + "epoch": 6.804209822920147, + "grad_norm": 0.7069712281227112, + "learning_rate": 4.171497131795966e-05, + "loss": 0.0851, + "num_input_tokens_seen": 74347408, + "step": 61095 + }, + { + "epoch": 6.804766677803764, + "grad_norm": 0.00737435556948185, + "learning_rate": 4.171316442944539e-05, + "loss": 0.005, + "num_input_tokens_seen": 74353360, + "step": 61100 + }, + { + "epoch": 6.805323532687382, + "grad_norm": 0.3232550323009491, + "learning_rate": 4.1711357383061886e-05, + "loss": 0.1302, + "num_input_tokens_seen": 74359536, + "step": 61105 + }, + { + "epoch": 6.8058803875709994, + "grad_norm": 1.0248267650604248, + "learning_rate": 4.170955017882623e-05, + "loss": 0.0333, + "num_input_tokens_seen": 74365424, + "step": 61110 + }, + { + "epoch": 6.806437242454616, + "grad_norm": 0.37595972418785095, + "learning_rate": 4.170774281675548e-05, + "loss": 0.0253, + "num_input_tokens_seen": 74371600, + "step": 61115 + }, + { + "epoch": 6.806994097338234, + "grad_norm": 0.05026279017329216, + "learning_rate": 4.170593529686672e-05, + "loss": 0.014, + "num_input_tokens_seen": 74377808, + "step": 61120 + }, + { + "epoch": 6.807550952221851, + "grad_norm": 0.09636306762695312, + "learning_rate": 4.1704127619177005e-05, + "loss": 0.0818, + "num_input_tokens_seen": 74383664, + "step": 61125 + }, + { + "epoch": 6.808107807105468, + "grad_norm": 0.036164116114377975, + "learning_rate": 4.1702319783703425e-05, + "loss": 0.3058, + "num_input_tokens_seen": 74388880, + "step": 61130 + }, + { + "epoch": 6.808664661989086, + "grad_norm": 0.0034754066728055477, + "learning_rate": 4.170051179046306e-05, + "loss": 0.0809, + "num_input_tokens_seen": 74395056, + "step": 61135 + }, + { + "epoch": 6.809221516872703, + "grad_norm": 0.20944496989250183, + "learning_rate": 4.169870363947297e-05, + "loss": 0.0914, + "num_input_tokens_seen": 74401232, + "step": 61140 + }, + { + "epoch": 6.8097783717563205, + "grad_norm": 0.5498520135879517, + "learning_rate": 4.169689533075025e-05, + "loss": 0.0341, + "num_input_tokens_seen": 74407312, + "step": 61145 + }, + { + "epoch": 6.810335226639937, + "grad_norm": 0.40719279646873474, + "learning_rate": 4.169508686431198e-05, + "loss": 0.0605, + "num_input_tokens_seen": 74413328, + "step": 61150 + }, + { + "epoch": 6.810892081523555, + "grad_norm": 0.3761926591396332, + "learning_rate": 4.169327824017524e-05, + "loss": 0.0282, + "num_input_tokens_seen": 74419344, + "step": 61155 + }, + { + "epoch": 6.811448936407173, + "grad_norm": 0.21177606284618378, + "learning_rate": 4.1691469458357115e-05, + "loss": 0.0674, + "num_input_tokens_seen": 74425520, + "step": 61160 + }, + { + "epoch": 6.812005791290789, + "grad_norm": 0.6268917322158813, + "learning_rate": 4.1689660518874676e-05, + "loss": 0.0781, + "num_input_tokens_seen": 74431376, + "step": 61165 + }, + { + "epoch": 6.812562646174407, + "grad_norm": 0.38444188237190247, + "learning_rate": 4.1687851421745027e-05, + "loss": 0.0779, + "num_input_tokens_seen": 74437712, + "step": 61170 + }, + { + "epoch": 6.813119501058024, + "grad_norm": 0.010192213580012321, + "learning_rate": 4.168604216698525e-05, + "loss": 0.0448, + "num_input_tokens_seen": 74443632, + "step": 61175 + }, + { + "epoch": 6.813676355941642, + "grad_norm": 0.8737354278564453, + "learning_rate": 4.168423275461244e-05, + "loss": 0.1709, + "num_input_tokens_seen": 74449808, + "step": 61180 + }, + { + "epoch": 6.814233210825259, + "grad_norm": 0.7964513897895813, + "learning_rate": 4.1682423184643684e-05, + "loss": 0.059, + "num_input_tokens_seen": 74455888, + "step": 61185 + }, + { + "epoch": 6.814790065708876, + "grad_norm": 1.3153183460235596, + "learning_rate": 4.1680613457096076e-05, + "loss": 0.0482, + "num_input_tokens_seen": 74462064, + "step": 61190 + }, + { + "epoch": 6.815346920592494, + "grad_norm": 0.08097312599420547, + "learning_rate": 4.16788035719867e-05, + "loss": 0.1004, + "num_input_tokens_seen": 74468368, + "step": 61195 + }, + { + "epoch": 6.8159037754761105, + "grad_norm": 0.2027025669813156, + "learning_rate": 4.167699352933267e-05, + "loss": 0.1689, + "num_input_tokens_seen": 74474128, + "step": 61200 + }, + { + "epoch": 6.816460630359728, + "grad_norm": 0.06343148648738861, + "learning_rate": 4.167518332915107e-05, + "loss": 0.0273, + "num_input_tokens_seen": 74480336, + "step": 61205 + }, + { + "epoch": 6.817017485243346, + "grad_norm": 0.01197842787951231, + "learning_rate": 4.1673372971459014e-05, + "loss": 0.0976, + "num_input_tokens_seen": 74486768, + "step": 61210 + }, + { + "epoch": 6.817574340126963, + "grad_norm": 1.1693843603134155, + "learning_rate": 4.167156245627358e-05, + "loss": 0.0359, + "num_input_tokens_seen": 74492752, + "step": 61215 + }, + { + "epoch": 6.81813119501058, + "grad_norm": 0.570381224155426, + "learning_rate": 4.166975178361189e-05, + "loss": 0.0445, + "num_input_tokens_seen": 74498832, + "step": 61220 + }, + { + "epoch": 6.818688049894197, + "grad_norm": 0.4076092839241028, + "learning_rate": 4.166794095349103e-05, + "loss": 0.0193, + "num_input_tokens_seen": 74504880, + "step": 61225 + }, + { + "epoch": 6.819244904777815, + "grad_norm": 0.004167291335761547, + "learning_rate": 4.1666129965928126e-05, + "loss": 0.1025, + "num_input_tokens_seen": 74511120, + "step": 61230 + }, + { + "epoch": 6.819801759661432, + "grad_norm": 0.904720664024353, + "learning_rate": 4.1664318820940256e-05, + "loss": 0.0494, + "num_input_tokens_seen": 74517360, + "step": 61235 + }, + { + "epoch": 6.820358614545049, + "grad_norm": 0.32766008377075195, + "learning_rate": 4.166250751854455e-05, + "loss": 0.042, + "num_input_tokens_seen": 74523216, + "step": 61240 + }, + { + "epoch": 6.820915469428667, + "grad_norm": 0.007944130338728428, + "learning_rate": 4.166069605875812e-05, + "loss": 0.0457, + "num_input_tokens_seen": 74529136, + "step": 61245 + }, + { + "epoch": 6.821472324312285, + "grad_norm": 1.2026970386505127, + "learning_rate": 4.165888444159806e-05, + "loss": 0.0572, + "num_input_tokens_seen": 74534768, + "step": 61250 + }, + { + "epoch": 6.822029179195901, + "grad_norm": 0.6645951867103577, + "learning_rate": 4.165707266708149e-05, + "loss": 0.1058, + "num_input_tokens_seen": 74541168, + "step": 61255 + }, + { + "epoch": 6.822586034079519, + "grad_norm": 0.004667636472731829, + "learning_rate": 4.165526073522552e-05, + "loss": 0.0724, + "num_input_tokens_seen": 74547312, + "step": 61260 + }, + { + "epoch": 6.823142888963137, + "grad_norm": 0.7748461365699768, + "learning_rate": 4.165344864604726e-05, + "loss": 0.0557, + "num_input_tokens_seen": 74553360, + "step": 61265 + }, + { + "epoch": 6.8236997438467535, + "grad_norm": 0.040533535182476044, + "learning_rate": 4.165163639956386e-05, + "loss": 0.075, + "num_input_tokens_seen": 74559184, + "step": 61270 + }, + { + "epoch": 6.824256598730371, + "grad_norm": 1.0119540691375732, + "learning_rate": 4.164982399579239e-05, + "loss": 0.08, + "num_input_tokens_seen": 74565488, + "step": 61275 + }, + { + "epoch": 6.824813453613988, + "grad_norm": 0.0007898838957771659, + "learning_rate": 4.164801143475001e-05, + "loss": 0.093, + "num_input_tokens_seen": 74571632, + "step": 61280 + }, + { + "epoch": 6.825370308497606, + "grad_norm": 0.9938908815383911, + "learning_rate": 4.164619871645381e-05, + "loss": 0.0892, + "num_input_tokens_seen": 74577904, + "step": 61285 + }, + { + "epoch": 6.825927163381223, + "grad_norm": 0.02917185053229332, + "learning_rate": 4.164438584092094e-05, + "loss": 0.0289, + "num_input_tokens_seen": 74584080, + "step": 61290 + }, + { + "epoch": 6.82648401826484, + "grad_norm": 0.3044486343860626, + "learning_rate": 4.16425728081685e-05, + "loss": 0.0123, + "num_input_tokens_seen": 74590128, + "step": 61295 + }, + { + "epoch": 6.827040873148458, + "grad_norm": 0.49026596546173096, + "learning_rate": 4.164075961821363e-05, + "loss": 0.0151, + "num_input_tokens_seen": 74596048, + "step": 61300 + }, + { + "epoch": 6.8275977280320745, + "grad_norm": 0.3748015761375427, + "learning_rate": 4.1638946271073445e-05, + "loss": 0.0324, + "num_input_tokens_seen": 74602224, + "step": 61305 + }, + { + "epoch": 6.828154582915692, + "grad_norm": 0.10960663110017776, + "learning_rate": 4.163713276676509e-05, + "loss": 0.1615, + "num_input_tokens_seen": 74608592, + "step": 61310 + }, + { + "epoch": 6.82871143779931, + "grad_norm": 0.04433827847242355, + "learning_rate": 4.163531910530569e-05, + "loss": 0.0255, + "num_input_tokens_seen": 74614832, + "step": 61315 + }, + { + "epoch": 6.829268292682927, + "grad_norm": 0.8225768804550171, + "learning_rate": 4.1633505286712366e-05, + "loss": 0.0638, + "num_input_tokens_seen": 74621008, + "step": 61320 + }, + { + "epoch": 6.829825147566544, + "grad_norm": 1.7254242897033691, + "learning_rate": 4.163169131100226e-05, + "loss": 0.045, + "num_input_tokens_seen": 74627056, + "step": 61325 + }, + { + "epoch": 6.830382002450161, + "grad_norm": 0.015269878320395947, + "learning_rate": 4.16298771781925e-05, + "loss": 0.0927, + "num_input_tokens_seen": 74633040, + "step": 61330 + }, + { + "epoch": 6.830938857333779, + "grad_norm": 0.0027749761939048767, + "learning_rate": 4.1628062888300235e-05, + "loss": 0.1903, + "num_input_tokens_seen": 74639344, + "step": 61335 + }, + { + "epoch": 6.8314957122173965, + "grad_norm": 1.082507848739624, + "learning_rate": 4.162624844134258e-05, + "loss": 0.1039, + "num_input_tokens_seen": 74645488, + "step": 61340 + }, + { + "epoch": 6.832052567101013, + "grad_norm": 1.3156698942184448, + "learning_rate": 4.162443383733671e-05, + "loss": 0.0798, + "num_input_tokens_seen": 74651504, + "step": 61345 + }, + { + "epoch": 6.832609421984631, + "grad_norm": 0.42379072308540344, + "learning_rate": 4.162261907629973e-05, + "loss": 0.0681, + "num_input_tokens_seen": 74657520, + "step": 61350 + }, + { + "epoch": 6.833166276868248, + "grad_norm": 0.5301468968391418, + "learning_rate": 4.162080415824879e-05, + "loss": 0.0313, + "num_input_tokens_seen": 74663952, + "step": 61355 + }, + { + "epoch": 6.833723131751865, + "grad_norm": 1.3211263418197632, + "learning_rate": 4.1618989083201044e-05, + "loss": 0.1412, + "num_input_tokens_seen": 74669680, + "step": 61360 + }, + { + "epoch": 6.834279986635483, + "grad_norm": 0.7370113134384155, + "learning_rate": 4.161717385117363e-05, + "loss": 0.0412, + "num_input_tokens_seen": 74675344, + "step": 61365 + }, + { + "epoch": 6.8348368415191, + "grad_norm": 0.0018821165431290865, + "learning_rate": 4.16153584621837e-05, + "loss": 0.0145, + "num_input_tokens_seen": 74681264, + "step": 61370 + }, + { + "epoch": 6.8353936964027175, + "grad_norm": 0.10414405167102814, + "learning_rate": 4.161354291624839e-05, + "loss": 0.0067, + "num_input_tokens_seen": 74687440, + "step": 61375 + }, + { + "epoch": 6.835950551286334, + "grad_norm": 0.42307567596435547, + "learning_rate": 4.1611727213384866e-05, + "loss": 0.0702, + "num_input_tokens_seen": 74693616, + "step": 61380 + }, + { + "epoch": 6.836507406169952, + "grad_norm": 0.5896930694580078, + "learning_rate": 4.160991135361027e-05, + "loss": 0.0501, + "num_input_tokens_seen": 74699696, + "step": 61385 + }, + { + "epoch": 6.83706426105357, + "grad_norm": 0.7220185995101929, + "learning_rate": 4.160809533694174e-05, + "loss": 0.0099, + "num_input_tokens_seen": 74705872, + "step": 61390 + }, + { + "epoch": 6.8376211159371865, + "grad_norm": 0.6590445637702942, + "learning_rate": 4.160627916339645e-05, + "loss": 0.0209, + "num_input_tokens_seen": 74712208, + "step": 61395 + }, + { + "epoch": 6.838177970820804, + "grad_norm": 0.09711591899394989, + "learning_rate": 4.1604462832991554e-05, + "loss": 0.0989, + "num_input_tokens_seen": 74718288, + "step": 61400 + }, + { + "epoch": 6.838734825704422, + "grad_norm": 0.008256582543253899, + "learning_rate": 4.16026463457442e-05, + "loss": 0.1113, + "num_input_tokens_seen": 74724336, + "step": 61405 + }, + { + "epoch": 6.839291680588039, + "grad_norm": 0.01674109697341919, + "learning_rate": 4.160082970167154e-05, + "loss": 0.0465, + "num_input_tokens_seen": 74730032, + "step": 61410 + }, + { + "epoch": 6.839848535471656, + "grad_norm": 0.802663266658783, + "learning_rate": 4.159901290079076e-05, + "loss": 0.0204, + "num_input_tokens_seen": 74736432, + "step": 61415 + }, + { + "epoch": 6.840405390355273, + "grad_norm": 0.04321663826704025, + "learning_rate": 4.159719594311899e-05, + "loss": 0.1307, + "num_input_tokens_seen": 74742192, + "step": 61420 + }, + { + "epoch": 6.840962245238891, + "grad_norm": 0.005826961249113083, + "learning_rate": 4.159537882867342e-05, + "loss": 0.0622, + "num_input_tokens_seen": 74748592, + "step": 61425 + }, + { + "epoch": 6.841519100122508, + "grad_norm": 0.4225412607192993, + "learning_rate": 4.1593561557471184e-05, + "loss": 0.0084, + "num_input_tokens_seen": 74754992, + "step": 61430 + }, + { + "epoch": 6.842075955006125, + "grad_norm": 0.5439332723617554, + "learning_rate": 4.1591744129529475e-05, + "loss": 0.0668, + "num_input_tokens_seen": 74760976, + "step": 61435 + }, + { + "epoch": 6.842632809889743, + "grad_norm": 1.171970009803772, + "learning_rate": 4.158992654486545e-05, + "loss": 0.1455, + "num_input_tokens_seen": 74767056, + "step": 61440 + }, + { + "epoch": 6.8431896647733605, + "grad_norm": 0.8544085621833801, + "learning_rate": 4.158810880349627e-05, + "loss": 0.1235, + "num_input_tokens_seen": 74773008, + "step": 61445 + }, + { + "epoch": 6.843746519656977, + "grad_norm": 0.9351763725280762, + "learning_rate": 4.1586290905439126e-05, + "loss": 0.0562, + "num_input_tokens_seen": 74779216, + "step": 61450 + }, + { + "epoch": 6.844303374540595, + "grad_norm": 0.5619691610336304, + "learning_rate": 4.158447285071116e-05, + "loss": 0.0173, + "num_input_tokens_seen": 74785456, + "step": 61455 + }, + { + "epoch": 6.844860229424212, + "grad_norm": 0.028664199635386467, + "learning_rate": 4.158265463932957e-05, + "loss": 0.0096, + "num_input_tokens_seen": 74791632, + "step": 61460 + }, + { + "epoch": 6.8454170843078295, + "grad_norm": 0.024213898926973343, + "learning_rate": 4.1580836271311516e-05, + "loss": 0.0458, + "num_input_tokens_seen": 74798000, + "step": 61465 + }, + { + "epoch": 6.845973939191447, + "grad_norm": 0.5811092853546143, + "learning_rate": 4.157901774667419e-05, + "loss": 0.0233, + "num_input_tokens_seen": 74804464, + "step": 61470 + }, + { + "epoch": 6.846530794075064, + "grad_norm": 0.0002591607335489243, + "learning_rate": 4.157719906543475e-05, + "loss": 0.0413, + "num_input_tokens_seen": 74810640, + "step": 61475 + }, + { + "epoch": 6.847087648958682, + "grad_norm": 0.08917483687400818, + "learning_rate": 4.1575380227610384e-05, + "loss": 0.0651, + "num_input_tokens_seen": 74816848, + "step": 61480 + }, + { + "epoch": 6.847644503842298, + "grad_norm": 0.48972180485725403, + "learning_rate": 4.1573561233218275e-05, + "loss": 0.0448, + "num_input_tokens_seen": 74823216, + "step": 61485 + }, + { + "epoch": 6.848201358725916, + "grad_norm": 1.8857303857803345, + "learning_rate": 4.157174208227559e-05, + "loss": 0.0886, + "num_input_tokens_seen": 74829520, + "step": 61490 + }, + { + "epoch": 6.848758213609534, + "grad_norm": 2.3141133785247803, + "learning_rate": 4.156992277479954e-05, + "loss": 0.0516, + "num_input_tokens_seen": 74835856, + "step": 61495 + }, + { + "epoch": 6.8493150684931505, + "grad_norm": 0.1569831520318985, + "learning_rate": 4.156810331080728e-05, + "loss": 0.0352, + "num_input_tokens_seen": 74842032, + "step": 61500 + }, + { + "epoch": 6.849871923376768, + "grad_norm": 0.3218900263309479, + "learning_rate": 4.156628369031602e-05, + "loss": 0.0502, + "num_input_tokens_seen": 74847984, + "step": 61505 + }, + { + "epoch": 6.850428778260385, + "grad_norm": 0.0270675215870142, + "learning_rate": 4.156446391334294e-05, + "loss": 0.0193, + "num_input_tokens_seen": 74854256, + "step": 61510 + }, + { + "epoch": 6.850985633144003, + "grad_norm": 0.4777315557003021, + "learning_rate": 4.156264397990522e-05, + "loss": 0.0598, + "num_input_tokens_seen": 74860624, + "step": 61515 + }, + { + "epoch": 6.85154248802762, + "grad_norm": 0.0072082760743796825, + "learning_rate": 4.156082389002006e-05, + "loss": 0.0586, + "num_input_tokens_seen": 74866960, + "step": 61520 + }, + { + "epoch": 6.852099342911237, + "grad_norm": 0.34116455912590027, + "learning_rate": 4.155900364370465e-05, + "loss": 0.0675, + "num_input_tokens_seen": 74873392, + "step": 61525 + }, + { + "epoch": 6.852656197794855, + "grad_norm": 0.2780868113040924, + "learning_rate": 4.155718324097618e-05, + "loss": 0.0396, + "num_input_tokens_seen": 74879408, + "step": 61530 + }, + { + "epoch": 6.853213052678472, + "grad_norm": 0.5838050246238708, + "learning_rate": 4.155536268185185e-05, + "loss": 0.0676, + "num_input_tokens_seen": 74885328, + "step": 61535 + }, + { + "epoch": 6.853769907562089, + "grad_norm": 0.3221851587295532, + "learning_rate": 4.155354196634886e-05, + "loss": 0.0423, + "num_input_tokens_seen": 74891120, + "step": 61540 + }, + { + "epoch": 6.854326762445707, + "grad_norm": 0.12113619595766068, + "learning_rate": 4.1551721094484406e-05, + "loss": 0.0188, + "num_input_tokens_seen": 74897680, + "step": 61545 + }, + { + "epoch": 6.854883617329324, + "grad_norm": 0.9133394360542297, + "learning_rate": 4.154990006627568e-05, + "loss": 0.0304, + "num_input_tokens_seen": 74903728, + "step": 61550 + }, + { + "epoch": 6.855440472212941, + "grad_norm": 0.6575464010238647, + "learning_rate": 4.154807888173988e-05, + "loss": 0.0775, + "num_input_tokens_seen": 74910032, + "step": 61555 + }, + { + "epoch": 6.855997327096558, + "grad_norm": 0.20936726033687592, + "learning_rate": 4.154625754089423e-05, + "loss": 0.0762, + "num_input_tokens_seen": 74916304, + "step": 61560 + }, + { + "epoch": 6.856554181980176, + "grad_norm": 0.09408627450466156, + "learning_rate": 4.154443604375592e-05, + "loss": 0.0225, + "num_input_tokens_seen": 74922832, + "step": 61565 + }, + { + "epoch": 6.8571110368637935, + "grad_norm": 0.01468253880739212, + "learning_rate": 4.1542614390342146e-05, + "loss": 0.0213, + "num_input_tokens_seen": 74929168, + "step": 61570 + }, + { + "epoch": 6.85766789174741, + "grad_norm": 0.00495830038562417, + "learning_rate": 4.154079258067014e-05, + "loss": 0.015, + "num_input_tokens_seen": 74935408, + "step": 61575 + }, + { + "epoch": 6.858224746631028, + "grad_norm": 1.6319024562835693, + "learning_rate": 4.153897061475709e-05, + "loss": 0.0591, + "num_input_tokens_seen": 74941456, + "step": 61580 + }, + { + "epoch": 6.858781601514646, + "grad_norm": 0.4793698787689209, + "learning_rate": 4.1537148492620204e-05, + "loss": 0.0813, + "num_input_tokens_seen": 74947472, + "step": 61585 + }, + { + "epoch": 6.8593384563982625, + "grad_norm": 0.7974122166633606, + "learning_rate": 4.153532621427671e-05, + "loss": 0.0356, + "num_input_tokens_seen": 74953360, + "step": 61590 + }, + { + "epoch": 6.85989531128188, + "grad_norm": 0.47660180926322937, + "learning_rate": 4.153350377974381e-05, + "loss": 0.0392, + "num_input_tokens_seen": 74959600, + "step": 61595 + }, + { + "epoch": 6.860452166165497, + "grad_norm": 0.4326879680156708, + "learning_rate": 4.1531681189038715e-05, + "loss": 0.0702, + "num_input_tokens_seen": 74965552, + "step": 61600 + }, + { + "epoch": 6.861009021049115, + "grad_norm": 0.7401496767997742, + "learning_rate": 4.152985844217865e-05, + "loss": 0.0492, + "num_input_tokens_seen": 74971696, + "step": 61605 + }, + { + "epoch": 6.861565875932732, + "grad_norm": 0.13986265659332275, + "learning_rate": 4.152803553918083e-05, + "loss": 0.0434, + "num_input_tokens_seen": 74978000, + "step": 61610 + }, + { + "epoch": 6.862122730816349, + "grad_norm": 0.06055444851517677, + "learning_rate": 4.152621248006248e-05, + "loss": 0.006, + "num_input_tokens_seen": 74984016, + "step": 61615 + }, + { + "epoch": 6.862679585699967, + "grad_norm": 0.00020534296345431358, + "learning_rate": 4.1524389264840804e-05, + "loss": 0.0215, + "num_input_tokens_seen": 74990480, + "step": 61620 + }, + { + "epoch": 6.863236440583584, + "grad_norm": 0.004823804367333651, + "learning_rate": 4.152256589353303e-05, + "loss": 0.1586, + "num_input_tokens_seen": 74996592, + "step": 61625 + }, + { + "epoch": 6.863793295467201, + "grad_norm": 0.0010726319160312414, + "learning_rate": 4.1520742366156384e-05, + "loss": 0.0936, + "num_input_tokens_seen": 75002256, + "step": 61630 + }, + { + "epoch": 6.864350150350819, + "grad_norm": 0.012375101447105408, + "learning_rate": 4.1518918682728094e-05, + "loss": 0.0527, + "num_input_tokens_seen": 75008496, + "step": 61635 + }, + { + "epoch": 6.864907005234436, + "grad_norm": 0.028147898614406586, + "learning_rate": 4.151709484326538e-05, + "loss": 0.0553, + "num_input_tokens_seen": 75014736, + "step": 61640 + }, + { + "epoch": 6.865463860118053, + "grad_norm": 0.6749676465988159, + "learning_rate": 4.151527084778547e-05, + "loss": 0.0244, + "num_input_tokens_seen": 75020976, + "step": 61645 + }, + { + "epoch": 6.866020715001671, + "grad_norm": 0.24026305973529816, + "learning_rate": 4.1513446696305596e-05, + "loss": 0.0532, + "num_input_tokens_seen": 75027344, + "step": 61650 + }, + { + "epoch": 6.866577569885288, + "grad_norm": 0.5440117716789246, + "learning_rate": 4.151162238884299e-05, + "loss": 0.1063, + "num_input_tokens_seen": 75033328, + "step": 61655 + }, + { + "epoch": 6.8671344247689055, + "grad_norm": 0.08205854147672653, + "learning_rate": 4.150979792541488e-05, + "loss": 0.0099, + "num_input_tokens_seen": 75038992, + "step": 61660 + }, + { + "epoch": 6.867691279652522, + "grad_norm": 0.1117979884147644, + "learning_rate": 4.150797330603851e-05, + "loss": 0.0129, + "num_input_tokens_seen": 75045136, + "step": 61665 + }, + { + "epoch": 6.86824813453614, + "grad_norm": 0.0701524019241333, + "learning_rate": 4.1506148530731096e-05, + "loss": 0.1307, + "num_input_tokens_seen": 75051280, + "step": 61670 + }, + { + "epoch": 6.868804989419758, + "grad_norm": 0.36121848225593567, + "learning_rate": 4.150432359950988e-05, + "loss": 0.0584, + "num_input_tokens_seen": 75057520, + "step": 61675 + }, + { + "epoch": 6.869361844303374, + "grad_norm": 1.0628834962844849, + "learning_rate": 4.150249851239211e-05, + "loss": 0.0564, + "num_input_tokens_seen": 75063792, + "step": 61680 + }, + { + "epoch": 6.869918699186992, + "grad_norm": 0.03820660337805748, + "learning_rate": 4.150067326939502e-05, + "loss": 0.067, + "num_input_tokens_seen": 75069744, + "step": 61685 + }, + { + "epoch": 6.870475554070609, + "grad_norm": 0.04564674198627472, + "learning_rate": 4.1498847870535853e-05, + "loss": 0.0363, + "num_input_tokens_seen": 75075728, + "step": 61690 + }, + { + "epoch": 6.8710324089542265, + "grad_norm": 1.377780556678772, + "learning_rate": 4.1497022315831846e-05, + "loss": 0.1093, + "num_input_tokens_seen": 75081808, + "step": 61695 + }, + { + "epoch": 6.871589263837844, + "grad_norm": 2.0747623443603516, + "learning_rate": 4.149519660530025e-05, + "loss": 0.2402, + "num_input_tokens_seen": 75088080, + "step": 61700 + }, + { + "epoch": 6.872146118721461, + "grad_norm": 0.6798039078712463, + "learning_rate": 4.14933707389583e-05, + "loss": 0.0555, + "num_input_tokens_seen": 75094448, + "step": 61705 + }, + { + "epoch": 6.872702973605079, + "grad_norm": 0.025172924622893333, + "learning_rate": 4.149154471682326e-05, + "loss": 0.119, + "num_input_tokens_seen": 75099888, + "step": 61710 + }, + { + "epoch": 6.873259828488695, + "grad_norm": 1.1869468688964844, + "learning_rate": 4.148971853891236e-05, + "loss": 0.082, + "num_input_tokens_seen": 75106064, + "step": 61715 + }, + { + "epoch": 6.873816683372313, + "grad_norm": 0.010764075443148613, + "learning_rate": 4.148789220524286e-05, + "loss": 0.002, + "num_input_tokens_seen": 75112272, + "step": 61720 + }, + { + "epoch": 6.874373538255931, + "grad_norm": 1.4612236022949219, + "learning_rate": 4.1486065715832e-05, + "loss": 0.0403, + "num_input_tokens_seen": 75118832, + "step": 61725 + }, + { + "epoch": 6.874930393139548, + "grad_norm": 0.1647976189851761, + "learning_rate": 4.148423907069705e-05, + "loss": 0.0641, + "num_input_tokens_seen": 75124656, + "step": 61730 + }, + { + "epoch": 6.875487248023165, + "grad_norm": 0.5188531279563904, + "learning_rate": 4.148241226985525e-05, + "loss": 0.1048, + "num_input_tokens_seen": 75130288, + "step": 61735 + }, + { + "epoch": 6.876044102906782, + "grad_norm": 0.5041970610618591, + "learning_rate": 4.148058531332386e-05, + "loss": 0.0884, + "num_input_tokens_seen": 75136432, + "step": 61740 + }, + { + "epoch": 6.8766009577904, + "grad_norm": 0.5355154871940613, + "learning_rate": 4.147875820112015e-05, + "loss": 0.0356, + "num_input_tokens_seen": 75142608, + "step": 61745 + }, + { + "epoch": 6.877157812674017, + "grad_norm": 0.44265079498291016, + "learning_rate": 4.1476930933261346e-05, + "loss": 0.0729, + "num_input_tokens_seen": 75148784, + "step": 61750 + }, + { + "epoch": 6.877714667557634, + "grad_norm": 0.07142860442399979, + "learning_rate": 4.147510350976474e-05, + "loss": 0.0747, + "num_input_tokens_seen": 75154576, + "step": 61755 + }, + { + "epoch": 6.878271522441252, + "grad_norm": 0.8084165453910828, + "learning_rate": 4.147327593064759e-05, + "loss": 0.0598, + "num_input_tokens_seen": 75160496, + "step": 61760 + }, + { + "epoch": 6.8788283773248695, + "grad_norm": 1.3703906536102295, + "learning_rate": 4.147144819592713e-05, + "loss": 0.0729, + "num_input_tokens_seen": 75166704, + "step": 61765 + }, + { + "epoch": 6.879385232208486, + "grad_norm": 0.9934437274932861, + "learning_rate": 4.146962030562066e-05, + "loss": 0.1001, + "num_input_tokens_seen": 75172976, + "step": 61770 + }, + { + "epoch": 6.879942087092104, + "grad_norm": 0.8919137120246887, + "learning_rate": 4.146779225974543e-05, + "loss": 0.0613, + "num_input_tokens_seen": 75178928, + "step": 61775 + }, + { + "epoch": 6.880498941975721, + "grad_norm": 0.609798014163971, + "learning_rate": 4.146596405831871e-05, + "loss": 0.1078, + "num_input_tokens_seen": 75184944, + "step": 61780 + }, + { + "epoch": 6.881055796859338, + "grad_norm": 1.1614748239517212, + "learning_rate": 4.146413570135776e-05, + "loss": 0.1553, + "num_input_tokens_seen": 75191152, + "step": 61785 + }, + { + "epoch": 6.881612651742956, + "grad_norm": 0.4202262759208679, + "learning_rate": 4.1462307188879853e-05, + "loss": 0.1312, + "num_input_tokens_seen": 75197520, + "step": 61790 + }, + { + "epoch": 6.882169506626573, + "grad_norm": 0.049686115235090256, + "learning_rate": 4.146047852090228e-05, + "loss": 0.0424, + "num_input_tokens_seen": 75203696, + "step": 61795 + }, + { + "epoch": 6.882726361510191, + "grad_norm": 0.19340398907661438, + "learning_rate": 4.1458649697442284e-05, + "loss": 0.0079, + "num_input_tokens_seen": 75210000, + "step": 61800 + }, + { + "epoch": 6.883283216393808, + "grad_norm": 1.0817701816558838, + "learning_rate": 4.1456820718517165e-05, + "loss": 0.0601, + "num_input_tokens_seen": 75215568, + "step": 61805 + }, + { + "epoch": 6.883840071277425, + "grad_norm": 0.17842915654182434, + "learning_rate": 4.145499158414419e-05, + "loss": 0.0053, + "num_input_tokens_seen": 75221552, + "step": 61810 + }, + { + "epoch": 6.884396926161043, + "grad_norm": 0.0012848381884396076, + "learning_rate": 4.145316229434063e-05, + "loss": 0.0121, + "num_input_tokens_seen": 75227952, + "step": 61815 + }, + { + "epoch": 6.8849537810446595, + "grad_norm": 0.33445143699645996, + "learning_rate": 4.145133284912378e-05, + "loss": 0.0751, + "num_input_tokens_seen": 75233200, + "step": 61820 + }, + { + "epoch": 6.885510635928277, + "grad_norm": 0.0011492253979668021, + "learning_rate": 4.14495032485109e-05, + "loss": 0.0754, + "num_input_tokens_seen": 75239344, + "step": 61825 + }, + { + "epoch": 6.886067490811895, + "grad_norm": 0.8100736737251282, + "learning_rate": 4.144767349251929e-05, + "loss": 0.0126, + "num_input_tokens_seen": 75245520, + "step": 61830 + }, + { + "epoch": 6.886624345695512, + "grad_norm": 0.47393685579299927, + "learning_rate": 4.144584358116622e-05, + "loss": 0.0843, + "num_input_tokens_seen": 75251792, + "step": 61835 + }, + { + "epoch": 6.887181200579129, + "grad_norm": 0.005828526336699724, + "learning_rate": 4.144401351446898e-05, + "loss": 0.0342, + "num_input_tokens_seen": 75257520, + "step": 61840 + }, + { + "epoch": 6.887738055462746, + "grad_norm": 0.890997052192688, + "learning_rate": 4.144218329244487e-05, + "loss": 0.1067, + "num_input_tokens_seen": 75263472, + "step": 61845 + }, + { + "epoch": 6.888294910346364, + "grad_norm": 0.050367068499326706, + "learning_rate": 4.144035291511116e-05, + "loss": 0.0036, + "num_input_tokens_seen": 75269840, + "step": 61850 + }, + { + "epoch": 6.888851765229981, + "grad_norm": 0.13879388570785522, + "learning_rate": 4.1438522382485134e-05, + "loss": 0.0398, + "num_input_tokens_seen": 75276272, + "step": 61855 + }, + { + "epoch": 6.889408620113598, + "grad_norm": 0.005660048220306635, + "learning_rate": 4.1436691694584104e-05, + "loss": 0.136, + "num_input_tokens_seen": 75282384, + "step": 61860 + }, + { + "epoch": 6.889965474997216, + "grad_norm": 0.693832516670227, + "learning_rate": 4.143486085142535e-05, + "loss": 0.0343, + "num_input_tokens_seen": 75288432, + "step": 61865 + }, + { + "epoch": 6.890522329880833, + "grad_norm": 0.07955393195152283, + "learning_rate": 4.143302985302617e-05, + "loss": 0.0754, + "num_input_tokens_seen": 75294320, + "step": 61870 + }, + { + "epoch": 6.89107918476445, + "grad_norm": 2.395585536956787, + "learning_rate": 4.143119869940385e-05, + "loss": 0.0391, + "num_input_tokens_seen": 75300496, + "step": 61875 + }, + { + "epoch": 6.891636039648068, + "grad_norm": 0.041087981313467026, + "learning_rate": 4.1429367390575704e-05, + "loss": 0.1001, + "num_input_tokens_seen": 75306672, + "step": 61880 + }, + { + "epoch": 6.892192894531685, + "grad_norm": 0.028965717181563377, + "learning_rate": 4.142753592655901e-05, + "loss": 0.1149, + "num_input_tokens_seen": 75312816, + "step": 61885 + }, + { + "epoch": 6.8927497494153025, + "grad_norm": 0.6038068532943726, + "learning_rate": 4.142570430737109e-05, + "loss": 0.016, + "num_input_tokens_seen": 75318544, + "step": 61890 + }, + { + "epoch": 6.893306604298919, + "grad_norm": 1.8610827922821045, + "learning_rate": 4.142387253302922e-05, + "loss": 0.1123, + "num_input_tokens_seen": 75324304, + "step": 61895 + }, + { + "epoch": 6.893863459182537, + "grad_norm": 2.2261178493499756, + "learning_rate": 4.1422040603550725e-05, + "loss": 0.1957, + "num_input_tokens_seen": 75330800, + "step": 61900 + }, + { + "epoch": 6.894420314066155, + "grad_norm": 0.0005072260973975062, + "learning_rate": 4.142020851895289e-05, + "loss": 0.1037, + "num_input_tokens_seen": 75336848, + "step": 61905 + }, + { + "epoch": 6.894977168949771, + "grad_norm": 0.008976967073976994, + "learning_rate": 4.141837627925304e-05, + "loss": 0.0544, + "num_input_tokens_seen": 75343184, + "step": 61910 + }, + { + "epoch": 6.895534023833389, + "grad_norm": 0.03571793809533119, + "learning_rate": 4.141654388446846e-05, + "loss": 0.0162, + "num_input_tokens_seen": 75349296, + "step": 61915 + }, + { + "epoch": 6.896090878717006, + "grad_norm": 1.1702474355697632, + "learning_rate": 4.141471133461649e-05, + "loss": 0.1264, + "num_input_tokens_seen": 75355440, + "step": 61920 + }, + { + "epoch": 6.8966477336006236, + "grad_norm": 0.7846246957778931, + "learning_rate": 4.1412878629714404e-05, + "loss": 0.0636, + "num_input_tokens_seen": 75361072, + "step": 61925 + }, + { + "epoch": 6.897204588484241, + "grad_norm": 0.21988557279109955, + "learning_rate": 4.141104576977953e-05, + "loss": 0.0355, + "num_input_tokens_seen": 75367024, + "step": 61930 + }, + { + "epoch": 6.897761443367858, + "grad_norm": 0.0009893645765259862, + "learning_rate": 4.140921275482918e-05, + "loss": 0.1261, + "num_input_tokens_seen": 75373168, + "step": 61935 + }, + { + "epoch": 6.898318298251476, + "grad_norm": 0.0073933410458266735, + "learning_rate": 4.140737958488067e-05, + "loss": 0.0327, + "num_input_tokens_seen": 75379184, + "step": 61940 + }, + { + "epoch": 6.898875153135093, + "grad_norm": 0.20276930928230286, + "learning_rate": 4.140554625995132e-05, + "loss": 0.0715, + "num_input_tokens_seen": 75385424, + "step": 61945 + }, + { + "epoch": 6.89943200801871, + "grad_norm": 0.005193155724555254, + "learning_rate": 4.1403712780058436e-05, + "loss": 0.0189, + "num_input_tokens_seen": 75391184, + "step": 61950 + }, + { + "epoch": 6.899988862902328, + "grad_norm": 0.14656101167201996, + "learning_rate": 4.1401879145219343e-05, + "loss": 0.1018, + "num_input_tokens_seen": 75397680, + "step": 61955 + }, + { + "epoch": 6.900545717785945, + "grad_norm": 0.0007186870789155364, + "learning_rate": 4.1400045355451366e-05, + "loss": 0.0597, + "num_input_tokens_seen": 75403344, + "step": 61960 + }, + { + "epoch": 6.901102572669562, + "grad_norm": 0.24118340015411377, + "learning_rate": 4.1398211410771816e-05, + "loss": 0.1055, + "num_input_tokens_seen": 75409456, + "step": 61965 + }, + { + "epoch": 6.90165942755318, + "grad_norm": 0.0020773890428245068, + "learning_rate": 4.139637731119802e-05, + "loss": 0.0274, + "num_input_tokens_seen": 75416016, + "step": 61970 + }, + { + "epoch": 6.902216282436797, + "grad_norm": 1.5749013423919678, + "learning_rate": 4.139454305674731e-05, + "loss": 0.1372, + "num_input_tokens_seen": 75421872, + "step": 61975 + }, + { + "epoch": 6.902773137320414, + "grad_norm": 0.485731840133667, + "learning_rate": 4.1392708647436995e-05, + "loss": 0.0259, + "num_input_tokens_seen": 75427984, + "step": 61980 + }, + { + "epoch": 6.903329992204032, + "grad_norm": 0.1698746383190155, + "learning_rate": 4.1390874083284426e-05, + "loss": 0.0305, + "num_input_tokens_seen": 75434128, + "step": 61985 + }, + { + "epoch": 6.903886847087649, + "grad_norm": 0.10759326070547104, + "learning_rate": 4.138903936430691e-05, + "loss": 0.1384, + "num_input_tokens_seen": 75440272, + "step": 61990 + }, + { + "epoch": 6.904443701971267, + "grad_norm": 0.01597992517054081, + "learning_rate": 4.1387204490521794e-05, + "loss": 0.0538, + "num_input_tokens_seen": 75446320, + "step": 61995 + }, + { + "epoch": 6.905000556854883, + "grad_norm": 0.2255251407623291, + "learning_rate": 4.13853694619464e-05, + "loss": 0.0338, + "num_input_tokens_seen": 75452688, + "step": 62000 + }, + { + "epoch": 6.905557411738501, + "grad_norm": 0.2421385943889618, + "learning_rate": 4.1383534278598055e-05, + "loss": 0.124, + "num_input_tokens_seen": 75458736, + "step": 62005 + }, + { + "epoch": 6.906114266622119, + "grad_norm": 1.1596564054489136, + "learning_rate": 4.1381698940494114e-05, + "loss": 0.1131, + "num_input_tokens_seen": 75464944, + "step": 62010 + }, + { + "epoch": 6.9066711215057355, + "grad_norm": 0.010679914616048336, + "learning_rate": 4.13798634476519e-05, + "loss": 0.0596, + "num_input_tokens_seen": 75471056, + "step": 62015 + }, + { + "epoch": 6.907227976389353, + "grad_norm": 0.6213107109069824, + "learning_rate": 4.1378027800088745e-05, + "loss": 0.0745, + "num_input_tokens_seen": 75477168, + "step": 62020 + }, + { + "epoch": 6.90778483127297, + "grad_norm": 0.6898428797721863, + "learning_rate": 4.1376191997822e-05, + "loss": 0.0502, + "num_input_tokens_seen": 75483536, + "step": 62025 + }, + { + "epoch": 6.908341686156588, + "grad_norm": 0.059635940939188004, + "learning_rate": 4.1374356040869e-05, + "loss": 0.05, + "num_input_tokens_seen": 75489840, + "step": 62030 + }, + { + "epoch": 6.908898541040205, + "grad_norm": 0.7103794813156128, + "learning_rate": 4.1372519929247086e-05, + "loss": 0.1498, + "num_input_tokens_seen": 75495792, + "step": 62035 + }, + { + "epoch": 6.909455395923822, + "grad_norm": 1.4724913835525513, + "learning_rate": 4.137068366297361e-05, + "loss": 0.0703, + "num_input_tokens_seen": 75501872, + "step": 62040 + }, + { + "epoch": 6.91001225080744, + "grad_norm": 0.000263955385889858, + "learning_rate": 4.1368847242065914e-05, + "loss": 0.006, + "num_input_tokens_seen": 75508048, + "step": 62045 + }, + { + "epoch": 6.9105691056910565, + "grad_norm": 0.003382985247299075, + "learning_rate": 4.1367010666541325e-05, + "loss": 0.0582, + "num_input_tokens_seen": 75514224, + "step": 62050 + }, + { + "epoch": 6.911125960574674, + "grad_norm": 0.8023414015769958, + "learning_rate": 4.1365173936417225e-05, + "loss": 0.0509, + "num_input_tokens_seen": 75520336, + "step": 62055 + }, + { + "epoch": 6.911682815458292, + "grad_norm": 0.0018029686762019992, + "learning_rate": 4.136333705171094e-05, + "loss": 0.0646, + "num_input_tokens_seen": 75526640, + "step": 62060 + }, + { + "epoch": 6.912239670341909, + "grad_norm": 0.5501561164855957, + "learning_rate": 4.1361500012439824e-05, + "loss": 0.0249, + "num_input_tokens_seen": 75532976, + "step": 62065 + }, + { + "epoch": 6.912796525225526, + "grad_norm": 0.07103315740823746, + "learning_rate": 4.1359662818621225e-05, + "loss": 0.0672, + "num_input_tokens_seen": 75539120, + "step": 62070 + }, + { + "epoch": 6.913353380109143, + "grad_norm": 0.10302309691905975, + "learning_rate": 4.135782547027252e-05, + "loss": 0.084, + "num_input_tokens_seen": 75544912, + "step": 62075 + }, + { + "epoch": 6.913910234992761, + "grad_norm": 0.06989571452140808, + "learning_rate": 4.135598796741103e-05, + "loss": 0.011, + "num_input_tokens_seen": 75550736, + "step": 62080 + }, + { + "epoch": 6.9144670898763785, + "grad_norm": 0.5756889581680298, + "learning_rate": 4.135415031005414e-05, + "loss": 0.2159, + "num_input_tokens_seen": 75556624, + "step": 62085 + }, + { + "epoch": 6.915023944759995, + "grad_norm": 0.7815561294555664, + "learning_rate": 4.1352312498219196e-05, + "loss": 0.0433, + "num_input_tokens_seen": 75562672, + "step": 62090 + }, + { + "epoch": 6.915580799643613, + "grad_norm": 1.5454719066619873, + "learning_rate": 4.1350474531923564e-05, + "loss": 0.1089, + "num_input_tokens_seen": 75568784, + "step": 62095 + }, + { + "epoch": 6.91613765452723, + "grad_norm": 0.04110722243785858, + "learning_rate": 4.134863641118459e-05, + "loss": 0.0461, + "num_input_tokens_seen": 75575248, + "step": 62100 + }, + { + "epoch": 6.916694509410847, + "grad_norm": 0.6648277640342712, + "learning_rate": 4.134679813601965e-05, + "loss": 0.1616, + "num_input_tokens_seen": 75581680, + "step": 62105 + }, + { + "epoch": 6.917251364294465, + "grad_norm": 0.11412545293569565, + "learning_rate": 4.1344959706446104e-05, + "loss": 0.0336, + "num_input_tokens_seen": 75587888, + "step": 62110 + }, + { + "epoch": 6.917808219178082, + "grad_norm": 1.377055287361145, + "learning_rate": 4.134312112248133e-05, + "loss": 0.0879, + "num_input_tokens_seen": 75594160, + "step": 62115 + }, + { + "epoch": 6.9183650740616995, + "grad_norm": 0.4833468794822693, + "learning_rate": 4.134128238414266e-05, + "loss": 0.1489, + "num_input_tokens_seen": 75600304, + "step": 62120 + }, + { + "epoch": 6.918921928945317, + "grad_norm": 0.2868199348449707, + "learning_rate": 4.133944349144751e-05, + "loss": 0.0857, + "num_input_tokens_seen": 75606768, + "step": 62125 + }, + { + "epoch": 6.919478783828934, + "grad_norm": 0.1778208613395691, + "learning_rate": 4.1337604444413217e-05, + "loss": 0.0923, + "num_input_tokens_seen": 75612656, + "step": 62130 + }, + { + "epoch": 6.920035638712552, + "grad_norm": 1.5137287378311157, + "learning_rate": 4.133576524305716e-05, + "loss": 0.0557, + "num_input_tokens_seen": 75618832, + "step": 62135 + }, + { + "epoch": 6.9205924935961685, + "grad_norm": 0.0033637380693107843, + "learning_rate": 4.1333925887396706e-05, + "loss": 0.0346, + "num_input_tokens_seen": 75624944, + "step": 62140 + }, + { + "epoch": 6.921149348479786, + "grad_norm": 0.14786052703857422, + "learning_rate": 4.1332086377449244e-05, + "loss": 0.0386, + "num_input_tokens_seen": 75630928, + "step": 62145 + }, + { + "epoch": 6.921706203363404, + "grad_norm": 0.679165244102478, + "learning_rate": 4.133024671323213e-05, + "loss": 0.1271, + "num_input_tokens_seen": 75636848, + "step": 62150 + }, + { + "epoch": 6.922263058247021, + "grad_norm": 0.47414639592170715, + "learning_rate": 4.132840689476276e-05, + "loss": 0.0222, + "num_input_tokens_seen": 75643216, + "step": 62155 + }, + { + "epoch": 6.922819913130638, + "grad_norm": 0.0750388652086258, + "learning_rate": 4.132656692205851e-05, + "loss": 0.1164, + "num_input_tokens_seen": 75649392, + "step": 62160 + }, + { + "epoch": 6.923376768014256, + "grad_norm": 0.02911161072552204, + "learning_rate": 4.132472679513675e-05, + "loss": 0.0248, + "num_input_tokens_seen": 75655696, + "step": 62165 + }, + { + "epoch": 6.923933622897873, + "grad_norm": 0.19884610176086426, + "learning_rate": 4.1322886514014855e-05, + "loss": 0.089, + "num_input_tokens_seen": 75661680, + "step": 62170 + }, + { + "epoch": 6.92449047778149, + "grad_norm": 0.07742474973201752, + "learning_rate": 4.132104607871024e-05, + "loss": 0.0106, + "num_input_tokens_seen": 75667792, + "step": 62175 + }, + { + "epoch": 6.925047332665107, + "grad_norm": 0.9851770401000977, + "learning_rate": 4.1319205489240256e-05, + "loss": 0.11, + "num_input_tokens_seen": 75673584, + "step": 62180 + }, + { + "epoch": 6.925604187548725, + "grad_norm": 1.3726459741592407, + "learning_rate": 4.13173647456223e-05, + "loss": 0.0518, + "num_input_tokens_seen": 75679760, + "step": 62185 + }, + { + "epoch": 6.9261610424323425, + "grad_norm": 0.5240424275398254, + "learning_rate": 4.1315523847873764e-05, + "loss": 0.0322, + "num_input_tokens_seen": 75685392, + "step": 62190 + }, + { + "epoch": 6.926717897315959, + "grad_norm": 0.5742108821868896, + "learning_rate": 4.1313682796012034e-05, + "loss": 0.0685, + "num_input_tokens_seen": 75691568, + "step": 62195 + }, + { + "epoch": 6.927274752199577, + "grad_norm": 0.2722267210483551, + "learning_rate": 4.13118415900545e-05, + "loss": 0.0122, + "num_input_tokens_seen": 75697424, + "step": 62200 + }, + { + "epoch": 6.927831607083194, + "grad_norm": 0.15710797905921936, + "learning_rate": 4.1310000230018555e-05, + "loss": 0.0224, + "num_input_tokens_seen": 75703664, + "step": 62205 + }, + { + "epoch": 6.9283884619668115, + "grad_norm": 0.0006143338396213949, + "learning_rate": 4.130815871592159e-05, + "loss": 0.1335, + "num_input_tokens_seen": 75709712, + "step": 62210 + }, + { + "epoch": 6.928945316850429, + "grad_norm": 0.275698721408844, + "learning_rate": 4.1306317047780994e-05, + "loss": 0.1878, + "num_input_tokens_seen": 75715920, + "step": 62215 + }, + { + "epoch": 6.929502171734046, + "grad_norm": 0.006681437138468027, + "learning_rate": 4.130447522561417e-05, + "loss": 0.0017, + "num_input_tokens_seen": 75721840, + "step": 62220 + }, + { + "epoch": 6.930059026617664, + "grad_norm": 1.3825358152389526, + "learning_rate": 4.130263324943852e-05, + "loss": 0.045, + "num_input_tokens_seen": 75728112, + "step": 62225 + }, + { + "epoch": 6.93061588150128, + "grad_norm": 0.00971962884068489, + "learning_rate": 4.130079111927144e-05, + "loss": 0.0866, + "num_input_tokens_seen": 75733744, + "step": 62230 + }, + { + "epoch": 6.931172736384898, + "grad_norm": 1.4183688163757324, + "learning_rate": 4.1298948835130315e-05, + "loss": 0.0993, + "num_input_tokens_seen": 75739792, + "step": 62235 + }, + { + "epoch": 6.931729591268516, + "grad_norm": 0.01150089967995882, + "learning_rate": 4.129710639703257e-05, + "loss": 0.0877, + "num_input_tokens_seen": 75746128, + "step": 62240 + }, + { + "epoch": 6.9322864461521325, + "grad_norm": 0.017664996907114983, + "learning_rate": 4.12952638049956e-05, + "loss": 0.0882, + "num_input_tokens_seen": 75752272, + "step": 62245 + }, + { + "epoch": 6.93284330103575, + "grad_norm": 1.6626051664352417, + "learning_rate": 4.1293421059036805e-05, + "loss": 0.0629, + "num_input_tokens_seen": 75758096, + "step": 62250 + }, + { + "epoch": 6.933400155919367, + "grad_norm": 0.36590149998664856, + "learning_rate": 4.129157815917359e-05, + "loss": 0.1598, + "num_input_tokens_seen": 75763856, + "step": 62255 + }, + { + "epoch": 6.933957010802985, + "grad_norm": 0.21272224187850952, + "learning_rate": 4.128973510542337e-05, + "loss": 0.0824, + "num_input_tokens_seen": 75769680, + "step": 62260 + }, + { + "epoch": 6.934513865686602, + "grad_norm": 0.6609894633293152, + "learning_rate": 4.128789189780355e-05, + "loss": 0.0921, + "num_input_tokens_seen": 75776048, + "step": 62265 + }, + { + "epoch": 6.935070720570219, + "grad_norm": 0.0004777682479470968, + "learning_rate": 4.128604853633154e-05, + "loss": 0.0247, + "num_input_tokens_seen": 75782416, + "step": 62270 + }, + { + "epoch": 6.935627575453837, + "grad_norm": 1.4936293363571167, + "learning_rate": 4.128420502102476e-05, + "loss": 0.0749, + "num_input_tokens_seen": 75788624, + "step": 62275 + }, + { + "epoch": 6.936184430337454, + "grad_norm": 0.7971547842025757, + "learning_rate": 4.1282361351900613e-05, + "loss": 0.1278, + "num_input_tokens_seen": 75794800, + "step": 62280 + }, + { + "epoch": 6.936741285221071, + "grad_norm": 0.25561630725860596, + "learning_rate": 4.128051752897651e-05, + "loss": 0.0657, + "num_input_tokens_seen": 75801136, + "step": 62285 + }, + { + "epoch": 6.937298140104689, + "grad_norm": 0.1826784759759903, + "learning_rate": 4.127867355226989e-05, + "loss": 0.0247, + "num_input_tokens_seen": 75807408, + "step": 62290 + }, + { + "epoch": 6.937854994988306, + "grad_norm": 0.09514456242322922, + "learning_rate": 4.1276829421798146e-05, + "loss": 0.0785, + "num_input_tokens_seen": 75813488, + "step": 62295 + }, + { + "epoch": 6.938411849871923, + "grad_norm": 0.9915470480918884, + "learning_rate": 4.12749851375787e-05, + "loss": 0.0583, + "num_input_tokens_seen": 75819984, + "step": 62300 + }, + { + "epoch": 6.938968704755541, + "grad_norm": 0.07115953415632248, + "learning_rate": 4.127314069962899e-05, + "loss": 0.0871, + "num_input_tokens_seen": 75826128, + "step": 62305 + }, + { + "epoch": 6.939525559639158, + "grad_norm": 1.8756892681121826, + "learning_rate": 4.1271296107966426e-05, + "loss": 0.0699, + "num_input_tokens_seen": 75832272, + "step": 62310 + }, + { + "epoch": 6.9400824145227755, + "grad_norm": 0.00969543308019638, + "learning_rate": 4.126945136260844e-05, + "loss": 0.0284, + "num_input_tokens_seen": 75838512, + "step": 62315 + }, + { + "epoch": 6.940639269406392, + "grad_norm": 1.6240923404693604, + "learning_rate": 4.126760646357245e-05, + "loss": 0.0962, + "num_input_tokens_seen": 75844080, + "step": 62320 + }, + { + "epoch": 6.94119612429001, + "grad_norm": 0.7047348022460938, + "learning_rate": 4.126576141087588e-05, + "loss": 0.0357, + "num_input_tokens_seen": 75850224, + "step": 62325 + }, + { + "epoch": 6.941752979173628, + "grad_norm": 0.7808792591094971, + "learning_rate": 4.1263916204536156e-05, + "loss": 0.1561, + "num_input_tokens_seen": 75856336, + "step": 62330 + }, + { + "epoch": 6.9423098340572444, + "grad_norm": 0.8145408630371094, + "learning_rate": 4.126207084457072e-05, + "loss": 0.0898, + "num_input_tokens_seen": 75862320, + "step": 62335 + }, + { + "epoch": 6.942866688940862, + "grad_norm": 1.068156123161316, + "learning_rate": 4.1260225330997e-05, + "loss": 0.0981, + "num_input_tokens_seen": 75868464, + "step": 62340 + }, + { + "epoch": 6.94342354382448, + "grad_norm": 0.5675354599952698, + "learning_rate": 4.125837966383241e-05, + "loss": 0.0932, + "num_input_tokens_seen": 75874416, + "step": 62345 + }, + { + "epoch": 6.943980398708097, + "grad_norm": 0.4185000956058502, + "learning_rate": 4.125653384309441e-05, + "loss": 0.0648, + "num_input_tokens_seen": 75880464, + "step": 62350 + }, + { + "epoch": 6.944537253591714, + "grad_norm": 1.4089820384979248, + "learning_rate": 4.125468786880042e-05, + "loss": 0.1244, + "num_input_tokens_seen": 75886192, + "step": 62355 + }, + { + "epoch": 6.945094108475331, + "grad_norm": 0.01745087094604969, + "learning_rate": 4.1252841740967886e-05, + "loss": 0.1419, + "num_input_tokens_seen": 75892112, + "step": 62360 + }, + { + "epoch": 6.945650963358949, + "grad_norm": 0.1928158849477768, + "learning_rate": 4.1250995459614234e-05, + "loss": 0.0383, + "num_input_tokens_seen": 75898384, + "step": 62365 + }, + { + "epoch": 6.946207818242566, + "grad_norm": 1.9728158712387085, + "learning_rate": 4.124914902475691e-05, + "loss": 0.2417, + "num_input_tokens_seen": 75904432, + "step": 62370 + }, + { + "epoch": 6.946764673126183, + "grad_norm": 0.0071936193853616714, + "learning_rate": 4.124730243641336e-05, + "loss": 0.0773, + "num_input_tokens_seen": 75910736, + "step": 62375 + }, + { + "epoch": 6.947321528009801, + "grad_norm": 0.05526285991072655, + "learning_rate": 4.124545569460101e-05, + "loss": 0.0378, + "num_input_tokens_seen": 75916688, + "step": 62380 + }, + { + "epoch": 6.947878382893418, + "grad_norm": 0.08698806166648865, + "learning_rate": 4.124360879933732e-05, + "loss": 0.0294, + "num_input_tokens_seen": 75922800, + "step": 62385 + }, + { + "epoch": 6.948435237777035, + "grad_norm": 0.05849043279886246, + "learning_rate": 4.124176175063974e-05, + "loss": 0.0398, + "num_input_tokens_seen": 75928752, + "step": 62390 + }, + { + "epoch": 6.948992092660653, + "grad_norm": 0.07714760303497314, + "learning_rate": 4.1239914548525705e-05, + "loss": 0.0211, + "num_input_tokens_seen": 75934896, + "step": 62395 + }, + { + "epoch": 6.94954894754427, + "grad_norm": 0.15917405486106873, + "learning_rate": 4.1238067193012656e-05, + "loss": 0.0688, + "num_input_tokens_seen": 75940944, + "step": 62400 + }, + { + "epoch": 6.9501058024278874, + "grad_norm": 0.27378660440444946, + "learning_rate": 4.123621968411806e-05, + "loss": 0.0255, + "num_input_tokens_seen": 75947152, + "step": 62405 + }, + { + "epoch": 6.950662657311504, + "grad_norm": 0.0970081314444542, + "learning_rate": 4.1234372021859355e-05, + "loss": 0.0073, + "num_input_tokens_seen": 75953232, + "step": 62410 + }, + { + "epoch": 6.951219512195122, + "grad_norm": 0.08774314075708389, + "learning_rate": 4.123252420625401e-05, + "loss": 0.007, + "num_input_tokens_seen": 75959568, + "step": 62415 + }, + { + "epoch": 6.95177636707874, + "grad_norm": 0.005758730228990316, + "learning_rate": 4.1230676237319454e-05, + "loss": 0.0781, + "num_input_tokens_seen": 75965520, + "step": 62420 + }, + { + "epoch": 6.952333221962356, + "grad_norm": 0.4800401031970978, + "learning_rate": 4.122882811507317e-05, + "loss": 0.1154, + "num_input_tokens_seen": 75971344, + "step": 62425 + }, + { + "epoch": 6.952890076845974, + "grad_norm": 0.1838470995426178, + "learning_rate": 4.12269798395326e-05, + "loss": 0.0085, + "num_input_tokens_seen": 75977616, + "step": 62430 + }, + { + "epoch": 6.953446931729591, + "grad_norm": 0.01566169038414955, + "learning_rate": 4.12251314107152e-05, + "loss": 0.0535, + "num_input_tokens_seen": 75983632, + "step": 62435 + }, + { + "epoch": 6.9540037866132085, + "grad_norm": 0.46918419003486633, + "learning_rate": 4.1223282828638434e-05, + "loss": 0.0689, + "num_input_tokens_seen": 75989712, + "step": 62440 + }, + { + "epoch": 6.954560641496826, + "grad_norm": 0.12141662836074829, + "learning_rate": 4.1221434093319766e-05, + "loss": 0.078, + "num_input_tokens_seen": 75995728, + "step": 62445 + }, + { + "epoch": 6.955117496380443, + "grad_norm": 0.3092508912086487, + "learning_rate": 4.121958520477666e-05, + "loss": 0.0166, + "num_input_tokens_seen": 76001808, + "step": 62450 + }, + { + "epoch": 6.955674351264061, + "grad_norm": 0.04440990462899208, + "learning_rate": 4.121773616302656e-05, + "loss": 0.1549, + "num_input_tokens_seen": 76007920, + "step": 62455 + }, + { + "epoch": 6.956231206147677, + "grad_norm": 0.0004939821665175259, + "learning_rate": 4.121588696808697e-05, + "loss": 0.0656, + "num_input_tokens_seen": 76013968, + "step": 62460 + }, + { + "epoch": 6.956788061031295, + "grad_norm": 0.002619258128106594, + "learning_rate": 4.1214037619975334e-05, + "loss": 0.0481, + "num_input_tokens_seen": 76019792, + "step": 62465 + }, + { + "epoch": 6.957344915914913, + "grad_norm": 0.3631478548049927, + "learning_rate": 4.121218811870911e-05, + "loss": 0.0184, + "num_input_tokens_seen": 76025840, + "step": 62470 + }, + { + "epoch": 6.95790177079853, + "grad_norm": 0.06588336080312729, + "learning_rate": 4.1210338464305784e-05, + "loss": 0.047, + "num_input_tokens_seen": 76032112, + "step": 62475 + }, + { + "epoch": 6.958458625682147, + "grad_norm": 0.7093575596809387, + "learning_rate": 4.1208488656782826e-05, + "loss": 0.0886, + "num_input_tokens_seen": 76038256, + "step": 62480 + }, + { + "epoch": 6.959015480565765, + "grad_norm": 0.0012276310008019209, + "learning_rate": 4.120663869615771e-05, + "loss": 0.0021, + "num_input_tokens_seen": 76044496, + "step": 62485 + }, + { + "epoch": 6.959572335449382, + "grad_norm": 0.7771748900413513, + "learning_rate": 4.12047885824479e-05, + "loss": 0.0273, + "num_input_tokens_seen": 76050544, + "step": 62490 + }, + { + "epoch": 6.960129190332999, + "grad_norm": 0.047394171357154846, + "learning_rate": 4.120293831567088e-05, + "loss": 0.0209, + "num_input_tokens_seen": 76056496, + "step": 62495 + }, + { + "epoch": 6.960686045216617, + "grad_norm": 0.16944076120853424, + "learning_rate": 4.1201087895844134e-05, + "loss": 0.0172, + "num_input_tokens_seen": 76062512, + "step": 62500 + }, + { + "epoch": 6.961242900100234, + "grad_norm": 0.34204113483428955, + "learning_rate": 4.1199237322985126e-05, + "loss": 0.0189, + "num_input_tokens_seen": 76068752, + "step": 62505 + }, + { + "epoch": 6.9617997549838515, + "grad_norm": 0.05758245289325714, + "learning_rate": 4.1197386597111344e-05, + "loss": 0.0014, + "num_input_tokens_seen": 76074960, + "step": 62510 + }, + { + "epoch": 6.962356609867468, + "grad_norm": 0.007215210236608982, + "learning_rate": 4.1195535718240264e-05, + "loss": 0.043, + "num_input_tokens_seen": 76081360, + "step": 62515 + }, + { + "epoch": 6.962913464751086, + "grad_norm": 1.8948525190353394, + "learning_rate": 4.1193684686389376e-05, + "loss": 0.0635, + "num_input_tokens_seen": 76087408, + "step": 62520 + }, + { + "epoch": 6.963470319634704, + "grad_norm": 1.039574384689331, + "learning_rate": 4.119183350157617e-05, + "loss": 0.0271, + "num_input_tokens_seen": 76093552, + "step": 62525 + }, + { + "epoch": 6.96402717451832, + "grad_norm": 0.45211708545684814, + "learning_rate": 4.118998216381811e-05, + "loss": 0.0491, + "num_input_tokens_seen": 76099248, + "step": 62530 + }, + { + "epoch": 6.964584029401938, + "grad_norm": 1.4843714237213135, + "learning_rate": 4.118813067313271e-05, + "loss": 0.106, + "num_input_tokens_seen": 76105520, + "step": 62535 + }, + { + "epoch": 6.965140884285555, + "grad_norm": 0.3480544984340668, + "learning_rate": 4.1186279029537447e-05, + "loss": 0.082, + "num_input_tokens_seen": 76111888, + "step": 62540 + }, + { + "epoch": 6.965697739169173, + "grad_norm": 0.0011013440089300275, + "learning_rate": 4.118442723304979e-05, + "loss": 0.0231, + "num_input_tokens_seen": 76118096, + "step": 62545 + }, + { + "epoch": 6.96625459405279, + "grad_norm": 0.0009497989667579532, + "learning_rate": 4.118257528368728e-05, + "loss": 0.0303, + "num_input_tokens_seen": 76124144, + "step": 62550 + }, + { + "epoch": 6.966811448936407, + "grad_norm": 0.007711244281381369, + "learning_rate": 4.118072318146736e-05, + "loss": 0.0234, + "num_input_tokens_seen": 76130320, + "step": 62555 + }, + { + "epoch": 6.967368303820025, + "grad_norm": 0.6743878126144409, + "learning_rate": 4.1178870926407555e-05, + "loss": 0.0994, + "num_input_tokens_seen": 76136368, + "step": 62560 + }, + { + "epoch": 6.9679251587036415, + "grad_norm": 0.35177406668663025, + "learning_rate": 4.1177018518525345e-05, + "loss": 0.0374, + "num_input_tokens_seen": 76142288, + "step": 62565 + }, + { + "epoch": 6.968482013587259, + "grad_norm": 0.7600367069244385, + "learning_rate": 4.1175165957838236e-05, + "loss": 0.047, + "num_input_tokens_seen": 76148464, + "step": 62570 + }, + { + "epoch": 6.969038868470877, + "grad_norm": 0.002247363794595003, + "learning_rate": 4.117331324436373e-05, + "loss": 0.0256, + "num_input_tokens_seen": 76154448, + "step": 62575 + }, + { + "epoch": 6.969595723354494, + "grad_norm": 0.5156875848770142, + "learning_rate": 4.117146037811932e-05, + "loss": 0.0162, + "num_input_tokens_seen": 76160848, + "step": 62580 + }, + { + "epoch": 6.970152578238111, + "grad_norm": 0.41828563809394836, + "learning_rate": 4.116960735912251e-05, + "loss": 0.0285, + "num_input_tokens_seen": 76166800, + "step": 62585 + }, + { + "epoch": 6.970709433121728, + "grad_norm": 0.1423560380935669, + "learning_rate": 4.11677541873908e-05, + "loss": 0.065, + "num_input_tokens_seen": 76173104, + "step": 62590 + }, + { + "epoch": 6.971266288005346, + "grad_norm": 0.00712224468588829, + "learning_rate": 4.116590086294171e-05, + "loss": 0.0565, + "num_input_tokens_seen": 76179312, + "step": 62595 + }, + { + "epoch": 6.971823142888963, + "grad_norm": 0.5316850543022156, + "learning_rate": 4.1164047385792726e-05, + "loss": 0.0407, + "num_input_tokens_seen": 76185488, + "step": 62600 + }, + { + "epoch": 6.97237999777258, + "grad_norm": 0.0005079643451608717, + "learning_rate": 4.116219375596136e-05, + "loss": 0.0741, + "num_input_tokens_seen": 76191472, + "step": 62605 + }, + { + "epoch": 6.972936852656198, + "grad_norm": 0.6938892006874084, + "learning_rate": 4.116033997346514e-05, + "loss": 0.0242, + "num_input_tokens_seen": 76197456, + "step": 62610 + }, + { + "epoch": 6.973493707539815, + "grad_norm": 0.2134069949388504, + "learning_rate": 4.115848603832154e-05, + "loss": 0.1033, + "num_input_tokens_seen": 76203408, + "step": 62615 + }, + { + "epoch": 6.974050562423432, + "grad_norm": 0.5499680638313293, + "learning_rate": 4.115663195054811e-05, + "loss": 0.1475, + "num_input_tokens_seen": 76209456, + "step": 62620 + }, + { + "epoch": 6.97460741730705, + "grad_norm": 0.02208442986011505, + "learning_rate": 4.115477771016234e-05, + "loss": 0.0502, + "num_input_tokens_seen": 76215632, + "step": 62625 + }, + { + "epoch": 6.975164272190667, + "grad_norm": 0.2910808026790619, + "learning_rate": 4.115292331718175e-05, + "loss": 0.0417, + "num_input_tokens_seen": 76221712, + "step": 62630 + }, + { + "epoch": 6.9757211270742845, + "grad_norm": 0.5647468566894531, + "learning_rate": 4.1151068771623866e-05, + "loss": 0.0939, + "num_input_tokens_seen": 76227888, + "step": 62635 + }, + { + "epoch": 6.976277981957901, + "grad_norm": 0.01512989029288292, + "learning_rate": 4.1149214073506184e-05, + "loss": 0.0402, + "num_input_tokens_seen": 76234384, + "step": 62640 + }, + { + "epoch": 6.976834836841519, + "grad_norm": 0.05476037412881851, + "learning_rate": 4.114735922284625e-05, + "loss": 0.0426, + "num_input_tokens_seen": 76240528, + "step": 62645 + }, + { + "epoch": 6.977391691725137, + "grad_norm": 0.19725364446640015, + "learning_rate": 4.114550421966157e-05, + "loss": 0.0713, + "num_input_tokens_seen": 76246640, + "step": 62650 + }, + { + "epoch": 6.977948546608753, + "grad_norm": 0.06283392012119293, + "learning_rate": 4.114364906396966e-05, + "loss": 0.0108, + "num_input_tokens_seen": 76252656, + "step": 62655 + }, + { + "epoch": 6.978505401492371, + "grad_norm": 0.26611167192459106, + "learning_rate": 4.114179375578805e-05, + "loss": 0.051, + "num_input_tokens_seen": 76259152, + "step": 62660 + }, + { + "epoch": 6.979062256375989, + "grad_norm": 0.005203406326472759, + "learning_rate": 4.113993829513427e-05, + "loss": 0.0952, + "num_input_tokens_seen": 76265200, + "step": 62665 + }, + { + "epoch": 6.9796191112596055, + "grad_norm": 0.05262318626046181, + "learning_rate": 4.1138082682025836e-05, + "loss": 0.0063, + "num_input_tokens_seen": 76271408, + "step": 62670 + }, + { + "epoch": 6.980175966143223, + "grad_norm": 0.013346449472010136, + "learning_rate": 4.113622691648029e-05, + "loss": 0.0267, + "num_input_tokens_seen": 76277488, + "step": 62675 + }, + { + "epoch": 6.980732821026841, + "grad_norm": 0.0010135088814422488, + "learning_rate": 4.113437099851515e-05, + "loss": 0.014, + "num_input_tokens_seen": 76284080, + "step": 62680 + }, + { + "epoch": 6.981289675910458, + "grad_norm": 0.15915049612522125, + "learning_rate": 4.1132514928147944e-05, + "loss": 0.0148, + "num_input_tokens_seen": 76290320, + "step": 62685 + }, + { + "epoch": 6.981846530794075, + "grad_norm": 0.009222843684256077, + "learning_rate": 4.113065870539622e-05, + "loss": 0.0218, + "num_input_tokens_seen": 76296176, + "step": 62690 + }, + { + "epoch": 6.982403385677692, + "grad_norm": 0.10911890119314194, + "learning_rate": 4.1128802330277496e-05, + "loss": 0.0496, + "num_input_tokens_seen": 76302224, + "step": 62695 + }, + { + "epoch": 6.98296024056131, + "grad_norm": 0.032338742166757584, + "learning_rate": 4.11269458028093e-05, + "loss": 0.1571, + "num_input_tokens_seen": 76308240, + "step": 62700 + }, + { + "epoch": 6.9835170954449275, + "grad_norm": 0.0005814462783746421, + "learning_rate": 4.1125089123009194e-05, + "loss": 0.089, + "num_input_tokens_seen": 76314512, + "step": 62705 + }, + { + "epoch": 6.984073950328544, + "grad_norm": 0.626725971698761, + "learning_rate": 4.1123232290894696e-05, + "loss": 0.0304, + "num_input_tokens_seen": 76320624, + "step": 62710 + }, + { + "epoch": 6.984630805212162, + "grad_norm": 0.0019444411154836416, + "learning_rate": 4.1121375306483355e-05, + "loss": 0.0042, + "num_input_tokens_seen": 76326768, + "step": 62715 + }, + { + "epoch": 6.985187660095779, + "grad_norm": 1.7739239931106567, + "learning_rate": 4.11195181697927e-05, + "loss": 0.0624, + "num_input_tokens_seen": 76332816, + "step": 62720 + }, + { + "epoch": 6.985744514979396, + "grad_norm": 2.1574385166168213, + "learning_rate": 4.1117660880840294e-05, + "loss": 0.2072, + "num_input_tokens_seen": 76338672, + "step": 62725 + }, + { + "epoch": 6.986301369863014, + "grad_norm": 0.06168461963534355, + "learning_rate": 4.111580343964366e-05, + "loss": 0.0146, + "num_input_tokens_seen": 76344720, + "step": 62730 + }, + { + "epoch": 6.986858224746631, + "grad_norm": 0.17493115365505219, + "learning_rate": 4.1113945846220354e-05, + "loss": 0.0086, + "num_input_tokens_seen": 76350992, + "step": 62735 + }, + { + "epoch": 6.9874150796302485, + "grad_norm": 0.4547734558582306, + "learning_rate": 4.111208810058792e-05, + "loss": 0.0088, + "num_input_tokens_seen": 76357328, + "step": 62740 + }, + { + "epoch": 6.987971934513865, + "grad_norm": 0.10058877617120743, + "learning_rate": 4.11102302027639e-05, + "loss": 0.0315, + "num_input_tokens_seen": 76363664, + "step": 62745 + }, + { + "epoch": 6.988528789397483, + "grad_norm": 0.35154014825820923, + "learning_rate": 4.110837215276585e-05, + "loss": 0.0213, + "num_input_tokens_seen": 76369904, + "step": 62750 + }, + { + "epoch": 6.989085644281101, + "grad_norm": 0.3216873109340668, + "learning_rate": 4.110651395061132e-05, + "loss": 0.0802, + "num_input_tokens_seen": 76376304, + "step": 62755 + }, + { + "epoch": 6.9896424991647175, + "grad_norm": 0.3060445189476013, + "learning_rate": 4.1104655596317866e-05, + "loss": 0.0683, + "num_input_tokens_seen": 76382320, + "step": 62760 + }, + { + "epoch": 6.990199354048335, + "grad_norm": 0.36621034145355225, + "learning_rate": 4.110279708990303e-05, + "loss": 0.0291, + "num_input_tokens_seen": 76388880, + "step": 62765 + }, + { + "epoch": 6.990756208931952, + "grad_norm": 0.6815187335014343, + "learning_rate": 4.1100938431384375e-05, + "loss": 0.0348, + "num_input_tokens_seen": 76394672, + "step": 62770 + }, + { + "epoch": 6.99131306381557, + "grad_norm": 1.2403432130813599, + "learning_rate": 4.109907962077946e-05, + "loss": 0.0415, + "num_input_tokens_seen": 76400880, + "step": 62775 + }, + { + "epoch": 6.991869918699187, + "grad_norm": 1.1143333911895752, + "learning_rate": 4.109722065810583e-05, + "loss": 0.0885, + "num_input_tokens_seen": 76406864, + "step": 62780 + }, + { + "epoch": 6.992426773582804, + "grad_norm": 0.002250172197818756, + "learning_rate": 4.109536154338107e-05, + "loss": 0.0553, + "num_input_tokens_seen": 76412752, + "step": 62785 + }, + { + "epoch": 6.992983628466422, + "grad_norm": 0.9638857841491699, + "learning_rate": 4.109350227662271e-05, + "loss": 0.1311, + "num_input_tokens_seen": 76418224, + "step": 62790 + }, + { + "epoch": 6.9935404833500385, + "grad_norm": 0.05748610571026802, + "learning_rate": 4.109164285784834e-05, + "loss": 0.0613, + "num_input_tokens_seen": 76424272, + "step": 62795 + }, + { + "epoch": 6.994097338233656, + "grad_norm": 0.013897374272346497, + "learning_rate": 4.10897832870755e-05, + "loss": 0.0782, + "num_input_tokens_seen": 76430224, + "step": 62800 + }, + { + "epoch": 6.994654193117274, + "grad_norm": 2.4221315383911133, + "learning_rate": 4.1087923564321776e-05, + "loss": 0.1248, + "num_input_tokens_seen": 76436336, + "step": 62805 + }, + { + "epoch": 6.995211048000891, + "grad_norm": 0.06807535141706467, + "learning_rate": 4.108606368960472e-05, + "loss": 0.0454, + "num_input_tokens_seen": 76442512, + "step": 62810 + }, + { + "epoch": 6.995767902884508, + "grad_norm": 0.0013765072217211127, + "learning_rate": 4.10842036629419e-05, + "loss": 0.0489, + "num_input_tokens_seen": 76448784, + "step": 62815 + }, + { + "epoch": 6.996324757768126, + "grad_norm": 0.004029181320220232, + "learning_rate": 4.108234348435089e-05, + "loss": 0.0399, + "num_input_tokens_seen": 76454960, + "step": 62820 + }, + { + "epoch": 6.996881612651743, + "grad_norm": 0.265259712934494, + "learning_rate": 4.108048315384927e-05, + "loss": 0.0047, + "num_input_tokens_seen": 76461200, + "step": 62825 + }, + { + "epoch": 6.9974384675353605, + "grad_norm": 1.4446169137954712, + "learning_rate": 4.1078622671454595e-05, + "loss": 0.2378, + "num_input_tokens_seen": 76467344, + "step": 62830 + }, + { + "epoch": 6.997995322418977, + "grad_norm": 0.30098435282707214, + "learning_rate": 4.107676203718445e-05, + "loss": 0.2155, + "num_input_tokens_seen": 76473168, + "step": 62835 + }, + { + "epoch": 6.998552177302595, + "grad_norm": 0.006226923316717148, + "learning_rate": 4.107490125105641e-05, + "loss": 0.0138, + "num_input_tokens_seen": 76479504, + "step": 62840 + }, + { + "epoch": 6.999109032186213, + "grad_norm": 0.006171246990561485, + "learning_rate": 4.1073040313088044e-05, + "loss": 0.0063, + "num_input_tokens_seen": 76485936, + "step": 62845 + }, + { + "epoch": 6.999665887069829, + "grad_norm": 0.17489415407180786, + "learning_rate": 4.1071179223296936e-05, + "loss": 0.0659, + "num_input_tokens_seen": 76492208, + "step": 62850 + }, + { + "epoch": 7.0, + "eval_loss": 0.07926620543003082, + "eval_runtime": 112.4562, + "eval_samples_per_second": 35.489, + "eval_steps_per_second": 8.875, + "num_input_tokens_seen": 76495264, + "step": 62853 + }, + { + "epoch": 7.000222741953447, + "grad_norm": 0.0018079470610246062, + "learning_rate": 4.106931798170066e-05, + "loss": 0.0837, + "num_input_tokens_seen": 76497760, + "step": 62855 + }, + { + "epoch": 7.000779596837064, + "grad_norm": 0.41267129778862, + "learning_rate": 4.106745658831681e-05, + "loss": 0.0446, + "num_input_tokens_seen": 76503840, + "step": 62860 + }, + { + "epoch": 7.0013364517206815, + "grad_norm": 0.00020201329607516527, + "learning_rate": 4.106559504316295e-05, + "loss": 0.0449, + "num_input_tokens_seen": 76509856, + "step": 62865 + }, + { + "epoch": 7.001893306604299, + "grad_norm": 0.4317178726196289, + "learning_rate": 4.106373334625668e-05, + "loss": 0.0505, + "num_input_tokens_seen": 76515968, + "step": 62870 + }, + { + "epoch": 7.002450161487916, + "grad_norm": 0.022931523621082306, + "learning_rate": 4.106187149761558e-05, + "loss": 0.1021, + "num_input_tokens_seen": 76522176, + "step": 62875 + }, + { + "epoch": 7.003007016371534, + "grad_norm": 1.1445002555847168, + "learning_rate": 4.106000949725723e-05, + "loss": 0.071, + "num_input_tokens_seen": 76528256, + "step": 62880 + }, + { + "epoch": 7.003563871255151, + "grad_norm": 0.36872443556785583, + "learning_rate": 4.1058147345199226e-05, + "loss": 0.021, + "num_input_tokens_seen": 76534496, + "step": 62885 + }, + { + "epoch": 7.004120726138768, + "grad_norm": 0.03159579634666443, + "learning_rate": 4.105628504145915e-05, + "loss": 0.0264, + "num_input_tokens_seen": 76540512, + "step": 62890 + }, + { + "epoch": 7.004677581022386, + "grad_norm": 0.3025215268135071, + "learning_rate": 4.10544225860546e-05, + "loss": 0.1302, + "num_input_tokens_seen": 76546304, + "step": 62895 + }, + { + "epoch": 7.005234435906003, + "grad_norm": 0.09474978595972061, + "learning_rate": 4.105255997900317e-05, + "loss": 0.0869, + "num_input_tokens_seen": 76552448, + "step": 62900 + }, + { + "epoch": 7.00579129078962, + "grad_norm": 0.04511166736483574, + "learning_rate": 4.1050697220322446e-05, + "loss": 0.0604, + "num_input_tokens_seen": 76558528, + "step": 62905 + }, + { + "epoch": 7.006348145673238, + "grad_norm": 1.5498127937316895, + "learning_rate": 4.104883431003003e-05, + "loss": 0.1503, + "num_input_tokens_seen": 76564576, + "step": 62910 + }, + { + "epoch": 7.006905000556855, + "grad_norm": 0.03443170338869095, + "learning_rate": 4.1046971248143515e-05, + "loss": 0.1306, + "num_input_tokens_seen": 76570912, + "step": 62915 + }, + { + "epoch": 7.007461855440472, + "grad_norm": 0.42471423745155334, + "learning_rate": 4.10451080346805e-05, + "loss": 0.0178, + "num_input_tokens_seen": 76576864, + "step": 62920 + }, + { + "epoch": 7.008018710324089, + "grad_norm": 1.2302048206329346, + "learning_rate": 4.1043244669658584e-05, + "loss": 0.0981, + "num_input_tokens_seen": 76582592, + "step": 62925 + }, + { + "epoch": 7.008575565207707, + "grad_norm": 0.08501971513032913, + "learning_rate": 4.104138115309537e-05, + "loss": 0.0263, + "num_input_tokens_seen": 76588960, + "step": 62930 + }, + { + "epoch": 7.0091324200913245, + "grad_norm": 0.10777370631694794, + "learning_rate": 4.1039517485008456e-05, + "loss": 0.0093, + "num_input_tokens_seen": 76595232, + "step": 62935 + }, + { + "epoch": 7.009689274974941, + "grad_norm": 0.030641408637166023, + "learning_rate": 4.103765366541545e-05, + "loss": 0.071, + "num_input_tokens_seen": 76601472, + "step": 62940 + }, + { + "epoch": 7.010246129858559, + "grad_norm": 0.17439644038677216, + "learning_rate": 4.103578969433395e-05, + "loss": 0.0289, + "num_input_tokens_seen": 76607328, + "step": 62945 + }, + { + "epoch": 7.010802984742176, + "grad_norm": 0.48240578174591064, + "learning_rate": 4.103392557178157e-05, + "loss": 0.0633, + "num_input_tokens_seen": 76613344, + "step": 62950 + }, + { + "epoch": 7.0113598396257935, + "grad_norm": 0.5374029278755188, + "learning_rate": 4.1032061297775926e-05, + "loss": 0.0749, + "num_input_tokens_seen": 76619360, + "step": 62955 + }, + { + "epoch": 7.011916694509411, + "grad_norm": 0.6865137219429016, + "learning_rate": 4.1030196872334616e-05, + "loss": 0.0703, + "num_input_tokens_seen": 76625312, + "step": 62960 + }, + { + "epoch": 7.012473549393028, + "grad_norm": 0.5843644738197327, + "learning_rate": 4.1028332295475256e-05, + "loss": 0.0082, + "num_input_tokens_seen": 76631296, + "step": 62965 + }, + { + "epoch": 7.013030404276646, + "grad_norm": 0.36209434270858765, + "learning_rate": 4.1026467567215444e-05, + "loss": 0.0069, + "num_input_tokens_seen": 76637696, + "step": 62970 + }, + { + "epoch": 7.013587259160263, + "grad_norm": 0.05074668675661087, + "learning_rate": 4.1024602687572814e-05, + "loss": 0.0298, + "num_input_tokens_seen": 76644000, + "step": 62975 + }, + { + "epoch": 7.01414411404388, + "grad_norm": 0.019124802201986313, + "learning_rate": 4.102273765656497e-05, + "loss": 0.0679, + "num_input_tokens_seen": 76650272, + "step": 62980 + }, + { + "epoch": 7.014700968927498, + "grad_norm": 0.41651448607444763, + "learning_rate": 4.1020872474209534e-05, + "loss": 0.0371, + "num_input_tokens_seen": 76656320, + "step": 62985 + }, + { + "epoch": 7.0152578238111145, + "grad_norm": 0.01486294437199831, + "learning_rate": 4.101900714052412e-05, + "loss": 0.0653, + "num_input_tokens_seen": 76662240, + "step": 62990 + }, + { + "epoch": 7.015814678694732, + "grad_norm": 1.3141202926635742, + "learning_rate": 4.101714165552635e-05, + "loss": 0.1717, + "num_input_tokens_seen": 76668032, + "step": 62995 + }, + { + "epoch": 7.01637153357835, + "grad_norm": 1.7775399684906006, + "learning_rate": 4.101527601923384e-05, + "loss": 0.0345, + "num_input_tokens_seen": 76674144, + "step": 63000 + }, + { + "epoch": 7.016928388461967, + "grad_norm": 0.000624899985268712, + "learning_rate": 4.1013410231664226e-05, + "loss": 0.0124, + "num_input_tokens_seen": 76680320, + "step": 63005 + }, + { + "epoch": 7.017485243345584, + "grad_norm": 0.006391190458089113, + "learning_rate": 4.101154429283511e-05, + "loss": 0.018, + "num_input_tokens_seen": 76686624, + "step": 63010 + }, + { + "epoch": 7.018042098229201, + "grad_norm": 0.007275381125509739, + "learning_rate": 4.1009678202764144e-05, + "loss": 0.0447, + "num_input_tokens_seen": 76692384, + "step": 63015 + }, + { + "epoch": 7.018598953112819, + "grad_norm": 0.3844352066516876, + "learning_rate": 4.1007811961468936e-05, + "loss": 0.0187, + "num_input_tokens_seen": 76698464, + "step": 63020 + }, + { + "epoch": 7.0191558079964365, + "grad_norm": 0.4830757677555084, + "learning_rate": 4.100594556896712e-05, + "loss": 0.1206, + "num_input_tokens_seen": 76704384, + "step": 63025 + }, + { + "epoch": 7.019712662880053, + "grad_norm": 0.0006452447851188481, + "learning_rate": 4.100407902527632e-05, + "loss": 0.128, + "num_input_tokens_seen": 76710720, + "step": 63030 + }, + { + "epoch": 7.020269517763671, + "grad_norm": 1.533933162689209, + "learning_rate": 4.100221233041417e-05, + "loss": 0.0672, + "num_input_tokens_seen": 76716672, + "step": 63035 + }, + { + "epoch": 7.020826372647288, + "grad_norm": 0.004281565546989441, + "learning_rate": 4.1000345484398306e-05, + "loss": 0.0076, + "num_input_tokens_seen": 76722816, + "step": 63040 + }, + { + "epoch": 7.021383227530905, + "grad_norm": 0.2555682063102722, + "learning_rate": 4.099847848724636e-05, + "loss": 0.0125, + "num_input_tokens_seen": 76728768, + "step": 63045 + }, + { + "epoch": 7.021940082414523, + "grad_norm": 0.22606956958770752, + "learning_rate": 4.099661133897597e-05, + "loss": 0.0206, + "num_input_tokens_seen": 76734624, + "step": 63050 + }, + { + "epoch": 7.02249693729814, + "grad_norm": 0.7577147483825684, + "learning_rate": 4.099474403960476e-05, + "loss": 0.0512, + "num_input_tokens_seen": 76740736, + "step": 63055 + }, + { + "epoch": 7.0230537921817575, + "grad_norm": 0.07301285117864609, + "learning_rate": 4.099287658915039e-05, + "loss": 0.0831, + "num_input_tokens_seen": 76746848, + "step": 63060 + }, + { + "epoch": 7.023610647065375, + "grad_norm": 0.07384803146123886, + "learning_rate": 4.0991008987630485e-05, + "loss": 0.0093, + "num_input_tokens_seen": 76753216, + "step": 63065 + }, + { + "epoch": 7.024167501948992, + "grad_norm": 0.001228274661116302, + "learning_rate": 4.0989141235062684e-05, + "loss": 0.0138, + "num_input_tokens_seen": 76759072, + "step": 63070 + }, + { + "epoch": 7.02472435683261, + "grad_norm": 1.4152251482009888, + "learning_rate": 4.098727333146463e-05, + "loss": 0.0508, + "num_input_tokens_seen": 76764576, + "step": 63075 + }, + { + "epoch": 7.025281211716226, + "grad_norm": 0.13067583739757538, + "learning_rate": 4.0985405276853975e-05, + "loss": 0.0026, + "num_input_tokens_seen": 76770752, + "step": 63080 + }, + { + "epoch": 7.025838066599844, + "grad_norm": 0.0030171889811754227, + "learning_rate": 4.0983537071248366e-05, + "loss": 0.024, + "num_input_tokens_seen": 76777088, + "step": 63085 + }, + { + "epoch": 7.026394921483462, + "grad_norm": 1.0505985021591187, + "learning_rate": 4.0981668714665435e-05, + "loss": 0.0891, + "num_input_tokens_seen": 76783072, + "step": 63090 + }, + { + "epoch": 7.026951776367079, + "grad_norm": 0.00210024556145072, + "learning_rate": 4.097980020712284e-05, + "loss": 0.0179, + "num_input_tokens_seen": 76789184, + "step": 63095 + }, + { + "epoch": 7.027508631250696, + "grad_norm": 0.0003019881551153958, + "learning_rate": 4.097793154863824e-05, + "loss": 0.0247, + "num_input_tokens_seen": 76795360, + "step": 63100 + }, + { + "epoch": 7.028065486134313, + "grad_norm": 0.012358802370727062, + "learning_rate": 4.097606273922926e-05, + "loss": 0.0459, + "num_input_tokens_seen": 76801440, + "step": 63105 + }, + { + "epoch": 7.028622341017931, + "grad_norm": 0.679440975189209, + "learning_rate": 4.0974193778913574e-05, + "loss": 0.1496, + "num_input_tokens_seen": 76807040, + "step": 63110 + }, + { + "epoch": 7.029179195901548, + "grad_norm": 0.21467356383800507, + "learning_rate": 4.097232466770883e-05, + "loss": 0.0824, + "num_input_tokens_seen": 76813056, + "step": 63115 + }, + { + "epoch": 7.029736050785165, + "grad_norm": 0.03879861533641815, + "learning_rate": 4.097045540563268e-05, + "loss": 0.0707, + "num_input_tokens_seen": 76819328, + "step": 63120 + }, + { + "epoch": 7.030292905668783, + "grad_norm": 0.5787835717201233, + "learning_rate": 4.096858599270279e-05, + "loss": 0.0737, + "num_input_tokens_seen": 76824864, + "step": 63125 + }, + { + "epoch": 7.0308497605524, + "grad_norm": 0.24227991700172424, + "learning_rate": 4.09667164289368e-05, + "loss": 0.055, + "num_input_tokens_seen": 76831168, + "step": 63130 + }, + { + "epoch": 7.031406615436017, + "grad_norm": 0.18644918501377106, + "learning_rate": 4.096484671435239e-05, + "loss": 0.1261, + "num_input_tokens_seen": 76837312, + "step": 63135 + }, + { + "epoch": 7.031963470319635, + "grad_norm": 0.006353271659463644, + "learning_rate": 4.096297684896721e-05, + "loss": 0.0042, + "num_input_tokens_seen": 76843424, + "step": 63140 + }, + { + "epoch": 7.032520325203252, + "grad_norm": 0.07255522161722183, + "learning_rate": 4.0961106832798924e-05, + "loss": 0.1195, + "num_input_tokens_seen": 76849504, + "step": 63145 + }, + { + "epoch": 7.033077180086869, + "grad_norm": 0.2286875993013382, + "learning_rate": 4.0959236665865194e-05, + "loss": 0.0787, + "num_input_tokens_seen": 76855712, + "step": 63150 + }, + { + "epoch": 7.033634034970487, + "grad_norm": 0.0221170075237751, + "learning_rate": 4.095736634818369e-05, + "loss": 0.0027, + "num_input_tokens_seen": 76862112, + "step": 63155 + }, + { + "epoch": 7.034190889854104, + "grad_norm": 1.309170126914978, + "learning_rate": 4.0955495879772076e-05, + "loss": 0.1489, + "num_input_tokens_seen": 76868032, + "step": 63160 + }, + { + "epoch": 7.034747744737722, + "grad_norm": 1.137951374053955, + "learning_rate": 4.095362526064802e-05, + "loss": 0.0359, + "num_input_tokens_seen": 76874368, + "step": 63165 + }, + { + "epoch": 7.035304599621338, + "grad_norm": 0.2396034300327301, + "learning_rate": 4.095175449082919e-05, + "loss": 0.0415, + "num_input_tokens_seen": 76880544, + "step": 63170 + }, + { + "epoch": 7.035861454504956, + "grad_norm": 0.4657993018627167, + "learning_rate": 4.0949883570333256e-05, + "loss": 0.0676, + "num_input_tokens_seen": 76886464, + "step": 63175 + }, + { + "epoch": 7.036418309388574, + "grad_norm": 0.0004178599629085511, + "learning_rate": 4.09480124991779e-05, + "loss": 0.0106, + "num_input_tokens_seen": 76892736, + "step": 63180 + }, + { + "epoch": 7.0369751642721905, + "grad_norm": 0.016604742035269737, + "learning_rate": 4.094614127738079e-05, + "loss": 0.0259, + "num_input_tokens_seen": 76898880, + "step": 63185 + }, + { + "epoch": 7.037532019155808, + "grad_norm": 0.2194087952375412, + "learning_rate": 4.0944269904959595e-05, + "loss": 0.0403, + "num_input_tokens_seen": 76905184, + "step": 63190 + }, + { + "epoch": 7.038088874039425, + "grad_norm": 0.1999421864748001, + "learning_rate": 4.0942398381932e-05, + "loss": 0.0327, + "num_input_tokens_seen": 76911264, + "step": 63195 + }, + { + "epoch": 7.038645728923043, + "grad_norm": 1.0168136358261108, + "learning_rate": 4.094052670831567e-05, + "loss": 0.0636, + "num_input_tokens_seen": 76916960, + "step": 63200 + }, + { + "epoch": 7.03920258380666, + "grad_norm": 0.02101844549179077, + "learning_rate": 4.0938654884128304e-05, + "loss": 0.0368, + "num_input_tokens_seen": 76923200, + "step": 63205 + }, + { + "epoch": 7.039759438690277, + "grad_norm": 0.06776142865419388, + "learning_rate": 4.0936782909387564e-05, + "loss": 0.0761, + "num_input_tokens_seen": 76929536, + "step": 63210 + }, + { + "epoch": 7.040316293573895, + "grad_norm": 0.09124737977981567, + "learning_rate": 4.093491078411115e-05, + "loss": 0.0414, + "num_input_tokens_seen": 76935520, + "step": 63215 + }, + { + "epoch": 7.0408731484575116, + "grad_norm": 0.46241164207458496, + "learning_rate": 4.0933038508316737e-05, + "loss": 0.0256, + "num_input_tokens_seen": 76941664, + "step": 63220 + }, + { + "epoch": 7.041430003341129, + "grad_norm": 0.5204580426216125, + "learning_rate": 4.0931166082022e-05, + "loss": 0.046, + "num_input_tokens_seen": 76947904, + "step": 63225 + }, + { + "epoch": 7.041986858224747, + "grad_norm": 0.14934059977531433, + "learning_rate": 4.0929293505244645e-05, + "loss": 0.0249, + "num_input_tokens_seen": 76954336, + "step": 63230 + }, + { + "epoch": 7.042543713108364, + "grad_norm": 2.0241615772247314, + "learning_rate": 4.092742077800234e-05, + "loss": 0.1128, + "num_input_tokens_seen": 76960640, + "step": 63235 + }, + { + "epoch": 7.043100567991981, + "grad_norm": 0.6928637027740479, + "learning_rate": 4.092554790031279e-05, + "loss": 0.0701, + "num_input_tokens_seen": 76966656, + "step": 63240 + }, + { + "epoch": 7.043657422875599, + "grad_norm": 0.6213937401771545, + "learning_rate": 4.0923674872193686e-05, + "loss": 0.0641, + "num_input_tokens_seen": 76972640, + "step": 63245 + }, + { + "epoch": 7.044214277759216, + "grad_norm": 2.4445931911468506, + "learning_rate": 4.092180169366271e-05, + "loss": 0.0588, + "num_input_tokens_seen": 76978688, + "step": 63250 + }, + { + "epoch": 7.0447711326428335, + "grad_norm": 0.07265348732471466, + "learning_rate": 4.091992836473756e-05, + "loss": 0.0444, + "num_input_tokens_seen": 76984544, + "step": 63255 + }, + { + "epoch": 7.04532798752645, + "grad_norm": 0.8640892505645752, + "learning_rate": 4.0918054885435935e-05, + "loss": 0.038, + "num_input_tokens_seen": 76990496, + "step": 63260 + }, + { + "epoch": 7.045884842410068, + "grad_norm": 0.26540058851242065, + "learning_rate": 4.091618125577553e-05, + "loss": 0.0609, + "num_input_tokens_seen": 76996640, + "step": 63265 + }, + { + "epoch": 7.046441697293686, + "grad_norm": 0.31895697116851807, + "learning_rate": 4.091430747577404e-05, + "loss": 0.0081, + "num_input_tokens_seen": 77002464, + "step": 63270 + }, + { + "epoch": 7.046998552177302, + "grad_norm": 0.08325324207544327, + "learning_rate": 4.091243354544916e-05, + "loss": 0.1033, + "num_input_tokens_seen": 77008864, + "step": 63275 + }, + { + "epoch": 7.04755540706092, + "grad_norm": 0.2299874871969223, + "learning_rate": 4.09105594648186e-05, + "loss": 0.1047, + "num_input_tokens_seen": 77014848, + "step": 63280 + }, + { + "epoch": 7.048112261944537, + "grad_norm": 0.0004215518420096487, + "learning_rate": 4.090868523390006e-05, + "loss": 0.0327, + "num_input_tokens_seen": 77020832, + "step": 63285 + }, + { + "epoch": 7.048669116828155, + "grad_norm": 0.05545743182301521, + "learning_rate": 4.090681085271124e-05, + "loss": 0.1041, + "num_input_tokens_seen": 77026848, + "step": 63290 + }, + { + "epoch": 7.049225971711772, + "grad_norm": 1.9947363138198853, + "learning_rate": 4.0904936321269846e-05, + "loss": 0.0991, + "num_input_tokens_seen": 77033088, + "step": 63295 + }, + { + "epoch": 7.049782826595389, + "grad_norm": 1.0610307455062866, + "learning_rate": 4.090306163959359e-05, + "loss": 0.0366, + "num_input_tokens_seen": 77038720, + "step": 63300 + }, + { + "epoch": 7.050339681479007, + "grad_norm": 0.36733391880989075, + "learning_rate": 4.090118680770017e-05, + "loss": 0.0178, + "num_input_tokens_seen": 77044928, + "step": 63305 + }, + { + "epoch": 7.0508965363626235, + "grad_norm": 0.5846793055534363, + "learning_rate": 4.08993118256073e-05, + "loss": 0.0229, + "num_input_tokens_seen": 77051008, + "step": 63310 + }, + { + "epoch": 7.051453391246241, + "grad_norm": 0.8085569143295288, + "learning_rate": 4.0897436693332704e-05, + "loss": 0.0577, + "num_input_tokens_seen": 77057120, + "step": 63315 + }, + { + "epoch": 7.052010246129859, + "grad_norm": 0.3780345916748047, + "learning_rate": 4.0895561410894065e-05, + "loss": 0.0434, + "num_input_tokens_seen": 77063040, + "step": 63320 + }, + { + "epoch": 7.052567101013476, + "grad_norm": 0.016463181003928185, + "learning_rate": 4.0893685978309126e-05, + "loss": 0.0163, + "num_input_tokens_seen": 77069120, + "step": 63325 + }, + { + "epoch": 7.053123955897093, + "grad_norm": 0.5363548994064331, + "learning_rate": 4.089181039559558e-05, + "loss": 0.0738, + "num_input_tokens_seen": 77075168, + "step": 63330 + }, + { + "epoch": 7.053680810780711, + "grad_norm": 0.11618897318840027, + "learning_rate": 4.088993466277116e-05, + "loss": 0.0733, + "num_input_tokens_seen": 77081088, + "step": 63335 + }, + { + "epoch": 7.054237665664328, + "grad_norm": 0.32762446999549866, + "learning_rate": 4.088805877985357e-05, + "loss": 0.0092, + "num_input_tokens_seen": 77087104, + "step": 63340 + }, + { + "epoch": 7.054794520547945, + "grad_norm": 0.06654366850852966, + "learning_rate": 4.088618274686054e-05, + "loss": 0.0962, + "num_input_tokens_seen": 77093248, + "step": 63345 + }, + { + "epoch": 7.055351375431562, + "grad_norm": 1.0223180055618286, + "learning_rate": 4.088430656380978e-05, + "loss": 0.0944, + "num_input_tokens_seen": 77099424, + "step": 63350 + }, + { + "epoch": 7.05590823031518, + "grad_norm": 0.2459491789340973, + "learning_rate": 4.0882430230719024e-05, + "loss": 0.0935, + "num_input_tokens_seen": 77105760, + "step": 63355 + }, + { + "epoch": 7.056465085198798, + "grad_norm": 0.05529722943902016, + "learning_rate": 4.0880553747605985e-05, + "loss": 0.0219, + "num_input_tokens_seen": 77111424, + "step": 63360 + }, + { + "epoch": 7.057021940082414, + "grad_norm": 0.8652255535125732, + "learning_rate": 4.0878677114488405e-05, + "loss": 0.1153, + "num_input_tokens_seen": 77117152, + "step": 63365 + }, + { + "epoch": 7.057578794966032, + "grad_norm": 0.9320201873779297, + "learning_rate": 4.087680033138399e-05, + "loss": 0.0538, + "num_input_tokens_seen": 77122976, + "step": 63370 + }, + { + "epoch": 7.058135649849649, + "grad_norm": 0.1227487251162529, + "learning_rate": 4.0874923398310474e-05, + "loss": 0.006, + "num_input_tokens_seen": 77128448, + "step": 63375 + }, + { + "epoch": 7.0586925047332665, + "grad_norm": 0.15036049485206604, + "learning_rate": 4.087304631528559e-05, + "loss": 0.1048, + "num_input_tokens_seen": 77134144, + "step": 63380 + }, + { + "epoch": 7.059249359616884, + "grad_norm": 0.14176951348781586, + "learning_rate": 4.087116908232706e-05, + "loss": 0.0756, + "num_input_tokens_seen": 77140160, + "step": 63385 + }, + { + "epoch": 7.059806214500501, + "grad_norm": 0.1174919530749321, + "learning_rate": 4.086929169945263e-05, + "loss": 0.0166, + "num_input_tokens_seen": 77146336, + "step": 63390 + }, + { + "epoch": 7.060363069384119, + "grad_norm": 1.555884838104248, + "learning_rate": 4.086741416668002e-05, + "loss": 0.1035, + "num_input_tokens_seen": 77152384, + "step": 63395 + }, + { + "epoch": 7.060919924267735, + "grad_norm": 0.6957050561904907, + "learning_rate": 4.086553648402697e-05, + "loss": 0.0316, + "num_input_tokens_seen": 77158752, + "step": 63400 + }, + { + "epoch": 7.061476779151353, + "grad_norm": 0.2535388767719269, + "learning_rate": 4.086365865151122e-05, + "loss": 0.0422, + "num_input_tokens_seen": 77164992, + "step": 63405 + }, + { + "epoch": 7.062033634034971, + "grad_norm": 0.03261968493461609, + "learning_rate": 4.086178066915051e-05, + "loss": 0.0566, + "num_input_tokens_seen": 77171136, + "step": 63410 + }, + { + "epoch": 7.0625904889185875, + "grad_norm": 1.2085868120193481, + "learning_rate": 4.0859902536962554e-05, + "loss": 0.1651, + "num_input_tokens_seen": 77177024, + "step": 63415 + }, + { + "epoch": 7.063147343802205, + "grad_norm": 0.2840672433376312, + "learning_rate": 4.085802425496513e-05, + "loss": 0.0593, + "num_input_tokens_seen": 77183072, + "step": 63420 + }, + { + "epoch": 7.063704198685823, + "grad_norm": 0.2220555990934372, + "learning_rate": 4.085614582317596e-05, + "loss": 0.0461, + "num_input_tokens_seen": 77189056, + "step": 63425 + }, + { + "epoch": 7.06426105356944, + "grad_norm": 0.4775560200214386, + "learning_rate": 4.085426724161279e-05, + "loss": 0.1124, + "num_input_tokens_seen": 77195296, + "step": 63430 + }, + { + "epoch": 7.064817908453057, + "grad_norm": 0.001938199158757925, + "learning_rate": 4.0852388510293355e-05, + "loss": 0.0081, + "num_input_tokens_seen": 77201312, + "step": 63435 + }, + { + "epoch": 7.065374763336674, + "grad_norm": 1.789764404296875, + "learning_rate": 4.085050962923541e-05, + "loss": 0.0392, + "num_input_tokens_seen": 77207296, + "step": 63440 + }, + { + "epoch": 7.065931618220292, + "grad_norm": 0.013633858412504196, + "learning_rate": 4.0848630598456705e-05, + "loss": 0.0501, + "num_input_tokens_seen": 77213216, + "step": 63445 + }, + { + "epoch": 7.0664884731039095, + "grad_norm": 0.026568319648504257, + "learning_rate": 4.084675141797499e-05, + "loss": 0.0305, + "num_input_tokens_seen": 77219552, + "step": 63450 + }, + { + "epoch": 7.067045327987526, + "grad_norm": 0.18011297285556793, + "learning_rate": 4.0844872087808005e-05, + "loss": 0.0297, + "num_input_tokens_seen": 77225472, + "step": 63455 + }, + { + "epoch": 7.067602182871144, + "grad_norm": 0.21362538635730743, + "learning_rate": 4.084299260797352e-05, + "loss": 0.027, + "num_input_tokens_seen": 77231584, + "step": 63460 + }, + { + "epoch": 7.068159037754761, + "grad_norm": 0.10296814143657684, + "learning_rate": 4.084111297848927e-05, + "loss": 0.0067, + "num_input_tokens_seen": 77238112, + "step": 63465 + }, + { + "epoch": 7.068715892638378, + "grad_norm": 0.010051177814602852, + "learning_rate": 4.083923319937302e-05, + "loss": 0.0021, + "num_input_tokens_seen": 77244640, + "step": 63470 + }, + { + "epoch": 7.069272747521996, + "grad_norm": 1.881500005722046, + "learning_rate": 4.083735327064251e-05, + "loss": 0.1553, + "num_input_tokens_seen": 77250944, + "step": 63475 + }, + { + "epoch": 7.069829602405613, + "grad_norm": 1.6803797483444214, + "learning_rate": 4.083547319231552e-05, + "loss": 0.1179, + "num_input_tokens_seen": 77256192, + "step": 63480 + }, + { + "epoch": 7.0703864572892305, + "grad_norm": 0.005167149938642979, + "learning_rate": 4.083359296440979e-05, + "loss": 0.0669, + "num_input_tokens_seen": 77262240, + "step": 63485 + }, + { + "epoch": 7.070943312172847, + "grad_norm": 0.08465776592493057, + "learning_rate": 4.08317125869431e-05, + "loss": 0.0154, + "num_input_tokens_seen": 77268672, + "step": 63490 + }, + { + "epoch": 7.071500167056465, + "grad_norm": 0.0020132327917963266, + "learning_rate": 4.082983205993319e-05, + "loss": 0.0202, + "num_input_tokens_seen": 77274816, + "step": 63495 + }, + { + "epoch": 7.072057021940083, + "grad_norm": 2.1000380516052246, + "learning_rate": 4.0827951383397844e-05, + "loss": 0.1015, + "num_input_tokens_seen": 77280768, + "step": 63500 + }, + { + "epoch": 7.0726138768236995, + "grad_norm": 0.7801854014396667, + "learning_rate": 4.082607055735481e-05, + "loss": 0.1501, + "num_input_tokens_seen": 77287008, + "step": 63505 + }, + { + "epoch": 7.073170731707317, + "grad_norm": 0.11105024814605713, + "learning_rate": 4.082418958182186e-05, + "loss": 0.0371, + "num_input_tokens_seen": 77292960, + "step": 63510 + }, + { + "epoch": 7.073727586590935, + "grad_norm": 0.09918539971113205, + "learning_rate": 4.082230845681676e-05, + "loss": 0.0053, + "num_input_tokens_seen": 77299456, + "step": 63515 + }, + { + "epoch": 7.074284441474552, + "grad_norm": 0.003447767812758684, + "learning_rate": 4.082042718235728e-05, + "loss": 0.0315, + "num_input_tokens_seen": 77305504, + "step": 63520 + }, + { + "epoch": 7.074841296358169, + "grad_norm": 0.0006809429614804685, + "learning_rate": 4.08185457584612e-05, + "loss": 0.0342, + "num_input_tokens_seen": 77311776, + "step": 63525 + }, + { + "epoch": 7.075398151241786, + "grad_norm": 0.029432987794280052, + "learning_rate": 4.081666418514627e-05, + "loss": 0.001, + "num_input_tokens_seen": 77318048, + "step": 63530 + }, + { + "epoch": 7.075955006125404, + "grad_norm": 2.6861772537231445, + "learning_rate": 4.081478246243028e-05, + "loss": 0.0687, + "num_input_tokens_seen": 77324064, + "step": 63535 + }, + { + "epoch": 7.076511861009021, + "grad_norm": 0.4275358021259308, + "learning_rate": 4.0812900590331e-05, + "loss": 0.0402, + "num_input_tokens_seen": 77330112, + "step": 63540 + }, + { + "epoch": 7.077068715892638, + "grad_norm": 0.9855111837387085, + "learning_rate": 4.08110185688662e-05, + "loss": 0.0896, + "num_input_tokens_seen": 77336512, + "step": 63545 + }, + { + "epoch": 7.077625570776256, + "grad_norm": 0.023523975163698196, + "learning_rate": 4.080913639805366e-05, + "loss": 0.0338, + "num_input_tokens_seen": 77342336, + "step": 63550 + }, + { + "epoch": 7.078182425659873, + "grad_norm": 0.43195050954818726, + "learning_rate": 4.080725407791117e-05, + "loss": 0.0241, + "num_input_tokens_seen": 77348672, + "step": 63555 + }, + { + "epoch": 7.07873928054349, + "grad_norm": 0.7695842981338501, + "learning_rate": 4.08053716084565e-05, + "loss": 0.0857, + "num_input_tokens_seen": 77354688, + "step": 63560 + }, + { + "epoch": 7.079296135427108, + "grad_norm": 0.1563476026058197, + "learning_rate": 4.0803488989707425e-05, + "loss": 0.014, + "num_input_tokens_seen": 77361152, + "step": 63565 + }, + { + "epoch": 7.079852990310725, + "grad_norm": 1.2732868194580078, + "learning_rate": 4.080160622168173e-05, + "loss": 0.0624, + "num_input_tokens_seen": 77367392, + "step": 63570 + }, + { + "epoch": 7.0804098451943425, + "grad_norm": 0.270244836807251, + "learning_rate": 4.079972330439722e-05, + "loss": 0.0188, + "num_input_tokens_seen": 77373056, + "step": 63575 + }, + { + "epoch": 7.080966700077959, + "grad_norm": 0.8148424029350281, + "learning_rate": 4.079784023787165e-05, + "loss": 0.0507, + "num_input_tokens_seen": 77378816, + "step": 63580 + }, + { + "epoch": 7.081523554961577, + "grad_norm": 0.7151756286621094, + "learning_rate": 4.079595702212283e-05, + "loss": 0.0503, + "num_input_tokens_seen": 77385056, + "step": 63585 + }, + { + "epoch": 7.082080409845195, + "grad_norm": 0.09598427265882492, + "learning_rate": 4.079407365716854e-05, + "loss": 0.0418, + "num_input_tokens_seen": 77390624, + "step": 63590 + }, + { + "epoch": 7.082637264728811, + "grad_norm": 0.04238992556929588, + "learning_rate": 4.079219014302657e-05, + "loss": 0.0153, + "num_input_tokens_seen": 77396672, + "step": 63595 + }, + { + "epoch": 7.083194119612429, + "grad_norm": 0.06057048961520195, + "learning_rate": 4.0790306479714715e-05, + "loss": 0.0056, + "num_input_tokens_seen": 77402496, + "step": 63600 + }, + { + "epoch": 7.083750974496047, + "grad_norm": 0.00043290076428093016, + "learning_rate": 4.078842266725076e-05, + "loss": 0.0084, + "num_input_tokens_seen": 77409184, + "step": 63605 + }, + { + "epoch": 7.0843078293796635, + "grad_norm": 0.1229993999004364, + "learning_rate": 4.07865387056525e-05, + "loss": 0.0068, + "num_input_tokens_seen": 77415392, + "step": 63610 + }, + { + "epoch": 7.084864684263281, + "grad_norm": 2.1185457706451416, + "learning_rate": 4.078465459493774e-05, + "loss": 0.1192, + "num_input_tokens_seen": 77421376, + "step": 63615 + }, + { + "epoch": 7.085421539146898, + "grad_norm": 0.4211815595626831, + "learning_rate": 4.0782770335124266e-05, + "loss": 0.0134, + "num_input_tokens_seen": 77427584, + "step": 63620 + }, + { + "epoch": 7.085978394030516, + "grad_norm": 0.005476720165461302, + "learning_rate": 4.0780885926229884e-05, + "loss": 0.0077, + "num_input_tokens_seen": 77433568, + "step": 63625 + }, + { + "epoch": 7.086535248914133, + "grad_norm": 0.00029314146377146244, + "learning_rate": 4.0779001368272395e-05, + "loss": 0.0065, + "num_input_tokens_seen": 77439584, + "step": 63630 + }, + { + "epoch": 7.08709210379775, + "grad_norm": 0.6713632345199585, + "learning_rate": 4.077711666126959e-05, + "loss": 0.0125, + "num_input_tokens_seen": 77445728, + "step": 63635 + }, + { + "epoch": 7.087648958681368, + "grad_norm": 3.50508189201355, + "learning_rate": 4.0775231805239285e-05, + "loss": 0.1219, + "num_input_tokens_seen": 77451680, + "step": 63640 + }, + { + "epoch": 7.088205813564985, + "grad_norm": 0.06306082755327225, + "learning_rate": 4.077334680019927e-05, + "loss": 0.1277, + "num_input_tokens_seen": 77457632, + "step": 63645 + }, + { + "epoch": 7.088762668448602, + "grad_norm": 0.011690936982631683, + "learning_rate": 4.0771461646167365e-05, + "loss": 0.0331, + "num_input_tokens_seen": 77464160, + "step": 63650 + }, + { + "epoch": 7.08931952333222, + "grad_norm": 0.5238885879516602, + "learning_rate": 4.0769576343161356e-05, + "loss": 0.0185, + "num_input_tokens_seen": 77470240, + "step": 63655 + }, + { + "epoch": 7.089876378215837, + "grad_norm": 0.17533425986766815, + "learning_rate": 4.076769089119907e-05, + "loss": 0.0138, + "num_input_tokens_seen": 77476448, + "step": 63660 + }, + { + "epoch": 7.090433233099454, + "grad_norm": 0.10019928216934204, + "learning_rate": 4.076580529029831e-05, + "loss": 0.0372, + "num_input_tokens_seen": 77482400, + "step": 63665 + }, + { + "epoch": 7.090990087983071, + "grad_norm": 0.04485465958714485, + "learning_rate": 4.0763919540476894e-05, + "loss": 0.1138, + "num_input_tokens_seen": 77488096, + "step": 63670 + }, + { + "epoch": 7.091546942866689, + "grad_norm": 0.006157609634101391, + "learning_rate": 4.076203364175262e-05, + "loss": 0.0859, + "num_input_tokens_seen": 77494496, + "step": 63675 + }, + { + "epoch": 7.0921037977503065, + "grad_norm": 0.2464221715927124, + "learning_rate": 4.076014759414332e-05, + "loss": 0.0144, + "num_input_tokens_seen": 77500320, + "step": 63680 + }, + { + "epoch": 7.092660652633923, + "grad_norm": 0.01027904823422432, + "learning_rate": 4.075826139766679e-05, + "loss": 0.0089, + "num_input_tokens_seen": 77506720, + "step": 63685 + }, + { + "epoch": 7.093217507517541, + "grad_norm": 1.1216466426849365, + "learning_rate": 4.0756375052340856e-05, + "loss": 0.0647, + "num_input_tokens_seen": 77512960, + "step": 63690 + }, + { + "epoch": 7.093774362401159, + "grad_norm": 0.17285440862178802, + "learning_rate": 4.075448855818333e-05, + "loss": 0.0483, + "num_input_tokens_seen": 77518976, + "step": 63695 + }, + { + "epoch": 7.0943312172847754, + "grad_norm": 1.080284595489502, + "learning_rate": 4.0752601915212055e-05, + "loss": 0.0909, + "num_input_tokens_seen": 77525248, + "step": 63700 + }, + { + "epoch": 7.094888072168393, + "grad_norm": 0.07441197335720062, + "learning_rate": 4.075071512344482e-05, + "loss": 0.0757, + "num_input_tokens_seen": 77531008, + "step": 63705 + }, + { + "epoch": 7.09544492705201, + "grad_norm": 0.41806668043136597, + "learning_rate": 4.074882818289947e-05, + "loss": 0.0186, + "num_input_tokens_seen": 77537408, + "step": 63710 + }, + { + "epoch": 7.096001781935628, + "grad_norm": 0.010169116780161858, + "learning_rate": 4.074694109359381e-05, + "loss": 0.0269, + "num_input_tokens_seen": 77542976, + "step": 63715 + }, + { + "epoch": 7.096558636819245, + "grad_norm": 0.7733210325241089, + "learning_rate": 4.074505385554568e-05, + "loss": 0.0486, + "num_input_tokens_seen": 77548960, + "step": 63720 + }, + { + "epoch": 7.097115491702862, + "grad_norm": 0.3425721228122711, + "learning_rate": 4.07431664687729e-05, + "loss": 0.1014, + "num_input_tokens_seen": 77554816, + "step": 63725 + }, + { + "epoch": 7.09767234658648, + "grad_norm": 0.8005589842796326, + "learning_rate": 4.07412789332933e-05, + "loss": 0.0353, + "num_input_tokens_seen": 77561024, + "step": 63730 + }, + { + "epoch": 7.0982292014700965, + "grad_norm": 0.14188136160373688, + "learning_rate": 4.0739391249124716e-05, + "loss": 0.0074, + "num_input_tokens_seen": 77567136, + "step": 63735 + }, + { + "epoch": 7.098786056353714, + "grad_norm": 0.3249123692512512, + "learning_rate": 4.073750341628497e-05, + "loss": 0.0802, + "num_input_tokens_seen": 77572064, + "step": 63740 + }, + { + "epoch": 7.099342911237332, + "grad_norm": 0.09150901436805725, + "learning_rate": 4.073561543479188e-05, + "loss": 0.0323, + "num_input_tokens_seen": 77578272, + "step": 63745 + }, + { + "epoch": 7.099899766120949, + "grad_norm": 0.001340589253231883, + "learning_rate": 4.073372730466332e-05, + "loss": 0.0321, + "num_input_tokens_seen": 77584704, + "step": 63750 + }, + { + "epoch": 7.100456621004566, + "grad_norm": 2.0454061031341553, + "learning_rate": 4.073183902591708e-05, + "loss": 0.0927, + "num_input_tokens_seen": 77590656, + "step": 63755 + }, + { + "epoch": 7.101013475888184, + "grad_norm": 0.9756399393081665, + "learning_rate": 4.072995059857102e-05, + "loss": 0.0314, + "num_input_tokens_seen": 77596576, + "step": 63760 + }, + { + "epoch": 7.101570330771801, + "grad_norm": 0.7177768349647522, + "learning_rate": 4.0728062022642976e-05, + "loss": 0.1348, + "num_input_tokens_seen": 77602336, + "step": 63765 + }, + { + "epoch": 7.1021271856554185, + "grad_norm": 0.035755328834056854, + "learning_rate": 4.0726173298150796e-05, + "loss": 0.0024, + "num_input_tokens_seen": 77608416, + "step": 63770 + }, + { + "epoch": 7.102684040539035, + "grad_norm": 0.006311212200671434, + "learning_rate": 4.072428442511229e-05, + "loss": 0.073, + "num_input_tokens_seen": 77614336, + "step": 63775 + }, + { + "epoch": 7.103240895422653, + "grad_norm": 0.004199115093797445, + "learning_rate": 4.0722395403545335e-05, + "loss": 0.079, + "num_input_tokens_seen": 77620416, + "step": 63780 + }, + { + "epoch": 7.103797750306271, + "grad_norm": 0.0476037971675396, + "learning_rate": 4.0720506233467746e-05, + "loss": 0.0163, + "num_input_tokens_seen": 77626688, + "step": 63785 + }, + { + "epoch": 7.104354605189887, + "grad_norm": 0.1788979023694992, + "learning_rate": 4.071861691489739e-05, + "loss": 0.0617, + "num_input_tokens_seen": 77633024, + "step": 63790 + }, + { + "epoch": 7.104911460073505, + "grad_norm": 0.05286170169711113, + "learning_rate": 4.0716727447852106e-05, + "loss": 0.003, + "num_input_tokens_seen": 77639584, + "step": 63795 + }, + { + "epoch": 7.105468314957122, + "grad_norm": 1.7322945594787598, + "learning_rate": 4.071483783234973e-05, + "loss": 0.1621, + "num_input_tokens_seen": 77645728, + "step": 63800 + }, + { + "epoch": 7.1060251698407395, + "grad_norm": 0.33389943838119507, + "learning_rate": 4.071294806840813e-05, + "loss": 0.0084, + "num_input_tokens_seen": 77652032, + "step": 63805 + }, + { + "epoch": 7.106582024724357, + "grad_norm": 1.6928349733352661, + "learning_rate": 4.071105815604514e-05, + "loss": 0.0336, + "num_input_tokens_seen": 77658144, + "step": 63810 + }, + { + "epoch": 7.107138879607974, + "grad_norm": 0.0008210955420508981, + "learning_rate": 4.0709168095278615e-05, + "loss": 0.0137, + "num_input_tokens_seen": 77664448, + "step": 63815 + }, + { + "epoch": 7.107695734491592, + "grad_norm": 0.08390835672616959, + "learning_rate": 4.070727788612642e-05, + "loss": 0.035, + "num_input_tokens_seen": 77670400, + "step": 63820 + }, + { + "epoch": 7.108252589375208, + "grad_norm": 1.0900790691375732, + "learning_rate": 4.07053875286064e-05, + "loss": 0.0427, + "num_input_tokens_seen": 77676672, + "step": 63825 + }, + { + "epoch": 7.108809444258826, + "grad_norm": 0.0010239811381325126, + "learning_rate": 4.07034970227364e-05, + "loss": 0.0804, + "num_input_tokens_seen": 77682592, + "step": 63830 + }, + { + "epoch": 7.109366299142444, + "grad_norm": 0.004090325441211462, + "learning_rate": 4.07016063685343e-05, + "loss": 0.0342, + "num_input_tokens_seen": 77688736, + "step": 63835 + }, + { + "epoch": 7.109923154026061, + "grad_norm": 0.5948730707168579, + "learning_rate": 4.069971556601795e-05, + "loss": 0.0679, + "num_input_tokens_seen": 77694688, + "step": 63840 + }, + { + "epoch": 7.110480008909678, + "grad_norm": 0.24543644487857819, + "learning_rate": 4.06978246152052e-05, + "loss": 0.0133, + "num_input_tokens_seen": 77700960, + "step": 63845 + }, + { + "epoch": 7.111036863793295, + "grad_norm": 0.27949029207229614, + "learning_rate": 4.069593351611392e-05, + "loss": 0.0357, + "num_input_tokens_seen": 77707008, + "step": 63850 + }, + { + "epoch": 7.111593718676913, + "grad_norm": 0.8173075914382935, + "learning_rate": 4.069404226876198e-05, + "loss": 0.0669, + "num_input_tokens_seen": 77713152, + "step": 63855 + }, + { + "epoch": 7.11215057356053, + "grad_norm": 0.07148265093564987, + "learning_rate": 4.0692150873167234e-05, + "loss": 0.14, + "num_input_tokens_seen": 77719360, + "step": 63860 + }, + { + "epoch": 7.112707428444147, + "grad_norm": 0.03262593224644661, + "learning_rate": 4.0690259329347545e-05, + "loss": 0.235, + "num_input_tokens_seen": 77725536, + "step": 63865 + }, + { + "epoch": 7.113264283327765, + "grad_norm": 0.00027642189525067806, + "learning_rate": 4.068836763732079e-05, + "loss": 0.1997, + "num_input_tokens_seen": 77731520, + "step": 63870 + }, + { + "epoch": 7.1138211382113825, + "grad_norm": 1.236372470855713, + "learning_rate": 4.068647579710484e-05, + "loss": 0.2614, + "num_input_tokens_seen": 77737280, + "step": 63875 + }, + { + "epoch": 7.114377993094999, + "grad_norm": 0.002639428712427616, + "learning_rate": 4.068458380871755e-05, + "loss": 0.1231, + "num_input_tokens_seen": 77742880, + "step": 63880 + }, + { + "epoch": 7.114934847978617, + "grad_norm": 0.6322018504142761, + "learning_rate": 4.0682691672176797e-05, + "loss": 0.0436, + "num_input_tokens_seen": 77749184, + "step": 63885 + }, + { + "epoch": 7.115491702862234, + "grad_norm": 1.717891812324524, + "learning_rate": 4.068079938750046e-05, + "loss": 0.0614, + "num_input_tokens_seen": 77755072, + "step": 63890 + }, + { + "epoch": 7.116048557745851, + "grad_norm": 0.03546096011996269, + "learning_rate": 4.067890695470641e-05, + "loss": 0.0535, + "num_input_tokens_seen": 77761472, + "step": 63895 + }, + { + "epoch": 7.116605412629469, + "grad_norm": 0.49980270862579346, + "learning_rate": 4.067701437381253e-05, + "loss": 0.0295, + "num_input_tokens_seen": 77767680, + "step": 63900 + }, + { + "epoch": 7.117162267513086, + "grad_norm": 0.008584918454289436, + "learning_rate": 4.067512164483668e-05, + "loss": 0.0989, + "num_input_tokens_seen": 77773888, + "step": 63905 + }, + { + "epoch": 7.117719122396704, + "grad_norm": 0.07625017315149307, + "learning_rate": 4.067322876779675e-05, + "loss": 0.0118, + "num_input_tokens_seen": 77780288, + "step": 63910 + }, + { + "epoch": 7.11827597728032, + "grad_norm": 0.4503764808177948, + "learning_rate": 4.0671335742710615e-05, + "loss": 0.0064, + "num_input_tokens_seen": 77786528, + "step": 63915 + }, + { + "epoch": 7.118832832163938, + "grad_norm": 0.04609964042901993, + "learning_rate": 4.066944256959616e-05, + "loss": 0.0616, + "num_input_tokens_seen": 77792256, + "step": 63920 + }, + { + "epoch": 7.119389687047556, + "grad_norm": 0.10694805532693863, + "learning_rate": 4.066754924847126e-05, + "loss": 0.0113, + "num_input_tokens_seen": 77798560, + "step": 63925 + }, + { + "epoch": 7.1199465419311725, + "grad_norm": 0.05513129010796547, + "learning_rate": 4.0665655779353805e-05, + "loss": 0.1113, + "num_input_tokens_seen": 77804160, + "step": 63930 + }, + { + "epoch": 7.12050339681479, + "grad_norm": 0.1837504357099533, + "learning_rate": 4.066376216226169e-05, + "loss": 0.0524, + "num_input_tokens_seen": 77810176, + "step": 63935 + }, + { + "epoch": 7.121060251698408, + "grad_norm": 1.3573036193847656, + "learning_rate": 4.066186839721279e-05, + "loss": 0.0916, + "num_input_tokens_seen": 77816256, + "step": 63940 + }, + { + "epoch": 7.121617106582025, + "grad_norm": 0.0011397874914109707, + "learning_rate": 4.065997448422498e-05, + "loss": 0.0223, + "num_input_tokens_seen": 77822400, + "step": 63945 + }, + { + "epoch": 7.122173961465642, + "grad_norm": 0.6132811307907104, + "learning_rate": 4.065808042331618e-05, + "loss": 0.0636, + "num_input_tokens_seen": 77828576, + "step": 63950 + }, + { + "epoch": 7.122730816349259, + "grad_norm": 0.08392829447984695, + "learning_rate": 4.0656186214504264e-05, + "loss": 0.1649, + "num_input_tokens_seen": 77834752, + "step": 63955 + }, + { + "epoch": 7.123287671232877, + "grad_norm": 1.4501351118087769, + "learning_rate": 4.065429185780713e-05, + "loss": 0.1627, + "num_input_tokens_seen": 77840896, + "step": 63960 + }, + { + "epoch": 7.123844526116494, + "grad_norm": 0.900543212890625, + "learning_rate": 4.065239735324265e-05, + "loss": 0.1018, + "num_input_tokens_seen": 77847168, + "step": 63965 + }, + { + "epoch": 7.124401381000111, + "grad_norm": 0.7808799147605896, + "learning_rate": 4.065050270082875e-05, + "loss": 0.0451, + "num_input_tokens_seen": 77853184, + "step": 63970 + }, + { + "epoch": 7.124958235883729, + "grad_norm": 0.40203964710235596, + "learning_rate": 4.0648607900583314e-05, + "loss": 0.0288, + "num_input_tokens_seen": 77859200, + "step": 63975 + }, + { + "epoch": 7.125515090767346, + "grad_norm": 1.2557021379470825, + "learning_rate": 4.064671295252423e-05, + "loss": 0.1149, + "num_input_tokens_seen": 77865184, + "step": 63980 + }, + { + "epoch": 7.126071945650963, + "grad_norm": 0.47375956177711487, + "learning_rate": 4.064481785666942e-05, + "loss": 0.0606, + "num_input_tokens_seen": 77871584, + "step": 63985 + }, + { + "epoch": 7.126628800534581, + "grad_norm": 0.0011720949551090598, + "learning_rate": 4.064292261303675e-05, + "loss": 0.0692, + "num_input_tokens_seen": 77877632, + "step": 63990 + }, + { + "epoch": 7.127185655418198, + "grad_norm": 1.581764578819275, + "learning_rate": 4.0641027221644155e-05, + "loss": 0.1028, + "num_input_tokens_seen": 77883680, + "step": 63995 + }, + { + "epoch": 7.1277425103018155, + "grad_norm": 1.3696141242980957, + "learning_rate": 4.063913168250954e-05, + "loss": 0.0936, + "num_input_tokens_seen": 77889856, + "step": 64000 + }, + { + "epoch": 7.128299365185432, + "grad_norm": 0.2675885558128357, + "learning_rate": 4.0637235995650773e-05, + "loss": 0.0164, + "num_input_tokens_seen": 77896288, + "step": 64005 + }, + { + "epoch": 7.12885622006905, + "grad_norm": 0.7890623211860657, + "learning_rate": 4.0635340161085795e-05, + "loss": 0.1387, + "num_input_tokens_seen": 77901664, + "step": 64010 + }, + { + "epoch": 7.129413074952668, + "grad_norm": 0.008631500415503979, + "learning_rate": 4.0633444178832504e-05, + "loss": 0.1, + "num_input_tokens_seen": 77907808, + "step": 64015 + }, + { + "epoch": 7.129969929836284, + "grad_norm": 0.07653931528329849, + "learning_rate": 4.06315480489088e-05, + "loss": 0.0088, + "num_input_tokens_seen": 77913952, + "step": 64020 + }, + { + "epoch": 7.130526784719902, + "grad_norm": 0.019294222816824913, + "learning_rate": 4.0629651771332604e-05, + "loss": 0.0097, + "num_input_tokens_seen": 77920288, + "step": 64025 + }, + { + "epoch": 7.131083639603519, + "grad_norm": 0.1530836820602417, + "learning_rate": 4.0627755346121834e-05, + "loss": 0.077, + "num_input_tokens_seen": 77926112, + "step": 64030 + }, + { + "epoch": 7.1316404944871366, + "grad_norm": 0.005885060876607895, + "learning_rate": 4.062585877329438e-05, + "loss": 0.0671, + "num_input_tokens_seen": 77932320, + "step": 64035 + }, + { + "epoch": 7.132197349370754, + "grad_norm": 1.124328851699829, + "learning_rate": 4.0623962052868184e-05, + "loss": 0.0597, + "num_input_tokens_seen": 77938400, + "step": 64040 + }, + { + "epoch": 7.132754204254371, + "grad_norm": 0.24517321586608887, + "learning_rate": 4.0622065184861135e-05, + "loss": 0.0048, + "num_input_tokens_seen": 77944640, + "step": 64045 + }, + { + "epoch": 7.133311059137989, + "grad_norm": 1.903074026107788, + "learning_rate": 4.062016816929117e-05, + "loss": 0.0918, + "num_input_tokens_seen": 77951104, + "step": 64050 + }, + { + "epoch": 7.133867914021606, + "grad_norm": 0.003110273741185665, + "learning_rate": 4.061827100617621e-05, + "loss": 0.0262, + "num_input_tokens_seen": 77957120, + "step": 64055 + }, + { + "epoch": 7.134424768905223, + "grad_norm": 0.6707085371017456, + "learning_rate": 4.061637369553415e-05, + "loss": 0.1046, + "num_input_tokens_seen": 77963136, + "step": 64060 + }, + { + "epoch": 7.134981623788841, + "grad_norm": 0.9726600050926208, + "learning_rate": 4.0614476237382945e-05, + "loss": 0.1183, + "num_input_tokens_seen": 77969184, + "step": 64065 + }, + { + "epoch": 7.135538478672458, + "grad_norm": 0.0010971528245136142, + "learning_rate": 4.0612578631740494e-05, + "loss": 0.1069, + "num_input_tokens_seen": 77975296, + "step": 64070 + }, + { + "epoch": 7.136095333556075, + "grad_norm": 0.013845119625329971, + "learning_rate": 4.061068087862473e-05, + "loss": 0.0368, + "num_input_tokens_seen": 77980768, + "step": 64075 + }, + { + "epoch": 7.136652188439693, + "grad_norm": 0.16964872181415558, + "learning_rate": 4.0608782978053576e-05, + "loss": 0.0433, + "num_input_tokens_seen": 77986464, + "step": 64080 + }, + { + "epoch": 7.13720904332331, + "grad_norm": 3.3574023246765137, + "learning_rate": 4.060688493004496e-05, + "loss": 0.0953, + "num_input_tokens_seen": 77992864, + "step": 64085 + }, + { + "epoch": 7.137765898206927, + "grad_norm": 3.8562939167022705, + "learning_rate": 4.0604986734616825e-05, + "loss": 0.0433, + "num_input_tokens_seen": 77999200, + "step": 64090 + }, + { + "epoch": 7.138322753090544, + "grad_norm": 0.7583354711532593, + "learning_rate": 4.060308839178707e-05, + "loss": 0.0869, + "num_input_tokens_seen": 78005088, + "step": 64095 + }, + { + "epoch": 7.138879607974162, + "grad_norm": 0.02712847851216793, + "learning_rate": 4.060118990157365e-05, + "loss": 0.0419, + "num_input_tokens_seen": 78011168, + "step": 64100 + }, + { + "epoch": 7.1394364628577796, + "grad_norm": 0.03387518599629402, + "learning_rate": 4.05992912639945e-05, + "loss": 0.0333, + "num_input_tokens_seen": 78017408, + "step": 64105 + }, + { + "epoch": 7.139993317741396, + "grad_norm": 0.866780161857605, + "learning_rate": 4.059739247906754e-05, + "loss": 0.0446, + "num_input_tokens_seen": 78023808, + "step": 64110 + }, + { + "epoch": 7.140550172625014, + "grad_norm": 0.9422434568405151, + "learning_rate": 4.0595493546810713e-05, + "loss": 0.0925, + "num_input_tokens_seen": 78029408, + "step": 64115 + }, + { + "epoch": 7.141107027508632, + "grad_norm": 0.9176614880561829, + "learning_rate": 4.0593594467241955e-05, + "loss": 0.0538, + "num_input_tokens_seen": 78035328, + "step": 64120 + }, + { + "epoch": 7.1416638823922485, + "grad_norm": 0.031825244426727295, + "learning_rate": 4.05916952403792e-05, + "loss": 0.0297, + "num_input_tokens_seen": 78041408, + "step": 64125 + }, + { + "epoch": 7.142220737275866, + "grad_norm": 0.013690765015780926, + "learning_rate": 4.05897958662404e-05, + "loss": 0.0038, + "num_input_tokens_seen": 78047808, + "step": 64130 + }, + { + "epoch": 7.142777592159483, + "grad_norm": 0.3035881817340851, + "learning_rate": 4.058789634484348e-05, + "loss": 0.0822, + "num_input_tokens_seen": 78054112, + "step": 64135 + }, + { + "epoch": 7.143334447043101, + "grad_norm": 1.3155945539474487, + "learning_rate": 4.058599667620639e-05, + "loss": 0.0433, + "num_input_tokens_seen": 78060352, + "step": 64140 + }, + { + "epoch": 7.143891301926718, + "grad_norm": 0.998144805431366, + "learning_rate": 4.058409686034708e-05, + "loss": 0.1666, + "num_input_tokens_seen": 78066272, + "step": 64145 + }, + { + "epoch": 7.144448156810335, + "grad_norm": 0.11013650894165039, + "learning_rate": 4.058219689728348e-05, + "loss": 0.1111, + "num_input_tokens_seen": 78072512, + "step": 64150 + }, + { + "epoch": 7.145005011693953, + "grad_norm": 0.12162912636995316, + "learning_rate": 4.0580296787033556e-05, + "loss": 0.0833, + "num_input_tokens_seen": 78078624, + "step": 64155 + }, + { + "epoch": 7.1455618665775695, + "grad_norm": 0.14224016666412354, + "learning_rate": 4.057839652961524e-05, + "loss": 0.0876, + "num_input_tokens_seen": 78084992, + "step": 64160 + }, + { + "epoch": 7.146118721461187, + "grad_norm": 0.07759448885917664, + "learning_rate": 4.057649612504649e-05, + "loss": 0.0372, + "num_input_tokens_seen": 78090816, + "step": 64165 + }, + { + "epoch": 7.146675576344805, + "grad_norm": 0.15641187131404877, + "learning_rate": 4.0574595573345254e-05, + "loss": 0.0112, + "num_input_tokens_seen": 78096992, + "step": 64170 + }, + { + "epoch": 7.147232431228422, + "grad_norm": 1.5712710618972778, + "learning_rate": 4.057269487452948e-05, + "loss": 0.1771, + "num_input_tokens_seen": 78102848, + "step": 64175 + }, + { + "epoch": 7.147789286112039, + "grad_norm": 0.6849897503852844, + "learning_rate": 4.0570794028617135e-05, + "loss": 0.0441, + "num_input_tokens_seen": 78108992, + "step": 64180 + }, + { + "epoch": 7.148346140995656, + "grad_norm": 0.0013033684808760881, + "learning_rate": 4.056889303562616e-05, + "loss": 0.0555, + "num_input_tokens_seen": 78115040, + "step": 64185 + }, + { + "epoch": 7.148902995879274, + "grad_norm": 1.044419765472412, + "learning_rate": 4.056699189557451e-05, + "loss": 0.1498, + "num_input_tokens_seen": 78121088, + "step": 64190 + }, + { + "epoch": 7.1494598507628915, + "grad_norm": 0.6056849360466003, + "learning_rate": 4.056509060848016e-05, + "loss": 0.0588, + "num_input_tokens_seen": 78126688, + "step": 64195 + }, + { + "epoch": 7.150016705646508, + "grad_norm": 0.001118853921070695, + "learning_rate": 4.056318917436106e-05, + "loss": 0.0008, + "num_input_tokens_seen": 78132928, + "step": 64200 + }, + { + "epoch": 7.150573560530126, + "grad_norm": 0.027824703603982925, + "learning_rate": 4.056128759323516e-05, + "loss": 0.0866, + "num_input_tokens_seen": 78139008, + "step": 64205 + }, + { + "epoch": 7.151130415413743, + "grad_norm": 0.35460829734802246, + "learning_rate": 4.0559385865120444e-05, + "loss": 0.0752, + "num_input_tokens_seen": 78145344, + "step": 64210 + }, + { + "epoch": 7.15168727029736, + "grad_norm": 0.5657157897949219, + "learning_rate": 4.055748399003485e-05, + "loss": 0.0552, + "num_input_tokens_seen": 78150688, + "step": 64215 + }, + { + "epoch": 7.152244125180978, + "grad_norm": 0.1909327507019043, + "learning_rate": 4.055558196799636e-05, + "loss": 0.1316, + "num_input_tokens_seen": 78156608, + "step": 64220 + }, + { + "epoch": 7.152800980064595, + "grad_norm": 0.49135130643844604, + "learning_rate": 4.055367979902294e-05, + "loss": 0.0392, + "num_input_tokens_seen": 78162528, + "step": 64225 + }, + { + "epoch": 7.1533578349482125, + "grad_norm": 0.14000080525875092, + "learning_rate": 4.055177748313255e-05, + "loss": 0.1347, + "num_input_tokens_seen": 78168768, + "step": 64230 + }, + { + "epoch": 7.15391468983183, + "grad_norm": 1.1873242855072021, + "learning_rate": 4.054987502034315e-05, + "loss": 0.0202, + "num_input_tokens_seen": 78175200, + "step": 64235 + }, + { + "epoch": 7.154471544715447, + "grad_norm": 0.026732292026281357, + "learning_rate": 4.054797241067273e-05, + "loss": 0.0209, + "num_input_tokens_seen": 78181184, + "step": 64240 + }, + { + "epoch": 7.155028399599065, + "grad_norm": 0.0024864384904503822, + "learning_rate": 4.054606965413926e-05, + "loss": 0.0161, + "num_input_tokens_seen": 78187072, + "step": 64245 + }, + { + "epoch": 7.1555852544826815, + "grad_norm": 0.07981529831886292, + "learning_rate": 4.0544166750760705e-05, + "loss": 0.0072, + "num_input_tokens_seen": 78193600, + "step": 64250 + }, + { + "epoch": 7.156142109366299, + "grad_norm": 0.35769322514533997, + "learning_rate": 4.054226370055504e-05, + "loss": 0.0193, + "num_input_tokens_seen": 78199776, + "step": 64255 + }, + { + "epoch": 7.156698964249917, + "grad_norm": 0.5066214203834534, + "learning_rate": 4.054036050354024e-05, + "loss": 0.1054, + "num_input_tokens_seen": 78206176, + "step": 64260 + }, + { + "epoch": 7.157255819133534, + "grad_norm": 0.7570441961288452, + "learning_rate": 4.053845715973429e-05, + "loss": 0.0668, + "num_input_tokens_seen": 78212672, + "step": 64265 + }, + { + "epoch": 7.157812674017151, + "grad_norm": 0.19493401050567627, + "learning_rate": 4.053655366915515e-05, + "loss": 0.0273, + "num_input_tokens_seen": 78218496, + "step": 64270 + }, + { + "epoch": 7.158369528900768, + "grad_norm": 0.07575736194849014, + "learning_rate": 4.0534650031820825e-05, + "loss": 0.0557, + "num_input_tokens_seen": 78224864, + "step": 64275 + }, + { + "epoch": 7.158926383784386, + "grad_norm": 0.4275434911251068, + "learning_rate": 4.053274624774928e-05, + "loss": 0.029, + "num_input_tokens_seen": 78230976, + "step": 64280 + }, + { + "epoch": 7.159483238668003, + "grad_norm": 0.002435691887512803, + "learning_rate": 4.05308423169585e-05, + "loss": 0.0829, + "num_input_tokens_seen": 78236896, + "step": 64285 + }, + { + "epoch": 7.16004009355162, + "grad_norm": 0.4288817346096039, + "learning_rate": 4.0528938239466475e-05, + "loss": 0.0372, + "num_input_tokens_seen": 78243040, + "step": 64290 + }, + { + "epoch": 7.160596948435238, + "grad_norm": 0.9699664115905762, + "learning_rate": 4.052703401529119e-05, + "loss": 0.0917, + "num_input_tokens_seen": 78249312, + "step": 64295 + }, + { + "epoch": 7.1611538033188555, + "grad_norm": 0.011376962997019291, + "learning_rate": 4.052512964445062e-05, + "loss": 0.0115, + "num_input_tokens_seen": 78255552, + "step": 64300 + }, + { + "epoch": 7.161710658202472, + "grad_norm": 0.47491946816444397, + "learning_rate": 4.0523225126962765e-05, + "loss": 0.0307, + "num_input_tokens_seen": 78261536, + "step": 64305 + }, + { + "epoch": 7.16226751308609, + "grad_norm": 0.26007410883903503, + "learning_rate": 4.052132046284561e-05, + "loss": 0.0458, + "num_input_tokens_seen": 78267520, + "step": 64310 + }, + { + "epoch": 7.162824367969707, + "grad_norm": 0.34800097346305847, + "learning_rate": 4.051941565211715e-05, + "loss": 0.044, + "num_input_tokens_seen": 78273088, + "step": 64315 + }, + { + "epoch": 7.1633812228533245, + "grad_norm": 1.3886702060699463, + "learning_rate": 4.051751069479538e-05, + "loss": 0.1089, + "num_input_tokens_seen": 78278688, + "step": 64320 + }, + { + "epoch": 7.163938077736942, + "grad_norm": 0.048402298241853714, + "learning_rate": 4.0515605590898284e-05, + "loss": 0.0749, + "num_input_tokens_seen": 78284224, + "step": 64325 + }, + { + "epoch": 7.164494932620559, + "grad_norm": 0.45427581667900085, + "learning_rate": 4.0513700340443864e-05, + "loss": 0.018, + "num_input_tokens_seen": 78289728, + "step": 64330 + }, + { + "epoch": 7.165051787504177, + "grad_norm": 0.18973711133003235, + "learning_rate": 4.0511794943450116e-05, + "loss": 0.0166, + "num_input_tokens_seen": 78296096, + "step": 64335 + }, + { + "epoch": 7.165608642387793, + "grad_norm": 1.2257388830184937, + "learning_rate": 4.0509889399935035e-05, + "loss": 0.0409, + "num_input_tokens_seen": 78302272, + "step": 64340 + }, + { + "epoch": 7.166165497271411, + "grad_norm": 0.02101057395339012, + "learning_rate": 4.050798370991662e-05, + "loss": 0.0047, + "num_input_tokens_seen": 78308128, + "step": 64345 + }, + { + "epoch": 7.166722352155029, + "grad_norm": 1.076891303062439, + "learning_rate": 4.050607787341287e-05, + "loss": 0.0462, + "num_input_tokens_seen": 78314304, + "step": 64350 + }, + { + "epoch": 7.1672792070386455, + "grad_norm": 0.25278589129447937, + "learning_rate": 4.0504171890441805e-05, + "loss": 0.043, + "num_input_tokens_seen": 78320416, + "step": 64355 + }, + { + "epoch": 7.167836061922263, + "grad_norm": 0.27837803959846497, + "learning_rate": 4.0502265761021404e-05, + "loss": 0.0388, + "num_input_tokens_seen": 78326592, + "step": 64360 + }, + { + "epoch": 7.16839291680588, + "grad_norm": 0.07436319440603256, + "learning_rate": 4.0500359485169695e-05, + "loss": 0.0105, + "num_input_tokens_seen": 78332224, + "step": 64365 + }, + { + "epoch": 7.168949771689498, + "grad_norm": 0.5721049308776855, + "learning_rate": 4.049845306290466e-05, + "loss": 0.0987, + "num_input_tokens_seen": 78338368, + "step": 64370 + }, + { + "epoch": 7.169506626573115, + "grad_norm": 0.44490236043930054, + "learning_rate": 4.049654649424432e-05, + "loss": 0.0624, + "num_input_tokens_seen": 78344160, + "step": 64375 + }, + { + "epoch": 7.170063481456732, + "grad_norm": 1.009816288948059, + "learning_rate": 4.0494639779206686e-05, + "loss": 0.026, + "num_input_tokens_seen": 78350368, + "step": 64380 + }, + { + "epoch": 7.17062033634035, + "grad_norm": 0.21078546345233917, + "learning_rate": 4.049273291780976e-05, + "loss": 0.0112, + "num_input_tokens_seen": 78356736, + "step": 64385 + }, + { + "epoch": 7.1711771912239675, + "grad_norm": 0.024727579206228256, + "learning_rate": 4.049082591007156e-05, + "loss": 0.1302, + "num_input_tokens_seen": 78363104, + "step": 64390 + }, + { + "epoch": 7.171734046107584, + "grad_norm": 0.0014571589417755604, + "learning_rate": 4.048891875601011e-05, + "loss": 0.0731, + "num_input_tokens_seen": 78368960, + "step": 64395 + }, + { + "epoch": 7.172290900991202, + "grad_norm": 0.439463347196579, + "learning_rate": 4.04870114556434e-05, + "loss": 0.1337, + "num_input_tokens_seen": 78375104, + "step": 64400 + }, + { + "epoch": 7.172847755874819, + "grad_norm": 0.2685795724391937, + "learning_rate": 4.048510400898946e-05, + "loss": 0.0167, + "num_input_tokens_seen": 78381216, + "step": 64405 + }, + { + "epoch": 7.173404610758436, + "grad_norm": 0.14738506078720093, + "learning_rate": 4.048319641606631e-05, + "loss": 0.0179, + "num_input_tokens_seen": 78387744, + "step": 64410 + }, + { + "epoch": 7.173961465642054, + "grad_norm": 0.8306459188461304, + "learning_rate": 4.048128867689196e-05, + "loss": 0.0583, + "num_input_tokens_seen": 78393920, + "step": 64415 + }, + { + "epoch": 7.174518320525671, + "grad_norm": 1.5547511577606201, + "learning_rate": 4.047938079148445e-05, + "loss": 0.1468, + "num_input_tokens_seen": 78400000, + "step": 64420 + }, + { + "epoch": 7.1750751754092885, + "grad_norm": 0.0003724446287378669, + "learning_rate": 4.047747275986177e-05, + "loss": 0.1233, + "num_input_tokens_seen": 78406080, + "step": 64425 + }, + { + "epoch": 7.175632030292905, + "grad_norm": 0.6066648364067078, + "learning_rate": 4.047556458204196e-05, + "loss": 0.1322, + "num_input_tokens_seen": 78412128, + "step": 64430 + }, + { + "epoch": 7.176188885176523, + "grad_norm": 0.7134895920753479, + "learning_rate": 4.047365625804305e-05, + "loss": 0.1089, + "num_input_tokens_seen": 78418368, + "step": 64435 + }, + { + "epoch": 7.176745740060141, + "grad_norm": 0.057451289147138596, + "learning_rate": 4.047174778788306e-05, + "loss": 0.0327, + "num_input_tokens_seen": 78424480, + "step": 64440 + }, + { + "epoch": 7.177302594943757, + "grad_norm": 0.2854313552379608, + "learning_rate": 4.046983917158001e-05, + "loss": 0.0303, + "num_input_tokens_seen": 78430880, + "step": 64445 + }, + { + "epoch": 7.177859449827375, + "grad_norm": 0.47856849431991577, + "learning_rate": 4.046793040915194e-05, + "loss": 0.055, + "num_input_tokens_seen": 78436864, + "step": 64450 + }, + { + "epoch": 7.178416304710992, + "grad_norm": 0.049609940499067307, + "learning_rate": 4.046602150061688e-05, + "loss": 0.1251, + "num_input_tokens_seen": 78442816, + "step": 64455 + }, + { + "epoch": 7.17897315959461, + "grad_norm": 0.0019750366918742657, + "learning_rate": 4.046411244599285e-05, + "loss": 0.0134, + "num_input_tokens_seen": 78448512, + "step": 64460 + }, + { + "epoch": 7.179530014478227, + "grad_norm": 0.1377037763595581, + "learning_rate": 4.0462203245297884e-05, + "loss": 0.0231, + "num_input_tokens_seen": 78454816, + "step": 64465 + }, + { + "epoch": 7.180086869361844, + "grad_norm": 0.2244454175233841, + "learning_rate": 4.0460293898550023e-05, + "loss": 0.04, + "num_input_tokens_seen": 78460928, + "step": 64470 + }, + { + "epoch": 7.180643724245462, + "grad_norm": 1.1165438890457153, + "learning_rate": 4.04583844057673e-05, + "loss": 0.0474, + "num_input_tokens_seen": 78466944, + "step": 64475 + }, + { + "epoch": 7.181200579129079, + "grad_norm": 0.5570887923240662, + "learning_rate": 4.045647476696776e-05, + "loss": 0.0614, + "num_input_tokens_seen": 78472864, + "step": 64480 + }, + { + "epoch": 7.181757434012696, + "grad_norm": 1.6336743831634521, + "learning_rate": 4.0454564982169416e-05, + "loss": 0.0759, + "num_input_tokens_seen": 78478976, + "step": 64485 + }, + { + "epoch": 7.182314288896314, + "grad_norm": 1.0973659753799438, + "learning_rate": 4.045265505139033e-05, + "loss": 0.0781, + "num_input_tokens_seen": 78484640, + "step": 64490 + }, + { + "epoch": 7.182871143779931, + "grad_norm": 0.08755531162023544, + "learning_rate": 4.045074497464854e-05, + "loss": 0.0728, + "num_input_tokens_seen": 78490720, + "step": 64495 + }, + { + "epoch": 7.183427998663548, + "grad_norm": 0.004960452672094107, + "learning_rate": 4.044883475196208e-05, + "loss": 0.0147, + "num_input_tokens_seen": 78497152, + "step": 64500 + }, + { + "epoch": 7.183984853547166, + "grad_norm": 0.2857833206653595, + "learning_rate": 4.0446924383349005e-05, + "loss": 0.0589, + "num_input_tokens_seen": 78503296, + "step": 64505 + }, + { + "epoch": 7.184541708430783, + "grad_norm": 0.8813538551330566, + "learning_rate": 4.0445013868827354e-05, + "loss": 0.0753, + "num_input_tokens_seen": 78509632, + "step": 64510 + }, + { + "epoch": 7.1850985633144, + "grad_norm": 0.004217160865664482, + "learning_rate": 4.044310320841517e-05, + "loss": 0.0252, + "num_input_tokens_seen": 78515584, + "step": 64515 + }, + { + "epoch": 7.185655418198017, + "grad_norm": 0.2392968237400055, + "learning_rate": 4.04411924021305e-05, + "loss": 0.0813, + "num_input_tokens_seen": 78521600, + "step": 64520 + }, + { + "epoch": 7.186212273081635, + "grad_norm": 0.00020040330127812922, + "learning_rate": 4.0439281449991395e-05, + "loss": 0.0111, + "num_input_tokens_seen": 78527712, + "step": 64525 + }, + { + "epoch": 7.186769127965253, + "grad_norm": 0.7296250462532043, + "learning_rate": 4.0437370352015907e-05, + "loss": 0.0109, + "num_input_tokens_seen": 78534016, + "step": 64530 + }, + { + "epoch": 7.187325982848869, + "grad_norm": 0.06925299763679504, + "learning_rate": 4.04354591082221e-05, + "loss": 0.1162, + "num_input_tokens_seen": 78540384, + "step": 64535 + }, + { + "epoch": 7.187882837732487, + "grad_norm": 0.0005800076760351658, + "learning_rate": 4.0433547718628e-05, + "loss": 0.1039, + "num_input_tokens_seen": 78546272, + "step": 64540 + }, + { + "epoch": 7.188439692616104, + "grad_norm": 0.17419292032718658, + "learning_rate": 4.043163618325168e-05, + "loss": 0.0898, + "num_input_tokens_seen": 78552480, + "step": 64545 + }, + { + "epoch": 7.1889965474997215, + "grad_norm": 1.4887745380401611, + "learning_rate": 4.04297245021112e-05, + "loss": 0.0754, + "num_input_tokens_seen": 78558624, + "step": 64550 + }, + { + "epoch": 7.189553402383339, + "grad_norm": 1.874233603477478, + "learning_rate": 4.0427812675224605e-05, + "loss": 0.1013, + "num_input_tokens_seen": 78564576, + "step": 64555 + }, + { + "epoch": 7.190110257266956, + "grad_norm": 0.014583505690097809, + "learning_rate": 4.0425900702609956e-05, + "loss": 0.0554, + "num_input_tokens_seen": 78570624, + "step": 64560 + }, + { + "epoch": 7.190667112150574, + "grad_norm": 2.465855836868286, + "learning_rate": 4.0423988584285324e-05, + "loss": 0.0357, + "num_input_tokens_seen": 78576896, + "step": 64565 + }, + { + "epoch": 7.191223967034191, + "grad_norm": 0.1189405545592308, + "learning_rate": 4.0422076320268756e-05, + "loss": 0.1042, + "num_input_tokens_seen": 78583232, + "step": 64570 + }, + { + "epoch": 7.191780821917808, + "grad_norm": 0.02135239727795124, + "learning_rate": 4.0420163910578316e-05, + "loss": 0.0152, + "num_input_tokens_seen": 78589472, + "step": 64575 + }, + { + "epoch": 7.192337676801426, + "grad_norm": 0.007013024762272835, + "learning_rate": 4.0418251355232084e-05, + "loss": 0.0739, + "num_input_tokens_seen": 78595424, + "step": 64580 + }, + { + "epoch": 7.192894531685043, + "grad_norm": 0.09968588501214981, + "learning_rate": 4.041633865424811e-05, + "loss": 0.0489, + "num_input_tokens_seen": 78601408, + "step": 64585 + }, + { + "epoch": 7.19345138656866, + "grad_norm": 0.002412698930129409, + "learning_rate": 4.041442580764447e-05, + "loss": 0.0966, + "num_input_tokens_seen": 78607488, + "step": 64590 + }, + { + "epoch": 7.194008241452278, + "grad_norm": 2.1663942337036133, + "learning_rate": 4.041251281543922e-05, + "loss": 0.0556, + "num_input_tokens_seen": 78613184, + "step": 64595 + }, + { + "epoch": 7.194565096335895, + "grad_norm": 0.00034474596031941473, + "learning_rate": 4.041059967765045e-05, + "loss": 0.0147, + "num_input_tokens_seen": 78619776, + "step": 64600 + }, + { + "epoch": 7.195121951219512, + "grad_norm": 1.1249022483825684, + "learning_rate": 4.040868639429621e-05, + "loss": 0.0477, + "num_input_tokens_seen": 78626016, + "step": 64605 + }, + { + "epoch": 7.195678806103129, + "grad_norm": 0.5849630236625671, + "learning_rate": 4.040677296539458e-05, + "loss": 0.0421, + "num_input_tokens_seen": 78631616, + "step": 64610 + }, + { + "epoch": 7.196235660986747, + "grad_norm": 0.0367705300450325, + "learning_rate": 4.040485939096365e-05, + "loss": 0.1371, + "num_input_tokens_seen": 78637920, + "step": 64615 + }, + { + "epoch": 7.1967925158703645, + "grad_norm": 1.4227304458618164, + "learning_rate": 4.040294567102146e-05, + "loss": 0.0873, + "num_input_tokens_seen": 78644032, + "step": 64620 + }, + { + "epoch": 7.197349370753981, + "grad_norm": 0.4189344644546509, + "learning_rate": 4.040103180558612e-05, + "loss": 0.147, + "num_input_tokens_seen": 78649952, + "step": 64625 + }, + { + "epoch": 7.197906225637599, + "grad_norm": 1.4035403728485107, + "learning_rate": 4.03991177946757e-05, + "loss": 0.049, + "num_input_tokens_seen": 78656320, + "step": 64630 + }, + { + "epoch": 7.198463080521216, + "grad_norm": 1.8592942953109741, + "learning_rate": 4.039720363830827e-05, + "loss": 0.1542, + "num_input_tokens_seen": 78662688, + "step": 64635 + }, + { + "epoch": 7.199019935404833, + "grad_norm": 0.008225132711231709, + "learning_rate": 4.039528933650191e-05, + "loss": 0.0865, + "num_input_tokens_seen": 78669024, + "step": 64640 + }, + { + "epoch": 7.199576790288451, + "grad_norm": 0.6383904814720154, + "learning_rate": 4.039337488927472e-05, + "loss": 0.088, + "num_input_tokens_seen": 78675392, + "step": 64645 + }, + { + "epoch": 7.200133645172068, + "grad_norm": 0.011322775855660439, + "learning_rate": 4.039146029664475e-05, + "loss": 0.0726, + "num_input_tokens_seen": 78681376, + "step": 64650 + }, + { + "epoch": 7.200690500055686, + "grad_norm": 0.5904276371002197, + "learning_rate": 4.038954555863013e-05, + "loss": 0.1626, + "num_input_tokens_seen": 78687552, + "step": 64655 + }, + { + "epoch": 7.201247354939303, + "grad_norm": 0.003873751498758793, + "learning_rate": 4.038763067524891e-05, + "loss": 0.0864, + "num_input_tokens_seen": 78693728, + "step": 64660 + }, + { + "epoch": 7.20180420982292, + "grad_norm": 0.5776534080505371, + "learning_rate": 4.0385715646519184e-05, + "loss": 0.0515, + "num_input_tokens_seen": 78699808, + "step": 64665 + }, + { + "epoch": 7.202361064706538, + "grad_norm": 0.052954114973545074, + "learning_rate": 4.038380047245905e-05, + "loss": 0.0677, + "num_input_tokens_seen": 78705824, + "step": 64670 + }, + { + "epoch": 7.2029179195901545, + "grad_norm": 0.0006345170550048351, + "learning_rate": 4.038188515308661e-05, + "loss": 0.0516, + "num_input_tokens_seen": 78711968, + "step": 64675 + }, + { + "epoch": 7.203474774473772, + "grad_norm": 0.4709741175174713, + "learning_rate": 4.037996968841993e-05, + "loss": 0.1075, + "num_input_tokens_seen": 78717920, + "step": 64680 + }, + { + "epoch": 7.20403162935739, + "grad_norm": 0.0005435197381302714, + "learning_rate": 4.0378054078477114e-05, + "loss": 0.0575, + "num_input_tokens_seen": 78724288, + "step": 64685 + }, + { + "epoch": 7.204588484241007, + "grad_norm": 0.6832264065742493, + "learning_rate": 4.0376138323276255e-05, + "loss": 0.1908, + "num_input_tokens_seen": 78729728, + "step": 64690 + }, + { + "epoch": 7.205145339124624, + "grad_norm": 1.8357901573181152, + "learning_rate": 4.0374222422835456e-05, + "loss": 0.0524, + "num_input_tokens_seen": 78735712, + "step": 64695 + }, + { + "epoch": 7.205702194008241, + "grad_norm": 0.9580324292182922, + "learning_rate": 4.037230637717281e-05, + "loss": 0.1055, + "num_input_tokens_seen": 78741824, + "step": 64700 + }, + { + "epoch": 7.206259048891859, + "grad_norm": 0.8661172986030579, + "learning_rate": 4.03703901863064e-05, + "loss": 0.0879, + "num_input_tokens_seen": 78747904, + "step": 64705 + }, + { + "epoch": 7.206815903775476, + "grad_norm": 1.06040358543396, + "learning_rate": 4.0368473850254353e-05, + "loss": 0.0321, + "num_input_tokens_seen": 78754176, + "step": 64710 + }, + { + "epoch": 7.207372758659093, + "grad_norm": 0.003228127723559737, + "learning_rate": 4.0366557369034755e-05, + "loss": 0.045, + "num_input_tokens_seen": 78760320, + "step": 64715 + }, + { + "epoch": 7.207929613542711, + "grad_norm": 0.29344475269317627, + "learning_rate": 4.0364640742665714e-05, + "loss": 0.0294, + "num_input_tokens_seen": 78766304, + "step": 64720 + }, + { + "epoch": 7.208486468426328, + "grad_norm": 1.1857874393463135, + "learning_rate": 4.036272397116532e-05, + "loss": 0.0286, + "num_input_tokens_seen": 78772576, + "step": 64725 + }, + { + "epoch": 7.209043323309945, + "grad_norm": 0.23325929045677185, + "learning_rate": 4.03608070545517e-05, + "loss": 0.0132, + "num_input_tokens_seen": 78778784, + "step": 64730 + }, + { + "epoch": 7.209600178193563, + "grad_norm": 1.2110862731933594, + "learning_rate": 4.0358889992842955e-05, + "loss": 0.0795, + "num_input_tokens_seen": 78784928, + "step": 64735 + }, + { + "epoch": 7.21015703307718, + "grad_norm": 0.03333209455013275, + "learning_rate": 4.035697278605718e-05, + "loss": 0.0173, + "num_input_tokens_seen": 78791008, + "step": 64740 + }, + { + "epoch": 7.2107138879607975, + "grad_norm": 0.0001402837660862133, + "learning_rate": 4.0355055434212493e-05, + "loss": 0.0242, + "num_input_tokens_seen": 78797056, + "step": 64745 + }, + { + "epoch": 7.211270742844415, + "grad_norm": 0.34525516629219055, + "learning_rate": 4.0353137937327005e-05, + "loss": 0.043, + "num_input_tokens_seen": 78802816, + "step": 64750 + }, + { + "epoch": 7.211827597728032, + "grad_norm": 1.2639174461364746, + "learning_rate": 4.035122029541883e-05, + "loss": 0.1498, + "num_input_tokens_seen": 78809024, + "step": 64755 + }, + { + "epoch": 7.21238445261165, + "grad_norm": 0.36998727917671204, + "learning_rate": 4.034930250850608e-05, + "loss": 0.049, + "num_input_tokens_seen": 78814816, + "step": 64760 + }, + { + "epoch": 7.212941307495266, + "grad_norm": 0.5556043982505798, + "learning_rate": 4.034738457660687e-05, + "loss": 0.0657, + "num_input_tokens_seen": 78820928, + "step": 64765 + }, + { + "epoch": 7.213498162378884, + "grad_norm": 0.021686172112822533, + "learning_rate": 4.034546649973932e-05, + "loss": 0.0563, + "num_input_tokens_seen": 78827296, + "step": 64770 + }, + { + "epoch": 7.214055017262502, + "grad_norm": 0.3716000020503998, + "learning_rate": 4.034354827792154e-05, + "loss": 0.0775, + "num_input_tokens_seen": 78833504, + "step": 64775 + }, + { + "epoch": 7.2146118721461185, + "grad_norm": 0.3967946469783783, + "learning_rate": 4.034162991117165e-05, + "loss": 0.0514, + "num_input_tokens_seen": 78839744, + "step": 64780 + }, + { + "epoch": 7.215168727029736, + "grad_norm": 0.24132803082466125, + "learning_rate": 4.0339711399507785e-05, + "loss": 0.0717, + "num_input_tokens_seen": 78845728, + "step": 64785 + }, + { + "epoch": 7.215725581913353, + "grad_norm": 0.04701171815395355, + "learning_rate": 4.0337792742948045e-05, + "loss": 0.1246, + "num_input_tokens_seen": 78851456, + "step": 64790 + }, + { + "epoch": 7.216282436796971, + "grad_norm": 0.22372347116470337, + "learning_rate": 4.033587394151057e-05, + "loss": 0.0342, + "num_input_tokens_seen": 78857440, + "step": 64795 + }, + { + "epoch": 7.216839291680588, + "grad_norm": 0.2979326844215393, + "learning_rate": 4.033395499521348e-05, + "loss": 0.0664, + "num_input_tokens_seen": 78863584, + "step": 64800 + }, + { + "epoch": 7.217396146564205, + "grad_norm": 0.05345764383673668, + "learning_rate": 4.033203590407489e-05, + "loss": 0.0495, + "num_input_tokens_seen": 78869856, + "step": 64805 + }, + { + "epoch": 7.217953001447823, + "grad_norm": 0.018586380407214165, + "learning_rate": 4.033011666811295e-05, + "loss": 0.1204, + "num_input_tokens_seen": 78876000, + "step": 64810 + }, + { + "epoch": 7.21850985633144, + "grad_norm": 0.7802104949951172, + "learning_rate": 4.032819728734577e-05, + "loss": 0.0414, + "num_input_tokens_seen": 78882016, + "step": 64815 + }, + { + "epoch": 7.219066711215057, + "grad_norm": 0.002164066769182682, + "learning_rate": 4.0326277761791486e-05, + "loss": 0.0197, + "num_input_tokens_seen": 78888224, + "step": 64820 + }, + { + "epoch": 7.219623566098675, + "grad_norm": 0.040575698018074036, + "learning_rate": 4.032435809146823e-05, + "loss": 0.0027, + "num_input_tokens_seen": 78894464, + "step": 64825 + }, + { + "epoch": 7.220180420982292, + "grad_norm": 0.12905089557170868, + "learning_rate": 4.032243827639414e-05, + "loss": 0.1784, + "num_input_tokens_seen": 78900544, + "step": 64830 + }, + { + "epoch": 7.220737275865909, + "grad_norm": 0.000670107954647392, + "learning_rate": 4.032051831658733e-05, + "loss": 0.0837, + "num_input_tokens_seen": 78905920, + "step": 64835 + }, + { + "epoch": 7.221294130749527, + "grad_norm": 0.04229814559221268, + "learning_rate": 4.031859821206596e-05, + "loss": 0.0389, + "num_input_tokens_seen": 78911936, + "step": 64840 + }, + { + "epoch": 7.221850985633144, + "grad_norm": 0.4242783486843109, + "learning_rate": 4.031667796284815e-05, + "loss": 0.0111, + "num_input_tokens_seen": 78917824, + "step": 64845 + }, + { + "epoch": 7.2224078405167615, + "grad_norm": 0.4901961088180542, + "learning_rate": 4.0314757568952056e-05, + "loss": 0.0147, + "num_input_tokens_seen": 78923776, + "step": 64850 + }, + { + "epoch": 7.222964695400378, + "grad_norm": 0.1294529289007187, + "learning_rate": 4.0312837030395804e-05, + "loss": 0.0042, + "num_input_tokens_seen": 78930272, + "step": 64855 + }, + { + "epoch": 7.223521550283996, + "grad_norm": 0.10078208148479462, + "learning_rate": 4.0310916347197536e-05, + "loss": 0.0563, + "num_input_tokens_seen": 78936224, + "step": 64860 + }, + { + "epoch": 7.224078405167614, + "grad_norm": 0.05031580477952957, + "learning_rate": 4.030899551937539e-05, + "loss": 0.1298, + "num_input_tokens_seen": 78942400, + "step": 64865 + }, + { + "epoch": 7.2246352600512305, + "grad_norm": 0.1589682400226593, + "learning_rate": 4.030707454694752e-05, + "loss": 0.0179, + "num_input_tokens_seen": 78948480, + "step": 64870 + }, + { + "epoch": 7.225192114934848, + "grad_norm": 0.007853002287447453, + "learning_rate": 4.030515342993207e-05, + "loss": 0.0062, + "num_input_tokens_seen": 78954688, + "step": 64875 + }, + { + "epoch": 7.225748969818465, + "grad_norm": 1.460425615310669, + "learning_rate": 4.030323216834718e-05, + "loss": 0.0546, + "num_input_tokens_seen": 78960896, + "step": 64880 + }, + { + "epoch": 7.226305824702083, + "grad_norm": 0.31782039999961853, + "learning_rate": 4.0301310762211e-05, + "loss": 0.0227, + "num_input_tokens_seen": 78967552, + "step": 64885 + }, + { + "epoch": 7.2268626795857, + "grad_norm": 0.13169778883457184, + "learning_rate": 4.029938921154168e-05, + "loss": 0.0726, + "num_input_tokens_seen": 78972992, + "step": 64890 + }, + { + "epoch": 7.227419534469317, + "grad_norm": 0.11714497953653336, + "learning_rate": 4.029746751635738e-05, + "loss": 0.027, + "num_input_tokens_seen": 78979200, + "step": 64895 + }, + { + "epoch": 7.227976389352935, + "grad_norm": 0.7165853381156921, + "learning_rate": 4.029554567667624e-05, + "loss": 0.0994, + "num_input_tokens_seen": 78985376, + "step": 64900 + }, + { + "epoch": 7.2285332442365515, + "grad_norm": 0.0009757569641806185, + "learning_rate": 4.029362369251641e-05, + "loss": 0.0132, + "num_input_tokens_seen": 78991488, + "step": 64905 + }, + { + "epoch": 7.229090099120169, + "grad_norm": 1.1017272472381592, + "learning_rate": 4.029170156389606e-05, + "loss": 0.1995, + "num_input_tokens_seen": 78997536, + "step": 64910 + }, + { + "epoch": 7.229646954003787, + "grad_norm": 0.10647328197956085, + "learning_rate": 4.028977929083333e-05, + "loss": 0.0928, + "num_input_tokens_seen": 79003712, + "step": 64915 + }, + { + "epoch": 7.230203808887404, + "grad_norm": 0.8715558648109436, + "learning_rate": 4.028785687334639e-05, + "loss": 0.0372, + "num_input_tokens_seen": 79010016, + "step": 64920 + }, + { + "epoch": 7.230760663771021, + "grad_norm": 0.2675361931324005, + "learning_rate": 4.028593431145339e-05, + "loss": 0.0965, + "num_input_tokens_seen": 79016032, + "step": 64925 + }, + { + "epoch": 7.231317518654639, + "grad_norm": 0.5129056572914124, + "learning_rate": 4.028401160517249e-05, + "loss": 0.0097, + "num_input_tokens_seen": 79022464, + "step": 64930 + }, + { + "epoch": 7.231874373538256, + "grad_norm": 2.2953948974609375, + "learning_rate": 4.0282088754521864e-05, + "loss": 0.166, + "num_input_tokens_seen": 79028480, + "step": 64935 + }, + { + "epoch": 7.2324312284218735, + "grad_norm": 0.9495099186897278, + "learning_rate": 4.0280165759519657e-05, + "loss": 0.1322, + "num_input_tokens_seen": 79034368, + "step": 64940 + }, + { + "epoch": 7.23298808330549, + "grad_norm": 0.5952393412590027, + "learning_rate": 4.027824262018405e-05, + "loss": 0.0119, + "num_input_tokens_seen": 79040576, + "step": 64945 + }, + { + "epoch": 7.233544938189108, + "grad_norm": 0.08449696004390717, + "learning_rate": 4.0276319336533194e-05, + "loss": 0.1429, + "num_input_tokens_seen": 79046528, + "step": 64950 + }, + { + "epoch": 7.234101793072726, + "grad_norm": 0.01091244537383318, + "learning_rate": 4.027439590858527e-05, + "loss": 0.0509, + "num_input_tokens_seen": 79052288, + "step": 64955 + }, + { + "epoch": 7.234658647956342, + "grad_norm": 0.0018972167745232582, + "learning_rate": 4.027247233635843e-05, + "loss": 0.084, + "num_input_tokens_seen": 79058528, + "step": 64960 + }, + { + "epoch": 7.23521550283996, + "grad_norm": 0.5390279293060303, + "learning_rate": 4.027054861987085e-05, + "loss": 0.0995, + "num_input_tokens_seen": 79064384, + "step": 64965 + }, + { + "epoch": 7.235772357723577, + "grad_norm": 0.015348642133176327, + "learning_rate": 4.026862475914072e-05, + "loss": 0.0789, + "num_input_tokens_seen": 79070752, + "step": 64970 + }, + { + "epoch": 7.2363292126071945, + "grad_norm": 0.037473369389772415, + "learning_rate": 4.026670075418618e-05, + "loss": 0.0275, + "num_input_tokens_seen": 79077056, + "step": 64975 + }, + { + "epoch": 7.236886067490812, + "grad_norm": 0.038380395621061325, + "learning_rate": 4.026477660502543e-05, + "loss": 0.0069, + "num_input_tokens_seen": 79083168, + "step": 64980 + }, + { + "epoch": 7.237442922374429, + "grad_norm": 0.21594971418380737, + "learning_rate": 4.0262852311676634e-05, + "loss": 0.0935, + "num_input_tokens_seen": 79088864, + "step": 64985 + }, + { + "epoch": 7.237999777258047, + "grad_norm": 0.5066787600517273, + "learning_rate": 4.0260927874157964e-05, + "loss": 0.0374, + "num_input_tokens_seen": 79095136, + "step": 64990 + }, + { + "epoch": 7.238556632141664, + "grad_norm": 0.008007239550352097, + "learning_rate": 4.025900329248761e-05, + "loss": 0.0077, + "num_input_tokens_seen": 79101344, + "step": 64995 + }, + { + "epoch": 7.239113487025281, + "grad_norm": 0.012275571003556252, + "learning_rate": 4.0257078566683735e-05, + "loss": 0.0642, + "num_input_tokens_seen": 79107296, + "step": 65000 + }, + { + "epoch": 7.239670341908899, + "grad_norm": 0.6079505085945129, + "learning_rate": 4.0255153696764544e-05, + "loss": 0.0848, + "num_input_tokens_seen": 79113408, + "step": 65005 + }, + { + "epoch": 7.240227196792516, + "grad_norm": 0.036610644310712814, + "learning_rate": 4.025322868274819e-05, + "loss": 0.1494, + "num_input_tokens_seen": 79119296, + "step": 65010 + }, + { + "epoch": 7.240784051676133, + "grad_norm": 0.030286967754364014, + "learning_rate": 4.0251303524652885e-05, + "loss": 0.045, + "num_input_tokens_seen": 79125184, + "step": 65015 + }, + { + "epoch": 7.241340906559751, + "grad_norm": 0.018914582207798958, + "learning_rate": 4.0249378222496786e-05, + "loss": 0.0508, + "num_input_tokens_seen": 79131136, + "step": 65020 + }, + { + "epoch": 7.241897761443368, + "grad_norm": 0.1596890389919281, + "learning_rate": 4.02474527762981e-05, + "loss": 0.0277, + "num_input_tokens_seen": 79137280, + "step": 65025 + }, + { + "epoch": 7.242454616326985, + "grad_norm": 0.4685450494289398, + "learning_rate": 4.024552718607499e-05, + "loss": 0.013, + "num_input_tokens_seen": 79143488, + "step": 65030 + }, + { + "epoch": 7.243011471210602, + "grad_norm": 0.37504851818084717, + "learning_rate": 4.024360145184568e-05, + "loss": 0.043, + "num_input_tokens_seen": 79149952, + "step": 65035 + }, + { + "epoch": 7.24356832609422, + "grad_norm": 0.06051082909107208, + "learning_rate": 4.024167557362833e-05, + "loss": 0.0242, + "num_input_tokens_seen": 79155712, + "step": 65040 + }, + { + "epoch": 7.2441251809778375, + "grad_norm": 0.18113772571086884, + "learning_rate": 4.023974955144115e-05, + "loss": 0.1039, + "num_input_tokens_seen": 79161888, + "step": 65045 + }, + { + "epoch": 7.244682035861454, + "grad_norm": 0.0017166759353131056, + "learning_rate": 4.023782338530233e-05, + "loss": 0.0819, + "num_input_tokens_seen": 79167936, + "step": 65050 + }, + { + "epoch": 7.245238890745072, + "grad_norm": 0.19119319319725037, + "learning_rate": 4.0235897075230055e-05, + "loss": 0.0167, + "num_input_tokens_seen": 79174176, + "step": 65055 + }, + { + "epoch": 7.245795745628689, + "grad_norm": 0.385917603969574, + "learning_rate": 4.0233970621242525e-05, + "loss": 0.1089, + "num_input_tokens_seen": 79180064, + "step": 65060 + }, + { + "epoch": 7.2463526005123065, + "grad_norm": 0.16208700835704803, + "learning_rate": 4.023204402335793e-05, + "loss": 0.0096, + "num_input_tokens_seen": 79186048, + "step": 65065 + }, + { + "epoch": 7.246909455395924, + "grad_norm": 0.0006416455726139247, + "learning_rate": 4.023011728159448e-05, + "loss": 0.1468, + "num_input_tokens_seen": 79191904, + "step": 65070 + }, + { + "epoch": 7.247466310279541, + "grad_norm": 0.6017850637435913, + "learning_rate": 4.022819039597038e-05, + "loss": 0.0667, + "num_input_tokens_seen": 79198048, + "step": 65075 + }, + { + "epoch": 7.248023165163159, + "grad_norm": 1.1763516664505005, + "learning_rate": 4.0226263366503814e-05, + "loss": 0.0219, + "num_input_tokens_seen": 79204096, + "step": 65080 + }, + { + "epoch": 7.248580020046775, + "grad_norm": 0.00042631171527318656, + "learning_rate": 4.0224336193212985e-05, + "loss": 0.0058, + "num_input_tokens_seen": 79210432, + "step": 65085 + }, + { + "epoch": 7.249136874930393, + "grad_norm": 0.7459937334060669, + "learning_rate": 4.022240887611611e-05, + "loss": 0.0929, + "num_input_tokens_seen": 79216256, + "step": 65090 + }, + { + "epoch": 7.249693729814011, + "grad_norm": 0.006036362610757351, + "learning_rate": 4.022048141523138e-05, + "loss": 0.0632, + "num_input_tokens_seen": 79222624, + "step": 65095 + }, + { + "epoch": 7.2502505846976275, + "grad_norm": 0.3957403600215912, + "learning_rate": 4.021855381057702e-05, + "loss": 0.01, + "num_input_tokens_seen": 79228992, + "step": 65100 + }, + { + "epoch": 7.250807439581245, + "grad_norm": 0.21725142002105713, + "learning_rate": 4.021662606217122e-05, + "loss": 0.0221, + "num_input_tokens_seen": 79235040, + "step": 65105 + }, + { + "epoch": 7.251364294464863, + "grad_norm": 0.03322717174887657, + "learning_rate": 4.0214698170032195e-05, + "loss": 0.0279, + "num_input_tokens_seen": 79240960, + "step": 65110 + }, + { + "epoch": 7.25192114934848, + "grad_norm": 0.00033508878550492227, + "learning_rate": 4.021277013417816e-05, + "loss": 0.0311, + "num_input_tokens_seen": 79247520, + "step": 65115 + }, + { + "epoch": 7.252478004232097, + "grad_norm": 0.29686295986175537, + "learning_rate": 4.021084195462732e-05, + "loss": 0.0073, + "num_input_tokens_seen": 79253888, + "step": 65120 + }, + { + "epoch": 7.253034859115714, + "grad_norm": 1.6804152727127075, + "learning_rate": 4.020891363139789e-05, + "loss": 0.2173, + "num_input_tokens_seen": 79259904, + "step": 65125 + }, + { + "epoch": 7.253591713999332, + "grad_norm": 0.3759358823299408, + "learning_rate": 4.0206985164508085e-05, + "loss": 0.018, + "num_input_tokens_seen": 79266336, + "step": 65130 + }, + { + "epoch": 7.2541485688829495, + "grad_norm": 0.013076795265078545, + "learning_rate": 4.020505655397612e-05, + "loss": 0.0543, + "num_input_tokens_seen": 79272256, + "step": 65135 + }, + { + "epoch": 7.254705423766566, + "grad_norm": 0.47884443402290344, + "learning_rate": 4.020312779982022e-05, + "loss": 0.0777, + "num_input_tokens_seen": 79278400, + "step": 65140 + }, + { + "epoch": 7.255262278650184, + "grad_norm": 0.00019541832443792373, + "learning_rate": 4.020119890205859e-05, + "loss": 0.1039, + "num_input_tokens_seen": 79284640, + "step": 65145 + }, + { + "epoch": 7.255819133533801, + "grad_norm": 0.09843640774488449, + "learning_rate": 4.019926986070947e-05, + "loss": 0.0127, + "num_input_tokens_seen": 79290688, + "step": 65150 + }, + { + "epoch": 7.256375988417418, + "grad_norm": 1.054221749305725, + "learning_rate": 4.019734067579105e-05, + "loss": 0.2526, + "num_input_tokens_seen": 79296832, + "step": 65155 + }, + { + "epoch": 7.256932843301036, + "grad_norm": 0.2979907989501953, + "learning_rate": 4.0195411347321586e-05, + "loss": 0.1088, + "num_input_tokens_seen": 79302944, + "step": 65160 + }, + { + "epoch": 7.257489698184653, + "grad_norm": 0.08043094724416733, + "learning_rate": 4.019348187531928e-05, + "loss": 0.2209, + "num_input_tokens_seen": 79308608, + "step": 65165 + }, + { + "epoch": 7.2580465530682705, + "grad_norm": 1.1378958225250244, + "learning_rate": 4.0191552259802364e-05, + "loss": 0.1239, + "num_input_tokens_seen": 79314784, + "step": 65170 + }, + { + "epoch": 7.258603407951888, + "grad_norm": 1.8952001333236694, + "learning_rate": 4.018962250078907e-05, + "loss": 0.0976, + "num_input_tokens_seen": 79320992, + "step": 65175 + }, + { + "epoch": 7.259160262835505, + "grad_norm": 2.296426773071289, + "learning_rate": 4.018769259829763e-05, + "loss": 0.1791, + "num_input_tokens_seen": 79327168, + "step": 65180 + }, + { + "epoch": 7.259717117719123, + "grad_norm": 0.06634242087602615, + "learning_rate": 4.018576255234625e-05, + "loss": 0.0378, + "num_input_tokens_seen": 79333408, + "step": 65185 + }, + { + "epoch": 7.260273972602739, + "grad_norm": 0.0011365854879841208, + "learning_rate": 4.0183832362953185e-05, + "loss": 0.02, + "num_input_tokens_seen": 79339680, + "step": 65190 + }, + { + "epoch": 7.260830827486357, + "grad_norm": 0.5476183891296387, + "learning_rate": 4.0181902030136654e-05, + "loss": 0.0609, + "num_input_tokens_seen": 79345536, + "step": 65195 + }, + { + "epoch": 7.261387682369975, + "grad_norm": 0.8963492512702942, + "learning_rate": 4.01799715539149e-05, + "loss": 0.0507, + "num_input_tokens_seen": 79351680, + "step": 65200 + }, + { + "epoch": 7.261944537253592, + "grad_norm": 0.1534436196088791, + "learning_rate": 4.017804093430615e-05, + "loss": 0.0919, + "num_input_tokens_seen": 79357184, + "step": 65205 + }, + { + "epoch": 7.262501392137209, + "grad_norm": 0.049852460622787476, + "learning_rate": 4.017611017132864e-05, + "loss": 0.0326, + "num_input_tokens_seen": 79363936, + "step": 65210 + }, + { + "epoch": 7.263058247020826, + "grad_norm": 0.002803780836984515, + "learning_rate": 4.017417926500061e-05, + "loss": 0.0612, + "num_input_tokens_seen": 79370080, + "step": 65215 + }, + { + "epoch": 7.263615101904444, + "grad_norm": 0.03178081661462784, + "learning_rate": 4.0172248215340305e-05, + "loss": 0.0104, + "num_input_tokens_seen": 79376352, + "step": 65220 + }, + { + "epoch": 7.264171956788061, + "grad_norm": 0.7732500433921814, + "learning_rate": 4.0170317022365956e-05, + "loss": 0.0109, + "num_input_tokens_seen": 79382880, + "step": 65225 + }, + { + "epoch": 7.264728811671678, + "grad_norm": 1.2905259132385254, + "learning_rate": 4.016838568609581e-05, + "loss": 0.0686, + "num_input_tokens_seen": 79389152, + "step": 65230 + }, + { + "epoch": 7.265285666555296, + "grad_norm": 1.5377700328826904, + "learning_rate": 4.0166454206548107e-05, + "loss": 0.091, + "num_input_tokens_seen": 79395104, + "step": 65235 + }, + { + "epoch": 7.265842521438913, + "grad_norm": 0.061151646077632904, + "learning_rate": 4.0164522583741095e-05, + "loss": 0.0083, + "num_input_tokens_seen": 79401248, + "step": 65240 + }, + { + "epoch": 7.26639937632253, + "grad_norm": 0.23614844679832458, + "learning_rate": 4.0162590817693013e-05, + "loss": 0.0062, + "num_input_tokens_seen": 79407200, + "step": 65245 + }, + { + "epoch": 7.266956231206148, + "grad_norm": 1.1115343570709229, + "learning_rate": 4.016065890842212e-05, + "loss": 0.1292, + "num_input_tokens_seen": 79413408, + "step": 65250 + }, + { + "epoch": 7.267513086089765, + "grad_norm": 1.2495155334472656, + "learning_rate": 4.015872685594665e-05, + "loss": 0.1137, + "num_input_tokens_seen": 79419712, + "step": 65255 + }, + { + "epoch": 7.268069940973382, + "grad_norm": 0.041404128074645996, + "learning_rate": 4.015679466028486e-05, + "loss": 0.0222, + "num_input_tokens_seen": 79426048, + "step": 65260 + }, + { + "epoch": 7.268626795856999, + "grad_norm": 0.7270302176475525, + "learning_rate": 4.0154862321455014e-05, + "loss": 0.1271, + "num_input_tokens_seen": 79431648, + "step": 65265 + }, + { + "epoch": 7.269183650740617, + "grad_norm": 0.007986624725162983, + "learning_rate": 4.015292983947534e-05, + "loss": 0.0601, + "num_input_tokens_seen": 79437792, + "step": 65270 + }, + { + "epoch": 7.269740505624235, + "grad_norm": 0.9371218085289001, + "learning_rate": 4.0150997214364104e-05, + "loss": 0.0895, + "num_input_tokens_seen": 79443936, + "step": 65275 + }, + { + "epoch": 7.270297360507851, + "grad_norm": 0.32507190108299255, + "learning_rate": 4.0149064446139565e-05, + "loss": 0.0278, + "num_input_tokens_seen": 79449984, + "step": 65280 + }, + { + "epoch": 7.270854215391469, + "grad_norm": 0.690949022769928, + "learning_rate": 4.014713153481997e-05, + "loss": 0.0728, + "num_input_tokens_seen": 79455680, + "step": 65285 + }, + { + "epoch": 7.271411070275087, + "grad_norm": 0.07373739778995514, + "learning_rate": 4.014519848042359e-05, + "loss": 0.0675, + "num_input_tokens_seen": 79461472, + "step": 65290 + }, + { + "epoch": 7.2719679251587035, + "grad_norm": 1.1750859022140503, + "learning_rate": 4.014326528296866e-05, + "loss": 0.0346, + "num_input_tokens_seen": 79467840, + "step": 65295 + }, + { + "epoch": 7.272524780042321, + "grad_norm": 1.8187907934188843, + "learning_rate": 4.014133194247347e-05, + "loss": 0.0425, + "num_input_tokens_seen": 79473888, + "step": 65300 + }, + { + "epoch": 7.273081634925938, + "grad_norm": 0.004759581759572029, + "learning_rate": 4.013939845895626e-05, + "loss": 0.0221, + "num_input_tokens_seen": 79479776, + "step": 65305 + }, + { + "epoch": 7.273638489809556, + "grad_norm": 0.0004628050373867154, + "learning_rate": 4.013746483243531e-05, + "loss": 0.0419, + "num_input_tokens_seen": 79486176, + "step": 65310 + }, + { + "epoch": 7.274195344693173, + "grad_norm": 0.052994254976511, + "learning_rate": 4.0135531062928877e-05, + "loss": 0.0154, + "num_input_tokens_seen": 79492448, + "step": 65315 + }, + { + "epoch": 7.27475219957679, + "grad_norm": 0.0038755210116505623, + "learning_rate": 4.013359715045522e-05, + "loss": 0.0202, + "num_input_tokens_seen": 79498592, + "step": 65320 + }, + { + "epoch": 7.275309054460408, + "grad_norm": 0.03877291455864906, + "learning_rate": 4.013166309503262e-05, + "loss": 0.0121, + "num_input_tokens_seen": 79504992, + "step": 65325 + }, + { + "epoch": 7.2758659093440246, + "grad_norm": 0.9966668486595154, + "learning_rate": 4.012972889667933e-05, + "loss": 0.0712, + "num_input_tokens_seen": 79510688, + "step": 65330 + }, + { + "epoch": 7.276422764227642, + "grad_norm": 0.08403881639242172, + "learning_rate": 4.012779455541364e-05, + "loss": 0.0227, + "num_input_tokens_seen": 79516672, + "step": 65335 + }, + { + "epoch": 7.27697961911126, + "grad_norm": 0.02554389089345932, + "learning_rate": 4.01258600712538e-05, + "loss": 0.0279, + "num_input_tokens_seen": 79522880, + "step": 65340 + }, + { + "epoch": 7.277536473994877, + "grad_norm": 0.8530966639518738, + "learning_rate": 4.01239254442181e-05, + "loss": 0.1201, + "num_input_tokens_seen": 79528896, + "step": 65345 + }, + { + "epoch": 7.278093328878494, + "grad_norm": 0.0429496094584465, + "learning_rate": 4.0121990674324805e-05, + "loss": 0.0868, + "num_input_tokens_seen": 79535232, + "step": 65350 + }, + { + "epoch": 7.278650183762112, + "grad_norm": 0.17664222419261932, + "learning_rate": 4.012005576159219e-05, + "loss": 0.0864, + "num_input_tokens_seen": 79541024, + "step": 65355 + }, + { + "epoch": 7.279207038645729, + "grad_norm": 0.004025358706712723, + "learning_rate": 4.011812070603854e-05, + "loss": 0.1408, + "num_input_tokens_seen": 79547232, + "step": 65360 + }, + { + "epoch": 7.2797638935293465, + "grad_norm": 0.004979403223842382, + "learning_rate": 4.0116185507682126e-05, + "loss": 0.0154, + "num_input_tokens_seen": 79553376, + "step": 65365 + }, + { + "epoch": 7.280320748412963, + "grad_norm": 0.42020800709724426, + "learning_rate": 4.0114250166541226e-05, + "loss": 0.079, + "num_input_tokens_seen": 79559488, + "step": 65370 + }, + { + "epoch": 7.280877603296581, + "grad_norm": 0.7088748812675476, + "learning_rate": 4.011231468263412e-05, + "loss": 0.1263, + "num_input_tokens_seen": 79565632, + "step": 65375 + }, + { + "epoch": 7.281434458180199, + "grad_norm": 0.2218891829252243, + "learning_rate": 4.0110379055979104e-05, + "loss": 0.1283, + "num_input_tokens_seen": 79571360, + "step": 65380 + }, + { + "epoch": 7.281991313063815, + "grad_norm": 0.027138663455843925, + "learning_rate": 4.0108443286594446e-05, + "loss": 0.0484, + "num_input_tokens_seen": 79577376, + "step": 65385 + }, + { + "epoch": 7.282548167947433, + "grad_norm": 0.9563587307929993, + "learning_rate": 4.010650737449844e-05, + "loss": 0.1698, + "num_input_tokens_seen": 79583488, + "step": 65390 + }, + { + "epoch": 7.28310502283105, + "grad_norm": 0.8882016539573669, + "learning_rate": 4.010457131970936e-05, + "loss": 0.0465, + "num_input_tokens_seen": 79589504, + "step": 65395 + }, + { + "epoch": 7.2836618777146676, + "grad_norm": 0.007225906942039728, + "learning_rate": 4.0102635122245516e-05, + "loss": 0.0365, + "num_input_tokens_seen": 79595072, + "step": 65400 + }, + { + "epoch": 7.284218732598285, + "grad_norm": 0.38063856959342957, + "learning_rate": 4.0100698782125167e-05, + "loss": 0.0081, + "num_input_tokens_seen": 79601248, + "step": 65405 + }, + { + "epoch": 7.284775587481902, + "grad_norm": 0.6953485608100891, + "learning_rate": 4.009876229936663e-05, + "loss": 0.075, + "num_input_tokens_seen": 79607808, + "step": 65410 + }, + { + "epoch": 7.28533244236552, + "grad_norm": 0.1320028007030487, + "learning_rate": 4.009682567398818e-05, + "loss": 0.0545, + "num_input_tokens_seen": 79614176, + "step": 65415 + }, + { + "epoch": 7.2858892972491365, + "grad_norm": 0.9710831642150879, + "learning_rate": 4.009488890600812e-05, + "loss": 0.0188, + "num_input_tokens_seen": 79619904, + "step": 65420 + }, + { + "epoch": 7.286446152132754, + "grad_norm": 0.05659162998199463, + "learning_rate": 4.009295199544475e-05, + "loss": 0.1207, + "num_input_tokens_seen": 79625920, + "step": 65425 + }, + { + "epoch": 7.287003007016372, + "grad_norm": 0.5254117250442505, + "learning_rate": 4.009101494231634e-05, + "loss": 0.0341, + "num_input_tokens_seen": 79631968, + "step": 65430 + }, + { + "epoch": 7.287559861899989, + "grad_norm": 0.7191215753555298, + "learning_rate": 4.008907774664121e-05, + "loss": 0.0243, + "num_input_tokens_seen": 79638048, + "step": 65435 + }, + { + "epoch": 7.288116716783606, + "grad_norm": 0.2126908302307129, + "learning_rate": 4.008714040843765e-05, + "loss": 0.0153, + "num_input_tokens_seen": 79644416, + "step": 65440 + }, + { + "epoch": 7.288673571667223, + "grad_norm": 0.7200609445571899, + "learning_rate": 4.008520292772396e-05, + "loss": 0.0222, + "num_input_tokens_seen": 79650560, + "step": 65445 + }, + { + "epoch": 7.289230426550841, + "grad_norm": 0.013609109446406364, + "learning_rate": 4.008326530451845e-05, + "loss": 0.0364, + "num_input_tokens_seen": 79656416, + "step": 65450 + }, + { + "epoch": 7.289787281434458, + "grad_norm": 1.230163335800171, + "learning_rate": 4.0081327538839405e-05, + "loss": 0.1275, + "num_input_tokens_seen": 79661856, + "step": 65455 + }, + { + "epoch": 7.290344136318075, + "grad_norm": 0.053031954914331436, + "learning_rate": 4.007938963070515e-05, + "loss": 0.1226, + "num_input_tokens_seen": 79667808, + "step": 65460 + }, + { + "epoch": 7.290900991201693, + "grad_norm": 0.07697580009698868, + "learning_rate": 4.0077451580133966e-05, + "loss": 0.0331, + "num_input_tokens_seen": 79673760, + "step": 65465 + }, + { + "epoch": 7.291457846085311, + "grad_norm": 1.7449673414230347, + "learning_rate": 4.007551338714418e-05, + "loss": 0.0673, + "num_input_tokens_seen": 79679872, + "step": 65470 + }, + { + "epoch": 7.292014700968927, + "grad_norm": 1.4361342191696167, + "learning_rate": 4.007357505175409e-05, + "loss": 0.0945, + "num_input_tokens_seen": 79685568, + "step": 65475 + }, + { + "epoch": 7.292571555852545, + "grad_norm": 0.7250288724899292, + "learning_rate": 4.0071636573982007e-05, + "loss": 0.022, + "num_input_tokens_seen": 79691904, + "step": 65480 + }, + { + "epoch": 7.293128410736162, + "grad_norm": 0.09646021574735641, + "learning_rate": 4.006969795384624e-05, + "loss": 0.037, + "num_input_tokens_seen": 79697920, + "step": 65485 + }, + { + "epoch": 7.2936852656197795, + "grad_norm": 0.6007694005966187, + "learning_rate": 4.006775919136511e-05, + "loss": 0.0812, + "num_input_tokens_seen": 79703808, + "step": 65490 + }, + { + "epoch": 7.294242120503397, + "grad_norm": 0.04173703119158745, + "learning_rate": 4.006582028655691e-05, + "loss": 0.0682, + "num_input_tokens_seen": 79709824, + "step": 65495 + }, + { + "epoch": 7.294798975387014, + "grad_norm": 0.4181157946586609, + "learning_rate": 4.0063881239439974e-05, + "loss": 0.0502, + "num_input_tokens_seen": 79715904, + "step": 65500 + }, + { + "epoch": 7.295355830270632, + "grad_norm": 1.2302230596542358, + "learning_rate": 4.006194205003261e-05, + "loss": 0.0952, + "num_input_tokens_seen": 79722080, + "step": 65505 + }, + { + "epoch": 7.295912685154248, + "grad_norm": 0.5705081820487976, + "learning_rate": 4.006000271835313e-05, + "loss": 0.0185, + "num_input_tokens_seen": 79728128, + "step": 65510 + }, + { + "epoch": 7.296469540037866, + "grad_norm": 0.9402396082878113, + "learning_rate": 4.005806324441986e-05, + "loss": 0.0278, + "num_input_tokens_seen": 79734208, + "step": 65515 + }, + { + "epoch": 7.297026394921484, + "grad_norm": 0.48846131563186646, + "learning_rate": 4.005612362825113e-05, + "loss": 0.1042, + "num_input_tokens_seen": 79739712, + "step": 65520 + }, + { + "epoch": 7.2975832498051005, + "grad_norm": 0.019513601437211037, + "learning_rate": 4.0054183869865236e-05, + "loss": 0.0609, + "num_input_tokens_seen": 79746144, + "step": 65525 + }, + { + "epoch": 7.298140104688718, + "grad_norm": 0.017157824710011482, + "learning_rate": 4.005224396928052e-05, + "loss": 0.025, + "num_input_tokens_seen": 79752320, + "step": 65530 + }, + { + "epoch": 7.298696959572336, + "grad_norm": 0.0031735992524772882, + "learning_rate": 4.00503039265153e-05, + "loss": 0.0732, + "num_input_tokens_seen": 79758624, + "step": 65535 + }, + { + "epoch": 7.299253814455953, + "grad_norm": 0.7414277195930481, + "learning_rate": 4.0048363741587896e-05, + "loss": 0.0929, + "num_input_tokens_seen": 79764256, + "step": 65540 + }, + { + "epoch": 7.29981066933957, + "grad_norm": 0.9393923282623291, + "learning_rate": 4.004642341451664e-05, + "loss": 0.084, + "num_input_tokens_seen": 79769952, + "step": 65545 + }, + { + "epoch": 7.300367524223187, + "grad_norm": 0.2187049835920334, + "learning_rate": 4.0044482945319876e-05, + "loss": 0.062, + "num_input_tokens_seen": 79775968, + "step": 65550 + }, + { + "epoch": 7.300924379106805, + "grad_norm": 0.6531596183776855, + "learning_rate": 4.00425423340159e-05, + "loss": 0.0284, + "num_input_tokens_seen": 79782144, + "step": 65555 + }, + { + "epoch": 7.3014812339904225, + "grad_norm": 0.012762763537466526, + "learning_rate": 4.004060158062306e-05, + "loss": 0.0281, + "num_input_tokens_seen": 79788384, + "step": 65560 + }, + { + "epoch": 7.302038088874039, + "grad_norm": 0.016114616766572, + "learning_rate": 4.0038660685159703e-05, + "loss": 0.0616, + "num_input_tokens_seen": 79794688, + "step": 65565 + }, + { + "epoch": 7.302594943757657, + "grad_norm": 0.24996554851531982, + "learning_rate": 4.003671964764413e-05, + "loss": 0.0644, + "num_input_tokens_seen": 79800640, + "step": 65570 + }, + { + "epoch": 7.303151798641274, + "grad_norm": 0.2757660150527954, + "learning_rate": 4.0034778468094704e-05, + "loss": 0.0533, + "num_input_tokens_seen": 79806848, + "step": 65575 + }, + { + "epoch": 7.303708653524891, + "grad_norm": 0.7536302804946899, + "learning_rate": 4.003283714652974e-05, + "loss": 0.0423, + "num_input_tokens_seen": 79812832, + "step": 65580 + }, + { + "epoch": 7.304265508408509, + "grad_norm": 0.5721310377120972, + "learning_rate": 4.0030895682967595e-05, + "loss": 0.0274, + "num_input_tokens_seen": 79819136, + "step": 65585 + }, + { + "epoch": 7.304822363292126, + "grad_norm": 0.2593660354614258, + "learning_rate": 4.00289540774266e-05, + "loss": 0.0375, + "num_input_tokens_seen": 79825344, + "step": 65590 + }, + { + "epoch": 7.3053792181757435, + "grad_norm": 2.527134656906128, + "learning_rate": 4.0027012329925073e-05, + "loss": 0.1074, + "num_input_tokens_seen": 79831264, + "step": 65595 + }, + { + "epoch": 7.30593607305936, + "grad_norm": 0.8727133274078369, + "learning_rate": 4.0025070440481394e-05, + "loss": 0.04, + "num_input_tokens_seen": 79837696, + "step": 65600 + }, + { + "epoch": 7.306492927942978, + "grad_norm": 0.4602997899055481, + "learning_rate": 4.0023128409113874e-05, + "loss": 0.1284, + "num_input_tokens_seen": 79844032, + "step": 65605 + }, + { + "epoch": 7.307049782826596, + "grad_norm": 0.0020260890014469624, + "learning_rate": 4.002118623584088e-05, + "loss": 0.0978, + "num_input_tokens_seen": 79850048, + "step": 65610 + }, + { + "epoch": 7.3076066377102125, + "grad_norm": 0.044541582465171814, + "learning_rate": 4.001924392068075e-05, + "loss": 0.0614, + "num_input_tokens_seen": 79855744, + "step": 65615 + }, + { + "epoch": 7.30816349259383, + "grad_norm": 0.030370591208338737, + "learning_rate": 4.001730146365182e-05, + "loss": 0.0357, + "num_input_tokens_seen": 79861760, + "step": 65620 + }, + { + "epoch": 7.308720347477447, + "grad_norm": 0.0011402372037991881, + "learning_rate": 4.001535886477245e-05, + "loss": 0.0205, + "num_input_tokens_seen": 79867872, + "step": 65625 + }, + { + "epoch": 7.309277202361065, + "grad_norm": 0.26234328746795654, + "learning_rate": 4.0013416124060975e-05, + "loss": 0.0335, + "num_input_tokens_seen": 79873760, + "step": 65630 + }, + { + "epoch": 7.309834057244682, + "grad_norm": 0.0033846660517156124, + "learning_rate": 4.001147324153577e-05, + "loss": 0.0963, + "num_input_tokens_seen": 79879680, + "step": 65635 + }, + { + "epoch": 7.310390912128299, + "grad_norm": 0.10739388316869736, + "learning_rate": 4.000953021721516e-05, + "loss": 0.0214, + "num_input_tokens_seen": 79886240, + "step": 65640 + }, + { + "epoch": 7.310947767011917, + "grad_norm": 0.0023012007586658, + "learning_rate": 4.000758705111752e-05, + "loss": 0.1071, + "num_input_tokens_seen": 79892288, + "step": 65645 + }, + { + "epoch": 7.311504621895534, + "grad_norm": 0.9196615219116211, + "learning_rate": 4.00056437432612e-05, + "loss": 0.0928, + "num_input_tokens_seen": 79898496, + "step": 65650 + }, + { + "epoch": 7.312061476779151, + "grad_norm": 0.65509033203125, + "learning_rate": 4.0003700293664545e-05, + "loss": 0.0373, + "num_input_tokens_seen": 79904544, + "step": 65655 + }, + { + "epoch": 7.312618331662769, + "grad_norm": 0.6975078582763672, + "learning_rate": 4.000175670234593e-05, + "loss": 0.0912, + "num_input_tokens_seen": 79910112, + "step": 65660 + }, + { + "epoch": 7.313175186546386, + "grad_norm": 1.3358207941055298, + "learning_rate": 3.999981296932369e-05, + "loss": 0.1519, + "num_input_tokens_seen": 79916416, + "step": 65665 + }, + { + "epoch": 7.313732041430003, + "grad_norm": 0.12204064428806305, + "learning_rate": 3.999786909461621e-05, + "loss": 0.225, + "num_input_tokens_seen": 79922624, + "step": 65670 + }, + { + "epoch": 7.314288896313621, + "grad_norm": 0.1863493174314499, + "learning_rate": 3.999592507824184e-05, + "loss": 0.107, + "num_input_tokens_seen": 79929184, + "step": 65675 + }, + { + "epoch": 7.314845751197238, + "grad_norm": 1.04421865940094, + "learning_rate": 3.9993980920218934e-05, + "loss": 0.0213, + "num_input_tokens_seen": 79935488, + "step": 65680 + }, + { + "epoch": 7.3154026060808555, + "grad_norm": 1.9051450490951538, + "learning_rate": 3.999203662056587e-05, + "loss": 0.1132, + "num_input_tokens_seen": 79941408, + "step": 65685 + }, + { + "epoch": 7.315959460964472, + "grad_norm": 2.6968765258789062, + "learning_rate": 3.999009217930101e-05, + "loss": 0.1014, + "num_input_tokens_seen": 79947776, + "step": 65690 + }, + { + "epoch": 7.31651631584809, + "grad_norm": 0.952848494052887, + "learning_rate": 3.998814759644273e-05, + "loss": 0.0518, + "num_input_tokens_seen": 79954048, + "step": 65695 + }, + { + "epoch": 7.317073170731708, + "grad_norm": 0.4883688986301422, + "learning_rate": 3.998620287200937e-05, + "loss": 0.0322, + "num_input_tokens_seen": 79960192, + "step": 65700 + }, + { + "epoch": 7.317630025615324, + "grad_norm": 0.3948403298854828, + "learning_rate": 3.998425800601933e-05, + "loss": 0.0666, + "num_input_tokens_seen": 79966304, + "step": 65705 + }, + { + "epoch": 7.318186880498942, + "grad_norm": 1.2886171340942383, + "learning_rate": 3.9982312998490954e-05, + "loss": 0.0944, + "num_input_tokens_seen": 79972416, + "step": 65710 + }, + { + "epoch": 7.31874373538256, + "grad_norm": 0.08083750307559967, + "learning_rate": 3.998036784944264e-05, + "loss": 0.0801, + "num_input_tokens_seen": 79978272, + "step": 65715 + }, + { + "epoch": 7.3193005902661765, + "grad_norm": 1.667275309562683, + "learning_rate": 3.997842255889274e-05, + "loss": 0.1282, + "num_input_tokens_seen": 79983968, + "step": 65720 + }, + { + "epoch": 7.319857445149794, + "grad_norm": 0.08637534081935883, + "learning_rate": 3.9976477126859646e-05, + "loss": 0.0567, + "num_input_tokens_seen": 79990272, + "step": 65725 + }, + { + "epoch": 7.320414300033411, + "grad_norm": 0.24376702308654785, + "learning_rate": 3.9974531553361725e-05, + "loss": 0.0484, + "num_input_tokens_seen": 79996320, + "step": 65730 + }, + { + "epoch": 7.320971154917029, + "grad_norm": 0.0024105971679091454, + "learning_rate": 3.997258583841735e-05, + "loss": 0.0334, + "num_input_tokens_seen": 80002400, + "step": 65735 + }, + { + "epoch": 7.321528009800646, + "grad_norm": 0.00038755356217734516, + "learning_rate": 3.997063998204491e-05, + "loss": 0.0851, + "num_input_tokens_seen": 80008416, + "step": 65740 + }, + { + "epoch": 7.322084864684263, + "grad_norm": 0.4397929906845093, + "learning_rate": 3.9968693984262784e-05, + "loss": 0.0254, + "num_input_tokens_seen": 80014240, + "step": 65745 + }, + { + "epoch": 7.322641719567881, + "grad_norm": 0.5872089862823486, + "learning_rate": 3.9966747845089345e-05, + "loss": 0.0576, + "num_input_tokens_seen": 80020160, + "step": 65750 + }, + { + "epoch": 7.323198574451498, + "grad_norm": 0.005277604330331087, + "learning_rate": 3.9964801564542984e-05, + "loss": 0.043, + "num_input_tokens_seen": 80026368, + "step": 65755 + }, + { + "epoch": 7.323755429335115, + "grad_norm": 0.35830119252204895, + "learning_rate": 3.996285514264208e-05, + "loss": 0.0622, + "num_input_tokens_seen": 80032352, + "step": 65760 + }, + { + "epoch": 7.324312284218733, + "grad_norm": 0.17122907936573029, + "learning_rate": 3.9960908579405035e-05, + "loss": 0.1417, + "num_input_tokens_seen": 80038208, + "step": 65765 + }, + { + "epoch": 7.32486913910235, + "grad_norm": 0.5037193298339844, + "learning_rate": 3.995896187485021e-05, + "loss": 0.0131, + "num_input_tokens_seen": 80044480, + "step": 65770 + }, + { + "epoch": 7.325425993985967, + "grad_norm": 0.1071661189198494, + "learning_rate": 3.995701502899601e-05, + "loss": 0.0369, + "num_input_tokens_seen": 80050560, + "step": 65775 + }, + { + "epoch": 7.325982848869584, + "grad_norm": 0.08984597772359848, + "learning_rate": 3.9955068041860814e-05, + "loss": 0.0342, + "num_input_tokens_seen": 80056896, + "step": 65780 + }, + { + "epoch": 7.326539703753202, + "grad_norm": 0.00785953737795353, + "learning_rate": 3.995312091346302e-05, + "loss": 0.0669, + "num_input_tokens_seen": 80062880, + "step": 65785 + }, + { + "epoch": 7.3270965586368195, + "grad_norm": 0.5714042782783508, + "learning_rate": 3.995117364382102e-05, + "loss": 0.0567, + "num_input_tokens_seen": 80068704, + "step": 65790 + }, + { + "epoch": 7.327653413520436, + "grad_norm": 0.5156998634338379, + "learning_rate": 3.994922623295321e-05, + "loss": 0.0664, + "num_input_tokens_seen": 80074304, + "step": 65795 + }, + { + "epoch": 7.328210268404054, + "grad_norm": 0.7652722001075745, + "learning_rate": 3.994727868087798e-05, + "loss": 0.1476, + "num_input_tokens_seen": 80080480, + "step": 65800 + }, + { + "epoch": 7.328767123287671, + "grad_norm": 0.8426429629325867, + "learning_rate": 3.994533098761372e-05, + "loss": 0.0307, + "num_input_tokens_seen": 80086816, + "step": 65805 + }, + { + "epoch": 7.3293239781712884, + "grad_norm": 0.6376728415489197, + "learning_rate": 3.9943383153178835e-05, + "loss": 0.0666, + "num_input_tokens_seen": 80093024, + "step": 65810 + }, + { + "epoch": 7.329880833054906, + "grad_norm": 1.2136881351470947, + "learning_rate": 3.994143517759173e-05, + "loss": 0.1038, + "num_input_tokens_seen": 80098816, + "step": 65815 + }, + { + "epoch": 7.330437687938523, + "grad_norm": 2.540419340133667, + "learning_rate": 3.99394870608708e-05, + "loss": 0.0496, + "num_input_tokens_seen": 80104928, + "step": 65820 + }, + { + "epoch": 7.330994542822141, + "grad_norm": 0.15386253595352173, + "learning_rate": 3.993753880303445e-05, + "loss": 0.0089, + "num_input_tokens_seen": 80111168, + "step": 65825 + }, + { + "epoch": 7.331551397705758, + "grad_norm": 0.017466505989432335, + "learning_rate": 3.9935590404101066e-05, + "loss": 0.1443, + "num_input_tokens_seen": 80117344, + "step": 65830 + }, + { + "epoch": 7.332108252589375, + "grad_norm": 0.058067843317985535, + "learning_rate": 3.9933641864089066e-05, + "loss": 0.0858, + "num_input_tokens_seen": 80123776, + "step": 65835 + }, + { + "epoch": 7.332665107472993, + "grad_norm": 0.0017459711525589228, + "learning_rate": 3.993169318301686e-05, + "loss": 0.0537, + "num_input_tokens_seen": 80130272, + "step": 65840 + }, + { + "epoch": 7.3332219623566095, + "grad_norm": 0.008821714669466019, + "learning_rate": 3.992974436090284e-05, + "loss": 0.0037, + "num_input_tokens_seen": 80136416, + "step": 65845 + }, + { + "epoch": 7.333778817240227, + "grad_norm": 0.9188785552978516, + "learning_rate": 3.992779539776543e-05, + "loss": 0.0375, + "num_input_tokens_seen": 80142528, + "step": 65850 + }, + { + "epoch": 7.334335672123845, + "grad_norm": 0.0010335699189454317, + "learning_rate": 3.992584629362304e-05, + "loss": 0.0605, + "num_input_tokens_seen": 80148544, + "step": 65855 + }, + { + "epoch": 7.334892527007462, + "grad_norm": 0.0034567895345389843, + "learning_rate": 3.9923897048494063e-05, + "loss": 0.0203, + "num_input_tokens_seen": 80155168, + "step": 65860 + }, + { + "epoch": 7.335449381891079, + "grad_norm": 0.20542241632938385, + "learning_rate": 3.992194766239692e-05, + "loss": 0.1398, + "num_input_tokens_seen": 80161376, + "step": 65865 + }, + { + "epoch": 7.336006236774696, + "grad_norm": 1.1925486326217651, + "learning_rate": 3.991999813535003e-05, + "loss": 0.0606, + "num_input_tokens_seen": 80167264, + "step": 65870 + }, + { + "epoch": 7.336563091658314, + "grad_norm": 1.4832518100738525, + "learning_rate": 3.9918048467371805e-05, + "loss": 0.028, + "num_input_tokens_seen": 80173440, + "step": 65875 + }, + { + "epoch": 7.3371199465419314, + "grad_norm": 0.09056583791971207, + "learning_rate": 3.991609865848066e-05, + "loss": 0.0434, + "num_input_tokens_seen": 80179552, + "step": 65880 + }, + { + "epoch": 7.337676801425548, + "grad_norm": 0.28435492515563965, + "learning_rate": 3.991414870869501e-05, + "loss": 0.0245, + "num_input_tokens_seen": 80184288, + "step": 65885 + }, + { + "epoch": 7.338233656309166, + "grad_norm": 0.8026185035705566, + "learning_rate": 3.9912198618033275e-05, + "loss": 0.1122, + "num_input_tokens_seen": 80190432, + "step": 65890 + }, + { + "epoch": 7.338790511192784, + "grad_norm": 0.02291773073375225, + "learning_rate": 3.991024838651388e-05, + "loss": 0.0164, + "num_input_tokens_seen": 80196704, + "step": 65895 + }, + { + "epoch": 7.3393473660764, + "grad_norm": 0.14396356046199799, + "learning_rate": 3.990829801415524e-05, + "loss": 0.0374, + "num_input_tokens_seen": 80202656, + "step": 65900 + }, + { + "epoch": 7.339904220960018, + "grad_norm": 0.0031361524015665054, + "learning_rate": 3.990634750097578e-05, + "loss": 0.0357, + "num_input_tokens_seen": 80208992, + "step": 65905 + }, + { + "epoch": 7.340461075843635, + "grad_norm": 0.14805592596530914, + "learning_rate": 3.990439684699393e-05, + "loss": 0.0082, + "num_input_tokens_seen": 80215200, + "step": 65910 + }, + { + "epoch": 7.3410179307272525, + "grad_norm": 0.008143451064825058, + "learning_rate": 3.990244605222812e-05, + "loss": 0.0316, + "num_input_tokens_seen": 80221408, + "step": 65915 + }, + { + "epoch": 7.34157478561087, + "grad_norm": 0.0003061281458940357, + "learning_rate": 3.990049511669675e-05, + "loss": 0.0696, + "num_input_tokens_seen": 80227744, + "step": 65920 + }, + { + "epoch": 7.342131640494487, + "grad_norm": 0.4252461791038513, + "learning_rate": 3.9898544040418276e-05, + "loss": 0.1111, + "num_input_tokens_seen": 80234048, + "step": 65925 + }, + { + "epoch": 7.342688495378105, + "grad_norm": 0.3307487368583679, + "learning_rate": 3.989659282341111e-05, + "loss": 0.1007, + "num_input_tokens_seen": 80240352, + "step": 65930 + }, + { + "epoch": 7.343245350261721, + "grad_norm": 0.010511374101042747, + "learning_rate": 3.989464146569369e-05, + "loss": 0.0368, + "num_input_tokens_seen": 80246656, + "step": 65935 + }, + { + "epoch": 7.343802205145339, + "grad_norm": 0.5784333944320679, + "learning_rate": 3.989268996728445e-05, + "loss": 0.0236, + "num_input_tokens_seen": 80252736, + "step": 65940 + }, + { + "epoch": 7.344359060028957, + "grad_norm": 0.6674964427947998, + "learning_rate": 3.989073832820182e-05, + "loss": 0.0366, + "num_input_tokens_seen": 80259104, + "step": 65945 + }, + { + "epoch": 7.344915914912574, + "grad_norm": 1.2072497606277466, + "learning_rate": 3.988878654846424e-05, + "loss": 0.1354, + "num_input_tokens_seen": 80265184, + "step": 65950 + }, + { + "epoch": 7.345472769796191, + "grad_norm": 0.5703534483909607, + "learning_rate": 3.988683462809014e-05, + "loss": 0.0427, + "num_input_tokens_seen": 80271520, + "step": 65955 + }, + { + "epoch": 7.346029624679808, + "grad_norm": 0.011067640967667103, + "learning_rate": 3.9884882567097956e-05, + "loss": 0.0358, + "num_input_tokens_seen": 80277760, + "step": 65960 + }, + { + "epoch": 7.346586479563426, + "grad_norm": 0.1813623011112213, + "learning_rate": 3.988293036550614e-05, + "loss": 0.0576, + "num_input_tokens_seen": 80283904, + "step": 65965 + }, + { + "epoch": 7.347143334447043, + "grad_norm": 0.007459428161382675, + "learning_rate": 3.9880978023333115e-05, + "loss": 0.0166, + "num_input_tokens_seen": 80290304, + "step": 65970 + }, + { + "epoch": 7.34770018933066, + "grad_norm": 0.022401582449674606, + "learning_rate": 3.9879025540597336e-05, + "loss": 0.0848, + "num_input_tokens_seen": 80296672, + "step": 65975 + }, + { + "epoch": 7.348257044214278, + "grad_norm": 0.00036551913945004344, + "learning_rate": 3.9877072917317236e-05, + "loss": 0.0234, + "num_input_tokens_seen": 80303136, + "step": 65980 + }, + { + "epoch": 7.348813899097895, + "grad_norm": 0.7859467267990112, + "learning_rate": 3.9875120153511266e-05, + "loss": 0.034, + "num_input_tokens_seen": 80309472, + "step": 65985 + }, + { + "epoch": 7.349370753981512, + "grad_norm": 0.18717016279697418, + "learning_rate": 3.9873167249197865e-05, + "loss": 0.0241, + "num_input_tokens_seen": 80315776, + "step": 65990 + }, + { + "epoch": 7.34992760886513, + "grad_norm": 0.2518281042575836, + "learning_rate": 3.987121420439548e-05, + "loss": 0.0403, + "num_input_tokens_seen": 80321856, + "step": 65995 + }, + { + "epoch": 7.350484463748747, + "grad_norm": 0.16404521465301514, + "learning_rate": 3.986926101912257e-05, + "loss": 0.0213, + "num_input_tokens_seen": 80328128, + "step": 66000 + }, + { + "epoch": 7.351041318632364, + "grad_norm": 0.23123566806316376, + "learning_rate": 3.986730769339757e-05, + "loss": 0.0253, + "num_input_tokens_seen": 80334144, + "step": 66005 + }, + { + "epoch": 7.351598173515982, + "grad_norm": 0.011807971633970737, + "learning_rate": 3.9865354227238937e-05, + "loss": 0.0675, + "num_input_tokens_seen": 80340352, + "step": 66010 + }, + { + "epoch": 7.352155028399599, + "grad_norm": 0.5499065518379211, + "learning_rate": 3.986340062066513e-05, + "loss": 0.0408, + "num_input_tokens_seen": 80346368, + "step": 66015 + }, + { + "epoch": 7.352711883283217, + "grad_norm": 0.538043200969696, + "learning_rate": 3.9861446873694593e-05, + "loss": 0.1893, + "num_input_tokens_seen": 80351744, + "step": 66020 + }, + { + "epoch": 7.353268738166833, + "grad_norm": 0.0007667913450859487, + "learning_rate": 3.985949298634579e-05, + "loss": 0.1024, + "num_input_tokens_seen": 80357792, + "step": 66025 + }, + { + "epoch": 7.353825593050451, + "grad_norm": 0.003899588016793132, + "learning_rate": 3.985753895863716e-05, + "loss": 0.0609, + "num_input_tokens_seen": 80363680, + "step": 66030 + }, + { + "epoch": 7.354382447934069, + "grad_norm": 0.009270300157368183, + "learning_rate": 3.9855584790587174e-05, + "loss": 0.0976, + "num_input_tokens_seen": 80369792, + "step": 66035 + }, + { + "epoch": 7.3549393028176855, + "grad_norm": 0.8718575239181519, + "learning_rate": 3.985363048221429e-05, + "loss": 0.1106, + "num_input_tokens_seen": 80375776, + "step": 66040 + }, + { + "epoch": 7.355496157701303, + "grad_norm": 0.0011398399947211146, + "learning_rate": 3.985167603353696e-05, + "loss": 0.0296, + "num_input_tokens_seen": 80381856, + "step": 66045 + }, + { + "epoch": 7.356053012584921, + "grad_norm": 0.3478131890296936, + "learning_rate": 3.984972144457365e-05, + "loss": 0.1995, + "num_input_tokens_seen": 80387488, + "step": 66050 + }, + { + "epoch": 7.356609867468538, + "grad_norm": 2.2140402793884277, + "learning_rate": 3.984776671534283e-05, + "loss": 0.1285, + "num_input_tokens_seen": 80393664, + "step": 66055 + }, + { + "epoch": 7.357166722352155, + "grad_norm": 0.6520988941192627, + "learning_rate": 3.984581184586296e-05, + "loss": 0.0425, + "num_input_tokens_seen": 80399712, + "step": 66060 + }, + { + "epoch": 7.357723577235772, + "grad_norm": 1.2196637392044067, + "learning_rate": 3.984385683615249e-05, + "loss": 0.0272, + "num_input_tokens_seen": 80405696, + "step": 66065 + }, + { + "epoch": 7.35828043211939, + "grad_norm": 0.16177092492580414, + "learning_rate": 3.984190168622991e-05, + "loss": 0.0467, + "num_input_tokens_seen": 80411648, + "step": 66070 + }, + { + "epoch": 7.358837287003007, + "grad_norm": 0.10620788484811783, + "learning_rate": 3.983994639611368e-05, + "loss": 0.0198, + "num_input_tokens_seen": 80417760, + "step": 66075 + }, + { + "epoch": 7.359394141886624, + "grad_norm": 0.7260421514511108, + "learning_rate": 3.983799096582226e-05, + "loss": 0.0172, + "num_input_tokens_seen": 80424096, + "step": 66080 + }, + { + "epoch": 7.359950996770242, + "grad_norm": 1.0037846565246582, + "learning_rate": 3.9836035395374134e-05, + "loss": 0.1783, + "num_input_tokens_seen": 80430112, + "step": 66085 + }, + { + "epoch": 7.360507851653859, + "grad_norm": 0.005340749863535166, + "learning_rate": 3.9834079684787765e-05, + "loss": 0.0283, + "num_input_tokens_seen": 80436000, + "step": 66090 + }, + { + "epoch": 7.361064706537476, + "grad_norm": 1.0418092012405396, + "learning_rate": 3.983212383408163e-05, + "loss": 0.0768, + "num_input_tokens_seen": 80442080, + "step": 66095 + }, + { + "epoch": 7.361621561421094, + "grad_norm": 0.0049095614813268185, + "learning_rate": 3.983016784327419e-05, + "loss": 0.037, + "num_input_tokens_seen": 80448256, + "step": 66100 + }, + { + "epoch": 7.362178416304711, + "grad_norm": 0.45238614082336426, + "learning_rate": 3.9828211712383944e-05, + "loss": 0.0499, + "num_input_tokens_seen": 80454496, + "step": 66105 + }, + { + "epoch": 7.3627352711883285, + "grad_norm": 0.41572076082229614, + "learning_rate": 3.9826255441429356e-05, + "loss": 0.0266, + "num_input_tokens_seen": 80460640, + "step": 66110 + }, + { + "epoch": 7.363292126071945, + "grad_norm": 0.010351544246077538, + "learning_rate": 3.98242990304289e-05, + "loss": 0.0265, + "num_input_tokens_seen": 80467072, + "step": 66115 + }, + { + "epoch": 7.363848980955563, + "grad_norm": 1.4772751331329346, + "learning_rate": 3.982234247940107e-05, + "loss": 0.0699, + "num_input_tokens_seen": 80473344, + "step": 66120 + }, + { + "epoch": 7.364405835839181, + "grad_norm": 1.3624260425567627, + "learning_rate": 3.982038578836434e-05, + "loss": 0.1717, + "num_input_tokens_seen": 80479232, + "step": 66125 + }, + { + "epoch": 7.364962690722797, + "grad_norm": 0.14014700055122375, + "learning_rate": 3.98184289573372e-05, + "loss": 0.071, + "num_input_tokens_seen": 80485504, + "step": 66130 + }, + { + "epoch": 7.365519545606415, + "grad_norm": 0.32727181911468506, + "learning_rate": 3.981647198633811e-05, + "loss": 0.0197, + "num_input_tokens_seen": 80491552, + "step": 66135 + }, + { + "epoch": 7.366076400490032, + "grad_norm": 0.7466349005699158, + "learning_rate": 3.981451487538558e-05, + "loss": 0.0502, + "num_input_tokens_seen": 80497216, + "step": 66140 + }, + { + "epoch": 7.3666332553736495, + "grad_norm": 0.013555078767240047, + "learning_rate": 3.981255762449808e-05, + "loss": 0.0358, + "num_input_tokens_seen": 80502880, + "step": 66145 + }, + { + "epoch": 7.367190110257267, + "grad_norm": 0.7981670498847961, + "learning_rate": 3.9810600233694115e-05, + "loss": 0.0908, + "num_input_tokens_seen": 80509216, + "step": 66150 + }, + { + "epoch": 7.367746965140884, + "grad_norm": 0.18954457342624664, + "learning_rate": 3.980864270299216e-05, + "loss": 0.0204, + "num_input_tokens_seen": 80515360, + "step": 66155 + }, + { + "epoch": 7.368303820024502, + "grad_norm": 0.8713178038597107, + "learning_rate": 3.980668503241072e-05, + "loss": 0.0294, + "num_input_tokens_seen": 80521376, + "step": 66160 + }, + { + "epoch": 7.368860674908119, + "grad_norm": 0.02076277881860733, + "learning_rate": 3.9804727221968266e-05, + "loss": 0.1369, + "num_input_tokens_seen": 80527360, + "step": 66165 + }, + { + "epoch": 7.369417529791736, + "grad_norm": 0.04915451630949974, + "learning_rate": 3.9802769271683304e-05, + "loss": 0.087, + "num_input_tokens_seen": 80533472, + "step": 66170 + }, + { + "epoch": 7.369974384675354, + "grad_norm": 0.11898515373468399, + "learning_rate": 3.980081118157433e-05, + "loss": 0.1405, + "num_input_tokens_seen": 80539424, + "step": 66175 + }, + { + "epoch": 7.370531239558971, + "grad_norm": 1.4951074123382568, + "learning_rate": 3.9798852951659824e-05, + "loss": 0.1175, + "num_input_tokens_seen": 80545600, + "step": 66180 + }, + { + "epoch": 7.371088094442588, + "grad_norm": 0.10518935322761536, + "learning_rate": 3.979689458195831e-05, + "loss": 0.0722, + "num_input_tokens_seen": 80550848, + "step": 66185 + }, + { + "epoch": 7.371644949326206, + "grad_norm": 0.07163078337907791, + "learning_rate": 3.9794936072488266e-05, + "loss": 0.0663, + "num_input_tokens_seen": 80557088, + "step": 66190 + }, + { + "epoch": 7.372201804209823, + "grad_norm": 1.4248299598693848, + "learning_rate": 3.97929774232682e-05, + "loss": 0.0493, + "num_input_tokens_seen": 80562912, + "step": 66195 + }, + { + "epoch": 7.37275865909344, + "grad_norm": 0.8790651559829712, + "learning_rate": 3.979101863431661e-05, + "loss": 0.044, + "num_input_tokens_seen": 80568672, + "step": 66200 + }, + { + "epoch": 7.373315513977057, + "grad_norm": 0.6453179717063904, + "learning_rate": 3.978905970565199e-05, + "loss": 0.0262, + "num_input_tokens_seen": 80574688, + "step": 66205 + }, + { + "epoch": 7.373872368860675, + "grad_norm": 0.05251554772257805, + "learning_rate": 3.978710063729286e-05, + "loss": 0.0923, + "num_input_tokens_seen": 80580096, + "step": 66210 + }, + { + "epoch": 7.3744292237442925, + "grad_norm": 2.1993119716644287, + "learning_rate": 3.9785141429257716e-05, + "loss": 0.0377, + "num_input_tokens_seen": 80586336, + "step": 66215 + }, + { + "epoch": 7.374986078627909, + "grad_norm": 0.8345280885696411, + "learning_rate": 3.978318208156507e-05, + "loss": 0.0982, + "num_input_tokens_seen": 80592192, + "step": 66220 + }, + { + "epoch": 7.375542933511527, + "grad_norm": 0.004577448591589928, + "learning_rate": 3.978122259423342e-05, + "loss": 0.0792, + "num_input_tokens_seen": 80598144, + "step": 66225 + }, + { + "epoch": 7.376099788395145, + "grad_norm": 0.1351112276315689, + "learning_rate": 3.9779262967281285e-05, + "loss": 0.0204, + "num_input_tokens_seen": 80604288, + "step": 66230 + }, + { + "epoch": 7.3766566432787615, + "grad_norm": 0.01511896401643753, + "learning_rate": 3.977730320072716e-05, + "loss": 0.0295, + "num_input_tokens_seen": 80610528, + "step": 66235 + }, + { + "epoch": 7.377213498162379, + "grad_norm": 0.0002515378873795271, + "learning_rate": 3.977534329458957e-05, + "loss": 0.0046, + "num_input_tokens_seen": 80616800, + "step": 66240 + }, + { + "epoch": 7.377770353045996, + "grad_norm": 0.43830132484436035, + "learning_rate": 3.977338324888703e-05, + "loss": 0.0142, + "num_input_tokens_seen": 80623072, + "step": 66245 + }, + { + "epoch": 7.378327207929614, + "grad_norm": 0.0002774496388155967, + "learning_rate": 3.9771423063638046e-05, + "loss": 0.0053, + "num_input_tokens_seen": 80629376, + "step": 66250 + }, + { + "epoch": 7.378884062813231, + "grad_norm": 0.8246629238128662, + "learning_rate": 3.976946273886114e-05, + "loss": 0.0786, + "num_input_tokens_seen": 80635872, + "step": 66255 + }, + { + "epoch": 7.379440917696848, + "grad_norm": 0.098549023270607, + "learning_rate": 3.976750227457482e-05, + "loss": 0.0049, + "num_input_tokens_seen": 80641760, + "step": 66260 + }, + { + "epoch": 7.379997772580466, + "grad_norm": 1.155738353729248, + "learning_rate": 3.9765541670797615e-05, + "loss": 0.0578, + "num_input_tokens_seen": 80647680, + "step": 66265 + }, + { + "epoch": 7.3805546274640825, + "grad_norm": 0.2770503759384155, + "learning_rate": 3.976358092754804e-05, + "loss": 0.0168, + "num_input_tokens_seen": 80653920, + "step": 66270 + }, + { + "epoch": 7.3811114823477, + "grad_norm": 1.5325466394424438, + "learning_rate": 3.9761620044844605e-05, + "loss": 0.1659, + "num_input_tokens_seen": 80660256, + "step": 66275 + }, + { + "epoch": 7.381668337231318, + "grad_norm": 1.270939826965332, + "learning_rate": 3.975965902270585e-05, + "loss": 0.1336, + "num_input_tokens_seen": 80666400, + "step": 66280 + }, + { + "epoch": 7.382225192114935, + "grad_norm": 0.3435838520526886, + "learning_rate": 3.975769786115029e-05, + "loss": 0.0476, + "num_input_tokens_seen": 80672064, + "step": 66285 + }, + { + "epoch": 7.382782046998552, + "grad_norm": 0.028971031308174133, + "learning_rate": 3.975573656019644e-05, + "loss": 0.0017, + "num_input_tokens_seen": 80678112, + "step": 66290 + }, + { + "epoch": 7.383338901882169, + "grad_norm": 0.07656636834144592, + "learning_rate": 3.975377511986284e-05, + "loss": 0.0448, + "num_input_tokens_seen": 80683904, + "step": 66295 + }, + { + "epoch": 7.383895756765787, + "grad_norm": 0.0874110534787178, + "learning_rate": 3.9751813540168025e-05, + "loss": 0.0296, + "num_input_tokens_seen": 80690240, + "step": 66300 + }, + { + "epoch": 7.3844526116494045, + "grad_norm": 0.1554102897644043, + "learning_rate": 3.97498518211305e-05, + "loss": 0.0476, + "num_input_tokens_seen": 80696416, + "step": 66305 + }, + { + "epoch": 7.385009466533021, + "grad_norm": 0.1304711550474167, + "learning_rate": 3.9747889962768815e-05, + "loss": 0.0604, + "num_input_tokens_seen": 80702720, + "step": 66310 + }, + { + "epoch": 7.385566321416639, + "grad_norm": 1.8840912580490112, + "learning_rate": 3.9745927965101495e-05, + "loss": 0.1033, + "num_input_tokens_seen": 80708256, + "step": 66315 + }, + { + "epoch": 7.386123176300256, + "grad_norm": 0.0004680668644141406, + "learning_rate": 3.974396582814707e-05, + "loss": 0.0988, + "num_input_tokens_seen": 80714400, + "step": 66320 + }, + { + "epoch": 7.386680031183873, + "grad_norm": 0.07145116478204727, + "learning_rate": 3.974200355192407e-05, + "loss": 0.061, + "num_input_tokens_seen": 80720320, + "step": 66325 + }, + { + "epoch": 7.387236886067491, + "grad_norm": 0.5465183258056641, + "learning_rate": 3.974004113645103e-05, + "loss": 0.1621, + "num_input_tokens_seen": 80726400, + "step": 66330 + }, + { + "epoch": 7.387793740951108, + "grad_norm": 0.0016206548316404223, + "learning_rate": 3.97380785817465e-05, + "loss": 0.0172, + "num_input_tokens_seen": 80732256, + "step": 66335 + }, + { + "epoch": 7.3883505958347255, + "grad_norm": 0.06704425066709518, + "learning_rate": 3.9736115887829e-05, + "loss": 0.0544, + "num_input_tokens_seen": 80738592, + "step": 66340 + }, + { + "epoch": 7.388907450718343, + "grad_norm": 0.04424400255084038, + "learning_rate": 3.9734153054717096e-05, + "loss": 0.0279, + "num_input_tokens_seen": 80744096, + "step": 66345 + }, + { + "epoch": 7.38946430560196, + "grad_norm": 0.3620156943798065, + "learning_rate": 3.973219008242931e-05, + "loss": 0.0968, + "num_input_tokens_seen": 80749856, + "step": 66350 + }, + { + "epoch": 7.390021160485578, + "grad_norm": 1.1434051990509033, + "learning_rate": 3.973022697098418e-05, + "loss": 0.1134, + "num_input_tokens_seen": 80756000, + "step": 66355 + }, + { + "epoch": 7.3905780153691945, + "grad_norm": 0.44977349042892456, + "learning_rate": 3.972826372040025e-05, + "loss": 0.0439, + "num_input_tokens_seen": 80761856, + "step": 66360 + }, + { + "epoch": 7.391134870252812, + "grad_norm": 0.2824307084083557, + "learning_rate": 3.972630033069607e-05, + "loss": 0.1434, + "num_input_tokens_seen": 80767776, + "step": 66365 + }, + { + "epoch": 7.39169172513643, + "grad_norm": 0.38504329323768616, + "learning_rate": 3.9724336801890184e-05, + "loss": 0.1036, + "num_input_tokens_seen": 80773760, + "step": 66370 + }, + { + "epoch": 7.392248580020047, + "grad_norm": 0.11110154539346695, + "learning_rate": 3.9722373134001145e-05, + "loss": 0.1513, + "num_input_tokens_seen": 80779904, + "step": 66375 + }, + { + "epoch": 7.392805434903664, + "grad_norm": 0.011593202129006386, + "learning_rate": 3.9720409327047496e-05, + "loss": 0.0077, + "num_input_tokens_seen": 80786144, + "step": 66380 + }, + { + "epoch": 7.393362289787281, + "grad_norm": 0.9210994839668274, + "learning_rate": 3.9718445381047785e-05, + "loss": 0.0626, + "num_input_tokens_seen": 80792320, + "step": 66385 + }, + { + "epoch": 7.393919144670899, + "grad_norm": 0.20209160447120667, + "learning_rate": 3.9716481296020566e-05, + "loss": 0.0464, + "num_input_tokens_seen": 80798528, + "step": 66390 + }, + { + "epoch": 7.394475999554516, + "grad_norm": 0.03321784734725952, + "learning_rate": 3.971451707198439e-05, + "loss": 0.089, + "num_input_tokens_seen": 80804672, + "step": 66395 + }, + { + "epoch": 7.395032854438133, + "grad_norm": 0.5961217284202576, + "learning_rate": 3.9712552708957814e-05, + "loss": 0.1113, + "num_input_tokens_seen": 80810784, + "step": 66400 + }, + { + "epoch": 7.395589709321751, + "grad_norm": 0.004295865539461374, + "learning_rate": 3.971058820695939e-05, + "loss": 0.0161, + "num_input_tokens_seen": 80816768, + "step": 66405 + }, + { + "epoch": 7.3961465642053685, + "grad_norm": 0.19011647999286652, + "learning_rate": 3.970862356600767e-05, + "loss": 0.0255, + "num_input_tokens_seen": 80822816, + "step": 66410 + }, + { + "epoch": 7.396703419088985, + "grad_norm": 0.0006436922703869641, + "learning_rate": 3.970665878612122e-05, + "loss": 0.0478, + "num_input_tokens_seen": 80828800, + "step": 66415 + }, + { + "epoch": 7.397260273972603, + "grad_norm": 0.0989752858877182, + "learning_rate": 3.970469386731859e-05, + "loss": 0.2927, + "num_input_tokens_seen": 80834912, + "step": 66420 + }, + { + "epoch": 7.39781712885622, + "grad_norm": 2.3732056617736816, + "learning_rate": 3.970272880961835e-05, + "loss": 0.0808, + "num_input_tokens_seen": 80841088, + "step": 66425 + }, + { + "epoch": 7.3983739837398375, + "grad_norm": 0.016022970899939537, + "learning_rate": 3.9700763613039055e-05, + "loss": 0.0535, + "num_input_tokens_seen": 80847168, + "step": 66430 + }, + { + "epoch": 7.398930838623455, + "grad_norm": 0.4769655168056488, + "learning_rate": 3.969879827759927e-05, + "loss": 0.0451, + "num_input_tokens_seen": 80853536, + "step": 66435 + }, + { + "epoch": 7.399487693507072, + "grad_norm": 0.0003307574079371989, + "learning_rate": 3.969683280331756e-05, + "loss": 0.1215, + "num_input_tokens_seen": 80859392, + "step": 66440 + }, + { + "epoch": 7.40004454839069, + "grad_norm": 0.26193106174468994, + "learning_rate": 3.9694867190212485e-05, + "loss": 0.0596, + "num_input_tokens_seen": 80865664, + "step": 66445 + }, + { + "epoch": 7.400601403274306, + "grad_norm": 0.0033779130317270756, + "learning_rate": 3.969290143830262e-05, + "loss": 0.0612, + "num_input_tokens_seen": 80871744, + "step": 66450 + }, + { + "epoch": 7.401158258157924, + "grad_norm": 0.022618243470788002, + "learning_rate": 3.969093554760653e-05, + "loss": 0.0866, + "num_input_tokens_seen": 80877984, + "step": 66455 + }, + { + "epoch": 7.401715113041542, + "grad_norm": 0.5728687047958374, + "learning_rate": 3.968896951814278e-05, + "loss": 0.0177, + "num_input_tokens_seen": 80884064, + "step": 66460 + }, + { + "epoch": 7.4022719679251585, + "grad_norm": 0.8880051374435425, + "learning_rate": 3.968700334992995e-05, + "loss": 0.1083, + "num_input_tokens_seen": 80890208, + "step": 66465 + }, + { + "epoch": 7.402828822808776, + "grad_norm": 2.2709033489227295, + "learning_rate": 3.9685037042986595e-05, + "loss": 0.1175, + "num_input_tokens_seen": 80896384, + "step": 66470 + }, + { + "epoch": 7.403385677692393, + "grad_norm": 0.0010924565140157938, + "learning_rate": 3.9683070597331305e-05, + "loss": 0.0719, + "num_input_tokens_seen": 80902720, + "step": 66475 + }, + { + "epoch": 7.403942532576011, + "grad_norm": 1.5469834804534912, + "learning_rate": 3.9681104012982656e-05, + "loss": 0.0821, + "num_input_tokens_seen": 80908896, + "step": 66480 + }, + { + "epoch": 7.404499387459628, + "grad_norm": 0.026774827390909195, + "learning_rate": 3.967913728995921e-05, + "loss": 0.0359, + "num_input_tokens_seen": 80914752, + "step": 66485 + }, + { + "epoch": 7.405056242343245, + "grad_norm": 0.6853423118591309, + "learning_rate": 3.967717042827956e-05, + "loss": 0.1708, + "num_input_tokens_seen": 80920832, + "step": 66490 + }, + { + "epoch": 7.405613097226863, + "grad_norm": 0.5154860615730286, + "learning_rate": 3.967520342796227e-05, + "loss": 0.0614, + "num_input_tokens_seen": 80927072, + "step": 66495 + }, + { + "epoch": 7.40616995211048, + "grad_norm": 1.5568876266479492, + "learning_rate": 3.967323628902593e-05, + "loss": 0.137, + "num_input_tokens_seen": 80933088, + "step": 66500 + }, + { + "epoch": 7.406726806994097, + "grad_norm": 0.04662449657917023, + "learning_rate": 3.967126901148911e-05, + "loss": 0.0439, + "num_input_tokens_seen": 80939040, + "step": 66505 + }, + { + "epoch": 7.407283661877715, + "grad_norm": 0.032333459705114365, + "learning_rate": 3.9669301595370405e-05, + "loss": 0.0063, + "num_input_tokens_seen": 80945120, + "step": 66510 + }, + { + "epoch": 7.407840516761332, + "grad_norm": 0.004954883363097906, + "learning_rate": 3.966733404068839e-05, + "loss": 0.0194, + "num_input_tokens_seen": 80951328, + "step": 66515 + }, + { + "epoch": 7.408397371644949, + "grad_norm": 0.004988403525203466, + "learning_rate": 3.966536634746166e-05, + "loss": 0.0537, + "num_input_tokens_seen": 80957440, + "step": 66520 + }, + { + "epoch": 7.408954226528567, + "grad_norm": 0.25179314613342285, + "learning_rate": 3.966339851570879e-05, + "loss": 0.0234, + "num_input_tokens_seen": 80963808, + "step": 66525 + }, + { + "epoch": 7.409511081412184, + "grad_norm": 0.4613789916038513, + "learning_rate": 3.966143054544837e-05, + "loss": 0.0317, + "num_input_tokens_seen": 80969984, + "step": 66530 + }, + { + "epoch": 7.4100679362958015, + "grad_norm": 0.002213522791862488, + "learning_rate": 3.9659462436699e-05, + "loss": 0.0153, + "num_input_tokens_seen": 80976320, + "step": 66535 + }, + { + "epoch": 7.410624791179418, + "grad_norm": 0.06867828965187073, + "learning_rate": 3.965749418947926e-05, + "loss": 0.0012, + "num_input_tokens_seen": 80982656, + "step": 66540 + }, + { + "epoch": 7.411181646063036, + "grad_norm": 0.0071066743694245815, + "learning_rate": 3.965552580380773e-05, + "loss": 0.0224, + "num_input_tokens_seen": 80988704, + "step": 66545 + }, + { + "epoch": 7.411738500946654, + "grad_norm": 0.2904905080795288, + "learning_rate": 3.965355727970304e-05, + "loss": 0.1263, + "num_input_tokens_seen": 80995200, + "step": 66550 + }, + { + "epoch": 7.41229535583027, + "grad_norm": 0.13193117082118988, + "learning_rate": 3.965158861718375e-05, + "loss": 0.0847, + "num_input_tokens_seen": 81001472, + "step": 66555 + }, + { + "epoch": 7.412852210713888, + "grad_norm": 0.4900672733783722, + "learning_rate": 3.9649619816268466e-05, + "loss": 0.0602, + "num_input_tokens_seen": 81007424, + "step": 66560 + }, + { + "epoch": 7.413409065597505, + "grad_norm": 0.005184664856642485, + "learning_rate": 3.9647650876975785e-05, + "loss": 0.0846, + "num_input_tokens_seen": 81013600, + "step": 66565 + }, + { + "epoch": 7.413965920481123, + "grad_norm": 0.12222651392221451, + "learning_rate": 3.9645681799324305e-05, + "loss": 0.0616, + "num_input_tokens_seen": 81019744, + "step": 66570 + }, + { + "epoch": 7.41452277536474, + "grad_norm": 0.40024229884147644, + "learning_rate": 3.964371258333264e-05, + "loss": 0.0487, + "num_input_tokens_seen": 81026048, + "step": 66575 + }, + { + "epoch": 7.415079630248357, + "grad_norm": 0.12489135563373566, + "learning_rate": 3.964174322901936e-05, + "loss": 0.1377, + "num_input_tokens_seen": 81032064, + "step": 66580 + }, + { + "epoch": 7.415636485131975, + "grad_norm": 0.004999780561774969, + "learning_rate": 3.963977373640309e-05, + "loss": 0.1157, + "num_input_tokens_seen": 81038144, + "step": 66585 + }, + { + "epoch": 7.416193340015592, + "grad_norm": 0.00019435145077295601, + "learning_rate": 3.963780410550243e-05, + "loss": 0.0175, + "num_input_tokens_seen": 81044032, + "step": 66590 + }, + { + "epoch": 7.416750194899209, + "grad_norm": 0.29333627223968506, + "learning_rate": 3.963583433633598e-05, + "loss": 0.0987, + "num_input_tokens_seen": 81050080, + "step": 66595 + }, + { + "epoch": 7.417307049782827, + "grad_norm": 0.6290361285209656, + "learning_rate": 3.9633864428922355e-05, + "loss": 0.0185, + "num_input_tokens_seen": 81056256, + "step": 66600 + }, + { + "epoch": 7.417863904666444, + "grad_norm": 0.18887877464294434, + "learning_rate": 3.963189438328015e-05, + "loss": 0.0391, + "num_input_tokens_seen": 81062080, + "step": 66605 + }, + { + "epoch": 7.418420759550061, + "grad_norm": 0.7029045820236206, + "learning_rate": 3.962992419942798e-05, + "loss": 0.1227, + "num_input_tokens_seen": 81068352, + "step": 66610 + }, + { + "epoch": 7.418977614433679, + "grad_norm": 0.047247111797332764, + "learning_rate": 3.9627953877384454e-05, + "loss": 0.024, + "num_input_tokens_seen": 81074336, + "step": 66615 + }, + { + "epoch": 7.419534469317296, + "grad_norm": 0.0011281328042969108, + "learning_rate": 3.962598341716819e-05, + "loss": 0.0143, + "num_input_tokens_seen": 81080544, + "step": 66620 + }, + { + "epoch": 7.420091324200913, + "grad_norm": 0.8612421154975891, + "learning_rate": 3.9624012818797787e-05, + "loss": 0.0837, + "num_input_tokens_seen": 81087072, + "step": 66625 + }, + { + "epoch": 7.42064817908453, + "grad_norm": 0.528826117515564, + "learning_rate": 3.962204208229187e-05, + "loss": 0.0301, + "num_input_tokens_seen": 81092800, + "step": 66630 + }, + { + "epoch": 7.421205033968148, + "grad_norm": 0.26507943868637085, + "learning_rate": 3.962007120766905e-05, + "loss": 0.012, + "num_input_tokens_seen": 81099104, + "step": 66635 + }, + { + "epoch": 7.421761888851766, + "grad_norm": 0.4233340620994568, + "learning_rate": 3.9618100194947946e-05, + "loss": 0.0361, + "num_input_tokens_seen": 81105408, + "step": 66640 + }, + { + "epoch": 7.422318743735382, + "grad_norm": 0.04894155636429787, + "learning_rate": 3.961612904414717e-05, + "loss": 0.0168, + "num_input_tokens_seen": 81111552, + "step": 66645 + }, + { + "epoch": 7.422875598619, + "grad_norm": 0.6490718126296997, + "learning_rate": 3.961415775528534e-05, + "loss": 0.0185, + "num_input_tokens_seen": 81117856, + "step": 66650 + }, + { + "epoch": 7.423432453502617, + "grad_norm": 0.8514319062232971, + "learning_rate": 3.9612186328381096e-05, + "loss": 0.0268, + "num_input_tokens_seen": 81124064, + "step": 66655 + }, + { + "epoch": 7.4239893083862345, + "grad_norm": 0.35669636726379395, + "learning_rate": 3.9610214763453036e-05, + "loss": 0.0873, + "num_input_tokens_seen": 81130336, + "step": 66660 + }, + { + "epoch": 7.424546163269852, + "grad_norm": 0.11938490718603134, + "learning_rate": 3.960824306051979e-05, + "loss": 0.0255, + "num_input_tokens_seen": 81136416, + "step": 66665 + }, + { + "epoch": 7.425103018153469, + "grad_norm": 0.07297495752573013, + "learning_rate": 3.960627121959999e-05, + "loss": 0.0031, + "num_input_tokens_seen": 81141632, + "step": 66670 + }, + { + "epoch": 7.425659873037087, + "grad_norm": 0.19431640207767487, + "learning_rate": 3.960429924071225e-05, + "loss": 0.0084, + "num_input_tokens_seen": 81147872, + "step": 66675 + }, + { + "epoch": 7.426216727920703, + "grad_norm": 0.1475166231393814, + "learning_rate": 3.960232712387521e-05, + "loss": 0.0356, + "num_input_tokens_seen": 81153632, + "step": 66680 + }, + { + "epoch": 7.426773582804321, + "grad_norm": 0.07555713504552841, + "learning_rate": 3.960035486910748e-05, + "loss": 0.1196, + "num_input_tokens_seen": 81159328, + "step": 66685 + }, + { + "epoch": 7.427330437687939, + "grad_norm": 1.2186414003372192, + "learning_rate": 3.9598382476427716e-05, + "loss": 0.1541, + "num_input_tokens_seen": 81165760, + "step": 66690 + }, + { + "epoch": 7.4278872925715556, + "grad_norm": 0.003832288784906268, + "learning_rate": 3.959640994585452e-05, + "loss": 0.0198, + "num_input_tokens_seen": 81172096, + "step": 66695 + }, + { + "epoch": 7.428444147455173, + "grad_norm": 0.8965616226196289, + "learning_rate": 3.9594437277406546e-05, + "loss": 0.0461, + "num_input_tokens_seen": 81177600, + "step": 66700 + }, + { + "epoch": 7.429001002338791, + "grad_norm": 0.20046620070934296, + "learning_rate": 3.959246447110242e-05, + "loss": 0.016, + "num_input_tokens_seen": 81183808, + "step": 66705 + }, + { + "epoch": 7.429557857222408, + "grad_norm": 0.14117465913295746, + "learning_rate": 3.9590491526960774e-05, + "loss": 0.029, + "num_input_tokens_seen": 81189792, + "step": 66710 + }, + { + "epoch": 7.430114712106025, + "grad_norm": 1.4407172203063965, + "learning_rate": 3.9588518445000236e-05, + "loss": 0.0676, + "num_input_tokens_seen": 81195776, + "step": 66715 + }, + { + "epoch": 7.430671566989642, + "grad_norm": 0.07974927872419357, + "learning_rate": 3.9586545225239465e-05, + "loss": 0.0429, + "num_input_tokens_seen": 81201856, + "step": 66720 + }, + { + "epoch": 7.43122842187326, + "grad_norm": 0.6355013847351074, + "learning_rate": 3.958457186769708e-05, + "loss": 0.0496, + "num_input_tokens_seen": 81207936, + "step": 66725 + }, + { + "epoch": 7.4317852767568775, + "grad_norm": 2.1328649520874023, + "learning_rate": 3.958259837239173e-05, + "loss": 0.0635, + "num_input_tokens_seen": 81214016, + "step": 66730 + }, + { + "epoch": 7.432342131640494, + "grad_norm": 0.6968005299568176, + "learning_rate": 3.9580624739342063e-05, + "loss": 0.0587, + "num_input_tokens_seen": 81220256, + "step": 66735 + }, + { + "epoch": 7.432898986524112, + "grad_norm": 0.0010241519194096327, + "learning_rate": 3.95786509685667e-05, + "loss": 0.0046, + "num_input_tokens_seen": 81226464, + "step": 66740 + }, + { + "epoch": 7.433455841407729, + "grad_norm": 0.6239423155784607, + "learning_rate": 3.957667706008431e-05, + "loss": 0.1484, + "num_input_tokens_seen": 81232736, + "step": 66745 + }, + { + "epoch": 7.434012696291346, + "grad_norm": 0.3726702928543091, + "learning_rate": 3.9574703013913526e-05, + "loss": 0.0175, + "num_input_tokens_seen": 81238816, + "step": 66750 + }, + { + "epoch": 7.434569551174964, + "grad_norm": 0.13166293501853943, + "learning_rate": 3.9572728830072994e-05, + "loss": 0.0298, + "num_input_tokens_seen": 81244960, + "step": 66755 + }, + { + "epoch": 7.435126406058581, + "grad_norm": 0.08334100246429443, + "learning_rate": 3.957075450858136e-05, + "loss": 0.0295, + "num_input_tokens_seen": 81251232, + "step": 66760 + }, + { + "epoch": 7.435683260942199, + "grad_norm": 0.31846433877944946, + "learning_rate": 3.9568780049457276e-05, + "loss": 0.0055, + "num_input_tokens_seen": 81257504, + "step": 66765 + }, + { + "epoch": 7.436240115825816, + "grad_norm": 0.002161209238693118, + "learning_rate": 3.9566805452719394e-05, + "loss": 0.0709, + "num_input_tokens_seen": 81264032, + "step": 66770 + }, + { + "epoch": 7.436796970709433, + "grad_norm": 0.5076997876167297, + "learning_rate": 3.9564830718386355e-05, + "loss": 0.032, + "num_input_tokens_seen": 81270016, + "step": 66775 + }, + { + "epoch": 7.437353825593051, + "grad_norm": 0.0005786186666227877, + "learning_rate": 3.956285584647683e-05, + "loss": 0.0289, + "num_input_tokens_seen": 81276064, + "step": 66780 + }, + { + "epoch": 7.4379106804766675, + "grad_norm": 0.011909838765859604, + "learning_rate": 3.956088083700946e-05, + "loss": 0.0636, + "num_input_tokens_seen": 81282528, + "step": 66785 + }, + { + "epoch": 7.438467535360285, + "grad_norm": 0.4907371699810028, + "learning_rate": 3.955890569000291e-05, + "loss": 0.0897, + "num_input_tokens_seen": 81288768, + "step": 66790 + }, + { + "epoch": 7.439024390243903, + "grad_norm": 0.038351502269506454, + "learning_rate": 3.955693040547583e-05, + "loss": 0.0677, + "num_input_tokens_seen": 81295040, + "step": 66795 + }, + { + "epoch": 7.43958124512752, + "grad_norm": 1.219746708869934, + "learning_rate": 3.955495498344688e-05, + "loss": 0.0827, + "num_input_tokens_seen": 81301152, + "step": 66800 + }, + { + "epoch": 7.440138100011137, + "grad_norm": 1.397750973701477, + "learning_rate": 3.955297942393471e-05, + "loss": 0.096, + "num_input_tokens_seen": 81307328, + "step": 66805 + }, + { + "epoch": 7.440694954894754, + "grad_norm": 0.10228536278009415, + "learning_rate": 3.9551003726958e-05, + "loss": 0.2011, + "num_input_tokens_seen": 81313088, + "step": 66810 + }, + { + "epoch": 7.441251809778372, + "grad_norm": 1.5707917213439941, + "learning_rate": 3.954902789253539e-05, + "loss": 0.1605, + "num_input_tokens_seen": 81318816, + "step": 66815 + }, + { + "epoch": 7.441808664661989, + "grad_norm": 0.009397354908287525, + "learning_rate": 3.954705192068557e-05, + "loss": 0.004, + "num_input_tokens_seen": 81325376, + "step": 66820 + }, + { + "epoch": 7.442365519545606, + "grad_norm": 0.11211059987545013, + "learning_rate": 3.9545075811427186e-05, + "loss": 0.0095, + "num_input_tokens_seen": 81331424, + "step": 66825 + }, + { + "epoch": 7.442922374429224, + "grad_norm": 1.0190197229385376, + "learning_rate": 3.9543099564778904e-05, + "loss": 0.0565, + "num_input_tokens_seen": 81337952, + "step": 66830 + }, + { + "epoch": 7.443479229312841, + "grad_norm": 0.1868503987789154, + "learning_rate": 3.9541123180759395e-05, + "loss": 0.0225, + "num_input_tokens_seen": 81343968, + "step": 66835 + }, + { + "epoch": 7.444036084196458, + "grad_norm": 1.2527143955230713, + "learning_rate": 3.953914665938733e-05, + "loss": 0.1062, + "num_input_tokens_seen": 81349888, + "step": 66840 + }, + { + "epoch": 7.444592939080076, + "grad_norm": 0.027874967083334923, + "learning_rate": 3.953717000068137e-05, + "loss": 0.0487, + "num_input_tokens_seen": 81355808, + "step": 66845 + }, + { + "epoch": 7.445149793963693, + "grad_norm": 0.2969321012496948, + "learning_rate": 3.953519320466019e-05, + "loss": 0.0161, + "num_input_tokens_seen": 81362176, + "step": 66850 + }, + { + "epoch": 7.4457066488473105, + "grad_norm": 0.62065190076828, + "learning_rate": 3.953321627134248e-05, + "loss": 0.0305, + "num_input_tokens_seen": 81368672, + "step": 66855 + }, + { + "epoch": 7.446263503730927, + "grad_norm": 1.304648518562317, + "learning_rate": 3.953123920074688e-05, + "loss": 0.048, + "num_input_tokens_seen": 81374144, + "step": 66860 + }, + { + "epoch": 7.446820358614545, + "grad_norm": 0.04858376085758209, + "learning_rate": 3.9529261992892096e-05, + "loss": 0.0189, + "num_input_tokens_seen": 81380256, + "step": 66865 + }, + { + "epoch": 7.447377213498163, + "grad_norm": 0.0006562630296684802, + "learning_rate": 3.952728464779679e-05, + "loss": 0.0319, + "num_input_tokens_seen": 81386368, + "step": 66870 + }, + { + "epoch": 7.447934068381779, + "grad_norm": 0.0021773637272417545, + "learning_rate": 3.952530716547964e-05, + "loss": 0.0276, + "num_input_tokens_seen": 81392384, + "step": 66875 + }, + { + "epoch": 7.448490923265397, + "grad_norm": 0.014329100027680397, + "learning_rate": 3.952332954595933e-05, + "loss": 0.1959, + "num_input_tokens_seen": 81398368, + "step": 66880 + }, + { + "epoch": 7.449047778149015, + "grad_norm": 0.12411890923976898, + "learning_rate": 3.952135178925453e-05, + "loss": 0.0293, + "num_input_tokens_seen": 81404416, + "step": 66885 + }, + { + "epoch": 7.4496046330326315, + "grad_norm": 1.9991097450256348, + "learning_rate": 3.951937389538393e-05, + "loss": 0.0906, + "num_input_tokens_seen": 81410560, + "step": 66890 + }, + { + "epoch": 7.450161487916249, + "grad_norm": 0.0790422186255455, + "learning_rate": 3.951739586436621e-05, + "loss": 0.0404, + "num_input_tokens_seen": 81416704, + "step": 66895 + }, + { + "epoch": 7.450718342799866, + "grad_norm": 0.12448771297931671, + "learning_rate": 3.951541769622006e-05, + "loss": 0.0745, + "num_input_tokens_seen": 81422688, + "step": 66900 + }, + { + "epoch": 7.451275197683484, + "grad_norm": 0.05698050558567047, + "learning_rate": 3.951343939096416e-05, + "loss": 0.0493, + "num_input_tokens_seen": 81428832, + "step": 66905 + }, + { + "epoch": 7.451832052567101, + "grad_norm": 0.5571953654289246, + "learning_rate": 3.951146094861719e-05, + "loss": 0.042, + "num_input_tokens_seen": 81434432, + "step": 66910 + }, + { + "epoch": 7.452388907450718, + "grad_norm": 0.013121191412210464, + "learning_rate": 3.950948236919785e-05, + "loss": 0.0842, + "num_input_tokens_seen": 81440448, + "step": 66915 + }, + { + "epoch": 7.452945762334336, + "grad_norm": 0.024858135730028152, + "learning_rate": 3.9507503652724825e-05, + "loss": 0.0448, + "num_input_tokens_seen": 81446656, + "step": 66920 + }, + { + "epoch": 7.453502617217953, + "grad_norm": 0.042602892965078354, + "learning_rate": 3.95055247992168e-05, + "loss": 0.0252, + "num_input_tokens_seen": 81452864, + "step": 66925 + }, + { + "epoch": 7.45405947210157, + "grad_norm": 0.23511837422847748, + "learning_rate": 3.950354580869248e-05, + "loss": 0.0664, + "num_input_tokens_seen": 81459008, + "step": 66930 + }, + { + "epoch": 7.454616326985188, + "grad_norm": 0.004103127401322126, + "learning_rate": 3.950156668117054e-05, + "loss": 0.0526, + "num_input_tokens_seen": 81465024, + "step": 66935 + }, + { + "epoch": 7.455173181868805, + "grad_norm": 0.18200728297233582, + "learning_rate": 3.949958741666969e-05, + "loss": 0.0806, + "num_input_tokens_seen": 81471200, + "step": 66940 + }, + { + "epoch": 7.455730036752422, + "grad_norm": 1.0000072717666626, + "learning_rate": 3.949760801520862e-05, + "loss": 0.0778, + "num_input_tokens_seen": 81477248, + "step": 66945 + }, + { + "epoch": 7.45628689163604, + "grad_norm": 0.6206445693969727, + "learning_rate": 3.949562847680603e-05, + "loss": 0.0619, + "num_input_tokens_seen": 81483136, + "step": 66950 + }, + { + "epoch": 7.456843746519657, + "grad_norm": 0.00028327014297246933, + "learning_rate": 3.949364880148061e-05, + "loss": 0.015, + "num_input_tokens_seen": 81489504, + "step": 66955 + }, + { + "epoch": 7.4574006014032745, + "grad_norm": 0.08932767063379288, + "learning_rate": 3.9491668989251066e-05, + "loss": 0.0158, + "num_input_tokens_seen": 81495680, + "step": 66960 + }, + { + "epoch": 7.457957456286891, + "grad_norm": 0.6247054934501648, + "learning_rate": 3.94896890401361e-05, + "loss": 0.0612, + "num_input_tokens_seen": 81501600, + "step": 66965 + }, + { + "epoch": 7.458514311170509, + "grad_norm": 0.6064343452453613, + "learning_rate": 3.9487708954154405e-05, + "loss": 0.0427, + "num_input_tokens_seen": 81507872, + "step": 66970 + }, + { + "epoch": 7.459071166054127, + "grad_norm": 0.8980651497840881, + "learning_rate": 3.94857287313247e-05, + "loss": 0.1126, + "num_input_tokens_seen": 81514176, + "step": 66975 + }, + { + "epoch": 7.4596280209377435, + "grad_norm": 1.1916048526763916, + "learning_rate": 3.948374837166567e-05, + "loss": 0.1088, + "num_input_tokens_seen": 81520384, + "step": 66980 + }, + { + "epoch": 7.460184875821361, + "grad_norm": 2.710543155670166, + "learning_rate": 3.948176787519604e-05, + "loss": 0.2276, + "num_input_tokens_seen": 81526816, + "step": 66985 + }, + { + "epoch": 7.460741730704978, + "grad_norm": 0.5094576478004456, + "learning_rate": 3.9479787241934516e-05, + "loss": 0.0391, + "num_input_tokens_seen": 81533216, + "step": 66990 + }, + { + "epoch": 7.461298585588596, + "grad_norm": 0.08943866938352585, + "learning_rate": 3.9477806471899795e-05, + "loss": 0.0716, + "num_input_tokens_seen": 81539168, + "step": 66995 + }, + { + "epoch": 7.461855440472213, + "grad_norm": 1.4773190021514893, + "learning_rate": 3.947582556511059e-05, + "loss": 0.103, + "num_input_tokens_seen": 81545152, + "step": 67000 + }, + { + "epoch": 7.46241229535583, + "grad_norm": 0.9440675973892212, + "learning_rate": 3.947384452158562e-05, + "loss": 0.0424, + "num_input_tokens_seen": 81551040, + "step": 67005 + }, + { + "epoch": 7.462969150239448, + "grad_norm": 0.8281881213188171, + "learning_rate": 3.947186334134359e-05, + "loss": 0.075, + "num_input_tokens_seen": 81556512, + "step": 67010 + }, + { + "epoch": 7.4635260051230645, + "grad_norm": 0.07527311146259308, + "learning_rate": 3.946988202440321e-05, + "loss": 0.0053, + "num_input_tokens_seen": 81562784, + "step": 67015 + }, + { + "epoch": 7.464082860006682, + "grad_norm": 1.7897170782089233, + "learning_rate": 3.94679005707832e-05, + "loss": 0.067, + "num_input_tokens_seen": 81568544, + "step": 67020 + }, + { + "epoch": 7.4646397148903, + "grad_norm": 0.025480767711997032, + "learning_rate": 3.9465918980502294e-05, + "loss": 0.0758, + "num_input_tokens_seen": 81575072, + "step": 67025 + }, + { + "epoch": 7.465196569773917, + "grad_norm": 0.015086223371326923, + "learning_rate": 3.946393725357918e-05, + "loss": 0.0239, + "num_input_tokens_seen": 81581056, + "step": 67030 + }, + { + "epoch": 7.465753424657534, + "grad_norm": 0.15934501588344574, + "learning_rate": 3.946195539003259e-05, + "loss": 0.0812, + "num_input_tokens_seen": 81586560, + "step": 67035 + }, + { + "epoch": 7.466310279541151, + "grad_norm": 0.14173492789268494, + "learning_rate": 3.9459973389881254e-05, + "loss": 0.0752, + "num_input_tokens_seen": 81591904, + "step": 67040 + }, + { + "epoch": 7.466867134424769, + "grad_norm": 0.0022861389443278313, + "learning_rate": 3.9457991253143876e-05, + "loss": 0.0646, + "num_input_tokens_seen": 81597984, + "step": 67045 + }, + { + "epoch": 7.4674239893083865, + "grad_norm": 0.016665175557136536, + "learning_rate": 3.945600897983919e-05, + "loss": 0.1246, + "num_input_tokens_seen": 81604320, + "step": 67050 + }, + { + "epoch": 7.467980844192003, + "grad_norm": 0.290650337934494, + "learning_rate": 3.9454026569985916e-05, + "loss": 0.0172, + "num_input_tokens_seen": 81610208, + "step": 67055 + }, + { + "epoch": 7.468537699075621, + "grad_norm": 0.9206507802009583, + "learning_rate": 3.945204402360278e-05, + "loss": 0.1285, + "num_input_tokens_seen": 81616160, + "step": 67060 + }, + { + "epoch": 7.469094553959239, + "grad_norm": 0.45300886034965515, + "learning_rate": 3.945006134070851e-05, + "loss": 0.0241, + "num_input_tokens_seen": 81622432, + "step": 67065 + }, + { + "epoch": 7.469651408842855, + "grad_norm": 0.027772199362516403, + "learning_rate": 3.944807852132184e-05, + "loss": 0.1186, + "num_input_tokens_seen": 81628704, + "step": 67070 + }, + { + "epoch": 7.470208263726473, + "grad_norm": 0.002357719698920846, + "learning_rate": 3.9446095565461484e-05, + "loss": 0.0194, + "num_input_tokens_seen": 81634624, + "step": 67075 + }, + { + "epoch": 7.47076511861009, + "grad_norm": 0.02029651589691639, + "learning_rate": 3.9444112473146184e-05, + "loss": 0.0293, + "num_input_tokens_seen": 81640992, + "step": 67080 + }, + { + "epoch": 7.4713219734937075, + "grad_norm": 0.0066980719566345215, + "learning_rate": 3.9442129244394666e-05, + "loss": 0.0181, + "num_input_tokens_seen": 81647136, + "step": 67085 + }, + { + "epoch": 7.471878828377325, + "grad_norm": 0.0008473563939332962, + "learning_rate": 3.944014587922567e-05, + "loss": 0.0424, + "num_input_tokens_seen": 81653504, + "step": 67090 + }, + { + "epoch": 7.472435683260942, + "grad_norm": 0.23938384652137756, + "learning_rate": 3.943816237765793e-05, + "loss": 0.085, + "num_input_tokens_seen": 81659904, + "step": 67095 + }, + { + "epoch": 7.47299253814456, + "grad_norm": 0.044240664690732956, + "learning_rate": 3.943617873971017e-05, + "loss": 0.0158, + "num_input_tokens_seen": 81666240, + "step": 67100 + }, + { + "epoch": 7.4735493930281764, + "grad_norm": 0.4333561062812805, + "learning_rate": 3.943419496540115e-05, + "loss": 0.0252, + "num_input_tokens_seen": 81672704, + "step": 67105 + }, + { + "epoch": 7.474106247911794, + "grad_norm": 0.003263582941144705, + "learning_rate": 3.943221105474958e-05, + "loss": 0.0123, + "num_input_tokens_seen": 81679072, + "step": 67110 + }, + { + "epoch": 7.474663102795412, + "grad_norm": 0.15921686589717865, + "learning_rate": 3.9430227007774225e-05, + "loss": 0.0877, + "num_input_tokens_seen": 81684992, + "step": 67115 + }, + { + "epoch": 7.475219957679029, + "grad_norm": 0.38049954175949097, + "learning_rate": 3.9428242824493805e-05, + "loss": 0.0636, + "num_input_tokens_seen": 81690976, + "step": 67120 + }, + { + "epoch": 7.475776812562646, + "grad_norm": 0.34996160864830017, + "learning_rate": 3.942625850492707e-05, + "loss": 0.0144, + "num_input_tokens_seen": 81697280, + "step": 67125 + }, + { + "epoch": 7.476333667446264, + "grad_norm": 0.001289632054977119, + "learning_rate": 3.942427404909278e-05, + "loss": 0.0302, + "num_input_tokens_seen": 81703648, + "step": 67130 + }, + { + "epoch": 7.476890522329881, + "grad_norm": 0.0639309510588646, + "learning_rate": 3.9422289457009654e-05, + "loss": 0.1274, + "num_input_tokens_seen": 81709344, + "step": 67135 + }, + { + "epoch": 7.477447377213498, + "grad_norm": 0.4517369270324707, + "learning_rate": 3.942030472869645e-05, + "loss": 0.1224, + "num_input_tokens_seen": 81715232, + "step": 67140 + }, + { + "epoch": 7.478004232097115, + "grad_norm": 0.6792625784873962, + "learning_rate": 3.9418319864171914e-05, + "loss": 0.0806, + "num_input_tokens_seen": 81721440, + "step": 67145 + }, + { + "epoch": 7.478561086980733, + "grad_norm": 0.18129028379917145, + "learning_rate": 3.94163348634548e-05, + "loss": 0.076, + "num_input_tokens_seen": 81727872, + "step": 67150 + }, + { + "epoch": 7.4791179418643505, + "grad_norm": 0.0380319282412529, + "learning_rate": 3.941434972656385e-05, + "loss": 0.0583, + "num_input_tokens_seen": 81733344, + "step": 67155 + }, + { + "epoch": 7.479674796747967, + "grad_norm": 0.38216060400009155, + "learning_rate": 3.9412364453517815e-05, + "loss": 0.1234, + "num_input_tokens_seen": 81739456, + "step": 67160 + }, + { + "epoch": 7.480231651631585, + "grad_norm": 0.0014713420532643795, + "learning_rate": 3.941037904433545e-05, + "loss": 0.0264, + "num_input_tokens_seen": 81745824, + "step": 67165 + }, + { + "epoch": 7.480788506515202, + "grad_norm": 0.4085935354232788, + "learning_rate": 3.940839349903552e-05, + "loss": 0.0994, + "num_input_tokens_seen": 81751936, + "step": 67170 + }, + { + "epoch": 7.4813453613988194, + "grad_norm": 0.0005164397298358381, + "learning_rate": 3.940640781763676e-05, + "loss": 0.0901, + "num_input_tokens_seen": 81757952, + "step": 67175 + }, + { + "epoch": 7.481902216282437, + "grad_norm": 0.04425773769617081, + "learning_rate": 3.940442200015794e-05, + "loss": 0.1042, + "num_input_tokens_seen": 81764000, + "step": 67180 + }, + { + "epoch": 7.482459071166054, + "grad_norm": 0.39740514755249023, + "learning_rate": 3.9402436046617806e-05, + "loss": 0.0365, + "num_input_tokens_seen": 81769888, + "step": 67185 + }, + { + "epoch": 7.483015926049672, + "grad_norm": 0.5883432626724243, + "learning_rate": 3.9400449957035135e-05, + "loss": 0.0613, + "num_input_tokens_seen": 81775680, + "step": 67190 + }, + { + "epoch": 7.483572780933288, + "grad_norm": 0.6852973699569702, + "learning_rate": 3.939846373142867e-05, + "loss": 0.0813, + "num_input_tokens_seen": 81781664, + "step": 67195 + }, + { + "epoch": 7.484129635816906, + "grad_norm": 0.5464146733283997, + "learning_rate": 3.9396477369817185e-05, + "loss": 0.0302, + "num_input_tokens_seen": 81788128, + "step": 67200 + }, + { + "epoch": 7.484686490700524, + "grad_norm": 0.854949951171875, + "learning_rate": 3.9394490872219434e-05, + "loss": 0.0409, + "num_input_tokens_seen": 81794176, + "step": 67205 + }, + { + "epoch": 7.4852433455841405, + "grad_norm": 1.7437893152236938, + "learning_rate": 3.939250423865418e-05, + "loss": 0.1354, + "num_input_tokens_seen": 81800256, + "step": 67210 + }, + { + "epoch": 7.485800200467758, + "grad_norm": 0.006754694506525993, + "learning_rate": 3.93905174691402e-05, + "loss": 0.0796, + "num_input_tokens_seen": 81806400, + "step": 67215 + }, + { + "epoch": 7.486357055351375, + "grad_norm": 2.160491704940796, + "learning_rate": 3.9388530563696245e-05, + "loss": 0.0885, + "num_input_tokens_seen": 81812384, + "step": 67220 + }, + { + "epoch": 7.486913910234993, + "grad_norm": 1.1522823572158813, + "learning_rate": 3.93865435223411e-05, + "loss": 0.1181, + "num_input_tokens_seen": 81818496, + "step": 67225 + }, + { + "epoch": 7.48747076511861, + "grad_norm": 0.573827862739563, + "learning_rate": 3.938455634509352e-05, + "loss": 0.0811, + "num_input_tokens_seen": 81824128, + "step": 67230 + }, + { + "epoch": 7.488027620002227, + "grad_norm": 0.006382429040968418, + "learning_rate": 3.9382569031972275e-05, + "loss": 0.0359, + "num_input_tokens_seen": 81830208, + "step": 67235 + }, + { + "epoch": 7.488584474885845, + "grad_norm": 0.48097506165504456, + "learning_rate": 3.9380581582996144e-05, + "loss": 0.0736, + "num_input_tokens_seen": 81835488, + "step": 67240 + }, + { + "epoch": 7.4891413297694625, + "grad_norm": 0.00023397155746351928, + "learning_rate": 3.9378593998183914e-05, + "loss": 0.0754, + "num_input_tokens_seen": 81841664, + "step": 67245 + }, + { + "epoch": 7.489698184653079, + "grad_norm": 0.45619824528694153, + "learning_rate": 3.937660627755433e-05, + "loss": 0.0542, + "num_input_tokens_seen": 81847488, + "step": 67250 + }, + { + "epoch": 7.490255039536697, + "grad_norm": 0.5748812556266785, + "learning_rate": 3.937461842112618e-05, + "loss": 0.0635, + "num_input_tokens_seen": 81853632, + "step": 67255 + }, + { + "epoch": 7.490811894420314, + "grad_norm": 0.2636904716491699, + "learning_rate": 3.9372630428918245e-05, + "loss": 0.0283, + "num_input_tokens_seen": 81859136, + "step": 67260 + }, + { + "epoch": 7.491368749303931, + "grad_norm": 0.3658006191253662, + "learning_rate": 3.93706423009493e-05, + "loss": 0.0344, + "num_input_tokens_seen": 81865408, + "step": 67265 + }, + { + "epoch": 7.491925604187549, + "grad_norm": 0.11341783404350281, + "learning_rate": 3.9368654037238125e-05, + "loss": 0.0445, + "num_input_tokens_seen": 81871008, + "step": 67270 + }, + { + "epoch": 7.492482459071166, + "grad_norm": 0.031219597905874252, + "learning_rate": 3.93666656378035e-05, + "loss": 0.0085, + "num_input_tokens_seen": 81877024, + "step": 67275 + }, + { + "epoch": 7.4930393139547835, + "grad_norm": 0.011548192240297794, + "learning_rate": 3.936467710266422e-05, + "loss": 0.0626, + "num_input_tokens_seen": 81883168, + "step": 67280 + }, + { + "epoch": 7.493596168838401, + "grad_norm": 2.001481771469116, + "learning_rate": 3.936268843183904e-05, + "loss": 0.0323, + "num_input_tokens_seen": 81888992, + "step": 67285 + }, + { + "epoch": 7.494153023722018, + "grad_norm": 0.10700660943984985, + "learning_rate": 3.936069962534677e-05, + "loss": 0.1119, + "num_input_tokens_seen": 81895200, + "step": 67290 + }, + { + "epoch": 7.494709878605636, + "grad_norm": 0.289487361907959, + "learning_rate": 3.935871068320618e-05, + "loss": 0.0985, + "num_input_tokens_seen": 81901600, + "step": 67295 + }, + { + "epoch": 7.495266733489252, + "grad_norm": 0.33319613337516785, + "learning_rate": 3.9356721605436064e-05, + "loss": 0.0388, + "num_input_tokens_seen": 81907712, + "step": 67300 + }, + { + "epoch": 7.49582358837287, + "grad_norm": 0.07515427470207214, + "learning_rate": 3.9354732392055216e-05, + "loss": 0.1065, + "num_input_tokens_seen": 81913664, + "step": 67305 + }, + { + "epoch": 7.496380443256488, + "grad_norm": 1.0036529302597046, + "learning_rate": 3.935274304308241e-05, + "loss": 0.0608, + "num_input_tokens_seen": 81919936, + "step": 67310 + }, + { + "epoch": 7.496937298140105, + "grad_norm": 0.060629572719335556, + "learning_rate": 3.935075355853646e-05, + "loss": 0.0194, + "num_input_tokens_seen": 81926112, + "step": 67315 + }, + { + "epoch": 7.497494153023722, + "grad_norm": 0.19734914600849152, + "learning_rate": 3.9348763938436134e-05, + "loss": 0.1437, + "num_input_tokens_seen": 81932128, + "step": 67320 + }, + { + "epoch": 7.498051007907339, + "grad_norm": 0.021414168179035187, + "learning_rate": 3.9346774182800237e-05, + "loss": 0.0031, + "num_input_tokens_seen": 81938272, + "step": 67325 + }, + { + "epoch": 7.498607862790957, + "grad_norm": 0.010347379371523857, + "learning_rate": 3.934478429164757e-05, + "loss": 0.0043, + "num_input_tokens_seen": 81944480, + "step": 67330 + }, + { + "epoch": 7.499164717674574, + "grad_norm": 0.3776639997959137, + "learning_rate": 3.9342794264996916e-05, + "loss": 0.0097, + "num_input_tokens_seen": 81950592, + "step": 67335 + }, + { + "epoch": 7.499721572558191, + "grad_norm": 0.6702880263328552, + "learning_rate": 3.934080410286709e-05, + "loss": 0.0437, + "num_input_tokens_seen": 81955616, + "step": 67340 + }, + { + "epoch": 7.500278427441809, + "grad_norm": 0.5672076344490051, + "learning_rate": 3.933881380527687e-05, + "loss": 0.0172, + "num_input_tokens_seen": 81961856, + "step": 67345 + }, + { + "epoch": 7.500835282325426, + "grad_norm": 0.7365555763244629, + "learning_rate": 3.933682337224507e-05, + "loss": 0.0678, + "num_input_tokens_seen": 81967872, + "step": 67350 + }, + { + "epoch": 7.501392137209043, + "grad_norm": 0.09696658700704575, + "learning_rate": 3.933483280379048e-05, + "loss": 0.0967, + "num_input_tokens_seen": 81974112, + "step": 67355 + }, + { + "epoch": 7.501948992092661, + "grad_norm": 0.02203340083360672, + "learning_rate": 3.933284209993191e-05, + "loss": 0.0179, + "num_input_tokens_seen": 81980384, + "step": 67360 + }, + { + "epoch": 7.502505846976278, + "grad_norm": 0.2489558309316635, + "learning_rate": 3.933085126068817e-05, + "loss": 0.0488, + "num_input_tokens_seen": 81986624, + "step": 67365 + }, + { + "epoch": 7.503062701859895, + "grad_norm": 0.8847580552101135, + "learning_rate": 3.932886028607805e-05, + "loss": 0.0764, + "num_input_tokens_seen": 81992096, + "step": 67370 + }, + { + "epoch": 7.503619556743512, + "grad_norm": 0.03680189698934555, + "learning_rate": 3.9326869176120376e-05, + "loss": 0.052, + "num_input_tokens_seen": 81998272, + "step": 67375 + }, + { + "epoch": 7.50417641162713, + "grad_norm": 0.18921901285648346, + "learning_rate": 3.932487793083394e-05, + "loss": 0.0513, + "num_input_tokens_seen": 82004384, + "step": 67380 + }, + { + "epoch": 7.504733266510748, + "grad_norm": 0.0004985078703612089, + "learning_rate": 3.932288655023755e-05, + "loss": 0.0124, + "num_input_tokens_seen": 82010688, + "step": 67385 + }, + { + "epoch": 7.505290121394364, + "grad_norm": 0.0256656464189291, + "learning_rate": 3.9320895034350016e-05, + "loss": 0.0395, + "num_input_tokens_seen": 82017120, + "step": 67390 + }, + { + "epoch": 7.505846976277982, + "grad_norm": 0.01091048028320074, + "learning_rate": 3.9318903383190166e-05, + "loss": 0.016, + "num_input_tokens_seen": 82023360, + "step": 67395 + }, + { + "epoch": 7.506403831161599, + "grad_norm": 0.1049766018986702, + "learning_rate": 3.93169115967768e-05, + "loss": 0.0739, + "num_input_tokens_seen": 82029408, + "step": 67400 + }, + { + "epoch": 7.5069606860452165, + "grad_norm": 0.16166198253631592, + "learning_rate": 3.9314919675128726e-05, + "loss": 0.1006, + "num_input_tokens_seen": 82035424, + "step": 67405 + }, + { + "epoch": 7.507517540928834, + "grad_norm": 0.20092128217220306, + "learning_rate": 3.931292761826477e-05, + "loss": 0.0074, + "num_input_tokens_seen": 82041600, + "step": 67410 + }, + { + "epoch": 7.508074395812451, + "grad_norm": 1.7320871353149414, + "learning_rate": 3.9310935426203756e-05, + "loss": 0.0969, + "num_input_tokens_seen": 82047616, + "step": 67415 + }, + { + "epoch": 7.508631250696069, + "grad_norm": 0.5783053040504456, + "learning_rate": 3.930894309896448e-05, + "loss": 0.019, + "num_input_tokens_seen": 82053568, + "step": 67420 + }, + { + "epoch": 7.509188105579686, + "grad_norm": 0.004555635154247284, + "learning_rate": 3.930695063656577e-05, + "loss": 0.1097, + "num_input_tokens_seen": 82059808, + "step": 67425 + }, + { + "epoch": 7.509744960463303, + "grad_norm": 0.00035325094358995557, + "learning_rate": 3.930495803902645e-05, + "loss": 0.1061, + "num_input_tokens_seen": 82065664, + "step": 67430 + }, + { + "epoch": 7.510301815346921, + "grad_norm": 0.1196717843413353, + "learning_rate": 3.930296530636535e-05, + "loss": 0.038, + "num_input_tokens_seen": 82071392, + "step": 67435 + }, + { + "epoch": 7.5108586702305375, + "grad_norm": 0.8295993208885193, + "learning_rate": 3.9300972438601275e-05, + "loss": 0.1066, + "num_input_tokens_seen": 82077056, + "step": 67440 + }, + { + "epoch": 7.511415525114155, + "grad_norm": 0.10963626950979233, + "learning_rate": 3.929897943575306e-05, + "loss": 0.0223, + "num_input_tokens_seen": 82083424, + "step": 67445 + }, + { + "epoch": 7.511972379997773, + "grad_norm": 0.005072288680821657, + "learning_rate": 3.929698629783953e-05, + "loss": 0.06, + "num_input_tokens_seen": 82089632, + "step": 67450 + }, + { + "epoch": 7.51252923488139, + "grad_norm": 0.17454980313777924, + "learning_rate": 3.92949930248795e-05, + "loss": 0.1092, + "num_input_tokens_seen": 82095808, + "step": 67455 + }, + { + "epoch": 7.513086089765007, + "grad_norm": 0.2889329493045807, + "learning_rate": 3.929299961689182e-05, + "loss": 0.0637, + "num_input_tokens_seen": 82102016, + "step": 67460 + }, + { + "epoch": 7.513642944648625, + "grad_norm": 0.48743125796318054, + "learning_rate": 3.92910060738953e-05, + "loss": 0.05, + "num_input_tokens_seen": 82108256, + "step": 67465 + }, + { + "epoch": 7.514199799532242, + "grad_norm": 1.7943921089172363, + "learning_rate": 3.9289012395908785e-05, + "loss": 0.0741, + "num_input_tokens_seen": 82114336, + "step": 67470 + }, + { + "epoch": 7.5147566544158595, + "grad_norm": 0.006070687901228666, + "learning_rate": 3.9287018582951094e-05, + "loss": 0.0598, + "num_input_tokens_seen": 82120544, + "step": 67475 + }, + { + "epoch": 7.515313509299476, + "grad_norm": 1.3054754734039307, + "learning_rate": 3.9285024635041065e-05, + "loss": 0.0864, + "num_input_tokens_seen": 82126592, + "step": 67480 + }, + { + "epoch": 7.515870364183094, + "grad_norm": 0.15032976865768433, + "learning_rate": 3.9283030552197535e-05, + "loss": 0.0234, + "num_input_tokens_seen": 82132704, + "step": 67485 + }, + { + "epoch": 7.516427219066712, + "grad_norm": 0.08099107444286346, + "learning_rate": 3.9281036334439335e-05, + "loss": 0.0454, + "num_input_tokens_seen": 82138848, + "step": 67490 + }, + { + "epoch": 7.516984073950328, + "grad_norm": 1.3205664157867432, + "learning_rate": 3.927904198178531e-05, + "loss": 0.1191, + "num_input_tokens_seen": 82144800, + "step": 67495 + }, + { + "epoch": 7.517540928833946, + "grad_norm": 0.0034071372356265783, + "learning_rate": 3.9277047494254294e-05, + "loss": 0.0205, + "num_input_tokens_seen": 82151168, + "step": 67500 + }, + { + "epoch": 7.518097783717563, + "grad_norm": 0.3147628605365753, + "learning_rate": 3.927505287186512e-05, + "loss": 0.0165, + "num_input_tokens_seen": 82157216, + "step": 67505 + }, + { + "epoch": 7.5186546386011806, + "grad_norm": 1.9505650997161865, + "learning_rate": 3.927305811463664e-05, + "loss": 0.0296, + "num_input_tokens_seen": 82163328, + "step": 67510 + }, + { + "epoch": 7.519211493484798, + "grad_norm": 0.9596752524375916, + "learning_rate": 3.927106322258769e-05, + "loss": 0.0398, + "num_input_tokens_seen": 82169408, + "step": 67515 + }, + { + "epoch": 7.519768348368415, + "grad_norm": 0.0029056204948574305, + "learning_rate": 3.9269068195737116e-05, + "loss": 0.0353, + "num_input_tokens_seen": 82175840, + "step": 67520 + }, + { + "epoch": 7.520325203252033, + "grad_norm": 0.8765672445297241, + "learning_rate": 3.926707303410376e-05, + "loss": 0.0208, + "num_input_tokens_seen": 82182016, + "step": 67525 + }, + { + "epoch": 7.5208820581356495, + "grad_norm": 0.2807515263557434, + "learning_rate": 3.926507773770646e-05, + "loss": 0.1119, + "num_input_tokens_seen": 82188128, + "step": 67530 + }, + { + "epoch": 7.521438913019267, + "grad_norm": 0.048109542578458786, + "learning_rate": 3.926308230656408e-05, + "loss": 0.0823, + "num_input_tokens_seen": 82194464, + "step": 67535 + }, + { + "epoch": 7.521995767902885, + "grad_norm": 0.06499361991882324, + "learning_rate": 3.9261086740695466e-05, + "loss": 0.0307, + "num_input_tokens_seen": 82200736, + "step": 67540 + }, + { + "epoch": 7.522552622786502, + "grad_norm": 0.560907244682312, + "learning_rate": 3.925909104011945e-05, + "loss": 0.0614, + "num_input_tokens_seen": 82206848, + "step": 67545 + }, + { + "epoch": 7.523109477670119, + "grad_norm": 0.2562214136123657, + "learning_rate": 3.92570952048549e-05, + "loss": 0.096, + "num_input_tokens_seen": 82212960, + "step": 67550 + }, + { + "epoch": 7.523666332553736, + "grad_norm": 0.015140119940042496, + "learning_rate": 3.925509923492066e-05, + "loss": 0.1337, + "num_input_tokens_seen": 82219200, + "step": 67555 + }, + { + "epoch": 7.524223187437354, + "grad_norm": 0.019545748829841614, + "learning_rate": 3.925310313033559e-05, + "loss": 0.0093, + "num_input_tokens_seen": 82225600, + "step": 67560 + }, + { + "epoch": 7.524780042320971, + "grad_norm": 0.02319221757352352, + "learning_rate": 3.9251106891118536e-05, + "loss": 0.0378, + "num_input_tokens_seen": 82231648, + "step": 67565 + }, + { + "epoch": 7.525336897204588, + "grad_norm": 0.057296574115753174, + "learning_rate": 3.9249110517288365e-05, + "loss": 0.0279, + "num_input_tokens_seen": 82237920, + "step": 67570 + }, + { + "epoch": 7.525893752088206, + "grad_norm": 0.0382114015519619, + "learning_rate": 3.924711400886393e-05, + "loss": 0.082, + "num_input_tokens_seen": 82244128, + "step": 67575 + }, + { + "epoch": 7.526450606971823, + "grad_norm": 0.8389729261398315, + "learning_rate": 3.9245117365864085e-05, + "loss": 0.1322, + "num_input_tokens_seen": 82249760, + "step": 67580 + }, + { + "epoch": 7.52700746185544, + "grad_norm": 1.2026910781860352, + "learning_rate": 3.92431205883077e-05, + "loss": 0.0462, + "num_input_tokens_seen": 82256096, + "step": 67585 + }, + { + "epoch": 7.527564316739058, + "grad_norm": 0.05928008630871773, + "learning_rate": 3.924112367621362e-05, + "loss": 0.0297, + "num_input_tokens_seen": 82262080, + "step": 67590 + }, + { + "epoch": 7.528121171622675, + "grad_norm": 0.22886225581169128, + "learning_rate": 3.923912662960073e-05, + "loss": 0.0397, + "num_input_tokens_seen": 82268192, + "step": 67595 + }, + { + "epoch": 7.5286780265062925, + "grad_norm": 0.00010139759979210794, + "learning_rate": 3.923712944848787e-05, + "loss": 0.0538, + "num_input_tokens_seen": 82274592, + "step": 67600 + }, + { + "epoch": 7.52923488138991, + "grad_norm": 0.21943505108356476, + "learning_rate": 3.923513213289392e-05, + "loss": 0.101, + "num_input_tokens_seen": 82280064, + "step": 67605 + }, + { + "epoch": 7.529791736273527, + "grad_norm": 0.04641511291265488, + "learning_rate": 3.923313468283774e-05, + "loss": 0.0605, + "num_input_tokens_seen": 82285568, + "step": 67610 + }, + { + "epoch": 7.530348591157145, + "grad_norm": 0.004466154146939516, + "learning_rate": 3.92311370983382e-05, + "loss": 0.0479, + "num_input_tokens_seen": 82291840, + "step": 67615 + }, + { + "epoch": 7.530905446040761, + "grad_norm": 0.0007900000782683492, + "learning_rate": 3.922913937941417e-05, + "loss": 0.0902, + "num_input_tokens_seen": 82297312, + "step": 67620 + }, + { + "epoch": 7.531462300924379, + "grad_norm": 0.00450537446886301, + "learning_rate": 3.9227141526084515e-05, + "loss": 0.0413, + "num_input_tokens_seen": 82303488, + "step": 67625 + }, + { + "epoch": 7.532019155807997, + "grad_norm": 0.0015160322654992342, + "learning_rate": 3.922514353836811e-05, + "loss": 0.0848, + "num_input_tokens_seen": 82309248, + "step": 67630 + }, + { + "epoch": 7.5325760106916135, + "grad_norm": 0.3363499343395233, + "learning_rate": 3.922314541628383e-05, + "loss": 0.0121, + "num_input_tokens_seen": 82315296, + "step": 67635 + }, + { + "epoch": 7.533132865575231, + "grad_norm": 0.000417573464801535, + "learning_rate": 3.9221147159850545e-05, + "loss": 0.0365, + "num_input_tokens_seen": 82321376, + "step": 67640 + }, + { + "epoch": 7.533689720458849, + "grad_norm": 0.012268221005797386, + "learning_rate": 3.9219148769087136e-05, + "loss": 0.0219, + "num_input_tokens_seen": 82327584, + "step": 67645 + }, + { + "epoch": 7.534246575342466, + "grad_norm": 0.0005134324310347438, + "learning_rate": 3.921715024401247e-05, + "loss": 0.0295, + "num_input_tokens_seen": 82333536, + "step": 67650 + }, + { + "epoch": 7.534803430226083, + "grad_norm": 0.4788893163204193, + "learning_rate": 3.9215151584645426e-05, + "loss": 0.0115, + "num_input_tokens_seen": 82339808, + "step": 67655 + }, + { + "epoch": 7.5353602851097, + "grad_norm": 0.00593844149261713, + "learning_rate": 3.921315279100489e-05, + "loss": 0.0206, + "num_input_tokens_seen": 82346080, + "step": 67660 + }, + { + "epoch": 7.535917139993318, + "grad_norm": 0.01439723838120699, + "learning_rate": 3.921115386310974e-05, + "loss": 0.013, + "num_input_tokens_seen": 82351776, + "step": 67665 + }, + { + "epoch": 7.5364739948769355, + "grad_norm": 0.40996238589286804, + "learning_rate": 3.9209154800978856e-05, + "loss": 0.0485, + "num_input_tokens_seen": 82357632, + "step": 67670 + }, + { + "epoch": 7.537030849760552, + "grad_norm": 0.04626179859042168, + "learning_rate": 3.920715560463112e-05, + "loss": 0.0938, + "num_input_tokens_seen": 82363520, + "step": 67675 + }, + { + "epoch": 7.53758770464417, + "grad_norm": 0.3506900370121002, + "learning_rate": 3.920515627408541e-05, + "loss": 0.0198, + "num_input_tokens_seen": 82369728, + "step": 67680 + }, + { + "epoch": 7.538144559527787, + "grad_norm": 0.00047641893615946174, + "learning_rate": 3.920315680936062e-05, + "loss": 0.1522, + "num_input_tokens_seen": 82375712, + "step": 67685 + }, + { + "epoch": 7.538701414411404, + "grad_norm": 0.8342044949531555, + "learning_rate": 3.920115721047564e-05, + "loss": 0.1233, + "num_input_tokens_seen": 82381184, + "step": 67690 + }, + { + "epoch": 7.539258269295022, + "grad_norm": 0.08897053450345993, + "learning_rate": 3.9199157477449357e-05, + "loss": 0.041, + "num_input_tokens_seen": 82387584, + "step": 67695 + }, + { + "epoch": 7.539815124178639, + "grad_norm": 1.6784366369247437, + "learning_rate": 3.919715761030064e-05, + "loss": 0.1818, + "num_input_tokens_seen": 82393792, + "step": 67700 + }, + { + "epoch": 7.5403719790622565, + "grad_norm": 1.0341938734054565, + "learning_rate": 3.9195157609048405e-05, + "loss": 0.1198, + "num_input_tokens_seen": 82399808, + "step": 67705 + }, + { + "epoch": 7.540928833945873, + "grad_norm": 2.1118111610412598, + "learning_rate": 3.9193157473711536e-05, + "loss": 0.0787, + "num_input_tokens_seen": 82406016, + "step": 67710 + }, + { + "epoch": 7.541485688829491, + "grad_norm": 1.2905884981155396, + "learning_rate": 3.9191157204308915e-05, + "loss": 0.031, + "num_input_tokens_seen": 82411840, + "step": 67715 + }, + { + "epoch": 7.542042543713109, + "grad_norm": 1.701995611190796, + "learning_rate": 3.918915680085945e-05, + "loss": 0.1161, + "num_input_tokens_seen": 82418208, + "step": 67720 + }, + { + "epoch": 7.5425993985967255, + "grad_norm": 0.054885849356651306, + "learning_rate": 3.918715626338203e-05, + "loss": 0.0681, + "num_input_tokens_seen": 82424320, + "step": 67725 + }, + { + "epoch": 7.543156253480343, + "grad_norm": 0.6245135068893433, + "learning_rate": 3.9185155591895554e-05, + "loss": 0.0203, + "num_input_tokens_seen": 82430304, + "step": 67730 + }, + { + "epoch": 7.54371310836396, + "grad_norm": 0.03877410292625427, + "learning_rate": 3.918315478641892e-05, + "loss": 0.1098, + "num_input_tokens_seen": 82436608, + "step": 67735 + }, + { + "epoch": 7.544269963247578, + "grad_norm": 0.01590939424932003, + "learning_rate": 3.918115384697102e-05, + "loss": 0.0066, + "num_input_tokens_seen": 82442848, + "step": 67740 + }, + { + "epoch": 7.544826818131195, + "grad_norm": 0.016940267756581306, + "learning_rate": 3.9179152773570764e-05, + "loss": 0.0023, + "num_input_tokens_seen": 82449056, + "step": 67745 + }, + { + "epoch": 7.545383673014812, + "grad_norm": 0.5636134743690491, + "learning_rate": 3.917715156623705e-05, + "loss": 0.0179, + "num_input_tokens_seen": 82455328, + "step": 67750 + }, + { + "epoch": 7.54594052789843, + "grad_norm": 0.0009071716922335327, + "learning_rate": 3.917515022498878e-05, + "loss": 0.0741, + "num_input_tokens_seen": 82461440, + "step": 67755 + }, + { + "epoch": 7.5464973827820465, + "grad_norm": 0.6754312515258789, + "learning_rate": 3.917314874984486e-05, + "loss": 0.0965, + "num_input_tokens_seen": 82467680, + "step": 67760 + }, + { + "epoch": 7.547054237665664, + "grad_norm": 0.548992395401001, + "learning_rate": 3.917114714082419e-05, + "loss": 0.0613, + "num_input_tokens_seen": 82473728, + "step": 67765 + }, + { + "epoch": 7.547611092549282, + "grad_norm": 0.2111649066209793, + "learning_rate": 3.916914539794568e-05, + "loss": 0.0289, + "num_input_tokens_seen": 82480064, + "step": 67770 + }, + { + "epoch": 7.548167947432899, + "grad_norm": 0.0951828807592392, + "learning_rate": 3.916714352122825e-05, + "loss": 0.0187, + "num_input_tokens_seen": 82485984, + "step": 67775 + }, + { + "epoch": 7.548724802316516, + "grad_norm": 0.004151138477027416, + "learning_rate": 3.9165141510690795e-05, + "loss": 0.1455, + "num_input_tokens_seen": 82492192, + "step": 67780 + }, + { + "epoch": 7.549281657200134, + "grad_norm": 0.7308275103569031, + "learning_rate": 3.916313936635223e-05, + "loss": 0.0206, + "num_input_tokens_seen": 82498240, + "step": 67785 + }, + { + "epoch": 7.549838512083751, + "grad_norm": 0.0013166101416572928, + "learning_rate": 3.916113708823146e-05, + "loss": 0.1122, + "num_input_tokens_seen": 82504256, + "step": 67790 + }, + { + "epoch": 7.5503953669673685, + "grad_norm": 0.3631247580051422, + "learning_rate": 3.915913467634741e-05, + "loss": 0.0232, + "num_input_tokens_seen": 82510272, + "step": 67795 + }, + { + "epoch": 7.550952221850985, + "grad_norm": 0.4968021512031555, + "learning_rate": 3.915713213071899e-05, + "loss": 0.0867, + "num_input_tokens_seen": 82516640, + "step": 67800 + }, + { + "epoch": 7.551509076734603, + "grad_norm": 0.44326159358024597, + "learning_rate": 3.9155129451365114e-05, + "loss": 0.0969, + "num_input_tokens_seen": 82522912, + "step": 67805 + }, + { + "epoch": 7.552065931618221, + "grad_norm": 1.431381106376648, + "learning_rate": 3.915312663830469e-05, + "loss": 0.1211, + "num_input_tokens_seen": 82528896, + "step": 67810 + }, + { + "epoch": 7.552622786501837, + "grad_norm": 0.004755524918437004, + "learning_rate": 3.9151123691556656e-05, + "loss": 0.006, + "num_input_tokens_seen": 82534400, + "step": 67815 + }, + { + "epoch": 7.553179641385455, + "grad_norm": 0.32561880350112915, + "learning_rate": 3.9149120611139925e-05, + "loss": 0.0612, + "num_input_tokens_seen": 82540800, + "step": 67820 + }, + { + "epoch": 7.553736496269073, + "grad_norm": 0.0005411801976151764, + "learning_rate": 3.9147117397073396e-05, + "loss": 0.0347, + "num_input_tokens_seen": 82547008, + "step": 67825 + }, + { + "epoch": 7.5542933511526895, + "grad_norm": 0.3003048896789551, + "learning_rate": 3.914511404937603e-05, + "loss": 0.1182, + "num_input_tokens_seen": 82552384, + "step": 67830 + }, + { + "epoch": 7.554850206036307, + "grad_norm": 1.1462327241897583, + "learning_rate": 3.914311056806671e-05, + "loss": 0.1077, + "num_input_tokens_seen": 82558560, + "step": 67835 + }, + { + "epoch": 7.555407060919924, + "grad_norm": 0.20711766183376312, + "learning_rate": 3.9141106953164393e-05, + "loss": 0.0112, + "num_input_tokens_seen": 82564928, + "step": 67840 + }, + { + "epoch": 7.555963915803542, + "grad_norm": 1.4249804019927979, + "learning_rate": 3.9139103204687986e-05, + "loss": 0.0495, + "num_input_tokens_seen": 82571232, + "step": 67845 + }, + { + "epoch": 7.556520770687159, + "grad_norm": 0.06551822274923325, + "learning_rate": 3.9137099322656424e-05, + "loss": 0.0349, + "num_input_tokens_seen": 82577536, + "step": 67850 + }, + { + "epoch": 7.557077625570776, + "grad_norm": 0.7027388215065002, + "learning_rate": 3.913509530708862e-05, + "loss": 0.0327, + "num_input_tokens_seen": 82583520, + "step": 67855 + }, + { + "epoch": 7.557634480454394, + "grad_norm": 0.05937555804848671, + "learning_rate": 3.913309115800353e-05, + "loss": 0.2436, + "num_input_tokens_seen": 82589504, + "step": 67860 + }, + { + "epoch": 7.558191335338011, + "grad_norm": 0.0684443786740303, + "learning_rate": 3.913108687542006e-05, + "loss": 0.041, + "num_input_tokens_seen": 82595616, + "step": 67865 + }, + { + "epoch": 7.558748190221628, + "grad_norm": 0.2011694461107254, + "learning_rate": 3.912908245935717e-05, + "loss": 0.0824, + "num_input_tokens_seen": 82601600, + "step": 67870 + }, + { + "epoch": 7.559305045105246, + "grad_norm": 0.00017883178952615708, + "learning_rate": 3.912707790983376e-05, + "loss": 0.0075, + "num_input_tokens_seen": 82607296, + "step": 67875 + }, + { + "epoch": 7.559861899988863, + "grad_norm": 0.03800255432724953, + "learning_rate": 3.912507322686879e-05, + "loss": 0.0543, + "num_input_tokens_seen": 82613408, + "step": 67880 + }, + { + "epoch": 7.56041875487248, + "grad_norm": 0.16437314450740814, + "learning_rate": 3.9123068410481176e-05, + "loss": 0.0231, + "num_input_tokens_seen": 82619520, + "step": 67885 + }, + { + "epoch": 7.560975609756097, + "grad_norm": 0.07617994397878647, + "learning_rate": 3.9121063460689876e-05, + "loss": 0.1774, + "num_input_tokens_seen": 82625792, + "step": 67890 + }, + { + "epoch": 7.561532464639715, + "grad_norm": 1.0773627758026123, + "learning_rate": 3.911905837751382e-05, + "loss": 0.0243, + "num_input_tokens_seen": 82631936, + "step": 67895 + }, + { + "epoch": 7.5620893195233325, + "grad_norm": 1.4387327432632446, + "learning_rate": 3.911705316097194e-05, + "loss": 0.0963, + "num_input_tokens_seen": 82638016, + "step": 67900 + }, + { + "epoch": 7.562646174406949, + "grad_norm": 0.1906844824552536, + "learning_rate": 3.911504781108318e-05, + "loss": 0.0068, + "num_input_tokens_seen": 82644000, + "step": 67905 + }, + { + "epoch": 7.563203029290567, + "grad_norm": 0.04955271631479263, + "learning_rate": 3.911304232786649e-05, + "loss": 0.0097, + "num_input_tokens_seen": 82650016, + "step": 67910 + }, + { + "epoch": 7.563759884174184, + "grad_norm": 0.011835894547402859, + "learning_rate": 3.91110367113408e-05, + "loss": 0.053, + "num_input_tokens_seen": 82655904, + "step": 67915 + }, + { + "epoch": 7.564316739057801, + "grad_norm": 0.17152968049049377, + "learning_rate": 3.9109030961525066e-05, + "loss": 0.0715, + "num_input_tokens_seen": 82661888, + "step": 67920 + }, + { + "epoch": 7.564873593941419, + "grad_norm": 1.0964246988296509, + "learning_rate": 3.910702507843823e-05, + "loss": 0.0211, + "num_input_tokens_seen": 82668128, + "step": 67925 + }, + { + "epoch": 7.565430448825036, + "grad_norm": 0.10257866978645325, + "learning_rate": 3.910501906209925e-05, + "loss": 0.0793, + "num_input_tokens_seen": 82674368, + "step": 67930 + }, + { + "epoch": 7.565987303708654, + "grad_norm": 0.0063378578051924706, + "learning_rate": 3.9103012912527054e-05, + "loss": 0.0139, + "num_input_tokens_seen": 82680672, + "step": 67935 + }, + { + "epoch": 7.56654415859227, + "grad_norm": 0.2705498933792114, + "learning_rate": 3.9101006629740604e-05, + "loss": 0.1245, + "num_input_tokens_seen": 82686656, + "step": 67940 + }, + { + "epoch": 7.567101013475888, + "grad_norm": 0.17411300539970398, + "learning_rate": 3.9099000213758854e-05, + "loss": 0.109, + "num_input_tokens_seen": 82692320, + "step": 67945 + }, + { + "epoch": 7.567657868359506, + "grad_norm": 0.3571438491344452, + "learning_rate": 3.909699366460074e-05, + "loss": 0.0683, + "num_input_tokens_seen": 82698624, + "step": 67950 + }, + { + "epoch": 7.5682147232431225, + "grad_norm": 0.006011957302689552, + "learning_rate": 3.909498698228523e-05, + "loss": 0.0521, + "num_input_tokens_seen": 82704416, + "step": 67955 + }, + { + "epoch": 7.56877157812674, + "grad_norm": 1.6893657445907593, + "learning_rate": 3.909298016683128e-05, + "loss": 0.052, + "num_input_tokens_seen": 82710624, + "step": 67960 + }, + { + "epoch": 7.569328433010358, + "grad_norm": 0.14980342984199524, + "learning_rate": 3.909097321825784e-05, + "loss": 0.0218, + "num_input_tokens_seen": 82716768, + "step": 67965 + }, + { + "epoch": 7.569885287893975, + "grad_norm": 0.000589256698731333, + "learning_rate": 3.908896613658387e-05, + "loss": 0.0216, + "num_input_tokens_seen": 82722240, + "step": 67970 + }, + { + "epoch": 7.570442142777592, + "grad_norm": 0.14221668243408203, + "learning_rate": 3.908695892182832e-05, + "loss": 0.0794, + "num_input_tokens_seen": 82728288, + "step": 67975 + }, + { + "epoch": 7.57099899766121, + "grad_norm": 0.08811970055103302, + "learning_rate": 3.908495157401017e-05, + "loss": 0.0446, + "num_input_tokens_seen": 82734080, + "step": 67980 + }, + { + "epoch": 7.571555852544827, + "grad_norm": 0.2635619640350342, + "learning_rate": 3.9082944093148354e-05, + "loss": 0.0403, + "num_input_tokens_seen": 82740000, + "step": 67985 + }, + { + "epoch": 7.5721127074284444, + "grad_norm": 0.7022351026535034, + "learning_rate": 3.908093647926185e-05, + "loss": 0.1319, + "num_input_tokens_seen": 82745600, + "step": 67990 + }, + { + "epoch": 7.572669562312061, + "grad_norm": 0.44974783062934875, + "learning_rate": 3.907892873236962e-05, + "loss": 0.1499, + "num_input_tokens_seen": 82751936, + "step": 67995 + }, + { + "epoch": 7.573226417195679, + "grad_norm": 0.39339399337768555, + "learning_rate": 3.907692085249064e-05, + "loss": 0.0088, + "num_input_tokens_seen": 82758112, + "step": 68000 + }, + { + "epoch": 7.573783272079297, + "grad_norm": 0.7765460014343262, + "learning_rate": 3.907491283964385e-05, + "loss": 0.0546, + "num_input_tokens_seen": 82764256, + "step": 68005 + }, + { + "epoch": 7.574340126962913, + "grad_norm": 0.00038652110379189253, + "learning_rate": 3.907290469384824e-05, + "loss": 0.0197, + "num_input_tokens_seen": 82769824, + "step": 68010 + }, + { + "epoch": 7.574896981846531, + "grad_norm": 0.000552893616259098, + "learning_rate": 3.9070896415122765e-05, + "loss": 0.0529, + "num_input_tokens_seen": 82775936, + "step": 68015 + }, + { + "epoch": 7.575453836730148, + "grad_norm": 1.626456379890442, + "learning_rate": 3.90688880034864e-05, + "loss": 0.1045, + "num_input_tokens_seen": 82782464, + "step": 68020 + }, + { + "epoch": 7.5760106916137655, + "grad_norm": 0.907427191734314, + "learning_rate": 3.9066879458958114e-05, + "loss": 0.0628, + "num_input_tokens_seen": 82788544, + "step": 68025 + }, + { + "epoch": 7.576567546497383, + "grad_norm": 2.1409037113189697, + "learning_rate": 3.906487078155689e-05, + "loss": 0.1808, + "num_input_tokens_seen": 82794688, + "step": 68030 + }, + { + "epoch": 7.577124401381, + "grad_norm": 0.6862075328826904, + "learning_rate": 3.906286197130169e-05, + "loss": 0.0599, + "num_input_tokens_seen": 82800672, + "step": 68035 + }, + { + "epoch": 7.577681256264618, + "grad_norm": 0.38296976685523987, + "learning_rate": 3.9060853028211485e-05, + "loss": 0.0389, + "num_input_tokens_seen": 82806816, + "step": 68040 + }, + { + "epoch": 7.578238111148234, + "grad_norm": 2.3308205604553223, + "learning_rate": 3.905884395230527e-05, + "loss": 0.0616, + "num_input_tokens_seen": 82812832, + "step": 68045 + }, + { + "epoch": 7.578794966031852, + "grad_norm": 0.12031202763319016, + "learning_rate": 3.9056834743602e-05, + "loss": 0.0165, + "num_input_tokens_seen": 82819200, + "step": 68050 + }, + { + "epoch": 7.57935182091547, + "grad_norm": 0.016687309369444847, + "learning_rate": 3.9054825402120665e-05, + "loss": 0.0848, + "num_input_tokens_seen": 82825184, + "step": 68055 + }, + { + "epoch": 7.579908675799087, + "grad_norm": 0.009088855236768723, + "learning_rate": 3.9052815927880244e-05, + "loss": 0.0123, + "num_input_tokens_seen": 82831264, + "step": 68060 + }, + { + "epoch": 7.580465530682704, + "grad_norm": 1.9623733758926392, + "learning_rate": 3.905080632089972e-05, + "loss": 0.0403, + "num_input_tokens_seen": 82837408, + "step": 68065 + }, + { + "epoch": 7.581022385566321, + "grad_norm": 0.002400551922619343, + "learning_rate": 3.9048796581198074e-05, + "loss": 0.0095, + "num_input_tokens_seen": 82843712, + "step": 68070 + }, + { + "epoch": 7.581579240449939, + "grad_norm": 0.2867199182510376, + "learning_rate": 3.904678670879428e-05, + "loss": 0.0395, + "num_input_tokens_seen": 82850112, + "step": 68075 + }, + { + "epoch": 7.582136095333556, + "grad_norm": 0.19866397976875305, + "learning_rate": 3.904477670370734e-05, + "loss": 0.0632, + "num_input_tokens_seen": 82856736, + "step": 68080 + }, + { + "epoch": 7.582692950217173, + "grad_norm": 0.004790972918272018, + "learning_rate": 3.904276656595622e-05, + "loss": 0.0155, + "num_input_tokens_seen": 82862752, + "step": 68085 + }, + { + "epoch": 7.583249805100791, + "grad_norm": 0.05909550189971924, + "learning_rate": 3.904075629555993e-05, + "loss": 0.1472, + "num_input_tokens_seen": 82868736, + "step": 68090 + }, + { + "epoch": 7.583806659984408, + "grad_norm": 1.2199018001556396, + "learning_rate": 3.9038745892537454e-05, + "loss": 0.0989, + "num_input_tokens_seen": 82875008, + "step": 68095 + }, + { + "epoch": 7.584363514868025, + "grad_norm": 1.1484671831130981, + "learning_rate": 3.903673535690776e-05, + "loss": 0.153, + "num_input_tokens_seen": 82880672, + "step": 68100 + }, + { + "epoch": 7.584920369751643, + "grad_norm": 0.01282356958836317, + "learning_rate": 3.903472468868987e-05, + "loss": 0.0118, + "num_input_tokens_seen": 82887040, + "step": 68105 + }, + { + "epoch": 7.58547722463526, + "grad_norm": 0.05175389349460602, + "learning_rate": 3.903271388790275e-05, + "loss": 0.016, + "num_input_tokens_seen": 82893408, + "step": 68110 + }, + { + "epoch": 7.586034079518877, + "grad_norm": 0.8797822594642639, + "learning_rate": 3.9030702954565404e-05, + "loss": 0.0911, + "num_input_tokens_seen": 82899264, + "step": 68115 + }, + { + "epoch": 7.586590934402494, + "grad_norm": 0.03478094935417175, + "learning_rate": 3.9028691888696834e-05, + "loss": 0.0071, + "num_input_tokens_seen": 82905504, + "step": 68120 + }, + { + "epoch": 7.587147789286112, + "grad_norm": 0.4260115623474121, + "learning_rate": 3.902668069031602e-05, + "loss": 0.0763, + "num_input_tokens_seen": 82911488, + "step": 68125 + }, + { + "epoch": 7.58770464416973, + "grad_norm": 0.000298459199257195, + "learning_rate": 3.9024669359441976e-05, + "loss": 0.0427, + "num_input_tokens_seen": 82917696, + "step": 68130 + }, + { + "epoch": 7.588261499053346, + "grad_norm": 0.664822518825531, + "learning_rate": 3.9022657896093696e-05, + "loss": 0.0604, + "num_input_tokens_seen": 82923776, + "step": 68135 + }, + { + "epoch": 7.588818353936964, + "grad_norm": 1.6707249879837036, + "learning_rate": 3.902064630029017e-05, + "loss": 0.103, + "num_input_tokens_seen": 82929344, + "step": 68140 + }, + { + "epoch": 7.589375208820582, + "grad_norm": 0.7255804538726807, + "learning_rate": 3.901863457205041e-05, + "loss": 0.0168, + "num_input_tokens_seen": 82935616, + "step": 68145 + }, + { + "epoch": 7.5899320637041985, + "grad_norm": 0.6128475069999695, + "learning_rate": 3.9016622711393416e-05, + "loss": 0.0314, + "num_input_tokens_seen": 82941664, + "step": 68150 + }, + { + "epoch": 7.590488918587816, + "grad_norm": 0.08901812881231308, + "learning_rate": 3.9014610718338195e-05, + "loss": 0.0087, + "num_input_tokens_seen": 82947552, + "step": 68155 + }, + { + "epoch": 7.591045773471434, + "grad_norm": 0.6177788376808167, + "learning_rate": 3.901259859290374e-05, + "loss": 0.0345, + "num_input_tokens_seen": 82953792, + "step": 68160 + }, + { + "epoch": 7.591602628355051, + "grad_norm": 0.006172314751893282, + "learning_rate": 3.901058633510907e-05, + "loss": 0.0411, + "num_input_tokens_seen": 82959840, + "step": 68165 + }, + { + "epoch": 7.592159483238668, + "grad_norm": 0.004935950040817261, + "learning_rate": 3.900857394497318e-05, + "loss": 0.039, + "num_input_tokens_seen": 82965856, + "step": 68170 + }, + { + "epoch": 7.592716338122285, + "grad_norm": 0.0008456719224341214, + "learning_rate": 3.9006561422515084e-05, + "loss": 0.0167, + "num_input_tokens_seen": 82972160, + "step": 68175 + }, + { + "epoch": 7.593273193005903, + "grad_norm": 1.0793503522872925, + "learning_rate": 3.90045487677538e-05, + "loss": 0.0446, + "num_input_tokens_seen": 82978016, + "step": 68180 + }, + { + "epoch": 7.59383004788952, + "grad_norm": 1.0319331884384155, + "learning_rate": 3.900253598070833e-05, + "loss": 0.135, + "num_input_tokens_seen": 82984032, + "step": 68185 + }, + { + "epoch": 7.594386902773137, + "grad_norm": 0.4154508411884308, + "learning_rate": 3.9000523061397695e-05, + "loss": 0.0274, + "num_input_tokens_seen": 82989888, + "step": 68190 + }, + { + "epoch": 7.594943757656755, + "grad_norm": 0.08821689337491989, + "learning_rate": 3.899851000984089e-05, + "loss": 0.0833, + "num_input_tokens_seen": 82996192, + "step": 68195 + }, + { + "epoch": 7.595500612540372, + "grad_norm": 0.05768866091966629, + "learning_rate": 3.8996496826056956e-05, + "loss": 0.0226, + "num_input_tokens_seen": 83002400, + "step": 68200 + }, + { + "epoch": 7.596057467423989, + "grad_norm": 0.026056112721562386, + "learning_rate": 3.899448351006489e-05, + "loss": 0.0587, + "num_input_tokens_seen": 83007456, + "step": 68205 + }, + { + "epoch": 7.596614322307607, + "grad_norm": 0.19424504041671753, + "learning_rate": 3.899247006188371e-05, + "loss": 0.0329, + "num_input_tokens_seen": 83013632, + "step": 68210 + }, + { + "epoch": 7.597171177191224, + "grad_norm": 0.3315524756908417, + "learning_rate": 3.899045648153245e-05, + "loss": 0.01, + "num_input_tokens_seen": 83019552, + "step": 68215 + }, + { + "epoch": 7.5977280320748415, + "grad_norm": 0.052870020270347595, + "learning_rate": 3.898844276903011e-05, + "loss": 0.0025, + "num_input_tokens_seen": 83025824, + "step": 68220 + }, + { + "epoch": 7.598284886958458, + "grad_norm": 0.002250376623123884, + "learning_rate": 3.898642892439573e-05, + "loss": 0.0673, + "num_input_tokens_seen": 83031744, + "step": 68225 + }, + { + "epoch": 7.598841741842076, + "grad_norm": 0.013269650749862194, + "learning_rate": 3.8984414947648316e-05, + "loss": 0.0231, + "num_input_tokens_seen": 83037792, + "step": 68230 + }, + { + "epoch": 7.599398596725694, + "grad_norm": 0.08115176856517792, + "learning_rate": 3.8982400838806903e-05, + "loss": 0.0489, + "num_input_tokens_seen": 83043584, + "step": 68235 + }, + { + "epoch": 7.59995545160931, + "grad_norm": 0.1241365447640419, + "learning_rate": 3.8980386597890503e-05, + "loss": 0.148, + "num_input_tokens_seen": 83049920, + "step": 68240 + }, + { + "epoch": 7.600512306492928, + "grad_norm": 0.3421754539012909, + "learning_rate": 3.8978372224918164e-05, + "loss": 0.1545, + "num_input_tokens_seen": 83056064, + "step": 68245 + }, + { + "epoch": 7.601069161376545, + "grad_norm": 1.0321003198623657, + "learning_rate": 3.89763577199089e-05, + "loss": 0.0889, + "num_input_tokens_seen": 83061984, + "step": 68250 + }, + { + "epoch": 7.6016260162601625, + "grad_norm": 0.00523621728643775, + "learning_rate": 3.897434308288173e-05, + "loss": 0.0081, + "num_input_tokens_seen": 83068128, + "step": 68255 + }, + { + "epoch": 7.60218287114378, + "grad_norm": 0.5006517767906189, + "learning_rate": 3.897232831385569e-05, + "loss": 0.0812, + "num_input_tokens_seen": 83074464, + "step": 68260 + }, + { + "epoch": 7.602739726027397, + "grad_norm": 0.0020612964872270823, + "learning_rate": 3.8970313412849826e-05, + "loss": 0.0178, + "num_input_tokens_seen": 83080576, + "step": 68265 + }, + { + "epoch": 7.603296580911015, + "grad_norm": 0.5354617834091187, + "learning_rate": 3.896829837988315e-05, + "loss": 0.03, + "num_input_tokens_seen": 83086624, + "step": 68270 + }, + { + "epoch": 7.6038534357946315, + "grad_norm": 0.410526841878891, + "learning_rate": 3.8966283214974706e-05, + "loss": 0.0501, + "num_input_tokens_seen": 83092512, + "step": 68275 + }, + { + "epoch": 7.604410290678249, + "grad_norm": 1.6352472305297852, + "learning_rate": 3.896426791814353e-05, + "loss": 0.1443, + "num_input_tokens_seen": 83097856, + "step": 68280 + }, + { + "epoch": 7.604967145561867, + "grad_norm": 0.09126324951648712, + "learning_rate": 3.896225248940866e-05, + "loss": 0.0308, + "num_input_tokens_seen": 83103872, + "step": 68285 + }, + { + "epoch": 7.605524000445484, + "grad_norm": 0.6234011054039001, + "learning_rate": 3.896023692878912e-05, + "loss": 0.0699, + "num_input_tokens_seen": 83110272, + "step": 68290 + }, + { + "epoch": 7.606080855329101, + "grad_norm": 0.20259325206279755, + "learning_rate": 3.895822123630396e-05, + "loss": 0.0706, + "num_input_tokens_seen": 83115872, + "step": 68295 + }, + { + "epoch": 7.606637710212719, + "grad_norm": 0.09381883591413498, + "learning_rate": 3.8956205411972226e-05, + "loss": 0.0582, + "num_input_tokens_seen": 83121952, + "step": 68300 + }, + { + "epoch": 7.607194565096336, + "grad_norm": 0.8695944547653198, + "learning_rate": 3.895418945581294e-05, + "loss": 0.1063, + "num_input_tokens_seen": 83128032, + "step": 68305 + }, + { + "epoch": 7.607751419979953, + "grad_norm": 0.9478486180305481, + "learning_rate": 3.8952173367845154e-05, + "loss": 0.0547, + "num_input_tokens_seen": 83134368, + "step": 68310 + }, + { + "epoch": 7.60830827486357, + "grad_norm": 0.23353633284568787, + "learning_rate": 3.895015714808792e-05, + "loss": 0.0196, + "num_input_tokens_seen": 83140640, + "step": 68315 + }, + { + "epoch": 7.608865129747188, + "grad_norm": 0.019462276250123978, + "learning_rate": 3.894814079656027e-05, + "loss": 0.2033, + "num_input_tokens_seen": 83146880, + "step": 68320 + }, + { + "epoch": 7.6094219846308055, + "grad_norm": 0.6600297689437866, + "learning_rate": 3.894612431328126e-05, + "loss": 0.0232, + "num_input_tokens_seen": 83153056, + "step": 68325 + }, + { + "epoch": 7.609978839514422, + "grad_norm": 0.006253599654883146, + "learning_rate": 3.8944107698269924e-05, + "loss": 0.1243, + "num_input_tokens_seen": 83159168, + "step": 68330 + }, + { + "epoch": 7.61053569439804, + "grad_norm": 0.054975591599941254, + "learning_rate": 3.894209095154533e-05, + "loss": 0.0502, + "num_input_tokens_seen": 83165056, + "step": 68335 + }, + { + "epoch": 7.611092549281658, + "grad_norm": 0.07620427757501602, + "learning_rate": 3.894007407312651e-05, + "loss": 0.013, + "num_input_tokens_seen": 83171200, + "step": 68340 + }, + { + "epoch": 7.6116494041652745, + "grad_norm": 0.009196859784424305, + "learning_rate": 3.893805706303252e-05, + "loss": 0.0263, + "num_input_tokens_seen": 83177408, + "step": 68345 + }, + { + "epoch": 7.612206259048892, + "grad_norm": 0.11136288195848465, + "learning_rate": 3.893603992128242e-05, + "loss": 0.1327, + "num_input_tokens_seen": 83183232, + "step": 68350 + }, + { + "epoch": 7.612763113932509, + "grad_norm": 0.16107767820358276, + "learning_rate": 3.8934022647895254e-05, + "loss": 0.0098, + "num_input_tokens_seen": 83189344, + "step": 68355 + }, + { + "epoch": 7.613319968816127, + "grad_norm": 0.9271842241287231, + "learning_rate": 3.893200524289008e-05, + "loss": 0.0918, + "num_input_tokens_seen": 83195680, + "step": 68360 + }, + { + "epoch": 7.613876823699744, + "grad_norm": 0.05197238177061081, + "learning_rate": 3.8929987706285954e-05, + "loss": 0.0102, + "num_input_tokens_seen": 83201760, + "step": 68365 + }, + { + "epoch": 7.614433678583361, + "grad_norm": 0.0003594201698433608, + "learning_rate": 3.8927970038101936e-05, + "loss": 0.027, + "num_input_tokens_seen": 83207936, + "step": 68370 + }, + { + "epoch": 7.614990533466979, + "grad_norm": 0.1424921602010727, + "learning_rate": 3.892595223835708e-05, + "loss": 0.0361, + "num_input_tokens_seen": 83214176, + "step": 68375 + }, + { + "epoch": 7.6155473883505955, + "grad_norm": 1.9437620639801025, + "learning_rate": 3.8923934307070445e-05, + "loss": 0.1139, + "num_input_tokens_seen": 83220448, + "step": 68380 + }, + { + "epoch": 7.616104243234213, + "grad_norm": 1.168162226676941, + "learning_rate": 3.89219162442611e-05, + "loss": 0.0745, + "num_input_tokens_seen": 83226880, + "step": 68385 + }, + { + "epoch": 7.616661098117831, + "grad_norm": 0.012404710054397583, + "learning_rate": 3.89198980499481e-05, + "loss": 0.0261, + "num_input_tokens_seen": 83232960, + "step": 68390 + }, + { + "epoch": 7.617217953001448, + "grad_norm": 0.6959453821182251, + "learning_rate": 3.891787972415051e-05, + "loss": 0.0457, + "num_input_tokens_seen": 83239008, + "step": 68395 + }, + { + "epoch": 7.617774807885065, + "grad_norm": 0.0005585880135186017, + "learning_rate": 3.89158612668874e-05, + "loss": 0.0156, + "num_input_tokens_seen": 83245280, + "step": 68400 + }, + { + "epoch": 7.618331662768682, + "grad_norm": 0.012734968215227127, + "learning_rate": 3.8913842678177825e-05, + "loss": 0.072, + "num_input_tokens_seen": 83251360, + "step": 68405 + }, + { + "epoch": 7.6188885176523, + "grad_norm": 0.018894974142313004, + "learning_rate": 3.891182395804086e-05, + "loss": 0.0098, + "num_input_tokens_seen": 83257248, + "step": 68410 + }, + { + "epoch": 7.6194453725359175, + "grad_norm": 0.006461226847022772, + "learning_rate": 3.890980510649557e-05, + "loss": 0.0517, + "num_input_tokens_seen": 83263584, + "step": 68415 + }, + { + "epoch": 7.620002227419534, + "grad_norm": 0.11158408224582672, + "learning_rate": 3.890778612356103e-05, + "loss": 0.1144, + "num_input_tokens_seen": 83269376, + "step": 68420 + }, + { + "epoch": 7.620559082303152, + "grad_norm": 0.017134837806224823, + "learning_rate": 3.89057670092563e-05, + "loss": 0.0827, + "num_input_tokens_seen": 83275808, + "step": 68425 + }, + { + "epoch": 7.621115937186769, + "grad_norm": 1.944689154624939, + "learning_rate": 3.8903747763600466e-05, + "loss": 0.0387, + "num_input_tokens_seen": 83281760, + "step": 68430 + }, + { + "epoch": 7.621672792070386, + "grad_norm": 0.4059062898159027, + "learning_rate": 3.8901728386612594e-05, + "loss": 0.0207, + "num_input_tokens_seen": 83288032, + "step": 68435 + }, + { + "epoch": 7.622229646954004, + "grad_norm": 2.3072617053985596, + "learning_rate": 3.8899708878311765e-05, + "loss": 0.1031, + "num_input_tokens_seen": 83293920, + "step": 68440 + }, + { + "epoch": 7.622786501837621, + "grad_norm": 1.1684895753860474, + "learning_rate": 3.889768923871704e-05, + "loss": 0.1547, + "num_input_tokens_seen": 83299712, + "step": 68445 + }, + { + "epoch": 7.6233433567212385, + "grad_norm": 1.1342554092407227, + "learning_rate": 3.889566946784751e-05, + "loss": 0.1256, + "num_input_tokens_seen": 83305824, + "step": 68450 + }, + { + "epoch": 7.623900211604855, + "grad_norm": 0.2036711573600769, + "learning_rate": 3.8893649565722244e-05, + "loss": 0.0248, + "num_input_tokens_seen": 83312128, + "step": 68455 + }, + { + "epoch": 7.624457066488473, + "grad_norm": 0.0001240921119460836, + "learning_rate": 3.8891629532360334e-05, + "loss": 0.0467, + "num_input_tokens_seen": 83318496, + "step": 68460 + }, + { + "epoch": 7.625013921372091, + "grad_norm": 0.7135699391365051, + "learning_rate": 3.888960936778086e-05, + "loss": 0.0197, + "num_input_tokens_seen": 83324736, + "step": 68465 + }, + { + "epoch": 7.6255707762557075, + "grad_norm": 1.7669724225997925, + "learning_rate": 3.8887589072002876e-05, + "loss": 0.0487, + "num_input_tokens_seen": 83330496, + "step": 68470 + }, + { + "epoch": 7.626127631139325, + "grad_norm": 1.4402599334716797, + "learning_rate": 3.88855686450455e-05, + "loss": 0.072, + "num_input_tokens_seen": 83336800, + "step": 68475 + }, + { + "epoch": 7.626684486022943, + "grad_norm": 0.897030770778656, + "learning_rate": 3.88835480869278e-05, + "loss": 0.0899, + "num_input_tokens_seen": 83342656, + "step": 68480 + }, + { + "epoch": 7.62724134090656, + "grad_norm": 0.4070292115211487, + "learning_rate": 3.888152739766887e-05, + "loss": 0.0456, + "num_input_tokens_seen": 83348320, + "step": 68485 + }, + { + "epoch": 7.627798195790177, + "grad_norm": 0.27790209650993347, + "learning_rate": 3.8879506577287786e-05, + "loss": 0.0052, + "num_input_tokens_seen": 83354528, + "step": 68490 + }, + { + "epoch": 7.628355050673794, + "grad_norm": 1.1098469495773315, + "learning_rate": 3.887748562580364e-05, + "loss": 0.04, + "num_input_tokens_seen": 83360800, + "step": 68495 + }, + { + "epoch": 7.628911905557412, + "grad_norm": 1.4564226865768433, + "learning_rate": 3.8875464543235527e-05, + "loss": 0.1191, + "num_input_tokens_seen": 83367008, + "step": 68500 + }, + { + "epoch": 7.629468760441029, + "grad_norm": 1.1786707639694214, + "learning_rate": 3.8873443329602547e-05, + "loss": 0.0974, + "num_input_tokens_seen": 83373024, + "step": 68505 + }, + { + "epoch": 7.630025615324646, + "grad_norm": 0.2744865417480469, + "learning_rate": 3.8871421984923764e-05, + "loss": 0.0536, + "num_input_tokens_seen": 83379360, + "step": 68510 + }, + { + "epoch": 7.630582470208264, + "grad_norm": 0.15647386014461517, + "learning_rate": 3.886940050921829e-05, + "loss": 0.0666, + "num_input_tokens_seen": 83385600, + "step": 68515 + }, + { + "epoch": 7.6311393250918815, + "grad_norm": 1.5330206155776978, + "learning_rate": 3.8867378902505216e-05, + "loss": 0.0688, + "num_input_tokens_seen": 83391552, + "step": 68520 + }, + { + "epoch": 7.631696179975498, + "grad_norm": 0.8677172660827637, + "learning_rate": 3.886535716480364e-05, + "loss": 0.0168, + "num_input_tokens_seen": 83396960, + "step": 68525 + }, + { + "epoch": 7.632253034859116, + "grad_norm": 0.0043586017563939095, + "learning_rate": 3.886333529613266e-05, + "loss": 0.0052, + "num_input_tokens_seen": 83403104, + "step": 68530 + }, + { + "epoch": 7.632809889742733, + "grad_norm": 0.502858579158783, + "learning_rate": 3.8861313296511367e-05, + "loss": 0.0336, + "num_input_tokens_seen": 83408864, + "step": 68535 + }, + { + "epoch": 7.6333667446263505, + "grad_norm": 0.0009652055450715125, + "learning_rate": 3.8859291165958865e-05, + "loss": 0.069, + "num_input_tokens_seen": 83415360, + "step": 68540 + }, + { + "epoch": 7.633923599509968, + "grad_norm": 0.9292700886726379, + "learning_rate": 3.885726890449425e-05, + "loss": 0.0206, + "num_input_tokens_seen": 83421824, + "step": 68545 + }, + { + "epoch": 7.634480454393585, + "grad_norm": 0.0012844145530834794, + "learning_rate": 3.885524651213663e-05, + "loss": 0.0634, + "num_input_tokens_seen": 83428032, + "step": 68550 + }, + { + "epoch": 7.635037309277203, + "grad_norm": 0.06413327157497406, + "learning_rate": 3.885322398890511e-05, + "loss": 0.0121, + "num_input_tokens_seen": 83434144, + "step": 68555 + }, + { + "epoch": 7.635594164160819, + "grad_norm": 0.7801394462585449, + "learning_rate": 3.8851201334818796e-05, + "loss": 0.1421, + "num_input_tokens_seen": 83439776, + "step": 68560 + }, + { + "epoch": 7.636151019044437, + "grad_norm": 0.4798838496208191, + "learning_rate": 3.884917854989678e-05, + "loss": 0.0214, + "num_input_tokens_seen": 83445504, + "step": 68565 + }, + { + "epoch": 7.636707873928055, + "grad_norm": 0.12658682465553284, + "learning_rate": 3.884715563415817e-05, + "loss": 0.0214, + "num_input_tokens_seen": 83451584, + "step": 68570 + }, + { + "epoch": 7.6372647288116715, + "grad_norm": 1.2168947458267212, + "learning_rate": 3.884513258762209e-05, + "loss": 0.0917, + "num_input_tokens_seen": 83457632, + "step": 68575 + }, + { + "epoch": 7.637821583695289, + "grad_norm": 0.8259190917015076, + "learning_rate": 3.884310941030764e-05, + "loss": 0.0146, + "num_input_tokens_seen": 83463840, + "step": 68580 + }, + { + "epoch": 7.638378438578906, + "grad_norm": 0.015542513690888882, + "learning_rate": 3.884108610223393e-05, + "loss": 0.0275, + "num_input_tokens_seen": 83469696, + "step": 68585 + }, + { + "epoch": 7.638935293462524, + "grad_norm": 0.007232564967125654, + "learning_rate": 3.8839062663420065e-05, + "loss": 0.1187, + "num_input_tokens_seen": 83475904, + "step": 68590 + }, + { + "epoch": 7.639492148346141, + "grad_norm": 0.1741229146718979, + "learning_rate": 3.8837039093885174e-05, + "loss": 0.058, + "num_input_tokens_seen": 83481952, + "step": 68595 + }, + { + "epoch": 7.640049003229758, + "grad_norm": 0.030451368540525436, + "learning_rate": 3.883501539364836e-05, + "loss": 0.0165, + "num_input_tokens_seen": 83488320, + "step": 68600 + }, + { + "epoch": 7.640605858113376, + "grad_norm": 0.8866745233535767, + "learning_rate": 3.8832991562728745e-05, + "loss": 0.0535, + "num_input_tokens_seen": 83494624, + "step": 68605 + }, + { + "epoch": 7.641162712996993, + "grad_norm": 0.5312390327453613, + "learning_rate": 3.883096760114543e-05, + "loss": 0.0432, + "num_input_tokens_seen": 83500608, + "step": 68610 + }, + { + "epoch": 7.64171956788061, + "grad_norm": 0.2090751975774765, + "learning_rate": 3.882894350891756e-05, + "loss": 0.0081, + "num_input_tokens_seen": 83506784, + "step": 68615 + }, + { + "epoch": 7.642276422764228, + "grad_norm": 0.0019806779455393553, + "learning_rate": 3.8826919286064234e-05, + "loss": 0.0087, + "num_input_tokens_seen": 83513120, + "step": 68620 + }, + { + "epoch": 7.642833277647845, + "grad_norm": 1.6148332357406616, + "learning_rate": 3.882489493260457e-05, + "loss": 0.2061, + "num_input_tokens_seen": 83519232, + "step": 68625 + }, + { + "epoch": 7.643390132531462, + "grad_norm": 0.012029831297695637, + "learning_rate": 3.882287044855771e-05, + "loss": 0.0455, + "num_input_tokens_seen": 83525344, + "step": 68630 + }, + { + "epoch": 7.643946987415079, + "grad_norm": 0.4699716567993164, + "learning_rate": 3.8820845833942754e-05, + "loss": 0.1887, + "num_input_tokens_seen": 83531104, + "step": 68635 + }, + { + "epoch": 7.644503842298697, + "grad_norm": 0.7864018678665161, + "learning_rate": 3.881882108877884e-05, + "loss": 0.0659, + "num_input_tokens_seen": 83537536, + "step": 68640 + }, + { + "epoch": 7.6450606971823145, + "grad_norm": 0.7619027495384216, + "learning_rate": 3.881679621308508e-05, + "loss": 0.0532, + "num_input_tokens_seen": 83543072, + "step": 68645 + }, + { + "epoch": 7.645617552065931, + "grad_norm": 0.4985382556915283, + "learning_rate": 3.881477120688062e-05, + "loss": 0.0099, + "num_input_tokens_seen": 83549024, + "step": 68650 + }, + { + "epoch": 7.646174406949549, + "grad_norm": 0.058495763689279556, + "learning_rate": 3.881274607018458e-05, + "loss": 0.0039, + "num_input_tokens_seen": 83555136, + "step": 68655 + }, + { + "epoch": 7.646731261833167, + "grad_norm": 0.325408399105072, + "learning_rate": 3.881072080301608e-05, + "loss": 0.0291, + "num_input_tokens_seen": 83561216, + "step": 68660 + }, + { + "epoch": 7.647288116716783, + "grad_norm": 0.37715503573417664, + "learning_rate": 3.880869540539426e-05, + "loss": 0.0421, + "num_input_tokens_seen": 83566976, + "step": 68665 + }, + { + "epoch": 7.647844971600401, + "grad_norm": 0.031183796003460884, + "learning_rate": 3.8806669877338245e-05, + "loss": 0.0922, + "num_input_tokens_seen": 83572832, + "step": 68670 + }, + { + "epoch": 7.648401826484018, + "grad_norm": 0.7026594281196594, + "learning_rate": 3.880464421886717e-05, + "loss": 0.0336, + "num_input_tokens_seen": 83579136, + "step": 68675 + }, + { + "epoch": 7.648958681367636, + "grad_norm": 0.3314370810985565, + "learning_rate": 3.880261843000018e-05, + "loss": 0.0841, + "num_input_tokens_seen": 83585184, + "step": 68680 + }, + { + "epoch": 7.649515536251253, + "grad_norm": 0.13953718543052673, + "learning_rate": 3.8800592510756395e-05, + "loss": 0.0167, + "num_input_tokens_seen": 83591456, + "step": 68685 + }, + { + "epoch": 7.65007239113487, + "grad_norm": 0.44337382912635803, + "learning_rate": 3.8798566461154964e-05, + "loss": 0.0254, + "num_input_tokens_seen": 83597760, + "step": 68690 + }, + { + "epoch": 7.650629246018488, + "grad_norm": 0.04024164006114006, + "learning_rate": 3.8796540281214996e-05, + "loss": 0.0984, + "num_input_tokens_seen": 83604160, + "step": 68695 + }, + { + "epoch": 7.651186100902105, + "grad_norm": 0.00010164850391447544, + "learning_rate": 3.879451397095567e-05, + "loss": 0.0119, + "num_input_tokens_seen": 83610304, + "step": 68700 + }, + { + "epoch": 7.651742955785722, + "grad_norm": 0.00721514318138361, + "learning_rate": 3.8792487530396103e-05, + "loss": 0.0188, + "num_input_tokens_seen": 83616416, + "step": 68705 + }, + { + "epoch": 7.65229981066934, + "grad_norm": 0.02138718031346798, + "learning_rate": 3.8790460959555445e-05, + "loss": 0.0695, + "num_input_tokens_seen": 83621856, + "step": 68710 + }, + { + "epoch": 7.652856665552957, + "grad_norm": 0.34584611654281616, + "learning_rate": 3.8788434258452835e-05, + "loss": 0.0239, + "num_input_tokens_seen": 83628160, + "step": 68715 + }, + { + "epoch": 7.653413520436574, + "grad_norm": 1.3472927808761597, + "learning_rate": 3.878640742710741e-05, + "loss": 0.1366, + "num_input_tokens_seen": 83634560, + "step": 68720 + }, + { + "epoch": 7.653970375320192, + "grad_norm": 0.5949639678001404, + "learning_rate": 3.878438046553832e-05, + "loss": 0.0186, + "num_input_tokens_seen": 83640704, + "step": 68725 + }, + { + "epoch": 7.654527230203809, + "grad_norm": 0.27016618847846985, + "learning_rate": 3.878235337376472e-05, + "loss": 0.0172, + "num_input_tokens_seen": 83646688, + "step": 68730 + }, + { + "epoch": 7.655084085087426, + "grad_norm": 0.44089293479919434, + "learning_rate": 3.878032615180574e-05, + "loss": 0.0184, + "num_input_tokens_seen": 83652864, + "step": 68735 + }, + { + "epoch": 7.655640939971043, + "grad_norm": 1.520365595817566, + "learning_rate": 3.877829879968055e-05, + "loss": 0.0767, + "num_input_tokens_seen": 83658784, + "step": 68740 + }, + { + "epoch": 7.656197794854661, + "grad_norm": 0.08383335918188095, + "learning_rate": 3.877627131740829e-05, + "loss": 0.0073, + "num_input_tokens_seen": 83664704, + "step": 68745 + }, + { + "epoch": 7.656754649738279, + "grad_norm": 1.1594127416610718, + "learning_rate": 3.87742437050081e-05, + "loss": 0.0341, + "num_input_tokens_seen": 83670624, + "step": 68750 + }, + { + "epoch": 7.657311504621895, + "grad_norm": 0.3283747136592865, + "learning_rate": 3.8772215962499146e-05, + "loss": 0.0699, + "num_input_tokens_seen": 83676640, + "step": 68755 + }, + { + "epoch": 7.657868359505513, + "grad_norm": 0.001477440120652318, + "learning_rate": 3.877018808990057e-05, + "loss": 0.1011, + "num_input_tokens_seen": 83682176, + "step": 68760 + }, + { + "epoch": 7.65842521438913, + "grad_norm": 0.16483111679553986, + "learning_rate": 3.8768160087231556e-05, + "loss": 0.0573, + "num_input_tokens_seen": 83688160, + "step": 68765 + }, + { + "epoch": 7.6589820692727475, + "grad_norm": 1.3568044900894165, + "learning_rate": 3.876613195451122e-05, + "loss": 0.0725, + "num_input_tokens_seen": 83694240, + "step": 68770 + }, + { + "epoch": 7.659538924156365, + "grad_norm": 0.026867737993597984, + "learning_rate": 3.876410369175875e-05, + "loss": 0.064, + "num_input_tokens_seen": 83700480, + "step": 68775 + }, + { + "epoch": 7.660095779039982, + "grad_norm": 0.02894599363207817, + "learning_rate": 3.8762075298993284e-05, + "loss": 0.0112, + "num_input_tokens_seen": 83706624, + "step": 68780 + }, + { + "epoch": 7.6606526339236, + "grad_norm": 0.32618996500968933, + "learning_rate": 3.876004677623399e-05, + "loss": 0.0491, + "num_input_tokens_seen": 83712832, + "step": 68785 + }, + { + "epoch": 7.661209488807216, + "grad_norm": 1.3489322662353516, + "learning_rate": 3.875801812350004e-05, + "loss": 0.0402, + "num_input_tokens_seen": 83718816, + "step": 68790 + }, + { + "epoch": 7.661766343690834, + "grad_norm": 1.9733573198318481, + "learning_rate": 3.875598934081058e-05, + "loss": 0.0189, + "num_input_tokens_seen": 83725088, + "step": 68795 + }, + { + "epoch": 7.662323198574452, + "grad_norm": 0.6413870453834534, + "learning_rate": 3.875396042818478e-05, + "loss": 0.1509, + "num_input_tokens_seen": 83731392, + "step": 68800 + }, + { + "epoch": 7.6628800534580686, + "grad_norm": 1.0027015209197998, + "learning_rate": 3.8751931385641804e-05, + "loss": 0.1214, + "num_input_tokens_seen": 83737184, + "step": 68805 + }, + { + "epoch": 7.663436908341686, + "grad_norm": 0.00562430452555418, + "learning_rate": 3.874990221320082e-05, + "loss": 0.0204, + "num_input_tokens_seen": 83742976, + "step": 68810 + }, + { + "epoch": 7.663993763225303, + "grad_norm": 0.011785277165472507, + "learning_rate": 3.8747872910880995e-05, + "loss": 0.0037, + "num_input_tokens_seen": 83749088, + "step": 68815 + }, + { + "epoch": 7.664550618108921, + "grad_norm": 0.09233594685792923, + "learning_rate": 3.874584347870149e-05, + "loss": 0.0214, + "num_input_tokens_seen": 83755296, + "step": 68820 + }, + { + "epoch": 7.665107472992538, + "grad_norm": 0.9413385391235352, + "learning_rate": 3.874381391668148e-05, + "loss": 0.0174, + "num_input_tokens_seen": 83761440, + "step": 68825 + }, + { + "epoch": 7.665664327876155, + "grad_norm": 0.4754176735877991, + "learning_rate": 3.8741784224840144e-05, + "loss": 0.0238, + "num_input_tokens_seen": 83766784, + "step": 68830 + }, + { + "epoch": 7.666221182759773, + "grad_norm": 0.2582256495952606, + "learning_rate": 3.873975440319664e-05, + "loss": 0.0748, + "num_input_tokens_seen": 83773120, + "step": 68835 + }, + { + "epoch": 7.6667780376433905, + "grad_norm": 0.003446367336437106, + "learning_rate": 3.873772445177015e-05, + "loss": 0.0866, + "num_input_tokens_seen": 83779424, + "step": 68840 + }, + { + "epoch": 7.667334892527007, + "grad_norm": 0.0019894689321517944, + "learning_rate": 3.873569437057985e-05, + "loss": 0.0077, + "num_input_tokens_seen": 83785536, + "step": 68845 + }, + { + "epoch": 7.667891747410625, + "grad_norm": 0.020340051501989365, + "learning_rate": 3.873366415964491e-05, + "loss": 0.0421, + "num_input_tokens_seen": 83791456, + "step": 68850 + }, + { + "epoch": 7.668448602294242, + "grad_norm": 0.2677307724952698, + "learning_rate": 3.8731633818984505e-05, + "loss": 0.0445, + "num_input_tokens_seen": 83797664, + "step": 68855 + }, + { + "epoch": 7.669005457177859, + "grad_norm": 0.9151142239570618, + "learning_rate": 3.872960334861781e-05, + "loss": 0.0578, + "num_input_tokens_seen": 83803872, + "step": 68860 + }, + { + "epoch": 7.669562312061477, + "grad_norm": 0.05999337136745453, + "learning_rate": 3.872757274856402e-05, + "loss": 0.0849, + "num_input_tokens_seen": 83809824, + "step": 68865 + }, + { + "epoch": 7.670119166945094, + "grad_norm": 0.8213164210319519, + "learning_rate": 3.872554201884231e-05, + "loss": 0.1685, + "num_input_tokens_seen": 83815744, + "step": 68870 + }, + { + "epoch": 7.6706760218287116, + "grad_norm": 1.6123981475830078, + "learning_rate": 3.872351115947186e-05, + "loss": 0.176, + "num_input_tokens_seen": 83821824, + "step": 68875 + }, + { + "epoch": 7.671232876712329, + "grad_norm": 1.9462577104568481, + "learning_rate": 3.872148017047185e-05, + "loss": 0.0621, + "num_input_tokens_seen": 83828000, + "step": 68880 + }, + { + "epoch": 7.671789731595946, + "grad_norm": 0.004148878622800112, + "learning_rate": 3.871944905186146e-05, + "loss": 0.0804, + "num_input_tokens_seen": 83834368, + "step": 68885 + }, + { + "epoch": 7.672346586479564, + "grad_norm": 1.1972503662109375, + "learning_rate": 3.871741780365988e-05, + "loss": 0.0426, + "num_input_tokens_seen": 83840224, + "step": 68890 + }, + { + "epoch": 7.6729034413631805, + "grad_norm": 0.041294462978839874, + "learning_rate": 3.871538642588631e-05, + "loss": 0.0119, + "num_input_tokens_seen": 83846368, + "step": 68895 + }, + { + "epoch": 7.673460296246798, + "grad_norm": 0.0794491246342659, + "learning_rate": 3.871335491855992e-05, + "loss": 0.0239, + "num_input_tokens_seen": 83852672, + "step": 68900 + }, + { + "epoch": 7.674017151130416, + "grad_norm": 0.07700329273939133, + "learning_rate": 3.871132328169991e-05, + "loss": 0.0713, + "num_input_tokens_seen": 83858880, + "step": 68905 + }, + { + "epoch": 7.674574006014033, + "grad_norm": 0.0025847801007330418, + "learning_rate": 3.870929151532546e-05, + "loss": 0.0367, + "num_input_tokens_seen": 83864928, + "step": 68910 + }, + { + "epoch": 7.67513086089765, + "grad_norm": 0.010049315169453621, + "learning_rate": 3.870725961945577e-05, + "loss": 0.0214, + "num_input_tokens_seen": 83870912, + "step": 68915 + }, + { + "epoch": 7.675687715781267, + "grad_norm": 0.3218810260295868, + "learning_rate": 3.8705227594110024e-05, + "loss": 0.0328, + "num_input_tokens_seen": 83877216, + "step": 68920 + }, + { + "epoch": 7.676244570664885, + "grad_norm": 0.7107936143875122, + "learning_rate": 3.8703195439307436e-05, + "loss": 0.0226, + "num_input_tokens_seen": 83883392, + "step": 68925 + }, + { + "epoch": 7.676801425548502, + "grad_norm": 0.10427490621805191, + "learning_rate": 3.870116315506719e-05, + "loss": 0.0544, + "num_input_tokens_seen": 83889984, + "step": 68930 + }, + { + "epoch": 7.677358280432119, + "grad_norm": 0.00010959339124383405, + "learning_rate": 3.8699130741408466e-05, + "loss": 0.0053, + "num_input_tokens_seen": 83896320, + "step": 68935 + }, + { + "epoch": 7.677915135315737, + "grad_norm": 1.83388090133667, + "learning_rate": 3.869709819835049e-05, + "loss": 0.0532, + "num_input_tokens_seen": 83902688, + "step": 68940 + }, + { + "epoch": 7.678471990199354, + "grad_norm": 0.010940495878458023, + "learning_rate": 3.8695065525912436e-05, + "loss": 0.045, + "num_input_tokens_seen": 83908576, + "step": 68945 + }, + { + "epoch": 7.679028845082971, + "grad_norm": 1.1273679733276367, + "learning_rate": 3.869303272411352e-05, + "loss": 0.1205, + "num_input_tokens_seen": 83914336, + "step": 68950 + }, + { + "epoch": 7.679585699966589, + "grad_norm": 0.08392269909381866, + "learning_rate": 3.869099979297295e-05, + "loss": 0.1915, + "num_input_tokens_seen": 83920672, + "step": 68955 + }, + { + "epoch": 7.680142554850206, + "grad_norm": 1.2497022151947021, + "learning_rate": 3.86889667325099e-05, + "loss": 0.1827, + "num_input_tokens_seen": 83926464, + "step": 68960 + }, + { + "epoch": 7.6806994097338235, + "grad_norm": 0.542072057723999, + "learning_rate": 3.8686933542743606e-05, + "loss": 0.1681, + "num_input_tokens_seen": 83932576, + "step": 68965 + }, + { + "epoch": 7.68125626461744, + "grad_norm": 0.08316837251186371, + "learning_rate": 3.8684900223693256e-05, + "loss": 0.0784, + "num_input_tokens_seen": 83938784, + "step": 68970 + }, + { + "epoch": 7.681813119501058, + "grad_norm": 0.0005661814357154071, + "learning_rate": 3.8682866775378055e-05, + "loss": 0.1401, + "num_input_tokens_seen": 83945024, + "step": 68975 + }, + { + "epoch": 7.682369974384676, + "grad_norm": 0.032979566603899, + "learning_rate": 3.868083319781722e-05, + "loss": 0.015, + "num_input_tokens_seen": 83951104, + "step": 68980 + }, + { + "epoch": 7.682926829268292, + "grad_norm": 1.2376272678375244, + "learning_rate": 3.867879949102995e-05, + "loss": 0.0602, + "num_input_tokens_seen": 83957184, + "step": 68985 + }, + { + "epoch": 7.68348368415191, + "grad_norm": 0.221525177359581, + "learning_rate": 3.867676565503546e-05, + "loss": 0.141, + "num_input_tokens_seen": 83963072, + "step": 68990 + }, + { + "epoch": 7.684040539035527, + "grad_norm": 0.00030230398988351226, + "learning_rate": 3.867473168985296e-05, + "loss": 0.0315, + "num_input_tokens_seen": 83969056, + "step": 68995 + }, + { + "epoch": 7.6845973939191445, + "grad_norm": 0.16507378220558167, + "learning_rate": 3.867269759550167e-05, + "loss": 0.0166, + "num_input_tokens_seen": 83975232, + "step": 69000 + }, + { + "epoch": 7.685154248802762, + "grad_norm": 0.24304048717021942, + "learning_rate": 3.8670663372000785e-05, + "loss": 0.0507, + "num_input_tokens_seen": 83981408, + "step": 69005 + }, + { + "epoch": 7.685711103686379, + "grad_norm": 0.00013562307867687196, + "learning_rate": 3.866862901936954e-05, + "loss": 0.0384, + "num_input_tokens_seen": 83987712, + "step": 69010 + }, + { + "epoch": 7.686267958569997, + "grad_norm": 0.01920531876385212, + "learning_rate": 3.866659453762714e-05, + "loss": 0.0634, + "num_input_tokens_seen": 83993856, + "step": 69015 + }, + { + "epoch": 7.686824813453614, + "grad_norm": 0.4782085418701172, + "learning_rate": 3.866455992679281e-05, + "loss": 0.028, + "num_input_tokens_seen": 83999968, + "step": 69020 + }, + { + "epoch": 7.687381668337231, + "grad_norm": 1.0782865285873413, + "learning_rate": 3.866252518688576e-05, + "loss": 0.1006, + "num_input_tokens_seen": 84005952, + "step": 69025 + }, + { + "epoch": 7.687938523220849, + "grad_norm": 0.6814184188842773, + "learning_rate": 3.866049031792521e-05, + "loss": 0.0519, + "num_input_tokens_seen": 84011968, + "step": 69030 + }, + { + "epoch": 7.688495378104466, + "grad_norm": 0.4318407475948334, + "learning_rate": 3.865845531993039e-05, + "loss": 0.0519, + "num_input_tokens_seen": 84017824, + "step": 69035 + }, + { + "epoch": 7.689052232988083, + "grad_norm": 0.26126959919929504, + "learning_rate": 3.8656420192920515e-05, + "loss": 0.0437, + "num_input_tokens_seen": 84023968, + "step": 69040 + }, + { + "epoch": 7.689609087871701, + "grad_norm": 0.00015096261631697416, + "learning_rate": 3.865438493691481e-05, + "loss": 0.0534, + "num_input_tokens_seen": 84030432, + "step": 69045 + }, + { + "epoch": 7.690165942755318, + "grad_norm": 0.018758775666356087, + "learning_rate": 3.865234955193249e-05, + "loss": 0.003, + "num_input_tokens_seen": 84036928, + "step": 69050 + }, + { + "epoch": 7.690722797638935, + "grad_norm": 0.001140682608820498, + "learning_rate": 3.86503140379928e-05, + "loss": 0.0641, + "num_input_tokens_seen": 84043296, + "step": 69055 + }, + { + "epoch": 7.691279652522553, + "grad_norm": 0.17059777677059174, + "learning_rate": 3.864827839511496e-05, + "loss": 0.0089, + "num_input_tokens_seen": 84049568, + "step": 69060 + }, + { + "epoch": 7.69183650740617, + "grad_norm": 0.04067014902830124, + "learning_rate": 3.8646242623318194e-05, + "loss": 0.0583, + "num_input_tokens_seen": 84055584, + "step": 69065 + }, + { + "epoch": 7.6923933622897875, + "grad_norm": 0.527311384677887, + "learning_rate": 3.8644206722621725e-05, + "loss": 0.0375, + "num_input_tokens_seen": 84061568, + "step": 69070 + }, + { + "epoch": 7.692950217173404, + "grad_norm": 0.0006199082126840949, + "learning_rate": 3.86421706930448e-05, + "loss": 0.017, + "num_input_tokens_seen": 84068064, + "step": 69075 + }, + { + "epoch": 7.693507072057022, + "grad_norm": 0.012904807925224304, + "learning_rate": 3.864013453460664e-05, + "loss": 0.031, + "num_input_tokens_seen": 84074304, + "step": 69080 + }, + { + "epoch": 7.69406392694064, + "grad_norm": 0.00030403281562030315, + "learning_rate": 3.8638098247326485e-05, + "loss": 0.012, + "num_input_tokens_seen": 84080448, + "step": 69085 + }, + { + "epoch": 7.6946207818242565, + "grad_norm": 2.111199378967285, + "learning_rate": 3.8636061831223566e-05, + "loss": 0.1485, + "num_input_tokens_seen": 84086848, + "step": 69090 + }, + { + "epoch": 7.695177636707874, + "grad_norm": 0.6742793917655945, + "learning_rate": 3.863402528631711e-05, + "loss": 0.0359, + "num_input_tokens_seen": 84093472, + "step": 69095 + }, + { + "epoch": 7.695734491591491, + "grad_norm": 0.00014331444981507957, + "learning_rate": 3.863198861262637e-05, + "loss": 0.0939, + "num_input_tokens_seen": 84099680, + "step": 69100 + }, + { + "epoch": 7.696291346475109, + "grad_norm": 0.5799751877784729, + "learning_rate": 3.862995181017057e-05, + "loss": 0.0454, + "num_input_tokens_seen": 84105344, + "step": 69105 + }, + { + "epoch": 7.696848201358726, + "grad_norm": 0.7277407646179199, + "learning_rate": 3.862791487896895e-05, + "loss": 0.0348, + "num_input_tokens_seen": 84111648, + "step": 69110 + }, + { + "epoch": 7.697405056242343, + "grad_norm": 1.3676986694335938, + "learning_rate": 3.862587781904077e-05, + "loss": 0.0638, + "num_input_tokens_seen": 84117856, + "step": 69115 + }, + { + "epoch": 7.697961911125961, + "grad_norm": 0.10688731074333191, + "learning_rate": 3.862384063040525e-05, + "loss": 0.0086, + "num_input_tokens_seen": 84124096, + "step": 69120 + }, + { + "epoch": 7.6985187660095775, + "grad_norm": 0.0013607090804725885, + "learning_rate": 3.862180331308164e-05, + "loss": 0.0581, + "num_input_tokens_seen": 84130048, + "step": 69125 + }, + { + "epoch": 7.699075620893195, + "grad_norm": 0.2578437626361847, + "learning_rate": 3.8619765867089187e-05, + "loss": 0.0433, + "num_input_tokens_seen": 84136128, + "step": 69130 + }, + { + "epoch": 7.699632475776813, + "grad_norm": 0.007467955816537142, + "learning_rate": 3.861772829244712e-05, + "loss": 0.0065, + "num_input_tokens_seen": 84142464, + "step": 69135 + }, + { + "epoch": 7.70018933066043, + "grad_norm": 0.05736476927995682, + "learning_rate": 3.8615690589174715e-05, + "loss": 0.0508, + "num_input_tokens_seen": 84148352, + "step": 69140 + }, + { + "epoch": 7.700746185544047, + "grad_norm": 2.25700306892395, + "learning_rate": 3.86136527572912e-05, + "loss": 0.0503, + "num_input_tokens_seen": 84154400, + "step": 69145 + }, + { + "epoch": 7.701303040427664, + "grad_norm": 0.20014187693595886, + "learning_rate": 3.861161479681582e-05, + "loss": 0.0585, + "num_input_tokens_seen": 84160576, + "step": 69150 + }, + { + "epoch": 7.701859895311282, + "grad_norm": 0.5399474501609802, + "learning_rate": 3.860957670776784e-05, + "loss": 0.087, + "num_input_tokens_seen": 84166304, + "step": 69155 + }, + { + "epoch": 7.7024167501948995, + "grad_norm": 1.272484302520752, + "learning_rate": 3.8607538490166504e-05, + "loss": 0.11, + "num_input_tokens_seen": 84172352, + "step": 69160 + }, + { + "epoch": 7.702973605078516, + "grad_norm": 0.210288405418396, + "learning_rate": 3.8605500144031056e-05, + "loss": 0.0125, + "num_input_tokens_seen": 84178688, + "step": 69165 + }, + { + "epoch": 7.703530459962134, + "grad_norm": 0.1601049154996872, + "learning_rate": 3.860346166938077e-05, + "loss": 0.0606, + "num_input_tokens_seen": 84184576, + "step": 69170 + }, + { + "epoch": 7.704087314845751, + "grad_norm": 0.051512014120817184, + "learning_rate": 3.860142306623489e-05, + "loss": 0.0504, + "num_input_tokens_seen": 84190592, + "step": 69175 + }, + { + "epoch": 7.704644169729368, + "grad_norm": 2.2945916652679443, + "learning_rate": 3.8599384334612666e-05, + "loss": 0.0576, + "num_input_tokens_seen": 84196704, + "step": 69180 + }, + { + "epoch": 7.705201024612986, + "grad_norm": 0.949391782283783, + "learning_rate": 3.859734547453336e-05, + "loss": 0.1147, + "num_input_tokens_seen": 84202656, + "step": 69185 + }, + { + "epoch": 7.705757879496603, + "grad_norm": 0.06794261187314987, + "learning_rate": 3.859530648601624e-05, + "loss": 0.0463, + "num_input_tokens_seen": 84208352, + "step": 69190 + }, + { + "epoch": 7.7063147343802205, + "grad_norm": 0.005057013127952814, + "learning_rate": 3.8593267369080544e-05, + "loss": 0.0251, + "num_input_tokens_seen": 84214464, + "step": 69195 + }, + { + "epoch": 7.706871589263838, + "grad_norm": 1.3144327402114868, + "learning_rate": 3.859122812374556e-05, + "loss": 0.0869, + "num_input_tokens_seen": 84220800, + "step": 69200 + }, + { + "epoch": 7.707428444147455, + "grad_norm": 0.9270020723342896, + "learning_rate": 3.858918875003053e-05, + "loss": 0.0271, + "num_input_tokens_seen": 84227104, + "step": 69205 + }, + { + "epoch": 7.707985299031073, + "grad_norm": 0.20627406239509583, + "learning_rate": 3.858714924795473e-05, + "loss": 0.0575, + "num_input_tokens_seen": 84233184, + "step": 69210 + }, + { + "epoch": 7.70854215391469, + "grad_norm": 0.31951674818992615, + "learning_rate": 3.8585109617537416e-05, + "loss": 0.0846, + "num_input_tokens_seen": 84238464, + "step": 69215 + }, + { + "epoch": 7.709099008798307, + "grad_norm": 0.6265694499015808, + "learning_rate": 3.858306985879786e-05, + "loss": 0.0143, + "num_input_tokens_seen": 84244928, + "step": 69220 + }, + { + "epoch": 7.709655863681925, + "grad_norm": 0.10182032734155655, + "learning_rate": 3.8581029971755325e-05, + "loss": 0.0157, + "num_input_tokens_seen": 84251136, + "step": 69225 + }, + { + "epoch": 7.710212718565542, + "grad_norm": 0.2669858932495117, + "learning_rate": 3.8578989956429076e-05, + "loss": 0.0897, + "num_input_tokens_seen": 84257184, + "step": 69230 + }, + { + "epoch": 7.710769573449159, + "grad_norm": 0.004376956261694431, + "learning_rate": 3.857694981283839e-05, + "loss": 0.0587, + "num_input_tokens_seen": 84263008, + "step": 69235 + }, + { + "epoch": 7.711326428332777, + "grad_norm": 0.6904677152633667, + "learning_rate": 3.8574909541002546e-05, + "loss": 0.1056, + "num_input_tokens_seen": 84269248, + "step": 69240 + }, + { + "epoch": 7.711883283216394, + "grad_norm": 1.2933481931686401, + "learning_rate": 3.8572869140940793e-05, + "loss": 0.1126, + "num_input_tokens_seen": 84275360, + "step": 69245 + }, + { + "epoch": 7.712440138100011, + "grad_norm": 2.1966824531555176, + "learning_rate": 3.857082861267242e-05, + "loss": 0.179, + "num_input_tokens_seen": 84281280, + "step": 69250 + }, + { + "epoch": 7.712996992983628, + "grad_norm": 0.07294847071170807, + "learning_rate": 3.85687879562167e-05, + "loss": 0.0175, + "num_input_tokens_seen": 84287584, + "step": 69255 + }, + { + "epoch": 7.713553847867246, + "grad_norm": 0.5722013711929321, + "learning_rate": 3.85667471715929e-05, + "loss": 0.0471, + "num_input_tokens_seen": 84293344, + "step": 69260 + }, + { + "epoch": 7.7141107027508635, + "grad_norm": 0.0003928911464754492, + "learning_rate": 3.856470625882031e-05, + "loss": 0.1161, + "num_input_tokens_seen": 84299680, + "step": 69265 + }, + { + "epoch": 7.71466755763448, + "grad_norm": 0.0018681795336306095, + "learning_rate": 3.8562665217918206e-05, + "loss": 0.086, + "num_input_tokens_seen": 84305024, + "step": 69270 + }, + { + "epoch": 7.715224412518098, + "grad_norm": 0.004135590046644211, + "learning_rate": 3.8560624048905857e-05, + "loss": 0.0116, + "num_input_tokens_seen": 84311040, + "step": 69275 + }, + { + "epoch": 7.715781267401715, + "grad_norm": 0.04466551914811134, + "learning_rate": 3.8558582751802555e-05, + "loss": 0.0037, + "num_input_tokens_seen": 84317600, + "step": 69280 + }, + { + "epoch": 7.7163381222853324, + "grad_norm": 0.027807099744677544, + "learning_rate": 3.855654132662757e-05, + "loss": 0.075, + "num_input_tokens_seen": 84323776, + "step": 69285 + }, + { + "epoch": 7.71689497716895, + "grad_norm": 0.005919079761952162, + "learning_rate": 3.8554499773400186e-05, + "loss": 0.0377, + "num_input_tokens_seen": 84330016, + "step": 69290 + }, + { + "epoch": 7.717451832052567, + "grad_norm": 1.0824116468429565, + "learning_rate": 3.85524580921397e-05, + "loss": 0.0546, + "num_input_tokens_seen": 84336608, + "step": 69295 + }, + { + "epoch": 7.718008686936185, + "grad_norm": 0.017954278737306595, + "learning_rate": 3.855041628286538e-05, + "loss": 0.0834, + "num_input_tokens_seen": 84342720, + "step": 69300 + }, + { + "epoch": 7.718565541819801, + "grad_norm": 1.2233803272247314, + "learning_rate": 3.8548374345596524e-05, + "loss": 0.0991, + "num_input_tokens_seen": 84348704, + "step": 69305 + }, + { + "epoch": 7.719122396703419, + "grad_norm": 0.0003710152523126453, + "learning_rate": 3.854633228035242e-05, + "loss": 0.0083, + "num_input_tokens_seen": 84354848, + "step": 69310 + }, + { + "epoch": 7.719679251587037, + "grad_norm": 0.05484083667397499, + "learning_rate": 3.854429008715237e-05, + "loss": 0.0558, + "num_input_tokens_seen": 84361088, + "step": 69315 + }, + { + "epoch": 7.7202361064706535, + "grad_norm": 1.2735997438430786, + "learning_rate": 3.8542247766015635e-05, + "loss": 0.1257, + "num_input_tokens_seen": 84367264, + "step": 69320 + }, + { + "epoch": 7.720792961354271, + "grad_norm": 1.2668365240097046, + "learning_rate": 3.854020531696151e-05, + "loss": 0.0752, + "num_input_tokens_seen": 84373600, + "step": 69325 + }, + { + "epoch": 7.721349816237888, + "grad_norm": 0.30119070410728455, + "learning_rate": 3.853816274000931e-05, + "loss": 0.051, + "num_input_tokens_seen": 84379936, + "step": 69330 + }, + { + "epoch": 7.721906671121506, + "grad_norm": 0.3022940456867218, + "learning_rate": 3.8536120035178315e-05, + "loss": 0.0292, + "num_input_tokens_seen": 84385984, + "step": 69335 + }, + { + "epoch": 7.722463526005123, + "grad_norm": 0.0123488400131464, + "learning_rate": 3.853407720248782e-05, + "loss": 0.0316, + "num_input_tokens_seen": 84392320, + "step": 69340 + }, + { + "epoch": 7.72302038088874, + "grad_norm": 2.2467005252838135, + "learning_rate": 3.8532034241957126e-05, + "loss": 0.1341, + "num_input_tokens_seen": 84398080, + "step": 69345 + }, + { + "epoch": 7.723577235772358, + "grad_norm": 2.980625629425049, + "learning_rate": 3.852999115360552e-05, + "loss": 0.0408, + "num_input_tokens_seen": 84404352, + "step": 69350 + }, + { + "epoch": 7.724134090655975, + "grad_norm": 0.00016092868463601917, + "learning_rate": 3.8527947937452315e-05, + "loss": 0.027, + "num_input_tokens_seen": 84410624, + "step": 69355 + }, + { + "epoch": 7.724690945539592, + "grad_norm": 1.557445764541626, + "learning_rate": 3.852590459351679e-05, + "loss": 0.1126, + "num_input_tokens_seen": 84416640, + "step": 69360 + }, + { + "epoch": 7.72524780042321, + "grad_norm": 0.06018993258476257, + "learning_rate": 3.852386112181827e-05, + "loss": 0.0243, + "num_input_tokens_seen": 84422784, + "step": 69365 + }, + { + "epoch": 7.725804655306827, + "grad_norm": 0.0003572323767002672, + "learning_rate": 3.852181752237605e-05, + "loss": 0.0608, + "num_input_tokens_seen": 84428576, + "step": 69370 + }, + { + "epoch": 7.726361510190444, + "grad_norm": 0.00018062339222524315, + "learning_rate": 3.851977379520942e-05, + "loss": 0.0134, + "num_input_tokens_seen": 84434976, + "step": 69375 + }, + { + "epoch": 7.726918365074062, + "grad_norm": 0.031165366992354393, + "learning_rate": 3.8517729940337704e-05, + "loss": 0.0071, + "num_input_tokens_seen": 84441120, + "step": 69380 + }, + { + "epoch": 7.727475219957679, + "grad_norm": 0.7079368233680725, + "learning_rate": 3.85156859577802e-05, + "loss": 0.1323, + "num_input_tokens_seen": 84447072, + "step": 69385 + }, + { + "epoch": 7.7280320748412965, + "grad_norm": 0.014866950921714306, + "learning_rate": 3.8513641847556206e-05, + "loss": 0.1435, + "num_input_tokens_seen": 84453504, + "step": 69390 + }, + { + "epoch": 7.728588929724914, + "grad_norm": 0.05459717661142349, + "learning_rate": 3.851159760968504e-05, + "loss": 0.0285, + "num_input_tokens_seen": 84459712, + "step": 69395 + }, + { + "epoch": 7.729145784608531, + "grad_norm": 0.17731690406799316, + "learning_rate": 3.850955324418601e-05, + "loss": 0.0144, + "num_input_tokens_seen": 84465760, + "step": 69400 + }, + { + "epoch": 7.729702639492149, + "grad_norm": 0.09283305704593658, + "learning_rate": 3.8507508751078425e-05, + "loss": 0.007, + "num_input_tokens_seen": 84471712, + "step": 69405 + }, + { + "epoch": 7.730259494375765, + "grad_norm": 0.4571312367916107, + "learning_rate": 3.8505464130381594e-05, + "loss": 0.0649, + "num_input_tokens_seen": 84477920, + "step": 69410 + }, + { + "epoch": 7.730816349259383, + "grad_norm": 0.023039914667606354, + "learning_rate": 3.850341938211484e-05, + "loss": 0.0171, + "num_input_tokens_seen": 84484096, + "step": 69415 + }, + { + "epoch": 7.731373204143001, + "grad_norm": 0.33401796221733093, + "learning_rate": 3.850137450629747e-05, + "loss": 0.029, + "num_input_tokens_seen": 84490272, + "step": 69420 + }, + { + "epoch": 7.731930059026618, + "grad_norm": 0.0009708944708108902, + "learning_rate": 3.84993295029488e-05, + "loss": 0.0757, + "num_input_tokens_seen": 84496416, + "step": 69425 + }, + { + "epoch": 7.732486913910235, + "grad_norm": 0.001536281779408455, + "learning_rate": 3.849728437208815e-05, + "loss": 0.0292, + "num_input_tokens_seen": 84502400, + "step": 69430 + }, + { + "epoch": 7.733043768793852, + "grad_norm": 0.0013723222073167562, + "learning_rate": 3.8495239113734824e-05, + "loss": 0.0991, + "num_input_tokens_seen": 84508096, + "step": 69435 + }, + { + "epoch": 7.73360062367747, + "grad_norm": 0.027400456368923187, + "learning_rate": 3.849319372790816e-05, + "loss": 0.0632, + "num_input_tokens_seen": 84514208, + "step": 69440 + }, + { + "epoch": 7.734157478561087, + "grad_norm": 0.0013592576142400503, + "learning_rate": 3.849114821462747e-05, + "loss": 0.084, + "num_input_tokens_seen": 84520224, + "step": 69445 + }, + { + "epoch": 7.734714333444704, + "grad_norm": 1.2761188745498657, + "learning_rate": 3.848910257391208e-05, + "loss": 0.1443, + "num_input_tokens_seen": 84526624, + "step": 69450 + }, + { + "epoch": 7.735271188328322, + "grad_norm": 1.2147547006607056, + "learning_rate": 3.848705680578131e-05, + "loss": 0.0748, + "num_input_tokens_seen": 84532576, + "step": 69455 + }, + { + "epoch": 7.735828043211939, + "grad_norm": 0.0036574804689735174, + "learning_rate": 3.848501091025447e-05, + "loss": 0.0288, + "num_input_tokens_seen": 84538624, + "step": 69460 + }, + { + "epoch": 7.736384898095556, + "grad_norm": 0.15943196415901184, + "learning_rate": 3.8482964887350915e-05, + "loss": 0.0138, + "num_input_tokens_seen": 84544480, + "step": 69465 + }, + { + "epoch": 7.736941752979174, + "grad_norm": 0.03472493961453438, + "learning_rate": 3.848091873708994e-05, + "loss": 0.0436, + "num_input_tokens_seen": 84550496, + "step": 69470 + }, + { + "epoch": 7.737498607862791, + "grad_norm": 0.5294923186302185, + "learning_rate": 3.84788724594909e-05, + "loss": 0.0311, + "num_input_tokens_seen": 84556896, + "step": 69475 + }, + { + "epoch": 7.738055462746408, + "grad_norm": 0.9760745763778687, + "learning_rate": 3.84768260545731e-05, + "loss": 0.1343, + "num_input_tokens_seen": 84563104, + "step": 69480 + }, + { + "epoch": 7.738612317630025, + "grad_norm": 0.4005581736564636, + "learning_rate": 3.847477952235588e-05, + "loss": 0.008, + "num_input_tokens_seen": 84569440, + "step": 69485 + }, + { + "epoch": 7.739169172513643, + "grad_norm": 1.0334854125976562, + "learning_rate": 3.847273286285858e-05, + "loss": 0.1449, + "num_input_tokens_seen": 84575776, + "step": 69490 + }, + { + "epoch": 7.739726027397261, + "grad_norm": 0.009795156307518482, + "learning_rate": 3.8470686076100516e-05, + "loss": 0.0018, + "num_input_tokens_seen": 84581920, + "step": 69495 + }, + { + "epoch": 7.740282882280877, + "grad_norm": 0.682810366153717, + "learning_rate": 3.846863916210104e-05, + "loss": 0.1128, + "num_input_tokens_seen": 84587872, + "step": 69500 + }, + { + "epoch": 7.740839737164495, + "grad_norm": 0.00120795879047364, + "learning_rate": 3.846659212087946e-05, + "loss": 0.0044, + "num_input_tokens_seen": 84593888, + "step": 69505 + }, + { + "epoch": 7.741396592048112, + "grad_norm": 0.017274947836995125, + "learning_rate": 3.846454495245515e-05, + "loss": 0.0496, + "num_input_tokens_seen": 84600384, + "step": 69510 + }, + { + "epoch": 7.7419534469317295, + "grad_norm": 3.5996456146240234, + "learning_rate": 3.8462497656847405e-05, + "loss": 0.1071, + "num_input_tokens_seen": 84606240, + "step": 69515 + }, + { + "epoch": 7.742510301815347, + "grad_norm": 1.8848941326141357, + "learning_rate": 3.846045023407559e-05, + "loss": 0.1221, + "num_input_tokens_seen": 84612448, + "step": 69520 + }, + { + "epoch": 7.743067156698964, + "grad_norm": 0.14279931783676147, + "learning_rate": 3.8458402684159045e-05, + "loss": 0.0205, + "num_input_tokens_seen": 84618464, + "step": 69525 + }, + { + "epoch": 7.743624011582582, + "grad_norm": 0.4327310025691986, + "learning_rate": 3.845635500711709e-05, + "loss": 0.0317, + "num_input_tokens_seen": 84624416, + "step": 69530 + }, + { + "epoch": 7.744180866466198, + "grad_norm": 0.0009728830773383379, + "learning_rate": 3.84543072029691e-05, + "loss": 0.0043, + "num_input_tokens_seen": 84630688, + "step": 69535 + }, + { + "epoch": 7.744737721349816, + "grad_norm": 0.007012867368757725, + "learning_rate": 3.845225927173438e-05, + "loss": 0.1338, + "num_input_tokens_seen": 84636320, + "step": 69540 + }, + { + "epoch": 7.745294576233434, + "grad_norm": 0.666392982006073, + "learning_rate": 3.845021121343231e-05, + "loss": 0.0785, + "num_input_tokens_seen": 84642432, + "step": 69545 + }, + { + "epoch": 7.7458514311170505, + "grad_norm": 0.0015493194805458188, + "learning_rate": 3.8448163028082206e-05, + "loss": 0.0082, + "num_input_tokens_seen": 84648576, + "step": 69550 + }, + { + "epoch": 7.746408286000668, + "grad_norm": 0.012887677177786827, + "learning_rate": 3.844611471570343e-05, + "loss": 0.0164, + "num_input_tokens_seen": 84655104, + "step": 69555 + }, + { + "epoch": 7.746965140884286, + "grad_norm": 0.028004705905914307, + "learning_rate": 3.8444066276315334e-05, + "loss": 0.0162, + "num_input_tokens_seen": 84661440, + "step": 69560 + }, + { + "epoch": 7.747521995767903, + "grad_norm": 0.01822085864841938, + "learning_rate": 3.844201770993725e-05, + "loss": 0.0057, + "num_input_tokens_seen": 84667712, + "step": 69565 + }, + { + "epoch": 7.74807885065152, + "grad_norm": 1.3281654119491577, + "learning_rate": 3.843996901658855e-05, + "loss": 0.0759, + "num_input_tokens_seen": 84673792, + "step": 69570 + }, + { + "epoch": 7.748635705535138, + "grad_norm": 0.018856093287467957, + "learning_rate": 3.843792019628857e-05, + "loss": 0.0157, + "num_input_tokens_seen": 84680096, + "step": 69575 + }, + { + "epoch": 7.749192560418755, + "grad_norm": 0.0017434293404221535, + "learning_rate": 3.843587124905668e-05, + "loss": 0.0123, + "num_input_tokens_seen": 84686368, + "step": 69580 + }, + { + "epoch": 7.7497494153023725, + "grad_norm": 0.021617162972688675, + "learning_rate": 3.843382217491221e-05, + "loss": 0.0216, + "num_input_tokens_seen": 84692448, + "step": 69585 + }, + { + "epoch": 7.750306270185989, + "grad_norm": 0.5724247694015503, + "learning_rate": 3.8431772973874515e-05, + "loss": 0.0465, + "num_input_tokens_seen": 84698528, + "step": 69590 + }, + { + "epoch": 7.750863125069607, + "grad_norm": 0.37998124957084656, + "learning_rate": 3.842972364596298e-05, + "loss": 0.0246, + "num_input_tokens_seen": 84704768, + "step": 69595 + }, + { + "epoch": 7.751419979953225, + "grad_norm": 0.04237492009997368, + "learning_rate": 3.842767419119694e-05, + "loss": 0.0035, + "num_input_tokens_seen": 84711232, + "step": 69600 + }, + { + "epoch": 7.751976834836841, + "grad_norm": 1.2471169233322144, + "learning_rate": 3.8425624609595754e-05, + "loss": 0.0968, + "num_input_tokens_seen": 84717216, + "step": 69605 + }, + { + "epoch": 7.752533689720459, + "grad_norm": 2.146395683288574, + "learning_rate": 3.8423574901178795e-05, + "loss": 0.0678, + "num_input_tokens_seen": 84723360, + "step": 69610 + }, + { + "epoch": 7.753090544604076, + "grad_norm": 0.23999987542629242, + "learning_rate": 3.842152506596541e-05, + "loss": 0.0424, + "num_input_tokens_seen": 84729696, + "step": 69615 + }, + { + "epoch": 7.7536473994876935, + "grad_norm": 0.00034168263664469123, + "learning_rate": 3.8419475103974973e-05, + "loss": 0.0184, + "num_input_tokens_seen": 84735776, + "step": 69620 + }, + { + "epoch": 7.754204254371311, + "grad_norm": 0.001659050234593451, + "learning_rate": 3.841742501522684e-05, + "loss": 0.2136, + "num_input_tokens_seen": 84742272, + "step": 69625 + }, + { + "epoch": 7.754761109254928, + "grad_norm": 0.0003035725967492908, + "learning_rate": 3.841537479974038e-05, + "loss": 0.0057, + "num_input_tokens_seen": 84748640, + "step": 69630 + }, + { + "epoch": 7.755317964138546, + "grad_norm": 0.4943928122520447, + "learning_rate": 3.841332445753495e-05, + "loss": 0.0321, + "num_input_tokens_seen": 84754816, + "step": 69635 + }, + { + "epoch": 7.7558748190221625, + "grad_norm": 0.2864070534706116, + "learning_rate": 3.841127398862993e-05, + "loss": 0.0115, + "num_input_tokens_seen": 84761184, + "step": 69640 + }, + { + "epoch": 7.75643167390578, + "grad_norm": 0.6086183786392212, + "learning_rate": 3.840922339304468e-05, + "loss": 0.081, + "num_input_tokens_seen": 84767424, + "step": 69645 + }, + { + "epoch": 7.756988528789398, + "grad_norm": 0.9090656638145447, + "learning_rate": 3.840717267079857e-05, + "loss": 0.0732, + "num_input_tokens_seen": 84773408, + "step": 69650 + }, + { + "epoch": 7.757545383673015, + "grad_norm": 0.32582762837409973, + "learning_rate": 3.840512182191098e-05, + "loss": 0.0237, + "num_input_tokens_seen": 84778976, + "step": 69655 + }, + { + "epoch": 7.758102238556632, + "grad_norm": 0.06194489821791649, + "learning_rate": 3.840307084640127e-05, + "loss": 0.0228, + "num_input_tokens_seen": 84785216, + "step": 69660 + }, + { + "epoch": 7.758659093440249, + "grad_norm": 0.5185919404029846, + "learning_rate": 3.840101974428881e-05, + "loss": 0.1876, + "num_input_tokens_seen": 84790560, + "step": 69665 + }, + { + "epoch": 7.759215948323867, + "grad_norm": 0.7004634141921997, + "learning_rate": 3.8398968515592996e-05, + "loss": 0.0816, + "num_input_tokens_seen": 84796864, + "step": 69670 + }, + { + "epoch": 7.759772803207484, + "grad_norm": 0.9373036623001099, + "learning_rate": 3.839691716033318e-05, + "loss": 0.0366, + "num_input_tokens_seen": 84803456, + "step": 69675 + }, + { + "epoch": 7.760329658091101, + "grad_norm": 1.2119542360305786, + "learning_rate": 3.8394865678528744e-05, + "loss": 0.0951, + "num_input_tokens_seen": 84808576, + "step": 69680 + }, + { + "epoch": 7.760886512974719, + "grad_norm": 0.2977425456047058, + "learning_rate": 3.8392814070199076e-05, + "loss": 0.0427, + "num_input_tokens_seen": 84814624, + "step": 69685 + }, + { + "epoch": 7.761443367858336, + "grad_norm": 1.0595813989639282, + "learning_rate": 3.8390762335363554e-05, + "loss": 0.0703, + "num_input_tokens_seen": 84820768, + "step": 69690 + }, + { + "epoch": 7.762000222741953, + "grad_norm": 0.021846797317266464, + "learning_rate": 3.838871047404154e-05, + "loss": 0.0631, + "num_input_tokens_seen": 84827008, + "step": 69695 + }, + { + "epoch": 7.762557077625571, + "grad_norm": 0.9775339365005493, + "learning_rate": 3.8386658486252445e-05, + "loss": 0.1323, + "num_input_tokens_seen": 84833024, + "step": 69700 + }, + { + "epoch": 7.763113932509188, + "grad_norm": 1.1935511827468872, + "learning_rate": 3.838460637201563e-05, + "loss": 0.055, + "num_input_tokens_seen": 84839104, + "step": 69705 + }, + { + "epoch": 7.7636707873928055, + "grad_norm": 0.035672999918460846, + "learning_rate": 3.838255413135048e-05, + "loss": 0.0102, + "num_input_tokens_seen": 84845088, + "step": 69710 + }, + { + "epoch": 7.764227642276423, + "grad_norm": 0.19657844305038452, + "learning_rate": 3.838050176427639e-05, + "loss": 0.0383, + "num_input_tokens_seen": 84851104, + "step": 69715 + }, + { + "epoch": 7.76478449716004, + "grad_norm": 0.7911627888679504, + "learning_rate": 3.8378449270812736e-05, + "loss": 0.029, + "num_input_tokens_seen": 84857216, + "step": 69720 + }, + { + "epoch": 7.765341352043658, + "grad_norm": 0.4031495451927185, + "learning_rate": 3.837639665097891e-05, + "loss": 0.0481, + "num_input_tokens_seen": 84863296, + "step": 69725 + }, + { + "epoch": 7.765898206927274, + "grad_norm": 0.5871042609214783, + "learning_rate": 3.83743439047943e-05, + "loss": 0.0417, + "num_input_tokens_seen": 84869312, + "step": 69730 + }, + { + "epoch": 7.766455061810892, + "grad_norm": 0.18125301599502563, + "learning_rate": 3.83722910322783e-05, + "loss": 0.0661, + "num_input_tokens_seen": 84875424, + "step": 69735 + }, + { + "epoch": 7.76701191669451, + "grad_norm": 1.2790796756744385, + "learning_rate": 3.837023803345029e-05, + "loss": 0.0988, + "num_input_tokens_seen": 84881504, + "step": 69740 + }, + { + "epoch": 7.7675687715781265, + "grad_norm": 0.019365595653653145, + "learning_rate": 3.836818490832967e-05, + "loss": 0.1228, + "num_input_tokens_seen": 84887904, + "step": 69745 + }, + { + "epoch": 7.768125626461744, + "grad_norm": 2.5056874752044678, + "learning_rate": 3.836613165693585e-05, + "loss": 0.1133, + "num_input_tokens_seen": 84894048, + "step": 69750 + }, + { + "epoch": 7.768682481345362, + "grad_norm": 1.9407672882080078, + "learning_rate": 3.836407827928818e-05, + "loss": 0.0873, + "num_input_tokens_seen": 84899936, + "step": 69755 + }, + { + "epoch": 7.769239336228979, + "grad_norm": 0.009086595848202705, + "learning_rate": 3.836202477540611e-05, + "loss": 0.016, + "num_input_tokens_seen": 84906176, + "step": 69760 + }, + { + "epoch": 7.769796191112596, + "grad_norm": 0.3690492808818817, + "learning_rate": 3.8359971145308996e-05, + "loss": 0.0269, + "num_input_tokens_seen": 84912352, + "step": 69765 + }, + { + "epoch": 7.770353045996213, + "grad_norm": 0.2662811577320099, + "learning_rate": 3.835791738901626e-05, + "loss": 0.0472, + "num_input_tokens_seen": 84918464, + "step": 69770 + }, + { + "epoch": 7.770909900879831, + "grad_norm": 1.5463545322418213, + "learning_rate": 3.835586350654728e-05, + "loss": 0.0796, + "num_input_tokens_seen": 84924608, + "step": 69775 + }, + { + "epoch": 7.7714667557634485, + "grad_norm": 0.000642402155790478, + "learning_rate": 3.835380949792147e-05, + "loss": 0.0857, + "num_input_tokens_seen": 84931072, + "step": 69780 + }, + { + "epoch": 7.772023610647065, + "grad_norm": 0.08619551360607147, + "learning_rate": 3.835175536315824e-05, + "loss": 0.0868, + "num_input_tokens_seen": 84937248, + "step": 69785 + }, + { + "epoch": 7.772580465530683, + "grad_norm": 0.11965072154998779, + "learning_rate": 3.834970110227698e-05, + "loss": 0.1282, + "num_input_tokens_seen": 84943328, + "step": 69790 + }, + { + "epoch": 7.7731373204143, + "grad_norm": 0.7227843999862671, + "learning_rate": 3.8347646715297096e-05, + "loss": 0.0902, + "num_input_tokens_seen": 84949408, + "step": 69795 + }, + { + "epoch": 7.773694175297917, + "grad_norm": 0.6595370769500732, + "learning_rate": 3.8345592202238e-05, + "loss": 0.043, + "num_input_tokens_seen": 84955680, + "step": 69800 + }, + { + "epoch": 7.774251030181535, + "grad_norm": 0.00488992128521204, + "learning_rate": 3.834353756311909e-05, + "loss": 0.0705, + "num_input_tokens_seen": 84961600, + "step": 69805 + }, + { + "epoch": 7.774807885065152, + "grad_norm": 0.06748524308204651, + "learning_rate": 3.834148279795977e-05, + "loss": 0.0094, + "num_input_tokens_seen": 84967648, + "step": 69810 + }, + { + "epoch": 7.7753647399487695, + "grad_norm": 0.7417551279067993, + "learning_rate": 3.833942790677946e-05, + "loss": 0.066, + "num_input_tokens_seen": 84973920, + "step": 69815 + }, + { + "epoch": 7.775921594832386, + "grad_norm": 1.094500184059143, + "learning_rate": 3.833737288959757e-05, + "loss": 0.0623, + "num_input_tokens_seen": 84980256, + "step": 69820 + }, + { + "epoch": 7.776478449716004, + "grad_norm": 0.7278791666030884, + "learning_rate": 3.8335317746433506e-05, + "loss": 0.0739, + "num_input_tokens_seen": 84985920, + "step": 69825 + }, + { + "epoch": 7.777035304599622, + "grad_norm": 0.6806206703186035, + "learning_rate": 3.8333262477306675e-05, + "loss": 0.0508, + "num_input_tokens_seen": 84992288, + "step": 69830 + }, + { + "epoch": 7.7775921594832385, + "grad_norm": 0.006552577950060368, + "learning_rate": 3.833120708223651e-05, + "loss": 0.0722, + "num_input_tokens_seen": 84998368, + "step": 69835 + }, + { + "epoch": 7.778149014366856, + "grad_norm": 0.43844136595726013, + "learning_rate": 3.83291515612424e-05, + "loss": 0.0587, + "num_input_tokens_seen": 85004256, + "step": 69840 + }, + { + "epoch": 7.778705869250473, + "grad_norm": 0.1345052719116211, + "learning_rate": 3.832709591434378e-05, + "loss": 0.0754, + "num_input_tokens_seen": 85010464, + "step": 69845 + }, + { + "epoch": 7.779262724134091, + "grad_norm": 0.028086170554161072, + "learning_rate": 3.832504014156006e-05, + "loss": 0.0354, + "num_input_tokens_seen": 85016768, + "step": 69850 + }, + { + "epoch": 7.779819579017708, + "grad_norm": 1.5439770221710205, + "learning_rate": 3.8322984242910674e-05, + "loss": 0.0593, + "num_input_tokens_seen": 85022784, + "step": 69855 + }, + { + "epoch": 7.780376433901325, + "grad_norm": 1.7836416959762573, + "learning_rate": 3.8320928218415005e-05, + "loss": 0.1817, + "num_input_tokens_seen": 85028992, + "step": 69860 + }, + { + "epoch": 7.780933288784943, + "grad_norm": 1.0776951313018799, + "learning_rate": 3.831887206809252e-05, + "loss": 0.1034, + "num_input_tokens_seen": 85035232, + "step": 69865 + }, + { + "epoch": 7.7814901436685595, + "grad_norm": 0.07492318749427795, + "learning_rate": 3.83168157919626e-05, + "loss": 0.0718, + "num_input_tokens_seen": 85041280, + "step": 69870 + }, + { + "epoch": 7.782046998552177, + "grad_norm": 0.28603121638298035, + "learning_rate": 3.831475939004469e-05, + "loss": 0.0242, + "num_input_tokens_seen": 85047360, + "step": 69875 + }, + { + "epoch": 7.782603853435795, + "grad_norm": 1.9967948198318481, + "learning_rate": 3.8312702862358215e-05, + "loss": 0.0913, + "num_input_tokens_seen": 85053568, + "step": 69880 + }, + { + "epoch": 7.783160708319412, + "grad_norm": 0.0003764716675505042, + "learning_rate": 3.8310646208922585e-05, + "loss": 0.0099, + "num_input_tokens_seen": 85059808, + "step": 69885 + }, + { + "epoch": 7.783717563203029, + "grad_norm": 0.01730307750403881, + "learning_rate": 3.830858942975724e-05, + "loss": 0.0322, + "num_input_tokens_seen": 85065920, + "step": 69890 + }, + { + "epoch": 7.784274418086647, + "grad_norm": 0.4347330927848816, + "learning_rate": 3.830653252488161e-05, + "loss": 0.0567, + "num_input_tokens_seen": 85072096, + "step": 69895 + }, + { + "epoch": 7.784831272970264, + "grad_norm": 0.049526408314704895, + "learning_rate": 3.8304475494315117e-05, + "loss": 0.0458, + "num_input_tokens_seen": 85078176, + "step": 69900 + }, + { + "epoch": 7.7853881278538815, + "grad_norm": 0.8756393790245056, + "learning_rate": 3.830241833807719e-05, + "loss": 0.0238, + "num_input_tokens_seen": 85084128, + "step": 69905 + }, + { + "epoch": 7.785944982737498, + "grad_norm": 0.015533595345914364, + "learning_rate": 3.830036105618727e-05, + "loss": 0.0334, + "num_input_tokens_seen": 85090304, + "step": 69910 + }, + { + "epoch": 7.786501837621116, + "grad_norm": 0.02224677987396717, + "learning_rate": 3.829830364866479e-05, + "loss": 0.015, + "num_input_tokens_seen": 85096384, + "step": 69915 + }, + { + "epoch": 7.787058692504734, + "grad_norm": 0.35167089104652405, + "learning_rate": 3.829624611552917e-05, + "loss": 0.0149, + "num_input_tokens_seen": 85102688, + "step": 69920 + }, + { + "epoch": 7.78761554738835, + "grad_norm": 0.44746455550193787, + "learning_rate": 3.829418845679985e-05, + "loss": 0.0569, + "num_input_tokens_seen": 85108640, + "step": 69925 + }, + { + "epoch": 7.788172402271968, + "grad_norm": 0.6581408381462097, + "learning_rate": 3.829213067249627e-05, + "loss": 0.0455, + "num_input_tokens_seen": 85114528, + "step": 69930 + }, + { + "epoch": 7.788729257155586, + "grad_norm": 0.9790089726448059, + "learning_rate": 3.829007276263786e-05, + "loss": 0.0348, + "num_input_tokens_seen": 85120736, + "step": 69935 + }, + { + "epoch": 7.7892861120392025, + "grad_norm": 0.03887297213077545, + "learning_rate": 3.828801472724408e-05, + "loss": 0.0239, + "num_input_tokens_seen": 85126784, + "step": 69940 + }, + { + "epoch": 7.78984296692282, + "grad_norm": 0.4869694709777832, + "learning_rate": 3.8285956566334345e-05, + "loss": 0.1054, + "num_input_tokens_seen": 85132896, + "step": 69945 + }, + { + "epoch": 7.790399821806437, + "grad_norm": 0.37308037281036377, + "learning_rate": 3.828389827992811e-05, + "loss": 0.0654, + "num_input_tokens_seen": 85138944, + "step": 69950 + }, + { + "epoch": 7.790956676690055, + "grad_norm": 1.1566933393478394, + "learning_rate": 3.828183986804481e-05, + "loss": 0.0992, + "num_input_tokens_seen": 85144576, + "step": 69955 + }, + { + "epoch": 7.791513531573672, + "grad_norm": 0.581727921962738, + "learning_rate": 3.827978133070389e-05, + "loss": 0.1549, + "num_input_tokens_seen": 85150688, + "step": 69960 + }, + { + "epoch": 7.792070386457289, + "grad_norm": 0.07856867462396622, + "learning_rate": 3.8277722667924796e-05, + "loss": 0.0116, + "num_input_tokens_seen": 85156640, + "step": 69965 + }, + { + "epoch": 7.792627241340907, + "grad_norm": 1.0167462825775146, + "learning_rate": 3.827566387972698e-05, + "loss": 0.0324, + "num_input_tokens_seen": 85162880, + "step": 69970 + }, + { + "epoch": 7.793184096224524, + "grad_norm": 0.08114047348499298, + "learning_rate": 3.8273604966129876e-05, + "loss": 0.0613, + "num_input_tokens_seen": 85168800, + "step": 69975 + }, + { + "epoch": 7.793740951108141, + "grad_norm": 0.5948168635368347, + "learning_rate": 3.8271545927152944e-05, + "loss": 0.048, + "num_input_tokens_seen": 85175040, + "step": 69980 + }, + { + "epoch": 7.794297805991759, + "grad_norm": 0.03519666939973831, + "learning_rate": 3.826948676281562e-05, + "loss": 0.0048, + "num_input_tokens_seen": 85180992, + "step": 69985 + }, + { + "epoch": 7.794854660875376, + "grad_norm": 0.1549900472164154, + "learning_rate": 3.826742747313737e-05, + "loss": 0.1208, + "num_input_tokens_seen": 85186816, + "step": 69990 + }, + { + "epoch": 7.795411515758993, + "grad_norm": 0.9526334404945374, + "learning_rate": 3.826536805813763e-05, + "loss": 0.0392, + "num_input_tokens_seen": 85192736, + "step": 69995 + }, + { + "epoch": 7.79596837064261, + "grad_norm": 0.14096789062023163, + "learning_rate": 3.826330851783587e-05, + "loss": 0.038, + "num_input_tokens_seen": 85199264, + "step": 70000 + }, + { + "epoch": 7.796525225526228, + "grad_norm": 0.2084377259016037, + "learning_rate": 3.826124885225153e-05, + "loss": 0.0546, + "num_input_tokens_seen": 85206112, + "step": 70005 + }, + { + "epoch": 7.7970820804098455, + "grad_norm": 0.08068147301673889, + "learning_rate": 3.8259189061404066e-05, + "loss": 0.0334, + "num_input_tokens_seen": 85212352, + "step": 70010 + }, + { + "epoch": 7.797638935293462, + "grad_norm": 0.01594485342502594, + "learning_rate": 3.825712914531294e-05, + "loss": 0.0269, + "num_input_tokens_seen": 85218496, + "step": 70015 + }, + { + "epoch": 7.79819579017708, + "grad_norm": 0.8214643597602844, + "learning_rate": 3.82550691039976e-05, + "loss": 0.232, + "num_input_tokens_seen": 85224160, + "step": 70020 + }, + { + "epoch": 7.798752645060697, + "grad_norm": 1.0946186780929565, + "learning_rate": 3.825300893747753e-05, + "loss": 0.0065, + "num_input_tokens_seen": 85230336, + "step": 70025 + }, + { + "epoch": 7.799309499944314, + "grad_norm": 0.3431028723716736, + "learning_rate": 3.825094864577216e-05, + "loss": 0.0144, + "num_input_tokens_seen": 85235744, + "step": 70030 + }, + { + "epoch": 7.799866354827932, + "grad_norm": 0.32115438580513, + "learning_rate": 3.824888822890097e-05, + "loss": 0.0989, + "num_input_tokens_seen": 85241792, + "step": 70035 + }, + { + "epoch": 7.800423209711549, + "grad_norm": 1.117893099784851, + "learning_rate": 3.824682768688341e-05, + "loss": 0.0492, + "num_input_tokens_seen": 85248032, + "step": 70040 + }, + { + "epoch": 7.800980064595167, + "grad_norm": 0.0006829608464613557, + "learning_rate": 3.824476701973896e-05, + "loss": 0.0573, + "num_input_tokens_seen": 85254240, + "step": 70045 + }, + { + "epoch": 7.801536919478783, + "grad_norm": 0.0008512109634466469, + "learning_rate": 3.8242706227487065e-05, + "loss": 0.0205, + "num_input_tokens_seen": 85260448, + "step": 70050 + }, + { + "epoch": 7.802093774362401, + "grad_norm": 0.672219455242157, + "learning_rate": 3.8240645310147196e-05, + "loss": 0.0693, + "num_input_tokens_seen": 85266432, + "step": 70055 + }, + { + "epoch": 7.802650629246019, + "grad_norm": 0.07625169306993484, + "learning_rate": 3.8238584267738834e-05, + "loss": 0.0399, + "num_input_tokens_seen": 85272576, + "step": 70060 + }, + { + "epoch": 7.8032074841296355, + "grad_norm": 0.05913307145237923, + "learning_rate": 3.823652310028143e-05, + "loss": 0.1245, + "num_input_tokens_seen": 85278720, + "step": 70065 + }, + { + "epoch": 7.803764339013253, + "grad_norm": 0.04360165446996689, + "learning_rate": 3.8234461807794466e-05, + "loss": 0.1728, + "num_input_tokens_seen": 85284992, + "step": 70070 + }, + { + "epoch": 7.804321193896871, + "grad_norm": 1.0248574018478394, + "learning_rate": 3.82324003902974e-05, + "loss": 0.0417, + "num_input_tokens_seen": 85291104, + "step": 70075 + }, + { + "epoch": 7.804878048780488, + "grad_norm": 0.0060949623584747314, + "learning_rate": 3.823033884780971e-05, + "loss": 0.0408, + "num_input_tokens_seen": 85297216, + "step": 70080 + }, + { + "epoch": 7.805434903664105, + "grad_norm": 0.0002003531699301675, + "learning_rate": 3.822827718035088e-05, + "loss": 0.0705, + "num_input_tokens_seen": 85303456, + "step": 70085 + }, + { + "epoch": 7.805991758547722, + "grad_norm": 0.16675128042697906, + "learning_rate": 3.822621538794037e-05, + "loss": 0.1051, + "num_input_tokens_seen": 85309600, + "step": 70090 + }, + { + "epoch": 7.80654861343134, + "grad_norm": 0.028293870389461517, + "learning_rate": 3.822415347059766e-05, + "loss": 0.0909, + "num_input_tokens_seen": 85315648, + "step": 70095 + }, + { + "epoch": 7.807105468314957, + "grad_norm": 0.326438844203949, + "learning_rate": 3.822209142834221e-05, + "loss": 0.0208, + "num_input_tokens_seen": 85321984, + "step": 70100 + }, + { + "epoch": 7.807662323198574, + "grad_norm": 0.43374285101890564, + "learning_rate": 3.8220029261193535e-05, + "loss": 0.0509, + "num_input_tokens_seen": 85328128, + "step": 70105 + }, + { + "epoch": 7.808219178082192, + "grad_norm": 0.001217217999510467, + "learning_rate": 3.821796696917108e-05, + "loss": 0.0291, + "num_input_tokens_seen": 85334144, + "step": 70110 + }, + { + "epoch": 7.80877603296581, + "grad_norm": 0.01210729405283928, + "learning_rate": 3.8215904552294334e-05, + "loss": 0.0062, + "num_input_tokens_seen": 85340160, + "step": 70115 + }, + { + "epoch": 7.809332887849426, + "grad_norm": 0.5943917036056519, + "learning_rate": 3.821384201058279e-05, + "loss": 0.0272, + "num_input_tokens_seen": 85346176, + "step": 70120 + }, + { + "epoch": 7.809889742733044, + "grad_norm": 0.18323752284049988, + "learning_rate": 3.8211779344055915e-05, + "loss": 0.084, + "num_input_tokens_seen": 85352192, + "step": 70125 + }, + { + "epoch": 7.810446597616661, + "grad_norm": 0.4866549074649811, + "learning_rate": 3.82097165527332e-05, + "loss": 0.0551, + "num_input_tokens_seen": 85357952, + "step": 70130 + }, + { + "epoch": 7.8110034525002785, + "grad_norm": 0.1006053239107132, + "learning_rate": 3.820765363663413e-05, + "loss": 0.0091, + "num_input_tokens_seen": 85363936, + "step": 70135 + }, + { + "epoch": 7.811560307383896, + "grad_norm": 1.9378108978271484, + "learning_rate": 3.820559059577819e-05, + "loss": 0.2134, + "num_input_tokens_seen": 85369856, + "step": 70140 + }, + { + "epoch": 7.812117162267513, + "grad_norm": 0.0037570055574178696, + "learning_rate": 3.8203527430184874e-05, + "loss": 0.029, + "num_input_tokens_seen": 85376256, + "step": 70145 + }, + { + "epoch": 7.812674017151131, + "grad_norm": 0.001773308264091611, + "learning_rate": 3.8201464139873646e-05, + "loss": 0.149, + "num_input_tokens_seen": 85382464, + "step": 70150 + }, + { + "epoch": 7.813230872034747, + "grad_norm": 0.008825672790408134, + "learning_rate": 3.819940072486403e-05, + "loss": 0.088, + "num_input_tokens_seen": 85388704, + "step": 70155 + }, + { + "epoch": 7.813787726918365, + "grad_norm": 0.8248987793922424, + "learning_rate": 3.8197337185175486e-05, + "loss": 0.0457, + "num_input_tokens_seen": 85394944, + "step": 70160 + }, + { + "epoch": 7.814344581801983, + "grad_norm": 0.48347532749176025, + "learning_rate": 3.8195273520827525e-05, + "loss": 0.0541, + "num_input_tokens_seen": 85400672, + "step": 70165 + }, + { + "epoch": 7.8149014366856, + "grad_norm": 1.041938066482544, + "learning_rate": 3.819320973183963e-05, + "loss": 0.0646, + "num_input_tokens_seen": 85406944, + "step": 70170 + }, + { + "epoch": 7.815458291569217, + "grad_norm": 0.04220325127243996, + "learning_rate": 3.8191145818231304e-05, + "loss": 0.1497, + "num_input_tokens_seen": 85413088, + "step": 70175 + }, + { + "epoch": 7.816015146452834, + "grad_norm": 0.009939953684806824, + "learning_rate": 3.818908178002203e-05, + "loss": 0.0922, + "num_input_tokens_seen": 85419424, + "step": 70180 + }, + { + "epoch": 7.816572001336452, + "grad_norm": 0.052749503403902054, + "learning_rate": 3.8187017617231315e-05, + "loss": 0.0483, + "num_input_tokens_seen": 85425568, + "step": 70185 + }, + { + "epoch": 7.817128856220069, + "grad_norm": 2.3176023960113525, + "learning_rate": 3.818495332987866e-05, + "loss": 0.1361, + "num_input_tokens_seen": 85431392, + "step": 70190 + }, + { + "epoch": 7.817685711103686, + "grad_norm": 0.0004441964556463063, + "learning_rate": 3.818288891798355e-05, + "loss": 0.0186, + "num_input_tokens_seen": 85437568, + "step": 70195 + }, + { + "epoch": 7.818242565987304, + "grad_norm": 2.2423417568206787, + "learning_rate": 3.818082438156549e-05, + "loss": 0.2789, + "num_input_tokens_seen": 85443584, + "step": 70200 + }, + { + "epoch": 7.818799420870921, + "grad_norm": 0.14495830237865448, + "learning_rate": 3.8178759720644e-05, + "loss": 0.0082, + "num_input_tokens_seen": 85449152, + "step": 70205 + }, + { + "epoch": 7.819356275754538, + "grad_norm": 0.30105096101760864, + "learning_rate": 3.817669493523855e-05, + "loss": 0.131, + "num_input_tokens_seen": 85455360, + "step": 70210 + }, + { + "epoch": 7.819913130638156, + "grad_norm": 1.7017515897750854, + "learning_rate": 3.8174630025368665e-05, + "loss": 0.2051, + "num_input_tokens_seen": 85461216, + "step": 70215 + }, + { + "epoch": 7.820469985521773, + "grad_norm": 1.3013644218444824, + "learning_rate": 3.817256499105384e-05, + "loss": 0.1392, + "num_input_tokens_seen": 85467328, + "step": 70220 + }, + { + "epoch": 7.82102684040539, + "grad_norm": 1.410470962524414, + "learning_rate": 3.817049983231358e-05, + "loss": 0.0858, + "num_input_tokens_seen": 85473184, + "step": 70225 + }, + { + "epoch": 7.821583695289007, + "grad_norm": 0.8672294020652771, + "learning_rate": 3.8168434549167406e-05, + "loss": 0.1338, + "num_input_tokens_seen": 85479072, + "step": 70230 + }, + { + "epoch": 7.822140550172625, + "grad_norm": 0.007441969122737646, + "learning_rate": 3.816636914163481e-05, + "loss": 0.0634, + "num_input_tokens_seen": 85485344, + "step": 70235 + }, + { + "epoch": 7.822697405056243, + "grad_norm": 0.0898580327630043, + "learning_rate": 3.8164303609735317e-05, + "loss": 0.0304, + "num_input_tokens_seen": 85491904, + "step": 70240 + }, + { + "epoch": 7.823254259939859, + "grad_norm": 0.01406295970082283, + "learning_rate": 3.816223795348842e-05, + "loss": 0.0321, + "num_input_tokens_seen": 85498016, + "step": 70245 + }, + { + "epoch": 7.823811114823477, + "grad_norm": 0.000623110041487962, + "learning_rate": 3.8160172172913656e-05, + "loss": 0.0391, + "num_input_tokens_seen": 85504320, + "step": 70250 + }, + { + "epoch": 7.824367969707095, + "grad_norm": 0.08580252528190613, + "learning_rate": 3.815810626803051e-05, + "loss": 0.0378, + "num_input_tokens_seen": 85510464, + "step": 70255 + }, + { + "epoch": 7.8249248245907115, + "grad_norm": 0.7420941591262817, + "learning_rate": 3.815604023885851e-05, + "loss": 0.0835, + "num_input_tokens_seen": 85516544, + "step": 70260 + }, + { + "epoch": 7.825481679474329, + "grad_norm": 0.20299042761325836, + "learning_rate": 3.8153974085417164e-05, + "loss": 0.0216, + "num_input_tokens_seen": 85521824, + "step": 70265 + }, + { + "epoch": 7.826038534357946, + "grad_norm": 0.04560920223593712, + "learning_rate": 3.8151907807726e-05, + "loss": 0.0762, + "num_input_tokens_seen": 85527776, + "step": 70270 + }, + { + "epoch": 7.826595389241564, + "grad_norm": 0.06221354380249977, + "learning_rate": 3.814984140580453e-05, + "loss": 0.0031, + "num_input_tokens_seen": 85533760, + "step": 70275 + }, + { + "epoch": 7.827152244125181, + "grad_norm": 0.009278045035898685, + "learning_rate": 3.8147774879672274e-05, + "loss": 0.0346, + "num_input_tokens_seen": 85539616, + "step": 70280 + }, + { + "epoch": 7.827709099008798, + "grad_norm": 1.573394536972046, + "learning_rate": 3.814570822934875e-05, + "loss": 0.0627, + "num_input_tokens_seen": 85545824, + "step": 70285 + }, + { + "epoch": 7.828265953892416, + "grad_norm": 0.8883260488510132, + "learning_rate": 3.814364145485347e-05, + "loss": 0.014, + "num_input_tokens_seen": 85551776, + "step": 70290 + }, + { + "epoch": 7.828822808776033, + "grad_norm": 0.145797461271286, + "learning_rate": 3.814157455620598e-05, + "loss": 0.0185, + "num_input_tokens_seen": 85557920, + "step": 70295 + }, + { + "epoch": 7.82937966365965, + "grad_norm": 1.4316915273666382, + "learning_rate": 3.8139507533425784e-05, + "loss": 0.1015, + "num_input_tokens_seen": 85564064, + "step": 70300 + }, + { + "epoch": 7.829936518543268, + "grad_norm": 0.011562066152691841, + "learning_rate": 3.813744038653241e-05, + "loss": 0.0434, + "num_input_tokens_seen": 85570016, + "step": 70305 + }, + { + "epoch": 7.830493373426885, + "grad_norm": 1.325432538986206, + "learning_rate": 3.813537311554539e-05, + "loss": 0.0553, + "num_input_tokens_seen": 85575456, + "step": 70310 + }, + { + "epoch": 7.831050228310502, + "grad_norm": 1.19832444190979, + "learning_rate": 3.813330572048424e-05, + "loss": 0.1579, + "num_input_tokens_seen": 85581376, + "step": 70315 + }, + { + "epoch": 7.83160708319412, + "grad_norm": 0.884308934211731, + "learning_rate": 3.81312382013685e-05, + "loss": 0.0838, + "num_input_tokens_seen": 85587680, + "step": 70320 + }, + { + "epoch": 7.832163938077737, + "grad_norm": 0.13593275845050812, + "learning_rate": 3.8129170558217696e-05, + "loss": 0.0119, + "num_input_tokens_seen": 85593888, + "step": 70325 + }, + { + "epoch": 7.8327207929613545, + "grad_norm": 0.1615716814994812, + "learning_rate": 3.812710279105135e-05, + "loss": 0.0343, + "num_input_tokens_seen": 85600192, + "step": 70330 + }, + { + "epoch": 7.833277647844971, + "grad_norm": 0.007347912527620792, + "learning_rate": 3.8125034899889014e-05, + "loss": 0.0082, + "num_input_tokens_seen": 85606560, + "step": 70335 + }, + { + "epoch": 7.833834502728589, + "grad_norm": 0.040904704481363297, + "learning_rate": 3.81229668847502e-05, + "loss": 0.0613, + "num_input_tokens_seen": 85612576, + "step": 70340 + }, + { + "epoch": 7.834391357612207, + "grad_norm": 0.16654492914676666, + "learning_rate": 3.812089874565445e-05, + "loss": 0.0086, + "num_input_tokens_seen": 85618624, + "step": 70345 + }, + { + "epoch": 7.834948212495823, + "grad_norm": 0.43056586384773254, + "learning_rate": 3.8118830482621295e-05, + "loss": 0.0115, + "num_input_tokens_seen": 85624800, + "step": 70350 + }, + { + "epoch": 7.835505067379441, + "grad_norm": 0.0012499918229877949, + "learning_rate": 3.811676209567028e-05, + "loss": 0.0359, + "num_input_tokens_seen": 85629984, + "step": 70355 + }, + { + "epoch": 7.836061922263058, + "grad_norm": 0.9506930708885193, + "learning_rate": 3.811469358482094e-05, + "loss": 0.0476, + "num_input_tokens_seen": 85636064, + "step": 70360 + }, + { + "epoch": 7.8366187771466755, + "grad_norm": 0.2898736000061035, + "learning_rate": 3.81126249500928e-05, + "loss": 0.0831, + "num_input_tokens_seen": 85642016, + "step": 70365 + }, + { + "epoch": 7.837175632030293, + "grad_norm": 0.128175288438797, + "learning_rate": 3.811055619150543e-05, + "loss": 0.104, + "num_input_tokens_seen": 85647520, + "step": 70370 + }, + { + "epoch": 7.83773248691391, + "grad_norm": 0.07034100592136383, + "learning_rate": 3.810848730907834e-05, + "loss": 0.0279, + "num_input_tokens_seen": 85653600, + "step": 70375 + }, + { + "epoch": 7.838289341797528, + "grad_norm": 0.01992238499224186, + "learning_rate": 3.810641830283109e-05, + "loss": 0.1314, + "num_input_tokens_seen": 85659840, + "step": 70380 + }, + { + "epoch": 7.8388461966811445, + "grad_norm": 0.1847870647907257, + "learning_rate": 3.8104349172783216e-05, + "loss": 0.0762, + "num_input_tokens_seen": 85666016, + "step": 70385 + }, + { + "epoch": 7.839403051564762, + "grad_norm": 0.083372563123703, + "learning_rate": 3.810227991895427e-05, + "loss": 0.094, + "num_input_tokens_seen": 85672128, + "step": 70390 + }, + { + "epoch": 7.83995990644838, + "grad_norm": 0.2933284342288971, + "learning_rate": 3.810021054136379e-05, + "loss": 0.0485, + "num_input_tokens_seen": 85678016, + "step": 70395 + }, + { + "epoch": 7.840516761331997, + "grad_norm": 0.013886326923966408, + "learning_rate": 3.809814104003132e-05, + "loss": 0.0441, + "num_input_tokens_seen": 85684192, + "step": 70400 + }, + { + "epoch": 7.841073616215614, + "grad_norm": 0.17511458694934845, + "learning_rate": 3.809607141497642e-05, + "loss": 0.0397, + "num_input_tokens_seen": 85689600, + "step": 70405 + }, + { + "epoch": 7.841630471099231, + "grad_norm": 1.374794840812683, + "learning_rate": 3.809400166621863e-05, + "loss": 0.0615, + "num_input_tokens_seen": 85695904, + "step": 70410 + }, + { + "epoch": 7.842187325982849, + "grad_norm": 0.05111677199602127, + "learning_rate": 3.809193179377751e-05, + "loss": 0.0362, + "num_input_tokens_seen": 85702016, + "step": 70415 + }, + { + "epoch": 7.842744180866466, + "grad_norm": 0.8675469756126404, + "learning_rate": 3.80898617976726e-05, + "loss": 0.0486, + "num_input_tokens_seen": 85708160, + "step": 70420 + }, + { + "epoch": 7.843301035750083, + "grad_norm": 0.06520388275384903, + "learning_rate": 3.808779167792345e-05, + "loss": 0.0411, + "num_input_tokens_seen": 85714400, + "step": 70425 + }, + { + "epoch": 7.843857890633701, + "grad_norm": 0.506784975528717, + "learning_rate": 3.808572143454964e-05, + "loss": 0.0201, + "num_input_tokens_seen": 85720896, + "step": 70430 + }, + { + "epoch": 7.8444147455173185, + "grad_norm": 0.900547981262207, + "learning_rate": 3.808365106757069e-05, + "loss": 0.0422, + "num_input_tokens_seen": 85727008, + "step": 70435 + }, + { + "epoch": 7.844971600400935, + "grad_norm": 0.2291111946105957, + "learning_rate": 3.808158057700618e-05, + "loss": 0.0386, + "num_input_tokens_seen": 85733152, + "step": 70440 + }, + { + "epoch": 7.845528455284553, + "grad_norm": 0.011610534973442554, + "learning_rate": 3.807950996287566e-05, + "loss": 0.0338, + "num_input_tokens_seen": 85739456, + "step": 70445 + }, + { + "epoch": 7.84608531016817, + "grad_norm": 1.642533540725708, + "learning_rate": 3.8077439225198694e-05, + "loss": 0.1896, + "num_input_tokens_seen": 85745632, + "step": 70450 + }, + { + "epoch": 7.8466421650517875, + "grad_norm": 1.1580588817596436, + "learning_rate": 3.8075368363994835e-05, + "loss": 0.0746, + "num_input_tokens_seen": 85751616, + "step": 70455 + }, + { + "epoch": 7.847199019935405, + "grad_norm": 0.0006151003181003034, + "learning_rate": 3.807329737928363e-05, + "loss": 0.092, + "num_input_tokens_seen": 85757856, + "step": 70460 + }, + { + "epoch": 7.847755874819022, + "grad_norm": 0.11429109424352646, + "learning_rate": 3.807122627108468e-05, + "loss": 0.0179, + "num_input_tokens_seen": 85764032, + "step": 70465 + }, + { + "epoch": 7.84831272970264, + "grad_norm": 0.6577939391136169, + "learning_rate": 3.806915503941751e-05, + "loss": 0.0482, + "num_input_tokens_seen": 85770208, + "step": 70470 + }, + { + "epoch": 7.848869584586257, + "grad_norm": 0.005551662761718035, + "learning_rate": 3.8067083684301716e-05, + "loss": 0.1026, + "num_input_tokens_seen": 85776512, + "step": 70475 + }, + { + "epoch": 7.849426439469874, + "grad_norm": 0.06051284819841385, + "learning_rate": 3.8065012205756834e-05, + "loss": 0.0056, + "num_input_tokens_seen": 85782976, + "step": 70480 + }, + { + "epoch": 7.849983294353492, + "grad_norm": 0.4531280994415283, + "learning_rate": 3.8062940603802456e-05, + "loss": 0.0458, + "num_input_tokens_seen": 85789280, + "step": 70485 + }, + { + "epoch": 7.8505401492371085, + "grad_norm": 1.646456003189087, + "learning_rate": 3.806086887845812e-05, + "loss": 0.082, + "num_input_tokens_seen": 85795424, + "step": 70490 + }, + { + "epoch": 7.851097004120726, + "grad_norm": 0.0961497351527214, + "learning_rate": 3.805879702974343e-05, + "loss": 0.0301, + "num_input_tokens_seen": 85801152, + "step": 70495 + }, + { + "epoch": 7.851653859004344, + "grad_norm": 0.0027986252680420876, + "learning_rate": 3.8056725057677935e-05, + "loss": 0.0463, + "num_input_tokens_seen": 85807360, + "step": 70500 + }, + { + "epoch": 7.852210713887961, + "grad_norm": 0.014946416020393372, + "learning_rate": 3.80546529622812e-05, + "loss": 0.0284, + "num_input_tokens_seen": 85813408, + "step": 70505 + }, + { + "epoch": 7.852767568771578, + "grad_norm": 0.017353655770421028, + "learning_rate": 3.805258074357283e-05, + "loss": 0.0307, + "num_input_tokens_seen": 85818688, + "step": 70510 + }, + { + "epoch": 7.853324423655195, + "grad_norm": 0.43120312690734863, + "learning_rate": 3.805050840157236e-05, + "loss": 0.044, + "num_input_tokens_seen": 85825056, + "step": 70515 + }, + { + "epoch": 7.853881278538813, + "grad_norm": 0.7087199091911316, + "learning_rate": 3.804843593629938e-05, + "loss": 0.0884, + "num_input_tokens_seen": 85830336, + "step": 70520 + }, + { + "epoch": 7.8544381334224305, + "grad_norm": 0.06770902872085571, + "learning_rate": 3.804636334777348e-05, + "loss": 0.011, + "num_input_tokens_seen": 85836576, + "step": 70525 + }, + { + "epoch": 7.854994988306047, + "grad_norm": 0.5798407196998596, + "learning_rate": 3.804429063601422e-05, + "loss": 0.0377, + "num_input_tokens_seen": 85842528, + "step": 70530 + }, + { + "epoch": 7.855551843189665, + "grad_norm": 1.2055485248565674, + "learning_rate": 3.8042217801041186e-05, + "loss": 0.1527, + "num_input_tokens_seen": 85848544, + "step": 70535 + }, + { + "epoch": 7.856108698073282, + "grad_norm": 0.2574886679649353, + "learning_rate": 3.804014484287396e-05, + "loss": 0.0106, + "num_input_tokens_seen": 85854624, + "step": 70540 + }, + { + "epoch": 7.856665552956899, + "grad_norm": 0.02581559307873249, + "learning_rate": 3.8038071761532105e-05, + "loss": 0.1641, + "num_input_tokens_seen": 85861024, + "step": 70545 + }, + { + "epoch": 7.857222407840517, + "grad_norm": 0.014509675092995167, + "learning_rate": 3.803599855703523e-05, + "loss": 0.0202, + "num_input_tokens_seen": 85867072, + "step": 70550 + }, + { + "epoch": 7.857779262724134, + "grad_norm": 0.6842885613441467, + "learning_rate": 3.803392522940289e-05, + "loss": 0.06, + "num_input_tokens_seen": 85873088, + "step": 70555 + }, + { + "epoch": 7.8583361176077515, + "grad_norm": 1.3134461641311646, + "learning_rate": 3.80318517786547e-05, + "loss": 0.1112, + "num_input_tokens_seen": 85879136, + "step": 70560 + }, + { + "epoch": 7.858892972491368, + "grad_norm": 0.27051252126693726, + "learning_rate": 3.8029778204810215e-05, + "loss": 0.0781, + "num_input_tokens_seen": 85885536, + "step": 70565 + }, + { + "epoch": 7.859449827374986, + "grad_norm": 0.22640949487686157, + "learning_rate": 3.8027704507889045e-05, + "loss": 0.0479, + "num_input_tokens_seen": 85891616, + "step": 70570 + }, + { + "epoch": 7.860006682258604, + "grad_norm": 0.05602623522281647, + "learning_rate": 3.802563068791076e-05, + "loss": 0.1583, + "num_input_tokens_seen": 85897632, + "step": 70575 + }, + { + "epoch": 7.8605635371422204, + "grad_norm": 0.0403035506606102, + "learning_rate": 3.802355674489497e-05, + "loss": 0.0711, + "num_input_tokens_seen": 85903744, + "step": 70580 + }, + { + "epoch": 7.861120392025838, + "grad_norm": 0.7527206540107727, + "learning_rate": 3.802148267886124e-05, + "loss": 0.0583, + "num_input_tokens_seen": 85909984, + "step": 70585 + }, + { + "epoch": 7.861677246909455, + "grad_norm": 0.1791929006576538, + "learning_rate": 3.801940848982918e-05, + "loss": 0.0402, + "num_input_tokens_seen": 85916256, + "step": 70590 + }, + { + "epoch": 7.862234101793073, + "grad_norm": 0.212907612323761, + "learning_rate": 3.801733417781838e-05, + "loss": 0.1366, + "num_input_tokens_seen": 85922240, + "step": 70595 + }, + { + "epoch": 7.86279095667669, + "grad_norm": 0.126901775598526, + "learning_rate": 3.801525974284842e-05, + "loss": 0.114, + "num_input_tokens_seen": 85928160, + "step": 70600 + }, + { + "epoch": 7.863347811560307, + "grad_norm": 0.2512640953063965, + "learning_rate": 3.8013185184938907e-05, + "loss": 0.0141, + "num_input_tokens_seen": 85934016, + "step": 70605 + }, + { + "epoch": 7.863904666443925, + "grad_norm": 0.0006129307439550757, + "learning_rate": 3.801111050410943e-05, + "loss": 0.0938, + "num_input_tokens_seen": 85940000, + "step": 70610 + }, + { + "epoch": 7.864461521327542, + "grad_norm": 0.8313866853713989, + "learning_rate": 3.80090357003796e-05, + "loss": 0.0313, + "num_input_tokens_seen": 85946368, + "step": 70615 + }, + { + "epoch": 7.865018376211159, + "grad_norm": 0.0874609500169754, + "learning_rate": 3.8006960773768996e-05, + "loss": 0.0389, + "num_input_tokens_seen": 85952448, + "step": 70620 + }, + { + "epoch": 7.865575231094777, + "grad_norm": 1.1867780685424805, + "learning_rate": 3.8004885724297234e-05, + "loss": 0.0918, + "num_input_tokens_seen": 85958848, + "step": 70625 + }, + { + "epoch": 7.8661320859783945, + "grad_norm": 0.0003950130194425583, + "learning_rate": 3.80028105519839e-05, + "loss": 0.0339, + "num_input_tokens_seen": 85965184, + "step": 70630 + }, + { + "epoch": 7.866688940862011, + "grad_norm": 2.0688161849975586, + "learning_rate": 3.8000735256848605e-05, + "loss": 0.2246, + "num_input_tokens_seen": 85971136, + "step": 70635 + }, + { + "epoch": 7.867245795745629, + "grad_norm": 0.045007407665252686, + "learning_rate": 3.799865983891095e-05, + "loss": 0.0768, + "num_input_tokens_seen": 85976992, + "step": 70640 + }, + { + "epoch": 7.867802650629246, + "grad_norm": 0.06978819519281387, + "learning_rate": 3.799658429819054e-05, + "loss": 0.0487, + "num_input_tokens_seen": 85983136, + "step": 70645 + }, + { + "epoch": 7.8683595055128634, + "grad_norm": 0.0005378134665079415, + "learning_rate": 3.7994508634706973e-05, + "loss": 0.0459, + "num_input_tokens_seen": 85989504, + "step": 70650 + }, + { + "epoch": 7.868916360396481, + "grad_norm": 1.5414942502975464, + "learning_rate": 3.799243284847987e-05, + "loss": 0.1063, + "num_input_tokens_seen": 85995808, + "step": 70655 + }, + { + "epoch": 7.869473215280098, + "grad_norm": 0.0009186594397760928, + "learning_rate": 3.7990356939528824e-05, + "loss": 0.0792, + "num_input_tokens_seen": 86002176, + "step": 70660 + }, + { + "epoch": 7.870030070163716, + "grad_norm": 0.2653346657752991, + "learning_rate": 3.7988280907873456e-05, + "loss": 0.0965, + "num_input_tokens_seen": 86007776, + "step": 70665 + }, + { + "epoch": 7.870586925047332, + "grad_norm": 0.012921658344566822, + "learning_rate": 3.7986204753533354e-05, + "loss": 0.0374, + "num_input_tokens_seen": 86013792, + "step": 70670 + }, + { + "epoch": 7.87114377993095, + "grad_norm": 0.13593803346157074, + "learning_rate": 3.798412847652815e-05, + "loss": 0.1026, + "num_input_tokens_seen": 86019392, + "step": 70675 + }, + { + "epoch": 7.871700634814568, + "grad_norm": 0.004261313937604427, + "learning_rate": 3.7982052076877454e-05, + "loss": 0.1321, + "num_input_tokens_seen": 86025472, + "step": 70680 + }, + { + "epoch": 7.8722574896981845, + "grad_norm": 1.0113548040390015, + "learning_rate": 3.797997555460087e-05, + "loss": 0.0559, + "num_input_tokens_seen": 86031904, + "step": 70685 + }, + { + "epoch": 7.872814344581802, + "grad_norm": 1.1722633838653564, + "learning_rate": 3.797789890971802e-05, + "loss": 0.0577, + "num_input_tokens_seen": 86038208, + "step": 70690 + }, + { + "epoch": 7.873371199465419, + "grad_norm": 1.0163494348526, + "learning_rate": 3.797582214224852e-05, + "loss": 0.071, + "num_input_tokens_seen": 86044192, + "step": 70695 + }, + { + "epoch": 7.873928054349037, + "grad_norm": 0.0004683523147832602, + "learning_rate": 3.7973745252211977e-05, + "loss": 0.0553, + "num_input_tokens_seen": 86050304, + "step": 70700 + }, + { + "epoch": 7.874484909232654, + "grad_norm": 0.7868080735206604, + "learning_rate": 3.797166823962802e-05, + "loss": 0.025, + "num_input_tokens_seen": 86056576, + "step": 70705 + }, + { + "epoch": 7.875041764116271, + "grad_norm": 0.6202961206436157, + "learning_rate": 3.796959110451627e-05, + "loss": 0.0489, + "num_input_tokens_seen": 86062752, + "step": 70710 + }, + { + "epoch": 7.875598618999889, + "grad_norm": 0.07470150291919708, + "learning_rate": 3.796751384689634e-05, + "loss": 0.0516, + "num_input_tokens_seen": 86069120, + "step": 70715 + }, + { + "epoch": 7.876155473883506, + "grad_norm": 0.11830184608697891, + "learning_rate": 3.796543646678784e-05, + "loss": 0.0473, + "num_input_tokens_seen": 86075488, + "step": 70720 + }, + { + "epoch": 7.876712328767123, + "grad_norm": 0.05824883282184601, + "learning_rate": 3.7963358964210416e-05, + "loss": 0.0929, + "num_input_tokens_seen": 86080832, + "step": 70725 + }, + { + "epoch": 7.877269183650741, + "grad_norm": 0.5719242691993713, + "learning_rate": 3.796128133918367e-05, + "loss": 0.0875, + "num_input_tokens_seen": 86086880, + "step": 70730 + }, + { + "epoch": 7.877826038534358, + "grad_norm": 0.00956591498106718, + "learning_rate": 3.7959203591727245e-05, + "loss": 0.0034, + "num_input_tokens_seen": 86092736, + "step": 70735 + }, + { + "epoch": 7.878382893417975, + "grad_norm": 0.2019215077161789, + "learning_rate": 3.795712572186076e-05, + "loss": 0.0495, + "num_input_tokens_seen": 86098976, + "step": 70740 + }, + { + "epoch": 7.878939748301592, + "grad_norm": 1.7821900844573975, + "learning_rate": 3.795504772960384e-05, + "loss": 0.1337, + "num_input_tokens_seen": 86105216, + "step": 70745 + }, + { + "epoch": 7.87949660318521, + "grad_norm": 0.39509859681129456, + "learning_rate": 3.795296961497611e-05, + "loss": 0.0202, + "num_input_tokens_seen": 86111296, + "step": 70750 + }, + { + "epoch": 7.8800534580688275, + "grad_norm": 0.006286434829235077, + "learning_rate": 3.795089137799721e-05, + "loss": 0.0363, + "num_input_tokens_seen": 86117376, + "step": 70755 + }, + { + "epoch": 7.880610312952444, + "grad_norm": 0.35528239607810974, + "learning_rate": 3.794881301868677e-05, + "loss": 0.0438, + "num_input_tokens_seen": 86123424, + "step": 70760 + }, + { + "epoch": 7.881167167836062, + "grad_norm": 0.6207554936408997, + "learning_rate": 3.7946734537064405e-05, + "loss": 0.0607, + "num_input_tokens_seen": 86129440, + "step": 70765 + }, + { + "epoch": 7.881724022719679, + "grad_norm": 0.4950978457927704, + "learning_rate": 3.7944655933149763e-05, + "loss": 0.0875, + "num_input_tokens_seen": 86135552, + "step": 70770 + }, + { + "epoch": 7.882280877603296, + "grad_norm": 1.4135793447494507, + "learning_rate": 3.7942577206962474e-05, + "loss": 0.0874, + "num_input_tokens_seen": 86141760, + "step": 70775 + }, + { + "epoch": 7.882837732486914, + "grad_norm": 0.005616502836346626, + "learning_rate": 3.794049835852218e-05, + "loss": 0.0707, + "num_input_tokens_seen": 86147936, + "step": 70780 + }, + { + "epoch": 7.883394587370531, + "grad_norm": 0.07307079434394836, + "learning_rate": 3.793841938784851e-05, + "loss": 0.0209, + "num_input_tokens_seen": 86154080, + "step": 70785 + }, + { + "epoch": 7.883951442254149, + "grad_norm": 0.06880097091197968, + "learning_rate": 3.793634029496109e-05, + "loss": 0.0917, + "num_input_tokens_seen": 86160192, + "step": 70790 + }, + { + "epoch": 7.884508297137766, + "grad_norm": 1.649465560913086, + "learning_rate": 3.7934261079879585e-05, + "loss": 0.0336, + "num_input_tokens_seen": 86165824, + "step": 70795 + }, + { + "epoch": 7.885065152021383, + "grad_norm": 0.6549158096313477, + "learning_rate": 3.793218174262362e-05, + "loss": 0.0216, + "num_input_tokens_seen": 86171840, + "step": 70800 + }, + { + "epoch": 7.885622006905001, + "grad_norm": 0.03021017089486122, + "learning_rate": 3.793010228321283e-05, + "loss": 0.0299, + "num_input_tokens_seen": 86177568, + "step": 70805 + }, + { + "epoch": 7.886178861788618, + "grad_norm": 0.07202602177858353, + "learning_rate": 3.7928022701666874e-05, + "loss": 0.0196, + "num_input_tokens_seen": 86183904, + "step": 70810 + }, + { + "epoch": 7.886735716672235, + "grad_norm": 0.09561755508184433, + "learning_rate": 3.792594299800538e-05, + "loss": 0.0677, + "num_input_tokens_seen": 86189824, + "step": 70815 + }, + { + "epoch": 7.887292571555853, + "grad_norm": 0.0006989472894929349, + "learning_rate": 3.7923863172248e-05, + "loss": 0.063, + "num_input_tokens_seen": 86196032, + "step": 70820 + }, + { + "epoch": 7.88784942643947, + "grad_norm": 0.009960591793060303, + "learning_rate": 3.792178322441437e-05, + "loss": 0.0303, + "num_input_tokens_seen": 86202048, + "step": 70825 + }, + { + "epoch": 7.888406281323087, + "grad_norm": 0.03983113914728165, + "learning_rate": 3.7919703154524157e-05, + "loss": 0.1012, + "num_input_tokens_seen": 86208384, + "step": 70830 + }, + { + "epoch": 7.888963136206705, + "grad_norm": 1.3905092477798462, + "learning_rate": 3.7917622962597e-05, + "loss": 0.0577, + "num_input_tokens_seen": 86214592, + "step": 70835 + }, + { + "epoch": 7.889519991090322, + "grad_norm": 1.8610620498657227, + "learning_rate": 3.791554264865253e-05, + "loss": 0.1032, + "num_input_tokens_seen": 86220800, + "step": 70840 + }, + { + "epoch": 7.890076845973939, + "grad_norm": 0.08192306011915207, + "learning_rate": 3.791346221271043e-05, + "loss": 0.0945, + "num_input_tokens_seen": 86227072, + "step": 70845 + }, + { + "epoch": 7.890633700857556, + "grad_norm": 0.009907614439725876, + "learning_rate": 3.7911381654790315e-05, + "loss": 0.0135, + "num_input_tokens_seen": 86233248, + "step": 70850 + }, + { + "epoch": 7.891190555741174, + "grad_norm": 0.004929722752422094, + "learning_rate": 3.790930097491186e-05, + "loss": 0.0978, + "num_input_tokens_seen": 86239424, + "step": 70855 + }, + { + "epoch": 7.891747410624792, + "grad_norm": 0.24142591655254364, + "learning_rate": 3.7907220173094717e-05, + "loss": 0.0761, + "num_input_tokens_seen": 86245600, + "step": 70860 + }, + { + "epoch": 7.892304265508408, + "grad_norm": 0.007131167221814394, + "learning_rate": 3.790513924935854e-05, + "loss": 0.0385, + "num_input_tokens_seen": 86252064, + "step": 70865 + }, + { + "epoch": 7.892861120392026, + "grad_norm": 0.5353394150733948, + "learning_rate": 3.790305820372298e-05, + "loss": 0.1016, + "num_input_tokens_seen": 86258016, + "step": 70870 + }, + { + "epoch": 7.893417975275643, + "grad_norm": 0.13739065825939178, + "learning_rate": 3.79009770362077e-05, + "loss": 0.073, + "num_input_tokens_seen": 86264256, + "step": 70875 + }, + { + "epoch": 7.8939748301592605, + "grad_norm": 0.24810734391212463, + "learning_rate": 3.7898895746832355e-05, + "loss": 0.028, + "num_input_tokens_seen": 86269888, + "step": 70880 + }, + { + "epoch": 7.894531685042878, + "grad_norm": 0.39209669828414917, + "learning_rate": 3.78968143356166e-05, + "loss": 0.0593, + "num_input_tokens_seen": 86275840, + "step": 70885 + }, + { + "epoch": 7.895088539926495, + "grad_norm": 0.09695035964250565, + "learning_rate": 3.789473280258011e-05, + "loss": 0.036, + "num_input_tokens_seen": 86282176, + "step": 70890 + }, + { + "epoch": 7.895645394810113, + "grad_norm": 0.06443163007497787, + "learning_rate": 3.789265114774254e-05, + "loss": 0.1807, + "num_input_tokens_seen": 86288000, + "step": 70895 + }, + { + "epoch": 7.896202249693729, + "grad_norm": 0.006939534563571215, + "learning_rate": 3.789056937112354e-05, + "loss": 0.0484, + "num_input_tokens_seen": 86293632, + "step": 70900 + }, + { + "epoch": 7.896759104577347, + "grad_norm": 0.1924232840538025, + "learning_rate": 3.7888487472742796e-05, + "loss": 0.0519, + "num_input_tokens_seen": 86299328, + "step": 70905 + }, + { + "epoch": 7.897315959460965, + "grad_norm": 0.36977359652519226, + "learning_rate": 3.788640545261995e-05, + "loss": 0.0344, + "num_input_tokens_seen": 86305920, + "step": 70910 + }, + { + "epoch": 7.8978728143445815, + "grad_norm": 0.1704505980014801, + "learning_rate": 3.788432331077469e-05, + "loss": 0.0442, + "num_input_tokens_seen": 86312320, + "step": 70915 + }, + { + "epoch": 7.898429669228199, + "grad_norm": 0.4327191412448883, + "learning_rate": 3.788224104722666e-05, + "loss": 0.0212, + "num_input_tokens_seen": 86318528, + "step": 70920 + }, + { + "epoch": 7.898986524111816, + "grad_norm": 0.007597814779728651, + "learning_rate": 3.788015866199555e-05, + "loss": 0.1456, + "num_input_tokens_seen": 86324608, + "step": 70925 + }, + { + "epoch": 7.899543378995434, + "grad_norm": 0.2710687220096588, + "learning_rate": 3.787807615510103e-05, + "loss": 0.0272, + "num_input_tokens_seen": 86331040, + "step": 70930 + }, + { + "epoch": 7.900100233879051, + "grad_norm": 1.3954576253890991, + "learning_rate": 3.787599352656275e-05, + "loss": 0.106, + "num_input_tokens_seen": 86337152, + "step": 70935 + }, + { + "epoch": 7.900657088762668, + "grad_norm": 0.04235179349780083, + "learning_rate": 3.7873910776400405e-05, + "loss": 0.0373, + "num_input_tokens_seen": 86343392, + "step": 70940 + }, + { + "epoch": 7.901213943646286, + "grad_norm": 0.02327943593263626, + "learning_rate": 3.787182790463365e-05, + "loss": 0.0034, + "num_input_tokens_seen": 86349824, + "step": 70945 + }, + { + "epoch": 7.9017707985299035, + "grad_norm": 0.011811381205916405, + "learning_rate": 3.786974491128218e-05, + "loss": 0.0354, + "num_input_tokens_seen": 86356032, + "step": 70950 + }, + { + "epoch": 7.90232765341352, + "grad_norm": 0.27331411838531494, + "learning_rate": 3.786766179636564e-05, + "loss": 0.0204, + "num_input_tokens_seen": 86362016, + "step": 70955 + }, + { + "epoch": 7.902884508297138, + "grad_norm": 0.06331409513950348, + "learning_rate": 3.786557855990374e-05, + "loss": 0.0185, + "num_input_tokens_seen": 86368096, + "step": 70960 + }, + { + "epoch": 7.903441363180755, + "grad_norm": 0.5592966079711914, + "learning_rate": 3.786349520191614e-05, + "loss": 0.0209, + "num_input_tokens_seen": 86374208, + "step": 70965 + }, + { + "epoch": 7.903998218064372, + "grad_norm": 0.18487350642681122, + "learning_rate": 3.7861411722422515e-05, + "loss": 0.0185, + "num_input_tokens_seen": 86380608, + "step": 70970 + }, + { + "epoch": 7.90455507294799, + "grad_norm": 0.0037339760456234217, + "learning_rate": 3.785932812144256e-05, + "loss": 0.0266, + "num_input_tokens_seen": 86386688, + "step": 70975 + }, + { + "epoch": 7.905111927831607, + "grad_norm": 0.5240460634231567, + "learning_rate": 3.785724439899594e-05, + "loss": 0.0133, + "num_input_tokens_seen": 86392800, + "step": 70980 + }, + { + "epoch": 7.9056687827152246, + "grad_norm": 0.22531382739543915, + "learning_rate": 3.785516055510235e-05, + "loss": 0.0643, + "num_input_tokens_seen": 86398912, + "step": 70985 + }, + { + "epoch": 7.906225637598842, + "grad_norm": 1.1235014200210571, + "learning_rate": 3.785307658978147e-05, + "loss": 0.1652, + "num_input_tokens_seen": 86404992, + "step": 70990 + }, + { + "epoch": 7.906782492482459, + "grad_norm": 0.024062130600214005, + "learning_rate": 3.785099250305298e-05, + "loss": 0.0471, + "num_input_tokens_seen": 86410784, + "step": 70995 + }, + { + "epoch": 7.907339347366077, + "grad_norm": 0.17420174181461334, + "learning_rate": 3.784890829493658e-05, + "loss": 0.0053, + "num_input_tokens_seen": 86417024, + "step": 71000 + }, + { + "epoch": 7.9078962022496935, + "grad_norm": 0.25709131360054016, + "learning_rate": 3.7846823965451936e-05, + "loss": 0.1334, + "num_input_tokens_seen": 86422528, + "step": 71005 + }, + { + "epoch": 7.908453057133311, + "grad_norm": 0.1319061666727066, + "learning_rate": 3.784473951461876e-05, + "loss": 0.0054, + "num_input_tokens_seen": 86428512, + "step": 71010 + }, + { + "epoch": 7.909009912016929, + "grad_norm": 0.27635353803634644, + "learning_rate": 3.7842654942456715e-05, + "loss": 0.0605, + "num_input_tokens_seen": 86434752, + "step": 71015 + }, + { + "epoch": 7.909566766900546, + "grad_norm": 0.12864813208580017, + "learning_rate": 3.784057024898551e-05, + "loss": 0.0237, + "num_input_tokens_seen": 86441056, + "step": 71020 + }, + { + "epoch": 7.910123621784163, + "grad_norm": 1.1150619983673096, + "learning_rate": 3.783848543422483e-05, + "loss": 0.054, + "num_input_tokens_seen": 86447200, + "step": 71025 + }, + { + "epoch": 7.91068047666778, + "grad_norm": 0.12308251857757568, + "learning_rate": 3.783640049819437e-05, + "loss": 0.0517, + "num_input_tokens_seen": 86453056, + "step": 71030 + }, + { + "epoch": 7.911237331551398, + "grad_norm": 0.1628912389278412, + "learning_rate": 3.7834315440913825e-05, + "loss": 0.0066, + "num_input_tokens_seen": 86459392, + "step": 71035 + }, + { + "epoch": 7.911794186435015, + "grad_norm": 1.6767778396606445, + "learning_rate": 3.783223026240288e-05, + "loss": 0.0492, + "num_input_tokens_seen": 86465088, + "step": 71040 + }, + { + "epoch": 7.912351041318632, + "grad_norm": 0.6648775935173035, + "learning_rate": 3.7830144962681245e-05, + "loss": 0.0438, + "num_input_tokens_seen": 86470944, + "step": 71045 + }, + { + "epoch": 7.91290789620225, + "grad_norm": 0.2037826031446457, + "learning_rate": 3.7828059541768615e-05, + "loss": 0.0962, + "num_input_tokens_seen": 86476896, + "step": 71050 + }, + { + "epoch": 7.913464751085867, + "grad_norm": 1.5184791088104248, + "learning_rate": 3.782597399968467e-05, + "loss": 0.0802, + "num_input_tokens_seen": 86483232, + "step": 71055 + }, + { + "epoch": 7.914021605969484, + "grad_norm": 1.234312653541565, + "learning_rate": 3.782388833644914e-05, + "loss": 0.1104, + "num_input_tokens_seen": 86489312, + "step": 71060 + }, + { + "epoch": 7.914578460853102, + "grad_norm": 0.04406433552503586, + "learning_rate": 3.7821802552081706e-05, + "loss": 0.0277, + "num_input_tokens_seen": 86495808, + "step": 71065 + }, + { + "epoch": 7.915135315736719, + "grad_norm": 0.34841620922088623, + "learning_rate": 3.781971664660207e-05, + "loss": 0.031, + "num_input_tokens_seen": 86502080, + "step": 71070 + }, + { + "epoch": 7.9156921706203365, + "grad_norm": 0.6527409553527832, + "learning_rate": 3.781763062002995e-05, + "loss": 0.0497, + "num_input_tokens_seen": 86508480, + "step": 71075 + }, + { + "epoch": 7.916249025503953, + "grad_norm": 1.1827257871627808, + "learning_rate": 3.781554447238503e-05, + "loss": 0.0985, + "num_input_tokens_seen": 86514688, + "step": 71080 + }, + { + "epoch": 7.916805880387571, + "grad_norm": 0.5490688681602478, + "learning_rate": 3.781345820368703e-05, + "loss": 0.0183, + "num_input_tokens_seen": 86520832, + "step": 71085 + }, + { + "epoch": 7.917362735271189, + "grad_norm": 0.8906384110450745, + "learning_rate": 3.781137181395564e-05, + "loss": 0.041, + "num_input_tokens_seen": 86527104, + "step": 71090 + }, + { + "epoch": 7.917919590154805, + "grad_norm": 0.04904567822813988, + "learning_rate": 3.78092853032106e-05, + "loss": 0.0232, + "num_input_tokens_seen": 86533152, + "step": 71095 + }, + { + "epoch": 7.918476445038423, + "grad_norm": 0.002489604288712144, + "learning_rate": 3.780719867147158e-05, + "loss": 0.1455, + "num_input_tokens_seen": 86539264, + "step": 71100 + }, + { + "epoch": 7.91903329992204, + "grad_norm": 0.023655591532588005, + "learning_rate": 3.7805111918758306e-05, + "loss": 0.1743, + "num_input_tokens_seen": 86545248, + "step": 71105 + }, + { + "epoch": 7.9195901548056575, + "grad_norm": 0.007430602330714464, + "learning_rate": 3.7803025045090503e-05, + "loss": 0.0264, + "num_input_tokens_seen": 86551712, + "step": 71110 + }, + { + "epoch": 7.920147009689275, + "grad_norm": 0.0012549428502097726, + "learning_rate": 3.780093805048787e-05, + "loss": 0.0669, + "num_input_tokens_seen": 86557664, + "step": 71115 + }, + { + "epoch": 7.920703864572892, + "grad_norm": 1.2440911531448364, + "learning_rate": 3.779885093497011e-05, + "loss": 0.0942, + "num_input_tokens_seen": 86563328, + "step": 71120 + }, + { + "epoch": 7.92126071945651, + "grad_norm": 0.7315720319747925, + "learning_rate": 3.779676369855696e-05, + "loss": 0.0383, + "num_input_tokens_seen": 86568768, + "step": 71125 + }, + { + "epoch": 7.921817574340127, + "grad_norm": 1.462443232536316, + "learning_rate": 3.779467634126812e-05, + "loss": 0.1572, + "num_input_tokens_seen": 86575072, + "step": 71130 + }, + { + "epoch": 7.922374429223744, + "grad_norm": 0.827900767326355, + "learning_rate": 3.77925888631233e-05, + "loss": 0.0675, + "num_input_tokens_seen": 86580672, + "step": 71135 + }, + { + "epoch": 7.922931284107362, + "grad_norm": 0.0074846576899290085, + "learning_rate": 3.7790501264142244e-05, + "loss": 0.0667, + "num_input_tokens_seen": 86586048, + "step": 71140 + }, + { + "epoch": 7.923488138990979, + "grad_norm": 0.3819248378276825, + "learning_rate": 3.7788413544344654e-05, + "loss": 0.0605, + "num_input_tokens_seen": 86592160, + "step": 71145 + }, + { + "epoch": 7.924044993874596, + "grad_norm": 0.4717102348804474, + "learning_rate": 3.7786325703750246e-05, + "loss": 0.0398, + "num_input_tokens_seen": 86598528, + "step": 71150 + }, + { + "epoch": 7.924601848758214, + "grad_norm": 0.9827039837837219, + "learning_rate": 3.778423774237875e-05, + "loss": 0.0445, + "num_input_tokens_seen": 86604736, + "step": 71155 + }, + { + "epoch": 7.925158703641831, + "grad_norm": 0.148529514670372, + "learning_rate": 3.778214966024989e-05, + "loss": 0.0145, + "num_input_tokens_seen": 86610976, + "step": 71160 + }, + { + "epoch": 7.925715558525448, + "grad_norm": 0.005837930366396904, + "learning_rate": 3.7780061457383386e-05, + "loss": 0.0466, + "num_input_tokens_seen": 86617184, + "step": 71165 + }, + { + "epoch": 7.926272413409066, + "grad_norm": 0.3928084671497345, + "learning_rate": 3.777797313379895e-05, + "loss": 0.0887, + "num_input_tokens_seen": 86623456, + "step": 71170 + }, + { + "epoch": 7.926829268292683, + "grad_norm": 0.04248358681797981, + "learning_rate": 3.777588468951633e-05, + "loss": 0.0014, + "num_input_tokens_seen": 86629632, + "step": 71175 + }, + { + "epoch": 7.9273861231763005, + "grad_norm": 0.052164942026138306, + "learning_rate": 3.777379612455525e-05, + "loss": 0.0988, + "num_input_tokens_seen": 86635520, + "step": 71180 + }, + { + "epoch": 7.927942978059917, + "grad_norm": 0.04383254051208496, + "learning_rate": 3.777170743893542e-05, + "loss": 0.0745, + "num_input_tokens_seen": 86641344, + "step": 71185 + }, + { + "epoch": 7.928499832943535, + "grad_norm": 0.15045633912086487, + "learning_rate": 3.7769618632676584e-05, + "loss": 0.0464, + "num_input_tokens_seen": 86647680, + "step": 71190 + }, + { + "epoch": 7.929056687827153, + "grad_norm": 0.32284411787986755, + "learning_rate": 3.7767529705798463e-05, + "loss": 0.0846, + "num_input_tokens_seen": 86653760, + "step": 71195 + }, + { + "epoch": 7.9296135427107695, + "grad_norm": 0.04522501304745674, + "learning_rate": 3.776544065832081e-05, + "loss": 0.07, + "num_input_tokens_seen": 86659840, + "step": 71200 + }, + { + "epoch": 7.930170397594387, + "grad_norm": 0.08647814393043518, + "learning_rate": 3.776335149026333e-05, + "loss": 0.0139, + "num_input_tokens_seen": 86666144, + "step": 71205 + }, + { + "epoch": 7.930727252478004, + "grad_norm": 0.10743732005357742, + "learning_rate": 3.776126220164578e-05, + "loss": 0.0101, + "num_input_tokens_seen": 86671936, + "step": 71210 + }, + { + "epoch": 7.931284107361622, + "grad_norm": 0.001710055861622095, + "learning_rate": 3.7759172792487874e-05, + "loss": 0.0286, + "num_input_tokens_seen": 86678400, + "step": 71215 + }, + { + "epoch": 7.931840962245239, + "grad_norm": 0.4603642225265503, + "learning_rate": 3.775708326280936e-05, + "loss": 0.1762, + "num_input_tokens_seen": 86684544, + "step": 71220 + }, + { + "epoch": 7.932397817128856, + "grad_norm": 1.873918890953064, + "learning_rate": 3.775499361262998e-05, + "loss": 0.0912, + "num_input_tokens_seen": 86690848, + "step": 71225 + }, + { + "epoch": 7.932954672012474, + "grad_norm": 1.2961888313293457, + "learning_rate": 3.7752903841969456e-05, + "loss": 0.0272, + "num_input_tokens_seen": 86697120, + "step": 71230 + }, + { + "epoch": 7.9335115268960905, + "grad_norm": 0.2262551188468933, + "learning_rate": 3.775081395084754e-05, + "loss": 0.0418, + "num_input_tokens_seen": 86703136, + "step": 71235 + }, + { + "epoch": 7.934068381779708, + "grad_norm": 0.5935333371162415, + "learning_rate": 3.774872393928398e-05, + "loss": 0.1261, + "num_input_tokens_seen": 86708864, + "step": 71240 + }, + { + "epoch": 7.934625236663326, + "grad_norm": 0.5518175959587097, + "learning_rate": 3.77466338072985e-05, + "loss": 0.1565, + "num_input_tokens_seen": 86714752, + "step": 71245 + }, + { + "epoch": 7.935182091546943, + "grad_norm": 1.230270266532898, + "learning_rate": 3.774454355491086e-05, + "loss": 0.0348, + "num_input_tokens_seen": 86720704, + "step": 71250 + }, + { + "epoch": 7.93573894643056, + "grad_norm": 0.34048688411712646, + "learning_rate": 3.7742453182140786e-05, + "loss": 0.013, + "num_input_tokens_seen": 86727008, + "step": 71255 + }, + { + "epoch": 7.936295801314177, + "grad_norm": 0.017690487205982208, + "learning_rate": 3.774036268900803e-05, + "loss": 0.0631, + "num_input_tokens_seen": 86733408, + "step": 71260 + }, + { + "epoch": 7.936852656197795, + "grad_norm": 0.0018718101782724261, + "learning_rate": 3.7738272075532355e-05, + "loss": 0.0439, + "num_input_tokens_seen": 86739552, + "step": 71265 + }, + { + "epoch": 7.9374095110814125, + "grad_norm": 0.35055699944496155, + "learning_rate": 3.773618134173348e-05, + "loss": 0.0893, + "num_input_tokens_seen": 86745536, + "step": 71270 + }, + { + "epoch": 7.937966365965029, + "grad_norm": 0.123257577419281, + "learning_rate": 3.773409048763118e-05, + "loss": 0.0858, + "num_input_tokens_seen": 86751776, + "step": 71275 + }, + { + "epoch": 7.938523220848647, + "grad_norm": 1.2397793531417847, + "learning_rate": 3.773199951324519e-05, + "loss": 0.0813, + "num_input_tokens_seen": 86757824, + "step": 71280 + }, + { + "epoch": 7.939080075732264, + "grad_norm": 0.26617249846458435, + "learning_rate": 3.772990841859526e-05, + "loss": 0.1598, + "num_input_tokens_seen": 86764064, + "step": 71285 + }, + { + "epoch": 7.939636930615881, + "grad_norm": 0.6785154342651367, + "learning_rate": 3.7727817203701146e-05, + "loss": 0.0135, + "num_input_tokens_seen": 86770208, + "step": 71290 + }, + { + "epoch": 7.940193785499499, + "grad_norm": 1.6443147659301758, + "learning_rate": 3.7725725868582596e-05, + "loss": 0.0303, + "num_input_tokens_seen": 86776320, + "step": 71295 + }, + { + "epoch": 7.940750640383116, + "grad_norm": 0.22498546540737152, + "learning_rate": 3.772363441325938e-05, + "loss": 0.0122, + "num_input_tokens_seen": 86782432, + "step": 71300 + }, + { + "epoch": 7.9413074952667335, + "grad_norm": 1.1001731157302856, + "learning_rate": 3.772154283775123e-05, + "loss": 0.0315, + "num_input_tokens_seen": 86788416, + "step": 71305 + }, + { + "epoch": 7.941864350150351, + "grad_norm": 0.06553663313388824, + "learning_rate": 3.7719451142077935e-05, + "loss": 0.0238, + "num_input_tokens_seen": 86794592, + "step": 71310 + }, + { + "epoch": 7.942421205033968, + "grad_norm": 0.5937007665634155, + "learning_rate": 3.7717359326259216e-05, + "loss": 0.0824, + "num_input_tokens_seen": 86800832, + "step": 71315 + }, + { + "epoch": 7.942978059917586, + "grad_norm": 0.06284313648939133, + "learning_rate": 3.771526739031486e-05, + "loss": 0.0138, + "num_input_tokens_seen": 86807168, + "step": 71320 + }, + { + "epoch": 7.943534914801202, + "grad_norm": 0.031069934368133545, + "learning_rate": 3.7713175334264614e-05, + "loss": 0.0613, + "num_input_tokens_seen": 86813152, + "step": 71325 + }, + { + "epoch": 7.94409176968482, + "grad_norm": 0.044535256922245026, + "learning_rate": 3.7711083158128236e-05, + "loss": 0.0102, + "num_input_tokens_seen": 86819008, + "step": 71330 + }, + { + "epoch": 7.944648624568438, + "grad_norm": 0.9141813516616821, + "learning_rate": 3.7708990861925494e-05, + "loss": 0.0487, + "num_input_tokens_seen": 86825440, + "step": 71335 + }, + { + "epoch": 7.945205479452055, + "grad_norm": 0.002452428685501218, + "learning_rate": 3.7706898445676154e-05, + "loss": 0.0629, + "num_input_tokens_seen": 86831456, + "step": 71340 + }, + { + "epoch": 7.945762334335672, + "grad_norm": 2.2390036582946777, + "learning_rate": 3.770480590939998e-05, + "loss": 0.1667, + "num_input_tokens_seen": 86837216, + "step": 71345 + }, + { + "epoch": 7.94631918921929, + "grad_norm": 0.9420896768569946, + "learning_rate": 3.770271325311673e-05, + "loss": 0.1411, + "num_input_tokens_seen": 86843200, + "step": 71350 + }, + { + "epoch": 7.946876044102907, + "grad_norm": 1.4283270835876465, + "learning_rate": 3.770062047684618e-05, + "loss": 0.1331, + "num_input_tokens_seen": 86849408, + "step": 71355 + }, + { + "epoch": 7.947432898986524, + "grad_norm": 0.08005326986312866, + "learning_rate": 3.76985275806081e-05, + "loss": 0.1215, + "num_input_tokens_seen": 86855584, + "step": 71360 + }, + { + "epoch": 7.947989753870141, + "grad_norm": 0.06534618139266968, + "learning_rate": 3.769643456442224e-05, + "loss": 0.102, + "num_input_tokens_seen": 86861312, + "step": 71365 + }, + { + "epoch": 7.948546608753759, + "grad_norm": 0.06599786132574081, + "learning_rate": 3.769434142830839e-05, + "loss": 0.0096, + "num_input_tokens_seen": 86867424, + "step": 71370 + }, + { + "epoch": 7.9491034636373765, + "grad_norm": 0.003318769158795476, + "learning_rate": 3.7692248172286314e-05, + "loss": 0.1139, + "num_input_tokens_seen": 86873344, + "step": 71375 + }, + { + "epoch": 7.949660318520993, + "grad_norm": 0.010506688617169857, + "learning_rate": 3.7690154796375784e-05, + "loss": 0.0282, + "num_input_tokens_seen": 86879520, + "step": 71380 + }, + { + "epoch": 7.950217173404611, + "grad_norm": 0.06298849731683731, + "learning_rate": 3.768806130059658e-05, + "loss": 0.0153, + "num_input_tokens_seen": 86885632, + "step": 71385 + }, + { + "epoch": 7.950774028288228, + "grad_norm": 0.014205233193933964, + "learning_rate": 3.768596768496847e-05, + "loss": 0.0394, + "num_input_tokens_seen": 86892096, + "step": 71390 + }, + { + "epoch": 7.951330883171845, + "grad_norm": 0.23517140746116638, + "learning_rate": 3.768387394951123e-05, + "loss": 0.0512, + "num_input_tokens_seen": 86898080, + "step": 71395 + }, + { + "epoch": 7.951887738055463, + "grad_norm": 0.10533920675516129, + "learning_rate": 3.7681780094244634e-05, + "loss": 0.0822, + "num_input_tokens_seen": 86904672, + "step": 71400 + }, + { + "epoch": 7.95244459293908, + "grad_norm": 0.013065419159829617, + "learning_rate": 3.7679686119188465e-05, + "loss": 0.0299, + "num_input_tokens_seen": 86910784, + "step": 71405 + }, + { + "epoch": 7.953001447822698, + "grad_norm": 0.00406995415687561, + "learning_rate": 3.767759202436251e-05, + "loss": 0.0087, + "num_input_tokens_seen": 86916960, + "step": 71410 + }, + { + "epoch": 7.953558302706314, + "grad_norm": 0.18820373713970184, + "learning_rate": 3.767549780978653e-05, + "loss": 0.0473, + "num_input_tokens_seen": 86923168, + "step": 71415 + }, + { + "epoch": 7.954115157589932, + "grad_norm": 0.008748175576329231, + "learning_rate": 3.767340347548033e-05, + "loss": 0.0685, + "num_input_tokens_seen": 86928960, + "step": 71420 + }, + { + "epoch": 7.95467201247355, + "grad_norm": 0.022902842611074448, + "learning_rate": 3.767130902146367e-05, + "loss": 0.0105, + "num_input_tokens_seen": 86935456, + "step": 71425 + }, + { + "epoch": 7.9552288673571665, + "grad_norm": 0.10442064702510834, + "learning_rate": 3.7669214447756354e-05, + "loss": 0.022, + "num_input_tokens_seen": 86941600, + "step": 71430 + }, + { + "epoch": 7.955785722240784, + "grad_norm": 1.4626429080963135, + "learning_rate": 3.7667119754378143e-05, + "loss": 0.1399, + "num_input_tokens_seen": 86947488, + "step": 71435 + }, + { + "epoch": 7.956342577124401, + "grad_norm": 0.38211384415626526, + "learning_rate": 3.766502494134885e-05, + "loss": 0.0486, + "num_input_tokens_seen": 86952576, + "step": 71440 + }, + { + "epoch": 7.956899432008019, + "grad_norm": 1.3010174036026, + "learning_rate": 3.7662930008688244e-05, + "loss": 0.0791, + "num_input_tokens_seen": 86958720, + "step": 71445 + }, + { + "epoch": 7.957456286891636, + "grad_norm": 0.4487755596637726, + "learning_rate": 3.766083495641612e-05, + "loss": 0.031, + "num_input_tokens_seen": 86965056, + "step": 71450 + }, + { + "epoch": 7.958013141775253, + "grad_norm": 0.11492010951042175, + "learning_rate": 3.7658739784552266e-05, + "loss": 0.0171, + "num_input_tokens_seen": 86970976, + "step": 71455 + }, + { + "epoch": 7.958569996658871, + "grad_norm": 0.03503000736236572, + "learning_rate": 3.7656644493116475e-05, + "loss": 0.0548, + "num_input_tokens_seen": 86976864, + "step": 71460 + }, + { + "epoch": 7.959126851542488, + "grad_norm": 0.05618582293391228, + "learning_rate": 3.765454908212853e-05, + "loss": 0.0541, + "num_input_tokens_seen": 86982912, + "step": 71465 + }, + { + "epoch": 7.959683706426105, + "grad_norm": 0.649387001991272, + "learning_rate": 3.7652453551608235e-05, + "loss": 0.0391, + "num_input_tokens_seen": 86988672, + "step": 71470 + }, + { + "epoch": 7.960240561309723, + "grad_norm": 1.0590916872024536, + "learning_rate": 3.765035790157538e-05, + "loss": 0.0884, + "num_input_tokens_seen": 86994112, + "step": 71475 + }, + { + "epoch": 7.96079741619334, + "grad_norm": 0.3143755793571472, + "learning_rate": 3.764826213204976e-05, + "loss": 0.0133, + "num_input_tokens_seen": 87000448, + "step": 71480 + }, + { + "epoch": 7.961354271076957, + "grad_norm": 0.5274842381477356, + "learning_rate": 3.7646166243051163e-05, + "loss": 0.2686, + "num_input_tokens_seen": 87006528, + "step": 71485 + }, + { + "epoch": 7.961911125960575, + "grad_norm": 0.0015553723787888885, + "learning_rate": 3.764407023459941e-05, + "loss": 0.0932, + "num_input_tokens_seen": 87012032, + "step": 71490 + }, + { + "epoch": 7.962467980844192, + "grad_norm": 0.8526320457458496, + "learning_rate": 3.7641974106714264e-05, + "loss": 0.1105, + "num_input_tokens_seen": 87017440, + "step": 71495 + }, + { + "epoch": 7.9630248357278095, + "grad_norm": 0.043576452881097794, + "learning_rate": 3.7639877859415555e-05, + "loss": 0.0564, + "num_input_tokens_seen": 87023424, + "step": 71500 + }, + { + "epoch": 7.963581690611426, + "grad_norm": 0.00035356031730771065, + "learning_rate": 3.7637781492723066e-05, + "loss": 0.0118, + "num_input_tokens_seen": 87029472, + "step": 71505 + }, + { + "epoch": 7.964138545495044, + "grad_norm": 0.0019602009560912848, + "learning_rate": 3.763568500665661e-05, + "loss": 0.0225, + "num_input_tokens_seen": 87035840, + "step": 71510 + }, + { + "epoch": 7.964695400378662, + "grad_norm": 0.024455495178699493, + "learning_rate": 3.763358840123599e-05, + "loss": 0.017, + "num_input_tokens_seen": 87042048, + "step": 71515 + }, + { + "epoch": 7.965252255262278, + "grad_norm": 0.03523244708776474, + "learning_rate": 3.7631491676481e-05, + "loss": 0.0763, + "num_input_tokens_seen": 87048224, + "step": 71520 + }, + { + "epoch": 7.965809110145896, + "grad_norm": 0.5056155920028687, + "learning_rate": 3.762939483241146e-05, + "loss": 0.1086, + "num_input_tokens_seen": 87054080, + "step": 71525 + }, + { + "epoch": 7.966365965029514, + "grad_norm": 2.277813673019409, + "learning_rate": 3.7627297869047154e-05, + "loss": 0.0442, + "num_input_tokens_seen": 87059968, + "step": 71530 + }, + { + "epoch": 7.966922819913131, + "grad_norm": 0.6070699095726013, + "learning_rate": 3.762520078640791e-05, + "loss": 0.0139, + "num_input_tokens_seen": 87066144, + "step": 71535 + }, + { + "epoch": 7.967479674796748, + "grad_norm": 0.04413892328739166, + "learning_rate": 3.762310358451352e-05, + "loss": 0.1118, + "num_input_tokens_seen": 87071936, + "step": 71540 + }, + { + "epoch": 7.968036529680365, + "grad_norm": 0.3023071885108948, + "learning_rate": 3.762100626338381e-05, + "loss": 0.0882, + "num_input_tokens_seen": 87077984, + "step": 71545 + }, + { + "epoch": 7.968593384563983, + "grad_norm": 0.06873166561126709, + "learning_rate": 3.761890882303859e-05, + "loss": 0.016, + "num_input_tokens_seen": 87084288, + "step": 71550 + }, + { + "epoch": 7.9691502394476, + "grad_norm": 0.006937665864825249, + "learning_rate": 3.761681126349766e-05, + "loss": 0.033, + "num_input_tokens_seen": 87090464, + "step": 71555 + }, + { + "epoch": 7.969707094331217, + "grad_norm": 0.1941959112882614, + "learning_rate": 3.761471358478084e-05, + "loss": 0.0148, + "num_input_tokens_seen": 87095936, + "step": 71560 + }, + { + "epoch": 7.970263949214835, + "grad_norm": 1.3934322595596313, + "learning_rate": 3.761261578690795e-05, + "loss": 0.0492, + "num_input_tokens_seen": 87102016, + "step": 71565 + }, + { + "epoch": 7.970820804098452, + "grad_norm": 0.8411424160003662, + "learning_rate": 3.7610517869898786e-05, + "loss": 0.0695, + "num_input_tokens_seen": 87107872, + "step": 71570 + }, + { + "epoch": 7.971377658982069, + "grad_norm": 0.8212249279022217, + "learning_rate": 3.7608419833773184e-05, + "loss": 0.0457, + "num_input_tokens_seen": 87113920, + "step": 71575 + }, + { + "epoch": 7.971934513865687, + "grad_norm": 0.1366763561964035, + "learning_rate": 3.760632167855095e-05, + "loss": 0.0042, + "num_input_tokens_seen": 87120000, + "step": 71580 + }, + { + "epoch": 7.972491368749304, + "grad_norm": 0.17320284247398376, + "learning_rate": 3.760422340425191e-05, + "loss": 0.0137, + "num_input_tokens_seen": 87126080, + "step": 71585 + }, + { + "epoch": 7.973048223632921, + "grad_norm": 1.141929030418396, + "learning_rate": 3.760212501089589e-05, + "loss": 0.0817, + "num_input_tokens_seen": 87132288, + "step": 71590 + }, + { + "epoch": 7.973605078516538, + "grad_norm": 0.0002534572558943182, + "learning_rate": 3.76000264985027e-05, + "loss": 0.0289, + "num_input_tokens_seen": 87138336, + "step": 71595 + }, + { + "epoch": 7.974161933400156, + "grad_norm": 0.0007181772380135953, + "learning_rate": 3.759792786709216e-05, + "loss": 0.0366, + "num_input_tokens_seen": 87144224, + "step": 71600 + }, + { + "epoch": 7.974718788283774, + "grad_norm": 0.06575602293014526, + "learning_rate": 3.75958291166841e-05, + "loss": 0.0307, + "num_input_tokens_seen": 87150240, + "step": 71605 + }, + { + "epoch": 7.97527564316739, + "grad_norm": 2.614238739013672, + "learning_rate": 3.7593730247298344e-05, + "loss": 0.047, + "num_input_tokens_seen": 87156192, + "step": 71610 + }, + { + "epoch": 7.975832498051008, + "grad_norm": 0.25415539741516113, + "learning_rate": 3.759163125895471e-05, + "loss": 0.0253, + "num_input_tokens_seen": 87161728, + "step": 71615 + }, + { + "epoch": 7.976389352934625, + "grad_norm": 0.8096100687980652, + "learning_rate": 3.758953215167304e-05, + "loss": 0.0865, + "num_input_tokens_seen": 87167776, + "step": 71620 + }, + { + "epoch": 7.9769462078182425, + "grad_norm": 0.43904489278793335, + "learning_rate": 3.7587432925473144e-05, + "loss": 0.0652, + "num_input_tokens_seen": 87173952, + "step": 71625 + }, + { + "epoch": 7.97750306270186, + "grad_norm": 2.5128042697906494, + "learning_rate": 3.758533358037486e-05, + "loss": 0.2405, + "num_input_tokens_seen": 87180256, + "step": 71630 + }, + { + "epoch": 7.978059917585477, + "grad_norm": 0.45846059918403625, + "learning_rate": 3.758323411639802e-05, + "loss": 0.0196, + "num_input_tokens_seen": 87185920, + "step": 71635 + }, + { + "epoch": 7.978616772469095, + "grad_norm": 1.4781206846237183, + "learning_rate": 3.758113453356244e-05, + "loss": 0.1698, + "num_input_tokens_seen": 87191840, + "step": 71640 + }, + { + "epoch": 7.979173627352711, + "grad_norm": 0.00471865851432085, + "learning_rate": 3.7579034831887985e-05, + "loss": 0.0813, + "num_input_tokens_seen": 87197952, + "step": 71645 + }, + { + "epoch": 7.979730482236329, + "grad_norm": 0.013058051466941833, + "learning_rate": 3.7576935011394455e-05, + "loss": 0.0749, + "num_input_tokens_seen": 87204160, + "step": 71650 + }, + { + "epoch": 7.980287337119947, + "grad_norm": 0.0004349127702880651, + "learning_rate": 3.75748350721017e-05, + "loss": 0.0361, + "num_input_tokens_seen": 87210560, + "step": 71655 + }, + { + "epoch": 7.9808441920035635, + "grad_norm": 0.006148257292807102, + "learning_rate": 3.757273501402956e-05, + "loss": 0.0402, + "num_input_tokens_seen": 87216320, + "step": 71660 + }, + { + "epoch": 7.981401046887181, + "grad_norm": 0.12875303626060486, + "learning_rate": 3.757063483719785e-05, + "loss": 0.1116, + "num_input_tokens_seen": 87222272, + "step": 71665 + }, + { + "epoch": 7.981957901770799, + "grad_norm": 0.3594565987586975, + "learning_rate": 3.7568534541626434e-05, + "loss": 0.0142, + "num_input_tokens_seen": 87228384, + "step": 71670 + }, + { + "epoch": 7.982514756654416, + "grad_norm": 1.1262614727020264, + "learning_rate": 3.756643412733514e-05, + "loss": 0.1423, + "num_input_tokens_seen": 87234592, + "step": 71675 + }, + { + "epoch": 7.983071611538033, + "grad_norm": 0.13675722479820251, + "learning_rate": 3.75643335943438e-05, + "loss": 0.0892, + "num_input_tokens_seen": 87240736, + "step": 71680 + }, + { + "epoch": 7.98362846642165, + "grad_norm": 1.052085280418396, + "learning_rate": 3.756223294267226e-05, + "loss": 0.1135, + "num_input_tokens_seen": 87246624, + "step": 71685 + }, + { + "epoch": 7.984185321305268, + "grad_norm": 1.5371357202529907, + "learning_rate": 3.756013217234038e-05, + "loss": 0.0394, + "num_input_tokens_seen": 87252736, + "step": 71690 + }, + { + "epoch": 7.9847421761888855, + "grad_norm": 0.020282099023461342, + "learning_rate": 3.755803128336798e-05, + "loss": 0.0442, + "num_input_tokens_seen": 87259008, + "step": 71695 + }, + { + "epoch": 7.985299031072502, + "grad_norm": 0.6321764588356018, + "learning_rate": 3.7555930275774906e-05, + "loss": 0.046, + "num_input_tokens_seen": 87264896, + "step": 71700 + }, + { + "epoch": 7.98585588595612, + "grad_norm": 0.671620786190033, + "learning_rate": 3.755382914958103e-05, + "loss": 0.181, + "num_input_tokens_seen": 87270944, + "step": 71705 + }, + { + "epoch": 7.986412740839738, + "grad_norm": 0.9064450263977051, + "learning_rate": 3.7551727904806167e-05, + "loss": 0.0603, + "num_input_tokens_seen": 87277184, + "step": 71710 + }, + { + "epoch": 7.986969595723354, + "grad_norm": 0.0024658930487930775, + "learning_rate": 3.754962654147018e-05, + "loss": 0.1354, + "num_input_tokens_seen": 87283392, + "step": 71715 + }, + { + "epoch": 7.987526450606972, + "grad_norm": 2.402099847793579, + "learning_rate": 3.7547525059592916e-05, + "loss": 0.1548, + "num_input_tokens_seen": 87289600, + "step": 71720 + }, + { + "epoch": 7.988083305490589, + "grad_norm": 0.04692229628562927, + "learning_rate": 3.754542345919422e-05, + "loss": 0.055, + "num_input_tokens_seen": 87295936, + "step": 71725 + }, + { + "epoch": 7.9886401603742065, + "grad_norm": 2.0630881786346436, + "learning_rate": 3.754332174029395e-05, + "loss": 0.1615, + "num_input_tokens_seen": 87301664, + "step": 71730 + }, + { + "epoch": 7.989197015257824, + "grad_norm": 1.2275184392929077, + "learning_rate": 3.754121990291196e-05, + "loss": 0.1689, + "num_input_tokens_seen": 87307424, + "step": 71735 + }, + { + "epoch": 7.989753870141441, + "grad_norm": 0.36762189865112305, + "learning_rate": 3.7539117947068095e-05, + "loss": 0.0565, + "num_input_tokens_seen": 87313280, + "step": 71740 + }, + { + "epoch": 7.990310725025059, + "grad_norm": 0.4017259478569031, + "learning_rate": 3.7537015872782225e-05, + "loss": 0.0184, + "num_input_tokens_seen": 87319552, + "step": 71745 + }, + { + "epoch": 7.9908675799086755, + "grad_norm": 0.0035650329664349556, + "learning_rate": 3.753491368007419e-05, + "loss": 0.0251, + "num_input_tokens_seen": 87325472, + "step": 71750 + }, + { + "epoch": 7.991424434792293, + "grad_norm": 0.4430190622806549, + "learning_rate": 3.753281136896385e-05, + "loss": 0.1249, + "num_input_tokens_seen": 87331424, + "step": 71755 + }, + { + "epoch": 7.991981289675911, + "grad_norm": 0.017163675278425217, + "learning_rate": 3.753070893947107e-05, + "loss": 0.0134, + "num_input_tokens_seen": 87337472, + "step": 71760 + }, + { + "epoch": 7.992538144559528, + "grad_norm": 0.0009754981729201972, + "learning_rate": 3.7528606391615697e-05, + "loss": 0.0678, + "num_input_tokens_seen": 87343680, + "step": 71765 + }, + { + "epoch": 7.993094999443145, + "grad_norm": 2.8047215938568115, + "learning_rate": 3.75265037254176e-05, + "loss": 0.0818, + "num_input_tokens_seen": 87349632, + "step": 71770 + }, + { + "epoch": 7.993651854326762, + "grad_norm": 0.08801990002393723, + "learning_rate": 3.752440094089664e-05, + "loss": 0.0794, + "num_input_tokens_seen": 87355872, + "step": 71775 + }, + { + "epoch": 7.99420870921038, + "grad_norm": 0.20680905878543854, + "learning_rate": 3.752229803807269e-05, + "loss": 0.0337, + "num_input_tokens_seen": 87361856, + "step": 71780 + }, + { + "epoch": 7.994765564093997, + "grad_norm": 1.0008705854415894, + "learning_rate": 3.7520195016965596e-05, + "loss": 0.0344, + "num_input_tokens_seen": 87368064, + "step": 71785 + }, + { + "epoch": 7.995322418977614, + "grad_norm": 0.015697531402111053, + "learning_rate": 3.7518091877595215e-05, + "loss": 0.0055, + "num_input_tokens_seen": 87373952, + "step": 71790 + }, + { + "epoch": 7.995879273861232, + "grad_norm": 0.011964878998696804, + "learning_rate": 3.751598861998145e-05, + "loss": 0.0152, + "num_input_tokens_seen": 87380096, + "step": 71795 + }, + { + "epoch": 7.996436128744849, + "grad_norm": 0.1839538812637329, + "learning_rate": 3.7513885244144134e-05, + "loss": 0.0201, + "num_input_tokens_seen": 87386208, + "step": 71800 + }, + { + "epoch": 7.996992983628466, + "grad_norm": 0.058891624212265015, + "learning_rate": 3.7511781750103135e-05, + "loss": 0.0156, + "num_input_tokens_seen": 87392192, + "step": 71805 + }, + { + "epoch": 7.997549838512084, + "grad_norm": 0.2704790532588959, + "learning_rate": 3.7509678137878354e-05, + "loss": 0.1065, + "num_input_tokens_seen": 87398048, + "step": 71810 + }, + { + "epoch": 7.998106693395701, + "grad_norm": 1.0712885856628418, + "learning_rate": 3.750757440748962e-05, + "loss": 0.136, + "num_input_tokens_seen": 87403584, + "step": 71815 + }, + { + "epoch": 7.9986635482793185, + "grad_norm": 0.6706500053405762, + "learning_rate": 3.7505470558956845e-05, + "loss": 0.0847, + "num_input_tokens_seen": 87409952, + "step": 71820 + }, + { + "epoch": 7.999220403162935, + "grad_norm": 0.500818133354187, + "learning_rate": 3.750336659229987e-05, + "loss": 0.0705, + "num_input_tokens_seen": 87416032, + "step": 71825 + }, + { + "epoch": 7.999777258046553, + "grad_norm": 0.2096920907497406, + "learning_rate": 3.750126250753857e-05, + "loss": 0.085, + "num_input_tokens_seen": 87422112, + "step": 71830 + }, + { + "epoch": 8.0, + "eval_loss": 0.07789245992898941, + "eval_runtime": 112.6454, + "eval_samples_per_second": 35.43, + "eval_steps_per_second": 8.86, + "num_input_tokens_seen": 87424000, + "step": 71832 + }, + { + "epoch": 8.00033411293017, + "grad_norm": 1.3260209560394287, + "learning_rate": 3.749915830469285e-05, + "loss": 0.0649, + "num_input_tokens_seen": 87427712, + "step": 71835 + }, + { + "epoch": 8.000890967813788, + "grad_norm": 0.07346222549676895, + "learning_rate": 3.7497053983782556e-05, + "loss": 0.0432, + "num_input_tokens_seen": 87433536, + "step": 71840 + }, + { + "epoch": 8.001447822697404, + "grad_norm": 1.8676621913909912, + "learning_rate": 3.749494954482758e-05, + "loss": 0.0333, + "num_input_tokens_seen": 87439648, + "step": 71845 + }, + { + "epoch": 8.002004677581022, + "grad_norm": 0.37358132004737854, + "learning_rate": 3.7492844987847785e-05, + "loss": 0.1133, + "num_input_tokens_seen": 87445984, + "step": 71850 + }, + { + "epoch": 8.00256153246464, + "grad_norm": 0.006845843978226185, + "learning_rate": 3.7490740312863064e-05, + "loss": 0.1296, + "num_input_tokens_seen": 87452192, + "step": 71855 + }, + { + "epoch": 8.003118387348257, + "grad_norm": 0.4032733738422394, + "learning_rate": 3.74886355198933e-05, + "loss": 0.1051, + "num_input_tokens_seen": 87458560, + "step": 71860 + }, + { + "epoch": 8.003675242231875, + "grad_norm": 0.12602902948856354, + "learning_rate": 3.748653060895836e-05, + "loss": 0.0314, + "num_input_tokens_seen": 87464896, + "step": 71865 + }, + { + "epoch": 8.004232097115493, + "grad_norm": 0.7171456813812256, + "learning_rate": 3.748442558007814e-05, + "loss": 0.1073, + "num_input_tokens_seen": 87470880, + "step": 71870 + }, + { + "epoch": 8.004788951999108, + "grad_norm": 0.9957378506660461, + "learning_rate": 3.748232043327251e-05, + "loss": 0.0375, + "num_input_tokens_seen": 87476960, + "step": 71875 + }, + { + "epoch": 8.005345806882726, + "grad_norm": 0.24531520903110504, + "learning_rate": 3.748021516856137e-05, + "loss": 0.0442, + "num_input_tokens_seen": 87482944, + "step": 71880 + }, + { + "epoch": 8.005902661766344, + "grad_norm": 0.05238809064030647, + "learning_rate": 3.74781097859646e-05, + "loss": 0.0072, + "num_input_tokens_seen": 87489376, + "step": 71885 + }, + { + "epoch": 8.006459516649961, + "grad_norm": 0.07962749153375626, + "learning_rate": 3.747600428550207e-05, + "loss": 0.0465, + "num_input_tokens_seen": 87495488, + "step": 71890 + }, + { + "epoch": 8.00701637153358, + "grad_norm": 0.03328300267457962, + "learning_rate": 3.7473898667193705e-05, + "loss": 0.0284, + "num_input_tokens_seen": 87501696, + "step": 71895 + }, + { + "epoch": 8.007573226417195, + "grad_norm": 1.8503304719924927, + "learning_rate": 3.747179293105936e-05, + "loss": 0.1084, + "num_input_tokens_seen": 87507744, + "step": 71900 + }, + { + "epoch": 8.008130081300813, + "grad_norm": 2.4442150592803955, + "learning_rate": 3.746968707711895e-05, + "loss": 0.0887, + "num_input_tokens_seen": 87513760, + "step": 71905 + }, + { + "epoch": 8.00868693618443, + "grad_norm": 1.3318637609481812, + "learning_rate": 3.746758110539234e-05, + "loss": 0.0804, + "num_input_tokens_seen": 87519904, + "step": 71910 + }, + { + "epoch": 8.009243791068048, + "grad_norm": 1.5032812356948853, + "learning_rate": 3.7465475015899446e-05, + "loss": 0.0741, + "num_input_tokens_seen": 87526112, + "step": 71915 + }, + { + "epoch": 8.009800645951666, + "grad_norm": 0.4800921380519867, + "learning_rate": 3.7463368808660156e-05, + "loss": 0.0392, + "num_input_tokens_seen": 87532288, + "step": 71920 + }, + { + "epoch": 8.010357500835282, + "grad_norm": 0.5489965677261353, + "learning_rate": 3.746126248369435e-05, + "loss": 0.1067, + "num_input_tokens_seen": 87538624, + "step": 71925 + }, + { + "epoch": 8.0109143557189, + "grad_norm": 0.0747959315776825, + "learning_rate": 3.7459156041021956e-05, + "loss": 0.022, + "num_input_tokens_seen": 87544896, + "step": 71930 + }, + { + "epoch": 8.011471210602517, + "grad_norm": 1.9779764413833618, + "learning_rate": 3.745704948066283e-05, + "loss": 0.0715, + "num_input_tokens_seen": 87550848, + "step": 71935 + }, + { + "epoch": 8.012028065486135, + "grad_norm": 0.08271250128746033, + "learning_rate": 3.74549428026369e-05, + "loss": 0.0674, + "num_input_tokens_seen": 87556768, + "step": 71940 + }, + { + "epoch": 8.012584920369752, + "grad_norm": 0.0005430968012660742, + "learning_rate": 3.745283600696407e-05, + "loss": 0.0879, + "num_input_tokens_seen": 87563072, + "step": 71945 + }, + { + "epoch": 8.013141775253368, + "grad_norm": 0.008884049020707607, + "learning_rate": 3.745072909366421e-05, + "loss": 0.0062, + "num_input_tokens_seen": 87569088, + "step": 71950 + }, + { + "epoch": 8.013698630136986, + "grad_norm": 0.0007600205135531723, + "learning_rate": 3.7448622062757246e-05, + "loss": 0.0113, + "num_input_tokens_seen": 87575296, + "step": 71955 + }, + { + "epoch": 8.014255485020604, + "grad_norm": 0.18654458224773407, + "learning_rate": 3.744651491426306e-05, + "loss": 0.0058, + "num_input_tokens_seen": 87581568, + "step": 71960 + }, + { + "epoch": 8.014812339904221, + "grad_norm": 1.1175881624221802, + "learning_rate": 3.744440764820159e-05, + "loss": 0.0711, + "num_input_tokens_seen": 87587808, + "step": 71965 + }, + { + "epoch": 8.015369194787839, + "grad_norm": 0.2638913094997406, + "learning_rate": 3.74423002645927e-05, + "loss": 0.0428, + "num_input_tokens_seen": 87593760, + "step": 71970 + }, + { + "epoch": 8.015926049671455, + "grad_norm": 0.41989782452583313, + "learning_rate": 3.744019276345632e-05, + "loss": 0.0114, + "num_input_tokens_seen": 87600000, + "step": 71975 + }, + { + "epoch": 8.016482904555073, + "grad_norm": 0.12514592707157135, + "learning_rate": 3.743808514481236e-05, + "loss": 0.0508, + "num_input_tokens_seen": 87606048, + "step": 71980 + }, + { + "epoch": 8.01703975943869, + "grad_norm": 0.009390766732394695, + "learning_rate": 3.7435977408680714e-05, + "loss": 0.0607, + "num_input_tokens_seen": 87612192, + "step": 71985 + }, + { + "epoch": 8.017596614322308, + "grad_norm": 0.41261720657348633, + "learning_rate": 3.74338695550813e-05, + "loss": 0.0629, + "num_input_tokens_seen": 87618336, + "step": 71990 + }, + { + "epoch": 8.018153469205926, + "grad_norm": 0.15865835547447205, + "learning_rate": 3.7431761584034025e-05, + "loss": 0.009, + "num_input_tokens_seen": 87624576, + "step": 71995 + }, + { + "epoch": 8.018710324089541, + "grad_norm": 0.9767871499061584, + "learning_rate": 3.742965349555881e-05, + "loss": 0.0619, + "num_input_tokens_seen": 87630784, + "step": 72000 + }, + { + "epoch": 8.019267178973159, + "grad_norm": 0.3733217120170593, + "learning_rate": 3.742754528967555e-05, + "loss": 0.0457, + "num_input_tokens_seen": 87636672, + "step": 72005 + }, + { + "epoch": 8.019824033856777, + "grad_norm": 0.05996394529938698, + "learning_rate": 3.742543696640416e-05, + "loss": 0.1681, + "num_input_tokens_seen": 87642528, + "step": 72010 + }, + { + "epoch": 8.020380888740394, + "grad_norm": 0.02906213514506817, + "learning_rate": 3.742332852576458e-05, + "loss": 0.0793, + "num_input_tokens_seen": 87648896, + "step": 72015 + }, + { + "epoch": 8.020937743624012, + "grad_norm": 0.00022508669644594193, + "learning_rate": 3.74212199677767e-05, + "loss": 0.0334, + "num_input_tokens_seen": 87655072, + "step": 72020 + }, + { + "epoch": 8.021494598507628, + "grad_norm": 0.04123742878437042, + "learning_rate": 3.741911129246045e-05, + "loss": 0.1716, + "num_input_tokens_seen": 87661152, + "step": 72025 + }, + { + "epoch": 8.022051453391246, + "grad_norm": 0.07113798707723618, + "learning_rate": 3.741700249983574e-05, + "loss": 0.0905, + "num_input_tokens_seen": 87667104, + "step": 72030 + }, + { + "epoch": 8.022608308274863, + "grad_norm": 0.14108212292194366, + "learning_rate": 3.7414893589922494e-05, + "loss": 0.0265, + "num_input_tokens_seen": 87673376, + "step": 72035 + }, + { + "epoch": 8.023165163158481, + "grad_norm": 0.19353510439395905, + "learning_rate": 3.7412784562740635e-05, + "loss": 0.0122, + "num_input_tokens_seen": 87679552, + "step": 72040 + }, + { + "epoch": 8.023722018042099, + "grad_norm": 0.098905049264431, + "learning_rate": 3.741067541831007e-05, + "loss": 0.0977, + "num_input_tokens_seen": 87685760, + "step": 72045 + }, + { + "epoch": 8.024278872925716, + "grad_norm": 0.013506662100553513, + "learning_rate": 3.740856615665074e-05, + "loss": 0.0191, + "num_input_tokens_seen": 87691840, + "step": 72050 + }, + { + "epoch": 8.024835727809332, + "grad_norm": 0.0009695421322248876, + "learning_rate": 3.7406456777782564e-05, + "loss": 0.0281, + "num_input_tokens_seen": 87698112, + "step": 72055 + }, + { + "epoch": 8.02539258269295, + "grad_norm": 0.5820003747940063, + "learning_rate": 3.740434728172546e-05, + "loss": 0.012, + "num_input_tokens_seen": 87704288, + "step": 72060 + }, + { + "epoch": 8.025949437576568, + "grad_norm": 0.03934919834136963, + "learning_rate": 3.7402237668499355e-05, + "loss": 0.0495, + "num_input_tokens_seen": 87710112, + "step": 72065 + }, + { + "epoch": 8.026506292460185, + "grad_norm": 0.37520429491996765, + "learning_rate": 3.740012793812419e-05, + "loss": 0.0352, + "num_input_tokens_seen": 87715936, + "step": 72070 + }, + { + "epoch": 8.027063147343803, + "grad_norm": 0.18437248468399048, + "learning_rate": 3.7398018090619867e-05, + "loss": 0.0613, + "num_input_tokens_seen": 87722176, + "step": 72075 + }, + { + "epoch": 8.027620002227419, + "grad_norm": 0.1946493536233902, + "learning_rate": 3.739590812600634e-05, + "loss": 0.0316, + "num_input_tokens_seen": 87728608, + "step": 72080 + }, + { + "epoch": 8.028176857111037, + "grad_norm": 0.18310880661010742, + "learning_rate": 3.739379804430353e-05, + "loss": 0.0426, + "num_input_tokens_seen": 87734144, + "step": 72085 + }, + { + "epoch": 8.028733711994654, + "grad_norm": 0.0005954401567578316, + "learning_rate": 3.739168784553136e-05, + "loss": 0.0179, + "num_input_tokens_seen": 87740192, + "step": 72090 + }, + { + "epoch": 8.029290566878272, + "grad_norm": 2.3542165756225586, + "learning_rate": 3.7389577529709776e-05, + "loss": 0.2016, + "num_input_tokens_seen": 87745856, + "step": 72095 + }, + { + "epoch": 8.02984742176189, + "grad_norm": 0.01220569759607315, + "learning_rate": 3.73874670968587e-05, + "loss": 0.0174, + "num_input_tokens_seen": 87751424, + "step": 72100 + }, + { + "epoch": 8.030404276645505, + "grad_norm": 0.8764874339103699, + "learning_rate": 3.738535654699807e-05, + "loss": 0.0897, + "num_input_tokens_seen": 87757472, + "step": 72105 + }, + { + "epoch": 8.030961131529123, + "grad_norm": 1.26396644115448, + "learning_rate": 3.7383245880147844e-05, + "loss": 0.1057, + "num_input_tokens_seen": 87763008, + "step": 72110 + }, + { + "epoch": 8.03151798641274, + "grad_norm": 0.2803719639778137, + "learning_rate": 3.7381135096327923e-05, + "loss": 0.1366, + "num_input_tokens_seen": 87769120, + "step": 72115 + }, + { + "epoch": 8.032074841296359, + "grad_norm": 0.11930684745311737, + "learning_rate": 3.7379024195558266e-05, + "loss": 0.0603, + "num_input_tokens_seen": 87775616, + "step": 72120 + }, + { + "epoch": 8.032631696179976, + "grad_norm": 0.9129625558853149, + "learning_rate": 3.73769131778588e-05, + "loss": 0.0743, + "num_input_tokens_seen": 87781920, + "step": 72125 + }, + { + "epoch": 8.033188551063592, + "grad_norm": 0.3093090057373047, + "learning_rate": 3.737480204324949e-05, + "loss": 0.1914, + "num_input_tokens_seen": 87788096, + "step": 72130 + }, + { + "epoch": 8.03374540594721, + "grad_norm": 0.005012740381062031, + "learning_rate": 3.737269079175024e-05, + "loss": 0.0234, + "num_input_tokens_seen": 87794112, + "step": 72135 + }, + { + "epoch": 8.034302260830827, + "grad_norm": 0.13544882833957672, + "learning_rate": 3.737057942338102e-05, + "loss": 0.0818, + "num_input_tokens_seen": 87800288, + "step": 72140 + }, + { + "epoch": 8.034859115714445, + "grad_norm": 1.9179682731628418, + "learning_rate": 3.7368467938161776e-05, + "loss": 0.082, + "num_input_tokens_seen": 87806272, + "step": 72145 + }, + { + "epoch": 8.035415970598063, + "grad_norm": 0.6044291257858276, + "learning_rate": 3.7366356336112426e-05, + "loss": 0.0955, + "num_input_tokens_seen": 87812480, + "step": 72150 + }, + { + "epoch": 8.035972825481679, + "grad_norm": 0.49798959493637085, + "learning_rate": 3.736424461725295e-05, + "loss": 0.0633, + "num_input_tokens_seen": 87818656, + "step": 72155 + }, + { + "epoch": 8.036529680365296, + "grad_norm": 0.028945541009306908, + "learning_rate": 3.7362132781603264e-05, + "loss": 0.0629, + "num_input_tokens_seen": 87824960, + "step": 72160 + }, + { + "epoch": 8.037086535248914, + "grad_norm": 0.18744508922100067, + "learning_rate": 3.736002082918334e-05, + "loss": 0.0311, + "num_input_tokens_seen": 87830752, + "step": 72165 + }, + { + "epoch": 8.037643390132532, + "grad_norm": 0.47386467456817627, + "learning_rate": 3.735790876001311e-05, + "loss": 0.0428, + "num_input_tokens_seen": 87836928, + "step": 72170 + }, + { + "epoch": 8.03820024501615, + "grad_norm": 0.3619350492954254, + "learning_rate": 3.7355796574112536e-05, + "loss": 0.0622, + "num_input_tokens_seen": 87843200, + "step": 72175 + }, + { + "epoch": 8.038757099899765, + "grad_norm": 0.0011915764771401882, + "learning_rate": 3.7353684271501555e-05, + "loss": 0.1642, + "num_input_tokens_seen": 87849408, + "step": 72180 + }, + { + "epoch": 8.039313954783383, + "grad_norm": 0.02707699127495289, + "learning_rate": 3.735157185220014e-05, + "loss": 0.0693, + "num_input_tokens_seen": 87855136, + "step": 72185 + }, + { + "epoch": 8.039870809667, + "grad_norm": 0.029649382457137108, + "learning_rate": 3.7349459316228224e-05, + "loss": 0.0059, + "num_input_tokens_seen": 87861312, + "step": 72190 + }, + { + "epoch": 8.040427664550618, + "grad_norm": 0.31436997652053833, + "learning_rate": 3.734734666360577e-05, + "loss": 0.0192, + "num_input_tokens_seen": 87866368, + "step": 72195 + }, + { + "epoch": 8.040984519434236, + "grad_norm": 0.0059327417984604836, + "learning_rate": 3.734523389435274e-05, + "loss": 0.12, + "num_input_tokens_seen": 87872480, + "step": 72200 + }, + { + "epoch": 8.041541374317854, + "grad_norm": 0.8912459015846252, + "learning_rate": 3.734312100848908e-05, + "loss": 0.0514, + "num_input_tokens_seen": 87879008, + "step": 72205 + }, + { + "epoch": 8.04209822920147, + "grad_norm": 0.7874661684036255, + "learning_rate": 3.734100800603475e-05, + "loss": 0.0523, + "num_input_tokens_seen": 87884928, + "step": 72210 + }, + { + "epoch": 8.042655084085087, + "grad_norm": 0.10740549862384796, + "learning_rate": 3.7338894887009724e-05, + "loss": 0.0163, + "num_input_tokens_seen": 87891200, + "step": 72215 + }, + { + "epoch": 8.043211938968705, + "grad_norm": 0.21774615347385406, + "learning_rate": 3.733678165143394e-05, + "loss": 0.0259, + "num_input_tokens_seen": 87897504, + "step": 72220 + }, + { + "epoch": 8.043768793852323, + "grad_norm": 0.18136544525623322, + "learning_rate": 3.733466829932738e-05, + "loss": 0.058, + "num_input_tokens_seen": 87903360, + "step": 72225 + }, + { + "epoch": 8.04432564873594, + "grad_norm": 0.05572185292840004, + "learning_rate": 3.733255483070998e-05, + "loss": 0.0631, + "num_input_tokens_seen": 87909568, + "step": 72230 + }, + { + "epoch": 8.044882503619556, + "grad_norm": 0.3358733057975769, + "learning_rate": 3.7330441245601726e-05, + "loss": 0.0135, + "num_input_tokens_seen": 87915712, + "step": 72235 + }, + { + "epoch": 8.045439358503174, + "grad_norm": 0.8434996604919434, + "learning_rate": 3.732832754402258e-05, + "loss": 0.0258, + "num_input_tokens_seen": 87921792, + "step": 72240 + }, + { + "epoch": 8.045996213386791, + "grad_norm": 0.1467963606119156, + "learning_rate": 3.73262137259925e-05, + "loss": 0.0079, + "num_input_tokens_seen": 87927872, + "step": 72245 + }, + { + "epoch": 8.04655306827041, + "grad_norm": 0.055548958480358124, + "learning_rate": 3.732409979153146e-05, + "loss": 0.0084, + "num_input_tokens_seen": 87934240, + "step": 72250 + }, + { + "epoch": 8.047109923154027, + "grad_norm": 0.5123178958892822, + "learning_rate": 3.732198574065942e-05, + "loss": 0.0466, + "num_input_tokens_seen": 87940608, + "step": 72255 + }, + { + "epoch": 8.047666778037643, + "grad_norm": 1.3948416709899902, + "learning_rate": 3.731987157339635e-05, + "loss": 0.069, + "num_input_tokens_seen": 87946848, + "step": 72260 + }, + { + "epoch": 8.04822363292126, + "grad_norm": 0.7284486293792725, + "learning_rate": 3.7317757289762225e-05, + "loss": 0.01, + "num_input_tokens_seen": 87953024, + "step": 72265 + }, + { + "epoch": 8.048780487804878, + "grad_norm": 0.00041900924406945705, + "learning_rate": 3.731564288977701e-05, + "loss": 0.0612, + "num_input_tokens_seen": 87958912, + "step": 72270 + }, + { + "epoch": 8.049337342688496, + "grad_norm": 0.19656789302825928, + "learning_rate": 3.7313528373460687e-05, + "loss": 0.0475, + "num_input_tokens_seen": 87964992, + "step": 72275 + }, + { + "epoch": 8.049894197572113, + "grad_norm": 0.30638253688812256, + "learning_rate": 3.7311413740833215e-05, + "loss": 0.0714, + "num_input_tokens_seen": 87971328, + "step": 72280 + }, + { + "epoch": 8.05045105245573, + "grad_norm": 1.1044585704803467, + "learning_rate": 3.730929899191458e-05, + "loss": 0.1859, + "num_input_tokens_seen": 87976992, + "step": 72285 + }, + { + "epoch": 8.051007907339347, + "grad_norm": 0.13144755363464355, + "learning_rate": 3.730718412672476e-05, + "loss": 0.1463, + "num_input_tokens_seen": 87983328, + "step": 72290 + }, + { + "epoch": 8.051564762222965, + "grad_norm": 0.06622761487960815, + "learning_rate": 3.730506914528372e-05, + "loss": 0.0411, + "num_input_tokens_seen": 87989632, + "step": 72295 + }, + { + "epoch": 8.052121617106582, + "grad_norm": 0.42151424288749695, + "learning_rate": 3.7302954047611443e-05, + "loss": 0.0957, + "num_input_tokens_seen": 87995680, + "step": 72300 + }, + { + "epoch": 8.0526784719902, + "grad_norm": 0.991839587688446, + "learning_rate": 3.73008388337279e-05, + "loss": 0.0568, + "num_input_tokens_seen": 88001888, + "step": 72305 + }, + { + "epoch": 8.053235326873816, + "grad_norm": 0.008593439124524593, + "learning_rate": 3.72987235036531e-05, + "loss": 0.0661, + "num_input_tokens_seen": 88007616, + "step": 72310 + }, + { + "epoch": 8.053792181757434, + "grad_norm": 0.6660601496696472, + "learning_rate": 3.729660805740699e-05, + "loss": 0.0971, + "num_input_tokens_seen": 88013984, + "step": 72315 + }, + { + "epoch": 8.054349036641051, + "grad_norm": 0.28417500853538513, + "learning_rate": 3.7294492495009556e-05, + "loss": 0.0256, + "num_input_tokens_seen": 88020288, + "step": 72320 + }, + { + "epoch": 8.054905891524669, + "grad_norm": 0.015073624439537525, + "learning_rate": 3.7292376816480804e-05, + "loss": 0.0159, + "num_input_tokens_seen": 88026368, + "step": 72325 + }, + { + "epoch": 8.055462746408287, + "grad_norm": 0.4232950210571289, + "learning_rate": 3.72902610218407e-05, + "loss": 0.0838, + "num_input_tokens_seen": 88032160, + "step": 72330 + }, + { + "epoch": 8.056019601291903, + "grad_norm": 1.459694504737854, + "learning_rate": 3.728814511110924e-05, + "loss": 0.1059, + "num_input_tokens_seen": 88037728, + "step": 72335 + }, + { + "epoch": 8.05657645617552, + "grad_norm": 0.007283250335603952, + "learning_rate": 3.728602908430639e-05, + "loss": 0.1123, + "num_input_tokens_seen": 88043744, + "step": 72340 + }, + { + "epoch": 8.057133311059138, + "grad_norm": 2.2963569164276123, + "learning_rate": 3.728391294145217e-05, + "loss": 0.0769, + "num_input_tokens_seen": 88049600, + "step": 72345 + }, + { + "epoch": 8.057690165942756, + "grad_norm": 0.16746409237384796, + "learning_rate": 3.7281796682566534e-05, + "loss": 0.007, + "num_input_tokens_seen": 88055904, + "step": 72350 + }, + { + "epoch": 8.058247020826373, + "grad_norm": 0.17668402194976807, + "learning_rate": 3.72796803076695e-05, + "loss": 0.0693, + "num_input_tokens_seen": 88062016, + "step": 72355 + }, + { + "epoch": 8.05880387570999, + "grad_norm": 0.0004440412449184805, + "learning_rate": 3.727756381678104e-05, + "loss": 0.0947, + "num_input_tokens_seen": 88067808, + "step": 72360 + }, + { + "epoch": 8.059360730593607, + "grad_norm": 0.6425817012786865, + "learning_rate": 3.727544720992115e-05, + "loss": 0.0893, + "num_input_tokens_seen": 88073408, + "step": 72365 + }, + { + "epoch": 8.059917585477224, + "grad_norm": 0.004083073232322931, + "learning_rate": 3.7273330487109833e-05, + "loss": 0.1527, + "num_input_tokens_seen": 88078976, + "step": 72370 + }, + { + "epoch": 8.060474440360842, + "grad_norm": 0.4390868544578552, + "learning_rate": 3.7271213648367074e-05, + "loss": 0.0204, + "num_input_tokens_seen": 88085024, + "step": 72375 + }, + { + "epoch": 8.06103129524446, + "grad_norm": 0.19321689009666443, + "learning_rate": 3.726909669371287e-05, + "loss": 0.0279, + "num_input_tokens_seen": 88091040, + "step": 72380 + }, + { + "epoch": 8.061588150128078, + "grad_norm": 0.5142722129821777, + "learning_rate": 3.726697962316722e-05, + "loss": 0.0211, + "num_input_tokens_seen": 88097248, + "step": 72385 + }, + { + "epoch": 8.062145005011693, + "grad_norm": 0.04082578048110008, + "learning_rate": 3.726486243675012e-05, + "loss": 0.0318, + "num_input_tokens_seen": 88103552, + "step": 72390 + }, + { + "epoch": 8.062701859895311, + "grad_norm": 0.4115043878555298, + "learning_rate": 3.726274513448157e-05, + "loss": 0.0087, + "num_input_tokens_seen": 88109792, + "step": 72395 + }, + { + "epoch": 8.063258714778929, + "grad_norm": 0.017566654831171036, + "learning_rate": 3.726062771638156e-05, + "loss": 0.0005, + "num_input_tokens_seen": 88116160, + "step": 72400 + }, + { + "epoch": 8.063815569662546, + "grad_norm": 0.10096380859613419, + "learning_rate": 3.725851018247011e-05, + "loss": 0.0442, + "num_input_tokens_seen": 88122368, + "step": 72405 + }, + { + "epoch": 8.064372424546164, + "grad_norm": 0.01440573949366808, + "learning_rate": 3.725639253276719e-05, + "loss": 0.036, + "num_input_tokens_seen": 88128320, + "step": 72410 + }, + { + "epoch": 8.06492927942978, + "grad_norm": 0.7123117446899414, + "learning_rate": 3.725427476729284e-05, + "loss": 0.0462, + "num_input_tokens_seen": 88134368, + "step": 72415 + }, + { + "epoch": 8.065486134313398, + "grad_norm": 0.12322971224784851, + "learning_rate": 3.7252156886067046e-05, + "loss": 0.0524, + "num_input_tokens_seen": 88140288, + "step": 72420 + }, + { + "epoch": 8.066042989197015, + "grad_norm": 0.03716926649212837, + "learning_rate": 3.7250038889109805e-05, + "loss": 0.0046, + "num_input_tokens_seen": 88146624, + "step": 72425 + }, + { + "epoch": 8.066599844080633, + "grad_norm": 0.6624975800514221, + "learning_rate": 3.724792077644114e-05, + "loss": 0.1663, + "num_input_tokens_seen": 88152928, + "step": 72430 + }, + { + "epoch": 8.06715669896425, + "grad_norm": 0.041858986020088196, + "learning_rate": 3.7245802548081045e-05, + "loss": 0.0137, + "num_input_tokens_seen": 88159136, + "step": 72435 + }, + { + "epoch": 8.067713553847867, + "grad_norm": 0.010636997409164906, + "learning_rate": 3.724368420404954e-05, + "loss": 0.0189, + "num_input_tokens_seen": 88165120, + "step": 72440 + }, + { + "epoch": 8.068270408731484, + "grad_norm": 0.7414106130599976, + "learning_rate": 3.724156574436662e-05, + "loss": 0.046, + "num_input_tokens_seen": 88171296, + "step": 72445 + }, + { + "epoch": 8.068827263615102, + "grad_norm": 0.10231568664312363, + "learning_rate": 3.723944716905231e-05, + "loss": 0.0059, + "num_input_tokens_seen": 88177248, + "step": 72450 + }, + { + "epoch": 8.06938411849872, + "grad_norm": 0.46865564584732056, + "learning_rate": 3.723732847812661e-05, + "loss": 0.028, + "num_input_tokens_seen": 88183456, + "step": 72455 + }, + { + "epoch": 8.069940973382337, + "grad_norm": 0.06120776757597923, + "learning_rate": 3.723520967160955e-05, + "loss": 0.013, + "num_input_tokens_seen": 88189728, + "step": 72460 + }, + { + "epoch": 8.070497828265953, + "grad_norm": 0.7543430924415588, + "learning_rate": 3.723309074952112e-05, + "loss": 0.0437, + "num_input_tokens_seen": 88195776, + "step": 72465 + }, + { + "epoch": 8.07105468314957, + "grad_norm": 0.027991095557808876, + "learning_rate": 3.723097171188134e-05, + "loss": 0.0435, + "num_input_tokens_seen": 88201984, + "step": 72470 + }, + { + "epoch": 8.071611538033189, + "grad_norm": 0.10567855834960938, + "learning_rate": 3.722885255871025e-05, + "loss": 0.1002, + "num_input_tokens_seen": 88208288, + "step": 72475 + }, + { + "epoch": 8.072168392916806, + "grad_norm": 0.17440316081047058, + "learning_rate": 3.7226733290027846e-05, + "loss": 0.1299, + "num_input_tokens_seen": 88214432, + "step": 72480 + }, + { + "epoch": 8.072725247800424, + "grad_norm": 0.49241188168525696, + "learning_rate": 3.7224613905854146e-05, + "loss": 0.0294, + "num_input_tokens_seen": 88220224, + "step": 72485 + }, + { + "epoch": 8.07328210268404, + "grad_norm": 0.11859997361898422, + "learning_rate": 3.722249440620917e-05, + "loss": 0.0038, + "num_input_tokens_seen": 88226720, + "step": 72490 + }, + { + "epoch": 8.073838957567657, + "grad_norm": 1.6106033325195312, + "learning_rate": 3.722037479111295e-05, + "loss": 0.1113, + "num_input_tokens_seen": 88232128, + "step": 72495 + }, + { + "epoch": 8.074395812451275, + "grad_norm": 0.22069412469863892, + "learning_rate": 3.72182550605855e-05, + "loss": 0.1271, + "num_input_tokens_seen": 88238304, + "step": 72500 + }, + { + "epoch": 8.074952667334893, + "grad_norm": 1.8753674030303955, + "learning_rate": 3.7216135214646836e-05, + "loss": 0.0509, + "num_input_tokens_seen": 88244640, + "step": 72505 + }, + { + "epoch": 8.07550952221851, + "grad_norm": 0.10783951729536057, + "learning_rate": 3.721401525331699e-05, + "loss": 0.0269, + "num_input_tokens_seen": 88250400, + "step": 72510 + }, + { + "epoch": 8.076066377102126, + "grad_norm": 0.43161749839782715, + "learning_rate": 3.721189517661599e-05, + "loss": 0.0711, + "num_input_tokens_seen": 88256160, + "step": 72515 + }, + { + "epoch": 8.076623231985744, + "grad_norm": 0.06724735349416733, + "learning_rate": 3.720977498456384e-05, + "loss": 0.0069, + "num_input_tokens_seen": 88262208, + "step": 72520 + }, + { + "epoch": 8.077180086869362, + "grad_norm": 0.0006763249984942377, + "learning_rate": 3.7207654677180595e-05, + "loss": 0.1269, + "num_input_tokens_seen": 88267904, + "step": 72525 + }, + { + "epoch": 8.07773694175298, + "grad_norm": 2.8126537799835205, + "learning_rate": 3.7205534254486266e-05, + "loss": 0.0395, + "num_input_tokens_seen": 88274176, + "step": 72530 + }, + { + "epoch": 8.078293796636597, + "grad_norm": 1.2131484746932983, + "learning_rate": 3.7203413716500893e-05, + "loss": 0.0751, + "num_input_tokens_seen": 88279808, + "step": 72535 + }, + { + "epoch": 8.078850651520213, + "grad_norm": 1.7838610410690308, + "learning_rate": 3.7201293063244494e-05, + "loss": 0.0553, + "num_input_tokens_seen": 88286144, + "step": 72540 + }, + { + "epoch": 8.07940750640383, + "grad_norm": 0.03358886018395424, + "learning_rate": 3.719917229473711e-05, + "loss": 0.0375, + "num_input_tokens_seen": 88292288, + "step": 72545 + }, + { + "epoch": 8.079964361287448, + "grad_norm": 0.12356237322092056, + "learning_rate": 3.719705141099877e-05, + "loss": 0.0791, + "num_input_tokens_seen": 88298016, + "step": 72550 + }, + { + "epoch": 8.080521216171066, + "grad_norm": 0.3141056299209595, + "learning_rate": 3.719493041204951e-05, + "loss": 0.0255, + "num_input_tokens_seen": 88304064, + "step": 72555 + }, + { + "epoch": 8.081078071054684, + "grad_norm": 0.8838388323783875, + "learning_rate": 3.719280929790936e-05, + "loss": 0.0625, + "num_input_tokens_seen": 88310176, + "step": 72560 + }, + { + "epoch": 8.081634925938301, + "grad_norm": 0.2770826816558838, + "learning_rate": 3.7190688068598356e-05, + "loss": 0.1103, + "num_input_tokens_seen": 88316320, + "step": 72565 + }, + { + "epoch": 8.082191780821917, + "grad_norm": 0.0016595569904893637, + "learning_rate": 3.7188566724136536e-05, + "loss": 0.0716, + "num_input_tokens_seen": 88322400, + "step": 72570 + }, + { + "epoch": 8.082748635705535, + "grad_norm": 0.07732124626636505, + "learning_rate": 3.718644526454394e-05, + "loss": 0.038, + "num_input_tokens_seen": 88328480, + "step": 72575 + }, + { + "epoch": 8.083305490589153, + "grad_norm": 0.26734861731529236, + "learning_rate": 3.718432368984059e-05, + "loss": 0.0045, + "num_input_tokens_seen": 88334592, + "step": 72580 + }, + { + "epoch": 8.08386234547277, + "grad_norm": 0.028182189911603928, + "learning_rate": 3.718220200004656e-05, + "loss": 0.0383, + "num_input_tokens_seen": 88340384, + "step": 72585 + }, + { + "epoch": 8.084419200356388, + "grad_norm": 0.10845541208982468, + "learning_rate": 3.718008019518187e-05, + "loss": 0.0854, + "num_input_tokens_seen": 88346496, + "step": 72590 + }, + { + "epoch": 8.084976055240004, + "grad_norm": 0.016897140070796013, + "learning_rate": 3.717795827526656e-05, + "loss": 0.0055, + "num_input_tokens_seen": 88352544, + "step": 72595 + }, + { + "epoch": 8.085532910123622, + "grad_norm": 0.0023687363136559725, + "learning_rate": 3.717583624032067e-05, + "loss": 0.0124, + "num_input_tokens_seen": 88358880, + "step": 72600 + }, + { + "epoch": 8.08608976500724, + "grad_norm": 0.11186536401510239, + "learning_rate": 3.7173714090364264e-05, + "loss": 0.028, + "num_input_tokens_seen": 88365184, + "step": 72605 + }, + { + "epoch": 8.086646619890857, + "grad_norm": 0.40595540404319763, + "learning_rate": 3.7171591825417375e-05, + "loss": 0.1052, + "num_input_tokens_seen": 88371264, + "step": 72610 + }, + { + "epoch": 8.087203474774475, + "grad_norm": 0.01462267991155386, + "learning_rate": 3.716946944550004e-05, + "loss": 0.0106, + "num_input_tokens_seen": 88377280, + "step": 72615 + }, + { + "epoch": 8.08776032965809, + "grad_norm": 0.032559048384428024, + "learning_rate": 3.716734695063232e-05, + "loss": 0.0233, + "num_input_tokens_seen": 88383552, + "step": 72620 + }, + { + "epoch": 8.088317184541708, + "grad_norm": 0.0031788568012416363, + "learning_rate": 3.716522434083426e-05, + "loss": 0.0002, + "num_input_tokens_seen": 88389600, + "step": 72625 + }, + { + "epoch": 8.088874039425326, + "grad_norm": 0.22408483922481537, + "learning_rate": 3.716310161612591e-05, + "loss": 0.0158, + "num_input_tokens_seen": 88395776, + "step": 72630 + }, + { + "epoch": 8.089430894308943, + "grad_norm": 0.20122651755809784, + "learning_rate": 3.716097877652732e-05, + "loss": 0.1174, + "num_input_tokens_seen": 88401952, + "step": 72635 + }, + { + "epoch": 8.089987749192561, + "grad_norm": 1.515602707862854, + "learning_rate": 3.715885582205854e-05, + "loss": 0.0424, + "num_input_tokens_seen": 88407936, + "step": 72640 + }, + { + "epoch": 8.090544604076177, + "grad_norm": 0.032161276787519455, + "learning_rate": 3.7156732752739624e-05, + "loss": 0.0503, + "num_input_tokens_seen": 88413888, + "step": 72645 + }, + { + "epoch": 8.091101458959795, + "grad_norm": 0.19895586371421814, + "learning_rate": 3.715460956859063e-05, + "loss": 0.0195, + "num_input_tokens_seen": 88420128, + "step": 72650 + }, + { + "epoch": 8.091658313843412, + "grad_norm": 0.13875433802604675, + "learning_rate": 3.7152486269631616e-05, + "loss": 0.0164, + "num_input_tokens_seen": 88426432, + "step": 72655 + }, + { + "epoch": 8.09221516872703, + "grad_norm": 0.014115267433226109, + "learning_rate": 3.7150362855882624e-05, + "loss": 0.0203, + "num_input_tokens_seen": 88432736, + "step": 72660 + }, + { + "epoch": 8.092772023610648, + "grad_norm": 1.1699721813201904, + "learning_rate": 3.7148239327363724e-05, + "loss": 0.0328, + "num_input_tokens_seen": 88438720, + "step": 72665 + }, + { + "epoch": 8.093328878494264, + "grad_norm": 2.0284578800201416, + "learning_rate": 3.714611568409498e-05, + "loss": 0.2265, + "num_input_tokens_seen": 88444736, + "step": 72670 + }, + { + "epoch": 8.093885733377881, + "grad_norm": 0.4717763066291809, + "learning_rate": 3.714399192609643e-05, + "loss": 0.0351, + "num_input_tokens_seen": 88450976, + "step": 72675 + }, + { + "epoch": 8.094442588261499, + "grad_norm": 0.36727339029312134, + "learning_rate": 3.714186805338815e-05, + "loss": 0.0401, + "num_input_tokens_seen": 88457632, + "step": 72680 + }, + { + "epoch": 8.094999443145117, + "grad_norm": 1.4744937419891357, + "learning_rate": 3.7139744065990195e-05, + "loss": 0.0991, + "num_input_tokens_seen": 88463392, + "step": 72685 + }, + { + "epoch": 8.095556298028734, + "grad_norm": 0.015542195178568363, + "learning_rate": 3.7137619963922634e-05, + "loss": 0.0013, + "num_input_tokens_seen": 88469472, + "step": 72690 + }, + { + "epoch": 8.09611315291235, + "grad_norm": 0.5048357844352722, + "learning_rate": 3.713549574720553e-05, + "loss": 0.1178, + "num_input_tokens_seen": 88475392, + "step": 72695 + }, + { + "epoch": 8.096670007795968, + "grad_norm": 0.8608165979385376, + "learning_rate": 3.713337141585894e-05, + "loss": 0.0496, + "num_input_tokens_seen": 88481440, + "step": 72700 + }, + { + "epoch": 8.097226862679586, + "grad_norm": 0.7721536755561829, + "learning_rate": 3.7131246969902944e-05, + "loss": 0.0832, + "num_input_tokens_seen": 88487424, + "step": 72705 + }, + { + "epoch": 8.097783717563203, + "grad_norm": 0.017170274630188942, + "learning_rate": 3.712912240935759e-05, + "loss": 0.0077, + "num_input_tokens_seen": 88493760, + "step": 72710 + }, + { + "epoch": 8.098340572446821, + "grad_norm": 0.03187044709920883, + "learning_rate": 3.7126997734242966e-05, + "loss": 0.0088, + "num_input_tokens_seen": 88500128, + "step": 72715 + }, + { + "epoch": 8.098897427330437, + "grad_norm": 0.02476711943745613, + "learning_rate": 3.712487294457913e-05, + "loss": 0.0038, + "num_input_tokens_seen": 88506656, + "step": 72720 + }, + { + "epoch": 8.099454282214055, + "grad_norm": 0.06194653362035751, + "learning_rate": 3.712274804038615e-05, + "loss": 0.0019, + "num_input_tokens_seen": 88512960, + "step": 72725 + }, + { + "epoch": 8.100011137097672, + "grad_norm": 2.190805673599243, + "learning_rate": 3.712062302168411e-05, + "loss": 0.0964, + "num_input_tokens_seen": 88519360, + "step": 72730 + }, + { + "epoch": 8.10056799198129, + "grad_norm": 0.3963550925254822, + "learning_rate": 3.711849788849307e-05, + "loss": 0.015, + "num_input_tokens_seen": 88525568, + "step": 72735 + }, + { + "epoch": 8.101124846864908, + "grad_norm": 1.0539418458938599, + "learning_rate": 3.7116372640833116e-05, + "loss": 0.1132, + "num_input_tokens_seen": 88531840, + "step": 72740 + }, + { + "epoch": 8.101681701748525, + "grad_norm": 0.04386898875236511, + "learning_rate": 3.711424727872431e-05, + "loss": 0.0522, + "num_input_tokens_seen": 88538304, + "step": 72745 + }, + { + "epoch": 8.102238556632141, + "grad_norm": 0.29477429389953613, + "learning_rate": 3.7112121802186724e-05, + "loss": 0.0177, + "num_input_tokens_seen": 88544352, + "step": 72750 + }, + { + "epoch": 8.102795411515759, + "grad_norm": 0.25942134857177734, + "learning_rate": 3.7109996211240454e-05, + "loss": 0.1068, + "num_input_tokens_seen": 88550304, + "step": 72755 + }, + { + "epoch": 8.103352266399376, + "grad_norm": 0.0022521859500557184, + "learning_rate": 3.710787050590556e-05, + "loss": 0.0603, + "num_input_tokens_seen": 88556288, + "step": 72760 + }, + { + "epoch": 8.103909121282994, + "grad_norm": 0.2738342881202698, + "learning_rate": 3.710574468620214e-05, + "loss": 0.0114, + "num_input_tokens_seen": 88562304, + "step": 72765 + }, + { + "epoch": 8.104465976166612, + "grad_norm": 0.3111749589443207, + "learning_rate": 3.710361875215025e-05, + "loss": 0.008, + "num_input_tokens_seen": 88568416, + "step": 72770 + }, + { + "epoch": 8.105022831050228, + "grad_norm": 0.12094004452228546, + "learning_rate": 3.710149270376999e-05, + "loss": 0.009, + "num_input_tokens_seen": 88574400, + "step": 72775 + }, + { + "epoch": 8.105579685933845, + "grad_norm": 0.3920368552207947, + "learning_rate": 3.7099366541081434e-05, + "loss": 0.0334, + "num_input_tokens_seen": 88580608, + "step": 72780 + }, + { + "epoch": 8.106136540817463, + "grad_norm": 0.05187809094786644, + "learning_rate": 3.709724026410467e-05, + "loss": 0.0472, + "num_input_tokens_seen": 88586656, + "step": 72785 + }, + { + "epoch": 8.10669339570108, + "grad_norm": 2.455211877822876, + "learning_rate": 3.709511387285978e-05, + "loss": 0.1433, + "num_input_tokens_seen": 88592896, + "step": 72790 + }, + { + "epoch": 8.107250250584698, + "grad_norm": 0.08389507234096527, + "learning_rate": 3.709298736736684e-05, + "loss": 0.01, + "num_input_tokens_seen": 88599200, + "step": 72795 + }, + { + "epoch": 8.107807105468314, + "grad_norm": 1.4452235698699951, + "learning_rate": 3.7090860747645955e-05, + "loss": 0.0263, + "num_input_tokens_seen": 88605408, + "step": 72800 + }, + { + "epoch": 8.108363960351932, + "grad_norm": 0.25658026337623596, + "learning_rate": 3.70887340137172e-05, + "loss": 0.0213, + "num_input_tokens_seen": 88611584, + "step": 72805 + }, + { + "epoch": 8.10892081523555, + "grad_norm": 0.07183795422315598, + "learning_rate": 3.7086607165600665e-05, + "loss": 0.0372, + "num_input_tokens_seen": 88617920, + "step": 72810 + }, + { + "epoch": 8.109477670119167, + "grad_norm": 0.00019731992506422102, + "learning_rate": 3.708448020331645e-05, + "loss": 0.0049, + "num_input_tokens_seen": 88624064, + "step": 72815 + }, + { + "epoch": 8.110034525002785, + "grad_norm": 1.8412349224090576, + "learning_rate": 3.708235312688463e-05, + "loss": 0.0954, + "num_input_tokens_seen": 88630432, + "step": 72820 + }, + { + "epoch": 8.1105913798864, + "grad_norm": 0.7012084126472473, + "learning_rate": 3.7080225936325303e-05, + "loss": 0.0294, + "num_input_tokens_seen": 88636832, + "step": 72825 + }, + { + "epoch": 8.111148234770019, + "grad_norm": 1.0005921125411987, + "learning_rate": 3.7078098631658565e-05, + "loss": 0.0971, + "num_input_tokens_seen": 88642880, + "step": 72830 + }, + { + "epoch": 8.111705089653636, + "grad_norm": 0.007401713170111179, + "learning_rate": 3.707597121290451e-05, + "loss": 0.0139, + "num_input_tokens_seen": 88649088, + "step": 72835 + }, + { + "epoch": 8.112261944537254, + "grad_norm": 0.025438932701945305, + "learning_rate": 3.707384368008323e-05, + "loss": 0.0186, + "num_input_tokens_seen": 88655296, + "step": 72840 + }, + { + "epoch": 8.112818799420872, + "grad_norm": 0.06481225043535233, + "learning_rate": 3.7071716033214835e-05, + "loss": 0.0032, + "num_input_tokens_seen": 88661312, + "step": 72845 + }, + { + "epoch": 8.113375654304487, + "grad_norm": 0.06419811397790909, + "learning_rate": 3.7069588272319394e-05, + "loss": 0.0427, + "num_input_tokens_seen": 88667232, + "step": 72850 + }, + { + "epoch": 8.113932509188105, + "grad_norm": 0.549613356590271, + "learning_rate": 3.7067460397417025e-05, + "loss": 0.0136, + "num_input_tokens_seen": 88673280, + "step": 72855 + }, + { + "epoch": 8.114489364071723, + "grad_norm": 0.0005811756709590554, + "learning_rate": 3.706533240852783e-05, + "loss": 0.0808, + "num_input_tokens_seen": 88679584, + "step": 72860 + }, + { + "epoch": 8.11504621895534, + "grad_norm": 1.4888802766799927, + "learning_rate": 3.70632043056719e-05, + "loss": 0.0891, + "num_input_tokens_seen": 88685696, + "step": 72865 + }, + { + "epoch": 8.115603073838958, + "grad_norm": 0.673559844493866, + "learning_rate": 3.706107608886934e-05, + "loss": 0.0925, + "num_input_tokens_seen": 88691648, + "step": 72870 + }, + { + "epoch": 8.116159928722574, + "grad_norm": 0.4683810770511627, + "learning_rate": 3.7058947758140255e-05, + "loss": 0.0479, + "num_input_tokens_seen": 88697728, + "step": 72875 + }, + { + "epoch": 8.116716783606192, + "grad_norm": 0.623373806476593, + "learning_rate": 3.705681931350474e-05, + "loss": 0.013, + "num_input_tokens_seen": 88703872, + "step": 72880 + }, + { + "epoch": 8.11727363848981, + "grad_norm": 0.013041802681982517, + "learning_rate": 3.7054690754982925e-05, + "loss": 0.1102, + "num_input_tokens_seen": 88710176, + "step": 72885 + }, + { + "epoch": 8.117830493373427, + "grad_norm": 0.0006158994510769844, + "learning_rate": 3.7052562082594875e-05, + "loss": 0.108, + "num_input_tokens_seen": 88715872, + "step": 72890 + }, + { + "epoch": 8.118387348257045, + "grad_norm": 0.3257119357585907, + "learning_rate": 3.7050433296360745e-05, + "loss": 0.0363, + "num_input_tokens_seen": 88721536, + "step": 72895 + }, + { + "epoch": 8.11894420314066, + "grad_norm": 1.255813717842102, + "learning_rate": 3.7048304396300593e-05, + "loss": 0.0673, + "num_input_tokens_seen": 88726816, + "step": 72900 + }, + { + "epoch": 8.119501058024278, + "grad_norm": 0.011806193739175797, + "learning_rate": 3.7046175382434565e-05, + "loss": 0.0109, + "num_input_tokens_seen": 88733216, + "step": 72905 + }, + { + "epoch": 8.120057912907896, + "grad_norm": 0.04020533338189125, + "learning_rate": 3.704404625478276e-05, + "loss": 0.0152, + "num_input_tokens_seen": 88739200, + "step": 72910 + }, + { + "epoch": 8.120614767791514, + "grad_norm": 0.004295517224818468, + "learning_rate": 3.704191701336529e-05, + "loss": 0.0621, + "num_input_tokens_seen": 88745120, + "step": 72915 + }, + { + "epoch": 8.121171622675131, + "grad_norm": 0.44363468885421753, + "learning_rate": 3.703978765820226e-05, + "loss": 0.0336, + "num_input_tokens_seen": 88750976, + "step": 72920 + }, + { + "epoch": 8.121728477558749, + "grad_norm": 0.5629670023918152, + "learning_rate": 3.703765818931379e-05, + "loss": 0.0727, + "num_input_tokens_seen": 88756384, + "step": 72925 + }, + { + "epoch": 8.122285332442365, + "grad_norm": 0.002194795059040189, + "learning_rate": 3.703552860672e-05, + "loss": 0.0424, + "num_input_tokens_seen": 88762016, + "step": 72930 + }, + { + "epoch": 8.122842187325983, + "grad_norm": 0.678100049495697, + "learning_rate": 3.703339891044099e-05, + "loss": 0.1325, + "num_input_tokens_seen": 88767456, + "step": 72935 + }, + { + "epoch": 8.1233990422096, + "grad_norm": 1.8168306350708008, + "learning_rate": 3.70312691004969e-05, + "loss": 0.0526, + "num_input_tokens_seen": 88773856, + "step": 72940 + }, + { + "epoch": 8.123955897093218, + "grad_norm": 0.14017271995544434, + "learning_rate": 3.7029139176907826e-05, + "loss": 0.0144, + "num_input_tokens_seen": 88780000, + "step": 72945 + }, + { + "epoch": 8.124512751976836, + "grad_norm": 0.2126632034778595, + "learning_rate": 3.7027009139693894e-05, + "loss": 0.0265, + "num_input_tokens_seen": 88785600, + "step": 72950 + }, + { + "epoch": 8.125069606860452, + "grad_norm": 0.011109689250588417, + "learning_rate": 3.702487898887522e-05, + "loss": 0.0672, + "num_input_tokens_seen": 88791456, + "step": 72955 + }, + { + "epoch": 8.12562646174407, + "grad_norm": 0.7093721628189087, + "learning_rate": 3.702274872447194e-05, + "loss": 0.0491, + "num_input_tokens_seen": 88797568, + "step": 72960 + }, + { + "epoch": 8.126183316627687, + "grad_norm": 0.6631881594657898, + "learning_rate": 3.702061834650416e-05, + "loss": 0.0074, + "num_input_tokens_seen": 88804000, + "step": 72965 + }, + { + "epoch": 8.126740171511305, + "grad_norm": 0.5518066883087158, + "learning_rate": 3.701848785499201e-05, + "loss": 0.0506, + "num_input_tokens_seen": 88810400, + "step": 72970 + }, + { + "epoch": 8.127297026394922, + "grad_norm": 0.6240283846855164, + "learning_rate": 3.701635724995561e-05, + "loss": 0.0169, + "num_input_tokens_seen": 88816960, + "step": 72975 + }, + { + "epoch": 8.127853881278538, + "grad_norm": 0.504288375377655, + "learning_rate": 3.7014226531415095e-05, + "loss": 0.0357, + "num_input_tokens_seen": 88823168, + "step": 72980 + }, + { + "epoch": 8.128410736162156, + "grad_norm": 0.02949771098792553, + "learning_rate": 3.701209569939058e-05, + "loss": 0.0058, + "num_input_tokens_seen": 88829248, + "step": 72985 + }, + { + "epoch": 8.128967591045773, + "grad_norm": 0.023349182680249214, + "learning_rate": 3.7009964753902205e-05, + "loss": 0.0262, + "num_input_tokens_seen": 88835520, + "step": 72990 + }, + { + "epoch": 8.129524445929391, + "grad_norm": 1.4244409799575806, + "learning_rate": 3.700783369497008e-05, + "loss": 0.0438, + "num_input_tokens_seen": 88841088, + "step": 72995 + }, + { + "epoch": 8.130081300813009, + "grad_norm": 0.0721561536192894, + "learning_rate": 3.700570252261435e-05, + "loss": 0.1061, + "num_input_tokens_seen": 88847328, + "step": 73000 + }, + { + "epoch": 8.130638155696625, + "grad_norm": 0.8389483094215393, + "learning_rate": 3.700357123685514e-05, + "loss": 0.0256, + "num_input_tokens_seen": 88853600, + "step": 73005 + }, + { + "epoch": 8.131195010580242, + "grad_norm": 0.01586693339049816, + "learning_rate": 3.7001439837712584e-05, + "loss": 0.2041, + "num_input_tokens_seen": 88859040, + "step": 73010 + }, + { + "epoch": 8.13175186546386, + "grad_norm": 0.0021992893889546394, + "learning_rate": 3.699930832520682e-05, + "loss": 0.0787, + "num_input_tokens_seen": 88865120, + "step": 73015 + }, + { + "epoch": 8.132308720347478, + "grad_norm": 0.5831504464149475, + "learning_rate": 3.6997176699357964e-05, + "loss": 0.0668, + "num_input_tokens_seen": 88871136, + "step": 73020 + }, + { + "epoch": 8.132865575231095, + "grad_norm": 2.1362974643707275, + "learning_rate": 3.699504496018616e-05, + "loss": 0.1716, + "num_input_tokens_seen": 88877600, + "step": 73025 + }, + { + "epoch": 8.133422430114711, + "grad_norm": 0.0008666568319313228, + "learning_rate": 3.699291310771156e-05, + "loss": 0.0116, + "num_input_tokens_seen": 88883744, + "step": 73030 + }, + { + "epoch": 8.133979284998329, + "grad_norm": 1.65836763381958, + "learning_rate": 3.699078114195428e-05, + "loss": 0.1052, + "num_input_tokens_seen": 88889376, + "step": 73035 + }, + { + "epoch": 8.134536139881947, + "grad_norm": 0.0023486586287617683, + "learning_rate": 3.6988649062934454e-05, + "loss": 0.1848, + "num_input_tokens_seen": 88895616, + "step": 73040 + }, + { + "epoch": 8.135092994765564, + "grad_norm": 0.1949937790632248, + "learning_rate": 3.698651687067225e-05, + "loss": 0.0084, + "num_input_tokens_seen": 88901856, + "step": 73045 + }, + { + "epoch": 8.135649849649182, + "grad_norm": 0.42897701263427734, + "learning_rate": 3.6984384565187776e-05, + "loss": 0.1167, + "num_input_tokens_seen": 88908160, + "step": 73050 + }, + { + "epoch": 8.136206704532798, + "grad_norm": 0.2964794635772705, + "learning_rate": 3.69822521465012e-05, + "loss": 0.0627, + "num_input_tokens_seen": 88913888, + "step": 73055 + }, + { + "epoch": 8.136763559416416, + "grad_norm": 0.0014303073985502124, + "learning_rate": 3.698011961463265e-05, + "loss": 0.078, + "num_input_tokens_seen": 88919968, + "step": 73060 + }, + { + "epoch": 8.137320414300033, + "grad_norm": 0.10226119309663773, + "learning_rate": 3.6977986969602266e-05, + "loss": 0.0819, + "num_input_tokens_seen": 88926368, + "step": 73065 + }, + { + "epoch": 8.137877269183651, + "grad_norm": 1.161969780921936, + "learning_rate": 3.6975854211430205e-05, + "loss": 0.0748, + "num_input_tokens_seen": 88932480, + "step": 73070 + }, + { + "epoch": 8.138434124067269, + "grad_norm": 0.23546218872070312, + "learning_rate": 3.69737213401366e-05, + "loss": 0.007, + "num_input_tokens_seen": 88937984, + "step": 73075 + }, + { + "epoch": 8.138990978950885, + "grad_norm": 1.4496278762817383, + "learning_rate": 3.697158835574162e-05, + "loss": 0.1112, + "num_input_tokens_seen": 88944064, + "step": 73080 + }, + { + "epoch": 8.139547833834502, + "grad_norm": 2.8217267990112305, + "learning_rate": 3.6969455258265376e-05, + "loss": 0.1176, + "num_input_tokens_seen": 88950688, + "step": 73085 + }, + { + "epoch": 8.14010468871812, + "grad_norm": 1.729680061340332, + "learning_rate": 3.696732204772805e-05, + "loss": 0.1539, + "num_input_tokens_seen": 88957152, + "step": 73090 + }, + { + "epoch": 8.140661543601738, + "grad_norm": 0.647640585899353, + "learning_rate": 3.696518872414977e-05, + "loss": 0.1377, + "num_input_tokens_seen": 88963168, + "step": 73095 + }, + { + "epoch": 8.141218398485355, + "grad_norm": 0.07427822798490524, + "learning_rate": 3.69630552875507e-05, + "loss": 0.0958, + "num_input_tokens_seen": 88969376, + "step": 73100 + }, + { + "epoch": 8.141775253368973, + "grad_norm": 0.2698385715484619, + "learning_rate": 3.6960921737950985e-05, + "loss": 0.0449, + "num_input_tokens_seen": 88975680, + "step": 73105 + }, + { + "epoch": 8.142332108252589, + "grad_norm": 1.2929401397705078, + "learning_rate": 3.695878807537079e-05, + "loss": 0.1252, + "num_input_tokens_seen": 88981376, + "step": 73110 + }, + { + "epoch": 8.142888963136206, + "grad_norm": 2.2350029945373535, + "learning_rate": 3.6956654299830255e-05, + "loss": 0.0637, + "num_input_tokens_seen": 88987584, + "step": 73115 + }, + { + "epoch": 8.143445818019824, + "grad_norm": 0.32805708050727844, + "learning_rate": 3.6954520411349545e-05, + "loss": 0.0373, + "num_input_tokens_seen": 88993600, + "step": 73120 + }, + { + "epoch": 8.144002672903442, + "grad_norm": 0.005991604644805193, + "learning_rate": 3.6952386409948805e-05, + "loss": 0.0374, + "num_input_tokens_seen": 88999904, + "step": 73125 + }, + { + "epoch": 8.14455952778706, + "grad_norm": 0.020806487649679184, + "learning_rate": 3.69502522956482e-05, + "loss": 0.0377, + "num_input_tokens_seen": 89005952, + "step": 73130 + }, + { + "epoch": 8.145116382670675, + "grad_norm": 1.1980003118515015, + "learning_rate": 3.694811806846789e-05, + "loss": 0.0569, + "num_input_tokens_seen": 89012064, + "step": 73135 + }, + { + "epoch": 8.145673237554293, + "grad_norm": 0.012091006152331829, + "learning_rate": 3.6945983728428035e-05, + "loss": 0.0372, + "num_input_tokens_seen": 89017600, + "step": 73140 + }, + { + "epoch": 8.14623009243791, + "grad_norm": 0.006456165108829737, + "learning_rate": 3.6943849275548794e-05, + "loss": 0.1234, + "num_input_tokens_seen": 89023584, + "step": 73145 + }, + { + "epoch": 8.146786947321528, + "grad_norm": 0.015184092335402966, + "learning_rate": 3.6941714709850314e-05, + "loss": 0.0231, + "num_input_tokens_seen": 89029760, + "step": 73150 + }, + { + "epoch": 8.147343802205146, + "grad_norm": 1.138235092163086, + "learning_rate": 3.693958003135278e-05, + "loss": 0.0795, + "num_input_tokens_seen": 89035520, + "step": 73155 + }, + { + "epoch": 8.147900657088762, + "grad_norm": 0.553023099899292, + "learning_rate": 3.693744524007635e-05, + "loss": 0.0456, + "num_input_tokens_seen": 89041664, + "step": 73160 + }, + { + "epoch": 8.14845751197238, + "grad_norm": 4.216190814971924, + "learning_rate": 3.693531033604118e-05, + "loss": 0.0961, + "num_input_tokens_seen": 89047584, + "step": 73165 + }, + { + "epoch": 8.149014366855997, + "grad_norm": 0.31882020831108093, + "learning_rate": 3.6933175319267445e-05, + "loss": 0.0193, + "num_input_tokens_seen": 89053568, + "step": 73170 + }, + { + "epoch": 8.149571221739615, + "grad_norm": 0.025531044229865074, + "learning_rate": 3.6931040189775305e-05, + "loss": 0.0094, + "num_input_tokens_seen": 89059456, + "step": 73175 + }, + { + "epoch": 8.150128076623233, + "grad_norm": 0.4214577376842499, + "learning_rate": 3.692890494758494e-05, + "loss": 0.0544, + "num_input_tokens_seen": 89065792, + "step": 73180 + }, + { + "epoch": 8.150684931506849, + "grad_norm": 0.0014895932981744409, + "learning_rate": 3.6926769592716504e-05, + "loss": 0.0445, + "num_input_tokens_seen": 89071424, + "step": 73185 + }, + { + "epoch": 8.151241786390466, + "grad_norm": 0.40295013785362244, + "learning_rate": 3.6924634125190164e-05, + "loss": 0.0269, + "num_input_tokens_seen": 89077536, + "step": 73190 + }, + { + "epoch": 8.151798641274084, + "grad_norm": 1.2039144039154053, + "learning_rate": 3.692249854502612e-05, + "loss": 0.1537, + "num_input_tokens_seen": 89083552, + "step": 73195 + }, + { + "epoch": 8.152355496157702, + "grad_norm": 0.42578282952308655, + "learning_rate": 3.692036285224451e-05, + "loss": 0.0283, + "num_input_tokens_seen": 89089408, + "step": 73200 + }, + { + "epoch": 8.15291235104132, + "grad_norm": 2.2305819988250732, + "learning_rate": 3.6918227046865536e-05, + "loss": 0.053, + "num_input_tokens_seen": 89095744, + "step": 73205 + }, + { + "epoch": 8.153469205924935, + "grad_norm": 0.004592134617269039, + "learning_rate": 3.691609112890935e-05, + "loss": 0.0609, + "num_input_tokens_seen": 89101728, + "step": 73210 + }, + { + "epoch": 8.154026060808553, + "grad_norm": 0.17851929366588593, + "learning_rate": 3.6913955098396134e-05, + "loss": 0.006, + "num_input_tokens_seen": 89107776, + "step": 73215 + }, + { + "epoch": 8.15458291569217, + "grad_norm": 0.8632848262786865, + "learning_rate": 3.691181895534607e-05, + "loss": 0.0617, + "num_input_tokens_seen": 89113984, + "step": 73220 + }, + { + "epoch": 8.155139770575788, + "grad_norm": 9.638040501158684e-05, + "learning_rate": 3.690968269977933e-05, + "loss": 0.0044, + "num_input_tokens_seen": 89119904, + "step": 73225 + }, + { + "epoch": 8.155696625459406, + "grad_norm": 1.6396820545196533, + "learning_rate": 3.6907546331716104e-05, + "loss": 0.0945, + "num_input_tokens_seen": 89125440, + "step": 73230 + }, + { + "epoch": 8.156253480343022, + "grad_norm": 1.5730103254318237, + "learning_rate": 3.690540985117655e-05, + "loss": 0.0924, + "num_input_tokens_seen": 89131552, + "step": 73235 + }, + { + "epoch": 8.15681033522664, + "grad_norm": 0.06144658848643303, + "learning_rate": 3.690327325818087e-05, + "loss": 0.0803, + "num_input_tokens_seen": 89137760, + "step": 73240 + }, + { + "epoch": 8.157367190110257, + "grad_norm": 0.011355533264577389, + "learning_rate": 3.6901136552749236e-05, + "loss": 0.0854, + "num_input_tokens_seen": 89144064, + "step": 73245 + }, + { + "epoch": 8.157924044993875, + "grad_norm": 0.44579461216926575, + "learning_rate": 3.689899973490183e-05, + "loss": 0.0914, + "num_input_tokens_seen": 89150400, + "step": 73250 + }, + { + "epoch": 8.158480899877492, + "grad_norm": 0.0003533510025590658, + "learning_rate": 3.6896862804658835e-05, + "loss": 0.0356, + "num_input_tokens_seen": 89156480, + "step": 73255 + }, + { + "epoch": 8.159037754761108, + "grad_norm": 1.1134898662567139, + "learning_rate": 3.689472576204044e-05, + "loss": 0.0568, + "num_input_tokens_seen": 89162496, + "step": 73260 + }, + { + "epoch": 8.159594609644726, + "grad_norm": 0.18167845904827118, + "learning_rate": 3.689258860706684e-05, + "loss": 0.028, + "num_input_tokens_seen": 89168576, + "step": 73265 + }, + { + "epoch": 8.160151464528344, + "grad_norm": 0.0200825035572052, + "learning_rate": 3.6890451339758205e-05, + "loss": 0.0458, + "num_input_tokens_seen": 89174528, + "step": 73270 + }, + { + "epoch": 8.160708319411961, + "grad_norm": 0.003804118139669299, + "learning_rate": 3.6888313960134735e-05, + "loss": 0.0733, + "num_input_tokens_seen": 89180480, + "step": 73275 + }, + { + "epoch": 8.161265174295579, + "grad_norm": 0.8633546829223633, + "learning_rate": 3.688617646821661e-05, + "loss": 0.069, + "num_input_tokens_seen": 89186240, + "step": 73280 + }, + { + "epoch": 8.161822029179197, + "grad_norm": 1.0955899953842163, + "learning_rate": 3.688403886402403e-05, + "loss": 0.0497, + "num_input_tokens_seen": 89192480, + "step": 73285 + }, + { + "epoch": 8.162378884062813, + "grad_norm": 0.001453375443816185, + "learning_rate": 3.6881901147577174e-05, + "loss": 0.0258, + "num_input_tokens_seen": 89198880, + "step": 73290 + }, + { + "epoch": 8.16293573894643, + "grad_norm": 0.03566717728972435, + "learning_rate": 3.687976331889625e-05, + "loss": 0.0687, + "num_input_tokens_seen": 89204800, + "step": 73295 + }, + { + "epoch": 8.163492593830048, + "grad_norm": 0.04049809277057648, + "learning_rate": 3.687762537800144e-05, + "loss": 0.0577, + "num_input_tokens_seen": 89210560, + "step": 73300 + }, + { + "epoch": 8.164049448713666, + "grad_norm": 1.0166494846343994, + "learning_rate": 3.6875487324912935e-05, + "loss": 0.0664, + "num_input_tokens_seen": 89216800, + "step": 73305 + }, + { + "epoch": 8.164606303597283, + "grad_norm": 0.06834981590509415, + "learning_rate": 3.687334915965096e-05, + "loss": 0.0475, + "num_input_tokens_seen": 89222880, + "step": 73310 + }, + { + "epoch": 8.1651631584809, + "grad_norm": 0.3439474105834961, + "learning_rate": 3.6871210882235665e-05, + "loss": 0.0342, + "num_input_tokens_seen": 89228960, + "step": 73315 + }, + { + "epoch": 8.165720013364517, + "grad_norm": 0.7961410284042358, + "learning_rate": 3.686907249268728e-05, + "loss": 0.0463, + "num_input_tokens_seen": 89234688, + "step": 73320 + }, + { + "epoch": 8.166276868248135, + "grad_norm": 0.000493681407533586, + "learning_rate": 3.6866933991025995e-05, + "loss": 0.0302, + "num_input_tokens_seen": 89240864, + "step": 73325 + }, + { + "epoch": 8.166833723131752, + "grad_norm": 0.0455034039914608, + "learning_rate": 3.686479537727202e-05, + "loss": 0.0849, + "num_input_tokens_seen": 89247232, + "step": 73330 + }, + { + "epoch": 8.16739057801537, + "grad_norm": 0.002568070776760578, + "learning_rate": 3.686265665144554e-05, + "loss": 0.0629, + "num_input_tokens_seen": 89253248, + "step": 73335 + }, + { + "epoch": 8.167947432898986, + "grad_norm": 0.12806867063045502, + "learning_rate": 3.686051781356676e-05, + "loss": 0.054, + "num_input_tokens_seen": 89259520, + "step": 73340 + }, + { + "epoch": 8.168504287782604, + "grad_norm": 1.9106799364089966, + "learning_rate": 3.6858378863655893e-05, + "loss": 0.0874, + "num_input_tokens_seen": 89265472, + "step": 73345 + }, + { + "epoch": 8.169061142666221, + "grad_norm": 0.7657883763313293, + "learning_rate": 3.685623980173313e-05, + "loss": 0.025, + "num_input_tokens_seen": 89271264, + "step": 73350 + }, + { + "epoch": 8.169617997549839, + "grad_norm": 0.0979410707950592, + "learning_rate": 3.685410062781869e-05, + "loss": 0.0632, + "num_input_tokens_seen": 89277504, + "step": 73355 + }, + { + "epoch": 8.170174852433457, + "grad_norm": 0.12520889937877655, + "learning_rate": 3.685196134193277e-05, + "loss": 0.0627, + "num_input_tokens_seen": 89283072, + "step": 73360 + }, + { + "epoch": 8.170731707317072, + "grad_norm": 0.08794023841619492, + "learning_rate": 3.684982194409558e-05, + "loss": 0.0295, + "num_input_tokens_seen": 89289088, + "step": 73365 + }, + { + "epoch": 8.17128856220069, + "grad_norm": 0.2976439595222473, + "learning_rate": 3.684768243432733e-05, + "loss": 0.0277, + "num_input_tokens_seen": 89295648, + "step": 73370 + }, + { + "epoch": 8.171845417084308, + "grad_norm": 1.2362215518951416, + "learning_rate": 3.684554281264822e-05, + "loss": 0.1843, + "num_input_tokens_seen": 89301632, + "step": 73375 + }, + { + "epoch": 8.172402271967925, + "grad_norm": 0.006469217129051685, + "learning_rate": 3.684340307907847e-05, + "loss": 0.1615, + "num_input_tokens_seen": 89307808, + "step": 73380 + }, + { + "epoch": 8.172959126851543, + "grad_norm": 1.008857011795044, + "learning_rate": 3.68412632336383e-05, + "loss": 0.1261, + "num_input_tokens_seen": 89313440, + "step": 73385 + }, + { + "epoch": 8.173515981735159, + "grad_norm": 0.7942050099372864, + "learning_rate": 3.6839123276347895e-05, + "loss": 0.0251, + "num_input_tokens_seen": 89319648, + "step": 73390 + }, + { + "epoch": 8.174072836618777, + "grad_norm": 0.02707832306623459, + "learning_rate": 3.68369832072275e-05, + "loss": 0.0646, + "num_input_tokens_seen": 89325792, + "step": 73395 + }, + { + "epoch": 8.174629691502394, + "grad_norm": 0.5295155048370361, + "learning_rate": 3.68348430262973e-05, + "loss": 0.0416, + "num_input_tokens_seen": 89332192, + "step": 73400 + }, + { + "epoch": 8.175186546386012, + "grad_norm": 0.02723374031484127, + "learning_rate": 3.683270273357754e-05, + "loss": 0.0396, + "num_input_tokens_seen": 89338368, + "step": 73405 + }, + { + "epoch": 8.17574340126963, + "grad_norm": 0.23957185447216034, + "learning_rate": 3.6830562329088416e-05, + "loss": 0.0821, + "num_input_tokens_seen": 89344448, + "step": 73410 + }, + { + "epoch": 8.176300256153246, + "grad_norm": 0.011316702701151371, + "learning_rate": 3.682842181285015e-05, + "loss": 0.0327, + "num_input_tokens_seen": 89350688, + "step": 73415 + }, + { + "epoch": 8.176857111036863, + "grad_norm": 2.1075363159179688, + "learning_rate": 3.6826281184882964e-05, + "loss": 0.1705, + "num_input_tokens_seen": 89356480, + "step": 73420 + }, + { + "epoch": 8.177413965920481, + "grad_norm": 0.5314512252807617, + "learning_rate": 3.682414044520708e-05, + "loss": 0.1323, + "num_input_tokens_seen": 89362496, + "step": 73425 + }, + { + "epoch": 8.177970820804099, + "grad_norm": 0.6109722852706909, + "learning_rate": 3.6821999593842715e-05, + "loss": 0.0725, + "num_input_tokens_seen": 89368320, + "step": 73430 + }, + { + "epoch": 8.178527675687716, + "grad_norm": 0.4425332844257355, + "learning_rate": 3.6819858630810096e-05, + "loss": 0.0798, + "num_input_tokens_seen": 89373856, + "step": 73435 + }, + { + "epoch": 8.179084530571334, + "grad_norm": 1.0193803310394287, + "learning_rate": 3.681771755612944e-05, + "loss": 0.083, + "num_input_tokens_seen": 89379968, + "step": 73440 + }, + { + "epoch": 8.17964138545495, + "grad_norm": 0.7744936347007751, + "learning_rate": 3.681557636982097e-05, + "loss": 0.1147, + "num_input_tokens_seen": 89386080, + "step": 73445 + }, + { + "epoch": 8.180198240338568, + "grad_norm": 0.6769410967826843, + "learning_rate": 3.681343507190491e-05, + "loss": 0.0834, + "num_input_tokens_seen": 89391872, + "step": 73450 + }, + { + "epoch": 8.180755095222185, + "grad_norm": 1.1563855409622192, + "learning_rate": 3.68112936624015e-05, + "loss": 0.0366, + "num_input_tokens_seen": 89398080, + "step": 73455 + }, + { + "epoch": 8.181311950105803, + "grad_norm": 0.20118261873722076, + "learning_rate": 3.680915214133096e-05, + "loss": 0.037, + "num_input_tokens_seen": 89404224, + "step": 73460 + }, + { + "epoch": 8.18186880498942, + "grad_norm": 0.07726327329874039, + "learning_rate": 3.680701050871351e-05, + "loss": 0.0274, + "num_input_tokens_seen": 89410016, + "step": 73465 + }, + { + "epoch": 8.182425659873036, + "grad_norm": 0.03350577875971794, + "learning_rate": 3.680486876456939e-05, + "loss": 0.0384, + "num_input_tokens_seen": 89416608, + "step": 73470 + }, + { + "epoch": 8.182982514756654, + "grad_norm": 0.22440959513187408, + "learning_rate": 3.6802726908918825e-05, + "loss": 0.0914, + "num_input_tokens_seen": 89422720, + "step": 73475 + }, + { + "epoch": 8.183539369640272, + "grad_norm": 0.7839179635047913, + "learning_rate": 3.680058494178205e-05, + "loss": 0.1671, + "num_input_tokens_seen": 89428672, + "step": 73480 + }, + { + "epoch": 8.18409622452389, + "grad_norm": 0.00022930718841962516, + "learning_rate": 3.67984428631793e-05, + "loss": 0.0196, + "num_input_tokens_seen": 89434784, + "step": 73485 + }, + { + "epoch": 8.184653079407507, + "grad_norm": 1.214853286743164, + "learning_rate": 3.6796300673130794e-05, + "loss": 0.0819, + "num_input_tokens_seen": 89440800, + "step": 73490 + }, + { + "epoch": 8.185209934291123, + "grad_norm": 0.0031386532355099916, + "learning_rate": 3.6794158371656786e-05, + "loss": 0.1169, + "num_input_tokens_seen": 89446848, + "step": 73495 + }, + { + "epoch": 8.18576678917474, + "grad_norm": 0.8074517846107483, + "learning_rate": 3.6792015958777495e-05, + "loss": 0.0362, + "num_input_tokens_seen": 89452864, + "step": 73500 + }, + { + "epoch": 8.186323644058358, + "grad_norm": 0.13806672394275665, + "learning_rate": 3.6789873434513175e-05, + "loss": 0.0292, + "num_input_tokens_seen": 89459008, + "step": 73505 + }, + { + "epoch": 8.186880498941976, + "grad_norm": 0.6597846746444702, + "learning_rate": 3.6787730798884046e-05, + "loss": 0.0349, + "num_input_tokens_seen": 89464800, + "step": 73510 + }, + { + "epoch": 8.187437353825594, + "grad_norm": 0.19003069400787354, + "learning_rate": 3.6785588051910356e-05, + "loss": 0.0393, + "num_input_tokens_seen": 89471136, + "step": 73515 + }, + { + "epoch": 8.18799420870921, + "grad_norm": 0.01833200827240944, + "learning_rate": 3.6783445193612346e-05, + "loss": 0.0424, + "num_input_tokens_seen": 89477216, + "step": 73520 + }, + { + "epoch": 8.188551063592827, + "grad_norm": 0.7286345958709717, + "learning_rate": 3.6781302224010255e-05, + "loss": 0.1123, + "num_input_tokens_seen": 89483328, + "step": 73525 + }, + { + "epoch": 8.189107918476445, + "grad_norm": 0.000154473542352207, + "learning_rate": 3.677915914312433e-05, + "loss": 0.0062, + "num_input_tokens_seen": 89489472, + "step": 73530 + }, + { + "epoch": 8.189664773360063, + "grad_norm": 0.1493683010339737, + "learning_rate": 3.6777015950974805e-05, + "loss": 0.021, + "num_input_tokens_seen": 89495776, + "step": 73535 + }, + { + "epoch": 8.19022162824368, + "grad_norm": 0.2718552052974701, + "learning_rate": 3.677487264758193e-05, + "loss": 0.0139, + "num_input_tokens_seen": 89501600, + "step": 73540 + }, + { + "epoch": 8.190778483127296, + "grad_norm": 0.028527889400720596, + "learning_rate": 3.677272923296595e-05, + "loss": 0.1592, + "num_input_tokens_seen": 89507776, + "step": 73545 + }, + { + "epoch": 8.191335338010914, + "grad_norm": 0.8660732507705688, + "learning_rate": 3.677058570714711e-05, + "loss": 0.0624, + "num_input_tokens_seen": 89513920, + "step": 73550 + }, + { + "epoch": 8.191892192894532, + "grad_norm": 0.1697416603565216, + "learning_rate": 3.676844207014566e-05, + "loss": 0.007, + "num_input_tokens_seen": 89519936, + "step": 73555 + }, + { + "epoch": 8.19244904777815, + "grad_norm": 0.0005210064700804651, + "learning_rate": 3.6766298321981837e-05, + "loss": 0.1203, + "num_input_tokens_seen": 89526304, + "step": 73560 + }, + { + "epoch": 8.193005902661767, + "grad_norm": 0.055804092437028885, + "learning_rate": 3.67641544626759e-05, + "loss": 0.0175, + "num_input_tokens_seen": 89532000, + "step": 73565 + }, + { + "epoch": 8.193562757545383, + "grad_norm": 0.8586171269416809, + "learning_rate": 3.6762010492248114e-05, + "loss": 0.0717, + "num_input_tokens_seen": 89538240, + "step": 73570 + }, + { + "epoch": 8.194119612429, + "grad_norm": 1.324636459350586, + "learning_rate": 3.67598664107187e-05, + "loss": 0.0937, + "num_input_tokens_seen": 89544352, + "step": 73575 + }, + { + "epoch": 8.194676467312618, + "grad_norm": 0.721123456954956, + "learning_rate": 3.675772221810793e-05, + "loss": 0.0301, + "num_input_tokens_seen": 89550624, + "step": 73580 + }, + { + "epoch": 8.195233322196236, + "grad_norm": 0.6402990221977234, + "learning_rate": 3.6755577914436056e-05, + "loss": 0.073, + "num_input_tokens_seen": 89556480, + "step": 73585 + }, + { + "epoch": 8.195790177079854, + "grad_norm": 0.10976119339466095, + "learning_rate": 3.675343349972333e-05, + "loss": 0.0345, + "num_input_tokens_seen": 89562784, + "step": 73590 + }, + { + "epoch": 8.19634703196347, + "grad_norm": 1.4930917024612427, + "learning_rate": 3.675128897399001e-05, + "loss": 0.1669, + "num_input_tokens_seen": 89568896, + "step": 73595 + }, + { + "epoch": 8.196903886847087, + "grad_norm": 0.1912301480770111, + "learning_rate": 3.674914433725635e-05, + "loss": 0.0284, + "num_input_tokens_seen": 89575072, + "step": 73600 + }, + { + "epoch": 8.197460741730705, + "grad_norm": 0.7665334343910217, + "learning_rate": 3.67469995895426e-05, + "loss": 0.0141, + "num_input_tokens_seen": 89581600, + "step": 73605 + }, + { + "epoch": 8.198017596614322, + "grad_norm": 0.07785314321517944, + "learning_rate": 3.6744854730869035e-05, + "loss": 0.0201, + "num_input_tokens_seen": 89587936, + "step": 73610 + }, + { + "epoch": 8.19857445149794, + "grad_norm": 0.47735172510147095, + "learning_rate": 3.67427097612559e-05, + "loss": 0.1032, + "num_input_tokens_seen": 89594080, + "step": 73615 + }, + { + "epoch": 8.199131306381556, + "grad_norm": 0.034464020282030106, + "learning_rate": 3.6740564680723476e-05, + "loss": 0.0203, + "num_input_tokens_seen": 89600320, + "step": 73620 + }, + { + "epoch": 8.199688161265174, + "grad_norm": 0.5514374375343323, + "learning_rate": 3.6738419489292e-05, + "loss": 0.0476, + "num_input_tokens_seen": 89606784, + "step": 73625 + }, + { + "epoch": 8.200245016148791, + "grad_norm": 0.4823121130466461, + "learning_rate": 3.673627418698175e-05, + "loss": 0.0408, + "num_input_tokens_seen": 89613120, + "step": 73630 + }, + { + "epoch": 8.200801871032409, + "grad_norm": 0.4998549818992615, + "learning_rate": 3.6734128773812995e-05, + "loss": 0.0131, + "num_input_tokens_seen": 89619072, + "step": 73635 + }, + { + "epoch": 8.201358725916027, + "grad_norm": 0.04096595197916031, + "learning_rate": 3.673198324980599e-05, + "loss": 0.072, + "num_input_tokens_seen": 89625184, + "step": 73640 + }, + { + "epoch": 8.201915580799644, + "grad_norm": 2.20393967628479, + "learning_rate": 3.6729837614981e-05, + "loss": 0.0414, + "num_input_tokens_seen": 89631520, + "step": 73645 + }, + { + "epoch": 8.20247243568326, + "grad_norm": 1.2823103666305542, + "learning_rate": 3.6727691869358296e-05, + "loss": 0.0542, + "num_input_tokens_seen": 89637792, + "step": 73650 + }, + { + "epoch": 8.203029290566878, + "grad_norm": 0.21860313415527344, + "learning_rate": 3.672554601295814e-05, + "loss": 0.0732, + "num_input_tokens_seen": 89644032, + "step": 73655 + }, + { + "epoch": 8.203586145450496, + "grad_norm": 1.9829163551330566, + "learning_rate": 3.6723400045800814e-05, + "loss": 0.1027, + "num_input_tokens_seen": 89649728, + "step": 73660 + }, + { + "epoch": 8.204143000334113, + "grad_norm": 0.08221538364887238, + "learning_rate": 3.6721253967906583e-05, + "loss": 0.0181, + "num_input_tokens_seen": 89656032, + "step": 73665 + }, + { + "epoch": 8.204699855217731, + "grad_norm": 0.009072856977581978, + "learning_rate": 3.671910777929572e-05, + "loss": 0.0592, + "num_input_tokens_seen": 89662336, + "step": 73670 + }, + { + "epoch": 8.205256710101347, + "grad_norm": 0.808716356754303, + "learning_rate": 3.6716961479988486e-05, + "loss": 0.0771, + "num_input_tokens_seen": 89668512, + "step": 73675 + }, + { + "epoch": 8.205813564984965, + "grad_norm": 0.05132870376110077, + "learning_rate": 3.6714815070005176e-05, + "loss": 0.0926, + "num_input_tokens_seen": 89674656, + "step": 73680 + }, + { + "epoch": 8.206370419868582, + "grad_norm": 0.1414850503206253, + "learning_rate": 3.6712668549366045e-05, + "loss": 0.0653, + "num_input_tokens_seen": 89680640, + "step": 73685 + }, + { + "epoch": 8.2069272747522, + "grad_norm": 0.28572848439216614, + "learning_rate": 3.6710521918091366e-05, + "loss": 0.0381, + "num_input_tokens_seen": 89686816, + "step": 73690 + }, + { + "epoch": 8.207484129635818, + "grad_norm": 0.06104809418320656, + "learning_rate": 3.670837517620144e-05, + "loss": 0.1504, + "num_input_tokens_seen": 89692800, + "step": 73695 + }, + { + "epoch": 8.208040984519434, + "grad_norm": 1.0444685220718384, + "learning_rate": 3.6706228323716525e-05, + "loss": 0.0639, + "num_input_tokens_seen": 89698944, + "step": 73700 + }, + { + "epoch": 8.208597839403051, + "grad_norm": 0.08254240453243256, + "learning_rate": 3.6704081360656906e-05, + "loss": 0.0067, + "num_input_tokens_seen": 89705568, + "step": 73705 + }, + { + "epoch": 8.209154694286669, + "grad_norm": 0.18199199438095093, + "learning_rate": 3.670193428704285e-05, + "loss": 0.0672, + "num_input_tokens_seen": 89711648, + "step": 73710 + }, + { + "epoch": 8.209711549170287, + "grad_norm": 0.04787700995802879, + "learning_rate": 3.6699787102894664e-05, + "loss": 0.0284, + "num_input_tokens_seen": 89717792, + "step": 73715 + }, + { + "epoch": 8.210268404053904, + "grad_norm": 0.016228651627898216, + "learning_rate": 3.669763980823261e-05, + "loss": 0.0582, + "num_input_tokens_seen": 89723840, + "step": 73720 + }, + { + "epoch": 8.21082525893752, + "grad_norm": 0.006944859866052866, + "learning_rate": 3.669549240307698e-05, + "loss": 0.0367, + "num_input_tokens_seen": 89729792, + "step": 73725 + }, + { + "epoch": 8.211382113821138, + "grad_norm": 0.233016699552536, + "learning_rate": 3.6693344887448044e-05, + "loss": 0.0556, + "num_input_tokens_seen": 89736224, + "step": 73730 + }, + { + "epoch": 8.211938968704755, + "grad_norm": 0.336127907037735, + "learning_rate": 3.669119726136611e-05, + "loss": 0.0212, + "num_input_tokens_seen": 89742336, + "step": 73735 + }, + { + "epoch": 8.212495823588373, + "grad_norm": 0.0039695980958640575, + "learning_rate": 3.668904952485144e-05, + "loss": 0.0066, + "num_input_tokens_seen": 89748352, + "step": 73740 + }, + { + "epoch": 8.21305267847199, + "grad_norm": 0.00793389044702053, + "learning_rate": 3.6686901677924336e-05, + "loss": 0.1084, + "num_input_tokens_seen": 89754432, + "step": 73745 + }, + { + "epoch": 8.213609533355607, + "grad_norm": 0.04516898840665817, + "learning_rate": 3.6684753720605084e-05, + "loss": 0.0382, + "num_input_tokens_seen": 89760704, + "step": 73750 + }, + { + "epoch": 8.214166388239224, + "grad_norm": 0.14292363822460175, + "learning_rate": 3.668260565291396e-05, + "loss": 0.0361, + "num_input_tokens_seen": 89766944, + "step": 73755 + }, + { + "epoch": 8.214723243122842, + "grad_norm": 0.8513925075531006, + "learning_rate": 3.668045747487128e-05, + "loss": 0.1352, + "num_input_tokens_seen": 89772992, + "step": 73760 + }, + { + "epoch": 8.21528009800646, + "grad_norm": 0.4401080310344696, + "learning_rate": 3.667830918649732e-05, + "loss": 0.0392, + "num_input_tokens_seen": 89779360, + "step": 73765 + }, + { + "epoch": 8.215836952890077, + "grad_norm": 0.0011052764020860195, + "learning_rate": 3.6676160787812365e-05, + "loss": 0.0299, + "num_input_tokens_seen": 89785376, + "step": 73770 + }, + { + "epoch": 8.216393807773693, + "grad_norm": 0.3020303249359131, + "learning_rate": 3.667401227883672e-05, + "loss": 0.0132, + "num_input_tokens_seen": 89791328, + "step": 73775 + }, + { + "epoch": 8.216950662657311, + "grad_norm": 0.25721052289009094, + "learning_rate": 3.667186365959068e-05, + "loss": 0.0987, + "num_input_tokens_seen": 89796992, + "step": 73780 + }, + { + "epoch": 8.217507517540929, + "grad_norm": 1.8976123332977295, + "learning_rate": 3.666971493009453e-05, + "loss": 0.0859, + "num_input_tokens_seen": 89803232, + "step": 73785 + }, + { + "epoch": 8.218064372424546, + "grad_norm": 0.037253573536872864, + "learning_rate": 3.666756609036858e-05, + "loss": 0.0087, + "num_input_tokens_seen": 89809408, + "step": 73790 + }, + { + "epoch": 8.218621227308164, + "grad_norm": 0.007295260671526194, + "learning_rate": 3.666541714043311e-05, + "loss": 0.0303, + "num_input_tokens_seen": 89815712, + "step": 73795 + }, + { + "epoch": 8.219178082191782, + "grad_norm": 0.0034369230270385742, + "learning_rate": 3.6663268080308445e-05, + "loss": 0.0022, + "num_input_tokens_seen": 89821920, + "step": 73800 + }, + { + "epoch": 8.219734937075398, + "grad_norm": 0.4680481553077698, + "learning_rate": 3.666111891001485e-05, + "loss": 0.2284, + "num_input_tokens_seen": 89828224, + "step": 73805 + }, + { + "epoch": 8.220291791959015, + "grad_norm": 2.101048231124878, + "learning_rate": 3.665896962957266e-05, + "loss": 0.1122, + "num_input_tokens_seen": 89834368, + "step": 73810 + }, + { + "epoch": 8.220848646842633, + "grad_norm": 1.7283748388290405, + "learning_rate": 3.6656820239002156e-05, + "loss": 0.0882, + "num_input_tokens_seen": 89840416, + "step": 73815 + }, + { + "epoch": 8.22140550172625, + "grad_norm": 0.13175761699676514, + "learning_rate": 3.665467073832364e-05, + "loss": 0.048, + "num_input_tokens_seen": 89846784, + "step": 73820 + }, + { + "epoch": 8.221962356609868, + "grad_norm": 0.005106819793581963, + "learning_rate": 3.665252112755743e-05, + "loss": 0.0514, + "num_input_tokens_seen": 89853312, + "step": 73825 + }, + { + "epoch": 8.222519211493484, + "grad_norm": 0.015741020441055298, + "learning_rate": 3.665037140672381e-05, + "loss": 0.195, + "num_input_tokens_seen": 89859424, + "step": 73830 + }, + { + "epoch": 8.223076066377102, + "grad_norm": 1.0416465997695923, + "learning_rate": 3.664822157584311e-05, + "loss": 0.0714, + "num_input_tokens_seen": 89865600, + "step": 73835 + }, + { + "epoch": 8.22363292126072, + "grad_norm": 0.20345455408096313, + "learning_rate": 3.6646071634935615e-05, + "loss": 0.197, + "num_input_tokens_seen": 89871648, + "step": 73840 + }, + { + "epoch": 8.224189776144337, + "grad_norm": 0.07999878376722336, + "learning_rate": 3.664392158402165e-05, + "loss": 0.0486, + "num_input_tokens_seen": 89877600, + "step": 73845 + }, + { + "epoch": 8.224746631027955, + "grad_norm": 0.24133720993995667, + "learning_rate": 3.664177142312151e-05, + "loss": 0.0186, + "num_input_tokens_seen": 89883904, + "step": 73850 + }, + { + "epoch": 8.22530348591157, + "grad_norm": 3.9480512142181396, + "learning_rate": 3.663962115225552e-05, + "loss": 0.1333, + "num_input_tokens_seen": 89890144, + "step": 73855 + }, + { + "epoch": 8.225860340795188, + "grad_norm": 0.04277068004012108, + "learning_rate": 3.663747077144398e-05, + "loss": 0.0076, + "num_input_tokens_seen": 89896544, + "step": 73860 + }, + { + "epoch": 8.226417195678806, + "grad_norm": 0.2529223561286926, + "learning_rate": 3.66353202807072e-05, + "loss": 0.0238, + "num_input_tokens_seen": 89902592, + "step": 73865 + }, + { + "epoch": 8.226974050562424, + "grad_norm": 0.001429807161912322, + "learning_rate": 3.66331696800655e-05, + "loss": 0.0174, + "num_input_tokens_seen": 89908640, + "step": 73870 + }, + { + "epoch": 8.227530905446041, + "grad_norm": 0.2772093117237091, + "learning_rate": 3.663101896953919e-05, + "loss": 0.0195, + "num_input_tokens_seen": 89914432, + "step": 73875 + }, + { + "epoch": 8.228087760329657, + "grad_norm": 0.001711671007797122, + "learning_rate": 3.6628868149148594e-05, + "loss": 0.0083, + "num_input_tokens_seen": 89920768, + "step": 73880 + }, + { + "epoch": 8.228644615213275, + "grad_norm": 0.010239364579319954, + "learning_rate": 3.662671721891402e-05, + "loss": 0.067, + "num_input_tokens_seen": 89926976, + "step": 73885 + }, + { + "epoch": 8.229201470096893, + "grad_norm": 1.267142415046692, + "learning_rate": 3.662456617885578e-05, + "loss": 0.0304, + "num_input_tokens_seen": 89932992, + "step": 73890 + }, + { + "epoch": 8.22975832498051, + "grad_norm": 0.00037604442331939936, + "learning_rate": 3.662241502899421e-05, + "loss": 0.0023, + "num_input_tokens_seen": 89939200, + "step": 73895 + }, + { + "epoch": 8.230315179864128, + "grad_norm": 1.5893173217773438, + "learning_rate": 3.662026376934961e-05, + "loss": 0.127, + "num_input_tokens_seen": 89945152, + "step": 73900 + }, + { + "epoch": 8.230872034747744, + "grad_norm": 0.015961287543177605, + "learning_rate": 3.6618112399942314e-05, + "loss": 0.0268, + "num_input_tokens_seen": 89951520, + "step": 73905 + }, + { + "epoch": 8.231428889631362, + "grad_norm": 0.0639922022819519, + "learning_rate": 3.661596092079264e-05, + "loss": 0.0468, + "num_input_tokens_seen": 89957856, + "step": 73910 + }, + { + "epoch": 8.23198574451498, + "grad_norm": 0.19731932878494263, + "learning_rate": 3.6613809331920895e-05, + "loss": 0.0356, + "num_input_tokens_seen": 89964000, + "step": 73915 + }, + { + "epoch": 8.232542599398597, + "grad_norm": 0.6306784152984619, + "learning_rate": 3.661165763334743e-05, + "loss": 0.0572, + "num_input_tokens_seen": 89970144, + "step": 73920 + }, + { + "epoch": 8.233099454282215, + "grad_norm": 0.06168939545750618, + "learning_rate": 3.660950582509255e-05, + "loss": 0.0379, + "num_input_tokens_seen": 89976096, + "step": 73925 + }, + { + "epoch": 8.23365630916583, + "grad_norm": 0.02271895669400692, + "learning_rate": 3.660735390717658e-05, + "loss": 0.0678, + "num_input_tokens_seen": 89982336, + "step": 73930 + }, + { + "epoch": 8.234213164049448, + "grad_norm": 0.00034840975422412157, + "learning_rate": 3.660520187961986e-05, + "loss": 0.0112, + "num_input_tokens_seen": 89988416, + "step": 73935 + }, + { + "epoch": 8.234770018933066, + "grad_norm": 0.10278857499361038, + "learning_rate": 3.660304974244271e-05, + "loss": 0.0556, + "num_input_tokens_seen": 89994176, + "step": 73940 + }, + { + "epoch": 8.235326873816684, + "grad_norm": 0.00090075156185776, + "learning_rate": 3.6600897495665455e-05, + "loss": 0.0457, + "num_input_tokens_seen": 90000032, + "step": 73945 + }, + { + "epoch": 8.235883728700301, + "grad_norm": 1.5693247318267822, + "learning_rate": 3.6598745139308435e-05, + "loss": 0.1614, + "num_input_tokens_seen": 90005376, + "step": 73950 + }, + { + "epoch": 8.236440583583917, + "grad_norm": 0.009931412525475025, + "learning_rate": 3.659659267339197e-05, + "loss": 0.0889, + "num_input_tokens_seen": 90011424, + "step": 73955 + }, + { + "epoch": 8.236997438467535, + "grad_norm": 0.2481876015663147, + "learning_rate": 3.6594440097936395e-05, + "loss": 0.0384, + "num_input_tokens_seen": 90017280, + "step": 73960 + }, + { + "epoch": 8.237554293351153, + "grad_norm": 0.05398247018456459, + "learning_rate": 3.6592287412962046e-05, + "loss": 0.1016, + "num_input_tokens_seen": 90023488, + "step": 73965 + }, + { + "epoch": 8.23811114823477, + "grad_norm": 0.08740118145942688, + "learning_rate": 3.6590134618489255e-05, + "loss": 0.0855, + "num_input_tokens_seen": 90029216, + "step": 73970 + }, + { + "epoch": 8.238668003118388, + "grad_norm": 0.22762121260166168, + "learning_rate": 3.658798171453836e-05, + "loss": 0.1058, + "num_input_tokens_seen": 90035360, + "step": 73975 + }, + { + "epoch": 8.239224858002006, + "grad_norm": 0.00038719826261512935, + "learning_rate": 3.658582870112969e-05, + "loss": 0.1212, + "num_input_tokens_seen": 90041440, + "step": 73980 + }, + { + "epoch": 8.239781712885621, + "grad_norm": 0.006078002974390984, + "learning_rate": 3.658367557828358e-05, + "loss": 0.0028, + "num_input_tokens_seen": 90047712, + "step": 73985 + }, + { + "epoch": 8.24033856776924, + "grad_norm": 2.079206943511963, + "learning_rate": 3.658152234602038e-05, + "loss": 0.187, + "num_input_tokens_seen": 90053856, + "step": 73990 + }, + { + "epoch": 8.240895422652857, + "grad_norm": 0.8237510919570923, + "learning_rate": 3.6579369004360417e-05, + "loss": 0.0336, + "num_input_tokens_seen": 90059776, + "step": 73995 + }, + { + "epoch": 8.241452277536474, + "grad_norm": 1.732745885848999, + "learning_rate": 3.657721555332404e-05, + "loss": 0.0877, + "num_input_tokens_seen": 90065792, + "step": 74000 + }, + { + "epoch": 8.242009132420092, + "grad_norm": 0.20039048790931702, + "learning_rate": 3.657506199293159e-05, + "loss": 0.1526, + "num_input_tokens_seen": 90071552, + "step": 74005 + }, + { + "epoch": 8.242565987303708, + "grad_norm": 0.12266326695680618, + "learning_rate": 3.6572908323203404e-05, + "loss": 0.0384, + "num_input_tokens_seen": 90077568, + "step": 74010 + }, + { + "epoch": 8.243122842187326, + "grad_norm": 0.012613797560334206, + "learning_rate": 3.657075454415983e-05, + "loss": 0.0383, + "num_input_tokens_seen": 90083392, + "step": 74015 + }, + { + "epoch": 8.243679697070943, + "grad_norm": 0.21210570633411407, + "learning_rate": 3.65686006558212e-05, + "loss": 0.1185, + "num_input_tokens_seen": 90089952, + "step": 74020 + }, + { + "epoch": 8.244236551954561, + "grad_norm": 0.5669306516647339, + "learning_rate": 3.656644665820788e-05, + "loss": 0.0575, + "num_input_tokens_seen": 90095616, + "step": 74025 + }, + { + "epoch": 8.244793406838179, + "grad_norm": 0.027019113302230835, + "learning_rate": 3.656429255134019e-05, + "loss": 0.0104, + "num_input_tokens_seen": 90101984, + "step": 74030 + }, + { + "epoch": 8.245350261721795, + "grad_norm": 0.42648079991340637, + "learning_rate": 3.656213833523851e-05, + "loss": 0.0552, + "num_input_tokens_seen": 90108064, + "step": 74035 + }, + { + "epoch": 8.245907116605412, + "grad_norm": 0.5144851207733154, + "learning_rate": 3.655998400992315e-05, + "loss": 0.0542, + "num_input_tokens_seen": 90114240, + "step": 74040 + }, + { + "epoch": 8.24646397148903, + "grad_norm": 0.1626606583595276, + "learning_rate": 3.6557829575414496e-05, + "loss": 0.0585, + "num_input_tokens_seen": 90120192, + "step": 74045 + }, + { + "epoch": 8.247020826372648, + "grad_norm": 0.41371428966522217, + "learning_rate": 3.6555675031732874e-05, + "loss": 0.0398, + "num_input_tokens_seen": 90126496, + "step": 74050 + }, + { + "epoch": 8.247577681256265, + "grad_norm": 0.9836983680725098, + "learning_rate": 3.6553520378898643e-05, + "loss": 0.0299, + "num_input_tokens_seen": 90132160, + "step": 74055 + }, + { + "epoch": 8.248134536139881, + "grad_norm": 0.31570449471473694, + "learning_rate": 3.655136561693215e-05, + "loss": 0.0632, + "num_input_tokens_seen": 90138272, + "step": 74060 + }, + { + "epoch": 8.248691391023499, + "grad_norm": 0.3935639262199402, + "learning_rate": 3.654921074585377e-05, + "loss": 0.0125, + "num_input_tokens_seen": 90144352, + "step": 74065 + }, + { + "epoch": 8.249248245907117, + "grad_norm": 0.7566217184066772, + "learning_rate": 3.6547055765683826e-05, + "loss": 0.1517, + "num_input_tokens_seen": 90150432, + "step": 74070 + }, + { + "epoch": 8.249805100790734, + "grad_norm": 0.005879857577383518, + "learning_rate": 3.65449006764427e-05, + "loss": 0.0956, + "num_input_tokens_seen": 90156992, + "step": 74075 + }, + { + "epoch": 8.250361955674352, + "grad_norm": 0.0012999987229704857, + "learning_rate": 3.6542745478150724e-05, + "loss": 0.0125, + "num_input_tokens_seen": 90163136, + "step": 74080 + }, + { + "epoch": 8.250918810557968, + "grad_norm": 0.2126636505126953, + "learning_rate": 3.654059017082828e-05, + "loss": 0.0694, + "num_input_tokens_seen": 90169120, + "step": 74085 + }, + { + "epoch": 8.251475665441586, + "grad_norm": 0.20724952220916748, + "learning_rate": 3.653843475449571e-05, + "loss": 0.0489, + "num_input_tokens_seen": 90174720, + "step": 74090 + }, + { + "epoch": 8.252032520325203, + "grad_norm": 0.3011403977870941, + "learning_rate": 3.6536279229173384e-05, + "loss": 0.0179, + "num_input_tokens_seen": 90181024, + "step": 74095 + }, + { + "epoch": 8.25258937520882, + "grad_norm": 2.661172866821289, + "learning_rate": 3.653412359488165e-05, + "loss": 0.0354, + "num_input_tokens_seen": 90186880, + "step": 74100 + }, + { + "epoch": 8.253146230092439, + "grad_norm": 0.5970048308372498, + "learning_rate": 3.6531967851640886e-05, + "loss": 0.0689, + "num_input_tokens_seen": 90192896, + "step": 74105 + }, + { + "epoch": 8.253703084976054, + "grad_norm": 0.00038788170786574483, + "learning_rate": 3.652981199947145e-05, + "loss": 0.0154, + "num_input_tokens_seen": 90199040, + "step": 74110 + }, + { + "epoch": 8.254259939859672, + "grad_norm": 0.8555520176887512, + "learning_rate": 3.652765603839369e-05, + "loss": 0.0899, + "num_input_tokens_seen": 90204992, + "step": 74115 + }, + { + "epoch": 8.25481679474329, + "grad_norm": 0.06101090833544731, + "learning_rate": 3.652549996842799e-05, + "loss": 0.0466, + "num_input_tokens_seen": 90210976, + "step": 74120 + }, + { + "epoch": 8.255373649626907, + "grad_norm": 1.0589733123779297, + "learning_rate": 3.652334378959471e-05, + "loss": 0.0421, + "num_input_tokens_seen": 90217152, + "step": 74125 + }, + { + "epoch": 8.255930504510525, + "grad_norm": 1.9562219381332397, + "learning_rate": 3.6521187501914214e-05, + "loss": 0.1168, + "num_input_tokens_seen": 90223104, + "step": 74130 + }, + { + "epoch": 8.256487359394143, + "grad_norm": 0.516988217830658, + "learning_rate": 3.651903110540687e-05, + "loss": 0.075, + "num_input_tokens_seen": 90228992, + "step": 74135 + }, + { + "epoch": 8.257044214277759, + "grad_norm": 0.6923424601554871, + "learning_rate": 3.6516874600093046e-05, + "loss": 0.1158, + "num_input_tokens_seen": 90234304, + "step": 74140 + }, + { + "epoch": 8.257601069161376, + "grad_norm": 1.324487566947937, + "learning_rate": 3.651471798599312e-05, + "loss": 0.0979, + "num_input_tokens_seen": 90240000, + "step": 74145 + }, + { + "epoch": 8.258157924044994, + "grad_norm": 0.24978622794151306, + "learning_rate": 3.651256126312745e-05, + "loss": 0.0401, + "num_input_tokens_seen": 90246048, + "step": 74150 + }, + { + "epoch": 8.258714778928612, + "grad_norm": 0.008241894654929638, + "learning_rate": 3.651040443151642e-05, + "loss": 0.0407, + "num_input_tokens_seen": 90252480, + "step": 74155 + }, + { + "epoch": 8.25927163381223, + "grad_norm": 0.00041427434189245105, + "learning_rate": 3.6508247491180405e-05, + "loss": 0.0291, + "num_input_tokens_seen": 90257920, + "step": 74160 + }, + { + "epoch": 8.259828488695845, + "grad_norm": 0.5985589623451233, + "learning_rate": 3.650609044213976e-05, + "loss": 0.0297, + "num_input_tokens_seen": 90264000, + "step": 74165 + }, + { + "epoch": 8.260385343579463, + "grad_norm": 0.008816301822662354, + "learning_rate": 3.6503933284414885e-05, + "loss": 0.0609, + "num_input_tokens_seen": 90270176, + "step": 74170 + }, + { + "epoch": 8.26094219846308, + "grad_norm": 0.6114367246627808, + "learning_rate": 3.6501776018026126e-05, + "loss": 0.0539, + "num_input_tokens_seen": 90276160, + "step": 74175 + }, + { + "epoch": 8.261499053346698, + "grad_norm": 0.6608883142471313, + "learning_rate": 3.649961864299389e-05, + "loss": 0.0402, + "num_input_tokens_seen": 90282304, + "step": 74180 + }, + { + "epoch": 8.262055908230316, + "grad_norm": 0.02834603749215603, + "learning_rate": 3.649746115933854e-05, + "loss": 0.1058, + "num_input_tokens_seen": 90287904, + "step": 74185 + }, + { + "epoch": 8.262612763113932, + "grad_norm": 0.7087092399597168, + "learning_rate": 3.649530356708045e-05, + "loss": 0.0766, + "num_input_tokens_seen": 90293984, + "step": 74190 + }, + { + "epoch": 8.26316961799755, + "grad_norm": 0.31693387031555176, + "learning_rate": 3.649314586624002e-05, + "loss": 0.0533, + "num_input_tokens_seen": 90299744, + "step": 74195 + }, + { + "epoch": 8.263726472881167, + "grad_norm": 0.0708172395825386, + "learning_rate": 3.649098805683762e-05, + "loss": 0.0058, + "num_input_tokens_seen": 90305792, + "step": 74200 + }, + { + "epoch": 8.264283327764785, + "grad_norm": 1.0819141864776611, + "learning_rate": 3.648883013889363e-05, + "loss": 0.0675, + "num_input_tokens_seen": 90311904, + "step": 74205 + }, + { + "epoch": 8.264840182648403, + "grad_norm": 0.21665243804454803, + "learning_rate": 3.648667211242842e-05, + "loss": 0.0189, + "num_input_tokens_seen": 90318080, + "step": 74210 + }, + { + "epoch": 8.265397037532018, + "grad_norm": 0.08473943173885345, + "learning_rate": 3.64845139774624e-05, + "loss": 0.004, + "num_input_tokens_seen": 90324256, + "step": 74215 + }, + { + "epoch": 8.265953892415636, + "grad_norm": 0.005213153548538685, + "learning_rate": 3.648235573401594e-05, + "loss": 0.0811, + "num_input_tokens_seen": 90330752, + "step": 74220 + }, + { + "epoch": 8.266510747299254, + "grad_norm": 0.0023064755368977785, + "learning_rate": 3.648019738210944e-05, + "loss": 0.14, + "num_input_tokens_seen": 90336832, + "step": 74225 + }, + { + "epoch": 8.267067602182872, + "grad_norm": 0.28883975744247437, + "learning_rate": 3.647803892176327e-05, + "loss": 0.0968, + "num_input_tokens_seen": 90343232, + "step": 74230 + }, + { + "epoch": 8.26762445706649, + "grad_norm": 0.00022778143465984613, + "learning_rate": 3.647588035299783e-05, + "loss": 0.0543, + "num_input_tokens_seen": 90349248, + "step": 74235 + }, + { + "epoch": 8.268181311950105, + "grad_norm": 0.2837255597114563, + "learning_rate": 3.647372167583351e-05, + "loss": 0.0492, + "num_input_tokens_seen": 90355488, + "step": 74240 + }, + { + "epoch": 8.268738166833723, + "grad_norm": 0.24298588931560516, + "learning_rate": 3.6471562890290684e-05, + "loss": 0.1228, + "num_input_tokens_seen": 90361792, + "step": 74245 + }, + { + "epoch": 8.26929502171734, + "grad_norm": 0.23708048462867737, + "learning_rate": 3.646940399638976e-05, + "loss": 0.0878, + "num_input_tokens_seen": 90367904, + "step": 74250 + }, + { + "epoch": 8.269851876600958, + "grad_norm": 1.0715984106063843, + "learning_rate": 3.646724499415113e-05, + "loss": 0.055, + "num_input_tokens_seen": 90373760, + "step": 74255 + }, + { + "epoch": 8.270408731484576, + "grad_norm": 0.3857909142971039, + "learning_rate": 3.646508588359518e-05, + "loss": 0.0404, + "num_input_tokens_seen": 90379744, + "step": 74260 + }, + { + "epoch": 8.270965586368192, + "grad_norm": 0.32414525747299194, + "learning_rate": 3.646292666474231e-05, + "loss": 0.0191, + "num_input_tokens_seen": 90385888, + "step": 74265 + }, + { + "epoch": 8.27152244125181, + "grad_norm": 0.23266896605491638, + "learning_rate": 3.646076733761291e-05, + "loss": 0.0386, + "num_input_tokens_seen": 90392096, + "step": 74270 + }, + { + "epoch": 8.272079296135427, + "grad_norm": 1.8912345170974731, + "learning_rate": 3.645860790222739e-05, + "loss": 0.1302, + "num_input_tokens_seen": 90398272, + "step": 74275 + }, + { + "epoch": 8.272636151019045, + "grad_norm": 0.03210362419486046, + "learning_rate": 3.645644835860613e-05, + "loss": 0.0295, + "num_input_tokens_seen": 90404736, + "step": 74280 + }, + { + "epoch": 8.273193005902662, + "grad_norm": 0.0004235612868797034, + "learning_rate": 3.645428870676954e-05, + "loss": 0.0154, + "num_input_tokens_seen": 90411008, + "step": 74285 + }, + { + "epoch": 8.273749860786278, + "grad_norm": 0.08931087702512741, + "learning_rate": 3.6452128946738015e-05, + "loss": 0.003, + "num_input_tokens_seen": 90416896, + "step": 74290 + }, + { + "epoch": 8.274306715669896, + "grad_norm": 0.30135205388069153, + "learning_rate": 3.6449969078531955e-05, + "loss": 0.0574, + "num_input_tokens_seen": 90422944, + "step": 74295 + }, + { + "epoch": 8.274863570553514, + "grad_norm": 0.47271424531936646, + "learning_rate": 3.644780910217176e-05, + "loss": 0.0352, + "num_input_tokens_seen": 90429024, + "step": 74300 + }, + { + "epoch": 8.275420425437131, + "grad_norm": 0.5571274161338806, + "learning_rate": 3.644564901767784e-05, + "loss": 0.0549, + "num_input_tokens_seen": 90434784, + "step": 74305 + }, + { + "epoch": 8.275977280320749, + "grad_norm": 0.17982231080532074, + "learning_rate": 3.64434888250706e-05, + "loss": 0.0248, + "num_input_tokens_seen": 90440736, + "step": 74310 + }, + { + "epoch": 8.276534135204365, + "grad_norm": 0.27231428027153015, + "learning_rate": 3.6441328524370447e-05, + "loss": 0.1245, + "num_input_tokens_seen": 90446752, + "step": 74315 + }, + { + "epoch": 8.277090990087983, + "grad_norm": 0.48165395855903625, + "learning_rate": 3.643916811559776e-05, + "loss": 0.0279, + "num_input_tokens_seen": 90452928, + "step": 74320 + }, + { + "epoch": 8.2776478449716, + "grad_norm": 0.3279275894165039, + "learning_rate": 3.6437007598772974e-05, + "loss": 0.1137, + "num_input_tokens_seen": 90459072, + "step": 74325 + }, + { + "epoch": 8.278204699855218, + "grad_norm": 0.4642895758152008, + "learning_rate": 3.643484697391649e-05, + "loss": 0.0332, + "num_input_tokens_seen": 90465344, + "step": 74330 + }, + { + "epoch": 8.278761554738836, + "grad_norm": 0.25005394220352173, + "learning_rate": 3.643268624104871e-05, + "loss": 0.0335, + "num_input_tokens_seen": 90471648, + "step": 74335 + }, + { + "epoch": 8.279318409622453, + "grad_norm": 0.6036606431007385, + "learning_rate": 3.643052540019005e-05, + "loss": 0.0387, + "num_input_tokens_seen": 90477696, + "step": 74340 + }, + { + "epoch": 8.27987526450607, + "grad_norm": 0.045388735830783844, + "learning_rate": 3.642836445136092e-05, + "loss": 0.0411, + "num_input_tokens_seen": 90483712, + "step": 74345 + }, + { + "epoch": 8.280432119389687, + "grad_norm": 0.15440846979618073, + "learning_rate": 3.642620339458173e-05, + "loss": 0.0532, + "num_input_tokens_seen": 90489760, + "step": 74350 + }, + { + "epoch": 8.280988974273304, + "grad_norm": 0.006923201028257608, + "learning_rate": 3.6424042229872894e-05, + "loss": 0.0417, + "num_input_tokens_seen": 90495744, + "step": 74355 + }, + { + "epoch": 8.281545829156922, + "grad_norm": 0.4507245123386383, + "learning_rate": 3.6421880957254834e-05, + "loss": 0.0497, + "num_input_tokens_seen": 90502080, + "step": 74360 + }, + { + "epoch": 8.28210268404054, + "grad_norm": 0.009672231040894985, + "learning_rate": 3.641971957674795e-05, + "loss": 0.0343, + "num_input_tokens_seen": 90508192, + "step": 74365 + }, + { + "epoch": 8.282659538924156, + "grad_norm": 0.0010372094111517072, + "learning_rate": 3.641755808837267e-05, + "loss": 0.0741, + "num_input_tokens_seen": 90514272, + "step": 74370 + }, + { + "epoch": 8.283216393807773, + "grad_norm": 0.4237405061721802, + "learning_rate": 3.64153964921494e-05, + "loss": 0.1404, + "num_input_tokens_seen": 90520608, + "step": 74375 + }, + { + "epoch": 8.283773248691391, + "grad_norm": 0.018955804407596588, + "learning_rate": 3.641323478809857e-05, + "loss": 0.1104, + "num_input_tokens_seen": 90527104, + "step": 74380 + }, + { + "epoch": 8.284330103575009, + "grad_norm": 0.016862409189343452, + "learning_rate": 3.641107297624059e-05, + "loss": 0.0131, + "num_input_tokens_seen": 90533504, + "step": 74385 + }, + { + "epoch": 8.284886958458626, + "grad_norm": 0.006245961878448725, + "learning_rate": 3.640891105659588e-05, + "loss": 0.017, + "num_input_tokens_seen": 90539424, + "step": 74390 + }, + { + "epoch": 8.285443813342242, + "grad_norm": 0.3240044414997101, + "learning_rate": 3.640674902918488e-05, + "loss": 0.0144, + "num_input_tokens_seen": 90545504, + "step": 74395 + }, + { + "epoch": 8.28600066822586, + "grad_norm": 0.001973454374819994, + "learning_rate": 3.6404586894027984e-05, + "loss": 0.0383, + "num_input_tokens_seen": 90551552, + "step": 74400 + }, + { + "epoch": 8.286557523109478, + "grad_norm": 0.0020986006129533052, + "learning_rate": 3.640242465114562e-05, + "loss": 0.001, + "num_input_tokens_seen": 90557856, + "step": 74405 + }, + { + "epoch": 8.287114377993095, + "grad_norm": 0.9057922959327698, + "learning_rate": 3.640026230055823e-05, + "loss": 0.1345, + "num_input_tokens_seen": 90563840, + "step": 74410 + }, + { + "epoch": 8.287671232876713, + "grad_norm": 0.008818204514682293, + "learning_rate": 3.6398099842286226e-05, + "loss": 0.041, + "num_input_tokens_seen": 90570080, + "step": 74415 + }, + { + "epoch": 8.288228087760329, + "grad_norm": 1.393763780593872, + "learning_rate": 3.6395937276350045e-05, + "loss": 0.0744, + "num_input_tokens_seen": 90576320, + "step": 74420 + }, + { + "epoch": 8.288784942643947, + "grad_norm": 0.873019814491272, + "learning_rate": 3.63937746027701e-05, + "loss": 0.1007, + "num_input_tokens_seen": 90582432, + "step": 74425 + }, + { + "epoch": 8.289341797527564, + "grad_norm": 0.29375264048576355, + "learning_rate": 3.6391611821566816e-05, + "loss": 0.0721, + "num_input_tokens_seen": 90588608, + "step": 74430 + }, + { + "epoch": 8.289898652411182, + "grad_norm": 3.5557210445404053, + "learning_rate": 3.638944893276064e-05, + "loss": 0.0778, + "num_input_tokens_seen": 90594592, + "step": 74435 + }, + { + "epoch": 8.2904555072948, + "grad_norm": 0.19840103387832642, + "learning_rate": 3.6387285936371994e-05, + "loss": 0.0566, + "num_input_tokens_seen": 90600608, + "step": 74440 + }, + { + "epoch": 8.291012362178416, + "grad_norm": 1.2192015647888184, + "learning_rate": 3.6385122832421316e-05, + "loss": 0.091, + "num_input_tokens_seen": 90606496, + "step": 74445 + }, + { + "epoch": 8.291569217062033, + "grad_norm": 0.05713725462555885, + "learning_rate": 3.638295962092902e-05, + "loss": 0.0088, + "num_input_tokens_seen": 90612416, + "step": 74450 + }, + { + "epoch": 8.29212607194565, + "grad_norm": 0.7375770807266235, + "learning_rate": 3.638079630191556e-05, + "loss": 0.0125, + "num_input_tokens_seen": 90618912, + "step": 74455 + }, + { + "epoch": 8.292682926829269, + "grad_norm": 1.0911818742752075, + "learning_rate": 3.637863287540135e-05, + "loss": 0.052, + "num_input_tokens_seen": 90625024, + "step": 74460 + }, + { + "epoch": 8.293239781712886, + "grad_norm": 0.025042936205863953, + "learning_rate": 3.637646934140684e-05, + "loss": 0.0766, + "num_input_tokens_seen": 90630912, + "step": 74465 + }, + { + "epoch": 8.293796636596502, + "grad_norm": 0.19460314512252808, + "learning_rate": 3.637430569995247e-05, + "loss": 0.0161, + "num_input_tokens_seen": 90637088, + "step": 74470 + }, + { + "epoch": 8.29435349148012, + "grad_norm": 0.8475550413131714, + "learning_rate": 3.6372141951058665e-05, + "loss": 0.1075, + "num_input_tokens_seen": 90643040, + "step": 74475 + }, + { + "epoch": 8.294910346363737, + "grad_norm": 0.0009090631501749158, + "learning_rate": 3.636997809474587e-05, + "loss": 0.0144, + "num_input_tokens_seen": 90649152, + "step": 74480 + }, + { + "epoch": 8.295467201247355, + "grad_norm": 0.9918485879898071, + "learning_rate": 3.636781413103452e-05, + "loss": 0.1114, + "num_input_tokens_seen": 90655136, + "step": 74485 + }, + { + "epoch": 8.296024056130973, + "grad_norm": 0.032278016209602356, + "learning_rate": 3.636565005994506e-05, + "loss": 0.0315, + "num_input_tokens_seen": 90661024, + "step": 74490 + }, + { + "epoch": 8.29658091101459, + "grad_norm": 0.22589440643787384, + "learning_rate": 3.6363485881497916e-05, + "loss": 0.0733, + "num_input_tokens_seen": 90667072, + "step": 74495 + }, + { + "epoch": 8.297137765898206, + "grad_norm": 0.2158937007188797, + "learning_rate": 3.636132159571355e-05, + "loss": 0.0453, + "num_input_tokens_seen": 90672448, + "step": 74500 + }, + { + "epoch": 8.297694620781824, + "grad_norm": 0.08682732284069061, + "learning_rate": 3.635915720261241e-05, + "loss": 0.0148, + "num_input_tokens_seen": 90678592, + "step": 74505 + }, + { + "epoch": 8.298251475665442, + "grad_norm": 0.022473450750112534, + "learning_rate": 3.635699270221492e-05, + "loss": 0.0353, + "num_input_tokens_seen": 90684736, + "step": 74510 + }, + { + "epoch": 8.29880833054906, + "grad_norm": 0.01584302820265293, + "learning_rate": 3.6354828094541545e-05, + "loss": 0.0742, + "num_input_tokens_seen": 90690976, + "step": 74515 + }, + { + "epoch": 8.299365185432677, + "grad_norm": 0.4093567728996277, + "learning_rate": 3.63526633796127e-05, + "loss": 0.1166, + "num_input_tokens_seen": 90696576, + "step": 74520 + }, + { + "epoch": 8.299922040316293, + "grad_norm": 0.14032158255577087, + "learning_rate": 3.6350498557448874e-05, + "loss": 0.0261, + "num_input_tokens_seen": 90702752, + "step": 74525 + }, + { + "epoch": 8.30047889519991, + "grad_norm": 0.06865240633487701, + "learning_rate": 3.6348333628070495e-05, + "loss": 0.0067, + "num_input_tokens_seen": 90708992, + "step": 74530 + }, + { + "epoch": 8.301035750083528, + "grad_norm": 0.02183217741549015, + "learning_rate": 3.6346168591497995e-05, + "loss": 0.0831, + "num_input_tokens_seen": 90714944, + "step": 74535 + }, + { + "epoch": 8.301592604967146, + "grad_norm": 0.3045613467693329, + "learning_rate": 3.634400344775186e-05, + "loss": 0.067, + "num_input_tokens_seen": 90721056, + "step": 74540 + }, + { + "epoch": 8.302149459850764, + "grad_norm": 0.8952564597129822, + "learning_rate": 3.634183819685252e-05, + "loss": 0.0191, + "num_input_tokens_seen": 90727424, + "step": 74545 + }, + { + "epoch": 8.30270631473438, + "grad_norm": 0.022654730826616287, + "learning_rate": 3.6339672838820425e-05, + "loss": 0.0394, + "num_input_tokens_seen": 90733664, + "step": 74550 + }, + { + "epoch": 8.303263169617997, + "grad_norm": 0.44429120421409607, + "learning_rate": 3.633750737367604e-05, + "loss": 0.018, + "num_input_tokens_seen": 90739712, + "step": 74555 + }, + { + "epoch": 8.303820024501615, + "grad_norm": 0.02774653024971485, + "learning_rate": 3.633534180143981e-05, + "loss": 0.0119, + "num_input_tokens_seen": 90746080, + "step": 74560 + }, + { + "epoch": 8.304376879385233, + "grad_norm": 0.003934122622013092, + "learning_rate": 3.6333176122132204e-05, + "loss": 0.0611, + "num_input_tokens_seen": 90752448, + "step": 74565 + }, + { + "epoch": 8.30493373426885, + "grad_norm": 0.010960744693875313, + "learning_rate": 3.6331010335773654e-05, + "loss": 0.0218, + "num_input_tokens_seen": 90758432, + "step": 74570 + }, + { + "epoch": 8.305490589152466, + "grad_norm": 0.5708192586898804, + "learning_rate": 3.6328844442384645e-05, + "loss": 0.034, + "num_input_tokens_seen": 90764608, + "step": 74575 + }, + { + "epoch": 8.306047444036084, + "grad_norm": 0.0015495804836973548, + "learning_rate": 3.6326678441985626e-05, + "loss": 0.0064, + "num_input_tokens_seen": 90770528, + "step": 74580 + }, + { + "epoch": 8.306604298919702, + "grad_norm": 0.05099399760365486, + "learning_rate": 3.632451233459705e-05, + "loss": 0.0519, + "num_input_tokens_seen": 90776928, + "step": 74585 + }, + { + "epoch": 8.30716115380332, + "grad_norm": 0.168866366147995, + "learning_rate": 3.632234612023938e-05, + "loss": 0.0623, + "num_input_tokens_seen": 90783072, + "step": 74590 + }, + { + "epoch": 8.307718008686937, + "grad_norm": 0.28199371695518494, + "learning_rate": 3.632017979893308e-05, + "loss": 0.0737, + "num_input_tokens_seen": 90789120, + "step": 74595 + }, + { + "epoch": 8.308274863570553, + "grad_norm": 0.06178046390414238, + "learning_rate": 3.631801337069861e-05, + "loss": 0.0251, + "num_input_tokens_seen": 90795136, + "step": 74600 + }, + { + "epoch": 8.30883171845417, + "grad_norm": 0.002509845420718193, + "learning_rate": 3.6315846835556445e-05, + "loss": 0.0069, + "num_input_tokens_seen": 90801088, + "step": 74605 + }, + { + "epoch": 8.309388573337788, + "grad_norm": 0.00018488276691641659, + "learning_rate": 3.6313680193527035e-05, + "loss": 0.0296, + "num_input_tokens_seen": 90807456, + "step": 74610 + }, + { + "epoch": 8.309945428221406, + "grad_norm": 0.2304263710975647, + "learning_rate": 3.6311513444630845e-05, + "loss": 0.011, + "num_input_tokens_seen": 90813216, + "step": 74615 + }, + { + "epoch": 8.310502283105023, + "grad_norm": 0.12107273936271667, + "learning_rate": 3.630934658888836e-05, + "loss": 0.0076, + "num_input_tokens_seen": 90818656, + "step": 74620 + }, + { + "epoch": 8.31105913798864, + "grad_norm": 0.18652983009815216, + "learning_rate": 3.630717962632003e-05, + "loss": 0.0172, + "num_input_tokens_seen": 90824800, + "step": 74625 + }, + { + "epoch": 8.311615992872257, + "grad_norm": 3.6142168045043945, + "learning_rate": 3.630501255694632e-05, + "loss": 0.0784, + "num_input_tokens_seen": 90830752, + "step": 74630 + }, + { + "epoch": 8.312172847755875, + "grad_norm": 0.17350225150585175, + "learning_rate": 3.6302845380787724e-05, + "loss": 0.0292, + "num_input_tokens_seen": 90836768, + "step": 74635 + }, + { + "epoch": 8.312729702639492, + "grad_norm": 0.3436630070209503, + "learning_rate": 3.6300678097864685e-05, + "loss": 0.0339, + "num_input_tokens_seen": 90842848, + "step": 74640 + }, + { + "epoch": 8.31328655752311, + "grad_norm": 0.5145862102508545, + "learning_rate": 3.62985107081977e-05, + "loss": 0.0273, + "num_input_tokens_seen": 90849024, + "step": 74645 + }, + { + "epoch": 8.313843412406726, + "grad_norm": 0.002756283385679126, + "learning_rate": 3.629634321180722e-05, + "loss": 0.0357, + "num_input_tokens_seen": 90854912, + "step": 74650 + }, + { + "epoch": 8.314400267290344, + "grad_norm": 0.977794885635376, + "learning_rate": 3.629417560871373e-05, + "loss": 0.0351, + "num_input_tokens_seen": 90860736, + "step": 74655 + }, + { + "epoch": 8.314957122173961, + "grad_norm": 1.487884759902954, + "learning_rate": 3.6292007898937707e-05, + "loss": 0.0737, + "num_input_tokens_seen": 90867040, + "step": 74660 + }, + { + "epoch": 8.315513977057579, + "grad_norm": 0.20702172815799713, + "learning_rate": 3.6289840082499615e-05, + "loss": 0.1212, + "num_input_tokens_seen": 90873248, + "step": 74665 + }, + { + "epoch": 8.316070831941197, + "grad_norm": 0.020597903057932854, + "learning_rate": 3.628767215941995e-05, + "loss": 0.0034, + "num_input_tokens_seen": 90879360, + "step": 74670 + }, + { + "epoch": 8.316627686824813, + "grad_norm": 0.005564976949244738, + "learning_rate": 3.628550412971916e-05, + "loss": 0.0418, + "num_input_tokens_seen": 90885440, + "step": 74675 + }, + { + "epoch": 8.31718454170843, + "grad_norm": 1.758965253829956, + "learning_rate": 3.628333599341776e-05, + "loss": 0.0711, + "num_input_tokens_seen": 90891488, + "step": 74680 + }, + { + "epoch": 8.317741396592048, + "grad_norm": 0.004534367471933365, + "learning_rate": 3.62811677505362e-05, + "loss": 0.0013, + "num_input_tokens_seen": 90897664, + "step": 74685 + }, + { + "epoch": 8.318298251475666, + "grad_norm": 0.02968338131904602, + "learning_rate": 3.627899940109497e-05, + "loss": 0.0941, + "num_input_tokens_seen": 90903648, + "step": 74690 + }, + { + "epoch": 8.318855106359283, + "grad_norm": 0.6811795830726624, + "learning_rate": 3.6276830945114565e-05, + "loss": 0.0109, + "num_input_tokens_seen": 90909536, + "step": 74695 + }, + { + "epoch": 8.319411961242901, + "grad_norm": 0.1020173504948616, + "learning_rate": 3.627466238261545e-05, + "loss": 0.1084, + "num_input_tokens_seen": 90915616, + "step": 74700 + }, + { + "epoch": 8.319968816126517, + "grad_norm": 0.0016917785396799445, + "learning_rate": 3.627249371361812e-05, + "loss": 0.0015, + "num_input_tokens_seen": 90921728, + "step": 74705 + }, + { + "epoch": 8.320525671010135, + "grad_norm": 0.08160339295864105, + "learning_rate": 3.6270324938143046e-05, + "loss": 0.0466, + "num_input_tokens_seen": 90927616, + "step": 74710 + }, + { + "epoch": 8.321082525893752, + "grad_norm": 0.0002378589560976252, + "learning_rate": 3.626815605621073e-05, + "loss": 0.0429, + "num_input_tokens_seen": 90933920, + "step": 74715 + }, + { + "epoch": 8.32163938077737, + "grad_norm": 0.020022591575980186, + "learning_rate": 3.626598706784165e-05, + "loss": 0.0416, + "num_input_tokens_seen": 90939904, + "step": 74720 + }, + { + "epoch": 8.322196235660988, + "grad_norm": 0.09151619672775269, + "learning_rate": 3.62638179730563e-05, + "loss": 0.0237, + "num_input_tokens_seen": 90946336, + "step": 74725 + }, + { + "epoch": 8.322753090544603, + "grad_norm": 0.2707352042198181, + "learning_rate": 3.6261648771875175e-05, + "loss": 0.0229, + "num_input_tokens_seen": 90952512, + "step": 74730 + }, + { + "epoch": 8.323309945428221, + "grad_norm": 0.024316934868693352, + "learning_rate": 3.625947946431874e-05, + "loss": 0.0151, + "num_input_tokens_seen": 90958848, + "step": 74735 + }, + { + "epoch": 8.323866800311839, + "grad_norm": 0.034204430878162384, + "learning_rate": 3.62573100504075e-05, + "loss": 0.0092, + "num_input_tokens_seen": 90965024, + "step": 74740 + }, + { + "epoch": 8.324423655195456, + "grad_norm": 0.7936198115348816, + "learning_rate": 3.6255140530161954e-05, + "loss": 0.1206, + "num_input_tokens_seen": 90971040, + "step": 74745 + }, + { + "epoch": 8.324980510079074, + "grad_norm": 0.01161674689501524, + "learning_rate": 3.6252970903602576e-05, + "loss": 0.0008, + "num_input_tokens_seen": 90977088, + "step": 74750 + }, + { + "epoch": 8.32553736496269, + "grad_norm": 0.0002197977009927854, + "learning_rate": 3.625080117074989e-05, + "loss": 0.094, + "num_input_tokens_seen": 90983520, + "step": 74755 + }, + { + "epoch": 8.326094219846308, + "grad_norm": 0.007409562822431326, + "learning_rate": 3.624863133162436e-05, + "loss": 0.1457, + "num_input_tokens_seen": 90989376, + "step": 74760 + }, + { + "epoch": 8.326651074729925, + "grad_norm": 1.8643537759780884, + "learning_rate": 3.62464613862465e-05, + "loss": 0.0526, + "num_input_tokens_seen": 90995456, + "step": 74765 + }, + { + "epoch": 8.327207929613543, + "grad_norm": 0.2521630525588989, + "learning_rate": 3.62442913346368e-05, + "loss": 0.0074, + "num_input_tokens_seen": 91001344, + "step": 74770 + }, + { + "epoch": 8.32776478449716, + "grad_norm": 0.03811168298125267, + "learning_rate": 3.624212117681575e-05, + "loss": 0.0058, + "num_input_tokens_seen": 91007616, + "step": 74775 + }, + { + "epoch": 8.328321639380777, + "grad_norm": 0.13800504803657532, + "learning_rate": 3.623995091280388e-05, + "loss": 0.0216, + "num_input_tokens_seen": 91013664, + "step": 74780 + }, + { + "epoch": 8.328878494264394, + "grad_norm": 1.5172642469406128, + "learning_rate": 3.6237780542621645e-05, + "loss": 0.0512, + "num_input_tokens_seen": 91019648, + "step": 74785 + }, + { + "epoch": 8.329435349148012, + "grad_norm": 2.064192771911621, + "learning_rate": 3.623561006628959e-05, + "loss": 0.056, + "num_input_tokens_seen": 91025984, + "step": 74790 + }, + { + "epoch": 8.32999220403163, + "grad_norm": 0.781383752822876, + "learning_rate": 3.623343948382818e-05, + "loss": 0.0673, + "num_input_tokens_seen": 91031808, + "step": 74795 + }, + { + "epoch": 8.330549058915247, + "grad_norm": 0.2826751172542572, + "learning_rate": 3.623126879525794e-05, + "loss": 0.0336, + "num_input_tokens_seen": 91037696, + "step": 74800 + }, + { + "epoch": 8.331105913798863, + "grad_norm": 0.6748218536376953, + "learning_rate": 3.622909800059937e-05, + "loss": 0.0299, + "num_input_tokens_seen": 91044064, + "step": 74805 + }, + { + "epoch": 8.331662768682481, + "grad_norm": 0.6280698180198669, + "learning_rate": 3.622692709987297e-05, + "loss": 0.0296, + "num_input_tokens_seen": 91050080, + "step": 74810 + }, + { + "epoch": 8.332219623566099, + "grad_norm": 0.017923269420862198, + "learning_rate": 3.6224756093099254e-05, + "loss": 0.0026, + "num_input_tokens_seen": 91056192, + "step": 74815 + }, + { + "epoch": 8.332776478449716, + "grad_norm": 0.3612819314002991, + "learning_rate": 3.622258498029872e-05, + "loss": 0.0361, + "num_input_tokens_seen": 91062240, + "step": 74820 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.0007364415214397013, + "learning_rate": 3.622041376149188e-05, + "loss": 0.028, + "num_input_tokens_seen": 91067968, + "step": 74825 + }, + { + "epoch": 8.33389018821695, + "grad_norm": 0.7660107612609863, + "learning_rate": 3.621824243669924e-05, + "loss": 0.0805, + "num_input_tokens_seen": 91074144, + "step": 74830 + }, + { + "epoch": 8.334447043100567, + "grad_norm": 0.15049628913402557, + "learning_rate": 3.621607100594131e-05, + "loss": 0.0063, + "num_input_tokens_seen": 91079936, + "step": 74835 + }, + { + "epoch": 8.335003897984185, + "grad_norm": 1.2597559690475464, + "learning_rate": 3.621389946923861e-05, + "loss": 0.0405, + "num_input_tokens_seen": 91085760, + "step": 74840 + }, + { + "epoch": 8.335560752867803, + "grad_norm": 0.39038124680519104, + "learning_rate": 3.621172782661164e-05, + "loss": 0.082, + "num_input_tokens_seen": 91091552, + "step": 74845 + }, + { + "epoch": 8.33611760775142, + "grad_norm": 0.0029833188746124506, + "learning_rate": 3.6209556078080926e-05, + "loss": 0.1613, + "num_input_tokens_seen": 91097760, + "step": 74850 + }, + { + "epoch": 8.336674462635038, + "grad_norm": 0.008540134876966476, + "learning_rate": 3.620738422366696e-05, + "loss": 0.0002, + "num_input_tokens_seen": 91104288, + "step": 74855 + }, + { + "epoch": 8.337231317518654, + "grad_norm": 0.42348113656044006, + "learning_rate": 3.6205212263390285e-05, + "loss": 0.1485, + "num_input_tokens_seen": 91110560, + "step": 74860 + }, + { + "epoch": 8.337788172402272, + "grad_norm": 3.5703821182250977, + "learning_rate": 3.620304019727139e-05, + "loss": 0.0532, + "num_input_tokens_seen": 91116768, + "step": 74865 + }, + { + "epoch": 8.33834502728589, + "grad_norm": 0.10340820997953415, + "learning_rate": 3.620086802533081e-05, + "loss": 0.0223, + "num_input_tokens_seen": 91122912, + "step": 74870 + }, + { + "epoch": 8.338901882169507, + "grad_norm": 0.01281904336065054, + "learning_rate": 3.619869574758906e-05, + "loss": 0.025, + "num_input_tokens_seen": 91128736, + "step": 74875 + }, + { + "epoch": 8.339458737053125, + "grad_norm": 0.14837802946567535, + "learning_rate": 3.619652336406666e-05, + "loss": 0.0238, + "num_input_tokens_seen": 91134848, + "step": 74880 + }, + { + "epoch": 8.34001559193674, + "grad_norm": 0.5570815205574036, + "learning_rate": 3.619435087478412e-05, + "loss": 0.0353, + "num_input_tokens_seen": 91140896, + "step": 74885 + }, + { + "epoch": 8.340572446820358, + "grad_norm": 0.009677249938249588, + "learning_rate": 3.619217827976197e-05, + "loss": 0.0243, + "num_input_tokens_seen": 91147072, + "step": 74890 + }, + { + "epoch": 8.341129301703976, + "grad_norm": 0.004355654586106539, + "learning_rate": 3.619000557902073e-05, + "loss": 0.0554, + "num_input_tokens_seen": 91152832, + "step": 74895 + }, + { + "epoch": 8.341686156587594, + "grad_norm": 0.220774307847023, + "learning_rate": 3.618783277258091e-05, + "loss": 0.0142, + "num_input_tokens_seen": 91158752, + "step": 74900 + }, + { + "epoch": 8.342243011471211, + "grad_norm": 0.0721706971526146, + "learning_rate": 3.6185659860463064e-05, + "loss": 0.0634, + "num_input_tokens_seen": 91164736, + "step": 74905 + }, + { + "epoch": 8.342799866354827, + "grad_norm": 1.1493425369262695, + "learning_rate": 3.618348684268769e-05, + "loss": 0.0248, + "num_input_tokens_seen": 91170080, + "step": 74910 + }, + { + "epoch": 8.343356721238445, + "grad_norm": 2.2109973430633545, + "learning_rate": 3.618131371927532e-05, + "loss": 0.0412, + "num_input_tokens_seen": 91176288, + "step": 74915 + }, + { + "epoch": 8.343913576122063, + "grad_norm": 0.1623506098985672, + "learning_rate": 3.6179140490246485e-05, + "loss": 0.0112, + "num_input_tokens_seen": 91182592, + "step": 74920 + }, + { + "epoch": 8.34447043100568, + "grad_norm": 1.501193881034851, + "learning_rate": 3.617696715562172e-05, + "loss": 0.1043, + "num_input_tokens_seen": 91188960, + "step": 74925 + }, + { + "epoch": 8.345027285889298, + "grad_norm": 0.000773962470702827, + "learning_rate": 3.617479371542153e-05, + "loss": 0.0614, + "num_input_tokens_seen": 91195040, + "step": 74930 + }, + { + "epoch": 8.345584140772914, + "grad_norm": 0.14620578289031982, + "learning_rate": 3.6172620169666474e-05, + "loss": 0.0418, + "num_input_tokens_seen": 91201248, + "step": 74935 + }, + { + "epoch": 8.346140995656532, + "grad_norm": 0.732123613357544, + "learning_rate": 3.617044651837706e-05, + "loss": 0.0202, + "num_input_tokens_seen": 91207392, + "step": 74940 + }, + { + "epoch": 8.34669785054015, + "grad_norm": 1.0854095220565796, + "learning_rate": 3.616827276157384e-05, + "loss": 0.1159, + "num_input_tokens_seen": 91213408, + "step": 74945 + }, + { + "epoch": 8.347254705423767, + "grad_norm": 0.00047500478103756905, + "learning_rate": 3.616609889927732e-05, + "loss": 0.1658, + "num_input_tokens_seen": 91219488, + "step": 74950 + }, + { + "epoch": 8.347811560307385, + "grad_norm": 0.0033473134972155094, + "learning_rate": 3.6163924931508065e-05, + "loss": 0.1451, + "num_input_tokens_seen": 91225568, + "step": 74955 + }, + { + "epoch": 8.348368415191, + "grad_norm": 0.014182992279529572, + "learning_rate": 3.6161750858286586e-05, + "loss": 0.0237, + "num_input_tokens_seen": 91231904, + "step": 74960 + }, + { + "epoch": 8.348925270074618, + "grad_norm": 1.2644251585006714, + "learning_rate": 3.6159576679633426e-05, + "loss": 0.0413, + "num_input_tokens_seen": 91238272, + "step": 74965 + }, + { + "epoch": 8.349482124958236, + "grad_norm": 0.00923032034188509, + "learning_rate": 3.6157402395569136e-05, + "loss": 0.0649, + "num_input_tokens_seen": 91244288, + "step": 74970 + }, + { + "epoch": 8.350038979841854, + "grad_norm": 0.010065899230539799, + "learning_rate": 3.615522800611423e-05, + "loss": 0.014, + "num_input_tokens_seen": 91250400, + "step": 74975 + }, + { + "epoch": 8.350595834725471, + "grad_norm": 1.4661483764648438, + "learning_rate": 3.6153053511289256e-05, + "loss": 0.0465, + "num_input_tokens_seen": 91256736, + "step": 74980 + }, + { + "epoch": 8.351152689609087, + "grad_norm": 0.01498746033757925, + "learning_rate": 3.6150878911114764e-05, + "loss": 0.0193, + "num_input_tokens_seen": 91263008, + "step": 74985 + }, + { + "epoch": 8.351709544492705, + "grad_norm": 1.1865949630737305, + "learning_rate": 3.6148704205611284e-05, + "loss": 0.0793, + "num_input_tokens_seen": 91268480, + "step": 74990 + }, + { + "epoch": 8.352266399376322, + "grad_norm": 0.0651845633983612, + "learning_rate": 3.6146529394799356e-05, + "loss": 0.0272, + "num_input_tokens_seen": 91274880, + "step": 74995 + }, + { + "epoch": 8.35282325425994, + "grad_norm": 1.23727548122406, + "learning_rate": 3.614435447869953e-05, + "loss": 0.1791, + "num_input_tokens_seen": 91281024, + "step": 75000 + }, + { + "epoch": 8.353380109143558, + "grad_norm": 0.5613580346107483, + "learning_rate": 3.614217945733236e-05, + "loss": 0.0899, + "num_input_tokens_seen": 91286560, + "step": 75005 + }, + { + "epoch": 8.353936964027174, + "grad_norm": 0.0025787847116589546, + "learning_rate": 3.6140004330718357e-05, + "loss": 0.0404, + "num_input_tokens_seen": 91292576, + "step": 75010 + }, + { + "epoch": 8.354493818910791, + "grad_norm": 0.7722119688987732, + "learning_rate": 3.6137829098878104e-05, + "loss": 0.034, + "num_input_tokens_seen": 91298560, + "step": 75015 + }, + { + "epoch": 8.355050673794409, + "grad_norm": 1.892324447631836, + "learning_rate": 3.613565376183212e-05, + "loss": 0.1263, + "num_input_tokens_seen": 91304832, + "step": 75020 + }, + { + "epoch": 8.355607528678027, + "grad_norm": 0.8063396215438843, + "learning_rate": 3.613347831960097e-05, + "loss": 0.0753, + "num_input_tokens_seen": 91310240, + "step": 75025 + }, + { + "epoch": 8.356164383561644, + "grad_norm": 1.924464225769043, + "learning_rate": 3.613130277220519e-05, + "loss": 0.0177, + "num_input_tokens_seen": 91315872, + "step": 75030 + }, + { + "epoch": 8.35672123844526, + "grad_norm": 0.08956986665725708, + "learning_rate": 3.6129127119665345e-05, + "loss": 0.0249, + "num_input_tokens_seen": 91322112, + "step": 75035 + }, + { + "epoch": 8.357278093328878, + "grad_norm": 0.0903935357928276, + "learning_rate": 3.612695136200198e-05, + "loss": 0.0594, + "num_input_tokens_seen": 91328256, + "step": 75040 + }, + { + "epoch": 8.357834948212496, + "grad_norm": 0.5809683203697205, + "learning_rate": 3.6124775499235644e-05, + "loss": 0.0381, + "num_input_tokens_seen": 91333824, + "step": 75045 + }, + { + "epoch": 8.358391803096113, + "grad_norm": 1.0247260332107544, + "learning_rate": 3.612259953138689e-05, + "loss": 0.0817, + "num_input_tokens_seen": 91339712, + "step": 75050 + }, + { + "epoch": 8.358948657979731, + "grad_norm": 0.0005907598533667624, + "learning_rate": 3.6120423458476265e-05, + "loss": 0.0878, + "num_input_tokens_seen": 91345824, + "step": 75055 + }, + { + "epoch": 8.359505512863349, + "grad_norm": 0.7763645648956299, + "learning_rate": 3.611824728052433e-05, + "loss": 0.0256, + "num_input_tokens_seen": 91352000, + "step": 75060 + }, + { + "epoch": 8.360062367746965, + "grad_norm": 0.9255682826042175, + "learning_rate": 3.611607099755165e-05, + "loss": 0.0677, + "num_input_tokens_seen": 91358432, + "step": 75065 + }, + { + "epoch": 8.360619222630582, + "grad_norm": 0.00020341199706308544, + "learning_rate": 3.611389460957877e-05, + "loss": 0.0191, + "num_input_tokens_seen": 91364800, + "step": 75070 + }, + { + "epoch": 8.3611760775142, + "grad_norm": 0.00024959738948382437, + "learning_rate": 3.611171811662626e-05, + "loss": 0.0102, + "num_input_tokens_seen": 91371200, + "step": 75075 + }, + { + "epoch": 8.361732932397818, + "grad_norm": 0.004400528036057949, + "learning_rate": 3.610954151871466e-05, + "loss": 0.0743, + "num_input_tokens_seen": 91377216, + "step": 75080 + }, + { + "epoch": 8.362289787281435, + "grad_norm": 0.24851572513580322, + "learning_rate": 3.610736481586454e-05, + "loss": 0.0954, + "num_input_tokens_seen": 91383552, + "step": 75085 + }, + { + "epoch": 8.362846642165051, + "grad_norm": 0.013815179467201233, + "learning_rate": 3.610518800809646e-05, + "loss": 0.0675, + "num_input_tokens_seen": 91389568, + "step": 75090 + }, + { + "epoch": 8.363403497048669, + "grad_norm": 0.11670948565006256, + "learning_rate": 3.610301109543098e-05, + "loss": 0.0112, + "num_input_tokens_seen": 91395552, + "step": 75095 + }, + { + "epoch": 8.363960351932286, + "grad_norm": 0.5548452734947205, + "learning_rate": 3.6100834077888675e-05, + "loss": 0.1186, + "num_input_tokens_seen": 91401536, + "step": 75100 + }, + { + "epoch": 8.364517206815904, + "grad_norm": 0.05838415399193764, + "learning_rate": 3.609865695549008e-05, + "loss": 0.0444, + "num_input_tokens_seen": 91407808, + "step": 75105 + }, + { + "epoch": 8.365074061699522, + "grad_norm": 0.24305009841918945, + "learning_rate": 3.60964797282558e-05, + "loss": 0.0464, + "num_input_tokens_seen": 91413696, + "step": 75110 + }, + { + "epoch": 8.365630916583138, + "grad_norm": 0.7431947588920593, + "learning_rate": 3.6094302396206366e-05, + "loss": 0.0333, + "num_input_tokens_seen": 91419744, + "step": 75115 + }, + { + "epoch": 8.366187771466755, + "grad_norm": 0.06244128569960594, + "learning_rate": 3.609212495936236e-05, + "loss": 0.0276, + "num_input_tokens_seen": 91425728, + "step": 75120 + }, + { + "epoch": 8.366744626350373, + "grad_norm": 0.00043987578828819096, + "learning_rate": 3.608994741774434e-05, + "loss": 0.1081, + "num_input_tokens_seen": 91431872, + "step": 75125 + }, + { + "epoch": 8.36730148123399, + "grad_norm": 0.19745244085788727, + "learning_rate": 3.6087769771372894e-05, + "loss": 0.0327, + "num_input_tokens_seen": 91437728, + "step": 75130 + }, + { + "epoch": 8.367858336117608, + "grad_norm": 0.016277242451906204, + "learning_rate": 3.6085592020268574e-05, + "loss": 0.0087, + "num_input_tokens_seen": 91444064, + "step": 75135 + }, + { + "epoch": 8.368415191001224, + "grad_norm": 2.287095069885254, + "learning_rate": 3.6083414164451954e-05, + "loss": 0.0946, + "num_input_tokens_seen": 91449952, + "step": 75140 + }, + { + "epoch": 8.368972045884842, + "grad_norm": 0.03605502471327782, + "learning_rate": 3.60812362039436e-05, + "loss": 0.2029, + "num_input_tokens_seen": 91456320, + "step": 75145 + }, + { + "epoch": 8.36952890076846, + "grad_norm": 0.016780929639935493, + "learning_rate": 3.607905813876411e-05, + "loss": 0.0016, + "num_input_tokens_seen": 91462496, + "step": 75150 + }, + { + "epoch": 8.370085755652077, + "grad_norm": 0.005845590028911829, + "learning_rate": 3.607687996893402e-05, + "loss": 0.0902, + "num_input_tokens_seen": 91468736, + "step": 75155 + }, + { + "epoch": 8.370642610535695, + "grad_norm": 0.09681583195924759, + "learning_rate": 3.607470169447394e-05, + "loss": 0.1631, + "num_input_tokens_seen": 91474304, + "step": 75160 + }, + { + "epoch": 8.371199465419311, + "grad_norm": 0.04681794345378876, + "learning_rate": 3.607252331540442e-05, + "loss": 0.0083, + "num_input_tokens_seen": 91480384, + "step": 75165 + }, + { + "epoch": 8.371756320302929, + "grad_norm": 0.3927742838859558, + "learning_rate": 3.6070344831746055e-05, + "loss": 0.0164, + "num_input_tokens_seen": 91486528, + "step": 75170 + }, + { + "epoch": 8.372313175186546, + "grad_norm": 0.3031635582447052, + "learning_rate": 3.60681662435194e-05, + "loss": 0.0265, + "num_input_tokens_seen": 91492608, + "step": 75175 + }, + { + "epoch": 8.372870030070164, + "grad_norm": 0.01820315793156624, + "learning_rate": 3.606598755074506e-05, + "loss": 0.1369, + "num_input_tokens_seen": 91498624, + "step": 75180 + }, + { + "epoch": 8.373426884953782, + "grad_norm": 0.5242347717285156, + "learning_rate": 3.606380875344359e-05, + "loss": 0.0751, + "num_input_tokens_seen": 91504736, + "step": 75185 + }, + { + "epoch": 8.373983739837398, + "grad_norm": 1.2714964151382446, + "learning_rate": 3.6061629851635595e-05, + "loss": 0.0529, + "num_input_tokens_seen": 91511264, + "step": 75190 + }, + { + "epoch": 8.374540594721015, + "grad_norm": 0.08599833399057388, + "learning_rate": 3.605945084534164e-05, + "loss": 0.0283, + "num_input_tokens_seen": 91517280, + "step": 75195 + }, + { + "epoch": 8.375097449604633, + "grad_norm": 0.0007919915369711816, + "learning_rate": 3.60572717345823e-05, + "loss": 0.0486, + "num_input_tokens_seen": 91523168, + "step": 75200 + }, + { + "epoch": 8.37565430448825, + "grad_norm": 0.7824288606643677, + "learning_rate": 3.605509251937818e-05, + "loss": 0.0875, + "num_input_tokens_seen": 91529344, + "step": 75205 + }, + { + "epoch": 8.376211159371868, + "grad_norm": 1.9232642650604248, + "learning_rate": 3.6052913199749855e-05, + "loss": 0.0891, + "num_input_tokens_seen": 91535296, + "step": 75210 + }, + { + "epoch": 8.376768014255486, + "grad_norm": 0.07141400873661041, + "learning_rate": 3.60507337757179e-05, + "loss": 0.0106, + "num_input_tokens_seen": 91540832, + "step": 75215 + }, + { + "epoch": 8.377324869139102, + "grad_norm": 0.1726028323173523, + "learning_rate": 3.6048554247302924e-05, + "loss": 0.0433, + "num_input_tokens_seen": 91546880, + "step": 75220 + }, + { + "epoch": 8.37788172402272, + "grad_norm": 0.5896248817443848, + "learning_rate": 3.604637461452549e-05, + "loss": 0.0962, + "num_input_tokens_seen": 91553152, + "step": 75225 + }, + { + "epoch": 8.378438578906337, + "grad_norm": 0.15883372724056244, + "learning_rate": 3.604419487740621e-05, + "loss": 0.034, + "num_input_tokens_seen": 91559264, + "step": 75230 + }, + { + "epoch": 8.378995433789955, + "grad_norm": 1.1908758878707886, + "learning_rate": 3.604201503596565e-05, + "loss": 0.1062, + "num_input_tokens_seen": 91565120, + "step": 75235 + }, + { + "epoch": 8.379552288673572, + "grad_norm": 0.016736755147576332, + "learning_rate": 3.603983509022441e-05, + "loss": 0.0033, + "num_input_tokens_seen": 91571296, + "step": 75240 + }, + { + "epoch": 8.380109143557188, + "grad_norm": 0.035148341208696365, + "learning_rate": 3.603765504020309e-05, + "loss": 0.0091, + "num_input_tokens_seen": 91577664, + "step": 75245 + }, + { + "epoch": 8.380665998440806, + "grad_norm": 0.0377468504011631, + "learning_rate": 3.603547488592226e-05, + "loss": 0.0233, + "num_input_tokens_seen": 91583680, + "step": 75250 + }, + { + "epoch": 8.381222853324424, + "grad_norm": 0.0007398340385407209, + "learning_rate": 3.6033294627402545e-05, + "loss": 0.0323, + "num_input_tokens_seen": 91589248, + "step": 75255 + }, + { + "epoch": 8.381779708208041, + "grad_norm": 0.6950466632843018, + "learning_rate": 3.603111426466452e-05, + "loss": 0.0659, + "num_input_tokens_seen": 91594976, + "step": 75260 + }, + { + "epoch": 8.382336563091659, + "grad_norm": 0.1508866399526596, + "learning_rate": 3.602893379772878e-05, + "loss": 0.0406, + "num_input_tokens_seen": 91601216, + "step": 75265 + }, + { + "epoch": 8.382893417975275, + "grad_norm": 0.4810211658477783, + "learning_rate": 3.602675322661592e-05, + "loss": 0.0726, + "num_input_tokens_seen": 91607392, + "step": 75270 + }, + { + "epoch": 8.383450272858893, + "grad_norm": 1.6657936573028564, + "learning_rate": 3.602457255134655e-05, + "loss": 0.0733, + "num_input_tokens_seen": 91613696, + "step": 75275 + }, + { + "epoch": 8.38400712774251, + "grad_norm": 0.00041074471664614975, + "learning_rate": 3.602239177194125e-05, + "loss": 0.0196, + "num_input_tokens_seen": 91620032, + "step": 75280 + }, + { + "epoch": 8.384563982626128, + "grad_norm": 0.02120382711291313, + "learning_rate": 3.6020210888420636e-05, + "loss": 0.0181, + "num_input_tokens_seen": 91626176, + "step": 75285 + }, + { + "epoch": 8.385120837509746, + "grad_norm": 0.3343546688556671, + "learning_rate": 3.60180299008053e-05, + "loss": 0.0937, + "num_input_tokens_seen": 91632288, + "step": 75290 + }, + { + "epoch": 8.385677692393362, + "grad_norm": 0.10950464755296707, + "learning_rate": 3.6015848809115835e-05, + "loss": 0.0037, + "num_input_tokens_seen": 91638336, + "step": 75295 + }, + { + "epoch": 8.38623454727698, + "grad_norm": 0.280282199382782, + "learning_rate": 3.601366761337287e-05, + "loss": 0.1356, + "num_input_tokens_seen": 91643808, + "step": 75300 + }, + { + "epoch": 8.386791402160597, + "grad_norm": 0.2017405927181244, + "learning_rate": 3.6011486313596975e-05, + "loss": 0.0318, + "num_input_tokens_seen": 91650112, + "step": 75305 + }, + { + "epoch": 8.387348257044215, + "grad_norm": 1.395627498626709, + "learning_rate": 3.600930490980877e-05, + "loss": 0.0484, + "num_input_tokens_seen": 91656128, + "step": 75310 + }, + { + "epoch": 8.387905111927832, + "grad_norm": 0.04254228249192238, + "learning_rate": 3.6007123402028866e-05, + "loss": 0.028, + "num_input_tokens_seen": 91662144, + "step": 75315 + }, + { + "epoch": 8.388461966811448, + "grad_norm": 1.818816900253296, + "learning_rate": 3.600494179027786e-05, + "loss": 0.2469, + "num_input_tokens_seen": 91668096, + "step": 75320 + }, + { + "epoch": 8.389018821695066, + "grad_norm": 0.8210193514823914, + "learning_rate": 3.600276007457637e-05, + "loss": 0.0501, + "num_input_tokens_seen": 91674336, + "step": 75325 + }, + { + "epoch": 8.389575676578684, + "grad_norm": 0.6467251777648926, + "learning_rate": 3.6000578254944986e-05, + "loss": 0.075, + "num_input_tokens_seen": 91680448, + "step": 75330 + }, + { + "epoch": 8.390132531462301, + "grad_norm": 0.02947489731013775, + "learning_rate": 3.5998396331404324e-05, + "loss": 0.1435, + "num_input_tokens_seen": 91686464, + "step": 75335 + }, + { + "epoch": 8.390689386345919, + "grad_norm": 1.079683542251587, + "learning_rate": 3.599621430397501e-05, + "loss": 0.0619, + "num_input_tokens_seen": 91692352, + "step": 75340 + }, + { + "epoch": 8.391246241229535, + "grad_norm": 0.002769415732473135, + "learning_rate": 3.599403217267763e-05, + "loss": 0.0194, + "num_input_tokens_seen": 91698848, + "step": 75345 + }, + { + "epoch": 8.391803096113152, + "grad_norm": 0.5637642741203308, + "learning_rate": 3.599184993753282e-05, + "loss": 0.0156, + "num_input_tokens_seen": 91704864, + "step": 75350 + }, + { + "epoch": 8.39235995099677, + "grad_norm": 0.9444783926010132, + "learning_rate": 3.598966759856117e-05, + "loss": 0.0746, + "num_input_tokens_seen": 91710784, + "step": 75355 + }, + { + "epoch": 8.392916805880388, + "grad_norm": 1.0969946384429932, + "learning_rate": 3.5987485155783304e-05, + "loss": 0.0894, + "num_input_tokens_seen": 91716768, + "step": 75360 + }, + { + "epoch": 8.393473660764005, + "grad_norm": 0.2559245228767395, + "learning_rate": 3.598530260921984e-05, + "loss": 0.0059, + "num_input_tokens_seen": 91722944, + "step": 75365 + }, + { + "epoch": 8.394030515647621, + "grad_norm": 0.6836511492729187, + "learning_rate": 3.598311995889139e-05, + "loss": 0.022, + "num_input_tokens_seen": 91729312, + "step": 75370 + }, + { + "epoch": 8.394587370531239, + "grad_norm": 1.4034303426742554, + "learning_rate": 3.598093720481858e-05, + "loss": 0.0629, + "num_input_tokens_seen": 91734336, + "step": 75375 + }, + { + "epoch": 8.395144225414857, + "grad_norm": 0.01808704435825348, + "learning_rate": 3.5978754347022015e-05, + "loss": 0.0138, + "num_input_tokens_seen": 91740256, + "step": 75380 + }, + { + "epoch": 8.395701080298474, + "grad_norm": 0.00048741447972133756, + "learning_rate": 3.597657138552232e-05, + "loss": 0.0125, + "num_input_tokens_seen": 91746592, + "step": 75385 + }, + { + "epoch": 8.396257935182092, + "grad_norm": 0.1958456039428711, + "learning_rate": 3.5974388320340115e-05, + "loss": 0.098, + "num_input_tokens_seen": 91752768, + "step": 75390 + }, + { + "epoch": 8.396814790065708, + "grad_norm": 0.011167803779244423, + "learning_rate": 3.597220515149602e-05, + "loss": 0.0232, + "num_input_tokens_seen": 91759104, + "step": 75395 + }, + { + "epoch": 8.397371644949326, + "grad_norm": 0.05723012611269951, + "learning_rate": 3.597002187901065e-05, + "loss": 0.0167, + "num_input_tokens_seen": 91765248, + "step": 75400 + }, + { + "epoch": 8.397928499832943, + "grad_norm": 0.008822383359074593, + "learning_rate": 3.596783850290464e-05, + "loss": 0.1473, + "num_input_tokens_seen": 91771168, + "step": 75405 + }, + { + "epoch": 8.398485354716561, + "grad_norm": 1.4739230871200562, + "learning_rate": 3.596565502319861e-05, + "loss": 0.1766, + "num_input_tokens_seen": 91777440, + "step": 75410 + }, + { + "epoch": 8.399042209600179, + "grad_norm": 0.03392676264047623, + "learning_rate": 3.596347143991318e-05, + "loss": 0.0065, + "num_input_tokens_seen": 91783552, + "step": 75415 + }, + { + "epoch": 8.399599064483796, + "grad_norm": 0.001713078236207366, + "learning_rate": 3.5961287753068975e-05, + "loss": 0.0054, + "num_input_tokens_seen": 91789856, + "step": 75420 + }, + { + "epoch": 8.400155919367412, + "grad_norm": 0.00506898108869791, + "learning_rate": 3.595910396268663e-05, + "loss": 0.0302, + "num_input_tokens_seen": 91795904, + "step": 75425 + }, + { + "epoch": 8.40071277425103, + "grad_norm": 0.3325476348400116, + "learning_rate": 3.595692006878676e-05, + "loss": 0.117, + "num_input_tokens_seen": 91801696, + "step": 75430 + }, + { + "epoch": 8.401269629134648, + "grad_norm": 0.03439433500170708, + "learning_rate": 3.595473607139002e-05, + "loss": 0.0347, + "num_input_tokens_seen": 91807616, + "step": 75435 + }, + { + "epoch": 8.401826484018265, + "grad_norm": 0.06148921698331833, + "learning_rate": 3.5952551970517e-05, + "loss": 0.0078, + "num_input_tokens_seen": 91813952, + "step": 75440 + }, + { + "epoch": 8.402383338901883, + "grad_norm": 0.09274802356958389, + "learning_rate": 3.5950367766188366e-05, + "loss": 0.0895, + "num_input_tokens_seen": 91819968, + "step": 75445 + }, + { + "epoch": 8.402940193785499, + "grad_norm": 0.05934944003820419, + "learning_rate": 3.594818345842473e-05, + "loss": 0.1419, + "num_input_tokens_seen": 91826272, + "step": 75450 + }, + { + "epoch": 8.403497048669117, + "grad_norm": 0.000902116356883198, + "learning_rate": 3.5945999047246734e-05, + "loss": 0.0838, + "num_input_tokens_seen": 91832352, + "step": 75455 + }, + { + "epoch": 8.404053903552734, + "grad_norm": 0.0015706737758591771, + "learning_rate": 3.5943814532674997e-05, + "loss": 0.0537, + "num_input_tokens_seen": 91838496, + "step": 75460 + }, + { + "epoch": 8.404610758436352, + "grad_norm": 0.7346851825714111, + "learning_rate": 3.5941629914730166e-05, + "loss": 0.0166, + "num_input_tokens_seen": 91844512, + "step": 75465 + }, + { + "epoch": 8.40516761331997, + "grad_norm": 0.15692153573036194, + "learning_rate": 3.593944519343289e-05, + "loss": 0.0085, + "num_input_tokens_seen": 91850400, + "step": 75470 + }, + { + "epoch": 8.405724468203585, + "grad_norm": 0.05569930747151375, + "learning_rate": 3.593726036880377e-05, + "loss": 0.0448, + "num_input_tokens_seen": 91856096, + "step": 75475 + }, + { + "epoch": 8.406281323087203, + "grad_norm": 0.715448796749115, + "learning_rate": 3.593507544086347e-05, + "loss": 0.0644, + "num_input_tokens_seen": 91862464, + "step": 75480 + }, + { + "epoch": 8.40683817797082, + "grad_norm": 1.1128673553466797, + "learning_rate": 3.5932890409632616e-05, + "loss": 0.0435, + "num_input_tokens_seen": 91868512, + "step": 75485 + }, + { + "epoch": 8.407395032854438, + "grad_norm": 0.7629323601722717, + "learning_rate": 3.5930705275131865e-05, + "loss": 0.0596, + "num_input_tokens_seen": 91874656, + "step": 75490 + }, + { + "epoch": 8.407951887738056, + "grad_norm": 0.2488223910331726, + "learning_rate": 3.592852003738184e-05, + "loss": 0.0281, + "num_input_tokens_seen": 91880960, + "step": 75495 + }, + { + "epoch": 8.408508742621672, + "grad_norm": 0.3843240439891815, + "learning_rate": 3.592633469640318e-05, + "loss": 0.0749, + "num_input_tokens_seen": 91887264, + "step": 75500 + }, + { + "epoch": 8.40906559750529, + "grad_norm": 1.1345793008804321, + "learning_rate": 3.5924149252216547e-05, + "loss": 0.0705, + "num_input_tokens_seen": 91893280, + "step": 75505 + }, + { + "epoch": 8.409622452388907, + "grad_norm": 0.7678515911102295, + "learning_rate": 3.592196370484257e-05, + "loss": 0.0401, + "num_input_tokens_seen": 91899392, + "step": 75510 + }, + { + "epoch": 8.410179307272525, + "grad_norm": 0.3228338062763214, + "learning_rate": 3.591977805430189e-05, + "loss": 0.0223, + "num_input_tokens_seen": 91905472, + "step": 75515 + }, + { + "epoch": 8.410736162156143, + "grad_norm": 0.5056699514389038, + "learning_rate": 3.591759230061515e-05, + "loss": 0.0076, + "num_input_tokens_seen": 91911808, + "step": 75520 + }, + { + "epoch": 8.411293017039759, + "grad_norm": 0.25635507702827454, + "learning_rate": 3.5915406443803016e-05, + "loss": 0.1057, + "num_input_tokens_seen": 91918272, + "step": 75525 + }, + { + "epoch": 8.411849871923376, + "grad_norm": 0.13678254187107086, + "learning_rate": 3.591322048388612e-05, + "loss": 0.0156, + "num_input_tokens_seen": 91924480, + "step": 75530 + }, + { + "epoch": 8.412406726806994, + "grad_norm": 0.02267356403172016, + "learning_rate": 3.591103442088511e-05, + "loss": 0.0052, + "num_input_tokens_seen": 91930720, + "step": 75535 + }, + { + "epoch": 8.412963581690612, + "grad_norm": 1.2775651216506958, + "learning_rate": 3.590884825482064e-05, + "loss": 0.1128, + "num_input_tokens_seen": 91936608, + "step": 75540 + }, + { + "epoch": 8.41352043657423, + "grad_norm": 0.00033675943268463016, + "learning_rate": 3.5906661985713355e-05, + "loss": 0.0312, + "num_input_tokens_seen": 91942784, + "step": 75545 + }, + { + "epoch": 8.414077291457847, + "grad_norm": 0.016610028222203255, + "learning_rate": 3.5904475613583924e-05, + "loss": 0.0038, + "num_input_tokens_seen": 91948992, + "step": 75550 + }, + { + "epoch": 8.414634146341463, + "grad_norm": 0.30552437901496887, + "learning_rate": 3.590228913845297e-05, + "loss": 0.017, + "num_input_tokens_seen": 91955136, + "step": 75555 + }, + { + "epoch": 8.41519100122508, + "grad_norm": 0.12363552302122116, + "learning_rate": 3.5900102560341166e-05, + "loss": 0.0154, + "num_input_tokens_seen": 91961280, + "step": 75560 + }, + { + "epoch": 8.415747856108698, + "grad_norm": 0.04469645395874977, + "learning_rate": 3.589791587926916e-05, + "loss": 0.0194, + "num_input_tokens_seen": 91967488, + "step": 75565 + }, + { + "epoch": 8.416304710992316, + "grad_norm": 0.7536302208900452, + "learning_rate": 3.5895729095257605e-05, + "loss": 0.1043, + "num_input_tokens_seen": 91973632, + "step": 75570 + }, + { + "epoch": 8.416861565875934, + "grad_norm": 0.16004124283790588, + "learning_rate": 3.5893542208327156e-05, + "loss": 0.0324, + "num_input_tokens_seen": 91979200, + "step": 75575 + }, + { + "epoch": 8.41741842075955, + "grad_norm": 1.2966235876083374, + "learning_rate": 3.5891355218498475e-05, + "loss": 0.0749, + "num_input_tokens_seen": 91985376, + "step": 75580 + }, + { + "epoch": 8.417975275643167, + "grad_norm": 2.0434138774871826, + "learning_rate": 3.5889168125792213e-05, + "loss": 0.1515, + "num_input_tokens_seen": 91991168, + "step": 75585 + }, + { + "epoch": 8.418532130526785, + "grad_norm": 0.003968649543821812, + "learning_rate": 3.588698093022905e-05, + "loss": 0.1156, + "num_input_tokens_seen": 91997152, + "step": 75590 + }, + { + "epoch": 8.419088985410403, + "grad_norm": 0.014982433058321476, + "learning_rate": 3.588479363182962e-05, + "loss": 0.0569, + "num_input_tokens_seen": 92003264, + "step": 75595 + }, + { + "epoch": 8.41964584029402, + "grad_norm": 0.01696762442588806, + "learning_rate": 3.588260623061459e-05, + "loss": 0.0401, + "num_input_tokens_seen": 92009504, + "step": 75600 + }, + { + "epoch": 8.420202695177636, + "grad_norm": 0.24658973515033722, + "learning_rate": 3.588041872660463e-05, + "loss": 0.0473, + "num_input_tokens_seen": 92015744, + "step": 75605 + }, + { + "epoch": 8.420759550061254, + "grad_norm": 0.013021917082369328, + "learning_rate": 3.5878231119820396e-05, + "loss": 0.0084, + "num_input_tokens_seen": 92021600, + "step": 75610 + }, + { + "epoch": 8.421316404944871, + "grad_norm": 0.06957782059907913, + "learning_rate": 3.587604341028255e-05, + "loss": 0.0416, + "num_input_tokens_seen": 92027264, + "step": 75615 + }, + { + "epoch": 8.421873259828489, + "grad_norm": 0.06704530864953995, + "learning_rate": 3.5873855598011765e-05, + "loss": 0.0484, + "num_input_tokens_seen": 92033568, + "step": 75620 + }, + { + "epoch": 8.422430114712107, + "grad_norm": 0.8277367353439331, + "learning_rate": 3.58716676830287e-05, + "loss": 0.0515, + "num_input_tokens_seen": 92039520, + "step": 75625 + }, + { + "epoch": 8.422986969595723, + "grad_norm": 0.0003839671553578228, + "learning_rate": 3.5869479665354025e-05, + "loss": 0.0055, + "num_input_tokens_seen": 92045632, + "step": 75630 + }, + { + "epoch": 8.42354382447934, + "grad_norm": 1.2065227031707764, + "learning_rate": 3.5867291545008405e-05, + "loss": 0.0818, + "num_input_tokens_seen": 92051776, + "step": 75635 + }, + { + "epoch": 8.424100679362958, + "grad_norm": 0.0021129329688847065, + "learning_rate": 3.586510332201251e-05, + "loss": 0.0505, + "num_input_tokens_seen": 92058016, + "step": 75640 + }, + { + "epoch": 8.424657534246576, + "grad_norm": 0.19339965283870697, + "learning_rate": 3.586291499638701e-05, + "loss": 0.0107, + "num_input_tokens_seen": 92064256, + "step": 75645 + }, + { + "epoch": 8.425214389130193, + "grad_norm": 2.0649497509002686, + "learning_rate": 3.586072656815257e-05, + "loss": 0.1177, + "num_input_tokens_seen": 92070272, + "step": 75650 + }, + { + "epoch": 8.42577124401381, + "grad_norm": 0.38835209608078003, + "learning_rate": 3.5858538037329866e-05, + "loss": 0.0568, + "num_input_tokens_seen": 92076352, + "step": 75655 + }, + { + "epoch": 8.426328098897427, + "grad_norm": 0.11840806901454926, + "learning_rate": 3.585634940393958e-05, + "loss": 0.0308, + "num_input_tokens_seen": 92082752, + "step": 75660 + }, + { + "epoch": 8.426884953781045, + "grad_norm": 0.2022095024585724, + "learning_rate": 3.5854160668002366e-05, + "loss": 0.0081, + "num_input_tokens_seen": 92088960, + "step": 75665 + }, + { + "epoch": 8.427441808664662, + "grad_norm": 0.2871796786785126, + "learning_rate": 3.585197182953892e-05, + "loss": 0.0103, + "num_input_tokens_seen": 92095104, + "step": 75670 + }, + { + "epoch": 8.42799866354828, + "grad_norm": 0.000962807796895504, + "learning_rate": 3.584978288856989e-05, + "loss": 0.0783, + "num_input_tokens_seen": 92100608, + "step": 75675 + }, + { + "epoch": 8.428555518431896, + "grad_norm": 1.9872050285339355, + "learning_rate": 3.5847593845115976e-05, + "loss": 0.1039, + "num_input_tokens_seen": 92106592, + "step": 75680 + }, + { + "epoch": 8.429112373315514, + "grad_norm": 1.3507312536239624, + "learning_rate": 3.584540469919785e-05, + "loss": 0.0202, + "num_input_tokens_seen": 92112512, + "step": 75685 + }, + { + "epoch": 8.429669228199131, + "grad_norm": 0.00013211637269705534, + "learning_rate": 3.584321545083619e-05, + "loss": 0.0491, + "num_input_tokens_seen": 92118176, + "step": 75690 + }, + { + "epoch": 8.430226083082749, + "grad_norm": 0.00017269901582039893, + "learning_rate": 3.584102610005167e-05, + "loss": 0.0194, + "num_input_tokens_seen": 92124192, + "step": 75695 + }, + { + "epoch": 8.430782937966367, + "grad_norm": 0.0030489761848002672, + "learning_rate": 3.5838836646864965e-05, + "loss": 0.0443, + "num_input_tokens_seen": 92130336, + "step": 75700 + }, + { + "epoch": 8.431339792849982, + "grad_norm": 0.12138915061950684, + "learning_rate": 3.5836647091296766e-05, + "loss": 0.025, + "num_input_tokens_seen": 92135744, + "step": 75705 + }, + { + "epoch": 8.4318966477336, + "grad_norm": 0.4051373302936554, + "learning_rate": 3.583445743336776e-05, + "loss": 0.0713, + "num_input_tokens_seen": 92141536, + "step": 75710 + }, + { + "epoch": 8.432453502617218, + "grad_norm": 3.198485851287842, + "learning_rate": 3.5832267673098617e-05, + "loss": 0.2328, + "num_input_tokens_seen": 92147616, + "step": 75715 + }, + { + "epoch": 8.433010357500835, + "grad_norm": 0.0014427803689613938, + "learning_rate": 3.583007781051003e-05, + "loss": 0.2517, + "num_input_tokens_seen": 92152864, + "step": 75720 + }, + { + "epoch": 8.433567212384453, + "grad_norm": 0.04365462809801102, + "learning_rate": 3.582788784562268e-05, + "loss": 0.0252, + "num_input_tokens_seen": 92159296, + "step": 75725 + }, + { + "epoch": 8.434124067268069, + "grad_norm": 0.030876444652676582, + "learning_rate": 3.582569777845726e-05, + "loss": 0.0422, + "num_input_tokens_seen": 92165504, + "step": 75730 + }, + { + "epoch": 8.434680922151687, + "grad_norm": 0.001109662582166493, + "learning_rate": 3.582350760903444e-05, + "loss": 0.0059, + "num_input_tokens_seen": 92171616, + "step": 75735 + }, + { + "epoch": 8.435237777035304, + "grad_norm": 0.04501520097255707, + "learning_rate": 3.5821317337374926e-05, + "loss": 0.0572, + "num_input_tokens_seen": 92177536, + "step": 75740 + }, + { + "epoch": 8.435794631918922, + "grad_norm": 0.3832073211669922, + "learning_rate": 3.5819126963499397e-05, + "loss": 0.0158, + "num_input_tokens_seen": 92183488, + "step": 75745 + }, + { + "epoch": 8.43635148680254, + "grad_norm": 0.01483682356774807, + "learning_rate": 3.581693648742855e-05, + "loss": 0.0186, + "num_input_tokens_seen": 92189600, + "step": 75750 + }, + { + "epoch": 8.436908341686157, + "grad_norm": 1.2946462631225586, + "learning_rate": 3.5814745909183066e-05, + "loss": 0.0999, + "num_input_tokens_seen": 92195680, + "step": 75755 + }, + { + "epoch": 8.437465196569773, + "grad_norm": 0.30417537689208984, + "learning_rate": 3.581255522878365e-05, + "loss": 0.0102, + "num_input_tokens_seen": 92201728, + "step": 75760 + }, + { + "epoch": 8.438022051453391, + "grad_norm": 0.5330246090888977, + "learning_rate": 3.581036444625098e-05, + "loss": 0.0698, + "num_input_tokens_seen": 92207872, + "step": 75765 + }, + { + "epoch": 8.438578906337009, + "grad_norm": 0.44166135787963867, + "learning_rate": 3.5808173561605755e-05, + "loss": 0.2541, + "num_input_tokens_seen": 92213888, + "step": 75770 + }, + { + "epoch": 8.439135761220626, + "grad_norm": 0.006210738327354193, + "learning_rate": 3.580598257486867e-05, + "loss": 0.0925, + "num_input_tokens_seen": 92220032, + "step": 75775 + }, + { + "epoch": 8.439692616104244, + "grad_norm": 0.2665122449398041, + "learning_rate": 3.580379148606043e-05, + "loss": 0.0245, + "num_input_tokens_seen": 92225728, + "step": 75780 + }, + { + "epoch": 8.44024947098786, + "grad_norm": 0.3303435444831848, + "learning_rate": 3.580160029520173e-05, + "loss": 0.0087, + "num_input_tokens_seen": 92231808, + "step": 75785 + }, + { + "epoch": 8.440806325871478, + "grad_norm": 1.5144835710525513, + "learning_rate": 3.579940900231325e-05, + "loss": 0.1385, + "num_input_tokens_seen": 92238112, + "step": 75790 + }, + { + "epoch": 8.441363180755095, + "grad_norm": 1.705522060394287, + "learning_rate": 3.57972176074157e-05, + "loss": 0.1258, + "num_input_tokens_seen": 92243968, + "step": 75795 + }, + { + "epoch": 8.441920035638713, + "grad_norm": 1.5571702718734741, + "learning_rate": 3.5795026110529786e-05, + "loss": 0.0565, + "num_input_tokens_seen": 92250080, + "step": 75800 + }, + { + "epoch": 8.44247689052233, + "grad_norm": 1.7935528755187988, + "learning_rate": 3.5792834511676185e-05, + "loss": 0.2911, + "num_input_tokens_seen": 92256032, + "step": 75805 + }, + { + "epoch": 8.443033745405947, + "grad_norm": 0.6771458387374878, + "learning_rate": 3.579064281087563e-05, + "loss": 0.0268, + "num_input_tokens_seen": 92262336, + "step": 75810 + }, + { + "epoch": 8.443590600289564, + "grad_norm": 0.20377010107040405, + "learning_rate": 3.5788451008148807e-05, + "loss": 0.0698, + "num_input_tokens_seen": 92268448, + "step": 75815 + }, + { + "epoch": 8.444147455173182, + "grad_norm": 0.5799220204353333, + "learning_rate": 3.578625910351641e-05, + "loss": 0.0469, + "num_input_tokens_seen": 92274528, + "step": 75820 + }, + { + "epoch": 8.4447043100568, + "grad_norm": 1.185512900352478, + "learning_rate": 3.578406709699917e-05, + "loss": 0.0504, + "num_input_tokens_seen": 92280928, + "step": 75825 + }, + { + "epoch": 8.445261164940417, + "grad_norm": 0.0011446095304563642, + "learning_rate": 3.578187498861776e-05, + "loss": 0.0062, + "num_input_tokens_seen": 92287328, + "step": 75830 + }, + { + "epoch": 8.445818019824033, + "grad_norm": 1.1290870904922485, + "learning_rate": 3.577968277839292e-05, + "loss": 0.0719, + "num_input_tokens_seen": 92293472, + "step": 75835 + }, + { + "epoch": 8.44637487470765, + "grad_norm": 2.1724789142608643, + "learning_rate": 3.5777490466345326e-05, + "loss": 0.1564, + "num_input_tokens_seen": 92299648, + "step": 75840 + }, + { + "epoch": 8.446931729591268, + "grad_norm": 0.48994356393814087, + "learning_rate": 3.5775298052495704e-05, + "loss": 0.0709, + "num_input_tokens_seen": 92304960, + "step": 75845 + }, + { + "epoch": 8.447488584474886, + "grad_norm": 1.3605170249938965, + "learning_rate": 3.577310553686476e-05, + "loss": 0.0795, + "num_input_tokens_seen": 92311040, + "step": 75850 + }, + { + "epoch": 8.448045439358504, + "grad_norm": 0.0023985577281564474, + "learning_rate": 3.57709129194732e-05, + "loss": 0.0018, + "num_input_tokens_seen": 92317248, + "step": 75855 + }, + { + "epoch": 8.44860229424212, + "grad_norm": 0.18681077659130096, + "learning_rate": 3.576872020034174e-05, + "loss": 0.0188, + "num_input_tokens_seen": 92323488, + "step": 75860 + }, + { + "epoch": 8.449159149125737, + "grad_norm": 0.7362186908721924, + "learning_rate": 3.576652737949109e-05, + "loss": 0.0502, + "num_input_tokens_seen": 92329760, + "step": 75865 + }, + { + "epoch": 8.449716004009355, + "grad_norm": 1.6988160610198975, + "learning_rate": 3.5764334456941965e-05, + "loss": 0.082, + "num_input_tokens_seen": 92335808, + "step": 75870 + }, + { + "epoch": 8.450272858892973, + "grad_norm": 0.013836425729095936, + "learning_rate": 3.5762141432715075e-05, + "loss": 0.0315, + "num_input_tokens_seen": 92342048, + "step": 75875 + }, + { + "epoch": 8.45082971377659, + "grad_norm": 0.015020946972072124, + "learning_rate": 3.5759948306831133e-05, + "loss": 0.0586, + "num_input_tokens_seen": 92348256, + "step": 75880 + }, + { + "epoch": 8.451386568660206, + "grad_norm": 0.07743316143751144, + "learning_rate": 3.575775507931086e-05, + "loss": 0.0406, + "num_input_tokens_seen": 92354432, + "step": 75885 + }, + { + "epoch": 8.451943423543824, + "grad_norm": 0.0006995685980655253, + "learning_rate": 3.5755561750174974e-05, + "loss": 0.1245, + "num_input_tokens_seen": 92360416, + "step": 75890 + }, + { + "epoch": 8.452500278427442, + "grad_norm": 0.6469337940216064, + "learning_rate": 3.575336831944419e-05, + "loss": 0.0216, + "num_input_tokens_seen": 92366656, + "step": 75895 + }, + { + "epoch": 8.45305713331106, + "grad_norm": 0.0029452056623995304, + "learning_rate": 3.575117478713923e-05, + "loss": 0.0663, + "num_input_tokens_seen": 92372928, + "step": 75900 + }, + { + "epoch": 8.453613988194677, + "grad_norm": 0.5394361615180969, + "learning_rate": 3.57489811532808e-05, + "loss": 0.0372, + "num_input_tokens_seen": 92379360, + "step": 75905 + }, + { + "epoch": 8.454170843078295, + "grad_norm": 0.046146947890520096, + "learning_rate": 3.574678741788964e-05, + "loss": 0.004, + "num_input_tokens_seen": 92385408, + "step": 75910 + }, + { + "epoch": 8.45472769796191, + "grad_norm": 0.2808259129524231, + "learning_rate": 3.5744593580986455e-05, + "loss": 0.029, + "num_input_tokens_seen": 92391712, + "step": 75915 + }, + { + "epoch": 8.455284552845528, + "grad_norm": 1.3396379947662354, + "learning_rate": 3.574239964259199e-05, + "loss": 0.0509, + "num_input_tokens_seen": 92397824, + "step": 75920 + }, + { + "epoch": 8.455841407729146, + "grad_norm": 0.008482544682919979, + "learning_rate": 3.574020560272694e-05, + "loss": 0.0044, + "num_input_tokens_seen": 92403808, + "step": 75925 + }, + { + "epoch": 8.456398262612764, + "grad_norm": 1.221948504447937, + "learning_rate": 3.573801146141204e-05, + "loss": 0.1527, + "num_input_tokens_seen": 92409856, + "step": 75930 + }, + { + "epoch": 8.456955117496381, + "grad_norm": 1.3741519451141357, + "learning_rate": 3.573581721866803e-05, + "loss": 0.1535, + "num_input_tokens_seen": 92416224, + "step": 75935 + }, + { + "epoch": 8.457511972379997, + "grad_norm": 0.19003543257713318, + "learning_rate": 3.5733622874515615e-05, + "loss": 0.0276, + "num_input_tokens_seen": 92422048, + "step": 75940 + }, + { + "epoch": 8.458068827263615, + "grad_norm": 0.04676893725991249, + "learning_rate": 3.5731428428975545e-05, + "loss": 0.092, + "num_input_tokens_seen": 92428128, + "step": 75945 + }, + { + "epoch": 8.458625682147233, + "grad_norm": 0.8253921866416931, + "learning_rate": 3.572923388206853e-05, + "loss": 0.0708, + "num_input_tokens_seen": 92434304, + "step": 75950 + }, + { + "epoch": 8.45918253703085, + "grad_norm": 0.03264709934592247, + "learning_rate": 3.57270392338153e-05, + "loss": 0.0265, + "num_input_tokens_seen": 92440192, + "step": 75955 + }, + { + "epoch": 8.459739391914468, + "grad_norm": 0.6104171872138977, + "learning_rate": 3.5724844484236594e-05, + "loss": 0.0237, + "num_input_tokens_seen": 92446464, + "step": 75960 + }, + { + "epoch": 8.460296246798084, + "grad_norm": 0.0002358228957746178, + "learning_rate": 3.572264963335314e-05, + "loss": 0.0543, + "num_input_tokens_seen": 92452736, + "step": 75965 + }, + { + "epoch": 8.460853101681701, + "grad_norm": 0.29517170786857605, + "learning_rate": 3.572045468118567e-05, + "loss": 0.0458, + "num_input_tokens_seen": 92458976, + "step": 75970 + }, + { + "epoch": 8.46140995656532, + "grad_norm": 0.24051257967948914, + "learning_rate": 3.5718259627754916e-05, + "loss": 0.0093, + "num_input_tokens_seen": 92464768, + "step": 75975 + }, + { + "epoch": 8.461966811448937, + "grad_norm": 0.00029471819289028645, + "learning_rate": 3.5716064473081606e-05, + "loss": 0.1034, + "num_input_tokens_seen": 92471200, + "step": 75980 + }, + { + "epoch": 8.462523666332554, + "grad_norm": 0.01217821054160595, + "learning_rate": 3.571386921718649e-05, + "loss": 0.0787, + "num_input_tokens_seen": 92477344, + "step": 75985 + }, + { + "epoch": 8.46308052121617, + "grad_norm": 0.5227912068367004, + "learning_rate": 3.571167386009029e-05, + "loss": 0.0942, + "num_input_tokens_seen": 92483488, + "step": 75990 + }, + { + "epoch": 8.463637376099788, + "grad_norm": 0.3071081340312958, + "learning_rate": 3.5709478401813756e-05, + "loss": 0.0194, + "num_input_tokens_seen": 92489664, + "step": 75995 + }, + { + "epoch": 8.464194230983406, + "grad_norm": 1.1614649295806885, + "learning_rate": 3.570728284237761e-05, + "loss": 0.056, + "num_input_tokens_seen": 92495680, + "step": 76000 + }, + { + "epoch": 8.464751085867023, + "grad_norm": 0.0022105826064944267, + "learning_rate": 3.57050871818026e-05, + "loss": 0.0026, + "num_input_tokens_seen": 92501920, + "step": 76005 + }, + { + "epoch": 8.465307940750641, + "grad_norm": 0.7345966100692749, + "learning_rate": 3.5702891420109465e-05, + "loss": 0.0345, + "num_input_tokens_seen": 92508128, + "step": 76010 + }, + { + "epoch": 8.465864795634257, + "grad_norm": 0.06955702602863312, + "learning_rate": 3.570069555731895e-05, + "loss": 0.0052, + "num_input_tokens_seen": 92513952, + "step": 76015 + }, + { + "epoch": 8.466421650517875, + "grad_norm": 0.023463178426027298, + "learning_rate": 3.569849959345179e-05, + "loss": 0.0017, + "num_input_tokens_seen": 92520128, + "step": 76020 + }, + { + "epoch": 8.466978505401492, + "grad_norm": 0.00020995058002881706, + "learning_rate": 3.5696303528528726e-05, + "loss": 0.0236, + "num_input_tokens_seen": 92525984, + "step": 76025 + }, + { + "epoch": 8.46753536028511, + "grad_norm": 0.05029568448662758, + "learning_rate": 3.569410736257051e-05, + "loss": 0.2197, + "num_input_tokens_seen": 92531776, + "step": 76030 + }, + { + "epoch": 8.468092215168728, + "grad_norm": 0.9905053973197937, + "learning_rate": 3.569191109559788e-05, + "loss": 0.063, + "num_input_tokens_seen": 92537920, + "step": 76035 + }, + { + "epoch": 8.468649070052344, + "grad_norm": 0.35471057891845703, + "learning_rate": 3.568971472763159e-05, + "loss": 0.0854, + "num_input_tokens_seen": 92544000, + "step": 76040 + }, + { + "epoch": 8.469205924935961, + "grad_norm": 0.018911320716142654, + "learning_rate": 3.568751825869236e-05, + "loss": 0.0058, + "num_input_tokens_seen": 92550144, + "step": 76045 + }, + { + "epoch": 8.469762779819579, + "grad_norm": 0.26840928196907043, + "learning_rate": 3.568532168880098e-05, + "loss": 0.0514, + "num_input_tokens_seen": 92556608, + "step": 76050 + }, + { + "epoch": 8.470319634703197, + "grad_norm": 1.574202060699463, + "learning_rate": 3.5683125017978165e-05, + "loss": 0.1325, + "num_input_tokens_seen": 92562912, + "step": 76055 + }, + { + "epoch": 8.470876489586814, + "grad_norm": 0.26308155059814453, + "learning_rate": 3.568092824624467e-05, + "loss": 0.1163, + "num_input_tokens_seen": 92569408, + "step": 76060 + }, + { + "epoch": 8.47143334447043, + "grad_norm": 0.3125017285346985, + "learning_rate": 3.5678731373621265e-05, + "loss": 0.0768, + "num_input_tokens_seen": 92575136, + "step": 76065 + }, + { + "epoch": 8.471990199354048, + "grad_norm": 0.4014940559864044, + "learning_rate": 3.567653440012868e-05, + "loss": 0.0356, + "num_input_tokens_seen": 92581280, + "step": 76070 + }, + { + "epoch": 8.472547054237666, + "grad_norm": 1.1393296718597412, + "learning_rate": 3.567433732578767e-05, + "loss": 0.0896, + "num_input_tokens_seen": 92587808, + "step": 76075 + }, + { + "epoch": 8.473103909121283, + "grad_norm": 0.07508274167776108, + "learning_rate": 3.5672140150618995e-05, + "loss": 0.0058, + "num_input_tokens_seen": 92593664, + "step": 76080 + }, + { + "epoch": 8.4736607640049, + "grad_norm": 0.0005962660070508718, + "learning_rate": 3.5669942874643406e-05, + "loss": 0.1, + "num_input_tokens_seen": 92599840, + "step": 76085 + }, + { + "epoch": 8.474217618888517, + "grad_norm": 0.0034261851105839014, + "learning_rate": 3.566774549788166e-05, + "loss": 0.0233, + "num_input_tokens_seen": 92605312, + "step": 76090 + }, + { + "epoch": 8.474774473772134, + "grad_norm": 0.309049516916275, + "learning_rate": 3.56655480203545e-05, + "loss": 0.0092, + "num_input_tokens_seen": 92611488, + "step": 76095 + }, + { + "epoch": 8.475331328655752, + "grad_norm": 0.09956606477499008, + "learning_rate": 3.56633504420827e-05, + "loss": 0.1097, + "num_input_tokens_seen": 92617504, + "step": 76100 + }, + { + "epoch": 8.47588818353937, + "grad_norm": 0.9588139057159424, + "learning_rate": 3.5661152763087014e-05, + "loss": 0.0522, + "num_input_tokens_seen": 92623456, + "step": 76105 + }, + { + "epoch": 8.476445038422987, + "grad_norm": 1.1731888055801392, + "learning_rate": 3.5658954983388195e-05, + "loss": 0.1007, + "num_input_tokens_seen": 92630016, + "step": 76110 + }, + { + "epoch": 8.477001893306605, + "grad_norm": 0.014481138437986374, + "learning_rate": 3.5656757103007016e-05, + "loss": 0.0726, + "num_input_tokens_seen": 92636256, + "step": 76115 + }, + { + "epoch": 8.477558748190221, + "grad_norm": 0.01870655082166195, + "learning_rate": 3.5654559121964224e-05, + "loss": 0.0541, + "num_input_tokens_seen": 92642304, + "step": 76120 + }, + { + "epoch": 8.478115603073839, + "grad_norm": 0.0002795278269331902, + "learning_rate": 3.565236104028058e-05, + "loss": 0.0131, + "num_input_tokens_seen": 92648416, + "step": 76125 + }, + { + "epoch": 8.478672457957456, + "grad_norm": 0.14080575108528137, + "learning_rate": 3.565016285797685e-05, + "loss": 0.1115, + "num_input_tokens_seen": 92654496, + "step": 76130 + }, + { + "epoch": 8.479229312841074, + "grad_norm": 0.1462484747171402, + "learning_rate": 3.5647964575073805e-05, + "loss": 0.0268, + "num_input_tokens_seen": 92660192, + "step": 76135 + }, + { + "epoch": 8.479786167724692, + "grad_norm": 0.7774803638458252, + "learning_rate": 3.56457661915922e-05, + "loss": 0.0538, + "num_input_tokens_seen": 92666208, + "step": 76140 + }, + { + "epoch": 8.480343022608308, + "grad_norm": 0.20948395133018494, + "learning_rate": 3.5643567707552806e-05, + "loss": 0.0054, + "num_input_tokens_seen": 92672384, + "step": 76145 + }, + { + "epoch": 8.480899877491925, + "grad_norm": 2.1507785320281982, + "learning_rate": 3.5641369122976386e-05, + "loss": 0.1126, + "num_input_tokens_seen": 92678336, + "step": 76150 + }, + { + "epoch": 8.481456732375543, + "grad_norm": 0.14892715215682983, + "learning_rate": 3.563917043788371e-05, + "loss": 0.0414, + "num_input_tokens_seen": 92684320, + "step": 76155 + }, + { + "epoch": 8.48201358725916, + "grad_norm": 0.663057267665863, + "learning_rate": 3.5636971652295545e-05, + "loss": 0.0549, + "num_input_tokens_seen": 92690464, + "step": 76160 + }, + { + "epoch": 8.482570442142778, + "grad_norm": 0.017388449981808662, + "learning_rate": 3.563477276623266e-05, + "loss": 0.0185, + "num_input_tokens_seen": 92696768, + "step": 76165 + }, + { + "epoch": 8.483127297026394, + "grad_norm": 0.001124390633776784, + "learning_rate": 3.563257377971583e-05, + "loss": 0.0252, + "num_input_tokens_seen": 92702848, + "step": 76170 + }, + { + "epoch": 8.483684151910012, + "grad_norm": 0.554655134677887, + "learning_rate": 3.563037469276582e-05, + "loss": 0.0772, + "num_input_tokens_seen": 92709312, + "step": 76175 + }, + { + "epoch": 8.48424100679363, + "grad_norm": 0.011884896084666252, + "learning_rate": 3.56281755054034e-05, + "loss": 0.0049, + "num_input_tokens_seen": 92715616, + "step": 76180 + }, + { + "epoch": 8.484797861677247, + "grad_norm": 0.0005181334563530982, + "learning_rate": 3.5625976217649346e-05, + "loss": 0.0415, + "num_input_tokens_seen": 92721984, + "step": 76185 + }, + { + "epoch": 8.485354716560865, + "grad_norm": 0.6351130604743958, + "learning_rate": 3.562377682952444e-05, + "loss": 0.0092, + "num_input_tokens_seen": 92728352, + "step": 76190 + }, + { + "epoch": 8.48591157144448, + "grad_norm": 0.5773206353187561, + "learning_rate": 3.562157734104945e-05, + "loss": 0.1146, + "num_input_tokens_seen": 92734592, + "step": 76195 + }, + { + "epoch": 8.486468426328099, + "grad_norm": 0.0004210720071569085, + "learning_rate": 3.5619377752245146e-05, + "loss": 0.0373, + "num_input_tokens_seen": 92740640, + "step": 76200 + }, + { + "epoch": 8.487025281211716, + "grad_norm": 0.16818392276763916, + "learning_rate": 3.5617178063132315e-05, + "loss": 0.1124, + "num_input_tokens_seen": 92747040, + "step": 76205 + }, + { + "epoch": 8.487582136095334, + "grad_norm": 0.0018832621863111854, + "learning_rate": 3.561497827373173e-05, + "loss": 0.0502, + "num_input_tokens_seen": 92752992, + "step": 76210 + }, + { + "epoch": 8.488138990978952, + "grad_norm": 2.0118508338928223, + "learning_rate": 3.561277838406417e-05, + "loss": 0.1255, + "num_input_tokens_seen": 92759168, + "step": 76215 + }, + { + "epoch": 8.488695845862567, + "grad_norm": 0.0005646994686685503, + "learning_rate": 3.561057839415042e-05, + "loss": 0.0468, + "num_input_tokens_seen": 92765504, + "step": 76220 + }, + { + "epoch": 8.489252700746185, + "grad_norm": 0.07594478130340576, + "learning_rate": 3.560837830401125e-05, + "loss": 0.1591, + "num_input_tokens_seen": 92771392, + "step": 76225 + }, + { + "epoch": 8.489809555629803, + "grad_norm": 1.5947755575180054, + "learning_rate": 3.5606178113667455e-05, + "loss": 0.0343, + "num_input_tokens_seen": 92777504, + "step": 76230 + }, + { + "epoch": 8.49036641051342, + "grad_norm": 0.08782628923654556, + "learning_rate": 3.56039778231398e-05, + "loss": 0.0875, + "num_input_tokens_seen": 92783424, + "step": 76235 + }, + { + "epoch": 8.490923265397038, + "grad_norm": 0.6642929911613464, + "learning_rate": 3.560177743244908e-05, + "loss": 0.0109, + "num_input_tokens_seen": 92789600, + "step": 76240 + }, + { + "epoch": 8.491480120280654, + "grad_norm": 0.05046428367495537, + "learning_rate": 3.5599576941616087e-05, + "loss": 0.0343, + "num_input_tokens_seen": 92795552, + "step": 76245 + }, + { + "epoch": 8.492036975164272, + "grad_norm": 1.2171279191970825, + "learning_rate": 3.5597376350661584e-05, + "loss": 0.0802, + "num_input_tokens_seen": 92801664, + "step": 76250 + }, + { + "epoch": 8.49259383004789, + "grad_norm": 0.02822597324848175, + "learning_rate": 3.559517565960639e-05, + "loss": 0.0027, + "num_input_tokens_seen": 92807968, + "step": 76255 + }, + { + "epoch": 8.493150684931507, + "grad_norm": 0.10876735299825668, + "learning_rate": 3.5592974868471255e-05, + "loss": 0.0643, + "num_input_tokens_seen": 92813376, + "step": 76260 + }, + { + "epoch": 8.493707539815125, + "grad_norm": 0.19190151989459991, + "learning_rate": 3.5590773977277e-05, + "loss": 0.01, + "num_input_tokens_seen": 92819392, + "step": 76265 + }, + { + "epoch": 8.494264394698742, + "grad_norm": 0.24201014637947083, + "learning_rate": 3.5588572986044386e-05, + "loss": 0.0166, + "num_input_tokens_seen": 92825312, + "step": 76270 + }, + { + "epoch": 8.494821249582358, + "grad_norm": 0.2522542476654053, + "learning_rate": 3.5586371894794224e-05, + "loss": 0.0321, + "num_input_tokens_seen": 92831328, + "step": 76275 + }, + { + "epoch": 8.495378104465976, + "grad_norm": 0.00017080327961593866, + "learning_rate": 3.55841707035473e-05, + "loss": 0.0254, + "num_input_tokens_seen": 92837600, + "step": 76280 + }, + { + "epoch": 8.495934959349594, + "grad_norm": 0.0007013189606368542, + "learning_rate": 3.55819694123244e-05, + "loss": 0.0325, + "num_input_tokens_seen": 92843680, + "step": 76285 + }, + { + "epoch": 8.496491814233211, + "grad_norm": 0.6723465323448181, + "learning_rate": 3.557976802114632e-05, + "loss": 0.0481, + "num_input_tokens_seen": 92849568, + "step": 76290 + }, + { + "epoch": 8.497048669116829, + "grad_norm": 0.09912192076444626, + "learning_rate": 3.557756653003386e-05, + "loss": 0.0031, + "num_input_tokens_seen": 92855648, + "step": 76295 + }, + { + "epoch": 8.497605524000445, + "grad_norm": 0.6573255658149719, + "learning_rate": 3.55753649390078e-05, + "loss": 0.0361, + "num_input_tokens_seen": 92861920, + "step": 76300 + }, + { + "epoch": 8.498162378884063, + "grad_norm": 1.2820955514907837, + "learning_rate": 3.5573163248088956e-05, + "loss": 0.0634, + "num_input_tokens_seen": 92868128, + "step": 76305 + }, + { + "epoch": 8.49871923376768, + "grad_norm": 0.09802009165287018, + "learning_rate": 3.557096145729811e-05, + "loss": 0.0546, + "num_input_tokens_seen": 92874240, + "step": 76310 + }, + { + "epoch": 8.499276088651298, + "grad_norm": 0.9092124700546265, + "learning_rate": 3.556875956665606e-05, + "loss": 0.0992, + "num_input_tokens_seen": 92880256, + "step": 76315 + }, + { + "epoch": 8.499832943534916, + "grad_norm": 0.1584026962518692, + "learning_rate": 3.5566557576183613e-05, + "loss": 0.0027, + "num_input_tokens_seen": 92886432, + "step": 76320 + }, + { + "epoch": 8.500389798418531, + "grad_norm": 0.9143911600112915, + "learning_rate": 3.556435548590156e-05, + "loss": 0.0893, + "num_input_tokens_seen": 92892704, + "step": 76325 + }, + { + "epoch": 8.50094665330215, + "grad_norm": 0.0014242853503674269, + "learning_rate": 3.556215329583071e-05, + "loss": 0.0203, + "num_input_tokens_seen": 92899232, + "step": 76330 + }, + { + "epoch": 8.501503508185767, + "grad_norm": 0.0071327174082398415, + "learning_rate": 3.5559951005991854e-05, + "loss": 0.0053, + "num_input_tokens_seen": 92905312, + "step": 76335 + }, + { + "epoch": 8.502060363069385, + "grad_norm": 0.005401762202382088, + "learning_rate": 3.55577486164058e-05, + "loss": 0.1072, + "num_input_tokens_seen": 92910592, + "step": 76340 + }, + { + "epoch": 8.502617217953002, + "grad_norm": 0.18009741604328156, + "learning_rate": 3.555554612709336e-05, + "loss": 0.0894, + "num_input_tokens_seen": 92916736, + "step": 76345 + }, + { + "epoch": 8.503174072836618, + "grad_norm": 0.014028016477823257, + "learning_rate": 3.555334353807533e-05, + "loss": 0.0889, + "num_input_tokens_seen": 92923072, + "step": 76350 + }, + { + "epoch": 8.503730927720236, + "grad_norm": 0.4279289245605469, + "learning_rate": 3.555114084937251e-05, + "loss": 0.0134, + "num_input_tokens_seen": 92929344, + "step": 76355 + }, + { + "epoch": 8.504287782603853, + "grad_norm": 1.0634245872497559, + "learning_rate": 3.554893806100571e-05, + "loss": 0.0546, + "num_input_tokens_seen": 92935520, + "step": 76360 + }, + { + "epoch": 8.504844637487471, + "grad_norm": 0.05566752701997757, + "learning_rate": 3.554673517299574e-05, + "loss": 0.1053, + "num_input_tokens_seen": 92941472, + "step": 76365 + }, + { + "epoch": 8.505401492371089, + "grad_norm": 2.306236982345581, + "learning_rate": 3.55445321853634e-05, + "loss": 0.0637, + "num_input_tokens_seen": 92947616, + "step": 76370 + }, + { + "epoch": 8.505958347254705, + "grad_norm": 0.24198351800441742, + "learning_rate": 3.5542329098129525e-05, + "loss": 0.0655, + "num_input_tokens_seen": 92953920, + "step": 76375 + }, + { + "epoch": 8.506515202138322, + "grad_norm": 1.6400092840194702, + "learning_rate": 3.5540125911314885e-05, + "loss": 0.0601, + "num_input_tokens_seen": 92959776, + "step": 76380 + }, + { + "epoch": 8.50707205702194, + "grad_norm": 0.27617326378822327, + "learning_rate": 3.5537922624940316e-05, + "loss": 0.0867, + "num_input_tokens_seen": 92966048, + "step": 76385 + }, + { + "epoch": 8.507628911905558, + "grad_norm": 0.00021538103464990854, + "learning_rate": 3.553571923902663e-05, + "loss": 0.0214, + "num_input_tokens_seen": 92972160, + "step": 76390 + }, + { + "epoch": 8.508185766789175, + "grad_norm": 0.5992416739463806, + "learning_rate": 3.553351575359463e-05, + "loss": 0.138, + "num_input_tokens_seen": 92977856, + "step": 76395 + }, + { + "epoch": 8.508742621672791, + "grad_norm": 0.00023880063963588327, + "learning_rate": 3.553131216866514e-05, + "loss": 0.0233, + "num_input_tokens_seen": 92984000, + "step": 76400 + }, + { + "epoch": 8.509299476556409, + "grad_norm": 0.05204107239842415, + "learning_rate": 3.552910848425896e-05, + "loss": 0.0117, + "num_input_tokens_seen": 92989952, + "step": 76405 + }, + { + "epoch": 8.509856331440027, + "grad_norm": 0.9317140579223633, + "learning_rate": 3.5526904700396926e-05, + "loss": 0.0442, + "num_input_tokens_seen": 92996032, + "step": 76410 + }, + { + "epoch": 8.510413186323644, + "grad_norm": 0.8236796259880066, + "learning_rate": 3.5524700817099835e-05, + "loss": 0.0514, + "num_input_tokens_seen": 93002176, + "step": 76415 + }, + { + "epoch": 8.510970041207262, + "grad_norm": 0.003154992824420333, + "learning_rate": 3.552249683438851e-05, + "loss": 0.0392, + "num_input_tokens_seen": 93008384, + "step": 76420 + }, + { + "epoch": 8.511526896090878, + "grad_norm": 0.1516825258731842, + "learning_rate": 3.552029275228378e-05, + "loss": 0.076, + "num_input_tokens_seen": 93014432, + "step": 76425 + }, + { + "epoch": 8.512083750974496, + "grad_norm": 0.5642212629318237, + "learning_rate": 3.551808857080645e-05, + "loss": 0.1256, + "num_input_tokens_seen": 93020576, + "step": 76430 + }, + { + "epoch": 8.512640605858113, + "grad_norm": 2.169097661972046, + "learning_rate": 3.551588428997735e-05, + "loss": 0.1129, + "num_input_tokens_seen": 93026464, + "step": 76435 + }, + { + "epoch": 8.513197460741731, + "grad_norm": 0.2222486287355423, + "learning_rate": 3.55136799098173e-05, + "loss": 0.0285, + "num_input_tokens_seen": 93032608, + "step": 76440 + }, + { + "epoch": 8.513754315625349, + "grad_norm": 0.884834349155426, + "learning_rate": 3.5511475430347115e-05, + "loss": 0.0717, + "num_input_tokens_seen": 93038432, + "step": 76445 + }, + { + "epoch": 8.514311170508964, + "grad_norm": 0.7839409708976746, + "learning_rate": 3.550927085158762e-05, + "loss": 0.0285, + "num_input_tokens_seen": 93044768, + "step": 76450 + }, + { + "epoch": 8.514868025392582, + "grad_norm": 0.4792003333568573, + "learning_rate": 3.5507066173559644e-05, + "loss": 0.019, + "num_input_tokens_seen": 93050784, + "step": 76455 + }, + { + "epoch": 8.5154248802762, + "grad_norm": 2.635573387145996, + "learning_rate": 3.550486139628402e-05, + "loss": 0.0633, + "num_input_tokens_seen": 93056672, + "step": 76460 + }, + { + "epoch": 8.515981735159817, + "grad_norm": 0.03535723313689232, + "learning_rate": 3.550265651978155e-05, + "loss": 0.0451, + "num_input_tokens_seen": 93062400, + "step": 76465 + }, + { + "epoch": 8.516538590043435, + "grad_norm": 0.1251194328069687, + "learning_rate": 3.550045154407309e-05, + "loss": 0.0476, + "num_input_tokens_seen": 93068608, + "step": 76470 + }, + { + "epoch": 8.517095444927053, + "grad_norm": 0.5675764083862305, + "learning_rate": 3.5498246469179435e-05, + "loss": 0.0176, + "num_input_tokens_seen": 93074464, + "step": 76475 + }, + { + "epoch": 8.517652299810669, + "grad_norm": 0.018799543380737305, + "learning_rate": 3.549604129512144e-05, + "loss": 0.0122, + "num_input_tokens_seen": 93079968, + "step": 76480 + }, + { + "epoch": 8.518209154694286, + "grad_norm": 0.1321871131658554, + "learning_rate": 3.5493836021919926e-05, + "loss": 0.0102, + "num_input_tokens_seen": 93086240, + "step": 76485 + }, + { + "epoch": 8.518766009577904, + "grad_norm": 0.0008282884955406189, + "learning_rate": 3.549163064959572e-05, + "loss": 0.0643, + "num_input_tokens_seen": 93092480, + "step": 76490 + }, + { + "epoch": 8.519322864461522, + "grad_norm": 0.23994188010692596, + "learning_rate": 3.548942517816966e-05, + "loss": 0.0444, + "num_input_tokens_seen": 93098784, + "step": 76495 + }, + { + "epoch": 8.51987971934514, + "grad_norm": 0.49050939083099365, + "learning_rate": 3.548721960766257e-05, + "loss": 0.0862, + "num_input_tokens_seen": 93104576, + "step": 76500 + }, + { + "epoch": 8.520436574228755, + "grad_norm": 1.5363832712173462, + "learning_rate": 3.5485013938095297e-05, + "loss": 0.0665, + "num_input_tokens_seen": 93110688, + "step": 76505 + }, + { + "epoch": 8.520993429112373, + "grad_norm": 0.012241293676197529, + "learning_rate": 3.5482808169488664e-05, + "loss": 0.0384, + "num_input_tokens_seen": 93116384, + "step": 76510 + }, + { + "epoch": 8.52155028399599, + "grad_norm": 1.6074917316436768, + "learning_rate": 3.54806023018635e-05, + "loss": 0.1771, + "num_input_tokens_seen": 93122816, + "step": 76515 + }, + { + "epoch": 8.522107138879608, + "grad_norm": 0.12612606585025787, + "learning_rate": 3.547839633524066e-05, + "loss": 0.0899, + "num_input_tokens_seen": 93128768, + "step": 76520 + }, + { + "epoch": 8.522663993763226, + "grad_norm": 1.039053201675415, + "learning_rate": 3.547619026964097e-05, + "loss": 0.1071, + "num_input_tokens_seen": 93134752, + "step": 76525 + }, + { + "epoch": 8.523220848646842, + "grad_norm": 0.5677089691162109, + "learning_rate": 3.5473984105085275e-05, + "loss": 0.0303, + "num_input_tokens_seen": 93140736, + "step": 76530 + }, + { + "epoch": 8.52377770353046, + "grad_norm": 0.1827261745929718, + "learning_rate": 3.54717778415944e-05, + "loss": 0.0141, + "num_input_tokens_seen": 93147008, + "step": 76535 + }, + { + "epoch": 8.524334558414077, + "grad_norm": 0.005000900477170944, + "learning_rate": 3.5469571479189195e-05, + "loss": 0.1091, + "num_input_tokens_seen": 93152992, + "step": 76540 + }, + { + "epoch": 8.524891413297695, + "grad_norm": 0.10282933712005615, + "learning_rate": 3.54673650178905e-05, + "loss": 0.1222, + "num_input_tokens_seen": 93159040, + "step": 76545 + }, + { + "epoch": 8.525448268181313, + "grad_norm": 0.00018681050278246403, + "learning_rate": 3.546515845771915e-05, + "loss": 0.0266, + "num_input_tokens_seen": 93165344, + "step": 76550 + }, + { + "epoch": 8.526005123064929, + "grad_norm": 0.06717149913311005, + "learning_rate": 3.5462951798696004e-05, + "loss": 0.0046, + "num_input_tokens_seen": 93171552, + "step": 76555 + }, + { + "epoch": 8.526561977948546, + "grad_norm": 0.007555225398391485, + "learning_rate": 3.546074504084189e-05, + "loss": 0.0252, + "num_input_tokens_seen": 93177344, + "step": 76560 + }, + { + "epoch": 8.527118832832164, + "grad_norm": 0.2779240906238556, + "learning_rate": 3.545853818417766e-05, + "loss": 0.0544, + "num_input_tokens_seen": 93183392, + "step": 76565 + }, + { + "epoch": 8.527675687715782, + "grad_norm": 0.0007431336562149227, + "learning_rate": 3.545633122872416e-05, + "loss": 0.1276, + "num_input_tokens_seen": 93189088, + "step": 76570 + }, + { + "epoch": 8.5282325425994, + "grad_norm": 0.5820384621620178, + "learning_rate": 3.5454124174502234e-05, + "loss": 0.0802, + "num_input_tokens_seen": 93194880, + "step": 76575 + }, + { + "epoch": 8.528789397483015, + "grad_norm": 0.17125536501407623, + "learning_rate": 3.545191702153272e-05, + "loss": 0.005, + "num_input_tokens_seen": 93200960, + "step": 76580 + }, + { + "epoch": 8.529346252366633, + "grad_norm": 0.2370162010192871, + "learning_rate": 3.5449709769836484e-05, + "loss": 0.0142, + "num_input_tokens_seen": 93207232, + "step": 76585 + }, + { + "epoch": 8.52990310725025, + "grad_norm": 0.16800788044929504, + "learning_rate": 3.5447502419434366e-05, + "loss": 0.0167, + "num_input_tokens_seen": 93213472, + "step": 76590 + }, + { + "epoch": 8.530459962133868, + "grad_norm": 0.17848025262355804, + "learning_rate": 3.544529497034722e-05, + "loss": 0.0524, + "num_input_tokens_seen": 93219392, + "step": 76595 + }, + { + "epoch": 8.531016817017486, + "grad_norm": 0.11069445312023163, + "learning_rate": 3.544308742259589e-05, + "loss": 0.0211, + "num_input_tokens_seen": 93225280, + "step": 76600 + }, + { + "epoch": 8.531573671901103, + "grad_norm": 0.13568982481956482, + "learning_rate": 3.544087977620123e-05, + "loss": 0.012, + "num_input_tokens_seen": 93231392, + "step": 76605 + }, + { + "epoch": 8.53213052678472, + "grad_norm": 0.00019747931219171733, + "learning_rate": 3.5438672031184094e-05, + "loss": 0.0928, + "num_input_tokens_seen": 93237472, + "step": 76610 + }, + { + "epoch": 8.532687381668337, + "grad_norm": 0.0006012568483129144, + "learning_rate": 3.543646418756535e-05, + "loss": 0.075, + "num_input_tokens_seen": 93243680, + "step": 76615 + }, + { + "epoch": 8.533244236551955, + "grad_norm": 0.16697071492671967, + "learning_rate": 3.543425624536583e-05, + "loss": 0.0577, + "num_input_tokens_seen": 93249952, + "step": 76620 + }, + { + "epoch": 8.533801091435572, + "grad_norm": 0.7826722264289856, + "learning_rate": 3.5432048204606406e-05, + "loss": 0.0655, + "num_input_tokens_seen": 93255296, + "step": 76625 + }, + { + "epoch": 8.53435794631919, + "grad_norm": 0.5990260243415833, + "learning_rate": 3.542984006530792e-05, + "loss": 0.0183, + "num_input_tokens_seen": 93261408, + "step": 76630 + }, + { + "epoch": 8.534914801202806, + "grad_norm": 0.00029743617051281035, + "learning_rate": 3.542763182749125e-05, + "loss": 0.0535, + "num_input_tokens_seen": 93267296, + "step": 76635 + }, + { + "epoch": 8.535471656086424, + "grad_norm": 1.2614476680755615, + "learning_rate": 3.542542349117723e-05, + "loss": 0.0736, + "num_input_tokens_seen": 93273344, + "step": 76640 + }, + { + "epoch": 8.536028510970041, + "grad_norm": 0.1587853580713272, + "learning_rate": 3.542321505638674e-05, + "loss": 0.0322, + "num_input_tokens_seen": 93279584, + "step": 76645 + }, + { + "epoch": 8.536585365853659, + "grad_norm": 1.67304527759552, + "learning_rate": 3.5421006523140635e-05, + "loss": 0.0948, + "num_input_tokens_seen": 93285600, + "step": 76650 + }, + { + "epoch": 8.537142220737277, + "grad_norm": 0.9910018444061279, + "learning_rate": 3.541879789145976e-05, + "loss": 0.0248, + "num_input_tokens_seen": 93291008, + "step": 76655 + }, + { + "epoch": 8.537699075620893, + "grad_norm": 1.1391067504882812, + "learning_rate": 3.5416589161365013e-05, + "loss": 0.1166, + "num_input_tokens_seen": 93297216, + "step": 76660 + }, + { + "epoch": 8.53825593050451, + "grad_norm": 0.10011086612939835, + "learning_rate": 3.541438033287722e-05, + "loss": 0.0868, + "num_input_tokens_seen": 93303328, + "step": 76665 + }, + { + "epoch": 8.538812785388128, + "grad_norm": 1.2057842016220093, + "learning_rate": 3.541217140601727e-05, + "loss": 0.0877, + "num_input_tokens_seen": 93309408, + "step": 76670 + }, + { + "epoch": 8.539369640271746, + "grad_norm": 1.2366676330566406, + "learning_rate": 3.5409962380806014e-05, + "loss": 0.0634, + "num_input_tokens_seen": 93315744, + "step": 76675 + }, + { + "epoch": 8.539926495155363, + "grad_norm": 0.14452631771564484, + "learning_rate": 3.540775325726432e-05, + "loss": 0.005, + "num_input_tokens_seen": 93321760, + "step": 76680 + }, + { + "epoch": 8.54048335003898, + "grad_norm": 0.24374739825725555, + "learning_rate": 3.540554403541307e-05, + "loss": 0.0078, + "num_input_tokens_seen": 93327648, + "step": 76685 + }, + { + "epoch": 8.541040204922597, + "grad_norm": 0.03846706822514534, + "learning_rate": 3.540333471527311e-05, + "loss": 0.0069, + "num_input_tokens_seen": 93334272, + "step": 76690 + }, + { + "epoch": 8.541597059806215, + "grad_norm": 0.029323749244213104, + "learning_rate": 3.540112529686532e-05, + "loss": 0.0234, + "num_input_tokens_seen": 93340256, + "step": 76695 + }, + { + "epoch": 8.542153914689832, + "grad_norm": 1.6227062940597534, + "learning_rate": 3.539891578021057e-05, + "loss": 0.019, + "num_input_tokens_seen": 93346368, + "step": 76700 + }, + { + "epoch": 8.54271076957345, + "grad_norm": 0.0035199387930333614, + "learning_rate": 3.539670616532972e-05, + "loss": 0.11, + "num_input_tokens_seen": 93352672, + "step": 76705 + }, + { + "epoch": 8.543267624457066, + "grad_norm": 0.10566814243793488, + "learning_rate": 3.539449645224366e-05, + "loss": 0.0658, + "num_input_tokens_seen": 93358496, + "step": 76710 + }, + { + "epoch": 8.543824479340683, + "grad_norm": 0.0009507698705419898, + "learning_rate": 3.5392286640973255e-05, + "loss": 0.006, + "num_input_tokens_seen": 93364512, + "step": 76715 + }, + { + "epoch": 8.544381334224301, + "grad_norm": 0.20712445676326752, + "learning_rate": 3.5390076731539374e-05, + "loss": 0.0174, + "num_input_tokens_seen": 93370752, + "step": 76720 + }, + { + "epoch": 8.544938189107919, + "grad_norm": 0.01152595691382885, + "learning_rate": 3.538786672396289e-05, + "loss": 0.0499, + "num_input_tokens_seen": 93377056, + "step": 76725 + }, + { + "epoch": 8.545495043991536, + "grad_norm": 0.02018865942955017, + "learning_rate": 3.538565661826469e-05, + "loss": 0.0305, + "num_input_tokens_seen": 93383232, + "step": 76730 + }, + { + "epoch": 8.546051898875152, + "grad_norm": 0.15487568080425262, + "learning_rate": 3.538344641446563e-05, + "loss": 0.0283, + "num_input_tokens_seen": 93389280, + "step": 76735 + }, + { + "epoch": 8.54660875375877, + "grad_norm": 0.2750775218009949, + "learning_rate": 3.538123611258661e-05, + "loss": 0.0265, + "num_input_tokens_seen": 93395616, + "step": 76740 + }, + { + "epoch": 8.547165608642388, + "grad_norm": 0.5446832776069641, + "learning_rate": 3.5379025712648497e-05, + "loss": 0.1022, + "num_input_tokens_seen": 93402048, + "step": 76745 + }, + { + "epoch": 8.547722463526005, + "grad_norm": 4.276484489440918, + "learning_rate": 3.537681521467216e-05, + "loss": 0.0998, + "num_input_tokens_seen": 93408256, + "step": 76750 + }, + { + "epoch": 8.548279318409623, + "grad_norm": 0.006682036444544792, + "learning_rate": 3.5374604618678505e-05, + "loss": 0.0344, + "num_input_tokens_seen": 93413856, + "step": 76755 + }, + { + "epoch": 8.548836173293239, + "grad_norm": 0.025848452001810074, + "learning_rate": 3.537239392468839e-05, + "loss": 0.1141, + "num_input_tokens_seen": 93419840, + "step": 76760 + }, + { + "epoch": 8.549393028176857, + "grad_norm": 1.6150836944580078, + "learning_rate": 3.5370183132722706e-05, + "loss": 0.1074, + "num_input_tokens_seen": 93425984, + "step": 76765 + }, + { + "epoch": 8.549949883060474, + "grad_norm": 0.8627752661705017, + "learning_rate": 3.536797224280233e-05, + "loss": 0.0508, + "num_input_tokens_seen": 93432096, + "step": 76770 + }, + { + "epoch": 8.550506737944092, + "grad_norm": 0.5053221583366394, + "learning_rate": 3.536576125494815e-05, + "loss": 0.0849, + "num_input_tokens_seen": 93438144, + "step": 76775 + }, + { + "epoch": 8.55106359282771, + "grad_norm": 0.4179735481739044, + "learning_rate": 3.536355016918106e-05, + "loss": 0.0654, + "num_input_tokens_seen": 93444384, + "step": 76780 + }, + { + "epoch": 8.551620447711326, + "grad_norm": 0.15799419581890106, + "learning_rate": 3.536133898552192e-05, + "loss": 0.1151, + "num_input_tokens_seen": 93450720, + "step": 76785 + }, + { + "epoch": 8.552177302594943, + "grad_norm": 0.01941332221031189, + "learning_rate": 3.535912770399164e-05, + "loss": 0.1253, + "num_input_tokens_seen": 93456896, + "step": 76790 + }, + { + "epoch": 8.552734157478561, + "grad_norm": 0.007053708657622337, + "learning_rate": 3.5356916324611104e-05, + "loss": 0.0404, + "num_input_tokens_seen": 93462944, + "step": 76795 + }, + { + "epoch": 8.553291012362179, + "grad_norm": 0.11831745505332947, + "learning_rate": 3.535470484740118e-05, + "loss": 0.0377, + "num_input_tokens_seen": 93468992, + "step": 76800 + }, + { + "epoch": 8.553847867245796, + "grad_norm": 0.27510130405426025, + "learning_rate": 3.535249327238279e-05, + "loss": 0.0734, + "num_input_tokens_seen": 93475008, + "step": 76805 + }, + { + "epoch": 8.554404722129412, + "grad_norm": 0.1347840130329132, + "learning_rate": 3.535028159957679e-05, + "loss": 0.1751, + "num_input_tokens_seen": 93481120, + "step": 76810 + }, + { + "epoch": 8.55496157701303, + "grad_norm": 0.15557095408439636, + "learning_rate": 3.5348069829004105e-05, + "loss": 0.0327, + "num_input_tokens_seen": 93487168, + "step": 76815 + }, + { + "epoch": 8.555518431896648, + "grad_norm": 0.3825171887874603, + "learning_rate": 3.5345857960685604e-05, + "loss": 0.0119, + "num_input_tokens_seen": 93493216, + "step": 76820 + }, + { + "epoch": 8.556075286780265, + "grad_norm": 0.6602566838264465, + "learning_rate": 3.5343645994642175e-05, + "loss": 0.0412, + "num_input_tokens_seen": 93499264, + "step": 76825 + }, + { + "epoch": 8.556632141663883, + "grad_norm": 0.5646587014198303, + "learning_rate": 3.5341433930894735e-05, + "loss": 0.0087, + "num_input_tokens_seen": 93505664, + "step": 76830 + }, + { + "epoch": 8.5571889965475, + "grad_norm": 0.5758928656578064, + "learning_rate": 3.5339221769464156e-05, + "loss": 0.0281, + "num_input_tokens_seen": 93511680, + "step": 76835 + }, + { + "epoch": 8.557745851431116, + "grad_norm": 0.12824749946594238, + "learning_rate": 3.5337009510371356e-05, + "loss": 0.0714, + "num_input_tokens_seen": 93518016, + "step": 76840 + }, + { + "epoch": 8.558302706314734, + "grad_norm": 0.2668338119983673, + "learning_rate": 3.533479715363721e-05, + "loss": 0.0585, + "num_input_tokens_seen": 93524256, + "step": 76845 + }, + { + "epoch": 8.558859561198352, + "grad_norm": 1.3590940237045288, + "learning_rate": 3.5332584699282636e-05, + "loss": 0.1611, + "num_input_tokens_seen": 93530368, + "step": 76850 + }, + { + "epoch": 8.55941641608197, + "grad_norm": 0.049128707498311996, + "learning_rate": 3.5330372147328506e-05, + "loss": 0.056, + "num_input_tokens_seen": 93536320, + "step": 76855 + }, + { + "epoch": 8.559973270965587, + "grad_norm": 0.16905249655246735, + "learning_rate": 3.532815949779574e-05, + "loss": 0.025, + "num_input_tokens_seen": 93542528, + "step": 76860 + }, + { + "epoch": 8.560530125849203, + "grad_norm": 0.2680049240589142, + "learning_rate": 3.5325946750705236e-05, + "loss": 0.0283, + "num_input_tokens_seen": 93548416, + "step": 76865 + }, + { + "epoch": 8.56108698073282, + "grad_norm": 0.08629342913627625, + "learning_rate": 3.5323733906077885e-05, + "loss": 0.0963, + "num_input_tokens_seen": 93553856, + "step": 76870 + }, + { + "epoch": 8.561643835616438, + "grad_norm": 0.2140505313873291, + "learning_rate": 3.5321520963934606e-05, + "loss": 0.0054, + "num_input_tokens_seen": 93560096, + "step": 76875 + }, + { + "epoch": 8.562200690500056, + "grad_norm": 0.0006863887538202107, + "learning_rate": 3.531930792429628e-05, + "loss": 0.0575, + "num_input_tokens_seen": 93566016, + "step": 76880 + }, + { + "epoch": 8.562757545383674, + "grad_norm": 1.329627513885498, + "learning_rate": 3.531709478718383e-05, + "loss": 0.0738, + "num_input_tokens_seen": 93571648, + "step": 76885 + }, + { + "epoch": 8.56331440026729, + "grad_norm": 0.8239938020706177, + "learning_rate": 3.5314881552618163e-05, + "loss": 0.0377, + "num_input_tokens_seen": 93577952, + "step": 76890 + }, + { + "epoch": 8.563871255150907, + "grad_norm": 0.7256354093551636, + "learning_rate": 3.531266822062016e-05, + "loss": 0.1492, + "num_input_tokens_seen": 93583840, + "step": 76895 + }, + { + "epoch": 8.564428110034525, + "grad_norm": 0.8717544078826904, + "learning_rate": 3.531045479121075e-05, + "loss": 0.075, + "num_input_tokens_seen": 93590176, + "step": 76900 + }, + { + "epoch": 8.564984964918143, + "grad_norm": 1.3434900045394897, + "learning_rate": 3.5308241264410835e-05, + "loss": 0.1007, + "num_input_tokens_seen": 93595904, + "step": 76905 + }, + { + "epoch": 8.56554181980176, + "grad_norm": 1.1886261701583862, + "learning_rate": 3.530602764024132e-05, + "loss": 0.0714, + "num_input_tokens_seen": 93601920, + "step": 76910 + }, + { + "epoch": 8.566098674685376, + "grad_norm": 0.0008038734085857868, + "learning_rate": 3.5303813918723113e-05, + "loss": 0.0375, + "num_input_tokens_seen": 93607680, + "step": 76915 + }, + { + "epoch": 8.566655529568994, + "grad_norm": 0.08079369366168976, + "learning_rate": 3.530160009987714e-05, + "loss": 0.0042, + "num_input_tokens_seen": 93613728, + "step": 76920 + }, + { + "epoch": 8.567212384452612, + "grad_norm": 0.015263430774211884, + "learning_rate": 3.529938618372429e-05, + "loss": 0.0399, + "num_input_tokens_seen": 93620128, + "step": 76925 + }, + { + "epoch": 8.56776923933623, + "grad_norm": 1.9958170652389526, + "learning_rate": 3.529717217028549e-05, + "loss": 0.1238, + "num_input_tokens_seen": 93626208, + "step": 76930 + }, + { + "epoch": 8.568326094219847, + "grad_norm": 0.194312185049057, + "learning_rate": 3.529495805958165e-05, + "loss": 0.0759, + "num_input_tokens_seen": 93631232, + "step": 76935 + }, + { + "epoch": 8.568882949103463, + "grad_norm": 0.2784797251224518, + "learning_rate": 3.529274385163368e-05, + "loss": 0.0483, + "num_input_tokens_seen": 93637472, + "step": 76940 + }, + { + "epoch": 8.56943980398708, + "grad_norm": 0.6147591471672058, + "learning_rate": 3.52905295464625e-05, + "loss": 0.1073, + "num_input_tokens_seen": 93643392, + "step": 76945 + }, + { + "epoch": 8.569996658870698, + "grad_norm": 1.1250736713409424, + "learning_rate": 3.5288315144089025e-05, + "loss": 0.1004, + "num_input_tokens_seen": 93649248, + "step": 76950 + }, + { + "epoch": 8.570553513754316, + "grad_norm": 0.04791218042373657, + "learning_rate": 3.5286100644534164e-05, + "loss": 0.0102, + "num_input_tokens_seen": 93655648, + "step": 76955 + }, + { + "epoch": 8.571110368637934, + "grad_norm": 0.0001819790486479178, + "learning_rate": 3.528388604781885e-05, + "loss": 0.0245, + "num_input_tokens_seen": 93661920, + "step": 76960 + }, + { + "epoch": 8.571667223521551, + "grad_norm": 0.9339752793312073, + "learning_rate": 3.528167135396399e-05, + "loss": 0.0559, + "num_input_tokens_seen": 93667648, + "step": 76965 + }, + { + "epoch": 8.572224078405167, + "grad_norm": 0.008458871394395828, + "learning_rate": 3.5279456562990504e-05, + "loss": 0.0119, + "num_input_tokens_seen": 93673856, + "step": 76970 + }, + { + "epoch": 8.572780933288785, + "grad_norm": 0.5643329620361328, + "learning_rate": 3.5277241674919316e-05, + "loss": 0.0309, + "num_input_tokens_seen": 93679936, + "step": 76975 + }, + { + "epoch": 8.573337788172402, + "grad_norm": 0.0006966504151932895, + "learning_rate": 3.527502668977135e-05, + "loss": 0.0206, + "num_input_tokens_seen": 93686464, + "step": 76980 + }, + { + "epoch": 8.57389464305602, + "grad_norm": 0.004088698886334896, + "learning_rate": 3.527281160756752e-05, + "loss": 0.1088, + "num_input_tokens_seen": 93692576, + "step": 76985 + }, + { + "epoch": 8.574451497939638, + "grad_norm": 0.9853025674819946, + "learning_rate": 3.527059642832875e-05, + "loss": 0.0629, + "num_input_tokens_seen": 93699104, + "step": 76990 + }, + { + "epoch": 8.575008352823254, + "grad_norm": 0.4578250050544739, + "learning_rate": 3.526838115207598e-05, + "loss": 0.0226, + "num_input_tokens_seen": 93704544, + "step": 76995 + }, + { + "epoch": 8.575565207706871, + "grad_norm": 0.5570197701454163, + "learning_rate": 3.5266165778830114e-05, + "loss": 0.0843, + "num_input_tokens_seen": 93710368, + "step": 77000 + }, + { + "epoch": 8.576122062590489, + "grad_norm": 0.22436784207820892, + "learning_rate": 3.5263950308612094e-05, + "loss": 0.0448, + "num_input_tokens_seen": 93716512, + "step": 77005 + }, + { + "epoch": 8.576678917474107, + "grad_norm": 0.28796496987342834, + "learning_rate": 3.526173474144283e-05, + "loss": 0.0172, + "num_input_tokens_seen": 93722464, + "step": 77010 + }, + { + "epoch": 8.577235772357724, + "grad_norm": 0.5207822918891907, + "learning_rate": 3.525951907734326e-05, + "loss": 0.0356, + "num_input_tokens_seen": 93728416, + "step": 77015 + }, + { + "epoch": 8.57779262724134, + "grad_norm": 1.2060363292694092, + "learning_rate": 3.525730331633432e-05, + "loss": 0.0515, + "num_input_tokens_seen": 93734400, + "step": 77020 + }, + { + "epoch": 8.578349482124958, + "grad_norm": 1.3327569961547852, + "learning_rate": 3.525508745843693e-05, + "loss": 0.0613, + "num_input_tokens_seen": 93740320, + "step": 77025 + }, + { + "epoch": 8.578906337008576, + "grad_norm": 0.23724886775016785, + "learning_rate": 3.5252871503672025e-05, + "loss": 0.0645, + "num_input_tokens_seen": 93746208, + "step": 77030 + }, + { + "epoch": 8.579463191892193, + "grad_norm": 0.0019250878831371665, + "learning_rate": 3.525065545206053e-05, + "loss": 0.0003, + "num_input_tokens_seen": 93752640, + "step": 77035 + }, + { + "epoch": 8.580020046775811, + "grad_norm": 1.1696672439575195, + "learning_rate": 3.5248439303623384e-05, + "loss": 0.0858, + "num_input_tokens_seen": 93758528, + "step": 77040 + }, + { + "epoch": 8.580576901659427, + "grad_norm": 0.4780602753162384, + "learning_rate": 3.524622305838152e-05, + "loss": 0.0356, + "num_input_tokens_seen": 93764992, + "step": 77045 + }, + { + "epoch": 8.581133756543045, + "grad_norm": 0.36861512064933777, + "learning_rate": 3.524400671635587e-05, + "loss": 0.0529, + "num_input_tokens_seen": 93770848, + "step": 77050 + }, + { + "epoch": 8.581690611426662, + "grad_norm": 0.31040889024734497, + "learning_rate": 3.524179027756737e-05, + "loss": 0.0097, + "num_input_tokens_seen": 93776928, + "step": 77055 + }, + { + "epoch": 8.58224746631028, + "grad_norm": 0.024529416114091873, + "learning_rate": 3.5239573742036945e-05, + "loss": 0.03, + "num_input_tokens_seen": 93783520, + "step": 77060 + }, + { + "epoch": 8.582804321193898, + "grad_norm": 0.015477748587727547, + "learning_rate": 3.523735710978555e-05, + "loss": 0.0456, + "num_input_tokens_seen": 93789664, + "step": 77065 + }, + { + "epoch": 8.583361176077513, + "grad_norm": 0.08815327286720276, + "learning_rate": 3.523514038083411e-05, + "loss": 0.0069, + "num_input_tokens_seen": 93795936, + "step": 77070 + }, + { + "epoch": 8.583918030961131, + "grad_norm": 0.016332630068063736, + "learning_rate": 3.523292355520358e-05, + "loss": 0.0277, + "num_input_tokens_seen": 93802272, + "step": 77075 + }, + { + "epoch": 8.584474885844749, + "grad_norm": 0.011940856464207172, + "learning_rate": 3.523070663291488e-05, + "loss": 0.1115, + "num_input_tokens_seen": 93808352, + "step": 77080 + }, + { + "epoch": 8.585031740728367, + "grad_norm": 1.1986057758331299, + "learning_rate": 3.5228489613988955e-05, + "loss": 0.1396, + "num_input_tokens_seen": 93814464, + "step": 77085 + }, + { + "epoch": 8.585588595611984, + "grad_norm": 0.00029033998725935817, + "learning_rate": 3.5226272498446765e-05, + "loss": 0.036, + "num_input_tokens_seen": 93820704, + "step": 77090 + }, + { + "epoch": 8.5861454504956, + "grad_norm": 0.04843446984887123, + "learning_rate": 3.522405528630923e-05, + "loss": 0.0183, + "num_input_tokens_seen": 93826560, + "step": 77095 + }, + { + "epoch": 8.586702305379218, + "grad_norm": 0.2791549563407898, + "learning_rate": 3.52218379775973e-05, + "loss": 0.0089, + "num_input_tokens_seen": 93832928, + "step": 77100 + }, + { + "epoch": 8.587259160262835, + "grad_norm": 3.4365975856781006, + "learning_rate": 3.521962057233192e-05, + "loss": 0.1122, + "num_input_tokens_seen": 93839200, + "step": 77105 + }, + { + "epoch": 8.587816015146453, + "grad_norm": 0.05442795902490616, + "learning_rate": 3.5217403070534034e-05, + "loss": 0.0113, + "num_input_tokens_seen": 93845472, + "step": 77110 + }, + { + "epoch": 8.58837287003007, + "grad_norm": 0.11321146041154861, + "learning_rate": 3.52151854722246e-05, + "loss": 0.018, + "num_input_tokens_seen": 93851648, + "step": 77115 + }, + { + "epoch": 8.588929724913687, + "grad_norm": 0.20065434277057648, + "learning_rate": 3.5212967777424545e-05, + "loss": 0.0601, + "num_input_tokens_seen": 93857984, + "step": 77120 + }, + { + "epoch": 8.589486579797304, + "grad_norm": 0.03283548355102539, + "learning_rate": 3.5210749986154835e-05, + "loss": 0.0041, + "num_input_tokens_seen": 93864064, + "step": 77125 + }, + { + "epoch": 8.590043434680922, + "grad_norm": 0.00012843353033531457, + "learning_rate": 3.52085320984364e-05, + "loss": 0.0331, + "num_input_tokens_seen": 93870304, + "step": 77130 + }, + { + "epoch": 8.59060028956454, + "grad_norm": 2.098297119140625, + "learning_rate": 3.52063141142902e-05, + "loss": 0.1833, + "num_input_tokens_seen": 93876512, + "step": 77135 + }, + { + "epoch": 8.591157144448157, + "grad_norm": 0.012201039120554924, + "learning_rate": 3.52040960337372e-05, + "loss": 0.065, + "num_input_tokens_seen": 93882592, + "step": 77140 + }, + { + "epoch": 8.591713999331773, + "grad_norm": 0.11538641154766083, + "learning_rate": 3.5201877856798325e-05, + "loss": 0.1208, + "num_input_tokens_seen": 93888416, + "step": 77145 + }, + { + "epoch": 8.592270854215391, + "grad_norm": 1.1873027086257935, + "learning_rate": 3.519965958349455e-05, + "loss": 0.0333, + "num_input_tokens_seen": 93894368, + "step": 77150 + }, + { + "epoch": 8.592827709099009, + "grad_norm": 2.1703062057495117, + "learning_rate": 3.519744121384681e-05, + "loss": 0.1237, + "num_input_tokens_seen": 93900416, + "step": 77155 + }, + { + "epoch": 8.593384563982626, + "grad_norm": 0.0020892780739814043, + "learning_rate": 3.519522274787608e-05, + "loss": 0.0473, + "num_input_tokens_seen": 93906848, + "step": 77160 + }, + { + "epoch": 8.593941418866244, + "grad_norm": 0.04078971594572067, + "learning_rate": 3.519300418560329e-05, + "loss": 0.0844, + "num_input_tokens_seen": 93913056, + "step": 77165 + }, + { + "epoch": 8.59449827374986, + "grad_norm": 0.9794880747795105, + "learning_rate": 3.519078552704941e-05, + "loss": 0.0299, + "num_input_tokens_seen": 93919008, + "step": 77170 + }, + { + "epoch": 8.595055128633478, + "grad_norm": 0.5355150699615479, + "learning_rate": 3.5188566772235395e-05, + "loss": 0.0877, + "num_input_tokens_seen": 93925344, + "step": 77175 + }, + { + "epoch": 8.595611983517095, + "grad_norm": 0.06867098063230515, + "learning_rate": 3.518634792118221e-05, + "loss": 0.0809, + "num_input_tokens_seen": 93931456, + "step": 77180 + }, + { + "epoch": 8.596168838400713, + "grad_norm": 0.05597329139709473, + "learning_rate": 3.518412897391081e-05, + "loss": 0.001, + "num_input_tokens_seen": 93937632, + "step": 77185 + }, + { + "epoch": 8.59672569328433, + "grad_norm": 0.00063137779943645, + "learning_rate": 3.5181909930442146e-05, + "loss": 0.0388, + "num_input_tokens_seen": 93944096, + "step": 77190 + }, + { + "epoch": 8.597282548167948, + "grad_norm": 0.6191033124923706, + "learning_rate": 3.5179690790797194e-05, + "loss": 0.1661, + "num_input_tokens_seen": 93950560, + "step": 77195 + }, + { + "epoch": 8.597839403051564, + "grad_norm": 1.6499195098876953, + "learning_rate": 3.51774715549969e-05, + "loss": 0.0565, + "num_input_tokens_seen": 93956800, + "step": 77200 + }, + { + "epoch": 8.598396257935182, + "grad_norm": 0.0547451376914978, + "learning_rate": 3.517525222306223e-05, + "loss": 0.0135, + "num_input_tokens_seen": 93963072, + "step": 77205 + }, + { + "epoch": 8.5989531128188, + "grad_norm": 0.0007189566385932267, + "learning_rate": 3.517303279501416e-05, + "loss": 0.0402, + "num_input_tokens_seen": 93968992, + "step": 77210 + }, + { + "epoch": 8.599509967702417, + "grad_norm": 1.7920148372650146, + "learning_rate": 3.517081327087363e-05, + "loss": 0.1171, + "num_input_tokens_seen": 93975232, + "step": 77215 + }, + { + "epoch": 8.600066822586035, + "grad_norm": 1.4139419794082642, + "learning_rate": 3.516859365066163e-05, + "loss": 0.0545, + "num_input_tokens_seen": 93981344, + "step": 77220 + }, + { + "epoch": 8.60062367746965, + "grad_norm": 0.7178791761398315, + "learning_rate": 3.516637393439911e-05, + "loss": 0.0855, + "num_input_tokens_seen": 93987680, + "step": 77225 + }, + { + "epoch": 8.601180532353268, + "grad_norm": 0.003313579596579075, + "learning_rate": 3.516415412210705e-05, + "loss": 0.0208, + "num_input_tokens_seen": 93993952, + "step": 77230 + }, + { + "epoch": 8.601737387236886, + "grad_norm": 1.795175313949585, + "learning_rate": 3.516193421380641e-05, + "loss": 0.0824, + "num_input_tokens_seen": 94000224, + "step": 77235 + }, + { + "epoch": 8.602294242120504, + "grad_norm": 0.25705835223197937, + "learning_rate": 3.515971420951816e-05, + "loss": 0.0161, + "num_input_tokens_seen": 94006560, + "step": 77240 + }, + { + "epoch": 8.602851097004121, + "grad_norm": 0.010136950761079788, + "learning_rate": 3.5157494109263266e-05, + "loss": 0.0737, + "num_input_tokens_seen": 94012384, + "step": 77245 + }, + { + "epoch": 8.603407951887737, + "grad_norm": 0.8387547135353088, + "learning_rate": 3.51552739130627e-05, + "loss": 0.0243, + "num_input_tokens_seen": 94018624, + "step": 77250 + }, + { + "epoch": 8.603964806771355, + "grad_norm": 1.1309311389923096, + "learning_rate": 3.515305362093744e-05, + "loss": 0.1233, + "num_input_tokens_seen": 94025024, + "step": 77255 + }, + { + "epoch": 8.604521661654973, + "grad_norm": 0.8100247979164124, + "learning_rate": 3.515083323290845e-05, + "loss": 0.0094, + "num_input_tokens_seen": 94031264, + "step": 77260 + }, + { + "epoch": 8.60507851653859, + "grad_norm": 0.04059473052620888, + "learning_rate": 3.5148612748996714e-05, + "loss": 0.0546, + "num_input_tokens_seen": 94037568, + "step": 77265 + }, + { + "epoch": 8.605635371422208, + "grad_norm": 0.0003447950293775648, + "learning_rate": 3.5146392169223194e-05, + "loss": 0.0131, + "num_input_tokens_seen": 94043808, + "step": 77270 + }, + { + "epoch": 8.606192226305824, + "grad_norm": 0.007542145438492298, + "learning_rate": 3.514417149360887e-05, + "loss": 0.0182, + "num_input_tokens_seen": 94050112, + "step": 77275 + }, + { + "epoch": 8.606749081189442, + "grad_norm": 0.036088138818740845, + "learning_rate": 3.514195072217473e-05, + "loss": 0.0653, + "num_input_tokens_seen": 94056448, + "step": 77280 + }, + { + "epoch": 8.60730593607306, + "grad_norm": 0.0556248314678669, + "learning_rate": 3.5139729854941725e-05, + "loss": 0.0155, + "num_input_tokens_seen": 94062176, + "step": 77285 + }, + { + "epoch": 8.607862790956677, + "grad_norm": 0.05578913912177086, + "learning_rate": 3.513750889193085e-05, + "loss": 0.0182, + "num_input_tokens_seen": 94068096, + "step": 77290 + }, + { + "epoch": 8.608419645840295, + "grad_norm": 0.05879240110516548, + "learning_rate": 3.5135287833163094e-05, + "loss": 0.0119, + "num_input_tokens_seen": 94074688, + "step": 77295 + }, + { + "epoch": 8.60897650072391, + "grad_norm": 0.0006083126645535231, + "learning_rate": 3.513306667865941e-05, + "loss": 0.0986, + "num_input_tokens_seen": 94080800, + "step": 77300 + }, + { + "epoch": 8.609533355607528, + "grad_norm": 0.07992779463529587, + "learning_rate": 3.513084542844081e-05, + "loss": 0.2067, + "num_input_tokens_seen": 94086624, + "step": 77305 + }, + { + "epoch": 8.610090210491146, + "grad_norm": 0.0002307962131453678, + "learning_rate": 3.5128624082528236e-05, + "loss": 0.0312, + "num_input_tokens_seen": 94092192, + "step": 77310 + }, + { + "epoch": 8.610647065374764, + "grad_norm": 0.02970978617668152, + "learning_rate": 3.512640264094271e-05, + "loss": 0.0063, + "num_input_tokens_seen": 94098272, + "step": 77315 + }, + { + "epoch": 8.611203920258381, + "grad_norm": 0.041598983108997345, + "learning_rate": 3.512418110370519e-05, + "loss": 0.0456, + "num_input_tokens_seen": 94104160, + "step": 77320 + }, + { + "epoch": 8.611760775141999, + "grad_norm": 1.6516430377960205, + "learning_rate": 3.512195947083666e-05, + "loss": 0.1635, + "num_input_tokens_seen": 94109984, + "step": 77325 + }, + { + "epoch": 8.612317630025615, + "grad_norm": 0.0003221204096917063, + "learning_rate": 3.511973774235813e-05, + "loss": 0.024, + "num_input_tokens_seen": 94115872, + "step": 77330 + }, + { + "epoch": 8.612874484909232, + "grad_norm": 0.469968318939209, + "learning_rate": 3.511751591829056e-05, + "loss": 0.0066, + "num_input_tokens_seen": 94121856, + "step": 77335 + }, + { + "epoch": 8.61343133979285, + "grad_norm": 0.20537182688713074, + "learning_rate": 3.5115293998654955e-05, + "loss": 0.0089, + "num_input_tokens_seen": 94127584, + "step": 77340 + }, + { + "epoch": 8.613988194676468, + "grad_norm": 1.1772516965866089, + "learning_rate": 3.5113071983472284e-05, + "loss": 0.0247, + "num_input_tokens_seen": 94133696, + "step": 77345 + }, + { + "epoch": 8.614545049560085, + "grad_norm": 0.10412632673978806, + "learning_rate": 3.511084987276355e-05, + "loss": 0.0211, + "num_input_tokens_seen": 94139840, + "step": 77350 + }, + { + "epoch": 8.615101904443701, + "grad_norm": 0.6060212254524231, + "learning_rate": 3.5108627666549733e-05, + "loss": 0.0321, + "num_input_tokens_seen": 94146176, + "step": 77355 + }, + { + "epoch": 8.615658759327319, + "grad_norm": 1.7804516553878784, + "learning_rate": 3.510640536485183e-05, + "loss": 0.1032, + "num_input_tokens_seen": 94152416, + "step": 77360 + }, + { + "epoch": 8.616215614210937, + "grad_norm": 0.145631343126297, + "learning_rate": 3.510418296769084e-05, + "loss": 0.0074, + "num_input_tokens_seen": 94158496, + "step": 77365 + }, + { + "epoch": 8.616772469094554, + "grad_norm": 0.019257865846157074, + "learning_rate": 3.510196047508774e-05, + "loss": 0.0668, + "num_input_tokens_seen": 94164768, + "step": 77370 + }, + { + "epoch": 8.617329323978172, + "grad_norm": 1.2918260097503662, + "learning_rate": 3.5099737887063535e-05, + "loss": 0.026, + "num_input_tokens_seen": 94170912, + "step": 77375 + }, + { + "epoch": 8.617886178861788, + "grad_norm": 1.6139122247695923, + "learning_rate": 3.5097515203639204e-05, + "loss": 0.0988, + "num_input_tokens_seen": 94176672, + "step": 77380 + }, + { + "epoch": 8.618443033745406, + "grad_norm": 0.008268599398434162, + "learning_rate": 3.509529242483575e-05, + "loss": 0.0246, + "num_input_tokens_seen": 94182656, + "step": 77385 + }, + { + "epoch": 8.618999888629023, + "grad_norm": 0.02962503768503666, + "learning_rate": 3.5093069550674184e-05, + "loss": 0.0427, + "num_input_tokens_seen": 94188736, + "step": 77390 + }, + { + "epoch": 8.619556743512641, + "grad_norm": 0.7947252988815308, + "learning_rate": 3.509084658117549e-05, + "loss": 0.0892, + "num_input_tokens_seen": 94194944, + "step": 77395 + }, + { + "epoch": 8.620113598396259, + "grad_norm": 0.010263723321259022, + "learning_rate": 3.5088623516360654e-05, + "loss": 0.0881, + "num_input_tokens_seen": 94201120, + "step": 77400 + }, + { + "epoch": 8.620670453279875, + "grad_norm": 0.8677988648414612, + "learning_rate": 3.508640035625069e-05, + "loss": 0.0556, + "num_input_tokens_seen": 94207008, + "step": 77405 + }, + { + "epoch": 8.621227308163492, + "grad_norm": 0.5910415053367615, + "learning_rate": 3.50841771008666e-05, + "loss": 0.0254, + "num_input_tokens_seen": 94213088, + "step": 77410 + }, + { + "epoch": 8.62178416304711, + "grad_norm": 0.7105022668838501, + "learning_rate": 3.5081953750229365e-05, + "loss": 0.0657, + "num_input_tokens_seen": 94219008, + "step": 77415 + }, + { + "epoch": 8.622341017930728, + "grad_norm": 0.1265357881784439, + "learning_rate": 3.507973030436e-05, + "loss": 0.0178, + "num_input_tokens_seen": 94225120, + "step": 77420 + }, + { + "epoch": 8.622897872814345, + "grad_norm": 0.2281024008989334, + "learning_rate": 3.507750676327952e-05, + "loss": 0.1063, + "num_input_tokens_seen": 94231200, + "step": 77425 + }, + { + "epoch": 8.623454727697961, + "grad_norm": 0.8915014863014221, + "learning_rate": 3.5075283127008904e-05, + "loss": 0.0913, + "num_input_tokens_seen": 94237408, + "step": 77430 + }, + { + "epoch": 8.624011582581579, + "grad_norm": 0.007147664669901133, + "learning_rate": 3.507305939556917e-05, + "loss": 0.0869, + "num_input_tokens_seen": 94243456, + "step": 77435 + }, + { + "epoch": 8.624568437465197, + "grad_norm": 0.002846968360245228, + "learning_rate": 3.507083556898132e-05, + "loss": 0.1539, + "num_input_tokens_seen": 94249984, + "step": 77440 + }, + { + "epoch": 8.625125292348814, + "grad_norm": 2.778125524520874, + "learning_rate": 3.506861164726637e-05, + "loss": 0.0968, + "num_input_tokens_seen": 94256352, + "step": 77445 + }, + { + "epoch": 8.625682147232432, + "grad_norm": 1.3076692819595337, + "learning_rate": 3.50663876304453e-05, + "loss": 0.0681, + "num_input_tokens_seen": 94262272, + "step": 77450 + }, + { + "epoch": 8.626239002116048, + "grad_norm": 0.08493082970380783, + "learning_rate": 3.506416351853914e-05, + "loss": 0.089, + "num_input_tokens_seen": 94268096, + "step": 77455 + }, + { + "epoch": 8.626795856999665, + "grad_norm": 0.5536115169525146, + "learning_rate": 3.506193931156889e-05, + "loss": 0.0917, + "num_input_tokens_seen": 94273408, + "step": 77460 + }, + { + "epoch": 8.627352711883283, + "grad_norm": 0.635776698589325, + "learning_rate": 3.505971500955557e-05, + "loss": 0.0779, + "num_input_tokens_seen": 94279680, + "step": 77465 + }, + { + "epoch": 8.6279095667669, + "grad_norm": 0.01725015603005886, + "learning_rate": 3.5057490612520174e-05, + "loss": 0.0129, + "num_input_tokens_seen": 94286144, + "step": 77470 + }, + { + "epoch": 8.628466421650518, + "grad_norm": 2.2331972122192383, + "learning_rate": 3.505526612048372e-05, + "loss": 0.1324, + "num_input_tokens_seen": 94292192, + "step": 77475 + }, + { + "epoch": 8.629023276534134, + "grad_norm": 0.04429485648870468, + "learning_rate": 3.505304153346723e-05, + "loss": 0.0077, + "num_input_tokens_seen": 94298304, + "step": 77480 + }, + { + "epoch": 8.629580131417752, + "grad_norm": 1.4182029962539673, + "learning_rate": 3.50508168514917e-05, + "loss": 0.1581, + "num_input_tokens_seen": 94304256, + "step": 77485 + }, + { + "epoch": 8.63013698630137, + "grad_norm": 0.0015595131553709507, + "learning_rate": 3.5048592074578154e-05, + "loss": 0.0916, + "num_input_tokens_seen": 94310496, + "step": 77490 + }, + { + "epoch": 8.630693841184987, + "grad_norm": 0.27289506793022156, + "learning_rate": 3.50463672027476e-05, + "loss": 0.0112, + "num_input_tokens_seen": 94316640, + "step": 77495 + }, + { + "epoch": 8.631250696068605, + "grad_norm": 0.014773239381611347, + "learning_rate": 3.504414223602107e-05, + "loss": 0.0283, + "num_input_tokens_seen": 94321920, + "step": 77500 + }, + { + "epoch": 8.631807550952221, + "grad_norm": 0.761612594127655, + "learning_rate": 3.504191717441956e-05, + "loss": 0.0731, + "num_input_tokens_seen": 94328288, + "step": 77505 + }, + { + "epoch": 8.632364405835839, + "grad_norm": 0.7291088104248047, + "learning_rate": 3.5039692017964106e-05, + "loss": 0.0633, + "num_input_tokens_seen": 94334560, + "step": 77510 + }, + { + "epoch": 8.632921260719456, + "grad_norm": 0.0012902348535135388, + "learning_rate": 3.503746676667571e-05, + "loss": 0.0259, + "num_input_tokens_seen": 94340832, + "step": 77515 + }, + { + "epoch": 8.633478115603074, + "grad_norm": 1.7036445140838623, + "learning_rate": 3.5035241420575404e-05, + "loss": 0.0639, + "num_input_tokens_seen": 94347104, + "step": 77520 + }, + { + "epoch": 8.634034970486692, + "grad_norm": 1.285695195198059, + "learning_rate": 3.503301597968419e-05, + "loss": 0.0364, + "num_input_tokens_seen": 94353280, + "step": 77525 + }, + { + "epoch": 8.634591825370308, + "grad_norm": 0.13412253558635712, + "learning_rate": 3.5030790444023124e-05, + "loss": 0.1507, + "num_input_tokens_seen": 94359264, + "step": 77530 + }, + { + "epoch": 8.635148680253925, + "grad_norm": 0.8896024227142334, + "learning_rate": 3.502856481361319e-05, + "loss": 0.0502, + "num_input_tokens_seen": 94365216, + "step": 77535 + }, + { + "epoch": 8.635705535137543, + "grad_norm": 0.015885865315794945, + "learning_rate": 3.502633908847543e-05, + "loss": 0.1122, + "num_input_tokens_seen": 94371008, + "step": 77540 + }, + { + "epoch": 8.63626239002116, + "grad_norm": 0.0025303978472948074, + "learning_rate": 3.502411326863086e-05, + "loss": 0.1395, + "num_input_tokens_seen": 94377024, + "step": 77545 + }, + { + "epoch": 8.636819244904778, + "grad_norm": 2.73231840133667, + "learning_rate": 3.5021887354100506e-05, + "loss": 0.1777, + "num_input_tokens_seen": 94383232, + "step": 77550 + }, + { + "epoch": 8.637376099788396, + "grad_norm": 0.002145944396033883, + "learning_rate": 3.5019661344905405e-05, + "loss": 0.1131, + "num_input_tokens_seen": 94389536, + "step": 77555 + }, + { + "epoch": 8.637932954672012, + "grad_norm": 0.016282957047224045, + "learning_rate": 3.5017435241066577e-05, + "loss": 0.039, + "num_input_tokens_seen": 94395520, + "step": 77560 + }, + { + "epoch": 8.63848980955563, + "grad_norm": 0.3027459681034088, + "learning_rate": 3.501520904260505e-05, + "loss": 0.0612, + "num_input_tokens_seen": 94401760, + "step": 77565 + }, + { + "epoch": 8.639046664439247, + "grad_norm": 0.4651690721511841, + "learning_rate": 3.5012982749541836e-05, + "loss": 0.1305, + "num_input_tokens_seen": 94407712, + "step": 77570 + }, + { + "epoch": 8.639603519322865, + "grad_norm": 0.005768138449639082, + "learning_rate": 3.501075636189799e-05, + "loss": 0.1013, + "num_input_tokens_seen": 94413984, + "step": 77575 + }, + { + "epoch": 8.640160374206483, + "grad_norm": 1.7195860147476196, + "learning_rate": 3.500852987969452e-05, + "loss": 0.0434, + "num_input_tokens_seen": 94420224, + "step": 77580 + }, + { + "epoch": 8.640717229090098, + "grad_norm": 0.9861027002334595, + "learning_rate": 3.500630330295247e-05, + "loss": 0.1447, + "num_input_tokens_seen": 94426208, + "step": 77585 + }, + { + "epoch": 8.641274083973716, + "grad_norm": 0.1947314590215683, + "learning_rate": 3.500407663169287e-05, + "loss": 0.009, + "num_input_tokens_seen": 94432704, + "step": 77590 + }, + { + "epoch": 8.641830938857334, + "grad_norm": 0.15216846764087677, + "learning_rate": 3.5001849865936746e-05, + "loss": 0.0698, + "num_input_tokens_seen": 94439104, + "step": 77595 + }, + { + "epoch": 8.642387793740951, + "grad_norm": 0.7267584204673767, + "learning_rate": 3.4999623005705145e-05, + "loss": 0.0664, + "num_input_tokens_seen": 94444864, + "step": 77600 + }, + { + "epoch": 8.64294464862457, + "grad_norm": 0.0016986593836918473, + "learning_rate": 3.499739605101909e-05, + "loss": 0.001, + "num_input_tokens_seen": 94451488, + "step": 77605 + }, + { + "epoch": 8.643501503508185, + "grad_norm": 0.20452843606472015, + "learning_rate": 3.499516900189962e-05, + "loss": 0.0469, + "num_input_tokens_seen": 94457696, + "step": 77610 + }, + { + "epoch": 8.644058358391803, + "grad_norm": 0.33612436056137085, + "learning_rate": 3.499294185836777e-05, + "loss": 0.0661, + "num_input_tokens_seen": 94464032, + "step": 77615 + }, + { + "epoch": 8.64461521327542, + "grad_norm": 1.2708758115768433, + "learning_rate": 3.4990714620444575e-05, + "loss": 0.0704, + "num_input_tokens_seen": 94469568, + "step": 77620 + }, + { + "epoch": 8.645172068159038, + "grad_norm": 0.6025518774986267, + "learning_rate": 3.4988487288151085e-05, + "loss": 0.0168, + "num_input_tokens_seen": 94475776, + "step": 77625 + }, + { + "epoch": 8.645728923042656, + "grad_norm": 0.012649189680814743, + "learning_rate": 3.498625986150832e-05, + "loss": 0.0453, + "num_input_tokens_seen": 94481824, + "step": 77630 + }, + { + "epoch": 8.646285777926272, + "grad_norm": 0.014412588439881802, + "learning_rate": 3.4984032340537335e-05, + "loss": 0.082, + "num_input_tokens_seen": 94488160, + "step": 77635 + }, + { + "epoch": 8.64684263280989, + "grad_norm": 0.007934870198369026, + "learning_rate": 3.498180472525916e-05, + "loss": 0.0401, + "num_input_tokens_seen": 94494752, + "step": 77640 + }, + { + "epoch": 8.647399487693507, + "grad_norm": 0.0776517391204834, + "learning_rate": 3.4979577015694846e-05, + "loss": 0.1098, + "num_input_tokens_seen": 94500896, + "step": 77645 + }, + { + "epoch": 8.647956342577125, + "grad_norm": 4.27173376083374, + "learning_rate": 3.497734921186543e-05, + "loss": 0.0856, + "num_input_tokens_seen": 94507008, + "step": 77650 + }, + { + "epoch": 8.648513197460742, + "grad_norm": 0.3878321349620819, + "learning_rate": 3.497512131379196e-05, + "loss": 0.0199, + "num_input_tokens_seen": 94512672, + "step": 77655 + }, + { + "epoch": 8.64907005234436, + "grad_norm": 0.06930934637784958, + "learning_rate": 3.4972893321495474e-05, + "loss": 0.0393, + "num_input_tokens_seen": 94518624, + "step": 77660 + }, + { + "epoch": 8.649626907227976, + "grad_norm": 0.0009217319311574101, + "learning_rate": 3.4970665234997024e-05, + "loss": 0.0798, + "num_input_tokens_seen": 94524800, + "step": 77665 + }, + { + "epoch": 8.650183762111594, + "grad_norm": 0.08439735323190689, + "learning_rate": 3.496843705431765e-05, + "loss": 0.0139, + "num_input_tokens_seen": 94531296, + "step": 77670 + }, + { + "epoch": 8.650740616995211, + "grad_norm": 0.05501628294587135, + "learning_rate": 3.49662087794784e-05, + "loss": 0.0353, + "num_input_tokens_seen": 94537440, + "step": 77675 + }, + { + "epoch": 8.651297471878829, + "grad_norm": 0.003406199160963297, + "learning_rate": 3.496398041050033e-05, + "loss": 0.0302, + "num_input_tokens_seen": 94543360, + "step": 77680 + }, + { + "epoch": 8.651854326762447, + "grad_norm": 0.004387183580547571, + "learning_rate": 3.4961751947404475e-05, + "loss": 0.0288, + "num_input_tokens_seen": 94549568, + "step": 77685 + }, + { + "epoch": 8.652411181646062, + "grad_norm": 0.016349099576473236, + "learning_rate": 3.4959523390211896e-05, + "loss": 0.0814, + "num_input_tokens_seen": 94555680, + "step": 77690 + }, + { + "epoch": 8.65296803652968, + "grad_norm": 0.00864951778203249, + "learning_rate": 3.495729473894364e-05, + "loss": 0.0858, + "num_input_tokens_seen": 94561984, + "step": 77695 + }, + { + "epoch": 8.653524891413298, + "grad_norm": 0.3577433228492737, + "learning_rate": 3.495506599362075e-05, + "loss": 0.0243, + "num_input_tokens_seen": 94568064, + "step": 77700 + }, + { + "epoch": 8.654081746296916, + "grad_norm": 1.0218658447265625, + "learning_rate": 3.495283715426429e-05, + "loss": 0.0325, + "num_input_tokens_seen": 94573984, + "step": 77705 + }, + { + "epoch": 8.654638601180533, + "grad_norm": 1.0781241655349731, + "learning_rate": 3.495060822089531e-05, + "loss": 0.1727, + "num_input_tokens_seen": 94580032, + "step": 77710 + }, + { + "epoch": 8.655195456064149, + "grad_norm": 0.8557741045951843, + "learning_rate": 3.494837919353487e-05, + "loss": 0.0578, + "num_input_tokens_seen": 94586176, + "step": 77715 + }, + { + "epoch": 8.655752310947767, + "grad_norm": 0.025957593694329262, + "learning_rate": 3.4946150072204006e-05, + "loss": 0.0434, + "num_input_tokens_seen": 94592320, + "step": 77720 + }, + { + "epoch": 8.656309165831384, + "grad_norm": 0.053555067628622055, + "learning_rate": 3.494392085692378e-05, + "loss": 0.1014, + "num_input_tokens_seen": 94598464, + "step": 77725 + }, + { + "epoch": 8.656866020715002, + "grad_norm": 0.36385223269462585, + "learning_rate": 3.4941691547715275e-05, + "loss": 0.0136, + "num_input_tokens_seen": 94604608, + "step": 77730 + }, + { + "epoch": 8.65742287559862, + "grad_norm": 0.07253067940473557, + "learning_rate": 3.493946214459952e-05, + "loss": 0.038, + "num_input_tokens_seen": 94610720, + "step": 77735 + }, + { + "epoch": 8.657979730482236, + "grad_norm": 0.4822308421134949, + "learning_rate": 3.493723264759757e-05, + "loss": 0.0228, + "num_input_tokens_seen": 94617056, + "step": 77740 + }, + { + "epoch": 8.658536585365853, + "grad_norm": 0.21284139156341553, + "learning_rate": 3.493500305673051e-05, + "loss": 0.0889, + "num_input_tokens_seen": 94623040, + "step": 77745 + }, + { + "epoch": 8.659093440249471, + "grad_norm": 0.7566210627555847, + "learning_rate": 3.4932773372019376e-05, + "loss": 0.0591, + "num_input_tokens_seen": 94629408, + "step": 77750 + }, + { + "epoch": 8.659650295133089, + "grad_norm": 0.4213503897190094, + "learning_rate": 3.4930543593485254e-05, + "loss": 0.0186, + "num_input_tokens_seen": 94635808, + "step": 77755 + }, + { + "epoch": 8.660207150016706, + "grad_norm": 1.6062440872192383, + "learning_rate": 3.492831372114918e-05, + "loss": 0.1028, + "num_input_tokens_seen": 94641600, + "step": 77760 + }, + { + "epoch": 8.660764004900322, + "grad_norm": 0.0001212424976984039, + "learning_rate": 3.492608375503223e-05, + "loss": 0.0694, + "num_input_tokens_seen": 94647840, + "step": 77765 + }, + { + "epoch": 8.66132085978394, + "grad_norm": 0.0012870831415057182, + "learning_rate": 3.492385369515547e-05, + "loss": 0.0516, + "num_input_tokens_seen": 94653664, + "step": 77770 + }, + { + "epoch": 8.661877714667558, + "grad_norm": 0.003105924231931567, + "learning_rate": 3.4921623541539955e-05, + "loss": 0.0236, + "num_input_tokens_seen": 94659424, + "step": 77775 + }, + { + "epoch": 8.662434569551175, + "grad_norm": 0.8622638583183289, + "learning_rate": 3.491939329420677e-05, + "loss": 0.0951, + "num_input_tokens_seen": 94665344, + "step": 77780 + }, + { + "epoch": 8.662991424434793, + "grad_norm": 1.8948991298675537, + "learning_rate": 3.491716295317695e-05, + "loss": 0.1662, + "num_input_tokens_seen": 94671360, + "step": 77785 + }, + { + "epoch": 8.663548279318409, + "grad_norm": 0.01625313051044941, + "learning_rate": 3.4914932518471585e-05, + "loss": 0.0051, + "num_input_tokens_seen": 94677472, + "step": 77790 + }, + { + "epoch": 8.664105134202027, + "grad_norm": 0.0069144475273787975, + "learning_rate": 3.491270199011175e-05, + "loss": 0.0039, + "num_input_tokens_seen": 94683808, + "step": 77795 + }, + { + "epoch": 8.664661989085644, + "grad_norm": 0.6397637724876404, + "learning_rate": 3.491047136811849e-05, + "loss": 0.0406, + "num_input_tokens_seen": 94689920, + "step": 77800 + }, + { + "epoch": 8.665218843969262, + "grad_norm": 1.6663073301315308, + "learning_rate": 3.4908240652512897e-05, + "loss": 0.0168, + "num_input_tokens_seen": 94696096, + "step": 77805 + }, + { + "epoch": 8.66577569885288, + "grad_norm": 1.2629508972167969, + "learning_rate": 3.490600984331603e-05, + "loss": 0.0318, + "num_input_tokens_seen": 94702464, + "step": 77810 + }, + { + "epoch": 8.666332553736495, + "grad_norm": 0.9321048259735107, + "learning_rate": 3.490377894054896e-05, + "loss": 0.1554, + "num_input_tokens_seen": 94708416, + "step": 77815 + }, + { + "epoch": 8.666889408620113, + "grad_norm": 0.0023654825054109097, + "learning_rate": 3.490154794423276e-05, + "loss": 0.0819, + "num_input_tokens_seen": 94714912, + "step": 77820 + }, + { + "epoch": 8.66744626350373, + "grad_norm": 0.05594846233725548, + "learning_rate": 3.489931685438852e-05, + "loss": 0.0389, + "num_input_tokens_seen": 94720960, + "step": 77825 + }, + { + "epoch": 8.668003118387348, + "grad_norm": 0.01422036997973919, + "learning_rate": 3.489708567103729e-05, + "loss": 0.0271, + "num_input_tokens_seen": 94727584, + "step": 77830 + }, + { + "epoch": 8.668559973270966, + "grad_norm": 0.012631184421479702, + "learning_rate": 3.489485439420016e-05, + "loss": 0.0109, + "num_input_tokens_seen": 94733600, + "step": 77835 + }, + { + "epoch": 8.669116828154582, + "grad_norm": 0.18225644528865814, + "learning_rate": 3.48926230238982e-05, + "loss": 0.0781, + "num_input_tokens_seen": 94739584, + "step": 77840 + }, + { + "epoch": 8.6696736830382, + "grad_norm": 0.0006123361526988447, + "learning_rate": 3.489039156015249e-05, + "loss": 0.0009, + "num_input_tokens_seen": 94745760, + "step": 77845 + }, + { + "epoch": 8.670230537921817, + "grad_norm": 0.09997012466192245, + "learning_rate": 3.488816000298412e-05, + "loss": 0.0272, + "num_input_tokens_seen": 94752128, + "step": 77850 + }, + { + "epoch": 8.670787392805435, + "grad_norm": 0.11513121426105499, + "learning_rate": 3.4885928352414144e-05, + "loss": 0.0172, + "num_input_tokens_seen": 94758336, + "step": 77855 + }, + { + "epoch": 8.671344247689053, + "grad_norm": 0.5346808433532715, + "learning_rate": 3.488369660846365e-05, + "loss": 0.1169, + "num_input_tokens_seen": 94764448, + "step": 77860 + }, + { + "epoch": 8.671901102572669, + "grad_norm": 0.31750109791755676, + "learning_rate": 3.488146477115373e-05, + "loss": 0.0687, + "num_input_tokens_seen": 94770272, + "step": 77865 + }, + { + "epoch": 8.672457957456286, + "grad_norm": 0.7140223979949951, + "learning_rate": 3.487923284050546e-05, + "loss": 0.0048, + "num_input_tokens_seen": 94776640, + "step": 77870 + }, + { + "epoch": 8.673014812339904, + "grad_norm": 1.2044726610183716, + "learning_rate": 3.4877000816539915e-05, + "loss": 0.0466, + "num_input_tokens_seen": 94783168, + "step": 77875 + }, + { + "epoch": 8.673571667223522, + "grad_norm": 0.006915341597050428, + "learning_rate": 3.4874768699278186e-05, + "loss": 0.0404, + "num_input_tokens_seen": 94788768, + "step": 77880 + }, + { + "epoch": 8.67412852210714, + "grad_norm": 0.0010626550065353513, + "learning_rate": 3.487253648874136e-05, + "loss": 0.0039, + "num_input_tokens_seen": 94795232, + "step": 77885 + }, + { + "epoch": 8.674685376990757, + "grad_norm": 0.02156628854572773, + "learning_rate": 3.487030418495051e-05, + "loss": 0.0239, + "num_input_tokens_seen": 94801344, + "step": 77890 + }, + { + "epoch": 8.675242231874373, + "grad_norm": 0.44707533717155457, + "learning_rate": 3.486807178792674e-05, + "loss": 0.0191, + "num_input_tokens_seen": 94807328, + "step": 77895 + }, + { + "epoch": 8.67579908675799, + "grad_norm": 0.027948712930083275, + "learning_rate": 3.486583929769112e-05, + "loss": 0.0418, + "num_input_tokens_seen": 94813856, + "step": 77900 + }, + { + "epoch": 8.676355941641608, + "grad_norm": 0.0007576649659313262, + "learning_rate": 3.486360671426473e-05, + "loss": 0.0415, + "num_input_tokens_seen": 94820096, + "step": 77905 + }, + { + "epoch": 8.676912796525226, + "grad_norm": 0.15485140681266785, + "learning_rate": 3.4861374037668694e-05, + "loss": 0.0366, + "num_input_tokens_seen": 94826208, + "step": 77910 + }, + { + "epoch": 8.677469651408844, + "grad_norm": 0.9478199481964111, + "learning_rate": 3.485914126792407e-05, + "loss": 0.114, + "num_input_tokens_seen": 94832352, + "step": 77915 + }, + { + "epoch": 8.67802650629246, + "grad_norm": 2.2368125915527344, + "learning_rate": 3.4856908405051945e-05, + "loss": 0.1255, + "num_input_tokens_seen": 94838528, + "step": 77920 + }, + { + "epoch": 8.678583361176077, + "grad_norm": 1.4851144552230835, + "learning_rate": 3.4854675449073445e-05, + "loss": 0.0498, + "num_input_tokens_seen": 94844576, + "step": 77925 + }, + { + "epoch": 8.679140216059695, + "grad_norm": 0.0772772803902626, + "learning_rate": 3.485244240000962e-05, + "loss": 0.0087, + "num_input_tokens_seen": 94850784, + "step": 77930 + }, + { + "epoch": 8.679697070943313, + "grad_norm": 0.05916498601436615, + "learning_rate": 3.48502092578816e-05, + "loss": 0.1571, + "num_input_tokens_seen": 94857152, + "step": 77935 + }, + { + "epoch": 8.68025392582693, + "grad_norm": 0.6076157093048096, + "learning_rate": 3.484797602271045e-05, + "loss": 0.0251, + "num_input_tokens_seen": 94863520, + "step": 77940 + }, + { + "epoch": 8.680810780710546, + "grad_norm": 0.05538558214902878, + "learning_rate": 3.4845742694517285e-05, + "loss": 0.0029, + "num_input_tokens_seen": 94870176, + "step": 77945 + }, + { + "epoch": 8.681367635594164, + "grad_norm": 0.0021381601691246033, + "learning_rate": 3.4843509273323184e-05, + "loss": 0.1401, + "num_input_tokens_seen": 94876512, + "step": 77950 + }, + { + "epoch": 8.681924490477781, + "grad_norm": 0.09859276562929153, + "learning_rate": 3.4841275759149253e-05, + "loss": 0.0148, + "num_input_tokens_seen": 94882432, + "step": 77955 + }, + { + "epoch": 8.6824813453614, + "grad_norm": 0.31317251920700073, + "learning_rate": 3.4839042152016594e-05, + "loss": 0.0242, + "num_input_tokens_seen": 94888384, + "step": 77960 + }, + { + "epoch": 8.683038200245017, + "grad_norm": 0.909775972366333, + "learning_rate": 3.483680845194629e-05, + "loss": 0.0408, + "num_input_tokens_seen": 94894240, + "step": 77965 + }, + { + "epoch": 8.683595055128633, + "grad_norm": 0.2644176185131073, + "learning_rate": 3.483457465895946e-05, + "loss": 0.0679, + "num_input_tokens_seen": 94900448, + "step": 77970 + }, + { + "epoch": 8.68415191001225, + "grad_norm": 0.26607897877693176, + "learning_rate": 3.4832340773077184e-05, + "loss": 0.0132, + "num_input_tokens_seen": 94906848, + "step": 77975 + }, + { + "epoch": 8.684708764895868, + "grad_norm": 0.39552682638168335, + "learning_rate": 3.4830106794320576e-05, + "loss": 0.0449, + "num_input_tokens_seen": 94912928, + "step": 77980 + }, + { + "epoch": 8.685265619779486, + "grad_norm": 0.024926047772169113, + "learning_rate": 3.482787272271073e-05, + "loss": 0.1007, + "num_input_tokens_seen": 94919072, + "step": 77985 + }, + { + "epoch": 8.685822474663103, + "grad_norm": 0.8931904435157776, + "learning_rate": 3.4825638558268754e-05, + "loss": 0.0849, + "num_input_tokens_seen": 94924928, + "step": 77990 + }, + { + "epoch": 8.68637932954672, + "grad_norm": 0.11130089312791824, + "learning_rate": 3.482340430101575e-05, + "loss": 0.0251, + "num_input_tokens_seen": 94930720, + "step": 77995 + }, + { + "epoch": 8.686936184430337, + "grad_norm": 0.28962501883506775, + "learning_rate": 3.482116995097282e-05, + "loss": 0.0303, + "num_input_tokens_seen": 94936800, + "step": 78000 + }, + { + "epoch": 8.687493039313955, + "grad_norm": 0.05844372510910034, + "learning_rate": 3.4818935508161074e-05, + "loss": 0.0728, + "num_input_tokens_seen": 94942464, + "step": 78005 + }, + { + "epoch": 8.688049894197572, + "grad_norm": 0.006075448356568813, + "learning_rate": 3.481670097260162e-05, + "loss": 0.0231, + "num_input_tokens_seen": 94948672, + "step": 78010 + }, + { + "epoch": 8.68860674908119, + "grad_norm": 0.1976308822631836, + "learning_rate": 3.4814466344315556e-05, + "loss": 0.0511, + "num_input_tokens_seen": 94954944, + "step": 78015 + }, + { + "epoch": 8.689163603964808, + "grad_norm": 0.08083244413137436, + "learning_rate": 3.4812231623323994e-05, + "loss": 0.012, + "num_input_tokens_seen": 94961056, + "step": 78020 + }, + { + "epoch": 8.689720458848424, + "grad_norm": 1.7650690078735352, + "learning_rate": 3.480999680964804e-05, + "loss": 0.1332, + "num_input_tokens_seen": 94967168, + "step": 78025 + }, + { + "epoch": 8.690277313732041, + "grad_norm": 1.7159096002578735, + "learning_rate": 3.480776190330881e-05, + "loss": 0.1445, + "num_input_tokens_seen": 94973216, + "step": 78030 + }, + { + "epoch": 8.690834168615659, + "grad_norm": 0.2412412464618683, + "learning_rate": 3.480552690432741e-05, + "loss": 0.0804, + "num_input_tokens_seen": 94979008, + "step": 78035 + }, + { + "epoch": 8.691391023499277, + "grad_norm": 0.002264736220240593, + "learning_rate": 3.480329181272495e-05, + "loss": 0.0395, + "num_input_tokens_seen": 94984704, + "step": 78040 + }, + { + "epoch": 8.691947878382894, + "grad_norm": 0.9683325290679932, + "learning_rate": 3.480105662852255e-05, + "loss": 0.1087, + "num_input_tokens_seen": 94990784, + "step": 78045 + }, + { + "epoch": 8.69250473326651, + "grad_norm": 0.03686971962451935, + "learning_rate": 3.4798821351741314e-05, + "loss": 0.055, + "num_input_tokens_seen": 94996864, + "step": 78050 + }, + { + "epoch": 8.693061588150128, + "grad_norm": 0.0005177056300453842, + "learning_rate": 3.4796585982402355e-05, + "loss": 0.0527, + "num_input_tokens_seen": 95003168, + "step": 78055 + }, + { + "epoch": 8.693618443033746, + "grad_norm": 0.01813933067023754, + "learning_rate": 3.4794350520526795e-05, + "loss": 0.0388, + "num_input_tokens_seen": 95009152, + "step": 78060 + }, + { + "epoch": 8.694175297917363, + "grad_norm": 1.1951626539230347, + "learning_rate": 3.4792114966135754e-05, + "loss": 0.0449, + "num_input_tokens_seen": 95015328, + "step": 78065 + }, + { + "epoch": 8.69473215280098, + "grad_norm": 0.33329877257347107, + "learning_rate": 3.478987931925034e-05, + "loss": 0.0611, + "num_input_tokens_seen": 95021280, + "step": 78070 + }, + { + "epoch": 8.695289007684597, + "grad_norm": 0.784325361251831, + "learning_rate": 3.478764357989167e-05, + "loss": 0.0437, + "num_input_tokens_seen": 95027520, + "step": 78075 + }, + { + "epoch": 8.695845862568214, + "grad_norm": 0.05305355042219162, + "learning_rate": 3.4785407748080864e-05, + "loss": 0.0171, + "num_input_tokens_seen": 95033504, + "step": 78080 + }, + { + "epoch": 8.696402717451832, + "grad_norm": 0.3892225921154022, + "learning_rate": 3.478317182383904e-05, + "loss": 0.0653, + "num_input_tokens_seen": 95040160, + "step": 78085 + }, + { + "epoch": 8.69695957233545, + "grad_norm": 0.3053373098373413, + "learning_rate": 3.478093580718732e-05, + "loss": 0.0734, + "num_input_tokens_seen": 95046112, + "step": 78090 + }, + { + "epoch": 8.697516427219067, + "grad_norm": 0.0002564299793448299, + "learning_rate": 3.4778699698146826e-05, + "loss": 0.0605, + "num_input_tokens_seen": 95051936, + "step": 78095 + }, + { + "epoch": 8.698073282102683, + "grad_norm": 0.6063117980957031, + "learning_rate": 3.477646349673868e-05, + "loss": 0.0088, + "num_input_tokens_seen": 95057856, + "step": 78100 + }, + { + "epoch": 8.698630136986301, + "grad_norm": 0.19969549775123596, + "learning_rate": 3.4774227202984e-05, + "loss": 0.0371, + "num_input_tokens_seen": 95064416, + "step": 78105 + }, + { + "epoch": 8.699186991869919, + "grad_norm": 0.005225468892604113, + "learning_rate": 3.477199081690392e-05, + "loss": 0.1066, + "num_input_tokens_seen": 95070496, + "step": 78110 + }, + { + "epoch": 8.699743846753536, + "grad_norm": 0.6229292154312134, + "learning_rate": 3.476975433851956e-05, + "loss": 0.0539, + "num_input_tokens_seen": 95076384, + "step": 78115 + }, + { + "epoch": 8.700300701637154, + "grad_norm": 1.148833155632019, + "learning_rate": 3.476751776785203e-05, + "loss": 0.1545, + "num_input_tokens_seen": 95082528, + "step": 78120 + }, + { + "epoch": 8.70085755652077, + "grad_norm": 0.0012137481244280934, + "learning_rate": 3.476528110492248e-05, + "loss": 0.0435, + "num_input_tokens_seen": 95088544, + "step": 78125 + }, + { + "epoch": 8.701414411404388, + "grad_norm": 0.0007227659807540476, + "learning_rate": 3.476304434975202e-05, + "loss": 0.0634, + "num_input_tokens_seen": 95094656, + "step": 78130 + }, + { + "epoch": 8.701971266288005, + "grad_norm": 0.00010863257193705067, + "learning_rate": 3.4760807502361783e-05, + "loss": 0.0009, + "num_input_tokens_seen": 95100992, + "step": 78135 + }, + { + "epoch": 8.702528121171623, + "grad_norm": 2.982093572616577, + "learning_rate": 3.4758570562772906e-05, + "loss": 0.0956, + "num_input_tokens_seen": 95106752, + "step": 78140 + }, + { + "epoch": 8.70308497605524, + "grad_norm": 0.5085247159004211, + "learning_rate": 3.47563335310065e-05, + "loss": 0.1034, + "num_input_tokens_seen": 95112768, + "step": 78145 + }, + { + "epoch": 8.703641830938857, + "grad_norm": 0.0007947580888867378, + "learning_rate": 3.4754096407083725e-05, + "loss": 0.049, + "num_input_tokens_seen": 95119008, + "step": 78150 + }, + { + "epoch": 8.704198685822474, + "grad_norm": 0.26593920588493347, + "learning_rate": 3.475185919102568e-05, + "loss": 0.1416, + "num_input_tokens_seen": 95124928, + "step": 78155 + }, + { + "epoch": 8.704755540706092, + "grad_norm": 0.05971379205584526, + "learning_rate": 3.474962188285351e-05, + "loss": 0.0344, + "num_input_tokens_seen": 95131200, + "step": 78160 + }, + { + "epoch": 8.70531239558971, + "grad_norm": 1.5054363012313843, + "learning_rate": 3.474738448258836e-05, + "loss": 0.0626, + "num_input_tokens_seen": 95137024, + "step": 78165 + }, + { + "epoch": 8.705869250473327, + "grad_norm": 0.00973124336451292, + "learning_rate": 3.474514699025135e-05, + "loss": 0.01, + "num_input_tokens_seen": 95143552, + "step": 78170 + }, + { + "epoch": 8.706426105356943, + "grad_norm": 0.7143281698226929, + "learning_rate": 3.474290940586362e-05, + "loss": 0.032, + "num_input_tokens_seen": 95149088, + "step": 78175 + }, + { + "epoch": 8.70698296024056, + "grad_norm": 0.17798854410648346, + "learning_rate": 3.474067172944631e-05, + "loss": 0.017, + "num_input_tokens_seen": 95155072, + "step": 78180 + }, + { + "epoch": 8.707539815124179, + "grad_norm": 0.05510491877794266, + "learning_rate": 3.4738433961020546e-05, + "loss": 0.1204, + "num_input_tokens_seen": 95161152, + "step": 78185 + }, + { + "epoch": 8.708096670007796, + "grad_norm": 1.2117598056793213, + "learning_rate": 3.473619610060747e-05, + "loss": 0.0997, + "num_input_tokens_seen": 95166976, + "step": 78190 + }, + { + "epoch": 8.708653524891414, + "grad_norm": 1.8494189977645874, + "learning_rate": 3.473395814822822e-05, + "loss": 0.1392, + "num_input_tokens_seen": 95173216, + "step": 78195 + }, + { + "epoch": 8.70921037977503, + "grad_norm": 0.6621688008308411, + "learning_rate": 3.473172010390394e-05, + "loss": 0.0231, + "num_input_tokens_seen": 95178944, + "step": 78200 + }, + { + "epoch": 8.709767234658647, + "grad_norm": 0.2710970640182495, + "learning_rate": 3.472948196765576e-05, + "loss": 0.0514, + "num_input_tokens_seen": 95185088, + "step": 78205 + }, + { + "epoch": 8.710324089542265, + "grad_norm": 0.0006032605306245387, + "learning_rate": 3.472724373950483e-05, + "loss": 0.056, + "num_input_tokens_seen": 95191296, + "step": 78210 + }, + { + "epoch": 8.710880944425883, + "grad_norm": 0.10611195117235184, + "learning_rate": 3.4725005419472295e-05, + "loss": 0.0101, + "num_input_tokens_seen": 95197504, + "step": 78215 + }, + { + "epoch": 8.7114377993095, + "grad_norm": 0.00122794636990875, + "learning_rate": 3.4722767007579294e-05, + "loss": 0.07, + "num_input_tokens_seen": 95203424, + "step": 78220 + }, + { + "epoch": 8.711994654193116, + "grad_norm": 0.11410211026668549, + "learning_rate": 3.472052850384696e-05, + "loss": 0.1916, + "num_input_tokens_seen": 95209600, + "step": 78225 + }, + { + "epoch": 8.712551509076734, + "grad_norm": 0.0005835960619151592, + "learning_rate": 3.4718289908296454e-05, + "loss": 0.0278, + "num_input_tokens_seen": 95215776, + "step": 78230 + }, + { + "epoch": 8.713108363960352, + "grad_norm": 0.020052596926689148, + "learning_rate": 3.471605122094891e-05, + "loss": 0.0422, + "num_input_tokens_seen": 95222048, + "step": 78235 + }, + { + "epoch": 8.71366521884397, + "grad_norm": 0.0045072766952216625, + "learning_rate": 3.4713812441825476e-05, + "loss": 0.0269, + "num_input_tokens_seen": 95228064, + "step": 78240 + }, + { + "epoch": 8.714222073727587, + "grad_norm": 0.15446734428405762, + "learning_rate": 3.471157357094731e-05, + "loss": 0.0446, + "num_input_tokens_seen": 95233632, + "step": 78245 + }, + { + "epoch": 8.714778928611205, + "grad_norm": 0.1363077610731125, + "learning_rate": 3.4709334608335535e-05, + "loss": 0.002, + "num_input_tokens_seen": 95240192, + "step": 78250 + }, + { + "epoch": 8.71533578349482, + "grad_norm": 1.527706265449524, + "learning_rate": 3.470709555401133e-05, + "loss": 0.0892, + "num_input_tokens_seen": 95246112, + "step": 78255 + }, + { + "epoch": 8.715892638378438, + "grad_norm": 0.20585325360298157, + "learning_rate": 3.470485640799582e-05, + "loss": 0.0325, + "num_input_tokens_seen": 95251712, + "step": 78260 + }, + { + "epoch": 8.716449493262056, + "grad_norm": 0.0009430722566321492, + "learning_rate": 3.470261717031017e-05, + "loss": 0.071, + "num_input_tokens_seen": 95257568, + "step": 78265 + }, + { + "epoch": 8.717006348145674, + "grad_norm": 0.8336083292961121, + "learning_rate": 3.470037784097553e-05, + "loss": 0.1064, + "num_input_tokens_seen": 95263744, + "step": 78270 + }, + { + "epoch": 8.717563203029291, + "grad_norm": 0.8786649703979492, + "learning_rate": 3.469813842001305e-05, + "loss": 0.0675, + "num_input_tokens_seen": 95269984, + "step": 78275 + }, + { + "epoch": 8.718120057912907, + "grad_norm": 1.8265706300735474, + "learning_rate": 3.469589890744388e-05, + "loss": 0.093, + "num_input_tokens_seen": 95275968, + "step": 78280 + }, + { + "epoch": 8.718676912796525, + "grad_norm": 0.36817657947540283, + "learning_rate": 3.469365930328917e-05, + "loss": 0.0938, + "num_input_tokens_seen": 95282080, + "step": 78285 + }, + { + "epoch": 8.719233767680143, + "grad_norm": 0.002908907597884536, + "learning_rate": 3.469141960757009e-05, + "loss": 0.1143, + "num_input_tokens_seen": 95288224, + "step": 78290 + }, + { + "epoch": 8.71979062256376, + "grad_norm": 0.03060164488852024, + "learning_rate": 3.4689179820307786e-05, + "loss": 0.044, + "num_input_tokens_seen": 95294720, + "step": 78295 + }, + { + "epoch": 8.720347477447378, + "grad_norm": 0.11871221661567688, + "learning_rate": 3.468693994152342e-05, + "loss": 0.0177, + "num_input_tokens_seen": 95300768, + "step": 78300 + }, + { + "epoch": 8.720904332330994, + "grad_norm": 0.13191571831703186, + "learning_rate": 3.468469997123814e-05, + "loss": 0.0316, + "num_input_tokens_seen": 95306816, + "step": 78305 + }, + { + "epoch": 8.721461187214611, + "grad_norm": 1.0906447172164917, + "learning_rate": 3.4682459909473106e-05, + "loss": 0.0451, + "num_input_tokens_seen": 95312896, + "step": 78310 + }, + { + "epoch": 8.72201804209823, + "grad_norm": 0.3280802071094513, + "learning_rate": 3.4680219756249486e-05, + "loss": 0.0521, + "num_input_tokens_seen": 95318944, + "step": 78315 + }, + { + "epoch": 8.722574896981847, + "grad_norm": 0.015419010072946548, + "learning_rate": 3.467797951158843e-05, + "loss": 0.0206, + "num_input_tokens_seen": 95325184, + "step": 78320 + }, + { + "epoch": 8.723131751865465, + "grad_norm": 0.12158488482236862, + "learning_rate": 3.4675739175511106e-05, + "loss": 0.0701, + "num_input_tokens_seen": 95331424, + "step": 78325 + }, + { + "epoch": 8.72368860674908, + "grad_norm": 0.5091155171394348, + "learning_rate": 3.467349874803868e-05, + "loss": 0.0148, + "num_input_tokens_seen": 95337632, + "step": 78330 + }, + { + "epoch": 8.724245461632698, + "grad_norm": 0.9917527437210083, + "learning_rate": 3.46712582291923e-05, + "loss": 0.0712, + "num_input_tokens_seen": 95343456, + "step": 78335 + }, + { + "epoch": 8.724802316516316, + "grad_norm": 0.7680342197418213, + "learning_rate": 3.466901761899314e-05, + "loss": 0.0798, + "num_input_tokens_seen": 95349152, + "step": 78340 + }, + { + "epoch": 8.725359171399933, + "grad_norm": 1.0106335878372192, + "learning_rate": 3.466677691746236e-05, + "loss": 0.1546, + "num_input_tokens_seen": 95355296, + "step": 78345 + }, + { + "epoch": 8.725916026283551, + "grad_norm": 0.035036299377679825, + "learning_rate": 3.466453612462113e-05, + "loss": 0.0266, + "num_input_tokens_seen": 95361248, + "step": 78350 + }, + { + "epoch": 8.726472881167167, + "grad_norm": 0.3431946635246277, + "learning_rate": 3.466229524049062e-05, + "loss": 0.0103, + "num_input_tokens_seen": 95367424, + "step": 78355 + }, + { + "epoch": 8.727029736050785, + "grad_norm": 0.47028952836990356, + "learning_rate": 3.4660054265091976e-05, + "loss": 0.1111, + "num_input_tokens_seen": 95373344, + "step": 78360 + }, + { + "epoch": 8.727586590934402, + "grad_norm": 0.12115328758955002, + "learning_rate": 3.465781319844639e-05, + "loss": 0.0308, + "num_input_tokens_seen": 95379552, + "step": 78365 + }, + { + "epoch": 8.72814344581802, + "grad_norm": 0.2421799898147583, + "learning_rate": 3.465557204057501e-05, + "loss": 0.0136, + "num_input_tokens_seen": 95385632, + "step": 78370 + }, + { + "epoch": 8.728700300701638, + "grad_norm": 0.4526711106300354, + "learning_rate": 3.4653330791499026e-05, + "loss": 0.0206, + "num_input_tokens_seen": 95391712, + "step": 78375 + }, + { + "epoch": 8.729257155585255, + "grad_norm": 0.1903945505619049, + "learning_rate": 3.465108945123959e-05, + "loss": 0.086, + "num_input_tokens_seen": 95397312, + "step": 78380 + }, + { + "epoch": 8.729814010468871, + "grad_norm": 0.0015278669307008386, + "learning_rate": 3.464884801981789e-05, + "loss": 0.1319, + "num_input_tokens_seen": 95402880, + "step": 78385 + }, + { + "epoch": 8.730370865352489, + "grad_norm": 0.4113805294036865, + "learning_rate": 3.4646606497255094e-05, + "loss": 0.0216, + "num_input_tokens_seen": 95409024, + "step": 78390 + }, + { + "epoch": 8.730927720236107, + "grad_norm": 0.0828552395105362, + "learning_rate": 3.4644364883572354e-05, + "loss": 0.0274, + "num_input_tokens_seen": 95415360, + "step": 78395 + }, + { + "epoch": 8.731484575119724, + "grad_norm": 0.006781658157706261, + "learning_rate": 3.4642123178790875e-05, + "loss": 0.1426, + "num_input_tokens_seen": 95420576, + "step": 78400 + }, + { + "epoch": 8.732041430003342, + "grad_norm": 1.6003460884094238, + "learning_rate": 3.463988138293181e-05, + "loss": 0.0923, + "num_input_tokens_seen": 95426944, + "step": 78405 + }, + { + "epoch": 8.732598284886958, + "grad_norm": 0.7159802913665771, + "learning_rate": 3.463763949601635e-05, + "loss": 0.0553, + "num_input_tokens_seen": 95433440, + "step": 78410 + }, + { + "epoch": 8.733155139770576, + "grad_norm": 0.3400329351425171, + "learning_rate": 3.463539751806566e-05, + "loss": 0.0067, + "num_input_tokens_seen": 95439776, + "step": 78415 + }, + { + "epoch": 8.733711994654193, + "grad_norm": 0.00092743628192693, + "learning_rate": 3.463315544910092e-05, + "loss": 0.0115, + "num_input_tokens_seen": 95445952, + "step": 78420 + }, + { + "epoch": 8.734268849537811, + "grad_norm": 0.016503831371665, + "learning_rate": 3.463091328914331e-05, + "loss": 0.1209, + "num_input_tokens_seen": 95452128, + "step": 78425 + }, + { + "epoch": 8.734825704421429, + "grad_norm": 0.9854754209518433, + "learning_rate": 3.4628671038214e-05, + "loss": 0.0795, + "num_input_tokens_seen": 95457984, + "step": 78430 + }, + { + "epoch": 8.735382559305044, + "grad_norm": 0.009535841643810272, + "learning_rate": 3.462642869633419e-05, + "loss": 0.0777, + "num_input_tokens_seen": 95463936, + "step": 78435 + }, + { + "epoch": 8.735939414188662, + "grad_norm": 0.377638578414917, + "learning_rate": 3.462418626352504e-05, + "loss": 0.0215, + "num_input_tokens_seen": 95469920, + "step": 78440 + }, + { + "epoch": 8.73649626907228, + "grad_norm": 0.1240278035402298, + "learning_rate": 3.462194373980774e-05, + "loss": 0.0459, + "num_input_tokens_seen": 95476064, + "step": 78445 + }, + { + "epoch": 8.737053123955898, + "grad_norm": 1.9314204454421997, + "learning_rate": 3.4619701125203476e-05, + "loss": 0.1671, + "num_input_tokens_seen": 95482144, + "step": 78450 + }, + { + "epoch": 8.737609978839515, + "grad_norm": 0.08928439766168594, + "learning_rate": 3.461745841973343e-05, + "loss": 0.1304, + "num_input_tokens_seen": 95488288, + "step": 78455 + }, + { + "epoch": 8.738166833723131, + "grad_norm": 0.11885393410921097, + "learning_rate": 3.4615215623418785e-05, + "loss": 0.0612, + "num_input_tokens_seen": 95494240, + "step": 78460 + }, + { + "epoch": 8.738723688606749, + "grad_norm": 0.07072476297616959, + "learning_rate": 3.461297273628071e-05, + "loss": 0.1483, + "num_input_tokens_seen": 95500288, + "step": 78465 + }, + { + "epoch": 8.739280543490366, + "grad_norm": 0.2034914791584015, + "learning_rate": 3.461072975834042e-05, + "loss": 0.0225, + "num_input_tokens_seen": 95506592, + "step": 78470 + }, + { + "epoch": 8.739837398373984, + "grad_norm": 0.05318397656083107, + "learning_rate": 3.4608486689619085e-05, + "loss": 0.0048, + "num_input_tokens_seen": 95512960, + "step": 78475 + }, + { + "epoch": 8.740394253257602, + "grad_norm": 0.30640003085136414, + "learning_rate": 3.460624353013789e-05, + "loss": 0.039, + "num_input_tokens_seen": 95519072, + "step": 78480 + }, + { + "epoch": 8.740951108141218, + "grad_norm": 0.04675369709730148, + "learning_rate": 3.460400027991804e-05, + "loss": 0.0675, + "num_input_tokens_seen": 95525248, + "step": 78485 + }, + { + "epoch": 8.741507963024835, + "grad_norm": 0.11500484496355057, + "learning_rate": 3.4601756938980696e-05, + "loss": 0.0253, + "num_input_tokens_seen": 95531136, + "step": 78490 + }, + { + "epoch": 8.742064817908453, + "grad_norm": 0.6193937063217163, + "learning_rate": 3.4599513507347076e-05, + "loss": 0.1433, + "num_input_tokens_seen": 95536768, + "step": 78495 + }, + { + "epoch": 8.74262167279207, + "grad_norm": 1.359133005142212, + "learning_rate": 3.4597269985038355e-05, + "loss": 0.084, + "num_input_tokens_seen": 95542720, + "step": 78500 + }, + { + "epoch": 8.743178527675688, + "grad_norm": 0.0017945552244782448, + "learning_rate": 3.459502637207574e-05, + "loss": 0.0531, + "num_input_tokens_seen": 95548864, + "step": 78505 + }, + { + "epoch": 8.743735382559304, + "grad_norm": 1.3656193017959595, + "learning_rate": 3.45927826684804e-05, + "loss": 0.1397, + "num_input_tokens_seen": 95555520, + "step": 78510 + }, + { + "epoch": 8.744292237442922, + "grad_norm": 0.1368856579065323, + "learning_rate": 3.4590538874273545e-05, + "loss": 0.0167, + "num_input_tokens_seen": 95561600, + "step": 78515 + }, + { + "epoch": 8.74484909232654, + "grad_norm": 0.6869072318077087, + "learning_rate": 3.458829498947637e-05, + "loss": 0.014, + "num_input_tokens_seen": 95567264, + "step": 78520 + }, + { + "epoch": 8.745405947210157, + "grad_norm": 0.051376231014728546, + "learning_rate": 3.458605101411007e-05, + "loss": 0.0952, + "num_input_tokens_seen": 95573568, + "step": 78525 + }, + { + "epoch": 8.745962802093775, + "grad_norm": 0.0001663876319071278, + "learning_rate": 3.458380694819583e-05, + "loss": 0.0509, + "num_input_tokens_seen": 95580096, + "step": 78530 + }, + { + "epoch": 8.74651965697739, + "grad_norm": 0.18700966238975525, + "learning_rate": 3.4581562791754856e-05, + "loss": 0.0452, + "num_input_tokens_seen": 95586048, + "step": 78535 + }, + { + "epoch": 8.747076511861009, + "grad_norm": 0.4114854633808136, + "learning_rate": 3.4579318544808344e-05, + "loss": 0.0632, + "num_input_tokens_seen": 95592512, + "step": 78540 + }, + { + "epoch": 8.747633366744626, + "grad_norm": 0.22825047373771667, + "learning_rate": 3.4577074207377505e-05, + "loss": 0.0568, + "num_input_tokens_seen": 95598592, + "step": 78545 + }, + { + "epoch": 8.748190221628244, + "grad_norm": 0.27955055236816406, + "learning_rate": 3.457482977948352e-05, + "loss": 0.0499, + "num_input_tokens_seen": 95604672, + "step": 78550 + }, + { + "epoch": 8.748747076511862, + "grad_norm": 0.001211583032272756, + "learning_rate": 3.45725852611476e-05, + "loss": 0.0485, + "num_input_tokens_seen": 95610528, + "step": 78555 + }, + { + "epoch": 8.749303931395477, + "grad_norm": 0.24646393954753876, + "learning_rate": 3.457034065239093e-05, + "loss": 0.0918, + "num_input_tokens_seen": 95616352, + "step": 78560 + }, + { + "epoch": 8.749860786279095, + "grad_norm": 0.00018536824791226536, + "learning_rate": 3.4568095953234736e-05, + "loss": 0.0388, + "num_input_tokens_seen": 95622752, + "step": 78565 + }, + { + "epoch": 8.750417641162713, + "grad_norm": 0.030290499329566956, + "learning_rate": 3.456585116370021e-05, + "loss": 0.027, + "num_input_tokens_seen": 95628896, + "step": 78570 + }, + { + "epoch": 8.75097449604633, + "grad_norm": 0.041795071214437485, + "learning_rate": 3.4563606283808545e-05, + "loss": 0.0996, + "num_input_tokens_seen": 95634976, + "step": 78575 + }, + { + "epoch": 8.751531350929948, + "grad_norm": 0.08177479356527328, + "learning_rate": 3.456136131358097e-05, + "loss": 0.0561, + "num_input_tokens_seen": 95641312, + "step": 78580 + }, + { + "epoch": 8.752088205813564, + "grad_norm": 0.07233858853578568, + "learning_rate": 3.455911625303867e-05, + "loss": 0.08, + "num_input_tokens_seen": 95647168, + "step": 78585 + }, + { + "epoch": 8.752645060697182, + "grad_norm": 1.367709755897522, + "learning_rate": 3.455687110220286e-05, + "loss": 0.1284, + "num_input_tokens_seen": 95653312, + "step": 78590 + }, + { + "epoch": 8.7532019155808, + "grad_norm": 0.7471053600311279, + "learning_rate": 3.455462586109475e-05, + "loss": 0.1301, + "num_input_tokens_seen": 95658752, + "step": 78595 + }, + { + "epoch": 8.753758770464417, + "grad_norm": 0.004278900567442179, + "learning_rate": 3.4552380529735533e-05, + "loss": 0.0632, + "num_input_tokens_seen": 95665152, + "step": 78600 + }, + { + "epoch": 8.754315625348035, + "grad_norm": 0.6581600904464722, + "learning_rate": 3.455013510814644e-05, + "loss": 0.1428, + "num_input_tokens_seen": 95671392, + "step": 78605 + }, + { + "epoch": 8.754872480231652, + "grad_norm": 0.05924690514802933, + "learning_rate": 3.454788959634866e-05, + "loss": 0.0762, + "num_input_tokens_seen": 95677504, + "step": 78610 + }, + { + "epoch": 8.755429335115268, + "grad_norm": 0.26814454793930054, + "learning_rate": 3.454564399436342e-05, + "loss": 0.0628, + "num_input_tokens_seen": 95683584, + "step": 78615 + }, + { + "epoch": 8.755986189998886, + "grad_norm": 1.088287353515625, + "learning_rate": 3.454339830221192e-05, + "loss": 0.0606, + "num_input_tokens_seen": 95690112, + "step": 78620 + }, + { + "epoch": 8.756543044882504, + "grad_norm": 0.003097429173067212, + "learning_rate": 3.4541152519915385e-05, + "loss": 0.0211, + "num_input_tokens_seen": 95696192, + "step": 78625 + }, + { + "epoch": 8.757099899766121, + "grad_norm": 0.8075505495071411, + "learning_rate": 3.453890664749501e-05, + "loss": 0.0319, + "num_input_tokens_seen": 95702304, + "step": 78630 + }, + { + "epoch": 8.757656754649739, + "grad_norm": 0.003017788752913475, + "learning_rate": 3.453666068497202e-05, + "loss": 0.0192, + "num_input_tokens_seen": 95708160, + "step": 78635 + }, + { + "epoch": 8.758213609533355, + "grad_norm": 1.846999168395996, + "learning_rate": 3.453441463236764e-05, + "loss": 0.0614, + "num_input_tokens_seen": 95714176, + "step": 78640 + }, + { + "epoch": 8.758770464416973, + "grad_norm": 0.03185513615608215, + "learning_rate": 3.453216848970306e-05, + "loss": 0.033, + "num_input_tokens_seen": 95720288, + "step": 78645 + }, + { + "epoch": 8.75932731930059, + "grad_norm": 0.2100640833377838, + "learning_rate": 3.452992225699952e-05, + "loss": 0.0043, + "num_input_tokens_seen": 95726496, + "step": 78650 + }, + { + "epoch": 8.759884174184208, + "grad_norm": 2.46622896194458, + "learning_rate": 3.4527675934278225e-05, + "loss": 0.156, + "num_input_tokens_seen": 95732672, + "step": 78655 + }, + { + "epoch": 8.760441029067826, + "grad_norm": 0.06400418281555176, + "learning_rate": 3.45254295215604e-05, + "loss": 0.0057, + "num_input_tokens_seen": 95738816, + "step": 78660 + }, + { + "epoch": 8.760997883951442, + "grad_norm": 0.008378030732274055, + "learning_rate": 3.452318301886727e-05, + "loss": 0.0907, + "num_input_tokens_seen": 95744864, + "step": 78665 + }, + { + "epoch": 8.76155473883506, + "grad_norm": 0.11394771188497543, + "learning_rate": 3.4520936426220035e-05, + "loss": 0.0522, + "num_input_tokens_seen": 95751008, + "step": 78670 + }, + { + "epoch": 8.762111593718677, + "grad_norm": 0.3043488562107086, + "learning_rate": 3.4518689743639934e-05, + "loss": 0.0138, + "num_input_tokens_seen": 95757536, + "step": 78675 + }, + { + "epoch": 8.762668448602295, + "grad_norm": 0.5617272853851318, + "learning_rate": 3.451644297114818e-05, + "loss": 0.0654, + "num_input_tokens_seen": 95763712, + "step": 78680 + }, + { + "epoch": 8.763225303485912, + "grad_norm": 0.8874306082725525, + "learning_rate": 3.4514196108765994e-05, + "loss": 0.0822, + "num_input_tokens_seen": 95769600, + "step": 78685 + }, + { + "epoch": 8.763782158369528, + "grad_norm": 0.1388971358537674, + "learning_rate": 3.4511949156514604e-05, + "loss": 0.0091, + "num_input_tokens_seen": 95775712, + "step": 78690 + }, + { + "epoch": 8.764339013253146, + "grad_norm": 0.0034690757747739553, + "learning_rate": 3.450970211441523e-05, + "loss": 0.0451, + "num_input_tokens_seen": 95781856, + "step": 78695 + }, + { + "epoch": 8.764895868136763, + "grad_norm": 0.9186933040618896, + "learning_rate": 3.450745498248911e-05, + "loss": 0.1573, + "num_input_tokens_seen": 95788160, + "step": 78700 + }, + { + "epoch": 8.765452723020381, + "grad_norm": 0.6404287815093994, + "learning_rate": 3.450520776075746e-05, + "loss": 0.0702, + "num_input_tokens_seen": 95794240, + "step": 78705 + }, + { + "epoch": 8.766009577903999, + "grad_norm": 0.3782377243041992, + "learning_rate": 3.45029604492415e-05, + "loss": 0.0121, + "num_input_tokens_seen": 95800416, + "step": 78710 + }, + { + "epoch": 8.766566432787615, + "grad_norm": 0.6753807663917542, + "learning_rate": 3.450071304796247e-05, + "loss": 0.0621, + "num_input_tokens_seen": 95806464, + "step": 78715 + }, + { + "epoch": 8.767123287671232, + "grad_norm": 1.9589357376098633, + "learning_rate": 3.44984655569416e-05, + "loss": 0.1183, + "num_input_tokens_seen": 95812736, + "step": 78720 + }, + { + "epoch": 8.76768014255485, + "grad_norm": 0.018685435876250267, + "learning_rate": 3.4496217976200095e-05, + "loss": 0.0949, + "num_input_tokens_seen": 95818528, + "step": 78725 + }, + { + "epoch": 8.768236997438468, + "grad_norm": 0.8144859671592712, + "learning_rate": 3.449397030575921e-05, + "loss": 0.0247, + "num_input_tokens_seen": 95824896, + "step": 78730 + }, + { + "epoch": 8.768793852322085, + "grad_norm": 0.013320826925337315, + "learning_rate": 3.4491722545640174e-05, + "loss": 0.0486, + "num_input_tokens_seen": 95830944, + "step": 78735 + }, + { + "epoch": 8.769350707205703, + "grad_norm": 0.4133700430393219, + "learning_rate": 3.448947469586421e-05, + "loss": 0.0796, + "num_input_tokens_seen": 95836992, + "step": 78740 + }, + { + "epoch": 8.769907562089319, + "grad_norm": 0.10805230587720871, + "learning_rate": 3.448722675645256e-05, + "loss": 0.0213, + "num_input_tokens_seen": 95843296, + "step": 78745 + }, + { + "epoch": 8.770464416972937, + "grad_norm": 0.5406284332275391, + "learning_rate": 3.4484978727426446e-05, + "loss": 0.0771, + "num_input_tokens_seen": 95849344, + "step": 78750 + }, + { + "epoch": 8.771021271856554, + "grad_norm": 0.8624191880226135, + "learning_rate": 3.448273060880711e-05, + "loss": 0.0511, + "num_input_tokens_seen": 95855136, + "step": 78755 + }, + { + "epoch": 8.771578126740172, + "grad_norm": 0.21340255439281464, + "learning_rate": 3.4480482400615786e-05, + "loss": 0.0131, + "num_input_tokens_seen": 95861056, + "step": 78760 + }, + { + "epoch": 8.77213498162379, + "grad_norm": 0.042454227805137634, + "learning_rate": 3.4478234102873714e-05, + "loss": 0.0567, + "num_input_tokens_seen": 95867200, + "step": 78765 + }, + { + "epoch": 8.772691836507406, + "grad_norm": 0.02512185461819172, + "learning_rate": 3.447598571560213e-05, + "loss": 0.0287, + "num_input_tokens_seen": 95873312, + "step": 78770 + }, + { + "epoch": 8.773248691391023, + "grad_norm": 0.013332641683518887, + "learning_rate": 3.447373723882226e-05, + "loss": 0.0628, + "num_input_tokens_seen": 95879424, + "step": 78775 + }, + { + "epoch": 8.773805546274641, + "grad_norm": 0.3735682964324951, + "learning_rate": 3.447148867255535e-05, + "loss": 0.0454, + "num_input_tokens_seen": 95885792, + "step": 78780 + }, + { + "epoch": 8.774362401158259, + "grad_norm": 0.841620683670044, + "learning_rate": 3.4469240016822645e-05, + "loss": 0.0586, + "num_input_tokens_seen": 95891808, + "step": 78785 + }, + { + "epoch": 8.774919256041876, + "grad_norm": 0.01678713783621788, + "learning_rate": 3.446699127164538e-05, + "loss": 0.0163, + "num_input_tokens_seen": 95898176, + "step": 78790 + }, + { + "epoch": 8.775476110925492, + "grad_norm": 0.1709529012441635, + "learning_rate": 3.4464742437044807e-05, + "loss": 0.0305, + "num_input_tokens_seen": 95904160, + "step": 78795 + }, + { + "epoch": 8.77603296580911, + "grad_norm": 0.6718277335166931, + "learning_rate": 3.446249351304215e-05, + "loss": 0.0699, + "num_input_tokens_seen": 95910048, + "step": 78800 + }, + { + "epoch": 8.776589820692728, + "grad_norm": 0.004798348993062973, + "learning_rate": 3.4460244499658664e-05, + "loss": 0.0534, + "num_input_tokens_seen": 95916320, + "step": 78805 + }, + { + "epoch": 8.777146675576345, + "grad_norm": 0.3029223084449768, + "learning_rate": 3.445799539691558e-05, + "loss": 0.1287, + "num_input_tokens_seen": 95922400, + "step": 78810 + }, + { + "epoch": 8.777703530459963, + "grad_norm": 0.0026952035259455442, + "learning_rate": 3.4455746204834165e-05, + "loss": 0.0102, + "num_input_tokens_seen": 95928960, + "step": 78815 + }, + { + "epoch": 8.778260385343579, + "grad_norm": 0.0761660635471344, + "learning_rate": 3.4453496923435645e-05, + "loss": 0.0405, + "num_input_tokens_seen": 95935040, + "step": 78820 + }, + { + "epoch": 8.778817240227196, + "grad_norm": 0.24064701795578003, + "learning_rate": 3.4451247552741265e-05, + "loss": 0.0376, + "num_input_tokens_seen": 95940928, + "step": 78825 + }, + { + "epoch": 8.779374095110814, + "grad_norm": 0.34121304750442505, + "learning_rate": 3.4448998092772296e-05, + "loss": 0.0202, + "num_input_tokens_seen": 95946688, + "step": 78830 + }, + { + "epoch": 8.779930949994432, + "grad_norm": 0.04722687229514122, + "learning_rate": 3.444674854354996e-05, + "loss": 0.0242, + "num_input_tokens_seen": 95953312, + "step": 78835 + }, + { + "epoch": 8.78048780487805, + "grad_norm": 0.00037349716876633465, + "learning_rate": 3.444449890509552e-05, + "loss": 0.0127, + "num_input_tokens_seen": 95959424, + "step": 78840 + }, + { + "epoch": 8.781044659761665, + "grad_norm": 1.641055941581726, + "learning_rate": 3.444224917743022e-05, + "loss": 0.1312, + "num_input_tokens_seen": 95965664, + "step": 78845 + }, + { + "epoch": 8.781601514645283, + "grad_norm": 0.0011659306474030018, + "learning_rate": 3.4439999360575305e-05, + "loss": 0.0462, + "num_input_tokens_seen": 95971840, + "step": 78850 + }, + { + "epoch": 8.7821583695289, + "grad_norm": 0.026062598451972008, + "learning_rate": 3.4437749454552037e-05, + "loss": 0.0304, + "num_input_tokens_seen": 95977920, + "step": 78855 + }, + { + "epoch": 8.782715224412518, + "grad_norm": 0.008395979180932045, + "learning_rate": 3.443549945938167e-05, + "loss": 0.0123, + "num_input_tokens_seen": 95984096, + "step": 78860 + }, + { + "epoch": 8.783272079296136, + "grad_norm": 0.04030121490359306, + "learning_rate": 3.443324937508544e-05, + "loss": 0.0119, + "num_input_tokens_seen": 95990208, + "step": 78865 + }, + { + "epoch": 8.783828934179752, + "grad_norm": 0.0015198998153209686, + "learning_rate": 3.4430999201684616e-05, + "loss": 0.0184, + "num_input_tokens_seen": 95996448, + "step": 78870 + }, + { + "epoch": 8.78438578906337, + "grad_norm": 0.5752236247062683, + "learning_rate": 3.442874893920045e-05, + "loss": 0.0276, + "num_input_tokens_seen": 96002720, + "step": 78875 + }, + { + "epoch": 8.784942643946987, + "grad_norm": 0.8641034364700317, + "learning_rate": 3.44264985876542e-05, + "loss": 0.0485, + "num_input_tokens_seen": 96009280, + "step": 78880 + }, + { + "epoch": 8.785499498830605, + "grad_norm": 1.13966965675354, + "learning_rate": 3.4424248147067105e-05, + "loss": 0.0602, + "num_input_tokens_seen": 96015200, + "step": 78885 + }, + { + "epoch": 8.786056353714223, + "grad_norm": 0.052040573209524155, + "learning_rate": 3.442199761746045e-05, + "loss": 0.0921, + "num_input_tokens_seen": 96021472, + "step": 78890 + }, + { + "epoch": 8.786613208597839, + "grad_norm": 0.6466838121414185, + "learning_rate": 3.4419746998855476e-05, + "loss": 0.0192, + "num_input_tokens_seen": 96027680, + "step": 78895 + }, + { + "epoch": 8.787170063481456, + "grad_norm": 0.008754347451031208, + "learning_rate": 3.441749629127344e-05, + "loss": 0.054, + "num_input_tokens_seen": 96033952, + "step": 78900 + }, + { + "epoch": 8.787726918365074, + "grad_norm": 0.7184731960296631, + "learning_rate": 3.4415245494735605e-05, + "loss": 0.1252, + "num_input_tokens_seen": 96039840, + "step": 78905 + }, + { + "epoch": 8.788283773248692, + "grad_norm": 1.2517508268356323, + "learning_rate": 3.441299460926323e-05, + "loss": 0.0557, + "num_input_tokens_seen": 96045984, + "step": 78910 + }, + { + "epoch": 8.78884062813231, + "grad_norm": 0.01815417967736721, + "learning_rate": 3.44107436348776e-05, + "loss": 0.0749, + "num_input_tokens_seen": 96052128, + "step": 78915 + }, + { + "epoch": 8.789397483015925, + "grad_norm": 0.10222082585096359, + "learning_rate": 3.440849257159993e-05, + "loss": 0.0108, + "num_input_tokens_seen": 96058336, + "step": 78920 + }, + { + "epoch": 8.789954337899543, + "grad_norm": 1.4223518371582031, + "learning_rate": 3.440624141945153e-05, + "loss": 0.0509, + "num_input_tokens_seen": 96064608, + "step": 78925 + }, + { + "epoch": 8.79051119278316, + "grad_norm": 0.7041604518890381, + "learning_rate": 3.440399017845363e-05, + "loss": 0.0693, + "num_input_tokens_seen": 96070592, + "step": 78930 + }, + { + "epoch": 8.791068047666778, + "grad_norm": 1.346267819404602, + "learning_rate": 3.440173884862752e-05, + "loss": 0.0195, + "num_input_tokens_seen": 96076256, + "step": 78935 + }, + { + "epoch": 8.791624902550396, + "grad_norm": 0.11719012260437012, + "learning_rate": 3.4399487429994445e-05, + "loss": 0.0116, + "num_input_tokens_seen": 96082624, + "step": 78940 + }, + { + "epoch": 8.792181757434012, + "grad_norm": 0.019945289939641953, + "learning_rate": 3.4397235922575675e-05, + "loss": 0.0021, + "num_input_tokens_seen": 96089152, + "step": 78945 + }, + { + "epoch": 8.79273861231763, + "grad_norm": 0.03608965873718262, + "learning_rate": 3.43949843263925e-05, + "loss": 0.1068, + "num_input_tokens_seen": 96094944, + "step": 78950 + }, + { + "epoch": 8.793295467201247, + "grad_norm": 1.762162446975708, + "learning_rate": 3.4392732641466156e-05, + "loss": 0.0966, + "num_input_tokens_seen": 96100960, + "step": 78955 + }, + { + "epoch": 8.793852322084865, + "grad_norm": 0.19413577020168304, + "learning_rate": 3.439048086781794e-05, + "loss": 0.0064, + "num_input_tokens_seen": 96106848, + "step": 78960 + }, + { + "epoch": 8.794409176968482, + "grad_norm": 0.8192326426506042, + "learning_rate": 3.43882290054691e-05, + "loss": 0.0694, + "num_input_tokens_seen": 96112864, + "step": 78965 + }, + { + "epoch": 8.7949660318521, + "grad_norm": 0.011557959020137787, + "learning_rate": 3.438597705444091e-05, + "loss": 0.0173, + "num_input_tokens_seen": 96119328, + "step": 78970 + }, + { + "epoch": 8.795522886735716, + "grad_norm": 0.026422694325447083, + "learning_rate": 3.438372501475466e-05, + "loss": 0.0103, + "num_input_tokens_seen": 96125280, + "step": 78975 + }, + { + "epoch": 8.796079741619334, + "grad_norm": 0.36682558059692383, + "learning_rate": 3.43814728864316e-05, + "loss": 0.0334, + "num_input_tokens_seen": 96130752, + "step": 78980 + }, + { + "epoch": 8.796636596502951, + "grad_norm": 0.01752442680299282, + "learning_rate": 3.437922066949302e-05, + "loss": 0.0816, + "num_input_tokens_seen": 96136832, + "step": 78985 + }, + { + "epoch": 8.797193451386569, + "grad_norm": 0.04804392531514168, + "learning_rate": 3.4376968363960176e-05, + "loss": 0.0166, + "num_input_tokens_seen": 96142976, + "step": 78990 + }, + { + "epoch": 8.797750306270187, + "grad_norm": 0.019010715186595917, + "learning_rate": 3.437471596985437e-05, + "loss": 0.0392, + "num_input_tokens_seen": 96148960, + "step": 78995 + }, + { + "epoch": 8.798307161153803, + "grad_norm": 9.960890020010993e-05, + "learning_rate": 3.437246348719684e-05, + "loss": 0.0267, + "num_input_tokens_seen": 96155200, + "step": 79000 + }, + { + "epoch": 8.79886401603742, + "grad_norm": 1.3046188354492188, + "learning_rate": 3.4370210916008886e-05, + "loss": 0.1048, + "num_input_tokens_seen": 96161472, + "step": 79005 + }, + { + "epoch": 8.799420870921038, + "grad_norm": 0.09062346816062927, + "learning_rate": 3.4367958256311796e-05, + "loss": 0.1741, + "num_input_tokens_seen": 96167744, + "step": 79010 + }, + { + "epoch": 8.799977725804656, + "grad_norm": 0.18270039558410645, + "learning_rate": 3.436570550812683e-05, + "loss": 0.0304, + "num_input_tokens_seen": 96173984, + "step": 79015 + }, + { + "epoch": 8.800534580688273, + "grad_norm": 1.1214255094528198, + "learning_rate": 3.436345267147527e-05, + "loss": 0.0278, + "num_input_tokens_seen": 96180224, + "step": 79020 + }, + { + "epoch": 8.80109143557189, + "grad_norm": 0.43633779883384705, + "learning_rate": 3.436119974637839e-05, + "loss": 0.1491, + "num_input_tokens_seen": 96185792, + "step": 79025 + }, + { + "epoch": 8.801648290455507, + "grad_norm": 0.31831568479537964, + "learning_rate": 3.435894673285749e-05, + "loss": 0.0215, + "num_input_tokens_seen": 96191520, + "step": 79030 + }, + { + "epoch": 8.802205145339125, + "grad_norm": 0.3117334544658661, + "learning_rate": 3.435669363093383e-05, + "loss": 0.0069, + "num_input_tokens_seen": 96197760, + "step": 79035 + }, + { + "epoch": 8.802762000222742, + "grad_norm": 0.1788233369588852, + "learning_rate": 3.435444044062871e-05, + "loss": 0.0674, + "num_input_tokens_seen": 96203936, + "step": 79040 + }, + { + "epoch": 8.80331885510636, + "grad_norm": 0.047087281942367554, + "learning_rate": 3.43521871619634e-05, + "loss": 0.0212, + "num_input_tokens_seen": 96210272, + "step": 79045 + }, + { + "epoch": 8.803875709989976, + "grad_norm": 0.18238644301891327, + "learning_rate": 3.4349933794959196e-05, + "loss": 0.0455, + "num_input_tokens_seen": 96216096, + "step": 79050 + }, + { + "epoch": 8.804432564873593, + "grad_norm": 1.1505094766616821, + "learning_rate": 3.434768033963738e-05, + "loss": 0.0691, + "num_input_tokens_seen": 96222304, + "step": 79055 + }, + { + "epoch": 8.804989419757211, + "grad_norm": 0.09242741018533707, + "learning_rate": 3.434542679601922e-05, + "loss": 0.0213, + "num_input_tokens_seen": 96228608, + "step": 79060 + }, + { + "epoch": 8.805546274640829, + "grad_norm": 0.00020148858311586082, + "learning_rate": 3.434317316412602e-05, + "loss": 0.0549, + "num_input_tokens_seen": 96234016, + "step": 79065 + }, + { + "epoch": 8.806103129524447, + "grad_norm": 0.03574991226196289, + "learning_rate": 3.4340919443979076e-05, + "loss": 0.0015, + "num_input_tokens_seen": 96240608, + "step": 79070 + }, + { + "epoch": 8.806659984408064, + "grad_norm": 0.2717433273792267, + "learning_rate": 3.433866563559965e-05, + "loss": 0.0133, + "num_input_tokens_seen": 96246944, + "step": 79075 + }, + { + "epoch": 8.80721683929168, + "grad_norm": 0.027100130915641785, + "learning_rate": 3.4336411739009056e-05, + "loss": 0.0155, + "num_input_tokens_seen": 96252800, + "step": 79080 + }, + { + "epoch": 8.807773694175298, + "grad_norm": 0.052027627825737, + "learning_rate": 3.4334157754228565e-05, + "loss": 0.0063, + "num_input_tokens_seen": 96258976, + "step": 79085 + }, + { + "epoch": 8.808330549058915, + "grad_norm": 0.5936998724937439, + "learning_rate": 3.433190368127947e-05, + "loss": 0.0363, + "num_input_tokens_seen": 96265184, + "step": 79090 + }, + { + "epoch": 8.808887403942533, + "grad_norm": 0.029918719083070755, + "learning_rate": 3.4329649520183084e-05, + "loss": 0.0448, + "num_input_tokens_seen": 96271296, + "step": 79095 + }, + { + "epoch": 8.80944425882615, + "grad_norm": 0.06672251224517822, + "learning_rate": 3.432739527096067e-05, + "loss": 0.0765, + "num_input_tokens_seen": 96277440, + "step": 79100 + }, + { + "epoch": 8.810001113709767, + "grad_norm": 1.7173856496810913, + "learning_rate": 3.4325140933633545e-05, + "loss": 0.0864, + "num_input_tokens_seen": 96283168, + "step": 79105 + }, + { + "epoch": 8.810557968593384, + "grad_norm": 7.519648352172226e-05, + "learning_rate": 3.4322886508222985e-05, + "loss": 0.0704, + "num_input_tokens_seen": 96289216, + "step": 79110 + }, + { + "epoch": 8.811114823477002, + "grad_norm": 0.0008792033768258989, + "learning_rate": 3.43206319947503e-05, + "loss": 0.089, + "num_input_tokens_seen": 96295104, + "step": 79115 + }, + { + "epoch": 8.81167167836062, + "grad_norm": 0.0012443126179277897, + "learning_rate": 3.4318377393236764e-05, + "loss": 0.0444, + "num_input_tokens_seen": 96301216, + "step": 79120 + }, + { + "epoch": 8.812228533244237, + "grad_norm": 0.017006289213895798, + "learning_rate": 3.43161227037037e-05, + "loss": 0.0147, + "num_input_tokens_seen": 96307360, + "step": 79125 + }, + { + "epoch": 8.812785388127853, + "grad_norm": 0.8011855483055115, + "learning_rate": 3.4313867926172394e-05, + "loss": 0.0112, + "num_input_tokens_seen": 96313728, + "step": 79130 + }, + { + "epoch": 8.813342243011471, + "grad_norm": 0.33827072381973267, + "learning_rate": 3.431161306066414e-05, + "loss": 0.0217, + "num_input_tokens_seen": 96319968, + "step": 79135 + }, + { + "epoch": 8.813899097895089, + "grad_norm": 0.08167047798633575, + "learning_rate": 3.430935810720024e-05, + "loss": 0.0565, + "num_input_tokens_seen": 96325568, + "step": 79140 + }, + { + "epoch": 8.814455952778706, + "grad_norm": 0.8975026607513428, + "learning_rate": 3.4307103065801996e-05, + "loss": 0.1041, + "num_input_tokens_seen": 96331616, + "step": 79145 + }, + { + "epoch": 8.815012807662324, + "grad_norm": 0.0021082661114633083, + "learning_rate": 3.4304847936490705e-05, + "loss": 0.11, + "num_input_tokens_seen": 96337824, + "step": 79150 + }, + { + "epoch": 8.81556966254594, + "grad_norm": 0.24020631611347198, + "learning_rate": 3.4302592719287664e-05, + "loss": 0.0699, + "num_input_tokens_seen": 96343552, + "step": 79155 + }, + { + "epoch": 8.816126517429558, + "grad_norm": 0.08236725628376007, + "learning_rate": 3.430033741421419e-05, + "loss": 0.1024, + "num_input_tokens_seen": 96349952, + "step": 79160 + }, + { + "epoch": 8.816683372313175, + "grad_norm": 0.04442427679896355, + "learning_rate": 3.4298082021291576e-05, + "loss": 0.0055, + "num_input_tokens_seen": 96356192, + "step": 79165 + }, + { + "epoch": 8.817240227196793, + "grad_norm": 1.3800233602523804, + "learning_rate": 3.4295826540541125e-05, + "loss": 0.0911, + "num_input_tokens_seen": 96362496, + "step": 79170 + }, + { + "epoch": 8.81779708208041, + "grad_norm": 0.8219786286354065, + "learning_rate": 3.429357097198415e-05, + "loss": 0.0489, + "num_input_tokens_seen": 96368480, + "step": 79175 + }, + { + "epoch": 8.818353936964026, + "grad_norm": 0.021214313805103302, + "learning_rate": 3.4291315315641945e-05, + "loss": 0.011, + "num_input_tokens_seen": 96374656, + "step": 79180 + }, + { + "epoch": 8.818910791847644, + "grad_norm": 9.131139813689515e-05, + "learning_rate": 3.428905957153583e-05, + "loss": 0.0328, + "num_input_tokens_seen": 96380896, + "step": 79185 + }, + { + "epoch": 8.819467646731262, + "grad_norm": 1.5121240615844727, + "learning_rate": 3.42868037396871e-05, + "loss": 0.0702, + "num_input_tokens_seen": 96386560, + "step": 79190 + }, + { + "epoch": 8.82002450161488, + "grad_norm": 0.045626651495695114, + "learning_rate": 3.428454782011707e-05, + "loss": 0.0051, + "num_input_tokens_seen": 96392864, + "step": 79195 + }, + { + "epoch": 8.820581356498497, + "grad_norm": 1.0349607467651367, + "learning_rate": 3.428229181284705e-05, + "loss": 0.047, + "num_input_tokens_seen": 96398944, + "step": 79200 + }, + { + "epoch": 8.821138211382113, + "grad_norm": 2.8329174518585205, + "learning_rate": 3.428003571789834e-05, + "loss": 0.0662, + "num_input_tokens_seen": 96404608, + "step": 79205 + }, + { + "epoch": 8.82169506626573, + "grad_norm": 1.5163954496383667, + "learning_rate": 3.4277779535292264e-05, + "loss": 0.0803, + "num_input_tokens_seen": 96410528, + "step": 79210 + }, + { + "epoch": 8.822251921149348, + "grad_norm": 0.3434875011444092, + "learning_rate": 3.427552326505012e-05, + "loss": 0.1132, + "num_input_tokens_seen": 96416480, + "step": 79215 + }, + { + "epoch": 8.822808776032966, + "grad_norm": 0.4309166669845581, + "learning_rate": 3.4273266907193235e-05, + "loss": 0.0837, + "num_input_tokens_seen": 96422592, + "step": 79220 + }, + { + "epoch": 8.823365630916584, + "grad_norm": 1.081395149230957, + "learning_rate": 3.4271010461742906e-05, + "loss": 0.0522, + "num_input_tokens_seen": 96428800, + "step": 79225 + }, + { + "epoch": 8.8239224858002, + "grad_norm": 0.015051547437906265, + "learning_rate": 3.4268753928720464e-05, + "loss": 0.0282, + "num_input_tokens_seen": 96434816, + "step": 79230 + }, + { + "epoch": 8.824479340683817, + "grad_norm": 0.00402638828381896, + "learning_rate": 3.426649730814721e-05, + "loss": 0.1487, + "num_input_tokens_seen": 96441120, + "step": 79235 + }, + { + "epoch": 8.825036195567435, + "grad_norm": 1.0993808507919312, + "learning_rate": 3.426424060004447e-05, + "loss": 0.0558, + "num_input_tokens_seen": 96447136, + "step": 79240 + }, + { + "epoch": 8.825593050451053, + "grad_norm": 0.3170419931411743, + "learning_rate": 3.426198380443355e-05, + "loss": 0.1228, + "num_input_tokens_seen": 96452864, + "step": 79245 + }, + { + "epoch": 8.82614990533467, + "grad_norm": 0.0005599340074695647, + "learning_rate": 3.425972692133578e-05, + "loss": 0.0207, + "num_input_tokens_seen": 96459136, + "step": 79250 + }, + { + "epoch": 8.826706760218286, + "grad_norm": 0.0015338012017309666, + "learning_rate": 3.425746995077246e-05, + "loss": 0.0336, + "num_input_tokens_seen": 96465056, + "step": 79255 + }, + { + "epoch": 8.827263615101904, + "grad_norm": 0.19979508221149445, + "learning_rate": 3.425521289276492e-05, + "loss": 0.0425, + "num_input_tokens_seen": 96471168, + "step": 79260 + }, + { + "epoch": 8.827820469985522, + "grad_norm": 0.07685094326734543, + "learning_rate": 3.425295574733449e-05, + "loss": 0.013, + "num_input_tokens_seen": 96477248, + "step": 79265 + }, + { + "epoch": 8.82837732486914, + "grad_norm": 0.004609332419931889, + "learning_rate": 3.425069851450247e-05, + "loss": 0.046, + "num_input_tokens_seen": 96483168, + "step": 79270 + }, + { + "epoch": 8.828934179752757, + "grad_norm": 0.011521950364112854, + "learning_rate": 3.4248441194290196e-05, + "loss": 0.0353, + "num_input_tokens_seen": 96489088, + "step": 79275 + }, + { + "epoch": 8.829491034636373, + "grad_norm": 0.33747971057891846, + "learning_rate": 3.4246183786718975e-05, + "loss": 0.0146, + "num_input_tokens_seen": 96495360, + "step": 79280 + }, + { + "epoch": 8.83004788951999, + "grad_norm": 0.04583540931344032, + "learning_rate": 3.424392629181015e-05, + "loss": 0.0564, + "num_input_tokens_seen": 96501216, + "step": 79285 + }, + { + "epoch": 8.830604744403608, + "grad_norm": 0.13895085453987122, + "learning_rate": 3.424166870958503e-05, + "loss": 0.0103, + "num_input_tokens_seen": 96507296, + "step": 79290 + }, + { + "epoch": 8.831161599287226, + "grad_norm": 0.03613976016640663, + "learning_rate": 3.4239411040064956e-05, + "loss": 0.0105, + "num_input_tokens_seen": 96513376, + "step": 79295 + }, + { + "epoch": 8.831718454170844, + "grad_norm": 1.895102858543396, + "learning_rate": 3.4237153283271226e-05, + "loss": 0.0254, + "num_input_tokens_seen": 96519296, + "step": 79300 + }, + { + "epoch": 8.832275309054461, + "grad_norm": 0.008396460674703121, + "learning_rate": 3.42348954392252e-05, + "loss": 0.0004, + "num_input_tokens_seen": 96525888, + "step": 79305 + }, + { + "epoch": 8.832832163938077, + "grad_norm": 3.0498266220092773, + "learning_rate": 3.423263750794817e-05, + "loss": 0.0687, + "num_input_tokens_seen": 96531968, + "step": 79310 + }, + { + "epoch": 8.833389018821695, + "grad_norm": 0.045947298407554626, + "learning_rate": 3.42303794894615e-05, + "loss": 0.0684, + "num_input_tokens_seen": 96538336, + "step": 79315 + }, + { + "epoch": 8.833945873705312, + "grad_norm": 0.11018973588943481, + "learning_rate": 3.4228121383786484e-05, + "loss": 0.0353, + "num_input_tokens_seen": 96544768, + "step": 79320 + }, + { + "epoch": 8.83450272858893, + "grad_norm": 0.7874758243560791, + "learning_rate": 3.4225863190944475e-05, + "loss": 0.0198, + "num_input_tokens_seen": 96550912, + "step": 79325 + }, + { + "epoch": 8.835059583472548, + "grad_norm": 0.1711115837097168, + "learning_rate": 3.4223604910956796e-05, + "loss": 0.0038, + "num_input_tokens_seen": 96557184, + "step": 79330 + }, + { + "epoch": 8.835616438356164, + "grad_norm": 0.6935531497001648, + "learning_rate": 3.422134654384478e-05, + "loss": 0.0447, + "num_input_tokens_seen": 96563072, + "step": 79335 + }, + { + "epoch": 8.836173293239781, + "grad_norm": 1.3464219570159912, + "learning_rate": 3.421908808962975e-05, + "loss": 0.1637, + "num_input_tokens_seen": 96569376, + "step": 79340 + }, + { + "epoch": 8.836730148123399, + "grad_norm": 0.5631504058837891, + "learning_rate": 3.421682954833306e-05, + "loss": 0.0055, + "num_input_tokens_seen": 96575552, + "step": 79345 + }, + { + "epoch": 8.837287003007017, + "grad_norm": 0.4375567138195038, + "learning_rate": 3.421457091997602e-05, + "loss": 0.1579, + "num_input_tokens_seen": 96581664, + "step": 79350 + }, + { + "epoch": 8.837843857890634, + "grad_norm": 1.1754637956619263, + "learning_rate": 3.421231220457998e-05, + "loss": 0.0925, + "num_input_tokens_seen": 96588032, + "step": 79355 + }, + { + "epoch": 8.83840071277425, + "grad_norm": 1.5282909870147705, + "learning_rate": 3.421005340216627e-05, + "loss": 0.0422, + "num_input_tokens_seen": 96594400, + "step": 79360 + }, + { + "epoch": 8.838957567657868, + "grad_norm": 0.21775689721107483, + "learning_rate": 3.4207794512756224e-05, + "loss": 0.0072, + "num_input_tokens_seen": 96600544, + "step": 79365 + }, + { + "epoch": 8.839514422541486, + "grad_norm": 0.0016700060805305839, + "learning_rate": 3.4205535536371185e-05, + "loss": 0.0246, + "num_input_tokens_seen": 96606656, + "step": 79370 + }, + { + "epoch": 8.840071277425103, + "grad_norm": 0.07268306612968445, + "learning_rate": 3.420327647303249e-05, + "loss": 0.0103, + "num_input_tokens_seen": 96612544, + "step": 79375 + }, + { + "epoch": 8.840628132308721, + "grad_norm": 0.017806628718972206, + "learning_rate": 3.420101732276147e-05, + "loss": 0.0915, + "num_input_tokens_seen": 96618080, + "step": 79380 + }, + { + "epoch": 8.841184987192337, + "grad_norm": 1.0145219564437866, + "learning_rate": 3.4198758085579466e-05, + "loss": 0.0242, + "num_input_tokens_seen": 96624192, + "step": 79385 + }, + { + "epoch": 8.841741842075955, + "grad_norm": 0.07152466475963593, + "learning_rate": 3.419649876150783e-05, + "loss": 0.0939, + "num_input_tokens_seen": 96630304, + "step": 79390 + }, + { + "epoch": 8.842298696959572, + "grad_norm": 1.1064457893371582, + "learning_rate": 3.4194239350567894e-05, + "loss": 0.0483, + "num_input_tokens_seen": 96636096, + "step": 79395 + }, + { + "epoch": 8.84285555184319, + "grad_norm": 0.0024466775357723236, + "learning_rate": 3.4191979852781e-05, + "loss": 0.0021, + "num_input_tokens_seen": 96642240, + "step": 79400 + }, + { + "epoch": 8.843412406726808, + "grad_norm": 0.12092115730047226, + "learning_rate": 3.418972026816849e-05, + "loss": 0.1454, + "num_input_tokens_seen": 96648608, + "step": 79405 + }, + { + "epoch": 8.843969261610424, + "grad_norm": 0.08288076519966125, + "learning_rate": 3.41874605967517e-05, + "loss": 0.0851, + "num_input_tokens_seen": 96654560, + "step": 79410 + }, + { + "epoch": 8.844526116494041, + "grad_norm": 0.04146500304341316, + "learning_rate": 3.4185200838552e-05, + "loss": 0.0081, + "num_input_tokens_seen": 96661152, + "step": 79415 + }, + { + "epoch": 8.845082971377659, + "grad_norm": 1.082874059677124, + "learning_rate": 3.418294099359071e-05, + "loss": 0.1224, + "num_input_tokens_seen": 96667232, + "step": 79420 + }, + { + "epoch": 8.845639826261277, + "grad_norm": 0.6259256601333618, + "learning_rate": 3.418068106188919e-05, + "loss": 0.1185, + "num_input_tokens_seen": 96673632, + "step": 79425 + }, + { + "epoch": 8.846196681144894, + "grad_norm": 0.0016844479832798243, + "learning_rate": 3.417842104346878e-05, + "loss": 0.0148, + "num_input_tokens_seen": 96679328, + "step": 79430 + }, + { + "epoch": 8.846753536028512, + "grad_norm": 0.5695328116416931, + "learning_rate": 3.4176160938350835e-05, + "loss": 0.054, + "num_input_tokens_seen": 96685536, + "step": 79435 + }, + { + "epoch": 8.847310390912128, + "grad_norm": 0.008341262117028236, + "learning_rate": 3.417390074655669e-05, + "loss": 0.025, + "num_input_tokens_seen": 96691328, + "step": 79440 + }, + { + "epoch": 8.847867245795745, + "grad_norm": 1.4074901342391968, + "learning_rate": 3.41716404681077e-05, + "loss": 0.0426, + "num_input_tokens_seen": 96697472, + "step": 79445 + }, + { + "epoch": 8.848424100679363, + "grad_norm": 0.19629980623722076, + "learning_rate": 3.416938010302523e-05, + "loss": 0.0451, + "num_input_tokens_seen": 96703776, + "step": 79450 + }, + { + "epoch": 8.84898095556298, + "grad_norm": 0.1185280978679657, + "learning_rate": 3.416711965133061e-05, + "loss": 0.1142, + "num_input_tokens_seen": 96709408, + "step": 79455 + }, + { + "epoch": 8.849537810446598, + "grad_norm": 0.00047905996325425804, + "learning_rate": 3.41648591130452e-05, + "loss": 0.0373, + "num_input_tokens_seen": 96715392, + "step": 79460 + }, + { + "epoch": 8.850094665330214, + "grad_norm": 0.027597833424806595, + "learning_rate": 3.416259848819036e-05, + "loss": 0.0129, + "num_input_tokens_seen": 96721440, + "step": 79465 + }, + { + "epoch": 8.850651520213832, + "grad_norm": 0.49566301703453064, + "learning_rate": 3.416033777678742e-05, + "loss": 0.0254, + "num_input_tokens_seen": 96727872, + "step": 79470 + }, + { + "epoch": 8.85120837509745, + "grad_norm": 0.1522243618965149, + "learning_rate": 3.4158076978857764e-05, + "loss": 0.0532, + "num_input_tokens_seen": 96734336, + "step": 79475 + }, + { + "epoch": 8.851765229981067, + "grad_norm": 0.283380389213562, + "learning_rate": 3.415581609442273e-05, + "loss": 0.113, + "num_input_tokens_seen": 96740352, + "step": 79480 + }, + { + "epoch": 8.852322084864685, + "grad_norm": 0.012214653193950653, + "learning_rate": 3.415355512350368e-05, + "loss": 0.089, + "num_input_tokens_seen": 96746624, + "step": 79485 + }, + { + "epoch": 8.852878939748301, + "grad_norm": 0.09161774069070816, + "learning_rate": 3.415129406612197e-05, + "loss": 0.0621, + "num_input_tokens_seen": 96752768, + "step": 79490 + }, + { + "epoch": 8.853435794631919, + "grad_norm": 0.06087273731827736, + "learning_rate": 3.414903292229895e-05, + "loss": 0.0171, + "num_input_tokens_seen": 96758816, + "step": 79495 + }, + { + "epoch": 8.853992649515536, + "grad_norm": 0.09970629215240479, + "learning_rate": 3.414677169205599e-05, + "loss": 0.0487, + "num_input_tokens_seen": 96764736, + "step": 79500 + }, + { + "epoch": 8.854549504399154, + "grad_norm": 0.00436524348333478, + "learning_rate": 3.4144510375414434e-05, + "loss": 0.0894, + "num_input_tokens_seen": 96770432, + "step": 79505 + }, + { + "epoch": 8.855106359282772, + "grad_norm": 1.46627938747406, + "learning_rate": 3.4142248972395664e-05, + "loss": 0.1156, + "num_input_tokens_seen": 96775840, + "step": 79510 + }, + { + "epoch": 8.855663214166388, + "grad_norm": 0.3138662874698639, + "learning_rate": 3.413998748302101e-05, + "loss": 0.0168, + "num_input_tokens_seen": 96782016, + "step": 79515 + }, + { + "epoch": 8.856220069050005, + "grad_norm": 0.19897611439228058, + "learning_rate": 3.413772590731187e-05, + "loss": 0.0293, + "num_input_tokens_seen": 96788064, + "step": 79520 + }, + { + "epoch": 8.856776923933623, + "grad_norm": 0.6818826794624329, + "learning_rate": 3.413546424528958e-05, + "loss": 0.1174, + "num_input_tokens_seen": 96793920, + "step": 79525 + }, + { + "epoch": 8.85733377881724, + "grad_norm": 0.5282993912696838, + "learning_rate": 3.413320249697551e-05, + "loss": 0.0523, + "num_input_tokens_seen": 96800192, + "step": 79530 + }, + { + "epoch": 8.857890633700858, + "grad_norm": 0.00041579781100153923, + "learning_rate": 3.413094066239102e-05, + "loss": 0.0782, + "num_input_tokens_seen": 96806144, + "step": 79535 + }, + { + "epoch": 8.858447488584474, + "grad_norm": 0.00617108354344964, + "learning_rate": 3.412867874155749e-05, + "loss": 0.1232, + "num_input_tokens_seen": 96812160, + "step": 79540 + }, + { + "epoch": 8.859004343468092, + "grad_norm": 0.14042967557907104, + "learning_rate": 3.412641673449627e-05, + "loss": 0.0497, + "num_input_tokens_seen": 96818688, + "step": 79545 + }, + { + "epoch": 8.85956119835171, + "grad_norm": 0.0064596147276461124, + "learning_rate": 3.412415464122873e-05, + "loss": 0.0265, + "num_input_tokens_seen": 96824800, + "step": 79550 + }, + { + "epoch": 8.860118053235327, + "grad_norm": 1.1624305248260498, + "learning_rate": 3.412189246177625e-05, + "loss": 0.1277, + "num_input_tokens_seen": 96831104, + "step": 79555 + }, + { + "epoch": 8.860674908118945, + "grad_norm": 0.006856407504528761, + "learning_rate": 3.411963019616017e-05, + "loss": 0.1111, + "num_input_tokens_seen": 96837120, + "step": 79560 + }, + { + "epoch": 8.86123176300256, + "grad_norm": 0.15269571542739868, + "learning_rate": 3.411736784440189e-05, + "loss": 0.0543, + "num_input_tokens_seen": 96843392, + "step": 79565 + }, + { + "epoch": 8.861788617886178, + "grad_norm": 0.007987374439835548, + "learning_rate": 3.4115105406522765e-05, + "loss": 0.0524, + "num_input_tokens_seen": 96849472, + "step": 79570 + }, + { + "epoch": 8.862345472769796, + "grad_norm": 0.4866718351840973, + "learning_rate": 3.411284288254416e-05, + "loss": 0.0196, + "num_input_tokens_seen": 96855456, + "step": 79575 + }, + { + "epoch": 8.862902327653414, + "grad_norm": 0.0022552390582859516, + "learning_rate": 3.411058027248746e-05, + "loss": 0.0693, + "num_input_tokens_seen": 96861376, + "step": 79580 + }, + { + "epoch": 8.863459182537031, + "grad_norm": 0.0037505975924432278, + "learning_rate": 3.410831757637402e-05, + "loss": 0.1536, + "num_input_tokens_seen": 96867520, + "step": 79585 + }, + { + "epoch": 8.864016037420647, + "grad_norm": 0.011762588284909725, + "learning_rate": 3.410605479422523e-05, + "loss": 0.0027, + "num_input_tokens_seen": 96873760, + "step": 79590 + }, + { + "epoch": 8.864572892304265, + "grad_norm": 0.6165902018547058, + "learning_rate": 3.4103791926062455e-05, + "loss": 0.0267, + "num_input_tokens_seen": 96879968, + "step": 79595 + }, + { + "epoch": 8.865129747187883, + "grad_norm": 0.052021026611328125, + "learning_rate": 3.410152897190707e-05, + "loss": 0.1177, + "num_input_tokens_seen": 96886048, + "step": 79600 + }, + { + "epoch": 8.8656866020715, + "grad_norm": 1.2715868949890137, + "learning_rate": 3.4099265931780455e-05, + "loss": 0.0497, + "num_input_tokens_seen": 96892512, + "step": 79605 + }, + { + "epoch": 8.866243456955118, + "grad_norm": 0.3496716320514679, + "learning_rate": 3.409700280570398e-05, + "loss": 0.0241, + "num_input_tokens_seen": 96898720, + "step": 79610 + }, + { + "epoch": 8.866800311838734, + "grad_norm": 1.7242064476013184, + "learning_rate": 3.409473959369903e-05, + "loss": 0.0873, + "num_input_tokens_seen": 96904832, + "step": 79615 + }, + { + "epoch": 8.867357166722352, + "grad_norm": 0.21156176924705505, + "learning_rate": 3.409247629578698e-05, + "loss": 0.0833, + "num_input_tokens_seen": 96911008, + "step": 79620 + }, + { + "epoch": 8.86791402160597, + "grad_norm": 0.014291508123278618, + "learning_rate": 3.40902129119892e-05, + "loss": 0.0762, + "num_input_tokens_seen": 96916928, + "step": 79625 + }, + { + "epoch": 8.868470876489587, + "grad_norm": 0.012417481280863285, + "learning_rate": 3.408794944232708e-05, + "loss": 0.0042, + "num_input_tokens_seen": 96923008, + "step": 79630 + }, + { + "epoch": 8.869027731373205, + "grad_norm": 0.24491052329540253, + "learning_rate": 3.408568588682199e-05, + "loss": 0.0517, + "num_input_tokens_seen": 96928672, + "step": 79635 + }, + { + "epoch": 8.86958458625682, + "grad_norm": 0.0063833389431238174, + "learning_rate": 3.408342224549532e-05, + "loss": 0.031, + "num_input_tokens_seen": 96934976, + "step": 79640 + }, + { + "epoch": 8.870141441140438, + "grad_norm": 0.018975522369146347, + "learning_rate": 3.408115851836845e-05, + "loss": 0.0165, + "num_input_tokens_seen": 96940800, + "step": 79645 + }, + { + "epoch": 8.870698296024056, + "grad_norm": 1.8146275281906128, + "learning_rate": 3.4078894705462774e-05, + "loss": 0.1158, + "num_input_tokens_seen": 96947072, + "step": 79650 + }, + { + "epoch": 8.871255150907674, + "grad_norm": 0.8592698574066162, + "learning_rate": 3.407663080679965e-05, + "loss": 0.0964, + "num_input_tokens_seen": 96953600, + "step": 79655 + }, + { + "epoch": 8.871812005791291, + "grad_norm": 0.40065908432006836, + "learning_rate": 3.407436682240048e-05, + "loss": 0.0224, + "num_input_tokens_seen": 96959808, + "step": 79660 + }, + { + "epoch": 8.872368860674909, + "grad_norm": 0.0045985388569533825, + "learning_rate": 3.407210275228664e-05, + "loss": 0.0118, + "num_input_tokens_seen": 96966272, + "step": 79665 + }, + { + "epoch": 8.872925715558525, + "grad_norm": 0.049314357340335846, + "learning_rate": 3.406983859647953e-05, + "loss": 0.0058, + "num_input_tokens_seen": 96972576, + "step": 79670 + }, + { + "epoch": 8.873482570442143, + "grad_norm": 0.12842917442321777, + "learning_rate": 3.406757435500053e-05, + "loss": 0.0435, + "num_input_tokens_seen": 96979072, + "step": 79675 + }, + { + "epoch": 8.87403942532576, + "grad_norm": 0.2773802578449249, + "learning_rate": 3.406531002787101e-05, + "loss": 0.0109, + "num_input_tokens_seen": 96985408, + "step": 79680 + }, + { + "epoch": 8.874596280209378, + "grad_norm": 0.16316188871860504, + "learning_rate": 3.406304561511238e-05, + "loss": 0.0216, + "num_input_tokens_seen": 96991584, + "step": 79685 + }, + { + "epoch": 8.875153135092996, + "grad_norm": 0.4605129063129425, + "learning_rate": 3.406078111674603e-05, + "loss": 0.1558, + "num_input_tokens_seen": 96997056, + "step": 79690 + }, + { + "epoch": 8.875709989976611, + "grad_norm": 0.00592406652867794, + "learning_rate": 3.4058516532793336e-05, + "loss": 0.1369, + "num_input_tokens_seen": 97003168, + "step": 79695 + }, + { + "epoch": 8.876266844860229, + "grad_norm": 1.2785359621047974, + "learning_rate": 3.40562518632757e-05, + "loss": 0.111, + "num_input_tokens_seen": 97009376, + "step": 79700 + }, + { + "epoch": 8.876823699743847, + "grad_norm": 0.021112006157636642, + "learning_rate": 3.4053987108214504e-05, + "loss": 0.0157, + "num_input_tokens_seen": 97015168, + "step": 79705 + }, + { + "epoch": 8.877380554627464, + "grad_norm": 0.08291587233543396, + "learning_rate": 3.405172226763115e-05, + "loss": 0.0959, + "num_input_tokens_seen": 97021216, + "step": 79710 + }, + { + "epoch": 8.877937409511082, + "grad_norm": 0.0009710786398500204, + "learning_rate": 3.4049457341547024e-05, + "loss": 0.0254, + "num_input_tokens_seen": 97027520, + "step": 79715 + }, + { + "epoch": 8.878494264394698, + "grad_norm": 0.49452266097068787, + "learning_rate": 3.4047192329983524e-05, + "loss": 0.0481, + "num_input_tokens_seen": 97033632, + "step": 79720 + }, + { + "epoch": 8.879051119278316, + "grad_norm": 0.4474318027496338, + "learning_rate": 3.404492723296205e-05, + "loss": 0.0853, + "num_input_tokens_seen": 97039808, + "step": 79725 + }, + { + "epoch": 8.879607974161933, + "grad_norm": 0.00010281198774464428, + "learning_rate": 3.404266205050398e-05, + "loss": 0.0009, + "num_input_tokens_seen": 97046016, + "step": 79730 + }, + { + "epoch": 8.880164829045551, + "grad_norm": 0.4945926368236542, + "learning_rate": 3.404039678263074e-05, + "loss": 0.0644, + "num_input_tokens_seen": 97052032, + "step": 79735 + }, + { + "epoch": 8.880721683929169, + "grad_norm": 0.08182802051305771, + "learning_rate": 3.403813142936369e-05, + "loss": 0.0433, + "num_input_tokens_seen": 97057696, + "step": 79740 + }, + { + "epoch": 8.881278538812785, + "grad_norm": 0.5109949707984924, + "learning_rate": 3.4035865990724255e-05, + "loss": 0.0492, + "num_input_tokens_seen": 97063392, + "step": 79745 + }, + { + "epoch": 8.881835393696402, + "grad_norm": 0.17370465397834778, + "learning_rate": 3.403360046673383e-05, + "loss": 0.0338, + "num_input_tokens_seen": 97069408, + "step": 79750 + }, + { + "epoch": 8.88239224858002, + "grad_norm": 0.05471606180071831, + "learning_rate": 3.4031334857413804e-05, + "loss": 0.0532, + "num_input_tokens_seen": 97075584, + "step": 79755 + }, + { + "epoch": 8.882949103463638, + "grad_norm": 0.9758578538894653, + "learning_rate": 3.4029069162785595e-05, + "loss": 0.0702, + "num_input_tokens_seen": 97081504, + "step": 79760 + }, + { + "epoch": 8.883505958347255, + "grad_norm": 0.2504694163799286, + "learning_rate": 3.402680338287058e-05, + "loss": 0.0618, + "num_input_tokens_seen": 97087840, + "step": 79765 + }, + { + "epoch": 8.884062813230871, + "grad_norm": 0.02432652935385704, + "learning_rate": 3.402453751769019e-05, + "loss": 0.0075, + "num_input_tokens_seen": 97093792, + "step": 79770 + }, + { + "epoch": 8.884619668114489, + "grad_norm": 0.01255981158465147, + "learning_rate": 3.40222715672658e-05, + "loss": 0.0359, + "num_input_tokens_seen": 97099968, + "step": 79775 + }, + { + "epoch": 8.885176522998107, + "grad_norm": 1.4999791383743286, + "learning_rate": 3.402000553161883e-05, + "loss": 0.1258, + "num_input_tokens_seen": 97105888, + "step": 79780 + }, + { + "epoch": 8.885733377881724, + "grad_norm": 0.05791256204247475, + "learning_rate": 3.4017739410770686e-05, + "loss": 0.0204, + "num_input_tokens_seen": 97112128, + "step": 79785 + }, + { + "epoch": 8.886290232765342, + "grad_norm": 0.034669049084186554, + "learning_rate": 3.4015473204742764e-05, + "loss": 0.0321, + "num_input_tokens_seen": 97118464, + "step": 79790 + }, + { + "epoch": 8.88684708764896, + "grad_norm": 1.2379335165023804, + "learning_rate": 3.401320691355648e-05, + "loss": 0.0946, + "num_input_tokens_seen": 97124768, + "step": 79795 + }, + { + "epoch": 8.887403942532575, + "grad_norm": 0.8209512233734131, + "learning_rate": 3.401094053723323e-05, + "loss": 0.0687, + "num_input_tokens_seen": 97130784, + "step": 79800 + }, + { + "epoch": 8.887960797416193, + "grad_norm": 0.08383949100971222, + "learning_rate": 3.400867407579442e-05, + "loss": 0.1538, + "num_input_tokens_seen": 97137024, + "step": 79805 + }, + { + "epoch": 8.88851765229981, + "grad_norm": 0.03303822875022888, + "learning_rate": 3.4006407529261476e-05, + "loss": 0.0017, + "num_input_tokens_seen": 97143328, + "step": 79810 + }, + { + "epoch": 8.889074507183429, + "grad_norm": 0.00020348404359538108, + "learning_rate": 3.4004140897655795e-05, + "loss": 0.0511, + "num_input_tokens_seen": 97149600, + "step": 79815 + }, + { + "epoch": 8.889631362067046, + "grad_norm": 0.08308994770050049, + "learning_rate": 3.40018741809988e-05, + "loss": 0.0327, + "num_input_tokens_seen": 97156032, + "step": 79820 + }, + { + "epoch": 8.890188216950662, + "grad_norm": 0.19843614101409912, + "learning_rate": 3.399960737931187e-05, + "loss": 0.0119, + "num_input_tokens_seen": 97162208, + "step": 79825 + }, + { + "epoch": 8.89074507183428, + "grad_norm": 0.02731589414179325, + "learning_rate": 3.399734049261645e-05, + "loss": 0.0355, + "num_input_tokens_seen": 97168480, + "step": 79830 + }, + { + "epoch": 8.891301926717897, + "grad_norm": 0.997135579586029, + "learning_rate": 3.399507352093393e-05, + "loss": 0.0398, + "num_input_tokens_seen": 97174816, + "step": 79835 + }, + { + "epoch": 8.891858781601515, + "grad_norm": 0.048393312841653824, + "learning_rate": 3.399280646428575e-05, + "loss": 0.0779, + "num_input_tokens_seen": 97181024, + "step": 79840 + }, + { + "epoch": 8.892415636485133, + "grad_norm": 0.035878736525774, + "learning_rate": 3.39905393226933e-05, + "loss": 0.1345, + "num_input_tokens_seen": 97186944, + "step": 79845 + }, + { + "epoch": 8.892972491368749, + "grad_norm": 0.5338337421417236, + "learning_rate": 3.3988272096178e-05, + "loss": 0.0381, + "num_input_tokens_seen": 97192992, + "step": 79850 + }, + { + "epoch": 8.893529346252366, + "grad_norm": 1.035546064376831, + "learning_rate": 3.3986004784761274e-05, + "loss": 0.0903, + "num_input_tokens_seen": 97198400, + "step": 79855 + }, + { + "epoch": 8.894086201135984, + "grad_norm": 0.17814874649047852, + "learning_rate": 3.398373738846453e-05, + "loss": 0.0098, + "num_input_tokens_seen": 97204480, + "step": 79860 + }, + { + "epoch": 8.894643056019602, + "grad_norm": 0.7153460383415222, + "learning_rate": 3.3981469907309196e-05, + "loss": 0.0941, + "num_input_tokens_seen": 97210432, + "step": 79865 + }, + { + "epoch": 8.89519991090322, + "grad_norm": 0.5914092063903809, + "learning_rate": 3.3979202341316677e-05, + "loss": 0.0262, + "num_input_tokens_seen": 97216128, + "step": 79870 + }, + { + "epoch": 8.895756765786835, + "grad_norm": 0.07389947772026062, + "learning_rate": 3.39769346905084e-05, + "loss": 0.2226, + "num_input_tokens_seen": 97222272, + "step": 79875 + }, + { + "epoch": 8.896313620670453, + "grad_norm": 0.08224815875291824, + "learning_rate": 3.397466695490578e-05, + "loss": 0.0092, + "num_input_tokens_seen": 97228512, + "step": 79880 + }, + { + "epoch": 8.89687047555407, + "grad_norm": 0.08559759706258774, + "learning_rate": 3.3972399134530236e-05, + "loss": 0.01, + "num_input_tokens_seen": 97234464, + "step": 79885 + }, + { + "epoch": 8.897427330437688, + "grad_norm": 0.08966373652219772, + "learning_rate": 3.39701312294032e-05, + "loss": 0.0353, + "num_input_tokens_seen": 97240672, + "step": 79890 + }, + { + "epoch": 8.897984185321306, + "grad_norm": 0.0003528175875544548, + "learning_rate": 3.3967863239546084e-05, + "loss": 0.0085, + "num_input_tokens_seen": 97246304, + "step": 79895 + }, + { + "epoch": 8.898541040204922, + "grad_norm": 0.004625664558261633, + "learning_rate": 3.3965595164980326e-05, + "loss": 0.0595, + "num_input_tokens_seen": 97252448, + "step": 79900 + }, + { + "epoch": 8.89909789508854, + "grad_norm": 1.9869611263275146, + "learning_rate": 3.3963327005727326e-05, + "loss": 0.1359, + "num_input_tokens_seen": 97258400, + "step": 79905 + }, + { + "epoch": 8.899654749972157, + "grad_norm": 0.010302546434104443, + "learning_rate": 3.396105876180852e-05, + "loss": 0.0218, + "num_input_tokens_seen": 97264512, + "step": 79910 + }, + { + "epoch": 8.900211604855775, + "grad_norm": 0.4215424060821533, + "learning_rate": 3.395879043324534e-05, + "loss": 0.0342, + "num_input_tokens_seen": 97270816, + "step": 79915 + }, + { + "epoch": 8.900768459739393, + "grad_norm": 0.2812682092189789, + "learning_rate": 3.39565220200592e-05, + "loss": 0.012, + "num_input_tokens_seen": 97276928, + "step": 79920 + }, + { + "epoch": 8.901325314623008, + "grad_norm": 1.1699433326721191, + "learning_rate": 3.395425352227155e-05, + "loss": 0.091, + "num_input_tokens_seen": 97283104, + "step": 79925 + }, + { + "epoch": 8.901882169506626, + "grad_norm": 0.005656917579472065, + "learning_rate": 3.395198493990379e-05, + "loss": 0.1574, + "num_input_tokens_seen": 97289088, + "step": 79930 + }, + { + "epoch": 8.902439024390244, + "grad_norm": 0.006504130084067583, + "learning_rate": 3.394971627297736e-05, + "loss": 0.0021, + "num_input_tokens_seen": 97295136, + "step": 79935 + }, + { + "epoch": 8.902995879273861, + "grad_norm": 0.8375278115272522, + "learning_rate": 3.394744752151369e-05, + "loss": 0.224, + "num_input_tokens_seen": 97301120, + "step": 79940 + }, + { + "epoch": 8.90355273415748, + "grad_norm": 0.03663169592618942, + "learning_rate": 3.3945178685534205e-05, + "loss": 0.0314, + "num_input_tokens_seen": 97307136, + "step": 79945 + }, + { + "epoch": 8.904109589041095, + "grad_norm": 1.0722575187683105, + "learning_rate": 3.3942909765060346e-05, + "loss": 0.0617, + "num_input_tokens_seen": 97312672, + "step": 79950 + }, + { + "epoch": 8.904666443924713, + "grad_norm": 1.229315161705017, + "learning_rate": 3.394064076011354e-05, + "loss": 0.0253, + "num_input_tokens_seen": 97318816, + "step": 79955 + }, + { + "epoch": 8.90522329880833, + "grad_norm": 0.03598973900079727, + "learning_rate": 3.393837167071523e-05, + "loss": 0.0059, + "num_input_tokens_seen": 97324992, + "step": 79960 + }, + { + "epoch": 8.905780153691948, + "grad_norm": 0.0008673330303281546, + "learning_rate": 3.393610249688682e-05, + "loss": 0.0035, + "num_input_tokens_seen": 97331360, + "step": 79965 + }, + { + "epoch": 8.906337008575566, + "grad_norm": 1.3380059003829956, + "learning_rate": 3.3933833238649765e-05, + "loss": 0.1475, + "num_input_tokens_seen": 97337280, + "step": 79970 + }, + { + "epoch": 8.906893863459182, + "grad_norm": 1.2428202629089355, + "learning_rate": 3.393156389602551e-05, + "loss": 0.2275, + "num_input_tokens_seen": 97343296, + "step": 79975 + }, + { + "epoch": 8.9074507183428, + "grad_norm": 0.7947535514831543, + "learning_rate": 3.392929446903546e-05, + "loss": 0.0463, + "num_input_tokens_seen": 97349152, + "step": 79980 + }, + { + "epoch": 8.908007573226417, + "grad_norm": 0.00044663672451861203, + "learning_rate": 3.392702495770108e-05, + "loss": 0.0518, + "num_input_tokens_seen": 97355488, + "step": 79985 + }, + { + "epoch": 8.908564428110035, + "grad_norm": 0.17208661139011383, + "learning_rate": 3.392475536204379e-05, + "loss": 0.0368, + "num_input_tokens_seen": 97361216, + "step": 79990 + }, + { + "epoch": 8.909121282993652, + "grad_norm": 0.2213558852672577, + "learning_rate": 3.392248568208505e-05, + "loss": 0.1321, + "num_input_tokens_seen": 97367456, + "step": 79995 + }, + { + "epoch": 8.909678137877268, + "grad_norm": 0.027122508734464645, + "learning_rate": 3.392021591784627e-05, + "loss": 0.0364, + "num_input_tokens_seen": 97373344, + "step": 80000 + }, + { + "epoch": 8.910234992760886, + "grad_norm": 0.016641512513160706, + "learning_rate": 3.3917946069348913e-05, + "loss": 0.0399, + "num_input_tokens_seen": 97379360, + "step": 80005 + }, + { + "epoch": 8.910791847644504, + "grad_norm": 0.4182289242744446, + "learning_rate": 3.3915676136614406e-05, + "loss": 0.0309, + "num_input_tokens_seen": 97385024, + "step": 80010 + }, + { + "epoch": 8.911348702528121, + "grad_norm": 0.19762104749679565, + "learning_rate": 3.3913406119664196e-05, + "loss": 0.0141, + "num_input_tokens_seen": 97391392, + "step": 80015 + }, + { + "epoch": 8.911905557411739, + "grad_norm": 0.4102504253387451, + "learning_rate": 3.3911136018519725e-05, + "loss": 0.0572, + "num_input_tokens_seen": 97397344, + "step": 80020 + }, + { + "epoch": 8.912462412295357, + "grad_norm": 0.0048713586293160915, + "learning_rate": 3.390886583320243e-05, + "loss": 0.0041, + "num_input_tokens_seen": 97403680, + "step": 80025 + }, + { + "epoch": 8.913019267178973, + "grad_norm": 0.6765467524528503, + "learning_rate": 3.390659556373377e-05, + "loss": 0.1225, + "num_input_tokens_seen": 97409760, + "step": 80030 + }, + { + "epoch": 8.91357612206259, + "grad_norm": 0.3869827687740326, + "learning_rate": 3.390432521013516e-05, + "loss": 0.0697, + "num_input_tokens_seen": 97415488, + "step": 80035 + }, + { + "epoch": 8.914132976946208, + "grad_norm": 0.019266923889517784, + "learning_rate": 3.390205477242808e-05, + "loss": 0.0583, + "num_input_tokens_seen": 97421408, + "step": 80040 + }, + { + "epoch": 8.914689831829826, + "grad_norm": 0.4847545325756073, + "learning_rate": 3.389978425063396e-05, + "loss": 0.0185, + "num_input_tokens_seen": 97427584, + "step": 80045 + }, + { + "epoch": 8.915246686713443, + "grad_norm": 0.08201435953378677, + "learning_rate": 3.389751364477424e-05, + "loss": 0.0165, + "num_input_tokens_seen": 97433792, + "step": 80050 + }, + { + "epoch": 8.91580354159706, + "grad_norm": 0.04011613875627518, + "learning_rate": 3.389524295487038e-05, + "loss": 0.0954, + "num_input_tokens_seen": 97439872, + "step": 80055 + }, + { + "epoch": 8.916360396480677, + "grad_norm": 0.0029020854271948338, + "learning_rate": 3.389297218094382e-05, + "loss": 0.007, + "num_input_tokens_seen": 97446208, + "step": 80060 + }, + { + "epoch": 8.916917251364294, + "grad_norm": 0.6692907214164734, + "learning_rate": 3.3890701323016014e-05, + "loss": 0.0252, + "num_input_tokens_seen": 97452256, + "step": 80065 + }, + { + "epoch": 8.917474106247912, + "grad_norm": 0.007794405799359083, + "learning_rate": 3.388843038110842e-05, + "loss": 0.0623, + "num_input_tokens_seen": 97457696, + "step": 80070 + }, + { + "epoch": 8.91803096113153, + "grad_norm": 0.000777208071667701, + "learning_rate": 3.3886159355242476e-05, + "loss": 0.0427, + "num_input_tokens_seen": 97464256, + "step": 80075 + }, + { + "epoch": 8.918587816015146, + "grad_norm": 1.2286874055862427, + "learning_rate": 3.388388824543963e-05, + "loss": 0.0487, + "num_input_tokens_seen": 97470592, + "step": 80080 + }, + { + "epoch": 8.919144670898763, + "grad_norm": 1.176692008972168, + "learning_rate": 3.388161705172135e-05, + "loss": 0.0995, + "num_input_tokens_seen": 97476576, + "step": 80085 + }, + { + "epoch": 8.919701525782381, + "grad_norm": 0.10177001357078552, + "learning_rate": 3.387934577410907e-05, + "loss": 0.0785, + "num_input_tokens_seen": 97482688, + "step": 80090 + }, + { + "epoch": 8.920258380665999, + "grad_norm": 1.7066785097122192, + "learning_rate": 3.3877074412624264e-05, + "loss": 0.077, + "num_input_tokens_seen": 97488960, + "step": 80095 + }, + { + "epoch": 8.920815235549616, + "grad_norm": 0.11463721841573715, + "learning_rate": 3.387480296728837e-05, + "loss": 0.0292, + "num_input_tokens_seen": 97494816, + "step": 80100 + }, + { + "epoch": 8.921372090433232, + "grad_norm": 1.1701037883758545, + "learning_rate": 3.387253143812286e-05, + "loss": 0.1569, + "num_input_tokens_seen": 97500256, + "step": 80105 + }, + { + "epoch": 8.92192894531685, + "grad_norm": 0.036432377994060516, + "learning_rate": 3.387025982514918e-05, + "loss": 0.0011, + "num_input_tokens_seen": 97506112, + "step": 80110 + }, + { + "epoch": 8.922485800200468, + "grad_norm": 0.43205633759498596, + "learning_rate": 3.3867988128388784e-05, + "loss": 0.037, + "num_input_tokens_seen": 97512192, + "step": 80115 + }, + { + "epoch": 8.923042655084085, + "grad_norm": 0.5112236738204956, + "learning_rate": 3.3865716347863144e-05, + "loss": 0.0705, + "num_input_tokens_seen": 97518336, + "step": 80120 + }, + { + "epoch": 8.923599509967703, + "grad_norm": 0.051037244498729706, + "learning_rate": 3.38634444835937e-05, + "loss": 0.063, + "num_input_tokens_seen": 97524160, + "step": 80125 + }, + { + "epoch": 8.92415636485132, + "grad_norm": 1.5234402418136597, + "learning_rate": 3.3861172535601935e-05, + "loss": 0.0874, + "num_input_tokens_seen": 97529920, + "step": 80130 + }, + { + "epoch": 8.924713219734937, + "grad_norm": 0.6012077331542969, + "learning_rate": 3.385890050390929e-05, + "loss": 0.0206, + "num_input_tokens_seen": 97536032, + "step": 80135 + }, + { + "epoch": 8.925270074618554, + "grad_norm": 0.007073049433529377, + "learning_rate": 3.3856628388537225e-05, + "loss": 0.0265, + "num_input_tokens_seen": 97542336, + "step": 80140 + }, + { + "epoch": 8.925826929502172, + "grad_norm": 0.8604639172554016, + "learning_rate": 3.385435618950722e-05, + "loss": 0.0471, + "num_input_tokens_seen": 97548416, + "step": 80145 + }, + { + "epoch": 8.92638378438579, + "grad_norm": 0.006457879673689604, + "learning_rate": 3.385208390684072e-05, + "loss": 0.1052, + "num_input_tokens_seen": 97554560, + "step": 80150 + }, + { + "epoch": 8.926940639269407, + "grad_norm": 0.4625939130783081, + "learning_rate": 3.3849811540559194e-05, + "loss": 0.0165, + "num_input_tokens_seen": 97560832, + "step": 80155 + }, + { + "epoch": 8.927497494153023, + "grad_norm": 0.00016657587548252195, + "learning_rate": 3.384753909068411e-05, + "loss": 0.0696, + "num_input_tokens_seen": 97566848, + "step": 80160 + }, + { + "epoch": 8.92805434903664, + "grad_norm": 1.905349850654602, + "learning_rate": 3.3845266557236934e-05, + "loss": 0.1238, + "num_input_tokens_seen": 97572128, + "step": 80165 + }, + { + "epoch": 8.928611203920259, + "grad_norm": 0.07469391822814941, + "learning_rate": 3.384299394023912e-05, + "loss": 0.0062, + "num_input_tokens_seen": 97577760, + "step": 80170 + }, + { + "epoch": 8.929168058803876, + "grad_norm": 0.007676612585783005, + "learning_rate": 3.3840721239712154e-05, + "loss": 0.019, + "num_input_tokens_seen": 97583968, + "step": 80175 + }, + { + "epoch": 8.929724913687494, + "grad_norm": 0.6751137971878052, + "learning_rate": 3.3838448455677484e-05, + "loss": 0.0886, + "num_input_tokens_seen": 97589824, + "step": 80180 + }, + { + "epoch": 8.93028176857111, + "grad_norm": 0.0485989972949028, + "learning_rate": 3.383617558815659e-05, + "loss": 0.0079, + "num_input_tokens_seen": 97596224, + "step": 80185 + }, + { + "epoch": 8.930838623454727, + "grad_norm": 0.44057339429855347, + "learning_rate": 3.383390263717094e-05, + "loss": 0.0838, + "num_input_tokens_seen": 97602528, + "step": 80190 + }, + { + "epoch": 8.931395478338345, + "grad_norm": 0.0005622797762043774, + "learning_rate": 3.3831629602742e-05, + "loss": 0.0369, + "num_input_tokens_seen": 97608736, + "step": 80195 + }, + { + "epoch": 8.931952333221963, + "grad_norm": 0.474782258272171, + "learning_rate": 3.3829356484891246e-05, + "loss": 0.0605, + "num_input_tokens_seen": 97614624, + "step": 80200 + }, + { + "epoch": 8.93250918810558, + "grad_norm": 0.20623676478862762, + "learning_rate": 3.382708328364014e-05, + "loss": 0.058, + "num_input_tokens_seen": 97620832, + "step": 80205 + }, + { + "epoch": 8.933066042989196, + "grad_norm": 0.13914044201374054, + "learning_rate": 3.3824809999010167e-05, + "loss": 0.0277, + "num_input_tokens_seen": 97626464, + "step": 80210 + }, + { + "epoch": 8.933622897872814, + "grad_norm": 1.6850016117095947, + "learning_rate": 3.382253663102279e-05, + "loss": 0.1388, + "num_input_tokens_seen": 97631552, + "step": 80215 + }, + { + "epoch": 8.934179752756432, + "grad_norm": 1.5573029518127441, + "learning_rate": 3.382026317969949e-05, + "loss": 0.0621, + "num_input_tokens_seen": 97637024, + "step": 80220 + }, + { + "epoch": 8.93473660764005, + "grad_norm": 0.6383238434791565, + "learning_rate": 3.381798964506173e-05, + "loss": 0.0521, + "num_input_tokens_seen": 97643008, + "step": 80225 + }, + { + "epoch": 8.935293462523667, + "grad_norm": 2.582613706588745, + "learning_rate": 3.381571602713099e-05, + "loss": 0.0529, + "num_input_tokens_seen": 97648832, + "step": 80230 + }, + { + "epoch": 8.935850317407283, + "grad_norm": 2.0120975971221924, + "learning_rate": 3.381344232592876e-05, + "loss": 0.1312, + "num_input_tokens_seen": 97655040, + "step": 80235 + }, + { + "epoch": 8.9364071722909, + "grad_norm": 0.015483061783015728, + "learning_rate": 3.38111685414765e-05, + "loss": 0.0268, + "num_input_tokens_seen": 97660288, + "step": 80240 + }, + { + "epoch": 8.936964027174518, + "grad_norm": 0.0016303642187267542, + "learning_rate": 3.380889467379569e-05, + "loss": 0.0074, + "num_input_tokens_seen": 97666496, + "step": 80245 + }, + { + "epoch": 8.937520882058136, + "grad_norm": 2.46413516998291, + "learning_rate": 3.380662072290782e-05, + "loss": 0.0645, + "num_input_tokens_seen": 97672448, + "step": 80250 + }, + { + "epoch": 8.938077736941754, + "grad_norm": 0.6043395400047302, + "learning_rate": 3.380434668883436e-05, + "loss": 0.0514, + "num_input_tokens_seen": 97678528, + "step": 80255 + }, + { + "epoch": 8.93863459182537, + "grad_norm": 0.6660237908363342, + "learning_rate": 3.3802072571596796e-05, + "loss": 0.0852, + "num_input_tokens_seen": 97684512, + "step": 80260 + }, + { + "epoch": 8.939191446708987, + "grad_norm": 0.5808783173561096, + "learning_rate": 3.3799798371216596e-05, + "loss": 0.0317, + "num_input_tokens_seen": 97690752, + "step": 80265 + }, + { + "epoch": 8.939748301592605, + "grad_norm": 3.4778623580932617, + "learning_rate": 3.379752408771526e-05, + "loss": 0.1238, + "num_input_tokens_seen": 97696960, + "step": 80270 + }, + { + "epoch": 8.940305156476223, + "grad_norm": 0.0007239978294819593, + "learning_rate": 3.3795249721114254e-05, + "loss": 0.0351, + "num_input_tokens_seen": 97702752, + "step": 80275 + }, + { + "epoch": 8.94086201135984, + "grad_norm": 0.408650279045105, + "learning_rate": 3.379297527143506e-05, + "loss": 0.0203, + "num_input_tokens_seen": 97709024, + "step": 80280 + }, + { + "epoch": 8.941418866243456, + "grad_norm": 1.8397032022476196, + "learning_rate": 3.379070073869918e-05, + "loss": 0.1572, + "num_input_tokens_seen": 97715520, + "step": 80285 + }, + { + "epoch": 8.941975721127074, + "grad_norm": 0.007950671948492527, + "learning_rate": 3.3788426122928083e-05, + "loss": 0.0417, + "num_input_tokens_seen": 97721792, + "step": 80290 + }, + { + "epoch": 8.942532576010692, + "grad_norm": 1.0635219812393188, + "learning_rate": 3.378615142414327e-05, + "loss": 0.0403, + "num_input_tokens_seen": 97727360, + "step": 80295 + }, + { + "epoch": 8.94308943089431, + "grad_norm": 0.10201744735240936, + "learning_rate": 3.378387664236621e-05, + "loss": 0.0799, + "num_input_tokens_seen": 97733248, + "step": 80300 + }, + { + "epoch": 8.943646285777927, + "grad_norm": 0.09874977171421051, + "learning_rate": 3.37816017776184e-05, + "loss": 0.0279, + "num_input_tokens_seen": 97739808, + "step": 80305 + }, + { + "epoch": 8.944203140661543, + "grad_norm": 0.35697317123413086, + "learning_rate": 3.377932682992132e-05, + "loss": 0.0239, + "num_input_tokens_seen": 97745856, + "step": 80310 + }, + { + "epoch": 8.94475999554516, + "grad_norm": 1.072860598564148, + "learning_rate": 3.3777051799296466e-05, + "loss": 0.016, + "num_input_tokens_seen": 97751808, + "step": 80315 + }, + { + "epoch": 8.945316850428778, + "grad_norm": 1.1591824293136597, + "learning_rate": 3.377477668576533e-05, + "loss": 0.0447, + "num_input_tokens_seen": 97757920, + "step": 80320 + }, + { + "epoch": 8.945873705312396, + "grad_norm": 2.486135244369507, + "learning_rate": 3.377250148934939e-05, + "loss": 0.1308, + "num_input_tokens_seen": 97764000, + "step": 80325 + }, + { + "epoch": 8.946430560196013, + "grad_norm": 0.12636375427246094, + "learning_rate": 3.3770226210070156e-05, + "loss": 0.0452, + "num_input_tokens_seen": 97769408, + "step": 80330 + }, + { + "epoch": 8.94698741507963, + "grad_norm": 0.0012673166347667575, + "learning_rate": 3.37679508479491e-05, + "loss": 0.0376, + "num_input_tokens_seen": 97775616, + "step": 80335 + }, + { + "epoch": 8.947544269963247, + "grad_norm": 0.3837047219276428, + "learning_rate": 3.3765675403007725e-05, + "loss": 0.0237, + "num_input_tokens_seen": 97781920, + "step": 80340 + }, + { + "epoch": 8.948101124846865, + "grad_norm": 0.4119654893875122, + "learning_rate": 3.376339987526752e-05, + "loss": 0.0216, + "num_input_tokens_seen": 97788192, + "step": 80345 + }, + { + "epoch": 8.948657979730482, + "grad_norm": 0.023019738495349884, + "learning_rate": 3.376112426475e-05, + "loss": 0.1363, + "num_input_tokens_seen": 97794304, + "step": 80350 + }, + { + "epoch": 8.9492148346141, + "grad_norm": 0.0024827385786920786, + "learning_rate": 3.375884857147662e-05, + "loss": 0.0091, + "num_input_tokens_seen": 97800448, + "step": 80355 + }, + { + "epoch": 8.949771689497716, + "grad_norm": 2.776050329208374, + "learning_rate": 3.375657279546891e-05, + "loss": 0.0943, + "num_input_tokens_seen": 97806848, + "step": 80360 + }, + { + "epoch": 8.950328544381334, + "grad_norm": 0.00024624232901260257, + "learning_rate": 3.375429693674835e-05, + "loss": 0.034, + "num_input_tokens_seen": 97813376, + "step": 80365 + }, + { + "epoch": 8.950885399264951, + "grad_norm": 0.9815363883972168, + "learning_rate": 3.375202099533644e-05, + "loss": 0.0683, + "num_input_tokens_seen": 97819232, + "step": 80370 + }, + { + "epoch": 8.951442254148569, + "grad_norm": 1.0520695447921753, + "learning_rate": 3.374974497125468e-05, + "loss": 0.0554, + "num_input_tokens_seen": 97825312, + "step": 80375 + }, + { + "epoch": 8.951999109032187, + "grad_norm": 0.5929498076438904, + "learning_rate": 3.374746886452458e-05, + "loss": 0.0328, + "num_input_tokens_seen": 97831648, + "step": 80380 + }, + { + "epoch": 8.952555963915804, + "grad_norm": 0.4561990797519684, + "learning_rate": 3.374519267516761e-05, + "loss": 0.0192, + "num_input_tokens_seen": 97837440, + "step": 80385 + }, + { + "epoch": 8.95311281879942, + "grad_norm": 1.1721359491348267, + "learning_rate": 3.3742916403205303e-05, + "loss": 0.0394, + "num_input_tokens_seen": 97843264, + "step": 80390 + }, + { + "epoch": 8.953669673683038, + "grad_norm": 0.0008691630209796131, + "learning_rate": 3.374064004865914e-05, + "loss": 0.0335, + "num_input_tokens_seen": 97849440, + "step": 80395 + }, + { + "epoch": 8.954226528566656, + "grad_norm": 0.40075162053108215, + "learning_rate": 3.3738363611550626e-05, + "loss": 0.0742, + "num_input_tokens_seen": 97855680, + "step": 80400 + }, + { + "epoch": 8.954783383450273, + "grad_norm": 0.059102002531290054, + "learning_rate": 3.373608709190127e-05, + "loss": 0.0298, + "num_input_tokens_seen": 97861920, + "step": 80405 + }, + { + "epoch": 8.955340238333891, + "grad_norm": 1.1773109436035156, + "learning_rate": 3.373381048973257e-05, + "loss": 0.0637, + "num_input_tokens_seen": 97868480, + "step": 80410 + }, + { + "epoch": 8.955897093217507, + "grad_norm": 0.3297075927257538, + "learning_rate": 3.373153380506604e-05, + "loss": 0.1216, + "num_input_tokens_seen": 97874720, + "step": 80415 + }, + { + "epoch": 8.956453948101124, + "grad_norm": 0.07333333045244217, + "learning_rate": 3.372925703792317e-05, + "loss": 0.0993, + "num_input_tokens_seen": 97880768, + "step": 80420 + }, + { + "epoch": 8.957010802984742, + "grad_norm": 0.1613546460866928, + "learning_rate": 3.372698018832548e-05, + "loss": 0.0565, + "num_input_tokens_seen": 97887072, + "step": 80425 + }, + { + "epoch": 8.95756765786836, + "grad_norm": 0.03200598806142807, + "learning_rate": 3.372470325629446e-05, + "loss": 0.0634, + "num_input_tokens_seen": 97893344, + "step": 80430 + }, + { + "epoch": 8.958124512751978, + "grad_norm": 2.082083225250244, + "learning_rate": 3.372242624185164e-05, + "loss": 0.1024, + "num_input_tokens_seen": 97899488, + "step": 80435 + }, + { + "epoch": 8.958681367635593, + "grad_norm": 0.0454748198390007, + "learning_rate": 3.37201491450185e-05, + "loss": 0.0052, + "num_input_tokens_seen": 97905536, + "step": 80440 + }, + { + "epoch": 8.959238222519211, + "grad_norm": 0.0005161810549907386, + "learning_rate": 3.371787196581658e-05, + "loss": 0.0548, + "num_input_tokens_seen": 97911488, + "step": 80445 + }, + { + "epoch": 8.959795077402829, + "grad_norm": 0.384640097618103, + "learning_rate": 3.371559470426737e-05, + "loss": 0.0211, + "num_input_tokens_seen": 97917408, + "step": 80450 + }, + { + "epoch": 8.960351932286446, + "grad_norm": 0.013662034645676613, + "learning_rate": 3.371331736039238e-05, + "loss": 0.0058, + "num_input_tokens_seen": 97923904, + "step": 80455 + }, + { + "epoch": 8.960908787170064, + "grad_norm": 0.08800995349884033, + "learning_rate": 3.371103993421313e-05, + "loss": 0.0764, + "num_input_tokens_seen": 97929856, + "step": 80460 + }, + { + "epoch": 8.96146564205368, + "grad_norm": 0.03239927068352699, + "learning_rate": 3.370876242575113e-05, + "loss": 0.0728, + "num_input_tokens_seen": 97936000, + "step": 80465 + }, + { + "epoch": 8.962022496937298, + "grad_norm": 0.015468936413526535, + "learning_rate": 3.370648483502789e-05, + "loss": 0.0575, + "num_input_tokens_seen": 97941728, + "step": 80470 + }, + { + "epoch": 8.962579351820915, + "grad_norm": 2.231839656829834, + "learning_rate": 3.370420716206493e-05, + "loss": 0.1024, + "num_input_tokens_seen": 97947104, + "step": 80475 + }, + { + "epoch": 8.963136206704533, + "grad_norm": 1.486018180847168, + "learning_rate": 3.370192940688376e-05, + "loss": 0.0263, + "num_input_tokens_seen": 97953568, + "step": 80480 + }, + { + "epoch": 8.96369306158815, + "grad_norm": 0.6508786678314209, + "learning_rate": 3.369965156950589e-05, + "loss": 0.0946, + "num_input_tokens_seen": 97960064, + "step": 80485 + }, + { + "epoch": 8.964249916471768, + "grad_norm": 0.2350345104932785, + "learning_rate": 3.369737364995284e-05, + "loss": 0.0311, + "num_input_tokens_seen": 97966336, + "step": 80490 + }, + { + "epoch": 8.964806771355384, + "grad_norm": 0.0697280615568161, + "learning_rate": 3.369509564824613e-05, + "loss": 0.1304, + "num_input_tokens_seen": 97972256, + "step": 80495 + }, + { + "epoch": 8.965363626239002, + "grad_norm": 1.3681937456130981, + "learning_rate": 3.3692817564407276e-05, + "loss": 0.1151, + "num_input_tokens_seen": 97978368, + "step": 80500 + }, + { + "epoch": 8.96592048112262, + "grad_norm": 0.002409841865301132, + "learning_rate": 3.369053939845779e-05, + "loss": 0.0346, + "num_input_tokens_seen": 97984352, + "step": 80505 + }, + { + "epoch": 8.966477336006237, + "grad_norm": 0.0675264298915863, + "learning_rate": 3.36882611504192e-05, + "loss": 0.0343, + "num_input_tokens_seen": 97990016, + "step": 80510 + }, + { + "epoch": 8.967034190889855, + "grad_norm": 0.08513010293245316, + "learning_rate": 3.368598282031302e-05, + "loss": 0.0292, + "num_input_tokens_seen": 97995904, + "step": 80515 + }, + { + "epoch": 8.96759104577347, + "grad_norm": 0.005418344866484404, + "learning_rate": 3.368370440816078e-05, + "loss": 0.0694, + "num_input_tokens_seen": 98002272, + "step": 80520 + }, + { + "epoch": 8.968147900657089, + "grad_norm": 0.32477277517318726, + "learning_rate": 3.368142591398399e-05, + "loss": 0.0632, + "num_input_tokens_seen": 98008096, + "step": 80525 + }, + { + "epoch": 8.968704755540706, + "grad_norm": 0.6335891485214233, + "learning_rate": 3.3679147337804176e-05, + "loss": 0.0556, + "num_input_tokens_seen": 98014304, + "step": 80530 + }, + { + "epoch": 8.969261610424324, + "grad_norm": 0.7548618912696838, + "learning_rate": 3.367686867964287e-05, + "loss": 0.0165, + "num_input_tokens_seen": 98020480, + "step": 80535 + }, + { + "epoch": 8.969818465307942, + "grad_norm": 0.011201120913028717, + "learning_rate": 3.367458993952158e-05, + "loss": 0.0422, + "num_input_tokens_seen": 98026560, + "step": 80540 + }, + { + "epoch": 8.970375320191557, + "grad_norm": 2.9360952377319336, + "learning_rate": 3.367231111746184e-05, + "loss": 0.2131, + "num_input_tokens_seen": 98032800, + "step": 80545 + }, + { + "epoch": 8.970932175075175, + "grad_norm": 0.2587609589099884, + "learning_rate": 3.367003221348517e-05, + "loss": 0.1449, + "num_input_tokens_seen": 98038944, + "step": 80550 + }, + { + "epoch": 8.971489029958793, + "grad_norm": 0.00048193385009653866, + "learning_rate": 3.3667753227613096e-05, + "loss": 0.0072, + "num_input_tokens_seen": 98045216, + "step": 80555 + }, + { + "epoch": 8.97204588484241, + "grad_norm": 0.004144189413636923, + "learning_rate": 3.366547415986716e-05, + "loss": 0.0421, + "num_input_tokens_seen": 98051520, + "step": 80560 + }, + { + "epoch": 8.972602739726028, + "grad_norm": 0.000445742771262303, + "learning_rate": 3.3663195010268865e-05, + "loss": 0.0007, + "num_input_tokens_seen": 98057376, + "step": 80565 + }, + { + "epoch": 8.973159594609644, + "grad_norm": 0.14834344387054443, + "learning_rate": 3.3660915778839764e-05, + "loss": 0.0664, + "num_input_tokens_seen": 98063392, + "step": 80570 + }, + { + "epoch": 8.973716449493262, + "grad_norm": 0.02344409190118313, + "learning_rate": 3.365863646560137e-05, + "loss": 0.005, + "num_input_tokens_seen": 98069536, + "step": 80575 + }, + { + "epoch": 8.97427330437688, + "grad_norm": 1.2494474649429321, + "learning_rate": 3.365635707057522e-05, + "loss": 0.0782, + "num_input_tokens_seen": 98075552, + "step": 80580 + }, + { + "epoch": 8.974830159260497, + "grad_norm": 0.49678030610084534, + "learning_rate": 3.365407759378284e-05, + "loss": 0.0463, + "num_input_tokens_seen": 98080928, + "step": 80585 + }, + { + "epoch": 8.975387014144115, + "grad_norm": 1.550711750984192, + "learning_rate": 3.3651798035245756e-05, + "loss": 0.0365, + "num_input_tokens_seen": 98087520, + "step": 80590 + }, + { + "epoch": 8.97594386902773, + "grad_norm": 0.005093301646411419, + "learning_rate": 3.364951839498552e-05, + "loss": 0.0114, + "num_input_tokens_seen": 98093920, + "step": 80595 + }, + { + "epoch": 8.976500723911348, + "grad_norm": 0.015216940082609653, + "learning_rate": 3.3647238673023644e-05, + "loss": 0.1498, + "num_input_tokens_seen": 98100064, + "step": 80600 + }, + { + "epoch": 8.977057578794966, + "grad_norm": 0.0014420553343370557, + "learning_rate": 3.3644958869381673e-05, + "loss": 0.1127, + "num_input_tokens_seen": 98106112, + "step": 80605 + }, + { + "epoch": 8.977614433678584, + "grad_norm": 0.2803739011287689, + "learning_rate": 3.3642678984081146e-05, + "loss": 0.113, + "num_input_tokens_seen": 98112224, + "step": 80610 + }, + { + "epoch": 8.978171288562201, + "grad_norm": 0.1903815120458603, + "learning_rate": 3.364039901714358e-05, + "loss": 0.0634, + "num_input_tokens_seen": 98118432, + "step": 80615 + }, + { + "epoch": 8.978728143445817, + "grad_norm": 1.5714212656021118, + "learning_rate": 3.363811896859053e-05, + "loss": 0.1155, + "num_input_tokens_seen": 98124320, + "step": 80620 + }, + { + "epoch": 8.979284998329435, + "grad_norm": 0.1299239844083786, + "learning_rate": 3.363583883844352e-05, + "loss": 0.0482, + "num_input_tokens_seen": 98130752, + "step": 80625 + }, + { + "epoch": 8.979841853213053, + "grad_norm": 0.002076755976304412, + "learning_rate": 3.3633558626724104e-05, + "loss": 0.047, + "num_input_tokens_seen": 98137056, + "step": 80630 + }, + { + "epoch": 8.98039870809667, + "grad_norm": 0.5495311617851257, + "learning_rate": 3.36312783334538e-05, + "loss": 0.1036, + "num_input_tokens_seen": 98143200, + "step": 80635 + }, + { + "epoch": 8.980955562980288, + "grad_norm": 1.227520227432251, + "learning_rate": 3.362899795865416e-05, + "loss": 0.0618, + "num_input_tokens_seen": 98149056, + "step": 80640 + }, + { + "epoch": 8.981512417863904, + "grad_norm": 0.27480632066726685, + "learning_rate": 3.362671750234673e-05, + "loss": 0.0415, + "num_input_tokens_seen": 98155136, + "step": 80645 + }, + { + "epoch": 8.982069272747522, + "grad_norm": 0.33995217084884644, + "learning_rate": 3.3624436964553024e-05, + "loss": 0.0288, + "num_input_tokens_seen": 98161376, + "step": 80650 + }, + { + "epoch": 8.98262612763114, + "grad_norm": 0.05698731541633606, + "learning_rate": 3.362215634529461e-05, + "loss": 0.0897, + "num_input_tokens_seen": 98167456, + "step": 80655 + }, + { + "epoch": 8.983182982514757, + "grad_norm": 0.6974306106567383, + "learning_rate": 3.3619875644593026e-05, + "loss": 0.0197, + "num_input_tokens_seen": 98173856, + "step": 80660 + }, + { + "epoch": 8.983739837398375, + "grad_norm": 0.9901477098464966, + "learning_rate": 3.36175948624698e-05, + "loss": 0.1966, + "num_input_tokens_seen": 98180320, + "step": 80665 + }, + { + "epoch": 8.98429669228199, + "grad_norm": 0.3889741599559784, + "learning_rate": 3.3615313998946496e-05, + "loss": 0.1002, + "num_input_tokens_seen": 98186048, + "step": 80670 + }, + { + "epoch": 8.984853547165608, + "grad_norm": 0.05614430457353592, + "learning_rate": 3.361303305404465e-05, + "loss": 0.1041, + "num_input_tokens_seen": 98192192, + "step": 80675 + }, + { + "epoch": 8.985410402049226, + "grad_norm": 0.43304258584976196, + "learning_rate": 3.36107520277858e-05, + "loss": 0.0235, + "num_input_tokens_seen": 98197792, + "step": 80680 + }, + { + "epoch": 8.985967256932843, + "grad_norm": 0.9465844631195068, + "learning_rate": 3.3608470920191494e-05, + "loss": 0.0689, + "num_input_tokens_seen": 98203744, + "step": 80685 + }, + { + "epoch": 8.986524111816461, + "grad_norm": 0.013606488704681396, + "learning_rate": 3.3606189731283286e-05, + "loss": 0.0112, + "num_input_tokens_seen": 98209792, + "step": 80690 + }, + { + "epoch": 8.987080966700077, + "grad_norm": 1.3041976690292358, + "learning_rate": 3.3603908461082724e-05, + "loss": 0.0545, + "num_input_tokens_seen": 98216000, + "step": 80695 + }, + { + "epoch": 8.987637821583695, + "grad_norm": 0.2800102233886719, + "learning_rate": 3.3601627109611356e-05, + "loss": 0.0557, + "num_input_tokens_seen": 98222368, + "step": 80700 + }, + { + "epoch": 8.988194676467312, + "grad_norm": 2.8252294063568115, + "learning_rate": 3.359934567689072e-05, + "loss": 0.1124, + "num_input_tokens_seen": 98228608, + "step": 80705 + }, + { + "epoch": 8.98875153135093, + "grad_norm": 0.0003996948944404721, + "learning_rate": 3.3597064162942385e-05, + "loss": 0.0224, + "num_input_tokens_seen": 98234880, + "step": 80710 + }, + { + "epoch": 8.989308386234548, + "grad_norm": 1.163586974143982, + "learning_rate": 3.359478256778788e-05, + "loss": 0.0541, + "num_input_tokens_seen": 98240832, + "step": 80715 + }, + { + "epoch": 8.989865241118165, + "grad_norm": 0.8671236634254456, + "learning_rate": 3.359250089144877e-05, + "loss": 0.0497, + "num_input_tokens_seen": 98246656, + "step": 80720 + }, + { + "epoch": 8.990422096001781, + "grad_norm": 1.65716552734375, + "learning_rate": 3.359021913394661e-05, + "loss": 0.097, + "num_input_tokens_seen": 98252960, + "step": 80725 + }, + { + "epoch": 8.990978950885399, + "grad_norm": 0.4732095003128052, + "learning_rate": 3.358793729530294e-05, + "loss": 0.0255, + "num_input_tokens_seen": 98258464, + "step": 80730 + }, + { + "epoch": 8.991535805769017, + "grad_norm": 0.5601478219032288, + "learning_rate": 3.358565537553933e-05, + "loss": 0.0808, + "num_input_tokens_seen": 98264704, + "step": 80735 + }, + { + "epoch": 8.992092660652634, + "grad_norm": 0.06843564659357071, + "learning_rate": 3.358337337467733e-05, + "loss": 0.0179, + "num_input_tokens_seen": 98270976, + "step": 80740 + }, + { + "epoch": 8.992649515536252, + "grad_norm": 0.5570576190948486, + "learning_rate": 3.3581091292738474e-05, + "loss": 0.042, + "num_input_tokens_seen": 98277088, + "step": 80745 + }, + { + "epoch": 8.993206370419868, + "grad_norm": 0.057117003947496414, + "learning_rate": 3.3578809129744355e-05, + "loss": 0.1265, + "num_input_tokens_seen": 98282944, + "step": 80750 + }, + { + "epoch": 8.993763225303486, + "grad_norm": 0.5281667709350586, + "learning_rate": 3.35765268857165e-05, + "loss": 0.0545, + "num_input_tokens_seen": 98289056, + "step": 80755 + }, + { + "epoch": 8.994320080187103, + "grad_norm": 0.0003338076057843864, + "learning_rate": 3.357424456067648e-05, + "loss": 0.0502, + "num_input_tokens_seen": 98294848, + "step": 80760 + }, + { + "epoch": 8.994876935070721, + "grad_norm": 0.17956720292568207, + "learning_rate": 3.357196215464585e-05, + "loss": 0.0028, + "num_input_tokens_seen": 98301024, + "step": 80765 + }, + { + "epoch": 8.995433789954339, + "grad_norm": 0.14743247628211975, + "learning_rate": 3.356967966764617e-05, + "loss": 0.1324, + "num_input_tokens_seen": 98307040, + "step": 80770 + }, + { + "epoch": 8.995990644837955, + "grad_norm": 0.013125110417604446, + "learning_rate": 3.3567397099699e-05, + "loss": 0.0223, + "num_input_tokens_seen": 98313152, + "step": 80775 + }, + { + "epoch": 8.996547499721572, + "grad_norm": 0.6553931832313538, + "learning_rate": 3.356511445082589e-05, + "loss": 0.1107, + "num_input_tokens_seen": 98319168, + "step": 80780 + }, + { + "epoch": 8.99710435460519, + "grad_norm": 0.0039869556203484535, + "learning_rate": 3.356283172104843e-05, + "loss": 0.0467, + "num_input_tokens_seen": 98325216, + "step": 80785 + }, + { + "epoch": 8.997661209488808, + "grad_norm": 0.13956516981124878, + "learning_rate": 3.3560548910388155e-05, + "loss": 0.0934, + "num_input_tokens_seen": 98330912, + "step": 80790 + }, + { + "epoch": 8.998218064372425, + "grad_norm": 0.07406696677207947, + "learning_rate": 3.355826601886664e-05, + "loss": 0.0109, + "num_input_tokens_seen": 98337184, + "step": 80795 + }, + { + "epoch": 8.998774919256041, + "grad_norm": 0.22557666897773743, + "learning_rate": 3.355598304650544e-05, + "loss": 0.057, + "num_input_tokens_seen": 98343264, + "step": 80800 + }, + { + "epoch": 8.999331774139659, + "grad_norm": 0.001215729396790266, + "learning_rate": 3.355369999332613e-05, + "loss": 0.1491, + "num_input_tokens_seen": 98348960, + "step": 80805 + }, + { + "epoch": 8.999888629023276, + "grad_norm": 0.4940868020057678, + "learning_rate": 3.355141685935027e-05, + "loss": 0.0729, + "num_input_tokens_seen": 98355200, + "step": 80810 + }, + { + "epoch": 9.0, + "eval_loss": 0.07710938155651093, + "eval_runtime": 112.2879, + "eval_samples_per_second": 35.543, + "eval_steps_per_second": 8.888, + "num_input_tokens_seen": 98355744, + "step": 80811 + }, + { + "epoch": 9.000445483906894, + "grad_norm": 0.06184947490692139, + "learning_rate": 3.3549133644599416e-05, + "loss": 0.0825, + "num_input_tokens_seen": 98360576, + "step": 80815 + }, + { + "epoch": 9.001002338790512, + "grad_norm": 0.5602763891220093, + "learning_rate": 3.354685034909516e-05, + "loss": 0.024, + "num_input_tokens_seen": 98366624, + "step": 80820 + }, + { + "epoch": 9.001559193674128, + "grad_norm": 0.011167045682668686, + "learning_rate": 3.3544566972859045e-05, + "loss": 0.0919, + "num_input_tokens_seen": 98372384, + "step": 80825 + }, + { + "epoch": 9.002116048557745, + "grad_norm": 1.0750962495803833, + "learning_rate": 3.3542283515912656e-05, + "loss": 0.1035, + "num_input_tokens_seen": 98378176, + "step": 80830 + }, + { + "epoch": 9.002672903441363, + "grad_norm": 0.05548188462853432, + "learning_rate": 3.3539999978277556e-05, + "loss": 0.1115, + "num_input_tokens_seen": 98383552, + "step": 80835 + }, + { + "epoch": 9.00322975832498, + "grad_norm": 0.12660089135169983, + "learning_rate": 3.353771635997531e-05, + "loss": 0.0119, + "num_input_tokens_seen": 98389792, + "step": 80840 + }, + { + "epoch": 9.003786613208598, + "grad_norm": 1.2052677869796753, + "learning_rate": 3.353543266102749e-05, + "loss": 0.0505, + "num_input_tokens_seen": 98395776, + "step": 80845 + }, + { + "epoch": 9.004343468092214, + "grad_norm": 0.04976950213313103, + "learning_rate": 3.353314888145568e-05, + "loss": 0.0359, + "num_input_tokens_seen": 98401312, + "step": 80850 + }, + { + "epoch": 9.004900322975832, + "grad_norm": 0.44923847913742065, + "learning_rate": 3.353086502128143e-05, + "loss": 0.0361, + "num_input_tokens_seen": 98407488, + "step": 80855 + }, + { + "epoch": 9.00545717785945, + "grad_norm": 0.26309698820114136, + "learning_rate": 3.3528581080526336e-05, + "loss": 0.0083, + "num_input_tokens_seen": 98413696, + "step": 80860 + }, + { + "epoch": 9.006014032743067, + "grad_norm": 0.010497304610908031, + "learning_rate": 3.352629705921195e-05, + "loss": 0.0408, + "num_input_tokens_seen": 98419776, + "step": 80865 + }, + { + "epoch": 9.006570887626685, + "grad_norm": 0.0008543786825612187, + "learning_rate": 3.352401295735986e-05, + "loss": 0.0129, + "num_input_tokens_seen": 98425984, + "step": 80870 + }, + { + "epoch": 9.007127742510303, + "grad_norm": 1.308247447013855, + "learning_rate": 3.352172877499165e-05, + "loss": 0.1508, + "num_input_tokens_seen": 98432160, + "step": 80875 + }, + { + "epoch": 9.007684597393919, + "grad_norm": 0.5821569561958313, + "learning_rate": 3.351944451212887e-05, + "loss": 0.101, + "num_input_tokens_seen": 98438304, + "step": 80880 + }, + { + "epoch": 9.008241452277536, + "grad_norm": 1.6066547632217407, + "learning_rate": 3.3517160168793116e-05, + "loss": 0.0974, + "num_input_tokens_seen": 98444480, + "step": 80885 + }, + { + "epoch": 9.008798307161154, + "grad_norm": 0.6917492151260376, + "learning_rate": 3.351487574500597e-05, + "loss": 0.0144, + "num_input_tokens_seen": 98450816, + "step": 80890 + }, + { + "epoch": 9.009355162044772, + "grad_norm": 0.10203748941421509, + "learning_rate": 3.351259124078899e-05, + "loss": 0.0092, + "num_input_tokens_seen": 98457280, + "step": 80895 + }, + { + "epoch": 9.00991201692839, + "grad_norm": 0.8078155517578125, + "learning_rate": 3.351030665616376e-05, + "loss": 0.0553, + "num_input_tokens_seen": 98462912, + "step": 80900 + }, + { + "epoch": 9.010468871812005, + "grad_norm": 0.03647070750594139, + "learning_rate": 3.350802199115187e-05, + "loss": 0.0083, + "num_input_tokens_seen": 98469056, + "step": 80905 + }, + { + "epoch": 9.011025726695623, + "grad_norm": 0.14738313853740692, + "learning_rate": 3.35057372457749e-05, + "loss": 0.0889, + "num_input_tokens_seen": 98475232, + "step": 80910 + }, + { + "epoch": 9.01158258157924, + "grad_norm": 0.00043614034075289965, + "learning_rate": 3.3503452420054424e-05, + "loss": 0.1194, + "num_input_tokens_seen": 98481408, + "step": 80915 + }, + { + "epoch": 9.012139436462858, + "grad_norm": 0.1664266735315323, + "learning_rate": 3.350116751401203e-05, + "loss": 0.0417, + "num_input_tokens_seen": 98487520, + "step": 80920 + }, + { + "epoch": 9.012696291346476, + "grad_norm": 2.2463419437408447, + "learning_rate": 3.3498882527669294e-05, + "loss": 0.0203, + "num_input_tokens_seen": 98493984, + "step": 80925 + }, + { + "epoch": 9.013253146230092, + "grad_norm": 1.2862811088562012, + "learning_rate": 3.3496597461047804e-05, + "loss": 0.0511, + "num_input_tokens_seen": 98500256, + "step": 80930 + }, + { + "epoch": 9.01381000111371, + "grad_norm": 0.3256187438964844, + "learning_rate": 3.349431231416914e-05, + "loss": 0.1387, + "num_input_tokens_seen": 98506304, + "step": 80935 + }, + { + "epoch": 9.014366855997327, + "grad_norm": 0.3361518085002899, + "learning_rate": 3.34920270870549e-05, + "loss": 0.0092, + "num_input_tokens_seen": 98512352, + "step": 80940 + }, + { + "epoch": 9.014923710880945, + "grad_norm": 0.0006010553915984929, + "learning_rate": 3.348974177972666e-05, + "loss": 0.0392, + "num_input_tokens_seen": 98517984, + "step": 80945 + }, + { + "epoch": 9.015480565764562, + "grad_norm": 1.3101507425308228, + "learning_rate": 3.3487456392206e-05, + "loss": 0.1407, + "num_input_tokens_seen": 98523840, + "step": 80950 + }, + { + "epoch": 9.016037420648178, + "grad_norm": 0.004996092524379492, + "learning_rate": 3.3485170924514514e-05, + "loss": 0.0374, + "num_input_tokens_seen": 98529952, + "step": 80955 + }, + { + "epoch": 9.016594275531796, + "grad_norm": 0.04763292148709297, + "learning_rate": 3.3482885376673786e-05, + "loss": 0.0639, + "num_input_tokens_seen": 98536320, + "step": 80960 + }, + { + "epoch": 9.017151130415414, + "grad_norm": 0.1741093546152115, + "learning_rate": 3.348059974870543e-05, + "loss": 0.0731, + "num_input_tokens_seen": 98542240, + "step": 80965 + }, + { + "epoch": 9.017707985299031, + "grad_norm": 0.0009555184515193105, + "learning_rate": 3.347831404063099e-05, + "loss": 0.0115, + "num_input_tokens_seen": 98548448, + "step": 80970 + }, + { + "epoch": 9.018264840182649, + "grad_norm": 0.7206113934516907, + "learning_rate": 3.34760282524721e-05, + "loss": 0.0397, + "num_input_tokens_seen": 98554432, + "step": 80975 + }, + { + "epoch": 9.018821695066265, + "grad_norm": 0.01877305842936039, + "learning_rate": 3.347374238425032e-05, + "loss": 0.0596, + "num_input_tokens_seen": 98560960, + "step": 80980 + }, + { + "epoch": 9.019378549949883, + "grad_norm": 0.7690560817718506, + "learning_rate": 3.3471456435987264e-05, + "loss": 0.0516, + "num_input_tokens_seen": 98567424, + "step": 80985 + }, + { + "epoch": 9.0199354048335, + "grad_norm": 0.14106006920337677, + "learning_rate": 3.346917040770451e-05, + "loss": 0.0462, + "num_input_tokens_seen": 98573568, + "step": 80990 + }, + { + "epoch": 9.020492259717118, + "grad_norm": 0.07022827863693237, + "learning_rate": 3.346688429942365e-05, + "loss": 0.0569, + "num_input_tokens_seen": 98579648, + "step": 80995 + }, + { + "epoch": 9.021049114600736, + "grad_norm": 0.006078932899981737, + "learning_rate": 3.34645981111663e-05, + "loss": 0.0061, + "num_input_tokens_seen": 98585600, + "step": 81000 + }, + { + "epoch": 9.021605969484352, + "grad_norm": 0.06398102641105652, + "learning_rate": 3.346231184295402e-05, + "loss": 0.045, + "num_input_tokens_seen": 98591808, + "step": 81005 + }, + { + "epoch": 9.02216282436797, + "grad_norm": 0.263145387172699, + "learning_rate": 3.346002549480844e-05, + "loss": 0.0602, + "num_input_tokens_seen": 98597376, + "step": 81010 + }, + { + "epoch": 9.022719679251587, + "grad_norm": 0.21433448791503906, + "learning_rate": 3.345773906675113e-05, + "loss": 0.0719, + "num_input_tokens_seen": 98603424, + "step": 81015 + }, + { + "epoch": 9.023276534135205, + "grad_norm": 0.6984963417053223, + "learning_rate": 3.34554525588037e-05, + "loss": 0.0369, + "num_input_tokens_seen": 98609696, + "step": 81020 + }, + { + "epoch": 9.023833389018822, + "grad_norm": 1.1181244850158691, + "learning_rate": 3.345316597098775e-05, + "loss": 0.0569, + "num_input_tokens_seen": 98615584, + "step": 81025 + }, + { + "epoch": 9.024390243902438, + "grad_norm": 0.11061255633831024, + "learning_rate": 3.3450879303324875e-05, + "loss": 0.039, + "num_input_tokens_seen": 98621728, + "step": 81030 + }, + { + "epoch": 9.024947098786056, + "grad_norm": 0.6631901264190674, + "learning_rate": 3.3448592555836676e-05, + "loss": 0.0426, + "num_input_tokens_seen": 98627872, + "step": 81035 + }, + { + "epoch": 9.025503953669674, + "grad_norm": 0.001973559148609638, + "learning_rate": 3.3446305728544746e-05, + "loss": 0.021, + "num_input_tokens_seen": 98634240, + "step": 81040 + }, + { + "epoch": 9.026060808553291, + "grad_norm": 0.1361994445323944, + "learning_rate": 3.34440188214707e-05, + "loss": 0.0073, + "num_input_tokens_seen": 98640416, + "step": 81045 + }, + { + "epoch": 9.026617663436909, + "grad_norm": 0.15135304629802704, + "learning_rate": 3.3441731834636126e-05, + "loss": 0.1223, + "num_input_tokens_seen": 98646656, + "step": 81050 + }, + { + "epoch": 9.027174518320527, + "grad_norm": 0.008392213843762875, + "learning_rate": 3.343944476806262e-05, + "loss": 0.0166, + "num_input_tokens_seen": 98652768, + "step": 81055 + }, + { + "epoch": 9.027731373204142, + "grad_norm": 0.27761155366897583, + "learning_rate": 3.3437157621771814e-05, + "loss": 0.048, + "num_input_tokens_seen": 98658080, + "step": 81060 + }, + { + "epoch": 9.02828822808776, + "grad_norm": 0.005282638128846884, + "learning_rate": 3.343487039578529e-05, + "loss": 0.0503, + "num_input_tokens_seen": 98664352, + "step": 81065 + }, + { + "epoch": 9.028845082971378, + "grad_norm": 0.42840757966041565, + "learning_rate": 3.343258309012466e-05, + "loss": 0.0549, + "num_input_tokens_seen": 98670016, + "step": 81070 + }, + { + "epoch": 9.029401937854995, + "grad_norm": 0.2724533975124359, + "learning_rate": 3.343029570481152e-05, + "loss": 0.0531, + "num_input_tokens_seen": 98676064, + "step": 81075 + }, + { + "epoch": 9.029958792738613, + "grad_norm": 0.7610793113708496, + "learning_rate": 3.342800823986748e-05, + "loss": 0.0746, + "num_input_tokens_seen": 98682080, + "step": 81080 + }, + { + "epoch": 9.030515647622229, + "grad_norm": 0.6352273225784302, + "learning_rate": 3.3425720695314156e-05, + "loss": 0.0272, + "num_input_tokens_seen": 98688160, + "step": 81085 + }, + { + "epoch": 9.031072502505847, + "grad_norm": 1.106257677078247, + "learning_rate": 3.342343307117314e-05, + "loss": 0.0746, + "num_input_tokens_seen": 98694240, + "step": 81090 + }, + { + "epoch": 9.031629357389464, + "grad_norm": 0.14491918683052063, + "learning_rate": 3.342114536746607e-05, + "loss": 0.0051, + "num_input_tokens_seen": 98700672, + "step": 81095 + }, + { + "epoch": 9.032186212273082, + "grad_norm": 0.5228701233863831, + "learning_rate": 3.3418857584214516e-05, + "loss": 0.0105, + "num_input_tokens_seen": 98706848, + "step": 81100 + }, + { + "epoch": 9.0327430671567, + "grad_norm": 0.012544045224785805, + "learning_rate": 3.3416569721440116e-05, + "loss": 0.0007, + "num_input_tokens_seen": 98713440, + "step": 81105 + }, + { + "epoch": 9.033299922040316, + "grad_norm": 0.4897569417953491, + "learning_rate": 3.341428177916447e-05, + "loss": 0.0323, + "num_input_tokens_seen": 98719488, + "step": 81110 + }, + { + "epoch": 9.033856776923933, + "grad_norm": 0.8741334676742554, + "learning_rate": 3.341199375740919e-05, + "loss": 0.0408, + "num_input_tokens_seen": 98725696, + "step": 81115 + }, + { + "epoch": 9.034413631807551, + "grad_norm": 0.15750034153461456, + "learning_rate": 3.340970565619589e-05, + "loss": 0.0235, + "num_input_tokens_seen": 98731840, + "step": 81120 + }, + { + "epoch": 9.034970486691169, + "grad_norm": 0.010574591346085072, + "learning_rate": 3.340741747554618e-05, + "loss": 0.0157, + "num_input_tokens_seen": 98737984, + "step": 81125 + }, + { + "epoch": 9.035527341574786, + "grad_norm": 0.9175511598587036, + "learning_rate": 3.340512921548168e-05, + "loss": 0.1481, + "num_input_tokens_seen": 98744128, + "step": 81130 + }, + { + "epoch": 9.036084196458402, + "grad_norm": 0.03611036762595177, + "learning_rate": 3.3402840876024e-05, + "loss": 0.0425, + "num_input_tokens_seen": 98749984, + "step": 81135 + }, + { + "epoch": 9.03664105134202, + "grad_norm": 0.008753878064453602, + "learning_rate": 3.3400552457194756e-05, + "loss": 0.0576, + "num_input_tokens_seen": 98756128, + "step": 81140 + }, + { + "epoch": 9.037197906225638, + "grad_norm": 0.17136061191558838, + "learning_rate": 3.3398263959015555e-05, + "loss": 0.0582, + "num_input_tokens_seen": 98762240, + "step": 81145 + }, + { + "epoch": 9.037754761109255, + "grad_norm": 0.44191643595695496, + "learning_rate": 3.339597538150803e-05, + "loss": 0.1257, + "num_input_tokens_seen": 98768544, + "step": 81150 + }, + { + "epoch": 9.038311615992873, + "grad_norm": 0.001213316572830081, + "learning_rate": 3.33936867246938e-05, + "loss": 0.0943, + "num_input_tokens_seen": 98773856, + "step": 81155 + }, + { + "epoch": 9.038868470876489, + "grad_norm": 0.00021079367434140295, + "learning_rate": 3.339139798859445e-05, + "loss": 0.1338, + "num_input_tokens_seen": 98780288, + "step": 81160 + }, + { + "epoch": 9.039425325760106, + "grad_norm": 0.5494847893714905, + "learning_rate": 3.338910917323165e-05, + "loss": 0.013, + "num_input_tokens_seen": 98786208, + "step": 81165 + }, + { + "epoch": 9.039982180643724, + "grad_norm": 0.0002978882985189557, + "learning_rate": 3.338682027862697e-05, + "loss": 0.0623, + "num_input_tokens_seen": 98792128, + "step": 81170 + }, + { + "epoch": 9.040539035527342, + "grad_norm": 0.0025468396488577127, + "learning_rate": 3.338453130480207e-05, + "loss": 0.0718, + "num_input_tokens_seen": 98798176, + "step": 81175 + }, + { + "epoch": 9.04109589041096, + "grad_norm": 0.1642242670059204, + "learning_rate": 3.3382242251778546e-05, + "loss": 0.0108, + "num_input_tokens_seen": 98804544, + "step": 81180 + }, + { + "epoch": 9.041652745294575, + "grad_norm": 0.12132159620523453, + "learning_rate": 3.337995311957802e-05, + "loss": 0.0223, + "num_input_tokens_seen": 98810560, + "step": 81185 + }, + { + "epoch": 9.042209600178193, + "grad_norm": 0.4627402126789093, + "learning_rate": 3.337766390822214e-05, + "loss": 0.0484, + "num_input_tokens_seen": 98816672, + "step": 81190 + }, + { + "epoch": 9.04276645506181, + "grad_norm": 1.426298975944519, + "learning_rate": 3.3375374617732503e-05, + "loss": 0.0307, + "num_input_tokens_seen": 98822720, + "step": 81195 + }, + { + "epoch": 9.043323309945428, + "grad_norm": 0.024430586025118828, + "learning_rate": 3.3373085248130746e-05, + "loss": 0.053, + "num_input_tokens_seen": 98829024, + "step": 81200 + }, + { + "epoch": 9.043880164829046, + "grad_norm": 0.7977350354194641, + "learning_rate": 3.3370795799438487e-05, + "loss": 0.01, + "num_input_tokens_seen": 98834976, + "step": 81205 + }, + { + "epoch": 9.044437019712662, + "grad_norm": 0.7433570027351379, + "learning_rate": 3.3368506271677355e-05, + "loss": 0.072, + "num_input_tokens_seen": 98841248, + "step": 81210 + }, + { + "epoch": 9.04499387459628, + "grad_norm": 2.2804226875305176, + "learning_rate": 3.336621666486898e-05, + "loss": 0.1005, + "num_input_tokens_seen": 98847360, + "step": 81215 + }, + { + "epoch": 9.045550729479897, + "grad_norm": 0.003315854351967573, + "learning_rate": 3.336392697903498e-05, + "loss": 0.0099, + "num_input_tokens_seen": 98853536, + "step": 81220 + }, + { + "epoch": 9.046107584363515, + "grad_norm": 0.28124284744262695, + "learning_rate": 3.3361637214197e-05, + "loss": 0.0198, + "num_input_tokens_seen": 98859776, + "step": 81225 + }, + { + "epoch": 9.046664439247133, + "grad_norm": 0.31054675579071045, + "learning_rate": 3.335934737037665e-05, + "loss": 0.0084, + "num_input_tokens_seen": 98866080, + "step": 81230 + }, + { + "epoch": 9.04722129413075, + "grad_norm": 0.03228507190942764, + "learning_rate": 3.335705744759556e-05, + "loss": 0.0009, + "num_input_tokens_seen": 98872288, + "step": 81235 + }, + { + "epoch": 9.047778149014366, + "grad_norm": 0.003158707870170474, + "learning_rate": 3.3354767445875376e-05, + "loss": 0.0459, + "num_input_tokens_seen": 98877728, + "step": 81240 + }, + { + "epoch": 9.048335003897984, + "grad_norm": 0.0022707448806613684, + "learning_rate": 3.3352477365237713e-05, + "loss": 0.0312, + "num_input_tokens_seen": 98883680, + "step": 81245 + }, + { + "epoch": 9.048891858781602, + "grad_norm": 0.6086447238922119, + "learning_rate": 3.335018720570422e-05, + "loss": 0.0552, + "num_input_tokens_seen": 98889408, + "step": 81250 + }, + { + "epoch": 9.04944871366522, + "grad_norm": 0.44886282086372375, + "learning_rate": 3.334789696729651e-05, + "loss": 0.0535, + "num_input_tokens_seen": 98895616, + "step": 81255 + }, + { + "epoch": 9.050005568548837, + "grad_norm": 0.07668779790401459, + "learning_rate": 3.334560665003623e-05, + "loss": 0.0852, + "num_input_tokens_seen": 98901664, + "step": 81260 + }, + { + "epoch": 9.050562423432453, + "grad_norm": 0.0005051825428381562, + "learning_rate": 3.3343316253945e-05, + "loss": 0.0387, + "num_input_tokens_seen": 98907872, + "step": 81265 + }, + { + "epoch": 9.05111927831607, + "grad_norm": 0.0005523967556655407, + "learning_rate": 3.334102577904448e-05, + "loss": 0.0356, + "num_input_tokens_seen": 98913824, + "step": 81270 + }, + { + "epoch": 9.051676133199688, + "grad_norm": 0.9285399317741394, + "learning_rate": 3.333873522535627e-05, + "loss": 0.0699, + "num_input_tokens_seen": 98919680, + "step": 81275 + }, + { + "epoch": 9.052232988083306, + "grad_norm": 0.028132809326052666, + "learning_rate": 3.3336444592902025e-05, + "loss": 0.0773, + "num_input_tokens_seen": 98925984, + "step": 81280 + }, + { + "epoch": 9.052789842966924, + "grad_norm": 0.10036604851484299, + "learning_rate": 3.33341538817034e-05, + "loss": 0.042, + "num_input_tokens_seen": 98932320, + "step": 81285 + }, + { + "epoch": 9.05334669785054, + "grad_norm": 0.0046267276629805565, + "learning_rate": 3.3331863091782e-05, + "loss": 0.0541, + "num_input_tokens_seen": 98938848, + "step": 81290 + }, + { + "epoch": 9.053903552734157, + "grad_norm": 1.548794150352478, + "learning_rate": 3.332957222315948e-05, + "loss": 0.1637, + "num_input_tokens_seen": 98944832, + "step": 81295 + }, + { + "epoch": 9.054460407617775, + "grad_norm": 0.10994765907526016, + "learning_rate": 3.332728127585748e-05, + "loss": 0.1277, + "num_input_tokens_seen": 98951424, + "step": 81300 + }, + { + "epoch": 9.055017262501392, + "grad_norm": 0.014800514094531536, + "learning_rate": 3.332499024989763e-05, + "loss": 0.0131, + "num_input_tokens_seen": 98957696, + "step": 81305 + }, + { + "epoch": 9.05557411738501, + "grad_norm": 0.08530913293361664, + "learning_rate": 3.332269914530159e-05, + "loss": 0.0052, + "num_input_tokens_seen": 98963680, + "step": 81310 + }, + { + "epoch": 9.056130972268626, + "grad_norm": 0.14611102640628815, + "learning_rate": 3.332040796209098e-05, + "loss": 0.0022, + "num_input_tokens_seen": 98969792, + "step": 81315 + }, + { + "epoch": 9.056687827152244, + "grad_norm": 0.01591327227652073, + "learning_rate": 3.331811670028746e-05, + "loss": 0.0429, + "num_input_tokens_seen": 98975296, + "step": 81320 + }, + { + "epoch": 9.057244682035861, + "grad_norm": 0.02470850571990013, + "learning_rate": 3.331582535991265e-05, + "loss": 0.0051, + "num_input_tokens_seen": 98981440, + "step": 81325 + }, + { + "epoch": 9.057801536919479, + "grad_norm": 0.21717636287212372, + "learning_rate": 3.331353394098821e-05, + "loss": 0.0581, + "num_input_tokens_seen": 98987520, + "step": 81330 + }, + { + "epoch": 9.058358391803097, + "grad_norm": 0.07239966094493866, + "learning_rate": 3.331124244353579e-05, + "loss": 0.0154, + "num_input_tokens_seen": 98993536, + "step": 81335 + }, + { + "epoch": 9.058915246686713, + "grad_norm": 0.46480679512023926, + "learning_rate": 3.330895086757702e-05, + "loss": 0.1664, + "num_input_tokens_seen": 98999616, + "step": 81340 + }, + { + "epoch": 9.05947210157033, + "grad_norm": 0.6568213701248169, + "learning_rate": 3.330665921313355e-05, + "loss": 0.1683, + "num_input_tokens_seen": 99005184, + "step": 81345 + }, + { + "epoch": 9.060028956453948, + "grad_norm": 3.155148983001709, + "learning_rate": 3.3304367480227026e-05, + "loss": 0.1026, + "num_input_tokens_seen": 99011392, + "step": 81350 + }, + { + "epoch": 9.060585811337566, + "grad_norm": 0.11257291585206985, + "learning_rate": 3.330207566887911e-05, + "loss": 0.0277, + "num_input_tokens_seen": 99017664, + "step": 81355 + }, + { + "epoch": 9.061142666221183, + "grad_norm": 0.004853707738220692, + "learning_rate": 3.3299783779111424e-05, + "loss": 0.0857, + "num_input_tokens_seen": 99023136, + "step": 81360 + }, + { + "epoch": 9.0616995211048, + "grad_norm": 0.7192617058753967, + "learning_rate": 3.329749181094564e-05, + "loss": 0.0177, + "num_input_tokens_seen": 99029344, + "step": 81365 + }, + { + "epoch": 9.062256375988417, + "grad_norm": 0.05818657949566841, + "learning_rate": 3.329519976440339e-05, + "loss": 0.0768, + "num_input_tokens_seen": 99035712, + "step": 81370 + }, + { + "epoch": 9.062813230872035, + "grad_norm": 0.003774491837248206, + "learning_rate": 3.329290763950633e-05, + "loss": 0.0264, + "num_input_tokens_seen": 99042016, + "step": 81375 + }, + { + "epoch": 9.063370085755652, + "grad_norm": 0.9710713028907776, + "learning_rate": 3.329061543627613e-05, + "loss": 0.097, + "num_input_tokens_seen": 99048224, + "step": 81380 + }, + { + "epoch": 9.06392694063927, + "grad_norm": 1.0847426652908325, + "learning_rate": 3.3288323154734414e-05, + "loss": 0.0643, + "num_input_tokens_seen": 99054464, + "step": 81385 + }, + { + "epoch": 9.064483795522886, + "grad_norm": 0.558516263961792, + "learning_rate": 3.328603079490285e-05, + "loss": 0.0791, + "num_input_tokens_seen": 99060288, + "step": 81390 + }, + { + "epoch": 9.065040650406504, + "grad_norm": 0.00027330248849466443, + "learning_rate": 3.328373835680307e-05, + "loss": 0.0964, + "num_input_tokens_seen": 99066528, + "step": 81395 + }, + { + "epoch": 9.065597505290121, + "grad_norm": 0.5901575088500977, + "learning_rate": 3.328144584045676e-05, + "loss": 0.0149, + "num_input_tokens_seen": 99072608, + "step": 81400 + }, + { + "epoch": 9.066154360173739, + "grad_norm": 0.04844922572374344, + "learning_rate": 3.3279153245885556e-05, + "loss": 0.0511, + "num_input_tokens_seen": 99079008, + "step": 81405 + }, + { + "epoch": 9.066711215057357, + "grad_norm": 0.35563379526138306, + "learning_rate": 3.3276860573111115e-05, + "loss": 0.0377, + "num_input_tokens_seen": 99085216, + "step": 81410 + }, + { + "epoch": 9.067268069940974, + "grad_norm": 0.08523914217948914, + "learning_rate": 3.32745678221551e-05, + "loss": 0.0245, + "num_input_tokens_seen": 99091488, + "step": 81415 + }, + { + "epoch": 9.06782492482459, + "grad_norm": 0.5372883081436157, + "learning_rate": 3.327227499303915e-05, + "loss": 0.0593, + "num_input_tokens_seen": 99097280, + "step": 81420 + }, + { + "epoch": 9.068381779708208, + "grad_norm": 0.3936280310153961, + "learning_rate": 3.3269982085784945e-05, + "loss": 0.0581, + "num_input_tokens_seen": 99103392, + "step": 81425 + }, + { + "epoch": 9.068938634591825, + "grad_norm": 0.19545231759548187, + "learning_rate": 3.326768910041413e-05, + "loss": 0.0095, + "num_input_tokens_seen": 99109696, + "step": 81430 + }, + { + "epoch": 9.069495489475443, + "grad_norm": 0.06666859239339828, + "learning_rate": 3.3265396036948374e-05, + "loss": 0.0528, + "num_input_tokens_seen": 99115424, + "step": 81435 + }, + { + "epoch": 9.07005234435906, + "grad_norm": 0.12711472809314728, + "learning_rate": 3.3263102895409325e-05, + "loss": 0.0067, + "num_input_tokens_seen": 99121600, + "step": 81440 + }, + { + "epoch": 9.070609199242677, + "grad_norm": 0.15381349623203278, + "learning_rate": 3.326080967581865e-05, + "loss": 0.0551, + "num_input_tokens_seen": 99127712, + "step": 81445 + }, + { + "epoch": 9.071166054126294, + "grad_norm": 0.04363812506198883, + "learning_rate": 3.325851637819801e-05, + "loss": 0.009, + "num_input_tokens_seen": 99133344, + "step": 81450 + }, + { + "epoch": 9.071722909009912, + "grad_norm": 0.09673769772052765, + "learning_rate": 3.325622300256906e-05, + "loss": 0.1123, + "num_input_tokens_seen": 99139680, + "step": 81455 + }, + { + "epoch": 9.07227976389353, + "grad_norm": 0.11564341932535172, + "learning_rate": 3.325392954895348e-05, + "loss": 0.0356, + "num_input_tokens_seen": 99145600, + "step": 81460 + }, + { + "epoch": 9.072836618777147, + "grad_norm": 1.901627779006958, + "learning_rate": 3.3251636017372914e-05, + "loss": 0.1591, + "num_input_tokens_seen": 99151840, + "step": 81465 + }, + { + "epoch": 9.073393473660763, + "grad_norm": 0.2721787095069885, + "learning_rate": 3.3249342407849036e-05, + "loss": 0.0585, + "num_input_tokens_seen": 99158016, + "step": 81470 + }, + { + "epoch": 9.073950328544381, + "grad_norm": 1.0766174793243408, + "learning_rate": 3.3247048720403514e-05, + "loss": 0.0372, + "num_input_tokens_seen": 99164384, + "step": 81475 + }, + { + "epoch": 9.074507183427999, + "grad_norm": 2.3443071842193604, + "learning_rate": 3.3244754955058005e-05, + "loss": 0.0305, + "num_input_tokens_seen": 99170432, + "step": 81480 + }, + { + "epoch": 9.075064038311616, + "grad_norm": 0.020516246557235718, + "learning_rate": 3.3242461111834186e-05, + "loss": 0.0208, + "num_input_tokens_seen": 99176704, + "step": 81485 + }, + { + "epoch": 9.075620893195234, + "grad_norm": 0.5265535116195679, + "learning_rate": 3.324016719075371e-05, + "loss": 0.0076, + "num_input_tokens_seen": 99182688, + "step": 81490 + }, + { + "epoch": 9.07617774807885, + "grad_norm": 1.2950501441955566, + "learning_rate": 3.323787319183825e-05, + "loss": 0.0421, + "num_input_tokens_seen": 99188576, + "step": 81495 + }, + { + "epoch": 9.076734602962468, + "grad_norm": 0.3861944377422333, + "learning_rate": 3.323557911510949e-05, + "loss": 0.0441, + "num_input_tokens_seen": 99194752, + "step": 81500 + }, + { + "epoch": 9.077291457846085, + "grad_norm": 1.0939511060714722, + "learning_rate": 3.323328496058907e-05, + "loss": 0.047, + "num_input_tokens_seen": 99200768, + "step": 81505 + }, + { + "epoch": 9.077848312729703, + "grad_norm": 0.06210458651185036, + "learning_rate": 3.3230990728298695e-05, + "loss": 0.0057, + "num_input_tokens_seen": 99207328, + "step": 81510 + }, + { + "epoch": 9.07840516761332, + "grad_norm": 0.30836376547813416, + "learning_rate": 3.322869641826001e-05, + "loss": 0.0328, + "num_input_tokens_seen": 99213312, + "step": 81515 + }, + { + "epoch": 9.078962022496937, + "grad_norm": 0.08688271045684814, + "learning_rate": 3.3226402030494694e-05, + "loss": 0.0095, + "num_input_tokens_seen": 99219776, + "step": 81520 + }, + { + "epoch": 9.079518877380554, + "grad_norm": 0.0006164017831906676, + "learning_rate": 3.322410756502442e-05, + "loss": 0.0293, + "num_input_tokens_seen": 99225280, + "step": 81525 + }, + { + "epoch": 9.080075732264172, + "grad_norm": 0.027468299493193626, + "learning_rate": 3.322181302187086e-05, + "loss": 0.0331, + "num_input_tokens_seen": 99231360, + "step": 81530 + }, + { + "epoch": 9.08063258714779, + "grad_norm": 0.0031951547134667635, + "learning_rate": 3.321951840105569e-05, + "loss": 0.1847, + "num_input_tokens_seen": 99237504, + "step": 81535 + }, + { + "epoch": 9.081189442031407, + "grad_norm": 2.6610281467437744, + "learning_rate": 3.321722370260058e-05, + "loss": 0.0345, + "num_input_tokens_seen": 99243680, + "step": 81540 + }, + { + "epoch": 9.081746296915023, + "grad_norm": 0.0003151865385007113, + "learning_rate": 3.321492892652722e-05, + "loss": 0.1036, + "num_input_tokens_seen": 99249888, + "step": 81545 + }, + { + "epoch": 9.08230315179864, + "grad_norm": 0.0014521824195981026, + "learning_rate": 3.321263407285727e-05, + "loss": 0.033, + "num_input_tokens_seen": 99255072, + "step": 81550 + }, + { + "epoch": 9.082860006682258, + "grad_norm": 0.07021934539079666, + "learning_rate": 3.3210339141612406e-05, + "loss": 0.0742, + "num_input_tokens_seen": 99261280, + "step": 81555 + }, + { + "epoch": 9.083416861565876, + "grad_norm": 1.2447174787521362, + "learning_rate": 3.320804413281432e-05, + "loss": 0.1231, + "num_input_tokens_seen": 99267392, + "step": 81560 + }, + { + "epoch": 9.083973716449494, + "grad_norm": 0.5180215835571289, + "learning_rate": 3.320574904648467e-05, + "loss": 0.0758, + "num_input_tokens_seen": 99273312, + "step": 81565 + }, + { + "epoch": 9.084530571333111, + "grad_norm": 0.987917423248291, + "learning_rate": 3.320345388264515e-05, + "loss": 0.0741, + "num_input_tokens_seen": 99279552, + "step": 81570 + }, + { + "epoch": 9.085087426216727, + "grad_norm": 1.3140789270401, + "learning_rate": 3.320115864131744e-05, + "loss": 0.0365, + "num_input_tokens_seen": 99285856, + "step": 81575 + }, + { + "epoch": 9.085644281100345, + "grad_norm": 0.04662375524640083, + "learning_rate": 3.319886332252321e-05, + "loss": 0.0405, + "num_input_tokens_seen": 99292000, + "step": 81580 + }, + { + "epoch": 9.086201135983963, + "grad_norm": 0.07535643875598907, + "learning_rate": 3.319656792628415e-05, + "loss": 0.0271, + "num_input_tokens_seen": 99297920, + "step": 81585 + }, + { + "epoch": 9.08675799086758, + "grad_norm": 0.6961100697517395, + "learning_rate": 3.3194272452621934e-05, + "loss": 0.0383, + "num_input_tokens_seen": 99304032, + "step": 81590 + }, + { + "epoch": 9.087314845751198, + "grad_norm": 0.4994705617427826, + "learning_rate": 3.319197690155826e-05, + "loss": 0.0304, + "num_input_tokens_seen": 99310240, + "step": 81595 + }, + { + "epoch": 9.087871700634814, + "grad_norm": 0.052387844771146774, + "learning_rate": 3.318968127311479e-05, + "loss": 0.0614, + "num_input_tokens_seen": 99316320, + "step": 81600 + }, + { + "epoch": 9.088428555518432, + "grad_norm": 0.0022418212611228228, + "learning_rate": 3.318738556731323e-05, + "loss": 0.0215, + "num_input_tokens_seen": 99322176, + "step": 81605 + }, + { + "epoch": 9.08898541040205, + "grad_norm": 0.31787601113319397, + "learning_rate": 3.318508978417524e-05, + "loss": 0.0044, + "num_input_tokens_seen": 99328224, + "step": 81610 + }, + { + "epoch": 9.089542265285667, + "grad_norm": 0.0013652759371325374, + "learning_rate": 3.3182793923722535e-05, + "loss": 0.0448, + "num_input_tokens_seen": 99334528, + "step": 81615 + }, + { + "epoch": 9.090099120169285, + "grad_norm": 0.1826661378145218, + "learning_rate": 3.318049798597677e-05, + "loss": 0.0443, + "num_input_tokens_seen": 99340640, + "step": 81620 + }, + { + "epoch": 9.0906559750529, + "grad_norm": 0.002400875324383378, + "learning_rate": 3.317820197095966e-05, + "loss": 0.0639, + "num_input_tokens_seen": 99346784, + "step": 81625 + }, + { + "epoch": 9.091212829936518, + "grad_norm": 0.04117303341627121, + "learning_rate": 3.317590587869287e-05, + "loss": 0.0237, + "num_input_tokens_seen": 99352608, + "step": 81630 + }, + { + "epoch": 9.091769684820136, + "grad_norm": 0.4074319303035736, + "learning_rate": 3.31736097091981e-05, + "loss": 0.0389, + "num_input_tokens_seen": 99358976, + "step": 81635 + }, + { + "epoch": 9.092326539703754, + "grad_norm": 0.2540757656097412, + "learning_rate": 3.3171313462497046e-05, + "loss": 0.0364, + "num_input_tokens_seen": 99364800, + "step": 81640 + }, + { + "epoch": 9.092883394587371, + "grad_norm": 0.803233802318573, + "learning_rate": 3.316901713861138e-05, + "loss": 0.0211, + "num_input_tokens_seen": 99370816, + "step": 81645 + }, + { + "epoch": 9.093440249470987, + "grad_norm": 0.0005546145257540047, + "learning_rate": 3.3166720737562806e-05, + "loss": 0.0063, + "num_input_tokens_seen": 99376832, + "step": 81650 + }, + { + "epoch": 9.093997104354605, + "grad_norm": 0.6497820615768433, + "learning_rate": 3.316442425937302e-05, + "loss": 0.1153, + "num_input_tokens_seen": 99383040, + "step": 81655 + }, + { + "epoch": 9.094553959238223, + "grad_norm": 0.46512383222579956, + "learning_rate": 3.316212770406369e-05, + "loss": 0.022, + "num_input_tokens_seen": 99389088, + "step": 81660 + }, + { + "epoch": 9.09511081412184, + "grad_norm": 0.0025474531576037407, + "learning_rate": 3.315983107165653e-05, + "loss": 0.0019, + "num_input_tokens_seen": 99395168, + "step": 81665 + }, + { + "epoch": 9.095667669005458, + "grad_norm": 1.8461840152740479, + "learning_rate": 3.315753436217323e-05, + "loss": 0.0765, + "num_input_tokens_seen": 99401024, + "step": 81670 + }, + { + "epoch": 9.096224523889074, + "grad_norm": 0.09529999643564224, + "learning_rate": 3.315523757563549e-05, + "loss": 0.094, + "num_input_tokens_seen": 99406464, + "step": 81675 + }, + { + "epoch": 9.096781378772691, + "grad_norm": 0.08961820602416992, + "learning_rate": 3.3152940712064984e-05, + "loss": 0.0179, + "num_input_tokens_seen": 99412288, + "step": 81680 + }, + { + "epoch": 9.09733823365631, + "grad_norm": 1.3423057794570923, + "learning_rate": 3.315064377148343e-05, + "loss": 0.156, + "num_input_tokens_seen": 99418592, + "step": 81685 + }, + { + "epoch": 9.097895088539927, + "grad_norm": 0.90006422996521, + "learning_rate": 3.3148346753912505e-05, + "loss": 0.0758, + "num_input_tokens_seen": 99424000, + "step": 81690 + }, + { + "epoch": 9.098451943423544, + "grad_norm": 0.0007220424595288932, + "learning_rate": 3.314604965937392e-05, + "loss": 0.0363, + "num_input_tokens_seen": 99429984, + "step": 81695 + }, + { + "epoch": 9.09900879830716, + "grad_norm": 0.36224794387817383, + "learning_rate": 3.314375248788938e-05, + "loss": 0.024, + "num_input_tokens_seen": 99436256, + "step": 81700 + }, + { + "epoch": 9.099565653190778, + "grad_norm": 0.5762689113616943, + "learning_rate": 3.3141455239480566e-05, + "loss": 0.0684, + "num_input_tokens_seen": 99442080, + "step": 81705 + }, + { + "epoch": 9.100122508074396, + "grad_norm": 0.012536066584289074, + "learning_rate": 3.313915791416919e-05, + "loss": 0.006, + "num_input_tokens_seen": 99448256, + "step": 81710 + }, + { + "epoch": 9.100679362958013, + "grad_norm": 1.0285097360610962, + "learning_rate": 3.3136860511976944e-05, + "loss": 0.036, + "num_input_tokens_seen": 99454400, + "step": 81715 + }, + { + "epoch": 9.101236217841631, + "grad_norm": 0.7318660020828247, + "learning_rate": 3.3134563032925524e-05, + "loss": 0.0218, + "num_input_tokens_seen": 99460320, + "step": 81720 + }, + { + "epoch": 9.101793072725247, + "grad_norm": 0.4213750660419464, + "learning_rate": 3.313226547703664e-05, + "loss": 0.0561, + "num_input_tokens_seen": 99466336, + "step": 81725 + }, + { + "epoch": 9.102349927608865, + "grad_norm": 0.007748625241219997, + "learning_rate": 3.3129967844332e-05, + "loss": 0.0312, + "num_input_tokens_seen": 99472672, + "step": 81730 + }, + { + "epoch": 9.102906782492482, + "grad_norm": 0.05278163030743599, + "learning_rate": 3.3127670134833296e-05, + "loss": 0.0609, + "num_input_tokens_seen": 99478944, + "step": 81735 + }, + { + "epoch": 9.1034636373761, + "grad_norm": 0.0333702526986599, + "learning_rate": 3.312537234856223e-05, + "loss": 0.0924, + "num_input_tokens_seen": 99484832, + "step": 81740 + }, + { + "epoch": 9.104020492259718, + "grad_norm": 0.6697988510131836, + "learning_rate": 3.3123074485540526e-05, + "loss": 0.0731, + "num_input_tokens_seen": 99491008, + "step": 81745 + }, + { + "epoch": 9.104577347143334, + "grad_norm": 0.41493672132492065, + "learning_rate": 3.3120776545789865e-05, + "loss": 0.061, + "num_input_tokens_seen": 99496608, + "step": 81750 + }, + { + "epoch": 9.105134202026951, + "grad_norm": 0.17228572070598602, + "learning_rate": 3.311847852933197e-05, + "loss": 0.0783, + "num_input_tokens_seen": 99502688, + "step": 81755 + }, + { + "epoch": 9.105691056910569, + "grad_norm": 0.0016444111242890358, + "learning_rate": 3.311618043618854e-05, + "loss": 0.0169, + "num_input_tokens_seen": 99508544, + "step": 81760 + }, + { + "epoch": 9.106247911794187, + "grad_norm": 0.7628913521766663, + "learning_rate": 3.3113882266381276e-05, + "loss": 0.1003, + "num_input_tokens_seen": 99514880, + "step": 81765 + }, + { + "epoch": 9.106804766677804, + "grad_norm": 0.14819003641605377, + "learning_rate": 3.31115840199319e-05, + "loss": 0.1576, + "num_input_tokens_seen": 99521152, + "step": 81770 + }, + { + "epoch": 9.107361621561422, + "grad_norm": 0.2575362026691437, + "learning_rate": 3.310928569686211e-05, + "loss": 0.0232, + "num_input_tokens_seen": 99526720, + "step": 81775 + }, + { + "epoch": 9.107918476445038, + "grad_norm": 0.7688719630241394, + "learning_rate": 3.3106987297193616e-05, + "loss": 0.1576, + "num_input_tokens_seen": 99532128, + "step": 81780 + }, + { + "epoch": 9.108475331328655, + "grad_norm": 1.1625174283981323, + "learning_rate": 3.3104688820948136e-05, + "loss": 0.0252, + "num_input_tokens_seen": 99538400, + "step": 81785 + }, + { + "epoch": 9.109032186212273, + "grad_norm": 0.004372279625386, + "learning_rate": 3.310239026814738e-05, + "loss": 0.0421, + "num_input_tokens_seen": 99544832, + "step": 81790 + }, + { + "epoch": 9.10958904109589, + "grad_norm": 1.1731592416763306, + "learning_rate": 3.310009163881305e-05, + "loss": 0.1716, + "num_input_tokens_seen": 99550848, + "step": 81795 + }, + { + "epoch": 9.110145895979509, + "grad_norm": 0.00580520648509264, + "learning_rate": 3.309779293296688e-05, + "loss": 0.1629, + "num_input_tokens_seen": 99556960, + "step": 81800 + }, + { + "epoch": 9.110702750863124, + "grad_norm": 0.04101381078362465, + "learning_rate": 3.3095494150630544e-05, + "loss": 0.0561, + "num_input_tokens_seen": 99563296, + "step": 81805 + }, + { + "epoch": 9.111259605746742, + "grad_norm": 0.004515499342232943, + "learning_rate": 3.30931952918258e-05, + "loss": 0.0545, + "num_input_tokens_seen": 99569440, + "step": 81810 + }, + { + "epoch": 9.11181646063036, + "grad_norm": 0.017880767583847046, + "learning_rate": 3.309089635657433e-05, + "loss": 0.0075, + "num_input_tokens_seen": 99575968, + "step": 81815 + }, + { + "epoch": 9.112373315513977, + "grad_norm": 0.04334201663732529, + "learning_rate": 3.308859734489787e-05, + "loss": 0.0216, + "num_input_tokens_seen": 99582112, + "step": 81820 + }, + { + "epoch": 9.112930170397595, + "grad_norm": 0.3519405424594879, + "learning_rate": 3.308629825681812e-05, + "loss": 0.026, + "num_input_tokens_seen": 99587552, + "step": 81825 + }, + { + "epoch": 9.113487025281211, + "grad_norm": 0.07593969255685806, + "learning_rate": 3.308399909235681e-05, + "loss": 0.0482, + "num_input_tokens_seen": 99593760, + "step": 81830 + }, + { + "epoch": 9.114043880164829, + "grad_norm": 0.0033570483792573214, + "learning_rate": 3.3081699851535655e-05, + "loss": 0.0168, + "num_input_tokens_seen": 99599776, + "step": 81835 + }, + { + "epoch": 9.114600735048446, + "grad_norm": 1.8400185108184814, + "learning_rate": 3.3079400534376356e-05, + "loss": 0.101, + "num_input_tokens_seen": 99605824, + "step": 81840 + }, + { + "epoch": 9.115157589932064, + "grad_norm": 1.0663243532180786, + "learning_rate": 3.3077101140900656e-05, + "loss": 0.0505, + "num_input_tokens_seen": 99611776, + "step": 81845 + }, + { + "epoch": 9.115714444815682, + "grad_norm": 0.2144966572523117, + "learning_rate": 3.3074801671130266e-05, + "loss": 0.1248, + "num_input_tokens_seen": 99617920, + "step": 81850 + }, + { + "epoch": 9.116271299699298, + "grad_norm": 0.011145518161356449, + "learning_rate": 3.30725021250869e-05, + "loss": 0.005, + "num_input_tokens_seen": 99624000, + "step": 81855 + }, + { + "epoch": 9.116828154582915, + "grad_norm": 1.449951171875, + "learning_rate": 3.3070202502792286e-05, + "loss": 0.0511, + "num_input_tokens_seen": 99630208, + "step": 81860 + }, + { + "epoch": 9.117385009466533, + "grad_norm": 0.21927979588508606, + "learning_rate": 3.306790280426814e-05, + "loss": 0.103, + "num_input_tokens_seen": 99636192, + "step": 81865 + }, + { + "epoch": 9.11794186435015, + "grad_norm": 0.008596536703407764, + "learning_rate": 3.3065603029536194e-05, + "loss": 0.0916, + "num_input_tokens_seen": 99642560, + "step": 81870 + }, + { + "epoch": 9.118498719233768, + "grad_norm": 0.5265907645225525, + "learning_rate": 3.306330317861817e-05, + "loss": 0.0219, + "num_input_tokens_seen": 99648800, + "step": 81875 + }, + { + "epoch": 9.119055574117384, + "grad_norm": 0.0909072533249855, + "learning_rate": 3.3061003251535774e-05, + "loss": 0.0228, + "num_input_tokens_seen": 99655008, + "step": 81880 + }, + { + "epoch": 9.119612429001002, + "grad_norm": 0.767731785774231, + "learning_rate": 3.3058703248310755e-05, + "loss": 0.0662, + "num_input_tokens_seen": 99661056, + "step": 81885 + }, + { + "epoch": 9.12016928388462, + "grad_norm": 0.07533981651067734, + "learning_rate": 3.3056403168964824e-05, + "loss": 0.0267, + "num_input_tokens_seen": 99667232, + "step": 81890 + }, + { + "epoch": 9.120726138768237, + "grad_norm": 0.27681025862693787, + "learning_rate": 3.30541030135197e-05, + "loss": 0.0035, + "num_input_tokens_seen": 99673536, + "step": 81895 + }, + { + "epoch": 9.121282993651855, + "grad_norm": 0.03487639129161835, + "learning_rate": 3.3051802781997134e-05, + "loss": 0.0085, + "num_input_tokens_seen": 99679872, + "step": 81900 + }, + { + "epoch": 9.12183984853547, + "grad_norm": 0.05906661972403526, + "learning_rate": 3.304950247441883e-05, + "loss": 0.142, + "num_input_tokens_seen": 99686112, + "step": 81905 + }, + { + "epoch": 9.122396703419088, + "grad_norm": 0.03216834366321564, + "learning_rate": 3.304720209080653e-05, + "loss": 0.053, + "num_input_tokens_seen": 99692352, + "step": 81910 + }, + { + "epoch": 9.122953558302706, + "grad_norm": 0.006417438853532076, + "learning_rate": 3.3044901631181965e-05, + "loss": 0.051, + "num_input_tokens_seen": 99698496, + "step": 81915 + }, + { + "epoch": 9.123510413186324, + "grad_norm": 1.3652935028076172, + "learning_rate": 3.304260109556685e-05, + "loss": 0.0328, + "num_input_tokens_seen": 99704576, + "step": 81920 + }, + { + "epoch": 9.124067268069942, + "grad_norm": 0.4098038077354431, + "learning_rate": 3.304030048398292e-05, + "loss": 0.08, + "num_input_tokens_seen": 99710560, + "step": 81925 + }, + { + "epoch": 9.12462412295356, + "grad_norm": 1.3312851190567017, + "learning_rate": 3.303799979645192e-05, + "loss": 0.1047, + "num_input_tokens_seen": 99716384, + "step": 81930 + }, + { + "epoch": 9.125180977837175, + "grad_norm": 0.08124882727861404, + "learning_rate": 3.303569903299557e-05, + "loss": 0.02, + "num_input_tokens_seen": 99722560, + "step": 81935 + }, + { + "epoch": 9.125737832720793, + "grad_norm": 0.04706169664859772, + "learning_rate": 3.303339819363561e-05, + "loss": 0.0325, + "num_input_tokens_seen": 99728544, + "step": 81940 + }, + { + "epoch": 9.12629468760441, + "grad_norm": 0.002425665967166424, + "learning_rate": 3.303109727839376e-05, + "loss": 0.0312, + "num_input_tokens_seen": 99734784, + "step": 81945 + }, + { + "epoch": 9.126851542488028, + "grad_norm": 0.16413719952106476, + "learning_rate": 3.302879628729176e-05, + "loss": 0.0186, + "num_input_tokens_seen": 99741120, + "step": 81950 + }, + { + "epoch": 9.127408397371646, + "grad_norm": 0.9200358390808105, + "learning_rate": 3.302649522035135e-05, + "loss": 0.1756, + "num_input_tokens_seen": 99747296, + "step": 81955 + }, + { + "epoch": 9.127965252255262, + "grad_norm": 0.2557167708873749, + "learning_rate": 3.302419407759426e-05, + "loss": 0.0994, + "num_input_tokens_seen": 99753344, + "step": 81960 + }, + { + "epoch": 9.12852210713888, + "grad_norm": 0.06247374042868614, + "learning_rate": 3.3021892859042236e-05, + "loss": 0.0036, + "num_input_tokens_seen": 99759840, + "step": 81965 + }, + { + "epoch": 9.129078962022497, + "grad_norm": 0.14091643691062927, + "learning_rate": 3.3019591564717e-05, + "loss": 0.0035, + "num_input_tokens_seen": 99765824, + "step": 81970 + }, + { + "epoch": 9.129635816906115, + "grad_norm": 0.29784008860588074, + "learning_rate": 3.3017290194640296e-05, + "loss": 0.1368, + "num_input_tokens_seen": 99772256, + "step": 81975 + }, + { + "epoch": 9.130192671789732, + "grad_norm": 1.1245768070220947, + "learning_rate": 3.3014988748833865e-05, + "loss": 0.2007, + "num_input_tokens_seen": 99778112, + "step": 81980 + }, + { + "epoch": 9.130749526673348, + "grad_norm": 0.17170396447181702, + "learning_rate": 3.3012687227319446e-05, + "loss": 0.094, + "num_input_tokens_seen": 99784160, + "step": 81985 + }, + { + "epoch": 9.131306381556966, + "grad_norm": 0.18587101995944977, + "learning_rate": 3.301038563011877e-05, + "loss": 0.1453, + "num_input_tokens_seen": 99790144, + "step": 81990 + }, + { + "epoch": 9.131863236440584, + "grad_norm": 0.9122540950775146, + "learning_rate": 3.300808395725359e-05, + "loss": 0.0233, + "num_input_tokens_seen": 99796416, + "step": 81995 + }, + { + "epoch": 9.132420091324201, + "grad_norm": 0.021740691736340523, + "learning_rate": 3.300578220874564e-05, + "loss": 0.0797, + "num_input_tokens_seen": 99802368, + "step": 82000 + }, + { + "epoch": 9.132976946207819, + "grad_norm": 0.0003131493867840618, + "learning_rate": 3.300348038461666e-05, + "loss": 0.0905, + "num_input_tokens_seen": 99808032, + "step": 82005 + }, + { + "epoch": 9.133533801091435, + "grad_norm": 0.031943436712026596, + "learning_rate": 3.300117848488839e-05, + "loss": 0.0266, + "num_input_tokens_seen": 99814016, + "step": 82010 + }, + { + "epoch": 9.134090655975053, + "grad_norm": 0.9928299188613892, + "learning_rate": 3.299887650958259e-05, + "loss": 0.0432, + "num_input_tokens_seen": 99819904, + "step": 82015 + }, + { + "epoch": 9.13464751085867, + "grad_norm": 0.10935191065073013, + "learning_rate": 3.299657445872098e-05, + "loss": 0.0596, + "num_input_tokens_seen": 99826176, + "step": 82020 + }, + { + "epoch": 9.135204365742288, + "grad_norm": 1.1934764385223389, + "learning_rate": 3.2994272332325334e-05, + "loss": 0.0507, + "num_input_tokens_seen": 99832096, + "step": 82025 + }, + { + "epoch": 9.135761220625906, + "grad_norm": 0.019935259595513344, + "learning_rate": 3.299197013041737e-05, + "loss": 0.1233, + "num_input_tokens_seen": 99838560, + "step": 82030 + }, + { + "epoch": 9.136318075509521, + "grad_norm": 0.03902288153767586, + "learning_rate": 3.298966785301885e-05, + "loss": 0.0025, + "num_input_tokens_seen": 99844800, + "step": 82035 + }, + { + "epoch": 9.13687493039314, + "grad_norm": 0.005147854331880808, + "learning_rate": 3.2987365500151515e-05, + "loss": 0.0402, + "num_input_tokens_seen": 99850976, + "step": 82040 + }, + { + "epoch": 9.137431785276757, + "grad_norm": 1.0765228271484375, + "learning_rate": 3.298506307183711e-05, + "loss": 0.0827, + "num_input_tokens_seen": 99857248, + "step": 82045 + }, + { + "epoch": 9.137988640160374, + "grad_norm": 0.02669093944132328, + "learning_rate": 3.2982760568097384e-05, + "loss": 0.0202, + "num_input_tokens_seen": 99863136, + "step": 82050 + }, + { + "epoch": 9.138545495043992, + "grad_norm": 0.7580830454826355, + "learning_rate": 3.298045798895409e-05, + "loss": 0.0787, + "num_input_tokens_seen": 99869376, + "step": 82055 + }, + { + "epoch": 9.139102349927608, + "grad_norm": 0.00037097668973729014, + "learning_rate": 3.297815533442899e-05, + "loss": 0.0887, + "num_input_tokens_seen": 99875456, + "step": 82060 + }, + { + "epoch": 9.139659204811226, + "grad_norm": 0.4401482939720154, + "learning_rate": 3.297585260454381e-05, + "loss": 0.0163, + "num_input_tokens_seen": 99881856, + "step": 82065 + }, + { + "epoch": 9.140216059694843, + "grad_norm": 0.06748514622449875, + "learning_rate": 3.297354979932031e-05, + "loss": 0.0416, + "num_input_tokens_seen": 99888544, + "step": 82070 + }, + { + "epoch": 9.140772914578461, + "grad_norm": 1.0506770610809326, + "learning_rate": 3.297124691878025e-05, + "loss": 0.0294, + "num_input_tokens_seen": 99894496, + "step": 82075 + }, + { + "epoch": 9.141329769462079, + "grad_norm": 0.0319368876516819, + "learning_rate": 3.2968943962945374e-05, + "loss": 0.0616, + "num_input_tokens_seen": 99900320, + "step": 82080 + }, + { + "epoch": 9.141886624345695, + "grad_norm": 0.05482575297355652, + "learning_rate": 3.296664093183743e-05, + "loss": 0.1082, + "num_input_tokens_seen": 99906624, + "step": 82085 + }, + { + "epoch": 9.142443479229312, + "grad_norm": 0.044879648834466934, + "learning_rate": 3.296433782547819e-05, + "loss": 0.0127, + "num_input_tokens_seen": 99912704, + "step": 82090 + }, + { + "epoch": 9.14300033411293, + "grad_norm": 0.7159651517868042, + "learning_rate": 3.2962034643889395e-05, + "loss": 0.0163, + "num_input_tokens_seen": 99918752, + "step": 82095 + }, + { + "epoch": 9.143557188996548, + "grad_norm": 0.4615945518016815, + "learning_rate": 3.295973138709281e-05, + "loss": 0.1111, + "num_input_tokens_seen": 99924864, + "step": 82100 + }, + { + "epoch": 9.144114043880165, + "grad_norm": 0.12860174477100372, + "learning_rate": 3.295742805511017e-05, + "loss": 0.0154, + "num_input_tokens_seen": 99930784, + "step": 82105 + }, + { + "epoch": 9.144670898763783, + "grad_norm": 0.8192504048347473, + "learning_rate": 3.295512464796326e-05, + "loss": 0.027, + "num_input_tokens_seen": 99936960, + "step": 82110 + }, + { + "epoch": 9.145227753647399, + "grad_norm": 0.7444885969161987, + "learning_rate": 3.295282116567382e-05, + "loss": 0.1357, + "num_input_tokens_seen": 99942752, + "step": 82115 + }, + { + "epoch": 9.145784608531017, + "grad_norm": 0.22292113304138184, + "learning_rate": 3.295051760826361e-05, + "loss": 0.0242, + "num_input_tokens_seen": 99948928, + "step": 82120 + }, + { + "epoch": 9.146341463414634, + "grad_norm": 0.12082623690366745, + "learning_rate": 3.2948213975754396e-05, + "loss": 0.0347, + "num_input_tokens_seen": 99955104, + "step": 82125 + }, + { + "epoch": 9.146898318298252, + "grad_norm": 0.467241495847702, + "learning_rate": 3.2945910268167934e-05, + "loss": 0.0334, + "num_input_tokens_seen": 99961184, + "step": 82130 + }, + { + "epoch": 9.14745517318187, + "grad_norm": 0.011034478433430195, + "learning_rate": 3.294360648552597e-05, + "loss": 0.135, + "num_input_tokens_seen": 99967008, + "step": 82135 + }, + { + "epoch": 9.148012028065486, + "grad_norm": 0.2375529259443283, + "learning_rate": 3.294130262785029e-05, + "loss": 0.0233, + "num_input_tokens_seen": 99973184, + "step": 82140 + }, + { + "epoch": 9.148568882949103, + "grad_norm": 0.7705557942390442, + "learning_rate": 3.293899869516265e-05, + "loss": 0.0855, + "num_input_tokens_seen": 99979104, + "step": 82145 + }, + { + "epoch": 9.14912573783272, + "grad_norm": 0.011270883493125439, + "learning_rate": 3.2936694687484794e-05, + "loss": 0.0518, + "num_input_tokens_seen": 99985216, + "step": 82150 + }, + { + "epoch": 9.149682592716339, + "grad_norm": 0.45748504996299744, + "learning_rate": 3.2934390604838506e-05, + "loss": 0.0136, + "num_input_tokens_seen": 99991584, + "step": 82155 + }, + { + "epoch": 9.150239447599956, + "grad_norm": 1.1737991571426392, + "learning_rate": 3.293208644724554e-05, + "loss": 0.0476, + "num_input_tokens_seen": 99997792, + "step": 82160 + }, + { + "epoch": 9.150796302483572, + "grad_norm": 2.869788646697998, + "learning_rate": 3.292978221472766e-05, + "loss": 0.0344, + "num_input_tokens_seen": 100003808, + "step": 82165 + }, + { + "epoch": 9.15135315736719, + "grad_norm": 0.04850471019744873, + "learning_rate": 3.292747790730663e-05, + "loss": 0.0338, + "num_input_tokens_seen": 100010112, + "step": 82170 + }, + { + "epoch": 9.151910012250807, + "grad_norm": 0.19069766998291016, + "learning_rate": 3.292517352500422e-05, + "loss": 0.0391, + "num_input_tokens_seen": 100016320, + "step": 82175 + }, + { + "epoch": 9.152466867134425, + "grad_norm": 0.00931991171091795, + "learning_rate": 3.292286906784221e-05, + "loss": 0.035, + "num_input_tokens_seen": 100022592, + "step": 82180 + }, + { + "epoch": 9.153023722018043, + "grad_norm": 0.259914755821228, + "learning_rate": 3.292056453584233e-05, + "loss": 0.0104, + "num_input_tokens_seen": 100028864, + "step": 82185 + }, + { + "epoch": 9.153580576901659, + "grad_norm": 0.0026560500264167786, + "learning_rate": 3.2918259929026395e-05, + "loss": 0.0107, + "num_input_tokens_seen": 100035072, + "step": 82190 + }, + { + "epoch": 9.154137431785276, + "grad_norm": 0.10224395990371704, + "learning_rate": 3.291595524741614e-05, + "loss": 0.0064, + "num_input_tokens_seen": 100041344, + "step": 82195 + }, + { + "epoch": 9.154694286668894, + "grad_norm": 1.0278924703598022, + "learning_rate": 3.291365049103335e-05, + "loss": 0.0748, + "num_input_tokens_seen": 100047232, + "step": 82200 + }, + { + "epoch": 9.155251141552512, + "grad_norm": 0.14193281531333923, + "learning_rate": 3.2911345659899796e-05, + "loss": 0.083, + "num_input_tokens_seen": 100053440, + "step": 82205 + }, + { + "epoch": 9.15580799643613, + "grad_norm": 0.02399471588432789, + "learning_rate": 3.290904075403723e-05, + "loss": 0.0123, + "num_input_tokens_seen": 100059712, + "step": 82210 + }, + { + "epoch": 9.156364851319745, + "grad_norm": 0.28957733511924744, + "learning_rate": 3.290673577346745e-05, + "loss": 0.0507, + "num_input_tokens_seen": 100065568, + "step": 82215 + }, + { + "epoch": 9.156921706203363, + "grad_norm": 0.035649776458740234, + "learning_rate": 3.290443071821221e-05, + "loss": 0.0214, + "num_input_tokens_seen": 100071584, + "step": 82220 + }, + { + "epoch": 9.15747856108698, + "grad_norm": 0.46669262647628784, + "learning_rate": 3.290212558829329e-05, + "loss": 0.018, + "num_input_tokens_seen": 100077664, + "step": 82225 + }, + { + "epoch": 9.158035415970598, + "grad_norm": 0.44510191679000854, + "learning_rate": 3.289982038373246e-05, + "loss": 0.0573, + "num_input_tokens_seen": 100084032, + "step": 82230 + }, + { + "epoch": 9.158592270854216, + "grad_norm": 0.05074096843600273, + "learning_rate": 3.2897515104551495e-05, + "loss": 0.0755, + "num_input_tokens_seen": 100089728, + "step": 82235 + }, + { + "epoch": 9.159149125737832, + "grad_norm": 0.15685831010341644, + "learning_rate": 3.289520975077218e-05, + "loss": 0.0075, + "num_input_tokens_seen": 100096096, + "step": 82240 + }, + { + "epoch": 9.15970598062145, + "grad_norm": 0.009126552380621433, + "learning_rate": 3.289290432241628e-05, + "loss": 0.0414, + "num_input_tokens_seen": 100102240, + "step": 82245 + }, + { + "epoch": 9.160262835505067, + "grad_norm": 0.38589218258857727, + "learning_rate": 3.289059881950558e-05, + "loss": 0.1038, + "num_input_tokens_seen": 100107808, + "step": 82250 + }, + { + "epoch": 9.160819690388685, + "grad_norm": 0.11696716398000717, + "learning_rate": 3.288829324206184e-05, + "loss": 0.009, + "num_input_tokens_seen": 100114208, + "step": 82255 + }, + { + "epoch": 9.161376545272303, + "grad_norm": 0.021320773288607597, + "learning_rate": 3.288598759010686e-05, + "loss": 0.016, + "num_input_tokens_seen": 100120064, + "step": 82260 + }, + { + "epoch": 9.161933400155919, + "grad_norm": 0.8200761079788208, + "learning_rate": 3.2883681863662406e-05, + "loss": 0.148, + "num_input_tokens_seen": 100126080, + "step": 82265 + }, + { + "epoch": 9.162490255039536, + "grad_norm": 0.15402092039585114, + "learning_rate": 3.2881376062750255e-05, + "loss": 0.035, + "num_input_tokens_seen": 100132608, + "step": 82270 + }, + { + "epoch": 9.163047109923154, + "grad_norm": 0.5825920701026917, + "learning_rate": 3.28790701873922e-05, + "loss": 0.0142, + "num_input_tokens_seen": 100138432, + "step": 82275 + }, + { + "epoch": 9.163603964806772, + "grad_norm": 0.007844671607017517, + "learning_rate": 3.287676423761001e-05, + "loss": 0.0427, + "num_input_tokens_seen": 100144320, + "step": 82280 + }, + { + "epoch": 9.16416081969039, + "grad_norm": 0.557065486907959, + "learning_rate": 3.2874458213425486e-05, + "loss": 0.0206, + "num_input_tokens_seen": 100150816, + "step": 82285 + }, + { + "epoch": 9.164717674574007, + "grad_norm": 0.01862085796892643, + "learning_rate": 3.287215211486038e-05, + "loss": 0.0478, + "num_input_tokens_seen": 100156736, + "step": 82290 + }, + { + "epoch": 9.165274529457623, + "grad_norm": 0.007270338479429483, + "learning_rate": 3.286984594193649e-05, + "loss": 0.0168, + "num_input_tokens_seen": 100163008, + "step": 82295 + }, + { + "epoch": 9.16583138434124, + "grad_norm": 0.2663581073284149, + "learning_rate": 3.286753969467561e-05, + "loss": 0.0402, + "num_input_tokens_seen": 100168896, + "step": 82300 + }, + { + "epoch": 9.166388239224858, + "grad_norm": 0.5357717871665955, + "learning_rate": 3.28652333730995e-05, + "loss": 0.0225, + "num_input_tokens_seen": 100175104, + "step": 82305 + }, + { + "epoch": 9.166945094108476, + "grad_norm": 1.2088876962661743, + "learning_rate": 3.286292697722997e-05, + "loss": 0.0287, + "num_input_tokens_seen": 100181088, + "step": 82310 + }, + { + "epoch": 9.167501948992093, + "grad_norm": 0.038389693945646286, + "learning_rate": 3.286062050708879e-05, + "loss": 0.0297, + "num_input_tokens_seen": 100187072, + "step": 82315 + }, + { + "epoch": 9.16805880387571, + "grad_norm": 0.12705540657043457, + "learning_rate": 3.285831396269776e-05, + "loss": 0.0015, + "num_input_tokens_seen": 100192896, + "step": 82320 + }, + { + "epoch": 9.168615658759327, + "grad_norm": 0.024067331105470657, + "learning_rate": 3.285600734407865e-05, + "loss": 0.0104, + "num_input_tokens_seen": 100199040, + "step": 82325 + }, + { + "epoch": 9.169172513642945, + "grad_norm": 1.6150182485580444, + "learning_rate": 3.2853700651253255e-05, + "loss": 0.0744, + "num_input_tokens_seen": 100204992, + "step": 82330 + }, + { + "epoch": 9.169729368526562, + "grad_norm": 0.0022157880011945963, + "learning_rate": 3.285139388424338e-05, + "loss": 0.0106, + "num_input_tokens_seen": 100211104, + "step": 82335 + }, + { + "epoch": 9.17028622341018, + "grad_norm": 0.6587483286857605, + "learning_rate": 3.284908704307078e-05, + "loss": 0.0201, + "num_input_tokens_seen": 100217280, + "step": 82340 + }, + { + "epoch": 9.170843078293796, + "grad_norm": 0.019043851643800735, + "learning_rate": 3.284678012775727e-05, + "loss": 0.0184, + "num_input_tokens_seen": 100223360, + "step": 82345 + }, + { + "epoch": 9.171399933177414, + "grad_norm": 2.783513069152832, + "learning_rate": 3.284447313832464e-05, + "loss": 0.1076, + "num_input_tokens_seen": 100229408, + "step": 82350 + }, + { + "epoch": 9.171956788061031, + "grad_norm": 0.017361244186758995, + "learning_rate": 3.284216607479468e-05, + "loss": 0.0062, + "num_input_tokens_seen": 100235488, + "step": 82355 + }, + { + "epoch": 9.172513642944649, + "grad_norm": 0.019451193511486053, + "learning_rate": 3.2839858937189165e-05, + "loss": 0.0421, + "num_input_tokens_seen": 100241504, + "step": 82360 + }, + { + "epoch": 9.173070497828267, + "grad_norm": 0.0905500277876854, + "learning_rate": 3.283755172552991e-05, + "loss": 0.052, + "num_input_tokens_seen": 100247616, + "step": 82365 + }, + { + "epoch": 9.173627352711883, + "grad_norm": 1.7604154348373413, + "learning_rate": 3.28352444398387e-05, + "loss": 0.0548, + "num_input_tokens_seen": 100253248, + "step": 82370 + }, + { + "epoch": 9.1741842075955, + "grad_norm": 0.20709799230098724, + "learning_rate": 3.283293708013732e-05, + "loss": 0.012, + "num_input_tokens_seen": 100259072, + "step": 82375 + }, + { + "epoch": 9.174741062479118, + "grad_norm": 0.5369780659675598, + "learning_rate": 3.2830629646447586e-05, + "loss": 0.0054, + "num_input_tokens_seen": 100265248, + "step": 82380 + }, + { + "epoch": 9.175297917362736, + "grad_norm": 0.6495987176895142, + "learning_rate": 3.2828322138791274e-05, + "loss": 0.094, + "num_input_tokens_seen": 100270816, + "step": 82385 + }, + { + "epoch": 9.175854772246353, + "grad_norm": 2.5583388805389404, + "learning_rate": 3.282601455719019e-05, + "loss": 0.2184, + "num_input_tokens_seen": 100276896, + "step": 82390 + }, + { + "epoch": 9.17641162712997, + "grad_norm": 0.08355735242366791, + "learning_rate": 3.2823706901666124e-05, + "loss": 0.0815, + "num_input_tokens_seen": 100283136, + "step": 82395 + }, + { + "epoch": 9.176968482013587, + "grad_norm": 0.022191185504198074, + "learning_rate": 3.282139917224088e-05, + "loss": 0.0475, + "num_input_tokens_seen": 100289152, + "step": 82400 + }, + { + "epoch": 9.177525336897205, + "grad_norm": 0.011364332400262356, + "learning_rate": 3.281909136893626e-05, + "loss": 0.0607, + "num_input_tokens_seen": 100295168, + "step": 82405 + }, + { + "epoch": 9.178082191780822, + "grad_norm": 0.013278168626129627, + "learning_rate": 3.281678349177405e-05, + "loss": 0.1229, + "num_input_tokens_seen": 100301024, + "step": 82410 + }, + { + "epoch": 9.17863904666444, + "grad_norm": 0.0005592730594798923, + "learning_rate": 3.281447554077606e-05, + "loss": 0.0154, + "num_input_tokens_seen": 100307008, + "step": 82415 + }, + { + "epoch": 9.179195901548056, + "grad_norm": 0.3883097767829895, + "learning_rate": 3.281216751596409e-05, + "loss": 0.0766, + "num_input_tokens_seen": 100312992, + "step": 82420 + }, + { + "epoch": 9.179752756431673, + "grad_norm": 0.02085786499083042, + "learning_rate": 3.2809859417359933e-05, + "loss": 0.0009, + "num_input_tokens_seen": 100318816, + "step": 82425 + }, + { + "epoch": 9.180309611315291, + "grad_norm": 0.027520012110471725, + "learning_rate": 3.280755124498541e-05, + "loss": 0.0943, + "num_input_tokens_seen": 100325088, + "step": 82430 + }, + { + "epoch": 9.180866466198909, + "grad_norm": 1.1137405633926392, + "learning_rate": 3.280524299886229e-05, + "loss": 0.1284, + "num_input_tokens_seen": 100331232, + "step": 82435 + }, + { + "epoch": 9.181423321082526, + "grad_norm": 0.4499078392982483, + "learning_rate": 3.280293467901241e-05, + "loss": 0.1527, + "num_input_tokens_seen": 100337184, + "step": 82440 + }, + { + "epoch": 9.181980175966142, + "grad_norm": 0.17240193486213684, + "learning_rate": 3.280062628545756e-05, + "loss": 0.0693, + "num_input_tokens_seen": 100343648, + "step": 82445 + }, + { + "epoch": 9.18253703084976, + "grad_norm": 0.007464994676411152, + "learning_rate": 3.279831781821955e-05, + "loss": 0.033, + "num_input_tokens_seen": 100349760, + "step": 82450 + }, + { + "epoch": 9.183093885733378, + "grad_norm": 0.0020408465061336756, + "learning_rate": 3.2796009277320166e-05, + "loss": 0.0283, + "num_input_tokens_seen": 100355840, + "step": 82455 + }, + { + "epoch": 9.183650740616995, + "grad_norm": 0.2063974291086197, + "learning_rate": 3.2793700662781235e-05, + "loss": 0.091, + "num_input_tokens_seen": 100361728, + "step": 82460 + }, + { + "epoch": 9.184207595500613, + "grad_norm": 0.011009120382368565, + "learning_rate": 3.279139197462456e-05, + "loss": 0.0464, + "num_input_tokens_seen": 100367616, + "step": 82465 + }, + { + "epoch": 9.18476445038423, + "grad_norm": 1.7841111421585083, + "learning_rate": 3.2789083212871944e-05, + "loss": 0.1965, + "num_input_tokens_seen": 100372896, + "step": 82470 + }, + { + "epoch": 9.185321305267847, + "grad_norm": 0.3640519976615906, + "learning_rate": 3.2786774377545195e-05, + "loss": 0.0342, + "num_input_tokens_seen": 100378880, + "step": 82475 + }, + { + "epoch": 9.185878160151464, + "grad_norm": 0.004175017587840557, + "learning_rate": 3.2784465468666123e-05, + "loss": 0.0102, + "num_input_tokens_seen": 100384928, + "step": 82480 + }, + { + "epoch": 9.186435015035082, + "grad_norm": 0.02650589495897293, + "learning_rate": 3.2782156486256535e-05, + "loss": 0.0963, + "num_input_tokens_seen": 100390752, + "step": 82485 + }, + { + "epoch": 9.1869918699187, + "grad_norm": 0.3128986954689026, + "learning_rate": 3.277984743033825e-05, + "loss": 0.0151, + "num_input_tokens_seen": 100396672, + "step": 82490 + }, + { + "epoch": 9.187548724802317, + "grad_norm": 0.03809044137597084, + "learning_rate": 3.2777538300933066e-05, + "loss": 0.0546, + "num_input_tokens_seen": 100403296, + "step": 82495 + }, + { + "epoch": 9.188105579685933, + "grad_norm": 0.8585852384567261, + "learning_rate": 3.277522909806281e-05, + "loss": 0.0279, + "num_input_tokens_seen": 100409376, + "step": 82500 + }, + { + "epoch": 9.188662434569551, + "grad_norm": 1.4892956018447876, + "learning_rate": 3.2772919821749276e-05, + "loss": 0.1535, + "num_input_tokens_seen": 100415168, + "step": 82505 + }, + { + "epoch": 9.189219289453169, + "grad_norm": 0.6001770496368408, + "learning_rate": 3.2770610472014295e-05, + "loss": 0.0241, + "num_input_tokens_seen": 100421184, + "step": 82510 + }, + { + "epoch": 9.189776144336786, + "grad_norm": 1.3275562524795532, + "learning_rate": 3.276830104887967e-05, + "loss": 0.0934, + "num_input_tokens_seen": 100427136, + "step": 82515 + }, + { + "epoch": 9.190332999220404, + "grad_norm": 0.5699490904808044, + "learning_rate": 3.2765991552367213e-05, + "loss": 0.0295, + "num_input_tokens_seen": 100433216, + "step": 82520 + }, + { + "epoch": 9.19088985410402, + "grad_norm": 0.315466046333313, + "learning_rate": 3.276368198249875e-05, + "loss": 0.0308, + "num_input_tokens_seen": 100439552, + "step": 82525 + }, + { + "epoch": 9.191446708987637, + "grad_norm": 0.9546127915382385, + "learning_rate": 3.2761372339296094e-05, + "loss": 0.0567, + "num_input_tokens_seen": 100445600, + "step": 82530 + }, + { + "epoch": 9.192003563871255, + "grad_norm": 0.4917888641357422, + "learning_rate": 3.2759062622781055e-05, + "loss": 0.0538, + "num_input_tokens_seen": 100451456, + "step": 82535 + }, + { + "epoch": 9.192560418754873, + "grad_norm": 0.012665659189224243, + "learning_rate": 3.275675283297545e-05, + "loss": 0.0561, + "num_input_tokens_seen": 100458016, + "step": 82540 + }, + { + "epoch": 9.19311727363849, + "grad_norm": 0.5660903453826904, + "learning_rate": 3.27544429699011e-05, + "loss": 0.0165, + "num_input_tokens_seen": 100464128, + "step": 82545 + }, + { + "epoch": 9.193674128522106, + "grad_norm": 0.005078980699181557, + "learning_rate": 3.2752133033579826e-05, + "loss": 0.0683, + "num_input_tokens_seen": 100470368, + "step": 82550 + }, + { + "epoch": 9.194230983405724, + "grad_norm": 0.4913381338119507, + "learning_rate": 3.274982302403344e-05, + "loss": 0.0766, + "num_input_tokens_seen": 100476480, + "step": 82555 + }, + { + "epoch": 9.194787838289342, + "grad_norm": 1.5657222270965576, + "learning_rate": 3.274751294128378e-05, + "loss": 0.0956, + "num_input_tokens_seen": 100482336, + "step": 82560 + }, + { + "epoch": 9.19534469317296, + "grad_norm": 2.046987533569336, + "learning_rate": 3.274520278535263e-05, + "loss": 0.1454, + "num_input_tokens_seen": 100488544, + "step": 82565 + }, + { + "epoch": 9.195901548056577, + "grad_norm": 0.010250739753246307, + "learning_rate": 3.274289255626184e-05, + "loss": 0.1061, + "num_input_tokens_seen": 100494912, + "step": 82570 + }, + { + "epoch": 9.196458402940193, + "grad_norm": 0.29257550835609436, + "learning_rate": 3.2740582254033245e-05, + "loss": 0.0361, + "num_input_tokens_seen": 100500960, + "step": 82575 + }, + { + "epoch": 9.19701525782381, + "grad_norm": 0.1205957680940628, + "learning_rate": 3.2738271878688634e-05, + "loss": 0.0589, + "num_input_tokens_seen": 100506976, + "step": 82580 + }, + { + "epoch": 9.197572112707428, + "grad_norm": 0.1892479509115219, + "learning_rate": 3.273596143024985e-05, + "loss": 0.0438, + "num_input_tokens_seen": 100512288, + "step": 82585 + }, + { + "epoch": 9.198128967591046, + "grad_norm": 0.24404209852218628, + "learning_rate": 3.2733650908738706e-05, + "loss": 0.025, + "num_input_tokens_seen": 100517408, + "step": 82590 + }, + { + "epoch": 9.198685822474664, + "grad_norm": 0.08336615562438965, + "learning_rate": 3.273134031417704e-05, + "loss": 0.0318, + "num_input_tokens_seen": 100523712, + "step": 82595 + }, + { + "epoch": 9.19924267735828, + "grad_norm": 0.008148947730660439, + "learning_rate": 3.272902964658667e-05, + "loss": 0.0812, + "num_input_tokens_seen": 100529824, + "step": 82600 + }, + { + "epoch": 9.199799532241897, + "grad_norm": 1.2227224111557007, + "learning_rate": 3.272671890598942e-05, + "loss": 0.0493, + "num_input_tokens_seen": 100536000, + "step": 82605 + }, + { + "epoch": 9.200356387125515, + "grad_norm": 0.07713648676872253, + "learning_rate": 3.2724408092407124e-05, + "loss": 0.0951, + "num_input_tokens_seen": 100542464, + "step": 82610 + }, + { + "epoch": 9.200913242009133, + "grad_norm": 0.5079584121704102, + "learning_rate": 3.272209720586159e-05, + "loss": 0.0275, + "num_input_tokens_seen": 100548704, + "step": 82615 + }, + { + "epoch": 9.20147009689275, + "grad_norm": 1.2375153303146362, + "learning_rate": 3.271978624637468e-05, + "loss": 0.035, + "num_input_tokens_seen": 100554880, + "step": 82620 + }, + { + "epoch": 9.202026951776368, + "grad_norm": 6.6818413734436035, + "learning_rate": 3.271747521396819e-05, + "loss": 0.1114, + "num_input_tokens_seen": 100561376, + "step": 82625 + }, + { + "epoch": 9.202583806659984, + "grad_norm": 0.16873963177204132, + "learning_rate": 3.2715164108663976e-05, + "loss": 0.0066, + "num_input_tokens_seen": 100567712, + "step": 82630 + }, + { + "epoch": 9.203140661543602, + "grad_norm": 0.02180459536612034, + "learning_rate": 3.2712852930483845e-05, + "loss": 0.0043, + "num_input_tokens_seen": 100573952, + "step": 82635 + }, + { + "epoch": 9.20369751642722, + "grad_norm": 0.00031024214695207775, + "learning_rate": 3.2710541679449644e-05, + "loss": 0.0095, + "num_input_tokens_seen": 100580416, + "step": 82640 + }, + { + "epoch": 9.204254371310837, + "grad_norm": 0.060277123004198074, + "learning_rate": 3.27082303555832e-05, + "loss": 0.0157, + "num_input_tokens_seen": 100586464, + "step": 82645 + }, + { + "epoch": 9.204811226194455, + "grad_norm": 0.7432336211204529, + "learning_rate": 3.270591895890633e-05, + "loss": 0.049, + "num_input_tokens_seen": 100592576, + "step": 82650 + }, + { + "epoch": 9.20536808107807, + "grad_norm": 0.11156473308801651, + "learning_rate": 3.27036074894409e-05, + "loss": 0.0242, + "num_input_tokens_seen": 100598464, + "step": 82655 + }, + { + "epoch": 9.205924935961688, + "grad_norm": 0.18265396356582642, + "learning_rate": 3.270129594720872e-05, + "loss": 0.118, + "num_input_tokens_seen": 100604800, + "step": 82660 + }, + { + "epoch": 9.206481790845306, + "grad_norm": 0.9423224329948425, + "learning_rate": 3.269898433223163e-05, + "loss": 0.0369, + "num_input_tokens_seen": 100611136, + "step": 82665 + }, + { + "epoch": 9.207038645728923, + "grad_norm": 0.007240124512463808, + "learning_rate": 3.2696672644531464e-05, + "loss": 0.0749, + "num_input_tokens_seen": 100617536, + "step": 82670 + }, + { + "epoch": 9.207595500612541, + "grad_norm": 0.1335015445947647, + "learning_rate": 3.269436088413006e-05, + "loss": 0.07, + "num_input_tokens_seen": 100624032, + "step": 82675 + }, + { + "epoch": 9.208152355496157, + "grad_norm": 0.7392123937606812, + "learning_rate": 3.269204905104925e-05, + "loss": 0.0317, + "num_input_tokens_seen": 100630240, + "step": 82680 + }, + { + "epoch": 9.208709210379775, + "grad_norm": 0.09709146618843079, + "learning_rate": 3.2689737145310875e-05, + "loss": 0.0079, + "num_input_tokens_seen": 100636192, + "step": 82685 + }, + { + "epoch": 9.209266065263392, + "grad_norm": 0.060060322284698486, + "learning_rate": 3.2687425166936767e-05, + "loss": 0.0951, + "num_input_tokens_seen": 100642464, + "step": 82690 + }, + { + "epoch": 9.20982292014701, + "grad_norm": 0.7612717747688293, + "learning_rate": 3.2685113115948774e-05, + "loss": 0.0596, + "num_input_tokens_seen": 100648480, + "step": 82695 + }, + { + "epoch": 9.210379775030628, + "grad_norm": 0.26975345611572266, + "learning_rate": 3.268280099236873e-05, + "loss": 0.1871, + "num_input_tokens_seen": 100654560, + "step": 82700 + }, + { + "epoch": 9.210936629914244, + "grad_norm": 0.11912748217582703, + "learning_rate": 3.268048879621848e-05, + "loss": 0.1929, + "num_input_tokens_seen": 100660864, + "step": 82705 + }, + { + "epoch": 9.211493484797861, + "grad_norm": 0.8301590085029602, + "learning_rate": 3.267817652751985e-05, + "loss": 0.0388, + "num_input_tokens_seen": 100666880, + "step": 82710 + }, + { + "epoch": 9.212050339681479, + "grad_norm": 0.010899469256401062, + "learning_rate": 3.2675864186294703e-05, + "loss": 0.0007, + "num_input_tokens_seen": 100673120, + "step": 82715 + }, + { + "epoch": 9.212607194565097, + "grad_norm": 0.0008774920715950429, + "learning_rate": 3.267355177256486e-05, + "loss": 0.0109, + "num_input_tokens_seen": 100679072, + "step": 82720 + }, + { + "epoch": 9.213164049448714, + "grad_norm": 0.019512098282575607, + "learning_rate": 3.267123928635217e-05, + "loss": 0.0894, + "num_input_tokens_seen": 100685216, + "step": 82725 + }, + { + "epoch": 9.21372090433233, + "grad_norm": 1.1019065380096436, + "learning_rate": 3.266892672767848e-05, + "loss": 0.087, + "num_input_tokens_seen": 100691232, + "step": 82730 + }, + { + "epoch": 9.214277759215948, + "grad_norm": 0.033045172691345215, + "learning_rate": 3.266661409656564e-05, + "loss": 0.0247, + "num_input_tokens_seen": 100697408, + "step": 82735 + }, + { + "epoch": 9.214834614099566, + "grad_norm": 0.5597168207168579, + "learning_rate": 3.2664301393035486e-05, + "loss": 0.0282, + "num_input_tokens_seen": 100703648, + "step": 82740 + }, + { + "epoch": 9.215391468983183, + "grad_norm": 1.0991004705429077, + "learning_rate": 3.266198861710986e-05, + "loss": 0.0948, + "num_input_tokens_seen": 100709888, + "step": 82745 + }, + { + "epoch": 9.215948323866801, + "grad_norm": 0.5452957153320312, + "learning_rate": 3.265967576881061e-05, + "loss": 0.0499, + "num_input_tokens_seen": 100715456, + "step": 82750 + }, + { + "epoch": 9.216505178750417, + "grad_norm": 0.06912204623222351, + "learning_rate": 3.26573628481596e-05, + "loss": 0.0302, + "num_input_tokens_seen": 100721472, + "step": 82755 + }, + { + "epoch": 9.217062033634035, + "grad_norm": 0.031546756625175476, + "learning_rate": 3.265504985517865e-05, + "loss": 0.0491, + "num_input_tokens_seen": 100727104, + "step": 82760 + }, + { + "epoch": 9.217618888517652, + "grad_norm": 0.026093540713191032, + "learning_rate": 3.265273678988963e-05, + "loss": 0.0537, + "num_input_tokens_seen": 100733632, + "step": 82765 + }, + { + "epoch": 9.21817574340127, + "grad_norm": 0.8735789060592651, + "learning_rate": 3.265042365231437e-05, + "loss": 0.0319, + "num_input_tokens_seen": 100739616, + "step": 82770 + }, + { + "epoch": 9.218732598284888, + "grad_norm": 1.3005810976028442, + "learning_rate": 3.2648110442474735e-05, + "loss": 0.082, + "num_input_tokens_seen": 100745984, + "step": 82775 + }, + { + "epoch": 9.219289453168503, + "grad_norm": 0.051263585686683655, + "learning_rate": 3.264579716039256e-05, + "loss": 0.0968, + "num_input_tokens_seen": 100751872, + "step": 82780 + }, + { + "epoch": 9.219846308052121, + "grad_norm": 0.00023439932556357235, + "learning_rate": 3.2643483806089726e-05, + "loss": 0.0307, + "num_input_tokens_seen": 100758080, + "step": 82785 + }, + { + "epoch": 9.220403162935739, + "grad_norm": 0.31682005524635315, + "learning_rate": 3.264117037958805e-05, + "loss": 0.1111, + "num_input_tokens_seen": 100763808, + "step": 82790 + }, + { + "epoch": 9.220960017819356, + "grad_norm": 0.19516058266162872, + "learning_rate": 3.263885688090939e-05, + "loss": 0.0142, + "num_input_tokens_seen": 100769952, + "step": 82795 + }, + { + "epoch": 9.221516872702974, + "grad_norm": 0.0006665513501502573, + "learning_rate": 3.263654331007562e-05, + "loss": 0.0196, + "num_input_tokens_seen": 100776256, + "step": 82800 + }, + { + "epoch": 9.22207372758659, + "grad_norm": 0.3184928596019745, + "learning_rate": 3.263422966710857e-05, + "loss": 0.0362, + "num_input_tokens_seen": 100782208, + "step": 82805 + }, + { + "epoch": 9.222630582470208, + "grad_norm": 1.2899277210235596, + "learning_rate": 3.263191595203012e-05, + "loss": 0.0662, + "num_input_tokens_seen": 100788256, + "step": 82810 + }, + { + "epoch": 9.223187437353825, + "grad_norm": 0.12027521431446075, + "learning_rate": 3.26296021648621e-05, + "loss": 0.0361, + "num_input_tokens_seen": 100794336, + "step": 82815 + }, + { + "epoch": 9.223744292237443, + "grad_norm": 0.23546777665615082, + "learning_rate": 3.2627288305626366e-05, + "loss": 0.0094, + "num_input_tokens_seen": 100800032, + "step": 82820 + }, + { + "epoch": 9.22430114712106, + "grad_norm": 0.4242381155490875, + "learning_rate": 3.262497437434479e-05, + "loss": 0.0228, + "num_input_tokens_seen": 100806400, + "step": 82825 + }, + { + "epoch": 9.224858002004678, + "grad_norm": 0.42906227707862854, + "learning_rate": 3.2622660371039224e-05, + "loss": 0.0211, + "num_input_tokens_seen": 100812512, + "step": 82830 + }, + { + "epoch": 9.225414856888294, + "grad_norm": 0.6902559995651245, + "learning_rate": 3.262034629573153e-05, + "loss": 0.0291, + "num_input_tokens_seen": 100818272, + "step": 82835 + }, + { + "epoch": 9.225971711771912, + "grad_norm": 0.042865950614213943, + "learning_rate": 3.2618032148443547e-05, + "loss": 0.0597, + "num_input_tokens_seen": 100824160, + "step": 82840 + }, + { + "epoch": 9.22652856665553, + "grad_norm": 0.4641629755496979, + "learning_rate": 3.2615717929197156e-05, + "loss": 0.0128, + "num_input_tokens_seen": 100830400, + "step": 82845 + }, + { + "epoch": 9.227085421539147, + "grad_norm": 0.0001664650480961427, + "learning_rate": 3.26134036380142e-05, + "loss": 0.0064, + "num_input_tokens_seen": 100836320, + "step": 82850 + }, + { + "epoch": 9.227642276422765, + "grad_norm": 0.46696195006370544, + "learning_rate": 3.261108927491655e-05, + "loss": 0.0917, + "num_input_tokens_seen": 100842208, + "step": 82855 + }, + { + "epoch": 9.228199131306381, + "grad_norm": 1.3155070543289185, + "learning_rate": 3.2608774839926064e-05, + "loss": 0.1422, + "num_input_tokens_seen": 100848192, + "step": 82860 + }, + { + "epoch": 9.228755986189999, + "grad_norm": 1.0219963788986206, + "learning_rate": 3.26064603330646e-05, + "loss": 0.0247, + "num_input_tokens_seen": 100854432, + "step": 82865 + }, + { + "epoch": 9.229312841073616, + "grad_norm": 0.1567326784133911, + "learning_rate": 3.260414575435403e-05, + "loss": 0.0043, + "num_input_tokens_seen": 100860416, + "step": 82870 + }, + { + "epoch": 9.229869695957234, + "grad_norm": 0.2661476731300354, + "learning_rate": 3.260183110381621e-05, + "loss": 0.0118, + "num_input_tokens_seen": 100866080, + "step": 82875 + }, + { + "epoch": 9.230426550840852, + "grad_norm": 0.03636086359620094, + "learning_rate": 3.2599516381473006e-05, + "loss": 0.0313, + "num_input_tokens_seen": 100872448, + "step": 82880 + }, + { + "epoch": 9.230983405724468, + "grad_norm": 0.16270191967487335, + "learning_rate": 3.259720158734628e-05, + "loss": 0.0584, + "num_input_tokens_seen": 100877856, + "step": 82885 + }, + { + "epoch": 9.231540260608085, + "grad_norm": 1.4674334526062012, + "learning_rate": 3.2594886721457904e-05, + "loss": 0.0923, + "num_input_tokens_seen": 100883904, + "step": 82890 + }, + { + "epoch": 9.232097115491703, + "grad_norm": 0.0008114305092021823, + "learning_rate": 3.2592571783829725e-05, + "loss": 0.0294, + "num_input_tokens_seen": 100890144, + "step": 82895 + }, + { + "epoch": 9.23265397037532, + "grad_norm": 0.13022960722446442, + "learning_rate": 3.2590256774483625e-05, + "loss": 0.0046, + "num_input_tokens_seen": 100896416, + "step": 82900 + }, + { + "epoch": 9.233210825258938, + "grad_norm": 0.014705405570566654, + "learning_rate": 3.258794169344148e-05, + "loss": 0.1344, + "num_input_tokens_seen": 100902464, + "step": 82905 + }, + { + "epoch": 9.233767680142554, + "grad_norm": 0.6609113216400146, + "learning_rate": 3.258562654072513e-05, + "loss": 0.0479, + "num_input_tokens_seen": 100908576, + "step": 82910 + }, + { + "epoch": 9.234324535026172, + "grad_norm": 0.7818823456764221, + "learning_rate": 3.258331131635647e-05, + "loss": 0.1822, + "num_input_tokens_seen": 100914496, + "step": 82915 + }, + { + "epoch": 9.23488138990979, + "grad_norm": 0.0799914225935936, + "learning_rate": 3.258099602035736e-05, + "loss": 0.044, + "num_input_tokens_seen": 100920896, + "step": 82920 + }, + { + "epoch": 9.235438244793407, + "grad_norm": 0.00835313368588686, + "learning_rate": 3.257868065274966e-05, + "loss": 0.0356, + "num_input_tokens_seen": 100926880, + "step": 82925 + }, + { + "epoch": 9.235995099677025, + "grad_norm": 0.318497896194458, + "learning_rate": 3.257636521355526e-05, + "loss": 0.0165, + "num_input_tokens_seen": 100933312, + "step": 82930 + }, + { + "epoch": 9.23655195456064, + "grad_norm": 0.544598400592804, + "learning_rate": 3.2574049702796014e-05, + "loss": 0.0205, + "num_input_tokens_seen": 100939680, + "step": 82935 + }, + { + "epoch": 9.237108809444258, + "grad_norm": 0.008897456340491772, + "learning_rate": 3.257173412049381e-05, + "loss": 0.0119, + "num_input_tokens_seen": 100945888, + "step": 82940 + }, + { + "epoch": 9.237665664327876, + "grad_norm": 0.25907576084136963, + "learning_rate": 3.25694184666705e-05, + "loss": 0.0112, + "num_input_tokens_seen": 100951872, + "step": 82945 + }, + { + "epoch": 9.238222519211494, + "grad_norm": 0.0034428422804921865, + "learning_rate": 3.256710274134797e-05, + "loss": 0.0125, + "num_input_tokens_seen": 100957312, + "step": 82950 + }, + { + "epoch": 9.238779374095111, + "grad_norm": 1.1980654001235962, + "learning_rate": 3.2564786944548095e-05, + "loss": 0.085, + "num_input_tokens_seen": 100963488, + "step": 82955 + }, + { + "epoch": 9.239336228978727, + "grad_norm": 0.0028986551333218813, + "learning_rate": 3.2562471076292745e-05, + "loss": 0.0486, + "num_input_tokens_seen": 100969632, + "step": 82960 + }, + { + "epoch": 9.239893083862345, + "grad_norm": 1.1547566652297974, + "learning_rate": 3.25601551366038e-05, + "loss": 0.0861, + "num_input_tokens_seen": 100976096, + "step": 82965 + }, + { + "epoch": 9.240449938745963, + "grad_norm": 0.05137960985302925, + "learning_rate": 3.2557839125503125e-05, + "loss": 0.0897, + "num_input_tokens_seen": 100982080, + "step": 82970 + }, + { + "epoch": 9.24100679362958, + "grad_norm": 0.024156102910637856, + "learning_rate": 3.255552304301261e-05, + "loss": 0.0252, + "num_input_tokens_seen": 100988192, + "step": 82975 + }, + { + "epoch": 9.241563648513198, + "grad_norm": 1.4455804824829102, + "learning_rate": 3.255320688915412e-05, + "loss": 0.1278, + "num_input_tokens_seen": 100994336, + "step": 82980 + }, + { + "epoch": 9.242120503396816, + "grad_norm": 0.07381051033735275, + "learning_rate": 3.255089066394955e-05, + "loss": 0.049, + "num_input_tokens_seen": 101000352, + "step": 82985 + }, + { + "epoch": 9.242677358280432, + "grad_norm": 0.47677022218704224, + "learning_rate": 3.2548574367420766e-05, + "loss": 0.0535, + "num_input_tokens_seen": 101006560, + "step": 82990 + }, + { + "epoch": 9.24323421316405, + "grad_norm": 0.08164114505052567, + "learning_rate": 3.254625799958964e-05, + "loss": 0.0266, + "num_input_tokens_seen": 101012128, + "step": 82995 + }, + { + "epoch": 9.243791068047667, + "grad_norm": 0.18199194967746735, + "learning_rate": 3.2543941560478066e-05, + "loss": 0.0086, + "num_input_tokens_seen": 101018240, + "step": 83000 + }, + { + "epoch": 9.244347922931285, + "grad_norm": 0.06422416120767593, + "learning_rate": 3.254162505010792e-05, + "loss": 0.0818, + "num_input_tokens_seen": 101024256, + "step": 83005 + }, + { + "epoch": 9.244904777814902, + "grad_norm": 0.0794135108590126, + "learning_rate": 3.253930846850108e-05, + "loss": 0.0066, + "num_input_tokens_seen": 101030432, + "step": 83010 + }, + { + "epoch": 9.245461632698518, + "grad_norm": 1.9973489046096802, + "learning_rate": 3.253699181567944e-05, + "loss": 0.0671, + "num_input_tokens_seen": 101036160, + "step": 83015 + }, + { + "epoch": 9.246018487582136, + "grad_norm": 1.2968720197677612, + "learning_rate": 3.2534675091664866e-05, + "loss": 0.0832, + "num_input_tokens_seen": 101042464, + "step": 83020 + }, + { + "epoch": 9.246575342465754, + "grad_norm": 0.836841344833374, + "learning_rate": 3.253235829647926e-05, + "loss": 0.1011, + "num_input_tokens_seen": 101048256, + "step": 83025 + }, + { + "epoch": 9.247132197349371, + "grad_norm": 0.02329009398818016, + "learning_rate": 3.253004143014448e-05, + "loss": 0.1053, + "num_input_tokens_seen": 101054528, + "step": 83030 + }, + { + "epoch": 9.247689052232989, + "grad_norm": 0.5979257822036743, + "learning_rate": 3.2527724492682436e-05, + "loss": 0.0785, + "num_input_tokens_seen": 101060672, + "step": 83035 + }, + { + "epoch": 9.248245907116605, + "grad_norm": 0.013206614181399345, + "learning_rate": 3.2525407484115e-05, + "loss": 0.0214, + "num_input_tokens_seen": 101066272, + "step": 83040 + }, + { + "epoch": 9.248802762000222, + "grad_norm": 1.1446833610534668, + "learning_rate": 3.252309040446405e-05, + "loss": 0.1156, + "num_input_tokens_seen": 101071968, + "step": 83045 + }, + { + "epoch": 9.24935961688384, + "grad_norm": 0.040797919034957886, + "learning_rate": 3.25207732537515e-05, + "loss": 0.0063, + "num_input_tokens_seen": 101077920, + "step": 83050 + }, + { + "epoch": 9.249916471767458, + "grad_norm": 0.12096386402845383, + "learning_rate": 3.2518456031999214e-05, + "loss": 0.0537, + "num_input_tokens_seen": 101083616, + "step": 83055 + }, + { + "epoch": 9.250473326651075, + "grad_norm": 0.6742554903030396, + "learning_rate": 3.2516138739229095e-05, + "loss": 0.0656, + "num_input_tokens_seen": 101089472, + "step": 83060 + }, + { + "epoch": 9.251030181534691, + "grad_norm": 0.13296793401241302, + "learning_rate": 3.251382137546302e-05, + "loss": 0.0498, + "num_input_tokens_seen": 101095840, + "step": 83065 + }, + { + "epoch": 9.251587036418309, + "grad_norm": 2.216989755630493, + "learning_rate": 3.251150394072288e-05, + "loss": 0.1923, + "num_input_tokens_seen": 101102080, + "step": 83070 + }, + { + "epoch": 9.252143891301927, + "grad_norm": 1.0935099124908447, + "learning_rate": 3.250918643503056e-05, + "loss": 0.1292, + "num_input_tokens_seen": 101107872, + "step": 83075 + }, + { + "epoch": 9.252700746185544, + "grad_norm": 0.13271655142307281, + "learning_rate": 3.250686885840796e-05, + "loss": 0.0209, + "num_input_tokens_seen": 101114400, + "step": 83080 + }, + { + "epoch": 9.253257601069162, + "grad_norm": 0.5033780932426453, + "learning_rate": 3.250455121087698e-05, + "loss": 0.0447, + "num_input_tokens_seen": 101120544, + "step": 83085 + }, + { + "epoch": 9.253814455952778, + "grad_norm": 0.21853554248809814, + "learning_rate": 3.2502233492459486e-05, + "loss": 0.0344, + "num_input_tokens_seen": 101126304, + "step": 83090 + }, + { + "epoch": 9.254371310836396, + "grad_norm": 0.019181473180651665, + "learning_rate": 3.24999157031774e-05, + "loss": 0.0081, + "num_input_tokens_seen": 101132608, + "step": 83095 + }, + { + "epoch": 9.254928165720013, + "grad_norm": 1.3018022775650024, + "learning_rate": 3.2497597843052594e-05, + "loss": 0.1747, + "num_input_tokens_seen": 101137952, + "step": 83100 + }, + { + "epoch": 9.255485020603631, + "grad_norm": 0.1194181814789772, + "learning_rate": 3.2495279912106975e-05, + "loss": 0.0142, + "num_input_tokens_seen": 101143872, + "step": 83105 + }, + { + "epoch": 9.256041875487249, + "grad_norm": 0.874464750289917, + "learning_rate": 3.249296191036243e-05, + "loss": 0.015, + "num_input_tokens_seen": 101149824, + "step": 83110 + }, + { + "epoch": 9.256598730370865, + "grad_norm": 0.2515937387943268, + "learning_rate": 3.249064383784085e-05, + "loss": 0.0369, + "num_input_tokens_seen": 101155968, + "step": 83115 + }, + { + "epoch": 9.257155585254482, + "grad_norm": 0.06794083118438721, + "learning_rate": 3.248832569456415e-05, + "loss": 0.0646, + "num_input_tokens_seen": 101162336, + "step": 83120 + }, + { + "epoch": 9.2577124401381, + "grad_norm": 0.0007133677718229592, + "learning_rate": 3.24860074805542e-05, + "loss": 0.0142, + "num_input_tokens_seen": 101168352, + "step": 83125 + }, + { + "epoch": 9.258269295021718, + "grad_norm": 2.498729944229126, + "learning_rate": 3.2483689195832925e-05, + "loss": 0.0717, + "num_input_tokens_seen": 101174560, + "step": 83130 + }, + { + "epoch": 9.258826149905335, + "grad_norm": 1.043419599533081, + "learning_rate": 3.2481370840422196e-05, + "loss": 0.0465, + "num_input_tokens_seen": 101180352, + "step": 83135 + }, + { + "epoch": 9.259383004788951, + "grad_norm": 0.1899305135011673, + "learning_rate": 3.2479052414343934e-05, + "loss": 0.0028, + "num_input_tokens_seen": 101186496, + "step": 83140 + }, + { + "epoch": 9.259939859672569, + "grad_norm": 0.0011663397308439016, + "learning_rate": 3.2476733917620035e-05, + "loss": 0.0002, + "num_input_tokens_seen": 101192800, + "step": 83145 + }, + { + "epoch": 9.260496714556187, + "grad_norm": 2.3705718517303467, + "learning_rate": 3.247441535027238e-05, + "loss": 0.0734, + "num_input_tokens_seen": 101198720, + "step": 83150 + }, + { + "epoch": 9.261053569439804, + "grad_norm": 0.0017452785978093743, + "learning_rate": 3.24720967123229e-05, + "loss": 0.0829, + "num_input_tokens_seen": 101204928, + "step": 83155 + }, + { + "epoch": 9.261610424323422, + "grad_norm": 0.4307185113430023, + "learning_rate": 3.2469778003793466e-05, + "loss": 0.1593, + "num_input_tokens_seen": 101211072, + "step": 83160 + }, + { + "epoch": 9.262167279207038, + "grad_norm": 0.0669679269194603, + "learning_rate": 3.2467459224706004e-05, + "loss": 0.1169, + "num_input_tokens_seen": 101217088, + "step": 83165 + }, + { + "epoch": 9.262724134090655, + "grad_norm": 0.1424959897994995, + "learning_rate": 3.2465140375082396e-05, + "loss": 0.0561, + "num_input_tokens_seen": 101223424, + "step": 83170 + }, + { + "epoch": 9.263280988974273, + "grad_norm": 0.18332557380199432, + "learning_rate": 3.246282145494456e-05, + "loss": 0.0072, + "num_input_tokens_seen": 101229472, + "step": 83175 + }, + { + "epoch": 9.26383784385789, + "grad_norm": 0.7590614557266235, + "learning_rate": 3.24605024643144e-05, + "loss": 0.023, + "num_input_tokens_seen": 101235424, + "step": 83180 + }, + { + "epoch": 9.264394698741508, + "grad_norm": 0.0017235783161595464, + "learning_rate": 3.245818340321381e-05, + "loss": 0.0108, + "num_input_tokens_seen": 101241824, + "step": 83185 + }, + { + "epoch": 9.264951553625126, + "grad_norm": 0.03146723657846451, + "learning_rate": 3.2455864271664713e-05, + "loss": 0.0478, + "num_input_tokens_seen": 101247424, + "step": 83190 + }, + { + "epoch": 9.265508408508742, + "grad_norm": 0.4083574414253235, + "learning_rate": 3.2453545069689e-05, + "loss": 0.0126, + "num_input_tokens_seen": 101253568, + "step": 83195 + }, + { + "epoch": 9.26606526339236, + "grad_norm": 0.00015518754662480205, + "learning_rate": 3.245122579730858e-05, + "loss": 0.0128, + "num_input_tokens_seen": 101260000, + "step": 83200 + }, + { + "epoch": 9.266622118275977, + "grad_norm": 0.3831098675727844, + "learning_rate": 3.244890645454537e-05, + "loss": 0.037, + "num_input_tokens_seen": 101265920, + "step": 83205 + }, + { + "epoch": 9.267178973159595, + "grad_norm": 0.0183479692786932, + "learning_rate": 3.244658704142126e-05, + "loss": 0.0332, + "num_input_tokens_seen": 101272000, + "step": 83210 + }, + { + "epoch": 9.267735828043213, + "grad_norm": 0.19879025220870972, + "learning_rate": 3.244426755795817e-05, + "loss": 0.032, + "num_input_tokens_seen": 101278368, + "step": 83215 + }, + { + "epoch": 9.268292682926829, + "grad_norm": 0.23966668546199799, + "learning_rate": 3.244194800417801e-05, + "loss": 0.0661, + "num_input_tokens_seen": 101284256, + "step": 83220 + }, + { + "epoch": 9.268849537810446, + "grad_norm": 0.20952710509300232, + "learning_rate": 3.24396283801027e-05, + "loss": 0.0622, + "num_input_tokens_seen": 101290336, + "step": 83225 + }, + { + "epoch": 9.269406392694064, + "grad_norm": 0.6309425830841064, + "learning_rate": 3.243730868575413e-05, + "loss": 0.0413, + "num_input_tokens_seen": 101295136, + "step": 83230 + }, + { + "epoch": 9.269963247577682, + "grad_norm": 0.44364693760871887, + "learning_rate": 3.2434988921154216e-05, + "loss": 0.0279, + "num_input_tokens_seen": 101301504, + "step": 83235 + }, + { + "epoch": 9.2705201024613, + "grad_norm": 0.8683478832244873, + "learning_rate": 3.243266908632488e-05, + "loss": 0.0698, + "num_input_tokens_seen": 101307264, + "step": 83240 + }, + { + "epoch": 9.271076957344915, + "grad_norm": 3.887669563293457, + "learning_rate": 3.243034918128804e-05, + "loss": 0.1142, + "num_input_tokens_seen": 101313088, + "step": 83245 + }, + { + "epoch": 9.271633812228533, + "grad_norm": 0.00031054223654791713, + "learning_rate": 3.242802920606559e-05, + "loss": 0.005, + "num_input_tokens_seen": 101318752, + "step": 83250 + }, + { + "epoch": 9.27219066711215, + "grad_norm": 0.004559115506708622, + "learning_rate": 3.242570916067944e-05, + "loss": 0.1239, + "num_input_tokens_seen": 101324448, + "step": 83255 + }, + { + "epoch": 9.272747521995768, + "grad_norm": 0.023585805669426918, + "learning_rate": 3.2423389045151534e-05, + "loss": 0.0665, + "num_input_tokens_seen": 101330752, + "step": 83260 + }, + { + "epoch": 9.273304376879386, + "grad_norm": 1.7344374656677246, + "learning_rate": 3.2421068859503765e-05, + "loss": 0.1106, + "num_input_tokens_seen": 101336960, + "step": 83265 + }, + { + "epoch": 9.273861231763002, + "grad_norm": 0.011661817319691181, + "learning_rate": 3.241874860375805e-05, + "loss": 0.0675, + "num_input_tokens_seen": 101342912, + "step": 83270 + }, + { + "epoch": 9.27441808664662, + "grad_norm": 0.3738732635974884, + "learning_rate": 3.2416428277936325e-05, + "loss": 0.0396, + "num_input_tokens_seen": 101348992, + "step": 83275 + }, + { + "epoch": 9.274974941530237, + "grad_norm": 1.1203486919403076, + "learning_rate": 3.241410788206048e-05, + "loss": 0.057, + "num_input_tokens_seen": 101355040, + "step": 83280 + }, + { + "epoch": 9.275531796413855, + "grad_norm": 0.7461473941802979, + "learning_rate": 3.2411787416152455e-05, + "loss": 0.0656, + "num_input_tokens_seen": 101361536, + "step": 83285 + }, + { + "epoch": 9.276088651297473, + "grad_norm": 2.796455144882202, + "learning_rate": 3.2409466880234154e-05, + "loss": 0.1894, + "num_input_tokens_seen": 101367296, + "step": 83290 + }, + { + "epoch": 9.276645506181088, + "grad_norm": 0.00019382704340387136, + "learning_rate": 3.2407146274327506e-05, + "loss": 0.0037, + "num_input_tokens_seen": 101373408, + "step": 83295 + }, + { + "epoch": 9.277202361064706, + "grad_norm": 0.16721059381961823, + "learning_rate": 3.240482559845442e-05, + "loss": 0.0036, + "num_input_tokens_seen": 101379360, + "step": 83300 + }, + { + "epoch": 9.277759215948324, + "grad_norm": 0.003370231483131647, + "learning_rate": 3.240250485263683e-05, + "loss": 0.0494, + "num_input_tokens_seen": 101385600, + "step": 83305 + }, + { + "epoch": 9.278316070831941, + "grad_norm": 0.00041019017226062715, + "learning_rate": 3.2400184036896645e-05, + "loss": 0.0943, + "num_input_tokens_seen": 101391232, + "step": 83310 + }, + { + "epoch": 9.278872925715559, + "grad_norm": 2.693560838699341, + "learning_rate": 3.2397863151255794e-05, + "loss": 0.1064, + "num_input_tokens_seen": 101396832, + "step": 83315 + }, + { + "epoch": 9.279429780599175, + "grad_norm": 0.36718282103538513, + "learning_rate": 3.239554219573621e-05, + "loss": 0.0071, + "num_input_tokens_seen": 101402944, + "step": 83320 + }, + { + "epoch": 9.279986635482793, + "grad_norm": 0.18005654215812683, + "learning_rate": 3.239322117035979e-05, + "loss": 0.0206, + "num_input_tokens_seen": 101408992, + "step": 83325 + }, + { + "epoch": 9.28054349036641, + "grad_norm": 3.0888168811798096, + "learning_rate": 3.239090007514848e-05, + "loss": 0.0733, + "num_input_tokens_seen": 101415008, + "step": 83330 + }, + { + "epoch": 9.281100345250028, + "grad_norm": 0.9651597142219543, + "learning_rate": 3.238857891012419e-05, + "loss": 0.0499, + "num_input_tokens_seen": 101421056, + "step": 83335 + }, + { + "epoch": 9.281657200133646, + "grad_norm": 1.3480104207992554, + "learning_rate": 3.238625767530886e-05, + "loss": 0.0481, + "num_input_tokens_seen": 101427168, + "step": 83340 + }, + { + "epoch": 9.282214055017263, + "grad_norm": 0.24596120417118073, + "learning_rate": 3.238393637072441e-05, + "loss": 0.139, + "num_input_tokens_seen": 101433312, + "step": 83345 + }, + { + "epoch": 9.28277090990088, + "grad_norm": 1.3600823879241943, + "learning_rate": 3.238161499639276e-05, + "loss": 0.109, + "num_input_tokens_seen": 101438976, + "step": 83350 + }, + { + "epoch": 9.283327764784497, + "grad_norm": 0.5691824555397034, + "learning_rate": 3.2379293552335844e-05, + "loss": 0.0271, + "num_input_tokens_seen": 101445088, + "step": 83355 + }, + { + "epoch": 9.283884619668115, + "grad_norm": 0.3823808431625366, + "learning_rate": 3.237697203857559e-05, + "loss": 0.0156, + "num_input_tokens_seen": 101451456, + "step": 83360 + }, + { + "epoch": 9.284441474551732, + "grad_norm": 0.006006099283695221, + "learning_rate": 3.237465045513393e-05, + "loss": 0.018, + "num_input_tokens_seen": 101457568, + "step": 83365 + }, + { + "epoch": 9.28499832943535, + "grad_norm": 0.0007335245027206838, + "learning_rate": 3.237232880203278e-05, + "loss": 0.0229, + "num_input_tokens_seen": 101463936, + "step": 83370 + }, + { + "epoch": 9.285555184318966, + "grad_norm": 0.000366601743735373, + "learning_rate": 3.237000707929407e-05, + "loss": 0.0004, + "num_input_tokens_seen": 101470176, + "step": 83375 + }, + { + "epoch": 9.286112039202584, + "grad_norm": 0.02599945291876793, + "learning_rate": 3.236768528693975e-05, + "loss": 0.0848, + "num_input_tokens_seen": 101475936, + "step": 83380 + }, + { + "epoch": 9.286668894086201, + "grad_norm": 0.0650152638554573, + "learning_rate": 3.236536342499174e-05, + "loss": 0.0104, + "num_input_tokens_seen": 101482240, + "step": 83385 + }, + { + "epoch": 9.287225748969819, + "grad_norm": 0.05668259039521217, + "learning_rate": 3.236304149347196e-05, + "loss": 0.0311, + "num_input_tokens_seen": 101488224, + "step": 83390 + }, + { + "epoch": 9.287782603853437, + "grad_norm": 0.39960581064224243, + "learning_rate": 3.236071949240237e-05, + "loss": 0.0162, + "num_input_tokens_seen": 101493824, + "step": 83395 + }, + { + "epoch": 9.288339458737052, + "grad_norm": 1.0991953611373901, + "learning_rate": 3.2358397421804874e-05, + "loss": 0.098, + "num_input_tokens_seen": 101499904, + "step": 83400 + }, + { + "epoch": 9.28889631362067, + "grad_norm": 0.7549014687538147, + "learning_rate": 3.235607528170143e-05, + "loss": 0.0418, + "num_input_tokens_seen": 101506112, + "step": 83405 + }, + { + "epoch": 9.289453168504288, + "grad_norm": 1.3465564250946045, + "learning_rate": 3.235375307211395e-05, + "loss": 0.04, + "num_input_tokens_seen": 101512352, + "step": 83410 + }, + { + "epoch": 9.290010023387905, + "grad_norm": 0.03658851608633995, + "learning_rate": 3.2351430793064384e-05, + "loss": 0.0147, + "num_input_tokens_seen": 101518432, + "step": 83415 + }, + { + "epoch": 9.290566878271523, + "grad_norm": 0.052260130643844604, + "learning_rate": 3.234910844457467e-05, + "loss": 0.08, + "num_input_tokens_seen": 101524480, + "step": 83420 + }, + { + "epoch": 9.291123733155139, + "grad_norm": 0.00016184624109882861, + "learning_rate": 3.2346786026666733e-05, + "loss": 0.0079, + "num_input_tokens_seen": 101530368, + "step": 83425 + }, + { + "epoch": 9.291680588038757, + "grad_norm": 0.00685745757073164, + "learning_rate": 3.234446353936252e-05, + "loss": 0.0051, + "num_input_tokens_seen": 101536608, + "step": 83430 + }, + { + "epoch": 9.292237442922374, + "grad_norm": 0.03763095289468765, + "learning_rate": 3.2342140982683965e-05, + "loss": 0.0481, + "num_input_tokens_seen": 101542752, + "step": 83435 + }, + { + "epoch": 9.292794297805992, + "grad_norm": 0.006302753463387489, + "learning_rate": 3.233981835665301e-05, + "loss": 0.0661, + "num_input_tokens_seen": 101548800, + "step": 83440 + }, + { + "epoch": 9.29335115268961, + "grad_norm": 0.04513409361243248, + "learning_rate": 3.233749566129157e-05, + "loss": 0.0429, + "num_input_tokens_seen": 101554688, + "step": 83445 + }, + { + "epoch": 9.293908007573226, + "grad_norm": 0.1701621562242508, + "learning_rate": 3.233517289662162e-05, + "loss": 0.0751, + "num_input_tokens_seen": 101560768, + "step": 83450 + }, + { + "epoch": 9.294464862456843, + "grad_norm": 0.6234262585639954, + "learning_rate": 3.233285006266509e-05, + "loss": 0.1049, + "num_input_tokens_seen": 101567040, + "step": 83455 + }, + { + "epoch": 9.295021717340461, + "grad_norm": 0.6899979710578918, + "learning_rate": 3.23305271594439e-05, + "loss": 0.1884, + "num_input_tokens_seen": 101572800, + "step": 83460 + }, + { + "epoch": 9.295578572224079, + "grad_norm": 0.0011440046364441514, + "learning_rate": 3.232820418698003e-05, + "loss": 0.1289, + "num_input_tokens_seen": 101578944, + "step": 83465 + }, + { + "epoch": 9.296135427107696, + "grad_norm": 0.6635850071907043, + "learning_rate": 3.232588114529538e-05, + "loss": 0.0844, + "num_input_tokens_seen": 101585088, + "step": 83470 + }, + { + "epoch": 9.296692281991312, + "grad_norm": 0.09545363485813141, + "learning_rate": 3.2323558034411915e-05, + "loss": 0.0309, + "num_input_tokens_seen": 101591264, + "step": 83475 + }, + { + "epoch": 9.29724913687493, + "grad_norm": 0.6990931034088135, + "learning_rate": 3.232123485435159e-05, + "loss": 0.0138, + "num_input_tokens_seen": 101597184, + "step": 83480 + }, + { + "epoch": 9.297805991758548, + "grad_norm": 0.02257378399372101, + "learning_rate": 3.2318911605136326e-05, + "loss": 0.0022, + "num_input_tokens_seen": 101603200, + "step": 83485 + }, + { + "epoch": 9.298362846642165, + "grad_norm": 0.11546678841114044, + "learning_rate": 3.2316588286788085e-05, + "loss": 0.0226, + "num_input_tokens_seen": 101609504, + "step": 83490 + }, + { + "epoch": 9.298919701525783, + "grad_norm": 0.06851410865783691, + "learning_rate": 3.23142648993288e-05, + "loss": 0.0286, + "num_input_tokens_seen": 101615488, + "step": 83495 + }, + { + "epoch": 9.299476556409399, + "grad_norm": 0.0010063303634524345, + "learning_rate": 3.2311941442780426e-05, + "loss": 0.1103, + "num_input_tokens_seen": 101621568, + "step": 83500 + }, + { + "epoch": 9.300033411293017, + "grad_norm": 0.644691526889801, + "learning_rate": 3.23096179171649e-05, + "loss": 0.0677, + "num_input_tokens_seen": 101627840, + "step": 83505 + }, + { + "epoch": 9.300590266176634, + "grad_norm": 0.18629342317581177, + "learning_rate": 3.230729432250418e-05, + "loss": 0.0116, + "num_input_tokens_seen": 101634112, + "step": 83510 + }, + { + "epoch": 9.301147121060252, + "grad_norm": 1.22552490234375, + "learning_rate": 3.2304970658820215e-05, + "loss": 0.0888, + "num_input_tokens_seen": 101640128, + "step": 83515 + }, + { + "epoch": 9.30170397594387, + "grad_norm": 0.12188336253166199, + "learning_rate": 3.230264692613495e-05, + "loss": 0.0158, + "num_input_tokens_seen": 101646048, + "step": 83520 + }, + { + "epoch": 9.302260830827485, + "grad_norm": 0.4884910583496094, + "learning_rate": 3.2300323124470336e-05, + "loss": 0.1295, + "num_input_tokens_seen": 101651872, + "step": 83525 + }, + { + "epoch": 9.302817685711103, + "grad_norm": 0.18690040707588196, + "learning_rate": 3.229799925384831e-05, + "loss": 0.0924, + "num_input_tokens_seen": 101657952, + "step": 83530 + }, + { + "epoch": 9.30337454059472, + "grad_norm": 0.016852442175149918, + "learning_rate": 3.229567531429084e-05, + "loss": 0.0411, + "num_input_tokens_seen": 101663936, + "step": 83535 + }, + { + "epoch": 9.303931395478338, + "grad_norm": 0.2740362286567688, + "learning_rate": 3.2293351305819874e-05, + "loss": 0.0719, + "num_input_tokens_seen": 101669856, + "step": 83540 + }, + { + "epoch": 9.304488250361956, + "grad_norm": 0.3977144956588745, + "learning_rate": 3.2291027228457356e-05, + "loss": 0.1128, + "num_input_tokens_seen": 101676064, + "step": 83545 + }, + { + "epoch": 9.305045105245574, + "grad_norm": 0.9890657663345337, + "learning_rate": 3.2288703082225245e-05, + "loss": 0.0211, + "num_input_tokens_seen": 101682528, + "step": 83550 + }, + { + "epoch": 9.30560196012919, + "grad_norm": 0.017307862639427185, + "learning_rate": 3.22863788671455e-05, + "loss": 0.0009, + "num_input_tokens_seen": 101688800, + "step": 83555 + }, + { + "epoch": 9.306158815012807, + "grad_norm": 0.06678365916013718, + "learning_rate": 3.2284054583240066e-05, + "loss": 0.0186, + "num_input_tokens_seen": 101694976, + "step": 83560 + }, + { + "epoch": 9.306715669896425, + "grad_norm": 0.903647243976593, + "learning_rate": 3.2281730230530894e-05, + "loss": 0.0301, + "num_input_tokens_seen": 101700928, + "step": 83565 + }, + { + "epoch": 9.307272524780043, + "grad_norm": 1.8286172151565552, + "learning_rate": 3.227940580903996e-05, + "loss": 0.0911, + "num_input_tokens_seen": 101706336, + "step": 83570 + }, + { + "epoch": 9.30782937966366, + "grad_norm": 0.20093151926994324, + "learning_rate": 3.2277081318789196e-05, + "loss": 0.0105, + "num_input_tokens_seen": 101712224, + "step": 83575 + }, + { + "epoch": 9.308386234547276, + "grad_norm": 1.6499607563018799, + "learning_rate": 3.227475675980056e-05, + "loss": 0.1114, + "num_input_tokens_seen": 101718400, + "step": 83580 + }, + { + "epoch": 9.308943089430894, + "grad_norm": 0.8305293917655945, + "learning_rate": 3.2272432132096035e-05, + "loss": 0.1226, + "num_input_tokens_seen": 101724480, + "step": 83585 + }, + { + "epoch": 9.309499944314512, + "grad_norm": 0.6788014769554138, + "learning_rate": 3.2270107435697546e-05, + "loss": 0.1506, + "num_input_tokens_seen": 101731008, + "step": 83590 + }, + { + "epoch": 9.31005679919813, + "grad_norm": 0.04680659994482994, + "learning_rate": 3.226778267062709e-05, + "loss": 0.0133, + "num_input_tokens_seen": 101737120, + "step": 83595 + }, + { + "epoch": 9.310613654081747, + "grad_norm": 0.3158487379550934, + "learning_rate": 3.226545783690659e-05, + "loss": 0.0459, + "num_input_tokens_seen": 101743360, + "step": 83600 + }, + { + "epoch": 9.311170508965363, + "grad_norm": 1.6466999053955078, + "learning_rate": 3.226313293455801e-05, + "loss": 0.0427, + "num_input_tokens_seen": 101749376, + "step": 83605 + }, + { + "epoch": 9.31172736384898, + "grad_norm": 0.22004736959934235, + "learning_rate": 3.2260807963603336e-05, + "loss": 0.0312, + "num_input_tokens_seen": 101755392, + "step": 83610 + }, + { + "epoch": 9.312284218732598, + "grad_norm": 1.6053869724273682, + "learning_rate": 3.22584829240645e-05, + "loss": 0.0969, + "num_input_tokens_seen": 101761568, + "step": 83615 + }, + { + "epoch": 9.312841073616216, + "grad_norm": 1.221850037574768, + "learning_rate": 3.2256157815963494e-05, + "loss": 0.1356, + "num_input_tokens_seen": 101767296, + "step": 83620 + }, + { + "epoch": 9.313397928499834, + "grad_norm": 1.1200268268585205, + "learning_rate": 3.2253832639322254e-05, + "loss": 0.05, + "num_input_tokens_seen": 101773280, + "step": 83625 + }, + { + "epoch": 9.31395478338345, + "grad_norm": 0.16732531785964966, + "learning_rate": 3.225150739416276e-05, + "loss": 0.0274, + "num_input_tokens_seen": 101779776, + "step": 83630 + }, + { + "epoch": 9.314511638267067, + "grad_norm": 0.6741031408309937, + "learning_rate": 3.224918208050696e-05, + "loss": 0.0459, + "num_input_tokens_seen": 101785792, + "step": 83635 + }, + { + "epoch": 9.315068493150685, + "grad_norm": 0.1251325011253357, + "learning_rate": 3.2246856698376826e-05, + "loss": 0.0301, + "num_input_tokens_seen": 101792032, + "step": 83640 + }, + { + "epoch": 9.315625348034303, + "grad_norm": 0.0806226134300232, + "learning_rate": 3.224453124779433e-05, + "loss": 0.0065, + "num_input_tokens_seen": 101798176, + "step": 83645 + }, + { + "epoch": 9.31618220291792, + "grad_norm": 0.0032962257973849773, + "learning_rate": 3.224220572878143e-05, + "loss": 0.0086, + "num_input_tokens_seen": 101804288, + "step": 83650 + }, + { + "epoch": 9.316739057801536, + "grad_norm": 0.043821897357702255, + "learning_rate": 3.2239880141360103e-05, + "loss": 0.0869, + "num_input_tokens_seen": 101810656, + "step": 83655 + }, + { + "epoch": 9.317295912685154, + "grad_norm": 0.2495381385087967, + "learning_rate": 3.223755448555229e-05, + "loss": 0.0489, + "num_input_tokens_seen": 101816640, + "step": 83660 + }, + { + "epoch": 9.317852767568771, + "grad_norm": 0.006168197840452194, + "learning_rate": 3.2235228761379996e-05, + "loss": 0.0233, + "num_input_tokens_seen": 101823168, + "step": 83665 + }, + { + "epoch": 9.31840962245239, + "grad_norm": 2.370676040649414, + "learning_rate": 3.223290296886515e-05, + "loss": 0.0226, + "num_input_tokens_seen": 101829120, + "step": 83670 + }, + { + "epoch": 9.318966477336007, + "grad_norm": 0.25757288932800293, + "learning_rate": 3.2230577108029755e-05, + "loss": 0.0372, + "num_input_tokens_seen": 101835200, + "step": 83675 + }, + { + "epoch": 9.319523332219624, + "grad_norm": 0.49194765090942383, + "learning_rate": 3.222825117889576e-05, + "loss": 0.0293, + "num_input_tokens_seen": 101841088, + "step": 83680 + }, + { + "epoch": 9.32008018710324, + "grad_norm": 1.4749572277069092, + "learning_rate": 3.222592518148514e-05, + "loss": 0.16, + "num_input_tokens_seen": 101847072, + "step": 83685 + }, + { + "epoch": 9.320637041986858, + "grad_norm": 0.3460999131202698, + "learning_rate": 3.2223599115819875e-05, + "loss": 0.0372, + "num_input_tokens_seen": 101852832, + "step": 83690 + }, + { + "epoch": 9.321193896870476, + "grad_norm": 0.6658666729927063, + "learning_rate": 3.222127298192193e-05, + "loss": 0.0202, + "num_input_tokens_seen": 101858848, + "step": 83695 + }, + { + "epoch": 9.321750751754093, + "grad_norm": 0.15489909052848816, + "learning_rate": 3.221894677981326e-05, + "loss": 0.0174, + "num_input_tokens_seen": 101864832, + "step": 83700 + }, + { + "epoch": 9.322307606637711, + "grad_norm": 0.08377551287412643, + "learning_rate": 3.221662050951587e-05, + "loss": 0.0073, + "num_input_tokens_seen": 101871104, + "step": 83705 + }, + { + "epoch": 9.322864461521327, + "grad_norm": 0.67157381772995, + "learning_rate": 3.221429417105171e-05, + "loss": 0.1131, + "num_input_tokens_seen": 101877376, + "step": 83710 + }, + { + "epoch": 9.323421316404945, + "grad_norm": 1.2046619653701782, + "learning_rate": 3.221196776444276e-05, + "loss": 0.1133, + "num_input_tokens_seen": 101883552, + "step": 83715 + }, + { + "epoch": 9.323978171288562, + "grad_norm": 1.2663724422454834, + "learning_rate": 3.2209641289711e-05, + "loss": 0.0891, + "num_input_tokens_seen": 101889888, + "step": 83720 + }, + { + "epoch": 9.32453502617218, + "grad_norm": 0.5334964394569397, + "learning_rate": 3.22073147468784e-05, + "loss": 0.0172, + "num_input_tokens_seen": 101896064, + "step": 83725 + }, + { + "epoch": 9.325091881055798, + "grad_norm": 0.0006921649910509586, + "learning_rate": 3.220498813596694e-05, + "loss": 0.0117, + "num_input_tokens_seen": 101902560, + "step": 83730 + }, + { + "epoch": 9.325648735939414, + "grad_norm": 1.2356749773025513, + "learning_rate": 3.22026614569986e-05, + "loss": 0.0229, + "num_input_tokens_seen": 101907936, + "step": 83735 + }, + { + "epoch": 9.326205590823031, + "grad_norm": 1.2822712659835815, + "learning_rate": 3.2200334709995346e-05, + "loss": 0.08, + "num_input_tokens_seen": 101914080, + "step": 83740 + }, + { + "epoch": 9.326762445706649, + "grad_norm": 0.17486055195331573, + "learning_rate": 3.219800789497916e-05, + "loss": 0.0085, + "num_input_tokens_seen": 101920192, + "step": 83745 + }, + { + "epoch": 9.327319300590267, + "grad_norm": 0.7162151336669922, + "learning_rate": 3.219568101197202e-05, + "loss": 0.0468, + "num_input_tokens_seen": 101925824, + "step": 83750 + }, + { + "epoch": 9.327876155473884, + "grad_norm": 0.16316956281661987, + "learning_rate": 3.219335406099591e-05, + "loss": 0.0027, + "num_input_tokens_seen": 101931456, + "step": 83755 + }, + { + "epoch": 9.3284330103575, + "grad_norm": 0.9747870564460754, + "learning_rate": 3.219102704207282e-05, + "loss": 0.0095, + "num_input_tokens_seen": 101937856, + "step": 83760 + }, + { + "epoch": 9.328989865241118, + "grad_norm": 0.11082849651575089, + "learning_rate": 3.21886999552247e-05, + "loss": 0.0441, + "num_input_tokens_seen": 101944000, + "step": 83765 + }, + { + "epoch": 9.329546720124736, + "grad_norm": 0.00015329006419051439, + "learning_rate": 3.218637280047356e-05, + "loss": 0.0035, + "num_input_tokens_seen": 101950176, + "step": 83770 + }, + { + "epoch": 9.330103575008353, + "grad_norm": 0.26707926392555237, + "learning_rate": 3.218404557784137e-05, + "loss": 0.022, + "num_input_tokens_seen": 101956256, + "step": 83775 + }, + { + "epoch": 9.33066042989197, + "grad_norm": 9.3645321612712e-05, + "learning_rate": 3.218171828735011e-05, + "loss": 0.0652, + "num_input_tokens_seen": 101962752, + "step": 83780 + }, + { + "epoch": 9.331217284775587, + "grad_norm": 0.3081347644329071, + "learning_rate": 3.217939092902177e-05, + "loss": 0.0891, + "num_input_tokens_seen": 101969088, + "step": 83785 + }, + { + "epoch": 9.331774139659204, + "grad_norm": 0.04383505880832672, + "learning_rate": 3.217706350287833e-05, + "loss": 0.09, + "num_input_tokens_seen": 101975040, + "step": 83790 + }, + { + "epoch": 9.332330994542822, + "grad_norm": 0.002896016463637352, + "learning_rate": 3.2174736008941775e-05, + "loss": 0.0526, + "num_input_tokens_seen": 101981568, + "step": 83795 + }, + { + "epoch": 9.33288784942644, + "grad_norm": 3.358011484146118, + "learning_rate": 3.217240844723409e-05, + "loss": 0.1106, + "num_input_tokens_seen": 101987360, + "step": 83800 + }, + { + "epoch": 9.333444704310057, + "grad_norm": 0.9959526062011719, + "learning_rate": 3.217008081777726e-05, + "loss": 0.0278, + "num_input_tokens_seen": 101993440, + "step": 83805 + }, + { + "epoch": 9.334001559193673, + "grad_norm": 1.2672522068023682, + "learning_rate": 3.216775312059327e-05, + "loss": 0.0732, + "num_input_tokens_seen": 101998752, + "step": 83810 + }, + { + "epoch": 9.334558414077291, + "grad_norm": 0.002348782494664192, + "learning_rate": 3.216542535570412e-05, + "loss": 0.0114, + "num_input_tokens_seen": 102005152, + "step": 83815 + }, + { + "epoch": 9.335115268960909, + "grad_norm": 0.9884918332099915, + "learning_rate": 3.216309752313177e-05, + "loss": 0.0531, + "num_input_tokens_seen": 102011168, + "step": 83820 + }, + { + "epoch": 9.335672123844526, + "grad_norm": 0.5752087831497192, + "learning_rate": 3.216076962289823e-05, + "loss": 0.1001, + "num_input_tokens_seen": 102016576, + "step": 83825 + }, + { + "epoch": 9.336228978728144, + "grad_norm": 0.9419698119163513, + "learning_rate": 3.21584416550255e-05, + "loss": 0.0995, + "num_input_tokens_seen": 102022848, + "step": 83830 + }, + { + "epoch": 9.33678583361176, + "grad_norm": 0.029412085190415382, + "learning_rate": 3.2156113619535535e-05, + "loss": 0.0052, + "num_input_tokens_seen": 102028928, + "step": 83835 + }, + { + "epoch": 9.337342688495378, + "grad_norm": 1.0799109935760498, + "learning_rate": 3.215378551645035e-05, + "loss": 0.0374, + "num_input_tokens_seen": 102035392, + "step": 83840 + }, + { + "epoch": 9.337899543378995, + "grad_norm": 0.00019734565285034478, + "learning_rate": 3.215145734579193e-05, + "loss": 0.009, + "num_input_tokens_seen": 102041504, + "step": 83845 + }, + { + "epoch": 9.338456398262613, + "grad_norm": 0.029392018914222717, + "learning_rate": 3.214912910758226e-05, + "loss": 0.0705, + "num_input_tokens_seen": 102047744, + "step": 83850 + }, + { + "epoch": 9.33901325314623, + "grad_norm": 0.0015893387608230114, + "learning_rate": 3.214680080184334e-05, + "loss": 0.0074, + "num_input_tokens_seen": 102053792, + "step": 83855 + }, + { + "epoch": 9.339570108029847, + "grad_norm": 0.6784437894821167, + "learning_rate": 3.214447242859717e-05, + "loss": 0.057, + "num_input_tokens_seen": 102059616, + "step": 83860 + }, + { + "epoch": 9.340126962913464, + "grad_norm": 0.0007226369343698025, + "learning_rate": 3.2142143987865717e-05, + "loss": 0.0121, + "num_input_tokens_seen": 102065920, + "step": 83865 + }, + { + "epoch": 9.340683817797082, + "grad_norm": 0.23329080641269684, + "learning_rate": 3.2139815479671005e-05, + "loss": 0.0127, + "num_input_tokens_seen": 102072544, + "step": 83870 + }, + { + "epoch": 9.3412406726807, + "grad_norm": 0.8423636555671692, + "learning_rate": 3.213748690403501e-05, + "loss": 0.0377, + "num_input_tokens_seen": 102078528, + "step": 83875 + }, + { + "epoch": 9.341797527564317, + "grad_norm": 0.056955501437187195, + "learning_rate": 3.213515826097973e-05, + "loss": 0.0704, + "num_input_tokens_seen": 102084544, + "step": 83880 + }, + { + "epoch": 9.342354382447935, + "grad_norm": 0.24541114270687103, + "learning_rate": 3.213282955052717e-05, + "loss": 0.0247, + "num_input_tokens_seen": 102090720, + "step": 83885 + }, + { + "epoch": 9.34291123733155, + "grad_norm": 0.6349668502807617, + "learning_rate": 3.2130500772699315e-05, + "loss": 0.0609, + "num_input_tokens_seen": 102096864, + "step": 83890 + }, + { + "epoch": 9.343468092215168, + "grad_norm": 0.0037532036658376455, + "learning_rate": 3.2128171927518176e-05, + "loss": 0.0865, + "num_input_tokens_seen": 102102976, + "step": 83895 + }, + { + "epoch": 9.344024947098786, + "grad_norm": 0.5785046219825745, + "learning_rate": 3.212584301500574e-05, + "loss": 0.0103, + "num_input_tokens_seen": 102109408, + "step": 83900 + }, + { + "epoch": 9.344581801982404, + "grad_norm": 0.0004335476260166615, + "learning_rate": 3.2123514035184e-05, + "loss": 0.0632, + "num_input_tokens_seen": 102115744, + "step": 83905 + }, + { + "epoch": 9.345138656866022, + "grad_norm": 0.16858701407909393, + "learning_rate": 3.212118498807497e-05, + "loss": 0.0227, + "num_input_tokens_seen": 102121728, + "step": 83910 + }, + { + "epoch": 9.345695511749637, + "grad_norm": 0.1321854442358017, + "learning_rate": 3.211885587370063e-05, + "loss": 0.0181, + "num_input_tokens_seen": 102127936, + "step": 83915 + }, + { + "epoch": 9.346252366633255, + "grad_norm": 0.06958431750535965, + "learning_rate": 3.2116526692083e-05, + "loss": 0.01, + "num_input_tokens_seen": 102133664, + "step": 83920 + }, + { + "epoch": 9.346809221516873, + "grad_norm": 0.0049663567915558815, + "learning_rate": 3.2114197443244084e-05, + "loss": 0.0276, + "num_input_tokens_seen": 102139264, + "step": 83925 + }, + { + "epoch": 9.34736607640049, + "grad_norm": 0.5162289142608643, + "learning_rate": 3.211186812720586e-05, + "loss": 0.0229, + "num_input_tokens_seen": 102145504, + "step": 83930 + }, + { + "epoch": 9.347922931284108, + "grad_norm": 0.0014304810902103782, + "learning_rate": 3.210953874399035e-05, + "loss": 0.0149, + "num_input_tokens_seen": 102151872, + "step": 83935 + }, + { + "epoch": 9.348479786167724, + "grad_norm": 1.075925588607788, + "learning_rate": 3.2107209293619554e-05, + "loss": 0.0706, + "num_input_tokens_seen": 102157952, + "step": 83940 + }, + { + "epoch": 9.349036641051342, + "grad_norm": 0.004223927855491638, + "learning_rate": 3.210487977611546e-05, + "loss": 0.0782, + "num_input_tokens_seen": 102164096, + "step": 83945 + }, + { + "epoch": 9.34959349593496, + "grad_norm": 0.2561804950237274, + "learning_rate": 3.210255019150009e-05, + "loss": 0.0422, + "num_input_tokens_seen": 102170336, + "step": 83950 + }, + { + "epoch": 9.350150350818577, + "grad_norm": 0.007929111830890179, + "learning_rate": 3.210022053979545e-05, + "loss": 0.1954, + "num_input_tokens_seen": 102176288, + "step": 83955 + }, + { + "epoch": 9.350707205702195, + "grad_norm": 0.6968590617179871, + "learning_rate": 3.209789082102353e-05, + "loss": 0.2253, + "num_input_tokens_seen": 102182336, + "step": 83960 + }, + { + "epoch": 9.35126406058581, + "grad_norm": 0.42430245876312256, + "learning_rate": 3.209556103520635e-05, + "loss": 0.0901, + "num_input_tokens_seen": 102188448, + "step": 83965 + }, + { + "epoch": 9.351820915469428, + "grad_norm": 0.37987029552459717, + "learning_rate": 3.209323118236591e-05, + "loss": 0.0282, + "num_input_tokens_seen": 102194624, + "step": 83970 + }, + { + "epoch": 9.352377770353046, + "grad_norm": 0.03753846138715744, + "learning_rate": 3.209090126252421e-05, + "loss": 0.1625, + "num_input_tokens_seen": 102200992, + "step": 83975 + }, + { + "epoch": 9.352934625236664, + "grad_norm": 1.110946536064148, + "learning_rate": 3.208857127570328e-05, + "loss": 0.1073, + "num_input_tokens_seen": 102206848, + "step": 83980 + }, + { + "epoch": 9.353491480120281, + "grad_norm": 1.7103127241134644, + "learning_rate": 3.20862412219251e-05, + "loss": 0.1445, + "num_input_tokens_seen": 102212288, + "step": 83985 + }, + { + "epoch": 9.354048335003897, + "grad_norm": 0.05149431526660919, + "learning_rate": 3.2083911101211706e-05, + "loss": 0.0244, + "num_input_tokens_seen": 102218368, + "step": 83990 + }, + { + "epoch": 9.354605189887515, + "grad_norm": 0.4786800444126129, + "learning_rate": 3.208158091358509e-05, + "loss": 0.0255, + "num_input_tokens_seen": 102223680, + "step": 83995 + }, + { + "epoch": 9.355162044771133, + "grad_norm": 0.032147377729415894, + "learning_rate": 3.2079250659067276e-05, + "loss": 0.1105, + "num_input_tokens_seen": 102229536, + "step": 84000 + }, + { + "epoch": 9.35571889965475, + "grad_norm": 1.6757696866989136, + "learning_rate": 3.207692033768026e-05, + "loss": 0.1392, + "num_input_tokens_seen": 102235584, + "step": 84005 + }, + { + "epoch": 9.356275754538368, + "grad_norm": 0.8437905311584473, + "learning_rate": 3.207458994944606e-05, + "loss": 0.0889, + "num_input_tokens_seen": 102241696, + "step": 84010 + }, + { + "epoch": 9.356832609421984, + "grad_norm": 1.7999776601791382, + "learning_rate": 3.20722594943867e-05, + "loss": 0.1274, + "num_input_tokens_seen": 102247904, + "step": 84015 + }, + { + "epoch": 9.357389464305601, + "grad_norm": 0.7880619168281555, + "learning_rate": 3.206992897252417e-05, + "loss": 0.0283, + "num_input_tokens_seen": 102252896, + "step": 84020 + }, + { + "epoch": 9.35794631918922, + "grad_norm": 0.008690748363733292, + "learning_rate": 3.206759838388051e-05, + "loss": 0.0783, + "num_input_tokens_seen": 102259040, + "step": 84025 + }, + { + "epoch": 9.358503174072837, + "grad_norm": 0.024412935599684715, + "learning_rate": 3.206526772847771e-05, + "loss": 0.0701, + "num_input_tokens_seen": 102265280, + "step": 84030 + }, + { + "epoch": 9.359060028956455, + "grad_norm": 0.008309831842780113, + "learning_rate": 3.206293700633781e-05, + "loss": 0.0299, + "num_input_tokens_seen": 102271040, + "step": 84035 + }, + { + "epoch": 9.359616883840072, + "grad_norm": 1.3898776769638062, + "learning_rate": 3.206060621748279e-05, + "loss": 0.084, + "num_input_tokens_seen": 102276608, + "step": 84040 + }, + { + "epoch": 9.360173738723688, + "grad_norm": 0.268740713596344, + "learning_rate": 3.20582753619347e-05, + "loss": 0.0122, + "num_input_tokens_seen": 102282720, + "step": 84045 + }, + { + "epoch": 9.360730593607306, + "grad_norm": 0.18136253952980042, + "learning_rate": 3.2055944439715545e-05, + "loss": 0.0437, + "num_input_tokens_seen": 102288544, + "step": 84050 + }, + { + "epoch": 9.361287448490923, + "grad_norm": 2.4321489334106445, + "learning_rate": 3.205361345084734e-05, + "loss": 0.0454, + "num_input_tokens_seen": 102294752, + "step": 84055 + }, + { + "epoch": 9.361844303374541, + "grad_norm": 0.044555675238370895, + "learning_rate": 3.2051282395352106e-05, + "loss": 0.0105, + "num_input_tokens_seen": 102300736, + "step": 84060 + }, + { + "epoch": 9.362401158258159, + "grad_norm": 0.19262486696243286, + "learning_rate": 3.204895127325186e-05, + "loss": 0.0482, + "num_input_tokens_seen": 102306880, + "step": 84065 + }, + { + "epoch": 9.362958013141775, + "grad_norm": 0.0035564375575631857, + "learning_rate": 3.204662008456862e-05, + "loss": 0.0856, + "num_input_tokens_seen": 102313216, + "step": 84070 + }, + { + "epoch": 9.363514868025392, + "grad_norm": 0.009153060615062714, + "learning_rate": 3.2044288829324403e-05, + "loss": 0.1437, + "num_input_tokens_seen": 102319552, + "step": 84075 + }, + { + "epoch": 9.36407172290901, + "grad_norm": 0.00011429264122853056, + "learning_rate": 3.2041957507541246e-05, + "loss": 0.0398, + "num_input_tokens_seen": 102325472, + "step": 84080 + }, + { + "epoch": 9.364628577792628, + "grad_norm": 0.3366655111312866, + "learning_rate": 3.203962611924115e-05, + "loss": 0.1285, + "num_input_tokens_seen": 102331552, + "step": 84085 + }, + { + "epoch": 9.365185432676245, + "grad_norm": 0.15839558839797974, + "learning_rate": 3.203729466444614e-05, + "loss": 0.0108, + "num_input_tokens_seen": 102337728, + "step": 84090 + }, + { + "epoch": 9.365742287559861, + "grad_norm": 0.439313679933548, + "learning_rate": 3.2034963143178254e-05, + "loss": 0.12, + "num_input_tokens_seen": 102343712, + "step": 84095 + }, + { + "epoch": 9.366299142443479, + "grad_norm": 0.1937652975320816, + "learning_rate": 3.20326315554595e-05, + "loss": 0.0603, + "num_input_tokens_seen": 102349792, + "step": 84100 + }, + { + "epoch": 9.366855997327097, + "grad_norm": 0.006614474579691887, + "learning_rate": 3.2030299901311904e-05, + "loss": 0.0231, + "num_input_tokens_seen": 102355616, + "step": 84105 + }, + { + "epoch": 9.367412852210714, + "grad_norm": 0.0011315313167870045, + "learning_rate": 3.20279681807575e-05, + "loss": 0.013, + "num_input_tokens_seen": 102361632, + "step": 84110 + }, + { + "epoch": 9.367969707094332, + "grad_norm": 0.0003337519592605531, + "learning_rate": 3.2025636393818305e-05, + "loss": 0.0483, + "num_input_tokens_seen": 102367904, + "step": 84115 + }, + { + "epoch": 9.368526561977948, + "grad_norm": 1.0311325788497925, + "learning_rate": 3.202330454051634e-05, + "loss": 0.2198, + "num_input_tokens_seen": 102374048, + "step": 84120 + }, + { + "epoch": 9.369083416861566, + "grad_norm": 1.0639327764511108, + "learning_rate": 3.202097262087364e-05, + "loss": 0.0494, + "num_input_tokens_seen": 102380224, + "step": 84125 + }, + { + "epoch": 9.369640271745183, + "grad_norm": 0.2196621298789978, + "learning_rate": 3.2018640634912236e-05, + "loss": 0.168, + "num_input_tokens_seen": 102386112, + "step": 84130 + }, + { + "epoch": 9.3701971266288, + "grad_norm": 1.086272120475769, + "learning_rate": 3.201630858265413e-05, + "loss": 0.1322, + "num_input_tokens_seen": 102392224, + "step": 84135 + }, + { + "epoch": 9.370753981512419, + "grad_norm": 1.307499647140503, + "learning_rate": 3.201397646412138e-05, + "loss": 0.0133, + "num_input_tokens_seen": 102398720, + "step": 84140 + }, + { + "epoch": 9.371310836396034, + "grad_norm": 1.1424407958984375, + "learning_rate": 3.2011644279336007e-05, + "loss": 0.129, + "num_input_tokens_seen": 102404960, + "step": 84145 + }, + { + "epoch": 9.371867691279652, + "grad_norm": 0.0009556320146657526, + "learning_rate": 3.200931202832003e-05, + "loss": 0.005, + "num_input_tokens_seen": 102411040, + "step": 84150 + }, + { + "epoch": 9.37242454616327, + "grad_norm": 0.577454686164856, + "learning_rate": 3.200697971109548e-05, + "loss": 0.0862, + "num_input_tokens_seen": 102417376, + "step": 84155 + }, + { + "epoch": 9.372981401046887, + "grad_norm": 0.6378628611564636, + "learning_rate": 3.20046473276844e-05, + "loss": 0.0378, + "num_input_tokens_seen": 102423520, + "step": 84160 + }, + { + "epoch": 9.373538255930505, + "grad_norm": 0.08155475556850433, + "learning_rate": 3.200231487810881e-05, + "loss": 0.0612, + "num_input_tokens_seen": 102429504, + "step": 84165 + }, + { + "epoch": 9.374095110814121, + "grad_norm": 0.24932053685188293, + "learning_rate": 3.1999982362390744e-05, + "loss": 0.0634, + "num_input_tokens_seen": 102435552, + "step": 84170 + }, + { + "epoch": 9.374651965697739, + "grad_norm": 2.5077342987060547, + "learning_rate": 3.199764978055224e-05, + "loss": 0.1199, + "num_input_tokens_seen": 102441728, + "step": 84175 + }, + { + "epoch": 9.375208820581356, + "grad_norm": 0.05683113634586334, + "learning_rate": 3.1995317132615336e-05, + "loss": 0.0792, + "num_input_tokens_seen": 102447584, + "step": 84180 + }, + { + "epoch": 9.375765675464974, + "grad_norm": 2.0343470573425293, + "learning_rate": 3.199298441860204e-05, + "loss": 0.1206, + "num_input_tokens_seen": 102453568, + "step": 84185 + }, + { + "epoch": 9.376322530348592, + "grad_norm": 0.35332515835762024, + "learning_rate": 3.1990651638534416e-05, + "loss": 0.0426, + "num_input_tokens_seen": 102459808, + "step": 84190 + }, + { + "epoch": 9.376879385232208, + "grad_norm": 0.11562492698431015, + "learning_rate": 3.198831879243449e-05, + "loss": 0.0266, + "num_input_tokens_seen": 102466112, + "step": 84195 + }, + { + "epoch": 9.377436240115825, + "grad_norm": 0.06781558692455292, + "learning_rate": 3.1985985880324284e-05, + "loss": 0.0741, + "num_input_tokens_seen": 102471808, + "step": 84200 + }, + { + "epoch": 9.377993094999443, + "grad_norm": 0.5093222856521606, + "learning_rate": 3.198365290222585e-05, + "loss": 0.0534, + "num_input_tokens_seen": 102477728, + "step": 84205 + }, + { + "epoch": 9.37854994988306, + "grad_norm": 0.4238617718219757, + "learning_rate": 3.198131985816121e-05, + "loss": 0.0108, + "num_input_tokens_seen": 102483840, + "step": 84210 + }, + { + "epoch": 9.379106804766678, + "grad_norm": 0.35936570167541504, + "learning_rate": 3.1978986748152417e-05, + "loss": 0.0553, + "num_input_tokens_seen": 102490016, + "step": 84215 + }, + { + "epoch": 9.379663659650294, + "grad_norm": 0.06666581332683563, + "learning_rate": 3.19766535722215e-05, + "loss": 0.0024, + "num_input_tokens_seen": 102496256, + "step": 84220 + }, + { + "epoch": 9.380220514533912, + "grad_norm": 0.0945907011628151, + "learning_rate": 3.1974320330390505e-05, + "loss": 0.0872, + "num_input_tokens_seen": 102502560, + "step": 84225 + }, + { + "epoch": 9.38077736941753, + "grad_norm": 0.7460611462593079, + "learning_rate": 3.1971987022681465e-05, + "loss": 0.0501, + "num_input_tokens_seen": 102508768, + "step": 84230 + }, + { + "epoch": 9.381334224301147, + "grad_norm": 1.5749335289001465, + "learning_rate": 3.196965364911642e-05, + "loss": 0.1395, + "num_input_tokens_seen": 102514144, + "step": 84235 + }, + { + "epoch": 9.381891079184765, + "grad_norm": 0.0011056552175432444, + "learning_rate": 3.196732020971742e-05, + "loss": 0.1002, + "num_input_tokens_seen": 102520416, + "step": 84240 + }, + { + "epoch": 9.382447934068383, + "grad_norm": 0.7308976054191589, + "learning_rate": 3.1964986704506486e-05, + "loss": 0.1528, + "num_input_tokens_seen": 102526496, + "step": 84245 + }, + { + "epoch": 9.383004788951999, + "grad_norm": 0.24166102707386017, + "learning_rate": 3.196265313350568e-05, + "loss": 0.0065, + "num_input_tokens_seen": 102532608, + "step": 84250 + }, + { + "epoch": 9.383561643835616, + "grad_norm": 0.002503825817257166, + "learning_rate": 3.1960319496737036e-05, + "loss": 0.0005, + "num_input_tokens_seen": 102538816, + "step": 84255 + }, + { + "epoch": 9.384118498719234, + "grad_norm": 0.9434624314308167, + "learning_rate": 3.195798579422259e-05, + "loss": 0.0229, + "num_input_tokens_seen": 102545152, + "step": 84260 + }, + { + "epoch": 9.384675353602852, + "grad_norm": 0.927688717842102, + "learning_rate": 3.195565202598441e-05, + "loss": 0.0393, + "num_input_tokens_seen": 102551296, + "step": 84265 + }, + { + "epoch": 9.38523220848647, + "grad_norm": 0.06367339938879013, + "learning_rate": 3.195331819204451e-05, + "loss": 0.0072, + "num_input_tokens_seen": 102557440, + "step": 84270 + }, + { + "epoch": 9.385789063370085, + "grad_norm": 0.00878860428929329, + "learning_rate": 3.1950984292424954e-05, + "loss": 0.1292, + "num_input_tokens_seen": 102563552, + "step": 84275 + }, + { + "epoch": 9.386345918253703, + "grad_norm": 0.7153269052505493, + "learning_rate": 3.194865032714778e-05, + "loss": 0.0362, + "num_input_tokens_seen": 102569632, + "step": 84280 + }, + { + "epoch": 9.38690277313732, + "grad_norm": 0.019913863390684128, + "learning_rate": 3.1946316296235035e-05, + "loss": 0.0133, + "num_input_tokens_seen": 102575936, + "step": 84285 + }, + { + "epoch": 9.387459628020938, + "grad_norm": 0.6325975656509399, + "learning_rate": 3.1943982199708775e-05, + "loss": 0.0193, + "num_input_tokens_seen": 102582240, + "step": 84290 + }, + { + "epoch": 9.388016482904556, + "grad_norm": 0.07198836654424667, + "learning_rate": 3.194164803759103e-05, + "loss": 0.0225, + "num_input_tokens_seen": 102588384, + "step": 84295 + }, + { + "epoch": 9.388573337788172, + "grad_norm": 0.0073439511470496655, + "learning_rate": 3.1939313809903864e-05, + "loss": 0.0158, + "num_input_tokens_seen": 102594624, + "step": 84300 + }, + { + "epoch": 9.38913019267179, + "grad_norm": 0.01633393205702305, + "learning_rate": 3.193697951666932e-05, + "loss": 0.0918, + "num_input_tokens_seen": 102600832, + "step": 84305 + }, + { + "epoch": 9.389687047555407, + "grad_norm": 0.6760483384132385, + "learning_rate": 3.1934645157909446e-05, + "loss": 0.0345, + "num_input_tokens_seen": 102606880, + "step": 84310 + }, + { + "epoch": 9.390243902439025, + "grad_norm": 1.2063485383987427, + "learning_rate": 3.1932310733646284e-05, + "loss": 0.0465, + "num_input_tokens_seen": 102613248, + "step": 84315 + }, + { + "epoch": 9.390800757322642, + "grad_norm": 0.05748768523335457, + "learning_rate": 3.19299762439019e-05, + "loss": 0.0764, + "num_input_tokens_seen": 102619488, + "step": 84320 + }, + { + "epoch": 9.391357612206258, + "grad_norm": 0.0009556390577927232, + "learning_rate": 3.192764168869834e-05, + "loss": 0.1138, + "num_input_tokens_seen": 102625312, + "step": 84325 + }, + { + "epoch": 9.391914467089876, + "grad_norm": 0.024125684052705765, + "learning_rate": 3.192530706805765e-05, + "loss": 0.0395, + "num_input_tokens_seen": 102631616, + "step": 84330 + }, + { + "epoch": 9.392471321973494, + "grad_norm": 0.006700572557747364, + "learning_rate": 3.1922972382001894e-05, + "loss": 0.0828, + "num_input_tokens_seen": 102637664, + "step": 84335 + }, + { + "epoch": 9.393028176857111, + "grad_norm": 0.6213148236274719, + "learning_rate": 3.192063763055311e-05, + "loss": 0.099, + "num_input_tokens_seen": 102643744, + "step": 84340 + }, + { + "epoch": 9.393585031740729, + "grad_norm": 0.8675689101219177, + "learning_rate": 3.191830281373336e-05, + "loss": 0.0868, + "num_input_tokens_seen": 102649568, + "step": 84345 + }, + { + "epoch": 9.394141886624345, + "grad_norm": 0.0007341047748923302, + "learning_rate": 3.1915967931564695e-05, + "loss": 0.0486, + "num_input_tokens_seen": 102655648, + "step": 84350 + }, + { + "epoch": 9.394698741507963, + "grad_norm": 0.02159021608531475, + "learning_rate": 3.1913632984069176e-05, + "loss": 0.0472, + "num_input_tokens_seen": 102661760, + "step": 84355 + }, + { + "epoch": 9.39525559639158, + "grad_norm": 0.026077715680003166, + "learning_rate": 3.191129797126885e-05, + "loss": 0.0016, + "num_input_tokens_seen": 102668128, + "step": 84360 + }, + { + "epoch": 9.395812451275198, + "grad_norm": 0.018181683495640755, + "learning_rate": 3.190896289318578e-05, + "loss": 0.0062, + "num_input_tokens_seen": 102674496, + "step": 84365 + }, + { + "epoch": 9.396369306158816, + "grad_norm": 0.0026280812453478575, + "learning_rate": 3.190662774984202e-05, + "loss": 0.0396, + "num_input_tokens_seen": 102680800, + "step": 84370 + }, + { + "epoch": 9.396926161042432, + "grad_norm": 0.5445377230644226, + "learning_rate": 3.190429254125963e-05, + "loss": 0.0253, + "num_input_tokens_seen": 102687008, + "step": 84375 + }, + { + "epoch": 9.39748301592605, + "grad_norm": 0.08089102059602737, + "learning_rate": 3.190195726746066e-05, + "loss": 0.0035, + "num_input_tokens_seen": 102692960, + "step": 84380 + }, + { + "epoch": 9.398039870809667, + "grad_norm": 0.010158447548747063, + "learning_rate": 3.189962192846718e-05, + "loss": 0.025, + "num_input_tokens_seen": 102699488, + "step": 84385 + }, + { + "epoch": 9.398596725693285, + "grad_norm": 0.1927948296070099, + "learning_rate": 3.1897286524301236e-05, + "loss": 0.0057, + "num_input_tokens_seen": 102705824, + "step": 84390 + }, + { + "epoch": 9.399153580576902, + "grad_norm": 1.3089674711227417, + "learning_rate": 3.1894951054984905e-05, + "loss": 0.0157, + "num_input_tokens_seen": 102711872, + "step": 84395 + }, + { + "epoch": 9.39971043546052, + "grad_norm": 0.1478099524974823, + "learning_rate": 3.189261552054022e-05, + "loss": 0.0925, + "num_input_tokens_seen": 102718016, + "step": 84400 + }, + { + "epoch": 9.400267290344136, + "grad_norm": 0.14740291237831116, + "learning_rate": 3.189027992098928e-05, + "loss": 0.027, + "num_input_tokens_seen": 102723808, + "step": 84405 + }, + { + "epoch": 9.400824145227753, + "grad_norm": 0.36016279458999634, + "learning_rate": 3.188794425635411e-05, + "loss": 0.0373, + "num_input_tokens_seen": 102729952, + "step": 84410 + }, + { + "epoch": 9.401381000111371, + "grad_norm": 0.0437534861266613, + "learning_rate": 3.1885608526656796e-05, + "loss": 0.0455, + "num_input_tokens_seen": 102736224, + "step": 84415 + }, + { + "epoch": 9.401937854994989, + "grad_norm": 1.1569074392318726, + "learning_rate": 3.18832727319194e-05, + "loss": 0.09, + "num_input_tokens_seen": 102742016, + "step": 84420 + }, + { + "epoch": 9.402494709878606, + "grad_norm": 0.2345277965068817, + "learning_rate": 3.188093687216396e-05, + "loss": 0.0402, + "num_input_tokens_seen": 102748128, + "step": 84425 + }, + { + "epoch": 9.403051564762222, + "grad_norm": 0.040679123252630234, + "learning_rate": 3.187860094741257e-05, + "loss": 0.1902, + "num_input_tokens_seen": 102754272, + "step": 84430 + }, + { + "epoch": 9.40360841964584, + "grad_norm": 0.004460678435862064, + "learning_rate": 3.187626495768728e-05, + "loss": 0.0565, + "num_input_tokens_seen": 102760256, + "step": 84435 + }, + { + "epoch": 9.404165274529458, + "grad_norm": 1.6100083589553833, + "learning_rate": 3.187392890301016e-05, + "loss": 0.2252, + "num_input_tokens_seen": 102766208, + "step": 84440 + }, + { + "epoch": 9.404722129413075, + "grad_norm": 1.4862650632858276, + "learning_rate": 3.187159278340327e-05, + "loss": 0.066, + "num_input_tokens_seen": 102772480, + "step": 84445 + }, + { + "epoch": 9.405278984296693, + "grad_norm": 0.14370451867580414, + "learning_rate": 3.186925659888868e-05, + "loss": 0.0362, + "num_input_tokens_seen": 102778560, + "step": 84450 + }, + { + "epoch": 9.405835839180309, + "grad_norm": 0.08760868012905121, + "learning_rate": 3.186692034948846e-05, + "loss": 0.02, + "num_input_tokens_seen": 102784896, + "step": 84455 + }, + { + "epoch": 9.406392694063927, + "grad_norm": 2.362126111984253, + "learning_rate": 3.1864584035224674e-05, + "loss": 0.1676, + "num_input_tokens_seen": 102791072, + "step": 84460 + }, + { + "epoch": 9.406949548947544, + "grad_norm": 1.7183057069778442, + "learning_rate": 3.18622476561194e-05, + "loss": 0.0449, + "num_input_tokens_seen": 102797504, + "step": 84465 + }, + { + "epoch": 9.407506403831162, + "grad_norm": 0.6537988185882568, + "learning_rate": 3.1859911212194696e-05, + "loss": 0.0363, + "num_input_tokens_seen": 102803712, + "step": 84470 + }, + { + "epoch": 9.40806325871478, + "grad_norm": 0.16987355053424835, + "learning_rate": 3.1857574703472627e-05, + "loss": 0.0301, + "num_input_tokens_seen": 102809664, + "step": 84475 + }, + { + "epoch": 9.408620113598396, + "grad_norm": 0.596727192401886, + "learning_rate": 3.185523812997528e-05, + "loss": 0.0819, + "num_input_tokens_seen": 102815520, + "step": 84480 + }, + { + "epoch": 9.409176968482013, + "grad_norm": 0.3243715465068817, + "learning_rate": 3.1852901491724714e-05, + "loss": 0.0524, + "num_input_tokens_seen": 102821120, + "step": 84485 + }, + { + "epoch": 9.409733823365631, + "grad_norm": 0.44214722514152527, + "learning_rate": 3.1850564788743004e-05, + "loss": 0.0381, + "num_input_tokens_seen": 102827328, + "step": 84490 + }, + { + "epoch": 9.410290678249249, + "grad_norm": 0.001005512778647244, + "learning_rate": 3.184822802105221e-05, + "loss": 0.0088, + "num_input_tokens_seen": 102833696, + "step": 84495 + }, + { + "epoch": 9.410847533132866, + "grad_norm": 0.8531047105789185, + "learning_rate": 3.184589118867443e-05, + "loss": 0.1192, + "num_input_tokens_seen": 102840064, + "step": 84500 + }, + { + "epoch": 9.411404388016482, + "grad_norm": 0.002390923211351037, + "learning_rate": 3.184355429163172e-05, + "loss": 0.0439, + "num_input_tokens_seen": 102846368, + "step": 84505 + }, + { + "epoch": 9.4119612429001, + "grad_norm": 3.3178348541259766, + "learning_rate": 3.184121732994615e-05, + "loss": 0.0401, + "num_input_tokens_seen": 102852608, + "step": 84510 + }, + { + "epoch": 9.412518097783718, + "grad_norm": 0.04470902681350708, + "learning_rate": 3.183888030363981e-05, + "loss": 0.0297, + "num_input_tokens_seen": 102858880, + "step": 84515 + }, + { + "epoch": 9.413074952667335, + "grad_norm": 1.0711002349853516, + "learning_rate": 3.1836543212734754e-05, + "loss": 0.0357, + "num_input_tokens_seen": 102864480, + "step": 84520 + }, + { + "epoch": 9.413631807550953, + "grad_norm": 0.001530202105641365, + "learning_rate": 3.183420605725308e-05, + "loss": 0.0278, + "num_input_tokens_seen": 102870944, + "step": 84525 + }, + { + "epoch": 9.414188662434569, + "grad_norm": 0.08060815185308456, + "learning_rate": 3.183186883721685e-05, + "loss": 0.0135, + "num_input_tokens_seen": 102877120, + "step": 84530 + }, + { + "epoch": 9.414745517318186, + "grad_norm": 1.175944209098816, + "learning_rate": 3.182953155264815e-05, + "loss": 0.1118, + "num_input_tokens_seen": 102883168, + "step": 84535 + }, + { + "epoch": 9.415302372201804, + "grad_norm": 0.1239171028137207, + "learning_rate": 3.182719420356905e-05, + "loss": 0.0268, + "num_input_tokens_seen": 102889504, + "step": 84540 + }, + { + "epoch": 9.415859227085422, + "grad_norm": 1.6054446697235107, + "learning_rate": 3.182485679000162e-05, + "loss": 0.2185, + "num_input_tokens_seen": 102895040, + "step": 84545 + }, + { + "epoch": 9.41641608196904, + "grad_norm": 0.12214759737253189, + "learning_rate": 3.182251931196797e-05, + "loss": 0.0715, + "num_input_tokens_seen": 102901152, + "step": 84550 + }, + { + "epoch": 9.416972936852655, + "grad_norm": 1.0115383863449097, + "learning_rate": 3.182018176949014e-05, + "loss": 0.0892, + "num_input_tokens_seen": 102907680, + "step": 84555 + }, + { + "epoch": 9.417529791736273, + "grad_norm": 0.001913748332299292, + "learning_rate": 3.181784416259025e-05, + "loss": 0.0115, + "num_input_tokens_seen": 102913824, + "step": 84560 + }, + { + "epoch": 9.41808664661989, + "grad_norm": 2.1224937438964844, + "learning_rate": 3.181550649129034e-05, + "loss": 0.0181, + "num_input_tokens_seen": 102920448, + "step": 84565 + }, + { + "epoch": 9.418643501503508, + "grad_norm": 0.18231095373630524, + "learning_rate": 3.181316875561252e-05, + "loss": 0.0199, + "num_input_tokens_seen": 102926592, + "step": 84570 + }, + { + "epoch": 9.419200356387126, + "grad_norm": 0.00011100230040028691, + "learning_rate": 3.181083095557886e-05, + "loss": 0.0373, + "num_input_tokens_seen": 102932832, + "step": 84575 + }, + { + "epoch": 9.419757211270742, + "grad_norm": 0.2384411096572876, + "learning_rate": 3.180849309121144e-05, + "loss": 0.0351, + "num_input_tokens_seen": 102939264, + "step": 84580 + }, + { + "epoch": 9.42031406615436, + "grad_norm": 0.015218468382954597, + "learning_rate": 3.1806155162532366e-05, + "loss": 0.0622, + "num_input_tokens_seen": 102945152, + "step": 84585 + }, + { + "epoch": 9.420870921037977, + "grad_norm": 0.07644227892160416, + "learning_rate": 3.1803817169563685e-05, + "loss": 0.0499, + "num_input_tokens_seen": 102951328, + "step": 84590 + }, + { + "epoch": 9.421427775921595, + "grad_norm": 0.0002002370310947299, + "learning_rate": 3.180147911232751e-05, + "loss": 0.0711, + "num_input_tokens_seen": 102957280, + "step": 84595 + }, + { + "epoch": 9.421984630805213, + "grad_norm": 0.21130985021591187, + "learning_rate": 3.179914099084591e-05, + "loss": 0.0536, + "num_input_tokens_seen": 102963552, + "step": 84600 + }, + { + "epoch": 9.42254148568883, + "grad_norm": 0.0003675630141515285, + "learning_rate": 3.179680280514098e-05, + "loss": 0.005, + "num_input_tokens_seen": 102969280, + "step": 84605 + }, + { + "epoch": 9.423098340572446, + "grad_norm": 1.5405904054641724, + "learning_rate": 3.1794464555234796e-05, + "loss": 0.2311, + "num_input_tokens_seen": 102975424, + "step": 84610 + }, + { + "epoch": 9.423655195456064, + "grad_norm": 1.002102017402649, + "learning_rate": 3.1792126241149455e-05, + "loss": 0.0435, + "num_input_tokens_seen": 102981344, + "step": 84615 + }, + { + "epoch": 9.424212050339682, + "grad_norm": 0.18240554630756378, + "learning_rate": 3.1789787862907046e-05, + "loss": 0.0839, + "num_input_tokens_seen": 102987232, + "step": 84620 + }, + { + "epoch": 9.4247689052233, + "grad_norm": 0.028839487582445145, + "learning_rate": 3.178744942052963e-05, + "loss": 0.0192, + "num_input_tokens_seen": 102993696, + "step": 84625 + }, + { + "epoch": 9.425325760106917, + "grad_norm": 0.0013663277495652437, + "learning_rate": 3.1785110914039334e-05, + "loss": 0.005, + "num_input_tokens_seen": 102999552, + "step": 84630 + }, + { + "epoch": 9.425882614990533, + "grad_norm": 0.04463532939553261, + "learning_rate": 3.1782772343458226e-05, + "loss": 0.0643, + "num_input_tokens_seen": 103004736, + "step": 84635 + }, + { + "epoch": 9.42643946987415, + "grad_norm": 0.09202942252159119, + "learning_rate": 3.17804337088084e-05, + "loss": 0.0147, + "num_input_tokens_seen": 103010848, + "step": 84640 + }, + { + "epoch": 9.426996324757768, + "grad_norm": 0.17395980656147003, + "learning_rate": 3.177809501011195e-05, + "loss": 0.0351, + "num_input_tokens_seen": 103016832, + "step": 84645 + }, + { + "epoch": 9.427553179641386, + "grad_norm": 0.022996801882982254, + "learning_rate": 3.177575624739095e-05, + "loss": 0.0523, + "num_input_tokens_seen": 103023072, + "step": 84650 + }, + { + "epoch": 9.428110034525004, + "grad_norm": 0.8374307155609131, + "learning_rate": 3.1773417420667505e-05, + "loss": 0.0511, + "num_input_tokens_seen": 103029312, + "step": 84655 + }, + { + "epoch": 9.42866688940862, + "grad_norm": 0.0007608357118442655, + "learning_rate": 3.177107852996371e-05, + "loss": 0.0113, + "num_input_tokens_seen": 103035264, + "step": 84660 + }, + { + "epoch": 9.429223744292237, + "grad_norm": 0.4485451877117157, + "learning_rate": 3.1768739575301654e-05, + "loss": 0.0463, + "num_input_tokens_seen": 103041440, + "step": 84665 + }, + { + "epoch": 9.429780599175855, + "grad_norm": 0.054942093789577484, + "learning_rate": 3.176640055670343e-05, + "loss": 0.014, + "num_input_tokens_seen": 103047712, + "step": 84670 + }, + { + "epoch": 9.430337454059472, + "grad_norm": 1.1750481128692627, + "learning_rate": 3.176406147419113e-05, + "loss": 0.1423, + "num_input_tokens_seen": 103053280, + "step": 84675 + }, + { + "epoch": 9.43089430894309, + "grad_norm": 0.005350544583052397, + "learning_rate": 3.1761722327786854e-05, + "loss": 0.0268, + "num_input_tokens_seen": 103059584, + "step": 84680 + }, + { + "epoch": 9.431451163826706, + "grad_norm": 0.0036622919142246246, + "learning_rate": 3.175938311751269e-05, + "loss": 0.0134, + "num_input_tokens_seen": 103065824, + "step": 84685 + }, + { + "epoch": 9.432008018710324, + "grad_norm": 2.005382776260376, + "learning_rate": 3.175704384339073e-05, + "loss": 0.0808, + "num_input_tokens_seen": 103072480, + "step": 84690 + }, + { + "epoch": 9.432564873593941, + "grad_norm": 0.010196829214692116, + "learning_rate": 3.1754704505443086e-05, + "loss": 0.0028, + "num_input_tokens_seen": 103078560, + "step": 84695 + }, + { + "epoch": 9.433121728477559, + "grad_norm": 0.03938649967312813, + "learning_rate": 3.175236510369184e-05, + "loss": 0.0263, + "num_input_tokens_seen": 103084544, + "step": 84700 + }, + { + "epoch": 9.433678583361177, + "grad_norm": 0.07499221712350845, + "learning_rate": 3.17500256381591e-05, + "loss": 0.0691, + "num_input_tokens_seen": 103090976, + "step": 84705 + }, + { + "epoch": 9.434235438244793, + "grad_norm": 0.15926514565944672, + "learning_rate": 3.174768610886696e-05, + "loss": 0.0374, + "num_input_tokens_seen": 103097216, + "step": 84710 + }, + { + "epoch": 9.43479229312841, + "grad_norm": 0.03648081049323082, + "learning_rate": 3.1745346515837524e-05, + "loss": 0.1004, + "num_input_tokens_seen": 103103104, + "step": 84715 + }, + { + "epoch": 9.435349148012028, + "grad_norm": 0.08448697626590729, + "learning_rate": 3.1743006859092874e-05, + "loss": 0.0255, + "num_input_tokens_seen": 103109408, + "step": 84720 + }, + { + "epoch": 9.435906002895646, + "grad_norm": 0.010085388086736202, + "learning_rate": 3.174066713865513e-05, + "loss": 0.0321, + "num_input_tokens_seen": 103115328, + "step": 84725 + }, + { + "epoch": 9.436462857779263, + "grad_norm": 1.579547643661499, + "learning_rate": 3.1738327354546383e-05, + "loss": 0.1061, + "num_input_tokens_seen": 103121312, + "step": 84730 + }, + { + "epoch": 9.43701971266288, + "grad_norm": 0.004984942264854908, + "learning_rate": 3.173598750678874e-05, + "loss": 0.0237, + "num_input_tokens_seen": 103126880, + "step": 84735 + }, + { + "epoch": 9.437576567546497, + "grad_norm": 0.36164435744285583, + "learning_rate": 3.1733647595404286e-05, + "loss": 0.1237, + "num_input_tokens_seen": 103133024, + "step": 84740 + }, + { + "epoch": 9.438133422430115, + "grad_norm": 0.9833382964134216, + "learning_rate": 3.173130762041514e-05, + "loss": 0.0446, + "num_input_tokens_seen": 103138944, + "step": 84745 + }, + { + "epoch": 9.438690277313732, + "grad_norm": 0.05844106897711754, + "learning_rate": 3.172896758184341e-05, + "loss": 0.0014, + "num_input_tokens_seen": 103145216, + "step": 84750 + }, + { + "epoch": 9.43924713219735, + "grad_norm": 0.0769381895661354, + "learning_rate": 3.1726627479711176e-05, + "loss": 0.094, + "num_input_tokens_seen": 103151616, + "step": 84755 + }, + { + "epoch": 9.439803987080968, + "grad_norm": 0.32222941517829895, + "learning_rate": 3.1724287314040564e-05, + "loss": 0.0092, + "num_input_tokens_seen": 103157664, + "step": 84760 + }, + { + "epoch": 9.440360841964583, + "grad_norm": 0.1050761491060257, + "learning_rate": 3.172194708485367e-05, + "loss": 0.1217, + "num_input_tokens_seen": 103163296, + "step": 84765 + }, + { + "epoch": 9.440917696848201, + "grad_norm": 0.13193422555923462, + "learning_rate": 3.17196067921726e-05, + "loss": 0.0119, + "num_input_tokens_seen": 103169312, + "step": 84770 + }, + { + "epoch": 9.441474551731819, + "grad_norm": 0.3386804759502411, + "learning_rate": 3.171726643601946e-05, + "loss": 0.0053, + "num_input_tokens_seen": 103175264, + "step": 84775 + }, + { + "epoch": 9.442031406615436, + "grad_norm": 0.0009044691105373204, + "learning_rate": 3.171492601641636e-05, + "loss": 0.0276, + "num_input_tokens_seen": 103181408, + "step": 84780 + }, + { + "epoch": 9.442588261499054, + "grad_norm": 0.447070837020874, + "learning_rate": 3.17125855333854e-05, + "loss": 0.0351, + "num_input_tokens_seen": 103187392, + "step": 84785 + }, + { + "epoch": 9.44314511638267, + "grad_norm": 0.00018493429524824023, + "learning_rate": 3.171024498694869e-05, + "loss": 0.0284, + "num_input_tokens_seen": 103192768, + "step": 84790 + }, + { + "epoch": 9.443701971266288, + "grad_norm": 0.05023206025362015, + "learning_rate": 3.170790437712834e-05, + "loss": 0.0664, + "num_input_tokens_seen": 103198976, + "step": 84795 + }, + { + "epoch": 9.444258826149905, + "grad_norm": 1.1320992708206177, + "learning_rate": 3.1705563703946466e-05, + "loss": 0.0531, + "num_input_tokens_seen": 103205056, + "step": 84800 + }, + { + "epoch": 9.444815681033523, + "grad_norm": 0.07455829530954361, + "learning_rate": 3.170322296742516e-05, + "loss": 0.0348, + "num_input_tokens_seen": 103211264, + "step": 84805 + }, + { + "epoch": 9.44537253591714, + "grad_norm": 0.33291229605674744, + "learning_rate": 3.170088216758656e-05, + "loss": 0.0086, + "num_input_tokens_seen": 103217312, + "step": 84810 + }, + { + "epoch": 9.445929390800757, + "grad_norm": 0.1038324385881424, + "learning_rate": 3.169854130445274e-05, + "loss": 0.0732, + "num_input_tokens_seen": 103223360, + "step": 84815 + }, + { + "epoch": 9.446486245684374, + "grad_norm": 1.2018004655838013, + "learning_rate": 3.169620037804584e-05, + "loss": 0.0889, + "num_input_tokens_seen": 103229664, + "step": 84820 + }, + { + "epoch": 9.447043100567992, + "grad_norm": 0.013383770361542702, + "learning_rate": 3.169385938838796e-05, + "loss": 0.0493, + "num_input_tokens_seen": 103235680, + "step": 84825 + }, + { + "epoch": 9.44759995545161, + "grad_norm": 0.060608431696891785, + "learning_rate": 3.1691518335501215e-05, + "loss": 0.038, + "num_input_tokens_seen": 103241856, + "step": 84830 + }, + { + "epoch": 9.448156810335227, + "grad_norm": 1.2785379886627197, + "learning_rate": 3.1689177219407715e-05, + "loss": 0.1411, + "num_input_tokens_seen": 103248000, + "step": 84835 + }, + { + "epoch": 9.448713665218843, + "grad_norm": 0.0006519518792629242, + "learning_rate": 3.168683604012958e-05, + "loss": 0.0615, + "num_input_tokens_seen": 103254208, + "step": 84840 + }, + { + "epoch": 9.449270520102461, + "grad_norm": 0.7104271054267883, + "learning_rate": 3.168449479768893e-05, + "loss": 0.051, + "num_input_tokens_seen": 103260096, + "step": 84845 + }, + { + "epoch": 9.449827374986079, + "grad_norm": 0.8507111072540283, + "learning_rate": 3.168215349210786e-05, + "loss": 0.1618, + "num_input_tokens_seen": 103266112, + "step": 84850 + }, + { + "epoch": 9.450384229869696, + "grad_norm": 1.4527310132980347, + "learning_rate": 3.167981212340849e-05, + "loss": 0.0471, + "num_input_tokens_seen": 103272384, + "step": 84855 + }, + { + "epoch": 9.450941084753314, + "grad_norm": 0.06286457926034927, + "learning_rate": 3.167747069161296e-05, + "loss": 0.0336, + "num_input_tokens_seen": 103278656, + "step": 84860 + }, + { + "epoch": 9.45149793963693, + "grad_norm": 0.011329573579132557, + "learning_rate": 3.1675129196743355e-05, + "loss": 0.0255, + "num_input_tokens_seen": 103284960, + "step": 84865 + }, + { + "epoch": 9.452054794520548, + "grad_norm": 0.012391314841806889, + "learning_rate": 3.1672787638821824e-05, + "loss": 0.0235, + "num_input_tokens_seen": 103291104, + "step": 84870 + }, + { + "epoch": 9.452611649404165, + "grad_norm": 0.7712931632995605, + "learning_rate": 3.167044601787045e-05, + "loss": 0.0663, + "num_input_tokens_seen": 103297120, + "step": 84875 + }, + { + "epoch": 9.453168504287783, + "grad_norm": 0.7753166556358337, + "learning_rate": 3.166810433391137e-05, + "loss": 0.0429, + "num_input_tokens_seen": 103302656, + "step": 84880 + }, + { + "epoch": 9.4537253591714, + "grad_norm": 0.18008092045783997, + "learning_rate": 3.166576258696672e-05, + "loss": 0.011, + "num_input_tokens_seen": 103308832, + "step": 84885 + }, + { + "epoch": 9.454282214055016, + "grad_norm": 0.026289675384759903, + "learning_rate": 3.166342077705858e-05, + "loss": 0.0453, + "num_input_tokens_seen": 103314880, + "step": 84890 + }, + { + "epoch": 9.454839068938634, + "grad_norm": 0.282100111246109, + "learning_rate": 3.1661078904209115e-05, + "loss": 0.035, + "num_input_tokens_seen": 103320864, + "step": 84895 + }, + { + "epoch": 9.455395923822252, + "grad_norm": 0.15842999517917633, + "learning_rate": 3.16587369684404e-05, + "loss": 0.0039, + "num_input_tokens_seen": 103326912, + "step": 84900 + }, + { + "epoch": 9.45595277870587, + "grad_norm": 1.1784145832061768, + "learning_rate": 3.1656394969774595e-05, + "loss": 0.0935, + "num_input_tokens_seen": 103332960, + "step": 84905 + }, + { + "epoch": 9.456509633589487, + "grad_norm": 0.1770496964454651, + "learning_rate": 3.16540529082338e-05, + "loss": 0.0746, + "num_input_tokens_seen": 103339008, + "step": 84910 + }, + { + "epoch": 9.457066488473103, + "grad_norm": 0.03479939326643944, + "learning_rate": 3.1651710783840144e-05, + "loss": 0.02, + "num_input_tokens_seen": 103345120, + "step": 84915 + }, + { + "epoch": 9.45762334335672, + "grad_norm": 0.8262555003166199, + "learning_rate": 3.1649368596615755e-05, + "loss": 0.0643, + "num_input_tokens_seen": 103351360, + "step": 84920 + }, + { + "epoch": 9.458180198240338, + "grad_norm": 0.08100516349077225, + "learning_rate": 3.164702634658275e-05, + "loss": 0.0539, + "num_input_tokens_seen": 103357408, + "step": 84925 + }, + { + "epoch": 9.458737053123956, + "grad_norm": 0.40763530135154724, + "learning_rate": 3.164468403376326e-05, + "loss": 0.1292, + "num_input_tokens_seen": 103363104, + "step": 84930 + }, + { + "epoch": 9.459293908007574, + "grad_norm": 0.1693315953016281, + "learning_rate": 3.1642341658179395e-05, + "loss": 0.0323, + "num_input_tokens_seen": 103369344, + "step": 84935 + }, + { + "epoch": 9.45985076289119, + "grad_norm": 0.06818687170743942, + "learning_rate": 3.16399992198533e-05, + "loss": 0.0041, + "num_input_tokens_seen": 103375744, + "step": 84940 + }, + { + "epoch": 9.460407617774807, + "grad_norm": 0.16045159101486206, + "learning_rate": 3.1637656718807084e-05, + "loss": 0.0036, + "num_input_tokens_seen": 103382208, + "step": 84945 + }, + { + "epoch": 9.460964472658425, + "grad_norm": 0.4459827244281769, + "learning_rate": 3.163531415506288e-05, + "loss": 0.0273, + "num_input_tokens_seen": 103388416, + "step": 84950 + }, + { + "epoch": 9.461521327542043, + "grad_norm": 0.008824959397315979, + "learning_rate": 3.163297152864283e-05, + "loss": 0.0283, + "num_input_tokens_seen": 103394336, + "step": 84955 + }, + { + "epoch": 9.46207818242566, + "grad_norm": 0.1868392676115036, + "learning_rate": 3.163062883956904e-05, + "loss": 0.0502, + "num_input_tokens_seen": 103399808, + "step": 84960 + }, + { + "epoch": 9.462635037309278, + "grad_norm": 0.06913867592811584, + "learning_rate": 3.162828608786366e-05, + "loss": 0.073, + "num_input_tokens_seen": 103405824, + "step": 84965 + }, + { + "epoch": 9.463191892192894, + "grad_norm": 0.8706284165382385, + "learning_rate": 3.16259432735488e-05, + "loss": 0.056, + "num_input_tokens_seen": 103412064, + "step": 84970 + }, + { + "epoch": 9.463748747076512, + "grad_norm": 0.07045342028141022, + "learning_rate": 3.162360039664659e-05, + "loss": 0.0015, + "num_input_tokens_seen": 103418400, + "step": 84975 + }, + { + "epoch": 9.46430560196013, + "grad_norm": 0.45296624302864075, + "learning_rate": 3.162125745717918e-05, + "loss": 0.0434, + "num_input_tokens_seen": 103424640, + "step": 84980 + }, + { + "epoch": 9.464862456843747, + "grad_norm": 0.0002445148420520127, + "learning_rate": 3.161891445516869e-05, + "loss": 0.0572, + "num_input_tokens_seen": 103430720, + "step": 84985 + }, + { + "epoch": 9.465419311727365, + "grad_norm": 0.05090859904885292, + "learning_rate": 3.161657139063724e-05, + "loss": 0.016, + "num_input_tokens_seen": 103436768, + "step": 84990 + }, + { + "epoch": 9.46597616661098, + "grad_norm": 0.29715022444725037, + "learning_rate": 3.161422826360697e-05, + "loss": 0.0069, + "num_input_tokens_seen": 103442432, + "step": 84995 + }, + { + "epoch": 9.466533021494598, + "grad_norm": 0.017893163487315178, + "learning_rate": 3.161188507410003e-05, + "loss": 0.0145, + "num_input_tokens_seen": 103448672, + "step": 85000 + }, + { + "epoch": 9.467089876378216, + "grad_norm": 0.011324598453938961, + "learning_rate": 3.160954182213853e-05, + "loss": 0.0281, + "num_input_tokens_seen": 103455104, + "step": 85005 + }, + { + "epoch": 9.467646731261834, + "grad_norm": 1.96012282371521, + "learning_rate": 3.160719850774461e-05, + "loss": 0.242, + "num_input_tokens_seen": 103461248, + "step": 85010 + }, + { + "epoch": 9.468203586145451, + "grad_norm": 0.0048199850134551525, + "learning_rate": 3.160485513094041e-05, + "loss": 0.0589, + "num_input_tokens_seen": 103467520, + "step": 85015 + }, + { + "epoch": 9.468760441029067, + "grad_norm": 0.6177287697792053, + "learning_rate": 3.1602511691748055e-05, + "loss": 0.0284, + "num_input_tokens_seen": 103473792, + "step": 85020 + }, + { + "epoch": 9.469317295912685, + "grad_norm": 0.0002174976107198745, + "learning_rate": 3.16001681901897e-05, + "loss": 0.0356, + "num_input_tokens_seen": 103480032, + "step": 85025 + }, + { + "epoch": 9.469874150796302, + "grad_norm": 0.42424288392066956, + "learning_rate": 3.159782462628745e-05, + "loss": 0.0169, + "num_input_tokens_seen": 103486336, + "step": 85030 + }, + { + "epoch": 9.47043100567992, + "grad_norm": 1.0667200088500977, + "learning_rate": 3.159548100006347e-05, + "loss": 0.0652, + "num_input_tokens_seen": 103491488, + "step": 85035 + }, + { + "epoch": 9.470987860563538, + "grad_norm": 1.105980634689331, + "learning_rate": 3.1593137311539886e-05, + "loss": 0.0388, + "num_input_tokens_seen": 103497408, + "step": 85040 + }, + { + "epoch": 9.471544715447154, + "grad_norm": 0.03220508247613907, + "learning_rate": 3.1590793560738843e-05, + "loss": 0.0386, + "num_input_tokens_seen": 103502976, + "step": 85045 + }, + { + "epoch": 9.472101570330771, + "grad_norm": 0.0018500803271308541, + "learning_rate": 3.158844974768247e-05, + "loss": 0.0505, + "num_input_tokens_seen": 103508992, + "step": 85050 + }, + { + "epoch": 9.472658425214389, + "grad_norm": 0.05500912666320801, + "learning_rate": 3.158610587239291e-05, + "loss": 0.0401, + "num_input_tokens_seen": 103514880, + "step": 85055 + }, + { + "epoch": 9.473215280098007, + "grad_norm": 0.030634501948952675, + "learning_rate": 3.1583761934892294e-05, + "loss": 0.0066, + "num_input_tokens_seen": 103521152, + "step": 85060 + }, + { + "epoch": 9.473772134981624, + "grad_norm": 0.3315444588661194, + "learning_rate": 3.158141793520279e-05, + "loss": 0.0316, + "num_input_tokens_seen": 103527104, + "step": 85065 + }, + { + "epoch": 9.47432898986524, + "grad_norm": 0.7039896249771118, + "learning_rate": 3.157907387334651e-05, + "loss": 0.0339, + "num_input_tokens_seen": 103533120, + "step": 85070 + }, + { + "epoch": 9.474885844748858, + "grad_norm": 0.6203030347824097, + "learning_rate": 3.157672974934561e-05, + "loss": 0.0906, + "num_input_tokens_seen": 103538912, + "step": 85075 + }, + { + "epoch": 9.475442699632476, + "grad_norm": 0.04577318951487541, + "learning_rate": 3.1574385563222225e-05, + "loss": 0.0749, + "num_input_tokens_seen": 103544544, + "step": 85080 + }, + { + "epoch": 9.475999554516093, + "grad_norm": 0.4254806935787201, + "learning_rate": 3.157204131499851e-05, + "loss": 0.0873, + "num_input_tokens_seen": 103550944, + "step": 85085 + }, + { + "epoch": 9.476556409399711, + "grad_norm": 0.01363389939069748, + "learning_rate": 3.156969700469658e-05, + "loss": 0.0034, + "num_input_tokens_seen": 103557120, + "step": 85090 + }, + { + "epoch": 9.477113264283329, + "grad_norm": 1.104827642440796, + "learning_rate": 3.156735263233862e-05, + "loss": 0.0748, + "num_input_tokens_seen": 103563040, + "step": 85095 + }, + { + "epoch": 9.477670119166945, + "grad_norm": 0.014463246800005436, + "learning_rate": 3.156500819794674e-05, + "loss": 0.138, + "num_input_tokens_seen": 103569184, + "step": 85100 + }, + { + "epoch": 9.478226974050562, + "grad_norm": 0.0989653542637825, + "learning_rate": 3.1562663701543106e-05, + "loss": 0.0353, + "num_input_tokens_seen": 103575328, + "step": 85105 + }, + { + "epoch": 9.47878382893418, + "grad_norm": 0.42963123321533203, + "learning_rate": 3.156031914314985e-05, + "loss": 0.0868, + "num_input_tokens_seen": 103581504, + "step": 85110 + }, + { + "epoch": 9.479340683817798, + "grad_norm": 0.13747622072696686, + "learning_rate": 3.155797452278912e-05, + "loss": 0.0419, + "num_input_tokens_seen": 103588000, + "step": 85115 + }, + { + "epoch": 9.479897538701415, + "grad_norm": 0.011323075741529465, + "learning_rate": 3.155562984048308e-05, + "loss": 0.0872, + "num_input_tokens_seen": 103594144, + "step": 85120 + }, + { + "epoch": 9.480454393585031, + "grad_norm": 0.20528250932693481, + "learning_rate": 3.155328509625385e-05, + "loss": 0.0451, + "num_input_tokens_seen": 103600192, + "step": 85125 + }, + { + "epoch": 9.481011248468649, + "grad_norm": 1.2178490161895752, + "learning_rate": 3.155094029012359e-05, + "loss": 0.1547, + "num_input_tokens_seen": 103606336, + "step": 85130 + }, + { + "epoch": 9.481568103352267, + "grad_norm": 0.04884656146168709, + "learning_rate": 3.154859542211446e-05, + "loss": 0.0146, + "num_input_tokens_seen": 103612480, + "step": 85135 + }, + { + "epoch": 9.482124958235884, + "grad_norm": 0.02724427357316017, + "learning_rate": 3.15462504922486e-05, + "loss": 0.015, + "num_input_tokens_seen": 103618912, + "step": 85140 + }, + { + "epoch": 9.482681813119502, + "grad_norm": 1.0721555948257446, + "learning_rate": 3.154390550054815e-05, + "loss": 0.0318, + "num_input_tokens_seen": 103625024, + "step": 85145 + }, + { + "epoch": 9.483238668003118, + "grad_norm": 0.11817514896392822, + "learning_rate": 3.154156044703528e-05, + "loss": 0.0024, + "num_input_tokens_seen": 103631424, + "step": 85150 + }, + { + "epoch": 9.483795522886735, + "grad_norm": 0.013662554323673248, + "learning_rate": 3.1539215331732125e-05, + "loss": 0.081, + "num_input_tokens_seen": 103637376, + "step": 85155 + }, + { + "epoch": 9.484352377770353, + "grad_norm": 0.005609635729342699, + "learning_rate": 3.153687015466085e-05, + "loss": 0.0639, + "num_input_tokens_seen": 103643392, + "step": 85160 + }, + { + "epoch": 9.48490923265397, + "grad_norm": 2.227818489074707, + "learning_rate": 3.1534524915843586e-05, + "loss": 0.095, + "num_input_tokens_seen": 103648864, + "step": 85165 + }, + { + "epoch": 9.485466087537588, + "grad_norm": 0.0011813149321824312, + "learning_rate": 3.153217961530251e-05, + "loss": 0.0035, + "num_input_tokens_seen": 103654912, + "step": 85170 + }, + { + "epoch": 9.486022942421204, + "grad_norm": 0.11936422437429428, + "learning_rate": 3.152983425305975e-05, + "loss": 0.0038, + "num_input_tokens_seen": 103661152, + "step": 85175 + }, + { + "epoch": 9.486579797304822, + "grad_norm": 0.4691615104675293, + "learning_rate": 3.152748882913749e-05, + "loss": 0.0456, + "num_input_tokens_seen": 103667264, + "step": 85180 + }, + { + "epoch": 9.48713665218844, + "grad_norm": 0.01174166053533554, + "learning_rate": 3.152514334355786e-05, + "loss": 0.0376, + "num_input_tokens_seen": 103673664, + "step": 85185 + }, + { + "epoch": 9.487693507072057, + "grad_norm": 0.36517998576164246, + "learning_rate": 3.152279779634302e-05, + "loss": 0.0094, + "num_input_tokens_seen": 103679712, + "step": 85190 + }, + { + "epoch": 9.488250361955675, + "grad_norm": 0.006342385429888964, + "learning_rate": 3.152045218751514e-05, + "loss": 0.03, + "num_input_tokens_seen": 103685824, + "step": 85195 + }, + { + "epoch": 9.488807216839291, + "grad_norm": 0.0009895754046738148, + "learning_rate": 3.151810651709636e-05, + "loss": 0.0781, + "num_input_tokens_seen": 103691712, + "step": 85200 + }, + { + "epoch": 9.489364071722909, + "grad_norm": 0.00674655893817544, + "learning_rate": 3.151576078510884e-05, + "loss": 0.063, + "num_input_tokens_seen": 103698048, + "step": 85205 + }, + { + "epoch": 9.489920926606526, + "grad_norm": 0.0006360249826684594, + "learning_rate": 3.1513414991574736e-05, + "loss": 0.0476, + "num_input_tokens_seen": 103704384, + "step": 85210 + }, + { + "epoch": 9.490477781490144, + "grad_norm": 0.7505905032157898, + "learning_rate": 3.151106913651621e-05, + "loss": 0.009, + "num_input_tokens_seen": 103710208, + "step": 85215 + }, + { + "epoch": 9.491034636373762, + "grad_norm": 0.023591389879584312, + "learning_rate": 3.150872321995543e-05, + "loss": 0.0173, + "num_input_tokens_seen": 103716256, + "step": 85220 + }, + { + "epoch": 9.491591491257378, + "grad_norm": 0.00014722030027769506, + "learning_rate": 3.150637724191453e-05, + "loss": 0.0179, + "num_input_tokens_seen": 103721920, + "step": 85225 + }, + { + "epoch": 9.492148346140995, + "grad_norm": 0.4429142475128174, + "learning_rate": 3.150403120241569e-05, + "loss": 0.0725, + "num_input_tokens_seen": 103727904, + "step": 85230 + }, + { + "epoch": 9.492705201024613, + "grad_norm": 2.4700095653533936, + "learning_rate": 3.150168510148107e-05, + "loss": 0.0386, + "num_input_tokens_seen": 103733920, + "step": 85235 + }, + { + "epoch": 9.49326205590823, + "grad_norm": 0.7455558776855469, + "learning_rate": 3.1499338939132814e-05, + "loss": 0.1408, + "num_input_tokens_seen": 103740224, + "step": 85240 + }, + { + "epoch": 9.493818910791848, + "grad_norm": 0.025499997660517693, + "learning_rate": 3.14969927153931e-05, + "loss": 0.0982, + "num_input_tokens_seen": 103746208, + "step": 85245 + }, + { + "epoch": 9.494375765675464, + "grad_norm": 0.10427714139223099, + "learning_rate": 3.149464643028409e-05, + "loss": 0.0036, + "num_input_tokens_seen": 103752672, + "step": 85250 + }, + { + "epoch": 9.494932620559082, + "grad_norm": 0.362048864364624, + "learning_rate": 3.1492300083827934e-05, + "loss": 0.0508, + "num_input_tokens_seen": 103758656, + "step": 85255 + }, + { + "epoch": 9.4954894754427, + "grad_norm": 0.6073670983314514, + "learning_rate": 3.14899536760468e-05, + "loss": 0.1187, + "num_input_tokens_seen": 103764768, + "step": 85260 + }, + { + "epoch": 9.496046330326317, + "grad_norm": 1.6060433387756348, + "learning_rate": 3.148760720696286e-05, + "loss": 0.1506, + "num_input_tokens_seen": 103770848, + "step": 85265 + }, + { + "epoch": 9.496603185209935, + "grad_norm": 0.0024164312053471804, + "learning_rate": 3.148526067659827e-05, + "loss": 0.0101, + "num_input_tokens_seen": 103776672, + "step": 85270 + }, + { + "epoch": 9.49716004009355, + "grad_norm": 0.001792731462046504, + "learning_rate": 3.14829140849752e-05, + "loss": 0.0061, + "num_input_tokens_seen": 103782752, + "step": 85275 + }, + { + "epoch": 9.497716894977168, + "grad_norm": 0.8595190644264221, + "learning_rate": 3.1480567432115804e-05, + "loss": 0.1058, + "num_input_tokens_seen": 103788352, + "step": 85280 + }, + { + "epoch": 9.498273749860786, + "grad_norm": 0.001459560007788241, + "learning_rate": 3.1478220718042265e-05, + "loss": 0.0331, + "num_input_tokens_seen": 103794464, + "step": 85285 + }, + { + "epoch": 9.498830604744404, + "grad_norm": 0.5116769075393677, + "learning_rate": 3.1475873942776734e-05, + "loss": 0.0644, + "num_input_tokens_seen": 103800320, + "step": 85290 + }, + { + "epoch": 9.499387459628021, + "grad_norm": 0.09039776027202606, + "learning_rate": 3.147352710634139e-05, + "loss": 0.073, + "num_input_tokens_seen": 103806464, + "step": 85295 + }, + { + "epoch": 9.49994431451164, + "grad_norm": 0.5412148833274841, + "learning_rate": 3.14711802087584e-05, + "loss": 0.0197, + "num_input_tokens_seen": 103812480, + "step": 85300 + }, + { + "epoch": 9.500501169395255, + "grad_norm": 0.42190101742744446, + "learning_rate": 3.146883325004992e-05, + "loss": 0.1061, + "num_input_tokens_seen": 103818528, + "step": 85305 + }, + { + "epoch": 9.501058024278873, + "grad_norm": 0.28336822986602783, + "learning_rate": 3.1466486230238134e-05, + "loss": 0.1086, + "num_input_tokens_seen": 103824288, + "step": 85310 + }, + { + "epoch": 9.50161487916249, + "grad_norm": 1.6646850109100342, + "learning_rate": 3.146413914934519e-05, + "loss": 0.1832, + "num_input_tokens_seen": 103829792, + "step": 85315 + }, + { + "epoch": 9.502171734046108, + "grad_norm": 0.914813220500946, + "learning_rate": 3.1461792007393285e-05, + "loss": 0.0782, + "num_input_tokens_seen": 103835712, + "step": 85320 + }, + { + "epoch": 9.502728588929726, + "grad_norm": 0.0012444255407899618, + "learning_rate": 3.1459444804404584e-05, + "loss": 0.1664, + "num_input_tokens_seen": 103841824, + "step": 85325 + }, + { + "epoch": 9.503285443813342, + "grad_norm": 0.1494455188512802, + "learning_rate": 3.145709754040124e-05, + "loss": 0.0176, + "num_input_tokens_seen": 103847936, + "step": 85330 + }, + { + "epoch": 9.50384229869696, + "grad_norm": 0.005373327527195215, + "learning_rate": 3.145475021540545e-05, + "loss": 0.0225, + "num_input_tokens_seen": 103854048, + "step": 85335 + }, + { + "epoch": 9.504399153580577, + "grad_norm": 0.38647839426994324, + "learning_rate": 3.145240282943935e-05, + "loss": 0.0332, + "num_input_tokens_seen": 103860320, + "step": 85340 + }, + { + "epoch": 9.504956008464195, + "grad_norm": 0.7989434599876404, + "learning_rate": 3.145005538252516e-05, + "loss": 0.0616, + "num_input_tokens_seen": 103866304, + "step": 85345 + }, + { + "epoch": 9.505512863347812, + "grad_norm": 0.4820212423801422, + "learning_rate": 3.1447707874685015e-05, + "loss": 0.2182, + "num_input_tokens_seen": 103872640, + "step": 85350 + }, + { + "epoch": 9.506069718231428, + "grad_norm": 1.0638198852539062, + "learning_rate": 3.144536030594111e-05, + "loss": 0.0202, + "num_input_tokens_seen": 103878912, + "step": 85355 + }, + { + "epoch": 9.506626573115046, + "grad_norm": 0.03630482777953148, + "learning_rate": 3.144301267631561e-05, + "loss": 0.0259, + "num_input_tokens_seen": 103885248, + "step": 85360 + }, + { + "epoch": 9.507183427998664, + "grad_norm": 0.018993981182575226, + "learning_rate": 3.144066498583069e-05, + "loss": 0.0115, + "num_input_tokens_seen": 103890976, + "step": 85365 + }, + { + "epoch": 9.507740282882281, + "grad_norm": 0.4402005970478058, + "learning_rate": 3.143831723450853e-05, + "loss": 0.0195, + "num_input_tokens_seen": 103897056, + "step": 85370 + }, + { + "epoch": 9.508297137765899, + "grad_norm": 0.0004892784054391086, + "learning_rate": 3.14359694223713e-05, + "loss": 0.0218, + "num_input_tokens_seen": 103903200, + "step": 85375 + }, + { + "epoch": 9.508853992649515, + "grad_norm": 0.70885169506073, + "learning_rate": 3.14336215494412e-05, + "loss": 0.0702, + "num_input_tokens_seen": 103909280, + "step": 85380 + }, + { + "epoch": 9.509410847533132, + "grad_norm": 0.6252517104148865, + "learning_rate": 3.1431273615740373e-05, + "loss": 0.1321, + "num_input_tokens_seen": 103915424, + "step": 85385 + }, + { + "epoch": 9.50996770241675, + "grad_norm": 0.624068558216095, + "learning_rate": 3.1428925621291025e-05, + "loss": 0.0276, + "num_input_tokens_seen": 103921472, + "step": 85390 + }, + { + "epoch": 9.510524557300368, + "grad_norm": 0.05618235096335411, + "learning_rate": 3.1426577566115316e-05, + "loss": 0.0223, + "num_input_tokens_seen": 103927488, + "step": 85395 + }, + { + "epoch": 9.511081412183986, + "grad_norm": 0.050520118325948715, + "learning_rate": 3.142422945023544e-05, + "loss": 0.0037, + "num_input_tokens_seen": 103933664, + "step": 85400 + }, + { + "epoch": 9.511638267067601, + "grad_norm": 0.052665021270513535, + "learning_rate": 3.1421881273673566e-05, + "loss": 0.0613, + "num_input_tokens_seen": 103940096, + "step": 85405 + }, + { + "epoch": 9.512195121951219, + "grad_norm": 0.036141399294137955, + "learning_rate": 3.1419533036451876e-05, + "loss": 0.0043, + "num_input_tokens_seen": 103946176, + "step": 85410 + }, + { + "epoch": 9.512751976834837, + "grad_norm": 0.05594050884246826, + "learning_rate": 3.141718473859256e-05, + "loss": 0.0329, + "num_input_tokens_seen": 103952320, + "step": 85415 + }, + { + "epoch": 9.513308831718454, + "grad_norm": 0.10318908840417862, + "learning_rate": 3.141483638011779e-05, + "loss": 0.048, + "num_input_tokens_seen": 103958688, + "step": 85420 + }, + { + "epoch": 9.513865686602072, + "grad_norm": 0.5972126126289368, + "learning_rate": 3.1412487961049744e-05, + "loss": 0.0227, + "num_input_tokens_seen": 103964864, + "step": 85425 + }, + { + "epoch": 9.514422541485688, + "grad_norm": 1.1337960958480835, + "learning_rate": 3.141013948141062e-05, + "loss": 0.0752, + "num_input_tokens_seen": 103971136, + "step": 85430 + }, + { + "epoch": 9.514979396369306, + "grad_norm": 0.001604145742021501, + "learning_rate": 3.140779094122259e-05, + "loss": 0.0477, + "num_input_tokens_seen": 103977120, + "step": 85435 + }, + { + "epoch": 9.515536251252923, + "grad_norm": 0.00891557801514864, + "learning_rate": 3.140544234050784e-05, + "loss": 0.0351, + "num_input_tokens_seen": 103983360, + "step": 85440 + }, + { + "epoch": 9.516093106136541, + "grad_norm": 0.00020242204482201487, + "learning_rate": 3.140309367928856e-05, + "loss": 0.1109, + "num_input_tokens_seen": 103989248, + "step": 85445 + }, + { + "epoch": 9.516649961020159, + "grad_norm": 0.011549350805580616, + "learning_rate": 3.140074495758692e-05, + "loss": 0.0111, + "num_input_tokens_seen": 103995360, + "step": 85450 + }, + { + "epoch": 9.517206815903776, + "grad_norm": 0.02372669242322445, + "learning_rate": 3.139839617542513e-05, + "loss": 0.0325, + "num_input_tokens_seen": 104001440, + "step": 85455 + }, + { + "epoch": 9.517763670787392, + "grad_norm": 3.3271589279174805, + "learning_rate": 3.1396047332825345e-05, + "loss": 0.0665, + "num_input_tokens_seen": 104007712, + "step": 85460 + }, + { + "epoch": 9.51832052567101, + "grad_norm": 0.07615996897220612, + "learning_rate": 3.139369842980978e-05, + "loss": 0.0141, + "num_input_tokens_seen": 104014080, + "step": 85465 + }, + { + "epoch": 9.518877380554628, + "grad_norm": 0.0014310521073639393, + "learning_rate": 3.1391349466400606e-05, + "loss": 0.0227, + "num_input_tokens_seen": 104020352, + "step": 85470 + }, + { + "epoch": 9.519434235438245, + "grad_norm": 0.01185675896704197, + "learning_rate": 3.1389000442620015e-05, + "loss": 0.1296, + "num_input_tokens_seen": 104026304, + "step": 85475 + }, + { + "epoch": 9.519991090321863, + "grad_norm": 0.04716344550251961, + "learning_rate": 3.1386651358490196e-05, + "loss": 0.0032, + "num_input_tokens_seen": 104032320, + "step": 85480 + }, + { + "epoch": 9.520547945205479, + "grad_norm": 0.0011650110827758908, + "learning_rate": 3.138430221403334e-05, + "loss": 0.0202, + "num_input_tokens_seen": 104037728, + "step": 85485 + }, + { + "epoch": 9.521104800089097, + "grad_norm": 0.010330542922019958, + "learning_rate": 3.138195300927164e-05, + "loss": 0.0108, + "num_input_tokens_seen": 104044032, + "step": 85490 + }, + { + "epoch": 9.521661654972714, + "grad_norm": 0.08271005004644394, + "learning_rate": 3.137960374422727e-05, + "loss": 0.0215, + "num_input_tokens_seen": 104050208, + "step": 85495 + }, + { + "epoch": 9.522218509856332, + "grad_norm": 0.2117154747247696, + "learning_rate": 3.1377254418922434e-05, + "loss": 0.0255, + "num_input_tokens_seen": 104056384, + "step": 85500 + }, + { + "epoch": 9.52277536473995, + "grad_norm": 0.0024739354848861694, + "learning_rate": 3.137490503337933e-05, + "loss": 0.0386, + "num_input_tokens_seen": 104062624, + "step": 85505 + }, + { + "epoch": 9.523332219623565, + "grad_norm": 0.00045769650023430586, + "learning_rate": 3.137255558762013e-05, + "loss": 0.0112, + "num_input_tokens_seen": 104068640, + "step": 85510 + }, + { + "epoch": 9.523889074507183, + "grad_norm": 0.5980697274208069, + "learning_rate": 3.137020608166705e-05, + "loss": 0.025, + "num_input_tokens_seen": 104074592, + "step": 85515 + }, + { + "epoch": 9.5244459293908, + "grad_norm": 0.7921797037124634, + "learning_rate": 3.1367856515542254e-05, + "loss": 0.0134, + "num_input_tokens_seen": 104080960, + "step": 85520 + }, + { + "epoch": 9.525002784274418, + "grad_norm": 0.42490872740745544, + "learning_rate": 3.136550688926796e-05, + "loss": 0.1014, + "num_input_tokens_seen": 104086720, + "step": 85525 + }, + { + "epoch": 9.525559639158036, + "grad_norm": 0.033879511058330536, + "learning_rate": 3.136315720286635e-05, + "loss": 0.0315, + "num_input_tokens_seen": 104092960, + "step": 85530 + }, + { + "epoch": 9.526116494041652, + "grad_norm": 0.010085581801831722, + "learning_rate": 3.136080745635962e-05, + "loss": 0.1438, + "num_input_tokens_seen": 104099168, + "step": 85535 + }, + { + "epoch": 9.52667334892527, + "grad_norm": 0.4309697449207306, + "learning_rate": 3.135845764976998e-05, + "loss": 0.1056, + "num_input_tokens_seen": 104104672, + "step": 85540 + }, + { + "epoch": 9.527230203808887, + "grad_norm": 0.9164520502090454, + "learning_rate": 3.13561077831196e-05, + "loss": 0.0481, + "num_input_tokens_seen": 104110688, + "step": 85545 + }, + { + "epoch": 9.527787058692505, + "grad_norm": 0.026238009333610535, + "learning_rate": 3.135375785643069e-05, + "loss": 0.0083, + "num_input_tokens_seen": 104117024, + "step": 85550 + }, + { + "epoch": 9.528343913576123, + "grad_norm": 0.078498475253582, + "learning_rate": 3.135140786972545e-05, + "loss": 0.0207, + "num_input_tokens_seen": 104123168, + "step": 85555 + }, + { + "epoch": 9.528900768459739, + "grad_norm": 0.0074761095456779, + "learning_rate": 3.134905782302607e-05, + "loss": 0.0706, + "num_input_tokens_seen": 104129312, + "step": 85560 + }, + { + "epoch": 9.529457623343356, + "grad_norm": 0.23061197996139526, + "learning_rate": 3.134670771635476e-05, + "loss": 0.0374, + "num_input_tokens_seen": 104135488, + "step": 85565 + }, + { + "epoch": 9.530014478226974, + "grad_norm": 0.20428398251533508, + "learning_rate": 3.1344357549733714e-05, + "loss": 0.0341, + "num_input_tokens_seen": 104141440, + "step": 85570 + }, + { + "epoch": 9.530571333110592, + "grad_norm": 0.604715883731842, + "learning_rate": 3.134200732318512e-05, + "loss": 0.0366, + "num_input_tokens_seen": 104147552, + "step": 85575 + }, + { + "epoch": 9.53112818799421, + "grad_norm": 2.268972396850586, + "learning_rate": 3.133965703673119e-05, + "loss": 0.1263, + "num_input_tokens_seen": 104153760, + "step": 85580 + }, + { + "epoch": 9.531685042877825, + "grad_norm": 0.0009754009661264718, + "learning_rate": 3.133730669039411e-05, + "loss": 0.0015, + "num_input_tokens_seen": 104159904, + "step": 85585 + }, + { + "epoch": 9.532241897761443, + "grad_norm": 1.608825922012329, + "learning_rate": 3.13349562841961e-05, + "loss": 0.1603, + "num_input_tokens_seen": 104166144, + "step": 85590 + }, + { + "epoch": 9.53279875264506, + "grad_norm": 0.237697035074234, + "learning_rate": 3.133260581815934e-05, + "loss": 0.0104, + "num_input_tokens_seen": 104172192, + "step": 85595 + }, + { + "epoch": 9.533355607528678, + "grad_norm": 0.01792333275079727, + "learning_rate": 3.1330255292306067e-05, + "loss": 0.0327, + "num_input_tokens_seen": 104178080, + "step": 85600 + }, + { + "epoch": 9.533912462412296, + "grad_norm": 0.5071533918380737, + "learning_rate": 3.1327904706658446e-05, + "loss": 0.0471, + "num_input_tokens_seen": 104184224, + "step": 85605 + }, + { + "epoch": 9.534469317295912, + "grad_norm": 0.506177544593811, + "learning_rate": 3.13255540612387e-05, + "loss": 0.0795, + "num_input_tokens_seen": 104190208, + "step": 85610 + }, + { + "epoch": 9.53502617217953, + "grad_norm": 0.24735115468502045, + "learning_rate": 3.132320335606902e-05, + "loss": 0.0353, + "num_input_tokens_seen": 104196096, + "step": 85615 + }, + { + "epoch": 9.535583027063147, + "grad_norm": 0.22674822807312012, + "learning_rate": 3.132085259117163e-05, + "loss": 0.0193, + "num_input_tokens_seen": 104202336, + "step": 85620 + }, + { + "epoch": 9.536139881946765, + "grad_norm": 1.5725864171981812, + "learning_rate": 3.131850176656871e-05, + "loss": 0.0574, + "num_input_tokens_seen": 104208416, + "step": 85625 + }, + { + "epoch": 9.536696736830383, + "grad_norm": 0.00015513543621636927, + "learning_rate": 3.131615088228249e-05, + "loss": 0.0081, + "num_input_tokens_seen": 104214784, + "step": 85630 + }, + { + "epoch": 9.537253591713998, + "grad_norm": 0.00019842041365336627, + "learning_rate": 3.131379993833516e-05, + "loss": 0.0497, + "num_input_tokens_seen": 104221024, + "step": 85635 + }, + { + "epoch": 9.537810446597616, + "grad_norm": 1.5330475568771362, + "learning_rate": 3.1311448934748926e-05, + "loss": 0.0375, + "num_input_tokens_seen": 104227392, + "step": 85640 + }, + { + "epoch": 9.538367301481234, + "grad_norm": 0.0014876648783683777, + "learning_rate": 3.130909787154601e-05, + "loss": 0.1828, + "num_input_tokens_seen": 104233408, + "step": 85645 + }, + { + "epoch": 9.538924156364851, + "grad_norm": 0.5529204607009888, + "learning_rate": 3.1306746748748606e-05, + "loss": 0.0317, + "num_input_tokens_seen": 104239136, + "step": 85650 + }, + { + "epoch": 9.53948101124847, + "grad_norm": 0.09013094007968903, + "learning_rate": 3.130439556637892e-05, + "loss": 0.028, + "num_input_tokens_seen": 104245440, + "step": 85655 + }, + { + "epoch": 9.540037866132085, + "grad_norm": 1.5573995113372803, + "learning_rate": 3.1302044324459175e-05, + "loss": 0.0267, + "num_input_tokens_seen": 104251552, + "step": 85660 + }, + { + "epoch": 9.540594721015703, + "grad_norm": 2.995086193084717, + "learning_rate": 3.129969302301157e-05, + "loss": 0.1868, + "num_input_tokens_seen": 104257504, + "step": 85665 + }, + { + "epoch": 9.54115157589932, + "grad_norm": 0.3504260182380676, + "learning_rate": 3.1297341662058314e-05, + "loss": 0.0947, + "num_input_tokens_seen": 104263456, + "step": 85670 + }, + { + "epoch": 9.541708430782938, + "grad_norm": 0.0002716944145504385, + "learning_rate": 3.129499024162163e-05, + "loss": 0.0073, + "num_input_tokens_seen": 104269760, + "step": 85675 + }, + { + "epoch": 9.542265285666556, + "grad_norm": 0.003452420001849532, + "learning_rate": 3.1292638761723715e-05, + "loss": 0.0679, + "num_input_tokens_seen": 104276320, + "step": 85680 + }, + { + "epoch": 9.542822140550173, + "grad_norm": 0.2857401967048645, + "learning_rate": 3.129028722238678e-05, + "loss": 0.1486, + "num_input_tokens_seen": 104282624, + "step": 85685 + }, + { + "epoch": 9.54337899543379, + "grad_norm": 0.00038463217788375914, + "learning_rate": 3.128793562363304e-05, + "loss": 0.0079, + "num_input_tokens_seen": 104288896, + "step": 85690 + }, + { + "epoch": 9.543935850317407, + "grad_norm": 0.09642466902732849, + "learning_rate": 3.128558396548472e-05, + "loss": 0.0083, + "num_input_tokens_seen": 104295328, + "step": 85695 + }, + { + "epoch": 9.544492705201025, + "grad_norm": 0.7334728837013245, + "learning_rate": 3.1283232247964016e-05, + "loss": 0.0533, + "num_input_tokens_seen": 104300800, + "step": 85700 + }, + { + "epoch": 9.545049560084642, + "grad_norm": 0.008557797409594059, + "learning_rate": 3.1280880471093155e-05, + "loss": 0.0077, + "num_input_tokens_seen": 104306688, + "step": 85705 + }, + { + "epoch": 9.54560641496826, + "grad_norm": 0.07528983056545258, + "learning_rate": 3.1278528634894344e-05, + "loss": 0.074, + "num_input_tokens_seen": 104311872, + "step": 85710 + }, + { + "epoch": 9.546163269851876, + "grad_norm": 1.6979304552078247, + "learning_rate": 3.12761767393898e-05, + "loss": 0.0388, + "num_input_tokens_seen": 104317856, + "step": 85715 + }, + { + "epoch": 9.546720124735494, + "grad_norm": 0.032828692346811295, + "learning_rate": 3.127382478460174e-05, + "loss": 0.034, + "num_input_tokens_seen": 104323936, + "step": 85720 + }, + { + "epoch": 9.547276979619111, + "grad_norm": 0.14493677020072937, + "learning_rate": 3.127147277055237e-05, + "loss": 0.0326, + "num_input_tokens_seen": 104329856, + "step": 85725 + }, + { + "epoch": 9.547833834502729, + "grad_norm": 0.4118160307407379, + "learning_rate": 3.126912069726392e-05, + "loss": 0.0548, + "num_input_tokens_seen": 104336096, + "step": 85730 + }, + { + "epoch": 9.548390689386347, + "grad_norm": 0.49820610880851746, + "learning_rate": 3.1266768564758604e-05, + "loss": 0.0223, + "num_input_tokens_seen": 104342048, + "step": 85735 + }, + { + "epoch": 9.548947544269963, + "grad_norm": 0.7789983749389648, + "learning_rate": 3.126441637305864e-05, + "loss": 0.007, + "num_input_tokens_seen": 104348544, + "step": 85740 + }, + { + "epoch": 9.54950439915358, + "grad_norm": 0.018073854967951775, + "learning_rate": 3.126206412218624e-05, + "loss": 0.0938, + "num_input_tokens_seen": 104354496, + "step": 85745 + }, + { + "epoch": 9.550061254037198, + "grad_norm": 0.2451035976409912, + "learning_rate": 3.1259711812163635e-05, + "loss": 0.0094, + "num_input_tokens_seen": 104360512, + "step": 85750 + }, + { + "epoch": 9.550618108920816, + "grad_norm": 0.23189030587673187, + "learning_rate": 3.125735944301302e-05, + "loss": 0.0295, + "num_input_tokens_seen": 104366752, + "step": 85755 + }, + { + "epoch": 9.551174963804433, + "grad_norm": 0.29941651225090027, + "learning_rate": 3.1255007014756646e-05, + "loss": 0.0066, + "num_input_tokens_seen": 104373024, + "step": 85760 + }, + { + "epoch": 9.551731818688049, + "grad_norm": 1.0574183464050293, + "learning_rate": 3.125265452741672e-05, + "loss": 0.0557, + "num_input_tokens_seen": 104379552, + "step": 85765 + }, + { + "epoch": 9.552288673571667, + "grad_norm": 1.5331398248672485, + "learning_rate": 3.125030198101546e-05, + "loss": 0.0568, + "num_input_tokens_seen": 104385632, + "step": 85770 + }, + { + "epoch": 9.552845528455284, + "grad_norm": 0.5487685203552246, + "learning_rate": 3.124794937557508e-05, + "loss": 0.0374, + "num_input_tokens_seen": 104391712, + "step": 85775 + }, + { + "epoch": 9.553402383338902, + "grad_norm": 0.0026118403766304255, + "learning_rate": 3.1245596711117824e-05, + "loss": 0.0198, + "num_input_tokens_seen": 104397376, + "step": 85780 + }, + { + "epoch": 9.55395923822252, + "grad_norm": 0.0006027670460753143, + "learning_rate": 3.12432439876659e-05, + "loss": 0.0362, + "num_input_tokens_seen": 104403648, + "step": 85785 + }, + { + "epoch": 9.554516093106137, + "grad_norm": 0.1088423952460289, + "learning_rate": 3.124089120524154e-05, + "loss": 0.0831, + "num_input_tokens_seen": 104409952, + "step": 85790 + }, + { + "epoch": 9.555072947989753, + "grad_norm": 0.7510665655136108, + "learning_rate": 3.1238538363866956e-05, + "loss": 0.0471, + "num_input_tokens_seen": 104415840, + "step": 85795 + }, + { + "epoch": 9.555629802873371, + "grad_norm": 0.0014430348528549075, + "learning_rate": 3.1236185463564384e-05, + "loss": 0.0252, + "num_input_tokens_seen": 104421824, + "step": 85800 + }, + { + "epoch": 9.556186657756989, + "grad_norm": 0.11448323726654053, + "learning_rate": 3.123383250435603e-05, + "loss": 0.0178, + "num_input_tokens_seen": 104428128, + "step": 85805 + }, + { + "epoch": 9.556743512640606, + "grad_norm": 0.010373711585998535, + "learning_rate": 3.123147948626415e-05, + "loss": 0.0694, + "num_input_tokens_seen": 104434528, + "step": 85810 + }, + { + "epoch": 9.557300367524224, + "grad_norm": 1.2432001829147339, + "learning_rate": 3.1229126409310945e-05, + "loss": 0.0399, + "num_input_tokens_seen": 104440672, + "step": 85815 + }, + { + "epoch": 9.55785722240784, + "grad_norm": 0.09445229172706604, + "learning_rate": 3.122677327351865e-05, + "loss": 0.103, + "num_input_tokens_seen": 104446560, + "step": 85820 + }, + { + "epoch": 9.558414077291458, + "grad_norm": 0.0271645188331604, + "learning_rate": 3.122442007890951e-05, + "loss": 0.0686, + "num_input_tokens_seen": 104452576, + "step": 85825 + }, + { + "epoch": 9.558970932175075, + "grad_norm": 0.0008045642753131688, + "learning_rate": 3.1222066825505714e-05, + "loss": 0.0876, + "num_input_tokens_seen": 104458752, + "step": 85830 + }, + { + "epoch": 9.559527787058693, + "grad_norm": 0.0031546049285680056, + "learning_rate": 3.1219713513329516e-05, + "loss": 0.0097, + "num_input_tokens_seen": 104464704, + "step": 85835 + }, + { + "epoch": 9.56008464194231, + "grad_norm": 0.5636831521987915, + "learning_rate": 3.1217360142403146e-05, + "loss": 0.0091, + "num_input_tokens_seen": 104470720, + "step": 85840 + }, + { + "epoch": 9.560641496825927, + "grad_norm": 0.14754870533943176, + "learning_rate": 3.121500671274882e-05, + "loss": 0.0043, + "num_input_tokens_seen": 104476960, + "step": 85845 + }, + { + "epoch": 9.561198351709544, + "grad_norm": 1.4131288528442383, + "learning_rate": 3.121265322438879e-05, + "loss": 0.0308, + "num_input_tokens_seen": 104482688, + "step": 85850 + }, + { + "epoch": 9.561755206593162, + "grad_norm": 0.1418285071849823, + "learning_rate": 3.121029967734526e-05, + "loss": 0.0061, + "num_input_tokens_seen": 104488768, + "step": 85855 + }, + { + "epoch": 9.56231206147678, + "grad_norm": 0.0009755811770446599, + "learning_rate": 3.1207946071640484e-05, + "loss": 0.0328, + "num_input_tokens_seen": 104495104, + "step": 85860 + }, + { + "epoch": 9.562868916360397, + "grad_norm": 0.26021265983581543, + "learning_rate": 3.120559240729667e-05, + "loss": 0.0812, + "num_input_tokens_seen": 104501344, + "step": 85865 + }, + { + "epoch": 9.563425771244013, + "grad_norm": 0.0011092873755842447, + "learning_rate": 3.120323868433607e-05, + "loss": 0.0123, + "num_input_tokens_seen": 104507488, + "step": 85870 + }, + { + "epoch": 9.56398262612763, + "grad_norm": 0.0399431511759758, + "learning_rate": 3.120088490278091e-05, + "loss": 0.0152, + "num_input_tokens_seen": 104513408, + "step": 85875 + }, + { + "epoch": 9.564539481011249, + "grad_norm": 1.2988032102584839, + "learning_rate": 3.119853106265343e-05, + "loss": 0.051, + "num_input_tokens_seen": 104519424, + "step": 85880 + }, + { + "epoch": 9.565096335894866, + "grad_norm": 2.44696307182312, + "learning_rate": 3.1196177163975856e-05, + "loss": 0.2735, + "num_input_tokens_seen": 104525664, + "step": 85885 + }, + { + "epoch": 9.565653190778484, + "grad_norm": 0.007876550778746605, + "learning_rate": 3.119382320677042e-05, + "loss": 0.0092, + "num_input_tokens_seen": 104531840, + "step": 85890 + }, + { + "epoch": 9.5662100456621, + "grad_norm": 0.001980441389605403, + "learning_rate": 3.119146919105937e-05, + "loss": 0.0171, + "num_input_tokens_seen": 104537472, + "step": 85895 + }, + { + "epoch": 9.566766900545717, + "grad_norm": 0.004858039785176516, + "learning_rate": 3.118911511686492e-05, + "loss": 0.0536, + "num_input_tokens_seen": 104543456, + "step": 85900 + }, + { + "epoch": 9.567323755429335, + "grad_norm": 0.03976082801818848, + "learning_rate": 3.118676098420933e-05, + "loss": 0.0186, + "num_input_tokens_seen": 104549440, + "step": 85905 + }, + { + "epoch": 9.567880610312953, + "grad_norm": 0.13864074647426605, + "learning_rate": 3.118440679311482e-05, + "loss": 0.0465, + "num_input_tokens_seen": 104555648, + "step": 85910 + }, + { + "epoch": 9.56843746519657, + "grad_norm": 0.7328222990036011, + "learning_rate": 3.118205254360364e-05, + "loss": 0.0886, + "num_input_tokens_seen": 104561888, + "step": 85915 + }, + { + "epoch": 9.568994320080186, + "grad_norm": 0.43322399258613586, + "learning_rate": 3.1179698235698014e-05, + "loss": 0.0382, + "num_input_tokens_seen": 104568096, + "step": 85920 + }, + { + "epoch": 9.569551174963804, + "grad_norm": 0.0003918147995136678, + "learning_rate": 3.1177343869420185e-05, + "loss": 0.0568, + "num_input_tokens_seen": 104574400, + "step": 85925 + }, + { + "epoch": 9.570108029847422, + "grad_norm": 2.537494659423828, + "learning_rate": 3.11749894447924e-05, + "loss": 0.0212, + "num_input_tokens_seen": 104580928, + "step": 85930 + }, + { + "epoch": 9.57066488473104, + "grad_norm": 1.3338515758514404, + "learning_rate": 3.1172634961836886e-05, + "loss": 0.0761, + "num_input_tokens_seen": 104587328, + "step": 85935 + }, + { + "epoch": 9.571221739614657, + "grad_norm": 0.41860851645469666, + "learning_rate": 3.1170280420575894e-05, + "loss": 0.0399, + "num_input_tokens_seen": 104593504, + "step": 85940 + }, + { + "epoch": 9.571778594498273, + "grad_norm": 0.09317202866077423, + "learning_rate": 3.1167925821031664e-05, + "loss": 0.0391, + "num_input_tokens_seen": 104599712, + "step": 85945 + }, + { + "epoch": 9.57233544938189, + "grad_norm": 1.6102874279022217, + "learning_rate": 3.1165571163226426e-05, + "loss": 0.1656, + "num_input_tokens_seen": 104605760, + "step": 85950 + }, + { + "epoch": 9.572892304265508, + "grad_norm": 0.8124723434448242, + "learning_rate": 3.116321644718243e-05, + "loss": 0.0543, + "num_input_tokens_seen": 104611328, + "step": 85955 + }, + { + "epoch": 9.573449159149126, + "grad_norm": 0.0005083895521238446, + "learning_rate": 3.116086167292192e-05, + "loss": 0.0075, + "num_input_tokens_seen": 104617472, + "step": 85960 + }, + { + "epoch": 9.574006014032744, + "grad_norm": 1.4872223138809204, + "learning_rate": 3.115850684046713e-05, + "loss": 0.1127, + "num_input_tokens_seen": 104623360, + "step": 85965 + }, + { + "epoch": 9.57456286891636, + "grad_norm": 0.09965778142213821, + "learning_rate": 3.1156151949840315e-05, + "loss": 0.1556, + "num_input_tokens_seen": 104629440, + "step": 85970 + }, + { + "epoch": 9.575119723799977, + "grad_norm": 0.6474587321281433, + "learning_rate": 3.115379700106371e-05, + "loss": 0.0883, + "num_input_tokens_seen": 104635904, + "step": 85975 + }, + { + "epoch": 9.575676578683595, + "grad_norm": 0.0029754014685750008, + "learning_rate": 3.1151441994159555e-05, + "loss": 0.0178, + "num_input_tokens_seen": 104642144, + "step": 85980 + }, + { + "epoch": 9.576233433567213, + "grad_norm": 1.9165327548980713, + "learning_rate": 3.114908692915011e-05, + "loss": 0.1042, + "num_input_tokens_seen": 104648672, + "step": 85985 + }, + { + "epoch": 9.57679028845083, + "grad_norm": 0.11787613481283188, + "learning_rate": 3.1146731806057616e-05, + "loss": 0.1142, + "num_input_tokens_seen": 104654848, + "step": 85990 + }, + { + "epoch": 9.577347143334446, + "grad_norm": 0.006741105578839779, + "learning_rate": 3.114437662490431e-05, + "loss": 0.0108, + "num_input_tokens_seen": 104661088, + "step": 85995 + }, + { + "epoch": 9.577903998218064, + "grad_norm": 3.3350863456726074, + "learning_rate": 3.1142021385712436e-05, + "loss": 0.0456, + "num_input_tokens_seen": 104667520, + "step": 86000 + }, + { + "epoch": 9.578460853101681, + "grad_norm": 0.030892612412571907, + "learning_rate": 3.113966608850427e-05, + "loss": 0.0009, + "num_input_tokens_seen": 104673664, + "step": 86005 + }, + { + "epoch": 9.5790177079853, + "grad_norm": 0.004157577641308308, + "learning_rate": 3.1137310733302015e-05, + "loss": 0.0024, + "num_input_tokens_seen": 104679296, + "step": 86010 + }, + { + "epoch": 9.579574562868917, + "grad_norm": 0.9228656888008118, + "learning_rate": 3.1134955320127953e-05, + "loss": 0.0281, + "num_input_tokens_seen": 104685280, + "step": 86015 + }, + { + "epoch": 9.580131417752535, + "grad_norm": 0.15364891290664673, + "learning_rate": 3.113259984900433e-05, + "loss": 0.0348, + "num_input_tokens_seen": 104690624, + "step": 86020 + }, + { + "epoch": 9.58068827263615, + "grad_norm": 0.6894879937171936, + "learning_rate": 3.113024431995338e-05, + "loss": 0.0233, + "num_input_tokens_seen": 104697024, + "step": 86025 + }, + { + "epoch": 9.581245127519768, + "grad_norm": 0.04101654887199402, + "learning_rate": 3.112788873299736e-05, + "loss": 0.0732, + "num_input_tokens_seen": 104702912, + "step": 86030 + }, + { + "epoch": 9.581801982403386, + "grad_norm": 0.04519401863217354, + "learning_rate": 3.112553308815853e-05, + "loss": 0.0084, + "num_input_tokens_seen": 104709024, + "step": 86035 + }, + { + "epoch": 9.582358837287003, + "grad_norm": 0.0022640787065029144, + "learning_rate": 3.1123177385459125e-05, + "loss": 0.0058, + "num_input_tokens_seen": 104715296, + "step": 86040 + }, + { + "epoch": 9.582915692170621, + "grad_norm": 0.0016544999089092016, + "learning_rate": 3.1120821624921406e-05, + "loss": 0.0104, + "num_input_tokens_seen": 104721312, + "step": 86045 + }, + { + "epoch": 9.583472547054237, + "grad_norm": 0.23429149389266968, + "learning_rate": 3.111846580656762e-05, + "loss": 0.0148, + "num_input_tokens_seen": 104727616, + "step": 86050 + }, + { + "epoch": 9.584029401937855, + "grad_norm": 0.09881100803613663, + "learning_rate": 3.1116109930420024e-05, + "loss": 0.0175, + "num_input_tokens_seen": 104734016, + "step": 86055 + }, + { + "epoch": 9.584586256821472, + "grad_norm": 0.17092619836330414, + "learning_rate": 3.111375399650087e-05, + "loss": 0.0194, + "num_input_tokens_seen": 104740160, + "step": 86060 + }, + { + "epoch": 9.58514311170509, + "grad_norm": 0.5536221265792847, + "learning_rate": 3.1111398004832414e-05, + "loss": 0.0335, + "num_input_tokens_seen": 104746528, + "step": 86065 + }, + { + "epoch": 9.585699966588708, + "grad_norm": 0.5023964643478394, + "learning_rate": 3.1109041955436903e-05, + "loss": 0.0238, + "num_input_tokens_seen": 104752864, + "step": 86070 + }, + { + "epoch": 9.586256821472324, + "grad_norm": 0.001308786915615201, + "learning_rate": 3.1106685848336596e-05, + "loss": 0.0442, + "num_input_tokens_seen": 104759104, + "step": 86075 + }, + { + "epoch": 9.586813676355941, + "grad_norm": 0.09721936285495758, + "learning_rate": 3.110432968355375e-05, + "loss": 0.0543, + "num_input_tokens_seen": 104764864, + "step": 86080 + }, + { + "epoch": 9.587370531239559, + "grad_norm": 0.01112970057874918, + "learning_rate": 3.110197346111062e-05, + "loss": 0.0056, + "num_input_tokens_seen": 104771136, + "step": 86085 + }, + { + "epoch": 9.587927386123177, + "grad_norm": 0.05548188462853432, + "learning_rate": 3.109961718102946e-05, + "loss": 0.0071, + "num_input_tokens_seen": 104777184, + "step": 86090 + }, + { + "epoch": 9.588484241006794, + "grad_norm": 0.056622158735990524, + "learning_rate": 3.109726084333253e-05, + "loss": 0.0579, + "num_input_tokens_seen": 104782560, + "step": 86095 + }, + { + "epoch": 9.58904109589041, + "grad_norm": 1.6323028802871704, + "learning_rate": 3.109490444804209e-05, + "loss": 0.152, + "num_input_tokens_seen": 104788704, + "step": 86100 + }, + { + "epoch": 9.589597950774028, + "grad_norm": 0.000848097843118012, + "learning_rate": 3.1092547995180395e-05, + "loss": 0.006, + "num_input_tokens_seen": 104795424, + "step": 86105 + }, + { + "epoch": 9.590154805657646, + "grad_norm": 0.00787199754267931, + "learning_rate": 3.10901914847697e-05, + "loss": 0.0159, + "num_input_tokens_seen": 104801472, + "step": 86110 + }, + { + "epoch": 9.590711660541263, + "grad_norm": 0.21318919956684113, + "learning_rate": 3.108783491683226e-05, + "loss": 0.0373, + "num_input_tokens_seen": 104806976, + "step": 86115 + }, + { + "epoch": 9.591268515424881, + "grad_norm": 0.001939161098562181, + "learning_rate": 3.108547829139035e-05, + "loss": 0.0982, + "num_input_tokens_seen": 104812992, + "step": 86120 + }, + { + "epoch": 9.591825370308497, + "grad_norm": 0.0002565162430983037, + "learning_rate": 3.108312160846622e-05, + "loss": 0.0224, + "num_input_tokens_seen": 104819232, + "step": 86125 + }, + { + "epoch": 9.592382225192114, + "grad_norm": 0.00029261718736961484, + "learning_rate": 3.1080764868082126e-05, + "loss": 0.0625, + "num_input_tokens_seen": 104825664, + "step": 86130 + }, + { + "epoch": 9.592939080075732, + "grad_norm": 0.11326257884502411, + "learning_rate": 3.107840807026035e-05, + "loss": 0.0362, + "num_input_tokens_seen": 104831104, + "step": 86135 + }, + { + "epoch": 9.59349593495935, + "grad_norm": 0.15456780791282654, + "learning_rate": 3.1076051215023134e-05, + "loss": 0.0644, + "num_input_tokens_seen": 104836864, + "step": 86140 + }, + { + "epoch": 9.594052789842967, + "grad_norm": 0.0002582402667030692, + "learning_rate": 3.1073694302392745e-05, + "loss": 0.1212, + "num_input_tokens_seen": 104842880, + "step": 86145 + }, + { + "epoch": 9.594609644726585, + "grad_norm": 0.06498876959085464, + "learning_rate": 3.1071337332391446e-05, + "loss": 0.0171, + "num_input_tokens_seen": 104848608, + "step": 86150 + }, + { + "epoch": 9.595166499610201, + "grad_norm": 1.0352118015289307, + "learning_rate": 3.1068980305041496e-05, + "loss": 0.0818, + "num_input_tokens_seen": 104854432, + "step": 86155 + }, + { + "epoch": 9.595723354493819, + "grad_norm": 0.1147577092051506, + "learning_rate": 3.106662322036518e-05, + "loss": 0.0452, + "num_input_tokens_seen": 104860288, + "step": 86160 + }, + { + "epoch": 9.596280209377436, + "grad_norm": 0.2608508765697479, + "learning_rate": 3.106426607838473e-05, + "loss": 0.0442, + "num_input_tokens_seen": 104866400, + "step": 86165 + }, + { + "epoch": 9.596837064261054, + "grad_norm": 2.4170708656311035, + "learning_rate": 3.106190887912244e-05, + "loss": 0.1048, + "num_input_tokens_seen": 104872576, + "step": 86170 + }, + { + "epoch": 9.597393919144672, + "grad_norm": 0.00998780783265829, + "learning_rate": 3.105955162260056e-05, + "loss": 0.0994, + "num_input_tokens_seen": 104878784, + "step": 86175 + }, + { + "epoch": 9.597950774028288, + "grad_norm": 0.03770899400115013, + "learning_rate": 3.105719430884137e-05, + "loss": 0.0074, + "num_input_tokens_seen": 104884992, + "step": 86180 + }, + { + "epoch": 9.598507628911905, + "grad_norm": 1.4648854732513428, + "learning_rate": 3.105483693786711e-05, + "loss": 0.0285, + "num_input_tokens_seen": 104891296, + "step": 86185 + }, + { + "epoch": 9.599064483795523, + "grad_norm": 0.004498645663261414, + "learning_rate": 3.105247950970007e-05, + "loss": 0.0441, + "num_input_tokens_seen": 104897312, + "step": 86190 + }, + { + "epoch": 9.59962133867914, + "grad_norm": 0.02421458438038826, + "learning_rate": 3.1050122024362514e-05, + "loss": 0.0335, + "num_input_tokens_seen": 104903424, + "step": 86195 + }, + { + "epoch": 9.600178193562758, + "grad_norm": 0.49279773235321045, + "learning_rate": 3.1047764481876704e-05, + "loss": 0.0865, + "num_input_tokens_seen": 104909344, + "step": 86200 + }, + { + "epoch": 9.600735048446374, + "grad_norm": 0.00586998974904418, + "learning_rate": 3.104540688226492e-05, + "loss": 0.0048, + "num_input_tokens_seen": 104915584, + "step": 86205 + }, + { + "epoch": 9.601291903329992, + "grad_norm": 0.029675450176000595, + "learning_rate": 3.104304922554942e-05, + "loss": 0.0727, + "num_input_tokens_seen": 104921824, + "step": 86210 + }, + { + "epoch": 9.60184875821361, + "grad_norm": 0.7750488519668579, + "learning_rate": 3.104069151175248e-05, + "loss": 0.0213, + "num_input_tokens_seen": 104927904, + "step": 86215 + }, + { + "epoch": 9.602405613097227, + "grad_norm": 0.3832825720310211, + "learning_rate": 3.103833374089637e-05, + "loss": 0.1007, + "num_input_tokens_seen": 104934016, + "step": 86220 + }, + { + "epoch": 9.602962467980845, + "grad_norm": 0.1475178450345993, + "learning_rate": 3.1035975913003353e-05, + "loss": 0.0026, + "num_input_tokens_seen": 104940256, + "step": 86225 + }, + { + "epoch": 9.60351932286446, + "grad_norm": 1.9817346334457397, + "learning_rate": 3.103361802809572e-05, + "loss": 0.1063, + "num_input_tokens_seen": 104946496, + "step": 86230 + }, + { + "epoch": 9.604076177748079, + "grad_norm": 0.010432899929583073, + "learning_rate": 3.1031260086195726e-05, + "loss": 0.0219, + "num_input_tokens_seen": 104952896, + "step": 86235 + }, + { + "epoch": 9.604633032631696, + "grad_norm": 0.03622884675860405, + "learning_rate": 3.102890208732564e-05, + "loss": 0.1573, + "num_input_tokens_seen": 104959104, + "step": 86240 + }, + { + "epoch": 9.605189887515314, + "grad_norm": 0.6126188039779663, + "learning_rate": 3.1026544031507754e-05, + "loss": 0.0257, + "num_input_tokens_seen": 104965184, + "step": 86245 + }, + { + "epoch": 9.605746742398932, + "grad_norm": 0.9226516485214233, + "learning_rate": 3.1024185918764325e-05, + "loss": 0.0071, + "num_input_tokens_seen": 104971264, + "step": 86250 + }, + { + "epoch": 9.606303597282547, + "grad_norm": 0.25750523805618286, + "learning_rate": 3.1021827749117635e-05, + "loss": 0.0461, + "num_input_tokens_seen": 104977312, + "step": 86255 + }, + { + "epoch": 9.606860452166165, + "grad_norm": 0.38099268078804016, + "learning_rate": 3.101946952258996e-05, + "loss": 0.0975, + "num_input_tokens_seen": 104982688, + "step": 86260 + }, + { + "epoch": 9.607417307049783, + "grad_norm": 0.5173860788345337, + "learning_rate": 3.101711123920357e-05, + "loss": 0.0166, + "num_input_tokens_seen": 104989024, + "step": 86265 + }, + { + "epoch": 9.6079741619334, + "grad_norm": 0.07863841950893402, + "learning_rate": 3.101475289898074e-05, + "loss": 0.1405, + "num_input_tokens_seen": 104995200, + "step": 86270 + }, + { + "epoch": 9.608531016817018, + "grad_norm": 1.825954794883728, + "learning_rate": 3.1012394501943754e-05, + "loss": 0.1571, + "num_input_tokens_seen": 105001568, + "step": 86275 + }, + { + "epoch": 9.609087871700634, + "grad_norm": 0.532002866268158, + "learning_rate": 3.101003604811489e-05, + "loss": 0.1089, + "num_input_tokens_seen": 105007808, + "step": 86280 + }, + { + "epoch": 9.609644726584252, + "grad_norm": 1.0528923273086548, + "learning_rate": 3.100767753751641e-05, + "loss": 0.0872, + "num_input_tokens_seen": 105013792, + "step": 86285 + }, + { + "epoch": 9.61020158146787, + "grad_norm": 0.0003672097809612751, + "learning_rate": 3.100531897017061e-05, + "loss": 0.0338, + "num_input_tokens_seen": 105019776, + "step": 86290 + }, + { + "epoch": 9.610758436351487, + "grad_norm": 0.8064835667610168, + "learning_rate": 3.1002960346099754e-05, + "loss": 0.0483, + "num_input_tokens_seen": 105025920, + "step": 86295 + }, + { + "epoch": 9.611315291235105, + "grad_norm": 0.1705222725868225, + "learning_rate": 3.100060166532614e-05, + "loss": 0.0184, + "num_input_tokens_seen": 105031744, + "step": 86300 + }, + { + "epoch": 9.61187214611872, + "grad_norm": 1.3483483791351318, + "learning_rate": 3.099824292787202e-05, + "loss": 0.102, + "num_input_tokens_seen": 105037632, + "step": 86305 + }, + { + "epoch": 9.612429001002338, + "grad_norm": 0.009281270205974579, + "learning_rate": 3.09958841337597e-05, + "loss": 0.1984, + "num_input_tokens_seen": 105043744, + "step": 86310 + }, + { + "epoch": 9.612985855885956, + "grad_norm": 1.4553372859954834, + "learning_rate": 3.099352528301145e-05, + "loss": 0.0296, + "num_input_tokens_seen": 105049248, + "step": 86315 + }, + { + "epoch": 9.613542710769574, + "grad_norm": 1.421749234199524, + "learning_rate": 3.099116637564955e-05, + "loss": 0.072, + "num_input_tokens_seen": 105055136, + "step": 86320 + }, + { + "epoch": 9.614099565653191, + "grad_norm": 2.329465389251709, + "learning_rate": 3.098880741169629e-05, + "loss": 0.0743, + "num_input_tokens_seen": 105061472, + "step": 86325 + }, + { + "epoch": 9.614656420536807, + "grad_norm": 0.18738116323947906, + "learning_rate": 3.098644839117393e-05, + "loss": 0.0214, + "num_input_tokens_seen": 105067808, + "step": 86330 + }, + { + "epoch": 9.615213275420425, + "grad_norm": 0.42840585112571716, + "learning_rate": 3.098408931410478e-05, + "loss": 0.0502, + "num_input_tokens_seen": 105073824, + "step": 86335 + }, + { + "epoch": 9.615770130304043, + "grad_norm": 0.31386619806289673, + "learning_rate": 3.098173018051111e-05, + "loss": 0.0741, + "num_input_tokens_seen": 105079840, + "step": 86340 + }, + { + "epoch": 9.61632698518766, + "grad_norm": 1.4013099670410156, + "learning_rate": 3.09793709904152e-05, + "loss": 0.0336, + "num_input_tokens_seen": 105085792, + "step": 86345 + }, + { + "epoch": 9.616883840071278, + "grad_norm": 0.46688786149024963, + "learning_rate": 3.097701174383936e-05, + "loss": 0.0852, + "num_input_tokens_seen": 105091712, + "step": 86350 + }, + { + "epoch": 9.617440694954894, + "grad_norm": 0.0013719790149480104, + "learning_rate": 3.0974652440805834e-05, + "loss": 0.0475, + "num_input_tokens_seen": 105097952, + "step": 86355 + }, + { + "epoch": 9.617997549838512, + "grad_norm": 0.017155082896351814, + "learning_rate": 3.097229308133694e-05, + "loss": 0.0858, + "num_input_tokens_seen": 105103616, + "step": 86360 + }, + { + "epoch": 9.61855440472213, + "grad_norm": 1.563603162765503, + "learning_rate": 3.096993366545495e-05, + "loss": 0.2267, + "num_input_tokens_seen": 105109632, + "step": 86365 + }, + { + "epoch": 9.619111259605747, + "grad_norm": 0.026516348123550415, + "learning_rate": 3.096757419318215e-05, + "loss": 0.0649, + "num_input_tokens_seen": 105115392, + "step": 86370 + }, + { + "epoch": 9.619668114489365, + "grad_norm": 0.10024195164442062, + "learning_rate": 3.0965214664540835e-05, + "loss": 0.0282, + "num_input_tokens_seen": 105121568, + "step": 86375 + }, + { + "epoch": 9.620224969372982, + "grad_norm": 0.028948187828063965, + "learning_rate": 3.0962855079553285e-05, + "loss": 0.0182, + "num_input_tokens_seen": 105127648, + "step": 86380 + }, + { + "epoch": 9.620781824256598, + "grad_norm": 0.565629243850708, + "learning_rate": 3.09604954382418e-05, + "loss": 0.0694, + "num_input_tokens_seen": 105133984, + "step": 86385 + }, + { + "epoch": 9.621338679140216, + "grad_norm": 1.1977851390838623, + "learning_rate": 3.095813574062865e-05, + "loss": 0.1153, + "num_input_tokens_seen": 105140320, + "step": 86390 + }, + { + "epoch": 9.621895534023833, + "grad_norm": 0.006466308142989874, + "learning_rate": 3.0955775986736135e-05, + "loss": 0.0021, + "num_input_tokens_seen": 105146496, + "step": 86395 + }, + { + "epoch": 9.622452388907451, + "grad_norm": 0.44624602794647217, + "learning_rate": 3.095341617658655e-05, + "loss": 0.0943, + "num_input_tokens_seen": 105152768, + "step": 86400 + }, + { + "epoch": 9.623009243791069, + "grad_norm": 0.6035301089286804, + "learning_rate": 3.095105631020217e-05, + "loss": 0.0265, + "num_input_tokens_seen": 105158592, + "step": 86405 + }, + { + "epoch": 9.623566098674685, + "grad_norm": 0.10916838049888611, + "learning_rate": 3.0948696387605305e-05, + "loss": 0.0271, + "num_input_tokens_seen": 105164832, + "step": 86410 + }, + { + "epoch": 9.624122953558302, + "grad_norm": 0.13205039501190186, + "learning_rate": 3.0946336408818233e-05, + "loss": 0.0143, + "num_input_tokens_seen": 105170496, + "step": 86415 + }, + { + "epoch": 9.62467980844192, + "grad_norm": 0.38084927201271057, + "learning_rate": 3.0943976373863255e-05, + "loss": 0.014, + "num_input_tokens_seen": 105176576, + "step": 86420 + }, + { + "epoch": 9.625236663325538, + "grad_norm": 0.22464638948440552, + "learning_rate": 3.094161628276265e-05, + "loss": 0.0691, + "num_input_tokens_seen": 105182720, + "step": 86425 + }, + { + "epoch": 9.625793518209155, + "grad_norm": 7.788289804011583e-05, + "learning_rate": 3.093925613553872e-05, + "loss": 0.0023, + "num_input_tokens_seen": 105188864, + "step": 86430 + }, + { + "epoch": 9.626350373092771, + "grad_norm": 0.03459528461098671, + "learning_rate": 3.0936895932213763e-05, + "loss": 0.113, + "num_input_tokens_seen": 105195168, + "step": 86435 + }, + { + "epoch": 9.626907227976389, + "grad_norm": 0.03733661398291588, + "learning_rate": 3.0934535672810056e-05, + "loss": 0.0397, + "num_input_tokens_seen": 105201408, + "step": 86440 + }, + { + "epoch": 9.627464082860007, + "grad_norm": 0.0037895692512392998, + "learning_rate": 3.093217535734992e-05, + "loss": 0.0615, + "num_input_tokens_seen": 105207456, + "step": 86445 + }, + { + "epoch": 9.628020937743624, + "grad_norm": 0.0889376625418663, + "learning_rate": 3.092981498585562e-05, + "loss": 0.0614, + "num_input_tokens_seen": 105213280, + "step": 86450 + }, + { + "epoch": 9.628577792627242, + "grad_norm": 0.07594381272792816, + "learning_rate": 3.092745455834948e-05, + "loss": 0.0162, + "num_input_tokens_seen": 105218720, + "step": 86455 + }, + { + "epoch": 9.629134647510858, + "grad_norm": 0.005629954393953085, + "learning_rate": 3.0925094074853775e-05, + "loss": 0.0779, + "num_input_tokens_seen": 105224896, + "step": 86460 + }, + { + "epoch": 9.629691502394476, + "grad_norm": 0.32880842685699463, + "learning_rate": 3.092273353539081e-05, + "loss": 0.052, + "num_input_tokens_seen": 105230880, + "step": 86465 + }, + { + "epoch": 9.630248357278093, + "grad_norm": 0.02322341501712799, + "learning_rate": 3.092037293998289e-05, + "loss": 0.1295, + "num_input_tokens_seen": 105236928, + "step": 86470 + }, + { + "epoch": 9.630805212161711, + "grad_norm": 0.21837852895259857, + "learning_rate": 3.0918012288652285e-05, + "loss": 0.0103, + "num_input_tokens_seen": 105243168, + "step": 86475 + }, + { + "epoch": 9.631362067045329, + "grad_norm": 0.9557376503944397, + "learning_rate": 3.091565158142133e-05, + "loss": 0.1431, + "num_input_tokens_seen": 105249184, + "step": 86480 + }, + { + "epoch": 9.631918921928944, + "grad_norm": 0.7222771644592285, + "learning_rate": 3.0913290818312296e-05, + "loss": 0.0609, + "num_input_tokens_seen": 105255392, + "step": 86485 + }, + { + "epoch": 9.632475776812562, + "grad_norm": 1.2661445140838623, + "learning_rate": 3.09109299993475e-05, + "loss": 0.0625, + "num_input_tokens_seen": 105260992, + "step": 86490 + }, + { + "epoch": 9.63303263169618, + "grad_norm": 0.2673555016517639, + "learning_rate": 3.090856912454923e-05, + "loss": 0.0071, + "num_input_tokens_seen": 105266944, + "step": 86495 + }, + { + "epoch": 9.633589486579798, + "grad_norm": 8.004606206668541e-05, + "learning_rate": 3.090620819393979e-05, + "loss": 0.0858, + "num_input_tokens_seen": 105273056, + "step": 86500 + }, + { + "epoch": 9.634146341463415, + "grad_norm": 0.6227310299873352, + "learning_rate": 3.0903847207541486e-05, + "loss": 0.0654, + "num_input_tokens_seen": 105279360, + "step": 86505 + }, + { + "epoch": 9.634703196347033, + "grad_norm": 0.32759809494018555, + "learning_rate": 3.090148616537661e-05, + "loss": 0.0536, + "num_input_tokens_seen": 105285600, + "step": 86510 + }, + { + "epoch": 9.635260051230649, + "grad_norm": 0.3633415699005127, + "learning_rate": 3.0899125067467474e-05, + "loss": 0.0316, + "num_input_tokens_seen": 105291712, + "step": 86515 + }, + { + "epoch": 9.635816906114266, + "grad_norm": 1.4200390577316284, + "learning_rate": 3.089676391383637e-05, + "loss": 0.0734, + "num_input_tokens_seen": 105297824, + "step": 86520 + }, + { + "epoch": 9.636373760997884, + "grad_norm": 1.1238480806350708, + "learning_rate": 3.089440270450561e-05, + "loss": 0.0659, + "num_input_tokens_seen": 105303552, + "step": 86525 + }, + { + "epoch": 9.636930615881502, + "grad_norm": 0.06236334890127182, + "learning_rate": 3.089204143949749e-05, + "loss": 0.006, + "num_input_tokens_seen": 105309600, + "step": 86530 + }, + { + "epoch": 9.63748747076512, + "grad_norm": 0.9121884703636169, + "learning_rate": 3.088968011883433e-05, + "loss": 0.0276, + "num_input_tokens_seen": 105315520, + "step": 86535 + }, + { + "epoch": 9.638044325648735, + "grad_norm": 0.47561535239219666, + "learning_rate": 3.088731874253841e-05, + "loss": 0.1253, + "num_input_tokens_seen": 105321824, + "step": 86540 + }, + { + "epoch": 9.638601180532353, + "grad_norm": 0.1850528120994568, + "learning_rate": 3.088495731063205e-05, + "loss": 0.0105, + "num_input_tokens_seen": 105328320, + "step": 86545 + }, + { + "epoch": 9.63915803541597, + "grad_norm": 0.9651541709899902, + "learning_rate": 3.088259582313756e-05, + "loss": 0.0993, + "num_input_tokens_seen": 105334528, + "step": 86550 + }, + { + "epoch": 9.639714890299588, + "grad_norm": 0.06882112473249435, + "learning_rate": 3.0880234280077234e-05, + "loss": 0.1061, + "num_input_tokens_seen": 105340224, + "step": 86555 + }, + { + "epoch": 9.640271745183206, + "grad_norm": 1.1914055347442627, + "learning_rate": 3.087787268147338e-05, + "loss": 0.1239, + "num_input_tokens_seen": 105346368, + "step": 86560 + }, + { + "epoch": 9.640828600066822, + "grad_norm": 0.22022497653961182, + "learning_rate": 3.087551102734831e-05, + "loss": 0.0087, + "num_input_tokens_seen": 105352928, + "step": 86565 + }, + { + "epoch": 9.64138545495044, + "grad_norm": 3.4741878509521484, + "learning_rate": 3.087314931772434e-05, + "loss": 0.1439, + "num_input_tokens_seen": 105359040, + "step": 86570 + }, + { + "epoch": 9.641942309834057, + "grad_norm": 0.00995023176074028, + "learning_rate": 3.087078755262376e-05, + "loss": 0.0056, + "num_input_tokens_seen": 105365088, + "step": 86575 + }, + { + "epoch": 9.642499164717675, + "grad_norm": 0.30837738513946533, + "learning_rate": 3.08684257320689e-05, + "loss": 0.1083, + "num_input_tokens_seen": 105371520, + "step": 86580 + }, + { + "epoch": 9.643056019601293, + "grad_norm": 0.0002515529340598732, + "learning_rate": 3.086606385608204e-05, + "loss": 0.2025, + "num_input_tokens_seen": 105377696, + "step": 86585 + }, + { + "epoch": 9.643612874484909, + "grad_norm": 0.07516235113143921, + "learning_rate": 3.086370192468552e-05, + "loss": 0.007, + "num_input_tokens_seen": 105384224, + "step": 86590 + }, + { + "epoch": 9.644169729368526, + "grad_norm": 0.08078047633171082, + "learning_rate": 3.0861339937901634e-05, + "loss": 0.0071, + "num_input_tokens_seen": 105390176, + "step": 86595 + }, + { + "epoch": 9.644726584252144, + "grad_norm": 0.11495715379714966, + "learning_rate": 3.085897789575269e-05, + "loss": 0.0896, + "num_input_tokens_seen": 105396256, + "step": 86600 + }, + { + "epoch": 9.645283439135762, + "grad_norm": 0.042902007699012756, + "learning_rate": 3.085661579826102e-05, + "loss": 0.0692, + "num_input_tokens_seen": 105402432, + "step": 86605 + }, + { + "epoch": 9.64584029401938, + "grad_norm": 0.01713470183312893, + "learning_rate": 3.085425364544891e-05, + "loss": 0.1389, + "num_input_tokens_seen": 105408672, + "step": 86610 + }, + { + "epoch": 9.646397148902995, + "grad_norm": 2.438939332962036, + "learning_rate": 3.0851891437338686e-05, + "loss": 0.0289, + "num_input_tokens_seen": 105414720, + "step": 86615 + }, + { + "epoch": 9.646954003786613, + "grad_norm": 0.0004278932174202055, + "learning_rate": 3.0849529173952665e-05, + "loss": 0.0418, + "num_input_tokens_seen": 105421088, + "step": 86620 + }, + { + "epoch": 9.64751085867023, + "grad_norm": 0.13372214138507843, + "learning_rate": 3.084716685531315e-05, + "loss": 0.0237, + "num_input_tokens_seen": 105426624, + "step": 86625 + }, + { + "epoch": 9.648067713553848, + "grad_norm": 0.030548816546797752, + "learning_rate": 3.084480448144245e-05, + "loss": 0.0184, + "num_input_tokens_seen": 105433376, + "step": 86630 + }, + { + "epoch": 9.648624568437466, + "grad_norm": 1.0137330293655396, + "learning_rate": 3.08424420523629e-05, + "loss": 0.1436, + "num_input_tokens_seen": 105439808, + "step": 86635 + }, + { + "epoch": 9.649181423321082, + "grad_norm": 0.4618799090385437, + "learning_rate": 3.0840079568096804e-05, + "loss": 0.0806, + "num_input_tokens_seen": 105446208, + "step": 86640 + }, + { + "epoch": 9.6497382782047, + "grad_norm": 0.00020616300753317773, + "learning_rate": 3.083771702866647e-05, + "loss": 0.0506, + "num_input_tokens_seen": 105451808, + "step": 86645 + }, + { + "epoch": 9.650295133088317, + "grad_norm": 0.09945900738239288, + "learning_rate": 3.0835354434094235e-05, + "loss": 0.0117, + "num_input_tokens_seen": 105457760, + "step": 86650 + }, + { + "epoch": 9.650851987971935, + "grad_norm": 0.018130704760551453, + "learning_rate": 3.083299178440239e-05, + "loss": 0.0061, + "num_input_tokens_seen": 105463968, + "step": 86655 + }, + { + "epoch": 9.651408842855552, + "grad_norm": 0.025548439472913742, + "learning_rate": 3.083062907961327e-05, + "loss": 0.0012, + "num_input_tokens_seen": 105470272, + "step": 86660 + }, + { + "epoch": 9.651965697739168, + "grad_norm": 0.18452554941177368, + "learning_rate": 3.082826631974918e-05, + "loss": 0.056, + "num_input_tokens_seen": 105476480, + "step": 86665 + }, + { + "epoch": 9.652522552622786, + "grad_norm": 0.7299559116363525, + "learning_rate": 3.082590350483246e-05, + "loss": 0.0261, + "num_input_tokens_seen": 105482464, + "step": 86670 + }, + { + "epoch": 9.653079407506404, + "grad_norm": 0.07123667001724243, + "learning_rate": 3.0823540634885404e-05, + "loss": 0.0976, + "num_input_tokens_seen": 105488672, + "step": 86675 + }, + { + "epoch": 9.653636262390021, + "grad_norm": 0.0241193026304245, + "learning_rate": 3.082117770993033e-05, + "loss": 0.0107, + "num_input_tokens_seen": 105494720, + "step": 86680 + }, + { + "epoch": 9.654193117273639, + "grad_norm": 0.5847342014312744, + "learning_rate": 3.0818814729989584e-05, + "loss": 0.0471, + "num_input_tokens_seen": 105501056, + "step": 86685 + }, + { + "epoch": 9.654749972157255, + "grad_norm": 0.023955805227160454, + "learning_rate": 3.081645169508547e-05, + "loss": 0.0171, + "num_input_tokens_seen": 105507104, + "step": 86690 + }, + { + "epoch": 9.655306827040873, + "grad_norm": 0.25563785433769226, + "learning_rate": 3.0814088605240305e-05, + "loss": 0.0494, + "num_input_tokens_seen": 105512448, + "step": 86695 + }, + { + "epoch": 9.65586368192449, + "grad_norm": 0.007846041582524776, + "learning_rate": 3.0811725460476414e-05, + "loss": 0.035, + "num_input_tokens_seen": 105518880, + "step": 86700 + }, + { + "epoch": 9.656420536808108, + "grad_norm": 0.6795318722724915, + "learning_rate": 3.080936226081612e-05, + "loss": 0.0623, + "num_input_tokens_seen": 105524896, + "step": 86705 + }, + { + "epoch": 9.656977391691726, + "grad_norm": 0.0014021011302247643, + "learning_rate": 3.080699900628175e-05, + "loss": 0.0104, + "num_input_tokens_seen": 105531136, + "step": 86710 + }, + { + "epoch": 9.657534246575342, + "grad_norm": 0.03617100045084953, + "learning_rate": 3.0804635696895614e-05, + "loss": 0.0102, + "num_input_tokens_seen": 105537408, + "step": 86715 + }, + { + "epoch": 9.65809110145896, + "grad_norm": 0.013032038696110249, + "learning_rate": 3.0802272332680055e-05, + "loss": 0.0271, + "num_input_tokens_seen": 105543488, + "step": 86720 + }, + { + "epoch": 9.658647956342577, + "grad_norm": 0.0003115884028375149, + "learning_rate": 3.079990891365737e-05, + "loss": 0.0115, + "num_input_tokens_seen": 105549984, + "step": 86725 + }, + { + "epoch": 9.659204811226195, + "grad_norm": 0.009238136932253838, + "learning_rate": 3.079754543984991e-05, + "loss": 0.0093, + "num_input_tokens_seen": 105555904, + "step": 86730 + }, + { + "epoch": 9.659761666109812, + "grad_norm": 0.8023186326026917, + "learning_rate": 3.0795181911279984e-05, + "loss": 0.1497, + "num_input_tokens_seen": 105562336, + "step": 86735 + }, + { + "epoch": 9.66031852099343, + "grad_norm": 0.26902249455451965, + "learning_rate": 3.079281832796992e-05, + "loss": 0.1001, + "num_input_tokens_seen": 105568544, + "step": 86740 + }, + { + "epoch": 9.660875375877046, + "grad_norm": 0.35520699620246887, + "learning_rate": 3.079045468994205e-05, + "loss": 0.0379, + "num_input_tokens_seen": 105574464, + "step": 86745 + }, + { + "epoch": 9.661432230760663, + "grad_norm": 0.022381406277418137, + "learning_rate": 3.0788090997218696e-05, + "loss": 0.0152, + "num_input_tokens_seen": 105580608, + "step": 86750 + }, + { + "epoch": 9.661989085644281, + "grad_norm": 0.1089031919836998, + "learning_rate": 3.078572724982219e-05, + "loss": 0.0365, + "num_input_tokens_seen": 105587008, + "step": 86755 + }, + { + "epoch": 9.662545940527899, + "grad_norm": 0.42255523800849915, + "learning_rate": 3.0783363447774846e-05, + "loss": 0.0116, + "num_input_tokens_seen": 105592960, + "step": 86760 + }, + { + "epoch": 9.663102795411517, + "grad_norm": 0.22209052741527557, + "learning_rate": 3.0780999591099e-05, + "loss": 0.0921, + "num_input_tokens_seen": 105599232, + "step": 86765 + }, + { + "epoch": 9.663659650295132, + "grad_norm": 0.030176173895597458, + "learning_rate": 3.077863567981699e-05, + "loss": 0.0589, + "num_input_tokens_seen": 105605536, + "step": 86770 + }, + { + "epoch": 9.66421650517875, + "grad_norm": 0.09534098207950592, + "learning_rate": 3.077627171395112e-05, + "loss": 0.0024, + "num_input_tokens_seen": 105611936, + "step": 86775 + }, + { + "epoch": 9.664773360062368, + "grad_norm": 0.01557543221861124, + "learning_rate": 3.077390769352375e-05, + "loss": 0.0442, + "num_input_tokens_seen": 105617824, + "step": 86780 + }, + { + "epoch": 9.665330214945985, + "grad_norm": 0.18939612805843353, + "learning_rate": 3.077154361855719e-05, + "loss": 0.1131, + "num_input_tokens_seen": 105624032, + "step": 86785 + }, + { + "epoch": 9.665887069829603, + "grad_norm": 0.16451051831245422, + "learning_rate": 3.076917948907379e-05, + "loss": 0.0076, + "num_input_tokens_seen": 105630208, + "step": 86790 + }, + { + "epoch": 9.666443924713219, + "grad_norm": 1.6780719757080078, + "learning_rate": 3.0766815305095846e-05, + "loss": 0.0578, + "num_input_tokens_seen": 105636608, + "step": 86795 + }, + { + "epoch": 9.667000779596837, + "grad_norm": 1.1948031187057495, + "learning_rate": 3.076445106664573e-05, + "loss": 0.1246, + "num_input_tokens_seen": 105641696, + "step": 86800 + }, + { + "epoch": 9.667557634480454, + "grad_norm": 0.30306771397590637, + "learning_rate": 3.076208677374574e-05, + "loss": 0.2004, + "num_input_tokens_seen": 105647648, + "step": 86805 + }, + { + "epoch": 9.668114489364072, + "grad_norm": 0.6156055331230164, + "learning_rate": 3.075972242641823e-05, + "loss": 0.0638, + "num_input_tokens_seen": 105653664, + "step": 86810 + }, + { + "epoch": 9.66867134424769, + "grad_norm": 1.0581218004226685, + "learning_rate": 3.075735802468553e-05, + "loss": 0.1649, + "num_input_tokens_seen": 105660192, + "step": 86815 + }, + { + "epoch": 9.669228199131306, + "grad_norm": 0.002070956164970994, + "learning_rate": 3.0754993568569965e-05, + "loss": 0.0226, + "num_input_tokens_seen": 105666144, + "step": 86820 + }, + { + "epoch": 9.669785054014923, + "grad_norm": 2.0948240756988525, + "learning_rate": 3.0752629058093884e-05, + "loss": 0.0616, + "num_input_tokens_seen": 105672576, + "step": 86825 + }, + { + "epoch": 9.670341908898541, + "grad_norm": 0.16384021937847137, + "learning_rate": 3.07502644932796e-05, + "loss": 0.0512, + "num_input_tokens_seen": 105678816, + "step": 86830 + }, + { + "epoch": 9.670898763782159, + "grad_norm": 1.3429622650146484, + "learning_rate": 3.074789987414947e-05, + "loss": 0.0665, + "num_input_tokens_seen": 105684928, + "step": 86835 + }, + { + "epoch": 9.671455618665776, + "grad_norm": 0.8904281258583069, + "learning_rate": 3.0745535200725824e-05, + "loss": 0.0892, + "num_input_tokens_seen": 105690944, + "step": 86840 + }, + { + "epoch": 9.672012473549392, + "grad_norm": 0.20966637134552002, + "learning_rate": 3.074317047303098e-05, + "loss": 0.0238, + "num_input_tokens_seen": 105697312, + "step": 86845 + }, + { + "epoch": 9.67256932843301, + "grad_norm": 0.6059575080871582, + "learning_rate": 3.0740805691087306e-05, + "loss": 0.0476, + "num_input_tokens_seen": 105703328, + "step": 86850 + }, + { + "epoch": 9.673126183316628, + "grad_norm": 0.008271588943898678, + "learning_rate": 3.073844085491711e-05, + "loss": 0.0092, + "num_input_tokens_seen": 105709824, + "step": 86855 + }, + { + "epoch": 9.673683038200245, + "grad_norm": 0.10852228105068207, + "learning_rate": 3.073607596454275e-05, + "loss": 0.0275, + "num_input_tokens_seen": 105716064, + "step": 86860 + }, + { + "epoch": 9.674239893083863, + "grad_norm": 1.864977478981018, + "learning_rate": 3.073371101998655e-05, + "loss": 0.0904, + "num_input_tokens_seen": 105722432, + "step": 86865 + }, + { + "epoch": 9.67479674796748, + "grad_norm": 0.009484934620559216, + "learning_rate": 3.073134602127086e-05, + "loss": 0.017, + "num_input_tokens_seen": 105728384, + "step": 86870 + }, + { + "epoch": 9.675353602851096, + "grad_norm": 0.001990628195926547, + "learning_rate": 3.072898096841802e-05, + "loss": 0.0105, + "num_input_tokens_seen": 105734784, + "step": 86875 + }, + { + "epoch": 9.675910457734714, + "grad_norm": 0.826461911201477, + "learning_rate": 3.0726615861450355e-05, + "loss": 0.0138, + "num_input_tokens_seen": 105740864, + "step": 86880 + }, + { + "epoch": 9.676467312618332, + "grad_norm": 0.1512053906917572, + "learning_rate": 3.072425070039023e-05, + "loss": 0.0515, + "num_input_tokens_seen": 105747360, + "step": 86885 + }, + { + "epoch": 9.67702416750195, + "grad_norm": 0.6419461369514465, + "learning_rate": 3.072188548525996e-05, + "loss": 0.1104, + "num_input_tokens_seen": 105753952, + "step": 86890 + }, + { + "epoch": 9.677581022385567, + "grad_norm": 0.0008489599567838013, + "learning_rate": 3.07195202160819e-05, + "loss": 0.0154, + "num_input_tokens_seen": 105760096, + "step": 86895 + }, + { + "epoch": 9.678137877269183, + "grad_norm": 0.15911424160003662, + "learning_rate": 3.071715489287839e-05, + "loss": 0.1381, + "num_input_tokens_seen": 105766304, + "step": 86900 + }, + { + "epoch": 9.6786947321528, + "grad_norm": 0.22620464861392975, + "learning_rate": 3.071478951567176e-05, + "loss": 0.0519, + "num_input_tokens_seen": 105772384, + "step": 86905 + }, + { + "epoch": 9.679251587036418, + "grad_norm": 0.5406627655029297, + "learning_rate": 3.071242408448438e-05, + "loss": 0.0339, + "num_input_tokens_seen": 105778368, + "step": 86910 + }, + { + "epoch": 9.679808441920036, + "grad_norm": 0.006943732500076294, + "learning_rate": 3.0710058599338566e-05, + "loss": 0.0453, + "num_input_tokens_seen": 105784576, + "step": 86915 + }, + { + "epoch": 9.680365296803654, + "grad_norm": 0.8937313556671143, + "learning_rate": 3.070769306025668e-05, + "loss": 0.061, + "num_input_tokens_seen": 105790560, + "step": 86920 + }, + { + "epoch": 9.68092215168727, + "grad_norm": 0.3051685690879822, + "learning_rate": 3.0705327467261056e-05, + "loss": 0.0083, + "num_input_tokens_seen": 105796672, + "step": 86925 + }, + { + "epoch": 9.681479006570887, + "grad_norm": 0.0016964473761618137, + "learning_rate": 3.070296182037405e-05, + "loss": 0.0446, + "num_input_tokens_seen": 105802784, + "step": 86930 + }, + { + "epoch": 9.682035861454505, + "grad_norm": 0.31078311800956726, + "learning_rate": 3.0700596119618e-05, + "loss": 0.0274, + "num_input_tokens_seen": 105808992, + "step": 86935 + }, + { + "epoch": 9.682592716338123, + "grad_norm": 0.0010628784075379372, + "learning_rate": 3.069823036501525e-05, + "loss": 0.0211, + "num_input_tokens_seen": 105815264, + "step": 86940 + }, + { + "epoch": 9.68314957122174, + "grad_norm": 0.8691527843475342, + "learning_rate": 3.069586455658814e-05, + "loss": 0.0905, + "num_input_tokens_seen": 105821152, + "step": 86945 + }, + { + "epoch": 9.683706426105356, + "grad_norm": 0.0036310788709670305, + "learning_rate": 3.0693498694359034e-05, + "loss": 0.1089, + "num_input_tokens_seen": 105827360, + "step": 86950 + }, + { + "epoch": 9.684263280988974, + "grad_norm": 0.0433783121407032, + "learning_rate": 3.069113277835028e-05, + "loss": 0.1969, + "num_input_tokens_seen": 105832928, + "step": 86955 + }, + { + "epoch": 9.684820135872592, + "grad_norm": 0.039748042821884155, + "learning_rate": 3.06887668085842e-05, + "loss": 0.0505, + "num_input_tokens_seen": 105838944, + "step": 86960 + }, + { + "epoch": 9.68537699075621, + "grad_norm": 1.3218191862106323, + "learning_rate": 3.068640078508317e-05, + "loss": 0.0927, + "num_input_tokens_seen": 105844960, + "step": 86965 + }, + { + "epoch": 9.685933845639827, + "grad_norm": 0.021643763408064842, + "learning_rate": 3.068403470786953e-05, + "loss": 0.0277, + "num_input_tokens_seen": 105851328, + "step": 86970 + }, + { + "epoch": 9.686490700523443, + "grad_norm": 0.0005971097270958126, + "learning_rate": 3.068166857696562e-05, + "loss": 0.0169, + "num_input_tokens_seen": 105857376, + "step": 86975 + }, + { + "epoch": 9.68704755540706, + "grad_norm": 0.03286916762590408, + "learning_rate": 3.067930239239381e-05, + "loss": 0.0182, + "num_input_tokens_seen": 105863456, + "step": 86980 + }, + { + "epoch": 9.687604410290678, + "grad_norm": 0.31438612937927246, + "learning_rate": 3.0676936154176425e-05, + "loss": 0.1035, + "num_input_tokens_seen": 105869792, + "step": 86985 + }, + { + "epoch": 9.688161265174296, + "grad_norm": 0.5229984521865845, + "learning_rate": 3.0674569862335846e-05, + "loss": 0.035, + "num_input_tokens_seen": 105875712, + "step": 86990 + }, + { + "epoch": 9.688718120057914, + "grad_norm": 0.0006797392852604389, + "learning_rate": 3.067220351689439e-05, + "loss": 0.0658, + "num_input_tokens_seen": 105881760, + "step": 86995 + }, + { + "epoch": 9.68927497494153, + "grad_norm": 3.390965223312378, + "learning_rate": 3.066983711787444e-05, + "loss": 0.0584, + "num_input_tokens_seen": 105887808, + "step": 87000 + }, + { + "epoch": 9.689831829825147, + "grad_norm": 1.7806397676467896, + "learning_rate": 3.066747066529833e-05, + "loss": 0.1009, + "num_input_tokens_seen": 105893888, + "step": 87005 + }, + { + "epoch": 9.690388684708765, + "grad_norm": 0.6257490515708923, + "learning_rate": 3.066510415918842e-05, + "loss": 0.0302, + "num_input_tokens_seen": 105899808, + "step": 87010 + }, + { + "epoch": 9.690945539592382, + "grad_norm": 0.5949208736419678, + "learning_rate": 3.066273759956707e-05, + "loss": 0.02, + "num_input_tokens_seen": 105905952, + "step": 87015 + }, + { + "epoch": 9.691502394476, + "grad_norm": 0.024197233840823174, + "learning_rate": 3.066037098645662e-05, + "loss": 0.0273, + "num_input_tokens_seen": 105911680, + "step": 87020 + }, + { + "epoch": 9.692059249359616, + "grad_norm": 0.016249576583504677, + "learning_rate": 3.065800431987943e-05, + "loss": 0.0208, + "num_input_tokens_seen": 105917856, + "step": 87025 + }, + { + "epoch": 9.692616104243234, + "grad_norm": 0.01103037316352129, + "learning_rate": 3.065563759985786e-05, + "loss": 0.0289, + "num_input_tokens_seen": 105923872, + "step": 87030 + }, + { + "epoch": 9.693172959126851, + "grad_norm": 0.33250489830970764, + "learning_rate": 3.0653270826414246e-05, + "loss": 0.0719, + "num_input_tokens_seen": 105929664, + "step": 87035 + }, + { + "epoch": 9.693729814010469, + "grad_norm": 0.00044220013660378754, + "learning_rate": 3.065090399957098e-05, + "loss": 0.0103, + "num_input_tokens_seen": 105935680, + "step": 87040 + }, + { + "epoch": 9.694286668894087, + "grad_norm": 0.2757277488708496, + "learning_rate": 3.064853711935039e-05, + "loss": 0.0187, + "num_input_tokens_seen": 105941856, + "step": 87045 + }, + { + "epoch": 9.694843523777703, + "grad_norm": 0.03531264513731003, + "learning_rate": 3.064617018577484e-05, + "loss": 0.157, + "num_input_tokens_seen": 105947552, + "step": 87050 + }, + { + "epoch": 9.69540037866132, + "grad_norm": 0.11451302468776703, + "learning_rate": 3.0643803198866695e-05, + "loss": 0.0431, + "num_input_tokens_seen": 105953664, + "step": 87055 + }, + { + "epoch": 9.695957233544938, + "grad_norm": 0.4501056671142578, + "learning_rate": 3.06414361586483e-05, + "loss": 0.0109, + "num_input_tokens_seen": 105959744, + "step": 87060 + }, + { + "epoch": 9.696514088428556, + "grad_norm": 0.780737042427063, + "learning_rate": 3.063906906514203e-05, + "loss": 0.0304, + "num_input_tokens_seen": 105965888, + "step": 87065 + }, + { + "epoch": 9.697070943312173, + "grad_norm": 0.1301680952310562, + "learning_rate": 3.0636701918370225e-05, + "loss": 0.0143, + "num_input_tokens_seen": 105971552, + "step": 87070 + }, + { + "epoch": 9.69762779819579, + "grad_norm": 0.4333226978778839, + "learning_rate": 3.0634334718355265e-05, + "loss": 0.0535, + "num_input_tokens_seen": 105977760, + "step": 87075 + }, + { + "epoch": 9.698184653079407, + "grad_norm": 0.0022572961170226336, + "learning_rate": 3.063196746511949e-05, + "loss": 0.0373, + "num_input_tokens_seen": 105983776, + "step": 87080 + }, + { + "epoch": 9.698741507963025, + "grad_norm": 0.06822637468576431, + "learning_rate": 3.062960015868527e-05, + "loss": 0.0409, + "num_input_tokens_seen": 105990080, + "step": 87085 + }, + { + "epoch": 9.699298362846642, + "grad_norm": 0.6025720834732056, + "learning_rate": 3.062723279907497e-05, + "loss": 0.0385, + "num_input_tokens_seen": 105995680, + "step": 87090 + }, + { + "epoch": 9.69985521773026, + "grad_norm": 0.6121464371681213, + "learning_rate": 3.062486538631094e-05, + "loss": 0.0457, + "num_input_tokens_seen": 106001856, + "step": 87095 + }, + { + "epoch": 9.700412072613878, + "grad_norm": 0.4150142967700958, + "learning_rate": 3.062249792041556e-05, + "loss": 0.0665, + "num_input_tokens_seen": 106008000, + "step": 87100 + }, + { + "epoch": 9.700968927497494, + "grad_norm": 0.5644798874855042, + "learning_rate": 3.062013040141118e-05, + "loss": 0.0233, + "num_input_tokens_seen": 106013920, + "step": 87105 + }, + { + "epoch": 9.701525782381111, + "grad_norm": 0.47566866874694824, + "learning_rate": 3.061776282932016e-05, + "loss": 0.117, + "num_input_tokens_seen": 106019776, + "step": 87110 + }, + { + "epoch": 9.702082637264729, + "grad_norm": 0.6926906704902649, + "learning_rate": 3.0615395204164876e-05, + "loss": 0.1139, + "num_input_tokens_seen": 106026112, + "step": 87115 + }, + { + "epoch": 9.702639492148347, + "grad_norm": 0.02911212295293808, + "learning_rate": 3.061302752596768e-05, + "loss": 0.0753, + "num_input_tokens_seen": 106032384, + "step": 87120 + }, + { + "epoch": 9.703196347031964, + "grad_norm": 0.08296716213226318, + "learning_rate": 3.0610659794750946e-05, + "loss": 0.1022, + "num_input_tokens_seen": 106038752, + "step": 87125 + }, + { + "epoch": 9.70375320191558, + "grad_norm": 0.48354586958885193, + "learning_rate": 3.060829201053703e-05, + "loss": 0.027, + "num_input_tokens_seen": 106044000, + "step": 87130 + }, + { + "epoch": 9.704310056799198, + "grad_norm": 1.1886180639266968, + "learning_rate": 3.060592417334831e-05, + "loss": 0.0315, + "num_input_tokens_seen": 106050336, + "step": 87135 + }, + { + "epoch": 9.704866911682815, + "grad_norm": 0.05554790422320366, + "learning_rate": 3.060355628320714e-05, + "loss": 0.0542, + "num_input_tokens_seen": 106056640, + "step": 87140 + }, + { + "epoch": 9.705423766566433, + "grad_norm": 0.012669527903199196, + "learning_rate": 3.060118834013589e-05, + "loss": 0.0126, + "num_input_tokens_seen": 106062400, + "step": 87145 + }, + { + "epoch": 9.70598062145005, + "grad_norm": 2.102081537246704, + "learning_rate": 3.059882034415693e-05, + "loss": 0.1042, + "num_input_tokens_seen": 106068608, + "step": 87150 + }, + { + "epoch": 9.706537476333667, + "grad_norm": 0.1962527334690094, + "learning_rate": 3.0596452295292626e-05, + "loss": 0.0229, + "num_input_tokens_seen": 106074528, + "step": 87155 + }, + { + "epoch": 9.707094331217284, + "grad_norm": 0.002515099011361599, + "learning_rate": 3.0594084193565353e-05, + "loss": 0.0053, + "num_input_tokens_seen": 106080640, + "step": 87160 + }, + { + "epoch": 9.707651186100902, + "grad_norm": 0.015509974211454391, + "learning_rate": 3.059171603899746e-05, + "loss": 0.0556, + "num_input_tokens_seen": 106086880, + "step": 87165 + }, + { + "epoch": 9.70820804098452, + "grad_norm": 0.11801024526357651, + "learning_rate": 3.058934783161134e-05, + "loss": 0.123, + "num_input_tokens_seen": 106092480, + "step": 87170 + }, + { + "epoch": 9.708764895868137, + "grad_norm": 1.1720188856124878, + "learning_rate": 3.0586979571429344e-05, + "loss": 0.0243, + "num_input_tokens_seen": 106098752, + "step": 87175 + }, + { + "epoch": 9.709321750751753, + "grad_norm": 0.2722305357456207, + "learning_rate": 3.0584611258473854e-05, + "loss": 0.0547, + "num_input_tokens_seen": 106104640, + "step": 87180 + }, + { + "epoch": 9.709878605635371, + "grad_norm": 0.5546491742134094, + "learning_rate": 3.0582242892767246e-05, + "loss": 0.0307, + "num_input_tokens_seen": 106110848, + "step": 87185 + }, + { + "epoch": 9.710435460518989, + "grad_norm": 0.0002193976688431576, + "learning_rate": 3.057987447433187e-05, + "loss": 0.0081, + "num_input_tokens_seen": 106117088, + "step": 87190 + }, + { + "epoch": 9.710992315402606, + "grad_norm": 0.0330176018178463, + "learning_rate": 3.057750600319011e-05, + "loss": 0.0036, + "num_input_tokens_seen": 106123456, + "step": 87195 + }, + { + "epoch": 9.711549170286224, + "grad_norm": 0.7895150780677795, + "learning_rate": 3.057513747936434e-05, + "loss": 0.0259, + "num_input_tokens_seen": 106129216, + "step": 87200 + }, + { + "epoch": 9.712106025169842, + "grad_norm": 0.8180335760116577, + "learning_rate": 3.057276890287693e-05, + "loss": 0.0254, + "num_input_tokens_seen": 106135296, + "step": 87205 + }, + { + "epoch": 9.712662880053458, + "grad_norm": 0.5422626733779907, + "learning_rate": 3.057040027375024e-05, + "loss": 0.1084, + "num_input_tokens_seen": 106141152, + "step": 87210 + }, + { + "epoch": 9.713219734937075, + "grad_norm": 0.7225140333175659, + "learning_rate": 3.056803159200666e-05, + "loss": 0.0707, + "num_input_tokens_seen": 106147200, + "step": 87215 + }, + { + "epoch": 9.713776589820693, + "grad_norm": 0.03320716693997383, + "learning_rate": 3.056566285766858e-05, + "loss": 0.0722, + "num_input_tokens_seen": 106153248, + "step": 87220 + }, + { + "epoch": 9.71433344470431, + "grad_norm": 0.1309550404548645, + "learning_rate": 3.056329407075834e-05, + "loss": 0.0785, + "num_input_tokens_seen": 106159424, + "step": 87225 + }, + { + "epoch": 9.714890299587928, + "grad_norm": 0.014338862150907516, + "learning_rate": 3.0560925231298334e-05, + "loss": 0.0445, + "num_input_tokens_seen": 106165440, + "step": 87230 + }, + { + "epoch": 9.715447154471544, + "grad_norm": 0.03608594089746475, + "learning_rate": 3.055855633931093e-05, + "loss": 0.0918, + "num_input_tokens_seen": 106171296, + "step": 87235 + }, + { + "epoch": 9.716004009355162, + "grad_norm": 0.31354856491088867, + "learning_rate": 3.055618739481851e-05, + "loss": 0.0452, + "num_input_tokens_seen": 106177344, + "step": 87240 + }, + { + "epoch": 9.71656086423878, + "grad_norm": 0.026575637981295586, + "learning_rate": 3.055381839784345e-05, + "loss": 0.0887, + "num_input_tokens_seen": 106183360, + "step": 87245 + }, + { + "epoch": 9.717117719122397, + "grad_norm": 0.3045003414154053, + "learning_rate": 3.0551449348408125e-05, + "loss": 0.0248, + "num_input_tokens_seen": 106189344, + "step": 87250 + }, + { + "epoch": 9.717674574006015, + "grad_norm": 0.17610502243041992, + "learning_rate": 3.054908024653491e-05, + "loss": 0.0173, + "num_input_tokens_seen": 106195680, + "step": 87255 + }, + { + "epoch": 9.71823142888963, + "grad_norm": 0.5940115451812744, + "learning_rate": 3.054671109224619e-05, + "loss": 0.0455, + "num_input_tokens_seen": 106201184, + "step": 87260 + }, + { + "epoch": 9.718788283773248, + "grad_norm": 1.537591576576233, + "learning_rate": 3.0544341885564344e-05, + "loss": 0.0544, + "num_input_tokens_seen": 106207520, + "step": 87265 + }, + { + "epoch": 9.719345138656866, + "grad_norm": 0.07101114839315414, + "learning_rate": 3.054197262651173e-05, + "loss": 0.0651, + "num_input_tokens_seen": 106212992, + "step": 87270 + }, + { + "epoch": 9.719901993540484, + "grad_norm": 1.0312721729278564, + "learning_rate": 3.053960331511076e-05, + "loss": 0.041, + "num_input_tokens_seen": 106218944, + "step": 87275 + }, + { + "epoch": 9.720458848424101, + "grad_norm": 0.003056140849366784, + "learning_rate": 3.05372339513838e-05, + "loss": 0.0929, + "num_input_tokens_seen": 106225248, + "step": 87280 + }, + { + "epoch": 9.721015703307717, + "grad_norm": 0.3778272569179535, + "learning_rate": 3.053486453535322e-05, + "loss": 0.0702, + "num_input_tokens_seen": 106230784, + "step": 87285 + }, + { + "epoch": 9.721572558191335, + "grad_norm": 3.5557079315185547, + "learning_rate": 3.053249506704142e-05, + "loss": 0.1424, + "num_input_tokens_seen": 106236992, + "step": 87290 + }, + { + "epoch": 9.722129413074953, + "grad_norm": 0.6346990466117859, + "learning_rate": 3.0530125546470756e-05, + "loss": 0.0157, + "num_input_tokens_seen": 106242816, + "step": 87295 + }, + { + "epoch": 9.72268626795857, + "grad_norm": 0.18769192695617676, + "learning_rate": 3.052775597366364e-05, + "loss": 0.0557, + "num_input_tokens_seen": 106248896, + "step": 87300 + }, + { + "epoch": 9.723243122842188, + "grad_norm": 0.9345420002937317, + "learning_rate": 3.052538634864243e-05, + "loss": 0.0421, + "num_input_tokens_seen": 106255008, + "step": 87305 + }, + { + "epoch": 9.723799977725804, + "grad_norm": 2.5454251766204834, + "learning_rate": 3.052301667142952e-05, + "loss": 0.1873, + "num_input_tokens_seen": 106261504, + "step": 87310 + }, + { + "epoch": 9.724356832609422, + "grad_norm": 0.05601733177900314, + "learning_rate": 3.05206469420473e-05, + "loss": 0.0118, + "num_input_tokens_seen": 106267616, + "step": 87315 + }, + { + "epoch": 9.72491368749304, + "grad_norm": 0.08650465309619904, + "learning_rate": 3.0518277160518143e-05, + "loss": 0.0178, + "num_input_tokens_seen": 106273728, + "step": 87320 + }, + { + "epoch": 9.725470542376657, + "grad_norm": 1.279615044593811, + "learning_rate": 3.051590732686444e-05, + "loss": 0.0707, + "num_input_tokens_seen": 106279776, + "step": 87325 + }, + { + "epoch": 9.726027397260275, + "grad_norm": 0.3222278654575348, + "learning_rate": 3.0513537441108565e-05, + "loss": 0.0761, + "num_input_tokens_seen": 106285856, + "step": 87330 + }, + { + "epoch": 9.72658425214389, + "grad_norm": 2.2295784950256348, + "learning_rate": 3.0511167503272913e-05, + "loss": 0.0576, + "num_input_tokens_seen": 106291776, + "step": 87335 + }, + { + "epoch": 9.727141107027508, + "grad_norm": 0.06669821590185165, + "learning_rate": 3.0508797513379876e-05, + "loss": 0.014, + "num_input_tokens_seen": 106297952, + "step": 87340 + }, + { + "epoch": 9.727697961911126, + "grad_norm": 1.0833344459533691, + "learning_rate": 3.0506427471451827e-05, + "loss": 0.0639, + "num_input_tokens_seen": 106304064, + "step": 87345 + }, + { + "epoch": 9.728254816794744, + "grad_norm": 0.0012494238326326013, + "learning_rate": 3.0504057377511164e-05, + "loss": 0.1499, + "num_input_tokens_seen": 106309888, + "step": 87350 + }, + { + "epoch": 9.728811671678361, + "grad_norm": 0.14205776154994965, + "learning_rate": 3.0501687231580265e-05, + "loss": 0.0652, + "num_input_tokens_seen": 106316064, + "step": 87355 + }, + { + "epoch": 9.729368526561977, + "grad_norm": 0.8722509145736694, + "learning_rate": 3.0499317033681524e-05, + "loss": 0.0815, + "num_input_tokens_seen": 106322112, + "step": 87360 + }, + { + "epoch": 9.729925381445595, + "grad_norm": 2.2125205993652344, + "learning_rate": 3.0496946783837325e-05, + "loss": 0.0821, + "num_input_tokens_seen": 106327744, + "step": 87365 + }, + { + "epoch": 9.730482236329212, + "grad_norm": 0.09735765308141708, + "learning_rate": 3.0494576482070058e-05, + "loss": 0.172, + "num_input_tokens_seen": 106333600, + "step": 87370 + }, + { + "epoch": 9.73103909121283, + "grad_norm": 0.004865748807787895, + "learning_rate": 3.0492206128402123e-05, + "loss": 0.0584, + "num_input_tokens_seen": 106339488, + "step": 87375 + }, + { + "epoch": 9.731595946096448, + "grad_norm": 0.929212212562561, + "learning_rate": 3.048983572285589e-05, + "loss": 0.0625, + "num_input_tokens_seen": 106345664, + "step": 87380 + }, + { + "epoch": 9.732152800980064, + "grad_norm": 1.7346981763839722, + "learning_rate": 3.048746526545377e-05, + "loss": 0.101, + "num_input_tokens_seen": 106351520, + "step": 87385 + }, + { + "epoch": 9.732709655863681, + "grad_norm": 0.8073269128799438, + "learning_rate": 3.0485094756218134e-05, + "loss": 0.0093, + "num_input_tokens_seen": 106357824, + "step": 87390 + }, + { + "epoch": 9.733266510747299, + "grad_norm": 1.3787877559661865, + "learning_rate": 3.0482724195171398e-05, + "loss": 0.0802, + "num_input_tokens_seen": 106363360, + "step": 87395 + }, + { + "epoch": 9.733823365630917, + "grad_norm": 0.5389178991317749, + "learning_rate": 3.0480353582335926e-05, + "loss": 0.0845, + "num_input_tokens_seen": 106369152, + "step": 87400 + }, + { + "epoch": 9.734380220514534, + "grad_norm": 0.8993239998817444, + "learning_rate": 3.0477982917734126e-05, + "loss": 0.1027, + "num_input_tokens_seen": 106375520, + "step": 87405 + }, + { + "epoch": 9.73493707539815, + "grad_norm": 0.5386711359024048, + "learning_rate": 3.047561220138839e-05, + "loss": 0.0513, + "num_input_tokens_seen": 106381696, + "step": 87410 + }, + { + "epoch": 9.735493930281768, + "grad_norm": 0.31026625633239746, + "learning_rate": 3.047324143332111e-05, + "loss": 0.0159, + "num_input_tokens_seen": 106388032, + "step": 87415 + }, + { + "epoch": 9.736050785165386, + "grad_norm": 0.6856677532196045, + "learning_rate": 3.047087061355468e-05, + "loss": 0.0309, + "num_input_tokens_seen": 106394112, + "step": 87420 + }, + { + "epoch": 9.736607640049003, + "grad_norm": 0.5824652910232544, + "learning_rate": 3.0468499742111497e-05, + "loss": 0.0866, + "num_input_tokens_seen": 106400256, + "step": 87425 + }, + { + "epoch": 9.737164494932621, + "grad_norm": 0.04838362708687782, + "learning_rate": 3.0466128819013944e-05, + "loss": 0.0724, + "num_input_tokens_seen": 106406496, + "step": 87430 + }, + { + "epoch": 9.737721349816239, + "grad_norm": 0.21091970801353455, + "learning_rate": 3.046375784428443e-05, + "loss": 0.0178, + "num_input_tokens_seen": 106412032, + "step": 87435 + }, + { + "epoch": 9.738278204699855, + "grad_norm": 0.06920304149389267, + "learning_rate": 3.046138681794535e-05, + "loss": 0.0106, + "num_input_tokens_seen": 106418336, + "step": 87440 + }, + { + "epoch": 9.738835059583472, + "grad_norm": 0.5328476428985596, + "learning_rate": 3.045901574001909e-05, + "loss": 0.0591, + "num_input_tokens_seen": 106424288, + "step": 87445 + }, + { + "epoch": 9.73939191446709, + "grad_norm": 1.9694534540176392, + "learning_rate": 3.0456644610528053e-05, + "loss": 0.1256, + "num_input_tokens_seen": 106430272, + "step": 87450 + }, + { + "epoch": 9.739948769350708, + "grad_norm": 0.002132068620994687, + "learning_rate": 3.045427342949464e-05, + "loss": 0.0404, + "num_input_tokens_seen": 106435904, + "step": 87455 + }, + { + "epoch": 9.740505624234325, + "grad_norm": 0.5639649033546448, + "learning_rate": 3.045190219694124e-05, + "loss": 0.033, + "num_input_tokens_seen": 106442016, + "step": 87460 + }, + { + "epoch": 9.741062479117941, + "grad_norm": 0.4601665735244751, + "learning_rate": 3.044953091289025e-05, + "loss": 0.0587, + "num_input_tokens_seen": 106448640, + "step": 87465 + }, + { + "epoch": 9.741619334001559, + "grad_norm": 0.15969575941562653, + "learning_rate": 3.0447159577364094e-05, + "loss": 0.0227, + "num_input_tokens_seen": 106454336, + "step": 87470 + }, + { + "epoch": 9.742176188885177, + "grad_norm": 0.013906560838222504, + "learning_rate": 3.0444788190385137e-05, + "loss": 0.0154, + "num_input_tokens_seen": 106460384, + "step": 87475 + }, + { + "epoch": 9.742733043768794, + "grad_norm": 0.0015588818350806832, + "learning_rate": 3.04424167519758e-05, + "loss": 0.0085, + "num_input_tokens_seen": 106466656, + "step": 87480 + }, + { + "epoch": 9.743289898652412, + "grad_norm": 0.015624037012457848, + "learning_rate": 3.044004526215847e-05, + "loss": 0.0013, + "num_input_tokens_seen": 106472672, + "step": 87485 + }, + { + "epoch": 9.743846753536028, + "grad_norm": 0.15283450484275818, + "learning_rate": 3.043767372095555e-05, + "loss": 0.0316, + "num_input_tokens_seen": 106478304, + "step": 87490 + }, + { + "epoch": 9.744403608419645, + "grad_norm": 0.03692327439785004, + "learning_rate": 3.0435302128389455e-05, + "loss": 0.0201, + "num_input_tokens_seen": 106484608, + "step": 87495 + }, + { + "epoch": 9.744960463303263, + "grad_norm": 0.00833162385970354, + "learning_rate": 3.0432930484482568e-05, + "loss": 0.0014, + "num_input_tokens_seen": 106491008, + "step": 87500 + }, + { + "epoch": 9.74551731818688, + "grad_norm": 0.0005284076905809343, + "learning_rate": 3.0430558789257312e-05, + "loss": 0.0254, + "num_input_tokens_seen": 106496512, + "step": 87505 + }, + { + "epoch": 9.746074173070499, + "grad_norm": 1.9779690504074097, + "learning_rate": 3.0428187042736067e-05, + "loss": 0.2692, + "num_input_tokens_seen": 106502592, + "step": 87510 + }, + { + "epoch": 9.746631027954114, + "grad_norm": 0.7435001730918884, + "learning_rate": 3.0425815244941248e-05, + "loss": 0.0568, + "num_input_tokens_seen": 106508576, + "step": 87515 + }, + { + "epoch": 9.747187882837732, + "grad_norm": 0.0057016219943761826, + "learning_rate": 3.0423443395895263e-05, + "loss": 0.002, + "num_input_tokens_seen": 106514720, + "step": 87520 + }, + { + "epoch": 9.74774473772135, + "grad_norm": 0.06727329641580582, + "learning_rate": 3.0421071495620502e-05, + "loss": 0.0149, + "num_input_tokens_seen": 106520480, + "step": 87525 + }, + { + "epoch": 9.748301592604967, + "grad_norm": 0.15091988444328308, + "learning_rate": 3.0418699544139384e-05, + "loss": 0.0484, + "num_input_tokens_seen": 106526464, + "step": 87530 + }, + { + "epoch": 9.748858447488585, + "grad_norm": 0.0020150276832282543, + "learning_rate": 3.0416327541474298e-05, + "loss": 0.0199, + "num_input_tokens_seen": 106532768, + "step": 87535 + }, + { + "epoch": 9.749415302372201, + "grad_norm": 0.7451679110527039, + "learning_rate": 3.0413955487647673e-05, + "loss": 0.0271, + "num_input_tokens_seen": 106538816, + "step": 87540 + }, + { + "epoch": 9.749972157255819, + "grad_norm": 0.0013124961405992508, + "learning_rate": 3.0411583382681885e-05, + "loss": 0.0079, + "num_input_tokens_seen": 106545248, + "step": 87545 + }, + { + "epoch": 9.750529012139436, + "grad_norm": 1.527331829071045, + "learning_rate": 3.0409211226599366e-05, + "loss": 0.0933, + "num_input_tokens_seen": 106551104, + "step": 87550 + }, + { + "epoch": 9.751085867023054, + "grad_norm": 0.5975748896598816, + "learning_rate": 3.040683901942251e-05, + "loss": 0.0269, + "num_input_tokens_seen": 106557152, + "step": 87555 + }, + { + "epoch": 9.751642721906672, + "grad_norm": 0.18823017179965973, + "learning_rate": 3.0404466761173727e-05, + "loss": 0.0361, + "num_input_tokens_seen": 106563264, + "step": 87560 + }, + { + "epoch": 9.75219957679029, + "grad_norm": 1.0289583206176758, + "learning_rate": 3.040209445187543e-05, + "loss": 0.1168, + "num_input_tokens_seen": 106568800, + "step": 87565 + }, + { + "epoch": 9.752756431673905, + "grad_norm": 0.6733640432357788, + "learning_rate": 3.039972209155002e-05, + "loss": 0.0104, + "num_input_tokens_seen": 106575008, + "step": 87570 + }, + { + "epoch": 9.753313286557523, + "grad_norm": 0.002610976342111826, + "learning_rate": 3.039734968021991e-05, + "loss": 0.0224, + "num_input_tokens_seen": 106580928, + "step": 87575 + }, + { + "epoch": 9.75387014144114, + "grad_norm": 0.268251895904541, + "learning_rate": 3.0394977217907506e-05, + "loss": 0.041, + "num_input_tokens_seen": 106586976, + "step": 87580 + }, + { + "epoch": 9.754426996324758, + "grad_norm": 0.0010976862395182252, + "learning_rate": 3.0392604704635218e-05, + "loss": 0.0345, + "num_input_tokens_seen": 106593280, + "step": 87585 + }, + { + "epoch": 9.754983851208376, + "grad_norm": 1.3079861402511597, + "learning_rate": 3.0390232140425462e-05, + "loss": 0.0421, + "num_input_tokens_seen": 106599360, + "step": 87590 + }, + { + "epoch": 9.755540706091992, + "grad_norm": 3.1243069171905518, + "learning_rate": 3.0387859525300644e-05, + "loss": 0.0487, + "num_input_tokens_seen": 106605632, + "step": 87595 + }, + { + "epoch": 9.75609756097561, + "grad_norm": 0.013296476565301418, + "learning_rate": 3.038548685928318e-05, + "loss": 0.0336, + "num_input_tokens_seen": 106611712, + "step": 87600 + }, + { + "epoch": 9.756654415859227, + "grad_norm": 1.3818567991256714, + "learning_rate": 3.038311414239547e-05, + "loss": 0.0575, + "num_input_tokens_seen": 106618016, + "step": 87605 + }, + { + "epoch": 9.757211270742845, + "grad_norm": 0.13861840963363647, + "learning_rate": 3.0380741374659933e-05, + "loss": 0.0128, + "num_input_tokens_seen": 106623968, + "step": 87610 + }, + { + "epoch": 9.757768125626463, + "grad_norm": 0.08383746445178986, + "learning_rate": 3.037836855609899e-05, + "loss": 0.0543, + "num_input_tokens_seen": 106629792, + "step": 87615 + }, + { + "epoch": 9.758324980510078, + "grad_norm": 0.05762116238474846, + "learning_rate": 3.0375995686735043e-05, + "loss": 0.037, + "num_input_tokens_seen": 106636064, + "step": 87620 + }, + { + "epoch": 9.758881835393696, + "grad_norm": 0.3072187006473541, + "learning_rate": 3.0373622766590516e-05, + "loss": 0.017, + "num_input_tokens_seen": 106641920, + "step": 87625 + }, + { + "epoch": 9.759438690277314, + "grad_norm": 1.0976052284240723, + "learning_rate": 3.0371249795687804e-05, + "loss": 0.1197, + "num_input_tokens_seen": 106647680, + "step": 87630 + }, + { + "epoch": 9.759995545160931, + "grad_norm": 0.00036594163975678384, + "learning_rate": 3.0368876774049347e-05, + "loss": 0.0025, + "num_input_tokens_seen": 106653952, + "step": 87635 + }, + { + "epoch": 9.76055240004455, + "grad_norm": 0.46607133746147156, + "learning_rate": 3.036650370169754e-05, + "loss": 0.0613, + "num_input_tokens_seen": 106660192, + "step": 87640 + }, + { + "epoch": 9.761109254928165, + "grad_norm": 0.00042092803050763905, + "learning_rate": 3.0364130578654805e-05, + "loss": 0.0012, + "num_input_tokens_seen": 106666400, + "step": 87645 + }, + { + "epoch": 9.761666109811783, + "grad_norm": 0.1433032751083374, + "learning_rate": 3.0361757404943562e-05, + "loss": 0.004, + "num_input_tokens_seen": 106672416, + "step": 87650 + }, + { + "epoch": 9.7622229646954, + "grad_norm": 0.01353619433939457, + "learning_rate": 3.0359384180586224e-05, + "loss": 0.0208, + "num_input_tokens_seen": 106678560, + "step": 87655 + }, + { + "epoch": 9.762779819579018, + "grad_norm": 0.0002018475061049685, + "learning_rate": 3.0357010905605216e-05, + "loss": 0.0121, + "num_input_tokens_seen": 106684544, + "step": 87660 + }, + { + "epoch": 9.763336674462636, + "grad_norm": 1.2707927227020264, + "learning_rate": 3.0354637580022938e-05, + "loss": 0.1844, + "num_input_tokens_seen": 106690592, + "step": 87665 + }, + { + "epoch": 9.763893529346252, + "grad_norm": 0.08396857976913452, + "learning_rate": 3.0352264203861825e-05, + "loss": 0.2115, + "num_input_tokens_seen": 106696800, + "step": 87670 + }, + { + "epoch": 9.76445038422987, + "grad_norm": 0.6930649876594543, + "learning_rate": 3.034989077714428e-05, + "loss": 0.0105, + "num_input_tokens_seen": 106702944, + "step": 87675 + }, + { + "epoch": 9.765007239113487, + "grad_norm": 0.008527952246367931, + "learning_rate": 3.0347517299892737e-05, + "loss": 0.1135, + "num_input_tokens_seen": 106709120, + "step": 87680 + }, + { + "epoch": 9.765564093997105, + "grad_norm": 0.006770595908164978, + "learning_rate": 3.034514377212961e-05, + "loss": 0.1161, + "num_input_tokens_seen": 106715616, + "step": 87685 + }, + { + "epoch": 9.766120948880722, + "grad_norm": 0.0031591574661433697, + "learning_rate": 3.034277019387731e-05, + "loss": 0.0083, + "num_input_tokens_seen": 106721184, + "step": 87690 + }, + { + "epoch": 9.766677803764338, + "grad_norm": 0.0029520071111619473, + "learning_rate": 3.034039656515827e-05, + "loss": 0.0315, + "num_input_tokens_seen": 106727136, + "step": 87695 + }, + { + "epoch": 9.767234658647956, + "grad_norm": 0.2140112966299057, + "learning_rate": 3.0338022885994904e-05, + "loss": 0.0219, + "num_input_tokens_seen": 106732704, + "step": 87700 + }, + { + "epoch": 9.767791513531574, + "grad_norm": 0.019533181563019753, + "learning_rate": 3.033564915640964e-05, + "loss": 0.0233, + "num_input_tokens_seen": 106739168, + "step": 87705 + }, + { + "epoch": 9.768348368415191, + "grad_norm": 1.2833006381988525, + "learning_rate": 3.0333275376424885e-05, + "loss": 0.031, + "num_input_tokens_seen": 106745312, + "step": 87710 + }, + { + "epoch": 9.768905223298809, + "grad_norm": 0.605797529220581, + "learning_rate": 3.033090154606308e-05, + "loss": 0.0099, + "num_input_tokens_seen": 106751392, + "step": 87715 + }, + { + "epoch": 9.769462078182425, + "grad_norm": 0.5571209788322449, + "learning_rate": 3.0328527665346633e-05, + "loss": 0.1062, + "num_input_tokens_seen": 106757536, + "step": 87720 + }, + { + "epoch": 9.770018933066043, + "grad_norm": 1.265629768371582, + "learning_rate": 3.032615373429798e-05, + "loss": 0.0479, + "num_input_tokens_seen": 106763392, + "step": 87725 + }, + { + "epoch": 9.77057578794966, + "grad_norm": 1.0347031354904175, + "learning_rate": 3.0323779752939535e-05, + "loss": 0.2134, + "num_input_tokens_seen": 106768896, + "step": 87730 + }, + { + "epoch": 9.771132642833278, + "grad_norm": 0.6778856515884399, + "learning_rate": 3.0321405721293723e-05, + "loss": 0.0595, + "num_input_tokens_seen": 106774848, + "step": 87735 + }, + { + "epoch": 9.771689497716896, + "grad_norm": 0.4067601263523102, + "learning_rate": 3.0319031639382966e-05, + "loss": 0.0666, + "num_input_tokens_seen": 106780928, + "step": 87740 + }, + { + "epoch": 9.772246352600511, + "grad_norm": 1.1918431520462036, + "learning_rate": 3.03166575072297e-05, + "loss": 0.0611, + "num_input_tokens_seen": 106787008, + "step": 87745 + }, + { + "epoch": 9.77280320748413, + "grad_norm": 0.07318370789289474, + "learning_rate": 3.031428332485634e-05, + "loss": 0.0093, + "num_input_tokens_seen": 106793312, + "step": 87750 + }, + { + "epoch": 9.773360062367747, + "grad_norm": 0.005177826154977083, + "learning_rate": 3.0311909092285322e-05, + "loss": 0.0654, + "num_input_tokens_seen": 106799456, + "step": 87755 + }, + { + "epoch": 9.773916917251364, + "grad_norm": 0.008624203503131866, + "learning_rate": 3.0309534809539066e-05, + "loss": 0.0337, + "num_input_tokens_seen": 106805760, + "step": 87760 + }, + { + "epoch": 9.774473772134982, + "grad_norm": 0.9124932289123535, + "learning_rate": 3.0307160476640002e-05, + "loss": 0.0996, + "num_input_tokens_seen": 106811584, + "step": 87765 + }, + { + "epoch": 9.775030627018598, + "grad_norm": 0.3933631181716919, + "learning_rate": 3.0304786093610547e-05, + "loss": 0.0531, + "num_input_tokens_seen": 106817632, + "step": 87770 + }, + { + "epoch": 9.775587481902216, + "grad_norm": 0.7657262682914734, + "learning_rate": 3.030241166047314e-05, + "loss": 0.1085, + "num_input_tokens_seen": 106823584, + "step": 87775 + }, + { + "epoch": 9.776144336785833, + "grad_norm": 0.0011046731378883123, + "learning_rate": 3.0300037177250205e-05, + "loss": 0.011, + "num_input_tokens_seen": 106829664, + "step": 87780 + }, + { + "epoch": 9.776701191669451, + "grad_norm": 0.4677371382713318, + "learning_rate": 3.0297662643964176e-05, + "loss": 0.0866, + "num_input_tokens_seen": 106835776, + "step": 87785 + }, + { + "epoch": 9.777258046553069, + "grad_norm": 2.289837121963501, + "learning_rate": 3.0295288060637484e-05, + "loss": 0.1218, + "num_input_tokens_seen": 106842112, + "step": 87790 + }, + { + "epoch": 9.777814901436686, + "grad_norm": 0.0010210612090304494, + "learning_rate": 3.0292913427292545e-05, + "loss": 0.008, + "num_input_tokens_seen": 106848608, + "step": 87795 + }, + { + "epoch": 9.778371756320302, + "grad_norm": 0.0018272658344358206, + "learning_rate": 3.02905387439518e-05, + "loss": 0.0379, + "num_input_tokens_seen": 106855072, + "step": 87800 + }, + { + "epoch": 9.77892861120392, + "grad_norm": 0.04301817715167999, + "learning_rate": 3.028816401063768e-05, + "loss": 0.0461, + "num_input_tokens_seen": 106861312, + "step": 87805 + }, + { + "epoch": 9.779485466087538, + "grad_norm": 0.01338761392980814, + "learning_rate": 3.0285789227372612e-05, + "loss": 0.0517, + "num_input_tokens_seen": 106867552, + "step": 87810 + }, + { + "epoch": 9.780042320971155, + "grad_norm": 0.0013374313712120056, + "learning_rate": 3.0283414394179034e-05, + "loss": 0.0002, + "num_input_tokens_seen": 106874016, + "step": 87815 + }, + { + "epoch": 9.780599175854773, + "grad_norm": 0.1636655479669571, + "learning_rate": 3.028103951107937e-05, + "loss": 0.0372, + "num_input_tokens_seen": 106879840, + "step": 87820 + }, + { + "epoch": 9.781156030738389, + "grad_norm": 1.2238346338272095, + "learning_rate": 3.027866457809606e-05, + "loss": 0.0169, + "num_input_tokens_seen": 106885856, + "step": 87825 + }, + { + "epoch": 9.781712885622007, + "grad_norm": 0.6100024580955505, + "learning_rate": 3.027628959525153e-05, + "loss": 0.0651, + "num_input_tokens_seen": 106891968, + "step": 87830 + }, + { + "epoch": 9.782269740505624, + "grad_norm": 0.09009654819965363, + "learning_rate": 3.0273914562568218e-05, + "loss": 0.0444, + "num_input_tokens_seen": 106897824, + "step": 87835 + }, + { + "epoch": 9.782826595389242, + "grad_norm": 0.0001173011987702921, + "learning_rate": 3.027153948006856e-05, + "loss": 0.0025, + "num_input_tokens_seen": 106904064, + "step": 87840 + }, + { + "epoch": 9.78338345027286, + "grad_norm": 0.06685404479503632, + "learning_rate": 3.0269164347774987e-05, + "loss": 0.0753, + "num_input_tokens_seen": 106909664, + "step": 87845 + }, + { + "epoch": 9.783940305156476, + "grad_norm": 0.000468717043986544, + "learning_rate": 3.0266789165709937e-05, + "loss": 0.0047, + "num_input_tokens_seen": 106916256, + "step": 87850 + }, + { + "epoch": 9.784497160040093, + "grad_norm": 2.371258497238159, + "learning_rate": 3.026441393389584e-05, + "loss": 0.1251, + "num_input_tokens_seen": 106922880, + "step": 87855 + }, + { + "epoch": 9.78505401492371, + "grad_norm": 1.095346450805664, + "learning_rate": 3.026203865235514e-05, + "loss": 0.0871, + "num_input_tokens_seen": 106928832, + "step": 87860 + }, + { + "epoch": 9.785610869807329, + "grad_norm": 0.7317836284637451, + "learning_rate": 3.0259663321110265e-05, + "loss": 0.0572, + "num_input_tokens_seen": 106934752, + "step": 87865 + }, + { + "epoch": 9.786167724690946, + "grad_norm": 0.36475691199302673, + "learning_rate": 3.0257287940183654e-05, + "loss": 0.0357, + "num_input_tokens_seen": 106940832, + "step": 87870 + }, + { + "epoch": 9.786724579574562, + "grad_norm": 0.2026854008436203, + "learning_rate": 3.025491250959775e-05, + "loss": 0.0033, + "num_input_tokens_seen": 106946944, + "step": 87875 + }, + { + "epoch": 9.78728143445818, + "grad_norm": 1.1414469480514526, + "learning_rate": 3.0252537029374994e-05, + "loss": 0.0417, + "num_input_tokens_seen": 106952800, + "step": 87880 + }, + { + "epoch": 9.787838289341797, + "grad_norm": 0.008593316189944744, + "learning_rate": 3.0250161499537803e-05, + "loss": 0.141, + "num_input_tokens_seen": 106959456, + "step": 87885 + }, + { + "epoch": 9.788395144225415, + "grad_norm": 0.007359049282968044, + "learning_rate": 3.024778592010864e-05, + "loss": 0.0215, + "num_input_tokens_seen": 106965568, + "step": 87890 + }, + { + "epoch": 9.788951999109033, + "grad_norm": 0.12998422980308533, + "learning_rate": 3.024541029110993e-05, + "loss": 0.0769, + "num_input_tokens_seen": 106971808, + "step": 87895 + }, + { + "epoch": 9.789508853992649, + "grad_norm": 0.5547216534614563, + "learning_rate": 3.0243034612564125e-05, + "loss": 0.0289, + "num_input_tokens_seen": 106977408, + "step": 87900 + }, + { + "epoch": 9.790065708876266, + "grad_norm": 0.23479026556015015, + "learning_rate": 3.024065888449365e-05, + "loss": 0.0347, + "num_input_tokens_seen": 106983744, + "step": 87905 + }, + { + "epoch": 9.790622563759884, + "grad_norm": 1.6765832901000977, + "learning_rate": 3.0238283106920957e-05, + "loss": 0.0592, + "num_input_tokens_seen": 106989920, + "step": 87910 + }, + { + "epoch": 9.791179418643502, + "grad_norm": 1.007741093635559, + "learning_rate": 3.023590727986848e-05, + "loss": 0.0246, + "num_input_tokens_seen": 106996000, + "step": 87915 + }, + { + "epoch": 9.79173627352712, + "grad_norm": 1.402830719947815, + "learning_rate": 3.023353140335866e-05, + "loss": 0.03, + "num_input_tokens_seen": 107001984, + "step": 87920 + }, + { + "epoch": 9.792293128410737, + "grad_norm": 0.0002526637399569154, + "learning_rate": 3.0231155477413952e-05, + "loss": 0.0052, + "num_input_tokens_seen": 107008288, + "step": 87925 + }, + { + "epoch": 9.792849983294353, + "grad_norm": 0.06190329045057297, + "learning_rate": 3.022877950205678e-05, + "loss": 0.0172, + "num_input_tokens_seen": 107014464, + "step": 87930 + }, + { + "epoch": 9.79340683817797, + "grad_norm": 0.39375972747802734, + "learning_rate": 3.0226403477309606e-05, + "loss": 0.0731, + "num_input_tokens_seen": 107020544, + "step": 87935 + }, + { + "epoch": 9.793963693061588, + "grad_norm": 0.07339408993721008, + "learning_rate": 3.022402740319486e-05, + "loss": 0.0348, + "num_input_tokens_seen": 107026464, + "step": 87940 + }, + { + "epoch": 9.794520547945206, + "grad_norm": 0.09623842686414719, + "learning_rate": 3.022165127973499e-05, + "loss": 0.1967, + "num_input_tokens_seen": 107032320, + "step": 87945 + }, + { + "epoch": 9.795077402828824, + "grad_norm": 0.6298523545265198, + "learning_rate": 3.0219275106952437e-05, + "loss": 0.017, + "num_input_tokens_seen": 107038624, + "step": 87950 + }, + { + "epoch": 9.79563425771244, + "grad_norm": 0.19672434031963348, + "learning_rate": 3.0216898884869648e-05, + "loss": 0.1624, + "num_input_tokens_seen": 107044800, + "step": 87955 + }, + { + "epoch": 9.796191112596057, + "grad_norm": 2.186755657196045, + "learning_rate": 3.0214522613509078e-05, + "loss": 0.0631, + "num_input_tokens_seen": 107051008, + "step": 87960 + }, + { + "epoch": 9.796747967479675, + "grad_norm": 1.0713192224502563, + "learning_rate": 3.0212146292893155e-05, + "loss": 0.0491, + "num_input_tokens_seen": 107057088, + "step": 87965 + }, + { + "epoch": 9.797304822363293, + "grad_norm": 0.28584930300712585, + "learning_rate": 3.020976992304434e-05, + "loss": 0.0267, + "num_input_tokens_seen": 107063328, + "step": 87970 + }, + { + "epoch": 9.79786167724691, + "grad_norm": 1.021989345550537, + "learning_rate": 3.020739350398507e-05, + "loss": 0.0936, + "num_input_tokens_seen": 107069472, + "step": 87975 + }, + { + "epoch": 9.798418532130526, + "grad_norm": 0.005160285625606775, + "learning_rate": 3.0205017035737804e-05, + "loss": 0.0545, + "num_input_tokens_seen": 107075680, + "step": 87980 + }, + { + "epoch": 9.798975387014144, + "grad_norm": 0.566876232624054, + "learning_rate": 3.0202640518324977e-05, + "loss": 0.0833, + "num_input_tokens_seen": 107081824, + "step": 87985 + }, + { + "epoch": 9.799532241897762, + "grad_norm": 0.8156289458274841, + "learning_rate": 3.0200263951769037e-05, + "loss": 0.0396, + "num_input_tokens_seen": 107088064, + "step": 87990 + }, + { + "epoch": 9.80008909678138, + "grad_norm": 0.0003243458631914109, + "learning_rate": 3.0197887336092447e-05, + "loss": 0.0569, + "num_input_tokens_seen": 107094464, + "step": 87995 + }, + { + "epoch": 9.800645951664997, + "grad_norm": 1.5261154174804688, + "learning_rate": 3.019551067131764e-05, + "loss": 0.0988, + "num_input_tokens_seen": 107100640, + "step": 88000 + }, + { + "epoch": 9.801202806548613, + "grad_norm": 0.3458182215690613, + "learning_rate": 3.0193133957467074e-05, + "loss": 0.0145, + "num_input_tokens_seen": 107106912, + "step": 88005 + }, + { + "epoch": 9.80175966143223, + "grad_norm": 0.003831684123724699, + "learning_rate": 3.0190757194563195e-05, + "loss": 0.0652, + "num_input_tokens_seen": 107113152, + "step": 88010 + }, + { + "epoch": 9.802316516315848, + "grad_norm": 1.2817671298980713, + "learning_rate": 3.0188380382628458e-05, + "loss": 0.1121, + "num_input_tokens_seen": 107119040, + "step": 88015 + }, + { + "epoch": 9.802873371199466, + "grad_norm": 2.488835573196411, + "learning_rate": 3.018600352168531e-05, + "loss": 0.0593, + "num_input_tokens_seen": 107124928, + "step": 88020 + }, + { + "epoch": 9.803430226083083, + "grad_norm": 1.0870016813278198, + "learning_rate": 3.0183626611756198e-05, + "loss": 0.0868, + "num_input_tokens_seen": 107131008, + "step": 88025 + }, + { + "epoch": 9.8039870809667, + "grad_norm": 1.0743972063064575, + "learning_rate": 3.0181249652863593e-05, + "loss": 0.0713, + "num_input_tokens_seen": 107137248, + "step": 88030 + }, + { + "epoch": 9.804543935850317, + "grad_norm": 0.0034428176004439592, + "learning_rate": 3.0178872645029928e-05, + "loss": 0.0599, + "num_input_tokens_seen": 107143488, + "step": 88035 + }, + { + "epoch": 9.805100790733935, + "grad_norm": 0.003670993959531188, + "learning_rate": 3.0176495588277658e-05, + "loss": 0.0758, + "num_input_tokens_seen": 107149120, + "step": 88040 + }, + { + "epoch": 9.805657645617552, + "grad_norm": 0.11630647629499435, + "learning_rate": 3.0174118482629242e-05, + "loss": 0.1045, + "num_input_tokens_seen": 107154880, + "step": 88045 + }, + { + "epoch": 9.80621450050117, + "grad_norm": 0.05638229846954346, + "learning_rate": 3.017174132810713e-05, + "loss": 0.0381, + "num_input_tokens_seen": 107161088, + "step": 88050 + }, + { + "epoch": 9.806771355384786, + "grad_norm": 2.2409465312957764, + "learning_rate": 3.0169364124733785e-05, + "loss": 0.094, + "num_input_tokens_seen": 107167136, + "step": 88055 + }, + { + "epoch": 9.807328210268404, + "grad_norm": 0.13061314821243286, + "learning_rate": 3.0166986872531644e-05, + "loss": 0.0845, + "num_input_tokens_seen": 107172992, + "step": 88060 + }, + { + "epoch": 9.807885065152021, + "grad_norm": 0.0006991309346631169, + "learning_rate": 3.0164609571523183e-05, + "loss": 0.086, + "num_input_tokens_seen": 107179264, + "step": 88065 + }, + { + "epoch": 9.808441920035639, + "grad_norm": 0.4646469056606293, + "learning_rate": 3.0162232221730835e-05, + "loss": 0.0605, + "num_input_tokens_seen": 107185216, + "step": 88070 + }, + { + "epoch": 9.808998774919257, + "grad_norm": 0.30300629138946533, + "learning_rate": 3.015985482317708e-05, + "loss": 0.0238, + "num_input_tokens_seen": 107191232, + "step": 88075 + }, + { + "epoch": 9.809555629802873, + "grad_norm": 1.8394650220870972, + "learning_rate": 3.0157477375884353e-05, + "loss": 0.064, + "num_input_tokens_seen": 107197408, + "step": 88080 + }, + { + "epoch": 9.81011248468649, + "grad_norm": 0.04293219745159149, + "learning_rate": 3.015509987987512e-05, + "loss": 0.0365, + "num_input_tokens_seen": 107203520, + "step": 88085 + }, + { + "epoch": 9.810669339570108, + "grad_norm": 0.06320304423570633, + "learning_rate": 3.0152722335171846e-05, + "loss": 0.033, + "num_input_tokens_seen": 107209696, + "step": 88090 + }, + { + "epoch": 9.811226194453726, + "grad_norm": 0.0010546145495027304, + "learning_rate": 3.015034474179697e-05, + "loss": 0.0115, + "num_input_tokens_seen": 107216000, + "step": 88095 + }, + { + "epoch": 9.811783049337343, + "grad_norm": 0.016272198408842087, + "learning_rate": 3.0147967099772973e-05, + "loss": 0.0374, + "num_input_tokens_seen": 107221696, + "step": 88100 + }, + { + "epoch": 9.81233990422096, + "grad_norm": 0.45863044261932373, + "learning_rate": 3.0145589409122292e-05, + "loss": 0.0379, + "num_input_tokens_seen": 107227616, + "step": 88105 + }, + { + "epoch": 9.812896759104577, + "grad_norm": 0.9676703214645386, + "learning_rate": 3.01432116698674e-05, + "loss": 0.0267, + "num_input_tokens_seen": 107233120, + "step": 88110 + }, + { + "epoch": 9.813453613988194, + "grad_norm": 0.053990159183740616, + "learning_rate": 3.014083388203076e-05, + "loss": 0.0652, + "num_input_tokens_seen": 107239328, + "step": 88115 + }, + { + "epoch": 9.814010468871812, + "grad_norm": 0.42994457483291626, + "learning_rate": 3.0138456045634817e-05, + "loss": 0.0631, + "num_input_tokens_seen": 107245152, + "step": 88120 + }, + { + "epoch": 9.81456732375543, + "grad_norm": 0.04782614856958389, + "learning_rate": 3.0136078160702046e-05, + "loss": 0.0072, + "num_input_tokens_seen": 107251200, + "step": 88125 + }, + { + "epoch": 9.815124178639046, + "grad_norm": 0.04264216125011444, + "learning_rate": 3.0133700227254897e-05, + "loss": 0.028, + "num_input_tokens_seen": 107257408, + "step": 88130 + }, + { + "epoch": 9.815681033522663, + "grad_norm": 0.7204816937446594, + "learning_rate": 3.013132224531584e-05, + "loss": 0.1298, + "num_input_tokens_seen": 107263264, + "step": 88135 + }, + { + "epoch": 9.816237888406281, + "grad_norm": 1.2372770309448242, + "learning_rate": 3.0128944214907328e-05, + "loss": 0.0651, + "num_input_tokens_seen": 107269600, + "step": 88140 + }, + { + "epoch": 9.816794743289899, + "grad_norm": 0.9360485672950745, + "learning_rate": 3.0126566136051832e-05, + "loss": 0.0225, + "num_input_tokens_seen": 107275776, + "step": 88145 + }, + { + "epoch": 9.817351598173516, + "grad_norm": 0.03193344175815582, + "learning_rate": 3.0124188008771815e-05, + "loss": 0.0462, + "num_input_tokens_seen": 107281984, + "step": 88150 + }, + { + "epoch": 9.817908453057134, + "grad_norm": 1.5511373281478882, + "learning_rate": 3.0121809833089733e-05, + "loss": 0.0813, + "num_input_tokens_seen": 107288192, + "step": 88155 + }, + { + "epoch": 9.81846530794075, + "grad_norm": 2.4178528785705566, + "learning_rate": 3.0119431609028053e-05, + "loss": 0.1159, + "num_input_tokens_seen": 107294272, + "step": 88160 + }, + { + "epoch": 9.819022162824368, + "grad_norm": 0.002210461301729083, + "learning_rate": 3.011705333660924e-05, + "loss": 0.0016, + "num_input_tokens_seen": 107300448, + "step": 88165 + }, + { + "epoch": 9.819579017707985, + "grad_norm": 0.01210559532046318, + "learning_rate": 3.0114675015855765e-05, + "loss": 0.026, + "num_input_tokens_seen": 107306624, + "step": 88170 + }, + { + "epoch": 9.820135872591603, + "grad_norm": 0.0024666464887559414, + "learning_rate": 3.0112296646790078e-05, + "loss": 0.003, + "num_input_tokens_seen": 107313088, + "step": 88175 + }, + { + "epoch": 9.82069272747522, + "grad_norm": 0.27467912435531616, + "learning_rate": 3.0109918229434653e-05, + "loss": 0.0077, + "num_input_tokens_seen": 107319264, + "step": 88180 + }, + { + "epoch": 9.821249582358837, + "grad_norm": 0.0032373983412981033, + "learning_rate": 3.010753976381196e-05, + "loss": 0.0015, + "num_input_tokens_seen": 107325568, + "step": 88185 + }, + { + "epoch": 9.821806437242454, + "grad_norm": 0.0036389201413840055, + "learning_rate": 3.010516124994446e-05, + "loss": 0.092, + "num_input_tokens_seen": 107331808, + "step": 88190 + }, + { + "epoch": 9.822363292126072, + "grad_norm": 0.025377444922924042, + "learning_rate": 3.0102782687854626e-05, + "loss": 0.0245, + "num_input_tokens_seen": 107337888, + "step": 88195 + }, + { + "epoch": 9.82292014700969, + "grad_norm": 0.08737301826477051, + "learning_rate": 3.0100404077564913e-05, + "loss": 0.0099, + "num_input_tokens_seen": 107344256, + "step": 88200 + }, + { + "epoch": 9.823477001893307, + "grad_norm": 0.25164565443992615, + "learning_rate": 3.0098025419097808e-05, + "loss": 0.0387, + "num_input_tokens_seen": 107350208, + "step": 88205 + }, + { + "epoch": 9.824033856776923, + "grad_norm": 0.08808556944131851, + "learning_rate": 3.0095646712475763e-05, + "loss": 0.0891, + "num_input_tokens_seen": 107356672, + "step": 88210 + }, + { + "epoch": 9.82459071166054, + "grad_norm": 0.34542718529701233, + "learning_rate": 3.009326795772125e-05, + "loss": 0.0552, + "num_input_tokens_seen": 107362400, + "step": 88215 + }, + { + "epoch": 9.825147566544159, + "grad_norm": 0.4433428645133972, + "learning_rate": 3.0090889154856745e-05, + "loss": 0.0264, + "num_input_tokens_seen": 107368416, + "step": 88220 + }, + { + "epoch": 9.825704421427776, + "grad_norm": 0.23123669624328613, + "learning_rate": 3.008851030390471e-05, + "loss": 0.0581, + "num_input_tokens_seen": 107374336, + "step": 88225 + }, + { + "epoch": 9.826261276311394, + "grad_norm": 1.25537109375, + "learning_rate": 3.008613140488762e-05, + "loss": 0.066, + "num_input_tokens_seen": 107379232, + "step": 88230 + }, + { + "epoch": 9.82681813119501, + "grad_norm": 0.010011889971792698, + "learning_rate": 3.0083752457827942e-05, + "loss": 0.0976, + "num_input_tokens_seen": 107385184, + "step": 88235 + }, + { + "epoch": 9.827374986078627, + "grad_norm": 0.02325735241174698, + "learning_rate": 3.0081373462748145e-05, + "loss": 0.0266, + "num_input_tokens_seen": 107391360, + "step": 88240 + }, + { + "epoch": 9.827931840962245, + "grad_norm": 0.10687846690416336, + "learning_rate": 3.0078994419670715e-05, + "loss": 0.1369, + "num_input_tokens_seen": 107396960, + "step": 88245 + }, + { + "epoch": 9.828488695845863, + "grad_norm": 0.04196722060441971, + "learning_rate": 3.0076615328618108e-05, + "loss": 0.014, + "num_input_tokens_seen": 107402880, + "step": 88250 + }, + { + "epoch": 9.82904555072948, + "grad_norm": 1.4556927680969238, + "learning_rate": 3.0074236189612804e-05, + "loss": 0.0746, + "num_input_tokens_seen": 107409088, + "step": 88255 + }, + { + "epoch": 9.829602405613098, + "grad_norm": 0.08897525072097778, + "learning_rate": 3.0071857002677266e-05, + "loss": 0.0074, + "num_input_tokens_seen": 107415168, + "step": 88260 + }, + { + "epoch": 9.830159260496714, + "grad_norm": 0.5215508341789246, + "learning_rate": 3.0069477767833985e-05, + "loss": 0.0373, + "num_input_tokens_seen": 107421312, + "step": 88265 + }, + { + "epoch": 9.830716115380332, + "grad_norm": 0.013866751454770565, + "learning_rate": 3.0067098485105422e-05, + "loss": 0.0094, + "num_input_tokens_seen": 107427168, + "step": 88270 + }, + { + "epoch": 9.83127297026395, + "grad_norm": 1.6964739561080933, + "learning_rate": 3.006471915451405e-05, + "loss": 0.0501, + "num_input_tokens_seen": 107433184, + "step": 88275 + }, + { + "epoch": 9.831829825147567, + "grad_norm": 1.033859133720398, + "learning_rate": 3.006233977608235e-05, + "loss": 0.0471, + "num_input_tokens_seen": 107439744, + "step": 88280 + }, + { + "epoch": 9.832386680031185, + "grad_norm": 0.8210543990135193, + "learning_rate": 3.0059960349832794e-05, + "loss": 0.0397, + "num_input_tokens_seen": 107445856, + "step": 88285 + }, + { + "epoch": 9.8329435349148, + "grad_norm": 1.6400262117385864, + "learning_rate": 3.0057580875787866e-05, + "loss": 0.1314, + "num_input_tokens_seen": 107451584, + "step": 88290 + }, + { + "epoch": 9.833500389798418, + "grad_norm": 3.056968927383423, + "learning_rate": 3.005520135397003e-05, + "loss": 0.2576, + "num_input_tokens_seen": 107457728, + "step": 88295 + }, + { + "epoch": 9.834057244682036, + "grad_norm": 2.142627477645874, + "learning_rate": 3.0052821784401765e-05, + "loss": 0.1337, + "num_input_tokens_seen": 107463680, + "step": 88300 + }, + { + "epoch": 9.834614099565654, + "grad_norm": 1.2106176614761353, + "learning_rate": 3.005044216710555e-05, + "loss": 0.0228, + "num_input_tokens_seen": 107470144, + "step": 88305 + }, + { + "epoch": 9.835170954449271, + "grad_norm": 0.0008516214438714087, + "learning_rate": 3.0048062502103862e-05, + "loss": 0.0182, + "num_input_tokens_seen": 107476576, + "step": 88310 + }, + { + "epoch": 9.835727809332887, + "grad_norm": 0.35896721482276917, + "learning_rate": 3.0045682789419183e-05, + "loss": 0.0074, + "num_input_tokens_seen": 107482752, + "step": 88315 + }, + { + "epoch": 9.836284664216505, + "grad_norm": 1.2480223178863525, + "learning_rate": 3.004330302907398e-05, + "loss": 0.0809, + "num_input_tokens_seen": 107488960, + "step": 88320 + }, + { + "epoch": 9.836841519100123, + "grad_norm": 2.003222703933716, + "learning_rate": 3.0040923221090743e-05, + "loss": 0.0966, + "num_input_tokens_seen": 107495264, + "step": 88325 + }, + { + "epoch": 9.83739837398374, + "grad_norm": 0.14935238659381866, + "learning_rate": 3.0038543365491957e-05, + "loss": 0.0475, + "num_input_tokens_seen": 107501504, + "step": 88330 + }, + { + "epoch": 9.837955228867358, + "grad_norm": 0.012181634083390236, + "learning_rate": 3.0036163462300077e-05, + "loss": 0.0069, + "num_input_tokens_seen": 107507488, + "step": 88335 + }, + { + "epoch": 9.838512083750974, + "grad_norm": 0.5247333645820618, + "learning_rate": 3.003378351153761e-05, + "loss": 0.0682, + "num_input_tokens_seen": 107513632, + "step": 88340 + }, + { + "epoch": 9.839068938634592, + "grad_norm": 0.10958866029977798, + "learning_rate": 3.0031403513227017e-05, + "loss": 0.049, + "num_input_tokens_seen": 107519168, + "step": 88345 + }, + { + "epoch": 9.83962579351821, + "grad_norm": 1.856062412261963, + "learning_rate": 3.0029023467390794e-05, + "loss": 0.0281, + "num_input_tokens_seen": 107525344, + "step": 88350 + }, + { + "epoch": 9.840182648401827, + "grad_norm": 0.04348788782954216, + "learning_rate": 3.0026643374051405e-05, + "loss": 0.081, + "num_input_tokens_seen": 107531584, + "step": 88355 + }, + { + "epoch": 9.840739503285445, + "grad_norm": 0.05846882238984108, + "learning_rate": 3.0024263233231347e-05, + "loss": 0.0134, + "num_input_tokens_seen": 107538016, + "step": 88360 + }, + { + "epoch": 9.84129635816906, + "grad_norm": 0.2504812479019165, + "learning_rate": 3.0021883044953104e-05, + "loss": 0.0884, + "num_input_tokens_seen": 107544128, + "step": 88365 + }, + { + "epoch": 9.841853213052678, + "grad_norm": 0.267572820186615, + "learning_rate": 3.0019502809239142e-05, + "loss": 0.0282, + "num_input_tokens_seen": 107550208, + "step": 88370 + }, + { + "epoch": 9.842410067936296, + "grad_norm": 0.0004544768889900297, + "learning_rate": 3.001712252611196e-05, + "loss": 0.1203, + "num_input_tokens_seen": 107556640, + "step": 88375 + }, + { + "epoch": 9.842966922819913, + "grad_norm": 0.0058846622705459595, + "learning_rate": 3.001474219559403e-05, + "loss": 0.0473, + "num_input_tokens_seen": 107562496, + "step": 88380 + }, + { + "epoch": 9.843523777703531, + "grad_norm": 0.0011359885102137923, + "learning_rate": 3.0012361817707848e-05, + "loss": 0.0027, + "num_input_tokens_seen": 107568768, + "step": 88385 + }, + { + "epoch": 9.844080632587147, + "grad_norm": 0.013125664554536343, + "learning_rate": 3.0009981392475895e-05, + "loss": 0.0402, + "num_input_tokens_seen": 107574752, + "step": 88390 + }, + { + "epoch": 9.844637487470765, + "grad_norm": 0.0204685777425766, + "learning_rate": 3.0007600919920648e-05, + "loss": 0.0043, + "num_input_tokens_seen": 107580992, + "step": 88395 + }, + { + "epoch": 9.845194342354382, + "grad_norm": 0.16826942563056946, + "learning_rate": 3.0005220400064605e-05, + "loss": 0.0217, + "num_input_tokens_seen": 107586848, + "step": 88400 + }, + { + "epoch": 9.845751197238, + "grad_norm": 0.17622420191764832, + "learning_rate": 3.0002839832930235e-05, + "loss": 0.1043, + "num_input_tokens_seen": 107593184, + "step": 88405 + }, + { + "epoch": 9.846308052121618, + "grad_norm": 1.474359154701233, + "learning_rate": 3.0000459218540043e-05, + "loss": 0.04, + "num_input_tokens_seen": 107599104, + "step": 88410 + }, + { + "epoch": 9.846864907005234, + "grad_norm": 1.51577889919281, + "learning_rate": 2.9998078556916502e-05, + "loss": 0.1295, + "num_input_tokens_seen": 107604352, + "step": 88415 + }, + { + "epoch": 9.847421761888851, + "grad_norm": 0.30788129568099976, + "learning_rate": 2.999569784808211e-05, + "loss": 0.0273, + "num_input_tokens_seen": 107610624, + "step": 88420 + }, + { + "epoch": 9.847978616772469, + "grad_norm": 1.0195646286010742, + "learning_rate": 2.999331709205935e-05, + "loss": 0.0262, + "num_input_tokens_seen": 107616960, + "step": 88425 + }, + { + "epoch": 9.848535471656087, + "grad_norm": 0.30059128999710083, + "learning_rate": 2.9990936288870703e-05, + "loss": 0.0645, + "num_input_tokens_seen": 107622784, + "step": 88430 + }, + { + "epoch": 9.849092326539704, + "grad_norm": 1.548007607460022, + "learning_rate": 2.9988555438538667e-05, + "loss": 0.0549, + "num_input_tokens_seen": 107628800, + "step": 88435 + }, + { + "epoch": 9.84964918142332, + "grad_norm": 1.5511506795883179, + "learning_rate": 2.998617454108573e-05, + "loss": 0.1215, + "num_input_tokens_seen": 107635136, + "step": 88440 + }, + { + "epoch": 9.850206036306938, + "grad_norm": 0.013707753270864487, + "learning_rate": 2.998379359653438e-05, + "loss": 0.0341, + "num_input_tokens_seen": 107641184, + "step": 88445 + }, + { + "epoch": 9.850762891190556, + "grad_norm": 1.4838557243347168, + "learning_rate": 2.99814126049071e-05, + "loss": 0.0423, + "num_input_tokens_seen": 107647232, + "step": 88450 + }, + { + "epoch": 9.851319746074173, + "grad_norm": 0.002841002307832241, + "learning_rate": 2.997903156622639e-05, + "loss": 0.0132, + "num_input_tokens_seen": 107653408, + "step": 88455 + }, + { + "epoch": 9.851876600957791, + "grad_norm": 0.13826794922351837, + "learning_rate": 2.9976650480514744e-05, + "loss": 0.034, + "num_input_tokens_seen": 107659264, + "step": 88460 + }, + { + "epoch": 9.852433455841407, + "grad_norm": 0.2963428497314453, + "learning_rate": 2.9974269347794642e-05, + "loss": 0.0499, + "num_input_tokens_seen": 107665440, + "step": 88465 + }, + { + "epoch": 9.852990310725025, + "grad_norm": 1.3701281547546387, + "learning_rate": 2.997188816808858e-05, + "loss": 0.0646, + "num_input_tokens_seen": 107672160, + "step": 88470 + }, + { + "epoch": 9.853547165608642, + "grad_norm": 0.5784638524055481, + "learning_rate": 2.996950694141905e-05, + "loss": 0.0223, + "num_input_tokens_seen": 107677920, + "step": 88475 + }, + { + "epoch": 9.85410402049226, + "grad_norm": 0.5571588277816772, + "learning_rate": 2.9967125667808553e-05, + "loss": 0.0965, + "num_input_tokens_seen": 107683136, + "step": 88480 + }, + { + "epoch": 9.854660875375878, + "grad_norm": 0.13735389709472656, + "learning_rate": 2.9964744347279566e-05, + "loss": 0.0175, + "num_input_tokens_seen": 107688864, + "step": 88485 + }, + { + "epoch": 9.855217730259493, + "grad_norm": 0.00204640906304121, + "learning_rate": 2.996236297985459e-05, + "loss": 0.0995, + "num_input_tokens_seen": 107694944, + "step": 88490 + }, + { + "epoch": 9.855774585143111, + "grad_norm": 0.7599163055419922, + "learning_rate": 2.995998156555613e-05, + "loss": 0.0867, + "num_input_tokens_seen": 107700576, + "step": 88495 + }, + { + "epoch": 9.856331440026729, + "grad_norm": 0.1677328646183014, + "learning_rate": 2.995760010440666e-05, + "loss": 0.105, + "num_input_tokens_seen": 107706272, + "step": 88500 + }, + { + "epoch": 9.856888294910346, + "grad_norm": 0.012860272079706192, + "learning_rate": 2.9955218596428692e-05, + "loss": 0.0912, + "num_input_tokens_seen": 107712384, + "step": 88505 + }, + { + "epoch": 9.857445149793964, + "grad_norm": 0.49292343854904175, + "learning_rate": 2.995283704164471e-05, + "loss": 0.0877, + "num_input_tokens_seen": 107718528, + "step": 88510 + }, + { + "epoch": 9.858002004677582, + "grad_norm": 0.7527464032173157, + "learning_rate": 2.9950455440077212e-05, + "loss": 0.0837, + "num_input_tokens_seen": 107724288, + "step": 88515 + }, + { + "epoch": 9.858558859561198, + "grad_norm": 0.564264178276062, + "learning_rate": 2.9948073791748697e-05, + "loss": 0.1664, + "num_input_tokens_seen": 107730336, + "step": 88520 + }, + { + "epoch": 9.859115714444815, + "grad_norm": 0.018402524292469025, + "learning_rate": 2.9945692096681667e-05, + "loss": 0.0339, + "num_input_tokens_seen": 107736480, + "step": 88525 + }, + { + "epoch": 9.859672569328433, + "grad_norm": 0.002835346618667245, + "learning_rate": 2.994331035489861e-05, + "loss": 0.1103, + "num_input_tokens_seen": 107742112, + "step": 88530 + }, + { + "epoch": 9.86022942421205, + "grad_norm": 1.167258620262146, + "learning_rate": 2.994092856642202e-05, + "loss": 0.0717, + "num_input_tokens_seen": 107748320, + "step": 88535 + }, + { + "epoch": 9.860786279095668, + "grad_norm": 0.7304325103759766, + "learning_rate": 2.9938546731274413e-05, + "loss": 0.0521, + "num_input_tokens_seen": 107754336, + "step": 88540 + }, + { + "epoch": 9.861343133979284, + "grad_norm": 1.0297088623046875, + "learning_rate": 2.9936164849478265e-05, + "loss": 0.0602, + "num_input_tokens_seen": 107760448, + "step": 88545 + }, + { + "epoch": 9.861899988862902, + "grad_norm": 0.032705437391996384, + "learning_rate": 2.9933782921056087e-05, + "loss": 0.1303, + "num_input_tokens_seen": 107766816, + "step": 88550 + }, + { + "epoch": 9.86245684374652, + "grad_norm": 0.3167615234851837, + "learning_rate": 2.993140094603038e-05, + "loss": 0.1278, + "num_input_tokens_seen": 107772736, + "step": 88555 + }, + { + "epoch": 9.863013698630137, + "grad_norm": 0.0050154756754636765, + "learning_rate": 2.9929018924423634e-05, + "loss": 0.0131, + "num_input_tokens_seen": 107778176, + "step": 88560 + }, + { + "epoch": 9.863570553513755, + "grad_norm": 1.127191424369812, + "learning_rate": 2.9926636856258367e-05, + "loss": 0.0655, + "num_input_tokens_seen": 107784256, + "step": 88565 + }, + { + "epoch": 9.864127408397371, + "grad_norm": 0.6958691477775574, + "learning_rate": 2.9924254741557055e-05, + "loss": 0.0764, + "num_input_tokens_seen": 107790400, + "step": 88570 + }, + { + "epoch": 9.864684263280989, + "grad_norm": 0.028390763327479362, + "learning_rate": 2.992187258034222e-05, + "loss": 0.0132, + "num_input_tokens_seen": 107796160, + "step": 88575 + }, + { + "epoch": 9.865241118164606, + "grad_norm": 1.3194645643234253, + "learning_rate": 2.991949037263635e-05, + "loss": 0.0494, + "num_input_tokens_seen": 107802240, + "step": 88580 + }, + { + "epoch": 9.865797973048224, + "grad_norm": 0.000979314441792667, + "learning_rate": 2.991710811846195e-05, + "loss": 0.0063, + "num_input_tokens_seen": 107808448, + "step": 88585 + }, + { + "epoch": 9.866354827931842, + "grad_norm": 0.06584195792675018, + "learning_rate": 2.9914725817841534e-05, + "loss": 0.0803, + "num_input_tokens_seen": 107814592, + "step": 88590 + }, + { + "epoch": 9.866911682815457, + "grad_norm": 0.022988727316260338, + "learning_rate": 2.9912343470797588e-05, + "loss": 0.0653, + "num_input_tokens_seen": 107820576, + "step": 88595 + }, + { + "epoch": 9.867468537699075, + "grad_norm": 0.45736780762672424, + "learning_rate": 2.990996107735262e-05, + "loss": 0.0105, + "num_input_tokens_seen": 107826720, + "step": 88600 + }, + { + "epoch": 9.868025392582693, + "grad_norm": 0.03547260910272598, + "learning_rate": 2.9907578637529144e-05, + "loss": 0.0098, + "num_input_tokens_seen": 107832704, + "step": 88605 + }, + { + "epoch": 9.86858224746631, + "grad_norm": 0.022007131949067116, + "learning_rate": 2.9905196151349645e-05, + "loss": 0.0259, + "num_input_tokens_seen": 107838528, + "step": 88610 + }, + { + "epoch": 9.869139102349928, + "grad_norm": 0.03840620443224907, + "learning_rate": 2.990281361883665e-05, + "loss": 0.005, + "num_input_tokens_seen": 107844704, + "step": 88615 + }, + { + "epoch": 9.869695957233546, + "grad_norm": 1.9442369937896729, + "learning_rate": 2.9900431040012645e-05, + "loss": 0.1141, + "num_input_tokens_seen": 107850880, + "step": 88620 + }, + { + "epoch": 9.870252812117162, + "grad_norm": 0.5022078156471252, + "learning_rate": 2.9898048414900148e-05, + "loss": 0.0563, + "num_input_tokens_seen": 107857152, + "step": 88625 + }, + { + "epoch": 9.87080966700078, + "grad_norm": 0.006227289792150259, + "learning_rate": 2.9895665743521652e-05, + "loss": 0.0026, + "num_input_tokens_seen": 107863328, + "step": 88630 + }, + { + "epoch": 9.871366521884397, + "grad_norm": 0.42590615153312683, + "learning_rate": 2.9893283025899676e-05, + "loss": 0.0086, + "num_input_tokens_seen": 107869312, + "step": 88635 + }, + { + "epoch": 9.871923376768015, + "grad_norm": 0.13964414596557617, + "learning_rate": 2.989090026205672e-05, + "loss": 0.0065, + "num_input_tokens_seen": 107875296, + "step": 88640 + }, + { + "epoch": 9.872480231651632, + "grad_norm": 0.058755069971084595, + "learning_rate": 2.988851745201529e-05, + "loss": 0.0618, + "num_input_tokens_seen": 107881440, + "step": 88645 + }, + { + "epoch": 9.873037086535248, + "grad_norm": 0.0025240913964807987, + "learning_rate": 2.9886134595797898e-05, + "loss": 0.0856, + "num_input_tokens_seen": 107887200, + "step": 88650 + }, + { + "epoch": 9.873593941418866, + "grad_norm": 0.4824431836605072, + "learning_rate": 2.9883751693427052e-05, + "loss": 0.0105, + "num_input_tokens_seen": 107893216, + "step": 88655 + }, + { + "epoch": 9.874150796302484, + "grad_norm": 1.9924163818359375, + "learning_rate": 2.9881368744925257e-05, + "loss": 0.0969, + "num_input_tokens_seen": 107899328, + "step": 88660 + }, + { + "epoch": 9.874707651186101, + "grad_norm": 0.00519545515999198, + "learning_rate": 2.9878985750315024e-05, + "loss": 0.0827, + "num_input_tokens_seen": 107905344, + "step": 88665 + }, + { + "epoch": 9.875264506069719, + "grad_norm": 0.007980790920555592, + "learning_rate": 2.987660270961886e-05, + "loss": 0.0053, + "num_input_tokens_seen": 107911712, + "step": 88670 + }, + { + "epoch": 9.875821360953335, + "grad_norm": 1.3778630495071411, + "learning_rate": 2.987421962285928e-05, + "loss": 0.0645, + "num_input_tokens_seen": 107918240, + "step": 88675 + }, + { + "epoch": 9.876378215836953, + "grad_norm": 0.2667481005191803, + "learning_rate": 2.9871836490058785e-05, + "loss": 0.0374, + "num_input_tokens_seen": 107924512, + "step": 88680 + }, + { + "epoch": 9.87693507072057, + "grad_norm": 0.10100995749235153, + "learning_rate": 2.98694533112399e-05, + "loss": 0.0053, + "num_input_tokens_seen": 107930624, + "step": 88685 + }, + { + "epoch": 9.877491925604188, + "grad_norm": 0.4980795085430145, + "learning_rate": 2.9867070086425115e-05, + "loss": 0.0186, + "num_input_tokens_seen": 107936544, + "step": 88690 + }, + { + "epoch": 9.878048780487806, + "grad_norm": 0.14339707791805267, + "learning_rate": 2.9864686815636967e-05, + "loss": 0.0827, + "num_input_tokens_seen": 107942688, + "step": 88695 + }, + { + "epoch": 9.878605635371422, + "grad_norm": 1.107081651687622, + "learning_rate": 2.986230349889795e-05, + "loss": 0.048, + "num_input_tokens_seen": 107949120, + "step": 88700 + }, + { + "epoch": 9.87916249025504, + "grad_norm": 0.007399614434689283, + "learning_rate": 2.9859920136230572e-05, + "loss": 0.0302, + "num_input_tokens_seen": 107955168, + "step": 88705 + }, + { + "epoch": 9.879719345138657, + "grad_norm": 0.8293627500534058, + "learning_rate": 2.9857536727657364e-05, + "loss": 0.0289, + "num_input_tokens_seen": 107961408, + "step": 88710 + }, + { + "epoch": 9.880276200022275, + "grad_norm": 0.10382584482431412, + "learning_rate": 2.9855153273200824e-05, + "loss": 0.0177, + "num_input_tokens_seen": 107967520, + "step": 88715 + }, + { + "epoch": 9.880833054905892, + "grad_norm": 1.2965941429138184, + "learning_rate": 2.9852769772883478e-05, + "loss": 0.1032, + "num_input_tokens_seen": 107973792, + "step": 88720 + }, + { + "epoch": 9.881389909789508, + "grad_norm": 0.00028177621425129473, + "learning_rate": 2.985038622672783e-05, + "loss": 0.0236, + "num_input_tokens_seen": 107979776, + "step": 88725 + }, + { + "epoch": 9.881946764673126, + "grad_norm": 1.5505764484405518, + "learning_rate": 2.9848002634756396e-05, + "loss": 0.1589, + "num_input_tokens_seen": 107985632, + "step": 88730 + }, + { + "epoch": 9.882503619556744, + "grad_norm": 0.0023740713950246572, + "learning_rate": 2.984561899699169e-05, + "loss": 0.0248, + "num_input_tokens_seen": 107992096, + "step": 88735 + }, + { + "epoch": 9.883060474440361, + "grad_norm": 2.2394466400146484, + "learning_rate": 2.9843235313456236e-05, + "loss": 0.1024, + "num_input_tokens_seen": 107997984, + "step": 88740 + }, + { + "epoch": 9.883617329323979, + "grad_norm": 0.5931178331375122, + "learning_rate": 2.9840851584172545e-05, + "loss": 0.0352, + "num_input_tokens_seen": 108004192, + "step": 88745 + }, + { + "epoch": 9.884174184207595, + "grad_norm": 1.2046338319778442, + "learning_rate": 2.983846780916313e-05, + "loss": 0.1255, + "num_input_tokens_seen": 108010528, + "step": 88750 + }, + { + "epoch": 9.884731039091212, + "grad_norm": 0.12911991775035858, + "learning_rate": 2.9836083988450513e-05, + "loss": 0.0285, + "num_input_tokens_seen": 108017088, + "step": 88755 + }, + { + "epoch": 9.88528789397483, + "grad_norm": 0.008206391707062721, + "learning_rate": 2.9833700122057206e-05, + "loss": 0.029, + "num_input_tokens_seen": 108023200, + "step": 88760 + }, + { + "epoch": 9.885844748858448, + "grad_norm": 0.8012175559997559, + "learning_rate": 2.9831316210005723e-05, + "loss": 0.0816, + "num_input_tokens_seen": 108028960, + "step": 88765 + }, + { + "epoch": 9.886401603742065, + "grad_norm": 0.7366905808448792, + "learning_rate": 2.98289322523186e-05, + "loss": 0.063, + "num_input_tokens_seen": 108034976, + "step": 88770 + }, + { + "epoch": 9.886958458625681, + "grad_norm": 0.07418631762266159, + "learning_rate": 2.9826548249018326e-05, + "loss": 0.0078, + "num_input_tokens_seen": 108041280, + "step": 88775 + }, + { + "epoch": 9.887515313509299, + "grad_norm": 0.005257913842797279, + "learning_rate": 2.982416420012745e-05, + "loss": 0.0084, + "num_input_tokens_seen": 108047520, + "step": 88780 + }, + { + "epoch": 9.888072168392917, + "grad_norm": 0.6899328827857971, + "learning_rate": 2.9821780105668473e-05, + "loss": 0.1474, + "num_input_tokens_seen": 108053728, + "step": 88785 + }, + { + "epoch": 9.888629023276534, + "grad_norm": 0.1949712038040161, + "learning_rate": 2.981939596566392e-05, + "loss": 0.0484, + "num_input_tokens_seen": 108059424, + "step": 88790 + }, + { + "epoch": 9.889185878160152, + "grad_norm": 0.40123388171195984, + "learning_rate": 2.9817011780136317e-05, + "loss": 0.0556, + "num_input_tokens_seen": 108065152, + "step": 88795 + }, + { + "epoch": 9.889742733043768, + "grad_norm": 1.1956337690353394, + "learning_rate": 2.981462754910817e-05, + "loss": 0.0867, + "num_input_tokens_seen": 108071456, + "step": 88800 + }, + { + "epoch": 9.890299587927386, + "grad_norm": 0.921941876411438, + "learning_rate": 2.9812243272602013e-05, + "loss": 0.0643, + "num_input_tokens_seen": 108077344, + "step": 88805 + }, + { + "epoch": 9.890856442811003, + "grad_norm": 0.09342940151691437, + "learning_rate": 2.9809858950640363e-05, + "loss": 0.0113, + "num_input_tokens_seen": 108083616, + "step": 88810 + }, + { + "epoch": 9.891413297694621, + "grad_norm": 0.016978681087493896, + "learning_rate": 2.9807474583245743e-05, + "loss": 0.0038, + "num_input_tokens_seen": 108089760, + "step": 88815 + }, + { + "epoch": 9.891970152578239, + "grad_norm": 0.08335357159376144, + "learning_rate": 2.980509017044067e-05, + "loss": 0.0338, + "num_input_tokens_seen": 108095936, + "step": 88820 + }, + { + "epoch": 9.892527007461855, + "grad_norm": 0.06889167428016663, + "learning_rate": 2.980270571224767e-05, + "loss": 0.0092, + "num_input_tokens_seen": 108101632, + "step": 88825 + }, + { + "epoch": 9.893083862345472, + "grad_norm": 0.018184378743171692, + "learning_rate": 2.9800321208689268e-05, + "loss": 0.0781, + "num_input_tokens_seen": 108107776, + "step": 88830 + }, + { + "epoch": 9.89364071722909, + "grad_norm": 0.1976614147424698, + "learning_rate": 2.979793665978799e-05, + "loss": 0.0361, + "num_input_tokens_seen": 108113728, + "step": 88835 + }, + { + "epoch": 9.894197572112708, + "grad_norm": 0.014844652265310287, + "learning_rate": 2.9795552065566352e-05, + "loss": 0.0566, + "num_input_tokens_seen": 108119776, + "step": 88840 + }, + { + "epoch": 9.894754426996325, + "grad_norm": 1.3557145595550537, + "learning_rate": 2.979316742604688e-05, + "loss": 0.0668, + "num_input_tokens_seen": 108126016, + "step": 88845 + }, + { + "epoch": 9.895311281879943, + "grad_norm": 0.16351014375686646, + "learning_rate": 2.979078274125211e-05, + "loss": 0.0335, + "num_input_tokens_seen": 108132128, + "step": 88850 + }, + { + "epoch": 9.895868136763559, + "grad_norm": 0.4171530604362488, + "learning_rate": 2.9788398011204554e-05, + "loss": 0.0085, + "num_input_tokens_seen": 108138272, + "step": 88855 + }, + { + "epoch": 9.896424991647176, + "grad_norm": 1.0861363410949707, + "learning_rate": 2.9786013235926734e-05, + "loss": 0.0537, + "num_input_tokens_seen": 108144384, + "step": 88860 + }, + { + "epoch": 9.896981846530794, + "grad_norm": 0.0019420268945395947, + "learning_rate": 2.97836284154412e-05, + "loss": 0.1289, + "num_input_tokens_seen": 108150528, + "step": 88865 + }, + { + "epoch": 9.897538701414412, + "grad_norm": 0.2761628031730652, + "learning_rate": 2.9781243549770454e-05, + "loss": 0.0735, + "num_input_tokens_seen": 108156352, + "step": 88870 + }, + { + "epoch": 9.89809555629803, + "grad_norm": 1.7440295219421387, + "learning_rate": 2.9778858638937036e-05, + "loss": 0.0465, + "num_input_tokens_seen": 108162240, + "step": 88875 + }, + { + "epoch": 9.898652411181645, + "grad_norm": 1.881912350654602, + "learning_rate": 2.9776473682963463e-05, + "loss": 0.0475, + "num_input_tokens_seen": 108168256, + "step": 88880 + }, + { + "epoch": 9.899209266065263, + "grad_norm": 1.1945202350616455, + "learning_rate": 2.977408868187227e-05, + "loss": 0.0547, + "num_input_tokens_seen": 108174528, + "step": 88885 + }, + { + "epoch": 9.89976612094888, + "grad_norm": 0.3930628001689911, + "learning_rate": 2.9771703635685992e-05, + "loss": 0.0213, + "num_input_tokens_seen": 108180704, + "step": 88890 + }, + { + "epoch": 9.900322975832498, + "grad_norm": 0.0883706584572792, + "learning_rate": 2.9769318544427143e-05, + "loss": 0.0092, + "num_input_tokens_seen": 108186560, + "step": 88895 + }, + { + "epoch": 9.900879830716116, + "grad_norm": 0.576642632484436, + "learning_rate": 2.976693340811827e-05, + "loss": 0.0579, + "num_input_tokens_seen": 108192576, + "step": 88900 + }, + { + "epoch": 9.901436685599732, + "grad_norm": 0.24337275326251984, + "learning_rate": 2.976454822678188e-05, + "loss": 0.0064, + "num_input_tokens_seen": 108198656, + "step": 88905 + }, + { + "epoch": 9.90199354048335, + "grad_norm": 3.749177932739258, + "learning_rate": 2.9762163000440518e-05, + "loss": 0.1024, + "num_input_tokens_seen": 108204992, + "step": 88910 + }, + { + "epoch": 9.902550395366967, + "grad_norm": 2.413642406463623, + "learning_rate": 2.975977772911671e-05, + "loss": 0.1449, + "num_input_tokens_seen": 108211104, + "step": 88915 + }, + { + "epoch": 9.903107250250585, + "grad_norm": 0.0021656914614140987, + "learning_rate": 2.975739241283299e-05, + "loss": 0.0361, + "num_input_tokens_seen": 108216928, + "step": 88920 + }, + { + "epoch": 9.903664105134203, + "grad_norm": 0.9770973324775696, + "learning_rate": 2.9755007051611887e-05, + "loss": 0.0215, + "num_input_tokens_seen": 108223136, + "step": 88925 + }, + { + "epoch": 9.904220960017819, + "grad_norm": 0.00019576813792809844, + "learning_rate": 2.9752621645475933e-05, + "loss": 0.0399, + "num_input_tokens_seen": 108228672, + "step": 88930 + }, + { + "epoch": 9.904777814901436, + "grad_norm": 0.024230342358350754, + "learning_rate": 2.9750236194447662e-05, + "loss": 0.0023, + "num_input_tokens_seen": 108234752, + "step": 88935 + }, + { + "epoch": 9.905334669785054, + "grad_norm": 0.17426520586013794, + "learning_rate": 2.9747850698549596e-05, + "loss": 0.0151, + "num_input_tokens_seen": 108240992, + "step": 88940 + }, + { + "epoch": 9.905891524668672, + "grad_norm": 0.034570712596178055, + "learning_rate": 2.9745465157804287e-05, + "loss": 0.0414, + "num_input_tokens_seen": 108247040, + "step": 88945 + }, + { + "epoch": 9.90644837955229, + "grad_norm": 0.6049357652664185, + "learning_rate": 2.9743079572234252e-05, + "loss": 0.056, + "num_input_tokens_seen": 108253248, + "step": 88950 + }, + { + "epoch": 9.907005234435905, + "grad_norm": 0.00018378406821284443, + "learning_rate": 2.9740693941862025e-05, + "loss": 0.0569, + "num_input_tokens_seen": 108259328, + "step": 88955 + }, + { + "epoch": 9.907562089319523, + "grad_norm": 1.7389955520629883, + "learning_rate": 2.9738308266710158e-05, + "loss": 0.1112, + "num_input_tokens_seen": 108265216, + "step": 88960 + }, + { + "epoch": 9.90811894420314, + "grad_norm": 0.041551828384399414, + "learning_rate": 2.9735922546801165e-05, + "loss": 0.0053, + "num_input_tokens_seen": 108271360, + "step": 88965 + }, + { + "epoch": 9.908675799086758, + "grad_norm": 0.4335051476955414, + "learning_rate": 2.973353678215759e-05, + "loss": 0.0418, + "num_input_tokens_seen": 108277664, + "step": 88970 + }, + { + "epoch": 9.909232653970376, + "grad_norm": 0.17862996459007263, + "learning_rate": 2.973115097280197e-05, + "loss": 0.0308, + "num_input_tokens_seen": 108282752, + "step": 88975 + }, + { + "epoch": 9.909789508853994, + "grad_norm": 1.0371863842010498, + "learning_rate": 2.9728765118756835e-05, + "loss": 0.1234, + "num_input_tokens_seen": 108288896, + "step": 88980 + }, + { + "epoch": 9.91034636373761, + "grad_norm": 0.07120432704687119, + "learning_rate": 2.9726379220044726e-05, + "loss": 0.092, + "num_input_tokens_seen": 108295168, + "step": 88985 + }, + { + "epoch": 9.910903218621227, + "grad_norm": 0.00018932503007818013, + "learning_rate": 2.9723993276688177e-05, + "loss": 0.0054, + "num_input_tokens_seen": 108301344, + "step": 88990 + }, + { + "epoch": 9.911460073504845, + "grad_norm": 0.23202694952487946, + "learning_rate": 2.972160728870973e-05, + "loss": 0.0304, + "num_input_tokens_seen": 108306752, + "step": 88995 + }, + { + "epoch": 9.912016928388462, + "grad_norm": 0.00020423921523615718, + "learning_rate": 2.9719221256131917e-05, + "loss": 0.0607, + "num_input_tokens_seen": 108313024, + "step": 89000 + }, + { + "epoch": 9.91257378327208, + "grad_norm": 0.3825015425682068, + "learning_rate": 2.971683517897728e-05, + "loss": 0.0125, + "num_input_tokens_seen": 108319040, + "step": 89005 + }, + { + "epoch": 9.913130638155696, + "grad_norm": 0.0674070194363594, + "learning_rate": 2.9714449057268357e-05, + "loss": 0.0983, + "num_input_tokens_seen": 108325408, + "step": 89010 + }, + { + "epoch": 9.913687493039314, + "grad_norm": 0.9107239246368408, + "learning_rate": 2.9712062891027682e-05, + "loss": 0.0412, + "num_input_tokens_seen": 108331520, + "step": 89015 + }, + { + "epoch": 9.914244347922931, + "grad_norm": 1.9351812601089478, + "learning_rate": 2.9709676680277797e-05, + "loss": 0.0859, + "num_input_tokens_seen": 108337536, + "step": 89020 + }, + { + "epoch": 9.914801202806549, + "grad_norm": 0.002599494531750679, + "learning_rate": 2.9707290425041247e-05, + "loss": 0.0646, + "num_input_tokens_seen": 108343456, + "step": 89025 + }, + { + "epoch": 9.915358057690167, + "grad_norm": 0.09653705358505249, + "learning_rate": 2.9704904125340566e-05, + "loss": 0.0493, + "num_input_tokens_seen": 108349408, + "step": 89030 + }, + { + "epoch": 9.915914912573783, + "grad_norm": 0.13467444479465485, + "learning_rate": 2.9702517781198293e-05, + "loss": 0.0748, + "num_input_tokens_seen": 108355616, + "step": 89035 + }, + { + "epoch": 9.9164717674574, + "grad_norm": 0.18686407804489136, + "learning_rate": 2.9700131392636975e-05, + "loss": 0.0407, + "num_input_tokens_seen": 108361856, + "step": 89040 + }, + { + "epoch": 9.917028622341018, + "grad_norm": 0.00786623265594244, + "learning_rate": 2.9697744959679153e-05, + "loss": 0.0093, + "num_input_tokens_seen": 108367936, + "step": 89045 + }, + { + "epoch": 9.917585477224636, + "grad_norm": 0.1180146187543869, + "learning_rate": 2.9695358482347356e-05, + "loss": 0.0535, + "num_input_tokens_seen": 108374144, + "step": 89050 + }, + { + "epoch": 9.918142332108253, + "grad_norm": 0.10009113699197769, + "learning_rate": 2.9692971960664144e-05, + "loss": 0.0467, + "num_input_tokens_seen": 108380320, + "step": 89055 + }, + { + "epoch": 9.91869918699187, + "grad_norm": 0.8567180633544922, + "learning_rate": 2.9690585394652053e-05, + "loss": 0.0338, + "num_input_tokens_seen": 108386176, + "step": 89060 + }, + { + "epoch": 9.919256041875487, + "grad_norm": 0.40641871094703674, + "learning_rate": 2.9688198784333626e-05, + "loss": 0.0412, + "num_input_tokens_seen": 108391648, + "step": 89065 + }, + { + "epoch": 9.919812896759105, + "grad_norm": 0.01488855667412281, + "learning_rate": 2.96858121297314e-05, + "loss": 0.0068, + "num_input_tokens_seen": 108397728, + "step": 89070 + }, + { + "epoch": 9.920369751642722, + "grad_norm": 0.38288936018943787, + "learning_rate": 2.968342543086793e-05, + "loss": 0.0194, + "num_input_tokens_seen": 108403648, + "step": 89075 + }, + { + "epoch": 9.92092660652634, + "grad_norm": 0.6633113026618958, + "learning_rate": 2.9681038687765745e-05, + "loss": 0.0146, + "num_input_tokens_seen": 108409408, + "step": 89080 + }, + { + "epoch": 9.921483461409956, + "grad_norm": 0.758606493473053, + "learning_rate": 2.96786519004474e-05, + "loss": 0.0192, + "num_input_tokens_seen": 108415744, + "step": 89085 + }, + { + "epoch": 9.922040316293574, + "grad_norm": 0.6900745034217834, + "learning_rate": 2.9676265068935448e-05, + "loss": 0.0241, + "num_input_tokens_seen": 108422176, + "step": 89090 + }, + { + "epoch": 9.922597171177191, + "grad_norm": 0.019306551665067673, + "learning_rate": 2.9673878193252424e-05, + "loss": 0.0042, + "num_input_tokens_seen": 108428384, + "step": 89095 + }, + { + "epoch": 9.923154026060809, + "grad_norm": 0.3394370973110199, + "learning_rate": 2.9671491273420878e-05, + "loss": 0.018, + "num_input_tokens_seen": 108434560, + "step": 89100 + }, + { + "epoch": 9.923710880944427, + "grad_norm": 0.06370151042938232, + "learning_rate": 2.9669104309463346e-05, + "loss": 0.0675, + "num_input_tokens_seen": 108440448, + "step": 89105 + }, + { + "epoch": 9.924267735828042, + "grad_norm": 0.8997480869293213, + "learning_rate": 2.9666717301402385e-05, + "loss": 0.1073, + "num_input_tokens_seen": 108446368, + "step": 89110 + }, + { + "epoch": 9.92482459071166, + "grad_norm": 0.171955406665802, + "learning_rate": 2.966433024926055e-05, + "loss": 0.0043, + "num_input_tokens_seen": 108452832, + "step": 89115 + }, + { + "epoch": 9.925381445595278, + "grad_norm": 2.771430253982544, + "learning_rate": 2.9661943153060367e-05, + "loss": 0.0653, + "num_input_tokens_seen": 108458816, + "step": 89120 + }, + { + "epoch": 9.925938300478895, + "grad_norm": 0.0602245070040226, + "learning_rate": 2.9659556012824407e-05, + "loss": 0.0083, + "num_input_tokens_seen": 108465024, + "step": 89125 + }, + { + "epoch": 9.926495155362513, + "grad_norm": 0.0026170031633228064, + "learning_rate": 2.96571688285752e-05, + "loss": 0.0157, + "num_input_tokens_seen": 108471520, + "step": 89130 + }, + { + "epoch": 9.927052010246129, + "grad_norm": 0.0603925921022892, + "learning_rate": 2.9654781600335297e-05, + "loss": 0.0042, + "num_input_tokens_seen": 108477824, + "step": 89135 + }, + { + "epoch": 9.927608865129747, + "grad_norm": 2.4941654205322266, + "learning_rate": 2.965239432812726e-05, + "loss": 0.1463, + "num_input_tokens_seen": 108483584, + "step": 89140 + }, + { + "epoch": 9.928165720013364, + "grad_norm": 1.0126655101776123, + "learning_rate": 2.965000701197363e-05, + "loss": 0.1552, + "num_input_tokens_seen": 108489792, + "step": 89145 + }, + { + "epoch": 9.928722574896982, + "grad_norm": 0.17925450205802917, + "learning_rate": 2.964761965189696e-05, + "loss": 0.099, + "num_input_tokens_seen": 108496160, + "step": 89150 + }, + { + "epoch": 9.9292794297806, + "grad_norm": 2.003676176071167, + "learning_rate": 2.964523224791979e-05, + "loss": 0.0959, + "num_input_tokens_seen": 108502496, + "step": 89155 + }, + { + "epoch": 9.929836284664216, + "grad_norm": 0.002058525336906314, + "learning_rate": 2.964284480006469e-05, + "loss": 0.1065, + "num_input_tokens_seen": 108508416, + "step": 89160 + }, + { + "epoch": 9.930393139547833, + "grad_norm": 0.916100025177002, + "learning_rate": 2.9640457308354197e-05, + "loss": 0.1222, + "num_input_tokens_seen": 108514784, + "step": 89165 + }, + { + "epoch": 9.930949994431451, + "grad_norm": 0.1468050181865692, + "learning_rate": 2.963806977281086e-05, + "loss": 0.0052, + "num_input_tokens_seen": 108521216, + "step": 89170 + }, + { + "epoch": 9.931506849315069, + "grad_norm": 0.2437218278646469, + "learning_rate": 2.963568219345725e-05, + "loss": 0.0188, + "num_input_tokens_seen": 108527520, + "step": 89175 + }, + { + "epoch": 9.932063704198686, + "grad_norm": 0.001093731145374477, + "learning_rate": 2.96332945703159e-05, + "loss": 0.002, + "num_input_tokens_seen": 108533600, + "step": 89180 + }, + { + "epoch": 9.932620559082302, + "grad_norm": 0.23361657559871674, + "learning_rate": 2.9630906903409377e-05, + "loss": 0.1224, + "num_input_tokens_seen": 108539488, + "step": 89185 + }, + { + "epoch": 9.93317741396592, + "grad_norm": 0.017103418707847595, + "learning_rate": 2.9628519192760217e-05, + "loss": 0.0774, + "num_input_tokens_seen": 108545760, + "step": 89190 + }, + { + "epoch": 9.933734268849538, + "grad_norm": 0.5112379789352417, + "learning_rate": 2.962613143839099e-05, + "loss": 0.0188, + "num_input_tokens_seen": 108552064, + "step": 89195 + }, + { + "epoch": 9.934291123733155, + "grad_norm": 0.004121489357203245, + "learning_rate": 2.9623743640324253e-05, + "loss": 0.0613, + "num_input_tokens_seen": 108558240, + "step": 89200 + }, + { + "epoch": 9.934847978616773, + "grad_norm": 0.02301499806344509, + "learning_rate": 2.9621355798582545e-05, + "loss": 0.0035, + "num_input_tokens_seen": 108564512, + "step": 89205 + }, + { + "epoch": 9.93540483350039, + "grad_norm": 0.8159221410751343, + "learning_rate": 2.9618967913188435e-05, + "loss": 0.1907, + "num_input_tokens_seen": 108570656, + "step": 89210 + }, + { + "epoch": 9.935961688384007, + "grad_norm": 0.0993429645895958, + "learning_rate": 2.9616579984164467e-05, + "loss": 0.0172, + "num_input_tokens_seen": 108576928, + "step": 89215 + }, + { + "epoch": 9.936518543267624, + "grad_norm": 0.1351844072341919, + "learning_rate": 2.9614192011533204e-05, + "loss": 0.0558, + "num_input_tokens_seen": 108582848, + "step": 89220 + }, + { + "epoch": 9.937075398151242, + "grad_norm": 0.022795865312218666, + "learning_rate": 2.96118039953172e-05, + "loss": 0.0024, + "num_input_tokens_seen": 108589280, + "step": 89225 + }, + { + "epoch": 9.93763225303486, + "grad_norm": 1.2195137739181519, + "learning_rate": 2.960941593553901e-05, + "loss": 0.1089, + "num_input_tokens_seen": 108595456, + "step": 89230 + }, + { + "epoch": 9.938189107918477, + "grad_norm": 0.2928886115550995, + "learning_rate": 2.9607027832221197e-05, + "loss": 0.0227, + "num_input_tokens_seen": 108601760, + "step": 89235 + }, + { + "epoch": 9.938745962802093, + "grad_norm": 0.015595195814967155, + "learning_rate": 2.9604639685386316e-05, + "loss": 0.0465, + "num_input_tokens_seen": 108607488, + "step": 89240 + }, + { + "epoch": 9.93930281768571, + "grad_norm": 0.0039698113687336445, + "learning_rate": 2.960225149505692e-05, + "loss": 0.0022, + "num_input_tokens_seen": 108613728, + "step": 89245 + }, + { + "epoch": 9.939859672569328, + "grad_norm": 0.23238621652126312, + "learning_rate": 2.9599863261255572e-05, + "loss": 0.0724, + "num_input_tokens_seen": 108619328, + "step": 89250 + }, + { + "epoch": 9.940416527452946, + "grad_norm": 0.19804491102695465, + "learning_rate": 2.9597474984004837e-05, + "loss": 0.0712, + "num_input_tokens_seen": 108625568, + "step": 89255 + }, + { + "epoch": 9.940973382336564, + "grad_norm": 0.47893476486206055, + "learning_rate": 2.9595086663327258e-05, + "loss": 0.0124, + "num_input_tokens_seen": 108632000, + "step": 89260 + }, + { + "epoch": 9.94153023722018, + "grad_norm": 0.5077097415924072, + "learning_rate": 2.9592698299245407e-05, + "loss": 0.0148, + "num_input_tokens_seen": 108638240, + "step": 89265 + }, + { + "epoch": 9.942087092103797, + "grad_norm": 0.07729525119066238, + "learning_rate": 2.9590309891781842e-05, + "loss": 0.0281, + "num_input_tokens_seen": 108644384, + "step": 89270 + }, + { + "epoch": 9.942643946987415, + "grad_norm": 0.0018610093975439668, + "learning_rate": 2.958792144095912e-05, + "loss": 0.065, + "num_input_tokens_seen": 108650624, + "step": 89275 + }, + { + "epoch": 9.943200801871033, + "grad_norm": 0.9106758832931519, + "learning_rate": 2.958553294679981e-05, + "loss": 0.0597, + "num_input_tokens_seen": 108656832, + "step": 89280 + }, + { + "epoch": 9.94375765675465, + "grad_norm": 0.14088234305381775, + "learning_rate": 2.9583144409326464e-05, + "loss": 0.0413, + "num_input_tokens_seen": 108662752, + "step": 89285 + }, + { + "epoch": 9.944314511638266, + "grad_norm": 0.13060776889324188, + "learning_rate": 2.9580755828561646e-05, + "loss": 0.0439, + "num_input_tokens_seen": 108668928, + "step": 89290 + }, + { + "epoch": 9.944871366521884, + "grad_norm": 0.1944071650505066, + "learning_rate": 2.9578367204527924e-05, + "loss": 0.0397, + "num_input_tokens_seen": 108674816, + "step": 89295 + }, + { + "epoch": 9.945428221405502, + "grad_norm": 0.42951858043670654, + "learning_rate": 2.9575978537247844e-05, + "loss": 0.0276, + "num_input_tokens_seen": 108681216, + "step": 89300 + }, + { + "epoch": 9.94598507628912, + "grad_norm": 0.02269641123712063, + "learning_rate": 2.957358982674399e-05, + "loss": 0.0541, + "num_input_tokens_seen": 108687456, + "step": 89305 + }, + { + "epoch": 9.946541931172737, + "grad_norm": 0.5881518721580505, + "learning_rate": 2.9571201073038918e-05, + "loss": 0.0431, + "num_input_tokens_seen": 108693664, + "step": 89310 + }, + { + "epoch": 9.947098786056353, + "grad_norm": 3.983628034591675, + "learning_rate": 2.9568812276155187e-05, + "loss": 0.1021, + "num_input_tokens_seen": 108699840, + "step": 89315 + }, + { + "epoch": 9.94765564093997, + "grad_norm": 0.7605734467506409, + "learning_rate": 2.956642343611536e-05, + "loss": 0.0986, + "num_input_tokens_seen": 108706368, + "step": 89320 + }, + { + "epoch": 9.948212495823588, + "grad_norm": 3.3119089603424072, + "learning_rate": 2.9564034552942003e-05, + "loss": 0.0222, + "num_input_tokens_seen": 108712672, + "step": 89325 + }, + { + "epoch": 9.948769350707206, + "grad_norm": 0.3063421845436096, + "learning_rate": 2.956164562665769e-05, + "loss": 0.0085, + "num_input_tokens_seen": 108718752, + "step": 89330 + }, + { + "epoch": 9.949326205590824, + "grad_norm": 2.9273531436920166, + "learning_rate": 2.9559256657284973e-05, + "loss": 0.1974, + "num_input_tokens_seen": 108724704, + "step": 89335 + }, + { + "epoch": 9.949883060474441, + "grad_norm": 0.02957196533679962, + "learning_rate": 2.955686764484643e-05, + "loss": 0.0055, + "num_input_tokens_seen": 108730688, + "step": 89340 + }, + { + "epoch": 9.950439915358057, + "grad_norm": 1.963840365409851, + "learning_rate": 2.955447858936462e-05, + "loss": 0.1092, + "num_input_tokens_seen": 108736672, + "step": 89345 + }, + { + "epoch": 9.950996770241675, + "grad_norm": 0.8658335208892822, + "learning_rate": 2.9552089490862113e-05, + "loss": 0.0828, + "num_input_tokens_seen": 108742720, + "step": 89350 + }, + { + "epoch": 9.951553625125293, + "grad_norm": 0.24469755589962006, + "learning_rate": 2.9549700349361466e-05, + "loss": 0.0937, + "num_input_tokens_seen": 108748672, + "step": 89355 + }, + { + "epoch": 9.95211048000891, + "grad_norm": 0.32543274760246277, + "learning_rate": 2.9547311164885254e-05, + "loss": 0.033, + "num_input_tokens_seen": 108754816, + "step": 89360 + }, + { + "epoch": 9.952667334892528, + "grad_norm": 0.0714855045080185, + "learning_rate": 2.9544921937456055e-05, + "loss": 0.1047, + "num_input_tokens_seen": 108761152, + "step": 89365 + }, + { + "epoch": 9.953224189776144, + "grad_norm": 0.06077888235449791, + "learning_rate": 2.9542532667096418e-05, + "loss": 0.0047, + "num_input_tokens_seen": 108767232, + "step": 89370 + }, + { + "epoch": 9.953781044659761, + "grad_norm": 0.03880210593342781, + "learning_rate": 2.9540143353828925e-05, + "loss": 0.0105, + "num_input_tokens_seen": 108773280, + "step": 89375 + }, + { + "epoch": 9.954337899543379, + "grad_norm": 0.00536862388253212, + "learning_rate": 2.9537753997676136e-05, + "loss": 0.0098, + "num_input_tokens_seen": 108779168, + "step": 89380 + }, + { + "epoch": 9.954894754426997, + "grad_norm": 0.3570762276649475, + "learning_rate": 2.9535364598660637e-05, + "loss": 0.1107, + "num_input_tokens_seen": 108785408, + "step": 89385 + }, + { + "epoch": 9.955451609310614, + "grad_norm": 0.05273326858878136, + "learning_rate": 2.953297515680497e-05, + "loss": 0.0047, + "num_input_tokens_seen": 108791680, + "step": 89390 + }, + { + "epoch": 9.95600846419423, + "grad_norm": 0.005662106908857822, + "learning_rate": 2.9530585672131727e-05, + "loss": 0.0073, + "num_input_tokens_seen": 108797760, + "step": 89395 + }, + { + "epoch": 9.956565319077848, + "grad_norm": 0.3633394241333008, + "learning_rate": 2.952819614466348e-05, + "loss": 0.0135, + "num_input_tokens_seen": 108803136, + "step": 89400 + }, + { + "epoch": 9.957122173961466, + "grad_norm": 0.9520740509033203, + "learning_rate": 2.9525806574422777e-05, + "loss": 0.0812, + "num_input_tokens_seen": 108809408, + "step": 89405 + }, + { + "epoch": 9.957679028845083, + "grad_norm": 0.12583191692829132, + "learning_rate": 2.952341696143222e-05, + "loss": 0.0855, + "num_input_tokens_seen": 108815520, + "step": 89410 + }, + { + "epoch": 9.958235883728701, + "grad_norm": 1.4249885082244873, + "learning_rate": 2.9521027305714355e-05, + "loss": 0.1007, + "num_input_tokens_seen": 108821568, + "step": 89415 + }, + { + "epoch": 9.958792738612317, + "grad_norm": 0.04214358702301979, + "learning_rate": 2.9518637607291764e-05, + "loss": 0.0637, + "num_input_tokens_seen": 108827552, + "step": 89420 + }, + { + "epoch": 9.959349593495935, + "grad_norm": 1.3971059322357178, + "learning_rate": 2.951624786618703e-05, + "loss": 0.0844, + "num_input_tokens_seen": 108833728, + "step": 89425 + }, + { + "epoch": 9.959906448379552, + "grad_norm": 0.5416478514671326, + "learning_rate": 2.9513858082422713e-05, + "loss": 0.0296, + "num_input_tokens_seen": 108839904, + "step": 89430 + }, + { + "epoch": 9.96046330326317, + "grad_norm": 0.9139345288276672, + "learning_rate": 2.951146825602139e-05, + "loss": 0.0628, + "num_input_tokens_seen": 108846080, + "step": 89435 + }, + { + "epoch": 9.961020158146788, + "grad_norm": 0.5435175895690918, + "learning_rate": 2.9509078387005635e-05, + "loss": 0.0659, + "num_input_tokens_seen": 108852160, + "step": 89440 + }, + { + "epoch": 9.961577013030404, + "grad_norm": 0.02198590151965618, + "learning_rate": 2.950668847539802e-05, + "loss": 0.0025, + "num_input_tokens_seen": 108858304, + "step": 89445 + }, + { + "epoch": 9.962133867914021, + "grad_norm": 1.0481265783309937, + "learning_rate": 2.950429852122112e-05, + "loss": 0.1329, + "num_input_tokens_seen": 108864544, + "step": 89450 + }, + { + "epoch": 9.962690722797639, + "grad_norm": 0.10776473581790924, + "learning_rate": 2.9501908524497514e-05, + "loss": 0.0334, + "num_input_tokens_seen": 108870880, + "step": 89455 + }, + { + "epoch": 9.963247577681257, + "grad_norm": 0.07728692889213562, + "learning_rate": 2.9499518485249777e-05, + "loss": 0.049, + "num_input_tokens_seen": 108877024, + "step": 89460 + }, + { + "epoch": 9.963804432564874, + "grad_norm": 1.019935131072998, + "learning_rate": 2.9497128403500478e-05, + "loss": 0.0624, + "num_input_tokens_seen": 108882720, + "step": 89465 + }, + { + "epoch": 9.96436128744849, + "grad_norm": 0.7062588930130005, + "learning_rate": 2.9494738279272205e-05, + "loss": 0.0877, + "num_input_tokens_seen": 108888704, + "step": 89470 + }, + { + "epoch": 9.964918142332108, + "grad_norm": 0.02291557751595974, + "learning_rate": 2.9492348112587525e-05, + "loss": 0.0018, + "num_input_tokens_seen": 108895360, + "step": 89475 + }, + { + "epoch": 9.965474997215725, + "grad_norm": 0.014256984926760197, + "learning_rate": 2.9489957903469017e-05, + "loss": 0.0115, + "num_input_tokens_seen": 108901632, + "step": 89480 + }, + { + "epoch": 9.966031852099343, + "grad_norm": 0.0029128154274076223, + "learning_rate": 2.948756765193926e-05, + "loss": 0.0057, + "num_input_tokens_seen": 108907936, + "step": 89485 + }, + { + "epoch": 9.96658870698296, + "grad_norm": 0.13578832149505615, + "learning_rate": 2.9485177358020827e-05, + "loss": 0.0422, + "num_input_tokens_seen": 108913920, + "step": 89490 + }, + { + "epoch": 9.967145561866577, + "grad_norm": 0.5490050315856934, + "learning_rate": 2.9482787021736308e-05, + "loss": 0.0204, + "num_input_tokens_seen": 108920256, + "step": 89495 + }, + { + "epoch": 9.967702416750194, + "grad_norm": 0.08364997804164886, + "learning_rate": 2.9480396643108267e-05, + "loss": 0.0124, + "num_input_tokens_seen": 108926176, + "step": 89500 + }, + { + "epoch": 9.968259271633812, + "grad_norm": 1.5736256837844849, + "learning_rate": 2.9478006222159294e-05, + "loss": 0.0878, + "num_input_tokens_seen": 108932064, + "step": 89505 + }, + { + "epoch": 9.96881612651743, + "grad_norm": 0.05636514723300934, + "learning_rate": 2.9475615758911963e-05, + "loss": 0.032, + "num_input_tokens_seen": 108937952, + "step": 89510 + }, + { + "epoch": 9.969372981401047, + "grad_norm": 0.05023657903075218, + "learning_rate": 2.9473225253388852e-05, + "loss": 0.0139, + "num_input_tokens_seen": 108944128, + "step": 89515 + }, + { + "epoch": 9.969929836284663, + "grad_norm": 0.030606111511588097, + "learning_rate": 2.9470834705612556e-05, + "loss": 0.1053, + "num_input_tokens_seen": 108950144, + "step": 89520 + }, + { + "epoch": 9.970486691168281, + "grad_norm": 0.04648324102163315, + "learning_rate": 2.9468444115605636e-05, + "loss": 0.0269, + "num_input_tokens_seen": 108956352, + "step": 89525 + }, + { + "epoch": 9.971043546051899, + "grad_norm": 0.0006782818236388266, + "learning_rate": 2.946605348339069e-05, + "loss": 0.0778, + "num_input_tokens_seen": 108962432, + "step": 89530 + }, + { + "epoch": 9.971600400935516, + "grad_norm": 0.3570927083492279, + "learning_rate": 2.946366280899028e-05, + "loss": 0.0451, + "num_input_tokens_seen": 108968416, + "step": 89535 + }, + { + "epoch": 9.972157255819134, + "grad_norm": 0.0003967854136135429, + "learning_rate": 2.9461272092426994e-05, + "loss": 0.0234, + "num_input_tokens_seen": 108974816, + "step": 89540 + }, + { + "epoch": 9.97271411070275, + "grad_norm": 0.025519654154777527, + "learning_rate": 2.945888133372343e-05, + "loss": 0.0507, + "num_input_tokens_seen": 108980960, + "step": 89545 + }, + { + "epoch": 9.973270965586368, + "grad_norm": 0.18738006055355072, + "learning_rate": 2.9456490532902154e-05, + "loss": 0.0238, + "num_input_tokens_seen": 108987072, + "step": 89550 + }, + { + "epoch": 9.973827820469985, + "grad_norm": 1.7152377367019653, + "learning_rate": 2.9454099689985758e-05, + "loss": 0.0702, + "num_input_tokens_seen": 108993376, + "step": 89555 + }, + { + "epoch": 9.974384675353603, + "grad_norm": 0.00711754010990262, + "learning_rate": 2.945170880499682e-05, + "loss": 0.0245, + "num_input_tokens_seen": 108999456, + "step": 89560 + }, + { + "epoch": 9.97494153023722, + "grad_norm": 0.9207944273948669, + "learning_rate": 2.9449317877957923e-05, + "loss": 0.0429, + "num_input_tokens_seen": 109005504, + "step": 89565 + }, + { + "epoch": 9.975498385120838, + "grad_norm": 0.189643993973732, + "learning_rate": 2.944692690889166e-05, + "loss": 0.0141, + "num_input_tokens_seen": 109011680, + "step": 89570 + }, + { + "epoch": 9.976055240004454, + "grad_norm": 0.6294909119606018, + "learning_rate": 2.94445358978206e-05, + "loss": 0.069, + "num_input_tokens_seen": 109018048, + "step": 89575 + }, + { + "epoch": 9.976612094888072, + "grad_norm": 0.00012563059863168746, + "learning_rate": 2.944214484476735e-05, + "loss": 0.0026, + "num_input_tokens_seen": 109024224, + "step": 89580 + }, + { + "epoch": 9.97716894977169, + "grad_norm": 0.7323429584503174, + "learning_rate": 2.9439753749754473e-05, + "loss": 0.0176, + "num_input_tokens_seen": 109030368, + "step": 89585 + }, + { + "epoch": 9.977725804655307, + "grad_norm": 2.830413818359375, + "learning_rate": 2.943736261280457e-05, + "loss": 0.3188, + "num_input_tokens_seen": 109036608, + "step": 89590 + }, + { + "epoch": 9.978282659538925, + "grad_norm": 1.0713684558868408, + "learning_rate": 2.9434971433940216e-05, + "loss": 0.1171, + "num_input_tokens_seen": 109043008, + "step": 89595 + }, + { + "epoch": 9.97883951442254, + "grad_norm": 1.4950997829437256, + "learning_rate": 2.943258021318401e-05, + "loss": 0.0929, + "num_input_tokens_seen": 109049248, + "step": 89600 + }, + { + "epoch": 9.979396369306158, + "grad_norm": 0.030624179169535637, + "learning_rate": 2.9430188950558536e-05, + "loss": 0.1055, + "num_input_tokens_seen": 109055392, + "step": 89605 + }, + { + "epoch": 9.979953224189776, + "grad_norm": 3.0881497859954834, + "learning_rate": 2.9427797646086362e-05, + "loss": 0.2326, + "num_input_tokens_seen": 109061408, + "step": 89610 + }, + { + "epoch": 9.980510079073394, + "grad_norm": 0.6454558968544006, + "learning_rate": 2.9425406299790108e-05, + "loss": 0.0661, + "num_input_tokens_seen": 109067072, + "step": 89615 + }, + { + "epoch": 9.981066933957011, + "grad_norm": 0.12920571863651276, + "learning_rate": 2.9423014911692337e-05, + "loss": 0.1406, + "num_input_tokens_seen": 109073312, + "step": 89620 + }, + { + "epoch": 9.981623788840627, + "grad_norm": 0.006514384876936674, + "learning_rate": 2.9420623481815658e-05, + "loss": 0.0101, + "num_input_tokens_seen": 109079552, + "step": 89625 + }, + { + "epoch": 9.982180643724245, + "grad_norm": 0.27886781096458435, + "learning_rate": 2.9418232010182634e-05, + "loss": 0.005, + "num_input_tokens_seen": 109086048, + "step": 89630 + }, + { + "epoch": 9.982737498607863, + "grad_norm": 0.2196720540523529, + "learning_rate": 2.9415840496815872e-05, + "loss": 0.0253, + "num_input_tokens_seen": 109092096, + "step": 89635 + }, + { + "epoch": 9.98329435349148, + "grad_norm": 0.00511678121984005, + "learning_rate": 2.9413448941737963e-05, + "loss": 0.0189, + "num_input_tokens_seen": 109098272, + "step": 89640 + }, + { + "epoch": 9.983851208375098, + "grad_norm": 1.3173612356185913, + "learning_rate": 2.9411057344971494e-05, + "loss": 0.0887, + "num_input_tokens_seen": 109104576, + "step": 89645 + }, + { + "epoch": 9.984408063258714, + "grad_norm": 1.5871760845184326, + "learning_rate": 2.940866570653905e-05, + "loss": 0.0902, + "num_input_tokens_seen": 109110784, + "step": 89650 + }, + { + "epoch": 9.984964918142332, + "grad_norm": 0.9306166768074036, + "learning_rate": 2.9406274026463226e-05, + "loss": 0.0574, + "num_input_tokens_seen": 109116960, + "step": 89655 + }, + { + "epoch": 9.98552177302595, + "grad_norm": 0.8713843822479248, + "learning_rate": 2.9403882304766617e-05, + "loss": 0.1167, + "num_input_tokens_seen": 109123200, + "step": 89660 + }, + { + "epoch": 9.986078627909567, + "grad_norm": 0.2233889251947403, + "learning_rate": 2.940149054147181e-05, + "loss": 0.1333, + "num_input_tokens_seen": 109129120, + "step": 89665 + }, + { + "epoch": 9.986635482793185, + "grad_norm": 0.2694464921951294, + "learning_rate": 2.9399098736601395e-05, + "loss": 0.0097, + "num_input_tokens_seen": 109135456, + "step": 89670 + }, + { + "epoch": 9.987192337676802, + "grad_norm": 0.5399957895278931, + "learning_rate": 2.9396706890177972e-05, + "loss": 0.0636, + "num_input_tokens_seen": 109141664, + "step": 89675 + }, + { + "epoch": 9.987749192560418, + "grad_norm": 0.5406844019889832, + "learning_rate": 2.9394315002224127e-05, + "loss": 0.0167, + "num_input_tokens_seen": 109147744, + "step": 89680 + }, + { + "epoch": 9.988306047444036, + "grad_norm": 0.007496044971048832, + "learning_rate": 2.9391923072762463e-05, + "loss": 0.0678, + "num_input_tokens_seen": 109154112, + "step": 89685 + }, + { + "epoch": 9.988862902327654, + "grad_norm": 0.056861184537410736, + "learning_rate": 2.938953110181556e-05, + "loss": 0.0511, + "num_input_tokens_seen": 109159936, + "step": 89690 + }, + { + "epoch": 9.989419757211271, + "grad_norm": 0.08949354290962219, + "learning_rate": 2.9387139089406013e-05, + "loss": 0.0048, + "num_input_tokens_seen": 109166048, + "step": 89695 + }, + { + "epoch": 9.989976612094889, + "grad_norm": 0.2492903620004654, + "learning_rate": 2.9384747035556436e-05, + "loss": 0.2101, + "num_input_tokens_seen": 109171616, + "step": 89700 + }, + { + "epoch": 9.990533466978505, + "grad_norm": 0.26066213846206665, + "learning_rate": 2.9382354940289404e-05, + "loss": 0.0141, + "num_input_tokens_seen": 109177920, + "step": 89705 + }, + { + "epoch": 9.991090321862123, + "grad_norm": 0.9637545943260193, + "learning_rate": 2.937996280362752e-05, + "loss": 0.1114, + "num_input_tokens_seen": 109184032, + "step": 89710 + }, + { + "epoch": 9.99164717674574, + "grad_norm": 0.052802857011556625, + "learning_rate": 2.9377570625593377e-05, + "loss": 0.0815, + "num_input_tokens_seen": 109190080, + "step": 89715 + }, + { + "epoch": 9.992204031629358, + "grad_norm": 0.7019839286804199, + "learning_rate": 2.937517840620957e-05, + "loss": 0.1421, + "num_input_tokens_seen": 109196352, + "step": 89720 + }, + { + "epoch": 9.992760886512976, + "grad_norm": 0.04270428419113159, + "learning_rate": 2.9372786145498698e-05, + "loss": 0.0491, + "num_input_tokens_seen": 109202368, + "step": 89725 + }, + { + "epoch": 9.993317741396591, + "grad_norm": 0.39776742458343506, + "learning_rate": 2.9370393843483357e-05, + "loss": 0.0714, + "num_input_tokens_seen": 109208704, + "step": 89730 + }, + { + "epoch": 9.99387459628021, + "grad_norm": 0.3317534923553467, + "learning_rate": 2.936800150018615e-05, + "loss": 0.0125, + "num_input_tokens_seen": 109214880, + "step": 89735 + }, + { + "epoch": 9.994431451163827, + "grad_norm": 0.039610475301742554, + "learning_rate": 2.9365609115629667e-05, + "loss": 0.043, + "num_input_tokens_seen": 109220928, + "step": 89740 + }, + { + "epoch": 9.994988306047444, + "grad_norm": 0.031862903386354446, + "learning_rate": 2.9363216689836508e-05, + "loss": 0.0337, + "num_input_tokens_seen": 109227104, + "step": 89745 + }, + { + "epoch": 9.995545160931062, + "grad_norm": 0.7606440782546997, + "learning_rate": 2.936082422282927e-05, + "loss": 0.0834, + "num_input_tokens_seen": 109232864, + "step": 89750 + }, + { + "epoch": 9.996102015814678, + "grad_norm": 1.1581518650054932, + "learning_rate": 2.935843171463056e-05, + "loss": 0.0367, + "num_input_tokens_seen": 109239040, + "step": 89755 + }, + { + "epoch": 9.996658870698296, + "grad_norm": 0.009556248784065247, + "learning_rate": 2.935603916526296e-05, + "loss": 0.0065, + "num_input_tokens_seen": 109245056, + "step": 89760 + }, + { + "epoch": 9.997215725581913, + "grad_norm": 0.001159742707386613, + "learning_rate": 2.9353646574749082e-05, + "loss": 0.0126, + "num_input_tokens_seen": 109251072, + "step": 89765 + }, + { + "epoch": 9.997772580465531, + "grad_norm": 3.8569676876068115, + "learning_rate": 2.9351253943111528e-05, + "loss": 0.1111, + "num_input_tokens_seen": 109257088, + "step": 89770 + }, + { + "epoch": 9.998329435349149, + "grad_norm": 1.1329423189163208, + "learning_rate": 2.934886127037289e-05, + "loss": 0.041, + "num_input_tokens_seen": 109263232, + "step": 89775 + }, + { + "epoch": 9.998886290232765, + "grad_norm": 0.2637823522090912, + "learning_rate": 2.9346468556555778e-05, + "loss": 0.0526, + "num_input_tokens_seen": 109269280, + "step": 89780 + }, + { + "epoch": 9.999443145116382, + "grad_norm": 0.005125812254846096, + "learning_rate": 2.9344075801682787e-05, + "loss": 0.0735, + "num_input_tokens_seen": 109274688, + "step": 89785 + }, + { + "epoch": 10.0, + "grad_norm": 6.645389556884766, + "learning_rate": 2.9341683005776515e-05, + "loss": 0.0876, + "num_input_tokens_seen": 109279616, + "step": 89790 + }, + { + "epoch": 10.0, + "eval_loss": 0.07813919335603714, + "eval_runtime": 112.2559, + "eval_samples_per_second": 35.553, + "eval_steps_per_second": 8.89, + "num_input_tokens_seen": 109279616, + "step": 89790 + }, + { + "epoch": 10.000556854883618, + "grad_norm": 0.01989409700036049, + "learning_rate": 2.933929016885958e-05, + "loss": 0.0254, + "num_input_tokens_seen": 109285536, + "step": 89795 + }, + { + "epoch": 10.001113709767235, + "grad_norm": 0.0004956034244969487, + "learning_rate": 2.9336897290954556e-05, + "loss": 0.019, + "num_input_tokens_seen": 109291840, + "step": 89800 + }, + { + "epoch": 10.001670564650851, + "grad_norm": 0.5382969975471497, + "learning_rate": 2.9334504372084077e-05, + "loss": 0.0106, + "num_input_tokens_seen": 109298016, + "step": 89805 + }, + { + "epoch": 10.002227419534469, + "grad_norm": 1.3301092386245728, + "learning_rate": 2.9332111412270726e-05, + "loss": 0.1095, + "num_input_tokens_seen": 109302784, + "step": 89810 + }, + { + "epoch": 10.002784274418087, + "grad_norm": 0.29401758313179016, + "learning_rate": 2.9329718411537114e-05, + "loss": 0.138, + "num_input_tokens_seen": 109309024, + "step": 89815 + }, + { + "epoch": 10.003341129301704, + "grad_norm": 0.6484407186508179, + "learning_rate": 2.932732536990584e-05, + "loss": 0.0109, + "num_input_tokens_seen": 109315072, + "step": 89820 + }, + { + "epoch": 10.003897984185322, + "grad_norm": 0.03954162448644638, + "learning_rate": 2.9324932287399507e-05, + "loss": 0.0366, + "num_input_tokens_seen": 109321120, + "step": 89825 + }, + { + "epoch": 10.004454839068938, + "grad_norm": 0.11274958401918411, + "learning_rate": 2.932253916404073e-05, + "loss": 0.1038, + "num_input_tokens_seen": 109327264, + "step": 89830 + }, + { + "epoch": 10.005011693952556, + "grad_norm": 0.3588860034942627, + "learning_rate": 2.9320145999852105e-05, + "loss": 0.1429, + "num_input_tokens_seen": 109333312, + "step": 89835 + }, + { + "epoch": 10.005568548836173, + "grad_norm": 0.4417874813079834, + "learning_rate": 2.9317752794856247e-05, + "loss": 0.0445, + "num_input_tokens_seen": 109339104, + "step": 89840 + }, + { + "epoch": 10.00612540371979, + "grad_norm": 0.0002413435431662947, + "learning_rate": 2.9315359549075744e-05, + "loss": 0.0657, + "num_input_tokens_seen": 109344928, + "step": 89845 + }, + { + "epoch": 10.006682258603409, + "grad_norm": 1.1231728792190552, + "learning_rate": 2.931296626253322e-05, + "loss": 0.0487, + "num_input_tokens_seen": 109351264, + "step": 89850 + }, + { + "epoch": 10.007239113487024, + "grad_norm": 0.21163763105869293, + "learning_rate": 2.931057293525127e-05, + "loss": 0.0137, + "num_input_tokens_seen": 109357664, + "step": 89855 + }, + { + "epoch": 10.007795968370642, + "grad_norm": 0.7490569949150085, + "learning_rate": 2.9308179567252504e-05, + "loss": 0.0235, + "num_input_tokens_seen": 109363680, + "step": 89860 + }, + { + "epoch": 10.00835282325426, + "grad_norm": 0.38783055543899536, + "learning_rate": 2.9305786158559535e-05, + "loss": 0.1056, + "num_input_tokens_seen": 109369952, + "step": 89865 + }, + { + "epoch": 10.008909678137877, + "grad_norm": 0.29939189553260803, + "learning_rate": 2.9303392709194953e-05, + "loss": 0.0778, + "num_input_tokens_seen": 109376096, + "step": 89870 + }, + { + "epoch": 10.009466533021495, + "grad_norm": 0.26405787467956543, + "learning_rate": 2.9300999219181396e-05, + "loss": 0.0211, + "num_input_tokens_seen": 109382112, + "step": 89875 + }, + { + "epoch": 10.010023387905113, + "grad_norm": 0.0016662169946357608, + "learning_rate": 2.9298605688541446e-05, + "loss": 0.0097, + "num_input_tokens_seen": 109388064, + "step": 89880 + }, + { + "epoch": 10.010580242788729, + "grad_norm": 0.3518497049808502, + "learning_rate": 2.9296212117297728e-05, + "loss": 0.0217, + "num_input_tokens_seen": 109393888, + "step": 89885 + }, + { + "epoch": 10.011137097672346, + "grad_norm": 1.234463095664978, + "learning_rate": 2.9293818505472837e-05, + "loss": 0.0369, + "num_input_tokens_seen": 109400064, + "step": 89890 + }, + { + "epoch": 10.011693952555964, + "grad_norm": 0.08476325124502182, + "learning_rate": 2.929142485308939e-05, + "loss": 0.0051, + "num_input_tokens_seen": 109406208, + "step": 89895 + }, + { + "epoch": 10.012250807439582, + "grad_norm": 0.004369326401501894, + "learning_rate": 2.9289031160170005e-05, + "loss": 0.105, + "num_input_tokens_seen": 109412576, + "step": 89900 + }, + { + "epoch": 10.0128076623232, + "grad_norm": 0.13564059138298035, + "learning_rate": 2.928663742673728e-05, + "loss": 0.0214, + "num_input_tokens_seen": 109418656, + "step": 89905 + }, + { + "epoch": 10.013364517206815, + "grad_norm": 0.6118622422218323, + "learning_rate": 2.928424365281383e-05, + "loss": 0.044, + "num_input_tokens_seen": 109424800, + "step": 89910 + }, + { + "epoch": 10.013921372090433, + "grad_norm": 0.008197814226150513, + "learning_rate": 2.9281849838422267e-05, + "loss": 0.006, + "num_input_tokens_seen": 109430976, + "step": 89915 + }, + { + "epoch": 10.01447822697405, + "grad_norm": 0.875106692314148, + "learning_rate": 2.9279455983585195e-05, + "loss": 0.127, + "num_input_tokens_seen": 109437312, + "step": 89920 + }, + { + "epoch": 10.015035081857668, + "grad_norm": 0.042364396154880524, + "learning_rate": 2.9277062088325242e-05, + "loss": 0.0016, + "num_input_tokens_seen": 109443264, + "step": 89925 + }, + { + "epoch": 10.015591936741286, + "grad_norm": 0.0052088601514697075, + "learning_rate": 2.927466815266501e-05, + "loss": 0.0227, + "num_input_tokens_seen": 109449440, + "step": 89930 + }, + { + "epoch": 10.016148791624902, + "grad_norm": 0.0002655604330357164, + "learning_rate": 2.927227417662711e-05, + "loss": 0.0151, + "num_input_tokens_seen": 109455424, + "step": 89935 + }, + { + "epoch": 10.01670564650852, + "grad_norm": 1.0025265216827393, + "learning_rate": 2.926988016023416e-05, + "loss": 0.0285, + "num_input_tokens_seen": 109461216, + "step": 89940 + }, + { + "epoch": 10.017262501392137, + "grad_norm": 0.22069604694843292, + "learning_rate": 2.9267486103508763e-05, + "loss": 0.0568, + "num_input_tokens_seen": 109467296, + "step": 89945 + }, + { + "epoch": 10.017819356275755, + "grad_norm": 2.1696603298187256, + "learning_rate": 2.9265092006473548e-05, + "loss": 0.0893, + "num_input_tokens_seen": 109473472, + "step": 89950 + }, + { + "epoch": 10.018376211159373, + "grad_norm": 0.05406998470425606, + "learning_rate": 2.9262697869151117e-05, + "loss": 0.1263, + "num_input_tokens_seen": 109479744, + "step": 89955 + }, + { + "epoch": 10.018933066042989, + "grad_norm": 0.14635813236236572, + "learning_rate": 2.9260303691564095e-05, + "loss": 0.0086, + "num_input_tokens_seen": 109485792, + "step": 89960 + }, + { + "epoch": 10.019489920926606, + "grad_norm": 0.035335857421159744, + "learning_rate": 2.925790947373509e-05, + "loss": 0.0046, + "num_input_tokens_seen": 109491840, + "step": 89965 + }, + { + "epoch": 10.020046775810224, + "grad_norm": 0.5239550471305847, + "learning_rate": 2.9255515215686714e-05, + "loss": 0.0157, + "num_input_tokens_seen": 109497920, + "step": 89970 + }, + { + "epoch": 10.020603630693842, + "grad_norm": 0.0004892511642538011, + "learning_rate": 2.9253120917441596e-05, + "loss": 0.0093, + "num_input_tokens_seen": 109504192, + "step": 89975 + }, + { + "epoch": 10.02116048557746, + "grad_norm": 0.07141068577766418, + "learning_rate": 2.925072657902233e-05, + "loss": 0.002, + "num_input_tokens_seen": 109510560, + "step": 89980 + }, + { + "epoch": 10.021717340461075, + "grad_norm": 1.458875298500061, + "learning_rate": 2.924833220045156e-05, + "loss": 0.1799, + "num_input_tokens_seen": 109516608, + "step": 89985 + }, + { + "epoch": 10.022274195344693, + "grad_norm": 0.03521132469177246, + "learning_rate": 2.924593778175188e-05, + "loss": 0.1094, + "num_input_tokens_seen": 109522976, + "step": 89990 + }, + { + "epoch": 10.02283105022831, + "grad_norm": 0.11777536571025848, + "learning_rate": 2.924354332294592e-05, + "loss": 0.0346, + "num_input_tokens_seen": 109529120, + "step": 89995 + }, + { + "epoch": 10.023387905111928, + "grad_norm": 0.8845700621604919, + "learning_rate": 2.9241148824056292e-05, + "loss": 0.0158, + "num_input_tokens_seen": 109534624, + "step": 90000 + }, + { + "epoch": 10.023944759995546, + "grad_norm": 0.05091031640768051, + "learning_rate": 2.9238754285105614e-05, + "loss": 0.055, + "num_input_tokens_seen": 109540640, + "step": 90005 + }, + { + "epoch": 10.024501614879162, + "grad_norm": 0.378250390291214, + "learning_rate": 2.9236359706116505e-05, + "loss": 0.0054, + "num_input_tokens_seen": 109546496, + "step": 90010 + }, + { + "epoch": 10.02505846976278, + "grad_norm": 0.1021905317902565, + "learning_rate": 2.9233965087111588e-05, + "loss": 0.0682, + "num_input_tokens_seen": 109551680, + "step": 90015 + }, + { + "epoch": 10.025615324646397, + "grad_norm": 0.05878377705812454, + "learning_rate": 2.9231570428113475e-05, + "loss": 0.0053, + "num_input_tokens_seen": 109557600, + "step": 90020 + }, + { + "epoch": 10.026172179530015, + "grad_norm": 0.32398176193237305, + "learning_rate": 2.9229175729144792e-05, + "loss": 0.0077, + "num_input_tokens_seen": 109563968, + "step": 90025 + }, + { + "epoch": 10.026729034413632, + "grad_norm": 0.007046797778457403, + "learning_rate": 2.9226780990228158e-05, + "loss": 0.0139, + "num_input_tokens_seen": 109569984, + "step": 90030 + }, + { + "epoch": 10.027285889297248, + "grad_norm": 1.286930799484253, + "learning_rate": 2.9224386211386185e-05, + "loss": 0.0327, + "num_input_tokens_seen": 109575424, + "step": 90035 + }, + { + "epoch": 10.027842744180866, + "grad_norm": 0.6442610621452332, + "learning_rate": 2.92219913926415e-05, + "loss": 0.0299, + "num_input_tokens_seen": 109581568, + "step": 90040 + }, + { + "epoch": 10.028399599064484, + "grad_norm": 0.3170303404331207, + "learning_rate": 2.921959653401673e-05, + "loss": 0.1083, + "num_input_tokens_seen": 109587648, + "step": 90045 + }, + { + "epoch": 10.028956453948101, + "grad_norm": 0.3831429183483124, + "learning_rate": 2.9217201635534487e-05, + "loss": 0.0599, + "num_input_tokens_seen": 109593248, + "step": 90050 + }, + { + "epoch": 10.029513308831719, + "grad_norm": 0.7419175505638123, + "learning_rate": 2.9214806697217396e-05, + "loss": 0.0404, + "num_input_tokens_seen": 109598784, + "step": 90055 + }, + { + "epoch": 10.030070163715337, + "grad_norm": 0.5556513667106628, + "learning_rate": 2.9212411719088074e-05, + "loss": 0.0455, + "num_input_tokens_seen": 109604832, + "step": 90060 + }, + { + "epoch": 10.030627018598953, + "grad_norm": 0.4842556118965149, + "learning_rate": 2.921001670116915e-05, + "loss": 0.0171, + "num_input_tokens_seen": 109610784, + "step": 90065 + }, + { + "epoch": 10.03118387348257, + "grad_norm": 0.3878639042377472, + "learning_rate": 2.920762164348325e-05, + "loss": 0.0769, + "num_input_tokens_seen": 109617056, + "step": 90070 + }, + { + "epoch": 10.031740728366188, + "grad_norm": 1.1937637329101562, + "learning_rate": 2.9205226546052987e-05, + "loss": 0.0337, + "num_input_tokens_seen": 109623136, + "step": 90075 + }, + { + "epoch": 10.032297583249806, + "grad_norm": 0.4494868516921997, + "learning_rate": 2.9202831408901e-05, + "loss": 0.0092, + "num_input_tokens_seen": 109629600, + "step": 90080 + }, + { + "epoch": 10.032854438133423, + "grad_norm": 0.027315454557538033, + "learning_rate": 2.9200436232049895e-05, + "loss": 0.033, + "num_input_tokens_seen": 109635776, + "step": 90085 + }, + { + "epoch": 10.03341129301704, + "grad_norm": 0.23767708241939545, + "learning_rate": 2.9198041015522305e-05, + "loss": 0.0646, + "num_input_tokens_seen": 109641920, + "step": 90090 + }, + { + "epoch": 10.033968147900657, + "grad_norm": 0.39819061756134033, + "learning_rate": 2.9195645759340855e-05, + "loss": 0.0304, + "num_input_tokens_seen": 109648128, + "step": 90095 + }, + { + "epoch": 10.034525002784275, + "grad_norm": 0.04582267627120018, + "learning_rate": 2.9193250463528166e-05, + "loss": 0.0459, + "num_input_tokens_seen": 109654080, + "step": 90100 + }, + { + "epoch": 10.035081857667892, + "grad_norm": 1.4106084108352661, + "learning_rate": 2.9190855128106875e-05, + "loss": 0.049, + "num_input_tokens_seen": 109659744, + "step": 90105 + }, + { + "epoch": 10.03563871255151, + "grad_norm": 0.5509694218635559, + "learning_rate": 2.918845975309959e-05, + "loss": 0.1251, + "num_input_tokens_seen": 109665600, + "step": 90110 + }, + { + "epoch": 10.036195567435126, + "grad_norm": 0.17509660124778748, + "learning_rate": 2.9186064338528955e-05, + "loss": 0.111, + "num_input_tokens_seen": 109671872, + "step": 90115 + }, + { + "epoch": 10.036752422318743, + "grad_norm": 0.00012336958025116473, + "learning_rate": 2.9183668884417582e-05, + "loss": 0.048, + "num_input_tokens_seen": 109677984, + "step": 90120 + }, + { + "epoch": 10.037309277202361, + "grad_norm": 0.21905463933944702, + "learning_rate": 2.918127339078811e-05, + "loss": 0.0577, + "num_input_tokens_seen": 109684192, + "step": 90125 + }, + { + "epoch": 10.037866132085979, + "grad_norm": 0.20199057459831238, + "learning_rate": 2.9178877857663156e-05, + "loss": 0.0485, + "num_input_tokens_seen": 109690336, + "step": 90130 + }, + { + "epoch": 10.038422986969596, + "grad_norm": 0.1347237527370453, + "learning_rate": 2.9176482285065355e-05, + "loss": 0.0022, + "num_input_tokens_seen": 109696544, + "step": 90135 + }, + { + "epoch": 10.038979841853212, + "grad_norm": 0.07218373566865921, + "learning_rate": 2.9174086673017337e-05, + "loss": 0.0063, + "num_input_tokens_seen": 109702912, + "step": 90140 + }, + { + "epoch": 10.03953669673683, + "grad_norm": 0.16434553265571594, + "learning_rate": 2.917169102154172e-05, + "loss": 0.0079, + "num_input_tokens_seen": 109708576, + "step": 90145 + }, + { + "epoch": 10.040093551620448, + "grad_norm": 8.945462468545884e-05, + "learning_rate": 2.916929533066114e-05, + "loss": 0.0175, + "num_input_tokens_seen": 109714880, + "step": 90150 + }, + { + "epoch": 10.040650406504065, + "grad_norm": 0.0002519238041713834, + "learning_rate": 2.9166899600398225e-05, + "loss": 0.0329, + "num_input_tokens_seen": 109721152, + "step": 90155 + }, + { + "epoch": 10.041207261387683, + "grad_norm": 2.344637155532837, + "learning_rate": 2.9164503830775607e-05, + "loss": 0.1974, + "num_input_tokens_seen": 109727200, + "step": 90160 + }, + { + "epoch": 10.041764116271299, + "grad_norm": 0.6454798579216003, + "learning_rate": 2.9162108021815915e-05, + "loss": 0.0843, + "num_input_tokens_seen": 109732768, + "step": 90165 + }, + { + "epoch": 10.042320971154917, + "grad_norm": 0.005775453057140112, + "learning_rate": 2.915971217354177e-05, + "loss": 0.082, + "num_input_tokens_seen": 109738976, + "step": 90170 + }, + { + "epoch": 10.042877826038534, + "grad_norm": 0.04869465157389641, + "learning_rate": 2.9157316285975823e-05, + "loss": 0.0131, + "num_input_tokens_seen": 109745088, + "step": 90175 + }, + { + "epoch": 10.043434680922152, + "grad_norm": 0.0597756989300251, + "learning_rate": 2.915492035914068e-05, + "loss": 0.0351, + "num_input_tokens_seen": 109751104, + "step": 90180 + }, + { + "epoch": 10.04399153580577, + "grad_norm": 0.5155568718910217, + "learning_rate": 2.9152524393059e-05, + "loss": 0.1022, + "num_input_tokens_seen": 109757312, + "step": 90185 + }, + { + "epoch": 10.044548390689386, + "grad_norm": 0.022249197587370872, + "learning_rate": 2.9150128387753385e-05, + "loss": 0.0216, + "num_input_tokens_seen": 109763136, + "step": 90190 + }, + { + "epoch": 10.045105245573003, + "grad_norm": 0.7664013504981995, + "learning_rate": 2.9147732343246488e-05, + "loss": 0.03, + "num_input_tokens_seen": 109768448, + "step": 90195 + }, + { + "epoch": 10.045662100456621, + "grad_norm": 1.198426365852356, + "learning_rate": 2.914533625956094e-05, + "loss": 0.0376, + "num_input_tokens_seen": 109774688, + "step": 90200 + }, + { + "epoch": 10.046218955340239, + "grad_norm": 0.0003738888481166214, + "learning_rate": 2.9142940136719366e-05, + "loss": 0.055, + "num_input_tokens_seen": 109780448, + "step": 90205 + }, + { + "epoch": 10.046775810223856, + "grad_norm": 0.03623858094215393, + "learning_rate": 2.9140543974744405e-05, + "loss": 0.0588, + "num_input_tokens_seen": 109786368, + "step": 90210 + }, + { + "epoch": 10.047332665107472, + "grad_norm": 0.6338223814964294, + "learning_rate": 2.9138147773658688e-05, + "loss": 0.0297, + "num_input_tokens_seen": 109792384, + "step": 90215 + }, + { + "epoch": 10.04788951999109, + "grad_norm": 0.7550414204597473, + "learning_rate": 2.913575153348485e-05, + "loss": 0.1387, + "num_input_tokens_seen": 109797664, + "step": 90220 + }, + { + "epoch": 10.048446374874707, + "grad_norm": 0.33024513721466064, + "learning_rate": 2.9133355254245526e-05, + "loss": 0.0047, + "num_input_tokens_seen": 109803872, + "step": 90225 + }, + { + "epoch": 10.049003229758325, + "grad_norm": 0.915779173374176, + "learning_rate": 2.9130958935963348e-05, + "loss": 0.0549, + "num_input_tokens_seen": 109809824, + "step": 90230 + }, + { + "epoch": 10.049560084641943, + "grad_norm": 0.5117797255516052, + "learning_rate": 2.9128562578660956e-05, + "loss": 0.1374, + "num_input_tokens_seen": 109815968, + "step": 90235 + }, + { + "epoch": 10.05011693952556, + "grad_norm": 1.2245608568191528, + "learning_rate": 2.9126166182360982e-05, + "loss": 0.1059, + "num_input_tokens_seen": 109822176, + "step": 90240 + }, + { + "epoch": 10.050673794409176, + "grad_norm": 0.06112973019480705, + "learning_rate": 2.912376974708606e-05, + "loss": 0.0121, + "num_input_tokens_seen": 109828512, + "step": 90245 + }, + { + "epoch": 10.051230649292794, + "grad_norm": 0.08098883926868439, + "learning_rate": 2.912137327285883e-05, + "loss": 0.1555, + "num_input_tokens_seen": 109834976, + "step": 90250 + }, + { + "epoch": 10.051787504176412, + "grad_norm": 0.007850832305848598, + "learning_rate": 2.9118976759701934e-05, + "loss": 0.012, + "num_input_tokens_seen": 109841312, + "step": 90255 + }, + { + "epoch": 10.05234435906003, + "grad_norm": 1.092487096786499, + "learning_rate": 2.9116580207637988e-05, + "loss": 0.092, + "num_input_tokens_seen": 109847424, + "step": 90260 + }, + { + "epoch": 10.052901213943647, + "grad_norm": 1.3690377473831177, + "learning_rate": 2.911418361668965e-05, + "loss": 0.0786, + "num_input_tokens_seen": 109853568, + "step": 90265 + }, + { + "epoch": 10.053458068827263, + "grad_norm": 0.00617495970800519, + "learning_rate": 2.9111786986879557e-05, + "loss": 0.0019, + "num_input_tokens_seen": 109859776, + "step": 90270 + }, + { + "epoch": 10.05401492371088, + "grad_norm": 0.0030393823981285095, + "learning_rate": 2.9109390318230338e-05, + "loss": 0.023, + "num_input_tokens_seen": 109865760, + "step": 90275 + }, + { + "epoch": 10.054571778594498, + "grad_norm": 0.04987160861492157, + "learning_rate": 2.9106993610764638e-05, + "loss": 0.0677, + "num_input_tokens_seen": 109871936, + "step": 90280 + }, + { + "epoch": 10.055128633478116, + "grad_norm": 1.7952507734298706, + "learning_rate": 2.9104596864505084e-05, + "loss": 0.0499, + "num_input_tokens_seen": 109877920, + "step": 90285 + }, + { + "epoch": 10.055685488361734, + "grad_norm": 1.4804309606552124, + "learning_rate": 2.9102200079474327e-05, + "loss": 0.0287, + "num_input_tokens_seen": 109884160, + "step": 90290 + }, + { + "epoch": 10.05624234324535, + "grad_norm": 1.1354527473449707, + "learning_rate": 2.9099803255695012e-05, + "loss": 0.156, + "num_input_tokens_seen": 109890112, + "step": 90295 + }, + { + "epoch": 10.056799198128967, + "grad_norm": 0.001871173270046711, + "learning_rate": 2.9097406393189763e-05, + "loss": 0.1092, + "num_input_tokens_seen": 109896160, + "step": 90300 + }, + { + "epoch": 10.057356053012585, + "grad_norm": 1.049967646598816, + "learning_rate": 2.9095009491981235e-05, + "loss": 0.0578, + "num_input_tokens_seen": 109902368, + "step": 90305 + }, + { + "epoch": 10.057912907896203, + "grad_norm": 0.006224131677299738, + "learning_rate": 2.909261255209206e-05, + "loss": 0.025, + "num_input_tokens_seen": 109908352, + "step": 90310 + }, + { + "epoch": 10.05846976277982, + "grad_norm": 0.789426863193512, + "learning_rate": 2.9090215573544876e-05, + "loss": 0.02, + "num_input_tokens_seen": 109914560, + "step": 90315 + }, + { + "epoch": 10.059026617663436, + "grad_norm": 0.009816220961511135, + "learning_rate": 2.9087818556362328e-05, + "loss": 0.0295, + "num_input_tokens_seen": 109920640, + "step": 90320 + }, + { + "epoch": 10.059583472547054, + "grad_norm": 0.00020362052600830793, + "learning_rate": 2.9085421500567055e-05, + "loss": 0.1781, + "num_input_tokens_seen": 109926464, + "step": 90325 + }, + { + "epoch": 10.060140327430672, + "grad_norm": 0.15651479363441467, + "learning_rate": 2.9083024406181712e-05, + "loss": 0.0051, + "num_input_tokens_seen": 109932704, + "step": 90330 + }, + { + "epoch": 10.06069718231429, + "grad_norm": 0.00857486017048359, + "learning_rate": 2.9080627273228927e-05, + "loss": 0.0542, + "num_input_tokens_seen": 109938880, + "step": 90335 + }, + { + "epoch": 10.061254037197907, + "grad_norm": 0.52097088098526, + "learning_rate": 2.907823010173135e-05, + "loss": 0.0194, + "num_input_tokens_seen": 109944992, + "step": 90340 + }, + { + "epoch": 10.061810892081523, + "grad_norm": 0.43473172187805176, + "learning_rate": 2.907583289171163e-05, + "loss": 0.0143, + "num_input_tokens_seen": 109950976, + "step": 90345 + }, + { + "epoch": 10.06236774696514, + "grad_norm": 0.04486992955207825, + "learning_rate": 2.9073435643192393e-05, + "loss": 0.0083, + "num_input_tokens_seen": 109957472, + "step": 90350 + }, + { + "epoch": 10.062924601848758, + "grad_norm": 0.08004800230264664, + "learning_rate": 2.9071038356196295e-05, + "loss": 0.0192, + "num_input_tokens_seen": 109963584, + "step": 90355 + }, + { + "epoch": 10.063481456732376, + "grad_norm": 0.008266261778771877, + "learning_rate": 2.906864103074598e-05, + "loss": 0.0313, + "num_input_tokens_seen": 109969888, + "step": 90360 + }, + { + "epoch": 10.064038311615993, + "grad_norm": 1.682579517364502, + "learning_rate": 2.906624366686409e-05, + "loss": 0.0508, + "num_input_tokens_seen": 109976000, + "step": 90365 + }, + { + "epoch": 10.06459516649961, + "grad_norm": 1.1762722730636597, + "learning_rate": 2.9063846264573262e-05, + "loss": 0.059, + "num_input_tokens_seen": 109982144, + "step": 90370 + }, + { + "epoch": 10.065152021383227, + "grad_norm": 0.17508311569690704, + "learning_rate": 2.9061448823896158e-05, + "loss": 0.0053, + "num_input_tokens_seen": 109988416, + "step": 90375 + }, + { + "epoch": 10.065708876266845, + "grad_norm": 0.04419058933854103, + "learning_rate": 2.905905134485542e-05, + "loss": 0.0472, + "num_input_tokens_seen": 109994720, + "step": 90380 + }, + { + "epoch": 10.066265731150462, + "grad_norm": 0.014380628243088722, + "learning_rate": 2.905665382747368e-05, + "loss": 0.0117, + "num_input_tokens_seen": 110000992, + "step": 90385 + }, + { + "epoch": 10.06682258603408, + "grad_norm": 0.00035954578197561204, + "learning_rate": 2.9054256271773605e-05, + "loss": 0.0004, + "num_input_tokens_seen": 110007200, + "step": 90390 + }, + { + "epoch": 10.067379440917696, + "grad_norm": 0.01044188067317009, + "learning_rate": 2.905185867777782e-05, + "loss": 0.0101, + "num_input_tokens_seen": 110013632, + "step": 90395 + }, + { + "epoch": 10.067936295801314, + "grad_norm": 2.7180216312408447, + "learning_rate": 2.9049461045508997e-05, + "loss": 0.1013, + "num_input_tokens_seen": 110019776, + "step": 90400 + }, + { + "epoch": 10.068493150684931, + "grad_norm": 0.024776367470622063, + "learning_rate": 2.9047063374989757e-05, + "loss": 0.0081, + "num_input_tokens_seen": 110025824, + "step": 90405 + }, + { + "epoch": 10.069050005568549, + "grad_norm": 0.007897446863353252, + "learning_rate": 2.9044665666242764e-05, + "loss": 0.0253, + "num_input_tokens_seen": 110031712, + "step": 90410 + }, + { + "epoch": 10.069606860452167, + "grad_norm": 0.12271004170179367, + "learning_rate": 2.9042267919290673e-05, + "loss": 0.0136, + "num_input_tokens_seen": 110037984, + "step": 90415 + }, + { + "epoch": 10.070163715335784, + "grad_norm": 0.005597525276243687, + "learning_rate": 2.903987013415611e-05, + "loss": 0.0568, + "num_input_tokens_seen": 110044192, + "step": 90420 + }, + { + "epoch": 10.0707205702194, + "grad_norm": 0.002669590525329113, + "learning_rate": 2.9037472310861747e-05, + "loss": 0.0245, + "num_input_tokens_seen": 110050304, + "step": 90425 + }, + { + "epoch": 10.071277425103018, + "grad_norm": 0.03286857157945633, + "learning_rate": 2.9035074449430215e-05, + "loss": 0.1002, + "num_input_tokens_seen": 110055872, + "step": 90430 + }, + { + "epoch": 10.071834279986636, + "grad_norm": 0.09543849527835846, + "learning_rate": 2.903267654988418e-05, + "loss": 0.0388, + "num_input_tokens_seen": 110062496, + "step": 90435 + }, + { + "epoch": 10.072391134870253, + "grad_norm": 0.2614850401878357, + "learning_rate": 2.9030278612246275e-05, + "loss": 0.0249, + "num_input_tokens_seen": 110068960, + "step": 90440 + }, + { + "epoch": 10.072947989753871, + "grad_norm": 0.0007451309356838465, + "learning_rate": 2.9027880636539164e-05, + "loss": 0.0639, + "num_input_tokens_seen": 110074976, + "step": 90445 + }, + { + "epoch": 10.073504844637487, + "grad_norm": 1.0058534145355225, + "learning_rate": 2.9025482622785493e-05, + "loss": 0.0682, + "num_input_tokens_seen": 110080768, + "step": 90450 + }, + { + "epoch": 10.074061699521105, + "grad_norm": 1.3455605506896973, + "learning_rate": 2.9023084571007915e-05, + "loss": 0.0632, + "num_input_tokens_seen": 110086848, + "step": 90455 + }, + { + "epoch": 10.074618554404722, + "grad_norm": 0.007877367548644543, + "learning_rate": 2.902068648122908e-05, + "loss": 0.0067, + "num_input_tokens_seen": 110092896, + "step": 90460 + }, + { + "epoch": 10.07517540928834, + "grad_norm": 0.929928183555603, + "learning_rate": 2.901828835347164e-05, + "loss": 0.0301, + "num_input_tokens_seen": 110098752, + "step": 90465 + }, + { + "epoch": 10.075732264171958, + "grad_norm": 0.003123176982626319, + "learning_rate": 2.9015890187758243e-05, + "loss": 0.0098, + "num_input_tokens_seen": 110104896, + "step": 90470 + }, + { + "epoch": 10.076289119055573, + "grad_norm": 0.007521332241594791, + "learning_rate": 2.9013491984111553e-05, + "loss": 0.0069, + "num_input_tokens_seen": 110111104, + "step": 90475 + }, + { + "epoch": 10.076845973939191, + "grad_norm": 1.1779288053512573, + "learning_rate": 2.9011093742554206e-05, + "loss": 0.1151, + "num_input_tokens_seen": 110117152, + "step": 90480 + }, + { + "epoch": 10.077402828822809, + "grad_norm": 1.1584304571151733, + "learning_rate": 2.9008695463108876e-05, + "loss": 0.0701, + "num_input_tokens_seen": 110123680, + "step": 90485 + }, + { + "epoch": 10.077959683706426, + "grad_norm": 0.8833099007606506, + "learning_rate": 2.9006297145798194e-05, + "loss": 0.0416, + "num_input_tokens_seen": 110129952, + "step": 90490 + }, + { + "epoch": 10.078516538590044, + "grad_norm": 1.417075276374817, + "learning_rate": 2.9003898790644835e-05, + "loss": 0.1836, + "num_input_tokens_seen": 110135808, + "step": 90495 + }, + { + "epoch": 10.07907339347366, + "grad_norm": 0.2897055745124817, + "learning_rate": 2.900150039767144e-05, + "loss": 0.0086, + "num_input_tokens_seen": 110142176, + "step": 90500 + }, + { + "epoch": 10.079630248357278, + "grad_norm": 0.002837708219885826, + "learning_rate": 2.8999101966900667e-05, + "loss": 0.0744, + "num_input_tokens_seen": 110148448, + "step": 90505 + }, + { + "epoch": 10.080187103240895, + "grad_norm": 0.9344477653503418, + "learning_rate": 2.8996703498355176e-05, + "loss": 0.0322, + "num_input_tokens_seen": 110153792, + "step": 90510 + }, + { + "epoch": 10.080743958124513, + "grad_norm": 0.9184141755104065, + "learning_rate": 2.8994304992057614e-05, + "loss": 0.0692, + "num_input_tokens_seen": 110159872, + "step": 90515 + }, + { + "epoch": 10.08130081300813, + "grad_norm": 2.339839458465576, + "learning_rate": 2.8991906448030643e-05, + "loss": 0.0841, + "num_input_tokens_seen": 110166080, + "step": 90520 + }, + { + "epoch": 10.081857667891747, + "grad_norm": 9.630355634726584e-05, + "learning_rate": 2.8989507866296916e-05, + "loss": 0.0137, + "num_input_tokens_seen": 110172032, + "step": 90525 + }, + { + "epoch": 10.082414522775364, + "grad_norm": 0.547074556350708, + "learning_rate": 2.8987109246879096e-05, + "loss": 0.0436, + "num_input_tokens_seen": 110178368, + "step": 90530 + }, + { + "epoch": 10.082971377658982, + "grad_norm": 0.027321144938468933, + "learning_rate": 2.898471058979983e-05, + "loss": 0.0027, + "num_input_tokens_seen": 110184480, + "step": 90535 + }, + { + "epoch": 10.0835282325426, + "grad_norm": 0.00014801295765209943, + "learning_rate": 2.8982311895081778e-05, + "loss": 0.0235, + "num_input_tokens_seen": 110190688, + "step": 90540 + }, + { + "epoch": 10.084085087426217, + "grad_norm": 0.9979311227798462, + "learning_rate": 2.89799131627476e-05, + "loss": 0.0555, + "num_input_tokens_seen": 110196736, + "step": 90545 + }, + { + "epoch": 10.084641942309833, + "grad_norm": 0.2556215524673462, + "learning_rate": 2.897751439281996e-05, + "loss": 0.0038, + "num_input_tokens_seen": 110203008, + "step": 90550 + }, + { + "epoch": 10.085198797193451, + "grad_norm": 0.609765350818634, + "learning_rate": 2.8975115585321506e-05, + "loss": 0.01, + "num_input_tokens_seen": 110209312, + "step": 90555 + }, + { + "epoch": 10.085755652077069, + "grad_norm": 1.743622899055481, + "learning_rate": 2.89727167402749e-05, + "loss": 0.1372, + "num_input_tokens_seen": 110215584, + "step": 90560 + }, + { + "epoch": 10.086312506960686, + "grad_norm": 1.4657498598098755, + "learning_rate": 2.89703178577028e-05, + "loss": 0.024, + "num_input_tokens_seen": 110221952, + "step": 90565 + }, + { + "epoch": 10.086869361844304, + "grad_norm": 0.04331941157579422, + "learning_rate": 2.8967918937627868e-05, + "loss": 0.0421, + "num_input_tokens_seen": 110227968, + "step": 90570 + }, + { + "epoch": 10.08742621672792, + "grad_norm": 0.4758150279521942, + "learning_rate": 2.8965519980072764e-05, + "loss": 0.0187, + "num_input_tokens_seen": 110233568, + "step": 90575 + }, + { + "epoch": 10.087983071611538, + "grad_norm": 0.39623817801475525, + "learning_rate": 2.8963120985060143e-05, + "loss": 0.0456, + "num_input_tokens_seen": 110239264, + "step": 90580 + }, + { + "epoch": 10.088539926495155, + "grad_norm": 0.007539946585893631, + "learning_rate": 2.8960721952612673e-05, + "loss": 0.1074, + "num_input_tokens_seen": 110245056, + "step": 90585 + }, + { + "epoch": 10.089096781378773, + "grad_norm": 0.2641230523586273, + "learning_rate": 2.8958322882753015e-05, + "loss": 0.0199, + "num_input_tokens_seen": 110250784, + "step": 90590 + }, + { + "epoch": 10.08965363626239, + "grad_norm": 1.0980236530303955, + "learning_rate": 2.8955923775503818e-05, + "loss": 0.0506, + "num_input_tokens_seen": 110256960, + "step": 90595 + }, + { + "epoch": 10.090210491146008, + "grad_norm": 0.631952166557312, + "learning_rate": 2.8953524630887753e-05, + "loss": 0.0443, + "num_input_tokens_seen": 110263296, + "step": 90600 + }, + { + "epoch": 10.090767346029624, + "grad_norm": 0.501375675201416, + "learning_rate": 2.8951125448927485e-05, + "loss": 0.0232, + "num_input_tokens_seen": 110268448, + "step": 90605 + }, + { + "epoch": 10.091324200913242, + "grad_norm": 0.11003098636865616, + "learning_rate": 2.8948726229645662e-05, + "loss": 0.029, + "num_input_tokens_seen": 110274016, + "step": 90610 + }, + { + "epoch": 10.09188105579686, + "grad_norm": 1.9660907983779907, + "learning_rate": 2.894632697306497e-05, + "loss": 0.1742, + "num_input_tokens_seen": 110280032, + "step": 90615 + }, + { + "epoch": 10.092437910680477, + "grad_norm": 0.0007484194939024746, + "learning_rate": 2.8943927679208042e-05, + "loss": 0.0637, + "num_input_tokens_seen": 110286240, + "step": 90620 + }, + { + "epoch": 10.092994765564095, + "grad_norm": 0.03212971240282059, + "learning_rate": 2.894152834809757e-05, + "loss": 0.0011, + "num_input_tokens_seen": 110292544, + "step": 90625 + }, + { + "epoch": 10.09355162044771, + "grad_norm": 0.0026915932539850473, + "learning_rate": 2.89391289797562e-05, + "loss": 0.0051, + "num_input_tokens_seen": 110298720, + "step": 90630 + }, + { + "epoch": 10.094108475331328, + "grad_norm": 0.14151586592197418, + "learning_rate": 2.89367295742066e-05, + "loss": 0.0988, + "num_input_tokens_seen": 110304992, + "step": 90635 + }, + { + "epoch": 10.094665330214946, + "grad_norm": 0.00023402566148433834, + "learning_rate": 2.8934330131471437e-05, + "loss": 0.1255, + "num_input_tokens_seen": 110310976, + "step": 90640 + }, + { + "epoch": 10.095222185098564, + "grad_norm": 0.055063147097826004, + "learning_rate": 2.8931930651573368e-05, + "loss": 0.0872, + "num_input_tokens_seen": 110317120, + "step": 90645 + }, + { + "epoch": 10.095779039982181, + "grad_norm": 0.006375538185238838, + "learning_rate": 2.8929531134535076e-05, + "loss": 0.0077, + "num_input_tokens_seen": 110323328, + "step": 90650 + }, + { + "epoch": 10.096335894865797, + "grad_norm": 1.1372727155685425, + "learning_rate": 2.89271315803792e-05, + "loss": 0.0187, + "num_input_tokens_seen": 110329632, + "step": 90655 + }, + { + "epoch": 10.096892749749415, + "grad_norm": 0.10289443284273148, + "learning_rate": 2.8924731989128436e-05, + "loss": 0.0823, + "num_input_tokens_seen": 110336032, + "step": 90660 + }, + { + "epoch": 10.097449604633033, + "grad_norm": 0.0026524942368268967, + "learning_rate": 2.892233236080542e-05, + "loss": 0.0087, + "num_input_tokens_seen": 110342528, + "step": 90665 + }, + { + "epoch": 10.09800645951665, + "grad_norm": 1.4930092096328735, + "learning_rate": 2.8919932695432832e-05, + "loss": 0.0311, + "num_input_tokens_seen": 110348512, + "step": 90670 + }, + { + "epoch": 10.098563314400268, + "grad_norm": 2.6418020725250244, + "learning_rate": 2.8917532993033353e-05, + "loss": 0.0749, + "num_input_tokens_seen": 110354208, + "step": 90675 + }, + { + "epoch": 10.099120169283884, + "grad_norm": 0.5054805278778076, + "learning_rate": 2.8915133253629624e-05, + "loss": 0.0376, + "num_input_tokens_seen": 110360192, + "step": 90680 + }, + { + "epoch": 10.099677024167502, + "grad_norm": 0.5474571585655212, + "learning_rate": 2.891273347724433e-05, + "loss": 0.0035, + "num_input_tokens_seen": 110366496, + "step": 90685 + }, + { + "epoch": 10.10023387905112, + "grad_norm": 1.0749174356460571, + "learning_rate": 2.891033366390013e-05, + "loss": 0.0647, + "num_input_tokens_seen": 110372288, + "step": 90690 + }, + { + "epoch": 10.100790733934737, + "grad_norm": 0.0015495638363063335, + "learning_rate": 2.890793381361969e-05, + "loss": 0.0487, + "num_input_tokens_seen": 110378304, + "step": 90695 + }, + { + "epoch": 10.101347588818355, + "grad_norm": 0.4321005344390869, + "learning_rate": 2.8905533926425698e-05, + "loss": 0.0211, + "num_input_tokens_seen": 110384512, + "step": 90700 + }, + { + "epoch": 10.10190444370197, + "grad_norm": 0.010301937349140644, + "learning_rate": 2.8903134002340803e-05, + "loss": 0.0425, + "num_input_tokens_seen": 110390880, + "step": 90705 + }, + { + "epoch": 10.102461298585588, + "grad_norm": 1.2195733785629272, + "learning_rate": 2.890073404138768e-05, + "loss": 0.1391, + "num_input_tokens_seen": 110396640, + "step": 90710 + }, + { + "epoch": 10.103018153469206, + "grad_norm": 0.12459763139486313, + "learning_rate": 2.8898334043588997e-05, + "loss": 0.1214, + "num_input_tokens_seen": 110402752, + "step": 90715 + }, + { + "epoch": 10.103575008352824, + "grad_norm": 0.14335979521274567, + "learning_rate": 2.8895934008967428e-05, + "loss": 0.0456, + "num_input_tokens_seen": 110408992, + "step": 90720 + }, + { + "epoch": 10.104131863236441, + "grad_norm": 0.7794531583786011, + "learning_rate": 2.8893533937545635e-05, + "loss": 0.0084, + "num_input_tokens_seen": 110414944, + "step": 90725 + }, + { + "epoch": 10.104688718120057, + "grad_norm": 0.004556640516966581, + "learning_rate": 2.8891133829346302e-05, + "loss": 0.0233, + "num_input_tokens_seen": 110420992, + "step": 90730 + }, + { + "epoch": 10.105245573003675, + "grad_norm": 0.8968685865402222, + "learning_rate": 2.8888733684392095e-05, + "loss": 0.0587, + "num_input_tokens_seen": 110427168, + "step": 90735 + }, + { + "epoch": 10.105802427887292, + "grad_norm": 0.007901025004684925, + "learning_rate": 2.888633350270567e-05, + "loss": 0.0728, + "num_input_tokens_seen": 110433568, + "step": 90740 + }, + { + "epoch": 10.10635928277091, + "grad_norm": 0.040576860308647156, + "learning_rate": 2.888393328430973e-05, + "loss": 0.1186, + "num_input_tokens_seen": 110439744, + "step": 90745 + }, + { + "epoch": 10.106916137654528, + "grad_norm": 0.128811776638031, + "learning_rate": 2.888153302922691e-05, + "loss": 0.0165, + "num_input_tokens_seen": 110446048, + "step": 90750 + }, + { + "epoch": 10.107472992538144, + "grad_norm": 0.004203699994832277, + "learning_rate": 2.887913273747991e-05, + "loss": 0.0054, + "num_input_tokens_seen": 110452160, + "step": 90755 + }, + { + "epoch": 10.108029847421761, + "grad_norm": 0.6163837909698486, + "learning_rate": 2.8876732409091396e-05, + "loss": 0.081, + "num_input_tokens_seen": 110458368, + "step": 90760 + }, + { + "epoch": 10.108586702305379, + "grad_norm": 2.1642024517059326, + "learning_rate": 2.887433204408403e-05, + "loss": 0.1422, + "num_input_tokens_seen": 110464416, + "step": 90765 + }, + { + "epoch": 10.109143557188997, + "grad_norm": 0.46128228306770325, + "learning_rate": 2.8871931642480503e-05, + "loss": 0.074, + "num_input_tokens_seen": 110470624, + "step": 90770 + }, + { + "epoch": 10.109700412072614, + "grad_norm": 1.3861902952194214, + "learning_rate": 2.886953120430347e-05, + "loss": 0.1221, + "num_input_tokens_seen": 110476704, + "step": 90775 + }, + { + "epoch": 10.110257266956232, + "grad_norm": 0.18968915939331055, + "learning_rate": 2.886713072957562e-05, + "loss": 0.0188, + "num_input_tokens_seen": 110482912, + "step": 90780 + }, + { + "epoch": 10.110814121839848, + "grad_norm": 0.07588160037994385, + "learning_rate": 2.886473021831962e-05, + "loss": 0.0042, + "num_input_tokens_seen": 110489472, + "step": 90785 + }, + { + "epoch": 10.111370976723466, + "grad_norm": 0.8851087689399719, + "learning_rate": 2.8862329670558148e-05, + "loss": 0.0596, + "num_input_tokens_seen": 110495552, + "step": 90790 + }, + { + "epoch": 10.111927831607083, + "grad_norm": 0.3326655328273773, + "learning_rate": 2.885992908631388e-05, + "loss": 0.0249, + "num_input_tokens_seen": 110501824, + "step": 90795 + }, + { + "epoch": 10.112484686490701, + "grad_norm": 0.007463650777935982, + "learning_rate": 2.8857528465609484e-05, + "loss": 0.083, + "num_input_tokens_seen": 110507936, + "step": 90800 + }, + { + "epoch": 10.113041541374319, + "grad_norm": 0.3615042269229889, + "learning_rate": 2.8855127808467647e-05, + "loss": 0.0513, + "num_input_tokens_seen": 110514016, + "step": 90805 + }, + { + "epoch": 10.113598396257935, + "grad_norm": 0.19957005977630615, + "learning_rate": 2.8852727114911034e-05, + "loss": 0.0068, + "num_input_tokens_seen": 110520416, + "step": 90810 + }, + { + "epoch": 10.114155251141552, + "grad_norm": 0.018616707995533943, + "learning_rate": 2.8850326384962324e-05, + "loss": 0.0113, + "num_input_tokens_seen": 110526624, + "step": 90815 + }, + { + "epoch": 10.11471210602517, + "grad_norm": 0.02798556350171566, + "learning_rate": 2.8847925618644205e-05, + "loss": 0.0616, + "num_input_tokens_seen": 110532736, + "step": 90820 + }, + { + "epoch": 10.115268960908788, + "grad_norm": 0.5034606456756592, + "learning_rate": 2.8845524815979336e-05, + "loss": 0.0768, + "num_input_tokens_seen": 110539232, + "step": 90825 + }, + { + "epoch": 10.115825815792405, + "grad_norm": 0.22612696886062622, + "learning_rate": 2.8843123976990415e-05, + "loss": 0.0594, + "num_input_tokens_seen": 110545504, + "step": 90830 + }, + { + "epoch": 10.116382670676021, + "grad_norm": 0.15664923191070557, + "learning_rate": 2.88407231017001e-05, + "loss": 0.021, + "num_input_tokens_seen": 110551552, + "step": 90835 + }, + { + "epoch": 10.116939525559639, + "grad_norm": 1.304167628288269, + "learning_rate": 2.8838322190131078e-05, + "loss": 0.1376, + "num_input_tokens_seen": 110556832, + "step": 90840 + }, + { + "epoch": 10.117496380443256, + "grad_norm": 0.1463639885187149, + "learning_rate": 2.8835921242306036e-05, + "loss": 0.0543, + "num_input_tokens_seen": 110563008, + "step": 90845 + }, + { + "epoch": 10.118053235326874, + "grad_norm": 0.16915374994277954, + "learning_rate": 2.8833520258247636e-05, + "loss": 0.0281, + "num_input_tokens_seen": 110569088, + "step": 90850 + }, + { + "epoch": 10.118610090210492, + "grad_norm": 0.1752793788909912, + "learning_rate": 2.8831119237978577e-05, + "loss": 0.0219, + "num_input_tokens_seen": 110575328, + "step": 90855 + }, + { + "epoch": 10.119166945094108, + "grad_norm": 0.05674353986978531, + "learning_rate": 2.882871818152152e-05, + "loss": 0.0237, + "num_input_tokens_seen": 110581856, + "step": 90860 + }, + { + "epoch": 10.119723799977725, + "grad_norm": 0.31245478987693787, + "learning_rate": 2.8826317088899152e-05, + "loss": 0.0186, + "num_input_tokens_seen": 110587872, + "step": 90865 + }, + { + "epoch": 10.120280654861343, + "grad_norm": 0.0016111478907987475, + "learning_rate": 2.882391596013415e-05, + "loss": 0.0444, + "num_input_tokens_seen": 110593984, + "step": 90870 + }, + { + "epoch": 10.12083750974496, + "grad_norm": 0.037683699280023575, + "learning_rate": 2.88215147952492e-05, + "loss": 0.065, + "num_input_tokens_seen": 110600064, + "step": 90875 + }, + { + "epoch": 10.121394364628578, + "grad_norm": 0.5551219582557678, + "learning_rate": 2.8819113594266988e-05, + "loss": 0.0456, + "num_input_tokens_seen": 110606432, + "step": 90880 + }, + { + "epoch": 10.121951219512194, + "grad_norm": 0.39368611574172974, + "learning_rate": 2.881671235721018e-05, + "loss": 0.0566, + "num_input_tokens_seen": 110612320, + "step": 90885 + }, + { + "epoch": 10.122508074395812, + "grad_norm": 0.013904497027397156, + "learning_rate": 2.8814311084101474e-05, + "loss": 0.0149, + "num_input_tokens_seen": 110618304, + "step": 90890 + }, + { + "epoch": 10.12306492927943, + "grad_norm": 0.0016157504869624972, + "learning_rate": 2.8811909774963534e-05, + "loss": 0.057, + "num_input_tokens_seen": 110624672, + "step": 90895 + }, + { + "epoch": 10.123621784163047, + "grad_norm": 0.007400983944535255, + "learning_rate": 2.880950842981906e-05, + "loss": 0.0217, + "num_input_tokens_seen": 110630816, + "step": 90900 + }, + { + "epoch": 10.124178639046665, + "grad_norm": 0.04632476344704628, + "learning_rate": 2.8807107048690723e-05, + "loss": 0.0564, + "num_input_tokens_seen": 110636736, + "step": 90905 + }, + { + "epoch": 10.124735493930281, + "grad_norm": 0.2223532795906067, + "learning_rate": 2.880470563160121e-05, + "loss": 0.0432, + "num_input_tokens_seen": 110642976, + "step": 90910 + }, + { + "epoch": 10.125292348813899, + "grad_norm": 1.4888356924057007, + "learning_rate": 2.88023041785732e-05, + "loss": 0.0496, + "num_input_tokens_seen": 110648864, + "step": 90915 + }, + { + "epoch": 10.125849203697516, + "grad_norm": 1.3296363353729248, + "learning_rate": 2.8799902689629388e-05, + "loss": 0.0412, + "num_input_tokens_seen": 110655360, + "step": 90920 + }, + { + "epoch": 10.126406058581134, + "grad_norm": 0.7093852758407593, + "learning_rate": 2.879750116479245e-05, + "loss": 0.0653, + "num_input_tokens_seen": 110661248, + "step": 90925 + }, + { + "epoch": 10.126962913464752, + "grad_norm": 0.007126636803150177, + "learning_rate": 2.879509960408507e-05, + "loss": 0.0635, + "num_input_tokens_seen": 110667456, + "step": 90930 + }, + { + "epoch": 10.127519768348368, + "grad_norm": 0.0008165432373061776, + "learning_rate": 2.8792698007529934e-05, + "loss": 0.0592, + "num_input_tokens_seen": 110673792, + "step": 90935 + }, + { + "epoch": 10.128076623231985, + "grad_norm": 1.1862106323242188, + "learning_rate": 2.8790296375149724e-05, + "loss": 0.0623, + "num_input_tokens_seen": 110679744, + "step": 90940 + }, + { + "epoch": 10.128633478115603, + "grad_norm": 2.2168147563934326, + "learning_rate": 2.8787894706967128e-05, + "loss": 0.0487, + "num_input_tokens_seen": 110685664, + "step": 90945 + }, + { + "epoch": 10.12919033299922, + "grad_norm": 0.1860790103673935, + "learning_rate": 2.878549300300483e-05, + "loss": 0.114, + "num_input_tokens_seen": 110691712, + "step": 90950 + }, + { + "epoch": 10.129747187882838, + "grad_norm": 0.40579211711883545, + "learning_rate": 2.8783091263285522e-05, + "loss": 0.103, + "num_input_tokens_seen": 110697792, + "step": 90955 + }, + { + "epoch": 10.130304042766456, + "grad_norm": 0.02464175783097744, + "learning_rate": 2.8780689487831892e-05, + "loss": 0.0667, + "num_input_tokens_seen": 110704000, + "step": 90960 + }, + { + "epoch": 10.130860897650072, + "grad_norm": 0.003029440063983202, + "learning_rate": 2.8778287676666608e-05, + "loss": 0.0205, + "num_input_tokens_seen": 110709760, + "step": 90965 + }, + { + "epoch": 10.13141775253369, + "grad_norm": 0.03374866768717766, + "learning_rate": 2.877588582981237e-05, + "loss": 0.0065, + "num_input_tokens_seen": 110716128, + "step": 90970 + }, + { + "epoch": 10.131974607417307, + "grad_norm": 1.6565072536468506, + "learning_rate": 2.8773483947291875e-05, + "loss": 0.0327, + "num_input_tokens_seen": 110722592, + "step": 90975 + }, + { + "epoch": 10.132531462300925, + "grad_norm": 0.14819388091564178, + "learning_rate": 2.8771082029127793e-05, + "loss": 0.1209, + "num_input_tokens_seen": 110727808, + "step": 90980 + }, + { + "epoch": 10.133088317184543, + "grad_norm": 0.7938374876976013, + "learning_rate": 2.876868007534283e-05, + "loss": 0.0718, + "num_input_tokens_seen": 110734016, + "step": 90985 + }, + { + "epoch": 10.133645172068158, + "grad_norm": 0.8612012267112732, + "learning_rate": 2.8766278085959654e-05, + "loss": 0.0718, + "num_input_tokens_seen": 110740064, + "step": 90990 + }, + { + "epoch": 10.134202026951776, + "grad_norm": 0.0426490344107151, + "learning_rate": 2.876387606100097e-05, + "loss": 0.111, + "num_input_tokens_seen": 110745952, + "step": 90995 + }, + { + "epoch": 10.134758881835394, + "grad_norm": 0.05646542087197304, + "learning_rate": 2.8761474000489458e-05, + "loss": 0.0019, + "num_input_tokens_seen": 110752128, + "step": 91000 + }, + { + "epoch": 10.135315736719011, + "grad_norm": 0.01638360135257244, + "learning_rate": 2.8759071904447803e-05, + "loss": 0.0823, + "num_input_tokens_seen": 110758304, + "step": 91005 + }, + { + "epoch": 10.135872591602629, + "grad_norm": 0.18592973053455353, + "learning_rate": 2.8756669772898713e-05, + "loss": 0.0555, + "num_input_tokens_seen": 110764608, + "step": 91010 + }, + { + "epoch": 10.136429446486245, + "grad_norm": 0.14682869613170624, + "learning_rate": 2.8754267605864866e-05, + "loss": 0.0051, + "num_input_tokens_seen": 110770496, + "step": 91015 + }, + { + "epoch": 10.136986301369863, + "grad_norm": 1.0949865579605103, + "learning_rate": 2.8751865403368954e-05, + "loss": 0.0559, + "num_input_tokens_seen": 110776544, + "step": 91020 + }, + { + "epoch": 10.13754315625348, + "grad_norm": 0.9032002091407776, + "learning_rate": 2.8749463165433665e-05, + "loss": 0.0282, + "num_input_tokens_seen": 110782432, + "step": 91025 + }, + { + "epoch": 10.138100011137098, + "grad_norm": 2.0611681938171387, + "learning_rate": 2.8747060892081695e-05, + "loss": 0.103, + "num_input_tokens_seen": 110788640, + "step": 91030 + }, + { + "epoch": 10.138656866020716, + "grad_norm": 0.2960342764854431, + "learning_rate": 2.8744658583335725e-05, + "loss": 0.0039, + "num_input_tokens_seen": 110794304, + "step": 91035 + }, + { + "epoch": 10.139213720904332, + "grad_norm": 0.6293990015983582, + "learning_rate": 2.8742256239218456e-05, + "loss": 0.0744, + "num_input_tokens_seen": 110800448, + "step": 91040 + }, + { + "epoch": 10.13977057578795, + "grad_norm": 0.7759097218513489, + "learning_rate": 2.873985385975259e-05, + "loss": 0.0192, + "num_input_tokens_seen": 110806688, + "step": 91045 + }, + { + "epoch": 10.140327430671567, + "grad_norm": 0.9739901423454285, + "learning_rate": 2.8737451444960793e-05, + "loss": 0.073, + "num_input_tokens_seen": 110812768, + "step": 91050 + }, + { + "epoch": 10.140884285555185, + "grad_norm": 0.7465595602989197, + "learning_rate": 2.8735048994865787e-05, + "loss": 0.0359, + "num_input_tokens_seen": 110819072, + "step": 91055 + }, + { + "epoch": 10.141441140438802, + "grad_norm": 0.012309868820011616, + "learning_rate": 2.8732646509490242e-05, + "loss": 0.0567, + "num_input_tokens_seen": 110825152, + "step": 91060 + }, + { + "epoch": 10.141997995322418, + "grad_norm": 0.39385583996772766, + "learning_rate": 2.873024398885686e-05, + "loss": 0.1395, + "num_input_tokens_seen": 110830944, + "step": 91065 + }, + { + "epoch": 10.142554850206036, + "grad_norm": 0.40950724482536316, + "learning_rate": 2.872784143298834e-05, + "loss": 0.0114, + "num_input_tokens_seen": 110836992, + "step": 91070 + }, + { + "epoch": 10.143111705089654, + "grad_norm": 0.8945701122283936, + "learning_rate": 2.8725438841907366e-05, + "loss": 0.1297, + "num_input_tokens_seen": 110843200, + "step": 91075 + }, + { + "epoch": 10.143668559973271, + "grad_norm": 0.37172043323516846, + "learning_rate": 2.872303621563664e-05, + "loss": 0.0396, + "num_input_tokens_seen": 110848896, + "step": 91080 + }, + { + "epoch": 10.144225414856889, + "grad_norm": 3.8267550468444824, + "learning_rate": 2.872063355419885e-05, + "loss": 0.0708, + "num_input_tokens_seen": 110854976, + "step": 91085 + }, + { + "epoch": 10.144782269740505, + "grad_norm": 0.013150733895599842, + "learning_rate": 2.8718230857616703e-05, + "loss": 0.0408, + "num_input_tokens_seen": 110860960, + "step": 91090 + }, + { + "epoch": 10.145339124624122, + "grad_norm": 0.25026756525039673, + "learning_rate": 2.871582812591288e-05, + "loss": 0.0143, + "num_input_tokens_seen": 110867136, + "step": 91095 + }, + { + "epoch": 10.14589597950774, + "grad_norm": 4.129035949707031, + "learning_rate": 2.8713425359110084e-05, + "loss": 0.1176, + "num_input_tokens_seen": 110873248, + "step": 91100 + }, + { + "epoch": 10.146452834391358, + "grad_norm": 1.1227619647979736, + "learning_rate": 2.8711022557231016e-05, + "loss": 0.1291, + "num_input_tokens_seen": 110879456, + "step": 91105 + }, + { + "epoch": 10.147009689274975, + "grad_norm": 0.30400916934013367, + "learning_rate": 2.8708619720298357e-05, + "loss": 0.0322, + "num_input_tokens_seen": 110885632, + "step": 91110 + }, + { + "epoch": 10.147566544158593, + "grad_norm": 0.11440841853618622, + "learning_rate": 2.870621684833482e-05, + "loss": 0.1485, + "num_input_tokens_seen": 110891328, + "step": 91115 + }, + { + "epoch": 10.148123399042209, + "grad_norm": 0.08819035440683365, + "learning_rate": 2.8703813941363093e-05, + "loss": 0.0101, + "num_input_tokens_seen": 110897152, + "step": 91120 + }, + { + "epoch": 10.148680253925827, + "grad_norm": 0.17114955186843872, + "learning_rate": 2.870141099940588e-05, + "loss": 0.0354, + "num_input_tokens_seen": 110903360, + "step": 91125 + }, + { + "epoch": 10.149237108809444, + "grad_norm": 0.3007355332374573, + "learning_rate": 2.8699008022485875e-05, + "loss": 0.073, + "num_input_tokens_seen": 110909376, + "step": 91130 + }, + { + "epoch": 10.149793963693062, + "grad_norm": 0.055399347096681595, + "learning_rate": 2.8696605010625767e-05, + "loss": 0.0281, + "num_input_tokens_seen": 110915456, + "step": 91135 + }, + { + "epoch": 10.15035081857668, + "grad_norm": 0.10414067655801773, + "learning_rate": 2.869420196384827e-05, + "loss": 0.0115, + "num_input_tokens_seen": 110921344, + "step": 91140 + }, + { + "epoch": 10.150907673460296, + "grad_norm": 0.7228611707687378, + "learning_rate": 2.8691798882176073e-05, + "loss": 0.0171, + "num_input_tokens_seen": 110927520, + "step": 91145 + }, + { + "epoch": 10.151464528343913, + "grad_norm": 1.0779058933258057, + "learning_rate": 2.868939576563188e-05, + "loss": 0.0365, + "num_input_tokens_seen": 110933568, + "step": 91150 + }, + { + "epoch": 10.152021383227531, + "grad_norm": 1.302973747253418, + "learning_rate": 2.868699261423839e-05, + "loss": 0.0952, + "num_input_tokens_seen": 110939616, + "step": 91155 + }, + { + "epoch": 10.152578238111149, + "grad_norm": 1.0846186876296997, + "learning_rate": 2.8684589428018298e-05, + "loss": 0.1279, + "num_input_tokens_seen": 110945696, + "step": 91160 + }, + { + "epoch": 10.153135092994766, + "grad_norm": 0.0003504948690533638, + "learning_rate": 2.8682186206994306e-05, + "loss": 0.0776, + "num_input_tokens_seen": 110951648, + "step": 91165 + }, + { + "epoch": 10.153691947878382, + "grad_norm": 0.01713210716843605, + "learning_rate": 2.8679782951189116e-05, + "loss": 0.0718, + "num_input_tokens_seen": 110957920, + "step": 91170 + }, + { + "epoch": 10.154248802762, + "grad_norm": 0.008674596436321735, + "learning_rate": 2.867737966062543e-05, + "loss": 0.0496, + "num_input_tokens_seen": 110964288, + "step": 91175 + }, + { + "epoch": 10.154805657645618, + "grad_norm": 0.0011325225932523608, + "learning_rate": 2.867497633532594e-05, + "loss": 0.0239, + "num_input_tokens_seen": 110970560, + "step": 91180 + }, + { + "epoch": 10.155362512529235, + "grad_norm": 0.00011177943088114262, + "learning_rate": 2.867257297531336e-05, + "loss": 0.024, + "num_input_tokens_seen": 110976704, + "step": 91185 + }, + { + "epoch": 10.155919367412853, + "grad_norm": 0.018505994230508804, + "learning_rate": 2.867016958061039e-05, + "loss": 0.0279, + "num_input_tokens_seen": 110982560, + "step": 91190 + }, + { + "epoch": 10.156476222296469, + "grad_norm": 0.3092496991157532, + "learning_rate": 2.8667766151239715e-05, + "loss": 0.115, + "num_input_tokens_seen": 110987968, + "step": 91195 + }, + { + "epoch": 10.157033077180087, + "grad_norm": 0.03438702970743179, + "learning_rate": 2.8665362687224062e-05, + "loss": 0.0388, + "num_input_tokens_seen": 110994048, + "step": 91200 + }, + { + "epoch": 10.157589932063704, + "grad_norm": 0.04394587129354477, + "learning_rate": 2.8662959188586113e-05, + "loss": 0.0297, + "num_input_tokens_seen": 111000352, + "step": 91205 + }, + { + "epoch": 10.158146786947322, + "grad_norm": 0.00012653709563892335, + "learning_rate": 2.8660555655348593e-05, + "loss": 0.0243, + "num_input_tokens_seen": 111006432, + "step": 91210 + }, + { + "epoch": 10.15870364183094, + "grad_norm": 1.0724900960922241, + "learning_rate": 2.865815208753418e-05, + "loss": 0.0638, + "num_input_tokens_seen": 111012480, + "step": 91215 + }, + { + "epoch": 10.159260496714555, + "grad_norm": 0.08872310817241669, + "learning_rate": 2.865574848516559e-05, + "loss": 0.0235, + "num_input_tokens_seen": 111018464, + "step": 91220 + }, + { + "epoch": 10.159817351598173, + "grad_norm": 0.00010893840953940526, + "learning_rate": 2.865334484826553e-05, + "loss": 0.047, + "num_input_tokens_seen": 111024576, + "step": 91225 + }, + { + "epoch": 10.16037420648179, + "grad_norm": 0.8551454544067383, + "learning_rate": 2.86509411768567e-05, + "loss": 0.0751, + "num_input_tokens_seen": 111030912, + "step": 91230 + }, + { + "epoch": 10.160931061365408, + "grad_norm": 0.0009643540252000093, + "learning_rate": 2.8648537470961808e-05, + "loss": 0.0133, + "num_input_tokens_seen": 111036992, + "step": 91235 + }, + { + "epoch": 10.161487916249026, + "grad_norm": 0.005854233633726835, + "learning_rate": 2.8646133730603553e-05, + "loss": 0.0287, + "num_input_tokens_seen": 111042976, + "step": 91240 + }, + { + "epoch": 10.162044771132642, + "grad_norm": 0.3029846251010895, + "learning_rate": 2.8643729955804637e-05, + "loss": 0.0223, + "num_input_tokens_seen": 111048512, + "step": 91245 + }, + { + "epoch": 10.16260162601626, + "grad_norm": 1.065689206123352, + "learning_rate": 2.8641326146587787e-05, + "loss": 0.2028, + "num_input_tokens_seen": 111054240, + "step": 91250 + }, + { + "epoch": 10.163158480899877, + "grad_norm": 0.34527653455734253, + "learning_rate": 2.8638922302975684e-05, + "loss": 0.0211, + "num_input_tokens_seen": 111060864, + "step": 91255 + }, + { + "epoch": 10.163715335783495, + "grad_norm": 0.30411067605018616, + "learning_rate": 2.8636518424991048e-05, + "loss": 0.1106, + "num_input_tokens_seen": 111067232, + "step": 91260 + }, + { + "epoch": 10.164272190667113, + "grad_norm": 1.881687045097351, + "learning_rate": 2.8634114512656584e-05, + "loss": 0.0453, + "num_input_tokens_seen": 111072672, + "step": 91265 + }, + { + "epoch": 10.164829045550729, + "grad_norm": 0.09053368866443634, + "learning_rate": 2.8631710565994994e-05, + "loss": 0.0016, + "num_input_tokens_seen": 111078848, + "step": 91270 + }, + { + "epoch": 10.165385900434346, + "grad_norm": 0.008035166189074516, + "learning_rate": 2.8629306585028987e-05, + "loss": 0.0433, + "num_input_tokens_seen": 111085120, + "step": 91275 + }, + { + "epoch": 10.165942755317964, + "grad_norm": 0.057009629905223846, + "learning_rate": 2.8626902569781273e-05, + "loss": 0.0537, + "num_input_tokens_seen": 111091552, + "step": 91280 + }, + { + "epoch": 10.166499610201582, + "grad_norm": 0.07768198102712631, + "learning_rate": 2.8624498520274556e-05, + "loss": 0.0259, + "num_input_tokens_seen": 111097600, + "step": 91285 + }, + { + "epoch": 10.1670564650852, + "grad_norm": 1.6981441974639893, + "learning_rate": 2.862209443653155e-05, + "loss": 0.0334, + "num_input_tokens_seen": 111103936, + "step": 91290 + }, + { + "epoch": 10.167613319968815, + "grad_norm": 0.733269453048706, + "learning_rate": 2.861969031857496e-05, + "loss": 0.0331, + "num_input_tokens_seen": 111110112, + "step": 91295 + }, + { + "epoch": 10.168170174852433, + "grad_norm": 0.10309189558029175, + "learning_rate": 2.8617286166427493e-05, + "loss": 0.0023, + "num_input_tokens_seen": 111115872, + "step": 91300 + }, + { + "epoch": 10.16872702973605, + "grad_norm": 0.09909731894731522, + "learning_rate": 2.8614881980111863e-05, + "loss": 0.0211, + "num_input_tokens_seen": 111122048, + "step": 91305 + }, + { + "epoch": 10.169283884619668, + "grad_norm": 0.9500758647918701, + "learning_rate": 2.8612477759650773e-05, + "loss": 0.0778, + "num_input_tokens_seen": 111128160, + "step": 91310 + }, + { + "epoch": 10.169840739503286, + "grad_norm": 0.016118820756673813, + "learning_rate": 2.8610073505066937e-05, + "loss": 0.0203, + "num_input_tokens_seen": 111134016, + "step": 91315 + }, + { + "epoch": 10.170397594386904, + "grad_norm": 0.5820413827896118, + "learning_rate": 2.8607669216383066e-05, + "loss": 0.0127, + "num_input_tokens_seen": 111139808, + "step": 91320 + }, + { + "epoch": 10.17095444927052, + "grad_norm": 0.06068648025393486, + "learning_rate": 2.8605264893621864e-05, + "loss": 0.0018, + "num_input_tokens_seen": 111146144, + "step": 91325 + }, + { + "epoch": 10.171511304154137, + "grad_norm": 0.5550402402877808, + "learning_rate": 2.8602860536806054e-05, + "loss": 0.0131, + "num_input_tokens_seen": 111152544, + "step": 91330 + }, + { + "epoch": 10.172068159037755, + "grad_norm": 0.12013895064592361, + "learning_rate": 2.8600456145958337e-05, + "loss": 0.0111, + "num_input_tokens_seen": 111158656, + "step": 91335 + }, + { + "epoch": 10.172625013921373, + "grad_norm": 0.10402604192495346, + "learning_rate": 2.8598051721101427e-05, + "loss": 0.1062, + "num_input_tokens_seen": 111164480, + "step": 91340 + }, + { + "epoch": 10.17318186880499, + "grad_norm": 0.4297538995742798, + "learning_rate": 2.8595647262258036e-05, + "loss": 0.0151, + "num_input_tokens_seen": 111170368, + "step": 91345 + }, + { + "epoch": 10.173738723688606, + "grad_norm": 0.028366003185510635, + "learning_rate": 2.8593242769450866e-05, + "loss": 0.0138, + "num_input_tokens_seen": 111176480, + "step": 91350 + }, + { + "epoch": 10.174295578572224, + "grad_norm": 0.23966702818870544, + "learning_rate": 2.8590838242702656e-05, + "loss": 0.0185, + "num_input_tokens_seen": 111182624, + "step": 91355 + }, + { + "epoch": 10.174852433455841, + "grad_norm": 1.2530896663665771, + "learning_rate": 2.8588433682036092e-05, + "loss": 0.0327, + "num_input_tokens_seen": 111188576, + "step": 91360 + }, + { + "epoch": 10.17540928833946, + "grad_norm": 0.003595358459278941, + "learning_rate": 2.8586029087473902e-05, + "loss": 0.1332, + "num_input_tokens_seen": 111194592, + "step": 91365 + }, + { + "epoch": 10.175966143223077, + "grad_norm": 0.00042499095434322953, + "learning_rate": 2.8583624459038787e-05, + "loss": 0.0427, + "num_input_tokens_seen": 111200832, + "step": 91370 + }, + { + "epoch": 10.176522998106693, + "grad_norm": 0.9058846831321716, + "learning_rate": 2.8581219796753473e-05, + "loss": 0.065, + "num_input_tokens_seen": 111206912, + "step": 91375 + }, + { + "epoch": 10.17707985299031, + "grad_norm": 1.3935108184814453, + "learning_rate": 2.8578815100640666e-05, + "loss": 0.1474, + "num_input_tokens_seen": 111213248, + "step": 91380 + }, + { + "epoch": 10.177636707873928, + "grad_norm": 0.8073003888130188, + "learning_rate": 2.8576410370723082e-05, + "loss": 0.0692, + "num_input_tokens_seen": 111219392, + "step": 91385 + }, + { + "epoch": 10.178193562757546, + "grad_norm": 1.1926164627075195, + "learning_rate": 2.8574005607023446e-05, + "loss": 0.0196, + "num_input_tokens_seen": 111225760, + "step": 91390 + }, + { + "epoch": 10.178750417641163, + "grad_norm": 0.8617776036262512, + "learning_rate": 2.8571600809564454e-05, + "loss": 0.0503, + "num_input_tokens_seen": 111231840, + "step": 91395 + }, + { + "epoch": 10.17930727252478, + "grad_norm": 0.005060552153736353, + "learning_rate": 2.8569195978368835e-05, + "loss": 0.0854, + "num_input_tokens_seen": 111238112, + "step": 91400 + }, + { + "epoch": 10.179864127408397, + "grad_norm": 0.7255014777183533, + "learning_rate": 2.8566791113459295e-05, + "loss": 0.0389, + "num_input_tokens_seen": 111244192, + "step": 91405 + }, + { + "epoch": 10.180420982292015, + "grad_norm": 0.027036933228373528, + "learning_rate": 2.8564386214858563e-05, + "loss": 0.0766, + "num_input_tokens_seen": 111250272, + "step": 91410 + }, + { + "epoch": 10.180977837175632, + "grad_norm": 0.06277791410684586, + "learning_rate": 2.8561981282589344e-05, + "loss": 0.0791, + "num_input_tokens_seen": 111256480, + "step": 91415 + }, + { + "epoch": 10.18153469205925, + "grad_norm": 0.6364471912384033, + "learning_rate": 2.855957631667435e-05, + "loss": 0.1289, + "num_input_tokens_seen": 111262272, + "step": 91420 + }, + { + "epoch": 10.182091546942866, + "grad_norm": 0.004353055730462074, + "learning_rate": 2.855717131713632e-05, + "loss": 0.034, + "num_input_tokens_seen": 111268640, + "step": 91425 + }, + { + "epoch": 10.182648401826484, + "grad_norm": 0.038792580366134644, + "learning_rate": 2.8554766283997953e-05, + "loss": 0.0741, + "num_input_tokens_seen": 111274656, + "step": 91430 + }, + { + "epoch": 10.183205256710101, + "grad_norm": 0.00043728871969506145, + "learning_rate": 2.855236121728197e-05, + "loss": 0.0001, + "num_input_tokens_seen": 111280736, + "step": 91435 + }, + { + "epoch": 10.183762111593719, + "grad_norm": 0.2524789571762085, + "learning_rate": 2.8549956117011085e-05, + "loss": 0.0089, + "num_input_tokens_seen": 111286976, + "step": 91440 + }, + { + "epoch": 10.184318966477337, + "grad_norm": 0.0004973618779331446, + "learning_rate": 2.8547550983208016e-05, + "loss": 0.0061, + "num_input_tokens_seen": 111293376, + "step": 91445 + }, + { + "epoch": 10.184875821360952, + "grad_norm": 1.1975973844528198, + "learning_rate": 2.8545145815895496e-05, + "loss": 0.0384, + "num_input_tokens_seen": 111299392, + "step": 91450 + }, + { + "epoch": 10.18543267624457, + "grad_norm": 0.09384099394083023, + "learning_rate": 2.854274061509623e-05, + "loss": 0.0109, + "num_input_tokens_seen": 111305824, + "step": 91455 + }, + { + "epoch": 10.185989531128188, + "grad_norm": 0.35996130108833313, + "learning_rate": 2.8540335380832943e-05, + "loss": 0.0295, + "num_input_tokens_seen": 111312064, + "step": 91460 + }, + { + "epoch": 10.186546386011806, + "grad_norm": 1.3709893226623535, + "learning_rate": 2.8537930113128347e-05, + "loss": 0.1148, + "num_input_tokens_seen": 111318048, + "step": 91465 + }, + { + "epoch": 10.187103240895423, + "grad_norm": 1.1454640626907349, + "learning_rate": 2.8535524812005164e-05, + "loss": 0.0771, + "num_input_tokens_seen": 111324160, + "step": 91470 + }, + { + "epoch": 10.18766009577904, + "grad_norm": 0.0007932299631647766, + "learning_rate": 2.8533119477486125e-05, + "loss": 0.0597, + "num_input_tokens_seen": 111329920, + "step": 91475 + }, + { + "epoch": 10.188216950662657, + "grad_norm": 0.002311941934749484, + "learning_rate": 2.8530714109593937e-05, + "loss": 0.0093, + "num_input_tokens_seen": 111336064, + "step": 91480 + }, + { + "epoch": 10.188773805546274, + "grad_norm": 0.01509399339556694, + "learning_rate": 2.852830870835133e-05, + "loss": 0.0202, + "num_input_tokens_seen": 111342240, + "step": 91485 + }, + { + "epoch": 10.189330660429892, + "grad_norm": 0.0003172459255438298, + "learning_rate": 2.8525903273781014e-05, + "loss": 0.058, + "num_input_tokens_seen": 111348352, + "step": 91490 + }, + { + "epoch": 10.18988751531351, + "grad_norm": 0.03376547247171402, + "learning_rate": 2.8523497805905724e-05, + "loss": 0.0468, + "num_input_tokens_seen": 111354528, + "step": 91495 + }, + { + "epoch": 10.190444370197127, + "grad_norm": 0.5699974894523621, + "learning_rate": 2.8521092304748165e-05, + "loss": 0.0318, + "num_input_tokens_seen": 111360416, + "step": 91500 + }, + { + "epoch": 10.191001225080743, + "grad_norm": 4.063316345214844, + "learning_rate": 2.851868677033107e-05, + "loss": 0.0841, + "num_input_tokens_seen": 111366880, + "step": 91505 + }, + { + "epoch": 10.191558079964361, + "grad_norm": 0.6103674173355103, + "learning_rate": 2.8516281202677164e-05, + "loss": 0.0259, + "num_input_tokens_seen": 111373024, + "step": 91510 + }, + { + "epoch": 10.192114934847979, + "grad_norm": 0.04722949489951134, + "learning_rate": 2.8513875601809164e-05, + "loss": 0.0178, + "num_input_tokens_seen": 111379328, + "step": 91515 + }, + { + "epoch": 10.192671789731596, + "grad_norm": 0.05272765830159187, + "learning_rate": 2.8511469967749794e-05, + "loss": 0.0042, + "num_input_tokens_seen": 111385792, + "step": 91520 + }, + { + "epoch": 10.193228644615214, + "grad_norm": 0.07919107377529144, + "learning_rate": 2.8509064300521777e-05, + "loss": 0.0154, + "num_input_tokens_seen": 111391904, + "step": 91525 + }, + { + "epoch": 10.19378549949883, + "grad_norm": 0.0577910915017128, + "learning_rate": 2.8506658600147835e-05, + "loss": 0.0423, + "num_input_tokens_seen": 111398080, + "step": 91530 + }, + { + "epoch": 10.194342354382448, + "grad_norm": 1.1120319366455078, + "learning_rate": 2.8504252866650694e-05, + "loss": 0.1621, + "num_input_tokens_seen": 111403648, + "step": 91535 + }, + { + "epoch": 10.194899209266065, + "grad_norm": 0.1987336426973343, + "learning_rate": 2.850184710005307e-05, + "loss": 0.0444, + "num_input_tokens_seen": 111409664, + "step": 91540 + }, + { + "epoch": 10.195456064149683, + "grad_norm": 0.0002165569894714281, + "learning_rate": 2.8499441300377706e-05, + "loss": 0.0781, + "num_input_tokens_seen": 111415808, + "step": 91545 + }, + { + "epoch": 10.1960129190333, + "grad_norm": 0.5309587717056274, + "learning_rate": 2.8497035467647304e-05, + "loss": 0.0449, + "num_input_tokens_seen": 111421984, + "step": 91550 + }, + { + "epoch": 10.196569773916917, + "grad_norm": 0.0015406819293275476, + "learning_rate": 2.849462960188461e-05, + "loss": 0.1096, + "num_input_tokens_seen": 111428352, + "step": 91555 + }, + { + "epoch": 10.197126628800534, + "grad_norm": 0.0005926384474150836, + "learning_rate": 2.849222370311233e-05, + "loss": 0.0597, + "num_input_tokens_seen": 111434656, + "step": 91560 + }, + { + "epoch": 10.197683483684152, + "grad_norm": 1.502699613571167, + "learning_rate": 2.8489817771353206e-05, + "loss": 0.0493, + "num_input_tokens_seen": 111440640, + "step": 91565 + }, + { + "epoch": 10.19824033856777, + "grad_norm": 1.5337313413619995, + "learning_rate": 2.848741180662996e-05, + "loss": 0.0717, + "num_input_tokens_seen": 111446656, + "step": 91570 + }, + { + "epoch": 10.198797193451387, + "grad_norm": 0.2157393842935562, + "learning_rate": 2.848500580896531e-05, + "loss": 0.0215, + "num_input_tokens_seen": 111452608, + "step": 91575 + }, + { + "epoch": 10.199354048335003, + "grad_norm": 0.15180440247058868, + "learning_rate": 2.8482599778381995e-05, + "loss": 0.0443, + "num_input_tokens_seen": 111459008, + "step": 91580 + }, + { + "epoch": 10.19991090321862, + "grad_norm": 0.04134388640522957, + "learning_rate": 2.8480193714902726e-05, + "loss": 0.0784, + "num_input_tokens_seen": 111464928, + "step": 91585 + }, + { + "epoch": 10.200467758102238, + "grad_norm": 0.7489548921585083, + "learning_rate": 2.847778761855024e-05, + "loss": 0.1398, + "num_input_tokens_seen": 111471168, + "step": 91590 + }, + { + "epoch": 10.201024612985856, + "grad_norm": 0.3075897693634033, + "learning_rate": 2.8475381489347268e-05, + "loss": 0.0298, + "num_input_tokens_seen": 111477152, + "step": 91595 + }, + { + "epoch": 10.201581467869474, + "grad_norm": 0.016021251678466797, + "learning_rate": 2.847297532731653e-05, + "loss": 0.0058, + "num_input_tokens_seen": 111482880, + "step": 91600 + }, + { + "epoch": 10.20213832275309, + "grad_norm": 0.03421509638428688, + "learning_rate": 2.847056913248076e-05, + "loss": 0.0908, + "num_input_tokens_seen": 111488864, + "step": 91605 + }, + { + "epoch": 10.202695177636707, + "grad_norm": 0.014891136437654495, + "learning_rate": 2.8468162904862684e-05, + "loss": 0.0786, + "num_input_tokens_seen": 111494944, + "step": 91610 + }, + { + "epoch": 10.203252032520325, + "grad_norm": 0.00019511807477101684, + "learning_rate": 2.8465756644485032e-05, + "loss": 0.1547, + "num_input_tokens_seen": 111501120, + "step": 91615 + }, + { + "epoch": 10.203808887403943, + "grad_norm": 0.0077295368537306786, + "learning_rate": 2.8463350351370526e-05, + "loss": 0.0158, + "num_input_tokens_seen": 111507328, + "step": 91620 + }, + { + "epoch": 10.20436574228756, + "grad_norm": 0.9068087935447693, + "learning_rate": 2.8460944025541903e-05, + "loss": 0.1088, + "num_input_tokens_seen": 111513120, + "step": 91625 + }, + { + "epoch": 10.204922597171176, + "grad_norm": 2.6367266178131104, + "learning_rate": 2.8458537667021895e-05, + "loss": 0.0524, + "num_input_tokens_seen": 111519296, + "step": 91630 + }, + { + "epoch": 10.205479452054794, + "grad_norm": 0.09261908382177353, + "learning_rate": 2.845613127583322e-05, + "loss": 0.0441, + "num_input_tokens_seen": 111525056, + "step": 91635 + }, + { + "epoch": 10.206036306938412, + "grad_norm": 0.03398897871375084, + "learning_rate": 2.8453724851998624e-05, + "loss": 0.0108, + "num_input_tokens_seen": 111531072, + "step": 91640 + }, + { + "epoch": 10.20659316182203, + "grad_norm": 0.044038545340299606, + "learning_rate": 2.8451318395540828e-05, + "loss": 0.0607, + "num_input_tokens_seen": 111537120, + "step": 91645 + }, + { + "epoch": 10.207150016705647, + "grad_norm": 1.4374810457229614, + "learning_rate": 2.8448911906482563e-05, + "loss": 0.1306, + "num_input_tokens_seen": 111542560, + "step": 91650 + }, + { + "epoch": 10.207706871589265, + "grad_norm": 0.009603804908692837, + "learning_rate": 2.844650538484656e-05, + "loss": 0.0338, + "num_input_tokens_seen": 111548672, + "step": 91655 + }, + { + "epoch": 10.20826372647288, + "grad_norm": 0.046809807419776917, + "learning_rate": 2.8444098830655554e-05, + "loss": 0.0581, + "num_input_tokens_seen": 111554656, + "step": 91660 + }, + { + "epoch": 10.208820581356498, + "grad_norm": 0.46004006266593933, + "learning_rate": 2.844169224393228e-05, + "loss": 0.0437, + "num_input_tokens_seen": 111560704, + "step": 91665 + }, + { + "epoch": 10.209377436240116, + "grad_norm": 0.0007961440715007484, + "learning_rate": 2.8439285624699456e-05, + "loss": 0.0063, + "num_input_tokens_seen": 111566784, + "step": 91670 + }, + { + "epoch": 10.209934291123734, + "grad_norm": 0.051551856100559235, + "learning_rate": 2.8436878972979837e-05, + "loss": 0.0149, + "num_input_tokens_seen": 111573152, + "step": 91675 + }, + { + "epoch": 10.210491146007351, + "grad_norm": 0.006703378167003393, + "learning_rate": 2.8434472288796128e-05, + "loss": 0.0038, + "num_input_tokens_seen": 111579264, + "step": 91680 + }, + { + "epoch": 10.211048000890967, + "grad_norm": 0.1500519961118698, + "learning_rate": 2.843206557217108e-05, + "loss": 0.0618, + "num_input_tokens_seen": 111585088, + "step": 91685 + }, + { + "epoch": 10.211604855774585, + "grad_norm": 0.5775561332702637, + "learning_rate": 2.842965882312743e-05, + "loss": 0.0184, + "num_input_tokens_seen": 111591168, + "step": 91690 + }, + { + "epoch": 10.212161710658203, + "grad_norm": 0.9565050601959229, + "learning_rate": 2.8427252041687895e-05, + "loss": 0.1332, + "num_input_tokens_seen": 111597344, + "step": 91695 + }, + { + "epoch": 10.21271856554182, + "grad_norm": 0.47088345885276794, + "learning_rate": 2.842484522787523e-05, + "loss": 0.0794, + "num_input_tokens_seen": 111603296, + "step": 91700 + }, + { + "epoch": 10.213275420425438, + "grad_norm": 1.6977410316467285, + "learning_rate": 2.8422438381712153e-05, + "loss": 0.0293, + "num_input_tokens_seen": 111609472, + "step": 91705 + }, + { + "epoch": 10.213832275309054, + "grad_norm": 0.530379056930542, + "learning_rate": 2.84200315032214e-05, + "loss": 0.0616, + "num_input_tokens_seen": 111615456, + "step": 91710 + }, + { + "epoch": 10.214389130192671, + "grad_norm": 0.8025586009025574, + "learning_rate": 2.8417624592425712e-05, + "loss": 0.1623, + "num_input_tokens_seen": 111621632, + "step": 91715 + }, + { + "epoch": 10.21494598507629, + "grad_norm": 0.43228283524513245, + "learning_rate": 2.841521764934782e-05, + "loss": 0.0747, + "num_input_tokens_seen": 111627776, + "step": 91720 + }, + { + "epoch": 10.215502839959907, + "grad_norm": 0.003824918996542692, + "learning_rate": 2.8412810674010466e-05, + "loss": 0.0961, + "num_input_tokens_seen": 111633728, + "step": 91725 + }, + { + "epoch": 10.216059694843524, + "grad_norm": 0.004723045509308577, + "learning_rate": 2.8410403666436375e-05, + "loss": 0.1491, + "num_input_tokens_seen": 111639968, + "step": 91730 + }, + { + "epoch": 10.21661654972714, + "grad_norm": 0.7535207867622375, + "learning_rate": 2.8407996626648292e-05, + "loss": 0.0292, + "num_input_tokens_seen": 111645920, + "step": 91735 + }, + { + "epoch": 10.217173404610758, + "grad_norm": 0.6707132458686829, + "learning_rate": 2.840558955466895e-05, + "loss": 0.0074, + "num_input_tokens_seen": 111652032, + "step": 91740 + }, + { + "epoch": 10.217730259494376, + "grad_norm": 0.01953730545938015, + "learning_rate": 2.8403182450521084e-05, + "loss": 0.0305, + "num_input_tokens_seen": 111658336, + "step": 91745 + }, + { + "epoch": 10.218287114377993, + "grad_norm": 0.04344072937965393, + "learning_rate": 2.8400775314227433e-05, + "loss": 0.0838, + "num_input_tokens_seen": 111664672, + "step": 91750 + }, + { + "epoch": 10.218843969261611, + "grad_norm": 0.036621611565351486, + "learning_rate": 2.839836814581074e-05, + "loss": 0.0391, + "num_input_tokens_seen": 111670752, + "step": 91755 + }, + { + "epoch": 10.219400824145227, + "grad_norm": 0.20326271653175354, + "learning_rate": 2.839596094529373e-05, + "loss": 0.0505, + "num_input_tokens_seen": 111677056, + "step": 91760 + }, + { + "epoch": 10.219957679028845, + "grad_norm": 0.1659948229789734, + "learning_rate": 2.839355371269915e-05, + "loss": 0.0543, + "num_input_tokens_seen": 111683424, + "step": 91765 + }, + { + "epoch": 10.220514533912462, + "grad_norm": 0.22293928265571594, + "learning_rate": 2.8391146448049742e-05, + "loss": 0.0096, + "num_input_tokens_seen": 111689184, + "step": 91770 + }, + { + "epoch": 10.22107138879608, + "grad_norm": 0.1411641538143158, + "learning_rate": 2.8388739151368238e-05, + "loss": 0.1001, + "num_input_tokens_seen": 111695168, + "step": 91775 + }, + { + "epoch": 10.221628243679698, + "grad_norm": 0.008632254786789417, + "learning_rate": 2.838633182267737e-05, + "loss": 0.0252, + "num_input_tokens_seen": 111701120, + "step": 91780 + }, + { + "epoch": 10.222185098563314, + "grad_norm": 0.0038515289779752493, + "learning_rate": 2.8383924461999888e-05, + "loss": 0.108, + "num_input_tokens_seen": 111707680, + "step": 91785 + }, + { + "epoch": 10.222741953446931, + "grad_norm": 0.0010782600147649646, + "learning_rate": 2.8381517069358533e-05, + "loss": 0.0449, + "num_input_tokens_seen": 111713696, + "step": 91790 + }, + { + "epoch": 10.223298808330549, + "grad_norm": 0.1804640144109726, + "learning_rate": 2.8379109644776037e-05, + "loss": 0.0689, + "num_input_tokens_seen": 111720096, + "step": 91795 + }, + { + "epoch": 10.223855663214167, + "grad_norm": 0.23294728994369507, + "learning_rate": 2.837670218827514e-05, + "loss": 0.0307, + "num_input_tokens_seen": 111726240, + "step": 91800 + }, + { + "epoch": 10.224412518097784, + "grad_norm": 0.042141422629356384, + "learning_rate": 2.8374294699878595e-05, + "loss": 0.064, + "num_input_tokens_seen": 111731968, + "step": 91805 + }, + { + "epoch": 10.2249693729814, + "grad_norm": 0.5160315036773682, + "learning_rate": 2.8371887179609125e-05, + "loss": 0.0221, + "num_input_tokens_seen": 111737088, + "step": 91810 + }, + { + "epoch": 10.225526227865018, + "grad_norm": 1.2784504890441895, + "learning_rate": 2.8369479627489477e-05, + "loss": 0.0856, + "num_input_tokens_seen": 111742688, + "step": 91815 + }, + { + "epoch": 10.226083082748636, + "grad_norm": 0.0018527730135247111, + "learning_rate": 2.8367072043542398e-05, + "loss": 0.1151, + "num_input_tokens_seen": 111748448, + "step": 91820 + }, + { + "epoch": 10.226639937632253, + "grad_norm": 0.7116564512252808, + "learning_rate": 2.8364664427790627e-05, + "loss": 0.0149, + "num_input_tokens_seen": 111754624, + "step": 91825 + }, + { + "epoch": 10.22719679251587, + "grad_norm": 0.9660497307777405, + "learning_rate": 2.8362256780256902e-05, + "loss": 0.0784, + "num_input_tokens_seen": 111760608, + "step": 91830 + }, + { + "epoch": 10.227753647399489, + "grad_norm": 0.78351229429245, + "learning_rate": 2.835984910096397e-05, + "loss": 0.0557, + "num_input_tokens_seen": 111766880, + "step": 91835 + }, + { + "epoch": 10.228310502283104, + "grad_norm": 0.15476654469966888, + "learning_rate": 2.8357441389934575e-05, + "loss": 0.0044, + "num_input_tokens_seen": 111772832, + "step": 91840 + }, + { + "epoch": 10.228867357166722, + "grad_norm": 0.028979986906051636, + "learning_rate": 2.8355033647191447e-05, + "loss": 0.0102, + "num_input_tokens_seen": 111779136, + "step": 91845 + }, + { + "epoch": 10.22942421205034, + "grad_norm": 0.023407848551869392, + "learning_rate": 2.8352625872757343e-05, + "loss": 0.0464, + "num_input_tokens_seen": 111785024, + "step": 91850 + }, + { + "epoch": 10.229981066933957, + "grad_norm": 0.023142511025071144, + "learning_rate": 2.8350218066655006e-05, + "loss": 0.0219, + "num_input_tokens_seen": 111791264, + "step": 91855 + }, + { + "epoch": 10.230537921817575, + "grad_norm": 0.8912487030029297, + "learning_rate": 2.8347810228907168e-05, + "loss": 0.015, + "num_input_tokens_seen": 111797216, + "step": 91860 + }, + { + "epoch": 10.231094776701191, + "grad_norm": 0.1592322140932083, + "learning_rate": 2.8345402359536582e-05, + "loss": 0.0421, + "num_input_tokens_seen": 111803072, + "step": 91865 + }, + { + "epoch": 10.231651631584809, + "grad_norm": 1.5668399333953857, + "learning_rate": 2.8342994458565992e-05, + "loss": 0.1631, + "num_input_tokens_seen": 111809248, + "step": 91870 + }, + { + "epoch": 10.232208486468426, + "grad_norm": 0.007214614190161228, + "learning_rate": 2.8340586526018136e-05, + "loss": 0.0226, + "num_input_tokens_seen": 111815136, + "step": 91875 + }, + { + "epoch": 10.232765341352044, + "grad_norm": 0.9704936146736145, + "learning_rate": 2.8338178561915774e-05, + "loss": 0.0879, + "num_input_tokens_seen": 111821312, + "step": 91880 + }, + { + "epoch": 10.233322196235662, + "grad_norm": 0.11313116550445557, + "learning_rate": 2.8335770566281633e-05, + "loss": 0.0578, + "num_input_tokens_seen": 111827200, + "step": 91885 + }, + { + "epoch": 10.233879051119278, + "grad_norm": 0.3250432312488556, + "learning_rate": 2.8333362539138468e-05, + "loss": 0.0726, + "num_input_tokens_seen": 111833184, + "step": 91890 + }, + { + "epoch": 10.234435906002895, + "grad_norm": 0.685632586479187, + "learning_rate": 2.8330954480509026e-05, + "loss": 0.0432, + "num_input_tokens_seen": 111839584, + "step": 91895 + }, + { + "epoch": 10.234992760886513, + "grad_norm": 1.3050695657730103, + "learning_rate": 2.832854639041605e-05, + "loss": 0.0406, + "num_input_tokens_seen": 111845600, + "step": 91900 + }, + { + "epoch": 10.23554961577013, + "grad_norm": 0.03437475487589836, + "learning_rate": 2.8326138268882285e-05, + "loss": 0.0456, + "num_input_tokens_seen": 111851264, + "step": 91905 + }, + { + "epoch": 10.236106470653748, + "grad_norm": 0.03945271670818329, + "learning_rate": 2.8323730115930475e-05, + "loss": 0.0238, + "num_input_tokens_seen": 111857536, + "step": 91910 + }, + { + "epoch": 10.236663325537364, + "grad_norm": 1.1336945295333862, + "learning_rate": 2.8321321931583376e-05, + "loss": 0.1064, + "num_input_tokens_seen": 111863392, + "step": 91915 + }, + { + "epoch": 10.237220180420982, + "grad_norm": 0.0005925996229052544, + "learning_rate": 2.8318913715863725e-05, + "loss": 0.1057, + "num_input_tokens_seen": 111869792, + "step": 91920 + }, + { + "epoch": 10.2377770353046, + "grad_norm": 0.20035779476165771, + "learning_rate": 2.8316505468794287e-05, + "loss": 0.0132, + "num_input_tokens_seen": 111876096, + "step": 91925 + }, + { + "epoch": 10.238333890188217, + "grad_norm": 0.08444629609584808, + "learning_rate": 2.8314097190397786e-05, + "loss": 0.0737, + "num_input_tokens_seen": 111882112, + "step": 91930 + }, + { + "epoch": 10.238890745071835, + "grad_norm": 0.0059782057069242, + "learning_rate": 2.831168888069699e-05, + "loss": 0.0042, + "num_input_tokens_seen": 111888128, + "step": 91935 + }, + { + "epoch": 10.23944759995545, + "grad_norm": 0.3498585522174835, + "learning_rate": 2.8309280539714634e-05, + "loss": 0.0364, + "num_input_tokens_seen": 111894400, + "step": 91940 + }, + { + "epoch": 10.240004454839069, + "grad_norm": 0.15713299810886383, + "learning_rate": 2.830687216747347e-05, + "loss": 0.0063, + "num_input_tokens_seen": 111900544, + "step": 91945 + }, + { + "epoch": 10.240561309722686, + "grad_norm": 0.8788855075836182, + "learning_rate": 2.8304463763996253e-05, + "loss": 0.056, + "num_input_tokens_seen": 111906848, + "step": 91950 + }, + { + "epoch": 10.241118164606304, + "grad_norm": 0.025400295853614807, + "learning_rate": 2.8302055329305727e-05, + "loss": 0.0036, + "num_input_tokens_seen": 111913248, + "step": 91955 + }, + { + "epoch": 10.241675019489922, + "grad_norm": 1.7037986516952515, + "learning_rate": 2.8299646863424646e-05, + "loss": 0.0365, + "num_input_tokens_seen": 111918944, + "step": 91960 + }, + { + "epoch": 10.242231874373537, + "grad_norm": 0.11810924112796783, + "learning_rate": 2.829723836637575e-05, + "loss": 0.0127, + "num_input_tokens_seen": 111925024, + "step": 91965 + }, + { + "epoch": 10.242788729257155, + "grad_norm": 0.11638613790273666, + "learning_rate": 2.8294829838181797e-05, + "loss": 0.012, + "num_input_tokens_seen": 111931104, + "step": 91970 + }, + { + "epoch": 10.243345584140773, + "grad_norm": 0.14326909184455872, + "learning_rate": 2.8292421278865545e-05, + "loss": 0.0363, + "num_input_tokens_seen": 111937248, + "step": 91975 + }, + { + "epoch": 10.24390243902439, + "grad_norm": 0.05681135132908821, + "learning_rate": 2.8290012688449722e-05, + "loss": 0.0627, + "num_input_tokens_seen": 111943264, + "step": 91980 + }, + { + "epoch": 10.244459293908008, + "grad_norm": 0.41663825511932373, + "learning_rate": 2.828760406695711e-05, + "loss": 0.0227, + "num_input_tokens_seen": 111949600, + "step": 91985 + }, + { + "epoch": 10.245016148791624, + "grad_norm": 1.767153263092041, + "learning_rate": 2.8285195414410437e-05, + "loss": 0.1043, + "num_input_tokens_seen": 111955552, + "step": 91990 + }, + { + "epoch": 10.245573003675242, + "grad_norm": 1.5360572338104248, + "learning_rate": 2.8282786730832456e-05, + "loss": 0.055, + "num_input_tokens_seen": 111961408, + "step": 91995 + }, + { + "epoch": 10.24612985855886, + "grad_norm": 0.017517339438199997, + "learning_rate": 2.8280378016245934e-05, + "loss": 0.0518, + "num_input_tokens_seen": 111967616, + "step": 92000 + }, + { + "epoch": 10.246686713442477, + "grad_norm": 0.0058195567689836025, + "learning_rate": 2.8277969270673604e-05, + "loss": 0.1095, + "num_input_tokens_seen": 111973696, + "step": 92005 + }, + { + "epoch": 10.247243568326095, + "grad_norm": 0.14024853706359863, + "learning_rate": 2.827556049413823e-05, + "loss": 0.0067, + "num_input_tokens_seen": 111979072, + "step": 92010 + }, + { + "epoch": 10.247800423209712, + "grad_norm": 0.004240850452333689, + "learning_rate": 2.8273151686662564e-05, + "loss": 0.109, + "num_input_tokens_seen": 111984544, + "step": 92015 + }, + { + "epoch": 10.248357278093328, + "grad_norm": 0.00011950042244279757, + "learning_rate": 2.8270742848269356e-05, + "loss": 0.062, + "num_input_tokens_seen": 111990592, + "step": 92020 + }, + { + "epoch": 10.248914132976946, + "grad_norm": 0.8089439868927002, + "learning_rate": 2.8268333978981367e-05, + "loss": 0.0368, + "num_input_tokens_seen": 111996448, + "step": 92025 + }, + { + "epoch": 10.249470987860564, + "grad_norm": 0.4816240072250366, + "learning_rate": 2.8265925078821337e-05, + "loss": 0.0631, + "num_input_tokens_seen": 112002400, + "step": 92030 + }, + { + "epoch": 10.250027842744181, + "grad_norm": 0.008255144581198692, + "learning_rate": 2.8263516147812035e-05, + "loss": 0.0044, + "num_input_tokens_seen": 112008480, + "step": 92035 + }, + { + "epoch": 10.250584697627799, + "grad_norm": 0.8191385865211487, + "learning_rate": 2.8261107185976206e-05, + "loss": 0.052, + "num_input_tokens_seen": 112014784, + "step": 92040 + }, + { + "epoch": 10.251141552511415, + "grad_norm": 0.010469670407474041, + "learning_rate": 2.8258698193336607e-05, + "loss": 0.0089, + "num_input_tokens_seen": 112021184, + "step": 92045 + }, + { + "epoch": 10.251698407395033, + "grad_norm": 0.05618894472718239, + "learning_rate": 2.825628916991599e-05, + "loss": 0.0482, + "num_input_tokens_seen": 112027520, + "step": 92050 + }, + { + "epoch": 10.25225526227865, + "grad_norm": 0.12219945341348648, + "learning_rate": 2.8253880115737114e-05, + "loss": 0.0161, + "num_input_tokens_seen": 112033920, + "step": 92055 + }, + { + "epoch": 10.252812117162268, + "grad_norm": 0.28994905948638916, + "learning_rate": 2.8251471030822736e-05, + "loss": 0.096, + "num_input_tokens_seen": 112039552, + "step": 92060 + }, + { + "epoch": 10.253368972045886, + "grad_norm": 0.07551062107086182, + "learning_rate": 2.8249061915195606e-05, + "loss": 0.0128, + "num_input_tokens_seen": 112045760, + "step": 92065 + }, + { + "epoch": 10.253925826929501, + "grad_norm": 1.6071528196334839, + "learning_rate": 2.8246652768878485e-05, + "loss": 0.0433, + "num_input_tokens_seen": 112051840, + "step": 92070 + }, + { + "epoch": 10.25448268181312, + "grad_norm": 0.3407832980155945, + "learning_rate": 2.8244243591894125e-05, + "loss": 0.087, + "num_input_tokens_seen": 112057920, + "step": 92075 + }, + { + "epoch": 10.255039536696737, + "grad_norm": 0.026707159355282784, + "learning_rate": 2.8241834384265293e-05, + "loss": 0.0207, + "num_input_tokens_seen": 112064032, + "step": 92080 + }, + { + "epoch": 10.255596391580355, + "grad_norm": 1.040977120399475, + "learning_rate": 2.8239425146014725e-05, + "loss": 0.0432, + "num_input_tokens_seen": 112070208, + "step": 92085 + }, + { + "epoch": 10.256153246463972, + "grad_norm": 0.49733588099479675, + "learning_rate": 2.8237015877165198e-05, + "loss": 0.0218, + "num_input_tokens_seen": 112076288, + "step": 92090 + }, + { + "epoch": 10.256710101347588, + "grad_norm": 0.0024535420816391706, + "learning_rate": 2.8234606577739464e-05, + "loss": 0.0952, + "num_input_tokens_seen": 112081600, + "step": 92095 + }, + { + "epoch": 10.257266956231206, + "grad_norm": 0.0003021569282282144, + "learning_rate": 2.8232197247760277e-05, + "loss": 0.0521, + "num_input_tokens_seen": 112087552, + "step": 92100 + }, + { + "epoch": 10.257823811114823, + "grad_norm": 1.0780808925628662, + "learning_rate": 2.8229787887250403e-05, + "loss": 0.1869, + "num_input_tokens_seen": 112093056, + "step": 92105 + }, + { + "epoch": 10.258380665998441, + "grad_norm": 0.02232200652360916, + "learning_rate": 2.822737849623259e-05, + "loss": 0.0041, + "num_input_tokens_seen": 112098944, + "step": 92110 + }, + { + "epoch": 10.258937520882059, + "grad_norm": 0.003554312279447913, + "learning_rate": 2.82249690747296e-05, + "loss": 0.0061, + "num_input_tokens_seen": 112104960, + "step": 92115 + }, + { + "epoch": 10.259494375765675, + "grad_norm": 0.6849742531776428, + "learning_rate": 2.8222559622764194e-05, + "loss": 0.025, + "num_input_tokens_seen": 112110976, + "step": 92120 + }, + { + "epoch": 10.260051230649292, + "grad_norm": 0.23112955689430237, + "learning_rate": 2.8220150140359124e-05, + "loss": 0.0071, + "num_input_tokens_seen": 112116928, + "step": 92125 + }, + { + "epoch": 10.26060808553291, + "grad_norm": 0.004337976220995188, + "learning_rate": 2.821774062753717e-05, + "loss": 0.1562, + "num_input_tokens_seen": 112122592, + "step": 92130 + }, + { + "epoch": 10.261164940416528, + "grad_norm": 0.9866208434104919, + "learning_rate": 2.8215331084321068e-05, + "loss": 0.0307, + "num_input_tokens_seen": 112128864, + "step": 92135 + }, + { + "epoch": 10.261721795300145, + "grad_norm": 0.2004287987947464, + "learning_rate": 2.8212921510733592e-05, + "loss": 0.0481, + "num_input_tokens_seen": 112134304, + "step": 92140 + }, + { + "epoch": 10.262278650183761, + "grad_norm": 0.024697551503777504, + "learning_rate": 2.82105119067975e-05, + "loss": 0.0814, + "num_input_tokens_seen": 112140512, + "step": 92145 + }, + { + "epoch": 10.262835505067379, + "grad_norm": 0.001097403233870864, + "learning_rate": 2.820810227253554e-05, + "loss": 0.0021, + "num_input_tokens_seen": 112146848, + "step": 92150 + }, + { + "epoch": 10.263392359950997, + "grad_norm": 2.3496763706207275, + "learning_rate": 2.8205692607970496e-05, + "loss": 0.1358, + "num_input_tokens_seen": 112152736, + "step": 92155 + }, + { + "epoch": 10.263949214834614, + "grad_norm": 0.003434078535065055, + "learning_rate": 2.820328291312511e-05, + "loss": 0.1053, + "num_input_tokens_seen": 112158880, + "step": 92160 + }, + { + "epoch": 10.264506069718232, + "grad_norm": 0.0010440985206514597, + "learning_rate": 2.820087318802216e-05, + "loss": 0.1882, + "num_input_tokens_seen": 112165024, + "step": 92165 + }, + { + "epoch": 10.26506292460185, + "grad_norm": 0.7931717038154602, + "learning_rate": 2.8198463432684385e-05, + "loss": 0.1309, + "num_input_tokens_seen": 112171136, + "step": 92170 + }, + { + "epoch": 10.265619779485466, + "grad_norm": 0.349404513835907, + "learning_rate": 2.819605364713457e-05, + "loss": 0.0635, + "num_input_tokens_seen": 112177120, + "step": 92175 + }, + { + "epoch": 10.266176634369083, + "grad_norm": 0.04639579355716705, + "learning_rate": 2.819364383139546e-05, + "loss": 0.0094, + "num_input_tokens_seen": 112183040, + "step": 92180 + }, + { + "epoch": 10.266733489252701, + "grad_norm": 0.44965851306915283, + "learning_rate": 2.819123398548983e-05, + "loss": 0.02, + "num_input_tokens_seen": 112189248, + "step": 92185 + }, + { + "epoch": 10.267290344136319, + "grad_norm": 0.0006465906626544893, + "learning_rate": 2.8188824109440437e-05, + "loss": 0.0407, + "num_input_tokens_seen": 112195328, + "step": 92190 + }, + { + "epoch": 10.267847199019936, + "grad_norm": 0.3072986602783203, + "learning_rate": 2.8186414203270045e-05, + "loss": 0.016, + "num_input_tokens_seen": 112201408, + "step": 92195 + }, + { + "epoch": 10.268404053903552, + "grad_norm": 0.004701707046478987, + "learning_rate": 2.8184004267001425e-05, + "loss": 0.0806, + "num_input_tokens_seen": 112207840, + "step": 92200 + }, + { + "epoch": 10.26896090878717, + "grad_norm": 0.02908317558467388, + "learning_rate": 2.818159430065732e-05, + "loss": 0.0862, + "num_input_tokens_seen": 112213280, + "step": 92205 + }, + { + "epoch": 10.269517763670788, + "grad_norm": 0.6580759286880493, + "learning_rate": 2.817918430426052e-05, + "loss": 0.0344, + "num_input_tokens_seen": 112218848, + "step": 92210 + }, + { + "epoch": 10.270074618554405, + "grad_norm": 0.0034651581663638353, + "learning_rate": 2.8176774277833773e-05, + "loss": 0.1013, + "num_input_tokens_seen": 112225312, + "step": 92215 + }, + { + "epoch": 10.270631473438023, + "grad_norm": 1.5758317708969116, + "learning_rate": 2.8174364221399848e-05, + "loss": 0.0491, + "num_input_tokens_seen": 112231424, + "step": 92220 + }, + { + "epoch": 10.271188328321639, + "grad_norm": 0.11259758472442627, + "learning_rate": 2.817195413498151e-05, + "loss": 0.0694, + "num_input_tokens_seen": 112237472, + "step": 92225 + }, + { + "epoch": 10.271745183205256, + "grad_norm": 0.0012548795202746987, + "learning_rate": 2.8169544018601517e-05, + "loss": 0.0502, + "num_input_tokens_seen": 112243648, + "step": 92230 + }, + { + "epoch": 10.272302038088874, + "grad_norm": 0.27195775508880615, + "learning_rate": 2.8167133872282654e-05, + "loss": 0.0653, + "num_input_tokens_seen": 112249280, + "step": 92235 + }, + { + "epoch": 10.272858892972492, + "grad_norm": 0.02357127144932747, + "learning_rate": 2.8164723696047662e-05, + "loss": 0.1177, + "num_input_tokens_seen": 112255584, + "step": 92240 + }, + { + "epoch": 10.27341574785611, + "grad_norm": 0.00027632436831481755, + "learning_rate": 2.8162313489919322e-05, + "loss": 0.0866, + "num_input_tokens_seen": 112261056, + "step": 92245 + }, + { + "epoch": 10.273972602739725, + "grad_norm": 0.5412960648536682, + "learning_rate": 2.8159903253920406e-05, + "loss": 0.0129, + "num_input_tokens_seen": 112267232, + "step": 92250 + }, + { + "epoch": 10.274529457623343, + "grad_norm": 0.6257970333099365, + "learning_rate": 2.8157492988073665e-05, + "loss": 0.0216, + "num_input_tokens_seen": 112273536, + "step": 92255 + }, + { + "epoch": 10.27508631250696, + "grad_norm": 0.011675698682665825, + "learning_rate": 2.8155082692401873e-05, + "loss": 0.0051, + "num_input_tokens_seen": 112279744, + "step": 92260 + }, + { + "epoch": 10.275643167390578, + "grad_norm": 0.5450135469436646, + "learning_rate": 2.8152672366927797e-05, + "loss": 0.0343, + "num_input_tokens_seen": 112285536, + "step": 92265 + }, + { + "epoch": 10.276200022274196, + "grad_norm": 0.15403659641742706, + "learning_rate": 2.815026201167421e-05, + "loss": 0.0155, + "num_input_tokens_seen": 112291936, + "step": 92270 + }, + { + "epoch": 10.276756877157812, + "grad_norm": 0.06798084080219269, + "learning_rate": 2.814785162666387e-05, + "loss": 0.0284, + "num_input_tokens_seen": 112298368, + "step": 92275 + }, + { + "epoch": 10.27731373204143, + "grad_norm": 0.1217183843255043, + "learning_rate": 2.8145441211919544e-05, + "loss": 0.0101, + "num_input_tokens_seen": 112304352, + "step": 92280 + }, + { + "epoch": 10.277870586925047, + "grad_norm": 0.0011850817827507854, + "learning_rate": 2.8143030767464017e-05, + "loss": 0.1115, + "num_input_tokens_seen": 112310496, + "step": 92285 + }, + { + "epoch": 10.278427441808665, + "grad_norm": 0.9110755920410156, + "learning_rate": 2.8140620293320036e-05, + "loss": 0.0271, + "num_input_tokens_seen": 112316800, + "step": 92290 + }, + { + "epoch": 10.278984296692283, + "grad_norm": 0.0004761756572406739, + "learning_rate": 2.813820978951039e-05, + "loss": 0.0334, + "num_input_tokens_seen": 112322912, + "step": 92295 + }, + { + "epoch": 10.279541151575899, + "grad_norm": 0.5315924286842346, + "learning_rate": 2.8135799256057826e-05, + "loss": 0.0156, + "num_input_tokens_seen": 112328992, + "step": 92300 + }, + { + "epoch": 10.280098006459516, + "grad_norm": 3.760105609893799, + "learning_rate": 2.813338869298514e-05, + "loss": 0.0745, + "num_input_tokens_seen": 112334752, + "step": 92305 + }, + { + "epoch": 10.280654861343134, + "grad_norm": 1.2592954635620117, + "learning_rate": 2.8130978100315076e-05, + "loss": 0.0452, + "num_input_tokens_seen": 112340896, + "step": 92310 + }, + { + "epoch": 10.281211716226752, + "grad_norm": 0.18220435082912445, + "learning_rate": 2.8128567478070417e-05, + "loss": 0.0186, + "num_input_tokens_seen": 112346432, + "step": 92315 + }, + { + "epoch": 10.28176857111037, + "grad_norm": 0.1456006020307541, + "learning_rate": 2.8126156826273936e-05, + "loss": 0.0783, + "num_input_tokens_seen": 112352704, + "step": 92320 + }, + { + "epoch": 10.282325425993985, + "grad_norm": 0.3724498152732849, + "learning_rate": 2.8123746144948393e-05, + "loss": 0.0617, + "num_input_tokens_seen": 112358848, + "step": 92325 + }, + { + "epoch": 10.282882280877603, + "grad_norm": 0.8305439352989197, + "learning_rate": 2.8121335434116576e-05, + "loss": 0.108, + "num_input_tokens_seen": 112364768, + "step": 92330 + }, + { + "epoch": 10.28343913576122, + "grad_norm": 1.5214110612869263, + "learning_rate": 2.8118924693801236e-05, + "loss": 0.0944, + "num_input_tokens_seen": 112370880, + "step": 92335 + }, + { + "epoch": 10.283995990644838, + "grad_norm": 0.2620253264904022, + "learning_rate": 2.8116513924025155e-05, + "loss": 0.0688, + "num_input_tokens_seen": 112377152, + "step": 92340 + }, + { + "epoch": 10.284552845528456, + "grad_norm": 0.947500467300415, + "learning_rate": 2.81141031248111e-05, + "loss": 0.0579, + "num_input_tokens_seen": 112383168, + "step": 92345 + }, + { + "epoch": 10.285109700412072, + "grad_norm": 0.008446435444056988, + "learning_rate": 2.8111692296181853e-05, + "loss": 0.047, + "num_input_tokens_seen": 112388832, + "step": 92350 + }, + { + "epoch": 10.28566655529569, + "grad_norm": 0.5432351231575012, + "learning_rate": 2.8109281438160172e-05, + "loss": 0.1522, + "num_input_tokens_seen": 112394560, + "step": 92355 + }, + { + "epoch": 10.286223410179307, + "grad_norm": 1.361854910850525, + "learning_rate": 2.8106870550768844e-05, + "loss": 0.1831, + "num_input_tokens_seen": 112400576, + "step": 92360 + }, + { + "epoch": 10.286780265062925, + "grad_norm": 4.763218879699707, + "learning_rate": 2.8104459634030632e-05, + "loss": 0.0565, + "num_input_tokens_seen": 112406816, + "step": 92365 + }, + { + "epoch": 10.287337119946542, + "grad_norm": 1.5049797296524048, + "learning_rate": 2.810204868796831e-05, + "loss": 0.0568, + "num_input_tokens_seen": 112412256, + "step": 92370 + }, + { + "epoch": 10.28789397483016, + "grad_norm": 0.30432847142219543, + "learning_rate": 2.8099637712604647e-05, + "loss": 0.0145, + "num_input_tokens_seen": 112418368, + "step": 92375 + }, + { + "epoch": 10.288450829713776, + "grad_norm": 0.10436443984508514, + "learning_rate": 2.8097226707962433e-05, + "loss": 0.0309, + "num_input_tokens_seen": 112424320, + "step": 92380 + }, + { + "epoch": 10.289007684597394, + "grad_norm": 1.7050896883010864, + "learning_rate": 2.8094815674064423e-05, + "loss": 0.0926, + "num_input_tokens_seen": 112430912, + "step": 92385 + }, + { + "epoch": 10.289564539481011, + "grad_norm": 0.0002595819823909551, + "learning_rate": 2.809240461093341e-05, + "loss": 0.0144, + "num_input_tokens_seen": 112436704, + "step": 92390 + }, + { + "epoch": 10.290121394364629, + "grad_norm": 0.18774788081645966, + "learning_rate": 2.808999351859215e-05, + "loss": 0.0533, + "num_input_tokens_seen": 112442592, + "step": 92395 + }, + { + "epoch": 10.290678249248247, + "grad_norm": 0.024891668930649757, + "learning_rate": 2.8087582397063427e-05, + "loss": 0.001, + "num_input_tokens_seen": 112448736, + "step": 92400 + }, + { + "epoch": 10.291235104131863, + "grad_norm": 1.0739121437072754, + "learning_rate": 2.8085171246370016e-05, + "loss": 0.0986, + "num_input_tokens_seen": 112454912, + "step": 92405 + }, + { + "epoch": 10.29179195901548, + "grad_norm": 0.000429527455708012, + "learning_rate": 2.808276006653469e-05, + "loss": 0.0129, + "num_input_tokens_seen": 112461024, + "step": 92410 + }, + { + "epoch": 10.292348813899098, + "grad_norm": 0.18618924915790558, + "learning_rate": 2.8080348857580223e-05, + "loss": 0.0938, + "num_input_tokens_seen": 112467104, + "step": 92415 + }, + { + "epoch": 10.292905668782716, + "grad_norm": 0.00920833833515644, + "learning_rate": 2.8077937619529393e-05, + "loss": 0.0789, + "num_input_tokens_seen": 112473280, + "step": 92420 + }, + { + "epoch": 10.293462523666333, + "grad_norm": 0.1238619014620781, + "learning_rate": 2.8075526352404978e-05, + "loss": 0.0122, + "num_input_tokens_seen": 112479360, + "step": 92425 + }, + { + "epoch": 10.29401937854995, + "grad_norm": 0.011969266459345818, + "learning_rate": 2.8073115056229755e-05, + "loss": 0.0283, + "num_input_tokens_seen": 112485504, + "step": 92430 + }, + { + "epoch": 10.294576233433567, + "grad_norm": 0.05961209908127785, + "learning_rate": 2.8070703731026493e-05, + "loss": 0.0432, + "num_input_tokens_seen": 112491456, + "step": 92435 + }, + { + "epoch": 10.295133088317185, + "grad_norm": 0.3689352571964264, + "learning_rate": 2.8068292376817977e-05, + "loss": 0.0604, + "num_input_tokens_seen": 112497408, + "step": 92440 + }, + { + "epoch": 10.295689943200802, + "grad_norm": 0.14380192756652832, + "learning_rate": 2.806588099362698e-05, + "loss": 0.0159, + "num_input_tokens_seen": 112503392, + "step": 92445 + }, + { + "epoch": 10.29624679808442, + "grad_norm": 1.5992860794067383, + "learning_rate": 2.806346958147629e-05, + "loss": 0.0911, + "num_input_tokens_seen": 112509408, + "step": 92450 + }, + { + "epoch": 10.296803652968036, + "grad_norm": 0.21820871531963348, + "learning_rate": 2.8061058140388657e-05, + "loss": 0.0425, + "num_input_tokens_seen": 112514944, + "step": 92455 + }, + { + "epoch": 10.297360507851653, + "grad_norm": 0.04498155415058136, + "learning_rate": 2.805864667038689e-05, + "loss": 0.0052, + "num_input_tokens_seen": 112520992, + "step": 92460 + }, + { + "epoch": 10.297917362735271, + "grad_norm": 0.6621696949005127, + "learning_rate": 2.805623517149375e-05, + "loss": 0.0164, + "num_input_tokens_seen": 112527072, + "step": 92465 + }, + { + "epoch": 10.298474217618889, + "grad_norm": 0.04950140789151192, + "learning_rate": 2.8053823643732025e-05, + "loss": 0.0739, + "num_input_tokens_seen": 112533312, + "step": 92470 + }, + { + "epoch": 10.299031072502506, + "grad_norm": 0.01600864715874195, + "learning_rate": 2.8051412087124484e-05, + "loss": 0.256, + "num_input_tokens_seen": 112539456, + "step": 92475 + }, + { + "epoch": 10.299587927386122, + "grad_norm": 0.0051517002284526825, + "learning_rate": 2.804900050169391e-05, + "loss": 0.0012, + "num_input_tokens_seen": 112546016, + "step": 92480 + }, + { + "epoch": 10.30014478226974, + "grad_norm": 0.43599605560302734, + "learning_rate": 2.8046588887463094e-05, + "loss": 0.0563, + "num_input_tokens_seen": 112552096, + "step": 92485 + }, + { + "epoch": 10.300701637153358, + "grad_norm": 0.07503950595855713, + "learning_rate": 2.8044177244454795e-05, + "loss": 0.0637, + "num_input_tokens_seen": 112557344, + "step": 92490 + }, + { + "epoch": 10.301258492036975, + "grad_norm": 0.112666055560112, + "learning_rate": 2.8041765572691804e-05, + "loss": 0.0125, + "num_input_tokens_seen": 112563904, + "step": 92495 + }, + { + "epoch": 10.301815346920593, + "grad_norm": 0.6839946508407593, + "learning_rate": 2.8039353872196906e-05, + "loss": 0.0332, + "num_input_tokens_seen": 112570336, + "step": 92500 + }, + { + "epoch": 10.302372201804209, + "grad_norm": 0.0069990637712180614, + "learning_rate": 2.8036942142992867e-05, + "loss": 0.0743, + "num_input_tokens_seen": 112576512, + "step": 92505 + }, + { + "epoch": 10.302929056687827, + "grad_norm": 0.04033608362078667, + "learning_rate": 2.8034530385102482e-05, + "loss": 0.0255, + "num_input_tokens_seen": 112582816, + "step": 92510 + }, + { + "epoch": 10.303485911571444, + "grad_norm": 0.7892587184906006, + "learning_rate": 2.8032118598548522e-05, + "loss": 0.0221, + "num_input_tokens_seen": 112588960, + "step": 92515 + }, + { + "epoch": 10.304042766455062, + "grad_norm": 0.00029045582050457597, + "learning_rate": 2.802970678335377e-05, + "loss": 0.0602, + "num_input_tokens_seen": 112595136, + "step": 92520 + }, + { + "epoch": 10.30459962133868, + "grad_norm": 0.20431505143642426, + "learning_rate": 2.8027294939541022e-05, + "loss": 0.0573, + "num_input_tokens_seen": 112601408, + "step": 92525 + }, + { + "epoch": 10.305156476222297, + "grad_norm": 0.012203997001051903, + "learning_rate": 2.802488306713304e-05, + "loss": 0.0554, + "num_input_tokens_seen": 112607936, + "step": 92530 + }, + { + "epoch": 10.305713331105913, + "grad_norm": 0.1653786152601242, + "learning_rate": 2.802247116615262e-05, + "loss": 0.0454, + "num_input_tokens_seen": 112613920, + "step": 92535 + }, + { + "epoch": 10.306270185989531, + "grad_norm": 0.5406063199043274, + "learning_rate": 2.802005923662253e-05, + "loss": 0.0998, + "num_input_tokens_seen": 112620032, + "step": 92540 + }, + { + "epoch": 10.306827040873149, + "grad_norm": 0.548392653465271, + "learning_rate": 2.8017647278565568e-05, + "loss": 0.0085, + "num_input_tokens_seen": 112626336, + "step": 92545 + }, + { + "epoch": 10.307383895756766, + "grad_norm": 1.2438596487045288, + "learning_rate": 2.8015235292004503e-05, + "loss": 0.1613, + "num_input_tokens_seen": 112632480, + "step": 92550 + }, + { + "epoch": 10.307940750640384, + "grad_norm": 0.0007832178962416947, + "learning_rate": 2.8012823276962125e-05, + "loss": 0.0125, + "num_input_tokens_seen": 112638624, + "step": 92555 + }, + { + "epoch": 10.308497605524, + "grad_norm": 0.027365311980247498, + "learning_rate": 2.8010411233461225e-05, + "loss": 0.0379, + "num_input_tokens_seen": 112644640, + "step": 92560 + }, + { + "epoch": 10.309054460407618, + "grad_norm": 0.16560623049736023, + "learning_rate": 2.8007999161524573e-05, + "loss": 0.0176, + "num_input_tokens_seen": 112650880, + "step": 92565 + }, + { + "epoch": 10.309611315291235, + "grad_norm": 0.012340114451944828, + "learning_rate": 2.800558706117496e-05, + "loss": 0.0319, + "num_input_tokens_seen": 112656800, + "step": 92570 + }, + { + "epoch": 10.310168170174853, + "grad_norm": 0.17490200698375702, + "learning_rate": 2.8003174932435168e-05, + "loss": 0.1357, + "num_input_tokens_seen": 112662464, + "step": 92575 + }, + { + "epoch": 10.31072502505847, + "grad_norm": 0.9260194301605225, + "learning_rate": 2.8000762775327987e-05, + "loss": 0.1079, + "num_input_tokens_seen": 112668896, + "step": 92580 + }, + { + "epoch": 10.311281879942086, + "grad_norm": 0.8055756688117981, + "learning_rate": 2.7998350589876193e-05, + "loss": 0.171, + "num_input_tokens_seen": 112675264, + "step": 92585 + }, + { + "epoch": 10.311838734825704, + "grad_norm": 0.15725693106651306, + "learning_rate": 2.7995938376102576e-05, + "loss": 0.0116, + "num_input_tokens_seen": 112681568, + "step": 92590 + }, + { + "epoch": 10.312395589709322, + "grad_norm": 0.1870012730360031, + "learning_rate": 2.7993526134029923e-05, + "loss": 0.073, + "num_input_tokens_seen": 112688032, + "step": 92595 + }, + { + "epoch": 10.31295244459294, + "grad_norm": 0.19472403824329376, + "learning_rate": 2.7991113863681013e-05, + "loss": 0.0798, + "num_input_tokens_seen": 112694048, + "step": 92600 + }, + { + "epoch": 10.313509299476557, + "grad_norm": 1.4933584928512573, + "learning_rate": 2.7988701565078645e-05, + "loss": 0.1289, + "num_input_tokens_seen": 112700224, + "step": 92605 + }, + { + "epoch": 10.314066154360173, + "grad_norm": 0.0020339335314929485, + "learning_rate": 2.7986289238245587e-05, + "loss": 0.1189, + "num_input_tokens_seen": 112705600, + "step": 92610 + }, + { + "epoch": 10.31462300924379, + "grad_norm": 0.011818229220807552, + "learning_rate": 2.798387688320464e-05, + "loss": 0.0033, + "num_input_tokens_seen": 112711872, + "step": 92615 + }, + { + "epoch": 10.315179864127408, + "grad_norm": 0.0072340755723416805, + "learning_rate": 2.7981464499978583e-05, + "loss": 0.0128, + "num_input_tokens_seen": 112718016, + "step": 92620 + }, + { + "epoch": 10.315736719011026, + "grad_norm": 0.15545006096363068, + "learning_rate": 2.7979052088590203e-05, + "loss": 0.0342, + "num_input_tokens_seen": 112724224, + "step": 92625 + }, + { + "epoch": 10.316293573894644, + "grad_norm": 0.024716321378946304, + "learning_rate": 2.7976639649062292e-05, + "loss": 0.0231, + "num_input_tokens_seen": 112729760, + "step": 92630 + }, + { + "epoch": 10.31685042877826, + "grad_norm": 0.028789494186639786, + "learning_rate": 2.7974227181417633e-05, + "loss": 0.0653, + "num_input_tokens_seen": 112735776, + "step": 92635 + }, + { + "epoch": 10.317407283661877, + "grad_norm": 0.6677447557449341, + "learning_rate": 2.7971814685679022e-05, + "loss": 0.1029, + "num_input_tokens_seen": 112741856, + "step": 92640 + }, + { + "epoch": 10.317964138545495, + "grad_norm": 0.21101908385753632, + "learning_rate": 2.796940216186923e-05, + "loss": 0.0181, + "num_input_tokens_seen": 112748064, + "step": 92645 + }, + { + "epoch": 10.318520993429113, + "grad_norm": 0.0014678476145491004, + "learning_rate": 2.7966989610011057e-05, + "loss": 0.0153, + "num_input_tokens_seen": 112754240, + "step": 92650 + }, + { + "epoch": 10.31907784831273, + "grad_norm": 0.24695396423339844, + "learning_rate": 2.79645770301273e-05, + "loss": 0.066, + "num_input_tokens_seen": 112760608, + "step": 92655 + }, + { + "epoch": 10.319634703196346, + "grad_norm": 0.6339007019996643, + "learning_rate": 2.7962164422240726e-05, + "loss": 0.0951, + "num_input_tokens_seen": 112767072, + "step": 92660 + }, + { + "epoch": 10.320191558079964, + "grad_norm": 0.00845656543970108, + "learning_rate": 2.7959751786374145e-05, + "loss": 0.0365, + "num_input_tokens_seen": 112773280, + "step": 92665 + }, + { + "epoch": 10.320748412963582, + "grad_norm": 1.1565459966659546, + "learning_rate": 2.795733912255033e-05, + "loss": 0.022, + "num_input_tokens_seen": 112779680, + "step": 92670 + }, + { + "epoch": 10.3213052678472, + "grad_norm": 0.5319098830223083, + "learning_rate": 2.7954926430792084e-05, + "loss": 0.0385, + "num_input_tokens_seen": 112785888, + "step": 92675 + }, + { + "epoch": 10.321862122730817, + "grad_norm": 0.06210974231362343, + "learning_rate": 2.7952513711122187e-05, + "loss": 0.0189, + "num_input_tokens_seen": 112791776, + "step": 92680 + }, + { + "epoch": 10.322418977614433, + "grad_norm": 0.39326396584510803, + "learning_rate": 2.7950100963563426e-05, + "loss": 0.0268, + "num_input_tokens_seen": 112797536, + "step": 92685 + }, + { + "epoch": 10.32297583249805, + "grad_norm": 0.41784635186195374, + "learning_rate": 2.794768818813861e-05, + "loss": 0.134, + "num_input_tokens_seen": 112803584, + "step": 92690 + }, + { + "epoch": 10.323532687381668, + "grad_norm": 0.0002263796195620671, + "learning_rate": 2.7945275384870507e-05, + "loss": 0.0168, + "num_input_tokens_seen": 112809504, + "step": 92695 + }, + { + "epoch": 10.324089542265286, + "grad_norm": 0.0008655679994262755, + "learning_rate": 2.7942862553781927e-05, + "loss": 0.0021, + "num_input_tokens_seen": 112815616, + "step": 92700 + }, + { + "epoch": 10.324646397148904, + "grad_norm": 0.0013284533051773906, + "learning_rate": 2.7940449694895644e-05, + "loss": 0.0239, + "num_input_tokens_seen": 112821408, + "step": 92705 + }, + { + "epoch": 10.32520325203252, + "grad_norm": 0.13528725504875183, + "learning_rate": 2.793803680823447e-05, + "loss": 0.0119, + "num_input_tokens_seen": 112826912, + "step": 92710 + }, + { + "epoch": 10.325760106916137, + "grad_norm": 1.8128663301467896, + "learning_rate": 2.7935623893821174e-05, + "loss": 0.0929, + "num_input_tokens_seen": 112832864, + "step": 92715 + }, + { + "epoch": 10.326316961799755, + "grad_norm": 1.403710126876831, + "learning_rate": 2.7933210951678558e-05, + "loss": 0.0493, + "num_input_tokens_seen": 112838976, + "step": 92720 + }, + { + "epoch": 10.326873816683372, + "grad_norm": 0.5512146949768066, + "learning_rate": 2.793079798182942e-05, + "loss": 0.0337, + "num_input_tokens_seen": 112845312, + "step": 92725 + }, + { + "epoch": 10.32743067156699, + "grad_norm": 0.003162527224048972, + "learning_rate": 2.7928384984296542e-05, + "loss": 0.097, + "num_input_tokens_seen": 112851584, + "step": 92730 + }, + { + "epoch": 10.327987526450608, + "grad_norm": 1.152876615524292, + "learning_rate": 2.7925971959102725e-05, + "loss": 0.0943, + "num_input_tokens_seen": 112857568, + "step": 92735 + }, + { + "epoch": 10.328544381334224, + "grad_norm": 0.3188199996948242, + "learning_rate": 2.792355890627076e-05, + "loss": 0.079, + "num_input_tokens_seen": 112863648, + "step": 92740 + }, + { + "epoch": 10.329101236217841, + "grad_norm": 0.16023941338062286, + "learning_rate": 2.792114582582343e-05, + "loss": 0.0945, + "num_input_tokens_seen": 112869696, + "step": 92745 + }, + { + "epoch": 10.329658091101459, + "grad_norm": 0.062147751450538635, + "learning_rate": 2.7918732717783548e-05, + "loss": 0.037, + "num_input_tokens_seen": 112875584, + "step": 92750 + }, + { + "epoch": 10.330214945985077, + "grad_norm": 0.1792113035917282, + "learning_rate": 2.7916319582173887e-05, + "loss": 0.0203, + "num_input_tokens_seen": 112881696, + "step": 92755 + }, + { + "epoch": 10.330771800868694, + "grad_norm": 0.22603248059749603, + "learning_rate": 2.7913906419017262e-05, + "loss": 0.0261, + "num_input_tokens_seen": 112887808, + "step": 92760 + }, + { + "epoch": 10.33132865575231, + "grad_norm": 0.06553509086370468, + "learning_rate": 2.791149322833645e-05, + "loss": 0.0798, + "num_input_tokens_seen": 112893792, + "step": 92765 + }, + { + "epoch": 10.331885510635928, + "grad_norm": 0.02825113572180271, + "learning_rate": 2.7909080010154253e-05, + "loss": 0.0117, + "num_input_tokens_seen": 112899904, + "step": 92770 + }, + { + "epoch": 10.332442365519546, + "grad_norm": 0.002699002157896757, + "learning_rate": 2.7906666764493468e-05, + "loss": 0.0637, + "num_input_tokens_seen": 112906272, + "step": 92775 + }, + { + "epoch": 10.332999220403163, + "grad_norm": 0.2381465882062912, + "learning_rate": 2.7904253491376875e-05, + "loss": 0.1438, + "num_input_tokens_seen": 112912352, + "step": 92780 + }, + { + "epoch": 10.333556075286781, + "grad_norm": 0.010140800848603249, + "learning_rate": 2.7901840190827294e-05, + "loss": 0.0061, + "num_input_tokens_seen": 112918784, + "step": 92785 + }, + { + "epoch": 10.334112930170397, + "grad_norm": 0.289492130279541, + "learning_rate": 2.78994268628675e-05, + "loss": 0.0431, + "num_input_tokens_seen": 112924896, + "step": 92790 + }, + { + "epoch": 10.334669785054015, + "grad_norm": 1.644718050956726, + "learning_rate": 2.7897013507520302e-05, + "loss": 0.0731, + "num_input_tokens_seen": 112930752, + "step": 92795 + }, + { + "epoch": 10.335226639937632, + "grad_norm": 0.10718036442995071, + "learning_rate": 2.7894600124808484e-05, + "loss": 0.0057, + "num_input_tokens_seen": 112936672, + "step": 92800 + }, + { + "epoch": 10.33578349482125, + "grad_norm": 0.00121585326269269, + "learning_rate": 2.789218671475486e-05, + "loss": 0.0162, + "num_input_tokens_seen": 112942752, + "step": 92805 + }, + { + "epoch": 10.336340349704868, + "grad_norm": 1.4344371557235718, + "learning_rate": 2.7889773277382207e-05, + "loss": 0.0489, + "num_input_tokens_seen": 112948800, + "step": 92810 + }, + { + "epoch": 10.336897204588483, + "grad_norm": 0.8288952112197876, + "learning_rate": 2.7887359812713325e-05, + "loss": 0.0421, + "num_input_tokens_seen": 112954944, + "step": 92815 + }, + { + "epoch": 10.337454059472101, + "grad_norm": 1.8632031679153442, + "learning_rate": 2.788494632077103e-05, + "loss": 0.1033, + "num_input_tokens_seen": 112960832, + "step": 92820 + }, + { + "epoch": 10.338010914355719, + "grad_norm": 0.13025079667568207, + "learning_rate": 2.7882532801578093e-05, + "loss": 0.0123, + "num_input_tokens_seen": 112966976, + "step": 92825 + }, + { + "epoch": 10.338567769239337, + "grad_norm": 0.05197088047862053, + "learning_rate": 2.7880119255157328e-05, + "loss": 0.0228, + "num_input_tokens_seen": 112973088, + "step": 92830 + }, + { + "epoch": 10.339124624122954, + "grad_norm": 0.009555966593325138, + "learning_rate": 2.7877705681531528e-05, + "loss": 0.0306, + "num_input_tokens_seen": 112979520, + "step": 92835 + }, + { + "epoch": 10.33968147900657, + "grad_norm": 0.45787233114242554, + "learning_rate": 2.7875292080723492e-05, + "loss": 0.017, + "num_input_tokens_seen": 112985312, + "step": 92840 + }, + { + "epoch": 10.340238333890188, + "grad_norm": 0.007316064089536667, + "learning_rate": 2.7872878452756022e-05, + "loss": 0.0225, + "num_input_tokens_seen": 112991520, + "step": 92845 + }, + { + "epoch": 10.340795188773805, + "grad_norm": 0.5698518753051758, + "learning_rate": 2.7870464797651914e-05, + "loss": 0.0582, + "num_input_tokens_seen": 112997408, + "step": 92850 + }, + { + "epoch": 10.341352043657423, + "grad_norm": 1.6918559074401855, + "learning_rate": 2.786805111543397e-05, + "loss": 0.0678, + "num_input_tokens_seen": 113003264, + "step": 92855 + }, + { + "epoch": 10.34190889854104, + "grad_norm": 2.7626442909240723, + "learning_rate": 2.7865637406124974e-05, + "loss": 0.0835, + "num_input_tokens_seen": 113009056, + "step": 92860 + }, + { + "epoch": 10.342465753424657, + "grad_norm": 0.4817332327365875, + "learning_rate": 2.786322366974774e-05, + "loss": 0.0331, + "num_input_tokens_seen": 113015328, + "step": 92865 + }, + { + "epoch": 10.343022608308274, + "grad_norm": 0.003523099236190319, + "learning_rate": 2.7860809906325068e-05, + "loss": 0.0217, + "num_input_tokens_seen": 113021600, + "step": 92870 + }, + { + "epoch": 10.343579463191892, + "grad_norm": 0.001492472249083221, + "learning_rate": 2.7858396115879755e-05, + "loss": 0.0016, + "num_input_tokens_seen": 113027712, + "step": 92875 + }, + { + "epoch": 10.34413631807551, + "grad_norm": 0.2039211094379425, + "learning_rate": 2.7855982298434596e-05, + "loss": 0.0945, + "num_input_tokens_seen": 113033952, + "step": 92880 + }, + { + "epoch": 10.344693172959127, + "grad_norm": 0.11113575845956802, + "learning_rate": 2.7853568454012397e-05, + "loss": 0.0811, + "num_input_tokens_seen": 113040064, + "step": 92885 + }, + { + "epoch": 10.345250027842745, + "grad_norm": 0.3176799714565277, + "learning_rate": 2.7851154582635963e-05, + "loss": 0.0629, + "num_input_tokens_seen": 113046016, + "step": 92890 + }, + { + "epoch": 10.345806882726361, + "grad_norm": 0.15194030106067657, + "learning_rate": 2.7848740684328085e-05, + "loss": 0.0259, + "num_input_tokens_seen": 113052224, + "step": 92895 + }, + { + "epoch": 10.346363737609979, + "grad_norm": 0.9059536457061768, + "learning_rate": 2.7846326759111567e-05, + "loss": 0.0414, + "num_input_tokens_seen": 113058560, + "step": 92900 + }, + { + "epoch": 10.346920592493596, + "grad_norm": 0.00031845521880313754, + "learning_rate": 2.784391280700922e-05, + "loss": 0.0101, + "num_input_tokens_seen": 113064608, + "step": 92905 + }, + { + "epoch": 10.347477447377214, + "grad_norm": 0.05173518881201744, + "learning_rate": 2.7841498828043826e-05, + "loss": 0.0712, + "num_input_tokens_seen": 113070592, + "step": 92910 + }, + { + "epoch": 10.348034302260832, + "grad_norm": 0.04168519377708435, + "learning_rate": 2.7839084822238215e-05, + "loss": 0.0221, + "num_input_tokens_seen": 113076640, + "step": 92915 + }, + { + "epoch": 10.348591157144448, + "grad_norm": 0.3234943449497223, + "learning_rate": 2.783667078961516e-05, + "loss": 0.0362, + "num_input_tokens_seen": 113082656, + "step": 92920 + }, + { + "epoch": 10.349148012028065, + "grad_norm": 0.00014934675709810108, + "learning_rate": 2.7834256730197476e-05, + "loss": 0.103, + "num_input_tokens_seen": 113088512, + "step": 92925 + }, + { + "epoch": 10.349704866911683, + "grad_norm": 0.729282796382904, + "learning_rate": 2.7831842644007978e-05, + "loss": 0.038, + "num_input_tokens_seen": 113094080, + "step": 92930 + }, + { + "epoch": 10.3502617217953, + "grad_norm": 2.1597790718078613, + "learning_rate": 2.7829428531069446e-05, + "loss": 0.1095, + "num_input_tokens_seen": 113100576, + "step": 92935 + }, + { + "epoch": 10.350818576678918, + "grad_norm": 0.5225149989128113, + "learning_rate": 2.7827014391404704e-05, + "loss": 0.0282, + "num_input_tokens_seen": 113106784, + "step": 92940 + }, + { + "epoch": 10.351375431562534, + "grad_norm": 0.13982374966144562, + "learning_rate": 2.782460022503654e-05, + "loss": 0.0293, + "num_input_tokens_seen": 113113056, + "step": 92945 + }, + { + "epoch": 10.351932286446152, + "grad_norm": 0.9788869619369507, + "learning_rate": 2.7822186031987767e-05, + "loss": 0.0294, + "num_input_tokens_seen": 113119392, + "step": 92950 + }, + { + "epoch": 10.35248914132977, + "grad_norm": 0.009006880223751068, + "learning_rate": 2.7819771812281183e-05, + "loss": 0.023, + "num_input_tokens_seen": 113125536, + "step": 92955 + }, + { + "epoch": 10.353045996213387, + "grad_norm": 0.06608002632856369, + "learning_rate": 2.7817357565939593e-05, + "loss": 0.0456, + "num_input_tokens_seen": 113131744, + "step": 92960 + }, + { + "epoch": 10.353602851097005, + "grad_norm": 0.02921781875193119, + "learning_rate": 2.781494329298581e-05, + "loss": 0.0014, + "num_input_tokens_seen": 113138208, + "step": 92965 + }, + { + "epoch": 10.35415970598062, + "grad_norm": 1.206286907196045, + "learning_rate": 2.7812528993442628e-05, + "loss": 0.0462, + "num_input_tokens_seen": 113144512, + "step": 92970 + }, + { + "epoch": 10.354716560864238, + "grad_norm": 0.00587435532361269, + "learning_rate": 2.781011466733286e-05, + "loss": 0.056, + "num_input_tokens_seen": 113150208, + "step": 92975 + }, + { + "epoch": 10.355273415747856, + "grad_norm": 0.06295119225978851, + "learning_rate": 2.7807700314679304e-05, + "loss": 0.0383, + "num_input_tokens_seen": 113156480, + "step": 92980 + }, + { + "epoch": 10.355830270631474, + "grad_norm": 0.1391192227602005, + "learning_rate": 2.780528593550477e-05, + "loss": 0.0112, + "num_input_tokens_seen": 113162848, + "step": 92985 + }, + { + "epoch": 10.356387125515091, + "grad_norm": 1.398436427116394, + "learning_rate": 2.7802871529832063e-05, + "loss": 0.0393, + "num_input_tokens_seen": 113168960, + "step": 92990 + }, + { + "epoch": 10.356943980398707, + "grad_norm": 0.043868791311979294, + "learning_rate": 2.7800457097683985e-05, + "loss": 0.0181, + "num_input_tokens_seen": 113174944, + "step": 92995 + }, + { + "epoch": 10.357500835282325, + "grad_norm": 0.02326877787709236, + "learning_rate": 2.7798042639083356e-05, + "loss": 0.0484, + "num_input_tokens_seen": 113181024, + "step": 93000 + }, + { + "epoch": 10.358057690165943, + "grad_norm": 0.008756049908697605, + "learning_rate": 2.779562815405296e-05, + "loss": 0.0005, + "num_input_tokens_seen": 113187232, + "step": 93005 + }, + { + "epoch": 10.35861454504956, + "grad_norm": 0.39953914284706116, + "learning_rate": 2.779321364261563e-05, + "loss": 0.0215, + "num_input_tokens_seen": 113193568, + "step": 93010 + }, + { + "epoch": 10.359171399933178, + "grad_norm": 0.07105234265327454, + "learning_rate": 2.7790799104794146e-05, + "loss": 0.0367, + "num_input_tokens_seen": 113199808, + "step": 93015 + }, + { + "epoch": 10.359728254816794, + "grad_norm": 0.1422143131494522, + "learning_rate": 2.7788384540611334e-05, + "loss": 0.0226, + "num_input_tokens_seen": 113205824, + "step": 93020 + }, + { + "epoch": 10.360285109700412, + "grad_norm": 0.2701050341129303, + "learning_rate": 2.7785969950089996e-05, + "loss": 0.0271, + "num_input_tokens_seen": 113211424, + "step": 93025 + }, + { + "epoch": 10.36084196458403, + "grad_norm": 0.3459244668483734, + "learning_rate": 2.7783555333252943e-05, + "loss": 0.0304, + "num_input_tokens_seen": 113217504, + "step": 93030 + }, + { + "epoch": 10.361398819467647, + "grad_norm": 0.02462952397763729, + "learning_rate": 2.7781140690122974e-05, + "loss": 0.0017, + "num_input_tokens_seen": 113223520, + "step": 93035 + }, + { + "epoch": 10.361955674351265, + "grad_norm": 1.1990973949432373, + "learning_rate": 2.7778726020722905e-05, + "loss": 0.1651, + "num_input_tokens_seen": 113229408, + "step": 93040 + }, + { + "epoch": 10.36251252923488, + "grad_norm": 0.05270816385746002, + "learning_rate": 2.777631132507555e-05, + "loss": 0.0025, + "num_input_tokens_seen": 113235712, + "step": 93045 + }, + { + "epoch": 10.363069384118498, + "grad_norm": 0.30248114466667175, + "learning_rate": 2.7773896603203696e-05, + "loss": 0.0211, + "num_input_tokens_seen": 113241824, + "step": 93050 + }, + { + "epoch": 10.363626239002116, + "grad_norm": 1.5109413862228394, + "learning_rate": 2.777148185513017e-05, + "loss": 0.038, + "num_input_tokens_seen": 113247744, + "step": 93055 + }, + { + "epoch": 10.364183093885734, + "grad_norm": 0.053045984357595444, + "learning_rate": 2.7769067080877787e-05, + "loss": 0.1227, + "num_input_tokens_seen": 113254080, + "step": 93060 + }, + { + "epoch": 10.364739948769351, + "grad_norm": 0.034211717545986176, + "learning_rate": 2.776665228046934e-05, + "loss": 0.0283, + "num_input_tokens_seen": 113260256, + "step": 93065 + }, + { + "epoch": 10.365296803652967, + "grad_norm": 0.8314429521560669, + "learning_rate": 2.776423745392765e-05, + "loss": 0.1492, + "num_input_tokens_seen": 113266048, + "step": 93070 + }, + { + "epoch": 10.365853658536585, + "grad_norm": 2.133232355117798, + "learning_rate": 2.7761822601275515e-05, + "loss": 0.0656, + "num_input_tokens_seen": 113272128, + "step": 93075 + }, + { + "epoch": 10.366410513420202, + "grad_norm": 0.3188242018222809, + "learning_rate": 2.7759407722535763e-05, + "loss": 0.0304, + "num_input_tokens_seen": 113278048, + "step": 93080 + }, + { + "epoch": 10.36696736830382, + "grad_norm": 0.06513009965419769, + "learning_rate": 2.775699281773118e-05, + "loss": 0.1201, + "num_input_tokens_seen": 113283712, + "step": 93085 + }, + { + "epoch": 10.367524223187438, + "grad_norm": 0.7207965850830078, + "learning_rate": 2.7754577886884598e-05, + "loss": 0.0503, + "num_input_tokens_seen": 113289792, + "step": 93090 + }, + { + "epoch": 10.368081078071056, + "grad_norm": 0.006218329071998596, + "learning_rate": 2.7752162930018826e-05, + "loss": 0.0114, + "num_input_tokens_seen": 113296064, + "step": 93095 + }, + { + "epoch": 10.368637932954671, + "grad_norm": 0.0004853932186961174, + "learning_rate": 2.774974794715666e-05, + "loss": 0.0326, + "num_input_tokens_seen": 113302144, + "step": 93100 + }, + { + "epoch": 10.369194787838289, + "grad_norm": 0.3726639449596405, + "learning_rate": 2.7747332938320936e-05, + "loss": 0.1025, + "num_input_tokens_seen": 113308256, + "step": 93105 + }, + { + "epoch": 10.369751642721907, + "grad_norm": 0.11716842651367188, + "learning_rate": 2.774491790353444e-05, + "loss": 0.0431, + "num_input_tokens_seen": 113314432, + "step": 93110 + }, + { + "epoch": 10.370308497605524, + "grad_norm": 1.292385458946228, + "learning_rate": 2.774250284282e-05, + "loss": 0.0492, + "num_input_tokens_seen": 113320384, + "step": 93115 + }, + { + "epoch": 10.370865352489142, + "grad_norm": 0.9889729022979736, + "learning_rate": 2.7740087756200424e-05, + "loss": 0.0534, + "num_input_tokens_seen": 113326688, + "step": 93120 + }, + { + "epoch": 10.371422207372758, + "grad_norm": 0.007386188022792339, + "learning_rate": 2.7737672643698515e-05, + "loss": 0.0024, + "num_input_tokens_seen": 113332992, + "step": 93125 + }, + { + "epoch": 10.371979062256376, + "grad_norm": 0.0002030748873949051, + "learning_rate": 2.7735257505337103e-05, + "loss": 0.022, + "num_input_tokens_seen": 113338944, + "step": 93130 + }, + { + "epoch": 10.372535917139993, + "grad_norm": 0.4746929109096527, + "learning_rate": 2.7732842341138987e-05, + "loss": 0.0937, + "num_input_tokens_seen": 113344864, + "step": 93135 + }, + { + "epoch": 10.373092772023611, + "grad_norm": 0.15075692534446716, + "learning_rate": 2.7730427151126994e-05, + "loss": 0.0093, + "num_input_tokens_seen": 113350976, + "step": 93140 + }, + { + "epoch": 10.373649626907229, + "grad_norm": 0.17656120657920837, + "learning_rate": 2.772801193532392e-05, + "loss": 0.1903, + "num_input_tokens_seen": 113356416, + "step": 93145 + }, + { + "epoch": 10.374206481790845, + "grad_norm": 0.11043761670589447, + "learning_rate": 2.772559669375259e-05, + "loss": 0.0071, + "num_input_tokens_seen": 113362592, + "step": 93150 + }, + { + "epoch": 10.374763336674462, + "grad_norm": 1.7934482097625732, + "learning_rate": 2.7723181426435817e-05, + "loss": 0.1512, + "num_input_tokens_seen": 113368864, + "step": 93155 + }, + { + "epoch": 10.37532019155808, + "grad_norm": 0.0315113365650177, + "learning_rate": 2.7720766133396414e-05, + "loss": 0.0104, + "num_input_tokens_seen": 113375424, + "step": 93160 + }, + { + "epoch": 10.375877046441698, + "grad_norm": 0.0021575281862169504, + "learning_rate": 2.7718350814657202e-05, + "loss": 0.0196, + "num_input_tokens_seen": 113381664, + "step": 93165 + }, + { + "epoch": 10.376433901325315, + "grad_norm": 0.05419548973441124, + "learning_rate": 2.771593547024098e-05, + "loss": 0.0188, + "num_input_tokens_seen": 113387904, + "step": 93170 + }, + { + "epoch": 10.376990756208931, + "grad_norm": 0.0002743789809755981, + "learning_rate": 2.771352010017057e-05, + "loss": 0.0099, + "num_input_tokens_seen": 113394208, + "step": 93175 + }, + { + "epoch": 10.377547611092549, + "grad_norm": 0.058968622237443924, + "learning_rate": 2.7711104704468792e-05, + "loss": 0.0818, + "num_input_tokens_seen": 113400736, + "step": 93180 + }, + { + "epoch": 10.378104465976167, + "grad_norm": 2.318325996398926, + "learning_rate": 2.770868928315845e-05, + "loss": 0.1005, + "num_input_tokens_seen": 113406496, + "step": 93185 + }, + { + "epoch": 10.378661320859784, + "grad_norm": 0.1273142248392105, + "learning_rate": 2.770627383626238e-05, + "loss": 0.0405, + "num_input_tokens_seen": 113412384, + "step": 93190 + }, + { + "epoch": 10.379218175743402, + "grad_norm": 0.1719844788312912, + "learning_rate": 2.770385836380338e-05, + "loss": 0.1075, + "num_input_tokens_seen": 113418368, + "step": 93195 + }, + { + "epoch": 10.379775030627018, + "grad_norm": 2.8259334564208984, + "learning_rate": 2.7701442865804272e-05, + "loss": 0.0478, + "num_input_tokens_seen": 113424480, + "step": 93200 + }, + { + "epoch": 10.380331885510635, + "grad_norm": 0.00096701126312837, + "learning_rate": 2.769902734228787e-05, + "loss": 0.0199, + "num_input_tokens_seen": 113430560, + "step": 93205 + }, + { + "epoch": 10.380888740394253, + "grad_norm": 0.013525918126106262, + "learning_rate": 2.7696611793277e-05, + "loss": 0.0028, + "num_input_tokens_seen": 113436800, + "step": 93210 + }, + { + "epoch": 10.38144559527787, + "grad_norm": 1.0323344469070435, + "learning_rate": 2.769419621879446e-05, + "loss": 0.1377, + "num_input_tokens_seen": 113442272, + "step": 93215 + }, + { + "epoch": 10.382002450161488, + "grad_norm": 0.23019851744174957, + "learning_rate": 2.7691780618863085e-05, + "loss": 0.0633, + "num_input_tokens_seen": 113447904, + "step": 93220 + }, + { + "epoch": 10.382559305045106, + "grad_norm": 0.8611365556716919, + "learning_rate": 2.768936499350569e-05, + "loss": 0.0574, + "num_input_tokens_seen": 113454208, + "step": 93225 + }, + { + "epoch": 10.383116159928722, + "grad_norm": 0.10886753350496292, + "learning_rate": 2.7686949342745077e-05, + "loss": 0.0744, + "num_input_tokens_seen": 113460416, + "step": 93230 + }, + { + "epoch": 10.38367301481234, + "grad_norm": 0.0003373775689397007, + "learning_rate": 2.768453366660408e-05, + "loss": 0.008, + "num_input_tokens_seen": 113466336, + "step": 93235 + }, + { + "epoch": 10.384229869695957, + "grad_norm": 0.2020307034254074, + "learning_rate": 2.7682117965105515e-05, + "loss": 0.0092, + "num_input_tokens_seen": 113471872, + "step": 93240 + }, + { + "epoch": 10.384786724579575, + "grad_norm": 0.7830066084861755, + "learning_rate": 2.7679702238272188e-05, + "loss": 0.0233, + "num_input_tokens_seen": 113477280, + "step": 93245 + }, + { + "epoch": 10.385343579463193, + "grad_norm": 0.00017474903143011034, + "learning_rate": 2.7677286486126935e-05, + "loss": 0.0405, + "num_input_tokens_seen": 113483168, + "step": 93250 + }, + { + "epoch": 10.385900434346809, + "grad_norm": 0.352262020111084, + "learning_rate": 2.7674870708692558e-05, + "loss": 0.0169, + "num_input_tokens_seen": 113489536, + "step": 93255 + }, + { + "epoch": 10.386457289230426, + "grad_norm": 0.3467409014701843, + "learning_rate": 2.7672454905991896e-05, + "loss": 0.0119, + "num_input_tokens_seen": 113495552, + "step": 93260 + }, + { + "epoch": 10.387014144114044, + "grad_norm": 0.02409827709197998, + "learning_rate": 2.7670039078047745e-05, + "loss": 0.0176, + "num_input_tokens_seen": 113501792, + "step": 93265 + }, + { + "epoch": 10.387570998997662, + "grad_norm": 0.21851177513599396, + "learning_rate": 2.7667623224882937e-05, + "loss": 0.0276, + "num_input_tokens_seen": 113508064, + "step": 93270 + }, + { + "epoch": 10.38812785388128, + "grad_norm": 0.0053461892530322075, + "learning_rate": 2.7665207346520294e-05, + "loss": 0.0139, + "num_input_tokens_seen": 113514048, + "step": 93275 + }, + { + "epoch": 10.388684708764895, + "grad_norm": 0.5866061449050903, + "learning_rate": 2.7662791442982627e-05, + "loss": 0.0206, + "num_input_tokens_seen": 113520480, + "step": 93280 + }, + { + "epoch": 10.389241563648513, + "grad_norm": 0.527979850769043, + "learning_rate": 2.766037551429277e-05, + "loss": 0.0906, + "num_input_tokens_seen": 113526752, + "step": 93285 + }, + { + "epoch": 10.38979841853213, + "grad_norm": 0.10014509409666061, + "learning_rate": 2.765795956047353e-05, + "loss": 0.0258, + "num_input_tokens_seen": 113532800, + "step": 93290 + }, + { + "epoch": 10.390355273415748, + "grad_norm": 0.26246386766433716, + "learning_rate": 2.7655543581547737e-05, + "loss": 0.1043, + "num_input_tokens_seen": 113538560, + "step": 93295 + }, + { + "epoch": 10.390912128299366, + "grad_norm": 0.5773049592971802, + "learning_rate": 2.7653127577538202e-05, + "loss": 0.0052, + "num_input_tokens_seen": 113544384, + "step": 93300 + }, + { + "epoch": 10.391468983182982, + "grad_norm": 0.0024457403924316168, + "learning_rate": 2.7650711548467744e-05, + "loss": 0.0609, + "num_input_tokens_seen": 113550720, + "step": 93305 + }, + { + "epoch": 10.3920258380666, + "grad_norm": 0.8085178136825562, + "learning_rate": 2.7648295494359206e-05, + "loss": 0.0296, + "num_input_tokens_seen": 113556224, + "step": 93310 + }, + { + "epoch": 10.392582692950217, + "grad_norm": 0.00031504774233326316, + "learning_rate": 2.7645879415235386e-05, + "loss": 0.0674, + "num_input_tokens_seen": 113562464, + "step": 93315 + }, + { + "epoch": 10.393139547833835, + "grad_norm": 1.1917022466659546, + "learning_rate": 2.764346331111912e-05, + "loss": 0.1122, + "num_input_tokens_seen": 113568544, + "step": 93320 + }, + { + "epoch": 10.393696402717453, + "grad_norm": 0.024151967838406563, + "learning_rate": 2.7641047182033225e-05, + "loss": 0.0504, + "num_input_tokens_seen": 113574528, + "step": 93325 + }, + { + "epoch": 10.394253257601068, + "grad_norm": 0.006693241186439991, + "learning_rate": 2.7638631028000515e-05, + "loss": 0.0222, + "num_input_tokens_seen": 113580480, + "step": 93330 + }, + { + "epoch": 10.394810112484686, + "grad_norm": 1.4586642980575562, + "learning_rate": 2.7636214849043834e-05, + "loss": 0.1653, + "num_input_tokens_seen": 113585728, + "step": 93335 + }, + { + "epoch": 10.395366967368304, + "grad_norm": 0.00570065900683403, + "learning_rate": 2.7633798645185986e-05, + "loss": 0.0021, + "num_input_tokens_seen": 113592096, + "step": 93340 + }, + { + "epoch": 10.395923822251921, + "grad_norm": 0.39005187153816223, + "learning_rate": 2.76313824164498e-05, + "loss": 0.0062, + "num_input_tokens_seen": 113598496, + "step": 93345 + }, + { + "epoch": 10.39648067713554, + "grad_norm": 0.8309157490730286, + "learning_rate": 2.76289661628581e-05, + "loss": 0.1062, + "num_input_tokens_seen": 113604640, + "step": 93350 + }, + { + "epoch": 10.397037532019155, + "grad_norm": 0.005977937486022711, + "learning_rate": 2.7626549884433705e-05, + "loss": 0.037, + "num_input_tokens_seen": 113610912, + "step": 93355 + }, + { + "epoch": 10.397594386902773, + "grad_norm": 0.2653365135192871, + "learning_rate": 2.762413358119944e-05, + "loss": 0.0436, + "num_input_tokens_seen": 113617248, + "step": 93360 + }, + { + "epoch": 10.39815124178639, + "grad_norm": 0.1078672707080841, + "learning_rate": 2.7621717253178138e-05, + "loss": 0.0174, + "num_input_tokens_seen": 113623296, + "step": 93365 + }, + { + "epoch": 10.398708096670008, + "grad_norm": 0.03626357391476631, + "learning_rate": 2.7619300900392613e-05, + "loss": 0.1117, + "num_input_tokens_seen": 113629440, + "step": 93370 + }, + { + "epoch": 10.399264951553626, + "grad_norm": 0.7457846999168396, + "learning_rate": 2.761688452286569e-05, + "loss": 0.0395, + "num_input_tokens_seen": 113635808, + "step": 93375 + }, + { + "epoch": 10.399821806437242, + "grad_norm": 0.023025499656796455, + "learning_rate": 2.7614468120620203e-05, + "loss": 0.0445, + "num_input_tokens_seen": 113642080, + "step": 93380 + }, + { + "epoch": 10.40037866132086, + "grad_norm": 1.532791256904602, + "learning_rate": 2.761205169367896e-05, + "loss": 0.2184, + "num_input_tokens_seen": 113647424, + "step": 93385 + }, + { + "epoch": 10.400935516204477, + "grad_norm": 0.2704774737358093, + "learning_rate": 2.76096352420648e-05, + "loss": 0.0861, + "num_input_tokens_seen": 113653760, + "step": 93390 + }, + { + "epoch": 10.401492371088095, + "grad_norm": 2.1178221702575684, + "learning_rate": 2.7607218765800548e-05, + "loss": 0.0885, + "num_input_tokens_seen": 113660064, + "step": 93395 + }, + { + "epoch": 10.402049225971712, + "grad_norm": 0.004731941502541304, + "learning_rate": 2.7604802264909018e-05, + "loss": 0.076, + "num_input_tokens_seen": 113665280, + "step": 93400 + }, + { + "epoch": 10.402606080855328, + "grad_norm": 0.08805692195892334, + "learning_rate": 2.760238573941305e-05, + "loss": 0.0355, + "num_input_tokens_seen": 113671200, + "step": 93405 + }, + { + "epoch": 10.403162935738946, + "grad_norm": 0.1261848658323288, + "learning_rate": 2.759996918933546e-05, + "loss": 0.0778, + "num_input_tokens_seen": 113677600, + "step": 93410 + }, + { + "epoch": 10.403719790622564, + "grad_norm": 0.1417936235666275, + "learning_rate": 2.7597552614699084e-05, + "loss": 0.0852, + "num_input_tokens_seen": 113683520, + "step": 93415 + }, + { + "epoch": 10.404276645506181, + "grad_norm": 1.4855573177337646, + "learning_rate": 2.759513601552674e-05, + "loss": 0.036, + "num_input_tokens_seen": 113689536, + "step": 93420 + }, + { + "epoch": 10.404833500389799, + "grad_norm": 0.34307757019996643, + "learning_rate": 2.7592719391841253e-05, + "loss": 0.0084, + "num_input_tokens_seen": 113695808, + "step": 93425 + }, + { + "epoch": 10.405390355273417, + "grad_norm": 0.00015030484064482152, + "learning_rate": 2.759030274366546e-05, + "loss": 0.0012, + "num_input_tokens_seen": 113701600, + "step": 93430 + }, + { + "epoch": 10.405947210157033, + "grad_norm": 0.0006724482518620789, + "learning_rate": 2.758788607102218e-05, + "loss": 0.0835, + "num_input_tokens_seen": 113707776, + "step": 93435 + }, + { + "epoch": 10.40650406504065, + "grad_norm": 0.03235509991645813, + "learning_rate": 2.7585469373934242e-05, + "loss": 0.0049, + "num_input_tokens_seen": 113713824, + "step": 93440 + }, + { + "epoch": 10.407060919924268, + "grad_norm": 1.1929235458374023, + "learning_rate": 2.7583052652424474e-05, + "loss": 0.0718, + "num_input_tokens_seen": 113720128, + "step": 93445 + }, + { + "epoch": 10.407617774807886, + "grad_norm": 0.8858451247215271, + "learning_rate": 2.7580635906515704e-05, + "loss": 0.0183, + "num_input_tokens_seen": 113726432, + "step": 93450 + }, + { + "epoch": 10.408174629691503, + "grad_norm": 0.005426988936960697, + "learning_rate": 2.757821913623076e-05, + "loss": 0.0569, + "num_input_tokens_seen": 113732384, + "step": 93455 + }, + { + "epoch": 10.408731484575119, + "grad_norm": 0.004384133964776993, + "learning_rate": 2.7575802341592467e-05, + "loss": 0.0084, + "num_input_tokens_seen": 113738464, + "step": 93460 + }, + { + "epoch": 10.409288339458737, + "grad_norm": 0.5231144428253174, + "learning_rate": 2.7573385522623667e-05, + "loss": 0.0223, + "num_input_tokens_seen": 113744608, + "step": 93465 + }, + { + "epoch": 10.409845194342354, + "grad_norm": 0.6360700130462646, + "learning_rate": 2.757096867934717e-05, + "loss": 0.1153, + "num_input_tokens_seen": 113751072, + "step": 93470 + }, + { + "epoch": 10.410402049225972, + "grad_norm": 0.5064123868942261, + "learning_rate": 2.756855181178582e-05, + "loss": 0.0686, + "num_input_tokens_seen": 113757312, + "step": 93475 + }, + { + "epoch": 10.41095890410959, + "grad_norm": 1.0474612712860107, + "learning_rate": 2.756613491996244e-05, + "loss": 0.0218, + "num_input_tokens_seen": 113763328, + "step": 93480 + }, + { + "epoch": 10.411515758993206, + "grad_norm": 0.06274653971195221, + "learning_rate": 2.756371800389986e-05, + "loss": 0.0775, + "num_input_tokens_seen": 113769312, + "step": 93485 + }, + { + "epoch": 10.412072613876823, + "grad_norm": 0.017638586461544037, + "learning_rate": 2.7561301063620905e-05, + "loss": 0.0433, + "num_input_tokens_seen": 113775648, + "step": 93490 + }, + { + "epoch": 10.412629468760441, + "grad_norm": 0.033136505633592606, + "learning_rate": 2.755888409914841e-05, + "loss": 0.0608, + "num_input_tokens_seen": 113781792, + "step": 93495 + }, + { + "epoch": 10.413186323644059, + "grad_norm": 0.20351901650428772, + "learning_rate": 2.75564671105052e-05, + "loss": 0.0072, + "num_input_tokens_seen": 113787840, + "step": 93500 + }, + { + "epoch": 10.413743178527676, + "grad_norm": 1.210222840309143, + "learning_rate": 2.7554050097714118e-05, + "loss": 0.0521, + "num_input_tokens_seen": 113793856, + "step": 93505 + }, + { + "epoch": 10.414300033411292, + "grad_norm": 0.027398960664868355, + "learning_rate": 2.7551633060797988e-05, + "loss": 0.0406, + "num_input_tokens_seen": 113799936, + "step": 93510 + }, + { + "epoch": 10.41485688829491, + "grad_norm": 1.196349859237671, + "learning_rate": 2.7549215999779633e-05, + "loss": 0.2004, + "num_input_tokens_seen": 113805952, + "step": 93515 + }, + { + "epoch": 10.415413743178528, + "grad_norm": 0.7201687693595886, + "learning_rate": 2.7546798914681894e-05, + "loss": 0.0192, + "num_input_tokens_seen": 113812192, + "step": 93520 + }, + { + "epoch": 10.415970598062145, + "grad_norm": 0.5165740847587585, + "learning_rate": 2.7544381805527596e-05, + "loss": 0.0095, + "num_input_tokens_seen": 113818272, + "step": 93525 + }, + { + "epoch": 10.416527452945763, + "grad_norm": 0.13439621031284332, + "learning_rate": 2.7541964672339578e-05, + "loss": 0.0316, + "num_input_tokens_seen": 113824128, + "step": 93530 + }, + { + "epoch": 10.417084307829379, + "grad_norm": 0.16837619245052338, + "learning_rate": 2.7539547515140663e-05, + "loss": 0.1143, + "num_input_tokens_seen": 113830368, + "step": 93535 + }, + { + "epoch": 10.417641162712997, + "grad_norm": 0.00428277300670743, + "learning_rate": 2.7537130333953686e-05, + "loss": 0.0117, + "num_input_tokens_seen": 113836384, + "step": 93540 + }, + { + "epoch": 10.418198017596614, + "grad_norm": 0.25245538353919983, + "learning_rate": 2.7534713128801488e-05, + "loss": 0.0117, + "num_input_tokens_seen": 113842272, + "step": 93545 + }, + { + "epoch": 10.418754872480232, + "grad_norm": 0.0009974915301427245, + "learning_rate": 2.7532295899706884e-05, + "loss": 0.0267, + "num_input_tokens_seen": 113848448, + "step": 93550 + }, + { + "epoch": 10.41931172736385, + "grad_norm": 0.6526152491569519, + "learning_rate": 2.752987864669272e-05, + "loss": 0.0879, + "num_input_tokens_seen": 113853952, + "step": 93555 + }, + { + "epoch": 10.419868582247465, + "grad_norm": 1.102063536643982, + "learning_rate": 2.7527461369781832e-05, + "loss": 0.1122, + "num_input_tokens_seen": 113859616, + "step": 93560 + }, + { + "epoch": 10.420425437131083, + "grad_norm": 1.8723456859588623, + "learning_rate": 2.7525044068997036e-05, + "loss": 0.1186, + "num_input_tokens_seen": 113865568, + "step": 93565 + }, + { + "epoch": 10.4209822920147, + "grad_norm": 3.2430546283721924, + "learning_rate": 2.7522626744361184e-05, + "loss": 0.1233, + "num_input_tokens_seen": 113871360, + "step": 93570 + }, + { + "epoch": 10.421539146898319, + "grad_norm": 0.5256139039993286, + "learning_rate": 2.7520209395897097e-05, + "loss": 0.0418, + "num_input_tokens_seen": 113876960, + "step": 93575 + }, + { + "epoch": 10.422096001781936, + "grad_norm": 0.7177392244338989, + "learning_rate": 2.7517792023627616e-05, + "loss": 0.0264, + "num_input_tokens_seen": 113882912, + "step": 93580 + }, + { + "epoch": 10.422652856665554, + "grad_norm": 0.5249438285827637, + "learning_rate": 2.7515374627575567e-05, + "loss": 0.0736, + "num_input_tokens_seen": 113888576, + "step": 93585 + }, + { + "epoch": 10.42320971154917, + "grad_norm": 0.01563384011387825, + "learning_rate": 2.751295720776379e-05, + "loss": 0.0018, + "num_input_tokens_seen": 113894784, + "step": 93590 + }, + { + "epoch": 10.423766566432787, + "grad_norm": 2.0843710899353027, + "learning_rate": 2.7510539764215128e-05, + "loss": 0.1005, + "num_input_tokens_seen": 113900864, + "step": 93595 + }, + { + "epoch": 10.424323421316405, + "grad_norm": 0.007881044410169125, + "learning_rate": 2.7508122296952393e-05, + "loss": 0.0129, + "num_input_tokens_seen": 113907136, + "step": 93600 + }, + { + "epoch": 10.424880276200023, + "grad_norm": 0.007636151276528835, + "learning_rate": 2.7505704805998444e-05, + "loss": 0.0401, + "num_input_tokens_seen": 113913216, + "step": 93605 + }, + { + "epoch": 10.42543713108364, + "grad_norm": 0.9659072756767273, + "learning_rate": 2.75032872913761e-05, + "loss": 0.0655, + "num_input_tokens_seen": 113919232, + "step": 93610 + }, + { + "epoch": 10.425993985967256, + "grad_norm": 0.014777419157326221, + "learning_rate": 2.7500869753108206e-05, + "loss": 0.0279, + "num_input_tokens_seen": 113924928, + "step": 93615 + }, + { + "epoch": 10.426550840850874, + "grad_norm": 0.04754889756441116, + "learning_rate": 2.749845219121759e-05, + "loss": 0.0041, + "num_input_tokens_seen": 113931200, + "step": 93620 + }, + { + "epoch": 10.427107695734492, + "grad_norm": 0.5703436136245728, + "learning_rate": 2.749603460572709e-05, + "loss": 0.0614, + "num_input_tokens_seen": 113937408, + "step": 93625 + }, + { + "epoch": 10.42766455061811, + "grad_norm": 0.0750032365322113, + "learning_rate": 2.7493616996659543e-05, + "loss": 0.002, + "num_input_tokens_seen": 113943424, + "step": 93630 + }, + { + "epoch": 10.428221405501727, + "grad_norm": 0.007515369914472103, + "learning_rate": 2.7491199364037796e-05, + "loss": 0.031, + "num_input_tokens_seen": 113949792, + "step": 93635 + }, + { + "epoch": 10.428778260385343, + "grad_norm": 0.0062655117362737656, + "learning_rate": 2.7488781707884663e-05, + "loss": 0.0382, + "num_input_tokens_seen": 113956032, + "step": 93640 + }, + { + "epoch": 10.42933511526896, + "grad_norm": 0.004462277516722679, + "learning_rate": 2.7486364028223e-05, + "loss": 0.0348, + "num_input_tokens_seen": 113962048, + "step": 93645 + }, + { + "epoch": 10.429891970152578, + "grad_norm": 0.0009508506627753377, + "learning_rate": 2.748394632507563e-05, + "loss": 0.0098, + "num_input_tokens_seen": 113968192, + "step": 93650 + }, + { + "epoch": 10.430448825036196, + "grad_norm": 0.2848353981971741, + "learning_rate": 2.7481528598465407e-05, + "loss": 0.0314, + "num_input_tokens_seen": 113974400, + "step": 93655 + }, + { + "epoch": 10.431005679919814, + "grad_norm": 0.03142571449279785, + "learning_rate": 2.7479110848415146e-05, + "loss": 0.015, + "num_input_tokens_seen": 113980480, + "step": 93660 + }, + { + "epoch": 10.43156253480343, + "grad_norm": 0.15593960881233215, + "learning_rate": 2.7476693074947706e-05, + "loss": 0.173, + "num_input_tokens_seen": 113986816, + "step": 93665 + }, + { + "epoch": 10.432119389687047, + "grad_norm": 0.26019158959388733, + "learning_rate": 2.7474275278085915e-05, + "loss": 0.0075, + "num_input_tokens_seen": 113992896, + "step": 93670 + }, + { + "epoch": 10.432676244570665, + "grad_norm": 0.0007969929138198495, + "learning_rate": 2.7471857457852607e-05, + "loss": 0.1327, + "num_input_tokens_seen": 113999360, + "step": 93675 + }, + { + "epoch": 10.433233099454283, + "grad_norm": 0.000900476414244622, + "learning_rate": 2.7469439614270626e-05, + "loss": 0.0166, + "num_input_tokens_seen": 114005152, + "step": 93680 + }, + { + "epoch": 10.4337899543379, + "grad_norm": 0.36581921577453613, + "learning_rate": 2.7467021747362808e-05, + "loss": 0.0117, + "num_input_tokens_seen": 114011072, + "step": 93685 + }, + { + "epoch": 10.434346809221516, + "grad_norm": 0.009902079589664936, + "learning_rate": 2.7464603857152e-05, + "loss": 0.0008, + "num_input_tokens_seen": 114017024, + "step": 93690 + }, + { + "epoch": 10.434903664105134, + "grad_norm": 0.38296476006507874, + "learning_rate": 2.7462185943661028e-05, + "loss": 0.0138, + "num_input_tokens_seen": 114023264, + "step": 93695 + }, + { + "epoch": 10.435460518988751, + "grad_norm": 0.3791390061378479, + "learning_rate": 2.7459768006912734e-05, + "loss": 0.0385, + "num_input_tokens_seen": 114029312, + "step": 93700 + }, + { + "epoch": 10.43601737387237, + "grad_norm": 0.16807548701763153, + "learning_rate": 2.7457350046929968e-05, + "loss": 0.0194, + "num_input_tokens_seen": 114034880, + "step": 93705 + }, + { + "epoch": 10.436574228755987, + "grad_norm": 0.11554154008626938, + "learning_rate": 2.7454932063735554e-05, + "loss": 0.1001, + "num_input_tokens_seen": 114040800, + "step": 93710 + }, + { + "epoch": 10.437131083639603, + "grad_norm": 0.00495103420689702, + "learning_rate": 2.7452514057352354e-05, + "loss": 0.1139, + "num_input_tokens_seen": 114047072, + "step": 93715 + }, + { + "epoch": 10.43768793852322, + "grad_norm": 0.04862958937883377, + "learning_rate": 2.7450096027803178e-05, + "loss": 0.0037, + "num_input_tokens_seen": 114053312, + "step": 93720 + }, + { + "epoch": 10.438244793406838, + "grad_norm": 0.00023870203585829586, + "learning_rate": 2.744767797511089e-05, + "loss": 0.0325, + "num_input_tokens_seen": 114059488, + "step": 93725 + }, + { + "epoch": 10.438801648290456, + "grad_norm": 0.00026648803032003343, + "learning_rate": 2.744525989929832e-05, + "loss": 0.0416, + "num_input_tokens_seen": 114066112, + "step": 93730 + }, + { + "epoch": 10.439358503174073, + "grad_norm": 0.13395485281944275, + "learning_rate": 2.744284180038831e-05, + "loss": 0.0853, + "num_input_tokens_seen": 114071744, + "step": 93735 + }, + { + "epoch": 10.43991535805769, + "grad_norm": 0.0009384212316945195, + "learning_rate": 2.744042367840371e-05, + "loss": 0.0593, + "num_input_tokens_seen": 114077856, + "step": 93740 + }, + { + "epoch": 10.440472212941307, + "grad_norm": 0.07763194292783737, + "learning_rate": 2.7438005533367344e-05, + "loss": 0.0913, + "num_input_tokens_seen": 114083776, + "step": 93745 + }, + { + "epoch": 10.441029067824925, + "grad_norm": 0.011749638244509697, + "learning_rate": 2.7435587365302067e-05, + "loss": 0.093, + "num_input_tokens_seen": 114090048, + "step": 93750 + }, + { + "epoch": 10.441585922708542, + "grad_norm": 1.0794230699539185, + "learning_rate": 2.7433169174230712e-05, + "loss": 0.0245, + "num_input_tokens_seen": 114096448, + "step": 93755 + }, + { + "epoch": 10.44214277759216, + "grad_norm": 1.0183727741241455, + "learning_rate": 2.7430750960176134e-05, + "loss": 0.1062, + "num_input_tokens_seen": 114102560, + "step": 93760 + }, + { + "epoch": 10.442699632475776, + "grad_norm": 1.3220117092132568, + "learning_rate": 2.7428332723161155e-05, + "loss": 0.0753, + "num_input_tokens_seen": 114108224, + "step": 93765 + }, + { + "epoch": 10.443256487359394, + "grad_norm": 0.27385616302490234, + "learning_rate": 2.742591446320863e-05, + "loss": 0.0026, + "num_input_tokens_seen": 114114496, + "step": 93770 + }, + { + "epoch": 10.443813342243011, + "grad_norm": 0.08843035995960236, + "learning_rate": 2.74234961803414e-05, + "loss": 0.0293, + "num_input_tokens_seen": 114120384, + "step": 93775 + }, + { + "epoch": 10.444370197126629, + "grad_norm": 0.0029347697272896767, + "learning_rate": 2.742107787458231e-05, + "loss": 0.013, + "num_input_tokens_seen": 114126528, + "step": 93780 + }, + { + "epoch": 10.444927052010247, + "grad_norm": 1.126259446144104, + "learning_rate": 2.7418659545954202e-05, + "loss": 0.0525, + "num_input_tokens_seen": 114132224, + "step": 93785 + }, + { + "epoch": 10.445483906893864, + "grad_norm": 0.45474863052368164, + "learning_rate": 2.741624119447991e-05, + "loss": 0.0332, + "num_input_tokens_seen": 114138592, + "step": 93790 + }, + { + "epoch": 10.44604076177748, + "grad_norm": 0.20501147210597992, + "learning_rate": 2.741382282018229e-05, + "loss": 0.0235, + "num_input_tokens_seen": 114144928, + "step": 93795 + }, + { + "epoch": 10.446597616661098, + "grad_norm": 0.012047874741256237, + "learning_rate": 2.7411404423084176e-05, + "loss": 0.002, + "num_input_tokens_seen": 114150976, + "step": 93800 + }, + { + "epoch": 10.447154471544716, + "grad_norm": 0.3626602590084076, + "learning_rate": 2.7408986003208408e-05, + "loss": 0.1026, + "num_input_tokens_seen": 114157120, + "step": 93805 + }, + { + "epoch": 10.447711326428333, + "grad_norm": 0.005662004463374615, + "learning_rate": 2.740656756057785e-05, + "loss": 0.0421, + "num_input_tokens_seen": 114163424, + "step": 93810 + }, + { + "epoch": 10.448268181311951, + "grad_norm": 1.2485723495483398, + "learning_rate": 2.7404149095215324e-05, + "loss": 0.0789, + "num_input_tokens_seen": 114169536, + "step": 93815 + }, + { + "epoch": 10.448825036195567, + "grad_norm": 0.17491215467453003, + "learning_rate": 2.7401730607143694e-05, + "loss": 0.0052, + "num_input_tokens_seen": 114175200, + "step": 93820 + }, + { + "epoch": 10.449381891079184, + "grad_norm": 0.0073731583543121815, + "learning_rate": 2.7399312096385783e-05, + "loss": 0.0377, + "num_input_tokens_seen": 114181632, + "step": 93825 + }, + { + "epoch": 10.449938745962802, + "grad_norm": 0.03015439212322235, + "learning_rate": 2.739689356296445e-05, + "loss": 0.023, + "num_input_tokens_seen": 114187840, + "step": 93830 + }, + { + "epoch": 10.45049560084642, + "grad_norm": 0.606009304523468, + "learning_rate": 2.739447500690254e-05, + "loss": 0.0081, + "num_input_tokens_seen": 114194368, + "step": 93835 + }, + { + "epoch": 10.451052455730037, + "grad_norm": 0.16359837353229523, + "learning_rate": 2.739205642822289e-05, + "loss": 0.0295, + "num_input_tokens_seen": 114200672, + "step": 93840 + }, + { + "epoch": 10.451609310613653, + "grad_norm": 0.002503184136003256, + "learning_rate": 2.738963782694836e-05, + "loss": 0.0146, + "num_input_tokens_seen": 114207296, + "step": 93845 + }, + { + "epoch": 10.452166165497271, + "grad_norm": 0.2498418539762497, + "learning_rate": 2.7387219203101777e-05, + "loss": 0.0302, + "num_input_tokens_seen": 114213152, + "step": 93850 + }, + { + "epoch": 10.452723020380889, + "grad_norm": 0.07418037950992584, + "learning_rate": 2.7384800556706004e-05, + "loss": 0.0104, + "num_input_tokens_seen": 114218784, + "step": 93855 + }, + { + "epoch": 10.453279875264506, + "grad_norm": 0.27259406447410583, + "learning_rate": 2.738238188778387e-05, + "loss": 0.1051, + "num_input_tokens_seen": 114225184, + "step": 93860 + }, + { + "epoch": 10.453836730148124, + "grad_norm": 0.7286447286605835, + "learning_rate": 2.7379963196358233e-05, + "loss": 0.0363, + "num_input_tokens_seen": 114231392, + "step": 93865 + }, + { + "epoch": 10.45439358503174, + "grad_norm": 0.15601874887943268, + "learning_rate": 2.7377544482451946e-05, + "loss": 0.0337, + "num_input_tokens_seen": 114237664, + "step": 93870 + }, + { + "epoch": 10.454950439915358, + "grad_norm": 0.5396082401275635, + "learning_rate": 2.7375125746087836e-05, + "loss": 0.075, + "num_input_tokens_seen": 114243552, + "step": 93875 + }, + { + "epoch": 10.455507294798975, + "grad_norm": 0.6591966152191162, + "learning_rate": 2.7372706987288765e-05, + "loss": 0.1157, + "num_input_tokens_seen": 114249632, + "step": 93880 + }, + { + "epoch": 10.456064149682593, + "grad_norm": 0.026155395433306694, + "learning_rate": 2.737028820607757e-05, + "loss": 0.0437, + "num_input_tokens_seen": 114255488, + "step": 93885 + }, + { + "epoch": 10.45662100456621, + "grad_norm": 0.33853819966316223, + "learning_rate": 2.7367869402477115e-05, + "loss": 0.0156, + "num_input_tokens_seen": 114261600, + "step": 93890 + }, + { + "epoch": 10.457177859449827, + "grad_norm": 0.3169662654399872, + "learning_rate": 2.7365450576510225e-05, + "loss": 0.0578, + "num_input_tokens_seen": 114267808, + "step": 93895 + }, + { + "epoch": 10.457734714333444, + "grad_norm": 0.10570594668388367, + "learning_rate": 2.736303172819976e-05, + "loss": 0.0319, + "num_input_tokens_seen": 114274208, + "step": 93900 + }, + { + "epoch": 10.458291569217062, + "grad_norm": 0.007809388916939497, + "learning_rate": 2.7360612857568573e-05, + "loss": 0.0877, + "num_input_tokens_seen": 114279936, + "step": 93905 + }, + { + "epoch": 10.45884842410068, + "grad_norm": 2.3335044384002686, + "learning_rate": 2.7358193964639507e-05, + "loss": 0.1199, + "num_input_tokens_seen": 114286080, + "step": 93910 + }, + { + "epoch": 10.459405278984297, + "grad_norm": 0.5964556336402893, + "learning_rate": 2.7355775049435406e-05, + "loss": 0.081, + "num_input_tokens_seen": 114291360, + "step": 93915 + }, + { + "epoch": 10.459962133867913, + "grad_norm": 1.0307875871658325, + "learning_rate": 2.7353356111979122e-05, + "loss": 0.0937, + "num_input_tokens_seen": 114297536, + "step": 93920 + }, + { + "epoch": 10.46051898875153, + "grad_norm": 0.011389887891709805, + "learning_rate": 2.7350937152293506e-05, + "loss": 0.051, + "num_input_tokens_seen": 114303488, + "step": 93925 + }, + { + "epoch": 10.461075843635149, + "grad_norm": 0.002219582675024867, + "learning_rate": 2.7348518170401406e-05, + "loss": 0.0198, + "num_input_tokens_seen": 114309504, + "step": 93930 + }, + { + "epoch": 10.461632698518766, + "grad_norm": 0.03911472111940384, + "learning_rate": 2.7346099166325663e-05, + "loss": 0.0014, + "num_input_tokens_seen": 114315584, + "step": 93935 + }, + { + "epoch": 10.462189553402384, + "grad_norm": 0.031498633325099945, + "learning_rate": 2.734368014008914e-05, + "loss": 0.0648, + "num_input_tokens_seen": 114321088, + "step": 93940 + }, + { + "epoch": 10.462746408286002, + "grad_norm": 0.49827849864959717, + "learning_rate": 2.7341261091714676e-05, + "loss": 0.0596, + "num_input_tokens_seen": 114327040, + "step": 93945 + }, + { + "epoch": 10.463303263169617, + "grad_norm": 0.07571117579936981, + "learning_rate": 2.7338842021225136e-05, + "loss": 0.0305, + "num_input_tokens_seen": 114332192, + "step": 93950 + }, + { + "epoch": 10.463860118053235, + "grad_norm": 0.05081783980131149, + "learning_rate": 2.7336422928643347e-05, + "loss": 0.0124, + "num_input_tokens_seen": 114338368, + "step": 93955 + }, + { + "epoch": 10.464416972936853, + "grad_norm": 1.9650038480758667, + "learning_rate": 2.7334003813992175e-05, + "loss": 0.0621, + "num_input_tokens_seen": 114344480, + "step": 93960 + }, + { + "epoch": 10.46497382782047, + "grad_norm": 0.004314091056585312, + "learning_rate": 2.733158467729447e-05, + "loss": 0.0396, + "num_input_tokens_seen": 114350560, + "step": 93965 + }, + { + "epoch": 10.465530682704088, + "grad_norm": 1.7060211896896362, + "learning_rate": 2.7329165518573076e-05, + "loss": 0.0678, + "num_input_tokens_seen": 114357120, + "step": 93970 + }, + { + "epoch": 10.466087537587704, + "grad_norm": 0.00018943852046504617, + "learning_rate": 2.732674633785085e-05, + "loss": 0.1478, + "num_input_tokens_seen": 114363392, + "step": 93975 + }, + { + "epoch": 10.466644392471322, + "grad_norm": 1.672871708869934, + "learning_rate": 2.732432713515064e-05, + "loss": 0.1412, + "num_input_tokens_seen": 114369280, + "step": 93980 + }, + { + "epoch": 10.46720124735494, + "grad_norm": 0.4839814305305481, + "learning_rate": 2.7321907910495304e-05, + "loss": 0.0728, + "num_input_tokens_seen": 114375296, + "step": 93985 + }, + { + "epoch": 10.467758102238557, + "grad_norm": 0.6899987459182739, + "learning_rate": 2.731948866390768e-05, + "loss": 0.0863, + "num_input_tokens_seen": 114380960, + "step": 93990 + }, + { + "epoch": 10.468314957122175, + "grad_norm": 0.037478215992450714, + "learning_rate": 2.731706939541062e-05, + "loss": 0.0219, + "num_input_tokens_seen": 114387040, + "step": 93995 + }, + { + "epoch": 10.46887181200579, + "grad_norm": 0.0339437760412693, + "learning_rate": 2.7314650105027e-05, + "loss": 0.0191, + "num_input_tokens_seen": 114392672, + "step": 94000 + }, + { + "epoch": 10.469428666889408, + "grad_norm": 0.16906125843524933, + "learning_rate": 2.7312230792779648e-05, + "loss": 0.0203, + "num_input_tokens_seen": 114398752, + "step": 94005 + }, + { + "epoch": 10.469985521773026, + "grad_norm": 0.7603434920310974, + "learning_rate": 2.7309811458691425e-05, + "loss": 0.0388, + "num_input_tokens_seen": 114405184, + "step": 94010 + }, + { + "epoch": 10.470542376656644, + "grad_norm": 0.12746959924697876, + "learning_rate": 2.730739210278517e-05, + "loss": 0.0174, + "num_input_tokens_seen": 114411616, + "step": 94015 + }, + { + "epoch": 10.471099231540261, + "grad_norm": 0.007305795326828957, + "learning_rate": 2.7304972725083768e-05, + "loss": 0.0144, + "num_input_tokens_seen": 114417952, + "step": 94020 + }, + { + "epoch": 10.471656086423877, + "grad_norm": 0.51918625831604, + "learning_rate": 2.7302553325610036e-05, + "loss": 0.0095, + "num_input_tokens_seen": 114424032, + "step": 94025 + }, + { + "epoch": 10.472212941307495, + "grad_norm": 1.6022827625274658, + "learning_rate": 2.730013390438685e-05, + "loss": 0.1091, + "num_input_tokens_seen": 114428864, + "step": 94030 + }, + { + "epoch": 10.472769796191113, + "grad_norm": 0.22030562162399292, + "learning_rate": 2.7297714461437057e-05, + "loss": 0.0423, + "num_input_tokens_seen": 114435072, + "step": 94035 + }, + { + "epoch": 10.47332665107473, + "grad_norm": 0.007631324697285891, + "learning_rate": 2.7295294996783503e-05, + "loss": 0.0339, + "num_input_tokens_seen": 114440800, + "step": 94040 + }, + { + "epoch": 10.473883505958348, + "grad_norm": 0.02113563008606434, + "learning_rate": 2.7292875510449063e-05, + "loss": 0.1666, + "num_input_tokens_seen": 114446176, + "step": 94045 + }, + { + "epoch": 10.474440360841964, + "grad_norm": 0.5317493677139282, + "learning_rate": 2.729045600245657e-05, + "loss": 0.0236, + "num_input_tokens_seen": 114451968, + "step": 94050 + }, + { + "epoch": 10.474997215725582, + "grad_norm": 0.0010005783988162875, + "learning_rate": 2.728803647282888e-05, + "loss": 0.0165, + "num_input_tokens_seen": 114458048, + "step": 94055 + }, + { + "epoch": 10.4755540706092, + "grad_norm": 0.02353874035179615, + "learning_rate": 2.7285616921588857e-05, + "loss": 0.0294, + "num_input_tokens_seen": 114464128, + "step": 94060 + }, + { + "epoch": 10.476110925492817, + "grad_norm": 2.5901308059692383, + "learning_rate": 2.728319734875935e-05, + "loss": 0.0346, + "num_input_tokens_seen": 114469984, + "step": 94065 + }, + { + "epoch": 10.476667780376435, + "grad_norm": 0.7104277610778809, + "learning_rate": 2.7280777754363218e-05, + "loss": 0.0697, + "num_input_tokens_seen": 114475904, + "step": 94070 + }, + { + "epoch": 10.47722463526005, + "grad_norm": 0.3781377077102661, + "learning_rate": 2.7278358138423305e-05, + "loss": 0.0106, + "num_input_tokens_seen": 114481792, + "step": 94075 + }, + { + "epoch": 10.477781490143668, + "grad_norm": 0.07391767203807831, + "learning_rate": 2.7275938500962476e-05, + "loss": 0.0235, + "num_input_tokens_seen": 114487776, + "step": 94080 + }, + { + "epoch": 10.478338345027286, + "grad_norm": 0.00011170956713613123, + "learning_rate": 2.7273518842003586e-05, + "loss": 0.0233, + "num_input_tokens_seen": 114493824, + "step": 94085 + }, + { + "epoch": 10.478895199910903, + "grad_norm": 1.4522119760513306, + "learning_rate": 2.7271099161569493e-05, + "loss": 0.0683, + "num_input_tokens_seen": 114500064, + "step": 94090 + }, + { + "epoch": 10.479452054794521, + "grad_norm": 0.006769614294171333, + "learning_rate": 2.7268679459683044e-05, + "loss": 0.0082, + "num_input_tokens_seen": 114505568, + "step": 94095 + }, + { + "epoch": 10.480008909678137, + "grad_norm": 0.2006627470254898, + "learning_rate": 2.72662597363671e-05, + "loss": 0.0366, + "num_input_tokens_seen": 114511904, + "step": 94100 + }, + { + "epoch": 10.480565764561755, + "grad_norm": 1.309922456741333, + "learning_rate": 2.726383999164452e-05, + "loss": 0.0631, + "num_input_tokens_seen": 114517984, + "step": 94105 + }, + { + "epoch": 10.481122619445372, + "grad_norm": 1.300176739692688, + "learning_rate": 2.7261420225538153e-05, + "loss": 0.0642, + "num_input_tokens_seen": 114523328, + "step": 94110 + }, + { + "epoch": 10.48167947432899, + "grad_norm": 0.5878670811653137, + "learning_rate": 2.7259000438070866e-05, + "loss": 0.0154, + "num_input_tokens_seen": 114529536, + "step": 94115 + }, + { + "epoch": 10.482236329212608, + "grad_norm": 0.0756126269698143, + "learning_rate": 2.7256580629265504e-05, + "loss": 0.0212, + "num_input_tokens_seen": 114535840, + "step": 94120 + }, + { + "epoch": 10.482793184096224, + "grad_norm": 0.13279367983341217, + "learning_rate": 2.7254160799144935e-05, + "loss": 0.0045, + "num_input_tokens_seen": 114541920, + "step": 94125 + }, + { + "epoch": 10.483350038979841, + "grad_norm": 0.3366486430168152, + "learning_rate": 2.7251740947732013e-05, + "loss": 0.1583, + "num_input_tokens_seen": 114548128, + "step": 94130 + }, + { + "epoch": 10.483906893863459, + "grad_norm": 0.0011880145175382495, + "learning_rate": 2.7249321075049583e-05, + "loss": 0.0104, + "num_input_tokens_seen": 114553760, + "step": 94135 + }, + { + "epoch": 10.484463748747077, + "grad_norm": 0.30090272426605225, + "learning_rate": 2.724690118112052e-05, + "loss": 0.0773, + "num_input_tokens_seen": 114559648, + "step": 94140 + }, + { + "epoch": 10.485020603630694, + "grad_norm": 0.07901928573846817, + "learning_rate": 2.724448126596768e-05, + "loss": 0.0033, + "num_input_tokens_seen": 114565920, + "step": 94145 + }, + { + "epoch": 10.485577458514312, + "grad_norm": 1.708871603012085, + "learning_rate": 2.724206132961391e-05, + "loss": 0.0984, + "num_input_tokens_seen": 114572032, + "step": 94150 + }, + { + "epoch": 10.486134313397928, + "grad_norm": 0.10455453395843506, + "learning_rate": 2.7239641372082076e-05, + "loss": 0.0751, + "num_input_tokens_seen": 114577920, + "step": 94155 + }, + { + "epoch": 10.486691168281546, + "grad_norm": 0.004890408832579851, + "learning_rate": 2.7237221393395035e-05, + "loss": 0.0692, + "num_input_tokens_seen": 114584160, + "step": 94160 + }, + { + "epoch": 10.487248023165163, + "grad_norm": 0.10345510393381119, + "learning_rate": 2.723480139357565e-05, + "loss": 0.0738, + "num_input_tokens_seen": 114590208, + "step": 94165 + }, + { + "epoch": 10.487804878048781, + "grad_norm": 0.4637083113193512, + "learning_rate": 2.7232381372646763e-05, + "loss": 0.0251, + "num_input_tokens_seen": 114596480, + "step": 94170 + }, + { + "epoch": 10.488361732932399, + "grad_norm": 0.10626467317342758, + "learning_rate": 2.7229961330631252e-05, + "loss": 0.007, + "num_input_tokens_seen": 114602464, + "step": 94175 + }, + { + "epoch": 10.488918587816014, + "grad_norm": 0.0012683019740507007, + "learning_rate": 2.7227541267551977e-05, + "loss": 0.0437, + "num_input_tokens_seen": 114608256, + "step": 94180 + }, + { + "epoch": 10.489475442699632, + "grad_norm": 0.9569409489631653, + "learning_rate": 2.722512118343178e-05, + "loss": 0.0164, + "num_input_tokens_seen": 114614752, + "step": 94185 + }, + { + "epoch": 10.49003229758325, + "grad_norm": 1.2585681676864624, + "learning_rate": 2.7222701078293538e-05, + "loss": 0.053, + "num_input_tokens_seen": 114620992, + "step": 94190 + }, + { + "epoch": 10.490589152466868, + "grad_norm": 0.25687676668167114, + "learning_rate": 2.7220280952160093e-05, + "loss": 0.0366, + "num_input_tokens_seen": 114627296, + "step": 94195 + }, + { + "epoch": 10.491146007350485, + "grad_norm": 0.0007270567584782839, + "learning_rate": 2.7217860805054323e-05, + "loss": 0.0198, + "num_input_tokens_seen": 114633600, + "step": 94200 + }, + { + "epoch": 10.491702862234101, + "grad_norm": 0.004982989281415939, + "learning_rate": 2.7215440636999083e-05, + "loss": 0.1366, + "num_input_tokens_seen": 114639392, + "step": 94205 + }, + { + "epoch": 10.492259717117719, + "grad_norm": 0.020681729540228844, + "learning_rate": 2.721302044801723e-05, + "loss": 0.0081, + "num_input_tokens_seen": 114645408, + "step": 94210 + }, + { + "epoch": 10.492816572001336, + "grad_norm": 0.05048011988401413, + "learning_rate": 2.7210600238131624e-05, + "loss": 0.0607, + "num_input_tokens_seen": 114651328, + "step": 94215 + }, + { + "epoch": 10.493373426884954, + "grad_norm": 0.15639285743236542, + "learning_rate": 2.7208180007365124e-05, + "loss": 0.0497, + "num_input_tokens_seen": 114657024, + "step": 94220 + }, + { + "epoch": 10.493930281768572, + "grad_norm": 0.52459716796875, + "learning_rate": 2.72057597557406e-05, + "loss": 0.0437, + "num_input_tokens_seen": 114663456, + "step": 94225 + }, + { + "epoch": 10.494487136652188, + "grad_norm": 0.022143250331282616, + "learning_rate": 2.720333948328091e-05, + "loss": 0.0277, + "num_input_tokens_seen": 114668992, + "step": 94230 + }, + { + "epoch": 10.495043991535805, + "grad_norm": 0.001124316593632102, + "learning_rate": 2.7200919190008905e-05, + "loss": 0.026, + "num_input_tokens_seen": 114675072, + "step": 94235 + }, + { + "epoch": 10.495600846419423, + "grad_norm": 0.5662717223167419, + "learning_rate": 2.7198498875947466e-05, + "loss": 0.0439, + "num_input_tokens_seen": 114681312, + "step": 94240 + }, + { + "epoch": 10.49615770130304, + "grad_norm": 0.7919946312904358, + "learning_rate": 2.719607854111943e-05, + "loss": 0.0582, + "num_input_tokens_seen": 114687104, + "step": 94245 + }, + { + "epoch": 10.496714556186658, + "grad_norm": 1.3383185863494873, + "learning_rate": 2.7193658185547682e-05, + "loss": 0.0495, + "num_input_tokens_seen": 114693088, + "step": 94250 + }, + { + "epoch": 10.497271411070274, + "grad_norm": 0.14132075011730194, + "learning_rate": 2.719123780925507e-05, + "loss": 0.0253, + "num_input_tokens_seen": 114699552, + "step": 94255 + }, + { + "epoch": 10.497828265953892, + "grad_norm": 1.9973937273025513, + "learning_rate": 2.7188817412264474e-05, + "loss": 0.2477, + "num_input_tokens_seen": 114705952, + "step": 94260 + }, + { + "epoch": 10.49838512083751, + "grad_norm": 0.8060983419418335, + "learning_rate": 2.7186396994598728e-05, + "loss": 0.1134, + "num_input_tokens_seen": 114711904, + "step": 94265 + }, + { + "epoch": 10.498941975721127, + "grad_norm": 0.05106319487094879, + "learning_rate": 2.7183976556280716e-05, + "loss": 0.0026, + "num_input_tokens_seen": 114718464, + "step": 94270 + }, + { + "epoch": 10.499498830604745, + "grad_norm": 0.31508517265319824, + "learning_rate": 2.71815560973333e-05, + "loss": 0.0304, + "num_input_tokens_seen": 114724288, + "step": 94275 + }, + { + "epoch": 10.500055685488363, + "grad_norm": 0.001954425359144807, + "learning_rate": 2.7179135617779334e-05, + "loss": 0.0078, + "num_input_tokens_seen": 114730208, + "step": 94280 + }, + { + "epoch": 10.500612540371979, + "grad_norm": 0.12921343743801117, + "learning_rate": 2.7176715117641687e-05, + "loss": 0.0892, + "num_input_tokens_seen": 114736064, + "step": 94285 + }, + { + "epoch": 10.501169395255596, + "grad_norm": 0.06860262155532837, + "learning_rate": 2.7174294596943222e-05, + "loss": 0.0221, + "num_input_tokens_seen": 114742080, + "step": 94290 + }, + { + "epoch": 10.501726250139214, + "grad_norm": 1.4858205318450928, + "learning_rate": 2.7171874055706804e-05, + "loss": 0.1073, + "num_input_tokens_seen": 114748128, + "step": 94295 + }, + { + "epoch": 10.502283105022832, + "grad_norm": 0.03410849720239639, + "learning_rate": 2.7169453493955292e-05, + "loss": 0.0242, + "num_input_tokens_seen": 114753856, + "step": 94300 + }, + { + "epoch": 10.50283995990645, + "grad_norm": 0.007079639006406069, + "learning_rate": 2.7167032911711553e-05, + "loss": 0.1021, + "num_input_tokens_seen": 114759904, + "step": 94305 + }, + { + "epoch": 10.503396814790065, + "grad_norm": 0.20251250267028809, + "learning_rate": 2.716461230899846e-05, + "loss": 0.08, + "num_input_tokens_seen": 114765984, + "step": 94310 + }, + { + "epoch": 10.503953669673683, + "grad_norm": 0.02407819591462612, + "learning_rate": 2.716219168583886e-05, + "loss": 0.0486, + "num_input_tokens_seen": 114772032, + "step": 94315 + }, + { + "epoch": 10.5045105245573, + "grad_norm": 8.09963607788086, + "learning_rate": 2.715977104225564e-05, + "loss": 0.1652, + "num_input_tokens_seen": 114778208, + "step": 94320 + }, + { + "epoch": 10.505067379440918, + "grad_norm": 0.4515438377857208, + "learning_rate": 2.715735037827164e-05, + "loss": 0.0624, + "num_input_tokens_seen": 114784128, + "step": 94325 + }, + { + "epoch": 10.505624234324536, + "grad_norm": 0.028148330748081207, + "learning_rate": 2.7154929693909735e-05, + "loss": 0.0885, + "num_input_tokens_seen": 114790208, + "step": 94330 + }, + { + "epoch": 10.506181089208152, + "grad_norm": 1.1319764852523804, + "learning_rate": 2.7152508989192804e-05, + "loss": 0.0862, + "num_input_tokens_seen": 114796128, + "step": 94335 + }, + { + "epoch": 10.50673794409177, + "grad_norm": 0.00010086111433338374, + "learning_rate": 2.715008826414369e-05, + "loss": 0.0006, + "num_input_tokens_seen": 114802208, + "step": 94340 + }, + { + "epoch": 10.507294798975387, + "grad_norm": 1.1275838613510132, + "learning_rate": 2.714766751878528e-05, + "loss": 0.0514, + "num_input_tokens_seen": 114808288, + "step": 94345 + }, + { + "epoch": 10.507851653859005, + "grad_norm": 0.7471752762794495, + "learning_rate": 2.7145246753140424e-05, + "loss": 0.0626, + "num_input_tokens_seen": 114814400, + "step": 94350 + }, + { + "epoch": 10.508408508742622, + "grad_norm": 0.0007719359127804637, + "learning_rate": 2.7142825967231993e-05, + "loss": 0.0793, + "num_input_tokens_seen": 114820320, + "step": 94355 + }, + { + "epoch": 10.508965363626238, + "grad_norm": 0.011093460023403168, + "learning_rate": 2.7140405161082853e-05, + "loss": 0.031, + "num_input_tokens_seen": 114826432, + "step": 94360 + }, + { + "epoch": 10.509522218509856, + "grad_norm": 0.8264135718345642, + "learning_rate": 2.7137984334715877e-05, + "loss": 0.0185, + "num_input_tokens_seen": 114832576, + "step": 94365 + }, + { + "epoch": 10.510079073393474, + "grad_norm": 0.9427444338798523, + "learning_rate": 2.7135563488153924e-05, + "loss": 0.0692, + "num_input_tokens_seen": 114838560, + "step": 94370 + }, + { + "epoch": 10.510635928277091, + "grad_norm": 0.3553839921951294, + "learning_rate": 2.713314262141986e-05, + "loss": 0.0192, + "num_input_tokens_seen": 114844736, + "step": 94375 + }, + { + "epoch": 10.511192783160709, + "grad_norm": 0.794529378414154, + "learning_rate": 2.713072173453656e-05, + "loss": 0.0648, + "num_input_tokens_seen": 114850720, + "step": 94380 + }, + { + "epoch": 10.511749638044325, + "grad_norm": 8.824391989037395e-05, + "learning_rate": 2.7128300827526875e-05, + "loss": 0.0131, + "num_input_tokens_seen": 114856960, + "step": 94385 + }, + { + "epoch": 10.512306492927943, + "grad_norm": 0.0002373107272433117, + "learning_rate": 2.71258799004137e-05, + "loss": 0.0009, + "num_input_tokens_seen": 114863392, + "step": 94390 + }, + { + "epoch": 10.51286334781156, + "grad_norm": 1.6278561353683472, + "learning_rate": 2.712345895321987e-05, + "loss": 0.1143, + "num_input_tokens_seen": 114869568, + "step": 94395 + }, + { + "epoch": 10.513420202695178, + "grad_norm": 0.05421683192253113, + "learning_rate": 2.712103798596828e-05, + "loss": 0.1023, + "num_input_tokens_seen": 114875264, + "step": 94400 + }, + { + "epoch": 10.513977057578796, + "grad_norm": 0.00014663666661363095, + "learning_rate": 2.7118616998681784e-05, + "loss": 0.0568, + "num_input_tokens_seen": 114881632, + "step": 94405 + }, + { + "epoch": 10.514533912462412, + "grad_norm": 0.033329229801893234, + "learning_rate": 2.7116195991383248e-05, + "loss": 0.0025, + "num_input_tokens_seen": 114887968, + "step": 94410 + }, + { + "epoch": 10.51509076734603, + "grad_norm": 0.35717305541038513, + "learning_rate": 2.711377496409555e-05, + "loss": 0.0072, + "num_input_tokens_seen": 114894176, + "step": 94415 + }, + { + "epoch": 10.515647622229647, + "grad_norm": 1.7235934734344482, + "learning_rate": 2.7111353916841555e-05, + "loss": 0.1157, + "num_input_tokens_seen": 114900192, + "step": 94420 + }, + { + "epoch": 10.516204477113265, + "grad_norm": 1.5627366304397583, + "learning_rate": 2.7108932849644124e-05, + "loss": 0.0644, + "num_input_tokens_seen": 114905632, + "step": 94425 + }, + { + "epoch": 10.516761331996882, + "grad_norm": 0.044426508247852325, + "learning_rate": 2.7106511762526143e-05, + "loss": 0.039, + "num_input_tokens_seen": 114910944, + "step": 94430 + }, + { + "epoch": 10.517318186880498, + "grad_norm": 0.022579185664653778, + "learning_rate": 2.7104090655510463e-05, + "loss": 0.0464, + "num_input_tokens_seen": 114917376, + "step": 94435 + }, + { + "epoch": 10.517875041764116, + "grad_norm": 0.3072700798511505, + "learning_rate": 2.7101669528619968e-05, + "loss": 0.0858, + "num_input_tokens_seen": 114923424, + "step": 94440 + }, + { + "epoch": 10.518431896647733, + "grad_norm": 0.7336418628692627, + "learning_rate": 2.709924838187751e-05, + "loss": 0.0354, + "num_input_tokens_seen": 114929248, + "step": 94445 + }, + { + "epoch": 10.518988751531351, + "grad_norm": 0.07499539107084274, + "learning_rate": 2.7096827215305982e-05, + "loss": 0.0299, + "num_input_tokens_seen": 114935584, + "step": 94450 + }, + { + "epoch": 10.519545606414969, + "grad_norm": 1.441681981086731, + "learning_rate": 2.7094406028928238e-05, + "loss": 0.0398, + "num_input_tokens_seen": 114941760, + "step": 94455 + }, + { + "epoch": 10.520102461298585, + "grad_norm": 0.026808403432369232, + "learning_rate": 2.709198482276714e-05, + "loss": 0.011, + "num_input_tokens_seen": 114947328, + "step": 94460 + }, + { + "epoch": 10.520659316182202, + "grad_norm": 1.1446095705032349, + "learning_rate": 2.708956359684558e-05, + "loss": 0.0551, + "num_input_tokens_seen": 114952544, + "step": 94465 + }, + { + "epoch": 10.52121617106582, + "grad_norm": 3.7178170680999756, + "learning_rate": 2.708714235118641e-05, + "loss": 0.067, + "num_input_tokens_seen": 114958784, + "step": 94470 + }, + { + "epoch": 10.521773025949438, + "grad_norm": 1.1838752031326294, + "learning_rate": 2.7084721085812514e-05, + "loss": 0.1414, + "num_input_tokens_seen": 114964736, + "step": 94475 + }, + { + "epoch": 10.522329880833055, + "grad_norm": 0.07040894776582718, + "learning_rate": 2.7082299800746758e-05, + "loss": 0.0204, + "num_input_tokens_seen": 114970944, + "step": 94480 + }, + { + "epoch": 10.522886735716671, + "grad_norm": 0.5193418860435486, + "learning_rate": 2.7079878496012005e-05, + "loss": 0.0508, + "num_input_tokens_seen": 114976768, + "step": 94485 + }, + { + "epoch": 10.523443590600289, + "grad_norm": 0.06987234950065613, + "learning_rate": 2.7077457171631144e-05, + "loss": 0.1178, + "num_input_tokens_seen": 114982880, + "step": 94490 + }, + { + "epoch": 10.524000445483907, + "grad_norm": 0.07588636130094528, + "learning_rate": 2.7075035827627026e-05, + "loss": 0.0113, + "num_input_tokens_seen": 114989280, + "step": 94495 + }, + { + "epoch": 10.524557300367524, + "grad_norm": 2.775237560272217, + "learning_rate": 2.707261446402254e-05, + "loss": 0.1232, + "num_input_tokens_seen": 114994336, + "step": 94500 + }, + { + "epoch": 10.525114155251142, + "grad_norm": 0.08022747188806534, + "learning_rate": 2.7070193080840545e-05, + "loss": 0.0747, + "num_input_tokens_seen": 115000224, + "step": 94505 + }, + { + "epoch": 10.52567101013476, + "grad_norm": 0.5632601976394653, + "learning_rate": 2.7067771678103914e-05, + "loss": 0.0628, + "num_input_tokens_seen": 115006464, + "step": 94510 + }, + { + "epoch": 10.526227865018376, + "grad_norm": 1.0371215343475342, + "learning_rate": 2.706535025583553e-05, + "loss": 0.0305, + "num_input_tokens_seen": 115012640, + "step": 94515 + }, + { + "epoch": 10.526784719901993, + "grad_norm": 0.755563497543335, + "learning_rate": 2.7062928814058254e-05, + "loss": 0.0055, + "num_input_tokens_seen": 115019104, + "step": 94520 + }, + { + "epoch": 10.527341574785611, + "grad_norm": 0.004585783462971449, + "learning_rate": 2.706050735279496e-05, + "loss": 0.147, + "num_input_tokens_seen": 115024928, + "step": 94525 + }, + { + "epoch": 10.527898429669229, + "grad_norm": 0.8516733646392822, + "learning_rate": 2.7058085872068527e-05, + "loss": 0.1332, + "num_input_tokens_seen": 115031008, + "step": 94530 + }, + { + "epoch": 10.528455284552846, + "grad_norm": 1.104832649230957, + "learning_rate": 2.7055664371901827e-05, + "loss": 0.3014, + "num_input_tokens_seen": 115037280, + "step": 94535 + }, + { + "epoch": 10.529012139436462, + "grad_norm": 0.780029833316803, + "learning_rate": 2.7053242852317723e-05, + "loss": 0.0222, + "num_input_tokens_seen": 115043392, + "step": 94540 + }, + { + "epoch": 10.52956899432008, + "grad_norm": 5.696587562561035, + "learning_rate": 2.7050821313339096e-05, + "loss": 0.0952, + "num_input_tokens_seen": 115049888, + "step": 94545 + }, + { + "epoch": 10.530125849203698, + "grad_norm": 0.015906549990177155, + "learning_rate": 2.704839975498883e-05, + "loss": 0.0835, + "num_input_tokens_seen": 115056384, + "step": 94550 + }, + { + "epoch": 10.530682704087315, + "grad_norm": 0.0016084820963442326, + "learning_rate": 2.7045978177289777e-05, + "loss": 0.0029, + "num_input_tokens_seen": 115062368, + "step": 94555 + }, + { + "epoch": 10.531239558970933, + "grad_norm": 0.4105883538722992, + "learning_rate": 2.7043556580264823e-05, + "loss": 0.0345, + "num_input_tokens_seen": 115068288, + "step": 94560 + }, + { + "epoch": 10.531796413854549, + "grad_norm": 0.9000365138053894, + "learning_rate": 2.7041134963936837e-05, + "loss": 0.012, + "num_input_tokens_seen": 115074240, + "step": 94565 + }, + { + "epoch": 10.532353268738166, + "grad_norm": 0.025543520227074623, + "learning_rate": 2.70387133283287e-05, + "loss": 0.001, + "num_input_tokens_seen": 115080480, + "step": 94570 + }, + { + "epoch": 10.532910123621784, + "grad_norm": 0.6547905206680298, + "learning_rate": 2.7036291673463282e-05, + "loss": 0.1355, + "num_input_tokens_seen": 115086624, + "step": 94575 + }, + { + "epoch": 10.533466978505402, + "grad_norm": 0.00019551179138943553, + "learning_rate": 2.7033869999363455e-05, + "loss": 0.0598, + "num_input_tokens_seen": 115092000, + "step": 94580 + }, + { + "epoch": 10.53402383338902, + "grad_norm": 0.6279412508010864, + "learning_rate": 2.7031448306052097e-05, + "loss": 0.0179, + "num_input_tokens_seen": 115098016, + "step": 94585 + }, + { + "epoch": 10.534580688272635, + "grad_norm": 1.3213187456130981, + "learning_rate": 2.7029026593552083e-05, + "loss": 0.0909, + "num_input_tokens_seen": 115104160, + "step": 94590 + }, + { + "epoch": 10.535137543156253, + "grad_norm": 0.5367922186851501, + "learning_rate": 2.702660486188629e-05, + "loss": 0.018, + "num_input_tokens_seen": 115110368, + "step": 94595 + }, + { + "epoch": 10.53569439803987, + "grad_norm": 0.8586497902870178, + "learning_rate": 2.7024183111077585e-05, + "loss": 0.0226, + "num_input_tokens_seen": 115116448, + "step": 94600 + }, + { + "epoch": 10.536251252923488, + "grad_norm": 1.9292196035385132, + "learning_rate": 2.7021761341148848e-05, + "loss": 0.0756, + "num_input_tokens_seen": 115122240, + "step": 94605 + }, + { + "epoch": 10.536808107807106, + "grad_norm": 0.2284945547580719, + "learning_rate": 2.7019339552122964e-05, + "loss": 0.0615, + "num_input_tokens_seen": 115128416, + "step": 94610 + }, + { + "epoch": 10.537364962690722, + "grad_norm": 0.047786224633455276, + "learning_rate": 2.701691774402279e-05, + "loss": 0.0376, + "num_input_tokens_seen": 115134240, + "step": 94615 + }, + { + "epoch": 10.53792181757434, + "grad_norm": 0.009747861884534359, + "learning_rate": 2.701449591687122e-05, + "loss": 0.0113, + "num_input_tokens_seen": 115139936, + "step": 94620 + }, + { + "epoch": 10.538478672457957, + "grad_norm": 0.19945833086967468, + "learning_rate": 2.7012074070691117e-05, + "loss": 0.1244, + "num_input_tokens_seen": 115145568, + "step": 94625 + }, + { + "epoch": 10.539035527341575, + "grad_norm": 0.027520226314663887, + "learning_rate": 2.7009652205505364e-05, + "loss": 0.0097, + "num_input_tokens_seen": 115151200, + "step": 94630 + }, + { + "epoch": 10.539592382225193, + "grad_norm": 0.0003802382852882147, + "learning_rate": 2.7007230321336836e-05, + "loss": 0.0402, + "num_input_tokens_seen": 115157440, + "step": 94635 + }, + { + "epoch": 10.54014923710881, + "grad_norm": 0.023587200790643692, + "learning_rate": 2.7004808418208404e-05, + "loss": 0.0153, + "num_input_tokens_seen": 115163520, + "step": 94640 + }, + { + "epoch": 10.540706091992426, + "grad_norm": 0.3383467495441437, + "learning_rate": 2.700238649614296e-05, + "loss": 0.0372, + "num_input_tokens_seen": 115168864, + "step": 94645 + }, + { + "epoch": 10.541262946876044, + "grad_norm": 1.576216697692871, + "learning_rate": 2.6999964555163365e-05, + "loss": 0.0561, + "num_input_tokens_seen": 115175328, + "step": 94650 + }, + { + "epoch": 10.541819801759662, + "grad_norm": 0.014795830473303795, + "learning_rate": 2.6997542595292507e-05, + "loss": 0.0757, + "num_input_tokens_seen": 115181344, + "step": 94655 + }, + { + "epoch": 10.54237665664328, + "grad_norm": 0.21199360489845276, + "learning_rate": 2.6995120616553256e-05, + "loss": 0.0155, + "num_input_tokens_seen": 115187680, + "step": 94660 + }, + { + "epoch": 10.542933511526897, + "grad_norm": 0.18712088465690613, + "learning_rate": 2.6992698618968494e-05, + "loss": 0.0305, + "num_input_tokens_seen": 115193984, + "step": 94665 + }, + { + "epoch": 10.543490366410513, + "grad_norm": 0.27264276146888733, + "learning_rate": 2.6990276602561094e-05, + "loss": 0.0838, + "num_input_tokens_seen": 115199936, + "step": 94670 + }, + { + "epoch": 10.54404722129413, + "grad_norm": 1.3134938478469849, + "learning_rate": 2.6987854567353937e-05, + "loss": 0.0755, + "num_input_tokens_seen": 115206144, + "step": 94675 + }, + { + "epoch": 10.544604076177748, + "grad_norm": 0.14702917635440826, + "learning_rate": 2.6985432513369903e-05, + "loss": 0.0595, + "num_input_tokens_seen": 115212224, + "step": 94680 + }, + { + "epoch": 10.545160931061366, + "grad_norm": 0.011569567024707794, + "learning_rate": 2.698301044063187e-05, + "loss": 0.0111, + "num_input_tokens_seen": 115218496, + "step": 94685 + }, + { + "epoch": 10.545717785944984, + "grad_norm": 1.0409568548202515, + "learning_rate": 2.698058834916271e-05, + "loss": 0.0281, + "num_input_tokens_seen": 115224800, + "step": 94690 + }, + { + "epoch": 10.5462746408286, + "grad_norm": 0.0938839465379715, + "learning_rate": 2.6978166238985307e-05, + "loss": 0.0438, + "num_input_tokens_seen": 115231008, + "step": 94695 + }, + { + "epoch": 10.546831495712217, + "grad_norm": 0.0013253593351691961, + "learning_rate": 2.6975744110122537e-05, + "loss": 0.0663, + "num_input_tokens_seen": 115237120, + "step": 94700 + }, + { + "epoch": 10.547388350595835, + "grad_norm": 1.054789662361145, + "learning_rate": 2.6973321962597287e-05, + "loss": 0.0711, + "num_input_tokens_seen": 115243296, + "step": 94705 + }, + { + "epoch": 10.547945205479452, + "grad_norm": 0.5861185193061829, + "learning_rate": 2.6970899796432426e-05, + "loss": 0.0235, + "num_input_tokens_seen": 115249344, + "step": 94710 + }, + { + "epoch": 10.54850206036307, + "grad_norm": 0.5161581039428711, + "learning_rate": 2.6968477611650844e-05, + "loss": 0.0468, + "num_input_tokens_seen": 115255296, + "step": 94715 + }, + { + "epoch": 10.549058915246686, + "grad_norm": 1.2758134603500366, + "learning_rate": 2.6966055408275403e-05, + "loss": 0.128, + "num_input_tokens_seen": 115261568, + "step": 94720 + }, + { + "epoch": 10.549615770130304, + "grad_norm": 0.009504708461463451, + "learning_rate": 2.6963633186329e-05, + "loss": 0.0215, + "num_input_tokens_seen": 115268096, + "step": 94725 + }, + { + "epoch": 10.550172625013921, + "grad_norm": 0.062340833246707916, + "learning_rate": 2.696121094583451e-05, + "loss": 0.0674, + "num_input_tokens_seen": 115274656, + "step": 94730 + }, + { + "epoch": 10.550729479897539, + "grad_norm": 0.2813360393047333, + "learning_rate": 2.6958788686814806e-05, + "loss": 0.03, + "num_input_tokens_seen": 115280960, + "step": 94735 + }, + { + "epoch": 10.551286334781157, + "grad_norm": 0.009231803007423878, + "learning_rate": 2.6956366409292776e-05, + "loss": 0.0082, + "num_input_tokens_seen": 115287040, + "step": 94740 + }, + { + "epoch": 10.551843189664773, + "grad_norm": 0.10710722953081131, + "learning_rate": 2.6953944113291297e-05, + "loss": 0.076, + "num_input_tokens_seen": 115293376, + "step": 94745 + }, + { + "epoch": 10.55240004454839, + "grad_norm": 1.1413630247116089, + "learning_rate": 2.6951521798833258e-05, + "loss": 0.0186, + "num_input_tokens_seen": 115299616, + "step": 94750 + }, + { + "epoch": 10.552956899432008, + "grad_norm": 0.6575286984443665, + "learning_rate": 2.6949099465941518e-05, + "loss": 0.0371, + "num_input_tokens_seen": 115305344, + "step": 94755 + }, + { + "epoch": 10.553513754315626, + "grad_norm": 0.00018153070413973182, + "learning_rate": 2.6946677114638985e-05, + "loss": 0.105, + "num_input_tokens_seen": 115311584, + "step": 94760 + }, + { + "epoch": 10.554070609199243, + "grad_norm": 0.18578174710273743, + "learning_rate": 2.6944254744948516e-05, + "loss": 0.0494, + "num_input_tokens_seen": 115317440, + "step": 94765 + }, + { + "epoch": 10.55462746408286, + "grad_norm": 1.6000826358795166, + "learning_rate": 2.694183235689301e-05, + "loss": 0.0461, + "num_input_tokens_seen": 115323072, + "step": 94770 + }, + { + "epoch": 10.555184318966477, + "grad_norm": 0.23586083948612213, + "learning_rate": 2.693940995049534e-05, + "loss": 0.0232, + "num_input_tokens_seen": 115329120, + "step": 94775 + }, + { + "epoch": 10.555741173850095, + "grad_norm": 0.0005139262066222727, + "learning_rate": 2.693698752577839e-05, + "loss": 0.0349, + "num_input_tokens_seen": 115334944, + "step": 94780 + }, + { + "epoch": 10.556298028733712, + "grad_norm": 1.0218555927276611, + "learning_rate": 2.6934565082765038e-05, + "loss": 0.1545, + "num_input_tokens_seen": 115340224, + "step": 94785 + }, + { + "epoch": 10.55685488361733, + "grad_norm": 0.00015736525529064238, + "learning_rate": 2.6932142621478174e-05, + "loss": 0.082, + "num_input_tokens_seen": 115346496, + "step": 94790 + }, + { + "epoch": 10.557411738500946, + "grad_norm": 1.2068538665771484, + "learning_rate": 2.6929720141940674e-05, + "loss": 0.0923, + "num_input_tokens_seen": 115352000, + "step": 94795 + }, + { + "epoch": 10.557968593384564, + "grad_norm": 0.007738392800092697, + "learning_rate": 2.692729764417542e-05, + "loss": 0.0083, + "num_input_tokens_seen": 115358144, + "step": 94800 + }, + { + "epoch": 10.558525448268181, + "grad_norm": 0.006560968700796366, + "learning_rate": 2.692487512820529e-05, + "loss": 0.0548, + "num_input_tokens_seen": 115364096, + "step": 94805 + }, + { + "epoch": 10.559082303151799, + "grad_norm": 0.4724790155887604, + "learning_rate": 2.6922452594053182e-05, + "loss": 0.0547, + "num_input_tokens_seen": 115370400, + "step": 94810 + }, + { + "epoch": 10.559639158035417, + "grad_norm": 0.00957262422889471, + "learning_rate": 2.692003004174196e-05, + "loss": 0.0081, + "num_input_tokens_seen": 115376768, + "step": 94815 + }, + { + "epoch": 10.560196012919032, + "grad_norm": 1.8071621656417847, + "learning_rate": 2.6917607471294526e-05, + "loss": 0.0536, + "num_input_tokens_seen": 115382592, + "step": 94820 + }, + { + "epoch": 10.56075286780265, + "grad_norm": 0.041964102536439896, + "learning_rate": 2.691518488273374e-05, + "loss": 0.0427, + "num_input_tokens_seen": 115388640, + "step": 94825 + }, + { + "epoch": 10.561309722686268, + "grad_norm": 2.311274766921997, + "learning_rate": 2.6912762276082505e-05, + "loss": 0.0917, + "num_input_tokens_seen": 115394560, + "step": 94830 + }, + { + "epoch": 10.561866577569885, + "grad_norm": 0.04409073293209076, + "learning_rate": 2.6910339651363704e-05, + "loss": 0.006, + "num_input_tokens_seen": 115400448, + "step": 94835 + }, + { + "epoch": 10.562423432453503, + "grad_norm": 0.37337490916252136, + "learning_rate": 2.6907917008600204e-05, + "loss": 0.0143, + "num_input_tokens_seen": 115406560, + "step": 94840 + }, + { + "epoch": 10.562980287337119, + "grad_norm": 0.004448085557669401, + "learning_rate": 2.690549434781491e-05, + "loss": 0.0528, + "num_input_tokens_seen": 115412128, + "step": 94845 + }, + { + "epoch": 10.563537142220737, + "grad_norm": 1.309949517250061, + "learning_rate": 2.690307166903068e-05, + "loss": 0.0558, + "num_input_tokens_seen": 115418624, + "step": 94850 + }, + { + "epoch": 10.564093997104354, + "grad_norm": 0.0006209585699252784, + "learning_rate": 2.690064897227043e-05, + "loss": 0.027, + "num_input_tokens_seen": 115424640, + "step": 94855 + }, + { + "epoch": 10.564650851987972, + "grad_norm": 0.029344726353883743, + "learning_rate": 2.6898226257557017e-05, + "loss": 0.01, + "num_input_tokens_seen": 115430880, + "step": 94860 + }, + { + "epoch": 10.56520770687159, + "grad_norm": 0.0011691672261804342, + "learning_rate": 2.6895803524913337e-05, + "loss": 0.0212, + "num_input_tokens_seen": 115436928, + "step": 94865 + }, + { + "epoch": 10.565764561755207, + "grad_norm": 0.0025859735906124115, + "learning_rate": 2.6893380774362285e-05, + "loss": 0.0336, + "num_input_tokens_seen": 115443424, + "step": 94870 + }, + { + "epoch": 10.566321416638823, + "grad_norm": 0.0031601020600646734, + "learning_rate": 2.6890958005926726e-05, + "loss": 0.0463, + "num_input_tokens_seen": 115449568, + "step": 94875 + }, + { + "epoch": 10.566878271522441, + "grad_norm": 0.028226954862475395, + "learning_rate": 2.6888535219629552e-05, + "loss": 0.0498, + "num_input_tokens_seen": 115455808, + "step": 94880 + }, + { + "epoch": 10.567435126406059, + "grad_norm": 0.0002797802444547415, + "learning_rate": 2.688611241549365e-05, + "loss": 0.0049, + "num_input_tokens_seen": 115462176, + "step": 94885 + }, + { + "epoch": 10.567991981289676, + "grad_norm": 0.6494504809379578, + "learning_rate": 2.6883689593541907e-05, + "loss": 0.0181, + "num_input_tokens_seen": 115468352, + "step": 94890 + }, + { + "epoch": 10.568548836173294, + "grad_norm": 0.2823149263858795, + "learning_rate": 2.688126675379721e-05, + "loss": 0.0218, + "num_input_tokens_seen": 115474432, + "step": 94895 + }, + { + "epoch": 10.56910569105691, + "grad_norm": 2.0727784633636475, + "learning_rate": 2.6878843896282436e-05, + "loss": 0.0957, + "num_input_tokens_seen": 115480448, + "step": 94900 + }, + { + "epoch": 10.569662545940528, + "grad_norm": 0.00011794755846494809, + "learning_rate": 2.687642102102048e-05, + "loss": 0.0095, + "num_input_tokens_seen": 115486496, + "step": 94905 + }, + { + "epoch": 10.570219400824145, + "grad_norm": 0.1876748949289322, + "learning_rate": 2.6873998128034216e-05, + "loss": 0.0365, + "num_input_tokens_seen": 115492384, + "step": 94910 + }, + { + "epoch": 10.570776255707763, + "grad_norm": 2.6576902866363525, + "learning_rate": 2.6871575217346544e-05, + "loss": 0.0409, + "num_input_tokens_seen": 115498560, + "step": 94915 + }, + { + "epoch": 10.57133311059138, + "grad_norm": 1.2842097282409668, + "learning_rate": 2.686915228898035e-05, + "loss": 0.0924, + "num_input_tokens_seen": 115504672, + "step": 94920 + }, + { + "epoch": 10.571889965474996, + "grad_norm": 0.15247738361358643, + "learning_rate": 2.6866729342958508e-05, + "loss": 0.0043, + "num_input_tokens_seen": 115511040, + "step": 94925 + }, + { + "epoch": 10.572446820358614, + "grad_norm": 0.9081435203552246, + "learning_rate": 2.6864306379303918e-05, + "loss": 0.1174, + "num_input_tokens_seen": 115516992, + "step": 94930 + }, + { + "epoch": 10.573003675242232, + "grad_norm": 1.7612216472625732, + "learning_rate": 2.6861883398039452e-05, + "loss": 0.0555, + "num_input_tokens_seen": 115522816, + "step": 94935 + }, + { + "epoch": 10.57356053012585, + "grad_norm": 0.9418447017669678, + "learning_rate": 2.6859460399188007e-05, + "loss": 0.0473, + "num_input_tokens_seen": 115528512, + "step": 94940 + }, + { + "epoch": 10.574117385009467, + "grad_norm": 0.02003268152475357, + "learning_rate": 2.685703738277247e-05, + "loss": 0.0118, + "num_input_tokens_seen": 115534272, + "step": 94945 + }, + { + "epoch": 10.574674239893083, + "grad_norm": 1.42160165309906, + "learning_rate": 2.6854614348815727e-05, + "loss": 0.2325, + "num_input_tokens_seen": 115540704, + "step": 94950 + }, + { + "epoch": 10.5752310947767, + "grad_norm": 0.0039609624072909355, + "learning_rate": 2.6852191297340666e-05, + "loss": 0.0486, + "num_input_tokens_seen": 115546912, + "step": 94955 + }, + { + "epoch": 10.575787949660318, + "grad_norm": 0.007109473459422588, + "learning_rate": 2.6849768228370177e-05, + "loss": 0.0199, + "num_input_tokens_seen": 115552960, + "step": 94960 + }, + { + "epoch": 10.576344804543936, + "grad_norm": 0.004354036878794432, + "learning_rate": 2.6847345141927143e-05, + "loss": 0.0007, + "num_input_tokens_seen": 115559200, + "step": 94965 + }, + { + "epoch": 10.576901659427554, + "grad_norm": 0.014285547658801079, + "learning_rate": 2.6844922038034454e-05, + "loss": 0.1366, + "num_input_tokens_seen": 115565344, + "step": 94970 + }, + { + "epoch": 10.57745851431117, + "grad_norm": 0.0027025898452848196, + "learning_rate": 2.6842498916714997e-05, + "loss": 0.0189, + "num_input_tokens_seen": 115571872, + "step": 94975 + }, + { + "epoch": 10.578015369194787, + "grad_norm": 0.023123648017644882, + "learning_rate": 2.684007577799167e-05, + "loss": 0.017, + "num_input_tokens_seen": 115577920, + "step": 94980 + }, + { + "epoch": 10.578572224078405, + "grad_norm": 0.05354781448841095, + "learning_rate": 2.683765262188734e-05, + "loss": 0.052, + "num_input_tokens_seen": 115584256, + "step": 94985 + }, + { + "epoch": 10.579129078962023, + "grad_norm": 0.3622572720050812, + "learning_rate": 2.683522944842492e-05, + "loss": 0.0651, + "num_input_tokens_seen": 115590144, + "step": 94990 + }, + { + "epoch": 10.57968593384564, + "grad_norm": 0.017382007092237473, + "learning_rate": 2.683280625762728e-05, + "loss": 0.0634, + "num_input_tokens_seen": 115596416, + "step": 94995 + }, + { + "epoch": 10.580242788729258, + "grad_norm": 0.019875336438417435, + "learning_rate": 2.6830383049517322e-05, + "loss": 0.0354, + "num_input_tokens_seen": 115602080, + "step": 95000 + }, + { + "epoch": 10.580799643612874, + "grad_norm": 0.8183109164237976, + "learning_rate": 2.682795982411792e-05, + "loss": 0.0434, + "num_input_tokens_seen": 115607936, + "step": 95005 + }, + { + "epoch": 10.581356498496492, + "grad_norm": 0.0020072595216333866, + "learning_rate": 2.6825536581451978e-05, + "loss": 0.0369, + "num_input_tokens_seen": 115613824, + "step": 95010 + }, + { + "epoch": 10.58191335338011, + "grad_norm": 0.5496671795845032, + "learning_rate": 2.6823113321542387e-05, + "loss": 0.0095, + "num_input_tokens_seen": 115620064, + "step": 95015 + }, + { + "epoch": 10.582470208263727, + "grad_norm": 0.2439941167831421, + "learning_rate": 2.6820690044412023e-05, + "loss": 0.0368, + "num_input_tokens_seen": 115625888, + "step": 95020 + }, + { + "epoch": 10.583027063147345, + "grad_norm": 0.07212256640195847, + "learning_rate": 2.6818266750083786e-05, + "loss": 0.0108, + "num_input_tokens_seen": 115632032, + "step": 95025 + }, + { + "epoch": 10.58358391803096, + "grad_norm": 0.343538373708725, + "learning_rate": 2.6815843438580557e-05, + "loss": 0.0872, + "num_input_tokens_seen": 115638240, + "step": 95030 + }, + { + "epoch": 10.584140772914578, + "grad_norm": 0.030984453856945038, + "learning_rate": 2.681342010992524e-05, + "loss": 0.1708, + "num_input_tokens_seen": 115644128, + "step": 95035 + }, + { + "epoch": 10.584697627798196, + "grad_norm": 0.9787034392356873, + "learning_rate": 2.681099676414071e-05, + "loss": 0.1026, + "num_input_tokens_seen": 115650464, + "step": 95040 + }, + { + "epoch": 10.585254482681814, + "grad_norm": 0.0016399495070800185, + "learning_rate": 2.6808573401249864e-05, + "loss": 0.0479, + "num_input_tokens_seen": 115656576, + "step": 95045 + }, + { + "epoch": 10.585811337565431, + "grad_norm": 0.0740499198436737, + "learning_rate": 2.6806150021275605e-05, + "loss": 0.0045, + "num_input_tokens_seen": 115662656, + "step": 95050 + }, + { + "epoch": 10.586368192449047, + "grad_norm": 0.0011864436091855168, + "learning_rate": 2.68037266242408e-05, + "loss": 0.0024, + "num_input_tokens_seen": 115668864, + "step": 95055 + }, + { + "epoch": 10.586925047332665, + "grad_norm": 0.4838974177837372, + "learning_rate": 2.680130321016836e-05, + "loss": 0.0535, + "num_input_tokens_seen": 115674816, + "step": 95060 + }, + { + "epoch": 10.587481902216282, + "grad_norm": 0.071595698595047, + "learning_rate": 2.679887977908116e-05, + "loss": 0.02, + "num_input_tokens_seen": 115680768, + "step": 95065 + }, + { + "epoch": 10.5880387570999, + "grad_norm": 0.0021698346827179193, + "learning_rate": 2.6796456331002105e-05, + "loss": 0.0264, + "num_input_tokens_seen": 115687040, + "step": 95070 + }, + { + "epoch": 10.588595611983518, + "grad_norm": 0.14839154481887817, + "learning_rate": 2.6794032865954076e-05, + "loss": 0.0752, + "num_input_tokens_seen": 115693440, + "step": 95075 + }, + { + "epoch": 10.589152466867134, + "grad_norm": 0.013719538226723671, + "learning_rate": 2.679160938395997e-05, + "loss": 0.0032, + "num_input_tokens_seen": 115699744, + "step": 95080 + }, + { + "epoch": 10.589709321750751, + "grad_norm": 0.891211986541748, + "learning_rate": 2.678918588504269e-05, + "loss": 0.0156, + "num_input_tokens_seen": 115705792, + "step": 95085 + }, + { + "epoch": 10.590266176634369, + "grad_norm": 0.06820686906576157, + "learning_rate": 2.67867623692251e-05, + "loss": 0.1437, + "num_input_tokens_seen": 115711904, + "step": 95090 + }, + { + "epoch": 10.590823031517987, + "grad_norm": 0.1827521175146103, + "learning_rate": 2.6784338836530116e-05, + "loss": 0.0053, + "num_input_tokens_seen": 115717728, + "step": 95095 + }, + { + "epoch": 10.591379886401604, + "grad_norm": 0.7490527033805847, + "learning_rate": 2.678191528698062e-05, + "loss": 0.035, + "num_input_tokens_seen": 115724128, + "step": 95100 + }, + { + "epoch": 10.59193674128522, + "grad_norm": 0.06289774179458618, + "learning_rate": 2.6779491720599502e-05, + "loss": 0.0033, + "num_input_tokens_seen": 115729568, + "step": 95105 + }, + { + "epoch": 10.592493596168838, + "grad_norm": 0.00040871306555345654, + "learning_rate": 2.6777068137409666e-05, + "loss": 0.0534, + "num_input_tokens_seen": 115735712, + "step": 95110 + }, + { + "epoch": 10.593050451052456, + "grad_norm": 0.05192878097295761, + "learning_rate": 2.6774644537433995e-05, + "loss": 0.002, + "num_input_tokens_seen": 115741888, + "step": 95115 + }, + { + "epoch": 10.593607305936073, + "grad_norm": 0.09798341989517212, + "learning_rate": 2.6772220920695383e-05, + "loss": 0.122, + "num_input_tokens_seen": 115748288, + "step": 95120 + }, + { + "epoch": 10.594164160819691, + "grad_norm": 0.18615972995758057, + "learning_rate": 2.676979728721673e-05, + "loss": 0.0517, + "num_input_tokens_seen": 115754400, + "step": 95125 + }, + { + "epoch": 10.594721015703307, + "grad_norm": 0.0005236372817307711, + "learning_rate": 2.676737363702092e-05, + "loss": 0.033, + "num_input_tokens_seen": 115760768, + "step": 95130 + }, + { + "epoch": 10.595277870586925, + "grad_norm": 1.2254929542541504, + "learning_rate": 2.676494997013085e-05, + "loss": 0.0766, + "num_input_tokens_seen": 115766816, + "step": 95135 + }, + { + "epoch": 10.595834725470542, + "grad_norm": 1.7854501008987427, + "learning_rate": 2.676252628656941e-05, + "loss": 0.0839, + "num_input_tokens_seen": 115772608, + "step": 95140 + }, + { + "epoch": 10.59639158035416, + "grad_norm": 2.812455892562866, + "learning_rate": 2.6760102586359503e-05, + "loss": 0.075, + "num_input_tokens_seen": 115778528, + "step": 95145 + }, + { + "epoch": 10.596948435237778, + "grad_norm": 0.018731795251369476, + "learning_rate": 2.6757678869524013e-05, + "loss": 0.0043, + "num_input_tokens_seen": 115784960, + "step": 95150 + }, + { + "epoch": 10.597505290121394, + "grad_norm": 0.07797203958034515, + "learning_rate": 2.6755255136085843e-05, + "loss": 0.0608, + "num_input_tokens_seen": 115791168, + "step": 95155 + }, + { + "epoch": 10.598062145005011, + "grad_norm": 0.0004820672329515219, + "learning_rate": 2.6752831386067874e-05, + "loss": 0.1184, + "num_input_tokens_seen": 115797472, + "step": 95160 + }, + { + "epoch": 10.598618999888629, + "grad_norm": 0.08359835296869278, + "learning_rate": 2.6750407619493016e-05, + "loss": 0.0481, + "num_input_tokens_seen": 115803360, + "step": 95165 + }, + { + "epoch": 10.599175854772247, + "grad_norm": 3.9429585933685303, + "learning_rate": 2.674798383638415e-05, + "loss": 0.1387, + "num_input_tokens_seen": 115809504, + "step": 95170 + }, + { + "epoch": 10.599732709655864, + "grad_norm": 0.0004496368346735835, + "learning_rate": 2.674556003676417e-05, + "loss": 0.1943, + "num_input_tokens_seen": 115815584, + "step": 95175 + }, + { + "epoch": 10.60028956453948, + "grad_norm": 0.004055694676935673, + "learning_rate": 2.6743136220655996e-05, + "loss": 0.0324, + "num_input_tokens_seen": 115821056, + "step": 95180 + }, + { + "epoch": 10.600846419423098, + "grad_norm": 0.6714714765548706, + "learning_rate": 2.674071238808249e-05, + "loss": 0.0701, + "num_input_tokens_seen": 115827008, + "step": 95185 + }, + { + "epoch": 10.601403274306715, + "grad_norm": 0.004143085330724716, + "learning_rate": 2.6738288539066565e-05, + "loss": 0.0355, + "num_input_tokens_seen": 115832992, + "step": 95190 + }, + { + "epoch": 10.601960129190333, + "grad_norm": 0.23409141600131989, + "learning_rate": 2.6735864673631107e-05, + "loss": 0.109, + "num_input_tokens_seen": 115838912, + "step": 95195 + }, + { + "epoch": 10.60251698407395, + "grad_norm": 0.3882797360420227, + "learning_rate": 2.673344079179902e-05, + "loss": 0.0359, + "num_input_tokens_seen": 115845088, + "step": 95200 + }, + { + "epoch": 10.603073838957567, + "grad_norm": 0.5580593943595886, + "learning_rate": 2.6731016893593196e-05, + "loss": 0.0825, + "num_input_tokens_seen": 115851136, + "step": 95205 + }, + { + "epoch": 10.603630693841184, + "grad_norm": 0.7703304886817932, + "learning_rate": 2.6728592979036532e-05, + "loss": 0.1067, + "num_input_tokens_seen": 115856992, + "step": 95210 + }, + { + "epoch": 10.604187548724802, + "grad_norm": 0.000854819139931351, + "learning_rate": 2.6726169048151923e-05, + "loss": 0.1141, + "num_input_tokens_seen": 115863136, + "step": 95215 + }, + { + "epoch": 10.60474440360842, + "grad_norm": 0.010838893242180347, + "learning_rate": 2.672374510096226e-05, + "loss": 0.0683, + "num_input_tokens_seen": 115869312, + "step": 95220 + }, + { + "epoch": 10.605301258492037, + "grad_norm": 0.0009989069076254964, + "learning_rate": 2.672132113749045e-05, + "loss": 0.0437, + "num_input_tokens_seen": 115875328, + "step": 95225 + }, + { + "epoch": 10.605858113375655, + "grad_norm": 0.0015979373129084706, + "learning_rate": 2.6718897157759376e-05, + "loss": 0.0674, + "num_input_tokens_seen": 115881184, + "step": 95230 + }, + { + "epoch": 10.606414968259271, + "grad_norm": 0.06920304149389267, + "learning_rate": 2.6716473161791943e-05, + "loss": 0.0202, + "num_input_tokens_seen": 115887104, + "step": 95235 + }, + { + "epoch": 10.606971823142889, + "grad_norm": 0.0006141412886790931, + "learning_rate": 2.671404914961105e-05, + "loss": 0.0014, + "num_input_tokens_seen": 115893184, + "step": 95240 + }, + { + "epoch": 10.607528678026506, + "grad_norm": 0.01215343363583088, + "learning_rate": 2.6711625121239582e-05, + "loss": 0.0274, + "num_input_tokens_seen": 115899424, + "step": 95245 + }, + { + "epoch": 10.608085532910124, + "grad_norm": 1.6622039079666138, + "learning_rate": 2.670920107670045e-05, + "loss": 0.0359, + "num_input_tokens_seen": 115905920, + "step": 95250 + }, + { + "epoch": 10.608642387793742, + "grad_norm": 0.023986585438251495, + "learning_rate": 2.6706777016016543e-05, + "loss": 0.0428, + "num_input_tokens_seen": 115911872, + "step": 95255 + }, + { + "epoch": 10.609199242677358, + "grad_norm": 0.3870657980442047, + "learning_rate": 2.670435293921076e-05, + "loss": 0.0928, + "num_input_tokens_seen": 115917248, + "step": 95260 + }, + { + "epoch": 10.609756097560975, + "grad_norm": 0.06089795008301735, + "learning_rate": 2.6701928846305996e-05, + "loss": 0.034, + "num_input_tokens_seen": 115923232, + "step": 95265 + }, + { + "epoch": 10.610312952444593, + "grad_norm": 0.08590230345726013, + "learning_rate": 2.6699504737325147e-05, + "loss": 0.0157, + "num_input_tokens_seen": 115928896, + "step": 95270 + }, + { + "epoch": 10.61086980732821, + "grad_norm": 2.163132429122925, + "learning_rate": 2.669708061229112e-05, + "loss": 0.0402, + "num_input_tokens_seen": 115935072, + "step": 95275 + }, + { + "epoch": 10.611426662211828, + "grad_norm": 0.0005821239319629967, + "learning_rate": 2.6694656471226807e-05, + "loss": 0.0065, + "num_input_tokens_seen": 115941440, + "step": 95280 + }, + { + "epoch": 10.611983517095444, + "grad_norm": 0.0013753704261034727, + "learning_rate": 2.6692232314155104e-05, + "loss": 0.11, + "num_input_tokens_seen": 115947232, + "step": 95285 + }, + { + "epoch": 10.612540371979062, + "grad_norm": 0.02538730762898922, + "learning_rate": 2.6689808141098916e-05, + "loss": 0.0236, + "num_input_tokens_seen": 115953504, + "step": 95290 + }, + { + "epoch": 10.61309722686268, + "grad_norm": 0.0448993444442749, + "learning_rate": 2.668738395208113e-05, + "loss": 0.0226, + "num_input_tokens_seen": 115960000, + "step": 95295 + }, + { + "epoch": 10.613654081746297, + "grad_norm": 0.6892030835151672, + "learning_rate": 2.6684959747124656e-05, + "loss": 0.0532, + "num_input_tokens_seen": 115966368, + "step": 95300 + }, + { + "epoch": 10.614210936629915, + "grad_norm": 0.08660288155078888, + "learning_rate": 2.6682535526252378e-05, + "loss": 0.0077, + "num_input_tokens_seen": 115972352, + "step": 95305 + }, + { + "epoch": 10.61476779151353, + "grad_norm": 0.0692540779709816, + "learning_rate": 2.6680111289487214e-05, + "loss": 0.0678, + "num_input_tokens_seen": 115978656, + "step": 95310 + }, + { + "epoch": 10.615324646397148, + "grad_norm": 0.40284812450408936, + "learning_rate": 2.6677687036852044e-05, + "loss": 0.0835, + "num_input_tokens_seen": 115984800, + "step": 95315 + }, + { + "epoch": 10.615881501280766, + "grad_norm": 0.06136724352836609, + "learning_rate": 2.667526276836978e-05, + "loss": 0.0057, + "num_input_tokens_seen": 115991104, + "step": 95320 + }, + { + "epoch": 10.616438356164384, + "grad_norm": 0.4179040789604187, + "learning_rate": 2.6672838484063317e-05, + "loss": 0.0332, + "num_input_tokens_seen": 115997472, + "step": 95325 + }, + { + "epoch": 10.616995211048001, + "grad_norm": 0.01919476129114628, + "learning_rate": 2.667041418395555e-05, + "loss": 0.0116, + "num_input_tokens_seen": 116002912, + "step": 95330 + }, + { + "epoch": 10.61755206593162, + "grad_norm": 0.9934719800949097, + "learning_rate": 2.6667989868069393e-05, + "loss": 0.0135, + "num_input_tokens_seen": 116009248, + "step": 95335 + }, + { + "epoch": 10.618108920815235, + "grad_norm": 0.02392452582716942, + "learning_rate": 2.666556553642773e-05, + "loss": 0.0063, + "num_input_tokens_seen": 116015200, + "step": 95340 + }, + { + "epoch": 10.618665775698853, + "grad_norm": 0.014195885509252548, + "learning_rate": 2.6663141189053466e-05, + "loss": 0.0201, + "num_input_tokens_seen": 116021216, + "step": 95345 + }, + { + "epoch": 10.61922263058247, + "grad_norm": 0.7180224657058716, + "learning_rate": 2.6660716825969495e-05, + "loss": 0.0442, + "num_input_tokens_seen": 116027200, + "step": 95350 + }, + { + "epoch": 10.619779485466088, + "grad_norm": 0.5772701501846313, + "learning_rate": 2.6658292447198725e-05, + "loss": 0.079, + "num_input_tokens_seen": 116033024, + "step": 95355 + }, + { + "epoch": 10.620336340349706, + "grad_norm": 0.018995480611920357, + "learning_rate": 2.665586805276406e-05, + "loss": 0.1615, + "num_input_tokens_seen": 116039040, + "step": 95360 + }, + { + "epoch": 10.620893195233322, + "grad_norm": 1.0540176630020142, + "learning_rate": 2.6653443642688392e-05, + "loss": 0.0881, + "num_input_tokens_seen": 116045024, + "step": 95365 + }, + { + "epoch": 10.62145005011694, + "grad_norm": 0.03887971118092537, + "learning_rate": 2.6651019216994626e-05, + "loss": 0.0383, + "num_input_tokens_seen": 116051232, + "step": 95370 + }, + { + "epoch": 10.622006905000557, + "grad_norm": 0.13341312110424042, + "learning_rate": 2.6648594775705656e-05, + "loss": 0.0214, + "num_input_tokens_seen": 116057472, + "step": 95375 + }, + { + "epoch": 10.622563759884175, + "grad_norm": 0.0019727356266230345, + "learning_rate": 2.6646170318844388e-05, + "loss": 0.0256, + "num_input_tokens_seen": 116063552, + "step": 95380 + }, + { + "epoch": 10.623120614767792, + "grad_norm": 0.000306386238662526, + "learning_rate": 2.6643745846433726e-05, + "loss": 0.0141, + "num_input_tokens_seen": 116069632, + "step": 95385 + }, + { + "epoch": 10.623677469651408, + "grad_norm": 0.010014000348746777, + "learning_rate": 2.6641321358496567e-05, + "loss": 0.0161, + "num_input_tokens_seen": 116075744, + "step": 95390 + }, + { + "epoch": 10.624234324535026, + "grad_norm": 0.007178908679634333, + "learning_rate": 2.663889685505582e-05, + "loss": 0.0038, + "num_input_tokens_seen": 116081568, + "step": 95395 + }, + { + "epoch": 10.624791179418644, + "grad_norm": 0.12007023394107819, + "learning_rate": 2.6636472336134368e-05, + "loss": 0.1223, + "num_input_tokens_seen": 116088064, + "step": 95400 + }, + { + "epoch": 10.625348034302261, + "grad_norm": 0.0415482334792614, + "learning_rate": 2.6634047801755124e-05, + "loss": 0.0901, + "num_input_tokens_seen": 116093792, + "step": 95405 + }, + { + "epoch": 10.625904889185879, + "grad_norm": 0.08314427733421326, + "learning_rate": 2.663162325194099e-05, + "loss": 0.071, + "num_input_tokens_seen": 116100160, + "step": 95410 + }, + { + "epoch": 10.626461744069495, + "grad_norm": 0.41500231623649597, + "learning_rate": 2.662919868671487e-05, + "loss": 0.0248, + "num_input_tokens_seen": 116106176, + "step": 95415 + }, + { + "epoch": 10.627018598953113, + "grad_norm": 0.032734449952840805, + "learning_rate": 2.662677410609966e-05, + "loss": 0.0903, + "num_input_tokens_seen": 116112448, + "step": 95420 + }, + { + "epoch": 10.62757545383673, + "grad_norm": 0.8433188199996948, + "learning_rate": 2.662434951011827e-05, + "loss": 0.0404, + "num_input_tokens_seen": 116118432, + "step": 95425 + }, + { + "epoch": 10.628132308720348, + "grad_norm": 0.1284695565700531, + "learning_rate": 2.6621924898793598e-05, + "loss": 0.0226, + "num_input_tokens_seen": 116124384, + "step": 95430 + }, + { + "epoch": 10.628689163603966, + "grad_norm": 0.00018754079064819962, + "learning_rate": 2.661950027214854e-05, + "loss": 0.0217, + "num_input_tokens_seen": 116130624, + "step": 95435 + }, + { + "epoch": 10.629246018487581, + "grad_norm": 0.5820909738540649, + "learning_rate": 2.6617075630206012e-05, + "loss": 0.0233, + "num_input_tokens_seen": 116136896, + "step": 95440 + }, + { + "epoch": 10.6298028733712, + "grad_norm": 0.1060333326458931, + "learning_rate": 2.6614650972988902e-05, + "loss": 0.0154, + "num_input_tokens_seen": 116143296, + "step": 95445 + }, + { + "epoch": 10.630359728254817, + "grad_norm": 0.020025676116347313, + "learning_rate": 2.6612226300520117e-05, + "loss": 0.0276, + "num_input_tokens_seen": 116149728, + "step": 95450 + }, + { + "epoch": 10.630916583138434, + "grad_norm": 0.0027576524298638105, + "learning_rate": 2.660980161282257e-05, + "loss": 0.0366, + "num_input_tokens_seen": 116155936, + "step": 95455 + }, + { + "epoch": 10.631473438022052, + "grad_norm": 0.05050155892968178, + "learning_rate": 2.6607376909919157e-05, + "loss": 0.0285, + "num_input_tokens_seen": 116162208, + "step": 95460 + }, + { + "epoch": 10.632030292905668, + "grad_norm": 1.2127153873443604, + "learning_rate": 2.6604952191832782e-05, + "loss": 0.0483, + "num_input_tokens_seen": 116168224, + "step": 95465 + }, + { + "epoch": 10.632587147789286, + "grad_norm": 0.0099617475643754, + "learning_rate": 2.660252745858634e-05, + "loss": 0.0985, + "num_input_tokens_seen": 116173760, + "step": 95470 + }, + { + "epoch": 10.633144002672903, + "grad_norm": 1.3919464349746704, + "learning_rate": 2.6600102710202745e-05, + "loss": 0.1177, + "num_input_tokens_seen": 116179680, + "step": 95475 + }, + { + "epoch": 10.633700857556521, + "grad_norm": 2.0349621772766113, + "learning_rate": 2.6597677946704908e-05, + "loss": 0.025, + "num_input_tokens_seen": 116185920, + "step": 95480 + }, + { + "epoch": 10.634257712440139, + "grad_norm": 0.021768808364868164, + "learning_rate": 2.659525316811571e-05, + "loss": 0.0705, + "num_input_tokens_seen": 116192000, + "step": 95485 + }, + { + "epoch": 10.634814567323755, + "grad_norm": 0.00039449954056181014, + "learning_rate": 2.6592828374458077e-05, + "loss": 0.0004, + "num_input_tokens_seen": 116198272, + "step": 95490 + }, + { + "epoch": 10.635371422207372, + "grad_norm": 0.4828765392303467, + "learning_rate": 2.6590403565754895e-05, + "loss": 0.031, + "num_input_tokens_seen": 116204480, + "step": 95495 + }, + { + "epoch": 10.63592827709099, + "grad_norm": 0.0757300853729248, + "learning_rate": 2.6587978742029085e-05, + "loss": 0.0115, + "num_input_tokens_seen": 116210624, + "step": 95500 + }, + { + "epoch": 10.636485131974608, + "grad_norm": 0.011671925894916058, + "learning_rate": 2.6585553903303538e-05, + "loss": 0.0254, + "num_input_tokens_seen": 116216672, + "step": 95505 + }, + { + "epoch": 10.637041986858225, + "grad_norm": 0.618148922920227, + "learning_rate": 2.6583129049601168e-05, + "loss": 0.0919, + "num_input_tokens_seen": 116223040, + "step": 95510 + }, + { + "epoch": 10.637598841741841, + "grad_norm": 9.106989455176517e-05, + "learning_rate": 2.6580704180944878e-05, + "loss": 0.0023, + "num_input_tokens_seen": 116229376, + "step": 95515 + }, + { + "epoch": 10.638155696625459, + "grad_norm": 0.00020327525271568447, + "learning_rate": 2.6578279297357562e-05, + "loss": 0.1373, + "num_input_tokens_seen": 116235168, + "step": 95520 + }, + { + "epoch": 10.638712551509077, + "grad_norm": 0.018088338896632195, + "learning_rate": 2.6575854398862145e-05, + "loss": 0.0838, + "num_input_tokens_seen": 116241280, + "step": 95525 + }, + { + "epoch": 10.639269406392694, + "grad_norm": 0.023843752220273018, + "learning_rate": 2.657342948548151e-05, + "loss": 0.0463, + "num_input_tokens_seen": 116247360, + "step": 95530 + }, + { + "epoch": 10.639826261276312, + "grad_norm": 0.0001977149659069255, + "learning_rate": 2.6571004557238576e-05, + "loss": 0.1202, + "num_input_tokens_seen": 116253600, + "step": 95535 + }, + { + "epoch": 10.640383116159928, + "grad_norm": 1.4555639028549194, + "learning_rate": 2.656857961415625e-05, + "loss": 0.0499, + "num_input_tokens_seen": 116259648, + "step": 95540 + }, + { + "epoch": 10.640939971043545, + "grad_norm": 0.16315221786499023, + "learning_rate": 2.6566154656257425e-05, + "loss": 0.0438, + "num_input_tokens_seen": 116266048, + "step": 95545 + }, + { + "epoch": 10.641496825927163, + "grad_norm": 1.722528100013733, + "learning_rate": 2.6563729683565018e-05, + "loss": 0.0488, + "num_input_tokens_seen": 116272352, + "step": 95550 + }, + { + "epoch": 10.64205368081078, + "grad_norm": 0.00020980006956961006, + "learning_rate": 2.656130469610193e-05, + "loss": 0.0098, + "num_input_tokens_seen": 116278656, + "step": 95555 + }, + { + "epoch": 10.642610535694399, + "grad_norm": 0.012548287399113178, + "learning_rate": 2.6558879693891074e-05, + "loss": 0.0008, + "num_input_tokens_seen": 116284992, + "step": 95560 + }, + { + "epoch": 10.643167390578016, + "grad_norm": 0.004949226975440979, + "learning_rate": 2.6556454676955345e-05, + "loss": 0.0591, + "num_input_tokens_seen": 116291072, + "step": 95565 + }, + { + "epoch": 10.643724245461632, + "grad_norm": 1.2019472122192383, + "learning_rate": 2.6554029645317658e-05, + "loss": 0.0172, + "num_input_tokens_seen": 116297120, + "step": 95570 + }, + { + "epoch": 10.64428110034525, + "grad_norm": 0.019421076402068138, + "learning_rate": 2.6551604599000913e-05, + "loss": 0.0888, + "num_input_tokens_seen": 116303104, + "step": 95575 + }, + { + "epoch": 10.644837955228867, + "grad_norm": 0.11168977618217468, + "learning_rate": 2.654917953802802e-05, + "loss": 0.0891, + "num_input_tokens_seen": 116308992, + "step": 95580 + }, + { + "epoch": 10.645394810112485, + "grad_norm": 0.07592448592185974, + "learning_rate": 2.6546754462421885e-05, + "loss": 0.0822, + "num_input_tokens_seen": 116314912, + "step": 95585 + }, + { + "epoch": 10.645951664996103, + "grad_norm": 0.8005033135414124, + "learning_rate": 2.6544329372205412e-05, + "loss": 0.0098, + "num_input_tokens_seen": 116320864, + "step": 95590 + }, + { + "epoch": 10.646508519879719, + "grad_norm": 0.05886869877576828, + "learning_rate": 2.6541904267401517e-05, + "loss": 0.0412, + "num_input_tokens_seen": 116326912, + "step": 95595 + }, + { + "epoch": 10.647065374763336, + "grad_norm": 1.7452192306518555, + "learning_rate": 2.6539479148033097e-05, + "loss": 0.0733, + "num_input_tokens_seen": 116332928, + "step": 95600 + }, + { + "epoch": 10.647622229646954, + "grad_norm": 0.008648917078971863, + "learning_rate": 2.653705401412306e-05, + "loss": 0.0289, + "num_input_tokens_seen": 116339104, + "step": 95605 + }, + { + "epoch": 10.648179084530572, + "grad_norm": 0.05231565609574318, + "learning_rate": 2.653462886569432e-05, + "loss": 0.0289, + "num_input_tokens_seen": 116345408, + "step": 95610 + }, + { + "epoch": 10.64873593941419, + "grad_norm": 3.219935178756714, + "learning_rate": 2.653220370276978e-05, + "loss": 0.0551, + "num_input_tokens_seen": 116350944, + "step": 95615 + }, + { + "epoch": 10.649292794297805, + "grad_norm": 0.3693287968635559, + "learning_rate": 2.6529778525372356e-05, + "loss": 0.0715, + "num_input_tokens_seen": 116357600, + "step": 95620 + }, + { + "epoch": 10.649849649181423, + "grad_norm": 0.0036380307283252478, + "learning_rate": 2.652735333352494e-05, + "loss": 0.0561, + "num_input_tokens_seen": 116363872, + "step": 95625 + }, + { + "epoch": 10.65040650406504, + "grad_norm": 0.06867681443691254, + "learning_rate": 2.6524928127250455e-05, + "loss": 0.0532, + "num_input_tokens_seen": 116369888, + "step": 95630 + }, + { + "epoch": 10.650963358948658, + "grad_norm": 0.8931956887245178, + "learning_rate": 2.652250290657179e-05, + "loss": 0.0723, + "num_input_tokens_seen": 116375840, + "step": 95635 + }, + { + "epoch": 10.651520213832276, + "grad_norm": 3.652222156524658, + "learning_rate": 2.6520077671511872e-05, + "loss": 0.0629, + "num_input_tokens_seen": 116381696, + "step": 95640 + }, + { + "epoch": 10.652077068715892, + "grad_norm": 0.0054188137874007225, + "learning_rate": 2.6517652422093605e-05, + "loss": 0.04, + "num_input_tokens_seen": 116388064, + "step": 95645 + }, + { + "epoch": 10.65263392359951, + "grad_norm": 0.0015822274144738913, + "learning_rate": 2.651522715833989e-05, + "loss": 0.1413, + "num_input_tokens_seen": 116394048, + "step": 95650 + }, + { + "epoch": 10.653190778483127, + "grad_norm": 0.39808139204978943, + "learning_rate": 2.6512801880273648e-05, + "loss": 0.0857, + "num_input_tokens_seen": 116399808, + "step": 95655 + }, + { + "epoch": 10.653747633366745, + "grad_norm": 0.4214056432247162, + "learning_rate": 2.6510376587917773e-05, + "loss": 0.0191, + "num_input_tokens_seen": 116406016, + "step": 95660 + }, + { + "epoch": 10.654304488250363, + "grad_norm": 1.0837849378585815, + "learning_rate": 2.6507951281295183e-05, + "loss": 0.1094, + "num_input_tokens_seen": 116411712, + "step": 95665 + }, + { + "epoch": 10.654861343133978, + "grad_norm": 0.4985491633415222, + "learning_rate": 2.6505525960428786e-05, + "loss": 0.0567, + "num_input_tokens_seen": 116417984, + "step": 95670 + }, + { + "epoch": 10.655418198017596, + "grad_norm": 0.6406513452529907, + "learning_rate": 2.6503100625341488e-05, + "loss": 0.0402, + "num_input_tokens_seen": 116424416, + "step": 95675 + }, + { + "epoch": 10.655975052901214, + "grad_norm": 0.002087271073833108, + "learning_rate": 2.6500675276056203e-05, + "loss": 0.0796, + "num_input_tokens_seen": 116430144, + "step": 95680 + }, + { + "epoch": 10.656531907784832, + "grad_norm": 0.00021040918363723904, + "learning_rate": 2.6498249912595836e-05, + "loss": 0.0563, + "num_input_tokens_seen": 116436672, + "step": 95685 + }, + { + "epoch": 10.65708876266845, + "grad_norm": 0.1301773488521576, + "learning_rate": 2.64958245349833e-05, + "loss": 0.011, + "num_input_tokens_seen": 116442720, + "step": 95690 + }, + { + "epoch": 10.657645617552067, + "grad_norm": 0.9951856136322021, + "learning_rate": 2.6493399143241505e-05, + "loss": 0.0525, + "num_input_tokens_seen": 116448640, + "step": 95695 + }, + { + "epoch": 10.658202472435683, + "grad_norm": 1.1014657020568848, + "learning_rate": 2.649097373739335e-05, + "loss": 0.0343, + "num_input_tokens_seen": 116454752, + "step": 95700 + }, + { + "epoch": 10.6587593273193, + "grad_norm": 0.6921049356460571, + "learning_rate": 2.6488548317461766e-05, + "loss": 0.0214, + "num_input_tokens_seen": 116460768, + "step": 95705 + }, + { + "epoch": 10.659316182202918, + "grad_norm": 0.939407467842102, + "learning_rate": 2.648612288346964e-05, + "loss": 0.0888, + "num_input_tokens_seen": 116466848, + "step": 95710 + }, + { + "epoch": 10.659873037086536, + "grad_norm": 0.10369844734668732, + "learning_rate": 2.6483697435439896e-05, + "loss": 0.1675, + "num_input_tokens_seen": 116473184, + "step": 95715 + }, + { + "epoch": 10.660429891970153, + "grad_norm": 0.15807946026325226, + "learning_rate": 2.6481271973395437e-05, + "loss": 0.0417, + "num_input_tokens_seen": 116479232, + "step": 95720 + }, + { + "epoch": 10.66098674685377, + "grad_norm": 3.2379069328308105, + "learning_rate": 2.6478846497359184e-05, + "loss": 0.0878, + "num_input_tokens_seen": 116485344, + "step": 95725 + }, + { + "epoch": 10.661543601737387, + "grad_norm": 3.149221181869507, + "learning_rate": 2.6476421007354042e-05, + "loss": 0.125, + "num_input_tokens_seen": 116491936, + "step": 95730 + }, + { + "epoch": 10.662100456621005, + "grad_norm": 0.22729116678237915, + "learning_rate": 2.647399550340291e-05, + "loss": 0.0533, + "num_input_tokens_seen": 116497984, + "step": 95735 + }, + { + "epoch": 10.662657311504622, + "grad_norm": 0.1425083875656128, + "learning_rate": 2.647156998552872e-05, + "loss": 0.0092, + "num_input_tokens_seen": 116503840, + "step": 95740 + }, + { + "epoch": 10.66321416638824, + "grad_norm": 0.052004653960466385, + "learning_rate": 2.6469144453754368e-05, + "loss": 0.1338, + "num_input_tokens_seen": 116509248, + "step": 95745 + }, + { + "epoch": 10.663771021271856, + "grad_norm": 0.9133530855178833, + "learning_rate": 2.6466718908102774e-05, + "loss": 0.0447, + "num_input_tokens_seen": 116515168, + "step": 95750 + }, + { + "epoch": 10.664327876155474, + "grad_norm": 0.7269521355628967, + "learning_rate": 2.6464293348596837e-05, + "loss": 0.056, + "num_input_tokens_seen": 116521440, + "step": 95755 + }, + { + "epoch": 10.664884731039091, + "grad_norm": 0.6388680934906006, + "learning_rate": 2.646186777525948e-05, + "loss": 0.0318, + "num_input_tokens_seen": 116527648, + "step": 95760 + }, + { + "epoch": 10.665441585922709, + "grad_norm": 0.8068234324455261, + "learning_rate": 2.6459442188113615e-05, + "loss": 0.067, + "num_input_tokens_seen": 116533760, + "step": 95765 + }, + { + "epoch": 10.665998440806327, + "grad_norm": 0.013526265509426594, + "learning_rate": 2.6457016587182143e-05, + "loss": 0.0782, + "num_input_tokens_seen": 116540128, + "step": 95770 + }, + { + "epoch": 10.666555295689943, + "grad_norm": 0.15423692762851715, + "learning_rate": 2.6454590972487985e-05, + "loss": 0.0426, + "num_input_tokens_seen": 116546496, + "step": 95775 + }, + { + "epoch": 10.66711215057356, + "grad_norm": 0.3818698525428772, + "learning_rate": 2.6452165344054048e-05, + "loss": 0.0114, + "num_input_tokens_seen": 116552096, + "step": 95780 + }, + { + "epoch": 10.667669005457178, + "grad_norm": 0.1197868213057518, + "learning_rate": 2.6449739701903242e-05, + "loss": 0.0152, + "num_input_tokens_seen": 116558528, + "step": 95785 + }, + { + "epoch": 10.668225860340796, + "grad_norm": 0.00587842334061861, + "learning_rate": 2.6447314046058492e-05, + "loss": 0.0773, + "num_input_tokens_seen": 116564576, + "step": 95790 + }, + { + "epoch": 10.668782715224413, + "grad_norm": 0.004853836260735989, + "learning_rate": 2.6444888376542698e-05, + "loss": 0.0112, + "num_input_tokens_seen": 116570848, + "step": 95795 + }, + { + "epoch": 10.66933957010803, + "grad_norm": 0.1583271026611328, + "learning_rate": 2.6442462693378778e-05, + "loss": 0.0081, + "num_input_tokens_seen": 116576864, + "step": 95800 + }, + { + "epoch": 10.669896424991647, + "grad_norm": 0.03004547767341137, + "learning_rate": 2.6440036996589634e-05, + "loss": 0.0575, + "num_input_tokens_seen": 116583040, + "step": 95805 + }, + { + "epoch": 10.670453279875264, + "grad_norm": 0.009037344716489315, + "learning_rate": 2.64376112861982e-05, + "loss": 0.0239, + "num_input_tokens_seen": 116589088, + "step": 95810 + }, + { + "epoch": 10.671010134758882, + "grad_norm": 0.026114484295248985, + "learning_rate": 2.643518556222736e-05, + "loss": 0.0044, + "num_input_tokens_seen": 116595328, + "step": 95815 + }, + { + "epoch": 10.6715669896425, + "grad_norm": 1.8355015516281128, + "learning_rate": 2.643275982470005e-05, + "loss": 0.1109, + "num_input_tokens_seen": 116601536, + "step": 95820 + }, + { + "epoch": 10.672123844526116, + "grad_norm": 1.616241455078125, + "learning_rate": 2.643033407363918e-05, + "loss": 0.0662, + "num_input_tokens_seen": 116607168, + "step": 95825 + }, + { + "epoch": 10.672680699409733, + "grad_norm": 0.701790988445282, + "learning_rate": 2.6427908309067652e-05, + "loss": 0.1003, + "num_input_tokens_seen": 116613664, + "step": 95830 + }, + { + "epoch": 10.673237554293351, + "grad_norm": 0.31438907980918884, + "learning_rate": 2.6425482531008387e-05, + "loss": 0.0165, + "num_input_tokens_seen": 116619776, + "step": 95835 + }, + { + "epoch": 10.673794409176969, + "grad_norm": 0.33694761991500854, + "learning_rate": 2.6423056739484297e-05, + "loss": 0.0332, + "num_input_tokens_seen": 116625856, + "step": 95840 + }, + { + "epoch": 10.674351264060586, + "grad_norm": 0.8427005410194397, + "learning_rate": 2.6420630934518303e-05, + "loss": 0.0827, + "num_input_tokens_seen": 116632064, + "step": 95845 + }, + { + "epoch": 10.674908118944202, + "grad_norm": 0.11322581768035889, + "learning_rate": 2.6418205116133304e-05, + "loss": 0.0479, + "num_input_tokens_seen": 116638496, + "step": 95850 + }, + { + "epoch": 10.67546497382782, + "grad_norm": 0.5633811950683594, + "learning_rate": 2.641577928435222e-05, + "loss": 0.0696, + "num_input_tokens_seen": 116644640, + "step": 95855 + }, + { + "epoch": 10.676021828711438, + "grad_norm": 0.10912857949733734, + "learning_rate": 2.6413353439197976e-05, + "loss": 0.0077, + "num_input_tokens_seen": 116650688, + "step": 95860 + }, + { + "epoch": 10.676578683595055, + "grad_norm": 0.12119729071855545, + "learning_rate": 2.6410927580693468e-05, + "loss": 0.0051, + "num_input_tokens_seen": 116656800, + "step": 95865 + }, + { + "epoch": 10.677135538478673, + "grad_norm": 0.20749396085739136, + "learning_rate": 2.640850170886163e-05, + "loss": 0.0928, + "num_input_tokens_seen": 116662432, + "step": 95870 + }, + { + "epoch": 10.677692393362289, + "grad_norm": 0.858246386051178, + "learning_rate": 2.6406075823725344e-05, + "loss": 0.0192, + "num_input_tokens_seen": 116668576, + "step": 95875 + }, + { + "epoch": 10.678249248245907, + "grad_norm": 0.15209443867206573, + "learning_rate": 2.640364992530756e-05, + "loss": 0.0159, + "num_input_tokens_seen": 116674944, + "step": 95880 + }, + { + "epoch": 10.678806103129524, + "grad_norm": 0.09137583523988724, + "learning_rate": 2.6401224013631175e-05, + "loss": 0.0089, + "num_input_tokens_seen": 116680832, + "step": 95885 + }, + { + "epoch": 10.679362958013142, + "grad_norm": 0.26789626479148865, + "learning_rate": 2.6398798088719105e-05, + "loss": 0.0083, + "num_input_tokens_seen": 116686752, + "step": 95890 + }, + { + "epoch": 10.67991981289676, + "grad_norm": 0.8932722210884094, + "learning_rate": 2.6396372150594273e-05, + "loss": 0.0998, + "num_input_tokens_seen": 116692448, + "step": 95895 + }, + { + "epoch": 10.680476667780376, + "grad_norm": 1.9419450759887695, + "learning_rate": 2.639394619927958e-05, + "loss": 0.0239, + "num_input_tokens_seen": 116698528, + "step": 95900 + }, + { + "epoch": 10.681033522663993, + "grad_norm": 1.2023972272872925, + "learning_rate": 2.6391520234797958e-05, + "loss": 0.0129, + "num_input_tokens_seen": 116704800, + "step": 95905 + }, + { + "epoch": 10.68159037754761, + "grad_norm": 0.0222818274050951, + "learning_rate": 2.63890942571723e-05, + "loss": 0.124, + "num_input_tokens_seen": 116711072, + "step": 95910 + }, + { + "epoch": 10.682147232431229, + "grad_norm": 0.01684156432747841, + "learning_rate": 2.6386668266425535e-05, + "loss": 0.0321, + "num_input_tokens_seen": 116717344, + "step": 95915 + }, + { + "epoch": 10.682704087314846, + "grad_norm": 0.0017687490908429027, + "learning_rate": 2.6384242262580582e-05, + "loss": 0.0214, + "num_input_tokens_seen": 116723520, + "step": 95920 + }, + { + "epoch": 10.683260942198464, + "grad_norm": 0.6002137064933777, + "learning_rate": 2.638181624566035e-05, + "loss": 0.1608, + "num_input_tokens_seen": 116729728, + "step": 95925 + }, + { + "epoch": 10.68381779708208, + "grad_norm": 0.25407877564430237, + "learning_rate": 2.6379390215687764e-05, + "loss": 0.0479, + "num_input_tokens_seen": 116735264, + "step": 95930 + }, + { + "epoch": 10.684374651965697, + "grad_norm": 0.9916078448295593, + "learning_rate": 2.6376964172685725e-05, + "loss": 0.0106, + "num_input_tokens_seen": 116741344, + "step": 95935 + }, + { + "epoch": 10.684931506849315, + "grad_norm": 0.0008480013348162174, + "learning_rate": 2.6374538116677162e-05, + "loss": 0.0253, + "num_input_tokens_seen": 116747488, + "step": 95940 + }, + { + "epoch": 10.685488361732933, + "grad_norm": 0.738086998462677, + "learning_rate": 2.637211204768497e-05, + "loss": 0.1452, + "num_input_tokens_seen": 116753344, + "step": 95945 + }, + { + "epoch": 10.68604521661655, + "grad_norm": 0.8171157836914062, + "learning_rate": 2.6369685965732094e-05, + "loss": 0.0535, + "num_input_tokens_seen": 116759584, + "step": 95950 + }, + { + "epoch": 10.686602071500166, + "grad_norm": 0.0007025257218629122, + "learning_rate": 2.6367259870841436e-05, + "loss": 0.0782, + "num_input_tokens_seen": 116765792, + "step": 95955 + }, + { + "epoch": 10.687158926383784, + "grad_norm": 0.10480006784200668, + "learning_rate": 2.6364833763035908e-05, + "loss": 0.0973, + "num_input_tokens_seen": 116771872, + "step": 95960 + }, + { + "epoch": 10.687715781267402, + "grad_norm": 0.6522805690765381, + "learning_rate": 2.636240764233844e-05, + "loss": 0.0774, + "num_input_tokens_seen": 116777984, + "step": 95965 + }, + { + "epoch": 10.68827263615102, + "grad_norm": 0.3895639479160309, + "learning_rate": 2.6359981508771932e-05, + "loss": 0.0259, + "num_input_tokens_seen": 116783680, + "step": 95970 + }, + { + "epoch": 10.688829491034637, + "grad_norm": 0.023403655737638474, + "learning_rate": 2.635755536235931e-05, + "loss": 0.0048, + "num_input_tokens_seen": 116789824, + "step": 95975 + }, + { + "epoch": 10.689386345918253, + "grad_norm": 1.3485430479049683, + "learning_rate": 2.6355129203123492e-05, + "loss": 0.186, + "num_input_tokens_seen": 116795680, + "step": 95980 + }, + { + "epoch": 10.68994320080187, + "grad_norm": 0.02974516712129116, + "learning_rate": 2.635270303108739e-05, + "loss": 0.0018, + "num_input_tokens_seen": 116801984, + "step": 95985 + }, + { + "epoch": 10.690500055685488, + "grad_norm": 2.8799405097961426, + "learning_rate": 2.6350276846273926e-05, + "loss": 0.1234, + "num_input_tokens_seen": 116808352, + "step": 95990 + }, + { + "epoch": 10.691056910569106, + "grad_norm": 0.000279969914117828, + "learning_rate": 2.6347850648706012e-05, + "loss": 0.0767, + "num_input_tokens_seen": 116814240, + "step": 95995 + }, + { + "epoch": 10.691613765452724, + "grad_norm": 0.993374228477478, + "learning_rate": 2.634542443840658e-05, + "loss": 0.0954, + "num_input_tokens_seen": 116820288, + "step": 96000 + }, + { + "epoch": 10.69217062033634, + "grad_norm": 0.02770918421447277, + "learning_rate": 2.6342998215398523e-05, + "loss": 0.0012, + "num_input_tokens_seen": 116826400, + "step": 96005 + }, + { + "epoch": 10.692727475219957, + "grad_norm": 1.0734509229660034, + "learning_rate": 2.6340571979704775e-05, + "loss": 0.0939, + "num_input_tokens_seen": 116832672, + "step": 96010 + }, + { + "epoch": 10.693284330103575, + "grad_norm": 0.06822116672992706, + "learning_rate": 2.6338145731348258e-05, + "loss": 0.0192, + "num_input_tokens_seen": 116838912, + "step": 96015 + }, + { + "epoch": 10.693841184987193, + "grad_norm": 2.3272950649261475, + "learning_rate": 2.6335719470351878e-05, + "loss": 0.0795, + "num_input_tokens_seen": 116844256, + "step": 96020 + }, + { + "epoch": 10.69439803987081, + "grad_norm": 0.8837109804153442, + "learning_rate": 2.6333293196738555e-05, + "loss": 0.0193, + "num_input_tokens_seen": 116850656, + "step": 96025 + }, + { + "epoch": 10.694954894754426, + "grad_norm": 0.29279857873916626, + "learning_rate": 2.633086691053121e-05, + "loss": 0.0537, + "num_input_tokens_seen": 116856512, + "step": 96030 + }, + { + "epoch": 10.695511749638044, + "grad_norm": 1.0045922994613647, + "learning_rate": 2.6328440611752768e-05, + "loss": 0.1341, + "num_input_tokens_seen": 116862080, + "step": 96035 + }, + { + "epoch": 10.696068604521662, + "grad_norm": 1.7507927417755127, + "learning_rate": 2.6326014300426134e-05, + "loss": 0.058, + "num_input_tokens_seen": 116868192, + "step": 96040 + }, + { + "epoch": 10.69662545940528, + "grad_norm": 0.10671639442443848, + "learning_rate": 2.6323587976574227e-05, + "loss": 0.076, + "num_input_tokens_seen": 116874368, + "step": 96045 + }, + { + "epoch": 10.697182314288897, + "grad_norm": 0.2731306552886963, + "learning_rate": 2.632116164021998e-05, + "loss": 0.1074, + "num_input_tokens_seen": 116880288, + "step": 96050 + }, + { + "epoch": 10.697739169172515, + "grad_norm": 0.014314951375126839, + "learning_rate": 2.6318735291386298e-05, + "loss": 0.0406, + "num_input_tokens_seen": 116886400, + "step": 96055 + }, + { + "epoch": 10.69829602405613, + "grad_norm": 0.1044955775141716, + "learning_rate": 2.631630893009611e-05, + "loss": 0.0082, + "num_input_tokens_seen": 116892288, + "step": 96060 + }, + { + "epoch": 10.698852878939748, + "grad_norm": 0.15553973615169525, + "learning_rate": 2.6313882556372327e-05, + "loss": 0.0411, + "num_input_tokens_seen": 116898368, + "step": 96065 + }, + { + "epoch": 10.699409733823366, + "grad_norm": 0.10421215742826462, + "learning_rate": 2.6311456170237868e-05, + "loss": 0.0271, + "num_input_tokens_seen": 116904448, + "step": 96070 + }, + { + "epoch": 10.699966588706983, + "grad_norm": 0.5732226967811584, + "learning_rate": 2.630902977171566e-05, + "loss": 0.0642, + "num_input_tokens_seen": 116910592, + "step": 96075 + }, + { + "epoch": 10.700523443590601, + "grad_norm": 0.001421054475940764, + "learning_rate": 2.6306603360828607e-05, + "loss": 0.0042, + "num_input_tokens_seen": 116916960, + "step": 96080 + }, + { + "epoch": 10.701080298474217, + "grad_norm": 0.017018403857946396, + "learning_rate": 2.630417693759965e-05, + "loss": 0.0113, + "num_input_tokens_seen": 116923104, + "step": 96085 + }, + { + "epoch": 10.701637153357835, + "grad_norm": 1.7430311441421509, + "learning_rate": 2.630175050205169e-05, + "loss": 0.1165, + "num_input_tokens_seen": 116928928, + "step": 96090 + }, + { + "epoch": 10.702194008241452, + "grad_norm": 1.1666738986968994, + "learning_rate": 2.629932405420766e-05, + "loss": 0.1142, + "num_input_tokens_seen": 116934944, + "step": 96095 + }, + { + "epoch": 10.70275086312507, + "grad_norm": 0.9216097593307495, + "learning_rate": 2.629689759409047e-05, + "loss": 0.0962, + "num_input_tokens_seen": 116940896, + "step": 96100 + }, + { + "epoch": 10.703307718008688, + "grad_norm": 0.29528769850730896, + "learning_rate": 2.629447112172304e-05, + "loss": 0.0097, + "num_input_tokens_seen": 116947264, + "step": 96105 + }, + { + "epoch": 10.703864572892304, + "grad_norm": 0.8160899877548218, + "learning_rate": 2.62920446371283e-05, + "loss": 0.0532, + "num_input_tokens_seen": 116952800, + "step": 96110 + }, + { + "epoch": 10.704421427775921, + "grad_norm": 0.01061225775629282, + "learning_rate": 2.6289618140329154e-05, + "loss": 0.0012, + "num_input_tokens_seen": 116958880, + "step": 96115 + }, + { + "epoch": 10.704978282659539, + "grad_norm": 0.2535746097564697, + "learning_rate": 2.628719163134854e-05, + "loss": 0.0106, + "num_input_tokens_seen": 116965024, + "step": 96120 + }, + { + "epoch": 10.705535137543157, + "grad_norm": 0.008375597186386585, + "learning_rate": 2.6284765110209365e-05, + "loss": 0.0316, + "num_input_tokens_seen": 116970912, + "step": 96125 + }, + { + "epoch": 10.706091992426774, + "grad_norm": 0.00014006670971866697, + "learning_rate": 2.6282338576934552e-05, + "loss": 0.0106, + "num_input_tokens_seen": 116977024, + "step": 96130 + }, + { + "epoch": 10.70664884731039, + "grad_norm": 0.07744605839252472, + "learning_rate": 2.6279912031547026e-05, + "loss": 0.0894, + "num_input_tokens_seen": 116983008, + "step": 96135 + }, + { + "epoch": 10.707205702194008, + "grad_norm": 0.4585436284542084, + "learning_rate": 2.6277485474069707e-05, + "loss": 0.1309, + "num_input_tokens_seen": 116989024, + "step": 96140 + }, + { + "epoch": 10.707762557077626, + "grad_norm": 0.3674272298812866, + "learning_rate": 2.6275058904525512e-05, + "loss": 0.0706, + "num_input_tokens_seen": 116994880, + "step": 96145 + }, + { + "epoch": 10.708319411961243, + "grad_norm": 0.00371662899851799, + "learning_rate": 2.6272632322937363e-05, + "loss": 0.0723, + "num_input_tokens_seen": 117000960, + "step": 96150 + }, + { + "epoch": 10.708876266844861, + "grad_norm": 0.031488966196775436, + "learning_rate": 2.6270205729328183e-05, + "loss": 0.0051, + "num_input_tokens_seen": 117006976, + "step": 96155 + }, + { + "epoch": 10.709433121728477, + "grad_norm": 0.5072953701019287, + "learning_rate": 2.6267779123720897e-05, + "loss": 0.0685, + "num_input_tokens_seen": 117013024, + "step": 96160 + }, + { + "epoch": 10.709989976612095, + "grad_norm": 0.07179543375968933, + "learning_rate": 2.626535250613842e-05, + "loss": 0.0894, + "num_input_tokens_seen": 117019008, + "step": 96165 + }, + { + "epoch": 10.710546831495712, + "grad_norm": 0.9128488898277283, + "learning_rate": 2.626292587660367e-05, + "loss": 0.0298, + "num_input_tokens_seen": 117025408, + "step": 96170 + }, + { + "epoch": 10.71110368637933, + "grad_norm": 0.0038669267669320107, + "learning_rate": 2.6260499235139573e-05, + "loss": 0.0207, + "num_input_tokens_seen": 117031616, + "step": 96175 + }, + { + "epoch": 10.711660541262948, + "grad_norm": 0.08687406778335571, + "learning_rate": 2.625807258176906e-05, + "loss": 0.0313, + "num_input_tokens_seen": 117037760, + "step": 96180 + }, + { + "epoch": 10.712217396146563, + "grad_norm": 0.011985734105110168, + "learning_rate": 2.6255645916515036e-05, + "loss": 0.045, + "num_input_tokens_seen": 117043680, + "step": 96185 + }, + { + "epoch": 10.712774251030181, + "grad_norm": 0.03523484617471695, + "learning_rate": 2.625321923940043e-05, + "loss": 0.0006, + "num_input_tokens_seen": 117049984, + "step": 96190 + }, + { + "epoch": 10.713331105913799, + "grad_norm": 0.0007246356108225882, + "learning_rate": 2.6250792550448167e-05, + "loss": 0.0497, + "num_input_tokens_seen": 117056128, + "step": 96195 + }, + { + "epoch": 10.713887960797416, + "grad_norm": 0.12046144157648087, + "learning_rate": 2.624836584968116e-05, + "loss": 0.0511, + "num_input_tokens_seen": 117062272, + "step": 96200 + }, + { + "epoch": 10.714444815681034, + "grad_norm": 1.3851287364959717, + "learning_rate": 2.624593913712235e-05, + "loss": 0.1178, + "num_input_tokens_seen": 117068000, + "step": 96205 + }, + { + "epoch": 10.71500167056465, + "grad_norm": 0.6016901731491089, + "learning_rate": 2.6243512412794636e-05, + "loss": 0.0898, + "num_input_tokens_seen": 117074016, + "step": 96210 + }, + { + "epoch": 10.715558525448268, + "grad_norm": 0.0004827426455449313, + "learning_rate": 2.6241085676720955e-05, + "loss": 0.0923, + "num_input_tokens_seen": 117080288, + "step": 96215 + }, + { + "epoch": 10.716115380331885, + "grad_norm": 0.5422625541687012, + "learning_rate": 2.6238658928924227e-05, + "loss": 0.0727, + "num_input_tokens_seen": 117086400, + "step": 96220 + }, + { + "epoch": 10.716672235215503, + "grad_norm": 0.7158446907997131, + "learning_rate": 2.6236232169427368e-05, + "loss": 0.0679, + "num_input_tokens_seen": 117091776, + "step": 96225 + }, + { + "epoch": 10.71722909009912, + "grad_norm": 1.0957679748535156, + "learning_rate": 2.623380539825331e-05, + "loss": 0.075, + "num_input_tokens_seen": 117097472, + "step": 96230 + }, + { + "epoch": 10.717785944982737, + "grad_norm": 0.021419066935777664, + "learning_rate": 2.623137861542497e-05, + "loss": 0.0153, + "num_input_tokens_seen": 117103744, + "step": 96235 + }, + { + "epoch": 10.718342799866354, + "grad_norm": 0.8844719529151917, + "learning_rate": 2.622895182096527e-05, + "loss": 0.0163, + "num_input_tokens_seen": 117109792, + "step": 96240 + }, + { + "epoch": 10.718899654749972, + "grad_norm": 0.1531984508037567, + "learning_rate": 2.6226525014897136e-05, + "loss": 0.0163, + "num_input_tokens_seen": 117115904, + "step": 96245 + }, + { + "epoch": 10.71945650963359, + "grad_norm": 0.1801130771636963, + "learning_rate": 2.6224098197243497e-05, + "loss": 0.0562, + "num_input_tokens_seen": 117122176, + "step": 96250 + }, + { + "epoch": 10.720013364517207, + "grad_norm": 0.00025252028717659414, + "learning_rate": 2.622167136802726e-05, + "loss": 0.0002, + "num_input_tokens_seen": 117128544, + "step": 96255 + }, + { + "epoch": 10.720570219400823, + "grad_norm": 0.013762730173766613, + "learning_rate": 2.6219244527271364e-05, + "loss": 0.0125, + "num_input_tokens_seen": 117135136, + "step": 96260 + }, + { + "epoch": 10.721127074284441, + "grad_norm": 0.16875575482845306, + "learning_rate": 2.621681767499873e-05, + "loss": 0.0204, + "num_input_tokens_seen": 117141216, + "step": 96265 + }, + { + "epoch": 10.721683929168059, + "grad_norm": 0.9379763007164001, + "learning_rate": 2.621439081123227e-05, + "loss": 0.057, + "num_input_tokens_seen": 117147296, + "step": 96270 + }, + { + "epoch": 10.722240784051676, + "grad_norm": 0.03729914873838425, + "learning_rate": 2.621196393599492e-05, + "loss": 0.0162, + "num_input_tokens_seen": 117153440, + "step": 96275 + }, + { + "epoch": 10.722797638935294, + "grad_norm": 0.0026197757106274366, + "learning_rate": 2.6209537049309594e-05, + "loss": 0.0753, + "num_input_tokens_seen": 117159520, + "step": 96280 + }, + { + "epoch": 10.723354493818912, + "grad_norm": 0.1646508425474167, + "learning_rate": 2.6207110151199226e-05, + "loss": 0.0245, + "num_input_tokens_seen": 117165664, + "step": 96285 + }, + { + "epoch": 10.723911348702527, + "grad_norm": 0.20642706751823425, + "learning_rate": 2.620468324168674e-05, + "loss": 0.0124, + "num_input_tokens_seen": 117171840, + "step": 96290 + }, + { + "epoch": 10.724468203586145, + "grad_norm": 0.6939296126365662, + "learning_rate": 2.6202256320795048e-05, + "loss": 0.0527, + "num_input_tokens_seen": 117178240, + "step": 96295 + }, + { + "epoch": 10.725025058469763, + "grad_norm": 0.005845492240041494, + "learning_rate": 2.6199829388547093e-05, + "loss": 0.0386, + "num_input_tokens_seen": 117184448, + "step": 96300 + }, + { + "epoch": 10.72558191335338, + "grad_norm": 0.9114394783973694, + "learning_rate": 2.619740244496578e-05, + "loss": 0.0501, + "num_input_tokens_seen": 117190656, + "step": 96305 + }, + { + "epoch": 10.726138768236998, + "grad_norm": 0.306745320558548, + "learning_rate": 2.6194975490074043e-05, + "loss": 0.0207, + "num_input_tokens_seen": 117197024, + "step": 96310 + }, + { + "epoch": 10.726695623120614, + "grad_norm": 0.13602681457996368, + "learning_rate": 2.61925485238948e-05, + "loss": 0.0562, + "num_input_tokens_seen": 117203072, + "step": 96315 + }, + { + "epoch": 10.727252478004232, + "grad_norm": 0.09333111345767975, + "learning_rate": 2.6190121546450986e-05, + "loss": 0.0015, + "num_input_tokens_seen": 117209280, + "step": 96320 + }, + { + "epoch": 10.72780933288785, + "grad_norm": 0.00019883901404682547, + "learning_rate": 2.6187694557765524e-05, + "loss": 0.0017, + "num_input_tokens_seen": 117214912, + "step": 96325 + }, + { + "epoch": 10.728366187771467, + "grad_norm": 1.0028399229049683, + "learning_rate": 2.6185267557861325e-05, + "loss": 0.0085, + "num_input_tokens_seen": 117221216, + "step": 96330 + }, + { + "epoch": 10.728923042655085, + "grad_norm": 0.19127021729946136, + "learning_rate": 2.6182840546761335e-05, + "loss": 0.0205, + "num_input_tokens_seen": 117227232, + "step": 96335 + }, + { + "epoch": 10.7294798975387, + "grad_norm": 0.03606417030096054, + "learning_rate": 2.6180413524488462e-05, + "loss": 0.0618, + "num_input_tokens_seen": 117233184, + "step": 96340 + }, + { + "epoch": 10.730036752422318, + "grad_norm": 0.013733919709920883, + "learning_rate": 2.617798649106564e-05, + "loss": 0.0209, + "num_input_tokens_seen": 117239456, + "step": 96345 + }, + { + "epoch": 10.730593607305936, + "grad_norm": 3.0782861709594727, + "learning_rate": 2.617555944651579e-05, + "loss": 0.0482, + "num_input_tokens_seen": 117245568, + "step": 96350 + }, + { + "epoch": 10.731150462189554, + "grad_norm": 0.0008836325723677874, + "learning_rate": 2.617313239086184e-05, + "loss": 0.0918, + "num_input_tokens_seen": 117251840, + "step": 96355 + }, + { + "epoch": 10.731707317073171, + "grad_norm": 0.09380077570676804, + "learning_rate": 2.6170705324126716e-05, + "loss": 0.0237, + "num_input_tokens_seen": 117257888, + "step": 96360 + }, + { + "epoch": 10.732264171956787, + "grad_norm": 0.01826731115579605, + "learning_rate": 2.6168278246333337e-05, + "loss": 0.0077, + "num_input_tokens_seen": 117264064, + "step": 96365 + }, + { + "epoch": 10.732821026840405, + "grad_norm": 0.030868910253047943, + "learning_rate": 2.6165851157504644e-05, + "loss": 0.0036, + "num_input_tokens_seen": 117269984, + "step": 96370 + }, + { + "epoch": 10.733377881724023, + "grad_norm": 2.6973378658294678, + "learning_rate": 2.6163424057663543e-05, + "loss": 0.1148, + "num_input_tokens_seen": 117276096, + "step": 96375 + }, + { + "epoch": 10.73393473660764, + "grad_norm": 0.12800943851470947, + "learning_rate": 2.6160996946832973e-05, + "loss": 0.0067, + "num_input_tokens_seen": 117282144, + "step": 96380 + }, + { + "epoch": 10.734491591491258, + "grad_norm": 0.06536722183227539, + "learning_rate": 2.615856982503586e-05, + "loss": 0.0039, + "num_input_tokens_seen": 117288256, + "step": 96385 + }, + { + "epoch": 10.735048446374874, + "grad_norm": 0.568764328956604, + "learning_rate": 2.6156142692295122e-05, + "loss": 0.0196, + "num_input_tokens_seen": 117294240, + "step": 96390 + }, + { + "epoch": 10.735605301258492, + "grad_norm": 0.11646517366170883, + "learning_rate": 2.6153715548633693e-05, + "loss": 0.0025, + "num_input_tokens_seen": 117300288, + "step": 96395 + }, + { + "epoch": 10.73616215614211, + "grad_norm": 0.007968316785991192, + "learning_rate": 2.6151288394074498e-05, + "loss": 0.0286, + "num_input_tokens_seen": 117306336, + "step": 96400 + }, + { + "epoch": 10.736719011025727, + "grad_norm": 0.15148614346981049, + "learning_rate": 2.614886122864046e-05, + "loss": 0.1343, + "num_input_tokens_seen": 117312576, + "step": 96405 + }, + { + "epoch": 10.737275865909345, + "grad_norm": 1.0585428476333618, + "learning_rate": 2.61464340523545e-05, + "loss": 0.0923, + "num_input_tokens_seen": 117318560, + "step": 96410 + }, + { + "epoch": 10.737832720792962, + "grad_norm": 0.5347405672073364, + "learning_rate": 2.6144006865239557e-05, + "loss": 0.0049, + "num_input_tokens_seen": 117324832, + "step": 96415 + }, + { + "epoch": 10.738389575676578, + "grad_norm": 0.0001259089622180909, + "learning_rate": 2.6141579667318556e-05, + "loss": 0.0802, + "num_input_tokens_seen": 117331072, + "step": 96420 + }, + { + "epoch": 10.738946430560196, + "grad_norm": 0.012345736846327782, + "learning_rate": 2.613915245861442e-05, + "loss": 0.0059, + "num_input_tokens_seen": 117336864, + "step": 96425 + }, + { + "epoch": 10.739503285443813, + "grad_norm": 0.13254983723163605, + "learning_rate": 2.613672523915007e-05, + "loss": 0.028, + "num_input_tokens_seen": 117342976, + "step": 96430 + }, + { + "epoch": 10.740060140327431, + "grad_norm": 0.33672896027565, + "learning_rate": 2.613429800894845e-05, + "loss": 0.0203, + "num_input_tokens_seen": 117348992, + "step": 96435 + }, + { + "epoch": 10.740616995211049, + "grad_norm": 0.05407049506902695, + "learning_rate": 2.613187076803247e-05, + "loss": 0.0077, + "num_input_tokens_seen": 117355328, + "step": 96440 + }, + { + "epoch": 10.741173850094665, + "grad_norm": 2.9314961433410645, + "learning_rate": 2.6129443516425063e-05, + "loss": 0.2081, + "num_input_tokens_seen": 117361472, + "step": 96445 + }, + { + "epoch": 10.741730704978282, + "grad_norm": 1.329236388206482, + "learning_rate": 2.6127016254149156e-05, + "loss": 0.1118, + "num_input_tokens_seen": 117367648, + "step": 96450 + }, + { + "epoch": 10.7422875598619, + "grad_norm": 0.03555900603532791, + "learning_rate": 2.6124588981227684e-05, + "loss": 0.0617, + "num_input_tokens_seen": 117373632, + "step": 96455 + }, + { + "epoch": 10.742844414745518, + "grad_norm": 0.009756559506058693, + "learning_rate": 2.612216169768356e-05, + "loss": 0.0009, + "num_input_tokens_seen": 117379488, + "step": 96460 + }, + { + "epoch": 10.743401269629135, + "grad_norm": 0.03205752372741699, + "learning_rate": 2.6119734403539726e-05, + "loss": 0.1644, + "num_input_tokens_seen": 117385568, + "step": 96465 + }, + { + "epoch": 10.743958124512751, + "grad_norm": 2.516991376876831, + "learning_rate": 2.61173070988191e-05, + "loss": 0.0544, + "num_input_tokens_seen": 117391392, + "step": 96470 + }, + { + "epoch": 10.744514979396369, + "grad_norm": 0.08964863419532776, + "learning_rate": 2.6114879783544615e-05, + "loss": 0.0071, + "num_input_tokens_seen": 117397600, + "step": 96475 + }, + { + "epoch": 10.745071834279987, + "grad_norm": 0.1759195774793625, + "learning_rate": 2.6112452457739196e-05, + "loss": 0.0201, + "num_input_tokens_seen": 117403808, + "step": 96480 + }, + { + "epoch": 10.745628689163604, + "grad_norm": 0.5160420536994934, + "learning_rate": 2.611002512142577e-05, + "loss": 0.0117, + "num_input_tokens_seen": 117409952, + "step": 96485 + }, + { + "epoch": 10.746185544047222, + "grad_norm": 0.3840000629425049, + "learning_rate": 2.6107597774627272e-05, + "loss": 0.0259, + "num_input_tokens_seen": 117416064, + "step": 96490 + }, + { + "epoch": 10.746742398930838, + "grad_norm": 1.451832890510559, + "learning_rate": 2.610517041736662e-05, + "loss": 0.2026, + "num_input_tokens_seen": 117421824, + "step": 96495 + }, + { + "epoch": 10.747299253814456, + "grad_norm": 0.33374595642089844, + "learning_rate": 2.6102743049666757e-05, + "loss": 0.0178, + "num_input_tokens_seen": 117427680, + "step": 96500 + }, + { + "epoch": 10.747856108698073, + "grad_norm": 0.24602258205413818, + "learning_rate": 2.6100315671550596e-05, + "loss": 0.0751, + "num_input_tokens_seen": 117434048, + "step": 96505 + }, + { + "epoch": 10.748412963581691, + "grad_norm": 0.009392866864800453, + "learning_rate": 2.6097888283041077e-05, + "loss": 0.0201, + "num_input_tokens_seen": 117439840, + "step": 96510 + }, + { + "epoch": 10.748969818465309, + "grad_norm": 0.1432114988565445, + "learning_rate": 2.609546088416112e-05, + "loss": 0.0175, + "num_input_tokens_seen": 117446208, + "step": 96515 + }, + { + "epoch": 10.749526673348925, + "grad_norm": 1.7972443103790283, + "learning_rate": 2.6093033474933653e-05, + "loss": 0.0709, + "num_input_tokens_seen": 117452448, + "step": 96520 + }, + { + "epoch": 10.750083528232542, + "grad_norm": 1.3378592729568481, + "learning_rate": 2.6090606055381616e-05, + "loss": 0.1016, + "num_input_tokens_seen": 117458656, + "step": 96525 + }, + { + "epoch": 10.75064038311616, + "grad_norm": 1.056395411491394, + "learning_rate": 2.6088178625527925e-05, + "loss": 0.1442, + "num_input_tokens_seen": 117464736, + "step": 96530 + }, + { + "epoch": 10.751197237999778, + "grad_norm": 3.407315731048584, + "learning_rate": 2.6085751185395518e-05, + "loss": 0.1467, + "num_input_tokens_seen": 117470496, + "step": 96535 + }, + { + "epoch": 10.751754092883395, + "grad_norm": 1.3807772397994995, + "learning_rate": 2.6083323735007325e-05, + "loss": 0.0888, + "num_input_tokens_seen": 117476736, + "step": 96540 + }, + { + "epoch": 10.752310947767011, + "grad_norm": 0.40539371967315674, + "learning_rate": 2.6080896274386263e-05, + "loss": 0.0923, + "num_input_tokens_seen": 117482912, + "step": 96545 + }, + { + "epoch": 10.752867802650629, + "grad_norm": 1.2990235090255737, + "learning_rate": 2.6078468803555278e-05, + "loss": 0.0169, + "num_input_tokens_seen": 117489184, + "step": 96550 + }, + { + "epoch": 10.753424657534246, + "grad_norm": 0.02779824286699295, + "learning_rate": 2.6076041322537286e-05, + "loss": 0.0924, + "num_input_tokens_seen": 117494720, + "step": 96555 + }, + { + "epoch": 10.753981512417864, + "grad_norm": 1.2253962755203247, + "learning_rate": 2.607361383135522e-05, + "loss": 0.0849, + "num_input_tokens_seen": 117500672, + "step": 96560 + }, + { + "epoch": 10.754538367301482, + "grad_norm": 0.06374092400074005, + "learning_rate": 2.6071186330032016e-05, + "loss": 0.0391, + "num_input_tokens_seen": 117506368, + "step": 96565 + }, + { + "epoch": 10.755095222185098, + "grad_norm": 0.0003070785023737699, + "learning_rate": 2.60687588185906e-05, + "loss": 0.0169, + "num_input_tokens_seen": 117512160, + "step": 96570 + }, + { + "epoch": 10.755652077068715, + "grad_norm": 3.677194356918335, + "learning_rate": 2.60663312970539e-05, + "loss": 0.0698, + "num_input_tokens_seen": 117518144, + "step": 96575 + }, + { + "epoch": 10.756208931952333, + "grad_norm": 1.9863216876983643, + "learning_rate": 2.606390376544484e-05, + "loss": 0.0854, + "num_input_tokens_seen": 117524256, + "step": 96580 + }, + { + "epoch": 10.75676578683595, + "grad_norm": 0.95435631275177, + "learning_rate": 2.6061476223786364e-05, + "loss": 0.1096, + "num_input_tokens_seen": 117530464, + "step": 96585 + }, + { + "epoch": 10.757322641719568, + "grad_norm": 0.0012399860424920917, + "learning_rate": 2.605904867210139e-05, + "loss": 0.021, + "num_input_tokens_seen": 117536544, + "step": 96590 + }, + { + "epoch": 10.757879496603184, + "grad_norm": 0.02890944480895996, + "learning_rate": 2.605662111041285e-05, + "loss": 0.0088, + "num_input_tokens_seen": 117542848, + "step": 96595 + }, + { + "epoch": 10.758436351486802, + "grad_norm": 0.5326141715049744, + "learning_rate": 2.6054193538743688e-05, + "loss": 0.0123, + "num_input_tokens_seen": 117548640, + "step": 96600 + }, + { + "epoch": 10.75899320637042, + "grad_norm": 1.5513712167739868, + "learning_rate": 2.6051765957116813e-05, + "loss": 0.104, + "num_input_tokens_seen": 117554816, + "step": 96605 + }, + { + "epoch": 10.759550061254037, + "grad_norm": 0.03565165773034096, + "learning_rate": 2.604933836555517e-05, + "loss": 0.025, + "num_input_tokens_seen": 117561120, + "step": 96610 + }, + { + "epoch": 10.760106916137655, + "grad_norm": 1.0174051523208618, + "learning_rate": 2.6046910764081683e-05, + "loss": 0.0241, + "num_input_tokens_seen": 117567488, + "step": 96615 + }, + { + "epoch": 10.760663771021271, + "grad_norm": 0.005177769809961319, + "learning_rate": 2.604448315271929e-05, + "loss": 0.0315, + "num_input_tokens_seen": 117573856, + "step": 96620 + }, + { + "epoch": 10.761220625904889, + "grad_norm": 0.38838866353034973, + "learning_rate": 2.604205553149091e-05, + "loss": 0.0364, + "num_input_tokens_seen": 117579264, + "step": 96625 + }, + { + "epoch": 10.761777480788506, + "grad_norm": 0.10133158415555954, + "learning_rate": 2.6039627900419483e-05, + "loss": 0.0473, + "num_input_tokens_seen": 117585408, + "step": 96630 + }, + { + "epoch": 10.762334335672124, + "grad_norm": 0.2755565941333771, + "learning_rate": 2.6037200259527943e-05, + "loss": 0.0036, + "num_input_tokens_seen": 117591456, + "step": 96635 + }, + { + "epoch": 10.762891190555742, + "grad_norm": 0.027092453092336655, + "learning_rate": 2.603477260883921e-05, + "loss": 0.0841, + "num_input_tokens_seen": 117597920, + "step": 96640 + }, + { + "epoch": 10.76344804543936, + "grad_norm": 0.39120182394981384, + "learning_rate": 2.6032344948376226e-05, + "loss": 0.1546, + "num_input_tokens_seen": 117603904, + "step": 96645 + }, + { + "epoch": 10.764004900322975, + "grad_norm": 0.029197443276643753, + "learning_rate": 2.6029917278161913e-05, + "loss": 0.0656, + "num_input_tokens_seen": 117610016, + "step": 96650 + }, + { + "epoch": 10.764561755206593, + "grad_norm": 0.9807411432266235, + "learning_rate": 2.6027489598219202e-05, + "loss": 0.0549, + "num_input_tokens_seen": 117616448, + "step": 96655 + }, + { + "epoch": 10.76511861009021, + "grad_norm": 0.33623114228248596, + "learning_rate": 2.6025061908571035e-05, + "loss": 0.0297, + "num_input_tokens_seen": 117622720, + "step": 96660 + }, + { + "epoch": 10.765675464973828, + "grad_norm": 0.6771998405456543, + "learning_rate": 2.6022634209240333e-05, + "loss": 0.0782, + "num_input_tokens_seen": 117628384, + "step": 96665 + }, + { + "epoch": 10.766232319857446, + "grad_norm": 0.33769017457962036, + "learning_rate": 2.6020206500250037e-05, + "loss": 0.0573, + "num_input_tokens_seen": 117634784, + "step": 96670 + }, + { + "epoch": 10.766789174741062, + "grad_norm": 1.0640716552734375, + "learning_rate": 2.601777878162307e-05, + "loss": 0.0291, + "num_input_tokens_seen": 117641248, + "step": 96675 + }, + { + "epoch": 10.76734602962468, + "grad_norm": 0.6459600329399109, + "learning_rate": 2.601535105338237e-05, + "loss": 0.0079, + "num_input_tokens_seen": 117647264, + "step": 96680 + }, + { + "epoch": 10.767902884508297, + "grad_norm": 0.0038562719710171223, + "learning_rate": 2.6012923315550858e-05, + "loss": 0.0014, + "num_input_tokens_seen": 117653504, + "step": 96685 + }, + { + "epoch": 10.768459739391915, + "grad_norm": 0.04747634008526802, + "learning_rate": 2.6010495568151477e-05, + "loss": 0.0416, + "num_input_tokens_seen": 117659424, + "step": 96690 + }, + { + "epoch": 10.769016594275532, + "grad_norm": 0.00020080478861927986, + "learning_rate": 2.600806781120716e-05, + "loss": 0.0022, + "num_input_tokens_seen": 117665568, + "step": 96695 + }, + { + "epoch": 10.769573449159148, + "grad_norm": 0.14612704515457153, + "learning_rate": 2.6005640044740826e-05, + "loss": 0.0386, + "num_input_tokens_seen": 117671552, + "step": 96700 + }, + { + "epoch": 10.770130304042766, + "grad_norm": 0.06127312406897545, + "learning_rate": 2.6003212268775428e-05, + "loss": 0.1646, + "num_input_tokens_seen": 117677728, + "step": 96705 + }, + { + "epoch": 10.770687158926384, + "grad_norm": 0.181595578789711, + "learning_rate": 2.600078448333388e-05, + "loss": 0.0166, + "num_input_tokens_seen": 117683840, + "step": 96710 + }, + { + "epoch": 10.771244013810001, + "grad_norm": 0.002275500213727355, + "learning_rate": 2.5998356688439118e-05, + "loss": 0.0108, + "num_input_tokens_seen": 117689824, + "step": 96715 + }, + { + "epoch": 10.771800868693619, + "grad_norm": 0.4868120551109314, + "learning_rate": 2.599592888411408e-05, + "loss": 0.0199, + "num_input_tokens_seen": 117695584, + "step": 96720 + }, + { + "epoch": 10.772357723577235, + "grad_norm": 0.24442508816719055, + "learning_rate": 2.5993501070381693e-05, + "loss": 0.0108, + "num_input_tokens_seen": 117701792, + "step": 96725 + }, + { + "epoch": 10.772914578460853, + "grad_norm": 0.42742711305618286, + "learning_rate": 2.59910732472649e-05, + "loss": 0.0429, + "num_input_tokens_seen": 117708000, + "step": 96730 + }, + { + "epoch": 10.77347143334447, + "grad_norm": 0.001930883852764964, + "learning_rate": 2.5988645414786617e-05, + "loss": 0.0229, + "num_input_tokens_seen": 117714048, + "step": 96735 + }, + { + "epoch": 10.774028288228088, + "grad_norm": 0.03553209453821182, + "learning_rate": 2.5986217572969794e-05, + "loss": 0.0354, + "num_input_tokens_seen": 117720192, + "step": 96740 + }, + { + "epoch": 10.774585143111706, + "grad_norm": 0.451016366481781, + "learning_rate": 2.5983789721837354e-05, + "loss": 0.048, + "num_input_tokens_seen": 117726720, + "step": 96745 + }, + { + "epoch": 10.775141997995323, + "grad_norm": 0.5444949865341187, + "learning_rate": 2.5981361861412228e-05, + "loss": 0.1472, + "num_input_tokens_seen": 117732736, + "step": 96750 + }, + { + "epoch": 10.77569885287894, + "grad_norm": 0.01647457852959633, + "learning_rate": 2.597893399171736e-05, + "loss": 0.0011, + "num_input_tokens_seen": 117739136, + "step": 96755 + }, + { + "epoch": 10.776255707762557, + "grad_norm": 0.10825088620185852, + "learning_rate": 2.5976506112775668e-05, + "loss": 0.0054, + "num_input_tokens_seen": 117745024, + "step": 96760 + }, + { + "epoch": 10.776812562646175, + "grad_norm": 0.24285483360290527, + "learning_rate": 2.5974078224610097e-05, + "loss": 0.0103, + "num_input_tokens_seen": 117750976, + "step": 96765 + }, + { + "epoch": 10.777369417529792, + "grad_norm": 0.581322431564331, + "learning_rate": 2.5971650327243573e-05, + "loss": 0.0066, + "num_input_tokens_seen": 117757024, + "step": 96770 + }, + { + "epoch": 10.77792627241341, + "grad_norm": 0.11767688393592834, + "learning_rate": 2.596922242069904e-05, + "loss": 0.0476, + "num_input_tokens_seen": 117763296, + "step": 96775 + }, + { + "epoch": 10.778483127297026, + "grad_norm": 0.0014585127355530858, + "learning_rate": 2.596679450499942e-05, + "loss": 0.0066, + "num_input_tokens_seen": 117769376, + "step": 96780 + }, + { + "epoch": 10.779039982180644, + "grad_norm": 0.007599856238812208, + "learning_rate": 2.5964366580167647e-05, + "loss": 0.0192, + "num_input_tokens_seen": 117775264, + "step": 96785 + }, + { + "epoch": 10.779596837064261, + "grad_norm": 0.03981839865446091, + "learning_rate": 2.596193864622667e-05, + "loss": 0.0043, + "num_input_tokens_seen": 117781280, + "step": 96790 + }, + { + "epoch": 10.780153691947879, + "grad_norm": 1.1135684251785278, + "learning_rate": 2.595951070319941e-05, + "loss": 0.0475, + "num_input_tokens_seen": 117787648, + "step": 96795 + }, + { + "epoch": 10.780710546831497, + "grad_norm": 2.802945613861084, + "learning_rate": 2.5957082751108797e-05, + "loss": 0.0971, + "num_input_tokens_seen": 117793568, + "step": 96800 + }, + { + "epoch": 10.781267401715112, + "grad_norm": 0.00962933711707592, + "learning_rate": 2.5954654789977772e-05, + "loss": 0.0686, + "num_input_tokens_seen": 117799584, + "step": 96805 + }, + { + "epoch": 10.78182425659873, + "grad_norm": 0.01978049986064434, + "learning_rate": 2.5952226819829274e-05, + "loss": 0.0176, + "num_input_tokens_seen": 117805760, + "step": 96810 + }, + { + "epoch": 10.782381111482348, + "grad_norm": 0.0100404042750597, + "learning_rate": 2.594979884068622e-05, + "loss": 0.1096, + "num_input_tokens_seen": 117811584, + "step": 96815 + }, + { + "epoch": 10.782937966365965, + "grad_norm": 2.269103765487671, + "learning_rate": 2.5947370852571556e-05, + "loss": 0.1045, + "num_input_tokens_seen": 117817824, + "step": 96820 + }, + { + "epoch": 10.783494821249583, + "grad_norm": 1.4846383333206177, + "learning_rate": 2.5944942855508225e-05, + "loss": 0.0836, + "num_input_tokens_seen": 117824160, + "step": 96825 + }, + { + "epoch": 10.784051676133199, + "grad_norm": 0.0002979552955366671, + "learning_rate": 2.5942514849519144e-05, + "loss": 0.0309, + "num_input_tokens_seen": 117829728, + "step": 96830 + }, + { + "epoch": 10.784608531016817, + "grad_norm": 0.37444770336151123, + "learning_rate": 2.594008683462726e-05, + "loss": 0.0518, + "num_input_tokens_seen": 117835808, + "step": 96835 + }, + { + "epoch": 10.785165385900434, + "grad_norm": 0.7431262731552124, + "learning_rate": 2.59376588108555e-05, + "loss": 0.1971, + "num_input_tokens_seen": 117841824, + "step": 96840 + }, + { + "epoch": 10.785722240784052, + "grad_norm": 0.010904429480433464, + "learning_rate": 2.5935230778226798e-05, + "loss": 0.0022, + "num_input_tokens_seen": 117848128, + "step": 96845 + }, + { + "epoch": 10.78627909566767, + "grad_norm": 0.10186493396759033, + "learning_rate": 2.5932802736764093e-05, + "loss": 0.0624, + "num_input_tokens_seen": 117853952, + "step": 96850 + }, + { + "epoch": 10.786835950551286, + "grad_norm": 0.8910807371139526, + "learning_rate": 2.593037468649032e-05, + "loss": 0.076, + "num_input_tokens_seen": 117860256, + "step": 96855 + }, + { + "epoch": 10.787392805434903, + "grad_norm": 0.1659766435623169, + "learning_rate": 2.592794662742842e-05, + "loss": 0.0577, + "num_input_tokens_seen": 117866528, + "step": 96860 + }, + { + "epoch": 10.787949660318521, + "grad_norm": 2.9993326663970947, + "learning_rate": 2.592551855960131e-05, + "loss": 0.1207, + "num_input_tokens_seen": 117872704, + "step": 96865 + }, + { + "epoch": 10.788506515202139, + "grad_norm": 6.624306115554646e-05, + "learning_rate": 2.592309048303194e-05, + "loss": 0.0634, + "num_input_tokens_seen": 117878944, + "step": 96870 + }, + { + "epoch": 10.789063370085756, + "grad_norm": 0.10534583777189255, + "learning_rate": 2.5920662397743247e-05, + "loss": 0.0358, + "num_input_tokens_seen": 117884832, + "step": 96875 + }, + { + "epoch": 10.789620224969372, + "grad_norm": 0.0022823659237474203, + "learning_rate": 2.591823430375815e-05, + "loss": 0.0016, + "num_input_tokens_seen": 117890912, + "step": 96880 + }, + { + "epoch": 10.79017707985299, + "grad_norm": 0.0003790617047343403, + "learning_rate": 2.5915806201099598e-05, + "loss": 0.0706, + "num_input_tokens_seen": 117897152, + "step": 96885 + }, + { + "epoch": 10.790733934736608, + "grad_norm": 0.0012587456731125712, + "learning_rate": 2.591337808979052e-05, + "loss": 0.0131, + "num_input_tokens_seen": 117903648, + "step": 96890 + }, + { + "epoch": 10.791290789620225, + "grad_norm": 1.712956428527832, + "learning_rate": 2.591094996985386e-05, + "loss": 0.0966, + "num_input_tokens_seen": 117909920, + "step": 96895 + }, + { + "epoch": 10.791847644503843, + "grad_norm": 0.39799630641937256, + "learning_rate": 2.5908521841312543e-05, + "loss": 0.0868, + "num_input_tokens_seen": 117916320, + "step": 96900 + }, + { + "epoch": 10.792404499387459, + "grad_norm": 0.002337878569960594, + "learning_rate": 2.5906093704189514e-05, + "loss": 0.0367, + "num_input_tokens_seen": 117922240, + "step": 96905 + }, + { + "epoch": 10.792961354271077, + "grad_norm": 1.3492025136947632, + "learning_rate": 2.59036655585077e-05, + "loss": 0.202, + "num_input_tokens_seen": 117928064, + "step": 96910 + }, + { + "epoch": 10.793518209154694, + "grad_norm": 0.17533017694950104, + "learning_rate": 2.5901237404290042e-05, + "loss": 0.0201, + "num_input_tokens_seen": 117934048, + "step": 96915 + }, + { + "epoch": 10.794075064038312, + "grad_norm": 1.3062695264816284, + "learning_rate": 2.5898809241559473e-05, + "loss": 0.0966, + "num_input_tokens_seen": 117939968, + "step": 96920 + }, + { + "epoch": 10.79463191892193, + "grad_norm": 0.00501873716711998, + "learning_rate": 2.5896381070338937e-05, + "loss": 0.0101, + "num_input_tokens_seen": 117945920, + "step": 96925 + }, + { + "epoch": 10.795188773805545, + "grad_norm": 0.2172861546278, + "learning_rate": 2.5893952890651358e-05, + "loss": 0.007, + "num_input_tokens_seen": 117952160, + "step": 96930 + }, + { + "epoch": 10.795745628689163, + "grad_norm": 0.010212869383394718, + "learning_rate": 2.589152470251968e-05, + "loss": 0.0043, + "num_input_tokens_seen": 117958144, + "step": 96935 + }, + { + "epoch": 10.79630248357278, + "grad_norm": 0.33504143357276917, + "learning_rate": 2.5889096505966832e-05, + "loss": 0.0067, + "num_input_tokens_seen": 117964416, + "step": 96940 + }, + { + "epoch": 10.796859338456398, + "grad_norm": 0.0037664168048650026, + "learning_rate": 2.5886668301015767e-05, + "loss": 0.016, + "num_input_tokens_seen": 117970400, + "step": 96945 + }, + { + "epoch": 10.797416193340016, + "grad_norm": 0.8062165379524231, + "learning_rate": 2.5884240087689392e-05, + "loss": 0.0099, + "num_input_tokens_seen": 117976832, + "step": 96950 + }, + { + "epoch": 10.797973048223632, + "grad_norm": 0.9702278971672058, + "learning_rate": 2.5881811866010673e-05, + "loss": 0.165, + "num_input_tokens_seen": 117982944, + "step": 96955 + }, + { + "epoch": 10.79852990310725, + "grad_norm": 0.0074633280746638775, + "learning_rate": 2.587938363600253e-05, + "loss": 0.0463, + "num_input_tokens_seen": 117989376, + "step": 96960 + }, + { + "epoch": 10.799086757990867, + "grad_norm": 1.843355655670166, + "learning_rate": 2.5876955397687908e-05, + "loss": 0.0635, + "num_input_tokens_seen": 117995520, + "step": 96965 + }, + { + "epoch": 10.799643612874485, + "grad_norm": 1.1941214799880981, + "learning_rate": 2.587452715108974e-05, + "loss": 0.1024, + "num_input_tokens_seen": 118001376, + "step": 96970 + }, + { + "epoch": 10.800200467758103, + "grad_norm": 0.004928463604301214, + "learning_rate": 2.5872098896230956e-05, + "loss": 0.0334, + "num_input_tokens_seen": 118007680, + "step": 96975 + }, + { + "epoch": 10.80075732264172, + "grad_norm": 0.00041393146966584027, + "learning_rate": 2.58696706331345e-05, + "loss": 0.0058, + "num_input_tokens_seen": 118013664, + "step": 96980 + }, + { + "epoch": 10.801314177525336, + "grad_norm": 0.8800331950187683, + "learning_rate": 2.5867242361823314e-05, + "loss": 0.0738, + "num_input_tokens_seen": 118019264, + "step": 96985 + }, + { + "epoch": 10.801871032408954, + "grad_norm": 1.8046663999557495, + "learning_rate": 2.5864814082320325e-05, + "loss": 0.0572, + "num_input_tokens_seen": 118025344, + "step": 96990 + }, + { + "epoch": 10.802427887292572, + "grad_norm": 0.006784464698284864, + "learning_rate": 2.5862385794648476e-05, + "loss": 0.0029, + "num_input_tokens_seen": 118031552, + "step": 96995 + }, + { + "epoch": 10.80298474217619, + "grad_norm": 0.013915377669036388, + "learning_rate": 2.58599574988307e-05, + "loss": 0.0228, + "num_input_tokens_seen": 118036800, + "step": 97000 + }, + { + "epoch": 10.803541597059807, + "grad_norm": 0.2903095483779907, + "learning_rate": 2.585752919488994e-05, + "loss": 0.0289, + "num_input_tokens_seen": 118043104, + "step": 97005 + }, + { + "epoch": 10.804098451943423, + "grad_norm": 0.13591259717941284, + "learning_rate": 2.5855100882849125e-05, + "loss": 0.0215, + "num_input_tokens_seen": 118049280, + "step": 97010 + }, + { + "epoch": 10.80465530682704, + "grad_norm": 0.1596655696630478, + "learning_rate": 2.58526725627312e-05, + "loss": 0.0643, + "num_input_tokens_seen": 118054816, + "step": 97015 + }, + { + "epoch": 10.805212161710658, + "grad_norm": 0.840333878993988, + "learning_rate": 2.5850244234559102e-05, + "loss": 0.0266, + "num_input_tokens_seen": 118060992, + "step": 97020 + }, + { + "epoch": 10.805769016594276, + "grad_norm": 0.06093297153711319, + "learning_rate": 2.5847815898355766e-05, + "loss": 0.0192, + "num_input_tokens_seen": 118066880, + "step": 97025 + }, + { + "epoch": 10.806325871477894, + "grad_norm": 1.0598489046096802, + "learning_rate": 2.584538755414412e-05, + "loss": 0.0736, + "num_input_tokens_seen": 118071744, + "step": 97030 + }, + { + "epoch": 10.80688272636151, + "grad_norm": 0.00023820246860850602, + "learning_rate": 2.5842959201947116e-05, + "loss": 0.0052, + "num_input_tokens_seen": 118077984, + "step": 97035 + }, + { + "epoch": 10.807439581245127, + "grad_norm": 0.7698879241943359, + "learning_rate": 2.5840530841787692e-05, + "loss": 0.0249, + "num_input_tokens_seen": 118083968, + "step": 97040 + }, + { + "epoch": 10.807996436128745, + "grad_norm": 0.35831382870674133, + "learning_rate": 2.5838102473688774e-05, + "loss": 0.0376, + "num_input_tokens_seen": 118089984, + "step": 97045 + }, + { + "epoch": 10.808553291012363, + "grad_norm": 1.0407640933990479, + "learning_rate": 2.5835674097673313e-05, + "loss": 0.0667, + "num_input_tokens_seen": 118096000, + "step": 97050 + }, + { + "epoch": 10.80911014589598, + "grad_norm": 0.10609601438045502, + "learning_rate": 2.5833245713764238e-05, + "loss": 0.0085, + "num_input_tokens_seen": 118101984, + "step": 97055 + }, + { + "epoch": 10.809667000779596, + "grad_norm": 0.05444291606545448, + "learning_rate": 2.5830817321984484e-05, + "loss": 0.0337, + "num_input_tokens_seen": 118107552, + "step": 97060 + }, + { + "epoch": 10.810223855663214, + "grad_norm": 0.001985571812838316, + "learning_rate": 2.5828388922357004e-05, + "loss": 0.042, + "num_input_tokens_seen": 118113152, + "step": 97065 + }, + { + "epoch": 10.810780710546831, + "grad_norm": 0.00047633491340093315, + "learning_rate": 2.5825960514904722e-05, + "loss": 0.0733, + "num_input_tokens_seen": 118119264, + "step": 97070 + }, + { + "epoch": 10.811337565430449, + "grad_norm": 0.254266619682312, + "learning_rate": 2.582353209965059e-05, + "loss": 0.0237, + "num_input_tokens_seen": 118125344, + "step": 97075 + }, + { + "epoch": 10.811894420314067, + "grad_norm": 0.09866726398468018, + "learning_rate": 2.5821103676617525e-05, + "loss": 0.0023, + "num_input_tokens_seen": 118131584, + "step": 97080 + }, + { + "epoch": 10.812451275197683, + "grad_norm": 2.5271923542022705, + "learning_rate": 2.5818675245828483e-05, + "loss": 0.0895, + "num_input_tokens_seen": 118137696, + "step": 97085 + }, + { + "epoch": 10.8130081300813, + "grad_norm": 0.023986713960766792, + "learning_rate": 2.5816246807306395e-05, + "loss": 0.0019, + "num_input_tokens_seen": 118143808, + "step": 97090 + }, + { + "epoch": 10.813564984964918, + "grad_norm": 3.5415964126586914, + "learning_rate": 2.5813818361074204e-05, + "loss": 0.0886, + "num_input_tokens_seen": 118150176, + "step": 97095 + }, + { + "epoch": 10.814121839848536, + "grad_norm": 0.014074939303100109, + "learning_rate": 2.581138990715485e-05, + "loss": 0.0229, + "num_input_tokens_seen": 118155936, + "step": 97100 + }, + { + "epoch": 10.814678694732153, + "grad_norm": 0.0908341258764267, + "learning_rate": 2.580896144557126e-05, + "loss": 0.0206, + "num_input_tokens_seen": 118162336, + "step": 97105 + }, + { + "epoch": 10.815235549615771, + "grad_norm": 0.5283554792404175, + "learning_rate": 2.5806532976346393e-05, + "loss": 0.1592, + "num_input_tokens_seen": 118168224, + "step": 97110 + }, + { + "epoch": 10.815792404499387, + "grad_norm": 1.0586196184158325, + "learning_rate": 2.5804104499503162e-05, + "loss": 0.0329, + "num_input_tokens_seen": 118173888, + "step": 97115 + }, + { + "epoch": 10.816349259383005, + "grad_norm": 0.4014020562171936, + "learning_rate": 2.580167601506453e-05, + "loss": 0.0112, + "num_input_tokens_seen": 118180160, + "step": 97120 + }, + { + "epoch": 10.816906114266622, + "grad_norm": 0.30581140518188477, + "learning_rate": 2.579924752305342e-05, + "loss": 0.0985, + "num_input_tokens_seen": 118186368, + "step": 97125 + }, + { + "epoch": 10.81746296915024, + "grad_norm": 0.0002813787432387471, + "learning_rate": 2.579681902349278e-05, + "loss": 0.0076, + "num_input_tokens_seen": 118192128, + "step": 97130 + }, + { + "epoch": 10.818019824033858, + "grad_norm": 2.2285327911376953, + "learning_rate": 2.5794390516405546e-05, + "loss": 0.0566, + "num_input_tokens_seen": 118198272, + "step": 97135 + }, + { + "epoch": 10.818576678917474, + "grad_norm": 0.1864825338125229, + "learning_rate": 2.5791962001814652e-05, + "loss": 0.021, + "num_input_tokens_seen": 118204320, + "step": 97140 + }, + { + "epoch": 10.819133533801091, + "grad_norm": 0.06130561977624893, + "learning_rate": 2.578953347974305e-05, + "loss": 0.0179, + "num_input_tokens_seen": 118210496, + "step": 97145 + }, + { + "epoch": 10.819690388684709, + "grad_norm": 0.6662818193435669, + "learning_rate": 2.578710495021367e-05, + "loss": 0.1328, + "num_input_tokens_seen": 118215680, + "step": 97150 + }, + { + "epoch": 10.820247243568327, + "grad_norm": 0.4772246479988098, + "learning_rate": 2.5784676413249447e-05, + "loss": 0.0535, + "num_input_tokens_seen": 118221920, + "step": 97155 + }, + { + "epoch": 10.820804098451944, + "grad_norm": 0.037557851523160934, + "learning_rate": 2.5782247868873333e-05, + "loss": 0.0696, + "num_input_tokens_seen": 118227968, + "step": 97160 + }, + { + "epoch": 10.82136095333556, + "grad_norm": 0.0002658967860043049, + "learning_rate": 2.5779819317108256e-05, + "loss": 0.0175, + "num_input_tokens_seen": 118234080, + "step": 97165 + }, + { + "epoch": 10.821917808219178, + "grad_norm": 0.030960548669099808, + "learning_rate": 2.5777390757977164e-05, + "loss": 0.0101, + "num_input_tokens_seen": 118240256, + "step": 97170 + }, + { + "epoch": 10.822474663102795, + "grad_norm": 0.82733553647995, + "learning_rate": 2.5774962191502995e-05, + "loss": 0.0133, + "num_input_tokens_seen": 118246624, + "step": 97175 + }, + { + "epoch": 10.823031517986413, + "grad_norm": 1.8849620819091797, + "learning_rate": 2.577253361770869e-05, + "loss": 0.1424, + "num_input_tokens_seen": 118252832, + "step": 97180 + }, + { + "epoch": 10.82358837287003, + "grad_norm": 0.7109872698783875, + "learning_rate": 2.5770105036617176e-05, + "loss": 0.0614, + "num_input_tokens_seen": 118258912, + "step": 97185 + }, + { + "epoch": 10.824145227753647, + "grad_norm": 0.18618015944957733, + "learning_rate": 2.576767644825141e-05, + "loss": 0.0463, + "num_input_tokens_seen": 118265088, + "step": 97190 + }, + { + "epoch": 10.824702082637264, + "grad_norm": 0.07802349328994751, + "learning_rate": 2.5765247852634322e-05, + "loss": 0.0518, + "num_input_tokens_seen": 118270816, + "step": 97195 + }, + { + "epoch": 10.825258937520882, + "grad_norm": 0.10264093428850174, + "learning_rate": 2.5762819249788856e-05, + "loss": 0.0481, + "num_input_tokens_seen": 118276800, + "step": 97200 + }, + { + "epoch": 10.8258157924045, + "grad_norm": 0.03662451356649399, + "learning_rate": 2.576039063973795e-05, + "loss": 0.0896, + "num_input_tokens_seen": 118282880, + "step": 97205 + }, + { + "epoch": 10.826372647288117, + "grad_norm": 0.17356343567371368, + "learning_rate": 2.575796202250455e-05, + "loss": 0.0793, + "num_input_tokens_seen": 118288896, + "step": 97210 + }, + { + "epoch": 10.826929502171733, + "grad_norm": 0.02306346409022808, + "learning_rate": 2.5755533398111592e-05, + "loss": 0.1481, + "num_input_tokens_seen": 118294752, + "step": 97215 + }, + { + "epoch": 10.827486357055351, + "grad_norm": 0.9063066244125366, + "learning_rate": 2.575310476658201e-05, + "loss": 0.0335, + "num_input_tokens_seen": 118300864, + "step": 97220 + }, + { + "epoch": 10.828043211938969, + "grad_norm": 0.11796343326568604, + "learning_rate": 2.575067612793875e-05, + "loss": 0.0824, + "num_input_tokens_seen": 118306144, + "step": 97225 + }, + { + "epoch": 10.828600066822586, + "grad_norm": 0.017360538244247437, + "learning_rate": 2.5748247482204758e-05, + "loss": 0.0388, + "num_input_tokens_seen": 118312160, + "step": 97230 + }, + { + "epoch": 10.829156921706204, + "grad_norm": 1.211847186088562, + "learning_rate": 2.5745818829402962e-05, + "loss": 0.1081, + "num_input_tokens_seen": 118318496, + "step": 97235 + }, + { + "epoch": 10.82971377658982, + "grad_norm": 0.5994753241539001, + "learning_rate": 2.5743390169556315e-05, + "loss": 0.0242, + "num_input_tokens_seen": 118324512, + "step": 97240 + }, + { + "epoch": 10.830270631473438, + "grad_norm": 1.7902841567993164, + "learning_rate": 2.574096150268775e-05, + "loss": 0.0836, + "num_input_tokens_seen": 118330784, + "step": 97245 + }, + { + "epoch": 10.830827486357055, + "grad_norm": 0.018315060064196587, + "learning_rate": 2.573853282882021e-05, + "loss": 0.0046, + "num_input_tokens_seen": 118337280, + "step": 97250 + }, + { + "epoch": 10.831384341240673, + "grad_norm": 0.021491140127182007, + "learning_rate": 2.5736104147976636e-05, + "loss": 0.0282, + "num_input_tokens_seen": 118343168, + "step": 97255 + }, + { + "epoch": 10.83194119612429, + "grad_norm": 0.0001113383550546132, + "learning_rate": 2.573367546017996e-05, + "loss": 0.0021, + "num_input_tokens_seen": 118349600, + "step": 97260 + }, + { + "epoch": 10.832498051007907, + "grad_norm": 0.5278540849685669, + "learning_rate": 2.573124676545315e-05, + "loss": 0.0967, + "num_input_tokens_seen": 118355392, + "step": 97265 + }, + { + "epoch": 10.833054905891524, + "grad_norm": 0.03476724401116371, + "learning_rate": 2.5728818063819117e-05, + "loss": 0.0114, + "num_input_tokens_seen": 118361664, + "step": 97270 + }, + { + "epoch": 10.833611760775142, + "grad_norm": 0.2151334434747696, + "learning_rate": 2.5726389355300812e-05, + "loss": 0.0688, + "num_input_tokens_seen": 118368000, + "step": 97275 + }, + { + "epoch": 10.83416861565876, + "grad_norm": 1.4984427690505981, + "learning_rate": 2.572396063992118e-05, + "loss": 0.0686, + "num_input_tokens_seen": 118374080, + "step": 97280 + }, + { + "epoch": 10.834725470542377, + "grad_norm": 0.5296180844306946, + "learning_rate": 2.5721531917703158e-05, + "loss": 0.0467, + "num_input_tokens_seen": 118380192, + "step": 97285 + }, + { + "epoch": 10.835282325425993, + "grad_norm": 0.2158633917570114, + "learning_rate": 2.5719103188669695e-05, + "loss": 0.0344, + "num_input_tokens_seen": 118386240, + "step": 97290 + }, + { + "epoch": 10.83583918030961, + "grad_norm": 0.14479076862335205, + "learning_rate": 2.571667445284372e-05, + "loss": 0.0377, + "num_input_tokens_seen": 118391744, + "step": 97295 + }, + { + "epoch": 10.836396035193228, + "grad_norm": 0.3217619061470032, + "learning_rate": 2.5714245710248187e-05, + "loss": 0.0151, + "num_input_tokens_seen": 118398272, + "step": 97300 + }, + { + "epoch": 10.836952890076846, + "grad_norm": 0.5232460498809814, + "learning_rate": 2.571181696090602e-05, + "loss": 0.0056, + "num_input_tokens_seen": 118403968, + "step": 97305 + }, + { + "epoch": 10.837509744960464, + "grad_norm": 0.04568891227245331, + "learning_rate": 2.5709388204840185e-05, + "loss": 0.0395, + "num_input_tokens_seen": 118409664, + "step": 97310 + }, + { + "epoch": 10.83806659984408, + "grad_norm": 0.1459585428237915, + "learning_rate": 2.57069594420736e-05, + "loss": 0.0374, + "num_input_tokens_seen": 118415968, + "step": 97315 + }, + { + "epoch": 10.838623454727697, + "grad_norm": 1.141908884048462, + "learning_rate": 2.570453067262922e-05, + "loss": 0.0873, + "num_input_tokens_seen": 118421856, + "step": 97320 + }, + { + "epoch": 10.839180309611315, + "grad_norm": 1.0054506063461304, + "learning_rate": 2.5702101896529983e-05, + "loss": 0.0285, + "num_input_tokens_seen": 118428256, + "step": 97325 + }, + { + "epoch": 10.839737164494933, + "grad_norm": 2.6817636489868164, + "learning_rate": 2.5699673113798832e-05, + "loss": 0.1934, + "num_input_tokens_seen": 118434912, + "step": 97330 + }, + { + "epoch": 10.84029401937855, + "grad_norm": 0.14674325287342072, + "learning_rate": 2.5697244324458714e-05, + "loss": 0.0695, + "num_input_tokens_seen": 118440928, + "step": 97335 + }, + { + "epoch": 10.840850874262168, + "grad_norm": 0.0021488796919584274, + "learning_rate": 2.5694815528532558e-05, + "loss": 0.0481, + "num_input_tokens_seen": 118447136, + "step": 97340 + }, + { + "epoch": 10.841407729145784, + "grad_norm": 0.019396087154746056, + "learning_rate": 2.569238672604331e-05, + "loss": 0.0064, + "num_input_tokens_seen": 118453312, + "step": 97345 + }, + { + "epoch": 10.841964584029402, + "grad_norm": 0.25011488795280457, + "learning_rate": 2.5689957917013924e-05, + "loss": 0.0218, + "num_input_tokens_seen": 118459744, + "step": 97350 + }, + { + "epoch": 10.84252143891302, + "grad_norm": 2.8338446617126465, + "learning_rate": 2.5687529101467324e-05, + "loss": 0.1783, + "num_input_tokens_seen": 118465568, + "step": 97355 + }, + { + "epoch": 10.843078293796637, + "grad_norm": 0.26708096265792847, + "learning_rate": 2.5685100279426465e-05, + "loss": 0.0075, + "num_input_tokens_seen": 118471616, + "step": 97360 + }, + { + "epoch": 10.843635148680255, + "grad_norm": 0.0875890776515007, + "learning_rate": 2.568267145091428e-05, + "loss": 0.0965, + "num_input_tokens_seen": 118477248, + "step": 97365 + }, + { + "epoch": 10.84419200356387, + "grad_norm": 0.00035733095137402415, + "learning_rate": 2.5680242615953716e-05, + "loss": 0.0878, + "num_input_tokens_seen": 118483168, + "step": 97370 + }, + { + "epoch": 10.844748858447488, + "grad_norm": 1.2750017642974854, + "learning_rate": 2.5677813774567722e-05, + "loss": 0.0505, + "num_input_tokens_seen": 118489280, + "step": 97375 + }, + { + "epoch": 10.845305713331106, + "grad_norm": 0.003915810026228428, + "learning_rate": 2.5675384926779227e-05, + "loss": 0.1026, + "num_input_tokens_seen": 118495072, + "step": 97380 + }, + { + "epoch": 10.845862568214724, + "grad_norm": 1.318185806274414, + "learning_rate": 2.5672956072611187e-05, + "loss": 0.1064, + "num_input_tokens_seen": 118501184, + "step": 97385 + }, + { + "epoch": 10.846419423098341, + "grad_norm": 0.0003757753875106573, + "learning_rate": 2.567052721208653e-05, + "loss": 0.0197, + "num_input_tokens_seen": 118507168, + "step": 97390 + }, + { + "epoch": 10.846976277981957, + "grad_norm": 0.7932165861129761, + "learning_rate": 2.566809834522822e-05, + "loss": 0.1224, + "num_input_tokens_seen": 118513632, + "step": 97395 + }, + { + "epoch": 10.847533132865575, + "grad_norm": 0.08847390115261078, + "learning_rate": 2.5665669472059172e-05, + "loss": 0.0794, + "num_input_tokens_seen": 118519712, + "step": 97400 + }, + { + "epoch": 10.848089987749193, + "grad_norm": 0.00167077558580786, + "learning_rate": 2.5663240592602344e-05, + "loss": 0.0223, + "num_input_tokens_seen": 118525664, + "step": 97405 + }, + { + "epoch": 10.84864684263281, + "grad_norm": 1.2571525573730469, + "learning_rate": 2.5660811706880684e-05, + "loss": 0.1408, + "num_input_tokens_seen": 118531808, + "step": 97410 + }, + { + "epoch": 10.849203697516428, + "grad_norm": 0.01239081285893917, + "learning_rate": 2.565838281491712e-05, + "loss": 0.0626, + "num_input_tokens_seen": 118537952, + "step": 97415 + }, + { + "epoch": 10.849760552400044, + "grad_norm": 0.9560238718986511, + "learning_rate": 2.565595391673461e-05, + "loss": 0.0537, + "num_input_tokens_seen": 118543968, + "step": 97420 + }, + { + "epoch": 10.850317407283661, + "grad_norm": 0.8997625708580017, + "learning_rate": 2.565352501235609e-05, + "loss": 0.0287, + "num_input_tokens_seen": 118549952, + "step": 97425 + }, + { + "epoch": 10.85087426216728, + "grad_norm": 1.4502071142196655, + "learning_rate": 2.5651096101804495e-05, + "loss": 0.0667, + "num_input_tokens_seen": 118556256, + "step": 97430 + }, + { + "epoch": 10.851431117050897, + "grad_norm": 0.004703386686742306, + "learning_rate": 2.564866718510278e-05, + "loss": 0.0266, + "num_input_tokens_seen": 118562432, + "step": 97435 + }, + { + "epoch": 10.851987971934514, + "grad_norm": 2.2628297805786133, + "learning_rate": 2.5646238262273885e-05, + "loss": 0.1374, + "num_input_tokens_seen": 118568352, + "step": 97440 + }, + { + "epoch": 10.85254482681813, + "grad_norm": 0.04852946475148201, + "learning_rate": 2.564380933334075e-05, + "loss": 0.1488, + "num_input_tokens_seen": 118574784, + "step": 97445 + }, + { + "epoch": 10.853101681701748, + "grad_norm": 0.8312678933143616, + "learning_rate": 2.564138039832632e-05, + "loss": 0.0249, + "num_input_tokens_seen": 118581184, + "step": 97450 + }, + { + "epoch": 10.853658536585366, + "grad_norm": 0.00916082039475441, + "learning_rate": 2.5638951457253548e-05, + "loss": 0.0338, + "num_input_tokens_seen": 118587136, + "step": 97455 + }, + { + "epoch": 10.854215391468983, + "grad_norm": 0.2840903103351593, + "learning_rate": 2.563652251014535e-05, + "loss": 0.0441, + "num_input_tokens_seen": 118592928, + "step": 97460 + }, + { + "epoch": 10.854772246352601, + "grad_norm": 0.6447079181671143, + "learning_rate": 2.5634093557024695e-05, + "loss": 0.0233, + "num_input_tokens_seen": 118599424, + "step": 97465 + }, + { + "epoch": 10.855329101236219, + "grad_norm": 2.4103946685791016, + "learning_rate": 2.5631664597914522e-05, + "loss": 0.071, + "num_input_tokens_seen": 118605216, + "step": 97470 + }, + { + "epoch": 10.855885956119835, + "grad_norm": 0.041824821382761, + "learning_rate": 2.5629235632837767e-05, + "loss": 0.0139, + "num_input_tokens_seen": 118611264, + "step": 97475 + }, + { + "epoch": 10.856442811003452, + "grad_norm": 0.6205915212631226, + "learning_rate": 2.5626806661817382e-05, + "loss": 0.0218, + "num_input_tokens_seen": 118617632, + "step": 97480 + }, + { + "epoch": 10.85699966588707, + "grad_norm": 0.2942173480987549, + "learning_rate": 2.56243776848763e-05, + "loss": 0.0498, + "num_input_tokens_seen": 118623712, + "step": 97485 + }, + { + "epoch": 10.857556520770688, + "grad_norm": 0.00154151301831007, + "learning_rate": 2.5621948702037475e-05, + "loss": 0.0019, + "num_input_tokens_seen": 118630304, + "step": 97490 + }, + { + "epoch": 10.858113375654305, + "grad_norm": 3.7925033569335938, + "learning_rate": 2.5619519713323843e-05, + "loss": 0.1374, + "num_input_tokens_seen": 118636608, + "step": 97495 + }, + { + "epoch": 10.858670230537921, + "grad_norm": 0.07732213288545609, + "learning_rate": 2.5617090718758347e-05, + "loss": 0.0247, + "num_input_tokens_seen": 118643072, + "step": 97500 + }, + { + "epoch": 10.859227085421539, + "grad_norm": 1.2862006425857544, + "learning_rate": 2.561466171836394e-05, + "loss": 0.1544, + "num_input_tokens_seen": 118649312, + "step": 97505 + }, + { + "epoch": 10.859783940305157, + "grad_norm": 0.0022389309015125036, + "learning_rate": 2.561223271216356e-05, + "loss": 0.0163, + "num_input_tokens_seen": 118655072, + "step": 97510 + }, + { + "epoch": 10.860340795188774, + "grad_norm": 0.022383974865078926, + "learning_rate": 2.5609803700180153e-05, + "loss": 0.027, + "num_input_tokens_seen": 118661152, + "step": 97515 + }, + { + "epoch": 10.860897650072392, + "grad_norm": 0.02743842452764511, + "learning_rate": 2.5607374682436658e-05, + "loss": 0.0743, + "num_input_tokens_seen": 118667360, + "step": 97520 + }, + { + "epoch": 10.861454504956008, + "grad_norm": 0.5650442242622375, + "learning_rate": 2.5604945658956026e-05, + "loss": 0.0509, + "num_input_tokens_seen": 118673632, + "step": 97525 + }, + { + "epoch": 10.862011359839626, + "grad_norm": 0.029093651100993156, + "learning_rate": 2.5602516629761198e-05, + "loss": 0.1469, + "num_input_tokens_seen": 118679072, + "step": 97530 + }, + { + "epoch": 10.862568214723243, + "grad_norm": 0.3220129609107971, + "learning_rate": 2.560008759487511e-05, + "loss": 0.0471, + "num_input_tokens_seen": 118685088, + "step": 97535 + }, + { + "epoch": 10.86312506960686, + "grad_norm": 0.7783121466636658, + "learning_rate": 2.5597658554320725e-05, + "loss": 0.0514, + "num_input_tokens_seen": 118690464, + "step": 97540 + }, + { + "epoch": 10.863681924490479, + "grad_norm": 0.2184889018535614, + "learning_rate": 2.5595229508120965e-05, + "loss": 0.0087, + "num_input_tokens_seen": 118696480, + "step": 97545 + }, + { + "epoch": 10.864238779374094, + "grad_norm": 0.014300770126283169, + "learning_rate": 2.55928004562988e-05, + "loss": 0.0081, + "num_input_tokens_seen": 118702464, + "step": 97550 + }, + { + "epoch": 10.864795634257712, + "grad_norm": 0.1753070056438446, + "learning_rate": 2.5590371398877145e-05, + "loss": 0.0912, + "num_input_tokens_seen": 118708480, + "step": 97555 + }, + { + "epoch": 10.86535248914133, + "grad_norm": 0.15755032002925873, + "learning_rate": 2.5587942335878963e-05, + "loss": 0.0108, + "num_input_tokens_seen": 118714912, + "step": 97560 + }, + { + "epoch": 10.865909344024947, + "grad_norm": 0.0005999960703775287, + "learning_rate": 2.55855132673272e-05, + "loss": 0.0034, + "num_input_tokens_seen": 118721152, + "step": 97565 + }, + { + "epoch": 10.866466198908565, + "grad_norm": 2.395807981491089, + "learning_rate": 2.558308419324479e-05, + "loss": 0.1118, + "num_input_tokens_seen": 118727360, + "step": 97570 + }, + { + "epoch": 10.867023053792181, + "grad_norm": 0.07277524471282959, + "learning_rate": 2.558065511365469e-05, + "loss": 0.0138, + "num_input_tokens_seen": 118733600, + "step": 97575 + }, + { + "epoch": 10.867579908675799, + "grad_norm": 1.2230430841445923, + "learning_rate": 2.5578226028579826e-05, + "loss": 0.0373, + "num_input_tokens_seen": 118739840, + "step": 97580 + }, + { + "epoch": 10.868136763559416, + "grad_norm": 1.8165221214294434, + "learning_rate": 2.557579693804316e-05, + "loss": 0.0745, + "num_input_tokens_seen": 118745248, + "step": 97585 + }, + { + "epoch": 10.868693618443034, + "grad_norm": 0.0016751374350860715, + "learning_rate": 2.557336784206763e-05, + "loss": 0.0395, + "num_input_tokens_seen": 118751296, + "step": 97590 + }, + { + "epoch": 10.869250473326652, + "grad_norm": 0.7809942960739136, + "learning_rate": 2.557093874067618e-05, + "loss": 0.0422, + "num_input_tokens_seen": 118757088, + "step": 97595 + }, + { + "epoch": 10.869807328210268, + "grad_norm": 0.5073838233947754, + "learning_rate": 2.5568509633891762e-05, + "loss": 0.0208, + "num_input_tokens_seen": 118763104, + "step": 97600 + }, + { + "epoch": 10.870364183093885, + "grad_norm": 0.2317887395620346, + "learning_rate": 2.5566080521737307e-05, + "loss": 0.0349, + "num_input_tokens_seen": 118769312, + "step": 97605 + }, + { + "epoch": 10.870921037977503, + "grad_norm": 0.6534755229949951, + "learning_rate": 2.556365140423577e-05, + "loss": 0.1046, + "num_input_tokens_seen": 118775328, + "step": 97610 + }, + { + "epoch": 10.87147789286112, + "grad_norm": 0.008980855345726013, + "learning_rate": 2.5561222281410097e-05, + "loss": 0.0887, + "num_input_tokens_seen": 118781344, + "step": 97615 + }, + { + "epoch": 10.872034747744738, + "grad_norm": 0.033578742295503616, + "learning_rate": 2.5558793153283233e-05, + "loss": 0.0308, + "num_input_tokens_seen": 118787168, + "step": 97620 + }, + { + "epoch": 10.872591602628354, + "grad_norm": 0.004307516384869814, + "learning_rate": 2.5556364019878115e-05, + "loss": 0.014, + "num_input_tokens_seen": 118793536, + "step": 97625 + }, + { + "epoch": 10.873148457511972, + "grad_norm": 0.033676229417324066, + "learning_rate": 2.555393488121769e-05, + "loss": 0.0454, + "num_input_tokens_seen": 118799648, + "step": 97630 + }, + { + "epoch": 10.87370531239559, + "grad_norm": 0.013075952418148518, + "learning_rate": 2.555150573732491e-05, + "loss": 0.0976, + "num_input_tokens_seen": 118805920, + "step": 97635 + }, + { + "epoch": 10.874262167279207, + "grad_norm": 0.018249759450554848, + "learning_rate": 2.5549076588222715e-05, + "loss": 0.0377, + "num_input_tokens_seen": 118812256, + "step": 97640 + }, + { + "epoch": 10.874819022162825, + "grad_norm": 0.1062714159488678, + "learning_rate": 2.5546647433934056e-05, + "loss": 0.011, + "num_input_tokens_seen": 118818688, + "step": 97645 + }, + { + "epoch": 10.87537587704644, + "grad_norm": 1.9556257724761963, + "learning_rate": 2.5544218274481868e-05, + "loss": 0.0908, + "num_input_tokens_seen": 118824928, + "step": 97650 + }, + { + "epoch": 10.875932731930058, + "grad_norm": 0.25391024351119995, + "learning_rate": 2.55417891098891e-05, + "loss": 0.0921, + "num_input_tokens_seen": 118831168, + "step": 97655 + }, + { + "epoch": 10.876489586813676, + "grad_norm": 0.44310998916625977, + "learning_rate": 2.5539359940178714e-05, + "loss": 0.0121, + "num_input_tokens_seen": 118837184, + "step": 97660 + }, + { + "epoch": 10.877046441697294, + "grad_norm": 0.7256176471710205, + "learning_rate": 2.553693076537363e-05, + "loss": 0.0498, + "num_input_tokens_seen": 118843296, + "step": 97665 + }, + { + "epoch": 10.877603296580912, + "grad_norm": 0.40346649289131165, + "learning_rate": 2.5534501585496807e-05, + "loss": 0.0311, + "num_input_tokens_seen": 118849600, + "step": 97670 + }, + { + "epoch": 10.878160151464527, + "grad_norm": 0.007599739357829094, + "learning_rate": 2.5532072400571187e-05, + "loss": 0.0766, + "num_input_tokens_seen": 118856064, + "step": 97675 + }, + { + "epoch": 10.878717006348145, + "grad_norm": 0.7188601493835449, + "learning_rate": 2.552964321061972e-05, + "loss": 0.1344, + "num_input_tokens_seen": 118862208, + "step": 97680 + }, + { + "epoch": 10.879273861231763, + "grad_norm": 0.013225200586020947, + "learning_rate": 2.5527214015665346e-05, + "loss": 0.021, + "num_input_tokens_seen": 118868480, + "step": 97685 + }, + { + "epoch": 10.87983071611538, + "grad_norm": 0.0005686167860403657, + "learning_rate": 2.5524784815731014e-05, + "loss": 0.0651, + "num_input_tokens_seen": 118874816, + "step": 97690 + }, + { + "epoch": 10.880387570998998, + "grad_norm": 0.0011437512002885342, + "learning_rate": 2.5522355610839672e-05, + "loss": 0.0424, + "num_input_tokens_seen": 118880640, + "step": 97695 + }, + { + "epoch": 10.880944425882616, + "grad_norm": 0.4497832655906677, + "learning_rate": 2.5519926401014254e-05, + "loss": 0.0634, + "num_input_tokens_seen": 118886976, + "step": 97700 + }, + { + "epoch": 10.881501280766232, + "grad_norm": 1.8733075857162476, + "learning_rate": 2.5517497186277723e-05, + "loss": 0.0688, + "num_input_tokens_seen": 118893024, + "step": 97705 + }, + { + "epoch": 10.88205813564985, + "grad_norm": 1.108591079711914, + "learning_rate": 2.5515067966653012e-05, + "loss": 0.1289, + "num_input_tokens_seen": 118899040, + "step": 97710 + }, + { + "epoch": 10.882614990533467, + "grad_norm": 0.012973660603165627, + "learning_rate": 2.5512638742163076e-05, + "loss": 0.0996, + "num_input_tokens_seen": 118905440, + "step": 97715 + }, + { + "epoch": 10.883171845417085, + "grad_norm": 0.012428217567503452, + "learning_rate": 2.5510209512830853e-05, + "loss": 0.0642, + "num_input_tokens_seen": 118911424, + "step": 97720 + }, + { + "epoch": 10.883728700300702, + "grad_norm": 0.7110714912414551, + "learning_rate": 2.550778027867929e-05, + "loss": 0.0161, + "num_input_tokens_seen": 118916768, + "step": 97725 + }, + { + "epoch": 10.884285555184318, + "grad_norm": 0.017111901193857193, + "learning_rate": 2.550535103973134e-05, + "loss": 0.0421, + "num_input_tokens_seen": 118922880, + "step": 97730 + }, + { + "epoch": 10.884842410067936, + "grad_norm": 0.004031455609947443, + "learning_rate": 2.5502921796009943e-05, + "loss": 0.0154, + "num_input_tokens_seen": 118929504, + "step": 97735 + }, + { + "epoch": 10.885399264951554, + "grad_norm": 0.00011341040226398036, + "learning_rate": 2.5500492547538047e-05, + "loss": 0.014, + "num_input_tokens_seen": 118935648, + "step": 97740 + }, + { + "epoch": 10.885956119835171, + "grad_norm": 0.03759694844484329, + "learning_rate": 2.5498063294338604e-05, + "loss": 0.012, + "num_input_tokens_seen": 118941760, + "step": 97745 + }, + { + "epoch": 10.886512974718789, + "grad_norm": 0.0034040792379528284, + "learning_rate": 2.549563403643454e-05, + "loss": 0.0071, + "num_input_tokens_seen": 118947776, + "step": 97750 + }, + { + "epoch": 10.887069829602405, + "grad_norm": 0.023974308744072914, + "learning_rate": 2.549320477384883e-05, + "loss": 0.0405, + "num_input_tokens_seen": 118953952, + "step": 97755 + }, + { + "epoch": 10.887626684486023, + "grad_norm": 0.002181120216846466, + "learning_rate": 2.5490775506604397e-05, + "loss": 0.1085, + "num_input_tokens_seen": 118960192, + "step": 97760 + }, + { + "epoch": 10.88818353936964, + "grad_norm": 0.00013829334056936204, + "learning_rate": 2.5488346234724197e-05, + "loss": 0.0314, + "num_input_tokens_seen": 118966176, + "step": 97765 + }, + { + "epoch": 10.888740394253258, + "grad_norm": 0.008078720420598984, + "learning_rate": 2.5485916958231175e-05, + "loss": 0.1742, + "num_input_tokens_seen": 118971904, + "step": 97770 + }, + { + "epoch": 10.889297249136876, + "grad_norm": 0.13754132390022278, + "learning_rate": 2.5483487677148277e-05, + "loss": 0.0221, + "num_input_tokens_seen": 118977856, + "step": 97775 + }, + { + "epoch": 10.889854104020491, + "grad_norm": 0.03828655555844307, + "learning_rate": 2.548105839149846e-05, + "loss": 0.0025, + "num_input_tokens_seen": 118983936, + "step": 97780 + }, + { + "epoch": 10.89041095890411, + "grad_norm": 0.010039282962679863, + "learning_rate": 2.5478629101304652e-05, + "loss": 0.1216, + "num_input_tokens_seen": 118989952, + "step": 97785 + }, + { + "epoch": 10.890967813787727, + "grad_norm": 0.0950859859585762, + "learning_rate": 2.5476199806589813e-05, + "loss": 0.063, + "num_input_tokens_seen": 118996000, + "step": 97790 + }, + { + "epoch": 10.891524668671345, + "grad_norm": 0.022206351161003113, + "learning_rate": 2.547377050737688e-05, + "loss": 0.0766, + "num_input_tokens_seen": 119002112, + "step": 97795 + }, + { + "epoch": 10.892081523554962, + "grad_norm": 0.036116570234298706, + "learning_rate": 2.5471341203688808e-05, + "loss": 0.0399, + "num_input_tokens_seen": 119008192, + "step": 97800 + }, + { + "epoch": 10.89263837843858, + "grad_norm": 0.008188621141016483, + "learning_rate": 2.546891189554854e-05, + "loss": 0.0226, + "num_input_tokens_seen": 119014400, + "step": 97805 + }, + { + "epoch": 10.893195233322196, + "grad_norm": 0.06000886857509613, + "learning_rate": 2.5466482582979023e-05, + "loss": 0.0244, + "num_input_tokens_seen": 119020096, + "step": 97810 + }, + { + "epoch": 10.893752088205813, + "grad_norm": 0.23908977210521698, + "learning_rate": 2.5464053266003206e-05, + "loss": 0.0046, + "num_input_tokens_seen": 119026272, + "step": 97815 + }, + { + "epoch": 10.894308943089431, + "grad_norm": 1.7440226078033447, + "learning_rate": 2.5461623944644035e-05, + "loss": 0.0493, + "num_input_tokens_seen": 119032384, + "step": 97820 + }, + { + "epoch": 10.894865797973049, + "grad_norm": 0.9341450929641724, + "learning_rate": 2.545919461892446e-05, + "loss": 0.1371, + "num_input_tokens_seen": 119038368, + "step": 97825 + }, + { + "epoch": 10.895422652856666, + "grad_norm": 0.5156838297843933, + "learning_rate": 2.545676528886741e-05, + "loss": 0.0194, + "num_input_tokens_seen": 119044544, + "step": 97830 + }, + { + "epoch": 10.895979507740282, + "grad_norm": 0.12839379906654358, + "learning_rate": 2.5454335954495855e-05, + "loss": 0.0581, + "num_input_tokens_seen": 119050336, + "step": 97835 + }, + { + "epoch": 10.8965363626239, + "grad_norm": 0.9451374411582947, + "learning_rate": 2.5451906615832732e-05, + "loss": 0.0304, + "num_input_tokens_seen": 119056320, + "step": 97840 + }, + { + "epoch": 10.897093217507518, + "grad_norm": 0.012928341515362263, + "learning_rate": 2.544947727290099e-05, + "loss": 0.1429, + "num_input_tokens_seen": 119062240, + "step": 97845 + }, + { + "epoch": 10.897650072391135, + "grad_norm": 0.17210142314434052, + "learning_rate": 2.5447047925723576e-05, + "loss": 0.0216, + "num_input_tokens_seen": 119068608, + "step": 97850 + }, + { + "epoch": 10.898206927274753, + "grad_norm": 0.05628197640180588, + "learning_rate": 2.5444618574323432e-05, + "loss": 0.0056, + "num_input_tokens_seen": 119074624, + "step": 97855 + }, + { + "epoch": 10.898763782158369, + "grad_norm": 0.42545199394226074, + "learning_rate": 2.5442189218723516e-05, + "loss": 0.0282, + "num_input_tokens_seen": 119080704, + "step": 97860 + }, + { + "epoch": 10.899320637041987, + "grad_norm": 0.00036049800110049546, + "learning_rate": 2.543975985894676e-05, + "loss": 0.0403, + "num_input_tokens_seen": 119086560, + "step": 97865 + }, + { + "epoch": 10.899877491925604, + "grad_norm": 0.0047966111451387405, + "learning_rate": 2.5437330495016127e-05, + "loss": 0.0122, + "num_input_tokens_seen": 119092416, + "step": 97870 + }, + { + "epoch": 10.900434346809222, + "grad_norm": 0.0032074374612420797, + "learning_rate": 2.5434901126954554e-05, + "loss": 0.0508, + "num_input_tokens_seen": 119098624, + "step": 97875 + }, + { + "epoch": 10.90099120169284, + "grad_norm": 2.046658754348755, + "learning_rate": 2.5432471754784993e-05, + "loss": 0.1158, + "num_input_tokens_seen": 119104832, + "step": 97880 + }, + { + "epoch": 10.901548056576456, + "grad_norm": 0.19736047089099884, + "learning_rate": 2.5430042378530388e-05, + "loss": 0.0343, + "num_input_tokens_seen": 119111328, + "step": 97885 + }, + { + "epoch": 10.902104911460073, + "grad_norm": 0.0022386410273611546, + "learning_rate": 2.5427612998213685e-05, + "loss": 0.215, + "num_input_tokens_seen": 119117760, + "step": 97890 + }, + { + "epoch": 10.90266176634369, + "grad_norm": 0.004348650109022856, + "learning_rate": 2.5425183613857843e-05, + "loss": 0.062, + "num_input_tokens_seen": 119124000, + "step": 97895 + }, + { + "epoch": 10.903218621227309, + "grad_norm": 0.4823421537876129, + "learning_rate": 2.5422754225485796e-05, + "loss": 0.0292, + "num_input_tokens_seen": 119130080, + "step": 97900 + }, + { + "epoch": 10.903775476110926, + "grad_norm": 0.05785614997148514, + "learning_rate": 2.5420324833120496e-05, + "loss": 0.0017, + "num_input_tokens_seen": 119135872, + "step": 97905 + }, + { + "epoch": 10.904332330994542, + "grad_norm": 0.6576644778251648, + "learning_rate": 2.5417895436784895e-05, + "loss": 0.0336, + "num_input_tokens_seen": 119142048, + "step": 97910 + }, + { + "epoch": 10.90488918587816, + "grad_norm": 0.06197187677025795, + "learning_rate": 2.5415466036501927e-05, + "loss": 0.0152, + "num_input_tokens_seen": 119148256, + "step": 97915 + }, + { + "epoch": 10.905446040761777, + "grad_norm": 1.1360738277435303, + "learning_rate": 2.541303663229456e-05, + "loss": 0.0638, + "num_input_tokens_seen": 119154176, + "step": 97920 + }, + { + "epoch": 10.906002895645395, + "grad_norm": 0.7611427903175354, + "learning_rate": 2.5410607224185727e-05, + "loss": 0.066, + "num_input_tokens_seen": 119160224, + "step": 97925 + }, + { + "epoch": 10.906559750529013, + "grad_norm": 0.03509065508842468, + "learning_rate": 2.5408177812198387e-05, + "loss": 0.0753, + "num_input_tokens_seen": 119165920, + "step": 97930 + }, + { + "epoch": 10.907116605412629, + "grad_norm": 0.17155197262763977, + "learning_rate": 2.5405748396355465e-05, + "loss": 0.0602, + "num_input_tokens_seen": 119172160, + "step": 97935 + }, + { + "epoch": 10.907673460296246, + "grad_norm": 0.09055894613265991, + "learning_rate": 2.5403318976679936e-05, + "loss": 0.0284, + "num_input_tokens_seen": 119178496, + "step": 97940 + }, + { + "epoch": 10.908230315179864, + "grad_norm": 0.0003732749028131366, + "learning_rate": 2.5400889553194734e-05, + "loss": 0.0058, + "num_input_tokens_seen": 119184512, + "step": 97945 + }, + { + "epoch": 10.908787170063482, + "grad_norm": 0.0005081743584014475, + "learning_rate": 2.5398460125922806e-05, + "loss": 0.0083, + "num_input_tokens_seen": 119190976, + "step": 97950 + }, + { + "epoch": 10.9093440249471, + "grad_norm": 0.405988484621048, + "learning_rate": 2.5396030694887108e-05, + "loss": 0.0372, + "num_input_tokens_seen": 119197120, + "step": 97955 + }, + { + "epoch": 10.909900879830715, + "grad_norm": 0.002229145262390375, + "learning_rate": 2.5393601260110577e-05, + "loss": 0.1515, + "num_input_tokens_seen": 119203200, + "step": 97960 + }, + { + "epoch": 10.910457734714333, + "grad_norm": 0.664757490158081, + "learning_rate": 2.5391171821616168e-05, + "loss": 0.0392, + "num_input_tokens_seen": 119209216, + "step": 97965 + }, + { + "epoch": 10.91101458959795, + "grad_norm": 1.170605182647705, + "learning_rate": 2.5388742379426835e-05, + "loss": 0.0315, + "num_input_tokens_seen": 119216000, + "step": 97970 + }, + { + "epoch": 10.911571444481568, + "grad_norm": 0.09001148492097855, + "learning_rate": 2.538631293356551e-05, + "loss": 0.0202, + "num_input_tokens_seen": 119222080, + "step": 97975 + }, + { + "epoch": 10.912128299365186, + "grad_norm": 0.1463901847600937, + "learning_rate": 2.5383883484055155e-05, + "loss": 0.0098, + "num_input_tokens_seen": 119228416, + "step": 97980 + }, + { + "epoch": 10.912685154248802, + "grad_norm": 0.0973513200879097, + "learning_rate": 2.538145403091871e-05, + "loss": 0.0134, + "num_input_tokens_seen": 119234272, + "step": 97985 + }, + { + "epoch": 10.91324200913242, + "grad_norm": 0.924429714679718, + "learning_rate": 2.5379024574179126e-05, + "loss": 0.0984, + "num_input_tokens_seen": 119240032, + "step": 97990 + }, + { + "epoch": 10.913798864016037, + "grad_norm": 2.1223809719085693, + "learning_rate": 2.537659511385935e-05, + "loss": 0.1162, + "num_input_tokens_seen": 119246400, + "step": 97995 + }, + { + "epoch": 10.914355718899655, + "grad_norm": 0.03129236400127411, + "learning_rate": 2.5374165649982336e-05, + "loss": 0.0044, + "num_input_tokens_seen": 119252768, + "step": 98000 + }, + { + "epoch": 10.914912573783273, + "grad_norm": 0.6834895610809326, + "learning_rate": 2.5371736182571028e-05, + "loss": 0.0593, + "num_input_tokens_seen": 119258784, + "step": 98005 + }, + { + "epoch": 10.915469428666889, + "grad_norm": 0.12682165205478668, + "learning_rate": 2.5369306711648365e-05, + "loss": 0.0132, + "num_input_tokens_seen": 119264992, + "step": 98010 + }, + { + "epoch": 10.916026283550506, + "grad_norm": 0.594990611076355, + "learning_rate": 2.536687723723732e-05, + "loss": 0.0151, + "num_input_tokens_seen": 119271008, + "step": 98015 + }, + { + "epoch": 10.916583138434124, + "grad_norm": 0.1955372542142868, + "learning_rate": 2.536444775936081e-05, + "loss": 0.0208, + "num_input_tokens_seen": 119277056, + "step": 98020 + }, + { + "epoch": 10.917139993317742, + "grad_norm": 0.01978667825460434, + "learning_rate": 2.536201827804181e-05, + "loss": 0.0102, + "num_input_tokens_seen": 119283200, + "step": 98025 + }, + { + "epoch": 10.91769684820136, + "grad_norm": 0.0036295473109930754, + "learning_rate": 2.5359588793303246e-05, + "loss": 0.0551, + "num_input_tokens_seen": 119289248, + "step": 98030 + }, + { + "epoch": 10.918253703084975, + "grad_norm": 0.9932423233985901, + "learning_rate": 2.535715930516808e-05, + "loss": 0.0123, + "num_input_tokens_seen": 119295424, + "step": 98035 + }, + { + "epoch": 10.918810557968593, + "grad_norm": 0.011639509350061417, + "learning_rate": 2.5354729813659265e-05, + "loss": 0.0584, + "num_input_tokens_seen": 119301536, + "step": 98040 + }, + { + "epoch": 10.91936741285221, + "grad_norm": 0.005913838744163513, + "learning_rate": 2.5352300318799743e-05, + "loss": 0.0418, + "num_input_tokens_seen": 119307296, + "step": 98045 + }, + { + "epoch": 10.919924267735828, + "grad_norm": 1.3158965110778809, + "learning_rate": 2.534987082061246e-05, + "loss": 0.1315, + "num_input_tokens_seen": 119313088, + "step": 98050 + }, + { + "epoch": 10.920481122619446, + "grad_norm": 0.03817771375179291, + "learning_rate": 2.5347441319120364e-05, + "loss": 0.0453, + "num_input_tokens_seen": 119319264, + "step": 98055 + }, + { + "epoch": 10.921037977503063, + "grad_norm": 0.02911156788468361, + "learning_rate": 2.534501181434641e-05, + "loss": 0.0375, + "num_input_tokens_seen": 119325376, + "step": 98060 + }, + { + "epoch": 10.92159483238668, + "grad_norm": 2.7714738845825195, + "learning_rate": 2.534258230631354e-05, + "loss": 0.0572, + "num_input_tokens_seen": 119331424, + "step": 98065 + }, + { + "epoch": 10.922151687270297, + "grad_norm": 1.9234833717346191, + "learning_rate": 2.5340152795044708e-05, + "loss": 0.2986, + "num_input_tokens_seen": 119337440, + "step": 98070 + }, + { + "epoch": 10.922708542153915, + "grad_norm": 0.6616123914718628, + "learning_rate": 2.5337723280562858e-05, + "loss": 0.0193, + "num_input_tokens_seen": 119343488, + "step": 98075 + }, + { + "epoch": 10.923265397037532, + "grad_norm": 0.02431630529463291, + "learning_rate": 2.5335293762890943e-05, + "loss": 0.0432, + "num_input_tokens_seen": 119349632, + "step": 98080 + }, + { + "epoch": 10.92382225192115, + "grad_norm": 0.11882930248975754, + "learning_rate": 2.5332864242051914e-05, + "loss": 0.0354, + "num_input_tokens_seen": 119355424, + "step": 98085 + }, + { + "epoch": 10.924379106804766, + "grad_norm": 0.004061458632349968, + "learning_rate": 2.533043471806871e-05, + "loss": 0.0221, + "num_input_tokens_seen": 119361536, + "step": 98090 + }, + { + "epoch": 10.924935961688384, + "grad_norm": 0.03237837553024292, + "learning_rate": 2.532800519096428e-05, + "loss": 0.0279, + "num_input_tokens_seen": 119368224, + "step": 98095 + }, + { + "epoch": 10.925492816572001, + "grad_norm": 0.2279375046491623, + "learning_rate": 2.5325575660761585e-05, + "loss": 0.0044, + "num_input_tokens_seen": 119374400, + "step": 98100 + }, + { + "epoch": 10.926049671455619, + "grad_norm": 0.0002973415539599955, + "learning_rate": 2.5323146127483565e-05, + "loss": 0.009, + "num_input_tokens_seen": 119380448, + "step": 98105 + }, + { + "epoch": 10.926606526339237, + "grad_norm": 0.0025803199969232082, + "learning_rate": 2.5320716591153178e-05, + "loss": 0.0427, + "num_input_tokens_seen": 119386592, + "step": 98110 + }, + { + "epoch": 10.927163381222853, + "grad_norm": 0.08381356298923492, + "learning_rate": 2.5318287051793353e-05, + "loss": 0.0204, + "num_input_tokens_seen": 119392928, + "step": 98115 + }, + { + "epoch": 10.92772023610647, + "grad_norm": 1.5966075658798218, + "learning_rate": 2.5315857509427066e-05, + "loss": 0.1192, + "num_input_tokens_seen": 119399360, + "step": 98120 + }, + { + "epoch": 10.928277090990088, + "grad_norm": 0.14404666423797607, + "learning_rate": 2.5313427964077242e-05, + "loss": 0.0824, + "num_input_tokens_seen": 119405344, + "step": 98125 + }, + { + "epoch": 10.928833945873706, + "grad_norm": 3.5033397674560547, + "learning_rate": 2.531099841576684e-05, + "loss": 0.0713, + "num_input_tokens_seen": 119411488, + "step": 98130 + }, + { + "epoch": 10.929390800757323, + "grad_norm": 0.39237427711486816, + "learning_rate": 2.530856886451881e-05, + "loss": 0.0632, + "num_input_tokens_seen": 119417760, + "step": 98135 + }, + { + "epoch": 10.92994765564094, + "grad_norm": 0.8173558115959167, + "learning_rate": 2.5306139310356102e-05, + "loss": 0.0936, + "num_input_tokens_seen": 119423616, + "step": 98140 + }, + { + "epoch": 10.930504510524557, + "grad_norm": 0.07847125828266144, + "learning_rate": 2.5303709753301665e-05, + "loss": 0.0448, + "num_input_tokens_seen": 119429600, + "step": 98145 + }, + { + "epoch": 10.931061365408175, + "grad_norm": 0.0005606127087958157, + "learning_rate": 2.530128019337844e-05, + "loss": 0.1522, + "num_input_tokens_seen": 119435456, + "step": 98150 + }, + { + "epoch": 10.931618220291792, + "grad_norm": 0.05932692438364029, + "learning_rate": 2.5298850630609388e-05, + "loss": 0.0639, + "num_input_tokens_seen": 119441696, + "step": 98155 + }, + { + "epoch": 10.93217507517541, + "grad_norm": 0.9167909026145935, + "learning_rate": 2.5296421065017458e-05, + "loss": 0.0739, + "num_input_tokens_seen": 119447808, + "step": 98160 + }, + { + "epoch": 10.932731930059028, + "grad_norm": 1.4428703784942627, + "learning_rate": 2.529399149662558e-05, + "loss": 0.1167, + "num_input_tokens_seen": 119454240, + "step": 98165 + }, + { + "epoch": 10.933288784942643, + "grad_norm": 0.026474518701434135, + "learning_rate": 2.5291561925456726e-05, + "loss": 0.0025, + "num_input_tokens_seen": 119460384, + "step": 98170 + }, + { + "epoch": 10.933845639826261, + "grad_norm": 0.32181861996650696, + "learning_rate": 2.5289132351533827e-05, + "loss": 0.017, + "num_input_tokens_seen": 119466656, + "step": 98175 + }, + { + "epoch": 10.934402494709879, + "grad_norm": 0.4883926510810852, + "learning_rate": 2.5286702774879843e-05, + "loss": 0.0433, + "num_input_tokens_seen": 119472800, + "step": 98180 + }, + { + "epoch": 10.934959349593496, + "grad_norm": 0.00021569992532022297, + "learning_rate": 2.5284273195517726e-05, + "loss": 0.005, + "num_input_tokens_seen": 119478848, + "step": 98185 + }, + { + "epoch": 10.935516204477114, + "grad_norm": 0.10116144269704819, + "learning_rate": 2.528184361347042e-05, + "loss": 0.028, + "num_input_tokens_seen": 119484992, + "step": 98190 + }, + { + "epoch": 10.93607305936073, + "grad_norm": 0.08752105385065079, + "learning_rate": 2.5279414028760877e-05, + "loss": 0.0208, + "num_input_tokens_seen": 119491232, + "step": 98195 + }, + { + "epoch": 10.936629914244348, + "grad_norm": 0.020987574011087418, + "learning_rate": 2.527698444141204e-05, + "loss": 0.0167, + "num_input_tokens_seen": 119496960, + "step": 98200 + }, + { + "epoch": 10.937186769127965, + "grad_norm": 0.4028050899505615, + "learning_rate": 2.5274554851446868e-05, + "loss": 0.0278, + "num_input_tokens_seen": 119502848, + "step": 98205 + }, + { + "epoch": 10.937743624011583, + "grad_norm": 1.068983793258667, + "learning_rate": 2.5272125258888302e-05, + "loss": 0.0244, + "num_input_tokens_seen": 119508256, + "step": 98210 + }, + { + "epoch": 10.9383004788952, + "grad_norm": 0.0010531417792662978, + "learning_rate": 2.5269695663759296e-05, + "loss": 0.0778, + "num_input_tokens_seen": 119514464, + "step": 98215 + }, + { + "epoch": 10.938857333778817, + "grad_norm": 7.97125103417784e-05, + "learning_rate": 2.52672660660828e-05, + "loss": 0.2158, + "num_input_tokens_seen": 119520064, + "step": 98220 + }, + { + "epoch": 10.939414188662434, + "grad_norm": 0.1303188055753708, + "learning_rate": 2.526483646588176e-05, + "loss": 0.0077, + "num_input_tokens_seen": 119526400, + "step": 98225 + }, + { + "epoch": 10.939971043546052, + "grad_norm": 0.046104058623313904, + "learning_rate": 2.5262406863179127e-05, + "loss": 0.0255, + "num_input_tokens_seen": 119532832, + "step": 98230 + }, + { + "epoch": 10.94052789842967, + "grad_norm": 0.016967356204986572, + "learning_rate": 2.5259977257997853e-05, + "loss": 0.0411, + "num_input_tokens_seen": 119538816, + "step": 98235 + }, + { + "epoch": 10.941084753313287, + "grad_norm": 0.1396828442811966, + "learning_rate": 2.525754765036088e-05, + "loss": 0.0101, + "num_input_tokens_seen": 119544960, + "step": 98240 + }, + { + "epoch": 10.941641608196903, + "grad_norm": 0.13218414783477783, + "learning_rate": 2.525511804029117e-05, + "loss": 0.0449, + "num_input_tokens_seen": 119550976, + "step": 98245 + }, + { + "epoch": 10.942198463080521, + "grad_norm": 3.0484139919281006, + "learning_rate": 2.525268842781166e-05, + "loss": 0.0514, + "num_input_tokens_seen": 119557152, + "step": 98250 + }, + { + "epoch": 10.942755317964139, + "grad_norm": 1.6405671834945679, + "learning_rate": 2.525025881294531e-05, + "loss": 0.13, + "num_input_tokens_seen": 119563328, + "step": 98255 + }, + { + "epoch": 10.943312172847756, + "grad_norm": 0.009379053488373756, + "learning_rate": 2.5247829195715062e-05, + "loss": 0.1602, + "num_input_tokens_seen": 119569664, + "step": 98260 + }, + { + "epoch": 10.943869027731374, + "grad_norm": 2.906824827194214, + "learning_rate": 2.5245399576143874e-05, + "loss": 0.0881, + "num_input_tokens_seen": 119575968, + "step": 98265 + }, + { + "epoch": 10.94442588261499, + "grad_norm": 1.3422945737838745, + "learning_rate": 2.524296995425468e-05, + "loss": 0.2428, + "num_input_tokens_seen": 119581952, + "step": 98270 + }, + { + "epoch": 10.944982737498608, + "grad_norm": 0.043258827179670334, + "learning_rate": 2.5240540330070443e-05, + "loss": 0.004, + "num_input_tokens_seen": 119588096, + "step": 98275 + }, + { + "epoch": 10.945539592382225, + "grad_norm": 0.0035011591389775276, + "learning_rate": 2.523811070361412e-05, + "loss": 0.0564, + "num_input_tokens_seen": 119593536, + "step": 98280 + }, + { + "epoch": 10.946096447265843, + "grad_norm": 0.012947777286171913, + "learning_rate": 2.523568107490864e-05, + "loss": 0.0484, + "num_input_tokens_seen": 119599840, + "step": 98285 + }, + { + "epoch": 10.94665330214946, + "grad_norm": 0.05029670149087906, + "learning_rate": 2.5233251443976975e-05, + "loss": 0.0066, + "num_input_tokens_seen": 119605696, + "step": 98290 + }, + { + "epoch": 10.947210157033076, + "grad_norm": 0.18529318273067474, + "learning_rate": 2.5230821810842048e-05, + "loss": 0.0211, + "num_input_tokens_seen": 119612064, + "step": 98295 + }, + { + "epoch": 10.947767011916694, + "grad_norm": 1.3421308994293213, + "learning_rate": 2.5228392175526838e-05, + "loss": 0.163, + "num_input_tokens_seen": 119618080, + "step": 98300 + }, + { + "epoch": 10.948323866800312, + "grad_norm": 0.1002388447523117, + "learning_rate": 2.522596253805427e-05, + "loss": 0.011, + "num_input_tokens_seen": 119624384, + "step": 98305 + }, + { + "epoch": 10.94888072168393, + "grad_norm": 0.17737315595149994, + "learning_rate": 2.522353289844731e-05, + "loss": 0.0645, + "num_input_tokens_seen": 119630784, + "step": 98310 + }, + { + "epoch": 10.949437576567547, + "grad_norm": 0.040668562054634094, + "learning_rate": 2.5221103256728906e-05, + "loss": 0.0366, + "num_input_tokens_seen": 119636928, + "step": 98315 + }, + { + "epoch": 10.949994431451163, + "grad_norm": 0.00011593779345275834, + "learning_rate": 2.5218673612921995e-05, + "loss": 0.0882, + "num_input_tokens_seen": 119643296, + "step": 98320 + }, + { + "epoch": 10.95055128633478, + "grad_norm": 2.5259811878204346, + "learning_rate": 2.5216243967049546e-05, + "loss": 0.0487, + "num_input_tokens_seen": 119649376, + "step": 98325 + }, + { + "epoch": 10.951108141218398, + "grad_norm": 1.2728893756866455, + "learning_rate": 2.5213814319134493e-05, + "loss": 0.0745, + "num_input_tokens_seen": 119655296, + "step": 98330 + }, + { + "epoch": 10.951664996102016, + "grad_norm": 0.05523676425218582, + "learning_rate": 2.5211384669199793e-05, + "loss": 0.0895, + "num_input_tokens_seen": 119660992, + "step": 98335 + }, + { + "epoch": 10.952221850985634, + "grad_norm": 1.528063416481018, + "learning_rate": 2.52089550172684e-05, + "loss": 0.1312, + "num_input_tokens_seen": 119666976, + "step": 98340 + }, + { + "epoch": 10.95277870586925, + "grad_norm": 0.0018332971958443522, + "learning_rate": 2.5206525363363255e-05, + "loss": 0.0315, + "num_input_tokens_seen": 119673792, + "step": 98345 + }, + { + "epoch": 10.953335560752867, + "grad_norm": 0.25292107462882996, + "learning_rate": 2.520409570750732e-05, + "loss": 0.0051, + "num_input_tokens_seen": 119680128, + "step": 98350 + }, + { + "epoch": 10.953892415636485, + "grad_norm": 2.2637317180633545, + "learning_rate": 2.5201666049723528e-05, + "loss": 0.1141, + "num_input_tokens_seen": 119686016, + "step": 98355 + }, + { + "epoch": 10.954449270520103, + "grad_norm": 0.2685163915157318, + "learning_rate": 2.5199236390034846e-05, + "loss": 0.0099, + "num_input_tokens_seen": 119692064, + "step": 98360 + }, + { + "epoch": 10.95500612540372, + "grad_norm": 0.6602933406829834, + "learning_rate": 2.519680672846421e-05, + "loss": 0.0175, + "num_input_tokens_seen": 119698240, + "step": 98365 + }, + { + "epoch": 10.955562980287336, + "grad_norm": 0.24458752572536469, + "learning_rate": 2.519437706503458e-05, + "loss": 0.0526, + "num_input_tokens_seen": 119704128, + "step": 98370 + }, + { + "epoch": 10.956119835170954, + "grad_norm": 0.197765052318573, + "learning_rate": 2.5191947399768906e-05, + "loss": 0.0295, + "num_input_tokens_seen": 119710656, + "step": 98375 + }, + { + "epoch": 10.956676690054572, + "grad_norm": 0.008037171326577663, + "learning_rate": 2.5189517732690126e-05, + "loss": 0.0728, + "num_input_tokens_seen": 119716640, + "step": 98380 + }, + { + "epoch": 10.95723354493819, + "grad_norm": 0.15851011872291565, + "learning_rate": 2.5187088063821214e-05, + "loss": 0.0037, + "num_input_tokens_seen": 119722912, + "step": 98385 + }, + { + "epoch": 10.957790399821807, + "grad_norm": 1.0905948877334595, + "learning_rate": 2.518465839318509e-05, + "loss": 0.1707, + "num_input_tokens_seen": 119729280, + "step": 98390 + }, + { + "epoch": 10.958347254705425, + "grad_norm": 0.010550370439887047, + "learning_rate": 2.518222872080473e-05, + "loss": 0.0248, + "num_input_tokens_seen": 119735552, + "step": 98395 + }, + { + "epoch": 10.95890410958904, + "grad_norm": 0.0006574931321665645, + "learning_rate": 2.517979904670307e-05, + "loss": 0.0847, + "num_input_tokens_seen": 119741792, + "step": 98400 + }, + { + "epoch": 10.959460964472658, + "grad_norm": 0.9118292927742004, + "learning_rate": 2.517736937090306e-05, + "loss": 0.0783, + "num_input_tokens_seen": 119747872, + "step": 98405 + }, + { + "epoch": 10.960017819356276, + "grad_norm": 0.053264837712049484, + "learning_rate": 2.5174939693427658e-05, + "loss": 0.0977, + "num_input_tokens_seen": 119754048, + "step": 98410 + }, + { + "epoch": 10.960574674239894, + "grad_norm": 0.02107143960893154, + "learning_rate": 2.517251001429981e-05, + "loss": 0.1123, + "num_input_tokens_seen": 119760192, + "step": 98415 + }, + { + "epoch": 10.961131529123511, + "grad_norm": 0.7409965991973877, + "learning_rate": 2.5170080333542466e-05, + "loss": 0.1024, + "num_input_tokens_seen": 119765920, + "step": 98420 + }, + { + "epoch": 10.961688384007127, + "grad_norm": 0.000170423329109326, + "learning_rate": 2.5167650651178577e-05, + "loss": 0.099, + "num_input_tokens_seen": 119772128, + "step": 98425 + }, + { + "epoch": 10.962245238890745, + "grad_norm": 1.6918081045150757, + "learning_rate": 2.5165220967231102e-05, + "loss": 0.0892, + "num_input_tokens_seen": 119778080, + "step": 98430 + }, + { + "epoch": 10.962802093774362, + "grad_norm": 0.007321554236114025, + "learning_rate": 2.516279128172297e-05, + "loss": 0.025, + "num_input_tokens_seen": 119784096, + "step": 98435 + }, + { + "epoch": 10.96335894865798, + "grad_norm": 0.11054661870002747, + "learning_rate": 2.5160361594677147e-05, + "loss": 0.0486, + "num_input_tokens_seen": 119790368, + "step": 98440 + }, + { + "epoch": 10.963915803541598, + "grad_norm": 0.25400155782699585, + "learning_rate": 2.5157931906116582e-05, + "loss": 0.0329, + "num_input_tokens_seen": 119796448, + "step": 98445 + }, + { + "epoch": 10.964472658425214, + "grad_norm": 0.1772599071264267, + "learning_rate": 2.5155502216064226e-05, + "loss": 0.0368, + "num_input_tokens_seen": 119802304, + "step": 98450 + }, + { + "epoch": 10.965029513308831, + "grad_norm": 0.018870187923312187, + "learning_rate": 2.5153072524543027e-05, + "loss": 0.0063, + "num_input_tokens_seen": 119808544, + "step": 98455 + }, + { + "epoch": 10.965586368192449, + "grad_norm": 1.363335132598877, + "learning_rate": 2.515064283157593e-05, + "loss": 0.0747, + "num_input_tokens_seen": 119814496, + "step": 98460 + }, + { + "epoch": 10.966143223076067, + "grad_norm": 2.1105926036834717, + "learning_rate": 2.514821313718589e-05, + "loss": 0.1025, + "num_input_tokens_seen": 119820928, + "step": 98465 + }, + { + "epoch": 10.966700077959684, + "grad_norm": 3.0879735946655273, + "learning_rate": 2.5145783441395864e-05, + "loss": 0.094, + "num_input_tokens_seen": 119827072, + "step": 98470 + }, + { + "epoch": 10.9672569328433, + "grad_norm": 0.029935136437416077, + "learning_rate": 2.5143353744228794e-05, + "loss": 0.0401, + "num_input_tokens_seen": 119832992, + "step": 98475 + }, + { + "epoch": 10.967813787726918, + "grad_norm": 0.3009065091609955, + "learning_rate": 2.5140924045707637e-05, + "loss": 0.0286, + "num_input_tokens_seen": 119839232, + "step": 98480 + }, + { + "epoch": 10.968370642610536, + "grad_norm": 0.017031073570251465, + "learning_rate": 2.5138494345855333e-05, + "loss": 0.0257, + "num_input_tokens_seen": 119845504, + "step": 98485 + }, + { + "epoch": 10.968927497494153, + "grad_norm": 0.2843519449234009, + "learning_rate": 2.5136064644694845e-05, + "loss": 0.0255, + "num_input_tokens_seen": 119851936, + "step": 98490 + }, + { + "epoch": 10.969484352377771, + "grad_norm": 0.557978093624115, + "learning_rate": 2.5133634942249113e-05, + "loss": 0.0868, + "num_input_tokens_seen": 119857344, + "step": 98495 + }, + { + "epoch": 10.970041207261387, + "grad_norm": 0.08588715642690659, + "learning_rate": 2.5131205238541088e-05, + "loss": 0.008, + "num_input_tokens_seen": 119863648, + "step": 98500 + }, + { + "epoch": 10.970598062145005, + "grad_norm": 0.28541266918182373, + "learning_rate": 2.5128775533593735e-05, + "loss": 0.0592, + "num_input_tokens_seen": 119869696, + "step": 98505 + }, + { + "epoch": 10.971154917028622, + "grad_norm": 0.35661232471466064, + "learning_rate": 2.5126345827429986e-05, + "loss": 0.0057, + "num_input_tokens_seen": 119875936, + "step": 98510 + }, + { + "epoch": 10.97171177191224, + "grad_norm": 0.3602151572704315, + "learning_rate": 2.5123916120072804e-05, + "loss": 0.1309, + "num_input_tokens_seen": 119882176, + "step": 98515 + }, + { + "epoch": 10.972268626795858, + "grad_norm": 0.6385986804962158, + "learning_rate": 2.512148641154513e-05, + "loss": 0.0404, + "num_input_tokens_seen": 119888384, + "step": 98520 + }, + { + "epoch": 10.972825481679475, + "grad_norm": 0.00011312174319755286, + "learning_rate": 2.5119056701869926e-05, + "loss": 0.0224, + "num_input_tokens_seen": 119894368, + "step": 98525 + }, + { + "epoch": 10.973382336563091, + "grad_norm": 0.9017953276634216, + "learning_rate": 2.511662699107013e-05, + "loss": 0.1203, + "num_input_tokens_seen": 119900416, + "step": 98530 + }, + { + "epoch": 10.973939191446709, + "grad_norm": 1.005901575088501, + "learning_rate": 2.51141972791687e-05, + "loss": 0.0478, + "num_input_tokens_seen": 119906336, + "step": 98535 + }, + { + "epoch": 10.974496046330326, + "grad_norm": 0.0006908898358233273, + "learning_rate": 2.5111767566188588e-05, + "loss": 0.0183, + "num_input_tokens_seen": 119912416, + "step": 98540 + }, + { + "epoch": 10.975052901213944, + "grad_norm": 0.00019412592519074678, + "learning_rate": 2.5109337852152738e-05, + "loss": 0.0498, + "num_input_tokens_seen": 119918496, + "step": 98545 + }, + { + "epoch": 10.975609756097562, + "grad_norm": 0.41029971837997437, + "learning_rate": 2.510690813708411e-05, + "loss": 0.0133, + "num_input_tokens_seen": 119924480, + "step": 98550 + }, + { + "epoch": 10.976166610981178, + "grad_norm": 0.046738944947719574, + "learning_rate": 2.5104478421005644e-05, + "loss": 0.06, + "num_input_tokens_seen": 119930368, + "step": 98555 + }, + { + "epoch": 10.976723465864795, + "grad_norm": 1.2631109952926636, + "learning_rate": 2.5102048703940296e-05, + "loss": 0.0474, + "num_input_tokens_seen": 119936704, + "step": 98560 + }, + { + "epoch": 10.977280320748413, + "grad_norm": 0.4065188765525818, + "learning_rate": 2.5099618985911028e-05, + "loss": 0.0414, + "num_input_tokens_seen": 119942880, + "step": 98565 + }, + { + "epoch": 10.97783717563203, + "grad_norm": 0.14293353259563446, + "learning_rate": 2.5097189266940764e-05, + "loss": 0.0079, + "num_input_tokens_seen": 119948928, + "step": 98570 + }, + { + "epoch": 10.978394030515648, + "grad_norm": 0.41989755630493164, + "learning_rate": 2.5094759547052475e-05, + "loss": 0.0174, + "num_input_tokens_seen": 119954880, + "step": 98575 + }, + { + "epoch": 10.978950885399264, + "grad_norm": 0.0952019914984703, + "learning_rate": 2.5092329826269106e-05, + "loss": 0.0483, + "num_input_tokens_seen": 119960768, + "step": 98580 + }, + { + "epoch": 10.979507740282882, + "grad_norm": 0.4119338393211365, + "learning_rate": 2.5089900104613602e-05, + "loss": 0.028, + "num_input_tokens_seen": 119966848, + "step": 98585 + }, + { + "epoch": 10.9800645951665, + "grad_norm": 0.004766514524817467, + "learning_rate": 2.508747038210893e-05, + "loss": 0.0757, + "num_input_tokens_seen": 119973056, + "step": 98590 + }, + { + "epoch": 10.980621450050117, + "grad_norm": 0.003295455826446414, + "learning_rate": 2.508504065877802e-05, + "loss": 0.0299, + "num_input_tokens_seen": 119979296, + "step": 98595 + }, + { + "epoch": 10.981178304933735, + "grad_norm": 0.012522734701633453, + "learning_rate": 2.5082610934643842e-05, + "loss": 0.0522, + "num_input_tokens_seen": 119985184, + "step": 98600 + }, + { + "epoch": 10.981735159817351, + "grad_norm": 0.16680270433425903, + "learning_rate": 2.508018120972933e-05, + "loss": 0.0509, + "num_input_tokens_seen": 119991264, + "step": 98605 + }, + { + "epoch": 10.982292014700969, + "grad_norm": 0.5221645832061768, + "learning_rate": 2.5077751484057444e-05, + "loss": 0.127, + "num_input_tokens_seen": 119997312, + "step": 98610 + }, + { + "epoch": 10.982848869584586, + "grad_norm": 0.028141189366579056, + "learning_rate": 2.507532175765113e-05, + "loss": 0.1535, + "num_input_tokens_seen": 120003040, + "step": 98615 + }, + { + "epoch": 10.983405724468204, + "grad_norm": 0.09653521329164505, + "learning_rate": 2.5072892030533346e-05, + "loss": 0.0148, + "num_input_tokens_seen": 120008640, + "step": 98620 + }, + { + "epoch": 10.983962579351822, + "grad_norm": 0.00029897617059759796, + "learning_rate": 2.5070462302727044e-05, + "loss": 0.0107, + "num_input_tokens_seen": 120014752, + "step": 98625 + }, + { + "epoch": 10.984519434235438, + "grad_norm": 1.0829881429672241, + "learning_rate": 2.5068032574255157e-05, + "loss": 0.1632, + "num_input_tokens_seen": 120021056, + "step": 98630 + }, + { + "epoch": 10.985076289119055, + "grad_norm": 1.4456084966659546, + "learning_rate": 2.5065602845140657e-05, + "loss": 0.0268, + "num_input_tokens_seen": 120027168, + "step": 98635 + }, + { + "epoch": 10.985633144002673, + "grad_norm": 1.0346380472183228, + "learning_rate": 2.506317311540648e-05, + "loss": 0.0863, + "num_input_tokens_seen": 120032480, + "step": 98640 + }, + { + "epoch": 10.98618999888629, + "grad_norm": 0.0032333582639694214, + "learning_rate": 2.5060743385075587e-05, + "loss": 0.0017, + "num_input_tokens_seen": 120038432, + "step": 98645 + }, + { + "epoch": 10.986746853769908, + "grad_norm": 0.045760627835989, + "learning_rate": 2.5058313654170924e-05, + "loss": 0.0017, + "num_input_tokens_seen": 120044352, + "step": 98650 + }, + { + "epoch": 10.987303708653524, + "grad_norm": 0.025588661432266235, + "learning_rate": 2.5055883922715435e-05, + "loss": 0.0153, + "num_input_tokens_seen": 120050496, + "step": 98655 + }, + { + "epoch": 10.987860563537142, + "grad_norm": 0.0281450767070055, + "learning_rate": 2.5053454190732085e-05, + "loss": 0.015, + "num_input_tokens_seen": 120056992, + "step": 98660 + }, + { + "epoch": 10.98841741842076, + "grad_norm": 0.032102636992931366, + "learning_rate": 2.5051024458243815e-05, + "loss": 0.0352, + "num_input_tokens_seen": 120063200, + "step": 98665 + }, + { + "epoch": 10.988974273304377, + "grad_norm": 0.0005515867960639298, + "learning_rate": 2.5048594725273577e-05, + "loss": 0.0028, + "num_input_tokens_seen": 120069472, + "step": 98670 + }, + { + "epoch": 10.989531128187995, + "grad_norm": 0.010305032134056091, + "learning_rate": 2.5046164991844325e-05, + "loss": 0.0085, + "num_input_tokens_seen": 120075808, + "step": 98675 + }, + { + "epoch": 10.99008798307161, + "grad_norm": 0.0008709813700988889, + "learning_rate": 2.5043735257979e-05, + "loss": 0.0433, + "num_input_tokens_seen": 120082240, + "step": 98680 + }, + { + "epoch": 10.990644837955228, + "grad_norm": 0.30861252546310425, + "learning_rate": 2.504130552370057e-05, + "loss": 0.017, + "num_input_tokens_seen": 120088352, + "step": 98685 + }, + { + "epoch": 10.991201692838846, + "grad_norm": 0.004370034206658602, + "learning_rate": 2.5038875789031973e-05, + "loss": 0.0278, + "num_input_tokens_seen": 120094368, + "step": 98690 + }, + { + "epoch": 10.991758547722464, + "grad_norm": 0.0038212710060179234, + "learning_rate": 2.5036446053996164e-05, + "loss": 0.0024, + "num_input_tokens_seen": 120100736, + "step": 98695 + }, + { + "epoch": 10.992315402606081, + "grad_norm": 0.04571393504738808, + "learning_rate": 2.5034016318616093e-05, + "loss": 0.0105, + "num_input_tokens_seen": 120106464, + "step": 98700 + }, + { + "epoch": 10.992872257489697, + "grad_norm": 1.0844162702560425, + "learning_rate": 2.5031586582914713e-05, + "loss": 0.1112, + "num_input_tokens_seen": 120112512, + "step": 98705 + }, + { + "epoch": 10.993429112373315, + "grad_norm": 0.9959402680397034, + "learning_rate": 2.5029156846914963e-05, + "loss": 0.0618, + "num_input_tokens_seen": 120118432, + "step": 98710 + }, + { + "epoch": 10.993985967256933, + "grad_norm": 0.004170754458755255, + "learning_rate": 2.502672711063981e-05, + "loss": 0.0617, + "num_input_tokens_seen": 120124864, + "step": 98715 + }, + { + "epoch": 10.99454282214055, + "grad_norm": 0.021099617704749107, + "learning_rate": 2.5024297374112198e-05, + "loss": 0.0005, + "num_input_tokens_seen": 120131264, + "step": 98720 + }, + { + "epoch": 10.995099677024168, + "grad_norm": 0.263783723115921, + "learning_rate": 2.5021867637355072e-05, + "loss": 0.0158, + "num_input_tokens_seen": 120137248, + "step": 98725 + }, + { + "epoch": 10.995656531907784, + "grad_norm": 0.893272876739502, + "learning_rate": 2.5019437900391402e-05, + "loss": 0.0827, + "num_input_tokens_seen": 120143232, + "step": 98730 + }, + { + "epoch": 10.996213386791402, + "grad_norm": 0.0003280343662481755, + "learning_rate": 2.501700816324411e-05, + "loss": 0.0036, + "num_input_tokens_seen": 120149408, + "step": 98735 + }, + { + "epoch": 10.99677024167502, + "grad_norm": 2.8158180713653564, + "learning_rate": 2.5014578425936168e-05, + "loss": 0.1502, + "num_input_tokens_seen": 120155808, + "step": 98740 + }, + { + "epoch": 10.997327096558637, + "grad_norm": 0.47493645548820496, + "learning_rate": 2.5012148688490526e-05, + "loss": 0.128, + "num_input_tokens_seen": 120161728, + "step": 98745 + }, + { + "epoch": 10.997883951442255, + "grad_norm": 0.0013069455744698644, + "learning_rate": 2.5009718950930122e-05, + "loss": 0.0103, + "num_input_tokens_seen": 120167968, + "step": 98750 + }, + { + "epoch": 10.998440806325872, + "grad_norm": 0.19126097857952118, + "learning_rate": 2.5007289213277923e-05, + "loss": 0.0031, + "num_input_tokens_seen": 120174208, + "step": 98755 + }, + { + "epoch": 10.998997661209488, + "grad_norm": 8.162520680343732e-05, + "learning_rate": 2.500485947555687e-05, + "loss": 0.0688, + "num_input_tokens_seen": 120180512, + "step": 98760 + }, + { + "epoch": 10.999554516093106, + "grad_norm": 0.2592199742794037, + "learning_rate": 2.500242973778991e-05, + "loss": 0.0054, + "num_input_tokens_seen": 120186688, + "step": 98765 + }, + { + "epoch": 11.0, + "eval_loss": 0.07719666510820389, + "eval_runtime": 112.2422, + "eval_samples_per_second": 35.557, + "eval_steps_per_second": 8.891, + "num_input_tokens_seen": 120190896, + "step": 98769 + }, + { + "epoch": 11.000111370976724, + "grad_norm": 1.8187260627746582, + "learning_rate": 2.5e-05, + "loss": 0.08, + "num_input_tokens_seen": 120192208, + "step": 98770 + }, + { + "epoch": 11.000668225860341, + "grad_norm": 0.06272393465042114, + "learning_rate": 2.4997570262210098e-05, + "loss": 0.01, + "num_input_tokens_seen": 120198704, + "step": 98775 + }, + { + "epoch": 11.001225080743959, + "grad_norm": 0.07064955681562424, + "learning_rate": 2.499514052444314e-05, + "loss": 0.0723, + "num_input_tokens_seen": 120204496, + "step": 98780 + }, + { + "epoch": 11.001781935627575, + "grad_norm": 0.03474074974656105, + "learning_rate": 2.4992710786722087e-05, + "loss": 0.0259, + "num_input_tokens_seen": 120210352, + "step": 98785 + }, + { + "epoch": 11.002338790511192, + "grad_norm": 0.48681125044822693, + "learning_rate": 2.499028104906988e-05, + "loss": 0.0651, + "num_input_tokens_seen": 120215824, + "step": 98790 + }, + { + "epoch": 11.00289564539481, + "grad_norm": 0.0019495906308293343, + "learning_rate": 2.4987851311509483e-05, + "loss": 0.0314, + "num_input_tokens_seen": 120221264, + "step": 98795 + }, + { + "epoch": 11.003452500278428, + "grad_norm": 0.5230045318603516, + "learning_rate": 2.4985421574063834e-05, + "loss": 0.0644, + "num_input_tokens_seen": 120227536, + "step": 98800 + }, + { + "epoch": 11.004009355162045, + "grad_norm": 0.0013974389294162393, + "learning_rate": 2.4982991836755896e-05, + "loss": 0.0059, + "num_input_tokens_seen": 120233360, + "step": 98805 + }, + { + "epoch": 11.004566210045661, + "grad_norm": 0.009010469540953636, + "learning_rate": 2.4980562099608607e-05, + "loss": 0.0546, + "num_input_tokens_seen": 120239696, + "step": 98810 + }, + { + "epoch": 11.005123064929279, + "grad_norm": 0.5129752159118652, + "learning_rate": 2.4978132362644933e-05, + "loss": 0.0299, + "num_input_tokens_seen": 120246096, + "step": 98815 + }, + { + "epoch": 11.005679919812897, + "grad_norm": 0.626710832118988, + "learning_rate": 2.4975702625887808e-05, + "loss": 0.0534, + "num_input_tokens_seen": 120252176, + "step": 98820 + }, + { + "epoch": 11.006236774696514, + "grad_norm": 0.0007554434705525637, + "learning_rate": 2.4973272889360197e-05, + "loss": 0.0492, + "num_input_tokens_seen": 120258320, + "step": 98825 + }, + { + "epoch": 11.006793629580132, + "grad_norm": 0.007651817053556442, + "learning_rate": 2.497084315308504e-05, + "loss": 0.0279, + "num_input_tokens_seen": 120264464, + "step": 98830 + }, + { + "epoch": 11.007350484463748, + "grad_norm": 0.7878795862197876, + "learning_rate": 2.49684134170853e-05, + "loss": 0.0479, + "num_input_tokens_seen": 120270320, + "step": 98835 + }, + { + "epoch": 11.007907339347366, + "grad_norm": 0.0701993778347969, + "learning_rate": 2.4965983681383916e-05, + "loss": 0.0414, + "num_input_tokens_seen": 120276240, + "step": 98840 + }, + { + "epoch": 11.008464194230983, + "grad_norm": 1.5925616025924683, + "learning_rate": 2.4963553946003845e-05, + "loss": 0.1367, + "num_input_tokens_seen": 120282384, + "step": 98845 + }, + { + "epoch": 11.009021049114601, + "grad_norm": 0.0022012335248291492, + "learning_rate": 2.4961124210968033e-05, + "loss": 0.0515, + "num_input_tokens_seen": 120288560, + "step": 98850 + }, + { + "epoch": 11.009577903998219, + "grad_norm": 0.2812863886356354, + "learning_rate": 2.495869447629944e-05, + "loss": 0.0542, + "num_input_tokens_seen": 120294480, + "step": 98855 + }, + { + "epoch": 11.010134758881835, + "grad_norm": 0.3255709409713745, + "learning_rate": 2.4956264742021003e-05, + "loss": 0.068, + "num_input_tokens_seen": 120300624, + "step": 98860 + }, + { + "epoch": 11.010691613765452, + "grad_norm": 0.013975389301776886, + "learning_rate": 2.4953835008155688e-05, + "loss": 0.0152, + "num_input_tokens_seen": 120306544, + "step": 98865 + }, + { + "epoch": 11.01124846864907, + "grad_norm": 4.203700542449951, + "learning_rate": 2.4951405274726426e-05, + "loss": 0.0336, + "num_input_tokens_seen": 120312560, + "step": 98870 + }, + { + "epoch": 11.011805323532688, + "grad_norm": 0.2549431622028351, + "learning_rate": 2.4948975541756198e-05, + "loss": 0.01, + "num_input_tokens_seen": 120319024, + "step": 98875 + }, + { + "epoch": 11.012362178416305, + "grad_norm": 0.09323319047689438, + "learning_rate": 2.4946545809267918e-05, + "loss": 0.0731, + "num_input_tokens_seen": 120325200, + "step": 98880 + }, + { + "epoch": 11.012919033299921, + "grad_norm": 0.47590088844299316, + "learning_rate": 2.4944116077284568e-05, + "loss": 0.0277, + "num_input_tokens_seen": 120331440, + "step": 98885 + }, + { + "epoch": 11.013475888183539, + "grad_norm": 0.10020612180233002, + "learning_rate": 2.4941686345829082e-05, + "loss": 0.0157, + "num_input_tokens_seen": 120337712, + "step": 98890 + }, + { + "epoch": 11.014032743067157, + "grad_norm": 0.07804874330759048, + "learning_rate": 2.493925661492442e-05, + "loss": 0.0067, + "num_input_tokens_seen": 120344016, + "step": 98895 + }, + { + "epoch": 11.014589597950774, + "grad_norm": 0.0982111394405365, + "learning_rate": 2.493682688459352e-05, + "loss": 0.0388, + "num_input_tokens_seen": 120350128, + "step": 98900 + }, + { + "epoch": 11.015146452834392, + "grad_norm": 0.8452007174491882, + "learning_rate": 2.4934397154859352e-05, + "loss": 0.0719, + "num_input_tokens_seen": 120356272, + "step": 98905 + }, + { + "epoch": 11.01570330771801, + "grad_norm": 1.0722897052764893, + "learning_rate": 2.4931967425744845e-05, + "loss": 0.0105, + "num_input_tokens_seen": 120362544, + "step": 98910 + }, + { + "epoch": 11.016260162601625, + "grad_norm": 1.019049048423767, + "learning_rate": 2.492953769727297e-05, + "loss": 0.0554, + "num_input_tokens_seen": 120368208, + "step": 98915 + }, + { + "epoch": 11.016817017485243, + "grad_norm": 0.006837291177362204, + "learning_rate": 2.492710796946666e-05, + "loss": 0.0138, + "num_input_tokens_seen": 120374416, + "step": 98920 + }, + { + "epoch": 11.01737387236886, + "grad_norm": 0.00030159472953528166, + "learning_rate": 2.4924678242348874e-05, + "loss": 0.1158, + "num_input_tokens_seen": 120380272, + "step": 98925 + }, + { + "epoch": 11.017930727252478, + "grad_norm": 0.12559020519256592, + "learning_rate": 2.4922248515942565e-05, + "loss": 0.0554, + "num_input_tokens_seen": 120386352, + "step": 98930 + }, + { + "epoch": 11.018487582136096, + "grad_norm": 0.005657464265823364, + "learning_rate": 2.491981879027068e-05, + "loss": 0.0286, + "num_input_tokens_seen": 120391920, + "step": 98935 + }, + { + "epoch": 11.019044437019712, + "grad_norm": 0.0396563746035099, + "learning_rate": 2.4917389065356164e-05, + "loss": 0.0121, + "num_input_tokens_seen": 120397712, + "step": 98940 + }, + { + "epoch": 11.01960129190333, + "grad_norm": 0.22857888042926788, + "learning_rate": 2.491495934122199e-05, + "loss": 0.0599, + "num_input_tokens_seen": 120403920, + "step": 98945 + }, + { + "epoch": 11.020158146786947, + "grad_norm": 2.0613439083099365, + "learning_rate": 2.491252961789108e-05, + "loss": 0.057, + "num_input_tokens_seen": 120410256, + "step": 98950 + }, + { + "epoch": 11.020715001670565, + "grad_norm": 0.02204776369035244, + "learning_rate": 2.4910099895386404e-05, + "loss": 0.0698, + "num_input_tokens_seen": 120416464, + "step": 98955 + }, + { + "epoch": 11.021271856554183, + "grad_norm": 1.2825102806091309, + "learning_rate": 2.4907670173730903e-05, + "loss": 0.0534, + "num_input_tokens_seen": 120422768, + "step": 98960 + }, + { + "epoch": 11.021828711437799, + "grad_norm": 2.7913818359375, + "learning_rate": 2.4905240452947534e-05, + "loss": 0.2206, + "num_input_tokens_seen": 120428848, + "step": 98965 + }, + { + "epoch": 11.022385566321416, + "grad_norm": 1.2891305685043335, + "learning_rate": 2.4902810733059242e-05, + "loss": 0.0885, + "num_input_tokens_seen": 120434832, + "step": 98970 + }, + { + "epoch": 11.022942421205034, + "grad_norm": 1.1088104248046875, + "learning_rate": 2.4900381014088985e-05, + "loss": 0.0118, + "num_input_tokens_seen": 120441136, + "step": 98975 + }, + { + "epoch": 11.023499276088652, + "grad_norm": 1.4520457983016968, + "learning_rate": 2.4897951296059703e-05, + "loss": 0.0951, + "num_input_tokens_seen": 120447120, + "step": 98980 + }, + { + "epoch": 11.02405613097227, + "grad_norm": 0.000484983145724982, + "learning_rate": 2.489552157899436e-05, + "loss": 0.0353, + "num_input_tokens_seen": 120452912, + "step": 98985 + }, + { + "epoch": 11.024612985855885, + "grad_norm": 0.008061041124165058, + "learning_rate": 2.4893091862915892e-05, + "loss": 0.1168, + "num_input_tokens_seen": 120458864, + "step": 98990 + }, + { + "epoch": 11.025169840739503, + "grad_norm": 0.08283761888742447, + "learning_rate": 2.4890662147847265e-05, + "loss": 0.0256, + "num_input_tokens_seen": 120464976, + "step": 98995 + }, + { + "epoch": 11.02572669562312, + "grad_norm": 0.36605140566825867, + "learning_rate": 2.4888232433811408e-05, + "loss": 0.0037, + "num_input_tokens_seen": 120471184, + "step": 99000 + }, + { + "epoch": 11.026283550506738, + "grad_norm": 1.7516924142837524, + "learning_rate": 2.4885802720831306e-05, + "loss": 0.1874, + "num_input_tokens_seen": 120476880, + "step": 99005 + }, + { + "epoch": 11.026840405390356, + "grad_norm": 0.00025968949194066226, + "learning_rate": 2.488337300892987e-05, + "loss": 0.0165, + "num_input_tokens_seen": 120483056, + "step": 99010 + }, + { + "epoch": 11.027397260273972, + "grad_norm": 1.9015823602676392, + "learning_rate": 2.4880943298130077e-05, + "loss": 0.187, + "num_input_tokens_seen": 120488496, + "step": 99015 + }, + { + "epoch": 11.02795411515759, + "grad_norm": 0.43280714750289917, + "learning_rate": 2.4878513588454867e-05, + "loss": 0.0965, + "num_input_tokens_seen": 120494512, + "step": 99020 + }, + { + "epoch": 11.028510970041207, + "grad_norm": 2.0219318866729736, + "learning_rate": 2.48760838799272e-05, + "loss": 0.0567, + "num_input_tokens_seen": 120500496, + "step": 99025 + }, + { + "epoch": 11.029067824924825, + "grad_norm": 0.6781768202781677, + "learning_rate": 2.4873654172570016e-05, + "loss": 0.1388, + "num_input_tokens_seen": 120506640, + "step": 99030 + }, + { + "epoch": 11.029624679808443, + "grad_norm": 0.4859011173248291, + "learning_rate": 2.487122446640627e-05, + "loss": 0.0123, + "num_input_tokens_seen": 120512624, + "step": 99035 + }, + { + "epoch": 11.030181534692058, + "grad_norm": 0.0005448451265692711, + "learning_rate": 2.486879476145891e-05, + "loss": 0.1097, + "num_input_tokens_seen": 120518672, + "step": 99040 + }, + { + "epoch": 11.030738389575676, + "grad_norm": 0.37559694051742554, + "learning_rate": 2.4866365057750893e-05, + "loss": 0.0251, + "num_input_tokens_seen": 120524752, + "step": 99045 + }, + { + "epoch": 11.031295244459294, + "grad_norm": 0.03454533591866493, + "learning_rate": 2.486393535530516e-05, + "loss": 0.0199, + "num_input_tokens_seen": 120530352, + "step": 99050 + }, + { + "epoch": 11.031852099342911, + "grad_norm": 0.06361760944128036, + "learning_rate": 2.4861505654144673e-05, + "loss": 0.0175, + "num_input_tokens_seen": 120536592, + "step": 99055 + }, + { + "epoch": 11.03240895422653, + "grad_norm": 0.024119781330227852, + "learning_rate": 2.4859075954292362e-05, + "loss": 0.01, + "num_input_tokens_seen": 120542800, + "step": 99060 + }, + { + "epoch": 11.032965809110145, + "grad_norm": 1.3744914531707764, + "learning_rate": 2.485664625577121e-05, + "loss": 0.1059, + "num_input_tokens_seen": 120548784, + "step": 99065 + }, + { + "epoch": 11.033522663993763, + "grad_norm": 0.0004128130676690489, + "learning_rate": 2.4854216558604135e-05, + "loss": 0.0067, + "num_input_tokens_seen": 120554864, + "step": 99070 + }, + { + "epoch": 11.03407951887738, + "grad_norm": 0.11913326382637024, + "learning_rate": 2.4851786862814116e-05, + "loss": 0.004, + "num_input_tokens_seen": 120561296, + "step": 99075 + }, + { + "epoch": 11.034636373760998, + "grad_norm": 0.0918884426355362, + "learning_rate": 2.4849357168424074e-05, + "loss": 0.0686, + "num_input_tokens_seen": 120567216, + "step": 99080 + }, + { + "epoch": 11.035193228644616, + "grad_norm": 0.02543373592197895, + "learning_rate": 2.484692747545698e-05, + "loss": 0.0029, + "num_input_tokens_seen": 120573392, + "step": 99085 + }, + { + "epoch": 11.035750083528233, + "grad_norm": 0.6197919249534607, + "learning_rate": 2.4844497783935777e-05, + "loss": 0.0163, + "num_input_tokens_seen": 120579760, + "step": 99090 + }, + { + "epoch": 11.03630693841185, + "grad_norm": 0.3676705062389374, + "learning_rate": 2.484206809388342e-05, + "loss": 0.0057, + "num_input_tokens_seen": 120586256, + "step": 99095 + }, + { + "epoch": 11.036863793295467, + "grad_norm": 0.0002280346816405654, + "learning_rate": 2.4839638405322856e-05, + "loss": 0.0523, + "num_input_tokens_seen": 120592048, + "step": 99100 + }, + { + "epoch": 11.037420648179085, + "grad_norm": 0.01276334747672081, + "learning_rate": 2.4837208718277036e-05, + "loss": 0.0265, + "num_input_tokens_seen": 120598416, + "step": 99105 + }, + { + "epoch": 11.037977503062702, + "grad_norm": 0.2700284719467163, + "learning_rate": 2.4834779032768907e-05, + "loss": 0.0177, + "num_input_tokens_seen": 120604528, + "step": 99110 + }, + { + "epoch": 11.03853435794632, + "grad_norm": 0.0860571414232254, + "learning_rate": 2.4832349348821426e-05, + "loss": 0.0032, + "num_input_tokens_seen": 120610704, + "step": 99115 + }, + { + "epoch": 11.039091212829936, + "grad_norm": 0.002725540194660425, + "learning_rate": 2.4829919666457537e-05, + "loss": 0.0152, + "num_input_tokens_seen": 120616656, + "step": 99120 + }, + { + "epoch": 11.039648067713554, + "grad_norm": 0.015765776857733727, + "learning_rate": 2.48274899857002e-05, + "loss": 0.0072, + "num_input_tokens_seen": 120622704, + "step": 99125 + }, + { + "epoch": 11.040204922597171, + "grad_norm": 0.0063989548943936825, + "learning_rate": 2.482506030657234e-05, + "loss": 0.0622, + "num_input_tokens_seen": 120628912, + "step": 99130 + }, + { + "epoch": 11.040761777480789, + "grad_norm": 0.05714604631066322, + "learning_rate": 2.4822630629096947e-05, + "loss": 0.1298, + "num_input_tokens_seen": 120634960, + "step": 99135 + }, + { + "epoch": 11.041318632364407, + "grad_norm": 1.1488358974456787, + "learning_rate": 2.4820200953296934e-05, + "loss": 0.056, + "num_input_tokens_seen": 120641040, + "step": 99140 + }, + { + "epoch": 11.041875487248022, + "grad_norm": 0.3309987783432007, + "learning_rate": 2.4817771279195275e-05, + "loss": 0.0706, + "num_input_tokens_seen": 120647312, + "step": 99145 + }, + { + "epoch": 11.04243234213164, + "grad_norm": 0.17971962690353394, + "learning_rate": 2.481534160681491e-05, + "loss": 0.0274, + "num_input_tokens_seen": 120653200, + "step": 99150 + }, + { + "epoch": 11.042989197015258, + "grad_norm": 0.0004145845305174589, + "learning_rate": 2.4812911936178795e-05, + "loss": 0.0016, + "num_input_tokens_seen": 120659504, + "step": 99155 + }, + { + "epoch": 11.043546051898876, + "grad_norm": 0.2953217923641205, + "learning_rate": 2.4810482267309873e-05, + "loss": 0.0408, + "num_input_tokens_seen": 120665776, + "step": 99160 + }, + { + "epoch": 11.044102906782493, + "grad_norm": 0.3988269567489624, + "learning_rate": 2.4808052600231103e-05, + "loss": 0.009, + "num_input_tokens_seen": 120671824, + "step": 99165 + }, + { + "epoch": 11.044659761666109, + "grad_norm": 0.6911607384681702, + "learning_rate": 2.480562293496542e-05, + "loss": 0.1551, + "num_input_tokens_seen": 120677200, + "step": 99170 + }, + { + "epoch": 11.045216616549727, + "grad_norm": 0.27114441990852356, + "learning_rate": 2.4803193271535796e-05, + "loss": 0.0058, + "num_input_tokens_seen": 120683280, + "step": 99175 + }, + { + "epoch": 11.045773471433344, + "grad_norm": 1.0318140983581543, + "learning_rate": 2.480076360996516e-05, + "loss": 0.1046, + "num_input_tokens_seen": 120689328, + "step": 99180 + }, + { + "epoch": 11.046330326316962, + "grad_norm": 0.011390595696866512, + "learning_rate": 2.4798333950276478e-05, + "loss": 0.078, + "num_input_tokens_seen": 120695600, + "step": 99185 + }, + { + "epoch": 11.04688718120058, + "grad_norm": 1.2335848808288574, + "learning_rate": 2.4795904292492693e-05, + "loss": 0.0977, + "num_input_tokens_seen": 120701648, + "step": 99190 + }, + { + "epoch": 11.047444036084196, + "grad_norm": 0.060238491743803024, + "learning_rate": 2.4793474636636747e-05, + "loss": 0.0117, + "num_input_tokens_seen": 120707792, + "step": 99195 + }, + { + "epoch": 11.048000890967813, + "grad_norm": 0.0960557758808136, + "learning_rate": 2.479104498273161e-05, + "loss": 0.0414, + "num_input_tokens_seen": 120713616, + "step": 99200 + }, + { + "epoch": 11.048557745851431, + "grad_norm": 0.9444639086723328, + "learning_rate": 2.4788615330800213e-05, + "loss": 0.0955, + "num_input_tokens_seen": 120719792, + "step": 99205 + }, + { + "epoch": 11.049114600735049, + "grad_norm": 0.17429114878177643, + "learning_rate": 2.4786185680865516e-05, + "loss": 0.0071, + "num_input_tokens_seen": 120725520, + "step": 99210 + }, + { + "epoch": 11.049671455618666, + "grad_norm": 0.9607074856758118, + "learning_rate": 2.478375603295046e-05, + "loss": 0.0629, + "num_input_tokens_seen": 120731152, + "step": 99215 + }, + { + "epoch": 11.050228310502282, + "grad_norm": 0.08085176348686218, + "learning_rate": 2.4781326387078015e-05, + "loss": 0.0234, + "num_input_tokens_seen": 120736688, + "step": 99220 + }, + { + "epoch": 11.0507851653859, + "grad_norm": 0.29731449484825134, + "learning_rate": 2.4778896743271103e-05, + "loss": 0.0705, + "num_input_tokens_seen": 120742160, + "step": 99225 + }, + { + "epoch": 11.051342020269518, + "grad_norm": 0.010755597613751888, + "learning_rate": 2.4776467101552696e-05, + "loss": 0.1098, + "num_input_tokens_seen": 120747952, + "step": 99230 + }, + { + "epoch": 11.051898875153135, + "grad_norm": 0.03375055640935898, + "learning_rate": 2.4774037461945733e-05, + "loss": 0.0419, + "num_input_tokens_seen": 120753968, + "step": 99235 + }, + { + "epoch": 11.052455730036753, + "grad_norm": 0.16449128091335297, + "learning_rate": 2.477160782447317e-05, + "loss": 0.0065, + "num_input_tokens_seen": 120759984, + "step": 99240 + }, + { + "epoch": 11.05301258492037, + "grad_norm": 2.0850186347961426, + "learning_rate": 2.4769178189157954e-05, + "loss": 0.2208, + "num_input_tokens_seen": 120765296, + "step": 99245 + }, + { + "epoch": 11.053569439803987, + "grad_norm": 0.0804702416062355, + "learning_rate": 2.476674855602304e-05, + "loss": 0.0986, + "num_input_tokens_seen": 120771440, + "step": 99250 + }, + { + "epoch": 11.054126294687604, + "grad_norm": 0.07393618673086166, + "learning_rate": 2.4764318925091365e-05, + "loss": 0.0048, + "num_input_tokens_seen": 120777680, + "step": 99255 + }, + { + "epoch": 11.054683149571222, + "grad_norm": 0.020276987925171852, + "learning_rate": 2.476188929638589e-05, + "loss": 0.0513, + "num_input_tokens_seen": 120783856, + "step": 99260 + }, + { + "epoch": 11.05524000445484, + "grad_norm": 0.02398832142353058, + "learning_rate": 2.475945966992956e-05, + "loss": 0.0577, + "num_input_tokens_seen": 120789968, + "step": 99265 + }, + { + "epoch": 11.055796859338457, + "grad_norm": 0.008858718909323215, + "learning_rate": 2.4757030045745328e-05, + "loss": 0.0047, + "num_input_tokens_seen": 120796080, + "step": 99270 + }, + { + "epoch": 11.056353714222073, + "grad_norm": 0.04252585396170616, + "learning_rate": 2.4754600423856132e-05, + "loss": 0.0111, + "num_input_tokens_seen": 120802096, + "step": 99275 + }, + { + "epoch": 11.05691056910569, + "grad_norm": 3.855823040008545, + "learning_rate": 2.475217080428495e-05, + "loss": 0.0289, + "num_input_tokens_seen": 120808368, + "step": 99280 + }, + { + "epoch": 11.057467423989308, + "grad_norm": 0.06560006737709045, + "learning_rate": 2.4749741187054694e-05, + "loss": 0.0048, + "num_input_tokens_seen": 120814576, + "step": 99285 + }, + { + "epoch": 11.058024278872926, + "grad_norm": 0.03540223091840744, + "learning_rate": 2.4747311572188343e-05, + "loss": 0.1295, + "num_input_tokens_seen": 120820624, + "step": 99290 + }, + { + "epoch": 11.058581133756544, + "grad_norm": 0.26976993680000305, + "learning_rate": 2.4744881959708836e-05, + "loss": 0.0157, + "num_input_tokens_seen": 120826704, + "step": 99295 + }, + { + "epoch": 11.05913798864016, + "grad_norm": 0.2010473608970642, + "learning_rate": 2.4742452349639125e-05, + "loss": 0.0213, + "num_input_tokens_seen": 120832880, + "step": 99300 + }, + { + "epoch": 11.059694843523777, + "grad_norm": 0.3390643000602722, + "learning_rate": 2.4740022742002157e-05, + "loss": 0.0241, + "num_input_tokens_seen": 120839088, + "step": 99305 + }, + { + "epoch": 11.060251698407395, + "grad_norm": 2.147517442703247, + "learning_rate": 2.4737593136820882e-05, + "loss": 0.1538, + "num_input_tokens_seen": 120845296, + "step": 99310 + }, + { + "epoch": 11.060808553291013, + "grad_norm": 0.6763241291046143, + "learning_rate": 2.4735163534118247e-05, + "loss": 0.02, + "num_input_tokens_seen": 120851504, + "step": 99315 + }, + { + "epoch": 11.06136540817463, + "grad_norm": 0.23690453171730042, + "learning_rate": 2.473273393391721e-05, + "loss": 0.0066, + "num_input_tokens_seen": 120857808, + "step": 99320 + }, + { + "epoch": 11.061922263058246, + "grad_norm": 0.7117297649383545, + "learning_rate": 2.4730304336240713e-05, + "loss": 0.0147, + "num_input_tokens_seen": 120863696, + "step": 99325 + }, + { + "epoch": 11.062479117941864, + "grad_norm": 3.0109550952911377, + "learning_rate": 2.4727874741111707e-05, + "loss": 0.1389, + "num_input_tokens_seen": 120869968, + "step": 99330 + }, + { + "epoch": 11.063035972825482, + "grad_norm": 0.015566007234156132, + "learning_rate": 2.4725445148553135e-05, + "loss": 0.0144, + "num_input_tokens_seen": 120875984, + "step": 99335 + }, + { + "epoch": 11.0635928277091, + "grad_norm": 0.01972339302301407, + "learning_rate": 2.472301555858797e-05, + "loss": 0.0244, + "num_input_tokens_seen": 120881904, + "step": 99340 + }, + { + "epoch": 11.064149682592717, + "grad_norm": 0.03840675577521324, + "learning_rate": 2.472058597123913e-05, + "loss": 0.0117, + "num_input_tokens_seen": 120888048, + "step": 99345 + }, + { + "epoch": 11.064706537476333, + "grad_norm": 0.2566988170146942, + "learning_rate": 2.4718156386529594e-05, + "loss": 0.0127, + "num_input_tokens_seen": 120894160, + "step": 99350 + }, + { + "epoch": 11.06526339235995, + "grad_norm": 0.029448501765727997, + "learning_rate": 2.4715726804482277e-05, + "loss": 0.0715, + "num_input_tokens_seen": 120900112, + "step": 99355 + }, + { + "epoch": 11.065820247243568, + "grad_norm": 0.1886919140815735, + "learning_rate": 2.4713297225120162e-05, + "loss": 0.0655, + "num_input_tokens_seen": 120906288, + "step": 99360 + }, + { + "epoch": 11.066377102127186, + "grad_norm": 1.334151029586792, + "learning_rate": 2.471086764846618e-05, + "loss": 0.0494, + "num_input_tokens_seen": 120912240, + "step": 99365 + }, + { + "epoch": 11.066933957010804, + "grad_norm": 0.014836227521300316, + "learning_rate": 2.470843807454329e-05, + "loss": 0.0402, + "num_input_tokens_seen": 120918384, + "step": 99370 + }, + { + "epoch": 11.06749081189442, + "grad_norm": 0.11141348630189896, + "learning_rate": 2.4706008503374427e-05, + "loss": 0.0469, + "num_input_tokens_seen": 120924720, + "step": 99375 + }, + { + "epoch": 11.068047666778037, + "grad_norm": 1.020043134689331, + "learning_rate": 2.470357893498256e-05, + "loss": 0.1441, + "num_input_tokens_seen": 120930864, + "step": 99380 + }, + { + "epoch": 11.068604521661655, + "grad_norm": 0.410139262676239, + "learning_rate": 2.4701149369390618e-05, + "loss": 0.0394, + "num_input_tokens_seen": 120937072, + "step": 99385 + }, + { + "epoch": 11.069161376545273, + "grad_norm": 0.1595572680234909, + "learning_rate": 2.4698719806621564e-05, + "loss": 0.0242, + "num_input_tokens_seen": 120943376, + "step": 99390 + }, + { + "epoch": 11.06971823142889, + "grad_norm": 0.5686439275741577, + "learning_rate": 2.469629024669834e-05, + "loss": 0.0421, + "num_input_tokens_seen": 120949584, + "step": 99395 + }, + { + "epoch": 11.070275086312506, + "grad_norm": 1.1120704412460327, + "learning_rate": 2.4693860689643903e-05, + "loss": 0.0148, + "num_input_tokens_seen": 120955792, + "step": 99400 + }, + { + "epoch": 11.070831941196124, + "grad_norm": 0.03484424203634262, + "learning_rate": 2.4691431135481185e-05, + "loss": 0.0054, + "num_input_tokens_seen": 120961968, + "step": 99405 + }, + { + "epoch": 11.071388796079741, + "grad_norm": 0.0689607560634613, + "learning_rate": 2.468900158423317e-05, + "loss": 0.0199, + "num_input_tokens_seen": 120967888, + "step": 99410 + }, + { + "epoch": 11.07194565096336, + "grad_norm": 1.4026877880096436, + "learning_rate": 2.4686572035922757e-05, + "loss": 0.0643, + "num_input_tokens_seen": 120974096, + "step": 99415 + }, + { + "epoch": 11.072502505846977, + "grad_norm": 0.0166842732578516, + "learning_rate": 2.468414249057294e-05, + "loss": 0.0593, + "num_input_tokens_seen": 120980240, + "step": 99420 + }, + { + "epoch": 11.073059360730593, + "grad_norm": 0.4327719807624817, + "learning_rate": 2.4681712948206646e-05, + "loss": 0.0774, + "num_input_tokens_seen": 120986096, + "step": 99425 + }, + { + "epoch": 11.07361621561421, + "grad_norm": 2.6970388889312744, + "learning_rate": 2.4679283408846828e-05, + "loss": 0.0524, + "num_input_tokens_seen": 120992112, + "step": 99430 + }, + { + "epoch": 11.074173070497828, + "grad_norm": 0.010500160977244377, + "learning_rate": 2.4676853872516434e-05, + "loss": 0.0088, + "num_input_tokens_seen": 120998096, + "step": 99435 + }, + { + "epoch": 11.074729925381446, + "grad_norm": 1.3548624515533447, + "learning_rate": 2.467442433923842e-05, + "loss": 0.0463, + "num_input_tokens_seen": 121004304, + "step": 99440 + }, + { + "epoch": 11.075286780265063, + "grad_norm": 0.022694507613778114, + "learning_rate": 2.467199480903572e-05, + "loss": 0.0139, + "num_input_tokens_seen": 121010352, + "step": 99445 + }, + { + "epoch": 11.075843635148681, + "grad_norm": 0.07509572803974152, + "learning_rate": 2.46695652819313e-05, + "loss": 0.047, + "num_input_tokens_seen": 121016560, + "step": 99450 + }, + { + "epoch": 11.076400490032297, + "grad_norm": 0.0005438403459265828, + "learning_rate": 2.4667135757948092e-05, + "loss": 0.0393, + "num_input_tokens_seen": 121022864, + "step": 99455 + }, + { + "epoch": 11.076957344915915, + "grad_norm": 0.061118338257074356, + "learning_rate": 2.4664706237109063e-05, + "loss": 0.0401, + "num_input_tokens_seen": 121028752, + "step": 99460 + }, + { + "epoch": 11.077514199799532, + "grad_norm": 1.5382558107376099, + "learning_rate": 2.4662276719437138e-05, + "loss": 0.0475, + "num_input_tokens_seen": 121034800, + "step": 99465 + }, + { + "epoch": 11.07807105468315, + "grad_norm": 0.21776600182056427, + "learning_rate": 2.46598472049553e-05, + "loss": 0.0162, + "num_input_tokens_seen": 121040880, + "step": 99470 + }, + { + "epoch": 11.078627909566768, + "grad_norm": 0.2932703495025635, + "learning_rate": 2.4657417693686457e-05, + "loss": 0.0528, + "num_input_tokens_seen": 121047344, + "step": 99475 + }, + { + "epoch": 11.079184764450384, + "grad_norm": 0.2604064345359802, + "learning_rate": 2.46549881856536e-05, + "loss": 0.0102, + "num_input_tokens_seen": 121053392, + "step": 99480 + }, + { + "epoch": 11.079741619334001, + "grad_norm": 1.3209915161132812, + "learning_rate": 2.4652558680879635e-05, + "loss": 0.1482, + "num_input_tokens_seen": 121059440, + "step": 99485 + }, + { + "epoch": 11.080298474217619, + "grad_norm": 0.0106278657913208, + "learning_rate": 2.4650129179387544e-05, + "loss": 0.0426, + "num_input_tokens_seen": 121065360, + "step": 99490 + }, + { + "epoch": 11.080855329101237, + "grad_norm": 2.2038776874542236, + "learning_rate": 2.464769968120026e-05, + "loss": 0.0848, + "num_input_tokens_seen": 121071632, + "step": 99495 + }, + { + "epoch": 11.081412183984854, + "grad_norm": 0.004942012019455433, + "learning_rate": 2.4645270186340737e-05, + "loss": 0.0457, + "num_input_tokens_seen": 121077936, + "step": 99500 + }, + { + "epoch": 11.08196903886847, + "grad_norm": 0.005835318472236395, + "learning_rate": 2.4642840694831918e-05, + "loss": 0.0115, + "num_input_tokens_seen": 121083376, + "step": 99505 + }, + { + "epoch": 11.082525893752088, + "grad_norm": 0.36206600069999695, + "learning_rate": 2.4640411206696756e-05, + "loss": 0.0881, + "num_input_tokens_seen": 121089552, + "step": 99510 + }, + { + "epoch": 11.083082748635706, + "grad_norm": 1.008495807647705, + "learning_rate": 2.4637981721958197e-05, + "loss": 0.1636, + "num_input_tokens_seen": 121095664, + "step": 99515 + }, + { + "epoch": 11.083639603519323, + "grad_norm": 0.03295610845088959, + "learning_rate": 2.4635552240639194e-05, + "loss": 0.0278, + "num_input_tokens_seen": 121101552, + "step": 99520 + }, + { + "epoch": 11.08419645840294, + "grad_norm": 0.001120587345212698, + "learning_rate": 2.463312276276269e-05, + "loss": 0.1245, + "num_input_tokens_seen": 121107248, + "step": 99525 + }, + { + "epoch": 11.084753313286557, + "grad_norm": 0.013497733511030674, + "learning_rate": 2.463069328835164e-05, + "loss": 0.168, + "num_input_tokens_seen": 121112560, + "step": 99530 + }, + { + "epoch": 11.085310168170174, + "grad_norm": 0.8670488595962524, + "learning_rate": 2.4628263817428974e-05, + "loss": 0.0589, + "num_input_tokens_seen": 121118000, + "step": 99535 + }, + { + "epoch": 11.085867023053792, + "grad_norm": 0.06542603671550751, + "learning_rate": 2.462583435001767e-05, + "loss": 0.0153, + "num_input_tokens_seen": 121124496, + "step": 99540 + }, + { + "epoch": 11.08642387793741, + "grad_norm": 0.010061989538371563, + "learning_rate": 2.4623404886140648e-05, + "loss": 0.0019, + "num_input_tokens_seen": 121130640, + "step": 99545 + }, + { + "epoch": 11.086980732821027, + "grad_norm": 0.009379151277244091, + "learning_rate": 2.4620975425820876e-05, + "loss": 0.032, + "num_input_tokens_seen": 121136944, + "step": 99550 + }, + { + "epoch": 11.087537587704643, + "grad_norm": 1.1715649366378784, + "learning_rate": 2.4618545969081292e-05, + "loss": 0.0715, + "num_input_tokens_seen": 121142768, + "step": 99555 + }, + { + "epoch": 11.088094442588261, + "grad_norm": 0.1437823325395584, + "learning_rate": 2.461611651594485e-05, + "loss": 0.0642, + "num_input_tokens_seen": 121148752, + "step": 99560 + }, + { + "epoch": 11.088651297471879, + "grad_norm": 0.10274306684732437, + "learning_rate": 2.461368706643449e-05, + "loss": 0.0809, + "num_input_tokens_seen": 121154960, + "step": 99565 + }, + { + "epoch": 11.089208152355496, + "grad_norm": 0.006323967128992081, + "learning_rate": 2.461125762057317e-05, + "loss": 0.0097, + "num_input_tokens_seen": 121161232, + "step": 99570 + }, + { + "epoch": 11.089765007239114, + "grad_norm": 0.04284318536520004, + "learning_rate": 2.460882817838383e-05, + "loss": 0.0314, + "num_input_tokens_seen": 121167568, + "step": 99575 + }, + { + "epoch": 11.09032186212273, + "grad_norm": 0.09921100735664368, + "learning_rate": 2.460639873988943e-05, + "loss": 0.0127, + "num_input_tokens_seen": 121173680, + "step": 99580 + }, + { + "epoch": 11.090878717006348, + "grad_norm": 0.15475139021873474, + "learning_rate": 2.4603969305112898e-05, + "loss": 0.003, + "num_input_tokens_seen": 121179600, + "step": 99585 + }, + { + "epoch": 11.091435571889965, + "grad_norm": 0.08756709843873978, + "learning_rate": 2.46015398740772e-05, + "loss": 0.0516, + "num_input_tokens_seen": 121185808, + "step": 99590 + }, + { + "epoch": 11.091992426773583, + "grad_norm": 0.0004495974280871451, + "learning_rate": 2.4599110446805276e-05, + "loss": 0.0067, + "num_input_tokens_seen": 121191952, + "step": 99595 + }, + { + "epoch": 11.0925492816572, + "grad_norm": 0.0018139160238206387, + "learning_rate": 2.4596681023320073e-05, + "loss": 0.0092, + "num_input_tokens_seen": 121198256, + "step": 99600 + }, + { + "epoch": 11.093106136540818, + "grad_norm": 1.1110420227050781, + "learning_rate": 2.4594251603644544e-05, + "loss": 0.0278, + "num_input_tokens_seen": 121204208, + "step": 99605 + }, + { + "epoch": 11.093662991424434, + "grad_norm": 0.09291455149650574, + "learning_rate": 2.459182218780162e-05, + "loss": 0.0324, + "num_input_tokens_seen": 121210352, + "step": 99610 + }, + { + "epoch": 11.094219846308052, + "grad_norm": 0.6647811532020569, + "learning_rate": 2.4589392775814285e-05, + "loss": 0.0696, + "num_input_tokens_seen": 121216208, + "step": 99615 + }, + { + "epoch": 11.09477670119167, + "grad_norm": 0.11949014663696289, + "learning_rate": 2.458696336770544e-05, + "loss": 0.076, + "num_input_tokens_seen": 121222480, + "step": 99620 + }, + { + "epoch": 11.095333556075287, + "grad_norm": 1.5127431154251099, + "learning_rate": 2.4584533963498082e-05, + "loss": 0.0793, + "num_input_tokens_seen": 121228944, + "step": 99625 + }, + { + "epoch": 11.095890410958905, + "grad_norm": 0.005914374254643917, + "learning_rate": 2.4582104563215114e-05, + "loss": 0.0232, + "num_input_tokens_seen": 121235280, + "step": 99630 + }, + { + "epoch": 11.09644726584252, + "grad_norm": 0.05757269635796547, + "learning_rate": 2.4579675166879514e-05, + "loss": 0.0827, + "num_input_tokens_seen": 121241360, + "step": 99635 + }, + { + "epoch": 11.097004120726139, + "grad_norm": 0.8286226391792297, + "learning_rate": 2.457724577451421e-05, + "loss": 0.0245, + "num_input_tokens_seen": 121247536, + "step": 99640 + }, + { + "epoch": 11.097560975609756, + "grad_norm": 1.01827073097229, + "learning_rate": 2.457481638614217e-05, + "loss": 0.043, + "num_input_tokens_seen": 121253328, + "step": 99645 + }, + { + "epoch": 11.098117830493374, + "grad_norm": 0.6516523957252502, + "learning_rate": 2.4572387001786318e-05, + "loss": 0.0645, + "num_input_tokens_seen": 121259824, + "step": 99650 + }, + { + "epoch": 11.098674685376992, + "grad_norm": 0.2954920530319214, + "learning_rate": 2.456995762146962e-05, + "loss": 0.0127, + "num_input_tokens_seen": 121266256, + "step": 99655 + }, + { + "epoch": 11.099231540260607, + "grad_norm": 0.11772800981998444, + "learning_rate": 2.4567528245215016e-05, + "loss": 0.0057, + "num_input_tokens_seen": 121272208, + "step": 99660 + }, + { + "epoch": 11.099788395144225, + "grad_norm": 0.006704270374029875, + "learning_rate": 2.4565098873045455e-05, + "loss": 0.013, + "num_input_tokens_seen": 121278480, + "step": 99665 + }, + { + "epoch": 11.100345250027843, + "grad_norm": 0.04587977007031441, + "learning_rate": 2.4562669504983882e-05, + "loss": 0.0165, + "num_input_tokens_seen": 121284528, + "step": 99670 + }, + { + "epoch": 11.10090210491146, + "grad_norm": 0.12771378457546234, + "learning_rate": 2.4560240141053248e-05, + "loss": 0.0525, + "num_input_tokens_seen": 121290704, + "step": 99675 + }, + { + "epoch": 11.101458959795078, + "grad_norm": 0.03171723708510399, + "learning_rate": 2.455781078127649e-05, + "loss": 0.013, + "num_input_tokens_seen": 121297040, + "step": 99680 + }, + { + "epoch": 11.102015814678694, + "grad_norm": 2.081014633178711, + "learning_rate": 2.4555381425676577e-05, + "loss": 0.1032, + "num_input_tokens_seen": 121303440, + "step": 99685 + }, + { + "epoch": 11.102572669562312, + "grad_norm": 0.856715738773346, + "learning_rate": 2.4552952074276426e-05, + "loss": 0.1709, + "num_input_tokens_seen": 121309872, + "step": 99690 + }, + { + "epoch": 11.10312952444593, + "grad_norm": 0.7200286388397217, + "learning_rate": 2.4550522727099016e-05, + "loss": 0.0242, + "num_input_tokens_seen": 121315888, + "step": 99695 + }, + { + "epoch": 11.103686379329547, + "grad_norm": 0.003211401868611574, + "learning_rate": 2.454809338416727e-05, + "loss": 0.0924, + "num_input_tokens_seen": 121322064, + "step": 99700 + }, + { + "epoch": 11.104243234213165, + "grad_norm": 0.1679759919643402, + "learning_rate": 2.454566404550415e-05, + "loss": 0.0172, + "num_input_tokens_seen": 121328368, + "step": 99705 + }, + { + "epoch": 11.10480008909678, + "grad_norm": 0.00020707173098344356, + "learning_rate": 2.454323471113259e-05, + "loss": 0.018, + "num_input_tokens_seen": 121334800, + "step": 99710 + }, + { + "epoch": 11.105356943980398, + "grad_norm": 0.16768506169319153, + "learning_rate": 2.4540805381075553e-05, + "loss": 0.014, + "num_input_tokens_seen": 121340784, + "step": 99715 + }, + { + "epoch": 11.105913798864016, + "grad_norm": 1.1232976913452148, + "learning_rate": 2.453837605535597e-05, + "loss": 0.0967, + "num_input_tokens_seen": 121346096, + "step": 99720 + }, + { + "epoch": 11.106470653747634, + "grad_norm": 2.7339494228363037, + "learning_rate": 2.4535946733996803e-05, + "loss": 0.0668, + "num_input_tokens_seen": 121352176, + "step": 99725 + }, + { + "epoch": 11.107027508631251, + "grad_norm": 0.6067666411399841, + "learning_rate": 2.4533517417020982e-05, + "loss": 0.0833, + "num_input_tokens_seen": 121358288, + "step": 99730 + }, + { + "epoch": 11.107584363514867, + "grad_norm": 0.18744303286075592, + "learning_rate": 2.4531088104451468e-05, + "loss": 0.0163, + "num_input_tokens_seen": 121364624, + "step": 99735 + }, + { + "epoch": 11.108141218398485, + "grad_norm": 0.002933323848992586, + "learning_rate": 2.4528658796311194e-05, + "loss": 0.0285, + "num_input_tokens_seen": 121370704, + "step": 99740 + }, + { + "epoch": 11.108698073282103, + "grad_norm": 0.3285145163536072, + "learning_rate": 2.4526229492623132e-05, + "loss": 0.0055, + "num_input_tokens_seen": 121376688, + "step": 99745 + }, + { + "epoch": 11.10925492816572, + "grad_norm": 0.0007983316318131983, + "learning_rate": 2.4523800193410193e-05, + "loss": 0.0148, + "num_input_tokens_seen": 121382864, + "step": 99750 + }, + { + "epoch": 11.109811783049338, + "grad_norm": 0.30892911553382874, + "learning_rate": 2.452137089869536e-05, + "loss": 0.0661, + "num_input_tokens_seen": 121388848, + "step": 99755 + }, + { + "epoch": 11.110368637932954, + "grad_norm": 0.5908562541007996, + "learning_rate": 2.4518941608501546e-05, + "loss": 0.0075, + "num_input_tokens_seen": 121395024, + "step": 99760 + }, + { + "epoch": 11.110925492816571, + "grad_norm": 0.004384924191981554, + "learning_rate": 2.4516512322851725e-05, + "loss": 0.0521, + "num_input_tokens_seen": 121401008, + "step": 99765 + }, + { + "epoch": 11.11148234770019, + "grad_norm": 1.765030860900879, + "learning_rate": 2.4514083041768828e-05, + "loss": 0.0409, + "num_input_tokens_seen": 121407056, + "step": 99770 + }, + { + "epoch": 11.112039202583807, + "grad_norm": 1.1070873737335205, + "learning_rate": 2.4511653765275812e-05, + "loss": 0.0372, + "num_input_tokens_seen": 121413136, + "step": 99775 + }, + { + "epoch": 11.112596057467425, + "grad_norm": 0.4835853576660156, + "learning_rate": 2.450922449339561e-05, + "loss": 0.0105, + "num_input_tokens_seen": 121419408, + "step": 99780 + }, + { + "epoch": 11.113152912351042, + "grad_norm": 0.008759795688092709, + "learning_rate": 2.4506795226151184e-05, + "loss": 0.1138, + "num_input_tokens_seen": 121424848, + "step": 99785 + }, + { + "epoch": 11.113709767234658, + "grad_norm": 0.013486090116202831, + "learning_rate": 2.4504365963565463e-05, + "loss": 0.0921, + "num_input_tokens_seen": 121430864, + "step": 99790 + }, + { + "epoch": 11.114266622118276, + "grad_norm": 0.6311041116714478, + "learning_rate": 2.450193670566141e-05, + "loss": 0.0547, + "num_input_tokens_seen": 121436528, + "step": 99795 + }, + { + "epoch": 11.114823477001893, + "grad_norm": 0.026728292927145958, + "learning_rate": 2.4499507452461955e-05, + "loss": 0.0448, + "num_input_tokens_seen": 121442448, + "step": 99800 + }, + { + "epoch": 11.115380331885511, + "grad_norm": 0.41806966066360474, + "learning_rate": 2.4497078203990063e-05, + "loss": 0.0693, + "num_input_tokens_seen": 121448336, + "step": 99805 + }, + { + "epoch": 11.115937186769129, + "grad_norm": 0.10135446488857269, + "learning_rate": 2.449464896026866e-05, + "loss": 0.0094, + "num_input_tokens_seen": 121454608, + "step": 99810 + }, + { + "epoch": 11.116494041652745, + "grad_norm": 0.04462791606783867, + "learning_rate": 2.4492219721320716e-05, + "loss": 0.0484, + "num_input_tokens_seen": 121460656, + "step": 99815 + }, + { + "epoch": 11.117050896536362, + "grad_norm": 0.2292323261499405, + "learning_rate": 2.448979048716915e-05, + "loss": 0.0119, + "num_input_tokens_seen": 121466320, + "step": 99820 + }, + { + "epoch": 11.11760775141998, + "grad_norm": 0.44730493426322937, + "learning_rate": 2.448736125783693e-05, + "loss": 0.0393, + "num_input_tokens_seen": 121472176, + "step": 99825 + }, + { + "epoch": 11.118164606303598, + "grad_norm": 0.012456884607672691, + "learning_rate": 2.448493203334699e-05, + "loss": 0.0057, + "num_input_tokens_seen": 121478128, + "step": 99830 + }, + { + "epoch": 11.118721461187215, + "grad_norm": 0.05159616470336914, + "learning_rate": 2.448250281372228e-05, + "loss": 0.0396, + "num_input_tokens_seen": 121484400, + "step": 99835 + }, + { + "epoch": 11.119278316070831, + "grad_norm": 0.9396634101867676, + "learning_rate": 2.4480073598985745e-05, + "loss": 0.0781, + "num_input_tokens_seen": 121490544, + "step": 99840 + }, + { + "epoch": 11.119835170954449, + "grad_norm": 0.016017908230423927, + "learning_rate": 2.4477644389160337e-05, + "loss": 0.0014, + "num_input_tokens_seen": 121496592, + "step": 99845 + }, + { + "epoch": 11.120392025838067, + "grad_norm": 0.4408218562602997, + "learning_rate": 2.447521518426899e-05, + "loss": 0.0146, + "num_input_tokens_seen": 121502992, + "step": 99850 + }, + { + "epoch": 11.120948880721684, + "grad_norm": 0.0001337159628747031, + "learning_rate": 2.447278598433466e-05, + "loss": 0.0348, + "num_input_tokens_seen": 121509072, + "step": 99855 + }, + { + "epoch": 11.121505735605302, + "grad_norm": 0.73162841796875, + "learning_rate": 2.447035678938028e-05, + "loss": 0.0484, + "num_input_tokens_seen": 121515056, + "step": 99860 + }, + { + "epoch": 11.122062590488918, + "grad_norm": 0.44158488512039185, + "learning_rate": 2.446792759942882e-05, + "loss": 0.0661, + "num_input_tokens_seen": 121520944, + "step": 99865 + }, + { + "epoch": 11.122619445372536, + "grad_norm": 0.06956983357667923, + "learning_rate": 2.4465498414503192e-05, + "loss": 0.0687, + "num_input_tokens_seen": 121526992, + "step": 99870 + }, + { + "epoch": 11.123176300256153, + "grad_norm": 0.0833234190940857, + "learning_rate": 2.446306923462638e-05, + "loss": 0.0563, + "num_input_tokens_seen": 121532976, + "step": 99875 + }, + { + "epoch": 11.123733155139771, + "grad_norm": 0.2434380054473877, + "learning_rate": 2.446064005982129e-05, + "loss": 0.0355, + "num_input_tokens_seen": 121539120, + "step": 99880 + }, + { + "epoch": 11.124290010023389, + "grad_norm": 0.14014169573783875, + "learning_rate": 2.44582108901109e-05, + "loss": 0.1431, + "num_input_tokens_seen": 121544528, + "step": 99885 + }, + { + "epoch": 11.124846864907004, + "grad_norm": 0.20722965896129608, + "learning_rate": 2.445578172551813e-05, + "loss": 0.0558, + "num_input_tokens_seen": 121550640, + "step": 99890 + }, + { + "epoch": 11.125403719790622, + "grad_norm": 0.762259840965271, + "learning_rate": 2.445335256606595e-05, + "loss": 0.1113, + "num_input_tokens_seen": 121557040, + "step": 99895 + }, + { + "epoch": 11.12596057467424, + "grad_norm": 0.3995569944381714, + "learning_rate": 2.4450923411777284e-05, + "loss": 0.0204, + "num_input_tokens_seen": 121563120, + "step": 99900 + }, + { + "epoch": 11.126517429557857, + "grad_norm": 0.050649575889110565, + "learning_rate": 2.4448494262675097e-05, + "loss": 0.0023, + "num_input_tokens_seen": 121569232, + "step": 99905 + }, + { + "epoch": 11.127074284441475, + "grad_norm": 0.05978497117757797, + "learning_rate": 2.444606511878231e-05, + "loss": 0.011, + "num_input_tokens_seen": 121575344, + "step": 99910 + }, + { + "epoch": 11.127631139325091, + "grad_norm": 0.017370987683534622, + "learning_rate": 2.444363598012189e-05, + "loss": 0.0248, + "num_input_tokens_seen": 121581392, + "step": 99915 + }, + { + "epoch": 11.128187994208709, + "grad_norm": 1.705443024635315, + "learning_rate": 2.4441206846716776e-05, + "loss": 0.0759, + "num_input_tokens_seen": 121587536, + "step": 99920 + }, + { + "epoch": 11.128744849092326, + "grad_norm": 0.37809669971466064, + "learning_rate": 2.443877771858991e-05, + "loss": 0.0542, + "num_input_tokens_seen": 121593616, + "step": 99925 + }, + { + "epoch": 11.129301703975944, + "grad_norm": 0.04615643247961998, + "learning_rate": 2.443634859576423e-05, + "loss": 0.0132, + "num_input_tokens_seen": 121599664, + "step": 99930 + }, + { + "epoch": 11.129858558859562, + "grad_norm": 0.32415571808815, + "learning_rate": 2.44339194782627e-05, + "loss": 0.0692, + "num_input_tokens_seen": 121605744, + "step": 99935 + }, + { + "epoch": 11.130415413743178, + "grad_norm": 0.0044191195629537106, + "learning_rate": 2.443149036610824e-05, + "loss": 0.0202, + "num_input_tokens_seen": 121611824, + "step": 99940 + }, + { + "epoch": 11.130972268626795, + "grad_norm": 1.0040233135223389, + "learning_rate": 2.4429061259323826e-05, + "loss": 0.0731, + "num_input_tokens_seen": 121617904, + "step": 99945 + }, + { + "epoch": 11.131529123510413, + "grad_norm": 0.016156231984496117, + "learning_rate": 2.4426632157932368e-05, + "loss": 0.0338, + "num_input_tokens_seen": 121624112, + "step": 99950 + }, + { + "epoch": 11.13208597839403, + "grad_norm": 0.05983706936240196, + "learning_rate": 2.4424203061956842e-05, + "loss": 0.0023, + "num_input_tokens_seen": 121630256, + "step": 99955 + }, + { + "epoch": 11.132642833277648, + "grad_norm": 0.1442147195339203, + "learning_rate": 2.4421773971420173e-05, + "loss": 0.1605, + "num_input_tokens_seen": 121636304, + "step": 99960 + }, + { + "epoch": 11.133199688161266, + "grad_norm": 0.44301941990852356, + "learning_rate": 2.4419344886345317e-05, + "loss": 0.0708, + "num_input_tokens_seen": 121642224, + "step": 99965 + }, + { + "epoch": 11.133756543044882, + "grad_norm": 0.02101808413863182, + "learning_rate": 2.441691580675521e-05, + "loss": 0.0209, + "num_input_tokens_seen": 121648080, + "step": 99970 + }, + { + "epoch": 11.1343133979285, + "grad_norm": 0.018537264317274094, + "learning_rate": 2.4414486732672806e-05, + "loss": 0.126, + "num_input_tokens_seen": 121654352, + "step": 99975 + }, + { + "epoch": 11.134870252812117, + "grad_norm": 0.008715570904314518, + "learning_rate": 2.441205766412104e-05, + "loss": 0.0062, + "num_input_tokens_seen": 121660368, + "step": 99980 + }, + { + "epoch": 11.135427107695735, + "grad_norm": 0.007003776729106903, + "learning_rate": 2.440962860112286e-05, + "loss": 0.0154, + "num_input_tokens_seen": 121666224, + "step": 99985 + }, + { + "epoch": 11.135983962579353, + "grad_norm": 1.951279640197754, + "learning_rate": 2.440719954370121e-05, + "loss": 0.2461, + "num_input_tokens_seen": 121672208, + "step": 99990 + }, + { + "epoch": 11.136540817462969, + "grad_norm": 0.0007238421821966767, + "learning_rate": 2.440477049187904e-05, + "loss": 0.0384, + "num_input_tokens_seen": 121678608, + "step": 99995 + }, + { + "epoch": 11.137097672346586, + "grad_norm": 0.16136768460273743, + "learning_rate": 2.4402341445679274e-05, + "loss": 0.01, + "num_input_tokens_seen": 121684048, + "step": 100000 + }, + { + "epoch": 11.137654527230204, + "grad_norm": 0.000613973883446306, + "learning_rate": 2.4399912405124894e-05, + "loss": 0.0186, + "num_input_tokens_seen": 121689936, + "step": 100005 + }, + { + "epoch": 11.138211382113822, + "grad_norm": 0.7990198731422424, + "learning_rate": 2.4397483370238818e-05, + "loss": 0.0573, + "num_input_tokens_seen": 121696272, + "step": 100010 + }, + { + "epoch": 11.13876823699744, + "grad_norm": 0.19065141677856445, + "learning_rate": 2.4395054341043976e-05, + "loss": 0.0087, + "num_input_tokens_seen": 121702608, + "step": 100015 + }, + { + "epoch": 11.139325091881055, + "grad_norm": 0.15597829222679138, + "learning_rate": 2.4392625317563354e-05, + "loss": 0.0879, + "num_input_tokens_seen": 121708528, + "step": 100020 + }, + { + "epoch": 11.139881946764673, + "grad_norm": 0.002738324459642172, + "learning_rate": 2.439019629981985e-05, + "loss": 0.0085, + "num_input_tokens_seen": 121714576, + "step": 100025 + }, + { + "epoch": 11.14043880164829, + "grad_norm": 0.028941093012690544, + "learning_rate": 2.4387767287836453e-05, + "loss": 0.0052, + "num_input_tokens_seen": 121721008, + "step": 100030 + }, + { + "epoch": 11.140995656531908, + "grad_norm": 0.9582549929618835, + "learning_rate": 2.4385338281636065e-05, + "loss": 0.076, + "num_input_tokens_seen": 121727184, + "step": 100035 + }, + { + "epoch": 11.141552511415526, + "grad_norm": 0.10843884199857712, + "learning_rate": 2.438290928124166e-05, + "loss": 0.1517, + "num_input_tokens_seen": 121733328, + "step": 100040 + }, + { + "epoch": 11.142109366299142, + "grad_norm": 0.0006005737814120948, + "learning_rate": 2.4380480286676167e-05, + "loss": 0.0439, + "num_input_tokens_seen": 121739664, + "step": 100045 + }, + { + "epoch": 11.14266622118276, + "grad_norm": 0.043719708919525146, + "learning_rate": 2.4378051297962537e-05, + "loss": 0.0076, + "num_input_tokens_seen": 121745840, + "step": 100050 + }, + { + "epoch": 11.143223076066377, + "grad_norm": 0.7541078329086304, + "learning_rate": 2.4375622315123708e-05, + "loss": 0.0473, + "num_input_tokens_seen": 121751824, + "step": 100055 + }, + { + "epoch": 11.143779930949995, + "grad_norm": 0.6041896939277649, + "learning_rate": 2.437319333818263e-05, + "loss": 0.1037, + "num_input_tokens_seen": 121757168, + "step": 100060 + }, + { + "epoch": 11.144336785833612, + "grad_norm": 0.357850044965744, + "learning_rate": 2.4370764367162242e-05, + "loss": 0.1893, + "num_input_tokens_seen": 121763312, + "step": 100065 + }, + { + "epoch": 11.144893640717228, + "grad_norm": 0.2903341054916382, + "learning_rate": 2.4368335402085487e-05, + "loss": 0.0945, + "num_input_tokens_seen": 121768752, + "step": 100070 + }, + { + "epoch": 11.145450495600846, + "grad_norm": 1.8656092882156372, + "learning_rate": 2.436590644297531e-05, + "loss": 0.0947, + "num_input_tokens_seen": 121774992, + "step": 100075 + }, + { + "epoch": 11.146007350484464, + "grad_norm": 0.4107675552368164, + "learning_rate": 2.436347748985466e-05, + "loss": 0.026, + "num_input_tokens_seen": 121781040, + "step": 100080 + }, + { + "epoch": 11.146564205368081, + "grad_norm": 0.09450262039899826, + "learning_rate": 2.436104854274646e-05, + "loss": 0.0159, + "num_input_tokens_seen": 121786992, + "step": 100085 + }, + { + "epoch": 11.147121060251699, + "grad_norm": 0.08315917104482651, + "learning_rate": 2.4358619601673692e-05, + "loss": 0.0045, + "num_input_tokens_seen": 121793296, + "step": 100090 + }, + { + "epoch": 11.147677915135315, + "grad_norm": 0.12280625104904175, + "learning_rate": 2.4356190666659255e-05, + "loss": 0.0351, + "num_input_tokens_seen": 121799408, + "step": 100095 + }, + { + "epoch": 11.148234770018933, + "grad_norm": 0.00038909356226213276, + "learning_rate": 2.435376173772612e-05, + "loss": 0.0069, + "num_input_tokens_seen": 121805392, + "step": 100100 + }, + { + "epoch": 11.14879162490255, + "grad_norm": 0.5949743986129761, + "learning_rate": 2.4351332814897225e-05, + "loss": 0.1294, + "num_input_tokens_seen": 121811440, + "step": 100105 + }, + { + "epoch": 11.149348479786168, + "grad_norm": 1.0502269268035889, + "learning_rate": 2.434890389819551e-05, + "loss": 0.1237, + "num_input_tokens_seen": 121817680, + "step": 100110 + }, + { + "epoch": 11.149905334669786, + "grad_norm": 0.001877032918855548, + "learning_rate": 2.434647498764392e-05, + "loss": 0.0322, + "num_input_tokens_seen": 121823280, + "step": 100115 + }, + { + "epoch": 11.150462189553402, + "grad_norm": 0.017209570854902267, + "learning_rate": 2.4344046083265397e-05, + "loss": 0.0335, + "num_input_tokens_seen": 121828880, + "step": 100120 + }, + { + "epoch": 11.15101904443702, + "grad_norm": 1.0036537647247314, + "learning_rate": 2.4341617185082886e-05, + "loss": 0.1278, + "num_input_tokens_seen": 121834896, + "step": 100125 + }, + { + "epoch": 11.151575899320637, + "grad_norm": 0.04180973395705223, + "learning_rate": 2.433918829311933e-05, + "loss": 0.0067, + "num_input_tokens_seen": 121841072, + "step": 100130 + }, + { + "epoch": 11.152132754204255, + "grad_norm": 1.2822304964065552, + "learning_rate": 2.4336759407397662e-05, + "loss": 0.0876, + "num_input_tokens_seen": 121847120, + "step": 100135 + }, + { + "epoch": 11.152689609087872, + "grad_norm": 0.017077656462788582, + "learning_rate": 2.433433052794084e-05, + "loss": 0.1551, + "num_input_tokens_seen": 121853328, + "step": 100140 + }, + { + "epoch": 11.15324646397149, + "grad_norm": 0.06702197343111038, + "learning_rate": 2.433190165477179e-05, + "loss": 0.0045, + "num_input_tokens_seen": 121859504, + "step": 100145 + }, + { + "epoch": 11.153803318855106, + "grad_norm": 0.045647770166397095, + "learning_rate": 2.4329472787913478e-05, + "loss": 0.0304, + "num_input_tokens_seen": 121865296, + "step": 100150 + }, + { + "epoch": 11.154360173738723, + "grad_norm": 0.015142988413572311, + "learning_rate": 2.432704392738882e-05, + "loss": 0.003, + "num_input_tokens_seen": 121871472, + "step": 100155 + }, + { + "epoch": 11.154917028622341, + "grad_norm": 0.5859172344207764, + "learning_rate": 2.4324615073220782e-05, + "loss": 0.0294, + "num_input_tokens_seen": 121877520, + "step": 100160 + }, + { + "epoch": 11.155473883505959, + "grad_norm": 0.08796034008264542, + "learning_rate": 2.4322186225432283e-05, + "loss": 0.0436, + "num_input_tokens_seen": 121883216, + "step": 100165 + }, + { + "epoch": 11.156030738389576, + "grad_norm": 0.0001514389005023986, + "learning_rate": 2.431975738404629e-05, + "loss": 0.0454, + "num_input_tokens_seen": 121889200, + "step": 100170 + }, + { + "epoch": 11.156587593273192, + "grad_norm": 0.0012634897138923407, + "learning_rate": 2.431732854908573e-05, + "loss": 0.0843, + "num_input_tokens_seen": 121895152, + "step": 100175 + }, + { + "epoch": 11.15714444815681, + "grad_norm": 0.516943633556366, + "learning_rate": 2.4314899720573548e-05, + "loss": 0.0329, + "num_input_tokens_seen": 121901328, + "step": 100180 + }, + { + "epoch": 11.157701303040428, + "grad_norm": 0.20327426493167877, + "learning_rate": 2.4312470898532685e-05, + "loss": 0.0379, + "num_input_tokens_seen": 121907376, + "step": 100185 + }, + { + "epoch": 11.158258157924045, + "grad_norm": 0.8672677278518677, + "learning_rate": 2.431004208298609e-05, + "loss": 0.0866, + "num_input_tokens_seen": 121913424, + "step": 100190 + }, + { + "epoch": 11.158815012807663, + "grad_norm": 0.3358146846294403, + "learning_rate": 2.4307613273956696e-05, + "loss": 0.0493, + "num_input_tokens_seen": 121919632, + "step": 100195 + }, + { + "epoch": 11.159371867691279, + "grad_norm": 1.1967426538467407, + "learning_rate": 2.430518447146745e-05, + "loss": 0.0502, + "num_input_tokens_seen": 121925776, + "step": 100200 + }, + { + "epoch": 11.159928722574897, + "grad_norm": 1.130764365196228, + "learning_rate": 2.4302755675541295e-05, + "loss": 0.0973, + "num_input_tokens_seen": 121931504, + "step": 100205 + }, + { + "epoch": 11.160485577458514, + "grad_norm": 0.021538948640227318, + "learning_rate": 2.4300326886201173e-05, + "loss": 0.1401, + "num_input_tokens_seen": 121937456, + "step": 100210 + }, + { + "epoch": 11.161042432342132, + "grad_norm": 0.008424771949648857, + "learning_rate": 2.4297898103470012e-05, + "loss": 0.0097, + "num_input_tokens_seen": 121943472, + "step": 100215 + }, + { + "epoch": 11.16159928722575, + "grad_norm": 0.0706184133887291, + "learning_rate": 2.4295469327370787e-05, + "loss": 0.0016, + "num_input_tokens_seen": 121949680, + "step": 100220 + }, + { + "epoch": 11.162156142109366, + "grad_norm": 0.01500631868839264, + "learning_rate": 2.42930405579264e-05, + "loss": 0.0168, + "num_input_tokens_seen": 121956336, + "step": 100225 + }, + { + "epoch": 11.162712996992983, + "grad_norm": 0.03184417635202408, + "learning_rate": 2.429061179515982e-05, + "loss": 0.0127, + "num_input_tokens_seen": 121962672, + "step": 100230 + }, + { + "epoch": 11.163269851876601, + "grad_norm": 0.22960756719112396, + "learning_rate": 2.4288183039093975e-05, + "loss": 0.0177, + "num_input_tokens_seen": 121969200, + "step": 100235 + }, + { + "epoch": 11.163826706760219, + "grad_norm": 0.5406017899513245, + "learning_rate": 2.428575428975182e-05, + "loss": 0.1127, + "num_input_tokens_seen": 121975312, + "step": 100240 + }, + { + "epoch": 11.164383561643836, + "grad_norm": 0.014307468198239803, + "learning_rate": 2.428332554715628e-05, + "loss": 0.0019, + "num_input_tokens_seen": 121981424, + "step": 100245 + }, + { + "epoch": 11.164940416527452, + "grad_norm": 0.0003410236386116594, + "learning_rate": 2.428089681133031e-05, + "loss": 0.0101, + "num_input_tokens_seen": 121987568, + "step": 100250 + }, + { + "epoch": 11.16549727141107, + "grad_norm": 0.12314177304506302, + "learning_rate": 2.427846808229684e-05, + "loss": 0.0759, + "num_input_tokens_seen": 121993808, + "step": 100255 + }, + { + "epoch": 11.166054126294688, + "grad_norm": 0.09345352649688721, + "learning_rate": 2.4276039360078825e-05, + "loss": 0.0422, + "num_input_tokens_seen": 121999632, + "step": 100260 + }, + { + "epoch": 11.166610981178305, + "grad_norm": 0.06057227775454521, + "learning_rate": 2.427361064469919e-05, + "loss": 0.0115, + "num_input_tokens_seen": 122005424, + "step": 100265 + }, + { + "epoch": 11.167167836061923, + "grad_norm": 0.019616981968283653, + "learning_rate": 2.4271181936180892e-05, + "loss": 0.0136, + "num_input_tokens_seen": 122011664, + "step": 100270 + }, + { + "epoch": 11.167724690945539, + "grad_norm": 1.3727878332138062, + "learning_rate": 2.4268753234546854e-05, + "loss": 0.1032, + "num_input_tokens_seen": 122017776, + "step": 100275 + }, + { + "epoch": 11.168281545829156, + "grad_norm": 0.7980385422706604, + "learning_rate": 2.426632453982004e-05, + "loss": 0.0252, + "num_input_tokens_seen": 122023792, + "step": 100280 + }, + { + "epoch": 11.168838400712774, + "grad_norm": 0.0054521821439266205, + "learning_rate": 2.4263895852023367e-05, + "loss": 0.0278, + "num_input_tokens_seen": 122029744, + "step": 100285 + }, + { + "epoch": 11.169395255596392, + "grad_norm": 0.000147558530443348, + "learning_rate": 2.4261467171179793e-05, + "loss": 0.0528, + "num_input_tokens_seen": 122035856, + "step": 100290 + }, + { + "epoch": 11.16995211048001, + "grad_norm": 0.4481090009212494, + "learning_rate": 2.4259038497312254e-05, + "loss": 0.0469, + "num_input_tokens_seen": 122042256, + "step": 100295 + }, + { + "epoch": 11.170508965363627, + "grad_norm": 0.29959380626678467, + "learning_rate": 2.425660983044369e-05, + "loss": 0.0557, + "num_input_tokens_seen": 122047920, + "step": 100300 + }, + { + "epoch": 11.171065820247243, + "grad_norm": 0.01476887334138155, + "learning_rate": 2.425418117059704e-05, + "loss": 0.0051, + "num_input_tokens_seen": 122054416, + "step": 100305 + }, + { + "epoch": 11.17162267513086, + "grad_norm": 0.7502741813659668, + "learning_rate": 2.425175251779525e-05, + "loss": 0.0548, + "num_input_tokens_seen": 122060496, + "step": 100310 + }, + { + "epoch": 11.172179530014478, + "grad_norm": 0.10352081060409546, + "learning_rate": 2.424932387206125e-05, + "loss": 0.009, + "num_input_tokens_seen": 122066768, + "step": 100315 + }, + { + "epoch": 11.172736384898096, + "grad_norm": 0.006024740636348724, + "learning_rate": 2.4246895233418e-05, + "loss": 0.1231, + "num_input_tokens_seen": 122072880, + "step": 100320 + }, + { + "epoch": 11.173293239781714, + "grad_norm": 0.19795945286750793, + "learning_rate": 2.4244466601888417e-05, + "loss": 0.0741, + "num_input_tokens_seen": 122078992, + "step": 100325 + }, + { + "epoch": 11.17385009466533, + "grad_norm": 0.08064857125282288, + "learning_rate": 2.4242037977495456e-05, + "loss": 0.016, + "num_input_tokens_seen": 122085168, + "step": 100330 + }, + { + "epoch": 11.174406949548947, + "grad_norm": 0.0203833170235157, + "learning_rate": 2.4239609360262044e-05, + "loss": 0.0278, + "num_input_tokens_seen": 122091248, + "step": 100335 + }, + { + "epoch": 11.174963804432565, + "grad_norm": 1.3665493726730347, + "learning_rate": 2.423718075021115e-05, + "loss": 0.0791, + "num_input_tokens_seen": 122097392, + "step": 100340 + }, + { + "epoch": 11.175520659316183, + "grad_norm": 0.01635979488492012, + "learning_rate": 2.4234752147365673e-05, + "loss": 0.0468, + "num_input_tokens_seen": 122103504, + "step": 100345 + }, + { + "epoch": 11.1760775141998, + "grad_norm": 1.0414738655090332, + "learning_rate": 2.42323235517486e-05, + "loss": 0.0466, + "num_input_tokens_seen": 122109232, + "step": 100350 + }, + { + "epoch": 11.176634369083416, + "grad_norm": 0.028496267274022102, + "learning_rate": 2.422989496338282e-05, + "loss": 0.0389, + "num_input_tokens_seen": 122115632, + "step": 100355 + }, + { + "epoch": 11.177191223967034, + "grad_norm": 0.2858099937438965, + "learning_rate": 2.4227466382291317e-05, + "loss": 0.0078, + "num_input_tokens_seen": 122121968, + "step": 100360 + }, + { + "epoch": 11.177748078850652, + "grad_norm": 1.1519429683685303, + "learning_rate": 2.4225037808497004e-05, + "loss": 0.1562, + "num_input_tokens_seen": 122127984, + "step": 100365 + }, + { + "epoch": 11.17830493373427, + "grad_norm": 0.0005376862827688456, + "learning_rate": 2.4222609242022838e-05, + "loss": 0.0155, + "num_input_tokens_seen": 122134032, + "step": 100370 + }, + { + "epoch": 11.178861788617887, + "grad_norm": 1.3177995681762695, + "learning_rate": 2.4220180682891743e-05, + "loss": 0.0498, + "num_input_tokens_seen": 122140048, + "step": 100375 + }, + { + "epoch": 11.179418643501503, + "grad_norm": 2.4014201164245605, + "learning_rate": 2.4217752131126673e-05, + "loss": 0.0836, + "num_input_tokens_seen": 122146128, + "step": 100380 + }, + { + "epoch": 11.17997549838512, + "grad_norm": 0.009546536952257156, + "learning_rate": 2.4215323586750556e-05, + "loss": 0.0623, + "num_input_tokens_seen": 122152528, + "step": 100385 + }, + { + "epoch": 11.180532353268738, + "grad_norm": 0.030580928549170494, + "learning_rate": 2.421289504978634e-05, + "loss": 0.0011, + "num_input_tokens_seen": 122158576, + "step": 100390 + }, + { + "epoch": 11.181089208152356, + "grad_norm": 0.0020063798874616623, + "learning_rate": 2.4210466520256955e-05, + "loss": 0.0681, + "num_input_tokens_seen": 122164720, + "step": 100395 + }, + { + "epoch": 11.181646063035974, + "grad_norm": 0.29282325506210327, + "learning_rate": 2.420803799818535e-05, + "loss": 0.0306, + "num_input_tokens_seen": 122170736, + "step": 100400 + }, + { + "epoch": 11.18220291791959, + "grad_norm": 0.1989598125219345, + "learning_rate": 2.4205609483594456e-05, + "loss": 0.0683, + "num_input_tokens_seen": 122176272, + "step": 100405 + }, + { + "epoch": 11.182759772803207, + "grad_norm": 1.810293436050415, + "learning_rate": 2.420318097650723e-05, + "loss": 0.0882, + "num_input_tokens_seen": 122181904, + "step": 100410 + }, + { + "epoch": 11.183316627686825, + "grad_norm": 0.41610485315322876, + "learning_rate": 2.420075247694659e-05, + "loss": 0.0237, + "num_input_tokens_seen": 122187664, + "step": 100415 + }, + { + "epoch": 11.183873482570442, + "grad_norm": 0.8778702616691589, + "learning_rate": 2.4198323984935476e-05, + "loss": 0.0329, + "num_input_tokens_seen": 122193584, + "step": 100420 + }, + { + "epoch": 11.18443033745406, + "grad_norm": 0.09989892691373825, + "learning_rate": 2.419589550049685e-05, + "loss": 0.1197, + "num_input_tokens_seen": 122199728, + "step": 100425 + }, + { + "epoch": 11.184987192337676, + "grad_norm": 0.7509744763374329, + "learning_rate": 2.4193467023653616e-05, + "loss": 0.0651, + "num_input_tokens_seen": 122206160, + "step": 100430 + }, + { + "epoch": 11.185544047221294, + "grad_norm": 0.16359110176563263, + "learning_rate": 2.419103855442875e-05, + "loss": 0.0133, + "num_input_tokens_seen": 122212528, + "step": 100435 + }, + { + "epoch": 11.186100902104911, + "grad_norm": 1.4846806526184082, + "learning_rate": 2.4188610092845156e-05, + "loss": 0.0556, + "num_input_tokens_seen": 122218928, + "step": 100440 + }, + { + "epoch": 11.186657756988529, + "grad_norm": 1.8151745796203613, + "learning_rate": 2.4186181638925802e-05, + "loss": 0.0999, + "num_input_tokens_seen": 122224944, + "step": 100445 + }, + { + "epoch": 11.187214611872147, + "grad_norm": 0.03037019446492195, + "learning_rate": 2.4183753192693607e-05, + "loss": 0.0271, + "num_input_tokens_seen": 122231216, + "step": 100450 + }, + { + "epoch": 11.187771466755763, + "grad_norm": 0.06644617766141891, + "learning_rate": 2.4181324754171527e-05, + "loss": 0.0212, + "num_input_tokens_seen": 122237360, + "step": 100455 + }, + { + "epoch": 11.18832832163938, + "grad_norm": 0.14377819001674652, + "learning_rate": 2.4178896323382484e-05, + "loss": 0.1169, + "num_input_tokens_seen": 122243344, + "step": 100460 + }, + { + "epoch": 11.188885176522998, + "grad_norm": 0.044430118054151535, + "learning_rate": 2.4176467900349427e-05, + "loss": 0.0114, + "num_input_tokens_seen": 122249392, + "step": 100465 + }, + { + "epoch": 11.189442031406616, + "grad_norm": 4.004251956939697, + "learning_rate": 2.4174039485095284e-05, + "loss": 0.1208, + "num_input_tokens_seen": 122255472, + "step": 100470 + }, + { + "epoch": 11.189998886290233, + "grad_norm": 0.0008171722874976695, + "learning_rate": 2.4171611077643005e-05, + "loss": 0.0962, + "num_input_tokens_seen": 122261808, + "step": 100475 + }, + { + "epoch": 11.19055574117385, + "grad_norm": 0.23233278095722198, + "learning_rate": 2.4169182678015522e-05, + "loss": 0.018, + "num_input_tokens_seen": 122268016, + "step": 100480 + }, + { + "epoch": 11.191112596057467, + "grad_norm": 0.010195409879088402, + "learning_rate": 2.4166754286235775e-05, + "loss": 0.0057, + "num_input_tokens_seen": 122274352, + "step": 100485 + }, + { + "epoch": 11.191669450941085, + "grad_norm": 0.3820025622844696, + "learning_rate": 2.416432590232669e-05, + "loss": 0.0347, + "num_input_tokens_seen": 122280432, + "step": 100490 + }, + { + "epoch": 11.192226305824702, + "grad_norm": 0.01981854811310768, + "learning_rate": 2.4161897526311235e-05, + "loss": 0.0195, + "num_input_tokens_seen": 122286576, + "step": 100495 + }, + { + "epoch": 11.19278316070832, + "grad_norm": 0.00013927440159022808, + "learning_rate": 2.4159469158212314e-05, + "loss": 0.0173, + "num_input_tokens_seen": 122292784, + "step": 100500 + }, + { + "epoch": 11.193340015591938, + "grad_norm": 0.0045181517489254475, + "learning_rate": 2.4157040798052886e-05, + "loss": 0.0625, + "num_input_tokens_seen": 122298928, + "step": 100505 + }, + { + "epoch": 11.193896870475553, + "grad_norm": 0.005191830452531576, + "learning_rate": 2.4154612445855884e-05, + "loss": 0.0154, + "num_input_tokens_seen": 122305328, + "step": 100510 + }, + { + "epoch": 11.194453725359171, + "grad_norm": 0.0022615385241806507, + "learning_rate": 2.4152184101644247e-05, + "loss": 0.0084, + "num_input_tokens_seen": 122310992, + "step": 100515 + }, + { + "epoch": 11.195010580242789, + "grad_norm": 0.004442053847014904, + "learning_rate": 2.4149755765440907e-05, + "loss": 0.0051, + "num_input_tokens_seen": 122317008, + "step": 100520 + }, + { + "epoch": 11.195567435126407, + "grad_norm": 0.0017796816537156701, + "learning_rate": 2.414732743726881e-05, + "loss": 0.0124, + "num_input_tokens_seen": 122323248, + "step": 100525 + }, + { + "epoch": 11.196124290010024, + "grad_norm": 0.01851513609290123, + "learning_rate": 2.414489911715088e-05, + "loss": 0.0079, + "num_input_tokens_seen": 122329392, + "step": 100530 + }, + { + "epoch": 11.19668114489364, + "grad_norm": 0.020365649834275246, + "learning_rate": 2.414247080511007e-05, + "loss": 0.1413, + "num_input_tokens_seen": 122335632, + "step": 100535 + }, + { + "epoch": 11.197237999777258, + "grad_norm": 0.003230626229196787, + "learning_rate": 2.4140042501169308e-05, + "loss": 0.0708, + "num_input_tokens_seen": 122341648, + "step": 100540 + }, + { + "epoch": 11.197794854660875, + "grad_norm": 0.20566615462303162, + "learning_rate": 2.4137614205351536e-05, + "loss": 0.0229, + "num_input_tokens_seen": 122347984, + "step": 100545 + }, + { + "epoch": 11.198351709544493, + "grad_norm": 0.19990500807762146, + "learning_rate": 2.4135185917679677e-05, + "loss": 0.0534, + "num_input_tokens_seen": 122354224, + "step": 100550 + }, + { + "epoch": 11.19890856442811, + "grad_norm": 0.004199289251118898, + "learning_rate": 2.41327576381767e-05, + "loss": 0.0256, + "num_input_tokens_seen": 122360784, + "step": 100555 + }, + { + "epoch": 11.199465419311727, + "grad_norm": 0.28465089201927185, + "learning_rate": 2.41303293668655e-05, + "loss": 0.0233, + "num_input_tokens_seen": 122366768, + "step": 100560 + }, + { + "epoch": 11.200022274195344, + "grad_norm": 0.4640485942363739, + "learning_rate": 2.412790110376905e-05, + "loss": 0.0098, + "num_input_tokens_seen": 122373072, + "step": 100565 + }, + { + "epoch": 11.200579129078962, + "grad_norm": 3.21221661567688, + "learning_rate": 2.412547284891027e-05, + "loss": 0.0932, + "num_input_tokens_seen": 122379280, + "step": 100570 + }, + { + "epoch": 11.20113598396258, + "grad_norm": 0.016675017774105072, + "learning_rate": 2.41230446023121e-05, + "loss": 0.0123, + "num_input_tokens_seen": 122385232, + "step": 100575 + }, + { + "epoch": 11.201692838846197, + "grad_norm": 0.3042091727256775, + "learning_rate": 2.4120616363997472e-05, + "loss": 0.0117, + "num_input_tokens_seen": 122391440, + "step": 100580 + }, + { + "epoch": 11.202249693729813, + "grad_norm": 1.6983991861343384, + "learning_rate": 2.4118188133989336e-05, + "loss": 0.0903, + "num_input_tokens_seen": 122397872, + "step": 100585 + }, + { + "epoch": 11.202806548613431, + "grad_norm": 0.00596893485635519, + "learning_rate": 2.411575991231061e-05, + "loss": 0.0047, + "num_input_tokens_seen": 122403632, + "step": 100590 + }, + { + "epoch": 11.203363403497049, + "grad_norm": 0.2979452908039093, + "learning_rate": 2.411333169898425e-05, + "loss": 0.0504, + "num_input_tokens_seen": 122410096, + "step": 100595 + }, + { + "epoch": 11.203920258380666, + "grad_norm": 0.32589471340179443, + "learning_rate": 2.411090349403317e-05, + "loss": 0.0047, + "num_input_tokens_seen": 122416432, + "step": 100600 + }, + { + "epoch": 11.204477113264284, + "grad_norm": 0.39301103353500366, + "learning_rate": 2.4108475297480332e-05, + "loss": 0.0286, + "num_input_tokens_seen": 122422544, + "step": 100605 + }, + { + "epoch": 11.2050339681479, + "grad_norm": 0.007925768382847309, + "learning_rate": 2.4106047109348648e-05, + "loss": 0.0147, + "num_input_tokens_seen": 122428912, + "step": 100610 + }, + { + "epoch": 11.205590823031518, + "grad_norm": 1.8689327239990234, + "learning_rate": 2.4103618929661072e-05, + "loss": 0.1655, + "num_input_tokens_seen": 122435120, + "step": 100615 + }, + { + "epoch": 11.206147677915135, + "grad_norm": 0.006325347814708948, + "learning_rate": 2.4101190758440526e-05, + "loss": 0.0149, + "num_input_tokens_seen": 122441552, + "step": 100620 + }, + { + "epoch": 11.206704532798753, + "grad_norm": 0.0010558851063251495, + "learning_rate": 2.4098762595709967e-05, + "loss": 0.0187, + "num_input_tokens_seen": 122447664, + "step": 100625 + }, + { + "epoch": 11.20726138768237, + "grad_norm": 0.003077732166275382, + "learning_rate": 2.40963344414923e-05, + "loss": 0.01, + "num_input_tokens_seen": 122453904, + "step": 100630 + }, + { + "epoch": 11.207818242565986, + "grad_norm": 0.01229590829461813, + "learning_rate": 2.4093906295810488e-05, + "loss": 0.2163, + "num_input_tokens_seen": 122460176, + "step": 100635 + }, + { + "epoch": 11.208375097449604, + "grad_norm": 0.9675058722496033, + "learning_rate": 2.4091478158687456e-05, + "loss": 0.0464, + "num_input_tokens_seen": 122466320, + "step": 100640 + }, + { + "epoch": 11.208931952333222, + "grad_norm": 1.5706360340118408, + "learning_rate": 2.4089050030146143e-05, + "loss": 0.1514, + "num_input_tokens_seen": 122472624, + "step": 100645 + }, + { + "epoch": 11.20948880721684, + "grad_norm": 0.48231008648872375, + "learning_rate": 2.4086621910209477e-05, + "loss": 0.0212, + "num_input_tokens_seen": 122478928, + "step": 100650 + }, + { + "epoch": 11.210045662100457, + "grad_norm": 0.011215484701097012, + "learning_rate": 2.4084193798900405e-05, + "loss": 0.0568, + "num_input_tokens_seen": 122484784, + "step": 100655 + }, + { + "epoch": 11.210602516984075, + "grad_norm": 0.009738324210047722, + "learning_rate": 2.4081765696241853e-05, + "loss": 0.0496, + "num_input_tokens_seen": 122490768, + "step": 100660 + }, + { + "epoch": 11.21115937186769, + "grad_norm": 0.20314089953899384, + "learning_rate": 2.4079337602256763e-05, + "loss": 0.0104, + "num_input_tokens_seen": 122496368, + "step": 100665 + }, + { + "epoch": 11.211716226751308, + "grad_norm": 0.33502259850502014, + "learning_rate": 2.407690951696806e-05, + "loss": 0.1347, + "num_input_tokens_seen": 122501904, + "step": 100670 + }, + { + "epoch": 11.212273081634926, + "grad_norm": 0.036789420992136, + "learning_rate": 2.4074481440398693e-05, + "loss": 0.0067, + "num_input_tokens_seen": 122508208, + "step": 100675 + }, + { + "epoch": 11.212829936518544, + "grad_norm": 0.00016006901569198817, + "learning_rate": 2.4072053372571583e-05, + "loss": 0.027, + "num_input_tokens_seen": 122514288, + "step": 100680 + }, + { + "epoch": 11.213386791402161, + "grad_norm": 0.009272233583033085, + "learning_rate": 2.4069625313509685e-05, + "loss": 0.0129, + "num_input_tokens_seen": 122520272, + "step": 100685 + }, + { + "epoch": 11.213943646285777, + "grad_norm": 0.017462503165006638, + "learning_rate": 2.4067197263235903e-05, + "loss": 0.0743, + "num_input_tokens_seen": 122526352, + "step": 100690 + }, + { + "epoch": 11.214500501169395, + "grad_norm": 0.19449324905872345, + "learning_rate": 2.4064769221773204e-05, + "loss": 0.0267, + "num_input_tokens_seen": 122532496, + "step": 100695 + }, + { + "epoch": 11.215057356053013, + "grad_norm": 0.0002975938841700554, + "learning_rate": 2.40623411891445e-05, + "loss": 0.0226, + "num_input_tokens_seen": 122539088, + "step": 100700 + }, + { + "epoch": 11.21561421093663, + "grad_norm": 0.01600891351699829, + "learning_rate": 2.4059913165372746e-05, + "loss": 0.038, + "num_input_tokens_seen": 122545104, + "step": 100705 + }, + { + "epoch": 11.216171065820248, + "grad_norm": 0.0006937799043953419, + "learning_rate": 2.4057485150480858e-05, + "loss": 0.008, + "num_input_tokens_seen": 122551248, + "step": 100710 + }, + { + "epoch": 11.216727920703864, + "grad_norm": 0.0070844991132617, + "learning_rate": 2.405505714449178e-05, + "loss": 0.0088, + "num_input_tokens_seen": 122557168, + "step": 100715 + }, + { + "epoch": 11.217284775587482, + "grad_norm": 0.07030016928911209, + "learning_rate": 2.4052629147428443e-05, + "loss": 0.0425, + "num_input_tokens_seen": 122563472, + "step": 100720 + }, + { + "epoch": 11.2178416304711, + "grad_norm": 0.027076026424765587, + "learning_rate": 2.4050201159313784e-05, + "loss": 0.1607, + "num_input_tokens_seen": 122569328, + "step": 100725 + }, + { + "epoch": 11.218398485354717, + "grad_norm": 0.005504704546183348, + "learning_rate": 2.4047773180170735e-05, + "loss": 0.0149, + "num_input_tokens_seen": 122575120, + "step": 100730 + }, + { + "epoch": 11.218955340238335, + "grad_norm": 0.1716562807559967, + "learning_rate": 2.4045345210022234e-05, + "loss": 0.0926, + "num_input_tokens_seen": 122580720, + "step": 100735 + }, + { + "epoch": 11.21951219512195, + "grad_norm": 2.698169231414795, + "learning_rate": 2.4042917248891202e-05, + "loss": 0.0445, + "num_input_tokens_seen": 122586800, + "step": 100740 + }, + { + "epoch": 11.220069050005568, + "grad_norm": 0.0002615925623103976, + "learning_rate": 2.40404892968006e-05, + "loss": 0.0267, + "num_input_tokens_seen": 122592496, + "step": 100745 + }, + { + "epoch": 11.220625904889186, + "grad_norm": 0.7078649997711182, + "learning_rate": 2.403806135377333e-05, + "loss": 0.0605, + "num_input_tokens_seen": 122598032, + "step": 100750 + }, + { + "epoch": 11.221182759772804, + "grad_norm": 0.38872653245925903, + "learning_rate": 2.4035633419832356e-05, + "loss": 0.1054, + "num_input_tokens_seen": 122604048, + "step": 100755 + }, + { + "epoch": 11.221739614656421, + "grad_norm": 0.05299981310963631, + "learning_rate": 2.403320549500058e-05, + "loss": 0.18, + "num_input_tokens_seen": 122610192, + "step": 100760 + }, + { + "epoch": 11.222296469540037, + "grad_norm": 1.035400390625, + "learning_rate": 2.4030777579300965e-05, + "loss": 0.1424, + "num_input_tokens_seen": 122616368, + "step": 100765 + }, + { + "epoch": 11.222853324423655, + "grad_norm": 0.02048357017338276, + "learning_rate": 2.4028349672756426e-05, + "loss": 0.0081, + "num_input_tokens_seen": 122622416, + "step": 100770 + }, + { + "epoch": 11.223410179307272, + "grad_norm": 0.6149584650993347, + "learning_rate": 2.402592177538991e-05, + "loss": 0.0273, + "num_input_tokens_seen": 122628816, + "step": 100775 + }, + { + "epoch": 11.22396703419089, + "grad_norm": 0.15978360176086426, + "learning_rate": 2.4023493887224334e-05, + "loss": 0.0299, + "num_input_tokens_seen": 122634736, + "step": 100780 + }, + { + "epoch": 11.224523889074508, + "grad_norm": 0.006801108829677105, + "learning_rate": 2.402106600828265e-05, + "loss": 0.0504, + "num_input_tokens_seen": 122640208, + "step": 100785 + }, + { + "epoch": 11.225080743958124, + "grad_norm": 0.9216001629829407, + "learning_rate": 2.4018638138587775e-05, + "loss": 0.0311, + "num_input_tokens_seen": 122645936, + "step": 100790 + }, + { + "epoch": 11.225637598841741, + "grad_norm": 0.6860135793685913, + "learning_rate": 2.4016210278162655e-05, + "loss": 0.0201, + "num_input_tokens_seen": 122651728, + "step": 100795 + }, + { + "epoch": 11.226194453725359, + "grad_norm": 0.026959141716361046, + "learning_rate": 2.401378242703021e-05, + "loss": 0.0009, + "num_input_tokens_seen": 122657840, + "step": 100800 + }, + { + "epoch": 11.226751308608977, + "grad_norm": 1.2628679275512695, + "learning_rate": 2.4011354585213385e-05, + "loss": 0.0675, + "num_input_tokens_seen": 122663536, + "step": 100805 + }, + { + "epoch": 11.227308163492594, + "grad_norm": 0.39082151651382446, + "learning_rate": 2.40089267527351e-05, + "loss": 0.0204, + "num_input_tokens_seen": 122669744, + "step": 100810 + }, + { + "epoch": 11.22786501837621, + "grad_norm": 0.3349146246910095, + "learning_rate": 2.400649892961831e-05, + "loss": 0.0521, + "num_input_tokens_seen": 122675824, + "step": 100815 + }, + { + "epoch": 11.228421873259828, + "grad_norm": 0.7710785865783691, + "learning_rate": 2.400407111588592e-05, + "loss": 0.0361, + "num_input_tokens_seen": 122682288, + "step": 100820 + }, + { + "epoch": 11.228978728143446, + "grad_norm": 0.7011739611625671, + "learning_rate": 2.4001643311560885e-05, + "loss": 0.0749, + "num_input_tokens_seen": 122688048, + "step": 100825 + }, + { + "epoch": 11.229535583027063, + "grad_norm": 1.028404951095581, + "learning_rate": 2.3999215516666133e-05, + "loss": 0.089, + "num_input_tokens_seen": 122694064, + "step": 100830 + }, + { + "epoch": 11.230092437910681, + "grad_norm": 0.712928056716919, + "learning_rate": 2.3996787731224578e-05, + "loss": 0.0197, + "num_input_tokens_seen": 122700304, + "step": 100835 + }, + { + "epoch": 11.230649292794297, + "grad_norm": 0.8565124273300171, + "learning_rate": 2.3994359955259177e-05, + "loss": 0.0917, + "num_input_tokens_seen": 122706544, + "step": 100840 + }, + { + "epoch": 11.231206147677915, + "grad_norm": 0.06690315902233124, + "learning_rate": 2.3991932188792847e-05, + "loss": 0.0502, + "num_input_tokens_seen": 122712912, + "step": 100845 + }, + { + "epoch": 11.231763002561532, + "grad_norm": 0.0016988664865493774, + "learning_rate": 2.3989504431848532e-05, + "loss": 0.0106, + "num_input_tokens_seen": 122719120, + "step": 100850 + }, + { + "epoch": 11.23231985744515, + "grad_norm": 0.011927981860935688, + "learning_rate": 2.3987076684449148e-05, + "loss": 0.0537, + "num_input_tokens_seen": 122724624, + "step": 100855 + }, + { + "epoch": 11.232876712328768, + "grad_norm": 0.030773906037211418, + "learning_rate": 2.3984648946617644e-05, + "loss": 0.0215, + "num_input_tokens_seen": 122730640, + "step": 100860 + }, + { + "epoch": 11.233433567212385, + "grad_norm": 0.02027920074760914, + "learning_rate": 2.398222121837694e-05, + "loss": 0.0511, + "num_input_tokens_seen": 122736976, + "step": 100865 + }, + { + "epoch": 11.233990422096001, + "grad_norm": 0.018992925062775612, + "learning_rate": 2.3979793499749975e-05, + "loss": 0.0061, + "num_input_tokens_seen": 122742864, + "step": 100870 + }, + { + "epoch": 11.234547276979619, + "grad_norm": 0.138621523976326, + "learning_rate": 2.397736579075967e-05, + "loss": 0.0252, + "num_input_tokens_seen": 122748912, + "step": 100875 + }, + { + "epoch": 11.235104131863237, + "grad_norm": 0.0012026246404275298, + "learning_rate": 2.3974938091428974e-05, + "loss": 0.0005, + "num_input_tokens_seen": 122755088, + "step": 100880 + }, + { + "epoch": 11.235660986746854, + "grad_norm": 0.03010985441505909, + "learning_rate": 2.3972510401780804e-05, + "loss": 0.0078, + "num_input_tokens_seen": 122761296, + "step": 100885 + }, + { + "epoch": 11.236217841630472, + "grad_norm": 1.5843896865844727, + "learning_rate": 2.39700827218381e-05, + "loss": 0.1152, + "num_input_tokens_seen": 122767536, + "step": 100890 + }, + { + "epoch": 11.236774696514088, + "grad_norm": 0.12191236764192581, + "learning_rate": 2.396765505162378e-05, + "loss": 0.0113, + "num_input_tokens_seen": 122773712, + "step": 100895 + }, + { + "epoch": 11.237331551397705, + "grad_norm": 0.0016853294800966978, + "learning_rate": 2.39652273911608e-05, + "loss": 0.0132, + "num_input_tokens_seen": 122779792, + "step": 100900 + }, + { + "epoch": 11.237888406281323, + "grad_norm": 0.11709152907133102, + "learning_rate": 2.396279974047206e-05, + "loss": 0.0128, + "num_input_tokens_seen": 122786000, + "step": 100905 + }, + { + "epoch": 11.23844526116494, + "grad_norm": 0.2942253053188324, + "learning_rate": 2.396037209958052e-05, + "loss": 0.0187, + "num_input_tokens_seen": 122791824, + "step": 100910 + }, + { + "epoch": 11.239002116048558, + "grad_norm": 0.05753038451075554, + "learning_rate": 2.3957944468509092e-05, + "loss": 0.0928, + "num_input_tokens_seen": 122797232, + "step": 100915 + }, + { + "epoch": 11.239558970932174, + "grad_norm": 0.0013328248169273138, + "learning_rate": 2.3955516847280716e-05, + "loss": 0.1421, + "num_input_tokens_seen": 122803440, + "step": 100920 + }, + { + "epoch": 11.240115825815792, + "grad_norm": 0.09353289008140564, + "learning_rate": 2.3953089235918323e-05, + "loss": 0.107, + "num_input_tokens_seen": 122808752, + "step": 100925 + }, + { + "epoch": 11.24067268069941, + "grad_norm": 0.3099040687084198, + "learning_rate": 2.395066163444484e-05, + "loss": 0.0373, + "num_input_tokens_seen": 122814960, + "step": 100930 + }, + { + "epoch": 11.241229535583027, + "grad_norm": 0.5079947710037231, + "learning_rate": 2.3948234042883193e-05, + "loss": 0.0605, + "num_input_tokens_seen": 122820848, + "step": 100935 + }, + { + "epoch": 11.241786390466645, + "grad_norm": 0.015375655144453049, + "learning_rate": 2.3945806461256325e-05, + "loss": 0.0256, + "num_input_tokens_seen": 122826896, + "step": 100940 + }, + { + "epoch": 11.242343245350261, + "grad_norm": 0.0009323856793344021, + "learning_rate": 2.3943378889587152e-05, + "loss": 0.0595, + "num_input_tokens_seen": 122833072, + "step": 100945 + }, + { + "epoch": 11.242900100233879, + "grad_norm": 0.09980244934558868, + "learning_rate": 2.3940951327898623e-05, + "loss": 0.0543, + "num_input_tokens_seen": 122839344, + "step": 100950 + }, + { + "epoch": 11.243456955117496, + "grad_norm": 0.31959158182144165, + "learning_rate": 2.393852377621364e-05, + "loss": 0.0728, + "num_input_tokens_seen": 122845296, + "step": 100955 + }, + { + "epoch": 11.244013810001114, + "grad_norm": 0.014780712313950062, + "learning_rate": 2.393609623455517e-05, + "loss": 0.028, + "num_input_tokens_seen": 122851632, + "step": 100960 + }, + { + "epoch": 11.244570664884732, + "grad_norm": 0.060791224241256714, + "learning_rate": 2.3933668702946107e-05, + "loss": 0.0868, + "num_input_tokens_seen": 122857968, + "step": 100965 + }, + { + "epoch": 11.245127519768348, + "grad_norm": 0.37272101640701294, + "learning_rate": 2.393124118140941e-05, + "loss": 0.0155, + "num_input_tokens_seen": 122864240, + "step": 100970 + }, + { + "epoch": 11.245684374651965, + "grad_norm": 0.48080044984817505, + "learning_rate": 2.3928813669967987e-05, + "loss": 0.0755, + "num_input_tokens_seen": 122870416, + "step": 100975 + }, + { + "epoch": 11.246241229535583, + "grad_norm": 0.03434157371520996, + "learning_rate": 2.3926386168644785e-05, + "loss": 0.004, + "num_input_tokens_seen": 122876464, + "step": 100980 + }, + { + "epoch": 11.2467980844192, + "grad_norm": 0.004941697232425213, + "learning_rate": 2.392395867746272e-05, + "loss": 0.1306, + "num_input_tokens_seen": 122881968, + "step": 100985 + }, + { + "epoch": 11.247354939302818, + "grad_norm": 0.043031953275203705, + "learning_rate": 2.392153119644473e-05, + "loss": 0.0588, + "num_input_tokens_seen": 122887696, + "step": 100990 + }, + { + "epoch": 11.247911794186434, + "grad_norm": 0.01745578460395336, + "learning_rate": 2.391910372561374e-05, + "loss": 0.0082, + "num_input_tokens_seen": 122893808, + "step": 100995 + }, + { + "epoch": 11.248468649070052, + "grad_norm": 0.35229334235191345, + "learning_rate": 2.3916676264992684e-05, + "loss": 0.0081, + "num_input_tokens_seen": 122899888, + "step": 101000 + }, + { + "epoch": 11.24902550395367, + "grad_norm": 0.0007726356270723045, + "learning_rate": 2.3914248814604488e-05, + "loss": 0.0068, + "num_input_tokens_seen": 122906224, + "step": 101005 + }, + { + "epoch": 11.249582358837287, + "grad_norm": 0.25626444816589355, + "learning_rate": 2.391182137447208e-05, + "loss": 0.0258, + "num_input_tokens_seen": 122912464, + "step": 101010 + }, + { + "epoch": 11.250139213720905, + "grad_norm": 2.008873462677002, + "learning_rate": 2.390939394461839e-05, + "loss": 0.1073, + "num_input_tokens_seen": 122918768, + "step": 101015 + }, + { + "epoch": 11.250696068604523, + "grad_norm": 0.3275708854198456, + "learning_rate": 2.3906966525066353e-05, + "loss": 0.007, + "num_input_tokens_seen": 122925072, + "step": 101020 + }, + { + "epoch": 11.251252923488138, + "grad_norm": 0.21993137896060944, + "learning_rate": 2.3904539115838882e-05, + "loss": 0.0029, + "num_input_tokens_seen": 122931184, + "step": 101025 + }, + { + "epoch": 11.251809778371756, + "grad_norm": 0.005383329465985298, + "learning_rate": 2.3902111716958935e-05, + "loss": 0.0001, + "num_input_tokens_seen": 122937648, + "step": 101030 + }, + { + "epoch": 11.252366633255374, + "grad_norm": 0.11495714634656906, + "learning_rate": 2.3899684328449406e-05, + "loss": 0.0015, + "num_input_tokens_seen": 122943536, + "step": 101035 + }, + { + "epoch": 11.252923488138991, + "grad_norm": 1.6993262767791748, + "learning_rate": 2.389725695033325e-05, + "loss": 0.1172, + "num_input_tokens_seen": 122949584, + "step": 101040 + }, + { + "epoch": 11.25348034302261, + "grad_norm": 0.27602192759513855, + "learning_rate": 2.3894829582633378e-05, + "loss": 0.0138, + "num_input_tokens_seen": 122955600, + "step": 101045 + }, + { + "epoch": 11.254037197906225, + "grad_norm": 9.832940850174055e-05, + "learning_rate": 2.389240222537273e-05, + "loss": 0.0344, + "num_input_tokens_seen": 122961872, + "step": 101050 + }, + { + "epoch": 11.254594052789843, + "grad_norm": 0.49501827359199524, + "learning_rate": 2.388997487857423e-05, + "loss": 0.0624, + "num_input_tokens_seen": 122968240, + "step": 101055 + }, + { + "epoch": 11.25515090767346, + "grad_norm": 1.182960867881775, + "learning_rate": 2.388754754226081e-05, + "loss": 0.0696, + "num_input_tokens_seen": 122974480, + "step": 101060 + }, + { + "epoch": 11.255707762557078, + "grad_norm": 1.6622328758239746, + "learning_rate": 2.388512021645539e-05, + "loss": 0.086, + "num_input_tokens_seen": 122979984, + "step": 101065 + }, + { + "epoch": 11.256264617440696, + "grad_norm": 0.023064682260155678, + "learning_rate": 2.3882692901180906e-05, + "loss": 0.0187, + "num_input_tokens_seen": 122985872, + "step": 101070 + }, + { + "epoch": 11.256821472324312, + "grad_norm": 0.12390607595443726, + "learning_rate": 2.388026559646028e-05, + "loss": 0.0356, + "num_input_tokens_seen": 122992080, + "step": 101075 + }, + { + "epoch": 11.25737832720793, + "grad_norm": 0.5540971159934998, + "learning_rate": 2.3877838302316448e-05, + "loss": 0.0981, + "num_input_tokens_seen": 122997904, + "step": 101080 + }, + { + "epoch": 11.257935182091547, + "grad_norm": 0.0004167279985267669, + "learning_rate": 2.387541101877232e-05, + "loss": 0.002, + "num_input_tokens_seen": 123004016, + "step": 101085 + }, + { + "epoch": 11.258492036975165, + "grad_norm": 0.012719746679067612, + "learning_rate": 2.387298374585085e-05, + "loss": 0.0912, + "num_input_tokens_seen": 123010032, + "step": 101090 + }, + { + "epoch": 11.259048891858782, + "grad_norm": 0.03745007514953613, + "learning_rate": 2.387055648357494e-05, + "loss": 0.0134, + "num_input_tokens_seen": 123016176, + "step": 101095 + }, + { + "epoch": 11.259605746742398, + "grad_norm": 0.10654453933238983, + "learning_rate": 2.3868129231967534e-05, + "loss": 0.0218, + "num_input_tokens_seen": 123022256, + "step": 101100 + }, + { + "epoch": 11.260162601626016, + "grad_norm": 0.4755532145500183, + "learning_rate": 2.3865701991051554e-05, + "loss": 0.0362, + "num_input_tokens_seen": 123028304, + "step": 101105 + }, + { + "epoch": 11.260719456509634, + "grad_norm": 0.00012574036372825503, + "learning_rate": 2.386327476084993e-05, + "loss": 0.0561, + "num_input_tokens_seen": 123034576, + "step": 101110 + }, + { + "epoch": 11.261276311393251, + "grad_norm": 0.0002804808027576655, + "learning_rate": 2.3860847541385583e-05, + "loss": 0.0227, + "num_input_tokens_seen": 123040752, + "step": 101115 + }, + { + "epoch": 11.261833166276869, + "grad_norm": 0.00029165318119339645, + "learning_rate": 2.3858420332681446e-05, + "loss": 0.1396, + "num_input_tokens_seen": 123046960, + "step": 101120 + }, + { + "epoch": 11.262390021160485, + "grad_norm": 0.11243734508752823, + "learning_rate": 2.3855993134760442e-05, + "loss": 0.0124, + "num_input_tokens_seen": 123052688, + "step": 101125 + }, + { + "epoch": 11.262946876044102, + "grad_norm": 0.24437950551509857, + "learning_rate": 2.3853565947645505e-05, + "loss": 0.0732, + "num_input_tokens_seen": 123059024, + "step": 101130 + }, + { + "epoch": 11.26350373092772, + "grad_norm": 0.11146587133407593, + "learning_rate": 2.3851138771359546e-05, + "loss": 0.0044, + "num_input_tokens_seen": 123065104, + "step": 101135 + }, + { + "epoch": 11.264060585811338, + "grad_norm": 0.04300788789987564, + "learning_rate": 2.384871160592551e-05, + "loss": 0.0786, + "num_input_tokens_seen": 123071248, + "step": 101140 + }, + { + "epoch": 11.264617440694956, + "grad_norm": 0.0921238511800766, + "learning_rate": 2.3846284451366306e-05, + "loss": 0.1608, + "num_input_tokens_seen": 123077520, + "step": 101145 + }, + { + "epoch": 11.265174295578571, + "grad_norm": 0.7053590416908264, + "learning_rate": 2.3843857307704884e-05, + "loss": 0.0734, + "num_input_tokens_seen": 123083152, + "step": 101150 + }, + { + "epoch": 11.265731150462189, + "grad_norm": 1.3565818071365356, + "learning_rate": 2.3841430174964143e-05, + "loss": 0.0567, + "num_input_tokens_seen": 123089296, + "step": 101155 + }, + { + "epoch": 11.266288005345807, + "grad_norm": 1.1497266292572021, + "learning_rate": 2.3839003053167033e-05, + "loss": 0.0562, + "num_input_tokens_seen": 123095568, + "step": 101160 + }, + { + "epoch": 11.266844860229424, + "grad_norm": 0.09568015486001968, + "learning_rate": 2.3836575942336456e-05, + "loss": 0.0082, + "num_input_tokens_seen": 123101648, + "step": 101165 + }, + { + "epoch": 11.267401715113042, + "grad_norm": 0.0008749605622142553, + "learning_rate": 2.3834148842495362e-05, + "loss": 0.002, + "num_input_tokens_seen": 123107632, + "step": 101170 + }, + { + "epoch": 11.267958569996658, + "grad_norm": 0.0009698248468339443, + "learning_rate": 2.3831721753666662e-05, + "loss": 0.0337, + "num_input_tokens_seen": 123113776, + "step": 101175 + }, + { + "epoch": 11.268515424880276, + "grad_norm": 0.015528100542724133, + "learning_rate": 2.382929467587329e-05, + "loss": 0.0675, + "num_input_tokens_seen": 123120016, + "step": 101180 + }, + { + "epoch": 11.269072279763893, + "grad_norm": 0.003559822915121913, + "learning_rate": 2.382686760913816e-05, + "loss": 0.115, + "num_input_tokens_seen": 123126000, + "step": 101185 + }, + { + "epoch": 11.269629134647511, + "grad_norm": 0.36766675114631653, + "learning_rate": 2.3824440553484214e-05, + "loss": 0.0304, + "num_input_tokens_seen": 123131760, + "step": 101190 + }, + { + "epoch": 11.270185989531129, + "grad_norm": 1.7742348909378052, + "learning_rate": 2.382201350893436e-05, + "loss": 0.0816, + "num_input_tokens_seen": 123137776, + "step": 101195 + }, + { + "epoch": 11.270742844414745, + "grad_norm": 0.6331964135169983, + "learning_rate": 2.3819586475511543e-05, + "loss": 0.0096, + "num_input_tokens_seen": 123144240, + "step": 101200 + }, + { + "epoch": 11.271299699298362, + "grad_norm": 0.09774908423423767, + "learning_rate": 2.381715945323867e-05, + "loss": 0.004, + "num_input_tokens_seen": 123150384, + "step": 101205 + }, + { + "epoch": 11.27185655418198, + "grad_norm": 0.14687202870845795, + "learning_rate": 2.3814732442138678e-05, + "loss": 0.0155, + "num_input_tokens_seen": 123156528, + "step": 101210 + }, + { + "epoch": 11.272413409065598, + "grad_norm": 2.9429309368133545, + "learning_rate": 2.3812305442234478e-05, + "loss": 0.1246, + "num_input_tokens_seen": 123162640, + "step": 101215 + }, + { + "epoch": 11.272970263949215, + "grad_norm": 1.1697806119918823, + "learning_rate": 2.380987845354902e-05, + "loss": 0.0775, + "num_input_tokens_seen": 123168816, + "step": 101220 + }, + { + "epoch": 11.273527118832833, + "grad_norm": 0.8009971976280212, + "learning_rate": 2.3807451476105196e-05, + "loss": 0.0185, + "num_input_tokens_seen": 123175344, + "step": 101225 + }, + { + "epoch": 11.274083973716449, + "grad_norm": 0.007878364995121956, + "learning_rate": 2.3805024509925963e-05, + "loss": 0.0844, + "num_input_tokens_seen": 123181712, + "step": 101230 + }, + { + "epoch": 11.274640828600067, + "grad_norm": 1.082374930381775, + "learning_rate": 2.3802597555034222e-05, + "loss": 0.0655, + "num_input_tokens_seen": 123187984, + "step": 101235 + }, + { + "epoch": 11.275197683483684, + "grad_norm": 0.21786877512931824, + "learning_rate": 2.3800170611452913e-05, + "loss": 0.0414, + "num_input_tokens_seen": 123194032, + "step": 101240 + }, + { + "epoch": 11.275754538367302, + "grad_norm": 0.0003346659941598773, + "learning_rate": 2.3797743679204955e-05, + "loss": 0.0145, + "num_input_tokens_seen": 123200208, + "step": 101245 + }, + { + "epoch": 11.27631139325092, + "grad_norm": 1.9213707447052002, + "learning_rate": 2.3795316758313262e-05, + "loss": 0.0603, + "num_input_tokens_seen": 123205968, + "step": 101250 + }, + { + "epoch": 11.276868248134535, + "grad_norm": 0.026524335145950317, + "learning_rate": 2.379288984880078e-05, + "loss": 0.0062, + "num_input_tokens_seen": 123211952, + "step": 101255 + }, + { + "epoch": 11.277425103018153, + "grad_norm": 0.030567411333322525, + "learning_rate": 2.3790462950690408e-05, + "loss": 0.0144, + "num_input_tokens_seen": 123218160, + "step": 101260 + }, + { + "epoch": 11.27798195790177, + "grad_norm": 0.0009276201017200947, + "learning_rate": 2.378803606400509e-05, + "loss": 0.0457, + "num_input_tokens_seen": 123224240, + "step": 101265 + }, + { + "epoch": 11.278538812785389, + "grad_norm": 1.0667920112609863, + "learning_rate": 2.378560918876774e-05, + "loss": 0.0299, + "num_input_tokens_seen": 123229744, + "step": 101270 + }, + { + "epoch": 11.279095667669006, + "grad_norm": 0.004715165589004755, + "learning_rate": 2.3783182325001284e-05, + "loss": 0.0179, + "num_input_tokens_seen": 123236176, + "step": 101275 + }, + { + "epoch": 11.279652522552622, + "grad_norm": 5.512187957763672, + "learning_rate": 2.3780755472728645e-05, + "loss": 0.0857, + "num_input_tokens_seen": 123242512, + "step": 101280 + }, + { + "epoch": 11.28020937743624, + "grad_norm": 0.016075322404503822, + "learning_rate": 2.377832863197275e-05, + "loss": 0.0351, + "num_input_tokens_seen": 123247952, + "step": 101285 + }, + { + "epoch": 11.280766232319857, + "grad_norm": 0.0142136225476861, + "learning_rate": 2.3775901802756512e-05, + "loss": 0.1217, + "num_input_tokens_seen": 123254064, + "step": 101290 + }, + { + "epoch": 11.281323087203475, + "grad_norm": 0.24302972853183746, + "learning_rate": 2.3773474985102876e-05, + "loss": 0.0227, + "num_input_tokens_seen": 123260112, + "step": 101295 + }, + { + "epoch": 11.281879942087093, + "grad_norm": 0.6206744909286499, + "learning_rate": 2.3771048179034736e-05, + "loss": 0.0259, + "num_input_tokens_seen": 123266160, + "step": 101300 + }, + { + "epoch": 11.282436796970709, + "grad_norm": 0.055485017597675323, + "learning_rate": 2.3768621384575048e-05, + "loss": 0.0355, + "num_input_tokens_seen": 123272240, + "step": 101305 + }, + { + "epoch": 11.282993651854326, + "grad_norm": 0.3009154498577118, + "learning_rate": 2.3766194601746697e-05, + "loss": 0.0154, + "num_input_tokens_seen": 123278512, + "step": 101310 + }, + { + "epoch": 11.283550506737944, + "grad_norm": 0.06512390822172165, + "learning_rate": 2.376376783057264e-05, + "loss": 0.0343, + "num_input_tokens_seen": 123284368, + "step": 101315 + }, + { + "epoch": 11.284107361621562, + "grad_norm": 1.1654984951019287, + "learning_rate": 2.3761341071075783e-05, + "loss": 0.1372, + "num_input_tokens_seen": 123290192, + "step": 101320 + }, + { + "epoch": 11.28466421650518, + "grad_norm": 0.045149482786655426, + "learning_rate": 2.3758914323279054e-05, + "loss": 0.0712, + "num_input_tokens_seen": 123296080, + "step": 101325 + }, + { + "epoch": 11.285221071388795, + "grad_norm": 0.07128211855888367, + "learning_rate": 2.375648758720537e-05, + "loss": 0.0283, + "num_input_tokens_seen": 123302352, + "step": 101330 + }, + { + "epoch": 11.285777926272413, + "grad_norm": 0.07574309408664703, + "learning_rate": 2.3754060862877665e-05, + "loss": 0.0171, + "num_input_tokens_seen": 123308464, + "step": 101335 + }, + { + "epoch": 11.28633478115603, + "grad_norm": 0.05519205331802368, + "learning_rate": 2.3751634150318845e-05, + "loss": 0.0604, + "num_input_tokens_seen": 123314832, + "step": 101340 + }, + { + "epoch": 11.286891636039648, + "grad_norm": 0.24884414672851562, + "learning_rate": 2.3749207449551843e-05, + "loss": 0.0783, + "num_input_tokens_seen": 123320752, + "step": 101345 + }, + { + "epoch": 11.287448490923266, + "grad_norm": 0.9582452774047852, + "learning_rate": 2.3746780760599577e-05, + "loss": 0.025, + "num_input_tokens_seen": 123326832, + "step": 101350 + }, + { + "epoch": 11.288005345806884, + "grad_norm": 0.051511481404304504, + "learning_rate": 2.3744354083484977e-05, + "loss": 0.0124, + "num_input_tokens_seen": 123333136, + "step": 101355 + }, + { + "epoch": 11.2885622006905, + "grad_norm": 0.06606200337409973, + "learning_rate": 2.374192741823095e-05, + "loss": 0.1043, + "num_input_tokens_seen": 123339280, + "step": 101360 + }, + { + "epoch": 11.289119055574117, + "grad_norm": 0.03729425370693207, + "learning_rate": 2.3739500764860437e-05, + "loss": 0.0173, + "num_input_tokens_seen": 123344816, + "step": 101365 + }, + { + "epoch": 11.289675910457735, + "grad_norm": 1.5903451442718506, + "learning_rate": 2.373707412339633e-05, + "loss": 0.0713, + "num_input_tokens_seen": 123350320, + "step": 101370 + }, + { + "epoch": 11.290232765341353, + "grad_norm": 0.04320254549384117, + "learning_rate": 2.373464749386159e-05, + "loss": 0.0924, + "num_input_tokens_seen": 123356592, + "step": 101375 + }, + { + "epoch": 11.29078962022497, + "grad_norm": 0.0001578835945110768, + "learning_rate": 2.373222087627911e-05, + "loss": 0.0339, + "num_input_tokens_seen": 123362896, + "step": 101380 + }, + { + "epoch": 11.291346475108586, + "grad_norm": 0.07285565882921219, + "learning_rate": 2.372979427067182e-05, + "loss": 0.0371, + "num_input_tokens_seen": 123369136, + "step": 101385 + }, + { + "epoch": 11.291903329992204, + "grad_norm": 0.3239012658596039, + "learning_rate": 2.372736767706264e-05, + "loss": 0.0637, + "num_input_tokens_seen": 123375216, + "step": 101390 + }, + { + "epoch": 11.292460184875821, + "grad_norm": 1.227232813835144, + "learning_rate": 2.3724941095474497e-05, + "loss": 0.0641, + "num_input_tokens_seen": 123381392, + "step": 101395 + }, + { + "epoch": 11.29301703975944, + "grad_norm": 0.14055414497852325, + "learning_rate": 2.37225145259303e-05, + "loss": 0.0447, + "num_input_tokens_seen": 123387440, + "step": 101400 + }, + { + "epoch": 11.293573894643057, + "grad_norm": 0.07477086782455444, + "learning_rate": 2.3720087968452984e-05, + "loss": 0.0097, + "num_input_tokens_seen": 123393456, + "step": 101405 + }, + { + "epoch": 11.294130749526673, + "grad_norm": 0.0026913185138255358, + "learning_rate": 2.3717661423065457e-05, + "loss": 0.0999, + "num_input_tokens_seen": 123399952, + "step": 101410 + }, + { + "epoch": 11.29468760441029, + "grad_norm": 0.013461657799780369, + "learning_rate": 2.3715234889790648e-05, + "loss": 0.0012, + "num_input_tokens_seen": 123406320, + "step": 101415 + }, + { + "epoch": 11.295244459293908, + "grad_norm": 0.9831660985946655, + "learning_rate": 2.3712808368651466e-05, + "loss": 0.034, + "num_input_tokens_seen": 123412304, + "step": 101420 + }, + { + "epoch": 11.295801314177526, + "grad_norm": 1.5944377183914185, + "learning_rate": 2.371038185967086e-05, + "loss": 0.0915, + "num_input_tokens_seen": 123417328, + "step": 101425 + }, + { + "epoch": 11.296358169061143, + "grad_norm": 1.2314832210540771, + "learning_rate": 2.3707955362871707e-05, + "loss": 0.0366, + "num_input_tokens_seen": 123423312, + "step": 101430 + }, + { + "epoch": 11.29691502394476, + "grad_norm": 0.021936746314167976, + "learning_rate": 2.3705528878276972e-05, + "loss": 0.007, + "num_input_tokens_seen": 123429552, + "step": 101435 + }, + { + "epoch": 11.297471878828377, + "grad_norm": 0.2744474709033966, + "learning_rate": 2.3703102405909537e-05, + "loss": 0.031, + "num_input_tokens_seen": 123435728, + "step": 101440 + }, + { + "epoch": 11.298028733711995, + "grad_norm": 0.03208814561367035, + "learning_rate": 2.3700675945792347e-05, + "loss": 0.0071, + "num_input_tokens_seen": 123441904, + "step": 101445 + }, + { + "epoch": 11.298585588595612, + "grad_norm": 0.6239570379257202, + "learning_rate": 2.369824949794831e-05, + "loss": 0.0228, + "num_input_tokens_seen": 123447952, + "step": 101450 + }, + { + "epoch": 11.29914244347923, + "grad_norm": 0.08908326178789139, + "learning_rate": 2.3695823062400355e-05, + "loss": 0.0033, + "num_input_tokens_seen": 123453968, + "step": 101455 + }, + { + "epoch": 11.299699298362846, + "grad_norm": 0.0034412613604217768, + "learning_rate": 2.3693396639171392e-05, + "loss": 0.0644, + "num_input_tokens_seen": 123459504, + "step": 101460 + }, + { + "epoch": 11.300256153246464, + "grad_norm": 0.04595879465341568, + "learning_rate": 2.369097022828435e-05, + "loss": 0.0044, + "num_input_tokens_seen": 123465872, + "step": 101465 + }, + { + "epoch": 11.300813008130081, + "grad_norm": 0.09068156033754349, + "learning_rate": 2.3688543829762135e-05, + "loss": 0.0048, + "num_input_tokens_seen": 123472176, + "step": 101470 + }, + { + "epoch": 11.301369863013699, + "grad_norm": 0.18361099064350128, + "learning_rate": 2.368611744362768e-05, + "loss": 0.0111, + "num_input_tokens_seen": 123478704, + "step": 101475 + }, + { + "epoch": 11.301926717897317, + "grad_norm": 0.5285986065864563, + "learning_rate": 2.3683691069903895e-05, + "loss": 0.0244, + "num_input_tokens_seen": 123485008, + "step": 101480 + }, + { + "epoch": 11.302483572780933, + "grad_norm": 0.01596645638346672, + "learning_rate": 2.3681264708613704e-05, + "loss": 0.002, + "num_input_tokens_seen": 123491728, + "step": 101485 + }, + { + "epoch": 11.30304042766455, + "grad_norm": 0.02220277674496174, + "learning_rate": 2.3678838359780018e-05, + "loss": 0.0723, + "num_input_tokens_seen": 123498096, + "step": 101490 + }, + { + "epoch": 11.303597282548168, + "grad_norm": 0.2697945833206177, + "learning_rate": 2.3676412023425776e-05, + "loss": 0.0584, + "num_input_tokens_seen": 123503984, + "step": 101495 + }, + { + "epoch": 11.304154137431786, + "grad_norm": 0.24483363330364227, + "learning_rate": 2.3673985699573865e-05, + "loss": 0.1004, + "num_input_tokens_seen": 123510000, + "step": 101500 + }, + { + "epoch": 11.304710992315403, + "grad_norm": 0.0005652657710015774, + "learning_rate": 2.3671559388247238e-05, + "loss": 0.0194, + "num_input_tokens_seen": 123516240, + "step": 101505 + }, + { + "epoch": 11.30526784719902, + "grad_norm": 0.8675422072410583, + "learning_rate": 2.3669133089468787e-05, + "loss": 0.1606, + "num_input_tokens_seen": 123521776, + "step": 101510 + }, + { + "epoch": 11.305824702082637, + "grad_norm": 0.0008942247950471938, + "learning_rate": 2.3666706803261447e-05, + "loss": 0.0049, + "num_input_tokens_seen": 123527824, + "step": 101515 + }, + { + "epoch": 11.306381556966254, + "grad_norm": 0.1798192411661148, + "learning_rate": 2.3664280529648125e-05, + "loss": 0.0675, + "num_input_tokens_seen": 123534096, + "step": 101520 + }, + { + "epoch": 11.306938411849872, + "grad_norm": 0.551624059677124, + "learning_rate": 2.3661854268651748e-05, + "loss": 0.0164, + "num_input_tokens_seen": 123540240, + "step": 101525 + }, + { + "epoch": 11.30749526673349, + "grad_norm": 0.04427526891231537, + "learning_rate": 2.365942802029522e-05, + "loss": 0.0169, + "num_input_tokens_seen": 123546416, + "step": 101530 + }, + { + "epoch": 11.308052121617106, + "grad_norm": 1.095301866531372, + "learning_rate": 2.365700178460148e-05, + "loss": 0.0616, + "num_input_tokens_seen": 123552432, + "step": 101535 + }, + { + "epoch": 11.308608976500723, + "grad_norm": 0.009504461660981178, + "learning_rate": 2.365457556159343e-05, + "loss": 0.0034, + "num_input_tokens_seen": 123558448, + "step": 101540 + }, + { + "epoch": 11.309165831384341, + "grad_norm": 0.0006521494360640645, + "learning_rate": 2.365214935129399e-05, + "loss": 0.0185, + "num_input_tokens_seen": 123564880, + "step": 101545 + }, + { + "epoch": 11.309722686267959, + "grad_norm": 0.6201804280281067, + "learning_rate": 2.3649723153726073e-05, + "loss": 0.1279, + "num_input_tokens_seen": 123570576, + "step": 101550 + }, + { + "epoch": 11.310279541151576, + "grad_norm": 1.7084858417510986, + "learning_rate": 2.364729696891262e-05, + "loss": 0.0509, + "num_input_tokens_seen": 123576688, + "step": 101555 + }, + { + "epoch": 11.310836396035194, + "grad_norm": 0.003325902856886387, + "learning_rate": 2.3644870796876507e-05, + "loss": 0.0137, + "num_input_tokens_seen": 123583024, + "step": 101560 + }, + { + "epoch": 11.31139325091881, + "grad_norm": 0.49515363574028015, + "learning_rate": 2.3642444637640697e-05, + "loss": 0.0692, + "num_input_tokens_seen": 123589232, + "step": 101565 + }, + { + "epoch": 11.311950105802428, + "grad_norm": 0.23871393501758575, + "learning_rate": 2.364001849122807e-05, + "loss": 0.0114, + "num_input_tokens_seen": 123595088, + "step": 101570 + }, + { + "epoch": 11.312506960686045, + "grad_norm": 0.006338931154459715, + "learning_rate": 2.363759235766157e-05, + "loss": 0.1651, + "num_input_tokens_seen": 123601264, + "step": 101575 + }, + { + "epoch": 11.313063815569663, + "grad_norm": 0.7876545786857605, + "learning_rate": 2.363516623696409e-05, + "loss": 0.0742, + "num_input_tokens_seen": 123607376, + "step": 101580 + }, + { + "epoch": 11.31362067045328, + "grad_norm": 0.35466468334198, + "learning_rate": 2.363274012915857e-05, + "loss": 0.0455, + "num_input_tokens_seen": 123613136, + "step": 101585 + }, + { + "epoch": 11.314177525336897, + "grad_norm": 1.4239847660064697, + "learning_rate": 2.363031403426791e-05, + "loss": 0.056, + "num_input_tokens_seen": 123619120, + "step": 101590 + }, + { + "epoch": 11.314734380220514, + "grad_norm": 0.10906082391738892, + "learning_rate": 2.3627887952315032e-05, + "loss": 0.0383, + "num_input_tokens_seen": 123625104, + "step": 101595 + }, + { + "epoch": 11.315291235104132, + "grad_norm": 0.10974358022212982, + "learning_rate": 2.3625461883322847e-05, + "loss": 0.0601, + "num_input_tokens_seen": 123631152, + "step": 101600 + }, + { + "epoch": 11.31584808998775, + "grad_norm": 0.0014992899959906936, + "learning_rate": 2.3623035827314284e-05, + "loss": 0.0171, + "num_input_tokens_seen": 123637392, + "step": 101605 + }, + { + "epoch": 11.316404944871367, + "grad_norm": 0.016740834340453148, + "learning_rate": 2.362060978431224e-05, + "loss": 0.0205, + "num_input_tokens_seen": 123643376, + "step": 101610 + }, + { + "epoch": 11.316961799754983, + "grad_norm": 2.444901943206787, + "learning_rate": 2.3618183754339656e-05, + "loss": 0.0596, + "num_input_tokens_seen": 123649680, + "step": 101615 + }, + { + "epoch": 11.3175186546386, + "grad_norm": 0.08960200846195221, + "learning_rate": 2.3615757737419414e-05, + "loss": 0.0141, + "num_input_tokens_seen": 123655728, + "step": 101620 + }, + { + "epoch": 11.318075509522219, + "grad_norm": 0.38204094767570496, + "learning_rate": 2.361333173357447e-05, + "loss": 0.0183, + "num_input_tokens_seen": 123661872, + "step": 101625 + }, + { + "epoch": 11.318632364405836, + "grad_norm": 0.5645211935043335, + "learning_rate": 2.36109057428277e-05, + "loss": 0.0763, + "num_input_tokens_seen": 123667312, + "step": 101630 + }, + { + "epoch": 11.319189219289454, + "grad_norm": 0.0210721418261528, + "learning_rate": 2.360847976520205e-05, + "loss": 0.0088, + "num_input_tokens_seen": 123673296, + "step": 101635 + }, + { + "epoch": 11.31974607417307, + "grad_norm": 0.055794596672058105, + "learning_rate": 2.3606053800720417e-05, + "loss": 0.0833, + "num_input_tokens_seen": 123678928, + "step": 101640 + }, + { + "epoch": 11.320302929056687, + "grad_norm": 0.026575757190585136, + "learning_rate": 2.3603627849405733e-05, + "loss": 0.014, + "num_input_tokens_seen": 123684432, + "step": 101645 + }, + { + "epoch": 11.320859783940305, + "grad_norm": 0.12161251157522202, + "learning_rate": 2.3601201911280897e-05, + "loss": 0.0548, + "num_input_tokens_seen": 123690544, + "step": 101650 + }, + { + "epoch": 11.321416638823923, + "grad_norm": 0.0001788448280422017, + "learning_rate": 2.359877598636883e-05, + "loss": 0.0397, + "num_input_tokens_seen": 123696656, + "step": 101655 + }, + { + "epoch": 11.32197349370754, + "grad_norm": 0.04104110226035118, + "learning_rate": 2.3596350074692446e-05, + "loss": 0.0089, + "num_input_tokens_seen": 123702800, + "step": 101660 + }, + { + "epoch": 11.322530348591156, + "grad_norm": 3.224008083343506, + "learning_rate": 2.3593924176274658e-05, + "loss": 0.1107, + "num_input_tokens_seen": 123709072, + "step": 101665 + }, + { + "epoch": 11.323087203474774, + "grad_norm": 0.1283109188079834, + "learning_rate": 2.3591498291138387e-05, + "loss": 0.0047, + "num_input_tokens_seen": 123715376, + "step": 101670 + }, + { + "epoch": 11.323644058358392, + "grad_norm": 0.25573205947875977, + "learning_rate": 2.3589072419306538e-05, + "loss": 0.0624, + "num_input_tokens_seen": 123721616, + "step": 101675 + }, + { + "epoch": 11.32420091324201, + "grad_norm": 0.010808466002345085, + "learning_rate": 2.3586646560802033e-05, + "loss": 0.0051, + "num_input_tokens_seen": 123727792, + "step": 101680 + }, + { + "epoch": 11.324757768125627, + "grad_norm": 0.8473207950592041, + "learning_rate": 2.3584220715647785e-05, + "loss": 0.0613, + "num_input_tokens_seen": 123734128, + "step": 101685 + }, + { + "epoch": 11.325314623009243, + "grad_norm": 0.06270307302474976, + "learning_rate": 2.3581794883866706e-05, + "loss": 0.0031, + "num_input_tokens_seen": 123740208, + "step": 101690 + }, + { + "epoch": 11.32587147789286, + "grad_norm": 0.6193490028381348, + "learning_rate": 2.3579369065481703e-05, + "loss": 0.0349, + "num_input_tokens_seen": 123746320, + "step": 101695 + }, + { + "epoch": 11.326428332776478, + "grad_norm": 0.06949014216661453, + "learning_rate": 2.3576943260515712e-05, + "loss": 0.0106, + "num_input_tokens_seen": 123752272, + "step": 101700 + }, + { + "epoch": 11.326985187660096, + "grad_norm": 0.01732609048485756, + "learning_rate": 2.3574517468991615e-05, + "loss": 0.0488, + "num_input_tokens_seen": 123758736, + "step": 101705 + }, + { + "epoch": 11.327542042543714, + "grad_norm": 0.02213377133011818, + "learning_rate": 2.357209169093236e-05, + "loss": 0.0021, + "num_input_tokens_seen": 123764880, + "step": 101710 + }, + { + "epoch": 11.328098897427331, + "grad_norm": 0.7975308895111084, + "learning_rate": 2.3569665926360825e-05, + "loss": 0.0159, + "num_input_tokens_seen": 123771312, + "step": 101715 + }, + { + "epoch": 11.328655752310947, + "grad_norm": 1.903722882270813, + "learning_rate": 2.3567240175299956e-05, + "loss": 0.1774, + "num_input_tokens_seen": 123777360, + "step": 101720 + }, + { + "epoch": 11.329212607194565, + "grad_norm": 0.26419636607170105, + "learning_rate": 2.356481443777264e-05, + "loss": 0.0319, + "num_input_tokens_seen": 123783184, + "step": 101725 + }, + { + "epoch": 11.329769462078183, + "grad_norm": 0.14642098546028137, + "learning_rate": 2.3562388713801814e-05, + "loss": 0.0222, + "num_input_tokens_seen": 123788848, + "step": 101730 + }, + { + "epoch": 11.3303263169618, + "grad_norm": 1.2288246154785156, + "learning_rate": 2.355996300341037e-05, + "loss": 0.1652, + "num_input_tokens_seen": 123794832, + "step": 101735 + }, + { + "epoch": 11.330883171845418, + "grad_norm": 0.017742477357387543, + "learning_rate": 2.355753730662123e-05, + "loss": 0.011, + "num_input_tokens_seen": 123801168, + "step": 101740 + }, + { + "epoch": 11.331440026729034, + "grad_norm": 0.7213758230209351, + "learning_rate": 2.355511162345731e-05, + "loss": 0.0283, + "num_input_tokens_seen": 123807088, + "step": 101745 + }, + { + "epoch": 11.331996881612652, + "grad_norm": 0.014015515334904194, + "learning_rate": 2.3552685953941517e-05, + "loss": 0.0146, + "num_input_tokens_seen": 123813264, + "step": 101750 + }, + { + "epoch": 11.33255373649627, + "grad_norm": 0.05420561879873276, + "learning_rate": 2.355026029809676e-05, + "loss": 0.1446, + "num_input_tokens_seen": 123819152, + "step": 101755 + }, + { + "epoch": 11.333110591379887, + "grad_norm": 7.664402801310644e-05, + "learning_rate": 2.3547834655945965e-05, + "loss": 0.0495, + "num_input_tokens_seen": 123824976, + "step": 101760 + }, + { + "epoch": 11.333667446263505, + "grad_norm": 0.2732428014278412, + "learning_rate": 2.3545409027512018e-05, + "loss": 0.0217, + "num_input_tokens_seen": 123830832, + "step": 101765 + }, + { + "epoch": 11.33422430114712, + "grad_norm": 0.045578885823488235, + "learning_rate": 2.354298341281787e-05, + "loss": 0.0027, + "num_input_tokens_seen": 123837040, + "step": 101770 + }, + { + "epoch": 11.334781156030738, + "grad_norm": 0.00284352689050138, + "learning_rate": 2.3540557811886394e-05, + "loss": 0.0155, + "num_input_tokens_seen": 123843216, + "step": 101775 + }, + { + "epoch": 11.335338010914356, + "grad_norm": 0.014024445787072182, + "learning_rate": 2.3538132224740526e-05, + "loss": 0.0153, + "num_input_tokens_seen": 123849424, + "step": 101780 + }, + { + "epoch": 11.335894865797973, + "grad_norm": 2.0690689086914062, + "learning_rate": 2.3535706651403165e-05, + "loss": 0.1276, + "num_input_tokens_seen": 123855440, + "step": 101785 + }, + { + "epoch": 11.336451720681591, + "grad_norm": 0.3313339650630951, + "learning_rate": 2.353328109189724e-05, + "loss": 0.0084, + "num_input_tokens_seen": 123861328, + "step": 101790 + }, + { + "epoch": 11.337008575565207, + "grad_norm": 0.04974917694926262, + "learning_rate": 2.3530855546245638e-05, + "loss": 0.0067, + "num_input_tokens_seen": 123867408, + "step": 101795 + }, + { + "epoch": 11.337565430448825, + "grad_norm": 0.10512468218803406, + "learning_rate": 2.352843001447129e-05, + "loss": 0.0175, + "num_input_tokens_seen": 123873776, + "step": 101800 + }, + { + "epoch": 11.338122285332442, + "grad_norm": 0.04688400402665138, + "learning_rate": 2.3526004496597096e-05, + "loss": 0.037, + "num_input_tokens_seen": 123879440, + "step": 101805 + }, + { + "epoch": 11.33867914021606, + "grad_norm": 0.011885745450854301, + "learning_rate": 2.3523578992645974e-05, + "loss": 0.009, + "num_input_tokens_seen": 123885584, + "step": 101810 + }, + { + "epoch": 11.339235995099678, + "grad_norm": 1.1996840238571167, + "learning_rate": 2.3521153502640826e-05, + "loss": 0.0521, + "num_input_tokens_seen": 123891088, + "step": 101815 + }, + { + "epoch": 11.339792849983294, + "grad_norm": 0.06453178822994232, + "learning_rate": 2.3518728026604572e-05, + "loss": 0.0606, + "num_input_tokens_seen": 123897200, + "step": 101820 + }, + { + "epoch": 11.340349704866911, + "grad_norm": 0.00011151835497003049, + "learning_rate": 2.3516302564560107e-05, + "loss": 0.0474, + "num_input_tokens_seen": 123903120, + "step": 101825 + }, + { + "epoch": 11.340906559750529, + "grad_norm": 0.0016374830156564713, + "learning_rate": 2.3513877116530374e-05, + "loss": 0.0165, + "num_input_tokens_seen": 123909552, + "step": 101830 + }, + { + "epoch": 11.341463414634147, + "grad_norm": 0.038956042379140854, + "learning_rate": 2.3511451682538244e-05, + "loss": 0.0082, + "num_input_tokens_seen": 123915504, + "step": 101835 + }, + { + "epoch": 11.342020269517764, + "grad_norm": 1.48470139503479, + "learning_rate": 2.350902626260666e-05, + "loss": 0.0711, + "num_input_tokens_seen": 123921392, + "step": 101840 + }, + { + "epoch": 11.34257712440138, + "grad_norm": 0.27876225113868713, + "learning_rate": 2.35066008567585e-05, + "loss": 0.0386, + "num_input_tokens_seen": 123927344, + "step": 101845 + }, + { + "epoch": 11.343133979284998, + "grad_norm": 0.3906742036342621, + "learning_rate": 2.3504175465016706e-05, + "loss": 0.1411, + "num_input_tokens_seen": 123933712, + "step": 101850 + }, + { + "epoch": 11.343690834168616, + "grad_norm": 0.279380738735199, + "learning_rate": 2.3501750087404167e-05, + "loss": 0.0458, + "num_input_tokens_seen": 123939408, + "step": 101855 + }, + { + "epoch": 11.344247689052233, + "grad_norm": 1.404854655265808, + "learning_rate": 2.3499324723943803e-05, + "loss": 0.0685, + "num_input_tokens_seen": 123945776, + "step": 101860 + }, + { + "epoch": 11.344804543935851, + "grad_norm": 0.03370184451341629, + "learning_rate": 2.3496899374658515e-05, + "loss": 0.0183, + "num_input_tokens_seen": 123951792, + "step": 101865 + }, + { + "epoch": 11.345361398819467, + "grad_norm": 0.015428517945110798, + "learning_rate": 2.349447403957122e-05, + "loss": 0.0327, + "num_input_tokens_seen": 123957936, + "step": 101870 + }, + { + "epoch": 11.345918253703084, + "grad_norm": 0.06754323095083237, + "learning_rate": 2.349204871870482e-05, + "loss": 0.0069, + "num_input_tokens_seen": 123964048, + "step": 101875 + }, + { + "epoch": 11.346475108586702, + "grad_norm": 1.8883453607559204, + "learning_rate": 2.3489623412082236e-05, + "loss": 0.0387, + "num_input_tokens_seen": 123970064, + "step": 101880 + }, + { + "epoch": 11.34703196347032, + "grad_norm": 0.06852627545595169, + "learning_rate": 2.3487198119726358e-05, + "loss": 0.1097, + "num_input_tokens_seen": 123975888, + "step": 101885 + }, + { + "epoch": 11.347588818353938, + "grad_norm": 0.08143004029989243, + "learning_rate": 2.3484772841660115e-05, + "loss": 0.068, + "num_input_tokens_seen": 123981840, + "step": 101890 + }, + { + "epoch": 11.348145673237553, + "grad_norm": 0.0008588648051954806, + "learning_rate": 2.3482347577906398e-05, + "loss": 0.0508, + "num_input_tokens_seen": 123987792, + "step": 101895 + }, + { + "epoch": 11.348702528121171, + "grad_norm": 0.0629984438419342, + "learning_rate": 2.3479922328488134e-05, + "loss": 0.0842, + "num_input_tokens_seen": 123993552, + "step": 101900 + }, + { + "epoch": 11.349259383004789, + "grad_norm": 0.0177948959171772, + "learning_rate": 2.3477497093428207e-05, + "loss": 0.0387, + "num_input_tokens_seen": 123999888, + "step": 101905 + }, + { + "epoch": 11.349816237888406, + "grad_norm": 6.75696792313829e-05, + "learning_rate": 2.3475071872749554e-05, + "loss": 0.0121, + "num_input_tokens_seen": 124006128, + "step": 101910 + }, + { + "epoch": 11.350373092772024, + "grad_norm": 0.0001906032266560942, + "learning_rate": 2.3472646666475063e-05, + "loss": 0.0159, + "num_input_tokens_seen": 124012240, + "step": 101915 + }, + { + "epoch": 11.350929947655642, + "grad_norm": 0.04850851744413376, + "learning_rate": 2.3470221474627653e-05, + "loss": 0.0052, + "num_input_tokens_seen": 124018320, + "step": 101920 + }, + { + "epoch": 11.351486802539258, + "grad_norm": 0.24673941731452942, + "learning_rate": 2.346779629723022e-05, + "loss": 0.0117, + "num_input_tokens_seen": 124024176, + "step": 101925 + }, + { + "epoch": 11.352043657422875, + "grad_norm": 0.005417506210505962, + "learning_rate": 2.3465371134305684e-05, + "loss": 0.1042, + "num_input_tokens_seen": 124030160, + "step": 101930 + }, + { + "epoch": 11.352600512306493, + "grad_norm": 0.00218904297798872, + "learning_rate": 2.346294598587694e-05, + "loss": 0.0287, + "num_input_tokens_seen": 124036080, + "step": 101935 + }, + { + "epoch": 11.35315736719011, + "grad_norm": 0.015078768134117126, + "learning_rate": 2.3460520851966912e-05, + "loss": 0.0432, + "num_input_tokens_seen": 124042512, + "step": 101940 + }, + { + "epoch": 11.353714222073728, + "grad_norm": 0.33333057165145874, + "learning_rate": 2.345809573259849e-05, + "loss": 0.0958, + "num_input_tokens_seen": 124048496, + "step": 101945 + }, + { + "epoch": 11.354271076957344, + "grad_norm": 1.166670322418213, + "learning_rate": 2.3455670627794594e-05, + "loss": 0.1313, + "num_input_tokens_seen": 124054320, + "step": 101950 + }, + { + "epoch": 11.354827931840962, + "grad_norm": 0.5891835689544678, + "learning_rate": 2.3453245537578117e-05, + "loss": 0.1004, + "num_input_tokens_seen": 124060528, + "step": 101955 + }, + { + "epoch": 11.35538478672458, + "grad_norm": 0.7801439762115479, + "learning_rate": 2.345082046197199e-05, + "loss": 0.0607, + "num_input_tokens_seen": 124066384, + "step": 101960 + }, + { + "epoch": 11.355941641608197, + "grad_norm": 0.6993331909179688, + "learning_rate": 2.344839540099909e-05, + "loss": 0.066, + "num_input_tokens_seen": 124072368, + "step": 101965 + }, + { + "epoch": 11.356498496491815, + "grad_norm": 0.06383057683706284, + "learning_rate": 2.3445970354682348e-05, + "loss": 0.0501, + "num_input_tokens_seen": 124078352, + "step": 101970 + }, + { + "epoch": 11.35705535137543, + "grad_norm": 0.9975430965423584, + "learning_rate": 2.3443545323044658e-05, + "loss": 0.0878, + "num_input_tokens_seen": 124084528, + "step": 101975 + }, + { + "epoch": 11.357612206259049, + "grad_norm": 0.0013393328990787268, + "learning_rate": 2.344112030610893e-05, + "loss": 0.0439, + "num_input_tokens_seen": 124090768, + "step": 101980 + }, + { + "epoch": 11.358169061142666, + "grad_norm": 0.677445650100708, + "learning_rate": 2.343869530389807e-05, + "loss": 0.1241, + "num_input_tokens_seen": 124096944, + "step": 101985 + }, + { + "epoch": 11.358725916026284, + "grad_norm": 0.002137589966878295, + "learning_rate": 2.3436270316434984e-05, + "loss": 0.0079, + "num_input_tokens_seen": 124103152, + "step": 101990 + }, + { + "epoch": 11.359282770909902, + "grad_norm": 0.04520916938781738, + "learning_rate": 2.3433845343742578e-05, + "loss": 0.0221, + "num_input_tokens_seen": 124109136, + "step": 101995 + }, + { + "epoch": 11.359839625793517, + "grad_norm": 0.6681933403015137, + "learning_rate": 2.343142038584376e-05, + "loss": 0.0188, + "num_input_tokens_seen": 124115280, + "step": 102000 + }, + { + "epoch": 11.360396480677135, + "grad_norm": 0.00028734022635035217, + "learning_rate": 2.342899544276143e-05, + "loss": 0.0339, + "num_input_tokens_seen": 124121712, + "step": 102005 + }, + { + "epoch": 11.360953335560753, + "grad_norm": 0.4014195501804352, + "learning_rate": 2.3426570514518497e-05, + "loss": 0.1422, + "num_input_tokens_seen": 124127952, + "step": 102010 + }, + { + "epoch": 11.36151019044437, + "grad_norm": 0.7462997436523438, + "learning_rate": 2.3424145601137858e-05, + "loss": 0.0636, + "num_input_tokens_seen": 124133712, + "step": 102015 + }, + { + "epoch": 11.362067045327988, + "grad_norm": 2.217118978500366, + "learning_rate": 2.3421720702642444e-05, + "loss": 0.1164, + "num_input_tokens_seen": 124140016, + "step": 102020 + }, + { + "epoch": 11.362623900211604, + "grad_norm": 0.8190069198608398, + "learning_rate": 2.3419295819055125e-05, + "loss": 0.0355, + "num_input_tokens_seen": 124145936, + "step": 102025 + }, + { + "epoch": 11.363180755095222, + "grad_norm": 0.007068159524351358, + "learning_rate": 2.3416870950398838e-05, + "loss": 0.0465, + "num_input_tokens_seen": 124152144, + "step": 102030 + }, + { + "epoch": 11.36373760997884, + "grad_norm": 0.010469400323927402, + "learning_rate": 2.341444609669646e-05, + "loss": 0.0122, + "num_input_tokens_seen": 124158128, + "step": 102035 + }, + { + "epoch": 11.364294464862457, + "grad_norm": 0.00017463257245253772, + "learning_rate": 2.3412021257970917e-05, + "loss": 0.0426, + "num_input_tokens_seen": 124164400, + "step": 102040 + }, + { + "epoch": 11.364851319746075, + "grad_norm": 2.3888375759124756, + "learning_rate": 2.34095964342451e-05, + "loss": 0.0563, + "num_input_tokens_seen": 124170704, + "step": 102045 + }, + { + "epoch": 11.36540817462969, + "grad_norm": 1.1366298198699951, + "learning_rate": 2.3407171625541928e-05, + "loss": 0.0308, + "num_input_tokens_seen": 124176432, + "step": 102050 + }, + { + "epoch": 11.365965029513308, + "grad_norm": 0.32493463158607483, + "learning_rate": 2.340474683188429e-05, + "loss": 0.07, + "num_input_tokens_seen": 124182832, + "step": 102055 + }, + { + "epoch": 11.366521884396926, + "grad_norm": 0.02342057041823864, + "learning_rate": 2.34023220532951e-05, + "loss": 0.0486, + "num_input_tokens_seen": 124189328, + "step": 102060 + }, + { + "epoch": 11.367078739280544, + "grad_norm": 0.3882823884487152, + "learning_rate": 2.3399897289797257e-05, + "loss": 0.06, + "num_input_tokens_seen": 124195120, + "step": 102065 + }, + { + "epoch": 11.367635594164161, + "grad_norm": 0.06518054008483887, + "learning_rate": 2.3397472541413662e-05, + "loss": 0.0705, + "num_input_tokens_seen": 124200880, + "step": 102070 + }, + { + "epoch": 11.368192449047779, + "grad_norm": 0.002608819864690304, + "learning_rate": 2.339504780816723e-05, + "loss": 0.0332, + "num_input_tokens_seen": 124207152, + "step": 102075 + }, + { + "epoch": 11.368749303931395, + "grad_norm": 0.1498107761144638, + "learning_rate": 2.339262309008085e-05, + "loss": 0.0111, + "num_input_tokens_seen": 124213360, + "step": 102080 + }, + { + "epoch": 11.369306158815013, + "grad_norm": 0.0002185442135669291, + "learning_rate": 2.339019838717744e-05, + "loss": 0.0022, + "num_input_tokens_seen": 124219856, + "step": 102085 + }, + { + "epoch": 11.36986301369863, + "grad_norm": 0.039344217628240585, + "learning_rate": 2.3387773699479885e-05, + "loss": 0.0097, + "num_input_tokens_seen": 124225936, + "step": 102090 + }, + { + "epoch": 11.370419868582248, + "grad_norm": 0.018121587112545967, + "learning_rate": 2.338534902701111e-05, + "loss": 0.0388, + "num_input_tokens_seen": 124231856, + "step": 102095 + }, + { + "epoch": 11.370976723465866, + "grad_norm": 0.01985592395067215, + "learning_rate": 2.3382924369793997e-05, + "loss": 0.0114, + "num_input_tokens_seen": 124238096, + "step": 102100 + }, + { + "epoch": 11.371533578349482, + "grad_norm": 0.04886990785598755, + "learning_rate": 2.338049972785147e-05, + "loss": 0.0138, + "num_input_tokens_seen": 124244304, + "step": 102105 + }, + { + "epoch": 11.3720904332331, + "grad_norm": 0.007592162117362022, + "learning_rate": 2.3378075101206408e-05, + "loss": 0.0122, + "num_input_tokens_seen": 124250256, + "step": 102110 + }, + { + "epoch": 11.372647288116717, + "grad_norm": 0.34168341755867004, + "learning_rate": 2.3375650489881743e-05, + "loss": 0.0373, + "num_input_tokens_seen": 124256528, + "step": 102115 + }, + { + "epoch": 11.373204143000335, + "grad_norm": 1.228879451751709, + "learning_rate": 2.337322589390034e-05, + "loss": 0.0154, + "num_input_tokens_seen": 124262608, + "step": 102120 + }, + { + "epoch": 11.373760997883952, + "grad_norm": 0.798992395401001, + "learning_rate": 2.3370801313285137e-05, + "loss": 0.0418, + "num_input_tokens_seen": 124268720, + "step": 102125 + }, + { + "epoch": 11.374317852767568, + "grad_norm": 2.283853530883789, + "learning_rate": 2.3368376748059013e-05, + "loss": 0.0539, + "num_input_tokens_seen": 124274864, + "step": 102130 + }, + { + "epoch": 11.374874707651186, + "grad_norm": 0.008031978271901608, + "learning_rate": 2.3365952198244885e-05, + "loss": 0.0141, + "num_input_tokens_seen": 124280976, + "step": 102135 + }, + { + "epoch": 11.375431562534803, + "grad_norm": 0.8788422346115112, + "learning_rate": 2.336352766386564e-05, + "loss": 0.0814, + "num_input_tokens_seen": 124286640, + "step": 102140 + }, + { + "epoch": 11.375988417418421, + "grad_norm": 0.011537768878042698, + "learning_rate": 2.3361103144944197e-05, + "loss": 0.0548, + "num_input_tokens_seen": 124292848, + "step": 102145 + }, + { + "epoch": 11.376545272302039, + "grad_norm": 1.4442180395126343, + "learning_rate": 2.335867864150344e-05, + "loss": 0.0241, + "num_input_tokens_seen": 124299024, + "step": 102150 + }, + { + "epoch": 11.377102127185655, + "grad_norm": 0.08989078551530838, + "learning_rate": 2.335625415356628e-05, + "loss": 0.0023, + "num_input_tokens_seen": 124305232, + "step": 102155 + }, + { + "epoch": 11.377658982069272, + "grad_norm": 0.18651658296585083, + "learning_rate": 2.3353829681155618e-05, + "loss": 0.1349, + "num_input_tokens_seen": 124310672, + "step": 102160 + }, + { + "epoch": 11.37821583695289, + "grad_norm": 0.7718557715415955, + "learning_rate": 2.3351405224294353e-05, + "loss": 0.0216, + "num_input_tokens_seen": 124316784, + "step": 102165 + }, + { + "epoch": 11.378772691836508, + "grad_norm": 3.3120062351226807, + "learning_rate": 2.334898078300538e-05, + "loss": 0.0996, + "num_input_tokens_seen": 124322800, + "step": 102170 + }, + { + "epoch": 11.379329546720125, + "grad_norm": 0.5329737067222595, + "learning_rate": 2.334655635731162e-05, + "loss": 0.0208, + "num_input_tokens_seen": 124328688, + "step": 102175 + }, + { + "epoch": 11.379886401603741, + "grad_norm": 0.00013885347289033234, + "learning_rate": 2.3344131947235946e-05, + "loss": 0.0862, + "num_input_tokens_seen": 124334768, + "step": 102180 + }, + { + "epoch": 11.380443256487359, + "grad_norm": 0.022606277838349342, + "learning_rate": 2.3341707552801277e-05, + "loss": 0.1074, + "num_input_tokens_seen": 124341200, + "step": 102185 + }, + { + "epoch": 11.381000111370977, + "grad_norm": 0.3259350061416626, + "learning_rate": 2.333928317403051e-05, + "loss": 0.0416, + "num_input_tokens_seen": 124347280, + "step": 102190 + }, + { + "epoch": 11.381556966254594, + "grad_norm": 0.6018919348716736, + "learning_rate": 2.333685881094655e-05, + "loss": 0.029, + "num_input_tokens_seen": 124353488, + "step": 102195 + }, + { + "epoch": 11.382113821138212, + "grad_norm": 0.36915263533592224, + "learning_rate": 2.333443446357228e-05, + "loss": 0.1008, + "num_input_tokens_seen": 124359440, + "step": 102200 + }, + { + "epoch": 11.382670676021828, + "grad_norm": 0.7696464657783508, + "learning_rate": 2.333201013193062e-05, + "loss": 0.028, + "num_input_tokens_seen": 124364976, + "step": 102205 + }, + { + "epoch": 11.383227530905446, + "grad_norm": 0.15980258584022522, + "learning_rate": 2.3329585816044454e-05, + "loss": 0.0039, + "num_input_tokens_seen": 124371440, + "step": 102210 + }, + { + "epoch": 11.383784385789063, + "grad_norm": 1.109648585319519, + "learning_rate": 2.3327161515936695e-05, + "loss": 0.0274, + "num_input_tokens_seen": 124377488, + "step": 102215 + }, + { + "epoch": 11.384341240672681, + "grad_norm": 0.037225667387247086, + "learning_rate": 2.3324737231630228e-05, + "loss": 0.002, + "num_input_tokens_seen": 124383920, + "step": 102220 + }, + { + "epoch": 11.384898095556299, + "grad_norm": 1.7980421781539917, + "learning_rate": 2.332231296314797e-05, + "loss": 0.1474, + "num_input_tokens_seen": 124389808, + "step": 102225 + }, + { + "epoch": 11.385454950439915, + "grad_norm": 0.45683836936950684, + "learning_rate": 2.3319888710512795e-05, + "loss": 0.2172, + "num_input_tokens_seen": 124395536, + "step": 102230 + }, + { + "epoch": 11.386011805323532, + "grad_norm": 1.5831167697906494, + "learning_rate": 2.331746447374763e-05, + "loss": 0.1107, + "num_input_tokens_seen": 124401072, + "step": 102235 + }, + { + "epoch": 11.38656866020715, + "grad_norm": 0.0008777171606197953, + "learning_rate": 2.3315040252875353e-05, + "loss": 0.073, + "num_input_tokens_seen": 124407248, + "step": 102240 + }, + { + "epoch": 11.387125515090768, + "grad_norm": 0.00022954547603148967, + "learning_rate": 2.3312616047918878e-05, + "loss": 0.0331, + "num_input_tokens_seen": 124413552, + "step": 102245 + }, + { + "epoch": 11.387682369974385, + "grad_norm": 0.16442981362342834, + "learning_rate": 2.331019185890109e-05, + "loss": 0.012, + "num_input_tokens_seen": 124419888, + "step": 102250 + }, + { + "epoch": 11.388239224858001, + "grad_norm": 0.0015318082878366113, + "learning_rate": 2.33077676858449e-05, + "loss": 0.0834, + "num_input_tokens_seen": 124426288, + "step": 102255 + }, + { + "epoch": 11.388796079741619, + "grad_norm": 0.05312811955809593, + "learning_rate": 2.3305343528773195e-05, + "loss": 0.0514, + "num_input_tokens_seen": 124432432, + "step": 102260 + }, + { + "epoch": 11.389352934625236, + "grad_norm": 1.0323575735092163, + "learning_rate": 2.3302919387708886e-05, + "loss": 0.032, + "num_input_tokens_seen": 124438768, + "step": 102265 + }, + { + "epoch": 11.389909789508854, + "grad_norm": 0.6648248434066772, + "learning_rate": 2.3300495262674856e-05, + "loss": 0.0789, + "num_input_tokens_seen": 124444496, + "step": 102270 + }, + { + "epoch": 11.390466644392472, + "grad_norm": 0.007751329801976681, + "learning_rate": 2.3298071153694014e-05, + "loss": 0.1176, + "num_input_tokens_seen": 124450608, + "step": 102275 + }, + { + "epoch": 11.39102349927609, + "grad_norm": 0.19551633298397064, + "learning_rate": 2.3295647060789247e-05, + "loss": 0.0135, + "num_input_tokens_seen": 124456624, + "step": 102280 + }, + { + "epoch": 11.391580354159705, + "grad_norm": 0.0051323771476745605, + "learning_rate": 2.3293222983983466e-05, + "loss": 0.0087, + "num_input_tokens_seen": 124462896, + "step": 102285 + }, + { + "epoch": 11.392137209043323, + "grad_norm": 0.71097332239151, + "learning_rate": 2.329079892329955e-05, + "loss": 0.0443, + "num_input_tokens_seen": 124468944, + "step": 102290 + }, + { + "epoch": 11.39269406392694, + "grad_norm": 0.08683192729949951, + "learning_rate": 2.328837487876042e-05, + "loss": 0.0396, + "num_input_tokens_seen": 124474640, + "step": 102295 + }, + { + "epoch": 11.393250918810558, + "grad_norm": 1.0431383848190308, + "learning_rate": 2.3285950850388953e-05, + "loss": 0.059, + "num_input_tokens_seen": 124480848, + "step": 102300 + }, + { + "epoch": 11.393807773694176, + "grad_norm": 0.00010331358498660848, + "learning_rate": 2.3283526838208063e-05, + "loss": 0.0076, + "num_input_tokens_seen": 124486928, + "step": 102305 + }, + { + "epoch": 11.394364628577792, + "grad_norm": 0.01683877222239971, + "learning_rate": 2.3281102842240623e-05, + "loss": 0.0491, + "num_input_tokens_seen": 124492944, + "step": 102310 + }, + { + "epoch": 11.39492148346141, + "grad_norm": 0.001718592830002308, + "learning_rate": 2.3278678862509555e-05, + "loss": 0.0501, + "num_input_tokens_seen": 124499088, + "step": 102315 + }, + { + "epoch": 11.395478338345027, + "grad_norm": 0.00028285273583605886, + "learning_rate": 2.3276254899037738e-05, + "loss": 0.0122, + "num_input_tokens_seen": 124505104, + "step": 102320 + }, + { + "epoch": 11.396035193228645, + "grad_norm": 0.518286406993866, + "learning_rate": 2.3273830951848083e-05, + "loss": 0.0455, + "num_input_tokens_seen": 124511216, + "step": 102325 + }, + { + "epoch": 11.396592048112263, + "grad_norm": 0.1920080929994583, + "learning_rate": 2.3271407020963467e-05, + "loss": 0.0167, + "num_input_tokens_seen": 124517488, + "step": 102330 + }, + { + "epoch": 11.397148902995879, + "grad_norm": 0.07221759855747223, + "learning_rate": 2.3268983106406807e-05, + "loss": 0.0026, + "num_input_tokens_seen": 124523248, + "step": 102335 + }, + { + "epoch": 11.397705757879496, + "grad_norm": 0.029189566150307655, + "learning_rate": 2.326655920820098e-05, + "loss": 0.0095, + "num_input_tokens_seen": 124529296, + "step": 102340 + }, + { + "epoch": 11.398262612763114, + "grad_norm": 1.3102155923843384, + "learning_rate": 2.3264135326368895e-05, + "loss": 0.024, + "num_input_tokens_seen": 124535664, + "step": 102345 + }, + { + "epoch": 11.398819467646732, + "grad_norm": 0.7428733110427856, + "learning_rate": 2.326171146093344e-05, + "loss": 0.0406, + "num_input_tokens_seen": 124541648, + "step": 102350 + }, + { + "epoch": 11.39937632253035, + "grad_norm": 0.961432158946991, + "learning_rate": 2.325928761191752e-05, + "loss": 0.0331, + "num_input_tokens_seen": 124548080, + "step": 102355 + }, + { + "epoch": 11.399933177413965, + "grad_norm": 0.0038313495460897684, + "learning_rate": 2.3256863779344006e-05, + "loss": 0.0108, + "num_input_tokens_seen": 124554224, + "step": 102360 + }, + { + "epoch": 11.400490032297583, + "grad_norm": 1.2218133211135864, + "learning_rate": 2.325443996323583e-05, + "loss": 0.0882, + "num_input_tokens_seen": 124559888, + "step": 102365 + }, + { + "epoch": 11.4010468871812, + "grad_norm": 1.1615768671035767, + "learning_rate": 2.325201616361585e-05, + "loss": 0.0584, + "num_input_tokens_seen": 124565872, + "step": 102370 + }, + { + "epoch": 11.401603742064818, + "grad_norm": 0.15448322892189026, + "learning_rate": 2.324959238050699e-05, + "loss": 0.039, + "num_input_tokens_seen": 124572368, + "step": 102375 + }, + { + "epoch": 11.402160596948436, + "grad_norm": 0.23891787230968475, + "learning_rate": 2.3247168613932125e-05, + "loss": 0.0689, + "num_input_tokens_seen": 124578320, + "step": 102380 + }, + { + "epoch": 11.402717451832052, + "grad_norm": 0.18229003250598907, + "learning_rate": 2.3244744863914163e-05, + "loss": 0.0833, + "num_input_tokens_seen": 124584720, + "step": 102385 + }, + { + "epoch": 11.40327430671567, + "grad_norm": 0.3527698218822479, + "learning_rate": 2.324232113047599e-05, + "loss": 0.0399, + "num_input_tokens_seen": 124590768, + "step": 102390 + }, + { + "epoch": 11.403831161599287, + "grad_norm": 0.0008825613767839968, + "learning_rate": 2.3239897413640502e-05, + "loss": 0.0076, + "num_input_tokens_seen": 124596816, + "step": 102395 + }, + { + "epoch": 11.404388016482905, + "grad_norm": 0.0005559229175560176, + "learning_rate": 2.323747371343059e-05, + "loss": 0.0652, + "num_input_tokens_seen": 124602960, + "step": 102400 + }, + { + "epoch": 11.404944871366522, + "grad_norm": 0.15882301330566406, + "learning_rate": 2.3235050029869157e-05, + "loss": 0.032, + "num_input_tokens_seen": 124608976, + "step": 102405 + }, + { + "epoch": 11.405501726250138, + "grad_norm": 1.045462965965271, + "learning_rate": 2.3232626362979086e-05, + "loss": 0.1281, + "num_input_tokens_seen": 124615376, + "step": 102410 + }, + { + "epoch": 11.406058581133756, + "grad_norm": 0.04404095187783241, + "learning_rate": 2.323020271278328e-05, + "loss": 0.0319, + "num_input_tokens_seen": 124621264, + "step": 102415 + }, + { + "epoch": 11.406615436017374, + "grad_norm": 0.011225331574678421, + "learning_rate": 2.3227779079304612e-05, + "loss": 0.0718, + "num_input_tokens_seen": 124627440, + "step": 102420 + }, + { + "epoch": 11.407172290900991, + "grad_norm": 0.00013049329572822899, + "learning_rate": 2.322535546256601e-05, + "loss": 0.0164, + "num_input_tokens_seen": 124633424, + "step": 102425 + }, + { + "epoch": 11.407729145784609, + "grad_norm": 0.06873708218336105, + "learning_rate": 2.3222931862590333e-05, + "loss": 0.0546, + "num_input_tokens_seen": 124639504, + "step": 102430 + }, + { + "epoch": 11.408286000668227, + "grad_norm": 0.07906067371368408, + "learning_rate": 2.3220508279400503e-05, + "loss": 0.0923, + "num_input_tokens_seen": 124644944, + "step": 102435 + }, + { + "epoch": 11.408842855551843, + "grad_norm": 0.1816263049840927, + "learning_rate": 2.3218084713019382e-05, + "loss": 0.0362, + "num_input_tokens_seen": 124651472, + "step": 102440 + }, + { + "epoch": 11.40939971043546, + "grad_norm": 0.7061998248100281, + "learning_rate": 2.3215661163469887e-05, + "loss": 0.0465, + "num_input_tokens_seen": 124657616, + "step": 102445 + }, + { + "epoch": 11.409956565319078, + "grad_norm": 0.0005378624773584306, + "learning_rate": 2.32132376307749e-05, + "loss": 0.0378, + "num_input_tokens_seen": 124663792, + "step": 102450 + }, + { + "epoch": 11.410513420202696, + "grad_norm": 0.00029035916668362916, + "learning_rate": 2.321081411495732e-05, + "loss": 0.0035, + "num_input_tokens_seen": 124669712, + "step": 102455 + }, + { + "epoch": 11.411070275086313, + "grad_norm": 0.12521055340766907, + "learning_rate": 2.3208390616040028e-05, + "loss": 0.0462, + "num_input_tokens_seen": 124675824, + "step": 102460 + }, + { + "epoch": 11.41162712996993, + "grad_norm": 1.4934139251708984, + "learning_rate": 2.3205967134045926e-05, + "loss": 0.0939, + "num_input_tokens_seen": 124681968, + "step": 102465 + }, + { + "epoch": 11.412183984853547, + "grad_norm": 0.0020638867281377316, + "learning_rate": 2.3203543668997904e-05, + "loss": 0.0058, + "num_input_tokens_seen": 124688496, + "step": 102470 + }, + { + "epoch": 11.412740839737165, + "grad_norm": 0.7077879905700684, + "learning_rate": 2.3201120220918842e-05, + "loss": 0.0555, + "num_input_tokens_seen": 124694736, + "step": 102475 + }, + { + "epoch": 11.413297694620782, + "grad_norm": 0.7969807386398315, + "learning_rate": 2.319869678983165e-05, + "loss": 0.099, + "num_input_tokens_seen": 124700656, + "step": 102480 + }, + { + "epoch": 11.4138545495044, + "grad_norm": 0.0858001634478569, + "learning_rate": 2.3196273375759207e-05, + "loss": 0.0083, + "num_input_tokens_seen": 124706832, + "step": 102485 + }, + { + "epoch": 11.414411404388016, + "grad_norm": 0.006067700684070587, + "learning_rate": 2.3193849978724408e-05, + "loss": 0.0036, + "num_input_tokens_seen": 124712880, + "step": 102490 + }, + { + "epoch": 11.414968259271633, + "grad_norm": 0.2898915112018585, + "learning_rate": 2.319142659875014e-05, + "loss": 0.015, + "num_input_tokens_seen": 124719120, + "step": 102495 + }, + { + "epoch": 11.415525114155251, + "grad_norm": 1.0599080324172974, + "learning_rate": 2.3189003235859298e-05, + "loss": 0.1644, + "num_input_tokens_seen": 124724656, + "step": 102500 + }, + { + "epoch": 11.416081969038869, + "grad_norm": 1.3393806219100952, + "learning_rate": 2.3186579890074762e-05, + "loss": 0.0942, + "num_input_tokens_seen": 124730576, + "step": 102505 + }, + { + "epoch": 11.416638823922487, + "grad_norm": 0.007218766491860151, + "learning_rate": 2.3184156561419452e-05, + "loss": 0.002, + "num_input_tokens_seen": 124736304, + "step": 102510 + }, + { + "epoch": 11.417195678806102, + "grad_norm": 0.969296932220459, + "learning_rate": 2.318173324991622e-05, + "loss": 0.072, + "num_input_tokens_seen": 124742512, + "step": 102515 + }, + { + "epoch": 11.41775253368972, + "grad_norm": 0.02273363061249256, + "learning_rate": 2.3179309955587986e-05, + "loss": 0.0345, + "num_input_tokens_seen": 124748368, + "step": 102520 + }, + { + "epoch": 11.418309388573338, + "grad_norm": 0.07603918015956879, + "learning_rate": 2.3176886678457622e-05, + "loss": 0.1361, + "num_input_tokens_seen": 124754160, + "step": 102525 + }, + { + "epoch": 11.418866243456955, + "grad_norm": 0.006015130318701267, + "learning_rate": 2.3174463418548024e-05, + "loss": 0.0086, + "num_input_tokens_seen": 124760464, + "step": 102530 + }, + { + "epoch": 11.419423098340573, + "grad_norm": 0.011412225663661957, + "learning_rate": 2.3172040175882086e-05, + "loss": 0.0434, + "num_input_tokens_seen": 124766224, + "step": 102535 + }, + { + "epoch": 11.419979953224189, + "grad_norm": 0.0029151514172554016, + "learning_rate": 2.3169616950482694e-05, + "loss": 0.0071, + "num_input_tokens_seen": 124772624, + "step": 102540 + }, + { + "epoch": 11.420536808107807, + "grad_norm": 0.024440495297312737, + "learning_rate": 2.3167193742372728e-05, + "loss": 0.0561, + "num_input_tokens_seen": 124778480, + "step": 102545 + }, + { + "epoch": 11.421093662991424, + "grad_norm": 0.0001781235623639077, + "learning_rate": 2.3164770551575092e-05, + "loss": 0.0051, + "num_input_tokens_seen": 124784336, + "step": 102550 + }, + { + "epoch": 11.421650517875042, + "grad_norm": 0.02506156824529171, + "learning_rate": 2.3162347378112664e-05, + "loss": 0.0071, + "num_input_tokens_seen": 124790736, + "step": 102555 + }, + { + "epoch": 11.42220737275866, + "grad_norm": 2.586723804473877, + "learning_rate": 2.3159924222008346e-05, + "loss": 0.0369, + "num_input_tokens_seen": 124796976, + "step": 102560 + }, + { + "epoch": 11.422764227642276, + "grad_norm": 0.0007493706652894616, + "learning_rate": 2.315750108328501e-05, + "loss": 0.0157, + "num_input_tokens_seen": 124803120, + "step": 102565 + }, + { + "epoch": 11.423321082525893, + "grad_norm": 1.1719869375228882, + "learning_rate": 2.3155077961965555e-05, + "loss": 0.0141, + "num_input_tokens_seen": 124809104, + "step": 102570 + }, + { + "epoch": 11.423877937409511, + "grad_norm": 2.451042652130127, + "learning_rate": 2.315265485807286e-05, + "loss": 0.098, + "num_input_tokens_seen": 124815024, + "step": 102575 + }, + { + "epoch": 11.424434792293129, + "grad_norm": 0.024267755448818207, + "learning_rate": 2.3150231771629836e-05, + "loss": 0.0484, + "num_input_tokens_seen": 124821328, + "step": 102580 + }, + { + "epoch": 11.424991647176746, + "grad_norm": 0.05925151705741882, + "learning_rate": 2.3147808702659337e-05, + "loss": 0.0204, + "num_input_tokens_seen": 124827568, + "step": 102585 + }, + { + "epoch": 11.425548502060362, + "grad_norm": 0.2115486115217209, + "learning_rate": 2.314538565118428e-05, + "loss": 0.0096, + "num_input_tokens_seen": 124833488, + "step": 102590 + }, + { + "epoch": 11.42610535694398, + "grad_norm": 0.0008847780991345644, + "learning_rate": 2.3142962617227533e-05, + "loss": 0.0898, + "num_input_tokens_seen": 124839856, + "step": 102595 + }, + { + "epoch": 11.426662211827598, + "grad_norm": 0.030431222170591354, + "learning_rate": 2.3140539600812e-05, + "loss": 0.0485, + "num_input_tokens_seen": 124845968, + "step": 102600 + }, + { + "epoch": 11.427219066711215, + "grad_norm": 1.9573620557785034, + "learning_rate": 2.3138116601960557e-05, + "loss": 0.085, + "num_input_tokens_seen": 124852400, + "step": 102605 + }, + { + "epoch": 11.427775921594833, + "grad_norm": 0.33694371581077576, + "learning_rate": 2.3135693620696098e-05, + "loss": 0.004, + "num_input_tokens_seen": 124858448, + "step": 102610 + }, + { + "epoch": 11.428332776478449, + "grad_norm": 2.0625712871551514, + "learning_rate": 2.31332706570415e-05, + "loss": 0.1554, + "num_input_tokens_seen": 124864688, + "step": 102615 + }, + { + "epoch": 11.428889631362066, + "grad_norm": 0.24539715051651, + "learning_rate": 2.3130847711019664e-05, + "loss": 0.0297, + "num_input_tokens_seen": 124870832, + "step": 102620 + }, + { + "epoch": 11.429446486245684, + "grad_norm": 0.795749306678772, + "learning_rate": 2.3128424782653462e-05, + "loss": 0.0274, + "num_input_tokens_seen": 124876912, + "step": 102625 + }, + { + "epoch": 11.430003341129302, + "grad_norm": 0.44815173745155334, + "learning_rate": 2.312600187196579e-05, + "loss": 0.0162, + "num_input_tokens_seen": 124883088, + "step": 102630 + }, + { + "epoch": 11.43056019601292, + "grad_norm": 0.04929348826408386, + "learning_rate": 2.3123578978979528e-05, + "loss": 0.1089, + "num_input_tokens_seen": 124889072, + "step": 102635 + }, + { + "epoch": 11.431117050896537, + "grad_norm": 0.38182008266448975, + "learning_rate": 2.3121156103717576e-05, + "loss": 0.0253, + "num_input_tokens_seen": 124895440, + "step": 102640 + }, + { + "epoch": 11.431673905780153, + "grad_norm": 0.10790983587503433, + "learning_rate": 2.3118733246202794e-05, + "loss": 0.0212, + "num_input_tokens_seen": 124901008, + "step": 102645 + }, + { + "epoch": 11.43223076066377, + "grad_norm": 0.1461508721113205, + "learning_rate": 2.3116310406458096e-05, + "loss": 0.0106, + "num_input_tokens_seen": 124906896, + "step": 102650 + }, + { + "epoch": 11.432787615547388, + "grad_norm": 0.0002796475891955197, + "learning_rate": 2.311388758450635e-05, + "loss": 0.1406, + "num_input_tokens_seen": 124912592, + "step": 102655 + }, + { + "epoch": 11.433344470431006, + "grad_norm": 1.3843036890029907, + "learning_rate": 2.3111464780370454e-05, + "loss": 0.0683, + "num_input_tokens_seen": 124918768, + "step": 102660 + }, + { + "epoch": 11.433901325314624, + "grad_norm": 0.13204529881477356, + "learning_rate": 2.310904199407328e-05, + "loss": 0.0219, + "num_input_tokens_seen": 124924912, + "step": 102665 + }, + { + "epoch": 11.43445818019824, + "grad_norm": 0.002010015305131674, + "learning_rate": 2.3106619225637724e-05, + "loss": 0.0015, + "num_input_tokens_seen": 124930800, + "step": 102670 + }, + { + "epoch": 11.435015035081857, + "grad_norm": 0.37334948778152466, + "learning_rate": 2.3104196475086662e-05, + "loss": 0.0927, + "num_input_tokens_seen": 124937232, + "step": 102675 + }, + { + "epoch": 11.435571889965475, + "grad_norm": 0.5840146541595459, + "learning_rate": 2.3101773742442985e-05, + "loss": 0.0165, + "num_input_tokens_seen": 124943376, + "step": 102680 + }, + { + "epoch": 11.436128744849093, + "grad_norm": 0.4142248034477234, + "learning_rate": 2.3099351027729576e-05, + "loss": 0.1334, + "num_input_tokens_seen": 124949840, + "step": 102685 + }, + { + "epoch": 11.43668559973271, + "grad_norm": 0.00043801325955428183, + "learning_rate": 2.309692833096932e-05, + "loss": 0.0529, + "num_input_tokens_seen": 124956112, + "step": 102690 + }, + { + "epoch": 11.437242454616326, + "grad_norm": 0.009686380624771118, + "learning_rate": 2.309450565218509e-05, + "loss": 0.0213, + "num_input_tokens_seen": 124961776, + "step": 102695 + }, + { + "epoch": 11.437799309499944, + "grad_norm": 0.07787992805242538, + "learning_rate": 2.30920829913998e-05, + "loss": 0.0122, + "num_input_tokens_seen": 124967600, + "step": 102700 + }, + { + "epoch": 11.438356164383562, + "grad_norm": 2.0388500690460205, + "learning_rate": 2.3089660348636295e-05, + "loss": 0.0339, + "num_input_tokens_seen": 124973712, + "step": 102705 + }, + { + "epoch": 11.43891301926718, + "grad_norm": 0.08394229412078857, + "learning_rate": 2.3087237723917497e-05, + "loss": 0.0098, + "num_input_tokens_seen": 124979920, + "step": 102710 + }, + { + "epoch": 11.439469874150797, + "grad_norm": 0.833439826965332, + "learning_rate": 2.3084815117266257e-05, + "loss": 0.0611, + "num_input_tokens_seen": 124986352, + "step": 102715 + }, + { + "epoch": 11.440026729034413, + "grad_norm": 1.6525063514709473, + "learning_rate": 2.3082392528705483e-05, + "loss": 0.0188, + "num_input_tokens_seen": 124992528, + "step": 102720 + }, + { + "epoch": 11.44058358391803, + "grad_norm": 0.02292429842054844, + "learning_rate": 2.307996995825804e-05, + "loss": 0.0367, + "num_input_tokens_seen": 124998576, + "step": 102725 + }, + { + "epoch": 11.441140438801648, + "grad_norm": 1.1185301542282104, + "learning_rate": 2.3077547405946824e-05, + "loss": 0.1425, + "num_input_tokens_seen": 125004720, + "step": 102730 + }, + { + "epoch": 11.441697293685266, + "grad_norm": 0.017430569976568222, + "learning_rate": 2.307512487179471e-05, + "loss": 0.009, + "num_input_tokens_seen": 125010928, + "step": 102735 + }, + { + "epoch": 11.442254148568884, + "grad_norm": 0.1408890187740326, + "learning_rate": 2.3072702355824588e-05, + "loss": 0.0442, + "num_input_tokens_seen": 125017328, + "step": 102740 + }, + { + "epoch": 11.4428110034525, + "grad_norm": 0.7176536321640015, + "learning_rate": 2.3070279858059328e-05, + "loss": 0.0337, + "num_input_tokens_seen": 125023120, + "step": 102745 + }, + { + "epoch": 11.443367858336117, + "grad_norm": 0.002143310150131583, + "learning_rate": 2.306785737852183e-05, + "loss": 0.1023, + "num_input_tokens_seen": 125029168, + "step": 102750 + }, + { + "epoch": 11.443924713219735, + "grad_norm": 1.366661787033081, + "learning_rate": 2.3065434917234964e-05, + "loss": 0.1767, + "num_input_tokens_seen": 125035248, + "step": 102755 + }, + { + "epoch": 11.444481568103352, + "grad_norm": 0.0012482701567932963, + "learning_rate": 2.306301247422162e-05, + "loss": 0.002, + "num_input_tokens_seen": 125041712, + "step": 102760 + }, + { + "epoch": 11.44503842298697, + "grad_norm": 0.2569115459918976, + "learning_rate": 2.3060590049504658e-05, + "loss": 0.0042, + "num_input_tokens_seen": 125048144, + "step": 102765 + }, + { + "epoch": 11.445595277870588, + "grad_norm": 1.1363525390625, + "learning_rate": 2.3058167643107e-05, + "loss": 0.051, + "num_input_tokens_seen": 125054320, + "step": 102770 + }, + { + "epoch": 11.446152132754204, + "grad_norm": 9.10170710994862e-05, + "learning_rate": 2.305574525505148e-05, + "loss": 0.0691, + "num_input_tokens_seen": 125059920, + "step": 102775 + }, + { + "epoch": 11.446708987637821, + "grad_norm": 0.11562757939100266, + "learning_rate": 2.305332288536102e-05, + "loss": 0.0733, + "num_input_tokens_seen": 125065968, + "step": 102780 + }, + { + "epoch": 11.447265842521439, + "grad_norm": 0.0002904355642385781, + "learning_rate": 2.305090053405848e-05, + "loss": 0.0715, + "num_input_tokens_seen": 125071952, + "step": 102785 + }, + { + "epoch": 11.447822697405057, + "grad_norm": 0.004331869073212147, + "learning_rate": 2.304847820116675e-05, + "loss": 0.012, + "num_input_tokens_seen": 125078640, + "step": 102790 + }, + { + "epoch": 11.448379552288674, + "grad_norm": 0.17783193290233612, + "learning_rate": 2.3046055886708702e-05, + "loss": 0.1233, + "num_input_tokens_seen": 125084592, + "step": 102795 + }, + { + "epoch": 11.44893640717229, + "grad_norm": 0.7680426239967346, + "learning_rate": 2.304363359070723e-05, + "loss": 0.0947, + "num_input_tokens_seen": 125090416, + "step": 102800 + }, + { + "epoch": 11.449493262055908, + "grad_norm": 1.148679494857788, + "learning_rate": 2.3041211313185197e-05, + "loss": 0.2013, + "num_input_tokens_seen": 125096080, + "step": 102805 + }, + { + "epoch": 11.450050116939526, + "grad_norm": 0.07673797011375427, + "learning_rate": 2.3038789054165497e-05, + "loss": 0.0807, + "num_input_tokens_seen": 125102128, + "step": 102810 + }, + { + "epoch": 11.450606971823143, + "grad_norm": 1.0588942766189575, + "learning_rate": 2.3036366813671002e-05, + "loss": 0.0222, + "num_input_tokens_seen": 125108336, + "step": 102815 + }, + { + "epoch": 11.451163826706761, + "grad_norm": 0.017944661900401115, + "learning_rate": 2.3033944591724603e-05, + "loss": 0.0213, + "num_input_tokens_seen": 125114512, + "step": 102820 + }, + { + "epoch": 11.451720681590377, + "grad_norm": 0.1490589678287506, + "learning_rate": 2.3031522388349158e-05, + "loss": 0.0496, + "num_input_tokens_seen": 125120080, + "step": 102825 + }, + { + "epoch": 11.452277536473995, + "grad_norm": 0.023336004465818405, + "learning_rate": 2.302910020356758e-05, + "loss": 0.0291, + "num_input_tokens_seen": 125126288, + "step": 102830 + }, + { + "epoch": 11.452834391357612, + "grad_norm": 0.01742875762283802, + "learning_rate": 2.3026678037402712e-05, + "loss": 0.0498, + "num_input_tokens_seen": 125132592, + "step": 102835 + }, + { + "epoch": 11.45339124624123, + "grad_norm": 0.014057496562600136, + "learning_rate": 2.302425588987747e-05, + "loss": 0.0538, + "num_input_tokens_seen": 125138832, + "step": 102840 + }, + { + "epoch": 11.453948101124848, + "grad_norm": 0.7194610238075256, + "learning_rate": 2.3021833761014696e-05, + "loss": 0.099, + "num_input_tokens_seen": 125144848, + "step": 102845 + }, + { + "epoch": 11.454504956008464, + "grad_norm": 0.9707866311073303, + "learning_rate": 2.3019411650837293e-05, + "loss": 0.0542, + "num_input_tokens_seen": 125150992, + "step": 102850 + }, + { + "epoch": 11.455061810892081, + "grad_norm": 0.5722935795783997, + "learning_rate": 2.3016989559368134e-05, + "loss": 0.044, + "num_input_tokens_seen": 125157296, + "step": 102855 + }, + { + "epoch": 11.455618665775699, + "grad_norm": 0.13727790117263794, + "learning_rate": 2.30145674866301e-05, + "loss": 0.0177, + "num_input_tokens_seen": 125163312, + "step": 102860 + }, + { + "epoch": 11.456175520659317, + "grad_norm": 0.06772872805595398, + "learning_rate": 2.3012145432646065e-05, + "loss": 0.0593, + "num_input_tokens_seen": 125169264, + "step": 102865 + }, + { + "epoch": 11.456732375542934, + "grad_norm": 0.9812312722206116, + "learning_rate": 2.300972339743891e-05, + "loss": 0.0715, + "num_input_tokens_seen": 125175120, + "step": 102870 + }, + { + "epoch": 11.45728923042655, + "grad_norm": 0.26974397897720337, + "learning_rate": 2.3007301381031512e-05, + "loss": 0.0141, + "num_input_tokens_seen": 125181392, + "step": 102875 + }, + { + "epoch": 11.457846085310168, + "grad_norm": 0.004919544793665409, + "learning_rate": 2.300487938344675e-05, + "loss": 0.0118, + "num_input_tokens_seen": 125187408, + "step": 102880 + }, + { + "epoch": 11.458402940193785, + "grad_norm": 2.5014326572418213, + "learning_rate": 2.3002457404707502e-05, + "loss": 0.0771, + "num_input_tokens_seen": 125193520, + "step": 102885 + }, + { + "epoch": 11.458959795077403, + "grad_norm": 0.02469003200531006, + "learning_rate": 2.300003544483664e-05, + "loss": 0.055, + "num_input_tokens_seen": 125199504, + "step": 102890 + }, + { + "epoch": 11.45951664996102, + "grad_norm": 0.14223720133304596, + "learning_rate": 2.2997613503857048e-05, + "loss": 0.0275, + "num_input_tokens_seen": 125205904, + "step": 102895 + }, + { + "epoch": 11.460073504844637, + "grad_norm": 1.5961970090866089, + "learning_rate": 2.2995191581791602e-05, + "loss": 0.0493, + "num_input_tokens_seen": 125212144, + "step": 102900 + }, + { + "epoch": 11.460630359728254, + "grad_norm": 0.09005820006132126, + "learning_rate": 2.2992769678663177e-05, + "loss": 0.0107, + "num_input_tokens_seen": 125218480, + "step": 102905 + }, + { + "epoch": 11.461187214611872, + "grad_norm": 0.4918476641178131, + "learning_rate": 2.2990347794494642e-05, + "loss": 0.1151, + "num_input_tokens_seen": 125224656, + "step": 102910 + }, + { + "epoch": 11.46174406949549, + "grad_norm": 0.007904729805886745, + "learning_rate": 2.2987925929308895e-05, + "loss": 0.0358, + "num_input_tokens_seen": 125230896, + "step": 102915 + }, + { + "epoch": 11.462300924379107, + "grad_norm": 0.0034834975376725197, + "learning_rate": 2.2985504083128786e-05, + "loss": 0.0185, + "num_input_tokens_seen": 125237200, + "step": 102920 + }, + { + "epoch": 11.462857779262723, + "grad_norm": 1.9503202438354492, + "learning_rate": 2.2983082255977217e-05, + "loss": 0.0767, + "num_input_tokens_seen": 125243056, + "step": 102925 + }, + { + "epoch": 11.463414634146341, + "grad_norm": 0.043619997799396515, + "learning_rate": 2.2980660447877045e-05, + "loss": 0.0264, + "num_input_tokens_seen": 125248816, + "step": 102930 + }, + { + "epoch": 11.463971489029959, + "grad_norm": 9.314184717368335e-05, + "learning_rate": 2.2978238658851158e-05, + "loss": 0.087, + "num_input_tokens_seen": 125255024, + "step": 102935 + }, + { + "epoch": 11.464528343913576, + "grad_norm": 0.0029373527504503727, + "learning_rate": 2.297581688892242e-05, + "loss": 0.0173, + "num_input_tokens_seen": 125261264, + "step": 102940 + }, + { + "epoch": 11.465085198797194, + "grad_norm": 0.010361706838011742, + "learning_rate": 2.2973395138113725e-05, + "loss": 0.0252, + "num_input_tokens_seen": 125267312, + "step": 102945 + }, + { + "epoch": 11.46564205368081, + "grad_norm": 0.02103372849524021, + "learning_rate": 2.2970973406447923e-05, + "loss": 0.0209, + "num_input_tokens_seen": 125273584, + "step": 102950 + }, + { + "epoch": 11.466198908564428, + "grad_norm": 0.12136673182249069, + "learning_rate": 2.2968551693947912e-05, + "loss": 0.0244, + "num_input_tokens_seen": 125279312, + "step": 102955 + }, + { + "epoch": 11.466755763448045, + "grad_norm": 0.16173513233661652, + "learning_rate": 2.2966130000636554e-05, + "loss": 0.0535, + "num_input_tokens_seen": 125285520, + "step": 102960 + }, + { + "epoch": 11.467312618331663, + "grad_norm": 0.6570031046867371, + "learning_rate": 2.296370832653673e-05, + "loss": 0.0094, + "num_input_tokens_seen": 125291728, + "step": 102965 + }, + { + "epoch": 11.46786947321528, + "grad_norm": 1.047845721244812, + "learning_rate": 2.2961286671671304e-05, + "loss": 0.0579, + "num_input_tokens_seen": 125297744, + "step": 102970 + }, + { + "epoch": 11.468426328098898, + "grad_norm": 0.9307522773742676, + "learning_rate": 2.2958865036063172e-05, + "loss": 0.0958, + "num_input_tokens_seen": 125304048, + "step": 102975 + }, + { + "epoch": 11.468983182982514, + "grad_norm": 0.9027802348136902, + "learning_rate": 2.2956443419735183e-05, + "loss": 0.0451, + "num_input_tokens_seen": 125310224, + "step": 102980 + }, + { + "epoch": 11.469540037866132, + "grad_norm": 1.798832654953003, + "learning_rate": 2.2954021822710235e-05, + "loss": 0.0485, + "num_input_tokens_seen": 125316272, + "step": 102985 + }, + { + "epoch": 11.47009689274975, + "grad_norm": 0.5411168336868286, + "learning_rate": 2.295160024501118e-05, + "loss": 0.0134, + "num_input_tokens_seen": 125321744, + "step": 102990 + }, + { + "epoch": 11.470653747633367, + "grad_norm": 0.0632295235991478, + "learning_rate": 2.2949178686660906e-05, + "loss": 0.0172, + "num_input_tokens_seen": 125327728, + "step": 102995 + }, + { + "epoch": 11.471210602516985, + "grad_norm": 0.24567535519599915, + "learning_rate": 2.294675714768228e-05, + "loss": 0.0713, + "num_input_tokens_seen": 125333680, + "step": 103000 + }, + { + "epoch": 11.4717674574006, + "grad_norm": 0.5447579622268677, + "learning_rate": 2.2944335628098182e-05, + "loss": 0.0111, + "num_input_tokens_seen": 125339760, + "step": 103005 + }, + { + "epoch": 11.472324312284218, + "grad_norm": 0.0044069136492908, + "learning_rate": 2.294191412793148e-05, + "loss": 0.0229, + "num_input_tokens_seen": 125346128, + "step": 103010 + }, + { + "epoch": 11.472881167167836, + "grad_norm": 0.4510234594345093, + "learning_rate": 2.2939492647205045e-05, + "loss": 0.0494, + "num_input_tokens_seen": 125352240, + "step": 103015 + }, + { + "epoch": 11.473438022051454, + "grad_norm": 4.373502731323242, + "learning_rate": 2.2937071185941755e-05, + "loss": 0.1051, + "num_input_tokens_seen": 125357584, + "step": 103020 + }, + { + "epoch": 11.473994876935071, + "grad_norm": 1.6369829177856445, + "learning_rate": 2.2934649744164482e-05, + "loss": 0.1474, + "num_input_tokens_seen": 125362960, + "step": 103025 + }, + { + "epoch": 11.474551731818687, + "grad_norm": 9.608723485143855e-05, + "learning_rate": 2.293222832189609e-05, + "loss": 0.031, + "num_input_tokens_seen": 125368656, + "step": 103030 + }, + { + "epoch": 11.475108586702305, + "grad_norm": 0.12501607835292816, + "learning_rate": 2.2929806919159468e-05, + "loss": 0.1087, + "num_input_tokens_seen": 125374608, + "step": 103035 + }, + { + "epoch": 11.475665441585923, + "grad_norm": 0.07695776224136353, + "learning_rate": 2.2927385535977467e-05, + "loss": 0.0039, + "num_input_tokens_seen": 125380560, + "step": 103040 + }, + { + "epoch": 11.47622229646954, + "grad_norm": 0.12300772219896317, + "learning_rate": 2.2924964172372983e-05, + "loss": 0.0063, + "num_input_tokens_seen": 125386800, + "step": 103045 + }, + { + "epoch": 11.476779151353158, + "grad_norm": 0.00019323575543239713, + "learning_rate": 2.2922542828368862e-05, + "loss": 0.0134, + "num_input_tokens_seen": 125393008, + "step": 103050 + }, + { + "epoch": 11.477336006236774, + "grad_norm": 0.5125002264976501, + "learning_rate": 2.2920121503987997e-05, + "loss": 0.0817, + "num_input_tokens_seen": 125399152, + "step": 103055 + }, + { + "epoch": 11.477892861120392, + "grad_norm": 0.6618111729621887, + "learning_rate": 2.2917700199253248e-05, + "loss": 0.0226, + "num_input_tokens_seen": 125405264, + "step": 103060 + }, + { + "epoch": 11.47844971600401, + "grad_norm": 1.0885694026947021, + "learning_rate": 2.2915278914187492e-05, + "loss": 0.0438, + "num_input_tokens_seen": 125412048, + "step": 103065 + }, + { + "epoch": 11.479006570887627, + "grad_norm": 0.2774781584739685, + "learning_rate": 2.291285764881359e-05, + "loss": 0.0923, + "num_input_tokens_seen": 125417488, + "step": 103070 + }, + { + "epoch": 11.479563425771245, + "grad_norm": 0.9867200255393982, + "learning_rate": 2.2910436403154427e-05, + "loss": 0.0552, + "num_input_tokens_seen": 125423408, + "step": 103075 + }, + { + "epoch": 11.48012028065486, + "grad_norm": 0.8104636669158936, + "learning_rate": 2.2908015177232865e-05, + "loss": 0.0473, + "num_input_tokens_seen": 125429168, + "step": 103080 + }, + { + "epoch": 11.480677135538478, + "grad_norm": 0.2627895474433899, + "learning_rate": 2.2905593971071775e-05, + "loss": 0.044, + "num_input_tokens_seen": 125435376, + "step": 103085 + }, + { + "epoch": 11.481233990422096, + "grad_norm": 0.0009248016285710037, + "learning_rate": 2.2903172784694024e-05, + "loss": 0.0206, + "num_input_tokens_seen": 125441488, + "step": 103090 + }, + { + "epoch": 11.481790845305714, + "grad_norm": 0.023794416338205338, + "learning_rate": 2.2900751618122492e-05, + "loss": 0.0372, + "num_input_tokens_seen": 125447632, + "step": 103095 + }, + { + "epoch": 11.482347700189331, + "grad_norm": 0.43084755539894104, + "learning_rate": 2.2898330471380035e-05, + "loss": 0.0325, + "num_input_tokens_seen": 125453616, + "step": 103100 + }, + { + "epoch": 11.482904555072947, + "grad_norm": 0.3160298466682434, + "learning_rate": 2.289590934448954e-05, + "loss": 0.0083, + "num_input_tokens_seen": 125459920, + "step": 103105 + }, + { + "epoch": 11.483461409956565, + "grad_norm": 0.04316478595137596, + "learning_rate": 2.2893488237473856e-05, + "loss": 0.0696, + "num_input_tokens_seen": 125466128, + "step": 103110 + }, + { + "epoch": 11.484018264840183, + "grad_norm": 0.0006858175038360059, + "learning_rate": 2.289106715035588e-05, + "loss": 0.0652, + "num_input_tokens_seen": 125472272, + "step": 103115 + }, + { + "epoch": 11.4845751197238, + "grad_norm": 0.05028039216995239, + "learning_rate": 2.2888646083158444e-05, + "loss": 0.0075, + "num_input_tokens_seen": 125478224, + "step": 103120 + }, + { + "epoch": 11.485131974607418, + "grad_norm": 1.3646821975708008, + "learning_rate": 2.2886225035904452e-05, + "loss": 0.0242, + "num_input_tokens_seen": 125484272, + "step": 103125 + }, + { + "epoch": 11.485688829491036, + "grad_norm": 0.11072377115488052, + "learning_rate": 2.288380400861675e-05, + "loss": 0.049, + "num_input_tokens_seen": 125489776, + "step": 103130 + }, + { + "epoch": 11.486245684374651, + "grad_norm": 0.029807303100824356, + "learning_rate": 2.288138300131822e-05, + "loss": 0.0583, + "num_input_tokens_seen": 125496464, + "step": 103135 + }, + { + "epoch": 11.486802539258269, + "grad_norm": 0.011123386211693287, + "learning_rate": 2.2878962014031723e-05, + "loss": 0.0082, + "num_input_tokens_seen": 125502416, + "step": 103140 + }, + { + "epoch": 11.487359394141887, + "grad_norm": 1.3463869094848633, + "learning_rate": 2.287654104678013e-05, + "loss": 0.0505, + "num_input_tokens_seen": 125508400, + "step": 103145 + }, + { + "epoch": 11.487916249025504, + "grad_norm": 0.7157670259475708, + "learning_rate": 2.2874120099586307e-05, + "loss": 0.0317, + "num_input_tokens_seen": 125514128, + "step": 103150 + }, + { + "epoch": 11.488473103909122, + "grad_norm": 2.1850368976593018, + "learning_rate": 2.2871699172473127e-05, + "loss": 0.0144, + "num_input_tokens_seen": 125520048, + "step": 103155 + }, + { + "epoch": 11.489029958792738, + "grad_norm": 0.03422904759645462, + "learning_rate": 2.2869278265463447e-05, + "loss": 0.0173, + "num_input_tokens_seen": 125526352, + "step": 103160 + }, + { + "epoch": 11.489586813676356, + "grad_norm": 0.004264109767973423, + "learning_rate": 2.2866857378580148e-05, + "loss": 0.0186, + "num_input_tokens_seen": 125532400, + "step": 103165 + }, + { + "epoch": 11.490143668559973, + "grad_norm": 0.024506907910108566, + "learning_rate": 2.286443651184608e-05, + "loss": 0.0175, + "num_input_tokens_seen": 125538448, + "step": 103170 + }, + { + "epoch": 11.490700523443591, + "grad_norm": 0.0035720309242606163, + "learning_rate": 2.2862015665284132e-05, + "loss": 0.156, + "num_input_tokens_seen": 125544880, + "step": 103175 + }, + { + "epoch": 11.491257378327209, + "grad_norm": 0.06478382647037506, + "learning_rate": 2.2859594838917146e-05, + "loss": 0.1274, + "num_input_tokens_seen": 125551184, + "step": 103180 + }, + { + "epoch": 11.491814233210825, + "grad_norm": 0.0007667737081646919, + "learning_rate": 2.285717403276801e-05, + "loss": 0.0174, + "num_input_tokens_seen": 125557328, + "step": 103185 + }, + { + "epoch": 11.492371088094442, + "grad_norm": 1.1426119804382324, + "learning_rate": 2.285475324685958e-05, + "loss": 0.102, + "num_input_tokens_seen": 125563536, + "step": 103190 + }, + { + "epoch": 11.49292794297806, + "grad_norm": 0.007697213441133499, + "learning_rate": 2.2852332481214724e-05, + "loss": 0.0469, + "num_input_tokens_seen": 125569520, + "step": 103195 + }, + { + "epoch": 11.493484797861678, + "grad_norm": 1.1511796712875366, + "learning_rate": 2.2849911735856308e-05, + "loss": 0.0728, + "num_input_tokens_seen": 125575248, + "step": 103200 + }, + { + "epoch": 11.494041652745295, + "grad_norm": 0.0071665821596980095, + "learning_rate": 2.2847491010807205e-05, + "loss": 0.0613, + "num_input_tokens_seen": 125581200, + "step": 103205 + }, + { + "epoch": 11.494598507628911, + "grad_norm": 0.04359883442521095, + "learning_rate": 2.2845070306090264e-05, + "loss": 0.0537, + "num_input_tokens_seen": 125587088, + "step": 103210 + }, + { + "epoch": 11.495155362512529, + "grad_norm": 0.06748894602060318, + "learning_rate": 2.2842649621728368e-05, + "loss": 0.1496, + "num_input_tokens_seen": 125593232, + "step": 103215 + }, + { + "epoch": 11.495712217396147, + "grad_norm": 0.00044563456322066486, + "learning_rate": 2.284022895774437e-05, + "loss": 0.0753, + "num_input_tokens_seen": 125599344, + "step": 103220 + }, + { + "epoch": 11.496269072279764, + "grad_norm": 1.2353870868682861, + "learning_rate": 2.2837808314161144e-05, + "loss": 0.1318, + "num_input_tokens_seen": 125605424, + "step": 103225 + }, + { + "epoch": 11.496825927163382, + "grad_norm": 0.12264347821474075, + "learning_rate": 2.2835387691001543e-05, + "loss": 0.0177, + "num_input_tokens_seen": 125611792, + "step": 103230 + }, + { + "epoch": 11.497382782046998, + "grad_norm": 0.08287223428487778, + "learning_rate": 2.2832967088288453e-05, + "loss": 0.0027, + "num_input_tokens_seen": 125618000, + "step": 103235 + }, + { + "epoch": 11.497939636930615, + "grad_norm": 1.8194465637207031, + "learning_rate": 2.2830546506044707e-05, + "loss": 0.1105, + "num_input_tokens_seen": 125624112, + "step": 103240 + }, + { + "epoch": 11.498496491814233, + "grad_norm": 0.021929923444986343, + "learning_rate": 2.2828125944293198e-05, + "loss": 0.0587, + "num_input_tokens_seen": 125630448, + "step": 103245 + }, + { + "epoch": 11.49905334669785, + "grad_norm": 0.743765652179718, + "learning_rate": 2.282570540305678e-05, + "loss": 0.0151, + "num_input_tokens_seen": 125637040, + "step": 103250 + }, + { + "epoch": 11.499610201581469, + "grad_norm": 0.20232069492340088, + "learning_rate": 2.2823284882358316e-05, + "loss": 0.1427, + "num_input_tokens_seen": 125643088, + "step": 103255 + }, + { + "epoch": 11.500167056465084, + "grad_norm": 0.08948709070682526, + "learning_rate": 2.2820864382220668e-05, + "loss": 0.0229, + "num_input_tokens_seen": 125649168, + "step": 103260 + }, + { + "epoch": 11.500723911348702, + "grad_norm": 0.004194573499262333, + "learning_rate": 2.2818443902666707e-05, + "loss": 0.0805, + "num_input_tokens_seen": 125655184, + "step": 103265 + }, + { + "epoch": 11.50128076623232, + "grad_norm": 0.15967917442321777, + "learning_rate": 2.2816023443719283e-05, + "loss": 0.1028, + "num_input_tokens_seen": 125661104, + "step": 103270 + }, + { + "epoch": 11.501837621115937, + "grad_norm": 1.2760708332061768, + "learning_rate": 2.2813603005401278e-05, + "loss": 0.0966, + "num_input_tokens_seen": 125667248, + "step": 103275 + }, + { + "epoch": 11.502394475999555, + "grad_norm": 1.3537849187850952, + "learning_rate": 2.2811182587735535e-05, + "loss": 0.1414, + "num_input_tokens_seen": 125673136, + "step": 103280 + }, + { + "epoch": 11.502951330883171, + "grad_norm": 0.570730984210968, + "learning_rate": 2.2808762190744932e-05, + "loss": 0.0293, + "num_input_tokens_seen": 125679312, + "step": 103285 + }, + { + "epoch": 11.503508185766789, + "grad_norm": 0.03651067614555359, + "learning_rate": 2.280634181445232e-05, + "loss": 0.0339, + "num_input_tokens_seen": 125685520, + "step": 103290 + }, + { + "epoch": 11.504065040650406, + "grad_norm": 0.849481463432312, + "learning_rate": 2.280392145888057e-05, + "loss": 0.0219, + "num_input_tokens_seen": 125691888, + "step": 103295 + }, + { + "epoch": 11.504621895534024, + "grad_norm": 1.3254369497299194, + "learning_rate": 2.280150112405255e-05, + "loss": 0.0895, + "num_input_tokens_seen": 125698288, + "step": 103300 + }, + { + "epoch": 11.505178750417642, + "grad_norm": 0.30576035380363464, + "learning_rate": 2.27990808099911e-05, + "loss": 0.0278, + "num_input_tokens_seen": 125704400, + "step": 103305 + }, + { + "epoch": 11.505735605301258, + "grad_norm": 0.00039286381797865033, + "learning_rate": 2.2796660516719102e-05, + "loss": 0.0159, + "num_input_tokens_seen": 125710544, + "step": 103310 + }, + { + "epoch": 11.506292460184875, + "grad_norm": 0.8220914602279663, + "learning_rate": 2.27942402442594e-05, + "loss": 0.0235, + "num_input_tokens_seen": 125716688, + "step": 103315 + }, + { + "epoch": 11.506849315068493, + "grad_norm": 0.6679263710975647, + "learning_rate": 2.2791819992634885e-05, + "loss": 0.0076, + "num_input_tokens_seen": 125722768, + "step": 103320 + }, + { + "epoch": 11.50740616995211, + "grad_norm": 0.03494039177894592, + "learning_rate": 2.2789399761868382e-05, + "loss": 0.0754, + "num_input_tokens_seen": 125729040, + "step": 103325 + }, + { + "epoch": 11.507963024835728, + "grad_norm": 0.00020822246733587235, + "learning_rate": 2.278697955198278e-05, + "loss": 0.0347, + "num_input_tokens_seen": 125735632, + "step": 103330 + }, + { + "epoch": 11.508519879719344, + "grad_norm": 0.1743757575750351, + "learning_rate": 2.278455936300092e-05, + "loss": 0.0687, + "num_input_tokens_seen": 125741456, + "step": 103335 + }, + { + "epoch": 11.509076734602962, + "grad_norm": 0.023062633350491524, + "learning_rate": 2.278213919494568e-05, + "loss": 0.1826, + "num_input_tokens_seen": 125747536, + "step": 103340 + }, + { + "epoch": 11.50963358948658, + "grad_norm": 0.28699666261672974, + "learning_rate": 2.277971904783991e-05, + "loss": 0.0427, + "num_input_tokens_seen": 125753296, + "step": 103345 + }, + { + "epoch": 11.510190444370197, + "grad_norm": 0.8070105314254761, + "learning_rate": 2.2777298921706475e-05, + "loss": 0.0547, + "num_input_tokens_seen": 125759600, + "step": 103350 + }, + { + "epoch": 11.510747299253815, + "grad_norm": 0.8259128332138062, + "learning_rate": 2.2774878816568226e-05, + "loss": 0.0417, + "num_input_tokens_seen": 125766160, + "step": 103355 + }, + { + "epoch": 11.511304154137433, + "grad_norm": 0.08670564740896225, + "learning_rate": 2.277245873244804e-05, + "loss": 0.0364, + "num_input_tokens_seen": 125772560, + "step": 103360 + }, + { + "epoch": 11.511861009021048, + "grad_norm": 0.008915826678276062, + "learning_rate": 2.2770038669368753e-05, + "loss": 0.0022, + "num_input_tokens_seen": 125778672, + "step": 103365 + }, + { + "epoch": 11.512417863904666, + "grad_norm": 1.4306774139404297, + "learning_rate": 2.2767618627353246e-05, + "loss": 0.0795, + "num_input_tokens_seen": 125785072, + "step": 103370 + }, + { + "epoch": 11.512974718788284, + "grad_norm": 0.27169087529182434, + "learning_rate": 2.276519860642436e-05, + "loss": 0.0341, + "num_input_tokens_seen": 125791152, + "step": 103375 + }, + { + "epoch": 11.513531573671901, + "grad_norm": 0.00011153309606015682, + "learning_rate": 2.2762778606604978e-05, + "loss": 0.0086, + "num_input_tokens_seen": 125797232, + "step": 103380 + }, + { + "epoch": 11.51408842855552, + "grad_norm": 1.4432004690170288, + "learning_rate": 2.2760358627917926e-05, + "loss": 0.0761, + "num_input_tokens_seen": 125803312, + "step": 103385 + }, + { + "epoch": 11.514645283439135, + "grad_norm": 0.31568706035614014, + "learning_rate": 2.27579386703861e-05, + "loss": 0.0247, + "num_input_tokens_seen": 125809520, + "step": 103390 + }, + { + "epoch": 11.515202138322753, + "grad_norm": 0.2334991693496704, + "learning_rate": 2.2755518734032327e-05, + "loss": 0.0284, + "num_input_tokens_seen": 125815568, + "step": 103395 + }, + { + "epoch": 11.51575899320637, + "grad_norm": 0.3221968412399292, + "learning_rate": 2.2753098818879485e-05, + "loss": 0.0302, + "num_input_tokens_seen": 125821744, + "step": 103400 + }, + { + "epoch": 11.516315848089988, + "grad_norm": 0.3957216143608093, + "learning_rate": 2.275067892495042e-05, + "loss": 0.0102, + "num_input_tokens_seen": 125827984, + "step": 103405 + }, + { + "epoch": 11.516872702973606, + "grad_norm": 0.001689517986960709, + "learning_rate": 2.2748259052268e-05, + "loss": 0.0046, + "num_input_tokens_seen": 125834000, + "step": 103410 + }, + { + "epoch": 11.517429557857222, + "grad_norm": 0.14246422052383423, + "learning_rate": 2.274583920085507e-05, + "loss": 0.0029, + "num_input_tokens_seen": 125840528, + "step": 103415 + }, + { + "epoch": 11.51798641274084, + "grad_norm": 0.17282739281654358, + "learning_rate": 2.2743419370734505e-05, + "loss": 0.0343, + "num_input_tokens_seen": 125846544, + "step": 103420 + }, + { + "epoch": 11.518543267624457, + "grad_norm": 0.0010182517580688, + "learning_rate": 2.274099956192914e-05, + "loss": 0.0281, + "num_input_tokens_seen": 125852432, + "step": 103425 + }, + { + "epoch": 11.519100122508075, + "grad_norm": 0.4366588592529297, + "learning_rate": 2.2738579774461853e-05, + "loss": 0.0203, + "num_input_tokens_seen": 125858448, + "step": 103430 + }, + { + "epoch": 11.519656977391692, + "grad_norm": 0.5390165448188782, + "learning_rate": 2.273616000835549e-05, + "loss": 0.0115, + "num_input_tokens_seen": 125864304, + "step": 103435 + }, + { + "epoch": 11.520213832275308, + "grad_norm": 0.0027623921632766724, + "learning_rate": 2.273374026363291e-05, + "loss": 0.012, + "num_input_tokens_seen": 125870320, + "step": 103440 + }, + { + "epoch": 11.520770687158926, + "grad_norm": 0.05339831858873367, + "learning_rate": 2.273132054031696e-05, + "loss": 0.0676, + "num_input_tokens_seen": 125876432, + "step": 103445 + }, + { + "epoch": 11.521327542042544, + "grad_norm": 0.5093810558319092, + "learning_rate": 2.272890083843052e-05, + "loss": 0.0324, + "num_input_tokens_seen": 125882384, + "step": 103450 + }, + { + "epoch": 11.521884396926161, + "grad_norm": 0.02673182263970375, + "learning_rate": 2.2726481157996417e-05, + "loss": 0.0161, + "num_input_tokens_seen": 125888400, + "step": 103455 + }, + { + "epoch": 11.522441251809779, + "grad_norm": 0.7904255986213684, + "learning_rate": 2.272406149903753e-05, + "loss": 0.1242, + "num_input_tokens_seen": 125894736, + "step": 103460 + }, + { + "epoch": 11.522998106693397, + "grad_norm": 1.1659103631973267, + "learning_rate": 2.27216418615767e-05, + "loss": 0.0823, + "num_input_tokens_seen": 125901136, + "step": 103465 + }, + { + "epoch": 11.523554961577013, + "grad_norm": 0.020742537453770638, + "learning_rate": 2.2719222245636795e-05, + "loss": 0.0021, + "num_input_tokens_seen": 125907472, + "step": 103470 + }, + { + "epoch": 11.52411181646063, + "grad_norm": 0.0004458249022718519, + "learning_rate": 2.2716802651240656e-05, + "loss": 0.0039, + "num_input_tokens_seen": 125913552, + "step": 103475 + }, + { + "epoch": 11.524668671344248, + "grad_norm": 2.9614102840423584, + "learning_rate": 2.2714383078411152e-05, + "loss": 0.0784, + "num_input_tokens_seen": 125919920, + "step": 103480 + }, + { + "epoch": 11.525225526227866, + "grad_norm": 0.6366034746170044, + "learning_rate": 2.2711963527171125e-05, + "loss": 0.0882, + "num_input_tokens_seen": 125925872, + "step": 103485 + }, + { + "epoch": 11.525782381111483, + "grad_norm": 0.1146748885512352, + "learning_rate": 2.2709543997543442e-05, + "loss": 0.0489, + "num_input_tokens_seen": 125932112, + "step": 103490 + }, + { + "epoch": 11.5263392359951, + "grad_norm": 0.006445586681365967, + "learning_rate": 2.2707124489550942e-05, + "loss": 0.057, + "num_input_tokens_seen": 125938064, + "step": 103495 + }, + { + "epoch": 11.526896090878717, + "grad_norm": 0.6641862392425537, + "learning_rate": 2.27047050032165e-05, + "loss": 0.1061, + "num_input_tokens_seen": 125944432, + "step": 103500 + }, + { + "epoch": 11.527452945762334, + "grad_norm": 0.6419098973274231, + "learning_rate": 2.270228553856294e-05, + "loss": 0.0719, + "num_input_tokens_seen": 125950448, + "step": 103505 + }, + { + "epoch": 11.528009800645952, + "grad_norm": 0.02956344559788704, + "learning_rate": 2.2699866095613157e-05, + "loss": 0.0102, + "num_input_tokens_seen": 125956624, + "step": 103510 + }, + { + "epoch": 11.52856665552957, + "grad_norm": 0.6548739671707153, + "learning_rate": 2.269744667438996e-05, + "loss": 0.0285, + "num_input_tokens_seen": 125962864, + "step": 103515 + }, + { + "epoch": 11.529123510413186, + "grad_norm": 0.9856891632080078, + "learning_rate": 2.2695027274916238e-05, + "loss": 0.1285, + "num_input_tokens_seen": 125968528, + "step": 103520 + }, + { + "epoch": 11.529680365296803, + "grad_norm": 0.000735876674298197, + "learning_rate": 2.2692607897214824e-05, + "loss": 0.0056, + "num_input_tokens_seen": 125974672, + "step": 103525 + }, + { + "epoch": 11.530237220180421, + "grad_norm": 1.2607098817825317, + "learning_rate": 2.269018854130858e-05, + "loss": 0.0277, + "num_input_tokens_seen": 125980624, + "step": 103530 + }, + { + "epoch": 11.530794075064039, + "grad_norm": 0.2256370633840561, + "learning_rate": 2.2687769207220354e-05, + "loss": 0.0191, + "num_input_tokens_seen": 125986864, + "step": 103535 + }, + { + "epoch": 11.531350929947656, + "grad_norm": 0.00025201571406796575, + "learning_rate": 2.2685349894973008e-05, + "loss": 0.0386, + "num_input_tokens_seen": 125992944, + "step": 103540 + }, + { + "epoch": 11.531907784831272, + "grad_norm": 0.043983008712530136, + "learning_rate": 2.2682930604589375e-05, + "loss": 0.0443, + "num_input_tokens_seen": 125998960, + "step": 103545 + }, + { + "epoch": 11.53246463971489, + "grad_norm": 0.5815739035606384, + "learning_rate": 2.2680511336092327e-05, + "loss": 0.0258, + "num_input_tokens_seen": 126005008, + "step": 103550 + }, + { + "epoch": 11.533021494598508, + "grad_norm": 1.1903401613235474, + "learning_rate": 2.2678092089504705e-05, + "loss": 0.0819, + "num_input_tokens_seen": 126011088, + "step": 103555 + }, + { + "epoch": 11.533578349482125, + "grad_norm": 0.0011596173280850053, + "learning_rate": 2.2675672864849364e-05, + "loss": 0.1312, + "num_input_tokens_seen": 126017424, + "step": 103560 + }, + { + "epoch": 11.534135204365743, + "grad_norm": 0.22396203875541687, + "learning_rate": 2.267325366214915e-05, + "loss": 0.0162, + "num_input_tokens_seen": 126022864, + "step": 103565 + }, + { + "epoch": 11.534692059249359, + "grad_norm": 0.11186105757951736, + "learning_rate": 2.2670834481426927e-05, + "loss": 0.0065, + "num_input_tokens_seen": 126028944, + "step": 103570 + }, + { + "epoch": 11.535248914132977, + "grad_norm": 0.9755914807319641, + "learning_rate": 2.266841532270553e-05, + "loss": 0.0733, + "num_input_tokens_seen": 126035120, + "step": 103575 + }, + { + "epoch": 11.535805769016594, + "grad_norm": 0.05647272616624832, + "learning_rate": 2.266599618600783e-05, + "loss": 0.0152, + "num_input_tokens_seen": 126041488, + "step": 103580 + }, + { + "epoch": 11.536362623900212, + "grad_norm": 0.03259708732366562, + "learning_rate": 2.2663577071356652e-05, + "loss": 0.0091, + "num_input_tokens_seen": 126047504, + "step": 103585 + }, + { + "epoch": 11.53691947878383, + "grad_norm": 0.16160830855369568, + "learning_rate": 2.2661157978774873e-05, + "loss": 0.054, + "num_input_tokens_seen": 126053744, + "step": 103590 + }, + { + "epoch": 11.537476333667446, + "grad_norm": 0.0004917010082863271, + "learning_rate": 2.265873890828532e-05, + "loss": 0.0242, + "num_input_tokens_seen": 126060240, + "step": 103595 + }, + { + "epoch": 11.538033188551063, + "grad_norm": 0.35643482208251953, + "learning_rate": 2.2656319859910864e-05, + "loss": 0.0993, + "num_input_tokens_seen": 126065840, + "step": 103600 + }, + { + "epoch": 11.53859004343468, + "grad_norm": 0.185659259557724, + "learning_rate": 2.2653900833674336e-05, + "loss": 0.0205, + "num_input_tokens_seen": 126072240, + "step": 103605 + }, + { + "epoch": 11.539146898318299, + "grad_norm": 0.24586105346679688, + "learning_rate": 2.2651481829598603e-05, + "loss": 0.0478, + "num_input_tokens_seen": 126078032, + "step": 103610 + }, + { + "epoch": 11.539703753201916, + "grad_norm": 0.06931168586015701, + "learning_rate": 2.2649062847706497e-05, + "loss": 0.0232, + "num_input_tokens_seen": 126084528, + "step": 103615 + }, + { + "epoch": 11.540260608085532, + "grad_norm": 0.022244054824113846, + "learning_rate": 2.2646643888020884e-05, + "loss": 0.0274, + "num_input_tokens_seen": 126090704, + "step": 103620 + }, + { + "epoch": 11.54081746296915, + "grad_norm": 0.04456007108092308, + "learning_rate": 2.26442249505646e-05, + "loss": 0.0306, + "num_input_tokens_seen": 126096304, + "step": 103625 + }, + { + "epoch": 11.541374317852767, + "grad_norm": 0.27144256234169006, + "learning_rate": 2.2641806035360502e-05, + "loss": 0.0033, + "num_input_tokens_seen": 126102544, + "step": 103630 + }, + { + "epoch": 11.541931172736385, + "grad_norm": 0.21700555086135864, + "learning_rate": 2.2639387142431422e-05, + "loss": 0.001, + "num_input_tokens_seen": 126109008, + "step": 103635 + }, + { + "epoch": 11.542488027620003, + "grad_norm": 0.0012346365256235003, + "learning_rate": 2.2636968271800245e-05, + "loss": 0.0347, + "num_input_tokens_seen": 126114736, + "step": 103640 + }, + { + "epoch": 11.543044882503619, + "grad_norm": 1.1290524005889893, + "learning_rate": 2.2634549423489777e-05, + "loss": 0.1388, + "num_input_tokens_seen": 126120752, + "step": 103645 + }, + { + "epoch": 11.543601737387236, + "grad_norm": 1.2303884029388428, + "learning_rate": 2.263213059752289e-05, + "loss": 0.2007, + "num_input_tokens_seen": 126126960, + "step": 103650 + }, + { + "epoch": 11.544158592270854, + "grad_norm": 1.610315203666687, + "learning_rate": 2.2629711793922428e-05, + "loss": 0.0477, + "num_input_tokens_seen": 126132720, + "step": 103655 + }, + { + "epoch": 11.544715447154472, + "grad_norm": 0.6622666120529175, + "learning_rate": 2.262729301271124e-05, + "loss": 0.0433, + "num_input_tokens_seen": 126138896, + "step": 103660 + }, + { + "epoch": 11.54527230203809, + "grad_norm": 0.2434951812028885, + "learning_rate": 2.2624874253912166e-05, + "loss": 0.0371, + "num_input_tokens_seen": 126144784, + "step": 103665 + }, + { + "epoch": 11.545829156921705, + "grad_norm": 0.011882806196808815, + "learning_rate": 2.262245551754806e-05, + "loss": 0.0629, + "num_input_tokens_seen": 126150992, + "step": 103670 + }, + { + "epoch": 11.546386011805323, + "grad_norm": 0.0811966061592102, + "learning_rate": 2.2620036803641766e-05, + "loss": 0.0763, + "num_input_tokens_seen": 126156912, + "step": 103675 + }, + { + "epoch": 11.54694286668894, + "grad_norm": 0.00017694233974907547, + "learning_rate": 2.2617618112216132e-05, + "loss": 0.015, + "num_input_tokens_seen": 126163120, + "step": 103680 + }, + { + "epoch": 11.547499721572558, + "grad_norm": 0.0007336260168813169, + "learning_rate": 2.2615199443294002e-05, + "loss": 0.0302, + "num_input_tokens_seen": 126169232, + "step": 103685 + }, + { + "epoch": 11.548056576456176, + "grad_norm": 1.1828514337539673, + "learning_rate": 2.261278079689823e-05, + "loss": 0.0504, + "num_input_tokens_seen": 126175472, + "step": 103690 + }, + { + "epoch": 11.548613431339794, + "grad_norm": 0.0019015985308215022, + "learning_rate": 2.2610362173051642e-05, + "loss": 0.1225, + "num_input_tokens_seen": 126181520, + "step": 103695 + }, + { + "epoch": 11.54917028622341, + "grad_norm": 0.11352366954088211, + "learning_rate": 2.260794357177711e-05, + "loss": 0.0503, + "num_input_tokens_seen": 126187024, + "step": 103700 + }, + { + "epoch": 11.549727141107027, + "grad_norm": 0.25832271575927734, + "learning_rate": 2.260552499309747e-05, + "loss": 0.0218, + "num_input_tokens_seen": 126193008, + "step": 103705 + }, + { + "epoch": 11.550283995990645, + "grad_norm": 0.003156037535518408, + "learning_rate": 2.2603106437035557e-05, + "loss": 0.1342, + "num_input_tokens_seen": 126199024, + "step": 103710 + }, + { + "epoch": 11.550840850874263, + "grad_norm": 0.35553863644599915, + "learning_rate": 2.260068790361423e-05, + "loss": 0.0644, + "num_input_tokens_seen": 126205360, + "step": 103715 + }, + { + "epoch": 11.55139770575788, + "grad_norm": 0.09714750200510025, + "learning_rate": 2.2598269392856312e-05, + "loss": 0.0552, + "num_input_tokens_seen": 126211600, + "step": 103720 + }, + { + "epoch": 11.551954560641496, + "grad_norm": 1.0526198148727417, + "learning_rate": 2.2595850904784685e-05, + "loss": 0.0505, + "num_input_tokens_seen": 126217296, + "step": 103725 + }, + { + "epoch": 11.552511415525114, + "grad_norm": 1.298233985900879, + "learning_rate": 2.2593432439422157e-05, + "loss": 0.0665, + "num_input_tokens_seen": 126223504, + "step": 103730 + }, + { + "epoch": 11.553068270408732, + "grad_norm": 1.1734302043914795, + "learning_rate": 2.2591013996791598e-05, + "loss": 0.0448, + "num_input_tokens_seen": 126229808, + "step": 103735 + }, + { + "epoch": 11.55362512529235, + "grad_norm": 0.0006599921034649014, + "learning_rate": 2.2588595576915833e-05, + "loss": 0.0578, + "num_input_tokens_seen": 126236176, + "step": 103740 + }, + { + "epoch": 11.554181980175967, + "grad_norm": 0.7563011050224304, + "learning_rate": 2.2586177179817723e-05, + "loss": 0.0466, + "num_input_tokens_seen": 126242096, + "step": 103745 + }, + { + "epoch": 11.554738835059583, + "grad_norm": 0.02140086144208908, + "learning_rate": 2.2583758805520097e-05, + "loss": 0.0126, + "num_input_tokens_seen": 126248368, + "step": 103750 + }, + { + "epoch": 11.5552956899432, + "grad_norm": 0.014970016665756702, + "learning_rate": 2.258134045404581e-05, + "loss": 0.0495, + "num_input_tokens_seen": 126254448, + "step": 103755 + }, + { + "epoch": 11.555852544826818, + "grad_norm": 0.24158290028572083, + "learning_rate": 2.2578922125417694e-05, + "loss": 0.0328, + "num_input_tokens_seen": 126259760, + "step": 103760 + }, + { + "epoch": 11.556409399710436, + "grad_norm": 0.6506070494651794, + "learning_rate": 2.2576503819658606e-05, + "loss": 0.0846, + "num_input_tokens_seen": 126265744, + "step": 103765 + }, + { + "epoch": 11.556966254594053, + "grad_norm": 0.019350813701748848, + "learning_rate": 2.2574085536791376e-05, + "loss": 0.0156, + "num_input_tokens_seen": 126271760, + "step": 103770 + }, + { + "epoch": 11.55752310947767, + "grad_norm": 0.053029902279376984, + "learning_rate": 2.2571667276838857e-05, + "loss": 0.0071, + "num_input_tokens_seen": 126278128, + "step": 103775 + }, + { + "epoch": 11.558079964361287, + "grad_norm": 2.3103249073028564, + "learning_rate": 2.2569249039823872e-05, + "loss": 0.1358, + "num_input_tokens_seen": 126284016, + "step": 103780 + }, + { + "epoch": 11.558636819244905, + "grad_norm": 0.09108652919530869, + "learning_rate": 2.2566830825769297e-05, + "loss": 0.1358, + "num_input_tokens_seen": 126290032, + "step": 103785 + }, + { + "epoch": 11.559193674128522, + "grad_norm": 0.04643779993057251, + "learning_rate": 2.256441263469794e-05, + "loss": 0.014, + "num_input_tokens_seen": 126296048, + "step": 103790 + }, + { + "epoch": 11.55975052901214, + "grad_norm": 0.016595974564552307, + "learning_rate": 2.256199446663267e-05, + "loss": 0.0136, + "num_input_tokens_seen": 126302192, + "step": 103795 + }, + { + "epoch": 11.560307383895756, + "grad_norm": 0.0042108045890927315, + "learning_rate": 2.25595763215963e-05, + "loss": 0.0633, + "num_input_tokens_seen": 126308400, + "step": 103800 + }, + { + "epoch": 11.560864238779374, + "grad_norm": 0.03629302978515625, + "learning_rate": 2.2557158199611693e-05, + "loss": 0.1098, + "num_input_tokens_seen": 126314352, + "step": 103805 + }, + { + "epoch": 11.561421093662991, + "grad_norm": 0.0002818274952005595, + "learning_rate": 2.2554740100701686e-05, + "loss": 0.0028, + "num_input_tokens_seen": 126320368, + "step": 103810 + }, + { + "epoch": 11.561977948546609, + "grad_norm": 0.53423672914505, + "learning_rate": 2.255232202488912e-05, + "loss": 0.0231, + "num_input_tokens_seen": 126326192, + "step": 103815 + }, + { + "epoch": 11.562534803430227, + "grad_norm": 0.6905744075775146, + "learning_rate": 2.2549903972196825e-05, + "loss": 0.1024, + "num_input_tokens_seen": 126332624, + "step": 103820 + }, + { + "epoch": 11.563091658313844, + "grad_norm": 0.00938315037637949, + "learning_rate": 2.2547485942647662e-05, + "loss": 0.0264, + "num_input_tokens_seen": 126338480, + "step": 103825 + }, + { + "epoch": 11.56364851319746, + "grad_norm": 0.0001328343350905925, + "learning_rate": 2.2545067936264448e-05, + "loss": 0.0037, + "num_input_tokens_seen": 126344784, + "step": 103830 + }, + { + "epoch": 11.564205368081078, + "grad_norm": 0.014456895180046558, + "learning_rate": 2.254264995307004e-05, + "loss": 0.0142, + "num_input_tokens_seen": 126350192, + "step": 103835 + }, + { + "epoch": 11.564762222964696, + "grad_norm": 0.002589903539046645, + "learning_rate": 2.254023199308727e-05, + "loss": 0.0144, + "num_input_tokens_seen": 126356400, + "step": 103840 + }, + { + "epoch": 11.565319077848313, + "grad_norm": 1.2315083742141724, + "learning_rate": 2.2537814056338985e-05, + "loss": 0.0242, + "num_input_tokens_seen": 126362864, + "step": 103845 + }, + { + "epoch": 11.565875932731931, + "grad_norm": 0.43434736132621765, + "learning_rate": 2.2535396142848007e-05, + "loss": 0.0232, + "num_input_tokens_seen": 126368944, + "step": 103850 + }, + { + "epoch": 11.566432787615547, + "grad_norm": 1.3808794021606445, + "learning_rate": 2.25329782526372e-05, + "loss": 0.0859, + "num_input_tokens_seen": 126374768, + "step": 103855 + }, + { + "epoch": 11.566989642499165, + "grad_norm": 0.005018528550863266, + "learning_rate": 2.2530560385729376e-05, + "loss": 0.0539, + "num_input_tokens_seen": 126380944, + "step": 103860 + }, + { + "epoch": 11.567546497382782, + "grad_norm": 0.031038997694849968, + "learning_rate": 2.2528142542147402e-05, + "loss": 0.1484, + "num_input_tokens_seen": 126386768, + "step": 103865 + }, + { + "epoch": 11.5681033522664, + "grad_norm": 1.4588903188705444, + "learning_rate": 2.2525724721914095e-05, + "loss": 0.1341, + "num_input_tokens_seen": 126392848, + "step": 103870 + }, + { + "epoch": 11.568660207150018, + "grad_norm": 0.04688247665762901, + "learning_rate": 2.25233069250523e-05, + "loss": 0.0868, + "num_input_tokens_seen": 126399024, + "step": 103875 + }, + { + "epoch": 11.569217062033633, + "grad_norm": 0.271167129278183, + "learning_rate": 2.2520889151584857e-05, + "loss": 0.0549, + "num_input_tokens_seen": 126405072, + "step": 103880 + }, + { + "epoch": 11.569773916917251, + "grad_norm": 0.04424441605806351, + "learning_rate": 2.2518471401534605e-05, + "loss": 0.1085, + "num_input_tokens_seen": 126411152, + "step": 103885 + }, + { + "epoch": 11.570330771800869, + "grad_norm": 0.3687489926815033, + "learning_rate": 2.2516053674924374e-05, + "loss": 0.0097, + "num_input_tokens_seen": 126417104, + "step": 103890 + }, + { + "epoch": 11.570887626684486, + "grad_norm": 0.0003792939824052155, + "learning_rate": 2.251363597177701e-05, + "loss": 0.0711, + "num_input_tokens_seen": 126423216, + "step": 103895 + }, + { + "epoch": 11.571444481568104, + "grad_norm": 0.2986114025115967, + "learning_rate": 2.251121829211534e-05, + "loss": 0.0171, + "num_input_tokens_seen": 126429616, + "step": 103900 + }, + { + "epoch": 11.57200133645172, + "grad_norm": 0.16276457905769348, + "learning_rate": 2.2508800635962217e-05, + "loss": 0.0909, + "num_input_tokens_seen": 126435536, + "step": 103905 + }, + { + "epoch": 11.572558191335338, + "grad_norm": 0.05136919021606445, + "learning_rate": 2.2506383003340453e-05, + "loss": 0.0475, + "num_input_tokens_seen": 126441872, + "step": 103910 + }, + { + "epoch": 11.573115046218955, + "grad_norm": 0.2317216396331787, + "learning_rate": 2.250396539427292e-05, + "loss": 0.0083, + "num_input_tokens_seen": 126447920, + "step": 103915 + }, + { + "epoch": 11.573671901102573, + "grad_norm": 0.6852896809577942, + "learning_rate": 2.2501547808782413e-05, + "loss": 0.0831, + "num_input_tokens_seen": 126453936, + "step": 103920 + }, + { + "epoch": 11.57422875598619, + "grad_norm": 0.0013264929875731468, + "learning_rate": 2.24991302468918e-05, + "loss": 0.0101, + "num_input_tokens_seen": 126460208, + "step": 103925 + }, + { + "epoch": 11.574785610869807, + "grad_norm": 0.030812803655862808, + "learning_rate": 2.24967127086239e-05, + "loss": 0.1179, + "num_input_tokens_seen": 126466320, + "step": 103930 + }, + { + "epoch": 11.575342465753424, + "grad_norm": 0.02710595168173313, + "learning_rate": 2.2494295194001562e-05, + "loss": 0.0273, + "num_input_tokens_seen": 126472432, + "step": 103935 + }, + { + "epoch": 11.575899320637042, + "grad_norm": 0.003530384274199605, + "learning_rate": 2.2491877703047607e-05, + "loss": 0.0179, + "num_input_tokens_seen": 126478448, + "step": 103940 + }, + { + "epoch": 11.57645617552066, + "grad_norm": 0.049993909895420074, + "learning_rate": 2.248946023578488e-05, + "loss": 0.0225, + "num_input_tokens_seen": 126484688, + "step": 103945 + }, + { + "epoch": 11.577013030404277, + "grad_norm": 1.0099594593048096, + "learning_rate": 2.248704279223621e-05, + "loss": 0.0631, + "num_input_tokens_seen": 126490928, + "step": 103950 + }, + { + "epoch": 11.577569885287893, + "grad_norm": 0.589444100856781, + "learning_rate": 2.248462537242444e-05, + "loss": 0.1173, + "num_input_tokens_seen": 126497040, + "step": 103955 + }, + { + "epoch": 11.578126740171511, + "grad_norm": 0.6701090335845947, + "learning_rate": 2.248220797637239e-05, + "loss": 0.0452, + "num_input_tokens_seen": 126502512, + "step": 103960 + }, + { + "epoch": 11.578683595055129, + "grad_norm": 0.32781869173049927, + "learning_rate": 2.247979060410291e-05, + "loss": 0.0473, + "num_input_tokens_seen": 126508400, + "step": 103965 + }, + { + "epoch": 11.579240449938746, + "grad_norm": 0.4713725447654724, + "learning_rate": 2.2477373255638818e-05, + "loss": 0.0199, + "num_input_tokens_seen": 126514352, + "step": 103970 + }, + { + "epoch": 11.579797304822364, + "grad_norm": 0.033381253480911255, + "learning_rate": 2.247495593100297e-05, + "loss": 0.0115, + "num_input_tokens_seen": 126520336, + "step": 103975 + }, + { + "epoch": 11.58035415970598, + "grad_norm": 0.011594603769481182, + "learning_rate": 2.247253863021817e-05, + "loss": 0.1183, + "num_input_tokens_seen": 126526544, + "step": 103980 + }, + { + "epoch": 11.580911014589597, + "grad_norm": 0.05323737859725952, + "learning_rate": 2.2470121353307286e-05, + "loss": 0.0278, + "num_input_tokens_seen": 126532976, + "step": 103985 + }, + { + "epoch": 11.581467869473215, + "grad_norm": 1.135951042175293, + "learning_rate": 2.246770410029311e-05, + "loss": 0.1142, + "num_input_tokens_seen": 126539312, + "step": 103990 + }, + { + "epoch": 11.582024724356833, + "grad_norm": 0.8678188323974609, + "learning_rate": 2.2465286871198518e-05, + "loss": 0.0328, + "num_input_tokens_seen": 126545008, + "step": 103995 + }, + { + "epoch": 11.58258157924045, + "grad_norm": 0.014105520211160183, + "learning_rate": 2.2462869666046313e-05, + "loss": 0.0181, + "num_input_tokens_seen": 126551440, + "step": 104000 + }, + { + "epoch": 11.583138434124066, + "grad_norm": 0.09895846247673035, + "learning_rate": 2.246045248485934e-05, + "loss": 0.0344, + "num_input_tokens_seen": 126557936, + "step": 104005 + }, + { + "epoch": 11.583695289007684, + "grad_norm": 1.095215082168579, + "learning_rate": 2.2458035327660425e-05, + "loss": 0.0646, + "num_input_tokens_seen": 126564240, + "step": 104010 + }, + { + "epoch": 11.584252143891302, + "grad_norm": 0.42631587386131287, + "learning_rate": 2.2455618194472407e-05, + "loss": 0.0286, + "num_input_tokens_seen": 126570544, + "step": 104015 + }, + { + "epoch": 11.58480899877492, + "grad_norm": 0.0077325692400336266, + "learning_rate": 2.245320108531811e-05, + "loss": 0.0709, + "num_input_tokens_seen": 126576496, + "step": 104020 + }, + { + "epoch": 11.585365853658537, + "grad_norm": 0.01949356682598591, + "learning_rate": 2.2450784000220372e-05, + "loss": 0.0012, + "num_input_tokens_seen": 126582448, + "step": 104025 + }, + { + "epoch": 11.585922708542153, + "grad_norm": 0.8758266568183899, + "learning_rate": 2.2448366939202018e-05, + "loss": 0.089, + "num_input_tokens_seen": 126588752, + "step": 104030 + }, + { + "epoch": 11.58647956342577, + "grad_norm": 0.0892387181520462, + "learning_rate": 2.2445949902285888e-05, + "loss": 0.0252, + "num_input_tokens_seen": 126594832, + "step": 104035 + }, + { + "epoch": 11.587036418309388, + "grad_norm": 2.3383750915527344, + "learning_rate": 2.2443532889494794e-05, + "loss": 0.1218, + "num_input_tokens_seen": 126601072, + "step": 104040 + }, + { + "epoch": 11.587593273193006, + "grad_norm": 0.005749296862632036, + "learning_rate": 2.24411159008516e-05, + "loss": 0.0042, + "num_input_tokens_seen": 126607728, + "step": 104045 + }, + { + "epoch": 11.588150128076624, + "grad_norm": 0.0058365133590996265, + "learning_rate": 2.2438698936379098e-05, + "loss": 0.1487, + "num_input_tokens_seen": 126614128, + "step": 104050 + }, + { + "epoch": 11.588706982960241, + "grad_norm": 0.00047159241512417793, + "learning_rate": 2.2436281996100147e-05, + "loss": 0.0089, + "num_input_tokens_seen": 126619920, + "step": 104055 + }, + { + "epoch": 11.589263837843857, + "grad_norm": 0.8181002736091614, + "learning_rate": 2.2433865080037563e-05, + "loss": 0.0656, + "num_input_tokens_seen": 126626160, + "step": 104060 + }, + { + "epoch": 11.589820692727475, + "grad_norm": 1.6569409370422363, + "learning_rate": 2.2431448188214185e-05, + "loss": 0.0726, + "num_input_tokens_seen": 126632144, + "step": 104065 + }, + { + "epoch": 11.590377547611093, + "grad_norm": 0.0899541899561882, + "learning_rate": 2.242903132065283e-05, + "loss": 0.0027, + "num_input_tokens_seen": 126638416, + "step": 104070 + }, + { + "epoch": 11.59093440249471, + "grad_norm": 3.54754638671875, + "learning_rate": 2.242661447737634e-05, + "loss": 0.1025, + "num_input_tokens_seen": 126644528, + "step": 104075 + }, + { + "epoch": 11.591491257378328, + "grad_norm": 0.00979955680668354, + "learning_rate": 2.2424197658407532e-05, + "loss": 0.0509, + "num_input_tokens_seen": 126650640, + "step": 104080 + }, + { + "epoch": 11.592048112261944, + "grad_norm": 0.0004276145773474127, + "learning_rate": 2.2421780863769246e-05, + "loss": 0.0149, + "num_input_tokens_seen": 126656944, + "step": 104085 + }, + { + "epoch": 11.592604967145562, + "grad_norm": 0.0027920790016651154, + "learning_rate": 2.24193640934843e-05, + "loss": 0.0054, + "num_input_tokens_seen": 126662992, + "step": 104090 + }, + { + "epoch": 11.59316182202918, + "grad_norm": 0.8699786067008972, + "learning_rate": 2.2416947347575536e-05, + "loss": 0.0807, + "num_input_tokens_seen": 126669296, + "step": 104095 + }, + { + "epoch": 11.593718676912797, + "grad_norm": 4.159893035888672, + "learning_rate": 2.2414530626065757e-05, + "loss": 0.058, + "num_input_tokens_seen": 126675600, + "step": 104100 + }, + { + "epoch": 11.594275531796415, + "grad_norm": 0.01326084230095148, + "learning_rate": 2.241211392897783e-05, + "loss": 0.0283, + "num_input_tokens_seen": 126681456, + "step": 104105 + }, + { + "epoch": 11.59483238668003, + "grad_norm": 0.03200262784957886, + "learning_rate": 2.240969725633454e-05, + "loss": 0.0355, + "num_input_tokens_seen": 126687568, + "step": 104110 + }, + { + "epoch": 11.595389241563648, + "grad_norm": 0.9075819849967957, + "learning_rate": 2.2407280608158753e-05, + "loss": 0.0102, + "num_input_tokens_seen": 126693744, + "step": 104115 + }, + { + "epoch": 11.595946096447266, + "grad_norm": 0.2318236231803894, + "learning_rate": 2.2404863984473274e-05, + "loss": 0.1325, + "num_input_tokens_seen": 126699504, + "step": 104120 + }, + { + "epoch": 11.596502951330883, + "grad_norm": 1.4605978727340698, + "learning_rate": 2.240244738530092e-05, + "loss": 0.1116, + "num_input_tokens_seen": 126705648, + "step": 104125 + }, + { + "epoch": 11.597059806214501, + "grad_norm": 0.9180947542190552, + "learning_rate": 2.240003081066455e-05, + "loss": 0.0194, + "num_input_tokens_seen": 126711824, + "step": 104130 + }, + { + "epoch": 11.597616661098117, + "grad_norm": 1.492111325263977, + "learning_rate": 2.239761426058695e-05, + "loss": 0.1019, + "num_input_tokens_seen": 126717968, + "step": 104135 + }, + { + "epoch": 11.598173515981735, + "grad_norm": 0.0859919935464859, + "learning_rate": 2.2395197735090988e-05, + "loss": 0.0326, + "num_input_tokens_seen": 126723952, + "step": 104140 + }, + { + "epoch": 11.598730370865352, + "grad_norm": 0.0014170705107972026, + "learning_rate": 2.2392781234199458e-05, + "loss": 0.0182, + "num_input_tokens_seen": 126730160, + "step": 104145 + }, + { + "epoch": 11.59928722574897, + "grad_norm": 0.08434531092643738, + "learning_rate": 2.2390364757935208e-05, + "loss": 0.0142, + "num_input_tokens_seen": 126736304, + "step": 104150 + }, + { + "epoch": 11.599844080632588, + "grad_norm": 0.44233381748199463, + "learning_rate": 2.2387948306321047e-05, + "loss": 0.0476, + "num_input_tokens_seen": 126742640, + "step": 104155 + }, + { + "epoch": 11.600400935516204, + "grad_norm": 0.9923570156097412, + "learning_rate": 2.2385531879379813e-05, + "loss": 0.1055, + "num_input_tokens_seen": 126748784, + "step": 104160 + }, + { + "epoch": 11.600957790399821, + "grad_norm": 0.000123645702842623, + "learning_rate": 2.2383115477134317e-05, + "loss": 0.0023, + "num_input_tokens_seen": 126755056, + "step": 104165 + }, + { + "epoch": 11.601514645283439, + "grad_norm": 0.057365767657756805, + "learning_rate": 2.23806990996074e-05, + "loss": 0.133, + "num_input_tokens_seen": 126760912, + "step": 104170 + }, + { + "epoch": 11.602071500167057, + "grad_norm": 0.0020147503819316626, + "learning_rate": 2.237828274682187e-05, + "loss": 0.0095, + "num_input_tokens_seen": 126766032, + "step": 104175 + }, + { + "epoch": 11.602628355050674, + "grad_norm": 1.2152800559997559, + "learning_rate": 2.2375866418800568e-05, + "loss": 0.0317, + "num_input_tokens_seen": 126772080, + "step": 104180 + }, + { + "epoch": 11.603185209934292, + "grad_norm": 0.00233044964261353, + "learning_rate": 2.23734501155663e-05, + "loss": 0.0059, + "num_input_tokens_seen": 126778192, + "step": 104185 + }, + { + "epoch": 11.603742064817908, + "grad_norm": 0.3869825005531311, + "learning_rate": 2.2371033837141913e-05, + "loss": 0.0955, + "num_input_tokens_seen": 126783664, + "step": 104190 + }, + { + "epoch": 11.604298919701526, + "grad_norm": 0.048879582434892654, + "learning_rate": 2.2368617583550204e-05, + "loss": 0.0139, + "num_input_tokens_seen": 126789104, + "step": 104195 + }, + { + "epoch": 11.604855774585143, + "grad_norm": 0.0006547743687406182, + "learning_rate": 2.236620135481402e-05, + "loss": 0.0411, + "num_input_tokens_seen": 126795344, + "step": 104200 + }, + { + "epoch": 11.605412629468761, + "grad_norm": 0.34784913063049316, + "learning_rate": 2.2363785150956172e-05, + "loss": 0.0713, + "num_input_tokens_seen": 126800880, + "step": 104205 + }, + { + "epoch": 11.605969484352379, + "grad_norm": 0.5287179946899414, + "learning_rate": 2.2361368971999487e-05, + "loss": 0.0291, + "num_input_tokens_seen": 126806992, + "step": 104210 + }, + { + "epoch": 11.606526339235995, + "grad_norm": 0.010399413295090199, + "learning_rate": 2.235895281796678e-05, + "loss": 0.0546, + "num_input_tokens_seen": 126813232, + "step": 104215 + }, + { + "epoch": 11.607083194119612, + "grad_norm": 0.0709061399102211, + "learning_rate": 2.2356536688880885e-05, + "loss": 0.0225, + "num_input_tokens_seen": 126819472, + "step": 104220 + }, + { + "epoch": 11.60764004900323, + "grad_norm": 1.33534836769104, + "learning_rate": 2.235412058476462e-05, + "loss": 0.0641, + "num_input_tokens_seen": 126825872, + "step": 104225 + }, + { + "epoch": 11.608196903886848, + "grad_norm": 0.008178047835826874, + "learning_rate": 2.2351704505640806e-05, + "loss": 0.0613, + "num_input_tokens_seen": 126832144, + "step": 104230 + }, + { + "epoch": 11.608753758770465, + "grad_norm": 0.21211983263492584, + "learning_rate": 2.2349288451532258e-05, + "loss": 0.0472, + "num_input_tokens_seen": 126838800, + "step": 104235 + }, + { + "epoch": 11.609310613654081, + "grad_norm": 0.24475228786468506, + "learning_rate": 2.234687242246181e-05, + "loss": 0.0101, + "num_input_tokens_seen": 126844912, + "step": 104240 + }, + { + "epoch": 11.609867468537699, + "grad_norm": 0.30105873942375183, + "learning_rate": 2.234445641845227e-05, + "loss": 0.0498, + "num_input_tokens_seen": 126851184, + "step": 104245 + }, + { + "epoch": 11.610424323421316, + "grad_norm": 0.09320559352636337, + "learning_rate": 2.234204043952648e-05, + "loss": 0.0324, + "num_input_tokens_seen": 126857200, + "step": 104250 + }, + { + "epoch": 11.610981178304934, + "grad_norm": 0.0008483477286063135, + "learning_rate": 2.2339624485707233e-05, + "loss": 0.0281, + "num_input_tokens_seen": 126863472, + "step": 104255 + }, + { + "epoch": 11.611538033188552, + "grad_norm": 0.009775690734386444, + "learning_rate": 2.233720855701738e-05, + "loss": 0.0183, + "num_input_tokens_seen": 126869616, + "step": 104260 + }, + { + "epoch": 11.612094888072168, + "grad_norm": 0.4679954946041107, + "learning_rate": 2.233479265347971e-05, + "loss": 0.1222, + "num_input_tokens_seen": 126875664, + "step": 104265 + }, + { + "epoch": 11.612651742955785, + "grad_norm": 3.1700820922851562, + "learning_rate": 2.233237677511707e-05, + "loss": 0.1478, + "num_input_tokens_seen": 126881200, + "step": 104270 + }, + { + "epoch": 11.613208597839403, + "grad_norm": 0.20688793063163757, + "learning_rate": 2.232996092195226e-05, + "loss": 0.0348, + "num_input_tokens_seen": 126886672, + "step": 104275 + }, + { + "epoch": 11.61376545272302, + "grad_norm": 0.0003675195621326566, + "learning_rate": 2.2327545094008117e-05, + "loss": 0.0183, + "num_input_tokens_seen": 126892784, + "step": 104280 + }, + { + "epoch": 11.614322307606638, + "grad_norm": 9.022753511089832e-05, + "learning_rate": 2.2325129291307445e-05, + "loss": 0.0377, + "num_input_tokens_seen": 126898768, + "step": 104285 + }, + { + "epoch": 11.614879162490254, + "grad_norm": 0.35975170135498047, + "learning_rate": 2.2322713513873074e-05, + "loss": 0.0119, + "num_input_tokens_seen": 126904848, + "step": 104290 + }, + { + "epoch": 11.615436017373872, + "grad_norm": 0.009979860857129097, + "learning_rate": 2.2320297761727818e-05, + "loss": 0.0193, + "num_input_tokens_seen": 126910192, + "step": 104295 + }, + { + "epoch": 11.61599287225749, + "grad_norm": 0.10046952962875366, + "learning_rate": 2.23178820348945e-05, + "loss": 0.0226, + "num_input_tokens_seen": 126916112, + "step": 104300 + }, + { + "epoch": 11.616549727141107, + "grad_norm": 0.5179466605186462, + "learning_rate": 2.2315466333395926e-05, + "loss": 0.0625, + "num_input_tokens_seen": 126921840, + "step": 104305 + }, + { + "epoch": 11.617106582024725, + "grad_norm": 0.0010290181962773204, + "learning_rate": 2.2313050657254932e-05, + "loss": 0.04, + "num_input_tokens_seen": 126928240, + "step": 104310 + }, + { + "epoch": 11.617663436908341, + "grad_norm": 0.018130818381905556, + "learning_rate": 2.2310635006494315e-05, + "loss": 0.0182, + "num_input_tokens_seen": 126934416, + "step": 104315 + }, + { + "epoch": 11.618220291791959, + "grad_norm": 0.8808043003082275, + "learning_rate": 2.2308219381136925e-05, + "loss": 0.0498, + "num_input_tokens_seen": 126940400, + "step": 104320 + }, + { + "epoch": 11.618777146675576, + "grad_norm": 0.0005213749245740473, + "learning_rate": 2.230580378120554e-05, + "loss": 0.0128, + "num_input_tokens_seen": 126946416, + "step": 104325 + }, + { + "epoch": 11.619334001559194, + "grad_norm": 0.1164366602897644, + "learning_rate": 2.2303388206723007e-05, + "loss": 0.017, + "num_input_tokens_seen": 126952688, + "step": 104330 + }, + { + "epoch": 11.619890856442812, + "grad_norm": 0.0911102294921875, + "learning_rate": 2.2300972657712128e-05, + "loss": 0.0044, + "num_input_tokens_seen": 126958832, + "step": 104335 + }, + { + "epoch": 11.620447711326428, + "grad_norm": 0.045906927436590195, + "learning_rate": 2.229855713419573e-05, + "loss": 0.0101, + "num_input_tokens_seen": 126964400, + "step": 104340 + }, + { + "epoch": 11.621004566210045, + "grad_norm": 0.8242821097373962, + "learning_rate": 2.229614163619662e-05, + "loss": 0.1282, + "num_input_tokens_seen": 126970256, + "step": 104345 + }, + { + "epoch": 11.621561421093663, + "grad_norm": 0.15644249320030212, + "learning_rate": 2.2293726163737626e-05, + "loss": 0.0202, + "num_input_tokens_seen": 126976720, + "step": 104350 + }, + { + "epoch": 11.62211827597728, + "grad_norm": 0.15934132039546967, + "learning_rate": 2.2291310716841546e-05, + "loss": 0.0221, + "num_input_tokens_seen": 126982896, + "step": 104355 + }, + { + "epoch": 11.622675130860898, + "grad_norm": 0.6281900405883789, + "learning_rate": 2.2288895295531214e-05, + "loss": 0.0561, + "num_input_tokens_seen": 126989168, + "step": 104360 + }, + { + "epoch": 11.623231985744514, + "grad_norm": 0.0003505248168949038, + "learning_rate": 2.2286479899829436e-05, + "loss": 0.054, + "num_input_tokens_seen": 126995440, + "step": 104365 + }, + { + "epoch": 11.623788840628132, + "grad_norm": 0.0005049783503636718, + "learning_rate": 2.228406452975903e-05, + "loss": 0.0463, + "num_input_tokens_seen": 127001424, + "step": 104370 + }, + { + "epoch": 11.62434569551175, + "grad_norm": 0.9842986464500427, + "learning_rate": 2.22816491853428e-05, + "loss": 0.0289, + "num_input_tokens_seen": 127007600, + "step": 104375 + }, + { + "epoch": 11.624902550395367, + "grad_norm": 0.004902419168502092, + "learning_rate": 2.227923386660359e-05, + "loss": 0.0297, + "num_input_tokens_seen": 127013456, + "step": 104380 + }, + { + "epoch": 11.625459405278985, + "grad_norm": 0.06521245092153549, + "learning_rate": 2.227681857356418e-05, + "loss": 0.0288, + "num_input_tokens_seen": 127019536, + "step": 104385 + }, + { + "epoch": 11.6260162601626, + "grad_norm": 0.1448119431734085, + "learning_rate": 2.2274403306247415e-05, + "loss": 0.0358, + "num_input_tokens_seen": 127025424, + "step": 104390 + }, + { + "epoch": 11.626573115046218, + "grad_norm": 1.077741026878357, + "learning_rate": 2.2271988064676078e-05, + "loss": 0.0216, + "num_input_tokens_seen": 127031696, + "step": 104395 + }, + { + "epoch": 11.627129969929836, + "grad_norm": 0.5847159624099731, + "learning_rate": 2.226957284887301e-05, + "loss": 0.0305, + "num_input_tokens_seen": 127037936, + "step": 104400 + }, + { + "epoch": 11.627686824813454, + "grad_norm": 0.09267152845859528, + "learning_rate": 2.226715765886101e-05, + "loss": 0.0136, + "num_input_tokens_seen": 127044048, + "step": 104405 + }, + { + "epoch": 11.628243679697071, + "grad_norm": 0.004618707578629255, + "learning_rate": 2.2264742494662903e-05, + "loss": 0.0988, + "num_input_tokens_seen": 127049776, + "step": 104410 + }, + { + "epoch": 11.628800534580689, + "grad_norm": 1.3210910558700562, + "learning_rate": 2.2262327356301484e-05, + "loss": 0.0273, + "num_input_tokens_seen": 127055984, + "step": 104415 + }, + { + "epoch": 11.629357389464305, + "grad_norm": 0.0003707520372699946, + "learning_rate": 2.2259912243799585e-05, + "loss": 0.0985, + "num_input_tokens_seen": 127062352, + "step": 104420 + }, + { + "epoch": 11.629914244347923, + "grad_norm": 0.668880045413971, + "learning_rate": 2.2257497157180004e-05, + "loss": 0.203, + "num_input_tokens_seen": 127068400, + "step": 104425 + }, + { + "epoch": 11.63047109923154, + "grad_norm": 0.03581893816590309, + "learning_rate": 2.2255082096465564e-05, + "loss": 0.0503, + "num_input_tokens_seen": 127074256, + "step": 104430 + }, + { + "epoch": 11.631027954115158, + "grad_norm": 4.069427013397217, + "learning_rate": 2.225266706167907e-05, + "loss": 0.0451, + "num_input_tokens_seen": 127080496, + "step": 104435 + }, + { + "epoch": 11.631584808998776, + "grad_norm": 0.3071120083332062, + "learning_rate": 2.225025205284334e-05, + "loss": 0.013, + "num_input_tokens_seen": 127086640, + "step": 104440 + }, + { + "epoch": 11.632141663882392, + "grad_norm": 8.734555740375072e-05, + "learning_rate": 2.2247837069981173e-05, + "loss": 0.0386, + "num_input_tokens_seen": 127092624, + "step": 104445 + }, + { + "epoch": 11.63269851876601, + "grad_norm": 0.007283697370439768, + "learning_rate": 2.2245422113115405e-05, + "loss": 0.0097, + "num_input_tokens_seen": 127098512, + "step": 104450 + }, + { + "epoch": 11.633255373649627, + "grad_norm": 0.009480712935328484, + "learning_rate": 2.2243007182268815e-05, + "loss": 0.0012, + "num_input_tokens_seen": 127104496, + "step": 104455 + }, + { + "epoch": 11.633812228533245, + "grad_norm": 0.9781932830810547, + "learning_rate": 2.2240592277464246e-05, + "loss": 0.0875, + "num_input_tokens_seen": 127110384, + "step": 104460 + }, + { + "epoch": 11.634369083416862, + "grad_norm": 0.3218490779399872, + "learning_rate": 2.2238177398724487e-05, + "loss": 0.0784, + "num_input_tokens_seen": 127115504, + "step": 104465 + }, + { + "epoch": 11.634925938300478, + "grad_norm": 0.9609879851341248, + "learning_rate": 2.2235762546072357e-05, + "loss": 0.0342, + "num_input_tokens_seen": 127121776, + "step": 104470 + }, + { + "epoch": 11.635482793184096, + "grad_norm": 0.003712015226483345, + "learning_rate": 2.223334771953066e-05, + "loss": 0.0113, + "num_input_tokens_seen": 127127952, + "step": 104475 + }, + { + "epoch": 11.636039648067714, + "grad_norm": 0.39074546098709106, + "learning_rate": 2.2230932919122216e-05, + "loss": 0.1102, + "num_input_tokens_seen": 127134192, + "step": 104480 + }, + { + "epoch": 11.636596502951331, + "grad_norm": 2.0160632133483887, + "learning_rate": 2.2228518144869825e-05, + "loss": 0.0383, + "num_input_tokens_seen": 127140272, + "step": 104485 + }, + { + "epoch": 11.637153357834949, + "grad_norm": 1.197020411491394, + "learning_rate": 2.2226103396796306e-05, + "loss": 0.0614, + "num_input_tokens_seen": 127146256, + "step": 104490 + }, + { + "epoch": 11.637710212718565, + "grad_norm": 0.2725300192832947, + "learning_rate": 2.2223688674924457e-05, + "loss": 0.0163, + "num_input_tokens_seen": 127151920, + "step": 104495 + }, + { + "epoch": 11.638267067602182, + "grad_norm": 0.03217732533812523, + "learning_rate": 2.2221273979277097e-05, + "loss": 0.0162, + "num_input_tokens_seen": 127158160, + "step": 104500 + }, + { + "epoch": 11.6388239224858, + "grad_norm": 0.07780846953392029, + "learning_rate": 2.2218859309877022e-05, + "loss": 0.0123, + "num_input_tokens_seen": 127164304, + "step": 104505 + }, + { + "epoch": 11.639380777369418, + "grad_norm": 0.020274057984352112, + "learning_rate": 2.2216444666747067e-05, + "loss": 0.0017, + "num_input_tokens_seen": 127170448, + "step": 104510 + }, + { + "epoch": 11.639937632253035, + "grad_norm": 0.38273486495018005, + "learning_rate": 2.221403004991e-05, + "loss": 0.0123, + "num_input_tokens_seen": 127176656, + "step": 104515 + }, + { + "epoch": 11.640494487136651, + "grad_norm": 1.1242725849151611, + "learning_rate": 2.2211615459388675e-05, + "loss": 0.0087, + "num_input_tokens_seen": 127182896, + "step": 104520 + }, + { + "epoch": 11.641051342020269, + "grad_norm": 1.9057942628860474, + "learning_rate": 2.2209200895205863e-05, + "loss": 0.0419, + "num_input_tokens_seen": 127189040, + "step": 104525 + }, + { + "epoch": 11.641608196903887, + "grad_norm": 0.04636862501502037, + "learning_rate": 2.2206786357384377e-05, + "loss": 0.1095, + "num_input_tokens_seen": 127194608, + "step": 104530 + }, + { + "epoch": 11.642165051787504, + "grad_norm": 0.35270243883132935, + "learning_rate": 2.220437184594705e-05, + "loss": 0.0077, + "num_input_tokens_seen": 127200848, + "step": 104535 + }, + { + "epoch": 11.642721906671122, + "grad_norm": 0.006811340805143118, + "learning_rate": 2.220195736091665e-05, + "loss": 0.0531, + "num_input_tokens_seen": 127206896, + "step": 104540 + }, + { + "epoch": 11.64327876155474, + "grad_norm": 0.18321853876113892, + "learning_rate": 2.219954290231602e-05, + "loss": 0.0244, + "num_input_tokens_seen": 127213264, + "step": 104545 + }, + { + "epoch": 11.643835616438356, + "grad_norm": 0.8594557046890259, + "learning_rate": 2.2197128470167943e-05, + "loss": 0.0339, + "num_input_tokens_seen": 127219600, + "step": 104550 + }, + { + "epoch": 11.644392471321973, + "grad_norm": 0.02269202470779419, + "learning_rate": 2.219471406449524e-05, + "loss": 0.0903, + "num_input_tokens_seen": 127225552, + "step": 104555 + }, + { + "epoch": 11.644949326205591, + "grad_norm": 0.011563421227037907, + "learning_rate": 2.21922996853207e-05, + "loss": 0.0256, + "num_input_tokens_seen": 127231920, + "step": 104560 + }, + { + "epoch": 11.645506181089209, + "grad_norm": 0.6604558229446411, + "learning_rate": 2.218988533266715e-05, + "loss": 0.0791, + "num_input_tokens_seen": 127238032, + "step": 104565 + }, + { + "epoch": 11.646063035972826, + "grad_norm": 0.0008240575552918017, + "learning_rate": 2.2187471006557378e-05, + "loss": 0.175, + "num_input_tokens_seen": 127244240, + "step": 104570 + }, + { + "epoch": 11.646619890856442, + "grad_norm": 0.46907034516334534, + "learning_rate": 2.21850567070142e-05, + "loss": 0.1097, + "num_input_tokens_seen": 127250160, + "step": 104575 + }, + { + "epoch": 11.64717674574006, + "grad_norm": 0.7110357284545898, + "learning_rate": 2.218264243406041e-05, + "loss": 0.0133, + "num_input_tokens_seen": 127256592, + "step": 104580 + }, + { + "epoch": 11.647733600623678, + "grad_norm": 0.3067725896835327, + "learning_rate": 2.2180228187718827e-05, + "loss": 0.0102, + "num_input_tokens_seen": 127262992, + "step": 104585 + }, + { + "epoch": 11.648290455507295, + "grad_norm": 1.3816810846328735, + "learning_rate": 2.2177813968012236e-05, + "loss": 0.0239, + "num_input_tokens_seen": 127268976, + "step": 104590 + }, + { + "epoch": 11.648847310390913, + "grad_norm": 0.4345901608467102, + "learning_rate": 2.217539977496347e-05, + "loss": 0.0177, + "num_input_tokens_seen": 127274960, + "step": 104595 + }, + { + "epoch": 11.649404165274529, + "grad_norm": 0.0789722129702568, + "learning_rate": 2.2172985608595302e-05, + "loss": 0.0243, + "num_input_tokens_seen": 127280976, + "step": 104600 + }, + { + "epoch": 11.649961020158146, + "grad_norm": 0.0017220532754436135, + "learning_rate": 2.217057146893056e-05, + "loss": 0.0059, + "num_input_tokens_seen": 127287056, + "step": 104605 + }, + { + "epoch": 11.650517875041764, + "grad_norm": 2.008793830871582, + "learning_rate": 2.2168157355992028e-05, + "loss": 0.1227, + "num_input_tokens_seen": 127293232, + "step": 104610 + }, + { + "epoch": 11.651074729925382, + "grad_norm": 0.5352430939674377, + "learning_rate": 2.2165743269802526e-05, + "loss": 0.0623, + "num_input_tokens_seen": 127299568, + "step": 104615 + }, + { + "epoch": 11.651631584809, + "grad_norm": 0.12460262328386307, + "learning_rate": 2.2163329210384845e-05, + "loss": 0.0512, + "num_input_tokens_seen": 127305712, + "step": 104620 + }, + { + "epoch": 11.652188439692615, + "grad_norm": 0.17200015485286713, + "learning_rate": 2.2160915177761798e-05, + "loss": 0.2291, + "num_input_tokens_seen": 127311984, + "step": 104625 + }, + { + "epoch": 11.652745294576233, + "grad_norm": 0.6572973728179932, + "learning_rate": 2.2158501171956176e-05, + "loss": 0.0229, + "num_input_tokens_seen": 127318096, + "step": 104630 + }, + { + "epoch": 11.65330214945985, + "grad_norm": 0.013120245188474655, + "learning_rate": 2.2156087192990793e-05, + "loss": 0.0064, + "num_input_tokens_seen": 127323984, + "step": 104635 + }, + { + "epoch": 11.653859004343468, + "grad_norm": 0.40581318736076355, + "learning_rate": 2.215367324088844e-05, + "loss": 0.0427, + "num_input_tokens_seen": 127329840, + "step": 104640 + }, + { + "epoch": 11.654415859227086, + "grad_norm": 0.00024294080503750592, + "learning_rate": 2.2151259315671927e-05, + "loss": 0.0089, + "num_input_tokens_seen": 127336016, + "step": 104645 + }, + { + "epoch": 11.654972714110702, + "grad_norm": 0.0005040608230046928, + "learning_rate": 2.2148845417364043e-05, + "loss": 0.03, + "num_input_tokens_seen": 127342288, + "step": 104650 + }, + { + "epoch": 11.65552956899432, + "grad_norm": 1.0587626695632935, + "learning_rate": 2.2146431545987612e-05, + "loss": 0.0199, + "num_input_tokens_seen": 127348016, + "step": 104655 + }, + { + "epoch": 11.656086423877937, + "grad_norm": 0.024231715127825737, + "learning_rate": 2.214401770156541e-05, + "loss": 0.0788, + "num_input_tokens_seen": 127354480, + "step": 104660 + }, + { + "epoch": 11.656643278761555, + "grad_norm": 0.1650029420852661, + "learning_rate": 2.214160388412026e-05, + "loss": 0.0417, + "num_input_tokens_seen": 127360592, + "step": 104665 + }, + { + "epoch": 11.657200133645173, + "grad_norm": 0.03656498342752457, + "learning_rate": 2.2139190093674935e-05, + "loss": 0.0314, + "num_input_tokens_seen": 127366640, + "step": 104670 + }, + { + "epoch": 11.657756988528789, + "grad_norm": 0.004477743525058031, + "learning_rate": 2.2136776330252267e-05, + "loss": 0.0114, + "num_input_tokens_seen": 127372592, + "step": 104675 + }, + { + "epoch": 11.658313843412406, + "grad_norm": 0.0026500553358346224, + "learning_rate": 2.2134362593875032e-05, + "loss": 0.015, + "num_input_tokens_seen": 127378800, + "step": 104680 + }, + { + "epoch": 11.658870698296024, + "grad_norm": 2.711686849594116, + "learning_rate": 2.2131948884566046e-05, + "loss": 0.1695, + "num_input_tokens_seen": 127384784, + "step": 104685 + }, + { + "epoch": 11.659427553179642, + "grad_norm": 0.0017268002266064286, + "learning_rate": 2.212953520234809e-05, + "loss": 0.1199, + "num_input_tokens_seen": 127391120, + "step": 104690 + }, + { + "epoch": 11.65998440806326, + "grad_norm": 0.5361150503158569, + "learning_rate": 2.2127121547243987e-05, + "loss": 0.0397, + "num_input_tokens_seen": 127397264, + "step": 104695 + }, + { + "epoch": 11.660541262946875, + "grad_norm": 0.0866381824016571, + "learning_rate": 2.212470791927651e-05, + "loss": 0.0186, + "num_input_tokens_seen": 127403472, + "step": 104700 + }, + { + "epoch": 11.661098117830493, + "grad_norm": 0.17163820564746857, + "learning_rate": 2.212229431846848e-05, + "loss": 0.1636, + "num_input_tokens_seen": 127409168, + "step": 104705 + }, + { + "epoch": 11.66165497271411, + "grad_norm": 0.47060248255729675, + "learning_rate": 2.211988074484268e-05, + "loss": 0.013, + "num_input_tokens_seen": 127415024, + "step": 104710 + }, + { + "epoch": 11.662211827597728, + "grad_norm": 0.26526644825935364, + "learning_rate": 2.211746719842192e-05, + "loss": 0.1446, + "num_input_tokens_seen": 127420976, + "step": 104715 + }, + { + "epoch": 11.662768682481346, + "grad_norm": 0.0008647509966976941, + "learning_rate": 2.2115053679228977e-05, + "loss": 0.0035, + "num_input_tokens_seen": 127427344, + "step": 104720 + }, + { + "epoch": 11.663325537364962, + "grad_norm": 0.0009608871769160032, + "learning_rate": 2.2112640187286684e-05, + "loss": 0.0664, + "num_input_tokens_seen": 127433360, + "step": 104725 + }, + { + "epoch": 11.66388239224858, + "grad_norm": 0.3053084909915924, + "learning_rate": 2.21102267226178e-05, + "loss": 0.1175, + "num_input_tokens_seen": 127439536, + "step": 104730 + }, + { + "epoch": 11.664439247132197, + "grad_norm": 0.11273589730262756, + "learning_rate": 2.210781328524515e-05, + "loss": 0.0401, + "num_input_tokens_seen": 127446000, + "step": 104735 + }, + { + "epoch": 11.664996102015815, + "grad_norm": 0.01943381316959858, + "learning_rate": 2.2105399875191515e-05, + "loss": 0.0139, + "num_input_tokens_seen": 127452240, + "step": 104740 + }, + { + "epoch": 11.665552956899433, + "grad_norm": 0.09222424030303955, + "learning_rate": 2.21029864924797e-05, + "loss": 0.0097, + "num_input_tokens_seen": 127458480, + "step": 104745 + }, + { + "epoch": 11.666109811783048, + "grad_norm": 0.023494713008403778, + "learning_rate": 2.21005731371325e-05, + "loss": 0.019, + "num_input_tokens_seen": 127464784, + "step": 104750 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 0.039899684488773346, + "learning_rate": 2.209815980917271e-05, + "loss": 0.0017, + "num_input_tokens_seen": 127471056, + "step": 104755 + }, + { + "epoch": 11.667223521550284, + "grad_norm": 0.4804604947566986, + "learning_rate": 2.209574650862312e-05, + "loss": 0.0432, + "num_input_tokens_seen": 127477168, + "step": 104760 + }, + { + "epoch": 11.667780376433901, + "grad_norm": 0.9121831059455872, + "learning_rate": 2.209333323550654e-05, + "loss": 0.0719, + "num_input_tokens_seen": 127483216, + "step": 104765 + }, + { + "epoch": 11.668337231317519, + "grad_norm": 0.15209299325942993, + "learning_rate": 2.2090919989845752e-05, + "loss": 0.0249, + "num_input_tokens_seen": 127489520, + "step": 104770 + }, + { + "epoch": 11.668894086201137, + "grad_norm": 0.680156946182251, + "learning_rate": 2.2088506771663556e-05, + "loss": 0.0258, + "num_input_tokens_seen": 127495472, + "step": 104775 + }, + { + "epoch": 11.669450941084753, + "grad_norm": 1.0139857530593872, + "learning_rate": 2.2086093580982737e-05, + "loss": 0.0518, + "num_input_tokens_seen": 127501296, + "step": 104780 + }, + { + "epoch": 11.67000779596837, + "grad_norm": 0.21174196898937225, + "learning_rate": 2.2083680417826115e-05, + "loss": 0.1198, + "num_input_tokens_seen": 127507536, + "step": 104785 + }, + { + "epoch": 11.670564650851988, + "grad_norm": 1.5309432744979858, + "learning_rate": 2.2081267282216455e-05, + "loss": 0.081, + "num_input_tokens_seen": 127513712, + "step": 104790 + }, + { + "epoch": 11.671121505735606, + "grad_norm": 0.007996656931936741, + "learning_rate": 2.2078854174176575e-05, + "loss": 0.1457, + "num_input_tokens_seen": 127519664, + "step": 104795 + }, + { + "epoch": 11.671678360619223, + "grad_norm": 0.00031752497307024896, + "learning_rate": 2.2076441093729244e-05, + "loss": 0.0622, + "num_input_tokens_seen": 127525520, + "step": 104800 + }, + { + "epoch": 11.67223521550284, + "grad_norm": 1.297940969467163, + "learning_rate": 2.207402804089728e-05, + "loss": 0.093, + "num_input_tokens_seen": 127530704, + "step": 104805 + }, + { + "epoch": 11.672792070386457, + "grad_norm": 0.4626319110393524, + "learning_rate": 2.207161501570346e-05, + "loss": 0.0898, + "num_input_tokens_seen": 127537168, + "step": 104810 + }, + { + "epoch": 11.673348925270075, + "grad_norm": 0.0001626645534997806, + "learning_rate": 2.2069202018170586e-05, + "loss": 0.0141, + "num_input_tokens_seen": 127543632, + "step": 104815 + }, + { + "epoch": 11.673905780153692, + "grad_norm": 2.3524458408355713, + "learning_rate": 2.2066789048321444e-05, + "loss": 0.1558, + "num_input_tokens_seen": 127548848, + "step": 104820 + }, + { + "epoch": 11.67446263503731, + "grad_norm": 0.022626234218478203, + "learning_rate": 2.2064376106178835e-05, + "loss": 0.0354, + "num_input_tokens_seen": 127554992, + "step": 104825 + }, + { + "epoch": 11.675019489920926, + "grad_norm": 0.2906164824962616, + "learning_rate": 2.206196319176554e-05, + "loss": 0.0236, + "num_input_tokens_seen": 127561104, + "step": 104830 + }, + { + "epoch": 11.675576344804544, + "grad_norm": 0.034409403800964355, + "learning_rate": 2.205955030510436e-05, + "loss": 0.0413, + "num_input_tokens_seen": 127566864, + "step": 104835 + }, + { + "epoch": 11.676133199688161, + "grad_norm": 0.8502877950668335, + "learning_rate": 2.205713744621808e-05, + "loss": 0.0598, + "num_input_tokens_seen": 127573104, + "step": 104840 + }, + { + "epoch": 11.676690054571779, + "grad_norm": 0.3483322858810425, + "learning_rate": 2.20547246151295e-05, + "loss": 0.0609, + "num_input_tokens_seen": 127579696, + "step": 104845 + }, + { + "epoch": 11.677246909455397, + "grad_norm": 0.005544600542634726, + "learning_rate": 2.2052311811861394e-05, + "loss": 0.0684, + "num_input_tokens_seen": 127585744, + "step": 104850 + }, + { + "epoch": 11.677803764339012, + "grad_norm": 0.06457394361495972, + "learning_rate": 2.2049899036436577e-05, + "loss": 0.0109, + "num_input_tokens_seen": 127592048, + "step": 104855 + }, + { + "epoch": 11.67836061922263, + "grad_norm": 0.30145272612571716, + "learning_rate": 2.2047486288877815e-05, + "loss": 0.0419, + "num_input_tokens_seen": 127596976, + "step": 104860 + }, + { + "epoch": 11.678917474106248, + "grad_norm": 0.002790510654449463, + "learning_rate": 2.2045073569207922e-05, + "loss": 0.0782, + "num_input_tokens_seen": 127603248, + "step": 104865 + }, + { + "epoch": 11.679474328989865, + "grad_norm": 1.0081214904785156, + "learning_rate": 2.204266087744967e-05, + "loss": 0.1197, + "num_input_tokens_seen": 127609552, + "step": 104870 + }, + { + "epoch": 11.680031183873483, + "grad_norm": 0.0006529357051476836, + "learning_rate": 2.204024821362586e-05, + "loss": 0.0004, + "num_input_tokens_seen": 127616112, + "step": 104875 + }, + { + "epoch": 11.6805880387571, + "grad_norm": 0.3988756537437439, + "learning_rate": 2.2037835577759276e-05, + "loss": 0.0122, + "num_input_tokens_seen": 127622480, + "step": 104880 + }, + { + "epoch": 11.681144893640717, + "grad_norm": 0.13391907513141632, + "learning_rate": 2.203542296987271e-05, + "loss": 0.005, + "num_input_tokens_seen": 127628688, + "step": 104885 + }, + { + "epoch": 11.681701748524334, + "grad_norm": 0.003069036640226841, + "learning_rate": 2.2033010389988942e-05, + "loss": 0.004, + "num_input_tokens_seen": 127635152, + "step": 104890 + }, + { + "epoch": 11.682258603407952, + "grad_norm": 0.6406411528587341, + "learning_rate": 2.2030597838130777e-05, + "loss": 0.1844, + "num_input_tokens_seen": 127640976, + "step": 104895 + }, + { + "epoch": 11.68281545829157, + "grad_norm": 0.06675772368907928, + "learning_rate": 2.2028185314320987e-05, + "loss": 0.0936, + "num_input_tokens_seen": 127647152, + "step": 104900 + }, + { + "epoch": 11.683372313175187, + "grad_norm": 0.018555304035544395, + "learning_rate": 2.2025772818582373e-05, + "loss": 0.0126, + "num_input_tokens_seen": 127653040, + "step": 104905 + }, + { + "epoch": 11.683929168058803, + "grad_norm": 0.20021536946296692, + "learning_rate": 2.2023360350937707e-05, + "loss": 0.0251, + "num_input_tokens_seen": 127659120, + "step": 104910 + }, + { + "epoch": 11.684486022942421, + "grad_norm": 0.00021231725986581296, + "learning_rate": 2.2020947911409806e-05, + "loss": 0.0772, + "num_input_tokens_seen": 127665072, + "step": 104915 + }, + { + "epoch": 11.685042877826039, + "grad_norm": 0.033959876745939255, + "learning_rate": 2.201853550002142e-05, + "loss": 0.0857, + "num_input_tokens_seen": 127671312, + "step": 104920 + }, + { + "epoch": 11.685599732709656, + "grad_norm": 0.980635404586792, + "learning_rate": 2.2016123116795364e-05, + "loss": 0.0749, + "num_input_tokens_seen": 127677488, + "step": 104925 + }, + { + "epoch": 11.686156587593274, + "grad_norm": 0.00018446531612426043, + "learning_rate": 2.2013710761754415e-05, + "loss": 0.1175, + "num_input_tokens_seen": 127683632, + "step": 104930 + }, + { + "epoch": 11.68671344247689, + "grad_norm": 1.1523265838623047, + "learning_rate": 2.2011298434921364e-05, + "loss": 0.0953, + "num_input_tokens_seen": 127689456, + "step": 104935 + }, + { + "epoch": 11.687270297360508, + "grad_norm": 0.37137171626091003, + "learning_rate": 2.2008886136318996e-05, + "loss": 0.1089, + "num_input_tokens_seen": 127695056, + "step": 104940 + }, + { + "epoch": 11.687827152244125, + "grad_norm": 0.009813704527914524, + "learning_rate": 2.2006473865970083e-05, + "loss": 0.018, + "num_input_tokens_seen": 127701232, + "step": 104945 + }, + { + "epoch": 11.688384007127743, + "grad_norm": 0.9531048536300659, + "learning_rate": 2.2004061623897433e-05, + "loss": 0.0475, + "num_input_tokens_seen": 127707120, + "step": 104950 + }, + { + "epoch": 11.68894086201136, + "grad_norm": 0.00026840929058380425, + "learning_rate": 2.2001649410123812e-05, + "loss": 0.0771, + "num_input_tokens_seen": 127713392, + "step": 104955 + }, + { + "epoch": 11.689497716894977, + "grad_norm": 0.011449223384261131, + "learning_rate": 2.1999237224672022e-05, + "loss": 0.0168, + "num_input_tokens_seen": 127719216, + "step": 104960 + }, + { + "epoch": 11.690054571778594, + "grad_norm": 0.16368970274925232, + "learning_rate": 2.1996825067564838e-05, + "loss": 0.0719, + "num_input_tokens_seen": 127725360, + "step": 104965 + }, + { + "epoch": 11.690611426662212, + "grad_norm": 0.8841985464096069, + "learning_rate": 2.1994412938825047e-05, + "loss": 0.0355, + "num_input_tokens_seen": 127731088, + "step": 104970 + }, + { + "epoch": 11.69116828154583, + "grad_norm": 0.013503970578312874, + "learning_rate": 2.1992000838475437e-05, + "loss": 0.0003, + "num_input_tokens_seen": 127737136, + "step": 104975 + }, + { + "epoch": 11.691725136429447, + "grad_norm": 0.48581501841545105, + "learning_rate": 2.1989588766538787e-05, + "loss": 0.0175, + "num_input_tokens_seen": 127743504, + "step": 104980 + }, + { + "epoch": 11.692281991313063, + "grad_norm": 0.01876644417643547, + "learning_rate": 2.198717672303788e-05, + "loss": 0.0079, + "num_input_tokens_seen": 127749616, + "step": 104985 + }, + { + "epoch": 11.69283884619668, + "grad_norm": 0.847923755645752, + "learning_rate": 2.198476470799551e-05, + "loss": 0.104, + "num_input_tokens_seen": 127754928, + "step": 104990 + }, + { + "epoch": 11.693395701080298, + "grad_norm": 0.0055617704056203365, + "learning_rate": 2.198235272143444e-05, + "loss": 0.0043, + "num_input_tokens_seen": 127761072, + "step": 104995 + }, + { + "epoch": 11.693952555963916, + "grad_norm": 0.08950643986463547, + "learning_rate": 2.1979940763377482e-05, + "loss": 0.0083, + "num_input_tokens_seen": 127767088, + "step": 105000 + }, + { + "epoch": 11.694509410847534, + "grad_norm": 0.8097405433654785, + "learning_rate": 2.197752883384739e-05, + "loss": 0.0257, + "num_input_tokens_seen": 127773392, + "step": 105005 + }, + { + "epoch": 11.69506626573115, + "grad_norm": 0.42799562215805054, + "learning_rate": 2.1975116932866966e-05, + "loss": 0.0886, + "num_input_tokens_seen": 127779472, + "step": 105010 + }, + { + "epoch": 11.695623120614767, + "grad_norm": 0.0007753442041575909, + "learning_rate": 2.1972705060458983e-05, + "loss": 0.0088, + "num_input_tokens_seen": 127785168, + "step": 105015 + }, + { + "epoch": 11.696179975498385, + "grad_norm": 0.6158929467201233, + "learning_rate": 2.1970293216646233e-05, + "loss": 0.0313, + "num_input_tokens_seen": 127791120, + "step": 105020 + }, + { + "epoch": 11.696736830382003, + "grad_norm": 0.03233852982521057, + "learning_rate": 2.196788140145148e-05, + "loss": 0.1272, + "num_input_tokens_seen": 127797168, + "step": 105025 + }, + { + "epoch": 11.69729368526562, + "grad_norm": 0.0024700621142983437, + "learning_rate": 2.196546961489753e-05, + "loss": 0.0966, + "num_input_tokens_seen": 127803216, + "step": 105030 + }, + { + "epoch": 11.697850540149236, + "grad_norm": 0.004873573314398527, + "learning_rate": 2.1963057857007142e-05, + "loss": 0.0047, + "num_input_tokens_seen": 127809488, + "step": 105035 + }, + { + "epoch": 11.698407395032854, + "grad_norm": 0.13875898718833923, + "learning_rate": 2.196064612780311e-05, + "loss": 0.0394, + "num_input_tokens_seen": 127815536, + "step": 105040 + }, + { + "epoch": 11.698964249916472, + "grad_norm": 0.0015525267226621509, + "learning_rate": 2.1958234427308202e-05, + "loss": 0.035, + "num_input_tokens_seen": 127821744, + "step": 105045 + }, + { + "epoch": 11.69952110480009, + "grad_norm": 0.4619300365447998, + "learning_rate": 2.1955822755545218e-05, + "loss": 0.0375, + "num_input_tokens_seen": 127827984, + "step": 105050 + }, + { + "epoch": 11.700077959683707, + "grad_norm": 1.3004907369613647, + "learning_rate": 2.195341111253691e-05, + "loss": 0.0249, + "num_input_tokens_seen": 127834256, + "step": 105055 + }, + { + "epoch": 11.700634814567323, + "grad_norm": 0.0038165044970810413, + "learning_rate": 2.19509994983061e-05, + "loss": 0.0625, + "num_input_tokens_seen": 127840432, + "step": 105060 + }, + { + "epoch": 11.70119166945094, + "grad_norm": 0.005893003661185503, + "learning_rate": 2.1948587912875518e-05, + "loss": 0.0029, + "num_input_tokens_seen": 127846544, + "step": 105065 + }, + { + "epoch": 11.701748524334558, + "grad_norm": 0.001090428326278925, + "learning_rate": 2.1946176356267988e-05, + "loss": 0.0644, + "num_input_tokens_seen": 127852656, + "step": 105070 + }, + { + "epoch": 11.702305379218176, + "grad_norm": 1.8040722608566284, + "learning_rate": 2.1943764828506253e-05, + "loss": 0.0757, + "num_input_tokens_seen": 127858000, + "step": 105075 + }, + { + "epoch": 11.702862234101794, + "grad_norm": 0.2196020632982254, + "learning_rate": 2.1941353329613118e-05, + "loss": 0.0188, + "num_input_tokens_seen": 127864112, + "step": 105080 + }, + { + "epoch": 11.70341908898541, + "grad_norm": 0.9807997345924377, + "learning_rate": 2.1938941859611346e-05, + "loss": 0.1048, + "num_input_tokens_seen": 127870320, + "step": 105085 + }, + { + "epoch": 11.703975943869027, + "grad_norm": 1.1485791206359863, + "learning_rate": 2.1936530418523727e-05, + "loss": 0.0893, + "num_input_tokens_seen": 127876368, + "step": 105090 + }, + { + "epoch": 11.704532798752645, + "grad_norm": 0.04317827895283699, + "learning_rate": 2.1934119006373024e-05, + "loss": 0.0801, + "num_input_tokens_seen": 127882608, + "step": 105095 + }, + { + "epoch": 11.705089653636263, + "grad_norm": 2.8087308406829834, + "learning_rate": 2.1931707623182032e-05, + "loss": 0.0879, + "num_input_tokens_seen": 127889136, + "step": 105100 + }, + { + "epoch": 11.70564650851988, + "grad_norm": 0.07772563397884369, + "learning_rate": 2.1929296268973513e-05, + "loss": 0.0115, + "num_input_tokens_seen": 127895408, + "step": 105105 + }, + { + "epoch": 11.706203363403498, + "grad_norm": 0.2261117547750473, + "learning_rate": 2.1926884943770257e-05, + "loss": 0.0322, + "num_input_tokens_seen": 127901936, + "step": 105110 + }, + { + "epoch": 11.706760218287114, + "grad_norm": 0.013987404294312, + "learning_rate": 2.1924473647595028e-05, + "loss": 0.0478, + "num_input_tokens_seen": 127908144, + "step": 105115 + }, + { + "epoch": 11.707317073170731, + "grad_norm": 0.2657332122325897, + "learning_rate": 2.1922062380470616e-05, + "loss": 0.0884, + "num_input_tokens_seen": 127914320, + "step": 105120 + }, + { + "epoch": 11.70787392805435, + "grad_norm": 0.47841429710388184, + "learning_rate": 2.191965114241978e-05, + "loss": 0.1526, + "num_input_tokens_seen": 127920784, + "step": 105125 + }, + { + "epoch": 11.708430782937967, + "grad_norm": 0.0004065132816322148, + "learning_rate": 2.191723993346532e-05, + "loss": 0.0198, + "num_input_tokens_seen": 127927024, + "step": 105130 + }, + { + "epoch": 11.708987637821584, + "grad_norm": 0.32763993740081787, + "learning_rate": 2.1914828753629986e-05, + "loss": 0.0622, + "num_input_tokens_seen": 127933200, + "step": 105135 + }, + { + "epoch": 11.7095444927052, + "grad_norm": 0.12206179648637772, + "learning_rate": 2.191241760293658e-05, + "loss": 0.0322, + "num_input_tokens_seen": 127939376, + "step": 105140 + }, + { + "epoch": 11.710101347588818, + "grad_norm": 0.010076153092086315, + "learning_rate": 2.1910006481407854e-05, + "loss": 0.0027, + "num_input_tokens_seen": 127945680, + "step": 105145 + }, + { + "epoch": 11.710658202472436, + "grad_norm": 0.001954209990799427, + "learning_rate": 2.1907595389066596e-05, + "loss": 0.0791, + "num_input_tokens_seen": 127951536, + "step": 105150 + }, + { + "epoch": 11.711215057356053, + "grad_norm": 0.2376803308725357, + "learning_rate": 2.1905184325935572e-05, + "loss": 0.0237, + "num_input_tokens_seen": 127957872, + "step": 105155 + }, + { + "epoch": 11.711771912239671, + "grad_norm": 1.6481841802597046, + "learning_rate": 2.190277329203757e-05, + "loss": 0.0392, + "num_input_tokens_seen": 127964176, + "step": 105160 + }, + { + "epoch": 11.712328767123287, + "grad_norm": 0.1295025646686554, + "learning_rate": 2.1900362287395352e-05, + "loss": 0.0086, + "num_input_tokens_seen": 127970384, + "step": 105165 + }, + { + "epoch": 11.712885622006905, + "grad_norm": 0.6422185897827148, + "learning_rate": 2.1897951312031697e-05, + "loss": 0.0396, + "num_input_tokens_seen": 127976400, + "step": 105170 + }, + { + "epoch": 11.713442476890522, + "grad_norm": 0.00033847460872493684, + "learning_rate": 2.1895540365969374e-05, + "loss": 0.0135, + "num_input_tokens_seen": 127982768, + "step": 105175 + }, + { + "epoch": 11.71399933177414, + "grad_norm": 0.013928629457950592, + "learning_rate": 2.1893129449231166e-05, + "loss": 0.0028, + "num_input_tokens_seen": 127988912, + "step": 105180 + }, + { + "epoch": 11.714556186657758, + "grad_norm": 1.2262479066848755, + "learning_rate": 2.1890718561839823e-05, + "loss": 0.0494, + "num_input_tokens_seen": 127994736, + "step": 105185 + }, + { + "epoch": 11.715113041541374, + "grad_norm": 0.15422606468200684, + "learning_rate": 2.1888307703818156e-05, + "loss": 0.0032, + "num_input_tokens_seen": 128000976, + "step": 105190 + }, + { + "epoch": 11.715669896424991, + "grad_norm": 0.0004227336321491748, + "learning_rate": 2.1885896875188897e-05, + "loss": 0.0642, + "num_input_tokens_seen": 128007088, + "step": 105195 + }, + { + "epoch": 11.716226751308609, + "grad_norm": 0.9772680401802063, + "learning_rate": 2.188348607597485e-05, + "loss": 0.0717, + "num_input_tokens_seen": 128013456, + "step": 105200 + }, + { + "epoch": 11.716783606192227, + "grad_norm": 0.40946531295776367, + "learning_rate": 2.1881075306198766e-05, + "loss": 0.0351, + "num_input_tokens_seen": 128019472, + "step": 105205 + }, + { + "epoch": 11.717340461075844, + "grad_norm": 0.8015493750572205, + "learning_rate": 2.187866456588343e-05, + "loss": 0.0079, + "num_input_tokens_seen": 128025200, + "step": 105210 + }, + { + "epoch": 11.71789731595946, + "grad_norm": 0.05699080601334572, + "learning_rate": 2.1876253855051602e-05, + "loss": 0.1618, + "num_input_tokens_seen": 128031056, + "step": 105215 + }, + { + "epoch": 11.718454170843078, + "grad_norm": 0.06258276104927063, + "learning_rate": 2.187384317372607e-05, + "loss": 0.0083, + "num_input_tokens_seen": 128037360, + "step": 105220 + }, + { + "epoch": 11.719011025726696, + "grad_norm": 0.20933622121810913, + "learning_rate": 2.1871432521929582e-05, + "loss": 0.008, + "num_input_tokens_seen": 128043664, + "step": 105225 + }, + { + "epoch": 11.719567880610313, + "grad_norm": 0.3091272711753845, + "learning_rate": 2.186902189968493e-05, + "loss": 0.008, + "num_input_tokens_seen": 128049232, + "step": 105230 + }, + { + "epoch": 11.72012473549393, + "grad_norm": 1.7700766324996948, + "learning_rate": 2.1866611307014866e-05, + "loss": 0.1727, + "num_input_tokens_seen": 128055216, + "step": 105235 + }, + { + "epoch": 11.720681590377549, + "grad_norm": 0.0015963410260155797, + "learning_rate": 2.1864200743942176e-05, + "loss": 0.0416, + "num_input_tokens_seen": 128061072, + "step": 105240 + }, + { + "epoch": 11.721238445261164, + "grad_norm": 1.1623080968856812, + "learning_rate": 2.186179021048962e-05, + "loss": 0.1626, + "num_input_tokens_seen": 128066736, + "step": 105245 + }, + { + "epoch": 11.721795300144782, + "grad_norm": 0.13865508139133453, + "learning_rate": 2.185937970667997e-05, + "loss": 0.08, + "num_input_tokens_seen": 128073072, + "step": 105250 + }, + { + "epoch": 11.7223521550284, + "grad_norm": 0.00021601682237815112, + "learning_rate": 2.1856969232535985e-05, + "loss": 0.1155, + "num_input_tokens_seen": 128079344, + "step": 105255 + }, + { + "epoch": 11.722909009912017, + "grad_norm": 0.003361133625730872, + "learning_rate": 2.1854558788080458e-05, + "loss": 0.1881, + "num_input_tokens_seen": 128085616, + "step": 105260 + }, + { + "epoch": 11.723465864795635, + "grad_norm": 0.18047939240932465, + "learning_rate": 2.1852148373336133e-05, + "loss": 0.0638, + "num_input_tokens_seen": 128091184, + "step": 105265 + }, + { + "epoch": 11.724022719679251, + "grad_norm": 0.006981665268540382, + "learning_rate": 2.1849737988325798e-05, + "loss": 0.0417, + "num_input_tokens_seen": 128097200, + "step": 105270 + }, + { + "epoch": 11.724579574562869, + "grad_norm": 4.898571968078613, + "learning_rate": 2.1847327633072202e-05, + "loss": 0.1571, + "num_input_tokens_seen": 128103344, + "step": 105275 + }, + { + "epoch": 11.725136429446486, + "grad_norm": 0.00016499288904014975, + "learning_rate": 2.1844917307598132e-05, + "loss": 0.1097, + "num_input_tokens_seen": 128109552, + "step": 105280 + }, + { + "epoch": 11.725693284330104, + "grad_norm": 0.002846853109076619, + "learning_rate": 2.1842507011926338e-05, + "loss": 0.0909, + "num_input_tokens_seen": 128115440, + "step": 105285 + }, + { + "epoch": 11.726250139213722, + "grad_norm": 0.12399119138717651, + "learning_rate": 2.1840096746079603e-05, + "loss": 0.0401, + "num_input_tokens_seen": 128121488, + "step": 105290 + }, + { + "epoch": 11.726806994097338, + "grad_norm": 0.13742992281913757, + "learning_rate": 2.1837686510080677e-05, + "loss": 0.0554, + "num_input_tokens_seen": 128127152, + "step": 105295 + }, + { + "epoch": 11.727363848980955, + "grad_norm": 0.18041494488716125, + "learning_rate": 2.1835276303952344e-05, + "loss": 0.0434, + "num_input_tokens_seen": 128133264, + "step": 105300 + }, + { + "epoch": 11.727920703864573, + "grad_norm": 1.2175043821334839, + "learning_rate": 2.183286612771735e-05, + "loss": 0.0734, + "num_input_tokens_seen": 128138736, + "step": 105305 + }, + { + "epoch": 11.72847755874819, + "grad_norm": 0.2162666767835617, + "learning_rate": 2.1830455981398486e-05, + "loss": 0.0374, + "num_input_tokens_seen": 128144784, + "step": 105310 + }, + { + "epoch": 11.729034413631808, + "grad_norm": 0.033884041011333466, + "learning_rate": 2.1828045865018494e-05, + "loss": 0.0048, + "num_input_tokens_seen": 128150832, + "step": 105315 + }, + { + "epoch": 11.729591268515424, + "grad_norm": 0.009856938384473324, + "learning_rate": 2.182563577860016e-05, + "loss": 0.0647, + "num_input_tokens_seen": 128157232, + "step": 105320 + }, + { + "epoch": 11.730148123399042, + "grad_norm": 0.009868081659078598, + "learning_rate": 2.182322572216623e-05, + "loss": 0.0117, + "num_input_tokens_seen": 128163440, + "step": 105325 + }, + { + "epoch": 11.73070497828266, + "grad_norm": 0.03959887847304344, + "learning_rate": 2.1820815695739484e-05, + "loss": 0.0348, + "num_input_tokens_seen": 128169520, + "step": 105330 + }, + { + "epoch": 11.731261833166277, + "grad_norm": 0.6042383313179016, + "learning_rate": 2.1818405699342675e-05, + "loss": 0.0169, + "num_input_tokens_seen": 128175632, + "step": 105335 + }, + { + "epoch": 11.731818688049895, + "grad_norm": 0.4191100597381592, + "learning_rate": 2.1815995732998584e-05, + "loss": 0.0387, + "num_input_tokens_seen": 128181296, + "step": 105340 + }, + { + "epoch": 11.73237554293351, + "grad_norm": 0.4651266038417816, + "learning_rate": 2.1813585796729954e-05, + "loss": 0.0791, + "num_input_tokens_seen": 128187120, + "step": 105345 + }, + { + "epoch": 11.732932397817128, + "grad_norm": 0.014549237675964832, + "learning_rate": 2.1811175890559565e-05, + "loss": 0.0361, + "num_input_tokens_seen": 128193488, + "step": 105350 + }, + { + "epoch": 11.733489252700746, + "grad_norm": 0.08503399044275284, + "learning_rate": 2.180876601451018e-05, + "loss": 0.0251, + "num_input_tokens_seen": 128199504, + "step": 105355 + }, + { + "epoch": 11.734046107584364, + "grad_norm": 0.6445310711860657, + "learning_rate": 2.1806356168604546e-05, + "loss": 0.0682, + "num_input_tokens_seen": 128205104, + "step": 105360 + }, + { + "epoch": 11.734602962467982, + "grad_norm": 0.07956251502037048, + "learning_rate": 2.180394635286544e-05, + "loss": 0.1033, + "num_input_tokens_seen": 128210864, + "step": 105365 + }, + { + "epoch": 11.735159817351597, + "grad_norm": 0.014795803464949131, + "learning_rate": 2.1801536567315624e-05, + "loss": 0.047, + "num_input_tokens_seen": 128216976, + "step": 105370 + }, + { + "epoch": 11.735716672235215, + "grad_norm": 0.27209606766700745, + "learning_rate": 2.1799126811977856e-05, + "loss": 0.0217, + "num_input_tokens_seen": 128223312, + "step": 105375 + }, + { + "epoch": 11.736273527118833, + "grad_norm": 0.877345085144043, + "learning_rate": 2.1796717086874895e-05, + "loss": 0.0428, + "num_input_tokens_seen": 128229168, + "step": 105380 + }, + { + "epoch": 11.73683038200245, + "grad_norm": 0.0030280693899840117, + "learning_rate": 2.1794307392029513e-05, + "loss": 0.0196, + "num_input_tokens_seen": 128235216, + "step": 105385 + }, + { + "epoch": 11.737387236886068, + "grad_norm": 0.23819617927074432, + "learning_rate": 2.1791897727464464e-05, + "loss": 0.0232, + "num_input_tokens_seen": 128241424, + "step": 105390 + }, + { + "epoch": 11.737944091769684, + "grad_norm": 0.00046300620306283236, + "learning_rate": 2.1789488093202514e-05, + "loss": 0.0051, + "num_input_tokens_seen": 128247824, + "step": 105395 + }, + { + "epoch": 11.738500946653302, + "grad_norm": 0.14498525857925415, + "learning_rate": 2.178707848926641e-05, + "loss": 0.0729, + "num_input_tokens_seen": 128253904, + "step": 105400 + }, + { + "epoch": 11.73905780153692, + "grad_norm": 0.0024359039962291718, + "learning_rate": 2.1784668915678944e-05, + "loss": 0.0099, + "num_input_tokens_seen": 128260176, + "step": 105405 + }, + { + "epoch": 11.739614656420537, + "grad_norm": 0.9591577053070068, + "learning_rate": 2.1782259372462838e-05, + "loss": 0.1376, + "num_input_tokens_seen": 128266384, + "step": 105410 + }, + { + "epoch": 11.740171511304155, + "grad_norm": 0.0164351724088192, + "learning_rate": 2.177984985964088e-05, + "loss": 0.0053, + "num_input_tokens_seen": 128272240, + "step": 105415 + }, + { + "epoch": 11.74072836618777, + "grad_norm": 0.08274269104003906, + "learning_rate": 2.1777440377235815e-05, + "loss": 0.0072, + "num_input_tokens_seen": 128278768, + "step": 105420 + }, + { + "epoch": 11.741285221071388, + "grad_norm": 1.4395967721939087, + "learning_rate": 2.1775030925270412e-05, + "loss": 0.0808, + "num_input_tokens_seen": 128284976, + "step": 105425 + }, + { + "epoch": 11.741842075955006, + "grad_norm": 0.8555761575698853, + "learning_rate": 2.177262150376742e-05, + "loss": 0.0334, + "num_input_tokens_seen": 128291728, + "step": 105430 + }, + { + "epoch": 11.742398930838624, + "grad_norm": 0.7272293567657471, + "learning_rate": 2.177021211274961e-05, + "loss": 0.0422, + "num_input_tokens_seen": 128297200, + "step": 105435 + }, + { + "epoch": 11.742955785722241, + "grad_norm": 0.01444817241281271, + "learning_rate": 2.176780275223973e-05, + "loss": 0.0049, + "num_input_tokens_seen": 128302992, + "step": 105440 + }, + { + "epoch": 11.743512640605857, + "grad_norm": 0.004609711933881044, + "learning_rate": 2.1765393422260545e-05, + "loss": 0.0745, + "num_input_tokens_seen": 128309520, + "step": 105445 + }, + { + "epoch": 11.744069495489475, + "grad_norm": 0.8418415784835815, + "learning_rate": 2.1762984122834808e-05, + "loss": 0.1802, + "num_input_tokens_seen": 128315024, + "step": 105450 + }, + { + "epoch": 11.744626350373093, + "grad_norm": 0.168148472905159, + "learning_rate": 2.1760574853985284e-05, + "loss": 0.0019, + "num_input_tokens_seen": 128321072, + "step": 105455 + }, + { + "epoch": 11.74518320525671, + "grad_norm": 0.03839822858572006, + "learning_rate": 2.1758165615734716e-05, + "loss": 0.0046, + "num_input_tokens_seen": 128327248, + "step": 105460 + }, + { + "epoch": 11.745740060140328, + "grad_norm": 0.9081414341926575, + "learning_rate": 2.1755756408105887e-05, + "loss": 0.0648, + "num_input_tokens_seen": 128333040, + "step": 105465 + }, + { + "epoch": 11.746296915023946, + "grad_norm": 0.6690618991851807, + "learning_rate": 2.175334723112152e-05, + "loss": 0.0255, + "num_input_tokens_seen": 128339536, + "step": 105470 + }, + { + "epoch": 11.746853769907561, + "grad_norm": 0.3407738208770752, + "learning_rate": 2.17509380848044e-05, + "loss": 0.0215, + "num_input_tokens_seen": 128345680, + "step": 105475 + }, + { + "epoch": 11.74741062479118, + "grad_norm": 0.6357488632202148, + "learning_rate": 2.174852896917727e-05, + "loss": 0.0192, + "num_input_tokens_seen": 128351696, + "step": 105480 + }, + { + "epoch": 11.747967479674797, + "grad_norm": 0.019275927916169167, + "learning_rate": 2.1746119884262895e-05, + "loss": 0.2483, + "num_input_tokens_seen": 128357392, + "step": 105485 + }, + { + "epoch": 11.748524334558414, + "grad_norm": 0.008244925178587437, + "learning_rate": 2.1743710830084015e-05, + "loss": 0.0075, + "num_input_tokens_seen": 128363664, + "step": 105490 + }, + { + "epoch": 11.749081189442032, + "grad_norm": 1.9828680753707886, + "learning_rate": 2.1741301806663405e-05, + "loss": 0.0399, + "num_input_tokens_seen": 128369968, + "step": 105495 + }, + { + "epoch": 11.749638044325648, + "grad_norm": 0.0009226371766999364, + "learning_rate": 2.1738892814023803e-05, + "loss": 0.0047, + "num_input_tokens_seen": 128376624, + "step": 105500 + }, + { + "epoch": 11.750194899209266, + "grad_norm": 0.07406655699014664, + "learning_rate": 2.1736483852187974e-05, + "loss": 0.0015, + "num_input_tokens_seen": 128382768, + "step": 105505 + }, + { + "epoch": 11.750751754092883, + "grad_norm": 0.6322842240333557, + "learning_rate": 2.173407492117867e-05, + "loss": 0.0185, + "num_input_tokens_seen": 128389104, + "step": 105510 + }, + { + "epoch": 11.751308608976501, + "grad_norm": 0.009056922979652882, + "learning_rate": 2.1731666021018646e-05, + "loss": 0.0043, + "num_input_tokens_seen": 128395312, + "step": 105515 + }, + { + "epoch": 11.751865463860119, + "grad_norm": 1.2416645288467407, + "learning_rate": 2.172925715173065e-05, + "loss": 0.079, + "num_input_tokens_seen": 128400880, + "step": 105520 + }, + { + "epoch": 11.752422318743735, + "grad_norm": 0.010709024965763092, + "learning_rate": 2.1726848313337448e-05, + "loss": 0.0766, + "num_input_tokens_seen": 128407024, + "step": 105525 + }, + { + "epoch": 11.752979173627352, + "grad_norm": 0.07045907527208328, + "learning_rate": 2.1724439505861773e-05, + "loss": 0.0156, + "num_input_tokens_seen": 128413232, + "step": 105530 + }, + { + "epoch": 11.75353602851097, + "grad_norm": 0.0015759507659822702, + "learning_rate": 2.1722030729326408e-05, + "loss": 0.0241, + "num_input_tokens_seen": 128419600, + "step": 105535 + }, + { + "epoch": 11.754092883394588, + "grad_norm": 0.007834910415112972, + "learning_rate": 2.1719621983754072e-05, + "loss": 0.0094, + "num_input_tokens_seen": 128426000, + "step": 105540 + }, + { + "epoch": 11.754649738278205, + "grad_norm": 0.0017799899214878678, + "learning_rate": 2.171721326916755e-05, + "loss": 0.0216, + "num_input_tokens_seen": 128432144, + "step": 105545 + }, + { + "epoch": 11.755206593161821, + "grad_norm": 0.07434006035327911, + "learning_rate": 2.171480458558957e-05, + "loss": 0.0186, + "num_input_tokens_seen": 128438224, + "step": 105550 + }, + { + "epoch": 11.755763448045439, + "grad_norm": 0.0009215634199790657, + "learning_rate": 2.1712395933042897e-05, + "loss": 0.1435, + "num_input_tokens_seen": 128444560, + "step": 105555 + }, + { + "epoch": 11.756320302929057, + "grad_norm": 0.8329655528068542, + "learning_rate": 2.1709987311550273e-05, + "loss": 0.0405, + "num_input_tokens_seen": 128449904, + "step": 105560 + }, + { + "epoch": 11.756877157812674, + "grad_norm": 0.00622187927365303, + "learning_rate": 2.1707578721134464e-05, + "loss": 0.0784, + "num_input_tokens_seen": 128456144, + "step": 105565 + }, + { + "epoch": 11.757434012696292, + "grad_norm": 0.024318695068359375, + "learning_rate": 2.1705170161818202e-05, + "loss": 0.0244, + "num_input_tokens_seen": 128461648, + "step": 105570 + }, + { + "epoch": 11.757990867579908, + "grad_norm": 0.008413625881075859, + "learning_rate": 2.1702761633624255e-05, + "loss": 0.0175, + "num_input_tokens_seen": 128467760, + "step": 105575 + }, + { + "epoch": 11.758547722463526, + "grad_norm": 0.005154974292963743, + "learning_rate": 2.170035313657536e-05, + "loss": 0.0292, + "num_input_tokens_seen": 128474064, + "step": 105580 + }, + { + "epoch": 11.759104577347143, + "grad_norm": 0.0014830764848738909, + "learning_rate": 2.1697944670694282e-05, + "loss": 0.0086, + "num_input_tokens_seen": 128480240, + "step": 105585 + }, + { + "epoch": 11.75966143223076, + "grad_norm": 0.15518945455551147, + "learning_rate": 2.1695536236003746e-05, + "loss": 0.0428, + "num_input_tokens_seen": 128486224, + "step": 105590 + }, + { + "epoch": 11.760218287114379, + "grad_norm": 0.004259355831891298, + "learning_rate": 2.1693127832526537e-05, + "loss": 0.0062, + "num_input_tokens_seen": 128492304, + "step": 105595 + }, + { + "epoch": 11.760775141997996, + "grad_norm": 0.038091372698545456, + "learning_rate": 2.169071946028537e-05, + "loss": 0.0653, + "num_input_tokens_seen": 128498512, + "step": 105600 + }, + { + "epoch": 11.761331996881612, + "grad_norm": 0.08889439702033997, + "learning_rate": 2.168831111930302e-05, + "loss": 0.0204, + "num_input_tokens_seen": 128504720, + "step": 105605 + }, + { + "epoch": 11.76188885176523, + "grad_norm": 0.0001963000395335257, + "learning_rate": 2.1685902809602213e-05, + "loss": 0.0095, + "num_input_tokens_seen": 128510832, + "step": 105610 + }, + { + "epoch": 11.762445706648847, + "grad_norm": 0.011630590073764324, + "learning_rate": 2.168349453120572e-05, + "loss": 0.0143, + "num_input_tokens_seen": 128516624, + "step": 105615 + }, + { + "epoch": 11.763002561532465, + "grad_norm": 1.8060394525527954, + "learning_rate": 2.168108628413627e-05, + "loss": 0.0594, + "num_input_tokens_seen": 128522512, + "step": 105620 + }, + { + "epoch": 11.763559416416083, + "grad_norm": 0.00016985746333375573, + "learning_rate": 2.1678678068416626e-05, + "loss": 0.0611, + "num_input_tokens_seen": 128528880, + "step": 105625 + }, + { + "epoch": 11.764116271299699, + "grad_norm": 1.0864521265029907, + "learning_rate": 2.1676269884069524e-05, + "loss": 0.1212, + "num_input_tokens_seen": 128534992, + "step": 105630 + }, + { + "epoch": 11.764673126183316, + "grad_norm": 0.0012003856245428324, + "learning_rate": 2.1673861731117724e-05, + "loss": 0.0084, + "num_input_tokens_seen": 128541136, + "step": 105635 + }, + { + "epoch": 11.765229981066934, + "grad_norm": 0.015441585332155228, + "learning_rate": 2.1671453609583956e-05, + "loss": 0.0417, + "num_input_tokens_seen": 128547312, + "step": 105640 + }, + { + "epoch": 11.765786835950552, + "grad_norm": 1.2041573524475098, + "learning_rate": 2.166904551949098e-05, + "loss": 0.0706, + "num_input_tokens_seen": 128553520, + "step": 105645 + }, + { + "epoch": 11.76634369083417, + "grad_norm": 0.13688014447689056, + "learning_rate": 2.1666637460861528e-05, + "loss": 0.0062, + "num_input_tokens_seen": 128559632, + "step": 105650 + }, + { + "epoch": 11.766900545717785, + "grad_norm": 0.04838573932647705, + "learning_rate": 2.1664229433718373e-05, + "loss": 0.0154, + "num_input_tokens_seen": 128565456, + "step": 105655 + }, + { + "epoch": 11.767457400601403, + "grad_norm": 0.18463550508022308, + "learning_rate": 2.1661821438084225e-05, + "loss": 0.0449, + "num_input_tokens_seen": 128571696, + "step": 105660 + }, + { + "epoch": 11.76801425548502, + "grad_norm": 0.1024085059762001, + "learning_rate": 2.1659413473981867e-05, + "loss": 0.0231, + "num_input_tokens_seen": 128577648, + "step": 105665 + }, + { + "epoch": 11.768571110368638, + "grad_norm": 0.04697934538125992, + "learning_rate": 2.1657005541434007e-05, + "loss": 0.0489, + "num_input_tokens_seen": 128584208, + "step": 105670 + }, + { + "epoch": 11.769127965252256, + "grad_norm": 0.08702445030212402, + "learning_rate": 2.165459764046342e-05, + "loss": 0.0571, + "num_input_tokens_seen": 128590288, + "step": 105675 + }, + { + "epoch": 11.769684820135872, + "grad_norm": 0.14590483903884888, + "learning_rate": 2.1652189771092834e-05, + "loss": 0.004, + "num_input_tokens_seen": 128596496, + "step": 105680 + }, + { + "epoch": 11.77024167501949, + "grad_norm": 0.001808459754101932, + "learning_rate": 2.1649781933345e-05, + "loss": 0.0126, + "num_input_tokens_seen": 128602192, + "step": 105685 + }, + { + "epoch": 11.770798529903107, + "grad_norm": 0.01503149513155222, + "learning_rate": 2.164737412724266e-05, + "loss": 0.0594, + "num_input_tokens_seen": 128608624, + "step": 105690 + }, + { + "epoch": 11.771355384786725, + "grad_norm": 0.00844461191445589, + "learning_rate": 2.1644966352808556e-05, + "loss": 0.0025, + "num_input_tokens_seen": 128614576, + "step": 105695 + }, + { + "epoch": 11.771912239670343, + "grad_norm": 0.631846010684967, + "learning_rate": 2.1642558610065435e-05, + "loss": 0.024, + "num_input_tokens_seen": 128620656, + "step": 105700 + }, + { + "epoch": 11.772469094553959, + "grad_norm": 0.0016439571045339108, + "learning_rate": 2.1640150899036037e-05, + "loss": 0.0414, + "num_input_tokens_seen": 128626896, + "step": 105705 + }, + { + "epoch": 11.773025949437576, + "grad_norm": 0.08871419727802277, + "learning_rate": 2.16377432197431e-05, + "loss": 0.0391, + "num_input_tokens_seen": 128633008, + "step": 105710 + }, + { + "epoch": 11.773582804321194, + "grad_norm": 0.08687295019626617, + "learning_rate": 2.1635335572209382e-05, + "loss": 0.0587, + "num_input_tokens_seen": 128638608, + "step": 105715 + }, + { + "epoch": 11.774139659204812, + "grad_norm": 0.3076173961162567, + "learning_rate": 2.1632927956457598e-05, + "loss": 0.0067, + "num_input_tokens_seen": 128644720, + "step": 105720 + }, + { + "epoch": 11.77469651408843, + "grad_norm": 1.8093397617340088, + "learning_rate": 2.163052037251053e-05, + "loss": 0.138, + "num_input_tokens_seen": 128650576, + "step": 105725 + }, + { + "epoch": 11.775253368972045, + "grad_norm": 0.7106103897094727, + "learning_rate": 2.1628112820390878e-05, + "loss": 0.1178, + "num_input_tokens_seen": 128656240, + "step": 105730 + }, + { + "epoch": 11.775810223855663, + "grad_norm": 0.015533299185335636, + "learning_rate": 2.1625705300121414e-05, + "loss": 0.0039, + "num_input_tokens_seen": 128662192, + "step": 105735 + }, + { + "epoch": 11.77636707873928, + "grad_norm": 1.9635168313980103, + "learning_rate": 2.162329781172486e-05, + "loss": 0.068, + "num_input_tokens_seen": 128668400, + "step": 105740 + }, + { + "epoch": 11.776923933622898, + "grad_norm": 0.00019816512940451503, + "learning_rate": 2.1620890355223965e-05, + "loss": 0.0033, + "num_input_tokens_seen": 128674480, + "step": 105745 + }, + { + "epoch": 11.777480788506516, + "grad_norm": 1.2210890054702759, + "learning_rate": 2.161848293064147e-05, + "loss": 0.0606, + "num_input_tokens_seen": 128680464, + "step": 105750 + }, + { + "epoch": 11.778037643390132, + "grad_norm": 0.929713249206543, + "learning_rate": 2.1616075538000115e-05, + "loss": 0.1167, + "num_input_tokens_seen": 128686448, + "step": 105755 + }, + { + "epoch": 11.77859449827375, + "grad_norm": 0.8505357503890991, + "learning_rate": 2.161366817732264e-05, + "loss": 0.0341, + "num_input_tokens_seen": 128692656, + "step": 105760 + }, + { + "epoch": 11.779151353157367, + "grad_norm": 0.688441812992096, + "learning_rate": 2.161126084863177e-05, + "loss": 0.0219, + "num_input_tokens_seen": 128698768, + "step": 105765 + }, + { + "epoch": 11.779708208040985, + "grad_norm": 0.24618978798389435, + "learning_rate": 2.1608853551950267e-05, + "loss": 0.0047, + "num_input_tokens_seen": 128705008, + "step": 105770 + }, + { + "epoch": 11.780265062924602, + "grad_norm": 0.3935287594795227, + "learning_rate": 2.1606446287300853e-05, + "loss": 0.0096, + "num_input_tokens_seen": 128710544, + "step": 105775 + }, + { + "epoch": 11.780821917808218, + "grad_norm": 0.415322482585907, + "learning_rate": 2.1604039054706275e-05, + "loss": 0.014, + "num_input_tokens_seen": 128716592, + "step": 105780 + }, + { + "epoch": 11.781378772691836, + "grad_norm": 0.851557731628418, + "learning_rate": 2.160163185418927e-05, + "loss": 0.0237, + "num_input_tokens_seen": 128722704, + "step": 105785 + }, + { + "epoch": 11.781935627575454, + "grad_norm": 0.008416913449764252, + "learning_rate": 2.1599224685772576e-05, + "loss": 0.0298, + "num_input_tokens_seen": 128728304, + "step": 105790 + }, + { + "epoch": 11.782492482459071, + "grad_norm": 0.013598884455859661, + "learning_rate": 2.1596817549478922e-05, + "loss": 0.0312, + "num_input_tokens_seen": 128734448, + "step": 105795 + }, + { + "epoch": 11.783049337342689, + "grad_norm": 1.5168883800506592, + "learning_rate": 2.1594410445331064e-05, + "loss": 0.0777, + "num_input_tokens_seen": 128740816, + "step": 105800 + }, + { + "epoch": 11.783606192226305, + "grad_norm": 2.1285250186920166, + "learning_rate": 2.1592003373351714e-05, + "loss": 0.0939, + "num_input_tokens_seen": 128746864, + "step": 105805 + }, + { + "epoch": 11.784163047109923, + "grad_norm": 1.2060550451278687, + "learning_rate": 2.1589596333563638e-05, + "loss": 0.1343, + "num_input_tokens_seen": 128752464, + "step": 105810 + }, + { + "epoch": 11.78471990199354, + "grad_norm": 0.0003235488256905228, + "learning_rate": 2.158718932598954e-05, + "loss": 0.035, + "num_input_tokens_seen": 128758864, + "step": 105815 + }, + { + "epoch": 11.785276756877158, + "grad_norm": 0.5811526775360107, + "learning_rate": 2.1584782350652187e-05, + "loss": 0.0186, + "num_input_tokens_seen": 128764848, + "step": 105820 + }, + { + "epoch": 11.785833611760776, + "grad_norm": 1.4743763208389282, + "learning_rate": 2.1582375407574294e-05, + "loss": 0.0719, + "num_input_tokens_seen": 128770800, + "step": 105825 + }, + { + "epoch": 11.786390466644393, + "grad_norm": 0.0007889270200394094, + "learning_rate": 2.157996849677861e-05, + "loss": 0.0073, + "num_input_tokens_seen": 128776624, + "step": 105830 + }, + { + "epoch": 11.78694732152801, + "grad_norm": 0.7907230257987976, + "learning_rate": 2.1577561618287856e-05, + "loss": 0.0936, + "num_input_tokens_seen": 128782928, + "step": 105835 + }, + { + "epoch": 11.787504176411627, + "grad_norm": 1.5208382606506348, + "learning_rate": 2.157515477212478e-05, + "loss": 0.0916, + "num_input_tokens_seen": 128789008, + "step": 105840 + }, + { + "epoch": 11.788061031295245, + "grad_norm": 1.1075804233551025, + "learning_rate": 2.1572747958312107e-05, + "loss": 0.1862, + "num_input_tokens_seen": 128794800, + "step": 105845 + }, + { + "epoch": 11.788617886178862, + "grad_norm": 0.5916826725006104, + "learning_rate": 2.157034117687258e-05, + "loss": 0.1485, + "num_input_tokens_seen": 128800944, + "step": 105850 + }, + { + "epoch": 11.78917474106248, + "grad_norm": 0.1307719647884369, + "learning_rate": 2.1567934427828922e-05, + "loss": 0.0334, + "num_input_tokens_seen": 128807088, + "step": 105855 + }, + { + "epoch": 11.789731595946096, + "grad_norm": 2.3020081520080566, + "learning_rate": 2.156552771120388e-05, + "loss": 0.2042, + "num_input_tokens_seen": 128813168, + "step": 105860 + }, + { + "epoch": 11.790288450829713, + "grad_norm": 0.00019623876141849905, + "learning_rate": 2.1563121027020173e-05, + "loss": 0.0614, + "num_input_tokens_seen": 128819280, + "step": 105865 + }, + { + "epoch": 11.790845305713331, + "grad_norm": 1.8610492944717407, + "learning_rate": 2.1560714375300553e-05, + "loss": 0.1001, + "num_input_tokens_seen": 128825648, + "step": 105870 + }, + { + "epoch": 11.791402160596949, + "grad_norm": 0.036501627415418625, + "learning_rate": 2.1558307756067723e-05, + "loss": 0.0489, + "num_input_tokens_seen": 128832080, + "step": 105875 + }, + { + "epoch": 11.791959015480566, + "grad_norm": 0.5937293171882629, + "learning_rate": 2.155590116934445e-05, + "loss": 0.0049, + "num_input_tokens_seen": 128838224, + "step": 105880 + }, + { + "epoch": 11.792515870364182, + "grad_norm": 1.4755449295043945, + "learning_rate": 2.1553494615153443e-05, + "loss": 0.0915, + "num_input_tokens_seen": 128844112, + "step": 105885 + }, + { + "epoch": 11.7930727252478, + "grad_norm": 9.528388909529895e-05, + "learning_rate": 2.1551088093517447e-05, + "loss": 0.0194, + "num_input_tokens_seen": 128850192, + "step": 105890 + }, + { + "epoch": 11.793629580131418, + "grad_norm": 0.04157104715704918, + "learning_rate": 2.1548681604459178e-05, + "loss": 0.0289, + "num_input_tokens_seen": 128856272, + "step": 105895 + }, + { + "epoch": 11.794186435015035, + "grad_norm": 0.026038946583867073, + "learning_rate": 2.1546275148001385e-05, + "loss": 0.1401, + "num_input_tokens_seen": 128862480, + "step": 105900 + }, + { + "epoch": 11.794743289898653, + "grad_norm": 0.1414455771446228, + "learning_rate": 2.154386872416678e-05, + "loss": 0.0607, + "num_input_tokens_seen": 128868880, + "step": 105905 + }, + { + "epoch": 11.795300144782269, + "grad_norm": 0.03881935775279999, + "learning_rate": 2.1541462332978114e-05, + "loss": 0.0047, + "num_input_tokens_seen": 128874960, + "step": 105910 + }, + { + "epoch": 11.795856999665887, + "grad_norm": 0.20241346955299377, + "learning_rate": 2.1539055974458102e-05, + "loss": 0.1167, + "num_input_tokens_seen": 128881200, + "step": 105915 + }, + { + "epoch": 11.796413854549504, + "grad_norm": 0.14334025979042053, + "learning_rate": 2.1536649648629483e-05, + "loss": 0.0334, + "num_input_tokens_seen": 128887312, + "step": 105920 + }, + { + "epoch": 11.796970709433122, + "grad_norm": 0.00012372570927254856, + "learning_rate": 2.1534243355514973e-05, + "loss": 0.1337, + "num_input_tokens_seen": 128893520, + "step": 105925 + }, + { + "epoch": 11.79752756431674, + "grad_norm": 0.0795840248465538, + "learning_rate": 2.153183709513733e-05, + "loss": 0.0496, + "num_input_tokens_seen": 128899664, + "step": 105930 + }, + { + "epoch": 11.798084419200356, + "grad_norm": 0.005817849654704332, + "learning_rate": 2.1529430867519242e-05, + "loss": 0.0234, + "num_input_tokens_seen": 128905168, + "step": 105935 + }, + { + "epoch": 11.798641274083973, + "grad_norm": 0.18075135350227356, + "learning_rate": 2.1527024672683478e-05, + "loss": 0.0101, + "num_input_tokens_seen": 128911824, + "step": 105940 + }, + { + "epoch": 11.799198128967591, + "grad_norm": 1.3770959377288818, + "learning_rate": 2.1524618510652735e-05, + "loss": 0.0467, + "num_input_tokens_seen": 128917744, + "step": 105945 + }, + { + "epoch": 11.799754983851209, + "grad_norm": 1.0689916610717773, + "learning_rate": 2.1522212381449763e-05, + "loss": 0.0356, + "num_input_tokens_seen": 128924080, + "step": 105950 + }, + { + "epoch": 11.800311838734826, + "grad_norm": 0.007063917815685272, + "learning_rate": 2.1519806285097277e-05, + "loss": 0.0297, + "num_input_tokens_seen": 128930160, + "step": 105955 + }, + { + "epoch": 11.800868693618444, + "grad_norm": 1.3946138620376587, + "learning_rate": 2.1517400221618014e-05, + "loss": 0.0998, + "num_input_tokens_seen": 128936432, + "step": 105960 + }, + { + "epoch": 11.80142554850206, + "grad_norm": 0.13156212866306305, + "learning_rate": 2.151499419103469e-05, + "loss": 0.0244, + "num_input_tokens_seen": 128942448, + "step": 105965 + }, + { + "epoch": 11.801982403385678, + "grad_norm": 0.024937832728028297, + "learning_rate": 2.1512588193370048e-05, + "loss": 0.0091, + "num_input_tokens_seen": 128948784, + "step": 105970 + }, + { + "epoch": 11.802539258269295, + "grad_norm": 0.008154083974659443, + "learning_rate": 2.1510182228646793e-05, + "loss": 0.063, + "num_input_tokens_seen": 128954576, + "step": 105975 + }, + { + "epoch": 11.803096113152913, + "grad_norm": 2.254852533340454, + "learning_rate": 2.1507776296887672e-05, + "loss": 0.0471, + "num_input_tokens_seen": 128960816, + "step": 105980 + }, + { + "epoch": 11.80365296803653, + "grad_norm": 0.029479118064045906, + "learning_rate": 2.1505370398115396e-05, + "loss": 0.0196, + "num_input_tokens_seen": 128966864, + "step": 105985 + }, + { + "epoch": 11.804209822920146, + "grad_norm": 0.9620124697685242, + "learning_rate": 2.1502964532352698e-05, + "loss": 0.0316, + "num_input_tokens_seen": 128973168, + "step": 105990 + }, + { + "epoch": 11.804766677803764, + "grad_norm": 0.00012076015991624445, + "learning_rate": 2.1500558699622296e-05, + "loss": 0.0072, + "num_input_tokens_seen": 128979696, + "step": 105995 + }, + { + "epoch": 11.805323532687382, + "grad_norm": 0.00027675851015374064, + "learning_rate": 2.1498152899946935e-05, + "loss": 0.037, + "num_input_tokens_seen": 128985872, + "step": 106000 + }, + { + "epoch": 11.805880387571, + "grad_norm": 0.0007149401353672147, + "learning_rate": 2.149574713334931e-05, + "loss": 0.0064, + "num_input_tokens_seen": 128991984, + "step": 106005 + }, + { + "epoch": 11.806437242454617, + "grad_norm": 0.32008492946624756, + "learning_rate": 2.149334139985217e-05, + "loss": 0.0626, + "num_input_tokens_seen": 128998160, + "step": 106010 + }, + { + "epoch": 11.806994097338233, + "grad_norm": 2.912597179412842, + "learning_rate": 2.1490935699478226e-05, + "loss": 0.0622, + "num_input_tokens_seen": 129004144, + "step": 106015 + }, + { + "epoch": 11.80755095222185, + "grad_norm": 0.16616190969944, + "learning_rate": 2.1488530032250208e-05, + "loss": 0.0212, + "num_input_tokens_seen": 129010160, + "step": 106020 + }, + { + "epoch": 11.808107807105468, + "grad_norm": 0.13023753464221954, + "learning_rate": 2.148612439819084e-05, + "loss": 0.0639, + "num_input_tokens_seen": 129016496, + "step": 106025 + }, + { + "epoch": 11.808664661989086, + "grad_norm": 0.21317866444587708, + "learning_rate": 2.1483718797322838e-05, + "loss": 0.0549, + "num_input_tokens_seen": 129022384, + "step": 106030 + }, + { + "epoch": 11.809221516872704, + "grad_norm": 0.0002060000115307048, + "learning_rate": 2.1481313229668927e-05, + "loss": 0.0703, + "num_input_tokens_seen": 129028496, + "step": 106035 + }, + { + "epoch": 11.80977837175632, + "grad_norm": 2.512251138687134, + "learning_rate": 2.147890769525184e-05, + "loss": 0.0261, + "num_input_tokens_seen": 129034800, + "step": 106040 + }, + { + "epoch": 11.810335226639937, + "grad_norm": 0.07532087713479996, + "learning_rate": 2.1476502194094282e-05, + "loss": 0.0311, + "num_input_tokens_seen": 129040304, + "step": 106045 + }, + { + "epoch": 11.810892081523555, + "grad_norm": 0.08315229415893555, + "learning_rate": 2.1474096726218992e-05, + "loss": 0.0136, + "num_input_tokens_seen": 129046544, + "step": 106050 + }, + { + "epoch": 11.811448936407173, + "grad_norm": 0.0004399697354529053, + "learning_rate": 2.1471691291648672e-05, + "loss": 0.0347, + "num_input_tokens_seen": 129051888, + "step": 106055 + }, + { + "epoch": 11.81200579129079, + "grad_norm": 0.0009592858841642737, + "learning_rate": 2.146928589040607e-05, + "loss": 0.002, + "num_input_tokens_seen": 129058128, + "step": 106060 + }, + { + "epoch": 11.812562646174406, + "grad_norm": 0.4223019480705261, + "learning_rate": 2.1466880522513874e-05, + "loss": 0.0162, + "num_input_tokens_seen": 129063952, + "step": 106065 + }, + { + "epoch": 11.813119501058024, + "grad_norm": 0.0001644636067794636, + "learning_rate": 2.146447518799484e-05, + "loss": 0.1106, + "num_input_tokens_seen": 129070352, + "step": 106070 + }, + { + "epoch": 11.813676355941642, + "grad_norm": 0.5253087282180786, + "learning_rate": 2.1462069886871652e-05, + "loss": 0.0231, + "num_input_tokens_seen": 129076176, + "step": 106075 + }, + { + "epoch": 11.81423321082526, + "grad_norm": 0.0011200499720871449, + "learning_rate": 2.1459664619167063e-05, + "loss": 0.056, + "num_input_tokens_seen": 129082032, + "step": 106080 + }, + { + "epoch": 11.814790065708877, + "grad_norm": 0.00046429678332060575, + "learning_rate": 2.1457259384903772e-05, + "loss": 0.0557, + "num_input_tokens_seen": 129088400, + "step": 106085 + }, + { + "epoch": 11.815346920592493, + "grad_norm": 1.249753475189209, + "learning_rate": 2.1454854184104506e-05, + "loss": 0.0294, + "num_input_tokens_seen": 129094448, + "step": 106090 + }, + { + "epoch": 11.81590377547611, + "grad_norm": 0.05780387297272682, + "learning_rate": 2.145244901679198e-05, + "loss": 0.0031, + "num_input_tokens_seen": 129100592, + "step": 106095 + }, + { + "epoch": 11.816460630359728, + "grad_norm": 0.019492361694574356, + "learning_rate": 2.145004388298892e-05, + "loss": 0.0161, + "num_input_tokens_seen": 129106864, + "step": 106100 + }, + { + "epoch": 11.817017485243346, + "grad_norm": 0.0032063950784504414, + "learning_rate": 2.144763878271804e-05, + "loss": 0.1306, + "num_input_tokens_seen": 129112464, + "step": 106105 + }, + { + "epoch": 11.817574340126964, + "grad_norm": 0.0003904042241629213, + "learning_rate": 2.1445233716002056e-05, + "loss": 0.0949, + "num_input_tokens_seen": 129118736, + "step": 106110 + }, + { + "epoch": 11.81813119501058, + "grad_norm": 0.2850758731365204, + "learning_rate": 2.1442828682863682e-05, + "loss": 0.0102, + "num_input_tokens_seen": 129125040, + "step": 106115 + }, + { + "epoch": 11.818688049894197, + "grad_norm": 2.3894927501678467, + "learning_rate": 2.144042368332565e-05, + "loss": 0.0858, + "num_input_tokens_seen": 129131056, + "step": 106120 + }, + { + "epoch": 11.819244904777815, + "grad_norm": 0.5982391238212585, + "learning_rate": 2.1438018717410658e-05, + "loss": 0.0715, + "num_input_tokens_seen": 129136976, + "step": 106125 + }, + { + "epoch": 11.819801759661432, + "grad_norm": 0.009226514026522636, + "learning_rate": 2.1435613785141447e-05, + "loss": 0.1121, + "num_input_tokens_seen": 129142896, + "step": 106130 + }, + { + "epoch": 11.82035861454505, + "grad_norm": 0.02804972045123577, + "learning_rate": 2.14332088865407e-05, + "loss": 0.004, + "num_input_tokens_seen": 129148912, + "step": 106135 + }, + { + "epoch": 11.820915469428666, + "grad_norm": 0.17772722244262695, + "learning_rate": 2.1430804021631167e-05, + "loss": 0.0264, + "num_input_tokens_seen": 129154352, + "step": 106140 + }, + { + "epoch": 11.821472324312284, + "grad_norm": 0.716353714466095, + "learning_rate": 2.1428399190435548e-05, + "loss": 0.053, + "num_input_tokens_seen": 129160368, + "step": 106145 + }, + { + "epoch": 11.822029179195901, + "grad_norm": 0.1505042165517807, + "learning_rate": 2.1425994392976563e-05, + "loss": 0.0257, + "num_input_tokens_seen": 129166512, + "step": 106150 + }, + { + "epoch": 11.822586034079519, + "grad_norm": 0.008765002712607384, + "learning_rate": 2.1423589629276917e-05, + "loss": 0.0299, + "num_input_tokens_seen": 129172400, + "step": 106155 + }, + { + "epoch": 11.823142888963137, + "grad_norm": 1.0740156173706055, + "learning_rate": 2.1421184899359336e-05, + "loss": 0.0588, + "num_input_tokens_seen": 129178544, + "step": 106160 + }, + { + "epoch": 11.823699743846753, + "grad_norm": 0.1508970707654953, + "learning_rate": 2.141878020324653e-05, + "loss": 0.0059, + "num_input_tokens_seen": 129184720, + "step": 106165 + }, + { + "epoch": 11.82425659873037, + "grad_norm": 0.0003071699757128954, + "learning_rate": 2.1416375540961216e-05, + "loss": 0.0078, + "num_input_tokens_seen": 129190928, + "step": 106170 + }, + { + "epoch": 11.824813453613988, + "grad_norm": 0.0020161503925919533, + "learning_rate": 2.141397091252611e-05, + "loss": 0.0434, + "num_input_tokens_seen": 129196848, + "step": 106175 + }, + { + "epoch": 11.825370308497606, + "grad_norm": 0.018370548263192177, + "learning_rate": 2.1411566317963917e-05, + "loss": 0.0877, + "num_input_tokens_seen": 129203056, + "step": 106180 + }, + { + "epoch": 11.825927163381223, + "grad_norm": 0.09683848172426224, + "learning_rate": 2.1409161757297357e-05, + "loss": 0.0051, + "num_input_tokens_seen": 129208976, + "step": 106185 + }, + { + "epoch": 11.826484018264841, + "grad_norm": 0.0002402727259323001, + "learning_rate": 2.1406757230549136e-05, + "loss": 0.0374, + "num_input_tokens_seen": 129215024, + "step": 106190 + }, + { + "epoch": 11.827040873148457, + "grad_norm": 0.8248829245567322, + "learning_rate": 2.1404352737741977e-05, + "loss": 0.0568, + "num_input_tokens_seen": 129221104, + "step": 106195 + }, + { + "epoch": 11.827597728032075, + "grad_norm": 0.266661137342453, + "learning_rate": 2.1401948278898575e-05, + "loss": 0.0126, + "num_input_tokens_seen": 129227024, + "step": 106200 + }, + { + "epoch": 11.828154582915692, + "grad_norm": 2.8012266159057617, + "learning_rate": 2.1399543854041676e-05, + "loss": 0.0764, + "num_input_tokens_seen": 129233424, + "step": 106205 + }, + { + "epoch": 11.82871143779931, + "grad_norm": 1.3579788208007812, + "learning_rate": 2.139713946319395e-05, + "loss": 0.0415, + "num_input_tokens_seen": 129239632, + "step": 106210 + }, + { + "epoch": 11.829268292682928, + "grad_norm": 1.2679189443588257, + "learning_rate": 2.1394735106378146e-05, + "loss": 0.0388, + "num_input_tokens_seen": 129245776, + "step": 106215 + }, + { + "epoch": 11.829825147566543, + "grad_norm": 0.6655471324920654, + "learning_rate": 2.1392330783616936e-05, + "loss": 0.0494, + "num_input_tokens_seen": 129251792, + "step": 106220 + }, + { + "epoch": 11.830382002450161, + "grad_norm": 1.845302700996399, + "learning_rate": 2.1389926494933072e-05, + "loss": 0.0336, + "num_input_tokens_seen": 129257872, + "step": 106225 + }, + { + "epoch": 11.830938857333779, + "grad_norm": 0.7960523962974548, + "learning_rate": 2.1387522240349233e-05, + "loss": 0.0481, + "num_input_tokens_seen": 129263824, + "step": 106230 + }, + { + "epoch": 11.831495712217396, + "grad_norm": 0.00037367737968452275, + "learning_rate": 2.1385118019888146e-05, + "loss": 0.013, + "num_input_tokens_seen": 129269776, + "step": 106235 + }, + { + "epoch": 11.832052567101014, + "grad_norm": 0.19935083389282227, + "learning_rate": 2.1382713833572513e-05, + "loss": 0.003, + "num_input_tokens_seen": 129275984, + "step": 106240 + }, + { + "epoch": 11.83260942198463, + "grad_norm": 0.028273027390241623, + "learning_rate": 2.1380309681425047e-05, + "loss": 0.0119, + "num_input_tokens_seen": 129282128, + "step": 106245 + }, + { + "epoch": 11.833166276868248, + "grad_norm": 0.005175305530428886, + "learning_rate": 2.1377905563468456e-05, + "loss": 0.0407, + "num_input_tokens_seen": 129288432, + "step": 106250 + }, + { + "epoch": 11.833723131751865, + "grad_norm": 0.023263433948159218, + "learning_rate": 2.1375501479725453e-05, + "loss": 0.089, + "num_input_tokens_seen": 129294288, + "step": 106255 + }, + { + "epoch": 11.834279986635483, + "grad_norm": 0.09018000215291977, + "learning_rate": 2.1373097430218736e-05, + "loss": 0.0191, + "num_input_tokens_seen": 129299952, + "step": 106260 + }, + { + "epoch": 11.8348368415191, + "grad_norm": 0.00876062922179699, + "learning_rate": 2.1370693414971025e-05, + "loss": 0.0798, + "num_input_tokens_seen": 129306224, + "step": 106265 + }, + { + "epoch": 11.835393696402717, + "grad_norm": 0.00238980446010828, + "learning_rate": 2.1368289434005008e-05, + "loss": 0.0531, + "num_input_tokens_seen": 129312560, + "step": 106270 + }, + { + "epoch": 11.835950551286334, + "grad_norm": 0.10848736017942429, + "learning_rate": 2.136588548734343e-05, + "loss": 0.0627, + "num_input_tokens_seen": 129318928, + "step": 106275 + }, + { + "epoch": 11.836507406169952, + "grad_norm": 1.4377089738845825, + "learning_rate": 2.1363481575008954e-05, + "loss": 0.0407, + "num_input_tokens_seen": 129324784, + "step": 106280 + }, + { + "epoch": 11.83706426105357, + "grad_norm": 1.033064842224121, + "learning_rate": 2.1361077697024322e-05, + "loss": 0.0684, + "num_input_tokens_seen": 129330832, + "step": 106285 + }, + { + "epoch": 11.837621115937187, + "grad_norm": 0.017684074118733406, + "learning_rate": 2.135867385341222e-05, + "loss": 0.0068, + "num_input_tokens_seen": 129337008, + "step": 106290 + }, + { + "epoch": 11.838177970820805, + "grad_norm": 0.4449477791786194, + "learning_rate": 2.1356270044195366e-05, + "loss": 0.0661, + "num_input_tokens_seen": 129343280, + "step": 106295 + }, + { + "epoch": 11.838734825704421, + "grad_norm": 0.04991096630692482, + "learning_rate": 2.1353866269396456e-05, + "loss": 0.0118, + "num_input_tokens_seen": 129349200, + "step": 106300 + }, + { + "epoch": 11.839291680588039, + "grad_norm": 0.5874937772750854, + "learning_rate": 2.1351462529038205e-05, + "loss": 0.0229, + "num_input_tokens_seen": 129355376, + "step": 106305 + }, + { + "epoch": 11.839848535471656, + "grad_norm": 1.4750033617019653, + "learning_rate": 2.134905882314331e-05, + "loss": 0.0789, + "num_input_tokens_seen": 129361520, + "step": 106310 + }, + { + "epoch": 11.840405390355274, + "grad_norm": 0.010997436009347439, + "learning_rate": 2.134665515173448e-05, + "loss": 0.0375, + "num_input_tokens_seen": 129367920, + "step": 106315 + }, + { + "epoch": 11.840962245238892, + "grad_norm": 1.354194164276123, + "learning_rate": 2.134425151483442e-05, + "loss": 0.0541, + "num_input_tokens_seen": 129374224, + "step": 106320 + }, + { + "epoch": 11.841519100122508, + "grad_norm": 0.0022951809223741293, + "learning_rate": 2.1341847912465834e-05, + "loss": 0.0738, + "num_input_tokens_seen": 129380528, + "step": 106325 + }, + { + "epoch": 11.842075955006125, + "grad_norm": 0.00014228749205358326, + "learning_rate": 2.1339444344651416e-05, + "loss": 0.0421, + "num_input_tokens_seen": 129386768, + "step": 106330 + }, + { + "epoch": 11.842632809889743, + "grad_norm": 1.6563116312026978, + "learning_rate": 2.1337040811413896e-05, + "loss": 0.1593, + "num_input_tokens_seen": 129393232, + "step": 106335 + }, + { + "epoch": 11.84318966477336, + "grad_norm": 0.15619802474975586, + "learning_rate": 2.1334637312775944e-05, + "loss": 0.0099, + "num_input_tokens_seen": 129399248, + "step": 106340 + }, + { + "epoch": 11.843746519656978, + "grad_norm": 0.41132915019989014, + "learning_rate": 2.1332233848760294e-05, + "loss": 0.019, + "num_input_tokens_seen": 129405616, + "step": 106345 + }, + { + "epoch": 11.844303374540594, + "grad_norm": 1.0075597763061523, + "learning_rate": 2.132983041938962e-05, + "loss": 0.0371, + "num_input_tokens_seen": 129411728, + "step": 106350 + }, + { + "epoch": 11.844860229424212, + "grad_norm": 0.001888961880467832, + "learning_rate": 2.1327427024686645e-05, + "loss": 0.0884, + "num_input_tokens_seen": 129417520, + "step": 106355 + }, + { + "epoch": 11.84541708430783, + "grad_norm": 0.023540759459137917, + "learning_rate": 2.132502366467406e-05, + "loss": 0.0244, + "num_input_tokens_seen": 129423952, + "step": 106360 + }, + { + "epoch": 11.845973939191447, + "grad_norm": 0.05280585587024689, + "learning_rate": 2.1322620339374578e-05, + "loss": 0.0754, + "num_input_tokens_seen": 129430224, + "step": 106365 + }, + { + "epoch": 11.846530794075065, + "grad_norm": 0.28533995151519775, + "learning_rate": 2.1320217048810886e-05, + "loss": 0.0778, + "num_input_tokens_seen": 129436272, + "step": 106370 + }, + { + "epoch": 11.84708764895868, + "grad_norm": 0.03942515701055527, + "learning_rate": 2.13178137930057e-05, + "loss": 0.0179, + "num_input_tokens_seen": 129442864, + "step": 106375 + }, + { + "epoch": 11.847644503842298, + "grad_norm": 0.0012365557486191392, + "learning_rate": 2.1315410571981708e-05, + "loss": 0.0193, + "num_input_tokens_seen": 129448976, + "step": 106380 + }, + { + "epoch": 11.848201358725916, + "grad_norm": 0.15158939361572266, + "learning_rate": 2.131300738576162e-05, + "loss": 0.0509, + "num_input_tokens_seen": 129455152, + "step": 106385 + }, + { + "epoch": 11.848758213609534, + "grad_norm": 0.002699673641473055, + "learning_rate": 2.1310604234368124e-05, + "loss": 0.0194, + "num_input_tokens_seen": 129461424, + "step": 106390 + }, + { + "epoch": 11.849315068493151, + "grad_norm": 0.4650134742259979, + "learning_rate": 2.130820111782393e-05, + "loss": 0.0434, + "num_input_tokens_seen": 129468144, + "step": 106395 + }, + { + "epoch": 11.849871923376767, + "grad_norm": 0.044377345591783524, + "learning_rate": 2.130579803615173e-05, + "loss": 0.0831, + "num_input_tokens_seen": 129474352, + "step": 106400 + }, + { + "epoch": 11.850428778260385, + "grad_norm": 1.8480695486068726, + "learning_rate": 2.1303394989374236e-05, + "loss": 0.0753, + "num_input_tokens_seen": 129480272, + "step": 106405 + }, + { + "epoch": 11.850985633144003, + "grad_norm": 0.0021199786569923162, + "learning_rate": 2.1300991977514128e-05, + "loss": 0.0415, + "num_input_tokens_seen": 129486384, + "step": 106410 + }, + { + "epoch": 11.85154248802762, + "grad_norm": 2.1683101654052734, + "learning_rate": 2.1298589000594122e-05, + "loss": 0.0809, + "num_input_tokens_seen": 129492656, + "step": 106415 + }, + { + "epoch": 11.852099342911238, + "grad_norm": 0.0001206265515065752, + "learning_rate": 2.1296186058636906e-05, + "loss": 0.0017, + "num_input_tokens_seen": 129498992, + "step": 106420 + }, + { + "epoch": 11.852656197794854, + "grad_norm": 0.04387742280960083, + "learning_rate": 2.1293783151665182e-05, + "loss": 0.0042, + "num_input_tokens_seen": 129504880, + "step": 106425 + }, + { + "epoch": 11.853213052678472, + "grad_norm": 0.0006308297161012888, + "learning_rate": 2.1291380279701642e-05, + "loss": 0.0248, + "num_input_tokens_seen": 129510992, + "step": 106430 + }, + { + "epoch": 11.85376990756209, + "grad_norm": 2.4913265705108643, + "learning_rate": 2.1288977442768993e-05, + "loss": 0.0703, + "num_input_tokens_seen": 129516944, + "step": 106435 + }, + { + "epoch": 11.854326762445707, + "grad_norm": 0.002404897939413786, + "learning_rate": 2.1286574640889918e-05, + "loss": 0.0751, + "num_input_tokens_seen": 129523152, + "step": 106440 + }, + { + "epoch": 11.854883617329325, + "grad_norm": 0.12265463918447495, + "learning_rate": 2.1284171874087125e-05, + "loss": 0.0654, + "num_input_tokens_seen": 129529008, + "step": 106445 + }, + { + "epoch": 11.85544047221294, + "grad_norm": 0.15135081112384796, + "learning_rate": 2.12817691423833e-05, + "loss": 0.0397, + "num_input_tokens_seen": 129534832, + "step": 106450 + }, + { + "epoch": 11.855997327096558, + "grad_norm": 1.971587061882019, + "learning_rate": 2.1279366445801153e-05, + "loss": 0.1367, + "num_input_tokens_seen": 129541232, + "step": 106455 + }, + { + "epoch": 11.856554181980176, + "grad_norm": 0.7426392436027527, + "learning_rate": 2.1276963784363356e-05, + "loss": 0.1042, + "num_input_tokens_seen": 129547344, + "step": 106460 + }, + { + "epoch": 11.857111036863794, + "grad_norm": 0.0006300556706264615, + "learning_rate": 2.127456115809264e-05, + "loss": 0.0628, + "num_input_tokens_seen": 129553392, + "step": 106465 + }, + { + "epoch": 11.857667891747411, + "grad_norm": 0.382883757352829, + "learning_rate": 2.127215856701166e-05, + "loss": 0.1613, + "num_input_tokens_seen": 129559504, + "step": 106470 + }, + { + "epoch": 11.858224746631027, + "grad_norm": 0.06665342301130295, + "learning_rate": 2.1269756011143146e-05, + "loss": 0.0078, + "num_input_tokens_seen": 129565488, + "step": 106475 + }, + { + "epoch": 11.858781601514645, + "grad_norm": 2.3427977561950684, + "learning_rate": 2.126735349050976e-05, + "loss": 0.1019, + "num_input_tokens_seen": 129571824, + "step": 106480 + }, + { + "epoch": 11.859338456398262, + "grad_norm": 0.0017145745223388076, + "learning_rate": 2.126495100513422e-05, + "loss": 0.0262, + "num_input_tokens_seen": 129578416, + "step": 106485 + }, + { + "epoch": 11.85989531128188, + "grad_norm": 0.034824032336473465, + "learning_rate": 2.1262548555039203e-05, + "loss": 0.0174, + "num_input_tokens_seen": 129584688, + "step": 106490 + }, + { + "epoch": 11.860452166165498, + "grad_norm": 0.028483696281909943, + "learning_rate": 2.126014614024742e-05, + "loss": 0.0177, + "num_input_tokens_seen": 129590928, + "step": 106495 + }, + { + "epoch": 11.861009021049114, + "grad_norm": 0.03138642758131027, + "learning_rate": 2.125774376078154e-05, + "loss": 0.0558, + "num_input_tokens_seen": 129597360, + "step": 106500 + }, + { + "epoch": 11.861565875932731, + "grad_norm": 0.016768770292401314, + "learning_rate": 2.125534141666428e-05, + "loss": 0.0188, + "num_input_tokens_seen": 129603856, + "step": 106505 + }, + { + "epoch": 11.862122730816349, + "grad_norm": 0.22790895402431488, + "learning_rate": 2.1252939107918314e-05, + "loss": 0.0126, + "num_input_tokens_seen": 129609968, + "step": 106510 + }, + { + "epoch": 11.862679585699967, + "grad_norm": 0.45259466767311096, + "learning_rate": 2.125053683456634e-05, + "loss": 0.0183, + "num_input_tokens_seen": 129615984, + "step": 106515 + }, + { + "epoch": 11.863236440583584, + "grad_norm": 0.3497265875339508, + "learning_rate": 2.1248134596631052e-05, + "loss": 0.1864, + "num_input_tokens_seen": 129622160, + "step": 106520 + }, + { + "epoch": 11.863793295467202, + "grad_norm": 1.9647194147109985, + "learning_rate": 2.124573239413514e-05, + "loss": 0.1146, + "num_input_tokens_seen": 129628176, + "step": 106525 + }, + { + "epoch": 11.864350150350818, + "grad_norm": 0.003221312537789345, + "learning_rate": 2.1243330227101282e-05, + "loss": 0.083, + "num_input_tokens_seen": 129634160, + "step": 106530 + }, + { + "epoch": 11.864907005234436, + "grad_norm": 0.2622491419315338, + "learning_rate": 2.12409280955522e-05, + "loss": 0.0051, + "num_input_tokens_seen": 129639728, + "step": 106535 + }, + { + "epoch": 11.865463860118053, + "grad_norm": 0.1199725791811943, + "learning_rate": 2.1238525999510545e-05, + "loss": 0.0694, + "num_input_tokens_seen": 129645392, + "step": 106540 + }, + { + "epoch": 11.866020715001671, + "grad_norm": 0.0009692496387287974, + "learning_rate": 2.1236123938999036e-05, + "loss": 0.0009, + "num_input_tokens_seen": 129651600, + "step": 106545 + }, + { + "epoch": 11.866577569885289, + "grad_norm": 0.22381313145160675, + "learning_rate": 2.123372191404035e-05, + "loss": 0.0941, + "num_input_tokens_seen": 129657296, + "step": 106550 + }, + { + "epoch": 11.867134424768905, + "grad_norm": 0.032164864242076874, + "learning_rate": 2.1231319924657177e-05, + "loss": 0.1505, + "num_input_tokens_seen": 129663664, + "step": 106555 + }, + { + "epoch": 11.867691279652522, + "grad_norm": 0.6277604103088379, + "learning_rate": 2.1228917970872206e-05, + "loss": 0.0278, + "num_input_tokens_seen": 129669840, + "step": 106560 + }, + { + "epoch": 11.86824813453614, + "grad_norm": 0.012656360864639282, + "learning_rate": 2.1226516052708127e-05, + "loss": 0.0963, + "num_input_tokens_seen": 129675888, + "step": 106565 + }, + { + "epoch": 11.868804989419758, + "grad_norm": 0.8334646821022034, + "learning_rate": 2.122411417018763e-05, + "loss": 0.1106, + "num_input_tokens_seen": 129681616, + "step": 106570 + }, + { + "epoch": 11.869361844303375, + "grad_norm": 0.023727448657155037, + "learning_rate": 2.1221712323333398e-05, + "loss": 0.0199, + "num_input_tokens_seen": 129687504, + "step": 106575 + }, + { + "epoch": 11.869918699186991, + "grad_norm": 0.49379023909568787, + "learning_rate": 2.1219310512168124e-05, + "loss": 0.1139, + "num_input_tokens_seen": 129693712, + "step": 106580 + }, + { + "epoch": 11.870475554070609, + "grad_norm": 0.5586735606193542, + "learning_rate": 2.1216908736714484e-05, + "loss": 0.0382, + "num_input_tokens_seen": 129699920, + "step": 106585 + }, + { + "epoch": 11.871032408954227, + "grad_norm": 0.13593202829360962, + "learning_rate": 2.1214506996995175e-05, + "loss": 0.0297, + "num_input_tokens_seen": 129705840, + "step": 106590 + }, + { + "epoch": 11.871589263837844, + "grad_norm": 0.0018577276496216655, + "learning_rate": 2.121210529303288e-05, + "loss": 0.0056, + "num_input_tokens_seen": 129711920, + "step": 106595 + }, + { + "epoch": 11.872146118721462, + "grad_norm": 0.00011106507736258209, + "learning_rate": 2.120970362485029e-05, + "loss": 0.0084, + "num_input_tokens_seen": 129718256, + "step": 106600 + }, + { + "epoch": 11.872702973605078, + "grad_norm": 0.05200226977467537, + "learning_rate": 2.1207301992470072e-05, + "loss": 0.0251, + "num_input_tokens_seen": 129724464, + "step": 106605 + }, + { + "epoch": 11.873259828488695, + "grad_norm": 0.006861603353172541, + "learning_rate": 2.1204900395914944e-05, + "loss": 0.0041, + "num_input_tokens_seen": 129730640, + "step": 106610 + }, + { + "epoch": 11.873816683372313, + "grad_norm": 0.005720495246350765, + "learning_rate": 2.1202498835207554e-05, + "loss": 0.0077, + "num_input_tokens_seen": 129736624, + "step": 106615 + }, + { + "epoch": 11.87437353825593, + "grad_norm": 0.304025262594223, + "learning_rate": 2.1200097310370625e-05, + "loss": 0.1347, + "num_input_tokens_seen": 129742960, + "step": 106620 + }, + { + "epoch": 11.874930393139548, + "grad_norm": 0.028901077806949615, + "learning_rate": 2.11976958214268e-05, + "loss": 0.0361, + "num_input_tokens_seen": 129749168, + "step": 106625 + }, + { + "epoch": 11.875487248023164, + "grad_norm": 0.7174519300460815, + "learning_rate": 2.11952943683988e-05, + "loss": 0.0246, + "num_input_tokens_seen": 129755056, + "step": 106630 + }, + { + "epoch": 11.876044102906782, + "grad_norm": 1.0692765712738037, + "learning_rate": 2.1192892951309286e-05, + "loss": 0.047, + "num_input_tokens_seen": 129761200, + "step": 106635 + }, + { + "epoch": 11.8766009577904, + "grad_norm": 0.032740265130996704, + "learning_rate": 2.119049157018095e-05, + "loss": 0.0142, + "num_input_tokens_seen": 129767248, + "step": 106640 + }, + { + "epoch": 11.877157812674017, + "grad_norm": 0.00020310189574956894, + "learning_rate": 2.118809022503647e-05, + "loss": 0.0837, + "num_input_tokens_seen": 129773328, + "step": 106645 + }, + { + "epoch": 11.877714667557635, + "grad_norm": 0.8527345657348633, + "learning_rate": 2.118568891589854e-05, + "loss": 0.0152, + "num_input_tokens_seen": 129779664, + "step": 106650 + }, + { + "epoch": 11.878271522441253, + "grad_norm": 0.010609288699924946, + "learning_rate": 2.1183287642789826e-05, + "loss": 0.0964, + "num_input_tokens_seen": 129786096, + "step": 106655 + }, + { + "epoch": 11.878828377324869, + "grad_norm": 0.0008496216032654047, + "learning_rate": 2.1180886405733024e-05, + "loss": 0.086, + "num_input_tokens_seen": 129792048, + "step": 106660 + }, + { + "epoch": 11.879385232208486, + "grad_norm": 0.01580912061035633, + "learning_rate": 2.1178485204750804e-05, + "loss": 0.0073, + "num_input_tokens_seen": 129798096, + "step": 106665 + }, + { + "epoch": 11.879942087092104, + "grad_norm": 0.4574982821941376, + "learning_rate": 2.1176084039865858e-05, + "loss": 0.0677, + "num_input_tokens_seen": 129804176, + "step": 106670 + }, + { + "epoch": 11.880498941975722, + "grad_norm": 0.1515554040670395, + "learning_rate": 2.1173682911100853e-05, + "loss": 0.1651, + "num_input_tokens_seen": 129810480, + "step": 106675 + }, + { + "epoch": 11.88105579685934, + "grad_norm": 1.445664644241333, + "learning_rate": 2.1171281818478494e-05, + "loss": 0.039, + "num_input_tokens_seen": 129816016, + "step": 106680 + }, + { + "epoch": 11.881612651742955, + "grad_norm": 0.2793511152267456, + "learning_rate": 2.1168880762021433e-05, + "loss": 0.0073, + "num_input_tokens_seen": 129822128, + "step": 106685 + }, + { + "epoch": 11.882169506626573, + "grad_norm": 0.05141495168209076, + "learning_rate": 2.1166479741752367e-05, + "loss": 0.0191, + "num_input_tokens_seen": 129828528, + "step": 106690 + }, + { + "epoch": 11.88272636151019, + "grad_norm": 0.5030204653739929, + "learning_rate": 2.116407875769397e-05, + "loss": 0.0352, + "num_input_tokens_seen": 129834768, + "step": 106695 + }, + { + "epoch": 11.883283216393808, + "grad_norm": 0.6895280480384827, + "learning_rate": 2.1161677809868924e-05, + "loss": 0.1007, + "num_input_tokens_seen": 129840144, + "step": 106700 + }, + { + "epoch": 11.883840071277426, + "grad_norm": 0.01848553493618965, + "learning_rate": 2.1159276898299905e-05, + "loss": 0.1009, + "num_input_tokens_seen": 129846256, + "step": 106705 + }, + { + "epoch": 11.884396926161042, + "grad_norm": 1.7824851274490356, + "learning_rate": 2.1156876023009598e-05, + "loss": 0.1608, + "num_input_tokens_seen": 129852368, + "step": 106710 + }, + { + "epoch": 11.88495378104466, + "grad_norm": 0.44732633233070374, + "learning_rate": 2.1154475184020666e-05, + "loss": 0.0445, + "num_input_tokens_seen": 129858384, + "step": 106715 + }, + { + "epoch": 11.885510635928277, + "grad_norm": 0.6351332664489746, + "learning_rate": 2.1152074381355808e-05, + "loss": 0.0344, + "num_input_tokens_seen": 129864624, + "step": 106720 + }, + { + "epoch": 11.886067490811895, + "grad_norm": 0.2546214461326599, + "learning_rate": 2.1149673615037682e-05, + "loss": 0.0049, + "num_input_tokens_seen": 129870608, + "step": 106725 + }, + { + "epoch": 11.886624345695513, + "grad_norm": 0.546734094619751, + "learning_rate": 2.114727288508898e-05, + "loss": 0.0515, + "num_input_tokens_seen": 129876656, + "step": 106730 + }, + { + "epoch": 11.887181200579128, + "grad_norm": 1.0191069841384888, + "learning_rate": 2.1144872191532362e-05, + "loss": 0.0671, + "num_input_tokens_seen": 129882768, + "step": 106735 + }, + { + "epoch": 11.887738055462746, + "grad_norm": 0.2507193982601166, + "learning_rate": 2.114247153439053e-05, + "loss": 0.0505, + "num_input_tokens_seen": 129888560, + "step": 106740 + }, + { + "epoch": 11.888294910346364, + "grad_norm": 0.8746111989021301, + "learning_rate": 2.1140070913686128e-05, + "loss": 0.0967, + "num_input_tokens_seen": 129894928, + "step": 106745 + }, + { + "epoch": 11.888851765229981, + "grad_norm": 0.050269804894924164, + "learning_rate": 2.1137670329441864e-05, + "loss": 0.0984, + "num_input_tokens_seen": 129901168, + "step": 106750 + }, + { + "epoch": 11.8894086201136, + "grad_norm": 0.07416782528162003, + "learning_rate": 2.1135269781680384e-05, + "loss": 0.0058, + "num_input_tokens_seen": 129907280, + "step": 106755 + }, + { + "epoch": 11.889965474997215, + "grad_norm": 0.18627895414829254, + "learning_rate": 2.1132869270424387e-05, + "loss": 0.0215, + "num_input_tokens_seen": 129913552, + "step": 106760 + }, + { + "epoch": 11.890522329880833, + "grad_norm": 0.05721632018685341, + "learning_rate": 2.1130468795696533e-05, + "loss": 0.0403, + "num_input_tokens_seen": 129919600, + "step": 106765 + }, + { + "epoch": 11.89107918476445, + "grad_norm": 0.0029812215361744165, + "learning_rate": 2.112806835751951e-05, + "loss": 0.0386, + "num_input_tokens_seen": 129925936, + "step": 106770 + }, + { + "epoch": 11.891636039648068, + "grad_norm": 0.21906006336212158, + "learning_rate": 2.1125667955915973e-05, + "loss": 0.0353, + "num_input_tokens_seen": 129932080, + "step": 106775 + }, + { + "epoch": 11.892192894531686, + "grad_norm": 1.143186092376709, + "learning_rate": 2.1123267590908617e-05, + "loss": 0.0198, + "num_input_tokens_seen": 129937968, + "step": 106780 + }, + { + "epoch": 11.892749749415302, + "grad_norm": 0.00022749119671061635, + "learning_rate": 2.1120867262520094e-05, + "loss": 0.0381, + "num_input_tokens_seen": 129944272, + "step": 106785 + }, + { + "epoch": 11.89330660429892, + "grad_norm": 0.36090701818466187, + "learning_rate": 2.1118466970773095e-05, + "loss": 0.0171, + "num_input_tokens_seen": 129950096, + "step": 106790 + }, + { + "epoch": 11.893863459182537, + "grad_norm": 0.1648135483264923, + "learning_rate": 2.111606671569028e-05, + "loss": 0.0571, + "num_input_tokens_seen": 129956016, + "step": 106795 + }, + { + "epoch": 11.894420314066155, + "grad_norm": 0.001310308463871479, + "learning_rate": 2.1113666497294332e-05, + "loss": 0.0733, + "num_input_tokens_seen": 129962096, + "step": 106800 + }, + { + "epoch": 11.894977168949772, + "grad_norm": 0.0034150294959545135, + "learning_rate": 2.1111266315607908e-05, + "loss": 0.01, + "num_input_tokens_seen": 129968112, + "step": 106805 + }, + { + "epoch": 11.895534023833388, + "grad_norm": 0.5493070483207703, + "learning_rate": 2.1108866170653704e-05, + "loss": 0.0884, + "num_input_tokens_seen": 129974448, + "step": 106810 + }, + { + "epoch": 11.896090878717006, + "grad_norm": 0.015941541641950607, + "learning_rate": 2.110646606245436e-05, + "loss": 0.0611, + "num_input_tokens_seen": 129980240, + "step": 106815 + }, + { + "epoch": 11.896647733600624, + "grad_norm": 0.21415242552757263, + "learning_rate": 2.1104065991032574e-05, + "loss": 0.0038, + "num_input_tokens_seen": 129986448, + "step": 106820 + }, + { + "epoch": 11.897204588484241, + "grad_norm": 0.5507740378379822, + "learning_rate": 2.1101665956411005e-05, + "loss": 0.0585, + "num_input_tokens_seen": 129992912, + "step": 106825 + }, + { + "epoch": 11.897761443367859, + "grad_norm": 0.00016363142640329897, + "learning_rate": 2.1099265958612324e-05, + "loss": 0.1181, + "num_input_tokens_seen": 129999024, + "step": 106830 + }, + { + "epoch": 11.898318298251475, + "grad_norm": 0.00045765144750475883, + "learning_rate": 2.10968659976592e-05, + "loss": 0.1474, + "num_input_tokens_seen": 130005040, + "step": 106835 + }, + { + "epoch": 11.898875153135092, + "grad_norm": 0.0006420619902200997, + "learning_rate": 2.1094466073574308e-05, + "loss": 0.0795, + "num_input_tokens_seen": 130011184, + "step": 106840 + }, + { + "epoch": 11.89943200801871, + "grad_norm": 1.1605862379074097, + "learning_rate": 2.1092066186380304e-05, + "loss": 0.0379, + "num_input_tokens_seen": 130017552, + "step": 106845 + }, + { + "epoch": 11.899988862902328, + "grad_norm": 0.00139518897049129, + "learning_rate": 2.1089666336099874e-05, + "loss": 0.0085, + "num_input_tokens_seen": 130023792, + "step": 106850 + }, + { + "epoch": 11.900545717785945, + "grad_norm": 0.00015331829490605742, + "learning_rate": 2.1087266522755675e-05, + "loss": 0.0642, + "num_input_tokens_seen": 130030000, + "step": 106855 + }, + { + "epoch": 11.901102572669561, + "grad_norm": 0.924568235874176, + "learning_rate": 2.1084866746370382e-05, + "loss": 0.0292, + "num_input_tokens_seen": 130036464, + "step": 106860 + }, + { + "epoch": 11.901659427553179, + "grad_norm": 0.00023830588907003403, + "learning_rate": 2.108246700696665e-05, + "loss": 0.0097, + "num_input_tokens_seen": 130042416, + "step": 106865 + }, + { + "epoch": 11.902216282436797, + "grad_norm": 0.37174278497695923, + "learning_rate": 2.108006730456717e-05, + "loss": 0.0262, + "num_input_tokens_seen": 130048432, + "step": 106870 + }, + { + "epoch": 11.902773137320414, + "grad_norm": 1.1937906742095947, + "learning_rate": 2.107766763919458e-05, + "loss": 0.1168, + "num_input_tokens_seen": 130054544, + "step": 106875 + }, + { + "epoch": 11.903329992204032, + "grad_norm": 0.0007780031301081181, + "learning_rate": 2.107526801087157e-05, + "loss": 0.0085, + "num_input_tokens_seen": 130060560, + "step": 106880 + }, + { + "epoch": 11.90388684708765, + "grad_norm": 0.7394070625305176, + "learning_rate": 2.1072868419620795e-05, + "loss": 0.0593, + "num_input_tokens_seen": 130066224, + "step": 106885 + }, + { + "epoch": 11.904443701971266, + "grad_norm": 0.37978360056877136, + "learning_rate": 2.107046886546493e-05, + "loss": 0.0076, + "num_input_tokens_seen": 130072624, + "step": 106890 + }, + { + "epoch": 11.905000556854883, + "grad_norm": 1.6161623001098633, + "learning_rate": 2.1068069348426628e-05, + "loss": 0.0722, + "num_input_tokens_seen": 130079024, + "step": 106895 + }, + { + "epoch": 11.905557411738501, + "grad_norm": 0.06181642413139343, + "learning_rate": 2.106566986852857e-05, + "loss": 0.0294, + "num_input_tokens_seen": 130085168, + "step": 106900 + }, + { + "epoch": 11.906114266622119, + "grad_norm": 0.7185317277908325, + "learning_rate": 2.1063270425793403e-05, + "loss": 0.1473, + "num_input_tokens_seen": 130091440, + "step": 106905 + }, + { + "epoch": 11.906671121505736, + "grad_norm": 0.024498173967003822, + "learning_rate": 2.1060871020243804e-05, + "loss": 0.0682, + "num_input_tokens_seen": 130097520, + "step": 106910 + }, + { + "epoch": 11.907227976389352, + "grad_norm": 0.32872310280799866, + "learning_rate": 2.1058471651902434e-05, + "loss": 0.0073, + "num_input_tokens_seen": 130103536, + "step": 106915 + }, + { + "epoch": 11.90778483127297, + "grad_norm": 0.0038014070596545935, + "learning_rate": 2.105607232079196e-05, + "loss": 0.0208, + "num_input_tokens_seen": 130109392, + "step": 106920 + }, + { + "epoch": 11.908341686156588, + "grad_norm": 0.010280580259859562, + "learning_rate": 2.105367302693504e-05, + "loss": 0.0044, + "num_input_tokens_seen": 130115152, + "step": 106925 + }, + { + "epoch": 11.908898541040205, + "grad_norm": 0.21971416473388672, + "learning_rate": 2.105127377035434e-05, + "loss": 0.0171, + "num_input_tokens_seen": 130121264, + "step": 106930 + }, + { + "epoch": 11.909455395923823, + "grad_norm": 0.4121394157409668, + "learning_rate": 2.1048874551072517e-05, + "loss": 0.1158, + "num_input_tokens_seen": 130127408, + "step": 106935 + }, + { + "epoch": 11.910012250807439, + "grad_norm": 0.8905870318412781, + "learning_rate": 2.1046475369112256e-05, + "loss": 0.2074, + "num_input_tokens_seen": 130133456, + "step": 106940 + }, + { + "epoch": 11.910569105691057, + "grad_norm": 1.1718982458114624, + "learning_rate": 2.1044076224496184e-05, + "loss": 0.2465, + "num_input_tokens_seen": 130139344, + "step": 106945 + }, + { + "epoch": 11.911125960574674, + "grad_norm": 0.0726916566491127, + "learning_rate": 2.1041677117246994e-05, + "loss": 0.0396, + "num_input_tokens_seen": 130145584, + "step": 106950 + }, + { + "epoch": 11.911682815458292, + "grad_norm": 0.5849762558937073, + "learning_rate": 2.1039278047387326e-05, + "loss": 0.0894, + "num_input_tokens_seen": 130151184, + "step": 106955 + }, + { + "epoch": 11.91223967034191, + "grad_norm": 0.01701389253139496, + "learning_rate": 2.103687901493986e-05, + "loss": 0.0083, + "num_input_tokens_seen": 130157360, + "step": 106960 + }, + { + "epoch": 11.912796525225525, + "grad_norm": 0.12426462024450302, + "learning_rate": 2.1034480019927238e-05, + "loss": 0.0966, + "num_input_tokens_seen": 130163472, + "step": 106965 + }, + { + "epoch": 11.913353380109143, + "grad_norm": 0.5785426497459412, + "learning_rate": 2.1032081062372134e-05, + "loss": 0.0272, + "num_input_tokens_seen": 130169936, + "step": 106970 + }, + { + "epoch": 11.91391023499276, + "grad_norm": 0.3218823969364166, + "learning_rate": 2.1029682142297202e-05, + "loss": 0.0303, + "num_input_tokens_seen": 130175984, + "step": 106975 + }, + { + "epoch": 11.914467089876378, + "grad_norm": 0.23665696382522583, + "learning_rate": 2.1027283259725107e-05, + "loss": 0.056, + "num_input_tokens_seen": 130182128, + "step": 106980 + }, + { + "epoch": 11.915023944759996, + "grad_norm": 0.3153440058231354, + "learning_rate": 2.10248844146785e-05, + "loss": 0.0193, + "num_input_tokens_seen": 130188304, + "step": 106985 + }, + { + "epoch": 11.915580799643612, + "grad_norm": 0.4209771454334259, + "learning_rate": 2.1022485607180047e-05, + "loss": 0.0376, + "num_input_tokens_seen": 130194480, + "step": 106990 + }, + { + "epoch": 11.91613765452723, + "grad_norm": 0.001583638833835721, + "learning_rate": 2.1020086837252407e-05, + "loss": 0.0222, + "num_input_tokens_seen": 130200752, + "step": 106995 + }, + { + "epoch": 11.916694509410847, + "grad_norm": 1.3317992687225342, + "learning_rate": 2.1017688104918228e-05, + "loss": 0.2209, + "num_input_tokens_seen": 130206736, + "step": 107000 + }, + { + "epoch": 11.917251364294465, + "grad_norm": 1.6083487272262573, + "learning_rate": 2.1015289410200182e-05, + "loss": 0.1264, + "num_input_tokens_seen": 130212880, + "step": 107005 + }, + { + "epoch": 11.917808219178083, + "grad_norm": 0.00020542071433737874, + "learning_rate": 2.1012890753120913e-05, + "loss": 0.0641, + "num_input_tokens_seen": 130218736, + "step": 107010 + }, + { + "epoch": 11.9183650740617, + "grad_norm": 0.0024091757368296385, + "learning_rate": 2.1010492133703093e-05, + "loss": 0.0075, + "num_input_tokens_seen": 130225136, + "step": 107015 + }, + { + "epoch": 11.918921928945316, + "grad_norm": 0.017681511119008064, + "learning_rate": 2.100809355196936e-05, + "loss": 0.0013, + "num_input_tokens_seen": 130231440, + "step": 107020 + }, + { + "epoch": 11.919478783828934, + "grad_norm": 0.11397493630647659, + "learning_rate": 2.10056950079424e-05, + "loss": 0.0233, + "num_input_tokens_seen": 130237936, + "step": 107025 + }, + { + "epoch": 11.920035638712552, + "grad_norm": 0.13886088132858276, + "learning_rate": 2.1003296501644833e-05, + "loss": 0.0521, + "num_input_tokens_seen": 130244112, + "step": 107030 + }, + { + "epoch": 11.92059249359617, + "grad_norm": 2.8912317752838135, + "learning_rate": 2.100089803309934e-05, + "loss": 0.1773, + "num_input_tokens_seen": 130250352, + "step": 107035 + }, + { + "epoch": 11.921149348479787, + "grad_norm": 0.005679558031260967, + "learning_rate": 2.0998499602328567e-05, + "loss": 0.0078, + "num_input_tokens_seen": 130256368, + "step": 107040 + }, + { + "epoch": 11.921706203363403, + "grad_norm": 0.8877180814743042, + "learning_rate": 2.0996101209355174e-05, + "loss": 0.1186, + "num_input_tokens_seen": 130262704, + "step": 107045 + }, + { + "epoch": 11.92226305824702, + "grad_norm": 0.004393483512103558, + "learning_rate": 2.099370285420181e-05, + "loss": 0.0291, + "num_input_tokens_seen": 130268592, + "step": 107050 + }, + { + "epoch": 11.922819913130638, + "grad_norm": 0.001875335699878633, + "learning_rate": 2.0991304536891137e-05, + "loss": 0.0671, + "num_input_tokens_seen": 130274416, + "step": 107055 + }, + { + "epoch": 11.923376768014256, + "grad_norm": 0.49397873878479004, + "learning_rate": 2.09889062574458e-05, + "loss": 0.0387, + "num_input_tokens_seen": 130280496, + "step": 107060 + }, + { + "epoch": 11.923933622897874, + "grad_norm": 0.06242994964122772, + "learning_rate": 2.0986508015888463e-05, + "loss": 0.0242, + "num_input_tokens_seen": 130286768, + "step": 107065 + }, + { + "epoch": 11.92449047778149, + "grad_norm": 0.08703596889972687, + "learning_rate": 2.0984109812241766e-05, + "loss": 0.0166, + "num_input_tokens_seen": 130292976, + "step": 107070 + }, + { + "epoch": 11.925047332665107, + "grad_norm": 0.17336221039295197, + "learning_rate": 2.0981711646528373e-05, + "loss": 0.0354, + "num_input_tokens_seen": 130299184, + "step": 107075 + }, + { + "epoch": 11.925604187548725, + "grad_norm": 0.5736235976219177, + "learning_rate": 2.0979313518770925e-05, + "loss": 0.0269, + "num_input_tokens_seen": 130305008, + "step": 107080 + }, + { + "epoch": 11.926161042432343, + "grad_norm": 0.021989595144987106, + "learning_rate": 2.0976915428992098e-05, + "loss": 0.0178, + "num_input_tokens_seen": 130310864, + "step": 107085 + }, + { + "epoch": 11.92671789731596, + "grad_norm": 0.017703965306282043, + "learning_rate": 2.097451737721451e-05, + "loss": 0.0163, + "num_input_tokens_seen": 130316880, + "step": 107090 + }, + { + "epoch": 11.927274752199576, + "grad_norm": 0.004076228477060795, + "learning_rate": 2.0972119363460842e-05, + "loss": 0.0327, + "num_input_tokens_seen": 130323248, + "step": 107095 + }, + { + "epoch": 11.927831607083194, + "grad_norm": 1.6509172916412354, + "learning_rate": 2.096972138775373e-05, + "loss": 0.0931, + "num_input_tokens_seen": 130329136, + "step": 107100 + }, + { + "epoch": 11.928388461966811, + "grad_norm": 0.48699554800987244, + "learning_rate": 2.0967323450115834e-05, + "loss": 0.0326, + "num_input_tokens_seen": 130335248, + "step": 107105 + }, + { + "epoch": 11.92894531685043, + "grad_norm": 0.19258908927440643, + "learning_rate": 2.096492555056979e-05, + "loss": 0.0067, + "num_input_tokens_seen": 130341296, + "step": 107110 + }, + { + "epoch": 11.929502171734047, + "grad_norm": 0.08418063819408417, + "learning_rate": 2.0962527689138266e-05, + "loss": 0.0297, + "num_input_tokens_seen": 130347344, + "step": 107115 + }, + { + "epoch": 11.930059026617663, + "grad_norm": 0.6728242039680481, + "learning_rate": 2.0960129865843894e-05, + "loss": 0.0586, + "num_input_tokens_seen": 130353296, + "step": 107120 + }, + { + "epoch": 11.93061588150128, + "grad_norm": 2.8096909523010254, + "learning_rate": 2.095773208070934e-05, + "loss": 0.1328, + "num_input_tokens_seen": 130359536, + "step": 107125 + }, + { + "epoch": 11.931172736384898, + "grad_norm": 3.2677602767944336, + "learning_rate": 2.095533433375724e-05, + "loss": 0.0911, + "num_input_tokens_seen": 130365808, + "step": 107130 + }, + { + "epoch": 11.931729591268516, + "grad_norm": 0.14266233146190643, + "learning_rate": 2.0952936625010252e-05, + "loss": 0.0105, + "num_input_tokens_seen": 130371792, + "step": 107135 + }, + { + "epoch": 11.932286446152133, + "grad_norm": 0.00043277523946017027, + "learning_rate": 2.0950538954491012e-05, + "loss": 0.0258, + "num_input_tokens_seen": 130378000, + "step": 107140 + }, + { + "epoch": 11.93284330103575, + "grad_norm": 0.0034264831338077784, + "learning_rate": 2.0948141322222188e-05, + "loss": 0.0074, + "num_input_tokens_seen": 130384144, + "step": 107145 + }, + { + "epoch": 11.933400155919367, + "grad_norm": 0.2415260523557663, + "learning_rate": 2.09457437282264e-05, + "loss": 0.0294, + "num_input_tokens_seen": 130390384, + "step": 107150 + }, + { + "epoch": 11.933957010802985, + "grad_norm": 1.0494797229766846, + "learning_rate": 2.0943346172526323e-05, + "loss": 0.0227, + "num_input_tokens_seen": 130396208, + "step": 107155 + }, + { + "epoch": 11.934513865686602, + "grad_norm": 0.0010311921359971166, + "learning_rate": 2.0940948655144588e-05, + "loss": 0.0575, + "num_input_tokens_seen": 130402352, + "step": 107160 + }, + { + "epoch": 11.93507072057022, + "grad_norm": 1.8920087814331055, + "learning_rate": 2.0938551176103848e-05, + "loss": 0.0211, + "num_input_tokens_seen": 130407664, + "step": 107165 + }, + { + "epoch": 11.935627575453836, + "grad_norm": 0.09883236140012741, + "learning_rate": 2.093615373542674e-05, + "loss": 0.0014, + "num_input_tokens_seen": 130413712, + "step": 107170 + }, + { + "epoch": 11.936184430337454, + "grad_norm": 0.0292301457375288, + "learning_rate": 2.0933756333135923e-05, + "loss": 0.0602, + "num_input_tokens_seen": 130419728, + "step": 107175 + }, + { + "epoch": 11.936741285221071, + "grad_norm": 1.2803010940551758, + "learning_rate": 2.0931358969254027e-05, + "loss": 0.0291, + "num_input_tokens_seen": 130425584, + "step": 107180 + }, + { + "epoch": 11.937298140104689, + "grad_norm": 0.7850714325904846, + "learning_rate": 2.092896164380371e-05, + "loss": 0.0105, + "num_input_tokens_seen": 130431696, + "step": 107185 + }, + { + "epoch": 11.937854994988307, + "grad_norm": 0.039833128452301025, + "learning_rate": 2.0926564356807617e-05, + "loss": 0.0049, + "num_input_tokens_seen": 130437872, + "step": 107190 + }, + { + "epoch": 11.938411849871922, + "grad_norm": 1.9734972715377808, + "learning_rate": 2.092416710828838e-05, + "loss": 0.2093, + "num_input_tokens_seen": 130443952, + "step": 107195 + }, + { + "epoch": 11.93896870475554, + "grad_norm": 0.5334061980247498, + "learning_rate": 2.092176989826865e-05, + "loss": 0.079, + "num_input_tokens_seen": 130450192, + "step": 107200 + }, + { + "epoch": 11.939525559639158, + "grad_norm": 0.568037748336792, + "learning_rate": 2.091937272677108e-05, + "loss": 0.0186, + "num_input_tokens_seen": 130456272, + "step": 107205 + }, + { + "epoch": 11.940082414522776, + "grad_norm": 0.347278892993927, + "learning_rate": 2.0916975593818287e-05, + "loss": 0.0358, + "num_input_tokens_seen": 130462448, + "step": 107210 + }, + { + "epoch": 11.940639269406393, + "grad_norm": 0.0014733609277755022, + "learning_rate": 2.0914578499432948e-05, + "loss": 0.047, + "num_input_tokens_seen": 130468368, + "step": 107215 + }, + { + "epoch": 11.941196124290009, + "grad_norm": 0.007745648734271526, + "learning_rate": 2.0912181443637674e-05, + "loss": 0.0091, + "num_input_tokens_seen": 130474448, + "step": 107220 + }, + { + "epoch": 11.941752979173627, + "grad_norm": 0.7600090503692627, + "learning_rate": 2.090978442645513e-05, + "loss": 0.0309, + "num_input_tokens_seen": 130480560, + "step": 107225 + }, + { + "epoch": 11.942309834057244, + "grad_norm": 0.3679921329021454, + "learning_rate": 2.0907387447907943e-05, + "loss": 0.0715, + "num_input_tokens_seen": 130486800, + "step": 107230 + }, + { + "epoch": 11.942866688940862, + "grad_norm": 1.444699764251709, + "learning_rate": 2.0904990508018767e-05, + "loss": 0.0285, + "num_input_tokens_seen": 130493200, + "step": 107235 + }, + { + "epoch": 11.94342354382448, + "grad_norm": 0.08973075449466705, + "learning_rate": 2.0902593606810232e-05, + "loss": 0.12, + "num_input_tokens_seen": 130499536, + "step": 107240 + }, + { + "epoch": 11.943980398708097, + "grad_norm": 0.10140416026115417, + "learning_rate": 2.090019674430499e-05, + "loss": 0.0048, + "num_input_tokens_seen": 130505488, + "step": 107245 + }, + { + "epoch": 11.944537253591713, + "grad_norm": 0.4300418198108673, + "learning_rate": 2.089779992052567e-05, + "loss": 0.0861, + "num_input_tokens_seen": 130511408, + "step": 107250 + }, + { + "epoch": 11.945094108475331, + "grad_norm": 1.6856880187988281, + "learning_rate": 2.089540313549492e-05, + "loss": 0.0988, + "num_input_tokens_seen": 130517520, + "step": 107255 + }, + { + "epoch": 11.945650963358949, + "grad_norm": 0.0023566398303955793, + "learning_rate": 2.0893006389235367e-05, + "loss": 0.0235, + "num_input_tokens_seen": 130523984, + "step": 107260 + }, + { + "epoch": 11.946207818242566, + "grad_norm": 2.8759312629699707, + "learning_rate": 2.0890609681769668e-05, + "loss": 0.1849, + "num_input_tokens_seen": 130529904, + "step": 107265 + }, + { + "epoch": 11.946764673126184, + "grad_norm": 0.07381235063076019, + "learning_rate": 2.0888213013120442e-05, + "loss": 0.0239, + "num_input_tokens_seen": 130536048, + "step": 107270 + }, + { + "epoch": 11.9473215280098, + "grad_norm": 0.020602842792868614, + "learning_rate": 2.0885816383310354e-05, + "loss": 0.0924, + "num_input_tokens_seen": 130541904, + "step": 107275 + }, + { + "epoch": 11.947878382893418, + "grad_norm": 0.19981348514556885, + "learning_rate": 2.088341979236201e-05, + "loss": 0.0171, + "num_input_tokens_seen": 130548080, + "step": 107280 + }, + { + "epoch": 11.948435237777035, + "grad_norm": 0.0033591294195502996, + "learning_rate": 2.0881023240298075e-05, + "loss": 0.0438, + "num_input_tokens_seen": 130554160, + "step": 107285 + }, + { + "epoch": 11.948992092660653, + "grad_norm": 1.5884140729904175, + "learning_rate": 2.087862672714117e-05, + "loss": 0.0987, + "num_input_tokens_seen": 130559728, + "step": 107290 + }, + { + "epoch": 11.94954894754427, + "grad_norm": 0.0027400951366871595, + "learning_rate": 2.0876230252913945e-05, + "loss": 0.0131, + "num_input_tokens_seen": 130566032, + "step": 107295 + }, + { + "epoch": 11.950105802427887, + "grad_norm": 1.2298253774642944, + "learning_rate": 2.087383381763902e-05, + "loss": 0.1192, + "num_input_tokens_seen": 130571536, + "step": 107300 + }, + { + "epoch": 11.950662657311504, + "grad_norm": 0.05643666908144951, + "learning_rate": 2.0871437421339053e-05, + "loss": 0.0332, + "num_input_tokens_seen": 130577968, + "step": 107305 + }, + { + "epoch": 11.951219512195122, + "grad_norm": 0.20676547288894653, + "learning_rate": 2.0869041064036655e-05, + "loss": 0.0144, + "num_input_tokens_seen": 130583856, + "step": 107310 + }, + { + "epoch": 11.95177636707874, + "grad_norm": 0.009405190125107765, + "learning_rate": 2.086664474575448e-05, + "loss": 0.0018, + "num_input_tokens_seen": 130590224, + "step": 107315 + }, + { + "epoch": 11.952333221962357, + "grad_norm": 0.3289969563484192, + "learning_rate": 2.0864248466515153e-05, + "loss": 0.0058, + "num_input_tokens_seen": 130595984, + "step": 107320 + }, + { + "epoch": 11.952890076845973, + "grad_norm": 0.024112360551953316, + "learning_rate": 2.0861852226341318e-05, + "loss": 0.0031, + "num_input_tokens_seen": 130601936, + "step": 107325 + }, + { + "epoch": 11.95344693172959, + "grad_norm": 2.063242197036743, + "learning_rate": 2.0859456025255594e-05, + "loss": 0.0826, + "num_input_tokens_seen": 130608208, + "step": 107330 + }, + { + "epoch": 11.954003786613209, + "grad_norm": 0.03528907895088196, + "learning_rate": 2.085705986328064e-05, + "loss": 0.0675, + "num_input_tokens_seen": 130614064, + "step": 107335 + }, + { + "epoch": 11.954560641496826, + "grad_norm": 0.18198253214359283, + "learning_rate": 2.085466374043906e-05, + "loss": 0.0089, + "num_input_tokens_seen": 130620016, + "step": 107340 + }, + { + "epoch": 11.955117496380444, + "grad_norm": 0.54947829246521, + "learning_rate": 2.0852267656753514e-05, + "loss": 0.067, + "num_input_tokens_seen": 130626160, + "step": 107345 + }, + { + "epoch": 11.955674351264062, + "grad_norm": 0.5462474822998047, + "learning_rate": 2.0849871612246614e-05, + "loss": 0.0262, + "num_input_tokens_seen": 130632432, + "step": 107350 + }, + { + "epoch": 11.956231206147677, + "grad_norm": 0.2899802029132843, + "learning_rate": 2.084747560694101e-05, + "loss": 0.0378, + "num_input_tokens_seen": 130638480, + "step": 107355 + }, + { + "epoch": 11.956788061031295, + "grad_norm": 0.007065367419272661, + "learning_rate": 2.0845079640859318e-05, + "loss": 0.0012, + "num_input_tokens_seen": 130645008, + "step": 107360 + }, + { + "epoch": 11.957344915914913, + "grad_norm": 0.00035220608697272837, + "learning_rate": 2.0842683714024187e-05, + "loss": 0.0014, + "num_input_tokens_seen": 130651280, + "step": 107365 + }, + { + "epoch": 11.95790177079853, + "grad_norm": 1.0550576448440552, + "learning_rate": 2.084028782645823e-05, + "loss": 0.1279, + "num_input_tokens_seen": 130657584, + "step": 107370 + }, + { + "epoch": 11.958458625682148, + "grad_norm": 0.03117297776043415, + "learning_rate": 2.0837891978184094e-05, + "loss": 0.0988, + "num_input_tokens_seen": 130663536, + "step": 107375 + }, + { + "epoch": 11.959015480565764, + "grad_norm": 0.502471923828125, + "learning_rate": 2.0835496169224396e-05, + "loss": 0.0189, + "num_input_tokens_seen": 130669584, + "step": 107380 + }, + { + "epoch": 11.959572335449382, + "grad_norm": 1.2924110889434814, + "learning_rate": 2.0833100399601778e-05, + "loss": 0.07, + "num_input_tokens_seen": 130675824, + "step": 107385 + }, + { + "epoch": 11.960129190333, + "grad_norm": 1.6058862209320068, + "learning_rate": 2.0830704669338863e-05, + "loss": 0.1164, + "num_input_tokens_seen": 130681840, + "step": 107390 + }, + { + "epoch": 11.960686045216617, + "grad_norm": 0.012327403761446476, + "learning_rate": 2.082830897845829e-05, + "loss": 0.1299, + "num_input_tokens_seen": 130687856, + "step": 107395 + }, + { + "epoch": 11.961242900100235, + "grad_norm": 0.11102105677127838, + "learning_rate": 2.0825913326982665e-05, + "loss": 0.0081, + "num_input_tokens_seen": 130693840, + "step": 107400 + }, + { + "epoch": 11.96179975498385, + "grad_norm": 0.49244481325149536, + "learning_rate": 2.082351771493465e-05, + "loss": 0.0494, + "num_input_tokens_seen": 130699856, + "step": 107405 + }, + { + "epoch": 11.962356609867468, + "grad_norm": 0.07831507921218872, + "learning_rate": 2.0821122142336853e-05, + "loss": 0.0642, + "num_input_tokens_seen": 130705808, + "step": 107410 + }, + { + "epoch": 11.962913464751086, + "grad_norm": 0.10128562897443771, + "learning_rate": 2.0818726609211895e-05, + "loss": 0.0245, + "num_input_tokens_seen": 130712240, + "step": 107415 + }, + { + "epoch": 11.963470319634704, + "grad_norm": 2.4768011569976807, + "learning_rate": 2.0816331115582427e-05, + "loss": 0.0626, + "num_input_tokens_seen": 130718288, + "step": 107420 + }, + { + "epoch": 11.964027174518321, + "grad_norm": 0.03894301876425743, + "learning_rate": 2.081393566147105e-05, + "loss": 0.0126, + "num_input_tokens_seen": 130723888, + "step": 107425 + }, + { + "epoch": 11.964584029401937, + "grad_norm": 0.00014215346891433, + "learning_rate": 2.0811540246900416e-05, + "loss": 0.0021, + "num_input_tokens_seen": 130730256, + "step": 107430 + }, + { + "epoch": 11.965140884285555, + "grad_norm": 0.015055160038173199, + "learning_rate": 2.080914487189313e-05, + "loss": 0.0087, + "num_input_tokens_seen": 130736272, + "step": 107435 + }, + { + "epoch": 11.965697739169173, + "grad_norm": 7.797800208209082e-05, + "learning_rate": 2.080674953647184e-05, + "loss": 0.0393, + "num_input_tokens_seen": 130742448, + "step": 107440 + }, + { + "epoch": 11.96625459405279, + "grad_norm": 1.3168469667434692, + "learning_rate": 2.080435424065915e-05, + "loss": 0.0658, + "num_input_tokens_seen": 130748816, + "step": 107445 + }, + { + "epoch": 11.966811448936408, + "grad_norm": 1.0527313947677612, + "learning_rate": 2.0801958984477704e-05, + "loss": 0.1099, + "num_input_tokens_seen": 130754128, + "step": 107450 + }, + { + "epoch": 11.967368303820024, + "grad_norm": 2.0808002948760986, + "learning_rate": 2.0799563767950115e-05, + "loss": 0.1209, + "num_input_tokens_seen": 130760336, + "step": 107455 + }, + { + "epoch": 11.967925158703641, + "grad_norm": 1.3253768682479858, + "learning_rate": 2.0797168591099015e-05, + "loss": 0.0121, + "num_input_tokens_seen": 130766640, + "step": 107460 + }, + { + "epoch": 11.96848201358726, + "grad_norm": 0.18164175748825073, + "learning_rate": 2.0794773453947016e-05, + "loss": 0.0506, + "num_input_tokens_seen": 130773008, + "step": 107465 + }, + { + "epoch": 11.969038868470877, + "grad_norm": 1.3667535781860352, + "learning_rate": 2.0792378356516758e-05, + "loss": 0.1011, + "num_input_tokens_seen": 130779120, + "step": 107470 + }, + { + "epoch": 11.969595723354495, + "grad_norm": 7.846898370189592e-05, + "learning_rate": 2.0789983298830855e-05, + "loss": 0.0709, + "num_input_tokens_seen": 130785008, + "step": 107475 + }, + { + "epoch": 11.97015257823811, + "grad_norm": 0.00011633320536930114, + "learning_rate": 2.0787588280911936e-05, + "loss": 0.0115, + "num_input_tokens_seen": 130791216, + "step": 107480 + }, + { + "epoch": 11.970709433121728, + "grad_norm": 0.02446506917476654, + "learning_rate": 2.078519330278261e-05, + "loss": 0.0036, + "num_input_tokens_seen": 130797520, + "step": 107485 + }, + { + "epoch": 11.971266288005346, + "grad_norm": 0.0873987078666687, + "learning_rate": 2.078279836446553e-05, + "loss": 0.0328, + "num_input_tokens_seen": 130803664, + "step": 107490 + }, + { + "epoch": 11.971823142888963, + "grad_norm": 0.0011475698556751013, + "learning_rate": 2.0780403465983277e-05, + "loss": 0.0669, + "num_input_tokens_seen": 130809104, + "step": 107495 + }, + { + "epoch": 11.972379997772581, + "grad_norm": 0.0007097040652297437, + "learning_rate": 2.0778008607358505e-05, + "loss": 0.0455, + "num_input_tokens_seen": 130815120, + "step": 107500 + }, + { + "epoch": 11.972936852656197, + "grad_norm": 0.0037218842189759016, + "learning_rate": 2.077561378861382e-05, + "loss": 0.0091, + "num_input_tokens_seen": 130820976, + "step": 107505 + }, + { + "epoch": 11.973493707539815, + "grad_norm": 0.2364233285188675, + "learning_rate": 2.0773219009771855e-05, + "loss": 0.0343, + "num_input_tokens_seen": 130826768, + "step": 107510 + }, + { + "epoch": 11.974050562423432, + "grad_norm": 0.4008045792579651, + "learning_rate": 2.0770824270855214e-05, + "loss": 0.0358, + "num_input_tokens_seen": 130833008, + "step": 107515 + }, + { + "epoch": 11.97460741730705, + "grad_norm": 0.009626459330320358, + "learning_rate": 2.0768429571886534e-05, + "loss": 0.0085, + "num_input_tokens_seen": 130839088, + "step": 107520 + }, + { + "epoch": 11.975164272190668, + "grad_norm": 0.08420145511627197, + "learning_rate": 2.0766034912888418e-05, + "loss": 0.0252, + "num_input_tokens_seen": 130845232, + "step": 107525 + }, + { + "epoch": 11.975721127074284, + "grad_norm": 0.38450461626052856, + "learning_rate": 2.0763640293883504e-05, + "loss": 0.0379, + "num_input_tokens_seen": 130851472, + "step": 107530 + }, + { + "epoch": 11.976277981957901, + "grad_norm": 0.0001289811043534428, + "learning_rate": 2.0761245714894395e-05, + "loss": 0.0288, + "num_input_tokens_seen": 130857648, + "step": 107535 + }, + { + "epoch": 11.976834836841519, + "grad_norm": 1.8879777193069458, + "learning_rate": 2.0758851175943723e-05, + "loss": 0.0437, + "num_input_tokens_seen": 130863856, + "step": 107540 + }, + { + "epoch": 11.977391691725137, + "grad_norm": 0.46684691309928894, + "learning_rate": 2.0756456677054085e-05, + "loss": 0.0064, + "num_input_tokens_seen": 130870064, + "step": 107545 + }, + { + "epoch": 11.977948546608754, + "grad_norm": 0.011350403539836407, + "learning_rate": 2.0754062218248133e-05, + "loss": 0.0029, + "num_input_tokens_seen": 130876176, + "step": 107550 + }, + { + "epoch": 11.97850540149237, + "grad_norm": 0.00656982883810997, + "learning_rate": 2.0751667799548446e-05, + "loss": 0.0158, + "num_input_tokens_seen": 130882192, + "step": 107555 + }, + { + "epoch": 11.979062256375988, + "grad_norm": 0.675212025642395, + "learning_rate": 2.0749273420977673e-05, + "loss": 0.0217, + "num_input_tokens_seen": 130888016, + "step": 107560 + }, + { + "epoch": 11.979619111259606, + "grad_norm": 0.049304284155368805, + "learning_rate": 2.0746879082558413e-05, + "loss": 0.0353, + "num_input_tokens_seen": 130894128, + "step": 107565 + }, + { + "epoch": 11.980175966143223, + "grad_norm": 0.03440116345882416, + "learning_rate": 2.0744484784313292e-05, + "loss": 0.0863, + "num_input_tokens_seen": 130900400, + "step": 107570 + }, + { + "epoch": 11.980732821026841, + "grad_norm": 1.7812724113464355, + "learning_rate": 2.074209052626492e-05, + "loss": 0.0341, + "num_input_tokens_seen": 130906704, + "step": 107575 + }, + { + "epoch": 11.981289675910457, + "grad_norm": 0.32156479358673096, + "learning_rate": 2.0739696308435914e-05, + "loss": 0.0746, + "num_input_tokens_seen": 130912976, + "step": 107580 + }, + { + "epoch": 11.981846530794074, + "grad_norm": 2.0636563301086426, + "learning_rate": 2.0737302130848885e-05, + "loss": 0.1147, + "num_input_tokens_seen": 130919184, + "step": 107585 + }, + { + "epoch": 11.982403385677692, + "grad_norm": 2.8854997158050537, + "learning_rate": 2.073490799352646e-05, + "loss": 0.1685, + "num_input_tokens_seen": 130925136, + "step": 107590 + }, + { + "epoch": 11.98296024056131, + "grad_norm": 0.0005720264744013548, + "learning_rate": 2.073251389649124e-05, + "loss": 0.0015, + "num_input_tokens_seen": 130931440, + "step": 107595 + }, + { + "epoch": 11.983517095444927, + "grad_norm": 0.0983867347240448, + "learning_rate": 2.073011983976585e-05, + "loss": 0.057, + "num_input_tokens_seen": 130937296, + "step": 107600 + }, + { + "epoch": 11.984073950328545, + "grad_norm": 0.0012864492600783706, + "learning_rate": 2.0727725823372894e-05, + "loss": 0.0846, + "num_input_tokens_seen": 130943504, + "step": 107605 + }, + { + "epoch": 11.984630805212161, + "grad_norm": 0.3111530840396881, + "learning_rate": 2.0725331847335e-05, + "loss": 0.0115, + "num_input_tokens_seen": 130949840, + "step": 107610 + }, + { + "epoch": 11.985187660095779, + "grad_norm": 0.4109097719192505, + "learning_rate": 2.0722937911674757e-05, + "loss": 0.059, + "num_input_tokens_seen": 130955888, + "step": 107615 + }, + { + "epoch": 11.985744514979396, + "grad_norm": 2.654189109802246, + "learning_rate": 2.072054401641481e-05, + "loss": 0.1077, + "num_input_tokens_seen": 130961584, + "step": 107620 + }, + { + "epoch": 11.986301369863014, + "grad_norm": 0.005448061972856522, + "learning_rate": 2.0718150161577736e-05, + "loss": 0.02, + "num_input_tokens_seen": 130967952, + "step": 107625 + }, + { + "epoch": 11.986858224746632, + "grad_norm": 0.953580915927887, + "learning_rate": 2.0715756347186173e-05, + "loss": 0.0564, + "num_input_tokens_seen": 130974128, + "step": 107630 + }, + { + "epoch": 11.987415079630248, + "grad_norm": 2.5315215587615967, + "learning_rate": 2.0713362573262724e-05, + "loss": 0.0835, + "num_input_tokens_seen": 130980240, + "step": 107635 + }, + { + "epoch": 11.987971934513865, + "grad_norm": 0.23044244945049286, + "learning_rate": 2.071096883983e-05, + "loss": 0.1008, + "num_input_tokens_seen": 130986544, + "step": 107640 + }, + { + "epoch": 11.988528789397483, + "grad_norm": 0.2714545726776123, + "learning_rate": 2.070857514691061e-05, + "loss": 0.1651, + "num_input_tokens_seen": 130992848, + "step": 107645 + }, + { + "epoch": 11.9890856442811, + "grad_norm": 3.006676197052002, + "learning_rate": 2.070618149452717e-05, + "loss": 0.0847, + "num_input_tokens_seen": 130998960, + "step": 107650 + }, + { + "epoch": 11.989642499164718, + "grad_norm": 0.022680476307868958, + "learning_rate": 2.0703787882702278e-05, + "loss": 0.0349, + "num_input_tokens_seen": 131004880, + "step": 107655 + }, + { + "epoch": 11.990199354048334, + "grad_norm": 1.047912359237671, + "learning_rate": 2.070139431145856e-05, + "loss": 0.0429, + "num_input_tokens_seen": 131010992, + "step": 107660 + }, + { + "epoch": 11.990756208931952, + "grad_norm": 0.024815628305077553, + "learning_rate": 2.069900078081861e-05, + "loss": 0.0172, + "num_input_tokens_seen": 131016496, + "step": 107665 + }, + { + "epoch": 11.99131306381557, + "grad_norm": 0.7348236441612244, + "learning_rate": 2.069660729080505e-05, + "loss": 0.1553, + "num_input_tokens_seen": 131022608, + "step": 107670 + }, + { + "epoch": 11.991869918699187, + "grad_norm": 0.09351510554552078, + "learning_rate": 2.0694213841440468e-05, + "loss": 0.1578, + "num_input_tokens_seen": 131028528, + "step": 107675 + }, + { + "epoch": 11.992426773582805, + "grad_norm": 0.189986452460289, + "learning_rate": 2.0691820432747505e-05, + "loss": 0.0545, + "num_input_tokens_seen": 131034800, + "step": 107680 + }, + { + "epoch": 11.99298362846642, + "grad_norm": 0.6666711568832397, + "learning_rate": 2.0689427064748733e-05, + "loss": 0.0163, + "num_input_tokens_seen": 131041008, + "step": 107685 + }, + { + "epoch": 11.993540483350039, + "grad_norm": 1.5280529260635376, + "learning_rate": 2.0687033737466786e-05, + "loss": 0.147, + "num_input_tokens_seen": 131046928, + "step": 107690 + }, + { + "epoch": 11.994097338233656, + "grad_norm": 0.09623681008815765, + "learning_rate": 2.068464045092426e-05, + "loss": 0.0607, + "num_input_tokens_seen": 131053040, + "step": 107695 + }, + { + "epoch": 11.994654193117274, + "grad_norm": 0.0031965961679816246, + "learning_rate": 2.0682247205143762e-05, + "loss": 0.003, + "num_input_tokens_seen": 131059504, + "step": 107700 + }, + { + "epoch": 11.995211048000892, + "grad_norm": 0.17441442608833313, + "learning_rate": 2.0679854000147894e-05, + "loss": 0.0069, + "num_input_tokens_seen": 131065808, + "step": 107705 + }, + { + "epoch": 11.99576790288451, + "grad_norm": 1.0193719863891602, + "learning_rate": 2.0677460835959273e-05, + "loss": 0.0388, + "num_input_tokens_seen": 131072016, + "step": 107710 + }, + { + "epoch": 11.996324757768125, + "grad_norm": 0.06834828853607178, + "learning_rate": 2.0675067712600495e-05, + "loss": 0.029, + "num_input_tokens_seen": 131078448, + "step": 107715 + }, + { + "epoch": 11.996881612651743, + "grad_norm": 0.006192691624164581, + "learning_rate": 2.0672674630094167e-05, + "loss": 0.0616, + "num_input_tokens_seen": 131084624, + "step": 107720 + }, + { + "epoch": 11.99743846753536, + "grad_norm": 1.4224387407302856, + "learning_rate": 2.067028158846289e-05, + "loss": 0.0681, + "num_input_tokens_seen": 131090832, + "step": 107725 + }, + { + "epoch": 11.997995322418978, + "grad_norm": 0.05985308811068535, + "learning_rate": 2.066788858772928e-05, + "loss": 0.0307, + "num_input_tokens_seen": 131097264, + "step": 107730 + }, + { + "epoch": 11.998552177302596, + "grad_norm": 0.10445188730955124, + "learning_rate": 2.0665495627915922e-05, + "loss": 0.0119, + "num_input_tokens_seen": 131103056, + "step": 107735 + }, + { + "epoch": 11.999109032186212, + "grad_norm": 0.0017416116315871477, + "learning_rate": 2.0663102709045447e-05, + "loss": 0.0541, + "num_input_tokens_seen": 131109232, + "step": 107740 + }, + { + "epoch": 11.99966588706983, + "grad_norm": 0.00036910807830281556, + "learning_rate": 2.0660709831140422e-05, + "loss": 0.1594, + "num_input_tokens_seen": 131115120, + "step": 107745 + }, + { + "epoch": 12.0, + "eval_loss": 0.07988622039556503, + "eval_runtime": 112.199, + "eval_samples_per_second": 35.571, + "eval_steps_per_second": 8.895, + "num_input_tokens_seen": 131118336, + "step": 107748 + }, + { + "epoch": 12.000222741953447, + "grad_norm": 0.0043091485276818275, + "learning_rate": 2.065831699422349e-05, + "loss": 0.0347, + "num_input_tokens_seen": 131120896, + "step": 107750 + }, + { + "epoch": 12.000779596837065, + "grad_norm": 0.06608521193265915, + "learning_rate": 2.0655924198317215e-05, + "loss": 0.0259, + "num_input_tokens_seen": 131126784, + "step": 107755 + }, + { + "epoch": 12.001336451720682, + "grad_norm": 0.04466233775019646, + "learning_rate": 2.0653531443444224e-05, + "loss": 0.0084, + "num_input_tokens_seen": 131132288, + "step": 107760 + }, + { + "epoch": 12.001893306604298, + "grad_norm": 0.0023334689904004335, + "learning_rate": 2.065113872962711e-05, + "loss": 0.0109, + "num_input_tokens_seen": 131138528, + "step": 107765 + }, + { + "epoch": 12.002450161487916, + "grad_norm": 0.04745243489742279, + "learning_rate": 2.0648746056888478e-05, + "loss": 0.1025, + "num_input_tokens_seen": 131144800, + "step": 107770 + }, + { + "epoch": 12.003007016371534, + "grad_norm": 0.030265025794506073, + "learning_rate": 2.064635342525092e-05, + "loss": 0.0371, + "num_input_tokens_seen": 131150944, + "step": 107775 + }, + { + "epoch": 12.003563871255151, + "grad_norm": 0.2146482765674591, + "learning_rate": 2.0643960834737046e-05, + "loss": 0.0024, + "num_input_tokens_seen": 131157120, + "step": 107780 + }, + { + "epoch": 12.004120726138769, + "grad_norm": 1.4402748346328735, + "learning_rate": 2.064156828536945e-05, + "loss": 0.1381, + "num_input_tokens_seen": 131162816, + "step": 107785 + }, + { + "epoch": 12.004677581022385, + "grad_norm": 0.010291464626789093, + "learning_rate": 2.063917577717074e-05, + "loss": 0.0125, + "num_input_tokens_seen": 131168992, + "step": 107790 + }, + { + "epoch": 12.005234435906003, + "grad_norm": 0.02758556604385376, + "learning_rate": 2.0636783310163498e-05, + "loss": 0.0089, + "num_input_tokens_seen": 131175328, + "step": 107795 + }, + { + "epoch": 12.00579129078962, + "grad_norm": 0.1417030692100525, + "learning_rate": 2.0634390884370342e-05, + "loss": 0.0501, + "num_input_tokens_seen": 131181248, + "step": 107800 + }, + { + "epoch": 12.006348145673238, + "grad_norm": 3.7133421897888184, + "learning_rate": 2.0631998499813847e-05, + "loss": 0.2481, + "num_input_tokens_seen": 131187104, + "step": 107805 + }, + { + "epoch": 12.006905000556856, + "grad_norm": 0.21361508965492249, + "learning_rate": 2.0629606156516648e-05, + "loss": 0.044, + "num_input_tokens_seen": 131192928, + "step": 107810 + }, + { + "epoch": 12.007461855440472, + "grad_norm": 0.32613953948020935, + "learning_rate": 2.062721385450131e-05, + "loss": 0.0145, + "num_input_tokens_seen": 131198912, + "step": 107815 + }, + { + "epoch": 12.00801871032409, + "grad_norm": 0.35220324993133545, + "learning_rate": 2.0624821593790432e-05, + "loss": 0.062, + "num_input_tokens_seen": 131204960, + "step": 107820 + }, + { + "epoch": 12.008575565207707, + "grad_norm": 0.020317060872912407, + "learning_rate": 2.062242937440664e-05, + "loss": 0.1015, + "num_input_tokens_seen": 131211072, + "step": 107825 + }, + { + "epoch": 12.009132420091325, + "grad_norm": 0.3265669643878937, + "learning_rate": 2.0620037196372487e-05, + "loss": 0.0219, + "num_input_tokens_seen": 131217312, + "step": 107830 + }, + { + "epoch": 12.009689274974942, + "grad_norm": 0.00026921217795461416, + "learning_rate": 2.0617645059710605e-05, + "loss": 0.0077, + "num_input_tokens_seen": 131223424, + "step": 107835 + }, + { + "epoch": 12.010246129858558, + "grad_norm": 0.00013981912343297154, + "learning_rate": 2.061525296444357e-05, + "loss": 0.0108, + "num_input_tokens_seen": 131229664, + "step": 107840 + }, + { + "epoch": 12.010802984742176, + "grad_norm": 2.2467684745788574, + "learning_rate": 2.061286091059399e-05, + "loss": 0.0853, + "num_input_tokens_seen": 131236032, + "step": 107845 + }, + { + "epoch": 12.011359839625793, + "grad_norm": 1.9018638134002686, + "learning_rate": 2.0610468898184448e-05, + "loss": 0.0856, + "num_input_tokens_seen": 131242176, + "step": 107850 + }, + { + "epoch": 12.011916694509411, + "grad_norm": 0.6126478910446167, + "learning_rate": 2.060807692723755e-05, + "loss": 0.1048, + "num_input_tokens_seen": 131248192, + "step": 107855 + }, + { + "epoch": 12.012473549393029, + "grad_norm": 1.3695658445358276, + "learning_rate": 2.060568499777588e-05, + "loss": 0.0566, + "num_input_tokens_seen": 131254528, + "step": 107860 + }, + { + "epoch": 12.013030404276645, + "grad_norm": 0.059578489512205124, + "learning_rate": 2.060329310982204e-05, + "loss": 0.0447, + "num_input_tokens_seen": 131260384, + "step": 107865 + }, + { + "epoch": 12.013587259160262, + "grad_norm": 0.2794795036315918, + "learning_rate": 2.060090126339861e-05, + "loss": 0.0264, + "num_input_tokens_seen": 131266560, + "step": 107870 + }, + { + "epoch": 12.01414411404388, + "grad_norm": 0.0012191354762762785, + "learning_rate": 2.0598509458528205e-05, + "loss": 0.0043, + "num_input_tokens_seen": 131272960, + "step": 107875 + }, + { + "epoch": 12.014700968927498, + "grad_norm": 0.15307144820690155, + "learning_rate": 2.0596117695233385e-05, + "loss": 0.0159, + "num_input_tokens_seen": 131279072, + "step": 107880 + }, + { + "epoch": 12.015257823811115, + "grad_norm": 0.0016047685639932752, + "learning_rate": 2.0593725973536783e-05, + "loss": 0.0982, + "num_input_tokens_seen": 131285024, + "step": 107885 + }, + { + "epoch": 12.015814678694731, + "grad_norm": 0.2810296416282654, + "learning_rate": 2.0591334293460955e-05, + "loss": 0.0078, + "num_input_tokens_seen": 131291200, + "step": 107890 + }, + { + "epoch": 12.016371533578349, + "grad_norm": 0.9497328996658325, + "learning_rate": 2.0588942655028522e-05, + "loss": 0.1283, + "num_input_tokens_seen": 131297248, + "step": 107895 + }, + { + "epoch": 12.016928388461967, + "grad_norm": 0.1281631886959076, + "learning_rate": 2.058655105826204e-05, + "loss": 0.0729, + "num_input_tokens_seen": 131302944, + "step": 107900 + }, + { + "epoch": 12.017485243345584, + "grad_norm": 0.007692850194871426, + "learning_rate": 2.0584159503184133e-05, + "loss": 0.0239, + "num_input_tokens_seen": 131309088, + "step": 107905 + }, + { + "epoch": 12.018042098229202, + "grad_norm": 0.7421206831932068, + "learning_rate": 2.0581767989817372e-05, + "loss": 0.0247, + "num_input_tokens_seen": 131315264, + "step": 107910 + }, + { + "epoch": 12.01859895311282, + "grad_norm": 0.6372299194335938, + "learning_rate": 2.0579376518184358e-05, + "loss": 0.0198, + "num_input_tokens_seen": 131321216, + "step": 107915 + }, + { + "epoch": 12.019155807996436, + "grad_norm": 0.0030688997358083725, + "learning_rate": 2.0576985088307666e-05, + "loss": 0.0134, + "num_input_tokens_seen": 131327712, + "step": 107920 + }, + { + "epoch": 12.019712662880053, + "grad_norm": 0.00036046322202309966, + "learning_rate": 2.05745937002099e-05, + "loss": 0.0091, + "num_input_tokens_seen": 131333856, + "step": 107925 + }, + { + "epoch": 12.020269517763671, + "grad_norm": 0.06527278572320938, + "learning_rate": 2.057220235391364e-05, + "loss": 0.0101, + "num_input_tokens_seen": 131339264, + "step": 107930 + }, + { + "epoch": 12.020826372647289, + "grad_norm": 1.949907660484314, + "learning_rate": 2.056981104944148e-05, + "loss": 0.2552, + "num_input_tokens_seen": 131345344, + "step": 107935 + }, + { + "epoch": 12.021383227530906, + "grad_norm": 1.6480480432510376, + "learning_rate": 2.0567419786815997e-05, + "loss": 0.1468, + "num_input_tokens_seen": 131351584, + "step": 107940 + }, + { + "epoch": 12.021940082414522, + "grad_norm": 0.21101976931095123, + "learning_rate": 2.056502856605979e-05, + "loss": 0.035, + "num_input_tokens_seen": 131358080, + "step": 107945 + }, + { + "epoch": 12.02249693729814, + "grad_norm": 0.803874671459198, + "learning_rate": 2.0562637387195433e-05, + "loss": 0.0653, + "num_input_tokens_seen": 131364096, + "step": 107950 + }, + { + "epoch": 12.023053792181758, + "grad_norm": 0.045689765363931656, + "learning_rate": 2.0560246250245536e-05, + "loss": 0.036, + "num_input_tokens_seen": 131370272, + "step": 107955 + }, + { + "epoch": 12.023610647065375, + "grad_norm": 1.6646881103515625, + "learning_rate": 2.0557855155232654e-05, + "loss": 0.0379, + "num_input_tokens_seen": 131376512, + "step": 107960 + }, + { + "epoch": 12.024167501948993, + "grad_norm": 0.17045186460018158, + "learning_rate": 2.0555464102179402e-05, + "loss": 0.0109, + "num_input_tokens_seen": 131382464, + "step": 107965 + }, + { + "epoch": 12.024724356832609, + "grad_norm": 2.5102741718292236, + "learning_rate": 2.0553073091108345e-05, + "loss": 0.0395, + "num_input_tokens_seen": 131388640, + "step": 107970 + }, + { + "epoch": 12.025281211716226, + "grad_norm": 0.02805374376475811, + "learning_rate": 2.0550682122042083e-05, + "loss": 0.1237, + "num_input_tokens_seen": 131394080, + "step": 107975 + }, + { + "epoch": 12.025838066599844, + "grad_norm": 0.0380016565322876, + "learning_rate": 2.0548291195003186e-05, + "loss": 0.0563, + "num_input_tokens_seen": 131400480, + "step": 107980 + }, + { + "epoch": 12.026394921483462, + "grad_norm": 0.12517738342285156, + "learning_rate": 2.054590031001425e-05, + "loss": 0.0795, + "num_input_tokens_seen": 131406720, + "step": 107985 + }, + { + "epoch": 12.02695177636708, + "grad_norm": 1.152385950088501, + "learning_rate": 2.0543509467097852e-05, + "loss": 0.0704, + "num_input_tokens_seen": 131412704, + "step": 107990 + }, + { + "epoch": 12.027508631250695, + "grad_norm": 0.0030716131441295147, + "learning_rate": 2.054111866627658e-05, + "loss": 0.0397, + "num_input_tokens_seen": 131418944, + "step": 107995 + }, + { + "epoch": 12.028065486134313, + "grad_norm": 0.14449913799762726, + "learning_rate": 2.053872790757301e-05, + "loss": 0.0076, + "num_input_tokens_seen": 131424832, + "step": 108000 + }, + { + "epoch": 12.02862234101793, + "grad_norm": 0.01572040654718876, + "learning_rate": 2.0536337191009734e-05, + "loss": 0.0887, + "num_input_tokens_seen": 131430176, + "step": 108005 + }, + { + "epoch": 12.029179195901548, + "grad_norm": 0.03510206192731857, + "learning_rate": 2.0533946516609317e-05, + "loss": 0.0398, + "num_input_tokens_seen": 131436320, + "step": 108010 + }, + { + "epoch": 12.029736050785166, + "grad_norm": 0.1990409642457962, + "learning_rate": 2.0531555884394373e-05, + "loss": 0.065, + "num_input_tokens_seen": 131442528, + "step": 108015 + }, + { + "epoch": 12.030292905668782, + "grad_norm": 0.4639853537082672, + "learning_rate": 2.0529165294387447e-05, + "loss": 0.0104, + "num_input_tokens_seen": 131448704, + "step": 108020 + }, + { + "epoch": 12.0308497605524, + "grad_norm": 0.056065719574689865, + "learning_rate": 2.052677474661115e-05, + "loss": 0.0153, + "num_input_tokens_seen": 131454816, + "step": 108025 + }, + { + "epoch": 12.031406615436017, + "grad_norm": 0.40228012204170227, + "learning_rate": 2.0524384241088036e-05, + "loss": 0.1124, + "num_input_tokens_seen": 131460960, + "step": 108030 + }, + { + "epoch": 12.031963470319635, + "grad_norm": 2.313565969467163, + "learning_rate": 2.0521993777840708e-05, + "loss": 0.1467, + "num_input_tokens_seen": 131467296, + "step": 108035 + }, + { + "epoch": 12.032520325203253, + "grad_norm": 0.12248564511537552, + "learning_rate": 2.0519603356891732e-05, + "loss": 0.0782, + "num_input_tokens_seen": 131472384, + "step": 108040 + }, + { + "epoch": 12.033077180086869, + "grad_norm": 1.7291127443313599, + "learning_rate": 2.0517212978263698e-05, + "loss": 0.1833, + "num_input_tokens_seen": 131478432, + "step": 108045 + }, + { + "epoch": 12.033634034970486, + "grad_norm": 1.2701224088668823, + "learning_rate": 2.0514822641979172e-05, + "loss": 0.079, + "num_input_tokens_seen": 131484384, + "step": 108050 + }, + { + "epoch": 12.034190889854104, + "grad_norm": 0.4665597379207611, + "learning_rate": 2.0512432348060747e-05, + "loss": 0.0191, + "num_input_tokens_seen": 131490336, + "step": 108055 + }, + { + "epoch": 12.034747744737722, + "grad_norm": 0.6108145117759705, + "learning_rate": 2.051004209653099e-05, + "loss": 0.049, + "num_input_tokens_seen": 131496128, + "step": 108060 + }, + { + "epoch": 12.03530459962134, + "grad_norm": 0.7281357049942017, + "learning_rate": 2.050765188741248e-05, + "loss": 0.0314, + "num_input_tokens_seen": 131502016, + "step": 108065 + }, + { + "epoch": 12.035861454504955, + "grad_norm": 2.1537721157073975, + "learning_rate": 2.0505261720727797e-05, + "loss": 0.1003, + "num_input_tokens_seen": 131508032, + "step": 108070 + }, + { + "epoch": 12.036418309388573, + "grad_norm": 0.033881958574056625, + "learning_rate": 2.0502871596499525e-05, + "loss": 0.0871, + "num_input_tokens_seen": 131514432, + "step": 108075 + }, + { + "epoch": 12.03697516427219, + "grad_norm": 0.33802056312561035, + "learning_rate": 2.0500481514750222e-05, + "loss": 0.0462, + "num_input_tokens_seen": 131520288, + "step": 108080 + }, + { + "epoch": 12.037532019155808, + "grad_norm": 0.014848885126411915, + "learning_rate": 2.0498091475502492e-05, + "loss": 0.0061, + "num_input_tokens_seen": 131526208, + "step": 108085 + }, + { + "epoch": 12.038088874039426, + "grad_norm": 0.9652729630470276, + "learning_rate": 2.049570147877888e-05, + "loss": 0.0847, + "num_input_tokens_seen": 131532288, + "step": 108090 + }, + { + "epoch": 12.038645728923044, + "grad_norm": 0.014038776978850365, + "learning_rate": 2.0493311524601984e-05, + "loss": 0.0041, + "num_input_tokens_seen": 131538592, + "step": 108095 + }, + { + "epoch": 12.03920258380666, + "grad_norm": 0.059089917689561844, + "learning_rate": 2.0490921612994367e-05, + "loss": 0.0299, + "num_input_tokens_seen": 131544832, + "step": 108100 + }, + { + "epoch": 12.039759438690277, + "grad_norm": 0.007266322150826454, + "learning_rate": 2.0488531743978616e-05, + "loss": 0.1788, + "num_input_tokens_seen": 131550784, + "step": 108105 + }, + { + "epoch": 12.040316293573895, + "grad_norm": 0.38327884674072266, + "learning_rate": 2.048614191757729e-05, + "loss": 0.0323, + "num_input_tokens_seen": 131557024, + "step": 108110 + }, + { + "epoch": 12.040873148457512, + "grad_norm": 0.28361791372299194, + "learning_rate": 2.048375213381297e-05, + "loss": 0.0157, + "num_input_tokens_seen": 131562912, + "step": 108115 + }, + { + "epoch": 12.04143000334113, + "grad_norm": 0.46783745288848877, + "learning_rate": 2.0481362392708235e-05, + "loss": 0.0067, + "num_input_tokens_seen": 131569088, + "step": 108120 + }, + { + "epoch": 12.041986858224746, + "grad_norm": 0.006362756248563528, + "learning_rate": 2.047897269428565e-05, + "loss": 0.0126, + "num_input_tokens_seen": 131575360, + "step": 108125 + }, + { + "epoch": 12.042543713108364, + "grad_norm": 1.1421996355056763, + "learning_rate": 2.047658303856779e-05, + "loss": 0.0359, + "num_input_tokens_seen": 131581696, + "step": 108130 + }, + { + "epoch": 12.043100567991981, + "grad_norm": 0.44607165455818176, + "learning_rate": 2.0474193425577226e-05, + "loss": 0.008, + "num_input_tokens_seen": 131587808, + "step": 108135 + }, + { + "epoch": 12.043657422875599, + "grad_norm": 0.13606636226177216, + "learning_rate": 2.0471803855336524e-05, + "loss": 0.0206, + "num_input_tokens_seen": 131594080, + "step": 108140 + }, + { + "epoch": 12.044214277759217, + "grad_norm": 0.33343225717544556, + "learning_rate": 2.046941432786828e-05, + "loss": 0.0311, + "num_input_tokens_seen": 131599776, + "step": 108145 + }, + { + "epoch": 12.044771132642833, + "grad_norm": 0.44839656352996826, + "learning_rate": 2.046702484319503e-05, + "loss": 0.0634, + "num_input_tokens_seen": 131605824, + "step": 108150 + }, + { + "epoch": 12.04532798752645, + "grad_norm": 0.7196322083473206, + "learning_rate": 2.0464635401339372e-05, + "loss": 0.0444, + "num_input_tokens_seen": 131611712, + "step": 108155 + }, + { + "epoch": 12.045884842410068, + "grad_norm": 0.03324374556541443, + "learning_rate": 2.046224600232386e-05, + "loss": 0.0141, + "num_input_tokens_seen": 131618080, + "step": 108160 + }, + { + "epoch": 12.046441697293686, + "grad_norm": 0.043423041701316833, + "learning_rate": 2.0459856646171078e-05, + "loss": 0.0594, + "num_input_tokens_seen": 131624320, + "step": 108165 + }, + { + "epoch": 12.046998552177303, + "grad_norm": 0.0013515540631487966, + "learning_rate": 2.045746733290358e-05, + "loss": 0.0498, + "num_input_tokens_seen": 131630304, + "step": 108170 + }, + { + "epoch": 12.04755540706092, + "grad_norm": 0.1274387687444687, + "learning_rate": 2.045507806254395e-05, + "loss": 0.0483, + "num_input_tokens_seen": 131636000, + "step": 108175 + }, + { + "epoch": 12.048112261944537, + "grad_norm": 0.034753408282995224, + "learning_rate": 2.0452688835114742e-05, + "loss": 0.0195, + "num_input_tokens_seen": 131642048, + "step": 108180 + }, + { + "epoch": 12.048669116828155, + "grad_norm": 1.545482873916626, + "learning_rate": 2.0450299650638537e-05, + "loss": 0.055, + "num_input_tokens_seen": 131648320, + "step": 108185 + }, + { + "epoch": 12.049225971711772, + "grad_norm": 0.08266407996416092, + "learning_rate": 2.0447910509137893e-05, + "loss": 0.0274, + "num_input_tokens_seen": 131654336, + "step": 108190 + }, + { + "epoch": 12.04978282659539, + "grad_norm": 0.5689993500709534, + "learning_rate": 2.0445521410635386e-05, + "loss": 0.0134, + "num_input_tokens_seen": 131660352, + "step": 108195 + }, + { + "epoch": 12.050339681479006, + "grad_norm": 0.10548069328069687, + "learning_rate": 2.044313235515357e-05, + "loss": 0.0094, + "num_input_tokens_seen": 131665888, + "step": 108200 + }, + { + "epoch": 12.050896536362623, + "grad_norm": 0.05141671746969223, + "learning_rate": 2.044074334271503e-05, + "loss": 0.0706, + "num_input_tokens_seen": 131672032, + "step": 108205 + }, + { + "epoch": 12.051453391246241, + "grad_norm": 0.8398637771606445, + "learning_rate": 2.043835437334231e-05, + "loss": 0.0345, + "num_input_tokens_seen": 131678400, + "step": 108210 + }, + { + "epoch": 12.052010246129859, + "grad_norm": 0.029694287106394768, + "learning_rate": 2.0435965447058002e-05, + "loss": 0.1127, + "num_input_tokens_seen": 131684448, + "step": 108215 + }, + { + "epoch": 12.052567101013477, + "grad_norm": 0.1269119530916214, + "learning_rate": 2.043357656388464e-05, + "loss": 0.0103, + "num_input_tokens_seen": 131690464, + "step": 108220 + }, + { + "epoch": 12.053123955897092, + "grad_norm": 0.006665105931460857, + "learning_rate": 2.043118772384482e-05, + "loss": 0.0155, + "num_input_tokens_seen": 131696480, + "step": 108225 + }, + { + "epoch": 12.05368081078071, + "grad_norm": 0.0009075558045879006, + "learning_rate": 2.0428798926961095e-05, + "loss": 0.065, + "num_input_tokens_seen": 131702400, + "step": 108230 + }, + { + "epoch": 12.054237665664328, + "grad_norm": 1.560068130493164, + "learning_rate": 2.042641017325601e-05, + "loss": 0.0977, + "num_input_tokens_seen": 131708448, + "step": 108235 + }, + { + "epoch": 12.054794520547945, + "grad_norm": 0.003423805581405759, + "learning_rate": 2.0424021462752158e-05, + "loss": 0.0497, + "num_input_tokens_seen": 131714336, + "step": 108240 + }, + { + "epoch": 12.055351375431563, + "grad_norm": 1.0085233449935913, + "learning_rate": 2.0421632795472085e-05, + "loss": 0.0602, + "num_input_tokens_seen": 131720640, + "step": 108245 + }, + { + "epoch": 12.055908230315179, + "grad_norm": 0.030677979812026024, + "learning_rate": 2.0419244171438364e-05, + "loss": 0.0534, + "num_input_tokens_seen": 131726592, + "step": 108250 + }, + { + "epoch": 12.056465085198797, + "grad_norm": 0.6311168670654297, + "learning_rate": 2.041685559067354e-05, + "loss": 0.0851, + "num_input_tokens_seen": 131732640, + "step": 108255 + }, + { + "epoch": 12.057021940082414, + "grad_norm": 0.00025727669708430767, + "learning_rate": 2.0414467053200197e-05, + "loss": 0.0777, + "num_input_tokens_seen": 131739072, + "step": 108260 + }, + { + "epoch": 12.057578794966032, + "grad_norm": 0.5991091132164001, + "learning_rate": 2.0412078559040885e-05, + "loss": 0.012, + "num_input_tokens_seen": 131744800, + "step": 108265 + }, + { + "epoch": 12.05813564984965, + "grad_norm": 0.2753368616104126, + "learning_rate": 2.0409690108218164e-05, + "loss": 0.0367, + "num_input_tokens_seen": 131750912, + "step": 108270 + }, + { + "epoch": 12.058692504733267, + "grad_norm": 0.843821108341217, + "learning_rate": 2.04073017007546e-05, + "loss": 0.031, + "num_input_tokens_seen": 131756864, + "step": 108275 + }, + { + "epoch": 12.059249359616883, + "grad_norm": 0.4937673509120941, + "learning_rate": 2.040491333667275e-05, + "loss": 0.028, + "num_input_tokens_seen": 131762976, + "step": 108280 + }, + { + "epoch": 12.059806214500501, + "grad_norm": 0.6396634578704834, + "learning_rate": 2.040252501599517e-05, + "loss": 0.081, + "num_input_tokens_seen": 131769408, + "step": 108285 + }, + { + "epoch": 12.060363069384119, + "grad_norm": 0.07860849797725677, + "learning_rate": 2.0400136738744437e-05, + "loss": 0.0156, + "num_input_tokens_seen": 131775680, + "step": 108290 + }, + { + "epoch": 12.060919924267736, + "grad_norm": 0.2713663876056671, + "learning_rate": 2.0397748504943083e-05, + "loss": 0.0647, + "num_input_tokens_seen": 131781856, + "step": 108295 + }, + { + "epoch": 12.061476779151354, + "grad_norm": 0.5586079359054565, + "learning_rate": 2.0395360314613697e-05, + "loss": 0.0726, + "num_input_tokens_seen": 131788032, + "step": 108300 + }, + { + "epoch": 12.06203363403497, + "grad_norm": 0.01394599862396717, + "learning_rate": 2.0392972167778805e-05, + "loss": 0.0029, + "num_input_tokens_seen": 131794080, + "step": 108305 + }, + { + "epoch": 12.062590488918588, + "grad_norm": 1.3146568536758423, + "learning_rate": 2.0390584064460997e-05, + "loss": 0.0378, + "num_input_tokens_seen": 131800096, + "step": 108310 + }, + { + "epoch": 12.063147343802205, + "grad_norm": 0.06784076988697052, + "learning_rate": 2.038819600468281e-05, + "loss": 0.0285, + "num_input_tokens_seen": 131806304, + "step": 108315 + }, + { + "epoch": 12.063704198685823, + "grad_norm": 0.0001391218975186348, + "learning_rate": 2.0385807988466805e-05, + "loss": 0.0368, + "num_input_tokens_seen": 131812352, + "step": 108320 + }, + { + "epoch": 12.06426105356944, + "grad_norm": 0.002004851121455431, + "learning_rate": 2.0383420015835542e-05, + "loss": 0.0077, + "num_input_tokens_seen": 131818272, + "step": 108325 + }, + { + "epoch": 12.064817908453056, + "grad_norm": 0.007185711991041899, + "learning_rate": 2.0381032086811578e-05, + "loss": 0.0005, + "num_input_tokens_seen": 131824448, + "step": 108330 + }, + { + "epoch": 12.065374763336674, + "grad_norm": 0.002279202686622739, + "learning_rate": 2.037864420141746e-05, + "loss": 0.0472, + "num_input_tokens_seen": 131830592, + "step": 108335 + }, + { + "epoch": 12.065931618220292, + "grad_norm": 0.00855056382715702, + "learning_rate": 2.037625635967576e-05, + "loss": 0.063, + "num_input_tokens_seen": 131836640, + "step": 108340 + }, + { + "epoch": 12.06648847310391, + "grad_norm": 0.029856301844120026, + "learning_rate": 2.037386856160901e-05, + "loss": 0.0493, + "num_input_tokens_seen": 131842688, + "step": 108345 + }, + { + "epoch": 12.067045327987527, + "grad_norm": 0.45954659581184387, + "learning_rate": 2.037148080723979e-05, + "loss": 0.0186, + "num_input_tokens_seen": 131848800, + "step": 108350 + }, + { + "epoch": 12.067602182871143, + "grad_norm": 0.046772126108407974, + "learning_rate": 2.0369093096590632e-05, + "loss": 0.0426, + "num_input_tokens_seen": 131854688, + "step": 108355 + }, + { + "epoch": 12.06815903775476, + "grad_norm": 0.08659908920526505, + "learning_rate": 2.036670542968411e-05, + "loss": 0.0066, + "num_input_tokens_seen": 131860832, + "step": 108360 + }, + { + "epoch": 12.068715892638378, + "grad_norm": 0.022619979456067085, + "learning_rate": 2.0364317806542754e-05, + "loss": 0.0126, + "num_input_tokens_seen": 131867200, + "step": 108365 + }, + { + "epoch": 12.069272747521996, + "grad_norm": 2.189204692840576, + "learning_rate": 2.0361930227189145e-05, + "loss": 0.0514, + "num_input_tokens_seen": 131873504, + "step": 108370 + }, + { + "epoch": 12.069829602405614, + "grad_norm": 4.61776876449585, + "learning_rate": 2.0359542691645812e-05, + "loss": 0.1738, + "num_input_tokens_seen": 131879488, + "step": 108375 + }, + { + "epoch": 12.07038645728923, + "grad_norm": 0.0009529991657473147, + "learning_rate": 2.035715519993532e-05, + "loss": 0.0485, + "num_input_tokens_seen": 131885344, + "step": 108380 + }, + { + "epoch": 12.070943312172847, + "grad_norm": 0.5620415806770325, + "learning_rate": 2.0354767752080215e-05, + "loss": 0.1277, + "num_input_tokens_seen": 131891488, + "step": 108385 + }, + { + "epoch": 12.071500167056465, + "grad_norm": 0.0016404985217377543, + "learning_rate": 2.0352380348103054e-05, + "loss": 0.0489, + "num_input_tokens_seen": 131897664, + "step": 108390 + }, + { + "epoch": 12.072057021940083, + "grad_norm": 0.0018640905618667603, + "learning_rate": 2.0349992988026377e-05, + "loss": 0.0135, + "num_input_tokens_seen": 131903616, + "step": 108395 + }, + { + "epoch": 12.0726138768237, + "grad_norm": 0.0559411495923996, + "learning_rate": 2.034760567187275e-05, + "loss": 0.0275, + "num_input_tokens_seen": 131909472, + "step": 108400 + }, + { + "epoch": 12.073170731707316, + "grad_norm": 0.01671493984758854, + "learning_rate": 2.034521839966471e-05, + "loss": 0.0312, + "num_input_tokens_seen": 131915456, + "step": 108405 + }, + { + "epoch": 12.073727586590934, + "grad_norm": 4.013820171356201, + "learning_rate": 2.0342831171424814e-05, + "loss": 0.0609, + "num_input_tokens_seen": 131921600, + "step": 108410 + }, + { + "epoch": 12.074284441474552, + "grad_norm": 0.004786325618624687, + "learning_rate": 2.03404439871756e-05, + "loss": 0.0215, + "num_input_tokens_seen": 131927872, + "step": 108415 + }, + { + "epoch": 12.07484129635817, + "grad_norm": 0.2535800039768219, + "learning_rate": 2.033805684693964e-05, + "loss": 0.0767, + "num_input_tokens_seen": 131933920, + "step": 108420 + }, + { + "epoch": 12.075398151241787, + "grad_norm": 0.0007748206262476742, + "learning_rate": 2.0335669750739454e-05, + "loss": 0.0167, + "num_input_tokens_seen": 131940160, + "step": 108425 + }, + { + "epoch": 12.075955006125403, + "grad_norm": 0.0033985604532063007, + "learning_rate": 2.033328269859762e-05, + "loss": 0.0169, + "num_input_tokens_seen": 131946560, + "step": 108430 + }, + { + "epoch": 12.07651186100902, + "grad_norm": 0.018600255250930786, + "learning_rate": 2.0330895690536656e-05, + "loss": 0.1298, + "num_input_tokens_seen": 131952896, + "step": 108435 + }, + { + "epoch": 12.077068715892638, + "grad_norm": 0.06173434108495712, + "learning_rate": 2.0328508726579128e-05, + "loss": 0.0108, + "num_input_tokens_seen": 131958816, + "step": 108440 + }, + { + "epoch": 12.077625570776256, + "grad_norm": 0.016470780596137047, + "learning_rate": 2.032612180674758e-05, + "loss": 0.0094, + "num_input_tokens_seen": 131964768, + "step": 108445 + }, + { + "epoch": 12.078182425659874, + "grad_norm": 2.125443696975708, + "learning_rate": 2.0323734931064555e-05, + "loss": 0.0686, + "num_input_tokens_seen": 131970912, + "step": 108450 + }, + { + "epoch": 12.078739280543491, + "grad_norm": 0.008766905404627323, + "learning_rate": 2.0321348099552595e-05, + "loss": 0.0127, + "num_input_tokens_seen": 131977312, + "step": 108455 + }, + { + "epoch": 12.079296135427107, + "grad_norm": 0.37871089577674866, + "learning_rate": 2.0318961312234257e-05, + "loss": 0.0141, + "num_input_tokens_seen": 131983232, + "step": 108460 + }, + { + "epoch": 12.079852990310725, + "grad_norm": 7.959918730193749e-05, + "learning_rate": 2.0316574569132076e-05, + "loss": 0.0926, + "num_input_tokens_seen": 131989280, + "step": 108465 + }, + { + "epoch": 12.080409845194342, + "grad_norm": 1.1581095457077026, + "learning_rate": 2.031418787026861e-05, + "loss": 0.0564, + "num_input_tokens_seen": 131995392, + "step": 108470 + }, + { + "epoch": 12.08096670007796, + "grad_norm": 0.08816014975309372, + "learning_rate": 2.0311801215666383e-05, + "loss": 0.0018, + "num_input_tokens_seen": 132001472, + "step": 108475 + }, + { + "epoch": 12.081523554961578, + "grad_norm": 0.0020568727049976587, + "learning_rate": 2.0309414605347953e-05, + "loss": 0.0131, + "num_input_tokens_seen": 132007744, + "step": 108480 + }, + { + "epoch": 12.082080409845194, + "grad_norm": 0.1274263709783554, + "learning_rate": 2.030702803933585e-05, + "loss": 0.0036, + "num_input_tokens_seen": 132013472, + "step": 108485 + }, + { + "epoch": 12.082637264728811, + "grad_norm": 0.003588000312447548, + "learning_rate": 2.0304641517652647e-05, + "loss": 0.0446, + "num_input_tokens_seen": 132019776, + "step": 108490 + }, + { + "epoch": 12.083194119612429, + "grad_norm": 0.02242821641266346, + "learning_rate": 2.030225504032085e-05, + "loss": 0.0032, + "num_input_tokens_seen": 132025568, + "step": 108495 + }, + { + "epoch": 12.083750974496047, + "grad_norm": 0.02419205568730831, + "learning_rate": 2.0299868607363028e-05, + "loss": 0.0502, + "num_input_tokens_seen": 132031072, + "step": 108500 + }, + { + "epoch": 12.084307829379664, + "grad_norm": 0.004293297417461872, + "learning_rate": 2.0297482218801706e-05, + "loss": 0.0761, + "num_input_tokens_seen": 132037120, + "step": 108505 + }, + { + "epoch": 12.08486468426328, + "grad_norm": 0.022320955991744995, + "learning_rate": 2.029509587465944e-05, + "loss": 0.016, + "num_input_tokens_seen": 132043264, + "step": 108510 + }, + { + "epoch": 12.085421539146898, + "grad_norm": 0.022355157881975174, + "learning_rate": 2.0292709574958756e-05, + "loss": 0.0506, + "num_input_tokens_seen": 132049600, + "step": 108515 + }, + { + "epoch": 12.085978394030516, + "grad_norm": 0.004794543143361807, + "learning_rate": 2.0290323319722206e-05, + "loss": 0.1727, + "num_input_tokens_seen": 132055904, + "step": 108520 + }, + { + "epoch": 12.086535248914133, + "grad_norm": 0.8404191732406616, + "learning_rate": 2.028793710897232e-05, + "loss": 0.0533, + "num_input_tokens_seen": 132061760, + "step": 108525 + }, + { + "epoch": 12.087092103797751, + "grad_norm": 0.0011256368597969413, + "learning_rate": 2.0285550942731652e-05, + "loss": 0.0411, + "num_input_tokens_seen": 132067872, + "step": 108530 + }, + { + "epoch": 12.087648958681367, + "grad_norm": 0.6181650161743164, + "learning_rate": 2.0283164821022724e-05, + "loss": 0.0999, + "num_input_tokens_seen": 132074048, + "step": 108535 + }, + { + "epoch": 12.088205813564985, + "grad_norm": 0.00011293784336885437, + "learning_rate": 2.028077874386809e-05, + "loss": 0.0515, + "num_input_tokens_seen": 132080352, + "step": 108540 + }, + { + "epoch": 12.088762668448602, + "grad_norm": 0.8334814310073853, + "learning_rate": 2.0278392711290266e-05, + "loss": 0.0317, + "num_input_tokens_seen": 132086336, + "step": 108545 + }, + { + "epoch": 12.08931952333222, + "grad_norm": 0.005102680996060371, + "learning_rate": 2.027600672331183e-05, + "loss": 0.0007, + "num_input_tokens_seen": 132092768, + "step": 108550 + }, + { + "epoch": 12.089876378215838, + "grad_norm": 0.0008134509553201497, + "learning_rate": 2.0273620779955273e-05, + "loss": 0.0161, + "num_input_tokens_seen": 132098880, + "step": 108555 + }, + { + "epoch": 12.090433233099454, + "grad_norm": 1.4357187747955322, + "learning_rate": 2.0271234881243168e-05, + "loss": 0.0796, + "num_input_tokens_seen": 132105408, + "step": 108560 + }, + { + "epoch": 12.090990087983071, + "grad_norm": 1.3045960664749146, + "learning_rate": 2.0268849027198034e-05, + "loss": 0.1194, + "num_input_tokens_seen": 132111360, + "step": 108565 + }, + { + "epoch": 12.091546942866689, + "grad_norm": 0.0013927003601565957, + "learning_rate": 2.0266463217842415e-05, + "loss": 0.0057, + "num_input_tokens_seen": 132117568, + "step": 108570 + }, + { + "epoch": 12.092103797750307, + "grad_norm": 0.4004252851009369, + "learning_rate": 2.0264077453198838e-05, + "loss": 0.0864, + "num_input_tokens_seen": 132123360, + "step": 108575 + }, + { + "epoch": 12.092660652633924, + "grad_norm": 0.0004801298782695085, + "learning_rate": 2.026169173328985e-05, + "loss": 0.0143, + "num_input_tokens_seen": 132129440, + "step": 108580 + }, + { + "epoch": 12.09321750751754, + "grad_norm": 0.08281857520341873, + "learning_rate": 2.0259306058137974e-05, + "loss": 0.0012, + "num_input_tokens_seen": 132135392, + "step": 108585 + }, + { + "epoch": 12.093774362401158, + "grad_norm": 0.0085039297118783, + "learning_rate": 2.0256920427765754e-05, + "loss": 0.0003, + "num_input_tokens_seen": 132141888, + "step": 108590 + }, + { + "epoch": 12.094331217284775, + "grad_norm": 0.15417662262916565, + "learning_rate": 2.025453484219572e-05, + "loss": 0.0039, + "num_input_tokens_seen": 132148160, + "step": 108595 + }, + { + "epoch": 12.094888072168393, + "grad_norm": 0.9117783904075623, + "learning_rate": 2.0252149301450406e-05, + "loss": 0.0171, + "num_input_tokens_seen": 132154336, + "step": 108600 + }, + { + "epoch": 12.09544492705201, + "grad_norm": 0.03192802518606186, + "learning_rate": 2.0249763805552337e-05, + "loss": 0.0007, + "num_input_tokens_seen": 132160192, + "step": 108605 + }, + { + "epoch": 12.096001781935627, + "grad_norm": 0.1437610387802124, + "learning_rate": 2.0247378354524073e-05, + "loss": 0.0159, + "num_input_tokens_seen": 132166272, + "step": 108610 + }, + { + "epoch": 12.096558636819244, + "grad_norm": 9.458452404942364e-05, + "learning_rate": 2.0244992948388112e-05, + "loss": 0.0248, + "num_input_tokens_seen": 132172512, + "step": 108615 + }, + { + "epoch": 12.097115491702862, + "grad_norm": 0.007184638176113367, + "learning_rate": 2.0242607587167018e-05, + "loss": 0.0123, + "num_input_tokens_seen": 132177728, + "step": 108620 + }, + { + "epoch": 12.09767234658648, + "grad_norm": 1.1878159046173096, + "learning_rate": 2.0240222270883288e-05, + "loss": 0.0254, + "num_input_tokens_seen": 132183872, + "step": 108625 + }, + { + "epoch": 12.098229201470097, + "grad_norm": 0.00020292754925321788, + "learning_rate": 2.0237836999559484e-05, + "loss": 0.0555, + "num_input_tokens_seen": 132189856, + "step": 108630 + }, + { + "epoch": 12.098786056353715, + "grad_norm": 1.0940113067626953, + "learning_rate": 2.023545177321812e-05, + "loss": 0.0268, + "num_input_tokens_seen": 132195520, + "step": 108635 + }, + { + "epoch": 12.099342911237331, + "grad_norm": 0.7800055742263794, + "learning_rate": 2.023306659188174e-05, + "loss": 0.0911, + "num_input_tokens_seen": 132201536, + "step": 108640 + }, + { + "epoch": 12.099899766120949, + "grad_norm": 1.1732097864151, + "learning_rate": 2.023068145557286e-05, + "loss": 0.0668, + "num_input_tokens_seen": 132207776, + "step": 108645 + }, + { + "epoch": 12.100456621004566, + "grad_norm": 1.4144073724746704, + "learning_rate": 2.022829636431401e-05, + "loss": 0.09, + "num_input_tokens_seen": 132213760, + "step": 108650 + }, + { + "epoch": 12.101013475888184, + "grad_norm": 1.1239042282104492, + "learning_rate": 2.0225911318127732e-05, + "loss": 0.0191, + "num_input_tokens_seen": 132219744, + "step": 108655 + }, + { + "epoch": 12.101570330771802, + "grad_norm": 2.0458598136901855, + "learning_rate": 2.022352631703654e-05, + "loss": 0.1144, + "num_input_tokens_seen": 132225504, + "step": 108660 + }, + { + "epoch": 12.102127185655418, + "grad_norm": 0.01258212048560381, + "learning_rate": 2.0221141361062977e-05, + "loss": 0.0391, + "num_input_tokens_seen": 132231808, + "step": 108665 + }, + { + "epoch": 12.102684040539035, + "grad_norm": 0.022321073338389397, + "learning_rate": 2.0218756450229555e-05, + "loss": 0.02, + "num_input_tokens_seen": 132237824, + "step": 108670 + }, + { + "epoch": 12.103240895422653, + "grad_norm": 0.17574496567249298, + "learning_rate": 2.0216371584558812e-05, + "loss": 0.03, + "num_input_tokens_seen": 132243872, + "step": 108675 + }, + { + "epoch": 12.10379775030627, + "grad_norm": 0.00011184844333911315, + "learning_rate": 2.0213986764073268e-05, + "loss": 0.0055, + "num_input_tokens_seen": 132249664, + "step": 108680 + }, + { + "epoch": 12.104354605189888, + "grad_norm": 0.0010316995903849602, + "learning_rate": 2.021160198879546e-05, + "loss": 0.0476, + "num_input_tokens_seen": 132255456, + "step": 108685 + }, + { + "epoch": 12.104911460073504, + "grad_norm": 0.054778292775154114, + "learning_rate": 2.0209217258747894e-05, + "loss": 0.0419, + "num_input_tokens_seen": 132261344, + "step": 108690 + }, + { + "epoch": 12.105468314957122, + "grad_norm": 0.1864781528711319, + "learning_rate": 2.020683257395313e-05, + "loss": 0.0061, + "num_input_tokens_seen": 132267200, + "step": 108695 + }, + { + "epoch": 12.10602516984074, + "grad_norm": 0.026286320760846138, + "learning_rate": 2.020444793443365e-05, + "loss": 0.0072, + "num_input_tokens_seen": 132273248, + "step": 108700 + }, + { + "epoch": 12.106582024724357, + "grad_norm": 0.25272682309150696, + "learning_rate": 2.0202063340212023e-05, + "loss": 0.0166, + "num_input_tokens_seen": 132278720, + "step": 108705 + }, + { + "epoch": 12.107138879607975, + "grad_norm": 0.3392818570137024, + "learning_rate": 2.0199678791310735e-05, + "loss": 0.0335, + "num_input_tokens_seen": 132284480, + "step": 108710 + }, + { + "epoch": 12.10769573449159, + "grad_norm": 1.1397649049758911, + "learning_rate": 2.0197294287752336e-05, + "loss": 0.086, + "num_input_tokens_seen": 132290720, + "step": 108715 + }, + { + "epoch": 12.108252589375208, + "grad_norm": 0.012818989343941212, + "learning_rate": 2.0194909829559335e-05, + "loss": 0.0695, + "num_input_tokens_seen": 132296768, + "step": 108720 + }, + { + "epoch": 12.108809444258826, + "grad_norm": 0.049263183027505875, + "learning_rate": 2.019252541675427e-05, + "loss": 0.0012, + "num_input_tokens_seen": 132302560, + "step": 108725 + }, + { + "epoch": 12.109366299142444, + "grad_norm": 0.7956482768058777, + "learning_rate": 2.0190141049359643e-05, + "loss": 0.03, + "num_input_tokens_seen": 132308736, + "step": 108730 + }, + { + "epoch": 12.109923154026061, + "grad_norm": 0.031210023909807205, + "learning_rate": 2.0187756727397993e-05, + "loss": 0.0029, + "num_input_tokens_seen": 132314592, + "step": 108735 + }, + { + "epoch": 12.110480008909677, + "grad_norm": 0.002294982550665736, + "learning_rate": 2.0185372450891835e-05, + "loss": 0.1903, + "num_input_tokens_seen": 132320352, + "step": 108740 + }, + { + "epoch": 12.111036863793295, + "grad_norm": 0.1601322740316391, + "learning_rate": 2.0182988219863695e-05, + "loss": 0.0183, + "num_input_tokens_seen": 132326528, + "step": 108745 + }, + { + "epoch": 12.111593718676913, + "grad_norm": 1.2093403339385986, + "learning_rate": 2.0180604034336085e-05, + "loss": 0.026, + "num_input_tokens_seen": 132332896, + "step": 108750 + }, + { + "epoch": 12.11215057356053, + "grad_norm": 0.006197134964168072, + "learning_rate": 2.0178219894331536e-05, + "loss": 0.0098, + "num_input_tokens_seen": 132339008, + "step": 108755 + }, + { + "epoch": 12.112707428444148, + "grad_norm": 1.4038681983947754, + "learning_rate": 2.017583579987255e-05, + "loss": 0.0427, + "num_input_tokens_seen": 132345024, + "step": 108760 + }, + { + "epoch": 12.113264283327764, + "grad_norm": 0.02803514152765274, + "learning_rate": 2.017345175098168e-05, + "loss": 0.0022, + "num_input_tokens_seen": 132351296, + "step": 108765 + }, + { + "epoch": 12.113821138211382, + "grad_norm": 0.015347016975283623, + "learning_rate": 2.017106774768141e-05, + "loss": 0.0069, + "num_input_tokens_seen": 132357440, + "step": 108770 + }, + { + "epoch": 12.114377993095, + "grad_norm": 0.013529093004763126, + "learning_rate": 2.0168683789994283e-05, + "loss": 0.0047, + "num_input_tokens_seen": 132363712, + "step": 108775 + }, + { + "epoch": 12.114934847978617, + "grad_norm": 0.003050811355933547, + "learning_rate": 2.01662998779428e-05, + "loss": 0.0343, + "num_input_tokens_seen": 132369888, + "step": 108780 + }, + { + "epoch": 12.115491702862235, + "grad_norm": 0.6197918057441711, + "learning_rate": 2.0163916011549496e-05, + "loss": 0.0146, + "num_input_tokens_seen": 132375968, + "step": 108785 + }, + { + "epoch": 12.116048557745852, + "grad_norm": 0.019876442849636078, + "learning_rate": 2.0161532190836876e-05, + "loss": 0.0025, + "num_input_tokens_seen": 132382016, + "step": 108790 + }, + { + "epoch": 12.116605412629468, + "grad_norm": 0.021255258470773697, + "learning_rate": 2.0159148415827464e-05, + "loss": 0.0753, + "num_input_tokens_seen": 132387712, + "step": 108795 + }, + { + "epoch": 12.117162267513086, + "grad_norm": 0.10794266313314438, + "learning_rate": 2.015676468654377e-05, + "loss": 0.0535, + "num_input_tokens_seen": 132393856, + "step": 108800 + }, + { + "epoch": 12.117719122396704, + "grad_norm": 0.011382471770048141, + "learning_rate": 2.0154381003008315e-05, + "loss": 0.0358, + "num_input_tokens_seen": 132400064, + "step": 108805 + }, + { + "epoch": 12.118275977280321, + "grad_norm": 0.14244458079338074, + "learning_rate": 2.0151997365243613e-05, + "loss": 0.0084, + "num_input_tokens_seen": 132406112, + "step": 108810 + }, + { + "epoch": 12.118832832163939, + "grad_norm": 0.06834665685892105, + "learning_rate": 2.0149613773272183e-05, + "loss": 0.0398, + "num_input_tokens_seen": 132412128, + "step": 108815 + }, + { + "epoch": 12.119389687047555, + "grad_norm": 0.00863644853234291, + "learning_rate": 2.0147230227116524e-05, + "loss": 0.0474, + "num_input_tokens_seen": 132418144, + "step": 108820 + }, + { + "epoch": 12.119946541931172, + "grad_norm": 0.0006063695764169097, + "learning_rate": 2.0144846726799185e-05, + "loss": 0.0039, + "num_input_tokens_seen": 132423488, + "step": 108825 + }, + { + "epoch": 12.12050339681479, + "grad_norm": 0.6465527415275574, + "learning_rate": 2.014246327234264e-05, + "loss": 0.0239, + "num_input_tokens_seen": 132429536, + "step": 108830 + }, + { + "epoch": 12.121060251698408, + "grad_norm": 2.1023426055908203, + "learning_rate": 2.014007986376943e-05, + "loss": 0.1167, + "num_input_tokens_seen": 132435200, + "step": 108835 + }, + { + "epoch": 12.121617106582026, + "grad_norm": 0.4371498227119446, + "learning_rate": 2.0137696501102057e-05, + "loss": 0.0822, + "num_input_tokens_seen": 132440608, + "step": 108840 + }, + { + "epoch": 12.122173961465641, + "grad_norm": 0.0018402516143396497, + "learning_rate": 2.0135313184363042e-05, + "loss": 0.0037, + "num_input_tokens_seen": 132446944, + "step": 108845 + }, + { + "epoch": 12.122730816349259, + "grad_norm": 0.2920821011066437, + "learning_rate": 2.0132929913574884e-05, + "loss": 0.0136, + "num_input_tokens_seen": 132453184, + "step": 108850 + }, + { + "epoch": 12.123287671232877, + "grad_norm": 0.09524203836917877, + "learning_rate": 2.0130546688760107e-05, + "loss": 0.0297, + "num_input_tokens_seen": 132459168, + "step": 108855 + }, + { + "epoch": 12.123844526116494, + "grad_norm": 0.0003900196752510965, + "learning_rate": 2.0128163509941214e-05, + "loss": 0.0455, + "num_input_tokens_seen": 132465312, + "step": 108860 + }, + { + "epoch": 12.124401381000112, + "grad_norm": 1.0451099872589111, + "learning_rate": 2.0125780377140727e-05, + "loss": 0.0341, + "num_input_tokens_seen": 132471648, + "step": 108865 + }, + { + "epoch": 12.124958235883728, + "grad_norm": 0.02505040355026722, + "learning_rate": 2.0123397290381142e-05, + "loss": 0.0379, + "num_input_tokens_seen": 132477792, + "step": 108870 + }, + { + "epoch": 12.125515090767346, + "grad_norm": 0.12448494136333466, + "learning_rate": 2.012101424968498e-05, + "loss": 0.0019, + "num_input_tokens_seen": 132483744, + "step": 108875 + }, + { + "epoch": 12.126071945650963, + "grad_norm": 0.26050058007240295, + "learning_rate": 2.0118631255074745e-05, + "loss": 0.0121, + "num_input_tokens_seen": 132489472, + "step": 108880 + }, + { + "epoch": 12.126628800534581, + "grad_norm": 1.8961241245269775, + "learning_rate": 2.0116248306572954e-05, + "loss": 0.0619, + "num_input_tokens_seen": 132495200, + "step": 108885 + }, + { + "epoch": 12.127185655418199, + "grad_norm": 0.0003499474551063031, + "learning_rate": 2.0113865404202098e-05, + "loss": 0.007, + "num_input_tokens_seen": 132501248, + "step": 108890 + }, + { + "epoch": 12.127742510301815, + "grad_norm": 0.7101922631263733, + "learning_rate": 2.0111482547984716e-05, + "loss": 0.0135, + "num_input_tokens_seen": 132506752, + "step": 108895 + }, + { + "epoch": 12.128299365185432, + "grad_norm": 0.005895084235817194, + "learning_rate": 2.0109099737943284e-05, + "loss": 0.0365, + "num_input_tokens_seen": 132513024, + "step": 108900 + }, + { + "epoch": 12.12885622006905, + "grad_norm": 0.5394695997238159, + "learning_rate": 2.010671697410033e-05, + "loss": 0.1293, + "num_input_tokens_seen": 132519072, + "step": 108905 + }, + { + "epoch": 12.129413074952668, + "grad_norm": 0.018621746450662613, + "learning_rate": 2.0104334256478347e-05, + "loss": 0.0373, + "num_input_tokens_seen": 132525088, + "step": 108910 + }, + { + "epoch": 12.129969929836285, + "grad_norm": 1.266158938407898, + "learning_rate": 2.010195158509986e-05, + "loss": 0.1032, + "num_input_tokens_seen": 132530880, + "step": 108915 + }, + { + "epoch": 12.130526784719901, + "grad_norm": 0.4384775459766388, + "learning_rate": 2.0099568959987358e-05, + "loss": 0.0153, + "num_input_tokens_seen": 132537152, + "step": 108920 + }, + { + "epoch": 12.131083639603519, + "grad_norm": 7.543275569332764e-05, + "learning_rate": 2.009718638116336e-05, + "loss": 0.0231, + "num_input_tokens_seen": 132543136, + "step": 108925 + }, + { + "epoch": 12.131640494487137, + "grad_norm": 0.0003319086681585759, + "learning_rate": 2.0094803848650354e-05, + "loss": 0.0403, + "num_input_tokens_seen": 132549216, + "step": 108930 + }, + { + "epoch": 12.132197349370754, + "grad_norm": 0.8900489807128906, + "learning_rate": 2.0092421362470865e-05, + "loss": 0.1175, + "num_input_tokens_seen": 132555648, + "step": 108935 + }, + { + "epoch": 12.132754204254372, + "grad_norm": 0.0016456743469461799, + "learning_rate": 2.009003892264738e-05, + "loss": 0.1144, + "num_input_tokens_seen": 132561632, + "step": 108940 + }, + { + "epoch": 12.133311059137988, + "grad_norm": 1.258056879043579, + "learning_rate": 2.008765652920242e-05, + "loss": 0.0236, + "num_input_tokens_seen": 132567968, + "step": 108945 + }, + { + "epoch": 12.133867914021605, + "grad_norm": 0.0156837347894907, + "learning_rate": 2.008527418215847e-05, + "loss": 0.0818, + "num_input_tokens_seen": 132574400, + "step": 108950 + }, + { + "epoch": 12.134424768905223, + "grad_norm": 0.0022020034957677126, + "learning_rate": 2.0082891881538053e-05, + "loss": 0.0094, + "num_input_tokens_seen": 132580640, + "step": 108955 + }, + { + "epoch": 12.13498162378884, + "grad_norm": 0.039787184447050095, + "learning_rate": 2.008050962736365e-05, + "loss": 0.0578, + "num_input_tokens_seen": 132586112, + "step": 108960 + }, + { + "epoch": 12.135538478672458, + "grad_norm": 0.08123739808797836, + "learning_rate": 2.0078127419657785e-05, + "loss": 0.0055, + "num_input_tokens_seen": 132592320, + "step": 108965 + }, + { + "epoch": 12.136095333556074, + "grad_norm": 0.8578982353210449, + "learning_rate": 2.0075745258442947e-05, + "loss": 0.0737, + "num_input_tokens_seen": 132598400, + "step": 108970 + }, + { + "epoch": 12.136652188439692, + "grad_norm": 0.00015639080083929002, + "learning_rate": 2.0073363143741642e-05, + "loss": 0.033, + "num_input_tokens_seen": 132604448, + "step": 108975 + }, + { + "epoch": 12.13720904332331, + "grad_norm": 1.8727807998657227, + "learning_rate": 2.0070981075576365e-05, + "loss": 0.1015, + "num_input_tokens_seen": 132610848, + "step": 108980 + }, + { + "epoch": 12.137765898206927, + "grad_norm": 3.039994955062866, + "learning_rate": 2.0068599053969626e-05, + "loss": 0.2414, + "num_input_tokens_seen": 132616544, + "step": 108985 + }, + { + "epoch": 12.138322753090545, + "grad_norm": 0.06197775527834892, + "learning_rate": 2.0066217078943912e-05, + "loss": 0.0014, + "num_input_tokens_seen": 132622656, + "step": 108990 + }, + { + "epoch": 12.138879607974163, + "grad_norm": 0.014365348033607006, + "learning_rate": 2.006383515052174e-05, + "loss": 0.0315, + "num_input_tokens_seen": 132629024, + "step": 108995 + }, + { + "epoch": 12.139436462857779, + "grad_norm": 0.07941578328609467, + "learning_rate": 2.0061453268725593e-05, + "loss": 0.0498, + "num_input_tokens_seen": 132635168, + "step": 109000 + }, + { + "epoch": 12.139993317741396, + "grad_norm": 0.07916700094938278, + "learning_rate": 2.0059071433577985e-05, + "loss": 0.0397, + "num_input_tokens_seen": 132641184, + "step": 109005 + }, + { + "epoch": 12.140550172625014, + "grad_norm": 0.002335968427360058, + "learning_rate": 2.005668964510139e-05, + "loss": 0.0729, + "num_input_tokens_seen": 132647616, + "step": 109010 + }, + { + "epoch": 12.141107027508632, + "grad_norm": 0.04617474600672722, + "learning_rate": 2.005430790331834e-05, + "loss": 0.0949, + "num_input_tokens_seen": 132652896, + "step": 109015 + }, + { + "epoch": 12.14166388239225, + "grad_norm": 0.28748181462287903, + "learning_rate": 2.0051926208251298e-05, + "loss": 0.1224, + "num_input_tokens_seen": 132658272, + "step": 109020 + }, + { + "epoch": 12.142220737275865, + "grad_norm": 0.7161537408828735, + "learning_rate": 2.0049544559922794e-05, + "loss": 0.0937, + "num_input_tokens_seen": 132664352, + "step": 109025 + }, + { + "epoch": 12.142777592159483, + "grad_norm": 0.5171856880187988, + "learning_rate": 2.0047162958355292e-05, + "loss": 0.0369, + "num_input_tokens_seen": 132670048, + "step": 109030 + }, + { + "epoch": 12.1433344470431, + "grad_norm": 0.07116563618183136, + "learning_rate": 2.0044781403571314e-05, + "loss": 0.0162, + "num_input_tokens_seen": 132676192, + "step": 109035 + }, + { + "epoch": 12.143891301926718, + "grad_norm": 0.6999626159667969, + "learning_rate": 2.0042399895593343e-05, + "loss": 0.1099, + "num_input_tokens_seen": 132682272, + "step": 109040 + }, + { + "epoch": 12.144448156810336, + "grad_norm": 0.00032698240829631686, + "learning_rate": 2.0040018434443877e-05, + "loss": 0.1131, + "num_input_tokens_seen": 132687712, + "step": 109045 + }, + { + "epoch": 12.145005011693952, + "grad_norm": 1.3692057132720947, + "learning_rate": 2.0037637020145415e-05, + "loss": 0.0948, + "num_input_tokens_seen": 132693824, + "step": 109050 + }, + { + "epoch": 12.14556186657757, + "grad_norm": 0.2274361103773117, + "learning_rate": 2.003525565272044e-05, + "loss": 0.0975, + "num_input_tokens_seen": 132700160, + "step": 109055 + }, + { + "epoch": 12.146118721461187, + "grad_norm": 0.02492304891347885, + "learning_rate": 2.003287433219146e-05, + "loss": 0.0726, + "num_input_tokens_seen": 132705856, + "step": 109060 + }, + { + "epoch": 12.146675576344805, + "grad_norm": 0.3356721103191376, + "learning_rate": 2.0030493058580953e-05, + "loss": 0.1115, + "num_input_tokens_seen": 132712032, + "step": 109065 + }, + { + "epoch": 12.147232431228423, + "grad_norm": 0.011924990452826023, + "learning_rate": 2.002811183191143e-05, + "loss": 0.0573, + "num_input_tokens_seen": 132718496, + "step": 109070 + }, + { + "epoch": 12.147789286112038, + "grad_norm": 0.056277982890605927, + "learning_rate": 2.0025730652205367e-05, + "loss": 0.0019, + "num_input_tokens_seen": 132724800, + "step": 109075 + }, + { + "epoch": 12.148346140995656, + "grad_norm": 0.0757351890206337, + "learning_rate": 2.002334951948527e-05, + "loss": 0.0061, + "num_input_tokens_seen": 132730784, + "step": 109080 + }, + { + "epoch": 12.148902995879274, + "grad_norm": 1.4366633892059326, + "learning_rate": 2.002096843377361e-05, + "loss": 0.1062, + "num_input_tokens_seen": 132736704, + "step": 109085 + }, + { + "epoch": 12.149459850762891, + "grad_norm": 0.35233479738235474, + "learning_rate": 2.0018587395092907e-05, + "loss": 0.0723, + "num_input_tokens_seen": 132742304, + "step": 109090 + }, + { + "epoch": 12.15001670564651, + "grad_norm": 0.6818049550056458, + "learning_rate": 2.0016206403465625e-05, + "loss": 0.0191, + "num_input_tokens_seen": 132748672, + "step": 109095 + }, + { + "epoch": 12.150573560530125, + "grad_norm": 0.0002899533719755709, + "learning_rate": 2.0013825458914282e-05, + "loss": 0.0647, + "num_input_tokens_seen": 132754656, + "step": 109100 + }, + { + "epoch": 12.151130415413743, + "grad_norm": 0.8189632296562195, + "learning_rate": 2.0011444561461335e-05, + "loss": 0.041, + "num_input_tokens_seen": 132760704, + "step": 109105 + }, + { + "epoch": 12.15168727029736, + "grad_norm": 0.18900713324546814, + "learning_rate": 2.0009063711129303e-05, + "loss": 0.0169, + "num_input_tokens_seen": 132766880, + "step": 109110 + }, + { + "epoch": 12.152244125180978, + "grad_norm": 0.31351739168167114, + "learning_rate": 2.0006682907940657e-05, + "loss": 0.0394, + "num_input_tokens_seen": 132772960, + "step": 109115 + }, + { + "epoch": 12.152800980064596, + "grad_norm": 0.023656539618968964, + "learning_rate": 2.0004302151917896e-05, + "loss": 0.0765, + "num_input_tokens_seen": 132779136, + "step": 109120 + }, + { + "epoch": 12.153357834948212, + "grad_norm": 1.5776318311691284, + "learning_rate": 2.00019214430835e-05, + "loss": 0.0691, + "num_input_tokens_seen": 132785120, + "step": 109125 + }, + { + "epoch": 12.15391468983183, + "grad_norm": 0.7298611998558044, + "learning_rate": 1.9999540781459966e-05, + "loss": 0.0265, + "num_input_tokens_seen": 132791360, + "step": 109130 + }, + { + "epoch": 12.154471544715447, + "grad_norm": 0.0021280127111822367, + "learning_rate": 1.9997160167069767e-05, + "loss": 0.0763, + "num_input_tokens_seen": 132797312, + "step": 109135 + }, + { + "epoch": 12.155028399599065, + "grad_norm": 0.6259673833847046, + "learning_rate": 1.9994779599935408e-05, + "loss": 0.0056, + "num_input_tokens_seen": 132803648, + "step": 109140 + }, + { + "epoch": 12.155585254482682, + "grad_norm": 1.4156609773635864, + "learning_rate": 1.9992399080079358e-05, + "loss": 0.043, + "num_input_tokens_seen": 132809856, + "step": 109145 + }, + { + "epoch": 12.1561421093663, + "grad_norm": 1.3636419773101807, + "learning_rate": 1.9990018607524118e-05, + "loss": 0.123, + "num_input_tokens_seen": 132815648, + "step": 109150 + }, + { + "epoch": 12.156698964249916, + "grad_norm": 0.0004803170158993453, + "learning_rate": 1.9987638182292155e-05, + "loss": 0.0981, + "num_input_tokens_seen": 132821696, + "step": 109155 + }, + { + "epoch": 12.157255819133534, + "grad_norm": 0.0042866594158113, + "learning_rate": 1.9985257804405976e-05, + "loss": 0.0161, + "num_input_tokens_seen": 132827872, + "step": 109160 + }, + { + "epoch": 12.157812674017151, + "grad_norm": 0.07104983180761337, + "learning_rate": 1.9982877473888044e-05, + "loss": 0.0214, + "num_input_tokens_seen": 132833984, + "step": 109165 + }, + { + "epoch": 12.158369528900769, + "grad_norm": 0.007810730021446943, + "learning_rate": 1.9980497190760867e-05, + "loss": 0.035, + "num_input_tokens_seen": 132840416, + "step": 109170 + }, + { + "epoch": 12.158926383784387, + "grad_norm": 0.28137877583503723, + "learning_rate": 1.9978116955046905e-05, + "loss": 0.0321, + "num_input_tokens_seen": 132846752, + "step": 109175 + }, + { + "epoch": 12.159483238668003, + "grad_norm": 0.6521910429000854, + "learning_rate": 1.997573676676866e-05, + "loss": 0.0187, + "num_input_tokens_seen": 132852992, + "step": 109180 + }, + { + "epoch": 12.16004009355162, + "grad_norm": 0.2698107659816742, + "learning_rate": 1.9973356625948597e-05, + "loss": 0.0702, + "num_input_tokens_seen": 132859072, + "step": 109185 + }, + { + "epoch": 12.160596948435238, + "grad_norm": 0.5607172250747681, + "learning_rate": 1.9970976532609218e-05, + "loss": 0.2616, + "num_input_tokens_seen": 132864640, + "step": 109190 + }, + { + "epoch": 12.161153803318856, + "grad_norm": 0.03453570604324341, + "learning_rate": 1.996859648677299e-05, + "loss": 0.051, + "num_input_tokens_seen": 132870976, + "step": 109195 + }, + { + "epoch": 12.161710658202473, + "grad_norm": 0.00039492922951467335, + "learning_rate": 1.9966216488462402e-05, + "loss": 0.0334, + "num_input_tokens_seen": 132877184, + "step": 109200 + }, + { + "epoch": 12.162267513086089, + "grad_norm": 0.9704185724258423, + "learning_rate": 1.9963836537699925e-05, + "loss": 0.0852, + "num_input_tokens_seen": 132883264, + "step": 109205 + }, + { + "epoch": 12.162824367969707, + "grad_norm": 0.1660199910402298, + "learning_rate": 1.996145663450806e-05, + "loss": 0.0522, + "num_input_tokens_seen": 132889536, + "step": 109210 + }, + { + "epoch": 12.163381222853324, + "grad_norm": 0.07120644301176071, + "learning_rate": 1.995907677890926e-05, + "loss": 0.0025, + "num_input_tokens_seen": 132895616, + "step": 109215 + }, + { + "epoch": 12.163938077736942, + "grad_norm": 0.8192746043205261, + "learning_rate": 1.9956696970926024e-05, + "loss": 0.0479, + "num_input_tokens_seen": 132901760, + "step": 109220 + }, + { + "epoch": 12.16449493262056, + "grad_norm": 0.19086210429668427, + "learning_rate": 1.995431721058082e-05, + "loss": 0.0075, + "num_input_tokens_seen": 132907776, + "step": 109225 + }, + { + "epoch": 12.165051787504176, + "grad_norm": 0.004704990424215794, + "learning_rate": 1.9951937497896147e-05, + "loss": 0.054, + "num_input_tokens_seen": 132913920, + "step": 109230 + }, + { + "epoch": 12.165608642387793, + "grad_norm": 0.00023077100922819227, + "learning_rate": 1.994955783289445e-05, + "loss": 0.0437, + "num_input_tokens_seen": 132919840, + "step": 109235 + }, + { + "epoch": 12.166165497271411, + "grad_norm": 0.013513650745153427, + "learning_rate": 1.994717821559824e-05, + "loss": 0.0272, + "num_input_tokens_seen": 132925888, + "step": 109240 + }, + { + "epoch": 12.166722352155029, + "grad_norm": 1.0712811946868896, + "learning_rate": 1.9944798646029976e-05, + "loss": 0.0891, + "num_input_tokens_seen": 132932160, + "step": 109245 + }, + { + "epoch": 12.167279207038646, + "grad_norm": 0.005987302400171757, + "learning_rate": 1.9942419124212143e-05, + "loss": 0.0706, + "num_input_tokens_seen": 132938496, + "step": 109250 + }, + { + "epoch": 12.167836061922262, + "grad_norm": 0.1372145712375641, + "learning_rate": 1.9940039650167205e-05, + "loss": 0.0168, + "num_input_tokens_seen": 132944192, + "step": 109255 + }, + { + "epoch": 12.16839291680588, + "grad_norm": 0.025482501834630966, + "learning_rate": 1.9937660223917653e-05, + "loss": 0.021, + "num_input_tokens_seen": 132950528, + "step": 109260 + }, + { + "epoch": 12.168949771689498, + "grad_norm": 0.16082125902175903, + "learning_rate": 1.993528084548595e-05, + "loss": 0.0035, + "num_input_tokens_seen": 132956640, + "step": 109265 + }, + { + "epoch": 12.169506626573115, + "grad_norm": 0.09571098536252975, + "learning_rate": 1.9932901514894587e-05, + "loss": 0.0092, + "num_input_tokens_seen": 132963008, + "step": 109270 + }, + { + "epoch": 12.170063481456733, + "grad_norm": 0.19207081198692322, + "learning_rate": 1.993052223216602e-05, + "loss": 0.0197, + "num_input_tokens_seen": 132969312, + "step": 109275 + }, + { + "epoch": 12.170620336340349, + "grad_norm": 0.8161581754684448, + "learning_rate": 1.9928142997322736e-05, + "loss": 0.0456, + "num_input_tokens_seen": 132975360, + "step": 109280 + }, + { + "epoch": 12.171177191223967, + "grad_norm": 0.10677532106637955, + "learning_rate": 1.99257638103872e-05, + "loss": 0.0058, + "num_input_tokens_seen": 132981568, + "step": 109285 + }, + { + "epoch": 12.171734046107584, + "grad_norm": 0.3152264654636383, + "learning_rate": 1.99233846713819e-05, + "loss": 0.0149, + "num_input_tokens_seen": 132987424, + "step": 109290 + }, + { + "epoch": 12.172290900991202, + "grad_norm": 0.15828071534633636, + "learning_rate": 1.9921005580329284e-05, + "loss": 0.0328, + "num_input_tokens_seen": 132993504, + "step": 109295 + }, + { + "epoch": 12.17284775587482, + "grad_norm": 0.34399479627609253, + "learning_rate": 1.9918626537251857e-05, + "loss": 0.0474, + "num_input_tokens_seen": 132999392, + "step": 109300 + }, + { + "epoch": 12.173404610758435, + "grad_norm": 0.2757348120212555, + "learning_rate": 1.991624754217206e-05, + "loss": 0.0929, + "num_input_tokens_seen": 133005536, + "step": 109305 + }, + { + "epoch": 12.173961465642053, + "grad_norm": 0.1706429123878479, + "learning_rate": 1.9913868595112384e-05, + "loss": 0.1458, + "num_input_tokens_seen": 133011712, + "step": 109310 + }, + { + "epoch": 12.17451832052567, + "grad_norm": 0.03843580558896065, + "learning_rate": 1.9911489696095292e-05, + "loss": 0.0194, + "num_input_tokens_seen": 133017440, + "step": 109315 + }, + { + "epoch": 12.175075175409289, + "grad_norm": 0.014215008355677128, + "learning_rate": 1.990911084514326e-05, + "loss": 0.0309, + "num_input_tokens_seen": 133023456, + "step": 109320 + }, + { + "epoch": 12.175632030292906, + "grad_norm": 0.761868417263031, + "learning_rate": 1.9906732042278753e-05, + "loss": 0.0116, + "num_input_tokens_seen": 133029728, + "step": 109325 + }, + { + "epoch": 12.176188885176524, + "grad_norm": 0.010230372659862041, + "learning_rate": 1.9904353287524243e-05, + "loss": 0.0352, + "num_input_tokens_seen": 133035808, + "step": 109330 + }, + { + "epoch": 12.17674574006014, + "grad_norm": 0.02487565204501152, + "learning_rate": 1.9901974580902198e-05, + "loss": 0.0619, + "num_input_tokens_seen": 133041920, + "step": 109335 + }, + { + "epoch": 12.177302594943757, + "grad_norm": 1.855310082435608, + "learning_rate": 1.989959592243509e-05, + "loss": 0.07, + "num_input_tokens_seen": 133048032, + "step": 109340 + }, + { + "epoch": 12.177859449827375, + "grad_norm": 0.058107733726501465, + "learning_rate": 1.9897217312145376e-05, + "loss": 0.1055, + "num_input_tokens_seen": 133054016, + "step": 109345 + }, + { + "epoch": 12.178416304710993, + "grad_norm": 0.047777410596609116, + "learning_rate": 1.9894838750055544e-05, + "loss": 0.034, + "num_input_tokens_seen": 133060256, + "step": 109350 + }, + { + "epoch": 12.17897315959461, + "grad_norm": 0.015759117901325226, + "learning_rate": 1.9892460236188036e-05, + "loss": 0.0052, + "num_input_tokens_seen": 133066272, + "step": 109355 + }, + { + "epoch": 12.179530014478226, + "grad_norm": 0.6699228286743164, + "learning_rate": 1.9890081770565353e-05, + "loss": 0.0748, + "num_input_tokens_seen": 133072064, + "step": 109360 + }, + { + "epoch": 12.180086869361844, + "grad_norm": 2.181450843811035, + "learning_rate": 1.9887703353209924e-05, + "loss": 0.1023, + "num_input_tokens_seen": 133078400, + "step": 109365 + }, + { + "epoch": 12.180643724245462, + "grad_norm": 0.5875661969184875, + "learning_rate": 1.9885324984144244e-05, + "loss": 0.042, + "num_input_tokens_seen": 133084640, + "step": 109370 + }, + { + "epoch": 12.18120057912908, + "grad_norm": 1.6130114793777466, + "learning_rate": 1.988294666339076e-05, + "loss": 0.0634, + "num_input_tokens_seen": 133090432, + "step": 109375 + }, + { + "epoch": 12.181757434012697, + "grad_norm": 4.519295692443848, + "learning_rate": 1.9880568390971953e-05, + "loss": 0.3226, + "num_input_tokens_seen": 133096576, + "step": 109380 + }, + { + "epoch": 12.182314288896313, + "grad_norm": 0.11458739638328552, + "learning_rate": 1.987819016691027e-05, + "loss": 0.0746, + "num_input_tokens_seen": 133102816, + "step": 109385 + }, + { + "epoch": 12.18287114377993, + "grad_norm": 0.11702656000852585, + "learning_rate": 1.987581199122819e-05, + "loss": 0.1026, + "num_input_tokens_seen": 133108864, + "step": 109390 + }, + { + "epoch": 12.183427998663548, + "grad_norm": 2.4369239807128906, + "learning_rate": 1.9873433863948167e-05, + "loss": 0.0689, + "num_input_tokens_seen": 133115136, + "step": 109395 + }, + { + "epoch": 12.183984853547166, + "grad_norm": 0.0033280521165579557, + "learning_rate": 1.9871055785092674e-05, + "loss": 0.001, + "num_input_tokens_seen": 133121344, + "step": 109400 + }, + { + "epoch": 12.184541708430784, + "grad_norm": 2.3591275215148926, + "learning_rate": 1.9868677754684166e-05, + "loss": 0.1308, + "num_input_tokens_seen": 133127424, + "step": 109405 + }, + { + "epoch": 12.1850985633144, + "grad_norm": 0.5180402398109436, + "learning_rate": 1.9866299772745106e-05, + "loss": 0.0485, + "num_input_tokens_seen": 133133472, + "step": 109410 + }, + { + "epoch": 12.185655418198017, + "grad_norm": 0.2858682870864868, + "learning_rate": 1.9863921839297953e-05, + "loss": 0.0111, + "num_input_tokens_seen": 133139840, + "step": 109415 + }, + { + "epoch": 12.186212273081635, + "grad_norm": 0.49311110377311707, + "learning_rate": 1.9861543954365185e-05, + "loss": 0.0531, + "num_input_tokens_seen": 133145728, + "step": 109420 + }, + { + "epoch": 12.186769127965253, + "grad_norm": 0.0006274437182582915, + "learning_rate": 1.985916611796924e-05, + "loss": 0.0332, + "num_input_tokens_seen": 133151584, + "step": 109425 + }, + { + "epoch": 12.18732598284887, + "grad_norm": 0.028361307457089424, + "learning_rate": 1.9856788330132602e-05, + "loss": 0.0219, + "num_input_tokens_seen": 133157504, + "step": 109430 + }, + { + "epoch": 12.187882837732486, + "grad_norm": 0.17843537032604218, + "learning_rate": 1.9854410590877704e-05, + "loss": 0.0053, + "num_input_tokens_seen": 133163296, + "step": 109435 + }, + { + "epoch": 12.188439692616104, + "grad_norm": 1.1753625869750977, + "learning_rate": 1.9852032900227033e-05, + "loss": 0.0967, + "num_input_tokens_seen": 133169824, + "step": 109440 + }, + { + "epoch": 12.188996547499722, + "grad_norm": 0.01445411890745163, + "learning_rate": 1.984965525820303e-05, + "loss": 0.0047, + "num_input_tokens_seen": 133176064, + "step": 109445 + }, + { + "epoch": 12.18955340238334, + "grad_norm": 0.7855405807495117, + "learning_rate": 1.9847277664828164e-05, + "loss": 0.061, + "num_input_tokens_seen": 133182304, + "step": 109450 + }, + { + "epoch": 12.190110257266957, + "grad_norm": 0.04261688143014908, + "learning_rate": 1.984490012012488e-05, + "loss": 0.0677, + "num_input_tokens_seen": 133188416, + "step": 109455 + }, + { + "epoch": 12.190667112150573, + "grad_norm": 0.00011127362813567743, + "learning_rate": 1.9842522624115653e-05, + "loss": 0.1121, + "num_input_tokens_seen": 133194560, + "step": 109460 + }, + { + "epoch": 12.19122396703419, + "grad_norm": 1.0678927898406982, + "learning_rate": 1.9840145176822932e-05, + "loss": 0.1032, + "num_input_tokens_seen": 133200704, + "step": 109465 + }, + { + "epoch": 12.191780821917808, + "grad_norm": 1.1064292192459106, + "learning_rate": 1.983776777826917e-05, + "loss": 0.0688, + "num_input_tokens_seen": 133206080, + "step": 109470 + }, + { + "epoch": 12.192337676801426, + "grad_norm": 0.9552916884422302, + "learning_rate": 1.983539042847683e-05, + "loss": 0.049, + "num_input_tokens_seen": 133212480, + "step": 109475 + }, + { + "epoch": 12.192894531685043, + "grad_norm": 1.2208819389343262, + "learning_rate": 1.9833013127468362e-05, + "loss": 0.063, + "num_input_tokens_seen": 133218464, + "step": 109480 + }, + { + "epoch": 12.19345138656866, + "grad_norm": 0.14984536170959473, + "learning_rate": 1.9830635875266228e-05, + "loss": 0.003, + "num_input_tokens_seen": 133224672, + "step": 109485 + }, + { + "epoch": 12.194008241452277, + "grad_norm": 0.9169934988021851, + "learning_rate": 1.9828258671892873e-05, + "loss": 0.0174, + "num_input_tokens_seen": 133230880, + "step": 109490 + }, + { + "epoch": 12.194565096335895, + "grad_norm": 0.010230110958218575, + "learning_rate": 1.9825881517370767e-05, + "loss": 0.157, + "num_input_tokens_seen": 133235712, + "step": 109495 + }, + { + "epoch": 12.195121951219512, + "grad_norm": 0.8682049512863159, + "learning_rate": 1.9823504411722345e-05, + "loss": 0.0928, + "num_input_tokens_seen": 133241696, + "step": 109500 + }, + { + "epoch": 12.19567880610313, + "grad_norm": 0.0014308830723166466, + "learning_rate": 1.9821127354970088e-05, + "loss": 0.0355, + "num_input_tokens_seen": 133247104, + "step": 109505 + }, + { + "epoch": 12.196235660986748, + "grad_norm": 0.03228025510907173, + "learning_rate": 1.981875034713641e-05, + "loss": 0.0208, + "num_input_tokens_seen": 133253024, + "step": 109510 + }, + { + "epoch": 12.196792515870364, + "grad_norm": 1.3909565210342407, + "learning_rate": 1.9816373388243804e-05, + "loss": 0.0316, + "num_input_tokens_seen": 133259296, + "step": 109515 + }, + { + "epoch": 12.197349370753981, + "grad_norm": 0.33053863048553467, + "learning_rate": 1.9813996478314694e-05, + "loss": 0.0456, + "num_input_tokens_seen": 133265856, + "step": 109520 + }, + { + "epoch": 12.197906225637599, + "grad_norm": 3.0873453617095947, + "learning_rate": 1.981161961737155e-05, + "loss": 0.1052, + "num_input_tokens_seen": 133271520, + "step": 109525 + }, + { + "epoch": 12.198463080521217, + "grad_norm": 1.8292381763458252, + "learning_rate": 1.980924280543681e-05, + "loss": 0.0933, + "num_input_tokens_seen": 133278112, + "step": 109530 + }, + { + "epoch": 12.199019935404834, + "grad_norm": 0.6615291237831116, + "learning_rate": 1.9806866042532938e-05, + "loss": 0.0194, + "num_input_tokens_seen": 133284256, + "step": 109535 + }, + { + "epoch": 12.19957679028845, + "grad_norm": 0.20838823914527893, + "learning_rate": 1.980448932868237e-05, + "loss": 0.0054, + "num_input_tokens_seen": 133290336, + "step": 109540 + }, + { + "epoch": 12.200133645172068, + "grad_norm": 0.044814012944698334, + "learning_rate": 1.980211266390757e-05, + "loss": 0.0464, + "num_input_tokens_seen": 133296288, + "step": 109545 + }, + { + "epoch": 12.200690500055686, + "grad_norm": 0.3462511897087097, + "learning_rate": 1.979973604823097e-05, + "loss": 0.0645, + "num_input_tokens_seen": 133302240, + "step": 109550 + }, + { + "epoch": 12.201247354939303, + "grad_norm": 0.06496447324752808, + "learning_rate": 1.979735948167504e-05, + "loss": 0.0056, + "num_input_tokens_seen": 133308288, + "step": 109555 + }, + { + "epoch": 12.201804209822921, + "grad_norm": 0.34907984733581543, + "learning_rate": 1.9794982964262202e-05, + "loss": 0.0085, + "num_input_tokens_seen": 133314368, + "step": 109560 + }, + { + "epoch": 12.202361064706537, + "grad_norm": 0.5016957521438599, + "learning_rate": 1.9792606496014936e-05, + "loss": 0.1027, + "num_input_tokens_seen": 133319808, + "step": 109565 + }, + { + "epoch": 12.202917919590154, + "grad_norm": 0.037918757647275925, + "learning_rate": 1.979023007695566e-05, + "loss": 0.0114, + "num_input_tokens_seen": 133326080, + "step": 109570 + }, + { + "epoch": 12.203474774473772, + "grad_norm": 0.058884937316179276, + "learning_rate": 1.9787853707106854e-05, + "loss": 0.126, + "num_input_tokens_seen": 133332160, + "step": 109575 + }, + { + "epoch": 12.20403162935739, + "grad_norm": 0.06061116233468056, + "learning_rate": 1.9785477386490928e-05, + "loss": 0.0579, + "num_input_tokens_seen": 133338368, + "step": 109580 + }, + { + "epoch": 12.204588484241008, + "grad_norm": 1.936913013458252, + "learning_rate": 1.9783101115130354e-05, + "loss": 0.0384, + "num_input_tokens_seen": 133344576, + "step": 109585 + }, + { + "epoch": 12.205145339124623, + "grad_norm": 1.7684508562088013, + "learning_rate": 1.9780724893047566e-05, + "loss": 0.1169, + "num_input_tokens_seen": 133350496, + "step": 109590 + }, + { + "epoch": 12.205702194008241, + "grad_norm": 1.3449127674102783, + "learning_rate": 1.977834872026502e-05, + "loss": 0.159, + "num_input_tokens_seen": 133356704, + "step": 109595 + }, + { + "epoch": 12.206259048891859, + "grad_norm": 0.00018519347941037267, + "learning_rate": 1.9775972596805146e-05, + "loss": 0.0564, + "num_input_tokens_seen": 133363008, + "step": 109600 + }, + { + "epoch": 12.206815903775476, + "grad_norm": 0.13343094289302826, + "learning_rate": 1.97735965226904e-05, + "loss": 0.0052, + "num_input_tokens_seen": 133369440, + "step": 109605 + }, + { + "epoch": 12.207372758659094, + "grad_norm": 0.0018841115524992347, + "learning_rate": 1.9771220497943222e-05, + "loss": 0.0049, + "num_input_tokens_seen": 133376000, + "step": 109610 + }, + { + "epoch": 12.20792961354271, + "grad_norm": 0.006041951943188906, + "learning_rate": 1.9768844522586057e-05, + "loss": 0.0575, + "num_input_tokens_seen": 133382144, + "step": 109615 + }, + { + "epoch": 12.208486468426328, + "grad_norm": 0.1763485074043274, + "learning_rate": 1.9766468596641345e-05, + "loss": 0.0404, + "num_input_tokens_seen": 133388192, + "step": 109620 + }, + { + "epoch": 12.209043323309945, + "grad_norm": 0.00010224544530501589, + "learning_rate": 1.9764092720131532e-05, + "loss": 0.0141, + "num_input_tokens_seen": 133394496, + "step": 109625 + }, + { + "epoch": 12.209600178193563, + "grad_norm": 0.5693133473396301, + "learning_rate": 1.9761716893079045e-05, + "loss": 0.0499, + "num_input_tokens_seen": 133400448, + "step": 109630 + }, + { + "epoch": 12.21015703307718, + "grad_norm": 0.07693220674991608, + "learning_rate": 1.975934111550636e-05, + "loss": 0.0289, + "num_input_tokens_seen": 133406688, + "step": 109635 + }, + { + "epoch": 12.210713887960797, + "grad_norm": 0.19605915248394012, + "learning_rate": 1.975696538743588e-05, + "loss": 0.0308, + "num_input_tokens_seen": 133412864, + "step": 109640 + }, + { + "epoch": 12.211270742844414, + "grad_norm": 0.03852468356490135, + "learning_rate": 1.975458970889007e-05, + "loss": 0.0794, + "num_input_tokens_seen": 133418688, + "step": 109645 + }, + { + "epoch": 12.211827597728032, + "grad_norm": 0.14324897527694702, + "learning_rate": 1.9752214079891364e-05, + "loss": 0.0065, + "num_input_tokens_seen": 133424896, + "step": 109650 + }, + { + "epoch": 12.21238445261165, + "grad_norm": 0.00013511459110304713, + "learning_rate": 1.97498385004622e-05, + "loss": 0.0578, + "num_input_tokens_seen": 133430816, + "step": 109655 + }, + { + "epoch": 12.212941307495267, + "grad_norm": 0.3776331841945648, + "learning_rate": 1.9747462970625015e-05, + "loss": 0.0104, + "num_input_tokens_seen": 133436768, + "step": 109660 + }, + { + "epoch": 12.213498162378883, + "grad_norm": 0.15114299952983856, + "learning_rate": 1.9745087490402254e-05, + "loss": 0.0532, + "num_input_tokens_seen": 133443072, + "step": 109665 + }, + { + "epoch": 12.2140550172625, + "grad_norm": 0.11393499374389648, + "learning_rate": 1.9742712059816348e-05, + "loss": 0.0181, + "num_input_tokens_seen": 133449344, + "step": 109670 + }, + { + "epoch": 12.214611872146119, + "grad_norm": 0.008832762017846107, + "learning_rate": 1.974033667888974e-05, + "loss": 0.0144, + "num_input_tokens_seen": 133455328, + "step": 109675 + }, + { + "epoch": 12.215168727029736, + "grad_norm": 0.18542398512363434, + "learning_rate": 1.973796134764487e-05, + "loss": 0.0267, + "num_input_tokens_seen": 133461376, + "step": 109680 + }, + { + "epoch": 12.215725581913354, + "grad_norm": 0.2372499257326126, + "learning_rate": 1.973558606610417e-05, + "loss": 0.0539, + "num_input_tokens_seen": 133466848, + "step": 109685 + }, + { + "epoch": 12.216282436796972, + "grad_norm": 0.522799015045166, + "learning_rate": 1.9733210834290065e-05, + "loss": 0.0618, + "num_input_tokens_seen": 133472896, + "step": 109690 + }, + { + "epoch": 12.216839291680587, + "grad_norm": 0.0006816715467721224, + "learning_rate": 1.9730835652225022e-05, + "loss": 0.0044, + "num_input_tokens_seen": 133478848, + "step": 109695 + }, + { + "epoch": 12.217396146564205, + "grad_norm": 0.8822192549705505, + "learning_rate": 1.9728460519931442e-05, + "loss": 0.0709, + "num_input_tokens_seen": 133485056, + "step": 109700 + }, + { + "epoch": 12.217953001447823, + "grad_norm": 0.09289523959159851, + "learning_rate": 1.972608543743179e-05, + "loss": 0.0132, + "num_input_tokens_seen": 133490752, + "step": 109705 + }, + { + "epoch": 12.21850985633144, + "grad_norm": 0.2343292385339737, + "learning_rate": 1.972371040474847e-05, + "loss": 0.0973, + "num_input_tokens_seen": 133496704, + "step": 109710 + }, + { + "epoch": 12.219066711215058, + "grad_norm": 0.005258140154182911, + "learning_rate": 1.9721335421903946e-05, + "loss": 0.0215, + "num_input_tokens_seen": 133502720, + "step": 109715 + }, + { + "epoch": 12.219623566098674, + "grad_norm": 0.0839175432920456, + "learning_rate": 1.9718960488920634e-05, + "loss": 0.0142, + "num_input_tokens_seen": 133508672, + "step": 109720 + }, + { + "epoch": 12.220180420982292, + "grad_norm": 0.7582632899284363, + "learning_rate": 1.971658560582097e-05, + "loss": 0.0604, + "num_input_tokens_seen": 133514624, + "step": 109725 + }, + { + "epoch": 12.22073727586591, + "grad_norm": 0.9608827829360962, + "learning_rate": 1.971421077262739e-05, + "loss": 0.0873, + "num_input_tokens_seen": 133520384, + "step": 109730 + }, + { + "epoch": 12.221294130749527, + "grad_norm": 0.00017577351536601782, + "learning_rate": 1.9711835989362325e-05, + "loss": 0.1057, + "num_input_tokens_seen": 133526112, + "step": 109735 + }, + { + "epoch": 12.221850985633145, + "grad_norm": 0.22785262763500214, + "learning_rate": 1.9709461256048202e-05, + "loss": 0.0255, + "num_input_tokens_seen": 133532192, + "step": 109740 + }, + { + "epoch": 12.22240784051676, + "grad_norm": 0.008215518668293953, + "learning_rate": 1.970708657270746e-05, + "loss": 0.0203, + "num_input_tokens_seen": 133538144, + "step": 109745 + }, + { + "epoch": 12.222964695400378, + "grad_norm": 0.004184225108474493, + "learning_rate": 1.970471193936252e-05, + "loss": 0.0095, + "num_input_tokens_seen": 133544288, + "step": 109750 + }, + { + "epoch": 12.223521550283996, + "grad_norm": 0.028240613639354706, + "learning_rate": 1.9702337356035826e-05, + "loss": 0.013, + "num_input_tokens_seen": 133550656, + "step": 109755 + }, + { + "epoch": 12.224078405167614, + "grad_norm": 0.032233867794275284, + "learning_rate": 1.969996282274979e-05, + "loss": 0.0341, + "num_input_tokens_seen": 133556896, + "step": 109760 + }, + { + "epoch": 12.224635260051231, + "grad_norm": 0.00350326020270586, + "learning_rate": 1.9697588339526868e-05, + "loss": 0.0684, + "num_input_tokens_seen": 133563232, + "step": 109765 + }, + { + "epoch": 12.225192114934847, + "grad_norm": 0.024709578603506088, + "learning_rate": 1.9695213906389455e-05, + "loss": 0.0164, + "num_input_tokens_seen": 133569504, + "step": 109770 + }, + { + "epoch": 12.225748969818465, + "grad_norm": 0.4888738691806793, + "learning_rate": 1.9692839523360007e-05, + "loss": 0.0188, + "num_input_tokens_seen": 133575584, + "step": 109775 + }, + { + "epoch": 12.226305824702083, + "grad_norm": 1.775600790977478, + "learning_rate": 1.9690465190460937e-05, + "loss": 0.0468, + "num_input_tokens_seen": 133581984, + "step": 109780 + }, + { + "epoch": 12.2268626795857, + "grad_norm": 0.23996591567993164, + "learning_rate": 1.968809090771468e-05, + "loss": 0.0629, + "num_input_tokens_seen": 133588480, + "step": 109785 + }, + { + "epoch": 12.227419534469318, + "grad_norm": 0.0022671197075396776, + "learning_rate": 1.9685716675143658e-05, + "loss": 0.0022, + "num_input_tokens_seen": 133594848, + "step": 109790 + }, + { + "epoch": 12.227976389352934, + "grad_norm": 1.309499740600586, + "learning_rate": 1.9683342492770304e-05, + "loss": 0.0913, + "num_input_tokens_seen": 133601120, + "step": 109795 + }, + { + "epoch": 12.228533244236552, + "grad_norm": 0.01125167403370142, + "learning_rate": 1.9680968360617036e-05, + "loss": 0.0064, + "num_input_tokens_seen": 133607488, + "step": 109800 + }, + { + "epoch": 12.22909009912017, + "grad_norm": 1.5941320657730103, + "learning_rate": 1.9678594278706286e-05, + "loss": 0.0224, + "num_input_tokens_seen": 133613760, + "step": 109805 + }, + { + "epoch": 12.229646954003787, + "grad_norm": 0.02022760920226574, + "learning_rate": 1.9676220247060474e-05, + "loss": 0.0054, + "num_input_tokens_seen": 133619808, + "step": 109810 + }, + { + "epoch": 12.230203808887405, + "grad_norm": 0.04948895797133446, + "learning_rate": 1.967384626570203e-05, + "loss": 0.0554, + "num_input_tokens_seen": 133625920, + "step": 109815 + }, + { + "epoch": 12.23076066377102, + "grad_norm": 0.003106299787759781, + "learning_rate": 1.9671472334653363e-05, + "loss": 0.004, + "num_input_tokens_seen": 133631936, + "step": 109820 + }, + { + "epoch": 12.231317518654638, + "grad_norm": 0.00012907855852972716, + "learning_rate": 1.966909845393693e-05, + "loss": 0.0139, + "num_input_tokens_seen": 133638336, + "step": 109825 + }, + { + "epoch": 12.231874373538256, + "grad_norm": 0.5311320424079895, + "learning_rate": 1.966672462357511e-05, + "loss": 0.0192, + "num_input_tokens_seen": 133644096, + "step": 109830 + }, + { + "epoch": 12.232431228421873, + "grad_norm": 0.17565201222896576, + "learning_rate": 1.966435084359037e-05, + "loss": 0.0827, + "num_input_tokens_seen": 133649952, + "step": 109835 + }, + { + "epoch": 12.232988083305491, + "grad_norm": 0.5753812193870544, + "learning_rate": 1.9661977114005098e-05, + "loss": 0.0586, + "num_input_tokens_seen": 133655392, + "step": 109840 + }, + { + "epoch": 12.233544938189109, + "grad_norm": 0.0017149281920865178, + "learning_rate": 1.9659603434841733e-05, + "loss": 0.0054, + "num_input_tokens_seen": 133661696, + "step": 109845 + }, + { + "epoch": 12.234101793072725, + "grad_norm": 0.2227044403553009, + "learning_rate": 1.965722980612269e-05, + "loss": 0.0159, + "num_input_tokens_seen": 133667680, + "step": 109850 + }, + { + "epoch": 12.234658647956342, + "grad_norm": 0.012537883594632149, + "learning_rate": 1.96548562278704e-05, + "loss": 0.0311, + "num_input_tokens_seen": 133673600, + "step": 109855 + }, + { + "epoch": 12.23521550283996, + "grad_norm": 0.0023236663546413183, + "learning_rate": 1.9652482700107266e-05, + "loss": 0.0675, + "num_input_tokens_seen": 133679424, + "step": 109860 + }, + { + "epoch": 12.235772357723578, + "grad_norm": 1.7761081457138062, + "learning_rate": 1.9650109222855725e-05, + "loss": 0.0241, + "num_input_tokens_seen": 133685216, + "step": 109865 + }, + { + "epoch": 12.236329212607195, + "grad_norm": 1.0652942657470703, + "learning_rate": 1.9647735796138187e-05, + "loss": 0.0724, + "num_input_tokens_seen": 133691680, + "step": 109870 + }, + { + "epoch": 12.236886067490811, + "grad_norm": 0.0021379999816417694, + "learning_rate": 1.9645362419977068e-05, + "loss": 0.0628, + "num_input_tokens_seen": 133697152, + "step": 109875 + }, + { + "epoch": 12.237442922374429, + "grad_norm": 1.7366794347763062, + "learning_rate": 1.9642989094394796e-05, + "loss": 0.11, + "num_input_tokens_seen": 133703584, + "step": 109880 + }, + { + "epoch": 12.237999777258047, + "grad_norm": 0.021198850125074387, + "learning_rate": 1.964061581941378e-05, + "loss": 0.0359, + "num_input_tokens_seen": 133709120, + "step": 109885 + }, + { + "epoch": 12.238556632141664, + "grad_norm": 2.3795688152313232, + "learning_rate": 1.9638242595056444e-05, + "loss": 0.1057, + "num_input_tokens_seen": 133715360, + "step": 109890 + }, + { + "epoch": 12.239113487025282, + "grad_norm": 0.19636507332324982, + "learning_rate": 1.9635869421345198e-05, + "loss": 0.0159, + "num_input_tokens_seen": 133721472, + "step": 109895 + }, + { + "epoch": 12.239670341908898, + "grad_norm": 0.058645639568567276, + "learning_rate": 1.963349629830247e-05, + "loss": 0.0059, + "num_input_tokens_seen": 133727680, + "step": 109900 + }, + { + "epoch": 12.240227196792516, + "grad_norm": 0.0008902890258468688, + "learning_rate": 1.963112322595066e-05, + "loss": 0.1591, + "num_input_tokens_seen": 133734208, + "step": 109905 + }, + { + "epoch": 12.240784051676133, + "grad_norm": 0.0002006522408919409, + "learning_rate": 1.9628750204312205e-05, + "loss": 0.1123, + "num_input_tokens_seen": 133740384, + "step": 109910 + }, + { + "epoch": 12.241340906559751, + "grad_norm": 0.3306065797805786, + "learning_rate": 1.9626377233409493e-05, + "loss": 0.0085, + "num_input_tokens_seen": 133745600, + "step": 109915 + }, + { + "epoch": 12.241897761443369, + "grad_norm": 0.06411554664373398, + "learning_rate": 1.9624004313264962e-05, + "loss": 0.0111, + "num_input_tokens_seen": 133751680, + "step": 109920 + }, + { + "epoch": 12.242454616326985, + "grad_norm": 0.27328988909721375, + "learning_rate": 1.9621631443901016e-05, + "loss": 0.0364, + "num_input_tokens_seen": 133757824, + "step": 109925 + }, + { + "epoch": 12.243011471210602, + "grad_norm": 0.7499896883964539, + "learning_rate": 1.961925862534007e-05, + "loss": 0.0146, + "num_input_tokens_seen": 133764160, + "step": 109930 + }, + { + "epoch": 12.24356832609422, + "grad_norm": 0.5726506114006042, + "learning_rate": 1.9616885857604536e-05, + "loss": 0.0265, + "num_input_tokens_seen": 133770464, + "step": 109935 + }, + { + "epoch": 12.244125180977838, + "grad_norm": 1.264033555984497, + "learning_rate": 1.9614513140716834e-05, + "loss": 0.049, + "num_input_tokens_seen": 133776064, + "step": 109940 + }, + { + "epoch": 12.244682035861455, + "grad_norm": 0.63578861951828, + "learning_rate": 1.9612140474699362e-05, + "loss": 0.0275, + "num_input_tokens_seen": 133781792, + "step": 109945 + }, + { + "epoch": 12.245238890745071, + "grad_norm": 0.038123615086078644, + "learning_rate": 1.9609767859574547e-05, + "loss": 0.0085, + "num_input_tokens_seen": 133787808, + "step": 109950 + }, + { + "epoch": 12.245795745628689, + "grad_norm": 0.09474780410528183, + "learning_rate": 1.9607395295364788e-05, + "loss": 0.0093, + "num_input_tokens_seen": 133793984, + "step": 109955 + }, + { + "epoch": 12.246352600512306, + "grad_norm": 0.27937400341033936, + "learning_rate": 1.9605022782092506e-05, + "loss": 0.0464, + "num_input_tokens_seen": 133799264, + "step": 109960 + }, + { + "epoch": 12.246909455395924, + "grad_norm": 0.011654437519609928, + "learning_rate": 1.9602650319780096e-05, + "loss": 0.0353, + "num_input_tokens_seen": 133805728, + "step": 109965 + }, + { + "epoch": 12.247466310279542, + "grad_norm": 1.6054540872573853, + "learning_rate": 1.960027790844999e-05, + "loss": 0.1096, + "num_input_tokens_seen": 133811904, + "step": 109970 + }, + { + "epoch": 12.248023165163158, + "grad_norm": 0.0001468819536967203, + "learning_rate": 1.9597905548124573e-05, + "loss": 0.1536, + "num_input_tokens_seen": 133817952, + "step": 109975 + }, + { + "epoch": 12.248580020046775, + "grad_norm": 0.0003415833634790033, + "learning_rate": 1.9595533238826282e-05, + "loss": 0.1372, + "num_input_tokens_seen": 133823904, + "step": 109980 + }, + { + "epoch": 12.249136874930393, + "grad_norm": 1.0227214097976685, + "learning_rate": 1.9593160980577495e-05, + "loss": 0.051, + "num_input_tokens_seen": 133830048, + "step": 109985 + }, + { + "epoch": 12.24969372981401, + "grad_norm": 0.6931411623954773, + "learning_rate": 1.9590788773400644e-05, + "loss": 0.095, + "num_input_tokens_seen": 133836128, + "step": 109990 + }, + { + "epoch": 12.250250584697628, + "grad_norm": 0.0037471812684088945, + "learning_rate": 1.9588416617318118e-05, + "loss": 0.0059, + "num_input_tokens_seen": 133842848, + "step": 109995 + }, + { + "epoch": 12.250807439581244, + "grad_norm": 0.16502748429775238, + "learning_rate": 1.9586044512352343e-05, + "loss": 0.0061, + "num_input_tokens_seen": 133848736, + "step": 110000 + }, + { + "epoch": 12.251364294464862, + "grad_norm": 1.4227885007858276, + "learning_rate": 1.9583672458525708e-05, + "loss": 0.016, + "num_input_tokens_seen": 133855040, + "step": 110005 + }, + { + "epoch": 12.25192114934848, + "grad_norm": 0.6575284004211426, + "learning_rate": 1.958130045586063e-05, + "loss": 0.0434, + "num_input_tokens_seen": 133860832, + "step": 110010 + }, + { + "epoch": 12.252478004232097, + "grad_norm": 0.4492924213409424, + "learning_rate": 1.9578928504379507e-05, + "loss": 0.0724, + "num_input_tokens_seen": 133866880, + "step": 110015 + }, + { + "epoch": 12.253034859115715, + "grad_norm": 0.014019796624779701, + "learning_rate": 1.957655660410475e-05, + "loss": 0.0071, + "num_input_tokens_seen": 133873184, + "step": 110020 + }, + { + "epoch": 12.253591713999331, + "grad_norm": 0.10427834838628769, + "learning_rate": 1.9574184755058758e-05, + "loss": 0.0092, + "num_input_tokens_seen": 133879104, + "step": 110025 + }, + { + "epoch": 12.254148568882949, + "grad_norm": 0.0447910837829113, + "learning_rate": 1.9571812957263942e-05, + "loss": 0.0266, + "num_input_tokens_seen": 133884832, + "step": 110030 + }, + { + "epoch": 12.254705423766566, + "grad_norm": 1.6462589502334595, + "learning_rate": 1.9569441210742697e-05, + "loss": 0.2069, + "num_input_tokens_seen": 133891264, + "step": 110035 + }, + { + "epoch": 12.255262278650184, + "grad_norm": 0.055030860006809235, + "learning_rate": 1.9567069515517438e-05, + "loss": 0.0027, + "num_input_tokens_seen": 133897664, + "step": 110040 + }, + { + "epoch": 12.255819133533802, + "grad_norm": 0.1702926754951477, + "learning_rate": 1.9564697871610548e-05, + "loss": 0.0301, + "num_input_tokens_seen": 133903488, + "step": 110045 + }, + { + "epoch": 12.25637598841742, + "grad_norm": 0.0008394927135668695, + "learning_rate": 1.9562326279044456e-05, + "loss": 0.0442, + "num_input_tokens_seen": 133909664, + "step": 110050 + }, + { + "epoch": 12.256932843301035, + "grad_norm": 0.03929731622338295, + "learning_rate": 1.9559954737841537e-05, + "loss": 0.0198, + "num_input_tokens_seen": 133915968, + "step": 110055 + }, + { + "epoch": 12.257489698184653, + "grad_norm": 0.5119855403900146, + "learning_rate": 1.9557583248024214e-05, + "loss": 0.0235, + "num_input_tokens_seen": 133921952, + "step": 110060 + }, + { + "epoch": 12.25804655306827, + "grad_norm": 0.0007224510773085058, + "learning_rate": 1.955521180961487e-05, + "loss": 0.0204, + "num_input_tokens_seen": 133928352, + "step": 110065 + }, + { + "epoch": 12.258603407951888, + "grad_norm": 1.9098201990127563, + "learning_rate": 1.9552840422635918e-05, + "loss": 0.0561, + "num_input_tokens_seen": 133934624, + "step": 110070 + }, + { + "epoch": 12.259160262835506, + "grad_norm": 0.9367327690124512, + "learning_rate": 1.9550469087109746e-05, + "loss": 0.0201, + "num_input_tokens_seen": 133940832, + "step": 110075 + }, + { + "epoch": 12.259717117719122, + "grad_norm": 1.8003830909729004, + "learning_rate": 1.954809780305877e-05, + "loss": 0.0909, + "num_input_tokens_seen": 133947104, + "step": 110080 + }, + { + "epoch": 12.26027397260274, + "grad_norm": 0.015903692692518234, + "learning_rate": 1.954572657050537e-05, + "loss": 0.096, + "num_input_tokens_seen": 133952672, + "step": 110085 + }, + { + "epoch": 12.260830827486357, + "grad_norm": 0.08570346981287003, + "learning_rate": 1.9543355389471953e-05, + "loss": 0.0589, + "num_input_tokens_seen": 133959072, + "step": 110090 + }, + { + "epoch": 12.261387682369975, + "grad_norm": 0.4111548960208893, + "learning_rate": 1.954098425998091e-05, + "loss": 0.043, + "num_input_tokens_seen": 133965440, + "step": 110095 + }, + { + "epoch": 12.261944537253592, + "grad_norm": 0.7946839928627014, + "learning_rate": 1.953861318205466e-05, + "loss": 0.1493, + "num_input_tokens_seen": 133971488, + "step": 110100 + }, + { + "epoch": 12.262501392137208, + "grad_norm": 0.0007568236324004829, + "learning_rate": 1.953624215571557e-05, + "loss": 0.002, + "num_input_tokens_seen": 133977568, + "step": 110105 + }, + { + "epoch": 12.263058247020826, + "grad_norm": 0.10455276072025299, + "learning_rate": 1.9533871180986062e-05, + "loss": 0.0252, + "num_input_tokens_seen": 133983360, + "step": 110110 + }, + { + "epoch": 12.263615101904444, + "grad_norm": 1.8837140798568726, + "learning_rate": 1.9531500257888506e-05, + "loss": 0.098, + "num_input_tokens_seen": 133989568, + "step": 110115 + }, + { + "epoch": 12.264171956788061, + "grad_norm": 0.01194893941283226, + "learning_rate": 1.9529129386445323e-05, + "loss": 0.018, + "num_input_tokens_seen": 133995584, + "step": 110120 + }, + { + "epoch": 12.264728811671679, + "grad_norm": 0.009654260240495205, + "learning_rate": 1.952675856667889e-05, + "loss": 0.0183, + "num_input_tokens_seen": 134001792, + "step": 110125 + }, + { + "epoch": 12.265285666555295, + "grad_norm": 0.00011831130541395396, + "learning_rate": 1.9524387798611614e-05, + "loss": 0.0332, + "num_input_tokens_seen": 134007936, + "step": 110130 + }, + { + "epoch": 12.265842521438913, + "grad_norm": 0.008083266206085682, + "learning_rate": 1.9522017082265876e-05, + "loss": 0.0009, + "num_input_tokens_seen": 134014432, + "step": 110135 + }, + { + "epoch": 12.26639937632253, + "grad_norm": 0.08623266965150833, + "learning_rate": 1.951964641766408e-05, + "loss": 0.0346, + "num_input_tokens_seen": 134020704, + "step": 110140 + }, + { + "epoch": 12.266956231206148, + "grad_norm": 0.0001649165787966922, + "learning_rate": 1.951727580482861e-05, + "loss": 0.1068, + "num_input_tokens_seen": 134026656, + "step": 110145 + }, + { + "epoch": 12.267513086089766, + "grad_norm": 0.00029879872454330325, + "learning_rate": 1.9514905243781868e-05, + "loss": 0.028, + "num_input_tokens_seen": 134032384, + "step": 110150 + }, + { + "epoch": 12.268069940973382, + "grad_norm": 1.1508814096450806, + "learning_rate": 1.9512534734546233e-05, + "loss": 0.0287, + "num_input_tokens_seen": 134038464, + "step": 110155 + }, + { + "epoch": 12.268626795857, + "grad_norm": 0.016125250607728958, + "learning_rate": 1.9510164277144115e-05, + "loss": 0.0065, + "num_input_tokens_seen": 134044704, + "step": 110160 + }, + { + "epoch": 12.269183650740617, + "grad_norm": 0.002884132554754615, + "learning_rate": 1.950779387159788e-05, + "loss": 0.0194, + "num_input_tokens_seen": 134050912, + "step": 110165 + }, + { + "epoch": 12.269740505624235, + "grad_norm": 0.18629418313503265, + "learning_rate": 1.9505423517929948e-05, + "loss": 0.0245, + "num_input_tokens_seen": 134056832, + "step": 110170 + }, + { + "epoch": 12.270297360507852, + "grad_norm": 0.10462512075901031, + "learning_rate": 1.9503053216162677e-05, + "loss": 0.0018, + "num_input_tokens_seen": 134063232, + "step": 110175 + }, + { + "epoch": 12.270854215391468, + "grad_norm": 0.31690484285354614, + "learning_rate": 1.9500682966318478e-05, + "loss": 0.0174, + "num_input_tokens_seen": 134068512, + "step": 110180 + }, + { + "epoch": 12.271411070275086, + "grad_norm": 0.0002474222856108099, + "learning_rate": 1.9498312768419737e-05, + "loss": 0.0635, + "num_input_tokens_seen": 134073824, + "step": 110185 + }, + { + "epoch": 12.271967925158703, + "grad_norm": 0.07045938074588776, + "learning_rate": 1.9495942622488842e-05, + "loss": 0.0204, + "num_input_tokens_seen": 134079168, + "step": 110190 + }, + { + "epoch": 12.272524780042321, + "grad_norm": 1.5265913009643555, + "learning_rate": 1.9493572528548172e-05, + "loss": 0.0579, + "num_input_tokens_seen": 134085376, + "step": 110195 + }, + { + "epoch": 12.273081634925939, + "grad_norm": 0.004624741151928902, + "learning_rate": 1.9491202486620126e-05, + "loss": 0.0767, + "num_input_tokens_seen": 134091392, + "step": 110200 + }, + { + "epoch": 12.273638489809557, + "grad_norm": 0.052116405218839645, + "learning_rate": 1.9488832496727083e-05, + "loss": 0.007, + "num_input_tokens_seen": 134097504, + "step": 110205 + }, + { + "epoch": 12.274195344693172, + "grad_norm": 0.03991202265024185, + "learning_rate": 1.9486462558891437e-05, + "loss": 0.0335, + "num_input_tokens_seen": 134103744, + "step": 110210 + }, + { + "epoch": 12.27475219957679, + "grad_norm": 0.23098266124725342, + "learning_rate": 1.948409267313557e-05, + "loss": 0.0168, + "num_input_tokens_seen": 134109504, + "step": 110215 + }, + { + "epoch": 12.275309054460408, + "grad_norm": 0.19123578071594238, + "learning_rate": 1.9481722839481866e-05, + "loss": 0.0083, + "num_input_tokens_seen": 134115456, + "step": 110220 + }, + { + "epoch": 12.275865909344025, + "grad_norm": 1.4850844144821167, + "learning_rate": 1.94793530579527e-05, + "loss": 0.0675, + "num_input_tokens_seen": 134121472, + "step": 110225 + }, + { + "epoch": 12.276422764227643, + "grad_norm": 0.008837663568556309, + "learning_rate": 1.9476983328570484e-05, + "loss": 0.0084, + "num_input_tokens_seen": 134127712, + "step": 110230 + }, + { + "epoch": 12.276979619111259, + "grad_norm": 0.00032710921368561685, + "learning_rate": 1.947461365135757e-05, + "loss": 0.0045, + "num_input_tokens_seen": 134133792, + "step": 110235 + }, + { + "epoch": 12.277536473994877, + "grad_norm": 0.0001628310274099931, + "learning_rate": 1.947224402633637e-05, + "loss": 0.0014, + "num_input_tokens_seen": 134139808, + "step": 110240 + }, + { + "epoch": 12.278093328878494, + "grad_norm": 0.05043395981192589, + "learning_rate": 1.9469874453529243e-05, + "loss": 0.0431, + "num_input_tokens_seen": 134145920, + "step": 110245 + }, + { + "epoch": 12.278650183762112, + "grad_norm": 7.61191695346497e-05, + "learning_rate": 1.9467504932958592e-05, + "loss": 0.0636, + "num_input_tokens_seen": 134152032, + "step": 110250 + }, + { + "epoch": 12.27920703864573, + "grad_norm": 0.07095945626497269, + "learning_rate": 1.9465135464646784e-05, + "loss": 0.0049, + "num_input_tokens_seen": 134158208, + "step": 110255 + }, + { + "epoch": 12.279763893529346, + "grad_norm": 3.8339333534240723, + "learning_rate": 1.946276604861621e-05, + "loss": 0.0406, + "num_input_tokens_seen": 134164576, + "step": 110260 + }, + { + "epoch": 12.280320748412963, + "grad_norm": 1.5773141384124756, + "learning_rate": 1.946039668488924e-05, + "loss": 0.0246, + "num_input_tokens_seen": 134170208, + "step": 110265 + }, + { + "epoch": 12.280877603296581, + "grad_norm": 0.5339549779891968, + "learning_rate": 1.9458027373488268e-05, + "loss": 0.0103, + "num_input_tokens_seen": 134176224, + "step": 110270 + }, + { + "epoch": 12.281434458180199, + "grad_norm": 1.4533199071884155, + "learning_rate": 1.9455658114435665e-05, + "loss": 0.0686, + "num_input_tokens_seen": 134181856, + "step": 110275 + }, + { + "epoch": 12.281991313063816, + "grad_norm": 0.7745699286460876, + "learning_rate": 1.9453288907753818e-05, + "loss": 0.0407, + "num_input_tokens_seen": 134187296, + "step": 110280 + }, + { + "epoch": 12.282548167947432, + "grad_norm": 0.001157982274889946, + "learning_rate": 1.94509197534651e-05, + "loss": 0.0193, + "num_input_tokens_seen": 134193536, + "step": 110285 + }, + { + "epoch": 12.28310502283105, + "grad_norm": 0.0001793406845536083, + "learning_rate": 1.9448550651591884e-05, + "loss": 0.0136, + "num_input_tokens_seen": 134199776, + "step": 110290 + }, + { + "epoch": 12.283661877714668, + "grad_norm": 0.008704042062163353, + "learning_rate": 1.9446181602156564e-05, + "loss": 0.0082, + "num_input_tokens_seen": 134205888, + "step": 110295 + }, + { + "epoch": 12.284218732598285, + "grad_norm": 0.9381323456764221, + "learning_rate": 1.9443812605181498e-05, + "loss": 0.0525, + "num_input_tokens_seen": 134211648, + "step": 110300 + }, + { + "epoch": 12.284775587481903, + "grad_norm": 0.004118225071579218, + "learning_rate": 1.944144366068908e-05, + "loss": 0.1642, + "num_input_tokens_seen": 134217056, + "step": 110305 + }, + { + "epoch": 12.285332442365519, + "grad_norm": 0.07883631438016891, + "learning_rate": 1.9439074768701672e-05, + "loss": 0.0841, + "num_input_tokens_seen": 134222944, + "step": 110310 + }, + { + "epoch": 12.285889297249136, + "grad_norm": 1.2258793115615845, + "learning_rate": 1.9436705929241676e-05, + "loss": 0.0314, + "num_input_tokens_seen": 134228352, + "step": 110315 + }, + { + "epoch": 12.286446152132754, + "grad_norm": 0.5420292019844055, + "learning_rate": 1.943433714233143e-05, + "loss": 0.0109, + "num_input_tokens_seen": 134234624, + "step": 110320 + }, + { + "epoch": 12.287003007016372, + "grad_norm": 0.014590918086469173, + "learning_rate": 1.943196840799334e-05, + "loss": 0.0468, + "num_input_tokens_seen": 134240992, + "step": 110325 + }, + { + "epoch": 12.28755986189999, + "grad_norm": 0.011363441124558449, + "learning_rate": 1.9429599726249764e-05, + "loss": 0.0375, + "num_input_tokens_seen": 134246944, + "step": 110330 + }, + { + "epoch": 12.288116716783605, + "grad_norm": 1.1758389472961426, + "learning_rate": 1.942723109712309e-05, + "loss": 0.0426, + "num_input_tokens_seen": 134252416, + "step": 110335 + }, + { + "epoch": 12.288673571667223, + "grad_norm": 0.003605445846915245, + "learning_rate": 1.9424862520635673e-05, + "loss": 0.0208, + "num_input_tokens_seen": 134258816, + "step": 110340 + }, + { + "epoch": 12.28923042655084, + "grad_norm": 0.10175008326768875, + "learning_rate": 1.9422493996809904e-05, + "loss": 0.0022, + "num_input_tokens_seen": 134264960, + "step": 110345 + }, + { + "epoch": 12.289787281434458, + "grad_norm": 0.10816612839698792, + "learning_rate": 1.942012552566814e-05, + "loss": 0.0014, + "num_input_tokens_seen": 134271328, + "step": 110350 + }, + { + "epoch": 12.290344136318076, + "grad_norm": 2.3404061794281006, + "learning_rate": 1.941775710723277e-05, + "loss": 0.014, + "num_input_tokens_seen": 134277408, + "step": 110355 + }, + { + "epoch": 12.290900991201692, + "grad_norm": 0.00018758405349217355, + "learning_rate": 1.9415388741526148e-05, + "loss": 0.0063, + "num_input_tokens_seen": 134283552, + "step": 110360 + }, + { + "epoch": 12.29145784608531, + "grad_norm": 0.00047196424566209316, + "learning_rate": 1.941302042857066e-05, + "loss": 0.0125, + "num_input_tokens_seen": 134289888, + "step": 110365 + }, + { + "epoch": 12.292014700968927, + "grad_norm": 0.6092104315757751, + "learning_rate": 1.941065216838866e-05, + "loss": 0.0337, + "num_input_tokens_seen": 134295168, + "step": 110370 + }, + { + "epoch": 12.292571555852545, + "grad_norm": 0.00032028593705035746, + "learning_rate": 1.9408283961002547e-05, + "loss": 0.0055, + "num_input_tokens_seen": 134300640, + "step": 110375 + }, + { + "epoch": 12.293128410736163, + "grad_norm": 1.5400617122650146, + "learning_rate": 1.9405915806434656e-05, + "loss": 0.0307, + "num_input_tokens_seen": 134306624, + "step": 110380 + }, + { + "epoch": 12.293685265619779, + "grad_norm": 0.007537244353443384, + "learning_rate": 1.9403547704707384e-05, + "loss": 0.0776, + "num_input_tokens_seen": 134312928, + "step": 110385 + }, + { + "epoch": 12.294242120503396, + "grad_norm": 0.2600747048854828, + "learning_rate": 1.940117965584307e-05, + "loss": 0.0081, + "num_input_tokens_seen": 134318976, + "step": 110390 + }, + { + "epoch": 12.294798975387014, + "grad_norm": 0.006323600187897682, + "learning_rate": 1.9398811659864116e-05, + "loss": 0.0303, + "num_input_tokens_seen": 134325056, + "step": 110395 + }, + { + "epoch": 12.295355830270632, + "grad_norm": 0.0005375720793381333, + "learning_rate": 1.9396443716792867e-05, + "loss": 0.0028, + "num_input_tokens_seen": 134331200, + "step": 110400 + }, + { + "epoch": 12.29591268515425, + "grad_norm": 0.09908266365528107, + "learning_rate": 1.93940758266517e-05, + "loss": 0.0093, + "num_input_tokens_seen": 134337472, + "step": 110405 + }, + { + "epoch": 12.296469540037867, + "grad_norm": 0.05529188737273216, + "learning_rate": 1.9391707989462972e-05, + "loss": 0.0295, + "num_input_tokens_seen": 134343808, + "step": 110410 + }, + { + "epoch": 12.297026394921483, + "grad_norm": 1.0469576120376587, + "learning_rate": 1.9389340205249067e-05, + "loss": 0.0817, + "num_input_tokens_seen": 134350304, + "step": 110415 + }, + { + "epoch": 12.2975832498051, + "grad_norm": 0.0031449159141629934, + "learning_rate": 1.938697247403233e-05, + "loss": 0.0103, + "num_input_tokens_seen": 134356384, + "step": 110420 + }, + { + "epoch": 12.298140104688718, + "grad_norm": 1.2741444110870361, + "learning_rate": 1.9384604795835137e-05, + "loss": 0.059, + "num_input_tokens_seen": 134362368, + "step": 110425 + }, + { + "epoch": 12.298696959572336, + "grad_norm": 0.25768712162971497, + "learning_rate": 1.9382237170679846e-05, + "loss": 0.0836, + "num_input_tokens_seen": 134368416, + "step": 110430 + }, + { + "epoch": 12.299253814455954, + "grad_norm": 0.0005122500006109476, + "learning_rate": 1.9379869598588835e-05, + "loss": 0.0143, + "num_input_tokens_seen": 134374720, + "step": 110435 + }, + { + "epoch": 12.29981066933957, + "grad_norm": 0.009183816611766815, + "learning_rate": 1.9377502079584445e-05, + "loss": 0.0164, + "num_input_tokens_seen": 134380800, + "step": 110440 + }, + { + "epoch": 12.300367524223187, + "grad_norm": 0.13375777006149292, + "learning_rate": 1.937513461368907e-05, + "loss": 0.0451, + "num_input_tokens_seen": 134387136, + "step": 110445 + }, + { + "epoch": 12.300924379106805, + "grad_norm": 0.01135737169533968, + "learning_rate": 1.9372767200925036e-05, + "loss": 0.0248, + "num_input_tokens_seen": 134393248, + "step": 110450 + }, + { + "epoch": 12.301481233990422, + "grad_norm": 0.0004916533944196999, + "learning_rate": 1.937039984131474e-05, + "loss": 0.0146, + "num_input_tokens_seen": 134399872, + "step": 110455 + }, + { + "epoch": 12.30203808887404, + "grad_norm": 0.12912413477897644, + "learning_rate": 1.936803253488052e-05, + "loss": 0.0956, + "num_input_tokens_seen": 134405728, + "step": 110460 + }, + { + "epoch": 12.302594943757656, + "grad_norm": 0.46932610869407654, + "learning_rate": 1.9365665281644748e-05, + "loss": 0.0067, + "num_input_tokens_seen": 134411936, + "step": 110465 + }, + { + "epoch": 12.303151798641274, + "grad_norm": 0.07373227179050446, + "learning_rate": 1.936329808162978e-05, + "loss": 0.0034, + "num_input_tokens_seen": 134417920, + "step": 110470 + }, + { + "epoch": 12.303708653524891, + "grad_norm": 1.4965778589248657, + "learning_rate": 1.936093093485798e-05, + "loss": 0.08, + "num_input_tokens_seen": 134424096, + "step": 110475 + }, + { + "epoch": 12.304265508408509, + "grad_norm": 0.055998995900154114, + "learning_rate": 1.93585638413517e-05, + "loss": 0.0345, + "num_input_tokens_seen": 134430272, + "step": 110480 + }, + { + "epoch": 12.304822363292127, + "grad_norm": 0.006763918790966272, + "learning_rate": 1.9356196801133315e-05, + "loss": 0.0317, + "num_input_tokens_seen": 134436384, + "step": 110485 + }, + { + "epoch": 12.305379218175743, + "grad_norm": 1.736681342124939, + "learning_rate": 1.935382981422516e-05, + "loss": 0.0488, + "num_input_tokens_seen": 134442048, + "step": 110490 + }, + { + "epoch": 12.30593607305936, + "grad_norm": 0.036786891520023346, + "learning_rate": 1.9351462880649617e-05, + "loss": 0.0496, + "num_input_tokens_seen": 134448128, + "step": 110495 + }, + { + "epoch": 12.306492927942978, + "grad_norm": 0.8339391946792603, + "learning_rate": 1.9349096000429022e-05, + "loss": 0.1022, + "num_input_tokens_seen": 134454080, + "step": 110500 + }, + { + "epoch": 12.307049782826596, + "grad_norm": 1.7153679132461548, + "learning_rate": 1.9346729173585753e-05, + "loss": 0.0713, + "num_input_tokens_seen": 134460320, + "step": 110505 + }, + { + "epoch": 12.307606637710213, + "grad_norm": 0.8910980820655823, + "learning_rate": 1.9344362400142145e-05, + "loss": 0.0336, + "num_input_tokens_seen": 134466144, + "step": 110510 + }, + { + "epoch": 12.30816349259383, + "grad_norm": 0.5059346556663513, + "learning_rate": 1.9341995680120577e-05, + "loss": 0.0093, + "num_input_tokens_seen": 134472096, + "step": 110515 + }, + { + "epoch": 12.308720347477447, + "grad_norm": 0.13802048563957214, + "learning_rate": 1.9339629013543382e-05, + "loss": 0.0195, + "num_input_tokens_seen": 134478272, + "step": 110520 + }, + { + "epoch": 12.309277202361065, + "grad_norm": 0.35940903425216675, + "learning_rate": 1.9337262400432937e-05, + "loss": 0.053, + "num_input_tokens_seen": 134484416, + "step": 110525 + }, + { + "epoch": 12.309834057244682, + "grad_norm": 0.1459101438522339, + "learning_rate": 1.9334895840811578e-05, + "loss": 0.0071, + "num_input_tokens_seen": 134489824, + "step": 110530 + }, + { + "epoch": 12.3103909121283, + "grad_norm": 1.1868771314620972, + "learning_rate": 1.9332529334701672e-05, + "loss": 0.0674, + "num_input_tokens_seen": 134496032, + "step": 110535 + }, + { + "epoch": 12.310947767011916, + "grad_norm": 2.0808093547821045, + "learning_rate": 1.9330162882125562e-05, + "loss": 0.1226, + "num_input_tokens_seen": 134502112, + "step": 110540 + }, + { + "epoch": 12.311504621895534, + "grad_norm": 0.0002877523365896195, + "learning_rate": 1.932779648310561e-05, + "loss": 0.0979, + "num_input_tokens_seen": 134508352, + "step": 110545 + }, + { + "epoch": 12.312061476779151, + "grad_norm": 0.015657363459467888, + "learning_rate": 1.9325430137664164e-05, + "loss": 0.0797, + "num_input_tokens_seen": 134514912, + "step": 110550 + }, + { + "epoch": 12.312618331662769, + "grad_norm": 0.17172226309776306, + "learning_rate": 1.9323063845823577e-05, + "loss": 0.0238, + "num_input_tokens_seen": 134521216, + "step": 110555 + }, + { + "epoch": 12.313175186546387, + "grad_norm": 0.42994171380996704, + "learning_rate": 1.932069760760619e-05, + "loss": 0.0272, + "num_input_tokens_seen": 134527072, + "step": 110560 + }, + { + "epoch": 12.313732041430004, + "grad_norm": 0.0068677631206810474, + "learning_rate": 1.9318331423034382e-05, + "loss": 0.0991, + "num_input_tokens_seen": 134533152, + "step": 110565 + }, + { + "epoch": 12.31428889631362, + "grad_norm": 0.21322013437747955, + "learning_rate": 1.931596529213047e-05, + "loss": 0.0054, + "num_input_tokens_seen": 134539136, + "step": 110570 + }, + { + "epoch": 12.314845751197238, + "grad_norm": 1.5418510437011719, + "learning_rate": 1.9313599214916834e-05, + "loss": 0.0278, + "num_input_tokens_seen": 134545088, + "step": 110575 + }, + { + "epoch": 12.315402606080855, + "grad_norm": 0.025380730628967285, + "learning_rate": 1.9311233191415795e-05, + "loss": 0.1283, + "num_input_tokens_seen": 134551264, + "step": 110580 + }, + { + "epoch": 12.315959460964473, + "grad_norm": 0.39508453011512756, + "learning_rate": 1.9308867221649725e-05, + "loss": 0.0164, + "num_input_tokens_seen": 134557504, + "step": 110585 + }, + { + "epoch": 12.31651631584809, + "grad_norm": 0.013574333861470222, + "learning_rate": 1.930650130564096e-05, + "loss": 0.1479, + "num_input_tokens_seen": 134563648, + "step": 110590 + }, + { + "epoch": 12.317073170731707, + "grad_norm": 0.2380252629518509, + "learning_rate": 1.9304135443411857e-05, + "loss": 0.0184, + "num_input_tokens_seen": 134569504, + "step": 110595 + }, + { + "epoch": 12.317630025615324, + "grad_norm": 1.8943899869918823, + "learning_rate": 1.9301769634984755e-05, + "loss": 0.1359, + "num_input_tokens_seen": 134575328, + "step": 110600 + }, + { + "epoch": 12.318186880498942, + "grad_norm": 0.254975825548172, + "learning_rate": 1.9299403880382005e-05, + "loss": 0.007, + "num_input_tokens_seen": 134581504, + "step": 110605 + }, + { + "epoch": 12.31874373538256, + "grad_norm": 0.005744714755564928, + "learning_rate": 1.929703817962595e-05, + "loss": 0.0021, + "num_input_tokens_seen": 134587712, + "step": 110610 + }, + { + "epoch": 12.319300590266177, + "grad_norm": 0.00010396801371825859, + "learning_rate": 1.9294672532738943e-05, + "loss": 0.1339, + "num_input_tokens_seen": 134593280, + "step": 110615 + }, + { + "epoch": 12.319857445149793, + "grad_norm": 0.12346301227807999, + "learning_rate": 1.929230693974332e-05, + "loss": 0.0041, + "num_input_tokens_seen": 134599584, + "step": 110620 + }, + { + "epoch": 12.320414300033411, + "grad_norm": 0.00258361897431314, + "learning_rate": 1.9289941400661436e-05, + "loss": 0.0005, + "num_input_tokens_seen": 134605632, + "step": 110625 + }, + { + "epoch": 12.320971154917029, + "grad_norm": 0.001725927460938692, + "learning_rate": 1.9287575915515622e-05, + "loss": 0.0173, + "num_input_tokens_seen": 134611872, + "step": 110630 + }, + { + "epoch": 12.321528009800646, + "grad_norm": 0.023879682645201683, + "learning_rate": 1.9285210484328242e-05, + "loss": 0.0012, + "num_input_tokens_seen": 134618176, + "step": 110635 + }, + { + "epoch": 12.322084864684264, + "grad_norm": 0.0036154184490442276, + "learning_rate": 1.9282845107121615e-05, + "loss": 0.0119, + "num_input_tokens_seen": 134624096, + "step": 110640 + }, + { + "epoch": 12.32264171956788, + "grad_norm": 2.0471372604370117, + "learning_rate": 1.9280479783918105e-05, + "loss": 0.1375, + "num_input_tokens_seen": 134630016, + "step": 110645 + }, + { + "epoch": 12.323198574451498, + "grad_norm": 6.818724068580195e-05, + "learning_rate": 1.927811451474004e-05, + "loss": 0.1166, + "num_input_tokens_seen": 134636288, + "step": 110650 + }, + { + "epoch": 12.323755429335115, + "grad_norm": 0.27827176451683044, + "learning_rate": 1.9275749299609777e-05, + "loss": 0.0474, + "num_input_tokens_seen": 134642560, + "step": 110655 + }, + { + "epoch": 12.324312284218733, + "grad_norm": 0.000528416596353054, + "learning_rate": 1.9273384138549637e-05, + "loss": 0.0198, + "num_input_tokens_seen": 134648736, + "step": 110660 + }, + { + "epoch": 12.32486913910235, + "grad_norm": 0.010431689210236073, + "learning_rate": 1.9271019031581984e-05, + "loss": 0.0038, + "num_input_tokens_seen": 134655008, + "step": 110665 + }, + { + "epoch": 12.325425993985966, + "grad_norm": 0.20246578752994537, + "learning_rate": 1.9268653978729137e-05, + "loss": 0.0445, + "num_input_tokens_seen": 134661152, + "step": 110670 + }, + { + "epoch": 12.325982848869584, + "grad_norm": 0.03281499445438385, + "learning_rate": 1.926628898001345e-05, + "loss": 0.0097, + "num_input_tokens_seen": 134667264, + "step": 110675 + }, + { + "epoch": 12.326539703753202, + "grad_norm": 9.07794019440189e-05, + "learning_rate": 1.9263924035457252e-05, + "loss": 0.0745, + "num_input_tokens_seen": 134673568, + "step": 110680 + }, + { + "epoch": 12.32709655863682, + "grad_norm": 1.9654453992843628, + "learning_rate": 1.9261559145082893e-05, + "loss": 0.0539, + "num_input_tokens_seen": 134679744, + "step": 110685 + }, + { + "epoch": 12.327653413520437, + "grad_norm": 0.036401715129613876, + "learning_rate": 1.9259194308912696e-05, + "loss": 0.0122, + "num_input_tokens_seen": 134686016, + "step": 110690 + }, + { + "epoch": 12.328210268404053, + "grad_norm": 0.0022902237251400948, + "learning_rate": 1.9256829526969023e-05, + "loss": 0.0022, + "num_input_tokens_seen": 134691968, + "step": 110695 + }, + { + "epoch": 12.32876712328767, + "grad_norm": 1.3045755624771118, + "learning_rate": 1.9254464799274192e-05, + "loss": 0.0777, + "num_input_tokens_seen": 134697984, + "step": 110700 + }, + { + "epoch": 12.329323978171288, + "grad_norm": 0.4047689437866211, + "learning_rate": 1.9252100125850537e-05, + "loss": 0.0396, + "num_input_tokens_seen": 134703968, + "step": 110705 + }, + { + "epoch": 12.329880833054906, + "grad_norm": 0.016537902876734734, + "learning_rate": 1.9249735506720407e-05, + "loss": 0.031, + "num_input_tokens_seen": 134709984, + "step": 110710 + }, + { + "epoch": 12.330437687938524, + "grad_norm": 0.15198220312595367, + "learning_rate": 1.9247370941906122e-05, + "loss": 0.0432, + "num_input_tokens_seen": 134715776, + "step": 110715 + }, + { + "epoch": 12.33099454282214, + "grad_norm": 0.13385942578315735, + "learning_rate": 1.9245006431430048e-05, + "loss": 0.0208, + "num_input_tokens_seen": 134721536, + "step": 110720 + }, + { + "epoch": 12.331551397705757, + "grad_norm": 0.004376350436359644, + "learning_rate": 1.9242641975314474e-05, + "loss": 0.1025, + "num_input_tokens_seen": 134727872, + "step": 110725 + }, + { + "epoch": 12.332108252589375, + "grad_norm": 0.17426687479019165, + "learning_rate": 1.9240277573581777e-05, + "loss": 0.0117, + "num_input_tokens_seen": 134733696, + "step": 110730 + }, + { + "epoch": 12.332665107472993, + "grad_norm": 0.014119812287390232, + "learning_rate": 1.9237913226254264e-05, + "loss": 0.0067, + "num_input_tokens_seen": 134740064, + "step": 110735 + }, + { + "epoch": 12.33322196235661, + "grad_norm": 1.9649862051010132, + "learning_rate": 1.9235548933354282e-05, + "loss": 0.1017, + "num_input_tokens_seen": 134746368, + "step": 110740 + }, + { + "epoch": 12.333778817240226, + "grad_norm": 1.6749694347381592, + "learning_rate": 1.9233184694904157e-05, + "loss": 0.0388, + "num_input_tokens_seen": 134752608, + "step": 110745 + }, + { + "epoch": 12.334335672123844, + "grad_norm": 0.12360360473394394, + "learning_rate": 1.9230820510926225e-05, + "loss": 0.0422, + "num_input_tokens_seen": 134759008, + "step": 110750 + }, + { + "epoch": 12.334892527007462, + "grad_norm": 0.5748158097267151, + "learning_rate": 1.9228456381442813e-05, + "loss": 0.0189, + "num_input_tokens_seen": 134764896, + "step": 110755 + }, + { + "epoch": 12.33544938189108, + "grad_norm": 0.07690560072660446, + "learning_rate": 1.9226092306476256e-05, + "loss": 0.1347, + "num_input_tokens_seen": 134770176, + "step": 110760 + }, + { + "epoch": 12.336006236774697, + "grad_norm": 1.5455883741378784, + "learning_rate": 1.922372828604888e-05, + "loss": 0.0331, + "num_input_tokens_seen": 134776608, + "step": 110765 + }, + { + "epoch": 12.336563091658315, + "grad_norm": 0.15565000474452972, + "learning_rate": 1.9221364320183028e-05, + "loss": 0.1742, + "num_input_tokens_seen": 134783008, + "step": 110770 + }, + { + "epoch": 12.33711994654193, + "grad_norm": 0.5353695154190063, + "learning_rate": 1.9219000408901e-05, + "loss": 0.0087, + "num_input_tokens_seen": 134789504, + "step": 110775 + }, + { + "epoch": 12.337676801425548, + "grad_norm": 2.0415451526641846, + "learning_rate": 1.921663655222517e-05, + "loss": 0.0143, + "num_input_tokens_seen": 134794976, + "step": 110780 + }, + { + "epoch": 12.338233656309166, + "grad_norm": 9.906822378979996e-05, + "learning_rate": 1.921427275017782e-05, + "loss": 0.1136, + "num_input_tokens_seen": 134801280, + "step": 110785 + }, + { + "epoch": 12.338790511192784, + "grad_norm": 0.15308910608291626, + "learning_rate": 1.921190900278131e-05, + "loss": 0.0191, + "num_input_tokens_seen": 134807264, + "step": 110790 + }, + { + "epoch": 12.339347366076401, + "grad_norm": 0.8399007320404053, + "learning_rate": 1.920954531005795e-05, + "loss": 0.1211, + "num_input_tokens_seen": 134812992, + "step": 110795 + }, + { + "epoch": 12.339904220960017, + "grad_norm": 0.0783056989312172, + "learning_rate": 1.9207181672030085e-05, + "loss": 0.0032, + "num_input_tokens_seen": 134819040, + "step": 110800 + }, + { + "epoch": 12.340461075843635, + "grad_norm": 0.0018144570058211684, + "learning_rate": 1.920481808872002e-05, + "loss": 0.0073, + "num_input_tokens_seen": 134825088, + "step": 110805 + }, + { + "epoch": 12.341017930727253, + "grad_norm": 1.418871521949768, + "learning_rate": 1.92024545601501e-05, + "loss": 0.1514, + "num_input_tokens_seen": 134831424, + "step": 110810 + }, + { + "epoch": 12.34157478561087, + "grad_norm": 0.44677039980888367, + "learning_rate": 1.9200091086342634e-05, + "loss": 0.0408, + "num_input_tokens_seen": 134837632, + "step": 110815 + }, + { + "epoch": 12.342131640494488, + "grad_norm": 0.03008061647415161, + "learning_rate": 1.919772766731996e-05, + "loss": 0.0247, + "num_input_tokens_seen": 134843712, + "step": 110820 + }, + { + "epoch": 12.342688495378104, + "grad_norm": 0.04111146554350853, + "learning_rate": 1.919536430310439e-05, + "loss": 0.0117, + "num_input_tokens_seen": 134849600, + "step": 110825 + }, + { + "epoch": 12.343245350261721, + "grad_norm": 0.01313943974673748, + "learning_rate": 1.919300099371826e-05, + "loss": 0.074, + "num_input_tokens_seen": 134855712, + "step": 110830 + }, + { + "epoch": 12.343802205145339, + "grad_norm": 0.3104996979236603, + "learning_rate": 1.9190637739183888e-05, + "loss": 0.0438, + "num_input_tokens_seen": 134861536, + "step": 110835 + }, + { + "epoch": 12.344359060028957, + "grad_norm": 0.0203495305031538, + "learning_rate": 1.9188274539523598e-05, + "loss": 0.0117, + "num_input_tokens_seen": 134867968, + "step": 110840 + }, + { + "epoch": 12.344915914912574, + "grad_norm": 2.647049903869629, + "learning_rate": 1.91859113947597e-05, + "loss": 0.0301, + "num_input_tokens_seen": 134874368, + "step": 110845 + }, + { + "epoch": 12.34547276979619, + "grad_norm": 0.04609432443976402, + "learning_rate": 1.9183548304914546e-05, + "loss": 0.1407, + "num_input_tokens_seen": 134879968, + "step": 110850 + }, + { + "epoch": 12.346029624679808, + "grad_norm": 0.0304188784211874, + "learning_rate": 1.9181185270010418e-05, + "loss": 0.0181, + "num_input_tokens_seen": 134885696, + "step": 110855 + }, + { + "epoch": 12.346586479563426, + "grad_norm": 0.6777000427246094, + "learning_rate": 1.917882229006967e-05, + "loss": 0.0097, + "num_input_tokens_seen": 134892256, + "step": 110860 + }, + { + "epoch": 12.347143334447043, + "grad_norm": 0.0019880696199834347, + "learning_rate": 1.917645936511461e-05, + "loss": 0.0886, + "num_input_tokens_seen": 134898144, + "step": 110865 + }, + { + "epoch": 12.347700189330661, + "grad_norm": 0.7758181095123291, + "learning_rate": 1.9174096495167555e-05, + "loss": 0.0717, + "num_input_tokens_seen": 134904192, + "step": 110870 + }, + { + "epoch": 12.348257044214277, + "grad_norm": 6.377804675139487e-05, + "learning_rate": 1.917173368025082e-05, + "loss": 0.09, + "num_input_tokens_seen": 134910336, + "step": 110875 + }, + { + "epoch": 12.348813899097895, + "grad_norm": 0.0005044921999797225, + "learning_rate": 1.9169370920386737e-05, + "loss": 0.1047, + "num_input_tokens_seen": 134916480, + "step": 110880 + }, + { + "epoch": 12.349370753981512, + "grad_norm": 0.031345538794994354, + "learning_rate": 1.9167008215597613e-05, + "loss": 0.0007, + "num_input_tokens_seen": 134922496, + "step": 110885 + }, + { + "epoch": 12.34992760886513, + "grad_norm": 0.00020806165412068367, + "learning_rate": 1.9164645565905774e-05, + "loss": 0.0283, + "num_input_tokens_seen": 134928800, + "step": 110890 + }, + { + "epoch": 12.350484463748748, + "grad_norm": 0.0028260895051062107, + "learning_rate": 1.916228297133353e-05, + "loss": 0.0257, + "num_input_tokens_seen": 134934912, + "step": 110895 + }, + { + "epoch": 12.351041318632365, + "grad_norm": 2.9660587310791016, + "learning_rate": 1.91599204319032e-05, + "loss": 0.2063, + "num_input_tokens_seen": 134941152, + "step": 110900 + }, + { + "epoch": 12.351598173515981, + "grad_norm": 0.00012022595183225349, + "learning_rate": 1.9157557947637097e-05, + "loss": 0.0304, + "num_input_tokens_seen": 134946976, + "step": 110905 + }, + { + "epoch": 12.352155028399599, + "grad_norm": 1.073424220085144, + "learning_rate": 1.9155195518557554e-05, + "loss": 0.0399, + "num_input_tokens_seen": 134953312, + "step": 110910 + }, + { + "epoch": 12.352711883283217, + "grad_norm": 0.0018526904750615358, + "learning_rate": 1.9152833144686855e-05, + "loss": 0.0044, + "num_input_tokens_seen": 134959104, + "step": 110915 + }, + { + "epoch": 12.353268738166834, + "grad_norm": 0.3595006465911865, + "learning_rate": 1.9150470826047344e-05, + "loss": 0.0755, + "num_input_tokens_seen": 134965216, + "step": 110920 + }, + { + "epoch": 12.353825593050452, + "grad_norm": 1.1580467224121094, + "learning_rate": 1.9148108562661313e-05, + "loss": 0.0363, + "num_input_tokens_seen": 134971552, + "step": 110925 + }, + { + "epoch": 12.354382447934068, + "grad_norm": 0.0028378718998283148, + "learning_rate": 1.9145746354551093e-05, + "loss": 0.029, + "num_input_tokens_seen": 134977760, + "step": 110930 + }, + { + "epoch": 12.354939302817685, + "grad_norm": 0.0037448941729962826, + "learning_rate": 1.9143384201738986e-05, + "loss": 0.0029, + "num_input_tokens_seen": 134984160, + "step": 110935 + }, + { + "epoch": 12.355496157701303, + "grad_norm": 1.2006229162216187, + "learning_rate": 1.9141022104247308e-05, + "loss": 0.0946, + "num_input_tokens_seen": 134990336, + "step": 110940 + }, + { + "epoch": 12.35605301258492, + "grad_norm": 0.11960160732269287, + "learning_rate": 1.9138660062098368e-05, + "loss": 0.0443, + "num_input_tokens_seen": 134996672, + "step": 110945 + }, + { + "epoch": 12.356609867468539, + "grad_norm": 0.009885339997708797, + "learning_rate": 1.9136298075314486e-05, + "loss": 0.0484, + "num_input_tokens_seen": 135002912, + "step": 110950 + }, + { + "epoch": 12.357166722352154, + "grad_norm": 0.04157378524541855, + "learning_rate": 1.9133936143917957e-05, + "loss": 0.1165, + "num_input_tokens_seen": 135008896, + "step": 110955 + }, + { + "epoch": 12.357723577235772, + "grad_norm": 0.016047529876232147, + "learning_rate": 1.913157426793111e-05, + "loss": 0.028, + "num_input_tokens_seen": 135015136, + "step": 110960 + }, + { + "epoch": 12.35828043211939, + "grad_norm": 0.008654555305838585, + "learning_rate": 1.9129212447376236e-05, + "loss": 0.0581, + "num_input_tokens_seen": 135021216, + "step": 110965 + }, + { + "epoch": 12.358837287003007, + "grad_norm": 0.2444717288017273, + "learning_rate": 1.9126850682275665e-05, + "loss": 0.0554, + "num_input_tokens_seen": 135027424, + "step": 110970 + }, + { + "epoch": 12.359394141886625, + "grad_norm": 0.11719245463609695, + "learning_rate": 1.9124488972651684e-05, + "loss": 0.0852, + "num_input_tokens_seen": 135033568, + "step": 110975 + }, + { + "epoch": 12.359950996770241, + "grad_norm": 0.3816358745098114, + "learning_rate": 1.9122127318526626e-05, + "loss": 0.0121, + "num_input_tokens_seen": 135039424, + "step": 110980 + }, + { + "epoch": 12.360507851653859, + "grad_norm": 0.15345551073551178, + "learning_rate": 1.911976571992277e-05, + "loss": 0.0116, + "num_input_tokens_seen": 135045696, + "step": 110985 + }, + { + "epoch": 12.361064706537476, + "grad_norm": 0.0011481810361146927, + "learning_rate": 1.9117404176862446e-05, + "loss": 0.0031, + "num_input_tokens_seen": 135052224, + "step": 110990 + }, + { + "epoch": 12.361621561421094, + "grad_norm": 4.069070816040039, + "learning_rate": 1.911504268936795e-05, + "loss": 0.1345, + "num_input_tokens_seen": 135057952, + "step": 110995 + }, + { + "epoch": 12.362178416304712, + "grad_norm": 0.018523218110203743, + "learning_rate": 1.9112681257461592e-05, + "loss": 0.0678, + "num_input_tokens_seen": 135063808, + "step": 111000 + }, + { + "epoch": 12.362735271188328, + "grad_norm": 0.06988487392663956, + "learning_rate": 1.9110319881165676e-05, + "loss": 0.0353, + "num_input_tokens_seen": 135069664, + "step": 111005 + }, + { + "epoch": 12.363292126071945, + "grad_norm": 0.0017277173465117812, + "learning_rate": 1.910795856050251e-05, + "loss": 0.0314, + "num_input_tokens_seen": 135075936, + "step": 111010 + }, + { + "epoch": 12.363848980955563, + "grad_norm": 0.00015294643526431173, + "learning_rate": 1.910559729549439e-05, + "loss": 0.0489, + "num_input_tokens_seen": 135082048, + "step": 111015 + }, + { + "epoch": 12.36440583583918, + "grad_norm": 0.031037399545311928, + "learning_rate": 1.9103236086163633e-05, + "loss": 0.0181, + "num_input_tokens_seen": 135088320, + "step": 111020 + }, + { + "epoch": 12.364962690722798, + "grad_norm": 0.0011806132970377803, + "learning_rate": 1.9100874932532532e-05, + "loss": 0.1794, + "num_input_tokens_seen": 135093856, + "step": 111025 + }, + { + "epoch": 12.365519545606414, + "grad_norm": 0.0016301939031109214, + "learning_rate": 1.9098513834623395e-05, + "loss": 0.0233, + "num_input_tokens_seen": 135100000, + "step": 111030 + }, + { + "epoch": 12.366076400490032, + "grad_norm": 0.5063681602478027, + "learning_rate": 1.9096152792458517e-05, + "loss": 0.0391, + "num_input_tokens_seen": 135106112, + "step": 111035 + }, + { + "epoch": 12.36663325537365, + "grad_norm": 0.12835052609443665, + "learning_rate": 1.9093791806060217e-05, + "loss": 0.0517, + "num_input_tokens_seen": 135111392, + "step": 111040 + }, + { + "epoch": 12.367190110257267, + "grad_norm": 0.059077367186546326, + "learning_rate": 1.909143087545077e-05, + "loss": 0.0097, + "num_input_tokens_seen": 135117632, + "step": 111045 + }, + { + "epoch": 12.367746965140885, + "grad_norm": 0.0006222265656106174, + "learning_rate": 1.9089070000652508e-05, + "loss": 0.0679, + "num_input_tokens_seen": 135124000, + "step": 111050 + }, + { + "epoch": 12.3683038200245, + "grad_norm": 0.005882679484784603, + "learning_rate": 1.9086709181687703e-05, + "loss": 0.1076, + "num_input_tokens_seen": 135129632, + "step": 111055 + }, + { + "epoch": 12.368860674908118, + "grad_norm": 1.2146697044372559, + "learning_rate": 1.908434841857868e-05, + "loss": 0.1522, + "num_input_tokens_seen": 135135776, + "step": 111060 + }, + { + "epoch": 12.369417529791736, + "grad_norm": 0.03718406707048416, + "learning_rate": 1.9081987711347714e-05, + "loss": 0.0168, + "num_input_tokens_seen": 135141920, + "step": 111065 + }, + { + "epoch": 12.369974384675354, + "grad_norm": 0.18711641430854797, + "learning_rate": 1.9079627060017126e-05, + "loss": 0.0236, + "num_input_tokens_seen": 135147904, + "step": 111070 + }, + { + "epoch": 12.370531239558971, + "grad_norm": 0.15677209198474884, + "learning_rate": 1.9077266464609194e-05, + "loss": 0.0061, + "num_input_tokens_seen": 135154176, + "step": 111075 + }, + { + "epoch": 12.371088094442587, + "grad_norm": 0.004415994044393301, + "learning_rate": 1.9074905925146234e-05, + "loss": 0.0039, + "num_input_tokens_seen": 135160448, + "step": 111080 + }, + { + "epoch": 12.371644949326205, + "grad_norm": 0.009069716557860374, + "learning_rate": 1.9072545441650528e-05, + "loss": 0.0433, + "num_input_tokens_seen": 135166688, + "step": 111085 + }, + { + "epoch": 12.372201804209823, + "grad_norm": 0.29987362027168274, + "learning_rate": 1.9070185014144384e-05, + "loss": 0.1112, + "num_input_tokens_seen": 135172672, + "step": 111090 + }, + { + "epoch": 12.37275865909344, + "grad_norm": 0.11009014397859573, + "learning_rate": 1.9067824642650083e-05, + "loss": 0.0395, + "num_input_tokens_seen": 135178624, + "step": 111095 + }, + { + "epoch": 12.373315513977058, + "grad_norm": 0.4330350458621979, + "learning_rate": 1.9065464327189946e-05, + "loss": 0.0141, + "num_input_tokens_seen": 135184864, + "step": 111100 + }, + { + "epoch": 12.373872368860676, + "grad_norm": 0.0008268895908258855, + "learning_rate": 1.9063104067786253e-05, + "loss": 0.0531, + "num_input_tokens_seen": 135191168, + "step": 111105 + }, + { + "epoch": 12.374429223744292, + "grad_norm": 0.023912867531180382, + "learning_rate": 1.906074386446129e-05, + "loss": 0.0084, + "num_input_tokens_seen": 135197280, + "step": 111110 + }, + { + "epoch": 12.37498607862791, + "grad_norm": 0.5347961187362671, + "learning_rate": 1.9058383717237363e-05, + "loss": 0.077, + "num_input_tokens_seen": 135203488, + "step": 111115 + }, + { + "epoch": 12.375542933511527, + "grad_norm": 0.07510421425104141, + "learning_rate": 1.9056023626136754e-05, + "loss": 0.0309, + "num_input_tokens_seen": 135209984, + "step": 111120 + }, + { + "epoch": 12.376099788395145, + "grad_norm": 0.35286176204681396, + "learning_rate": 1.905366359118178e-05, + "loss": 0.1042, + "num_input_tokens_seen": 135216288, + "step": 111125 + }, + { + "epoch": 12.376656643278762, + "grad_norm": 1.171824336051941, + "learning_rate": 1.90513036123947e-05, + "loss": 0.0515, + "num_input_tokens_seen": 135222112, + "step": 111130 + }, + { + "epoch": 12.377213498162378, + "grad_norm": 0.0005564808379858732, + "learning_rate": 1.9048943689797832e-05, + "loss": 0.0053, + "num_input_tokens_seen": 135228192, + "step": 111135 + }, + { + "epoch": 12.377770353045996, + "grad_norm": 0.19075971841812134, + "learning_rate": 1.904658382341346e-05, + "loss": 0.0053, + "num_input_tokens_seen": 135234208, + "step": 111140 + }, + { + "epoch": 12.378327207929614, + "grad_norm": 0.00687428331002593, + "learning_rate": 1.9044224013263874e-05, + "loss": 0.007, + "num_input_tokens_seen": 135240096, + "step": 111145 + }, + { + "epoch": 12.378884062813231, + "grad_norm": 0.161250501871109, + "learning_rate": 1.9041864259371358e-05, + "loss": 0.1054, + "num_input_tokens_seen": 135246176, + "step": 111150 + }, + { + "epoch": 12.379440917696849, + "grad_norm": 1.2716580629348755, + "learning_rate": 1.9039504561758214e-05, + "loss": 0.0972, + "num_input_tokens_seen": 135252448, + "step": 111155 + }, + { + "epoch": 12.379997772580465, + "grad_norm": 0.9280474781990051, + "learning_rate": 1.903714492044672e-05, + "loss": 0.0359, + "num_input_tokens_seen": 135258400, + "step": 111160 + }, + { + "epoch": 12.380554627464083, + "grad_norm": 1.093819499015808, + "learning_rate": 1.9034785335459178e-05, + "loss": 0.0257, + "num_input_tokens_seen": 135264416, + "step": 111165 + }, + { + "epoch": 12.3811114823477, + "grad_norm": 0.00973917730152607, + "learning_rate": 1.903242580681786e-05, + "loss": 0.002, + "num_input_tokens_seen": 135270528, + "step": 111170 + }, + { + "epoch": 12.381668337231318, + "grad_norm": 0.1404072642326355, + "learning_rate": 1.9030066334545064e-05, + "loss": 0.0249, + "num_input_tokens_seen": 135276480, + "step": 111175 + }, + { + "epoch": 12.382225192114936, + "grad_norm": 0.3624456822872162, + "learning_rate": 1.9027706918663065e-05, + "loss": 0.0396, + "num_input_tokens_seen": 135282592, + "step": 111180 + }, + { + "epoch": 12.382782046998551, + "grad_norm": 0.19942088425159454, + "learning_rate": 1.9025347559194178e-05, + "loss": 0.006, + "num_input_tokens_seen": 135288608, + "step": 111185 + }, + { + "epoch": 12.38333890188217, + "grad_norm": 0.01007520779967308, + "learning_rate": 1.902298825616065e-05, + "loss": 0.0624, + "num_input_tokens_seen": 135294624, + "step": 111190 + }, + { + "epoch": 12.383895756765787, + "grad_norm": 0.042629875242710114, + "learning_rate": 1.90206290095848e-05, + "loss": 0.0178, + "num_input_tokens_seen": 135300768, + "step": 111195 + }, + { + "epoch": 12.384452611649404, + "grad_norm": 0.014700688421726227, + "learning_rate": 1.9018269819488895e-05, + "loss": 0.004, + "num_input_tokens_seen": 135307008, + "step": 111200 + }, + { + "epoch": 12.385009466533022, + "grad_norm": 0.5776166319847107, + "learning_rate": 1.9015910685895228e-05, + "loss": 0.0222, + "num_input_tokens_seen": 135312928, + "step": 111205 + }, + { + "epoch": 12.385566321416638, + "grad_norm": 0.00031760914134792984, + "learning_rate": 1.9013551608826076e-05, + "loss": 0.11, + "num_input_tokens_seen": 135318208, + "step": 111210 + }, + { + "epoch": 12.386123176300256, + "grad_norm": 1.6365002393722534, + "learning_rate": 1.9011192588303728e-05, + "loss": 0.107, + "num_input_tokens_seen": 135324512, + "step": 111215 + }, + { + "epoch": 12.386680031183873, + "grad_norm": 1.2899134159088135, + "learning_rate": 1.900883362435046e-05, + "loss": 0.0356, + "num_input_tokens_seen": 135330784, + "step": 111220 + }, + { + "epoch": 12.387236886067491, + "grad_norm": 1.178472876548767, + "learning_rate": 1.900647471698856e-05, + "loss": 0.0253, + "num_input_tokens_seen": 135336736, + "step": 111225 + }, + { + "epoch": 12.387793740951109, + "grad_norm": 0.00144200399518013, + "learning_rate": 1.9004115866240308e-05, + "loss": 0.0052, + "num_input_tokens_seen": 135342976, + "step": 111230 + }, + { + "epoch": 12.388350595834725, + "grad_norm": 0.017046038061380386, + "learning_rate": 1.9001757072127987e-05, + "loss": 0.0497, + "num_input_tokens_seen": 135349088, + "step": 111235 + }, + { + "epoch": 12.388907450718342, + "grad_norm": 0.017076153308153152, + "learning_rate": 1.899939833467387e-05, + "loss": 0.0187, + "num_input_tokens_seen": 135355296, + "step": 111240 + }, + { + "epoch": 12.38946430560196, + "grad_norm": 1.2408578395843506, + "learning_rate": 1.8997039653900255e-05, + "loss": 0.0592, + "num_input_tokens_seen": 135361280, + "step": 111245 + }, + { + "epoch": 12.390021160485578, + "grad_norm": 0.0005187506321817636, + "learning_rate": 1.8994681029829394e-05, + "loss": 0.1085, + "num_input_tokens_seen": 135367680, + "step": 111250 + }, + { + "epoch": 12.390578015369195, + "grad_norm": 0.0054418547078967094, + "learning_rate": 1.89923224624836e-05, + "loss": 0.0053, + "num_input_tokens_seen": 135373952, + "step": 111255 + }, + { + "epoch": 12.391134870252813, + "grad_norm": 0.0019187512807548046, + "learning_rate": 1.8989963951885118e-05, + "loss": 0.0141, + "num_input_tokens_seen": 135380256, + "step": 111260 + }, + { + "epoch": 12.391691725136429, + "grad_norm": 0.20948892831802368, + "learning_rate": 1.8987605498056252e-05, + "loss": 0.0649, + "num_input_tokens_seen": 135386240, + "step": 111265 + }, + { + "epoch": 12.392248580020047, + "grad_norm": 1.4268540143966675, + "learning_rate": 1.8985247101019265e-05, + "loss": 0.1, + "num_input_tokens_seen": 135392704, + "step": 111270 + }, + { + "epoch": 12.392805434903664, + "grad_norm": 0.2852981686592102, + "learning_rate": 1.898288876079644e-05, + "loss": 0.0163, + "num_input_tokens_seen": 135398304, + "step": 111275 + }, + { + "epoch": 12.393362289787282, + "grad_norm": 0.9022379517555237, + "learning_rate": 1.8980530477410047e-05, + "loss": 0.0564, + "num_input_tokens_seen": 135404192, + "step": 111280 + }, + { + "epoch": 12.3939191446709, + "grad_norm": 0.01568613201379776, + "learning_rate": 1.8978172250882374e-05, + "loss": 0.0493, + "num_input_tokens_seen": 135410208, + "step": 111285 + }, + { + "epoch": 12.394475999554516, + "grad_norm": 0.0002695443981792778, + "learning_rate": 1.897581408123568e-05, + "loss": 0.0132, + "num_input_tokens_seen": 135416608, + "step": 111290 + }, + { + "epoch": 12.395032854438133, + "grad_norm": 0.28338009119033813, + "learning_rate": 1.897345596849226e-05, + "loss": 0.0371, + "num_input_tokens_seen": 135422624, + "step": 111295 + }, + { + "epoch": 12.39558970932175, + "grad_norm": 1.112498164176941, + "learning_rate": 1.8971097912674362e-05, + "loss": 0.0371, + "num_input_tokens_seen": 135428864, + "step": 111300 + }, + { + "epoch": 12.396146564205369, + "grad_norm": 1.6355299949645996, + "learning_rate": 1.8968739913804287e-05, + "loss": 0.0693, + "num_input_tokens_seen": 135434144, + "step": 111305 + }, + { + "epoch": 12.396703419088986, + "grad_norm": 0.02120634913444519, + "learning_rate": 1.8966381971904282e-05, + "loss": 0.0214, + "num_input_tokens_seen": 135440512, + "step": 111310 + }, + { + "epoch": 12.397260273972602, + "grad_norm": 0.0004522221861407161, + "learning_rate": 1.8964024086996652e-05, + "loss": 0.0405, + "num_input_tokens_seen": 135446528, + "step": 111315 + }, + { + "epoch": 12.39781712885622, + "grad_norm": 0.4656103849411011, + "learning_rate": 1.8961666259103633e-05, + "loss": 0.0112, + "num_input_tokens_seen": 135452640, + "step": 111320 + }, + { + "epoch": 12.398373983739837, + "grad_norm": 0.009441710077226162, + "learning_rate": 1.8959308488247523e-05, + "loss": 0.0169, + "num_input_tokens_seen": 135458752, + "step": 111325 + }, + { + "epoch": 12.398930838623455, + "grad_norm": 0.10530304163694382, + "learning_rate": 1.895695077445058e-05, + "loss": 0.0071, + "num_input_tokens_seen": 135464960, + "step": 111330 + }, + { + "epoch": 12.399487693507073, + "grad_norm": 0.675174355506897, + "learning_rate": 1.8954593117735083e-05, + "loss": 0.1595, + "num_input_tokens_seen": 135470464, + "step": 111335 + }, + { + "epoch": 12.400044548390689, + "grad_norm": 0.2689448595046997, + "learning_rate": 1.8952235518123295e-05, + "loss": 0.04, + "num_input_tokens_seen": 135476192, + "step": 111340 + }, + { + "epoch": 12.400601403274306, + "grad_norm": 0.007659325376152992, + "learning_rate": 1.8949877975637492e-05, + "loss": 0.1253, + "num_input_tokens_seen": 135482240, + "step": 111345 + }, + { + "epoch": 12.401158258157924, + "grad_norm": 0.6656768321990967, + "learning_rate": 1.894752049029993e-05, + "loss": 0.1807, + "num_input_tokens_seen": 135488288, + "step": 111350 + }, + { + "epoch": 12.401715113041542, + "grad_norm": 0.025432543829083443, + "learning_rate": 1.8945163062132897e-05, + "loss": 0.0446, + "num_input_tokens_seen": 135494368, + "step": 111355 + }, + { + "epoch": 12.40227196792516, + "grad_norm": 0.2143317461013794, + "learning_rate": 1.8942805691158642e-05, + "loss": 0.0282, + "num_input_tokens_seen": 135500576, + "step": 111360 + }, + { + "epoch": 12.402828822808775, + "grad_norm": 0.7030622363090515, + "learning_rate": 1.8940448377399443e-05, + "loss": 0.141, + "num_input_tokens_seen": 135506656, + "step": 111365 + }, + { + "epoch": 12.403385677692393, + "grad_norm": 2.512779712677002, + "learning_rate": 1.8938091120877557e-05, + "loss": 0.1184, + "num_input_tokens_seen": 135512256, + "step": 111370 + }, + { + "epoch": 12.40394253257601, + "grad_norm": 1.1302887201309204, + "learning_rate": 1.893573392161527e-05, + "loss": 0.1146, + "num_input_tokens_seen": 135518048, + "step": 111375 + }, + { + "epoch": 12.404499387459628, + "grad_norm": 0.0034236980136483908, + "learning_rate": 1.8933376779634822e-05, + "loss": 0.0065, + "num_input_tokens_seen": 135524352, + "step": 111380 + }, + { + "epoch": 12.405056242343246, + "grad_norm": 1.0104612112045288, + "learning_rate": 1.8931019694958506e-05, + "loss": 0.0191, + "num_input_tokens_seen": 135530464, + "step": 111385 + }, + { + "epoch": 12.405613097226862, + "grad_norm": 0.04898793250322342, + "learning_rate": 1.8928662667608553e-05, + "loss": 0.0032, + "num_input_tokens_seen": 135536672, + "step": 111390 + }, + { + "epoch": 12.40616995211048, + "grad_norm": 0.021948831155896187, + "learning_rate": 1.892630569760726e-05, + "loss": 0.0483, + "num_input_tokens_seen": 135542752, + "step": 111395 + }, + { + "epoch": 12.406726806994097, + "grad_norm": 0.03558913618326187, + "learning_rate": 1.892394878497687e-05, + "loss": 0.1638, + "num_input_tokens_seen": 135548928, + "step": 111400 + }, + { + "epoch": 12.407283661877715, + "grad_norm": 0.250509649515152, + "learning_rate": 1.8921591929739654e-05, + "loss": 0.0852, + "num_input_tokens_seen": 135555168, + "step": 111405 + }, + { + "epoch": 12.407840516761333, + "grad_norm": 0.32166239619255066, + "learning_rate": 1.8919235131917866e-05, + "loss": 0.0046, + "num_input_tokens_seen": 135561216, + "step": 111410 + }, + { + "epoch": 12.408397371644948, + "grad_norm": 0.017774421721696854, + "learning_rate": 1.8916878391533785e-05, + "loss": 0.0116, + "num_input_tokens_seen": 135567456, + "step": 111415 + }, + { + "epoch": 12.408954226528566, + "grad_norm": 0.00997878611087799, + "learning_rate": 1.8914521708609655e-05, + "loss": 0.0406, + "num_input_tokens_seen": 135573600, + "step": 111420 + }, + { + "epoch": 12.409511081412184, + "grad_norm": 0.00030603152117691934, + "learning_rate": 1.8912165083167742e-05, + "loss": 0.0873, + "num_input_tokens_seen": 135579904, + "step": 111425 + }, + { + "epoch": 12.410067936295802, + "grad_norm": 0.02848442643880844, + "learning_rate": 1.8909808515230306e-05, + "loss": 0.0014, + "num_input_tokens_seen": 135586368, + "step": 111430 + }, + { + "epoch": 12.41062479117942, + "grad_norm": 0.5273196697235107, + "learning_rate": 1.8907452004819614e-05, + "loss": 0.0971, + "num_input_tokens_seen": 135592448, + "step": 111435 + }, + { + "epoch": 12.411181646063035, + "grad_norm": 0.6354087591171265, + "learning_rate": 1.8905095551957906e-05, + "loss": 0.0218, + "num_input_tokens_seen": 135598688, + "step": 111440 + }, + { + "epoch": 12.411738500946653, + "grad_norm": 0.19082510471343994, + "learning_rate": 1.8902739156667473e-05, + "loss": 0.0103, + "num_input_tokens_seen": 135604224, + "step": 111445 + }, + { + "epoch": 12.41229535583027, + "grad_norm": 0.6636462807655334, + "learning_rate": 1.8900382818970536e-05, + "loss": 0.0161, + "num_input_tokens_seen": 135610048, + "step": 111450 + }, + { + "epoch": 12.412852210713888, + "grad_norm": 0.0012647334951907396, + "learning_rate": 1.889802653888938e-05, + "loss": 0.0159, + "num_input_tokens_seen": 135616256, + "step": 111455 + }, + { + "epoch": 12.413409065597506, + "grad_norm": 0.001189646078273654, + "learning_rate": 1.8895670316446248e-05, + "loss": 0.0228, + "num_input_tokens_seen": 135622432, + "step": 111460 + }, + { + "epoch": 12.413965920481123, + "grad_norm": 0.042465221136808395, + "learning_rate": 1.8893314151663407e-05, + "loss": 0.0231, + "num_input_tokens_seen": 135628864, + "step": 111465 + }, + { + "epoch": 12.41452277536474, + "grad_norm": 0.045130375772714615, + "learning_rate": 1.8890958044563102e-05, + "loss": 0.0028, + "num_input_tokens_seen": 135634784, + "step": 111470 + }, + { + "epoch": 12.415079630248357, + "grad_norm": 0.3736400306224823, + "learning_rate": 1.8888601995167592e-05, + "loss": 0.1007, + "num_input_tokens_seen": 135640800, + "step": 111475 + }, + { + "epoch": 12.415636485131975, + "grad_norm": 0.3588889241218567, + "learning_rate": 1.8886246003499132e-05, + "loss": 0.0122, + "num_input_tokens_seen": 135646848, + "step": 111480 + }, + { + "epoch": 12.416193340015592, + "grad_norm": 0.08285383880138397, + "learning_rate": 1.8883890069579982e-05, + "loss": 0.0544, + "num_input_tokens_seen": 135653152, + "step": 111485 + }, + { + "epoch": 12.41675019489921, + "grad_norm": 0.6691033840179443, + "learning_rate": 1.8881534193432383e-05, + "loss": 0.0654, + "num_input_tokens_seen": 135659520, + "step": 111490 + }, + { + "epoch": 12.417307049782826, + "grad_norm": 0.09956740587949753, + "learning_rate": 1.88791783750786e-05, + "loss": 0.0147, + "num_input_tokens_seen": 135665888, + "step": 111495 + }, + { + "epoch": 12.417863904666444, + "grad_norm": 0.2671446204185486, + "learning_rate": 1.8876822614540877e-05, + "loss": 0.1192, + "num_input_tokens_seen": 135672064, + "step": 111500 + }, + { + "epoch": 12.418420759550061, + "grad_norm": 2.282853603363037, + "learning_rate": 1.887446691184148e-05, + "loss": 0.1038, + "num_input_tokens_seen": 135678080, + "step": 111505 + }, + { + "epoch": 12.418977614433679, + "grad_norm": 0.031315360218286514, + "learning_rate": 1.887211126700264e-05, + "loss": 0.0455, + "num_input_tokens_seen": 135683712, + "step": 111510 + }, + { + "epoch": 12.419534469317297, + "grad_norm": 0.38869622349739075, + "learning_rate": 1.8869755680046623e-05, + "loss": 0.0146, + "num_input_tokens_seen": 135690048, + "step": 111515 + }, + { + "epoch": 12.420091324200913, + "grad_norm": 0.01639881730079651, + "learning_rate": 1.8867400150995683e-05, + "loss": 0.0219, + "num_input_tokens_seen": 135696032, + "step": 111520 + }, + { + "epoch": 12.42064817908453, + "grad_norm": 0.9625377655029297, + "learning_rate": 1.8865044679872046e-05, + "loss": 0.0687, + "num_input_tokens_seen": 135702144, + "step": 111525 + }, + { + "epoch": 12.421205033968148, + "grad_norm": 0.04202641174197197, + "learning_rate": 1.8862689266697995e-05, + "loss": 0.0516, + "num_input_tokens_seen": 135708256, + "step": 111530 + }, + { + "epoch": 12.421761888851766, + "grad_norm": 1.2721027135849, + "learning_rate": 1.8860333911495743e-05, + "loss": 0.0646, + "num_input_tokens_seen": 135713888, + "step": 111535 + }, + { + "epoch": 12.422318743735383, + "grad_norm": 0.012257578782737255, + "learning_rate": 1.8857978614287566e-05, + "loss": 0.0505, + "num_input_tokens_seen": 135720416, + "step": 111540 + }, + { + "epoch": 12.422875598619, + "grad_norm": 0.5352786779403687, + "learning_rate": 1.88556233750957e-05, + "loss": 0.0235, + "num_input_tokens_seen": 135726592, + "step": 111545 + }, + { + "epoch": 12.423432453502617, + "grad_norm": 0.483985960483551, + "learning_rate": 1.8853268193942393e-05, + "loss": 0.0139, + "num_input_tokens_seen": 135732640, + "step": 111550 + }, + { + "epoch": 12.423989308386234, + "grad_norm": 0.04779493063688278, + "learning_rate": 1.8850913070849893e-05, + "loss": 0.0335, + "num_input_tokens_seen": 135738848, + "step": 111555 + }, + { + "epoch": 12.424546163269852, + "grad_norm": 0.01819000393152237, + "learning_rate": 1.884855800584045e-05, + "loss": 0.0809, + "num_input_tokens_seen": 135745056, + "step": 111560 + }, + { + "epoch": 12.42510301815347, + "grad_norm": 0.11136706918478012, + "learning_rate": 1.88462029989363e-05, + "loss": 0.0179, + "num_input_tokens_seen": 135751328, + "step": 111565 + }, + { + "epoch": 12.425659873037086, + "grad_norm": 0.00035519100492820144, + "learning_rate": 1.8843848050159697e-05, + "loss": 0.0054, + "num_input_tokens_seen": 135757344, + "step": 111570 + }, + { + "epoch": 12.426216727920703, + "grad_norm": 0.5626952052116394, + "learning_rate": 1.884149315953288e-05, + "loss": 0.0109, + "num_input_tokens_seen": 135763488, + "step": 111575 + }, + { + "epoch": 12.426773582804321, + "grad_norm": 0.14556951820850372, + "learning_rate": 1.8839138327078094e-05, + "loss": 0.0534, + "num_input_tokens_seen": 135769632, + "step": 111580 + }, + { + "epoch": 12.427330437687939, + "grad_norm": 1.0139504671096802, + "learning_rate": 1.8836783552817573e-05, + "loss": 0.0187, + "num_input_tokens_seen": 135775744, + "step": 111585 + }, + { + "epoch": 12.427887292571556, + "grad_norm": 0.7426566481590271, + "learning_rate": 1.8834428836773586e-05, + "loss": 0.0287, + "num_input_tokens_seen": 135781824, + "step": 111590 + }, + { + "epoch": 12.428444147455172, + "grad_norm": 0.12304673343896866, + "learning_rate": 1.8832074178968342e-05, + "loss": 0.0134, + "num_input_tokens_seen": 135787872, + "step": 111595 + }, + { + "epoch": 12.42900100233879, + "grad_norm": 0.373286634683609, + "learning_rate": 1.882971957942411e-05, + "loss": 0.0259, + "num_input_tokens_seen": 135793920, + "step": 111600 + }, + { + "epoch": 12.429557857222408, + "grad_norm": 0.009968524798750877, + "learning_rate": 1.8827365038163116e-05, + "loss": 0.0547, + "num_input_tokens_seen": 135799808, + "step": 111605 + }, + { + "epoch": 12.430114712106025, + "grad_norm": 0.42572927474975586, + "learning_rate": 1.8825010555207607e-05, + "loss": 0.0331, + "num_input_tokens_seen": 135806144, + "step": 111610 + }, + { + "epoch": 12.430671566989643, + "grad_norm": 0.9920939803123474, + "learning_rate": 1.882265613057982e-05, + "loss": 0.071, + "num_input_tokens_seen": 135812032, + "step": 111615 + }, + { + "epoch": 12.43122842187326, + "grad_norm": 0.0005671309190802276, + "learning_rate": 1.8820301764302e-05, + "loss": 0.0447, + "num_input_tokens_seen": 135816960, + "step": 111620 + }, + { + "epoch": 12.431785276756877, + "grad_norm": 0.12350891530513763, + "learning_rate": 1.881794745639637e-05, + "loss": 0.0068, + "num_input_tokens_seen": 135822912, + "step": 111625 + }, + { + "epoch": 12.432342131640494, + "grad_norm": 0.000173293097759597, + "learning_rate": 1.881559320688519e-05, + "loss": 0.012, + "num_input_tokens_seen": 135828928, + "step": 111630 + }, + { + "epoch": 12.432898986524112, + "grad_norm": 0.013775396160781384, + "learning_rate": 1.8813239015790678e-05, + "loss": 0.001, + "num_input_tokens_seen": 135834848, + "step": 111635 + }, + { + "epoch": 12.43345584140773, + "grad_norm": 0.01971256360411644, + "learning_rate": 1.8810884883135087e-05, + "loss": 0.0083, + "num_input_tokens_seen": 135841056, + "step": 111640 + }, + { + "epoch": 12.434012696291347, + "grad_norm": 0.00017859529179986566, + "learning_rate": 1.880853080894064e-05, + "loss": 0.0438, + "num_input_tokens_seen": 135846784, + "step": 111645 + }, + { + "epoch": 12.434569551174963, + "grad_norm": 0.24213598668575287, + "learning_rate": 1.880617679322959e-05, + "loss": 0.069, + "num_input_tokens_seen": 135853184, + "step": 111650 + }, + { + "epoch": 12.43512640605858, + "grad_norm": 0.29741331934928894, + "learning_rate": 1.880382283602415e-05, + "loss": 0.0321, + "num_input_tokens_seen": 135859584, + "step": 111655 + }, + { + "epoch": 12.435683260942199, + "grad_norm": 0.02258765883743763, + "learning_rate": 1.8801468937346584e-05, + "loss": 0.0987, + "num_input_tokens_seen": 135865120, + "step": 111660 + }, + { + "epoch": 12.436240115825816, + "grad_norm": 0.0011660351883620024, + "learning_rate": 1.8799115097219093e-05, + "loss": 0.0353, + "num_input_tokens_seen": 135871072, + "step": 111665 + }, + { + "epoch": 12.436796970709434, + "grad_norm": 0.5560592412948608, + "learning_rate": 1.8796761315663937e-05, + "loss": 0.0202, + "num_input_tokens_seen": 135877024, + "step": 111670 + }, + { + "epoch": 12.43735382559305, + "grad_norm": 3.0352976322174072, + "learning_rate": 1.8794407592703336e-05, + "loss": 0.0431, + "num_input_tokens_seen": 135883040, + "step": 111675 + }, + { + "epoch": 12.437910680476667, + "grad_norm": 0.17035965621471405, + "learning_rate": 1.8792053928359532e-05, + "loss": 0.01, + "num_input_tokens_seen": 135889152, + "step": 111680 + }, + { + "epoch": 12.438467535360285, + "grad_norm": 1.6187058687210083, + "learning_rate": 1.878970032265475e-05, + "loss": 0.0667, + "num_input_tokens_seen": 135895488, + "step": 111685 + }, + { + "epoch": 12.439024390243903, + "grad_norm": 0.007534559350460768, + "learning_rate": 1.8787346775611225e-05, + "loss": 0.0159, + "num_input_tokens_seen": 135901600, + "step": 111690 + }, + { + "epoch": 12.43958124512752, + "grad_norm": 0.011667916551232338, + "learning_rate": 1.8784993287251185e-05, + "loss": 0.0321, + "num_input_tokens_seen": 135907040, + "step": 111695 + }, + { + "epoch": 12.440138100011136, + "grad_norm": 0.5845484733581543, + "learning_rate": 1.8782639857596866e-05, + "loss": 0.1247, + "num_input_tokens_seen": 135913056, + "step": 111700 + }, + { + "epoch": 12.440694954894754, + "grad_norm": 1.1255677938461304, + "learning_rate": 1.8780286486670493e-05, + "loss": 0.1038, + "num_input_tokens_seen": 135919360, + "step": 111705 + }, + { + "epoch": 12.441251809778372, + "grad_norm": 0.03487497940659523, + "learning_rate": 1.87779331744943e-05, + "loss": 0.0045, + "num_input_tokens_seen": 135925472, + "step": 111710 + }, + { + "epoch": 12.44180866466199, + "grad_norm": 1.8940657377243042, + "learning_rate": 1.87755799210905e-05, + "loss": 0.1088, + "num_input_tokens_seen": 135930976, + "step": 111715 + }, + { + "epoch": 12.442365519545607, + "grad_norm": 0.00027072805096395314, + "learning_rate": 1.8773226726481354e-05, + "loss": 0.0609, + "num_input_tokens_seen": 135937152, + "step": 111720 + }, + { + "epoch": 12.442922374429223, + "grad_norm": 0.07439405471086502, + "learning_rate": 1.8770873590689057e-05, + "loss": 0.0548, + "num_input_tokens_seen": 135943424, + "step": 111725 + }, + { + "epoch": 12.44347922931284, + "grad_norm": 0.006569948047399521, + "learning_rate": 1.8768520513735853e-05, + "loss": 0.0692, + "num_input_tokens_seen": 135949344, + "step": 111730 + }, + { + "epoch": 12.444036084196458, + "grad_norm": 4.602468490600586, + "learning_rate": 1.8766167495643965e-05, + "loss": 0.1253, + "num_input_tokens_seen": 135955104, + "step": 111735 + }, + { + "epoch": 12.444592939080076, + "grad_norm": 0.5796924233436584, + "learning_rate": 1.8763814536435625e-05, + "loss": 0.1289, + "num_input_tokens_seen": 135960672, + "step": 111740 + }, + { + "epoch": 12.445149793963694, + "grad_norm": 0.8527304530143738, + "learning_rate": 1.876146163613305e-05, + "loss": 0.0938, + "num_input_tokens_seen": 135966688, + "step": 111745 + }, + { + "epoch": 12.44570664884731, + "grad_norm": 0.37561726570129395, + "learning_rate": 1.8759108794758468e-05, + "loss": 0.0191, + "num_input_tokens_seen": 135972832, + "step": 111750 + }, + { + "epoch": 12.446263503730927, + "grad_norm": 0.7120078206062317, + "learning_rate": 1.87567560123341e-05, + "loss": 0.0326, + "num_input_tokens_seen": 135979008, + "step": 111755 + }, + { + "epoch": 12.446820358614545, + "grad_norm": 0.0018059886060655117, + "learning_rate": 1.875440328888218e-05, + "loss": 0.0081, + "num_input_tokens_seen": 135985184, + "step": 111760 + }, + { + "epoch": 12.447377213498163, + "grad_norm": 0.0050379387103021145, + "learning_rate": 1.8752050624424916e-05, + "loss": 0.0787, + "num_input_tokens_seen": 135991136, + "step": 111765 + }, + { + "epoch": 12.44793406838178, + "grad_norm": 0.36405661702156067, + "learning_rate": 1.874969801898455e-05, + "loss": 0.1332, + "num_input_tokens_seen": 135997056, + "step": 111770 + }, + { + "epoch": 12.448490923265396, + "grad_norm": 1.9391809701919556, + "learning_rate": 1.8747345472583282e-05, + "loss": 0.0851, + "num_input_tokens_seen": 136003008, + "step": 111775 + }, + { + "epoch": 12.449047778149014, + "grad_norm": 0.13170643150806427, + "learning_rate": 1.874499298524336e-05, + "loss": 0.2093, + "num_input_tokens_seen": 136008544, + "step": 111780 + }, + { + "epoch": 12.449604633032632, + "grad_norm": 0.058376241475343704, + "learning_rate": 1.8742640556986974e-05, + "loss": 0.019, + "num_input_tokens_seen": 136014592, + "step": 111785 + }, + { + "epoch": 12.45016148791625, + "grad_norm": 0.27235403656959534, + "learning_rate": 1.874028818783637e-05, + "loss": 0.0486, + "num_input_tokens_seen": 136020384, + "step": 111790 + }, + { + "epoch": 12.450718342799867, + "grad_norm": 0.2731649577617645, + "learning_rate": 1.873793587781376e-05, + "loss": 0.0154, + "num_input_tokens_seen": 136026400, + "step": 111795 + }, + { + "epoch": 12.451275197683483, + "grad_norm": 0.3096867501735687, + "learning_rate": 1.8735583626941364e-05, + "loss": 0.1329, + "num_input_tokens_seen": 136032640, + "step": 111800 + }, + { + "epoch": 12.4518320525671, + "grad_norm": 0.15918059647083282, + "learning_rate": 1.8733231435241395e-05, + "loss": 0.0366, + "num_input_tokens_seen": 136038592, + "step": 111805 + }, + { + "epoch": 12.452388907450718, + "grad_norm": 0.4899330735206604, + "learning_rate": 1.873087930273608e-05, + "loss": 0.0092, + "num_input_tokens_seen": 136044864, + "step": 111810 + }, + { + "epoch": 12.452945762334336, + "grad_norm": 0.013709777034819126, + "learning_rate": 1.8728527229447628e-05, + "loss": 0.007, + "num_input_tokens_seen": 136050816, + "step": 111815 + }, + { + "epoch": 12.453502617217953, + "grad_norm": 0.018924295902252197, + "learning_rate": 1.872617521539827e-05, + "loss": 0.0052, + "num_input_tokens_seen": 136057088, + "step": 111820 + }, + { + "epoch": 12.454059472101571, + "grad_norm": 0.03585728257894516, + "learning_rate": 1.8723823260610206e-05, + "loss": 0.0585, + "num_input_tokens_seen": 136063328, + "step": 111825 + }, + { + "epoch": 12.454616326985187, + "grad_norm": 0.5544184446334839, + "learning_rate": 1.8721471365105665e-05, + "loss": 0.0143, + "num_input_tokens_seen": 136069792, + "step": 111830 + }, + { + "epoch": 12.455173181868805, + "grad_norm": 1.594236135482788, + "learning_rate": 1.8719119528906848e-05, + "loss": 0.0342, + "num_input_tokens_seen": 136075968, + "step": 111835 + }, + { + "epoch": 12.455730036752422, + "grad_norm": 0.11548194289207458, + "learning_rate": 1.871676775203599e-05, + "loss": 0.0118, + "num_input_tokens_seen": 136081536, + "step": 111840 + }, + { + "epoch": 12.45628689163604, + "grad_norm": 0.013396773487329483, + "learning_rate": 1.871441603451528e-05, + "loss": 0.0161, + "num_input_tokens_seen": 136087680, + "step": 111845 + }, + { + "epoch": 12.456843746519658, + "grad_norm": 0.4242329001426697, + "learning_rate": 1.8712064376366965e-05, + "loss": 0.0221, + "num_input_tokens_seen": 136093536, + "step": 111850 + }, + { + "epoch": 12.457400601403274, + "grad_norm": 0.10142125189304352, + "learning_rate": 1.8709712777613225e-05, + "loss": 0.0311, + "num_input_tokens_seen": 136099200, + "step": 111855 + }, + { + "epoch": 12.457957456286891, + "grad_norm": 0.028387051075696945, + "learning_rate": 1.8707361238276294e-05, + "loss": 0.0114, + "num_input_tokens_seen": 136105248, + "step": 111860 + }, + { + "epoch": 12.458514311170509, + "grad_norm": 0.00010864665819099173, + "learning_rate": 1.8705009758378375e-05, + "loss": 0.0166, + "num_input_tokens_seen": 136111552, + "step": 111865 + }, + { + "epoch": 12.459071166054127, + "grad_norm": 0.06454405933618546, + "learning_rate": 1.8702658337941685e-05, + "loss": 0.0024, + "num_input_tokens_seen": 136117280, + "step": 111870 + }, + { + "epoch": 12.459628020937744, + "grad_norm": 0.025950198993086815, + "learning_rate": 1.8700306976988433e-05, + "loss": 0.0799, + "num_input_tokens_seen": 136123712, + "step": 111875 + }, + { + "epoch": 12.46018487582136, + "grad_norm": 0.01991359516978264, + "learning_rate": 1.869795567554083e-05, + "loss": 0.0506, + "num_input_tokens_seen": 136129856, + "step": 111880 + }, + { + "epoch": 12.460741730704978, + "grad_norm": 1.2242099046707153, + "learning_rate": 1.8695604433621077e-05, + "loss": 0.0303, + "num_input_tokens_seen": 136136160, + "step": 111885 + }, + { + "epoch": 12.461298585588596, + "grad_norm": 0.09372227638959885, + "learning_rate": 1.8693253251251403e-05, + "loss": 0.0285, + "num_input_tokens_seen": 136142368, + "step": 111890 + }, + { + "epoch": 12.461855440472213, + "grad_norm": 0.0016326664481312037, + "learning_rate": 1.8690902128453993e-05, + "loss": 0.0597, + "num_input_tokens_seen": 136148288, + "step": 111895 + }, + { + "epoch": 12.462412295355831, + "grad_norm": 0.9061688780784607, + "learning_rate": 1.8688551065251077e-05, + "loss": 0.1188, + "num_input_tokens_seen": 136154592, + "step": 111900 + }, + { + "epoch": 12.462969150239447, + "grad_norm": 1.2354686260223389, + "learning_rate": 1.868620006166484e-05, + "loss": 0.1059, + "num_input_tokens_seen": 136160256, + "step": 111905 + }, + { + "epoch": 12.463526005123065, + "grad_norm": 2.0396969318389893, + "learning_rate": 1.8683849117717518e-05, + "loss": 0.0974, + "num_input_tokens_seen": 136165792, + "step": 111910 + }, + { + "epoch": 12.464082860006682, + "grad_norm": 1.1020857095718384, + "learning_rate": 1.8681498233431288e-05, + "loss": 0.1138, + "num_input_tokens_seen": 136172000, + "step": 111915 + }, + { + "epoch": 12.4646397148903, + "grad_norm": 0.10693889856338501, + "learning_rate": 1.8679147408828378e-05, + "loss": 0.0594, + "num_input_tokens_seen": 136177984, + "step": 111920 + }, + { + "epoch": 12.465196569773918, + "grad_norm": 0.16521212458610535, + "learning_rate": 1.8676796643930987e-05, + "loss": 0.0161, + "num_input_tokens_seen": 136184128, + "step": 111925 + }, + { + "epoch": 12.465753424657533, + "grad_norm": 0.1341559886932373, + "learning_rate": 1.8674445938761308e-05, + "loss": 0.0014, + "num_input_tokens_seen": 136190624, + "step": 111930 + }, + { + "epoch": 12.466310279541151, + "grad_norm": 1.486232042312622, + "learning_rate": 1.8672095293341567e-05, + "loss": 0.0425, + "num_input_tokens_seen": 136196928, + "step": 111935 + }, + { + "epoch": 12.466867134424769, + "grad_norm": 1.4117112159729004, + "learning_rate": 1.8669744707693943e-05, + "loss": 0.0798, + "num_input_tokens_seen": 136202912, + "step": 111940 + }, + { + "epoch": 12.467423989308386, + "grad_norm": 0.00019286328461021185, + "learning_rate": 1.8667394181840657e-05, + "loss": 0.0007, + "num_input_tokens_seen": 136209152, + "step": 111945 + }, + { + "epoch": 12.467980844192004, + "grad_norm": 0.1356058418750763, + "learning_rate": 1.8665043715803907e-05, + "loss": 0.0252, + "num_input_tokens_seen": 136214848, + "step": 111950 + }, + { + "epoch": 12.46853769907562, + "grad_norm": 0.37197765707969666, + "learning_rate": 1.8662693309605898e-05, + "loss": 0.0126, + "num_input_tokens_seen": 136220928, + "step": 111955 + }, + { + "epoch": 12.469094553959238, + "grad_norm": 0.056626491248607635, + "learning_rate": 1.8660342963268822e-05, + "loss": 0.0097, + "num_input_tokens_seen": 136227008, + "step": 111960 + }, + { + "epoch": 12.469651408842855, + "grad_norm": 0.8601911664009094, + "learning_rate": 1.865799267681489e-05, + "loss": 0.0364, + "num_input_tokens_seen": 136233024, + "step": 111965 + }, + { + "epoch": 12.470208263726473, + "grad_norm": 0.024350734427571297, + "learning_rate": 1.86556424502663e-05, + "loss": 0.0152, + "num_input_tokens_seen": 136238848, + "step": 111970 + }, + { + "epoch": 12.47076511861009, + "grad_norm": 2.7831852436065674, + "learning_rate": 1.8653292283645247e-05, + "loss": 0.1573, + "num_input_tokens_seen": 136245024, + "step": 111975 + }, + { + "epoch": 12.471321973493708, + "grad_norm": 0.9227805137634277, + "learning_rate": 1.865094217697393e-05, + "loss": 0.0696, + "num_input_tokens_seen": 136251168, + "step": 111980 + }, + { + "epoch": 12.471878828377324, + "grad_norm": 0.00026879776851274073, + "learning_rate": 1.864859213027456e-05, + "loss": 0.0043, + "num_input_tokens_seen": 136257120, + "step": 111985 + }, + { + "epoch": 12.472435683260942, + "grad_norm": 0.05234936997294426, + "learning_rate": 1.864624214356931e-05, + "loss": 0.0274, + "num_input_tokens_seen": 136263264, + "step": 111990 + }, + { + "epoch": 12.47299253814456, + "grad_norm": 1.5492571592330933, + "learning_rate": 1.8643892216880412e-05, + "loss": 0.0358, + "num_input_tokens_seen": 136269216, + "step": 111995 + }, + { + "epoch": 12.473549393028177, + "grad_norm": 1.2587774991989136, + "learning_rate": 1.864154235023003e-05, + "loss": 0.0993, + "num_input_tokens_seen": 136275456, + "step": 112000 + }, + { + "epoch": 12.474106247911795, + "grad_norm": 0.6599427461624146, + "learning_rate": 1.8639192543640383e-05, + "loss": 0.0104, + "num_input_tokens_seen": 136281632, + "step": 112005 + }, + { + "epoch": 12.474663102795411, + "grad_norm": 0.024848582223057747, + "learning_rate": 1.8636842797133656e-05, + "loss": 0.1266, + "num_input_tokens_seen": 136287712, + "step": 112010 + }, + { + "epoch": 12.475219957679029, + "grad_norm": 3.08538818359375, + "learning_rate": 1.8634493110732048e-05, + "loss": 0.1367, + "num_input_tokens_seen": 136293184, + "step": 112015 + }, + { + "epoch": 12.475776812562646, + "grad_norm": 0.0019811950623989105, + "learning_rate": 1.863214348445775e-05, + "loss": 0.0048, + "num_input_tokens_seen": 136299360, + "step": 112020 + }, + { + "epoch": 12.476333667446264, + "grad_norm": 0.15450190007686615, + "learning_rate": 1.8629793918332965e-05, + "loss": 0.0469, + "num_input_tokens_seen": 136305376, + "step": 112025 + }, + { + "epoch": 12.476890522329882, + "grad_norm": 0.0033804895356297493, + "learning_rate": 1.8627444412379875e-05, + "loss": 0.0035, + "num_input_tokens_seen": 136311520, + "step": 112030 + }, + { + "epoch": 12.477447377213498, + "grad_norm": 0.004730601329356432, + "learning_rate": 1.8625094966620683e-05, + "loss": 0.1089, + "num_input_tokens_seen": 136317856, + "step": 112035 + }, + { + "epoch": 12.478004232097115, + "grad_norm": 0.42358386516571045, + "learning_rate": 1.8622745581077565e-05, + "loss": 0.0077, + "num_input_tokens_seen": 136324192, + "step": 112040 + }, + { + "epoch": 12.478561086980733, + "grad_norm": 1.6277382373809814, + "learning_rate": 1.8620396255772736e-05, + "loss": 0.0282, + "num_input_tokens_seen": 136330336, + "step": 112045 + }, + { + "epoch": 12.47911794186435, + "grad_norm": 0.12888191640377045, + "learning_rate": 1.8618046990728366e-05, + "loss": 0.0922, + "num_input_tokens_seen": 136336800, + "step": 112050 + }, + { + "epoch": 12.479674796747968, + "grad_norm": 0.4997708201408386, + "learning_rate": 1.861569778596667e-05, + "loss": 0.0594, + "num_input_tokens_seen": 136343200, + "step": 112055 + }, + { + "epoch": 12.480231651631584, + "grad_norm": 2.1524171829223633, + "learning_rate": 1.8613348641509803e-05, + "loss": 0.0766, + "num_input_tokens_seen": 136349184, + "step": 112060 + }, + { + "epoch": 12.480788506515202, + "grad_norm": 0.00029863836243748665, + "learning_rate": 1.8610999557379998e-05, + "loss": 0.053, + "num_input_tokens_seen": 136355776, + "step": 112065 + }, + { + "epoch": 12.48134536139882, + "grad_norm": 1.6056944131851196, + "learning_rate": 1.86086505335994e-05, + "loss": 0.1051, + "num_input_tokens_seen": 136361760, + "step": 112070 + }, + { + "epoch": 12.481902216282437, + "grad_norm": 0.23963725566864014, + "learning_rate": 1.860630157019023e-05, + "loss": 0.056, + "num_input_tokens_seen": 136367712, + "step": 112075 + }, + { + "epoch": 12.482459071166055, + "grad_norm": 1.2579647302627563, + "learning_rate": 1.8603952667174658e-05, + "loss": 0.1131, + "num_input_tokens_seen": 136373952, + "step": 112080 + }, + { + "epoch": 12.48301592604967, + "grad_norm": 0.07012702524662018, + "learning_rate": 1.8601603824574888e-05, + "loss": 0.0117, + "num_input_tokens_seen": 136380320, + "step": 112085 + }, + { + "epoch": 12.483572780933288, + "grad_norm": 0.05755920708179474, + "learning_rate": 1.8599255042413085e-05, + "loss": 0.0202, + "num_input_tokens_seen": 136386112, + "step": 112090 + }, + { + "epoch": 12.484129635816906, + "grad_norm": 0.07396094501018524, + "learning_rate": 1.8596906320711456e-05, + "loss": 0.003, + "num_input_tokens_seen": 136392320, + "step": 112095 + }, + { + "epoch": 12.484686490700524, + "grad_norm": 0.17062030732631683, + "learning_rate": 1.8594557659492167e-05, + "loss": 0.0541, + "num_input_tokens_seen": 136398688, + "step": 112100 + }, + { + "epoch": 12.485243345584141, + "grad_norm": 0.03189864382147789, + "learning_rate": 1.859220905877742e-05, + "loss": 0.027, + "num_input_tokens_seen": 136405024, + "step": 112105 + }, + { + "epoch": 12.485800200467757, + "grad_norm": 0.298075407743454, + "learning_rate": 1.858986051858939e-05, + "loss": 0.0576, + "num_input_tokens_seen": 136411136, + "step": 112110 + }, + { + "epoch": 12.486357055351375, + "grad_norm": 0.30067434906959534, + "learning_rate": 1.8587512038950265e-05, + "loss": 0.0085, + "num_input_tokens_seen": 136417152, + "step": 112115 + }, + { + "epoch": 12.486913910234993, + "grad_norm": 0.11750542372465134, + "learning_rate": 1.858516361988222e-05, + "loss": 0.0878, + "num_input_tokens_seen": 136422496, + "step": 112120 + }, + { + "epoch": 12.48747076511861, + "grad_norm": 0.7129728198051453, + "learning_rate": 1.858281526140745e-05, + "loss": 0.1174, + "num_input_tokens_seen": 136428448, + "step": 112125 + }, + { + "epoch": 12.488027620002228, + "grad_norm": 0.4238358736038208, + "learning_rate": 1.8580466963548123e-05, + "loss": 0.1512, + "num_input_tokens_seen": 136434944, + "step": 112130 + }, + { + "epoch": 12.488584474885844, + "grad_norm": 0.0401383601129055, + "learning_rate": 1.857811872632644e-05, + "loss": 0.0326, + "num_input_tokens_seen": 136441056, + "step": 112135 + }, + { + "epoch": 12.489141329769462, + "grad_norm": 0.0002114202652592212, + "learning_rate": 1.8575770549764567e-05, + "loss": 0.0028, + "num_input_tokens_seen": 136447488, + "step": 112140 + }, + { + "epoch": 12.48969818465308, + "grad_norm": 3.4267852306365967, + "learning_rate": 1.8573422433884686e-05, + "loss": 0.1979, + "num_input_tokens_seen": 136453536, + "step": 112145 + }, + { + "epoch": 12.490255039536697, + "grad_norm": 1.1439529657363892, + "learning_rate": 1.857107437870898e-05, + "loss": 0.1777, + "num_input_tokens_seen": 136459424, + "step": 112150 + }, + { + "epoch": 12.490811894420315, + "grad_norm": 1.5203049182891846, + "learning_rate": 1.8568726384259626e-05, + "loss": 0.0851, + "num_input_tokens_seen": 136465472, + "step": 112155 + }, + { + "epoch": 12.49136874930393, + "grad_norm": 0.03215830773115158, + "learning_rate": 1.856637845055881e-05, + "loss": 0.0303, + "num_input_tokens_seen": 136471744, + "step": 112160 + }, + { + "epoch": 12.491925604187548, + "grad_norm": 0.0015015274984762073, + "learning_rate": 1.85640305776287e-05, + "loss": 0.0065, + "num_input_tokens_seen": 136477696, + "step": 112165 + }, + { + "epoch": 12.492482459071166, + "grad_norm": 0.0012965769274160266, + "learning_rate": 1.8561682765491474e-05, + "loss": 0.0082, + "num_input_tokens_seen": 136483808, + "step": 112170 + }, + { + "epoch": 12.493039313954784, + "grad_norm": 9.590721310814843e-05, + "learning_rate": 1.855933501416932e-05, + "loss": 0.003, + "num_input_tokens_seen": 136490112, + "step": 112175 + }, + { + "epoch": 12.493596168838401, + "grad_norm": 0.5255861282348633, + "learning_rate": 1.8556987323684394e-05, + "loss": 0.0235, + "num_input_tokens_seen": 136495968, + "step": 112180 + }, + { + "epoch": 12.494153023722019, + "grad_norm": 0.38323265314102173, + "learning_rate": 1.8554639694058902e-05, + "loss": 0.0097, + "num_input_tokens_seen": 136502240, + "step": 112185 + }, + { + "epoch": 12.494709878605635, + "grad_norm": 0.478344202041626, + "learning_rate": 1.8552292125314984e-05, + "loss": 0.0119, + "num_input_tokens_seen": 136508512, + "step": 112190 + }, + { + "epoch": 12.495266733489252, + "grad_norm": 0.4963894486427307, + "learning_rate": 1.8549944617474845e-05, + "loss": 0.0139, + "num_input_tokens_seen": 136514752, + "step": 112195 + }, + { + "epoch": 12.49582358837287, + "grad_norm": 0.29954108595848083, + "learning_rate": 1.8547597170560645e-05, + "loss": 0.0691, + "num_input_tokens_seen": 136521120, + "step": 112200 + }, + { + "epoch": 12.496380443256488, + "grad_norm": 0.16649089753627777, + "learning_rate": 1.8545249784594558e-05, + "loss": 0.0572, + "num_input_tokens_seen": 136527072, + "step": 112205 + }, + { + "epoch": 12.496937298140105, + "grad_norm": 0.8408333659172058, + "learning_rate": 1.8542902459598756e-05, + "loss": 0.0327, + "num_input_tokens_seen": 136532832, + "step": 112210 + }, + { + "epoch": 12.497494153023721, + "grad_norm": 0.8118954300880432, + "learning_rate": 1.8540555195595422e-05, + "loss": 0.0407, + "num_input_tokens_seen": 136538912, + "step": 112215 + }, + { + "epoch": 12.498051007907339, + "grad_norm": 0.0165545716881752, + "learning_rate": 1.853820799260671e-05, + "loss": 0.0738, + "num_input_tokens_seen": 136544736, + "step": 112220 + }, + { + "epoch": 12.498607862790957, + "grad_norm": 0.0023543343413621187, + "learning_rate": 1.8535860850654806e-05, + "loss": 0.0995, + "num_input_tokens_seen": 136550112, + "step": 112225 + }, + { + "epoch": 12.499164717674574, + "grad_norm": 1.0813636779785156, + "learning_rate": 1.8533513769761872e-05, + "loss": 0.0426, + "num_input_tokens_seen": 136555904, + "step": 112230 + }, + { + "epoch": 12.499721572558192, + "grad_norm": 0.006386066786944866, + "learning_rate": 1.8531166749950086e-05, + "loss": 0.0469, + "num_input_tokens_seen": 136561952, + "step": 112235 + }, + { + "epoch": 12.500278427441808, + "grad_norm": 0.009908763691782951, + "learning_rate": 1.8528819791241604e-05, + "loss": 0.0056, + "num_input_tokens_seen": 136568064, + "step": 112240 + }, + { + "epoch": 12.500835282325426, + "grad_norm": 0.0405423603951931, + "learning_rate": 1.8526472893658614e-05, + "loss": 0.0218, + "num_input_tokens_seen": 136574528, + "step": 112245 + }, + { + "epoch": 12.501392137209043, + "grad_norm": 1.0640034675598145, + "learning_rate": 1.852412605722326e-05, + "loss": 0.0283, + "num_input_tokens_seen": 136580608, + "step": 112250 + }, + { + "epoch": 12.501948992092661, + "grad_norm": 0.005944446194916964, + "learning_rate": 1.8521779281957744e-05, + "loss": 0.0027, + "num_input_tokens_seen": 136587136, + "step": 112255 + }, + { + "epoch": 12.502505846976279, + "grad_norm": 0.6921203136444092, + "learning_rate": 1.851943256788419e-05, + "loss": 0.1447, + "num_input_tokens_seen": 136593152, + "step": 112260 + }, + { + "epoch": 12.503062701859895, + "grad_norm": 0.03781360760331154, + "learning_rate": 1.8517085915024805e-05, + "loss": 0.0242, + "num_input_tokens_seen": 136599296, + "step": 112265 + }, + { + "epoch": 12.503619556743512, + "grad_norm": 0.0008082740823738277, + "learning_rate": 1.851473932340173e-05, + "loss": 0.0024, + "num_input_tokens_seen": 136605280, + "step": 112270 + }, + { + "epoch": 12.50417641162713, + "grad_norm": 0.029035571962594986, + "learning_rate": 1.8512392793037145e-05, + "loss": 0.0133, + "num_input_tokens_seen": 136610976, + "step": 112275 + }, + { + "epoch": 12.504733266510748, + "grad_norm": 0.027698248624801636, + "learning_rate": 1.8510046323953202e-05, + "loss": 0.0249, + "num_input_tokens_seen": 136617376, + "step": 112280 + }, + { + "epoch": 12.505290121394365, + "grad_norm": 0.07233038544654846, + "learning_rate": 1.8507699916172072e-05, + "loss": 0.0137, + "num_input_tokens_seen": 136623584, + "step": 112285 + }, + { + "epoch": 12.505846976277981, + "grad_norm": 0.5304548144340515, + "learning_rate": 1.8505353569715918e-05, + "loss": 0.0609, + "num_input_tokens_seen": 136628896, + "step": 112290 + }, + { + "epoch": 12.506403831161599, + "grad_norm": 0.09041086584329605, + "learning_rate": 1.8503007284606904e-05, + "loss": 0.058, + "num_input_tokens_seen": 136635168, + "step": 112295 + }, + { + "epoch": 12.506960686045216, + "grad_norm": 0.18699154257774353, + "learning_rate": 1.850066106086719e-05, + "loss": 0.0412, + "num_input_tokens_seen": 136641216, + "step": 112300 + }, + { + "epoch": 12.507517540928834, + "grad_norm": 0.028240129351615906, + "learning_rate": 1.8498314898518944e-05, + "loss": 0.0329, + "num_input_tokens_seen": 136647392, + "step": 112305 + }, + { + "epoch": 12.508074395812452, + "grad_norm": 1.525619626045227, + "learning_rate": 1.8495968797584305e-05, + "loss": 0.0568, + "num_input_tokens_seen": 136653248, + "step": 112310 + }, + { + "epoch": 12.50863125069607, + "grad_norm": 0.026581883430480957, + "learning_rate": 1.8493622758085478e-05, + "loss": 0.1579, + "num_input_tokens_seen": 136659360, + "step": 112315 + }, + { + "epoch": 12.509188105579685, + "grad_norm": 0.046343035995960236, + "learning_rate": 1.8491276780044575e-05, + "loss": 0.0752, + "num_input_tokens_seen": 136665600, + "step": 112320 + }, + { + "epoch": 12.509744960463303, + "grad_norm": 0.0001443246437702328, + "learning_rate": 1.8488930863483788e-05, + "loss": 0.0045, + "num_input_tokens_seen": 136671808, + "step": 112325 + }, + { + "epoch": 12.51030181534692, + "grad_norm": 0.0009668271522969007, + "learning_rate": 1.8486585008425263e-05, + "loss": 0.0078, + "num_input_tokens_seen": 136677984, + "step": 112330 + }, + { + "epoch": 12.510858670230538, + "grad_norm": 0.15008845925331116, + "learning_rate": 1.8484239214891166e-05, + "loss": 0.0061, + "num_input_tokens_seen": 136684000, + "step": 112335 + }, + { + "epoch": 12.511415525114156, + "grad_norm": 0.0005023068515583873, + "learning_rate": 1.8481893482903655e-05, + "loss": 0.0705, + "num_input_tokens_seen": 136690208, + "step": 112340 + }, + { + "epoch": 12.511972379997772, + "grad_norm": 0.3326208293437958, + "learning_rate": 1.8479547812484865e-05, + "loss": 0.1194, + "num_input_tokens_seen": 136696576, + "step": 112345 + }, + { + "epoch": 12.51252923488139, + "grad_norm": 0.09834080934524536, + "learning_rate": 1.8477202203656982e-05, + "loss": 0.0044, + "num_input_tokens_seen": 136703008, + "step": 112350 + }, + { + "epoch": 12.513086089765007, + "grad_norm": 0.9040029644966125, + "learning_rate": 1.8474856656442143e-05, + "loss": 0.035, + "num_input_tokens_seen": 136709440, + "step": 112355 + }, + { + "epoch": 12.513642944648625, + "grad_norm": 0.0675249770283699, + "learning_rate": 1.847251117086252e-05, + "loss": 0.0534, + "num_input_tokens_seen": 136715552, + "step": 112360 + }, + { + "epoch": 12.514199799532243, + "grad_norm": 0.5804081559181213, + "learning_rate": 1.847016574694025e-05, + "loss": 0.0247, + "num_input_tokens_seen": 136721760, + "step": 112365 + }, + { + "epoch": 12.514756654415859, + "grad_norm": 0.0038653784431517124, + "learning_rate": 1.8467820384697504e-05, + "loss": 0.0931, + "num_input_tokens_seen": 136727680, + "step": 112370 + }, + { + "epoch": 12.515313509299476, + "grad_norm": 0.0469658300280571, + "learning_rate": 1.846547508415642e-05, + "loss": 0.0208, + "num_input_tokens_seen": 136733952, + "step": 112375 + }, + { + "epoch": 12.515870364183094, + "grad_norm": 1.013278841972351, + "learning_rate": 1.8463129845339165e-05, + "loss": 0.0594, + "num_input_tokens_seen": 136739968, + "step": 112380 + }, + { + "epoch": 12.516427219066712, + "grad_norm": 1.561698079109192, + "learning_rate": 1.846078466826788e-05, + "loss": 0.0808, + "num_input_tokens_seen": 136745856, + "step": 112385 + }, + { + "epoch": 12.51698407395033, + "grad_norm": 1.890106201171875, + "learning_rate": 1.845843955296473e-05, + "loss": 0.1066, + "num_input_tokens_seen": 136751744, + "step": 112390 + }, + { + "epoch": 12.517540928833945, + "grad_norm": 0.08334828168153763, + "learning_rate": 1.8456094499451847e-05, + "loss": 0.0733, + "num_input_tokens_seen": 136757696, + "step": 112395 + }, + { + "epoch": 12.518097783717563, + "grad_norm": 0.5666133165359497, + "learning_rate": 1.845374950775141e-05, + "loss": 0.0121, + "num_input_tokens_seen": 136763584, + "step": 112400 + }, + { + "epoch": 12.51865463860118, + "grad_norm": 0.712015688419342, + "learning_rate": 1.845140457788554e-05, + "loss": 0.0819, + "num_input_tokens_seen": 136769568, + "step": 112405 + }, + { + "epoch": 12.519211493484798, + "grad_norm": 0.08737099915742874, + "learning_rate": 1.844905970987641e-05, + "loss": 0.0297, + "num_input_tokens_seen": 136775712, + "step": 112410 + }, + { + "epoch": 12.519768348368416, + "grad_norm": 3.248687744140625, + "learning_rate": 1.8446714903746152e-05, + "loss": 0.1779, + "num_input_tokens_seen": 136781664, + "step": 112415 + }, + { + "epoch": 12.520325203252032, + "grad_norm": 0.140777587890625, + "learning_rate": 1.844437015951693e-05, + "loss": 0.0036, + "num_input_tokens_seen": 136787712, + "step": 112420 + }, + { + "epoch": 12.52088205813565, + "grad_norm": 0.9489235281944275, + "learning_rate": 1.8442025477210884e-05, + "loss": 0.0458, + "num_input_tokens_seen": 136793984, + "step": 112425 + }, + { + "epoch": 12.521438913019267, + "grad_norm": 0.07452831417322159, + "learning_rate": 1.8439680856850162e-05, + "loss": 0.011, + "num_input_tokens_seen": 136799872, + "step": 112430 + }, + { + "epoch": 12.521995767902885, + "grad_norm": 1.3388203382492065, + "learning_rate": 1.8437336298456903e-05, + "loss": 0.1259, + "num_input_tokens_seen": 136805440, + "step": 112435 + }, + { + "epoch": 12.522552622786502, + "grad_norm": 6.159887561807409e-05, + "learning_rate": 1.8434991802053266e-05, + "loss": 0.0644, + "num_input_tokens_seen": 136811552, + "step": 112440 + }, + { + "epoch": 12.523109477670118, + "grad_norm": 0.011216765269637108, + "learning_rate": 1.843264736766139e-05, + "loss": 0.0379, + "num_input_tokens_seen": 136817568, + "step": 112445 + }, + { + "epoch": 12.523666332553736, + "grad_norm": 0.08955149352550507, + "learning_rate": 1.8430302995303423e-05, + "loss": 0.0421, + "num_input_tokens_seen": 136823840, + "step": 112450 + }, + { + "epoch": 12.524223187437354, + "grad_norm": 0.43513038754463196, + "learning_rate": 1.8427958685001496e-05, + "loss": 0.0217, + "num_input_tokens_seen": 136829952, + "step": 112455 + }, + { + "epoch": 12.524780042320971, + "grad_norm": 0.0401003323495388, + "learning_rate": 1.8425614436777784e-05, + "loss": 0.0033, + "num_input_tokens_seen": 136836160, + "step": 112460 + }, + { + "epoch": 12.525336897204589, + "grad_norm": 0.12522552907466888, + "learning_rate": 1.842327025065439e-05, + "loss": 0.0923, + "num_input_tokens_seen": 136841696, + "step": 112465 + }, + { + "epoch": 12.525893752088205, + "grad_norm": 1.1397480964660645, + "learning_rate": 1.8420926126653495e-05, + "loss": 0.0681, + "num_input_tokens_seen": 136847840, + "step": 112470 + }, + { + "epoch": 12.526450606971823, + "grad_norm": 0.050942253321409225, + "learning_rate": 1.8418582064797215e-05, + "loss": 0.1891, + "num_input_tokens_seen": 136853920, + "step": 112475 + }, + { + "epoch": 12.52700746185544, + "grad_norm": 0.13589373230934143, + "learning_rate": 1.8416238065107705e-05, + "loss": 0.0586, + "num_input_tokens_seen": 136860128, + "step": 112480 + }, + { + "epoch": 12.527564316739058, + "grad_norm": 1.9802964925765991, + "learning_rate": 1.84138941276071e-05, + "loss": 0.108, + "num_input_tokens_seen": 136866240, + "step": 112485 + }, + { + "epoch": 12.528121171622676, + "grad_norm": 0.8028169870376587, + "learning_rate": 1.8411550252317544e-05, + "loss": 0.0092, + "num_input_tokens_seen": 136872128, + "step": 112490 + }, + { + "epoch": 12.528678026506292, + "grad_norm": 0.0022659164387732744, + "learning_rate": 1.840920643926117e-05, + "loss": 0.1484, + "num_input_tokens_seen": 136877600, + "step": 112495 + }, + { + "epoch": 12.52923488138991, + "grad_norm": 1.816380500793457, + "learning_rate": 1.8406862688460124e-05, + "loss": 0.0252, + "num_input_tokens_seen": 136884128, + "step": 112500 + }, + { + "epoch": 12.529791736273527, + "grad_norm": 0.009362556971609592, + "learning_rate": 1.840451899993654e-05, + "loss": 0.0089, + "num_input_tokens_seen": 136890208, + "step": 112505 + }, + { + "epoch": 12.530348591157145, + "grad_norm": 0.6776344776153564, + "learning_rate": 1.840217537371256e-05, + "loss": 0.0387, + "num_input_tokens_seen": 136896320, + "step": 112510 + }, + { + "epoch": 12.530905446040762, + "grad_norm": 0.00048092028009705245, + "learning_rate": 1.8399831809810312e-05, + "loss": 0.0108, + "num_input_tokens_seen": 136902176, + "step": 112515 + }, + { + "epoch": 12.531462300924378, + "grad_norm": 1.2111064195632935, + "learning_rate": 1.8397488308251958e-05, + "loss": 0.0463, + "num_input_tokens_seen": 136908288, + "step": 112520 + }, + { + "epoch": 12.532019155807996, + "grad_norm": 0.3960718810558319, + "learning_rate": 1.83951448690596e-05, + "loss": 0.0433, + "num_input_tokens_seen": 136914336, + "step": 112525 + }, + { + "epoch": 12.532576010691614, + "grad_norm": 0.06472911685705185, + "learning_rate": 1.8392801492255404e-05, + "loss": 0.074, + "num_input_tokens_seen": 136920480, + "step": 112530 + }, + { + "epoch": 12.533132865575231, + "grad_norm": 0.004843192175030708, + "learning_rate": 1.8390458177861474e-05, + "loss": 0.0393, + "num_input_tokens_seen": 136926912, + "step": 112535 + }, + { + "epoch": 12.533689720458849, + "grad_norm": 1.1039412021636963, + "learning_rate": 1.838811492589998e-05, + "loss": 0.0214, + "num_input_tokens_seen": 136933248, + "step": 112540 + }, + { + "epoch": 12.534246575342467, + "grad_norm": 0.09235051274299622, + "learning_rate": 1.8385771736393026e-05, + "loss": 0.0731, + "num_input_tokens_seen": 136939168, + "step": 112545 + }, + { + "epoch": 12.534803430226082, + "grad_norm": 0.0035699347499758005, + "learning_rate": 1.8383428609362767e-05, + "loss": 0.0133, + "num_input_tokens_seen": 136945568, + "step": 112550 + }, + { + "epoch": 12.5353602851097, + "grad_norm": 0.6476581692695618, + "learning_rate": 1.8381085544831316e-05, + "loss": 0.0189, + "num_input_tokens_seen": 136951840, + "step": 112555 + }, + { + "epoch": 12.535917139993318, + "grad_norm": 0.09262805432081223, + "learning_rate": 1.8378742542820823e-05, + "loss": 0.0478, + "num_input_tokens_seen": 136958176, + "step": 112560 + }, + { + "epoch": 12.536473994876935, + "grad_norm": 0.11419056355953217, + "learning_rate": 1.8376399603353406e-05, + "loss": 0.0358, + "num_input_tokens_seen": 136963968, + "step": 112565 + }, + { + "epoch": 12.537030849760553, + "grad_norm": 0.058105118572711945, + "learning_rate": 1.8374056726451204e-05, + "loss": 0.0026, + "num_input_tokens_seen": 136969920, + "step": 112570 + }, + { + "epoch": 12.537587704644169, + "grad_norm": 0.02061164751648903, + "learning_rate": 1.8371713912136345e-05, + "loss": 0.0716, + "num_input_tokens_seen": 136976000, + "step": 112575 + }, + { + "epoch": 12.538144559527787, + "grad_norm": 1.0251247882843018, + "learning_rate": 1.836937116043096e-05, + "loss": 0.1207, + "num_input_tokens_seen": 136982304, + "step": 112580 + }, + { + "epoch": 12.538701414411404, + "grad_norm": 0.07758764922618866, + "learning_rate": 1.8367028471357168e-05, + "loss": 0.1635, + "num_input_tokens_seen": 136988384, + "step": 112585 + }, + { + "epoch": 12.539258269295022, + "grad_norm": 2.2911536693573, + "learning_rate": 1.836468584493712e-05, + "loss": 0.1122, + "num_input_tokens_seen": 136994176, + "step": 112590 + }, + { + "epoch": 12.53981512417864, + "grad_norm": 0.005834223236888647, + "learning_rate": 1.836234328119292e-05, + "loss": 0.0072, + "num_input_tokens_seen": 137000384, + "step": 112595 + }, + { + "epoch": 12.540371979062256, + "grad_norm": 0.05796588212251663, + "learning_rate": 1.8360000780146706e-05, + "loss": 0.0252, + "num_input_tokens_seen": 137006240, + "step": 112600 + }, + { + "epoch": 12.540928833945873, + "grad_norm": 0.002057618461549282, + "learning_rate": 1.8357658341820607e-05, + "loss": 0.0474, + "num_input_tokens_seen": 137012736, + "step": 112605 + }, + { + "epoch": 12.541485688829491, + "grad_norm": 0.2704899311065674, + "learning_rate": 1.8355315966236752e-05, + "loss": 0.0717, + "num_input_tokens_seen": 137018496, + "step": 112610 + }, + { + "epoch": 12.542042543713109, + "grad_norm": 0.004609673749655485, + "learning_rate": 1.8352973653417256e-05, + "loss": 0.0034, + "num_input_tokens_seen": 137024672, + "step": 112615 + }, + { + "epoch": 12.542599398596726, + "grad_norm": 0.8833907842636108, + "learning_rate": 1.835063140338425e-05, + "loss": 0.0613, + "num_input_tokens_seen": 137030784, + "step": 112620 + }, + { + "epoch": 12.543156253480342, + "grad_norm": 0.02703004516661167, + "learning_rate": 1.8348289216159855e-05, + "loss": 0.1733, + "num_input_tokens_seen": 137036832, + "step": 112625 + }, + { + "epoch": 12.54371310836396, + "grad_norm": 1.8763855695724487, + "learning_rate": 1.8345947091766203e-05, + "loss": 0.0417, + "num_input_tokens_seen": 137043008, + "step": 112630 + }, + { + "epoch": 12.544269963247578, + "grad_norm": 0.00021220764028839767, + "learning_rate": 1.8343605030225407e-05, + "loss": 0.0274, + "num_input_tokens_seen": 137049216, + "step": 112635 + }, + { + "epoch": 12.544826818131195, + "grad_norm": 0.40108975768089294, + "learning_rate": 1.8341263031559603e-05, + "loss": 0.0546, + "num_input_tokens_seen": 137055520, + "step": 112640 + }, + { + "epoch": 12.545383673014813, + "grad_norm": 0.012002326548099518, + "learning_rate": 1.833892109579089e-05, + "loss": 0.051, + "num_input_tokens_seen": 137061152, + "step": 112645 + }, + { + "epoch": 12.545940527898429, + "grad_norm": 0.2311740219593048, + "learning_rate": 1.833657922294142e-05, + "loss": 0.0116, + "num_input_tokens_seen": 137067776, + "step": 112650 + }, + { + "epoch": 12.546497382782047, + "grad_norm": 0.6060232520103455, + "learning_rate": 1.8334237413033285e-05, + "loss": 0.0187, + "num_input_tokens_seen": 137073920, + "step": 112655 + }, + { + "epoch": 12.547054237665664, + "grad_norm": 0.48733440041542053, + "learning_rate": 1.8331895666088627e-05, + "loss": 0.0106, + "num_input_tokens_seen": 137080032, + "step": 112660 + }, + { + "epoch": 12.547611092549282, + "grad_norm": 0.005407698918133974, + "learning_rate": 1.832955398212955e-05, + "loss": 0.0636, + "num_input_tokens_seen": 137086336, + "step": 112665 + }, + { + "epoch": 12.5481679474329, + "grad_norm": 0.9635187983512878, + "learning_rate": 1.8327212361178185e-05, + "loss": 0.0595, + "num_input_tokens_seen": 137092608, + "step": 112670 + }, + { + "epoch": 12.548724802316517, + "grad_norm": 0.3388013541698456, + "learning_rate": 1.8324870803256637e-05, + "loss": 0.0877, + "num_input_tokens_seen": 137099136, + "step": 112675 + }, + { + "epoch": 12.549281657200133, + "grad_norm": 0.2662105858325958, + "learning_rate": 1.8322529308387043e-05, + "loss": 0.0871, + "num_input_tokens_seen": 137104992, + "step": 112680 + }, + { + "epoch": 12.54983851208375, + "grad_norm": 0.5934346318244934, + "learning_rate": 1.8320187876591506e-05, + "loss": 0.0086, + "num_input_tokens_seen": 137111104, + "step": 112685 + }, + { + "epoch": 12.550395366967368, + "grad_norm": 1.0250039100646973, + "learning_rate": 1.8317846507892148e-05, + "loss": 0.0416, + "num_input_tokens_seen": 137117184, + "step": 112690 + }, + { + "epoch": 12.550952221850986, + "grad_norm": 1.265897512435913, + "learning_rate": 1.831550520231108e-05, + "loss": 0.0364, + "num_input_tokens_seen": 137123296, + "step": 112695 + }, + { + "epoch": 12.551509076734604, + "grad_norm": 0.3363363742828369, + "learning_rate": 1.8313163959870423e-05, + "loss": 0.0315, + "num_input_tokens_seen": 137129472, + "step": 112700 + }, + { + "epoch": 12.55206593161822, + "grad_norm": 0.06462794542312622, + "learning_rate": 1.8310822780592284e-05, + "loss": 0.0147, + "num_input_tokens_seen": 137135552, + "step": 112705 + }, + { + "epoch": 12.552622786501837, + "grad_norm": 0.07439647614955902, + "learning_rate": 1.8308481664498795e-05, + "loss": 0.021, + "num_input_tokens_seen": 137141344, + "step": 112710 + }, + { + "epoch": 12.553179641385455, + "grad_norm": 0.00018537932191975415, + "learning_rate": 1.8306140611612042e-05, + "loss": 0.0307, + "num_input_tokens_seen": 137147296, + "step": 112715 + }, + { + "epoch": 12.553736496269073, + "grad_norm": 0.07718107104301453, + "learning_rate": 1.8303799621954172e-05, + "loss": 0.0039, + "num_input_tokens_seen": 137153536, + "step": 112720 + }, + { + "epoch": 12.55429335115269, + "grad_norm": 0.7406341433525085, + "learning_rate": 1.830145869554726e-05, + "loss": 0.0573, + "num_input_tokens_seen": 137159424, + "step": 112725 + }, + { + "epoch": 12.554850206036306, + "grad_norm": 1.5818132162094116, + "learning_rate": 1.8299117832413452e-05, + "loss": 0.1013, + "num_input_tokens_seen": 137165312, + "step": 112730 + }, + { + "epoch": 12.555407060919924, + "grad_norm": 0.01755383051931858, + "learning_rate": 1.8296777032574835e-05, + "loss": 0.0471, + "num_input_tokens_seen": 137171232, + "step": 112735 + }, + { + "epoch": 12.555963915803542, + "grad_norm": 0.00043321962584741414, + "learning_rate": 1.829443629605354e-05, + "loss": 0.1059, + "num_input_tokens_seen": 137177376, + "step": 112740 + }, + { + "epoch": 12.55652077068716, + "grad_norm": 0.0022940116468816996, + "learning_rate": 1.8292095622871658e-05, + "loss": 0.1935, + "num_input_tokens_seen": 137183296, + "step": 112745 + }, + { + "epoch": 12.557077625570777, + "grad_norm": 0.003350670449435711, + "learning_rate": 1.8289755013051313e-05, + "loss": 0.0329, + "num_input_tokens_seen": 137189600, + "step": 112750 + }, + { + "epoch": 12.557634480454393, + "grad_norm": 0.4695670008659363, + "learning_rate": 1.828741446661461e-05, + "loss": 0.1857, + "num_input_tokens_seen": 137195936, + "step": 112755 + }, + { + "epoch": 12.55819133533801, + "grad_norm": 0.6048911809921265, + "learning_rate": 1.828507398358365e-05, + "loss": 0.0071, + "num_input_tokens_seen": 137202176, + "step": 112760 + }, + { + "epoch": 12.558748190221628, + "grad_norm": 0.006909705698490143, + "learning_rate": 1.8282733563980548e-05, + "loss": 0.035, + "num_input_tokens_seen": 137208544, + "step": 112765 + }, + { + "epoch": 12.559305045105246, + "grad_norm": 0.004148916807025671, + "learning_rate": 1.8280393207827407e-05, + "loss": 0.0354, + "num_input_tokens_seen": 137214688, + "step": 112770 + }, + { + "epoch": 12.559861899988864, + "grad_norm": 0.061518602073192596, + "learning_rate": 1.827805291514634e-05, + "loss": 0.0213, + "num_input_tokens_seen": 137220736, + "step": 112775 + }, + { + "epoch": 12.56041875487248, + "grad_norm": 0.010300654917955399, + "learning_rate": 1.8275712685959442e-05, + "loss": 0.0439, + "num_input_tokens_seen": 137226752, + "step": 112780 + }, + { + "epoch": 12.560975609756097, + "grad_norm": 0.0011112965876236558, + "learning_rate": 1.8273372520288833e-05, + "loss": 0.1049, + "num_input_tokens_seen": 137232416, + "step": 112785 + }, + { + "epoch": 12.561532464639715, + "grad_norm": 0.002354774158447981, + "learning_rate": 1.8271032418156604e-05, + "loss": 0.0081, + "num_input_tokens_seen": 137238688, + "step": 112790 + }, + { + "epoch": 12.562089319523333, + "grad_norm": 1.0693672895431519, + "learning_rate": 1.8268692379584867e-05, + "loss": 0.101, + "num_input_tokens_seen": 137244768, + "step": 112795 + }, + { + "epoch": 12.56264617440695, + "grad_norm": 0.233841210603714, + "learning_rate": 1.8266352404595716e-05, + "loss": 0.0333, + "num_input_tokens_seen": 137250944, + "step": 112800 + }, + { + "epoch": 12.563203029290566, + "grad_norm": 0.0003120290639344603, + "learning_rate": 1.826401249321128e-05, + "loss": 0.1363, + "num_input_tokens_seen": 137257088, + "step": 112805 + }, + { + "epoch": 12.563759884174184, + "grad_norm": 1.1577775478363037, + "learning_rate": 1.8261672645453622e-05, + "loss": 0.1079, + "num_input_tokens_seen": 137262976, + "step": 112810 + }, + { + "epoch": 12.564316739057801, + "grad_norm": 0.1749761700630188, + "learning_rate": 1.8259332861344877e-05, + "loss": 0.0119, + "num_input_tokens_seen": 137269280, + "step": 112815 + }, + { + "epoch": 12.56487359394142, + "grad_norm": 0.08578736335039139, + "learning_rate": 1.825699314090713e-05, + "loss": 0.0288, + "num_input_tokens_seen": 137275616, + "step": 112820 + }, + { + "epoch": 12.565430448825037, + "grad_norm": 0.7454496622085571, + "learning_rate": 1.8254653484162486e-05, + "loss": 0.0125, + "num_input_tokens_seen": 137281824, + "step": 112825 + }, + { + "epoch": 12.565987303708653, + "grad_norm": 0.3998029828071594, + "learning_rate": 1.8252313891133044e-05, + "loss": 0.0414, + "num_input_tokens_seen": 137288032, + "step": 112830 + }, + { + "epoch": 12.56654415859227, + "grad_norm": 0.009549027308821678, + "learning_rate": 1.8249974361840903e-05, + "loss": 0.0283, + "num_input_tokens_seen": 137294144, + "step": 112835 + }, + { + "epoch": 12.567101013475888, + "grad_norm": 1.1671332120895386, + "learning_rate": 1.8247634896308165e-05, + "loss": 0.0317, + "num_input_tokens_seen": 137300608, + "step": 112840 + }, + { + "epoch": 12.567657868359506, + "grad_norm": 0.09177068620920181, + "learning_rate": 1.8245295494556923e-05, + "loss": 0.1401, + "num_input_tokens_seen": 137306528, + "step": 112845 + }, + { + "epoch": 12.568214723243123, + "grad_norm": 0.7813521027565002, + "learning_rate": 1.8242956156609274e-05, + "loss": 0.0182, + "num_input_tokens_seen": 137312224, + "step": 112850 + }, + { + "epoch": 12.56877157812674, + "grad_norm": 0.656183660030365, + "learning_rate": 1.8240616882487327e-05, + "loss": 0.0318, + "num_input_tokens_seen": 137318240, + "step": 112855 + }, + { + "epoch": 12.569328433010357, + "grad_norm": 0.24064460396766663, + "learning_rate": 1.823827767221315e-05, + "loss": 0.0482, + "num_input_tokens_seen": 137324448, + "step": 112860 + }, + { + "epoch": 12.569885287893975, + "grad_norm": 2.768477439880371, + "learning_rate": 1.8235938525808882e-05, + "loss": 0.0795, + "num_input_tokens_seen": 137330688, + "step": 112865 + }, + { + "epoch": 12.570442142777592, + "grad_norm": 2.3166720867156982, + "learning_rate": 1.823359944329657e-05, + "loss": 0.1066, + "num_input_tokens_seen": 137337120, + "step": 112870 + }, + { + "epoch": 12.57099899766121, + "grad_norm": 0.6765023469924927, + "learning_rate": 1.8231260424698356e-05, + "loss": 0.0398, + "num_input_tokens_seen": 137343328, + "step": 112875 + }, + { + "epoch": 12.571555852544826, + "grad_norm": 0.12117336690425873, + "learning_rate": 1.822892147003629e-05, + "loss": 0.0555, + "num_input_tokens_seen": 137349344, + "step": 112880 + }, + { + "epoch": 12.572112707428444, + "grad_norm": 0.004475402645766735, + "learning_rate": 1.82265825793325e-05, + "loss": 0.0034, + "num_input_tokens_seen": 137355584, + "step": 112885 + }, + { + "epoch": 12.572669562312061, + "grad_norm": 0.07766925543546677, + "learning_rate": 1.8224243752609057e-05, + "loss": 0.0768, + "num_input_tokens_seen": 137361088, + "step": 112890 + }, + { + "epoch": 12.573226417195679, + "grad_norm": 0.8492938876152039, + "learning_rate": 1.8221904989888066e-05, + "loss": 0.0929, + "num_input_tokens_seen": 137367008, + "step": 112895 + }, + { + "epoch": 12.573783272079297, + "grad_norm": 0.022975943982601166, + "learning_rate": 1.8219566291191605e-05, + "loss": 0.098, + "num_input_tokens_seen": 137373312, + "step": 112900 + }, + { + "epoch": 12.574340126962914, + "grad_norm": 1.4250645637512207, + "learning_rate": 1.821722765654178e-05, + "loss": 0.0444, + "num_input_tokens_seen": 137379392, + "step": 112905 + }, + { + "epoch": 12.57489698184653, + "grad_norm": 1.547534465789795, + "learning_rate": 1.821488908596067e-05, + "loss": 0.0861, + "num_input_tokens_seen": 137385408, + "step": 112910 + }, + { + "epoch": 12.575453836730148, + "grad_norm": 3.9571280479431152, + "learning_rate": 1.8212550579470373e-05, + "loss": 0.0554, + "num_input_tokens_seen": 137391488, + "step": 112915 + }, + { + "epoch": 12.576010691613766, + "grad_norm": 1.2038201093673706, + "learning_rate": 1.8210212137092964e-05, + "loss": 0.0264, + "num_input_tokens_seen": 137397632, + "step": 112920 + }, + { + "epoch": 12.576567546497383, + "grad_norm": 0.00013977238268125802, + "learning_rate": 1.8207873758850555e-05, + "loss": 0.1084, + "num_input_tokens_seen": 137403808, + "step": 112925 + }, + { + "epoch": 12.577124401381, + "grad_norm": 1.6044554710388184, + "learning_rate": 1.8205535444765203e-05, + "loss": 0.0657, + "num_input_tokens_seen": 137410208, + "step": 112930 + }, + { + "epoch": 12.577681256264617, + "grad_norm": 0.9910964965820312, + "learning_rate": 1.8203197194859036e-05, + "loss": 0.1269, + "num_input_tokens_seen": 137416704, + "step": 112935 + }, + { + "epoch": 12.578238111148234, + "grad_norm": 1.0946528911590576, + "learning_rate": 1.8200859009154093e-05, + "loss": 0.1661, + "num_input_tokens_seen": 137423296, + "step": 112940 + }, + { + "epoch": 12.578794966031852, + "grad_norm": 0.000807004573289305, + "learning_rate": 1.8198520887672497e-05, + "loss": 0.0036, + "num_input_tokens_seen": 137429056, + "step": 112945 + }, + { + "epoch": 12.57935182091547, + "grad_norm": 0.17290453612804413, + "learning_rate": 1.8196182830436314e-05, + "loss": 0.0594, + "num_input_tokens_seen": 137435008, + "step": 112950 + }, + { + "epoch": 12.579908675799087, + "grad_norm": 0.0005720295011997223, + "learning_rate": 1.8193844837467644e-05, + "loss": 0.0024, + "num_input_tokens_seen": 137441248, + "step": 112955 + }, + { + "epoch": 12.580465530682703, + "grad_norm": 0.005708940792828798, + "learning_rate": 1.8191506908788554e-05, + "loss": 0.0172, + "num_input_tokens_seen": 137447168, + "step": 112960 + }, + { + "epoch": 12.581022385566321, + "grad_norm": 0.04215884953737259, + "learning_rate": 1.8189169044421146e-05, + "loss": 0.0674, + "num_input_tokens_seen": 137453536, + "step": 112965 + }, + { + "epoch": 12.581579240449939, + "grad_norm": 1.6291873455047607, + "learning_rate": 1.8186831244387482e-05, + "loss": 0.1239, + "num_input_tokens_seen": 137459648, + "step": 112970 + }, + { + "epoch": 12.582136095333556, + "grad_norm": 0.004484100267291069, + "learning_rate": 1.8184493508709664e-05, + "loss": 0.0769, + "num_input_tokens_seen": 137465632, + "step": 112975 + }, + { + "epoch": 12.582692950217174, + "grad_norm": 0.7844930291175842, + "learning_rate": 1.818215583740976e-05, + "loss": 0.0335, + "num_input_tokens_seen": 137471840, + "step": 112980 + }, + { + "epoch": 12.58324980510079, + "grad_norm": 0.2682201862335205, + "learning_rate": 1.817981823050986e-05, + "loss": 0.0446, + "num_input_tokens_seen": 137478272, + "step": 112985 + }, + { + "epoch": 12.583806659984408, + "grad_norm": 0.0008898635860532522, + "learning_rate": 1.817748068803203e-05, + "loss": 0.0859, + "num_input_tokens_seen": 137484640, + "step": 112990 + }, + { + "epoch": 12.584363514868025, + "grad_norm": 0.0005122101283632219, + "learning_rate": 1.817514320999838e-05, + "loss": 0.0019, + "num_input_tokens_seen": 137491136, + "step": 112995 + }, + { + "epoch": 12.584920369751643, + "grad_norm": 0.001427884097211063, + "learning_rate": 1.8172805796430952e-05, + "loss": 0.0309, + "num_input_tokens_seen": 137497120, + "step": 113000 + }, + { + "epoch": 12.58547722463526, + "grad_norm": 0.1268831342458725, + "learning_rate": 1.8170468447351856e-05, + "loss": 0.0086, + "num_input_tokens_seen": 137503008, + "step": 113005 + }, + { + "epoch": 12.586034079518878, + "grad_norm": 0.03497239947319031, + "learning_rate": 1.816813116278315e-05, + "loss": 0.0595, + "num_input_tokens_seen": 137509056, + "step": 113010 + }, + { + "epoch": 12.586590934402494, + "grad_norm": 0.050060197710990906, + "learning_rate": 1.8165793942746924e-05, + "loss": 0.0379, + "num_input_tokens_seen": 137515040, + "step": 113015 + }, + { + "epoch": 12.587147789286112, + "grad_norm": 0.0006905707414261997, + "learning_rate": 1.8163456787265245e-05, + "loss": 0.0006, + "num_input_tokens_seen": 137521344, + "step": 113020 + }, + { + "epoch": 12.58770464416973, + "grad_norm": 0.03893642500042915, + "learning_rate": 1.8161119696360202e-05, + "loss": 0.0733, + "num_input_tokens_seen": 137527232, + "step": 113025 + }, + { + "epoch": 12.588261499053347, + "grad_norm": 0.1345972716808319, + "learning_rate": 1.8158782670053853e-05, + "loss": 0.0643, + "num_input_tokens_seen": 137533248, + "step": 113030 + }, + { + "epoch": 12.588818353936965, + "grad_norm": 0.0007380497991107404, + "learning_rate": 1.815644570836829e-05, + "loss": 0.0424, + "num_input_tokens_seen": 137539424, + "step": 113035 + }, + { + "epoch": 12.58937520882058, + "grad_norm": 0.5870039463043213, + "learning_rate": 1.8154108811325573e-05, + "loss": 0.0485, + "num_input_tokens_seen": 137545664, + "step": 113040 + }, + { + "epoch": 12.589932063704198, + "grad_norm": 0.07652600854635239, + "learning_rate": 1.815177197894779e-05, + "loss": 0.0562, + "num_input_tokens_seen": 137551744, + "step": 113045 + }, + { + "epoch": 12.590488918587816, + "grad_norm": 0.20152407884597778, + "learning_rate": 1.8149435211257e-05, + "loss": 0.0557, + "num_input_tokens_seen": 137557824, + "step": 113050 + }, + { + "epoch": 12.591045773471434, + "grad_norm": 0.24845729768276215, + "learning_rate": 1.8147098508275295e-05, + "loss": 0.0991, + "num_input_tokens_seen": 137563456, + "step": 113055 + }, + { + "epoch": 12.591602628355052, + "grad_norm": 0.1935839205980301, + "learning_rate": 1.8144761870024718e-05, + "loss": 0.0635, + "num_input_tokens_seen": 137569600, + "step": 113060 + }, + { + "epoch": 12.592159483238667, + "grad_norm": 0.255995512008667, + "learning_rate": 1.8142425296527376e-05, + "loss": 0.055, + "num_input_tokens_seen": 137575616, + "step": 113065 + }, + { + "epoch": 12.592716338122285, + "grad_norm": 0.00031849820516072214, + "learning_rate": 1.8140088787805303e-05, + "loss": 0.0509, + "num_input_tokens_seen": 137581728, + "step": 113070 + }, + { + "epoch": 12.593273193005903, + "grad_norm": 0.826643705368042, + "learning_rate": 1.8137752343880604e-05, + "loss": 0.0194, + "num_input_tokens_seen": 137587712, + "step": 113075 + }, + { + "epoch": 12.59383004788952, + "grad_norm": 0.388070672750473, + "learning_rate": 1.813541596477532e-05, + "loss": 0.0191, + "num_input_tokens_seen": 137594176, + "step": 113080 + }, + { + "epoch": 12.594386902773138, + "grad_norm": 0.01272632647305727, + "learning_rate": 1.8133079650511542e-05, + "loss": 0.0691, + "num_input_tokens_seen": 137600288, + "step": 113085 + }, + { + "epoch": 12.594943757656754, + "grad_norm": 0.13978753983974457, + "learning_rate": 1.813074340111132e-05, + "loss": 0.0287, + "num_input_tokens_seen": 137606368, + "step": 113090 + }, + { + "epoch": 12.595500612540372, + "grad_norm": 0.0027190237306058407, + "learning_rate": 1.8128407216596736e-05, + "loss": 0.0069, + "num_input_tokens_seen": 137612768, + "step": 113095 + }, + { + "epoch": 12.59605746742399, + "grad_norm": 0.47534453868865967, + "learning_rate": 1.8126071096989846e-05, + "loss": 0.0243, + "num_input_tokens_seen": 137618816, + "step": 113100 + }, + { + "epoch": 12.596614322307607, + "grad_norm": 0.0214687492698431, + "learning_rate": 1.8123735042312728e-05, + "loss": 0.02, + "num_input_tokens_seen": 137625024, + "step": 113105 + }, + { + "epoch": 12.597171177191225, + "grad_norm": 0.41366204619407654, + "learning_rate": 1.8121399052587434e-05, + "loss": 0.0061, + "num_input_tokens_seen": 137630592, + "step": 113110 + }, + { + "epoch": 12.59772803207484, + "grad_norm": 0.5536109805107117, + "learning_rate": 1.8119063127836045e-05, + "loss": 0.0162, + "num_input_tokens_seen": 137636352, + "step": 113115 + }, + { + "epoch": 12.598284886958458, + "grad_norm": 0.020023802295327187, + "learning_rate": 1.8116727268080608e-05, + "loss": 0.0409, + "num_input_tokens_seen": 137641856, + "step": 113120 + }, + { + "epoch": 12.598841741842076, + "grad_norm": 0.5190600752830505, + "learning_rate": 1.811439147334321e-05, + "loss": 0.0577, + "num_input_tokens_seen": 137647968, + "step": 113125 + }, + { + "epoch": 12.599398596725694, + "grad_norm": 0.05650416761636734, + "learning_rate": 1.8112055743645884e-05, + "loss": 0.0292, + "num_input_tokens_seen": 137653504, + "step": 113130 + }, + { + "epoch": 12.599955451609311, + "grad_norm": 2.0490012168884277, + "learning_rate": 1.8109720079010724e-05, + "loss": 0.0382, + "num_input_tokens_seen": 137659488, + "step": 113135 + }, + { + "epoch": 12.600512306492927, + "grad_norm": 0.09140241891145706, + "learning_rate": 1.8107384479459773e-05, + "loss": 0.0921, + "num_input_tokens_seen": 137665504, + "step": 113140 + }, + { + "epoch": 12.601069161376545, + "grad_norm": 0.05287652835249901, + "learning_rate": 1.81050489450151e-05, + "loss": 0.0218, + "num_input_tokens_seen": 137671072, + "step": 113145 + }, + { + "epoch": 12.601626016260163, + "grad_norm": 0.05727677047252655, + "learning_rate": 1.8102713475698763e-05, + "loss": 0.1299, + "num_input_tokens_seen": 137676928, + "step": 113150 + }, + { + "epoch": 12.60218287114378, + "grad_norm": 1.0451953411102295, + "learning_rate": 1.8100378071532824e-05, + "loss": 0.0434, + "num_input_tokens_seen": 137683296, + "step": 113155 + }, + { + "epoch": 12.602739726027398, + "grad_norm": 0.6668602228164673, + "learning_rate": 1.8098042732539345e-05, + "loss": 0.0438, + "num_input_tokens_seen": 137689856, + "step": 113160 + }, + { + "epoch": 12.603296580911014, + "grad_norm": 0.05992167443037033, + "learning_rate": 1.8095707458740375e-05, + "loss": 0.0541, + "num_input_tokens_seen": 137696128, + "step": 113165 + }, + { + "epoch": 12.603853435794631, + "grad_norm": 0.4314325749874115, + "learning_rate": 1.8093372250157986e-05, + "loss": 0.0063, + "num_input_tokens_seen": 137702528, + "step": 113170 + }, + { + "epoch": 12.60441029067825, + "grad_norm": 0.029156336560845375, + "learning_rate": 1.8091037106814224e-05, + "loss": 0.0126, + "num_input_tokens_seen": 137708576, + "step": 113175 + }, + { + "epoch": 12.604967145561867, + "grad_norm": 0.4460148513317108, + "learning_rate": 1.8088702028731158e-05, + "loss": 0.0331, + "num_input_tokens_seen": 137714848, + "step": 113180 + }, + { + "epoch": 12.605524000445484, + "grad_norm": 0.02084645815193653, + "learning_rate": 1.8086367015930833e-05, + "loss": 0.0306, + "num_input_tokens_seen": 137720672, + "step": 113185 + }, + { + "epoch": 12.6060808553291, + "grad_norm": 0.20112167298793793, + "learning_rate": 1.8084032068435315e-05, + "loss": 0.0127, + "num_input_tokens_seen": 137726656, + "step": 113190 + }, + { + "epoch": 12.606637710212718, + "grad_norm": 1.7302175760269165, + "learning_rate": 1.8081697186266643e-05, + "loss": 0.0783, + "num_input_tokens_seen": 137733120, + "step": 113195 + }, + { + "epoch": 12.607194565096336, + "grad_norm": 0.23232223093509674, + "learning_rate": 1.8079362369446902e-05, + "loss": 0.0373, + "num_input_tokens_seen": 137739104, + "step": 113200 + }, + { + "epoch": 12.607751419979953, + "grad_norm": 0.04871956631541252, + "learning_rate": 1.8077027617998115e-05, + "loss": 0.0321, + "num_input_tokens_seen": 137745184, + "step": 113205 + }, + { + "epoch": 12.608308274863571, + "grad_norm": 0.08835727721452713, + "learning_rate": 1.8074692931942362e-05, + "loss": 0.1043, + "num_input_tokens_seen": 137751104, + "step": 113210 + }, + { + "epoch": 12.608865129747187, + "grad_norm": 1.4056450128555298, + "learning_rate": 1.8072358311301666e-05, + "loss": 0.106, + "num_input_tokens_seen": 137757280, + "step": 113215 + }, + { + "epoch": 12.609421984630805, + "grad_norm": 0.0012411862844601274, + "learning_rate": 1.8070023756098107e-05, + "loss": 0.0268, + "num_input_tokens_seen": 137763360, + "step": 113220 + }, + { + "epoch": 12.609978839514422, + "grad_norm": 0.029217615723609924, + "learning_rate": 1.806768926635372e-05, + "loss": 0.0788, + "num_input_tokens_seen": 137769664, + "step": 113225 + }, + { + "epoch": 12.61053569439804, + "grad_norm": 0.005775225814431906, + "learning_rate": 1.8065354842090567e-05, + "loss": 0.008, + "num_input_tokens_seen": 137775776, + "step": 113230 + }, + { + "epoch": 12.611092549281658, + "grad_norm": 0.14934715628623962, + "learning_rate": 1.806302048333069e-05, + "loss": 0.0303, + "num_input_tokens_seen": 137781760, + "step": 113235 + }, + { + "epoch": 12.611649404165275, + "grad_norm": 0.04278253763914108, + "learning_rate": 1.8060686190096148e-05, + "loss": 0.0179, + "num_input_tokens_seen": 137787904, + "step": 113240 + }, + { + "epoch": 12.612206259048891, + "grad_norm": 2.297900915145874, + "learning_rate": 1.8058351962408974e-05, + "loss": 0.1154, + "num_input_tokens_seen": 137793344, + "step": 113245 + }, + { + "epoch": 12.612763113932509, + "grad_norm": 0.0002862033143173903, + "learning_rate": 1.805601780029124e-05, + "loss": 0.0046, + "num_input_tokens_seen": 137799680, + "step": 113250 + }, + { + "epoch": 12.613319968816127, + "grad_norm": 0.21351560950279236, + "learning_rate": 1.8053683703764974e-05, + "loss": 0.0028, + "num_input_tokens_seen": 137805824, + "step": 113255 + }, + { + "epoch": 12.613876823699744, + "grad_norm": 0.21164901554584503, + "learning_rate": 1.8051349672852235e-05, + "loss": 0.0151, + "num_input_tokens_seen": 137812288, + "step": 113260 + }, + { + "epoch": 12.614433678583362, + "grad_norm": 0.5994006395339966, + "learning_rate": 1.804901570757505e-05, + "loss": 0.1213, + "num_input_tokens_seen": 137818176, + "step": 113265 + }, + { + "epoch": 12.614990533466978, + "grad_norm": 1.492663860321045, + "learning_rate": 1.8046681807955505e-05, + "loss": 0.0399, + "num_input_tokens_seen": 137824704, + "step": 113270 + }, + { + "epoch": 12.615547388350596, + "grad_norm": 0.0024174507707357407, + "learning_rate": 1.80443479740156e-05, + "loss": 0.006, + "num_input_tokens_seen": 137830752, + "step": 113275 + }, + { + "epoch": 12.616104243234213, + "grad_norm": 1.7600793838500977, + "learning_rate": 1.8042014205777414e-05, + "loss": 0.0746, + "num_input_tokens_seen": 137836864, + "step": 113280 + }, + { + "epoch": 12.61666109811783, + "grad_norm": 0.013851447030901909, + "learning_rate": 1.8039680503262974e-05, + "loss": 0.0063, + "num_input_tokens_seen": 137843264, + "step": 113285 + }, + { + "epoch": 12.617217953001449, + "grad_norm": 0.11458605527877808, + "learning_rate": 1.8037346866494332e-05, + "loss": 0.0102, + "num_input_tokens_seen": 137849312, + "step": 113290 + }, + { + "epoch": 12.617774807885064, + "grad_norm": 0.11988390982151031, + "learning_rate": 1.803501329549352e-05, + "loss": 0.1495, + "num_input_tokens_seen": 137855136, + "step": 113295 + }, + { + "epoch": 12.618331662768682, + "grad_norm": 0.1878555864095688, + "learning_rate": 1.8032679790282594e-05, + "loss": 0.0611, + "num_input_tokens_seen": 137861024, + "step": 113300 + }, + { + "epoch": 12.6188885176523, + "grad_norm": 0.8740783333778381, + "learning_rate": 1.8030346350883586e-05, + "loss": 0.1271, + "num_input_tokens_seen": 137867072, + "step": 113305 + }, + { + "epoch": 12.619445372535917, + "grad_norm": 1.1108758449554443, + "learning_rate": 1.8028012977318545e-05, + "loss": 0.0465, + "num_input_tokens_seen": 137873056, + "step": 113310 + }, + { + "epoch": 12.620002227419535, + "grad_norm": 0.5229429602622986, + "learning_rate": 1.80256796696095e-05, + "loss": 0.0079, + "num_input_tokens_seen": 137879232, + "step": 113315 + }, + { + "epoch": 12.620559082303151, + "grad_norm": 0.3620823919773102, + "learning_rate": 1.802334642777851e-05, + "loss": 0.0844, + "num_input_tokens_seen": 137885600, + "step": 113320 + }, + { + "epoch": 12.621115937186769, + "grad_norm": 2.761655330657959, + "learning_rate": 1.8021013251847586e-05, + "loss": 0.0288, + "num_input_tokens_seen": 137891072, + "step": 113325 + }, + { + "epoch": 12.621672792070386, + "grad_norm": 0.0749119222164154, + "learning_rate": 1.80186801418388e-05, + "loss": 0.11, + "num_input_tokens_seen": 137897472, + "step": 113330 + }, + { + "epoch": 12.622229646954004, + "grad_norm": 0.00012232319568283856, + "learning_rate": 1.801634709777416e-05, + "loss": 0.0106, + "num_input_tokens_seen": 137903712, + "step": 113335 + }, + { + "epoch": 12.622786501837622, + "grad_norm": 0.756782591342926, + "learning_rate": 1.8014014119675732e-05, + "loss": 0.08, + "num_input_tokens_seen": 137909856, + "step": 113340 + }, + { + "epoch": 12.623343356721238, + "grad_norm": 2.228346824645996, + "learning_rate": 1.8011681207565516e-05, + "loss": 0.1119, + "num_input_tokens_seen": 137915872, + "step": 113345 + }, + { + "epoch": 12.623900211604855, + "grad_norm": 0.0008646795176900923, + "learning_rate": 1.800934836146559e-05, + "loss": 0.0616, + "num_input_tokens_seen": 137921760, + "step": 113350 + }, + { + "epoch": 12.624457066488473, + "grad_norm": 0.41063475608825684, + "learning_rate": 1.8007015581397957e-05, + "loss": 0.0545, + "num_input_tokens_seen": 137928000, + "step": 113355 + }, + { + "epoch": 12.62501392137209, + "grad_norm": 0.023435533046722412, + "learning_rate": 1.8004682867384674e-05, + "loss": 0.0109, + "num_input_tokens_seen": 137934208, + "step": 113360 + }, + { + "epoch": 12.625570776255708, + "grad_norm": 1.5189036130905151, + "learning_rate": 1.800235021944776e-05, + "loss": 0.0899, + "num_input_tokens_seen": 137939872, + "step": 113365 + }, + { + "epoch": 12.626127631139326, + "grad_norm": 0.07219339162111282, + "learning_rate": 1.8000017637609256e-05, + "loss": 0.0049, + "num_input_tokens_seen": 137945696, + "step": 113370 + }, + { + "epoch": 12.626684486022942, + "grad_norm": 0.7467826008796692, + "learning_rate": 1.7997685121891193e-05, + "loss": 0.0777, + "num_input_tokens_seen": 137951968, + "step": 113375 + }, + { + "epoch": 12.62724134090656, + "grad_norm": 0.001151643693447113, + "learning_rate": 1.7995352672315606e-05, + "loss": 0.0055, + "num_input_tokens_seen": 137958016, + "step": 113380 + }, + { + "epoch": 12.627798195790177, + "grad_norm": 0.9072213768959045, + "learning_rate": 1.799302028890452e-05, + "loss": 0.0622, + "num_input_tokens_seen": 137963936, + "step": 113385 + }, + { + "epoch": 12.628355050673795, + "grad_norm": 0.3273201286792755, + "learning_rate": 1.7990687971679983e-05, + "loss": 0.0126, + "num_input_tokens_seen": 137969984, + "step": 113390 + }, + { + "epoch": 12.628911905557413, + "grad_norm": 0.6902332305908203, + "learning_rate": 1.7988355720663996e-05, + "loss": 0.1308, + "num_input_tokens_seen": 137975456, + "step": 113395 + }, + { + "epoch": 12.629468760441029, + "grad_norm": 0.04841597378253937, + "learning_rate": 1.7986023535878626e-05, + "loss": 0.0818, + "num_input_tokens_seen": 137981888, + "step": 113400 + }, + { + "epoch": 12.630025615324646, + "grad_norm": 0.011257210746407509, + "learning_rate": 1.7983691417345866e-05, + "loss": 0.0083, + "num_input_tokens_seen": 137987712, + "step": 113405 + }, + { + "epoch": 12.630582470208264, + "grad_norm": 0.046400684863328934, + "learning_rate": 1.7981359365087773e-05, + "loss": 0.0231, + "num_input_tokens_seen": 137994176, + "step": 113410 + }, + { + "epoch": 12.631139325091882, + "grad_norm": 0.33485493063926697, + "learning_rate": 1.797902737912636e-05, + "loss": 0.0092, + "num_input_tokens_seen": 138000576, + "step": 113415 + }, + { + "epoch": 12.6316961799755, + "grad_norm": 0.4905264973640442, + "learning_rate": 1.797669545948366e-05, + "loss": 0.0313, + "num_input_tokens_seen": 138006624, + "step": 113420 + }, + { + "epoch": 12.632253034859115, + "grad_norm": 0.04455128684639931, + "learning_rate": 1.7974363606181698e-05, + "loss": 0.0197, + "num_input_tokens_seen": 138012928, + "step": 113425 + }, + { + "epoch": 12.632809889742733, + "grad_norm": 0.0004095413605682552, + "learning_rate": 1.7972031819242503e-05, + "loss": 0.0445, + "num_input_tokens_seen": 138018912, + "step": 113430 + }, + { + "epoch": 12.63336674462635, + "grad_norm": 0.004766819532960653, + "learning_rate": 1.796970009868809e-05, + "loss": 0.0158, + "num_input_tokens_seen": 138024672, + "step": 113435 + }, + { + "epoch": 12.633923599509968, + "grad_norm": 0.018455680459737778, + "learning_rate": 1.7967368444540505e-05, + "loss": 0.0275, + "num_input_tokens_seen": 138030528, + "step": 113440 + }, + { + "epoch": 12.634480454393586, + "grad_norm": 0.22940032184123993, + "learning_rate": 1.7965036856821748e-05, + "loss": 0.0945, + "num_input_tokens_seen": 138036768, + "step": 113445 + }, + { + "epoch": 12.635037309277202, + "grad_norm": 0.051827263087034225, + "learning_rate": 1.7962705335553864e-05, + "loss": 0.0733, + "num_input_tokens_seen": 138043072, + "step": 113450 + }, + { + "epoch": 12.63559416416082, + "grad_norm": 0.002525069285184145, + "learning_rate": 1.7960373880758853e-05, + "loss": 0.0181, + "num_input_tokens_seen": 138049280, + "step": 113455 + }, + { + "epoch": 12.636151019044437, + "grad_norm": 0.24550116062164307, + "learning_rate": 1.7958042492458767e-05, + "loss": 0.0617, + "num_input_tokens_seen": 138055328, + "step": 113460 + }, + { + "epoch": 12.636707873928055, + "grad_norm": 0.049704086035490036, + "learning_rate": 1.7955711170675592e-05, + "loss": 0.0121, + "num_input_tokens_seen": 138061408, + "step": 113465 + }, + { + "epoch": 12.637264728811672, + "grad_norm": 0.5685896277427673, + "learning_rate": 1.7953379915431385e-05, + "loss": 0.1183, + "num_input_tokens_seen": 138067232, + "step": 113470 + }, + { + "epoch": 12.637821583695288, + "grad_norm": 1.0381885766983032, + "learning_rate": 1.7951048726748142e-05, + "loss": 0.0289, + "num_input_tokens_seen": 138073440, + "step": 113475 + }, + { + "epoch": 12.638378438578906, + "grad_norm": 0.3243270516395569, + "learning_rate": 1.79487176046479e-05, + "loss": 0.0949, + "num_input_tokens_seen": 138079872, + "step": 113480 + }, + { + "epoch": 12.638935293462524, + "grad_norm": 0.1349218189716339, + "learning_rate": 1.794638654915266e-05, + "loss": 0.0076, + "num_input_tokens_seen": 138086080, + "step": 113485 + }, + { + "epoch": 12.639492148346141, + "grad_norm": 0.39153948426246643, + "learning_rate": 1.7944055560284458e-05, + "loss": 0.0201, + "num_input_tokens_seen": 138092032, + "step": 113490 + }, + { + "epoch": 12.640049003229759, + "grad_norm": 0.06336932629346848, + "learning_rate": 1.7941724638065298e-05, + "loss": 0.0069, + "num_input_tokens_seen": 138097888, + "step": 113495 + }, + { + "epoch": 12.640605858113375, + "grad_norm": 0.7912975549697876, + "learning_rate": 1.793939378251721e-05, + "loss": 0.0143, + "num_input_tokens_seen": 138103776, + "step": 113500 + }, + { + "epoch": 12.641162712996993, + "grad_norm": 2.2795515060424805, + "learning_rate": 1.79370629936622e-05, + "loss": 0.1231, + "num_input_tokens_seen": 138110048, + "step": 113505 + }, + { + "epoch": 12.64171956788061, + "grad_norm": 0.15270276367664337, + "learning_rate": 1.7934732271522293e-05, + "loss": 0.0945, + "num_input_tokens_seen": 138116160, + "step": 113510 + }, + { + "epoch": 12.642276422764228, + "grad_norm": 0.30278217792510986, + "learning_rate": 1.7932401616119495e-05, + "loss": 0.0613, + "num_input_tokens_seen": 138122752, + "step": 113515 + }, + { + "epoch": 12.642833277647846, + "grad_norm": 0.07335156202316284, + "learning_rate": 1.793007102747583e-05, + "loss": 0.0069, + "num_input_tokens_seen": 138128704, + "step": 113520 + }, + { + "epoch": 12.643390132531461, + "grad_norm": 0.00018010233179666102, + "learning_rate": 1.79277405056133e-05, + "loss": 0.0405, + "num_input_tokens_seen": 138134624, + "step": 113525 + }, + { + "epoch": 12.64394698741508, + "grad_norm": 0.011186092160642147, + "learning_rate": 1.7925410050553942e-05, + "loss": 0.0413, + "num_input_tokens_seen": 138140864, + "step": 113530 + }, + { + "epoch": 12.644503842298697, + "grad_norm": 0.0054063377901911736, + "learning_rate": 1.792307966231974e-05, + "loss": 0.0688, + "num_input_tokens_seen": 138147136, + "step": 113535 + }, + { + "epoch": 12.645060697182315, + "grad_norm": 0.15562117099761963, + "learning_rate": 1.792074934093273e-05, + "loss": 0.0085, + "num_input_tokens_seen": 138153216, + "step": 113540 + }, + { + "epoch": 12.645617552065932, + "grad_norm": 1.4676488637924194, + "learning_rate": 1.7918419086414907e-05, + "loss": 0.0642, + "num_input_tokens_seen": 138159328, + "step": 113545 + }, + { + "epoch": 12.646174406949548, + "grad_norm": 0.06050547584891319, + "learning_rate": 1.7916088898788297e-05, + "loss": 0.0454, + "num_input_tokens_seen": 138164832, + "step": 113550 + }, + { + "epoch": 12.646731261833166, + "grad_norm": 1.461097240447998, + "learning_rate": 1.79137587780749e-05, + "loss": 0.2041, + "num_input_tokens_seen": 138171008, + "step": 113555 + }, + { + "epoch": 12.647288116716783, + "grad_norm": 1.1716312170028687, + "learning_rate": 1.791142872429673e-05, + "loss": 0.0843, + "num_input_tokens_seen": 138176800, + "step": 113560 + }, + { + "epoch": 12.647844971600401, + "grad_norm": 1.7466387748718262, + "learning_rate": 1.790909873747579e-05, + "loss": 0.1078, + "num_input_tokens_seen": 138182656, + "step": 113565 + }, + { + "epoch": 12.648401826484019, + "grad_norm": 0.0005458237137645483, + "learning_rate": 1.7906768817634103e-05, + "loss": 0.0681, + "num_input_tokens_seen": 138188448, + "step": 113570 + }, + { + "epoch": 12.648958681367635, + "grad_norm": 0.006809004116803408, + "learning_rate": 1.7904438964793663e-05, + "loss": 0.0021, + "num_input_tokens_seen": 138194848, + "step": 113575 + }, + { + "epoch": 12.649515536251252, + "grad_norm": 0.007174346596002579, + "learning_rate": 1.7902109178976477e-05, + "loss": 0.0396, + "num_input_tokens_seen": 138201024, + "step": 113580 + }, + { + "epoch": 12.65007239113487, + "grad_norm": 0.1883944272994995, + "learning_rate": 1.7899779460204564e-05, + "loss": 0.0266, + "num_input_tokens_seen": 138207008, + "step": 113585 + }, + { + "epoch": 12.650629246018488, + "grad_norm": 0.030160853639245033, + "learning_rate": 1.7897449808499914e-05, + "loss": 0.1305, + "num_input_tokens_seen": 138213024, + "step": 113590 + }, + { + "epoch": 12.651186100902105, + "grad_norm": 0.04882245883345604, + "learning_rate": 1.789512022388455e-05, + "loss": 0.0593, + "num_input_tokens_seen": 138219008, + "step": 113595 + }, + { + "epoch": 12.651742955785723, + "grad_norm": 0.007364299148321152, + "learning_rate": 1.789279070638045e-05, + "loss": 0.1026, + "num_input_tokens_seen": 138225184, + "step": 113600 + }, + { + "epoch": 12.652299810669339, + "grad_norm": 0.16539667546749115, + "learning_rate": 1.789046125600966e-05, + "loss": 0.0141, + "num_input_tokens_seen": 138231264, + "step": 113605 + }, + { + "epoch": 12.652856665552957, + "grad_norm": 0.06350419670343399, + "learning_rate": 1.788813187279414e-05, + "loss": 0.0271, + "num_input_tokens_seen": 138237312, + "step": 113610 + }, + { + "epoch": 12.653413520436574, + "grad_norm": 0.573594331741333, + "learning_rate": 1.788580255675593e-05, + "loss": 0.0544, + "num_input_tokens_seen": 138243328, + "step": 113615 + }, + { + "epoch": 12.653970375320192, + "grad_norm": 0.2412317991256714, + "learning_rate": 1.7883473307916997e-05, + "loss": 0.0153, + "num_input_tokens_seen": 138249536, + "step": 113620 + }, + { + "epoch": 12.65452723020381, + "grad_norm": 0.22657595574855804, + "learning_rate": 1.7881144126299373e-05, + "loss": 0.0421, + "num_input_tokens_seen": 138255552, + "step": 113625 + }, + { + "epoch": 12.655084085087426, + "grad_norm": 0.1782744824886322, + "learning_rate": 1.787881501192504e-05, + "loss": 0.0491, + "num_input_tokens_seen": 138261632, + "step": 113630 + }, + { + "epoch": 12.655640939971043, + "grad_norm": 0.1355772167444229, + "learning_rate": 1.787648596481601e-05, + "loss": 0.0285, + "num_input_tokens_seen": 138267744, + "step": 113635 + }, + { + "epoch": 12.656197794854661, + "grad_norm": 0.11299087107181549, + "learning_rate": 1.7874156984994274e-05, + "loss": 0.0028, + "num_input_tokens_seen": 138273664, + "step": 113640 + }, + { + "epoch": 12.656754649738279, + "grad_norm": 0.005103026516735554, + "learning_rate": 1.7871828072481833e-05, + "loss": 0.0058, + "num_input_tokens_seen": 138280032, + "step": 113645 + }, + { + "epoch": 12.657311504621896, + "grad_norm": 0.4152609705924988, + "learning_rate": 1.7869499227300688e-05, + "loss": 0.0314, + "num_input_tokens_seen": 138286112, + "step": 113650 + }, + { + "epoch": 12.657868359505512, + "grad_norm": 1.54975426197052, + "learning_rate": 1.7867170449472838e-05, + "loss": 0.0755, + "num_input_tokens_seen": 138292320, + "step": 113655 + }, + { + "epoch": 12.65842521438913, + "grad_norm": 0.0029427744448184967, + "learning_rate": 1.7864841739020276e-05, + "loss": 0.0244, + "num_input_tokens_seen": 138298592, + "step": 113660 + }, + { + "epoch": 12.658982069272747, + "grad_norm": 1.3975093364715576, + "learning_rate": 1.7862513095965e-05, + "loss": 0.1595, + "num_input_tokens_seen": 138304320, + "step": 113665 + }, + { + "epoch": 12.659538924156365, + "grad_norm": 1.0646344423294067, + "learning_rate": 1.7860184520328997e-05, + "loss": 0.0385, + "num_input_tokens_seen": 138310560, + "step": 113670 + }, + { + "epoch": 12.660095779039983, + "grad_norm": 0.029743509367108345, + "learning_rate": 1.7857856012134293e-05, + "loss": 0.0022, + "num_input_tokens_seen": 138316608, + "step": 113675 + }, + { + "epoch": 12.660652633923599, + "grad_norm": 0.00036064768210053444, + "learning_rate": 1.7855527571402842e-05, + "loss": 0.0101, + "num_input_tokens_seen": 138322784, + "step": 113680 + }, + { + "epoch": 12.661209488807216, + "grad_norm": 0.0042000748217105865, + "learning_rate": 1.7853199198156667e-05, + "loss": 0.0137, + "num_input_tokens_seen": 138328608, + "step": 113685 + }, + { + "epoch": 12.661766343690834, + "grad_norm": 0.02738039568066597, + "learning_rate": 1.7850870892417745e-05, + "loss": 0.0012, + "num_input_tokens_seen": 138334848, + "step": 113690 + }, + { + "epoch": 12.662323198574452, + "grad_norm": 1.3370734453201294, + "learning_rate": 1.7848542654208084e-05, + "loss": 0.1516, + "num_input_tokens_seen": 138340832, + "step": 113695 + }, + { + "epoch": 12.66288005345807, + "grad_norm": 1.859501838684082, + "learning_rate": 1.7846214483549656e-05, + "loss": 0.0388, + "num_input_tokens_seen": 138346976, + "step": 113700 + }, + { + "epoch": 12.663436908341685, + "grad_norm": 0.11223134398460388, + "learning_rate": 1.7843886380464474e-05, + "loss": 0.0489, + "num_input_tokens_seen": 138353216, + "step": 113705 + }, + { + "epoch": 12.663993763225303, + "grad_norm": 1.2076795101165771, + "learning_rate": 1.7841558344974514e-05, + "loss": 0.051, + "num_input_tokens_seen": 138359520, + "step": 113710 + }, + { + "epoch": 12.66455061810892, + "grad_norm": 0.6975600719451904, + "learning_rate": 1.7839230377101774e-05, + "loss": 0.0336, + "num_input_tokens_seen": 138365728, + "step": 113715 + }, + { + "epoch": 12.665107472992538, + "grad_norm": 0.18560326099395752, + "learning_rate": 1.7836902476868234e-05, + "loss": 0.1005, + "num_input_tokens_seen": 138371968, + "step": 113720 + }, + { + "epoch": 12.665664327876156, + "grad_norm": 0.17498716711997986, + "learning_rate": 1.7834574644295895e-05, + "loss": 0.0748, + "num_input_tokens_seen": 138378336, + "step": 113725 + }, + { + "epoch": 12.666221182759774, + "grad_norm": 0.6518356204032898, + "learning_rate": 1.7832246879406727e-05, + "loss": 0.021, + "num_input_tokens_seen": 138384320, + "step": 113730 + }, + { + "epoch": 12.66677803764339, + "grad_norm": 0.007296902593225241, + "learning_rate": 1.7829919182222752e-05, + "loss": 0.0445, + "num_input_tokens_seen": 138390624, + "step": 113735 + }, + { + "epoch": 12.667334892527007, + "grad_norm": 0.00136042560916394, + "learning_rate": 1.7827591552765916e-05, + "loss": 0.052, + "num_input_tokens_seen": 138396096, + "step": 113740 + }, + { + "epoch": 12.667891747410625, + "grad_norm": 0.48122066259384155, + "learning_rate": 1.7825263991058234e-05, + "loss": 0.0123, + "num_input_tokens_seen": 138402080, + "step": 113745 + }, + { + "epoch": 12.668448602294243, + "grad_norm": 0.14166058599948883, + "learning_rate": 1.7822936497121672e-05, + "loss": 0.0342, + "num_input_tokens_seen": 138408544, + "step": 113750 + }, + { + "epoch": 12.66900545717786, + "grad_norm": 1.0621308088302612, + "learning_rate": 1.7820609070978235e-05, + "loss": 0.1517, + "num_input_tokens_seen": 138414656, + "step": 113755 + }, + { + "epoch": 12.669562312061476, + "grad_norm": 0.05834704264998436, + "learning_rate": 1.7818281712649893e-05, + "loss": 0.0036, + "num_input_tokens_seen": 138421376, + "step": 113760 + }, + { + "epoch": 12.670119166945094, + "grad_norm": 1.068632960319519, + "learning_rate": 1.7815954422158637e-05, + "loss": 0.1258, + "num_input_tokens_seen": 138426784, + "step": 113765 + }, + { + "epoch": 12.670676021828712, + "grad_norm": 0.0028652504552155733, + "learning_rate": 1.7813627199526446e-05, + "loss": 0.1013, + "num_input_tokens_seen": 138432480, + "step": 113770 + }, + { + "epoch": 12.67123287671233, + "grad_norm": 0.09818577021360397, + "learning_rate": 1.7811300044775303e-05, + "loss": 0.0129, + "num_input_tokens_seen": 138438848, + "step": 113775 + }, + { + "epoch": 12.671789731595947, + "grad_norm": 0.04376846179366112, + "learning_rate": 1.780897295792719e-05, + "loss": 0.0468, + "num_input_tokens_seen": 138445344, + "step": 113780 + }, + { + "epoch": 12.672346586479563, + "grad_norm": 0.2762128710746765, + "learning_rate": 1.780664593900409e-05, + "loss": 0.0849, + "num_input_tokens_seen": 138450880, + "step": 113785 + }, + { + "epoch": 12.67290344136318, + "grad_norm": 0.0057142432779073715, + "learning_rate": 1.7804318988027982e-05, + "loss": 0.0271, + "num_input_tokens_seen": 138456864, + "step": 113790 + }, + { + "epoch": 12.673460296246798, + "grad_norm": 0.07054788619279861, + "learning_rate": 1.780199210502085e-05, + "loss": 0.0033, + "num_input_tokens_seen": 138463072, + "step": 113795 + }, + { + "epoch": 12.674017151130416, + "grad_norm": 0.0007591971079818904, + "learning_rate": 1.7799665290004656e-05, + "loss": 0.0554, + "num_input_tokens_seen": 138468864, + "step": 113800 + }, + { + "epoch": 12.674574006014034, + "grad_norm": 0.13702620565891266, + "learning_rate": 1.779733854300141e-05, + "loss": 0.0123, + "num_input_tokens_seen": 138475104, + "step": 113805 + }, + { + "epoch": 12.67513086089765, + "grad_norm": 0.32435300946235657, + "learning_rate": 1.7795011864033056e-05, + "loss": 0.0526, + "num_input_tokens_seen": 138481376, + "step": 113810 + }, + { + "epoch": 12.675687715781267, + "grad_norm": 1.0498981475830078, + "learning_rate": 1.77926852531216e-05, + "loss": 0.0797, + "num_input_tokens_seen": 138487168, + "step": 113815 + }, + { + "epoch": 12.676244570664885, + "grad_norm": 0.012582099996507168, + "learning_rate": 1.7790358710289e-05, + "loss": 0.0486, + "num_input_tokens_seen": 138493504, + "step": 113820 + }, + { + "epoch": 12.676801425548502, + "grad_norm": 0.0732831358909607, + "learning_rate": 1.778803223555724e-05, + "loss": 0.0658, + "num_input_tokens_seen": 138499936, + "step": 113825 + }, + { + "epoch": 12.67735828043212, + "grad_norm": 1.093648076057434, + "learning_rate": 1.778570582894829e-05, + "loss": 0.0669, + "num_input_tokens_seen": 138506080, + "step": 113830 + }, + { + "epoch": 12.677915135315736, + "grad_norm": 0.004490151535719633, + "learning_rate": 1.7783379490484138e-05, + "loss": 0.0418, + "num_input_tokens_seen": 138512256, + "step": 113835 + }, + { + "epoch": 12.678471990199354, + "grad_norm": 0.025870047509670258, + "learning_rate": 1.778105322018674e-05, + "loss": 0.029, + "num_input_tokens_seen": 138518304, + "step": 113840 + }, + { + "epoch": 12.679028845082971, + "grad_norm": 1.7301816940307617, + "learning_rate": 1.7778727018078086e-05, + "loss": 0.0198, + "num_input_tokens_seen": 138524288, + "step": 113845 + }, + { + "epoch": 12.679585699966589, + "grad_norm": 0.6574704051017761, + "learning_rate": 1.7776400884180127e-05, + "loss": 0.0201, + "num_input_tokens_seen": 138530304, + "step": 113850 + }, + { + "epoch": 12.680142554850207, + "grad_norm": 0.2589234709739685, + "learning_rate": 1.7774074818514864e-05, + "loss": 0.0136, + "num_input_tokens_seen": 138536448, + "step": 113855 + }, + { + "epoch": 12.680699409733823, + "grad_norm": 0.06322024017572403, + "learning_rate": 1.7771748821104238e-05, + "loss": 0.1241, + "num_input_tokens_seen": 138542624, + "step": 113860 + }, + { + "epoch": 12.68125626461744, + "grad_norm": 0.8742402195930481, + "learning_rate": 1.7769422891970254e-05, + "loss": 0.0221, + "num_input_tokens_seen": 138548832, + "step": 113865 + }, + { + "epoch": 12.681813119501058, + "grad_norm": 0.515177309513092, + "learning_rate": 1.7767097031134847e-05, + "loss": 0.0136, + "num_input_tokens_seen": 138554912, + "step": 113870 + }, + { + "epoch": 12.682369974384676, + "grad_norm": 2.7038605213165283, + "learning_rate": 1.7764771238620014e-05, + "loss": 0.0469, + "num_input_tokens_seen": 138560992, + "step": 113875 + }, + { + "epoch": 12.682926829268293, + "grad_norm": 0.22803807258605957, + "learning_rate": 1.7762445514447708e-05, + "loss": 0.1247, + "num_input_tokens_seen": 138567232, + "step": 113880 + }, + { + "epoch": 12.68348368415191, + "grad_norm": 0.0006920445594005287, + "learning_rate": 1.7760119858639906e-05, + "loss": 0.0525, + "num_input_tokens_seen": 138573248, + "step": 113885 + }, + { + "epoch": 12.684040539035527, + "grad_norm": 0.08522055298089981, + "learning_rate": 1.775779427121857e-05, + "loss": 0.055, + "num_input_tokens_seen": 138579456, + "step": 113890 + }, + { + "epoch": 12.684597393919145, + "grad_norm": 0.034642886370420456, + "learning_rate": 1.7755468752205673e-05, + "loss": 0.0028, + "num_input_tokens_seen": 138585856, + "step": 113895 + }, + { + "epoch": 12.685154248802762, + "grad_norm": 0.04804530739784241, + "learning_rate": 1.7753143301623176e-05, + "loss": 0.0037, + "num_input_tokens_seen": 138592064, + "step": 113900 + }, + { + "epoch": 12.68571110368638, + "grad_norm": 0.2919972538948059, + "learning_rate": 1.7750817919493048e-05, + "loss": 0.0808, + "num_input_tokens_seen": 138597440, + "step": 113905 + }, + { + "epoch": 12.686267958569996, + "grad_norm": 1.332002878189087, + "learning_rate": 1.774849260583725e-05, + "loss": 0.07, + "num_input_tokens_seen": 138603936, + "step": 113910 + }, + { + "epoch": 12.686824813453613, + "grad_norm": 0.00048281013732776046, + "learning_rate": 1.774616736067775e-05, + "loss": 0.0081, + "num_input_tokens_seen": 138610144, + "step": 113915 + }, + { + "epoch": 12.687381668337231, + "grad_norm": 0.20826762914657593, + "learning_rate": 1.7743842184036508e-05, + "loss": 0.1389, + "num_input_tokens_seen": 138616640, + "step": 113920 + }, + { + "epoch": 12.687938523220849, + "grad_norm": 0.0943027138710022, + "learning_rate": 1.77415170759355e-05, + "loss": 0.0161, + "num_input_tokens_seen": 138622624, + "step": 113925 + }, + { + "epoch": 12.688495378104466, + "grad_norm": 1.670135736465454, + "learning_rate": 1.7739192036396663e-05, + "loss": 0.0767, + "num_input_tokens_seen": 138628736, + "step": 113930 + }, + { + "epoch": 12.689052232988082, + "grad_norm": 0.06019977852702141, + "learning_rate": 1.7736867065441992e-05, + "loss": 0.1065, + "num_input_tokens_seen": 138634560, + "step": 113935 + }, + { + "epoch": 12.6896090878717, + "grad_norm": 0.614602267742157, + "learning_rate": 1.7734542163093415e-05, + "loss": 0.0174, + "num_input_tokens_seen": 138640576, + "step": 113940 + }, + { + "epoch": 12.690165942755318, + "grad_norm": 0.30782216787338257, + "learning_rate": 1.7732217329372918e-05, + "loss": 0.0477, + "num_input_tokens_seen": 138646880, + "step": 113945 + }, + { + "epoch": 12.690722797638935, + "grad_norm": 0.05736559256911278, + "learning_rate": 1.7729892564302446e-05, + "loss": 0.1313, + "num_input_tokens_seen": 138652768, + "step": 113950 + }, + { + "epoch": 12.691279652522553, + "grad_norm": 1.4756169319152832, + "learning_rate": 1.772756786790397e-05, + "loss": 0.0542, + "num_input_tokens_seen": 138658976, + "step": 113955 + }, + { + "epoch": 12.69183650740617, + "grad_norm": 0.1075136810541153, + "learning_rate": 1.7725243240199437e-05, + "loss": 0.0645, + "num_input_tokens_seen": 138664832, + "step": 113960 + }, + { + "epoch": 12.692393362289787, + "grad_norm": 0.04190422594547272, + "learning_rate": 1.7722918681210813e-05, + "loss": 0.0128, + "num_input_tokens_seen": 138670880, + "step": 113965 + }, + { + "epoch": 12.692950217173404, + "grad_norm": 0.0006260478403419256, + "learning_rate": 1.772059419096005e-05, + "loss": 0.0189, + "num_input_tokens_seen": 138677024, + "step": 113970 + }, + { + "epoch": 12.693507072057022, + "grad_norm": 2.055283784866333, + "learning_rate": 1.7718269769469108e-05, + "loss": 0.0617, + "num_input_tokens_seen": 138682976, + "step": 113975 + }, + { + "epoch": 12.69406392694064, + "grad_norm": 0.2174515277147293, + "learning_rate": 1.7715945416759943e-05, + "loss": 0.0063, + "num_input_tokens_seen": 138689088, + "step": 113980 + }, + { + "epoch": 12.694620781824257, + "grad_norm": 0.0514863021671772, + "learning_rate": 1.771362113285451e-05, + "loss": 0.0907, + "num_input_tokens_seen": 138695168, + "step": 113985 + }, + { + "epoch": 12.695177636707873, + "grad_norm": 0.003310036612674594, + "learning_rate": 1.771129691777476e-05, + "loss": 0.0321, + "num_input_tokens_seen": 138701248, + "step": 113990 + }, + { + "epoch": 12.695734491591491, + "grad_norm": 0.8870248198509216, + "learning_rate": 1.7708972771542653e-05, + "loss": 0.0602, + "num_input_tokens_seen": 138707360, + "step": 113995 + }, + { + "epoch": 12.696291346475109, + "grad_norm": 0.03776240721344948, + "learning_rate": 1.770664869418014e-05, + "loss": 0.0333, + "num_input_tokens_seen": 138713536, + "step": 114000 + }, + { + "epoch": 12.696848201358726, + "grad_norm": 0.01178159099072218, + "learning_rate": 1.770432468570916e-05, + "loss": 0.0394, + "num_input_tokens_seen": 138719808, + "step": 114005 + }, + { + "epoch": 12.697405056242344, + "grad_norm": 0.5166288614273071, + "learning_rate": 1.7702000746151704e-05, + "loss": 0.0716, + "num_input_tokens_seen": 138725568, + "step": 114010 + }, + { + "epoch": 12.69796191112596, + "grad_norm": 0.013897433876991272, + "learning_rate": 1.7699676875529674e-05, + "loss": 0.0063, + "num_input_tokens_seen": 138731872, + "step": 114015 + }, + { + "epoch": 12.698518766009578, + "grad_norm": 1.469896912574768, + "learning_rate": 1.7697353073865063e-05, + "loss": 0.1463, + "num_input_tokens_seen": 138737920, + "step": 114020 + }, + { + "epoch": 12.699075620893195, + "grad_norm": 0.20583251118659973, + "learning_rate": 1.7695029341179787e-05, + "loss": 0.0285, + "num_input_tokens_seen": 138743680, + "step": 114025 + }, + { + "epoch": 12.699632475776813, + "grad_norm": 0.019027475267648697, + "learning_rate": 1.7692705677495824e-05, + "loss": 0.0259, + "num_input_tokens_seen": 138749920, + "step": 114030 + }, + { + "epoch": 12.70018933066043, + "grad_norm": 1.2452516555786133, + "learning_rate": 1.7690382082835103e-05, + "loss": 0.0784, + "num_input_tokens_seen": 138756288, + "step": 114035 + }, + { + "epoch": 12.700746185544046, + "grad_norm": 0.010481827892363071, + "learning_rate": 1.7688058557219584e-05, + "loss": 0.0699, + "num_input_tokens_seen": 138762240, + "step": 114040 + }, + { + "epoch": 12.701303040427664, + "grad_norm": 0.2624726891517639, + "learning_rate": 1.768573510067121e-05, + "loss": 0.0413, + "num_input_tokens_seen": 138768288, + "step": 114045 + }, + { + "epoch": 12.701859895311282, + "grad_norm": 0.020786134526133537, + "learning_rate": 1.7683411713211927e-05, + "loss": 0.0237, + "num_input_tokens_seen": 138774880, + "step": 114050 + }, + { + "epoch": 12.7024167501949, + "grad_norm": 1.5284926891326904, + "learning_rate": 1.768108839486368e-05, + "loss": 0.2372, + "num_input_tokens_seen": 138780928, + "step": 114055 + }, + { + "epoch": 12.702973605078517, + "grad_norm": 0.0010853770654648542, + "learning_rate": 1.767876514564842e-05, + "loss": 0.0357, + "num_input_tokens_seen": 138786976, + "step": 114060 + }, + { + "epoch": 12.703530459962133, + "grad_norm": 1.9780800342559814, + "learning_rate": 1.7676441965588088e-05, + "loss": 0.1253, + "num_input_tokens_seen": 138793088, + "step": 114065 + }, + { + "epoch": 12.70408731484575, + "grad_norm": 0.011549110524356365, + "learning_rate": 1.767411885470463e-05, + "loss": 0.0602, + "num_input_tokens_seen": 138799360, + "step": 114070 + }, + { + "epoch": 12.704644169729368, + "grad_norm": 0.053877681493759155, + "learning_rate": 1.7671795813019982e-05, + "loss": 0.072, + "num_input_tokens_seen": 138805408, + "step": 114075 + }, + { + "epoch": 12.705201024612986, + "grad_norm": 0.9530550837516785, + "learning_rate": 1.7669472840556107e-05, + "loss": 0.0655, + "num_input_tokens_seen": 138811456, + "step": 114080 + }, + { + "epoch": 12.705757879496604, + "grad_norm": 0.34102755784988403, + "learning_rate": 1.7667149937334916e-05, + "loss": 0.0276, + "num_input_tokens_seen": 138817408, + "step": 114085 + }, + { + "epoch": 12.706314734380221, + "grad_norm": 0.017082931473851204, + "learning_rate": 1.7664827103378384e-05, + "loss": 0.0226, + "num_input_tokens_seen": 138822848, + "step": 114090 + }, + { + "epoch": 12.706871589263837, + "grad_norm": 0.009387300349771976, + "learning_rate": 1.766250433870843e-05, + "loss": 0.1092, + "num_input_tokens_seen": 138828992, + "step": 114095 + }, + { + "epoch": 12.707428444147455, + "grad_norm": 0.6595489382743835, + "learning_rate": 1.7660181643347008e-05, + "loss": 0.0254, + "num_input_tokens_seen": 138834944, + "step": 114100 + }, + { + "epoch": 12.707985299031073, + "grad_norm": 0.005856053438037634, + "learning_rate": 1.765785901731604e-05, + "loss": 0.0642, + "num_input_tokens_seen": 138840928, + "step": 114105 + }, + { + "epoch": 12.70854215391469, + "grad_norm": 0.31017208099365234, + "learning_rate": 1.765553646063749e-05, + "loss": 0.0253, + "num_input_tokens_seen": 138847072, + "step": 114110 + }, + { + "epoch": 12.709099008798308, + "grad_norm": 0.13357914984226227, + "learning_rate": 1.7653213973333272e-05, + "loss": 0.0047, + "num_input_tokens_seen": 138853408, + "step": 114115 + }, + { + "epoch": 12.709655863681924, + "grad_norm": 0.15865805745124817, + "learning_rate": 1.7650891555425337e-05, + "loss": 0.1476, + "num_input_tokens_seen": 138859744, + "step": 114120 + }, + { + "epoch": 12.710212718565542, + "grad_norm": 0.2372317612171173, + "learning_rate": 1.764856920693562e-05, + "loss": 0.0183, + "num_input_tokens_seen": 138865792, + "step": 114125 + }, + { + "epoch": 12.71076957344916, + "grad_norm": 0.6368883848190308, + "learning_rate": 1.7646246927886057e-05, + "loss": 0.028, + "num_input_tokens_seen": 138871904, + "step": 114130 + }, + { + "epoch": 12.711326428332777, + "grad_norm": 0.00037378788692876697, + "learning_rate": 1.7643924718298577e-05, + "loss": 0.0678, + "num_input_tokens_seen": 138877952, + "step": 114135 + }, + { + "epoch": 12.711883283216395, + "grad_norm": 0.15735414624214172, + "learning_rate": 1.764160257819513e-05, + "loss": 0.0283, + "num_input_tokens_seen": 138884128, + "step": 114140 + }, + { + "epoch": 12.71244013810001, + "grad_norm": 0.32829001545906067, + "learning_rate": 1.7639280507597637e-05, + "loss": 0.1168, + "num_input_tokens_seen": 138889856, + "step": 114145 + }, + { + "epoch": 12.712996992983628, + "grad_norm": 1.5958905220031738, + "learning_rate": 1.763695850652804e-05, + "loss": 0.0904, + "num_input_tokens_seen": 138896192, + "step": 114150 + }, + { + "epoch": 12.713553847867246, + "grad_norm": 0.042851559817790985, + "learning_rate": 1.7634636575008266e-05, + "loss": 0.0147, + "num_input_tokens_seen": 138902304, + "step": 114155 + }, + { + "epoch": 12.714110702750864, + "grad_norm": 0.7788214683532715, + "learning_rate": 1.7632314713060255e-05, + "loss": 0.0629, + "num_input_tokens_seen": 138908608, + "step": 114160 + }, + { + "epoch": 12.714667557634481, + "grad_norm": 0.3633890151977539, + "learning_rate": 1.7629992920705932e-05, + "loss": 0.0977, + "num_input_tokens_seen": 138914144, + "step": 114165 + }, + { + "epoch": 12.715224412518097, + "grad_norm": 0.01968032866716385, + "learning_rate": 1.7627671197967234e-05, + "loss": 0.0427, + "num_input_tokens_seen": 138920544, + "step": 114170 + }, + { + "epoch": 12.715781267401715, + "grad_norm": 0.9245668649673462, + "learning_rate": 1.7625349544866082e-05, + "loss": 0.0748, + "num_input_tokens_seen": 138926752, + "step": 114175 + }, + { + "epoch": 12.716338122285332, + "grad_norm": 0.0029801346827298403, + "learning_rate": 1.762302796142442e-05, + "loss": 0.0018, + "num_input_tokens_seen": 138933248, + "step": 114180 + }, + { + "epoch": 12.71689497716895, + "grad_norm": 0.41551926732063293, + "learning_rate": 1.762070644766416e-05, + "loss": 0.0186, + "num_input_tokens_seen": 138938720, + "step": 114185 + }, + { + "epoch": 12.717451832052568, + "grad_norm": 0.0003570356930140406, + "learning_rate": 1.7618385003607245e-05, + "loss": 0.0825, + "num_input_tokens_seen": 138944480, + "step": 114190 + }, + { + "epoch": 12.718008686936184, + "grad_norm": 1.876348614692688, + "learning_rate": 1.761606362927559e-05, + "loss": 0.1027, + "num_input_tokens_seen": 138950720, + "step": 114195 + }, + { + "epoch": 12.718565541819801, + "grad_norm": 0.0654926672577858, + "learning_rate": 1.7613742324691146e-05, + "loss": 0.0015, + "num_input_tokens_seen": 138956864, + "step": 114200 + }, + { + "epoch": 12.719122396703419, + "grad_norm": 0.4522102177143097, + "learning_rate": 1.7611421089875806e-05, + "loss": 0.0118, + "num_input_tokens_seen": 138963136, + "step": 114205 + }, + { + "epoch": 12.719679251587037, + "grad_norm": 0.1439911127090454, + "learning_rate": 1.760909992485153e-05, + "loss": 0.0573, + "num_input_tokens_seen": 138969600, + "step": 114210 + }, + { + "epoch": 12.720236106470654, + "grad_norm": 0.9437024593353271, + "learning_rate": 1.7606778829640212e-05, + "loss": 0.0242, + "num_input_tokens_seen": 138975712, + "step": 114215 + }, + { + "epoch": 12.72079296135427, + "grad_norm": 0.15712274610996246, + "learning_rate": 1.76044578042638e-05, + "loss": 0.0145, + "num_input_tokens_seen": 138981952, + "step": 114220 + }, + { + "epoch": 12.721349816237888, + "grad_norm": 0.0187880527228117, + "learning_rate": 1.7602136848744205e-05, + "loss": 0.0014, + "num_input_tokens_seen": 138988064, + "step": 114225 + }, + { + "epoch": 12.721906671121506, + "grad_norm": 0.5243099927902222, + "learning_rate": 1.7599815963103358e-05, + "loss": 0.0347, + "num_input_tokens_seen": 138993568, + "step": 114230 + }, + { + "epoch": 12.722463526005123, + "grad_norm": 0.5047354102134705, + "learning_rate": 1.7597495147363175e-05, + "loss": 0.0679, + "num_input_tokens_seen": 138999680, + "step": 114235 + }, + { + "epoch": 12.723020380888741, + "grad_norm": 0.0001511294103693217, + "learning_rate": 1.7595174401545587e-05, + "loss": 0.0604, + "num_input_tokens_seen": 139005856, + "step": 114240 + }, + { + "epoch": 12.723577235772357, + "grad_norm": 0.15267060697078705, + "learning_rate": 1.75928537256725e-05, + "loss": 0.028, + "num_input_tokens_seen": 139012064, + "step": 114245 + }, + { + "epoch": 12.724134090655975, + "grad_norm": 0.021561868488788605, + "learning_rate": 1.7590533119765855e-05, + "loss": 0.0077, + "num_input_tokens_seen": 139018400, + "step": 114250 + }, + { + "epoch": 12.724690945539592, + "grad_norm": 0.5231889486312866, + "learning_rate": 1.758821258384755e-05, + "loss": 0.0132, + "num_input_tokens_seen": 139024256, + "step": 114255 + }, + { + "epoch": 12.72524780042321, + "grad_norm": 0.029445543885231018, + "learning_rate": 1.7585892117939524e-05, + "loss": 0.0488, + "num_input_tokens_seen": 139030720, + "step": 114260 + }, + { + "epoch": 12.725804655306828, + "grad_norm": 0.011365225538611412, + "learning_rate": 1.7583571722063678e-05, + "loss": 0.0281, + "num_input_tokens_seen": 139037152, + "step": 114265 + }, + { + "epoch": 12.726361510190443, + "grad_norm": 0.07878053933382034, + "learning_rate": 1.758125139624195e-05, + "loss": 0.0411, + "num_input_tokens_seen": 139043136, + "step": 114270 + }, + { + "epoch": 12.726918365074061, + "grad_norm": 0.10318927466869354, + "learning_rate": 1.7578931140496234e-05, + "loss": 0.006, + "num_input_tokens_seen": 139049056, + "step": 114275 + }, + { + "epoch": 12.727475219957679, + "grad_norm": 0.4216702878475189, + "learning_rate": 1.7576610954848472e-05, + "loss": 0.0375, + "num_input_tokens_seen": 139054976, + "step": 114280 + }, + { + "epoch": 12.728032074841297, + "grad_norm": 2.64707088470459, + "learning_rate": 1.7574290839320558e-05, + "loss": 0.0425, + "num_input_tokens_seen": 139061312, + "step": 114285 + }, + { + "epoch": 12.728588929724914, + "grad_norm": 1.3832863569259644, + "learning_rate": 1.7571970793934422e-05, + "loss": 0.0494, + "num_input_tokens_seen": 139067456, + "step": 114290 + }, + { + "epoch": 12.72914578460853, + "grad_norm": 0.4820254445075989, + "learning_rate": 1.756965081871197e-05, + "loss": 0.1932, + "num_input_tokens_seen": 139073504, + "step": 114295 + }, + { + "epoch": 12.729702639492148, + "grad_norm": 0.014890271238982677, + "learning_rate": 1.756733091367512e-05, + "loss": 0.0027, + "num_input_tokens_seen": 139079680, + "step": 114300 + }, + { + "epoch": 12.730259494375765, + "grad_norm": 0.48157161474227905, + "learning_rate": 1.7565011078845783e-05, + "loss": 0.0151, + "num_input_tokens_seen": 139085504, + "step": 114305 + }, + { + "epoch": 12.730816349259383, + "grad_norm": 1.2379951477050781, + "learning_rate": 1.756269131424588e-05, + "loss": 0.0323, + "num_input_tokens_seen": 139091328, + "step": 114310 + }, + { + "epoch": 12.731373204143, + "grad_norm": 0.001325194607488811, + "learning_rate": 1.7560371619897304e-05, + "loss": 0.0017, + "num_input_tokens_seen": 139097824, + "step": 114315 + }, + { + "epoch": 12.731930059026618, + "grad_norm": 0.1288599669933319, + "learning_rate": 1.755805199582199e-05, + "loss": 0.0214, + "num_input_tokens_seen": 139104032, + "step": 114320 + }, + { + "epoch": 12.732486913910234, + "grad_norm": 0.18286655843257904, + "learning_rate": 1.7555732442041822e-05, + "loss": 0.0701, + "num_input_tokens_seen": 139109472, + "step": 114325 + }, + { + "epoch": 12.733043768793852, + "grad_norm": 0.001956317340955138, + "learning_rate": 1.7553412958578746e-05, + "loss": 0.0074, + "num_input_tokens_seen": 139115264, + "step": 114330 + }, + { + "epoch": 12.73360062367747, + "grad_norm": 0.8738968968391418, + "learning_rate": 1.755109354545463e-05, + "loss": 0.1008, + "num_input_tokens_seen": 139121664, + "step": 114335 + }, + { + "epoch": 12.734157478561087, + "grad_norm": 0.0002915085351560265, + "learning_rate": 1.7548774202691426e-05, + "loss": 0.0019, + "num_input_tokens_seen": 139127680, + "step": 114340 + }, + { + "epoch": 12.734714333444705, + "grad_norm": 0.0006758225499652326, + "learning_rate": 1.7546454930311e-05, + "loss": 0.0002, + "num_input_tokens_seen": 139134112, + "step": 114345 + }, + { + "epoch": 12.735271188328321, + "grad_norm": 0.022194288671016693, + "learning_rate": 1.7544135728335286e-05, + "loss": 0.015, + "num_input_tokens_seen": 139140288, + "step": 114350 + }, + { + "epoch": 12.735828043211939, + "grad_norm": 0.15380950272083282, + "learning_rate": 1.7541816596786183e-05, + "loss": 0.0226, + "num_input_tokens_seen": 139146464, + "step": 114355 + }, + { + "epoch": 12.736384898095556, + "grad_norm": 0.0077538699842989445, + "learning_rate": 1.75394975356856e-05, + "loss": 0.0038, + "num_input_tokens_seen": 139152672, + "step": 114360 + }, + { + "epoch": 12.736941752979174, + "grad_norm": 0.06455577164888382, + "learning_rate": 1.7537178545055437e-05, + "loss": 0.0585, + "num_input_tokens_seen": 139158528, + "step": 114365 + }, + { + "epoch": 12.737498607862792, + "grad_norm": 0.00027250274433754385, + "learning_rate": 1.7534859624917607e-05, + "loss": 0.1555, + "num_input_tokens_seen": 139164896, + "step": 114370 + }, + { + "epoch": 12.738055462746408, + "grad_norm": 0.9332330226898193, + "learning_rate": 1.7532540775294005e-05, + "loss": 0.0362, + "num_input_tokens_seen": 139171072, + "step": 114375 + }, + { + "epoch": 12.738612317630025, + "grad_norm": 0.0018086405470967293, + "learning_rate": 1.7530221996206543e-05, + "loss": 0.0274, + "num_input_tokens_seen": 139177056, + "step": 114380 + }, + { + "epoch": 12.739169172513643, + "grad_norm": 0.041000522673130035, + "learning_rate": 1.752790328767711e-05, + "loss": 0.0888, + "num_input_tokens_seen": 139182944, + "step": 114385 + }, + { + "epoch": 12.73972602739726, + "grad_norm": 0.022564085200428963, + "learning_rate": 1.7525584649727623e-05, + "loss": 0.0188, + "num_input_tokens_seen": 139188832, + "step": 114390 + }, + { + "epoch": 12.740282882280878, + "grad_norm": 0.6942830681800842, + "learning_rate": 1.752326608237998e-05, + "loss": 0.0328, + "num_input_tokens_seen": 139194688, + "step": 114395 + }, + { + "epoch": 12.740839737164494, + "grad_norm": 0.25518515706062317, + "learning_rate": 1.752094758565607e-05, + "loss": 0.0972, + "num_input_tokens_seen": 139200864, + "step": 114400 + }, + { + "epoch": 12.741396592048112, + "grad_norm": 0.11196301132440567, + "learning_rate": 1.751862915957781e-05, + "loss": 0.0877, + "num_input_tokens_seen": 139207008, + "step": 114405 + }, + { + "epoch": 12.74195344693173, + "grad_norm": 0.15758362412452698, + "learning_rate": 1.751631080416708e-05, + "loss": 0.0934, + "num_input_tokens_seen": 139213440, + "step": 114410 + }, + { + "epoch": 12.742510301815347, + "grad_norm": 0.017528152093291283, + "learning_rate": 1.751399251944581e-05, + "loss": 0.0026, + "num_input_tokens_seen": 139220064, + "step": 114415 + }, + { + "epoch": 12.743067156698965, + "grad_norm": 0.5271148681640625, + "learning_rate": 1.751167430543586e-05, + "loss": 0.0307, + "num_input_tokens_seen": 139226528, + "step": 114420 + }, + { + "epoch": 12.743624011582583, + "grad_norm": 1.0646398067474365, + "learning_rate": 1.750935616215915e-05, + "loss": 0.035, + "num_input_tokens_seen": 139232288, + "step": 114425 + }, + { + "epoch": 12.744180866466198, + "grad_norm": 0.7986086010932922, + "learning_rate": 1.7507038089637578e-05, + "loss": 0.0358, + "num_input_tokens_seen": 139238560, + "step": 114430 + }, + { + "epoch": 12.744737721349816, + "grad_norm": 1.284866452217102, + "learning_rate": 1.7504720087893034e-05, + "loss": 0.0502, + "num_input_tokens_seen": 139244448, + "step": 114435 + }, + { + "epoch": 12.745294576233434, + "grad_norm": 0.0019359514117240906, + "learning_rate": 1.7502402156947408e-05, + "loss": 0.0012, + "num_input_tokens_seen": 139250592, + "step": 114440 + }, + { + "epoch": 12.745851431117051, + "grad_norm": 0.2883159816265106, + "learning_rate": 1.750008429682261e-05, + "loss": 0.0099, + "num_input_tokens_seen": 139256768, + "step": 114445 + }, + { + "epoch": 12.746408286000669, + "grad_norm": 1.575636863708496, + "learning_rate": 1.7497766507540513e-05, + "loss": 0.0731, + "num_input_tokens_seen": 139263008, + "step": 114450 + }, + { + "epoch": 12.746965140884285, + "grad_norm": 0.01881292648613453, + "learning_rate": 1.7495448789123032e-05, + "loss": 0.0056, + "num_input_tokens_seen": 139269600, + "step": 114455 + }, + { + "epoch": 12.747521995767903, + "grad_norm": 0.5840901732444763, + "learning_rate": 1.7493131141592045e-05, + "loss": 0.0484, + "num_input_tokens_seen": 139274464, + "step": 114460 + }, + { + "epoch": 12.74807885065152, + "grad_norm": 0.0002920180559158325, + "learning_rate": 1.749081356496945e-05, + "loss": 0.0652, + "num_input_tokens_seen": 139280384, + "step": 114465 + }, + { + "epoch": 12.748635705535138, + "grad_norm": 0.3711923658847809, + "learning_rate": 1.748849605927713e-05, + "loss": 0.0268, + "num_input_tokens_seen": 139286560, + "step": 114470 + }, + { + "epoch": 12.749192560418756, + "grad_norm": 0.0009886723710224032, + "learning_rate": 1.7486178624536998e-05, + "loss": 0.0963, + "num_input_tokens_seen": 139292480, + "step": 114475 + }, + { + "epoch": 12.749749415302372, + "grad_norm": 0.15667417645454407, + "learning_rate": 1.748386126077091e-05, + "loss": 0.0209, + "num_input_tokens_seen": 139298368, + "step": 114480 + }, + { + "epoch": 12.75030627018599, + "grad_norm": 0.043977800756692886, + "learning_rate": 1.7481543968000795e-05, + "loss": 0.0037, + "num_input_tokens_seen": 139304736, + "step": 114485 + }, + { + "epoch": 12.750863125069607, + "grad_norm": 0.0012012738734483719, + "learning_rate": 1.7479226746248503e-05, + "loss": 0.1146, + "num_input_tokens_seen": 139310944, + "step": 114490 + }, + { + "epoch": 12.751419979953225, + "grad_norm": 0.9891979694366455, + "learning_rate": 1.747690959553595e-05, + "loss": 0.0261, + "num_input_tokens_seen": 139316896, + "step": 114495 + }, + { + "epoch": 12.751976834836842, + "grad_norm": 0.04052764177322388, + "learning_rate": 1.747459251588501e-05, + "loss": 0.0433, + "num_input_tokens_seen": 139322656, + "step": 114500 + }, + { + "epoch": 12.752533689720458, + "grad_norm": 0.030266204848885536, + "learning_rate": 1.7472275507317577e-05, + "loss": 0.0157, + "num_input_tokens_seen": 139328672, + "step": 114505 + }, + { + "epoch": 12.753090544604076, + "grad_norm": 0.0005969816702418029, + "learning_rate": 1.7469958569855526e-05, + "loss": 0.0028, + "num_input_tokens_seen": 139334656, + "step": 114510 + }, + { + "epoch": 12.753647399487694, + "grad_norm": 0.021977810189127922, + "learning_rate": 1.7467641703520755e-05, + "loss": 0.0462, + "num_input_tokens_seen": 139340832, + "step": 114515 + }, + { + "epoch": 12.754204254371311, + "grad_norm": 1.6520326137542725, + "learning_rate": 1.746532490833514e-05, + "loss": 0.0934, + "num_input_tokens_seen": 139347360, + "step": 114520 + }, + { + "epoch": 12.754761109254929, + "grad_norm": 0.0058489455841481686, + "learning_rate": 1.746300818432057e-05, + "loss": 0.0008, + "num_input_tokens_seen": 139353344, + "step": 114525 + }, + { + "epoch": 12.755317964138545, + "grad_norm": 0.03231247887015343, + "learning_rate": 1.7460691531498923e-05, + "loss": 0.1274, + "num_input_tokens_seen": 139359328, + "step": 114530 + }, + { + "epoch": 12.755874819022162, + "grad_norm": 0.7549070715904236, + "learning_rate": 1.745837494989209e-05, + "loss": 0.0492, + "num_input_tokens_seen": 139365536, + "step": 114535 + }, + { + "epoch": 12.75643167390578, + "grad_norm": 0.001575383939780295, + "learning_rate": 1.7456058439521936e-05, + "loss": 0.0141, + "num_input_tokens_seen": 139371776, + "step": 114540 + }, + { + "epoch": 12.756988528789398, + "grad_norm": 0.0003324183344375342, + "learning_rate": 1.7453742000410374e-05, + "loss": 0.0146, + "num_input_tokens_seen": 139377792, + "step": 114545 + }, + { + "epoch": 12.757545383673015, + "grad_norm": 0.004127695225179195, + "learning_rate": 1.7451425632579244e-05, + "loss": 0.0059, + "num_input_tokens_seen": 139383904, + "step": 114550 + }, + { + "epoch": 12.758102238556631, + "grad_norm": 0.27849680185317993, + "learning_rate": 1.7449109336050456e-05, + "loss": 0.0208, + "num_input_tokens_seen": 139390304, + "step": 114555 + }, + { + "epoch": 12.758659093440249, + "grad_norm": 0.0004696474352385849, + "learning_rate": 1.744679311084588e-05, + "loss": 0.0565, + "num_input_tokens_seen": 139396800, + "step": 114560 + }, + { + "epoch": 12.759215948323867, + "grad_norm": 0.04632839187979698, + "learning_rate": 1.74444769569874e-05, + "loss": 0.0093, + "num_input_tokens_seen": 139402912, + "step": 114565 + }, + { + "epoch": 12.759772803207484, + "grad_norm": 0.0004894821904599667, + "learning_rate": 1.744216087449688e-05, + "loss": 0.1495, + "num_input_tokens_seen": 139409152, + "step": 114570 + }, + { + "epoch": 12.760329658091102, + "grad_norm": 1.5037291049957275, + "learning_rate": 1.743984486339621e-05, + "loss": 0.1477, + "num_input_tokens_seen": 139415584, + "step": 114575 + }, + { + "epoch": 12.760886512974718, + "grad_norm": 0.22285929322242737, + "learning_rate": 1.7437528923707258e-05, + "loss": 0.0382, + "num_input_tokens_seen": 139421888, + "step": 114580 + }, + { + "epoch": 12.761443367858336, + "grad_norm": 0.18600425124168396, + "learning_rate": 1.7435213055451914e-05, + "loss": 0.0601, + "num_input_tokens_seen": 139427904, + "step": 114585 + }, + { + "epoch": 12.762000222741953, + "grad_norm": 0.4982169270515442, + "learning_rate": 1.7432897258652033e-05, + "loss": 0.0262, + "num_input_tokens_seen": 139434048, + "step": 114590 + }, + { + "epoch": 12.762557077625571, + "grad_norm": 3.9197998046875, + "learning_rate": 1.743058153332951e-05, + "loss": 0.0411, + "num_input_tokens_seen": 139440256, + "step": 114595 + }, + { + "epoch": 12.763113932509189, + "grad_norm": 0.12699608504772186, + "learning_rate": 1.7428265879506196e-05, + "loss": 0.0774, + "num_input_tokens_seen": 139446368, + "step": 114600 + }, + { + "epoch": 12.763670787392805, + "grad_norm": 0.01097436249256134, + "learning_rate": 1.7425950297203992e-05, + "loss": 0.0122, + "num_input_tokens_seen": 139452448, + "step": 114605 + }, + { + "epoch": 12.764227642276422, + "grad_norm": 0.16860762238502502, + "learning_rate": 1.7423634786444738e-05, + "loss": 0.0421, + "num_input_tokens_seen": 139458816, + "step": 114610 + }, + { + "epoch": 12.76478449716004, + "grad_norm": 1.2605215311050415, + "learning_rate": 1.7421319347250343e-05, + "loss": 0.0881, + "num_input_tokens_seen": 139464736, + "step": 114615 + }, + { + "epoch": 12.765341352043658, + "grad_norm": 0.7436789274215698, + "learning_rate": 1.741900397964264e-05, + "loss": 0.0358, + "num_input_tokens_seen": 139470304, + "step": 114620 + }, + { + "epoch": 12.765898206927275, + "grad_norm": 0.07977961003780365, + "learning_rate": 1.741668868364353e-05, + "loss": 0.0148, + "num_input_tokens_seen": 139476224, + "step": 114625 + }, + { + "epoch": 12.766455061810891, + "grad_norm": 0.005876309238374233, + "learning_rate": 1.7414373459274867e-05, + "loss": 0.0346, + "num_input_tokens_seen": 139482464, + "step": 114630 + }, + { + "epoch": 12.767011916694509, + "grad_norm": 0.00019586776033975184, + "learning_rate": 1.7412058306558527e-05, + "loss": 0.0123, + "num_input_tokens_seen": 139488544, + "step": 114635 + }, + { + "epoch": 12.767568771578127, + "grad_norm": 0.24181237816810608, + "learning_rate": 1.7409743225516374e-05, + "loss": 0.0947, + "num_input_tokens_seen": 139495040, + "step": 114640 + }, + { + "epoch": 12.768125626461744, + "grad_norm": 1.2264580726623535, + "learning_rate": 1.740742821617028e-05, + "loss": 0.0374, + "num_input_tokens_seen": 139500992, + "step": 114645 + }, + { + "epoch": 12.768682481345362, + "grad_norm": 0.04599433392286301, + "learning_rate": 1.740511327854211e-05, + "loss": 0.0543, + "num_input_tokens_seen": 139507360, + "step": 114650 + }, + { + "epoch": 12.76923933622898, + "grad_norm": 0.0009532411932013929, + "learning_rate": 1.7402798412653727e-05, + "loss": 0.0304, + "num_input_tokens_seen": 139513408, + "step": 114655 + }, + { + "epoch": 12.769796191112595, + "grad_norm": 0.3471611738204956, + "learning_rate": 1.7400483618526996e-05, + "loss": 0.0093, + "num_input_tokens_seen": 139519648, + "step": 114660 + }, + { + "epoch": 12.770353045996213, + "grad_norm": 1.1368372440338135, + "learning_rate": 1.7398168896183794e-05, + "loss": 0.0735, + "num_input_tokens_seen": 139525888, + "step": 114665 + }, + { + "epoch": 12.77090990087983, + "grad_norm": 2.5318541526794434, + "learning_rate": 1.7395854245645966e-05, + "loss": 0.107, + "num_input_tokens_seen": 139532064, + "step": 114670 + }, + { + "epoch": 12.771466755763448, + "grad_norm": 0.18421854078769684, + "learning_rate": 1.73935396669354e-05, + "loss": 0.0131, + "num_input_tokens_seen": 139537952, + "step": 114675 + }, + { + "epoch": 12.772023610647066, + "grad_norm": 0.013111164793372154, + "learning_rate": 1.7391225160073935e-05, + "loss": 0.0229, + "num_input_tokens_seen": 139544384, + "step": 114680 + }, + { + "epoch": 12.772580465530682, + "grad_norm": 0.0018157310551032424, + "learning_rate": 1.7388910725083452e-05, + "loss": 0.0063, + "num_input_tokens_seen": 139550496, + "step": 114685 + }, + { + "epoch": 12.7731373204143, + "grad_norm": 0.4268471598625183, + "learning_rate": 1.73865963619858e-05, + "loss": 0.0177, + "num_input_tokens_seen": 139556384, + "step": 114690 + }, + { + "epoch": 12.773694175297917, + "grad_norm": 0.0112177524715662, + "learning_rate": 1.738428207080285e-05, + "loss": 0.0194, + "num_input_tokens_seen": 139562400, + "step": 114695 + }, + { + "epoch": 12.774251030181535, + "grad_norm": 0.028259843587875366, + "learning_rate": 1.7381967851556456e-05, + "loss": 0.0089, + "num_input_tokens_seen": 139568416, + "step": 114700 + }, + { + "epoch": 12.774807885065153, + "grad_norm": 0.0013823575573042035, + "learning_rate": 1.7379653704268482e-05, + "loss": 0.0034, + "num_input_tokens_seen": 139574688, + "step": 114705 + }, + { + "epoch": 12.775364739948769, + "grad_norm": 0.03873688727617264, + "learning_rate": 1.7377339628960775e-05, + "loss": 0.0795, + "num_input_tokens_seen": 139580992, + "step": 114710 + }, + { + "epoch": 12.775921594832386, + "grad_norm": 0.00674130953848362, + "learning_rate": 1.7375025625655212e-05, + "loss": 0.0104, + "num_input_tokens_seen": 139587424, + "step": 114715 + }, + { + "epoch": 12.776478449716004, + "grad_norm": 0.025552209466695786, + "learning_rate": 1.7372711694373633e-05, + "loss": 0.0211, + "num_input_tokens_seen": 139593696, + "step": 114720 + }, + { + "epoch": 12.777035304599622, + "grad_norm": 0.00020539139222819358, + "learning_rate": 1.7370397835137915e-05, + "loss": 0.0023, + "num_input_tokens_seen": 139599840, + "step": 114725 + }, + { + "epoch": 12.77759215948324, + "grad_norm": 0.0005878459196537733, + "learning_rate": 1.7368084047969885e-05, + "loss": 0.007, + "num_input_tokens_seen": 139606048, + "step": 114730 + }, + { + "epoch": 12.778149014366855, + "grad_norm": 0.128055602312088, + "learning_rate": 1.7365770332891433e-05, + "loss": 0.0133, + "num_input_tokens_seen": 139612032, + "step": 114735 + }, + { + "epoch": 12.778705869250473, + "grad_norm": 0.003617813577875495, + "learning_rate": 1.736345668992438e-05, + "loss": 0.0027, + "num_input_tokens_seen": 139618368, + "step": 114740 + }, + { + "epoch": 12.77926272413409, + "grad_norm": 0.05203763395547867, + "learning_rate": 1.7361143119090613e-05, + "loss": 0.0749, + "num_input_tokens_seen": 139624160, + "step": 114745 + }, + { + "epoch": 12.779819579017708, + "grad_norm": 0.9890792965888977, + "learning_rate": 1.7358829620411955e-05, + "loss": 0.0851, + "num_input_tokens_seen": 139630656, + "step": 114750 + }, + { + "epoch": 12.780376433901326, + "grad_norm": 0.22122061252593994, + "learning_rate": 1.7356516193910283e-05, + "loss": 0.0742, + "num_input_tokens_seen": 139636672, + "step": 114755 + }, + { + "epoch": 12.780933288784942, + "grad_norm": 2.3254716396331787, + "learning_rate": 1.7354202839607432e-05, + "loss": 0.1207, + "num_input_tokens_seen": 139642848, + "step": 114760 + }, + { + "epoch": 12.78149014366856, + "grad_norm": 0.0505639873445034, + "learning_rate": 1.735188955752527e-05, + "loss": 0.0431, + "num_input_tokens_seen": 139648992, + "step": 114765 + }, + { + "epoch": 12.782046998552177, + "grad_norm": 0.062132641673088074, + "learning_rate": 1.734957634768563e-05, + "loss": 0.0603, + "num_input_tokens_seen": 139655136, + "step": 114770 + }, + { + "epoch": 12.782603853435795, + "grad_norm": 0.9794530272483826, + "learning_rate": 1.7347263210110376e-05, + "loss": 0.0576, + "num_input_tokens_seen": 139660864, + "step": 114775 + }, + { + "epoch": 12.783160708319413, + "grad_norm": 1.0851523876190186, + "learning_rate": 1.734495014482135e-05, + "loss": 0.0963, + "num_input_tokens_seen": 139667072, + "step": 114780 + }, + { + "epoch": 12.78371756320303, + "grad_norm": 0.33293214440345764, + "learning_rate": 1.7342637151840407e-05, + "loss": 0.029, + "num_input_tokens_seen": 139673312, + "step": 114785 + }, + { + "epoch": 12.784274418086646, + "grad_norm": 0.10524143278598785, + "learning_rate": 1.7340324231189386e-05, + "loss": 0.148, + "num_input_tokens_seen": 139679488, + "step": 114790 + }, + { + "epoch": 12.784831272970264, + "grad_norm": 0.7399181127548218, + "learning_rate": 1.7338011382890147e-05, + "loss": 0.0628, + "num_input_tokens_seen": 139684928, + "step": 114795 + }, + { + "epoch": 12.785388127853881, + "grad_norm": 0.27280479669570923, + "learning_rate": 1.7335698606964513e-05, + "loss": 0.0079, + "num_input_tokens_seen": 139691488, + "step": 114800 + }, + { + "epoch": 12.7859449827375, + "grad_norm": 0.007140596862882376, + "learning_rate": 1.7333385903434365e-05, + "loss": 0.0025, + "num_input_tokens_seen": 139697824, + "step": 114805 + }, + { + "epoch": 12.786501837621117, + "grad_norm": 0.023270664736628532, + "learning_rate": 1.7331073272321523e-05, + "loss": 0.0018, + "num_input_tokens_seen": 139704352, + "step": 114810 + }, + { + "epoch": 12.787058692504733, + "grad_norm": 0.7595528960227966, + "learning_rate": 1.7328760713647833e-05, + "loss": 0.0241, + "num_input_tokens_seen": 139710816, + "step": 114815 + }, + { + "epoch": 12.78761554738835, + "grad_norm": 0.001688286429271102, + "learning_rate": 1.7326448227435155e-05, + "loss": 0.0008, + "num_input_tokens_seen": 139716992, + "step": 114820 + }, + { + "epoch": 12.788172402271968, + "grad_norm": 0.22569184005260468, + "learning_rate": 1.7324135813705306e-05, + "loss": 0.1016, + "num_input_tokens_seen": 139722976, + "step": 114825 + }, + { + "epoch": 12.788729257155586, + "grad_norm": 0.4196818470954895, + "learning_rate": 1.7321823472480152e-05, + "loss": 0.0179, + "num_input_tokens_seen": 139728992, + "step": 114830 + }, + { + "epoch": 12.789286112039203, + "grad_norm": 1.0258315801620483, + "learning_rate": 1.731951120378153e-05, + "loss": 0.149, + "num_input_tokens_seen": 139734624, + "step": 114835 + }, + { + "epoch": 12.78984296692282, + "grad_norm": 1.9785627126693726, + "learning_rate": 1.7317199007631277e-05, + "loss": 0.0138, + "num_input_tokens_seen": 139740864, + "step": 114840 + }, + { + "epoch": 12.790399821806437, + "grad_norm": 0.0002563198795542121, + "learning_rate": 1.731488688405123e-05, + "loss": 0.074, + "num_input_tokens_seen": 139746848, + "step": 114845 + }, + { + "epoch": 12.790956676690055, + "grad_norm": 0.22435873746871948, + "learning_rate": 1.731257483306324e-05, + "loss": 0.0508, + "num_input_tokens_seen": 139753184, + "step": 114850 + }, + { + "epoch": 12.791513531573672, + "grad_norm": 0.23251323401927948, + "learning_rate": 1.7310262854689134e-05, + "loss": 0.0095, + "num_input_tokens_seen": 139759328, + "step": 114855 + }, + { + "epoch": 12.79207038645729, + "grad_norm": 0.8342938423156738, + "learning_rate": 1.7307950948950764e-05, + "loss": 0.078, + "num_input_tokens_seen": 139765536, + "step": 114860 + }, + { + "epoch": 12.792627241340906, + "grad_norm": 0.9230636954307556, + "learning_rate": 1.730563911586995e-05, + "loss": 0.0588, + "num_input_tokens_seen": 139771584, + "step": 114865 + }, + { + "epoch": 12.793184096224524, + "grad_norm": 0.3244798481464386, + "learning_rate": 1.7303327355468545e-05, + "loss": 0.0264, + "num_input_tokens_seen": 139777856, + "step": 114870 + }, + { + "epoch": 12.793740951108141, + "grad_norm": 0.45671704411506653, + "learning_rate": 1.7301015667768372e-05, + "loss": 0.0112, + "num_input_tokens_seen": 139784096, + "step": 114875 + }, + { + "epoch": 12.794297805991759, + "grad_norm": 0.5566986203193665, + "learning_rate": 1.729870405279129e-05, + "loss": 0.0234, + "num_input_tokens_seen": 139790144, + "step": 114880 + }, + { + "epoch": 12.794854660875377, + "grad_norm": 0.011441466398537159, + "learning_rate": 1.72963925105591e-05, + "loss": 0.0252, + "num_input_tokens_seen": 139796128, + "step": 114885 + }, + { + "epoch": 12.795411515758992, + "grad_norm": 1.552858591079712, + "learning_rate": 1.7294081041093674e-05, + "loss": 0.03, + "num_input_tokens_seen": 139802272, + "step": 114890 + }, + { + "epoch": 12.79596837064261, + "grad_norm": 0.00042543490417301655, + "learning_rate": 1.729176964441681e-05, + "loss": 0.0443, + "num_input_tokens_seen": 139808352, + "step": 114895 + }, + { + "epoch": 12.796525225526228, + "grad_norm": 0.11270458996295929, + "learning_rate": 1.7289458320550365e-05, + "loss": 0.0384, + "num_input_tokens_seen": 139814496, + "step": 114900 + }, + { + "epoch": 12.797082080409846, + "grad_norm": 0.4402664005756378, + "learning_rate": 1.728714706951616e-05, + "loss": 0.0541, + "num_input_tokens_seen": 139820256, + "step": 114905 + }, + { + "epoch": 12.797638935293463, + "grad_norm": 2.4106035232543945, + "learning_rate": 1.7284835891336037e-05, + "loss": 0.1324, + "num_input_tokens_seen": 139825600, + "step": 114910 + }, + { + "epoch": 12.798195790177079, + "grad_norm": 0.02348717674612999, + "learning_rate": 1.7282524786031815e-05, + "loss": 0.0212, + "num_input_tokens_seen": 139831424, + "step": 114915 + }, + { + "epoch": 12.798752645060697, + "grad_norm": 0.004361606203019619, + "learning_rate": 1.7280213753625332e-05, + "loss": 0.0079, + "num_input_tokens_seen": 139837440, + "step": 114920 + }, + { + "epoch": 12.799309499944314, + "grad_norm": 0.006054977420717478, + "learning_rate": 1.727790279413841e-05, + "loss": 0.0058, + "num_input_tokens_seen": 139843424, + "step": 114925 + }, + { + "epoch": 12.799866354827932, + "grad_norm": 0.043261993676424026, + "learning_rate": 1.7275591907592892e-05, + "loss": 0.153, + "num_input_tokens_seen": 139849600, + "step": 114930 + }, + { + "epoch": 12.80042320971155, + "grad_norm": 0.0007185914437286556, + "learning_rate": 1.7273281094010592e-05, + "loss": 0.0135, + "num_input_tokens_seen": 139855808, + "step": 114935 + }, + { + "epoch": 12.800980064595166, + "grad_norm": 0.15007416903972626, + "learning_rate": 1.7270970353413343e-05, + "loss": 0.0075, + "num_input_tokens_seen": 139861824, + "step": 114940 + }, + { + "epoch": 12.801536919478783, + "grad_norm": 0.0034927523229271173, + "learning_rate": 1.7268659685822964e-05, + "loss": 0.0043, + "num_input_tokens_seen": 139868096, + "step": 114945 + }, + { + "epoch": 12.802093774362401, + "grad_norm": 2.3370521068573, + "learning_rate": 1.7266349091261303e-05, + "loss": 0.1546, + "num_input_tokens_seen": 139873792, + "step": 114950 + }, + { + "epoch": 12.802650629246019, + "grad_norm": 0.7620893120765686, + "learning_rate": 1.7264038569750156e-05, + "loss": 0.0546, + "num_input_tokens_seen": 139880000, + "step": 114955 + }, + { + "epoch": 12.803207484129636, + "grad_norm": 0.24345676600933075, + "learning_rate": 1.7261728121311375e-05, + "loss": 0.0531, + "num_input_tokens_seen": 139886144, + "step": 114960 + }, + { + "epoch": 12.803764339013252, + "grad_norm": 0.2009899765253067, + "learning_rate": 1.7259417745966764e-05, + "loss": 0.0124, + "num_input_tokens_seen": 139892576, + "step": 114965 + }, + { + "epoch": 12.80432119389687, + "grad_norm": 0.0001655106316320598, + "learning_rate": 1.725710744373816e-05, + "loss": 0.0041, + "num_input_tokens_seen": 139898240, + "step": 114970 + }, + { + "epoch": 12.804878048780488, + "grad_norm": 2.066664218902588, + "learning_rate": 1.7254797214647373e-05, + "loss": 0.0692, + "num_input_tokens_seen": 139904320, + "step": 114975 + }, + { + "epoch": 12.805434903664105, + "grad_norm": 1.4167232513427734, + "learning_rate": 1.7252487058716238e-05, + "loss": 0.0705, + "num_input_tokens_seen": 139910560, + "step": 114980 + }, + { + "epoch": 12.805991758547723, + "grad_norm": 0.0002551583747845143, + "learning_rate": 1.7250176975966565e-05, + "loss": 0.0128, + "num_input_tokens_seen": 139916384, + "step": 114985 + }, + { + "epoch": 12.806548613431339, + "grad_norm": 0.0626574158668518, + "learning_rate": 1.7247866966420183e-05, + "loss": 0.0725, + "num_input_tokens_seen": 139922272, + "step": 114990 + }, + { + "epoch": 12.807105468314957, + "grad_norm": 0.00010114305041497573, + "learning_rate": 1.7245557030098908e-05, + "loss": 0.0884, + "num_input_tokens_seen": 139928256, + "step": 114995 + }, + { + "epoch": 12.807662323198574, + "grad_norm": 0.022884802892804146, + "learning_rate": 1.7243247167024563e-05, + "loss": 0.0415, + "num_input_tokens_seen": 139934400, + "step": 115000 + }, + { + "epoch": 12.808219178082192, + "grad_norm": 0.055689383298158646, + "learning_rate": 1.7240937377218948e-05, + "loss": 0.014, + "num_input_tokens_seen": 139940256, + "step": 115005 + }, + { + "epoch": 12.80877603296581, + "grad_norm": 0.0005902127595618367, + "learning_rate": 1.7238627660703915e-05, + "loss": 0.0185, + "num_input_tokens_seen": 139946336, + "step": 115010 + }, + { + "epoch": 12.809332887849427, + "grad_norm": 0.004427282605320215, + "learning_rate": 1.7236318017501245e-05, + "loss": 0.004, + "num_input_tokens_seen": 139952576, + "step": 115015 + }, + { + "epoch": 12.809889742733043, + "grad_norm": 0.027237942442297935, + "learning_rate": 1.723400844763279e-05, + "loss": 0.0156, + "num_input_tokens_seen": 139958432, + "step": 115020 + }, + { + "epoch": 12.81044659761666, + "grad_norm": 1.0316098928451538, + "learning_rate": 1.7231698951120328e-05, + "loss": 0.0258, + "num_input_tokens_seen": 139964896, + "step": 115025 + }, + { + "epoch": 12.811003452500278, + "grad_norm": 0.006669668946415186, + "learning_rate": 1.7229389527985707e-05, + "loss": 0.0417, + "num_input_tokens_seen": 139970944, + "step": 115030 + }, + { + "epoch": 12.811560307383896, + "grad_norm": 0.03284507989883423, + "learning_rate": 1.722708017825072e-05, + "loss": 0.0524, + "num_input_tokens_seen": 139976640, + "step": 115035 + }, + { + "epoch": 12.812117162267514, + "grad_norm": 0.06101818010210991, + "learning_rate": 1.72247709019372e-05, + "loss": 0.0103, + "num_input_tokens_seen": 139983008, + "step": 115040 + }, + { + "epoch": 12.81267401715113, + "grad_norm": 0.00023310900724027306, + "learning_rate": 1.7222461699066933e-05, + "loss": 0.1277, + "num_input_tokens_seen": 139989312, + "step": 115045 + }, + { + "epoch": 12.813230872034747, + "grad_norm": 0.00045980571303516626, + "learning_rate": 1.7220152569661756e-05, + "loss": 0.0362, + "num_input_tokens_seen": 139994944, + "step": 115050 + }, + { + "epoch": 12.813787726918365, + "grad_norm": 0.009276434779167175, + "learning_rate": 1.7217843513743467e-05, + "loss": 0.0462, + "num_input_tokens_seen": 140001056, + "step": 115055 + }, + { + "epoch": 12.814344581801983, + "grad_norm": 0.20155282318592072, + "learning_rate": 1.721553453133389e-05, + "loss": 0.0273, + "num_input_tokens_seen": 140007520, + "step": 115060 + }, + { + "epoch": 12.8149014366856, + "grad_norm": 0.5373207926750183, + "learning_rate": 1.721322562245481e-05, + "loss": 0.0403, + "num_input_tokens_seen": 140013440, + "step": 115065 + }, + { + "epoch": 12.815458291569216, + "grad_norm": 0.727338433265686, + "learning_rate": 1.721091678712807e-05, + "loss": 0.0245, + "num_input_tokens_seen": 140019584, + "step": 115070 + }, + { + "epoch": 12.816015146452834, + "grad_norm": 0.28731733560562134, + "learning_rate": 1.7208608025375442e-05, + "loss": 0.0071, + "num_input_tokens_seen": 140025696, + "step": 115075 + }, + { + "epoch": 12.816572001336452, + "grad_norm": 0.4492188096046448, + "learning_rate": 1.7206299337218774e-05, + "loss": 0.0136, + "num_input_tokens_seen": 140031744, + "step": 115080 + }, + { + "epoch": 12.81712885622007, + "grad_norm": 0.7409291863441467, + "learning_rate": 1.7203990722679836e-05, + "loss": 0.0205, + "num_input_tokens_seen": 140037696, + "step": 115085 + }, + { + "epoch": 12.817685711103687, + "grad_norm": 1.724177598953247, + "learning_rate": 1.720168218178046e-05, + "loss": 0.1372, + "num_input_tokens_seen": 140042976, + "step": 115090 + }, + { + "epoch": 12.818242565987303, + "grad_norm": 1.1719080209732056, + "learning_rate": 1.719937371454244e-05, + "loss": 0.0632, + "num_input_tokens_seen": 140049280, + "step": 115095 + }, + { + "epoch": 12.81879942087092, + "grad_norm": 0.335957795381546, + "learning_rate": 1.719706532098759e-05, + "loss": 0.0451, + "num_input_tokens_seen": 140055232, + "step": 115100 + }, + { + "epoch": 12.819356275754538, + "grad_norm": 0.07054691761732101, + "learning_rate": 1.7194757001137707e-05, + "loss": 0.1695, + "num_input_tokens_seen": 140061152, + "step": 115105 + }, + { + "epoch": 12.819913130638156, + "grad_norm": 1.1492156982421875, + "learning_rate": 1.71924487550146e-05, + "loss": 0.037, + "num_input_tokens_seen": 140067008, + "step": 115110 + }, + { + "epoch": 12.820469985521774, + "grad_norm": 0.00041215401142835617, + "learning_rate": 1.7190140582640066e-05, + "loss": 0.0546, + "num_input_tokens_seen": 140073056, + "step": 115115 + }, + { + "epoch": 12.82102684040539, + "grad_norm": 0.022296277806162834, + "learning_rate": 1.7187832484035916e-05, + "loss": 0.048, + "num_input_tokens_seen": 140078624, + "step": 115120 + }, + { + "epoch": 12.821583695289007, + "grad_norm": 0.9580041766166687, + "learning_rate": 1.718552445922394e-05, + "loss": 0.1343, + "num_input_tokens_seen": 140084800, + "step": 115125 + }, + { + "epoch": 12.822140550172625, + "grad_norm": 0.09142977744340897, + "learning_rate": 1.7183216508225954e-05, + "loss": 0.037, + "num_input_tokens_seen": 140090816, + "step": 115130 + }, + { + "epoch": 12.822697405056243, + "grad_norm": 0.0008850984158925712, + "learning_rate": 1.7180908631063742e-05, + "loss": 0.0221, + "num_input_tokens_seen": 140096736, + "step": 115135 + }, + { + "epoch": 12.82325425993986, + "grad_norm": 0.013974166475236416, + "learning_rate": 1.717860082775912e-05, + "loss": 0.0038, + "num_input_tokens_seen": 140102784, + "step": 115140 + }, + { + "epoch": 12.823811114823478, + "grad_norm": 0.027058878913521767, + "learning_rate": 1.7176293098333872e-05, + "loss": 0.096, + "num_input_tokens_seen": 140108864, + "step": 115145 + }, + { + "epoch": 12.824367969707094, + "grad_norm": 0.004392558243125677, + "learning_rate": 1.7173985442809815e-05, + "loss": 0.015, + "num_input_tokens_seen": 140114976, + "step": 115150 + }, + { + "epoch": 12.824924824590711, + "grad_norm": 0.07855726033449173, + "learning_rate": 1.717167786120873e-05, + "loss": 0.0035, + "num_input_tokens_seen": 140121152, + "step": 115155 + }, + { + "epoch": 12.82548167947433, + "grad_norm": 1.1447263956069946, + "learning_rate": 1.7169370353552423e-05, + "loss": 0.0317, + "num_input_tokens_seen": 140127008, + "step": 115160 + }, + { + "epoch": 12.826038534357947, + "grad_norm": 0.04514450207352638, + "learning_rate": 1.716706291986268e-05, + "loss": 0.0892, + "num_input_tokens_seen": 140132992, + "step": 115165 + }, + { + "epoch": 12.826595389241565, + "grad_norm": 0.14804412424564362, + "learning_rate": 1.7164755560161308e-05, + "loss": 0.0073, + "num_input_tokens_seen": 140139392, + "step": 115170 + }, + { + "epoch": 12.82715224412518, + "grad_norm": 0.0009393055806867778, + "learning_rate": 1.7162448274470095e-05, + "loss": 0.0359, + "num_input_tokens_seen": 140145472, + "step": 115175 + }, + { + "epoch": 12.827709099008798, + "grad_norm": 0.05890166014432907, + "learning_rate": 1.716014106281084e-05, + "loss": 0.0207, + "num_input_tokens_seen": 140151424, + "step": 115180 + }, + { + "epoch": 12.828265953892416, + "grad_norm": 0.07219059020280838, + "learning_rate": 1.715783392520533e-05, + "loss": 0.0142, + "num_input_tokens_seen": 140157312, + "step": 115185 + }, + { + "epoch": 12.828822808776033, + "grad_norm": 0.035354457795619965, + "learning_rate": 1.7155526861675365e-05, + "loss": 0.0964, + "num_input_tokens_seen": 140163424, + "step": 115190 + }, + { + "epoch": 12.829379663659651, + "grad_norm": 1.0173022747039795, + "learning_rate": 1.7153219872242727e-05, + "loss": 0.0526, + "num_input_tokens_seen": 140169824, + "step": 115195 + }, + { + "epoch": 12.829936518543267, + "grad_norm": 0.014544868841767311, + "learning_rate": 1.7150912956929226e-05, + "loss": 0.1174, + "num_input_tokens_seen": 140175808, + "step": 115200 + }, + { + "epoch": 12.830493373426885, + "grad_norm": 0.1370500773191452, + "learning_rate": 1.7148606115756627e-05, + "loss": 0.0105, + "num_input_tokens_seen": 140181728, + "step": 115205 + }, + { + "epoch": 12.831050228310502, + "grad_norm": 0.08474286645650864, + "learning_rate": 1.714629934874675e-05, + "loss": 0.0408, + "num_input_tokens_seen": 140187584, + "step": 115210 + }, + { + "epoch": 12.83160708319412, + "grad_norm": 0.011635624803602695, + "learning_rate": 1.714399265592136e-05, + "loss": 0.0225, + "num_input_tokens_seen": 140193760, + "step": 115215 + }, + { + "epoch": 12.832163938077738, + "grad_norm": 0.06545624136924744, + "learning_rate": 1.7141686037302247e-05, + "loss": 0.0193, + "num_input_tokens_seen": 140199360, + "step": 115220 + }, + { + "epoch": 12.832720792961354, + "grad_norm": 0.6677780151367188, + "learning_rate": 1.7139379492911216e-05, + "loss": 0.0174, + "num_input_tokens_seen": 140205472, + "step": 115225 + }, + { + "epoch": 12.833277647844971, + "grad_norm": 0.1287684142589569, + "learning_rate": 1.7137073022770033e-05, + "loss": 0.0022, + "num_input_tokens_seen": 140211584, + "step": 115230 + }, + { + "epoch": 12.833834502728589, + "grad_norm": 1.4832698106765747, + "learning_rate": 1.7134766626900503e-05, + "loss": 0.0625, + "num_input_tokens_seen": 140217888, + "step": 115235 + }, + { + "epoch": 12.834391357612207, + "grad_norm": 0.8202666640281677, + "learning_rate": 1.71324603053244e-05, + "loss": 0.0305, + "num_input_tokens_seen": 140223904, + "step": 115240 + }, + { + "epoch": 12.834948212495824, + "grad_norm": 0.1495206356048584, + "learning_rate": 1.7130154058063517e-05, + "loss": 0.0759, + "num_input_tokens_seen": 140230112, + "step": 115245 + }, + { + "epoch": 12.83550506737944, + "grad_norm": 1.7824500799179077, + "learning_rate": 1.7127847885139625e-05, + "loss": 0.1252, + "num_input_tokens_seen": 140236224, + "step": 115250 + }, + { + "epoch": 12.836061922263058, + "grad_norm": 0.1145128458738327, + "learning_rate": 1.7125541786574527e-05, + "loss": 0.0098, + "num_input_tokens_seen": 140242400, + "step": 115255 + }, + { + "epoch": 12.836618777146676, + "grad_norm": 0.010120649822056293, + "learning_rate": 1.712323576238999e-05, + "loss": 0.0202, + "num_input_tokens_seen": 140248512, + "step": 115260 + }, + { + "epoch": 12.837175632030293, + "grad_norm": 0.003714598948135972, + "learning_rate": 1.7120929812607807e-05, + "loss": 0.0315, + "num_input_tokens_seen": 140254976, + "step": 115265 + }, + { + "epoch": 12.837732486913911, + "grad_norm": 0.688067615032196, + "learning_rate": 1.7118623937249747e-05, + "loss": 0.1146, + "num_input_tokens_seen": 140261376, + "step": 115270 + }, + { + "epoch": 12.838289341797527, + "grad_norm": 0.4130278527736664, + "learning_rate": 1.7116318136337607e-05, + "loss": 0.0239, + "num_input_tokens_seen": 140267104, + "step": 115275 + }, + { + "epoch": 12.838846196681144, + "grad_norm": 0.616732120513916, + "learning_rate": 1.7114012409893143e-05, + "loss": 0.0101, + "num_input_tokens_seen": 140273152, + "step": 115280 + }, + { + "epoch": 12.839403051564762, + "grad_norm": 0.16740825772285461, + "learning_rate": 1.711170675793817e-05, + "loss": 0.0763, + "num_input_tokens_seen": 140279648, + "step": 115285 + }, + { + "epoch": 12.83995990644838, + "grad_norm": 0.015725180506706238, + "learning_rate": 1.710940118049443e-05, + "loss": 0.0389, + "num_input_tokens_seen": 140285632, + "step": 115290 + }, + { + "epoch": 12.840516761331997, + "grad_norm": 0.11898573487997055, + "learning_rate": 1.7107095677583736e-05, + "loss": 0.0839, + "num_input_tokens_seen": 140291392, + "step": 115295 + }, + { + "epoch": 12.841073616215613, + "grad_norm": 0.5739271640777588, + "learning_rate": 1.7104790249227825e-05, + "loss": 0.0247, + "num_input_tokens_seen": 140297824, + "step": 115300 + }, + { + "epoch": 12.841630471099231, + "grad_norm": 1.4811688661575317, + "learning_rate": 1.710248489544851e-05, + "loss": 0.0425, + "num_input_tokens_seen": 140303776, + "step": 115305 + }, + { + "epoch": 12.842187325982849, + "grad_norm": 0.0058268350549042225, + "learning_rate": 1.7100179616267547e-05, + "loss": 0.1224, + "num_input_tokens_seen": 140309792, + "step": 115310 + }, + { + "epoch": 12.842744180866466, + "grad_norm": 0.08591148257255554, + "learning_rate": 1.709787441170672e-05, + "loss": 0.0105, + "num_input_tokens_seen": 140316096, + "step": 115315 + }, + { + "epoch": 12.843301035750084, + "grad_norm": 0.20842145383358002, + "learning_rate": 1.70955692817878e-05, + "loss": 0.0324, + "num_input_tokens_seen": 140322368, + "step": 115320 + }, + { + "epoch": 12.8438578906337, + "grad_norm": 1.1660575866699219, + "learning_rate": 1.7093264226532564e-05, + "loss": 0.0818, + "num_input_tokens_seen": 140328160, + "step": 115325 + }, + { + "epoch": 12.844414745517318, + "grad_norm": 0.007326796650886536, + "learning_rate": 1.7090959245962773e-05, + "loss": 0.1575, + "num_input_tokens_seen": 140334304, + "step": 115330 + }, + { + "epoch": 12.844971600400935, + "grad_norm": 0.25364112854003906, + "learning_rate": 1.7088654340100217e-05, + "loss": 0.0533, + "num_input_tokens_seen": 140340576, + "step": 115335 + }, + { + "epoch": 12.845528455284553, + "grad_norm": 1.4416954517364502, + "learning_rate": 1.7086349508966655e-05, + "loss": 0.144, + "num_input_tokens_seen": 140346400, + "step": 115340 + }, + { + "epoch": 12.84608531016817, + "grad_norm": 2.5963950157165527, + "learning_rate": 1.7084044752583866e-05, + "loss": 0.1124, + "num_input_tokens_seen": 140352000, + "step": 115345 + }, + { + "epoch": 12.846642165051787, + "grad_norm": 0.723171055316925, + "learning_rate": 1.7081740070973608e-05, + "loss": 0.0625, + "num_input_tokens_seen": 140358080, + "step": 115350 + }, + { + "epoch": 12.847199019935404, + "grad_norm": 0.06159047409892082, + "learning_rate": 1.7079435464157674e-05, + "loss": 0.0124, + "num_input_tokens_seen": 140364576, + "step": 115355 + }, + { + "epoch": 12.847755874819022, + "grad_norm": 0.00025618253857828677, + "learning_rate": 1.7077130932157802e-05, + "loss": 0.0053, + "num_input_tokens_seen": 140370624, + "step": 115360 + }, + { + "epoch": 12.84831272970264, + "grad_norm": 1.2203991413116455, + "learning_rate": 1.7074826474995784e-05, + "loss": 0.0282, + "num_input_tokens_seen": 140377056, + "step": 115365 + }, + { + "epoch": 12.848869584586257, + "grad_norm": 1.3949750661849976, + "learning_rate": 1.7072522092693377e-05, + "loss": 0.136, + "num_input_tokens_seen": 140383392, + "step": 115370 + }, + { + "epoch": 12.849426439469875, + "grad_norm": 0.3203597366809845, + "learning_rate": 1.707021778527235e-05, + "loss": 0.0063, + "num_input_tokens_seen": 140389472, + "step": 115375 + }, + { + "epoch": 12.84998329435349, + "grad_norm": 0.09041309356689453, + "learning_rate": 1.7067913552754472e-05, + "loss": 0.0066, + "num_input_tokens_seen": 140395776, + "step": 115380 + }, + { + "epoch": 12.850540149237109, + "grad_norm": 0.6017599701881409, + "learning_rate": 1.706560939516151e-05, + "loss": 0.0285, + "num_input_tokens_seen": 140401760, + "step": 115385 + }, + { + "epoch": 12.851097004120726, + "grad_norm": 0.13984297215938568, + "learning_rate": 1.7063305312515215e-05, + "loss": 0.0204, + "num_input_tokens_seen": 140407744, + "step": 115390 + }, + { + "epoch": 12.851653859004344, + "grad_norm": 1.2013429403305054, + "learning_rate": 1.7061001304837364e-05, + "loss": 0.028, + "num_input_tokens_seen": 140414016, + "step": 115395 + }, + { + "epoch": 12.852210713887962, + "grad_norm": 0.09187877923250198, + "learning_rate": 1.7058697372149714e-05, + "loss": 0.1174, + "num_input_tokens_seen": 140420448, + "step": 115400 + }, + { + "epoch": 12.852767568771577, + "grad_norm": 1.1741989850997925, + "learning_rate": 1.7056393514474035e-05, + "loss": 0.0428, + "num_input_tokens_seen": 140426656, + "step": 115405 + }, + { + "epoch": 12.853324423655195, + "grad_norm": 0.00019171142776031047, + "learning_rate": 1.7054089731832075e-05, + "loss": 0.0872, + "num_input_tokens_seen": 140432960, + "step": 115410 + }, + { + "epoch": 12.853881278538813, + "grad_norm": 0.007690606638789177, + "learning_rate": 1.7051786024245613e-05, + "loss": 0.0809, + "num_input_tokens_seen": 140438816, + "step": 115415 + }, + { + "epoch": 12.85443813342243, + "grad_norm": 1.5711467266082764, + "learning_rate": 1.704948239173639e-05, + "loss": 0.1117, + "num_input_tokens_seen": 140444960, + "step": 115420 + }, + { + "epoch": 12.854994988306048, + "grad_norm": 0.2028823345899582, + "learning_rate": 1.7047178834326184e-05, + "loss": 0.0368, + "num_input_tokens_seen": 140451264, + "step": 115425 + }, + { + "epoch": 12.855551843189664, + "grad_norm": 0.02219536155462265, + "learning_rate": 1.7044875352036744e-05, + "loss": 0.1057, + "num_input_tokens_seen": 140457312, + "step": 115430 + }, + { + "epoch": 12.856108698073282, + "grad_norm": 0.02055487222969532, + "learning_rate": 1.704257194488983e-05, + "loss": 0.0121, + "num_input_tokens_seen": 140463424, + "step": 115435 + }, + { + "epoch": 12.8566655529569, + "grad_norm": 1.3591212034225464, + "learning_rate": 1.7040268612907195e-05, + "loss": 0.0788, + "num_input_tokens_seen": 140469696, + "step": 115440 + }, + { + "epoch": 12.857222407840517, + "grad_norm": 2.0800764560699463, + "learning_rate": 1.7037965356110608e-05, + "loss": 0.1668, + "num_input_tokens_seen": 140475936, + "step": 115445 + }, + { + "epoch": 12.857779262724135, + "grad_norm": 0.04091637581586838, + "learning_rate": 1.703566217452181e-05, + "loss": 0.0084, + "num_input_tokens_seen": 140481920, + "step": 115450 + }, + { + "epoch": 12.85833611760775, + "grad_norm": 0.0009380620322190225, + "learning_rate": 1.7033359068162567e-05, + "loss": 0.0071, + "num_input_tokens_seen": 140488000, + "step": 115455 + }, + { + "epoch": 12.858892972491368, + "grad_norm": 0.18144011497497559, + "learning_rate": 1.7031056037054632e-05, + "loss": 0.0716, + "num_input_tokens_seen": 140494240, + "step": 115460 + }, + { + "epoch": 12.859449827374986, + "grad_norm": 0.1254647821187973, + "learning_rate": 1.7028753081219757e-05, + "loss": 0.0272, + "num_input_tokens_seen": 140500384, + "step": 115465 + }, + { + "epoch": 12.860006682258604, + "grad_norm": 0.2807573080062866, + "learning_rate": 1.7026450200679693e-05, + "loss": 0.0335, + "num_input_tokens_seen": 140506464, + "step": 115470 + }, + { + "epoch": 12.860563537142221, + "grad_norm": 0.5026654601097107, + "learning_rate": 1.7024147395456197e-05, + "loss": 0.0381, + "num_input_tokens_seen": 140512448, + "step": 115475 + }, + { + "epoch": 12.861120392025839, + "grad_norm": 0.0005609226645901799, + "learning_rate": 1.7021844665571013e-05, + "loss": 0.016, + "num_input_tokens_seen": 140518240, + "step": 115480 + }, + { + "epoch": 12.861677246909455, + "grad_norm": 0.3092610836029053, + "learning_rate": 1.701954201104591e-05, + "loss": 0.0138, + "num_input_tokens_seen": 140524800, + "step": 115485 + }, + { + "epoch": 12.862234101793073, + "grad_norm": 0.04723915085196495, + "learning_rate": 1.701723943190261e-05, + "loss": 0.017, + "num_input_tokens_seen": 140530784, + "step": 115490 + }, + { + "epoch": 12.86279095667669, + "grad_norm": 0.055837418884038925, + "learning_rate": 1.7014936928162895e-05, + "loss": 0.0524, + "num_input_tokens_seen": 140536736, + "step": 115495 + }, + { + "epoch": 12.863347811560308, + "grad_norm": 0.029541144147515297, + "learning_rate": 1.701263449984849e-05, + "loss": 0.0575, + "num_input_tokens_seen": 140542720, + "step": 115500 + }, + { + "epoch": 12.863904666443926, + "grad_norm": 0.0006449425127357244, + "learning_rate": 1.7010332146981155e-05, + "loss": 0.0377, + "num_input_tokens_seen": 140549120, + "step": 115505 + }, + { + "epoch": 12.864461521327542, + "grad_norm": 0.1215628907084465, + "learning_rate": 1.700802986958263e-05, + "loss": 0.0115, + "num_input_tokens_seen": 140555296, + "step": 115510 + }, + { + "epoch": 12.86501837621116, + "grad_norm": 0.9607476592063904, + "learning_rate": 1.700572766767467e-05, + "loss": 0.0832, + "num_input_tokens_seen": 140561472, + "step": 115515 + }, + { + "epoch": 12.865575231094777, + "grad_norm": 1.06732976436615, + "learning_rate": 1.7003425541279016e-05, + "loss": 0.0881, + "num_input_tokens_seen": 140567968, + "step": 115520 + }, + { + "epoch": 12.866132085978395, + "grad_norm": 0.0017644795589148998, + "learning_rate": 1.7001123490417418e-05, + "loss": 0.1009, + "num_input_tokens_seen": 140574016, + "step": 115525 + }, + { + "epoch": 12.866688940862012, + "grad_norm": 0.024342598393559456, + "learning_rate": 1.699882151511161e-05, + "loss": 0.1086, + "num_input_tokens_seen": 140580032, + "step": 115530 + }, + { + "epoch": 12.867245795745628, + "grad_norm": 1.5992902517318726, + "learning_rate": 1.699651961538335e-05, + "loss": 0.0555, + "num_input_tokens_seen": 140585856, + "step": 115535 + }, + { + "epoch": 12.867802650629246, + "grad_norm": 0.5650725960731506, + "learning_rate": 1.6994217791254365e-05, + "loss": 0.0111, + "num_input_tokens_seen": 140591776, + "step": 115540 + }, + { + "epoch": 12.868359505512863, + "grad_norm": 0.0013178064255043864, + "learning_rate": 1.699191604274642e-05, + "loss": 0.0298, + "num_input_tokens_seen": 140597504, + "step": 115545 + }, + { + "epoch": 12.868916360396481, + "grad_norm": 0.0008003379334695637, + "learning_rate": 1.698961436988123e-05, + "loss": 0.0074, + "num_input_tokens_seen": 140603904, + "step": 115550 + }, + { + "epoch": 12.869473215280099, + "grad_norm": 0.006954172160476446, + "learning_rate": 1.698731277268056e-05, + "loss": 0.0106, + "num_input_tokens_seen": 140610176, + "step": 115555 + }, + { + "epoch": 12.870030070163715, + "grad_norm": 0.23867446184158325, + "learning_rate": 1.6985011251166137e-05, + "loss": 0.0144, + "num_input_tokens_seen": 140616160, + "step": 115560 + }, + { + "epoch": 12.870586925047332, + "grad_norm": 0.453393816947937, + "learning_rate": 1.698270980535971e-05, + "loss": 0.008, + "num_input_tokens_seen": 140622272, + "step": 115565 + }, + { + "epoch": 12.87114377993095, + "grad_norm": 0.8515182137489319, + "learning_rate": 1.6980408435283e-05, + "loss": 0.0438, + "num_input_tokens_seen": 140628544, + "step": 115570 + }, + { + "epoch": 12.871700634814568, + "grad_norm": 0.003587007988244295, + "learning_rate": 1.6978107140957773e-05, + "loss": 0.011, + "num_input_tokens_seen": 140634720, + "step": 115575 + }, + { + "epoch": 12.872257489698185, + "grad_norm": 1.139606237411499, + "learning_rate": 1.697580592240574e-05, + "loss": 0.0759, + "num_input_tokens_seen": 140640832, + "step": 115580 + }, + { + "epoch": 12.872814344581801, + "grad_norm": 0.0042482540011405945, + "learning_rate": 1.697350477964865e-05, + "loss": 0.058, + "num_input_tokens_seen": 140646592, + "step": 115585 + }, + { + "epoch": 12.873371199465419, + "grad_norm": 0.19544290006160736, + "learning_rate": 1.697120371270824e-05, + "loss": 0.0043, + "num_input_tokens_seen": 140652512, + "step": 115590 + }, + { + "epoch": 12.873928054349037, + "grad_norm": 0.20727069675922394, + "learning_rate": 1.6968902721606248e-05, + "loss": 0.0471, + "num_input_tokens_seen": 140658368, + "step": 115595 + }, + { + "epoch": 12.874484909232654, + "grad_norm": 0.7822268009185791, + "learning_rate": 1.6966601806364392e-05, + "loss": 0.0501, + "num_input_tokens_seen": 140664512, + "step": 115600 + }, + { + "epoch": 12.875041764116272, + "grad_norm": 1.180764079093933, + "learning_rate": 1.696430096700443e-05, + "loss": 0.1231, + "num_input_tokens_seen": 140670880, + "step": 115605 + }, + { + "epoch": 12.875598618999888, + "grad_norm": 1.4406192302703857, + "learning_rate": 1.6962000203548076e-05, + "loss": 0.0301, + "num_input_tokens_seen": 140676512, + "step": 115610 + }, + { + "epoch": 12.876155473883506, + "grad_norm": 0.0026144608855247498, + "learning_rate": 1.695969951601708e-05, + "loss": 0.0138, + "num_input_tokens_seen": 140682432, + "step": 115615 + }, + { + "epoch": 12.876712328767123, + "grad_norm": 0.14485542476177216, + "learning_rate": 1.695739890443315e-05, + "loss": 0.0107, + "num_input_tokens_seen": 140688352, + "step": 115620 + }, + { + "epoch": 12.877269183650741, + "grad_norm": 0.07747345417737961, + "learning_rate": 1.695509836881804e-05, + "loss": 0.0107, + "num_input_tokens_seen": 140694560, + "step": 115625 + }, + { + "epoch": 12.877826038534359, + "grad_norm": 0.005963695235550404, + "learning_rate": 1.695279790919348e-05, + "loss": 0.0459, + "num_input_tokens_seen": 140700832, + "step": 115630 + }, + { + "epoch": 12.878382893417974, + "grad_norm": 0.6394293308258057, + "learning_rate": 1.695049752558117e-05, + "loss": 0.0287, + "num_input_tokens_seen": 140707136, + "step": 115635 + }, + { + "epoch": 12.878939748301592, + "grad_norm": 0.04785439744591713, + "learning_rate": 1.6948197218002875e-05, + "loss": 0.0478, + "num_input_tokens_seen": 140713440, + "step": 115640 + }, + { + "epoch": 12.87949660318521, + "grad_norm": 1.9340392351150513, + "learning_rate": 1.6945896986480302e-05, + "loss": 0.0678, + "num_input_tokens_seen": 140719648, + "step": 115645 + }, + { + "epoch": 12.880053458068828, + "grad_norm": 0.0045484090223908424, + "learning_rate": 1.6943596831035192e-05, + "loss": 0.0055, + "num_input_tokens_seen": 140726176, + "step": 115650 + }, + { + "epoch": 12.880610312952445, + "grad_norm": 0.02274727076292038, + "learning_rate": 1.6941296751689257e-05, + "loss": 0.051, + "num_input_tokens_seen": 140732416, + "step": 115655 + }, + { + "epoch": 12.881167167836061, + "grad_norm": 0.6272582411766052, + "learning_rate": 1.6938996748464235e-05, + "loss": 0.0362, + "num_input_tokens_seen": 140738656, + "step": 115660 + }, + { + "epoch": 12.881724022719679, + "grad_norm": 0.00010085180110763758, + "learning_rate": 1.693669682138184e-05, + "loss": 0.0514, + "num_input_tokens_seen": 140744800, + "step": 115665 + }, + { + "epoch": 12.882280877603296, + "grad_norm": 0.000249709963100031, + "learning_rate": 1.6934396970463816e-05, + "loss": 0.0118, + "num_input_tokens_seen": 140750752, + "step": 115670 + }, + { + "epoch": 12.882837732486914, + "grad_norm": 0.12279052287340164, + "learning_rate": 1.6932097195731864e-05, + "loss": 0.0277, + "num_input_tokens_seen": 140756576, + "step": 115675 + }, + { + "epoch": 12.883394587370532, + "grad_norm": 0.24811461567878723, + "learning_rate": 1.6929797497207724e-05, + "loss": 0.0296, + "num_input_tokens_seen": 140762080, + "step": 115680 + }, + { + "epoch": 12.883951442254148, + "grad_norm": 0.2938348650932312, + "learning_rate": 1.69274978749131e-05, + "loss": 0.0075, + "num_input_tokens_seen": 140768480, + "step": 115685 + }, + { + "epoch": 12.884508297137765, + "grad_norm": 0.397146612405777, + "learning_rate": 1.6925198328869747e-05, + "loss": 0.0344, + "num_input_tokens_seen": 140774912, + "step": 115690 + }, + { + "epoch": 12.885065152021383, + "grad_norm": 0.23370537161827087, + "learning_rate": 1.6922898859099346e-05, + "loss": 0.0435, + "num_input_tokens_seen": 140780992, + "step": 115695 + }, + { + "epoch": 12.885622006905, + "grad_norm": 0.14877460896968842, + "learning_rate": 1.692059946562365e-05, + "loss": 0.0247, + "num_input_tokens_seen": 140786912, + "step": 115700 + }, + { + "epoch": 12.886178861788618, + "grad_norm": 0.17322547733783722, + "learning_rate": 1.6918300148464354e-05, + "loss": 0.0326, + "num_input_tokens_seen": 140792768, + "step": 115705 + }, + { + "epoch": 12.886735716672234, + "grad_norm": 0.0008565189782530069, + "learning_rate": 1.6916000907643198e-05, + "loss": 0.0454, + "num_input_tokens_seen": 140798880, + "step": 115710 + }, + { + "epoch": 12.887292571555852, + "grad_norm": 0.0003860844881273806, + "learning_rate": 1.6913701743181883e-05, + "loss": 0.0189, + "num_input_tokens_seen": 140805056, + "step": 115715 + }, + { + "epoch": 12.88784942643947, + "grad_norm": 0.0004073225427418947, + "learning_rate": 1.691140265510214e-05, + "loss": 0.0653, + "num_input_tokens_seen": 140811360, + "step": 115720 + }, + { + "epoch": 12.888406281323087, + "grad_norm": 0.0013027057284489274, + "learning_rate": 1.6909103643425677e-05, + "loss": 0.0094, + "num_input_tokens_seen": 140817696, + "step": 115725 + }, + { + "epoch": 12.888963136206705, + "grad_norm": 0.0020334708970040083, + "learning_rate": 1.690680470817421e-05, + "loss": 0.0498, + "num_input_tokens_seen": 140824128, + "step": 115730 + }, + { + "epoch": 12.889519991090323, + "grad_norm": 2.1320273876190186, + "learning_rate": 1.6904505849369458e-05, + "loss": 0.1733, + "num_input_tokens_seen": 140830560, + "step": 115735 + }, + { + "epoch": 12.890076845973939, + "grad_norm": 0.011155000887811184, + "learning_rate": 1.690220706703314e-05, + "loss": 0.0003, + "num_input_tokens_seen": 140836960, + "step": 115740 + }, + { + "epoch": 12.890633700857556, + "grad_norm": 1.3910341262817383, + "learning_rate": 1.6899908361186957e-05, + "loss": 0.0449, + "num_input_tokens_seen": 140842976, + "step": 115745 + }, + { + "epoch": 12.891190555741174, + "grad_norm": 0.017078304663300514, + "learning_rate": 1.689760973185263e-05, + "loss": 0.0411, + "num_input_tokens_seen": 140849664, + "step": 115750 + }, + { + "epoch": 12.891747410624792, + "grad_norm": 0.0004383905616123229, + "learning_rate": 1.6895311179051866e-05, + "loss": 0.0412, + "num_input_tokens_seen": 140855616, + "step": 115755 + }, + { + "epoch": 12.89230426550841, + "grad_norm": 0.0009231196017935872, + "learning_rate": 1.6893012702806393e-05, + "loss": 0.015, + "num_input_tokens_seen": 140861792, + "step": 115760 + }, + { + "epoch": 12.892861120392025, + "grad_norm": 0.0002769326092675328, + "learning_rate": 1.68907143031379e-05, + "loss": 0.0013, + "num_input_tokens_seen": 140868128, + "step": 115765 + }, + { + "epoch": 12.893417975275643, + "grad_norm": 0.04416141286492348, + "learning_rate": 1.688841598006811e-05, + "loss": 0.0238, + "num_input_tokens_seen": 140874528, + "step": 115770 + }, + { + "epoch": 12.89397483015926, + "grad_norm": 0.0003024028264917433, + "learning_rate": 1.688611773361873e-05, + "loss": 0.0324, + "num_input_tokens_seen": 140880608, + "step": 115775 + }, + { + "epoch": 12.894531685042878, + "grad_norm": 4.480502605438232, + "learning_rate": 1.6883819563811477e-05, + "loss": 0.0861, + "num_input_tokens_seen": 140886784, + "step": 115780 + }, + { + "epoch": 12.895088539926496, + "grad_norm": 0.7316274046897888, + "learning_rate": 1.6881521470668038e-05, + "loss": 0.0339, + "num_input_tokens_seen": 140893024, + "step": 115785 + }, + { + "epoch": 12.895645394810112, + "grad_norm": 0.41823291778564453, + "learning_rate": 1.687922345421014e-05, + "loss": 0.028, + "num_input_tokens_seen": 140899264, + "step": 115790 + }, + { + "epoch": 12.89620224969373, + "grad_norm": 0.005713083315640688, + "learning_rate": 1.6876925514459483e-05, + "loss": 0.009, + "num_input_tokens_seen": 140905344, + "step": 115795 + }, + { + "epoch": 12.896759104577347, + "grad_norm": 1.7612857818603516, + "learning_rate": 1.6874627651437773e-05, + "loss": 0.1948, + "num_input_tokens_seen": 140910944, + "step": 115800 + }, + { + "epoch": 12.897315959460965, + "grad_norm": 1.61869478225708, + "learning_rate": 1.687232986516671e-05, + "loss": 0.1022, + "num_input_tokens_seen": 140917120, + "step": 115805 + }, + { + "epoch": 12.897872814344582, + "grad_norm": 0.5673879384994507, + "learning_rate": 1.687003215566801e-05, + "loss": 0.022, + "num_input_tokens_seen": 140922976, + "step": 115810 + }, + { + "epoch": 12.898429669228198, + "grad_norm": 0.5864458680152893, + "learning_rate": 1.6867734522963357e-05, + "loss": 0.063, + "num_input_tokens_seen": 140929088, + "step": 115815 + }, + { + "epoch": 12.898986524111816, + "grad_norm": 0.9642155766487122, + "learning_rate": 1.686543696707449e-05, + "loss": 0.0679, + "num_input_tokens_seen": 140935552, + "step": 115820 + }, + { + "epoch": 12.899543378995434, + "grad_norm": 0.007475394289940596, + "learning_rate": 1.6863139488023065e-05, + "loss": 0.0024, + "num_input_tokens_seen": 140941760, + "step": 115825 + }, + { + "epoch": 12.900100233879051, + "grad_norm": 0.3161403238773346, + "learning_rate": 1.686084208583082e-05, + "loss": 0.0534, + "num_input_tokens_seen": 140947520, + "step": 115830 + }, + { + "epoch": 12.900657088762669, + "grad_norm": 0.9025158882141113, + "learning_rate": 1.6858544760519433e-05, + "loss": 0.0611, + "num_input_tokens_seen": 140953504, + "step": 115835 + }, + { + "epoch": 12.901213943646287, + "grad_norm": 0.4677062928676605, + "learning_rate": 1.6856247512110623e-05, + "loss": 0.0327, + "num_input_tokens_seen": 140959872, + "step": 115840 + }, + { + "epoch": 12.901770798529903, + "grad_norm": 0.07230760902166367, + "learning_rate": 1.6853950340626075e-05, + "loss": 0.0477, + "num_input_tokens_seen": 140966112, + "step": 115845 + }, + { + "epoch": 12.90232765341352, + "grad_norm": 0.5543136596679688, + "learning_rate": 1.6851653246087494e-05, + "loss": 0.0961, + "num_input_tokens_seen": 140972352, + "step": 115850 + }, + { + "epoch": 12.902884508297138, + "grad_norm": 0.3599923551082611, + "learning_rate": 1.6849356228516575e-05, + "loss": 0.0261, + "num_input_tokens_seen": 140977952, + "step": 115855 + }, + { + "epoch": 12.903441363180756, + "grad_norm": 0.02897726744413376, + "learning_rate": 1.6847059287935018e-05, + "loss": 0.0228, + "num_input_tokens_seen": 140984128, + "step": 115860 + }, + { + "epoch": 12.903998218064373, + "grad_norm": 0.010709701105952263, + "learning_rate": 1.6844762424364517e-05, + "loss": 0.1034, + "num_input_tokens_seen": 140989472, + "step": 115865 + }, + { + "epoch": 12.90455507294799, + "grad_norm": 0.030985990539193153, + "learning_rate": 1.6842465637826774e-05, + "loss": 0.0045, + "num_input_tokens_seen": 140995488, + "step": 115870 + }, + { + "epoch": 12.905111927831607, + "grad_norm": 0.10593961924314499, + "learning_rate": 1.6840168928343463e-05, + "loss": 0.0428, + "num_input_tokens_seen": 141001120, + "step": 115875 + }, + { + "epoch": 12.905668782715225, + "grad_norm": 0.7365128397941589, + "learning_rate": 1.6837872295936317e-05, + "loss": 0.0461, + "num_input_tokens_seen": 141007296, + "step": 115880 + }, + { + "epoch": 12.906225637598842, + "grad_norm": 0.008268797770142555, + "learning_rate": 1.6835575740626985e-05, + "loss": 0.0597, + "num_input_tokens_seen": 141013248, + "step": 115885 + }, + { + "epoch": 12.90678249248246, + "grad_norm": 0.5804177522659302, + "learning_rate": 1.68332792624372e-05, + "loss": 0.0545, + "num_input_tokens_seen": 141019392, + "step": 115890 + }, + { + "epoch": 12.907339347366076, + "grad_norm": 0.008579852059483528, + "learning_rate": 1.683098286138862e-05, + "loss": 0.004, + "num_input_tokens_seen": 141025504, + "step": 115895 + }, + { + "epoch": 12.907896202249693, + "grad_norm": 0.04835652559995651, + "learning_rate": 1.682868653750296e-05, + "loss": 0.0381, + "num_input_tokens_seen": 141031680, + "step": 115900 + }, + { + "epoch": 12.908453057133311, + "grad_norm": 1.6791791915893555, + "learning_rate": 1.68263902908019e-05, + "loss": 0.0771, + "num_input_tokens_seen": 141037792, + "step": 115905 + }, + { + "epoch": 12.909009912016929, + "grad_norm": 0.0031112716533243656, + "learning_rate": 1.6824094121307136e-05, + "loss": 0.0097, + "num_input_tokens_seen": 141043520, + "step": 115910 + }, + { + "epoch": 12.909566766900546, + "grad_norm": 0.08896086364984512, + "learning_rate": 1.682179802904035e-05, + "loss": 0.0079, + "num_input_tokens_seen": 141049792, + "step": 115915 + }, + { + "epoch": 12.910123621784162, + "grad_norm": 0.0003925264463759959, + "learning_rate": 1.6819502014023236e-05, + "loss": 0.0191, + "num_input_tokens_seen": 141055328, + "step": 115920 + }, + { + "epoch": 12.91068047666778, + "grad_norm": 0.15350989997386932, + "learning_rate": 1.6817206076277474e-05, + "loss": 0.0315, + "num_input_tokens_seen": 141061696, + "step": 115925 + }, + { + "epoch": 12.911237331551398, + "grad_norm": 0.6196745038032532, + "learning_rate": 1.6814910215824765e-05, + "loss": 0.0252, + "num_input_tokens_seen": 141067904, + "step": 115930 + }, + { + "epoch": 12.911794186435015, + "grad_norm": 0.00014150066999718547, + "learning_rate": 1.6812614432686778e-05, + "loss": 0.0348, + "num_input_tokens_seen": 141073440, + "step": 115935 + }, + { + "epoch": 12.912351041318633, + "grad_norm": 0.0031159291975200176, + "learning_rate": 1.6810318726885217e-05, + "loss": 0.0286, + "num_input_tokens_seen": 141079680, + "step": 115940 + }, + { + "epoch": 12.912907896202249, + "grad_norm": 0.006458669435232878, + "learning_rate": 1.6808023098441744e-05, + "loss": 0.0173, + "num_input_tokens_seen": 141085696, + "step": 115945 + }, + { + "epoch": 12.913464751085867, + "grad_norm": 0.0952722355723381, + "learning_rate": 1.680572754737807e-05, + "loss": 0.0223, + "num_input_tokens_seen": 141091456, + "step": 115950 + }, + { + "epoch": 12.914021605969484, + "grad_norm": 0.015869013965129852, + "learning_rate": 1.680343207371585e-05, + "loss": 0.0285, + "num_input_tokens_seen": 141097600, + "step": 115955 + }, + { + "epoch": 12.914578460853102, + "grad_norm": 0.0006573207792825997, + "learning_rate": 1.680113667747679e-05, + "loss": 0.0085, + "num_input_tokens_seen": 141103968, + "step": 115960 + }, + { + "epoch": 12.91513531573672, + "grad_norm": 0.07034211605787277, + "learning_rate": 1.6798841358682564e-05, + "loss": 0.0868, + "num_input_tokens_seen": 141109856, + "step": 115965 + }, + { + "epoch": 12.915692170620336, + "grad_norm": 1.1771020889282227, + "learning_rate": 1.6796546117354853e-05, + "loss": 0.0423, + "num_input_tokens_seen": 141116256, + "step": 115970 + }, + { + "epoch": 12.916249025503953, + "grad_norm": 0.0023291900288313627, + "learning_rate": 1.6794250953515332e-05, + "loss": 0.0234, + "num_input_tokens_seen": 141122560, + "step": 115975 + }, + { + "epoch": 12.916805880387571, + "grad_norm": 0.20915047824382782, + "learning_rate": 1.679195586718569e-05, + "loss": 0.0495, + "num_input_tokens_seen": 141128480, + "step": 115980 + }, + { + "epoch": 12.917362735271189, + "grad_norm": 0.0029970924369990826, + "learning_rate": 1.6789660858387593e-05, + "loss": 0.002, + "num_input_tokens_seen": 141134816, + "step": 115985 + }, + { + "epoch": 12.917919590154806, + "grad_norm": 0.6064392924308777, + "learning_rate": 1.6787365927142734e-05, + "loss": 0.0556, + "num_input_tokens_seen": 141140832, + "step": 115990 + }, + { + "epoch": 12.918476445038422, + "grad_norm": 0.0005172262899577618, + "learning_rate": 1.678507107347278e-05, + "loss": 0.0455, + "num_input_tokens_seen": 141146880, + "step": 115995 + }, + { + "epoch": 12.91903329992204, + "grad_norm": 0.4588295519351959, + "learning_rate": 1.6782776297399416e-05, + "loss": 0.0519, + "num_input_tokens_seen": 141152576, + "step": 116000 + }, + { + "epoch": 12.919590154805658, + "grad_norm": 0.2871452569961548, + "learning_rate": 1.6780481598944303e-05, + "loss": 0.0091, + "num_input_tokens_seen": 141158496, + "step": 116005 + }, + { + "epoch": 12.920147009689275, + "grad_norm": 0.08883661776781082, + "learning_rate": 1.6778186978129144e-05, + "loss": 0.0774, + "num_input_tokens_seen": 141164480, + "step": 116010 + }, + { + "epoch": 12.920703864572893, + "grad_norm": 0.00028523150831460953, + "learning_rate": 1.6775892434975577e-05, + "loss": 0.0021, + "num_input_tokens_seen": 141170368, + "step": 116015 + }, + { + "epoch": 12.921260719456509, + "grad_norm": 0.00491537619382143, + "learning_rate": 1.6773597969505312e-05, + "loss": 0.0131, + "num_input_tokens_seen": 141176608, + "step": 116020 + }, + { + "epoch": 12.921817574340126, + "grad_norm": 0.012698976323008537, + "learning_rate": 1.677130358173999e-05, + "loss": 0.0168, + "num_input_tokens_seen": 141182944, + "step": 116025 + }, + { + "epoch": 12.922374429223744, + "grad_norm": 0.06568478792905807, + "learning_rate": 1.6769009271701308e-05, + "loss": 0.06, + "num_input_tokens_seen": 141188768, + "step": 116030 + }, + { + "epoch": 12.922931284107362, + "grad_norm": 0.14144866168498993, + "learning_rate": 1.6766715039410936e-05, + "loss": 0.0401, + "num_input_tokens_seen": 141195104, + "step": 116035 + }, + { + "epoch": 12.92348813899098, + "grad_norm": 0.4004735052585602, + "learning_rate": 1.676442088489052e-05, + "loss": 0.0203, + "num_input_tokens_seen": 141201280, + "step": 116040 + }, + { + "epoch": 12.924044993874595, + "grad_norm": 0.028534116223454475, + "learning_rate": 1.6762126808161756e-05, + "loss": 0.0142, + "num_input_tokens_seen": 141207456, + "step": 116045 + }, + { + "epoch": 12.924601848758213, + "grad_norm": 2.599802255630493, + "learning_rate": 1.67598328092463e-05, + "loss": 0.0784, + "num_input_tokens_seen": 141213984, + "step": 116050 + }, + { + "epoch": 12.92515870364183, + "grad_norm": 0.2890520691871643, + "learning_rate": 1.675753888816583e-05, + "loss": 0.0178, + "num_input_tokens_seen": 141220096, + "step": 116055 + }, + { + "epoch": 12.925715558525448, + "grad_norm": 0.055676985532045364, + "learning_rate": 1.6755245044942004e-05, + "loss": 0.0869, + "num_input_tokens_seen": 141225632, + "step": 116060 + }, + { + "epoch": 12.926272413409066, + "grad_norm": 1.901381015777588, + "learning_rate": 1.6752951279596495e-05, + "loss": 0.0416, + "num_input_tokens_seen": 141231808, + "step": 116065 + }, + { + "epoch": 12.926829268292684, + "grad_norm": 0.3255804777145386, + "learning_rate": 1.675065759215097e-05, + "loss": 0.1216, + "num_input_tokens_seen": 141237824, + "step": 116070 + }, + { + "epoch": 12.9273861231763, + "grad_norm": 0.14083589613437653, + "learning_rate": 1.6748363982627095e-05, + "loss": 0.0138, + "num_input_tokens_seen": 141244064, + "step": 116075 + }, + { + "epoch": 12.927942978059917, + "grad_norm": 2.400148391723633, + "learning_rate": 1.6746070451046532e-05, + "loss": 0.1139, + "num_input_tokens_seen": 141250176, + "step": 116080 + }, + { + "epoch": 12.928499832943535, + "grad_norm": 0.04660497233271599, + "learning_rate": 1.6743776997430947e-05, + "loss": 0.0057, + "num_input_tokens_seen": 141256448, + "step": 116085 + }, + { + "epoch": 12.929056687827153, + "grad_norm": 0.0018417941173538566, + "learning_rate": 1.6741483621801993e-05, + "loss": 0.0311, + "num_input_tokens_seen": 141262304, + "step": 116090 + }, + { + "epoch": 12.92961354271077, + "grad_norm": 1.424271583557129, + "learning_rate": 1.673919032418136e-05, + "loss": 0.0402, + "num_input_tokens_seen": 141268608, + "step": 116095 + }, + { + "epoch": 12.930170397594386, + "grad_norm": 2.19309401512146, + "learning_rate": 1.6736897104590677e-05, + "loss": 0.0728, + "num_input_tokens_seen": 141274880, + "step": 116100 + }, + { + "epoch": 12.930727252478004, + "grad_norm": 0.00024009680782910436, + "learning_rate": 1.673460396305163e-05, + "loss": 0.044, + "num_input_tokens_seen": 141281152, + "step": 116105 + }, + { + "epoch": 12.931284107361622, + "grad_norm": 1.286313772201538, + "learning_rate": 1.673231089958587e-05, + "loss": 0.051, + "num_input_tokens_seen": 141287168, + "step": 116110 + }, + { + "epoch": 12.93184096224524, + "grad_norm": 0.5290204286575317, + "learning_rate": 1.6730017914215058e-05, + "loss": 0.0689, + "num_input_tokens_seen": 141293248, + "step": 116115 + }, + { + "epoch": 12.932397817128857, + "grad_norm": 1.1059391498565674, + "learning_rate": 1.672772500696085e-05, + "loss": 0.0564, + "num_input_tokens_seen": 141299136, + "step": 116120 + }, + { + "epoch": 12.932954672012473, + "grad_norm": 0.17746582627296448, + "learning_rate": 1.672543217784491e-05, + "loss": 0.049, + "num_input_tokens_seen": 141305024, + "step": 116125 + }, + { + "epoch": 12.93351152689609, + "grad_norm": 0.00020332526764832437, + "learning_rate": 1.672313942688889e-05, + "loss": 0.0389, + "num_input_tokens_seen": 141311072, + "step": 116130 + }, + { + "epoch": 12.934068381779708, + "grad_norm": 0.14019189774990082, + "learning_rate": 1.6720846754114453e-05, + "loss": 0.0069, + "num_input_tokens_seen": 141317248, + "step": 116135 + }, + { + "epoch": 12.934625236663326, + "grad_norm": 0.23553058505058289, + "learning_rate": 1.6718554159543247e-05, + "loss": 0.0316, + "num_input_tokens_seen": 141323328, + "step": 116140 + }, + { + "epoch": 12.935182091546944, + "grad_norm": 0.001529907458461821, + "learning_rate": 1.6716261643196933e-05, + "loss": 0.0493, + "num_input_tokens_seen": 141329728, + "step": 116145 + }, + { + "epoch": 12.93573894643056, + "grad_norm": 0.2543453872203827, + "learning_rate": 1.671396920509716e-05, + "loss": 0.0048, + "num_input_tokens_seen": 141335872, + "step": 116150 + }, + { + "epoch": 12.936295801314177, + "grad_norm": 0.021226465702056885, + "learning_rate": 1.6711676845265602e-05, + "loss": 0.0451, + "num_input_tokens_seen": 141342048, + "step": 116155 + }, + { + "epoch": 12.936852656197795, + "grad_norm": 0.11563930660486221, + "learning_rate": 1.6709384563723878e-05, + "loss": 0.01, + "num_input_tokens_seen": 141348224, + "step": 116160 + }, + { + "epoch": 12.937409511081412, + "grad_norm": 0.299993634223938, + "learning_rate": 1.6707092360493674e-05, + "loss": 0.0214, + "num_input_tokens_seen": 141354112, + "step": 116165 + }, + { + "epoch": 12.93796636596503, + "grad_norm": 0.009386558085680008, + "learning_rate": 1.6704800235596613e-05, + "loss": 0.0763, + "num_input_tokens_seen": 141360128, + "step": 116170 + }, + { + "epoch": 12.938523220848646, + "grad_norm": 1.5549451112747192, + "learning_rate": 1.6702508189054372e-05, + "loss": 0.0652, + "num_input_tokens_seen": 141366176, + "step": 116175 + }, + { + "epoch": 12.939080075732264, + "grad_norm": 0.008025259710848331, + "learning_rate": 1.670021622088858e-05, + "loss": 0.0197, + "num_input_tokens_seen": 141372704, + "step": 116180 + }, + { + "epoch": 12.939636930615881, + "grad_norm": 0.19372610747814178, + "learning_rate": 1.6697924331120904e-05, + "loss": 0.0285, + "num_input_tokens_seen": 141378752, + "step": 116185 + }, + { + "epoch": 12.940193785499499, + "grad_norm": 0.00010927113180514425, + "learning_rate": 1.6695632519772977e-05, + "loss": 0.0308, + "num_input_tokens_seen": 141384896, + "step": 116190 + }, + { + "epoch": 12.940750640383117, + "grad_norm": 0.0029559803660959005, + "learning_rate": 1.6693340786866463e-05, + "loss": 0.0059, + "num_input_tokens_seen": 141391104, + "step": 116195 + }, + { + "epoch": 12.941307495266734, + "grad_norm": 0.0006487605278380215, + "learning_rate": 1.6691049132422994e-05, + "loss": 0.1429, + "num_input_tokens_seen": 141397152, + "step": 116200 + }, + { + "epoch": 12.94186435015035, + "grad_norm": 0.0013581051025539637, + "learning_rate": 1.6688757556464225e-05, + "loss": 0.0114, + "num_input_tokens_seen": 141403232, + "step": 116205 + }, + { + "epoch": 12.942421205033968, + "grad_norm": 0.08184352517127991, + "learning_rate": 1.6686466059011793e-05, + "loss": 0.0221, + "num_input_tokens_seen": 141409376, + "step": 116210 + }, + { + "epoch": 12.942978059917586, + "grad_norm": 0.06827425956726074, + "learning_rate": 1.668417464008736e-05, + "loss": 0.0135, + "num_input_tokens_seen": 141415616, + "step": 116215 + }, + { + "epoch": 12.943534914801203, + "grad_norm": 0.21188569068908691, + "learning_rate": 1.6681883299712546e-05, + "loss": 0.087, + "num_input_tokens_seen": 141421504, + "step": 116220 + }, + { + "epoch": 12.944091769684821, + "grad_norm": 0.6164118051528931, + "learning_rate": 1.6679592037909024e-05, + "loss": 0.0161, + "num_input_tokens_seen": 141427648, + "step": 116225 + }, + { + "epoch": 12.944648624568437, + "grad_norm": 0.6253554821014404, + "learning_rate": 1.667730085469841e-05, + "loss": 0.0468, + "num_input_tokens_seen": 141433440, + "step": 116230 + }, + { + "epoch": 12.945205479452055, + "grad_norm": 1.438109040260315, + "learning_rate": 1.6675009750102366e-05, + "loss": 0.1419, + "num_input_tokens_seen": 141439008, + "step": 116235 + }, + { + "epoch": 12.945762334335672, + "grad_norm": 0.06314955651760101, + "learning_rate": 1.667271872414252e-05, + "loss": 0.0129, + "num_input_tokens_seen": 141445152, + "step": 116240 + }, + { + "epoch": 12.94631918921929, + "grad_norm": 0.5326612591743469, + "learning_rate": 1.667042777684052e-05, + "loss": 0.0155, + "num_input_tokens_seen": 141451456, + "step": 116245 + }, + { + "epoch": 12.946876044102908, + "grad_norm": 1.0849136114120483, + "learning_rate": 1.6668136908218002e-05, + "loss": 0.0687, + "num_input_tokens_seen": 141457248, + "step": 116250 + }, + { + "epoch": 12.947432898986523, + "grad_norm": 0.005434975028038025, + "learning_rate": 1.6665846118296606e-05, + "loss": 0.1382, + "num_input_tokens_seen": 141463008, + "step": 116255 + }, + { + "epoch": 12.947989753870141, + "grad_norm": 0.003045697696506977, + "learning_rate": 1.666355540709797e-05, + "loss": 0.0515, + "num_input_tokens_seen": 141468928, + "step": 116260 + }, + { + "epoch": 12.948546608753759, + "grad_norm": 0.007102958858013153, + "learning_rate": 1.6661264774643737e-05, + "loss": 0.0119, + "num_input_tokens_seen": 141475104, + "step": 116265 + }, + { + "epoch": 12.949103463637377, + "grad_norm": 0.008019691333174706, + "learning_rate": 1.665897422095553e-05, + "loss": 0.0046, + "num_input_tokens_seen": 141481312, + "step": 116270 + }, + { + "epoch": 12.949660318520994, + "grad_norm": 0.007622838020324707, + "learning_rate": 1.6656683746055005e-05, + "loss": 0.0043, + "num_input_tokens_seen": 141487552, + "step": 116275 + }, + { + "epoch": 12.95021717340461, + "grad_norm": 0.3857535421848297, + "learning_rate": 1.665439334996377e-05, + "loss": 0.015, + "num_input_tokens_seen": 141493440, + "step": 116280 + }, + { + "epoch": 12.950774028288228, + "grad_norm": 0.692151665687561, + "learning_rate": 1.66521030327035e-05, + "loss": 0.0274, + "num_input_tokens_seen": 141499360, + "step": 116285 + }, + { + "epoch": 12.951330883171845, + "grad_norm": 0.622879147529602, + "learning_rate": 1.6649812794295782e-05, + "loss": 0.0147, + "num_input_tokens_seen": 141505312, + "step": 116290 + }, + { + "epoch": 12.951887738055463, + "grad_norm": 0.09659653902053833, + "learning_rate": 1.6647522634762292e-05, + "loss": 0.0341, + "num_input_tokens_seen": 141511552, + "step": 116295 + }, + { + "epoch": 12.95244459293908, + "grad_norm": 0.018126104027032852, + "learning_rate": 1.6645232554124623e-05, + "loss": 0.1071, + "num_input_tokens_seen": 141517216, + "step": 116300 + }, + { + "epoch": 12.953001447822697, + "grad_norm": 0.007553677540272474, + "learning_rate": 1.664294255240444e-05, + "loss": 0.0402, + "num_input_tokens_seen": 141523456, + "step": 116305 + }, + { + "epoch": 12.953558302706314, + "grad_norm": 0.0072582196444272995, + "learning_rate": 1.6640652629623354e-05, + "loss": 0.0534, + "num_input_tokens_seen": 141529344, + "step": 116310 + }, + { + "epoch": 12.954115157589932, + "grad_norm": 0.008884894661605358, + "learning_rate": 1.6638362785803008e-05, + "loss": 0.1522, + "num_input_tokens_seen": 141535104, + "step": 116315 + }, + { + "epoch": 12.95467201247355, + "grad_norm": 0.87523353099823, + "learning_rate": 1.663607302096502e-05, + "loss": 0.0255, + "num_input_tokens_seen": 141541344, + "step": 116320 + }, + { + "epoch": 12.955228867357167, + "grad_norm": 1.4947012662887573, + "learning_rate": 1.6633783335131025e-05, + "loss": 0.1246, + "num_input_tokens_seen": 141547616, + "step": 116325 + }, + { + "epoch": 12.955785722240783, + "grad_norm": 1.0920904874801636, + "learning_rate": 1.6631493728322644e-05, + "loss": 0.1112, + "num_input_tokens_seen": 141553568, + "step": 116330 + }, + { + "epoch": 12.956342577124401, + "grad_norm": 0.000644934712909162, + "learning_rate": 1.662920420056152e-05, + "loss": 0.0061, + "num_input_tokens_seen": 141559936, + "step": 116335 + }, + { + "epoch": 12.956899432008019, + "grad_norm": 0.00016122061060741544, + "learning_rate": 1.662691475186926e-05, + "loss": 0.0133, + "num_input_tokens_seen": 141565888, + "step": 116340 + }, + { + "epoch": 12.957456286891636, + "grad_norm": 0.003237973665818572, + "learning_rate": 1.6624625382267502e-05, + "loss": 0.0038, + "num_input_tokens_seen": 141572000, + "step": 116345 + }, + { + "epoch": 12.958013141775254, + "grad_norm": 0.10646741092205048, + "learning_rate": 1.662233609177786e-05, + "loss": 0.019, + "num_input_tokens_seen": 141578080, + "step": 116350 + }, + { + "epoch": 12.95856999665887, + "grad_norm": 1.4043350219726562, + "learning_rate": 1.662004688042198e-05, + "loss": 0.0929, + "num_input_tokens_seen": 141584224, + "step": 116355 + }, + { + "epoch": 12.959126851542488, + "grad_norm": 1.1382465362548828, + "learning_rate": 1.6617757748221456e-05, + "loss": 0.0264, + "num_input_tokens_seen": 141590080, + "step": 116360 + }, + { + "epoch": 12.959683706426105, + "grad_norm": 1.0169174671173096, + "learning_rate": 1.6615468695197937e-05, + "loss": 0.0752, + "num_input_tokens_seen": 141596256, + "step": 116365 + }, + { + "epoch": 12.960240561309723, + "grad_norm": 1.141381025314331, + "learning_rate": 1.6613179721373026e-05, + "loss": 0.0398, + "num_input_tokens_seen": 141602464, + "step": 116370 + }, + { + "epoch": 12.96079741619334, + "grad_norm": 0.0017013716278597713, + "learning_rate": 1.661089082676836e-05, + "loss": 0.0533, + "num_input_tokens_seen": 141608576, + "step": 116375 + }, + { + "epoch": 12.961354271076956, + "grad_norm": 2.789397716522217, + "learning_rate": 1.6608602011405544e-05, + "loss": 0.0552, + "num_input_tokens_seen": 141614464, + "step": 116380 + }, + { + "epoch": 12.961911125960574, + "grad_norm": 0.00012829706247430295, + "learning_rate": 1.6606313275306212e-05, + "loss": 0.0034, + "num_input_tokens_seen": 141620576, + "step": 116385 + }, + { + "epoch": 12.962467980844192, + "grad_norm": 0.572294294834137, + "learning_rate": 1.660402461849197e-05, + "loss": 0.0612, + "num_input_tokens_seen": 141626784, + "step": 116390 + }, + { + "epoch": 12.96302483572781, + "grad_norm": 0.2142113894224167, + "learning_rate": 1.6601736040984447e-05, + "loss": 0.0737, + "num_input_tokens_seen": 141632544, + "step": 116395 + }, + { + "epoch": 12.963581690611427, + "grad_norm": 1.789062261581421, + "learning_rate": 1.6599447542805253e-05, + "loss": 0.107, + "num_input_tokens_seen": 141638720, + "step": 116400 + }, + { + "epoch": 12.964138545495043, + "grad_norm": 0.36596858501434326, + "learning_rate": 1.6597159123976007e-05, + "loss": 0.181, + "num_input_tokens_seen": 141644992, + "step": 116405 + }, + { + "epoch": 12.96469540037866, + "grad_norm": 0.5282505750656128, + "learning_rate": 1.659487078451832e-05, + "loss": 0.022, + "num_input_tokens_seen": 141650912, + "step": 116410 + }, + { + "epoch": 12.965252255262278, + "grad_norm": 1.4563720226287842, + "learning_rate": 1.6592582524453827e-05, + "loss": 0.1192, + "num_input_tokens_seen": 141656960, + "step": 116415 + }, + { + "epoch": 12.965809110145896, + "grad_norm": 0.0009856646647676826, + "learning_rate": 1.6590294343804113e-05, + "loss": 0.0096, + "num_input_tokens_seen": 141663040, + "step": 116420 + }, + { + "epoch": 12.966365965029514, + "grad_norm": 0.8976874351501465, + "learning_rate": 1.6588006242590813e-05, + "loss": 0.018, + "num_input_tokens_seen": 141668928, + "step": 116425 + }, + { + "epoch": 12.966922819913131, + "grad_norm": 0.00018310704035684466, + "learning_rate": 1.6585718220835532e-05, + "loss": 0.0149, + "num_input_tokens_seen": 141675104, + "step": 116430 + }, + { + "epoch": 12.967479674796747, + "grad_norm": 0.0618419423699379, + "learning_rate": 1.6583430278559893e-05, + "loss": 0.005, + "num_input_tokens_seen": 141681440, + "step": 116435 + }, + { + "epoch": 12.968036529680365, + "grad_norm": 0.0003045695775654167, + "learning_rate": 1.6581142415785486e-05, + "loss": 0.0608, + "num_input_tokens_seen": 141687200, + "step": 116440 + }, + { + "epoch": 12.968593384563983, + "grad_norm": 0.13228929042816162, + "learning_rate": 1.657885463253394e-05, + "loss": 0.1152, + "num_input_tokens_seen": 141693024, + "step": 116445 + }, + { + "epoch": 12.9691502394476, + "grad_norm": 0.7798888087272644, + "learning_rate": 1.657656692882686e-05, + "loss": 0.028, + "num_input_tokens_seen": 141698912, + "step": 116450 + }, + { + "epoch": 12.969707094331218, + "grad_norm": 3.976857900619507, + "learning_rate": 1.6574279304685853e-05, + "loss": 0.3126, + "num_input_tokens_seen": 141705056, + "step": 116455 + }, + { + "epoch": 12.970263949214834, + "grad_norm": 0.1514727622270584, + "learning_rate": 1.6571991760132526e-05, + "loss": 0.0042, + "num_input_tokens_seen": 141711200, + "step": 116460 + }, + { + "epoch": 12.970820804098452, + "grad_norm": 0.00024517145357094705, + "learning_rate": 1.6569704295188494e-05, + "loss": 0.0754, + "num_input_tokens_seen": 141717312, + "step": 116465 + }, + { + "epoch": 12.97137765898207, + "grad_norm": 0.0014888192526996136, + "learning_rate": 1.6567416909875355e-05, + "loss": 0.0803, + "num_input_tokens_seen": 141723296, + "step": 116470 + }, + { + "epoch": 12.971934513865687, + "grad_norm": 0.16140411794185638, + "learning_rate": 1.6565129604214718e-05, + "loss": 0.0087, + "num_input_tokens_seen": 141729536, + "step": 116475 + }, + { + "epoch": 12.972491368749305, + "grad_norm": 0.028903553262352943, + "learning_rate": 1.6562842378228195e-05, + "loss": 0.0005, + "num_input_tokens_seen": 141735648, + "step": 116480 + }, + { + "epoch": 12.97304822363292, + "grad_norm": 0.07556241750717163, + "learning_rate": 1.656055523193738e-05, + "loss": 0.0142, + "num_input_tokens_seen": 141741632, + "step": 116485 + }, + { + "epoch": 12.973605078516538, + "grad_norm": 0.005268592853099108, + "learning_rate": 1.6558268165363887e-05, + "loss": 0.0372, + "num_input_tokens_seen": 141747584, + "step": 116490 + }, + { + "epoch": 12.974161933400156, + "grad_norm": 0.03316721320152283, + "learning_rate": 1.6555981178529307e-05, + "loss": 0.0359, + "num_input_tokens_seen": 141753472, + "step": 116495 + }, + { + "epoch": 12.974718788283774, + "grad_norm": 0.9227413535118103, + "learning_rate": 1.655369427145526e-05, + "loss": 0.018, + "num_input_tokens_seen": 141759680, + "step": 116500 + }, + { + "epoch": 12.975275643167391, + "grad_norm": 0.24029046297073364, + "learning_rate": 1.6551407444163327e-05, + "loss": 0.0479, + "num_input_tokens_seen": 141765728, + "step": 116505 + }, + { + "epoch": 12.975832498051007, + "grad_norm": 0.191975936293602, + "learning_rate": 1.654912069667513e-05, + "loss": 0.0624, + "num_input_tokens_seen": 141771552, + "step": 116510 + }, + { + "epoch": 12.976389352934625, + "grad_norm": 0.9635160565376282, + "learning_rate": 1.6546834029012253e-05, + "loss": 0.0652, + "num_input_tokens_seen": 141777376, + "step": 116515 + }, + { + "epoch": 12.976946207818242, + "grad_norm": 0.04462290555238724, + "learning_rate": 1.6544547441196303e-05, + "loss": 0.0056, + "num_input_tokens_seen": 141783008, + "step": 116520 + }, + { + "epoch": 12.97750306270186, + "grad_norm": 0.004488808568567038, + "learning_rate": 1.6542260933248877e-05, + "loss": 0.004, + "num_input_tokens_seen": 141789408, + "step": 116525 + }, + { + "epoch": 12.978059917585478, + "grad_norm": 0.164352685213089, + "learning_rate": 1.6539974505191574e-05, + "loss": 0.0649, + "num_input_tokens_seen": 141795264, + "step": 116530 + }, + { + "epoch": 12.978616772469094, + "grad_norm": 0.08402591198682785, + "learning_rate": 1.6537688157045983e-05, + "loss": 0.0055, + "num_input_tokens_seen": 141801408, + "step": 116535 + }, + { + "epoch": 12.979173627352711, + "grad_norm": 0.18230900168418884, + "learning_rate": 1.653540188883372e-05, + "loss": 0.1537, + "num_input_tokens_seen": 141807648, + "step": 116540 + }, + { + "epoch": 12.979730482236329, + "grad_norm": 0.8414628505706787, + "learning_rate": 1.6533115700576353e-05, + "loss": 0.0653, + "num_input_tokens_seen": 141813888, + "step": 116545 + }, + { + "epoch": 12.980287337119947, + "grad_norm": 1.5880699157714844, + "learning_rate": 1.6530829592295503e-05, + "loss": 0.0376, + "num_input_tokens_seen": 141819648, + "step": 116550 + }, + { + "epoch": 12.980844192003564, + "grad_norm": 1.2939895391464233, + "learning_rate": 1.652854356401274e-05, + "loss": 0.0493, + "num_input_tokens_seen": 141825504, + "step": 116555 + }, + { + "epoch": 12.981401046887182, + "grad_norm": 0.0003355071821715683, + "learning_rate": 1.6526257615749687e-05, + "loss": 0.0048, + "num_input_tokens_seen": 141831776, + "step": 116560 + }, + { + "epoch": 12.981957901770798, + "grad_norm": 3.137221336364746, + "learning_rate": 1.6523971747527905e-05, + "loss": 0.0909, + "num_input_tokens_seen": 141837632, + "step": 116565 + }, + { + "epoch": 12.982514756654416, + "grad_norm": 1.1159034967422485, + "learning_rate": 1.6521685959369015e-05, + "loss": 0.1125, + "num_input_tokens_seen": 141843808, + "step": 116570 + }, + { + "epoch": 12.983071611538033, + "grad_norm": 0.9666290879249573, + "learning_rate": 1.651940025129458e-05, + "loss": 0.0799, + "num_input_tokens_seen": 141849088, + "step": 116575 + }, + { + "epoch": 12.983628466421651, + "grad_norm": 0.03356798738241196, + "learning_rate": 1.6517114623326213e-05, + "loss": 0.0749, + "num_input_tokens_seen": 141855520, + "step": 116580 + }, + { + "epoch": 12.984185321305269, + "grad_norm": 1.4296115636825562, + "learning_rate": 1.651482907548549e-05, + "loss": 0.205, + "num_input_tokens_seen": 141861888, + "step": 116585 + }, + { + "epoch": 12.984742176188885, + "grad_norm": 0.00012863133451901376, + "learning_rate": 1.6512543607794013e-05, + "loss": 0.0051, + "num_input_tokens_seen": 141868064, + "step": 116590 + }, + { + "epoch": 12.985299031072502, + "grad_norm": 2.1009762287139893, + "learning_rate": 1.651025822027335e-05, + "loss": 0.0272, + "num_input_tokens_seen": 141874112, + "step": 116595 + }, + { + "epoch": 12.98585588595612, + "grad_norm": 0.71610426902771, + "learning_rate": 1.650797291294511e-05, + "loss": 0.1314, + "num_input_tokens_seen": 141880000, + "step": 116600 + }, + { + "epoch": 12.986412740839738, + "grad_norm": 0.3667578399181366, + "learning_rate": 1.6505687685830863e-05, + "loss": 0.0794, + "num_input_tokens_seen": 141885504, + "step": 116605 + }, + { + "epoch": 12.986969595723355, + "grad_norm": 0.00043806011672131717, + "learning_rate": 1.6503402538952205e-05, + "loss": 0.0225, + "num_input_tokens_seen": 141891616, + "step": 116610 + }, + { + "epoch": 12.987526450606971, + "grad_norm": 0.278827965259552, + "learning_rate": 1.650111747233071e-05, + "loss": 0.0093, + "num_input_tokens_seen": 141897952, + "step": 116615 + }, + { + "epoch": 12.988083305490589, + "grad_norm": 0.40061625838279724, + "learning_rate": 1.6498832485987982e-05, + "loss": 0.0789, + "num_input_tokens_seen": 141903200, + "step": 116620 + }, + { + "epoch": 12.988640160374207, + "grad_norm": 0.06326070427894592, + "learning_rate": 1.6496547579945575e-05, + "loss": 0.009, + "num_input_tokens_seen": 141909472, + "step": 116625 + }, + { + "epoch": 12.989197015257824, + "grad_norm": 0.6088657379150391, + "learning_rate": 1.649426275422511e-05, + "loss": 0.0808, + "num_input_tokens_seen": 141915776, + "step": 116630 + }, + { + "epoch": 12.989753870141442, + "grad_norm": 0.23540662229061127, + "learning_rate": 1.649197800884813e-05, + "loss": 0.0349, + "num_input_tokens_seen": 141921856, + "step": 116635 + }, + { + "epoch": 12.990310725025058, + "grad_norm": 0.6910765171051025, + "learning_rate": 1.6489693343836247e-05, + "loss": 0.0592, + "num_input_tokens_seen": 141928320, + "step": 116640 + }, + { + "epoch": 12.990867579908675, + "grad_norm": 0.8449730277061462, + "learning_rate": 1.648740875921102e-05, + "loss": 0.1817, + "num_input_tokens_seen": 141934176, + "step": 116645 + }, + { + "epoch": 12.991424434792293, + "grad_norm": 0.5635315179824829, + "learning_rate": 1.6485124254994046e-05, + "loss": 0.0618, + "num_input_tokens_seen": 141940320, + "step": 116650 + }, + { + "epoch": 12.99198128967591, + "grad_norm": 0.0056398711167275906, + "learning_rate": 1.6482839831206886e-05, + "loss": 0.0504, + "num_input_tokens_seen": 141946560, + "step": 116655 + }, + { + "epoch": 12.992538144559528, + "grad_norm": 0.027516338974237442, + "learning_rate": 1.6480555487871136e-05, + "loss": 0.0106, + "num_input_tokens_seen": 141952896, + "step": 116660 + }, + { + "epoch": 12.993094999443144, + "grad_norm": 0.30796894431114197, + "learning_rate": 1.6478271225008358e-05, + "loss": 0.0565, + "num_input_tokens_seen": 141959072, + "step": 116665 + }, + { + "epoch": 12.993651854326762, + "grad_norm": 0.819220244884491, + "learning_rate": 1.647598704264014e-05, + "loss": 0.0413, + "num_input_tokens_seen": 141965280, + "step": 116670 + }, + { + "epoch": 12.99420870921038, + "grad_norm": 0.4940732419490814, + "learning_rate": 1.647370294078805e-05, + "loss": 0.0338, + "num_input_tokens_seen": 141971040, + "step": 116675 + }, + { + "epoch": 12.994765564093997, + "grad_norm": 0.5845985412597656, + "learning_rate": 1.6471418919473674e-05, + "loss": 0.0403, + "num_input_tokens_seen": 141977440, + "step": 116680 + }, + { + "epoch": 12.995322418977615, + "grad_norm": 0.3118615448474884, + "learning_rate": 1.646913497871857e-05, + "loss": 0.0374, + "num_input_tokens_seen": 141983392, + "step": 116685 + }, + { + "epoch": 12.995879273861231, + "grad_norm": 0.19026201963424683, + "learning_rate": 1.646685111854433e-05, + "loss": 0.0765, + "num_input_tokens_seen": 141989664, + "step": 116690 + }, + { + "epoch": 12.996436128744849, + "grad_norm": 1.220750093460083, + "learning_rate": 1.6464567338972507e-05, + "loss": 0.0612, + "num_input_tokens_seen": 141995424, + "step": 116695 + }, + { + "epoch": 12.996992983628466, + "grad_norm": 0.0545775480568409, + "learning_rate": 1.64622836400247e-05, + "loss": 0.0078, + "num_input_tokens_seen": 142001376, + "step": 116700 + }, + { + "epoch": 12.997549838512084, + "grad_norm": 0.5717941522598267, + "learning_rate": 1.6460000021722443e-05, + "loss": 0.0103, + "num_input_tokens_seen": 142007328, + "step": 116705 + }, + { + "epoch": 12.998106693395702, + "grad_norm": 0.09448802471160889, + "learning_rate": 1.6457716484087343e-05, + "loss": 0.0431, + "num_input_tokens_seen": 142013536, + "step": 116710 + }, + { + "epoch": 12.998663548279318, + "grad_norm": 0.0038775221910327673, + "learning_rate": 1.645543302714095e-05, + "loss": 0.0108, + "num_input_tokens_seen": 142019264, + "step": 116715 + }, + { + "epoch": 12.999220403162935, + "grad_norm": 0.004581431392580271, + "learning_rate": 1.645314965090484e-05, + "loss": 0.0116, + "num_input_tokens_seen": 142025600, + "step": 116720 + }, + { + "epoch": 12.999777258046553, + "grad_norm": 0.0014903137926012278, + "learning_rate": 1.645086635540058e-05, + "loss": 0.008, + "num_input_tokens_seen": 142031680, + "step": 116725 + }, + { + "epoch": 13.0, + "eval_loss": 0.07967492938041687, + "eval_runtime": 112.2976, + "eval_samples_per_second": 35.54, + "eval_steps_per_second": 8.887, + "num_input_tokens_seen": 142033584, + "step": 116727 + }, + { + "epoch": 13.00033411293017, + "grad_norm": 0.13118690252304077, + "learning_rate": 1.6448583140649737e-05, + "loss": 0.0424, + "num_input_tokens_seen": 142036912, + "step": 116730 + }, + { + "epoch": 13.000890967813788, + "grad_norm": 0.28186145424842834, + "learning_rate": 1.6446300006673876e-05, + "loss": 0.0422, + "num_input_tokens_seen": 142043184, + "step": 116735 + }, + { + "epoch": 13.001447822697404, + "grad_norm": 0.1247902512550354, + "learning_rate": 1.6444016953494564e-05, + "loss": 0.0131, + "num_input_tokens_seen": 142049360, + "step": 116740 + }, + { + "epoch": 13.002004677581022, + "grad_norm": 4.478014945983887, + "learning_rate": 1.6441733981133366e-05, + "loss": 0.1084, + "num_input_tokens_seen": 142055152, + "step": 116745 + }, + { + "epoch": 13.00256153246464, + "grad_norm": 0.04737599194049835, + "learning_rate": 1.643945108961185e-05, + "loss": 0.1246, + "num_input_tokens_seen": 142061424, + "step": 116750 + }, + { + "epoch": 13.003118387348257, + "grad_norm": 0.017739709466695786, + "learning_rate": 1.643716827895157e-05, + "loss": 0.0522, + "num_input_tokens_seen": 142067664, + "step": 116755 + }, + { + "epoch": 13.003675242231875, + "grad_norm": 7.625007856404409e-05, + "learning_rate": 1.643488554917411e-05, + "loss": 0.0281, + "num_input_tokens_seen": 142073680, + "step": 116760 + }, + { + "epoch": 13.004232097115493, + "grad_norm": 0.8197029829025269, + "learning_rate": 1.6432602900301004e-05, + "loss": 0.0809, + "num_input_tokens_seen": 142079728, + "step": 116765 + }, + { + "epoch": 13.004788951999108, + "grad_norm": 0.597718358039856, + "learning_rate": 1.6430320332353833e-05, + "loss": 0.0991, + "num_input_tokens_seen": 142085936, + "step": 116770 + }, + { + "epoch": 13.005345806882726, + "grad_norm": 0.011012892238795757, + "learning_rate": 1.642803784535415e-05, + "loss": 0.0208, + "num_input_tokens_seen": 142091792, + "step": 116775 + }, + { + "epoch": 13.005902661766344, + "grad_norm": 0.014134167693555355, + "learning_rate": 1.6425755439323526e-05, + "loss": 0.036, + "num_input_tokens_seen": 142098096, + "step": 116780 + }, + { + "epoch": 13.006459516649961, + "grad_norm": 0.0002961634600069374, + "learning_rate": 1.64234731142835e-05, + "loss": 0.0019, + "num_input_tokens_seen": 142104240, + "step": 116785 + }, + { + "epoch": 13.00701637153358, + "grad_norm": 2.5163791179656982, + "learning_rate": 1.642119087025565e-05, + "loss": 0.0922, + "num_input_tokens_seen": 142109360, + "step": 116790 + }, + { + "epoch": 13.007573226417195, + "grad_norm": 1.4290568828582764, + "learning_rate": 1.641890870726152e-05, + "loss": 0.0524, + "num_input_tokens_seen": 142115568, + "step": 116795 + }, + { + "epoch": 13.008130081300813, + "grad_norm": 1.155887246131897, + "learning_rate": 1.641662662532268e-05, + "loss": 0.1662, + "num_input_tokens_seen": 142121648, + "step": 116800 + }, + { + "epoch": 13.00868693618443, + "grad_norm": 0.054585523903369904, + "learning_rate": 1.641434462446067e-05, + "loss": 0.0255, + "num_input_tokens_seen": 142127600, + "step": 116805 + }, + { + "epoch": 13.009243791068048, + "grad_norm": 0.0006276695639826357, + "learning_rate": 1.641206270469706e-05, + "loss": 0.0241, + "num_input_tokens_seen": 142133552, + "step": 116810 + }, + { + "epoch": 13.009800645951666, + "grad_norm": 0.7374669909477234, + "learning_rate": 1.640978086605339e-05, + "loss": 0.0211, + "num_input_tokens_seen": 142139536, + "step": 116815 + }, + { + "epoch": 13.010357500835282, + "grad_norm": 0.601809024810791, + "learning_rate": 1.6407499108551235e-05, + "loss": 0.0632, + "num_input_tokens_seen": 142145680, + "step": 116820 + }, + { + "epoch": 13.0109143557189, + "grad_norm": 0.002066538203507662, + "learning_rate": 1.640521743221212e-05, + "loss": 0.0157, + "num_input_tokens_seen": 142151856, + "step": 116825 + }, + { + "epoch": 13.011471210602517, + "grad_norm": 0.017229802906513214, + "learning_rate": 1.6402935837057627e-05, + "loss": 0.0135, + "num_input_tokens_seen": 142158192, + "step": 116830 + }, + { + "epoch": 13.012028065486135, + "grad_norm": 0.28944888710975647, + "learning_rate": 1.640065432310928e-05, + "loss": 0.0258, + "num_input_tokens_seen": 142164432, + "step": 116835 + }, + { + "epoch": 13.012584920369752, + "grad_norm": 0.013581338338553905, + "learning_rate": 1.6398372890388653e-05, + "loss": 0.0242, + "num_input_tokens_seen": 142170768, + "step": 116840 + }, + { + "epoch": 13.013141775253368, + "grad_norm": 0.04618796706199646, + "learning_rate": 1.6396091538917278e-05, + "loss": 0.0319, + "num_input_tokens_seen": 142177040, + "step": 116845 + }, + { + "epoch": 13.013698630136986, + "grad_norm": 0.0014323320938274264, + "learning_rate": 1.639381026871672e-05, + "loss": 0.0247, + "num_input_tokens_seen": 142183120, + "step": 116850 + }, + { + "epoch": 13.014255485020604, + "grad_norm": 1.0959933996200562, + "learning_rate": 1.639152907980851e-05, + "loss": 0.0365, + "num_input_tokens_seen": 142189104, + "step": 116855 + }, + { + "epoch": 13.014812339904221, + "grad_norm": 0.012287615798413754, + "learning_rate": 1.6389247972214213e-05, + "loss": 0.0069, + "num_input_tokens_seen": 142194896, + "step": 116860 + }, + { + "epoch": 13.015369194787839, + "grad_norm": 1.5081031322479248, + "learning_rate": 1.6386966945955365e-05, + "loss": 0.0618, + "num_input_tokens_seen": 142200400, + "step": 116865 + }, + { + "epoch": 13.015926049671455, + "grad_norm": 0.08992787450551987, + "learning_rate": 1.6384686001053513e-05, + "loss": 0.0055, + "num_input_tokens_seen": 142206512, + "step": 116870 + }, + { + "epoch": 13.016482904555073, + "grad_norm": 0.013266326859593391, + "learning_rate": 1.6382405137530203e-05, + "loss": 0.0781, + "num_input_tokens_seen": 142212752, + "step": 116875 + }, + { + "epoch": 13.01703975943869, + "grad_norm": 0.0017071201000362635, + "learning_rate": 1.6380124355406983e-05, + "loss": 0.1154, + "num_input_tokens_seen": 142218896, + "step": 116880 + }, + { + "epoch": 13.017596614322308, + "grad_norm": 0.35980165004730225, + "learning_rate": 1.63778436547054e-05, + "loss": 0.0304, + "num_input_tokens_seen": 142224816, + "step": 116885 + }, + { + "epoch": 13.018153469205926, + "grad_norm": 0.1289696991443634, + "learning_rate": 1.6375563035446982e-05, + "loss": 0.0109, + "num_input_tokens_seen": 142231248, + "step": 116890 + }, + { + "epoch": 13.018710324089541, + "grad_norm": 2.0179696083068848, + "learning_rate": 1.6373282497653285e-05, + "loss": 0.1254, + "num_input_tokens_seen": 142237040, + "step": 116895 + }, + { + "epoch": 13.019267178973159, + "grad_norm": 0.00019193609477952123, + "learning_rate": 1.6371002041345838e-05, + "loss": 0.0153, + "num_input_tokens_seen": 142242992, + "step": 116900 + }, + { + "epoch": 13.019824033856777, + "grad_norm": 0.2838677763938904, + "learning_rate": 1.6368721666546207e-05, + "loss": 0.0276, + "num_input_tokens_seen": 142248848, + "step": 116905 + }, + { + "epoch": 13.020380888740394, + "grad_norm": 0.00048503014841116965, + "learning_rate": 1.63664413732759e-05, + "loss": 0.0098, + "num_input_tokens_seen": 142254672, + "step": 116910 + }, + { + "epoch": 13.020937743624012, + "grad_norm": 0.2895054221153259, + "learning_rate": 1.636416116155648e-05, + "loss": 0.0137, + "num_input_tokens_seen": 142260912, + "step": 116915 + }, + { + "epoch": 13.021494598507628, + "grad_norm": 0.819503128528595, + "learning_rate": 1.6361881031409474e-05, + "loss": 0.0591, + "num_input_tokens_seen": 142266896, + "step": 116920 + }, + { + "epoch": 13.022051453391246, + "grad_norm": 0.07401075959205627, + "learning_rate": 1.6359600982856424e-05, + "loss": 0.0487, + "num_input_tokens_seen": 142272848, + "step": 116925 + }, + { + "epoch": 13.022608308274863, + "grad_norm": 0.3227396607398987, + "learning_rate": 1.6357321015918864e-05, + "loss": 0.0051, + "num_input_tokens_seen": 142278640, + "step": 116930 + }, + { + "epoch": 13.023165163158481, + "grad_norm": 0.1710374504327774, + "learning_rate": 1.6355041130618332e-05, + "loss": 0.0786, + "num_input_tokens_seen": 142284880, + "step": 116935 + }, + { + "epoch": 13.023722018042099, + "grad_norm": 0.10903012007474899, + "learning_rate": 1.635276132697636e-05, + "loss": 0.0942, + "num_input_tokens_seen": 142290928, + "step": 116940 + }, + { + "epoch": 13.024278872925716, + "grad_norm": 0.005180043168365955, + "learning_rate": 1.635048160501449e-05, + "loss": 0.0575, + "num_input_tokens_seen": 142297360, + "step": 116945 + }, + { + "epoch": 13.024835727809332, + "grad_norm": 0.001357151777483523, + "learning_rate": 1.6348201964754247e-05, + "loss": 0.1141, + "num_input_tokens_seen": 142303472, + "step": 116950 + }, + { + "epoch": 13.02539258269295, + "grad_norm": 0.010407454334199429, + "learning_rate": 1.6345922406217173e-05, + "loss": 0.08, + "num_input_tokens_seen": 142309392, + "step": 116955 + }, + { + "epoch": 13.025949437576568, + "grad_norm": 0.002788207493722439, + "learning_rate": 1.6343642929424786e-05, + "loss": 0.0063, + "num_input_tokens_seen": 142315792, + "step": 116960 + }, + { + "epoch": 13.026506292460185, + "grad_norm": 0.9178561568260193, + "learning_rate": 1.634136353439864e-05, + "loss": 0.0224, + "num_input_tokens_seen": 142321840, + "step": 116965 + }, + { + "epoch": 13.027063147343803, + "grad_norm": 1.6187115907669067, + "learning_rate": 1.633908422116024e-05, + "loss": 0.0403, + "num_input_tokens_seen": 142327824, + "step": 116970 + }, + { + "epoch": 13.027620002227419, + "grad_norm": 0.011828781105577946, + "learning_rate": 1.633680498973114e-05, + "loss": 0.0121, + "num_input_tokens_seen": 142333808, + "step": 116975 + }, + { + "epoch": 13.028176857111037, + "grad_norm": 0.07977120578289032, + "learning_rate": 1.6334525840132847e-05, + "loss": 0.0198, + "num_input_tokens_seen": 142339856, + "step": 116980 + }, + { + "epoch": 13.028733711994654, + "grad_norm": 0.00033127301139757037, + "learning_rate": 1.6332246772386907e-05, + "loss": 0.0408, + "num_input_tokens_seen": 142346000, + "step": 116985 + }, + { + "epoch": 13.029290566878272, + "grad_norm": 0.6699174046516418, + "learning_rate": 1.6329967786514836e-05, + "loss": 0.0649, + "num_input_tokens_seen": 142352240, + "step": 116990 + }, + { + "epoch": 13.02984742176189, + "grad_norm": 2.9855709075927734, + "learning_rate": 1.6327688882538173e-05, + "loss": 0.0843, + "num_input_tokens_seen": 142358224, + "step": 116995 + }, + { + "epoch": 13.030404276645505, + "grad_norm": 0.003717126091942191, + "learning_rate": 1.632541006047843e-05, + "loss": 0.0147, + "num_input_tokens_seen": 142364464, + "step": 117000 + }, + { + "epoch": 13.030961131529123, + "grad_norm": 0.05344192683696747, + "learning_rate": 1.6323131320357142e-05, + "loss": 0.0215, + "num_input_tokens_seen": 142369840, + "step": 117005 + }, + { + "epoch": 13.03151798641274, + "grad_norm": 0.2339019924402237, + "learning_rate": 1.6320852662195827e-05, + "loss": 0.1773, + "num_input_tokens_seen": 142375632, + "step": 117010 + }, + { + "epoch": 13.032074841296359, + "grad_norm": 0.04206066578626633, + "learning_rate": 1.631857408601602e-05, + "loss": 0.0085, + "num_input_tokens_seen": 142381744, + "step": 117015 + }, + { + "epoch": 13.032631696179976, + "grad_norm": 0.00826602429151535, + "learning_rate": 1.6316295591839225e-05, + "loss": 0.0052, + "num_input_tokens_seen": 142388272, + "step": 117020 + }, + { + "epoch": 13.033188551063592, + "grad_norm": 0.8448679447174072, + "learning_rate": 1.6314017179686984e-05, + "loss": 0.0166, + "num_input_tokens_seen": 142394032, + "step": 117025 + }, + { + "epoch": 13.03374540594721, + "grad_norm": 0.12804768979549408, + "learning_rate": 1.63117388495808e-05, + "loss": 0.003, + "num_input_tokens_seen": 142400176, + "step": 117030 + }, + { + "epoch": 13.034302260830827, + "grad_norm": 2.612102746963501, + "learning_rate": 1.630946060154222e-05, + "loss": 0.0688, + "num_input_tokens_seen": 142406256, + "step": 117035 + }, + { + "epoch": 13.034859115714445, + "grad_norm": 0.03684715926647186, + "learning_rate": 1.630718243559273e-05, + "loss": 0.0304, + "num_input_tokens_seen": 142412336, + "step": 117040 + }, + { + "epoch": 13.035415970598063, + "grad_norm": 0.008837873116135597, + "learning_rate": 1.630490435175388e-05, + "loss": 0.03, + "num_input_tokens_seen": 142418544, + "step": 117045 + }, + { + "epoch": 13.035972825481679, + "grad_norm": 0.13794739544391632, + "learning_rate": 1.6302626350047163e-05, + "loss": 0.0035, + "num_input_tokens_seen": 142424656, + "step": 117050 + }, + { + "epoch": 13.036529680365296, + "grad_norm": 0.24842721223831177, + "learning_rate": 1.6300348430494116e-05, + "loss": 0.0105, + "num_input_tokens_seen": 142430960, + "step": 117055 + }, + { + "epoch": 13.037086535248914, + "grad_norm": 0.052076395601034164, + "learning_rate": 1.6298070593116248e-05, + "loss": 0.0142, + "num_input_tokens_seen": 142436560, + "step": 117060 + }, + { + "epoch": 13.037643390132532, + "grad_norm": 0.004616630729287863, + "learning_rate": 1.6295792837935077e-05, + "loss": 0.039, + "num_input_tokens_seen": 142442768, + "step": 117065 + }, + { + "epoch": 13.03820024501615, + "grad_norm": 0.00014121699496172369, + "learning_rate": 1.6293515164972108e-05, + "loss": 0.0548, + "num_input_tokens_seen": 142448784, + "step": 117070 + }, + { + "epoch": 13.038757099899765, + "grad_norm": 0.4380035996437073, + "learning_rate": 1.6291237574248875e-05, + "loss": 0.012, + "num_input_tokens_seen": 142454800, + "step": 117075 + }, + { + "epoch": 13.039313954783383, + "grad_norm": 3.169898509979248, + "learning_rate": 1.628896006578687e-05, + "loss": 0.0726, + "num_input_tokens_seen": 142460784, + "step": 117080 + }, + { + "epoch": 13.039870809667, + "grad_norm": 0.22008046507835388, + "learning_rate": 1.6286682639607625e-05, + "loss": 0.0098, + "num_input_tokens_seen": 142466640, + "step": 117085 + }, + { + "epoch": 13.040427664550618, + "grad_norm": 0.004743887577205896, + "learning_rate": 1.628440529573263e-05, + "loss": 0.1137, + "num_input_tokens_seen": 142472848, + "step": 117090 + }, + { + "epoch": 13.040984519434236, + "grad_norm": 1.1941113471984863, + "learning_rate": 1.628212803418343e-05, + "loss": 0.0573, + "num_input_tokens_seen": 142479152, + "step": 117095 + }, + { + "epoch": 13.041541374317854, + "grad_norm": 0.0011893206974491477, + "learning_rate": 1.6279850854981494e-05, + "loss": 0.0338, + "num_input_tokens_seen": 142485456, + "step": 117100 + }, + { + "epoch": 13.04209822920147, + "grad_norm": 0.6047544479370117, + "learning_rate": 1.6277573758148367e-05, + "loss": 0.0287, + "num_input_tokens_seen": 142491472, + "step": 117105 + }, + { + "epoch": 13.042655084085087, + "grad_norm": 0.7229219675064087, + "learning_rate": 1.6275296743705538e-05, + "loss": 0.0165, + "num_input_tokens_seen": 142497488, + "step": 117110 + }, + { + "epoch": 13.043211938968705, + "grad_norm": 0.0002757996553555131, + "learning_rate": 1.627301981167453e-05, + "loss": 0.0456, + "num_input_tokens_seen": 142503824, + "step": 117115 + }, + { + "epoch": 13.043768793852323, + "grad_norm": 0.1881968379020691, + "learning_rate": 1.6270742962076828e-05, + "loss": 0.0282, + "num_input_tokens_seen": 142510064, + "step": 117120 + }, + { + "epoch": 13.04432564873594, + "grad_norm": 0.01716373674571514, + "learning_rate": 1.626846619493397e-05, + "loss": 0.0022, + "num_input_tokens_seen": 142516368, + "step": 117125 + }, + { + "epoch": 13.044882503619556, + "grad_norm": 0.03372061252593994, + "learning_rate": 1.6266189510267427e-05, + "loss": 0.0156, + "num_input_tokens_seen": 142522576, + "step": 117130 + }, + { + "epoch": 13.045439358503174, + "grad_norm": 0.6670576930046082, + "learning_rate": 1.6263912908098732e-05, + "loss": 0.0044, + "num_input_tokens_seen": 142528496, + "step": 117135 + }, + { + "epoch": 13.045996213386791, + "grad_norm": 0.015559575520455837, + "learning_rate": 1.6261636388449376e-05, + "loss": 0.0578, + "num_input_tokens_seen": 142534768, + "step": 117140 + }, + { + "epoch": 13.04655306827041, + "grad_norm": 0.2951835095882416, + "learning_rate": 1.625935995134087e-05, + "loss": 0.2147, + "num_input_tokens_seen": 142540912, + "step": 117145 + }, + { + "epoch": 13.047109923154027, + "grad_norm": 0.7907804250717163, + "learning_rate": 1.62570835967947e-05, + "loss": 0.0155, + "num_input_tokens_seen": 142547216, + "step": 117150 + }, + { + "epoch": 13.047666778037643, + "grad_norm": 0.002787040313705802, + "learning_rate": 1.6254807324832393e-05, + "loss": 0.0186, + "num_input_tokens_seen": 142553584, + "step": 117155 + }, + { + "epoch": 13.04822363292126, + "grad_norm": 0.33120226860046387, + "learning_rate": 1.6252531135475424e-05, + "loss": 0.032, + "num_input_tokens_seen": 142559184, + "step": 117160 + }, + { + "epoch": 13.048780487804878, + "grad_norm": 1.156421184539795, + "learning_rate": 1.6250255028745323e-05, + "loss": 0.0299, + "num_input_tokens_seen": 142565296, + "step": 117165 + }, + { + "epoch": 13.049337342688496, + "grad_norm": 0.4325302541255951, + "learning_rate": 1.6247979004663557e-05, + "loss": 0.1034, + "num_input_tokens_seen": 142571440, + "step": 117170 + }, + { + "epoch": 13.049894197572113, + "grad_norm": 0.0032700751908123493, + "learning_rate": 1.6245703063251654e-05, + "loss": 0.0011, + "num_input_tokens_seen": 142577616, + "step": 117175 + }, + { + "epoch": 13.05045105245573, + "grad_norm": 0.00023659682483412325, + "learning_rate": 1.6243427204531092e-05, + "loss": 0.0049, + "num_input_tokens_seen": 142583920, + "step": 117180 + }, + { + "epoch": 13.051007907339347, + "grad_norm": 0.10419873148202896, + "learning_rate": 1.6241151428523383e-05, + "loss": 0.0109, + "num_input_tokens_seen": 142589840, + "step": 117185 + }, + { + "epoch": 13.051564762222965, + "grad_norm": 0.009777997620403767, + "learning_rate": 1.623887573525001e-05, + "loss": 0.0358, + "num_input_tokens_seen": 142595984, + "step": 117190 + }, + { + "epoch": 13.052121617106582, + "grad_norm": 0.03675396367907524, + "learning_rate": 1.6236600124732476e-05, + "loss": 0.0112, + "num_input_tokens_seen": 142601808, + "step": 117195 + }, + { + "epoch": 13.0526784719902, + "grad_norm": 0.0076521304436028, + "learning_rate": 1.6234324596992278e-05, + "loss": 0.0889, + "num_input_tokens_seen": 142608272, + "step": 117200 + }, + { + "epoch": 13.053235326873816, + "grad_norm": 0.10390543192625046, + "learning_rate": 1.6232049152050905e-05, + "loss": 0.0081, + "num_input_tokens_seen": 142614480, + "step": 117205 + }, + { + "epoch": 13.053792181757434, + "grad_norm": 0.005515381693840027, + "learning_rate": 1.622977378992985e-05, + "loss": 0.0307, + "num_input_tokens_seen": 142620752, + "step": 117210 + }, + { + "epoch": 13.054349036641051, + "grad_norm": 0.000542122928891331, + "learning_rate": 1.6227498510650612e-05, + "loss": 0.0048, + "num_input_tokens_seen": 142627024, + "step": 117215 + }, + { + "epoch": 13.054905891524669, + "grad_norm": 0.08046773076057434, + "learning_rate": 1.6225223314234673e-05, + "loss": 0.082, + "num_input_tokens_seen": 142632912, + "step": 117220 + }, + { + "epoch": 13.055462746408287, + "grad_norm": 0.07884781062602997, + "learning_rate": 1.622294820070354e-05, + "loss": 0.0166, + "num_input_tokens_seen": 142639056, + "step": 117225 + }, + { + "epoch": 13.056019601291903, + "grad_norm": 0.4139811396598816, + "learning_rate": 1.622067317007868e-05, + "loss": 0.165, + "num_input_tokens_seen": 142644880, + "step": 117230 + }, + { + "epoch": 13.05657645617552, + "grad_norm": 1.0891335010528564, + "learning_rate": 1.621839822238161e-05, + "loss": 0.0202, + "num_input_tokens_seen": 142650640, + "step": 117235 + }, + { + "epoch": 13.057133311059138, + "grad_norm": 0.004611619748175144, + "learning_rate": 1.6216123357633795e-05, + "loss": 0.0078, + "num_input_tokens_seen": 142656848, + "step": 117240 + }, + { + "epoch": 13.057690165942756, + "grad_norm": 0.10057543963193893, + "learning_rate": 1.6213848575856737e-05, + "loss": 0.1698, + "num_input_tokens_seen": 142662800, + "step": 117245 + }, + { + "epoch": 13.058247020826373, + "grad_norm": 0.010682031512260437, + "learning_rate": 1.6211573877071916e-05, + "loss": 0.0062, + "num_input_tokens_seen": 142668944, + "step": 117250 + }, + { + "epoch": 13.05880387570999, + "grad_norm": 0.03206514194607735, + "learning_rate": 1.6209299261300826e-05, + "loss": 0.0519, + "num_input_tokens_seen": 142675536, + "step": 117255 + }, + { + "epoch": 13.059360730593607, + "grad_norm": 0.0006887909257784486, + "learning_rate": 1.620702472856494e-05, + "loss": 0.0229, + "num_input_tokens_seen": 142681552, + "step": 117260 + }, + { + "epoch": 13.059917585477224, + "grad_norm": 0.13981443643569946, + "learning_rate": 1.6204750278885755e-05, + "loss": 0.0092, + "num_input_tokens_seen": 142687728, + "step": 117265 + }, + { + "epoch": 13.060474440360842, + "grad_norm": 0.9739043712615967, + "learning_rate": 1.6202475912284755e-05, + "loss": 0.082, + "num_input_tokens_seen": 142694032, + "step": 117270 + }, + { + "epoch": 13.06103129524446, + "grad_norm": 0.019066330045461655, + "learning_rate": 1.6200201628783406e-05, + "loss": 0.01, + "num_input_tokens_seen": 142700432, + "step": 117275 + }, + { + "epoch": 13.061588150128078, + "grad_norm": 0.2998862862586975, + "learning_rate": 1.6197927428403213e-05, + "loss": 0.0082, + "num_input_tokens_seen": 142706768, + "step": 117280 + }, + { + "epoch": 13.062145005011693, + "grad_norm": 3.680534839630127, + "learning_rate": 1.6195653311165644e-05, + "loss": 0.023, + "num_input_tokens_seen": 142713328, + "step": 117285 + }, + { + "epoch": 13.062701859895311, + "grad_norm": 0.5669322609901428, + "learning_rate": 1.6193379277092184e-05, + "loss": 0.0778, + "num_input_tokens_seen": 142718800, + "step": 117290 + }, + { + "epoch": 13.063258714778929, + "grad_norm": 0.003621090669184923, + "learning_rate": 1.619110532620431e-05, + "loss": 0.0047, + "num_input_tokens_seen": 142725136, + "step": 117295 + }, + { + "epoch": 13.063815569662546, + "grad_norm": 0.005186617374420166, + "learning_rate": 1.6188831458523506e-05, + "loss": 0.0555, + "num_input_tokens_seen": 142730736, + "step": 117300 + }, + { + "epoch": 13.064372424546164, + "grad_norm": 0.3168083727359772, + "learning_rate": 1.6186557674071243e-05, + "loss": 0.066, + "num_input_tokens_seen": 142736752, + "step": 117305 + }, + { + "epoch": 13.06492927942978, + "grad_norm": 0.0006398354889824986, + "learning_rate": 1.618428397286902e-05, + "loss": 0.0302, + "num_input_tokens_seen": 142742608, + "step": 117310 + }, + { + "epoch": 13.065486134313398, + "grad_norm": 0.7581995725631714, + "learning_rate": 1.6182010354938277e-05, + "loss": 0.02, + "num_input_tokens_seen": 142748720, + "step": 117315 + }, + { + "epoch": 13.066042989197015, + "grad_norm": 0.0005398188368417323, + "learning_rate": 1.6179736820300522e-05, + "loss": 0.0122, + "num_input_tokens_seen": 142755056, + "step": 117320 + }, + { + "epoch": 13.066599844080633, + "grad_norm": 0.008025635965168476, + "learning_rate": 1.6177463368977216e-05, + "loss": 0.0022, + "num_input_tokens_seen": 142761328, + "step": 117325 + }, + { + "epoch": 13.06715669896425, + "grad_norm": 0.2511395514011383, + "learning_rate": 1.6175190000989843e-05, + "loss": 0.1182, + "num_input_tokens_seen": 142767280, + "step": 117330 + }, + { + "epoch": 13.067713553847867, + "grad_norm": 0.017942793667316437, + "learning_rate": 1.6172916716359866e-05, + "loss": 0.025, + "num_input_tokens_seen": 142772592, + "step": 117335 + }, + { + "epoch": 13.068270408731484, + "grad_norm": 0.0004991538007743657, + "learning_rate": 1.6170643515108763e-05, + "loss": 0.0032, + "num_input_tokens_seen": 142778864, + "step": 117340 + }, + { + "epoch": 13.068827263615102, + "grad_norm": 0.021701455116271973, + "learning_rate": 1.6168370397258006e-05, + "loss": 0.0188, + "num_input_tokens_seen": 142785136, + "step": 117345 + }, + { + "epoch": 13.06938411849872, + "grad_norm": 0.0604017935693264, + "learning_rate": 1.616609736282907e-05, + "loss": 0.0535, + "num_input_tokens_seen": 142790640, + "step": 117350 + }, + { + "epoch": 13.069940973382337, + "grad_norm": 0.0010662429267540574, + "learning_rate": 1.6163824411843416e-05, + "loss": 0.0154, + "num_input_tokens_seen": 142796912, + "step": 117355 + }, + { + "epoch": 13.070497828265953, + "grad_norm": 0.09784164279699326, + "learning_rate": 1.6161551544322526e-05, + "loss": 0.0065, + "num_input_tokens_seen": 142803248, + "step": 117360 + }, + { + "epoch": 13.07105468314957, + "grad_norm": 0.10562856495380402, + "learning_rate": 1.6159278760287852e-05, + "loss": 0.0064, + "num_input_tokens_seen": 142809488, + "step": 117365 + }, + { + "epoch": 13.071611538033189, + "grad_norm": 0.09649457782506943, + "learning_rate": 1.6157006059760886e-05, + "loss": 0.0537, + "num_input_tokens_seen": 142815600, + "step": 117370 + }, + { + "epoch": 13.072168392916806, + "grad_norm": 0.567537784576416, + "learning_rate": 1.6154733442763075e-05, + "loss": 0.0607, + "num_input_tokens_seen": 142821552, + "step": 117375 + }, + { + "epoch": 13.072725247800424, + "grad_norm": 0.2644791603088379, + "learning_rate": 1.61524609093159e-05, + "loss": 0.1009, + "num_input_tokens_seen": 142827408, + "step": 117380 + }, + { + "epoch": 13.07328210268404, + "grad_norm": 1.3140394687652588, + "learning_rate": 1.6150188459440812e-05, + "loss": 0.0916, + "num_input_tokens_seen": 142833712, + "step": 117385 + }, + { + "epoch": 13.073838957567657, + "grad_norm": 0.0013193927006796002, + "learning_rate": 1.614791609315929e-05, + "loss": 0.0034, + "num_input_tokens_seen": 142839664, + "step": 117390 + }, + { + "epoch": 13.074395812451275, + "grad_norm": 0.1379675418138504, + "learning_rate": 1.6145643810492787e-05, + "loss": 0.0085, + "num_input_tokens_seen": 142844880, + "step": 117395 + }, + { + "epoch": 13.074952667334893, + "grad_norm": 0.001731546362861991, + "learning_rate": 1.614337161146278e-05, + "loss": 0.1848, + "num_input_tokens_seen": 142850544, + "step": 117400 + }, + { + "epoch": 13.07550952221851, + "grad_norm": 0.15821610391139984, + "learning_rate": 1.6141099496090718e-05, + "loss": 0.0017, + "num_input_tokens_seen": 142856912, + "step": 117405 + }, + { + "epoch": 13.076066377102126, + "grad_norm": 2.745418071746826, + "learning_rate": 1.6138827464398078e-05, + "loss": 0.0559, + "num_input_tokens_seen": 142862704, + "step": 117410 + }, + { + "epoch": 13.076623231985744, + "grad_norm": 0.006436657626181841, + "learning_rate": 1.61365555164063e-05, + "loss": 0.0032, + "num_input_tokens_seen": 142868816, + "step": 117415 + }, + { + "epoch": 13.077180086869362, + "grad_norm": 3.2362797260284424, + "learning_rate": 1.6134283652136866e-05, + "loss": 0.0917, + "num_input_tokens_seen": 142874928, + "step": 117420 + }, + { + "epoch": 13.07773694175298, + "grad_norm": 0.08847689628601074, + "learning_rate": 1.613201187161122e-05, + "loss": 0.0084, + "num_input_tokens_seen": 142880752, + "step": 117425 + }, + { + "epoch": 13.078293796636597, + "grad_norm": 0.00044970348244532943, + "learning_rate": 1.612974017485083e-05, + "loss": 0.0121, + "num_input_tokens_seen": 142886640, + "step": 117430 + }, + { + "epoch": 13.078850651520213, + "grad_norm": 0.6466606259346008, + "learning_rate": 1.612746856187714e-05, + "loss": 0.017, + "num_input_tokens_seen": 142892784, + "step": 117435 + }, + { + "epoch": 13.07940750640383, + "grad_norm": 0.31595486402511597, + "learning_rate": 1.6125197032711638e-05, + "loss": 0.0195, + "num_input_tokens_seen": 142898960, + "step": 117440 + }, + { + "epoch": 13.079964361287448, + "grad_norm": 0.2290620356798172, + "learning_rate": 1.6122925587375742e-05, + "loss": 0.0021, + "num_input_tokens_seen": 142905328, + "step": 117445 + }, + { + "epoch": 13.080521216171066, + "grad_norm": 0.03010561130940914, + "learning_rate": 1.6120654225890935e-05, + "loss": 0.0186, + "num_input_tokens_seen": 142911472, + "step": 117450 + }, + { + "epoch": 13.081078071054684, + "grad_norm": 2.4927408695220947, + "learning_rate": 1.611838294827866e-05, + "loss": 0.0956, + "num_input_tokens_seen": 142917616, + "step": 117455 + }, + { + "epoch": 13.081634925938301, + "grad_norm": 0.7086058855056763, + "learning_rate": 1.6116111754560378e-05, + "loss": 0.1016, + "num_input_tokens_seen": 142923760, + "step": 117460 + }, + { + "epoch": 13.082191780821917, + "grad_norm": 1.4866544008255005, + "learning_rate": 1.6113840644757533e-05, + "loss": 0.0188, + "num_input_tokens_seen": 142929968, + "step": 117465 + }, + { + "epoch": 13.082748635705535, + "grad_norm": 1.3876231908798218, + "learning_rate": 1.6111569618891587e-05, + "loss": 0.0454, + "num_input_tokens_seen": 142936080, + "step": 117470 + }, + { + "epoch": 13.083305490589153, + "grad_norm": 0.38071778416633606, + "learning_rate": 1.6109298676983985e-05, + "loss": 0.0868, + "num_input_tokens_seen": 142942192, + "step": 117475 + }, + { + "epoch": 13.08386234547277, + "grad_norm": 0.40334054827690125, + "learning_rate": 1.6107027819056185e-05, + "loss": 0.0377, + "num_input_tokens_seen": 142948560, + "step": 117480 + }, + { + "epoch": 13.084419200356388, + "grad_norm": 0.01615140773355961, + "learning_rate": 1.6104757045129622e-05, + "loss": 0.0048, + "num_input_tokens_seen": 142955152, + "step": 117485 + }, + { + "epoch": 13.084976055240004, + "grad_norm": 0.3993458151817322, + "learning_rate": 1.6102486355225766e-05, + "loss": 0.0593, + "num_input_tokens_seen": 142961424, + "step": 117490 + }, + { + "epoch": 13.085532910123622, + "grad_norm": 1.7730424404144287, + "learning_rate": 1.6100215749366043e-05, + "loss": 0.0384, + "num_input_tokens_seen": 142967696, + "step": 117495 + }, + { + "epoch": 13.08608976500724, + "grad_norm": 0.048379573971033096, + "learning_rate": 1.6097945227571925e-05, + "loss": 0.0665, + "num_input_tokens_seen": 142973616, + "step": 117500 + }, + { + "epoch": 13.086646619890857, + "grad_norm": 0.00018264245591126382, + "learning_rate": 1.6095674789864835e-05, + "loss": 0.0021, + "num_input_tokens_seen": 142979632, + "step": 117505 + }, + { + "epoch": 13.087203474774475, + "grad_norm": 0.05157945305109024, + "learning_rate": 1.6093404436266242e-05, + "loss": 0.0037, + "num_input_tokens_seen": 142985904, + "step": 117510 + }, + { + "epoch": 13.08776032965809, + "grad_norm": 1.2182906866073608, + "learning_rate": 1.609113416679757e-05, + "loss": 0.1382, + "num_input_tokens_seen": 142992176, + "step": 117515 + }, + { + "epoch": 13.088317184541708, + "grad_norm": 0.5278592109680176, + "learning_rate": 1.608886398148028e-05, + "loss": 0.0918, + "num_input_tokens_seen": 142998320, + "step": 117520 + }, + { + "epoch": 13.088874039425326, + "grad_norm": 0.0012272762833163142, + "learning_rate": 1.6086593880335806e-05, + "loss": 0.02, + "num_input_tokens_seen": 143004304, + "step": 117525 + }, + { + "epoch": 13.089430894308943, + "grad_norm": 0.007358203176409006, + "learning_rate": 1.6084323863385597e-05, + "loss": 0.0959, + "num_input_tokens_seen": 143010544, + "step": 117530 + }, + { + "epoch": 13.089987749192561, + "grad_norm": 0.3719680607318878, + "learning_rate": 1.6082053930651092e-05, + "loss": 0.0825, + "num_input_tokens_seen": 143016560, + "step": 117535 + }, + { + "epoch": 13.090544604076177, + "grad_norm": 0.0384964644908905, + "learning_rate": 1.607978408215373e-05, + "loss": 0.0065, + "num_input_tokens_seen": 143022736, + "step": 117540 + }, + { + "epoch": 13.091101458959795, + "grad_norm": 0.12212636321783066, + "learning_rate": 1.6077514317914953e-05, + "loss": 0.0111, + "num_input_tokens_seen": 143028688, + "step": 117545 + }, + { + "epoch": 13.091658313843412, + "grad_norm": 0.10547303408384323, + "learning_rate": 1.6075244637956212e-05, + "loss": 0.0229, + "num_input_tokens_seen": 143034896, + "step": 117550 + }, + { + "epoch": 13.09221516872703, + "grad_norm": 0.0006069047376513481, + "learning_rate": 1.607297504229892e-05, + "loss": 0.0031, + "num_input_tokens_seen": 143041200, + "step": 117555 + }, + { + "epoch": 13.092772023610648, + "grad_norm": 0.8412432074546814, + "learning_rate": 1.6070705530964547e-05, + "loss": 0.0781, + "num_input_tokens_seen": 143047376, + "step": 117560 + }, + { + "epoch": 13.093328878494264, + "grad_norm": 0.04205094277858734, + "learning_rate": 1.60684361039745e-05, + "loss": 0.0245, + "num_input_tokens_seen": 143052912, + "step": 117565 + }, + { + "epoch": 13.093885733377881, + "grad_norm": 0.016984259709715843, + "learning_rate": 1.6066166761350244e-05, + "loss": 0.0252, + "num_input_tokens_seen": 143059120, + "step": 117570 + }, + { + "epoch": 13.094442588261499, + "grad_norm": 0.22733904421329498, + "learning_rate": 1.6063897503113185e-05, + "loss": 0.0598, + "num_input_tokens_seen": 143064976, + "step": 117575 + }, + { + "epoch": 13.094999443145117, + "grad_norm": 4.048568248748779, + "learning_rate": 1.6061628329284782e-05, + "loss": 0.1554, + "num_input_tokens_seen": 143070928, + "step": 117580 + }, + { + "epoch": 13.095556298028734, + "grad_norm": 0.23237447440624237, + "learning_rate": 1.6059359239886458e-05, + "loss": 0.0601, + "num_input_tokens_seen": 143076688, + "step": 117585 + }, + { + "epoch": 13.09611315291235, + "grad_norm": 0.0002772859006654471, + "learning_rate": 1.6057090234939653e-05, + "loss": 0.001, + "num_input_tokens_seen": 143082992, + "step": 117590 + }, + { + "epoch": 13.096670007795968, + "grad_norm": 0.1628747135400772, + "learning_rate": 1.605482131446579e-05, + "loss": 0.0538, + "num_input_tokens_seen": 143088880, + "step": 117595 + }, + { + "epoch": 13.097226862679586, + "grad_norm": 0.00011157821427332237, + "learning_rate": 1.6052552478486315e-05, + "loss": 0.0131, + "num_input_tokens_seen": 143095184, + "step": 117600 + }, + { + "epoch": 13.097783717563203, + "grad_norm": 0.0001896145986393094, + "learning_rate": 1.6050283727022644e-05, + "loss": 0.0735, + "num_input_tokens_seen": 143101744, + "step": 117605 + }, + { + "epoch": 13.098340572446821, + "grad_norm": 0.6461156606674194, + "learning_rate": 1.6048015060096216e-05, + "loss": 0.0124, + "num_input_tokens_seen": 143107792, + "step": 117610 + }, + { + "epoch": 13.098897427330437, + "grad_norm": 0.010048595257103443, + "learning_rate": 1.6045746477728456e-05, + "loss": 0.0088, + "num_input_tokens_seen": 143114064, + "step": 117615 + }, + { + "epoch": 13.099454282214055, + "grad_norm": 0.07692713290452957, + "learning_rate": 1.6043477979940803e-05, + "loss": 0.0048, + "num_input_tokens_seen": 143120272, + "step": 117620 + }, + { + "epoch": 13.100011137097672, + "grad_norm": 0.08709962666034698, + "learning_rate": 1.6041209566754657e-05, + "loss": 0.1174, + "num_input_tokens_seen": 143126064, + "step": 117625 + }, + { + "epoch": 13.10056799198129, + "grad_norm": 0.03070785291492939, + "learning_rate": 1.6038941238191484e-05, + "loss": 0.0136, + "num_input_tokens_seen": 143132368, + "step": 117630 + }, + { + "epoch": 13.101124846864908, + "grad_norm": 0.41502201557159424, + "learning_rate": 1.6036672994272676e-05, + "loss": 0.0045, + "num_input_tokens_seen": 143138480, + "step": 117635 + }, + { + "epoch": 13.101681701748525, + "grad_norm": 0.06468712538480759, + "learning_rate": 1.6034404835019683e-05, + "loss": 0.0185, + "num_input_tokens_seen": 143144496, + "step": 117640 + }, + { + "epoch": 13.102238556632141, + "grad_norm": 0.43522319197654724, + "learning_rate": 1.6032136760453915e-05, + "loss": 0.1311, + "num_input_tokens_seen": 143150448, + "step": 117645 + }, + { + "epoch": 13.102795411515759, + "grad_norm": 0.828013002872467, + "learning_rate": 1.6029868770596802e-05, + "loss": 0.0729, + "num_input_tokens_seen": 143156400, + "step": 117650 + }, + { + "epoch": 13.103352266399376, + "grad_norm": 0.1374252587556839, + "learning_rate": 1.6027600865469767e-05, + "loss": 0.0046, + "num_input_tokens_seen": 143162288, + "step": 117655 + }, + { + "epoch": 13.103909121282994, + "grad_norm": 0.00039675910375081003, + "learning_rate": 1.602533304509423e-05, + "loss": 0.0197, + "num_input_tokens_seen": 143167696, + "step": 117660 + }, + { + "epoch": 13.104465976166612, + "grad_norm": 0.36326804757118225, + "learning_rate": 1.602306530949161e-05, + "loss": 0.0502, + "num_input_tokens_seen": 143173808, + "step": 117665 + }, + { + "epoch": 13.105022831050228, + "grad_norm": 0.00012457853881642222, + "learning_rate": 1.6020797658683333e-05, + "loss": 0.0442, + "num_input_tokens_seen": 143180208, + "step": 117670 + }, + { + "epoch": 13.105579685933845, + "grad_norm": 0.03531093895435333, + "learning_rate": 1.601853009269081e-05, + "loss": 0.0751, + "num_input_tokens_seen": 143186288, + "step": 117675 + }, + { + "epoch": 13.106136540817463, + "grad_norm": 0.0707969069480896, + "learning_rate": 1.6016262611535474e-05, + "loss": 0.2, + "num_input_tokens_seen": 143192528, + "step": 117680 + }, + { + "epoch": 13.10669339570108, + "grad_norm": 0.0007320138975046575, + "learning_rate": 1.6013995215238735e-05, + "loss": 0.0138, + "num_input_tokens_seen": 143198448, + "step": 117685 + }, + { + "epoch": 13.107250250584698, + "grad_norm": 0.0030263287480920553, + "learning_rate": 1.6011727903822005e-05, + "loss": 0.0919, + "num_input_tokens_seen": 143204848, + "step": 117690 + }, + { + "epoch": 13.107807105468314, + "grad_norm": 0.0001955624029505998, + "learning_rate": 1.6009460677306708e-05, + "loss": 0.0786, + "num_input_tokens_seen": 143210864, + "step": 117695 + }, + { + "epoch": 13.108363960351932, + "grad_norm": 0.016952330246567726, + "learning_rate": 1.600719353571426e-05, + "loss": 0.0155, + "num_input_tokens_seen": 143216976, + "step": 117700 + }, + { + "epoch": 13.10892081523555, + "grad_norm": 0.003305963007733226, + "learning_rate": 1.6004926479066074e-05, + "loss": 0.0008, + "num_input_tokens_seen": 143223088, + "step": 117705 + }, + { + "epoch": 13.109477670119167, + "grad_norm": 0.6134284734725952, + "learning_rate": 1.6002659507383553e-05, + "loss": 0.0375, + "num_input_tokens_seen": 143229136, + "step": 117710 + }, + { + "epoch": 13.110034525002785, + "grad_norm": 0.019554654136300087, + "learning_rate": 1.6000392620688144e-05, + "loss": 0.0089, + "num_input_tokens_seen": 143234800, + "step": 117715 + }, + { + "epoch": 13.1105913798864, + "grad_norm": 0.40553873777389526, + "learning_rate": 1.5998125819001215e-05, + "loss": 0.0185, + "num_input_tokens_seen": 143241040, + "step": 117720 + }, + { + "epoch": 13.111148234770019, + "grad_norm": 0.10283394902944565, + "learning_rate": 1.5995859102344214e-05, + "loss": 0.0191, + "num_input_tokens_seen": 143246992, + "step": 117725 + }, + { + "epoch": 13.111705089653636, + "grad_norm": 0.14969968795776367, + "learning_rate": 1.5993592470738527e-05, + "loss": 0.0401, + "num_input_tokens_seen": 143253072, + "step": 117730 + }, + { + "epoch": 13.112261944537254, + "grad_norm": 0.10538651049137115, + "learning_rate": 1.5991325924205586e-05, + "loss": 0.045, + "num_input_tokens_seen": 143259152, + "step": 117735 + }, + { + "epoch": 13.112818799420872, + "grad_norm": 0.06342368572950363, + "learning_rate": 1.598905946276678e-05, + "loss": 0.0154, + "num_input_tokens_seen": 143265040, + "step": 117740 + }, + { + "epoch": 13.113375654304487, + "grad_norm": 0.24805226922035217, + "learning_rate": 1.5986793086443536e-05, + "loss": 0.1862, + "num_input_tokens_seen": 143270800, + "step": 117745 + }, + { + "epoch": 13.113932509188105, + "grad_norm": 0.08164674788713455, + "learning_rate": 1.5984526795257242e-05, + "loss": 0.0921, + "num_input_tokens_seen": 143276912, + "step": 117750 + }, + { + "epoch": 13.114489364071723, + "grad_norm": 0.9444512724876404, + "learning_rate": 1.5982260589229327e-05, + "loss": 0.0226, + "num_input_tokens_seen": 143283184, + "step": 117755 + }, + { + "epoch": 13.11504621895534, + "grad_norm": 1.7922518253326416, + "learning_rate": 1.5979994468381176e-05, + "loss": 0.0323, + "num_input_tokens_seen": 143289136, + "step": 117760 + }, + { + "epoch": 13.115603073838958, + "grad_norm": 0.1623501181602478, + "learning_rate": 1.597772843273421e-05, + "loss": 0.0334, + "num_input_tokens_seen": 143295568, + "step": 117765 + }, + { + "epoch": 13.116159928722574, + "grad_norm": 3.6599109172821045, + "learning_rate": 1.5975462482309816e-05, + "loss": 0.0843, + "num_input_tokens_seen": 143301328, + "step": 117770 + }, + { + "epoch": 13.116716783606192, + "grad_norm": 0.07727744430303574, + "learning_rate": 1.5973196617129425e-05, + "loss": 0.0363, + "num_input_tokens_seen": 143307760, + "step": 117775 + }, + { + "epoch": 13.11727363848981, + "grad_norm": 0.2614835798740387, + "learning_rate": 1.597093083721441e-05, + "loss": 0.0472, + "num_input_tokens_seen": 143313776, + "step": 117780 + }, + { + "epoch": 13.117830493373427, + "grad_norm": 0.04393964633345604, + "learning_rate": 1.5968665142586202e-05, + "loss": 0.0082, + "num_input_tokens_seen": 143319888, + "step": 117785 + }, + { + "epoch": 13.118387348257045, + "grad_norm": 1.286020278930664, + "learning_rate": 1.5966399533266174e-05, + "loss": 0.1975, + "num_input_tokens_seen": 143326000, + "step": 117790 + }, + { + "epoch": 13.11894420314066, + "grad_norm": 0.09641139209270477, + "learning_rate": 1.596413400927575e-05, + "loss": 0.0504, + "num_input_tokens_seen": 143332176, + "step": 117795 + }, + { + "epoch": 13.119501058024278, + "grad_norm": 0.6520904302597046, + "learning_rate": 1.596186857063631e-05, + "loss": 0.0189, + "num_input_tokens_seen": 143337904, + "step": 117800 + }, + { + "epoch": 13.120057912907896, + "grad_norm": 1.3313782215118408, + "learning_rate": 1.5959603217369275e-05, + "loss": 0.1461, + "num_input_tokens_seen": 143344304, + "step": 117805 + }, + { + "epoch": 13.120614767791514, + "grad_norm": 0.9530947804450989, + "learning_rate": 1.595733794949602e-05, + "loss": 0.0254, + "num_input_tokens_seen": 143350416, + "step": 117810 + }, + { + "epoch": 13.121171622675131, + "grad_norm": 0.04137280583381653, + "learning_rate": 1.5955072767037962e-05, + "loss": 0.0074, + "num_input_tokens_seen": 143357008, + "step": 117815 + }, + { + "epoch": 13.121728477558749, + "grad_norm": 0.0009457064443267882, + "learning_rate": 1.595280767001648e-05, + "loss": 0.0165, + "num_input_tokens_seen": 143363024, + "step": 117820 + }, + { + "epoch": 13.122285332442365, + "grad_norm": 0.016909005120396614, + "learning_rate": 1.5950542658452985e-05, + "loss": 0.0413, + "num_input_tokens_seen": 143369328, + "step": 117825 + }, + { + "epoch": 13.122842187325983, + "grad_norm": 0.47529637813568115, + "learning_rate": 1.5948277732368855e-05, + "loss": 0.0246, + "num_input_tokens_seen": 143375280, + "step": 117830 + }, + { + "epoch": 13.1233990422096, + "grad_norm": 0.004627598915249109, + "learning_rate": 1.5946012891785505e-05, + "loss": 0.1004, + "num_input_tokens_seen": 143381296, + "step": 117835 + }, + { + "epoch": 13.123955897093218, + "grad_norm": 0.02388259768486023, + "learning_rate": 1.5943748136724307e-05, + "loss": 0.0362, + "num_input_tokens_seen": 143387824, + "step": 117840 + }, + { + "epoch": 13.124512751976836, + "grad_norm": 0.021985720843076706, + "learning_rate": 1.5941483467206674e-05, + "loss": 0.0184, + "num_input_tokens_seen": 143393872, + "step": 117845 + }, + { + "epoch": 13.125069606860452, + "grad_norm": 0.24252107739448547, + "learning_rate": 1.5939218883253974e-05, + "loss": 0.018, + "num_input_tokens_seen": 143399824, + "step": 117850 + }, + { + "epoch": 13.12562646174407, + "grad_norm": 0.046567291021347046, + "learning_rate": 1.5936954384887625e-05, + "loss": 0.0426, + "num_input_tokens_seen": 143405872, + "step": 117855 + }, + { + "epoch": 13.126183316627687, + "grad_norm": 0.09533023089170456, + "learning_rate": 1.5934689972128995e-05, + "loss": 0.0237, + "num_input_tokens_seen": 143412080, + "step": 117860 + }, + { + "epoch": 13.126740171511305, + "grad_norm": 1.08872389793396, + "learning_rate": 1.5932425644999487e-05, + "loss": 0.0377, + "num_input_tokens_seen": 143418128, + "step": 117865 + }, + { + "epoch": 13.127297026394922, + "grad_norm": 0.014582202769815922, + "learning_rate": 1.5930161403520477e-05, + "loss": 0.0118, + "num_input_tokens_seen": 143424208, + "step": 117870 + }, + { + "epoch": 13.127853881278538, + "grad_norm": 0.000158033479237929, + "learning_rate": 1.5927897247713365e-05, + "loss": 0.0429, + "num_input_tokens_seen": 143430224, + "step": 117875 + }, + { + "epoch": 13.128410736162156, + "grad_norm": 0.045868147164583206, + "learning_rate": 1.5925633177599528e-05, + "loss": 0.0013, + "num_input_tokens_seen": 143436240, + "step": 117880 + }, + { + "epoch": 13.128967591045773, + "grad_norm": 0.0002761476789601147, + "learning_rate": 1.592336919320036e-05, + "loss": 0.0241, + "num_input_tokens_seen": 143442128, + "step": 117885 + }, + { + "epoch": 13.129524445929391, + "grad_norm": 0.11304733157157898, + "learning_rate": 1.5921105294537235e-05, + "loss": 0.0546, + "num_input_tokens_seen": 143448144, + "step": 117890 + }, + { + "epoch": 13.130081300813009, + "grad_norm": 0.8501134514808655, + "learning_rate": 1.5918841481631553e-05, + "loss": 0.0039, + "num_input_tokens_seen": 143454416, + "step": 117895 + }, + { + "epoch": 13.130638155696625, + "grad_norm": 1.1888891458511353, + "learning_rate": 1.5916577754504674e-05, + "loss": 0.031, + "num_input_tokens_seen": 143460368, + "step": 117900 + }, + { + "epoch": 13.131195010580242, + "grad_norm": 0.5879678130149841, + "learning_rate": 1.5914314113178018e-05, + "loss": 0.0083, + "num_input_tokens_seen": 143466384, + "step": 117905 + }, + { + "epoch": 13.13175186546386, + "grad_norm": 0.006880601402372122, + "learning_rate": 1.5912050557672926e-05, + "loss": 0.0016, + "num_input_tokens_seen": 143472432, + "step": 117910 + }, + { + "epoch": 13.132308720347478, + "grad_norm": 2.0363199710845947, + "learning_rate": 1.59097870880108e-05, + "loss": 0.0933, + "num_input_tokens_seen": 143478640, + "step": 117915 + }, + { + "epoch": 13.132865575231095, + "grad_norm": 0.25956541299819946, + "learning_rate": 1.5907523704213024e-05, + "loss": 0.0208, + "num_input_tokens_seen": 143484656, + "step": 117920 + }, + { + "epoch": 13.133422430114711, + "grad_norm": 0.316704660654068, + "learning_rate": 1.5905260406300972e-05, + "loss": 0.0426, + "num_input_tokens_seen": 143490992, + "step": 117925 + }, + { + "epoch": 13.133979284998329, + "grad_norm": 0.07154963910579681, + "learning_rate": 1.5902997194296017e-05, + "loss": 0.0217, + "num_input_tokens_seen": 143497040, + "step": 117930 + }, + { + "epoch": 13.134536139881947, + "grad_norm": 0.09940733760595322, + "learning_rate": 1.5900734068219547e-05, + "loss": 0.0029, + "num_input_tokens_seen": 143503248, + "step": 117935 + }, + { + "epoch": 13.135092994765564, + "grad_norm": 1.8437265157699585, + "learning_rate": 1.5898471028092933e-05, + "loss": 0.0594, + "num_input_tokens_seen": 143509200, + "step": 117940 + }, + { + "epoch": 13.135649849649182, + "grad_norm": 0.079169861972332, + "learning_rate": 1.589620807393755e-05, + "loss": 0.0841, + "num_input_tokens_seen": 143514864, + "step": 117945 + }, + { + "epoch": 13.136206704532798, + "grad_norm": 0.05864707753062248, + "learning_rate": 1.5893945205774773e-05, + "loss": 0.0048, + "num_input_tokens_seen": 143521168, + "step": 117950 + }, + { + "epoch": 13.136763559416416, + "grad_norm": 0.00180914590600878, + "learning_rate": 1.5891682423625988e-05, + "loss": 0.0089, + "num_input_tokens_seen": 143527664, + "step": 117955 + }, + { + "epoch": 13.137320414300033, + "grad_norm": 0.02767576463520527, + "learning_rate": 1.5889419727512546e-05, + "loss": 0.069, + "num_input_tokens_seen": 143533648, + "step": 117960 + }, + { + "epoch": 13.137877269183651, + "grad_norm": 1.152276873588562, + "learning_rate": 1.5887157117455848e-05, + "loss": 0.0312, + "num_input_tokens_seen": 143539760, + "step": 117965 + }, + { + "epoch": 13.138434124067269, + "grad_norm": 0.00013081823999527842, + "learning_rate": 1.588489459347724e-05, + "loss": 0.0638, + "num_input_tokens_seen": 143545968, + "step": 117970 + }, + { + "epoch": 13.138990978950885, + "grad_norm": 0.23142628371715546, + "learning_rate": 1.588263215559812e-05, + "loss": 0.0032, + "num_input_tokens_seen": 143552144, + "step": 117975 + }, + { + "epoch": 13.139547833834502, + "grad_norm": 0.00010125795961357653, + "learning_rate": 1.588036980383983e-05, + "loss": 0.0981, + "num_input_tokens_seen": 143558160, + "step": 117980 + }, + { + "epoch": 13.14010468871812, + "grad_norm": 0.7868044376373291, + "learning_rate": 1.587810753822376e-05, + "loss": 0.0996, + "num_input_tokens_seen": 143564016, + "step": 117985 + }, + { + "epoch": 13.140661543601738, + "grad_norm": 2.471126079559326, + "learning_rate": 1.587584535877127e-05, + "loss": 0.0341, + "num_input_tokens_seen": 143570416, + "step": 117990 + }, + { + "epoch": 13.141218398485355, + "grad_norm": 2.253973960876465, + "learning_rate": 1.5873583265503734e-05, + "loss": 0.127, + "num_input_tokens_seen": 143576304, + "step": 117995 + }, + { + "epoch": 13.141775253368973, + "grad_norm": 0.005715079605579376, + "learning_rate": 1.5871321258442514e-05, + "loss": 0.1018, + "num_input_tokens_seen": 143582448, + "step": 118000 + }, + { + "epoch": 13.142332108252589, + "grad_norm": 0.1311880350112915, + "learning_rate": 1.5869059337608984e-05, + "loss": 0.0643, + "num_input_tokens_seen": 143587888, + "step": 118005 + }, + { + "epoch": 13.142888963136206, + "grad_norm": 0.0003948611265514046, + "learning_rate": 1.5866797503024496e-05, + "loss": 0.0857, + "num_input_tokens_seen": 143594128, + "step": 118010 + }, + { + "epoch": 13.143445818019824, + "grad_norm": 0.23398587107658386, + "learning_rate": 1.586453575471043e-05, + "loss": 0.1535, + "num_input_tokens_seen": 143600464, + "step": 118015 + }, + { + "epoch": 13.144002672903442, + "grad_norm": 0.0008135636453516781, + "learning_rate": 1.5862274092688137e-05, + "loss": 0.0336, + "num_input_tokens_seen": 143606416, + "step": 118020 + }, + { + "epoch": 13.14455952778706, + "grad_norm": 1.1526978015899658, + "learning_rate": 1.586001251697899e-05, + "loss": 0.119, + "num_input_tokens_seen": 143612016, + "step": 118025 + }, + { + "epoch": 13.145116382670675, + "grad_norm": 1.5140767097473145, + "learning_rate": 1.5857751027604338e-05, + "loss": 0.1273, + "num_input_tokens_seen": 143617904, + "step": 118030 + }, + { + "epoch": 13.145673237554293, + "grad_norm": 0.13819390535354614, + "learning_rate": 1.5855489624585572e-05, + "loss": 0.03, + "num_input_tokens_seen": 143624208, + "step": 118035 + }, + { + "epoch": 13.14623009243791, + "grad_norm": 0.3528984487056732, + "learning_rate": 1.585322830794401e-05, + "loss": 0.1559, + "num_input_tokens_seen": 143630544, + "step": 118040 + }, + { + "epoch": 13.146786947321528, + "grad_norm": 0.2890681028366089, + "learning_rate": 1.585096707770105e-05, + "loss": 0.089, + "num_input_tokens_seen": 143636688, + "step": 118045 + }, + { + "epoch": 13.147343802205146, + "grad_norm": 0.006883389316499233, + "learning_rate": 1.5848705933878032e-05, + "loss": 0.0091, + "num_input_tokens_seen": 143642704, + "step": 118050 + }, + { + "epoch": 13.147900657088762, + "grad_norm": 0.010761234909296036, + "learning_rate": 1.5846444876496323e-05, + "loss": 0.0132, + "num_input_tokens_seen": 143649008, + "step": 118055 + }, + { + "epoch": 13.14845751197238, + "grad_norm": 0.028988385573029518, + "learning_rate": 1.5844183905577266e-05, + "loss": 0.0939, + "num_input_tokens_seen": 143655184, + "step": 118060 + }, + { + "epoch": 13.149014366855997, + "grad_norm": 0.001309167011640966, + "learning_rate": 1.5841923021142238e-05, + "loss": 0.1259, + "num_input_tokens_seen": 143660912, + "step": 118065 + }, + { + "epoch": 13.149571221739615, + "grad_norm": 0.9907227158546448, + "learning_rate": 1.5839662223212575e-05, + "loss": 0.0069, + "num_input_tokens_seen": 143667280, + "step": 118070 + }, + { + "epoch": 13.150128076623233, + "grad_norm": 0.6925985217094421, + "learning_rate": 1.583740151180965e-05, + "loss": 0.0239, + "num_input_tokens_seen": 143673200, + "step": 118075 + }, + { + "epoch": 13.150684931506849, + "grad_norm": 0.0005066008307039738, + "learning_rate": 1.5835140886954802e-05, + "loss": 0.0475, + "num_input_tokens_seen": 143679376, + "step": 118080 + }, + { + "epoch": 13.151241786390466, + "grad_norm": 1.0635159015655518, + "learning_rate": 1.5832880348669397e-05, + "loss": 0.0307, + "num_input_tokens_seen": 143685456, + "step": 118085 + }, + { + "epoch": 13.151798641274084, + "grad_norm": 0.09358765184879303, + "learning_rate": 1.583061989697478e-05, + "loss": 0.0011, + "num_input_tokens_seen": 143691984, + "step": 118090 + }, + { + "epoch": 13.152355496157702, + "grad_norm": 0.0024430365301668644, + "learning_rate": 1.5828359531892303e-05, + "loss": 0.0405, + "num_input_tokens_seen": 143698192, + "step": 118095 + }, + { + "epoch": 13.15291235104132, + "grad_norm": 0.13905207812786102, + "learning_rate": 1.582609925344332e-05, + "loss": 0.1049, + "num_input_tokens_seen": 143704464, + "step": 118100 + }, + { + "epoch": 13.153469205924935, + "grad_norm": 0.12947364151477814, + "learning_rate": 1.582383906164917e-05, + "loss": 0.0459, + "num_input_tokens_seen": 143710480, + "step": 118105 + }, + { + "epoch": 13.154026060808553, + "grad_norm": 0.022886129096150398, + "learning_rate": 1.5821578956531232e-05, + "loss": 0.006, + "num_input_tokens_seen": 143716400, + "step": 118110 + }, + { + "epoch": 13.15458291569217, + "grad_norm": 0.569487452507019, + "learning_rate": 1.581931893811081e-05, + "loss": 0.0456, + "num_input_tokens_seen": 143722224, + "step": 118115 + }, + { + "epoch": 13.155139770575788, + "grad_norm": 1.2632739543914795, + "learning_rate": 1.5817059006409298e-05, + "loss": 0.0506, + "num_input_tokens_seen": 143727792, + "step": 118120 + }, + { + "epoch": 13.155696625459406, + "grad_norm": 0.11085354536771774, + "learning_rate": 1.5814799161448e-05, + "loss": 0.0051, + "num_input_tokens_seen": 143734096, + "step": 118125 + }, + { + "epoch": 13.156253480343022, + "grad_norm": 0.017622819170355797, + "learning_rate": 1.58125394032483e-05, + "loss": 0.0361, + "num_input_tokens_seen": 143740080, + "step": 118130 + }, + { + "epoch": 13.15681033522664, + "grad_norm": 0.40452516078948975, + "learning_rate": 1.581027973183152e-05, + "loss": 0.0468, + "num_input_tokens_seen": 143746384, + "step": 118135 + }, + { + "epoch": 13.157367190110257, + "grad_norm": 0.8199983835220337, + "learning_rate": 1.5808020147219012e-05, + "loss": 0.036, + "num_input_tokens_seen": 143752624, + "step": 118140 + }, + { + "epoch": 13.157924044993875, + "grad_norm": 0.1429309844970703, + "learning_rate": 1.5805760649432115e-05, + "loss": 0.095, + "num_input_tokens_seen": 143759024, + "step": 118145 + }, + { + "epoch": 13.158480899877492, + "grad_norm": 0.8219517469406128, + "learning_rate": 1.580350123849218e-05, + "loss": 0.009, + "num_input_tokens_seen": 143765456, + "step": 118150 + }, + { + "epoch": 13.159037754761108, + "grad_norm": 0.00019841449102386832, + "learning_rate": 1.580124191442054e-05, + "loss": 0.0247, + "num_input_tokens_seen": 143771536, + "step": 118155 + }, + { + "epoch": 13.159594609644726, + "grad_norm": 0.3257209360599518, + "learning_rate": 1.5798982677238545e-05, + "loss": 0.012, + "num_input_tokens_seen": 143777680, + "step": 118160 + }, + { + "epoch": 13.160151464528344, + "grad_norm": 0.007656106259673834, + "learning_rate": 1.579672352696752e-05, + "loss": 0.0645, + "num_input_tokens_seen": 143783824, + "step": 118165 + }, + { + "epoch": 13.160708319411961, + "grad_norm": 0.036208320409059525, + "learning_rate": 1.5794464463628828e-05, + "loss": 0.0144, + "num_input_tokens_seen": 143789776, + "step": 118170 + }, + { + "epoch": 13.161265174295579, + "grad_norm": 0.0018109948141500354, + "learning_rate": 1.5792205487243778e-05, + "loss": 0.0414, + "num_input_tokens_seen": 143796272, + "step": 118175 + }, + { + "epoch": 13.161822029179197, + "grad_norm": 8.635045378468931e-05, + "learning_rate": 1.5789946597833742e-05, + "loss": 0.0082, + "num_input_tokens_seen": 143802672, + "step": 118180 + }, + { + "epoch": 13.162378884062813, + "grad_norm": 0.3047638535499573, + "learning_rate": 1.5787687795420024e-05, + "loss": 0.0571, + "num_input_tokens_seen": 143808880, + "step": 118185 + }, + { + "epoch": 13.16293573894643, + "grad_norm": 0.029427923262119293, + "learning_rate": 1.5785429080023986e-05, + "loss": 0.0163, + "num_input_tokens_seen": 143814960, + "step": 118190 + }, + { + "epoch": 13.163492593830048, + "grad_norm": 0.007849018089473248, + "learning_rate": 1.578317045166695e-05, + "loss": 0.0338, + "num_input_tokens_seen": 143820912, + "step": 118195 + }, + { + "epoch": 13.164049448713666, + "grad_norm": 0.038073424249887466, + "learning_rate": 1.5780911910370256e-05, + "loss": 0.0144, + "num_input_tokens_seen": 143826960, + "step": 118200 + }, + { + "epoch": 13.164606303597283, + "grad_norm": 0.29416143894195557, + "learning_rate": 1.5778653456155228e-05, + "loss": 0.0067, + "num_input_tokens_seen": 143833072, + "step": 118205 + }, + { + "epoch": 13.1651631584809, + "grad_norm": 1.0991967916488647, + "learning_rate": 1.5776395089043214e-05, + "loss": 0.0329, + "num_input_tokens_seen": 143839248, + "step": 118210 + }, + { + "epoch": 13.165720013364517, + "grad_norm": 0.0938163548707962, + "learning_rate": 1.5774136809055534e-05, + "loss": 0.0018, + "num_input_tokens_seen": 143845520, + "step": 118215 + }, + { + "epoch": 13.166276868248135, + "grad_norm": 0.21226289868354797, + "learning_rate": 1.5771878616213525e-05, + "loss": 0.0745, + "num_input_tokens_seen": 143851408, + "step": 118220 + }, + { + "epoch": 13.166833723131752, + "grad_norm": 0.1697894036769867, + "learning_rate": 1.5769620510538515e-05, + "loss": 0.0384, + "num_input_tokens_seen": 143857584, + "step": 118225 + }, + { + "epoch": 13.16739057801537, + "grad_norm": 0.966468095779419, + "learning_rate": 1.5767362492051834e-05, + "loss": 0.0403, + "num_input_tokens_seen": 143863568, + "step": 118230 + }, + { + "epoch": 13.167947432898986, + "grad_norm": 0.28054356575012207, + "learning_rate": 1.576510456077481e-05, + "loss": 0.0593, + "num_input_tokens_seen": 143869296, + "step": 118235 + }, + { + "epoch": 13.168504287782604, + "grad_norm": 0.9247165322303772, + "learning_rate": 1.576284671672878e-05, + "loss": 0.0642, + "num_input_tokens_seen": 143875408, + "step": 118240 + }, + { + "epoch": 13.169061142666221, + "grad_norm": 0.002845372073352337, + "learning_rate": 1.5760588959935053e-05, + "loss": 0.0321, + "num_input_tokens_seen": 143881392, + "step": 118245 + }, + { + "epoch": 13.169617997549839, + "grad_norm": 0.0229951124638319, + "learning_rate": 1.5758331290414976e-05, + "loss": 0.014, + "num_input_tokens_seen": 143887536, + "step": 118250 + }, + { + "epoch": 13.170174852433457, + "grad_norm": 0.013407336547970772, + "learning_rate": 1.575607370818985e-05, + "loss": 0.027, + "num_input_tokens_seen": 143893680, + "step": 118255 + }, + { + "epoch": 13.170731707317072, + "grad_norm": 0.0007743217865936458, + "learning_rate": 1.5753816213281024e-05, + "loss": 0.0559, + "num_input_tokens_seen": 143899536, + "step": 118260 + }, + { + "epoch": 13.17128856220069, + "grad_norm": 0.47791633009910583, + "learning_rate": 1.575155880570981e-05, + "loss": 0.0479, + "num_input_tokens_seen": 143905456, + "step": 118265 + }, + { + "epoch": 13.171845417084308, + "grad_norm": 0.0007733259699307382, + "learning_rate": 1.5749301485497535e-05, + "loss": 0.044, + "num_input_tokens_seen": 143911536, + "step": 118270 + }, + { + "epoch": 13.172402271967925, + "grad_norm": 0.08579692989587784, + "learning_rate": 1.5747044252665517e-05, + "loss": 0.0322, + "num_input_tokens_seen": 143917392, + "step": 118275 + }, + { + "epoch": 13.172959126851543, + "grad_norm": 0.31496721506118774, + "learning_rate": 1.5744787107235086e-05, + "loss": 0.0178, + "num_input_tokens_seen": 143923568, + "step": 118280 + }, + { + "epoch": 13.173515981735159, + "grad_norm": 1.240330696105957, + "learning_rate": 1.5742530049227545e-05, + "loss": 0.0778, + "num_input_tokens_seen": 143929424, + "step": 118285 + }, + { + "epoch": 13.174072836618777, + "grad_norm": 1.5567926168441772, + "learning_rate": 1.5740273078664233e-05, + "loss": 0.079, + "num_input_tokens_seen": 143935472, + "step": 118290 + }, + { + "epoch": 13.174629691502394, + "grad_norm": 0.3775228261947632, + "learning_rate": 1.5738016195566454e-05, + "loss": 0.008, + "num_input_tokens_seen": 143941296, + "step": 118295 + }, + { + "epoch": 13.175186546386012, + "grad_norm": 0.031795524060726166, + "learning_rate": 1.573575939995554e-05, + "loss": 0.0812, + "num_input_tokens_seen": 143947472, + "step": 118300 + }, + { + "epoch": 13.17574340126963, + "grad_norm": 0.1279994696378708, + "learning_rate": 1.5733502691852788e-05, + "loss": 0.0068, + "num_input_tokens_seen": 143953616, + "step": 118305 + }, + { + "epoch": 13.176300256153246, + "grad_norm": 0.0006608995608985424, + "learning_rate": 1.5731246071279542e-05, + "loss": 0.0528, + "num_input_tokens_seen": 143959920, + "step": 118310 + }, + { + "epoch": 13.176857111036863, + "grad_norm": 0.04009415581822395, + "learning_rate": 1.5728989538257093e-05, + "loss": 0.0187, + "num_input_tokens_seen": 143966352, + "step": 118315 + }, + { + "epoch": 13.177413965920481, + "grad_norm": 0.011003613471984863, + "learning_rate": 1.572673309280677e-05, + "loss": 0.0134, + "num_input_tokens_seen": 143972368, + "step": 118320 + }, + { + "epoch": 13.177970820804099, + "grad_norm": 0.025976505130529404, + "learning_rate": 1.5724476734949878e-05, + "loss": 0.0943, + "num_input_tokens_seen": 143978736, + "step": 118325 + }, + { + "epoch": 13.178527675687716, + "grad_norm": 0.007314932066947222, + "learning_rate": 1.572222046470774e-05, + "loss": 0.0061, + "num_input_tokens_seen": 143984848, + "step": 118330 + }, + { + "epoch": 13.179084530571334, + "grad_norm": 0.23834455013275146, + "learning_rate": 1.5719964282101664e-05, + "loss": 0.0439, + "num_input_tokens_seen": 143990608, + "step": 118335 + }, + { + "epoch": 13.17964138545495, + "grad_norm": 1.0553940534591675, + "learning_rate": 1.571770818715296e-05, + "loss": 0.1309, + "num_input_tokens_seen": 143996720, + "step": 118340 + }, + { + "epoch": 13.180198240338568, + "grad_norm": 0.7689787149429321, + "learning_rate": 1.5715452179882934e-05, + "loss": 0.1758, + "num_input_tokens_seen": 144002864, + "step": 118345 + }, + { + "epoch": 13.180755095222185, + "grad_norm": 0.019030768424272537, + "learning_rate": 1.5713196260312906e-05, + "loss": 0.0008, + "num_input_tokens_seen": 144009040, + "step": 118350 + }, + { + "epoch": 13.181311950105803, + "grad_norm": 1.1352252960205078, + "learning_rate": 1.5710940428464174e-05, + "loss": 0.0382, + "num_input_tokens_seen": 144015248, + "step": 118355 + }, + { + "epoch": 13.18186880498942, + "grad_norm": 0.1131119504570961, + "learning_rate": 1.570868468435806e-05, + "loss": 0.0112, + "num_input_tokens_seen": 144021616, + "step": 118360 + }, + { + "epoch": 13.182425659873036, + "grad_norm": 0.26248878240585327, + "learning_rate": 1.570642902801585e-05, + "loss": 0.0241, + "num_input_tokens_seen": 144027632, + "step": 118365 + }, + { + "epoch": 13.182982514756654, + "grad_norm": 0.12664376199245453, + "learning_rate": 1.5704173459458877e-05, + "loss": 0.0219, + "num_input_tokens_seen": 144033872, + "step": 118370 + }, + { + "epoch": 13.183539369640272, + "grad_norm": 0.014508705586194992, + "learning_rate": 1.5701917978708426e-05, + "loss": 0.0055, + "num_input_tokens_seen": 144040112, + "step": 118375 + }, + { + "epoch": 13.18409622452389, + "grad_norm": 0.7071214318275452, + "learning_rate": 1.5699662585785812e-05, + "loss": 0.0603, + "num_input_tokens_seen": 144046128, + "step": 118380 + }, + { + "epoch": 13.184653079407507, + "grad_norm": 0.00012220321514178067, + "learning_rate": 1.5697407280712335e-05, + "loss": 0.0391, + "num_input_tokens_seen": 144052304, + "step": 118385 + }, + { + "epoch": 13.185209934291123, + "grad_norm": 0.3820338249206543, + "learning_rate": 1.56951520635093e-05, + "loss": 0.1038, + "num_input_tokens_seen": 144057680, + "step": 118390 + }, + { + "epoch": 13.18576678917474, + "grad_norm": 0.8820061683654785, + "learning_rate": 1.569289693419801e-05, + "loss": 0.0185, + "num_input_tokens_seen": 144063440, + "step": 118395 + }, + { + "epoch": 13.186323644058358, + "grad_norm": 0.016132688149809837, + "learning_rate": 1.5690641892799768e-05, + "loss": 0.014, + "num_input_tokens_seen": 144069008, + "step": 118400 + }, + { + "epoch": 13.186880498941976, + "grad_norm": 0.025350147858262062, + "learning_rate": 1.5688386939335864e-05, + "loss": 0.009, + "num_input_tokens_seen": 144075344, + "step": 118405 + }, + { + "epoch": 13.187437353825594, + "grad_norm": 0.0007849938701838255, + "learning_rate": 1.5686132073827615e-05, + "loss": 0.0023, + "num_input_tokens_seen": 144081744, + "step": 118410 + }, + { + "epoch": 13.18799420870921, + "grad_norm": 0.07881993055343628, + "learning_rate": 1.56838772962963e-05, + "loss": 0.0537, + "num_input_tokens_seen": 144087888, + "step": 118415 + }, + { + "epoch": 13.188551063592827, + "grad_norm": 1.1319440603256226, + "learning_rate": 1.5681622606763235e-05, + "loss": 0.1469, + "num_input_tokens_seen": 144093904, + "step": 118420 + }, + { + "epoch": 13.189107918476445, + "grad_norm": 0.746737003326416, + "learning_rate": 1.567936800524971e-05, + "loss": 0.0108, + "num_input_tokens_seen": 144099888, + "step": 118425 + }, + { + "epoch": 13.189664773360063, + "grad_norm": 0.09415857493877411, + "learning_rate": 1.5677113491777024e-05, + "loss": 0.0635, + "num_input_tokens_seen": 144106064, + "step": 118430 + }, + { + "epoch": 13.19022162824368, + "grad_norm": 1.4488176107406616, + "learning_rate": 1.5674859066366457e-05, + "loss": 0.0556, + "num_input_tokens_seen": 144112304, + "step": 118435 + }, + { + "epoch": 13.190778483127296, + "grad_norm": 0.000721924239769578, + "learning_rate": 1.5672604729039337e-05, + "loss": 0.0944, + "num_input_tokens_seen": 144118480, + "step": 118440 + }, + { + "epoch": 13.191335338010914, + "grad_norm": 0.005853366572409868, + "learning_rate": 1.567035047981692e-05, + "loss": 0.0015, + "num_input_tokens_seen": 144124752, + "step": 118445 + }, + { + "epoch": 13.191892192894532, + "grad_norm": 0.5145363211631775, + "learning_rate": 1.5668096318720526e-05, + "loss": 0.0193, + "num_input_tokens_seen": 144131184, + "step": 118450 + }, + { + "epoch": 13.19244904777815, + "grad_norm": 0.027331823483109474, + "learning_rate": 1.566584224577144e-05, + "loss": 0.0172, + "num_input_tokens_seen": 144137584, + "step": 118455 + }, + { + "epoch": 13.193005902661767, + "grad_norm": 0.001631145947612822, + "learning_rate": 1.5663588260990954e-05, + "loss": 0.0553, + "num_input_tokens_seen": 144143056, + "step": 118460 + }, + { + "epoch": 13.193562757545383, + "grad_norm": 2.069847822189331, + "learning_rate": 1.566133436440035e-05, + "loss": 0.0755, + "num_input_tokens_seen": 144149104, + "step": 118465 + }, + { + "epoch": 13.194119612429, + "grad_norm": 0.1508839726448059, + "learning_rate": 1.5659080556020933e-05, + "loss": 0.0194, + "num_input_tokens_seen": 144155088, + "step": 118470 + }, + { + "epoch": 13.194676467312618, + "grad_norm": 0.49527284502983093, + "learning_rate": 1.565682683587398e-05, + "loss": 0.1324, + "num_input_tokens_seen": 144161136, + "step": 118475 + }, + { + "epoch": 13.195233322196236, + "grad_norm": 0.0001704484166111797, + "learning_rate": 1.5654573203980784e-05, + "loss": 0.0271, + "num_input_tokens_seen": 144167056, + "step": 118480 + }, + { + "epoch": 13.195790177079854, + "grad_norm": 0.044559136033058167, + "learning_rate": 1.565231966036263e-05, + "loss": 0.0073, + "num_input_tokens_seen": 144173104, + "step": 118485 + }, + { + "epoch": 13.19634703196347, + "grad_norm": 1.0953882932662964, + "learning_rate": 1.565006620504081e-05, + "loss": 0.0824, + "num_input_tokens_seen": 144179344, + "step": 118490 + }, + { + "epoch": 13.196903886847087, + "grad_norm": 0.37469133734703064, + "learning_rate": 1.5647812838036592e-05, + "loss": 0.1758, + "num_input_tokens_seen": 144185008, + "step": 118495 + }, + { + "epoch": 13.197460741730705, + "grad_norm": 2.780972719192505, + "learning_rate": 1.564555955937129e-05, + "loss": 0.0855, + "num_input_tokens_seen": 144191344, + "step": 118500 + }, + { + "epoch": 13.198017596614322, + "grad_norm": 0.10411148518323898, + "learning_rate": 1.5643306369066173e-05, + "loss": 0.0027, + "num_input_tokens_seen": 144197392, + "step": 118505 + }, + { + "epoch": 13.19857445149794, + "grad_norm": 0.40886256098747253, + "learning_rate": 1.5641053267142512e-05, + "loss": 0.0097, + "num_input_tokens_seen": 144203408, + "step": 118510 + }, + { + "epoch": 13.199131306381556, + "grad_norm": 1.196171760559082, + "learning_rate": 1.5638800253621617e-05, + "loss": 0.0179, + "num_input_tokens_seen": 144209616, + "step": 118515 + }, + { + "epoch": 13.199688161265174, + "grad_norm": 0.05203362554311752, + "learning_rate": 1.5636547328524738e-05, + "loss": 0.0423, + "num_input_tokens_seen": 144215760, + "step": 118520 + }, + { + "epoch": 13.200245016148791, + "grad_norm": 0.8580573797225952, + "learning_rate": 1.5634294491873185e-05, + "loss": 0.056, + "num_input_tokens_seen": 144221968, + "step": 118525 + }, + { + "epoch": 13.200801871032409, + "grad_norm": 0.02506898157298565, + "learning_rate": 1.563204174368821e-05, + "loss": 0.0287, + "num_input_tokens_seen": 144228048, + "step": 118530 + }, + { + "epoch": 13.201358725916027, + "grad_norm": 0.05010732263326645, + "learning_rate": 1.5629789083991113e-05, + "loss": 0.002, + "num_input_tokens_seen": 144234384, + "step": 118535 + }, + { + "epoch": 13.201915580799644, + "grad_norm": 1.3027483224868774, + "learning_rate": 1.5627536512803166e-05, + "loss": 0.0579, + "num_input_tokens_seen": 144240560, + "step": 118540 + }, + { + "epoch": 13.20247243568326, + "grad_norm": 0.0007576004136353731, + "learning_rate": 1.562528403014565e-05, + "loss": 0.0175, + "num_input_tokens_seen": 144246448, + "step": 118545 + }, + { + "epoch": 13.203029290566878, + "grad_norm": 0.005130761303007603, + "learning_rate": 1.562303163603983e-05, + "loss": 0.0257, + "num_input_tokens_seen": 144252560, + "step": 118550 + }, + { + "epoch": 13.203586145450496, + "grad_norm": 0.3659552037715912, + "learning_rate": 1.562077933050699e-05, + "loss": 0.0217, + "num_input_tokens_seen": 144258672, + "step": 118555 + }, + { + "epoch": 13.204143000334113, + "grad_norm": 0.20547209680080414, + "learning_rate": 1.5618527113568406e-05, + "loss": 0.0171, + "num_input_tokens_seen": 144265232, + "step": 118560 + }, + { + "epoch": 13.204699855217731, + "grad_norm": 0.004810336511582136, + "learning_rate": 1.561627498524535e-05, + "loss": 0.0464, + "num_input_tokens_seen": 144271312, + "step": 118565 + }, + { + "epoch": 13.205256710101347, + "grad_norm": 0.6165716052055359, + "learning_rate": 1.561402294555909e-05, + "loss": 0.0172, + "num_input_tokens_seen": 144277520, + "step": 118570 + }, + { + "epoch": 13.205813564984965, + "grad_norm": 0.03644329309463501, + "learning_rate": 1.561177099453091e-05, + "loss": 0.0268, + "num_input_tokens_seen": 144283568, + "step": 118575 + }, + { + "epoch": 13.206370419868582, + "grad_norm": 0.2368139773607254, + "learning_rate": 1.5609519132182065e-05, + "loss": 0.0905, + "num_input_tokens_seen": 144289584, + "step": 118580 + }, + { + "epoch": 13.2069272747522, + "grad_norm": 0.1096777692437172, + "learning_rate": 1.560726735853385e-05, + "loss": 0.0978, + "num_input_tokens_seen": 144295696, + "step": 118585 + }, + { + "epoch": 13.207484129635818, + "grad_norm": 0.010110203176736832, + "learning_rate": 1.5605015673607507e-05, + "loss": 0.0445, + "num_input_tokens_seen": 144301552, + "step": 118590 + }, + { + "epoch": 13.208040984519434, + "grad_norm": 0.028619056567549706, + "learning_rate": 1.5602764077424324e-05, + "loss": 0.0694, + "num_input_tokens_seen": 144307664, + "step": 118595 + }, + { + "epoch": 13.208597839403051, + "grad_norm": 0.6057981252670288, + "learning_rate": 1.560051257000556e-05, + "loss": 0.0095, + "num_input_tokens_seen": 144313648, + "step": 118600 + }, + { + "epoch": 13.209154694286669, + "grad_norm": 0.2418573647737503, + "learning_rate": 1.559826115137249e-05, + "loss": 0.0182, + "num_input_tokens_seen": 144319728, + "step": 118605 + }, + { + "epoch": 13.209711549170287, + "grad_norm": 0.7053864598274231, + "learning_rate": 1.5596009821546375e-05, + "loss": 0.062, + "num_input_tokens_seen": 144325872, + "step": 118610 + }, + { + "epoch": 13.210268404053904, + "grad_norm": 0.001527181128039956, + "learning_rate": 1.5593758580548486e-05, + "loss": 0.005, + "num_input_tokens_seen": 144331888, + "step": 118615 + }, + { + "epoch": 13.21082525893752, + "grad_norm": 0.05813997611403465, + "learning_rate": 1.559150742840007e-05, + "loss": 0.0086, + "num_input_tokens_seen": 144337808, + "step": 118620 + }, + { + "epoch": 13.211382113821138, + "grad_norm": 0.02209511585533619, + "learning_rate": 1.5589256365122418e-05, + "loss": 0.015, + "num_input_tokens_seen": 144343728, + "step": 118625 + }, + { + "epoch": 13.211938968704755, + "grad_norm": 0.14900191128253937, + "learning_rate": 1.5587005390736768e-05, + "loss": 0.0538, + "num_input_tokens_seen": 144349648, + "step": 118630 + }, + { + "epoch": 13.212495823588373, + "grad_norm": 2.781144857406616, + "learning_rate": 1.5584754505264404e-05, + "loss": 0.2181, + "num_input_tokens_seen": 144355792, + "step": 118635 + }, + { + "epoch": 13.21305267847199, + "grad_norm": 0.006893330719321966, + "learning_rate": 1.5582503708726565e-05, + "loss": 0.0147, + "num_input_tokens_seen": 144362160, + "step": 118640 + }, + { + "epoch": 13.213609533355607, + "grad_norm": 0.003154060570523143, + "learning_rate": 1.558025300114454e-05, + "loss": 0.06, + "num_input_tokens_seen": 144368368, + "step": 118645 + }, + { + "epoch": 13.214166388239224, + "grad_norm": 0.3017161786556244, + "learning_rate": 1.5578002382539555e-05, + "loss": 0.0424, + "num_input_tokens_seen": 144374672, + "step": 118650 + }, + { + "epoch": 13.214723243122842, + "grad_norm": 1.3250253200531006, + "learning_rate": 1.55757518529329e-05, + "loss": 0.1459, + "num_input_tokens_seen": 144380784, + "step": 118655 + }, + { + "epoch": 13.21528009800646, + "grad_norm": 0.5790074467658997, + "learning_rate": 1.557350141234581e-05, + "loss": 0.234, + "num_input_tokens_seen": 144386608, + "step": 118660 + }, + { + "epoch": 13.215836952890077, + "grad_norm": 0.005987817421555519, + "learning_rate": 1.5571251060799558e-05, + "loss": 0.0532, + "num_input_tokens_seen": 144392752, + "step": 118665 + }, + { + "epoch": 13.216393807773693, + "grad_norm": 0.06975790858268738, + "learning_rate": 1.556900079831539e-05, + "loss": 0.0445, + "num_input_tokens_seen": 144398512, + "step": 118670 + }, + { + "epoch": 13.216950662657311, + "grad_norm": 0.7230608463287354, + "learning_rate": 1.556675062491457e-05, + "loss": 0.0596, + "num_input_tokens_seen": 144404592, + "step": 118675 + }, + { + "epoch": 13.217507517540929, + "grad_norm": 0.002671843161806464, + "learning_rate": 1.5564500540618345e-05, + "loss": 0.015, + "num_input_tokens_seen": 144410832, + "step": 118680 + }, + { + "epoch": 13.218064372424546, + "grad_norm": 0.006237485911697149, + "learning_rate": 1.556225054544797e-05, + "loss": 0.0094, + "num_input_tokens_seen": 144417168, + "step": 118685 + }, + { + "epoch": 13.218621227308164, + "grad_norm": 0.18697088956832886, + "learning_rate": 1.55600006394247e-05, + "loss": 0.0143, + "num_input_tokens_seen": 144423248, + "step": 118690 + }, + { + "epoch": 13.219178082191782, + "grad_norm": 0.10532836616039276, + "learning_rate": 1.5557750822569794e-05, + "loss": 0.1469, + "num_input_tokens_seen": 144429328, + "step": 118695 + }, + { + "epoch": 13.219734937075398, + "grad_norm": 1.6055275201797485, + "learning_rate": 1.555550109490449e-05, + "loss": 0.0841, + "num_input_tokens_seen": 144435536, + "step": 118700 + }, + { + "epoch": 13.220291791959015, + "grad_norm": 0.5036843419075012, + "learning_rate": 1.5553251456450048e-05, + "loss": 0.0414, + "num_input_tokens_seen": 144441840, + "step": 118705 + }, + { + "epoch": 13.220848646842633, + "grad_norm": 0.19439347088336945, + "learning_rate": 1.5551001907227706e-05, + "loss": 0.0097, + "num_input_tokens_seen": 144448048, + "step": 118710 + }, + { + "epoch": 13.22140550172625, + "grad_norm": 0.0010632254416123033, + "learning_rate": 1.5548752447258734e-05, + "loss": 0.0205, + "num_input_tokens_seen": 144453744, + "step": 118715 + }, + { + "epoch": 13.221962356609868, + "grad_norm": 1.3572044372558594, + "learning_rate": 1.5546503076564358e-05, + "loss": 0.1201, + "num_input_tokens_seen": 144459888, + "step": 118720 + }, + { + "epoch": 13.222519211493484, + "grad_norm": 0.031779780983924866, + "learning_rate": 1.554425379516584e-05, + "loss": 0.0822, + "num_input_tokens_seen": 144466192, + "step": 118725 + }, + { + "epoch": 13.223076066377102, + "grad_norm": 0.0309700109064579, + "learning_rate": 1.5542004603084418e-05, + "loss": 0.007, + "num_input_tokens_seen": 144472464, + "step": 118730 + }, + { + "epoch": 13.22363292126072, + "grad_norm": 0.22547686100006104, + "learning_rate": 1.5539755500341342e-05, + "loss": 0.1136, + "num_input_tokens_seen": 144478448, + "step": 118735 + }, + { + "epoch": 13.224189776144337, + "grad_norm": 0.1045527458190918, + "learning_rate": 1.5537506486957854e-05, + "loss": 0.0217, + "num_input_tokens_seen": 144484176, + "step": 118740 + }, + { + "epoch": 13.224746631027955, + "grad_norm": 0.0035027831327170134, + "learning_rate": 1.5535257562955203e-05, + "loss": 0.0505, + "num_input_tokens_seen": 144490256, + "step": 118745 + }, + { + "epoch": 13.22530348591157, + "grad_norm": 0.659336268901825, + "learning_rate": 1.5533008728354616e-05, + "loss": 0.0663, + "num_input_tokens_seen": 144496176, + "step": 118750 + }, + { + "epoch": 13.225860340795188, + "grad_norm": 0.006794023793190718, + "learning_rate": 1.5530759983177357e-05, + "loss": 0.0327, + "num_input_tokens_seen": 144502256, + "step": 118755 + }, + { + "epoch": 13.226417195678806, + "grad_norm": 1.928250789642334, + "learning_rate": 1.552851132744465e-05, + "loss": 0.1109, + "num_input_tokens_seen": 144508432, + "step": 118760 + }, + { + "epoch": 13.226974050562424, + "grad_norm": 0.06925533711910248, + "learning_rate": 1.552626276117775e-05, + "loss": 0.0298, + "num_input_tokens_seen": 144514608, + "step": 118765 + }, + { + "epoch": 13.227530905446041, + "grad_norm": 0.07905939221382141, + "learning_rate": 1.5524014284397876e-05, + "loss": 0.0099, + "num_input_tokens_seen": 144520848, + "step": 118770 + }, + { + "epoch": 13.228087760329657, + "grad_norm": 0.08141829073429108, + "learning_rate": 1.5521765897126295e-05, + "loss": 0.0009, + "num_input_tokens_seen": 144527216, + "step": 118775 + }, + { + "epoch": 13.228644615213275, + "grad_norm": 0.09977647662162781, + "learning_rate": 1.551951759938421e-05, + "loss": 0.0102, + "num_input_tokens_seen": 144532912, + "step": 118780 + }, + { + "epoch": 13.229201470096893, + "grad_norm": 0.4812389314174652, + "learning_rate": 1.551726939119289e-05, + "loss": 0.0114, + "num_input_tokens_seen": 144538864, + "step": 118785 + }, + { + "epoch": 13.22975832498051, + "grad_norm": 0.6713928580284119, + "learning_rate": 1.5515021272573553e-05, + "loss": 0.0487, + "num_input_tokens_seen": 144544368, + "step": 118790 + }, + { + "epoch": 13.230315179864128, + "grad_norm": 0.0032804543152451515, + "learning_rate": 1.5512773243547445e-05, + "loss": 0.0476, + "num_input_tokens_seen": 144550544, + "step": 118795 + }, + { + "epoch": 13.230872034747744, + "grad_norm": 0.462542861700058, + "learning_rate": 1.5510525304135787e-05, + "loss": 0.0389, + "num_input_tokens_seen": 144556560, + "step": 118800 + }, + { + "epoch": 13.231428889631362, + "grad_norm": 1.2144876718521118, + "learning_rate": 1.550827745435983e-05, + "loss": 0.0462, + "num_input_tokens_seen": 144562416, + "step": 118805 + }, + { + "epoch": 13.23198574451498, + "grad_norm": 0.015458297915756702, + "learning_rate": 1.5506029694240787e-05, + "loss": 0.0163, + "num_input_tokens_seen": 144568464, + "step": 118810 + }, + { + "epoch": 13.232542599398597, + "grad_norm": 1.7679553031921387, + "learning_rate": 1.5503782023799908e-05, + "loss": 0.0669, + "num_input_tokens_seen": 144574480, + "step": 118815 + }, + { + "epoch": 13.233099454282215, + "grad_norm": 0.7847536206245422, + "learning_rate": 1.550153444305841e-05, + "loss": 0.0217, + "num_input_tokens_seen": 144580784, + "step": 118820 + }, + { + "epoch": 13.23365630916583, + "grad_norm": 0.17287464439868927, + "learning_rate": 1.5499286952037536e-05, + "loss": 0.1112, + "num_input_tokens_seen": 144586384, + "step": 118825 + }, + { + "epoch": 13.234213164049448, + "grad_norm": 1.4521957635879517, + "learning_rate": 1.5497039550758496e-05, + "loss": 0.0528, + "num_input_tokens_seen": 144591984, + "step": 118830 + }, + { + "epoch": 13.234770018933066, + "grad_norm": 0.2103063464164734, + "learning_rate": 1.549479223924255e-05, + "loss": 0.0302, + "num_input_tokens_seen": 144598192, + "step": 118835 + }, + { + "epoch": 13.235326873816684, + "grad_norm": 0.00010771772940643132, + "learning_rate": 1.5492545017510886e-05, + "loss": 0.0303, + "num_input_tokens_seen": 144604528, + "step": 118840 + }, + { + "epoch": 13.235883728700301, + "grad_norm": 0.22062069177627563, + "learning_rate": 1.549029788558477e-05, + "loss": 0.0114, + "num_input_tokens_seen": 144610896, + "step": 118845 + }, + { + "epoch": 13.236440583583917, + "grad_norm": 0.0003162174834869802, + "learning_rate": 1.5488050843485395e-05, + "loss": 0.1453, + "num_input_tokens_seen": 144616912, + "step": 118850 + }, + { + "epoch": 13.236997438467535, + "grad_norm": 4.170963764190674, + "learning_rate": 1.548580389123401e-05, + "loss": 0.0795, + "num_input_tokens_seen": 144622928, + "step": 118855 + }, + { + "epoch": 13.237554293351153, + "grad_norm": 0.07190119475126266, + "learning_rate": 1.5483557028851824e-05, + "loss": 0.0521, + "num_input_tokens_seen": 144628880, + "step": 118860 + }, + { + "epoch": 13.23811114823477, + "grad_norm": 0.009992486797273159, + "learning_rate": 1.5481310256360072e-05, + "loss": 0.0125, + "num_input_tokens_seen": 144634960, + "step": 118865 + }, + { + "epoch": 13.238668003118388, + "grad_norm": 0.000248290307354182, + "learning_rate": 1.5479063573779967e-05, + "loss": 0.0064, + "num_input_tokens_seen": 144640912, + "step": 118870 + }, + { + "epoch": 13.239224858002006, + "grad_norm": 0.30945098400115967, + "learning_rate": 1.5476816981132738e-05, + "loss": 0.0332, + "num_input_tokens_seen": 144646512, + "step": 118875 + }, + { + "epoch": 13.239781712885621, + "grad_norm": 0.49618908762931824, + "learning_rate": 1.5474570478439598e-05, + "loss": 0.128, + "num_input_tokens_seen": 144652176, + "step": 118880 + }, + { + "epoch": 13.24033856776924, + "grad_norm": 0.0010201596887782216, + "learning_rate": 1.5472324065721778e-05, + "loss": 0.0241, + "num_input_tokens_seen": 144658480, + "step": 118885 + }, + { + "epoch": 13.240895422652857, + "grad_norm": 0.0014681540196761489, + "learning_rate": 1.5470077743000483e-05, + "loss": 0.0115, + "num_input_tokens_seen": 144664880, + "step": 118890 + }, + { + "epoch": 13.241452277536474, + "grad_norm": 1.061305046081543, + "learning_rate": 1.5467831510296943e-05, + "loss": 0.0263, + "num_input_tokens_seen": 144671152, + "step": 118895 + }, + { + "epoch": 13.242009132420092, + "grad_norm": 1.5922794342041016, + "learning_rate": 1.5465585367632366e-05, + "loss": 0.1144, + "num_input_tokens_seen": 144677424, + "step": 118900 + }, + { + "epoch": 13.242565987303708, + "grad_norm": 0.9030792713165283, + "learning_rate": 1.5463339315027987e-05, + "loss": 0.0275, + "num_input_tokens_seen": 144683568, + "step": 118905 + }, + { + "epoch": 13.243122842187326, + "grad_norm": 0.19110724329948425, + "learning_rate": 1.546109335250499e-05, + "loss": 0.0233, + "num_input_tokens_seen": 144689680, + "step": 118910 + }, + { + "epoch": 13.243679697070943, + "grad_norm": 0.22373390197753906, + "learning_rate": 1.5458847480084627e-05, + "loss": 0.1148, + "num_input_tokens_seen": 144695760, + "step": 118915 + }, + { + "epoch": 13.244236551954561, + "grad_norm": 0.096177838742733, + "learning_rate": 1.5456601697788093e-05, + "loss": 0.0034, + "num_input_tokens_seen": 144701904, + "step": 118920 + }, + { + "epoch": 13.244793406838179, + "grad_norm": 0.37095174193382263, + "learning_rate": 1.5454356005636586e-05, + "loss": 0.0318, + "num_input_tokens_seen": 144707760, + "step": 118925 + }, + { + "epoch": 13.245350261721795, + "grad_norm": 0.452140748500824, + "learning_rate": 1.545211040365135e-05, + "loss": 0.0226, + "num_input_tokens_seen": 144713552, + "step": 118930 + }, + { + "epoch": 13.245907116605412, + "grad_norm": 0.010691346600651741, + "learning_rate": 1.5449864891853568e-05, + "loss": 0.0154, + "num_input_tokens_seen": 144718928, + "step": 118935 + }, + { + "epoch": 13.24646397148903, + "grad_norm": 0.009346342645585537, + "learning_rate": 1.5447619470264472e-05, + "loss": 0.0065, + "num_input_tokens_seen": 144725520, + "step": 118940 + }, + { + "epoch": 13.247020826372648, + "grad_norm": 0.40789929032325745, + "learning_rate": 1.544537413890526e-05, + "loss": 0.0482, + "num_input_tokens_seen": 144731600, + "step": 118945 + }, + { + "epoch": 13.247577681256265, + "grad_norm": 0.03364724665880203, + "learning_rate": 1.5443128897797147e-05, + "loss": 0.0665, + "num_input_tokens_seen": 144737840, + "step": 118950 + }, + { + "epoch": 13.248134536139881, + "grad_norm": 1.2771010398864746, + "learning_rate": 1.5440883746961337e-05, + "loss": 0.073, + "num_input_tokens_seen": 144743440, + "step": 118955 + }, + { + "epoch": 13.248691391023499, + "grad_norm": 0.02224802039563656, + "learning_rate": 1.5438638686419036e-05, + "loss": 0.0062, + "num_input_tokens_seen": 144749392, + "step": 118960 + }, + { + "epoch": 13.249248245907117, + "grad_norm": 0.0009777419036254287, + "learning_rate": 1.5436393716191457e-05, + "loss": 0.0163, + "num_input_tokens_seen": 144755472, + "step": 118965 + }, + { + "epoch": 13.249805100790734, + "grad_norm": 0.0005317199975252151, + "learning_rate": 1.5434148836299803e-05, + "loss": 0.0126, + "num_input_tokens_seen": 144761392, + "step": 118970 + }, + { + "epoch": 13.250361955674352, + "grad_norm": 0.9234597682952881, + "learning_rate": 1.5431904046765273e-05, + "loss": 0.1293, + "num_input_tokens_seen": 144767216, + "step": 118975 + }, + { + "epoch": 13.250918810557968, + "grad_norm": 0.046306367963552475, + "learning_rate": 1.542965934760908e-05, + "loss": 0.0348, + "num_input_tokens_seen": 144773264, + "step": 118980 + }, + { + "epoch": 13.251475665441586, + "grad_norm": 1.362736463546753, + "learning_rate": 1.542741473885241e-05, + "loss": 0.0664, + "num_input_tokens_seen": 144779696, + "step": 118985 + }, + { + "epoch": 13.252032520325203, + "grad_norm": 2.0636000633239746, + "learning_rate": 1.5425170220516494e-05, + "loss": 0.0632, + "num_input_tokens_seen": 144785648, + "step": 118990 + }, + { + "epoch": 13.25258937520882, + "grad_norm": 0.016836782917380333, + "learning_rate": 1.54229257926225e-05, + "loss": 0.1052, + "num_input_tokens_seen": 144791728, + "step": 118995 + }, + { + "epoch": 13.253146230092439, + "grad_norm": 0.0003651643346529454, + "learning_rate": 1.5420681455191658e-05, + "loss": 0.0088, + "num_input_tokens_seen": 144797424, + "step": 119000 + }, + { + "epoch": 13.253703084976054, + "grad_norm": 0.04041877016425133, + "learning_rate": 1.5418437208245147e-05, + "loss": 0.0489, + "num_input_tokens_seen": 144803536, + "step": 119005 + }, + { + "epoch": 13.254259939859672, + "grad_norm": 0.0006185214151628315, + "learning_rate": 1.541619305180418e-05, + "loss": 0.0192, + "num_input_tokens_seen": 144809616, + "step": 119010 + }, + { + "epoch": 13.25481679474329, + "grad_norm": 0.5934172868728638, + "learning_rate": 1.5413948985889938e-05, + "loss": 0.1975, + "num_input_tokens_seen": 144815664, + "step": 119015 + }, + { + "epoch": 13.255373649626907, + "grad_norm": 0.03451324626803398, + "learning_rate": 1.541170501052364e-05, + "loss": 0.0013, + "num_input_tokens_seen": 144821744, + "step": 119020 + }, + { + "epoch": 13.255930504510525, + "grad_norm": 0.0025793807581067085, + "learning_rate": 1.540946112572646e-05, + "loss": 0.0173, + "num_input_tokens_seen": 144827952, + "step": 119025 + }, + { + "epoch": 13.256487359394143, + "grad_norm": 0.03527052327990532, + "learning_rate": 1.540721733151961e-05, + "loss": 0.0077, + "num_input_tokens_seen": 144834160, + "step": 119030 + }, + { + "epoch": 13.257044214277759, + "grad_norm": 0.1427379846572876, + "learning_rate": 1.5404973627924276e-05, + "loss": 0.0025, + "num_input_tokens_seen": 144840016, + "step": 119035 + }, + { + "epoch": 13.257601069161376, + "grad_norm": 0.0012173643335700035, + "learning_rate": 1.5402730014961654e-05, + "loss": 0.0127, + "num_input_tokens_seen": 144846288, + "step": 119040 + }, + { + "epoch": 13.258157924044994, + "grad_norm": 0.10875658690929413, + "learning_rate": 1.5400486492652927e-05, + "loss": 0.01, + "num_input_tokens_seen": 144852688, + "step": 119045 + }, + { + "epoch": 13.258714778928612, + "grad_norm": 0.08984248340129852, + "learning_rate": 1.5398243061019314e-05, + "loss": 0.0377, + "num_input_tokens_seen": 144858736, + "step": 119050 + }, + { + "epoch": 13.25927163381223, + "grad_norm": 0.06941217929124832, + "learning_rate": 1.539599972008197e-05, + "loss": 0.0266, + "num_input_tokens_seen": 144864816, + "step": 119055 + }, + { + "epoch": 13.259828488695845, + "grad_norm": 0.823062539100647, + "learning_rate": 1.5393756469862113e-05, + "loss": 0.0275, + "num_input_tokens_seen": 144870928, + "step": 119060 + }, + { + "epoch": 13.260385343579463, + "grad_norm": 0.01750972867012024, + "learning_rate": 1.5391513310380924e-05, + "loss": 0.0289, + "num_input_tokens_seen": 144877296, + "step": 119065 + }, + { + "epoch": 13.26094219846308, + "grad_norm": 0.04038941487669945, + "learning_rate": 1.5389270241659587e-05, + "loss": 0.1466, + "num_input_tokens_seen": 144883248, + "step": 119070 + }, + { + "epoch": 13.261499053346698, + "grad_norm": 0.04746004194021225, + "learning_rate": 1.538702726371929e-05, + "loss": 0.002, + "num_input_tokens_seen": 144889264, + "step": 119075 + }, + { + "epoch": 13.262055908230316, + "grad_norm": 0.1764853447675705, + "learning_rate": 1.5384784376581228e-05, + "loss": 0.0078, + "num_input_tokens_seen": 144895600, + "step": 119080 + }, + { + "epoch": 13.262612763113932, + "grad_norm": 0.0005147152696736157, + "learning_rate": 1.5382541580266578e-05, + "loss": 0.0059, + "num_input_tokens_seen": 144901712, + "step": 119085 + }, + { + "epoch": 13.26316961799755, + "grad_norm": 2.074496269226074, + "learning_rate": 1.538029887479653e-05, + "loss": 0.0373, + "num_input_tokens_seen": 144907792, + "step": 119090 + }, + { + "epoch": 13.263726472881167, + "grad_norm": 1.5191164016723633, + "learning_rate": 1.5378056260192262e-05, + "loss": 0.0667, + "num_input_tokens_seen": 144913936, + "step": 119095 + }, + { + "epoch": 13.264283327764785, + "grad_norm": 0.0006103558698669076, + "learning_rate": 1.5375813736474966e-05, + "loss": 0.0762, + "num_input_tokens_seen": 144919248, + "step": 119100 + }, + { + "epoch": 13.264840182648403, + "grad_norm": 0.0029745788779109716, + "learning_rate": 1.5373571303665813e-05, + "loss": 0.0593, + "num_input_tokens_seen": 144925328, + "step": 119105 + }, + { + "epoch": 13.265397037532018, + "grad_norm": 0.20865154266357422, + "learning_rate": 1.5371328961786003e-05, + "loss": 0.0505, + "num_input_tokens_seen": 144930992, + "step": 119110 + }, + { + "epoch": 13.265953892415636, + "grad_norm": 0.0003261453239247203, + "learning_rate": 1.5369086710856694e-05, + "loss": 0.015, + "num_input_tokens_seen": 144937008, + "step": 119115 + }, + { + "epoch": 13.266510747299254, + "grad_norm": 0.00014500679390039295, + "learning_rate": 1.536684455089909e-05, + "loss": 0.0479, + "num_input_tokens_seen": 144943216, + "step": 119120 + }, + { + "epoch": 13.267067602182872, + "grad_norm": 0.311170369386673, + "learning_rate": 1.536460248193434e-05, + "loss": 0.0046, + "num_input_tokens_seen": 144949040, + "step": 119125 + }, + { + "epoch": 13.26762445706649, + "grad_norm": 0.006443728692829609, + "learning_rate": 1.5362360503983653e-05, + "loss": 0.0069, + "num_input_tokens_seen": 144954960, + "step": 119130 + }, + { + "epoch": 13.268181311950105, + "grad_norm": 0.2621077001094818, + "learning_rate": 1.5360118617068186e-05, + "loss": 0.0556, + "num_input_tokens_seen": 144961072, + "step": 119135 + }, + { + "epoch": 13.268738166833723, + "grad_norm": 0.012346162460744381, + "learning_rate": 1.5357876821209127e-05, + "loss": 0.0017, + "num_input_tokens_seen": 144967280, + "step": 119140 + }, + { + "epoch": 13.26929502171734, + "grad_norm": 0.12294145673513412, + "learning_rate": 1.535563511642764e-05, + "loss": 0.1684, + "num_input_tokens_seen": 144973616, + "step": 119145 + }, + { + "epoch": 13.269851876600958, + "grad_norm": 0.5103297829627991, + "learning_rate": 1.535339350274492e-05, + "loss": 0.0451, + "num_input_tokens_seen": 144979664, + "step": 119150 + }, + { + "epoch": 13.270408731484576, + "grad_norm": 0.03488672897219658, + "learning_rate": 1.535115198018211e-05, + "loss": 0.0107, + "num_input_tokens_seen": 144985584, + "step": 119155 + }, + { + "epoch": 13.270965586368192, + "grad_norm": 0.0007797486032359302, + "learning_rate": 1.534891054876041e-05, + "loss": 0.0385, + "num_input_tokens_seen": 144992112, + "step": 119160 + }, + { + "epoch": 13.27152244125181, + "grad_norm": 0.00030065758619457483, + "learning_rate": 1.534666920850098e-05, + "loss": 0.0382, + "num_input_tokens_seen": 144998224, + "step": 119165 + }, + { + "epoch": 13.272079296135427, + "grad_norm": 1.2774287462234497, + "learning_rate": 1.5344427959424996e-05, + "loss": 0.0221, + "num_input_tokens_seen": 145004336, + "step": 119170 + }, + { + "epoch": 13.272636151019045, + "grad_norm": 0.0005389264551922679, + "learning_rate": 1.5342186801553616e-05, + "loss": 0.0302, + "num_input_tokens_seen": 145009840, + "step": 119175 + }, + { + "epoch": 13.273193005902662, + "grad_norm": 0.004462531302124262, + "learning_rate": 1.5339945734908033e-05, + "loss": 0.0381, + "num_input_tokens_seen": 145015888, + "step": 119180 + }, + { + "epoch": 13.273749860786278, + "grad_norm": 0.00019281856657471508, + "learning_rate": 1.5337704759509387e-05, + "loss": 0.0365, + "num_input_tokens_seen": 145021776, + "step": 119185 + }, + { + "epoch": 13.274306715669896, + "grad_norm": 0.011050606146454811, + "learning_rate": 1.5335463875378872e-05, + "loss": 0.0103, + "num_input_tokens_seen": 145028080, + "step": 119190 + }, + { + "epoch": 13.274863570553514, + "grad_norm": 0.00028723484138026834, + "learning_rate": 1.533322308253764e-05, + "loss": 0.037, + "num_input_tokens_seen": 145034384, + "step": 119195 + }, + { + "epoch": 13.275420425437131, + "grad_norm": 0.03209516033530235, + "learning_rate": 1.5330982381006865e-05, + "loss": 0.0096, + "num_input_tokens_seen": 145040368, + "step": 119200 + }, + { + "epoch": 13.275977280320749, + "grad_norm": 0.2015729546546936, + "learning_rate": 1.53287417708077e-05, + "loss": 0.1585, + "num_input_tokens_seen": 145046480, + "step": 119205 + }, + { + "epoch": 13.276534135204365, + "grad_norm": 0.0004957905621267855, + "learning_rate": 1.5326501251961327e-05, + "loss": 0.0082, + "num_input_tokens_seen": 145052528, + "step": 119210 + }, + { + "epoch": 13.277090990087983, + "grad_norm": 0.0010699424892663956, + "learning_rate": 1.5324260824488893e-05, + "loss": 0.0074, + "num_input_tokens_seen": 145058800, + "step": 119215 + }, + { + "epoch": 13.2776478449716, + "grad_norm": 0.20882773399353027, + "learning_rate": 1.532202048841157e-05, + "loss": 0.0798, + "num_input_tokens_seen": 145064880, + "step": 119220 + }, + { + "epoch": 13.278204699855218, + "grad_norm": 0.019446885213255882, + "learning_rate": 1.5319780243750516e-05, + "loss": 0.1071, + "num_input_tokens_seen": 145071216, + "step": 119225 + }, + { + "epoch": 13.278761554738836, + "grad_norm": 0.005294470116496086, + "learning_rate": 1.53175400905269e-05, + "loss": 0.002, + "num_input_tokens_seen": 145077488, + "step": 119230 + }, + { + "epoch": 13.279318409622453, + "grad_norm": 0.08514484763145447, + "learning_rate": 1.531530002876186e-05, + "loss": 0.024, + "num_input_tokens_seen": 145083600, + "step": 119235 + }, + { + "epoch": 13.27987526450607, + "grad_norm": 0.03987197205424309, + "learning_rate": 1.5313060058476588e-05, + "loss": 0.0557, + "num_input_tokens_seen": 145089520, + "step": 119240 + }, + { + "epoch": 13.280432119389687, + "grad_norm": 0.0037865834310650826, + "learning_rate": 1.531082017969221e-05, + "loss": 0.0135, + "num_input_tokens_seen": 145095600, + "step": 119245 + }, + { + "epoch": 13.280988974273304, + "grad_norm": 0.011890465393662453, + "learning_rate": 1.5308580392429914e-05, + "loss": 0.0661, + "num_input_tokens_seen": 145101424, + "step": 119250 + }, + { + "epoch": 13.281545829156922, + "grad_norm": 0.8300405144691467, + "learning_rate": 1.5306340696710826e-05, + "loss": 0.1502, + "num_input_tokens_seen": 145107248, + "step": 119255 + }, + { + "epoch": 13.28210268404054, + "grad_norm": 0.05229859799146652, + "learning_rate": 1.5304101092556124e-05, + "loss": 0.0775, + "num_input_tokens_seen": 145113616, + "step": 119260 + }, + { + "epoch": 13.282659538924156, + "grad_norm": 0.04587063938379288, + "learning_rate": 1.5301861579986952e-05, + "loss": 0.0259, + "num_input_tokens_seen": 145119792, + "step": 119265 + }, + { + "epoch": 13.283216393807773, + "grad_norm": 0.012302710674703121, + "learning_rate": 1.5299622159024475e-05, + "loss": 0.0117, + "num_input_tokens_seen": 145126288, + "step": 119270 + }, + { + "epoch": 13.283773248691391, + "grad_norm": 0.0003540499019436538, + "learning_rate": 1.5297382829689827e-05, + "loss": 0.0105, + "num_input_tokens_seen": 145132528, + "step": 119275 + }, + { + "epoch": 13.284330103575009, + "grad_norm": 0.0016493209404870868, + "learning_rate": 1.529514359200418e-05, + "loss": 0.0522, + "num_input_tokens_seen": 145138864, + "step": 119280 + }, + { + "epoch": 13.284886958458626, + "grad_norm": 0.0018327925354242325, + "learning_rate": 1.5292904445988676e-05, + "loss": 0.0025, + "num_input_tokens_seen": 145145008, + "step": 119285 + }, + { + "epoch": 13.285443813342242, + "grad_norm": 1.1397511959075928, + "learning_rate": 1.5290665391664467e-05, + "loss": 0.0585, + "num_input_tokens_seen": 145151152, + "step": 119290 + }, + { + "epoch": 13.28600066822586, + "grad_norm": 0.0003955709398724139, + "learning_rate": 1.52884264290527e-05, + "loss": 0.0007, + "num_input_tokens_seen": 145157968, + "step": 119295 + }, + { + "epoch": 13.286557523109478, + "grad_norm": 0.001399620552547276, + "learning_rate": 1.528618755817453e-05, + "loss": 0.0009, + "num_input_tokens_seen": 145164240, + "step": 119300 + }, + { + "epoch": 13.287114377993095, + "grad_norm": 0.11538496613502502, + "learning_rate": 1.528394877905109e-05, + "loss": 0.1479, + "num_input_tokens_seen": 145169968, + "step": 119305 + }, + { + "epoch": 13.287671232876713, + "grad_norm": 0.0007345327758230269, + "learning_rate": 1.5281710091703555e-05, + "loss": 0.0195, + "num_input_tokens_seen": 145176080, + "step": 119310 + }, + { + "epoch": 13.288228087760329, + "grad_norm": 0.3569099009037018, + "learning_rate": 1.5279471496153038e-05, + "loss": 0.0033, + "num_input_tokens_seen": 145181552, + "step": 119315 + }, + { + "epoch": 13.288784942643947, + "grad_norm": 0.27546226978302, + "learning_rate": 1.5277232992420712e-05, + "loss": 0.0269, + "num_input_tokens_seen": 145187728, + "step": 119320 + }, + { + "epoch": 13.289341797527564, + "grad_norm": 0.1802065074443817, + "learning_rate": 1.5274994580527714e-05, + "loss": 0.0268, + "num_input_tokens_seen": 145193872, + "step": 119325 + }, + { + "epoch": 13.289898652411182, + "grad_norm": 0.007012214511632919, + "learning_rate": 1.5272756260495168e-05, + "loss": 0.0852, + "num_input_tokens_seen": 145200016, + "step": 119330 + }, + { + "epoch": 13.2904555072948, + "grad_norm": 0.7054114937782288, + "learning_rate": 1.5270518032344243e-05, + "loss": 0.0211, + "num_input_tokens_seen": 145206608, + "step": 119335 + }, + { + "epoch": 13.291012362178416, + "grad_norm": 0.0018092391546815634, + "learning_rate": 1.526827989609607e-05, + "loss": 0.1178, + "num_input_tokens_seen": 145212464, + "step": 119340 + }, + { + "epoch": 13.291569217062033, + "grad_norm": 0.22130244970321655, + "learning_rate": 1.526604185177179e-05, + "loss": 0.052, + "num_input_tokens_seen": 145218704, + "step": 119345 + }, + { + "epoch": 13.29212607194565, + "grad_norm": 0.05344589799642563, + "learning_rate": 1.526380389939254e-05, + "loss": 0.0482, + "num_input_tokens_seen": 145225200, + "step": 119350 + }, + { + "epoch": 13.292682926829269, + "grad_norm": 0.03895367309451103, + "learning_rate": 1.5261566038979467e-05, + "loss": 0.0302, + "num_input_tokens_seen": 145231376, + "step": 119355 + }, + { + "epoch": 13.293239781712886, + "grad_norm": 0.07676273584365845, + "learning_rate": 1.5259328270553702e-05, + "loss": 0.0461, + "num_input_tokens_seen": 145237744, + "step": 119360 + }, + { + "epoch": 13.293796636596502, + "grad_norm": 0.4513314962387085, + "learning_rate": 1.5257090594136386e-05, + "loss": 0.0108, + "num_input_tokens_seen": 145243984, + "step": 119365 + }, + { + "epoch": 13.29435349148012, + "grad_norm": 0.2599361836910248, + "learning_rate": 1.5254853009748655e-05, + "loss": 0.0158, + "num_input_tokens_seen": 145250352, + "step": 119370 + }, + { + "epoch": 13.294910346363737, + "grad_norm": 0.3928971290588379, + "learning_rate": 1.5252615517411648e-05, + "loss": 0.0032, + "num_input_tokens_seen": 145256720, + "step": 119375 + }, + { + "epoch": 13.295467201247355, + "grad_norm": 0.12110806256532669, + "learning_rate": 1.5250378117146492e-05, + "loss": 0.0348, + "num_input_tokens_seen": 145263024, + "step": 119380 + }, + { + "epoch": 13.296024056130973, + "grad_norm": 0.2713061571121216, + "learning_rate": 1.5248140808974332e-05, + "loss": 0.0259, + "num_input_tokens_seen": 145269104, + "step": 119385 + }, + { + "epoch": 13.29658091101459, + "grad_norm": 0.011081078089773655, + "learning_rate": 1.5245903592916282e-05, + "loss": 0.0494, + "num_input_tokens_seen": 145275280, + "step": 119390 + }, + { + "epoch": 13.297137765898206, + "grad_norm": 0.015157175250351429, + "learning_rate": 1.5243666468993506e-05, + "loss": 0.1389, + "num_input_tokens_seen": 145280784, + "step": 119395 + }, + { + "epoch": 13.297694620781824, + "grad_norm": 0.00315475114621222, + "learning_rate": 1.52414294372271e-05, + "loss": 0.0369, + "num_input_tokens_seen": 145286736, + "step": 119400 + }, + { + "epoch": 13.298251475665442, + "grad_norm": 0.021421419456601143, + "learning_rate": 1.5239192497638222e-05, + "loss": 0.0263, + "num_input_tokens_seen": 145292656, + "step": 119405 + }, + { + "epoch": 13.29880833054906, + "grad_norm": 0.6232615113258362, + "learning_rate": 1.5236955650247987e-05, + "loss": 0.0151, + "num_input_tokens_seen": 145298992, + "step": 119410 + }, + { + "epoch": 13.299365185432677, + "grad_norm": 0.25968077778816223, + "learning_rate": 1.5234718895077533e-05, + "loss": 0.0484, + "num_input_tokens_seen": 145305168, + "step": 119415 + }, + { + "epoch": 13.299922040316293, + "grad_norm": 1.3195570707321167, + "learning_rate": 1.5232482232147976e-05, + "loss": 0.0951, + "num_input_tokens_seen": 145311024, + "step": 119420 + }, + { + "epoch": 13.30047889519991, + "grad_norm": 0.019380265846848488, + "learning_rate": 1.5230245661480455e-05, + "loss": 0.0225, + "num_input_tokens_seen": 145317232, + "step": 119425 + }, + { + "epoch": 13.301035750083528, + "grad_norm": 0.1257101148366928, + "learning_rate": 1.5228009183096087e-05, + "loss": 0.043, + "num_input_tokens_seen": 145323216, + "step": 119430 + }, + { + "epoch": 13.301592604967146, + "grad_norm": 0.6007900238037109, + "learning_rate": 1.5225772797016005e-05, + "loss": 0.1199, + "num_input_tokens_seen": 145329168, + "step": 119435 + }, + { + "epoch": 13.302149459850764, + "grad_norm": 0.0043829078786075115, + "learning_rate": 1.5223536503261327e-05, + "loss": 0.0075, + "num_input_tokens_seen": 145335280, + "step": 119440 + }, + { + "epoch": 13.30270631473438, + "grad_norm": 0.006909448187798262, + "learning_rate": 1.5221300301853184e-05, + "loss": 0.0016, + "num_input_tokens_seen": 145341712, + "step": 119445 + }, + { + "epoch": 13.303263169617997, + "grad_norm": 0.0013540367363020778, + "learning_rate": 1.5219064192812683e-05, + "loss": 0.0311, + "num_input_tokens_seen": 145347728, + "step": 119450 + }, + { + "epoch": 13.303820024501615, + "grad_norm": 0.27392488718032837, + "learning_rate": 1.5216828176160974e-05, + "loss": 0.0784, + "num_input_tokens_seen": 145353488, + "step": 119455 + }, + { + "epoch": 13.304376879385233, + "grad_norm": 0.6428354382514954, + "learning_rate": 1.5214592251919143e-05, + "loss": 0.2162, + "num_input_tokens_seen": 145358928, + "step": 119460 + }, + { + "epoch": 13.30493373426885, + "grad_norm": 0.7586969137191772, + "learning_rate": 1.5212356420108342e-05, + "loss": 0.0125, + "num_input_tokens_seen": 145364944, + "step": 119465 + }, + { + "epoch": 13.305490589152466, + "grad_norm": 0.0011522183194756508, + "learning_rate": 1.5210120680749668e-05, + "loss": 0.0567, + "num_input_tokens_seen": 145371024, + "step": 119470 + }, + { + "epoch": 13.306047444036084, + "grad_norm": 0.3143978714942932, + "learning_rate": 1.5207885033864255e-05, + "loss": 0.0807, + "num_input_tokens_seen": 145377200, + "step": 119475 + }, + { + "epoch": 13.306604298919702, + "grad_norm": 0.003778274869546294, + "learning_rate": 1.5205649479473204e-05, + "loss": 0.0056, + "num_input_tokens_seen": 145383376, + "step": 119480 + }, + { + "epoch": 13.30716115380332, + "grad_norm": 0.32056769728660583, + "learning_rate": 1.520341401759765e-05, + "loss": 0.0205, + "num_input_tokens_seen": 145389360, + "step": 119485 + }, + { + "epoch": 13.307718008686937, + "grad_norm": 0.0006184392259456217, + "learning_rate": 1.5201178648258696e-05, + "loss": 0.0489, + "num_input_tokens_seen": 145395664, + "step": 119490 + }, + { + "epoch": 13.308274863570553, + "grad_norm": 0.003349759615957737, + "learning_rate": 1.5198943371477462e-05, + "loss": 0.0049, + "num_input_tokens_seen": 145401936, + "step": 119495 + }, + { + "epoch": 13.30883171845417, + "grad_norm": 1.1623423099517822, + "learning_rate": 1.5196708187275054e-05, + "loss": 0.0748, + "num_input_tokens_seen": 145408112, + "step": 119500 + }, + { + "epoch": 13.309388573337788, + "grad_norm": 0.0046487292274832726, + "learning_rate": 1.5194473095672601e-05, + "loss": 0.0118, + "num_input_tokens_seen": 145414416, + "step": 119505 + }, + { + "epoch": 13.309945428221406, + "grad_norm": 0.6871376633644104, + "learning_rate": 1.5192238096691192e-05, + "loss": 0.0758, + "num_input_tokens_seen": 145420272, + "step": 119510 + }, + { + "epoch": 13.310502283105023, + "grad_norm": 0.003877032082527876, + "learning_rate": 1.519000319035197e-05, + "loss": 0.0019, + "num_input_tokens_seen": 145426352, + "step": 119515 + }, + { + "epoch": 13.31105913798864, + "grad_norm": 0.0024085906334221363, + "learning_rate": 1.5187768376676009e-05, + "loss": 0.0059, + "num_input_tokens_seen": 145432560, + "step": 119520 + }, + { + "epoch": 13.311615992872257, + "grad_norm": 0.016997508704662323, + "learning_rate": 1.5185533655684456e-05, + "loss": 0.0808, + "num_input_tokens_seen": 145438864, + "step": 119525 + }, + { + "epoch": 13.312172847755875, + "grad_norm": 0.007717866916209459, + "learning_rate": 1.5183299027398385e-05, + "loss": 0.1415, + "num_input_tokens_seen": 145444816, + "step": 119530 + }, + { + "epoch": 13.312729702639492, + "grad_norm": 0.8109947443008423, + "learning_rate": 1.5181064491838926e-05, + "loss": 0.0272, + "num_input_tokens_seen": 145450800, + "step": 119535 + }, + { + "epoch": 13.31328655752311, + "grad_norm": 1.0776209831237793, + "learning_rate": 1.5178830049027177e-05, + "loss": 0.1604, + "num_input_tokens_seen": 145456304, + "step": 119540 + }, + { + "epoch": 13.313843412406726, + "grad_norm": 0.003066073404625058, + "learning_rate": 1.5176595698984252e-05, + "loss": 0.0144, + "num_input_tokens_seen": 145462800, + "step": 119545 + }, + { + "epoch": 13.314400267290344, + "grad_norm": 0.3368648290634155, + "learning_rate": 1.5174361441731246e-05, + "loss": 0.0389, + "num_input_tokens_seen": 145468784, + "step": 119550 + }, + { + "epoch": 13.314957122173961, + "grad_norm": 0.012505370192229748, + "learning_rate": 1.5172127277289272e-05, + "loss": 0.0436, + "num_input_tokens_seen": 145475024, + "step": 119555 + }, + { + "epoch": 13.315513977057579, + "grad_norm": 0.00017084165301639587, + "learning_rate": 1.5169893205679428e-05, + "loss": 0.0047, + "num_input_tokens_seen": 145481008, + "step": 119560 + }, + { + "epoch": 13.316070831941197, + "grad_norm": 0.11961821466684341, + "learning_rate": 1.516765922692282e-05, + "loss": 0.1632, + "num_input_tokens_seen": 145487504, + "step": 119565 + }, + { + "epoch": 13.316627686824813, + "grad_norm": 0.1104574054479599, + "learning_rate": 1.5165425341040546e-05, + "loss": 0.0652, + "num_input_tokens_seen": 145493712, + "step": 119570 + }, + { + "epoch": 13.31718454170843, + "grad_norm": 2.1060261726379395, + "learning_rate": 1.5163191548053713e-05, + "loss": 0.0743, + "num_input_tokens_seen": 145499984, + "step": 119575 + }, + { + "epoch": 13.317741396592048, + "grad_norm": 0.44449952244758606, + "learning_rate": 1.5160957847983406e-05, + "loss": 0.0231, + "num_input_tokens_seen": 145506096, + "step": 119580 + }, + { + "epoch": 13.318298251475666, + "grad_norm": 0.16589950025081635, + "learning_rate": 1.5158724240850752e-05, + "loss": 0.0316, + "num_input_tokens_seen": 145512592, + "step": 119585 + }, + { + "epoch": 13.318855106359283, + "grad_norm": 1.631866216659546, + "learning_rate": 1.5156490726676813e-05, + "loss": 0.0708, + "num_input_tokens_seen": 145518544, + "step": 119590 + }, + { + "epoch": 13.319411961242901, + "grad_norm": 0.014558883383870125, + "learning_rate": 1.5154257305482723e-05, + "loss": 0.0161, + "num_input_tokens_seen": 145524688, + "step": 119595 + }, + { + "epoch": 13.319968816126517, + "grad_norm": 0.0004695463867392391, + "learning_rate": 1.5152023977289548e-05, + "loss": 0.1582, + "num_input_tokens_seen": 145530736, + "step": 119600 + }, + { + "epoch": 13.320525671010135, + "grad_norm": 0.023734232410788536, + "learning_rate": 1.5149790742118407e-05, + "loss": 0.0757, + "num_input_tokens_seen": 145536944, + "step": 119605 + }, + { + "epoch": 13.321082525893752, + "grad_norm": 0.00019679591059684753, + "learning_rate": 1.5147557599990375e-05, + "loss": 0.0016, + "num_input_tokens_seen": 145543280, + "step": 119610 + }, + { + "epoch": 13.32163938077737, + "grad_norm": 0.2470032423734665, + "learning_rate": 1.5145324550926566e-05, + "loss": 0.0391, + "num_input_tokens_seen": 145548688, + "step": 119615 + }, + { + "epoch": 13.322196235660988, + "grad_norm": 0.0011724516516551375, + "learning_rate": 1.514309159494805e-05, + "loss": 0.0395, + "num_input_tokens_seen": 145554864, + "step": 119620 + }, + { + "epoch": 13.322753090544603, + "grad_norm": 0.0222207959741354, + "learning_rate": 1.514085873207594e-05, + "loss": 0.0277, + "num_input_tokens_seen": 145561072, + "step": 119625 + }, + { + "epoch": 13.323309945428221, + "grad_norm": 0.1192295253276825, + "learning_rate": 1.5138625962331315e-05, + "loss": 0.0569, + "num_input_tokens_seen": 145566000, + "step": 119630 + }, + { + "epoch": 13.323866800311839, + "grad_norm": 0.5623458623886108, + "learning_rate": 1.5136393285735271e-05, + "loss": 0.0434, + "num_input_tokens_seen": 145572304, + "step": 119635 + }, + { + "epoch": 13.324423655195456, + "grad_norm": 1.7513891458511353, + "learning_rate": 1.5134160702308886e-05, + "loss": 0.0592, + "num_input_tokens_seen": 145578544, + "step": 119640 + }, + { + "epoch": 13.324980510079074, + "grad_norm": 1.0130671262741089, + "learning_rate": 1.513192821207327e-05, + "loss": 0.0714, + "num_input_tokens_seen": 145584816, + "step": 119645 + }, + { + "epoch": 13.32553736496269, + "grad_norm": 0.22169476747512817, + "learning_rate": 1.5129695815049488e-05, + "loss": 0.0473, + "num_input_tokens_seen": 145590864, + "step": 119650 + }, + { + "epoch": 13.326094219846308, + "grad_norm": 0.6093865036964417, + "learning_rate": 1.5127463511258649e-05, + "loss": 0.0985, + "num_input_tokens_seen": 145597264, + "step": 119655 + }, + { + "epoch": 13.326651074729925, + "grad_norm": 0.6258643865585327, + "learning_rate": 1.512523130072181e-05, + "loss": 0.0217, + "num_input_tokens_seen": 145603312, + "step": 119660 + }, + { + "epoch": 13.327207929613543, + "grad_norm": 0.08218813687562943, + "learning_rate": 1.5122999183460085e-05, + "loss": 0.0827, + "num_input_tokens_seen": 145609488, + "step": 119665 + }, + { + "epoch": 13.32776478449716, + "grad_norm": 0.7847026586532593, + "learning_rate": 1.5120767159494543e-05, + "loss": 0.0274, + "num_input_tokens_seen": 145615472, + "step": 119670 + }, + { + "epoch": 13.328321639380777, + "grad_norm": 0.0013256254605948925, + "learning_rate": 1.5118535228846273e-05, + "loss": 0.0513, + "num_input_tokens_seen": 145621776, + "step": 119675 + }, + { + "epoch": 13.328878494264394, + "grad_norm": 0.2527799606323242, + "learning_rate": 1.5116303391536351e-05, + "loss": 0.0076, + "num_input_tokens_seen": 145627920, + "step": 119680 + }, + { + "epoch": 13.329435349148012, + "grad_norm": 4.698611259460449, + "learning_rate": 1.5114071647585864e-05, + "loss": 0.0993, + "num_input_tokens_seen": 145633840, + "step": 119685 + }, + { + "epoch": 13.32999220403163, + "grad_norm": 0.2189960479736328, + "learning_rate": 1.5111839997015889e-05, + "loss": 0.0199, + "num_input_tokens_seen": 145639856, + "step": 119690 + }, + { + "epoch": 13.330549058915247, + "grad_norm": 0.046451348811388016, + "learning_rate": 1.5109608439847511e-05, + "loss": 0.0586, + "num_input_tokens_seen": 145646000, + "step": 119695 + }, + { + "epoch": 13.331105913798863, + "grad_norm": 0.37759336829185486, + "learning_rate": 1.51073769761018e-05, + "loss": 0.0199, + "num_input_tokens_seen": 145651920, + "step": 119700 + }, + { + "epoch": 13.331662768682481, + "grad_norm": 0.03490535542368889, + "learning_rate": 1.5105145605799848e-05, + "loss": 0.1057, + "num_input_tokens_seen": 145657840, + "step": 119705 + }, + { + "epoch": 13.332219623566099, + "grad_norm": 0.04674270376563072, + "learning_rate": 1.5102914328962708e-05, + "loss": 0.0022, + "num_input_tokens_seen": 145663856, + "step": 119710 + }, + { + "epoch": 13.332776478449716, + "grad_norm": 0.33276426792144775, + "learning_rate": 1.5100683145611489e-05, + "loss": 0.0644, + "num_input_tokens_seen": 145670096, + "step": 119715 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.031560588628053665, + "learning_rate": 1.5098452055767235e-05, + "loss": 0.0769, + "num_input_tokens_seen": 145676304, + "step": 119720 + }, + { + "epoch": 13.33389018821695, + "grad_norm": 0.017794959247112274, + "learning_rate": 1.5096221059451044e-05, + "loss": 0.0016, + "num_input_tokens_seen": 145682704, + "step": 119725 + }, + { + "epoch": 13.334447043100567, + "grad_norm": 9.923310426529497e-05, + "learning_rate": 1.5093990156683974e-05, + "loss": 0.0334, + "num_input_tokens_seen": 145688784, + "step": 119730 + }, + { + "epoch": 13.335003897984185, + "grad_norm": 0.003418365493416786, + "learning_rate": 1.5091759347487106e-05, + "loss": 0.0032, + "num_input_tokens_seen": 145694960, + "step": 119735 + }, + { + "epoch": 13.335560752867803, + "grad_norm": 0.016450170427560806, + "learning_rate": 1.5089528631881513e-05, + "loss": 0.0111, + "num_input_tokens_seen": 145701168, + "step": 119740 + }, + { + "epoch": 13.33611760775142, + "grad_norm": 0.0016899504698812962, + "learning_rate": 1.5087298009888256e-05, + "loss": 0.0031, + "num_input_tokens_seen": 145707280, + "step": 119745 + }, + { + "epoch": 13.336674462635038, + "grad_norm": 2.2277026176452637, + "learning_rate": 1.5085067481528417e-05, + "loss": 0.0771, + "num_input_tokens_seen": 145713360, + "step": 119750 + }, + { + "epoch": 13.337231317518654, + "grad_norm": 0.4053399860858917, + "learning_rate": 1.5082837046823053e-05, + "loss": 0.1378, + "num_input_tokens_seen": 145719696, + "step": 119755 + }, + { + "epoch": 13.337788172402272, + "grad_norm": 0.6054948568344116, + "learning_rate": 1.5080606705793243e-05, + "loss": 0.0557, + "num_input_tokens_seen": 145726000, + "step": 119760 + }, + { + "epoch": 13.33834502728589, + "grad_norm": 0.07221914827823639, + "learning_rate": 1.5078376458460048e-05, + "loss": 0.0473, + "num_input_tokens_seen": 145731920, + "step": 119765 + }, + { + "epoch": 13.338901882169507, + "grad_norm": 1.207262635231018, + "learning_rate": 1.507614630484454e-05, + "loss": 0.0748, + "num_input_tokens_seen": 145737968, + "step": 119770 + }, + { + "epoch": 13.339458737053125, + "grad_norm": 0.05427877604961395, + "learning_rate": 1.5073916244967776e-05, + "loss": 0.1256, + "num_input_tokens_seen": 145744144, + "step": 119775 + }, + { + "epoch": 13.34001559193674, + "grad_norm": 1.241251826286316, + "learning_rate": 1.507168627885083e-05, + "loss": 0.112, + "num_input_tokens_seen": 145750096, + "step": 119780 + }, + { + "epoch": 13.340572446820358, + "grad_norm": 1.4416546821594238, + "learning_rate": 1.5069456406514754e-05, + "loss": 0.0732, + "num_input_tokens_seen": 145756112, + "step": 119785 + }, + { + "epoch": 13.341129301703976, + "grad_norm": 1.2072476148605347, + "learning_rate": 1.506722662798063e-05, + "loss": 0.0842, + "num_input_tokens_seen": 145762032, + "step": 119790 + }, + { + "epoch": 13.341686156587594, + "grad_norm": 0.17123691737651825, + "learning_rate": 1.5064996943269493e-05, + "loss": 0.0919, + "num_input_tokens_seen": 145768112, + "step": 119795 + }, + { + "epoch": 13.342243011471211, + "grad_norm": 1.9163267612457275, + "learning_rate": 1.5062767352402437e-05, + "loss": 0.0684, + "num_input_tokens_seen": 145774096, + "step": 119800 + }, + { + "epoch": 13.342799866354827, + "grad_norm": 0.019641341641545296, + "learning_rate": 1.5060537855400491e-05, + "loss": 0.0005, + "num_input_tokens_seen": 145780112, + "step": 119805 + }, + { + "epoch": 13.343356721238445, + "grad_norm": 0.01960735395550728, + "learning_rate": 1.5058308452284736e-05, + "loss": 0.04, + "num_input_tokens_seen": 145786256, + "step": 119810 + }, + { + "epoch": 13.343913576122063, + "grad_norm": 0.04248341545462608, + "learning_rate": 1.5056079143076219e-05, + "loss": 0.0343, + "num_input_tokens_seen": 145792048, + "step": 119815 + }, + { + "epoch": 13.34447043100568, + "grad_norm": 0.00282806227914989, + "learning_rate": 1.5053849927796004e-05, + "loss": 0.0413, + "num_input_tokens_seen": 145798288, + "step": 119820 + }, + { + "epoch": 13.345027285889298, + "grad_norm": 0.0006465151673182845, + "learning_rate": 1.5051620806465144e-05, + "loss": 0.0603, + "num_input_tokens_seen": 145804528, + "step": 119825 + }, + { + "epoch": 13.345584140772914, + "grad_norm": 6.050631046295166, + "learning_rate": 1.5049391779104699e-05, + "loss": 0.1377, + "num_input_tokens_seen": 145810832, + "step": 119830 + }, + { + "epoch": 13.346140995656532, + "grad_norm": 0.3750971555709839, + "learning_rate": 1.5047162845735718e-05, + "loss": 0.0236, + "num_input_tokens_seen": 145816592, + "step": 119835 + }, + { + "epoch": 13.34669785054015, + "grad_norm": 0.0013406443176791072, + "learning_rate": 1.5044934006379261e-05, + "loss": 0.0285, + "num_input_tokens_seen": 145822608, + "step": 119840 + }, + { + "epoch": 13.347254705423767, + "grad_norm": 0.04406801238656044, + "learning_rate": 1.5042705261056372e-05, + "loss": 0.0398, + "num_input_tokens_seen": 145828560, + "step": 119845 + }, + { + "epoch": 13.347811560307385, + "grad_norm": 0.0001793832634575665, + "learning_rate": 1.5040476609788118e-05, + "loss": 0.0237, + "num_input_tokens_seen": 145834864, + "step": 119850 + }, + { + "epoch": 13.348368415191, + "grad_norm": 0.07899215817451477, + "learning_rate": 1.5038248052595527e-05, + "loss": 0.0255, + "num_input_tokens_seen": 145840496, + "step": 119855 + }, + { + "epoch": 13.348925270074618, + "grad_norm": 0.0003554603608790785, + "learning_rate": 1.5036019589499683e-05, + "loss": 0.003, + "num_input_tokens_seen": 145846960, + "step": 119860 + }, + { + "epoch": 13.349482124958236, + "grad_norm": 0.03218292444944382, + "learning_rate": 1.5033791220521601e-05, + "loss": 0.0163, + "num_input_tokens_seen": 145853296, + "step": 119865 + }, + { + "epoch": 13.350038979841854, + "grad_norm": 0.00020663696341216564, + "learning_rate": 1.5031562945682354e-05, + "loss": 0.0178, + "num_input_tokens_seen": 145858512, + "step": 119870 + }, + { + "epoch": 13.350595834725471, + "grad_norm": 0.007148208096623421, + "learning_rate": 1.5029334765002978e-05, + "loss": 0.0223, + "num_input_tokens_seen": 145864400, + "step": 119875 + }, + { + "epoch": 13.351152689609087, + "grad_norm": 0.301246702671051, + "learning_rate": 1.5027106678504533e-05, + "loss": 0.0739, + "num_input_tokens_seen": 145870352, + "step": 119880 + }, + { + "epoch": 13.351709544492705, + "grad_norm": 0.21787551045417786, + "learning_rate": 1.5024878686208044e-05, + "loss": 0.0664, + "num_input_tokens_seen": 145876432, + "step": 119885 + }, + { + "epoch": 13.352266399376322, + "grad_norm": 0.1730208843946457, + "learning_rate": 1.5022650788134573e-05, + "loss": 0.0053, + "num_input_tokens_seen": 145882704, + "step": 119890 + }, + { + "epoch": 13.35282325425994, + "grad_norm": 0.0037955089937895536, + "learning_rate": 1.5020422984305158e-05, + "loss": 0.0031, + "num_input_tokens_seen": 145888688, + "step": 119895 + }, + { + "epoch": 13.353380109143558, + "grad_norm": 0.003536202711984515, + "learning_rate": 1.5018195274740848e-05, + "loss": 0.0007, + "num_input_tokens_seen": 145894864, + "step": 119900 + }, + { + "epoch": 13.353936964027174, + "grad_norm": 0.19369223713874817, + "learning_rate": 1.5015967659462674e-05, + "loss": 0.0114, + "num_input_tokens_seen": 145901040, + "step": 119905 + }, + { + "epoch": 13.354493818910791, + "grad_norm": 1.6742891073226929, + "learning_rate": 1.5013740138491689e-05, + "loss": 0.0688, + "num_input_tokens_seen": 145907152, + "step": 119910 + }, + { + "epoch": 13.355050673794409, + "grad_norm": 0.1720634251832962, + "learning_rate": 1.5011512711848918e-05, + "loss": 0.0154, + "num_input_tokens_seen": 145913200, + "step": 119915 + }, + { + "epoch": 13.355607528678027, + "grad_norm": 0.5334783792495728, + "learning_rate": 1.5009285379555433e-05, + "loss": 0.0565, + "num_input_tokens_seen": 145919312, + "step": 119920 + }, + { + "epoch": 13.356164383561644, + "grad_norm": 0.1118764579296112, + "learning_rate": 1.5007058141632233e-05, + "loss": 0.0106, + "num_input_tokens_seen": 145925200, + "step": 119925 + }, + { + "epoch": 13.35672123844526, + "grad_norm": 0.3128012418746948, + "learning_rate": 1.5004830998100389e-05, + "loss": 0.0183, + "num_input_tokens_seen": 145931568, + "step": 119930 + }, + { + "epoch": 13.357278093328878, + "grad_norm": 2.6209466457366943, + "learning_rate": 1.5002603948980912e-05, + "loss": 0.1197, + "num_input_tokens_seen": 145937712, + "step": 119935 + }, + { + "epoch": 13.357834948212496, + "grad_norm": 0.15309156477451324, + "learning_rate": 1.5000376994294857e-05, + "loss": 0.1385, + "num_input_tokens_seen": 145943760, + "step": 119940 + }, + { + "epoch": 13.358391803096113, + "grad_norm": 0.12594330310821533, + "learning_rate": 1.4998150134063248e-05, + "loss": 0.1524, + "num_input_tokens_seen": 145950096, + "step": 119945 + }, + { + "epoch": 13.358948657979731, + "grad_norm": 0.008412042632699013, + "learning_rate": 1.4995923368307135e-05, + "loss": 0.0127, + "num_input_tokens_seen": 145956560, + "step": 119950 + }, + { + "epoch": 13.359505512863349, + "grad_norm": 1.019858479499817, + "learning_rate": 1.4993696697047532e-05, + "loss": 0.042, + "num_input_tokens_seen": 145962576, + "step": 119955 + }, + { + "epoch": 13.360062367746965, + "grad_norm": 0.827420175075531, + "learning_rate": 1.4991470120305484e-05, + "loss": 0.0244, + "num_input_tokens_seen": 145968912, + "step": 119960 + }, + { + "epoch": 13.360619222630582, + "grad_norm": 0.24663370847702026, + "learning_rate": 1.4989243638102018e-05, + "loss": 0.0281, + "num_input_tokens_seen": 145975152, + "step": 119965 + }, + { + "epoch": 13.3611760775142, + "grad_norm": 0.11708646267652512, + "learning_rate": 1.4987017250458168e-05, + "loss": 0.0185, + "num_input_tokens_seen": 145980976, + "step": 119970 + }, + { + "epoch": 13.361732932397818, + "grad_norm": 0.002142807701602578, + "learning_rate": 1.4984790957394962e-05, + "loss": 0.0483, + "num_input_tokens_seen": 145986928, + "step": 119975 + }, + { + "epoch": 13.362289787281435, + "grad_norm": 2.3166451454162598, + "learning_rate": 1.498256475893343e-05, + "loss": 0.1483, + "num_input_tokens_seen": 145993136, + "step": 119980 + }, + { + "epoch": 13.362846642165051, + "grad_norm": 1.1891452074050903, + "learning_rate": 1.4980338655094589e-05, + "loss": 0.0221, + "num_input_tokens_seen": 145998960, + "step": 119985 + }, + { + "epoch": 13.363403497048669, + "grad_norm": 0.026608524844050407, + "learning_rate": 1.4978112645899495e-05, + "loss": 0.0573, + "num_input_tokens_seen": 146005200, + "step": 119990 + }, + { + "epoch": 13.363960351932286, + "grad_norm": 0.06750483810901642, + "learning_rate": 1.4975886731369143e-05, + "loss": 0.003, + "num_input_tokens_seen": 146011600, + "step": 119995 + }, + { + "epoch": 13.364517206815904, + "grad_norm": 0.327585369348526, + "learning_rate": 1.4973660911524578e-05, + "loss": 0.079, + "num_input_tokens_seen": 146018064, + "step": 120000 + }, + { + "epoch": 13.365074061699522, + "grad_norm": 0.40170150995254517, + "learning_rate": 1.4971435186386814e-05, + "loss": 0.0439, + "num_input_tokens_seen": 146024176, + "step": 120005 + }, + { + "epoch": 13.365630916583138, + "grad_norm": 0.00014304518117569387, + "learning_rate": 1.4969209555976887e-05, + "loss": 0.0948, + "num_input_tokens_seen": 146029424, + "step": 120010 + }, + { + "epoch": 13.366187771466755, + "grad_norm": 0.09165464341640472, + "learning_rate": 1.4966984020315804e-05, + "loss": 0.0517, + "num_input_tokens_seen": 146035344, + "step": 120015 + }, + { + "epoch": 13.366744626350373, + "grad_norm": 0.017346076667308807, + "learning_rate": 1.4964758579424603e-05, + "loss": 0.0126, + "num_input_tokens_seen": 146041712, + "step": 120020 + }, + { + "epoch": 13.36730148123399, + "grad_norm": 0.03934187814593315, + "learning_rate": 1.4962533233324292e-05, + "loss": 0.0033, + "num_input_tokens_seen": 146048048, + "step": 120025 + }, + { + "epoch": 13.367858336117608, + "grad_norm": 0.10545990616083145, + "learning_rate": 1.4960307982035902e-05, + "loss": 0.0202, + "num_input_tokens_seen": 146054064, + "step": 120030 + }, + { + "epoch": 13.368415191001224, + "grad_norm": 0.008994468487799168, + "learning_rate": 1.4958082825580438e-05, + "loss": 0.0351, + "num_input_tokens_seen": 146060368, + "step": 120035 + }, + { + "epoch": 13.368972045884842, + "grad_norm": 1.6874064207077026, + "learning_rate": 1.4955857763978937e-05, + "loss": 0.2024, + "num_input_tokens_seen": 146066352, + "step": 120040 + }, + { + "epoch": 13.36952890076846, + "grad_norm": 0.7036293745040894, + "learning_rate": 1.4953632797252392e-05, + "loss": 0.0177, + "num_input_tokens_seen": 146072432, + "step": 120045 + }, + { + "epoch": 13.370085755652077, + "grad_norm": 0.007373399566859007, + "learning_rate": 1.4951407925421853e-05, + "loss": 0.0562, + "num_input_tokens_seen": 146077936, + "step": 120050 + }, + { + "epoch": 13.370642610535695, + "grad_norm": 0.0066399830393493176, + "learning_rate": 1.49491831485083e-05, + "loss": 0.023, + "num_input_tokens_seen": 146084016, + "step": 120055 + }, + { + "epoch": 13.371199465419311, + "grad_norm": 0.004464501049369574, + "learning_rate": 1.4946958466532779e-05, + "loss": 0.0586, + "num_input_tokens_seen": 146090352, + "step": 120060 + }, + { + "epoch": 13.371756320302929, + "grad_norm": 0.9761161208152771, + "learning_rate": 1.494473387951628e-05, + "loss": 0.0178, + "num_input_tokens_seen": 146096752, + "step": 120065 + }, + { + "epoch": 13.372313175186546, + "grad_norm": 1.3327052593231201, + "learning_rate": 1.494250938747983e-05, + "loss": 0.1057, + "num_input_tokens_seen": 146102864, + "step": 120070 + }, + { + "epoch": 13.372870030070164, + "grad_norm": 0.003182411892339587, + "learning_rate": 1.4940284990444437e-05, + "loss": 0.0101, + "num_input_tokens_seen": 146108528, + "step": 120075 + }, + { + "epoch": 13.373426884953782, + "grad_norm": 0.0012611359125003219, + "learning_rate": 1.493806068843111e-05, + "loss": 0.0423, + "num_input_tokens_seen": 146114928, + "step": 120080 + }, + { + "epoch": 13.373983739837398, + "grad_norm": 0.02294413186609745, + "learning_rate": 1.493583648146086e-05, + "loss": 0.0274, + "num_input_tokens_seen": 146120816, + "step": 120085 + }, + { + "epoch": 13.374540594721015, + "grad_norm": 0.0013996688649058342, + "learning_rate": 1.4933612369554703e-05, + "loss": 0.0305, + "num_input_tokens_seen": 146126960, + "step": 120090 + }, + { + "epoch": 13.375097449604633, + "grad_norm": 0.0005480620893649757, + "learning_rate": 1.4931388352733638e-05, + "loss": 0.0009, + "num_input_tokens_seen": 146133296, + "step": 120095 + }, + { + "epoch": 13.37565430448825, + "grad_norm": 0.14814576506614685, + "learning_rate": 1.4929164431018681e-05, + "loss": 0.1065, + "num_input_tokens_seen": 146139440, + "step": 120100 + }, + { + "epoch": 13.376211159371868, + "grad_norm": 0.5561509728431702, + "learning_rate": 1.4926940604430828e-05, + "loss": 0.0195, + "num_input_tokens_seen": 146145552, + "step": 120105 + }, + { + "epoch": 13.376768014255486, + "grad_norm": 0.025957029312849045, + "learning_rate": 1.4924716872991095e-05, + "loss": 0.0097, + "num_input_tokens_seen": 146151600, + "step": 120110 + }, + { + "epoch": 13.377324869139102, + "grad_norm": 2.0957953929901123, + "learning_rate": 1.4922493236720478e-05, + "loss": 0.0966, + "num_input_tokens_seen": 146157968, + "step": 120115 + }, + { + "epoch": 13.37788172402272, + "grad_norm": 0.6693779230117798, + "learning_rate": 1.492026969564e-05, + "loss": 0.1189, + "num_input_tokens_seen": 146164304, + "step": 120120 + }, + { + "epoch": 13.378438578906337, + "grad_norm": 1.2960134744644165, + "learning_rate": 1.4918046249770634e-05, + "loss": 0.0618, + "num_input_tokens_seen": 146169808, + "step": 120125 + }, + { + "epoch": 13.378995433789955, + "grad_norm": 0.050603464245796204, + "learning_rate": 1.4915822899133407e-05, + "loss": 0.0558, + "num_input_tokens_seen": 146176080, + "step": 120130 + }, + { + "epoch": 13.379552288673572, + "grad_norm": 0.2552317678928375, + "learning_rate": 1.491359964374931e-05, + "loss": 0.0602, + "num_input_tokens_seen": 146182224, + "step": 120135 + }, + { + "epoch": 13.380109143557188, + "grad_norm": 0.06650751829147339, + "learning_rate": 1.491137648363935e-05, + "loss": 0.0079, + "num_input_tokens_seen": 146188272, + "step": 120140 + }, + { + "epoch": 13.380665998440806, + "grad_norm": 1.1839457750320435, + "learning_rate": 1.4909153418824524e-05, + "loss": 0.034, + "num_input_tokens_seen": 146194320, + "step": 120145 + }, + { + "epoch": 13.381222853324424, + "grad_norm": 0.10852308571338654, + "learning_rate": 1.490693044932582e-05, + "loss": 0.0297, + "num_input_tokens_seen": 146200432, + "step": 120150 + }, + { + "epoch": 13.381779708208041, + "grad_norm": 0.18314634263515472, + "learning_rate": 1.490470757516425e-05, + "loss": 0.0025, + "num_input_tokens_seen": 146206800, + "step": 120155 + }, + { + "epoch": 13.382336563091659, + "grad_norm": 0.07433439791202545, + "learning_rate": 1.4902484796360802e-05, + "loss": 0.0115, + "num_input_tokens_seen": 146212784, + "step": 120160 + }, + { + "epoch": 13.382893417975275, + "grad_norm": 0.0519103966653347, + "learning_rate": 1.490026211293648e-05, + "loss": 0.0024, + "num_input_tokens_seen": 146218992, + "step": 120165 + }, + { + "epoch": 13.383450272858893, + "grad_norm": 0.38402268290519714, + "learning_rate": 1.4898039524912266e-05, + "loss": 0.0271, + "num_input_tokens_seen": 146225360, + "step": 120170 + }, + { + "epoch": 13.38400712774251, + "grad_norm": 0.3446996808052063, + "learning_rate": 1.4895817032309173e-05, + "loss": 0.0327, + "num_input_tokens_seen": 146231024, + "step": 120175 + }, + { + "epoch": 13.384563982626128, + "grad_norm": 0.2719894051551819, + "learning_rate": 1.4893594635148173e-05, + "loss": 0.0293, + "num_input_tokens_seen": 146237264, + "step": 120180 + }, + { + "epoch": 13.385120837509746, + "grad_norm": 0.041474636644124985, + "learning_rate": 1.4891372333450276e-05, + "loss": 0.0187, + "num_input_tokens_seen": 146243696, + "step": 120185 + }, + { + "epoch": 13.385677692393362, + "grad_norm": 0.013871701434254646, + "learning_rate": 1.4889150127236456e-05, + "loss": 0.0583, + "num_input_tokens_seen": 146250000, + "step": 120190 + }, + { + "epoch": 13.38623454727698, + "grad_norm": 0.18760870397090912, + "learning_rate": 1.4886928016527728e-05, + "loss": 0.1013, + "num_input_tokens_seen": 146255888, + "step": 120195 + }, + { + "epoch": 13.386791402160597, + "grad_norm": 0.19268536567687988, + "learning_rate": 1.4884706001345052e-05, + "loss": 0.0048, + "num_input_tokens_seen": 146262128, + "step": 120200 + }, + { + "epoch": 13.387348257044215, + "grad_norm": 0.8117290139198303, + "learning_rate": 1.4882484081709446e-05, + "loss": 0.048, + "num_input_tokens_seen": 146267664, + "step": 120205 + }, + { + "epoch": 13.387905111927832, + "grad_norm": 0.5087838172912598, + "learning_rate": 1.4880262257641874e-05, + "loss": 0.066, + "num_input_tokens_seen": 146273968, + "step": 120210 + }, + { + "epoch": 13.388461966811448, + "grad_norm": 0.0038894610479474068, + "learning_rate": 1.4878040529163339e-05, + "loss": 0.0086, + "num_input_tokens_seen": 146280144, + "step": 120215 + }, + { + "epoch": 13.389018821695066, + "grad_norm": 0.0910600870847702, + "learning_rate": 1.4875818896294818e-05, + "loss": 0.0022, + "num_input_tokens_seen": 146286096, + "step": 120220 + }, + { + "epoch": 13.389575676578684, + "grad_norm": 0.0010881697526201606, + "learning_rate": 1.4873597359057301e-05, + "loss": 0.0077, + "num_input_tokens_seen": 146292144, + "step": 120225 + }, + { + "epoch": 13.390132531462301, + "grad_norm": 0.27743589878082275, + "learning_rate": 1.4871375917471766e-05, + "loss": 0.0356, + "num_input_tokens_seen": 146298256, + "step": 120230 + }, + { + "epoch": 13.390689386345919, + "grad_norm": 0.0133829265832901, + "learning_rate": 1.4869154571559207e-05, + "loss": 0.0323, + "num_input_tokens_seen": 146304496, + "step": 120235 + }, + { + "epoch": 13.391246241229535, + "grad_norm": 0.15511806309223175, + "learning_rate": 1.4866933321340592e-05, + "loss": 0.0336, + "num_input_tokens_seen": 146310704, + "step": 120240 + }, + { + "epoch": 13.391803096113152, + "grad_norm": 0.037154924124479294, + "learning_rate": 1.486471216683692e-05, + "loss": 0.0541, + "num_input_tokens_seen": 146316912, + "step": 120245 + }, + { + "epoch": 13.39235995099677, + "grad_norm": 9.047576895682141e-05, + "learning_rate": 1.4862491108069152e-05, + "loss": 0.0274, + "num_input_tokens_seen": 146322512, + "step": 120250 + }, + { + "epoch": 13.392916805880388, + "grad_norm": 0.0035229770001024008, + "learning_rate": 1.4860270145058284e-05, + "loss": 0.004, + "num_input_tokens_seen": 146328816, + "step": 120255 + }, + { + "epoch": 13.393473660764005, + "grad_norm": 0.016746671870350838, + "learning_rate": 1.4858049277825275e-05, + "loss": 0.0076, + "num_input_tokens_seen": 146334992, + "step": 120260 + }, + { + "epoch": 13.394030515647621, + "grad_norm": 1.844590187072754, + "learning_rate": 1.4855828506391137e-05, + "loss": 0.084, + "num_input_tokens_seen": 146341104, + "step": 120265 + }, + { + "epoch": 13.394587370531239, + "grad_norm": 0.0004032284196000546, + "learning_rate": 1.4853607830776808e-05, + "loss": 0.0759, + "num_input_tokens_seen": 146347248, + "step": 120270 + }, + { + "epoch": 13.395144225414857, + "grad_norm": 0.36825573444366455, + "learning_rate": 1.4851387251003294e-05, + "loss": 0.0842, + "num_input_tokens_seen": 146353072, + "step": 120275 + }, + { + "epoch": 13.395701080298474, + "grad_norm": 0.02243870124220848, + "learning_rate": 1.4849166767091552e-05, + "loss": 0.0021, + "num_input_tokens_seen": 146359024, + "step": 120280 + }, + { + "epoch": 13.396257935182092, + "grad_norm": 0.20153585076332092, + "learning_rate": 1.4846946379062568e-05, + "loss": 0.0019, + "num_input_tokens_seen": 146365424, + "step": 120285 + }, + { + "epoch": 13.396814790065708, + "grad_norm": 0.0036099208518862724, + "learning_rate": 1.4844726086937305e-05, + "loss": 0.016, + "num_input_tokens_seen": 146371632, + "step": 120290 + }, + { + "epoch": 13.397371644949326, + "grad_norm": 0.6163133382797241, + "learning_rate": 1.4842505890736746e-05, + "loss": 0.0067, + "num_input_tokens_seen": 146377808, + "step": 120295 + }, + { + "epoch": 13.397928499832943, + "grad_norm": 0.6283578276634216, + "learning_rate": 1.4840285790481851e-05, + "loss": 0.0089, + "num_input_tokens_seen": 146383824, + "step": 120300 + }, + { + "epoch": 13.398485354716561, + "grad_norm": 0.006032613571733236, + "learning_rate": 1.48380657861936e-05, + "loss": 0.0003, + "num_input_tokens_seen": 146389936, + "step": 120305 + }, + { + "epoch": 13.399042209600179, + "grad_norm": 1.1688228845596313, + "learning_rate": 1.4835845877892957e-05, + "loss": 0.0467, + "num_input_tokens_seen": 146396208, + "step": 120310 + }, + { + "epoch": 13.399599064483796, + "grad_norm": 0.29128187894821167, + "learning_rate": 1.4833626065600897e-05, + "loss": 0.0045, + "num_input_tokens_seen": 146401712, + "step": 120315 + }, + { + "epoch": 13.400155919367412, + "grad_norm": 1.514195203781128, + "learning_rate": 1.4831406349338373e-05, + "loss": 0.056, + "num_input_tokens_seen": 146407856, + "step": 120320 + }, + { + "epoch": 13.40071277425103, + "grad_norm": 0.011373348534107208, + "learning_rate": 1.482918672912638e-05, + "loss": 0.0237, + "num_input_tokens_seen": 146413904, + "step": 120325 + }, + { + "epoch": 13.401269629134648, + "grad_norm": 0.0027695614844560623, + "learning_rate": 1.4826967204985851e-05, + "loss": 0.0418, + "num_input_tokens_seen": 146420048, + "step": 120330 + }, + { + "epoch": 13.401826484018265, + "grad_norm": 0.015487714670598507, + "learning_rate": 1.4824747776937775e-05, + "loss": 0.012, + "num_input_tokens_seen": 146426224, + "step": 120335 + }, + { + "epoch": 13.402383338901883, + "grad_norm": 0.03443346545100212, + "learning_rate": 1.4822528445003108e-05, + "loss": 0.0175, + "num_input_tokens_seen": 146432528, + "step": 120340 + }, + { + "epoch": 13.402940193785499, + "grad_norm": 0.18198508024215698, + "learning_rate": 1.4820309209202815e-05, + "loss": 0.0084, + "num_input_tokens_seen": 146438672, + "step": 120345 + }, + { + "epoch": 13.403497048669117, + "grad_norm": 0.005383157636970282, + "learning_rate": 1.4818090069557855e-05, + "loss": 0.0027, + "num_input_tokens_seen": 146444560, + "step": 120350 + }, + { + "epoch": 13.404053903552734, + "grad_norm": 0.010663802735507488, + "learning_rate": 1.4815871026089195e-05, + "loss": 0.0468, + "num_input_tokens_seen": 146450864, + "step": 120355 + }, + { + "epoch": 13.404610758436352, + "grad_norm": 0.0065858992747962475, + "learning_rate": 1.4813652078817788e-05, + "loss": 0.1038, + "num_input_tokens_seen": 146456944, + "step": 120360 + }, + { + "epoch": 13.40516761331997, + "grad_norm": 0.12573035061359406, + "learning_rate": 1.4811433227764604e-05, + "loss": 0.0319, + "num_input_tokens_seen": 146462960, + "step": 120365 + }, + { + "epoch": 13.405724468203585, + "grad_norm": 0.010662899352610111, + "learning_rate": 1.480921447295059e-05, + "loss": 0.0482, + "num_input_tokens_seen": 146469136, + "step": 120370 + }, + { + "epoch": 13.406281323087203, + "grad_norm": 0.021364552900195122, + "learning_rate": 1.4806995814396717e-05, + "loss": 0.0175, + "num_input_tokens_seen": 146475536, + "step": 120375 + }, + { + "epoch": 13.40683817797082, + "grad_norm": 2.0314431190490723, + "learning_rate": 1.480477725212393e-05, + "loss": 0.1079, + "num_input_tokens_seen": 146481680, + "step": 120380 + }, + { + "epoch": 13.407395032854438, + "grad_norm": 1.886107087135315, + "learning_rate": 1.4802558786153192e-05, + "loss": 0.0489, + "num_input_tokens_seen": 146488016, + "step": 120385 + }, + { + "epoch": 13.407951887738056, + "grad_norm": 0.2092839628458023, + "learning_rate": 1.4800340416505449e-05, + "loss": 0.0031, + "num_input_tokens_seen": 146494256, + "step": 120390 + }, + { + "epoch": 13.408508742621672, + "grad_norm": 0.02002483420073986, + "learning_rate": 1.4798122143201675e-05, + "loss": 0.0839, + "num_input_tokens_seen": 146499664, + "step": 120395 + }, + { + "epoch": 13.40906559750529, + "grad_norm": 1.0564099550247192, + "learning_rate": 1.47959039662628e-05, + "loss": 0.1113, + "num_input_tokens_seen": 146505968, + "step": 120400 + }, + { + "epoch": 13.409622452388907, + "grad_norm": 1.8217360973358154, + "learning_rate": 1.4793685885709796e-05, + "loss": 0.097, + "num_input_tokens_seen": 146511984, + "step": 120405 + }, + { + "epoch": 13.410179307272525, + "grad_norm": 0.31181100010871887, + "learning_rate": 1.4791467901563599e-05, + "loss": 0.0141, + "num_input_tokens_seen": 146518224, + "step": 120410 + }, + { + "epoch": 13.410736162156143, + "grad_norm": 0.03364057466387749, + "learning_rate": 1.4789250013845174e-05, + "loss": 0.0207, + "num_input_tokens_seen": 146524496, + "step": 120415 + }, + { + "epoch": 13.411293017039759, + "grad_norm": 0.009348927065730095, + "learning_rate": 1.478703222257546e-05, + "loss": 0.0495, + "num_input_tokens_seen": 146530448, + "step": 120420 + }, + { + "epoch": 13.411849871923376, + "grad_norm": 0.10211282223463058, + "learning_rate": 1.4784814527775409e-05, + "loss": 0.014, + "num_input_tokens_seen": 146536240, + "step": 120425 + }, + { + "epoch": 13.412406726806994, + "grad_norm": 0.006260641850531101, + "learning_rate": 1.4782596929465964e-05, + "loss": 0.0025, + "num_input_tokens_seen": 146542224, + "step": 120430 + }, + { + "epoch": 13.412963581690612, + "grad_norm": 0.014320164918899536, + "learning_rate": 1.4780379427668086e-05, + "loss": 0.0487, + "num_input_tokens_seen": 146548464, + "step": 120435 + }, + { + "epoch": 13.41352043657423, + "grad_norm": 0.004306568764150143, + "learning_rate": 1.4778162022402706e-05, + "loss": 0.008, + "num_input_tokens_seen": 146553872, + "step": 120440 + }, + { + "epoch": 13.414077291457847, + "grad_norm": 2.4665234088897705, + "learning_rate": 1.4775944713690782e-05, + "loss": 0.179, + "num_input_tokens_seen": 146560336, + "step": 120445 + }, + { + "epoch": 13.414634146341463, + "grad_norm": 0.13091295957565308, + "learning_rate": 1.4773727501553236e-05, + "loss": 0.0808, + "num_input_tokens_seen": 146566256, + "step": 120450 + }, + { + "epoch": 13.41519100122508, + "grad_norm": 0.0066894786432385445, + "learning_rate": 1.4771510386011045e-05, + "loss": 0.0669, + "num_input_tokens_seen": 146572656, + "step": 120455 + }, + { + "epoch": 13.415747856108698, + "grad_norm": 0.07256217300891876, + "learning_rate": 1.4769293367085118e-05, + "loss": 0.0827, + "num_input_tokens_seen": 146579216, + "step": 120460 + }, + { + "epoch": 13.416304710992316, + "grad_norm": 0.15000103414058685, + "learning_rate": 1.4767076444796424e-05, + "loss": 0.1169, + "num_input_tokens_seen": 146585360, + "step": 120465 + }, + { + "epoch": 13.416861565875934, + "grad_norm": 0.14821679890155792, + "learning_rate": 1.4764859619165886e-05, + "loss": 0.075, + "num_input_tokens_seen": 146591632, + "step": 120470 + }, + { + "epoch": 13.41741842075955, + "grad_norm": 0.18358875811100006, + "learning_rate": 1.4762642890214451e-05, + "loss": 0.0171, + "num_input_tokens_seen": 146597648, + "step": 120475 + }, + { + "epoch": 13.417975275643167, + "grad_norm": 0.28023889660835266, + "learning_rate": 1.4760426257963055e-05, + "loss": 0.0479, + "num_input_tokens_seen": 146603760, + "step": 120480 + }, + { + "epoch": 13.418532130526785, + "grad_norm": 0.061740126460790634, + "learning_rate": 1.4758209722432639e-05, + "loss": 0.0553, + "num_input_tokens_seen": 146609872, + "step": 120485 + }, + { + "epoch": 13.419088985410403, + "grad_norm": 1.0554372072219849, + "learning_rate": 1.4755993283644134e-05, + "loss": 0.0222, + "num_input_tokens_seen": 146616016, + "step": 120490 + }, + { + "epoch": 13.41964584029402, + "grad_norm": 1.3086892366409302, + "learning_rate": 1.4753776941618486e-05, + "loss": 0.0421, + "num_input_tokens_seen": 146622032, + "step": 120495 + }, + { + "epoch": 13.420202695177636, + "grad_norm": 0.10422811657190323, + "learning_rate": 1.4751560696376615e-05, + "loss": 0.0715, + "num_input_tokens_seen": 146628336, + "step": 120500 + }, + { + "epoch": 13.420759550061254, + "grad_norm": 0.1880866438150406, + "learning_rate": 1.4749344547939473e-05, + "loss": 0.0078, + "num_input_tokens_seen": 146634640, + "step": 120505 + }, + { + "epoch": 13.421316404944871, + "grad_norm": 1.4361646175384521, + "learning_rate": 1.4747128496327973e-05, + "loss": 0.0964, + "num_input_tokens_seen": 146640880, + "step": 120510 + }, + { + "epoch": 13.421873259828489, + "grad_norm": 0.005204858258366585, + "learning_rate": 1.4744912541563072e-05, + "loss": 0.0379, + "num_input_tokens_seen": 146646832, + "step": 120515 + }, + { + "epoch": 13.422430114712107, + "grad_norm": 0.06988348811864853, + "learning_rate": 1.4742696683665675e-05, + "loss": 0.0985, + "num_input_tokens_seen": 146652464, + "step": 120520 + }, + { + "epoch": 13.422986969595723, + "grad_norm": 0.00016553756722714752, + "learning_rate": 1.474048092265674e-05, + "loss": 0.0059, + "num_input_tokens_seen": 146658992, + "step": 120525 + }, + { + "epoch": 13.42354382447934, + "grad_norm": 0.17347712814807892, + "learning_rate": 1.4738265258557171e-05, + "loss": 0.1487, + "num_input_tokens_seen": 146664144, + "step": 120530 + }, + { + "epoch": 13.424100679362958, + "grad_norm": 0.1370260864496231, + "learning_rate": 1.4736049691387916e-05, + "loss": 0.0975, + "num_input_tokens_seen": 146670320, + "step": 120535 + }, + { + "epoch": 13.424657534246576, + "grad_norm": 0.5645748376846313, + "learning_rate": 1.473383422116989e-05, + "loss": 0.0638, + "num_input_tokens_seen": 146676592, + "step": 120540 + }, + { + "epoch": 13.425214389130193, + "grad_norm": 1.5407772064208984, + "learning_rate": 1.4731618847924028e-05, + "loss": 0.0469, + "num_input_tokens_seen": 146682736, + "step": 120545 + }, + { + "epoch": 13.42577124401381, + "grad_norm": 0.030296066775918007, + "learning_rate": 1.4729403571671249e-05, + "loss": 0.017, + "num_input_tokens_seen": 146688816, + "step": 120550 + }, + { + "epoch": 13.426328098897427, + "grad_norm": 0.04944680631160736, + "learning_rate": 1.472718839243249e-05, + "loss": 0.0093, + "num_input_tokens_seen": 146694800, + "step": 120555 + }, + { + "epoch": 13.426884953781045, + "grad_norm": 0.0633014664053917, + "learning_rate": 1.4724973310228663e-05, + "loss": 0.0106, + "num_input_tokens_seen": 146700880, + "step": 120560 + }, + { + "epoch": 13.427441808664662, + "grad_norm": 0.0004617871018126607, + "learning_rate": 1.472275832508069e-05, + "loss": 0.0089, + "num_input_tokens_seen": 146706992, + "step": 120565 + }, + { + "epoch": 13.42799866354828, + "grad_norm": 0.008987479843199253, + "learning_rate": 1.4720543437009504e-05, + "loss": 0.0009, + "num_input_tokens_seen": 146713360, + "step": 120570 + }, + { + "epoch": 13.428555518431896, + "grad_norm": 0.00014514786016661674, + "learning_rate": 1.471832864603602e-05, + "loss": 0.0205, + "num_input_tokens_seen": 146719408, + "step": 120575 + }, + { + "epoch": 13.429112373315514, + "grad_norm": 0.008579645305871964, + "learning_rate": 1.471611395218116e-05, + "loss": 0.0241, + "num_input_tokens_seen": 146725520, + "step": 120580 + }, + { + "epoch": 13.429669228199131, + "grad_norm": 0.03961779177188873, + "learning_rate": 1.471389935546584e-05, + "loss": 0.023, + "num_input_tokens_seen": 146731664, + "step": 120585 + }, + { + "epoch": 13.430226083082749, + "grad_norm": 0.015579383820295334, + "learning_rate": 1.4711684855910987e-05, + "loss": 0.1295, + "num_input_tokens_seen": 146737552, + "step": 120590 + }, + { + "epoch": 13.430782937966367, + "grad_norm": 0.002142841462045908, + "learning_rate": 1.4709470453537502e-05, + "loss": 0.0275, + "num_input_tokens_seen": 146743856, + "step": 120595 + }, + { + "epoch": 13.431339792849982, + "grad_norm": 0.4929395020008087, + "learning_rate": 1.470725614836633e-05, + "loss": 0.027, + "num_input_tokens_seen": 146749616, + "step": 120600 + }, + { + "epoch": 13.4318966477336, + "grad_norm": 1.7600857019424438, + "learning_rate": 1.4705041940418355e-05, + "loss": 0.0241, + "num_input_tokens_seen": 146755376, + "step": 120605 + }, + { + "epoch": 13.432453502617218, + "grad_norm": 0.00019455680740065873, + "learning_rate": 1.4702827829714522e-05, + "loss": 0.0189, + "num_input_tokens_seen": 146761680, + "step": 120610 + }, + { + "epoch": 13.433010357500835, + "grad_norm": 0.029901869595050812, + "learning_rate": 1.4700613816275712e-05, + "loss": 0.0139, + "num_input_tokens_seen": 146767984, + "step": 120615 + }, + { + "epoch": 13.433567212384453, + "grad_norm": 0.09403116255998611, + "learning_rate": 1.4698399900122869e-05, + "loss": 0.0156, + "num_input_tokens_seen": 146774096, + "step": 120620 + }, + { + "epoch": 13.434124067268069, + "grad_norm": 0.5126249194145203, + "learning_rate": 1.4696186081276886e-05, + "loss": 0.0262, + "num_input_tokens_seen": 146780336, + "step": 120625 + }, + { + "epoch": 13.434680922151687, + "grad_norm": 0.8611062169075012, + "learning_rate": 1.4693972359758686e-05, + "loss": 0.0138, + "num_input_tokens_seen": 146786544, + "step": 120630 + }, + { + "epoch": 13.435237777035304, + "grad_norm": 2.2131712436676025, + "learning_rate": 1.4691758735589172e-05, + "loss": 0.0612, + "num_input_tokens_seen": 146792464, + "step": 120635 + }, + { + "epoch": 13.435794631918922, + "grad_norm": 0.0001288087951252237, + "learning_rate": 1.4689545208789258e-05, + "loss": 0.0091, + "num_input_tokens_seen": 146798768, + "step": 120640 + }, + { + "epoch": 13.43635148680254, + "grad_norm": 0.6718806624412537, + "learning_rate": 1.4687331779379845e-05, + "loss": 0.1079, + "num_input_tokens_seen": 146804880, + "step": 120645 + }, + { + "epoch": 13.436908341686157, + "grad_norm": 0.17633123695850372, + "learning_rate": 1.468511844738185e-05, + "loss": 0.0952, + "num_input_tokens_seen": 146810864, + "step": 120650 + }, + { + "epoch": 13.437465196569773, + "grad_norm": 0.39455944299697876, + "learning_rate": 1.4682905212816172e-05, + "loss": 0.0247, + "num_input_tokens_seen": 146816784, + "step": 120655 + }, + { + "epoch": 13.438022051453391, + "grad_norm": 0.3564099967479706, + "learning_rate": 1.4680692075703725e-05, + "loss": 0.0489, + "num_input_tokens_seen": 146823120, + "step": 120660 + }, + { + "epoch": 13.438578906337009, + "grad_norm": 0.16124708950519562, + "learning_rate": 1.4678479036065402e-05, + "loss": 0.0036, + "num_input_tokens_seen": 146829360, + "step": 120665 + }, + { + "epoch": 13.439135761220626, + "grad_norm": 0.1001991331577301, + "learning_rate": 1.4676266093922126e-05, + "loss": 0.02, + "num_input_tokens_seen": 146835440, + "step": 120670 + }, + { + "epoch": 13.439692616104244, + "grad_norm": 0.0015169407706707716, + "learning_rate": 1.467405324929477e-05, + "loss": 0.0506, + "num_input_tokens_seen": 146841744, + "step": 120675 + }, + { + "epoch": 13.44024947098786, + "grad_norm": 2.2286596298217773, + "learning_rate": 1.4671840502204268e-05, + "loss": 0.0952, + "num_input_tokens_seen": 146847888, + "step": 120680 + }, + { + "epoch": 13.440806325871478, + "grad_norm": 0.7962412238121033, + "learning_rate": 1.46696278526715e-05, + "loss": 0.015, + "num_input_tokens_seen": 146853904, + "step": 120685 + }, + { + "epoch": 13.441363180755095, + "grad_norm": 0.511049211025238, + "learning_rate": 1.466741530071738e-05, + "loss": 0.0273, + "num_input_tokens_seen": 146860112, + "step": 120690 + }, + { + "epoch": 13.441920035638713, + "grad_norm": 1.0900952816009521, + "learning_rate": 1.4665202846362797e-05, + "loss": 0.0443, + "num_input_tokens_seen": 146866160, + "step": 120695 + }, + { + "epoch": 13.44247689052233, + "grad_norm": 0.4178643524646759, + "learning_rate": 1.4662990489628653e-05, + "loss": 0.02, + "num_input_tokens_seen": 146872336, + "step": 120700 + }, + { + "epoch": 13.443033745405947, + "grad_norm": 1.6983373165130615, + "learning_rate": 1.4660778230535846e-05, + "loss": 0.0506, + "num_input_tokens_seen": 146877616, + "step": 120705 + }, + { + "epoch": 13.443590600289564, + "grad_norm": 0.0007458459003828466, + "learning_rate": 1.4658566069105275e-05, + "loss": 0.0189, + "num_input_tokens_seen": 146883792, + "step": 120710 + }, + { + "epoch": 13.444147455173182, + "grad_norm": 0.879269540309906, + "learning_rate": 1.465635400535783e-05, + "loss": 0.128, + "num_input_tokens_seen": 146889872, + "step": 120715 + }, + { + "epoch": 13.4447043100568, + "grad_norm": 0.01020122691988945, + "learning_rate": 1.4654142039314412e-05, + "loss": 0.0057, + "num_input_tokens_seen": 146896080, + "step": 120720 + }, + { + "epoch": 13.445261164940417, + "grad_norm": 0.0008105612942017615, + "learning_rate": 1.4651930170995901e-05, + "loss": 0.004, + "num_input_tokens_seen": 146902224, + "step": 120725 + }, + { + "epoch": 13.445818019824033, + "grad_norm": 1.585067868232727, + "learning_rate": 1.4649718400423215e-05, + "loss": 0.082, + "num_input_tokens_seen": 146908080, + "step": 120730 + }, + { + "epoch": 13.44637487470765, + "grad_norm": 0.003045113757252693, + "learning_rate": 1.4647506727617215e-05, + "loss": 0.051, + "num_input_tokens_seen": 146913872, + "step": 120735 + }, + { + "epoch": 13.446931729591268, + "grad_norm": 0.1080319732427597, + "learning_rate": 1.464529515259882e-05, + "loss": 0.0244, + "num_input_tokens_seen": 146920048, + "step": 120740 + }, + { + "epoch": 13.447488584474886, + "grad_norm": 0.0004248014884069562, + "learning_rate": 1.4643083675388906e-05, + "loss": 0.0779, + "num_input_tokens_seen": 146926256, + "step": 120745 + }, + { + "epoch": 13.448045439358504, + "grad_norm": 1.2509632110595703, + "learning_rate": 1.4640872296008368e-05, + "loss": 0.0931, + "num_input_tokens_seen": 146931344, + "step": 120750 + }, + { + "epoch": 13.44860229424212, + "grad_norm": 1.080809235572815, + "learning_rate": 1.4638661014478083e-05, + "loss": 0.0452, + "num_input_tokens_seen": 146937040, + "step": 120755 + }, + { + "epoch": 13.449159149125737, + "grad_norm": 0.014663985930383205, + "learning_rate": 1.4636449830818955e-05, + "loss": 0.0183, + "num_input_tokens_seen": 146943440, + "step": 120760 + }, + { + "epoch": 13.449716004009355, + "grad_norm": 0.00010844409553101286, + "learning_rate": 1.4634238745051854e-05, + "loss": 0.0529, + "num_input_tokens_seen": 146949520, + "step": 120765 + }, + { + "epoch": 13.450272858892973, + "grad_norm": 0.018417686223983765, + "learning_rate": 1.4632027757197678e-05, + "loss": 0.0229, + "num_input_tokens_seen": 146955824, + "step": 120770 + }, + { + "epoch": 13.45082971377659, + "grad_norm": 0.6706711649894714, + "learning_rate": 1.4629816867277301e-05, + "loss": 0.1254, + "num_input_tokens_seen": 146962128, + "step": 120775 + }, + { + "epoch": 13.451386568660206, + "grad_norm": 0.10942690074443817, + "learning_rate": 1.4627606075311617e-05, + "loss": 0.1224, + "num_input_tokens_seen": 146968336, + "step": 120780 + }, + { + "epoch": 13.451943423543824, + "grad_norm": 1.5101948976516724, + "learning_rate": 1.4625395381321494e-05, + "loss": 0.049, + "num_input_tokens_seen": 146974480, + "step": 120785 + }, + { + "epoch": 13.452500278427442, + "grad_norm": 0.18510748445987701, + "learning_rate": 1.4623184785327843e-05, + "loss": 0.0356, + "num_input_tokens_seen": 146980240, + "step": 120790 + }, + { + "epoch": 13.45305713331106, + "grad_norm": 0.04057900607585907, + "learning_rate": 1.4620974287351504e-05, + "loss": 0.0274, + "num_input_tokens_seen": 146986224, + "step": 120795 + }, + { + "epoch": 13.453613988194677, + "grad_norm": 0.00026552099734544754, + "learning_rate": 1.4618763887413398e-05, + "loss": 0.0025, + "num_input_tokens_seen": 146992528, + "step": 120800 + }, + { + "epoch": 13.454170843078295, + "grad_norm": 0.04211890697479248, + "learning_rate": 1.4616553585534366e-05, + "loss": 0.0258, + "num_input_tokens_seen": 146999024, + "step": 120805 + }, + { + "epoch": 13.45472769796191, + "grad_norm": 2.499128818511963, + "learning_rate": 1.4614343381735317e-05, + "loss": 0.1551, + "num_input_tokens_seen": 147004848, + "step": 120810 + }, + { + "epoch": 13.455284552845528, + "grad_norm": 0.0004564691334962845, + "learning_rate": 1.461213327603711e-05, + "loss": 0.082, + "num_input_tokens_seen": 147010896, + "step": 120815 + }, + { + "epoch": 13.455841407729146, + "grad_norm": 1.3795583248138428, + "learning_rate": 1.4609923268460632e-05, + "loss": 0.0926, + "num_input_tokens_seen": 147017040, + "step": 120820 + }, + { + "epoch": 13.456398262612764, + "grad_norm": 0.7267374396324158, + "learning_rate": 1.4607713359026747e-05, + "loss": 0.0358, + "num_input_tokens_seen": 147022992, + "step": 120825 + }, + { + "epoch": 13.456955117496381, + "grad_norm": 0.009140015579760075, + "learning_rate": 1.4605503547756342e-05, + "loss": 0.0712, + "num_input_tokens_seen": 147029104, + "step": 120830 + }, + { + "epoch": 13.457511972379997, + "grad_norm": 0.34661588072776794, + "learning_rate": 1.4603293834670278e-05, + "loss": 0.0235, + "num_input_tokens_seen": 147035184, + "step": 120835 + }, + { + "epoch": 13.458068827263615, + "grad_norm": 0.11405503749847412, + "learning_rate": 1.4601084219789438e-05, + "loss": 0.0519, + "num_input_tokens_seen": 147041360, + "step": 120840 + }, + { + "epoch": 13.458625682147233, + "grad_norm": 0.049340590834617615, + "learning_rate": 1.4598874703134685e-05, + "loss": 0.0116, + "num_input_tokens_seen": 147047408, + "step": 120845 + }, + { + "epoch": 13.45918253703085, + "grad_norm": 0.00016102875815704465, + "learning_rate": 1.4596665284726896e-05, + "loss": 0.0719, + "num_input_tokens_seen": 147053648, + "step": 120850 + }, + { + "epoch": 13.459739391914468, + "grad_norm": 0.0010269455378875136, + "learning_rate": 1.459445596458694e-05, + "loss": 0.0128, + "num_input_tokens_seen": 147059792, + "step": 120855 + }, + { + "epoch": 13.460296246798084, + "grad_norm": 0.07837006449699402, + "learning_rate": 1.4592246742735683e-05, + "loss": 0.0037, + "num_input_tokens_seen": 147066096, + "step": 120860 + }, + { + "epoch": 13.460853101681701, + "grad_norm": 0.013169320300221443, + "learning_rate": 1.4590037619193985e-05, + "loss": 0.1342, + "num_input_tokens_seen": 147072240, + "step": 120865 + }, + { + "epoch": 13.46140995656532, + "grad_norm": 0.03389863669872284, + "learning_rate": 1.4587828593982733e-05, + "loss": 0.0047, + "num_input_tokens_seen": 147077904, + "step": 120870 + }, + { + "epoch": 13.461966811448937, + "grad_norm": 1.2431106567382812, + "learning_rate": 1.4585619667122782e-05, + "loss": 0.1252, + "num_input_tokens_seen": 147084304, + "step": 120875 + }, + { + "epoch": 13.462523666332554, + "grad_norm": 2.608172655105591, + "learning_rate": 1.4583410838634997e-05, + "loss": 0.1243, + "num_input_tokens_seen": 147090448, + "step": 120880 + }, + { + "epoch": 13.46308052121617, + "grad_norm": 1.7115201950073242, + "learning_rate": 1.4581202108540232e-05, + "loss": 0.0479, + "num_input_tokens_seen": 147096144, + "step": 120885 + }, + { + "epoch": 13.463637376099788, + "grad_norm": 0.06676080077886581, + "learning_rate": 1.4578993476859371e-05, + "loss": 0.0031, + "num_input_tokens_seen": 147102416, + "step": 120890 + }, + { + "epoch": 13.464194230983406, + "grad_norm": 0.004082613158971071, + "learning_rate": 1.4576784943613255e-05, + "loss": 0.006, + "num_input_tokens_seen": 147108720, + "step": 120895 + }, + { + "epoch": 13.464751085867023, + "grad_norm": 0.06948571652173996, + "learning_rate": 1.4574576508822768e-05, + "loss": 0.0355, + "num_input_tokens_seen": 147114672, + "step": 120900 + }, + { + "epoch": 13.465307940750641, + "grad_norm": 1.295699954032898, + "learning_rate": 1.4572368172508755e-05, + "loss": 0.0402, + "num_input_tokens_seen": 147121008, + "step": 120905 + }, + { + "epoch": 13.465864795634257, + "grad_norm": 0.002117870608344674, + "learning_rate": 1.4570159934692085e-05, + "loss": 0.0036, + "num_input_tokens_seen": 147127280, + "step": 120910 + }, + { + "epoch": 13.466421650517875, + "grad_norm": 0.006665114313364029, + "learning_rate": 1.4567951795393595e-05, + "loss": 0.0058, + "num_input_tokens_seen": 147134032, + "step": 120915 + }, + { + "epoch": 13.466978505401492, + "grad_norm": 0.006276017986238003, + "learning_rate": 1.456574375463417e-05, + "loss": 0.0596, + "num_input_tokens_seen": 147140048, + "step": 120920 + }, + { + "epoch": 13.46753536028511, + "grad_norm": 0.06359637528657913, + "learning_rate": 1.4563535812434656e-05, + "loss": 0.0014, + "num_input_tokens_seen": 147146384, + "step": 120925 + }, + { + "epoch": 13.468092215168728, + "grad_norm": 0.03138088807463646, + "learning_rate": 1.4561327968815908e-05, + "loss": 0.082, + "num_input_tokens_seen": 147152304, + "step": 120930 + }, + { + "epoch": 13.468649070052344, + "grad_norm": 0.1681368201971054, + "learning_rate": 1.455912022379877e-05, + "loss": 0.0091, + "num_input_tokens_seen": 147157616, + "step": 120935 + }, + { + "epoch": 13.469205924935961, + "grad_norm": 1.7886370420455933, + "learning_rate": 1.4556912577404114e-05, + "loss": 0.0479, + "num_input_tokens_seen": 147163728, + "step": 120940 + }, + { + "epoch": 13.469762779819579, + "grad_norm": 0.0030314403120428324, + "learning_rate": 1.4554705029652787e-05, + "loss": 0.0204, + "num_input_tokens_seen": 147169712, + "step": 120945 + }, + { + "epoch": 13.470319634703197, + "grad_norm": 0.23239652812480927, + "learning_rate": 1.4552497580565644e-05, + "loss": 0.018, + "num_input_tokens_seen": 147175664, + "step": 120950 + }, + { + "epoch": 13.470876489586814, + "grad_norm": 0.0002333903539692983, + "learning_rate": 1.4550290230163515e-05, + "loss": 0.008, + "num_input_tokens_seen": 147182160, + "step": 120955 + }, + { + "epoch": 13.47143334447043, + "grad_norm": 0.01014380156993866, + "learning_rate": 1.4548082978467281e-05, + "loss": 0.011, + "num_input_tokens_seen": 147188688, + "step": 120960 + }, + { + "epoch": 13.471990199354048, + "grad_norm": 0.003123981412500143, + "learning_rate": 1.4545875825497767e-05, + "loss": 0.142, + "num_input_tokens_seen": 147194736, + "step": 120965 + }, + { + "epoch": 13.472547054237666, + "grad_norm": 0.11562278866767883, + "learning_rate": 1.4543668771275842e-05, + "loss": 0.0842, + "num_input_tokens_seen": 147200880, + "step": 120970 + }, + { + "epoch": 13.473103909121283, + "grad_norm": 0.20992912352085114, + "learning_rate": 1.4541461815822353e-05, + "loss": 0.0179, + "num_input_tokens_seen": 147207216, + "step": 120975 + }, + { + "epoch": 13.4736607640049, + "grad_norm": 0.8764311075210571, + "learning_rate": 1.4539254959158116e-05, + "loss": 0.0303, + "num_input_tokens_seen": 147213296, + "step": 120980 + }, + { + "epoch": 13.474217618888517, + "grad_norm": 0.00029665176407434046, + "learning_rate": 1.4537048201304005e-05, + "loss": 0.0511, + "num_input_tokens_seen": 147219120, + "step": 120985 + }, + { + "epoch": 13.474774473772134, + "grad_norm": 0.12474358826875687, + "learning_rate": 1.4534841542280848e-05, + "loss": 0.0087, + "num_input_tokens_seen": 147225104, + "step": 120990 + }, + { + "epoch": 13.475331328655752, + "grad_norm": 0.10783359408378601, + "learning_rate": 1.4532634982109505e-05, + "loss": 0.0034, + "num_input_tokens_seen": 147231376, + "step": 120995 + }, + { + "epoch": 13.47588818353937, + "grad_norm": 0.14324310421943665, + "learning_rate": 1.4530428520810812e-05, + "loss": 0.0048, + "num_input_tokens_seen": 147237584, + "step": 121000 + }, + { + "epoch": 13.476445038422987, + "grad_norm": 0.004639350343495607, + "learning_rate": 1.4528222158405613e-05, + "loss": 0.1182, + "num_input_tokens_seen": 147243472, + "step": 121005 + }, + { + "epoch": 13.477001893306605, + "grad_norm": 0.1243492066860199, + "learning_rate": 1.4526015894914734e-05, + "loss": 0.0071, + "num_input_tokens_seen": 147249392, + "step": 121010 + }, + { + "epoch": 13.477558748190221, + "grad_norm": 0.004143341910094023, + "learning_rate": 1.4523809730359034e-05, + "loss": 0.0172, + "num_input_tokens_seen": 147255152, + "step": 121015 + }, + { + "epoch": 13.478115603073839, + "grad_norm": 0.5670117735862732, + "learning_rate": 1.4521603664759348e-05, + "loss": 0.0261, + "num_input_tokens_seen": 147261424, + "step": 121020 + }, + { + "epoch": 13.478672457957456, + "grad_norm": 0.44351667165756226, + "learning_rate": 1.4519397698136509e-05, + "loss": 0.0727, + "num_input_tokens_seen": 147267408, + "step": 121025 + }, + { + "epoch": 13.479229312841074, + "grad_norm": 0.049947112798690796, + "learning_rate": 1.4517191830511345e-05, + "loss": 0.0016, + "num_input_tokens_seen": 147273456, + "step": 121030 + }, + { + "epoch": 13.479786167724692, + "grad_norm": 0.05082650110125542, + "learning_rate": 1.4514986061904713e-05, + "loss": 0.1753, + "num_input_tokens_seen": 147279600, + "step": 121035 + }, + { + "epoch": 13.480343022608308, + "grad_norm": 0.49652400612831116, + "learning_rate": 1.4512780392337428e-05, + "loss": 0.0162, + "num_input_tokens_seen": 147285808, + "step": 121040 + }, + { + "epoch": 13.480899877491925, + "grad_norm": 0.36842119693756104, + "learning_rate": 1.4510574821830353e-05, + "loss": 0.1169, + "num_input_tokens_seen": 147292048, + "step": 121045 + }, + { + "epoch": 13.481456732375543, + "grad_norm": 1.5844085216522217, + "learning_rate": 1.4508369350404286e-05, + "loss": 0.0817, + "num_input_tokens_seen": 147298128, + "step": 121050 + }, + { + "epoch": 13.48201358725916, + "grad_norm": 0.0003756472433451563, + "learning_rate": 1.4506163978080083e-05, + "loss": 0.0374, + "num_input_tokens_seen": 147304240, + "step": 121055 + }, + { + "epoch": 13.482570442142778, + "grad_norm": 0.2079443335533142, + "learning_rate": 1.4503958704878563e-05, + "loss": 0.0403, + "num_input_tokens_seen": 147309680, + "step": 121060 + }, + { + "epoch": 13.483127297026394, + "grad_norm": 0.08911721408367157, + "learning_rate": 1.450175353082057e-05, + "loss": 0.0221, + "num_input_tokens_seen": 147315696, + "step": 121065 + }, + { + "epoch": 13.483684151910012, + "grad_norm": 0.006809086073189974, + "learning_rate": 1.4499548455926926e-05, + "loss": 0.0894, + "num_input_tokens_seen": 147321680, + "step": 121070 + }, + { + "epoch": 13.48424100679363, + "grad_norm": 0.021797727793455124, + "learning_rate": 1.4497343480218457e-05, + "loss": 0.0059, + "num_input_tokens_seen": 147327792, + "step": 121075 + }, + { + "epoch": 13.484797861677247, + "grad_norm": 0.0008371273288503289, + "learning_rate": 1.4495138603715986e-05, + "loss": 0.0139, + "num_input_tokens_seen": 147334128, + "step": 121080 + }, + { + "epoch": 13.485354716560865, + "grad_norm": 0.00014018930960446596, + "learning_rate": 1.4492933826440358e-05, + "loss": 0.004, + "num_input_tokens_seen": 147340112, + "step": 121085 + }, + { + "epoch": 13.48591157144448, + "grad_norm": 1.353771686553955, + "learning_rate": 1.4490729148412386e-05, + "loss": 0.115, + "num_input_tokens_seen": 147346320, + "step": 121090 + }, + { + "epoch": 13.486468426328099, + "grad_norm": 0.031215913593769073, + "learning_rate": 1.44885245696529e-05, + "loss": 0.0112, + "num_input_tokens_seen": 147352272, + "step": 121095 + }, + { + "epoch": 13.487025281211716, + "grad_norm": 0.0018394539365544915, + "learning_rate": 1.4486320090182709e-05, + "loss": 0.121, + "num_input_tokens_seen": 147358736, + "step": 121100 + }, + { + "epoch": 13.487582136095334, + "grad_norm": 0.46863460540771484, + "learning_rate": 1.4484115710022658e-05, + "loss": 0.0519, + "num_input_tokens_seen": 147364784, + "step": 121105 + }, + { + "epoch": 13.488138990978952, + "grad_norm": 0.8424895405769348, + "learning_rate": 1.448191142919355e-05, + "loss": 0.0639, + "num_input_tokens_seen": 147371024, + "step": 121110 + }, + { + "epoch": 13.488695845862567, + "grad_norm": 1.710000991821289, + "learning_rate": 1.4479707247716226e-05, + "loss": 0.1101, + "num_input_tokens_seen": 147377104, + "step": 121115 + }, + { + "epoch": 13.489252700746185, + "grad_norm": 0.25158700346946716, + "learning_rate": 1.4477503165611495e-05, + "loss": 0.034, + "num_input_tokens_seen": 147383312, + "step": 121120 + }, + { + "epoch": 13.489809555629803, + "grad_norm": 0.6809155344963074, + "learning_rate": 1.4475299182900176e-05, + "loss": 0.0292, + "num_input_tokens_seen": 147389200, + "step": 121125 + }, + { + "epoch": 13.49036641051342, + "grad_norm": 0.010934632271528244, + "learning_rate": 1.4473095299603079e-05, + "loss": 0.0011, + "num_input_tokens_seen": 147395056, + "step": 121130 + }, + { + "epoch": 13.490923265397038, + "grad_norm": 0.0018323564436286688, + "learning_rate": 1.4470891515741042e-05, + "loss": 0.0143, + "num_input_tokens_seen": 147401456, + "step": 121135 + }, + { + "epoch": 13.491480120280654, + "grad_norm": 0.0038086948916316032, + "learning_rate": 1.446868783133487e-05, + "loss": 0.0076, + "num_input_tokens_seen": 147407824, + "step": 121140 + }, + { + "epoch": 13.492036975164272, + "grad_norm": 1.0540492534637451, + "learning_rate": 1.4466484246405382e-05, + "loss": 0.1613, + "num_input_tokens_seen": 147414096, + "step": 121145 + }, + { + "epoch": 13.49259383004789, + "grad_norm": 0.00970409158617258, + "learning_rate": 1.4464280760973375e-05, + "loss": 0.0407, + "num_input_tokens_seen": 147420240, + "step": 121150 + }, + { + "epoch": 13.493150684931507, + "grad_norm": 0.08677230775356293, + "learning_rate": 1.4462077375059688e-05, + "loss": 0.0837, + "num_input_tokens_seen": 147426256, + "step": 121155 + }, + { + "epoch": 13.493707539815125, + "grad_norm": 1.4325928688049316, + "learning_rate": 1.4459874088685116e-05, + "loss": 0.042, + "num_input_tokens_seen": 147432432, + "step": 121160 + }, + { + "epoch": 13.494264394698742, + "grad_norm": 0.00014389304851647466, + "learning_rate": 1.4457670901870496e-05, + "loss": 0.0022, + "num_input_tokens_seen": 147438576, + "step": 121165 + }, + { + "epoch": 13.494821249582358, + "grad_norm": 0.01905113458633423, + "learning_rate": 1.4455467814636597e-05, + "loss": 0.1116, + "num_input_tokens_seen": 147444528, + "step": 121170 + }, + { + "epoch": 13.495378104465976, + "grad_norm": 0.011916941031813622, + "learning_rate": 1.4453264827004268e-05, + "loss": 0.0493, + "num_input_tokens_seen": 147449808, + "step": 121175 + }, + { + "epoch": 13.495934959349594, + "grad_norm": 0.01492008101195097, + "learning_rate": 1.4451061938994289e-05, + "loss": 0.0121, + "num_input_tokens_seen": 147456016, + "step": 121180 + }, + { + "epoch": 13.496491814233211, + "grad_norm": 0.0018672039732336998, + "learning_rate": 1.4448859150627494e-05, + "loss": 0.0095, + "num_input_tokens_seen": 147462064, + "step": 121185 + }, + { + "epoch": 13.497048669116829, + "grad_norm": 0.00019422800687607378, + "learning_rate": 1.444665646192468e-05, + "loss": 0.0827, + "num_input_tokens_seen": 147468112, + "step": 121190 + }, + { + "epoch": 13.497605524000445, + "grad_norm": 1.0066635608673096, + "learning_rate": 1.4444453872906644e-05, + "loss": 0.1009, + "num_input_tokens_seen": 147473904, + "step": 121195 + }, + { + "epoch": 13.498162378884063, + "grad_norm": 0.050323087722063065, + "learning_rate": 1.4442251383594193e-05, + "loss": 0.0675, + "num_input_tokens_seen": 147479664, + "step": 121200 + }, + { + "epoch": 13.49871923376768, + "grad_norm": 0.003995422273874283, + "learning_rate": 1.4440048994008146e-05, + "loss": 0.0658, + "num_input_tokens_seen": 147485424, + "step": 121205 + }, + { + "epoch": 13.499276088651298, + "grad_norm": 0.10519137978553772, + "learning_rate": 1.4437846704169297e-05, + "loss": 0.0473, + "num_input_tokens_seen": 147491312, + "step": 121210 + }, + { + "epoch": 13.499832943534916, + "grad_norm": 0.0008870215970091522, + "learning_rate": 1.4435644514098445e-05, + "loss": 0.0019, + "num_input_tokens_seen": 147497488, + "step": 121215 + }, + { + "epoch": 13.500389798418531, + "grad_norm": 0.003671390237286687, + "learning_rate": 1.443344242381639e-05, + "loss": 0.0294, + "num_input_tokens_seen": 147503440, + "step": 121220 + }, + { + "epoch": 13.50094665330215, + "grad_norm": 1.4134410619735718, + "learning_rate": 1.4431240433343942e-05, + "loss": 0.081, + "num_input_tokens_seen": 147509232, + "step": 121225 + }, + { + "epoch": 13.501503508185767, + "grad_norm": 0.26494571566581726, + "learning_rate": 1.442903854270189e-05, + "loss": 0.0382, + "num_input_tokens_seen": 147515152, + "step": 121230 + }, + { + "epoch": 13.502060363069385, + "grad_norm": 0.9513834118843079, + "learning_rate": 1.4426836751911055e-05, + "loss": 0.1158, + "num_input_tokens_seen": 147521200, + "step": 121235 + }, + { + "epoch": 13.502617217953002, + "grad_norm": 0.9139752388000488, + "learning_rate": 1.4424635060992198e-05, + "loss": 0.0344, + "num_input_tokens_seen": 147527472, + "step": 121240 + }, + { + "epoch": 13.503174072836618, + "grad_norm": 0.00012826618331018835, + "learning_rate": 1.4422433469966146e-05, + "loss": 0.0858, + "num_input_tokens_seen": 147533392, + "step": 121245 + }, + { + "epoch": 13.503730927720236, + "grad_norm": 0.6544731855392456, + "learning_rate": 1.4420231978853677e-05, + "loss": 0.0279, + "num_input_tokens_seen": 147539952, + "step": 121250 + }, + { + "epoch": 13.504287782603853, + "grad_norm": 0.022272832691669464, + "learning_rate": 1.4418030587675601e-05, + "loss": 0.0189, + "num_input_tokens_seen": 147546000, + "step": 121255 + }, + { + "epoch": 13.504844637487471, + "grad_norm": 0.3293175995349884, + "learning_rate": 1.4415829296452705e-05, + "loss": 0.0308, + "num_input_tokens_seen": 147552144, + "step": 121260 + }, + { + "epoch": 13.505401492371089, + "grad_norm": 0.08945544064044952, + "learning_rate": 1.4413628105205782e-05, + "loss": 0.1566, + "num_input_tokens_seen": 147558160, + "step": 121265 + }, + { + "epoch": 13.505958347254705, + "grad_norm": 0.7599549889564514, + "learning_rate": 1.4411427013955611e-05, + "loss": 0.0263, + "num_input_tokens_seen": 147564528, + "step": 121270 + }, + { + "epoch": 13.506515202138322, + "grad_norm": 0.013618367724120617, + "learning_rate": 1.4409226022723004e-05, + "loss": 0.0447, + "num_input_tokens_seen": 147570608, + "step": 121275 + }, + { + "epoch": 13.50707205702194, + "grad_norm": 0.07772878557443619, + "learning_rate": 1.4407025131528746e-05, + "loss": 0.0059, + "num_input_tokens_seen": 147576336, + "step": 121280 + }, + { + "epoch": 13.507628911905558, + "grad_norm": 0.43444886803627014, + "learning_rate": 1.4404824340393624e-05, + "loss": 0.0236, + "num_input_tokens_seen": 147582320, + "step": 121285 + }, + { + "epoch": 13.508185766789175, + "grad_norm": 0.012299620546400547, + "learning_rate": 1.440262364933841e-05, + "loss": 0.0089, + "num_input_tokens_seen": 147588592, + "step": 121290 + }, + { + "epoch": 13.508742621672791, + "grad_norm": 0.0011345010716468096, + "learning_rate": 1.440042305838392e-05, + "loss": 0.0207, + "num_input_tokens_seen": 147594384, + "step": 121295 + }, + { + "epoch": 13.509299476556409, + "grad_norm": 0.2294492870569229, + "learning_rate": 1.4398222567550912e-05, + "loss": 0.0763, + "num_input_tokens_seen": 147600656, + "step": 121300 + }, + { + "epoch": 13.509856331440027, + "grad_norm": 0.08069057762622833, + "learning_rate": 1.4396022176860202e-05, + "loss": 0.0419, + "num_input_tokens_seen": 147606576, + "step": 121305 + }, + { + "epoch": 13.510413186323644, + "grad_norm": 0.19843965768814087, + "learning_rate": 1.4393821886332554e-05, + "loss": 0.0481, + "num_input_tokens_seen": 147612720, + "step": 121310 + }, + { + "epoch": 13.510970041207262, + "grad_norm": 0.5302289128303528, + "learning_rate": 1.4391621695988755e-05, + "loss": 0.0731, + "num_input_tokens_seen": 147619216, + "step": 121315 + }, + { + "epoch": 13.511526896090878, + "grad_norm": 0.2688831388950348, + "learning_rate": 1.438942160584958e-05, + "loss": 0.0298, + "num_input_tokens_seen": 147625520, + "step": 121320 + }, + { + "epoch": 13.512083750974496, + "grad_norm": 1.8026529550552368, + "learning_rate": 1.4387221615935831e-05, + "loss": 0.2071, + "num_input_tokens_seen": 147631504, + "step": 121325 + }, + { + "epoch": 13.512640605858113, + "grad_norm": 0.06788259744644165, + "learning_rate": 1.4385021726268275e-05, + "loss": 0.0005, + "num_input_tokens_seen": 147637936, + "step": 121330 + }, + { + "epoch": 13.513197460741731, + "grad_norm": 0.49502432346343994, + "learning_rate": 1.4382821936867691e-05, + "loss": 0.0634, + "num_input_tokens_seen": 147644304, + "step": 121335 + }, + { + "epoch": 13.513754315625349, + "grad_norm": 0.15883885324001312, + "learning_rate": 1.4380622247754855e-05, + "loss": 0.0875, + "num_input_tokens_seen": 147650832, + "step": 121340 + }, + { + "epoch": 13.514311170508964, + "grad_norm": 1.0732396841049194, + "learning_rate": 1.4378422658950555e-05, + "loss": 0.0422, + "num_input_tokens_seen": 147656944, + "step": 121345 + }, + { + "epoch": 13.514868025392582, + "grad_norm": 0.1532866507768631, + "learning_rate": 1.4376223170475556e-05, + "loss": 0.0284, + "num_input_tokens_seen": 147662928, + "step": 121350 + }, + { + "epoch": 13.5154248802762, + "grad_norm": 0.8771163821220398, + "learning_rate": 1.437402378235066e-05, + "loss": 0.0632, + "num_input_tokens_seen": 147669008, + "step": 121355 + }, + { + "epoch": 13.515981735159817, + "grad_norm": 0.8463623523712158, + "learning_rate": 1.4371824494596603e-05, + "loss": 0.0244, + "num_input_tokens_seen": 147675120, + "step": 121360 + }, + { + "epoch": 13.516538590043435, + "grad_norm": 0.2151787281036377, + "learning_rate": 1.4369625307234185e-05, + "loss": 0.0123, + "num_input_tokens_seen": 147681328, + "step": 121365 + }, + { + "epoch": 13.517095444927053, + "grad_norm": 0.04378875717520714, + "learning_rate": 1.4367426220284169e-05, + "loss": 0.0189, + "num_input_tokens_seen": 147687728, + "step": 121370 + }, + { + "epoch": 13.517652299810669, + "grad_norm": 0.5986652970314026, + "learning_rate": 1.4365227233767337e-05, + "loss": 0.0622, + "num_input_tokens_seen": 147693872, + "step": 121375 + }, + { + "epoch": 13.518209154694286, + "grad_norm": 0.01704447716474533, + "learning_rate": 1.4363028347704466e-05, + "loss": 0.0022, + "num_input_tokens_seen": 147700048, + "step": 121380 + }, + { + "epoch": 13.518766009577904, + "grad_norm": 0.10186950862407684, + "learning_rate": 1.4360829562116296e-05, + "loss": 0.0045, + "num_input_tokens_seen": 147705872, + "step": 121385 + }, + { + "epoch": 13.519322864461522, + "grad_norm": 1.0471704006195068, + "learning_rate": 1.4358630877023621e-05, + "loss": 0.0633, + "num_input_tokens_seen": 147711952, + "step": 121390 + }, + { + "epoch": 13.51987971934514, + "grad_norm": 0.30231383442878723, + "learning_rate": 1.4356432292447198e-05, + "loss": 0.0688, + "num_input_tokens_seen": 147718064, + "step": 121395 + }, + { + "epoch": 13.520436574228755, + "grad_norm": 0.03597220778465271, + "learning_rate": 1.4354233808407802e-05, + "loss": 0.0231, + "num_input_tokens_seen": 147723568, + "step": 121400 + }, + { + "epoch": 13.520993429112373, + "grad_norm": 0.2512734830379486, + "learning_rate": 1.4352035424926202e-05, + "loss": 0.005, + "num_input_tokens_seen": 147730000, + "step": 121405 + }, + { + "epoch": 13.52155028399599, + "grad_norm": 0.04651067778468132, + "learning_rate": 1.4349837142023158e-05, + "loss": 0.0014, + "num_input_tokens_seen": 147736144, + "step": 121410 + }, + { + "epoch": 13.522107138879608, + "grad_norm": 0.05917096510529518, + "learning_rate": 1.4347638959719426e-05, + "loss": 0.0151, + "num_input_tokens_seen": 147742064, + "step": 121415 + }, + { + "epoch": 13.522663993763226, + "grad_norm": 0.5482668876647949, + "learning_rate": 1.4345440878035787e-05, + "loss": 0.0327, + "num_input_tokens_seen": 147748080, + "step": 121420 + }, + { + "epoch": 13.523220848646842, + "grad_norm": 0.010357133112847805, + "learning_rate": 1.4343242896992995e-05, + "loss": 0.0057, + "num_input_tokens_seen": 147754224, + "step": 121425 + }, + { + "epoch": 13.52377770353046, + "grad_norm": 0.07414041459560394, + "learning_rate": 1.4341045016611812e-05, + "loss": 0.0274, + "num_input_tokens_seen": 147760304, + "step": 121430 + }, + { + "epoch": 13.524334558414077, + "grad_norm": 0.023468876257538795, + "learning_rate": 1.4338847236912989e-05, + "loss": 0.0813, + "num_input_tokens_seen": 147766640, + "step": 121435 + }, + { + "epoch": 13.524891413297695, + "grad_norm": 0.030183924362063408, + "learning_rate": 1.4336649557917306e-05, + "loss": 0.0021, + "num_input_tokens_seen": 147772880, + "step": 121440 + }, + { + "epoch": 13.525448268181313, + "grad_norm": 0.03009759448468685, + "learning_rate": 1.43344519796455e-05, + "loss": 0.0038, + "num_input_tokens_seen": 147779248, + "step": 121445 + }, + { + "epoch": 13.526005123064929, + "grad_norm": 1.0338621139526367, + "learning_rate": 1.433225450211836e-05, + "loss": 0.059, + "num_input_tokens_seen": 147785520, + "step": 121450 + }, + { + "epoch": 13.526561977948546, + "grad_norm": 0.42057597637176514, + "learning_rate": 1.4330057125356605e-05, + "loss": 0.0255, + "num_input_tokens_seen": 147791728, + "step": 121455 + }, + { + "epoch": 13.527118832832164, + "grad_norm": 0.00984988920390606, + "learning_rate": 1.4327859849381017e-05, + "loss": 0.0246, + "num_input_tokens_seen": 147797680, + "step": 121460 + }, + { + "epoch": 13.527675687715782, + "grad_norm": 0.6009382009506226, + "learning_rate": 1.4325662674212334e-05, + "loss": 0.0157, + "num_input_tokens_seen": 147803408, + "step": 121465 + }, + { + "epoch": 13.5282325425994, + "grad_norm": 0.022371456027030945, + "learning_rate": 1.432346559987133e-05, + "loss": 0.0172, + "num_input_tokens_seen": 147809680, + "step": 121470 + }, + { + "epoch": 13.528789397483015, + "grad_norm": 0.034540046006441116, + "learning_rate": 1.4321268626378747e-05, + "loss": 0.0292, + "num_input_tokens_seen": 147816112, + "step": 121475 + }, + { + "epoch": 13.529346252366633, + "grad_norm": 0.4430781304836273, + "learning_rate": 1.4319071753755337e-05, + "loss": 0.0521, + "num_input_tokens_seen": 147822032, + "step": 121480 + }, + { + "epoch": 13.52990310725025, + "grad_norm": 1.9698249101638794, + "learning_rate": 1.4316874982021841e-05, + "loss": 0.0378, + "num_input_tokens_seen": 147827952, + "step": 121485 + }, + { + "epoch": 13.530459962133868, + "grad_norm": 0.10498324036598206, + "learning_rate": 1.4314678311199031e-05, + "loss": 0.0058, + "num_input_tokens_seen": 147834160, + "step": 121490 + }, + { + "epoch": 13.531016817017486, + "grad_norm": 1.2853803634643555, + "learning_rate": 1.4312481741307644e-05, + "loss": 0.0308, + "num_input_tokens_seen": 147840400, + "step": 121495 + }, + { + "epoch": 13.531573671901103, + "grad_norm": 0.0024258338380604982, + "learning_rate": 1.431028527236843e-05, + "loss": 0.072, + "num_input_tokens_seen": 147846192, + "step": 121500 + }, + { + "epoch": 13.53213052678472, + "grad_norm": 1.2217869758605957, + "learning_rate": 1.4308088904402128e-05, + "loss": 0.118, + "num_input_tokens_seen": 147852464, + "step": 121505 + }, + { + "epoch": 13.532687381668337, + "grad_norm": 0.9167493581771851, + "learning_rate": 1.4305892637429502e-05, + "loss": 0.0208, + "num_input_tokens_seen": 147858384, + "step": 121510 + }, + { + "epoch": 13.533244236551955, + "grad_norm": 0.03299831971526146, + "learning_rate": 1.4303696471471275e-05, + "loss": 0.0442, + "num_input_tokens_seen": 147864336, + "step": 121515 + }, + { + "epoch": 13.533801091435572, + "grad_norm": 0.42506179213523865, + "learning_rate": 1.4301500406548219e-05, + "loss": 0.0192, + "num_input_tokens_seen": 147870480, + "step": 121520 + }, + { + "epoch": 13.53435794631919, + "grad_norm": 0.04195229336619377, + "learning_rate": 1.4299304442681061e-05, + "loss": 0.0024, + "num_input_tokens_seen": 147876944, + "step": 121525 + }, + { + "epoch": 13.534914801202806, + "grad_norm": 0.0037917024455964565, + "learning_rate": 1.4297108579890544e-05, + "loss": 0.097, + "num_input_tokens_seen": 147883472, + "step": 121530 + }, + { + "epoch": 13.535471656086424, + "grad_norm": 0.09199953824281693, + "learning_rate": 1.4294912818197403e-05, + "loss": 0.0434, + "num_input_tokens_seen": 147889488, + "step": 121535 + }, + { + "epoch": 13.536028510970041, + "grad_norm": 0.9904938340187073, + "learning_rate": 1.4292717157622399e-05, + "loss": 0.0601, + "num_input_tokens_seen": 147895472, + "step": 121540 + }, + { + "epoch": 13.536585365853659, + "grad_norm": 0.8043900728225708, + "learning_rate": 1.4290521598186257e-05, + "loss": 0.0172, + "num_input_tokens_seen": 147901488, + "step": 121545 + }, + { + "epoch": 13.537142220737277, + "grad_norm": 0.056913822889328, + "learning_rate": 1.4288326139909719e-05, + "loss": 0.0482, + "num_input_tokens_seen": 147907504, + "step": 121550 + }, + { + "epoch": 13.537699075620893, + "grad_norm": 0.005644108168780804, + "learning_rate": 1.4286130782813514e-05, + "loss": 0.0239, + "num_input_tokens_seen": 147913552, + "step": 121555 + }, + { + "epoch": 13.53825593050451, + "grad_norm": 0.0756245106458664, + "learning_rate": 1.4283935526918396e-05, + "loss": 0.0272, + "num_input_tokens_seen": 147919856, + "step": 121560 + }, + { + "epoch": 13.538812785388128, + "grad_norm": 1.5232359170913696, + "learning_rate": 1.4281740372245087e-05, + "loss": 0.0539, + "num_input_tokens_seen": 147926000, + "step": 121565 + }, + { + "epoch": 13.539369640271746, + "grad_norm": 0.6687878966331482, + "learning_rate": 1.427954531881434e-05, + "loss": 0.0645, + "num_input_tokens_seen": 147932400, + "step": 121570 + }, + { + "epoch": 13.539926495155363, + "grad_norm": 1.32534658908844, + "learning_rate": 1.4277350366646863e-05, + "loss": 0.0618, + "num_input_tokens_seen": 147938224, + "step": 121575 + }, + { + "epoch": 13.54048335003898, + "grad_norm": 0.1412002146244049, + "learning_rate": 1.427515551576341e-05, + "loss": 0.0459, + "num_input_tokens_seen": 147944144, + "step": 121580 + }, + { + "epoch": 13.541040204922597, + "grad_norm": 1.4402421712875366, + "learning_rate": 1.4272960766184699e-05, + "loss": 0.0989, + "num_input_tokens_seen": 147950128, + "step": 121585 + }, + { + "epoch": 13.541597059806215, + "grad_norm": 0.2600584626197815, + "learning_rate": 1.4270766117931475e-05, + "loss": 0.0171, + "num_input_tokens_seen": 147956176, + "step": 121590 + }, + { + "epoch": 13.542153914689832, + "grad_norm": 0.006798325106501579, + "learning_rate": 1.4268571571024461e-05, + "loss": 0.0585, + "num_input_tokens_seen": 147962256, + "step": 121595 + }, + { + "epoch": 13.54271076957345, + "grad_norm": 0.7012548446655273, + "learning_rate": 1.4266377125484387e-05, + "loss": 0.0159, + "num_input_tokens_seen": 147968368, + "step": 121600 + }, + { + "epoch": 13.543267624457066, + "grad_norm": 0.15129496157169342, + "learning_rate": 1.426418278133197e-05, + "loss": 0.1338, + "num_input_tokens_seen": 147974352, + "step": 121605 + }, + { + "epoch": 13.543824479340683, + "grad_norm": 0.005921783857047558, + "learning_rate": 1.4261988538587958e-05, + "loss": 0.0005, + "num_input_tokens_seen": 147980592, + "step": 121610 + }, + { + "epoch": 13.544381334224301, + "grad_norm": 0.0006431384827010334, + "learning_rate": 1.425979439727307e-05, + "loss": 0.0169, + "num_input_tokens_seen": 147986640, + "step": 121615 + }, + { + "epoch": 13.544938189107919, + "grad_norm": 1.8308337926864624, + "learning_rate": 1.4257600357408024e-05, + "loss": 0.1146, + "num_input_tokens_seen": 147992656, + "step": 121620 + }, + { + "epoch": 13.545495043991536, + "grad_norm": 0.26006507873535156, + "learning_rate": 1.4255406419013545e-05, + "loss": 0.0022, + "num_input_tokens_seen": 147999056, + "step": 121625 + }, + { + "epoch": 13.546051898875152, + "grad_norm": 0.0005084152217023075, + "learning_rate": 1.4253212582110364e-05, + "loss": 0.0124, + "num_input_tokens_seen": 148005200, + "step": 121630 + }, + { + "epoch": 13.54660875375877, + "grad_norm": 0.03379327431321144, + "learning_rate": 1.4251018846719195e-05, + "loss": 0.0188, + "num_input_tokens_seen": 148010896, + "step": 121635 + }, + { + "epoch": 13.547165608642388, + "grad_norm": 0.014558355323970318, + "learning_rate": 1.4248825212860784e-05, + "loss": 0.0092, + "num_input_tokens_seen": 148017104, + "step": 121640 + }, + { + "epoch": 13.547722463526005, + "grad_norm": 0.4205400049686432, + "learning_rate": 1.4246631680555814e-05, + "loss": 0.0898, + "num_input_tokens_seen": 148023344, + "step": 121645 + }, + { + "epoch": 13.548279318409623, + "grad_norm": 0.0029789358377456665, + "learning_rate": 1.4244438249825032e-05, + "loss": 0.0078, + "num_input_tokens_seen": 148029584, + "step": 121650 + }, + { + "epoch": 13.548836173293239, + "grad_norm": 0.0009700339287519455, + "learning_rate": 1.4242244920689138e-05, + "loss": 0.1502, + "num_input_tokens_seen": 148036016, + "step": 121655 + }, + { + "epoch": 13.549393028176857, + "grad_norm": 0.00011830133735202253, + "learning_rate": 1.4240051693168869e-05, + "loss": 0.0012, + "num_input_tokens_seen": 148042160, + "step": 121660 + }, + { + "epoch": 13.549949883060474, + "grad_norm": 0.003352564526721835, + "learning_rate": 1.4237858567284934e-05, + "loss": 0.0151, + "num_input_tokens_seen": 148048240, + "step": 121665 + }, + { + "epoch": 13.550506737944092, + "grad_norm": 0.005604044999927282, + "learning_rate": 1.4235665543058046e-05, + "loss": 0.0035, + "num_input_tokens_seen": 148054608, + "step": 121670 + }, + { + "epoch": 13.55106359282771, + "grad_norm": 0.015529915690422058, + "learning_rate": 1.4233472620508909e-05, + "loss": 0.0133, + "num_input_tokens_seen": 148060496, + "step": 121675 + }, + { + "epoch": 13.551620447711326, + "grad_norm": 0.00993256364017725, + "learning_rate": 1.4231279799658265e-05, + "loss": 0.0602, + "num_input_tokens_seen": 148066640, + "step": 121680 + }, + { + "epoch": 13.552177302594943, + "grad_norm": 0.011374440044164658, + "learning_rate": 1.4229087080526804e-05, + "loss": 0.0079, + "num_input_tokens_seen": 148072688, + "step": 121685 + }, + { + "epoch": 13.552734157478561, + "grad_norm": 0.001570343622006476, + "learning_rate": 1.4226894463135248e-05, + "loss": 0.0684, + "num_input_tokens_seen": 148078672, + "step": 121690 + }, + { + "epoch": 13.553291012362179, + "grad_norm": 0.09549800306558609, + "learning_rate": 1.4224701947504298e-05, + "loss": 0.0346, + "num_input_tokens_seen": 148085040, + "step": 121695 + }, + { + "epoch": 13.553847867245796, + "grad_norm": 0.6411731243133545, + "learning_rate": 1.422250953365468e-05, + "loss": 0.0725, + "num_input_tokens_seen": 148090896, + "step": 121700 + }, + { + "epoch": 13.554404722129412, + "grad_norm": 0.014885993674397469, + "learning_rate": 1.4220317221607082e-05, + "loss": 0.0631, + "num_input_tokens_seen": 148096944, + "step": 121705 + }, + { + "epoch": 13.55496157701303, + "grad_norm": 0.00024585664505138993, + "learning_rate": 1.4218125011382236e-05, + "loss": 0.0834, + "num_input_tokens_seen": 148103280, + "step": 121710 + }, + { + "epoch": 13.555518431896648, + "grad_norm": 0.2004055678844452, + "learning_rate": 1.4215932903000837e-05, + "loss": 0.048, + "num_input_tokens_seen": 148109072, + "step": 121715 + }, + { + "epoch": 13.556075286780265, + "grad_norm": 0.03583669289946556, + "learning_rate": 1.421374089648359e-05, + "loss": 0.0037, + "num_input_tokens_seen": 148115280, + "step": 121720 + }, + { + "epoch": 13.556632141663883, + "grad_norm": 0.21151436865329742, + "learning_rate": 1.4211548991851196e-05, + "loss": 0.0105, + "num_input_tokens_seen": 148121168, + "step": 121725 + }, + { + "epoch": 13.5571889965475, + "grad_norm": 0.8017001152038574, + "learning_rate": 1.4209357189124372e-05, + "loss": 0.0461, + "num_input_tokens_seen": 148127376, + "step": 121730 + }, + { + "epoch": 13.557745851431116, + "grad_norm": 0.051479075103998184, + "learning_rate": 1.4207165488323814e-05, + "loss": 0.0074, + "num_input_tokens_seen": 148133680, + "step": 121735 + }, + { + "epoch": 13.558302706314734, + "grad_norm": 0.005729441996663809, + "learning_rate": 1.420497388947023e-05, + "loss": 0.0216, + "num_input_tokens_seen": 148139696, + "step": 121740 + }, + { + "epoch": 13.558859561198352, + "grad_norm": 0.9624790549278259, + "learning_rate": 1.4202782392584302e-05, + "loss": 0.0437, + "num_input_tokens_seen": 148146064, + "step": 121745 + }, + { + "epoch": 13.55941641608197, + "grad_norm": 1.087159514427185, + "learning_rate": 1.4200590997686758e-05, + "loss": 0.0793, + "num_input_tokens_seen": 148152144, + "step": 121750 + }, + { + "epoch": 13.559973270965587, + "grad_norm": 0.09519278258085251, + "learning_rate": 1.419839970479827e-05, + "loss": 0.1586, + "num_input_tokens_seen": 148158288, + "step": 121755 + }, + { + "epoch": 13.560530125849203, + "grad_norm": 1.678641438484192, + "learning_rate": 1.4196208513939573e-05, + "loss": 0.028, + "num_input_tokens_seen": 148164496, + "step": 121760 + }, + { + "epoch": 13.56108698073282, + "grad_norm": 0.0005868385196663439, + "learning_rate": 1.4194017425131323e-05, + "loss": 0.0068, + "num_input_tokens_seen": 148170384, + "step": 121765 + }, + { + "epoch": 13.561643835616438, + "grad_norm": 0.0004028152034152299, + "learning_rate": 1.4191826438394246e-05, + "loss": 0.0255, + "num_input_tokens_seen": 148176368, + "step": 121770 + }, + { + "epoch": 13.562200690500056, + "grad_norm": 0.0031995410099625587, + "learning_rate": 1.418963555374902e-05, + "loss": 0.0449, + "num_input_tokens_seen": 148181872, + "step": 121775 + }, + { + "epoch": 13.562757545383674, + "grad_norm": 1.3959914445877075, + "learning_rate": 1.4187444771216354e-05, + "loss": 0.0281, + "num_input_tokens_seen": 148188144, + "step": 121780 + }, + { + "epoch": 13.56331440026729, + "grad_norm": 0.9532381892204285, + "learning_rate": 1.4185254090816935e-05, + "loss": 0.0442, + "num_input_tokens_seen": 148194160, + "step": 121785 + }, + { + "epoch": 13.563871255150907, + "grad_norm": 0.12704333662986755, + "learning_rate": 1.418306351257146e-05, + "loss": 0.0036, + "num_input_tokens_seen": 148200304, + "step": 121790 + }, + { + "epoch": 13.564428110034525, + "grad_norm": 0.00043680184171535075, + "learning_rate": 1.4180873036500611e-05, + "loss": 0.0572, + "num_input_tokens_seen": 148206256, + "step": 121795 + }, + { + "epoch": 13.564984964918143, + "grad_norm": 0.10854438692331314, + "learning_rate": 1.4178682662625075e-05, + "loss": 0.0013, + "num_input_tokens_seen": 148212176, + "step": 121800 + }, + { + "epoch": 13.56554181980176, + "grad_norm": 0.14133992791175842, + "learning_rate": 1.4176492390965562e-05, + "loss": 0.0973, + "num_input_tokens_seen": 148218256, + "step": 121805 + }, + { + "epoch": 13.566098674685376, + "grad_norm": 0.0018607854144647717, + "learning_rate": 1.4174302221542751e-05, + "loss": 0.0045, + "num_input_tokens_seen": 148224592, + "step": 121810 + }, + { + "epoch": 13.566655529568994, + "grad_norm": 0.6334423422813416, + "learning_rate": 1.4172112154377332e-05, + "loss": 0.0054, + "num_input_tokens_seen": 148230608, + "step": 121815 + }, + { + "epoch": 13.567212384452612, + "grad_norm": 0.00020689652592409402, + "learning_rate": 1.4169922189489973e-05, + "loss": 0.0627, + "num_input_tokens_seen": 148236400, + "step": 121820 + }, + { + "epoch": 13.56776923933623, + "grad_norm": 1.6650915145874023, + "learning_rate": 1.416773232690139e-05, + "loss": 0.0646, + "num_input_tokens_seen": 148242704, + "step": 121825 + }, + { + "epoch": 13.568326094219847, + "grad_norm": 0.12940703332424164, + "learning_rate": 1.416554256663225e-05, + "loss": 0.0469, + "num_input_tokens_seen": 148249104, + "step": 121830 + }, + { + "epoch": 13.568882949103463, + "grad_norm": 0.6408736705780029, + "learning_rate": 1.4163352908703242e-05, + "loss": 0.0392, + "num_input_tokens_seen": 148254672, + "step": 121835 + }, + { + "epoch": 13.56943980398708, + "grad_norm": 0.645152747631073, + "learning_rate": 1.4161163353135044e-05, + "loss": 0.0577, + "num_input_tokens_seen": 148260976, + "step": 121840 + }, + { + "epoch": 13.569996658870698, + "grad_norm": 8.459921082248911e-05, + "learning_rate": 1.4158973899948345e-05, + "loss": 0.0337, + "num_input_tokens_seen": 148267024, + "step": 121845 + }, + { + "epoch": 13.570553513754316, + "grad_norm": 0.6255987286567688, + "learning_rate": 1.4156784549163816e-05, + "loss": 0.0081, + "num_input_tokens_seen": 148273136, + "step": 121850 + }, + { + "epoch": 13.571110368637934, + "grad_norm": 0.06632622331380844, + "learning_rate": 1.4154595300802153e-05, + "loss": 0.1298, + "num_input_tokens_seen": 148279312, + "step": 121855 + }, + { + "epoch": 13.571667223521551, + "grad_norm": 0.8621605038642883, + "learning_rate": 1.4152406154884027e-05, + "loss": 0.0218, + "num_input_tokens_seen": 148285104, + "step": 121860 + }, + { + "epoch": 13.572224078405167, + "grad_norm": 9.372428758069873e-05, + "learning_rate": 1.4150217111430114e-05, + "loss": 0.0016, + "num_input_tokens_seen": 148291376, + "step": 121865 + }, + { + "epoch": 13.572780933288785, + "grad_norm": 0.0033949571661651134, + "learning_rate": 1.4148028170461087e-05, + "loss": 0.0028, + "num_input_tokens_seen": 148297424, + "step": 121870 + }, + { + "epoch": 13.573337788172402, + "grad_norm": 0.43487316370010376, + "learning_rate": 1.4145839331997634e-05, + "loss": 0.0688, + "num_input_tokens_seen": 148303632, + "step": 121875 + }, + { + "epoch": 13.57389464305602, + "grad_norm": 0.00012255866022314876, + "learning_rate": 1.4143650596060429e-05, + "loss": 0.0646, + "num_input_tokens_seen": 148309776, + "step": 121880 + }, + { + "epoch": 13.574451497939638, + "grad_norm": 0.00834706611931324, + "learning_rate": 1.4141461962670138e-05, + "loss": 0.0506, + "num_input_tokens_seen": 148315888, + "step": 121885 + }, + { + "epoch": 13.575008352823254, + "grad_norm": 0.008437683805823326, + "learning_rate": 1.4139273431847434e-05, + "loss": 0.0103, + "num_input_tokens_seen": 148321968, + "step": 121890 + }, + { + "epoch": 13.575565207706871, + "grad_norm": 5.310293197631836, + "learning_rate": 1.4137085003612998e-05, + "loss": 0.1359, + "num_input_tokens_seen": 148328144, + "step": 121895 + }, + { + "epoch": 13.576122062590489, + "grad_norm": 1.77268385887146, + "learning_rate": 1.4134896677987492e-05, + "loss": 0.031, + "num_input_tokens_seen": 148334608, + "step": 121900 + }, + { + "epoch": 13.576678917474107, + "grad_norm": 0.11082767695188522, + "learning_rate": 1.4132708454991608e-05, + "loss": 0.0399, + "num_input_tokens_seen": 148340688, + "step": 121905 + }, + { + "epoch": 13.577235772357724, + "grad_norm": 0.7080255150794983, + "learning_rate": 1.413052033464598e-05, + "loss": 0.0613, + "num_input_tokens_seen": 148346544, + "step": 121910 + }, + { + "epoch": 13.57779262724134, + "grad_norm": 0.9392901062965393, + "learning_rate": 1.412833231697131e-05, + "loss": 0.0937, + "num_input_tokens_seen": 148352784, + "step": 121915 + }, + { + "epoch": 13.578349482124958, + "grad_norm": 1.2905479669570923, + "learning_rate": 1.4126144401988239e-05, + "loss": 0.1844, + "num_input_tokens_seen": 148358320, + "step": 121920 + }, + { + "epoch": 13.578906337008576, + "grad_norm": 0.44542714953422546, + "learning_rate": 1.4123956589717455e-05, + "loss": 0.0059, + "num_input_tokens_seen": 148364496, + "step": 121925 + }, + { + "epoch": 13.579463191892193, + "grad_norm": 0.038800448179244995, + "learning_rate": 1.4121768880179615e-05, + "loss": 0.0145, + "num_input_tokens_seen": 148370960, + "step": 121930 + }, + { + "epoch": 13.580020046775811, + "grad_norm": 1.1484969854354858, + "learning_rate": 1.4119581273395382e-05, + "loss": 0.0542, + "num_input_tokens_seen": 148377072, + "step": 121935 + }, + { + "epoch": 13.580576901659427, + "grad_norm": 1.5093237161636353, + "learning_rate": 1.4117393769385416e-05, + "loss": 0.0633, + "num_input_tokens_seen": 148383024, + "step": 121940 + }, + { + "epoch": 13.581133756543045, + "grad_norm": 2.7513158321380615, + "learning_rate": 1.4115206368170392e-05, + "loss": 0.0276, + "num_input_tokens_seen": 148389232, + "step": 121945 + }, + { + "epoch": 13.581690611426662, + "grad_norm": 0.04123237729072571, + "learning_rate": 1.4113019069770963e-05, + "loss": 0.0193, + "num_input_tokens_seen": 148395312, + "step": 121950 + }, + { + "epoch": 13.58224746631028, + "grad_norm": 1.0253281593322754, + "learning_rate": 1.4110831874207792e-05, + "loss": 0.1734, + "num_input_tokens_seen": 148400880, + "step": 121955 + }, + { + "epoch": 13.582804321193898, + "grad_norm": 0.6235867738723755, + "learning_rate": 1.410864478150153e-05, + "loss": 0.1398, + "num_input_tokens_seen": 148406672, + "step": 121960 + }, + { + "epoch": 13.583361176077513, + "grad_norm": 1.1970831155776978, + "learning_rate": 1.4106457791672853e-05, + "loss": 0.0601, + "num_input_tokens_seen": 148412624, + "step": 121965 + }, + { + "epoch": 13.583918030961131, + "grad_norm": 1.7692519426345825, + "learning_rate": 1.41042709047424e-05, + "loss": 0.0631, + "num_input_tokens_seen": 148418704, + "step": 121970 + }, + { + "epoch": 13.584474885844749, + "grad_norm": 0.16342249512672424, + "learning_rate": 1.4102084120730858e-05, + "loss": 0.05, + "num_input_tokens_seen": 148424816, + "step": 121975 + }, + { + "epoch": 13.585031740728367, + "grad_norm": 0.18099410831928253, + "learning_rate": 1.4099897439658843e-05, + "loss": 0.0373, + "num_input_tokens_seen": 148430896, + "step": 121980 + }, + { + "epoch": 13.585588595611984, + "grad_norm": 0.1595013439655304, + "learning_rate": 1.409771086154704e-05, + "loss": 0.0348, + "num_input_tokens_seen": 148436880, + "step": 121985 + }, + { + "epoch": 13.5861454504956, + "grad_norm": 0.10873007774353027, + "learning_rate": 1.4095524386416081e-05, + "loss": 0.0369, + "num_input_tokens_seen": 148442960, + "step": 121990 + }, + { + "epoch": 13.586702305379218, + "grad_norm": 0.0841837078332901, + "learning_rate": 1.4093338014286642e-05, + "loss": 0.0837, + "num_input_tokens_seen": 148449008, + "step": 121995 + }, + { + "epoch": 13.587259160262835, + "grad_norm": 0.19997087121009827, + "learning_rate": 1.4091151745179366e-05, + "loss": 0.0704, + "num_input_tokens_seen": 148455312, + "step": 122000 + }, + { + "epoch": 13.587816015146453, + "grad_norm": 1.7198628187179565, + "learning_rate": 1.4088965579114896e-05, + "loss": 0.0459, + "num_input_tokens_seen": 148461328, + "step": 122005 + }, + { + "epoch": 13.58837287003007, + "grad_norm": 0.00017651361122261733, + "learning_rate": 1.4086779516113883e-05, + "loss": 0.0249, + "num_input_tokens_seen": 148467632, + "step": 122010 + }, + { + "epoch": 13.588929724913687, + "grad_norm": 0.8934299349784851, + "learning_rate": 1.4084593556196987e-05, + "loss": 0.1069, + "num_input_tokens_seen": 148473584, + "step": 122015 + }, + { + "epoch": 13.589486579797304, + "grad_norm": 0.12412187457084656, + "learning_rate": 1.4082407699384854e-05, + "loss": 0.0453, + "num_input_tokens_seen": 148479536, + "step": 122020 + }, + { + "epoch": 13.590043434680922, + "grad_norm": 0.09939521551132202, + "learning_rate": 1.4080221945698125e-05, + "loss": 0.0024, + "num_input_tokens_seen": 148485808, + "step": 122025 + }, + { + "epoch": 13.59060028956454, + "grad_norm": 0.0017572533106431365, + "learning_rate": 1.4078036295157438e-05, + "loss": 0.036, + "num_input_tokens_seen": 148491312, + "step": 122030 + }, + { + "epoch": 13.591157144448157, + "grad_norm": 0.6290463805198669, + "learning_rate": 1.407585074778346e-05, + "loss": 0.0446, + "num_input_tokens_seen": 148497264, + "step": 122035 + }, + { + "epoch": 13.591713999331773, + "grad_norm": 0.28487515449523926, + "learning_rate": 1.4073665303596815e-05, + "loss": 0.0345, + "num_input_tokens_seen": 148503024, + "step": 122040 + }, + { + "epoch": 13.592270854215391, + "grad_norm": 0.0040563782677054405, + "learning_rate": 1.4071479962618172e-05, + "loss": 0.0448, + "num_input_tokens_seen": 148508752, + "step": 122045 + }, + { + "epoch": 13.592827709099009, + "grad_norm": 0.0008452960755676031, + "learning_rate": 1.4069294724868138e-05, + "loss": 0.0166, + "num_input_tokens_seen": 148515088, + "step": 122050 + }, + { + "epoch": 13.593384563982626, + "grad_norm": 0.7531645894050598, + "learning_rate": 1.4067109590367383e-05, + "loss": 0.0202, + "num_input_tokens_seen": 148521232, + "step": 122055 + }, + { + "epoch": 13.593941418866244, + "grad_norm": 0.515386164188385, + "learning_rate": 1.4064924559136527e-05, + "loss": 0.0693, + "num_input_tokens_seen": 148527216, + "step": 122060 + }, + { + "epoch": 13.59449827374986, + "grad_norm": 0.00039600199670530856, + "learning_rate": 1.4062739631196232e-05, + "loss": 0.0797, + "num_input_tokens_seen": 148533072, + "step": 122065 + }, + { + "epoch": 13.595055128633478, + "grad_norm": 0.20730647444725037, + "learning_rate": 1.4060554806567122e-05, + "loss": 0.0186, + "num_input_tokens_seen": 148539504, + "step": 122070 + }, + { + "epoch": 13.595611983517095, + "grad_norm": 0.020239003002643585, + "learning_rate": 1.4058370085269836e-05, + "loss": 0.0028, + "num_input_tokens_seen": 148545488, + "step": 122075 + }, + { + "epoch": 13.596168838400713, + "grad_norm": 0.0003496864519547671, + "learning_rate": 1.4056185467325e-05, + "loss": 0.0093, + "num_input_tokens_seen": 148551888, + "step": 122080 + }, + { + "epoch": 13.59672569328433, + "grad_norm": 0.010960050858557224, + "learning_rate": 1.4054000952753274e-05, + "loss": 0.0752, + "num_input_tokens_seen": 148557552, + "step": 122085 + }, + { + "epoch": 13.597282548167948, + "grad_norm": 0.08451348543167114, + "learning_rate": 1.4051816541575274e-05, + "loss": 0.1825, + "num_input_tokens_seen": 148563664, + "step": 122090 + }, + { + "epoch": 13.597839403051564, + "grad_norm": 0.4221630394458771, + "learning_rate": 1.4049632233811644e-05, + "loss": 0.0719, + "num_input_tokens_seen": 148569680, + "step": 122095 + }, + { + "epoch": 13.598396257935182, + "grad_norm": 9.73918431554921e-05, + "learning_rate": 1.4047448029482996e-05, + "loss": 0.0623, + "num_input_tokens_seen": 148575920, + "step": 122100 + }, + { + "epoch": 13.5989531128188, + "grad_norm": 1.0885612964630127, + "learning_rate": 1.4045263928609987e-05, + "loss": 0.1265, + "num_input_tokens_seen": 148581744, + "step": 122105 + }, + { + "epoch": 13.599509967702417, + "grad_norm": 0.5541732311248779, + "learning_rate": 1.404307993121323e-05, + "loss": 0.0278, + "num_input_tokens_seen": 148588016, + "step": 122110 + }, + { + "epoch": 13.600066822586035, + "grad_norm": 0.04483669996261597, + "learning_rate": 1.4040896037313367e-05, + "loss": 0.0084, + "num_input_tokens_seen": 148594288, + "step": 122115 + }, + { + "epoch": 13.60062367746965, + "grad_norm": 0.0005806115223094821, + "learning_rate": 1.4038712246931024e-05, + "loss": 0.0938, + "num_input_tokens_seen": 148600592, + "step": 122120 + }, + { + "epoch": 13.601180532353268, + "grad_norm": 0.0003704124828800559, + "learning_rate": 1.4036528560086826e-05, + "loss": 0.0299, + "num_input_tokens_seen": 148606224, + "step": 122125 + }, + { + "epoch": 13.601737387236886, + "grad_norm": 0.04714392498135567, + "learning_rate": 1.4034344976801389e-05, + "loss": 0.0021, + "num_input_tokens_seen": 148612336, + "step": 122130 + }, + { + "epoch": 13.602294242120504, + "grad_norm": 0.42053890228271484, + "learning_rate": 1.4032161497095359e-05, + "loss": 0.0189, + "num_input_tokens_seen": 148618672, + "step": 122135 + }, + { + "epoch": 13.602851097004121, + "grad_norm": 0.019466130062937737, + "learning_rate": 1.402997812098935e-05, + "loss": 0.0195, + "num_input_tokens_seen": 148624848, + "step": 122140 + }, + { + "epoch": 13.603407951887737, + "grad_norm": 0.000688571366481483, + "learning_rate": 1.402779484850399e-05, + "loss": 0.0145, + "num_input_tokens_seen": 148630992, + "step": 122145 + }, + { + "epoch": 13.603964806771355, + "grad_norm": 0.4622628688812256, + "learning_rate": 1.402561167965989e-05, + "loss": 0.0148, + "num_input_tokens_seen": 148637104, + "step": 122150 + }, + { + "epoch": 13.604521661654973, + "grad_norm": 0.19176225364208221, + "learning_rate": 1.4023428614477685e-05, + "loss": 0.1012, + "num_input_tokens_seen": 148643120, + "step": 122155 + }, + { + "epoch": 13.60507851653859, + "grad_norm": 0.0035022215452045202, + "learning_rate": 1.4021245652977982e-05, + "loss": 0.0098, + "num_input_tokens_seen": 148649168, + "step": 122160 + }, + { + "epoch": 13.605635371422208, + "grad_norm": 0.005294800736010075, + "learning_rate": 1.4019062795181431e-05, + "loss": 0.014, + "num_input_tokens_seen": 148655472, + "step": 122165 + }, + { + "epoch": 13.606192226305824, + "grad_norm": 0.15734492242336273, + "learning_rate": 1.401688004110861e-05, + "loss": 0.2396, + "num_input_tokens_seen": 148661744, + "step": 122170 + }, + { + "epoch": 13.606749081189442, + "grad_norm": 0.07999441027641296, + "learning_rate": 1.4014697390780163e-05, + "loss": 0.0525, + "num_input_tokens_seen": 148668368, + "step": 122175 + }, + { + "epoch": 13.60730593607306, + "grad_norm": 0.053840719163417816, + "learning_rate": 1.4012514844216695e-05, + "loss": 0.0239, + "num_input_tokens_seen": 148674448, + "step": 122180 + }, + { + "epoch": 13.607862790956677, + "grad_norm": 0.006272013299167156, + "learning_rate": 1.4010332401438836e-05, + "loss": 0.0426, + "num_input_tokens_seen": 148680112, + "step": 122185 + }, + { + "epoch": 13.608419645840295, + "grad_norm": 0.1869121789932251, + "learning_rate": 1.400815006246719e-05, + "loss": 0.0107, + "num_input_tokens_seen": 148686096, + "step": 122190 + }, + { + "epoch": 13.60897650072391, + "grad_norm": 0.19072595238685608, + "learning_rate": 1.4005967827322374e-05, + "loss": 0.0528, + "num_input_tokens_seen": 148691984, + "step": 122195 + }, + { + "epoch": 13.609533355607528, + "grad_norm": 0.04872073233127594, + "learning_rate": 1.4003785696025001e-05, + "loss": 0.0024, + "num_input_tokens_seen": 148698032, + "step": 122200 + }, + { + "epoch": 13.610090210491146, + "grad_norm": 1.6275113821029663, + "learning_rate": 1.4001603668595675e-05, + "loss": 0.0638, + "num_input_tokens_seen": 148703984, + "step": 122205 + }, + { + "epoch": 13.610647065374764, + "grad_norm": 2.499929904937744, + "learning_rate": 1.399942174505502e-05, + "loss": 0.0541, + "num_input_tokens_seen": 148710224, + "step": 122210 + }, + { + "epoch": 13.611203920258381, + "grad_norm": 0.00034818367566913366, + "learning_rate": 1.3997239925423641e-05, + "loss": 0.0321, + "num_input_tokens_seen": 148716400, + "step": 122215 + }, + { + "epoch": 13.611760775141999, + "grad_norm": 0.6128861904144287, + "learning_rate": 1.3995058209722145e-05, + "loss": 0.1114, + "num_input_tokens_seen": 148722096, + "step": 122220 + }, + { + "epoch": 13.612317630025615, + "grad_norm": 0.4055000841617584, + "learning_rate": 1.3992876597971133e-05, + "loss": 0.0152, + "num_input_tokens_seen": 148728400, + "step": 122225 + }, + { + "epoch": 13.612874484909232, + "grad_norm": 9.837372635956854e-05, + "learning_rate": 1.399069509019123e-05, + "loss": 0.0389, + "num_input_tokens_seen": 148734480, + "step": 122230 + }, + { + "epoch": 13.61343133979285, + "grad_norm": 1.127042531967163, + "learning_rate": 1.3988513686403034e-05, + "loss": 0.0999, + "num_input_tokens_seen": 148740304, + "step": 122235 + }, + { + "epoch": 13.613988194676468, + "grad_norm": 1.5366251468658447, + "learning_rate": 1.398633238662715e-05, + "loss": 0.0575, + "num_input_tokens_seen": 148746576, + "step": 122240 + }, + { + "epoch": 13.614545049560085, + "grad_norm": 0.0006569554097950459, + "learning_rate": 1.3984151190884165e-05, + "loss": 0.01, + "num_input_tokens_seen": 148752944, + "step": 122245 + }, + { + "epoch": 13.615101904443701, + "grad_norm": 0.03680967539548874, + "learning_rate": 1.3981970099194711e-05, + "loss": 0.0007, + "num_input_tokens_seen": 148759184, + "step": 122250 + }, + { + "epoch": 13.615658759327319, + "grad_norm": 0.8188788294792175, + "learning_rate": 1.3979789111579367e-05, + "loss": 0.0844, + "num_input_tokens_seen": 148765104, + "step": 122255 + }, + { + "epoch": 13.616215614210937, + "grad_norm": 0.3469376266002655, + "learning_rate": 1.3977608228058752e-05, + "loss": 0.0319, + "num_input_tokens_seen": 148771376, + "step": 122260 + }, + { + "epoch": 13.616772469094554, + "grad_norm": 1.8146955966949463, + "learning_rate": 1.3975427448653461e-05, + "loss": 0.1324, + "num_input_tokens_seen": 148776880, + "step": 122265 + }, + { + "epoch": 13.617329323978172, + "grad_norm": 0.5745969414710999, + "learning_rate": 1.3973246773384086e-05, + "loss": 0.0263, + "num_input_tokens_seen": 148782832, + "step": 122270 + }, + { + "epoch": 13.617886178861788, + "grad_norm": 0.6570839285850525, + "learning_rate": 1.3971066202271223e-05, + "loss": 0.0329, + "num_input_tokens_seen": 148788368, + "step": 122275 + }, + { + "epoch": 13.618443033745406, + "grad_norm": 0.006952754221856594, + "learning_rate": 1.3968885735335485e-05, + "loss": 0.0518, + "num_input_tokens_seen": 148794736, + "step": 122280 + }, + { + "epoch": 13.618999888629023, + "grad_norm": 0.06382207572460175, + "learning_rate": 1.396670537259746e-05, + "loss": 0.0057, + "num_input_tokens_seen": 148800688, + "step": 122285 + }, + { + "epoch": 13.619556743512641, + "grad_norm": 0.35773032903671265, + "learning_rate": 1.3964525114077745e-05, + "loss": 0.0632, + "num_input_tokens_seen": 148806832, + "step": 122290 + }, + { + "epoch": 13.620113598396259, + "grad_norm": 0.001596724265255034, + "learning_rate": 1.396234495979692e-05, + "loss": 0.0271, + "num_input_tokens_seen": 148813200, + "step": 122295 + }, + { + "epoch": 13.620670453279875, + "grad_norm": 0.004032611846923828, + "learning_rate": 1.3960164909775597e-05, + "loss": 0.0342, + "num_input_tokens_seen": 148819280, + "step": 122300 + }, + { + "epoch": 13.621227308163492, + "grad_norm": 9.981280891224742e-05, + "learning_rate": 1.3957984964034354e-05, + "loss": 0.0277, + "num_input_tokens_seen": 148825296, + "step": 122305 + }, + { + "epoch": 13.62178416304711, + "grad_norm": 0.07000650465488434, + "learning_rate": 1.3955805122593809e-05, + "loss": 0.0254, + "num_input_tokens_seen": 148831024, + "step": 122310 + }, + { + "epoch": 13.622341017930728, + "grad_norm": 0.03824908658862114, + "learning_rate": 1.3953625385474514e-05, + "loss": 0.0714, + "num_input_tokens_seen": 148837232, + "step": 122315 + }, + { + "epoch": 13.622897872814345, + "grad_norm": 1.155684232711792, + "learning_rate": 1.3951445752697087e-05, + "loss": 0.0592, + "num_input_tokens_seen": 148843632, + "step": 122320 + }, + { + "epoch": 13.623454727697961, + "grad_norm": 0.08409114927053452, + "learning_rate": 1.3949266224282097e-05, + "loss": 0.0048, + "num_input_tokens_seen": 148849744, + "step": 122325 + }, + { + "epoch": 13.624011582581579, + "grad_norm": 3.5887608528137207, + "learning_rate": 1.3947086800250153e-05, + "loss": 0.1959, + "num_input_tokens_seen": 148855984, + "step": 122330 + }, + { + "epoch": 13.624568437465197, + "grad_norm": 7.519587234128267e-05, + "learning_rate": 1.3944907480621827e-05, + "loss": 0.1624, + "num_input_tokens_seen": 148861936, + "step": 122335 + }, + { + "epoch": 13.625125292348814, + "grad_norm": 0.04798414558172226, + "learning_rate": 1.3942728265417707e-05, + "loss": 0.002, + "num_input_tokens_seen": 148868240, + "step": 122340 + }, + { + "epoch": 13.625682147232432, + "grad_norm": 0.0021936907432973385, + "learning_rate": 1.3940549154658367e-05, + "loss": 0.0013, + "num_input_tokens_seen": 148874640, + "step": 122345 + }, + { + "epoch": 13.626239002116048, + "grad_norm": 0.27551591396331787, + "learning_rate": 1.3938370148364414e-05, + "loss": 0.0522, + "num_input_tokens_seen": 148879952, + "step": 122350 + }, + { + "epoch": 13.626795856999665, + "grad_norm": 0.004607808776199818, + "learning_rate": 1.3936191246556413e-05, + "loss": 0.015, + "num_input_tokens_seen": 148886128, + "step": 122355 + }, + { + "epoch": 13.627352711883283, + "grad_norm": 0.06941675394773483, + "learning_rate": 1.3934012449254952e-05, + "loss": 0.006, + "num_input_tokens_seen": 148892272, + "step": 122360 + }, + { + "epoch": 13.6279095667669, + "grad_norm": 0.12702886760234833, + "learning_rate": 1.39318337564806e-05, + "loss": 0.1286, + "num_input_tokens_seen": 148898032, + "step": 122365 + }, + { + "epoch": 13.628466421650518, + "grad_norm": 0.20387670397758484, + "learning_rate": 1.3929655168253957e-05, + "loss": 0.0048, + "num_input_tokens_seen": 148903952, + "step": 122370 + }, + { + "epoch": 13.629023276534134, + "grad_norm": 0.195975661277771, + "learning_rate": 1.3927476684595578e-05, + "loss": 0.0361, + "num_input_tokens_seen": 148909904, + "step": 122375 + }, + { + "epoch": 13.629580131417752, + "grad_norm": 0.1773190200328827, + "learning_rate": 1.3925298305526075e-05, + "loss": 0.0153, + "num_input_tokens_seen": 148915952, + "step": 122380 + }, + { + "epoch": 13.63013698630137, + "grad_norm": 0.039500780403614044, + "learning_rate": 1.3923120031065979e-05, + "loss": 0.0325, + "num_input_tokens_seen": 148922096, + "step": 122385 + }, + { + "epoch": 13.630693841184987, + "grad_norm": 0.38321539759635925, + "learning_rate": 1.3920941861235904e-05, + "loss": 0.0051, + "num_input_tokens_seen": 148928144, + "step": 122390 + }, + { + "epoch": 13.631250696068605, + "grad_norm": 0.00041073429747484624, + "learning_rate": 1.3918763796056394e-05, + "loss": 0.1066, + "num_input_tokens_seen": 148933968, + "step": 122395 + }, + { + "epoch": 13.631807550952221, + "grad_norm": 2.8238091468811035, + "learning_rate": 1.3916585835548052e-05, + "loss": 0.0628, + "num_input_tokens_seen": 148939952, + "step": 122400 + }, + { + "epoch": 13.632364405835839, + "grad_norm": 0.18897077441215515, + "learning_rate": 1.3914407979731434e-05, + "loss": 0.0777, + "num_input_tokens_seen": 148945616, + "step": 122405 + }, + { + "epoch": 13.632921260719456, + "grad_norm": 0.7434444427490234, + "learning_rate": 1.3912230228627116e-05, + "loss": 0.0251, + "num_input_tokens_seen": 148951504, + "step": 122410 + }, + { + "epoch": 13.633478115603074, + "grad_norm": 0.005154459737241268, + "learning_rate": 1.3910052582255657e-05, + "loss": 0.0453, + "num_input_tokens_seen": 148957776, + "step": 122415 + }, + { + "epoch": 13.634034970486692, + "grad_norm": 2.334702730178833, + "learning_rate": 1.3907875040637647e-05, + "loss": 0.0977, + "num_input_tokens_seen": 148963536, + "step": 122420 + }, + { + "epoch": 13.634591825370308, + "grad_norm": 0.004533255938440561, + "learning_rate": 1.3905697603793641e-05, + "loss": 0.0383, + "num_input_tokens_seen": 148969552, + "step": 122425 + }, + { + "epoch": 13.635148680253925, + "grad_norm": 0.38281863927841187, + "learning_rate": 1.3903520271744214e-05, + "loss": 0.0093, + "num_input_tokens_seen": 148975600, + "step": 122430 + }, + { + "epoch": 13.635705535137543, + "grad_norm": 0.0018131268443539739, + "learning_rate": 1.3901343044509912e-05, + "loss": 0.0117, + "num_input_tokens_seen": 148981520, + "step": 122435 + }, + { + "epoch": 13.63626239002116, + "grad_norm": 0.10112326592206955, + "learning_rate": 1.3899165922111335e-05, + "loss": 0.024, + "num_input_tokens_seen": 148986768, + "step": 122440 + }, + { + "epoch": 13.636819244904778, + "grad_norm": 0.010308519005775452, + "learning_rate": 1.3896988904569014e-05, + "loss": 0.0364, + "num_input_tokens_seen": 148993104, + "step": 122445 + }, + { + "epoch": 13.637376099788396, + "grad_norm": 0.004059187136590481, + "learning_rate": 1.389481199190355e-05, + "loss": 0.0425, + "num_input_tokens_seen": 148999280, + "step": 122450 + }, + { + "epoch": 13.637932954672012, + "grad_norm": 0.04769459739327431, + "learning_rate": 1.3892635184135466e-05, + "loss": 0.0105, + "num_input_tokens_seen": 149005328, + "step": 122455 + }, + { + "epoch": 13.63848980955563, + "grad_norm": 0.03202463313937187, + "learning_rate": 1.3890458481285347e-05, + "loss": 0.0061, + "num_input_tokens_seen": 149010928, + "step": 122460 + }, + { + "epoch": 13.639046664439247, + "grad_norm": 0.05635354295372963, + "learning_rate": 1.3888281883373744e-05, + "loss": 0.0129, + "num_input_tokens_seen": 149017360, + "step": 122465 + }, + { + "epoch": 13.639603519322865, + "grad_norm": 0.5138768553733826, + "learning_rate": 1.3886105390421227e-05, + "loss": 0.021, + "num_input_tokens_seen": 149022928, + "step": 122470 + }, + { + "epoch": 13.640160374206483, + "grad_norm": 0.11997682601213455, + "learning_rate": 1.388392900244835e-05, + "loss": 0.0026, + "num_input_tokens_seen": 149028784, + "step": 122475 + }, + { + "epoch": 13.640717229090098, + "grad_norm": 1.2131471633911133, + "learning_rate": 1.388175271947567e-05, + "loss": 0.1183, + "num_input_tokens_seen": 149034832, + "step": 122480 + }, + { + "epoch": 13.641274083973716, + "grad_norm": 0.06537909060716629, + "learning_rate": 1.3879576541523736e-05, + "loss": 0.1882, + "num_input_tokens_seen": 149040560, + "step": 122485 + }, + { + "epoch": 13.641830938857334, + "grad_norm": 0.044677697122097015, + "learning_rate": 1.3877400468613116e-05, + "loss": 0.0138, + "num_input_tokens_seen": 149046800, + "step": 122490 + }, + { + "epoch": 13.642387793740951, + "grad_norm": 0.9992215633392334, + "learning_rate": 1.3875224500764363e-05, + "loss": 0.0212, + "num_input_tokens_seen": 149053104, + "step": 122495 + }, + { + "epoch": 13.64294464862457, + "grad_norm": 0.231882706284523, + "learning_rate": 1.3873048637998029e-05, + "loss": 0.0669, + "num_input_tokens_seen": 149059312, + "step": 122500 + }, + { + "epoch": 13.643501503508185, + "grad_norm": 0.7527369260787964, + "learning_rate": 1.387087288033465e-05, + "loss": 0.0942, + "num_input_tokens_seen": 149065552, + "step": 122505 + }, + { + "epoch": 13.644058358391803, + "grad_norm": 0.004413588438183069, + "learning_rate": 1.3868697227794808e-05, + "loss": 0.0721, + "num_input_tokens_seen": 149071280, + "step": 122510 + }, + { + "epoch": 13.64461521327542, + "grad_norm": 1.4137026071548462, + "learning_rate": 1.3866521680399031e-05, + "loss": 0.0782, + "num_input_tokens_seen": 149077296, + "step": 122515 + }, + { + "epoch": 13.645172068159038, + "grad_norm": 0.08888550102710724, + "learning_rate": 1.386434623816788e-05, + "loss": 0.0851, + "num_input_tokens_seen": 149083248, + "step": 122520 + }, + { + "epoch": 13.645728923042656, + "grad_norm": 0.013926742598414421, + "learning_rate": 1.3862170901121907e-05, + "loss": 0.0095, + "num_input_tokens_seen": 149089584, + "step": 122525 + }, + { + "epoch": 13.646285777926272, + "grad_norm": 1.1693073511123657, + "learning_rate": 1.3859995669281651e-05, + "loss": 0.0821, + "num_input_tokens_seen": 149095696, + "step": 122530 + }, + { + "epoch": 13.64684263280989, + "grad_norm": 1.5119746923446655, + "learning_rate": 1.3857820542667649e-05, + "loss": 0.0842, + "num_input_tokens_seen": 149101392, + "step": 122535 + }, + { + "epoch": 13.647399487693507, + "grad_norm": 0.0669514387845993, + "learning_rate": 1.3855645521300469e-05, + "loss": 0.0201, + "num_input_tokens_seen": 149107504, + "step": 122540 + }, + { + "epoch": 13.647956342577125, + "grad_norm": 5.686347961425781, + "learning_rate": 1.3853470605200646e-05, + "loss": 0.066, + "num_input_tokens_seen": 149113296, + "step": 122545 + }, + { + "epoch": 13.648513197460742, + "grad_norm": 0.011523409746587276, + "learning_rate": 1.3851295794388725e-05, + "loss": 0.0006, + "num_input_tokens_seen": 149119280, + "step": 122550 + }, + { + "epoch": 13.64907005234436, + "grad_norm": 0.048725128173828125, + "learning_rate": 1.3849121088885237e-05, + "loss": 0.0039, + "num_input_tokens_seen": 149125552, + "step": 122555 + }, + { + "epoch": 13.649626907227976, + "grad_norm": 0.018244393169879913, + "learning_rate": 1.3846946488710743e-05, + "loss": 0.0193, + "num_input_tokens_seen": 149131600, + "step": 122560 + }, + { + "epoch": 13.650183762111594, + "grad_norm": 0.04737585037946701, + "learning_rate": 1.3844771993885769e-05, + "loss": 0.0065, + "num_input_tokens_seen": 149137936, + "step": 122565 + }, + { + "epoch": 13.650740616995211, + "grad_norm": 0.38733965158462524, + "learning_rate": 1.3842597604430878e-05, + "loss": 0.0428, + "num_input_tokens_seen": 149143376, + "step": 122570 + }, + { + "epoch": 13.651297471878829, + "grad_norm": 0.0055235312320292, + "learning_rate": 1.3840423320366572e-05, + "loss": 0.0385, + "num_input_tokens_seen": 149149552, + "step": 122575 + }, + { + "epoch": 13.651854326762447, + "grad_norm": 0.09019561111927032, + "learning_rate": 1.3838249141713416e-05, + "loss": 0.0848, + "num_input_tokens_seen": 149155216, + "step": 122580 + }, + { + "epoch": 13.652411181646062, + "grad_norm": 0.21421119570732117, + "learning_rate": 1.3836075068491932e-05, + "loss": 0.0801, + "num_input_tokens_seen": 149160688, + "step": 122585 + }, + { + "epoch": 13.65296803652968, + "grad_norm": 0.0004758255381602794, + "learning_rate": 1.3833901100722674e-05, + "loss": 0.0129, + "num_input_tokens_seen": 149166416, + "step": 122590 + }, + { + "epoch": 13.653524891413298, + "grad_norm": 0.014241334982216358, + "learning_rate": 1.3831727238426167e-05, + "loss": 0.0049, + "num_input_tokens_seen": 149172592, + "step": 122595 + }, + { + "epoch": 13.654081746296916, + "grad_norm": 0.07274696230888367, + "learning_rate": 1.3829553481622943e-05, + "loss": 0.0584, + "num_input_tokens_seen": 149178768, + "step": 122600 + }, + { + "epoch": 13.654638601180533, + "grad_norm": 0.003661931725218892, + "learning_rate": 1.3827379830333525e-05, + "loss": 0.0436, + "num_input_tokens_seen": 149184880, + "step": 122605 + }, + { + "epoch": 13.655195456064149, + "grad_norm": 0.006150940433144569, + "learning_rate": 1.3825206284578468e-05, + "loss": 0.0104, + "num_input_tokens_seen": 149191216, + "step": 122610 + }, + { + "epoch": 13.655752310947767, + "grad_norm": 0.17729271948337555, + "learning_rate": 1.3823032844378289e-05, + "loss": 0.0043, + "num_input_tokens_seen": 149197648, + "step": 122615 + }, + { + "epoch": 13.656309165831384, + "grad_norm": 0.8939270973205566, + "learning_rate": 1.382085950975352e-05, + "loss": 0.013, + "num_input_tokens_seen": 149203440, + "step": 122620 + }, + { + "epoch": 13.656866020715002, + "grad_norm": 0.11110720783472061, + "learning_rate": 1.3818686280724691e-05, + "loss": 0.0283, + "num_input_tokens_seen": 149209520, + "step": 122625 + }, + { + "epoch": 13.65742287559862, + "grad_norm": 0.1195034384727478, + "learning_rate": 1.3816513157312317e-05, + "loss": 0.1067, + "num_input_tokens_seen": 149215440, + "step": 122630 + }, + { + "epoch": 13.657979730482236, + "grad_norm": 0.23224623501300812, + "learning_rate": 1.3814340139536947e-05, + "loss": 0.0183, + "num_input_tokens_seen": 149222096, + "step": 122635 + }, + { + "epoch": 13.658536585365853, + "grad_norm": 0.05384933575987816, + "learning_rate": 1.3812167227419093e-05, + "loss": 0.0323, + "num_input_tokens_seen": 149228080, + "step": 122640 + }, + { + "epoch": 13.659093440249471, + "grad_norm": 0.018199754878878593, + "learning_rate": 1.3809994420979287e-05, + "loss": 0.0134, + "num_input_tokens_seen": 149234096, + "step": 122645 + }, + { + "epoch": 13.659650295133089, + "grad_norm": 0.0026713102124631405, + "learning_rate": 1.3807821720238037e-05, + "loss": 0.085, + "num_input_tokens_seen": 149239440, + "step": 122650 + }, + { + "epoch": 13.660207150016706, + "grad_norm": 0.007732139900326729, + "learning_rate": 1.3805649125215889e-05, + "loss": 0.0165, + "num_input_tokens_seen": 149245584, + "step": 122655 + }, + { + "epoch": 13.660764004900322, + "grad_norm": 0.2854421138763428, + "learning_rate": 1.3803476635933343e-05, + "loss": 0.0193, + "num_input_tokens_seen": 149251472, + "step": 122660 + }, + { + "epoch": 13.66132085978394, + "grad_norm": 0.4901633858680725, + "learning_rate": 1.380130425241094e-05, + "loss": 0.0179, + "num_input_tokens_seen": 149257392, + "step": 122665 + }, + { + "epoch": 13.661877714667558, + "grad_norm": 1.1050654649734497, + "learning_rate": 1.3799131974669194e-05, + "loss": 0.1161, + "num_input_tokens_seen": 149263568, + "step": 122670 + }, + { + "epoch": 13.662434569551175, + "grad_norm": 0.03107958287000656, + "learning_rate": 1.3796959802728616e-05, + "loss": 0.0661, + "num_input_tokens_seen": 149269392, + "step": 122675 + }, + { + "epoch": 13.662991424434793, + "grad_norm": 0.05362852290272713, + "learning_rate": 1.379478773660972e-05, + "loss": 0.0037, + "num_input_tokens_seen": 149275312, + "step": 122680 + }, + { + "epoch": 13.663548279318409, + "grad_norm": 0.10484138131141663, + "learning_rate": 1.379261577633304e-05, + "loss": 0.0758, + "num_input_tokens_seen": 149281168, + "step": 122685 + }, + { + "epoch": 13.664105134202027, + "grad_norm": 0.12729017436504364, + "learning_rate": 1.3790443921919088e-05, + "loss": 0.0083, + "num_input_tokens_seen": 149287248, + "step": 122690 + }, + { + "epoch": 13.664661989085644, + "grad_norm": 0.009289780631661415, + "learning_rate": 1.378827217338837e-05, + "loss": 0.0084, + "num_input_tokens_seen": 149293680, + "step": 122695 + }, + { + "epoch": 13.665218843969262, + "grad_norm": 0.0005636674468405545, + "learning_rate": 1.3786100530761392e-05, + "loss": 0.0927, + "num_input_tokens_seen": 149299536, + "step": 122700 + }, + { + "epoch": 13.66577569885288, + "grad_norm": 1.2462961673736572, + "learning_rate": 1.3783928994058692e-05, + "loss": 0.0966, + "num_input_tokens_seen": 149305040, + "step": 122705 + }, + { + "epoch": 13.666332553736495, + "grad_norm": 2.194998025894165, + "learning_rate": 1.3781757563300762e-05, + "loss": 0.0921, + "num_input_tokens_seen": 149310736, + "step": 122710 + }, + { + "epoch": 13.666889408620113, + "grad_norm": 5.9957135817967355e-05, + "learning_rate": 1.3779586238508135e-05, + "loss": 0.0378, + "num_input_tokens_seen": 149316912, + "step": 122715 + }, + { + "epoch": 13.66744626350373, + "grad_norm": 1.3172004222869873, + "learning_rate": 1.3777415019701287e-05, + "loss": 0.1979, + "num_input_tokens_seen": 149322640, + "step": 122720 + }, + { + "epoch": 13.668003118387348, + "grad_norm": 1.243898868560791, + "learning_rate": 1.3775243906900756e-05, + "loss": 0.0768, + "num_input_tokens_seen": 149328784, + "step": 122725 + }, + { + "epoch": 13.668559973270966, + "grad_norm": 4.892502784729004, + "learning_rate": 1.3773072900127026e-05, + "loss": 0.0908, + "num_input_tokens_seen": 149334928, + "step": 122730 + }, + { + "epoch": 13.669116828154582, + "grad_norm": 0.05447689816355705, + "learning_rate": 1.3770901999400632e-05, + "loss": 0.1141, + "num_input_tokens_seen": 149341072, + "step": 122735 + }, + { + "epoch": 13.6696736830382, + "grad_norm": 1.3817100524902344, + "learning_rate": 1.3768731204742064e-05, + "loss": 0.0183, + "num_input_tokens_seen": 149347376, + "step": 122740 + }, + { + "epoch": 13.670230537921817, + "grad_norm": 0.00013699090050067753, + "learning_rate": 1.3766560516171827e-05, + "loss": 0.0359, + "num_input_tokens_seen": 149353776, + "step": 122745 + }, + { + "epoch": 13.670787392805435, + "grad_norm": 0.385747492313385, + "learning_rate": 1.3764389933710416e-05, + "loss": 0.0287, + "num_input_tokens_seen": 149360048, + "step": 122750 + }, + { + "epoch": 13.671344247689053, + "grad_norm": 0.7818925380706787, + "learning_rate": 1.3762219457378356e-05, + "loss": 0.0155, + "num_input_tokens_seen": 149366032, + "step": 122755 + }, + { + "epoch": 13.671901102572669, + "grad_norm": 2.1542675495147705, + "learning_rate": 1.3760049087196136e-05, + "loss": 0.1146, + "num_input_tokens_seen": 149371536, + "step": 122760 + }, + { + "epoch": 13.672457957456286, + "grad_norm": 0.008444314822554588, + "learning_rate": 1.3757878823184256e-05, + "loss": 0.078, + "num_input_tokens_seen": 149377616, + "step": 122765 + }, + { + "epoch": 13.673014812339904, + "grad_norm": 0.006547962781041861, + "learning_rate": 1.375570866536321e-05, + "loss": 0.0433, + "num_input_tokens_seen": 149384016, + "step": 122770 + }, + { + "epoch": 13.673571667223522, + "grad_norm": 1.1376179456710815, + "learning_rate": 1.3753538613753511e-05, + "loss": 0.0699, + "num_input_tokens_seen": 149390192, + "step": 122775 + }, + { + "epoch": 13.67412852210714, + "grad_norm": 0.0002169292129110545, + "learning_rate": 1.3751368668375641e-05, + "loss": 0.0675, + "num_input_tokens_seen": 149396304, + "step": 122780 + }, + { + "epoch": 13.674685376990757, + "grad_norm": 1.715121865272522, + "learning_rate": 1.3749198829250129e-05, + "loss": 0.0279, + "num_input_tokens_seen": 149402512, + "step": 122785 + }, + { + "epoch": 13.675242231874373, + "grad_norm": 0.00044075088226236403, + "learning_rate": 1.3747029096397427e-05, + "loss": 0.0621, + "num_input_tokens_seen": 149408368, + "step": 122790 + }, + { + "epoch": 13.67579908675799, + "grad_norm": 0.0013379083247855306, + "learning_rate": 1.3744859469838062e-05, + "loss": 0.0024, + "num_input_tokens_seen": 149414544, + "step": 122795 + }, + { + "epoch": 13.676355941641608, + "grad_norm": 0.7152418494224548, + "learning_rate": 1.3742689949592503e-05, + "loss": 0.0098, + "num_input_tokens_seen": 149420624, + "step": 122800 + }, + { + "epoch": 13.676912796525226, + "grad_norm": 0.0004929110873490572, + "learning_rate": 1.3740520535681267e-05, + "loss": 0.0473, + "num_input_tokens_seen": 149426896, + "step": 122805 + }, + { + "epoch": 13.677469651408844, + "grad_norm": 0.04458121210336685, + "learning_rate": 1.3738351228124841e-05, + "loss": 0.0961, + "num_input_tokens_seen": 149432976, + "step": 122810 + }, + { + "epoch": 13.67802650629246, + "grad_norm": 0.13449202477931976, + "learning_rate": 1.3736182026943706e-05, + "loss": 0.0054, + "num_input_tokens_seen": 149438640, + "step": 122815 + }, + { + "epoch": 13.678583361176077, + "grad_norm": 0.1509920358657837, + "learning_rate": 1.3734012932158346e-05, + "loss": 0.0079, + "num_input_tokens_seen": 149444592, + "step": 122820 + }, + { + "epoch": 13.679140216059695, + "grad_norm": 0.49106118083000183, + "learning_rate": 1.3731843943789269e-05, + "loss": 0.0069, + "num_input_tokens_seen": 149450640, + "step": 122825 + }, + { + "epoch": 13.679697070943313, + "grad_norm": 0.07229981571435928, + "learning_rate": 1.3729675061856956e-05, + "loss": 0.0025, + "num_input_tokens_seen": 149456880, + "step": 122830 + }, + { + "epoch": 13.68025392582693, + "grad_norm": 0.00028123604715801775, + "learning_rate": 1.3727506286381892e-05, + "loss": 0.0716, + "num_input_tokens_seen": 149463088, + "step": 122835 + }, + { + "epoch": 13.680810780710546, + "grad_norm": 0.00024136521096806973, + "learning_rate": 1.3725337617384553e-05, + "loss": 0.0055, + "num_input_tokens_seen": 149469072, + "step": 122840 + }, + { + "epoch": 13.681367635594164, + "grad_norm": 0.062093883752822876, + "learning_rate": 1.3723169054885442e-05, + "loss": 0.0027, + "num_input_tokens_seen": 149475024, + "step": 122845 + }, + { + "epoch": 13.681924490477781, + "grad_norm": 0.040552664548158646, + "learning_rate": 1.3721000598905023e-05, + "loss": 0.0254, + "num_input_tokens_seen": 149481232, + "step": 122850 + }, + { + "epoch": 13.6824813453614, + "grad_norm": 2.284350872039795, + "learning_rate": 1.3718832249463802e-05, + "loss": 0.0807, + "num_input_tokens_seen": 149486832, + "step": 122855 + }, + { + "epoch": 13.683038200245017, + "grad_norm": 0.01954207941889763, + "learning_rate": 1.3716664006582247e-05, + "loss": 0.0893, + "num_input_tokens_seen": 149493040, + "step": 122860 + }, + { + "epoch": 13.683595055128633, + "grad_norm": 0.005682850256562233, + "learning_rate": 1.371449587028084e-05, + "loss": 0.1202, + "num_input_tokens_seen": 149499184, + "step": 122865 + }, + { + "epoch": 13.68415191001225, + "grad_norm": 0.01271932665258646, + "learning_rate": 1.3712327840580055e-05, + "loss": 0.0961, + "num_input_tokens_seen": 149505360, + "step": 122870 + }, + { + "epoch": 13.684708764895868, + "grad_norm": 0.002177639864385128, + "learning_rate": 1.3710159917500384e-05, + "loss": 0.2946, + "num_input_tokens_seen": 149511312, + "step": 122875 + }, + { + "epoch": 13.685265619779486, + "grad_norm": 0.15391430258750916, + "learning_rate": 1.3707992101062301e-05, + "loss": 0.0495, + "num_input_tokens_seen": 149517488, + "step": 122880 + }, + { + "epoch": 13.685822474663103, + "grad_norm": 0.05028488487005234, + "learning_rate": 1.3705824391286276e-05, + "loss": 0.0169, + "num_input_tokens_seen": 149523760, + "step": 122885 + }, + { + "epoch": 13.68637932954672, + "grad_norm": 0.008296916261315346, + "learning_rate": 1.3703656788192779e-05, + "loss": 0.0401, + "num_input_tokens_seen": 149529808, + "step": 122890 + }, + { + "epoch": 13.686936184430337, + "grad_norm": 0.04819062724709511, + "learning_rate": 1.3701489291802306e-05, + "loss": 0.038, + "num_input_tokens_seen": 149535952, + "step": 122895 + }, + { + "epoch": 13.687493039313955, + "grad_norm": 0.007655916269868612, + "learning_rate": 1.3699321902135316e-05, + "loss": 0.0256, + "num_input_tokens_seen": 149542224, + "step": 122900 + }, + { + "epoch": 13.688049894197572, + "grad_norm": 0.04426029697060585, + "learning_rate": 1.3697154619212288e-05, + "loss": 0.044, + "num_input_tokens_seen": 149548464, + "step": 122905 + }, + { + "epoch": 13.68860674908119, + "grad_norm": 1.1513471603393555, + "learning_rate": 1.3694987443053674e-05, + "loss": 0.1364, + "num_input_tokens_seen": 149554320, + "step": 122910 + }, + { + "epoch": 13.689163603964808, + "grad_norm": 0.01839938387274742, + "learning_rate": 1.3692820373679976e-05, + "loss": 0.0035, + "num_input_tokens_seen": 149560336, + "step": 122915 + }, + { + "epoch": 13.689720458848424, + "grad_norm": 0.11163009703159332, + "learning_rate": 1.3690653411111643e-05, + "loss": 0.0226, + "num_input_tokens_seen": 149566704, + "step": 122920 + }, + { + "epoch": 13.690277313732041, + "grad_norm": 0.07117791473865509, + "learning_rate": 1.368848655536915e-05, + "loss": 0.0281, + "num_input_tokens_seen": 149572912, + "step": 122925 + }, + { + "epoch": 13.690834168615659, + "grad_norm": 0.020282894372940063, + "learning_rate": 1.368631980647297e-05, + "loss": 0.0568, + "num_input_tokens_seen": 149579056, + "step": 122930 + }, + { + "epoch": 13.691391023499277, + "grad_norm": 0.21354888379573822, + "learning_rate": 1.3684153164443564e-05, + "loss": 0.1043, + "num_input_tokens_seen": 149585392, + "step": 122935 + }, + { + "epoch": 13.691947878382894, + "grad_norm": 0.00572314765304327, + "learning_rate": 1.3681986629301385e-05, + "loss": 0.0725, + "num_input_tokens_seen": 149591312, + "step": 122940 + }, + { + "epoch": 13.69250473326651, + "grad_norm": 0.09728933125734329, + "learning_rate": 1.3679820201066923e-05, + "loss": 0.0237, + "num_input_tokens_seen": 149597520, + "step": 122945 + }, + { + "epoch": 13.693061588150128, + "grad_norm": 0.010596605949103832, + "learning_rate": 1.3677653879760628e-05, + "loss": 0.0451, + "num_input_tokens_seen": 149603920, + "step": 122950 + }, + { + "epoch": 13.693618443033746, + "grad_norm": 0.00394520815461874, + "learning_rate": 1.3675487665402958e-05, + "loss": 0.0169, + "num_input_tokens_seen": 149610256, + "step": 122955 + }, + { + "epoch": 13.694175297917363, + "grad_norm": 1.4343607425689697, + "learning_rate": 1.3673321558014376e-05, + "loss": 0.0953, + "num_input_tokens_seen": 149616304, + "step": 122960 + }, + { + "epoch": 13.69473215280098, + "grad_norm": 0.8971444368362427, + "learning_rate": 1.3671155557615356e-05, + "loss": 0.0174, + "num_input_tokens_seen": 149622512, + "step": 122965 + }, + { + "epoch": 13.695289007684597, + "grad_norm": 0.8536898493766785, + "learning_rate": 1.366898966422634e-05, + "loss": 0.0181, + "num_input_tokens_seen": 149628400, + "step": 122970 + }, + { + "epoch": 13.695845862568214, + "grad_norm": 0.025149716064333916, + "learning_rate": 1.366682387786781e-05, + "loss": 0.0009, + "num_input_tokens_seen": 149634672, + "step": 122975 + }, + { + "epoch": 13.696402717451832, + "grad_norm": 0.0013495172606781125, + "learning_rate": 1.366465819856019e-05, + "loss": 0.0865, + "num_input_tokens_seen": 149641232, + "step": 122980 + }, + { + "epoch": 13.69695957233545, + "grad_norm": 2.909778118133545, + "learning_rate": 1.3662492626323967e-05, + "loss": 0.1265, + "num_input_tokens_seen": 149647376, + "step": 122985 + }, + { + "epoch": 13.697516427219067, + "grad_norm": 0.0008997763507068157, + "learning_rate": 1.3660327161179573e-05, + "loss": 0.0038, + "num_input_tokens_seen": 149653456, + "step": 122990 + }, + { + "epoch": 13.698073282102683, + "grad_norm": 0.06651023030281067, + "learning_rate": 1.3658161803147485e-05, + "loss": 0.0236, + "num_input_tokens_seen": 149659536, + "step": 122995 + }, + { + "epoch": 13.698630136986301, + "grad_norm": 1.3511706590652466, + "learning_rate": 1.3655996552248146e-05, + "loss": 0.0892, + "num_input_tokens_seen": 149666128, + "step": 123000 + }, + { + "epoch": 13.699186991869919, + "grad_norm": 1.2161262035369873, + "learning_rate": 1.3653831408502004e-05, + "loss": 0.0441, + "num_input_tokens_seen": 149672336, + "step": 123005 + }, + { + "epoch": 13.699743846753536, + "grad_norm": 0.20968100428581238, + "learning_rate": 1.3651666371929511e-05, + "loss": 0.0168, + "num_input_tokens_seen": 149678256, + "step": 123010 + }, + { + "epoch": 13.700300701637154, + "grad_norm": 1.429834246635437, + "learning_rate": 1.3649501442551127e-05, + "loss": 0.0573, + "num_input_tokens_seen": 149684400, + "step": 123015 + }, + { + "epoch": 13.70085755652077, + "grad_norm": 1.3258769512176514, + "learning_rate": 1.3647336620387297e-05, + "loss": 0.064, + "num_input_tokens_seen": 149689680, + "step": 123020 + }, + { + "epoch": 13.701414411404388, + "grad_norm": 0.020160941407084465, + "learning_rate": 1.364517190545847e-05, + "loss": 0.0749, + "num_input_tokens_seen": 149695952, + "step": 123025 + }, + { + "epoch": 13.701971266288005, + "grad_norm": 0.7370668053627014, + "learning_rate": 1.3643007297785087e-05, + "loss": 0.016, + "num_input_tokens_seen": 149702192, + "step": 123030 + }, + { + "epoch": 13.702528121171623, + "grad_norm": 0.02397943288087845, + "learning_rate": 1.3640842797387592e-05, + "loss": 0.1182, + "num_input_tokens_seen": 149708240, + "step": 123035 + }, + { + "epoch": 13.70308497605524, + "grad_norm": 0.03897898644208908, + "learning_rate": 1.3638678404286447e-05, + "loss": 0.032, + "num_input_tokens_seen": 149714544, + "step": 123040 + }, + { + "epoch": 13.703641830938857, + "grad_norm": 0.6729670166969299, + "learning_rate": 1.3636514118502092e-05, + "loss": 0.04, + "num_input_tokens_seen": 149720592, + "step": 123045 + }, + { + "epoch": 13.704198685822474, + "grad_norm": 0.06983073800802231, + "learning_rate": 1.3634349940054958e-05, + "loss": 0.0969, + "num_input_tokens_seen": 149726928, + "step": 123050 + }, + { + "epoch": 13.704755540706092, + "grad_norm": 0.3177623152732849, + "learning_rate": 1.3632185868965492e-05, + "loss": 0.0797, + "num_input_tokens_seen": 149732944, + "step": 123055 + }, + { + "epoch": 13.70531239558971, + "grad_norm": 0.009829475544393063, + "learning_rate": 1.3630021905254142e-05, + "loss": 0.0034, + "num_input_tokens_seen": 149739088, + "step": 123060 + }, + { + "epoch": 13.705869250473327, + "grad_norm": 0.003154332283884287, + "learning_rate": 1.3627858048941339e-05, + "loss": 0.0762, + "num_input_tokens_seen": 149745296, + "step": 123065 + }, + { + "epoch": 13.706426105356943, + "grad_norm": 0.1933005005121231, + "learning_rate": 1.3625694300047535e-05, + "loss": 0.0158, + "num_input_tokens_seen": 149751312, + "step": 123070 + }, + { + "epoch": 13.70698296024056, + "grad_norm": 0.031576354056596756, + "learning_rate": 1.3623530658593161e-05, + "loss": 0.0767, + "num_input_tokens_seen": 149757776, + "step": 123075 + }, + { + "epoch": 13.707539815124179, + "grad_norm": 0.0057596368715167046, + "learning_rate": 1.3621367124598654e-05, + "loss": 0.0043, + "num_input_tokens_seen": 149763248, + "step": 123080 + }, + { + "epoch": 13.708096670007796, + "grad_norm": 0.03794455528259277, + "learning_rate": 1.3619203698084443e-05, + "loss": 0.0075, + "num_input_tokens_seen": 149769296, + "step": 123085 + }, + { + "epoch": 13.708653524891414, + "grad_norm": 0.0006851739599369466, + "learning_rate": 1.3617040379070982e-05, + "loss": 0.0283, + "num_input_tokens_seen": 149775152, + "step": 123090 + }, + { + "epoch": 13.70921037977503, + "grad_norm": 0.1585317850112915, + "learning_rate": 1.3614877167578693e-05, + "loss": 0.0177, + "num_input_tokens_seen": 149781296, + "step": 123095 + }, + { + "epoch": 13.709767234658647, + "grad_norm": 0.00029836632893420756, + "learning_rate": 1.3612714063628013e-05, + "loss": 0.0028, + "num_input_tokens_seen": 149787536, + "step": 123100 + }, + { + "epoch": 13.710324089542265, + "grad_norm": 0.005254761315882206, + "learning_rate": 1.3610551067239358e-05, + "loss": 0.1516, + "num_input_tokens_seen": 149793392, + "step": 123105 + }, + { + "epoch": 13.710880944425883, + "grad_norm": 2.076106309890747, + "learning_rate": 1.3608388178433185e-05, + "loss": 0.0786, + "num_input_tokens_seen": 149799728, + "step": 123110 + }, + { + "epoch": 13.7114377993095, + "grad_norm": 1.238834023475647, + "learning_rate": 1.3606225397229904e-05, + "loss": 0.0544, + "num_input_tokens_seen": 149805936, + "step": 123115 + }, + { + "epoch": 13.711994654193116, + "grad_norm": 0.9779006242752075, + "learning_rate": 1.3604062723649971e-05, + "loss": 0.0451, + "num_input_tokens_seen": 149812240, + "step": 123120 + }, + { + "epoch": 13.712551509076734, + "grad_norm": 0.6385411620140076, + "learning_rate": 1.3601900157713777e-05, + "loss": 0.0335, + "num_input_tokens_seen": 149818224, + "step": 123125 + }, + { + "epoch": 13.713108363960352, + "grad_norm": 0.17148953676223755, + "learning_rate": 1.3599737699441779e-05, + "loss": 0.0092, + "num_input_tokens_seen": 149824112, + "step": 123130 + }, + { + "epoch": 13.71366521884397, + "grad_norm": 0.018592849373817444, + "learning_rate": 1.359757534885438e-05, + "loss": 0.1061, + "num_input_tokens_seen": 149829968, + "step": 123135 + }, + { + "epoch": 13.714222073727587, + "grad_norm": 0.016527999192476273, + "learning_rate": 1.3595413105972027e-05, + "loss": 0.0752, + "num_input_tokens_seen": 149836208, + "step": 123140 + }, + { + "epoch": 13.714778928611205, + "grad_norm": 0.0005825128173455596, + "learning_rate": 1.3593250970815136e-05, + "loss": 0.0269, + "num_input_tokens_seen": 149841968, + "step": 123145 + }, + { + "epoch": 13.71533578349482, + "grad_norm": 1.4404739141464233, + "learning_rate": 1.3591088943404126e-05, + "loss": 0.1307, + "num_input_tokens_seen": 149848048, + "step": 123150 + }, + { + "epoch": 13.715892638378438, + "grad_norm": 0.06113903596997261, + "learning_rate": 1.3588927023759413e-05, + "loss": 0.0173, + "num_input_tokens_seen": 149854224, + "step": 123155 + }, + { + "epoch": 13.716449493262056, + "grad_norm": 0.6907510161399841, + "learning_rate": 1.3586765211901437e-05, + "loss": 0.0239, + "num_input_tokens_seen": 149860560, + "step": 123160 + }, + { + "epoch": 13.717006348145674, + "grad_norm": 0.5412728786468506, + "learning_rate": 1.3584603507850608e-05, + "loss": 0.0264, + "num_input_tokens_seen": 149866576, + "step": 123165 + }, + { + "epoch": 13.717563203029291, + "grad_norm": 0.3000974953174591, + "learning_rate": 1.3582441911627344e-05, + "loss": 0.1143, + "num_input_tokens_seen": 149872592, + "step": 123170 + }, + { + "epoch": 13.718120057912907, + "grad_norm": 0.38711652159690857, + "learning_rate": 1.3580280423252053e-05, + "loss": 0.0067, + "num_input_tokens_seen": 149879216, + "step": 123175 + }, + { + "epoch": 13.718676912796525, + "grad_norm": 0.01408905629068613, + "learning_rate": 1.3578119042745174e-05, + "loss": 0.0046, + "num_input_tokens_seen": 149885424, + "step": 123180 + }, + { + "epoch": 13.719233767680143, + "grad_norm": 0.03564143180847168, + "learning_rate": 1.3575957770127102e-05, + "loss": 0.015, + "num_input_tokens_seen": 149891504, + "step": 123185 + }, + { + "epoch": 13.71979062256376, + "grad_norm": 0.5646983981132507, + "learning_rate": 1.3573796605418282e-05, + "loss": 0.0301, + "num_input_tokens_seen": 149897648, + "step": 123190 + }, + { + "epoch": 13.720347477447378, + "grad_norm": 0.004813872743397951, + "learning_rate": 1.3571635548639086e-05, + "loss": 0.0259, + "num_input_tokens_seen": 149903696, + "step": 123195 + }, + { + "epoch": 13.720904332330994, + "grad_norm": 0.7705828547477722, + "learning_rate": 1.3569474599809961e-05, + "loss": 0.0051, + "num_input_tokens_seen": 149910032, + "step": 123200 + }, + { + "epoch": 13.721461187214611, + "grad_norm": 0.00032641616417095065, + "learning_rate": 1.3567313758951294e-05, + "loss": 0.0198, + "num_input_tokens_seen": 149916304, + "step": 123205 + }, + { + "epoch": 13.72201804209823, + "grad_norm": 0.4887358844280243, + "learning_rate": 1.3565153026083519e-05, + "loss": 0.0101, + "num_input_tokens_seen": 149922160, + "step": 123210 + }, + { + "epoch": 13.722574896981847, + "grad_norm": 0.8564047813415527, + "learning_rate": 1.3562992401227034e-05, + "loss": 0.0393, + "num_input_tokens_seen": 149928240, + "step": 123215 + }, + { + "epoch": 13.723131751865465, + "grad_norm": 1.9842619895935059, + "learning_rate": 1.356083188440225e-05, + "loss": 0.0594, + "num_input_tokens_seen": 149934064, + "step": 123220 + }, + { + "epoch": 13.72368860674908, + "grad_norm": 0.07119347900152206, + "learning_rate": 1.3558671475629564e-05, + "loss": 0.0122, + "num_input_tokens_seen": 149939824, + "step": 123225 + }, + { + "epoch": 13.724245461632698, + "grad_norm": 1.0214518308639526, + "learning_rate": 1.3556511174929403e-05, + "loss": 0.0948, + "num_input_tokens_seen": 149945584, + "step": 123230 + }, + { + "epoch": 13.724802316516316, + "grad_norm": 0.15845470130443573, + "learning_rate": 1.3554350982322161e-05, + "loss": 0.0048, + "num_input_tokens_seen": 149951696, + "step": 123235 + }, + { + "epoch": 13.725359171399933, + "grad_norm": 1.1328125, + "learning_rate": 1.3552190897828246e-05, + "loss": 0.0773, + "num_input_tokens_seen": 149958096, + "step": 123240 + }, + { + "epoch": 13.725916026283551, + "grad_norm": 0.9765520095825195, + "learning_rate": 1.3550030921468049e-05, + "loss": 0.04, + "num_input_tokens_seen": 149964240, + "step": 123245 + }, + { + "epoch": 13.726472881167167, + "grad_norm": 0.5510620474815369, + "learning_rate": 1.3547871053261991e-05, + "loss": 0.1517, + "num_input_tokens_seen": 149970384, + "step": 123250 + }, + { + "epoch": 13.727029736050785, + "grad_norm": 0.02042691595852375, + "learning_rate": 1.354571129323046e-05, + "loss": 0.1213, + "num_input_tokens_seen": 149976464, + "step": 123255 + }, + { + "epoch": 13.727586590934402, + "grad_norm": 0.09969670325517654, + "learning_rate": 1.354355164139387e-05, + "loss": 0.0454, + "num_input_tokens_seen": 149982640, + "step": 123260 + }, + { + "epoch": 13.72814344581802, + "grad_norm": 0.09250790625810623, + "learning_rate": 1.3541392097772615e-05, + "loss": 0.0785, + "num_input_tokens_seen": 149988528, + "step": 123265 + }, + { + "epoch": 13.728700300701638, + "grad_norm": 0.14324098825454712, + "learning_rate": 1.3539232662387092e-05, + "loss": 0.0146, + "num_input_tokens_seen": 149994512, + "step": 123270 + }, + { + "epoch": 13.729257155585255, + "grad_norm": 4.040326118469238, + "learning_rate": 1.3537073335257688e-05, + "loss": 0.051, + "num_input_tokens_seen": 150000720, + "step": 123275 + }, + { + "epoch": 13.729814010468871, + "grad_norm": 1.5434294939041138, + "learning_rate": 1.353491411640482e-05, + "loss": 0.1223, + "num_input_tokens_seen": 150006640, + "step": 123280 + }, + { + "epoch": 13.730370865352489, + "grad_norm": 2.3937058448791504, + "learning_rate": 1.3532755005848873e-05, + "loss": 0.0917, + "num_input_tokens_seen": 150013008, + "step": 123285 + }, + { + "epoch": 13.730927720236107, + "grad_norm": 0.11669079959392548, + "learning_rate": 1.3530596003610247e-05, + "loss": 0.0258, + "num_input_tokens_seen": 150019088, + "step": 123290 + }, + { + "epoch": 13.731484575119724, + "grad_norm": 0.7073428630828857, + "learning_rate": 1.3528437109709319e-05, + "loss": 0.025, + "num_input_tokens_seen": 150025648, + "step": 123295 + }, + { + "epoch": 13.732041430003342, + "grad_norm": 0.8028949499130249, + "learning_rate": 1.35262783241665e-05, + "loss": 0.0154, + "num_input_tokens_seen": 150031600, + "step": 123300 + }, + { + "epoch": 13.732598284886958, + "grad_norm": 0.04304639995098114, + "learning_rate": 1.3524119647002168e-05, + "loss": 0.0029, + "num_input_tokens_seen": 150037840, + "step": 123305 + }, + { + "epoch": 13.733155139770576, + "grad_norm": 0.000256815372267738, + "learning_rate": 1.3521961078236739e-05, + "loss": 0.0238, + "num_input_tokens_seen": 150044048, + "step": 123310 + }, + { + "epoch": 13.733711994654193, + "grad_norm": 0.7052451968193054, + "learning_rate": 1.3519802617890565e-05, + "loss": 0.0335, + "num_input_tokens_seen": 150050320, + "step": 123315 + }, + { + "epoch": 13.734268849537811, + "grad_norm": 0.0628868043422699, + "learning_rate": 1.3517644265984059e-05, + "loss": 0.0484, + "num_input_tokens_seen": 150056592, + "step": 123320 + }, + { + "epoch": 13.734825704421429, + "grad_norm": 0.00047426731907762587, + "learning_rate": 1.3515486022537596e-05, + "loss": 0.0178, + "num_input_tokens_seen": 150062896, + "step": 123325 + }, + { + "epoch": 13.735382559305044, + "grad_norm": 0.43423548340797424, + "learning_rate": 1.351332788757158e-05, + "loss": 0.0905, + "num_input_tokens_seen": 150068880, + "step": 123330 + }, + { + "epoch": 13.735939414188662, + "grad_norm": 0.42415013909339905, + "learning_rate": 1.3511169861106382e-05, + "loss": 0.0248, + "num_input_tokens_seen": 150075024, + "step": 123335 + }, + { + "epoch": 13.73649626907228, + "grad_norm": 0.09968353062868118, + "learning_rate": 1.3509011943162392e-05, + "loss": 0.0132, + "num_input_tokens_seen": 150081264, + "step": 123340 + }, + { + "epoch": 13.737053123955898, + "grad_norm": 0.009393184445798397, + "learning_rate": 1.3506854133759977e-05, + "loss": 0.0346, + "num_input_tokens_seen": 150087408, + "step": 123345 + }, + { + "epoch": 13.737609978839515, + "grad_norm": 0.00047402127529494464, + "learning_rate": 1.3504696432919544e-05, + "loss": 0.0457, + "num_input_tokens_seen": 150093776, + "step": 123350 + }, + { + "epoch": 13.738166833723131, + "grad_norm": 2.0360918045043945, + "learning_rate": 1.3502538840661466e-05, + "loss": 0.0236, + "num_input_tokens_seen": 150100080, + "step": 123355 + }, + { + "epoch": 13.738723688606749, + "grad_norm": 0.04267209768295288, + "learning_rate": 1.350038135700612e-05, + "loss": 0.0153, + "num_input_tokens_seen": 150105936, + "step": 123360 + }, + { + "epoch": 13.739280543490366, + "grad_norm": 0.39930739998817444, + "learning_rate": 1.3498223981973873e-05, + "loss": 0.0143, + "num_input_tokens_seen": 150112176, + "step": 123365 + }, + { + "epoch": 13.739837398373984, + "grad_norm": 1.275228500366211, + "learning_rate": 1.3496066715585126e-05, + "loss": 0.1007, + "num_input_tokens_seen": 150118448, + "step": 123370 + }, + { + "epoch": 13.740394253257602, + "grad_norm": 0.8378891944885254, + "learning_rate": 1.3493909557860235e-05, + "loss": 0.0205, + "num_input_tokens_seen": 150124656, + "step": 123375 + }, + { + "epoch": 13.740951108141218, + "grad_norm": 0.00019187459838576615, + "learning_rate": 1.3491752508819607e-05, + "loss": 0.0329, + "num_input_tokens_seen": 150130928, + "step": 123380 + }, + { + "epoch": 13.741507963024835, + "grad_norm": 1.937743067741394, + "learning_rate": 1.348959556848358e-05, + "loss": 0.117, + "num_input_tokens_seen": 150137264, + "step": 123385 + }, + { + "epoch": 13.742064817908453, + "grad_norm": 1.3837475776672363, + "learning_rate": 1.3487438736872551e-05, + "loss": 0.0323, + "num_input_tokens_seen": 150143376, + "step": 123390 + }, + { + "epoch": 13.74262167279207, + "grad_norm": 1.4657436609268188, + "learning_rate": 1.3485282014006878e-05, + "loss": 0.0441, + "num_input_tokens_seen": 150149520, + "step": 123395 + }, + { + "epoch": 13.743178527675688, + "grad_norm": 0.8043844699859619, + "learning_rate": 1.3483125399906955e-05, + "loss": 0.016, + "num_input_tokens_seen": 150155632, + "step": 123400 + }, + { + "epoch": 13.743735382559304, + "grad_norm": 0.009669474326074123, + "learning_rate": 1.3480968894593135e-05, + "loss": 0.0057, + "num_input_tokens_seen": 150161648, + "step": 123405 + }, + { + "epoch": 13.744292237442922, + "grad_norm": 0.45752134919166565, + "learning_rate": 1.3478812498085793e-05, + "loss": 0.0087, + "num_input_tokens_seen": 150167440, + "step": 123410 + }, + { + "epoch": 13.74484909232654, + "grad_norm": 0.033868685364723206, + "learning_rate": 1.3476656210405292e-05, + "loss": 0.0256, + "num_input_tokens_seen": 150174000, + "step": 123415 + }, + { + "epoch": 13.745405947210157, + "grad_norm": 0.3326132893562317, + "learning_rate": 1.3474500031572012e-05, + "loss": 0.0381, + "num_input_tokens_seen": 150179824, + "step": 123420 + }, + { + "epoch": 13.745962802093775, + "grad_norm": 0.011056876741349697, + "learning_rate": 1.3472343961606312e-05, + "loss": 0.0107, + "num_input_tokens_seen": 150185840, + "step": 123425 + }, + { + "epoch": 13.74651965697739, + "grad_norm": 0.24258507788181305, + "learning_rate": 1.3470188000528561e-05, + "loss": 0.1, + "num_input_tokens_seen": 150191056, + "step": 123430 + }, + { + "epoch": 13.747076511861009, + "grad_norm": 1.0640987157821655, + "learning_rate": 1.3468032148359121e-05, + "loss": 0.0586, + "num_input_tokens_seen": 150197040, + "step": 123435 + }, + { + "epoch": 13.747633366744626, + "grad_norm": 0.20035481452941895, + "learning_rate": 1.3465876405118349e-05, + "loss": 0.0141, + "num_input_tokens_seen": 150202832, + "step": 123440 + }, + { + "epoch": 13.748190221628244, + "grad_norm": 0.03249150142073631, + "learning_rate": 1.3463720770826624e-05, + "loss": 0.043, + "num_input_tokens_seen": 150208944, + "step": 123445 + }, + { + "epoch": 13.748747076511862, + "grad_norm": 0.06986451148986816, + "learning_rate": 1.3461565245504298e-05, + "loss": 0.0552, + "num_input_tokens_seen": 150214896, + "step": 123450 + }, + { + "epoch": 13.749303931395477, + "grad_norm": 1.335252285003662, + "learning_rate": 1.3459409829171731e-05, + "loss": 0.0475, + "num_input_tokens_seen": 150221040, + "step": 123455 + }, + { + "epoch": 13.749860786279095, + "grad_norm": 0.16738128662109375, + "learning_rate": 1.345725452184928e-05, + "loss": 0.0224, + "num_input_tokens_seen": 150227152, + "step": 123460 + }, + { + "epoch": 13.750417641162713, + "grad_norm": 0.02183409593999386, + "learning_rate": 1.3455099323557312e-05, + "loss": 0.0037, + "num_input_tokens_seen": 150233168, + "step": 123465 + }, + { + "epoch": 13.75097449604633, + "grad_norm": 0.5339546203613281, + "learning_rate": 1.3452944234316176e-05, + "loss": 0.1499, + "num_input_tokens_seen": 150239376, + "step": 123470 + }, + { + "epoch": 13.751531350929948, + "grad_norm": 0.1260344535112381, + "learning_rate": 1.345078925414624e-05, + "loss": 0.0071, + "num_input_tokens_seen": 150245744, + "step": 123475 + }, + { + "epoch": 13.752088205813564, + "grad_norm": 0.0036887770984321833, + "learning_rate": 1.3448634383067853e-05, + "loss": 0.0007, + "num_input_tokens_seen": 150251824, + "step": 123480 + }, + { + "epoch": 13.752645060697182, + "grad_norm": 0.5562736988067627, + "learning_rate": 1.3446479621101369e-05, + "loss": 0.0215, + "num_input_tokens_seen": 150257840, + "step": 123485 + }, + { + "epoch": 13.7532019155808, + "grad_norm": 0.017067648470401764, + "learning_rate": 1.344432496826713e-05, + "loss": 0.0053, + "num_input_tokens_seen": 150263760, + "step": 123490 + }, + { + "epoch": 13.753758770464417, + "grad_norm": 1.2273046970367432, + "learning_rate": 1.3442170424585512e-05, + "loss": 0.0339, + "num_input_tokens_seen": 150269520, + "step": 123495 + }, + { + "epoch": 13.754315625348035, + "grad_norm": 0.417885422706604, + "learning_rate": 1.3440015990076854e-05, + "loss": 0.0056, + "num_input_tokens_seen": 150275952, + "step": 123500 + }, + { + "epoch": 13.754872480231652, + "grad_norm": 0.02502923645079136, + "learning_rate": 1.3437861664761508e-05, + "loss": 0.2681, + "num_input_tokens_seen": 150281872, + "step": 123505 + }, + { + "epoch": 13.755429335115268, + "grad_norm": 0.5043497681617737, + "learning_rate": 1.343570744865981e-05, + "loss": 0.0338, + "num_input_tokens_seen": 150288368, + "step": 123510 + }, + { + "epoch": 13.755986189998886, + "grad_norm": 0.006354345008730888, + "learning_rate": 1.3433553341792129e-05, + "loss": 0.0277, + "num_input_tokens_seen": 150294544, + "step": 123515 + }, + { + "epoch": 13.756543044882504, + "grad_norm": 0.02521352656185627, + "learning_rate": 1.3431399344178796e-05, + "loss": 0.011, + "num_input_tokens_seen": 150300592, + "step": 123520 + }, + { + "epoch": 13.757099899766121, + "grad_norm": 1.3031624555587769, + "learning_rate": 1.3429245455840184e-05, + "loss": 0.0896, + "num_input_tokens_seen": 150306672, + "step": 123525 + }, + { + "epoch": 13.757656754649739, + "grad_norm": 1.8800393342971802, + "learning_rate": 1.3427091676796599e-05, + "loss": 0.0429, + "num_input_tokens_seen": 150312752, + "step": 123530 + }, + { + "epoch": 13.758213609533355, + "grad_norm": 0.00011587428161874413, + "learning_rate": 1.3424938007068417e-05, + "loss": 0.08, + "num_input_tokens_seen": 150318960, + "step": 123535 + }, + { + "epoch": 13.758770464416973, + "grad_norm": 0.0395595021545887, + "learning_rate": 1.3422784446675956e-05, + "loss": 0.0205, + "num_input_tokens_seen": 150325200, + "step": 123540 + }, + { + "epoch": 13.75932731930059, + "grad_norm": 2.1190497875213623, + "learning_rate": 1.3420630995639582e-05, + "loss": 0.1356, + "num_input_tokens_seen": 150330992, + "step": 123545 + }, + { + "epoch": 13.759884174184208, + "grad_norm": 0.08322807401418686, + "learning_rate": 1.3418477653979628e-05, + "loss": 0.0673, + "num_input_tokens_seen": 150337008, + "step": 123550 + }, + { + "epoch": 13.760441029067826, + "grad_norm": 0.17186838388442993, + "learning_rate": 1.341632442171643e-05, + "loss": 0.0781, + "num_input_tokens_seen": 150343056, + "step": 123555 + }, + { + "epoch": 13.760997883951442, + "grad_norm": 0.6341810822486877, + "learning_rate": 1.3414171298870317e-05, + "loss": 0.0522, + "num_input_tokens_seen": 150349392, + "step": 123560 + }, + { + "epoch": 13.76155473883506, + "grad_norm": 0.06579199433326721, + "learning_rate": 1.341201828546165e-05, + "loss": 0.0186, + "num_input_tokens_seen": 150355184, + "step": 123565 + }, + { + "epoch": 13.762111593718677, + "grad_norm": 0.5096563696861267, + "learning_rate": 1.3409865381510756e-05, + "loss": 0.069, + "num_input_tokens_seen": 150361552, + "step": 123570 + }, + { + "epoch": 13.762668448602295, + "grad_norm": 0.10191617906093597, + "learning_rate": 1.3407712587037968e-05, + "loss": 0.0623, + "num_input_tokens_seen": 150367760, + "step": 123575 + }, + { + "epoch": 13.763225303485912, + "grad_norm": 0.7236526012420654, + "learning_rate": 1.3405559902063611e-05, + "loss": 0.0254, + "num_input_tokens_seen": 150373968, + "step": 123580 + }, + { + "epoch": 13.763782158369528, + "grad_norm": 0.38722339272499084, + "learning_rate": 1.3403407326608043e-05, + "loss": 0.0213, + "num_input_tokens_seen": 150379760, + "step": 123585 + }, + { + "epoch": 13.764339013253146, + "grad_norm": 1.5084606409072876, + "learning_rate": 1.340125486069157e-05, + "loss": 0.083, + "num_input_tokens_seen": 150386160, + "step": 123590 + }, + { + "epoch": 13.764895868136763, + "grad_norm": 0.3180139660835266, + "learning_rate": 1.339910250433456e-05, + "loss": 0.008, + "num_input_tokens_seen": 150392272, + "step": 123595 + }, + { + "epoch": 13.765452723020381, + "grad_norm": 8.994466043077409e-05, + "learning_rate": 1.33969502575573e-05, + "loss": 0.0033, + "num_input_tokens_seen": 150397904, + "step": 123600 + }, + { + "epoch": 13.766009577903999, + "grad_norm": 1.4678304195404053, + "learning_rate": 1.3394798120380153e-05, + "loss": 0.1173, + "num_input_tokens_seen": 150403440, + "step": 123605 + }, + { + "epoch": 13.766566432787615, + "grad_norm": 1.2143676280975342, + "learning_rate": 1.3392646092823424e-05, + "loss": 0.0268, + "num_input_tokens_seen": 150409936, + "step": 123610 + }, + { + "epoch": 13.767123287671232, + "grad_norm": 0.0021403387654572725, + "learning_rate": 1.3390494174907462e-05, + "loss": 0.1028, + "num_input_tokens_seen": 150415664, + "step": 123615 + }, + { + "epoch": 13.76768014255485, + "grad_norm": 1.0003706216812134, + "learning_rate": 1.3388342366652584e-05, + "loss": 0.0805, + "num_input_tokens_seen": 150421584, + "step": 123620 + }, + { + "epoch": 13.768236997438468, + "grad_norm": 0.000527968630194664, + "learning_rate": 1.3386190668079116e-05, + "loss": 0.1047, + "num_input_tokens_seen": 150427632, + "step": 123625 + }, + { + "epoch": 13.768793852322085, + "grad_norm": 0.0001482723164372146, + "learning_rate": 1.3384039079207372e-05, + "loss": 0.0473, + "num_input_tokens_seen": 150433872, + "step": 123630 + }, + { + "epoch": 13.769350707205703, + "grad_norm": 0.9061377644538879, + "learning_rate": 1.3381887600057697e-05, + "loss": 0.1112, + "num_input_tokens_seen": 150439856, + "step": 123635 + }, + { + "epoch": 13.769907562089319, + "grad_norm": 0.12343640625476837, + "learning_rate": 1.3379736230650397e-05, + "loss": 0.0214, + "num_input_tokens_seen": 150445552, + "step": 123640 + }, + { + "epoch": 13.770464416972937, + "grad_norm": 0.12588441371917725, + "learning_rate": 1.3377584971005802e-05, + "loss": 0.0023, + "num_input_tokens_seen": 150451792, + "step": 123645 + }, + { + "epoch": 13.771021271856554, + "grad_norm": 0.3303558826446533, + "learning_rate": 1.337543382114422e-05, + "loss": 0.0454, + "num_input_tokens_seen": 150458192, + "step": 123650 + }, + { + "epoch": 13.771578126740172, + "grad_norm": 1.7549296617507935, + "learning_rate": 1.3373282781085988e-05, + "loss": 0.1684, + "num_input_tokens_seen": 150464048, + "step": 123655 + }, + { + "epoch": 13.77213498162379, + "grad_norm": 0.012173714116215706, + "learning_rate": 1.3371131850851404e-05, + "loss": 0.0353, + "num_input_tokens_seen": 150470064, + "step": 123660 + }, + { + "epoch": 13.772691836507406, + "grad_norm": 0.0014818924246355891, + "learning_rate": 1.3368981030460809e-05, + "loss": 0.0364, + "num_input_tokens_seen": 150476080, + "step": 123665 + }, + { + "epoch": 13.773248691391023, + "grad_norm": 0.6221370697021484, + "learning_rate": 1.3366830319934503e-05, + "loss": 0.0104, + "num_input_tokens_seen": 150482256, + "step": 123670 + }, + { + "epoch": 13.773805546274641, + "grad_norm": 0.3611590564250946, + "learning_rate": 1.3364679719292808e-05, + "loss": 0.0056, + "num_input_tokens_seen": 150488432, + "step": 123675 + }, + { + "epoch": 13.774362401158259, + "grad_norm": 0.0002633254916872829, + "learning_rate": 1.3362529228556026e-05, + "loss": 0.0091, + "num_input_tokens_seen": 150493936, + "step": 123680 + }, + { + "epoch": 13.774919256041876, + "grad_norm": 0.0018402813002467155, + "learning_rate": 1.3360378847744487e-05, + "loss": 0.0307, + "num_input_tokens_seen": 150500048, + "step": 123685 + }, + { + "epoch": 13.775476110925492, + "grad_norm": 0.04470134153962135, + "learning_rate": 1.3358228576878496e-05, + "loss": 0.0011, + "num_input_tokens_seen": 150506192, + "step": 123690 + }, + { + "epoch": 13.77603296580911, + "grad_norm": 0.4631170630455017, + "learning_rate": 1.3356078415978362e-05, + "loss": 0.0475, + "num_input_tokens_seen": 150512432, + "step": 123695 + }, + { + "epoch": 13.776589820692728, + "grad_norm": 1.289971947669983, + "learning_rate": 1.3353928365064386e-05, + "loss": 0.0336, + "num_input_tokens_seen": 150518672, + "step": 123700 + }, + { + "epoch": 13.777146675576345, + "grad_norm": 0.16095317900180817, + "learning_rate": 1.3351778424156896e-05, + "loss": 0.0151, + "num_input_tokens_seen": 150525008, + "step": 123705 + }, + { + "epoch": 13.777703530459963, + "grad_norm": 0.2206657975912094, + "learning_rate": 1.3349628593276186e-05, + "loss": 0.0615, + "num_input_tokens_seen": 150531248, + "step": 123710 + }, + { + "epoch": 13.778260385343579, + "grad_norm": 0.08569477498531342, + "learning_rate": 1.3347478872442588e-05, + "loss": 0.0106, + "num_input_tokens_seen": 150537520, + "step": 123715 + }, + { + "epoch": 13.778817240227196, + "grad_norm": 0.03651924431324005, + "learning_rate": 1.3345329261676365e-05, + "loss": 0.0011, + "num_input_tokens_seen": 150543984, + "step": 123720 + }, + { + "epoch": 13.779374095110814, + "grad_norm": 1.067082166671753, + "learning_rate": 1.3343179760997853e-05, + "loss": 0.0184, + "num_input_tokens_seen": 150550064, + "step": 123725 + }, + { + "epoch": 13.779930949994432, + "grad_norm": 1.4397281408309937, + "learning_rate": 1.334103037042734e-05, + "loss": 0.1075, + "num_input_tokens_seen": 150556176, + "step": 123730 + }, + { + "epoch": 13.78048780487805, + "grad_norm": 0.04613823443651199, + "learning_rate": 1.3338881089985148e-05, + "loss": 0.0068, + "num_input_tokens_seen": 150562448, + "step": 123735 + }, + { + "epoch": 13.781044659761665, + "grad_norm": 1.494519591331482, + "learning_rate": 1.3336731919691564e-05, + "loss": 0.0563, + "num_input_tokens_seen": 150568368, + "step": 123740 + }, + { + "epoch": 13.781601514645283, + "grad_norm": 0.20798727869987488, + "learning_rate": 1.333458285956689e-05, + "loss": 0.011, + "num_input_tokens_seen": 150574768, + "step": 123745 + }, + { + "epoch": 13.7821583695289, + "grad_norm": 0.18764865398406982, + "learning_rate": 1.333243390963142e-05, + "loss": 0.0054, + "num_input_tokens_seen": 150580720, + "step": 123750 + }, + { + "epoch": 13.782715224412518, + "grad_norm": 0.08070655167102814, + "learning_rate": 1.3330285069905469e-05, + "loss": 0.0095, + "num_input_tokens_seen": 150586960, + "step": 123755 + }, + { + "epoch": 13.783272079296136, + "grad_norm": 2.0928916931152344, + "learning_rate": 1.3328136340409325e-05, + "loss": 0.0979, + "num_input_tokens_seen": 150593104, + "step": 123760 + }, + { + "epoch": 13.783828934179752, + "grad_norm": 0.33258628845214844, + "learning_rate": 1.3325987721163286e-05, + "loss": 0.1501, + "num_input_tokens_seen": 150599248, + "step": 123765 + }, + { + "epoch": 13.78438578906337, + "grad_norm": 0.000535984116140753, + "learning_rate": 1.3323839212187633e-05, + "loss": 0.0523, + "num_input_tokens_seen": 150605232, + "step": 123770 + }, + { + "epoch": 13.784942643946987, + "grad_norm": 1.119131088256836, + "learning_rate": 1.3321690813502685e-05, + "loss": 0.176, + "num_input_tokens_seen": 150611600, + "step": 123775 + }, + { + "epoch": 13.785499498830605, + "grad_norm": 0.04060102626681328, + "learning_rate": 1.3319542525128714e-05, + "loss": 0.0184, + "num_input_tokens_seen": 150617904, + "step": 123780 + }, + { + "epoch": 13.786056353714223, + "grad_norm": 0.01355687901377678, + "learning_rate": 1.3317394347086042e-05, + "loss": 0.0121, + "num_input_tokens_seen": 150623984, + "step": 123785 + }, + { + "epoch": 13.786613208597839, + "grad_norm": 0.004219182766973972, + "learning_rate": 1.3315246279394922e-05, + "loss": 0.0844, + "num_input_tokens_seen": 150630160, + "step": 123790 + }, + { + "epoch": 13.787170063481456, + "grad_norm": 0.31195420026779175, + "learning_rate": 1.331309832207567e-05, + "loss": 0.0082, + "num_input_tokens_seen": 150636016, + "step": 123795 + }, + { + "epoch": 13.787726918365074, + "grad_norm": 2.8622796535491943, + "learning_rate": 1.331095047514856e-05, + "loss": 0.1082, + "num_input_tokens_seen": 150641808, + "step": 123800 + }, + { + "epoch": 13.788283773248692, + "grad_norm": 0.015322843566536903, + "learning_rate": 1.3308802738633897e-05, + "loss": 0.0462, + "num_input_tokens_seen": 150647792, + "step": 123805 + }, + { + "epoch": 13.78884062813231, + "grad_norm": 0.01313175167888403, + "learning_rate": 1.3306655112551959e-05, + "loss": 0.0052, + "num_input_tokens_seen": 150653680, + "step": 123810 + }, + { + "epoch": 13.789397483015925, + "grad_norm": 1.6562602519989014, + "learning_rate": 1.3304507596923029e-05, + "loss": 0.1181, + "num_input_tokens_seen": 150659664, + "step": 123815 + }, + { + "epoch": 13.789954337899543, + "grad_norm": 3.112358808517456, + "learning_rate": 1.3302360191767387e-05, + "loss": 0.1935, + "num_input_tokens_seen": 150665360, + "step": 123820 + }, + { + "epoch": 13.79051119278316, + "grad_norm": 0.03370405733585358, + "learning_rate": 1.3300212897105339e-05, + "loss": 0.0904, + "num_input_tokens_seen": 150671088, + "step": 123825 + }, + { + "epoch": 13.791068047666778, + "grad_norm": 0.7082811594009399, + "learning_rate": 1.3298065712957147e-05, + "loss": 0.2441, + "num_input_tokens_seen": 150677328, + "step": 123830 + }, + { + "epoch": 13.791624902550396, + "grad_norm": 0.007200600579380989, + "learning_rate": 1.3295918639343105e-05, + "loss": 0.0116, + "num_input_tokens_seen": 150683344, + "step": 123835 + }, + { + "epoch": 13.792181757434012, + "grad_norm": 0.17410364747047424, + "learning_rate": 1.3293771676283479e-05, + "loss": 0.0656, + "num_input_tokens_seen": 150689456, + "step": 123840 + }, + { + "epoch": 13.79273861231763, + "grad_norm": 0.970187246799469, + "learning_rate": 1.3291624823798565e-05, + "loss": 0.0785, + "num_input_tokens_seen": 150694864, + "step": 123845 + }, + { + "epoch": 13.793295467201247, + "grad_norm": 0.001514747622422874, + "learning_rate": 1.3289478081908635e-05, + "loss": 0.0044, + "num_input_tokens_seen": 150700848, + "step": 123850 + }, + { + "epoch": 13.793852322084865, + "grad_norm": 7.268341141752899e-05, + "learning_rate": 1.3287331450633958e-05, + "loss": 0.0427, + "num_input_tokens_seen": 150707440, + "step": 123855 + }, + { + "epoch": 13.794409176968482, + "grad_norm": 0.10564397275447845, + "learning_rate": 1.328518492999484e-05, + "loss": 0.0016, + "num_input_tokens_seen": 150713584, + "step": 123860 + }, + { + "epoch": 13.7949660318521, + "grad_norm": 0.00697808712720871, + "learning_rate": 1.3283038520011514e-05, + "loss": 0.1132, + "num_input_tokens_seen": 150719824, + "step": 123865 + }, + { + "epoch": 13.795522886735716, + "grad_norm": 0.2200966477394104, + "learning_rate": 1.3280892220704289e-05, + "loss": 0.0358, + "num_input_tokens_seen": 150726032, + "step": 123870 + }, + { + "epoch": 13.796079741619334, + "grad_norm": 0.7872010469436646, + "learning_rate": 1.3278746032093417e-05, + "loss": 0.0248, + "num_input_tokens_seen": 150732144, + "step": 123875 + }, + { + "epoch": 13.796636596502951, + "grad_norm": 0.4874371886253357, + "learning_rate": 1.3276599954199186e-05, + "loss": 0.0154, + "num_input_tokens_seen": 150738192, + "step": 123880 + }, + { + "epoch": 13.797193451386569, + "grad_norm": 0.015737168490886688, + "learning_rate": 1.3274453987041865e-05, + "loss": 0.0039, + "num_input_tokens_seen": 150744336, + "step": 123885 + }, + { + "epoch": 13.797750306270187, + "grad_norm": 0.7243762016296387, + "learning_rate": 1.327230813064172e-05, + "loss": 0.0767, + "num_input_tokens_seen": 150750352, + "step": 123890 + }, + { + "epoch": 13.798307161153803, + "grad_norm": 0.41768679022789, + "learning_rate": 1.3270162385019009e-05, + "loss": 0.067, + "num_input_tokens_seen": 150756528, + "step": 123895 + }, + { + "epoch": 13.79886401603742, + "grad_norm": 0.07094407826662064, + "learning_rate": 1.3268016750194023e-05, + "loss": 0.0593, + "num_input_tokens_seen": 150762544, + "step": 123900 + }, + { + "epoch": 13.799420870921038, + "grad_norm": 1.737156629562378, + "learning_rate": 1.3265871226187016e-05, + "loss": 0.0736, + "num_input_tokens_seen": 150768592, + "step": 123905 + }, + { + "epoch": 13.799977725804656, + "grad_norm": 0.01371296402066946, + "learning_rate": 1.3263725813018257e-05, + "loss": 0.0307, + "num_input_tokens_seen": 150774928, + "step": 123910 + }, + { + "epoch": 13.800534580688273, + "grad_norm": 0.06801637262105942, + "learning_rate": 1.3261580510708004e-05, + "loss": 0.0155, + "num_input_tokens_seen": 150781232, + "step": 123915 + }, + { + "epoch": 13.80109143557189, + "grad_norm": 0.00026274146512150764, + "learning_rate": 1.3259435319276536e-05, + "loss": 0.0246, + "num_input_tokens_seen": 150787088, + "step": 123920 + }, + { + "epoch": 13.801648290455507, + "grad_norm": 0.015201346948742867, + "learning_rate": 1.3257290238744097e-05, + "loss": 0.0162, + "num_input_tokens_seen": 150793296, + "step": 123925 + }, + { + "epoch": 13.802205145339125, + "grad_norm": 0.05191769823431969, + "learning_rate": 1.3255145269130981e-05, + "loss": 0.0199, + "num_input_tokens_seen": 150799312, + "step": 123930 + }, + { + "epoch": 13.802762000222742, + "grad_norm": 1.3878086805343628, + "learning_rate": 1.3253000410457406e-05, + "loss": 0.0661, + "num_input_tokens_seen": 150804976, + "step": 123935 + }, + { + "epoch": 13.80331885510636, + "grad_norm": 0.6760107278823853, + "learning_rate": 1.3250855662743666e-05, + "loss": 0.0587, + "num_input_tokens_seen": 150810672, + "step": 123940 + }, + { + "epoch": 13.803875709989976, + "grad_norm": 0.5840697884559631, + "learning_rate": 1.3248711026009997e-05, + "loss": 0.0178, + "num_input_tokens_seen": 150816848, + "step": 123945 + }, + { + "epoch": 13.804432564873593, + "grad_norm": 0.00014109708718024194, + "learning_rate": 1.3246566500276674e-05, + "loss": 0.0524, + "num_input_tokens_seen": 150823088, + "step": 123950 + }, + { + "epoch": 13.804989419757211, + "grad_norm": 0.047492947429418564, + "learning_rate": 1.3244422085563951e-05, + "loss": 0.0028, + "num_input_tokens_seen": 150829616, + "step": 123955 + }, + { + "epoch": 13.805546274640829, + "grad_norm": 0.0007136399508453906, + "learning_rate": 1.3242277781892076e-05, + "loss": 0.0187, + "num_input_tokens_seen": 150835440, + "step": 123960 + }, + { + "epoch": 13.806103129524447, + "grad_norm": 0.13936661183834076, + "learning_rate": 1.32401335892813e-05, + "loss": 0.0328, + "num_input_tokens_seen": 150841424, + "step": 123965 + }, + { + "epoch": 13.806659984408064, + "grad_norm": 0.7637333273887634, + "learning_rate": 1.3237989507751897e-05, + "loss": 0.0206, + "num_input_tokens_seen": 150847920, + "step": 123970 + }, + { + "epoch": 13.80721683929168, + "grad_norm": 0.1755300909280777, + "learning_rate": 1.3235845537324104e-05, + "loss": 0.0559, + "num_input_tokens_seen": 150854544, + "step": 123975 + }, + { + "epoch": 13.807773694175298, + "grad_norm": 0.238528773188591, + "learning_rate": 1.3233701678018173e-05, + "loss": 0.0385, + "num_input_tokens_seen": 150860464, + "step": 123980 + }, + { + "epoch": 13.808330549058915, + "grad_norm": 0.056762780994176865, + "learning_rate": 1.323155792985435e-05, + "loss": 0.0013, + "num_input_tokens_seen": 150866896, + "step": 123985 + }, + { + "epoch": 13.808887403942533, + "grad_norm": 0.05754511058330536, + "learning_rate": 1.32294142928529e-05, + "loss": 0.0199, + "num_input_tokens_seen": 150872752, + "step": 123990 + }, + { + "epoch": 13.80944425882615, + "grad_norm": 0.040871281176805496, + "learning_rate": 1.322727076703405e-05, + "loss": 0.0007, + "num_input_tokens_seen": 150879056, + "step": 123995 + }, + { + "epoch": 13.810001113709767, + "grad_norm": 0.05386175960302353, + "learning_rate": 1.3225127352418082e-05, + "loss": 0.166, + "num_input_tokens_seen": 150884848, + "step": 124000 + }, + { + "epoch": 13.810557968593384, + "grad_norm": 7.806518988218158e-05, + "learning_rate": 1.32229840490252e-05, + "loss": 0.0151, + "num_input_tokens_seen": 150890928, + "step": 124005 + }, + { + "epoch": 13.811114823477002, + "grad_norm": 0.32773807644844055, + "learning_rate": 1.322084085687568e-05, + "loss": 0.1128, + "num_input_tokens_seen": 150896880, + "step": 124010 + }, + { + "epoch": 13.81167167836062, + "grad_norm": 0.024247687309980392, + "learning_rate": 1.3218697775989744e-05, + "loss": 0.2184, + "num_input_tokens_seen": 150902832, + "step": 124015 + }, + { + "epoch": 13.812228533244237, + "grad_norm": 0.5505346059799194, + "learning_rate": 1.3216554806387657e-05, + "loss": 0.0632, + "num_input_tokens_seen": 150908848, + "step": 124020 + }, + { + "epoch": 13.812785388127853, + "grad_norm": 0.1857219934463501, + "learning_rate": 1.321441194808965e-05, + "loss": 0.0168, + "num_input_tokens_seen": 150915152, + "step": 124025 + }, + { + "epoch": 13.813342243011471, + "grad_norm": 0.67739337682724, + "learning_rate": 1.3212269201115968e-05, + "loss": 0.0177, + "num_input_tokens_seen": 150921360, + "step": 124030 + }, + { + "epoch": 13.813899097895089, + "grad_norm": 0.11664993315935135, + "learning_rate": 1.3210126565486833e-05, + "loss": 0.1816, + "num_input_tokens_seen": 150927568, + "step": 124035 + }, + { + "epoch": 13.814455952778706, + "grad_norm": 0.014865386299788952, + "learning_rate": 1.320798404122251e-05, + "loss": 0.1649, + "num_input_tokens_seen": 150933744, + "step": 124040 + }, + { + "epoch": 13.815012807662324, + "grad_norm": 1.6552373170852661, + "learning_rate": 1.3205841628343223e-05, + "loss": 0.1846, + "num_input_tokens_seen": 150939664, + "step": 124045 + }, + { + "epoch": 13.81556966254594, + "grad_norm": 0.06022472679615021, + "learning_rate": 1.3203699326869217e-05, + "loss": 0.0328, + "num_input_tokens_seen": 150945648, + "step": 124050 + }, + { + "epoch": 13.816126517429558, + "grad_norm": 0.025083942338824272, + "learning_rate": 1.3201557136820708e-05, + "loss": 0.0215, + "num_input_tokens_seen": 150952272, + "step": 124055 + }, + { + "epoch": 13.816683372313175, + "grad_norm": 0.004140520468354225, + "learning_rate": 1.3199415058217957e-05, + "loss": 0.0894, + "num_input_tokens_seen": 150958160, + "step": 124060 + }, + { + "epoch": 13.817240227196793, + "grad_norm": 0.0963217094540596, + "learning_rate": 1.3197273091081173e-05, + "loss": 0.002, + "num_input_tokens_seen": 150964432, + "step": 124065 + }, + { + "epoch": 13.81779708208041, + "grad_norm": 0.876781702041626, + "learning_rate": 1.319513123543061e-05, + "loss": 0.0094, + "num_input_tokens_seen": 150970288, + "step": 124070 + }, + { + "epoch": 13.818353936964026, + "grad_norm": 1.2073967456817627, + "learning_rate": 1.3192989491286493e-05, + "loss": 0.0443, + "num_input_tokens_seen": 150976016, + "step": 124075 + }, + { + "epoch": 13.818910791847644, + "grad_norm": 0.003941836301237345, + "learning_rate": 1.3190847858669048e-05, + "loss": 0.0857, + "num_input_tokens_seen": 150982384, + "step": 124080 + }, + { + "epoch": 13.819467646731262, + "grad_norm": 0.021721623837947845, + "learning_rate": 1.3188706337598497e-05, + "loss": 0.0821, + "num_input_tokens_seen": 150988176, + "step": 124085 + }, + { + "epoch": 13.82002450161488, + "grad_norm": 0.028576333075761795, + "learning_rate": 1.3186564928095086e-05, + "loss": 0.0258, + "num_input_tokens_seen": 150994608, + "step": 124090 + }, + { + "epoch": 13.820581356498497, + "grad_norm": 0.0002319066261406988, + "learning_rate": 1.3184423630179038e-05, + "loss": 0.0519, + "num_input_tokens_seen": 151000976, + "step": 124095 + }, + { + "epoch": 13.821138211382113, + "grad_norm": 0.011750505305826664, + "learning_rate": 1.3182282443870572e-05, + "loss": 0.0036, + "num_input_tokens_seen": 151006416, + "step": 124100 + }, + { + "epoch": 13.82169506626573, + "grad_norm": 0.7003211379051208, + "learning_rate": 1.3180141369189908e-05, + "loss": 0.0692, + "num_input_tokens_seen": 151011952, + "step": 124105 + }, + { + "epoch": 13.822251921149348, + "grad_norm": 0.052203547209501266, + "learning_rate": 1.3178000406157287e-05, + "loss": 0.0499, + "num_input_tokens_seen": 151018224, + "step": 124110 + }, + { + "epoch": 13.822808776032966, + "grad_norm": 1.9062514305114746, + "learning_rate": 1.3175859554792916e-05, + "loss": 0.0484, + "num_input_tokens_seen": 151024240, + "step": 124115 + }, + { + "epoch": 13.823365630916584, + "grad_norm": 0.07923093438148499, + "learning_rate": 1.3173718815117042e-05, + "loss": 0.0318, + "num_input_tokens_seen": 151029712, + "step": 124120 + }, + { + "epoch": 13.8239224858002, + "grad_norm": 1.1686869859695435, + "learning_rate": 1.3171578187149852e-05, + "loss": 0.1239, + "num_input_tokens_seen": 151035792, + "step": 124125 + }, + { + "epoch": 13.824479340683817, + "grad_norm": 0.013645267114043236, + "learning_rate": 1.3169437670911591e-05, + "loss": 0.0415, + "num_input_tokens_seen": 151042288, + "step": 124130 + }, + { + "epoch": 13.825036195567435, + "grad_norm": 2.6343722343444824, + "learning_rate": 1.316729726642246e-05, + "loss": 0.2012, + "num_input_tokens_seen": 151048432, + "step": 124135 + }, + { + "epoch": 13.825593050451053, + "grad_norm": 0.009833560325205326, + "learning_rate": 1.3165156973702696e-05, + "loss": 0.0533, + "num_input_tokens_seen": 151054288, + "step": 124140 + }, + { + "epoch": 13.82614990533467, + "grad_norm": 1.0537805557250977, + "learning_rate": 1.316301679277251e-05, + "loss": 0.0177, + "num_input_tokens_seen": 151060816, + "step": 124145 + }, + { + "epoch": 13.826706760218286, + "grad_norm": 2.2246968746185303, + "learning_rate": 1.316087672365211e-05, + "loss": 0.1013, + "num_input_tokens_seen": 151066864, + "step": 124150 + }, + { + "epoch": 13.827263615101904, + "grad_norm": 0.8797221183776855, + "learning_rate": 1.3158736766361704e-05, + "loss": 0.0109, + "num_input_tokens_seen": 151072976, + "step": 124155 + }, + { + "epoch": 13.827820469985522, + "grad_norm": 0.00222418992780149, + "learning_rate": 1.3156596920921526e-05, + "loss": 0.0107, + "num_input_tokens_seen": 151078832, + "step": 124160 + }, + { + "epoch": 13.82837732486914, + "grad_norm": 0.12653356790542603, + "learning_rate": 1.3154457187351782e-05, + "loss": 0.0025, + "num_input_tokens_seen": 151084752, + "step": 124165 + }, + { + "epoch": 13.828934179752757, + "grad_norm": 0.05766952410340309, + "learning_rate": 1.3152317565672677e-05, + "loss": 0.0029, + "num_input_tokens_seen": 151090896, + "step": 124170 + }, + { + "epoch": 13.829491034636373, + "grad_norm": 0.007063737139105797, + "learning_rate": 1.3150178055904422e-05, + "loss": 0.1323, + "num_input_tokens_seen": 151096656, + "step": 124175 + }, + { + "epoch": 13.83004788951999, + "grad_norm": 0.04903404787182808, + "learning_rate": 1.3148038658067233e-05, + "loss": 0.0436, + "num_input_tokens_seen": 151102960, + "step": 124180 + }, + { + "epoch": 13.830604744403608, + "grad_norm": 0.01048171054571867, + "learning_rate": 1.3145899372181303e-05, + "loss": 0.0418, + "num_input_tokens_seen": 151109104, + "step": 124185 + }, + { + "epoch": 13.831161599287226, + "grad_norm": 0.13831232488155365, + "learning_rate": 1.3143760198266874e-05, + "loss": 0.0909, + "num_input_tokens_seen": 151115024, + "step": 124190 + }, + { + "epoch": 13.831718454170844, + "grad_norm": 0.009567963890731335, + "learning_rate": 1.3141621136344109e-05, + "loss": 0.0581, + "num_input_tokens_seen": 151121456, + "step": 124195 + }, + { + "epoch": 13.832275309054461, + "grad_norm": 0.35737791657447815, + "learning_rate": 1.3139482186433243e-05, + "loss": 0.0369, + "num_input_tokens_seen": 151127760, + "step": 124200 + }, + { + "epoch": 13.832832163938077, + "grad_norm": 0.049873579293489456, + "learning_rate": 1.3137343348554459e-05, + "loss": 0.0018, + "num_input_tokens_seen": 151134224, + "step": 124205 + }, + { + "epoch": 13.833389018821695, + "grad_norm": 0.0031534889712929726, + "learning_rate": 1.3135204622727981e-05, + "loss": 0.0258, + "num_input_tokens_seen": 151140272, + "step": 124210 + }, + { + "epoch": 13.833945873705312, + "grad_norm": 0.00029530355823226273, + "learning_rate": 1.3133066008974004e-05, + "loss": 0.0269, + "num_input_tokens_seen": 151146640, + "step": 124215 + }, + { + "epoch": 13.83450272858893, + "grad_norm": 0.6718962788581848, + "learning_rate": 1.3130927507312724e-05, + "loss": 0.0117, + "num_input_tokens_seen": 151153072, + "step": 124220 + }, + { + "epoch": 13.835059583472548, + "grad_norm": 0.4187343716621399, + "learning_rate": 1.3128789117764334e-05, + "loss": 0.052, + "num_input_tokens_seen": 151159216, + "step": 124225 + }, + { + "epoch": 13.835616438356164, + "grad_norm": 0.5930418372154236, + "learning_rate": 1.3126650840349053e-05, + "loss": 0.0134, + "num_input_tokens_seen": 151165264, + "step": 124230 + }, + { + "epoch": 13.836173293239781, + "grad_norm": 0.010458577424287796, + "learning_rate": 1.3124512675087064e-05, + "loss": 0.034, + "num_input_tokens_seen": 151171312, + "step": 124235 + }, + { + "epoch": 13.836730148123399, + "grad_norm": 0.0016773829702287912, + "learning_rate": 1.3122374621998567e-05, + "loss": 0.0197, + "num_input_tokens_seen": 151177520, + "step": 124240 + }, + { + "epoch": 13.837287003007017, + "grad_norm": 0.06912346929311752, + "learning_rate": 1.3120236681103754e-05, + "loss": 0.1692, + "num_input_tokens_seen": 151183664, + "step": 124245 + }, + { + "epoch": 13.837843857890634, + "grad_norm": 0.6220277547836304, + "learning_rate": 1.3118098852422828e-05, + "loss": 0.0491, + "num_input_tokens_seen": 151190224, + "step": 124250 + }, + { + "epoch": 13.83840071277425, + "grad_norm": 0.14724457263946533, + "learning_rate": 1.3115961135975979e-05, + "loss": 0.0104, + "num_input_tokens_seen": 151196208, + "step": 124255 + }, + { + "epoch": 13.838957567657868, + "grad_norm": 1.243507742881775, + "learning_rate": 1.3113823531783389e-05, + "loss": 0.124, + "num_input_tokens_seen": 151202192, + "step": 124260 + }, + { + "epoch": 13.839514422541486, + "grad_norm": 0.0006405864842236042, + "learning_rate": 1.3111686039865279e-05, + "loss": 0.0293, + "num_input_tokens_seen": 151208880, + "step": 124265 + }, + { + "epoch": 13.840071277425103, + "grad_norm": 0.0015655122697353363, + "learning_rate": 1.31095486602418e-05, + "loss": 0.0027, + "num_input_tokens_seen": 151215376, + "step": 124270 + }, + { + "epoch": 13.840628132308721, + "grad_norm": 4.6112518310546875, + "learning_rate": 1.3107411392933166e-05, + "loss": 0.126, + "num_input_tokens_seen": 151220848, + "step": 124275 + }, + { + "epoch": 13.841184987192337, + "grad_norm": 0.0019403278129175305, + "learning_rate": 1.3105274237959556e-05, + "loss": 0.0259, + "num_input_tokens_seen": 151227088, + "step": 124280 + }, + { + "epoch": 13.841741842075955, + "grad_norm": 0.0006706665735691786, + "learning_rate": 1.3103137195341167e-05, + "loss": 0.0313, + "num_input_tokens_seen": 151233456, + "step": 124285 + }, + { + "epoch": 13.842298696959572, + "grad_norm": 1.1635342836380005, + "learning_rate": 1.3101000265098179e-05, + "loss": 0.1527, + "num_input_tokens_seen": 151239088, + "step": 124290 + }, + { + "epoch": 13.84285555184319, + "grad_norm": 0.018922138959169388, + "learning_rate": 1.3098863447250775e-05, + "loss": 0.0494, + "num_input_tokens_seen": 151245136, + "step": 124295 + }, + { + "epoch": 13.843412406726808, + "grad_norm": 0.009623932652175426, + "learning_rate": 1.3096726741819135e-05, + "loss": 0.0143, + "num_input_tokens_seen": 151251152, + "step": 124300 + }, + { + "epoch": 13.843969261610424, + "grad_norm": 0.09846041351556778, + "learning_rate": 1.3094590148823454e-05, + "loss": 0.1164, + "num_input_tokens_seen": 151257328, + "step": 124305 + }, + { + "epoch": 13.844526116494041, + "grad_norm": 0.08631175011396408, + "learning_rate": 1.309245366828391e-05, + "loss": 0.0498, + "num_input_tokens_seen": 151263760, + "step": 124310 + }, + { + "epoch": 13.845082971377659, + "grad_norm": 0.46688905358314514, + "learning_rate": 1.3090317300220681e-05, + "loss": 0.0188, + "num_input_tokens_seen": 151269936, + "step": 124315 + }, + { + "epoch": 13.845639826261277, + "grad_norm": 0.02826148457825184, + "learning_rate": 1.3088181044653936e-05, + "loss": 0.0299, + "num_input_tokens_seen": 151275952, + "step": 124320 + }, + { + "epoch": 13.846196681144894, + "grad_norm": 1.6807098388671875, + "learning_rate": 1.3086044901603875e-05, + "loss": 0.0461, + "num_input_tokens_seen": 151282000, + "step": 124325 + }, + { + "epoch": 13.846753536028512, + "grad_norm": 0.48835381865501404, + "learning_rate": 1.3083908871090655e-05, + "loss": 0.0041, + "num_input_tokens_seen": 151288208, + "step": 124330 + }, + { + "epoch": 13.847310390912128, + "grad_norm": 1.5872206687927246, + "learning_rate": 1.3081772953134484e-05, + "loss": 0.0964, + "num_input_tokens_seen": 151294256, + "step": 124335 + }, + { + "epoch": 13.847867245795745, + "grad_norm": 1.366469144821167, + "learning_rate": 1.3079637147755494e-05, + "loss": 0.0698, + "num_input_tokens_seen": 151300496, + "step": 124340 + }, + { + "epoch": 13.848424100679363, + "grad_norm": 0.8084822297096252, + "learning_rate": 1.3077501454973892e-05, + "loss": 0.0705, + "num_input_tokens_seen": 151306704, + "step": 124345 + }, + { + "epoch": 13.84898095556298, + "grad_norm": 1.7401231527328491, + "learning_rate": 1.307536587480983e-05, + "loss": 0.1476, + "num_input_tokens_seen": 151312720, + "step": 124350 + }, + { + "epoch": 13.849537810446598, + "grad_norm": 0.244803786277771, + "learning_rate": 1.3073230407283505e-05, + "loss": 0.013, + "num_input_tokens_seen": 151318896, + "step": 124355 + }, + { + "epoch": 13.850094665330214, + "grad_norm": 0.04375167563557625, + "learning_rate": 1.3071095052415072e-05, + "loss": 0.0995, + "num_input_tokens_seen": 151324848, + "step": 124360 + }, + { + "epoch": 13.850651520213832, + "grad_norm": 0.0006549222744069993, + "learning_rate": 1.3068959810224701e-05, + "loss": 0.002, + "num_input_tokens_seen": 151330864, + "step": 124365 + }, + { + "epoch": 13.85120837509745, + "grad_norm": 1.5662205219268799, + "learning_rate": 1.3066824680732559e-05, + "loss": 0.0393, + "num_input_tokens_seen": 151336624, + "step": 124370 + }, + { + "epoch": 13.851765229981067, + "grad_norm": 0.008408219553530216, + "learning_rate": 1.3064689663958824e-05, + "loss": 0.0779, + "num_input_tokens_seen": 151342576, + "step": 124375 + }, + { + "epoch": 13.852322084864685, + "grad_norm": 0.08052761107683182, + "learning_rate": 1.3062554759923662e-05, + "loss": 0.0521, + "num_input_tokens_seen": 151348976, + "step": 124380 + }, + { + "epoch": 13.852878939748301, + "grad_norm": 0.8395315408706665, + "learning_rate": 1.306041996864723e-05, + "loss": 0.064, + "num_input_tokens_seen": 151354672, + "step": 124385 + }, + { + "epoch": 13.853435794631919, + "grad_norm": 0.033970560878515244, + "learning_rate": 1.3058285290149688e-05, + "loss": 0.163, + "num_input_tokens_seen": 151360880, + "step": 124390 + }, + { + "epoch": 13.853992649515536, + "grad_norm": 0.010660713538527489, + "learning_rate": 1.3056150724451222e-05, + "loss": 0.0069, + "num_input_tokens_seen": 151366992, + "step": 124395 + }, + { + "epoch": 13.854549504399154, + "grad_norm": 0.05447697639465332, + "learning_rate": 1.3054016271571968e-05, + "loss": 0.0185, + "num_input_tokens_seen": 151372560, + "step": 124400 + }, + { + "epoch": 13.855106359282772, + "grad_norm": 0.9972509145736694, + "learning_rate": 1.3051881931532123e-05, + "loss": 0.017, + "num_input_tokens_seen": 151378672, + "step": 124405 + }, + { + "epoch": 13.855663214166388, + "grad_norm": 0.00027158032753504813, + "learning_rate": 1.3049747704351806e-05, + "loss": 0.0134, + "num_input_tokens_seen": 151385200, + "step": 124410 + }, + { + "epoch": 13.856220069050005, + "grad_norm": 0.05076058208942413, + "learning_rate": 1.3047613590051205e-05, + "loss": 0.0633, + "num_input_tokens_seen": 151391248, + "step": 124415 + }, + { + "epoch": 13.856776923933623, + "grad_norm": 0.5309546589851379, + "learning_rate": 1.3045479588650461e-05, + "loss": 0.0181, + "num_input_tokens_seen": 151397328, + "step": 124420 + }, + { + "epoch": 13.85733377881724, + "grad_norm": 0.2548898458480835, + "learning_rate": 1.304334570016975e-05, + "loss": 0.1148, + "num_input_tokens_seen": 151403248, + "step": 124425 + }, + { + "epoch": 13.857890633700858, + "grad_norm": 0.30613940954208374, + "learning_rate": 1.3041211924629219e-05, + "loss": 0.0097, + "num_input_tokens_seen": 151409168, + "step": 124430 + }, + { + "epoch": 13.858447488584474, + "grad_norm": 0.003973180428147316, + "learning_rate": 1.303907826204902e-05, + "loss": 0.0249, + "num_input_tokens_seen": 151415248, + "step": 124435 + }, + { + "epoch": 13.859004343468092, + "grad_norm": 0.0006241205846890807, + "learning_rate": 1.3036944712449301e-05, + "loss": 0.1066, + "num_input_tokens_seen": 151420624, + "step": 124440 + }, + { + "epoch": 13.85956119835171, + "grad_norm": 2.0930380821228027, + "learning_rate": 1.3034811275850234e-05, + "loss": 0.2069, + "num_input_tokens_seen": 151426704, + "step": 124445 + }, + { + "epoch": 13.860118053235327, + "grad_norm": 0.04482553154230118, + "learning_rate": 1.3032677952271963e-05, + "loss": 0.0464, + "num_input_tokens_seen": 151432656, + "step": 124450 + }, + { + "epoch": 13.860674908118945, + "grad_norm": 1.3819791078567505, + "learning_rate": 1.3030544741734635e-05, + "loss": 0.1085, + "num_input_tokens_seen": 151438576, + "step": 124455 + }, + { + "epoch": 13.86123176300256, + "grad_norm": 0.09555245190858841, + "learning_rate": 1.3028411644258393e-05, + "loss": 0.0066, + "num_input_tokens_seen": 151445168, + "step": 124460 + }, + { + "epoch": 13.861788617886178, + "grad_norm": 0.1337520331144333, + "learning_rate": 1.30262786598634e-05, + "loss": 0.0129, + "num_input_tokens_seen": 151450672, + "step": 124465 + }, + { + "epoch": 13.862345472769796, + "grad_norm": 0.00023033289471641183, + "learning_rate": 1.3024145788569792e-05, + "loss": 0.0446, + "num_input_tokens_seen": 151456848, + "step": 124470 + }, + { + "epoch": 13.862902327653414, + "grad_norm": 0.21243096888065338, + "learning_rate": 1.3022013030397735e-05, + "loss": 0.0789, + "num_input_tokens_seen": 151463024, + "step": 124475 + }, + { + "epoch": 13.863459182537031, + "grad_norm": 0.5439929366111755, + "learning_rate": 1.3019880385367356e-05, + "loss": 0.0882, + "num_input_tokens_seen": 151469072, + "step": 124480 + }, + { + "epoch": 13.864016037420647, + "grad_norm": 1.9438146352767944, + "learning_rate": 1.3017747853498808e-05, + "loss": 0.0795, + "num_input_tokens_seen": 151475408, + "step": 124485 + }, + { + "epoch": 13.864572892304265, + "grad_norm": 0.0005073861684650183, + "learning_rate": 1.3015615434812218e-05, + "loss": 0.0045, + "num_input_tokens_seen": 151481616, + "step": 124490 + }, + { + "epoch": 13.865129747187883, + "grad_norm": 0.0044560180976986885, + "learning_rate": 1.3013483129327755e-05, + "loss": 0.0058, + "num_input_tokens_seen": 151487536, + "step": 124495 + }, + { + "epoch": 13.8656866020715, + "grad_norm": 0.12701702117919922, + "learning_rate": 1.3011350937065547e-05, + "loss": 0.0301, + "num_input_tokens_seen": 151493488, + "step": 124500 + }, + { + "epoch": 13.866243456955118, + "grad_norm": 0.5728303790092468, + "learning_rate": 1.3009218858045736e-05, + "loss": 0.0998, + "num_input_tokens_seen": 151499600, + "step": 124505 + }, + { + "epoch": 13.866800311838734, + "grad_norm": 0.01332290843129158, + "learning_rate": 1.3007086892288445e-05, + "loss": 0.0376, + "num_input_tokens_seen": 151505808, + "step": 124510 + }, + { + "epoch": 13.867357166722352, + "grad_norm": 1.8886572122573853, + "learning_rate": 1.300495503981384e-05, + "loss": 0.045, + "num_input_tokens_seen": 151512272, + "step": 124515 + }, + { + "epoch": 13.86791402160597, + "grad_norm": 1.6557625532150269, + "learning_rate": 1.3002823300642037e-05, + "loss": 0.0303, + "num_input_tokens_seen": 151518608, + "step": 124520 + }, + { + "epoch": 13.868470876489587, + "grad_norm": 0.005872550420463085, + "learning_rate": 1.3000691674793198e-05, + "loss": 0.014, + "num_input_tokens_seen": 151524688, + "step": 124525 + }, + { + "epoch": 13.869027731373205, + "grad_norm": 0.005638951901346445, + "learning_rate": 1.2998560162287418e-05, + "loss": 0.0222, + "num_input_tokens_seen": 151530832, + "step": 124530 + }, + { + "epoch": 13.86958458625682, + "grad_norm": 0.8537248373031616, + "learning_rate": 1.2996428763144864e-05, + "loss": 0.0318, + "num_input_tokens_seen": 151536784, + "step": 124535 + }, + { + "epoch": 13.870141441140438, + "grad_norm": 0.18757234513759613, + "learning_rate": 1.2994297477385647e-05, + "loss": 0.015, + "num_input_tokens_seen": 151542704, + "step": 124540 + }, + { + "epoch": 13.870698296024056, + "grad_norm": 0.4383784830570221, + "learning_rate": 1.2992166305029918e-05, + "loss": 0.0675, + "num_input_tokens_seen": 151548176, + "step": 124545 + }, + { + "epoch": 13.871255150907674, + "grad_norm": 0.011680283583700657, + "learning_rate": 1.2990035246097803e-05, + "loss": 0.0016, + "num_input_tokens_seen": 151554384, + "step": 124550 + }, + { + "epoch": 13.871812005791291, + "grad_norm": 0.03582223132252693, + "learning_rate": 1.2987904300609424e-05, + "loss": 0.0095, + "num_input_tokens_seen": 151560784, + "step": 124555 + }, + { + "epoch": 13.872368860674909, + "grad_norm": 0.00012130647519370541, + "learning_rate": 1.2985773468584906e-05, + "loss": 0.0225, + "num_input_tokens_seen": 151567344, + "step": 124560 + }, + { + "epoch": 13.872925715558525, + "grad_norm": 0.22418653964996338, + "learning_rate": 1.2983642750044389e-05, + "loss": 0.0043, + "num_input_tokens_seen": 151573456, + "step": 124565 + }, + { + "epoch": 13.873482570442143, + "grad_norm": 0.13882000744342804, + "learning_rate": 1.2981512145007996e-05, + "loss": 0.0392, + "num_input_tokens_seen": 151579536, + "step": 124570 + }, + { + "epoch": 13.87403942532576, + "grad_norm": 0.005098951049149036, + "learning_rate": 1.2979381653495847e-05, + "loss": 0.0071, + "num_input_tokens_seen": 151585712, + "step": 124575 + }, + { + "epoch": 13.874596280209378, + "grad_norm": 0.07985381036996841, + "learning_rate": 1.2977251275528062e-05, + "loss": 0.0378, + "num_input_tokens_seen": 151592016, + "step": 124580 + }, + { + "epoch": 13.875153135092996, + "grad_norm": 0.0698404312133789, + "learning_rate": 1.297512101112478e-05, + "loss": 0.0591, + "num_input_tokens_seen": 151597904, + "step": 124585 + }, + { + "epoch": 13.875709989976611, + "grad_norm": 0.1676773875951767, + "learning_rate": 1.2972990860306106e-05, + "loss": 0.054, + "num_input_tokens_seen": 151604144, + "step": 124590 + }, + { + "epoch": 13.876266844860229, + "grad_norm": 0.008732126094400883, + "learning_rate": 1.2970860823092188e-05, + "loss": 0.0044, + "num_input_tokens_seen": 151609648, + "step": 124595 + }, + { + "epoch": 13.876823699743847, + "grad_norm": 0.0015280407387763262, + "learning_rate": 1.2968730899503107e-05, + "loss": 0.0085, + "num_input_tokens_seen": 151615792, + "step": 124600 + }, + { + "epoch": 13.877380554627464, + "grad_norm": 0.6393769979476929, + "learning_rate": 1.2966601089559011e-05, + "loss": 0.0334, + "num_input_tokens_seen": 151621904, + "step": 124605 + }, + { + "epoch": 13.877937409511082, + "grad_norm": 1.2984760999679565, + "learning_rate": 1.2964471393280001e-05, + "loss": 0.0293, + "num_input_tokens_seen": 151628048, + "step": 124610 + }, + { + "epoch": 13.878494264394698, + "grad_norm": 0.06429211050271988, + "learning_rate": 1.296234181068621e-05, + "loss": 0.0462, + "num_input_tokens_seen": 151634032, + "step": 124615 + }, + { + "epoch": 13.879051119278316, + "grad_norm": 0.23179331421852112, + "learning_rate": 1.2960212341797745e-05, + "loss": 0.0619, + "num_input_tokens_seen": 151640048, + "step": 124620 + }, + { + "epoch": 13.879607974161933, + "grad_norm": 0.07167010754346848, + "learning_rate": 1.295808298663472e-05, + "loss": 0.0041, + "num_input_tokens_seen": 151646096, + "step": 124625 + }, + { + "epoch": 13.880164829045551, + "grad_norm": 0.12896579504013062, + "learning_rate": 1.295595374521724e-05, + "loss": 0.0032, + "num_input_tokens_seen": 151652368, + "step": 124630 + }, + { + "epoch": 13.880721683929169, + "grad_norm": 2.983858585357666, + "learning_rate": 1.2953824617565435e-05, + "loss": 0.0867, + "num_input_tokens_seen": 151658256, + "step": 124635 + }, + { + "epoch": 13.881278538812785, + "grad_norm": 0.48385313153266907, + "learning_rate": 1.2951695603699409e-05, + "loss": 0.0183, + "num_input_tokens_seen": 151664432, + "step": 124640 + }, + { + "epoch": 13.881835393696402, + "grad_norm": 0.7977194786071777, + "learning_rate": 1.294956670363927e-05, + "loss": 0.0301, + "num_input_tokens_seen": 151670864, + "step": 124645 + }, + { + "epoch": 13.88239224858002, + "grad_norm": 0.2159605175256729, + "learning_rate": 1.2947437917405119e-05, + "loss": 0.0072, + "num_input_tokens_seen": 151677264, + "step": 124650 + }, + { + "epoch": 13.882949103463638, + "grad_norm": 0.2820735573768616, + "learning_rate": 1.2945309245017085e-05, + "loss": 0.0115, + "num_input_tokens_seen": 151683408, + "step": 124655 + }, + { + "epoch": 13.883505958347255, + "grad_norm": 2.144365072250366, + "learning_rate": 1.2943180686495249e-05, + "loss": 0.0994, + "num_input_tokens_seen": 151689456, + "step": 124660 + }, + { + "epoch": 13.884062813230871, + "grad_norm": 0.10227619856595993, + "learning_rate": 1.2941052241859744e-05, + "loss": 0.0751, + "num_input_tokens_seen": 151695344, + "step": 124665 + }, + { + "epoch": 13.884619668114489, + "grad_norm": 0.4666879177093506, + "learning_rate": 1.2938923911130673e-05, + "loss": 0.0875, + "num_input_tokens_seen": 151701136, + "step": 124670 + }, + { + "epoch": 13.885176522998107, + "grad_norm": 0.6757957339286804, + "learning_rate": 1.2936795694328107e-05, + "loss": 0.0142, + "num_input_tokens_seen": 151707312, + "step": 124675 + }, + { + "epoch": 13.885733377881724, + "grad_norm": 0.00013324285100679845, + "learning_rate": 1.293466759147218e-05, + "loss": 0.0904, + "num_input_tokens_seen": 151713232, + "step": 124680 + }, + { + "epoch": 13.886290232765342, + "grad_norm": 0.00025339197600260377, + "learning_rate": 1.2932539602582978e-05, + "loss": 0.0409, + "num_input_tokens_seen": 151719152, + "step": 124685 + }, + { + "epoch": 13.88684708764896, + "grad_norm": 0.00013309122005011886, + "learning_rate": 1.2930411727680614e-05, + "loss": 0.072, + "num_input_tokens_seen": 151725616, + "step": 124690 + }, + { + "epoch": 13.887403942532575, + "grad_norm": 0.014127450063824654, + "learning_rate": 1.2928283966785183e-05, + "loss": 0.0904, + "num_input_tokens_seen": 151731600, + "step": 124695 + }, + { + "epoch": 13.887960797416193, + "grad_norm": 0.17714540660381317, + "learning_rate": 1.2926156319916776e-05, + "loss": 0.1428, + "num_input_tokens_seen": 151737872, + "step": 124700 + }, + { + "epoch": 13.88851765229981, + "grad_norm": 0.019208120182156563, + "learning_rate": 1.2924028787095493e-05, + "loss": 0.0253, + "num_input_tokens_seen": 151743280, + "step": 124705 + }, + { + "epoch": 13.889074507183429, + "grad_norm": 0.0014726076042279601, + "learning_rate": 1.292190136834144e-05, + "loss": 0.0091, + "num_input_tokens_seen": 151749520, + "step": 124710 + }, + { + "epoch": 13.889631362067046, + "grad_norm": 0.9977748394012451, + "learning_rate": 1.2919774063674706e-05, + "loss": 0.0224, + "num_input_tokens_seen": 151755536, + "step": 124715 + }, + { + "epoch": 13.890188216950662, + "grad_norm": 0.00012468945351429284, + "learning_rate": 1.2917646873115386e-05, + "loss": 0.0917, + "num_input_tokens_seen": 151761520, + "step": 124720 + }, + { + "epoch": 13.89074507183428, + "grad_norm": 0.22167585790157318, + "learning_rate": 1.291551979668356e-05, + "loss": 0.004, + "num_input_tokens_seen": 151767280, + "step": 124725 + }, + { + "epoch": 13.891301926717897, + "grad_norm": 0.018020223826169968, + "learning_rate": 1.2913392834399341e-05, + "loss": 0.105, + "num_input_tokens_seen": 151773392, + "step": 124730 + }, + { + "epoch": 13.891858781601515, + "grad_norm": 1.5004361867904663, + "learning_rate": 1.29112659862828e-05, + "loss": 0.0413, + "num_input_tokens_seen": 151779440, + "step": 124735 + }, + { + "epoch": 13.892415636485133, + "grad_norm": 0.06405770778656006, + "learning_rate": 1.2909139252354058e-05, + "loss": 0.0101, + "num_input_tokens_seen": 151785616, + "step": 124740 + }, + { + "epoch": 13.892972491368749, + "grad_norm": 0.009236815385520458, + "learning_rate": 1.2907012632633163e-05, + "loss": 0.1423, + "num_input_tokens_seen": 151791888, + "step": 124745 + }, + { + "epoch": 13.893529346252366, + "grad_norm": 0.07924944907426834, + "learning_rate": 1.290488612714023e-05, + "loss": 0.0068, + "num_input_tokens_seen": 151797712, + "step": 124750 + }, + { + "epoch": 13.894086201135984, + "grad_norm": 0.7623640894889832, + "learning_rate": 1.2902759735895334e-05, + "loss": 0.0362, + "num_input_tokens_seen": 151803696, + "step": 124755 + }, + { + "epoch": 13.894643056019602, + "grad_norm": 0.05650320276618004, + "learning_rate": 1.2900633458918571e-05, + "loss": 0.0023, + "num_input_tokens_seen": 151809968, + "step": 124760 + }, + { + "epoch": 13.89519991090322, + "grad_norm": 0.1741044670343399, + "learning_rate": 1.2898507296230016e-05, + "loss": 0.0393, + "num_input_tokens_seen": 151816208, + "step": 124765 + }, + { + "epoch": 13.895756765786835, + "grad_norm": 0.7052584886550903, + "learning_rate": 1.2896381247849759e-05, + "loss": 0.0164, + "num_input_tokens_seen": 151822384, + "step": 124770 + }, + { + "epoch": 13.896313620670453, + "grad_norm": 1.247209906578064, + "learning_rate": 1.2894255313797868e-05, + "loss": 0.048, + "num_input_tokens_seen": 151828496, + "step": 124775 + }, + { + "epoch": 13.89687047555407, + "grad_norm": 0.019474366679787636, + "learning_rate": 1.2892129494094443e-05, + "loss": 0.0145, + "num_input_tokens_seen": 151834736, + "step": 124780 + }, + { + "epoch": 13.897427330437688, + "grad_norm": 0.03148564696311951, + "learning_rate": 1.2890003788759556e-05, + "loss": 0.008, + "num_input_tokens_seen": 151840528, + "step": 124785 + }, + { + "epoch": 13.897984185321306, + "grad_norm": 0.007295726332813501, + "learning_rate": 1.2887878197813285e-05, + "loss": 0.0809, + "num_input_tokens_seen": 151846608, + "step": 124790 + }, + { + "epoch": 13.898541040204922, + "grad_norm": 0.12115093320608139, + "learning_rate": 1.2885752721275702e-05, + "loss": 0.0342, + "num_input_tokens_seen": 151852816, + "step": 124795 + }, + { + "epoch": 13.89909789508854, + "grad_norm": 0.0524982325732708, + "learning_rate": 1.2883627359166895e-05, + "loss": 0.0163, + "num_input_tokens_seen": 151858672, + "step": 124800 + }, + { + "epoch": 13.899654749972157, + "grad_norm": 0.017410172149538994, + "learning_rate": 1.2881502111506926e-05, + "loss": 0.1016, + "num_input_tokens_seen": 151864752, + "step": 124805 + }, + { + "epoch": 13.900211604855775, + "grad_norm": 0.016553642228245735, + "learning_rate": 1.2879376978315893e-05, + "loss": 0.1086, + "num_input_tokens_seen": 151870640, + "step": 124810 + }, + { + "epoch": 13.900768459739393, + "grad_norm": 0.8647677898406982, + "learning_rate": 1.287725195961385e-05, + "loss": 0.019, + "num_input_tokens_seen": 151876528, + "step": 124815 + }, + { + "epoch": 13.901325314623008, + "grad_norm": 0.053176701068878174, + "learning_rate": 1.287512705542088e-05, + "loss": 0.0032, + "num_input_tokens_seen": 151882704, + "step": 124820 + }, + { + "epoch": 13.901882169506626, + "grad_norm": 0.6173973679542542, + "learning_rate": 1.2873002265757037e-05, + "loss": 0.014, + "num_input_tokens_seen": 151888944, + "step": 124825 + }, + { + "epoch": 13.902439024390244, + "grad_norm": 1.007920265197754, + "learning_rate": 1.2870877590642413e-05, + "loss": 0.0324, + "num_input_tokens_seen": 151894864, + "step": 124830 + }, + { + "epoch": 13.902995879273861, + "grad_norm": 0.013812977820634842, + "learning_rate": 1.2868753030097069e-05, + "loss": 0.0016, + "num_input_tokens_seen": 151900784, + "step": 124835 + }, + { + "epoch": 13.90355273415748, + "grad_norm": 0.39482876658439636, + "learning_rate": 1.2866628584141071e-05, + "loss": 0.1003, + "num_input_tokens_seen": 151907280, + "step": 124840 + }, + { + "epoch": 13.904109589041095, + "grad_norm": 0.021194754168391228, + "learning_rate": 1.2864504252794477e-05, + "loss": 0.0043, + "num_input_tokens_seen": 151913552, + "step": 124845 + }, + { + "epoch": 13.904666443924713, + "grad_norm": 0.18292559683322906, + "learning_rate": 1.2862380036077374e-05, + "loss": 0.0235, + "num_input_tokens_seen": 151919152, + "step": 124850 + }, + { + "epoch": 13.90522329880833, + "grad_norm": 1.7658357620239258, + "learning_rate": 1.2860255934009812e-05, + "loss": 0.0455, + "num_input_tokens_seen": 151925136, + "step": 124855 + }, + { + "epoch": 13.905780153691948, + "grad_norm": 0.03533129394054413, + "learning_rate": 1.2858131946611865e-05, + "loss": 0.1646, + "num_input_tokens_seen": 151931152, + "step": 124860 + }, + { + "epoch": 13.906337008575566, + "grad_norm": 0.3775619864463806, + "learning_rate": 1.2856008073903574e-05, + "loss": 0.008, + "num_input_tokens_seen": 151936944, + "step": 124865 + }, + { + "epoch": 13.906893863459182, + "grad_norm": 0.054041776806116104, + "learning_rate": 1.2853884315905033e-05, + "loss": 0.0252, + "num_input_tokens_seen": 151942064, + "step": 124870 + }, + { + "epoch": 13.9074507183428, + "grad_norm": 0.0003980564943049103, + "learning_rate": 1.285176067263627e-05, + "loss": 0.0016, + "num_input_tokens_seen": 151948656, + "step": 124875 + }, + { + "epoch": 13.908007573226417, + "grad_norm": 0.4499404728412628, + "learning_rate": 1.2849637144117375e-05, + "loss": 0.0113, + "num_input_tokens_seen": 151954704, + "step": 124880 + }, + { + "epoch": 13.908564428110035, + "grad_norm": 0.00010139331425307319, + "learning_rate": 1.2847513730368388e-05, + "loss": 0.0224, + "num_input_tokens_seen": 151960848, + "step": 124885 + }, + { + "epoch": 13.909121282993652, + "grad_norm": 0.3058549761772156, + "learning_rate": 1.2845390431409374e-05, + "loss": 0.0192, + "num_input_tokens_seen": 151967120, + "step": 124890 + }, + { + "epoch": 13.909678137877268, + "grad_norm": 2.493243455886841, + "learning_rate": 1.2843267247260372e-05, + "loss": 0.1246, + "num_input_tokens_seen": 151972976, + "step": 124895 + }, + { + "epoch": 13.910234992760886, + "grad_norm": 0.007714574225246906, + "learning_rate": 1.2841144177941461e-05, + "loss": 0.0148, + "num_input_tokens_seen": 151979088, + "step": 124900 + }, + { + "epoch": 13.910791847644504, + "grad_norm": 0.9216485619544983, + "learning_rate": 1.2839021223472687e-05, + "loss": 0.0655, + "num_input_tokens_seen": 151985072, + "step": 124905 + }, + { + "epoch": 13.911348702528121, + "grad_norm": 0.8906350135803223, + "learning_rate": 1.2836898383874102e-05, + "loss": 0.0486, + "num_input_tokens_seen": 151991280, + "step": 124910 + }, + { + "epoch": 13.911905557411739, + "grad_norm": 1.1695365905761719, + "learning_rate": 1.2834775659165743e-05, + "loss": 0.0527, + "num_input_tokens_seen": 151996464, + "step": 124915 + }, + { + "epoch": 13.912462412295357, + "grad_norm": 0.02761172689497471, + "learning_rate": 1.2832653049367685e-05, + "loss": 0.0069, + "num_input_tokens_seen": 152002704, + "step": 124920 + }, + { + "epoch": 13.913019267178973, + "grad_norm": 0.005609576124697924, + "learning_rate": 1.2830530554499959e-05, + "loss": 0.0463, + "num_input_tokens_seen": 152008912, + "step": 124925 + }, + { + "epoch": 13.91357612206259, + "grad_norm": 3.218017578125, + "learning_rate": 1.282840817458264e-05, + "loss": 0.036, + "num_input_tokens_seen": 152014960, + "step": 124930 + }, + { + "epoch": 13.914132976946208, + "grad_norm": 0.007459624204784632, + "learning_rate": 1.2826285909635738e-05, + "loss": 0.0047, + "num_input_tokens_seen": 152021104, + "step": 124935 + }, + { + "epoch": 13.914689831829826, + "grad_norm": 2.3475606441497803, + "learning_rate": 1.2824163759679331e-05, + "loss": 0.1083, + "num_input_tokens_seen": 152026544, + "step": 124940 + }, + { + "epoch": 13.915246686713443, + "grad_norm": 0.002759338356554508, + "learning_rate": 1.2822041724733442e-05, + "loss": 0.0265, + "num_input_tokens_seen": 152032848, + "step": 124945 + }, + { + "epoch": 13.91580354159706, + "grad_norm": 0.3933156132698059, + "learning_rate": 1.281991980481813e-05, + "loss": 0.0464, + "num_input_tokens_seen": 152038768, + "step": 124950 + }, + { + "epoch": 13.916360396480677, + "grad_norm": 0.00042764499085024, + "learning_rate": 1.2817797999953441e-05, + "loss": 0.0094, + "num_input_tokens_seen": 152044848, + "step": 124955 + }, + { + "epoch": 13.916917251364294, + "grad_norm": 0.07345789670944214, + "learning_rate": 1.2815676310159407e-05, + "loss": 0.126, + "num_input_tokens_seen": 152051216, + "step": 124960 + }, + { + "epoch": 13.917474106247912, + "grad_norm": 0.5041064023971558, + "learning_rate": 1.2813554735456063e-05, + "loss": 0.0352, + "num_input_tokens_seen": 152057424, + "step": 124965 + }, + { + "epoch": 13.91803096113153, + "grad_norm": 0.07468695938587189, + "learning_rate": 1.2811433275863468e-05, + "loss": 0.0478, + "num_input_tokens_seen": 152063536, + "step": 124970 + }, + { + "epoch": 13.918587816015146, + "grad_norm": 0.0095363799482584, + "learning_rate": 1.2809311931401652e-05, + "loss": 0.0072, + "num_input_tokens_seen": 152069520, + "step": 124975 + }, + { + "epoch": 13.919144670898763, + "grad_norm": 0.07089460641145706, + "learning_rate": 1.2807190702090649e-05, + "loss": 0.0463, + "num_input_tokens_seen": 152075792, + "step": 124980 + }, + { + "epoch": 13.919701525782381, + "grad_norm": 0.7746467590332031, + "learning_rate": 1.280506958795049e-05, + "loss": 0.0599, + "num_input_tokens_seen": 152082032, + "step": 124985 + }, + { + "epoch": 13.920258380665999, + "grad_norm": 0.506131649017334, + "learning_rate": 1.2802948589001231e-05, + "loss": 0.0292, + "num_input_tokens_seen": 152087984, + "step": 124990 + }, + { + "epoch": 13.920815235549616, + "grad_norm": 0.02819034270942211, + "learning_rate": 1.2800827705262886e-05, + "loss": 0.0017, + "num_input_tokens_seen": 152094064, + "step": 124995 + }, + { + "epoch": 13.921372090433232, + "grad_norm": 1.1039447784423828, + "learning_rate": 1.2798706936755512e-05, + "loss": 0.1287, + "num_input_tokens_seen": 152100112, + "step": 125000 + }, + { + "epoch": 13.92192894531685, + "grad_norm": 0.03818998858332634, + "learning_rate": 1.2796586283499109e-05, + "loss": 0.1125, + "num_input_tokens_seen": 152106288, + "step": 125005 + }, + { + "epoch": 13.922485800200468, + "grad_norm": 0.799729585647583, + "learning_rate": 1.2794465745513735e-05, + "loss": 0.0923, + "num_input_tokens_seen": 152112240, + "step": 125010 + }, + { + "epoch": 13.923042655084085, + "grad_norm": 0.00047216142411343753, + "learning_rate": 1.2792345322819402e-05, + "loss": 0.0105, + "num_input_tokens_seen": 152118384, + "step": 125015 + }, + { + "epoch": 13.923599509967703, + "grad_norm": 0.0036465972661972046, + "learning_rate": 1.2790225015436157e-05, + "loss": 0.0397, + "num_input_tokens_seen": 152124624, + "step": 125020 + }, + { + "epoch": 13.92415636485132, + "grad_norm": 1.5200388431549072, + "learning_rate": 1.278810482338402e-05, + "loss": 0.0691, + "num_input_tokens_seen": 152130736, + "step": 125025 + }, + { + "epoch": 13.924713219734937, + "grad_norm": 0.0030389486346393824, + "learning_rate": 1.2785984746683016e-05, + "loss": 0.0061, + "num_input_tokens_seen": 152136976, + "step": 125030 + }, + { + "epoch": 13.925270074618554, + "grad_norm": 0.053122628480196, + "learning_rate": 1.2783864785353165e-05, + "loss": 0.0556, + "num_input_tokens_seen": 152143248, + "step": 125035 + }, + { + "epoch": 13.925826929502172, + "grad_norm": 0.002627549460157752, + "learning_rate": 1.2781744939414503e-05, + "loss": 0.1112, + "num_input_tokens_seen": 152149392, + "step": 125040 + }, + { + "epoch": 13.92638378438579, + "grad_norm": 0.0025460347533226013, + "learning_rate": 1.2779625208887053e-05, + "loss": 0.0313, + "num_input_tokens_seen": 152155408, + "step": 125045 + }, + { + "epoch": 13.926940639269407, + "grad_norm": 0.016871701925992966, + "learning_rate": 1.2777505593790834e-05, + "loss": 0.1318, + "num_input_tokens_seen": 152161584, + "step": 125050 + }, + { + "epoch": 13.927497494153023, + "grad_norm": 0.7790927886962891, + "learning_rate": 1.2775386094145855e-05, + "loss": 0.0178, + "num_input_tokens_seen": 152167856, + "step": 125055 + }, + { + "epoch": 13.92805434903664, + "grad_norm": 0.03633873909711838, + "learning_rate": 1.277326670997216e-05, + "loss": 0.004, + "num_input_tokens_seen": 152174192, + "step": 125060 + }, + { + "epoch": 13.928611203920259, + "grad_norm": 0.13037820160388947, + "learning_rate": 1.2771147441289746e-05, + "loss": 0.0687, + "num_input_tokens_seen": 152180272, + "step": 125065 + }, + { + "epoch": 13.929168058803876, + "grad_norm": 0.1515825390815735, + "learning_rate": 1.2769028288118651e-05, + "loss": 0.0085, + "num_input_tokens_seen": 152186512, + "step": 125070 + }, + { + "epoch": 13.929724913687494, + "grad_norm": 0.0005907793529331684, + "learning_rate": 1.2766909250478887e-05, + "loss": 0.0826, + "num_input_tokens_seen": 152192240, + "step": 125075 + }, + { + "epoch": 13.93028176857111, + "grad_norm": 1.2359201908111572, + "learning_rate": 1.2764790328390463e-05, + "loss": 0.0404, + "num_input_tokens_seen": 152198544, + "step": 125080 + }, + { + "epoch": 13.930838623454727, + "grad_norm": 0.07111874967813492, + "learning_rate": 1.2762671521873395e-05, + "loss": 0.0641, + "num_input_tokens_seen": 152205008, + "step": 125085 + }, + { + "epoch": 13.931395478338345, + "grad_norm": 0.010103728622198105, + "learning_rate": 1.2760552830947691e-05, + "loss": 0.0267, + "num_input_tokens_seen": 152210544, + "step": 125090 + }, + { + "epoch": 13.931952333221963, + "grad_norm": 0.1677383929491043, + "learning_rate": 1.2758434255633384e-05, + "loss": 0.1326, + "num_input_tokens_seen": 152216880, + "step": 125095 + }, + { + "epoch": 13.93250918810558, + "grad_norm": 0.7276656031608582, + "learning_rate": 1.2756315795950468e-05, + "loss": 0.0309, + "num_input_tokens_seen": 152223376, + "step": 125100 + }, + { + "epoch": 13.933066042989196, + "grad_norm": 0.04811643809080124, + "learning_rate": 1.2754197451918965e-05, + "loss": 0.0311, + "num_input_tokens_seen": 152229104, + "step": 125105 + }, + { + "epoch": 13.933622897872814, + "grad_norm": 0.0011829219292849302, + "learning_rate": 1.2752079223558866e-05, + "loss": 0.0024, + "num_input_tokens_seen": 152235216, + "step": 125110 + }, + { + "epoch": 13.934179752756432, + "grad_norm": 0.02388043887913227, + "learning_rate": 1.2749961110890202e-05, + "loss": 0.0259, + "num_input_tokens_seen": 152241232, + "step": 125115 + }, + { + "epoch": 13.93473660764005, + "grad_norm": 0.037701502442359924, + "learning_rate": 1.2747843113932966e-05, + "loss": 0.0529, + "num_input_tokens_seen": 152247536, + "step": 125120 + }, + { + "epoch": 13.935293462523667, + "grad_norm": 0.3275238573551178, + "learning_rate": 1.2745725232707173e-05, + "loss": 0.0304, + "num_input_tokens_seen": 152253616, + "step": 125125 + }, + { + "epoch": 13.935850317407283, + "grad_norm": 0.00214424473233521, + "learning_rate": 1.274360746723281e-05, + "loss": 0.0745, + "num_input_tokens_seen": 152259824, + "step": 125130 + }, + { + "epoch": 13.9364071722909, + "grad_norm": 0.0019281520508229733, + "learning_rate": 1.2741489817529905e-05, + "loss": 0.0632, + "num_input_tokens_seen": 152265904, + "step": 125135 + }, + { + "epoch": 13.936964027174518, + "grad_norm": 1.6537368297576904, + "learning_rate": 1.2739372283618439e-05, + "loss": 0.087, + "num_input_tokens_seen": 152272208, + "step": 125140 + }, + { + "epoch": 13.937520882058136, + "grad_norm": 0.37342971563339233, + "learning_rate": 1.2737254865518444e-05, + "loss": 0.1197, + "num_input_tokens_seen": 152278224, + "step": 125145 + }, + { + "epoch": 13.938077736941754, + "grad_norm": 0.8172793984413147, + "learning_rate": 1.2735137563249885e-05, + "loss": 0.1437, + "num_input_tokens_seen": 152284304, + "step": 125150 + }, + { + "epoch": 13.93863459182537, + "grad_norm": 0.00414624996483326, + "learning_rate": 1.2733020376832788e-05, + "loss": 0.0675, + "num_input_tokens_seen": 152290384, + "step": 125155 + }, + { + "epoch": 13.939191446708987, + "grad_norm": 1.2567293643951416, + "learning_rate": 1.273090330628713e-05, + "loss": 0.1244, + "num_input_tokens_seen": 152296784, + "step": 125160 + }, + { + "epoch": 13.939748301592605, + "grad_norm": 1.2600136995315552, + "learning_rate": 1.2728786351632929e-05, + "loss": 0.0191, + "num_input_tokens_seen": 152302832, + "step": 125165 + }, + { + "epoch": 13.940305156476223, + "grad_norm": 0.23985078930854797, + "learning_rate": 1.2726669512890174e-05, + "loss": 0.0112, + "num_input_tokens_seen": 152308976, + "step": 125170 + }, + { + "epoch": 13.94086201135984, + "grad_norm": 0.01900755986571312, + "learning_rate": 1.2724552790078859e-05, + "loss": 0.016, + "num_input_tokens_seen": 152315248, + "step": 125175 + }, + { + "epoch": 13.941418866243456, + "grad_norm": 0.26858100295066833, + "learning_rate": 1.2722436183218967e-05, + "loss": 0.006, + "num_input_tokens_seen": 152321392, + "step": 125180 + }, + { + "epoch": 13.941975721127074, + "grad_norm": 0.08696489781141281, + "learning_rate": 1.2720319692330512e-05, + "loss": 0.095, + "num_input_tokens_seen": 152327760, + "step": 125185 + }, + { + "epoch": 13.942532576010692, + "grad_norm": 0.021444307640194893, + "learning_rate": 1.2718203317433474e-05, + "loss": 0.1133, + "num_input_tokens_seen": 152333712, + "step": 125190 + }, + { + "epoch": 13.94308943089431, + "grad_norm": 0.03811502456665039, + "learning_rate": 1.2716087058547849e-05, + "loss": 0.0166, + "num_input_tokens_seen": 152339664, + "step": 125195 + }, + { + "epoch": 13.943646285777927, + "grad_norm": 0.25390422344207764, + "learning_rate": 1.2713970915693613e-05, + "loss": 0.0774, + "num_input_tokens_seen": 152345808, + "step": 125200 + }, + { + "epoch": 13.944203140661543, + "grad_norm": 0.02341357432305813, + "learning_rate": 1.2711854888890773e-05, + "loss": 0.0751, + "num_input_tokens_seen": 152351312, + "step": 125205 + }, + { + "epoch": 13.94475999554516, + "grad_norm": 0.3139062523841858, + "learning_rate": 1.2709738978159303e-05, + "loss": 0.0978, + "num_input_tokens_seen": 152357008, + "step": 125210 + }, + { + "epoch": 13.945316850428778, + "grad_norm": 0.014910156838595867, + "learning_rate": 1.2707623183519202e-05, + "loss": 0.0135, + "num_input_tokens_seen": 152362800, + "step": 125215 + }, + { + "epoch": 13.945873705312396, + "grad_norm": 1.0020663738250732, + "learning_rate": 1.270550750499045e-05, + "loss": 0.0258, + "num_input_tokens_seen": 152368688, + "step": 125220 + }, + { + "epoch": 13.946430560196013, + "grad_norm": 0.12586401402950287, + "learning_rate": 1.2703391942593024e-05, + "loss": 0.058, + "num_input_tokens_seen": 152374608, + "step": 125225 + }, + { + "epoch": 13.94698741507963, + "grad_norm": 0.23940059542655945, + "learning_rate": 1.2701276496346908e-05, + "loss": 0.0178, + "num_input_tokens_seen": 152380496, + "step": 125230 + }, + { + "epoch": 13.947544269963247, + "grad_norm": 0.8400098085403442, + "learning_rate": 1.26991611662721e-05, + "loss": 0.1512, + "num_input_tokens_seen": 152386256, + "step": 125235 + }, + { + "epoch": 13.948101124846865, + "grad_norm": 0.023734500631690025, + "learning_rate": 1.2697045952388569e-05, + "loss": 0.0081, + "num_input_tokens_seen": 152392400, + "step": 125240 + }, + { + "epoch": 13.948657979730482, + "grad_norm": 0.09595940262079239, + "learning_rate": 1.2694930854716295e-05, + "loss": 0.0863, + "num_input_tokens_seen": 152398704, + "step": 125245 + }, + { + "epoch": 13.9492148346141, + "grad_norm": 0.8291745185852051, + "learning_rate": 1.2692815873275249e-05, + "loss": 0.028, + "num_input_tokens_seen": 152404720, + "step": 125250 + }, + { + "epoch": 13.949771689497716, + "grad_norm": 0.3155030310153961, + "learning_rate": 1.2690701008085426e-05, + "loss": 0.0668, + "num_input_tokens_seen": 152410832, + "step": 125255 + }, + { + "epoch": 13.950328544381334, + "grad_norm": 0.0038597434759140015, + "learning_rate": 1.2688586259166785e-05, + "loss": 0.0192, + "num_input_tokens_seen": 152417136, + "step": 125260 + }, + { + "epoch": 13.950885399264951, + "grad_norm": 0.09421406686306, + "learning_rate": 1.268647162653933e-05, + "loss": 0.0071, + "num_input_tokens_seen": 152423056, + "step": 125265 + }, + { + "epoch": 13.951442254148569, + "grad_norm": 0.08969645202159882, + "learning_rate": 1.2684357110222994e-05, + "loss": 0.0141, + "num_input_tokens_seen": 152429136, + "step": 125270 + }, + { + "epoch": 13.951999109032187, + "grad_norm": 1.6336839199066162, + "learning_rate": 1.2682242710237785e-05, + "loss": 0.048, + "num_input_tokens_seen": 152434704, + "step": 125275 + }, + { + "epoch": 13.952555963915804, + "grad_norm": 0.34286341071128845, + "learning_rate": 1.2680128426603652e-05, + "loss": 0.0199, + "num_input_tokens_seen": 152440976, + "step": 125280 + }, + { + "epoch": 13.95311281879942, + "grad_norm": 0.4566628932952881, + "learning_rate": 1.2678014259340587e-05, + "loss": 0.0243, + "num_input_tokens_seen": 152446768, + "step": 125285 + }, + { + "epoch": 13.953669673683038, + "grad_norm": 0.0008071499760262668, + "learning_rate": 1.2675900208468549e-05, + "loss": 0.0087, + "num_input_tokens_seen": 152452496, + "step": 125290 + }, + { + "epoch": 13.954226528566656, + "grad_norm": 0.47056666016578674, + "learning_rate": 1.2673786274007504e-05, + "loss": 0.0197, + "num_input_tokens_seen": 152458480, + "step": 125295 + }, + { + "epoch": 13.954783383450273, + "grad_norm": 0.02473115362226963, + "learning_rate": 1.2671672455977418e-05, + "loss": 0.0242, + "num_input_tokens_seen": 152464560, + "step": 125300 + }, + { + "epoch": 13.955340238333891, + "grad_norm": 0.003847070736810565, + "learning_rate": 1.2669558754398273e-05, + "loss": 0.0819, + "num_input_tokens_seen": 152470288, + "step": 125305 + }, + { + "epoch": 13.955897093217507, + "grad_norm": 0.04133004695177078, + "learning_rate": 1.266744516929002e-05, + "loss": 0.0359, + "num_input_tokens_seen": 152476432, + "step": 125310 + }, + { + "epoch": 13.956453948101124, + "grad_norm": 0.12406430393457413, + "learning_rate": 1.2665331700672634e-05, + "loss": 0.0667, + "num_input_tokens_seen": 152482608, + "step": 125315 + }, + { + "epoch": 13.957010802984742, + "grad_norm": 0.0010396477300673723, + "learning_rate": 1.2663218348566059e-05, + "loss": 0.0761, + "num_input_tokens_seen": 152488624, + "step": 125320 + }, + { + "epoch": 13.95756765786836, + "grad_norm": 0.3990660309791565, + "learning_rate": 1.2661105112990279e-05, + "loss": 0.046, + "num_input_tokens_seen": 152494576, + "step": 125325 + }, + { + "epoch": 13.958124512751978, + "grad_norm": 0.21886122226715088, + "learning_rate": 1.2658991993965241e-05, + "loss": 0.0139, + "num_input_tokens_seen": 152500784, + "step": 125330 + }, + { + "epoch": 13.958681367635593, + "grad_norm": 0.008115950040519238, + "learning_rate": 1.2656878991510929e-05, + "loss": 0.0107, + "num_input_tokens_seen": 152507120, + "step": 125335 + }, + { + "epoch": 13.959238222519211, + "grad_norm": 0.02548971213400364, + "learning_rate": 1.2654766105647265e-05, + "loss": 0.0029, + "num_input_tokens_seen": 152513552, + "step": 125340 + }, + { + "epoch": 13.959795077402829, + "grad_norm": 0.004968736786395311, + "learning_rate": 1.2652653336394232e-05, + "loss": 0.0039, + "num_input_tokens_seen": 152519248, + "step": 125345 + }, + { + "epoch": 13.960351932286446, + "grad_norm": 0.007261046674102545, + "learning_rate": 1.2650540683771778e-05, + "loss": 0.02, + "num_input_tokens_seen": 152525360, + "step": 125350 + }, + { + "epoch": 13.960908787170064, + "grad_norm": 1.8253843784332275, + "learning_rate": 1.2648428147799867e-05, + "loss": 0.178, + "num_input_tokens_seen": 152531600, + "step": 125355 + }, + { + "epoch": 13.96146564205368, + "grad_norm": 0.008643782697618008, + "learning_rate": 1.2646315728498447e-05, + "loss": 0.0552, + "num_input_tokens_seen": 152537808, + "step": 125360 + }, + { + "epoch": 13.962022496937298, + "grad_norm": 0.0005918082897551358, + "learning_rate": 1.2644203425887475e-05, + "loss": 0.1494, + "num_input_tokens_seen": 152543664, + "step": 125365 + }, + { + "epoch": 13.962579351820915, + "grad_norm": 0.6461175680160522, + "learning_rate": 1.264209123998689e-05, + "loss": 0.142, + "num_input_tokens_seen": 152549904, + "step": 125370 + }, + { + "epoch": 13.963136206704533, + "grad_norm": 0.21714165806770325, + "learning_rate": 1.2639979170816662e-05, + "loss": 0.0162, + "num_input_tokens_seen": 152556208, + "step": 125375 + }, + { + "epoch": 13.96369306158815, + "grad_norm": 0.00039930507773533463, + "learning_rate": 1.2637867218396737e-05, + "loss": 0.0483, + "num_input_tokens_seen": 152562320, + "step": 125380 + }, + { + "epoch": 13.964249916471768, + "grad_norm": 0.003667952958494425, + "learning_rate": 1.263575538274706e-05, + "loss": 0.1215, + "num_input_tokens_seen": 152568496, + "step": 125385 + }, + { + "epoch": 13.964806771355384, + "grad_norm": 0.003465136280283332, + "learning_rate": 1.2633643663887568e-05, + "loss": 0.0574, + "num_input_tokens_seen": 152574256, + "step": 125390 + }, + { + "epoch": 13.965363626239002, + "grad_norm": 0.023583611473441124, + "learning_rate": 1.263153206183823e-05, + "loss": 0.0689, + "num_input_tokens_seen": 152580112, + "step": 125395 + }, + { + "epoch": 13.96592048112262, + "grad_norm": 0.1810879111289978, + "learning_rate": 1.2629420576618973e-05, + "loss": 0.003, + "num_input_tokens_seen": 152586256, + "step": 125400 + }, + { + "epoch": 13.966477336006237, + "grad_norm": 0.5799333453178406, + "learning_rate": 1.2627309208249765e-05, + "loss": 0.0365, + "num_input_tokens_seen": 152592080, + "step": 125405 + }, + { + "epoch": 13.967034190889855, + "grad_norm": 0.0143134081736207, + "learning_rate": 1.2625197956750518e-05, + "loss": 0.0037, + "num_input_tokens_seen": 152598224, + "step": 125410 + }, + { + "epoch": 13.96759104577347, + "grad_norm": 0.012920793145895004, + "learning_rate": 1.26230868221412e-05, + "loss": 0.0327, + "num_input_tokens_seen": 152604464, + "step": 125415 + }, + { + "epoch": 13.968147900657089, + "grad_norm": 0.004188019782304764, + "learning_rate": 1.2620975804441733e-05, + "loss": 0.0226, + "num_input_tokens_seen": 152610160, + "step": 125420 + }, + { + "epoch": 13.968704755540706, + "grad_norm": 0.7564956545829773, + "learning_rate": 1.2618864903672079e-05, + "loss": 0.1176, + "num_input_tokens_seen": 152615856, + "step": 125425 + }, + { + "epoch": 13.969261610424324, + "grad_norm": 0.783725380897522, + "learning_rate": 1.2616754119852164e-05, + "loss": 0.029, + "num_input_tokens_seen": 152622000, + "step": 125430 + }, + { + "epoch": 13.969818465307942, + "grad_norm": 0.030340859666466713, + "learning_rate": 1.2614643453001928e-05, + "loss": 0.0152, + "num_input_tokens_seen": 152628336, + "step": 125435 + }, + { + "epoch": 13.970375320191557, + "grad_norm": 0.009726460091769695, + "learning_rate": 1.26125329031413e-05, + "loss": 0.009, + "num_input_tokens_seen": 152634544, + "step": 125440 + }, + { + "epoch": 13.970932175075175, + "grad_norm": 0.4619603753089905, + "learning_rate": 1.2610422470290228e-05, + "loss": 0.0081, + "num_input_tokens_seen": 152640784, + "step": 125445 + }, + { + "epoch": 13.971489029958793, + "grad_norm": 0.021539805456995964, + "learning_rate": 1.2608312154468645e-05, + "loss": 0.0553, + "num_input_tokens_seen": 152646928, + "step": 125450 + }, + { + "epoch": 13.97204588484241, + "grad_norm": 0.73455810546875, + "learning_rate": 1.2606201955696484e-05, + "loss": 0.0732, + "num_input_tokens_seen": 152653360, + "step": 125455 + }, + { + "epoch": 13.972602739726028, + "grad_norm": 0.3492192029953003, + "learning_rate": 1.2604091873993663e-05, + "loss": 0.0168, + "num_input_tokens_seen": 152659728, + "step": 125460 + }, + { + "epoch": 13.973159594609644, + "grad_norm": 0.05060122162103653, + "learning_rate": 1.2601981909380134e-05, + "loss": 0.0045, + "num_input_tokens_seen": 152666032, + "step": 125465 + }, + { + "epoch": 13.973716449493262, + "grad_norm": 1.0363917350769043, + "learning_rate": 1.2599872061875812e-05, + "loss": 0.0391, + "num_input_tokens_seen": 152671536, + "step": 125470 + }, + { + "epoch": 13.97427330437688, + "grad_norm": 1.6689575910568237, + "learning_rate": 1.259776233150064e-05, + "loss": 0.0185, + "num_input_tokens_seen": 152677968, + "step": 125475 + }, + { + "epoch": 13.974830159260497, + "grad_norm": 1.4193758964538574, + "learning_rate": 1.2595652718274541e-05, + "loss": 0.0843, + "num_input_tokens_seen": 152684048, + "step": 125480 + }, + { + "epoch": 13.975387014144115, + "grad_norm": 0.016426876187324524, + "learning_rate": 1.259354322221744e-05, + "loss": 0.1177, + "num_input_tokens_seen": 152689968, + "step": 125485 + }, + { + "epoch": 13.97594386902773, + "grad_norm": 0.007581581827253103, + "learning_rate": 1.2591433843349265e-05, + "loss": 0.0183, + "num_input_tokens_seen": 152695856, + "step": 125490 + }, + { + "epoch": 13.976500723911348, + "grad_norm": 0.03151476010680199, + "learning_rate": 1.2589324581689926e-05, + "loss": 0.0156, + "num_input_tokens_seen": 152701968, + "step": 125495 + }, + { + "epoch": 13.977057578794966, + "grad_norm": 0.28698140382766724, + "learning_rate": 1.2587215437259375e-05, + "loss": 0.0302, + "num_input_tokens_seen": 152708240, + "step": 125500 + }, + { + "epoch": 13.977614433678584, + "grad_norm": 2.0007522106170654, + "learning_rate": 1.2585106410077512e-05, + "loss": 0.0256, + "num_input_tokens_seen": 152714416, + "step": 125505 + }, + { + "epoch": 13.978171288562201, + "grad_norm": 0.0007227135938592255, + "learning_rate": 1.258299750016427e-05, + "loss": 0.101, + "num_input_tokens_seen": 152720624, + "step": 125510 + }, + { + "epoch": 13.978728143445817, + "grad_norm": 0.09147009998559952, + "learning_rate": 1.2580888707539556e-05, + "loss": 0.0859, + "num_input_tokens_seen": 152726672, + "step": 125515 + }, + { + "epoch": 13.979284998329435, + "grad_norm": 0.694052517414093, + "learning_rate": 1.2578780032223303e-05, + "loss": 0.0415, + "num_input_tokens_seen": 152732656, + "step": 125520 + }, + { + "epoch": 13.979841853213053, + "grad_norm": 0.08086694031953812, + "learning_rate": 1.257667147423543e-05, + "loss": 0.021, + "num_input_tokens_seen": 152739152, + "step": 125525 + }, + { + "epoch": 13.98039870809667, + "grad_norm": 0.013620618730783463, + "learning_rate": 1.2574563033595843e-05, + "loss": 0.047, + "num_input_tokens_seen": 152744944, + "step": 125530 + }, + { + "epoch": 13.980955562980288, + "grad_norm": 0.02478632517158985, + "learning_rate": 1.2572454710324458e-05, + "loss": 0.004, + "num_input_tokens_seen": 152750992, + "step": 125535 + }, + { + "epoch": 13.981512417863904, + "grad_norm": 1.297396183013916, + "learning_rate": 1.2570346504441202e-05, + "loss": 0.0974, + "num_input_tokens_seen": 152757200, + "step": 125540 + }, + { + "epoch": 13.982069272747522, + "grad_norm": 0.01960465870797634, + "learning_rate": 1.2568238415965974e-05, + "loss": 0.0038, + "num_input_tokens_seen": 152763664, + "step": 125545 + }, + { + "epoch": 13.98262612763114, + "grad_norm": 2.1977765560150146, + "learning_rate": 1.2566130444918711e-05, + "loss": 0.0709, + "num_input_tokens_seen": 152770032, + "step": 125550 + }, + { + "epoch": 13.983182982514757, + "grad_norm": 0.011804976500570774, + "learning_rate": 1.2564022591319291e-05, + "loss": 0.0247, + "num_input_tokens_seen": 152776048, + "step": 125555 + }, + { + "epoch": 13.983739837398375, + "grad_norm": 0.08064208179712296, + "learning_rate": 1.2561914855187651e-05, + "loss": 0.2059, + "num_input_tokens_seen": 152782320, + "step": 125560 + }, + { + "epoch": 13.98429669228199, + "grad_norm": 0.060039520263671875, + "learning_rate": 1.255980723654368e-05, + "loss": 0.0009, + "num_input_tokens_seen": 152788464, + "step": 125565 + }, + { + "epoch": 13.984853547165608, + "grad_norm": 0.2655096650123596, + "learning_rate": 1.2557699735407306e-05, + "loss": 0.1531, + "num_input_tokens_seen": 152794160, + "step": 125570 + }, + { + "epoch": 13.985410402049226, + "grad_norm": 0.002180692972615361, + "learning_rate": 1.2555592351798426e-05, + "loss": 0.0149, + "num_input_tokens_seen": 152800240, + "step": 125575 + }, + { + "epoch": 13.985967256932843, + "grad_norm": 0.400389701128006, + "learning_rate": 1.2553485085736946e-05, + "loss": 0.0406, + "num_input_tokens_seen": 152806288, + "step": 125580 + }, + { + "epoch": 13.986524111816461, + "grad_norm": 3.121722936630249, + "learning_rate": 1.2551377937242764e-05, + "loss": 0.0505, + "num_input_tokens_seen": 152812496, + "step": 125585 + }, + { + "epoch": 13.987080966700077, + "grad_norm": 1.352637529373169, + "learning_rate": 1.2549270906335797e-05, + "loss": 0.0908, + "num_input_tokens_seen": 152818864, + "step": 125590 + }, + { + "epoch": 13.987637821583695, + "grad_norm": 0.9013049602508545, + "learning_rate": 1.2547163993035946e-05, + "loss": 0.0748, + "num_input_tokens_seen": 152825072, + "step": 125595 + }, + { + "epoch": 13.988194676467312, + "grad_norm": 2.1517226696014404, + "learning_rate": 1.2545057197363109e-05, + "loss": 0.084, + "num_input_tokens_seen": 152830960, + "step": 125600 + }, + { + "epoch": 13.98875153135093, + "grad_norm": 0.7051643133163452, + "learning_rate": 1.254295051933717e-05, + "loss": 0.0155, + "num_input_tokens_seen": 152837040, + "step": 125605 + }, + { + "epoch": 13.989308386234548, + "grad_norm": 0.00016397680155932903, + "learning_rate": 1.2540843958978058e-05, + "loss": 0.0093, + "num_input_tokens_seen": 152843312, + "step": 125610 + }, + { + "epoch": 13.989865241118165, + "grad_norm": 1.3323149681091309, + "learning_rate": 1.2538737516305643e-05, + "loss": 0.0473, + "num_input_tokens_seen": 152849840, + "step": 125615 + }, + { + "epoch": 13.990422096001781, + "grad_norm": 0.006518850568681955, + "learning_rate": 1.2536631191339848e-05, + "loss": 0.0017, + "num_input_tokens_seen": 152856144, + "step": 125620 + }, + { + "epoch": 13.990978950885399, + "grad_norm": 0.22547638416290283, + "learning_rate": 1.253452498410056e-05, + "loss": 0.051, + "num_input_tokens_seen": 152862320, + "step": 125625 + }, + { + "epoch": 13.991535805769017, + "grad_norm": 1.7702553272247314, + "learning_rate": 1.2532418894607667e-05, + "loss": 0.0895, + "num_input_tokens_seen": 152868592, + "step": 125630 + }, + { + "epoch": 13.992092660652634, + "grad_norm": 0.032289888709783554, + "learning_rate": 1.2530312922881057e-05, + "loss": 0.0729, + "num_input_tokens_seen": 152874512, + "step": 125635 + }, + { + "epoch": 13.992649515536252, + "grad_norm": 0.8930648565292358, + "learning_rate": 1.2528207068940646e-05, + "loss": 0.0744, + "num_input_tokens_seen": 152880944, + "step": 125640 + }, + { + "epoch": 13.993206370419868, + "grad_norm": 0.4504418969154358, + "learning_rate": 1.2526101332806305e-05, + "loss": 0.1155, + "num_input_tokens_seen": 152886704, + "step": 125645 + }, + { + "epoch": 13.993763225303486, + "grad_norm": 0.0023655497934669256, + "learning_rate": 1.2523995714497933e-05, + "loss": 0.0177, + "num_input_tokens_seen": 152893008, + "step": 125650 + }, + { + "epoch": 13.994320080187103, + "grad_norm": 0.001063558622263372, + "learning_rate": 1.2521890214035409e-05, + "loss": 0.0159, + "num_input_tokens_seen": 152899568, + "step": 125655 + }, + { + "epoch": 13.994876935070721, + "grad_norm": 0.09170696884393692, + "learning_rate": 1.2519784831438635e-05, + "loss": 0.0278, + "num_input_tokens_seen": 152905904, + "step": 125660 + }, + { + "epoch": 13.995433789954339, + "grad_norm": 0.06050734594464302, + "learning_rate": 1.2517679566727488e-05, + "loss": 0.0256, + "num_input_tokens_seen": 152911312, + "step": 125665 + }, + { + "epoch": 13.995990644837955, + "grad_norm": 0.0002103122096741572, + "learning_rate": 1.2515574419921877e-05, + "loss": 0.0403, + "num_input_tokens_seen": 152917328, + "step": 125670 + }, + { + "epoch": 13.996547499721572, + "grad_norm": 0.0071816700510680676, + "learning_rate": 1.2513469391041644e-05, + "loss": 0.004, + "num_input_tokens_seen": 152923056, + "step": 125675 + }, + { + "epoch": 13.99710435460519, + "grad_norm": 0.06334657967090607, + "learning_rate": 1.2511364480106711e-05, + "loss": 0.0905, + "num_input_tokens_seen": 152928816, + "step": 125680 + }, + { + "epoch": 13.997661209488808, + "grad_norm": 0.4936703145503998, + "learning_rate": 1.2509259687136932e-05, + "loss": 0.033, + "num_input_tokens_seen": 152934928, + "step": 125685 + }, + { + "epoch": 13.998218064372425, + "grad_norm": 0.0025026716757565737, + "learning_rate": 1.2507155012152217e-05, + "loss": 0.0886, + "num_input_tokens_seen": 152941200, + "step": 125690 + }, + { + "epoch": 13.998774919256041, + "grad_norm": 0.7858964204788208, + "learning_rate": 1.2505050455172429e-05, + "loss": 0.0521, + "num_input_tokens_seen": 152947664, + "step": 125695 + }, + { + "epoch": 13.999331774139659, + "grad_norm": 1.0975743532180786, + "learning_rate": 1.2502946016217451e-05, + "loss": 0.018, + "num_input_tokens_seen": 152953680, + "step": 125700 + }, + { + "epoch": 13.999888629023276, + "grad_norm": 0.18987751007080078, + "learning_rate": 1.2500841695307154e-05, + "loss": 0.0087, + "num_input_tokens_seen": 152960048, + "step": 125705 + }, + { + "epoch": 14.0, + "eval_loss": 0.07992763072252274, + "eval_runtime": 112.3318, + "eval_samples_per_second": 35.529, + "eval_steps_per_second": 8.884, + "num_input_tokens_seen": 152960704, + "step": 125706 + }, + { + "epoch": 14.000445483906894, + "grad_norm": 0.43629971146583557, + "learning_rate": 1.2498737492461424e-05, + "loss": 0.0654, + "num_input_tokens_seen": 152965408, + "step": 125710 + }, + { + "epoch": 14.001002338790512, + "grad_norm": 0.0012291480088606477, + "learning_rate": 1.249663340770014e-05, + "loss": 0.0222, + "num_input_tokens_seen": 152971392, + "step": 125715 + }, + { + "epoch": 14.001559193674128, + "grad_norm": 0.013119184412062168, + "learning_rate": 1.2494529441043167e-05, + "loss": 0.0087, + "num_input_tokens_seen": 152977408, + "step": 125720 + }, + { + "epoch": 14.002116048557745, + "grad_norm": 0.29927554726600647, + "learning_rate": 1.2492425592510376e-05, + "loss": 0.0291, + "num_input_tokens_seen": 152983520, + "step": 125725 + }, + { + "epoch": 14.002672903441363, + "grad_norm": 0.001116941450163722, + "learning_rate": 1.2490321862121654e-05, + "loss": 0.0099, + "num_input_tokens_seen": 152989760, + "step": 125730 + }, + { + "epoch": 14.00322975832498, + "grad_norm": 0.6519089937210083, + "learning_rate": 1.2488218249896857e-05, + "loss": 0.0229, + "num_input_tokens_seen": 152995904, + "step": 125735 + }, + { + "epoch": 14.003786613208598, + "grad_norm": 0.43937966227531433, + "learning_rate": 1.248611475585588e-05, + "loss": 0.015, + "num_input_tokens_seen": 153002016, + "step": 125740 + }, + { + "epoch": 14.004343468092214, + "grad_norm": 0.07631713896989822, + "learning_rate": 1.2484011380018556e-05, + "loss": 0.0224, + "num_input_tokens_seen": 153008000, + "step": 125745 + }, + { + "epoch": 14.004900322975832, + "grad_norm": 0.00048201988101936877, + "learning_rate": 1.2481908122404784e-05, + "loss": 0.0049, + "num_input_tokens_seen": 153014016, + "step": 125750 + }, + { + "epoch": 14.00545717785945, + "grad_norm": 0.11400731652975082, + "learning_rate": 1.2479804983034407e-05, + "loss": 0.0152, + "num_input_tokens_seen": 153020160, + "step": 125755 + }, + { + "epoch": 14.006014032743067, + "grad_norm": 0.22797183692455292, + "learning_rate": 1.2477701961927315e-05, + "loss": 0.0606, + "num_input_tokens_seen": 153026496, + "step": 125760 + }, + { + "epoch": 14.006570887626685, + "grad_norm": 0.0021282858215272427, + "learning_rate": 1.247559905910336e-05, + "loss": 0.064, + "num_input_tokens_seen": 153032608, + "step": 125765 + }, + { + "epoch": 14.007127742510303, + "grad_norm": 0.025024738162755966, + "learning_rate": 1.2473496274582405e-05, + "loss": 0.0585, + "num_input_tokens_seen": 153038816, + "step": 125770 + }, + { + "epoch": 14.007684597393919, + "grad_norm": 0.24220845103263855, + "learning_rate": 1.2471393608384304e-05, + "loss": 0.0158, + "num_input_tokens_seen": 153045024, + "step": 125775 + }, + { + "epoch": 14.008241452277536, + "grad_norm": 0.6792880892753601, + "learning_rate": 1.246929106052894e-05, + "loss": 0.0152, + "num_input_tokens_seen": 153050976, + "step": 125780 + }, + { + "epoch": 14.008798307161154, + "grad_norm": 0.18821053206920624, + "learning_rate": 1.2467188631036158e-05, + "loss": 0.029, + "num_input_tokens_seen": 153057216, + "step": 125785 + }, + { + "epoch": 14.009355162044772, + "grad_norm": 0.053991008549928665, + "learning_rate": 1.2465086319925823e-05, + "loss": 0.1226, + "num_input_tokens_seen": 153063424, + "step": 125790 + }, + { + "epoch": 14.00991201692839, + "grad_norm": 0.014857192523777485, + "learning_rate": 1.2462984127217781e-05, + "loss": 0.1105, + "num_input_tokens_seen": 153069600, + "step": 125795 + }, + { + "epoch": 14.010468871812005, + "grad_norm": 0.0044077541679143906, + "learning_rate": 1.2460882052931907e-05, + "loss": 0.0363, + "num_input_tokens_seen": 153075808, + "step": 125800 + }, + { + "epoch": 14.011025726695623, + "grad_norm": 0.0006446263869293034, + "learning_rate": 1.2458780097088038e-05, + "loss": 0.1049, + "num_input_tokens_seen": 153082048, + "step": 125805 + }, + { + "epoch": 14.01158258157924, + "grad_norm": 3.320859432220459, + "learning_rate": 1.245667825970605e-05, + "loss": 0.1019, + "num_input_tokens_seen": 153088288, + "step": 125810 + }, + { + "epoch": 14.012139436462858, + "grad_norm": 0.22508245706558228, + "learning_rate": 1.2454576540805785e-05, + "loss": 0.012, + "num_input_tokens_seen": 153094272, + "step": 125815 + }, + { + "epoch": 14.012696291346476, + "grad_norm": 0.10480830073356628, + "learning_rate": 1.2452474940407093e-05, + "loss": 0.1043, + "num_input_tokens_seen": 153100608, + "step": 125820 + }, + { + "epoch": 14.013253146230092, + "grad_norm": 0.17885689437389374, + "learning_rate": 1.2450373458529824e-05, + "loss": 0.0698, + "num_input_tokens_seen": 153106464, + "step": 125825 + }, + { + "epoch": 14.01381000111371, + "grad_norm": 0.12154972553253174, + "learning_rate": 1.2448272095193836e-05, + "loss": 0.0377, + "num_input_tokens_seen": 153111968, + "step": 125830 + }, + { + "epoch": 14.014366855997327, + "grad_norm": 0.2895383834838867, + "learning_rate": 1.2446170850418978e-05, + "loss": 0.0141, + "num_input_tokens_seen": 153118304, + "step": 125835 + }, + { + "epoch": 14.014923710880945, + "grad_norm": 0.09180514514446259, + "learning_rate": 1.2444069724225093e-05, + "loss": 0.041, + "num_input_tokens_seen": 153124544, + "step": 125840 + }, + { + "epoch": 14.015480565764562, + "grad_norm": 0.17455635964870453, + "learning_rate": 1.2441968716632021e-05, + "loss": 0.0042, + "num_input_tokens_seen": 153130848, + "step": 125845 + }, + { + "epoch": 14.016037420648178, + "grad_norm": 0.020237190648913383, + "learning_rate": 1.2439867827659624e-05, + "loss": 0.0118, + "num_input_tokens_seen": 153136864, + "step": 125850 + }, + { + "epoch": 14.016594275531796, + "grad_norm": 0.00040899935993365943, + "learning_rate": 1.243776705732774e-05, + "loss": 0.0008, + "num_input_tokens_seen": 153143136, + "step": 125855 + }, + { + "epoch": 14.017151130415414, + "grad_norm": 0.20648692548274994, + "learning_rate": 1.2435666405656207e-05, + "loss": 0.0966, + "num_input_tokens_seen": 153149504, + "step": 125860 + }, + { + "epoch": 14.017707985299031, + "grad_norm": 0.04113093391060829, + "learning_rate": 1.2433565872664866e-05, + "loss": 0.0079, + "num_input_tokens_seen": 153155712, + "step": 125865 + }, + { + "epoch": 14.018264840182649, + "grad_norm": 0.00031909722019918263, + "learning_rate": 1.243146545837357e-05, + "loss": 0.0098, + "num_input_tokens_seen": 153161952, + "step": 125870 + }, + { + "epoch": 14.018821695066265, + "grad_norm": 0.09621699154376984, + "learning_rate": 1.2429365162802146e-05, + "loss": 0.0766, + "num_input_tokens_seen": 153168224, + "step": 125875 + }, + { + "epoch": 14.019378549949883, + "grad_norm": 1.6746472120285034, + "learning_rate": 1.2427264985970447e-05, + "loss": 0.0463, + "num_input_tokens_seen": 153174528, + "step": 125880 + }, + { + "epoch": 14.0199354048335, + "grad_norm": 0.07269037514925003, + "learning_rate": 1.2425164927898303e-05, + "loss": 0.0291, + "num_input_tokens_seen": 153180672, + "step": 125885 + }, + { + "epoch": 14.020492259717118, + "grad_norm": 0.016854330897331238, + "learning_rate": 1.242306498860555e-05, + "loss": 0.0391, + "num_input_tokens_seen": 153186784, + "step": 125890 + }, + { + "epoch": 14.021049114600736, + "grad_norm": 0.007427832577377558, + "learning_rate": 1.2420965168112018e-05, + "loss": 0.0668, + "num_input_tokens_seen": 153192928, + "step": 125895 + }, + { + "epoch": 14.021605969484352, + "grad_norm": 0.08898738771677017, + "learning_rate": 1.2418865466437554e-05, + "loss": 0.0274, + "num_input_tokens_seen": 153198944, + "step": 125900 + }, + { + "epoch": 14.02216282436797, + "grad_norm": 0.002182931173592806, + "learning_rate": 1.2416765883601989e-05, + "loss": 0.0823, + "num_input_tokens_seen": 153204608, + "step": 125905 + }, + { + "epoch": 14.022719679251587, + "grad_norm": 0.002657025819644332, + "learning_rate": 1.241466641962515e-05, + "loss": 0.0182, + "num_input_tokens_seen": 153210720, + "step": 125910 + }, + { + "epoch": 14.023276534135205, + "grad_norm": 0.004552147351205349, + "learning_rate": 1.241256707452687e-05, + "loss": 0.056, + "num_input_tokens_seen": 153216992, + "step": 125915 + }, + { + "epoch": 14.023833389018822, + "grad_norm": 0.0013754955725744367, + "learning_rate": 1.241046784832697e-05, + "loss": 0.0013, + "num_input_tokens_seen": 153223456, + "step": 125920 + }, + { + "epoch": 14.024390243902438, + "grad_norm": 0.010465345345437527, + "learning_rate": 1.2408368741045297e-05, + "loss": 0.01, + "num_input_tokens_seen": 153229888, + "step": 125925 + }, + { + "epoch": 14.024947098786056, + "grad_norm": 0.006382990162819624, + "learning_rate": 1.2406269752701668e-05, + "loss": 0.0054, + "num_input_tokens_seen": 153236224, + "step": 125930 + }, + { + "epoch": 14.025503953669674, + "grad_norm": 0.009698492474853992, + "learning_rate": 1.2404170883315914e-05, + "loss": 0.1075, + "num_input_tokens_seen": 153242112, + "step": 125935 + }, + { + "epoch": 14.026060808553291, + "grad_norm": 0.008521663956344128, + "learning_rate": 1.2402072132907846e-05, + "loss": 0.0059, + "num_input_tokens_seen": 153248576, + "step": 125940 + }, + { + "epoch": 14.026617663436909, + "grad_norm": 7.652495696675032e-05, + "learning_rate": 1.2399973501497311e-05, + "loss": 0.1629, + "num_input_tokens_seen": 153255008, + "step": 125945 + }, + { + "epoch": 14.027174518320527, + "grad_norm": 0.016185490414500237, + "learning_rate": 1.239787498910411e-05, + "loss": 0.0532, + "num_input_tokens_seen": 153261120, + "step": 125950 + }, + { + "epoch": 14.027731373204142, + "grad_norm": 0.00013468747783917934, + "learning_rate": 1.2395776595748096e-05, + "loss": 0.0239, + "num_input_tokens_seen": 153267264, + "step": 125955 + }, + { + "epoch": 14.02828822808776, + "grad_norm": 0.45463141798973083, + "learning_rate": 1.2393678321449054e-05, + "loss": 0.0151, + "num_input_tokens_seen": 153273408, + "step": 125960 + }, + { + "epoch": 14.028845082971378, + "grad_norm": 0.2765648365020752, + "learning_rate": 1.2391580166226826e-05, + "loss": 0.09, + "num_input_tokens_seen": 153279136, + "step": 125965 + }, + { + "epoch": 14.029401937854995, + "grad_norm": 0.4458773732185364, + "learning_rate": 1.2389482130101218e-05, + "loss": 0.0212, + "num_input_tokens_seen": 153285056, + "step": 125970 + }, + { + "epoch": 14.029958792738613, + "grad_norm": 0.0002725052763707936, + "learning_rate": 1.2387384213092062e-05, + "loss": 0.0447, + "num_input_tokens_seen": 153291296, + "step": 125975 + }, + { + "epoch": 14.030515647622229, + "grad_norm": 3.021376132965088, + "learning_rate": 1.2385286415219169e-05, + "loss": 0.0492, + "num_input_tokens_seen": 153297344, + "step": 125980 + }, + { + "epoch": 14.031072502505847, + "grad_norm": 0.13722263276576996, + "learning_rate": 1.2383188736502352e-05, + "loss": 0.0129, + "num_input_tokens_seen": 153303328, + "step": 125985 + }, + { + "epoch": 14.031629357389464, + "grad_norm": 0.00030903139850124717, + "learning_rate": 1.2381091176961415e-05, + "loss": 0.0173, + "num_input_tokens_seen": 153309536, + "step": 125990 + }, + { + "epoch": 14.032186212273082, + "grad_norm": 0.33159127831459045, + "learning_rate": 1.2378993736616192e-05, + "loss": 0.1848, + "num_input_tokens_seen": 153315616, + "step": 125995 + }, + { + "epoch": 14.0327430671567, + "grad_norm": 0.020152602344751358, + "learning_rate": 1.2376896415486485e-05, + "loss": 0.0202, + "num_input_tokens_seen": 153321696, + "step": 126000 + }, + { + "epoch": 14.033299922040316, + "grad_norm": 0.006074476521462202, + "learning_rate": 1.2374799213592107e-05, + "loss": 0.0101, + "num_input_tokens_seen": 153327808, + "step": 126005 + }, + { + "epoch": 14.033856776923933, + "grad_norm": 0.0001432630087947473, + "learning_rate": 1.2372702130952854e-05, + "loss": 0.1016, + "num_input_tokens_seen": 153334112, + "step": 126010 + }, + { + "epoch": 14.034413631807551, + "grad_norm": 2.6713082790374756, + "learning_rate": 1.2370605167588555e-05, + "loss": 0.0718, + "num_input_tokens_seen": 153339872, + "step": 126015 + }, + { + "epoch": 14.034970486691169, + "grad_norm": 0.0002005468704737723, + "learning_rate": 1.2368508323519002e-05, + "loss": 0.0027, + "num_input_tokens_seen": 153346208, + "step": 126020 + }, + { + "epoch": 14.035527341574786, + "grad_norm": 1.3346415758132935, + "learning_rate": 1.2366411598764017e-05, + "loss": 0.0268, + "num_input_tokens_seen": 153352160, + "step": 126025 + }, + { + "epoch": 14.036084196458402, + "grad_norm": 0.09116724133491516, + "learning_rate": 1.2364314993343392e-05, + "loss": 0.0161, + "num_input_tokens_seen": 153358400, + "step": 126030 + }, + { + "epoch": 14.03664105134202, + "grad_norm": 0.28023561835289, + "learning_rate": 1.236221850727694e-05, + "loss": 0.0078, + "num_input_tokens_seen": 153364448, + "step": 126035 + }, + { + "epoch": 14.037197906225638, + "grad_norm": 1.443272352218628, + "learning_rate": 1.236012214058445e-05, + "loss": 0.032, + "num_input_tokens_seen": 153369792, + "step": 126040 + }, + { + "epoch": 14.037754761109255, + "grad_norm": 0.03723539412021637, + "learning_rate": 1.2358025893285741e-05, + "loss": 0.0409, + "num_input_tokens_seen": 153375616, + "step": 126045 + }, + { + "epoch": 14.038311615992873, + "grad_norm": 2.0543854236602783, + "learning_rate": 1.2355929765400607e-05, + "loss": 0.1751, + "num_input_tokens_seen": 153381920, + "step": 126050 + }, + { + "epoch": 14.038868470876489, + "grad_norm": 0.004081588238477707, + "learning_rate": 1.2353833756948844e-05, + "loss": 0.0269, + "num_input_tokens_seen": 153388160, + "step": 126055 + }, + { + "epoch": 14.039425325760106, + "grad_norm": 0.06367382407188416, + "learning_rate": 1.2351737867950245e-05, + "loss": 0.0602, + "num_input_tokens_seen": 153394496, + "step": 126060 + }, + { + "epoch": 14.039982180643724, + "grad_norm": 0.8362192511558533, + "learning_rate": 1.2349642098424626e-05, + "loss": 0.0183, + "num_input_tokens_seen": 153400256, + "step": 126065 + }, + { + "epoch": 14.040539035527342, + "grad_norm": 0.0734826996922493, + "learning_rate": 1.2347546448391762e-05, + "loss": 0.0028, + "num_input_tokens_seen": 153406176, + "step": 126070 + }, + { + "epoch": 14.04109589041096, + "grad_norm": 0.0003252757014706731, + "learning_rate": 1.2345450917871479e-05, + "loss": 0.0984, + "num_input_tokens_seen": 153412224, + "step": 126075 + }, + { + "epoch": 14.041652745294575, + "grad_norm": 0.00036632930277846754, + "learning_rate": 1.2343355506883531e-05, + "loss": 0.008, + "num_input_tokens_seen": 153418432, + "step": 126080 + }, + { + "epoch": 14.042209600178193, + "grad_norm": 0.5256122946739197, + "learning_rate": 1.2341260215447742e-05, + "loss": 0.1414, + "num_input_tokens_seen": 153424544, + "step": 126085 + }, + { + "epoch": 14.04276645506181, + "grad_norm": 2.1274683475494385, + "learning_rate": 1.233916504358388e-05, + "loss": 0.0948, + "num_input_tokens_seen": 153430944, + "step": 126090 + }, + { + "epoch": 14.043323309945428, + "grad_norm": 0.0012573539279401302, + "learning_rate": 1.2337069991311758e-05, + "loss": 0.0055, + "num_input_tokens_seen": 153436544, + "step": 126095 + }, + { + "epoch": 14.043880164829046, + "grad_norm": 0.050778016448020935, + "learning_rate": 1.2334975058651154e-05, + "loss": 0.002, + "num_input_tokens_seen": 153442528, + "step": 126100 + }, + { + "epoch": 14.044437019712662, + "grad_norm": 0.09743675589561462, + "learning_rate": 1.233288024562186e-05, + "loss": 0.0165, + "num_input_tokens_seen": 153448384, + "step": 126105 + }, + { + "epoch": 14.04499387459628, + "grad_norm": 0.0990050807595253, + "learning_rate": 1.2330785552243651e-05, + "loss": 0.0391, + "num_input_tokens_seen": 153454432, + "step": 126110 + }, + { + "epoch": 14.045550729479897, + "grad_norm": 0.24095845222473145, + "learning_rate": 1.2328690978536334e-05, + "loss": 0.009, + "num_input_tokens_seen": 153460608, + "step": 126115 + }, + { + "epoch": 14.046107584363515, + "grad_norm": 0.22447676956653595, + "learning_rate": 1.232659652451968e-05, + "loss": 0.0309, + "num_input_tokens_seen": 153466848, + "step": 126120 + }, + { + "epoch": 14.046664439247133, + "grad_norm": 0.8119794726371765, + "learning_rate": 1.2324502190213475e-05, + "loss": 0.0143, + "num_input_tokens_seen": 153472736, + "step": 126125 + }, + { + "epoch": 14.04722129413075, + "grad_norm": 6.4321746826171875, + "learning_rate": 1.2322407975637495e-05, + "loss": 0.0809, + "num_input_tokens_seen": 153478976, + "step": 126130 + }, + { + "epoch": 14.047778149014366, + "grad_norm": 0.5063005089759827, + "learning_rate": 1.2320313880811537e-05, + "loss": 0.0054, + "num_input_tokens_seen": 153485184, + "step": 126135 + }, + { + "epoch": 14.048335003897984, + "grad_norm": 0.0007337324204854667, + "learning_rate": 1.2318219905755365e-05, + "loss": 0.0084, + "num_input_tokens_seen": 153491552, + "step": 126140 + }, + { + "epoch": 14.048891858781602, + "grad_norm": 0.0021764824632555246, + "learning_rate": 1.2316126050488783e-05, + "loss": 0.0158, + "num_input_tokens_seen": 153497024, + "step": 126145 + }, + { + "epoch": 14.04944871366522, + "grad_norm": 0.10430707037448883, + "learning_rate": 1.2314032315031536e-05, + "loss": 0.0053, + "num_input_tokens_seen": 153503040, + "step": 126150 + }, + { + "epoch": 14.050005568548837, + "grad_norm": 0.011674593202769756, + "learning_rate": 1.2311938699403428e-05, + "loss": 0.0945, + "num_input_tokens_seen": 153508960, + "step": 126155 + }, + { + "epoch": 14.050562423432453, + "grad_norm": 1.7851413488388062, + "learning_rate": 1.2309845203624212e-05, + "loss": 0.0686, + "num_input_tokens_seen": 153514720, + "step": 126160 + }, + { + "epoch": 14.05111927831607, + "grad_norm": 0.1279185265302658, + "learning_rate": 1.2307751827713685e-05, + "loss": 0.0018, + "num_input_tokens_seen": 153520800, + "step": 126165 + }, + { + "epoch": 14.051676133199688, + "grad_norm": 0.01550828106701374, + "learning_rate": 1.2305658571691611e-05, + "loss": 0.15, + "num_input_tokens_seen": 153526784, + "step": 126170 + }, + { + "epoch": 14.052232988083306, + "grad_norm": 1.372452974319458, + "learning_rate": 1.2303565435577767e-05, + "loss": 0.1318, + "num_input_tokens_seen": 153533376, + "step": 126175 + }, + { + "epoch": 14.052789842966924, + "grad_norm": 0.18225252628326416, + "learning_rate": 1.2301472419391905e-05, + "loss": 0.0085, + "num_input_tokens_seen": 153539040, + "step": 126180 + }, + { + "epoch": 14.05334669785054, + "grad_norm": 0.6803327202796936, + "learning_rate": 1.229937952315382e-05, + "loss": 0.0452, + "num_input_tokens_seen": 153544960, + "step": 126185 + }, + { + "epoch": 14.053903552734157, + "grad_norm": 0.15408354997634888, + "learning_rate": 1.2297286746883272e-05, + "loss": 0.0557, + "num_input_tokens_seen": 153550880, + "step": 126190 + }, + { + "epoch": 14.054460407617775, + "grad_norm": 0.18694685399532318, + "learning_rate": 1.2295194090600025e-05, + "loss": 0.0328, + "num_input_tokens_seen": 153556800, + "step": 126195 + }, + { + "epoch": 14.055017262501392, + "grad_norm": 0.1350756138563156, + "learning_rate": 1.2293101554323843e-05, + "loss": 0.0076, + "num_input_tokens_seen": 153563072, + "step": 126200 + }, + { + "epoch": 14.05557411738501, + "grad_norm": 0.12311726808547974, + "learning_rate": 1.2291009138074505e-05, + "loss": 0.0247, + "num_input_tokens_seen": 153569216, + "step": 126205 + }, + { + "epoch": 14.056130972268626, + "grad_norm": 0.12420579791069031, + "learning_rate": 1.2288916841871761e-05, + "loss": 0.003, + "num_input_tokens_seen": 153575488, + "step": 126210 + }, + { + "epoch": 14.056687827152244, + "grad_norm": 0.4709353744983673, + "learning_rate": 1.2286824665735389e-05, + "loss": 0.0865, + "num_input_tokens_seen": 153581696, + "step": 126215 + }, + { + "epoch": 14.057244682035861, + "grad_norm": 0.012971372343599796, + "learning_rate": 1.2284732609685142e-05, + "loss": 0.0298, + "num_input_tokens_seen": 153587712, + "step": 126220 + }, + { + "epoch": 14.057801536919479, + "grad_norm": 0.11587820947170258, + "learning_rate": 1.2282640673740787e-05, + "loss": 0.0116, + "num_input_tokens_seen": 153593792, + "step": 126225 + }, + { + "epoch": 14.058358391803097, + "grad_norm": 0.006692859809845686, + "learning_rate": 1.2280548857922067e-05, + "loss": 0.0012, + "num_input_tokens_seen": 153600064, + "step": 126230 + }, + { + "epoch": 14.058915246686713, + "grad_norm": 0.06402495503425598, + "learning_rate": 1.2278457162248763e-05, + "loss": 0.0683, + "num_input_tokens_seen": 153606368, + "step": 126235 + }, + { + "epoch": 14.05947210157033, + "grad_norm": 0.018480917438864708, + "learning_rate": 1.2276365586740626e-05, + "loss": 0.0116, + "num_input_tokens_seen": 153612000, + "step": 126240 + }, + { + "epoch": 14.060028956453948, + "grad_norm": 0.1268446445465088, + "learning_rate": 1.2274274131417407e-05, + "loss": 0.0609, + "num_input_tokens_seen": 153618272, + "step": 126245 + }, + { + "epoch": 14.060585811337566, + "grad_norm": 0.0011118322145193815, + "learning_rate": 1.2272182796298858e-05, + "loss": 0.048, + "num_input_tokens_seen": 153624480, + "step": 126250 + }, + { + "epoch": 14.061142666221183, + "grad_norm": 0.002469392726197839, + "learning_rate": 1.2270091581404747e-05, + "loss": 0.0035, + "num_input_tokens_seen": 153630976, + "step": 126255 + }, + { + "epoch": 14.0616995211048, + "grad_norm": 0.5142637491226196, + "learning_rate": 1.2268000486754813e-05, + "loss": 0.0199, + "num_input_tokens_seen": 153636928, + "step": 126260 + }, + { + "epoch": 14.062256375988417, + "grad_norm": 0.0016900445334613323, + "learning_rate": 1.226590951236883e-05, + "loss": 0.0076, + "num_input_tokens_seen": 153643104, + "step": 126265 + }, + { + "epoch": 14.062813230872035, + "grad_norm": 0.1778668314218521, + "learning_rate": 1.2263818658266519e-05, + "loss": 0.0121, + "num_input_tokens_seen": 153649408, + "step": 126270 + }, + { + "epoch": 14.063370085755652, + "grad_norm": 2.1934428215026855, + "learning_rate": 1.2261727924467653e-05, + "loss": 0.0766, + "num_input_tokens_seen": 153655584, + "step": 126275 + }, + { + "epoch": 14.06392694063927, + "grad_norm": 1.0771461725234985, + "learning_rate": 1.2259637310991965e-05, + "loss": 0.0623, + "num_input_tokens_seen": 153661728, + "step": 126280 + }, + { + "epoch": 14.064483795522886, + "grad_norm": 0.04228246584534645, + "learning_rate": 1.2257546817859217e-05, + "loss": 0.0291, + "num_input_tokens_seen": 153667680, + "step": 126285 + }, + { + "epoch": 14.065040650406504, + "grad_norm": 0.004497379995882511, + "learning_rate": 1.2255456445089147e-05, + "loss": 0.0167, + "num_input_tokens_seen": 153673792, + "step": 126290 + }, + { + "epoch": 14.065597505290121, + "grad_norm": 0.24928972125053406, + "learning_rate": 1.2253366192701504e-05, + "loss": 0.0059, + "num_input_tokens_seen": 153679968, + "step": 126295 + }, + { + "epoch": 14.066154360173739, + "grad_norm": 0.03800933063030243, + "learning_rate": 1.2251276060716018e-05, + "loss": 0.0249, + "num_input_tokens_seen": 153686080, + "step": 126300 + }, + { + "epoch": 14.066711215057357, + "grad_norm": 0.8817852139472961, + "learning_rate": 1.2249186049152456e-05, + "loss": 0.0224, + "num_input_tokens_seen": 153691744, + "step": 126305 + }, + { + "epoch": 14.067268069940974, + "grad_norm": 0.7318074703216553, + "learning_rate": 1.2247096158030546e-05, + "loss": 0.0495, + "num_input_tokens_seen": 153697984, + "step": 126310 + }, + { + "epoch": 14.06782492482459, + "grad_norm": 0.005155322607606649, + "learning_rate": 1.224500638737003e-05, + "loss": 0.009, + "num_input_tokens_seen": 153704032, + "step": 126315 + }, + { + "epoch": 14.068381779708208, + "grad_norm": 0.0017179084243252873, + "learning_rate": 1.224291673719065e-05, + "loss": 0.0201, + "num_input_tokens_seen": 153709664, + "step": 126320 + }, + { + "epoch": 14.068938634591825, + "grad_norm": 0.024191007018089294, + "learning_rate": 1.2240827207512132e-05, + "loss": 0.0903, + "num_input_tokens_seen": 153716064, + "step": 126325 + }, + { + "epoch": 14.069495489475443, + "grad_norm": 0.5588803291320801, + "learning_rate": 1.2238737798354233e-05, + "loss": 0.0114, + "num_input_tokens_seen": 153721984, + "step": 126330 + }, + { + "epoch": 14.07005234435906, + "grad_norm": 0.09259408712387085, + "learning_rate": 1.2236648509736678e-05, + "loss": 0.0156, + "num_input_tokens_seen": 153728096, + "step": 126335 + }, + { + "epoch": 14.070609199242677, + "grad_norm": 0.13831734657287598, + "learning_rate": 1.2234559341679206e-05, + "loss": 0.0193, + "num_input_tokens_seen": 153733408, + "step": 126340 + }, + { + "epoch": 14.071166054126294, + "grad_norm": 0.00607312610372901, + "learning_rate": 1.2232470294201537e-05, + "loss": 0.0019, + "num_input_tokens_seen": 153739392, + "step": 126345 + }, + { + "epoch": 14.071722909009912, + "grad_norm": 1.118358850479126, + "learning_rate": 1.2230381367323424e-05, + "loss": 0.0212, + "num_input_tokens_seen": 153745408, + "step": 126350 + }, + { + "epoch": 14.07227976389353, + "grad_norm": 0.04994472488760948, + "learning_rate": 1.2228292561064581e-05, + "loss": 0.007, + "num_input_tokens_seen": 153751776, + "step": 126355 + }, + { + "epoch": 14.072836618777147, + "grad_norm": 0.6465068459510803, + "learning_rate": 1.2226203875444767e-05, + "loss": 0.0681, + "num_input_tokens_seen": 153758016, + "step": 126360 + }, + { + "epoch": 14.073393473660763, + "grad_norm": 0.06391236186027527, + "learning_rate": 1.2224115310483672e-05, + "loss": 0.0011, + "num_input_tokens_seen": 153764352, + "step": 126365 + }, + { + "epoch": 14.073950328544381, + "grad_norm": 0.5532223582267761, + "learning_rate": 1.2222026866201056e-05, + "loss": 0.0161, + "num_input_tokens_seen": 153770368, + "step": 126370 + }, + { + "epoch": 14.074507183427999, + "grad_norm": 0.0033793996553868055, + "learning_rate": 1.2219938542616621e-05, + "loss": 0.0219, + "num_input_tokens_seen": 153776160, + "step": 126375 + }, + { + "epoch": 14.075064038311616, + "grad_norm": 0.9340412616729736, + "learning_rate": 1.2217850339750118e-05, + "loss": 0.0204, + "num_input_tokens_seen": 153781664, + "step": 126380 + }, + { + "epoch": 14.075620893195234, + "grad_norm": 1.508499264717102, + "learning_rate": 1.2215762257621254e-05, + "loss": 0.0717, + "num_input_tokens_seen": 153787616, + "step": 126385 + }, + { + "epoch": 14.07617774807885, + "grad_norm": 0.056869398802518845, + "learning_rate": 1.2213674296249763e-05, + "loss": 0.0101, + "num_input_tokens_seen": 153793568, + "step": 126390 + }, + { + "epoch": 14.076734602962468, + "grad_norm": 0.0004268190241418779, + "learning_rate": 1.2211586455655352e-05, + "loss": 0.0026, + "num_input_tokens_seen": 153799616, + "step": 126395 + }, + { + "epoch": 14.077291457846085, + "grad_norm": 0.003900448326021433, + "learning_rate": 1.2209498735857764e-05, + "loss": 0.0009, + "num_input_tokens_seen": 153805600, + "step": 126400 + }, + { + "epoch": 14.077848312729703, + "grad_norm": 0.6443041563034058, + "learning_rate": 1.2207411136876704e-05, + "loss": 0.1076, + "num_input_tokens_seen": 153812064, + "step": 126405 + }, + { + "epoch": 14.07840516761332, + "grad_norm": 0.274539053440094, + "learning_rate": 1.2205323658731897e-05, + "loss": 0.043, + "num_input_tokens_seen": 153818272, + "step": 126410 + }, + { + "epoch": 14.078962022496937, + "grad_norm": 0.10391441732645035, + "learning_rate": 1.220323630144305e-05, + "loss": 0.0016, + "num_input_tokens_seen": 153823904, + "step": 126415 + }, + { + "epoch": 14.079518877380554, + "grad_norm": 0.049471866339445114, + "learning_rate": 1.2201149065029899e-05, + "loss": 0.0552, + "num_input_tokens_seen": 153829952, + "step": 126420 + }, + { + "epoch": 14.080075732264172, + "grad_norm": 0.08321447670459747, + "learning_rate": 1.2199061949512136e-05, + "loss": 0.0629, + "num_input_tokens_seen": 153836352, + "step": 126425 + }, + { + "epoch": 14.08063258714779, + "grad_norm": 0.3648916780948639, + "learning_rate": 1.21969749549095e-05, + "loss": 0.0215, + "num_input_tokens_seen": 153842496, + "step": 126430 + }, + { + "epoch": 14.081189442031407, + "grad_norm": 0.0026751933619379997, + "learning_rate": 1.2194888081241696e-05, + "loss": 0.0656, + "num_input_tokens_seen": 153848704, + "step": 126435 + }, + { + "epoch": 14.081746296915023, + "grad_norm": 0.21369314193725586, + "learning_rate": 1.2192801328528433e-05, + "loss": 0.0158, + "num_input_tokens_seen": 153855104, + "step": 126440 + }, + { + "epoch": 14.08230315179864, + "grad_norm": 0.00014063487469684333, + "learning_rate": 1.219071469678941e-05, + "loss": 0.0016, + "num_input_tokens_seen": 153861184, + "step": 126445 + }, + { + "epoch": 14.082860006682258, + "grad_norm": 0.022113902494311333, + "learning_rate": 1.218862818604436e-05, + "loss": 0.0008, + "num_input_tokens_seen": 153867392, + "step": 126450 + }, + { + "epoch": 14.083416861565876, + "grad_norm": 0.0014815805479884148, + "learning_rate": 1.218654179631298e-05, + "loss": 0.0019, + "num_input_tokens_seen": 153873152, + "step": 126455 + }, + { + "epoch": 14.083973716449494, + "grad_norm": 0.07994390279054642, + "learning_rate": 1.2184455527614982e-05, + "loss": 0.0362, + "num_input_tokens_seen": 153879264, + "step": 126460 + }, + { + "epoch": 14.084530571333111, + "grad_norm": 0.0064360760152339935, + "learning_rate": 1.2182369379970056e-05, + "loss": 0.0732, + "num_input_tokens_seen": 153885088, + "step": 126465 + }, + { + "epoch": 14.085087426216727, + "grad_norm": 1.0496124029159546, + "learning_rate": 1.2180283353397934e-05, + "loss": 0.078, + "num_input_tokens_seen": 153891072, + "step": 126470 + }, + { + "epoch": 14.085644281100345, + "grad_norm": 0.09843393415212631, + "learning_rate": 1.2178197447918293e-05, + "loss": 0.0198, + "num_input_tokens_seen": 153897184, + "step": 126475 + }, + { + "epoch": 14.086201135983963, + "grad_norm": 0.014726534485816956, + "learning_rate": 1.2176111663550871e-05, + "loss": 0.0382, + "num_input_tokens_seen": 153903072, + "step": 126480 + }, + { + "epoch": 14.08675799086758, + "grad_norm": 0.03146795183420181, + "learning_rate": 1.217402600031533e-05, + "loss": 0.0078, + "num_input_tokens_seen": 153909216, + "step": 126485 + }, + { + "epoch": 14.087314845751198, + "grad_norm": 0.7189412117004395, + "learning_rate": 1.2171940458231398e-05, + "loss": 0.0408, + "num_input_tokens_seen": 153915264, + "step": 126490 + }, + { + "epoch": 14.087871700634814, + "grad_norm": 0.5841928720474243, + "learning_rate": 1.216985503731876e-05, + "loss": 0.1435, + "num_input_tokens_seen": 153921472, + "step": 126495 + }, + { + "epoch": 14.088428555518432, + "grad_norm": 0.036155957728624344, + "learning_rate": 1.2167769737597123e-05, + "loss": 0.1195, + "num_input_tokens_seen": 153927456, + "step": 126500 + }, + { + "epoch": 14.08898541040205, + "grad_norm": 0.09889766573905945, + "learning_rate": 1.2165684559086188e-05, + "loss": 0.01, + "num_input_tokens_seen": 153933408, + "step": 126505 + }, + { + "epoch": 14.089542265285667, + "grad_norm": 0.4353393316268921, + "learning_rate": 1.2163599501805642e-05, + "loss": 0.0197, + "num_input_tokens_seen": 153939680, + "step": 126510 + }, + { + "epoch": 14.090099120169285, + "grad_norm": 0.0024352911859750748, + "learning_rate": 1.2161514565775173e-05, + "loss": 0.0253, + "num_input_tokens_seen": 153945312, + "step": 126515 + }, + { + "epoch": 14.0906559750529, + "grad_norm": 0.5072495341300964, + "learning_rate": 1.2159429751014497e-05, + "loss": 0.0504, + "num_input_tokens_seen": 153950752, + "step": 126520 + }, + { + "epoch": 14.091212829936518, + "grad_norm": 0.059584613889455795, + "learning_rate": 1.2157345057543293e-05, + "loss": 0.0349, + "num_input_tokens_seen": 153956704, + "step": 126525 + }, + { + "epoch": 14.091769684820136, + "grad_norm": 0.00032778881723061204, + "learning_rate": 1.2155260485381254e-05, + "loss": 0.0309, + "num_input_tokens_seen": 153962784, + "step": 126530 + }, + { + "epoch": 14.092326539703754, + "grad_norm": 0.003709842450916767, + "learning_rate": 1.2153176034548063e-05, + "loss": 0.0193, + "num_input_tokens_seen": 153968832, + "step": 126535 + }, + { + "epoch": 14.092883394587371, + "grad_norm": 0.5316436290740967, + "learning_rate": 1.2151091705063425e-05, + "loss": 0.0213, + "num_input_tokens_seen": 153975168, + "step": 126540 + }, + { + "epoch": 14.093440249470987, + "grad_norm": 2.8552188873291016, + "learning_rate": 1.2149007496947013e-05, + "loss": 0.1161, + "num_input_tokens_seen": 153981504, + "step": 126545 + }, + { + "epoch": 14.093997104354605, + "grad_norm": 0.001192911178804934, + "learning_rate": 1.2146923410218536e-05, + "loss": 0.1145, + "num_input_tokens_seen": 153987744, + "step": 126550 + }, + { + "epoch": 14.094553959238223, + "grad_norm": 0.6313214898109436, + "learning_rate": 1.214483944489765e-05, + "loss": 0.1121, + "num_input_tokens_seen": 153993856, + "step": 126555 + }, + { + "epoch": 14.09511081412184, + "grad_norm": 2.629727602005005, + "learning_rate": 1.2142755601004063e-05, + "loss": 0.1643, + "num_input_tokens_seen": 153999936, + "step": 126560 + }, + { + "epoch": 14.095667669005458, + "grad_norm": 0.9948552250862122, + "learning_rate": 1.2140671878557441e-05, + "loss": 0.0144, + "num_input_tokens_seen": 154006080, + "step": 126565 + }, + { + "epoch": 14.096224523889074, + "grad_norm": 0.04364416375756264, + "learning_rate": 1.2138588277577484e-05, + "loss": 0.0284, + "num_input_tokens_seen": 154011936, + "step": 126570 + }, + { + "epoch": 14.096781378772691, + "grad_norm": 0.037434644997119904, + "learning_rate": 1.2136504798083868e-05, + "loss": 0.0006, + "num_input_tokens_seen": 154018464, + "step": 126575 + }, + { + "epoch": 14.09733823365631, + "grad_norm": 0.0026098848320543766, + "learning_rate": 1.2134421440096267e-05, + "loss": 0.0115, + "num_input_tokens_seen": 154024960, + "step": 126580 + }, + { + "epoch": 14.097895088539927, + "grad_norm": 0.027847614139318466, + "learning_rate": 1.2132338203634355e-05, + "loss": 0.02, + "num_input_tokens_seen": 154031136, + "step": 126585 + }, + { + "epoch": 14.098451943423544, + "grad_norm": 0.01468003448098898, + "learning_rate": 1.213025508871783e-05, + "loss": 0.0107, + "num_input_tokens_seen": 154037088, + "step": 126590 + }, + { + "epoch": 14.09900879830716, + "grad_norm": 0.10951320081949234, + "learning_rate": 1.2128172095366352e-05, + "loss": 0.0111, + "num_input_tokens_seen": 154043232, + "step": 126595 + }, + { + "epoch": 14.099565653190778, + "grad_norm": 0.03762255609035492, + "learning_rate": 1.2126089223599604e-05, + "loss": 0.06, + "num_input_tokens_seen": 154049248, + "step": 126600 + }, + { + "epoch": 14.100122508074396, + "grad_norm": 0.40319183468818665, + "learning_rate": 1.2124006473437249e-05, + "loss": 0.0182, + "num_input_tokens_seen": 154055488, + "step": 126605 + }, + { + "epoch": 14.100679362958013, + "grad_norm": 0.0112648606300354, + "learning_rate": 1.2121923844898978e-05, + "loss": 0.0892, + "num_input_tokens_seen": 154061248, + "step": 126610 + }, + { + "epoch": 14.101236217841631, + "grad_norm": 0.0048960912972688675, + "learning_rate": 1.2119841338004443e-05, + "loss": 0.0242, + "num_input_tokens_seen": 154067616, + "step": 126615 + }, + { + "epoch": 14.101793072725247, + "grad_norm": 0.0040598222985863686, + "learning_rate": 1.2117758952773336e-05, + "loss": 0.1436, + "num_input_tokens_seen": 154073408, + "step": 126620 + }, + { + "epoch": 14.102349927608865, + "grad_norm": 3.5826992988586426, + "learning_rate": 1.2115676689225316e-05, + "loss": 0.0285, + "num_input_tokens_seen": 154079520, + "step": 126625 + }, + { + "epoch": 14.102906782492482, + "grad_norm": 0.006699712481349707, + "learning_rate": 1.2113594547380055e-05, + "loss": 0.0484, + "num_input_tokens_seen": 154085376, + "step": 126630 + }, + { + "epoch": 14.1034636373761, + "grad_norm": 0.01545756496489048, + "learning_rate": 1.2111512527257207e-05, + "loss": 0.0085, + "num_input_tokens_seen": 154091648, + "step": 126635 + }, + { + "epoch": 14.104020492259718, + "grad_norm": 0.8970714807510376, + "learning_rate": 1.2109430628876459e-05, + "loss": 0.071, + "num_input_tokens_seen": 154097664, + "step": 126640 + }, + { + "epoch": 14.104577347143334, + "grad_norm": 0.0001568799780216068, + "learning_rate": 1.2107348852257467e-05, + "loss": 0.0109, + "num_input_tokens_seen": 154103936, + "step": 126645 + }, + { + "epoch": 14.105134202026951, + "grad_norm": 0.008688648231327534, + "learning_rate": 1.2105267197419892e-05, + "loss": 0.0062, + "num_input_tokens_seen": 154110112, + "step": 126650 + }, + { + "epoch": 14.105691056910569, + "grad_norm": 0.6126651167869568, + "learning_rate": 1.2103185664383393e-05, + "loss": 0.0077, + "num_input_tokens_seen": 154116480, + "step": 126655 + }, + { + "epoch": 14.106247911794187, + "grad_norm": 0.005198388360440731, + "learning_rate": 1.2101104253167647e-05, + "loss": 0.0171, + "num_input_tokens_seen": 154122816, + "step": 126660 + }, + { + "epoch": 14.106804766677804, + "grad_norm": 0.05176533758640289, + "learning_rate": 1.2099022963792294e-05, + "loss": 0.0937, + "num_input_tokens_seen": 154128864, + "step": 126665 + }, + { + "epoch": 14.107361621561422, + "grad_norm": 0.0018165468936786056, + "learning_rate": 1.2096941796277025e-05, + "loss": 0.1162, + "num_input_tokens_seen": 154135104, + "step": 126670 + }, + { + "epoch": 14.107918476445038, + "grad_norm": 0.05012528598308563, + "learning_rate": 1.2094860750641462e-05, + "loss": 0.0738, + "num_input_tokens_seen": 154141184, + "step": 126675 + }, + { + "epoch": 14.108475331328655, + "grad_norm": 0.277559369802475, + "learning_rate": 1.2092779826905288e-05, + "loss": 0.0071, + "num_input_tokens_seen": 154147488, + "step": 126680 + }, + { + "epoch": 14.109032186212273, + "grad_norm": 0.0090526407584548, + "learning_rate": 1.2090699025088137e-05, + "loss": 0.083, + "num_input_tokens_seen": 154153248, + "step": 126685 + }, + { + "epoch": 14.10958904109589, + "grad_norm": 0.7831863760948181, + "learning_rate": 1.2088618345209688e-05, + "loss": 0.0292, + "num_input_tokens_seen": 154159488, + "step": 126690 + }, + { + "epoch": 14.110145895979509, + "grad_norm": 0.04265354946255684, + "learning_rate": 1.2086537787289582e-05, + "loss": 0.014, + "num_input_tokens_seen": 154165696, + "step": 126695 + }, + { + "epoch": 14.110702750863124, + "grad_norm": 0.1200173944234848, + "learning_rate": 1.2084457351347472e-05, + "loss": 0.0727, + "num_input_tokens_seen": 154171840, + "step": 126700 + }, + { + "epoch": 14.111259605746742, + "grad_norm": 0.0015522351022809744, + "learning_rate": 1.2082377037403003e-05, + "loss": 0.0042, + "num_input_tokens_seen": 154178016, + "step": 126705 + }, + { + "epoch": 14.11181646063036, + "grad_norm": 0.03306444361805916, + "learning_rate": 1.208029684547584e-05, + "loss": 0.0007, + "num_input_tokens_seen": 154183808, + "step": 126710 + }, + { + "epoch": 14.112373315513977, + "grad_norm": 0.07433202862739563, + "learning_rate": 1.2078216775585626e-05, + "loss": 0.0018, + "num_input_tokens_seen": 154190176, + "step": 126715 + }, + { + "epoch": 14.112930170397595, + "grad_norm": 0.12681663036346436, + "learning_rate": 1.2076136827752007e-05, + "loss": 0.0042, + "num_input_tokens_seen": 154195840, + "step": 126720 + }, + { + "epoch": 14.113487025281211, + "grad_norm": 0.3159145712852478, + "learning_rate": 1.207405700199463e-05, + "loss": 0.0237, + "num_input_tokens_seen": 154202016, + "step": 126725 + }, + { + "epoch": 14.114043880164829, + "grad_norm": 0.0006758891395293176, + "learning_rate": 1.2071977298333128e-05, + "loss": 0.0635, + "num_input_tokens_seen": 154208160, + "step": 126730 + }, + { + "epoch": 14.114600735048446, + "grad_norm": 2.6572976112365723, + "learning_rate": 1.206989771678717e-05, + "loss": 0.0293, + "num_input_tokens_seen": 154213888, + "step": 126735 + }, + { + "epoch": 14.115157589932064, + "grad_norm": 3.2415456771850586, + "learning_rate": 1.206781825737639e-05, + "loss": 0.2288, + "num_input_tokens_seen": 154220288, + "step": 126740 + }, + { + "epoch": 14.115714444815682, + "grad_norm": 0.014644365757703781, + "learning_rate": 1.2065738920120426e-05, + "loss": 0.0222, + "num_input_tokens_seen": 154226528, + "step": 126745 + }, + { + "epoch": 14.116271299699298, + "grad_norm": 0.024726150557398796, + "learning_rate": 1.206365970503891e-05, + "loss": 0.0472, + "num_input_tokens_seen": 154232128, + "step": 126750 + }, + { + "epoch": 14.116828154582915, + "grad_norm": 0.0007558201905339956, + "learning_rate": 1.2061580612151502e-05, + "loss": 0.002, + "num_input_tokens_seen": 154238112, + "step": 126755 + }, + { + "epoch": 14.117385009466533, + "grad_norm": 0.1301565319299698, + "learning_rate": 1.2059501641477824e-05, + "loss": 0.01, + "num_input_tokens_seen": 154244352, + "step": 126760 + }, + { + "epoch": 14.11794186435015, + "grad_norm": 0.0011185527546331286, + "learning_rate": 1.2057422793037529e-05, + "loss": 0.0359, + "num_input_tokens_seen": 154250784, + "step": 126765 + }, + { + "epoch": 14.118498719233768, + "grad_norm": 0.0032857239712029696, + "learning_rate": 1.2055344066850244e-05, + "loss": 0.0206, + "num_input_tokens_seen": 154256896, + "step": 126770 + }, + { + "epoch": 14.119055574117384, + "grad_norm": 0.0006908153300173581, + "learning_rate": 1.2053265462935606e-05, + "loss": 0.0006, + "num_input_tokens_seen": 154262944, + "step": 126775 + }, + { + "epoch": 14.119612429001002, + "grad_norm": 0.006212321575731039, + "learning_rate": 1.2051186981313239e-05, + "loss": 0.01, + "num_input_tokens_seen": 154269248, + "step": 126780 + }, + { + "epoch": 14.12016928388462, + "grad_norm": 0.000266290211584419, + "learning_rate": 1.2049108622002795e-05, + "loss": 0.1173, + "num_input_tokens_seen": 154275040, + "step": 126785 + }, + { + "epoch": 14.120726138768237, + "grad_norm": 0.0017717538867145777, + "learning_rate": 1.2047030385023897e-05, + "loss": 0.0916, + "num_input_tokens_seen": 154281184, + "step": 126790 + }, + { + "epoch": 14.121282993651855, + "grad_norm": 0.2055884748697281, + "learning_rate": 1.2044952270396172e-05, + "loss": 0.0545, + "num_input_tokens_seen": 154286624, + "step": 126795 + }, + { + "epoch": 14.12183984853547, + "grad_norm": 0.5003485679626465, + "learning_rate": 1.2042874278139247e-05, + "loss": 0.1281, + "num_input_tokens_seen": 154292256, + "step": 126800 + }, + { + "epoch": 14.122396703419088, + "grad_norm": 0.05072106048464775, + "learning_rate": 1.2040796408272761e-05, + "loss": 0.076, + "num_input_tokens_seen": 154298368, + "step": 126805 + }, + { + "epoch": 14.122953558302706, + "grad_norm": 0.3099413514137268, + "learning_rate": 1.203871866081634e-05, + "loss": 0.0541, + "num_input_tokens_seen": 154304384, + "step": 126810 + }, + { + "epoch": 14.123510413186324, + "grad_norm": 1.0225152969360352, + "learning_rate": 1.20366410357896e-05, + "loss": 0.0523, + "num_input_tokens_seen": 154310624, + "step": 126815 + }, + { + "epoch": 14.124067268069942, + "grad_norm": 0.07472068816423416, + "learning_rate": 1.2034563533212165e-05, + "loss": 0.0031, + "num_input_tokens_seen": 154316672, + "step": 126820 + }, + { + "epoch": 14.12462412295356, + "grad_norm": 0.0038151326589286327, + "learning_rate": 1.2032486153103676e-05, + "loss": 0.0056, + "num_input_tokens_seen": 154322720, + "step": 126825 + }, + { + "epoch": 14.125180977837175, + "grad_norm": 0.03385788947343826, + "learning_rate": 1.2030408895483733e-05, + "loss": 0.0986, + "num_input_tokens_seen": 154328064, + "step": 126830 + }, + { + "epoch": 14.125737832720793, + "grad_norm": 0.0010161169338971376, + "learning_rate": 1.2028331760371981e-05, + "loss": 0.0216, + "num_input_tokens_seen": 154334496, + "step": 126835 + }, + { + "epoch": 14.12629468760441, + "grad_norm": 0.3226771056652069, + "learning_rate": 1.2026254747788026e-05, + "loss": 0.0309, + "num_input_tokens_seen": 154340608, + "step": 126840 + }, + { + "epoch": 14.126851542488028, + "grad_norm": 0.3443640172481537, + "learning_rate": 1.202417785775149e-05, + "loss": 0.0913, + "num_input_tokens_seen": 154346624, + "step": 126845 + }, + { + "epoch": 14.127408397371646, + "grad_norm": 0.1201808974146843, + "learning_rate": 1.2022101090281981e-05, + "loss": 0.1247, + "num_input_tokens_seen": 154352864, + "step": 126850 + }, + { + "epoch": 14.127965252255262, + "grad_norm": 0.08934609591960907, + "learning_rate": 1.2020024445399134e-05, + "loss": 0.0135, + "num_input_tokens_seen": 154359072, + "step": 126855 + }, + { + "epoch": 14.12852210713888, + "grad_norm": 0.001849370077252388, + "learning_rate": 1.2017947923122555e-05, + "loss": 0.0034, + "num_input_tokens_seen": 154365440, + "step": 126860 + }, + { + "epoch": 14.129078962022497, + "grad_norm": 0.03788440674543381, + "learning_rate": 1.2015871523471859e-05, + "loss": 0.0089, + "num_input_tokens_seen": 154371360, + "step": 126865 + }, + { + "epoch": 14.129635816906115, + "grad_norm": 0.19129450619220734, + "learning_rate": 1.201379524646665e-05, + "loss": 0.0748, + "num_input_tokens_seen": 154377312, + "step": 126870 + }, + { + "epoch": 14.130192671789732, + "grad_norm": 0.00042977629345841706, + "learning_rate": 1.2011719092126559e-05, + "loss": 0.0944, + "num_input_tokens_seen": 154383520, + "step": 126875 + }, + { + "epoch": 14.130749526673348, + "grad_norm": 0.5340969562530518, + "learning_rate": 1.2009643060471178e-05, + "loss": 0.0355, + "num_input_tokens_seen": 154389696, + "step": 126880 + }, + { + "epoch": 14.131306381556966, + "grad_norm": 0.20600394904613495, + "learning_rate": 1.2007567151520143e-05, + "loss": 0.0178, + "num_input_tokens_seen": 154395584, + "step": 126885 + }, + { + "epoch": 14.131863236440584, + "grad_norm": 0.021099578589200974, + "learning_rate": 1.2005491365293029e-05, + "loss": 0.1156, + "num_input_tokens_seen": 154401280, + "step": 126890 + }, + { + "epoch": 14.132420091324201, + "grad_norm": 0.056421909481287, + "learning_rate": 1.200341570180947e-05, + "loss": 0.0931, + "num_input_tokens_seen": 154407616, + "step": 126895 + }, + { + "epoch": 14.132976946207819, + "grad_norm": 0.2911090850830078, + "learning_rate": 1.200134016108905e-05, + "loss": 0.0113, + "num_input_tokens_seen": 154413792, + "step": 126900 + }, + { + "epoch": 14.133533801091435, + "grad_norm": 0.0002449621679261327, + "learning_rate": 1.1999264743151397e-05, + "loss": 0.0146, + "num_input_tokens_seen": 154419968, + "step": 126905 + }, + { + "epoch": 14.134090655975053, + "grad_norm": 3.0704121589660645, + "learning_rate": 1.1997189448016108e-05, + "loss": 0.0429, + "num_input_tokens_seen": 154426080, + "step": 126910 + }, + { + "epoch": 14.13464751085867, + "grad_norm": 1.1171189546585083, + "learning_rate": 1.199511427570278e-05, + "loss": 0.1132, + "num_input_tokens_seen": 154431968, + "step": 126915 + }, + { + "epoch": 14.135204365742288, + "grad_norm": 0.08628677576780319, + "learning_rate": 1.1993039226231006e-05, + "loss": 0.0025, + "num_input_tokens_seen": 154438432, + "step": 126920 + }, + { + "epoch": 14.135761220625906, + "grad_norm": 0.00025813374668359756, + "learning_rate": 1.1990964299620408e-05, + "loss": 0.0466, + "num_input_tokens_seen": 154444416, + "step": 126925 + }, + { + "epoch": 14.136318075509521, + "grad_norm": 0.000734912056941539, + "learning_rate": 1.1988889495890573e-05, + "loss": 0.067, + "num_input_tokens_seen": 154450720, + "step": 126930 + }, + { + "epoch": 14.13687493039314, + "grad_norm": 1.9498592615127563, + "learning_rate": 1.1986814815061104e-05, + "loss": 0.1329, + "num_input_tokens_seen": 154456672, + "step": 126935 + }, + { + "epoch": 14.137431785276757, + "grad_norm": 0.001366203185170889, + "learning_rate": 1.1984740257151586e-05, + "loss": 0.028, + "num_input_tokens_seen": 154462880, + "step": 126940 + }, + { + "epoch": 14.137988640160374, + "grad_norm": 0.0006333255441859365, + "learning_rate": 1.198266582218163e-05, + "loss": 0.0005, + "num_input_tokens_seen": 154469312, + "step": 126945 + }, + { + "epoch": 14.138545495043992, + "grad_norm": 0.1929127424955368, + "learning_rate": 1.198059151017082e-05, + "loss": 0.0653, + "num_input_tokens_seen": 154475200, + "step": 126950 + }, + { + "epoch": 14.139102349927608, + "grad_norm": 0.4816567301750183, + "learning_rate": 1.1978517321138768e-05, + "loss": 0.0287, + "num_input_tokens_seen": 154481216, + "step": 126955 + }, + { + "epoch": 14.139659204811226, + "grad_norm": 0.9448708295822144, + "learning_rate": 1.1976443255105035e-05, + "loss": 0.0445, + "num_input_tokens_seen": 154487392, + "step": 126960 + }, + { + "epoch": 14.140216059694843, + "grad_norm": 0.11830180138349533, + "learning_rate": 1.197436931208924e-05, + "loss": 0.0153, + "num_input_tokens_seen": 154493120, + "step": 126965 + }, + { + "epoch": 14.140772914578461, + "grad_norm": 0.06063733994960785, + "learning_rate": 1.1972295492110955e-05, + "loss": 0.0141, + "num_input_tokens_seen": 154499104, + "step": 126970 + }, + { + "epoch": 14.141329769462079, + "grad_norm": 0.05506733059883118, + "learning_rate": 1.1970221795189784e-05, + "loss": 0.0184, + "num_input_tokens_seen": 154505024, + "step": 126975 + }, + { + "epoch": 14.141886624345695, + "grad_norm": 0.0016591186868026853, + "learning_rate": 1.1968148221345308e-05, + "loss": 0.0215, + "num_input_tokens_seen": 154511264, + "step": 126980 + }, + { + "epoch": 14.142443479229312, + "grad_norm": 0.0004931677249260247, + "learning_rate": 1.1966074770597114e-05, + "loss": 0.0326, + "num_input_tokens_seen": 154517760, + "step": 126985 + }, + { + "epoch": 14.14300033411293, + "grad_norm": 0.7485671639442444, + "learning_rate": 1.1964001442964776e-05, + "loss": 0.0065, + "num_input_tokens_seen": 154524000, + "step": 126990 + }, + { + "epoch": 14.143557188996548, + "grad_norm": 0.007780670654028654, + "learning_rate": 1.1961928238467898e-05, + "loss": 0.0025, + "num_input_tokens_seen": 154530304, + "step": 126995 + }, + { + "epoch": 14.144114043880165, + "grad_norm": 0.000637802470009774, + "learning_rate": 1.195985515712605e-05, + "loss": 0.0513, + "num_input_tokens_seen": 154536736, + "step": 127000 + }, + { + "epoch": 14.144670898763783, + "grad_norm": 0.0013839669991284609, + "learning_rate": 1.1957782198958825e-05, + "loss": 0.0175, + "num_input_tokens_seen": 154542784, + "step": 127005 + }, + { + "epoch": 14.145227753647399, + "grad_norm": 1.1282763481140137, + "learning_rate": 1.1955709363985781e-05, + "loss": 0.0454, + "num_input_tokens_seen": 154548256, + "step": 127010 + }, + { + "epoch": 14.145784608531017, + "grad_norm": 0.00267773331142962, + "learning_rate": 1.1953636652226527e-05, + "loss": 0.0115, + "num_input_tokens_seen": 154554624, + "step": 127015 + }, + { + "epoch": 14.146341463414634, + "grad_norm": 0.00019143373356200755, + "learning_rate": 1.1951564063700615e-05, + "loss": 0.0125, + "num_input_tokens_seen": 154560896, + "step": 127020 + }, + { + "epoch": 14.146898318298252, + "grad_norm": 0.24354109168052673, + "learning_rate": 1.1949491598427646e-05, + "loss": 0.0112, + "num_input_tokens_seen": 154566624, + "step": 127025 + }, + { + "epoch": 14.14745517318187, + "grad_norm": 0.054247237741947174, + "learning_rate": 1.194741925642718e-05, + "loss": 0.0031, + "num_input_tokens_seen": 154572736, + "step": 127030 + }, + { + "epoch": 14.148012028065486, + "grad_norm": 0.00011026033462258056, + "learning_rate": 1.1945347037718802e-05, + "loss": 0.0561, + "num_input_tokens_seen": 154578848, + "step": 127035 + }, + { + "epoch": 14.148568882949103, + "grad_norm": 0.00022396871645469218, + "learning_rate": 1.1943274942322069e-05, + "loss": 0.0167, + "num_input_tokens_seen": 154585024, + "step": 127040 + }, + { + "epoch": 14.14912573783272, + "grad_norm": 0.00478784553706646, + "learning_rate": 1.1941202970256574e-05, + "loss": 0.0622, + "num_input_tokens_seen": 154590496, + "step": 127045 + }, + { + "epoch": 14.149682592716339, + "grad_norm": 0.44294315576553345, + "learning_rate": 1.193913112154188e-05, + "loss": 0.0197, + "num_input_tokens_seen": 154596512, + "step": 127050 + }, + { + "epoch": 14.150239447599956, + "grad_norm": 0.11148538440465927, + "learning_rate": 1.1937059396197558e-05, + "loss": 0.0351, + "num_input_tokens_seen": 154602432, + "step": 127055 + }, + { + "epoch": 14.150796302483572, + "grad_norm": 0.01241568848490715, + "learning_rate": 1.1934987794243167e-05, + "loss": 0.0755, + "num_input_tokens_seen": 154608576, + "step": 127060 + }, + { + "epoch": 14.15135315736719, + "grad_norm": 0.15351168811321259, + "learning_rate": 1.193291631569829e-05, + "loss": 0.0196, + "num_input_tokens_seen": 154614656, + "step": 127065 + }, + { + "epoch": 14.151910012250807, + "grad_norm": 0.015273485332727432, + "learning_rate": 1.1930844960582479e-05, + "loss": 0.0337, + "num_input_tokens_seen": 154620576, + "step": 127070 + }, + { + "epoch": 14.152466867134425, + "grad_norm": 0.010515446774661541, + "learning_rate": 1.1928773728915327e-05, + "loss": 0.0572, + "num_input_tokens_seen": 154626208, + "step": 127075 + }, + { + "epoch": 14.153023722018043, + "grad_norm": 0.008080034516751766, + "learning_rate": 1.1926702620716363e-05, + "loss": 0.0234, + "num_input_tokens_seen": 154632384, + "step": 127080 + }, + { + "epoch": 14.153580576901659, + "grad_norm": 0.0004982815589755774, + "learning_rate": 1.1924631636005174e-05, + "loss": 0.0047, + "num_input_tokens_seen": 154638208, + "step": 127085 + }, + { + "epoch": 14.154137431785276, + "grad_norm": 0.0466131865978241, + "learning_rate": 1.1922560774801305e-05, + "loss": 0.072, + "num_input_tokens_seen": 154643872, + "step": 127090 + }, + { + "epoch": 14.154694286668894, + "grad_norm": 0.32040879130363464, + "learning_rate": 1.1920490037124341e-05, + "loss": 0.0157, + "num_input_tokens_seen": 154650272, + "step": 127095 + }, + { + "epoch": 14.155251141552512, + "grad_norm": 0.5782248973846436, + "learning_rate": 1.1918419422993823e-05, + "loss": 0.1013, + "num_input_tokens_seen": 154656384, + "step": 127100 + }, + { + "epoch": 14.15580799643613, + "grad_norm": 0.15920081734657288, + "learning_rate": 1.1916348932429316e-05, + "loss": 0.1128, + "num_input_tokens_seen": 154662432, + "step": 127105 + }, + { + "epoch": 14.156364851319745, + "grad_norm": 0.0014681353932246566, + "learning_rate": 1.1914278565450365e-05, + "loss": 0.0347, + "num_input_tokens_seen": 154668448, + "step": 127110 + }, + { + "epoch": 14.156921706203363, + "grad_norm": 0.004789512138813734, + "learning_rate": 1.191220832207655e-05, + "loss": 0.0066, + "num_input_tokens_seen": 154674624, + "step": 127115 + }, + { + "epoch": 14.15747856108698, + "grad_norm": 0.1439797580242157, + "learning_rate": 1.191013820232741e-05, + "loss": 0.1613, + "num_input_tokens_seen": 154680992, + "step": 127120 + }, + { + "epoch": 14.158035415970598, + "grad_norm": 0.0182054340839386, + "learning_rate": 1.1908068206222503e-05, + "loss": 0.0031, + "num_input_tokens_seen": 154687040, + "step": 127125 + }, + { + "epoch": 14.158592270854216, + "grad_norm": 0.0015711481682956219, + "learning_rate": 1.1905998333781372e-05, + "loss": 0.0651, + "num_input_tokens_seen": 154693152, + "step": 127130 + }, + { + "epoch": 14.159149125737832, + "grad_norm": 0.11948320269584656, + "learning_rate": 1.1903928585023586e-05, + "loss": 0.0067, + "num_input_tokens_seen": 154699008, + "step": 127135 + }, + { + "epoch": 14.15970598062145, + "grad_norm": 0.03332754224538803, + "learning_rate": 1.1901858959968687e-05, + "loss": 0.0742, + "num_input_tokens_seen": 154705312, + "step": 127140 + }, + { + "epoch": 14.160262835505067, + "grad_norm": 0.003557985881343484, + "learning_rate": 1.1899789458636224e-05, + "loss": 0.0202, + "num_input_tokens_seen": 154711488, + "step": 127145 + }, + { + "epoch": 14.160819690388685, + "grad_norm": 0.2715534567832947, + "learning_rate": 1.1897720081045746e-05, + "loss": 0.0449, + "num_input_tokens_seen": 154717952, + "step": 127150 + }, + { + "epoch": 14.161376545272303, + "grad_norm": 0.005459191277623177, + "learning_rate": 1.189565082721679e-05, + "loss": 0.1111, + "num_input_tokens_seen": 154724256, + "step": 127155 + }, + { + "epoch": 14.161933400155919, + "grad_norm": 1.0226538181304932, + "learning_rate": 1.1893581697168918e-05, + "loss": 0.0726, + "num_input_tokens_seen": 154730272, + "step": 127160 + }, + { + "epoch": 14.162490255039536, + "grad_norm": 0.07184328138828278, + "learning_rate": 1.189151269092166e-05, + "loss": 0.0315, + "num_input_tokens_seen": 154736160, + "step": 127165 + }, + { + "epoch": 14.163047109923154, + "grad_norm": 3.3764851093292236, + "learning_rate": 1.1889443808494577e-05, + "loss": 0.0832, + "num_input_tokens_seen": 154741952, + "step": 127170 + }, + { + "epoch": 14.163603964806772, + "grad_norm": 0.004844017792493105, + "learning_rate": 1.18873750499072e-05, + "loss": 0.0431, + "num_input_tokens_seen": 154747744, + "step": 127175 + }, + { + "epoch": 14.16416081969039, + "grad_norm": 0.0009484774782322347, + "learning_rate": 1.188530641517907e-05, + "loss": 0.015, + "num_input_tokens_seen": 154753952, + "step": 127180 + }, + { + "epoch": 14.164717674574007, + "grad_norm": 0.07169903069734573, + "learning_rate": 1.1883237904329721e-05, + "loss": 0.1078, + "num_input_tokens_seen": 154760064, + "step": 127185 + }, + { + "epoch": 14.165274529457623, + "grad_norm": 0.8125227689743042, + "learning_rate": 1.188116951737871e-05, + "loss": 0.1534, + "num_input_tokens_seen": 154766208, + "step": 127190 + }, + { + "epoch": 14.16583138434124, + "grad_norm": 0.4233129918575287, + "learning_rate": 1.1879101254345561e-05, + "loss": 0.0109, + "num_input_tokens_seen": 154772096, + "step": 127195 + }, + { + "epoch": 14.166388239224858, + "grad_norm": 1.4694911241531372, + "learning_rate": 1.1877033115249814e-05, + "loss": 0.0803, + "num_input_tokens_seen": 154778080, + "step": 127200 + }, + { + "epoch": 14.166945094108476, + "grad_norm": 0.20849034190177917, + "learning_rate": 1.1874965100110993e-05, + "loss": 0.0248, + "num_input_tokens_seen": 154784160, + "step": 127205 + }, + { + "epoch": 14.167501948992093, + "grad_norm": 0.06355325132608414, + "learning_rate": 1.1872897208948652e-05, + "loss": 0.0037, + "num_input_tokens_seen": 154790336, + "step": 127210 + }, + { + "epoch": 14.16805880387571, + "grad_norm": 0.002610426628962159, + "learning_rate": 1.1870829441782305e-05, + "loss": 0.0205, + "num_input_tokens_seen": 154796064, + "step": 127215 + }, + { + "epoch": 14.168615658759327, + "grad_norm": 0.022383755072951317, + "learning_rate": 1.1868761798631512e-05, + "loss": 0.1273, + "num_input_tokens_seen": 154801984, + "step": 127220 + }, + { + "epoch": 14.169172513642945, + "grad_norm": 0.6521000266075134, + "learning_rate": 1.1866694279515763e-05, + "loss": 0.0723, + "num_input_tokens_seen": 154808000, + "step": 127225 + }, + { + "epoch": 14.169729368526562, + "grad_norm": 0.9550859332084656, + "learning_rate": 1.1864626884454622e-05, + "loss": 0.0643, + "num_input_tokens_seen": 154814176, + "step": 127230 + }, + { + "epoch": 14.17028622341018, + "grad_norm": 0.1080356165766716, + "learning_rate": 1.1862559613467591e-05, + "loss": 0.0323, + "num_input_tokens_seen": 154820224, + "step": 127235 + }, + { + "epoch": 14.170843078293796, + "grad_norm": 0.015487131662666798, + "learning_rate": 1.1860492466574222e-05, + "loss": 0.001, + "num_input_tokens_seen": 154825888, + "step": 127240 + }, + { + "epoch": 14.171399933177414, + "grad_norm": 0.0006629761774092913, + "learning_rate": 1.1858425443794027e-05, + "loss": 0.0843, + "num_input_tokens_seen": 154831680, + "step": 127245 + }, + { + "epoch": 14.171956788061031, + "grad_norm": 0.0001664208684815094, + "learning_rate": 1.1856358545146535e-05, + "loss": 0.1239, + "num_input_tokens_seen": 154838048, + "step": 127250 + }, + { + "epoch": 14.172513642944649, + "grad_norm": 1.2372491359710693, + "learning_rate": 1.1854291770651255e-05, + "loss": 0.0936, + "num_input_tokens_seen": 154844352, + "step": 127255 + }, + { + "epoch": 14.173070497828267, + "grad_norm": 0.29829367995262146, + "learning_rate": 1.1852225120327732e-05, + "loss": 0.0093, + "num_input_tokens_seen": 154850272, + "step": 127260 + }, + { + "epoch": 14.173627352711883, + "grad_norm": 0.47257834672927856, + "learning_rate": 1.1850158594195477e-05, + "loss": 0.0847, + "num_input_tokens_seen": 154855808, + "step": 127265 + }, + { + "epoch": 14.1741842075955, + "grad_norm": 0.00016313287778757513, + "learning_rate": 1.1848092192274008e-05, + "loss": 0.0233, + "num_input_tokens_seen": 154862016, + "step": 127270 + }, + { + "epoch": 14.174741062479118, + "grad_norm": 0.004375291056931019, + "learning_rate": 1.1846025914582837e-05, + "loss": 0.0367, + "num_input_tokens_seen": 154867616, + "step": 127275 + }, + { + "epoch": 14.175297917362736, + "grad_norm": 1.4463684558868408, + "learning_rate": 1.1843959761141499e-05, + "loss": 0.1403, + "num_input_tokens_seen": 154873760, + "step": 127280 + }, + { + "epoch": 14.175854772246353, + "grad_norm": 0.024359440430998802, + "learning_rate": 1.1841893731969491e-05, + "loss": 0.0723, + "num_input_tokens_seen": 154879904, + "step": 127285 + }, + { + "epoch": 14.17641162712997, + "grad_norm": 0.08365753293037415, + "learning_rate": 1.1839827827086362e-05, + "loss": 0.0105, + "num_input_tokens_seen": 154885856, + "step": 127290 + }, + { + "epoch": 14.176968482013587, + "grad_norm": 1.1613291501998901, + "learning_rate": 1.183776204651158e-05, + "loss": 0.0504, + "num_input_tokens_seen": 154892224, + "step": 127295 + }, + { + "epoch": 14.177525336897205, + "grad_norm": 0.005397404544055462, + "learning_rate": 1.1835696390264691e-05, + "loss": 0.0357, + "num_input_tokens_seen": 154898304, + "step": 127300 + }, + { + "epoch": 14.178082191780822, + "grad_norm": 0.15620486438274384, + "learning_rate": 1.1833630858365188e-05, + "loss": 0.0108, + "num_input_tokens_seen": 154903936, + "step": 127305 + }, + { + "epoch": 14.17863904666444, + "grad_norm": 0.5882481336593628, + "learning_rate": 1.18315654508326e-05, + "loss": 0.0354, + "num_input_tokens_seen": 154909952, + "step": 127310 + }, + { + "epoch": 14.179195901548056, + "grad_norm": 0.003152961377054453, + "learning_rate": 1.1829500167686426e-05, + "loss": 0.0127, + "num_input_tokens_seen": 154916192, + "step": 127315 + }, + { + "epoch": 14.179752756431673, + "grad_norm": 0.017473135143518448, + "learning_rate": 1.1827435008946174e-05, + "loss": 0.0375, + "num_input_tokens_seen": 154922368, + "step": 127320 + }, + { + "epoch": 14.180309611315291, + "grad_norm": 0.002648871159180999, + "learning_rate": 1.1825369974631345e-05, + "loss": 0.0184, + "num_input_tokens_seen": 154928608, + "step": 127325 + }, + { + "epoch": 14.180866466198909, + "grad_norm": 0.0001942794770002365, + "learning_rate": 1.1823305064761459e-05, + "loss": 0.0726, + "num_input_tokens_seen": 154934816, + "step": 127330 + }, + { + "epoch": 14.181423321082526, + "grad_norm": 0.13250626623630524, + "learning_rate": 1.1821240279356017e-05, + "loss": 0.0033, + "num_input_tokens_seen": 154941088, + "step": 127335 + }, + { + "epoch": 14.181980175966142, + "grad_norm": 0.04923207685351372, + "learning_rate": 1.1819175618434513e-05, + "loss": 0.0666, + "num_input_tokens_seen": 154947072, + "step": 127340 + }, + { + "epoch": 14.18253703084976, + "grad_norm": 0.39472514390945435, + "learning_rate": 1.1817111082016453e-05, + "loss": 0.0852, + "num_input_tokens_seen": 154953088, + "step": 127345 + }, + { + "epoch": 14.183093885733378, + "grad_norm": 3.2856316566467285, + "learning_rate": 1.1815046670121346e-05, + "loss": 0.0323, + "num_input_tokens_seen": 154959232, + "step": 127350 + }, + { + "epoch": 14.183650740616995, + "grad_norm": 0.00017370941350236535, + "learning_rate": 1.1812982382768677e-05, + "loss": 0.0869, + "num_input_tokens_seen": 154965632, + "step": 127355 + }, + { + "epoch": 14.184207595500613, + "grad_norm": 0.8748855590820312, + "learning_rate": 1.1810918219977977e-05, + "loss": 0.0998, + "num_input_tokens_seen": 154971424, + "step": 127360 + }, + { + "epoch": 14.18476445038423, + "grad_norm": 0.68890780210495, + "learning_rate": 1.18088541817687e-05, + "loss": 0.106, + "num_input_tokens_seen": 154977792, + "step": 127365 + }, + { + "epoch": 14.185321305267847, + "grad_norm": 0.2520381212234497, + "learning_rate": 1.1806790268160375e-05, + "loss": 0.0085, + "num_input_tokens_seen": 154983808, + "step": 127370 + }, + { + "epoch": 14.185878160151464, + "grad_norm": 2.465404510498047, + "learning_rate": 1.1804726479172476e-05, + "loss": 0.1551, + "num_input_tokens_seen": 154989664, + "step": 127375 + }, + { + "epoch": 14.186435015035082, + "grad_norm": 0.012462328188121319, + "learning_rate": 1.1802662814824513e-05, + "loss": 0.0279, + "num_input_tokens_seen": 154995488, + "step": 127380 + }, + { + "epoch": 14.1869918699187, + "grad_norm": 0.007451306097209454, + "learning_rate": 1.1800599275135979e-05, + "loss": 0.0487, + "num_input_tokens_seen": 155001664, + "step": 127385 + }, + { + "epoch": 14.187548724802317, + "grad_norm": 0.0329582579433918, + "learning_rate": 1.1798535860126355e-05, + "loss": 0.0595, + "num_input_tokens_seen": 155008160, + "step": 127390 + }, + { + "epoch": 14.188105579685933, + "grad_norm": 0.3633492588996887, + "learning_rate": 1.1796472569815132e-05, + "loss": 0.0137, + "num_input_tokens_seen": 155014112, + "step": 127395 + }, + { + "epoch": 14.188662434569551, + "grad_norm": 0.374939888715744, + "learning_rate": 1.1794409404221812e-05, + "loss": 0.0169, + "num_input_tokens_seen": 155019936, + "step": 127400 + }, + { + "epoch": 14.189219289453169, + "grad_norm": 0.0026720240712165833, + "learning_rate": 1.1792346363365875e-05, + "loss": 0.0915, + "num_input_tokens_seen": 155026240, + "step": 127405 + }, + { + "epoch": 14.189776144336786, + "grad_norm": 0.046697087585926056, + "learning_rate": 1.1790283447266806e-05, + "loss": 0.0467, + "num_input_tokens_seen": 155032704, + "step": 127410 + }, + { + "epoch": 14.190332999220404, + "grad_norm": 0.3862820565700531, + "learning_rate": 1.1788220655944084e-05, + "loss": 0.0059, + "num_input_tokens_seen": 155038880, + "step": 127415 + }, + { + "epoch": 14.19088985410402, + "grad_norm": 0.0008657946018502116, + "learning_rate": 1.1786157989417215e-05, + "loss": 0.0102, + "num_input_tokens_seen": 155045184, + "step": 127420 + }, + { + "epoch": 14.191446708987637, + "grad_norm": 0.008275991305708885, + "learning_rate": 1.1784095447705662e-05, + "loss": 0.003, + "num_input_tokens_seen": 155051520, + "step": 127425 + }, + { + "epoch": 14.192003563871255, + "grad_norm": 0.01930338516831398, + "learning_rate": 1.1782033030828923e-05, + "loss": 0.2149, + "num_input_tokens_seen": 155057760, + "step": 127430 + }, + { + "epoch": 14.192560418754873, + "grad_norm": 0.03976703807711601, + "learning_rate": 1.1779970738806472e-05, + "loss": 0.0181, + "num_input_tokens_seen": 155063872, + "step": 127435 + }, + { + "epoch": 14.19311727363849, + "grad_norm": 1.0413167476654053, + "learning_rate": 1.1777908571657792e-05, + "loss": 0.0442, + "num_input_tokens_seen": 155069824, + "step": 127440 + }, + { + "epoch": 14.193674128522106, + "grad_norm": 0.2364898920059204, + "learning_rate": 1.1775846529402345e-05, + "loss": 0.07, + "num_input_tokens_seen": 155076064, + "step": 127445 + }, + { + "epoch": 14.194230983405724, + "grad_norm": 1.0214616060256958, + "learning_rate": 1.1773784612059635e-05, + "loss": 0.0051, + "num_input_tokens_seen": 155082528, + "step": 127450 + }, + { + "epoch": 14.194787838289342, + "grad_norm": 0.9907035827636719, + "learning_rate": 1.1771722819649126e-05, + "loss": 0.0205, + "num_input_tokens_seen": 155088320, + "step": 127455 + }, + { + "epoch": 14.19534469317296, + "grad_norm": 1.4343945980072021, + "learning_rate": 1.1769661152190293e-05, + "loss": 0.036, + "num_input_tokens_seen": 155094272, + "step": 127460 + }, + { + "epoch": 14.195901548056577, + "grad_norm": 1.092577338218689, + "learning_rate": 1.17675996097026e-05, + "loss": 0.0529, + "num_input_tokens_seen": 155099904, + "step": 127465 + }, + { + "epoch": 14.196458402940193, + "grad_norm": 0.0008845152333378792, + "learning_rate": 1.1765538192205542e-05, + "loss": 0.0004, + "num_input_tokens_seen": 155106368, + "step": 127470 + }, + { + "epoch": 14.19701525782381, + "grad_norm": 0.15182386338710785, + "learning_rate": 1.1763476899718567e-05, + "loss": 0.0087, + "num_input_tokens_seen": 155112544, + "step": 127475 + }, + { + "epoch": 14.197572112707428, + "grad_norm": 0.00030227730167098343, + "learning_rate": 1.1761415732261177e-05, + "loss": 0.007, + "num_input_tokens_seen": 155118400, + "step": 127480 + }, + { + "epoch": 14.198128967591046, + "grad_norm": 0.002053400268778205, + "learning_rate": 1.1759354689852803e-05, + "loss": 0.0521, + "num_input_tokens_seen": 155124352, + "step": 127485 + }, + { + "epoch": 14.198685822474664, + "grad_norm": 0.1958072930574417, + "learning_rate": 1.1757293772512943e-05, + "loss": 0.0485, + "num_input_tokens_seen": 155130400, + "step": 127490 + }, + { + "epoch": 14.19924267735828, + "grad_norm": 0.04410118609666824, + "learning_rate": 1.1755232980261041e-05, + "loss": 0.015, + "num_input_tokens_seen": 155136032, + "step": 127495 + }, + { + "epoch": 14.199799532241897, + "grad_norm": 0.012446717359125614, + "learning_rate": 1.1753172313116586e-05, + "loss": 0.0829, + "num_input_tokens_seen": 155142208, + "step": 127500 + }, + { + "epoch": 14.200356387125515, + "grad_norm": 0.042223185300827026, + "learning_rate": 1.1751111771099032e-05, + "loss": 0.0934, + "num_input_tokens_seen": 155148576, + "step": 127505 + }, + { + "epoch": 14.200913242009133, + "grad_norm": 9.571620466886088e-05, + "learning_rate": 1.1749051354227844e-05, + "loss": 0.0984, + "num_input_tokens_seen": 155154944, + "step": 127510 + }, + { + "epoch": 14.20147009689275, + "grad_norm": 0.0004679938137996942, + "learning_rate": 1.1746991062522471e-05, + "loss": 0.0046, + "num_input_tokens_seen": 155161280, + "step": 127515 + }, + { + "epoch": 14.202026951776368, + "grad_norm": 9.164324001176283e-05, + "learning_rate": 1.1744930896002396e-05, + "loss": 0.0032, + "num_input_tokens_seen": 155167584, + "step": 127520 + }, + { + "epoch": 14.202583806659984, + "grad_norm": 0.09971582889556885, + "learning_rate": 1.1742870854687066e-05, + "loss": 0.0311, + "num_input_tokens_seen": 155173824, + "step": 127525 + }, + { + "epoch": 14.203140661543602, + "grad_norm": 1.1009252071380615, + "learning_rate": 1.1740810938595945e-05, + "loss": 0.0681, + "num_input_tokens_seen": 155180128, + "step": 127530 + }, + { + "epoch": 14.20369751642722, + "grad_norm": 0.5238050818443298, + "learning_rate": 1.1738751147748478e-05, + "loss": 0.0365, + "num_input_tokens_seen": 155186304, + "step": 127535 + }, + { + "epoch": 14.204254371310837, + "grad_norm": 0.03879215940833092, + "learning_rate": 1.1736691482164138e-05, + "loss": 0.003, + "num_input_tokens_seen": 155192672, + "step": 127540 + }, + { + "epoch": 14.204811226194455, + "grad_norm": 1.2538930177688599, + "learning_rate": 1.1734631941862376e-05, + "loss": 0.013, + "num_input_tokens_seen": 155199072, + "step": 127545 + }, + { + "epoch": 14.20536808107807, + "grad_norm": 0.006077752448618412, + "learning_rate": 1.1732572526862642e-05, + "loss": 0.0006, + "num_input_tokens_seen": 155204960, + "step": 127550 + }, + { + "epoch": 14.205924935961688, + "grad_norm": 1.2148425579071045, + "learning_rate": 1.173051323718439e-05, + "loss": 0.0582, + "num_input_tokens_seen": 155210976, + "step": 127555 + }, + { + "epoch": 14.206481790845306, + "grad_norm": 0.32738253474235535, + "learning_rate": 1.1728454072847065e-05, + "loss": 0.0165, + "num_input_tokens_seen": 155216832, + "step": 127560 + }, + { + "epoch": 14.207038645728923, + "grad_norm": 0.19608600437641144, + "learning_rate": 1.172639503387013e-05, + "loss": 0.0455, + "num_input_tokens_seen": 155222880, + "step": 127565 + }, + { + "epoch": 14.207595500612541, + "grad_norm": 0.0007797476137056947, + "learning_rate": 1.1724336120273021e-05, + "loss": 0.085, + "num_input_tokens_seen": 155228576, + "step": 127570 + }, + { + "epoch": 14.208152355496157, + "grad_norm": 1.2652140855789185, + "learning_rate": 1.1722277332075205e-05, + "loss": 0.034, + "num_input_tokens_seen": 155234080, + "step": 127575 + }, + { + "epoch": 14.208709210379775, + "grad_norm": 0.10149461030960083, + "learning_rate": 1.1720218669296113e-05, + "loss": 0.0972, + "num_input_tokens_seen": 155240128, + "step": 127580 + }, + { + "epoch": 14.209266065263392, + "grad_norm": 1.4176305532455444, + "learning_rate": 1.1718160131955197e-05, + "loss": 0.0513, + "num_input_tokens_seen": 155246432, + "step": 127585 + }, + { + "epoch": 14.20982292014701, + "grad_norm": 0.0003969343670178205, + "learning_rate": 1.1716101720071893e-05, + "loss": 0.0009, + "num_input_tokens_seen": 155252672, + "step": 127590 + }, + { + "epoch": 14.210379775030628, + "grad_norm": 0.016958104446530342, + "learning_rate": 1.1714043433665659e-05, + "loss": 0.0053, + "num_input_tokens_seen": 155258912, + "step": 127595 + }, + { + "epoch": 14.210936629914244, + "grad_norm": 0.10634022206068039, + "learning_rate": 1.1711985272755927e-05, + "loss": 0.0152, + "num_input_tokens_seen": 155265120, + "step": 127600 + }, + { + "epoch": 14.211493484797861, + "grad_norm": 1.7485828399658203, + "learning_rate": 1.1709927237362142e-05, + "loss": 0.0599, + "num_input_tokens_seen": 155271328, + "step": 127605 + }, + { + "epoch": 14.212050339681479, + "grad_norm": 0.04030771553516388, + "learning_rate": 1.1707869327503734e-05, + "loss": 0.0036, + "num_input_tokens_seen": 155277216, + "step": 127610 + }, + { + "epoch": 14.212607194565097, + "grad_norm": 0.17091169953346252, + "learning_rate": 1.1705811543200156e-05, + "loss": 0.0308, + "num_input_tokens_seen": 155283232, + "step": 127615 + }, + { + "epoch": 14.213164049448714, + "grad_norm": 0.06504762172698975, + "learning_rate": 1.1703753884470834e-05, + "loss": 0.0587, + "num_input_tokens_seen": 155289568, + "step": 127620 + }, + { + "epoch": 14.21372090433233, + "grad_norm": 0.009853560477495193, + "learning_rate": 1.1701696351335225e-05, + "loss": 0.0177, + "num_input_tokens_seen": 155295328, + "step": 127625 + }, + { + "epoch": 14.214277759215948, + "grad_norm": 0.06420443207025528, + "learning_rate": 1.1699638943812729e-05, + "loss": 0.0029, + "num_input_tokens_seen": 155301632, + "step": 127630 + }, + { + "epoch": 14.214834614099566, + "grad_norm": 0.6927095651626587, + "learning_rate": 1.169758166192281e-05, + "loss": 0.0687, + "num_input_tokens_seen": 155307552, + "step": 127635 + }, + { + "epoch": 14.215391468983183, + "grad_norm": 0.051902465522289276, + "learning_rate": 1.1695524505684882e-05, + "loss": 0.0098, + "num_input_tokens_seen": 155313984, + "step": 127640 + }, + { + "epoch": 14.215948323866801, + "grad_norm": 0.2822655141353607, + "learning_rate": 1.1693467475118392e-05, + "loss": 0.0354, + "num_input_tokens_seen": 155320160, + "step": 127645 + }, + { + "epoch": 14.216505178750417, + "grad_norm": 0.23432575166225433, + "learning_rate": 1.1691410570242763e-05, + "loss": 0.1047, + "num_input_tokens_seen": 155326496, + "step": 127650 + }, + { + "epoch": 14.217062033634035, + "grad_norm": 0.1825474351644516, + "learning_rate": 1.1689353791077424e-05, + "loss": 0.0147, + "num_input_tokens_seen": 155332704, + "step": 127655 + }, + { + "epoch": 14.217618888517652, + "grad_norm": 0.456347793340683, + "learning_rate": 1.1687297137641793e-05, + "loss": 0.0021, + "num_input_tokens_seen": 155339232, + "step": 127660 + }, + { + "epoch": 14.21817574340127, + "grad_norm": 0.015861282125115395, + "learning_rate": 1.1685240609955317e-05, + "loss": 0.0008, + "num_input_tokens_seen": 155345312, + "step": 127665 + }, + { + "epoch": 14.218732598284888, + "grad_norm": 0.380949467420578, + "learning_rate": 1.1683184208037409e-05, + "loss": 0.0181, + "num_input_tokens_seen": 155351232, + "step": 127670 + }, + { + "epoch": 14.219289453168503, + "grad_norm": 0.07378476858139038, + "learning_rate": 1.1681127931907496e-05, + "loss": 0.0076, + "num_input_tokens_seen": 155357664, + "step": 127675 + }, + { + "epoch": 14.219846308052121, + "grad_norm": 0.3153580129146576, + "learning_rate": 1.1679071781584994e-05, + "loss": 0.0808, + "num_input_tokens_seen": 155364032, + "step": 127680 + }, + { + "epoch": 14.220403162935739, + "grad_norm": 0.00022492313291877508, + "learning_rate": 1.1677015757089339e-05, + "loss": 0.105, + "num_input_tokens_seen": 155370112, + "step": 127685 + }, + { + "epoch": 14.220960017819356, + "grad_norm": 1.1869945526123047, + "learning_rate": 1.1674959858439932e-05, + "loss": 0.0118, + "num_input_tokens_seen": 155376448, + "step": 127690 + }, + { + "epoch": 14.221516872702974, + "grad_norm": 0.04252075031399727, + "learning_rate": 1.1672904085656228e-05, + "loss": 0.1362, + "num_input_tokens_seen": 155382464, + "step": 127695 + }, + { + "epoch": 14.22207372758659, + "grad_norm": 0.18556183576583862, + "learning_rate": 1.1670848438757601e-05, + "loss": 0.0716, + "num_input_tokens_seen": 155388512, + "step": 127700 + }, + { + "epoch": 14.222630582470208, + "grad_norm": 1.024721622467041, + "learning_rate": 1.1668792917763502e-05, + "loss": 0.0305, + "num_input_tokens_seen": 155394688, + "step": 127705 + }, + { + "epoch": 14.223187437353825, + "grad_norm": 0.013096611015498638, + "learning_rate": 1.1666737522693321e-05, + "loss": 0.002, + "num_input_tokens_seen": 155400736, + "step": 127710 + }, + { + "epoch": 14.223744292237443, + "grad_norm": 0.0390496589243412, + "learning_rate": 1.16646822535665e-05, + "loss": 0.0435, + "num_input_tokens_seen": 155406688, + "step": 127715 + }, + { + "epoch": 14.22430114712106, + "grad_norm": 0.0006413604132831097, + "learning_rate": 1.1662627110402438e-05, + "loss": 0.0841, + "num_input_tokens_seen": 155412992, + "step": 127720 + }, + { + "epoch": 14.224858002004678, + "grad_norm": 0.320295125246048, + "learning_rate": 1.1660572093220548e-05, + "loss": 0.0584, + "num_input_tokens_seen": 155419104, + "step": 127725 + }, + { + "epoch": 14.225414856888294, + "grad_norm": 0.05409404635429382, + "learning_rate": 1.1658517202040231e-05, + "loss": 0.0258, + "num_input_tokens_seen": 155425248, + "step": 127730 + }, + { + "epoch": 14.225971711771912, + "grad_norm": 1.7363446950912476, + "learning_rate": 1.1656462436880919e-05, + "loss": 0.0377, + "num_input_tokens_seen": 155430848, + "step": 127735 + }, + { + "epoch": 14.22652856665553, + "grad_norm": 0.016609787940979004, + "learning_rate": 1.165440779776201e-05, + "loss": 0.0032, + "num_input_tokens_seen": 155436896, + "step": 127740 + }, + { + "epoch": 14.227085421539147, + "grad_norm": 0.0008730980334803462, + "learning_rate": 1.1652353284702914e-05, + "loss": 0.0029, + "num_input_tokens_seen": 155443104, + "step": 127745 + }, + { + "epoch": 14.227642276422765, + "grad_norm": 0.3494108021259308, + "learning_rate": 1.1650298897723023e-05, + "loss": 0.0444, + "num_input_tokens_seen": 155449120, + "step": 127750 + }, + { + "epoch": 14.228199131306381, + "grad_norm": 0.129167377948761, + "learning_rate": 1.1648244636841762e-05, + "loss": 0.0225, + "num_input_tokens_seen": 155455424, + "step": 127755 + }, + { + "epoch": 14.228755986189999, + "grad_norm": 0.17832158505916595, + "learning_rate": 1.1646190502078522e-05, + "loss": 0.0075, + "num_input_tokens_seen": 155461536, + "step": 127760 + }, + { + "epoch": 14.229312841073616, + "grad_norm": 0.0753825455904007, + "learning_rate": 1.164413649345272e-05, + "loss": 0.0382, + "num_input_tokens_seen": 155467328, + "step": 127765 + }, + { + "epoch": 14.229869695957234, + "grad_norm": 0.0003431525838095695, + "learning_rate": 1.1642082610983748e-05, + "loss": 0.0086, + "num_input_tokens_seen": 155473504, + "step": 127770 + }, + { + "epoch": 14.230426550840852, + "grad_norm": 1.0543365478515625, + "learning_rate": 1.164002885469101e-05, + "loss": 0.0865, + "num_input_tokens_seen": 155479872, + "step": 127775 + }, + { + "epoch": 14.230983405724468, + "grad_norm": 0.002709463005885482, + "learning_rate": 1.1637975224593894e-05, + "loss": 0.078, + "num_input_tokens_seen": 155486144, + "step": 127780 + }, + { + "epoch": 14.231540260608085, + "grad_norm": 0.0034703246783465147, + "learning_rate": 1.1635921720711814e-05, + "loss": 0.005, + "num_input_tokens_seen": 155492160, + "step": 127785 + }, + { + "epoch": 14.232097115491703, + "grad_norm": 1.1428735256195068, + "learning_rate": 1.1633868343064164e-05, + "loss": 0.0217, + "num_input_tokens_seen": 155498464, + "step": 127790 + }, + { + "epoch": 14.23265397037532, + "grad_norm": 0.006948067806661129, + "learning_rate": 1.1631815091670334e-05, + "loss": 0.0281, + "num_input_tokens_seen": 155504576, + "step": 127795 + }, + { + "epoch": 14.233210825258938, + "grad_norm": 0.16707450151443481, + "learning_rate": 1.1629761966549713e-05, + "loss": 0.0049, + "num_input_tokens_seen": 155510816, + "step": 127800 + }, + { + "epoch": 14.233767680142554, + "grad_norm": 0.021148137748241425, + "learning_rate": 1.1627708967721709e-05, + "loss": 0.0007, + "num_input_tokens_seen": 155517056, + "step": 127805 + }, + { + "epoch": 14.234324535026172, + "grad_norm": 1.9823020696640015, + "learning_rate": 1.1625656095205708e-05, + "loss": 0.1114, + "num_input_tokens_seen": 155522784, + "step": 127810 + }, + { + "epoch": 14.23488138990979, + "grad_norm": 0.0003149045805912465, + "learning_rate": 1.1623603349021103e-05, + "loss": 0.0092, + "num_input_tokens_seen": 155528608, + "step": 127815 + }, + { + "epoch": 14.235438244793407, + "grad_norm": 1.035063624382019, + "learning_rate": 1.1621550729187272e-05, + "loss": 0.0244, + "num_input_tokens_seen": 155534752, + "step": 127820 + }, + { + "epoch": 14.235995099677025, + "grad_norm": 2.3200597763061523, + "learning_rate": 1.1619498235723618e-05, + "loss": 0.0918, + "num_input_tokens_seen": 155540896, + "step": 127825 + }, + { + "epoch": 14.23655195456064, + "grad_norm": 0.06913211196660995, + "learning_rate": 1.1617445868649517e-05, + "loss": 0.0199, + "num_input_tokens_seen": 155547104, + "step": 127830 + }, + { + "epoch": 14.237108809444258, + "grad_norm": 0.02937137894332409, + "learning_rate": 1.1615393627984372e-05, + "loss": 0.1045, + "num_input_tokens_seen": 155553344, + "step": 127835 + }, + { + "epoch": 14.237665664327876, + "grad_norm": 0.003711915109306574, + "learning_rate": 1.1613341513747558e-05, + "loss": 0.1155, + "num_input_tokens_seen": 155558784, + "step": 127840 + }, + { + "epoch": 14.238222519211494, + "grad_norm": 0.003568545915186405, + "learning_rate": 1.1611289525958458e-05, + "loss": 0.0718, + "num_input_tokens_seen": 155564928, + "step": 127845 + }, + { + "epoch": 14.238779374095111, + "grad_norm": 0.02613797038793564, + "learning_rate": 1.1609237664636444e-05, + "loss": 0.0746, + "num_input_tokens_seen": 155570976, + "step": 127850 + }, + { + "epoch": 14.239336228978727, + "grad_norm": 1.7778372764587402, + "learning_rate": 1.1607185929800921e-05, + "loss": 0.0556, + "num_input_tokens_seen": 155576800, + "step": 127855 + }, + { + "epoch": 14.239893083862345, + "grad_norm": 0.16108889877796173, + "learning_rate": 1.1605134321471256e-05, + "loss": 0.0074, + "num_input_tokens_seen": 155582848, + "step": 127860 + }, + { + "epoch": 14.240449938745963, + "grad_norm": 0.5852301716804504, + "learning_rate": 1.1603082839666828e-05, + "loss": 0.0392, + "num_input_tokens_seen": 155589184, + "step": 127865 + }, + { + "epoch": 14.24100679362958, + "grad_norm": 1.4556245803833008, + "learning_rate": 1.1601031484407007e-05, + "loss": 0.0793, + "num_input_tokens_seen": 155595520, + "step": 127870 + }, + { + "epoch": 14.241563648513198, + "grad_norm": 0.04978474974632263, + "learning_rate": 1.1598980255711189e-05, + "loss": 0.0419, + "num_input_tokens_seen": 155601472, + "step": 127875 + }, + { + "epoch": 14.242120503396816, + "grad_norm": 0.00014700682368129492, + "learning_rate": 1.1596929153598729e-05, + "loss": 0.0359, + "num_input_tokens_seen": 155607360, + "step": 127880 + }, + { + "epoch": 14.242677358280432, + "grad_norm": 0.6323466897010803, + "learning_rate": 1.159487817808903e-05, + "loss": 0.0372, + "num_input_tokens_seen": 155613216, + "step": 127885 + }, + { + "epoch": 14.24323421316405, + "grad_norm": 0.40591832995414734, + "learning_rate": 1.1592827329201428e-05, + "loss": 0.0308, + "num_input_tokens_seen": 155619392, + "step": 127890 + }, + { + "epoch": 14.243791068047667, + "grad_norm": 1.3696880340576172, + "learning_rate": 1.1590776606955325e-05, + "loss": 0.1737, + "num_input_tokens_seen": 155625248, + "step": 127895 + }, + { + "epoch": 14.244347922931285, + "grad_norm": 1.57562255859375, + "learning_rate": 1.1588726011370068e-05, + "loss": 0.1038, + "num_input_tokens_seen": 155631232, + "step": 127900 + }, + { + "epoch": 14.244904777814902, + "grad_norm": 0.006304899696260691, + "learning_rate": 1.158667554246505e-05, + "loss": 0.0642, + "num_input_tokens_seen": 155637024, + "step": 127905 + }, + { + "epoch": 14.245461632698518, + "grad_norm": 0.14590203762054443, + "learning_rate": 1.1584625200259627e-05, + "loss": 0.1235, + "num_input_tokens_seen": 155643136, + "step": 127910 + }, + { + "epoch": 14.246018487582136, + "grad_norm": 0.6030365228652954, + "learning_rate": 1.1582574984773168e-05, + "loss": 0.013, + "num_input_tokens_seen": 155648832, + "step": 127915 + }, + { + "epoch": 14.246575342465754, + "grad_norm": 0.06823690980672836, + "learning_rate": 1.1580524896025027e-05, + "loss": 0.0372, + "num_input_tokens_seen": 155654592, + "step": 127920 + }, + { + "epoch": 14.247132197349371, + "grad_norm": 0.040510207414627075, + "learning_rate": 1.1578474934034591e-05, + "loss": 0.0392, + "num_input_tokens_seen": 155660992, + "step": 127925 + }, + { + "epoch": 14.247689052232989, + "grad_norm": 0.8116829991340637, + "learning_rate": 1.1576425098821211e-05, + "loss": 0.0356, + "num_input_tokens_seen": 155666944, + "step": 127930 + }, + { + "epoch": 14.248245907116605, + "grad_norm": 0.004386832006275654, + "learning_rate": 1.1574375390404255e-05, + "loss": 0.0964, + "num_input_tokens_seen": 155673472, + "step": 127935 + }, + { + "epoch": 14.248802762000222, + "grad_norm": 0.000250366167165339, + "learning_rate": 1.1572325808803067e-05, + "loss": 0.0032, + "num_input_tokens_seen": 155679584, + "step": 127940 + }, + { + "epoch": 14.24935961688384, + "grad_norm": 0.5658208727836609, + "learning_rate": 1.1570276354037027e-05, + "loss": 0.0274, + "num_input_tokens_seen": 155685952, + "step": 127945 + }, + { + "epoch": 14.249916471767458, + "grad_norm": 0.9816149473190308, + "learning_rate": 1.156822702612548e-05, + "loss": 0.1483, + "num_input_tokens_seen": 155692064, + "step": 127950 + }, + { + "epoch": 14.250473326651075, + "grad_norm": 0.31104499101638794, + "learning_rate": 1.156617782508781e-05, + "loss": 0.0051, + "num_input_tokens_seen": 155698336, + "step": 127955 + }, + { + "epoch": 14.251030181534691, + "grad_norm": 1.7666865587234497, + "learning_rate": 1.156412875094334e-05, + "loss": 0.0834, + "num_input_tokens_seen": 155704480, + "step": 127960 + }, + { + "epoch": 14.251587036418309, + "grad_norm": 0.11041443049907684, + "learning_rate": 1.1562079803711433e-05, + "loss": 0.0382, + "num_input_tokens_seen": 155710432, + "step": 127965 + }, + { + "epoch": 14.252143891301927, + "grad_norm": 0.037465836852788925, + "learning_rate": 1.1560030983411457e-05, + "loss": 0.033, + "num_input_tokens_seen": 155715680, + "step": 127970 + }, + { + "epoch": 14.252700746185544, + "grad_norm": 0.00466775381937623, + "learning_rate": 1.1557982290062747e-05, + "loss": 0.0036, + "num_input_tokens_seen": 155721600, + "step": 127975 + }, + { + "epoch": 14.253257601069162, + "grad_norm": 0.0011502471752464771, + "learning_rate": 1.1555933723684673e-05, + "loss": 0.0544, + "num_input_tokens_seen": 155727776, + "step": 127980 + }, + { + "epoch": 14.253814455952778, + "grad_norm": 0.1847674548625946, + "learning_rate": 1.1553885284296575e-05, + "loss": 0.0686, + "num_input_tokens_seen": 155734048, + "step": 127985 + }, + { + "epoch": 14.254371310836396, + "grad_norm": 0.8310777544975281, + "learning_rate": 1.1551836971917803e-05, + "loss": 0.0456, + "num_input_tokens_seen": 155740288, + "step": 127990 + }, + { + "epoch": 14.254928165720013, + "grad_norm": 0.006617893930524588, + "learning_rate": 1.1549788786567698e-05, + "loss": 0.066, + "num_input_tokens_seen": 155746144, + "step": 127995 + }, + { + "epoch": 14.255485020603631, + "grad_norm": 0.011274071410298347, + "learning_rate": 1.1547740728265622e-05, + "loss": 0.0186, + "num_input_tokens_seen": 155752096, + "step": 128000 + }, + { + "epoch": 14.256041875487249, + "grad_norm": 0.0001530134613858536, + "learning_rate": 1.1545692797030914e-05, + "loss": 0.0013, + "num_input_tokens_seen": 155757728, + "step": 128005 + }, + { + "epoch": 14.256598730370865, + "grad_norm": 1.447945475578308, + "learning_rate": 1.1543644992882916e-05, + "loss": 0.0451, + "num_input_tokens_seen": 155764096, + "step": 128010 + }, + { + "epoch": 14.257155585254482, + "grad_norm": 0.01384898740798235, + "learning_rate": 1.1541597315840963e-05, + "loss": 0.0074, + "num_input_tokens_seen": 155770368, + "step": 128015 + }, + { + "epoch": 14.2577124401381, + "grad_norm": 0.08267945796251297, + "learning_rate": 1.1539549765924416e-05, + "loss": 0.0029, + "num_input_tokens_seen": 155776288, + "step": 128020 + }, + { + "epoch": 14.258269295021718, + "grad_norm": 0.9881368279457092, + "learning_rate": 1.1537502343152594e-05, + "loss": 0.0776, + "num_input_tokens_seen": 155781920, + "step": 128025 + }, + { + "epoch": 14.258826149905335, + "grad_norm": 2.052293062210083, + "learning_rate": 1.153545504754487e-05, + "loss": 0.2381, + "num_input_tokens_seen": 155788224, + "step": 128030 + }, + { + "epoch": 14.259383004788951, + "grad_norm": 0.000708972685970366, + "learning_rate": 1.1533407879120539e-05, + "loss": 0.0667, + "num_input_tokens_seen": 155794496, + "step": 128035 + }, + { + "epoch": 14.259939859672569, + "grad_norm": 2.387850761413574, + "learning_rate": 1.1531360837898973e-05, + "loss": 0.0638, + "num_input_tokens_seen": 155800640, + "step": 128040 + }, + { + "epoch": 14.260496714556187, + "grad_norm": 0.16142605245113373, + "learning_rate": 1.1529313923899482e-05, + "loss": 0.0589, + "num_input_tokens_seen": 155806688, + "step": 128045 + }, + { + "epoch": 14.261053569439804, + "grad_norm": 0.002465970814228058, + "learning_rate": 1.1527267137141423e-05, + "loss": 0.0065, + "num_input_tokens_seen": 155812992, + "step": 128050 + }, + { + "epoch": 14.261610424323422, + "grad_norm": 0.10823409259319305, + "learning_rate": 1.1525220477644123e-05, + "loss": 0.0079, + "num_input_tokens_seen": 155818848, + "step": 128055 + }, + { + "epoch": 14.262167279207038, + "grad_norm": 0.27661821246147156, + "learning_rate": 1.152317394542691e-05, + "loss": 0.0239, + "num_input_tokens_seen": 155824704, + "step": 128060 + }, + { + "epoch": 14.262724134090655, + "grad_norm": 2.3227157592773438, + "learning_rate": 1.1521127540509106e-05, + "loss": 0.2478, + "num_input_tokens_seen": 155831136, + "step": 128065 + }, + { + "epoch": 14.263280988974273, + "grad_norm": 1.2502952814102173, + "learning_rate": 1.1519081262910061e-05, + "loss": 0.0578, + "num_input_tokens_seen": 155837216, + "step": 128070 + }, + { + "epoch": 14.26383784385789, + "grad_norm": 0.03167865052819252, + "learning_rate": 1.1517035112649096e-05, + "loss": 0.06, + "num_input_tokens_seen": 155843520, + "step": 128075 + }, + { + "epoch": 14.264394698741508, + "grad_norm": 0.23383110761642456, + "learning_rate": 1.1514989089745535e-05, + "loss": 0.027, + "num_input_tokens_seen": 155849856, + "step": 128080 + }, + { + "epoch": 14.264951553625126, + "grad_norm": 0.1229802668094635, + "learning_rate": 1.1512943194218697e-05, + "loss": 0.0458, + "num_input_tokens_seen": 155855616, + "step": 128085 + }, + { + "epoch": 14.265508408508742, + "grad_norm": 0.2580881118774414, + "learning_rate": 1.1510897426087927e-05, + "loss": 0.0773, + "num_input_tokens_seen": 155861984, + "step": 128090 + }, + { + "epoch": 14.26606526339236, + "grad_norm": 0.2551570534706116, + "learning_rate": 1.1508851785372527e-05, + "loss": 0.0109, + "num_input_tokens_seen": 155867744, + "step": 128095 + }, + { + "epoch": 14.266622118275977, + "grad_norm": 0.8834690451622009, + "learning_rate": 1.150680627209185e-05, + "loss": 0.1873, + "num_input_tokens_seen": 155873440, + "step": 128100 + }, + { + "epoch": 14.267178973159595, + "grad_norm": 2.827885389328003, + "learning_rate": 1.1504760886265178e-05, + "loss": 0.1191, + "num_input_tokens_seen": 155879648, + "step": 128105 + }, + { + "epoch": 14.267735828043213, + "grad_norm": 0.10168375819921494, + "learning_rate": 1.1502715627911865e-05, + "loss": 0.0145, + "num_input_tokens_seen": 155885856, + "step": 128110 + }, + { + "epoch": 14.268292682926829, + "grad_norm": 0.33054134249687195, + "learning_rate": 1.1500670497051205e-05, + "loss": 0.0692, + "num_input_tokens_seen": 155892128, + "step": 128115 + }, + { + "epoch": 14.268849537810446, + "grad_norm": 0.24871325492858887, + "learning_rate": 1.1498625493702535e-05, + "loss": 0.0105, + "num_input_tokens_seen": 155898016, + "step": 128120 + }, + { + "epoch": 14.269406392694064, + "grad_norm": 0.008024781011044979, + "learning_rate": 1.1496580617885166e-05, + "loss": 0.0012, + "num_input_tokens_seen": 155904384, + "step": 128125 + }, + { + "epoch": 14.269963247577682, + "grad_norm": 0.1014091894030571, + "learning_rate": 1.1494535869618414e-05, + "loss": 0.001, + "num_input_tokens_seen": 155910464, + "step": 128130 + }, + { + "epoch": 14.2705201024613, + "grad_norm": 0.01028622966259718, + "learning_rate": 1.149249124892158e-05, + "loss": 0.0454, + "num_input_tokens_seen": 155916320, + "step": 128135 + }, + { + "epoch": 14.271076957344915, + "grad_norm": 0.12237999588251114, + "learning_rate": 1.1490446755813997e-05, + "loss": 0.0086, + "num_input_tokens_seen": 155922496, + "step": 128140 + }, + { + "epoch": 14.271633812228533, + "grad_norm": 1.542431116104126, + "learning_rate": 1.148840239031497e-05, + "loss": 0.127, + "num_input_tokens_seen": 155928512, + "step": 128145 + }, + { + "epoch": 14.27219066711215, + "grad_norm": 1.9877979755401611, + "learning_rate": 1.1486358152443805e-05, + "loss": 0.0455, + "num_input_tokens_seen": 155934784, + "step": 128150 + }, + { + "epoch": 14.272747521995768, + "grad_norm": 0.07604163885116577, + "learning_rate": 1.1484314042219808e-05, + "loss": 0.0335, + "num_input_tokens_seen": 155940928, + "step": 128155 + }, + { + "epoch": 14.273304376879386, + "grad_norm": 0.3474864661693573, + "learning_rate": 1.14822700596623e-05, + "loss": 0.0277, + "num_input_tokens_seen": 155947136, + "step": 128160 + }, + { + "epoch": 14.273861231763002, + "grad_norm": 0.06557025760412216, + "learning_rate": 1.1480226204790573e-05, + "loss": 0.0018, + "num_input_tokens_seen": 155953728, + "step": 128165 + }, + { + "epoch": 14.27441808664662, + "grad_norm": 0.001994107151404023, + "learning_rate": 1.147818247762395e-05, + "loss": 0.2017, + "num_input_tokens_seen": 155959904, + "step": 128170 + }, + { + "epoch": 14.274974941530237, + "grad_norm": 0.6089932918548584, + "learning_rate": 1.1476138878181727e-05, + "loss": 0.0809, + "num_input_tokens_seen": 155965984, + "step": 128175 + }, + { + "epoch": 14.275531796413855, + "grad_norm": 9.721384412841871e-05, + "learning_rate": 1.1474095406483209e-05, + "loss": 0.0036, + "num_input_tokens_seen": 155971968, + "step": 128180 + }, + { + "epoch": 14.276088651297473, + "grad_norm": 1.1117753982543945, + "learning_rate": 1.1472052062547686e-05, + "loss": 0.06, + "num_input_tokens_seen": 155978272, + "step": 128185 + }, + { + "epoch": 14.276645506181088, + "grad_norm": 0.08071175962686539, + "learning_rate": 1.1470008846394481e-05, + "loss": 0.0435, + "num_input_tokens_seen": 155984096, + "step": 128190 + }, + { + "epoch": 14.277202361064706, + "grad_norm": 1.1971815824508667, + "learning_rate": 1.1467965758042882e-05, + "loss": 0.0164, + "num_input_tokens_seen": 155990560, + "step": 128195 + }, + { + "epoch": 14.277759215948324, + "grad_norm": 0.11740938574075699, + "learning_rate": 1.1465922797512186e-05, + "loss": 0.0185, + "num_input_tokens_seen": 155996640, + "step": 128200 + }, + { + "epoch": 14.278316070831941, + "grad_norm": 0.8192350268363953, + "learning_rate": 1.1463879964821686e-05, + "loss": 0.0171, + "num_input_tokens_seen": 156002688, + "step": 128205 + }, + { + "epoch": 14.278872925715559, + "grad_norm": 0.8885430097579956, + "learning_rate": 1.146183725999069e-05, + "loss": 0.0719, + "num_input_tokens_seen": 156009184, + "step": 128210 + }, + { + "epoch": 14.279429780599175, + "grad_norm": 2.6501824855804443, + "learning_rate": 1.1459794683038484e-05, + "loss": 0.0978, + "num_input_tokens_seen": 156015424, + "step": 128215 + }, + { + "epoch": 14.279986635482793, + "grad_norm": 1.3649042844772339, + "learning_rate": 1.1457752233984379e-05, + "loss": 0.0602, + "num_input_tokens_seen": 156021152, + "step": 128220 + }, + { + "epoch": 14.28054349036641, + "grad_norm": 0.0019283787114545703, + "learning_rate": 1.1455709912847637e-05, + "loss": 0.0035, + "num_input_tokens_seen": 156027264, + "step": 128225 + }, + { + "epoch": 14.281100345250028, + "grad_norm": 0.2964422404766083, + "learning_rate": 1.1453667719647576e-05, + "loss": 0.0503, + "num_input_tokens_seen": 156033984, + "step": 128230 + }, + { + "epoch": 14.281657200133646, + "grad_norm": 0.2698756158351898, + "learning_rate": 1.1451625654403466e-05, + "loss": 0.031, + "num_input_tokens_seen": 156040064, + "step": 128235 + }, + { + "epoch": 14.282214055017263, + "grad_norm": 0.008709374815225601, + "learning_rate": 1.1449583717134619e-05, + "loss": 0.0099, + "num_input_tokens_seen": 156046080, + "step": 128240 + }, + { + "epoch": 14.28277090990088, + "grad_norm": 0.9509157538414001, + "learning_rate": 1.1447541907860307e-05, + "loss": 0.0459, + "num_input_tokens_seen": 156051872, + "step": 128245 + }, + { + "epoch": 14.283327764784497, + "grad_norm": 1.2440299987792969, + "learning_rate": 1.144550022659982e-05, + "loss": 0.0712, + "num_input_tokens_seen": 156058080, + "step": 128250 + }, + { + "epoch": 14.283884619668115, + "grad_norm": 0.002989609958603978, + "learning_rate": 1.1443458673372435e-05, + "loss": 0.012, + "num_input_tokens_seen": 156064224, + "step": 128255 + }, + { + "epoch": 14.284441474551732, + "grad_norm": 0.2445397675037384, + "learning_rate": 1.1441417248197454e-05, + "loss": 0.0518, + "num_input_tokens_seen": 156070432, + "step": 128260 + }, + { + "epoch": 14.28499832943535, + "grad_norm": 0.15635044872760773, + "learning_rate": 1.1439375951094148e-05, + "loss": 0.0038, + "num_input_tokens_seen": 156076640, + "step": 128265 + }, + { + "epoch": 14.285555184318966, + "grad_norm": 0.001736203208565712, + "learning_rate": 1.1437334782081801e-05, + "loss": 0.0055, + "num_input_tokens_seen": 156082720, + "step": 128270 + }, + { + "epoch": 14.286112039202584, + "grad_norm": 0.05384070426225662, + "learning_rate": 1.1435293741179687e-05, + "loss": 0.0112, + "num_input_tokens_seen": 156088640, + "step": 128275 + }, + { + "epoch": 14.286668894086201, + "grad_norm": 0.1782599687576294, + "learning_rate": 1.1433252828407099e-05, + "loss": 0.0905, + "num_input_tokens_seen": 156094304, + "step": 128280 + }, + { + "epoch": 14.287225748969819, + "grad_norm": 0.08849300444126129, + "learning_rate": 1.1431212043783295e-05, + "loss": 0.0336, + "num_input_tokens_seen": 156100832, + "step": 128285 + }, + { + "epoch": 14.287782603853437, + "grad_norm": 1.561156988143921, + "learning_rate": 1.1429171387327587e-05, + "loss": 0.0272, + "num_input_tokens_seen": 156106752, + "step": 128290 + }, + { + "epoch": 14.288339458737052, + "grad_norm": 0.020319638773798943, + "learning_rate": 1.1427130859059209e-05, + "loss": 0.0047, + "num_input_tokens_seen": 156112768, + "step": 128295 + }, + { + "epoch": 14.28889631362067, + "grad_norm": 0.00013062010111752898, + "learning_rate": 1.1425090458997462e-05, + "loss": 0.0097, + "num_input_tokens_seen": 156118944, + "step": 128300 + }, + { + "epoch": 14.289453168504288, + "grad_norm": 0.15503047406673431, + "learning_rate": 1.1423050187161605e-05, + "loss": 0.0116, + "num_input_tokens_seen": 156125056, + "step": 128305 + }, + { + "epoch": 14.290010023387905, + "grad_norm": 0.009100470691919327, + "learning_rate": 1.1421010043570923e-05, + "loss": 0.1639, + "num_input_tokens_seen": 156131136, + "step": 128310 + }, + { + "epoch": 14.290566878271523, + "grad_norm": 0.0052468362264335155, + "learning_rate": 1.141897002824468e-05, + "loss": 0.0012, + "num_input_tokens_seen": 156136928, + "step": 128315 + }, + { + "epoch": 14.291123733155139, + "grad_norm": 0.03696126490831375, + "learning_rate": 1.1416930141202149e-05, + "loss": 0.0932, + "num_input_tokens_seen": 156143296, + "step": 128320 + }, + { + "epoch": 14.291680588038757, + "grad_norm": 0.2280760258436203, + "learning_rate": 1.1414890382462585e-05, + "loss": 0.0329, + "num_input_tokens_seen": 156149088, + "step": 128325 + }, + { + "epoch": 14.292237442922374, + "grad_norm": 0.014245404861867428, + "learning_rate": 1.1412850752045274e-05, + "loss": 0.0203, + "num_input_tokens_seen": 156155104, + "step": 128330 + }, + { + "epoch": 14.292794297805992, + "grad_norm": 0.014468595385551453, + "learning_rate": 1.1410811249969475e-05, + "loss": 0.0016, + "num_input_tokens_seen": 156161184, + "step": 128335 + }, + { + "epoch": 14.29335115268961, + "grad_norm": 0.0644327700138092, + "learning_rate": 1.1408771876254448e-05, + "loss": 0.0224, + "num_input_tokens_seen": 156167392, + "step": 128340 + }, + { + "epoch": 14.293908007573226, + "grad_norm": 0.016318099573254585, + "learning_rate": 1.1406732630919453e-05, + "loss": 0.0042, + "num_input_tokens_seen": 156173472, + "step": 128345 + }, + { + "epoch": 14.294464862456843, + "grad_norm": 0.12424156814813614, + "learning_rate": 1.1404693513983769e-05, + "loss": 0.0747, + "num_input_tokens_seen": 156179872, + "step": 128350 + }, + { + "epoch": 14.295021717340461, + "grad_norm": 1.292391061782837, + "learning_rate": 1.1402654525466639e-05, + "loss": 0.0255, + "num_input_tokens_seen": 156185984, + "step": 128355 + }, + { + "epoch": 14.295578572224079, + "grad_norm": 0.009060884825885296, + "learning_rate": 1.1400615665387347e-05, + "loss": 0.1725, + "num_input_tokens_seen": 156191616, + "step": 128360 + }, + { + "epoch": 14.296135427107696, + "grad_norm": 1.5510554313659668, + "learning_rate": 1.1398576933765117e-05, + "loss": 0.1176, + "num_input_tokens_seen": 156196896, + "step": 128365 + }, + { + "epoch": 14.296692281991312, + "grad_norm": 0.6875202059745789, + "learning_rate": 1.1396538330619234e-05, + "loss": 0.1565, + "num_input_tokens_seen": 156202368, + "step": 128370 + }, + { + "epoch": 14.29724913687493, + "grad_norm": 0.01473152730613947, + "learning_rate": 1.1394499855968946e-05, + "loss": 0.0167, + "num_input_tokens_seen": 156207936, + "step": 128375 + }, + { + "epoch": 14.297805991758548, + "grad_norm": 0.25384363532066345, + "learning_rate": 1.13924615098335e-05, + "loss": 0.0029, + "num_input_tokens_seen": 156214176, + "step": 128380 + }, + { + "epoch": 14.298362846642165, + "grad_norm": 0.0015906734624877572, + "learning_rate": 1.1390423292232164e-05, + "loss": 0.0417, + "num_input_tokens_seen": 156220288, + "step": 128385 + }, + { + "epoch": 14.298919701525783, + "grad_norm": 0.006947418674826622, + "learning_rate": 1.1388385203184185e-05, + "loss": 0.0041, + "num_input_tokens_seen": 156226496, + "step": 128390 + }, + { + "epoch": 14.299476556409399, + "grad_norm": 0.06663902848958969, + "learning_rate": 1.1386347242708814e-05, + "loss": 0.0407, + "num_input_tokens_seen": 156232672, + "step": 128395 + }, + { + "epoch": 14.300033411293017, + "grad_norm": 0.4721396267414093, + "learning_rate": 1.138430941082529e-05, + "loss": 0.0599, + "num_input_tokens_seen": 156238624, + "step": 128400 + }, + { + "epoch": 14.300590266176634, + "grad_norm": 0.031642790883779526, + "learning_rate": 1.138227170755288e-05, + "loss": 0.0077, + "num_input_tokens_seen": 156244704, + "step": 128405 + }, + { + "epoch": 14.301147121060252, + "grad_norm": 0.40479394793510437, + "learning_rate": 1.1380234132910828e-05, + "loss": 0.0617, + "num_input_tokens_seen": 156250656, + "step": 128410 + }, + { + "epoch": 14.30170397594387, + "grad_norm": 0.08764346688985825, + "learning_rate": 1.1378196686918375e-05, + "loss": 0.1094, + "num_input_tokens_seen": 156256928, + "step": 128415 + }, + { + "epoch": 14.302260830827485, + "grad_norm": 0.11690060794353485, + "learning_rate": 1.1376159369594758e-05, + "loss": 0.0223, + "num_input_tokens_seen": 156263104, + "step": 128420 + }, + { + "epoch": 14.302817685711103, + "grad_norm": 1.665332555770874, + "learning_rate": 1.137412218095924e-05, + "loss": 0.1832, + "num_input_tokens_seen": 156269056, + "step": 128425 + }, + { + "epoch": 14.30337454059472, + "grad_norm": 0.011094028130173683, + "learning_rate": 1.1372085121031045e-05, + "loss": 0.0259, + "num_input_tokens_seen": 156275232, + "step": 128430 + }, + { + "epoch": 14.303931395478338, + "grad_norm": 1.1777029037475586, + "learning_rate": 1.1370048189829444e-05, + "loss": 0.0346, + "num_input_tokens_seen": 156281312, + "step": 128435 + }, + { + "epoch": 14.304488250361956, + "grad_norm": 1.0807762145996094, + "learning_rate": 1.1368011387373639e-05, + "loss": 0.0286, + "num_input_tokens_seen": 156287360, + "step": 128440 + }, + { + "epoch": 14.305045105245574, + "grad_norm": 2.089304208755493, + "learning_rate": 1.1365974713682897e-05, + "loss": 0.0681, + "num_input_tokens_seen": 156293696, + "step": 128445 + }, + { + "epoch": 14.30560196012919, + "grad_norm": 1.4566435813903809, + "learning_rate": 1.136393816877644e-05, + "loss": 0.0878, + "num_input_tokens_seen": 156299424, + "step": 128450 + }, + { + "epoch": 14.306158815012807, + "grad_norm": 0.26725608110427856, + "learning_rate": 1.136190175267352e-05, + "loss": 0.0067, + "num_input_tokens_seen": 156305536, + "step": 128455 + }, + { + "epoch": 14.306715669896425, + "grad_norm": 0.22724896669387817, + "learning_rate": 1.1359865465393366e-05, + "loss": 0.0134, + "num_input_tokens_seen": 156311456, + "step": 128460 + }, + { + "epoch": 14.307272524780043, + "grad_norm": 0.004350993782281876, + "learning_rate": 1.1357829306955209e-05, + "loss": 0.0245, + "num_input_tokens_seen": 156317696, + "step": 128465 + }, + { + "epoch": 14.30782937966366, + "grad_norm": 0.13327565789222717, + "learning_rate": 1.1355793277378274e-05, + "loss": 0.027, + "num_input_tokens_seen": 156323424, + "step": 128470 + }, + { + "epoch": 14.308386234547276, + "grad_norm": 0.029651379212737083, + "learning_rate": 1.1353757376681815e-05, + "loss": 0.0005, + "num_input_tokens_seen": 156329472, + "step": 128475 + }, + { + "epoch": 14.308943089430894, + "grad_norm": 0.9304077625274658, + "learning_rate": 1.135172160488505e-05, + "loss": 0.0501, + "num_input_tokens_seen": 156335712, + "step": 128480 + }, + { + "epoch": 14.309499944314512, + "grad_norm": 0.3489091694355011, + "learning_rate": 1.1349685962007209e-05, + "loss": 0.1323, + "num_input_tokens_seen": 156341824, + "step": 128485 + }, + { + "epoch": 14.31005679919813, + "grad_norm": 0.0038549331948161125, + "learning_rate": 1.1347650448067512e-05, + "loss": 0.0235, + "num_input_tokens_seen": 156347808, + "step": 128490 + }, + { + "epoch": 14.310613654081747, + "grad_norm": 0.011592437513172626, + "learning_rate": 1.1345615063085203e-05, + "loss": 0.0122, + "num_input_tokens_seen": 156354144, + "step": 128495 + }, + { + "epoch": 14.311170508965363, + "grad_norm": 0.03219029679894447, + "learning_rate": 1.1343579807079488e-05, + "loss": 0.0117, + "num_input_tokens_seen": 156360288, + "step": 128500 + }, + { + "epoch": 14.31172736384898, + "grad_norm": 0.3196002244949341, + "learning_rate": 1.1341544680069624e-05, + "loss": 0.144, + "num_input_tokens_seen": 156366656, + "step": 128505 + }, + { + "epoch": 14.312284218732598, + "grad_norm": 0.8008868098258972, + "learning_rate": 1.1339509682074795e-05, + "loss": 0.0089, + "num_input_tokens_seen": 156372736, + "step": 128510 + }, + { + "epoch": 14.312841073616216, + "grad_norm": 0.06665695458650589, + "learning_rate": 1.1337474813114251e-05, + "loss": 0.0298, + "num_input_tokens_seen": 156378688, + "step": 128515 + }, + { + "epoch": 14.313397928499834, + "grad_norm": 0.10892663151025772, + "learning_rate": 1.1335440073207195e-05, + "loss": 0.0282, + "num_input_tokens_seen": 156384896, + "step": 128520 + }, + { + "epoch": 14.31395478338345, + "grad_norm": 5.451999664306641, + "learning_rate": 1.1333405462372863e-05, + "loss": 0.0684, + "num_input_tokens_seen": 156391392, + "step": 128525 + }, + { + "epoch": 14.314511638267067, + "grad_norm": 0.8307653665542603, + "learning_rate": 1.1331370980630468e-05, + "loss": 0.0606, + "num_input_tokens_seen": 156397248, + "step": 128530 + }, + { + "epoch": 14.315068493150685, + "grad_norm": 0.000746999925468117, + "learning_rate": 1.1329336627999223e-05, + "loss": 0.0249, + "num_input_tokens_seen": 156403360, + "step": 128535 + }, + { + "epoch": 14.315625348034303, + "grad_norm": 2.348297119140625, + "learning_rate": 1.1327302404498339e-05, + "loss": 0.1854, + "num_input_tokens_seen": 156409376, + "step": 128540 + }, + { + "epoch": 14.31618220291792, + "grad_norm": 0.001971442485228181, + "learning_rate": 1.1325268310147043e-05, + "loss": 0.0151, + "num_input_tokens_seen": 156415552, + "step": 128545 + }, + { + "epoch": 14.316739057801536, + "grad_norm": 0.00034386443439871073, + "learning_rate": 1.1323234344964547e-05, + "loss": 0.0367, + "num_input_tokens_seen": 156421600, + "step": 128550 + }, + { + "epoch": 14.317295912685154, + "grad_norm": 0.058037128299474716, + "learning_rate": 1.1321200508970061e-05, + "loss": 0.0302, + "num_input_tokens_seen": 156427616, + "step": 128555 + }, + { + "epoch": 14.317852767568771, + "grad_norm": 0.014413139782845974, + "learning_rate": 1.1319166802182787e-05, + "loss": 0.0117, + "num_input_tokens_seen": 156433440, + "step": 128560 + }, + { + "epoch": 14.31840962245239, + "grad_norm": 0.11839131265878677, + "learning_rate": 1.131713322462195e-05, + "loss": 0.0619, + "num_input_tokens_seen": 156439520, + "step": 128565 + }, + { + "epoch": 14.318966477336007, + "grad_norm": 0.7182055115699768, + "learning_rate": 1.1315099776306745e-05, + "loss": 0.0279, + "num_input_tokens_seen": 156445344, + "step": 128570 + }, + { + "epoch": 14.319523332219624, + "grad_norm": 0.5265105962753296, + "learning_rate": 1.1313066457256397e-05, + "loss": 0.0281, + "num_input_tokens_seen": 156451456, + "step": 128575 + }, + { + "epoch": 14.32008018710324, + "grad_norm": 0.28249892592430115, + "learning_rate": 1.1311033267490099e-05, + "loss": 0.1671, + "num_input_tokens_seen": 156457408, + "step": 128580 + }, + { + "epoch": 14.320637041986858, + "grad_norm": 0.0755893662571907, + "learning_rate": 1.130900020702706e-05, + "loss": 0.0041, + "num_input_tokens_seen": 156463328, + "step": 128585 + }, + { + "epoch": 14.321193896870476, + "grad_norm": 0.14693935215473175, + "learning_rate": 1.1306967275886473e-05, + "loss": 0.0019, + "num_input_tokens_seen": 156469760, + "step": 128590 + }, + { + "epoch": 14.321750751754093, + "grad_norm": 0.10201200842857361, + "learning_rate": 1.1304934474087563e-05, + "loss": 0.0031, + "num_input_tokens_seen": 156475872, + "step": 128595 + }, + { + "epoch": 14.322307606637711, + "grad_norm": 0.12645885348320007, + "learning_rate": 1.1302901801649517e-05, + "loss": 0.0044, + "num_input_tokens_seen": 156481856, + "step": 128600 + }, + { + "epoch": 14.322864461521327, + "grad_norm": 0.10241477191448212, + "learning_rate": 1.130086925859154e-05, + "loss": 0.0504, + "num_input_tokens_seen": 156488288, + "step": 128605 + }, + { + "epoch": 14.323421316404945, + "grad_norm": 0.0016265609301626682, + "learning_rate": 1.1298836844932817e-05, + "loss": 0.0074, + "num_input_tokens_seen": 156494592, + "step": 128610 + }, + { + "epoch": 14.323978171288562, + "grad_norm": 0.020129073411226273, + "learning_rate": 1.1296804560692568e-05, + "loss": 0.0426, + "num_input_tokens_seen": 156500640, + "step": 128615 + }, + { + "epoch": 14.32453502617218, + "grad_norm": 0.13945257663726807, + "learning_rate": 1.1294772405889966e-05, + "loss": 0.0033, + "num_input_tokens_seen": 156506496, + "step": 128620 + }, + { + "epoch": 14.325091881055798, + "grad_norm": 0.683426558971405, + "learning_rate": 1.129274038054424e-05, + "loss": 0.0196, + "num_input_tokens_seen": 156512640, + "step": 128625 + }, + { + "epoch": 14.325648735939414, + "grad_norm": 0.07175247371196747, + "learning_rate": 1.1290708484674545e-05, + "loss": 0.0512, + "num_input_tokens_seen": 156518880, + "step": 128630 + }, + { + "epoch": 14.326205590823031, + "grad_norm": 1.5330781936645508, + "learning_rate": 1.1288676718300101e-05, + "loss": 0.1028, + "num_input_tokens_seen": 156525024, + "step": 128635 + }, + { + "epoch": 14.326762445706649, + "grad_norm": 0.0005704801296815276, + "learning_rate": 1.1286645081440081e-05, + "loss": 0.0773, + "num_input_tokens_seen": 156531168, + "step": 128640 + }, + { + "epoch": 14.327319300590267, + "grad_norm": 0.6919819116592407, + "learning_rate": 1.1284613574113693e-05, + "loss": 0.1121, + "num_input_tokens_seen": 156536928, + "step": 128645 + }, + { + "epoch": 14.327876155473884, + "grad_norm": 0.05689328908920288, + "learning_rate": 1.128258219634012e-05, + "loss": 0.0015, + "num_input_tokens_seen": 156543168, + "step": 128650 + }, + { + "epoch": 14.3284330103575, + "grad_norm": 0.6277748346328735, + "learning_rate": 1.1280550948138549e-05, + "loss": 0.0808, + "num_input_tokens_seen": 156549536, + "step": 128655 + }, + { + "epoch": 14.328989865241118, + "grad_norm": 0.0018764090491458774, + "learning_rate": 1.1278519829528155e-05, + "loss": 0.0206, + "num_input_tokens_seen": 156555712, + "step": 128660 + }, + { + "epoch": 14.329546720124736, + "grad_norm": 0.6701475381851196, + "learning_rate": 1.1276488840528146e-05, + "loss": 0.0752, + "num_input_tokens_seen": 156561792, + "step": 128665 + }, + { + "epoch": 14.330103575008353, + "grad_norm": 0.13572342693805695, + "learning_rate": 1.1274457981157693e-05, + "loss": 0.0086, + "num_input_tokens_seen": 156568224, + "step": 128670 + }, + { + "epoch": 14.33066042989197, + "grad_norm": 0.7802541255950928, + "learning_rate": 1.1272427251435982e-05, + "loss": 0.0168, + "num_input_tokens_seen": 156574368, + "step": 128675 + }, + { + "epoch": 14.331217284775587, + "grad_norm": 0.17904500663280487, + "learning_rate": 1.1270396651382188e-05, + "loss": 0.0232, + "num_input_tokens_seen": 156580576, + "step": 128680 + }, + { + "epoch": 14.331774139659204, + "grad_norm": 0.35655272006988525, + "learning_rate": 1.1268366181015502e-05, + "loss": 0.0068, + "num_input_tokens_seen": 156586496, + "step": 128685 + }, + { + "epoch": 14.332330994542822, + "grad_norm": 0.0024707817938178778, + "learning_rate": 1.1266335840355092e-05, + "loss": 0.0042, + "num_input_tokens_seen": 156592640, + "step": 128690 + }, + { + "epoch": 14.33288784942644, + "grad_norm": 0.004156025592237711, + "learning_rate": 1.1264305629420161e-05, + "loss": 0.009, + "num_input_tokens_seen": 156598848, + "step": 128695 + }, + { + "epoch": 14.333444704310057, + "grad_norm": 0.17628346383571625, + "learning_rate": 1.126227554822985e-05, + "loss": 0.0118, + "num_input_tokens_seen": 156604800, + "step": 128700 + }, + { + "epoch": 14.334001559193673, + "grad_norm": 0.008126771077513695, + "learning_rate": 1.1260245596803362e-05, + "loss": 0.0477, + "num_input_tokens_seen": 156611008, + "step": 128705 + }, + { + "epoch": 14.334558414077291, + "grad_norm": 0.5569392442703247, + "learning_rate": 1.1258215775159853e-05, + "loss": 0.0193, + "num_input_tokens_seen": 156617248, + "step": 128710 + }, + { + "epoch": 14.335115268960909, + "grad_norm": 2.0457427501678467, + "learning_rate": 1.1256186083318515e-05, + "loss": 0.0592, + "num_input_tokens_seen": 156623360, + "step": 128715 + }, + { + "epoch": 14.335672123844526, + "grad_norm": 0.004823042545467615, + "learning_rate": 1.1254156521298512e-05, + "loss": 0.0113, + "num_input_tokens_seen": 156629280, + "step": 128720 + }, + { + "epoch": 14.336228978728144, + "grad_norm": 0.06571433693170547, + "learning_rate": 1.1252127089119014e-05, + "loss": 0.0216, + "num_input_tokens_seen": 156635520, + "step": 128725 + }, + { + "epoch": 14.33678583361176, + "grad_norm": 0.0007286047330126166, + "learning_rate": 1.1250097786799177e-05, + "loss": 0.0971, + "num_input_tokens_seen": 156641600, + "step": 128730 + }, + { + "epoch": 14.337342688495378, + "grad_norm": 0.00031754968222230673, + "learning_rate": 1.1248068614358195e-05, + "loss": 0.0276, + "num_input_tokens_seen": 156647712, + "step": 128735 + }, + { + "epoch": 14.337899543378995, + "grad_norm": 2.373769998550415, + "learning_rate": 1.1246039571815223e-05, + "loss": 0.2054, + "num_input_tokens_seen": 156653888, + "step": 128740 + }, + { + "epoch": 14.338456398262613, + "grad_norm": 0.0034950540866702795, + "learning_rate": 1.1244010659189427e-05, + "loss": 0.1279, + "num_input_tokens_seen": 156659616, + "step": 128745 + }, + { + "epoch": 14.33901325314623, + "grad_norm": 0.09272073954343796, + "learning_rate": 1.124198187649996e-05, + "loss": 0.0242, + "num_input_tokens_seen": 156666144, + "step": 128750 + }, + { + "epoch": 14.339570108029847, + "grad_norm": 0.011784806847572327, + "learning_rate": 1.1239953223766009e-05, + "loss": 0.0078, + "num_input_tokens_seen": 156672256, + "step": 128755 + }, + { + "epoch": 14.340126962913464, + "grad_norm": 0.8198513984680176, + "learning_rate": 1.1237924701006714e-05, + "loss": 0.063, + "num_input_tokens_seen": 156678016, + "step": 128760 + }, + { + "epoch": 14.340683817797082, + "grad_norm": 1.052324891090393, + "learning_rate": 1.1235896308241262e-05, + "loss": 0.0352, + "num_input_tokens_seen": 156683552, + "step": 128765 + }, + { + "epoch": 14.3412406726807, + "grad_norm": 0.00015411098138429224, + "learning_rate": 1.1233868045488783e-05, + "loss": 0.1416, + "num_input_tokens_seen": 156689728, + "step": 128770 + }, + { + "epoch": 14.341797527564317, + "grad_norm": 0.3383760154247284, + "learning_rate": 1.1231839912768455e-05, + "loss": 0.053, + "num_input_tokens_seen": 156696096, + "step": 128775 + }, + { + "epoch": 14.342354382447935, + "grad_norm": 0.01416199654340744, + "learning_rate": 1.122981191009943e-05, + "loss": 0.0186, + "num_input_tokens_seen": 156702080, + "step": 128780 + }, + { + "epoch": 14.34291123733155, + "grad_norm": 0.3348569869995117, + "learning_rate": 1.1227784037500858e-05, + "loss": 0.0051, + "num_input_tokens_seen": 156708256, + "step": 128785 + }, + { + "epoch": 14.343468092215168, + "grad_norm": 0.19740553200244904, + "learning_rate": 1.1225756294991907e-05, + "loss": 0.0521, + "num_input_tokens_seen": 156714528, + "step": 128790 + }, + { + "epoch": 14.344024947098786, + "grad_norm": 0.7140234708786011, + "learning_rate": 1.1223728682591721e-05, + "loss": 0.0483, + "num_input_tokens_seen": 156720672, + "step": 128795 + }, + { + "epoch": 14.344581801982404, + "grad_norm": 0.12408271431922913, + "learning_rate": 1.1221701200319459e-05, + "loss": 0.0197, + "num_input_tokens_seen": 156726496, + "step": 128800 + }, + { + "epoch": 14.345138656866022, + "grad_norm": 0.045307986438274384, + "learning_rate": 1.121967384819426e-05, + "loss": 0.0136, + "num_input_tokens_seen": 156732480, + "step": 128805 + }, + { + "epoch": 14.345695511749637, + "grad_norm": 0.22380545735359192, + "learning_rate": 1.1217646626235287e-05, + "loss": 0.0117, + "num_input_tokens_seen": 156738752, + "step": 128810 + }, + { + "epoch": 14.346252366633255, + "grad_norm": 1.013803243637085, + "learning_rate": 1.1215619534461686e-05, + "loss": 0.0657, + "num_input_tokens_seen": 156744864, + "step": 128815 + }, + { + "epoch": 14.346809221516873, + "grad_norm": 0.012256092391908169, + "learning_rate": 1.1213592572892603e-05, + "loss": 0.0541, + "num_input_tokens_seen": 156751104, + "step": 128820 + }, + { + "epoch": 14.34736607640049, + "grad_norm": 0.00015826824528630823, + "learning_rate": 1.1211565741547173e-05, + "loss": 0.0009, + "num_input_tokens_seen": 156757280, + "step": 128825 + }, + { + "epoch": 14.347922931284108, + "grad_norm": 0.04417075589299202, + "learning_rate": 1.1209539040444561e-05, + "loss": 0.0462, + "num_input_tokens_seen": 156763264, + "step": 128830 + }, + { + "epoch": 14.348479786167724, + "grad_norm": 0.005147038493305445, + "learning_rate": 1.1207512469603892e-05, + "loss": 0.0195, + "num_input_tokens_seen": 156769216, + "step": 128835 + }, + { + "epoch": 14.349036641051342, + "grad_norm": 0.1025584489107132, + "learning_rate": 1.1205486029044338e-05, + "loss": 0.0213, + "num_input_tokens_seen": 156775424, + "step": 128840 + }, + { + "epoch": 14.34959349593496, + "grad_norm": 2.0937952995300293, + "learning_rate": 1.1203459718785e-05, + "loss": 0.0792, + "num_input_tokens_seen": 156781632, + "step": 128845 + }, + { + "epoch": 14.350150350818577, + "grad_norm": 0.007582674268633127, + "learning_rate": 1.1201433538845052e-05, + "loss": 0.0423, + "num_input_tokens_seen": 156787808, + "step": 128850 + }, + { + "epoch": 14.350707205702195, + "grad_norm": 0.0001577595976414159, + "learning_rate": 1.1199407489243607e-05, + "loss": 0.0458, + "num_input_tokens_seen": 156794080, + "step": 128855 + }, + { + "epoch": 14.35126406058581, + "grad_norm": 1.7550934553146362, + "learning_rate": 1.1197381569999824e-05, + "loss": 0.1243, + "num_input_tokens_seen": 156799744, + "step": 128860 + }, + { + "epoch": 14.351820915469428, + "grad_norm": 0.9701270461082458, + "learning_rate": 1.119535578113283e-05, + "loss": 0.0341, + "num_input_tokens_seen": 156805856, + "step": 128865 + }, + { + "epoch": 14.352377770353046, + "grad_norm": 0.0003836852265521884, + "learning_rate": 1.1193330122661764e-05, + "loss": 0.0662, + "num_input_tokens_seen": 156812032, + "step": 128870 + }, + { + "epoch": 14.352934625236664, + "grad_norm": 0.6306208372116089, + "learning_rate": 1.1191304594605745e-05, + "loss": 0.0752, + "num_input_tokens_seen": 156818592, + "step": 128875 + }, + { + "epoch": 14.353491480120281, + "grad_norm": 0.18990197777748108, + "learning_rate": 1.1189279196983926e-05, + "loss": 0.005, + "num_input_tokens_seen": 156824800, + "step": 128880 + }, + { + "epoch": 14.354048335003897, + "grad_norm": 0.08725237846374512, + "learning_rate": 1.118725392981543e-05, + "loss": 0.0064, + "num_input_tokens_seen": 156831008, + "step": 128885 + }, + { + "epoch": 14.354605189887515, + "grad_norm": 0.08151765167713165, + "learning_rate": 1.118522879311939e-05, + "loss": 0.0027, + "num_input_tokens_seen": 156837184, + "step": 128890 + }, + { + "epoch": 14.355162044771133, + "grad_norm": 0.1700707972049713, + "learning_rate": 1.1183203786914919e-05, + "loss": 0.015, + "num_input_tokens_seen": 156843136, + "step": 128895 + }, + { + "epoch": 14.35571889965475, + "grad_norm": 0.0008296496816910803, + "learning_rate": 1.118117891122117e-05, + "loss": 0.0043, + "num_input_tokens_seen": 156849536, + "step": 128900 + }, + { + "epoch": 14.356275754538368, + "grad_norm": 0.0011125057935714722, + "learning_rate": 1.1179154166057249e-05, + "loss": 0.009, + "num_input_tokens_seen": 156855552, + "step": 128905 + }, + { + "epoch": 14.356832609421984, + "grad_norm": 0.027842935174703598, + "learning_rate": 1.1177129551442309e-05, + "loss": 0.0505, + "num_input_tokens_seen": 156861856, + "step": 128910 + }, + { + "epoch": 14.357389464305601, + "grad_norm": 0.3445991575717926, + "learning_rate": 1.1175105067395433e-05, + "loss": 0.0511, + "num_input_tokens_seen": 156867360, + "step": 128915 + }, + { + "epoch": 14.35794631918922, + "grad_norm": 0.9256232976913452, + "learning_rate": 1.1173080713935777e-05, + "loss": 0.0549, + "num_input_tokens_seen": 156873216, + "step": 128920 + }, + { + "epoch": 14.358503174072837, + "grad_norm": 0.02630269154906273, + "learning_rate": 1.1171056491082444e-05, + "loss": 0.025, + "num_input_tokens_seen": 156879552, + "step": 128925 + }, + { + "epoch": 14.359060028956455, + "grad_norm": 0.08525003492832184, + "learning_rate": 1.116903239885457e-05, + "loss": 0.0693, + "num_input_tokens_seen": 156885568, + "step": 128930 + }, + { + "epoch": 14.359616883840072, + "grad_norm": 0.057333122938871384, + "learning_rate": 1.1167008437271264e-05, + "loss": 0.037, + "num_input_tokens_seen": 156891712, + "step": 128935 + }, + { + "epoch": 14.360173738723688, + "grad_norm": 0.007931393571197987, + "learning_rate": 1.116498460635165e-05, + "loss": 0.0235, + "num_input_tokens_seen": 156897728, + "step": 128940 + }, + { + "epoch": 14.360730593607306, + "grad_norm": 0.0008116225362755358, + "learning_rate": 1.116296090611483e-05, + "loss": 0.0593, + "num_input_tokens_seen": 156904032, + "step": 128945 + }, + { + "epoch": 14.361287448490923, + "grad_norm": 0.13613249361515045, + "learning_rate": 1.1160937336579937e-05, + "loss": 0.0371, + "num_input_tokens_seen": 156910272, + "step": 128950 + }, + { + "epoch": 14.361844303374541, + "grad_norm": 0.19351314008235931, + "learning_rate": 1.1158913897766082e-05, + "loss": 0.0443, + "num_input_tokens_seen": 156916320, + "step": 128955 + }, + { + "epoch": 14.362401158258159, + "grad_norm": 0.01129934098571539, + "learning_rate": 1.1156890589692374e-05, + "loss": 0.0642, + "num_input_tokens_seen": 156922464, + "step": 128960 + }, + { + "epoch": 14.362958013141775, + "grad_norm": 1.2309805154800415, + "learning_rate": 1.1154867412377914e-05, + "loss": 0.068, + "num_input_tokens_seen": 156928224, + "step": 128965 + }, + { + "epoch": 14.363514868025392, + "grad_norm": 0.1306058168411255, + "learning_rate": 1.1152844365841836e-05, + "loss": 0.0237, + "num_input_tokens_seen": 156934592, + "step": 128970 + }, + { + "epoch": 14.36407172290901, + "grad_norm": 1.9644731283187866, + "learning_rate": 1.1150821450103224e-05, + "loss": 0.0955, + "num_input_tokens_seen": 156940768, + "step": 128975 + }, + { + "epoch": 14.364628577792628, + "grad_norm": 0.03903500735759735, + "learning_rate": 1.1148798665181211e-05, + "loss": 0.0031, + "num_input_tokens_seen": 156946976, + "step": 128980 + }, + { + "epoch": 14.365185432676245, + "grad_norm": 0.08613373339176178, + "learning_rate": 1.1146776011094892e-05, + "loss": 0.0116, + "num_input_tokens_seen": 156952576, + "step": 128985 + }, + { + "epoch": 14.365742287559861, + "grad_norm": 0.10556897521018982, + "learning_rate": 1.1144753487863375e-05, + "loss": 0.1365, + "num_input_tokens_seen": 156958592, + "step": 128990 + }, + { + "epoch": 14.366299142443479, + "grad_norm": 0.0002291198179591447, + "learning_rate": 1.1142731095505749e-05, + "loss": 0.0302, + "num_input_tokens_seen": 156964576, + "step": 128995 + }, + { + "epoch": 14.366855997327097, + "grad_norm": 0.21785148978233337, + "learning_rate": 1.1140708834041139e-05, + "loss": 0.0244, + "num_input_tokens_seen": 156970496, + "step": 129000 + }, + { + "epoch": 14.367412852210714, + "grad_norm": 0.003938151989132166, + "learning_rate": 1.1138686703488641e-05, + "loss": 0.0439, + "num_input_tokens_seen": 156976576, + "step": 129005 + }, + { + "epoch": 14.367969707094332, + "grad_norm": 0.3206596374511719, + "learning_rate": 1.113666470386735e-05, + "loss": 0.0765, + "num_input_tokens_seen": 156982656, + "step": 129010 + }, + { + "epoch": 14.368526561977948, + "grad_norm": 0.326923131942749, + "learning_rate": 1.113464283519636e-05, + "loss": 0.0043, + "num_input_tokens_seen": 156988768, + "step": 129015 + }, + { + "epoch": 14.369083416861566, + "grad_norm": 1.4652063846588135, + "learning_rate": 1.1132621097494786e-05, + "loss": 0.0781, + "num_input_tokens_seen": 156994752, + "step": 129020 + }, + { + "epoch": 14.369640271745183, + "grad_norm": 0.08949969708919525, + "learning_rate": 1.1130599490781707e-05, + "loss": 0.0786, + "num_input_tokens_seen": 157001184, + "step": 129025 + }, + { + "epoch": 14.3701971266288, + "grad_norm": 0.8761323690414429, + "learning_rate": 1.1128578015076247e-05, + "loss": 0.0713, + "num_input_tokens_seen": 157007296, + "step": 129030 + }, + { + "epoch": 14.370753981512419, + "grad_norm": 0.2967900037765503, + "learning_rate": 1.112655667039746e-05, + "loss": 0.0395, + "num_input_tokens_seen": 157013312, + "step": 129035 + }, + { + "epoch": 14.371310836396034, + "grad_norm": 0.050408706068992615, + "learning_rate": 1.112453545676447e-05, + "loss": 0.0447, + "num_input_tokens_seen": 157019072, + "step": 129040 + }, + { + "epoch": 14.371867691279652, + "grad_norm": 0.14925742149353027, + "learning_rate": 1.1122514374196353e-05, + "loss": 0.0062, + "num_input_tokens_seen": 157025152, + "step": 129045 + }, + { + "epoch": 14.37242454616327, + "grad_norm": 0.5622954368591309, + "learning_rate": 1.1120493422712213e-05, + "loss": 0.1925, + "num_input_tokens_seen": 157031520, + "step": 129050 + }, + { + "epoch": 14.372981401046887, + "grad_norm": 0.19973821938037872, + "learning_rate": 1.1118472602331136e-05, + "loss": 0.0242, + "num_input_tokens_seen": 157037728, + "step": 129055 + }, + { + "epoch": 14.373538255930505, + "grad_norm": 0.6844708919525146, + "learning_rate": 1.1116451913072203e-05, + "loss": 0.0305, + "num_input_tokens_seen": 157044000, + "step": 129060 + }, + { + "epoch": 14.374095110814121, + "grad_norm": 0.7477544546127319, + "learning_rate": 1.11144313549545e-05, + "loss": 0.0362, + "num_input_tokens_seen": 157049984, + "step": 129065 + }, + { + "epoch": 14.374651965697739, + "grad_norm": 0.16708971560001373, + "learning_rate": 1.1112410927997125e-05, + "loss": 0.0051, + "num_input_tokens_seen": 157055808, + "step": 129070 + }, + { + "epoch": 14.375208820581356, + "grad_norm": 0.799957811832428, + "learning_rate": 1.1110390632219153e-05, + "loss": 0.0954, + "num_input_tokens_seen": 157062176, + "step": 129075 + }, + { + "epoch": 14.375765675464974, + "grad_norm": 2.1962170600891113, + "learning_rate": 1.1108370467639673e-05, + "loss": 0.0233, + "num_input_tokens_seen": 157068480, + "step": 129080 + }, + { + "epoch": 14.376322530348592, + "grad_norm": 0.4009949862957001, + "learning_rate": 1.1106350434277754e-05, + "loss": 0.0643, + "num_input_tokens_seen": 157074400, + "step": 129085 + }, + { + "epoch": 14.376879385232208, + "grad_norm": 0.1134597659111023, + "learning_rate": 1.1104330532152493e-05, + "loss": 0.0175, + "num_input_tokens_seen": 157080512, + "step": 129090 + }, + { + "epoch": 14.377436240115825, + "grad_norm": 0.1972348690032959, + "learning_rate": 1.1102310761282955e-05, + "loss": 0.0737, + "num_input_tokens_seen": 157086528, + "step": 129095 + }, + { + "epoch": 14.377993094999443, + "grad_norm": 1.7317169904708862, + "learning_rate": 1.1100291121688248e-05, + "loss": 0.0673, + "num_input_tokens_seen": 157092768, + "step": 129100 + }, + { + "epoch": 14.37854994988306, + "grad_norm": 0.020987916737794876, + "learning_rate": 1.1098271613387407e-05, + "loss": 0.0146, + "num_input_tokens_seen": 157098912, + "step": 129105 + }, + { + "epoch": 14.379106804766678, + "grad_norm": 0.00021637414465658367, + "learning_rate": 1.1096252236399538e-05, + "loss": 0.0629, + "num_input_tokens_seen": 157105376, + "step": 129110 + }, + { + "epoch": 14.379663659650294, + "grad_norm": 0.9939205646514893, + "learning_rate": 1.1094232990743695e-05, + "loss": 0.1125, + "num_input_tokens_seen": 157111360, + "step": 129115 + }, + { + "epoch": 14.380220514533912, + "grad_norm": 0.41001227498054504, + "learning_rate": 1.1092213876438973e-05, + "loss": 0.0067, + "num_input_tokens_seen": 157117600, + "step": 129120 + }, + { + "epoch": 14.38077736941753, + "grad_norm": 0.0023962699342519045, + "learning_rate": 1.1090194893504435e-05, + "loss": 0.0277, + "num_input_tokens_seen": 157123872, + "step": 129125 + }, + { + "epoch": 14.381334224301147, + "grad_norm": 0.2553839385509491, + "learning_rate": 1.108817604195915e-05, + "loss": 0.1043, + "num_input_tokens_seen": 157129952, + "step": 129130 + }, + { + "epoch": 14.381891079184765, + "grad_norm": 0.960157036781311, + "learning_rate": 1.1086157321822177e-05, + "loss": 0.0275, + "num_input_tokens_seen": 157136032, + "step": 129135 + }, + { + "epoch": 14.382447934068383, + "grad_norm": 0.6951694488525391, + "learning_rate": 1.108413873311261e-05, + "loss": 0.2032, + "num_input_tokens_seen": 157141760, + "step": 129140 + }, + { + "epoch": 14.383004788951999, + "grad_norm": 0.021793600171804428, + "learning_rate": 1.1082120275849495e-05, + "loss": 0.0199, + "num_input_tokens_seen": 157147424, + "step": 129145 + }, + { + "epoch": 14.383561643835616, + "grad_norm": 0.4213584363460541, + "learning_rate": 1.108010195005191e-05, + "loss": 0.1565, + "num_input_tokens_seen": 157153568, + "step": 129150 + }, + { + "epoch": 14.384118498719234, + "grad_norm": 0.003950002137571573, + "learning_rate": 1.1078083755738903e-05, + "loss": 0.0134, + "num_input_tokens_seen": 157159776, + "step": 129155 + }, + { + "epoch": 14.384675353602852, + "grad_norm": 0.16032102704048157, + "learning_rate": 1.1076065692929558e-05, + "loss": 0.0056, + "num_input_tokens_seen": 157165824, + "step": 129160 + }, + { + "epoch": 14.38523220848647, + "grad_norm": 0.5525836944580078, + "learning_rate": 1.107404776164292e-05, + "loss": 0.1184, + "num_input_tokens_seen": 157172192, + "step": 129165 + }, + { + "epoch": 14.385789063370085, + "grad_norm": 0.31414100527763367, + "learning_rate": 1.1072029961898066e-05, + "loss": 0.0389, + "num_input_tokens_seen": 157178240, + "step": 129170 + }, + { + "epoch": 14.386345918253703, + "grad_norm": 0.00028132597799412906, + "learning_rate": 1.107001229371405e-05, + "loss": 0.0301, + "num_input_tokens_seen": 157183936, + "step": 129175 + }, + { + "epoch": 14.38690277313732, + "grad_norm": 0.4504319131374359, + "learning_rate": 1.1067994757109929e-05, + "loss": 0.0067, + "num_input_tokens_seen": 157189888, + "step": 129180 + }, + { + "epoch": 14.387459628020938, + "grad_norm": 0.39291298389434814, + "learning_rate": 1.1065977352104748e-05, + "loss": 0.0507, + "num_input_tokens_seen": 157196288, + "step": 129185 + }, + { + "epoch": 14.388016482904556, + "grad_norm": 0.7783429026603699, + "learning_rate": 1.1063960078717584e-05, + "loss": 0.0957, + "num_input_tokens_seen": 157202496, + "step": 129190 + }, + { + "epoch": 14.388573337788172, + "grad_norm": 0.01050541177392006, + "learning_rate": 1.1061942936967485e-05, + "loss": 0.0399, + "num_input_tokens_seen": 157208704, + "step": 129195 + }, + { + "epoch": 14.38913019267179, + "grad_norm": 0.5242475867271423, + "learning_rate": 1.10599259268735e-05, + "loss": 0.0163, + "num_input_tokens_seen": 157214784, + "step": 129200 + }, + { + "epoch": 14.389687047555407, + "grad_norm": 2.094774007797241, + "learning_rate": 1.1057909048454682e-05, + "loss": 0.0679, + "num_input_tokens_seen": 157220832, + "step": 129205 + }, + { + "epoch": 14.390243902439025, + "grad_norm": 0.00147659657523036, + "learning_rate": 1.1055892301730075e-05, + "loss": 0.006, + "num_input_tokens_seen": 157227072, + "step": 129210 + }, + { + "epoch": 14.390800757322642, + "grad_norm": 0.0004671833594329655, + "learning_rate": 1.1053875686718746e-05, + "loss": 0.1025, + "num_input_tokens_seen": 157233600, + "step": 129215 + }, + { + "epoch": 14.391357612206258, + "grad_norm": 0.0013291233917698264, + "learning_rate": 1.1051859203439736e-05, + "loss": 0.0076, + "num_input_tokens_seen": 157239744, + "step": 129220 + }, + { + "epoch": 14.391914467089876, + "grad_norm": 0.15916205942630768, + "learning_rate": 1.104984285191209e-05, + "loss": 0.0119, + "num_input_tokens_seen": 157246016, + "step": 129225 + }, + { + "epoch": 14.392471321973494, + "grad_norm": 0.04449484869837761, + "learning_rate": 1.1047826632154845e-05, + "loss": 0.0368, + "num_input_tokens_seen": 157251712, + "step": 129230 + }, + { + "epoch": 14.393028176857111, + "grad_norm": 0.11066336929798126, + "learning_rate": 1.1045810544187065e-05, + "loss": 0.0186, + "num_input_tokens_seen": 157257856, + "step": 129235 + }, + { + "epoch": 14.393585031740729, + "grad_norm": 0.011468232609331608, + "learning_rate": 1.1043794588027778e-05, + "loss": 0.0506, + "num_input_tokens_seen": 157264064, + "step": 129240 + }, + { + "epoch": 14.394141886624345, + "grad_norm": 0.051279470324516296, + "learning_rate": 1.1041778763696049e-05, + "loss": 0.1083, + "num_input_tokens_seen": 157270144, + "step": 129245 + }, + { + "epoch": 14.394698741507963, + "grad_norm": 0.0009922666940838099, + "learning_rate": 1.1039763071210884e-05, + "loss": 0.0106, + "num_input_tokens_seen": 157276192, + "step": 129250 + }, + { + "epoch": 14.39525559639158, + "grad_norm": 3.2685844898223877, + "learning_rate": 1.103774751059135e-05, + "loss": 0.1487, + "num_input_tokens_seen": 157282112, + "step": 129255 + }, + { + "epoch": 14.395812451275198, + "grad_norm": 1.2273699045181274, + "learning_rate": 1.103573208185647e-05, + "loss": 0.0896, + "num_input_tokens_seen": 157287904, + "step": 129260 + }, + { + "epoch": 14.396369306158816, + "grad_norm": 0.17526303231716156, + "learning_rate": 1.1033716785025297e-05, + "loss": 0.0602, + "num_input_tokens_seen": 157294048, + "step": 129265 + }, + { + "epoch": 14.396926161042432, + "grad_norm": 0.12517154216766357, + "learning_rate": 1.1031701620116858e-05, + "loss": 0.0031, + "num_input_tokens_seen": 157300224, + "step": 129270 + }, + { + "epoch": 14.39748301592605, + "grad_norm": 0.07074989378452301, + "learning_rate": 1.1029686587150187e-05, + "loss": 0.0063, + "num_input_tokens_seen": 157306560, + "step": 129275 + }, + { + "epoch": 14.398039870809667, + "grad_norm": 0.34673580527305603, + "learning_rate": 1.1027671686144311e-05, + "loss": 0.0681, + "num_input_tokens_seen": 157312544, + "step": 129280 + }, + { + "epoch": 14.398596725693285, + "grad_norm": 3.327716112136841, + "learning_rate": 1.1025656917118283e-05, + "loss": 0.0616, + "num_input_tokens_seen": 157318624, + "step": 129285 + }, + { + "epoch": 14.399153580576902, + "grad_norm": 0.025573568418622017, + "learning_rate": 1.1023642280091118e-05, + "loss": 0.0042, + "num_input_tokens_seen": 157324448, + "step": 129290 + }, + { + "epoch": 14.39971043546052, + "grad_norm": 0.03208821639418602, + "learning_rate": 1.1021627775081847e-05, + "loss": 0.0803, + "num_input_tokens_seen": 157330400, + "step": 129295 + }, + { + "epoch": 14.400267290344136, + "grad_norm": 0.11043872684240341, + "learning_rate": 1.1019613402109497e-05, + "loss": 0.0706, + "num_input_tokens_seen": 157336256, + "step": 129300 + }, + { + "epoch": 14.400824145227753, + "grad_norm": 0.014583155512809753, + "learning_rate": 1.1017599161193104e-05, + "loss": 0.0339, + "num_input_tokens_seen": 157342080, + "step": 129305 + }, + { + "epoch": 14.401381000111371, + "grad_norm": 0.7171545028686523, + "learning_rate": 1.1015585052351682e-05, + "loss": 0.011, + "num_input_tokens_seen": 157348384, + "step": 129310 + }, + { + "epoch": 14.401937854994989, + "grad_norm": 0.21924906969070435, + "learning_rate": 1.1013571075604282e-05, + "loss": 0.0467, + "num_input_tokens_seen": 157354400, + "step": 129315 + }, + { + "epoch": 14.402494709878606, + "grad_norm": 0.004832122474908829, + "learning_rate": 1.101155723096989e-05, + "loss": 0.0029, + "num_input_tokens_seen": 157360544, + "step": 129320 + }, + { + "epoch": 14.403051564762222, + "grad_norm": 0.19638274610042572, + "learning_rate": 1.1009543518467557e-05, + "loss": 0.0922, + "num_input_tokens_seen": 157366048, + "step": 129325 + }, + { + "epoch": 14.40360841964584, + "grad_norm": 0.0010105031542479992, + "learning_rate": 1.1007529938116287e-05, + "loss": 0.1151, + "num_input_tokens_seen": 157372096, + "step": 129330 + }, + { + "epoch": 14.404165274529458, + "grad_norm": 0.024908075109124184, + "learning_rate": 1.1005516489935114e-05, + "loss": 0.0196, + "num_input_tokens_seen": 157378272, + "step": 129335 + }, + { + "epoch": 14.404722129413075, + "grad_norm": 0.0007462491630576551, + "learning_rate": 1.100350317394305e-05, + "loss": 0.0964, + "num_input_tokens_seen": 157384544, + "step": 129340 + }, + { + "epoch": 14.405278984296693, + "grad_norm": 3.553828001022339, + "learning_rate": 1.1001489990159114e-05, + "loss": 0.0395, + "num_input_tokens_seen": 157390816, + "step": 129345 + }, + { + "epoch": 14.405835839180309, + "grad_norm": 0.004165458492934704, + "learning_rate": 1.099947693860231e-05, + "loss": 0.0353, + "num_input_tokens_seen": 157397024, + "step": 129350 + }, + { + "epoch": 14.406392694063927, + "grad_norm": 0.34314584732055664, + "learning_rate": 1.0997464019291673e-05, + "loss": 0.0251, + "num_input_tokens_seen": 157403456, + "step": 129355 + }, + { + "epoch": 14.406949548947544, + "grad_norm": 1.5458160638809204, + "learning_rate": 1.0995451232246206e-05, + "loss": 0.0799, + "num_input_tokens_seen": 157409664, + "step": 129360 + }, + { + "epoch": 14.407506403831162, + "grad_norm": 0.005066984798759222, + "learning_rate": 1.0993438577484925e-05, + "loss": 0.0009, + "num_input_tokens_seen": 157415808, + "step": 129365 + }, + { + "epoch": 14.40806325871478, + "grad_norm": 0.030146557837724686, + "learning_rate": 1.0991426055026827e-05, + "loss": 0.0721, + "num_input_tokens_seen": 157421920, + "step": 129370 + }, + { + "epoch": 14.408620113598396, + "grad_norm": 0.07415738701820374, + "learning_rate": 1.0989413664890944e-05, + "loss": 0.0617, + "num_input_tokens_seen": 157427776, + "step": 129375 + }, + { + "epoch": 14.409176968482013, + "grad_norm": 0.8104115128517151, + "learning_rate": 1.0987401407096262e-05, + "loss": 0.0178, + "num_input_tokens_seen": 157433984, + "step": 129380 + }, + { + "epoch": 14.409733823365631, + "grad_norm": 1.6553641557693481, + "learning_rate": 1.0985389281661813e-05, + "loss": 0.0819, + "num_input_tokens_seen": 157439936, + "step": 129385 + }, + { + "epoch": 14.410290678249249, + "grad_norm": 0.12283532321453094, + "learning_rate": 1.098337728860659e-05, + "loss": 0.0902, + "num_input_tokens_seen": 157445888, + "step": 129390 + }, + { + "epoch": 14.410847533132866, + "grad_norm": 0.7234291434288025, + "learning_rate": 1.0981365427949597e-05, + "loss": 0.0669, + "num_input_tokens_seen": 157452064, + "step": 129395 + }, + { + "epoch": 14.411404388016482, + "grad_norm": 0.13840509951114655, + "learning_rate": 1.097935369970983e-05, + "loss": 0.0812, + "num_input_tokens_seen": 157458208, + "step": 129400 + }, + { + "epoch": 14.4119612429001, + "grad_norm": 0.35846078395843506, + "learning_rate": 1.097734210390631e-05, + "loss": 0.0178, + "num_input_tokens_seen": 157464480, + "step": 129405 + }, + { + "epoch": 14.412518097783718, + "grad_norm": 0.016832586377859116, + "learning_rate": 1.097533064055803e-05, + "loss": 0.0018, + "num_input_tokens_seen": 157470656, + "step": 129410 + }, + { + "epoch": 14.413074952667335, + "grad_norm": 0.0721527636051178, + "learning_rate": 1.0973319309683986e-05, + "loss": 0.0392, + "num_input_tokens_seen": 157476480, + "step": 129415 + }, + { + "epoch": 14.413631807550953, + "grad_norm": 0.9631634950637817, + "learning_rate": 1.097130811130317e-05, + "loss": 0.0511, + "num_input_tokens_seen": 157482752, + "step": 129420 + }, + { + "epoch": 14.414188662434569, + "grad_norm": 0.023334771394729614, + "learning_rate": 1.0969297045434599e-05, + "loss": 0.0078, + "num_input_tokens_seen": 157488704, + "step": 129425 + }, + { + "epoch": 14.414745517318186, + "grad_norm": 0.014800749719142914, + "learning_rate": 1.096728611209725e-05, + "loss": 0.0071, + "num_input_tokens_seen": 157495040, + "step": 129430 + }, + { + "epoch": 14.415302372201804, + "grad_norm": 0.0015765796415507793, + "learning_rate": 1.0965275311310144e-05, + "loss": 0.0346, + "num_input_tokens_seen": 157501024, + "step": 129435 + }, + { + "epoch": 14.415859227085422, + "grad_norm": 0.47493964433670044, + "learning_rate": 1.0963264643092239e-05, + "loss": 0.0245, + "num_input_tokens_seen": 157507008, + "step": 129440 + }, + { + "epoch": 14.41641608196904, + "grad_norm": 1.5403869152069092, + "learning_rate": 1.0961254107462554e-05, + "loss": 0.0718, + "num_input_tokens_seen": 157513280, + "step": 129445 + }, + { + "epoch": 14.416972936852655, + "grad_norm": 0.03556709364056587, + "learning_rate": 1.0959243704440061e-05, + "loss": 0.024, + "num_input_tokens_seen": 157519392, + "step": 129450 + }, + { + "epoch": 14.417529791736273, + "grad_norm": 0.02152284048497677, + "learning_rate": 1.0957233434043773e-05, + "loss": 0.0833, + "num_input_tokens_seen": 157525376, + "step": 129455 + }, + { + "epoch": 14.41808664661989, + "grad_norm": 0.0017513242783024907, + "learning_rate": 1.0955223296292663e-05, + "loss": 0.0143, + "num_input_tokens_seen": 157531392, + "step": 129460 + }, + { + "epoch": 14.418643501503508, + "grad_norm": 0.20876498520374298, + "learning_rate": 1.0953213291205725e-05, + "loss": 0.012, + "num_input_tokens_seen": 157537440, + "step": 129465 + }, + { + "epoch": 14.419200356387126, + "grad_norm": 0.6212307214736938, + "learning_rate": 1.0951203418801932e-05, + "loss": 0.0363, + "num_input_tokens_seen": 157543456, + "step": 129470 + }, + { + "epoch": 14.419757211270742, + "grad_norm": 0.006634407676756382, + "learning_rate": 1.0949193679100284e-05, + "loss": 0.03, + "num_input_tokens_seen": 157549344, + "step": 129475 + }, + { + "epoch": 14.42031406615436, + "grad_norm": 0.35767263174057007, + "learning_rate": 1.0947184072119762e-05, + "loss": 0.0139, + "num_input_tokens_seen": 157555712, + "step": 129480 + }, + { + "epoch": 14.420870921037977, + "grad_norm": 0.000980574986897409, + "learning_rate": 1.0945174597879343e-05, + "loss": 0.0159, + "num_input_tokens_seen": 157561632, + "step": 129485 + }, + { + "epoch": 14.421427775921595, + "grad_norm": 0.18093931674957275, + "learning_rate": 1.0943165256398003e-05, + "loss": 0.0172, + "num_input_tokens_seen": 157567776, + "step": 129490 + }, + { + "epoch": 14.421984630805213, + "grad_norm": 0.9293081760406494, + "learning_rate": 1.0941156047694739e-05, + "loss": 0.0282, + "num_input_tokens_seen": 157574176, + "step": 129495 + }, + { + "epoch": 14.42254148568883, + "grad_norm": 0.0002414352202322334, + "learning_rate": 1.0939146971788507e-05, + "loss": 0.0686, + "num_input_tokens_seen": 157580448, + "step": 129500 + }, + { + "epoch": 14.423098340572446, + "grad_norm": 0.6140003800392151, + "learning_rate": 1.0937138028698321e-05, + "loss": 0.0889, + "num_input_tokens_seen": 157586176, + "step": 129505 + }, + { + "epoch": 14.423655195456064, + "grad_norm": 0.0007539044599980116, + "learning_rate": 1.0935129218443113e-05, + "loss": 0.0479, + "num_input_tokens_seen": 157592096, + "step": 129510 + }, + { + "epoch": 14.424212050339682, + "grad_norm": 0.0006028888165019453, + "learning_rate": 1.0933120541041885e-05, + "loss": 0.0077, + "num_input_tokens_seen": 157598208, + "step": 129515 + }, + { + "epoch": 14.4247689052233, + "grad_norm": 1.7457038164138794, + "learning_rate": 1.0931111996513596e-05, + "loss": 0.0928, + "num_input_tokens_seen": 157604480, + "step": 129520 + }, + { + "epoch": 14.425325760106917, + "grad_norm": 1.7203086614608765, + "learning_rate": 1.0929103584877238e-05, + "loss": 0.0444, + "num_input_tokens_seen": 157610432, + "step": 129525 + }, + { + "epoch": 14.425882614990533, + "grad_norm": 0.38377121090888977, + "learning_rate": 1.0927095306151767e-05, + "loss": 0.0471, + "num_input_tokens_seen": 157616704, + "step": 129530 + }, + { + "epoch": 14.42643946987415, + "grad_norm": 0.0049746232107281685, + "learning_rate": 1.0925087160356157e-05, + "loss": 0.0191, + "num_input_tokens_seen": 157623072, + "step": 129535 + }, + { + "epoch": 14.426996324757768, + "grad_norm": 0.9961355328559875, + "learning_rate": 1.0923079147509363e-05, + "loss": 0.0212, + "num_input_tokens_seen": 157629408, + "step": 129540 + }, + { + "epoch": 14.427553179641386, + "grad_norm": 1.7059600353240967, + "learning_rate": 1.0921071267630378e-05, + "loss": 0.0277, + "num_input_tokens_seen": 157635360, + "step": 129545 + }, + { + "epoch": 14.428110034525004, + "grad_norm": 0.00022759543207939714, + "learning_rate": 1.0919063520738152e-05, + "loss": 0.0426, + "num_input_tokens_seen": 157641600, + "step": 129550 + }, + { + "epoch": 14.42866688940862, + "grad_norm": 0.0010442084167152643, + "learning_rate": 1.0917055906851653e-05, + "loss": 0.0158, + "num_input_tokens_seen": 157647872, + "step": 129555 + }, + { + "epoch": 14.429223744292237, + "grad_norm": 0.25474658608436584, + "learning_rate": 1.0915048425989838e-05, + "loss": 0.0289, + "num_input_tokens_seen": 157654080, + "step": 129560 + }, + { + "epoch": 14.429780599175855, + "grad_norm": 3.552612781524658, + "learning_rate": 1.0913041078171682e-05, + "loss": 0.1203, + "num_input_tokens_seen": 157660224, + "step": 129565 + }, + { + "epoch": 14.430337454059472, + "grad_norm": 0.024851592257618904, + "learning_rate": 1.0911033863416128e-05, + "loss": 0.0082, + "num_input_tokens_seen": 157665696, + "step": 129570 + }, + { + "epoch": 14.43089430894309, + "grad_norm": 0.005456523969769478, + "learning_rate": 1.090902678174216e-05, + "loss": 0.0913, + "num_input_tokens_seen": 157671712, + "step": 129575 + }, + { + "epoch": 14.431451163826706, + "grad_norm": 0.5845804214477539, + "learning_rate": 1.0907019833168722e-05, + "loss": 0.0205, + "num_input_tokens_seen": 157677728, + "step": 129580 + }, + { + "epoch": 14.432008018710324, + "grad_norm": 0.29326578974723816, + "learning_rate": 1.0905013017714771e-05, + "loss": 0.0661, + "num_input_tokens_seen": 157683808, + "step": 129585 + }, + { + "epoch": 14.432564873593941, + "grad_norm": 0.07970710843801498, + "learning_rate": 1.0903006335399257e-05, + "loss": 0.0367, + "num_input_tokens_seen": 157689888, + "step": 129590 + }, + { + "epoch": 14.433121728477559, + "grad_norm": 0.2812732756137848, + "learning_rate": 1.0900999786241154e-05, + "loss": 0.0214, + "num_input_tokens_seen": 157695968, + "step": 129595 + }, + { + "epoch": 14.433678583361177, + "grad_norm": 0.017265835776925087, + "learning_rate": 1.08989933702594e-05, + "loss": 0.0191, + "num_input_tokens_seen": 157702304, + "step": 129600 + }, + { + "epoch": 14.434235438244793, + "grad_norm": 0.65268474817276, + "learning_rate": 1.0896987087472954e-05, + "loss": 0.0437, + "num_input_tokens_seen": 157708384, + "step": 129605 + }, + { + "epoch": 14.43479229312841, + "grad_norm": 0.4492758512496948, + "learning_rate": 1.0894980937900762e-05, + "loss": 0.0433, + "num_input_tokens_seen": 157714400, + "step": 129610 + }, + { + "epoch": 14.435349148012028, + "grad_norm": 1.314285159111023, + "learning_rate": 1.0892974921561768e-05, + "loss": 0.0352, + "num_input_tokens_seen": 157720512, + "step": 129615 + }, + { + "epoch": 14.435906002895646, + "grad_norm": 0.021418137475848198, + "learning_rate": 1.0890969038474938e-05, + "loss": 0.0759, + "num_input_tokens_seen": 157726656, + "step": 129620 + }, + { + "epoch": 14.436462857779263, + "grad_norm": 0.00818618480116129, + "learning_rate": 1.0888963288659207e-05, + "loss": 0.1007, + "num_input_tokens_seen": 157732576, + "step": 129625 + }, + { + "epoch": 14.43701971266288, + "grad_norm": 0.47387728095054626, + "learning_rate": 1.0886957672133527e-05, + "loss": 0.0081, + "num_input_tokens_seen": 157738880, + "step": 129630 + }, + { + "epoch": 14.437576567546497, + "grad_norm": 0.5368305444717407, + "learning_rate": 1.0884952188916828e-05, + "loss": 0.0734, + "num_input_tokens_seen": 157745120, + "step": 129635 + }, + { + "epoch": 14.438133422430115, + "grad_norm": 9.035319089889526e-05, + "learning_rate": 1.0882946839028072e-05, + "loss": 0.076, + "num_input_tokens_seen": 157751584, + "step": 129640 + }, + { + "epoch": 14.438690277313732, + "grad_norm": 1.3401753902435303, + "learning_rate": 1.0880941622486185e-05, + "loss": 0.0707, + "num_input_tokens_seen": 157757792, + "step": 129645 + }, + { + "epoch": 14.43924713219735, + "grad_norm": 0.019344111904501915, + "learning_rate": 1.0878936539310137e-05, + "loss": 0.0432, + "num_input_tokens_seen": 157764160, + "step": 129650 + }, + { + "epoch": 14.439803987080968, + "grad_norm": 0.8195799589157104, + "learning_rate": 1.0876931589518825e-05, + "loss": 0.0699, + "num_input_tokens_seen": 157770368, + "step": 129655 + }, + { + "epoch": 14.440360841964583, + "grad_norm": 0.0647277683019638, + "learning_rate": 1.087492677313122e-05, + "loss": 0.0106, + "num_input_tokens_seen": 157776416, + "step": 129660 + }, + { + "epoch": 14.440917696848201, + "grad_norm": 0.6644046306610107, + "learning_rate": 1.0872922090166241e-05, + "loss": 0.084, + "num_input_tokens_seen": 157782240, + "step": 129665 + }, + { + "epoch": 14.441474551731819, + "grad_norm": 1.2314419746398926, + "learning_rate": 1.0870917540642839e-05, + "loss": 0.031, + "num_input_tokens_seen": 157788256, + "step": 129670 + }, + { + "epoch": 14.442031406615436, + "grad_norm": 0.002201755763962865, + "learning_rate": 1.0868913124579941e-05, + "loss": 0.063, + "num_input_tokens_seen": 157794304, + "step": 129675 + }, + { + "epoch": 14.442588261499054, + "grad_norm": 0.00012716156197711825, + "learning_rate": 1.0866908841996479e-05, + "loss": 0.0775, + "num_input_tokens_seen": 157800352, + "step": 129680 + }, + { + "epoch": 14.44314511638267, + "grad_norm": 0.004534871317446232, + "learning_rate": 1.0864904692911378e-05, + "loss": 0.0978, + "num_input_tokens_seen": 157806432, + "step": 129685 + }, + { + "epoch": 14.443701971266288, + "grad_norm": 0.00044153945054858923, + "learning_rate": 1.0862900677343588e-05, + "loss": 0.0379, + "num_input_tokens_seen": 157812608, + "step": 129690 + }, + { + "epoch": 14.444258826149905, + "grad_norm": 0.3770286738872528, + "learning_rate": 1.0860896795312025e-05, + "loss": 0.017, + "num_input_tokens_seen": 157818656, + "step": 129695 + }, + { + "epoch": 14.444815681033523, + "grad_norm": 0.053513236343860626, + "learning_rate": 1.0858893046835622e-05, + "loss": 0.0239, + "num_input_tokens_seen": 157824992, + "step": 129700 + }, + { + "epoch": 14.44537253591714, + "grad_norm": 0.9828165173530579, + "learning_rate": 1.0856889431933292e-05, + "loss": 0.0207, + "num_input_tokens_seen": 157831296, + "step": 129705 + }, + { + "epoch": 14.445929390800757, + "grad_norm": 0.6912058591842651, + "learning_rate": 1.0854885950623983e-05, + "loss": 0.0075, + "num_input_tokens_seen": 157837440, + "step": 129710 + }, + { + "epoch": 14.446486245684374, + "grad_norm": 0.10687866806983948, + "learning_rate": 1.08528826029266e-05, + "loss": 0.0663, + "num_input_tokens_seen": 157843488, + "step": 129715 + }, + { + "epoch": 14.447043100567992, + "grad_norm": 0.006917011458426714, + "learning_rate": 1.0850879388860086e-05, + "loss": 0.0051, + "num_input_tokens_seen": 157849952, + "step": 129720 + }, + { + "epoch": 14.44759995545161, + "grad_norm": 0.003637408372014761, + "learning_rate": 1.0848876308443351e-05, + "loss": 0.0773, + "num_input_tokens_seen": 157855712, + "step": 129725 + }, + { + "epoch": 14.448156810335227, + "grad_norm": 0.051897499710321426, + "learning_rate": 1.0846873361695317e-05, + "loss": 0.0765, + "num_input_tokens_seen": 157861312, + "step": 129730 + }, + { + "epoch": 14.448713665218843, + "grad_norm": 0.46580642461776733, + "learning_rate": 1.0844870548634895e-05, + "loss": 0.0212, + "num_input_tokens_seen": 157867360, + "step": 129735 + }, + { + "epoch": 14.449270520102461, + "grad_norm": 0.005903377663344145, + "learning_rate": 1.0842867869281017e-05, + "loss": 0.0137, + "num_input_tokens_seen": 157873280, + "step": 129740 + }, + { + "epoch": 14.449827374986079, + "grad_norm": 0.26688867807388306, + "learning_rate": 1.08408653236526e-05, + "loss": 0.0301, + "num_input_tokens_seen": 157879328, + "step": 129745 + }, + { + "epoch": 14.450384229869696, + "grad_norm": 1.2082420587539673, + "learning_rate": 1.083886291176855e-05, + "loss": 0.0163, + "num_input_tokens_seen": 157885408, + "step": 129750 + }, + { + "epoch": 14.450941084753314, + "grad_norm": 0.1764036864042282, + "learning_rate": 1.0836860633647777e-05, + "loss": 0.0035, + "num_input_tokens_seen": 157891456, + "step": 129755 + }, + { + "epoch": 14.45149793963693, + "grad_norm": 0.20102901756763458, + "learning_rate": 1.0834858489309213e-05, + "loss": 0.0765, + "num_input_tokens_seen": 157897696, + "step": 129760 + }, + { + "epoch": 14.452054794520548, + "grad_norm": 0.16462473571300507, + "learning_rate": 1.0832856478771757e-05, + "loss": 0.0033, + "num_input_tokens_seen": 157903840, + "step": 129765 + }, + { + "epoch": 14.452611649404165, + "grad_norm": 0.06829322129487991, + "learning_rate": 1.0830854602054328e-05, + "loss": 0.0605, + "num_input_tokens_seen": 157910016, + "step": 129770 + }, + { + "epoch": 14.453168504287783, + "grad_norm": 0.0002169005892938003, + "learning_rate": 1.0828852859175814e-05, + "loss": 0.0857, + "num_input_tokens_seen": 157915328, + "step": 129775 + }, + { + "epoch": 14.4537253591714, + "grad_norm": 0.2809772491455078, + "learning_rate": 1.082685125015515e-05, + "loss": 0.0333, + "num_input_tokens_seen": 157921248, + "step": 129780 + }, + { + "epoch": 14.454282214055016, + "grad_norm": 0.8742020726203918, + "learning_rate": 1.0824849775011222e-05, + "loss": 0.0366, + "num_input_tokens_seen": 157927200, + "step": 129785 + }, + { + "epoch": 14.454839068938634, + "grad_norm": 0.6273496150970459, + "learning_rate": 1.0822848433762955e-05, + "loss": 0.0753, + "num_input_tokens_seen": 157933632, + "step": 129790 + }, + { + "epoch": 14.455395923822252, + "grad_norm": 1.5413883924484253, + "learning_rate": 1.0820847226429242e-05, + "loss": 0.0657, + "num_input_tokens_seen": 157939616, + "step": 129795 + }, + { + "epoch": 14.45595277870587, + "grad_norm": 0.06387025117874146, + "learning_rate": 1.0818846153028985e-05, + "loss": 0.0342, + "num_input_tokens_seen": 157945792, + "step": 129800 + }, + { + "epoch": 14.456509633589487, + "grad_norm": 0.19652050733566284, + "learning_rate": 1.0816845213581083e-05, + "loss": 0.0338, + "num_input_tokens_seen": 157951968, + "step": 129805 + }, + { + "epoch": 14.457066488473103, + "grad_norm": 0.3446914255619049, + "learning_rate": 1.0814844408104449e-05, + "loss": 0.0757, + "num_input_tokens_seen": 157957856, + "step": 129810 + }, + { + "epoch": 14.45762334335672, + "grad_norm": 0.2042785882949829, + "learning_rate": 1.0812843736617973e-05, + "loss": 0.0136, + "num_input_tokens_seen": 157963296, + "step": 129815 + }, + { + "epoch": 14.458180198240338, + "grad_norm": 0.1495913565158844, + "learning_rate": 1.0810843199140555e-05, + "loss": 0.019, + "num_input_tokens_seen": 157969152, + "step": 129820 + }, + { + "epoch": 14.458737053123956, + "grad_norm": 0.0033167661167681217, + "learning_rate": 1.0808842795691082e-05, + "loss": 0.1926, + "num_input_tokens_seen": 157974976, + "step": 129825 + }, + { + "epoch": 14.459293908007574, + "grad_norm": 0.005964380223304033, + "learning_rate": 1.0806842526288468e-05, + "loss": 0.1006, + "num_input_tokens_seen": 157980992, + "step": 129830 + }, + { + "epoch": 14.45985076289119, + "grad_norm": 0.09867886453866959, + "learning_rate": 1.0804842390951589e-05, + "loss": 0.048, + "num_input_tokens_seen": 157986656, + "step": 129835 + }, + { + "epoch": 14.460407617774807, + "grad_norm": 0.6826224327087402, + "learning_rate": 1.0802842389699367e-05, + "loss": 0.0736, + "num_input_tokens_seen": 157992512, + "step": 129840 + }, + { + "epoch": 14.460964472658425, + "grad_norm": 0.0007636358495801687, + "learning_rate": 1.080084252255065e-05, + "loss": 0.0983, + "num_input_tokens_seen": 157998528, + "step": 129845 + }, + { + "epoch": 14.461521327542043, + "grad_norm": 0.061701107770204544, + "learning_rate": 1.0798842789524362e-05, + "loss": 0.0067, + "num_input_tokens_seen": 158004384, + "step": 129850 + }, + { + "epoch": 14.46207818242566, + "grad_norm": 0.8626852631568909, + "learning_rate": 1.0796843190639375e-05, + "loss": 0.069, + "num_input_tokens_seen": 158010560, + "step": 129855 + }, + { + "epoch": 14.462635037309278, + "grad_norm": 1.110277771949768, + "learning_rate": 1.0794843725914591e-05, + "loss": 0.069, + "num_input_tokens_seen": 158016544, + "step": 129860 + }, + { + "epoch": 14.463191892192894, + "grad_norm": 0.8674794435501099, + "learning_rate": 1.079284439536889e-05, + "loss": 0.026, + "num_input_tokens_seen": 158022976, + "step": 129865 + }, + { + "epoch": 14.463748747076512, + "grad_norm": 0.038448311388492584, + "learning_rate": 1.0790845199021155e-05, + "loss": 0.0136, + "num_input_tokens_seen": 158029312, + "step": 129870 + }, + { + "epoch": 14.46430560196013, + "grad_norm": 0.026007547974586487, + "learning_rate": 1.0788846136890263e-05, + "loss": 0.026, + "num_input_tokens_seen": 158034816, + "step": 129875 + }, + { + "epoch": 14.464862456843747, + "grad_norm": 0.6791285872459412, + "learning_rate": 1.0786847208995112e-05, + "loss": 0.0102, + "num_input_tokens_seen": 158040544, + "step": 129880 + }, + { + "epoch": 14.465419311727365, + "grad_norm": 0.03343725576996803, + "learning_rate": 1.0784848415354581e-05, + "loss": 0.0258, + "num_input_tokens_seen": 158046720, + "step": 129885 + }, + { + "epoch": 14.46597616661098, + "grad_norm": 0.09277646988630295, + "learning_rate": 1.0782849755987543e-05, + "loss": 0.0223, + "num_input_tokens_seen": 158052960, + "step": 129890 + }, + { + "epoch": 14.466533021494598, + "grad_norm": 0.08138740062713623, + "learning_rate": 1.078085123091287e-05, + "loss": 0.0219, + "num_input_tokens_seen": 158058656, + "step": 129895 + }, + { + "epoch": 14.467089876378216, + "grad_norm": 1.6623672246932983, + "learning_rate": 1.077885284014946e-05, + "loss": 0.0127, + "num_input_tokens_seen": 158064736, + "step": 129900 + }, + { + "epoch": 14.467646731261834, + "grad_norm": 0.003151597687974572, + "learning_rate": 1.0776854583716167e-05, + "loss": 0.0242, + "num_input_tokens_seen": 158070752, + "step": 129905 + }, + { + "epoch": 14.468203586145451, + "grad_norm": 0.6641559600830078, + "learning_rate": 1.07748564616319e-05, + "loss": 0.0163, + "num_input_tokens_seen": 158076864, + "step": 129910 + }, + { + "epoch": 14.468760441029067, + "grad_norm": 0.21142150461673737, + "learning_rate": 1.0772858473915486e-05, + "loss": 0.0093, + "num_input_tokens_seen": 158082784, + "step": 129915 + }, + { + "epoch": 14.469317295912685, + "grad_norm": 0.00018128335068468004, + "learning_rate": 1.0770860620585837e-05, + "loss": 0.0877, + "num_input_tokens_seen": 158088768, + "step": 129920 + }, + { + "epoch": 14.469874150796302, + "grad_norm": 0.003306092694401741, + "learning_rate": 1.0768862901661799e-05, + "loss": 0.006, + "num_input_tokens_seen": 158094752, + "step": 129925 + }, + { + "epoch": 14.47043100567992, + "grad_norm": 0.0006114828283898532, + "learning_rate": 1.076686531716226e-05, + "loss": 0.0026, + "num_input_tokens_seen": 158101056, + "step": 129930 + }, + { + "epoch": 14.470987860563538, + "grad_norm": 0.7484161853790283, + "learning_rate": 1.0764867867106085e-05, + "loss": 0.0227, + "num_input_tokens_seen": 158107456, + "step": 129935 + }, + { + "epoch": 14.471544715447154, + "grad_norm": 0.00872312393039465, + "learning_rate": 1.0762870551512138e-05, + "loss": 0.0564, + "num_input_tokens_seen": 158113728, + "step": 129940 + }, + { + "epoch": 14.472101570330771, + "grad_norm": 0.027865884825587273, + "learning_rate": 1.0760873370399275e-05, + "loss": 0.0185, + "num_input_tokens_seen": 158119744, + "step": 129945 + }, + { + "epoch": 14.472658425214389, + "grad_norm": 0.43619048595428467, + "learning_rate": 1.075887632378638e-05, + "loss": 0.0099, + "num_input_tokens_seen": 158125984, + "step": 129950 + }, + { + "epoch": 14.473215280098007, + "grad_norm": 1.7060339450836182, + "learning_rate": 1.0756879411692305e-05, + "loss": 0.2162, + "num_input_tokens_seen": 158131936, + "step": 129955 + }, + { + "epoch": 14.473772134981624, + "grad_norm": 0.3843775689601898, + "learning_rate": 1.0754882634135919e-05, + "loss": 0.0762, + "num_input_tokens_seen": 158137728, + "step": 129960 + }, + { + "epoch": 14.47432898986524, + "grad_norm": 0.25186485052108765, + "learning_rate": 1.075288599113607e-05, + "loss": 0.0663, + "num_input_tokens_seen": 158143840, + "step": 129965 + }, + { + "epoch": 14.474885844748858, + "grad_norm": 1.9979126453399658, + "learning_rate": 1.0750889482711634e-05, + "loss": 0.1644, + "num_input_tokens_seen": 158149664, + "step": 129970 + }, + { + "epoch": 14.475442699632476, + "grad_norm": 0.16951365768909454, + "learning_rate": 1.0748893108881458e-05, + "loss": 0.1476, + "num_input_tokens_seen": 158155264, + "step": 129975 + }, + { + "epoch": 14.475999554516093, + "grad_norm": 0.001462520333006978, + "learning_rate": 1.074689686966441e-05, + "loss": 0.0053, + "num_input_tokens_seen": 158161888, + "step": 129980 + }, + { + "epoch": 14.476556409399711, + "grad_norm": 0.044161684811115265, + "learning_rate": 1.0744900765079344e-05, + "loss": 0.0165, + "num_input_tokens_seen": 158167712, + "step": 129985 + }, + { + "epoch": 14.477113264283329, + "grad_norm": 0.6387468576431274, + "learning_rate": 1.074290479514511e-05, + "loss": 0.0102, + "num_input_tokens_seen": 158173920, + "step": 129990 + }, + { + "epoch": 14.477670119166945, + "grad_norm": 0.1089540421962738, + "learning_rate": 1.0740908959880549e-05, + "loss": 0.0406, + "num_input_tokens_seen": 158179680, + "step": 129995 + }, + { + "epoch": 14.478226974050562, + "grad_norm": 0.09810284525156021, + "learning_rate": 1.0738913259304543e-05, + "loss": 0.1228, + "num_input_tokens_seen": 158186048, + "step": 130000 + }, + { + "epoch": 14.47878382893418, + "grad_norm": 0.07083515077829361, + "learning_rate": 1.0736917693435921e-05, + "loss": 0.0049, + "num_input_tokens_seen": 158192224, + "step": 130005 + }, + { + "epoch": 14.479340683817798, + "grad_norm": 0.2635327875614166, + "learning_rate": 1.0734922262293545e-05, + "loss": 0.048, + "num_input_tokens_seen": 158198048, + "step": 130010 + }, + { + "epoch": 14.479897538701415, + "grad_norm": 0.0002278041938552633, + "learning_rate": 1.0732926965896254e-05, + "loss": 0.046, + "num_input_tokens_seen": 158204416, + "step": 130015 + }, + { + "epoch": 14.480454393585031, + "grad_norm": 0.0019269147887825966, + "learning_rate": 1.073093180426289e-05, + "loss": 0.0111, + "num_input_tokens_seen": 158210848, + "step": 130020 + }, + { + "epoch": 14.481011248468649, + "grad_norm": 0.8624489307403564, + "learning_rate": 1.0728936777412313e-05, + "loss": 0.0263, + "num_input_tokens_seen": 158216960, + "step": 130025 + }, + { + "epoch": 14.481568103352267, + "grad_norm": 0.33194127678871155, + "learning_rate": 1.0726941885363364e-05, + "loss": 0.0309, + "num_input_tokens_seen": 158223136, + "step": 130030 + }, + { + "epoch": 14.482124958235884, + "grad_norm": 0.0005873451009392738, + "learning_rate": 1.0724947128134889e-05, + "loss": 0.0026, + "num_input_tokens_seen": 158229184, + "step": 130035 + }, + { + "epoch": 14.482681813119502, + "grad_norm": 0.000394965463783592, + "learning_rate": 1.072295250574571e-05, + "loss": 0.0032, + "num_input_tokens_seen": 158235264, + "step": 130040 + }, + { + "epoch": 14.483238668003118, + "grad_norm": 1.6182414293289185, + "learning_rate": 1.0720958018214694e-05, + "loss": 0.0682, + "num_input_tokens_seen": 158241152, + "step": 130045 + }, + { + "epoch": 14.483795522886735, + "grad_norm": 1.3845959901809692, + "learning_rate": 1.071896366556066e-05, + "loss": 0.0829, + "num_input_tokens_seen": 158247104, + "step": 130050 + }, + { + "epoch": 14.484352377770353, + "grad_norm": 0.020080193877220154, + "learning_rate": 1.0716969447802478e-05, + "loss": 0.0067, + "num_input_tokens_seen": 158253120, + "step": 130055 + }, + { + "epoch": 14.48490923265397, + "grad_norm": 0.7346470355987549, + "learning_rate": 1.071497536495894e-05, + "loss": 0.0264, + "num_input_tokens_seen": 158258624, + "step": 130060 + }, + { + "epoch": 14.485466087537588, + "grad_norm": 0.017133628949522972, + "learning_rate": 1.0712981417048917e-05, + "loss": 0.0328, + "num_input_tokens_seen": 158264544, + "step": 130065 + }, + { + "epoch": 14.486022942421204, + "grad_norm": 0.708060622215271, + "learning_rate": 1.0710987604091219e-05, + "loss": 0.0238, + "num_input_tokens_seen": 158270464, + "step": 130070 + }, + { + "epoch": 14.486579797304822, + "grad_norm": 0.08417562395334244, + "learning_rate": 1.0708993926104702e-05, + "loss": 0.0161, + "num_input_tokens_seen": 158276832, + "step": 130075 + }, + { + "epoch": 14.48713665218844, + "grad_norm": 0.002419083844870329, + "learning_rate": 1.0707000383108187e-05, + "loss": 0.0225, + "num_input_tokens_seen": 158283136, + "step": 130080 + }, + { + "epoch": 14.487693507072057, + "grad_norm": 0.14655964076519012, + "learning_rate": 1.0705006975120507e-05, + "loss": 0.0494, + "num_input_tokens_seen": 158288608, + "step": 130085 + }, + { + "epoch": 14.488250361955675, + "grad_norm": 1.6394363641738892, + "learning_rate": 1.0703013702160477e-05, + "loss": 0.0164, + "num_input_tokens_seen": 158294848, + "step": 130090 + }, + { + "epoch": 14.488807216839291, + "grad_norm": 0.1168164610862732, + "learning_rate": 1.0701020564246947e-05, + "loss": 0.0825, + "num_input_tokens_seen": 158300896, + "step": 130095 + }, + { + "epoch": 14.489364071722909, + "grad_norm": 1.238421082496643, + "learning_rate": 1.0699027561398734e-05, + "loss": 0.0632, + "num_input_tokens_seen": 158306976, + "step": 130100 + }, + { + "epoch": 14.489920926606526, + "grad_norm": 1.530014991760254, + "learning_rate": 1.0697034693634665e-05, + "loss": 0.031, + "num_input_tokens_seen": 158312864, + "step": 130105 + }, + { + "epoch": 14.490477781490144, + "grad_norm": 0.12401701509952545, + "learning_rate": 1.0695041960973552e-05, + "loss": 0.0193, + "num_input_tokens_seen": 158318976, + "step": 130110 + }, + { + "epoch": 14.491034636373762, + "grad_norm": 0.006612193305045366, + "learning_rate": 1.0693049363434238e-05, + "loss": 0.0063, + "num_input_tokens_seen": 158325216, + "step": 130115 + }, + { + "epoch": 14.491591491257378, + "grad_norm": 0.00044118324876762927, + "learning_rate": 1.0691056901035524e-05, + "loss": 0.1018, + "num_input_tokens_seen": 158331424, + "step": 130120 + }, + { + "epoch": 14.492148346140995, + "grad_norm": 0.3125056028366089, + "learning_rate": 1.0689064573796253e-05, + "loss": 0.1428, + "num_input_tokens_seen": 158337600, + "step": 130125 + }, + { + "epoch": 14.492705201024613, + "grad_norm": 0.00015841967251617461, + "learning_rate": 1.0687072381735233e-05, + "loss": 0.0004, + "num_input_tokens_seen": 158343936, + "step": 130130 + }, + { + "epoch": 14.49326205590823, + "grad_norm": 0.0019908801186829805, + "learning_rate": 1.068508032487128e-05, + "loss": 0.0011, + "num_input_tokens_seen": 158350560, + "step": 130135 + }, + { + "epoch": 14.493818910791848, + "grad_norm": 0.012104129418730736, + "learning_rate": 1.0683088403223207e-05, + "loss": 0.0746, + "num_input_tokens_seen": 158356256, + "step": 130140 + }, + { + "epoch": 14.494375765675464, + "grad_norm": 0.15131843090057373, + "learning_rate": 1.068109661680984e-05, + "loss": 0.0791, + "num_input_tokens_seen": 158362944, + "step": 130145 + }, + { + "epoch": 14.494932620559082, + "grad_norm": 0.6032304763793945, + "learning_rate": 1.067910496564999e-05, + "loss": 0.0518, + "num_input_tokens_seen": 158368736, + "step": 130150 + }, + { + "epoch": 14.4954894754427, + "grad_norm": 0.3410142660140991, + "learning_rate": 1.0677113449762462e-05, + "loss": 0.0327, + "num_input_tokens_seen": 158374304, + "step": 130155 + }, + { + "epoch": 14.496046330326317, + "grad_norm": 0.175667405128479, + "learning_rate": 1.0675122069166072e-05, + "loss": 0.0187, + "num_input_tokens_seen": 158380416, + "step": 130160 + }, + { + "epoch": 14.496603185209935, + "grad_norm": 9.824666631175205e-05, + "learning_rate": 1.0673130823879635e-05, + "loss": 0.0362, + "num_input_tokens_seen": 158386528, + "step": 130165 + }, + { + "epoch": 14.49716004009355, + "grad_norm": 5.797977924346924, + "learning_rate": 1.0671139713921947e-05, + "loss": 0.051, + "num_input_tokens_seen": 158392544, + "step": 130170 + }, + { + "epoch": 14.497716894977168, + "grad_norm": 2.249159336090088, + "learning_rate": 1.0669148739311843e-05, + "loss": 0.0614, + "num_input_tokens_seen": 158398880, + "step": 130175 + }, + { + "epoch": 14.498273749860786, + "grad_norm": 0.29487067461013794, + "learning_rate": 1.0667157900068092e-05, + "loss": 0.0423, + "num_input_tokens_seen": 158405344, + "step": 130180 + }, + { + "epoch": 14.498830604744404, + "grad_norm": 0.903873085975647, + "learning_rate": 1.0665167196209528e-05, + "loss": 0.0301, + "num_input_tokens_seen": 158411584, + "step": 130185 + }, + { + "epoch": 14.499387459628021, + "grad_norm": 0.2913953363895416, + "learning_rate": 1.0663176627754937e-05, + "loss": 0.0454, + "num_input_tokens_seen": 158417696, + "step": 130190 + }, + { + "epoch": 14.49994431451164, + "grad_norm": 0.24934709072113037, + "learning_rate": 1.0661186194723136e-05, + "loss": 0.0183, + "num_input_tokens_seen": 158423712, + "step": 130195 + }, + { + "epoch": 14.500501169395255, + "grad_norm": 0.2588560879230499, + "learning_rate": 1.065919589713292e-05, + "loss": 0.0122, + "num_input_tokens_seen": 158429984, + "step": 130200 + }, + { + "epoch": 14.501058024278873, + "grad_norm": 0.76567143201828, + "learning_rate": 1.0657205735003086e-05, + "loss": 0.0331, + "num_input_tokens_seen": 158435808, + "step": 130205 + }, + { + "epoch": 14.50161487916249, + "grad_norm": 0.0027094371616840363, + "learning_rate": 1.065521570835243e-05, + "loss": 0.1169, + "num_input_tokens_seen": 158441856, + "step": 130210 + }, + { + "epoch": 14.502171734046108, + "grad_norm": 0.00012829031038563699, + "learning_rate": 1.0653225817199764e-05, + "loss": 0.1109, + "num_input_tokens_seen": 158448192, + "step": 130215 + }, + { + "epoch": 14.502728588929726, + "grad_norm": 1.5263893604278564, + "learning_rate": 1.0651236061563872e-05, + "loss": 0.0435, + "num_input_tokens_seen": 158454272, + "step": 130220 + }, + { + "epoch": 14.503285443813342, + "grad_norm": 0.057785410434007645, + "learning_rate": 1.064924644146355e-05, + "loss": 0.028, + "num_input_tokens_seen": 158459840, + "step": 130225 + }, + { + "epoch": 14.50384229869696, + "grad_norm": 0.05944915488362312, + "learning_rate": 1.0647256956917586e-05, + "loss": 0.0482, + "num_input_tokens_seen": 158465440, + "step": 130230 + }, + { + "epoch": 14.504399153580577, + "grad_norm": 0.41065606474876404, + "learning_rate": 1.064526760794479e-05, + "loss": 0.0288, + "num_input_tokens_seen": 158471680, + "step": 130235 + }, + { + "epoch": 14.504956008464195, + "grad_norm": 0.00516210962086916, + "learning_rate": 1.0643278394563932e-05, + "loss": 0.0076, + "num_input_tokens_seen": 158478048, + "step": 130240 + }, + { + "epoch": 14.505512863347812, + "grad_norm": 0.13164176046848297, + "learning_rate": 1.0641289316793828e-05, + "loss": 0.0159, + "num_input_tokens_seen": 158484256, + "step": 130245 + }, + { + "epoch": 14.506069718231428, + "grad_norm": 0.16198091208934784, + "learning_rate": 1.0639300374653235e-05, + "loss": 0.146, + "num_input_tokens_seen": 158490240, + "step": 130250 + }, + { + "epoch": 14.506626573115046, + "grad_norm": 0.00038467743434011936, + "learning_rate": 1.0637311568160965e-05, + "loss": 0.0282, + "num_input_tokens_seen": 158496384, + "step": 130255 + }, + { + "epoch": 14.507183427998664, + "grad_norm": 0.0018215691670775414, + "learning_rate": 1.0635322897335784e-05, + "loss": 0.0087, + "num_input_tokens_seen": 158502592, + "step": 130260 + }, + { + "epoch": 14.507740282882281, + "grad_norm": 1.6376372575759888, + "learning_rate": 1.0633334362196496e-05, + "loss": 0.0781, + "num_input_tokens_seen": 158507328, + "step": 130265 + }, + { + "epoch": 14.508297137765899, + "grad_norm": 0.15288300812244415, + "learning_rate": 1.0631345962761874e-05, + "loss": 0.0177, + "num_input_tokens_seen": 158513536, + "step": 130270 + }, + { + "epoch": 14.508853992649515, + "grad_norm": 0.0016002263873815536, + "learning_rate": 1.0629357699050704e-05, + "loss": 0.0409, + "num_input_tokens_seen": 158519872, + "step": 130275 + }, + { + "epoch": 14.509410847533132, + "grad_norm": 3.0718636512756348, + "learning_rate": 1.0627369571081756e-05, + "loss": 0.0289, + "num_input_tokens_seen": 158526208, + "step": 130280 + }, + { + "epoch": 14.50996770241675, + "grad_norm": 0.03088429383933544, + "learning_rate": 1.0625381578873822e-05, + "loss": 0.0006, + "num_input_tokens_seen": 158532480, + "step": 130285 + }, + { + "epoch": 14.510524557300368, + "grad_norm": 0.6497355699539185, + "learning_rate": 1.062339372244568e-05, + "loss": 0.0143, + "num_input_tokens_seen": 158538432, + "step": 130290 + }, + { + "epoch": 14.511081412183986, + "grad_norm": 0.006736066658049822, + "learning_rate": 1.06214060018161e-05, + "loss": 0.0124, + "num_input_tokens_seen": 158544288, + "step": 130295 + }, + { + "epoch": 14.511638267067601, + "grad_norm": 4.459402561187744, + "learning_rate": 1.0619418417003852e-05, + "loss": 0.1322, + "num_input_tokens_seen": 158550208, + "step": 130300 + }, + { + "epoch": 14.512195121951219, + "grad_norm": 0.010981127619743347, + "learning_rate": 1.0617430968027727e-05, + "loss": 0.0048, + "num_input_tokens_seen": 158556128, + "step": 130305 + }, + { + "epoch": 14.512751976834837, + "grad_norm": 0.18435750901699066, + "learning_rate": 1.0615443654906482e-05, + "loss": 0.0365, + "num_input_tokens_seen": 158561824, + "step": 130310 + }, + { + "epoch": 14.513308831718454, + "grad_norm": 0.004790232516825199, + "learning_rate": 1.0613456477658912e-05, + "loss": 0.0093, + "num_input_tokens_seen": 158568064, + "step": 130315 + }, + { + "epoch": 14.513865686602072, + "grad_norm": 0.37581881880760193, + "learning_rate": 1.0611469436303756e-05, + "loss": 0.0064, + "num_input_tokens_seen": 158574144, + "step": 130320 + }, + { + "epoch": 14.514422541485688, + "grad_norm": 4.191047668457031, + "learning_rate": 1.0609482530859807e-05, + "loss": 0.052, + "num_input_tokens_seen": 158580224, + "step": 130325 + }, + { + "epoch": 14.514979396369306, + "grad_norm": 0.03344640135765076, + "learning_rate": 1.0607495761345818e-05, + "loss": 0.0043, + "num_input_tokens_seen": 158586400, + "step": 130330 + }, + { + "epoch": 14.515536251252923, + "grad_norm": 0.01991494931280613, + "learning_rate": 1.060550912778057e-05, + "loss": 0.003, + "num_input_tokens_seen": 158592096, + "step": 130335 + }, + { + "epoch": 14.516093106136541, + "grad_norm": 0.04122791439294815, + "learning_rate": 1.0603522630182822e-05, + "loss": 0.012, + "num_input_tokens_seen": 158598112, + "step": 130340 + }, + { + "epoch": 14.516649961020159, + "grad_norm": 0.0459199883043766, + "learning_rate": 1.0601536268571335e-05, + "loss": 0.0349, + "num_input_tokens_seen": 158604128, + "step": 130345 + }, + { + "epoch": 14.517206815903776, + "grad_norm": 0.21008290350437164, + "learning_rate": 1.0599550042964868e-05, + "loss": 0.0189, + "num_input_tokens_seen": 158610336, + "step": 130350 + }, + { + "epoch": 14.517763670787392, + "grad_norm": 0.4264695942401886, + "learning_rate": 1.0597563953382195e-05, + "loss": 0.0301, + "num_input_tokens_seen": 158616032, + "step": 130355 + }, + { + "epoch": 14.51832052567101, + "grad_norm": 0.04368600621819496, + "learning_rate": 1.0595577999842068e-05, + "loss": 0.0011, + "num_input_tokens_seen": 158622272, + "step": 130360 + }, + { + "epoch": 14.518877380554628, + "grad_norm": 0.12755799293518066, + "learning_rate": 1.0593592182363249e-05, + "loss": 0.0353, + "num_input_tokens_seen": 158628160, + "step": 130365 + }, + { + "epoch": 14.519434235438245, + "grad_norm": 0.02090897597372532, + "learning_rate": 1.0591606500964486e-05, + "loss": 0.0203, + "num_input_tokens_seen": 158634304, + "step": 130370 + }, + { + "epoch": 14.519991090321863, + "grad_norm": 0.001951249549165368, + "learning_rate": 1.0589620955664553e-05, + "loss": 0.0575, + "num_input_tokens_seen": 158639968, + "step": 130375 + }, + { + "epoch": 14.520547945205479, + "grad_norm": 0.03272411599755287, + "learning_rate": 1.0587635546482184e-05, + "loss": 0.0528, + "num_input_tokens_seen": 158645952, + "step": 130380 + }, + { + "epoch": 14.521104800089097, + "grad_norm": 0.0681147351861, + "learning_rate": 1.0585650273436155e-05, + "loss": 0.0214, + "num_input_tokens_seen": 158652160, + "step": 130385 + }, + { + "epoch": 14.521661654972714, + "grad_norm": 0.02503081038594246, + "learning_rate": 1.0583665136545207e-05, + "loss": 0.1133, + "num_input_tokens_seen": 158658240, + "step": 130390 + }, + { + "epoch": 14.522218509856332, + "grad_norm": 0.45251667499542236, + "learning_rate": 1.0581680135828092e-05, + "loss": 0.0097, + "num_input_tokens_seen": 158664608, + "step": 130395 + }, + { + "epoch": 14.52277536473995, + "grad_norm": 1.6320383548736572, + "learning_rate": 1.057969527130355e-05, + "loss": 0.1013, + "num_input_tokens_seen": 158670912, + "step": 130400 + }, + { + "epoch": 14.523332219623565, + "grad_norm": 0.49015265703201294, + "learning_rate": 1.0577710542990352e-05, + "loss": 0.0136, + "num_input_tokens_seen": 158677312, + "step": 130405 + }, + { + "epoch": 14.523889074507183, + "grad_norm": 1.4675065279006958, + "learning_rate": 1.0575725950907228e-05, + "loss": 0.0338, + "num_input_tokens_seen": 158682816, + "step": 130410 + }, + { + "epoch": 14.5244459293908, + "grad_norm": 1.8039321899414062, + "learning_rate": 1.0573741495072933e-05, + "loss": 0.0578, + "num_input_tokens_seen": 158689152, + "step": 130415 + }, + { + "epoch": 14.525002784274418, + "grad_norm": 0.003998489584773779, + "learning_rate": 1.0571757175506197e-05, + "loss": 0.1091, + "num_input_tokens_seen": 158695616, + "step": 130420 + }, + { + "epoch": 14.525559639158036, + "grad_norm": 1.9904147386550903, + "learning_rate": 1.0569772992225782e-05, + "loss": 0.0367, + "num_input_tokens_seen": 158701632, + "step": 130425 + }, + { + "epoch": 14.526116494041652, + "grad_norm": 0.0001387480297125876, + "learning_rate": 1.0567788945250423e-05, + "loss": 0.0141, + "num_input_tokens_seen": 158707968, + "step": 130430 + }, + { + "epoch": 14.52667334892527, + "grad_norm": 0.0006302242400124669, + "learning_rate": 1.0565805034598863e-05, + "loss": 0.0437, + "num_input_tokens_seen": 158713952, + "step": 130435 + }, + { + "epoch": 14.527230203808887, + "grad_norm": 0.00040927546797320247, + "learning_rate": 1.0563821260289836e-05, + "loss": 0.0038, + "num_input_tokens_seen": 158720320, + "step": 130440 + }, + { + "epoch": 14.527787058692505, + "grad_norm": 0.8150036334991455, + "learning_rate": 1.0561837622342075e-05, + "loss": 0.139, + "num_input_tokens_seen": 158725888, + "step": 130445 + }, + { + "epoch": 14.528343913576123, + "grad_norm": 0.15199872851371765, + "learning_rate": 1.0559854120774335e-05, + "loss": 0.0853, + "num_input_tokens_seen": 158731872, + "step": 130450 + }, + { + "epoch": 14.528900768459739, + "grad_norm": 0.02139490470290184, + "learning_rate": 1.0557870755605331e-05, + "loss": 0.0043, + "num_input_tokens_seen": 158738208, + "step": 130455 + }, + { + "epoch": 14.529457623343356, + "grad_norm": 0.00010357851715525612, + "learning_rate": 1.055588752685383e-05, + "loss": 0.1282, + "num_input_tokens_seen": 158744256, + "step": 130460 + }, + { + "epoch": 14.530014478226974, + "grad_norm": 0.03403521701693535, + "learning_rate": 1.0553904434538522e-05, + "loss": 0.0574, + "num_input_tokens_seen": 158750432, + "step": 130465 + }, + { + "epoch": 14.530571333110592, + "grad_norm": 0.004081450868397951, + "learning_rate": 1.0551921478678173e-05, + "loss": 0.0105, + "num_input_tokens_seen": 158756480, + "step": 130470 + }, + { + "epoch": 14.53112818799421, + "grad_norm": 0.2769090533256531, + "learning_rate": 1.0549938659291492e-05, + "loss": 0.0072, + "num_input_tokens_seen": 158762528, + "step": 130475 + }, + { + "epoch": 14.531685042877825, + "grad_norm": 0.0002118819538736716, + "learning_rate": 1.0547955976397223e-05, + "loss": 0.0644, + "num_input_tokens_seen": 158768736, + "step": 130480 + }, + { + "epoch": 14.532241897761443, + "grad_norm": 0.08189702033996582, + "learning_rate": 1.0545973430014092e-05, + "loss": 0.018, + "num_input_tokens_seen": 158774016, + "step": 130485 + }, + { + "epoch": 14.53279875264506, + "grad_norm": 0.009900943376123905, + "learning_rate": 1.054399102016082e-05, + "loss": 0.0031, + "num_input_tokens_seen": 158780128, + "step": 130490 + }, + { + "epoch": 14.533355607528678, + "grad_norm": 0.34134355187416077, + "learning_rate": 1.0542008746856128e-05, + "loss": 0.0814, + "num_input_tokens_seen": 158786240, + "step": 130495 + }, + { + "epoch": 14.533912462412296, + "grad_norm": 1.6978955268859863, + "learning_rate": 1.0540026610118755e-05, + "loss": 0.0883, + "num_input_tokens_seen": 158792320, + "step": 130500 + }, + { + "epoch": 14.534469317295912, + "grad_norm": 0.0042899781838059425, + "learning_rate": 1.0538044609967416e-05, + "loss": 0.046, + "num_input_tokens_seen": 158797696, + "step": 130505 + }, + { + "epoch": 14.53502617217953, + "grad_norm": 0.040925368666648865, + "learning_rate": 1.0536062746420832e-05, + "loss": 0.1569, + "num_input_tokens_seen": 158803712, + "step": 130510 + }, + { + "epoch": 14.535583027063147, + "grad_norm": 0.14638996124267578, + "learning_rate": 1.0534081019497714e-05, + "loss": 0.1152, + "num_input_tokens_seen": 158809760, + "step": 130515 + }, + { + "epoch": 14.536139881946765, + "grad_norm": 0.00017959889373742044, + "learning_rate": 1.0532099429216801e-05, + "loss": 0.0984, + "num_input_tokens_seen": 158816000, + "step": 130520 + }, + { + "epoch": 14.536696736830383, + "grad_norm": 0.08846397697925568, + "learning_rate": 1.053011797559679e-05, + "loss": 0.0121, + "num_input_tokens_seen": 158822016, + "step": 130525 + }, + { + "epoch": 14.537253591713998, + "grad_norm": 0.17562958598136902, + "learning_rate": 1.0528136658656418e-05, + "loss": 0.0045, + "num_input_tokens_seen": 158828384, + "step": 130530 + }, + { + "epoch": 14.537810446597616, + "grad_norm": 0.05650540813803673, + "learning_rate": 1.052615547841439e-05, + "loss": 0.0072, + "num_input_tokens_seen": 158834848, + "step": 130535 + }, + { + "epoch": 14.538367301481234, + "grad_norm": 0.004856482148170471, + "learning_rate": 1.052417443488942e-05, + "loss": 0.0016, + "num_input_tokens_seen": 158840800, + "step": 130540 + }, + { + "epoch": 14.538924156364851, + "grad_norm": 0.25934529304504395, + "learning_rate": 1.052219352810021e-05, + "loss": 0.1221, + "num_input_tokens_seen": 158847136, + "step": 130545 + }, + { + "epoch": 14.53948101124847, + "grad_norm": 0.6235777139663696, + "learning_rate": 1.052021275806549e-05, + "loss": 0.0125, + "num_input_tokens_seen": 158852960, + "step": 130550 + }, + { + "epoch": 14.540037866132085, + "grad_norm": 0.0009267134009860456, + "learning_rate": 1.0518232124803965e-05, + "loss": 0.0872, + "num_input_tokens_seen": 158858560, + "step": 130555 + }, + { + "epoch": 14.540594721015703, + "grad_norm": 1.1813513040542603, + "learning_rate": 1.0516251628334336e-05, + "loss": 0.0731, + "num_input_tokens_seen": 158864800, + "step": 130560 + }, + { + "epoch": 14.54115157589932, + "grad_norm": 0.03753281384706497, + "learning_rate": 1.0514271268675308e-05, + "loss": 0.003, + "num_input_tokens_seen": 158871104, + "step": 130565 + }, + { + "epoch": 14.541708430782938, + "grad_norm": 0.030231235548853874, + "learning_rate": 1.0512291045845602e-05, + "loss": 0.0397, + "num_input_tokens_seen": 158877376, + "step": 130570 + }, + { + "epoch": 14.542265285666556, + "grad_norm": 0.9504321813583374, + "learning_rate": 1.0510310959863906e-05, + "loss": 0.0111, + "num_input_tokens_seen": 158883680, + "step": 130575 + }, + { + "epoch": 14.542822140550173, + "grad_norm": 0.040924593806266785, + "learning_rate": 1.0508331010748949e-05, + "loss": 0.0521, + "num_input_tokens_seen": 158890112, + "step": 130580 + }, + { + "epoch": 14.54337899543379, + "grad_norm": 0.4111759066581726, + "learning_rate": 1.0506351198519399e-05, + "loss": 0.0382, + "num_input_tokens_seen": 158896256, + "step": 130585 + }, + { + "epoch": 14.543935850317407, + "grad_norm": 0.889223575592041, + "learning_rate": 1.0504371523193982e-05, + "loss": 0.0675, + "num_input_tokens_seen": 158902400, + "step": 130590 + }, + { + "epoch": 14.544492705201025, + "grad_norm": 0.012810073792934418, + "learning_rate": 1.0502391984791382e-05, + "loss": 0.0497, + "num_input_tokens_seen": 158907968, + "step": 130595 + }, + { + "epoch": 14.545049560084642, + "grad_norm": 1.277230143547058, + "learning_rate": 1.0500412583330313e-05, + "loss": 0.0158, + "num_input_tokens_seen": 158913920, + "step": 130600 + }, + { + "epoch": 14.54560641496826, + "grad_norm": 0.00427609495818615, + "learning_rate": 1.0498433318829462e-05, + "loss": 0.0115, + "num_input_tokens_seen": 158919968, + "step": 130605 + }, + { + "epoch": 14.546163269851876, + "grad_norm": 0.19213701784610748, + "learning_rate": 1.049645419130753e-05, + "loss": 0.0095, + "num_input_tokens_seen": 158926400, + "step": 130610 + }, + { + "epoch": 14.546720124735494, + "grad_norm": 0.0003095519496127963, + "learning_rate": 1.04944752007832e-05, + "loss": 0.1177, + "num_input_tokens_seen": 158932672, + "step": 130615 + }, + { + "epoch": 14.547276979619111, + "grad_norm": 0.01575707271695137, + "learning_rate": 1.049249634727518e-05, + "loss": 0.0878, + "num_input_tokens_seen": 158938560, + "step": 130620 + }, + { + "epoch": 14.547833834502729, + "grad_norm": 0.09602555632591248, + "learning_rate": 1.0490517630802155e-05, + "loss": 0.0027, + "num_input_tokens_seen": 158944640, + "step": 130625 + }, + { + "epoch": 14.548390689386347, + "grad_norm": 0.04323479160666466, + "learning_rate": 1.0488539051382817e-05, + "loss": 0.0684, + "num_input_tokens_seen": 158950816, + "step": 130630 + }, + { + "epoch": 14.548947544269963, + "grad_norm": 0.1610952615737915, + "learning_rate": 1.0486560609035845e-05, + "loss": 0.0811, + "num_input_tokens_seen": 158956864, + "step": 130635 + }, + { + "epoch": 14.54950439915358, + "grad_norm": 0.00472257798537612, + "learning_rate": 1.0484582303779944e-05, + "loss": 0.0076, + "num_input_tokens_seen": 158963136, + "step": 130640 + }, + { + "epoch": 14.550061254037198, + "grad_norm": 0.012212906964123249, + "learning_rate": 1.0482604135633783e-05, + "loss": 0.0216, + "num_input_tokens_seen": 158968960, + "step": 130645 + }, + { + "epoch": 14.550618108920816, + "grad_norm": 0.08480988442897797, + "learning_rate": 1.048062610461608e-05, + "loss": 0.0446, + "num_input_tokens_seen": 158974816, + "step": 130650 + }, + { + "epoch": 14.551174963804433, + "grad_norm": 0.018602799624204636, + "learning_rate": 1.0478648210745473e-05, + "loss": 0.0205, + "num_input_tokens_seen": 158981248, + "step": 130655 + }, + { + "epoch": 14.551731818688049, + "grad_norm": 0.0007274982053786516, + "learning_rate": 1.0476670454040677e-05, + "loss": 0.0144, + "num_input_tokens_seen": 158987104, + "step": 130660 + }, + { + "epoch": 14.552288673571667, + "grad_norm": 0.009372086264193058, + "learning_rate": 1.0474692834520358e-05, + "loss": 0.0015, + "num_input_tokens_seen": 158993216, + "step": 130665 + }, + { + "epoch": 14.552845528455284, + "grad_norm": 0.4384443163871765, + "learning_rate": 1.0472715352203211e-05, + "loss": 0.1315, + "num_input_tokens_seen": 158999232, + "step": 130670 + }, + { + "epoch": 14.553402383338902, + "grad_norm": 0.37848764657974243, + "learning_rate": 1.0470738007107906e-05, + "loss": 0.0388, + "num_input_tokens_seen": 159005088, + "step": 130675 + }, + { + "epoch": 14.55395923822252, + "grad_norm": 0.05878477916121483, + "learning_rate": 1.0468760799253122e-05, + "loss": 0.0032, + "num_input_tokens_seen": 159010720, + "step": 130680 + }, + { + "epoch": 14.554516093106137, + "grad_norm": 0.08647347241640091, + "learning_rate": 1.0466783728657526e-05, + "loss": 0.0048, + "num_input_tokens_seen": 159016832, + "step": 130685 + }, + { + "epoch": 14.555072947989753, + "grad_norm": 0.00542556494474411, + "learning_rate": 1.0464806795339807e-05, + "loss": 0.033, + "num_input_tokens_seen": 159022880, + "step": 130690 + }, + { + "epoch": 14.555629802873371, + "grad_norm": 0.003016931004822254, + "learning_rate": 1.0462829999318634e-05, + "loss": 0.0469, + "num_input_tokens_seen": 159028992, + "step": 130695 + }, + { + "epoch": 14.556186657756989, + "grad_norm": 0.35928672552108765, + "learning_rate": 1.0460853340612683e-05, + "loss": 0.0101, + "num_input_tokens_seen": 159034848, + "step": 130700 + }, + { + "epoch": 14.556743512640606, + "grad_norm": 0.48737937211990356, + "learning_rate": 1.0458876819240609e-05, + "loss": 0.0106, + "num_input_tokens_seen": 159040704, + "step": 130705 + }, + { + "epoch": 14.557300367524224, + "grad_norm": 0.09406240284442902, + "learning_rate": 1.0456900435221103e-05, + "loss": 0.005, + "num_input_tokens_seen": 159046944, + "step": 130710 + }, + { + "epoch": 14.55785722240784, + "grad_norm": 0.35886695981025696, + "learning_rate": 1.0454924188572815e-05, + "loss": 0.0118, + "num_input_tokens_seen": 159052704, + "step": 130715 + }, + { + "epoch": 14.558414077291458, + "grad_norm": 0.15269134938716888, + "learning_rate": 1.045294807931444e-05, + "loss": 0.0129, + "num_input_tokens_seen": 159058272, + "step": 130720 + }, + { + "epoch": 14.558970932175075, + "grad_norm": 0.03467937558889389, + "learning_rate": 1.0450972107464604e-05, + "loss": 0.0585, + "num_input_tokens_seen": 159064736, + "step": 130725 + }, + { + "epoch": 14.559527787058693, + "grad_norm": 0.0007314037648029625, + "learning_rate": 1.0448996273042006e-05, + "loss": 0.0745, + "num_input_tokens_seen": 159071072, + "step": 130730 + }, + { + "epoch": 14.56008464194231, + "grad_norm": 0.022795941680669785, + "learning_rate": 1.0447020576065286e-05, + "loss": 0.0675, + "num_input_tokens_seen": 159076064, + "step": 130735 + }, + { + "epoch": 14.560641496825927, + "grad_norm": 0.043562039732933044, + "learning_rate": 1.0445045016553123e-05, + "loss": 0.0931, + "num_input_tokens_seen": 159082336, + "step": 130740 + }, + { + "epoch": 14.561198351709544, + "grad_norm": 0.1252000331878662, + "learning_rate": 1.0443069594524174e-05, + "loss": 0.0036, + "num_input_tokens_seen": 159089088, + "step": 130745 + }, + { + "epoch": 14.561755206593162, + "grad_norm": 0.0005820380174554884, + "learning_rate": 1.0441094309997094e-05, + "loss": 0.0034, + "num_input_tokens_seen": 159095104, + "step": 130750 + }, + { + "epoch": 14.56231206147678, + "grad_norm": 0.18554189801216125, + "learning_rate": 1.0439119162990535e-05, + "loss": 0.0043, + "num_input_tokens_seen": 159101216, + "step": 130755 + }, + { + "epoch": 14.562868916360397, + "grad_norm": 0.08119309693574905, + "learning_rate": 1.0437144153523167e-05, + "loss": 0.0232, + "num_input_tokens_seen": 159106912, + "step": 130760 + }, + { + "epoch": 14.563425771244013, + "grad_norm": 4.30470085144043, + "learning_rate": 1.0435169281613644e-05, + "loss": 0.1292, + "num_input_tokens_seen": 159112992, + "step": 130765 + }, + { + "epoch": 14.56398262612763, + "grad_norm": 1.0764960050582886, + "learning_rate": 1.0433194547280617e-05, + "loss": 0.0728, + "num_input_tokens_seen": 159118944, + "step": 130770 + }, + { + "epoch": 14.564539481011249, + "grad_norm": 1.81722092628479, + "learning_rate": 1.0431219950542726e-05, + "loss": 0.0466, + "num_input_tokens_seen": 159124288, + "step": 130775 + }, + { + "epoch": 14.565096335894866, + "grad_norm": 0.08390668034553528, + "learning_rate": 1.0429245491418646e-05, + "loss": 0.1378, + "num_input_tokens_seen": 159130528, + "step": 130780 + }, + { + "epoch": 14.565653190778484, + "grad_norm": 0.024400444701313972, + "learning_rate": 1.0427271169927005e-05, + "loss": 0.0398, + "num_input_tokens_seen": 159136576, + "step": 130785 + }, + { + "epoch": 14.5662100456621, + "grad_norm": 0.32003721594810486, + "learning_rate": 1.0425296986086474e-05, + "loss": 0.0074, + "num_input_tokens_seen": 159142912, + "step": 130790 + }, + { + "epoch": 14.566766900545717, + "grad_norm": 0.011187911033630371, + "learning_rate": 1.042332293991569e-05, + "loss": 0.0152, + "num_input_tokens_seen": 159149216, + "step": 130795 + }, + { + "epoch": 14.567323755429335, + "grad_norm": 0.04127996414899826, + "learning_rate": 1.0421349031433298e-05, + "loss": 0.0195, + "num_input_tokens_seen": 159155168, + "step": 130800 + }, + { + "epoch": 14.567880610312953, + "grad_norm": 0.020366230979561806, + "learning_rate": 1.0419375260657937e-05, + "loss": 0.0744, + "num_input_tokens_seen": 159161216, + "step": 130805 + }, + { + "epoch": 14.56843746519657, + "grad_norm": 0.0008862003451213241, + "learning_rate": 1.0417401627608267e-05, + "loss": 0.0478, + "num_input_tokens_seen": 159166976, + "step": 130810 + }, + { + "epoch": 14.568994320080186, + "grad_norm": 0.020612945780158043, + "learning_rate": 1.0415428132302923e-05, + "loss": 0.0017, + "num_input_tokens_seen": 159173024, + "step": 130815 + }, + { + "epoch": 14.569551174963804, + "grad_norm": 0.2939637005329132, + "learning_rate": 1.0413454774760544e-05, + "loss": 0.0046, + "num_input_tokens_seen": 159179424, + "step": 130820 + }, + { + "epoch": 14.570108029847422, + "grad_norm": 0.0013464516960084438, + "learning_rate": 1.0411481554999763e-05, + "loss": 0.0561, + "num_input_tokens_seen": 159185504, + "step": 130825 + }, + { + "epoch": 14.57066488473104, + "grad_norm": 0.0029988333117216825, + "learning_rate": 1.0409508473039233e-05, + "loss": 0.0048, + "num_input_tokens_seen": 159191808, + "step": 130830 + }, + { + "epoch": 14.571221739614657, + "grad_norm": 0.01712113246321678, + "learning_rate": 1.0407535528897588e-05, + "loss": 0.007, + "num_input_tokens_seen": 159197280, + "step": 130835 + }, + { + "epoch": 14.571778594498273, + "grad_norm": 0.01959000527858734, + "learning_rate": 1.0405562722593462e-05, + "loss": 0.0054, + "num_input_tokens_seen": 159203392, + "step": 130840 + }, + { + "epoch": 14.57233544938189, + "grad_norm": 0.005946085788309574, + "learning_rate": 1.0403590054145488e-05, + "loss": 0.0097, + "num_input_tokens_seen": 159209760, + "step": 130845 + }, + { + "epoch": 14.572892304265508, + "grad_norm": 0.4983347952365875, + "learning_rate": 1.040161752357229e-05, + "loss": 0.0078, + "num_input_tokens_seen": 159215936, + "step": 130850 + }, + { + "epoch": 14.573449159149126, + "grad_norm": 0.002914290875196457, + "learning_rate": 1.039964513089252e-05, + "loss": 0.1014, + "num_input_tokens_seen": 159221792, + "step": 130855 + }, + { + "epoch": 14.574006014032744, + "grad_norm": 0.1737855225801468, + "learning_rate": 1.0397672876124792e-05, + "loss": 0.055, + "num_input_tokens_seen": 159227936, + "step": 130860 + }, + { + "epoch": 14.57456286891636, + "grad_norm": 0.9578994512557983, + "learning_rate": 1.039570075928776e-05, + "loss": 0.0359, + "num_input_tokens_seen": 159234048, + "step": 130865 + }, + { + "epoch": 14.575119723799977, + "grad_norm": 0.11328157782554626, + "learning_rate": 1.039372878040002e-05, + "loss": 0.0065, + "num_input_tokens_seen": 159239936, + "step": 130870 + }, + { + "epoch": 14.575676578683595, + "grad_norm": 0.014248805120587349, + "learning_rate": 1.0391756939480218e-05, + "loss": 0.019, + "num_input_tokens_seen": 159246208, + "step": 130875 + }, + { + "epoch": 14.576233433567213, + "grad_norm": 2.4100940227508545, + "learning_rate": 1.038978523654697e-05, + "loss": 0.0568, + "num_input_tokens_seen": 159252064, + "step": 130880 + }, + { + "epoch": 14.57679028845083, + "grad_norm": 0.0018820571713149548, + "learning_rate": 1.0387813671618912e-05, + "loss": 0.0075, + "num_input_tokens_seen": 159258144, + "step": 130885 + }, + { + "epoch": 14.577347143334446, + "grad_norm": 0.23104019463062286, + "learning_rate": 1.0385842244714664e-05, + "loss": 0.0324, + "num_input_tokens_seen": 159264192, + "step": 130890 + }, + { + "epoch": 14.577903998218064, + "grad_norm": 0.034404028207063675, + "learning_rate": 1.0383870955852842e-05, + "loss": 0.0317, + "num_input_tokens_seen": 159270656, + "step": 130895 + }, + { + "epoch": 14.578460853101681, + "grad_norm": 0.026255814358592033, + "learning_rate": 1.0381899805052062e-05, + "loss": 0.0053, + "num_input_tokens_seen": 159276608, + "step": 130900 + }, + { + "epoch": 14.5790177079853, + "grad_norm": 1.3930373191833496, + "learning_rate": 1.0379928792330958e-05, + "loss": 0.0769, + "num_input_tokens_seen": 159282464, + "step": 130905 + }, + { + "epoch": 14.579574562868917, + "grad_norm": 7.615258073201403e-05, + "learning_rate": 1.037795791770814e-05, + "loss": 0.0032, + "num_input_tokens_seen": 159288800, + "step": 130910 + }, + { + "epoch": 14.580131417752535, + "grad_norm": 1.2013659477233887, + "learning_rate": 1.0375987181202226e-05, + "loss": 0.0233, + "num_input_tokens_seen": 159294784, + "step": 130915 + }, + { + "epoch": 14.58068827263615, + "grad_norm": 0.06041803956031799, + "learning_rate": 1.0374016582831819e-05, + "loss": 0.0233, + "num_input_tokens_seen": 159301056, + "step": 130920 + }, + { + "epoch": 14.581245127519768, + "grad_norm": 0.21510455012321472, + "learning_rate": 1.0372046122615553e-05, + "loss": 0.0304, + "num_input_tokens_seen": 159306880, + "step": 130925 + }, + { + "epoch": 14.581801982403386, + "grad_norm": 1.8422234058380127, + "learning_rate": 1.037007580057202e-05, + "loss": 0.0597, + "num_input_tokens_seen": 159313120, + "step": 130930 + }, + { + "epoch": 14.582358837287003, + "grad_norm": 0.005559703800827265, + "learning_rate": 1.0368105616719856e-05, + "loss": 0.0628, + "num_input_tokens_seen": 159319104, + "step": 130935 + }, + { + "epoch": 14.582915692170621, + "grad_norm": 0.006605896633118391, + "learning_rate": 1.0366135571077654e-05, + "loss": 0.0177, + "num_input_tokens_seen": 159325440, + "step": 130940 + }, + { + "epoch": 14.583472547054237, + "grad_norm": 0.00023102971317712218, + "learning_rate": 1.0364165663664027e-05, + "loss": 0.0164, + "num_input_tokens_seen": 159331808, + "step": 130945 + }, + { + "epoch": 14.584029401937855, + "grad_norm": 0.5039278864860535, + "learning_rate": 1.0362195894497572e-05, + "loss": 0.0679, + "num_input_tokens_seen": 159337920, + "step": 130950 + }, + { + "epoch": 14.584586256821472, + "grad_norm": 0.005043745040893555, + "learning_rate": 1.0360226263596915e-05, + "loss": 0.019, + "num_input_tokens_seen": 159343616, + "step": 130955 + }, + { + "epoch": 14.58514311170509, + "grad_norm": 0.029161490499973297, + "learning_rate": 1.0358256770980649e-05, + "loss": 0.017, + "num_input_tokens_seen": 159349312, + "step": 130960 + }, + { + "epoch": 14.585699966588708, + "grad_norm": 0.04837143048644066, + "learning_rate": 1.0356287416667376e-05, + "loss": 0.0083, + "num_input_tokens_seen": 159355616, + "step": 130965 + }, + { + "epoch": 14.586256821472324, + "grad_norm": 0.55295729637146, + "learning_rate": 1.0354318200675694e-05, + "loss": 0.0701, + "num_input_tokens_seen": 159361664, + "step": 130970 + }, + { + "epoch": 14.586813676355941, + "grad_norm": 0.17605943977832794, + "learning_rate": 1.0352349123024222e-05, + "loss": 0.0238, + "num_input_tokens_seen": 159367968, + "step": 130975 + }, + { + "epoch": 14.587370531239559, + "grad_norm": 0.07916086912155151, + "learning_rate": 1.0350380183731535e-05, + "loss": 0.0328, + "num_input_tokens_seen": 159373984, + "step": 130980 + }, + { + "epoch": 14.587927386123177, + "grad_norm": 0.5726343989372253, + "learning_rate": 1.0348411382816264e-05, + "loss": 0.0405, + "num_input_tokens_seen": 159380000, + "step": 130985 + }, + { + "epoch": 14.588484241006794, + "grad_norm": 0.09337896853685379, + "learning_rate": 1.0346442720296967e-05, + "loss": 0.0024, + "num_input_tokens_seen": 159386240, + "step": 130990 + }, + { + "epoch": 14.58904109589041, + "grad_norm": 0.6347705125808716, + "learning_rate": 1.034447419619227e-05, + "loss": 0.0155, + "num_input_tokens_seen": 159392288, + "step": 130995 + }, + { + "epoch": 14.589597950774028, + "grad_norm": 0.8121805191040039, + "learning_rate": 1.0342505810520745e-05, + "loss": 0.0693, + "num_input_tokens_seen": 159398336, + "step": 131000 + }, + { + "epoch": 14.590154805657646, + "grad_norm": 0.0035710232332348824, + "learning_rate": 1.0340537563301006e-05, + "loss": 0.0393, + "num_input_tokens_seen": 159404352, + "step": 131005 + }, + { + "epoch": 14.590711660541263, + "grad_norm": 0.5263100266456604, + "learning_rate": 1.0338569454551634e-05, + "loss": 0.0215, + "num_input_tokens_seen": 159410816, + "step": 131010 + }, + { + "epoch": 14.591268515424881, + "grad_norm": 0.0005034056957811117, + "learning_rate": 1.033660148429122e-05, + "loss": 0.0457, + "num_input_tokens_seen": 159416640, + "step": 131015 + }, + { + "epoch": 14.591825370308497, + "grad_norm": 0.02266196720302105, + "learning_rate": 1.0334633652538344e-05, + "loss": 0.0108, + "num_input_tokens_seen": 159422816, + "step": 131020 + }, + { + "epoch": 14.592382225192114, + "grad_norm": 0.0003415691026020795, + "learning_rate": 1.0332665959311612e-05, + "loss": 0.0073, + "num_input_tokens_seen": 159428416, + "step": 131025 + }, + { + "epoch": 14.592939080075732, + "grad_norm": 0.0002972586080431938, + "learning_rate": 1.0330698404629601e-05, + "loss": 0.0177, + "num_input_tokens_seen": 159434880, + "step": 131030 + }, + { + "epoch": 14.59349593495935, + "grad_norm": 0.0447901114821434, + "learning_rate": 1.0328730988510899e-05, + "loss": 0.0066, + "num_input_tokens_seen": 159441024, + "step": 131035 + }, + { + "epoch": 14.594052789842967, + "grad_norm": 1.9902443885803223, + "learning_rate": 1.0326763710974077e-05, + "loss": 0.0765, + "num_input_tokens_seen": 159446752, + "step": 131040 + }, + { + "epoch": 14.594609644726585, + "grad_norm": 0.5440115928649902, + "learning_rate": 1.0324796572037735e-05, + "loss": 0.0138, + "num_input_tokens_seen": 159453216, + "step": 131045 + }, + { + "epoch": 14.595166499610201, + "grad_norm": 0.2031620591878891, + "learning_rate": 1.0322829571720437e-05, + "loss": 0.0636, + "num_input_tokens_seen": 159459200, + "step": 131050 + }, + { + "epoch": 14.595723354493819, + "grad_norm": 0.0018934005638584495, + "learning_rate": 1.0320862710040797e-05, + "loss": 0.0668, + "num_input_tokens_seen": 159465568, + "step": 131055 + }, + { + "epoch": 14.596280209377436, + "grad_norm": 0.2575575113296509, + "learning_rate": 1.0318895987017346e-05, + "loss": 0.0056, + "num_input_tokens_seen": 159471520, + "step": 131060 + }, + { + "epoch": 14.596837064261054, + "grad_norm": 1.3240244388580322, + "learning_rate": 1.0316929402668693e-05, + "loss": 0.0704, + "num_input_tokens_seen": 159477728, + "step": 131065 + }, + { + "epoch": 14.597393919144672, + "grad_norm": 0.010250338353216648, + "learning_rate": 1.0314962957013399e-05, + "loss": 0.0143, + "num_input_tokens_seen": 159483968, + "step": 131070 + }, + { + "epoch": 14.597950774028288, + "grad_norm": 0.0006876615807414055, + "learning_rate": 1.0312996650070055e-05, + "loss": 0.0623, + "num_input_tokens_seen": 159489984, + "step": 131075 + }, + { + "epoch": 14.598507628911905, + "grad_norm": 0.19934196770191193, + "learning_rate": 1.0311030481857224e-05, + "loss": 0.0969, + "num_input_tokens_seen": 159496160, + "step": 131080 + }, + { + "epoch": 14.599064483795523, + "grad_norm": 0.010155596770346165, + "learning_rate": 1.0309064452393478e-05, + "loss": 0.0177, + "num_input_tokens_seen": 159502432, + "step": 131085 + }, + { + "epoch": 14.59962133867914, + "grad_norm": 0.006350996904075146, + "learning_rate": 1.030709856169738e-05, + "loss": 0.0723, + "num_input_tokens_seen": 159508864, + "step": 131090 + }, + { + "epoch": 14.600178193562758, + "grad_norm": 1.2790178060531616, + "learning_rate": 1.0305132809787516e-05, + "loss": 0.0861, + "num_input_tokens_seen": 159514688, + "step": 131095 + }, + { + "epoch": 14.600735048446374, + "grad_norm": 0.966774582862854, + "learning_rate": 1.0303167196682448e-05, + "loss": 0.1192, + "num_input_tokens_seen": 159520928, + "step": 131100 + }, + { + "epoch": 14.601291903329992, + "grad_norm": 0.005362056195735931, + "learning_rate": 1.0301201722400738e-05, + "loss": 0.0115, + "num_input_tokens_seen": 159526976, + "step": 131105 + }, + { + "epoch": 14.60184875821361, + "grad_norm": 0.14748334884643555, + "learning_rate": 1.0299236386960947e-05, + "loss": 0.0932, + "num_input_tokens_seen": 159533248, + "step": 131110 + }, + { + "epoch": 14.602405613097227, + "grad_norm": 0.19022583961486816, + "learning_rate": 1.0297271190381656e-05, + "loss": 0.0482, + "num_input_tokens_seen": 159539328, + "step": 131115 + }, + { + "epoch": 14.602962467980845, + "grad_norm": 0.45734405517578125, + "learning_rate": 1.0295306132681407e-05, + "loss": 0.0566, + "num_input_tokens_seen": 159544928, + "step": 131120 + }, + { + "epoch": 14.60351932286446, + "grad_norm": 0.057052131742239, + "learning_rate": 1.0293341213878783e-05, + "loss": 0.0347, + "num_input_tokens_seen": 159550816, + "step": 131125 + }, + { + "epoch": 14.604076177748079, + "grad_norm": 0.0006217230693437159, + "learning_rate": 1.0291376433992334e-05, + "loss": 0.0747, + "num_input_tokens_seen": 159557184, + "step": 131130 + }, + { + "epoch": 14.604633032631696, + "grad_norm": 0.00011033567716367543, + "learning_rate": 1.0289411793040618e-05, + "loss": 0.0652, + "num_input_tokens_seen": 159563488, + "step": 131135 + }, + { + "epoch": 14.605189887515314, + "grad_norm": 0.13638585805892944, + "learning_rate": 1.0287447291042185e-05, + "loss": 0.0636, + "num_input_tokens_seen": 159569568, + "step": 131140 + }, + { + "epoch": 14.605746742398932, + "grad_norm": 0.8462285995483398, + "learning_rate": 1.028548292801561e-05, + "loss": 0.0293, + "num_input_tokens_seen": 159575808, + "step": 131145 + }, + { + "epoch": 14.606303597282547, + "grad_norm": 0.8620046377182007, + "learning_rate": 1.0283518703979437e-05, + "loss": 0.0401, + "num_input_tokens_seen": 159582048, + "step": 131150 + }, + { + "epoch": 14.606860452166165, + "grad_norm": 0.11882087588310242, + "learning_rate": 1.0281554618952222e-05, + "loss": 0.0128, + "num_input_tokens_seen": 159588064, + "step": 131155 + }, + { + "epoch": 14.607417307049783, + "grad_norm": 0.00025123136583715677, + "learning_rate": 1.0279590672952504e-05, + "loss": 0.0691, + "num_input_tokens_seen": 159594336, + "step": 131160 + }, + { + "epoch": 14.6079741619334, + "grad_norm": 0.1926209032535553, + "learning_rate": 1.0277626865998858e-05, + "loss": 0.17, + "num_input_tokens_seen": 159600672, + "step": 131165 + }, + { + "epoch": 14.608531016817018, + "grad_norm": 0.3566209077835083, + "learning_rate": 1.027566319810982e-05, + "loss": 0.0082, + "num_input_tokens_seen": 159606816, + "step": 131170 + }, + { + "epoch": 14.609087871700634, + "grad_norm": 0.0003613471635617316, + "learning_rate": 1.0273699669303937e-05, + "loss": 0.0022, + "num_input_tokens_seen": 159612992, + "step": 131175 + }, + { + "epoch": 14.609644726584252, + "grad_norm": 1.6065723896026611, + "learning_rate": 1.0271736279599755e-05, + "loss": 0.0599, + "num_input_tokens_seen": 159619232, + "step": 131180 + }, + { + "epoch": 14.61020158146787, + "grad_norm": 0.034566815942525864, + "learning_rate": 1.0269773029015831e-05, + "loss": 0.0179, + "num_input_tokens_seen": 159624864, + "step": 131185 + }, + { + "epoch": 14.610758436351487, + "grad_norm": 0.6582068800926208, + "learning_rate": 1.0267809917570691e-05, + "loss": 0.0207, + "num_input_tokens_seen": 159630848, + "step": 131190 + }, + { + "epoch": 14.611315291235105, + "grad_norm": 0.05582863464951515, + "learning_rate": 1.0265846945282903e-05, + "loss": 0.013, + "num_input_tokens_seen": 159636608, + "step": 131195 + }, + { + "epoch": 14.61187214611872, + "grad_norm": 1.1053895950317383, + "learning_rate": 1.0263884112170994e-05, + "loss": 0.0241, + "num_input_tokens_seen": 159642816, + "step": 131200 + }, + { + "epoch": 14.612429001002338, + "grad_norm": 0.0008922889828681946, + "learning_rate": 1.0261921418253504e-05, + "loss": 0.0987, + "num_input_tokens_seen": 159648896, + "step": 131205 + }, + { + "epoch": 14.612985855885956, + "grad_norm": 0.673460066318512, + "learning_rate": 1.0259958863548965e-05, + "loss": 0.0815, + "num_input_tokens_seen": 159655264, + "step": 131210 + }, + { + "epoch": 14.613542710769574, + "grad_norm": 1.1743183135986328, + "learning_rate": 1.0257996448075938e-05, + "loss": 0.0318, + "num_input_tokens_seen": 159661536, + "step": 131215 + }, + { + "epoch": 14.614099565653191, + "grad_norm": 0.01466745138168335, + "learning_rate": 1.025603417185294e-05, + "loss": 0.0384, + "num_input_tokens_seen": 159667136, + "step": 131220 + }, + { + "epoch": 14.614656420536807, + "grad_norm": 0.24727171659469604, + "learning_rate": 1.0254072034898515e-05, + "loss": 0.042, + "num_input_tokens_seen": 159673440, + "step": 131225 + }, + { + "epoch": 14.615213275420425, + "grad_norm": 0.20101742446422577, + "learning_rate": 1.0252110037231183e-05, + "loss": 0.0567, + "num_input_tokens_seen": 159679392, + "step": 131230 + }, + { + "epoch": 14.615770130304043, + "grad_norm": 0.037787675857543945, + "learning_rate": 1.0250148178869498e-05, + "loss": 0.0278, + "num_input_tokens_seen": 159685472, + "step": 131235 + }, + { + "epoch": 14.61632698518766, + "grad_norm": 0.004874738864600658, + "learning_rate": 1.024818645983197e-05, + "loss": 0.1431, + "num_input_tokens_seen": 159691712, + "step": 131240 + }, + { + "epoch": 14.616883840071278, + "grad_norm": 0.007834416814148426, + "learning_rate": 1.0246224880137162e-05, + "loss": 0.0007, + "num_input_tokens_seen": 159697824, + "step": 131245 + }, + { + "epoch": 14.617440694954894, + "grad_norm": 0.5424598455429077, + "learning_rate": 1.0244263439803567e-05, + "loss": 0.0291, + "num_input_tokens_seen": 159703904, + "step": 131250 + }, + { + "epoch": 14.617997549838512, + "grad_norm": 0.0600578635931015, + "learning_rate": 1.0242302138849719e-05, + "loss": 0.0224, + "num_input_tokens_seen": 159710016, + "step": 131255 + }, + { + "epoch": 14.61855440472213, + "grad_norm": 1.69730544090271, + "learning_rate": 1.024034097729416e-05, + "loss": 0.1023, + "num_input_tokens_seen": 159716320, + "step": 131260 + }, + { + "epoch": 14.619111259605747, + "grad_norm": 0.0044190892949700356, + "learning_rate": 1.0238379955155394e-05, + "loss": 0.012, + "num_input_tokens_seen": 159722208, + "step": 131265 + }, + { + "epoch": 14.619668114489365, + "grad_norm": 0.13982585072517395, + "learning_rate": 1.0236419072451977e-05, + "loss": 0.0786, + "num_input_tokens_seen": 159728320, + "step": 131270 + }, + { + "epoch": 14.620224969372982, + "grad_norm": 3.1963112354278564, + "learning_rate": 1.0234458329202393e-05, + "loss": 0.1864, + "num_input_tokens_seen": 159734400, + "step": 131275 + }, + { + "epoch": 14.620781824256598, + "grad_norm": 0.6878770589828491, + "learning_rate": 1.0232497725425188e-05, + "loss": 0.0262, + "num_input_tokens_seen": 159740672, + "step": 131280 + }, + { + "epoch": 14.621338679140216, + "grad_norm": 0.9168602228164673, + "learning_rate": 1.0230537261138864e-05, + "loss": 0.0124, + "num_input_tokens_seen": 159747008, + "step": 131285 + }, + { + "epoch": 14.621895534023833, + "grad_norm": 0.11758873611688614, + "learning_rate": 1.0228576936361958e-05, + "loss": 0.0235, + "num_input_tokens_seen": 159753280, + "step": 131290 + }, + { + "epoch": 14.622452388907451, + "grad_norm": 0.00012370447802823037, + "learning_rate": 1.0226616751112978e-05, + "loss": 0.0037, + "num_input_tokens_seen": 159758880, + "step": 131295 + }, + { + "epoch": 14.623009243791069, + "grad_norm": 0.004154890775680542, + "learning_rate": 1.0224656705410438e-05, + "loss": 0.0319, + "num_input_tokens_seen": 159765056, + "step": 131300 + }, + { + "epoch": 14.623566098674685, + "grad_norm": 0.36997637152671814, + "learning_rate": 1.0222696799272844e-05, + "loss": 0.01, + "num_input_tokens_seen": 159771520, + "step": 131305 + }, + { + "epoch": 14.624122953558302, + "grad_norm": 2.2432520389556885, + "learning_rate": 1.0220737032718728e-05, + "loss": 0.0337, + "num_input_tokens_seen": 159777376, + "step": 131310 + }, + { + "epoch": 14.62467980844192, + "grad_norm": 0.0007135967025533319, + "learning_rate": 1.0218777405766591e-05, + "loss": 0.0008, + "num_input_tokens_seen": 159783424, + "step": 131315 + }, + { + "epoch": 14.625236663325538, + "grad_norm": 0.0002788606216199696, + "learning_rate": 1.0216817918434945e-05, + "loss": 0.0207, + "num_input_tokens_seen": 159789568, + "step": 131320 + }, + { + "epoch": 14.625793518209155, + "grad_norm": 0.15494944155216217, + "learning_rate": 1.0214858570742287e-05, + "loss": 0.0477, + "num_input_tokens_seen": 159795776, + "step": 131325 + }, + { + "epoch": 14.626350373092771, + "grad_norm": 0.034108590334653854, + "learning_rate": 1.0212899362707146e-05, + "loss": 0.0098, + "num_input_tokens_seen": 159802368, + "step": 131330 + }, + { + "epoch": 14.626907227976389, + "grad_norm": 0.056512266397476196, + "learning_rate": 1.021094029434801e-05, + "loss": 0.0242, + "num_input_tokens_seen": 159808672, + "step": 131335 + }, + { + "epoch": 14.627464082860007, + "grad_norm": 0.052637699991464615, + "learning_rate": 1.0208981365683398e-05, + "loss": 0.1138, + "num_input_tokens_seen": 159814944, + "step": 131340 + }, + { + "epoch": 14.628020937743624, + "grad_norm": 0.5388694405555725, + "learning_rate": 1.0207022576731809e-05, + "loss": 0.0081, + "num_input_tokens_seen": 159820320, + "step": 131345 + }, + { + "epoch": 14.628577792627242, + "grad_norm": 0.001820875215344131, + "learning_rate": 1.0205063927511743e-05, + "loss": 0.028, + "num_input_tokens_seen": 159826336, + "step": 131350 + }, + { + "epoch": 14.629134647510858, + "grad_norm": 0.00017085118452087045, + "learning_rate": 1.0203105418041692e-05, + "loss": 0.0291, + "num_input_tokens_seen": 159832608, + "step": 131355 + }, + { + "epoch": 14.629691502394476, + "grad_norm": 1.5040903091430664, + "learning_rate": 1.0201147048340177e-05, + "loss": 0.1008, + "num_input_tokens_seen": 159837984, + "step": 131360 + }, + { + "epoch": 14.630248357278093, + "grad_norm": 0.7697661519050598, + "learning_rate": 1.0199188818425681e-05, + "loss": 0.0279, + "num_input_tokens_seen": 159844384, + "step": 131365 + }, + { + "epoch": 14.630805212161711, + "grad_norm": 0.1431443989276886, + "learning_rate": 1.0197230728316706e-05, + "loss": 0.0765, + "num_input_tokens_seen": 159850720, + "step": 131370 + }, + { + "epoch": 14.631362067045329, + "grad_norm": 0.0029317124281078577, + "learning_rate": 1.019527277803174e-05, + "loss": 0.0715, + "num_input_tokens_seen": 159856928, + "step": 131375 + }, + { + "epoch": 14.631918921928944, + "grad_norm": 0.0009170607081614435, + "learning_rate": 1.0193314967589291e-05, + "loss": 0.0059, + "num_input_tokens_seen": 159863136, + "step": 131380 + }, + { + "epoch": 14.632475776812562, + "grad_norm": 0.00015706077101640403, + "learning_rate": 1.0191357297007837e-05, + "loss": 0.0053, + "num_input_tokens_seen": 159869472, + "step": 131385 + }, + { + "epoch": 14.63303263169618, + "grad_norm": 0.02292890101671219, + "learning_rate": 1.0189399766305893e-05, + "loss": 0.0203, + "num_input_tokens_seen": 159875616, + "step": 131390 + }, + { + "epoch": 14.633589486579798, + "grad_norm": 0.009627055376768112, + "learning_rate": 1.0187442375501921e-05, + "loss": 0.0334, + "num_input_tokens_seen": 159881664, + "step": 131395 + }, + { + "epoch": 14.634146341463415, + "grad_norm": 2.1253199577331543, + "learning_rate": 1.018548512461443e-05, + "loss": 0.1847, + "num_input_tokens_seen": 159887616, + "step": 131400 + }, + { + "epoch": 14.634703196347033, + "grad_norm": 0.2137083113193512, + "learning_rate": 1.0183528013661891e-05, + "loss": 0.0581, + "num_input_tokens_seen": 159893728, + "step": 131405 + }, + { + "epoch": 14.635260051230649, + "grad_norm": 0.006814830005168915, + "learning_rate": 1.018157104266281e-05, + "loss": 0.0547, + "num_input_tokens_seen": 159900000, + "step": 131410 + }, + { + "epoch": 14.635816906114266, + "grad_norm": 0.1922054886817932, + "learning_rate": 1.0179614211635663e-05, + "loss": 0.0079, + "num_input_tokens_seen": 159905984, + "step": 131415 + }, + { + "epoch": 14.636373760997884, + "grad_norm": 0.0009890807559713721, + "learning_rate": 1.0177657520598935e-05, + "loss": 0.0142, + "num_input_tokens_seen": 159911808, + "step": 131420 + }, + { + "epoch": 14.636930615881502, + "grad_norm": 0.7696645259857178, + "learning_rate": 1.0175700969571098e-05, + "loss": 0.0513, + "num_input_tokens_seen": 159917856, + "step": 131425 + }, + { + "epoch": 14.63748747076512, + "grad_norm": 1.0726341009140015, + "learning_rate": 1.017374455857065e-05, + "loss": 0.0196, + "num_input_tokens_seen": 159924416, + "step": 131430 + }, + { + "epoch": 14.638044325648735, + "grad_norm": 0.5143814086914062, + "learning_rate": 1.0171788287616065e-05, + "loss": 0.0559, + "num_input_tokens_seen": 159930496, + "step": 131435 + }, + { + "epoch": 14.638601180532353, + "grad_norm": 0.17502227425575256, + "learning_rate": 1.016983215672582e-05, + "loss": 0.0055, + "num_input_tokens_seen": 159936064, + "step": 131440 + }, + { + "epoch": 14.63915803541597, + "grad_norm": 0.00040979028563015163, + "learning_rate": 1.016787616591838e-05, + "loss": 0.046, + "num_input_tokens_seen": 159942560, + "step": 131445 + }, + { + "epoch": 14.639714890299588, + "grad_norm": 6.270800076890737e-05, + "learning_rate": 1.0165920315212244e-05, + "loss": 0.0245, + "num_input_tokens_seen": 159948640, + "step": 131450 + }, + { + "epoch": 14.640271745183206, + "grad_norm": 1.5146524906158447, + "learning_rate": 1.0163964604625866e-05, + "loss": 0.099, + "num_input_tokens_seen": 159954464, + "step": 131455 + }, + { + "epoch": 14.640828600066822, + "grad_norm": 0.0024480782449245453, + "learning_rate": 1.0162009034177747e-05, + "loss": 0.0025, + "num_input_tokens_seen": 159960576, + "step": 131460 + }, + { + "epoch": 14.64138545495044, + "grad_norm": 3.255540370941162, + "learning_rate": 1.0160053603886325e-05, + "loss": 0.0765, + "num_input_tokens_seen": 159966272, + "step": 131465 + }, + { + "epoch": 14.641942309834057, + "grad_norm": 0.38923683762550354, + "learning_rate": 1.015809831377009e-05, + "loss": 0.0598, + "num_input_tokens_seen": 159971808, + "step": 131470 + }, + { + "epoch": 14.642499164717675, + "grad_norm": 1.5984272956848145, + "learning_rate": 1.0156143163847504e-05, + "loss": 0.054, + "num_input_tokens_seen": 159977632, + "step": 131475 + }, + { + "epoch": 14.643056019601293, + "grad_norm": 0.01710096187889576, + "learning_rate": 1.0154188154137042e-05, + "loss": 0.0029, + "num_input_tokens_seen": 159983552, + "step": 131480 + }, + { + "epoch": 14.643612874484909, + "grad_norm": 0.01683075726032257, + "learning_rate": 1.0152233284657173e-05, + "loss": 0.0423, + "num_input_tokens_seen": 159989376, + "step": 131485 + }, + { + "epoch": 14.644169729368526, + "grad_norm": 0.2736825942993164, + "learning_rate": 1.0150278555426351e-05, + "loss": 0.0299, + "num_input_tokens_seen": 159995200, + "step": 131490 + }, + { + "epoch": 14.644726584252144, + "grad_norm": 1.156432867050171, + "learning_rate": 1.0148323966463041e-05, + "loss": 0.0601, + "num_input_tokens_seen": 160001408, + "step": 131495 + }, + { + "epoch": 14.645283439135762, + "grad_norm": 0.05976841598749161, + "learning_rate": 1.0146369517785716e-05, + "loss": 0.0311, + "num_input_tokens_seen": 160007488, + "step": 131500 + }, + { + "epoch": 14.64584029401938, + "grad_norm": 1.0416749715805054, + "learning_rate": 1.0144415209412833e-05, + "loss": 0.0097, + "num_input_tokens_seen": 160013792, + "step": 131505 + }, + { + "epoch": 14.646397148902995, + "grad_norm": 0.6567375063896179, + "learning_rate": 1.014246104136285e-05, + "loss": 0.0141, + "num_input_tokens_seen": 160019968, + "step": 131510 + }, + { + "epoch": 14.646954003786613, + "grad_norm": 0.936907947063446, + "learning_rate": 1.0140507013654218e-05, + "loss": 0.0076, + "num_input_tokens_seen": 160025952, + "step": 131515 + }, + { + "epoch": 14.64751085867023, + "grad_norm": 0.08154276758432388, + "learning_rate": 1.013855312630541e-05, + "loss": 0.0011, + "num_input_tokens_seen": 160032224, + "step": 131520 + }, + { + "epoch": 14.648067713553848, + "grad_norm": 0.00011725346848834306, + "learning_rate": 1.0136599379334865e-05, + "loss": 0.0252, + "num_input_tokens_seen": 160038144, + "step": 131525 + }, + { + "epoch": 14.648624568437466, + "grad_norm": 0.007906894199550152, + "learning_rate": 1.0134645772761059e-05, + "loss": 0.0619, + "num_input_tokens_seen": 160044320, + "step": 131530 + }, + { + "epoch": 14.649181423321082, + "grad_norm": 0.14169488847255707, + "learning_rate": 1.0132692306602432e-05, + "loss": 0.0194, + "num_input_tokens_seen": 160050688, + "step": 131535 + }, + { + "epoch": 14.6497382782047, + "grad_norm": 0.0026579471305012703, + "learning_rate": 1.0130738980877438e-05, + "loss": 0.0363, + "num_input_tokens_seen": 160056704, + "step": 131540 + }, + { + "epoch": 14.650295133088317, + "grad_norm": 1.7976555824279785, + "learning_rate": 1.0128785795604518e-05, + "loss": 0.055, + "num_input_tokens_seen": 160062752, + "step": 131545 + }, + { + "epoch": 14.650851987971935, + "grad_norm": 0.1825653314590454, + "learning_rate": 1.0126832750802139e-05, + "loss": 0.0144, + "num_input_tokens_seen": 160068832, + "step": 131550 + }, + { + "epoch": 14.651408842855552, + "grad_norm": 0.005667346995323896, + "learning_rate": 1.0124879846488742e-05, + "loss": 0.0501, + "num_input_tokens_seen": 160074944, + "step": 131555 + }, + { + "epoch": 14.651965697739168, + "grad_norm": 0.0051633017137646675, + "learning_rate": 1.012292708268277e-05, + "loss": 0.0536, + "num_input_tokens_seen": 160080864, + "step": 131560 + }, + { + "epoch": 14.652522552622786, + "grad_norm": 0.6286059021949768, + "learning_rate": 1.0120974459402665e-05, + "loss": 0.0833, + "num_input_tokens_seen": 160087168, + "step": 131565 + }, + { + "epoch": 14.653079407506404, + "grad_norm": 0.3659575879573822, + "learning_rate": 1.0119021976666888e-05, + "loss": 0.0066, + "num_input_tokens_seen": 160093184, + "step": 131570 + }, + { + "epoch": 14.653636262390021, + "grad_norm": 0.2911461889743805, + "learning_rate": 1.0117069634493858e-05, + "loss": 0.0607, + "num_input_tokens_seen": 160099360, + "step": 131575 + }, + { + "epoch": 14.654193117273639, + "grad_norm": 0.1706988364458084, + "learning_rate": 1.011511743290205e-05, + "loss": 0.0543, + "num_input_tokens_seen": 160105280, + "step": 131580 + }, + { + "epoch": 14.654749972157255, + "grad_norm": 2.225731372833252, + "learning_rate": 1.0113165371909864e-05, + "loss": 0.0411, + "num_input_tokens_seen": 160110976, + "step": 131585 + }, + { + "epoch": 14.655306827040873, + "grad_norm": 0.0988001823425293, + "learning_rate": 1.0111213451535764e-05, + "loss": 0.0672, + "num_input_tokens_seen": 160117024, + "step": 131590 + }, + { + "epoch": 14.65586368192449, + "grad_norm": 0.8806242942810059, + "learning_rate": 1.0109261671798176e-05, + "loss": 0.0329, + "num_input_tokens_seen": 160123264, + "step": 131595 + }, + { + "epoch": 14.656420536808108, + "grad_norm": 0.0008747925749048591, + "learning_rate": 1.0107310032715553e-05, + "loss": 0.0458, + "num_input_tokens_seen": 160129728, + "step": 131600 + }, + { + "epoch": 14.656977391691726, + "grad_norm": 1.750157117843628, + "learning_rate": 1.0105358534306315e-05, + "loss": 0.0811, + "num_input_tokens_seen": 160136032, + "step": 131605 + }, + { + "epoch": 14.657534246575342, + "grad_norm": 0.010261353105306625, + "learning_rate": 1.01034071765889e-05, + "loss": 0.0845, + "num_input_tokens_seen": 160141952, + "step": 131610 + }, + { + "epoch": 14.65809110145896, + "grad_norm": 0.33535492420196533, + "learning_rate": 1.010145595958173e-05, + "loss": 0.017, + "num_input_tokens_seen": 160148064, + "step": 131615 + }, + { + "epoch": 14.658647956342577, + "grad_norm": 0.00010963326349155977, + "learning_rate": 1.0099504883303254e-05, + "loss": 0.0161, + "num_input_tokens_seen": 160154304, + "step": 131620 + }, + { + "epoch": 14.659204811226195, + "grad_norm": 1.0160993337631226, + "learning_rate": 1.0097553947771893e-05, + "loss": 0.034, + "num_input_tokens_seen": 160160832, + "step": 131625 + }, + { + "epoch": 14.659761666109812, + "grad_norm": 1.8565471172332764, + "learning_rate": 1.0095603153006075e-05, + "loss": 0.0878, + "num_input_tokens_seen": 160167104, + "step": 131630 + }, + { + "epoch": 14.66031852099343, + "grad_norm": 0.001248806482180953, + "learning_rate": 1.0093652499024218e-05, + "loss": 0.0023, + "num_input_tokens_seen": 160173472, + "step": 131635 + }, + { + "epoch": 14.660875375877046, + "grad_norm": 0.9105693697929382, + "learning_rate": 1.0091701985844762e-05, + "loss": 0.1036, + "num_input_tokens_seen": 160179424, + "step": 131640 + }, + { + "epoch": 14.661432230760663, + "grad_norm": 1.0723284482955933, + "learning_rate": 1.0089751613486118e-05, + "loss": 0.0365, + "num_input_tokens_seen": 160185664, + "step": 131645 + }, + { + "epoch": 14.661989085644281, + "grad_norm": 0.7661765813827515, + "learning_rate": 1.0087801381966732e-05, + "loss": 0.0383, + "num_input_tokens_seen": 160191744, + "step": 131650 + }, + { + "epoch": 14.662545940527899, + "grad_norm": 0.10450057685375214, + "learning_rate": 1.0085851291305004e-05, + "loss": 0.0223, + "num_input_tokens_seen": 160197664, + "step": 131655 + }, + { + "epoch": 14.663102795411517, + "grad_norm": 0.018604036420583725, + "learning_rate": 1.0083901341519347e-05, + "loss": 0.0377, + "num_input_tokens_seen": 160203584, + "step": 131660 + }, + { + "epoch": 14.663659650295132, + "grad_norm": 0.5696600079536438, + "learning_rate": 1.0081951532628204e-05, + "loss": 0.0195, + "num_input_tokens_seen": 160209504, + "step": 131665 + }, + { + "epoch": 14.66421650517875, + "grad_norm": 1.2048823833465576, + "learning_rate": 1.0080001864649972e-05, + "loss": 0.0874, + "num_input_tokens_seen": 160215456, + "step": 131670 + }, + { + "epoch": 14.664773360062368, + "grad_norm": 0.0001911453000502661, + "learning_rate": 1.0078052337603084e-05, + "loss": 0.01, + "num_input_tokens_seen": 160221376, + "step": 131675 + }, + { + "epoch": 14.665330214945985, + "grad_norm": 2.8604917526245117, + "learning_rate": 1.0076102951505947e-05, + "loss": 0.1406, + "num_input_tokens_seen": 160227520, + "step": 131680 + }, + { + "epoch": 14.665887069829603, + "grad_norm": 1.483564019203186, + "learning_rate": 1.0074153706376974e-05, + "loss": 0.028, + "num_input_tokens_seen": 160233760, + "step": 131685 + }, + { + "epoch": 14.666443924713219, + "grad_norm": 0.0001609097671462223, + "learning_rate": 1.007220460223457e-05, + "loss": 0.1361, + "num_input_tokens_seen": 160239712, + "step": 131690 + }, + { + "epoch": 14.667000779596837, + "grad_norm": 0.17533868551254272, + "learning_rate": 1.007025563909716e-05, + "loss": 0.07, + "num_input_tokens_seen": 160245920, + "step": 131695 + }, + { + "epoch": 14.667557634480454, + "grad_norm": 1.4580368995666504, + "learning_rate": 1.006830681698315e-05, + "loss": 0.1391, + "num_input_tokens_seen": 160252032, + "step": 131700 + }, + { + "epoch": 14.668114489364072, + "grad_norm": 0.016514981165528297, + "learning_rate": 1.0066358135910942e-05, + "loss": 0.0101, + "num_input_tokens_seen": 160258080, + "step": 131705 + }, + { + "epoch": 14.66867134424769, + "grad_norm": 0.4146519601345062, + "learning_rate": 1.0064409595898942e-05, + "loss": 0.0116, + "num_input_tokens_seen": 160264512, + "step": 131710 + }, + { + "epoch": 14.669228199131306, + "grad_norm": 0.0005518403486348689, + "learning_rate": 1.0062461196965564e-05, + "loss": 0.0135, + "num_input_tokens_seen": 160270400, + "step": 131715 + }, + { + "epoch": 14.669785054014923, + "grad_norm": 2.9165456295013428, + "learning_rate": 1.0060512939129207e-05, + "loss": 0.1061, + "num_input_tokens_seen": 160275584, + "step": 131720 + }, + { + "epoch": 14.670341908898541, + "grad_norm": 0.028350941836833954, + "learning_rate": 1.0058564822408279e-05, + "loss": 0.0045, + "num_input_tokens_seen": 160281920, + "step": 131725 + }, + { + "epoch": 14.670898763782159, + "grad_norm": 0.09174466878175735, + "learning_rate": 1.0056616846821165e-05, + "loss": 0.047, + "num_input_tokens_seen": 160288416, + "step": 131730 + }, + { + "epoch": 14.671455618665776, + "grad_norm": 0.22379694879055023, + "learning_rate": 1.0054669012386287e-05, + "loss": 0.0225, + "num_input_tokens_seen": 160294464, + "step": 131735 + }, + { + "epoch": 14.672012473549392, + "grad_norm": 0.0002377178316237405, + "learning_rate": 1.0052721319122025e-05, + "loss": 0.0015, + "num_input_tokens_seen": 160300800, + "step": 131740 + }, + { + "epoch": 14.67256932843301, + "grad_norm": 0.9169391393661499, + "learning_rate": 1.0050773767046794e-05, + "loss": 0.0712, + "num_input_tokens_seen": 160306624, + "step": 131745 + }, + { + "epoch": 14.673126183316628, + "grad_norm": 0.0003631845465861261, + "learning_rate": 1.0048826356178983e-05, + "loss": 0.0167, + "num_input_tokens_seen": 160312736, + "step": 131750 + }, + { + "epoch": 14.673683038200245, + "grad_norm": 0.048768654465675354, + "learning_rate": 1.0046879086536987e-05, + "loss": 0.1144, + "num_input_tokens_seen": 160318880, + "step": 131755 + }, + { + "epoch": 14.674239893083863, + "grad_norm": 0.07642892003059387, + "learning_rate": 1.0044931958139186e-05, + "loss": 0.0031, + "num_input_tokens_seen": 160324896, + "step": 131760 + }, + { + "epoch": 14.67479674796748, + "grad_norm": 1.9579912424087524, + "learning_rate": 1.0042984971003996e-05, + "loss": 0.0465, + "num_input_tokens_seen": 160330944, + "step": 131765 + }, + { + "epoch": 14.675353602851096, + "grad_norm": 1.1569606065750122, + "learning_rate": 1.0041038125149795e-05, + "loss": 0.0562, + "num_input_tokens_seen": 160337024, + "step": 131770 + }, + { + "epoch": 14.675910457734714, + "grad_norm": 2.6172876358032227, + "learning_rate": 1.0039091420594976e-05, + "loss": 0.0602, + "num_input_tokens_seen": 160342304, + "step": 131775 + }, + { + "epoch": 14.676467312618332, + "grad_norm": 0.07976619154214859, + "learning_rate": 1.0037144857357916e-05, + "loss": 0.0212, + "num_input_tokens_seen": 160348544, + "step": 131780 + }, + { + "epoch": 14.67702416750195, + "grad_norm": 3.3617939949035645, + "learning_rate": 1.0035198435457015e-05, + "loss": 0.1319, + "num_input_tokens_seen": 160354592, + "step": 131785 + }, + { + "epoch": 14.677581022385567, + "grad_norm": 1.6742700338363647, + "learning_rate": 1.0033252154910652e-05, + "loss": 0.0745, + "num_input_tokens_seen": 160360736, + "step": 131790 + }, + { + "epoch": 14.678137877269183, + "grad_norm": 0.17691197991371155, + "learning_rate": 1.0031306015737226e-05, + "loss": 0.0153, + "num_input_tokens_seen": 160366304, + "step": 131795 + }, + { + "epoch": 14.6786947321528, + "grad_norm": 0.0011279763421043754, + "learning_rate": 1.0029360017955094e-05, + "loss": 0.0203, + "num_input_tokens_seen": 160372704, + "step": 131800 + }, + { + "epoch": 14.679251587036418, + "grad_norm": 0.0032976618967950344, + "learning_rate": 1.0027414161582658e-05, + "loss": 0.0501, + "num_input_tokens_seen": 160378816, + "step": 131805 + }, + { + "epoch": 14.679808441920036, + "grad_norm": 0.10596682131290436, + "learning_rate": 1.0025468446638281e-05, + "loss": 0.0095, + "num_input_tokens_seen": 160384960, + "step": 131810 + }, + { + "epoch": 14.680365296803654, + "grad_norm": 0.09951255470514297, + "learning_rate": 1.0023522873140361e-05, + "loss": 0.1153, + "num_input_tokens_seen": 160390816, + "step": 131815 + }, + { + "epoch": 14.68092215168727, + "grad_norm": 3.1259872913360596, + "learning_rate": 1.0021577441107265e-05, + "loss": 0.0781, + "num_input_tokens_seen": 160396512, + "step": 131820 + }, + { + "epoch": 14.681479006570887, + "grad_norm": 0.00012260072981007397, + "learning_rate": 1.0019632150557376e-05, + "loss": 0.0849, + "num_input_tokens_seen": 160402688, + "step": 131825 + }, + { + "epoch": 14.682035861454505, + "grad_norm": 0.27958086133003235, + "learning_rate": 1.0017687001509049e-05, + "loss": 0.0027, + "num_input_tokens_seen": 160408832, + "step": 131830 + }, + { + "epoch": 14.682592716338123, + "grad_norm": 0.3811851739883423, + "learning_rate": 1.0015741993980685e-05, + "loss": 0.0812, + "num_input_tokens_seen": 160414848, + "step": 131835 + }, + { + "epoch": 14.68314957122174, + "grad_norm": 0.2791137099266052, + "learning_rate": 1.001379712799064e-05, + "loss": 0.0249, + "num_input_tokens_seen": 160421120, + "step": 131840 + }, + { + "epoch": 14.683706426105356, + "grad_norm": 0.6870487332344055, + "learning_rate": 1.001185240355729e-05, + "loss": 0.0355, + "num_input_tokens_seen": 160426816, + "step": 131845 + }, + { + "epoch": 14.684263280988974, + "grad_norm": 0.08633355796337128, + "learning_rate": 1.000990782069899e-05, + "loss": 0.0591, + "num_input_tokens_seen": 160432640, + "step": 131850 + }, + { + "epoch": 14.684820135872592, + "grad_norm": 0.8866099715232849, + "learning_rate": 1.0007963379434131e-05, + "loss": 0.0961, + "num_input_tokens_seen": 160438944, + "step": 131855 + }, + { + "epoch": 14.68537699075621, + "grad_norm": 0.015898920595645905, + "learning_rate": 1.0006019079781062e-05, + "loss": 0.013, + "num_input_tokens_seen": 160445184, + "step": 131860 + }, + { + "epoch": 14.685933845639827, + "grad_norm": 0.007609265390783548, + "learning_rate": 1.0004074921758175e-05, + "loss": 0.0036, + "num_input_tokens_seen": 160451168, + "step": 131865 + }, + { + "epoch": 14.686490700523443, + "grad_norm": 1.4116361141204834, + "learning_rate": 1.0002130905383794e-05, + "loss": 0.0362, + "num_input_tokens_seen": 160457312, + "step": 131870 + }, + { + "epoch": 14.68704755540706, + "grad_norm": 0.0313316248357296, + "learning_rate": 1.0000187030676312e-05, + "loss": 0.0221, + "num_input_tokens_seen": 160463328, + "step": 131875 + }, + { + "epoch": 14.687604410290678, + "grad_norm": 1.1422178745269775, + "learning_rate": 9.998243297654072e-06, + "loss": 0.0451, + "num_input_tokens_seen": 160469280, + "step": 131880 + }, + { + "epoch": 14.688161265174296, + "grad_norm": 0.2673003673553467, + "learning_rate": 9.996299706335452e-06, + "loss": 0.1144, + "num_input_tokens_seen": 160475552, + "step": 131885 + }, + { + "epoch": 14.688718120057914, + "grad_norm": 0.38257086277008057, + "learning_rate": 9.994356256738805e-06, + "loss": 0.0511, + "num_input_tokens_seen": 160481792, + "step": 131890 + }, + { + "epoch": 14.68927497494153, + "grad_norm": 9.993257845053449e-05, + "learning_rate": 9.992412948882481e-06, + "loss": 0.0605, + "num_input_tokens_seen": 160488128, + "step": 131895 + }, + { + "epoch": 14.689831829825147, + "grad_norm": 0.0004096678749192506, + "learning_rate": 9.990469782784836e-06, + "loss": 0.0055, + "num_input_tokens_seen": 160494368, + "step": 131900 + }, + { + "epoch": 14.690388684708765, + "grad_norm": 0.06935016810894012, + "learning_rate": 9.988526758464237e-06, + "loss": 0.0332, + "num_input_tokens_seen": 160500320, + "step": 131905 + }, + { + "epoch": 14.690945539592382, + "grad_norm": 0.029386119917035103, + "learning_rate": 9.986583875939026e-06, + "loss": 0.0379, + "num_input_tokens_seen": 160506560, + "step": 131910 + }, + { + "epoch": 14.691502394476, + "grad_norm": 0.0024951405357569456, + "learning_rate": 9.984641135227563e-06, + "loss": 0.0547, + "num_input_tokens_seen": 160512128, + "step": 131915 + }, + { + "epoch": 14.692059249359616, + "grad_norm": 2.016324758529663, + "learning_rate": 9.982698536348184e-06, + "loss": 0.1209, + "num_input_tokens_seen": 160518496, + "step": 131920 + }, + { + "epoch": 14.692616104243234, + "grad_norm": 0.5881636738777161, + "learning_rate": 9.98075607931926e-06, + "loss": 0.0107, + "num_input_tokens_seen": 160524512, + "step": 131925 + }, + { + "epoch": 14.693172959126851, + "grad_norm": 0.27475330233573914, + "learning_rate": 9.978813764159117e-06, + "loss": 0.0068, + "num_input_tokens_seen": 160530816, + "step": 131930 + }, + { + "epoch": 14.693729814010469, + "grad_norm": 0.05757275968790054, + "learning_rate": 9.976871590886122e-06, + "loss": 0.1033, + "num_input_tokens_seen": 160536608, + "step": 131935 + }, + { + "epoch": 14.694286668894087, + "grad_norm": 0.0018152255797758698, + "learning_rate": 9.974929559518612e-06, + "loss": 0.092, + "num_input_tokens_seen": 160543040, + "step": 131940 + }, + { + "epoch": 14.694843523777703, + "grad_norm": 0.061780575662851334, + "learning_rate": 9.972987670074929e-06, + "loss": 0.0341, + "num_input_tokens_seen": 160549216, + "step": 131945 + }, + { + "epoch": 14.69540037866132, + "grad_norm": 0.323742538690567, + "learning_rate": 9.971045922573407e-06, + "loss": 0.0374, + "num_input_tokens_seen": 160554944, + "step": 131950 + }, + { + "epoch": 14.695957233544938, + "grad_norm": 0.48635929822921753, + "learning_rate": 9.96910431703241e-06, + "loss": 0.04, + "num_input_tokens_seen": 160561216, + "step": 131955 + }, + { + "epoch": 14.696514088428556, + "grad_norm": 0.44814059138298035, + "learning_rate": 9.96716285347026e-06, + "loss": 0.0674, + "num_input_tokens_seen": 160567104, + "step": 131960 + }, + { + "epoch": 14.697070943312173, + "grad_norm": 1.8914599418640137, + "learning_rate": 9.965221531905305e-06, + "loss": 0.2065, + "num_input_tokens_seen": 160573248, + "step": 131965 + }, + { + "epoch": 14.69762779819579, + "grad_norm": 0.046313486993312836, + "learning_rate": 9.963280352355869e-06, + "loss": 0.0043, + "num_input_tokens_seen": 160579008, + "step": 131970 + }, + { + "epoch": 14.698184653079407, + "grad_norm": 0.800497829914093, + "learning_rate": 9.961339314840307e-06, + "loss": 0.0195, + "num_input_tokens_seen": 160585344, + "step": 131975 + }, + { + "epoch": 14.698741507963025, + "grad_norm": 0.022438183426856995, + "learning_rate": 9.959398419376932e-06, + "loss": 0.11, + "num_input_tokens_seen": 160591584, + "step": 131980 + }, + { + "epoch": 14.699298362846642, + "grad_norm": 0.7821102738380432, + "learning_rate": 9.957457665984107e-06, + "loss": 0.0877, + "num_input_tokens_seen": 160597760, + "step": 131985 + }, + { + "epoch": 14.69985521773026, + "grad_norm": 0.00011384537356207147, + "learning_rate": 9.95551705468013e-06, + "loss": 0.0063, + "num_input_tokens_seen": 160603584, + "step": 131990 + }, + { + "epoch": 14.700412072613878, + "grad_norm": 0.044701624661684036, + "learning_rate": 9.95357658548336e-06, + "loss": 0.024, + "num_input_tokens_seen": 160609600, + "step": 131995 + }, + { + "epoch": 14.700968927497494, + "grad_norm": 0.7122172713279724, + "learning_rate": 9.9516362584121e-06, + "loss": 0.0203, + "num_input_tokens_seen": 160615520, + "step": 132000 + }, + { + "epoch": 14.701525782381111, + "grad_norm": 0.00040490503306500614, + "learning_rate": 9.949696073484704e-06, + "loss": 0.0704, + "num_input_tokens_seen": 160621632, + "step": 132005 + }, + { + "epoch": 14.702082637264729, + "grad_norm": 0.00026936011272482574, + "learning_rate": 9.947756030719486e-06, + "loss": 0.009, + "num_input_tokens_seen": 160627904, + "step": 132010 + }, + { + "epoch": 14.702639492148347, + "grad_norm": 1.3630279302597046, + "learning_rate": 9.945816130134772e-06, + "loss": 0.0896, + "num_input_tokens_seen": 160634016, + "step": 132015 + }, + { + "epoch": 14.703196347031964, + "grad_norm": 0.006934701930731535, + "learning_rate": 9.943876371748875e-06, + "loss": 0.0457, + "num_input_tokens_seen": 160640416, + "step": 132020 + }, + { + "epoch": 14.70375320191558, + "grad_norm": 0.11275991052389145, + "learning_rate": 9.941936755580139e-06, + "loss": 0.0129, + "num_input_tokens_seen": 160646176, + "step": 132025 + }, + { + "epoch": 14.704310056799198, + "grad_norm": 0.03221602737903595, + "learning_rate": 9.939997281646876e-06, + "loss": 0.1409, + "num_input_tokens_seen": 160652224, + "step": 132030 + }, + { + "epoch": 14.704866911682815, + "grad_norm": 0.036253150552511215, + "learning_rate": 9.938057949967403e-06, + "loss": 0.0931, + "num_input_tokens_seen": 160657888, + "step": 132035 + }, + { + "epoch": 14.705423766566433, + "grad_norm": 0.700356125831604, + "learning_rate": 9.936118760560032e-06, + "loss": 0.1196, + "num_input_tokens_seen": 160664256, + "step": 132040 + }, + { + "epoch": 14.70598062145005, + "grad_norm": 0.019013574346899986, + "learning_rate": 9.934179713443095e-06, + "loss": 0.0336, + "num_input_tokens_seen": 160669824, + "step": 132045 + }, + { + "epoch": 14.706537476333667, + "grad_norm": 1.9050184488296509, + "learning_rate": 9.932240808634893e-06, + "loss": 0.1885, + "num_input_tokens_seen": 160675744, + "step": 132050 + }, + { + "epoch": 14.707094331217284, + "grad_norm": 1.8976713418960571, + "learning_rate": 9.930302046153769e-06, + "loss": 0.1015, + "num_input_tokens_seen": 160681856, + "step": 132055 + }, + { + "epoch": 14.707651186100902, + "grad_norm": 1.8531187772750854, + "learning_rate": 9.928363426017994e-06, + "loss": 0.1285, + "num_input_tokens_seen": 160687488, + "step": 132060 + }, + { + "epoch": 14.70820804098452, + "grad_norm": 0.1276187151670456, + "learning_rate": 9.926424948245913e-06, + "loss": 0.0033, + "num_input_tokens_seen": 160693664, + "step": 132065 + }, + { + "epoch": 14.708764895868137, + "grad_norm": 1.2442195415496826, + "learning_rate": 9.924486612855827e-06, + "loss": 0.0477, + "num_input_tokens_seen": 160699776, + "step": 132070 + }, + { + "epoch": 14.709321750751753, + "grad_norm": 0.3584091365337372, + "learning_rate": 9.922548419866033e-06, + "loss": 0.0534, + "num_input_tokens_seen": 160705216, + "step": 132075 + }, + { + "epoch": 14.709878605635371, + "grad_norm": 0.10124973952770233, + "learning_rate": 9.920610369294856e-06, + "loss": 0.0504, + "num_input_tokens_seen": 160711104, + "step": 132080 + }, + { + "epoch": 14.710435460518989, + "grad_norm": 0.9037397503852844, + "learning_rate": 9.918672461160597e-06, + "loss": 0.0232, + "num_input_tokens_seen": 160717056, + "step": 132085 + }, + { + "epoch": 14.710992315402606, + "grad_norm": 0.8056529760360718, + "learning_rate": 9.916734695481559e-06, + "loss": 0.0301, + "num_input_tokens_seen": 160723360, + "step": 132090 + }, + { + "epoch": 14.711549170286224, + "grad_norm": 0.007319719530642033, + "learning_rate": 9.914797072276036e-06, + "loss": 0.0457, + "num_input_tokens_seen": 160729120, + "step": 132095 + }, + { + "epoch": 14.712106025169842, + "grad_norm": 0.045904792845249176, + "learning_rate": 9.912859591562351e-06, + "loss": 0.0166, + "num_input_tokens_seen": 160735200, + "step": 132100 + }, + { + "epoch": 14.712662880053458, + "grad_norm": 0.6344276070594788, + "learning_rate": 9.910922253358795e-06, + "loss": 0.0558, + "num_input_tokens_seen": 160741408, + "step": 132105 + }, + { + "epoch": 14.713219734937075, + "grad_norm": 0.09397788345813751, + "learning_rate": 9.908985057683667e-06, + "loss": 0.0311, + "num_input_tokens_seen": 160746848, + "step": 132110 + }, + { + "epoch": 14.713776589820693, + "grad_norm": 0.6534756422042847, + "learning_rate": 9.907048004555258e-06, + "loss": 0.0099, + "num_input_tokens_seen": 160753472, + "step": 132115 + }, + { + "epoch": 14.71433344470431, + "grad_norm": 8.696858276380226e-05, + "learning_rate": 9.905111093991881e-06, + "loss": 0.0023, + "num_input_tokens_seen": 160759744, + "step": 132120 + }, + { + "epoch": 14.714890299587928, + "grad_norm": 0.0007576423231512308, + "learning_rate": 9.903174326011817e-06, + "loss": 0.041, + "num_input_tokens_seen": 160765920, + "step": 132125 + }, + { + "epoch": 14.715447154471544, + "grad_norm": 0.12031959742307663, + "learning_rate": 9.901237700633381e-06, + "loss": 0.0079, + "num_input_tokens_seen": 160772096, + "step": 132130 + }, + { + "epoch": 14.716004009355162, + "grad_norm": 0.4891442656517029, + "learning_rate": 9.899301217874834e-06, + "loss": 0.0272, + "num_input_tokens_seen": 160778336, + "step": 132135 + }, + { + "epoch": 14.71656086423878, + "grad_norm": 1.4700983762741089, + "learning_rate": 9.897364877754498e-06, + "loss": 0.0283, + "num_input_tokens_seen": 160784640, + "step": 132140 + }, + { + "epoch": 14.717117719122397, + "grad_norm": 0.006701298523694277, + "learning_rate": 9.895428680290639e-06, + "loss": 0.1237, + "num_input_tokens_seen": 160790624, + "step": 132145 + }, + { + "epoch": 14.717674574006015, + "grad_norm": 0.5247284173965454, + "learning_rate": 9.893492625501569e-06, + "loss": 0.0114, + "num_input_tokens_seen": 160796576, + "step": 132150 + }, + { + "epoch": 14.71823142888963, + "grad_norm": 1.4541536569595337, + "learning_rate": 9.891556713405561e-06, + "loss": 0.119, + "num_input_tokens_seen": 160802592, + "step": 132155 + }, + { + "epoch": 14.718788283773248, + "grad_norm": 0.00464979512616992, + "learning_rate": 9.889620944020909e-06, + "loss": 0.0228, + "num_input_tokens_seen": 160808352, + "step": 132160 + }, + { + "epoch": 14.719345138656866, + "grad_norm": 0.004068510606884956, + "learning_rate": 9.88768531736588e-06, + "loss": 0.0204, + "num_input_tokens_seen": 160814464, + "step": 132165 + }, + { + "epoch": 14.719901993540484, + "grad_norm": 0.18948647379875183, + "learning_rate": 9.885749833458782e-06, + "loss": 0.0785, + "num_input_tokens_seen": 160821088, + "step": 132170 + }, + { + "epoch": 14.720458848424101, + "grad_norm": 0.9245074391365051, + "learning_rate": 9.883814492317885e-06, + "loss": 0.039, + "num_input_tokens_seen": 160827328, + "step": 132175 + }, + { + "epoch": 14.721015703307717, + "grad_norm": 0.48728740215301514, + "learning_rate": 9.881879293961472e-06, + "loss": 0.0243, + "num_input_tokens_seen": 160833696, + "step": 132180 + }, + { + "epoch": 14.721572558191335, + "grad_norm": 1.9141504764556885, + "learning_rate": 9.879944238407811e-06, + "loss": 0.1619, + "num_input_tokens_seen": 160839744, + "step": 132185 + }, + { + "epoch": 14.722129413074953, + "grad_norm": 1.2441707849502563, + "learning_rate": 9.878009325675202e-06, + "loss": 0.0796, + "num_input_tokens_seen": 160846176, + "step": 132190 + }, + { + "epoch": 14.72268626795857, + "grad_norm": 0.0572652593255043, + "learning_rate": 9.8760745557819e-06, + "loss": 0.0149, + "num_input_tokens_seen": 160852256, + "step": 132195 + }, + { + "epoch": 14.723243122842188, + "grad_norm": 0.025566548109054565, + "learning_rate": 9.87413992874621e-06, + "loss": 0.0271, + "num_input_tokens_seen": 160858464, + "step": 132200 + }, + { + "epoch": 14.723799977725804, + "grad_norm": 1.6610643863677979, + "learning_rate": 9.87220544458637e-06, + "loss": 0.096, + "num_input_tokens_seen": 160864160, + "step": 132205 + }, + { + "epoch": 14.724356832609422, + "grad_norm": 0.0002647251239977777, + "learning_rate": 9.870271103320674e-06, + "loss": 0.0055, + "num_input_tokens_seen": 160870272, + "step": 132210 + }, + { + "epoch": 14.72491368749304, + "grad_norm": 0.006820674054324627, + "learning_rate": 9.868336904967385e-06, + "loss": 0.0031, + "num_input_tokens_seen": 160876544, + "step": 132215 + }, + { + "epoch": 14.725470542376657, + "grad_norm": 0.00034691800829023123, + "learning_rate": 9.866402849544784e-06, + "loss": 0.0257, + "num_input_tokens_seen": 160882912, + "step": 132220 + }, + { + "epoch": 14.726027397260275, + "grad_norm": 3.2702550888061523, + "learning_rate": 9.864468937071134e-06, + "loss": 0.0517, + "num_input_tokens_seen": 160889184, + "step": 132225 + }, + { + "epoch": 14.72658425214389, + "grad_norm": 0.6073636412620544, + "learning_rate": 9.8625351675647e-06, + "loss": 0.0796, + "num_input_tokens_seen": 160895328, + "step": 132230 + }, + { + "epoch": 14.727141107027508, + "grad_norm": 0.17209096252918243, + "learning_rate": 9.86060154104374e-06, + "loss": 0.1697, + "num_input_tokens_seen": 160901344, + "step": 132235 + }, + { + "epoch": 14.727697961911126, + "grad_norm": 0.09381674975156784, + "learning_rate": 9.858668057526537e-06, + "loss": 0.0347, + "num_input_tokens_seen": 160907552, + "step": 132240 + }, + { + "epoch": 14.728254816794744, + "grad_norm": 0.004861076828092337, + "learning_rate": 9.856734717031347e-06, + "loss": 0.0249, + "num_input_tokens_seen": 160913536, + "step": 132245 + }, + { + "epoch": 14.728811671678361, + "grad_norm": 0.047209762036800385, + "learning_rate": 9.854801519576429e-06, + "loss": 0.0148, + "num_input_tokens_seen": 160920096, + "step": 132250 + }, + { + "epoch": 14.729368526561977, + "grad_norm": 0.0005324282101355493, + "learning_rate": 9.852868465180038e-06, + "loss": 0.0928, + "num_input_tokens_seen": 160926400, + "step": 132255 + }, + { + "epoch": 14.729925381445595, + "grad_norm": 0.011852361261844635, + "learning_rate": 9.850935553860446e-06, + "loss": 0.0187, + "num_input_tokens_seen": 160932576, + "step": 132260 + }, + { + "epoch": 14.730482236329212, + "grad_norm": 1.7477138042449951, + "learning_rate": 9.849002785635897e-06, + "loss": 0.1379, + "num_input_tokens_seen": 160938464, + "step": 132265 + }, + { + "epoch": 14.73103909121283, + "grad_norm": 0.051335614174604416, + "learning_rate": 9.847070160524674e-06, + "loss": 0.0152, + "num_input_tokens_seen": 160944544, + "step": 132270 + }, + { + "epoch": 14.731595946096448, + "grad_norm": 0.015374762937426567, + "learning_rate": 9.845137678544993e-06, + "loss": 0.0017, + "num_input_tokens_seen": 160951008, + "step": 132275 + }, + { + "epoch": 14.732152800980064, + "grad_norm": 0.00015557452570647, + "learning_rate": 9.843205339715141e-06, + "loss": 0.1115, + "num_input_tokens_seen": 160957056, + "step": 132280 + }, + { + "epoch": 14.732709655863681, + "grad_norm": 0.09535727649927139, + "learning_rate": 9.841273144053348e-06, + "loss": 0.0358, + "num_input_tokens_seen": 160963232, + "step": 132285 + }, + { + "epoch": 14.733266510747299, + "grad_norm": 0.11104031652212143, + "learning_rate": 9.839341091577883e-06, + "loss": 0.0854, + "num_input_tokens_seen": 160969344, + "step": 132290 + }, + { + "epoch": 14.733823365630917, + "grad_norm": 0.03747601807117462, + "learning_rate": 9.837409182306989e-06, + "loss": 0.1238, + "num_input_tokens_seen": 160975040, + "step": 132295 + }, + { + "epoch": 14.734380220514534, + "grad_norm": 0.2331518828868866, + "learning_rate": 9.835477416258912e-06, + "loss": 0.0053, + "num_input_tokens_seen": 160981376, + "step": 132300 + }, + { + "epoch": 14.73493707539815, + "grad_norm": 0.09070717543363571, + "learning_rate": 9.833545793451892e-06, + "loss": 0.103, + "num_input_tokens_seen": 160987488, + "step": 132305 + }, + { + "epoch": 14.735493930281768, + "grad_norm": 0.006790249142795801, + "learning_rate": 9.831614313904194e-06, + "loss": 0.0156, + "num_input_tokens_seen": 160993792, + "step": 132310 + }, + { + "epoch": 14.736050785165386, + "grad_norm": 0.07277800142765045, + "learning_rate": 9.82968297763405e-06, + "loss": 0.0086, + "num_input_tokens_seen": 160999744, + "step": 132315 + }, + { + "epoch": 14.736607640049003, + "grad_norm": 0.0005352633306756616, + "learning_rate": 9.827751784659703e-06, + "loss": 0.0964, + "num_input_tokens_seen": 161005952, + "step": 132320 + }, + { + "epoch": 14.737164494932621, + "grad_norm": 0.18155822157859802, + "learning_rate": 9.825820734999389e-06, + "loss": 0.1791, + "num_input_tokens_seen": 161011808, + "step": 132325 + }, + { + "epoch": 14.737721349816239, + "grad_norm": 0.0001648951874813065, + "learning_rate": 9.823889828671364e-06, + "loss": 0.0147, + "num_input_tokens_seen": 161017280, + "step": 132330 + }, + { + "epoch": 14.738278204699855, + "grad_norm": 0.9086172580718994, + "learning_rate": 9.82195906569385e-06, + "loss": 0.1067, + "num_input_tokens_seen": 161023456, + "step": 132335 + }, + { + "epoch": 14.738835059583472, + "grad_norm": 1.0035982131958008, + "learning_rate": 9.820028446085103e-06, + "loss": 0.0312, + "num_input_tokens_seen": 161029408, + "step": 132340 + }, + { + "epoch": 14.73939191446709, + "grad_norm": 0.040687642991542816, + "learning_rate": 9.818097969863347e-06, + "loss": 0.0179, + "num_input_tokens_seen": 161035936, + "step": 132345 + }, + { + "epoch": 14.739948769350708, + "grad_norm": 1.255496621131897, + "learning_rate": 9.816167637046823e-06, + "loss": 0.0713, + "num_input_tokens_seen": 161041952, + "step": 132350 + }, + { + "epoch": 14.740505624234325, + "grad_norm": 9.347622108180076e-05, + "learning_rate": 9.814237447653746e-06, + "loss": 0.0015, + "num_input_tokens_seen": 161048160, + "step": 132355 + }, + { + "epoch": 14.741062479117941, + "grad_norm": 0.03004593960940838, + "learning_rate": 9.812307401702375e-06, + "loss": 0.0158, + "num_input_tokens_seen": 161054176, + "step": 132360 + }, + { + "epoch": 14.741619334001559, + "grad_norm": 0.0016274971421808004, + "learning_rate": 9.81037749921093e-06, + "loss": 0.0065, + "num_input_tokens_seen": 161060480, + "step": 132365 + }, + { + "epoch": 14.742176188885177, + "grad_norm": 0.14647215604782104, + "learning_rate": 9.808447740197638e-06, + "loss": 0.0057, + "num_input_tokens_seen": 161066368, + "step": 132370 + }, + { + "epoch": 14.742733043768794, + "grad_norm": 0.0011975191300734878, + "learning_rate": 9.80651812468072e-06, + "loss": 0.0107, + "num_input_tokens_seen": 161072832, + "step": 132375 + }, + { + "epoch": 14.743289898652412, + "grad_norm": 0.0013258870458230376, + "learning_rate": 9.804588652678418e-06, + "loss": 0.0029, + "num_input_tokens_seen": 161078912, + "step": 132380 + }, + { + "epoch": 14.743846753536028, + "grad_norm": 2.5982067584991455, + "learning_rate": 9.802659324208943e-06, + "loss": 0.0875, + "num_input_tokens_seen": 161085120, + "step": 132385 + }, + { + "epoch": 14.744403608419645, + "grad_norm": 0.9245558381080627, + "learning_rate": 9.800730139290546e-06, + "loss": 0.0804, + "num_input_tokens_seen": 161091008, + "step": 132390 + }, + { + "epoch": 14.744960463303263, + "grad_norm": 0.28749552369117737, + "learning_rate": 9.79880109794141e-06, + "loss": 0.0437, + "num_input_tokens_seen": 161096992, + "step": 132395 + }, + { + "epoch": 14.74551731818688, + "grad_norm": 2.167383909225464, + "learning_rate": 9.796872200179789e-06, + "loss": 0.0834, + "num_input_tokens_seen": 161102880, + "step": 132400 + }, + { + "epoch": 14.746074173070499, + "grad_norm": 0.5796123743057251, + "learning_rate": 9.794943446023876e-06, + "loss": 0.0477, + "num_input_tokens_seen": 161108512, + "step": 132405 + }, + { + "epoch": 14.746631027954114, + "grad_norm": 0.005434713326394558, + "learning_rate": 9.793014835491918e-06, + "loss": 0.0829, + "num_input_tokens_seen": 161114816, + "step": 132410 + }, + { + "epoch": 14.747187882837732, + "grad_norm": 0.2898578643798828, + "learning_rate": 9.791086368602118e-06, + "loss": 0.106, + "num_input_tokens_seen": 161121024, + "step": 132415 + }, + { + "epoch": 14.74774473772135, + "grad_norm": 0.47402992844581604, + "learning_rate": 9.78915804537269e-06, + "loss": 0.1766, + "num_input_tokens_seen": 161127072, + "step": 132420 + }, + { + "epoch": 14.748301592604967, + "grad_norm": 0.00046025554183870554, + "learning_rate": 9.787229865821843e-06, + "loss": 0.0015, + "num_input_tokens_seen": 161133344, + "step": 132425 + }, + { + "epoch": 14.748858447488585, + "grad_norm": 0.001677907770499587, + "learning_rate": 9.785301829967807e-06, + "loss": 0.1402, + "num_input_tokens_seen": 161139264, + "step": 132430 + }, + { + "epoch": 14.749415302372201, + "grad_norm": 1.5730619430541992, + "learning_rate": 9.783373937828785e-06, + "loss": 0.126, + "num_input_tokens_seen": 161145280, + "step": 132435 + }, + { + "epoch": 14.749972157255819, + "grad_norm": 0.0006852046935819089, + "learning_rate": 9.781446189422988e-06, + "loss": 0.0169, + "num_input_tokens_seen": 161151392, + "step": 132440 + }, + { + "epoch": 14.750529012139436, + "grad_norm": 0.9824182987213135, + "learning_rate": 9.779518584768615e-06, + "loss": 0.0588, + "num_input_tokens_seen": 161157568, + "step": 132445 + }, + { + "epoch": 14.751085867023054, + "grad_norm": 0.132359579205513, + "learning_rate": 9.777591123883894e-06, + "loss": 0.2044, + "num_input_tokens_seen": 161162304, + "step": 132450 + }, + { + "epoch": 14.751642721906672, + "grad_norm": 0.1155928298830986, + "learning_rate": 9.775663806787011e-06, + "loss": 0.0616, + "num_input_tokens_seen": 161167968, + "step": 132455 + }, + { + "epoch": 14.75219957679029, + "grad_norm": 0.0019281187560409307, + "learning_rate": 9.7737366334962e-06, + "loss": 0.0499, + "num_input_tokens_seen": 161173664, + "step": 132460 + }, + { + "epoch": 14.752756431673905, + "grad_norm": 0.003144904738292098, + "learning_rate": 9.771809604029625e-06, + "loss": 0.0173, + "num_input_tokens_seen": 161179936, + "step": 132465 + }, + { + "epoch": 14.753313286557523, + "grad_norm": 0.03440338745713234, + "learning_rate": 9.769882718405521e-06, + "loss": 0.0909, + "num_input_tokens_seen": 161185760, + "step": 132470 + }, + { + "epoch": 14.75387014144114, + "grad_norm": 0.10334314405918121, + "learning_rate": 9.767955976642065e-06, + "loss": 0.054, + "num_input_tokens_seen": 161192320, + "step": 132475 + }, + { + "epoch": 14.754426996324758, + "grad_norm": 1.141821026802063, + "learning_rate": 9.76602937875748e-06, + "loss": 0.0756, + "num_input_tokens_seen": 161198656, + "step": 132480 + }, + { + "epoch": 14.754983851208376, + "grad_norm": 0.015306277200579643, + "learning_rate": 9.764102924769949e-06, + "loss": 0.0308, + "num_input_tokens_seen": 161204608, + "step": 132485 + }, + { + "epoch": 14.755540706091992, + "grad_norm": 0.5366643071174622, + "learning_rate": 9.762176614697677e-06, + "loss": 0.0985, + "num_input_tokens_seen": 161210912, + "step": 132490 + }, + { + "epoch": 14.75609756097561, + "grad_norm": 0.504765510559082, + "learning_rate": 9.760250448558852e-06, + "loss": 0.0143, + "num_input_tokens_seen": 161216736, + "step": 132495 + }, + { + "epoch": 14.756654415859227, + "grad_norm": 0.14110785722732544, + "learning_rate": 9.758324426371664e-06, + "loss": 0.0357, + "num_input_tokens_seen": 161222784, + "step": 132500 + }, + { + "epoch": 14.757211270742845, + "grad_norm": 0.03184864670038223, + "learning_rate": 9.756398548154322e-06, + "loss": 0.0037, + "num_input_tokens_seen": 161229152, + "step": 132505 + }, + { + "epoch": 14.757768125626463, + "grad_norm": 0.5211915373802185, + "learning_rate": 9.754472813925009e-06, + "loss": 0.0136, + "num_input_tokens_seen": 161235264, + "step": 132510 + }, + { + "epoch": 14.758324980510078, + "grad_norm": 0.04658428952097893, + "learning_rate": 9.752547223701917e-06, + "loss": 0.0739, + "num_input_tokens_seen": 161241344, + "step": 132515 + }, + { + "epoch": 14.758881835393696, + "grad_norm": 0.001779742306098342, + "learning_rate": 9.750621777503218e-06, + "loss": 0.0309, + "num_input_tokens_seen": 161247616, + "step": 132520 + }, + { + "epoch": 14.759438690277314, + "grad_norm": 0.08326119929552078, + "learning_rate": 9.748696475347127e-06, + "loss": 0.0035, + "num_input_tokens_seen": 161253984, + "step": 132525 + }, + { + "epoch": 14.759995545160931, + "grad_norm": 0.04436280205845833, + "learning_rate": 9.746771317251806e-06, + "loss": 0.0545, + "num_input_tokens_seen": 161260320, + "step": 132530 + }, + { + "epoch": 14.76055240004455, + "grad_norm": 0.021588141098618507, + "learning_rate": 9.744846303235469e-06, + "loss": 0.019, + "num_input_tokens_seen": 161266624, + "step": 132535 + }, + { + "epoch": 14.761109254928165, + "grad_norm": 0.0240485779941082, + "learning_rate": 9.742921433316266e-06, + "loss": 0.0569, + "num_input_tokens_seen": 161273024, + "step": 132540 + }, + { + "epoch": 14.761666109811783, + "grad_norm": 0.00024089889484457672, + "learning_rate": 9.740996707512399e-06, + "loss": 0.0114, + "num_input_tokens_seen": 161279200, + "step": 132545 + }, + { + "epoch": 14.7622229646954, + "grad_norm": 0.15477946400642395, + "learning_rate": 9.739072125842036e-06, + "loss": 0.0071, + "num_input_tokens_seen": 161285504, + "step": 132550 + }, + { + "epoch": 14.762779819579018, + "grad_norm": 0.21507155895233154, + "learning_rate": 9.73714768832337e-06, + "loss": 0.0403, + "num_input_tokens_seen": 161291552, + "step": 132555 + }, + { + "epoch": 14.763336674462636, + "grad_norm": 0.16564665734767914, + "learning_rate": 9.735223394974576e-06, + "loss": 0.0514, + "num_input_tokens_seen": 161297632, + "step": 132560 + }, + { + "epoch": 14.763893529346252, + "grad_norm": 0.8816958069801331, + "learning_rate": 9.733299245813826e-06, + "loss": 0.045, + "num_input_tokens_seen": 161303968, + "step": 132565 + }, + { + "epoch": 14.76445038422987, + "grad_norm": 0.09917458146810532, + "learning_rate": 9.731375240859287e-06, + "loss": 0.0342, + "num_input_tokens_seen": 161309888, + "step": 132570 + }, + { + "epoch": 14.765007239113487, + "grad_norm": 0.5383032560348511, + "learning_rate": 9.72945138012915e-06, + "loss": 0.0965, + "num_input_tokens_seen": 161315712, + "step": 132575 + }, + { + "epoch": 14.765564093997105, + "grad_norm": 0.03117230162024498, + "learning_rate": 9.727527663641578e-06, + "loss": 0.0274, + "num_input_tokens_seen": 161321056, + "step": 132580 + }, + { + "epoch": 14.766120948880722, + "grad_norm": 3.3846583366394043, + "learning_rate": 9.725604091414747e-06, + "loss": 0.2764, + "num_input_tokens_seen": 161327072, + "step": 132585 + }, + { + "epoch": 14.766677803764338, + "grad_norm": 0.31835636496543884, + "learning_rate": 9.723680663466811e-06, + "loss": 0.0231, + "num_input_tokens_seen": 161333024, + "step": 132590 + }, + { + "epoch": 14.767234658647956, + "grad_norm": 1.4257208108901978, + "learning_rate": 9.72175737981596e-06, + "loss": 0.1084, + "num_input_tokens_seen": 161339168, + "step": 132595 + }, + { + "epoch": 14.767791513531574, + "grad_norm": 0.6760024428367615, + "learning_rate": 9.719834240480344e-06, + "loss": 0.0578, + "num_input_tokens_seen": 161345312, + "step": 132600 + }, + { + "epoch": 14.768348368415191, + "grad_norm": 0.008789841085672379, + "learning_rate": 9.717911245478153e-06, + "loss": 0.0409, + "num_input_tokens_seen": 161351616, + "step": 132605 + }, + { + "epoch": 14.768905223298809, + "grad_norm": 0.003675157204270363, + "learning_rate": 9.715988394827515e-06, + "loss": 0.0395, + "num_input_tokens_seen": 161357696, + "step": 132610 + }, + { + "epoch": 14.769462078182425, + "grad_norm": 0.12392941862344742, + "learning_rate": 9.714065688546622e-06, + "loss": 0.0273, + "num_input_tokens_seen": 161363968, + "step": 132615 + }, + { + "epoch": 14.770018933066043, + "grad_norm": 0.09809832274913788, + "learning_rate": 9.712143126653617e-06, + "loss": 0.0157, + "num_input_tokens_seen": 161370016, + "step": 132620 + }, + { + "epoch": 14.77057578794966, + "grad_norm": 0.01360080111771822, + "learning_rate": 9.710220709166675e-06, + "loss": 0.0579, + "num_input_tokens_seen": 161376192, + "step": 132625 + }, + { + "epoch": 14.771132642833278, + "grad_norm": 0.01594296284019947, + "learning_rate": 9.708298436103952e-06, + "loss": 0.0562, + "num_input_tokens_seen": 161381760, + "step": 132630 + }, + { + "epoch": 14.771689497716896, + "grad_norm": 0.09931009262800217, + "learning_rate": 9.706376307483597e-06, + "loss": 0.0248, + "num_input_tokens_seen": 161387584, + "step": 132635 + }, + { + "epoch": 14.772246352600511, + "grad_norm": 0.04015407711267471, + "learning_rate": 9.704454323323766e-06, + "loss": 0.0765, + "num_input_tokens_seen": 161393568, + "step": 132640 + }, + { + "epoch": 14.77280320748413, + "grad_norm": 0.08625885844230652, + "learning_rate": 9.702532483642626e-06, + "loss": 0.0102, + "num_input_tokens_seen": 161399808, + "step": 132645 + }, + { + "epoch": 14.773360062367747, + "grad_norm": 0.9225283265113831, + "learning_rate": 9.700610788458322e-06, + "loss": 0.0322, + "num_input_tokens_seen": 161405664, + "step": 132650 + }, + { + "epoch": 14.773916917251364, + "grad_norm": 0.1513906866312027, + "learning_rate": 9.69868923778901e-06, + "loss": 0.0081, + "num_input_tokens_seen": 161411808, + "step": 132655 + }, + { + "epoch": 14.774473772134982, + "grad_norm": 0.03799287602305412, + "learning_rate": 9.696767831652826e-06, + "loss": 0.0014, + "num_input_tokens_seen": 161418144, + "step": 132660 + }, + { + "epoch": 14.775030627018598, + "grad_norm": 0.2568848729133606, + "learning_rate": 9.694846570067939e-06, + "loss": 0.0574, + "num_input_tokens_seen": 161424320, + "step": 132665 + }, + { + "epoch": 14.775587481902216, + "grad_norm": 0.5367859601974487, + "learning_rate": 9.692925453052482e-06, + "loss": 0.0183, + "num_input_tokens_seen": 161430592, + "step": 132670 + }, + { + "epoch": 14.776144336785833, + "grad_norm": 2.007676124572754, + "learning_rate": 9.691004480624621e-06, + "loss": 0.092, + "num_input_tokens_seen": 161436928, + "step": 132675 + }, + { + "epoch": 14.776701191669451, + "grad_norm": 0.01513752993196249, + "learning_rate": 9.689083652802475e-06, + "loss": 0.0636, + "num_input_tokens_seen": 161443104, + "step": 132680 + }, + { + "epoch": 14.777258046553069, + "grad_norm": 0.011814416386187077, + "learning_rate": 9.687162969604207e-06, + "loss": 0.0293, + "num_input_tokens_seen": 161448992, + "step": 132685 + }, + { + "epoch": 14.777814901436686, + "grad_norm": 7.510168506996706e-05, + "learning_rate": 9.685242431047945e-06, + "loss": 0.0256, + "num_input_tokens_seen": 161455360, + "step": 132690 + }, + { + "epoch": 14.778371756320302, + "grad_norm": 0.00542866438627243, + "learning_rate": 9.683322037151849e-06, + "loss": 0.0208, + "num_input_tokens_seen": 161461824, + "step": 132695 + }, + { + "epoch": 14.77892861120392, + "grad_norm": 2.1107091903686523, + "learning_rate": 9.681401787934044e-06, + "loss": 0.0856, + "num_input_tokens_seen": 161467840, + "step": 132700 + }, + { + "epoch": 14.779485466087538, + "grad_norm": 0.0012747637229040265, + "learning_rate": 9.679481683412674e-06, + "loss": 0.0465, + "num_input_tokens_seen": 161473504, + "step": 132705 + }, + { + "epoch": 14.780042320971155, + "grad_norm": 0.012269175611436367, + "learning_rate": 9.677561723605866e-06, + "loss": 0.0674, + "num_input_tokens_seen": 161479840, + "step": 132710 + }, + { + "epoch": 14.780599175854773, + "grad_norm": 0.0037476022262126207, + "learning_rate": 9.675641908531774e-06, + "loss": 0.0309, + "num_input_tokens_seen": 161485920, + "step": 132715 + }, + { + "epoch": 14.781156030738389, + "grad_norm": 0.10540423542261124, + "learning_rate": 9.673722238208518e-06, + "loss": 0.0176, + "num_input_tokens_seen": 161492288, + "step": 132720 + }, + { + "epoch": 14.781712885622007, + "grad_norm": 0.2599868178367615, + "learning_rate": 9.671802712654238e-06, + "loss": 0.0466, + "num_input_tokens_seen": 161497856, + "step": 132725 + }, + { + "epoch": 14.782269740505624, + "grad_norm": 0.0004753880202770233, + "learning_rate": 9.66988333188705e-06, + "loss": 0.1448, + "num_input_tokens_seen": 161504128, + "step": 132730 + }, + { + "epoch": 14.782826595389242, + "grad_norm": 0.021811077371239662, + "learning_rate": 9.667964095925109e-06, + "loss": 0.0133, + "num_input_tokens_seen": 161510016, + "step": 132735 + }, + { + "epoch": 14.78338345027286, + "grad_norm": 2.0470871925354004, + "learning_rate": 9.66604500478652e-06, + "loss": 0.1872, + "num_input_tokens_seen": 161516160, + "step": 132740 + }, + { + "epoch": 14.783940305156476, + "grad_norm": 0.05456696078181267, + "learning_rate": 9.664126058489428e-06, + "loss": 0.0011, + "num_input_tokens_seen": 161522144, + "step": 132745 + }, + { + "epoch": 14.784497160040093, + "grad_norm": 0.004282830283045769, + "learning_rate": 9.662207257051956e-06, + "loss": 0.0581, + "num_input_tokens_seen": 161528288, + "step": 132750 + }, + { + "epoch": 14.78505401492371, + "grad_norm": 0.1752147376537323, + "learning_rate": 9.660288600492223e-06, + "loss": 0.009, + "num_input_tokens_seen": 161534592, + "step": 132755 + }, + { + "epoch": 14.785610869807329, + "grad_norm": 8.810251165414229e-05, + "learning_rate": 9.658370088828345e-06, + "loss": 0.0698, + "num_input_tokens_seen": 161540064, + "step": 132760 + }, + { + "epoch": 14.786167724690946, + "grad_norm": 0.12186447530984879, + "learning_rate": 9.656451722078463e-06, + "loss": 0.0135, + "num_input_tokens_seen": 161545248, + "step": 132765 + }, + { + "epoch": 14.786724579574562, + "grad_norm": 0.0181170292198658, + "learning_rate": 9.654533500260687e-06, + "loss": 0.0032, + "num_input_tokens_seen": 161551424, + "step": 132770 + }, + { + "epoch": 14.78728143445818, + "grad_norm": 0.0031374439131468534, + "learning_rate": 9.652615423393136e-06, + "loss": 0.0074, + "num_input_tokens_seen": 161557632, + "step": 132775 + }, + { + "epoch": 14.787838289341797, + "grad_norm": 0.0010452549904584885, + "learning_rate": 9.650697491493921e-06, + "loss": 0.0646, + "num_input_tokens_seen": 161563744, + "step": 132780 + }, + { + "epoch": 14.788395144225415, + "grad_norm": 0.021670537069439888, + "learning_rate": 9.648779704581173e-06, + "loss": 0.0114, + "num_input_tokens_seen": 161570048, + "step": 132785 + }, + { + "epoch": 14.788951999109033, + "grad_norm": 0.0008083684951998293, + "learning_rate": 9.646862062672993e-06, + "loss": 0.0351, + "num_input_tokens_seen": 161576384, + "step": 132790 + }, + { + "epoch": 14.789508853992649, + "grad_norm": 0.10735561698675156, + "learning_rate": 9.644944565787517e-06, + "loss": 0.0877, + "num_input_tokens_seen": 161582656, + "step": 132795 + }, + { + "epoch": 14.790065708876266, + "grad_norm": 1.2085660696029663, + "learning_rate": 9.643027213942826e-06, + "loss": 0.0436, + "num_input_tokens_seen": 161588640, + "step": 132800 + }, + { + "epoch": 14.790622563759884, + "grad_norm": 0.011450852267444134, + "learning_rate": 9.641110007157056e-06, + "loss": 0.0018, + "num_input_tokens_seen": 161594720, + "step": 132805 + }, + { + "epoch": 14.791179418643502, + "grad_norm": 0.46363523602485657, + "learning_rate": 9.639192945448297e-06, + "loss": 0.0118, + "num_input_tokens_seen": 161600544, + "step": 132810 + }, + { + "epoch": 14.79173627352712, + "grad_norm": 0.04950612038373947, + "learning_rate": 9.637276028834676e-06, + "loss": 0.0644, + "num_input_tokens_seen": 161606848, + "step": 132815 + }, + { + "epoch": 14.792293128410737, + "grad_norm": 0.06425261497497559, + "learning_rate": 9.635359257334292e-06, + "loss": 0.0255, + "num_input_tokens_seen": 161612832, + "step": 132820 + }, + { + "epoch": 14.792849983294353, + "grad_norm": 0.035923268646001816, + "learning_rate": 9.63344263096525e-06, + "loss": 0.0733, + "num_input_tokens_seen": 161618400, + "step": 132825 + }, + { + "epoch": 14.79340683817797, + "grad_norm": 0.3501397967338562, + "learning_rate": 9.631526149745646e-06, + "loss": 0.0496, + "num_input_tokens_seen": 161624480, + "step": 132830 + }, + { + "epoch": 14.793963693061588, + "grad_norm": 0.3523484170436859, + "learning_rate": 9.6296098136936e-06, + "loss": 0.0039, + "num_input_tokens_seen": 161630496, + "step": 132835 + }, + { + "epoch": 14.794520547945206, + "grad_norm": 2.867438316345215, + "learning_rate": 9.627693622827199e-06, + "loss": 0.1319, + "num_input_tokens_seen": 161636384, + "step": 132840 + }, + { + "epoch": 14.795077402828824, + "grad_norm": 0.02918313257396221, + "learning_rate": 9.625777577164553e-06, + "loss": 0.0545, + "num_input_tokens_seen": 161642464, + "step": 132845 + }, + { + "epoch": 14.79563425771244, + "grad_norm": 0.45785272121429443, + "learning_rate": 9.623861676723744e-06, + "loss": 0.0045, + "num_input_tokens_seen": 161648832, + "step": 132850 + }, + { + "epoch": 14.796191112596057, + "grad_norm": 0.06850910931825638, + "learning_rate": 9.62194592152289e-06, + "loss": 0.0335, + "num_input_tokens_seen": 161654720, + "step": 132855 + }, + { + "epoch": 14.796747967479675, + "grad_norm": 0.01865210011601448, + "learning_rate": 9.62003031158007e-06, + "loss": 0.0962, + "num_input_tokens_seen": 161660928, + "step": 132860 + }, + { + "epoch": 14.797304822363293, + "grad_norm": 0.007742609828710556, + "learning_rate": 9.6181148469134e-06, + "loss": 0.0064, + "num_input_tokens_seen": 161667200, + "step": 132865 + }, + { + "epoch": 14.79786167724691, + "grad_norm": 0.00958375446498394, + "learning_rate": 9.616199527540946e-06, + "loss": 0.0018, + "num_input_tokens_seen": 161673664, + "step": 132870 + }, + { + "epoch": 14.798418532130526, + "grad_norm": 0.8223727941513062, + "learning_rate": 9.614284353480818e-06, + "loss": 0.1559, + "num_input_tokens_seen": 161679904, + "step": 132875 + }, + { + "epoch": 14.798975387014144, + "grad_norm": 0.006357818376272917, + "learning_rate": 9.612369324751092e-06, + "loss": 0.0739, + "num_input_tokens_seen": 161685952, + "step": 132880 + }, + { + "epoch": 14.799532241897762, + "grad_norm": 0.18812505900859833, + "learning_rate": 9.610454441369876e-06, + "loss": 0.0066, + "num_input_tokens_seen": 161692352, + "step": 132885 + }, + { + "epoch": 14.80008909678138, + "grad_norm": 0.6380557417869568, + "learning_rate": 9.608539703355249e-06, + "loss": 0.0113, + "num_input_tokens_seen": 161697696, + "step": 132890 + }, + { + "epoch": 14.800645951664997, + "grad_norm": 0.20702864229679108, + "learning_rate": 9.606625110725296e-06, + "loss": 0.0043, + "num_input_tokens_seen": 161704224, + "step": 132895 + }, + { + "epoch": 14.801202806548613, + "grad_norm": 0.030835403129458427, + "learning_rate": 9.604710663498098e-06, + "loss": 0.068, + "num_input_tokens_seen": 161710400, + "step": 132900 + }, + { + "epoch": 14.80175966143223, + "grad_norm": 0.20963214337825775, + "learning_rate": 9.602796361691738e-06, + "loss": 0.015, + "num_input_tokens_seen": 161716352, + "step": 132905 + }, + { + "epoch": 14.802316516315848, + "grad_norm": 0.013096844777464867, + "learning_rate": 9.60088220532431e-06, + "loss": 0.0588, + "num_input_tokens_seen": 161722400, + "step": 132910 + }, + { + "epoch": 14.802873371199466, + "grad_norm": 0.15302105247974396, + "learning_rate": 9.598968194413885e-06, + "loss": 0.1375, + "num_input_tokens_seen": 161728992, + "step": 132915 + }, + { + "epoch": 14.803430226083083, + "grad_norm": 0.28270411491394043, + "learning_rate": 9.597054328978546e-06, + "loss": 0.0683, + "num_input_tokens_seen": 161735200, + "step": 132920 + }, + { + "epoch": 14.8039870809667, + "grad_norm": 0.23386019468307495, + "learning_rate": 9.595140609036362e-06, + "loss": 0.0169, + "num_input_tokens_seen": 161741440, + "step": 132925 + }, + { + "epoch": 14.804543935850317, + "grad_norm": 0.10254216194152832, + "learning_rate": 9.593227034605423e-06, + "loss": 0.0251, + "num_input_tokens_seen": 161747296, + "step": 132930 + }, + { + "epoch": 14.805100790733935, + "grad_norm": 0.02021593041718006, + "learning_rate": 9.591313605703792e-06, + "loss": 0.0293, + "num_input_tokens_seen": 161753344, + "step": 132935 + }, + { + "epoch": 14.805657645617552, + "grad_norm": 0.0005015229107812047, + "learning_rate": 9.589400322349567e-06, + "loss": 0.0204, + "num_input_tokens_seen": 161759552, + "step": 132940 + }, + { + "epoch": 14.80621450050117, + "grad_norm": 0.060290977358818054, + "learning_rate": 9.587487184560783e-06, + "loss": 0.1522, + "num_input_tokens_seen": 161765632, + "step": 132945 + }, + { + "epoch": 14.806771355384786, + "grad_norm": 0.0011933848727494478, + "learning_rate": 9.585574192355542e-06, + "loss": 0.1388, + "num_input_tokens_seen": 161771584, + "step": 132950 + }, + { + "epoch": 14.807328210268404, + "grad_norm": 0.01963215135037899, + "learning_rate": 9.583661345751893e-06, + "loss": 0.0183, + "num_input_tokens_seen": 161777824, + "step": 132955 + }, + { + "epoch": 14.807885065152021, + "grad_norm": 1.2634681463241577, + "learning_rate": 9.581748644767922e-06, + "loss": 0.0464, + "num_input_tokens_seen": 161783936, + "step": 132960 + }, + { + "epoch": 14.808441920035639, + "grad_norm": 1.8517403602600098, + "learning_rate": 9.579836089421688e-06, + "loss": 0.0178, + "num_input_tokens_seen": 161790240, + "step": 132965 + }, + { + "epoch": 14.808998774919257, + "grad_norm": 0.010055366903543472, + "learning_rate": 9.577923679731257e-06, + "loss": 0.018, + "num_input_tokens_seen": 161796768, + "step": 132970 + }, + { + "epoch": 14.809555629802873, + "grad_norm": 0.9267547130584717, + "learning_rate": 9.576011415714683e-06, + "loss": 0.0477, + "num_input_tokens_seen": 161803104, + "step": 132975 + }, + { + "epoch": 14.81011248468649, + "grad_norm": 0.19416898488998413, + "learning_rate": 9.574099297390048e-06, + "loss": 0.0855, + "num_input_tokens_seen": 161809280, + "step": 132980 + }, + { + "epoch": 14.810669339570108, + "grad_norm": 0.09233973920345306, + "learning_rate": 9.572187324775406e-06, + "loss": 0.0856, + "num_input_tokens_seen": 161815040, + "step": 132985 + }, + { + "epoch": 14.811226194453726, + "grad_norm": 0.0034933879505842924, + "learning_rate": 9.570275497888815e-06, + "loss": 0.011, + "num_input_tokens_seen": 161821216, + "step": 132990 + }, + { + "epoch": 14.811783049337343, + "grad_norm": 1.9727866649627686, + "learning_rate": 9.568363816748325e-06, + "loss": 0.2329, + "num_input_tokens_seen": 161827328, + "step": 132995 + }, + { + "epoch": 14.81233990422096, + "grad_norm": 0.40959876775741577, + "learning_rate": 9.566452281372007e-06, + "loss": 0.0337, + "num_input_tokens_seen": 161833536, + "step": 133000 + }, + { + "epoch": 14.812896759104577, + "grad_norm": 0.5380585193634033, + "learning_rate": 9.564540891777907e-06, + "loss": 0.0329, + "num_input_tokens_seen": 161839680, + "step": 133005 + }, + { + "epoch": 14.813453613988194, + "grad_norm": 0.14627963304519653, + "learning_rate": 9.562629647984103e-06, + "loss": 0.057, + "num_input_tokens_seen": 161845312, + "step": 133010 + }, + { + "epoch": 14.814010468871812, + "grad_norm": 0.005503777414560318, + "learning_rate": 9.560718550008612e-06, + "loss": 0.0151, + "num_input_tokens_seen": 161851488, + "step": 133015 + }, + { + "epoch": 14.81456732375543, + "grad_norm": 0.01762508973479271, + "learning_rate": 9.558807597869512e-06, + "loss": 0.0145, + "num_input_tokens_seen": 161857568, + "step": 133020 + }, + { + "epoch": 14.815124178639046, + "grad_norm": 0.43328097462654114, + "learning_rate": 9.556896791584838e-06, + "loss": 0.0144, + "num_input_tokens_seen": 161863968, + "step": 133025 + }, + { + "epoch": 14.815681033522663, + "grad_norm": 0.007185924798250198, + "learning_rate": 9.554986131172655e-06, + "loss": 0.034, + "num_input_tokens_seen": 161870432, + "step": 133030 + }, + { + "epoch": 14.816237888406281, + "grad_norm": 0.03155156597495079, + "learning_rate": 9.553075616651002e-06, + "loss": 0.0052, + "num_input_tokens_seen": 161876416, + "step": 133035 + }, + { + "epoch": 14.816794743289899, + "grad_norm": 1.762986421585083, + "learning_rate": 9.551165248037927e-06, + "loss": 0.0466, + "num_input_tokens_seen": 161882272, + "step": 133040 + }, + { + "epoch": 14.817351598173516, + "grad_norm": 0.1432783603668213, + "learning_rate": 9.549255025351464e-06, + "loss": 0.0183, + "num_input_tokens_seen": 161888160, + "step": 133045 + }, + { + "epoch": 14.817908453057134, + "grad_norm": 4.724229335784912, + "learning_rate": 9.547344948609673e-06, + "loss": 0.0925, + "num_input_tokens_seen": 161894528, + "step": 133050 + }, + { + "epoch": 14.81846530794075, + "grad_norm": 0.19269251823425293, + "learning_rate": 9.545435017830592e-06, + "loss": 0.0274, + "num_input_tokens_seen": 161900832, + "step": 133055 + }, + { + "epoch": 14.819022162824368, + "grad_norm": 0.011439766734838486, + "learning_rate": 9.543525233032258e-06, + "loss": 0.0848, + "num_input_tokens_seen": 161906944, + "step": 133060 + }, + { + "epoch": 14.819579017707985, + "grad_norm": 0.08350127935409546, + "learning_rate": 9.541615594232703e-06, + "loss": 0.0169, + "num_input_tokens_seen": 161912928, + "step": 133065 + }, + { + "epoch": 14.820135872591603, + "grad_norm": 0.017113225534558296, + "learning_rate": 9.539706101449982e-06, + "loss": 0.0601, + "num_input_tokens_seen": 161919168, + "step": 133070 + }, + { + "epoch": 14.82069272747522, + "grad_norm": 1.3451998233795166, + "learning_rate": 9.537796754702117e-06, + "loss": 0.018, + "num_input_tokens_seen": 161925152, + "step": 133075 + }, + { + "epoch": 14.821249582358837, + "grad_norm": 0.5032172203063965, + "learning_rate": 9.535887554007155e-06, + "loss": 0.0372, + "num_input_tokens_seen": 161931264, + "step": 133080 + }, + { + "epoch": 14.821806437242454, + "grad_norm": 0.00037815363612025976, + "learning_rate": 9.533978499383128e-06, + "loss": 0.0008, + "num_input_tokens_seen": 161937696, + "step": 133085 + }, + { + "epoch": 14.822363292126072, + "grad_norm": 0.02153559774160385, + "learning_rate": 9.532069590848064e-06, + "loss": 0.0972, + "num_input_tokens_seen": 161943840, + "step": 133090 + }, + { + "epoch": 14.82292014700969, + "grad_norm": 0.3285917639732361, + "learning_rate": 9.530160828419987e-06, + "loss": 0.0696, + "num_input_tokens_seen": 161949632, + "step": 133095 + }, + { + "epoch": 14.823477001893307, + "grad_norm": 0.2963559925556183, + "learning_rate": 9.528252212116945e-06, + "loss": 0.0049, + "num_input_tokens_seen": 161955776, + "step": 133100 + }, + { + "epoch": 14.824033856776923, + "grad_norm": 1.2130632400512695, + "learning_rate": 9.526343741956953e-06, + "loss": 0.0679, + "num_input_tokens_seen": 161961760, + "step": 133105 + }, + { + "epoch": 14.82459071166054, + "grad_norm": 0.003041682066395879, + "learning_rate": 9.524435417958044e-06, + "loss": 0.0521, + "num_input_tokens_seen": 161967904, + "step": 133110 + }, + { + "epoch": 14.825147566544159, + "grad_norm": 1.4260770082473755, + "learning_rate": 9.522527240138235e-06, + "loss": 0.0818, + "num_input_tokens_seen": 161973920, + "step": 133115 + }, + { + "epoch": 14.825704421427776, + "grad_norm": 0.412322998046875, + "learning_rate": 9.520619208515561e-06, + "loss": 0.069, + "num_input_tokens_seen": 161979392, + "step": 133120 + }, + { + "epoch": 14.826261276311394, + "grad_norm": 0.28554216027259827, + "learning_rate": 9.518711323108042e-06, + "loss": 0.0233, + "num_input_tokens_seen": 161985728, + "step": 133125 + }, + { + "epoch": 14.82681813119501, + "grad_norm": 0.3167073428630829, + "learning_rate": 9.516803583933697e-06, + "loss": 0.0756, + "num_input_tokens_seen": 161991968, + "step": 133130 + }, + { + "epoch": 14.827374986078627, + "grad_norm": 0.4458216428756714, + "learning_rate": 9.514895991010541e-06, + "loss": 0.0875, + "num_input_tokens_seen": 161998240, + "step": 133135 + }, + { + "epoch": 14.827931840962245, + "grad_norm": 1.639612078666687, + "learning_rate": 9.512988544356605e-06, + "loss": 0.0524, + "num_input_tokens_seen": 162003968, + "step": 133140 + }, + { + "epoch": 14.828488695845863, + "grad_norm": 1.448319911956787, + "learning_rate": 9.511081243989894e-06, + "loss": 0.0972, + "num_input_tokens_seen": 162010112, + "step": 133145 + }, + { + "epoch": 14.82904555072948, + "grad_norm": 1.5951967239379883, + "learning_rate": 9.509174089928435e-06, + "loss": 0.05, + "num_input_tokens_seen": 162016544, + "step": 133150 + }, + { + "epoch": 14.829602405613098, + "grad_norm": 1.2630465030670166, + "learning_rate": 9.50726708219024e-06, + "loss": 0.0272, + "num_input_tokens_seen": 162022816, + "step": 133155 + }, + { + "epoch": 14.830159260496714, + "grad_norm": 0.6313053369522095, + "learning_rate": 9.505360220793322e-06, + "loss": 0.1525, + "num_input_tokens_seen": 162028736, + "step": 133160 + }, + { + "epoch": 14.830716115380332, + "grad_norm": 0.05732393264770508, + "learning_rate": 9.50345350575568e-06, + "loss": 0.0144, + "num_input_tokens_seen": 162035008, + "step": 133165 + }, + { + "epoch": 14.83127297026395, + "grad_norm": 0.38911527395248413, + "learning_rate": 9.501546937095343e-06, + "loss": 0.0418, + "num_input_tokens_seen": 162040928, + "step": 133170 + }, + { + "epoch": 14.831829825147567, + "grad_norm": 0.02871818281710148, + "learning_rate": 9.499640514830316e-06, + "loss": 0.0985, + "num_input_tokens_seen": 162046688, + "step": 133175 + }, + { + "epoch": 14.832386680031185, + "grad_norm": 0.008418894372880459, + "learning_rate": 9.497734238978601e-06, + "loss": 0.027, + "num_input_tokens_seen": 162052800, + "step": 133180 + }, + { + "epoch": 14.8329435349148, + "grad_norm": 0.0009158666362054646, + "learning_rate": 9.495828109558197e-06, + "loss": 0.0013, + "num_input_tokens_seen": 162058848, + "step": 133185 + }, + { + "epoch": 14.833500389798418, + "grad_norm": 0.0399448536336422, + "learning_rate": 9.493922126587127e-06, + "loss": 0.0036, + "num_input_tokens_seen": 162065280, + "step": 133190 + }, + { + "epoch": 14.834057244682036, + "grad_norm": 0.1304769217967987, + "learning_rate": 9.492016290083376e-06, + "loss": 0.0479, + "num_input_tokens_seen": 162071520, + "step": 133195 + }, + { + "epoch": 14.834614099565654, + "grad_norm": 0.2166639268398285, + "learning_rate": 9.490110600064975e-06, + "loss": 0.0621, + "num_input_tokens_seen": 162077664, + "step": 133200 + }, + { + "epoch": 14.835170954449271, + "grad_norm": 0.001184841152280569, + "learning_rate": 9.488205056549887e-06, + "loss": 0.1433, + "num_input_tokens_seen": 162083936, + "step": 133205 + }, + { + "epoch": 14.835727809332887, + "grad_norm": 0.0009164498769678175, + "learning_rate": 9.486299659556138e-06, + "loss": 0.0286, + "num_input_tokens_seen": 162090048, + "step": 133210 + }, + { + "epoch": 14.836284664216505, + "grad_norm": 0.45573729276657104, + "learning_rate": 9.484394409101713e-06, + "loss": 0.0203, + "num_input_tokens_seen": 162095808, + "step": 133215 + }, + { + "epoch": 14.836841519100123, + "grad_norm": 0.03916889429092407, + "learning_rate": 9.48248930520462e-06, + "loss": 0.0972, + "num_input_tokens_seen": 162101536, + "step": 133220 + }, + { + "epoch": 14.83739837398374, + "grad_norm": 0.32044529914855957, + "learning_rate": 9.480584347882848e-06, + "loss": 0.0635, + "num_input_tokens_seen": 162107328, + "step": 133225 + }, + { + "epoch": 14.837955228867358, + "grad_norm": 0.3484245240688324, + "learning_rate": 9.478679537154392e-06, + "loss": 0.0259, + "num_input_tokens_seen": 162113152, + "step": 133230 + }, + { + "epoch": 14.838512083750974, + "grad_norm": 0.0007138255168683827, + "learning_rate": 9.476774873037234e-06, + "loss": 0.0263, + "num_input_tokens_seen": 162119136, + "step": 133235 + }, + { + "epoch": 14.839068938634592, + "grad_norm": 0.45496025681495667, + "learning_rate": 9.474870355549382e-06, + "loss": 0.0485, + "num_input_tokens_seen": 162125312, + "step": 133240 + }, + { + "epoch": 14.83962579351821, + "grad_norm": 0.18709971010684967, + "learning_rate": 9.472965984708818e-06, + "loss": 0.0387, + "num_input_tokens_seen": 162131488, + "step": 133245 + }, + { + "epoch": 14.840182648401827, + "grad_norm": 0.5475391745567322, + "learning_rate": 9.47106176053353e-06, + "loss": 0.0892, + "num_input_tokens_seen": 162137600, + "step": 133250 + }, + { + "epoch": 14.840739503285445, + "grad_norm": 0.23315772414207458, + "learning_rate": 9.469157683041499e-06, + "loss": 0.0035, + "num_input_tokens_seen": 162143616, + "step": 133255 + }, + { + "epoch": 14.84129635816906, + "grad_norm": 0.0016963975504040718, + "learning_rate": 9.467253752250724e-06, + "loss": 0.0142, + "num_input_tokens_seen": 162150048, + "step": 133260 + }, + { + "epoch": 14.841853213052678, + "grad_norm": 0.011127596721053123, + "learning_rate": 9.465349968179174e-06, + "loss": 0.0864, + "num_input_tokens_seen": 162156288, + "step": 133265 + }, + { + "epoch": 14.842410067936296, + "grad_norm": 1.4285606145858765, + "learning_rate": 9.463446330844854e-06, + "loss": 0.0335, + "num_input_tokens_seen": 162162528, + "step": 133270 + }, + { + "epoch": 14.842966922819913, + "grad_norm": 0.11470899730920792, + "learning_rate": 9.461542840265717e-06, + "loss": 0.0157, + "num_input_tokens_seen": 162168576, + "step": 133275 + }, + { + "epoch": 14.843523777703531, + "grad_norm": 0.03630711883306503, + "learning_rate": 9.459639496459766e-06, + "loss": 0.0263, + "num_input_tokens_seen": 162174880, + "step": 133280 + }, + { + "epoch": 14.844080632587147, + "grad_norm": 1.5670892000198364, + "learning_rate": 9.45773629944496e-06, + "loss": 0.0285, + "num_input_tokens_seen": 162181056, + "step": 133285 + }, + { + "epoch": 14.844637487470765, + "grad_norm": 0.00048596959095448256, + "learning_rate": 9.455833249239296e-06, + "loss": 0.0097, + "num_input_tokens_seen": 162186784, + "step": 133290 + }, + { + "epoch": 14.845194342354382, + "grad_norm": 1.46393620967865, + "learning_rate": 9.453930345860742e-06, + "loss": 0.0604, + "num_input_tokens_seen": 162192960, + "step": 133295 + }, + { + "epoch": 14.845751197238, + "grad_norm": 0.01632876694202423, + "learning_rate": 9.45202758932727e-06, + "loss": 0.0643, + "num_input_tokens_seen": 162199456, + "step": 133300 + }, + { + "epoch": 14.846308052121618, + "grad_norm": 0.6346536874771118, + "learning_rate": 9.450124979656855e-06, + "loss": 0.0141, + "num_input_tokens_seen": 162205824, + "step": 133305 + }, + { + "epoch": 14.846864907005234, + "grad_norm": 0.000333413016051054, + "learning_rate": 9.448222516867459e-06, + "loss": 0.0813, + "num_input_tokens_seen": 162211456, + "step": 133310 + }, + { + "epoch": 14.847421761888851, + "grad_norm": 2.0782909393310547, + "learning_rate": 9.446320200977069e-06, + "loss": 0.1049, + "num_input_tokens_seen": 162217376, + "step": 133315 + }, + { + "epoch": 14.847978616772469, + "grad_norm": 0.004651170689612627, + "learning_rate": 9.444418032003646e-06, + "loss": 0.0741, + "num_input_tokens_seen": 162223296, + "step": 133320 + }, + { + "epoch": 14.848535471656087, + "grad_norm": 1.7348146438598633, + "learning_rate": 9.44251600996516e-06, + "loss": 0.2051, + "num_input_tokens_seen": 162229760, + "step": 133325 + }, + { + "epoch": 14.849092326539704, + "grad_norm": 0.0013844072818756104, + "learning_rate": 9.440614134879564e-06, + "loss": 0.0861, + "num_input_tokens_seen": 162235520, + "step": 133330 + }, + { + "epoch": 14.84964918142332, + "grad_norm": 0.26737484335899353, + "learning_rate": 9.438712406764843e-06, + "loss": 0.0273, + "num_input_tokens_seen": 162241536, + "step": 133335 + }, + { + "epoch": 14.850206036306938, + "grad_norm": 0.0005890215397812426, + "learning_rate": 9.436810825638941e-06, + "loss": 0.0927, + "num_input_tokens_seen": 162247456, + "step": 133340 + }, + { + "epoch": 14.850762891190556, + "grad_norm": 0.0006590995471924543, + "learning_rate": 9.43490939151985e-06, + "loss": 0.1195, + "num_input_tokens_seen": 162253344, + "step": 133345 + }, + { + "epoch": 14.851319746074173, + "grad_norm": 0.035320945084095, + "learning_rate": 9.433008104425489e-06, + "loss": 0.0348, + "num_input_tokens_seen": 162259488, + "step": 133350 + }, + { + "epoch": 14.851876600957791, + "grad_norm": 0.1127525120973587, + "learning_rate": 9.431106964373851e-06, + "loss": 0.0489, + "num_input_tokens_seen": 162265696, + "step": 133355 + }, + { + "epoch": 14.852433455841407, + "grad_norm": 0.9361832737922668, + "learning_rate": 9.42920597138287e-06, + "loss": 0.0505, + "num_input_tokens_seen": 162272192, + "step": 133360 + }, + { + "epoch": 14.852990310725025, + "grad_norm": 0.0010359070729464293, + "learning_rate": 9.427305125470524e-06, + "loss": 0.0544, + "num_input_tokens_seen": 162278208, + "step": 133365 + }, + { + "epoch": 14.853547165608642, + "grad_norm": 1.2961740493774414, + "learning_rate": 9.425404426654755e-06, + "loss": 0.0227, + "num_input_tokens_seen": 162284352, + "step": 133370 + }, + { + "epoch": 14.85410402049226, + "grad_norm": 1.0660110712051392, + "learning_rate": 9.42350387495352e-06, + "loss": 0.1011, + "num_input_tokens_seen": 162290464, + "step": 133375 + }, + { + "epoch": 14.854660875375878, + "grad_norm": 0.011758674867451191, + "learning_rate": 9.421603470384766e-06, + "loss": 0.0157, + "num_input_tokens_seen": 162296448, + "step": 133380 + }, + { + "epoch": 14.855217730259493, + "grad_norm": 0.03771655634045601, + "learning_rate": 9.419703212966452e-06, + "loss": 0.0204, + "num_input_tokens_seen": 162302368, + "step": 133385 + }, + { + "epoch": 14.855774585143111, + "grad_norm": 0.005187998525798321, + "learning_rate": 9.417803102716527e-06, + "loss": 0.016, + "num_input_tokens_seen": 162308192, + "step": 133390 + }, + { + "epoch": 14.856331440026729, + "grad_norm": 1.4943047761917114, + "learning_rate": 9.415903139652935e-06, + "loss": 0.0625, + "num_input_tokens_seen": 162314368, + "step": 133395 + }, + { + "epoch": 14.856888294910346, + "grad_norm": 0.0009658317430876195, + "learning_rate": 9.414003323793616e-06, + "loss": 0.0078, + "num_input_tokens_seen": 162320608, + "step": 133400 + }, + { + "epoch": 14.857445149793964, + "grad_norm": 0.7155598998069763, + "learning_rate": 9.41210365515653e-06, + "loss": 0.0395, + "num_input_tokens_seen": 162326976, + "step": 133405 + }, + { + "epoch": 14.858002004677582, + "grad_norm": 0.010282110422849655, + "learning_rate": 9.410204133759604e-06, + "loss": 0.0048, + "num_input_tokens_seen": 162333536, + "step": 133410 + }, + { + "epoch": 14.858558859561198, + "grad_norm": 0.05700451135635376, + "learning_rate": 9.40830475962081e-06, + "loss": 0.1011, + "num_input_tokens_seen": 162339776, + "step": 133415 + }, + { + "epoch": 14.859115714444815, + "grad_norm": 0.8314839005470276, + "learning_rate": 9.406405532758051e-06, + "loss": 0.0407, + "num_input_tokens_seen": 162345856, + "step": 133420 + }, + { + "epoch": 14.859672569328433, + "grad_norm": 0.08993212133646011, + "learning_rate": 9.404506453189294e-06, + "loss": 0.0553, + "num_input_tokens_seen": 162351712, + "step": 133425 + }, + { + "epoch": 14.86022942421205, + "grad_norm": 0.4403533935546875, + "learning_rate": 9.40260752093246e-06, + "loss": 0.089, + "num_input_tokens_seen": 162357344, + "step": 133430 + }, + { + "epoch": 14.860786279095668, + "grad_norm": 0.18133093416690826, + "learning_rate": 9.400708736005503e-06, + "loss": 0.0733, + "num_input_tokens_seen": 162363616, + "step": 133435 + }, + { + "epoch": 14.861343133979284, + "grad_norm": 0.000287941686110571, + "learning_rate": 9.39881009842635e-06, + "loss": 0.0031, + "num_input_tokens_seen": 162369984, + "step": 133440 + }, + { + "epoch": 14.861899988862902, + "grad_norm": 0.0019465337973088026, + "learning_rate": 9.396911608212936e-06, + "loss": 0.0234, + "num_input_tokens_seen": 162376032, + "step": 133445 + }, + { + "epoch": 14.86245684374652, + "grad_norm": 0.003872158471494913, + "learning_rate": 9.395013265383182e-06, + "loss": 0.0088, + "num_input_tokens_seen": 162382272, + "step": 133450 + }, + { + "epoch": 14.863013698630137, + "grad_norm": 0.010843402706086636, + "learning_rate": 9.393115069955041e-06, + "loss": 0.0065, + "num_input_tokens_seen": 162388288, + "step": 133455 + }, + { + "epoch": 14.863570553513755, + "grad_norm": 0.009658681228756905, + "learning_rate": 9.39121702194643e-06, + "loss": 0.0076, + "num_input_tokens_seen": 162394240, + "step": 133460 + }, + { + "epoch": 14.864127408397371, + "grad_norm": 0.007481010165065527, + "learning_rate": 9.389319121375282e-06, + "loss": 0.0466, + "num_input_tokens_seen": 162400160, + "step": 133465 + }, + { + "epoch": 14.864684263280989, + "grad_norm": 0.05456266924738884, + "learning_rate": 9.387421368259514e-06, + "loss": 0.0181, + "num_input_tokens_seen": 162406048, + "step": 133470 + }, + { + "epoch": 14.865241118164606, + "grad_norm": 0.47774818539619446, + "learning_rate": 9.385523762617066e-06, + "loss": 0.0541, + "num_input_tokens_seen": 162411456, + "step": 133475 + }, + { + "epoch": 14.865797973048224, + "grad_norm": 0.8611634969711304, + "learning_rate": 9.383626304465848e-06, + "loss": 0.0127, + "num_input_tokens_seen": 162417824, + "step": 133480 + }, + { + "epoch": 14.866354827931842, + "grad_norm": 0.0002366041298955679, + "learning_rate": 9.3817289938238e-06, + "loss": 0.0753, + "num_input_tokens_seen": 162424320, + "step": 133485 + }, + { + "epoch": 14.866911682815457, + "grad_norm": 0.15890663862228394, + "learning_rate": 9.379831830708834e-06, + "loss": 0.0064, + "num_input_tokens_seen": 162430432, + "step": 133490 + }, + { + "epoch": 14.867468537699075, + "grad_norm": 1.6570250988006592, + "learning_rate": 9.377934815138872e-06, + "loss": 0.0808, + "num_input_tokens_seen": 162436064, + "step": 133495 + }, + { + "epoch": 14.868025392582693, + "grad_norm": 0.15242132544517517, + "learning_rate": 9.376037947131824e-06, + "loss": 0.0043, + "num_input_tokens_seen": 162442240, + "step": 133500 + }, + { + "epoch": 14.86858224746631, + "grad_norm": 0.38094082474708557, + "learning_rate": 9.374141226705622e-06, + "loss": 0.0583, + "num_input_tokens_seen": 162448384, + "step": 133505 + }, + { + "epoch": 14.869139102349928, + "grad_norm": 0.0014684482011944056, + "learning_rate": 9.372244653878177e-06, + "loss": 0.0016, + "num_input_tokens_seen": 162454592, + "step": 133510 + }, + { + "epoch": 14.869695957233546, + "grad_norm": 0.018021438270807266, + "learning_rate": 9.3703482286674e-06, + "loss": 0.0208, + "num_input_tokens_seen": 162460800, + "step": 133515 + }, + { + "epoch": 14.870252812117162, + "grad_norm": 0.0469168983399868, + "learning_rate": 9.3684519510912e-06, + "loss": 0.0128, + "num_input_tokens_seen": 162467040, + "step": 133520 + }, + { + "epoch": 14.87080966700078, + "grad_norm": 0.021862560883164406, + "learning_rate": 9.366555821167503e-06, + "loss": 0.0044, + "num_input_tokens_seen": 162473408, + "step": 133525 + }, + { + "epoch": 14.871366521884397, + "grad_norm": 0.04621400684118271, + "learning_rate": 9.364659838914202e-06, + "loss": 0.0035, + "num_input_tokens_seen": 162479040, + "step": 133530 + }, + { + "epoch": 14.871923376768015, + "grad_norm": 0.15497110784053802, + "learning_rate": 9.362764004349234e-06, + "loss": 0.1161, + "num_input_tokens_seen": 162485024, + "step": 133535 + }, + { + "epoch": 14.872480231651632, + "grad_norm": 0.05352887883782387, + "learning_rate": 9.36086831749047e-06, + "loss": 0.0179, + "num_input_tokens_seen": 162490784, + "step": 133540 + }, + { + "epoch": 14.873037086535248, + "grad_norm": 0.5290300250053406, + "learning_rate": 9.358972778355846e-06, + "loss": 0.0172, + "num_input_tokens_seen": 162496864, + "step": 133545 + }, + { + "epoch": 14.873593941418866, + "grad_norm": 0.24442191421985626, + "learning_rate": 9.357077386963246e-06, + "loss": 0.0237, + "num_input_tokens_seen": 162503040, + "step": 133550 + }, + { + "epoch": 14.874150796302484, + "grad_norm": 0.00043374334927648306, + "learning_rate": 9.355182143330588e-06, + "loss": 0.0262, + "num_input_tokens_seen": 162509216, + "step": 133555 + }, + { + "epoch": 14.874707651186101, + "grad_norm": 0.0006009242497384548, + "learning_rate": 9.353287047475773e-06, + "loss": 0.0239, + "num_input_tokens_seen": 162514944, + "step": 133560 + }, + { + "epoch": 14.875264506069719, + "grad_norm": 0.0015963857294991612, + "learning_rate": 9.351392099416696e-06, + "loss": 0.0481, + "num_input_tokens_seen": 162520832, + "step": 133565 + }, + { + "epoch": 14.875821360953335, + "grad_norm": 1.6082631349563599, + "learning_rate": 9.349497299171247e-06, + "loss": 0.0324, + "num_input_tokens_seen": 162526912, + "step": 133570 + }, + { + "epoch": 14.876378215836953, + "grad_norm": 0.0029979555401951075, + "learning_rate": 9.347602646757347e-06, + "loss": 0.0174, + "num_input_tokens_seen": 162533088, + "step": 133575 + }, + { + "epoch": 14.87693507072057, + "grad_norm": 0.17330139875411987, + "learning_rate": 9.34570814219288e-06, + "loss": 0.0304, + "num_input_tokens_seen": 162539072, + "step": 133580 + }, + { + "epoch": 14.877491925604188, + "grad_norm": 0.01891741342842579, + "learning_rate": 9.343813785495742e-06, + "loss": 0.0012, + "num_input_tokens_seen": 162545120, + "step": 133585 + }, + { + "epoch": 14.878048780487806, + "grad_norm": 1.216996431350708, + "learning_rate": 9.341919576683817e-06, + "loss": 0.0638, + "num_input_tokens_seen": 162551168, + "step": 133590 + }, + { + "epoch": 14.878605635371422, + "grad_norm": 0.01599825732409954, + "learning_rate": 9.340025515775016e-06, + "loss": 0.0099, + "num_input_tokens_seen": 162556960, + "step": 133595 + }, + { + "epoch": 14.87916249025504, + "grad_norm": 0.8606253266334534, + "learning_rate": 9.338131602787212e-06, + "loss": 0.0215, + "num_input_tokens_seen": 162562816, + "step": 133600 + }, + { + "epoch": 14.879719345138657, + "grad_norm": 0.11040593683719635, + "learning_rate": 9.336237837738318e-06, + "loss": 0.011, + "num_input_tokens_seen": 162569184, + "step": 133605 + }, + { + "epoch": 14.880276200022275, + "grad_norm": 0.001984815113246441, + "learning_rate": 9.334344220646193e-06, + "loss": 0.0813, + "num_input_tokens_seen": 162575136, + "step": 133610 + }, + { + "epoch": 14.880833054905892, + "grad_norm": 1.5567392110824585, + "learning_rate": 9.332450751528747e-06, + "loss": 0.0622, + "num_input_tokens_seen": 162580768, + "step": 133615 + }, + { + "epoch": 14.881389909789508, + "grad_norm": 0.21022023260593414, + "learning_rate": 9.330557430403843e-06, + "loss": 0.0245, + "num_input_tokens_seen": 162586688, + "step": 133620 + }, + { + "epoch": 14.881946764673126, + "grad_norm": 0.16051076352596283, + "learning_rate": 9.328664257289391e-06, + "loss": 0.0227, + "num_input_tokens_seen": 162592960, + "step": 133625 + }, + { + "epoch": 14.882503619556744, + "grad_norm": 1.608420729637146, + "learning_rate": 9.326771232203257e-06, + "loss": 0.0855, + "num_input_tokens_seen": 162598816, + "step": 133630 + }, + { + "epoch": 14.883060474440361, + "grad_norm": 0.10915140062570572, + "learning_rate": 9.32487835516333e-06, + "loss": 0.0054, + "num_input_tokens_seen": 162605056, + "step": 133635 + }, + { + "epoch": 14.883617329323979, + "grad_norm": 0.996574878692627, + "learning_rate": 9.322985626187474e-06, + "loss": 0.0517, + "num_input_tokens_seen": 162610752, + "step": 133640 + }, + { + "epoch": 14.884174184207595, + "grad_norm": 0.16778630018234253, + "learning_rate": 9.321093045293591e-06, + "loss": 0.0672, + "num_input_tokens_seen": 162616352, + "step": 133645 + }, + { + "epoch": 14.884731039091212, + "grad_norm": 0.008778366260230541, + "learning_rate": 9.319200612499543e-06, + "loss": 0.0034, + "num_input_tokens_seen": 162622656, + "step": 133650 + }, + { + "epoch": 14.88528789397483, + "grad_norm": 0.0031037351582199335, + "learning_rate": 9.31730832782321e-06, + "loss": 0.1406, + "num_input_tokens_seen": 162628640, + "step": 133655 + }, + { + "epoch": 14.885844748858448, + "grad_norm": 0.04148150980472565, + "learning_rate": 9.315416191282455e-06, + "loss": 0.0187, + "num_input_tokens_seen": 162635136, + "step": 133660 + }, + { + "epoch": 14.886401603742065, + "grad_norm": 0.032341118901968, + "learning_rate": 9.31352420289517e-06, + "loss": 0.0044, + "num_input_tokens_seen": 162640736, + "step": 133665 + }, + { + "epoch": 14.886958458625681, + "grad_norm": 0.006756039336323738, + "learning_rate": 9.311632362679206e-06, + "loss": 0.02, + "num_input_tokens_seen": 162646464, + "step": 133670 + }, + { + "epoch": 14.887515313509299, + "grad_norm": 0.39886656403541565, + "learning_rate": 9.309740670652462e-06, + "loss": 0.0228, + "num_input_tokens_seen": 162652608, + "step": 133675 + }, + { + "epoch": 14.888072168392917, + "grad_norm": 2.866640090942383, + "learning_rate": 9.30784912683277e-06, + "loss": 0.0477, + "num_input_tokens_seen": 162658336, + "step": 133680 + }, + { + "epoch": 14.888629023276534, + "grad_norm": 0.001084126066416502, + "learning_rate": 9.305957731238027e-06, + "loss": 0.0405, + "num_input_tokens_seen": 162664096, + "step": 133685 + }, + { + "epoch": 14.889185878160152, + "grad_norm": 0.0015695416368544102, + "learning_rate": 9.304066483886075e-06, + "loss": 0.0737, + "num_input_tokens_seen": 162670176, + "step": 133690 + }, + { + "epoch": 14.889742733043768, + "grad_norm": 2.1258370876312256, + "learning_rate": 9.302175384794803e-06, + "loss": 0.1092, + "num_input_tokens_seen": 162676320, + "step": 133695 + }, + { + "epoch": 14.890299587927386, + "grad_norm": 0.17178212106227875, + "learning_rate": 9.30028443398206e-06, + "loss": 0.0256, + "num_input_tokens_seen": 162682400, + "step": 133700 + }, + { + "epoch": 14.890856442811003, + "grad_norm": 0.008672456257045269, + "learning_rate": 9.298393631465706e-06, + "loss": 0.0453, + "num_input_tokens_seen": 162688960, + "step": 133705 + }, + { + "epoch": 14.891413297694621, + "grad_norm": 0.024976501241326332, + "learning_rate": 9.296502977263608e-06, + "loss": 0.1115, + "num_input_tokens_seen": 162695136, + "step": 133710 + }, + { + "epoch": 14.891970152578239, + "grad_norm": 0.0003460849111434072, + "learning_rate": 9.29461247139361e-06, + "loss": 0.0421, + "num_input_tokens_seen": 162701312, + "step": 133715 + }, + { + "epoch": 14.892527007461855, + "grad_norm": 0.007118947338312864, + "learning_rate": 9.292722113873587e-06, + "loss": 0.0119, + "num_input_tokens_seen": 162707488, + "step": 133720 + }, + { + "epoch": 14.893083862345472, + "grad_norm": 0.2363770455121994, + "learning_rate": 9.290831904721392e-06, + "loss": 0.1376, + "num_input_tokens_seen": 162713216, + "step": 133725 + }, + { + "epoch": 14.89364071722909, + "grad_norm": 0.0737951323390007, + "learning_rate": 9.288941843954874e-06, + "loss": 0.0445, + "num_input_tokens_seen": 162718848, + "step": 133730 + }, + { + "epoch": 14.894197572112708, + "grad_norm": 0.04776814207434654, + "learning_rate": 9.287051931591878e-06, + "loss": 0.0032, + "num_input_tokens_seen": 162725184, + "step": 133735 + }, + { + "epoch": 14.894754426996325, + "grad_norm": 0.0023955930955708027, + "learning_rate": 9.285162167650275e-06, + "loss": 0.1032, + "num_input_tokens_seen": 162731168, + "step": 133740 + }, + { + "epoch": 14.895311281879943, + "grad_norm": 0.2794683575630188, + "learning_rate": 9.283272552147898e-06, + "loss": 0.0144, + "num_input_tokens_seen": 162737248, + "step": 133745 + }, + { + "epoch": 14.895868136763559, + "grad_norm": 0.9951714277267456, + "learning_rate": 9.281383085102619e-06, + "loss": 0.0588, + "num_input_tokens_seen": 162743488, + "step": 133750 + }, + { + "epoch": 14.896424991647176, + "grad_norm": 0.04781847819685936, + "learning_rate": 9.279493766532255e-06, + "loss": 0.0846, + "num_input_tokens_seen": 162749632, + "step": 133755 + }, + { + "epoch": 14.896981846530794, + "grad_norm": 2.011746406555176, + "learning_rate": 9.277604596454675e-06, + "loss": 0.0742, + "num_input_tokens_seen": 162755424, + "step": 133760 + }, + { + "epoch": 14.897538701414412, + "grad_norm": 0.016114575788378716, + "learning_rate": 9.275715574887708e-06, + "loss": 0.0032, + "num_input_tokens_seen": 162761984, + "step": 133765 + }, + { + "epoch": 14.89809555629803, + "grad_norm": 1.5911113023757935, + "learning_rate": 9.273826701849213e-06, + "loss": 0.0277, + "num_input_tokens_seen": 162767648, + "step": 133770 + }, + { + "epoch": 14.898652411181645, + "grad_norm": 0.3873917758464813, + "learning_rate": 9.271937977357026e-06, + "loss": 0.0271, + "num_input_tokens_seen": 162773408, + "step": 133775 + }, + { + "epoch": 14.899209266065263, + "grad_norm": 0.00787624903023243, + "learning_rate": 9.270049401428985e-06, + "loss": 0.0023, + "num_input_tokens_seen": 162779808, + "step": 133780 + }, + { + "epoch": 14.89976612094888, + "grad_norm": 0.002337685553357005, + "learning_rate": 9.268160974082923e-06, + "loss": 0.0046, + "num_input_tokens_seen": 162785856, + "step": 133785 + }, + { + "epoch": 14.900322975832498, + "grad_norm": 0.10131686180830002, + "learning_rate": 9.266272695336692e-06, + "loss": 0.0049, + "num_input_tokens_seen": 162792064, + "step": 133790 + }, + { + "epoch": 14.900879830716116, + "grad_norm": 0.03156714886426926, + "learning_rate": 9.26438456520812e-06, + "loss": 0.0268, + "num_input_tokens_seen": 162798080, + "step": 133795 + }, + { + "epoch": 14.901436685599732, + "grad_norm": 1.8223329782485962, + "learning_rate": 9.262496583715045e-06, + "loss": 0.0416, + "num_input_tokens_seen": 162804160, + "step": 133800 + }, + { + "epoch": 14.90199354048335, + "grad_norm": 0.013393942266702652, + "learning_rate": 9.260608750875288e-06, + "loss": 0.0203, + "num_input_tokens_seen": 162810368, + "step": 133805 + }, + { + "epoch": 14.902550395366967, + "grad_norm": 2.265986442565918, + "learning_rate": 9.258721066706702e-06, + "loss": 0.1662, + "num_input_tokens_seen": 162816448, + "step": 133810 + }, + { + "epoch": 14.903107250250585, + "grad_norm": 1.432502031326294, + "learning_rate": 9.256833531227097e-06, + "loss": 0.0739, + "num_input_tokens_seen": 162821952, + "step": 133815 + }, + { + "epoch": 14.903664105134203, + "grad_norm": 0.33466264605522156, + "learning_rate": 9.254946144454333e-06, + "loss": 0.0797, + "num_input_tokens_seen": 162828064, + "step": 133820 + }, + { + "epoch": 14.904220960017819, + "grad_norm": 0.7103945016860962, + "learning_rate": 9.253058906406196e-06, + "loss": 0.0294, + "num_input_tokens_seen": 162834208, + "step": 133825 + }, + { + "epoch": 14.904777814901436, + "grad_norm": 0.0192437581717968, + "learning_rate": 9.251171817100542e-06, + "loss": 0.0027, + "num_input_tokens_seen": 162840064, + "step": 133830 + }, + { + "epoch": 14.905334669785054, + "grad_norm": 1.3868049383163452, + "learning_rate": 9.249284876555184e-06, + "loss": 0.0589, + "num_input_tokens_seen": 162845856, + "step": 133835 + }, + { + "epoch": 14.905891524668672, + "grad_norm": 0.0048671746626496315, + "learning_rate": 9.247398084787956e-06, + "loss": 0.0074, + "num_input_tokens_seen": 162852320, + "step": 133840 + }, + { + "epoch": 14.90644837955229, + "grad_norm": 0.00029502352117560804, + "learning_rate": 9.245511441816673e-06, + "loss": 0.11, + "num_input_tokens_seen": 162858208, + "step": 133845 + }, + { + "epoch": 14.907005234435905, + "grad_norm": 0.0005652224062941968, + "learning_rate": 9.243624947659157e-06, + "loss": 0.1144, + "num_input_tokens_seen": 162864448, + "step": 133850 + }, + { + "epoch": 14.907562089319523, + "grad_norm": 0.9120925068855286, + "learning_rate": 9.241738602333219e-06, + "loss": 0.0629, + "num_input_tokens_seen": 162870048, + "step": 133855 + }, + { + "epoch": 14.90811894420314, + "grad_norm": 0.11309653520584106, + "learning_rate": 9.239852405856694e-06, + "loss": 0.003, + "num_input_tokens_seen": 162876000, + "step": 133860 + }, + { + "epoch": 14.908675799086758, + "grad_norm": 0.12712906301021576, + "learning_rate": 9.237966358247388e-06, + "loss": 0.0082, + "num_input_tokens_seen": 162882240, + "step": 133865 + }, + { + "epoch": 14.909232653970376, + "grad_norm": 0.0413832850754261, + "learning_rate": 9.236080459523119e-06, + "loss": 0.0079, + "num_input_tokens_seen": 162888864, + "step": 133870 + }, + { + "epoch": 14.909789508853994, + "grad_norm": 0.009565459564328194, + "learning_rate": 9.234194709701694e-06, + "loss": 0.0018, + "num_input_tokens_seen": 162894976, + "step": 133875 + }, + { + "epoch": 14.91034636373761, + "grad_norm": 0.001280972850508988, + "learning_rate": 9.232309108800938e-06, + "loss": 0.0287, + "num_input_tokens_seen": 162901408, + "step": 133880 + }, + { + "epoch": 14.910903218621227, + "grad_norm": 0.0012501028832048178, + "learning_rate": 9.230423656838643e-06, + "loss": 0.0084, + "num_input_tokens_seen": 162907296, + "step": 133885 + }, + { + "epoch": 14.911460073504845, + "grad_norm": 0.10928177088499069, + "learning_rate": 9.228538353832644e-06, + "loss": 0.0231, + "num_input_tokens_seen": 162913344, + "step": 133890 + }, + { + "epoch": 14.912016928388462, + "grad_norm": 0.0005655385321006179, + "learning_rate": 9.226653199800736e-06, + "loss": 0.0417, + "num_input_tokens_seen": 162919456, + "step": 133895 + }, + { + "epoch": 14.91257378327208, + "grad_norm": 0.3702925145626068, + "learning_rate": 9.224768194760724e-06, + "loss": 0.0933, + "num_input_tokens_seen": 162925760, + "step": 133900 + }, + { + "epoch": 14.913130638155696, + "grad_norm": 0.6624330282211304, + "learning_rate": 9.22288333873041e-06, + "loss": 0.018, + "num_input_tokens_seen": 162931840, + "step": 133905 + }, + { + "epoch": 14.913687493039314, + "grad_norm": 0.1417159140110016, + "learning_rate": 9.220998631727609e-06, + "loss": 0.027, + "num_input_tokens_seen": 162938432, + "step": 133910 + }, + { + "epoch": 14.914244347922931, + "grad_norm": 0.00152703991625458, + "learning_rate": 9.219114073770118e-06, + "loss": 0.0176, + "num_input_tokens_seen": 162944448, + "step": 133915 + }, + { + "epoch": 14.914801202806549, + "grad_norm": 0.07349030673503876, + "learning_rate": 9.217229664875737e-06, + "loss": 0.0703, + "num_input_tokens_seen": 162950528, + "step": 133920 + }, + { + "epoch": 14.915358057690167, + "grad_norm": 0.009854215197265148, + "learning_rate": 9.215345405062261e-06, + "loss": 0.1011, + "num_input_tokens_seen": 162956384, + "step": 133925 + }, + { + "epoch": 14.915914912573783, + "grad_norm": 0.004727538209408522, + "learning_rate": 9.213461294347502e-06, + "loss": 0.0525, + "num_input_tokens_seen": 162962464, + "step": 133930 + }, + { + "epoch": 14.9164717674574, + "grad_norm": 0.9448115229606628, + "learning_rate": 9.211577332749238e-06, + "loss": 0.0144, + "num_input_tokens_seen": 162968992, + "step": 133935 + }, + { + "epoch": 14.917028622341018, + "grad_norm": 1.8759392499923706, + "learning_rate": 9.209693520285295e-06, + "loss": 0.1171, + "num_input_tokens_seen": 162975136, + "step": 133940 + }, + { + "epoch": 14.917585477224636, + "grad_norm": 0.23920969665050507, + "learning_rate": 9.207809856973431e-06, + "loss": 0.0617, + "num_input_tokens_seen": 162981664, + "step": 133945 + }, + { + "epoch": 14.918142332108253, + "grad_norm": 0.02306840755045414, + "learning_rate": 9.205926342831465e-06, + "loss": 0.0632, + "num_input_tokens_seen": 162987648, + "step": 133950 + }, + { + "epoch": 14.91869918699187, + "grad_norm": 1.1258389949798584, + "learning_rate": 9.204042977877168e-06, + "loss": 0.215, + "num_input_tokens_seen": 162993344, + "step": 133955 + }, + { + "epoch": 14.919256041875487, + "grad_norm": 2.0979061126708984, + "learning_rate": 9.202159762128348e-06, + "loss": 0.1212, + "num_input_tokens_seen": 162999328, + "step": 133960 + }, + { + "epoch": 14.919812896759105, + "grad_norm": 0.7406435012817383, + "learning_rate": 9.200276695602786e-06, + "loss": 0.024, + "num_input_tokens_seen": 163005408, + "step": 133965 + }, + { + "epoch": 14.920369751642722, + "grad_norm": 1.293716311454773, + "learning_rate": 9.198393778318271e-06, + "loss": 0.0647, + "num_input_tokens_seen": 163010976, + "step": 133970 + }, + { + "epoch": 14.92092660652634, + "grad_norm": 0.002469999250024557, + "learning_rate": 9.196511010292578e-06, + "loss": 0.0111, + "num_input_tokens_seen": 163017312, + "step": 133975 + }, + { + "epoch": 14.921483461409956, + "grad_norm": 0.8438058495521545, + "learning_rate": 9.194628391543509e-06, + "loss": 0.049, + "num_input_tokens_seen": 163023584, + "step": 133980 + }, + { + "epoch": 14.922040316293574, + "grad_norm": 1.5453990697860718, + "learning_rate": 9.192745922088836e-06, + "loss": 0.0724, + "num_input_tokens_seen": 163029408, + "step": 133985 + }, + { + "epoch": 14.922597171177191, + "grad_norm": 0.000683099206071347, + "learning_rate": 9.190863601946345e-06, + "loss": 0.0007, + "num_input_tokens_seen": 163035456, + "step": 133990 + }, + { + "epoch": 14.923154026060809, + "grad_norm": 0.003197995712980628, + "learning_rate": 9.188981431133803e-06, + "loss": 0.0215, + "num_input_tokens_seen": 163041568, + "step": 133995 + }, + { + "epoch": 14.923710880944427, + "grad_norm": 0.8296494483947754, + "learning_rate": 9.187099409669009e-06, + "loss": 0.0228, + "num_input_tokens_seen": 163047584, + "step": 134000 + }, + { + "epoch": 14.924267735828042, + "grad_norm": 0.6621905565261841, + "learning_rate": 9.185217537569719e-06, + "loss": 0.0703, + "num_input_tokens_seen": 163053536, + "step": 134005 + }, + { + "epoch": 14.92482459071166, + "grad_norm": 0.002818300388753414, + "learning_rate": 9.183335814853738e-06, + "loss": 0.0112, + "num_input_tokens_seen": 163059520, + "step": 134010 + }, + { + "epoch": 14.925381445595278, + "grad_norm": 0.036217205226421356, + "learning_rate": 9.181454241538807e-06, + "loss": 0.0817, + "num_input_tokens_seen": 163065792, + "step": 134015 + }, + { + "epoch": 14.925938300478895, + "grad_norm": 1.2186487913131714, + "learning_rate": 9.179572817642721e-06, + "loss": 0.1125, + "num_input_tokens_seen": 163071776, + "step": 134020 + }, + { + "epoch": 14.926495155362513, + "grad_norm": 0.3598285913467407, + "learning_rate": 9.177691543183236e-06, + "loss": 0.1127, + "num_input_tokens_seen": 163077920, + "step": 134025 + }, + { + "epoch": 14.927052010246129, + "grad_norm": 0.09830372035503387, + "learning_rate": 9.17581041817814e-06, + "loss": 0.0087, + "num_input_tokens_seen": 163083936, + "step": 134030 + }, + { + "epoch": 14.927608865129747, + "grad_norm": 0.005861308891326189, + "learning_rate": 9.173929442645196e-06, + "loss": 0.0647, + "num_input_tokens_seen": 163089920, + "step": 134035 + }, + { + "epoch": 14.928165720013364, + "grad_norm": 0.2261507362127304, + "learning_rate": 9.172048616602163e-06, + "loss": 0.128, + "num_input_tokens_seen": 163096064, + "step": 134040 + }, + { + "epoch": 14.928722574896982, + "grad_norm": 0.012199856340885162, + "learning_rate": 9.170167940066806e-06, + "loss": 0.0078, + "num_input_tokens_seen": 163102112, + "step": 134045 + }, + { + "epoch": 14.9292794297806, + "grad_norm": 0.002278602682054043, + "learning_rate": 9.168287413056904e-06, + "loss": 0.01, + "num_input_tokens_seen": 163108480, + "step": 134050 + }, + { + "epoch": 14.929836284664216, + "grad_norm": 0.03907826170325279, + "learning_rate": 9.16640703559021e-06, + "loss": 0.0412, + "num_input_tokens_seen": 163114368, + "step": 134055 + }, + { + "epoch": 14.930393139547833, + "grad_norm": 3.2435033321380615, + "learning_rate": 9.16452680768449e-06, + "loss": 0.1619, + "num_input_tokens_seen": 163120384, + "step": 134060 + }, + { + "epoch": 14.930949994431451, + "grad_norm": 0.18718920648097992, + "learning_rate": 9.16264672935749e-06, + "loss": 0.0082, + "num_input_tokens_seen": 163126688, + "step": 134065 + }, + { + "epoch": 14.931506849315069, + "grad_norm": 0.8896313905715942, + "learning_rate": 9.160766800626991e-06, + "loss": 0.0276, + "num_input_tokens_seen": 163132544, + "step": 134070 + }, + { + "epoch": 14.932063704198686, + "grad_norm": 0.7379207611083984, + "learning_rate": 9.158887021510731e-06, + "loss": 0.0318, + "num_input_tokens_seen": 163138464, + "step": 134075 + }, + { + "epoch": 14.932620559082302, + "grad_norm": 0.00034517349558882415, + "learning_rate": 9.157007392026482e-06, + "loss": 0.0391, + "num_input_tokens_seen": 163144896, + "step": 134080 + }, + { + "epoch": 14.93317741396592, + "grad_norm": 0.06678562611341476, + "learning_rate": 9.155127912191994e-06, + "loss": 0.0296, + "num_input_tokens_seen": 163151072, + "step": 134085 + }, + { + "epoch": 14.933734268849538, + "grad_norm": 0.206770658493042, + "learning_rate": 9.153248582025014e-06, + "loss": 0.0158, + "num_input_tokens_seen": 163157280, + "step": 134090 + }, + { + "epoch": 14.934291123733155, + "grad_norm": 2.635204553604126, + "learning_rate": 9.15136940154329e-06, + "loss": 0.1372, + "num_input_tokens_seen": 163163360, + "step": 134095 + }, + { + "epoch": 14.934847978616773, + "grad_norm": 0.5761767625808716, + "learning_rate": 9.14949037076459e-06, + "loss": 0.0133, + "num_input_tokens_seen": 163169472, + "step": 134100 + }, + { + "epoch": 14.93540483350039, + "grad_norm": 0.047085586935281754, + "learning_rate": 9.14761148970665e-06, + "loss": 0.0316, + "num_input_tokens_seen": 163175680, + "step": 134105 + }, + { + "epoch": 14.935961688384007, + "grad_norm": 0.17135830223560333, + "learning_rate": 9.145732758387224e-06, + "loss": 0.1483, + "num_input_tokens_seen": 163181632, + "step": 134110 + }, + { + "epoch": 14.936518543267624, + "grad_norm": 0.0029068663716316223, + "learning_rate": 9.143854176824043e-06, + "loss": 0.0587, + "num_input_tokens_seen": 163187488, + "step": 134115 + }, + { + "epoch": 14.937075398151242, + "grad_norm": 0.00027593833510763943, + "learning_rate": 9.14197574503487e-06, + "loss": 0.0185, + "num_input_tokens_seen": 163193632, + "step": 134120 + }, + { + "epoch": 14.93763225303486, + "grad_norm": 0.02487647905945778, + "learning_rate": 9.140097463037445e-06, + "loss": 0.0138, + "num_input_tokens_seen": 163199552, + "step": 134125 + }, + { + "epoch": 14.938189107918477, + "grad_norm": 0.0003408256161492318, + "learning_rate": 9.138219330849504e-06, + "loss": 0.0525, + "num_input_tokens_seen": 163205696, + "step": 134130 + }, + { + "epoch": 14.938745962802093, + "grad_norm": 2.0431952476501465, + "learning_rate": 9.136341348488789e-06, + "loss": 0.0761, + "num_input_tokens_seen": 163211872, + "step": 134135 + }, + { + "epoch": 14.93930281768571, + "grad_norm": 0.005927149206399918, + "learning_rate": 9.134463515973033e-06, + "loss": 0.0107, + "num_input_tokens_seen": 163217664, + "step": 134140 + }, + { + "epoch": 14.939859672569328, + "grad_norm": 1.9021776914596558, + "learning_rate": 9.13258583331999e-06, + "loss": 0.1079, + "num_input_tokens_seen": 163222496, + "step": 134145 + }, + { + "epoch": 14.940416527452946, + "grad_norm": 0.0008273914572782815, + "learning_rate": 9.130708300547375e-06, + "loss": 0.0095, + "num_input_tokens_seen": 163228704, + "step": 134150 + }, + { + "epoch": 14.940973382336564, + "grad_norm": 0.3235844671726227, + "learning_rate": 9.128830917672954e-06, + "loss": 0.0375, + "num_input_tokens_seen": 163234880, + "step": 134155 + }, + { + "epoch": 14.94153023722018, + "grad_norm": 0.9500619173049927, + "learning_rate": 9.12695368471442e-06, + "loss": 0.3175, + "num_input_tokens_seen": 163241248, + "step": 134160 + }, + { + "epoch": 14.942087092103797, + "grad_norm": 0.5395500659942627, + "learning_rate": 9.125076601689537e-06, + "loss": 0.0242, + "num_input_tokens_seen": 163247296, + "step": 134165 + }, + { + "epoch": 14.942643946987415, + "grad_norm": 0.020574551075696945, + "learning_rate": 9.123199668616018e-06, + "loss": 0.1603, + "num_input_tokens_seen": 163253344, + "step": 134170 + }, + { + "epoch": 14.943200801871033, + "grad_norm": 0.012465786188840866, + "learning_rate": 9.121322885511601e-06, + "loss": 0.0332, + "num_input_tokens_seen": 163259776, + "step": 134175 + }, + { + "epoch": 14.94375765675465, + "grad_norm": 1.2774945497512817, + "learning_rate": 9.119446252394014e-06, + "loss": 0.0809, + "num_input_tokens_seen": 163266080, + "step": 134180 + }, + { + "epoch": 14.944314511638266, + "grad_norm": 0.15098272264003754, + "learning_rate": 9.117569769280982e-06, + "loss": 0.1205, + "num_input_tokens_seen": 163272064, + "step": 134185 + }, + { + "epoch": 14.944871366521884, + "grad_norm": 0.8099818825721741, + "learning_rate": 9.11569343619022e-06, + "loss": 0.0103, + "num_input_tokens_seen": 163278496, + "step": 134190 + }, + { + "epoch": 14.945428221405502, + "grad_norm": 0.0023953220807015896, + "learning_rate": 9.113817253139468e-06, + "loss": 0.1274, + "num_input_tokens_seen": 163284736, + "step": 134195 + }, + { + "epoch": 14.94598507628912, + "grad_norm": 0.9802836775779724, + "learning_rate": 9.111941220146437e-06, + "loss": 0.0247, + "num_input_tokens_seen": 163291040, + "step": 134200 + }, + { + "epoch": 14.946541931172737, + "grad_norm": 0.8029323816299438, + "learning_rate": 9.110065337228852e-06, + "loss": 0.012, + "num_input_tokens_seen": 163296960, + "step": 134205 + }, + { + "epoch": 14.947098786056353, + "grad_norm": 0.018207846209406853, + "learning_rate": 9.108189604404422e-06, + "loss": 0.0624, + "num_input_tokens_seen": 163302944, + "step": 134210 + }, + { + "epoch": 14.94765564093997, + "grad_norm": 1.0560835599899292, + "learning_rate": 9.106314021690884e-06, + "loss": 0.0868, + "num_input_tokens_seen": 163309152, + "step": 134215 + }, + { + "epoch": 14.948212495823588, + "grad_norm": 0.016664408147335052, + "learning_rate": 9.104438589105932e-06, + "loss": 0.0313, + "num_input_tokens_seen": 163315552, + "step": 134220 + }, + { + "epoch": 14.948769350707206, + "grad_norm": 2.3377697467803955, + "learning_rate": 9.102563306667313e-06, + "loss": 0.0666, + "num_input_tokens_seen": 163321728, + "step": 134225 + }, + { + "epoch": 14.949326205590824, + "grad_norm": 0.0004455207963474095, + "learning_rate": 9.100688174392703e-06, + "loss": 0.0034, + "num_input_tokens_seen": 163327616, + "step": 134230 + }, + { + "epoch": 14.949883060474441, + "grad_norm": 1.1178749799728394, + "learning_rate": 9.098813192299837e-06, + "loss": 0.0524, + "num_input_tokens_seen": 163333728, + "step": 134235 + }, + { + "epoch": 14.950439915358057, + "grad_norm": 0.09150104224681854, + "learning_rate": 9.096938360406415e-06, + "loss": 0.0231, + "num_input_tokens_seen": 163339872, + "step": 134240 + }, + { + "epoch": 14.950996770241675, + "grad_norm": 0.49048206210136414, + "learning_rate": 9.09506367873016e-06, + "loss": 0.0989, + "num_input_tokens_seen": 163345984, + "step": 134245 + }, + { + "epoch": 14.951553625125293, + "grad_norm": 0.09502663463354111, + "learning_rate": 9.09318914728877e-06, + "loss": 0.0241, + "num_input_tokens_seen": 163352320, + "step": 134250 + }, + { + "epoch": 14.95211048000891, + "grad_norm": 0.18483467400074005, + "learning_rate": 9.09131476609995e-06, + "loss": 0.0039, + "num_input_tokens_seen": 163358432, + "step": 134255 + }, + { + "epoch": 14.952667334892528, + "grad_norm": 0.00010391260002506897, + "learning_rate": 9.089440535181404e-06, + "loss": 0.0057, + "num_input_tokens_seen": 163364800, + "step": 134260 + }, + { + "epoch": 14.953224189776144, + "grad_norm": 0.0015628845430910587, + "learning_rate": 9.087566454550847e-06, + "loss": 0.0041, + "num_input_tokens_seen": 163371104, + "step": 134265 + }, + { + "epoch": 14.953781044659761, + "grad_norm": 0.0446537546813488, + "learning_rate": 9.085692524225972e-06, + "loss": 0.0034, + "num_input_tokens_seen": 163377408, + "step": 134270 + }, + { + "epoch": 14.954337899543379, + "grad_norm": 0.009227651171386242, + "learning_rate": 9.083818744224481e-06, + "loss": 0.0134, + "num_input_tokens_seen": 163383584, + "step": 134275 + }, + { + "epoch": 14.954894754426997, + "grad_norm": 0.06304094940423965, + "learning_rate": 9.081945114564069e-06, + "loss": 0.0522, + "num_input_tokens_seen": 163389248, + "step": 134280 + }, + { + "epoch": 14.955451609310614, + "grad_norm": 0.01663264073431492, + "learning_rate": 9.080071635262444e-06, + "loss": 0.017, + "num_input_tokens_seen": 163395648, + "step": 134285 + }, + { + "epoch": 14.95600846419423, + "grad_norm": 0.239800825715065, + "learning_rate": 9.07819830633729e-06, + "loss": 0.0076, + "num_input_tokens_seen": 163402048, + "step": 134290 + }, + { + "epoch": 14.956565319077848, + "grad_norm": 6.804356962675229e-05, + "learning_rate": 9.076325127806318e-06, + "loss": 0.0569, + "num_input_tokens_seen": 163408448, + "step": 134295 + }, + { + "epoch": 14.957122173961466, + "grad_norm": 0.058389853686094284, + "learning_rate": 9.07445209968721e-06, + "loss": 0.0229, + "num_input_tokens_seen": 163414656, + "step": 134300 + }, + { + "epoch": 14.957679028845083, + "grad_norm": 1.323236346244812, + "learning_rate": 9.072579221997665e-06, + "loss": 0.1711, + "num_input_tokens_seen": 163420800, + "step": 134305 + }, + { + "epoch": 14.958235883728701, + "grad_norm": 0.18270651996135712, + "learning_rate": 9.07070649475536e-06, + "loss": 0.0837, + "num_input_tokens_seen": 163426976, + "step": 134310 + }, + { + "epoch": 14.958792738612317, + "grad_norm": 0.17825119197368622, + "learning_rate": 9.068833917978003e-06, + "loss": 0.0035, + "num_input_tokens_seen": 163433120, + "step": 134315 + }, + { + "epoch": 14.959349593495935, + "grad_norm": 0.07102746516466141, + "learning_rate": 9.066961491683276e-06, + "loss": 0.1023, + "num_input_tokens_seen": 163439456, + "step": 134320 + }, + { + "epoch": 14.959906448379552, + "grad_norm": 0.00038235203828662634, + "learning_rate": 9.065089215888861e-06, + "loss": 0.0188, + "num_input_tokens_seen": 163445696, + "step": 134325 + }, + { + "epoch": 14.96046330326317, + "grad_norm": 0.7800449728965759, + "learning_rate": 9.063217090612435e-06, + "loss": 0.0379, + "num_input_tokens_seen": 163452032, + "step": 134330 + }, + { + "epoch": 14.961020158146788, + "grad_norm": 1.3712657690048218, + "learning_rate": 9.061345115871702e-06, + "loss": 0.0398, + "num_input_tokens_seen": 163458272, + "step": 134335 + }, + { + "epoch": 14.961577013030404, + "grad_norm": 3.332211971282959, + "learning_rate": 9.059473291684325e-06, + "loss": 0.0287, + "num_input_tokens_seen": 163464480, + "step": 134340 + }, + { + "epoch": 14.962133867914021, + "grad_norm": 0.775251567363739, + "learning_rate": 9.057601618068013e-06, + "loss": 0.0072, + "num_input_tokens_seen": 163470528, + "step": 134345 + }, + { + "epoch": 14.962690722797639, + "grad_norm": 0.02019393816590309, + "learning_rate": 9.055730095040408e-06, + "loss": 0.0404, + "num_input_tokens_seen": 163476704, + "step": 134350 + }, + { + "epoch": 14.963247577681257, + "grad_norm": 0.0511992983520031, + "learning_rate": 9.053858722619216e-06, + "loss": 0.0475, + "num_input_tokens_seen": 163482720, + "step": 134355 + }, + { + "epoch": 14.963804432564874, + "grad_norm": 0.0017012879252433777, + "learning_rate": 9.051987500822096e-06, + "loss": 0.0315, + "num_input_tokens_seen": 163488704, + "step": 134360 + }, + { + "epoch": 14.96436128744849, + "grad_norm": 0.8787556886672974, + "learning_rate": 9.050116429666738e-06, + "loss": 0.0254, + "num_input_tokens_seen": 163494880, + "step": 134365 + }, + { + "epoch": 14.964918142332108, + "grad_norm": 0.19778484106063843, + "learning_rate": 9.04824550917081e-06, + "loss": 0.0243, + "num_input_tokens_seen": 163500960, + "step": 134370 + }, + { + "epoch": 14.965474997215725, + "grad_norm": 0.00037795433308929205, + "learning_rate": 9.046374739351984e-06, + "loss": 0.0252, + "num_input_tokens_seen": 163506816, + "step": 134375 + }, + { + "epoch": 14.966031852099343, + "grad_norm": 0.00030578821315430105, + "learning_rate": 9.044504120227923e-06, + "loss": 0.0019, + "num_input_tokens_seen": 163512672, + "step": 134380 + }, + { + "epoch": 14.96658870698296, + "grad_norm": 0.0010155298514291644, + "learning_rate": 9.04263365181631e-06, + "loss": 0.1205, + "num_input_tokens_seen": 163518720, + "step": 134385 + }, + { + "epoch": 14.967145561866577, + "grad_norm": 0.784667432308197, + "learning_rate": 9.040763334134808e-06, + "loss": 0.0831, + "num_input_tokens_seen": 163524640, + "step": 134390 + }, + { + "epoch": 14.967702416750194, + "grad_norm": 0.000961543875746429, + "learning_rate": 9.038893167201082e-06, + "loss": 0.0014, + "num_input_tokens_seen": 163531040, + "step": 134395 + }, + { + "epoch": 14.968259271633812, + "grad_norm": 0.09480054676532745, + "learning_rate": 9.037023151032791e-06, + "loss": 0.033, + "num_input_tokens_seen": 163537248, + "step": 134400 + }, + { + "epoch": 14.96881612651743, + "grad_norm": 0.5034013390541077, + "learning_rate": 9.035153285647614e-06, + "loss": 0.0082, + "num_input_tokens_seen": 163543200, + "step": 134405 + }, + { + "epoch": 14.969372981401047, + "grad_norm": 0.004685710649937391, + "learning_rate": 9.033283571063195e-06, + "loss": 0.0081, + "num_input_tokens_seen": 163549440, + "step": 134410 + }, + { + "epoch": 14.969929836284663, + "grad_norm": 1.097462773323059, + "learning_rate": 9.031414007297222e-06, + "loss": 0.1443, + "num_input_tokens_seen": 163555424, + "step": 134415 + }, + { + "epoch": 14.970486691168281, + "grad_norm": 0.02923441119492054, + "learning_rate": 9.02954459436732e-06, + "loss": 0.008, + "num_input_tokens_seen": 163561536, + "step": 134420 + }, + { + "epoch": 14.971043546051899, + "grad_norm": 1.0843276977539062, + "learning_rate": 9.027675332291174e-06, + "loss": 0.088, + "num_input_tokens_seen": 163567584, + "step": 134425 + }, + { + "epoch": 14.971600400935516, + "grad_norm": 0.017379796132445335, + "learning_rate": 9.025806221086425e-06, + "loss": 0.0355, + "num_input_tokens_seen": 163573792, + "step": 134430 + }, + { + "epoch": 14.972157255819134, + "grad_norm": 0.3560565114021301, + "learning_rate": 9.023937260770741e-06, + "loss": 0.0197, + "num_input_tokens_seen": 163579776, + "step": 134435 + }, + { + "epoch": 14.97271411070275, + "grad_norm": 0.0005771229043602943, + "learning_rate": 9.02206845136177e-06, + "loss": 0.0591, + "num_input_tokens_seen": 163585984, + "step": 134440 + }, + { + "epoch": 14.973270965586368, + "grad_norm": 0.059005968272686005, + "learning_rate": 9.020199792877163e-06, + "loss": 0.0054, + "num_input_tokens_seen": 163592096, + "step": 134445 + }, + { + "epoch": 14.973827820469985, + "grad_norm": 0.3665355145931244, + "learning_rate": 9.018331285334564e-06, + "loss": 0.0823, + "num_input_tokens_seen": 163598240, + "step": 134450 + }, + { + "epoch": 14.974384675353603, + "grad_norm": 0.06757490336894989, + "learning_rate": 9.016462928751637e-06, + "loss": 0.0244, + "num_input_tokens_seen": 163604032, + "step": 134455 + }, + { + "epoch": 14.97494153023722, + "grad_norm": 0.2274094521999359, + "learning_rate": 9.014594723146026e-06, + "loss": 0.0833, + "num_input_tokens_seen": 163610176, + "step": 134460 + }, + { + "epoch": 14.975498385120838, + "grad_norm": 0.09239383786916733, + "learning_rate": 9.012726668535374e-06, + "loss": 0.0044, + "num_input_tokens_seen": 163615648, + "step": 134465 + }, + { + "epoch": 14.976055240004454, + "grad_norm": 0.0626763328909874, + "learning_rate": 9.010858764937319e-06, + "loss": 0.0464, + "num_input_tokens_seen": 163621728, + "step": 134470 + }, + { + "epoch": 14.976612094888072, + "grad_norm": 0.8855029344558716, + "learning_rate": 9.008991012369522e-06, + "loss": 0.0442, + "num_input_tokens_seen": 163627296, + "step": 134475 + }, + { + "epoch": 14.97716894977169, + "grad_norm": 0.1165391281247139, + "learning_rate": 9.007123410849608e-06, + "loss": 0.0339, + "num_input_tokens_seen": 163633408, + "step": 134480 + }, + { + "epoch": 14.977725804655307, + "grad_norm": 0.0003316747024655342, + "learning_rate": 9.005255960395234e-06, + "loss": 0.0011, + "num_input_tokens_seen": 163639840, + "step": 134485 + }, + { + "epoch": 14.978282659538925, + "grad_norm": 0.0004285263712517917, + "learning_rate": 9.003388661024034e-06, + "loss": 0.0079, + "num_input_tokens_seen": 163646144, + "step": 134490 + }, + { + "epoch": 14.97883951442254, + "grad_norm": 0.16053161025047302, + "learning_rate": 9.001521512753644e-06, + "loss": 0.0022, + "num_input_tokens_seen": 163652192, + "step": 134495 + }, + { + "epoch": 14.979396369306158, + "grad_norm": 0.36405467987060547, + "learning_rate": 8.999654515601691e-06, + "loss": 0.1118, + "num_input_tokens_seen": 163658368, + "step": 134500 + }, + { + "epoch": 14.979953224189776, + "grad_norm": 0.000336811994202435, + "learning_rate": 8.99778766958583e-06, + "loss": 0.0559, + "num_input_tokens_seen": 163664672, + "step": 134505 + }, + { + "epoch": 14.980510079073394, + "grad_norm": 0.003704819595441222, + "learning_rate": 8.995920974723685e-06, + "loss": 0.0752, + "num_input_tokens_seen": 163670016, + "step": 134510 + }, + { + "epoch": 14.981066933957011, + "grad_norm": 0.041524164378643036, + "learning_rate": 8.994054431032888e-06, + "loss": 0.12, + "num_input_tokens_seen": 163676192, + "step": 134515 + }, + { + "epoch": 14.981623788840627, + "grad_norm": 0.002317133592441678, + "learning_rate": 8.992188038531065e-06, + "loss": 0.168, + "num_input_tokens_seen": 163681888, + "step": 134520 + }, + { + "epoch": 14.982180643724245, + "grad_norm": 0.2555120289325714, + "learning_rate": 8.990321797235857e-06, + "loss": 0.0086, + "num_input_tokens_seen": 163688000, + "step": 134525 + }, + { + "epoch": 14.982737498607863, + "grad_norm": 0.006129870191216469, + "learning_rate": 8.988455707164877e-06, + "loss": 0.0032, + "num_input_tokens_seen": 163694144, + "step": 134530 + }, + { + "epoch": 14.98329435349148, + "grad_norm": 0.0926394835114479, + "learning_rate": 8.986589768335781e-06, + "loss": 0.0659, + "num_input_tokens_seen": 163700256, + "step": 134535 + }, + { + "epoch": 14.983851208375098, + "grad_norm": 2.0366015434265137, + "learning_rate": 8.984723980766167e-06, + "loss": 0.0983, + "num_input_tokens_seen": 163706400, + "step": 134540 + }, + { + "epoch": 14.984408063258714, + "grad_norm": 0.11563079804182053, + "learning_rate": 8.982858344473655e-06, + "loss": 0.0216, + "num_input_tokens_seen": 163712416, + "step": 134545 + }, + { + "epoch": 14.984964918142332, + "grad_norm": 1.3733805418014526, + "learning_rate": 8.980992859475887e-06, + "loss": 0.02, + "num_input_tokens_seen": 163718688, + "step": 134550 + }, + { + "epoch": 14.98552177302595, + "grad_norm": 0.35816141963005066, + "learning_rate": 8.979127525790468e-06, + "loss": 0.0145, + "num_input_tokens_seen": 163724384, + "step": 134555 + }, + { + "epoch": 14.986078627909567, + "grad_norm": 4.149240493774414, + "learning_rate": 8.977262343435042e-06, + "loss": 0.0784, + "num_input_tokens_seen": 163730752, + "step": 134560 + }, + { + "epoch": 14.986635482793185, + "grad_norm": 1.7994611263275146, + "learning_rate": 8.975397312427192e-06, + "loss": 0.0349, + "num_input_tokens_seen": 163736928, + "step": 134565 + }, + { + "epoch": 14.987192337676802, + "grad_norm": 0.714806079864502, + "learning_rate": 8.973532432784561e-06, + "loss": 0.0581, + "num_input_tokens_seen": 163742688, + "step": 134570 + }, + { + "epoch": 14.987749192560418, + "grad_norm": 0.00020104862051084638, + "learning_rate": 8.97166770452475e-06, + "loss": 0.0113, + "num_input_tokens_seen": 163749152, + "step": 134575 + }, + { + "epoch": 14.988306047444036, + "grad_norm": 0.15147151052951813, + "learning_rate": 8.969803127665389e-06, + "loss": 0.0451, + "num_input_tokens_seen": 163755264, + "step": 134580 + }, + { + "epoch": 14.988862902327654, + "grad_norm": 0.46323683857917786, + "learning_rate": 8.96793870222408e-06, + "loss": 0.008, + "num_input_tokens_seen": 163761216, + "step": 134585 + }, + { + "epoch": 14.989419757211271, + "grad_norm": 1.3006352186203003, + "learning_rate": 8.966074428218432e-06, + "loss": 0.0298, + "num_input_tokens_seen": 163767584, + "step": 134590 + }, + { + "epoch": 14.989976612094889, + "grad_norm": 0.9384991526603699, + "learning_rate": 8.964210305666051e-06, + "loss": 0.0162, + "num_input_tokens_seen": 163773824, + "step": 134595 + }, + { + "epoch": 14.990533466978505, + "grad_norm": 0.004308796487748623, + "learning_rate": 8.96234633458456e-06, + "loss": 0.0028, + "num_input_tokens_seen": 163780288, + "step": 134600 + }, + { + "epoch": 14.991090321862123, + "grad_norm": 0.005744137801229954, + "learning_rate": 8.960482514991555e-06, + "loss": 0.0087, + "num_input_tokens_seen": 163786304, + "step": 134605 + }, + { + "epoch": 14.99164717674574, + "grad_norm": 0.004130836576223373, + "learning_rate": 8.958618846904646e-06, + "loss": 0.0063, + "num_input_tokens_seen": 163792384, + "step": 134610 + }, + { + "epoch": 14.992204031629358, + "grad_norm": 0.29521167278289795, + "learning_rate": 8.956755330341424e-06, + "loss": 0.0642, + "num_input_tokens_seen": 163798656, + "step": 134615 + }, + { + "epoch": 14.992760886512976, + "grad_norm": 1.1644123792648315, + "learning_rate": 8.95489196531951e-06, + "loss": 0.04, + "num_input_tokens_seen": 163804800, + "step": 134620 + }, + { + "epoch": 14.993317741396591, + "grad_norm": 0.0009403322474099696, + "learning_rate": 8.953028751856487e-06, + "loss": 0.1009, + "num_input_tokens_seen": 163810816, + "step": 134625 + }, + { + "epoch": 14.99387459628021, + "grad_norm": 0.12388036400079727, + "learning_rate": 8.951165689969982e-06, + "loss": 0.0106, + "num_input_tokens_seen": 163817280, + "step": 134630 + }, + { + "epoch": 14.994431451163827, + "grad_norm": 0.14695937931537628, + "learning_rate": 8.94930277967756e-06, + "loss": 0.1175, + "num_input_tokens_seen": 163823424, + "step": 134635 + }, + { + "epoch": 14.994988306047444, + "grad_norm": 0.00011676499707391486, + "learning_rate": 8.947440020996838e-06, + "loss": 0.0263, + "num_input_tokens_seen": 163829440, + "step": 134640 + }, + { + "epoch": 14.995545160931062, + "grad_norm": 0.00011693190026562661, + "learning_rate": 8.9455774139454e-06, + "loss": 0.0116, + "num_input_tokens_seen": 163835680, + "step": 134645 + }, + { + "epoch": 14.996102015814678, + "grad_norm": 0.041891906410455704, + "learning_rate": 8.943714958540852e-06, + "loss": 0.0172, + "num_input_tokens_seen": 163842208, + "step": 134650 + }, + { + "epoch": 14.996658870698296, + "grad_norm": 0.047047268599271774, + "learning_rate": 8.941852654800784e-06, + "loss": 0.0374, + "num_input_tokens_seen": 163847936, + "step": 134655 + }, + { + "epoch": 14.997215725581913, + "grad_norm": 1.3679882287979126, + "learning_rate": 8.939990502742782e-06, + "loss": 0.2188, + "num_input_tokens_seen": 163853696, + "step": 134660 + }, + { + "epoch": 14.997772580465531, + "grad_norm": 1.8555192947387695, + "learning_rate": 8.938128502384426e-06, + "loss": 0.0613, + "num_input_tokens_seen": 163859968, + "step": 134665 + }, + { + "epoch": 14.998329435349149, + "grad_norm": 0.00475125340744853, + "learning_rate": 8.936266653743325e-06, + "loss": 0.0862, + "num_input_tokens_seen": 163866208, + "step": 134670 + }, + { + "epoch": 14.998886290232765, + "grad_norm": 0.0033196350559592247, + "learning_rate": 8.934404956837055e-06, + "loss": 0.0381, + "num_input_tokens_seen": 163872416, + "step": 134675 + }, + { + "epoch": 14.999443145116382, + "grad_norm": 0.005075558088719845, + "learning_rate": 8.932543411683204e-06, + "loss": 0.0444, + "num_input_tokens_seen": 163878656, + "step": 134680 + }, + { + "epoch": 15.0, + "grad_norm": 0.0002570000942796469, + "learning_rate": 8.930682018299342e-06, + "loss": 0.0414, + "num_input_tokens_seen": 163884192, + "step": 134685 + }, + { + "epoch": 15.0, + "eval_loss": 0.0805651843547821, + "eval_runtime": 112.1371, + "eval_samples_per_second": 35.59, + "eval_steps_per_second": 8.9, + "num_input_tokens_seen": 163884192, + "step": 134685 + }, + { + "epoch": 15.000556854883618, + "grad_norm": 0.010721441358327866, + "learning_rate": 8.928820776703073e-06, + "loss": 0.0041, + "num_input_tokens_seen": 163890464, + "step": 134690 + }, + { + "epoch": 15.001113709767235, + "grad_norm": 0.0034458653535693884, + "learning_rate": 8.926959686911959e-06, + "loss": 0.0651, + "num_input_tokens_seen": 163896736, + "step": 134695 + }, + { + "epoch": 15.001670564650851, + "grad_norm": 0.022914299741387367, + "learning_rate": 8.925098748943594e-06, + "loss": 0.0311, + "num_input_tokens_seen": 163903072, + "step": 134700 + }, + { + "epoch": 15.002227419534469, + "grad_norm": 0.9193957448005676, + "learning_rate": 8.923237962815555e-06, + "loss": 0.0172, + "num_input_tokens_seen": 163908832, + "step": 134705 + }, + { + "epoch": 15.002784274418087, + "grad_norm": 0.5438243746757507, + "learning_rate": 8.921377328545411e-06, + "loss": 0.0859, + "num_input_tokens_seen": 163914912, + "step": 134710 + }, + { + "epoch": 15.003341129301704, + "grad_norm": 0.2334200143814087, + "learning_rate": 8.919516846150732e-06, + "loss": 0.1196, + "num_input_tokens_seen": 163921056, + "step": 134715 + }, + { + "epoch": 15.003897984185322, + "grad_norm": 0.1789204627275467, + "learning_rate": 8.917656515649109e-06, + "loss": 0.0105, + "num_input_tokens_seen": 163927552, + "step": 134720 + }, + { + "epoch": 15.004454839068938, + "grad_norm": 1.1888079643249512, + "learning_rate": 8.915796337058106e-06, + "loss": 0.0244, + "num_input_tokens_seen": 163933568, + "step": 134725 + }, + { + "epoch": 15.005011693952556, + "grad_norm": 0.06555982679128647, + "learning_rate": 8.913936310395291e-06, + "loss": 0.1122, + "num_input_tokens_seen": 163939840, + "step": 134730 + }, + { + "epoch": 15.005568548836173, + "grad_norm": 0.5402480363845825, + "learning_rate": 8.912076435678229e-06, + "loss": 0.1354, + "num_input_tokens_seen": 163945984, + "step": 134735 + }, + { + "epoch": 15.00612540371979, + "grad_norm": 1.9566196203231812, + "learning_rate": 8.910216712924501e-06, + "loss": 0.1102, + "num_input_tokens_seen": 163951936, + "step": 134740 + }, + { + "epoch": 15.006682258603409, + "grad_norm": 1.9820305109024048, + "learning_rate": 8.908357142151661e-06, + "loss": 0.0583, + "num_input_tokens_seen": 163958176, + "step": 134745 + }, + { + "epoch": 15.007239113487024, + "grad_norm": 0.03111713007092476, + "learning_rate": 8.906497723377294e-06, + "loss": 0.0811, + "num_input_tokens_seen": 163964352, + "step": 134750 + }, + { + "epoch": 15.007795968370642, + "grad_norm": 0.11884793639183044, + "learning_rate": 8.904638456618936e-06, + "loss": 0.0089, + "num_input_tokens_seen": 163970688, + "step": 134755 + }, + { + "epoch": 15.00835282325426, + "grad_norm": 0.013322481885552406, + "learning_rate": 8.902779341894168e-06, + "loss": 0.1113, + "num_input_tokens_seen": 163976704, + "step": 134760 + }, + { + "epoch": 15.008909678137877, + "grad_norm": 0.5403599143028259, + "learning_rate": 8.900920379220543e-06, + "loss": 0.0094, + "num_input_tokens_seen": 163982624, + "step": 134765 + }, + { + "epoch": 15.009466533021495, + "grad_norm": 0.1654871702194214, + "learning_rate": 8.899061568615627e-06, + "loss": 0.0115, + "num_input_tokens_seen": 163988736, + "step": 134770 + }, + { + "epoch": 15.010023387905113, + "grad_norm": 0.07439576089382172, + "learning_rate": 8.897202910096977e-06, + "loss": 0.1274, + "num_input_tokens_seen": 163994592, + "step": 134775 + }, + { + "epoch": 15.010580242788729, + "grad_norm": 0.07277409732341766, + "learning_rate": 8.895344403682147e-06, + "loss": 0.007, + "num_input_tokens_seen": 164000544, + "step": 134780 + }, + { + "epoch": 15.011137097672346, + "grad_norm": 0.25094226002693176, + "learning_rate": 8.89348604938868e-06, + "loss": 0.0032, + "num_input_tokens_seen": 164006048, + "step": 134785 + }, + { + "epoch": 15.011693952555964, + "grad_norm": 0.9542445540428162, + "learning_rate": 8.891627847234152e-06, + "loss": 0.0327, + "num_input_tokens_seen": 164012128, + "step": 134790 + }, + { + "epoch": 15.012250807439582, + "grad_norm": 0.13305917382240295, + "learning_rate": 8.889769797236105e-06, + "loss": 0.0204, + "num_input_tokens_seen": 164017856, + "step": 134795 + }, + { + "epoch": 15.0128076623232, + "grad_norm": 0.10330872237682343, + "learning_rate": 8.887911899412091e-06, + "loss": 0.0307, + "num_input_tokens_seen": 164024224, + "step": 134800 + }, + { + "epoch": 15.013364517206815, + "grad_norm": 0.30540311336517334, + "learning_rate": 8.886054153779647e-06, + "loss": 0.0465, + "num_input_tokens_seen": 164030144, + "step": 134805 + }, + { + "epoch": 15.013921372090433, + "grad_norm": 0.008282197639346123, + "learning_rate": 8.884196560356341e-06, + "loss": 0.0942, + "num_input_tokens_seen": 164036544, + "step": 134810 + }, + { + "epoch": 15.01447822697405, + "grad_norm": 0.0002090974012389779, + "learning_rate": 8.882339119159702e-06, + "loss": 0.0001, + "num_input_tokens_seen": 164042496, + "step": 134815 + }, + { + "epoch": 15.015035081857668, + "grad_norm": 0.03719909489154816, + "learning_rate": 8.880481830207302e-06, + "loss": 0.066, + "num_input_tokens_seen": 164048512, + "step": 134820 + }, + { + "epoch": 15.015591936741286, + "grad_norm": 0.051630206406116486, + "learning_rate": 8.878624693516646e-06, + "loss": 0.0217, + "num_input_tokens_seen": 164054592, + "step": 134825 + }, + { + "epoch": 15.016148791624902, + "grad_norm": 0.41940227150917053, + "learning_rate": 8.876767709105308e-06, + "loss": 0.0208, + "num_input_tokens_seen": 164060928, + "step": 134830 + }, + { + "epoch": 15.01670564650852, + "grad_norm": 1.9393761157989502, + "learning_rate": 8.874910876990805e-06, + "loss": 0.0986, + "num_input_tokens_seen": 164066720, + "step": 134835 + }, + { + "epoch": 15.017262501392137, + "grad_norm": 0.17654143273830414, + "learning_rate": 8.873054197190697e-06, + "loss": 0.0102, + "num_input_tokens_seen": 164072736, + "step": 134840 + }, + { + "epoch": 15.017819356275755, + "grad_norm": 0.37534299492836, + "learning_rate": 8.871197669722515e-06, + "loss": 0.0384, + "num_input_tokens_seen": 164078912, + "step": 134845 + }, + { + "epoch": 15.018376211159373, + "grad_norm": 0.08539997041225433, + "learning_rate": 8.869341294603792e-06, + "loss": 0.014, + "num_input_tokens_seen": 164085280, + "step": 134850 + }, + { + "epoch": 15.018933066042989, + "grad_norm": 0.10854235291481018, + "learning_rate": 8.867485071852053e-06, + "loss": 0.013, + "num_input_tokens_seen": 164091360, + "step": 134855 + }, + { + "epoch": 15.019489920926606, + "grad_norm": 0.01173497550189495, + "learning_rate": 8.865629001484853e-06, + "loss": 0.0764, + "num_input_tokens_seen": 164097728, + "step": 134860 + }, + { + "epoch": 15.020046775810224, + "grad_norm": 0.033325884491205215, + "learning_rate": 8.863773083519716e-06, + "loss": 0.0074, + "num_input_tokens_seen": 164103968, + "step": 134865 + }, + { + "epoch": 15.020603630693842, + "grad_norm": 1.2026679515838623, + "learning_rate": 8.861917317974166e-06, + "loss": 0.0249, + "num_input_tokens_seen": 164110112, + "step": 134870 + }, + { + "epoch": 15.02116048557746, + "grad_norm": 0.0031282338313758373, + "learning_rate": 8.860061704865733e-06, + "loss": 0.0147, + "num_input_tokens_seen": 164116352, + "step": 134875 + }, + { + "epoch": 15.021717340461075, + "grad_norm": 0.20313310623168945, + "learning_rate": 8.858206244211953e-06, + "loss": 0.0164, + "num_input_tokens_seen": 164122816, + "step": 134880 + }, + { + "epoch": 15.022274195344693, + "grad_norm": 1.3159308433532715, + "learning_rate": 8.856350936030342e-06, + "loss": 0.0917, + "num_input_tokens_seen": 164128992, + "step": 134885 + }, + { + "epoch": 15.02283105022831, + "grad_norm": 0.019669773057103157, + "learning_rate": 8.854495780338436e-06, + "loss": 0.0022, + "num_input_tokens_seen": 164135392, + "step": 134890 + }, + { + "epoch": 15.023387905111928, + "grad_norm": 0.09238331764936447, + "learning_rate": 8.852640777153754e-06, + "loss": 0.0315, + "num_input_tokens_seen": 164141280, + "step": 134895 + }, + { + "epoch": 15.023944759995546, + "grad_norm": 0.036828044801950455, + "learning_rate": 8.850785926493819e-06, + "loss": 0.0716, + "num_input_tokens_seen": 164147104, + "step": 134900 + }, + { + "epoch": 15.024501614879162, + "grad_norm": 0.0576833039522171, + "learning_rate": 8.848931228376136e-06, + "loss": 0.0093, + "num_input_tokens_seen": 164153152, + "step": 134905 + }, + { + "epoch": 15.02505846976278, + "grad_norm": 0.0007661264389753342, + "learning_rate": 8.847076682818251e-06, + "loss": 0.0318, + "num_input_tokens_seen": 164159328, + "step": 134910 + }, + { + "epoch": 15.025615324646397, + "grad_norm": 1.6116937398910522, + "learning_rate": 8.845222289837666e-06, + "loss": 0.2002, + "num_input_tokens_seen": 164165440, + "step": 134915 + }, + { + "epoch": 15.026172179530015, + "grad_norm": 0.0020668492652475834, + "learning_rate": 8.8433680494519e-06, + "loss": 0.0122, + "num_input_tokens_seen": 164171456, + "step": 134920 + }, + { + "epoch": 15.026729034413632, + "grad_norm": 0.48766231536865234, + "learning_rate": 8.841513961678457e-06, + "loss": 0.0096, + "num_input_tokens_seen": 164177600, + "step": 134925 + }, + { + "epoch": 15.027285889297248, + "grad_norm": 0.7464370131492615, + "learning_rate": 8.83966002653487e-06, + "loss": 0.0268, + "num_input_tokens_seen": 164183744, + "step": 134930 + }, + { + "epoch": 15.027842744180866, + "grad_norm": 0.01305758859962225, + "learning_rate": 8.837806244038635e-06, + "loss": 0.0069, + "num_input_tokens_seen": 164189824, + "step": 134935 + }, + { + "epoch": 15.028399599064484, + "grad_norm": 0.16515280306339264, + "learning_rate": 8.835952614207285e-06, + "loss": 0.0271, + "num_input_tokens_seen": 164195744, + "step": 134940 + }, + { + "epoch": 15.028956453948101, + "grad_norm": 0.0008271600818261504, + "learning_rate": 8.834099137058304e-06, + "loss": 0.0266, + "num_input_tokens_seen": 164202336, + "step": 134945 + }, + { + "epoch": 15.029513308831719, + "grad_norm": 0.031200164929032326, + "learning_rate": 8.832245812609203e-06, + "loss": 0.0212, + "num_input_tokens_seen": 164208736, + "step": 134950 + }, + { + "epoch": 15.030070163715337, + "grad_norm": 0.08294343948364258, + "learning_rate": 8.830392640877497e-06, + "loss": 0.0178, + "num_input_tokens_seen": 164214688, + "step": 134955 + }, + { + "epoch": 15.030627018598953, + "grad_norm": 1.4746501445770264, + "learning_rate": 8.828539621880682e-06, + "loss": 0.1106, + "num_input_tokens_seen": 164220672, + "step": 134960 + }, + { + "epoch": 15.03118387348257, + "grad_norm": 0.0038335416465997696, + "learning_rate": 8.826686755636283e-06, + "loss": 0.0302, + "num_input_tokens_seen": 164226976, + "step": 134965 + }, + { + "epoch": 15.031740728366188, + "grad_norm": 0.3483237028121948, + "learning_rate": 8.824834042161767e-06, + "loss": 0.0326, + "num_input_tokens_seen": 164233280, + "step": 134970 + }, + { + "epoch": 15.032297583249806, + "grad_norm": 0.0035515842027962208, + "learning_rate": 8.822981481474662e-06, + "loss": 0.0252, + "num_input_tokens_seen": 164239584, + "step": 134975 + }, + { + "epoch": 15.032854438133423, + "grad_norm": 0.403259813785553, + "learning_rate": 8.821129073592452e-06, + "loss": 0.0953, + "num_input_tokens_seen": 164245568, + "step": 134980 + }, + { + "epoch": 15.03341129301704, + "grad_norm": 0.46159836649894714, + "learning_rate": 8.819276818532646e-06, + "loss": 0.0076, + "num_input_tokens_seen": 164252064, + "step": 134985 + }, + { + "epoch": 15.033968147900657, + "grad_norm": 0.7790855169296265, + "learning_rate": 8.817424716312736e-06, + "loss": 0.0611, + "num_input_tokens_seen": 164258048, + "step": 134990 + }, + { + "epoch": 15.034525002784275, + "grad_norm": 0.2706703841686249, + "learning_rate": 8.815572766950211e-06, + "loss": 0.0275, + "num_input_tokens_seen": 164263872, + "step": 134995 + }, + { + "epoch": 15.035081857667892, + "grad_norm": 0.7262081503868103, + "learning_rate": 8.813720970462563e-06, + "loss": 0.0141, + "num_input_tokens_seen": 164270368, + "step": 135000 + }, + { + "epoch": 15.03563871255151, + "grad_norm": 0.0035308445803821087, + "learning_rate": 8.811869326867297e-06, + "loss": 0.0598, + "num_input_tokens_seen": 164276704, + "step": 135005 + }, + { + "epoch": 15.036195567435126, + "grad_norm": 0.00026361786876805127, + "learning_rate": 8.810017836181895e-06, + "loss": 0.0181, + "num_input_tokens_seen": 164282720, + "step": 135010 + }, + { + "epoch": 15.036752422318743, + "grad_norm": 0.16884668171405792, + "learning_rate": 8.808166498423844e-06, + "loss": 0.0682, + "num_input_tokens_seen": 164288992, + "step": 135015 + }, + { + "epoch": 15.037309277202361, + "grad_norm": 0.3802889585494995, + "learning_rate": 8.806315313610625e-06, + "loss": 0.0273, + "num_input_tokens_seen": 164294880, + "step": 135020 + }, + { + "epoch": 15.037866132085979, + "grad_norm": 0.27957791090011597, + "learning_rate": 8.804464281759742e-06, + "loss": 0.0068, + "num_input_tokens_seen": 164300800, + "step": 135025 + }, + { + "epoch": 15.038422986969596, + "grad_norm": 0.08145146816968918, + "learning_rate": 8.80261340288866e-06, + "loss": 0.0231, + "num_input_tokens_seen": 164307136, + "step": 135030 + }, + { + "epoch": 15.038979841853212, + "grad_norm": 0.016942959278821945, + "learning_rate": 8.80076267701488e-06, + "loss": 0.0948, + "num_input_tokens_seen": 164312640, + "step": 135035 + }, + { + "epoch": 15.03953669673683, + "grad_norm": 0.1631069928407669, + "learning_rate": 8.798912104155873e-06, + "loss": 0.0018, + "num_input_tokens_seen": 164318912, + "step": 135040 + }, + { + "epoch": 15.040093551620448, + "grad_norm": 0.012119092047214508, + "learning_rate": 8.797061684329125e-06, + "loss": 0.07, + "num_input_tokens_seen": 164324928, + "step": 135045 + }, + { + "epoch": 15.040650406504065, + "grad_norm": 0.08377183228731155, + "learning_rate": 8.795211417552101e-06, + "loss": 0.0154, + "num_input_tokens_seen": 164331136, + "step": 135050 + }, + { + "epoch": 15.041207261387683, + "grad_norm": 0.11355030536651611, + "learning_rate": 8.793361303842295e-06, + "loss": 0.024, + "num_input_tokens_seen": 164336448, + "step": 135055 + }, + { + "epoch": 15.041764116271299, + "grad_norm": 0.3658309280872345, + "learning_rate": 8.79151134321718e-06, + "loss": 0.0142, + "num_input_tokens_seen": 164342560, + "step": 135060 + }, + { + "epoch": 15.042320971154917, + "grad_norm": 1.4422156810760498, + "learning_rate": 8.789661535694224e-06, + "loss": 0.0272, + "num_input_tokens_seen": 164348768, + "step": 135065 + }, + { + "epoch": 15.042877826038534, + "grad_norm": 0.0235500056296587, + "learning_rate": 8.787811881290894e-06, + "loss": 0.0115, + "num_input_tokens_seen": 164355104, + "step": 135070 + }, + { + "epoch": 15.043434680922152, + "grad_norm": 0.00027517799753695726, + "learning_rate": 8.785962380024679e-06, + "loss": 0.1188, + "num_input_tokens_seen": 164361088, + "step": 135075 + }, + { + "epoch": 15.04399153580577, + "grad_norm": 1.1677219867706299, + "learning_rate": 8.784113031913039e-06, + "loss": 0.0081, + "num_input_tokens_seen": 164367296, + "step": 135080 + }, + { + "epoch": 15.044548390689386, + "grad_norm": 0.07224267721176147, + "learning_rate": 8.782263836973443e-06, + "loss": 0.0032, + "num_input_tokens_seen": 164373248, + "step": 135085 + }, + { + "epoch": 15.045105245573003, + "grad_norm": 0.14517174661159515, + "learning_rate": 8.780414795223348e-06, + "loss": 0.039, + "num_input_tokens_seen": 164379392, + "step": 135090 + }, + { + "epoch": 15.045662100456621, + "grad_norm": 1.3333160877227783, + "learning_rate": 8.778565906680242e-06, + "loss": 0.0307, + "num_input_tokens_seen": 164384864, + "step": 135095 + }, + { + "epoch": 15.046218955340239, + "grad_norm": 0.05868047475814819, + "learning_rate": 8.776717171361567e-06, + "loss": 0.0783, + "num_input_tokens_seen": 164390976, + "step": 135100 + }, + { + "epoch": 15.046775810223856, + "grad_norm": 0.0047740330919623375, + "learning_rate": 8.774868589284806e-06, + "loss": 0.0127, + "num_input_tokens_seen": 164396928, + "step": 135105 + }, + { + "epoch": 15.047332665107472, + "grad_norm": 0.08860386908054352, + "learning_rate": 8.77302016046741e-06, + "loss": 0.0146, + "num_input_tokens_seen": 164403168, + "step": 135110 + }, + { + "epoch": 15.04788951999109, + "grad_norm": 0.471855491399765, + "learning_rate": 8.77117188492684e-06, + "loss": 0.1507, + "num_input_tokens_seen": 164409120, + "step": 135115 + }, + { + "epoch": 15.048446374874707, + "grad_norm": 0.15932855010032654, + "learning_rate": 8.769323762680545e-06, + "loss": 0.0026, + "num_input_tokens_seen": 164414880, + "step": 135120 + }, + { + "epoch": 15.049003229758325, + "grad_norm": 0.9381309747695923, + "learning_rate": 8.767475793746e-06, + "loss": 0.0158, + "num_input_tokens_seen": 164420992, + "step": 135125 + }, + { + "epoch": 15.049560084641943, + "grad_norm": 0.494728684425354, + "learning_rate": 8.76562797814065e-06, + "loss": 0.0967, + "num_input_tokens_seen": 164426880, + "step": 135130 + }, + { + "epoch": 15.05011693952556, + "grad_norm": 0.03241511434316635, + "learning_rate": 8.76378031588195e-06, + "loss": 0.0531, + "num_input_tokens_seen": 164432320, + "step": 135135 + }, + { + "epoch": 15.050673794409176, + "grad_norm": 0.4459938704967499, + "learning_rate": 8.761932806987346e-06, + "loss": 0.0081, + "num_input_tokens_seen": 164438528, + "step": 135140 + }, + { + "epoch": 15.051230649292794, + "grad_norm": 0.03197076916694641, + "learning_rate": 8.760085451474307e-06, + "loss": 0.0323, + "num_input_tokens_seen": 164445152, + "step": 135145 + }, + { + "epoch": 15.051787504176412, + "grad_norm": 0.04870474711060524, + "learning_rate": 8.75823824936026e-06, + "loss": 0.0164, + "num_input_tokens_seen": 164451392, + "step": 135150 + }, + { + "epoch": 15.05234435906003, + "grad_norm": 0.07242454588413239, + "learning_rate": 8.756391200662683e-06, + "loss": 0.0465, + "num_input_tokens_seen": 164457632, + "step": 135155 + }, + { + "epoch": 15.052901213943647, + "grad_norm": 1.552917718887329, + "learning_rate": 8.75454430539899e-06, + "loss": 0.1054, + "num_input_tokens_seen": 164463552, + "step": 135160 + }, + { + "epoch": 15.053458068827263, + "grad_norm": 0.2938184142112732, + "learning_rate": 8.752697563586648e-06, + "loss": 0.047, + "num_input_tokens_seen": 164469632, + "step": 135165 + }, + { + "epoch": 15.05401492371088, + "grad_norm": 0.01653527468442917, + "learning_rate": 8.75085097524309e-06, + "loss": 0.0275, + "num_input_tokens_seen": 164475712, + "step": 135170 + }, + { + "epoch": 15.054571778594498, + "grad_norm": 0.8876051306724548, + "learning_rate": 8.749004540385766e-06, + "loss": 0.013, + "num_input_tokens_seen": 164481824, + "step": 135175 + }, + { + "epoch": 15.055128633478116, + "grad_norm": 0.2815383970737457, + "learning_rate": 8.747158259032118e-06, + "loss": 0.0891, + "num_input_tokens_seen": 164487904, + "step": 135180 + }, + { + "epoch": 15.055685488361734, + "grad_norm": 0.0010700594866648316, + "learning_rate": 8.745312131199581e-06, + "loss": 0.0732, + "num_input_tokens_seen": 164493728, + "step": 135185 + }, + { + "epoch": 15.05624234324535, + "grad_norm": 0.0035215977113693953, + "learning_rate": 8.743466156905586e-06, + "loss": 0.0532, + "num_input_tokens_seen": 164499744, + "step": 135190 + }, + { + "epoch": 15.056799198128967, + "grad_norm": 0.11952485144138336, + "learning_rate": 8.741620336167586e-06, + "loss": 0.0341, + "num_input_tokens_seen": 164505792, + "step": 135195 + }, + { + "epoch": 15.057356053012585, + "grad_norm": 0.0011270296527072787, + "learning_rate": 8.739774669003006e-06, + "loss": 0.006, + "num_input_tokens_seen": 164512288, + "step": 135200 + }, + { + "epoch": 15.057912907896203, + "grad_norm": 0.06752708554267883, + "learning_rate": 8.737929155429283e-06, + "loss": 0.033, + "num_input_tokens_seen": 164518336, + "step": 135205 + }, + { + "epoch": 15.05846976277982, + "grad_norm": 0.503567636013031, + "learning_rate": 8.73608379546384e-06, + "loss": 0.0779, + "num_input_tokens_seen": 164524576, + "step": 135210 + }, + { + "epoch": 15.059026617663436, + "grad_norm": 0.00031508051324635744, + "learning_rate": 8.734238589124124e-06, + "loss": 0.0132, + "num_input_tokens_seen": 164530656, + "step": 135215 + }, + { + "epoch": 15.059583472547054, + "grad_norm": 0.08692409843206406, + "learning_rate": 8.73239353642755e-06, + "loss": 0.0137, + "num_input_tokens_seen": 164536896, + "step": 135220 + }, + { + "epoch": 15.060140327430672, + "grad_norm": 0.13548444211483002, + "learning_rate": 8.730548637391566e-06, + "loss": 0.1096, + "num_input_tokens_seen": 164543072, + "step": 135225 + }, + { + "epoch": 15.06069718231429, + "grad_norm": 0.10532710701227188, + "learning_rate": 8.728703892033571e-06, + "loss": 0.0999, + "num_input_tokens_seen": 164549024, + "step": 135230 + }, + { + "epoch": 15.061254037197907, + "grad_norm": 0.21515819430351257, + "learning_rate": 8.72685930037101e-06, + "loss": 0.0094, + "num_input_tokens_seen": 164555168, + "step": 135235 + }, + { + "epoch": 15.061810892081523, + "grad_norm": 0.235055074095726, + "learning_rate": 8.725014862421293e-06, + "loss": 0.0224, + "num_input_tokens_seen": 164560672, + "step": 135240 + }, + { + "epoch": 15.06236774696514, + "grad_norm": 0.003426068928092718, + "learning_rate": 8.723170578201862e-06, + "loss": 0.0373, + "num_input_tokens_seen": 164566912, + "step": 135245 + }, + { + "epoch": 15.062924601848758, + "grad_norm": 0.24470677971839905, + "learning_rate": 8.721326447730122e-06, + "loss": 0.0634, + "num_input_tokens_seen": 164573152, + "step": 135250 + }, + { + "epoch": 15.063481456732376, + "grad_norm": 0.1096336841583252, + "learning_rate": 8.719482471023496e-06, + "loss": 0.0126, + "num_input_tokens_seen": 164579168, + "step": 135255 + }, + { + "epoch": 15.064038311615993, + "grad_norm": 0.029691636562347412, + "learning_rate": 8.717638648099394e-06, + "loss": 0.0383, + "num_input_tokens_seen": 164585152, + "step": 135260 + }, + { + "epoch": 15.06459516649961, + "grad_norm": 7.645231380593032e-05, + "learning_rate": 8.715794978975248e-06, + "loss": 0.0025, + "num_input_tokens_seen": 164591200, + "step": 135265 + }, + { + "epoch": 15.065152021383227, + "grad_norm": 0.15331073105335236, + "learning_rate": 8.713951463668465e-06, + "loss": 0.0319, + "num_input_tokens_seen": 164597472, + "step": 135270 + }, + { + "epoch": 15.065708876266845, + "grad_norm": 0.38961803913116455, + "learning_rate": 8.712108102196459e-06, + "loss": 0.016, + "num_input_tokens_seen": 164603392, + "step": 135275 + }, + { + "epoch": 15.066265731150462, + "grad_norm": 0.06476225703954697, + "learning_rate": 8.710264894576634e-06, + "loss": 0.0026, + "num_input_tokens_seen": 164609408, + "step": 135280 + }, + { + "epoch": 15.06682258603408, + "grad_norm": 2.4121761322021484, + "learning_rate": 8.708421840826417e-06, + "loss": 0.1307, + "num_input_tokens_seen": 164614912, + "step": 135285 + }, + { + "epoch": 15.067379440917696, + "grad_norm": 0.014073868282139301, + "learning_rate": 8.706578940963198e-06, + "loss": 0.0421, + "num_input_tokens_seen": 164621024, + "step": 135290 + }, + { + "epoch": 15.067936295801314, + "grad_norm": 2.323529005050659, + "learning_rate": 8.704736195004405e-06, + "loss": 0.137, + "num_input_tokens_seen": 164627008, + "step": 135295 + }, + { + "epoch": 15.068493150684931, + "grad_norm": 1.129847526550293, + "learning_rate": 8.702893602967432e-06, + "loss": 0.0238, + "num_input_tokens_seen": 164633248, + "step": 135300 + }, + { + "epoch": 15.069050005568549, + "grad_norm": 0.0060736313462257385, + "learning_rate": 8.701051164869686e-06, + "loss": 0.0084, + "num_input_tokens_seen": 164639232, + "step": 135305 + }, + { + "epoch": 15.069606860452167, + "grad_norm": 1.2591544389724731, + "learning_rate": 8.699208880728565e-06, + "loss": 0.0172, + "num_input_tokens_seen": 164644800, + "step": 135310 + }, + { + "epoch": 15.070163715335784, + "grad_norm": 0.20102699100971222, + "learning_rate": 8.697366750561484e-06, + "loss": 0.04, + "num_input_tokens_seen": 164650976, + "step": 135315 + }, + { + "epoch": 15.0707205702194, + "grad_norm": 0.003354146145284176, + "learning_rate": 8.695524774385832e-06, + "loss": 0.0064, + "num_input_tokens_seen": 164656928, + "step": 135320 + }, + { + "epoch": 15.071277425103018, + "grad_norm": 0.007219262886792421, + "learning_rate": 8.693682952219015e-06, + "loss": 0.0161, + "num_input_tokens_seen": 164663104, + "step": 135325 + }, + { + "epoch": 15.071834279986636, + "grad_norm": 0.04589034989476204, + "learning_rate": 8.691841284078417e-06, + "loss": 0.1393, + "num_input_tokens_seen": 164669088, + "step": 135330 + }, + { + "epoch": 15.072391134870253, + "grad_norm": 0.371986448764801, + "learning_rate": 8.689999769981452e-06, + "loss": 0.0409, + "num_input_tokens_seen": 164675264, + "step": 135335 + }, + { + "epoch": 15.072947989753871, + "grad_norm": 0.17121918499469757, + "learning_rate": 8.688158409945499e-06, + "loss": 0.013, + "num_input_tokens_seen": 164681472, + "step": 135340 + }, + { + "epoch": 15.073504844637487, + "grad_norm": 0.014452151954174042, + "learning_rate": 8.686317203987977e-06, + "loss": 0.0604, + "num_input_tokens_seen": 164687712, + "step": 135345 + }, + { + "epoch": 15.074061699521105, + "grad_norm": 0.18567770719528198, + "learning_rate": 8.684476152126239e-06, + "loss": 0.0206, + "num_input_tokens_seen": 164693760, + "step": 135350 + }, + { + "epoch": 15.074618554404722, + "grad_norm": 0.00021097045100759715, + "learning_rate": 8.682635254377705e-06, + "loss": 0.0018, + "num_input_tokens_seen": 164700320, + "step": 135355 + }, + { + "epoch": 15.07517540928834, + "grad_norm": 0.5809826850891113, + "learning_rate": 8.680794510759754e-06, + "loss": 0.0246, + "num_input_tokens_seen": 164706272, + "step": 135360 + }, + { + "epoch": 15.075732264171958, + "grad_norm": 0.15884767472743988, + "learning_rate": 8.678953921289767e-06, + "loss": 0.0291, + "num_input_tokens_seen": 164712576, + "step": 135365 + }, + { + "epoch": 15.076289119055573, + "grad_norm": 0.048741262406110764, + "learning_rate": 8.677113485985153e-06, + "loss": 0.0036, + "num_input_tokens_seen": 164718112, + "step": 135370 + }, + { + "epoch": 15.076845973939191, + "grad_norm": 0.08796080946922302, + "learning_rate": 8.67527320486326e-06, + "loss": 0.0717, + "num_input_tokens_seen": 164723552, + "step": 135375 + }, + { + "epoch": 15.077402828822809, + "grad_norm": 0.0002307360846316442, + "learning_rate": 8.673433077941503e-06, + "loss": 0.0141, + "num_input_tokens_seen": 164729696, + "step": 135380 + }, + { + "epoch": 15.077959683706426, + "grad_norm": 0.06078166514635086, + "learning_rate": 8.671593105237241e-06, + "loss": 0.0152, + "num_input_tokens_seen": 164735904, + "step": 135385 + }, + { + "epoch": 15.078516538590044, + "grad_norm": 0.12536577880382538, + "learning_rate": 8.669753286767874e-06, + "loss": 0.0897, + "num_input_tokens_seen": 164741472, + "step": 135390 + }, + { + "epoch": 15.07907339347366, + "grad_norm": 0.1141502633690834, + "learning_rate": 8.667913622550769e-06, + "loss": 0.0561, + "num_input_tokens_seen": 164747456, + "step": 135395 + }, + { + "epoch": 15.079630248357278, + "grad_norm": 0.0036484687589108944, + "learning_rate": 8.666074112603303e-06, + "loss": 0.0291, + "num_input_tokens_seen": 164753312, + "step": 135400 + }, + { + "epoch": 15.080187103240895, + "grad_norm": 0.022602057084441185, + "learning_rate": 8.664234756942849e-06, + "loss": 0.005, + "num_input_tokens_seen": 164759296, + "step": 135405 + }, + { + "epoch": 15.080743958124513, + "grad_norm": 0.0320589579641819, + "learning_rate": 8.662395555586791e-06, + "loss": 0.1136, + "num_input_tokens_seen": 164765312, + "step": 135410 + }, + { + "epoch": 15.08130081300813, + "grad_norm": 0.39410316944122314, + "learning_rate": 8.660556508552501e-06, + "loss": 0.006, + "num_input_tokens_seen": 164771520, + "step": 135415 + }, + { + "epoch": 15.081857667891747, + "grad_norm": 0.36646905541419983, + "learning_rate": 8.658717615857343e-06, + "loss": 0.0202, + "num_input_tokens_seen": 164776800, + "step": 135420 + }, + { + "epoch": 15.082414522775364, + "grad_norm": 0.44146957993507385, + "learning_rate": 8.65687887751868e-06, + "loss": 0.0273, + "num_input_tokens_seen": 164783008, + "step": 135425 + }, + { + "epoch": 15.082971377658982, + "grad_norm": 0.0008642423781566322, + "learning_rate": 8.655040293553898e-06, + "loss": 0.016, + "num_input_tokens_seen": 164789344, + "step": 135430 + }, + { + "epoch": 15.0835282325426, + "grad_norm": 0.22646477818489075, + "learning_rate": 8.653201863980348e-06, + "loss": 0.0062, + "num_input_tokens_seen": 164795552, + "step": 135435 + }, + { + "epoch": 15.084085087426217, + "grad_norm": 0.47846874594688416, + "learning_rate": 8.651363588815414e-06, + "loss": 0.0084, + "num_input_tokens_seen": 164801952, + "step": 135440 + }, + { + "epoch": 15.084641942309833, + "grad_norm": 0.5082579255104065, + "learning_rate": 8.649525468076447e-06, + "loss": 0.0722, + "num_input_tokens_seen": 164808192, + "step": 135445 + }, + { + "epoch": 15.085198797193451, + "grad_norm": 1.065006971359253, + "learning_rate": 8.647687501780813e-06, + "loss": 0.0298, + "num_input_tokens_seen": 164814368, + "step": 135450 + }, + { + "epoch": 15.085755652077069, + "grad_norm": 0.0005813827738165855, + "learning_rate": 8.645849689945863e-06, + "loss": 0.0544, + "num_input_tokens_seen": 164820480, + "step": 135455 + }, + { + "epoch": 15.086312506960686, + "grad_norm": 0.9760822653770447, + "learning_rate": 8.644012032588971e-06, + "loss": 0.0485, + "num_input_tokens_seen": 164826592, + "step": 135460 + }, + { + "epoch": 15.086869361844304, + "grad_norm": 1.4630811214447021, + "learning_rate": 8.642174529727492e-06, + "loss": 0.1465, + "num_input_tokens_seen": 164832992, + "step": 135465 + }, + { + "epoch": 15.08742621672792, + "grad_norm": 0.00020103085262235254, + "learning_rate": 8.640337181378782e-06, + "loss": 0.055, + "num_input_tokens_seen": 164839424, + "step": 135470 + }, + { + "epoch": 15.087983071611538, + "grad_norm": 0.1137123852968216, + "learning_rate": 8.638499987560183e-06, + "loss": 0.0107, + "num_input_tokens_seen": 164845600, + "step": 135475 + }, + { + "epoch": 15.088539926495155, + "grad_norm": 0.20153369009494781, + "learning_rate": 8.63666294828907e-06, + "loss": 0.007, + "num_input_tokens_seen": 164851584, + "step": 135480 + }, + { + "epoch": 15.089096781378773, + "grad_norm": 0.00023867283016443253, + "learning_rate": 8.634826063582777e-06, + "loss": 0.0013, + "num_input_tokens_seen": 164857856, + "step": 135485 + }, + { + "epoch": 15.08965363626239, + "grad_norm": 0.059385836124420166, + "learning_rate": 8.632989333458682e-06, + "loss": 0.0538, + "num_input_tokens_seen": 164864256, + "step": 135490 + }, + { + "epoch": 15.090210491146008, + "grad_norm": 0.0009001379949040711, + "learning_rate": 8.631152757934097e-06, + "loss": 0.188, + "num_input_tokens_seen": 164870528, + "step": 135495 + }, + { + "epoch": 15.090767346029624, + "grad_norm": 1.7030311822891235, + "learning_rate": 8.629316337026396e-06, + "loss": 0.055, + "num_input_tokens_seen": 164876320, + "step": 135500 + }, + { + "epoch": 15.091324200913242, + "grad_norm": 0.0015509043587371707, + "learning_rate": 8.627480070752911e-06, + "loss": 0.0215, + "num_input_tokens_seen": 164882592, + "step": 135505 + }, + { + "epoch": 15.09188105579686, + "grad_norm": 0.004605855327099562, + "learning_rate": 8.625643959131002e-06, + "loss": 0.004, + "num_input_tokens_seen": 164889024, + "step": 135510 + }, + { + "epoch": 15.092437910680477, + "grad_norm": 0.02680426649749279, + "learning_rate": 8.623808002178007e-06, + "loss": 0.0035, + "num_input_tokens_seen": 164895008, + "step": 135515 + }, + { + "epoch": 15.092994765564095, + "grad_norm": 0.13399763405323029, + "learning_rate": 8.621972199911263e-06, + "loss": 0.109, + "num_input_tokens_seen": 164900896, + "step": 135520 + }, + { + "epoch": 15.09355162044771, + "grad_norm": 0.7085047364234924, + "learning_rate": 8.620136552348107e-06, + "loss": 0.0178, + "num_input_tokens_seen": 164906496, + "step": 135525 + }, + { + "epoch": 15.094108475331328, + "grad_norm": 0.6019113659858704, + "learning_rate": 8.618301059505892e-06, + "loss": 0.0158, + "num_input_tokens_seen": 164912320, + "step": 135530 + }, + { + "epoch": 15.094665330214946, + "grad_norm": 0.009740477427840233, + "learning_rate": 8.616465721401948e-06, + "loss": 0.0071, + "num_input_tokens_seen": 164918688, + "step": 135535 + }, + { + "epoch": 15.095222185098564, + "grad_norm": 0.14062832295894623, + "learning_rate": 8.614630538053615e-06, + "loss": 0.0661, + "num_input_tokens_seen": 164924768, + "step": 135540 + }, + { + "epoch": 15.095779039982181, + "grad_norm": 0.5309106111526489, + "learning_rate": 8.612795509478212e-06, + "loss": 0.0631, + "num_input_tokens_seen": 164930208, + "step": 135545 + }, + { + "epoch": 15.096335894865797, + "grad_norm": 0.7731629014015198, + "learning_rate": 8.610960635693094e-06, + "loss": 0.119, + "num_input_tokens_seen": 164936480, + "step": 135550 + }, + { + "epoch": 15.096892749749415, + "grad_norm": 0.1952662169933319, + "learning_rate": 8.609125916715573e-06, + "loss": 0.0147, + "num_input_tokens_seen": 164942688, + "step": 135555 + }, + { + "epoch": 15.097449604633033, + "grad_norm": 0.005801135674118996, + "learning_rate": 8.60729135256301e-06, + "loss": 0.0084, + "num_input_tokens_seen": 164948800, + "step": 135560 + }, + { + "epoch": 15.09800645951665, + "grad_norm": 0.10474108159542084, + "learning_rate": 8.605456943252696e-06, + "loss": 0.0287, + "num_input_tokens_seen": 164955008, + "step": 135565 + }, + { + "epoch": 15.098563314400268, + "grad_norm": 1.8458218574523926, + "learning_rate": 8.603622688801985e-06, + "loss": 0.072, + "num_input_tokens_seen": 164960896, + "step": 135570 + }, + { + "epoch": 15.099120169283884, + "grad_norm": 0.0008374659810215235, + "learning_rate": 8.601788589228185e-06, + "loss": 0.0479, + "num_input_tokens_seen": 164966880, + "step": 135575 + }, + { + "epoch": 15.099677024167502, + "grad_norm": 0.1167682409286499, + "learning_rate": 8.599954644548639e-06, + "loss": 0.0128, + "num_input_tokens_seen": 164973184, + "step": 135580 + }, + { + "epoch": 15.10023387905112, + "grad_norm": 0.020925011485815048, + "learning_rate": 8.598120854780659e-06, + "loss": 0.004, + "num_input_tokens_seen": 164979360, + "step": 135585 + }, + { + "epoch": 15.100790733934737, + "grad_norm": 0.1376204788684845, + "learning_rate": 8.59628721994157e-06, + "loss": 0.0219, + "num_input_tokens_seen": 164984960, + "step": 135590 + }, + { + "epoch": 15.101347588818355, + "grad_norm": 0.0007319919532164931, + "learning_rate": 8.594453740048683e-06, + "loss": 0.021, + "num_input_tokens_seen": 164991104, + "step": 135595 + }, + { + "epoch": 15.10190444370197, + "grad_norm": 0.4859743118286133, + "learning_rate": 8.592620415119332e-06, + "loss": 0.0954, + "num_input_tokens_seen": 164997216, + "step": 135600 + }, + { + "epoch": 15.102461298585588, + "grad_norm": 0.006020290311425924, + "learning_rate": 8.590787245170826e-06, + "loss": 0.0035, + "num_input_tokens_seen": 165003296, + "step": 135605 + }, + { + "epoch": 15.103018153469206, + "grad_norm": 0.4670819938182831, + "learning_rate": 8.588954230220481e-06, + "loss": 0.0228, + "num_input_tokens_seen": 165009472, + "step": 135610 + }, + { + "epoch": 15.103575008352824, + "grad_norm": 0.012586895376443863, + "learning_rate": 8.587121370285603e-06, + "loss": 0.0084, + "num_input_tokens_seen": 165015744, + "step": 135615 + }, + { + "epoch": 15.104131863236441, + "grad_norm": 0.001510055037215352, + "learning_rate": 8.585288665383523e-06, + "loss": 0.005, + "num_input_tokens_seen": 165021504, + "step": 135620 + }, + { + "epoch": 15.104688718120057, + "grad_norm": 0.5856953859329224, + "learning_rate": 8.583456115531535e-06, + "loss": 0.0272, + "num_input_tokens_seen": 165027776, + "step": 135625 + }, + { + "epoch": 15.105245573003675, + "grad_norm": 0.9943732023239136, + "learning_rate": 8.581623720746973e-06, + "loss": 0.0888, + "num_input_tokens_seen": 165033664, + "step": 135630 + }, + { + "epoch": 15.105802427887292, + "grad_norm": 0.010621662251651287, + "learning_rate": 8.579791481047111e-06, + "loss": 0.0381, + "num_input_tokens_seen": 165040096, + "step": 135635 + }, + { + "epoch": 15.10635928277091, + "grad_norm": 0.17074763774871826, + "learning_rate": 8.577959396449284e-06, + "loss": 0.0034, + "num_input_tokens_seen": 165046144, + "step": 135640 + }, + { + "epoch": 15.106916137654528, + "grad_norm": 0.6446183919906616, + "learning_rate": 8.57612746697078e-06, + "loss": 0.0219, + "num_input_tokens_seen": 165052224, + "step": 135645 + }, + { + "epoch": 15.107472992538144, + "grad_norm": 0.5899809002876282, + "learning_rate": 8.574295692628917e-06, + "loss": 0.0272, + "num_input_tokens_seen": 165058304, + "step": 135650 + }, + { + "epoch": 15.108029847421761, + "grad_norm": 0.4043891429901123, + "learning_rate": 8.572464073440992e-06, + "loss": 0.0322, + "num_input_tokens_seen": 165064384, + "step": 135655 + }, + { + "epoch": 15.108586702305379, + "grad_norm": 0.4504908323287964, + "learning_rate": 8.570632609424303e-06, + "loss": 0.0638, + "num_input_tokens_seen": 165070592, + "step": 135660 + }, + { + "epoch": 15.109143557188997, + "grad_norm": 0.11017804592847824, + "learning_rate": 8.568801300596146e-06, + "loss": 0.0136, + "num_input_tokens_seen": 165076736, + "step": 135665 + }, + { + "epoch": 15.109700412072614, + "grad_norm": 0.010492842644453049, + "learning_rate": 8.566970146973836e-06, + "loss": 0.009, + "num_input_tokens_seen": 165082880, + "step": 135670 + }, + { + "epoch": 15.110257266956232, + "grad_norm": 0.4465811252593994, + "learning_rate": 8.565139148574655e-06, + "loss": 0.0733, + "num_input_tokens_seen": 165088512, + "step": 135675 + }, + { + "epoch": 15.110814121839848, + "grad_norm": 0.43303778767585754, + "learning_rate": 8.563308305415905e-06, + "loss": 0.0079, + "num_input_tokens_seen": 165094400, + "step": 135680 + }, + { + "epoch": 15.111370976723466, + "grad_norm": 0.7784727215766907, + "learning_rate": 8.561477617514865e-06, + "loss": 0.0659, + "num_input_tokens_seen": 165100320, + "step": 135685 + }, + { + "epoch": 15.111927831607083, + "grad_norm": 0.0022689918987452984, + "learning_rate": 8.55964708488885e-06, + "loss": 0.0686, + "num_input_tokens_seen": 165106528, + "step": 135690 + }, + { + "epoch": 15.112484686490701, + "grad_norm": 0.47612982988357544, + "learning_rate": 8.55781670755513e-06, + "loss": 0.0111, + "num_input_tokens_seen": 165112256, + "step": 135695 + }, + { + "epoch": 15.113041541374319, + "grad_norm": 0.9151254296302795, + "learning_rate": 8.555986485531014e-06, + "loss": 0.0291, + "num_input_tokens_seen": 165118496, + "step": 135700 + }, + { + "epoch": 15.113598396257935, + "grad_norm": 0.002331915544345975, + "learning_rate": 8.55415641883378e-06, + "loss": 0.0333, + "num_input_tokens_seen": 165124480, + "step": 135705 + }, + { + "epoch": 15.114155251141552, + "grad_norm": 0.5271823406219482, + "learning_rate": 8.552326507480717e-06, + "loss": 0.0534, + "num_input_tokens_seen": 165130688, + "step": 135710 + }, + { + "epoch": 15.11471210602517, + "grad_norm": 1.7599276304244995, + "learning_rate": 8.550496751489097e-06, + "loss": 0.0327, + "num_input_tokens_seen": 165136672, + "step": 135715 + }, + { + "epoch": 15.115268960908788, + "grad_norm": 1.294384479522705, + "learning_rate": 8.548667150876224e-06, + "loss": 0.0626, + "num_input_tokens_seen": 165142848, + "step": 135720 + }, + { + "epoch": 15.115825815792405, + "grad_norm": 0.00018768808513414115, + "learning_rate": 8.546837705659371e-06, + "loss": 0.0599, + "num_input_tokens_seen": 165149024, + "step": 135725 + }, + { + "epoch": 15.116382670676021, + "grad_norm": 1.301428198814392, + "learning_rate": 8.545008415855815e-06, + "loss": 0.0675, + "num_input_tokens_seen": 165154560, + "step": 135730 + }, + { + "epoch": 15.116939525559639, + "grad_norm": 0.010990952141582966, + "learning_rate": 8.543179281482832e-06, + "loss": 0.0035, + "num_input_tokens_seen": 165160672, + "step": 135735 + }, + { + "epoch": 15.117496380443256, + "grad_norm": 0.010365305468440056, + "learning_rate": 8.541350302557713e-06, + "loss": 0.0162, + "num_input_tokens_seen": 165166752, + "step": 135740 + }, + { + "epoch": 15.118053235326874, + "grad_norm": 1.7525813579559326, + "learning_rate": 8.53952147909772e-06, + "loss": 0.0416, + "num_input_tokens_seen": 165172704, + "step": 135745 + }, + { + "epoch": 15.118610090210492, + "grad_norm": 0.37834110856056213, + "learning_rate": 8.537692811120149e-06, + "loss": 0.0053, + "num_input_tokens_seen": 165178784, + "step": 135750 + }, + { + "epoch": 15.119166945094108, + "grad_norm": 0.2050672322511673, + "learning_rate": 8.535864298642244e-06, + "loss": 0.0165, + "num_input_tokens_seen": 165184768, + "step": 135755 + }, + { + "epoch": 15.119723799977725, + "grad_norm": 0.09868577122688293, + "learning_rate": 8.534035941681299e-06, + "loss": 0.0474, + "num_input_tokens_seen": 165191008, + "step": 135760 + }, + { + "epoch": 15.120280654861343, + "grad_norm": 0.18730390071868896, + "learning_rate": 8.532207740254578e-06, + "loss": 0.0035, + "num_input_tokens_seen": 165197248, + "step": 135765 + }, + { + "epoch": 15.12083750974496, + "grad_norm": 4.288725852966309, + "learning_rate": 8.53037969437934e-06, + "loss": 0.1364, + "num_input_tokens_seen": 165203168, + "step": 135770 + }, + { + "epoch": 15.121394364628578, + "grad_norm": 0.002450550440698862, + "learning_rate": 8.528551804072877e-06, + "loss": 0.0591, + "num_input_tokens_seen": 165209248, + "step": 135775 + }, + { + "epoch": 15.121951219512194, + "grad_norm": 0.24028462171554565, + "learning_rate": 8.526724069352422e-06, + "loss": 0.004, + "num_input_tokens_seen": 165215552, + "step": 135780 + }, + { + "epoch": 15.122508074395812, + "grad_norm": 1.2420248985290527, + "learning_rate": 8.524896490235264e-06, + "loss": 0.0812, + "num_input_tokens_seen": 165221728, + "step": 135785 + }, + { + "epoch": 15.12306492927943, + "grad_norm": 0.202137291431427, + "learning_rate": 8.52306906673865e-06, + "loss": 0.0256, + "num_input_tokens_seen": 165227872, + "step": 135790 + }, + { + "epoch": 15.123621784163047, + "grad_norm": 0.603761613368988, + "learning_rate": 8.521241798879859e-06, + "loss": 0.0309, + "num_input_tokens_seen": 165233984, + "step": 135795 + }, + { + "epoch": 15.124178639046665, + "grad_norm": 0.7595005035400391, + "learning_rate": 8.519414686676141e-06, + "loss": 0.0999, + "num_input_tokens_seen": 165239808, + "step": 135800 + }, + { + "epoch": 15.124735493930281, + "grad_norm": 0.016362899914383888, + "learning_rate": 8.517587730144758e-06, + "loss": 0.0106, + "num_input_tokens_seen": 165245824, + "step": 135805 + }, + { + "epoch": 15.125292348813899, + "grad_norm": 0.0066276490688323975, + "learning_rate": 8.515760929302955e-06, + "loss": 0.0202, + "num_input_tokens_seen": 165251744, + "step": 135810 + }, + { + "epoch": 15.125849203697516, + "grad_norm": 0.05329539254307747, + "learning_rate": 8.513934284168002e-06, + "loss": 0.0229, + "num_input_tokens_seen": 165258016, + "step": 135815 + }, + { + "epoch": 15.126406058581134, + "grad_norm": 0.002914014272391796, + "learning_rate": 8.512107794757152e-06, + "loss": 0.0005, + "num_input_tokens_seen": 165264384, + "step": 135820 + }, + { + "epoch": 15.126962913464752, + "grad_norm": 0.017855627462267876, + "learning_rate": 8.510281461087652e-06, + "loss": 0.0009, + "num_input_tokens_seen": 165270816, + "step": 135825 + }, + { + "epoch": 15.127519768348368, + "grad_norm": 0.21540962159633636, + "learning_rate": 8.508455283176747e-06, + "loss": 0.0146, + "num_input_tokens_seen": 165276736, + "step": 135830 + }, + { + "epoch": 15.128076623231985, + "grad_norm": 0.056412223726511, + "learning_rate": 8.506629261041702e-06, + "loss": 0.0935, + "num_input_tokens_seen": 165282528, + "step": 135835 + }, + { + "epoch": 15.128633478115603, + "grad_norm": 0.00011532779171830043, + "learning_rate": 8.50480339469975e-06, + "loss": 0.0053, + "num_input_tokens_seen": 165288544, + "step": 135840 + }, + { + "epoch": 15.12919033299922, + "grad_norm": 0.154355987906456, + "learning_rate": 8.502977684168156e-06, + "loss": 0.0014, + "num_input_tokens_seen": 165294848, + "step": 135845 + }, + { + "epoch": 15.129747187882838, + "grad_norm": 0.0002532034704927355, + "learning_rate": 8.501152129464152e-06, + "loss": 0.0984, + "num_input_tokens_seen": 165301120, + "step": 135850 + }, + { + "epoch": 15.130304042766456, + "grad_norm": 0.2718011140823364, + "learning_rate": 8.499326730604987e-06, + "loss": 0.0885, + "num_input_tokens_seen": 165307072, + "step": 135855 + }, + { + "epoch": 15.130860897650072, + "grad_norm": 1.6616069078445435, + "learning_rate": 8.497501487607893e-06, + "loss": 0.2493, + "num_input_tokens_seen": 165312736, + "step": 135860 + }, + { + "epoch": 15.13141775253369, + "grad_norm": 0.42488402128219604, + "learning_rate": 8.495676400490124e-06, + "loss": 0.0731, + "num_input_tokens_seen": 165318944, + "step": 135865 + }, + { + "epoch": 15.131974607417307, + "grad_norm": 0.0027788409497588873, + "learning_rate": 8.493851469268919e-06, + "loss": 0.0768, + "num_input_tokens_seen": 165325024, + "step": 135870 + }, + { + "epoch": 15.132531462300925, + "grad_norm": 0.09990999102592468, + "learning_rate": 8.492026693961507e-06, + "loss": 0.01, + "num_input_tokens_seen": 165331168, + "step": 135875 + }, + { + "epoch": 15.133088317184543, + "grad_norm": 0.14678360521793365, + "learning_rate": 8.490202074585125e-06, + "loss": 0.0041, + "num_input_tokens_seen": 165337408, + "step": 135880 + }, + { + "epoch": 15.133645172068158, + "grad_norm": 0.00010999414371326566, + "learning_rate": 8.488377611157016e-06, + "loss": 0.0303, + "num_input_tokens_seen": 165343200, + "step": 135885 + }, + { + "epoch": 15.134202026951776, + "grad_norm": 0.03611382842063904, + "learning_rate": 8.486553303694403e-06, + "loss": 0.0077, + "num_input_tokens_seen": 165349248, + "step": 135890 + }, + { + "epoch": 15.134758881835394, + "grad_norm": 0.00013413256965577602, + "learning_rate": 8.484729152214541e-06, + "loss": 0.0507, + "num_input_tokens_seen": 165355520, + "step": 135895 + }, + { + "epoch": 15.135315736719011, + "grad_norm": 0.0024050166830420494, + "learning_rate": 8.482905156734628e-06, + "loss": 0.0073, + "num_input_tokens_seen": 165361792, + "step": 135900 + }, + { + "epoch": 15.135872591602629, + "grad_norm": 0.002211812185123563, + "learning_rate": 8.481081317271917e-06, + "loss": 0.0593, + "num_input_tokens_seen": 165367968, + "step": 135905 + }, + { + "epoch": 15.136429446486245, + "grad_norm": 1.0637941360473633, + "learning_rate": 8.479257633843619e-06, + "loss": 0.0502, + "num_input_tokens_seen": 165374112, + "step": 135910 + }, + { + "epoch": 15.136986301369863, + "grad_norm": 1.0115242004394531, + "learning_rate": 8.477434106466975e-06, + "loss": 0.2257, + "num_input_tokens_seen": 165380256, + "step": 135915 + }, + { + "epoch": 15.13754315625348, + "grad_norm": 0.039113596081733704, + "learning_rate": 8.475610735159207e-06, + "loss": 0.1377, + "num_input_tokens_seen": 165385920, + "step": 135920 + }, + { + "epoch": 15.138100011137098, + "grad_norm": 0.6361060738563538, + "learning_rate": 8.473787519937534e-06, + "loss": 0.0204, + "num_input_tokens_seen": 165391936, + "step": 135925 + }, + { + "epoch": 15.138656866020716, + "grad_norm": 2.1458942890167236, + "learning_rate": 8.471964460819167e-06, + "loss": 0.0532, + "num_input_tokens_seen": 165397984, + "step": 135930 + }, + { + "epoch": 15.139213720904332, + "grad_norm": 0.4421967566013336, + "learning_rate": 8.470141557821351e-06, + "loss": 0.0606, + "num_input_tokens_seen": 165404224, + "step": 135935 + }, + { + "epoch": 15.13977057578795, + "grad_norm": 0.00929633155465126, + "learning_rate": 8.46831881096129e-06, + "loss": 0.0157, + "num_input_tokens_seen": 165410432, + "step": 135940 + }, + { + "epoch": 15.140327430671567, + "grad_norm": 0.0007839277968741953, + "learning_rate": 8.466496220256202e-06, + "loss": 0.0011, + "num_input_tokens_seen": 165416672, + "step": 135945 + }, + { + "epoch": 15.140884285555185, + "grad_norm": 0.0001495707838330418, + "learning_rate": 8.464673785723293e-06, + "loss": 0.014, + "num_input_tokens_seen": 165422784, + "step": 135950 + }, + { + "epoch": 15.141441140438802, + "grad_norm": 0.0011837328784167767, + "learning_rate": 8.462851507379799e-06, + "loss": 0.0535, + "num_input_tokens_seen": 165428608, + "step": 135955 + }, + { + "epoch": 15.141997995322418, + "grad_norm": 0.27982065081596375, + "learning_rate": 8.461029385242914e-06, + "loss": 0.0369, + "num_input_tokens_seen": 165435104, + "step": 135960 + }, + { + "epoch": 15.142554850206036, + "grad_norm": 0.005348204635083675, + "learning_rate": 8.459207419329874e-06, + "loss": 0.1012, + "num_input_tokens_seen": 165440896, + "step": 135965 + }, + { + "epoch": 15.143111705089654, + "grad_norm": 1.7565217018127441, + "learning_rate": 8.457385609657853e-06, + "loss": 0.0306, + "num_input_tokens_seen": 165446880, + "step": 135970 + }, + { + "epoch": 15.143668559973271, + "grad_norm": 0.8224751353263855, + "learning_rate": 8.45556395624409e-06, + "loss": 0.096, + "num_input_tokens_seen": 165453056, + "step": 135975 + }, + { + "epoch": 15.144225414856889, + "grad_norm": 0.007448985241353512, + "learning_rate": 8.453742459105767e-06, + "loss": 0.1657, + "num_input_tokens_seen": 165459328, + "step": 135980 + }, + { + "epoch": 15.144782269740505, + "grad_norm": 0.027695296332240105, + "learning_rate": 8.451921118260116e-06, + "loss": 0.0597, + "num_input_tokens_seen": 165465408, + "step": 135985 + }, + { + "epoch": 15.145339124624122, + "grad_norm": 1.5112682580947876, + "learning_rate": 8.450099933724328e-06, + "loss": 0.0341, + "num_input_tokens_seen": 165471040, + "step": 135990 + }, + { + "epoch": 15.14589597950774, + "grad_norm": 0.021772991865873337, + "learning_rate": 8.448278905515605e-06, + "loss": 0.0187, + "num_input_tokens_seen": 165477152, + "step": 135995 + }, + { + "epoch": 15.146452834391358, + "grad_norm": 0.007125556468963623, + "learning_rate": 8.44645803365114e-06, + "loss": 0.031, + "num_input_tokens_seen": 165483264, + "step": 136000 + }, + { + "epoch": 15.147009689274975, + "grad_norm": 0.01017944049090147, + "learning_rate": 8.44463731814815e-06, + "loss": 0.0075, + "num_input_tokens_seen": 165489184, + "step": 136005 + }, + { + "epoch": 15.147566544158593, + "grad_norm": 0.3155440390110016, + "learning_rate": 8.442816759023826e-06, + "loss": 0.0419, + "num_input_tokens_seen": 165495328, + "step": 136010 + }, + { + "epoch": 15.148123399042209, + "grad_norm": 0.5275053381919861, + "learning_rate": 8.440996356295361e-06, + "loss": 0.0069, + "num_input_tokens_seen": 165501600, + "step": 136015 + }, + { + "epoch": 15.148680253925827, + "grad_norm": 0.20906144380569458, + "learning_rate": 8.439176109979944e-06, + "loss": 0.033, + "num_input_tokens_seen": 165507648, + "step": 136020 + }, + { + "epoch": 15.149237108809444, + "grad_norm": 0.0027208151295781136, + "learning_rate": 8.437356020094786e-06, + "loss": 0.0306, + "num_input_tokens_seen": 165513056, + "step": 136025 + }, + { + "epoch": 15.149793963693062, + "grad_norm": 0.12509866058826447, + "learning_rate": 8.435536086657062e-06, + "loss": 0.0305, + "num_input_tokens_seen": 165519264, + "step": 136030 + }, + { + "epoch": 15.15035081857668, + "grad_norm": 0.00027358633815310895, + "learning_rate": 8.433716309683979e-06, + "loss": 0.0001, + "num_input_tokens_seen": 165525664, + "step": 136035 + }, + { + "epoch": 15.150907673460296, + "grad_norm": 0.12903685867786407, + "learning_rate": 8.431896689192717e-06, + "loss": 0.0073, + "num_input_tokens_seen": 165531712, + "step": 136040 + }, + { + "epoch": 15.151464528343913, + "grad_norm": 0.8574044704437256, + "learning_rate": 8.430077225200466e-06, + "loss": 0.0211, + "num_input_tokens_seen": 165537792, + "step": 136045 + }, + { + "epoch": 15.152021383227531, + "grad_norm": 2.043686866760254, + "learning_rate": 8.428257917724402e-06, + "loss": 0.1289, + "num_input_tokens_seen": 165543712, + "step": 136050 + }, + { + "epoch": 15.152578238111149, + "grad_norm": 0.3224480450153351, + "learning_rate": 8.42643876678173e-06, + "loss": 0.1368, + "num_input_tokens_seen": 165549824, + "step": 136055 + }, + { + "epoch": 15.153135092994766, + "grad_norm": 2.579547882080078, + "learning_rate": 8.42461977238962e-06, + "loss": 0.1084, + "num_input_tokens_seen": 165556032, + "step": 136060 + }, + { + "epoch": 15.153691947878382, + "grad_norm": 0.5156447291374207, + "learning_rate": 8.422800934565256e-06, + "loss": 0.0814, + "num_input_tokens_seen": 165561600, + "step": 136065 + }, + { + "epoch": 15.154248802762, + "grad_norm": 0.08059270679950714, + "learning_rate": 8.420982253325813e-06, + "loss": 0.0721, + "num_input_tokens_seen": 165567680, + "step": 136070 + }, + { + "epoch": 15.154805657645618, + "grad_norm": 1.5662871599197388, + "learning_rate": 8.419163728688481e-06, + "loss": 0.0628, + "num_input_tokens_seen": 165573472, + "step": 136075 + }, + { + "epoch": 15.155362512529235, + "grad_norm": 1.1000322103500366, + "learning_rate": 8.417345360670433e-06, + "loss": 0.0375, + "num_input_tokens_seen": 165579968, + "step": 136080 + }, + { + "epoch": 15.155919367412853, + "grad_norm": 0.5515666007995605, + "learning_rate": 8.415527149288844e-06, + "loss": 0.1034, + "num_input_tokens_seen": 165585472, + "step": 136085 + }, + { + "epoch": 15.156476222296469, + "grad_norm": 2.3113958835601807, + "learning_rate": 8.41370909456088e-06, + "loss": 0.1961, + "num_input_tokens_seen": 165590784, + "step": 136090 + }, + { + "epoch": 15.157033077180087, + "grad_norm": 0.0724644660949707, + "learning_rate": 8.41189119650373e-06, + "loss": 0.0138, + "num_input_tokens_seen": 165597120, + "step": 136095 + }, + { + "epoch": 15.157589932063704, + "grad_norm": 0.030397502705454826, + "learning_rate": 8.410073455134549e-06, + "loss": 0.0483, + "num_input_tokens_seen": 165603360, + "step": 136100 + }, + { + "epoch": 15.158146786947322, + "grad_norm": 1.3138835430145264, + "learning_rate": 8.408255870470524e-06, + "loss": 0.1741, + "num_input_tokens_seen": 165609696, + "step": 136105 + }, + { + "epoch": 15.15870364183094, + "grad_norm": 1.4468480348587036, + "learning_rate": 8.40643844252882e-06, + "loss": 0.0276, + "num_input_tokens_seen": 165615648, + "step": 136110 + }, + { + "epoch": 15.159260496714555, + "grad_norm": 1.617923617362976, + "learning_rate": 8.404621171326593e-06, + "loss": 0.1659, + "num_input_tokens_seen": 165621952, + "step": 136115 + }, + { + "epoch": 15.159817351598173, + "grad_norm": 0.0037651555612683296, + "learning_rate": 8.402804056881011e-06, + "loss": 0.1411, + "num_input_tokens_seen": 165628192, + "step": 136120 + }, + { + "epoch": 15.16037420648179, + "grad_norm": 0.2708234190940857, + "learning_rate": 8.400987099209248e-06, + "loss": 0.0471, + "num_input_tokens_seen": 165634112, + "step": 136125 + }, + { + "epoch": 15.160931061365408, + "grad_norm": 0.12936347723007202, + "learning_rate": 8.399170298328462e-06, + "loss": 0.0024, + "num_input_tokens_seen": 165640416, + "step": 136130 + }, + { + "epoch": 15.161487916249026, + "grad_norm": 0.009225581772625446, + "learning_rate": 8.397353654255812e-06, + "loss": 0.0406, + "num_input_tokens_seen": 165646912, + "step": 136135 + }, + { + "epoch": 15.162044771132642, + "grad_norm": 0.02543811872601509, + "learning_rate": 8.395537167008452e-06, + "loss": 0.0185, + "num_input_tokens_seen": 165653280, + "step": 136140 + }, + { + "epoch": 15.16260162601626, + "grad_norm": 0.09250655770301819, + "learning_rate": 8.393720836603553e-06, + "loss": 0.0061, + "num_input_tokens_seen": 165659232, + "step": 136145 + }, + { + "epoch": 15.163158480899877, + "grad_norm": 1.7345656156539917, + "learning_rate": 8.39190466305826e-06, + "loss": 0.1358, + "num_input_tokens_seen": 165665152, + "step": 136150 + }, + { + "epoch": 15.163715335783495, + "grad_norm": 0.3162291944026947, + "learning_rate": 8.390088646389746e-06, + "loss": 0.0034, + "num_input_tokens_seen": 165671328, + "step": 136155 + }, + { + "epoch": 15.164272190667113, + "grad_norm": 0.6525737047195435, + "learning_rate": 8.38827278661514e-06, + "loss": 0.0272, + "num_input_tokens_seen": 165677344, + "step": 136160 + }, + { + "epoch": 15.164829045550729, + "grad_norm": 0.8720806241035461, + "learning_rate": 8.386457083751612e-06, + "loss": 0.0878, + "num_input_tokens_seen": 165683360, + "step": 136165 + }, + { + "epoch": 15.165385900434346, + "grad_norm": 0.001600915566086769, + "learning_rate": 8.384641537816299e-06, + "loss": 0.0007, + "num_input_tokens_seen": 165689472, + "step": 136170 + }, + { + "epoch": 15.165942755317964, + "grad_norm": 1.4219363927841187, + "learning_rate": 8.38282614882637e-06, + "loss": 0.0962, + "num_input_tokens_seen": 165695424, + "step": 136175 + }, + { + "epoch": 15.166499610201582, + "grad_norm": 0.000791211030445993, + "learning_rate": 8.381010916798967e-06, + "loss": 0.0228, + "num_input_tokens_seen": 165701696, + "step": 136180 + }, + { + "epoch": 15.1670564650852, + "grad_norm": 0.07791486382484436, + "learning_rate": 8.379195841751215e-06, + "loss": 0.0939, + "num_input_tokens_seen": 165707872, + "step": 136185 + }, + { + "epoch": 15.167613319968815, + "grad_norm": 0.04193326085805893, + "learning_rate": 8.377380923700282e-06, + "loss": 0.0022, + "num_input_tokens_seen": 165714112, + "step": 136190 + }, + { + "epoch": 15.168170174852433, + "grad_norm": 0.05979887396097183, + "learning_rate": 8.375566162663298e-06, + "loss": 0.0036, + "num_input_tokens_seen": 165720288, + "step": 136195 + }, + { + "epoch": 15.16872702973605, + "grad_norm": 0.13617943227291107, + "learning_rate": 8.373751558657416e-06, + "loss": 0.0077, + "num_input_tokens_seen": 165726656, + "step": 136200 + }, + { + "epoch": 15.169283884619668, + "grad_norm": 0.1869036853313446, + "learning_rate": 8.371937111699773e-06, + "loss": 0.0481, + "num_input_tokens_seen": 165732800, + "step": 136205 + }, + { + "epoch": 15.169840739503286, + "grad_norm": 0.8467721343040466, + "learning_rate": 8.370122821807508e-06, + "loss": 0.0275, + "num_input_tokens_seen": 165738816, + "step": 136210 + }, + { + "epoch": 15.170397594386904, + "grad_norm": 2.989560127258301, + "learning_rate": 8.368308688997747e-06, + "loss": 0.0624, + "num_input_tokens_seen": 165745024, + "step": 136215 + }, + { + "epoch": 15.17095444927052, + "grad_norm": 0.46578356623649597, + "learning_rate": 8.366494713287643e-06, + "loss": 0.0409, + "num_input_tokens_seen": 165750912, + "step": 136220 + }, + { + "epoch": 15.171511304154137, + "grad_norm": 0.12105181068181992, + "learning_rate": 8.364680894694324e-06, + "loss": 0.046, + "num_input_tokens_seen": 165756768, + "step": 136225 + }, + { + "epoch": 15.172068159037755, + "grad_norm": 0.00036009858013130724, + "learning_rate": 8.362867233234919e-06, + "loss": 0.0005, + "num_input_tokens_seen": 165763072, + "step": 136230 + }, + { + "epoch": 15.172625013921373, + "grad_norm": 0.07462128251791, + "learning_rate": 8.361053728926558e-06, + "loss": 0.0185, + "num_input_tokens_seen": 165769056, + "step": 136235 + }, + { + "epoch": 15.17318186880499, + "grad_norm": 0.0032401985954493284, + "learning_rate": 8.359240381786381e-06, + "loss": 0.0106, + "num_input_tokens_seen": 165775136, + "step": 136240 + }, + { + "epoch": 15.173738723688606, + "grad_norm": 0.035162243992090225, + "learning_rate": 8.357427191831505e-06, + "loss": 0.0533, + "num_input_tokens_seen": 165781408, + "step": 136245 + }, + { + "epoch": 15.174295578572224, + "grad_norm": 0.019279340282082558, + "learning_rate": 8.355614159079069e-06, + "loss": 0.0093, + "num_input_tokens_seen": 165786272, + "step": 136250 + }, + { + "epoch": 15.174852433455841, + "grad_norm": 0.018171532079577446, + "learning_rate": 8.353801283546194e-06, + "loss": 0.1036, + "num_input_tokens_seen": 165792224, + "step": 136255 + }, + { + "epoch": 15.17540928833946, + "grad_norm": 0.4325588345527649, + "learning_rate": 8.351988565250002e-06, + "loss": 0.0402, + "num_input_tokens_seen": 165798528, + "step": 136260 + }, + { + "epoch": 15.175966143223077, + "grad_norm": 0.00011255871504545212, + "learning_rate": 8.350176004207609e-06, + "loss": 0.0033, + "num_input_tokens_seen": 165804576, + "step": 136265 + }, + { + "epoch": 15.176522998106693, + "grad_norm": 0.385952889919281, + "learning_rate": 8.348363600436149e-06, + "loss": 0.0105, + "num_input_tokens_seen": 165810912, + "step": 136270 + }, + { + "epoch": 15.17707985299031, + "grad_norm": 0.15878984332084656, + "learning_rate": 8.346551353952739e-06, + "loss": 0.0698, + "num_input_tokens_seen": 165817664, + "step": 136275 + }, + { + "epoch": 15.177636707873928, + "grad_norm": 0.06667076796293259, + "learning_rate": 8.344739264774493e-06, + "loss": 0.0336, + "num_input_tokens_seen": 165823840, + "step": 136280 + }, + { + "epoch": 15.178193562757546, + "grad_norm": 0.002315173391252756, + "learning_rate": 8.342927332918519e-06, + "loss": 0.0029, + "num_input_tokens_seen": 165829952, + "step": 136285 + }, + { + "epoch": 15.178750417641163, + "grad_norm": 0.03514141961932182, + "learning_rate": 8.341115558401952e-06, + "loss": 0.0444, + "num_input_tokens_seen": 165835680, + "step": 136290 + }, + { + "epoch": 15.17930727252478, + "grad_norm": 0.006719070486724377, + "learning_rate": 8.339303941241886e-06, + "loss": 0.002, + "num_input_tokens_seen": 165841952, + "step": 136295 + }, + { + "epoch": 15.179864127408397, + "grad_norm": 0.017684990540146828, + "learning_rate": 8.337492481455458e-06, + "loss": 0.0081, + "num_input_tokens_seen": 165848256, + "step": 136300 + }, + { + "epoch": 15.180420982292015, + "grad_norm": 0.0018053504172712564, + "learning_rate": 8.335681179059748e-06, + "loss": 0.0954, + "num_input_tokens_seen": 165854336, + "step": 136305 + }, + { + "epoch": 15.180977837175632, + "grad_norm": 0.034968167543411255, + "learning_rate": 8.33387003407189e-06, + "loss": 0.0186, + "num_input_tokens_seen": 165860416, + "step": 136310 + }, + { + "epoch": 15.18153469205925, + "grad_norm": 0.02051622048020363, + "learning_rate": 8.332059046508972e-06, + "loss": 0.0023, + "num_input_tokens_seen": 165866464, + "step": 136315 + }, + { + "epoch": 15.182091546942866, + "grad_norm": 0.01627347618341446, + "learning_rate": 8.330248216388117e-06, + "loss": 0.0336, + "num_input_tokens_seen": 165872384, + "step": 136320 + }, + { + "epoch": 15.182648401826484, + "grad_norm": 0.01200492400676012, + "learning_rate": 8.328437543726428e-06, + "loss": 0.0075, + "num_input_tokens_seen": 165878912, + "step": 136325 + }, + { + "epoch": 15.183205256710101, + "grad_norm": 0.03686124458909035, + "learning_rate": 8.326627028541e-06, + "loss": 0.1084, + "num_input_tokens_seen": 165885152, + "step": 136330 + }, + { + "epoch": 15.183762111593719, + "grad_norm": 0.3249222934246063, + "learning_rate": 8.324816670848931e-06, + "loss": 0.0635, + "num_input_tokens_seen": 165891264, + "step": 136335 + }, + { + "epoch": 15.184318966477337, + "grad_norm": 0.18975567817687988, + "learning_rate": 8.323006470667336e-06, + "loss": 0.03, + "num_input_tokens_seen": 165897536, + "step": 136340 + }, + { + "epoch": 15.184875821360952, + "grad_norm": 0.014933235943317413, + "learning_rate": 8.321196428013305e-06, + "loss": 0.0015, + "num_input_tokens_seen": 165903520, + "step": 136345 + }, + { + "epoch": 15.18543267624457, + "grad_norm": 0.4900017976760864, + "learning_rate": 8.31938654290394e-06, + "loss": 0.0269, + "num_input_tokens_seen": 165908672, + "step": 136350 + }, + { + "epoch": 15.185989531128188, + "grad_norm": 0.748872697353363, + "learning_rate": 8.317576815356323e-06, + "loss": 0.0152, + "num_input_tokens_seen": 165914976, + "step": 136355 + }, + { + "epoch": 15.186546386011806, + "grad_norm": 1.5450975894927979, + "learning_rate": 8.315767245387568e-06, + "loss": 0.122, + "num_input_tokens_seen": 165921376, + "step": 136360 + }, + { + "epoch": 15.187103240895423, + "grad_norm": 0.00012559056631289423, + "learning_rate": 8.313957833014751e-06, + "loss": 0.0139, + "num_input_tokens_seen": 165927456, + "step": 136365 + }, + { + "epoch": 15.18766009577904, + "grad_norm": 0.14449411630630493, + "learning_rate": 8.312148578254986e-06, + "loss": 0.0439, + "num_input_tokens_seen": 165933632, + "step": 136370 + }, + { + "epoch": 15.188216950662657, + "grad_norm": 0.00704490439966321, + "learning_rate": 8.310339481125331e-06, + "loss": 0.0222, + "num_input_tokens_seen": 165939840, + "step": 136375 + }, + { + "epoch": 15.188773805546274, + "grad_norm": 0.019659049808979034, + "learning_rate": 8.308530541642901e-06, + "loss": 0.0053, + "num_input_tokens_seen": 165945632, + "step": 136380 + }, + { + "epoch": 15.189330660429892, + "grad_norm": 0.021913185715675354, + "learning_rate": 8.306721759824761e-06, + "loss": 0.0025, + "num_input_tokens_seen": 165951904, + "step": 136385 + }, + { + "epoch": 15.18988751531351, + "grad_norm": 0.0031922366470098495, + "learning_rate": 8.304913135688022e-06, + "loss": 0.0452, + "num_input_tokens_seen": 165958176, + "step": 136390 + }, + { + "epoch": 15.190444370197127, + "grad_norm": 0.00043526568333618343, + "learning_rate": 8.30310466924975e-06, + "loss": 0.0357, + "num_input_tokens_seen": 165964064, + "step": 136395 + }, + { + "epoch": 15.191001225080743, + "grad_norm": 0.06280919909477234, + "learning_rate": 8.301296360527033e-06, + "loss": 0.0048, + "num_input_tokens_seen": 165970272, + "step": 136400 + }, + { + "epoch": 15.191558079964361, + "grad_norm": 1.8061355352401733, + "learning_rate": 8.299488209536943e-06, + "loss": 0.0467, + "num_input_tokens_seen": 165976640, + "step": 136405 + }, + { + "epoch": 15.192114934847979, + "grad_norm": 0.006351328454911709, + "learning_rate": 8.297680216296574e-06, + "loss": 0.0089, + "num_input_tokens_seen": 165982592, + "step": 136410 + }, + { + "epoch": 15.192671789731596, + "grad_norm": 0.8761689066886902, + "learning_rate": 8.295872380822999e-06, + "loss": 0.0333, + "num_input_tokens_seen": 165988000, + "step": 136415 + }, + { + "epoch": 15.193228644615214, + "grad_norm": 0.8448179364204407, + "learning_rate": 8.294064703133292e-06, + "loss": 0.0286, + "num_input_tokens_seen": 165993824, + "step": 136420 + }, + { + "epoch": 15.19378549949883, + "grad_norm": 0.007590392604470253, + "learning_rate": 8.29225718324452e-06, + "loss": 0.0101, + "num_input_tokens_seen": 166000000, + "step": 136425 + }, + { + "epoch": 15.194342354382448, + "grad_norm": 0.0072119166143238544, + "learning_rate": 8.290449821173774e-06, + "loss": 0.0322, + "num_input_tokens_seen": 166005888, + "step": 136430 + }, + { + "epoch": 15.194899209266065, + "grad_norm": 1.1039161682128906, + "learning_rate": 8.288642616938106e-06, + "loss": 0.0722, + "num_input_tokens_seen": 166011520, + "step": 136435 + }, + { + "epoch": 15.195456064149683, + "grad_norm": 1.997235655784607, + "learning_rate": 8.286835570554608e-06, + "loss": 0.0363, + "num_input_tokens_seen": 166017888, + "step": 136440 + }, + { + "epoch": 15.1960129190333, + "grad_norm": 0.0036134421825408936, + "learning_rate": 8.285028682040339e-06, + "loss": 0.1075, + "num_input_tokens_seen": 166023872, + "step": 136445 + }, + { + "epoch": 15.196569773916917, + "grad_norm": 0.6116533875465393, + "learning_rate": 8.283221951412361e-06, + "loss": 0.0213, + "num_input_tokens_seen": 166030176, + "step": 136450 + }, + { + "epoch": 15.197126628800534, + "grad_norm": 0.6361989378929138, + "learning_rate": 8.281415378687742e-06, + "loss": 0.1471, + "num_input_tokens_seen": 166036192, + "step": 136455 + }, + { + "epoch": 15.197683483684152, + "grad_norm": 0.21109475195407867, + "learning_rate": 8.279608963883556e-06, + "loss": 0.0345, + "num_input_tokens_seen": 166042048, + "step": 136460 + }, + { + "epoch": 15.19824033856777, + "grad_norm": 0.044744834303855896, + "learning_rate": 8.27780270701686e-06, + "loss": 0.1008, + "num_input_tokens_seen": 166048032, + "step": 136465 + }, + { + "epoch": 15.198797193451387, + "grad_norm": 0.02071298472583294, + "learning_rate": 8.275996608104713e-06, + "loss": 0.0078, + "num_input_tokens_seen": 166054144, + "step": 136470 + }, + { + "epoch": 15.199354048335003, + "grad_norm": 0.00016598394722677767, + "learning_rate": 8.274190667164172e-06, + "loss": 0.0273, + "num_input_tokens_seen": 166060448, + "step": 136475 + }, + { + "epoch": 15.19991090321862, + "grad_norm": 0.007688567973673344, + "learning_rate": 8.272384884212305e-06, + "loss": 0.0889, + "num_input_tokens_seen": 166066720, + "step": 136480 + }, + { + "epoch": 15.200467758102238, + "grad_norm": 0.04126904904842377, + "learning_rate": 8.270579259266163e-06, + "loss": 0.0449, + "num_input_tokens_seen": 166072704, + "step": 136485 + }, + { + "epoch": 15.201024612985856, + "grad_norm": 0.4234125316143036, + "learning_rate": 8.268773792342812e-06, + "loss": 0.0111, + "num_input_tokens_seen": 166079072, + "step": 136490 + }, + { + "epoch": 15.201581467869474, + "grad_norm": 0.3423387110233307, + "learning_rate": 8.266968483459286e-06, + "loss": 0.0466, + "num_input_tokens_seen": 166085152, + "step": 136495 + }, + { + "epoch": 15.20213832275309, + "grad_norm": 0.4855247139930725, + "learning_rate": 8.265163332632656e-06, + "loss": 0.0357, + "num_input_tokens_seen": 166091264, + "step": 136500 + }, + { + "epoch": 15.202695177636707, + "grad_norm": 0.0024850131012499332, + "learning_rate": 8.263358339879956e-06, + "loss": 0.0873, + "num_input_tokens_seen": 166097312, + "step": 136505 + }, + { + "epoch": 15.203252032520325, + "grad_norm": 0.5774843692779541, + "learning_rate": 8.261553505218255e-06, + "loss": 0.0173, + "num_input_tokens_seen": 166103264, + "step": 136510 + }, + { + "epoch": 15.203808887403943, + "grad_norm": 0.003061415860429406, + "learning_rate": 8.259748828664593e-06, + "loss": 0.0947, + "num_input_tokens_seen": 166109344, + "step": 136515 + }, + { + "epoch": 15.20436574228756, + "grad_norm": 0.00013346225023269653, + "learning_rate": 8.257944310236015e-06, + "loss": 0.033, + "num_input_tokens_seen": 166115488, + "step": 136520 + }, + { + "epoch": 15.204922597171176, + "grad_norm": 1.1677428483963013, + "learning_rate": 8.256139949949557e-06, + "loss": 0.0384, + "num_input_tokens_seen": 166121312, + "step": 136525 + }, + { + "epoch": 15.205479452054794, + "grad_norm": 0.3030482232570648, + "learning_rate": 8.254335747822281e-06, + "loss": 0.007, + "num_input_tokens_seen": 166126816, + "step": 136530 + }, + { + "epoch": 15.206036306938412, + "grad_norm": 1.299725890159607, + "learning_rate": 8.252531703871219e-06, + "loss": 0.0376, + "num_input_tokens_seen": 166132992, + "step": 136535 + }, + { + "epoch": 15.20659316182203, + "grad_norm": 0.3924882411956787, + "learning_rate": 8.250727818113416e-06, + "loss": 0.0358, + "num_input_tokens_seen": 166139296, + "step": 136540 + }, + { + "epoch": 15.207150016705647, + "grad_norm": 1.879544973373413, + "learning_rate": 8.248924090565897e-06, + "loss": 0.0299, + "num_input_tokens_seen": 166145472, + "step": 136545 + }, + { + "epoch": 15.207706871589265, + "grad_norm": 0.1747966706752777, + "learning_rate": 8.247120521245721e-06, + "loss": 0.0088, + "num_input_tokens_seen": 166150976, + "step": 136550 + }, + { + "epoch": 15.20826372647288, + "grad_norm": 0.03799388185143471, + "learning_rate": 8.245317110169903e-06, + "loss": 0.0082, + "num_input_tokens_seen": 166156832, + "step": 136555 + }, + { + "epoch": 15.208820581356498, + "grad_norm": 1.2288627624511719, + "learning_rate": 8.243513857355506e-06, + "loss": 0.0831, + "num_input_tokens_seen": 166162144, + "step": 136560 + }, + { + "epoch": 15.209377436240116, + "grad_norm": 0.13375897705554962, + "learning_rate": 8.241710762819532e-06, + "loss": 0.0032, + "num_input_tokens_seen": 166168352, + "step": 136565 + }, + { + "epoch": 15.209934291123734, + "grad_norm": 0.385234534740448, + "learning_rate": 8.239907826579032e-06, + "loss": 0.0266, + "num_input_tokens_seen": 166174688, + "step": 136570 + }, + { + "epoch": 15.210491146007351, + "grad_norm": 0.24829256534576416, + "learning_rate": 8.238105048651022e-06, + "loss": 0.0444, + "num_input_tokens_seen": 166180960, + "step": 136575 + }, + { + "epoch": 15.211048000890967, + "grad_norm": 0.0004812843690160662, + "learning_rate": 8.236302429052547e-06, + "loss": 0.0219, + "num_input_tokens_seen": 166187232, + "step": 136580 + }, + { + "epoch": 15.211604855774585, + "grad_norm": 1.8498287200927734, + "learning_rate": 8.23449996780063e-06, + "loss": 0.0425, + "num_input_tokens_seen": 166193632, + "step": 136585 + }, + { + "epoch": 15.212161710658203, + "grad_norm": 0.25072646141052246, + "learning_rate": 8.23269766491229e-06, + "loss": 0.0338, + "num_input_tokens_seen": 166199712, + "step": 136590 + }, + { + "epoch": 15.21271856554182, + "grad_norm": 0.00011429926962591708, + "learning_rate": 8.230895520404555e-06, + "loss": 0.0045, + "num_input_tokens_seen": 166206112, + "step": 136595 + }, + { + "epoch": 15.213275420425438, + "grad_norm": 0.3922572731971741, + "learning_rate": 8.229093534294437e-06, + "loss": 0.0462, + "num_input_tokens_seen": 166212320, + "step": 136600 + }, + { + "epoch": 15.213832275309054, + "grad_norm": 0.45556125044822693, + "learning_rate": 8.227291706598978e-06, + "loss": 0.0058, + "num_input_tokens_seen": 166218816, + "step": 136605 + }, + { + "epoch": 15.214389130192671, + "grad_norm": 1.2790210247039795, + "learning_rate": 8.225490037335187e-06, + "loss": 0.0941, + "num_input_tokens_seen": 166224512, + "step": 136610 + }, + { + "epoch": 15.21494598507629, + "grad_norm": 2.4827849864959717, + "learning_rate": 8.223688526520079e-06, + "loss": 0.0911, + "num_input_tokens_seen": 166230592, + "step": 136615 + }, + { + "epoch": 15.215502839959907, + "grad_norm": 0.7894872426986694, + "learning_rate": 8.22188717417067e-06, + "loss": 0.0173, + "num_input_tokens_seen": 166236640, + "step": 136620 + }, + { + "epoch": 15.216059694843524, + "grad_norm": 0.13496020436286926, + "learning_rate": 8.220085980303985e-06, + "loss": 0.0071, + "num_input_tokens_seen": 166242400, + "step": 136625 + }, + { + "epoch": 15.21661654972714, + "grad_norm": 0.6633291840553284, + "learning_rate": 8.21828494493703e-06, + "loss": 0.0847, + "num_input_tokens_seen": 166248384, + "step": 136630 + }, + { + "epoch": 15.217173404610758, + "grad_norm": 2.570544481277466, + "learning_rate": 8.216484068086822e-06, + "loss": 0.1848, + "num_input_tokens_seen": 166254624, + "step": 136635 + }, + { + "epoch": 15.217730259494376, + "grad_norm": 0.03936314955353737, + "learning_rate": 8.214683349770358e-06, + "loss": 0.0019, + "num_input_tokens_seen": 166261024, + "step": 136640 + }, + { + "epoch": 15.218287114377993, + "grad_norm": 0.13524648547172546, + "learning_rate": 8.21288279000467e-06, + "loss": 0.0826, + "num_input_tokens_seen": 166267168, + "step": 136645 + }, + { + "epoch": 15.218843969261611, + "grad_norm": 0.00028690637554973364, + "learning_rate": 8.21108238880674e-06, + "loss": 0.0028, + "num_input_tokens_seen": 166273344, + "step": 136650 + }, + { + "epoch": 15.219400824145227, + "grad_norm": 1.6208820343017578, + "learning_rate": 8.209282146193601e-06, + "loss": 0.0292, + "num_input_tokens_seen": 166279616, + "step": 136655 + }, + { + "epoch": 15.219957679028845, + "grad_norm": 0.00044807090307585895, + "learning_rate": 8.207482062182242e-06, + "loss": 0.0182, + "num_input_tokens_seen": 166285792, + "step": 136660 + }, + { + "epoch": 15.220514533912462, + "grad_norm": 0.3486485481262207, + "learning_rate": 8.205682136789669e-06, + "loss": 0.0553, + "num_input_tokens_seen": 166291968, + "step": 136665 + }, + { + "epoch": 15.22107138879608, + "grad_norm": 0.0003693142789416015, + "learning_rate": 8.203882370032876e-06, + "loss": 0.0024, + "num_input_tokens_seen": 166298016, + "step": 136670 + }, + { + "epoch": 15.221628243679698, + "grad_norm": 2.5095760822296143, + "learning_rate": 8.202082761928879e-06, + "loss": 0.0539, + "num_input_tokens_seen": 166304064, + "step": 136675 + }, + { + "epoch": 15.222185098563314, + "grad_norm": 0.6204533576965332, + "learning_rate": 8.20028331249467e-06, + "loss": 0.0838, + "num_input_tokens_seen": 166310048, + "step": 136680 + }, + { + "epoch": 15.222741953446931, + "grad_norm": 0.5205725431442261, + "learning_rate": 8.198484021747241e-06, + "loss": 0.1053, + "num_input_tokens_seen": 166315424, + "step": 136685 + }, + { + "epoch": 15.223298808330549, + "grad_norm": 0.03106783889234066, + "learning_rate": 8.196684889703584e-06, + "loss": 0.1621, + "num_input_tokens_seen": 166321376, + "step": 136690 + }, + { + "epoch": 15.223855663214167, + "grad_norm": 0.006187442224472761, + "learning_rate": 8.194885916380713e-06, + "loss": 0.0044, + "num_input_tokens_seen": 166327328, + "step": 136695 + }, + { + "epoch": 15.224412518097784, + "grad_norm": 0.07935254275798798, + "learning_rate": 8.193087101795597e-06, + "loss": 0.0313, + "num_input_tokens_seen": 166333248, + "step": 136700 + }, + { + "epoch": 15.2249693729814, + "grad_norm": 0.014650685712695122, + "learning_rate": 8.191288445965257e-06, + "loss": 0.0329, + "num_input_tokens_seen": 166338976, + "step": 136705 + }, + { + "epoch": 15.225526227865018, + "grad_norm": 0.22205357253551483, + "learning_rate": 8.189489948906648e-06, + "loss": 0.0493, + "num_input_tokens_seen": 166345408, + "step": 136710 + }, + { + "epoch": 15.226083082748636, + "grad_norm": 0.016457566991448402, + "learning_rate": 8.18769161063678e-06, + "loss": 0.0194, + "num_input_tokens_seen": 166351744, + "step": 136715 + }, + { + "epoch": 15.226639937632253, + "grad_norm": 1.1141347885131836, + "learning_rate": 8.185893431172632e-06, + "loss": 0.045, + "num_input_tokens_seen": 166357440, + "step": 136720 + }, + { + "epoch": 15.22719679251587, + "grad_norm": 0.9432808756828308, + "learning_rate": 8.184095410531196e-06, + "loss": 0.0223, + "num_input_tokens_seen": 166363008, + "step": 136725 + }, + { + "epoch": 15.227753647399489, + "grad_norm": 0.0003623014490585774, + "learning_rate": 8.18229754872945e-06, + "loss": 0.0467, + "num_input_tokens_seen": 166368992, + "step": 136730 + }, + { + "epoch": 15.228310502283104, + "grad_norm": 0.005081553012132645, + "learning_rate": 8.180499845784381e-06, + "loss": 0.0155, + "num_input_tokens_seen": 166375488, + "step": 136735 + }, + { + "epoch": 15.228867357166722, + "grad_norm": 0.7459869980812073, + "learning_rate": 8.178702301712957e-06, + "loss": 0.0271, + "num_input_tokens_seen": 166381504, + "step": 136740 + }, + { + "epoch": 15.22942421205034, + "grad_norm": 0.5747753977775574, + "learning_rate": 8.176904916532174e-06, + "loss": 0.1473, + "num_input_tokens_seen": 166387584, + "step": 136745 + }, + { + "epoch": 15.229981066933957, + "grad_norm": 2.2763872146606445, + "learning_rate": 8.175107690259004e-06, + "loss": 0.0536, + "num_input_tokens_seen": 166393856, + "step": 136750 + }, + { + "epoch": 15.230537921817575, + "grad_norm": 0.1743934154510498, + "learning_rate": 8.17331062291042e-06, + "loss": 0.0649, + "num_input_tokens_seen": 166399808, + "step": 136755 + }, + { + "epoch": 15.231094776701191, + "grad_norm": 3.4926934242248535, + "learning_rate": 8.171513714503393e-06, + "loss": 0.092, + "num_input_tokens_seen": 166405696, + "step": 136760 + }, + { + "epoch": 15.231651631584809, + "grad_norm": 0.023724984377622604, + "learning_rate": 8.169716965054911e-06, + "loss": 0.0067, + "num_input_tokens_seen": 166411808, + "step": 136765 + }, + { + "epoch": 15.232208486468426, + "grad_norm": 0.007075740490108728, + "learning_rate": 8.167920374581925e-06, + "loss": 0.0012, + "num_input_tokens_seen": 166418080, + "step": 136770 + }, + { + "epoch": 15.232765341352044, + "grad_norm": 0.7750581502914429, + "learning_rate": 8.166123943101433e-06, + "loss": 0.0234, + "num_input_tokens_seen": 166424032, + "step": 136775 + }, + { + "epoch": 15.233322196235662, + "grad_norm": 1.5118036270141602, + "learning_rate": 8.164327670630373e-06, + "loss": 0.1093, + "num_input_tokens_seen": 166429920, + "step": 136780 + }, + { + "epoch": 15.233879051119278, + "grad_norm": 0.025319766253232956, + "learning_rate": 8.162531557185735e-06, + "loss": 0.0243, + "num_input_tokens_seen": 166435936, + "step": 136785 + }, + { + "epoch": 15.234435906002895, + "grad_norm": 0.7963416576385498, + "learning_rate": 8.160735602784467e-06, + "loss": 0.0374, + "num_input_tokens_seen": 166442048, + "step": 136790 + }, + { + "epoch": 15.234992760886513, + "grad_norm": 0.005700371693819761, + "learning_rate": 8.15893980744355e-06, + "loss": 0.0025, + "num_input_tokens_seen": 166448256, + "step": 136795 + }, + { + "epoch": 15.23554961577013, + "grad_norm": 0.01284447219222784, + "learning_rate": 8.15714417117994e-06, + "loss": 0.0062, + "num_input_tokens_seen": 166453856, + "step": 136800 + }, + { + "epoch": 15.236106470653748, + "grad_norm": 0.7241504788398743, + "learning_rate": 8.155348694010598e-06, + "loss": 0.0738, + "num_input_tokens_seen": 166460032, + "step": 136805 + }, + { + "epoch": 15.236663325537364, + "grad_norm": 0.0890624150633812, + "learning_rate": 8.153553375952474e-06, + "loss": 0.0025, + "num_input_tokens_seen": 166466272, + "step": 136810 + }, + { + "epoch": 15.237220180420982, + "grad_norm": 0.05448530986905098, + "learning_rate": 8.151758217022545e-06, + "loss": 0.0484, + "num_input_tokens_seen": 166472160, + "step": 136815 + }, + { + "epoch": 15.2377770353046, + "grad_norm": 0.0012084890622645617, + "learning_rate": 8.149963217237758e-06, + "loss": 0.0142, + "num_input_tokens_seen": 166478592, + "step": 136820 + }, + { + "epoch": 15.238333890188217, + "grad_norm": 0.003119380446150899, + "learning_rate": 8.148168376615067e-06, + "loss": 0.1517, + "num_input_tokens_seen": 166484960, + "step": 136825 + }, + { + "epoch": 15.238890745071835, + "grad_norm": 1.3923410177230835, + "learning_rate": 8.146373695171422e-06, + "loss": 0.1253, + "num_input_tokens_seen": 166491136, + "step": 136830 + }, + { + "epoch": 15.23944759995545, + "grad_norm": 0.23320454359054565, + "learning_rate": 8.144579172923786e-06, + "loss": 0.0643, + "num_input_tokens_seen": 166496608, + "step": 136835 + }, + { + "epoch": 15.240004454839069, + "grad_norm": 1.3565900325775146, + "learning_rate": 8.142784809889098e-06, + "loss": 0.0235, + "num_input_tokens_seen": 166502752, + "step": 136840 + }, + { + "epoch": 15.240561309722686, + "grad_norm": 0.00019593659089878201, + "learning_rate": 8.14099060608432e-06, + "loss": 0.0552, + "num_input_tokens_seen": 166508864, + "step": 136845 + }, + { + "epoch": 15.241118164606304, + "grad_norm": 0.9549911022186279, + "learning_rate": 8.139196561526393e-06, + "loss": 0.0259, + "num_input_tokens_seen": 166515040, + "step": 136850 + }, + { + "epoch": 15.241675019489922, + "grad_norm": 0.027342354878783226, + "learning_rate": 8.137402676232263e-06, + "loss": 0.0043, + "num_input_tokens_seen": 166521088, + "step": 136855 + }, + { + "epoch": 15.242231874373537, + "grad_norm": 0.0004281902511138469, + "learning_rate": 8.135608950218868e-06, + "loss": 0.0895, + "num_input_tokens_seen": 166527072, + "step": 136860 + }, + { + "epoch": 15.242788729257155, + "grad_norm": 0.677029013633728, + "learning_rate": 8.133815383503163e-06, + "loss": 0.0188, + "num_input_tokens_seen": 166533376, + "step": 136865 + }, + { + "epoch": 15.243345584140773, + "grad_norm": 0.01053652260452509, + "learning_rate": 8.132021976102086e-06, + "loss": 0.1167, + "num_input_tokens_seen": 166539648, + "step": 136870 + }, + { + "epoch": 15.24390243902439, + "grad_norm": 0.0036576958373188972, + "learning_rate": 8.130228728032577e-06, + "loss": 0.008, + "num_input_tokens_seen": 166545824, + "step": 136875 + }, + { + "epoch": 15.244459293908008, + "grad_norm": 0.051442939788103104, + "learning_rate": 8.128435639311565e-06, + "loss": 0.0024, + "num_input_tokens_seen": 166552224, + "step": 136880 + }, + { + "epoch": 15.245016148791624, + "grad_norm": 1.3218599557876587, + "learning_rate": 8.126642709956004e-06, + "loss": 0.1068, + "num_input_tokens_seen": 166558176, + "step": 136885 + }, + { + "epoch": 15.245573003675242, + "grad_norm": 0.39125490188598633, + "learning_rate": 8.124849939982812e-06, + "loss": 0.0098, + "num_input_tokens_seen": 166563744, + "step": 136890 + }, + { + "epoch": 15.24612985855886, + "grad_norm": 0.4222796559333801, + "learning_rate": 8.12305732940895e-06, + "loss": 0.1052, + "num_input_tokens_seen": 166569920, + "step": 136895 + }, + { + "epoch": 15.246686713442477, + "grad_norm": 0.14784784615039825, + "learning_rate": 8.121264878251317e-06, + "loss": 0.1293, + "num_input_tokens_seen": 166576224, + "step": 136900 + }, + { + "epoch": 15.247243568326095, + "grad_norm": 0.8300996422767639, + "learning_rate": 8.119472586526869e-06, + "loss": 0.0233, + "num_input_tokens_seen": 166582400, + "step": 136905 + }, + { + "epoch": 15.247800423209712, + "grad_norm": 0.08623696118593216, + "learning_rate": 8.117680454252516e-06, + "loss": 0.0128, + "num_input_tokens_seen": 166588864, + "step": 136910 + }, + { + "epoch": 15.248357278093328, + "grad_norm": 0.016641909256577492, + "learning_rate": 8.115888481445208e-06, + "loss": 0.0143, + "num_input_tokens_seen": 166595008, + "step": 136915 + }, + { + "epoch": 15.248914132976946, + "grad_norm": 3.550135612487793, + "learning_rate": 8.114096668121857e-06, + "loss": 0.0897, + "num_input_tokens_seen": 166601472, + "step": 136920 + }, + { + "epoch": 15.249470987860564, + "grad_norm": 0.24839994311332703, + "learning_rate": 8.112305014299396e-06, + "loss": 0.0592, + "num_input_tokens_seen": 166607712, + "step": 136925 + }, + { + "epoch": 15.250027842744181, + "grad_norm": 0.0006082541076466441, + "learning_rate": 8.110513519994733e-06, + "loss": 0.0123, + "num_input_tokens_seen": 166613920, + "step": 136930 + }, + { + "epoch": 15.250584697627799, + "grad_norm": 0.003265530103817582, + "learning_rate": 8.10872218522481e-06, + "loss": 0.0451, + "num_input_tokens_seen": 166620192, + "step": 136935 + }, + { + "epoch": 15.251141552511415, + "grad_norm": 0.10923603177070618, + "learning_rate": 8.10693101000654e-06, + "loss": 0.0056, + "num_input_tokens_seen": 166626560, + "step": 136940 + }, + { + "epoch": 15.251698407395033, + "grad_norm": 0.15994983911514282, + "learning_rate": 8.105139994356842e-06, + "loss": 0.0323, + "num_input_tokens_seen": 166632544, + "step": 136945 + }, + { + "epoch": 15.25225526227865, + "grad_norm": 0.12313828617334366, + "learning_rate": 8.103349138292623e-06, + "loss": 0.0815, + "num_input_tokens_seen": 166638112, + "step": 136950 + }, + { + "epoch": 15.252812117162268, + "grad_norm": 0.4972812533378601, + "learning_rate": 8.101558441830817e-06, + "loss": 0.0286, + "num_input_tokens_seen": 166644160, + "step": 136955 + }, + { + "epoch": 15.253368972045886, + "grad_norm": 0.00044816540321335196, + "learning_rate": 8.099767904988324e-06, + "loss": 0.0395, + "num_input_tokens_seen": 166650688, + "step": 136960 + }, + { + "epoch": 15.253925826929501, + "grad_norm": 0.07963228970766068, + "learning_rate": 8.097977527782077e-06, + "loss": 0.0145, + "num_input_tokens_seen": 166656864, + "step": 136965 + }, + { + "epoch": 15.25448268181312, + "grad_norm": 0.005618389695882797, + "learning_rate": 8.09618731022896e-06, + "loss": 0.0121, + "num_input_tokens_seen": 166662912, + "step": 136970 + }, + { + "epoch": 15.255039536696737, + "grad_norm": 0.3934541344642639, + "learning_rate": 8.094397252345903e-06, + "loss": 0.0178, + "num_input_tokens_seen": 166668512, + "step": 136975 + }, + { + "epoch": 15.255596391580355, + "grad_norm": 0.03247550129890442, + "learning_rate": 8.0926073541498e-06, + "loss": 0.026, + "num_input_tokens_seen": 166674624, + "step": 136980 + }, + { + "epoch": 15.256153246463972, + "grad_norm": 0.02020220272243023, + "learning_rate": 8.090817615657579e-06, + "loss": 0.0594, + "num_input_tokens_seen": 166680896, + "step": 136985 + }, + { + "epoch": 15.256710101347588, + "grad_norm": 0.17417439818382263, + "learning_rate": 8.089028036886128e-06, + "loss": 0.0915, + "num_input_tokens_seen": 166687296, + "step": 136990 + }, + { + "epoch": 15.257266956231206, + "grad_norm": 0.8899382948875427, + "learning_rate": 8.087238617852357e-06, + "loss": 0.0423, + "num_input_tokens_seen": 166693472, + "step": 136995 + }, + { + "epoch": 15.257823811114823, + "grad_norm": 0.05583206191658974, + "learning_rate": 8.08544935857317e-06, + "loss": 0.0625, + "num_input_tokens_seen": 166699456, + "step": 137000 + }, + { + "epoch": 15.258380665998441, + "grad_norm": 1.8659610748291016, + "learning_rate": 8.083660259065456e-06, + "loss": 0.0864, + "num_input_tokens_seen": 166705760, + "step": 137005 + }, + { + "epoch": 15.258937520882059, + "grad_norm": 0.2202596664428711, + "learning_rate": 8.081871319346133e-06, + "loss": 0.0701, + "num_input_tokens_seen": 166712064, + "step": 137010 + }, + { + "epoch": 15.259494375765675, + "grad_norm": 0.9236525893211365, + "learning_rate": 8.080082539432087e-06, + "loss": 0.1351, + "num_input_tokens_seen": 166717984, + "step": 137015 + }, + { + "epoch": 15.260051230649292, + "grad_norm": 0.23044776916503906, + "learning_rate": 8.078293919340219e-06, + "loss": 0.0111, + "num_input_tokens_seen": 166723840, + "step": 137020 + }, + { + "epoch": 15.26060808553291, + "grad_norm": 0.006405955646187067, + "learning_rate": 8.076505459087416e-06, + "loss": 0.0333, + "num_input_tokens_seen": 166729952, + "step": 137025 + }, + { + "epoch": 15.261164940416528, + "grad_norm": 0.00043304619612172246, + "learning_rate": 8.074717158690583e-06, + "loss": 0.0514, + "num_input_tokens_seen": 166736160, + "step": 137030 + }, + { + "epoch": 15.261721795300145, + "grad_norm": 0.08506496250629425, + "learning_rate": 8.072929018166608e-06, + "loss": 0.0052, + "num_input_tokens_seen": 166742400, + "step": 137035 + }, + { + "epoch": 15.262278650183761, + "grad_norm": 0.04210938885807991, + "learning_rate": 8.07114103753238e-06, + "loss": 0.0476, + "num_input_tokens_seen": 166748640, + "step": 137040 + }, + { + "epoch": 15.262835505067379, + "grad_norm": 0.43419545888900757, + "learning_rate": 8.069353216804782e-06, + "loss": 0.0184, + "num_input_tokens_seen": 166754752, + "step": 137045 + }, + { + "epoch": 15.263392359950997, + "grad_norm": 0.021914998069405556, + "learning_rate": 8.067565556000714e-06, + "loss": 0.0283, + "num_input_tokens_seen": 166761120, + "step": 137050 + }, + { + "epoch": 15.263949214834614, + "grad_norm": 0.0030631092377007008, + "learning_rate": 8.065778055137049e-06, + "loss": 0.0114, + "num_input_tokens_seen": 166766688, + "step": 137055 + }, + { + "epoch": 15.264506069718232, + "grad_norm": 0.10695816576480865, + "learning_rate": 8.063990714230682e-06, + "loss": 0.0027, + "num_input_tokens_seen": 166772960, + "step": 137060 + }, + { + "epoch": 15.26506292460185, + "grad_norm": 0.8115270137786865, + "learning_rate": 8.062203533298495e-06, + "loss": 0.0629, + "num_input_tokens_seen": 166779136, + "step": 137065 + }, + { + "epoch": 15.265619779485466, + "grad_norm": 0.20603615045547485, + "learning_rate": 8.060416512357365e-06, + "loss": 0.0123, + "num_input_tokens_seen": 166785472, + "step": 137070 + }, + { + "epoch": 15.266176634369083, + "grad_norm": 0.23411984741687775, + "learning_rate": 8.058629651424165e-06, + "loss": 0.1277, + "num_input_tokens_seen": 166790848, + "step": 137075 + }, + { + "epoch": 15.266733489252701, + "grad_norm": 1.4155985116958618, + "learning_rate": 8.05684295051579e-06, + "loss": 0.0203, + "num_input_tokens_seen": 166797088, + "step": 137080 + }, + { + "epoch": 15.267290344136319, + "grad_norm": 1.3973922729492188, + "learning_rate": 8.055056409649102e-06, + "loss": 0.0574, + "num_input_tokens_seen": 166802912, + "step": 137085 + }, + { + "epoch": 15.267847199019936, + "grad_norm": 0.5593692660331726, + "learning_rate": 8.05327002884099e-06, + "loss": 0.0142, + "num_input_tokens_seen": 166809152, + "step": 137090 + }, + { + "epoch": 15.268404053903552, + "grad_norm": 0.030822305008769035, + "learning_rate": 8.05148380810831e-06, + "loss": 0.0513, + "num_input_tokens_seen": 166815360, + "step": 137095 + }, + { + "epoch": 15.26896090878717, + "grad_norm": 0.27200907468795776, + "learning_rate": 8.04969774746795e-06, + "loss": 0.1146, + "num_input_tokens_seen": 166821504, + "step": 137100 + }, + { + "epoch": 15.269517763670788, + "grad_norm": 0.0013580088270828128, + "learning_rate": 8.047911846936768e-06, + "loss": 0.0059, + "num_input_tokens_seen": 166827776, + "step": 137105 + }, + { + "epoch": 15.270074618554405, + "grad_norm": 0.0003613661683630198, + "learning_rate": 8.046126106531658e-06, + "loss": 0.0237, + "num_input_tokens_seen": 166834112, + "step": 137110 + }, + { + "epoch": 15.270631473438023, + "grad_norm": 0.0006107727531343699, + "learning_rate": 8.044340526269454e-06, + "loss": 0.1362, + "num_input_tokens_seen": 166839904, + "step": 137115 + }, + { + "epoch": 15.271188328321639, + "grad_norm": 0.004003751091659069, + "learning_rate": 8.042555106167044e-06, + "loss": 0.002, + "num_input_tokens_seen": 166845888, + "step": 137120 + }, + { + "epoch": 15.271745183205256, + "grad_norm": 0.010197620838880539, + "learning_rate": 8.040769846241281e-06, + "loss": 0.0449, + "num_input_tokens_seen": 166851488, + "step": 137125 + }, + { + "epoch": 15.272302038088874, + "grad_norm": 0.043327707797288895, + "learning_rate": 8.03898474650904e-06, + "loss": 0.0099, + "num_input_tokens_seen": 166857664, + "step": 137130 + }, + { + "epoch": 15.272858892972492, + "grad_norm": 0.32363009452819824, + "learning_rate": 8.03719980698718e-06, + "loss": 0.0603, + "num_input_tokens_seen": 166863808, + "step": 137135 + }, + { + "epoch": 15.27341574785611, + "grad_norm": 0.000302236556308344, + "learning_rate": 8.035415027692555e-06, + "loss": 0.0014, + "num_input_tokens_seen": 166870336, + "step": 137140 + }, + { + "epoch": 15.273972602739725, + "grad_norm": 0.7107395529747009, + "learning_rate": 8.033630408642024e-06, + "loss": 0.076, + "num_input_tokens_seen": 166876096, + "step": 137145 + }, + { + "epoch": 15.274529457623343, + "grad_norm": 0.005781786050647497, + "learning_rate": 8.031845949852452e-06, + "loss": 0.1644, + "num_input_tokens_seen": 166882240, + "step": 137150 + }, + { + "epoch": 15.27508631250696, + "grad_norm": 1.2705421447753906, + "learning_rate": 8.030061651340687e-06, + "loss": 0.0854, + "num_input_tokens_seen": 166888192, + "step": 137155 + }, + { + "epoch": 15.275643167390578, + "grad_norm": 2.574533700942993, + "learning_rate": 8.028277513123589e-06, + "loss": 0.0497, + "num_input_tokens_seen": 166894336, + "step": 137160 + }, + { + "epoch": 15.276200022274196, + "grad_norm": 0.04382544010877609, + "learning_rate": 8.026493535218e-06, + "loss": 0.0565, + "num_input_tokens_seen": 166900576, + "step": 137165 + }, + { + "epoch": 15.276756877157812, + "grad_norm": 0.00012524457997642457, + "learning_rate": 8.024709717640785e-06, + "loss": 0.0049, + "num_input_tokens_seen": 166906560, + "step": 137170 + }, + { + "epoch": 15.27731373204143, + "grad_norm": 0.12982502579689026, + "learning_rate": 8.022926060408777e-06, + "loss": 0.0217, + "num_input_tokens_seen": 166912768, + "step": 137175 + }, + { + "epoch": 15.277870586925047, + "grad_norm": 1.5582289695739746, + "learning_rate": 8.021142563538855e-06, + "loss": 0.1116, + "num_input_tokens_seen": 166917920, + "step": 137180 + }, + { + "epoch": 15.278427441808665, + "grad_norm": 0.07187901437282562, + "learning_rate": 8.019359227047826e-06, + "loss": 0.0551, + "num_input_tokens_seen": 166923872, + "step": 137185 + }, + { + "epoch": 15.278984296692283, + "grad_norm": 0.012125871144235134, + "learning_rate": 8.01757605095256e-06, + "loss": 0.0026, + "num_input_tokens_seen": 166930432, + "step": 137190 + }, + { + "epoch": 15.279541151575899, + "grad_norm": 0.98654705286026, + "learning_rate": 8.015793035269889e-06, + "loss": 0.1364, + "num_input_tokens_seen": 166936832, + "step": 137195 + }, + { + "epoch": 15.280098006459516, + "grad_norm": 0.5666738748550415, + "learning_rate": 8.014010180016667e-06, + "loss": 0.0362, + "num_input_tokens_seen": 166943040, + "step": 137200 + }, + { + "epoch": 15.280654861343134, + "grad_norm": 0.3629359304904938, + "learning_rate": 8.01222748520973e-06, + "loss": 0.0607, + "num_input_tokens_seen": 166948736, + "step": 137205 + }, + { + "epoch": 15.281211716226752, + "grad_norm": 0.14960385859012604, + "learning_rate": 8.010444950865914e-06, + "loss": 0.0818, + "num_input_tokens_seen": 166954624, + "step": 137210 + }, + { + "epoch": 15.28176857111037, + "grad_norm": 0.11597909778356552, + "learning_rate": 8.008662577002047e-06, + "loss": 0.0047, + "num_input_tokens_seen": 166960672, + "step": 137215 + }, + { + "epoch": 15.282325425993985, + "grad_norm": 0.08618778735399246, + "learning_rate": 8.006880363634986e-06, + "loss": 0.087, + "num_input_tokens_seen": 166967104, + "step": 137220 + }, + { + "epoch": 15.282882280877603, + "grad_norm": 0.303252249956131, + "learning_rate": 8.005098310781554e-06, + "loss": 0.0048, + "num_input_tokens_seen": 166973120, + "step": 137225 + }, + { + "epoch": 15.28343913576122, + "grad_norm": 0.17954711616039276, + "learning_rate": 8.003316418458581e-06, + "loss": 0.0631, + "num_input_tokens_seen": 166978912, + "step": 137230 + }, + { + "epoch": 15.283995990644838, + "grad_norm": 0.013696798123419285, + "learning_rate": 8.0015346866829e-06, + "loss": 0.0707, + "num_input_tokens_seen": 166985216, + "step": 137235 + }, + { + "epoch": 15.284552845528456, + "grad_norm": 0.030546169728040695, + "learning_rate": 7.999753115471345e-06, + "loss": 0.111, + "num_input_tokens_seen": 166991392, + "step": 137240 + }, + { + "epoch": 15.285109700412072, + "grad_norm": 0.004115473013371229, + "learning_rate": 7.997971704840736e-06, + "loss": 0.0089, + "num_input_tokens_seen": 166997376, + "step": 137245 + }, + { + "epoch": 15.28566655529569, + "grad_norm": 0.0538453534245491, + "learning_rate": 7.996190454807915e-06, + "loss": 0.0017, + "num_input_tokens_seen": 167003296, + "step": 137250 + }, + { + "epoch": 15.286223410179307, + "grad_norm": 0.018792416900396347, + "learning_rate": 7.994409365389699e-06, + "loss": 0.0071, + "num_input_tokens_seen": 167009248, + "step": 137255 + }, + { + "epoch": 15.286780265062925, + "grad_norm": 1.0870320796966553, + "learning_rate": 7.992628436602911e-06, + "loss": 0.0333, + "num_input_tokens_seen": 167015488, + "step": 137260 + }, + { + "epoch": 15.287337119946542, + "grad_norm": 0.001721975626423955, + "learning_rate": 7.990847668464363e-06, + "loss": 0.0301, + "num_input_tokens_seen": 167021568, + "step": 137265 + }, + { + "epoch": 15.28789397483016, + "grad_norm": 0.00018447900947649032, + "learning_rate": 7.989067060990896e-06, + "loss": 0.0713, + "num_input_tokens_seen": 167027776, + "step": 137270 + }, + { + "epoch": 15.288450829713776, + "grad_norm": 0.45740628242492676, + "learning_rate": 7.987286614199322e-06, + "loss": 0.0601, + "num_input_tokens_seen": 167033952, + "step": 137275 + }, + { + "epoch": 15.289007684597394, + "grad_norm": 1.10651695728302, + "learning_rate": 7.985506328106454e-06, + "loss": 0.0287, + "num_input_tokens_seen": 167040192, + "step": 137280 + }, + { + "epoch": 15.289564539481011, + "grad_norm": 0.8406269550323486, + "learning_rate": 7.9837262027291e-06, + "loss": 0.0144, + "num_input_tokens_seen": 167046016, + "step": 137285 + }, + { + "epoch": 15.290121394364629, + "grad_norm": 0.002465136582031846, + "learning_rate": 7.981946238084099e-06, + "loss": 0.0001, + "num_input_tokens_seen": 167052352, + "step": 137290 + }, + { + "epoch": 15.290678249248247, + "grad_norm": 0.15469242632389069, + "learning_rate": 7.980166434188239e-06, + "loss": 0.0438, + "num_input_tokens_seen": 167058656, + "step": 137295 + }, + { + "epoch": 15.291235104131863, + "grad_norm": 0.39847585558891296, + "learning_rate": 7.978386791058357e-06, + "loss": 0.093, + "num_input_tokens_seen": 167064992, + "step": 137300 + }, + { + "epoch": 15.29179195901548, + "grad_norm": 0.39108607172966003, + "learning_rate": 7.976607308711237e-06, + "loss": 0.015, + "num_input_tokens_seen": 167071296, + "step": 137305 + }, + { + "epoch": 15.292348813899098, + "grad_norm": 0.008561636321246624, + "learning_rate": 7.974827987163705e-06, + "loss": 0.0818, + "num_input_tokens_seen": 167077280, + "step": 137310 + }, + { + "epoch": 15.292905668782716, + "grad_norm": 0.020931705832481384, + "learning_rate": 7.973048826432555e-06, + "loss": 0.0115, + "num_input_tokens_seen": 167083328, + "step": 137315 + }, + { + "epoch": 15.293462523666333, + "grad_norm": 0.0273959431797266, + "learning_rate": 7.97126982653461e-06, + "loss": 0.0045, + "num_input_tokens_seen": 167089280, + "step": 137320 + }, + { + "epoch": 15.29401937854995, + "grad_norm": 0.015023351646959782, + "learning_rate": 7.969490987486666e-06, + "loss": 0.0972, + "num_input_tokens_seen": 167095488, + "step": 137325 + }, + { + "epoch": 15.294576233433567, + "grad_norm": 1.8027147054672241, + "learning_rate": 7.967712309305522e-06, + "loss": 0.0919, + "num_input_tokens_seen": 167101280, + "step": 137330 + }, + { + "epoch": 15.295133088317185, + "grad_norm": 0.006917237304151058, + "learning_rate": 7.965933792007974e-06, + "loss": 0.0275, + "num_input_tokens_seen": 167107424, + "step": 137335 + }, + { + "epoch": 15.295689943200802, + "grad_norm": 0.00015202687063720077, + "learning_rate": 7.964155435610838e-06, + "loss": 0.0017, + "num_input_tokens_seen": 167113504, + "step": 137340 + }, + { + "epoch": 15.29624679808442, + "grad_norm": 0.024155400693416595, + "learning_rate": 7.962377240130902e-06, + "loss": 0.0829, + "num_input_tokens_seen": 167119712, + "step": 137345 + }, + { + "epoch": 15.296803652968036, + "grad_norm": 0.09865304082632065, + "learning_rate": 7.960599205584963e-06, + "loss": 0.0169, + "num_input_tokens_seen": 167125920, + "step": 137350 + }, + { + "epoch": 15.297360507851653, + "grad_norm": 0.5523442625999451, + "learning_rate": 7.958821331989808e-06, + "loss": 0.0083, + "num_input_tokens_seen": 167132224, + "step": 137355 + }, + { + "epoch": 15.297917362735271, + "grad_norm": 0.6966347694396973, + "learning_rate": 7.957043619362247e-06, + "loss": 0.0604, + "num_input_tokens_seen": 167138304, + "step": 137360 + }, + { + "epoch": 15.298474217618889, + "grad_norm": 0.11836383491754532, + "learning_rate": 7.955266067719056e-06, + "loss": 0.0097, + "num_input_tokens_seen": 167144608, + "step": 137365 + }, + { + "epoch": 15.299031072502506, + "grad_norm": 0.00015191796410363168, + "learning_rate": 7.953488677077048e-06, + "loss": 0.0161, + "num_input_tokens_seen": 167150496, + "step": 137370 + }, + { + "epoch": 15.299587927386122, + "grad_norm": 0.00041740050073713064, + "learning_rate": 7.951711447452982e-06, + "loss": 0.0525, + "num_input_tokens_seen": 167156672, + "step": 137375 + }, + { + "epoch": 15.30014478226974, + "grad_norm": 0.5290771722793579, + "learning_rate": 7.949934378863666e-06, + "loss": 0.0194, + "num_input_tokens_seen": 167162688, + "step": 137380 + }, + { + "epoch": 15.300701637153358, + "grad_norm": 0.6703926920890808, + "learning_rate": 7.948157471325873e-06, + "loss": 0.0194, + "num_input_tokens_seen": 167168704, + "step": 137385 + }, + { + "epoch": 15.301258492036975, + "grad_norm": 0.385225772857666, + "learning_rate": 7.946380724856406e-06, + "loss": 0.009, + "num_input_tokens_seen": 167174880, + "step": 137390 + }, + { + "epoch": 15.301815346920593, + "grad_norm": 0.0005025360151194036, + "learning_rate": 7.944604139472031e-06, + "loss": 0.0448, + "num_input_tokens_seen": 167180928, + "step": 137395 + }, + { + "epoch": 15.302372201804209, + "grad_norm": 0.8942041397094727, + "learning_rate": 7.942827715189538e-06, + "loss": 0.0546, + "num_input_tokens_seen": 167187200, + "step": 137400 + }, + { + "epoch": 15.302929056687827, + "grad_norm": 1.2238590717315674, + "learning_rate": 7.941051452025694e-06, + "loss": 0.0731, + "num_input_tokens_seen": 167193440, + "step": 137405 + }, + { + "epoch": 15.303485911571444, + "grad_norm": 0.032416827976703644, + "learning_rate": 7.939275349997294e-06, + "loss": 0.0111, + "num_input_tokens_seen": 167199232, + "step": 137410 + }, + { + "epoch": 15.304042766455062, + "grad_norm": 0.020068759098649025, + "learning_rate": 7.937499409121107e-06, + "loss": 0.1474, + "num_input_tokens_seen": 167205376, + "step": 137415 + }, + { + "epoch": 15.30459962133868, + "grad_norm": 0.043768856674432755, + "learning_rate": 7.93572362941391e-06, + "loss": 0.0871, + "num_input_tokens_seen": 167211712, + "step": 137420 + }, + { + "epoch": 15.305156476222297, + "grad_norm": 0.024576464667916298, + "learning_rate": 7.933948010892474e-06, + "loss": 0.0028, + "num_input_tokens_seen": 167217632, + "step": 137425 + }, + { + "epoch": 15.305713331105913, + "grad_norm": 0.5808587670326233, + "learning_rate": 7.932172553573563e-06, + "loss": 0.0105, + "num_input_tokens_seen": 167223712, + "step": 137430 + }, + { + "epoch": 15.306270185989531, + "grad_norm": 0.012915931642055511, + "learning_rate": 7.930397257473968e-06, + "loss": 0.0142, + "num_input_tokens_seen": 167229856, + "step": 137435 + }, + { + "epoch": 15.306827040873149, + "grad_norm": 0.0030437964014708996, + "learning_rate": 7.928622122610436e-06, + "loss": 0.1265, + "num_input_tokens_seen": 167235968, + "step": 137440 + }, + { + "epoch": 15.307383895756766, + "grad_norm": 0.10385121405124664, + "learning_rate": 7.92684714899976e-06, + "loss": 0.022, + "num_input_tokens_seen": 167242336, + "step": 137445 + }, + { + "epoch": 15.307940750640384, + "grad_norm": 0.15945467352867126, + "learning_rate": 7.92507233665868e-06, + "loss": 0.0922, + "num_input_tokens_seen": 167248320, + "step": 137450 + }, + { + "epoch": 15.308497605524, + "grad_norm": 0.06014611944556236, + "learning_rate": 7.923297685603976e-06, + "loss": 0.0685, + "num_input_tokens_seen": 167254464, + "step": 137455 + }, + { + "epoch": 15.309054460407618, + "grad_norm": 0.2196522355079651, + "learning_rate": 7.921523195852401e-06, + "loss": 0.0643, + "num_input_tokens_seen": 167260736, + "step": 137460 + }, + { + "epoch": 15.309611315291235, + "grad_norm": 0.00819444004446268, + "learning_rate": 7.919748867420728e-06, + "loss": 0.078, + "num_input_tokens_seen": 167266784, + "step": 137465 + }, + { + "epoch": 15.310168170174853, + "grad_norm": 0.004807872232049704, + "learning_rate": 7.917974700325714e-06, + "loss": 0.0192, + "num_input_tokens_seen": 167271904, + "step": 137470 + }, + { + "epoch": 15.31072502505847, + "grad_norm": 0.008761205710470676, + "learning_rate": 7.916200694584114e-06, + "loss": 0.0108, + "num_input_tokens_seen": 167277920, + "step": 137475 + }, + { + "epoch": 15.311281879942086, + "grad_norm": 1.1131393909454346, + "learning_rate": 7.914426850212678e-06, + "loss": 0.045, + "num_input_tokens_seen": 167284128, + "step": 137480 + }, + { + "epoch": 15.311838734825704, + "grad_norm": 2.355919599533081, + "learning_rate": 7.91265316722818e-06, + "loss": 0.0301, + "num_input_tokens_seen": 167290528, + "step": 137485 + }, + { + "epoch": 15.312395589709322, + "grad_norm": 0.14739960432052612, + "learning_rate": 7.910879645647359e-06, + "loss": 0.1335, + "num_input_tokens_seen": 167296512, + "step": 137490 + }, + { + "epoch": 15.31295244459294, + "grad_norm": 0.0015907479682937264, + "learning_rate": 7.909106285486973e-06, + "loss": 0.0648, + "num_input_tokens_seen": 167302752, + "step": 137495 + }, + { + "epoch": 15.313509299476557, + "grad_norm": 0.08166076987981796, + "learning_rate": 7.90733308676376e-06, + "loss": 0.0595, + "num_input_tokens_seen": 167309056, + "step": 137500 + }, + { + "epoch": 15.314066154360173, + "grad_norm": 1.8725813627243042, + "learning_rate": 7.905560049494493e-06, + "loss": 0.1386, + "num_input_tokens_seen": 167315264, + "step": 137505 + }, + { + "epoch": 15.31462300924379, + "grad_norm": 0.7941797971725464, + "learning_rate": 7.903787173695895e-06, + "loss": 0.0768, + "num_input_tokens_seen": 167321184, + "step": 137510 + }, + { + "epoch": 15.315179864127408, + "grad_norm": 0.06274211406707764, + "learning_rate": 7.902014459384743e-06, + "loss": 0.2122, + "num_input_tokens_seen": 167326848, + "step": 137515 + }, + { + "epoch": 15.315736719011026, + "grad_norm": 0.2030508816242218, + "learning_rate": 7.900241906577745e-06, + "loss": 0.0135, + "num_input_tokens_seen": 167333056, + "step": 137520 + }, + { + "epoch": 15.316293573894644, + "grad_norm": 0.18694531917572021, + "learning_rate": 7.898469515291673e-06, + "loss": 0.0126, + "num_input_tokens_seen": 167339328, + "step": 137525 + }, + { + "epoch": 15.31685042877826, + "grad_norm": 0.0024762575048953295, + "learning_rate": 7.89669728554325e-06, + "loss": 0.1009, + "num_input_tokens_seen": 167345216, + "step": 137530 + }, + { + "epoch": 15.317407283661877, + "grad_norm": 0.9116628766059875, + "learning_rate": 7.89492521734923e-06, + "loss": 0.0342, + "num_input_tokens_seen": 167351360, + "step": 137535 + }, + { + "epoch": 15.317964138545495, + "grad_norm": 0.0009346422739326954, + "learning_rate": 7.893153310726348e-06, + "loss": 0.0005, + "num_input_tokens_seen": 167357824, + "step": 137540 + }, + { + "epoch": 15.318520993429113, + "grad_norm": 0.0020615016110241413, + "learning_rate": 7.891381565691337e-06, + "loss": 0.0097, + "num_input_tokens_seen": 167364096, + "step": 137545 + }, + { + "epoch": 15.31907784831273, + "grad_norm": 0.1581413298845291, + "learning_rate": 7.889609982260927e-06, + "loss": 0.0061, + "num_input_tokens_seen": 167370016, + "step": 137550 + }, + { + "epoch": 15.319634703196346, + "grad_norm": 0.00539708137512207, + "learning_rate": 7.887838560451865e-06, + "loss": 0.0062, + "num_input_tokens_seen": 167376160, + "step": 137555 + }, + { + "epoch": 15.320191558079964, + "grad_norm": 0.015267476439476013, + "learning_rate": 7.88606730028088e-06, + "loss": 0.0211, + "num_input_tokens_seen": 167382208, + "step": 137560 + }, + { + "epoch": 15.320748412963582, + "grad_norm": 2.33251953125, + "learning_rate": 7.884296201764702e-06, + "loss": 0.1254, + "num_input_tokens_seen": 167387360, + "step": 137565 + }, + { + "epoch": 15.3213052678472, + "grad_norm": 0.0005384424584917724, + "learning_rate": 7.882525264920049e-06, + "loss": 0.0026, + "num_input_tokens_seen": 167393696, + "step": 137570 + }, + { + "epoch": 15.321862122730817, + "grad_norm": 1.7600291967391968, + "learning_rate": 7.88075448976367e-06, + "loss": 0.022, + "num_input_tokens_seen": 167399968, + "step": 137575 + }, + { + "epoch": 15.322418977614433, + "grad_norm": 0.0002443971752654761, + "learning_rate": 7.878983876312268e-06, + "loss": 0.0335, + "num_input_tokens_seen": 167406240, + "step": 137580 + }, + { + "epoch": 15.32297583249805, + "grad_norm": 0.3327871859073639, + "learning_rate": 7.8772134245826e-06, + "loss": 0.1274, + "num_input_tokens_seen": 167412160, + "step": 137585 + }, + { + "epoch": 15.323532687381668, + "grad_norm": 0.05272180959582329, + "learning_rate": 7.875443134591354e-06, + "loss": 0.0053, + "num_input_tokens_seen": 167418080, + "step": 137590 + }, + { + "epoch": 15.324089542265286, + "grad_norm": 1.5575588941574097, + "learning_rate": 7.873673006355273e-06, + "loss": 0.063, + "num_input_tokens_seen": 167424064, + "step": 137595 + }, + { + "epoch": 15.324646397148904, + "grad_norm": 0.7307419180870056, + "learning_rate": 7.871903039891066e-06, + "loss": 0.0141, + "num_input_tokens_seen": 167429856, + "step": 137600 + }, + { + "epoch": 15.32520325203252, + "grad_norm": 0.004087422508746386, + "learning_rate": 7.870133235215465e-06, + "loss": 0.0286, + "num_input_tokens_seen": 167436032, + "step": 137605 + }, + { + "epoch": 15.325760106916137, + "grad_norm": 0.030456263571977615, + "learning_rate": 7.86836359234518e-06, + "loss": 0.0976, + "num_input_tokens_seen": 167441888, + "step": 137610 + }, + { + "epoch": 15.326316961799755, + "grad_norm": 0.02782588265836239, + "learning_rate": 7.866594111296925e-06, + "loss": 0.0005, + "num_input_tokens_seen": 167448064, + "step": 137615 + }, + { + "epoch": 15.326873816683372, + "grad_norm": 0.07140739262104034, + "learning_rate": 7.864824792087408e-06, + "loss": 0.0178, + "num_input_tokens_seen": 167454240, + "step": 137620 + }, + { + "epoch": 15.32743067156699, + "grad_norm": 0.6391831636428833, + "learning_rate": 7.86305563473336e-06, + "loss": 0.0243, + "num_input_tokens_seen": 167460448, + "step": 137625 + }, + { + "epoch": 15.327987526450608, + "grad_norm": 0.00038106273859739304, + "learning_rate": 7.861286639251478e-06, + "loss": 0.0072, + "num_input_tokens_seen": 167466400, + "step": 137630 + }, + { + "epoch": 15.328544381334224, + "grad_norm": 0.028042174875736237, + "learning_rate": 7.859517805658476e-06, + "loss": 0.0697, + "num_input_tokens_seen": 167472768, + "step": 137635 + }, + { + "epoch": 15.329101236217841, + "grad_norm": 0.9630667567253113, + "learning_rate": 7.857749133971054e-06, + "loss": 0.0456, + "num_input_tokens_seen": 167478624, + "step": 137640 + }, + { + "epoch": 15.329658091101459, + "grad_norm": 1.4836323261260986, + "learning_rate": 7.855980624205934e-06, + "loss": 0.0924, + "num_input_tokens_seen": 167484384, + "step": 137645 + }, + { + "epoch": 15.330214945985077, + "grad_norm": 0.0023090706672519445, + "learning_rate": 7.854212276379801e-06, + "loss": 0.0082, + "num_input_tokens_seen": 167490144, + "step": 137650 + }, + { + "epoch": 15.330771800868694, + "grad_norm": 0.026048757135868073, + "learning_rate": 7.852444090509384e-06, + "loss": 0.0457, + "num_input_tokens_seen": 167496224, + "step": 137655 + }, + { + "epoch": 15.33132865575231, + "grad_norm": 0.8195158839225769, + "learning_rate": 7.850676066611367e-06, + "loss": 0.0238, + "num_input_tokens_seen": 167502112, + "step": 137660 + }, + { + "epoch": 15.331885510635928, + "grad_norm": 0.3059732913970947, + "learning_rate": 7.848908204702455e-06, + "loss": 0.0263, + "num_input_tokens_seen": 167508256, + "step": 137665 + }, + { + "epoch": 15.332442365519546, + "grad_norm": 0.051342081278562546, + "learning_rate": 7.847140504799338e-06, + "loss": 0.1423, + "num_input_tokens_seen": 167513792, + "step": 137670 + }, + { + "epoch": 15.332999220403163, + "grad_norm": 0.059091754257678986, + "learning_rate": 7.845372966918729e-06, + "loss": 0.046, + "num_input_tokens_seen": 167520448, + "step": 137675 + }, + { + "epoch": 15.333556075286781, + "grad_norm": 0.025400882586836815, + "learning_rate": 7.843605591077318e-06, + "loss": 0.0113, + "num_input_tokens_seen": 167526336, + "step": 137680 + }, + { + "epoch": 15.334112930170397, + "grad_norm": 0.00026351684937253594, + "learning_rate": 7.841838377291796e-06, + "loss": 0.0862, + "num_input_tokens_seen": 167532640, + "step": 137685 + }, + { + "epoch": 15.334669785054015, + "grad_norm": 0.07812279462814331, + "learning_rate": 7.840071325578852e-06, + "loss": 0.0087, + "num_input_tokens_seen": 167538752, + "step": 137690 + }, + { + "epoch": 15.335226639937632, + "grad_norm": 0.14685191214084625, + "learning_rate": 7.838304435955188e-06, + "loss": 0.0427, + "num_input_tokens_seen": 167545024, + "step": 137695 + }, + { + "epoch": 15.33578349482125, + "grad_norm": 0.04914001375436783, + "learning_rate": 7.836537708437481e-06, + "loss": 0.0817, + "num_input_tokens_seen": 167550464, + "step": 137700 + }, + { + "epoch": 15.336340349704868, + "grad_norm": 0.30636778473854065, + "learning_rate": 7.834771143042444e-06, + "loss": 0.0746, + "num_input_tokens_seen": 167556640, + "step": 137705 + }, + { + "epoch": 15.336897204588483, + "grad_norm": 0.2866193652153015, + "learning_rate": 7.833004739786728e-06, + "loss": 0.0382, + "num_input_tokens_seen": 167562816, + "step": 137710 + }, + { + "epoch": 15.337454059472101, + "grad_norm": 0.00023740765755064785, + "learning_rate": 7.831238498687044e-06, + "loss": 0.0001, + "num_input_tokens_seen": 167568768, + "step": 137715 + }, + { + "epoch": 15.338010914355719, + "grad_norm": 0.03095945157110691, + "learning_rate": 7.829472419760062e-06, + "loss": 0.0625, + "num_input_tokens_seen": 167575040, + "step": 137720 + }, + { + "epoch": 15.338567769239337, + "grad_norm": 0.27603885531425476, + "learning_rate": 7.827706503022475e-06, + "loss": 0.0196, + "num_input_tokens_seen": 167580928, + "step": 137725 + }, + { + "epoch": 15.339124624122954, + "grad_norm": 1.3878628015518188, + "learning_rate": 7.825940748490962e-06, + "loss": 0.0311, + "num_input_tokens_seen": 167587008, + "step": 137730 + }, + { + "epoch": 15.33968147900657, + "grad_norm": 0.021100034937262535, + "learning_rate": 7.824175156182195e-06, + "loss": 0.0057, + "num_input_tokens_seen": 167593344, + "step": 137735 + }, + { + "epoch": 15.340238333890188, + "grad_norm": 0.6575237512588501, + "learning_rate": 7.822409726112847e-06, + "loss": 0.0246, + "num_input_tokens_seen": 167599328, + "step": 137740 + }, + { + "epoch": 15.340795188773805, + "grad_norm": 0.04770370572805405, + "learning_rate": 7.820644458299612e-06, + "loss": 0.0479, + "num_input_tokens_seen": 167605536, + "step": 137745 + }, + { + "epoch": 15.341352043657423, + "grad_norm": 0.5729649066925049, + "learning_rate": 7.818879352759151e-06, + "loss": 0.0919, + "num_input_tokens_seen": 167611104, + "step": 137750 + }, + { + "epoch": 15.34190889854104, + "grad_norm": 0.9030929803848267, + "learning_rate": 7.817114409508141e-06, + "loss": 0.0369, + "num_input_tokens_seen": 167617408, + "step": 137755 + }, + { + "epoch": 15.342465753424657, + "grad_norm": 1.6105101108551025, + "learning_rate": 7.815349628563245e-06, + "loss": 0.0856, + "num_input_tokens_seen": 167623488, + "step": 137760 + }, + { + "epoch": 15.343022608308274, + "grad_norm": 0.022200919687747955, + "learning_rate": 7.813585009941146e-06, + "loss": 0.0007, + "num_input_tokens_seen": 167629792, + "step": 137765 + }, + { + "epoch": 15.343579463191892, + "grad_norm": 0.0009574765572324395, + "learning_rate": 7.8118205536585e-06, + "loss": 0.0354, + "num_input_tokens_seen": 167635424, + "step": 137770 + }, + { + "epoch": 15.34413631807551, + "grad_norm": 0.013822133652865887, + "learning_rate": 7.810056259731996e-06, + "loss": 0.0729, + "num_input_tokens_seen": 167641568, + "step": 137775 + }, + { + "epoch": 15.344693172959127, + "grad_norm": 2.2782931327819824, + "learning_rate": 7.808292128178266e-06, + "loss": 0.0783, + "num_input_tokens_seen": 167647680, + "step": 137780 + }, + { + "epoch": 15.345250027842745, + "grad_norm": 0.006889726035296917, + "learning_rate": 7.806528159013999e-06, + "loss": 0.001, + "num_input_tokens_seen": 167653568, + "step": 137785 + }, + { + "epoch": 15.345806882726361, + "grad_norm": 0.002984913531690836, + "learning_rate": 7.80476435225584e-06, + "loss": 0.0251, + "num_input_tokens_seen": 167660096, + "step": 137790 + }, + { + "epoch": 15.346363737609979, + "grad_norm": 0.0051361857913434505, + "learning_rate": 7.803000707920465e-06, + "loss": 0.0085, + "num_input_tokens_seen": 167666432, + "step": 137795 + }, + { + "epoch": 15.346920592493596, + "grad_norm": 0.00010868369281524792, + "learning_rate": 7.80123722602453e-06, + "loss": 0.0082, + "num_input_tokens_seen": 167671968, + "step": 137800 + }, + { + "epoch": 15.347477447377214, + "grad_norm": 0.7128734588623047, + "learning_rate": 7.799473906584686e-06, + "loss": 0.0127, + "num_input_tokens_seen": 167677760, + "step": 137805 + }, + { + "epoch": 15.348034302260832, + "grad_norm": 0.01903659477829933, + "learning_rate": 7.797710749617584e-06, + "loss": 0.0117, + "num_input_tokens_seen": 167683296, + "step": 137810 + }, + { + "epoch": 15.348591157144448, + "grad_norm": 0.3064557909965515, + "learning_rate": 7.795947755139896e-06, + "loss": 0.0085, + "num_input_tokens_seen": 167689504, + "step": 137815 + }, + { + "epoch": 15.349148012028065, + "grad_norm": 0.021844107657670975, + "learning_rate": 7.794184923168263e-06, + "loss": 0.0046, + "num_input_tokens_seen": 167695648, + "step": 137820 + }, + { + "epoch": 15.349704866911683, + "grad_norm": 0.09984756261110306, + "learning_rate": 7.79242225371934e-06, + "loss": 0.02, + "num_input_tokens_seen": 167701920, + "step": 137825 + }, + { + "epoch": 15.3502617217953, + "grad_norm": 0.570853054523468, + "learning_rate": 7.790659746809775e-06, + "loss": 0.1227, + "num_input_tokens_seen": 167708448, + "step": 137830 + }, + { + "epoch": 15.350818576678918, + "grad_norm": 2.491457223892212, + "learning_rate": 7.788897402456208e-06, + "loss": 0.1764, + "num_input_tokens_seen": 167714624, + "step": 137835 + }, + { + "epoch": 15.351375431562534, + "grad_norm": 0.011859256774187088, + "learning_rate": 7.787135220675301e-06, + "loss": 0.0026, + "num_input_tokens_seen": 167720576, + "step": 137840 + }, + { + "epoch": 15.351932286446152, + "grad_norm": 0.04509120434522629, + "learning_rate": 7.785373201483686e-06, + "loss": 0.0102, + "num_input_tokens_seen": 167726528, + "step": 137845 + }, + { + "epoch": 15.35248914132977, + "grad_norm": 0.38109922409057617, + "learning_rate": 7.783611344898031e-06, + "loss": 0.1291, + "num_input_tokens_seen": 167732480, + "step": 137850 + }, + { + "epoch": 15.353045996213387, + "grad_norm": 2.87357497215271, + "learning_rate": 7.781849650934941e-06, + "loss": 0.0639, + "num_input_tokens_seen": 167738720, + "step": 137855 + }, + { + "epoch": 15.353602851097005, + "grad_norm": 1.1205368041992188, + "learning_rate": 7.780088119611087e-06, + "loss": 0.0489, + "num_input_tokens_seen": 167744704, + "step": 137860 + }, + { + "epoch": 15.35415970598062, + "grad_norm": 1.0190141201019287, + "learning_rate": 7.778326750943088e-06, + "loss": 0.044, + "num_input_tokens_seen": 167750944, + "step": 137865 + }, + { + "epoch": 15.354716560864238, + "grad_norm": 0.025979075580835342, + "learning_rate": 7.776565544947598e-06, + "loss": 0.0011, + "num_input_tokens_seen": 167757248, + "step": 137870 + }, + { + "epoch": 15.355273415747856, + "grad_norm": 0.0001019444243866019, + "learning_rate": 7.774804501641248e-06, + "loss": 0.0179, + "num_input_tokens_seen": 167763520, + "step": 137875 + }, + { + "epoch": 15.355830270631474, + "grad_norm": 0.4123826324939728, + "learning_rate": 7.773043621040665e-06, + "loss": 0.0051, + "num_input_tokens_seen": 167769664, + "step": 137880 + }, + { + "epoch": 15.356387125515091, + "grad_norm": 0.17688558995723724, + "learning_rate": 7.771282903162482e-06, + "loss": 0.0511, + "num_input_tokens_seen": 167775936, + "step": 137885 + }, + { + "epoch": 15.356943980398707, + "grad_norm": 0.015335159376263618, + "learning_rate": 7.769522348023345e-06, + "loss": 0.0748, + "num_input_tokens_seen": 167781952, + "step": 137890 + }, + { + "epoch": 15.357500835282325, + "grad_norm": 0.0001552786270622164, + "learning_rate": 7.767761955639875e-06, + "loss": 0.0099, + "num_input_tokens_seen": 167787968, + "step": 137895 + }, + { + "epoch": 15.358057690165943, + "grad_norm": 0.0003888314531650394, + "learning_rate": 7.766001726028696e-06, + "loss": 0.0131, + "num_input_tokens_seen": 167793952, + "step": 137900 + }, + { + "epoch": 15.35861454504956, + "grad_norm": 2.2235052585601807, + "learning_rate": 7.76424165920643e-06, + "loss": 0.0961, + "num_input_tokens_seen": 167799936, + "step": 137905 + }, + { + "epoch": 15.359171399933178, + "grad_norm": 0.0007985366974025965, + "learning_rate": 7.76248175518972e-06, + "loss": 0.1424, + "num_input_tokens_seen": 167806016, + "step": 137910 + }, + { + "epoch": 15.359728254816794, + "grad_norm": 0.9175445437431335, + "learning_rate": 7.760722013995175e-06, + "loss": 0.0246, + "num_input_tokens_seen": 167812320, + "step": 137915 + }, + { + "epoch": 15.360285109700412, + "grad_norm": 0.007943226024508476, + "learning_rate": 7.758962435639435e-06, + "loss": 0.0127, + "num_input_tokens_seen": 167818688, + "step": 137920 + }, + { + "epoch": 15.36084196458403, + "grad_norm": 1.6135390996932983, + "learning_rate": 7.757203020139092e-06, + "loss": 0.0604, + "num_input_tokens_seen": 167824896, + "step": 137925 + }, + { + "epoch": 15.361398819467647, + "grad_norm": 0.19085615873336792, + "learning_rate": 7.755443767510792e-06, + "loss": 0.0133, + "num_input_tokens_seen": 167830944, + "step": 137930 + }, + { + "epoch": 15.361955674351265, + "grad_norm": 0.012647584080696106, + "learning_rate": 7.75368467777113e-06, + "loss": 0.0228, + "num_input_tokens_seen": 167837216, + "step": 137935 + }, + { + "epoch": 15.36251252923488, + "grad_norm": 0.19636939465999603, + "learning_rate": 7.751925750936745e-06, + "loss": 0.004, + "num_input_tokens_seen": 167842880, + "step": 137940 + }, + { + "epoch": 15.363069384118498, + "grad_norm": 0.006684304215013981, + "learning_rate": 7.75016698702424e-06, + "loss": 0.0826, + "num_input_tokens_seen": 167848960, + "step": 137945 + }, + { + "epoch": 15.363626239002116, + "grad_norm": 0.09530557692050934, + "learning_rate": 7.748408386050226e-06, + "loss": 0.0212, + "num_input_tokens_seen": 167854848, + "step": 137950 + }, + { + "epoch": 15.364183093885734, + "grad_norm": 0.531648576259613, + "learning_rate": 7.74664994803131e-06, + "loss": 0.0668, + "num_input_tokens_seen": 167861056, + "step": 137955 + }, + { + "epoch": 15.364739948769351, + "grad_norm": 1.3963693380355835, + "learning_rate": 7.744891672984117e-06, + "loss": 0.0581, + "num_input_tokens_seen": 167866944, + "step": 137960 + }, + { + "epoch": 15.365296803652967, + "grad_norm": 0.2469886988401413, + "learning_rate": 7.743133560925244e-06, + "loss": 0.1003, + "num_input_tokens_seen": 167873248, + "step": 137965 + }, + { + "epoch": 15.365853658536585, + "grad_norm": 0.011264224536716938, + "learning_rate": 7.741375611871304e-06, + "loss": 0.0257, + "num_input_tokens_seen": 167879328, + "step": 137970 + }, + { + "epoch": 15.366410513420202, + "grad_norm": 0.0008424557745456696, + "learning_rate": 7.739617825838888e-06, + "loss": 0.0968, + "num_input_tokens_seen": 167885472, + "step": 137975 + }, + { + "epoch": 15.36696736830382, + "grad_norm": 0.018665509298443794, + "learning_rate": 7.737860202844621e-06, + "loss": 0.0037, + "num_input_tokens_seen": 167891168, + "step": 137980 + }, + { + "epoch": 15.367524223187438, + "grad_norm": 0.8844872713088989, + "learning_rate": 7.736102742905086e-06, + "loss": 0.0249, + "num_input_tokens_seen": 167897312, + "step": 137985 + }, + { + "epoch": 15.368081078071056, + "grad_norm": 0.2527371644973755, + "learning_rate": 7.734345446036897e-06, + "loss": 0.0058, + "num_input_tokens_seen": 167903616, + "step": 137990 + }, + { + "epoch": 15.368637932954671, + "grad_norm": 0.0554405152797699, + "learning_rate": 7.73258831225665e-06, + "loss": 0.0177, + "num_input_tokens_seen": 167909920, + "step": 137995 + }, + { + "epoch": 15.369194787838289, + "grad_norm": 0.16019481420516968, + "learning_rate": 7.73083134158094e-06, + "loss": 0.0525, + "num_input_tokens_seen": 167916256, + "step": 138000 + }, + { + "epoch": 15.369751642721907, + "grad_norm": 0.4690505564212799, + "learning_rate": 7.72907453402636e-06, + "loss": 0.0113, + "num_input_tokens_seen": 167922464, + "step": 138005 + }, + { + "epoch": 15.370308497605524, + "grad_norm": 0.0031267215963453054, + "learning_rate": 7.727317889609512e-06, + "loss": 0.0002, + "num_input_tokens_seen": 167928608, + "step": 138010 + }, + { + "epoch": 15.370865352489142, + "grad_norm": 0.3833351731300354, + "learning_rate": 7.725561408346987e-06, + "loss": 0.0077, + "num_input_tokens_seen": 167934624, + "step": 138015 + }, + { + "epoch": 15.371422207372758, + "grad_norm": 0.03216720372438431, + "learning_rate": 7.723805090255373e-06, + "loss": 0.0179, + "num_input_tokens_seen": 167940768, + "step": 138020 + }, + { + "epoch": 15.371979062256376, + "grad_norm": 0.0012342837871983647, + "learning_rate": 7.722048935351256e-06, + "loss": 0.0211, + "num_input_tokens_seen": 167947072, + "step": 138025 + }, + { + "epoch": 15.372535917139993, + "grad_norm": 3.3875603675842285, + "learning_rate": 7.720292943651235e-06, + "loss": 0.0459, + "num_input_tokens_seen": 167953312, + "step": 138030 + }, + { + "epoch": 15.373092772023611, + "grad_norm": 0.010694117285311222, + "learning_rate": 7.718537115171892e-06, + "loss": 0.0856, + "num_input_tokens_seen": 167958592, + "step": 138035 + }, + { + "epoch": 15.373649626907229, + "grad_norm": 0.15311965346336365, + "learning_rate": 7.716781449929814e-06, + "loss": 0.1073, + "num_input_tokens_seen": 167964192, + "step": 138040 + }, + { + "epoch": 15.374206481790845, + "grad_norm": 0.017487676814198494, + "learning_rate": 7.715025947941571e-06, + "loss": 0.0258, + "num_input_tokens_seen": 167970368, + "step": 138045 + }, + { + "epoch": 15.374763336674462, + "grad_norm": 0.008523696102201939, + "learning_rate": 7.713270609223766e-06, + "loss": 0.0075, + "num_input_tokens_seen": 167976288, + "step": 138050 + }, + { + "epoch": 15.37532019155808, + "grad_norm": 0.2621747553348541, + "learning_rate": 7.711515433792962e-06, + "loss": 0.0218, + "num_input_tokens_seen": 167981952, + "step": 138055 + }, + { + "epoch": 15.375877046441698, + "grad_norm": 0.05675661563873291, + "learning_rate": 7.709760421665755e-06, + "loss": 0.0231, + "num_input_tokens_seen": 167987680, + "step": 138060 + }, + { + "epoch": 15.376433901325315, + "grad_norm": 0.3831983804702759, + "learning_rate": 7.708005572858713e-06, + "loss": 0.0299, + "num_input_tokens_seen": 167994208, + "step": 138065 + }, + { + "epoch": 15.376990756208931, + "grad_norm": 0.0005729669355787337, + "learning_rate": 7.706250887388412e-06, + "loss": 0.0071, + "num_input_tokens_seen": 168000256, + "step": 138070 + }, + { + "epoch": 15.377547611092549, + "grad_norm": 1.731122374534607, + "learning_rate": 7.704496365271418e-06, + "loss": 0.0353, + "num_input_tokens_seen": 168005504, + "step": 138075 + }, + { + "epoch": 15.378104465976167, + "grad_norm": 9.814854274736717e-05, + "learning_rate": 7.702742006524322e-06, + "loss": 0.001, + "num_input_tokens_seen": 168011808, + "step": 138080 + }, + { + "epoch": 15.378661320859784, + "grad_norm": 0.21858267486095428, + "learning_rate": 7.700987811163684e-06, + "loss": 0.0046, + "num_input_tokens_seen": 168018048, + "step": 138085 + }, + { + "epoch": 15.379218175743402, + "grad_norm": 0.35876262187957764, + "learning_rate": 7.699233779206077e-06, + "loss": 0.096, + "num_input_tokens_seen": 168023776, + "step": 138090 + }, + { + "epoch": 15.379775030627018, + "grad_norm": 0.1658477932214737, + "learning_rate": 7.697479910668062e-06, + "loss": 0.0051, + "num_input_tokens_seen": 168029344, + "step": 138095 + }, + { + "epoch": 15.380331885510635, + "grad_norm": 0.01971430890262127, + "learning_rate": 7.695726205566217e-06, + "loss": 0.0232, + "num_input_tokens_seen": 168035488, + "step": 138100 + }, + { + "epoch": 15.380888740394253, + "grad_norm": 0.2930241525173187, + "learning_rate": 7.693972663917095e-06, + "loss": 0.0393, + "num_input_tokens_seen": 168041536, + "step": 138105 + }, + { + "epoch": 15.38144559527787, + "grad_norm": 4.751911163330078, + "learning_rate": 7.692219285737284e-06, + "loss": 0.05, + "num_input_tokens_seen": 168047616, + "step": 138110 + }, + { + "epoch": 15.382002450161488, + "grad_norm": 0.06790398806333542, + "learning_rate": 7.690466071043312e-06, + "loss": 0.0375, + "num_input_tokens_seen": 168053376, + "step": 138115 + }, + { + "epoch": 15.382559305045106, + "grad_norm": 3.504664897918701, + "learning_rate": 7.688713019851762e-06, + "loss": 0.1477, + "num_input_tokens_seen": 168059360, + "step": 138120 + }, + { + "epoch": 15.383116159928722, + "grad_norm": 0.07211402803659439, + "learning_rate": 7.686960132179183e-06, + "loss": 0.0306, + "num_input_tokens_seen": 168065408, + "step": 138125 + }, + { + "epoch": 15.38367301481234, + "grad_norm": 0.12644366919994354, + "learning_rate": 7.685207408042142e-06, + "loss": 0.0029, + "num_input_tokens_seen": 168071296, + "step": 138130 + }, + { + "epoch": 15.384229869695957, + "grad_norm": 1.2170740365982056, + "learning_rate": 7.683454847457188e-06, + "loss": 0.0361, + "num_input_tokens_seen": 168077248, + "step": 138135 + }, + { + "epoch": 15.384786724579575, + "grad_norm": 0.34342479705810547, + "learning_rate": 7.681702450440878e-06, + "loss": 0.0131, + "num_input_tokens_seen": 168083168, + "step": 138140 + }, + { + "epoch": 15.385343579463193, + "grad_norm": 0.06929226964712143, + "learning_rate": 7.679950217009757e-06, + "loss": 0.0602, + "num_input_tokens_seen": 168089216, + "step": 138145 + }, + { + "epoch": 15.385900434346809, + "grad_norm": 0.28117111325263977, + "learning_rate": 7.67819814718039e-06, + "loss": 0.0203, + "num_input_tokens_seen": 168095456, + "step": 138150 + }, + { + "epoch": 15.386457289230426, + "grad_norm": 0.00017307655070908368, + "learning_rate": 7.676446240969317e-06, + "loss": 0.0561, + "num_input_tokens_seen": 168101952, + "step": 138155 + }, + { + "epoch": 15.387014144114044, + "grad_norm": 0.0002119759883498773, + "learning_rate": 7.674694498393092e-06, + "loss": 0.0151, + "num_input_tokens_seen": 168108288, + "step": 138160 + }, + { + "epoch": 15.387570998997662, + "grad_norm": 0.3192402124404907, + "learning_rate": 7.672942919468248e-06, + "loss": 0.034, + "num_input_tokens_seen": 168114496, + "step": 138165 + }, + { + "epoch": 15.38812785388128, + "grad_norm": 0.0003499683225527406, + "learning_rate": 7.67119150421135e-06, + "loss": 0.0026, + "num_input_tokens_seen": 168120864, + "step": 138170 + }, + { + "epoch": 15.388684708764895, + "grad_norm": 0.10539717972278595, + "learning_rate": 7.669440252638924e-06, + "loss": 0.0381, + "num_input_tokens_seen": 168126784, + "step": 138175 + }, + { + "epoch": 15.389241563648513, + "grad_norm": 0.6751338243484497, + "learning_rate": 7.667689164767535e-06, + "loss": 0.055, + "num_input_tokens_seen": 168132896, + "step": 138180 + }, + { + "epoch": 15.38979841853213, + "grad_norm": 0.8513445258140564, + "learning_rate": 7.665938240613693e-06, + "loss": 0.1027, + "num_input_tokens_seen": 168138784, + "step": 138185 + }, + { + "epoch": 15.390355273415748, + "grad_norm": 0.050980012863874435, + "learning_rate": 7.66418748019396e-06, + "loss": 0.0097, + "num_input_tokens_seen": 168144928, + "step": 138190 + }, + { + "epoch": 15.390912128299366, + "grad_norm": 0.0030447514727711678, + "learning_rate": 7.662436883524856e-06, + "loss": 0.1388, + "num_input_tokens_seen": 168151040, + "step": 138195 + }, + { + "epoch": 15.391468983182982, + "grad_norm": 0.4593167006969452, + "learning_rate": 7.660686450622937e-06, + "loss": 0.013, + "num_input_tokens_seen": 168157120, + "step": 138200 + }, + { + "epoch": 15.3920258380666, + "grad_norm": 0.046636637300252914, + "learning_rate": 7.658936181504723e-06, + "loss": 0.0018, + "num_input_tokens_seen": 168163360, + "step": 138205 + }, + { + "epoch": 15.392582692950217, + "grad_norm": 0.025639783591032028, + "learning_rate": 7.657186076186753e-06, + "loss": 0.1453, + "num_input_tokens_seen": 168169440, + "step": 138210 + }, + { + "epoch": 15.393139547833835, + "grad_norm": 0.0037698529195040464, + "learning_rate": 7.655436134685545e-06, + "loss": 0.0398, + "num_input_tokens_seen": 168175648, + "step": 138215 + }, + { + "epoch": 15.393696402717453, + "grad_norm": 1.482759952545166, + "learning_rate": 7.653686357017651e-06, + "loss": 0.1481, + "num_input_tokens_seen": 168181728, + "step": 138220 + }, + { + "epoch": 15.394253257601068, + "grad_norm": 0.6122120022773743, + "learning_rate": 7.651936743199583e-06, + "loss": 0.018, + "num_input_tokens_seen": 168187712, + "step": 138225 + }, + { + "epoch": 15.394810112484686, + "grad_norm": 0.0024276443291455507, + "learning_rate": 7.650187293247871e-06, + "loss": 0.0006, + "num_input_tokens_seen": 168193920, + "step": 138230 + }, + { + "epoch": 15.395366967368304, + "grad_norm": 0.05851985514163971, + "learning_rate": 7.648438007179043e-06, + "loss": 0.0404, + "num_input_tokens_seen": 168200064, + "step": 138235 + }, + { + "epoch": 15.395923822251921, + "grad_norm": 0.029132844880223274, + "learning_rate": 7.646688885009612e-06, + "loss": 0.0112, + "num_input_tokens_seen": 168206272, + "step": 138240 + }, + { + "epoch": 15.39648067713554, + "grad_norm": 2.259277582168579, + "learning_rate": 7.644939926756114e-06, + "loss": 0.0501, + "num_input_tokens_seen": 168212192, + "step": 138245 + }, + { + "epoch": 15.397037532019155, + "grad_norm": 0.23293298482894897, + "learning_rate": 7.643191132435057e-06, + "loss": 0.0569, + "num_input_tokens_seen": 168218016, + "step": 138250 + }, + { + "epoch": 15.397594386902773, + "grad_norm": 0.5560985207557678, + "learning_rate": 7.641442502062978e-06, + "loss": 0.0361, + "num_input_tokens_seen": 168224064, + "step": 138255 + }, + { + "epoch": 15.39815124178639, + "grad_norm": 0.3421640694141388, + "learning_rate": 7.63969403565637e-06, + "loss": 0.0876, + "num_input_tokens_seen": 168229952, + "step": 138260 + }, + { + "epoch": 15.398708096670008, + "grad_norm": 0.420168936252594, + "learning_rate": 7.637945733231767e-06, + "loss": 0.0113, + "num_input_tokens_seen": 168236128, + "step": 138265 + }, + { + "epoch": 15.399264951553626, + "grad_norm": 0.08550416678190231, + "learning_rate": 7.636197594805668e-06, + "loss": 0.0553, + "num_input_tokens_seen": 168242336, + "step": 138270 + }, + { + "epoch": 15.399821806437242, + "grad_norm": 0.00046589007251895964, + "learning_rate": 7.634449620394605e-06, + "loss": 0.042, + "num_input_tokens_seen": 168248704, + "step": 138275 + }, + { + "epoch": 15.40037866132086, + "grad_norm": 0.12463133782148361, + "learning_rate": 7.632701810015078e-06, + "loss": 0.0443, + "num_input_tokens_seen": 168255072, + "step": 138280 + }, + { + "epoch": 15.400935516204477, + "grad_norm": 0.3467411696910858, + "learning_rate": 7.630954163683592e-06, + "loss": 0.0349, + "num_input_tokens_seen": 168260992, + "step": 138285 + }, + { + "epoch": 15.401492371088095, + "grad_norm": 0.1582978367805481, + "learning_rate": 7.6292066814166595e-06, + "loss": 0.0253, + "num_input_tokens_seen": 168267072, + "step": 138290 + }, + { + "epoch": 15.402049225971712, + "grad_norm": 0.01995430514216423, + "learning_rate": 7.6274593632307905e-06, + "loss": 0.0839, + "num_input_tokens_seen": 168273344, + "step": 138295 + }, + { + "epoch": 15.402606080855328, + "grad_norm": 0.8779047131538391, + "learning_rate": 7.625712209142486e-06, + "loss": 0.0369, + "num_input_tokens_seen": 168279552, + "step": 138300 + }, + { + "epoch": 15.403162935738946, + "grad_norm": 1.6690928936004639, + "learning_rate": 7.62396521916825e-06, + "loss": 0.0399, + "num_input_tokens_seen": 168285952, + "step": 138305 + }, + { + "epoch": 15.403719790622564, + "grad_norm": 0.00040737894596531987, + "learning_rate": 7.622218393324576e-06, + "loss": 0.0127, + "num_input_tokens_seen": 168292192, + "step": 138310 + }, + { + "epoch": 15.404276645506181, + "grad_norm": 0.8376452326774597, + "learning_rate": 7.620471731627982e-06, + "loss": 0.1482, + "num_input_tokens_seen": 168297920, + "step": 138315 + }, + { + "epoch": 15.404833500389799, + "grad_norm": 0.3265676200389862, + "learning_rate": 7.618725234094948e-06, + "loss": 0.0292, + "num_input_tokens_seen": 168304032, + "step": 138320 + }, + { + "epoch": 15.405390355273417, + "grad_norm": 0.000806215510237962, + "learning_rate": 7.616978900741994e-06, + "loss": 0.005, + "num_input_tokens_seen": 168310496, + "step": 138325 + }, + { + "epoch": 15.405947210157033, + "grad_norm": 0.4922156035900116, + "learning_rate": 7.615232731585589e-06, + "loss": 0.0902, + "num_input_tokens_seen": 168316448, + "step": 138330 + }, + { + "epoch": 15.40650406504065, + "grad_norm": 0.0270523838698864, + "learning_rate": 7.613486726642244e-06, + "loss": 0.0055, + "num_input_tokens_seen": 168322656, + "step": 138335 + }, + { + "epoch": 15.407060919924268, + "grad_norm": 0.04369531199336052, + "learning_rate": 7.61174088592844e-06, + "loss": 0.0215, + "num_input_tokens_seen": 168328736, + "step": 138340 + }, + { + "epoch": 15.407617774807886, + "grad_norm": 0.09906549006700516, + "learning_rate": 7.6099952094606824e-06, + "loss": 0.0303, + "num_input_tokens_seen": 168334560, + "step": 138345 + }, + { + "epoch": 15.408174629691503, + "grad_norm": 0.04094422608613968, + "learning_rate": 7.608249697255451e-06, + "loss": 0.0028, + "num_input_tokens_seen": 168340832, + "step": 138350 + }, + { + "epoch": 15.408731484575119, + "grad_norm": 0.008166782557964325, + "learning_rate": 7.606504349329238e-06, + "loss": 0.0358, + "num_input_tokens_seen": 168347040, + "step": 138355 + }, + { + "epoch": 15.409288339458737, + "grad_norm": 0.014969178475439548, + "learning_rate": 7.604759165698519e-06, + "loss": 0.0329, + "num_input_tokens_seen": 168353280, + "step": 138360 + }, + { + "epoch": 15.409845194342354, + "grad_norm": 0.00015836999227758497, + "learning_rate": 7.603014146379792e-06, + "loss": 0.0551, + "num_input_tokens_seen": 168359648, + "step": 138365 + }, + { + "epoch": 15.410402049225972, + "grad_norm": 0.06289142370223999, + "learning_rate": 7.601269291389534e-06, + "loss": 0.1459, + "num_input_tokens_seen": 168365600, + "step": 138370 + }, + { + "epoch": 15.41095890410959, + "grad_norm": 3.105314016342163, + "learning_rate": 7.599524600744232e-06, + "loss": 0.1545, + "num_input_tokens_seen": 168371584, + "step": 138375 + }, + { + "epoch": 15.411515758993206, + "grad_norm": 0.03858618810772896, + "learning_rate": 7.597780074460348e-06, + "loss": 0.0534, + "num_input_tokens_seen": 168377536, + "step": 138380 + }, + { + "epoch": 15.412072613876823, + "grad_norm": 0.06499532610177994, + "learning_rate": 7.596035712554384e-06, + "loss": 0.0135, + "num_input_tokens_seen": 168383840, + "step": 138385 + }, + { + "epoch": 15.412629468760441, + "grad_norm": 0.000709826301317662, + "learning_rate": 7.594291515042798e-06, + "loss": 0.053, + "num_input_tokens_seen": 168389920, + "step": 138390 + }, + { + "epoch": 15.413186323644059, + "grad_norm": 0.053693488240242004, + "learning_rate": 7.592547481942083e-06, + "loss": 0.0177, + "num_input_tokens_seen": 168396192, + "step": 138395 + }, + { + "epoch": 15.413743178527676, + "grad_norm": 0.97575443983078, + "learning_rate": 7.590803613268705e-06, + "loss": 0.1381, + "num_input_tokens_seen": 168402208, + "step": 138400 + }, + { + "epoch": 15.414300033411292, + "grad_norm": 0.021865511313080788, + "learning_rate": 7.589059909039131e-06, + "loss": 0.0035, + "num_input_tokens_seen": 168408352, + "step": 138405 + }, + { + "epoch": 15.41485688829491, + "grad_norm": 0.07391221821308136, + "learning_rate": 7.587316369269829e-06, + "loss": 0.0399, + "num_input_tokens_seen": 168414528, + "step": 138410 + }, + { + "epoch": 15.415413743178528, + "grad_norm": 0.0010683167492970824, + "learning_rate": 7.585572993977283e-06, + "loss": 0.0064, + "num_input_tokens_seen": 168420160, + "step": 138415 + }, + { + "epoch": 15.415970598062145, + "grad_norm": 0.30581411719322205, + "learning_rate": 7.5838297831779534e-06, + "loss": 0.0156, + "num_input_tokens_seen": 168426208, + "step": 138420 + }, + { + "epoch": 15.416527452945763, + "grad_norm": 0.6111297011375427, + "learning_rate": 7.582086736888303e-06, + "loss": 0.0119, + "num_input_tokens_seen": 168432608, + "step": 138425 + }, + { + "epoch": 15.417084307829379, + "grad_norm": 0.598561704158783, + "learning_rate": 7.580343855124791e-06, + "loss": 0.0567, + "num_input_tokens_seen": 168438560, + "step": 138430 + }, + { + "epoch": 15.417641162712997, + "grad_norm": 0.019650913774967194, + "learning_rate": 7.578601137903896e-06, + "loss": 0.0081, + "num_input_tokens_seen": 168444864, + "step": 138435 + }, + { + "epoch": 15.418198017596614, + "grad_norm": 0.3046989440917969, + "learning_rate": 7.576858585242064e-06, + "loss": 0.0211, + "num_input_tokens_seen": 168450176, + "step": 138440 + }, + { + "epoch": 15.418754872480232, + "grad_norm": 0.0002608972426969558, + "learning_rate": 7.575116197155777e-06, + "loss": 0.052, + "num_input_tokens_seen": 168456416, + "step": 138445 + }, + { + "epoch": 15.41931172736385, + "grad_norm": 0.7442177534103394, + "learning_rate": 7.5733739736614604e-06, + "loss": 0.1289, + "num_input_tokens_seen": 168462592, + "step": 138450 + }, + { + "epoch": 15.419868582247465, + "grad_norm": 0.2556484043598175, + "learning_rate": 7.571631914775598e-06, + "loss": 0.1014, + "num_input_tokens_seen": 168468704, + "step": 138455 + }, + { + "epoch": 15.420425437131083, + "grad_norm": 0.07054329663515091, + "learning_rate": 7.5698900205146275e-06, + "loss": 0.0079, + "num_input_tokens_seen": 168474688, + "step": 138460 + }, + { + "epoch": 15.4209822920147, + "grad_norm": 0.007074333261698484, + "learning_rate": 7.568148290895019e-06, + "loss": 0.0032, + "num_input_tokens_seen": 168480928, + "step": 138465 + }, + { + "epoch": 15.421539146898319, + "grad_norm": 0.005188522394746542, + "learning_rate": 7.5664067259332175e-06, + "loss": 0.2214, + "num_input_tokens_seen": 168487136, + "step": 138470 + }, + { + "epoch": 15.422096001781936, + "grad_norm": 0.40550994873046875, + "learning_rate": 7.56466532564567e-06, + "loss": 0.0453, + "num_input_tokens_seen": 168493248, + "step": 138475 + }, + { + "epoch": 15.422652856665554, + "grad_norm": 0.002532802289351821, + "learning_rate": 7.56292409004882e-06, + "loss": 0.0084, + "num_input_tokens_seen": 168499456, + "step": 138480 + }, + { + "epoch": 15.42320971154917, + "grad_norm": 0.1650567352771759, + "learning_rate": 7.56118301915913e-06, + "loss": 0.0167, + "num_input_tokens_seen": 168505760, + "step": 138485 + }, + { + "epoch": 15.423766566432787, + "grad_norm": 0.0012375732185319066, + "learning_rate": 7.559442112993037e-06, + "loss": 0.0094, + "num_input_tokens_seen": 168511712, + "step": 138490 + }, + { + "epoch": 15.424323421316405, + "grad_norm": 0.8090671300888062, + "learning_rate": 7.557701371566988e-06, + "loss": 0.0132, + "num_input_tokens_seen": 168517984, + "step": 138495 + }, + { + "epoch": 15.424880276200023, + "grad_norm": 3.065450668334961, + "learning_rate": 7.555960794897418e-06, + "loss": 0.0363, + "num_input_tokens_seen": 168524064, + "step": 138500 + }, + { + "epoch": 15.42543713108364, + "grad_norm": 1.6086177825927734, + "learning_rate": 7.554220383000779e-06, + "loss": 0.1081, + "num_input_tokens_seen": 168529760, + "step": 138505 + }, + { + "epoch": 15.425993985967256, + "grad_norm": 0.020717382431030273, + "learning_rate": 7.5524801358935e-06, + "loss": 0.0569, + "num_input_tokens_seen": 168535808, + "step": 138510 + }, + { + "epoch": 15.426550840850874, + "grad_norm": 0.027171248570084572, + "learning_rate": 7.550740053592037e-06, + "loss": 0.0106, + "num_input_tokens_seen": 168542080, + "step": 138515 + }, + { + "epoch": 15.427107695734492, + "grad_norm": 1.3648262023925781, + "learning_rate": 7.5490001361128025e-06, + "loss": 0.1169, + "num_input_tokens_seen": 168548352, + "step": 138520 + }, + { + "epoch": 15.42766455061811, + "grad_norm": 1.3189117908477783, + "learning_rate": 7.5472603834722485e-06, + "loss": 0.1378, + "num_input_tokens_seen": 168554816, + "step": 138525 + }, + { + "epoch": 15.428221405501727, + "grad_norm": 0.00896312016993761, + "learning_rate": 7.545520795686797e-06, + "loss": 0.0139, + "num_input_tokens_seen": 168560960, + "step": 138530 + }, + { + "epoch": 15.428778260385343, + "grad_norm": 0.10929010063409805, + "learning_rate": 7.543781372772893e-06, + "loss": 0.007, + "num_input_tokens_seen": 168567040, + "step": 138535 + }, + { + "epoch": 15.42933511526896, + "grad_norm": 0.027304215356707573, + "learning_rate": 7.542042114746961e-06, + "loss": 0.0024, + "num_input_tokens_seen": 168573152, + "step": 138540 + }, + { + "epoch": 15.429891970152578, + "grad_norm": 0.0011835177429020405, + "learning_rate": 7.540303021625425e-06, + "loss": 0.0012, + "num_input_tokens_seen": 168579296, + "step": 138545 + }, + { + "epoch": 15.430448825036196, + "grad_norm": 0.038653697818517685, + "learning_rate": 7.53856409342471e-06, + "loss": 0.108, + "num_input_tokens_seen": 168585440, + "step": 138550 + }, + { + "epoch": 15.431005679919814, + "grad_norm": 0.011101197451353073, + "learning_rate": 7.536825330161254e-06, + "loss": 0.0019, + "num_input_tokens_seen": 168591840, + "step": 138555 + }, + { + "epoch": 15.43156253480343, + "grad_norm": 0.004002019762992859, + "learning_rate": 7.535086731851476e-06, + "loss": 0.0687, + "num_input_tokens_seen": 168597440, + "step": 138560 + }, + { + "epoch": 15.432119389687047, + "grad_norm": 0.0064531415700912476, + "learning_rate": 7.533348298511794e-06, + "loss": 0.0109, + "num_input_tokens_seen": 168603584, + "step": 138565 + }, + { + "epoch": 15.432676244570665, + "grad_norm": 4.613842964172363, + "learning_rate": 7.531610030158626e-06, + "loss": 0.091, + "num_input_tokens_seen": 168609984, + "step": 138570 + }, + { + "epoch": 15.433233099454283, + "grad_norm": 0.7824519872665405, + "learning_rate": 7.529871926808402e-06, + "loss": 0.1344, + "num_input_tokens_seen": 168615904, + "step": 138575 + }, + { + "epoch": 15.4337899543379, + "grad_norm": 0.7068015933036804, + "learning_rate": 7.528133988477528e-06, + "loss": 0.0413, + "num_input_tokens_seen": 168621856, + "step": 138580 + }, + { + "epoch": 15.434346809221516, + "grad_norm": 0.34009698033332825, + "learning_rate": 7.526396215182441e-06, + "loss": 0.0134, + "num_input_tokens_seen": 168628160, + "step": 138585 + }, + { + "epoch": 15.434903664105134, + "grad_norm": 0.8167307376861572, + "learning_rate": 7.524658606939527e-06, + "loss": 0.1449, + "num_input_tokens_seen": 168634240, + "step": 138590 + }, + { + "epoch": 15.435460518988751, + "grad_norm": 0.0029388093389570713, + "learning_rate": 7.52292116376522e-06, + "loss": 0.0173, + "num_input_tokens_seen": 168639840, + "step": 138595 + }, + { + "epoch": 15.43601737387237, + "grad_norm": 0.007774410769343376, + "learning_rate": 7.5211838856759196e-06, + "loss": 0.0126, + "num_input_tokens_seen": 168645824, + "step": 138600 + }, + { + "epoch": 15.436574228755987, + "grad_norm": 0.1483384668827057, + "learning_rate": 7.519446772688046e-06, + "loss": 0.0129, + "num_input_tokens_seen": 168651616, + "step": 138605 + }, + { + "epoch": 15.437131083639603, + "grad_norm": 0.5307512283325195, + "learning_rate": 7.517709824818006e-06, + "loss": 0.1451, + "num_input_tokens_seen": 168657472, + "step": 138610 + }, + { + "epoch": 15.43768793852322, + "grad_norm": 0.16086585819721222, + "learning_rate": 7.515973042082203e-06, + "loss": 0.0721, + "num_input_tokens_seen": 168663360, + "step": 138615 + }, + { + "epoch": 15.438244793406838, + "grad_norm": 2.2235593795776367, + "learning_rate": 7.514236424497031e-06, + "loss": 0.1126, + "num_input_tokens_seen": 168669568, + "step": 138620 + }, + { + "epoch": 15.438801648290456, + "grad_norm": 0.04311017692089081, + "learning_rate": 7.512499972078918e-06, + "loss": 0.0421, + "num_input_tokens_seen": 168675616, + "step": 138625 + }, + { + "epoch": 15.439358503174073, + "grad_norm": 0.0012760251993313432, + "learning_rate": 7.510763684844249e-06, + "loss": 0.0063, + "num_input_tokens_seen": 168681696, + "step": 138630 + }, + { + "epoch": 15.43991535805769, + "grad_norm": 0.5325751900672913, + "learning_rate": 7.509027562809432e-06, + "loss": 0.0523, + "num_input_tokens_seen": 168687904, + "step": 138635 + }, + { + "epoch": 15.440472212941307, + "grad_norm": 0.7551866769790649, + "learning_rate": 7.507291605990854e-06, + "loss": 0.0128, + "num_input_tokens_seen": 168694400, + "step": 138640 + }, + { + "epoch": 15.441029067824925, + "grad_norm": 0.1284540444612503, + "learning_rate": 7.505555814404932e-06, + "loss": 0.0029, + "num_input_tokens_seen": 168700384, + "step": 138645 + }, + { + "epoch": 15.441585922708542, + "grad_norm": 0.25611814856529236, + "learning_rate": 7.503820188068051e-06, + "loss": 0.0288, + "num_input_tokens_seen": 168706528, + "step": 138650 + }, + { + "epoch": 15.44214277759216, + "grad_norm": 0.49972814321517944, + "learning_rate": 7.502084726996594e-06, + "loss": 0.1856, + "num_input_tokens_seen": 168712224, + "step": 138655 + }, + { + "epoch": 15.442699632475776, + "grad_norm": 0.0007861655321903527, + "learning_rate": 7.500349431206985e-06, + "loss": 0.0121, + "num_input_tokens_seen": 168718016, + "step": 138660 + }, + { + "epoch": 15.443256487359394, + "grad_norm": 1.3607478141784668, + "learning_rate": 7.498614300715581e-06, + "loss": 0.0903, + "num_input_tokens_seen": 168723872, + "step": 138665 + }, + { + "epoch": 15.443813342243011, + "grad_norm": 0.4314540922641754, + "learning_rate": 7.496879335538792e-06, + "loss": 0.0057, + "num_input_tokens_seen": 168729984, + "step": 138670 + }, + { + "epoch": 15.444370197126629, + "grad_norm": 0.005547267850488424, + "learning_rate": 7.4951445356929965e-06, + "loss": 0.0158, + "num_input_tokens_seen": 168736256, + "step": 138675 + }, + { + "epoch": 15.444927052010247, + "grad_norm": 0.4709645211696625, + "learning_rate": 7.493409901194593e-06, + "loss": 0.0152, + "num_input_tokens_seen": 168742112, + "step": 138680 + }, + { + "epoch": 15.445483906893864, + "grad_norm": 0.8069272637367249, + "learning_rate": 7.491675432059955e-06, + "loss": 0.0367, + "num_input_tokens_seen": 168747904, + "step": 138685 + }, + { + "epoch": 15.44604076177748, + "grad_norm": 1.0800062417984009, + "learning_rate": 7.489941128305475e-06, + "loss": 0.0359, + "num_input_tokens_seen": 168754176, + "step": 138690 + }, + { + "epoch": 15.446597616661098, + "grad_norm": 0.21478132903575897, + "learning_rate": 7.4882069899475186e-06, + "loss": 0.0287, + "num_input_tokens_seen": 168760512, + "step": 138695 + }, + { + "epoch": 15.447154471544716, + "grad_norm": 0.15510223805904388, + "learning_rate": 7.486473017002485e-06, + "loss": 0.1081, + "num_input_tokens_seen": 168765856, + "step": 138700 + }, + { + "epoch": 15.447711326428333, + "grad_norm": 0.6040327548980713, + "learning_rate": 7.4847392094867466e-06, + "loss": 0.0182, + "num_input_tokens_seen": 168771776, + "step": 138705 + }, + { + "epoch": 15.448268181311951, + "grad_norm": 0.1712709218263626, + "learning_rate": 7.4830055674166815e-06, + "loss": 0.0127, + "num_input_tokens_seen": 168778272, + "step": 138710 + }, + { + "epoch": 15.448825036195567, + "grad_norm": 0.5555692911148071, + "learning_rate": 7.481272090808652e-06, + "loss": 0.0302, + "num_input_tokens_seen": 168784704, + "step": 138715 + }, + { + "epoch": 15.449381891079184, + "grad_norm": 0.268829345703125, + "learning_rate": 7.479538779679051e-06, + "loss": 0.0427, + "num_input_tokens_seen": 168790784, + "step": 138720 + }, + { + "epoch": 15.449938745962802, + "grad_norm": 0.9838219881057739, + "learning_rate": 7.4778056340442385e-06, + "loss": 0.0302, + "num_input_tokens_seen": 168796704, + "step": 138725 + }, + { + "epoch": 15.45049560084642, + "grad_norm": 1.301872968673706, + "learning_rate": 7.476072653920605e-06, + "loss": 0.1045, + "num_input_tokens_seen": 168803136, + "step": 138730 + }, + { + "epoch": 15.451052455730037, + "grad_norm": 0.005627220030874014, + "learning_rate": 7.47433983932449e-06, + "loss": 0.1, + "num_input_tokens_seen": 168808704, + "step": 138735 + }, + { + "epoch": 15.451609310613653, + "grad_norm": 0.5154936909675598, + "learning_rate": 7.472607190272282e-06, + "loss": 0.0402, + "num_input_tokens_seen": 168814816, + "step": 138740 + }, + { + "epoch": 15.452166165497271, + "grad_norm": 0.8055459260940552, + "learning_rate": 7.470874706780337e-06, + "loss": 0.0355, + "num_input_tokens_seen": 168820608, + "step": 138745 + }, + { + "epoch": 15.452723020380889, + "grad_norm": 4.202916145324707, + "learning_rate": 7.46914238886503e-06, + "loss": 0.0493, + "num_input_tokens_seen": 168825792, + "step": 138750 + }, + { + "epoch": 15.453279875264506, + "grad_norm": 0.06611645966768265, + "learning_rate": 7.467410236542719e-06, + "loss": 0.032, + "num_input_tokens_seen": 168831744, + "step": 138755 + }, + { + "epoch": 15.453836730148124, + "grad_norm": 0.04212208837270737, + "learning_rate": 7.465678249829766e-06, + "loss": 0.155, + "num_input_tokens_seen": 168837440, + "step": 138760 + }, + { + "epoch": 15.45439358503174, + "grad_norm": 0.039578307420015335, + "learning_rate": 7.463946428742522e-06, + "loss": 0.0206, + "num_input_tokens_seen": 168842944, + "step": 138765 + }, + { + "epoch": 15.454950439915358, + "grad_norm": 1.138755440711975, + "learning_rate": 7.46221477329736e-06, + "loss": 0.1571, + "num_input_tokens_seen": 168849248, + "step": 138770 + }, + { + "epoch": 15.455507294798975, + "grad_norm": 0.12971092760562897, + "learning_rate": 7.460483283510633e-06, + "loss": 0.0123, + "num_input_tokens_seen": 168855584, + "step": 138775 + }, + { + "epoch": 15.456064149682593, + "grad_norm": 0.9476183652877808, + "learning_rate": 7.458751959398694e-06, + "loss": 0.0315, + "num_input_tokens_seen": 168861600, + "step": 138780 + }, + { + "epoch": 15.45662100456621, + "grad_norm": 0.003929857164621353, + "learning_rate": 7.457020800977884e-06, + "loss": 0.0187, + "num_input_tokens_seen": 168867872, + "step": 138785 + }, + { + "epoch": 15.457177859449827, + "grad_norm": 0.0008402169914916158, + "learning_rate": 7.455289808264582e-06, + "loss": 0.0121, + "num_input_tokens_seen": 168874208, + "step": 138790 + }, + { + "epoch": 15.457734714333444, + "grad_norm": 0.11812157928943634, + "learning_rate": 7.453558981275113e-06, + "loss": 0.0148, + "num_input_tokens_seen": 168880736, + "step": 138795 + }, + { + "epoch": 15.458291569217062, + "grad_norm": 1.2477072477340698, + "learning_rate": 7.4518283200258444e-06, + "loss": 0.0437, + "num_input_tokens_seen": 168886656, + "step": 138800 + }, + { + "epoch": 15.45884842410068, + "grad_norm": 0.11070962995290756, + "learning_rate": 7.45009782453312e-06, + "loss": 0.0558, + "num_input_tokens_seen": 168892672, + "step": 138805 + }, + { + "epoch": 15.459405278984297, + "grad_norm": 0.22191889584064484, + "learning_rate": 7.44836749481328e-06, + "loss": 0.1157, + "num_input_tokens_seen": 168898528, + "step": 138810 + }, + { + "epoch": 15.459962133867913, + "grad_norm": 0.0013419733149930835, + "learning_rate": 7.446637330882664e-06, + "loss": 0.033, + "num_input_tokens_seen": 168904672, + "step": 138815 + }, + { + "epoch": 15.46051898875153, + "grad_norm": 0.5336927175521851, + "learning_rate": 7.444907332757628e-06, + "loss": 0.0276, + "num_input_tokens_seen": 168910944, + "step": 138820 + }, + { + "epoch": 15.461075843635149, + "grad_norm": 0.2428668886423111, + "learning_rate": 7.44317750045451e-06, + "loss": 0.0172, + "num_input_tokens_seen": 168917344, + "step": 138825 + }, + { + "epoch": 15.461632698518766, + "grad_norm": 1.2453612089157104, + "learning_rate": 7.441447833989645e-06, + "loss": 0.019, + "num_input_tokens_seen": 168923616, + "step": 138830 + }, + { + "epoch": 15.462189553402384, + "grad_norm": 3.8300106525421143, + "learning_rate": 7.4397183333793645e-06, + "loss": 0.1457, + "num_input_tokens_seen": 168929856, + "step": 138835 + }, + { + "epoch": 15.462746408286002, + "grad_norm": 0.02608797512948513, + "learning_rate": 7.437988998640022e-06, + "loss": 0.0048, + "num_input_tokens_seen": 168936064, + "step": 138840 + }, + { + "epoch": 15.463303263169617, + "grad_norm": 0.2992043197154999, + "learning_rate": 7.4362598297879364e-06, + "loss": 0.0116, + "num_input_tokens_seen": 168942336, + "step": 138845 + }, + { + "epoch": 15.463860118053235, + "grad_norm": 0.15271738171577454, + "learning_rate": 7.434530826839464e-06, + "loss": 0.0396, + "num_input_tokens_seen": 168948736, + "step": 138850 + }, + { + "epoch": 15.464416972936853, + "grad_norm": 0.20380499958992004, + "learning_rate": 7.432801989810906e-06, + "loss": 0.0155, + "num_input_tokens_seen": 168954720, + "step": 138855 + }, + { + "epoch": 15.46497382782047, + "grad_norm": 0.060403965413570404, + "learning_rate": 7.431073318718615e-06, + "loss": 0.0898, + "num_input_tokens_seen": 168960672, + "step": 138860 + }, + { + "epoch": 15.465530682704088, + "grad_norm": 0.009218771941959858, + "learning_rate": 7.429344813578906e-06, + "loss": 0.0052, + "num_input_tokens_seen": 168966528, + "step": 138865 + }, + { + "epoch": 15.466087537587704, + "grad_norm": 0.5404889583587646, + "learning_rate": 7.427616474408119e-06, + "loss": 0.0658, + "num_input_tokens_seen": 168972544, + "step": 138870 + }, + { + "epoch": 15.466644392471322, + "grad_norm": 1.2927778959274292, + "learning_rate": 7.4258883012225725e-06, + "loss": 0.0207, + "num_input_tokens_seen": 168978528, + "step": 138875 + }, + { + "epoch": 15.46720124735494, + "grad_norm": 0.9637303948402405, + "learning_rate": 7.424160294038593e-06, + "loss": 0.1084, + "num_input_tokens_seen": 168984768, + "step": 138880 + }, + { + "epoch": 15.467758102238557, + "grad_norm": 0.14718495309352875, + "learning_rate": 7.422432452872491e-06, + "loss": 0.0038, + "num_input_tokens_seen": 168991040, + "step": 138885 + }, + { + "epoch": 15.468314957122175, + "grad_norm": 0.07101619988679886, + "learning_rate": 7.420704777740608e-06, + "loss": 0.0458, + "num_input_tokens_seen": 168996512, + "step": 138890 + }, + { + "epoch": 15.46887181200579, + "grad_norm": 0.001866490812972188, + "learning_rate": 7.41897726865925e-06, + "loss": 0.0109, + "num_input_tokens_seen": 169002208, + "step": 138895 + }, + { + "epoch": 15.469428666889408, + "grad_norm": 8.00833513494581e-05, + "learning_rate": 7.41724992564474e-06, + "loss": 0.0022, + "num_input_tokens_seen": 169008480, + "step": 138900 + }, + { + "epoch": 15.469985521773026, + "grad_norm": 1.3948533535003662, + "learning_rate": 7.4155227487133825e-06, + "loss": 0.1152, + "num_input_tokens_seen": 169014656, + "step": 138905 + }, + { + "epoch": 15.470542376656644, + "grad_norm": 0.1144171729683876, + "learning_rate": 7.413795737881507e-06, + "loss": 0.0387, + "num_input_tokens_seen": 169020448, + "step": 138910 + }, + { + "epoch": 15.471099231540261, + "grad_norm": 0.2710064649581909, + "learning_rate": 7.4120688931654165e-06, + "loss": 0.0682, + "num_input_tokens_seen": 169026656, + "step": 138915 + }, + { + "epoch": 15.471656086423877, + "grad_norm": 0.0012401036219671369, + "learning_rate": 7.410342214581439e-06, + "loss": 0.004, + "num_input_tokens_seen": 169032832, + "step": 138920 + }, + { + "epoch": 15.472212941307495, + "grad_norm": 0.04306882247328758, + "learning_rate": 7.4086157021458575e-06, + "loss": 0.0054, + "num_input_tokens_seen": 169038880, + "step": 138925 + }, + { + "epoch": 15.472769796191113, + "grad_norm": 0.32759028673171997, + "learning_rate": 7.406889355875002e-06, + "loss": 0.0193, + "num_input_tokens_seen": 169044576, + "step": 138930 + }, + { + "epoch": 15.47332665107473, + "grad_norm": 0.1380498856306076, + "learning_rate": 7.4051631757851654e-06, + "loss": 0.0214, + "num_input_tokens_seen": 169050880, + "step": 138935 + }, + { + "epoch": 15.473883505958348, + "grad_norm": 0.0001522627571830526, + "learning_rate": 7.403437161892665e-06, + "loss": 0.0806, + "num_input_tokens_seen": 169057056, + "step": 138940 + }, + { + "epoch": 15.474440360841964, + "grad_norm": 7.222102431114763e-05, + "learning_rate": 7.4017113142137996e-06, + "loss": 0.1671, + "num_input_tokens_seen": 169062464, + "step": 138945 + }, + { + "epoch": 15.474997215725582, + "grad_norm": 1.4307245016098022, + "learning_rate": 7.399985632764872e-06, + "loss": 0.0741, + "num_input_tokens_seen": 169068832, + "step": 138950 + }, + { + "epoch": 15.4755540706092, + "grad_norm": 0.2340410202741623, + "learning_rate": 7.398260117562172e-06, + "loss": 0.0675, + "num_input_tokens_seen": 169075072, + "step": 138955 + }, + { + "epoch": 15.476110925492817, + "grad_norm": 0.0862613394856453, + "learning_rate": 7.396534768622015e-06, + "loss": 0.0183, + "num_input_tokens_seen": 169080896, + "step": 138960 + }, + { + "epoch": 15.476667780376435, + "grad_norm": 0.0642026886343956, + "learning_rate": 7.394809585960691e-06, + "loss": 0.0788, + "num_input_tokens_seen": 169087008, + "step": 138965 + }, + { + "epoch": 15.47722463526005, + "grad_norm": 1.771748423576355, + "learning_rate": 7.393084569594494e-06, + "loss": 0.1149, + "num_input_tokens_seen": 169092736, + "step": 138970 + }, + { + "epoch": 15.477781490143668, + "grad_norm": 0.10397662967443466, + "learning_rate": 7.391359719539714e-06, + "loss": 0.061, + "num_input_tokens_seen": 169098752, + "step": 138975 + }, + { + "epoch": 15.478338345027286, + "grad_norm": 3.5488147735595703, + "learning_rate": 7.389635035812656e-06, + "loss": 0.0802, + "num_input_tokens_seen": 169104800, + "step": 138980 + }, + { + "epoch": 15.478895199910903, + "grad_norm": 0.06398425251245499, + "learning_rate": 7.387910518429597e-06, + "loss": 0.0193, + "num_input_tokens_seen": 169111008, + "step": 138985 + }, + { + "epoch": 15.479452054794521, + "grad_norm": 0.10455498099327087, + "learning_rate": 7.38618616740685e-06, + "loss": 0.0448, + "num_input_tokens_seen": 169117184, + "step": 138990 + }, + { + "epoch": 15.480008909678137, + "grad_norm": 1.5025604963302612, + "learning_rate": 7.384461982760671e-06, + "loss": 0.0697, + "num_input_tokens_seen": 169123072, + "step": 138995 + }, + { + "epoch": 15.480565764561755, + "grad_norm": 0.6822134256362915, + "learning_rate": 7.382737964507369e-06, + "loss": 0.0279, + "num_input_tokens_seen": 169129056, + "step": 139000 + }, + { + "epoch": 15.481122619445372, + "grad_norm": 0.02384847402572632, + "learning_rate": 7.381014112663212e-06, + "loss": 0.0984, + "num_input_tokens_seen": 169135168, + "step": 139005 + }, + { + "epoch": 15.48167947432899, + "grad_norm": 0.023797964677214622, + "learning_rate": 7.379290427244504e-06, + "loss": 0.0379, + "num_input_tokens_seen": 169141120, + "step": 139010 + }, + { + "epoch": 15.482236329212608, + "grad_norm": 0.040425654500722885, + "learning_rate": 7.3775669082675135e-06, + "loss": 0.0562, + "num_input_tokens_seen": 169146336, + "step": 139015 + }, + { + "epoch": 15.482793184096224, + "grad_norm": 0.9256421327590942, + "learning_rate": 7.375843555748521e-06, + "loss": 0.0165, + "num_input_tokens_seen": 169152480, + "step": 139020 + }, + { + "epoch": 15.483350038979841, + "grad_norm": 0.9116631746292114, + "learning_rate": 7.374120369703802e-06, + "loss": 0.1217, + "num_input_tokens_seen": 169158688, + "step": 139025 + }, + { + "epoch": 15.483906893863459, + "grad_norm": 0.002627408830448985, + "learning_rate": 7.372397350149643e-06, + "loss": 0.0031, + "num_input_tokens_seen": 169165216, + "step": 139030 + }, + { + "epoch": 15.484463748747077, + "grad_norm": 0.027082497254014015, + "learning_rate": 7.3706744971023145e-06, + "loss": 0.0071, + "num_input_tokens_seen": 169171360, + "step": 139035 + }, + { + "epoch": 15.485020603630694, + "grad_norm": 0.04019804671406746, + "learning_rate": 7.368951810578089e-06, + "loss": 0.0419, + "num_input_tokens_seen": 169177792, + "step": 139040 + }, + { + "epoch": 15.485577458514312, + "grad_norm": 0.01858307607471943, + "learning_rate": 7.367229290593234e-06, + "loss": 0.0323, + "num_input_tokens_seen": 169183968, + "step": 139045 + }, + { + "epoch": 15.486134313397928, + "grad_norm": 1.9712998867034912, + "learning_rate": 7.365506937164032e-06, + "loss": 0.0862, + "num_input_tokens_seen": 169190368, + "step": 139050 + }, + { + "epoch": 15.486691168281546, + "grad_norm": 0.00038421331555582583, + "learning_rate": 7.363784750306746e-06, + "loss": 0.0137, + "num_input_tokens_seen": 169196352, + "step": 139055 + }, + { + "epoch": 15.487248023165163, + "grad_norm": 0.016341978684067726, + "learning_rate": 7.362062730037633e-06, + "loss": 0.0217, + "num_input_tokens_seen": 169202208, + "step": 139060 + }, + { + "epoch": 15.487804878048781, + "grad_norm": 0.2213626503944397, + "learning_rate": 7.360340876372987e-06, + "loss": 0.0137, + "num_input_tokens_seen": 169208480, + "step": 139065 + }, + { + "epoch": 15.488361732932399, + "grad_norm": 0.0007553005707450211, + "learning_rate": 7.358619189329036e-06, + "loss": 0.113, + "num_input_tokens_seen": 169214592, + "step": 139070 + }, + { + "epoch": 15.488918587816014, + "grad_norm": 0.24385876953601837, + "learning_rate": 7.356897668922069e-06, + "loss": 0.01, + "num_input_tokens_seen": 169220672, + "step": 139075 + }, + { + "epoch": 15.489475442699632, + "grad_norm": 0.02569560892879963, + "learning_rate": 7.355176315168333e-06, + "loss": 0.1383, + "num_input_tokens_seen": 169226784, + "step": 139080 + }, + { + "epoch": 15.49003229758325, + "grad_norm": 0.0002604986075311899, + "learning_rate": 7.3534551280841e-06, + "loss": 0.0038, + "num_input_tokens_seen": 169232800, + "step": 139085 + }, + { + "epoch": 15.490589152466868, + "grad_norm": 0.09137521684169769, + "learning_rate": 7.351734107685624e-06, + "loss": 0.0114, + "num_input_tokens_seen": 169238848, + "step": 139090 + }, + { + "epoch": 15.491146007350485, + "grad_norm": 0.013576056808233261, + "learning_rate": 7.3500132539891545e-06, + "loss": 0.0027, + "num_input_tokens_seen": 169244800, + "step": 139095 + }, + { + "epoch": 15.491702862234101, + "grad_norm": 0.3382944166660309, + "learning_rate": 7.348292567010947e-06, + "loss": 0.0854, + "num_input_tokens_seen": 169251232, + "step": 139100 + }, + { + "epoch": 15.492259717117719, + "grad_norm": 0.9823116660118103, + "learning_rate": 7.346572046767264e-06, + "loss": 0.0139, + "num_input_tokens_seen": 169257600, + "step": 139105 + }, + { + "epoch": 15.492816572001336, + "grad_norm": 0.0254824198782444, + "learning_rate": 7.344851693274352e-06, + "loss": 0.0314, + "num_input_tokens_seen": 169263520, + "step": 139110 + }, + { + "epoch": 15.493373426884954, + "grad_norm": 0.004007475450634956, + "learning_rate": 7.343131506548462e-06, + "loss": 0.0065, + "num_input_tokens_seen": 169270144, + "step": 139115 + }, + { + "epoch": 15.493930281768572, + "grad_norm": 0.604577898979187, + "learning_rate": 7.341411486605831e-06, + "loss": 0.01, + "num_input_tokens_seen": 169276544, + "step": 139120 + }, + { + "epoch": 15.494487136652188, + "grad_norm": 0.19225247204303741, + "learning_rate": 7.339691633462728e-06, + "loss": 0.011, + "num_input_tokens_seen": 169282624, + "step": 139125 + }, + { + "epoch": 15.495043991535805, + "grad_norm": 2.268967390060425, + "learning_rate": 7.337971947135374e-06, + "loss": 0.1033, + "num_input_tokens_seen": 169288640, + "step": 139130 + }, + { + "epoch": 15.495600846419423, + "grad_norm": 0.0010985714616253972, + "learning_rate": 7.336252427640044e-06, + "loss": 0.0954, + "num_input_tokens_seen": 169294560, + "step": 139135 + }, + { + "epoch": 15.49615770130304, + "grad_norm": 1.4427874088287354, + "learning_rate": 7.334533074992947e-06, + "loss": 0.1316, + "num_input_tokens_seen": 169300832, + "step": 139140 + }, + { + "epoch": 15.496714556186658, + "grad_norm": 0.0008917726227082312, + "learning_rate": 7.3328138892103464e-06, + "loss": 0.0132, + "num_input_tokens_seen": 169307008, + "step": 139145 + }, + { + "epoch": 15.497271411070274, + "grad_norm": 1.172687292098999, + "learning_rate": 7.331094870308463e-06, + "loss": 0.0215, + "num_input_tokens_seen": 169312928, + "step": 139150 + }, + { + "epoch": 15.497828265953892, + "grad_norm": 0.3167516887187958, + "learning_rate": 7.329376018303555e-06, + "loss": 0.0991, + "num_input_tokens_seen": 169318752, + "step": 139155 + }, + { + "epoch": 15.49838512083751, + "grad_norm": 0.09014414995908737, + "learning_rate": 7.327657333211846e-06, + "loss": 0.0106, + "num_input_tokens_seen": 169324928, + "step": 139160 + }, + { + "epoch": 15.498941975721127, + "grad_norm": 0.8651258945465088, + "learning_rate": 7.325938815049574e-06, + "loss": 0.0726, + "num_input_tokens_seen": 169330912, + "step": 139165 + }, + { + "epoch": 15.499498830604745, + "grad_norm": 0.044397879391908646, + "learning_rate": 7.324220463832962e-06, + "loss": 0.1162, + "num_input_tokens_seen": 169336896, + "step": 139170 + }, + { + "epoch": 15.500055685488363, + "grad_norm": 0.1593146026134491, + "learning_rate": 7.322502279578256e-06, + "loss": 0.0706, + "num_input_tokens_seen": 169343104, + "step": 139175 + }, + { + "epoch": 15.500612540371979, + "grad_norm": 1.0358927249908447, + "learning_rate": 7.32078426230168e-06, + "loss": 0.0642, + "num_input_tokens_seen": 169349184, + "step": 139180 + }, + { + "epoch": 15.501169395255596, + "grad_norm": 0.00018891185754910111, + "learning_rate": 7.319066412019459e-06, + "loss": 0.0025, + "num_input_tokens_seen": 169355200, + "step": 139185 + }, + { + "epoch": 15.501726250139214, + "grad_norm": 0.3735603988170624, + "learning_rate": 7.3173487287478165e-06, + "loss": 0.0985, + "num_input_tokens_seen": 169361440, + "step": 139190 + }, + { + "epoch": 15.502283105022832, + "grad_norm": 0.02059088833630085, + "learning_rate": 7.315631212502988e-06, + "loss": 0.0226, + "num_input_tokens_seen": 169367584, + "step": 139195 + }, + { + "epoch": 15.50283995990645, + "grad_norm": 0.007231262978166342, + "learning_rate": 7.313913863301186e-06, + "loss": 0.0922, + "num_input_tokens_seen": 169373696, + "step": 139200 + }, + { + "epoch": 15.503396814790065, + "grad_norm": 0.39581674337387085, + "learning_rate": 7.312196681158643e-06, + "loss": 0.0996, + "num_input_tokens_seen": 169380096, + "step": 139205 + }, + { + "epoch": 15.503953669673683, + "grad_norm": 0.48376160860061646, + "learning_rate": 7.310479666091574e-06, + "loss": 0.0093, + "num_input_tokens_seen": 169386112, + "step": 139210 + }, + { + "epoch": 15.5045105245573, + "grad_norm": 0.00033546105260029435, + "learning_rate": 7.308762818116197e-06, + "loss": 0.0408, + "num_input_tokens_seen": 169392160, + "step": 139215 + }, + { + "epoch": 15.505067379440918, + "grad_norm": 0.5965038537979126, + "learning_rate": 7.3070461372487215e-06, + "loss": 0.0264, + "num_input_tokens_seen": 169398208, + "step": 139220 + }, + { + "epoch": 15.505624234324536, + "grad_norm": 0.19069017469882965, + "learning_rate": 7.305329623505378e-06, + "loss": 0.0308, + "num_input_tokens_seen": 169404640, + "step": 139225 + }, + { + "epoch": 15.506181089208152, + "grad_norm": 1.2455778121948242, + "learning_rate": 7.303613276902374e-06, + "loss": 0.0475, + "num_input_tokens_seen": 169410112, + "step": 139230 + }, + { + "epoch": 15.50673794409177, + "grad_norm": 0.05209173634648323, + "learning_rate": 7.30189709745592e-06, + "loss": 0.0536, + "num_input_tokens_seen": 169416224, + "step": 139235 + }, + { + "epoch": 15.507294798975387, + "grad_norm": 0.6781684756278992, + "learning_rate": 7.30018108518222e-06, + "loss": 0.0181, + "num_input_tokens_seen": 169422048, + "step": 139240 + }, + { + "epoch": 15.507851653859005, + "grad_norm": 0.1901385635137558, + "learning_rate": 7.298465240097496e-06, + "loss": 0.0052, + "num_input_tokens_seen": 169428480, + "step": 139245 + }, + { + "epoch": 15.508408508742622, + "grad_norm": 0.16736090183258057, + "learning_rate": 7.296749562217944e-06, + "loss": 0.0853, + "num_input_tokens_seen": 169434560, + "step": 139250 + }, + { + "epoch": 15.508965363626238, + "grad_norm": 0.6572720408439636, + "learning_rate": 7.2950340515597935e-06, + "loss": 0.106, + "num_input_tokens_seen": 169440384, + "step": 139255 + }, + { + "epoch": 15.509522218509856, + "grad_norm": 0.24368305504322052, + "learning_rate": 7.293318708139213e-06, + "loss": 0.0879, + "num_input_tokens_seen": 169446144, + "step": 139260 + }, + { + "epoch": 15.510079073393474, + "grad_norm": 1.3983571529388428, + "learning_rate": 7.291603531972433e-06, + "loss": 0.0494, + "num_input_tokens_seen": 169451776, + "step": 139265 + }, + { + "epoch": 15.510635928277091, + "grad_norm": 0.0006801903946325183, + "learning_rate": 7.289888523075639e-06, + "loss": 0.0418, + "num_input_tokens_seen": 169458112, + "step": 139270 + }, + { + "epoch": 15.511192783160709, + "grad_norm": 0.0005736922030337155, + "learning_rate": 7.288173681465041e-06, + "loss": 0.0128, + "num_input_tokens_seen": 169464320, + "step": 139275 + }, + { + "epoch": 15.511749638044325, + "grad_norm": 0.1350027173757553, + "learning_rate": 7.286459007156835e-06, + "loss": 0.0116, + "num_input_tokens_seen": 169470432, + "step": 139280 + }, + { + "epoch": 15.512306492927943, + "grad_norm": 0.060064759105443954, + "learning_rate": 7.284744500167218e-06, + "loss": 0.0391, + "num_input_tokens_seen": 169476320, + "step": 139285 + }, + { + "epoch": 15.51286334781156, + "grad_norm": 1.2464343309402466, + "learning_rate": 7.283030160512372e-06, + "loss": 0.0155, + "num_input_tokens_seen": 169482528, + "step": 139290 + }, + { + "epoch": 15.513420202695178, + "grad_norm": 0.015454393811523914, + "learning_rate": 7.2813159882085065e-06, + "loss": 0.0004, + "num_input_tokens_seen": 169488704, + "step": 139295 + }, + { + "epoch": 15.513977057578796, + "grad_norm": 1.9768613576889038, + "learning_rate": 7.279601983271811e-06, + "loss": 0.0692, + "num_input_tokens_seen": 169495104, + "step": 139300 + }, + { + "epoch": 15.514533912462412, + "grad_norm": 0.035009391605854034, + "learning_rate": 7.27788814571847e-06, + "loss": 0.0023, + "num_input_tokens_seen": 169501184, + "step": 139305 + }, + { + "epoch": 15.51509076734603, + "grad_norm": 0.004173649940639734, + "learning_rate": 7.2761744755646675e-06, + "loss": 0.0287, + "num_input_tokens_seen": 169507232, + "step": 139310 + }, + { + "epoch": 15.515647622229647, + "grad_norm": 0.46031638979911804, + "learning_rate": 7.274460972826605e-06, + "loss": 0.0206, + "num_input_tokens_seen": 169513248, + "step": 139315 + }, + { + "epoch": 15.516204477113265, + "grad_norm": 0.11921411007642746, + "learning_rate": 7.272747637520452e-06, + "loss": 0.1231, + "num_input_tokens_seen": 169519264, + "step": 139320 + }, + { + "epoch": 15.516761331996882, + "grad_norm": 0.07581522315740585, + "learning_rate": 7.271034469662416e-06, + "loss": 0.0728, + "num_input_tokens_seen": 169525280, + "step": 139325 + }, + { + "epoch": 15.517318186880498, + "grad_norm": 0.055281415581703186, + "learning_rate": 7.269321469268647e-06, + "loss": 0.0391, + "num_input_tokens_seen": 169531168, + "step": 139330 + }, + { + "epoch": 15.517875041764116, + "grad_norm": 0.5824013352394104, + "learning_rate": 7.2676086363553524e-06, + "loss": 0.0277, + "num_input_tokens_seen": 169537376, + "step": 139335 + }, + { + "epoch": 15.518431896647733, + "grad_norm": 0.5427559614181519, + "learning_rate": 7.265895970938691e-06, + "loss": 0.0127, + "num_input_tokens_seen": 169543744, + "step": 139340 + }, + { + "epoch": 15.518988751531351, + "grad_norm": 2.3015685081481934, + "learning_rate": 7.264183473034858e-06, + "loss": 0.075, + "num_input_tokens_seen": 169550368, + "step": 139345 + }, + { + "epoch": 15.519545606414969, + "grad_norm": 0.4092155694961548, + "learning_rate": 7.262471142660024e-06, + "loss": 0.0151, + "num_input_tokens_seen": 169556320, + "step": 139350 + }, + { + "epoch": 15.520102461298585, + "grad_norm": 0.02780572883784771, + "learning_rate": 7.2607589798303595e-06, + "loss": 0.0041, + "num_input_tokens_seen": 169562656, + "step": 139355 + }, + { + "epoch": 15.520659316182202, + "grad_norm": 0.5208072066307068, + "learning_rate": 7.259046984562031e-06, + "loss": 0.0097, + "num_input_tokens_seen": 169568640, + "step": 139360 + }, + { + "epoch": 15.52121617106582, + "grad_norm": 0.01123287994414568, + "learning_rate": 7.2573351568712284e-06, + "loss": 0.0429, + "num_input_tokens_seen": 169575168, + "step": 139365 + }, + { + "epoch": 15.521773025949438, + "grad_norm": 0.26666465401649475, + "learning_rate": 7.255623496774109e-06, + "loss": 0.008, + "num_input_tokens_seen": 169581312, + "step": 139370 + }, + { + "epoch": 15.522329880833055, + "grad_norm": 0.23966887593269348, + "learning_rate": 7.253912004286839e-06, + "loss": 0.006, + "num_input_tokens_seen": 169587680, + "step": 139375 + }, + { + "epoch": 15.522886735716671, + "grad_norm": 0.17620518803596497, + "learning_rate": 7.2522006794255835e-06, + "loss": 0.0102, + "num_input_tokens_seen": 169593760, + "step": 139380 + }, + { + "epoch": 15.523443590600289, + "grad_norm": 0.16981732845306396, + "learning_rate": 7.250489522206519e-06, + "loss": 0.1485, + "num_input_tokens_seen": 169599392, + "step": 139385 + }, + { + "epoch": 15.524000445483907, + "grad_norm": 0.7435641884803772, + "learning_rate": 7.2487785326457944e-06, + "loss": 0.0431, + "num_input_tokens_seen": 169605472, + "step": 139390 + }, + { + "epoch": 15.524557300367524, + "grad_norm": 9.639201016398147e-05, + "learning_rate": 7.2470677107595865e-06, + "loss": 0.0328, + "num_input_tokens_seen": 169611808, + "step": 139395 + }, + { + "epoch": 15.525114155251142, + "grad_norm": 0.7584420442581177, + "learning_rate": 7.245357056564045e-06, + "loss": 0.0456, + "num_input_tokens_seen": 169617536, + "step": 139400 + }, + { + "epoch": 15.52567101013476, + "grad_norm": 0.057641226798295975, + "learning_rate": 7.243646570075332e-06, + "loss": 0.0013, + "num_input_tokens_seen": 169623584, + "step": 139405 + }, + { + "epoch": 15.526227865018376, + "grad_norm": 0.023421192541718483, + "learning_rate": 7.241936251309598e-06, + "loss": 0.0054, + "num_input_tokens_seen": 169629952, + "step": 139410 + }, + { + "epoch": 15.526784719901993, + "grad_norm": 0.28886887431144714, + "learning_rate": 7.24022610028301e-06, + "loss": 0.0146, + "num_input_tokens_seen": 169635584, + "step": 139415 + }, + { + "epoch": 15.527341574785611, + "grad_norm": 0.002105352468788624, + "learning_rate": 7.238516117011712e-06, + "loss": 0.0566, + "num_input_tokens_seen": 169641568, + "step": 139420 + }, + { + "epoch": 15.527898429669229, + "grad_norm": 1.073081374168396, + "learning_rate": 7.236806301511864e-06, + "loss": 0.045, + "num_input_tokens_seen": 169647488, + "step": 139425 + }, + { + "epoch": 15.528455284552846, + "grad_norm": 0.0005406801356002688, + "learning_rate": 7.2350966537996025e-06, + "loss": 0.0016, + "num_input_tokens_seen": 169653408, + "step": 139430 + }, + { + "epoch": 15.529012139436462, + "grad_norm": 0.8408355712890625, + "learning_rate": 7.233387173891093e-06, + "loss": 0.15, + "num_input_tokens_seen": 169659264, + "step": 139435 + }, + { + "epoch": 15.52956899432008, + "grad_norm": 1.3660463094711304, + "learning_rate": 7.231677861802472e-06, + "loss": 0.0434, + "num_input_tokens_seen": 169665248, + "step": 139440 + }, + { + "epoch": 15.530125849203698, + "grad_norm": 0.0008494920330122113, + "learning_rate": 7.229968717549901e-06, + "loss": 0.0033, + "num_input_tokens_seen": 169671616, + "step": 139445 + }, + { + "epoch": 15.530682704087315, + "grad_norm": 0.021926455199718475, + "learning_rate": 7.228259741149498e-06, + "loss": 0.1201, + "num_input_tokens_seen": 169677792, + "step": 139450 + }, + { + "epoch": 15.531239558970933, + "grad_norm": 0.008741946890950203, + "learning_rate": 7.226550932617429e-06, + "loss": 0.0242, + "num_input_tokens_seen": 169683744, + "step": 139455 + }, + { + "epoch": 15.531796413854549, + "grad_norm": 0.7291666865348816, + "learning_rate": 7.224842291969816e-06, + "loss": 0.0653, + "num_input_tokens_seen": 169689824, + "step": 139460 + }, + { + "epoch": 15.532353268738166, + "grad_norm": 0.009510873816907406, + "learning_rate": 7.223133819222819e-06, + "loss": 0.1502, + "num_input_tokens_seen": 169695872, + "step": 139465 + }, + { + "epoch": 15.532910123621784, + "grad_norm": 0.16214196383953094, + "learning_rate": 7.221425514392574e-06, + "loss": 0.0653, + "num_input_tokens_seen": 169702112, + "step": 139470 + }, + { + "epoch": 15.533466978505402, + "grad_norm": 0.0036707899998873472, + "learning_rate": 7.219717377495192e-06, + "loss": 0.0124, + "num_input_tokens_seen": 169708384, + "step": 139475 + }, + { + "epoch": 15.53402383338902, + "grad_norm": 1.9135528802871704, + "learning_rate": 7.218009408546833e-06, + "loss": 0.0618, + "num_input_tokens_seen": 169714336, + "step": 139480 + }, + { + "epoch": 15.534580688272635, + "grad_norm": 0.09579778462648392, + "learning_rate": 7.2163016075636135e-06, + "loss": 0.0146, + "num_input_tokens_seen": 169720512, + "step": 139485 + }, + { + "epoch": 15.535137543156253, + "grad_norm": 0.9585409760475159, + "learning_rate": 7.214593974561682e-06, + "loss": 0.0315, + "num_input_tokens_seen": 169726656, + "step": 139490 + }, + { + "epoch": 15.53569439803987, + "grad_norm": 0.00045432461774908006, + "learning_rate": 7.212886509557157e-06, + "loss": 0.0101, + "num_input_tokens_seen": 169732864, + "step": 139495 + }, + { + "epoch": 15.536251252923488, + "grad_norm": 0.22087809443473816, + "learning_rate": 7.211179212566174e-06, + "loss": 0.1041, + "num_input_tokens_seen": 169739072, + "step": 139500 + }, + { + "epoch": 15.536808107807106, + "grad_norm": 0.01788010634481907, + "learning_rate": 7.209472083604846e-06, + "loss": 0.0006, + "num_input_tokens_seen": 169745312, + "step": 139505 + }, + { + "epoch": 15.537364962690722, + "grad_norm": 0.022176794707775116, + "learning_rate": 7.207765122689314e-06, + "loss": 0.0788, + "num_input_tokens_seen": 169751360, + "step": 139510 + }, + { + "epoch": 15.53792181757434, + "grad_norm": 0.0026176830288022757, + "learning_rate": 7.2060583298356965e-06, + "loss": 0.0687, + "num_input_tokens_seen": 169757696, + "step": 139515 + }, + { + "epoch": 15.538478672457957, + "grad_norm": 2.0564112663269043, + "learning_rate": 7.2043517050601135e-06, + "loss": 0.1858, + "num_input_tokens_seen": 169763040, + "step": 139520 + }, + { + "epoch": 15.539035527341575, + "grad_norm": 0.04280300438404083, + "learning_rate": 7.2026452483786814e-06, + "loss": 0.0617, + "num_input_tokens_seen": 169768736, + "step": 139525 + }, + { + "epoch": 15.539592382225193, + "grad_norm": 0.0002505223383195698, + "learning_rate": 7.200938959807529e-06, + "loss": 0.0044, + "num_input_tokens_seen": 169774848, + "step": 139530 + }, + { + "epoch": 15.54014923710881, + "grad_norm": 0.00013325050531420857, + "learning_rate": 7.19923283936276e-06, + "loss": 0.0798, + "num_input_tokens_seen": 169780960, + "step": 139535 + }, + { + "epoch": 15.540706091992426, + "grad_norm": 0.6598721742630005, + "learning_rate": 7.197526887060515e-06, + "loss": 0.0258, + "num_input_tokens_seen": 169787040, + "step": 139540 + }, + { + "epoch": 15.541262946876044, + "grad_norm": 0.001759556820616126, + "learning_rate": 7.195821102916878e-06, + "loss": 0.0285, + "num_input_tokens_seen": 169793056, + "step": 139545 + }, + { + "epoch": 15.541819801759662, + "grad_norm": 0.48202791810035706, + "learning_rate": 7.1941154869479806e-06, + "loss": 0.0134, + "num_input_tokens_seen": 169798976, + "step": 139550 + }, + { + "epoch": 15.54237665664328, + "grad_norm": 0.8458806872367859, + "learning_rate": 7.192410039169922e-06, + "loss": 0.0672, + "num_input_tokens_seen": 169805472, + "step": 139555 + }, + { + "epoch": 15.542933511526897, + "grad_norm": 0.43318745493888855, + "learning_rate": 7.190704759598824e-06, + "loss": 0.0291, + "num_input_tokens_seen": 169811776, + "step": 139560 + }, + { + "epoch": 15.543490366410513, + "grad_norm": 0.0027908862102776766, + "learning_rate": 7.188999648250791e-06, + "loss": 0.0025, + "num_input_tokens_seen": 169818240, + "step": 139565 + }, + { + "epoch": 15.54404722129413, + "grad_norm": 0.08420975506305695, + "learning_rate": 7.1872947051419224e-06, + "loss": 0.0515, + "num_input_tokens_seen": 169824576, + "step": 139570 + }, + { + "epoch": 15.544604076177748, + "grad_norm": 0.00023651868104934692, + "learning_rate": 7.185589930288322e-06, + "loss": 0.0652, + "num_input_tokens_seen": 169830560, + "step": 139575 + }, + { + "epoch": 15.545160931061366, + "grad_norm": 0.15652166306972504, + "learning_rate": 7.183885323706102e-06, + "loss": 0.027, + "num_input_tokens_seen": 169836768, + "step": 139580 + }, + { + "epoch": 15.545717785944984, + "grad_norm": 0.8982486724853516, + "learning_rate": 7.182180885411363e-06, + "loss": 0.0282, + "num_input_tokens_seen": 169842688, + "step": 139585 + }, + { + "epoch": 15.5462746408286, + "grad_norm": 0.01898106001317501, + "learning_rate": 7.180476615420198e-06, + "loss": 0.0076, + "num_input_tokens_seen": 169848800, + "step": 139590 + }, + { + "epoch": 15.546831495712217, + "grad_norm": 0.0002548986522015184, + "learning_rate": 7.178772513748702e-06, + "loss": 0.0863, + "num_input_tokens_seen": 169854912, + "step": 139595 + }, + { + "epoch": 15.547388350595835, + "grad_norm": 0.07517490535974503, + "learning_rate": 7.177068580412985e-06, + "loss": 0.0145, + "num_input_tokens_seen": 169861120, + "step": 139600 + }, + { + "epoch": 15.547945205479452, + "grad_norm": 0.004008837975561619, + "learning_rate": 7.175364815429125e-06, + "loss": 0.0907, + "num_input_tokens_seen": 169867264, + "step": 139605 + }, + { + "epoch": 15.54850206036307, + "grad_norm": 0.22183358669281006, + "learning_rate": 7.173661218813235e-06, + "loss": 0.0456, + "num_input_tokens_seen": 169873600, + "step": 139610 + }, + { + "epoch": 15.549058915246686, + "grad_norm": 7.686614844715223e-05, + "learning_rate": 7.171957790581399e-06, + "loss": 0.052, + "num_input_tokens_seen": 169879584, + "step": 139615 + }, + { + "epoch": 15.549615770130304, + "grad_norm": 1.4045796394348145, + "learning_rate": 7.170254530749701e-06, + "loss": 0.1007, + "num_input_tokens_seen": 169885504, + "step": 139620 + }, + { + "epoch": 15.550172625013921, + "grad_norm": 0.0657266229391098, + "learning_rate": 7.168551439334228e-06, + "loss": 0.0351, + "num_input_tokens_seen": 169891648, + "step": 139625 + }, + { + "epoch": 15.550729479897539, + "grad_norm": 0.06587528437376022, + "learning_rate": 7.166848516351082e-06, + "loss": 0.0217, + "num_input_tokens_seen": 169897632, + "step": 139630 + }, + { + "epoch": 15.551286334781157, + "grad_norm": 0.005260325502604246, + "learning_rate": 7.165145761816339e-06, + "loss": 0.1473, + "num_input_tokens_seen": 169903680, + "step": 139635 + }, + { + "epoch": 15.551843189664773, + "grad_norm": 0.6221578121185303, + "learning_rate": 7.163443175746082e-06, + "loss": 0.0161, + "num_input_tokens_seen": 169909952, + "step": 139640 + }, + { + "epoch": 15.55240004454839, + "grad_norm": 1.7081935405731201, + "learning_rate": 7.161740758156388e-06, + "loss": 0.032, + "num_input_tokens_seen": 169916032, + "step": 139645 + }, + { + "epoch": 15.552956899432008, + "grad_norm": 0.02089695632457733, + "learning_rate": 7.160038509063352e-06, + "loss": 0.0649, + "num_input_tokens_seen": 169921952, + "step": 139650 + }, + { + "epoch": 15.553513754315626, + "grad_norm": 0.10467678308486938, + "learning_rate": 7.158336428483037e-06, + "loss": 0.0364, + "num_input_tokens_seen": 169927776, + "step": 139655 + }, + { + "epoch": 15.554070609199243, + "grad_norm": 0.012818840332329273, + "learning_rate": 7.156634516431543e-06, + "loss": 0.0101, + "num_input_tokens_seen": 169933952, + "step": 139660 + }, + { + "epoch": 15.55462746408286, + "grad_norm": 0.00017030345043167472, + "learning_rate": 7.154932772924919e-06, + "loss": 0.0342, + "num_input_tokens_seen": 169939744, + "step": 139665 + }, + { + "epoch": 15.555184318966477, + "grad_norm": 0.006094418466091156, + "learning_rate": 7.15323119797926e-06, + "loss": 0.0449, + "num_input_tokens_seen": 169946016, + "step": 139670 + }, + { + "epoch": 15.555741173850095, + "grad_norm": 0.7784807085990906, + "learning_rate": 7.151529791610623e-06, + "loss": 0.0141, + "num_input_tokens_seen": 169952128, + "step": 139675 + }, + { + "epoch": 15.556298028733712, + "grad_norm": 1.2713292837142944, + "learning_rate": 7.149828553835092e-06, + "loss": 0.089, + "num_input_tokens_seen": 169957952, + "step": 139680 + }, + { + "epoch": 15.55685488361733, + "grad_norm": 0.15876378118991852, + "learning_rate": 7.148127484668735e-06, + "loss": 0.0979, + "num_input_tokens_seen": 169963872, + "step": 139685 + }, + { + "epoch": 15.557411738500946, + "grad_norm": 1.1636961698532104, + "learning_rate": 7.146426584127614e-06, + "loss": 0.0381, + "num_input_tokens_seen": 169969440, + "step": 139690 + }, + { + "epoch": 15.557968593384564, + "grad_norm": 0.0004790890961885452, + "learning_rate": 7.144725852227793e-06, + "loss": 0.0034, + "num_input_tokens_seen": 169975872, + "step": 139695 + }, + { + "epoch": 15.558525448268181, + "grad_norm": 0.2832781970500946, + "learning_rate": 7.143025288985347e-06, + "loss": 0.1398, + "num_input_tokens_seen": 169981600, + "step": 139700 + }, + { + "epoch": 15.559082303151799, + "grad_norm": 0.040993042290210724, + "learning_rate": 7.141324894416335e-06, + "loss": 0.0145, + "num_input_tokens_seen": 169987776, + "step": 139705 + }, + { + "epoch": 15.559639158035417, + "grad_norm": 0.6345181465148926, + "learning_rate": 7.139624668536818e-06, + "loss": 0.1348, + "num_input_tokens_seen": 169993664, + "step": 139710 + }, + { + "epoch": 15.560196012919032, + "grad_norm": 0.00012983572378288954, + "learning_rate": 7.137924611362845e-06, + "loss": 0.0079, + "num_input_tokens_seen": 170000256, + "step": 139715 + }, + { + "epoch": 15.56075286780265, + "grad_norm": 0.0016867269296199083, + "learning_rate": 7.136224722910495e-06, + "loss": 0.0048, + "num_input_tokens_seen": 170006784, + "step": 139720 + }, + { + "epoch": 15.561309722686268, + "grad_norm": 0.4501022696495056, + "learning_rate": 7.134525003195808e-06, + "loss": 0.0843, + "num_input_tokens_seen": 170012960, + "step": 139725 + }, + { + "epoch": 15.561866577569885, + "grad_norm": 0.0002666290383785963, + "learning_rate": 7.1328254522348595e-06, + "loss": 0.0212, + "num_input_tokens_seen": 170019392, + "step": 139730 + }, + { + "epoch": 15.562423432453503, + "grad_norm": 0.9159664511680603, + "learning_rate": 7.131126070043676e-06, + "loss": 0.0587, + "num_input_tokens_seen": 170024928, + "step": 139735 + }, + { + "epoch": 15.562980287337119, + "grad_norm": 0.001174572273157537, + "learning_rate": 7.12942685663833e-06, + "loss": 0.0079, + "num_input_tokens_seen": 170031168, + "step": 139740 + }, + { + "epoch": 15.563537142220737, + "grad_norm": 1.6955091953277588, + "learning_rate": 7.127727812034854e-06, + "loss": 0.2334, + "num_input_tokens_seen": 170037408, + "step": 139745 + }, + { + "epoch": 15.564093997104354, + "grad_norm": 0.0015160772018134594, + "learning_rate": 7.126028936249321e-06, + "loss": 0.0215, + "num_input_tokens_seen": 170043392, + "step": 139750 + }, + { + "epoch": 15.564650851987972, + "grad_norm": 1.5999823808670044, + "learning_rate": 7.12433022929776e-06, + "loss": 0.0968, + "num_input_tokens_seen": 170049248, + "step": 139755 + }, + { + "epoch": 15.56520770687159, + "grad_norm": 0.05804643779993057, + "learning_rate": 7.122631691196225e-06, + "loss": 0.0714, + "num_input_tokens_seen": 170055072, + "step": 139760 + }, + { + "epoch": 15.565764561755207, + "grad_norm": 0.007184435613453388, + "learning_rate": 7.120933321960749e-06, + "loss": 0.009, + "num_input_tokens_seen": 170061056, + "step": 139765 + }, + { + "epoch": 15.566321416638823, + "grad_norm": 1.7491142749786377, + "learning_rate": 7.11923512160739e-06, + "loss": 0.0222, + "num_input_tokens_seen": 170067168, + "step": 139770 + }, + { + "epoch": 15.566878271522441, + "grad_norm": 0.16783106327056885, + "learning_rate": 7.117537090152179e-06, + "loss": 0.0089, + "num_input_tokens_seen": 170073088, + "step": 139775 + }, + { + "epoch": 15.567435126406059, + "grad_norm": 0.4853242039680481, + "learning_rate": 7.1158392276111595e-06, + "loss": 0.0335, + "num_input_tokens_seen": 170079360, + "step": 139780 + }, + { + "epoch": 15.567991981289676, + "grad_norm": 0.13550148904323578, + "learning_rate": 7.114141534000357e-06, + "loss": 0.0109, + "num_input_tokens_seen": 170085440, + "step": 139785 + }, + { + "epoch": 15.568548836173294, + "grad_norm": 0.20794710516929626, + "learning_rate": 7.112444009335828e-06, + "loss": 0.0028, + "num_input_tokens_seen": 170091648, + "step": 139790 + }, + { + "epoch": 15.56910569105691, + "grad_norm": 1.5193846225738525, + "learning_rate": 7.1107466536335875e-06, + "loss": 0.0893, + "num_input_tokens_seen": 170097440, + "step": 139795 + }, + { + "epoch": 15.569662545940528, + "grad_norm": 0.0529421791434288, + "learning_rate": 7.109049466909684e-06, + "loss": 0.0031, + "num_input_tokens_seen": 170103392, + "step": 139800 + }, + { + "epoch": 15.570219400824145, + "grad_norm": 0.0006062547909095883, + "learning_rate": 7.107352449180143e-06, + "loss": 0.0288, + "num_input_tokens_seen": 170109312, + "step": 139805 + }, + { + "epoch": 15.570776255707763, + "grad_norm": 0.09304086863994598, + "learning_rate": 7.105655600460995e-06, + "loss": 0.0836, + "num_input_tokens_seen": 170115360, + "step": 139810 + }, + { + "epoch": 15.57133311059138, + "grad_norm": 0.02478368766605854, + "learning_rate": 7.103958920768258e-06, + "loss": 0.027, + "num_input_tokens_seen": 170121408, + "step": 139815 + }, + { + "epoch": 15.571889965474996, + "grad_norm": 0.08924134075641632, + "learning_rate": 7.102262410117977e-06, + "loss": 0.007, + "num_input_tokens_seen": 170127456, + "step": 139820 + }, + { + "epoch": 15.572446820358614, + "grad_norm": 0.6854883432388306, + "learning_rate": 7.100566068526163e-06, + "loss": 0.1086, + "num_input_tokens_seen": 170133952, + "step": 139825 + }, + { + "epoch": 15.573003675242232, + "grad_norm": 0.23774372041225433, + "learning_rate": 7.098869896008845e-06, + "loss": 0.0114, + "num_input_tokens_seen": 170140160, + "step": 139830 + }, + { + "epoch": 15.57356053012585, + "grad_norm": 0.7250049114227295, + "learning_rate": 7.097173892582035e-06, + "loss": 0.0752, + "num_input_tokens_seen": 170146624, + "step": 139835 + }, + { + "epoch": 15.574117385009467, + "grad_norm": 1.1295620203018188, + "learning_rate": 7.09547805826177e-06, + "loss": 0.085, + "num_input_tokens_seen": 170150976, + "step": 139840 + }, + { + "epoch": 15.574674239893083, + "grad_norm": 0.013837854377925396, + "learning_rate": 7.09378239306405e-06, + "loss": 0.0171, + "num_input_tokens_seen": 170157408, + "step": 139845 + }, + { + "epoch": 15.5752310947767, + "grad_norm": 0.25936636328697205, + "learning_rate": 7.092086897004918e-06, + "loss": 0.0663, + "num_input_tokens_seen": 170163552, + "step": 139850 + }, + { + "epoch": 15.575787949660318, + "grad_norm": 0.7672013640403748, + "learning_rate": 7.090391570100358e-06, + "loss": 0.0175, + "num_input_tokens_seen": 170169696, + "step": 139855 + }, + { + "epoch": 15.576344804543936, + "grad_norm": 0.019553322345018387, + "learning_rate": 7.088696412366405e-06, + "loss": 0.033, + "num_input_tokens_seen": 170175936, + "step": 139860 + }, + { + "epoch": 15.576901659427554, + "grad_norm": 2.041492462158203, + "learning_rate": 7.087001423819059e-06, + "loss": 0.0671, + "num_input_tokens_seen": 170181888, + "step": 139865 + }, + { + "epoch": 15.57745851431117, + "grad_norm": 0.8041584491729736, + "learning_rate": 7.085306604474343e-06, + "loss": 0.1354, + "num_input_tokens_seen": 170188160, + "step": 139870 + }, + { + "epoch": 15.578015369194787, + "grad_norm": 0.019199179485440254, + "learning_rate": 7.083611954348265e-06, + "loss": 0.0241, + "num_input_tokens_seen": 170194624, + "step": 139875 + }, + { + "epoch": 15.578572224078405, + "grad_norm": 0.07213590294122696, + "learning_rate": 7.081917473456812e-06, + "loss": 0.0742, + "num_input_tokens_seen": 170200608, + "step": 139880 + }, + { + "epoch": 15.579129078962023, + "grad_norm": 1.3519439697265625, + "learning_rate": 7.080223161816013e-06, + "loss": 0.0646, + "num_input_tokens_seen": 170206336, + "step": 139885 + }, + { + "epoch": 15.57968593384564, + "grad_norm": 0.0025430337991565466, + "learning_rate": 7.078529019441854e-06, + "loss": 0.0205, + "num_input_tokens_seen": 170212512, + "step": 139890 + }, + { + "epoch": 15.580242788729258, + "grad_norm": 0.05599750950932503, + "learning_rate": 7.076835046350355e-06, + "loss": 0.0068, + "num_input_tokens_seen": 170218624, + "step": 139895 + }, + { + "epoch": 15.580799643612874, + "grad_norm": 0.005023699253797531, + "learning_rate": 7.075141242557512e-06, + "loss": 0.0099, + "num_input_tokens_seen": 170224864, + "step": 139900 + }, + { + "epoch": 15.581356498496492, + "grad_norm": 0.25094273686408997, + "learning_rate": 7.073447608079317e-06, + "loss": 0.1164, + "num_input_tokens_seen": 170230464, + "step": 139905 + }, + { + "epoch": 15.58191335338011, + "grad_norm": 0.0022017008159309626, + "learning_rate": 7.071754142931766e-06, + "loss": 0.0028, + "num_input_tokens_seen": 170236928, + "step": 139910 + }, + { + "epoch": 15.582470208263727, + "grad_norm": 0.8076741099357605, + "learning_rate": 7.0700608471308685e-06, + "loss": 0.0595, + "num_input_tokens_seen": 170243136, + "step": 139915 + }, + { + "epoch": 15.583027063147345, + "grad_norm": 0.3636394143104553, + "learning_rate": 7.068367720692612e-06, + "loss": 0.0314, + "num_input_tokens_seen": 170249248, + "step": 139920 + }, + { + "epoch": 15.58358391803096, + "grad_norm": 0.00019726854225154966, + "learning_rate": 7.066674763632986e-06, + "loss": 0.031, + "num_input_tokens_seen": 170255616, + "step": 139925 + }, + { + "epoch": 15.584140772914578, + "grad_norm": 0.0692179724574089, + "learning_rate": 7.0649819759679796e-06, + "loss": 0.0793, + "num_input_tokens_seen": 170261920, + "step": 139930 + }, + { + "epoch": 15.584697627798196, + "grad_norm": 0.0015151440165936947, + "learning_rate": 7.063289357713596e-06, + "loss": 0.0673, + "num_input_tokens_seen": 170268288, + "step": 139935 + }, + { + "epoch": 15.585254482681814, + "grad_norm": 0.2728523910045624, + "learning_rate": 7.061596908885806e-06, + "loss": 0.0573, + "num_input_tokens_seen": 170274432, + "step": 139940 + }, + { + "epoch": 15.585811337565431, + "grad_norm": 0.00024677766487002373, + "learning_rate": 7.059904629500613e-06, + "loss": 0.0122, + "num_input_tokens_seen": 170280672, + "step": 139945 + }, + { + "epoch": 15.586368192449047, + "grad_norm": 0.023409433662891388, + "learning_rate": 7.058212519573995e-06, + "loss": 0.0248, + "num_input_tokens_seen": 170286720, + "step": 139950 + }, + { + "epoch": 15.586925047332665, + "grad_norm": 9.150501864496619e-05, + "learning_rate": 7.056520579121933e-06, + "loss": 0.042, + "num_input_tokens_seen": 170292896, + "step": 139955 + }, + { + "epoch": 15.587481902216282, + "grad_norm": 0.000622984895016998, + "learning_rate": 7.054828808160404e-06, + "loss": 0.0798, + "num_input_tokens_seen": 170299072, + "step": 139960 + }, + { + "epoch": 15.5880387570999, + "grad_norm": 0.0022048226092010736, + "learning_rate": 7.053137206705401e-06, + "loss": 0.0039, + "num_input_tokens_seen": 170304864, + "step": 139965 + }, + { + "epoch": 15.588595611983518, + "grad_norm": 0.4295247793197632, + "learning_rate": 7.0514457747728966e-06, + "loss": 0.0617, + "num_input_tokens_seen": 170311200, + "step": 139970 + }, + { + "epoch": 15.589152466867134, + "grad_norm": 0.0003946383949369192, + "learning_rate": 7.049754512378867e-06, + "loss": 0.0867, + "num_input_tokens_seen": 170317440, + "step": 139975 + }, + { + "epoch": 15.589709321750751, + "grad_norm": 0.031124858185648918, + "learning_rate": 7.04806341953928e-06, + "loss": 0.0096, + "num_input_tokens_seen": 170323232, + "step": 139980 + }, + { + "epoch": 15.590266176634369, + "grad_norm": 3.2022817134857178, + "learning_rate": 7.046372496270126e-06, + "loss": 0.104, + "num_input_tokens_seen": 170329472, + "step": 139985 + }, + { + "epoch": 15.590823031517987, + "grad_norm": 0.09312406182289124, + "learning_rate": 7.044681742587364e-06, + "loss": 0.0069, + "num_input_tokens_seen": 170335712, + "step": 139990 + }, + { + "epoch": 15.591379886401604, + "grad_norm": 1.4019732475280762, + "learning_rate": 7.0429911585069726e-06, + "loss": 0.0889, + "num_input_tokens_seen": 170341952, + "step": 139995 + }, + { + "epoch": 15.59193674128522, + "grad_norm": 8.027733565540984e-05, + "learning_rate": 7.041300744044907e-06, + "loss": 0.1946, + "num_input_tokens_seen": 170348224, + "step": 140000 + }, + { + "epoch": 15.592493596168838, + "grad_norm": 0.0001921473303809762, + "learning_rate": 7.03961049921715e-06, + "loss": 0.0136, + "num_input_tokens_seen": 170354176, + "step": 140005 + }, + { + "epoch": 15.593050451052456, + "grad_norm": 0.019728951156139374, + "learning_rate": 7.037920424039657e-06, + "loss": 0.0034, + "num_input_tokens_seen": 170359968, + "step": 140010 + }, + { + "epoch": 15.593607305936073, + "grad_norm": 1.592003345489502, + "learning_rate": 7.0362305185284025e-06, + "loss": 0.0759, + "num_input_tokens_seen": 170366208, + "step": 140015 + }, + { + "epoch": 15.594164160819691, + "grad_norm": 0.049319397658109665, + "learning_rate": 7.034540782699345e-06, + "loss": 0.0379, + "num_input_tokens_seen": 170372192, + "step": 140020 + }, + { + "epoch": 15.594721015703307, + "grad_norm": 0.1094449982047081, + "learning_rate": 7.03285121656844e-06, + "loss": 0.0773, + "num_input_tokens_seen": 170378368, + "step": 140025 + }, + { + "epoch": 15.595277870586925, + "grad_norm": 0.13221701979637146, + "learning_rate": 7.031161820151644e-06, + "loss": 0.0161, + "num_input_tokens_seen": 170384672, + "step": 140030 + }, + { + "epoch": 15.595834725470542, + "grad_norm": 0.661656379699707, + "learning_rate": 7.029472593464931e-06, + "loss": 0.035, + "num_input_tokens_seen": 170390432, + "step": 140035 + }, + { + "epoch": 15.59639158035416, + "grad_norm": 0.1219334602355957, + "learning_rate": 7.027783536524243e-06, + "loss": 0.0027, + "num_input_tokens_seen": 170396992, + "step": 140040 + }, + { + "epoch": 15.596948435237778, + "grad_norm": 1.88503897190094, + "learning_rate": 7.026094649345544e-06, + "loss": 0.0253, + "num_input_tokens_seen": 170402976, + "step": 140045 + }, + { + "epoch": 15.597505290121394, + "grad_norm": 0.011233208701014519, + "learning_rate": 7.024405931944769e-06, + "loss": 0.0794, + "num_input_tokens_seen": 170409248, + "step": 140050 + }, + { + "epoch": 15.598062145005011, + "grad_norm": 0.4681289792060852, + "learning_rate": 7.0227173843378925e-06, + "loss": 0.0119, + "num_input_tokens_seen": 170415296, + "step": 140055 + }, + { + "epoch": 15.598618999888629, + "grad_norm": 1.105908989906311, + "learning_rate": 7.021029006540844e-06, + "loss": 0.0714, + "num_input_tokens_seen": 170421216, + "step": 140060 + }, + { + "epoch": 15.599175854772247, + "grad_norm": 0.5413749814033508, + "learning_rate": 7.019340798569596e-06, + "loss": 0.0085, + "num_input_tokens_seen": 170427488, + "step": 140065 + }, + { + "epoch": 15.599732709655864, + "grad_norm": 0.021051930263638496, + "learning_rate": 7.017652760440066e-06, + "loss": 0.0071, + "num_input_tokens_seen": 170433920, + "step": 140070 + }, + { + "epoch": 15.60028956453948, + "grad_norm": 0.020692648366093636, + "learning_rate": 7.0159648921682194e-06, + "loss": 0.0086, + "num_input_tokens_seen": 170440032, + "step": 140075 + }, + { + "epoch": 15.600846419423098, + "grad_norm": 0.00473701860755682, + "learning_rate": 7.014277193769986e-06, + "loss": 0.0082, + "num_input_tokens_seen": 170446272, + "step": 140080 + }, + { + "epoch": 15.601403274306715, + "grad_norm": 1.5401958227157593, + "learning_rate": 7.012589665261324e-06, + "loss": 0.0722, + "num_input_tokens_seen": 170452096, + "step": 140085 + }, + { + "epoch": 15.601960129190333, + "grad_norm": 0.09531350433826447, + "learning_rate": 7.010902306658162e-06, + "loss": 0.0014, + "num_input_tokens_seen": 170458336, + "step": 140090 + }, + { + "epoch": 15.60251698407395, + "grad_norm": 0.3701446056365967, + "learning_rate": 7.0092151179764395e-06, + "loss": 0.0337, + "num_input_tokens_seen": 170464800, + "step": 140095 + }, + { + "epoch": 15.603073838957567, + "grad_norm": 1.359033226966858, + "learning_rate": 7.0075280992320885e-06, + "loss": 0.0711, + "num_input_tokens_seen": 170470880, + "step": 140100 + }, + { + "epoch": 15.603630693841184, + "grad_norm": 0.018922388553619385, + "learning_rate": 7.005841250441056e-06, + "loss": 0.0347, + "num_input_tokens_seen": 170476608, + "step": 140105 + }, + { + "epoch": 15.604187548724802, + "grad_norm": 0.12144113332033157, + "learning_rate": 7.00415457161927e-06, + "loss": 0.0068, + "num_input_tokens_seen": 170482496, + "step": 140110 + }, + { + "epoch": 15.60474440360842, + "grad_norm": 0.04441020265221596, + "learning_rate": 7.002468062782661e-06, + "loss": 0.1297, + "num_input_tokens_seen": 170487776, + "step": 140115 + }, + { + "epoch": 15.605301258492037, + "grad_norm": 0.05106435716152191, + "learning_rate": 7.0007817239471555e-06, + "loss": 0.0018, + "num_input_tokens_seen": 170493760, + "step": 140120 + }, + { + "epoch": 15.605858113375655, + "grad_norm": 2.456719160079956, + "learning_rate": 6.999095555128693e-06, + "loss": 0.0297, + "num_input_tokens_seen": 170500000, + "step": 140125 + }, + { + "epoch": 15.606414968259271, + "grad_norm": 0.01662478968501091, + "learning_rate": 6.997409556343188e-06, + "loss": 0.0277, + "num_input_tokens_seen": 170506624, + "step": 140130 + }, + { + "epoch": 15.606971823142889, + "grad_norm": 0.051204271614551544, + "learning_rate": 6.995723727606587e-06, + "loss": 0.0072, + "num_input_tokens_seen": 170512640, + "step": 140135 + }, + { + "epoch": 15.607528678026506, + "grad_norm": 1.7579716444015503, + "learning_rate": 6.994038068934788e-06, + "loss": 0.1382, + "num_input_tokens_seen": 170518880, + "step": 140140 + }, + { + "epoch": 15.608085532910124, + "grad_norm": 0.5364060401916504, + "learning_rate": 6.992352580343731e-06, + "loss": 0.0309, + "num_input_tokens_seen": 170524864, + "step": 140145 + }, + { + "epoch": 15.608642387793742, + "grad_norm": 0.1812039464712143, + "learning_rate": 6.990667261849324e-06, + "loss": 0.0182, + "num_input_tokens_seen": 170530912, + "step": 140150 + }, + { + "epoch": 15.609199242677358, + "grad_norm": 0.09260555356740952, + "learning_rate": 6.988982113467501e-06, + "loss": 0.1445, + "num_input_tokens_seen": 170536736, + "step": 140155 + }, + { + "epoch": 15.609756097560975, + "grad_norm": 2.3728556632995605, + "learning_rate": 6.987297135214174e-06, + "loss": 0.0266, + "num_input_tokens_seen": 170542976, + "step": 140160 + }, + { + "epoch": 15.610312952444593, + "grad_norm": 0.0023383533116430044, + "learning_rate": 6.985612327105254e-06, + "loss": 0.0762, + "num_input_tokens_seen": 170548896, + "step": 140165 + }, + { + "epoch": 15.61086980732821, + "grad_norm": 0.013226991519331932, + "learning_rate": 6.983927689156652e-06, + "loss": 0.011, + "num_input_tokens_seen": 170555200, + "step": 140170 + }, + { + "epoch": 15.611426662211828, + "grad_norm": 0.10551712661981583, + "learning_rate": 6.982243221384296e-06, + "loss": 0.0155, + "num_input_tokens_seen": 170561056, + "step": 140175 + }, + { + "epoch": 15.611983517095444, + "grad_norm": 0.00025883992202579975, + "learning_rate": 6.98055892380409e-06, + "loss": 0.0735, + "num_input_tokens_seen": 170566976, + "step": 140180 + }, + { + "epoch": 15.612540371979062, + "grad_norm": 0.03618482127785683, + "learning_rate": 6.978874796431939e-06, + "loss": 0.0012, + "num_input_tokens_seen": 170573120, + "step": 140185 + }, + { + "epoch": 15.61309722686268, + "grad_norm": 0.11201327294111252, + "learning_rate": 6.977190839283745e-06, + "loss": 0.1256, + "num_input_tokens_seen": 170578848, + "step": 140190 + }, + { + "epoch": 15.613654081746297, + "grad_norm": 0.00020004046382382512, + "learning_rate": 6.975507052375432e-06, + "loss": 0.0011, + "num_input_tokens_seen": 170585152, + "step": 140195 + }, + { + "epoch": 15.614210936629915, + "grad_norm": 0.005313614849001169, + "learning_rate": 6.97382343572289e-06, + "loss": 0.0242, + "num_input_tokens_seen": 170591360, + "step": 140200 + }, + { + "epoch": 15.61476779151353, + "grad_norm": 0.0021457502152770758, + "learning_rate": 6.972139989342036e-06, + "loss": 0.0108, + "num_input_tokens_seen": 170597280, + "step": 140205 + }, + { + "epoch": 15.615324646397148, + "grad_norm": 0.0017683485057204962, + "learning_rate": 6.9704567132487605e-06, + "loss": 0.0218, + "num_input_tokens_seen": 170603328, + "step": 140210 + }, + { + "epoch": 15.615881501280766, + "grad_norm": 0.060845665633678436, + "learning_rate": 6.968773607458967e-06, + "loss": 0.0027, + "num_input_tokens_seen": 170609664, + "step": 140215 + }, + { + "epoch": 15.616438356164384, + "grad_norm": 0.16333292424678802, + "learning_rate": 6.967090671988546e-06, + "loss": 0.0371, + "num_input_tokens_seen": 170615968, + "step": 140220 + }, + { + "epoch": 15.616995211048001, + "grad_norm": 0.01842762902379036, + "learning_rate": 6.96540790685341e-06, + "loss": 0.1313, + "num_input_tokens_seen": 170621952, + "step": 140225 + }, + { + "epoch": 15.61755206593162, + "grad_norm": 0.05317225307226181, + "learning_rate": 6.963725312069444e-06, + "loss": 0.0461, + "num_input_tokens_seen": 170627904, + "step": 140230 + }, + { + "epoch": 15.618108920815235, + "grad_norm": 0.5162762999534607, + "learning_rate": 6.962042887652545e-06, + "loss": 0.0954, + "num_input_tokens_seen": 170634144, + "step": 140235 + }, + { + "epoch": 15.618665775698853, + "grad_norm": 0.00011051940964534879, + "learning_rate": 6.9603606336185925e-06, + "loss": 0.0541, + "num_input_tokens_seen": 170640352, + "step": 140240 + }, + { + "epoch": 15.61922263058247, + "grad_norm": 0.1279338002204895, + "learning_rate": 6.958678549983497e-06, + "loss": 0.0087, + "num_input_tokens_seen": 170646464, + "step": 140245 + }, + { + "epoch": 15.619779485466088, + "grad_norm": 0.00042448093881830573, + "learning_rate": 6.956996636763127e-06, + "loss": 0.2542, + "num_input_tokens_seen": 170652512, + "step": 140250 + }, + { + "epoch": 15.620336340349706, + "grad_norm": 3.1376688480377197, + "learning_rate": 6.9553148939733966e-06, + "loss": 0.0588, + "num_input_tokens_seen": 170658336, + "step": 140255 + }, + { + "epoch": 15.620893195233322, + "grad_norm": 0.030785106122493744, + "learning_rate": 6.953633321630157e-06, + "loss": 0.0146, + "num_input_tokens_seen": 170664192, + "step": 140260 + }, + { + "epoch": 15.62145005011694, + "grad_norm": 0.18635410070419312, + "learning_rate": 6.9519519197493195e-06, + "loss": 0.0915, + "num_input_tokens_seen": 170670240, + "step": 140265 + }, + { + "epoch": 15.622006905000557, + "grad_norm": 1.0472331047058105, + "learning_rate": 6.950270688346747e-06, + "loss": 0.0308, + "num_input_tokens_seen": 170676384, + "step": 140270 + }, + { + "epoch": 15.622563759884175, + "grad_norm": 0.21791484951972961, + "learning_rate": 6.9485896274383374e-06, + "loss": 0.0112, + "num_input_tokens_seen": 170682560, + "step": 140275 + }, + { + "epoch": 15.623120614767792, + "grad_norm": 0.0021664982195943594, + "learning_rate": 6.946908737039959e-06, + "loss": 0.0026, + "num_input_tokens_seen": 170688512, + "step": 140280 + }, + { + "epoch": 15.623677469651408, + "grad_norm": 0.00798252783715725, + "learning_rate": 6.9452280171674935e-06, + "loss": 0.0135, + "num_input_tokens_seen": 170694304, + "step": 140285 + }, + { + "epoch": 15.624234324535026, + "grad_norm": 0.06016826257109642, + "learning_rate": 6.943547467836814e-06, + "loss": 0.0055, + "num_input_tokens_seen": 170700384, + "step": 140290 + }, + { + "epoch": 15.624791179418644, + "grad_norm": 0.004494978114962578, + "learning_rate": 6.941867089063786e-06, + "loss": 0.0193, + "num_input_tokens_seen": 170706560, + "step": 140295 + }, + { + "epoch": 15.625348034302261, + "grad_norm": 0.04269597679376602, + "learning_rate": 6.940186880864302e-06, + "loss": 0.0198, + "num_input_tokens_seen": 170712864, + "step": 140300 + }, + { + "epoch": 15.625904889185879, + "grad_norm": 0.00039978325366973877, + "learning_rate": 6.938506843254219e-06, + "loss": 0.0418, + "num_input_tokens_seen": 170718848, + "step": 140305 + }, + { + "epoch": 15.626461744069495, + "grad_norm": 7.761183951515704e-05, + "learning_rate": 6.936826976249414e-06, + "loss": 0.0653, + "num_input_tokens_seen": 170725248, + "step": 140310 + }, + { + "epoch": 15.627018598953113, + "grad_norm": 0.04385439306497574, + "learning_rate": 6.935147279865739e-06, + "loss": 0.0293, + "num_input_tokens_seen": 170731680, + "step": 140315 + }, + { + "epoch": 15.62757545383673, + "grad_norm": 0.00018151600670535117, + "learning_rate": 6.9334677541190804e-06, + "loss": 0.068, + "num_input_tokens_seen": 170737664, + "step": 140320 + }, + { + "epoch": 15.628132308720348, + "grad_norm": 1.7945280075073242, + "learning_rate": 6.9317883990252935e-06, + "loss": 0.0877, + "num_input_tokens_seen": 170744320, + "step": 140325 + }, + { + "epoch": 15.628689163603966, + "grad_norm": 0.03706151992082596, + "learning_rate": 6.930109214600239e-06, + "loss": 0.0118, + "num_input_tokens_seen": 170750496, + "step": 140330 + }, + { + "epoch": 15.629246018487581, + "grad_norm": 0.0622766837477684, + "learning_rate": 6.928430200859776e-06, + "loss": 0.018, + "num_input_tokens_seen": 170755904, + "step": 140335 + }, + { + "epoch": 15.6298028733712, + "grad_norm": 0.07359625399112701, + "learning_rate": 6.926751357819772e-06, + "loss": 0.0093, + "num_input_tokens_seen": 170762112, + "step": 140340 + }, + { + "epoch": 15.630359728254817, + "grad_norm": 1.3730560541152954, + "learning_rate": 6.925072685496076e-06, + "loss": 0.0612, + "num_input_tokens_seen": 170768096, + "step": 140345 + }, + { + "epoch": 15.630916583138434, + "grad_norm": 0.2680375874042511, + "learning_rate": 6.923394183904558e-06, + "loss": 0.0702, + "num_input_tokens_seen": 170774240, + "step": 140350 + }, + { + "epoch": 15.631473438022052, + "grad_norm": 0.7729257941246033, + "learning_rate": 6.921715853061064e-06, + "loss": 0.1502, + "num_input_tokens_seen": 170780128, + "step": 140355 + }, + { + "epoch": 15.632030292905668, + "grad_norm": 0.8071351051330566, + "learning_rate": 6.920037692981448e-06, + "loss": 0.0157, + "num_input_tokens_seen": 170786336, + "step": 140360 + }, + { + "epoch": 15.632587147789286, + "grad_norm": 0.6418925523757935, + "learning_rate": 6.918359703681554e-06, + "loss": 0.0244, + "num_input_tokens_seen": 170791840, + "step": 140365 + }, + { + "epoch": 15.633144002672903, + "grad_norm": 0.39960578083992004, + "learning_rate": 6.916681885177248e-06, + "loss": 0.0118, + "num_input_tokens_seen": 170798112, + "step": 140370 + }, + { + "epoch": 15.633700857556521, + "grad_norm": 0.9406405687332153, + "learning_rate": 6.915004237484368e-06, + "loss": 0.0206, + "num_input_tokens_seen": 170804288, + "step": 140375 + }, + { + "epoch": 15.634257712440139, + "grad_norm": 0.8147303462028503, + "learning_rate": 6.913326760618763e-06, + "loss": 0.0115, + "num_input_tokens_seen": 170811040, + "step": 140380 + }, + { + "epoch": 15.634814567323755, + "grad_norm": 0.9199299812316895, + "learning_rate": 6.911649454596272e-06, + "loss": 0.0772, + "num_input_tokens_seen": 170816576, + "step": 140385 + }, + { + "epoch": 15.635371422207372, + "grad_norm": 0.14842280745506287, + "learning_rate": 6.909972319432747e-06, + "loss": 0.0573, + "num_input_tokens_seen": 170822816, + "step": 140390 + }, + { + "epoch": 15.63592827709099, + "grad_norm": 0.1787187159061432, + "learning_rate": 6.908295355144023e-06, + "loss": 0.0647, + "num_input_tokens_seen": 170829280, + "step": 140395 + }, + { + "epoch": 15.636485131974608, + "grad_norm": 0.09182735532522202, + "learning_rate": 6.906618561745959e-06, + "loss": 0.002, + "num_input_tokens_seen": 170835552, + "step": 140400 + }, + { + "epoch": 15.637041986858225, + "grad_norm": 1.0495625734329224, + "learning_rate": 6.904941939254364e-06, + "loss": 0.1258, + "num_input_tokens_seen": 170841408, + "step": 140405 + }, + { + "epoch": 15.637598841741841, + "grad_norm": 0.021476691588759422, + "learning_rate": 6.903265487685096e-06, + "loss": 0.1217, + "num_input_tokens_seen": 170847712, + "step": 140410 + }, + { + "epoch": 15.638155696625459, + "grad_norm": 0.04805326089262962, + "learning_rate": 6.901589207053977e-06, + "loss": 0.0045, + "num_input_tokens_seen": 170853600, + "step": 140415 + }, + { + "epoch": 15.638712551509077, + "grad_norm": 0.08942801505327225, + "learning_rate": 6.899913097376856e-06, + "loss": 0.0014, + "num_input_tokens_seen": 170859840, + "step": 140420 + }, + { + "epoch": 15.639269406392694, + "grad_norm": 0.6681938767433167, + "learning_rate": 6.898237158669557e-06, + "loss": 0.0487, + "num_input_tokens_seen": 170865984, + "step": 140425 + }, + { + "epoch": 15.639826261276312, + "grad_norm": 0.009452414698898792, + "learning_rate": 6.896561390947911e-06, + "loss": 0.0023, + "num_input_tokens_seen": 170871744, + "step": 140430 + }, + { + "epoch": 15.640383116159928, + "grad_norm": 0.021594488993287086, + "learning_rate": 6.89488579422774e-06, + "loss": 0.0812, + "num_input_tokens_seen": 170878112, + "step": 140435 + }, + { + "epoch": 15.640939971043545, + "grad_norm": 3.9717679023742676, + "learning_rate": 6.893210368524886e-06, + "loss": 0.0751, + "num_input_tokens_seen": 170883936, + "step": 140440 + }, + { + "epoch": 15.641496825927163, + "grad_norm": 0.016004618257284164, + "learning_rate": 6.891535113855166e-06, + "loss": 0.0836, + "num_input_tokens_seen": 170890112, + "step": 140445 + }, + { + "epoch": 15.64205368081078, + "grad_norm": 1.2945938110351562, + "learning_rate": 6.889860030234407e-06, + "loss": 0.047, + "num_input_tokens_seen": 170896320, + "step": 140450 + }, + { + "epoch": 15.642610535694399, + "grad_norm": 0.005489780101925135, + "learning_rate": 6.88818511767842e-06, + "loss": 0.0728, + "num_input_tokens_seen": 170902464, + "step": 140455 + }, + { + "epoch": 15.643167390578016, + "grad_norm": 0.002756959293037653, + "learning_rate": 6.886510376203043e-06, + "loss": 0.0018, + "num_input_tokens_seen": 170908448, + "step": 140460 + }, + { + "epoch": 15.643724245461632, + "grad_norm": 0.9461585283279419, + "learning_rate": 6.88483580582408e-06, + "loss": 0.0712, + "num_input_tokens_seen": 170914720, + "step": 140465 + }, + { + "epoch": 15.64428110034525, + "grad_norm": 0.023375486955046654, + "learning_rate": 6.883161406557373e-06, + "loss": 0.0078, + "num_input_tokens_seen": 170920768, + "step": 140470 + }, + { + "epoch": 15.644837955228867, + "grad_norm": 1.4487652778625488, + "learning_rate": 6.881487178418708e-06, + "loss": 0.0364, + "num_input_tokens_seen": 170926880, + "step": 140475 + }, + { + "epoch": 15.645394810112485, + "grad_norm": 1.111319899559021, + "learning_rate": 6.879813121423917e-06, + "loss": 0.0319, + "num_input_tokens_seen": 170932672, + "step": 140480 + }, + { + "epoch": 15.645951664996103, + "grad_norm": 0.0001750249502947554, + "learning_rate": 6.878139235588801e-06, + "loss": 0.0735, + "num_input_tokens_seen": 170938688, + "step": 140485 + }, + { + "epoch": 15.646508519879719, + "grad_norm": 0.0035776987206190825, + "learning_rate": 6.87646552092919e-06, + "loss": 0.0102, + "num_input_tokens_seen": 170944704, + "step": 140490 + }, + { + "epoch": 15.647065374763336, + "grad_norm": 0.00979663711041212, + "learning_rate": 6.874791977460879e-06, + "loss": 0.1365, + "num_input_tokens_seen": 170950912, + "step": 140495 + }, + { + "epoch": 15.647622229646954, + "grad_norm": 0.06689824163913727, + "learning_rate": 6.873118605199683e-06, + "loss": 0.008, + "num_input_tokens_seen": 170957056, + "step": 140500 + }, + { + "epoch": 15.648179084530572, + "grad_norm": 0.00013187198783271015, + "learning_rate": 6.8714454041613944e-06, + "loss": 0.0514, + "num_input_tokens_seen": 170962880, + "step": 140505 + }, + { + "epoch": 15.64873593941419, + "grad_norm": 0.013082618825137615, + "learning_rate": 6.869772374361835e-06, + "loss": 0.0363, + "num_input_tokens_seen": 170968736, + "step": 140510 + }, + { + "epoch": 15.649292794297805, + "grad_norm": 0.1429106593132019, + "learning_rate": 6.868099515816804e-06, + "loss": 0.0047, + "num_input_tokens_seen": 170974816, + "step": 140515 + }, + { + "epoch": 15.649849649181423, + "grad_norm": 0.3201691508293152, + "learning_rate": 6.8664268285421e-06, + "loss": 0.0697, + "num_input_tokens_seen": 170980960, + "step": 140520 + }, + { + "epoch": 15.65040650406504, + "grad_norm": 0.0001546279527246952, + "learning_rate": 6.864754312553512e-06, + "loss": 0.002, + "num_input_tokens_seen": 170987168, + "step": 140525 + }, + { + "epoch": 15.650963358948658, + "grad_norm": 0.8159081339836121, + "learning_rate": 6.863081967866861e-06, + "loss": 0.0102, + "num_input_tokens_seen": 170993568, + "step": 140530 + }, + { + "epoch": 15.651520213832276, + "grad_norm": 0.41974127292633057, + "learning_rate": 6.861409794497922e-06, + "loss": 0.0858, + "num_input_tokens_seen": 170999424, + "step": 140535 + }, + { + "epoch": 15.652077068715892, + "grad_norm": 0.578578531742096, + "learning_rate": 6.859737792462514e-06, + "loss": 0.01, + "num_input_tokens_seen": 171005472, + "step": 140540 + }, + { + "epoch": 15.65263392359951, + "grad_norm": 0.0024287390988320112, + "learning_rate": 6.858065961776402e-06, + "loss": 0.0679, + "num_input_tokens_seen": 171011392, + "step": 140545 + }, + { + "epoch": 15.653190778483127, + "grad_norm": 0.03189995512366295, + "learning_rate": 6.856394302455401e-06, + "loss": 0.0132, + "num_input_tokens_seen": 171017600, + "step": 140550 + }, + { + "epoch": 15.653747633366745, + "grad_norm": 0.027834393084049225, + "learning_rate": 6.8547228145152855e-06, + "loss": 0.0944, + "num_input_tokens_seen": 171023680, + "step": 140555 + }, + { + "epoch": 15.654304488250363, + "grad_norm": 0.00012052374222548679, + "learning_rate": 6.853051497971857e-06, + "loss": 0.0036, + "num_input_tokens_seen": 171030208, + "step": 140560 + }, + { + "epoch": 15.654861343133978, + "grad_norm": 0.00042810224113054574, + "learning_rate": 6.8513803528408945e-06, + "loss": 0.1693, + "num_input_tokens_seen": 171036544, + "step": 140565 + }, + { + "epoch": 15.655418198017596, + "grad_norm": 0.010912497527897358, + "learning_rate": 6.849709379138186e-06, + "loss": 0.0175, + "num_input_tokens_seen": 171042336, + "step": 140570 + }, + { + "epoch": 15.655975052901214, + "grad_norm": 0.0008930754265747964, + "learning_rate": 6.848038576879509e-06, + "loss": 0.0209, + "num_input_tokens_seen": 171048256, + "step": 140575 + }, + { + "epoch": 15.656531907784832, + "grad_norm": 0.40399348735809326, + "learning_rate": 6.846367946080656e-06, + "loss": 0.0887, + "num_input_tokens_seen": 171054336, + "step": 140580 + }, + { + "epoch": 15.65708876266845, + "grad_norm": 0.6163834929466248, + "learning_rate": 6.844697486757401e-06, + "loss": 0.0297, + "num_input_tokens_seen": 171060096, + "step": 140585 + }, + { + "epoch": 15.657645617552067, + "grad_norm": 1.6930298805236816, + "learning_rate": 6.843027198925528e-06, + "loss": 0.1784, + "num_input_tokens_seen": 171065792, + "step": 140590 + }, + { + "epoch": 15.658202472435683, + "grad_norm": 0.005792604293674231, + "learning_rate": 6.8413570826008e-06, + "loss": 0.0108, + "num_input_tokens_seen": 171071872, + "step": 140595 + }, + { + "epoch": 15.6587593273193, + "grad_norm": 0.19650645554065704, + "learning_rate": 6.839687137799009e-06, + "loss": 0.0046, + "num_input_tokens_seen": 171077664, + "step": 140600 + }, + { + "epoch": 15.659316182202918, + "grad_norm": 0.8280932903289795, + "learning_rate": 6.838017364535917e-06, + "loss": 0.0568, + "num_input_tokens_seen": 171083936, + "step": 140605 + }, + { + "epoch": 15.659873037086536, + "grad_norm": 0.0008101093699224293, + "learning_rate": 6.836347762827311e-06, + "loss": 0.0416, + "num_input_tokens_seen": 171090016, + "step": 140610 + }, + { + "epoch": 15.660429891970153, + "grad_norm": 0.10990545153617859, + "learning_rate": 6.834678332688951e-06, + "loss": 0.0598, + "num_input_tokens_seen": 171095680, + "step": 140615 + }, + { + "epoch": 15.66098674685377, + "grad_norm": 0.019959522411227226, + "learning_rate": 6.833009074136606e-06, + "loss": 0.0303, + "num_input_tokens_seen": 171101632, + "step": 140620 + }, + { + "epoch": 15.661543601737387, + "grad_norm": 0.012282253243029118, + "learning_rate": 6.831339987186042e-06, + "loss": 0.0245, + "num_input_tokens_seen": 171107872, + "step": 140625 + }, + { + "epoch": 15.662100456621005, + "grad_norm": 1.9250580072402954, + "learning_rate": 6.829671071853033e-06, + "loss": 0.0888, + "num_input_tokens_seen": 171114240, + "step": 140630 + }, + { + "epoch": 15.662657311504622, + "grad_norm": 0.00030613463604822755, + "learning_rate": 6.8280023281533406e-06, + "loss": 0.0105, + "num_input_tokens_seen": 171120704, + "step": 140635 + }, + { + "epoch": 15.66321416638824, + "grad_norm": 0.03633115068078041, + "learning_rate": 6.826333756102723e-06, + "loss": 0.0538, + "num_input_tokens_seen": 171126112, + "step": 140640 + }, + { + "epoch": 15.663771021271856, + "grad_norm": 0.08555828779935837, + "learning_rate": 6.824665355716939e-06, + "loss": 0.1216, + "num_input_tokens_seen": 171131968, + "step": 140645 + }, + { + "epoch": 15.664327876155474, + "grad_norm": 0.44355371594429016, + "learning_rate": 6.82299712701176e-06, + "loss": 0.0324, + "num_input_tokens_seen": 171138304, + "step": 140650 + }, + { + "epoch": 15.664884731039091, + "grad_norm": 3.6075217723846436, + "learning_rate": 6.821329070002927e-06, + "loss": 0.0926, + "num_input_tokens_seen": 171144544, + "step": 140655 + }, + { + "epoch": 15.665441585922709, + "grad_norm": 0.9030161499977112, + "learning_rate": 6.8196611847062196e-06, + "loss": 0.0557, + "num_input_tokens_seen": 171150944, + "step": 140660 + }, + { + "epoch": 15.665998440806327, + "grad_norm": 0.5294556021690369, + "learning_rate": 6.817993471137365e-06, + "loss": 0.0076, + "num_input_tokens_seen": 171157344, + "step": 140665 + }, + { + "epoch": 15.666555295689943, + "grad_norm": 0.00023751160188112408, + "learning_rate": 6.8163259293121365e-06, + "loss": 0.1089, + "num_input_tokens_seen": 171163744, + "step": 140670 + }, + { + "epoch": 15.66711215057356, + "grad_norm": 0.6540617942810059, + "learning_rate": 6.81465855924627e-06, + "loss": 0.0303, + "num_input_tokens_seen": 171170240, + "step": 140675 + }, + { + "epoch": 15.667669005457178, + "grad_norm": 0.7904096245765686, + "learning_rate": 6.812991360955531e-06, + "loss": 0.0298, + "num_input_tokens_seen": 171176032, + "step": 140680 + }, + { + "epoch": 15.668225860340796, + "grad_norm": 0.00015999602328520268, + "learning_rate": 6.8113243344556596e-06, + "loss": 0.0172, + "num_input_tokens_seen": 171182208, + "step": 140685 + }, + { + "epoch": 15.668782715224413, + "grad_norm": 0.0042787156999111176, + "learning_rate": 6.8096574797624015e-06, + "loss": 0.014, + "num_input_tokens_seen": 171188224, + "step": 140690 + }, + { + "epoch": 15.66933957010803, + "grad_norm": 0.5976364016532898, + "learning_rate": 6.807990796891497e-06, + "loss": 0.0522, + "num_input_tokens_seen": 171193952, + "step": 140695 + }, + { + "epoch": 15.669896424991647, + "grad_norm": 1.827691674232483, + "learning_rate": 6.8063242858587e-06, + "loss": 0.163, + "num_input_tokens_seen": 171200032, + "step": 140700 + }, + { + "epoch": 15.670453279875264, + "grad_norm": 0.23101459443569183, + "learning_rate": 6.804657946679749e-06, + "loss": 0.0117, + "num_input_tokens_seen": 171206272, + "step": 140705 + }, + { + "epoch": 15.671010134758882, + "grad_norm": 0.0022326121106743813, + "learning_rate": 6.802991779370379e-06, + "loss": 0.0073, + "num_input_tokens_seen": 171212608, + "step": 140710 + }, + { + "epoch": 15.6715669896425, + "grad_norm": 0.033351071178913116, + "learning_rate": 6.801325783946333e-06, + "loss": 0.0578, + "num_input_tokens_seen": 171218688, + "step": 140715 + }, + { + "epoch": 15.672123844526116, + "grad_norm": 0.00024161397595889866, + "learning_rate": 6.799659960423335e-06, + "loss": 0.0737, + "num_input_tokens_seen": 171224896, + "step": 140720 + }, + { + "epoch": 15.672680699409733, + "grad_norm": 0.5024808049201965, + "learning_rate": 6.79799430881714e-06, + "loss": 0.07, + "num_input_tokens_seen": 171230656, + "step": 140725 + }, + { + "epoch": 15.673237554293351, + "grad_norm": 0.0007945570396259427, + "learning_rate": 6.796328829143472e-06, + "loss": 0.0548, + "num_input_tokens_seen": 171236960, + "step": 140730 + }, + { + "epoch": 15.673794409176969, + "grad_norm": 0.04129885509610176, + "learning_rate": 6.7946635214180654e-06, + "loss": 0.0021, + "num_input_tokens_seen": 171243008, + "step": 140735 + }, + { + "epoch": 15.674351264060586, + "grad_norm": 0.003438501851633191, + "learning_rate": 6.792998385656637e-06, + "loss": 0.0273, + "num_input_tokens_seen": 171249248, + "step": 140740 + }, + { + "epoch": 15.674908118944202, + "grad_norm": 0.15633562207221985, + "learning_rate": 6.791333421874935e-06, + "loss": 0.009, + "num_input_tokens_seen": 171255712, + "step": 140745 + }, + { + "epoch": 15.67546497382782, + "grad_norm": 0.0009634998859837651, + "learning_rate": 6.789668630088669e-06, + "loss": 0.014, + "num_input_tokens_seen": 171262048, + "step": 140750 + }, + { + "epoch": 15.676021828711438, + "grad_norm": 1.3370102643966675, + "learning_rate": 6.788004010313578e-06, + "loss": 0.0352, + "num_input_tokens_seen": 171268256, + "step": 140755 + }, + { + "epoch": 15.676578683595055, + "grad_norm": 0.02025112882256508, + "learning_rate": 6.786339562565383e-06, + "loss": 0.0179, + "num_input_tokens_seen": 171273856, + "step": 140760 + }, + { + "epoch": 15.677135538478673, + "grad_norm": 0.16182248294353485, + "learning_rate": 6.784675286859804e-06, + "loss": 0.167, + "num_input_tokens_seen": 171279968, + "step": 140765 + }, + { + "epoch": 15.677692393362289, + "grad_norm": 0.2736119329929352, + "learning_rate": 6.783011183212551e-06, + "loss": 0.0703, + "num_input_tokens_seen": 171286080, + "step": 140770 + }, + { + "epoch": 15.678249248245907, + "grad_norm": 0.6937485933303833, + "learning_rate": 6.781347251639361e-06, + "loss": 0.0101, + "num_input_tokens_seen": 171292160, + "step": 140775 + }, + { + "epoch": 15.678806103129524, + "grad_norm": 0.8870074152946472, + "learning_rate": 6.779683492155944e-06, + "loss": 0.0245, + "num_input_tokens_seen": 171298528, + "step": 140780 + }, + { + "epoch": 15.679362958013142, + "grad_norm": 0.18636715412139893, + "learning_rate": 6.778019904778013e-06, + "loss": 0.0316, + "num_input_tokens_seen": 171304672, + "step": 140785 + }, + { + "epoch": 15.67991981289676, + "grad_norm": 0.09302739799022675, + "learning_rate": 6.776356489521277e-06, + "loss": 0.0238, + "num_input_tokens_seen": 171310848, + "step": 140790 + }, + { + "epoch": 15.680476667780376, + "grad_norm": 0.0002933884970843792, + "learning_rate": 6.774693246401461e-06, + "loss": 0.0499, + "num_input_tokens_seen": 171317248, + "step": 140795 + }, + { + "epoch": 15.681033522663993, + "grad_norm": 0.0039832619950175285, + "learning_rate": 6.773030175434261e-06, + "loss": 0.006, + "num_input_tokens_seen": 171323424, + "step": 140800 + }, + { + "epoch": 15.68159037754761, + "grad_norm": 1.8062947988510132, + "learning_rate": 6.77136727663541e-06, + "loss": 0.0841, + "num_input_tokens_seen": 171329792, + "step": 140805 + }, + { + "epoch": 15.682147232431229, + "grad_norm": 0.3153417110443115, + "learning_rate": 6.769704550020583e-06, + "loss": 0.0468, + "num_input_tokens_seen": 171335968, + "step": 140810 + }, + { + "epoch": 15.682704087314846, + "grad_norm": 0.017296474426984787, + "learning_rate": 6.768041995605512e-06, + "loss": 0.0026, + "num_input_tokens_seen": 171342304, + "step": 140815 + }, + { + "epoch": 15.683260942198464, + "grad_norm": 0.510307788848877, + "learning_rate": 6.766379613405885e-06, + "loss": 0.0129, + "num_input_tokens_seen": 171348128, + "step": 140820 + }, + { + "epoch": 15.68381779708208, + "grad_norm": 0.00011528599861776456, + "learning_rate": 6.7647174034374175e-06, + "loss": 0.0985, + "num_input_tokens_seen": 171354496, + "step": 140825 + }, + { + "epoch": 15.684374651965697, + "grad_norm": 2.324350357055664, + "learning_rate": 6.763055365715803e-06, + "loss": 0.028, + "num_input_tokens_seen": 171360544, + "step": 140830 + }, + { + "epoch": 15.684931506849315, + "grad_norm": 0.13447923958301544, + "learning_rate": 6.761393500256741e-06, + "loss": 0.0085, + "num_input_tokens_seen": 171366496, + "step": 140835 + }, + { + "epoch": 15.685488361732933, + "grad_norm": 0.09174036979675293, + "learning_rate": 6.759731807075925e-06, + "loss": 0.2011, + "num_input_tokens_seen": 171371808, + "step": 140840 + }, + { + "epoch": 15.68604521661655, + "grad_norm": 0.004496584180742502, + "learning_rate": 6.758070286189061e-06, + "loss": 0.0005, + "num_input_tokens_seen": 171377824, + "step": 140845 + }, + { + "epoch": 15.686602071500166, + "grad_norm": 0.007903114892542362, + "learning_rate": 6.7564089376118415e-06, + "loss": 0.028, + "num_input_tokens_seen": 171383968, + "step": 140850 + }, + { + "epoch": 15.687158926383784, + "grad_norm": 0.7813740968704224, + "learning_rate": 6.754747761359953e-06, + "loss": 0.0106, + "num_input_tokens_seen": 171390272, + "step": 140855 + }, + { + "epoch": 15.687715781267402, + "grad_norm": 1.781983733177185, + "learning_rate": 6.753086757449084e-06, + "loss": 0.2034, + "num_input_tokens_seen": 171395552, + "step": 140860 + }, + { + "epoch": 15.68827263615102, + "grad_norm": 0.00026656591217033565, + "learning_rate": 6.751425925894936e-06, + "loss": 0.0134, + "num_input_tokens_seen": 171401568, + "step": 140865 + }, + { + "epoch": 15.688829491034637, + "grad_norm": 1.0046300888061523, + "learning_rate": 6.749765266713184e-06, + "loss": 0.161, + "num_input_tokens_seen": 171407680, + "step": 140870 + }, + { + "epoch": 15.689386345918253, + "grad_norm": 0.439826101064682, + "learning_rate": 6.748104779919534e-06, + "loss": 0.0691, + "num_input_tokens_seen": 171413600, + "step": 140875 + }, + { + "epoch": 15.68994320080187, + "grad_norm": 0.33604076504707336, + "learning_rate": 6.746444465529645e-06, + "loss": 0.1748, + "num_input_tokens_seen": 171420064, + "step": 140880 + }, + { + "epoch": 15.690500055685488, + "grad_norm": 0.016676900908350945, + "learning_rate": 6.74478432355922e-06, + "loss": 0.0755, + "num_input_tokens_seen": 171426144, + "step": 140885 + }, + { + "epoch": 15.691056910569106, + "grad_norm": 1.234238862991333, + "learning_rate": 6.743124354023924e-06, + "loss": 0.0263, + "num_input_tokens_seen": 171432544, + "step": 140890 + }, + { + "epoch": 15.691613765452724, + "grad_norm": 0.0011672705877572298, + "learning_rate": 6.741464556939453e-06, + "loss": 0.0158, + "num_input_tokens_seen": 171438656, + "step": 140895 + }, + { + "epoch": 15.69217062033634, + "grad_norm": 0.00013268506154417992, + "learning_rate": 6.73980493232148e-06, + "loss": 0.0211, + "num_input_tokens_seen": 171444928, + "step": 140900 + }, + { + "epoch": 15.692727475219957, + "grad_norm": 0.9738280177116394, + "learning_rate": 6.738145480185676e-06, + "loss": 0.0114, + "num_input_tokens_seen": 171450720, + "step": 140905 + }, + { + "epoch": 15.693284330103575, + "grad_norm": 0.8106975555419922, + "learning_rate": 6.736486200547715e-06, + "loss": 0.038, + "num_input_tokens_seen": 171457024, + "step": 140910 + }, + { + "epoch": 15.693841184987193, + "grad_norm": 0.00011307161912554875, + "learning_rate": 6.7348270934232795e-06, + "loss": 0.0222, + "num_input_tokens_seen": 171462944, + "step": 140915 + }, + { + "epoch": 15.69439803987081, + "grad_norm": 0.0007792808464728296, + "learning_rate": 6.7331681588280375e-06, + "loss": 0.0468, + "num_input_tokens_seen": 171469152, + "step": 140920 + }, + { + "epoch": 15.694954894754426, + "grad_norm": 0.00570367556065321, + "learning_rate": 6.731509396777655e-06, + "loss": 0.0267, + "num_input_tokens_seen": 171475200, + "step": 140925 + }, + { + "epoch": 15.695511749638044, + "grad_norm": 0.004262600094079971, + "learning_rate": 6.729850807287796e-06, + "loss": 0.0062, + "num_input_tokens_seen": 171481216, + "step": 140930 + }, + { + "epoch": 15.696068604521662, + "grad_norm": 0.0010203261626884341, + "learning_rate": 6.728192390374144e-06, + "loss": 0.0759, + "num_input_tokens_seen": 171487360, + "step": 140935 + }, + { + "epoch": 15.69662545940528, + "grad_norm": 0.004173357039690018, + "learning_rate": 6.726534146052343e-06, + "loss": 0.0109, + "num_input_tokens_seen": 171493536, + "step": 140940 + }, + { + "epoch": 15.697182314288897, + "grad_norm": 0.047930359840393066, + "learning_rate": 6.724876074338085e-06, + "loss": 0.0148, + "num_input_tokens_seen": 171499456, + "step": 140945 + }, + { + "epoch": 15.697739169172515, + "grad_norm": 0.19587565958499908, + "learning_rate": 6.723218175246998e-06, + "loss": 0.0105, + "num_input_tokens_seen": 171505728, + "step": 140950 + }, + { + "epoch": 15.69829602405613, + "grad_norm": 0.06686396896839142, + "learning_rate": 6.721560448794767e-06, + "loss": 0.0245, + "num_input_tokens_seen": 171511712, + "step": 140955 + }, + { + "epoch": 15.698852878939748, + "grad_norm": 0.14463715255260468, + "learning_rate": 6.719902894997032e-06, + "loss": 0.0427, + "num_input_tokens_seen": 171518272, + "step": 140960 + }, + { + "epoch": 15.699409733823366, + "grad_norm": 0.02420998550951481, + "learning_rate": 6.71824551386947e-06, + "loss": 0.0208, + "num_input_tokens_seen": 171524288, + "step": 140965 + }, + { + "epoch": 15.699966588706983, + "grad_norm": 1.7706899642944336, + "learning_rate": 6.716588305427726e-06, + "loss": 0.1256, + "num_input_tokens_seen": 171530080, + "step": 140970 + }, + { + "epoch": 15.700523443590601, + "grad_norm": 0.03404109179973602, + "learning_rate": 6.7149312696874525e-06, + "loss": 0.0559, + "num_input_tokens_seen": 171536320, + "step": 140975 + }, + { + "epoch": 15.701080298474217, + "grad_norm": 0.0006148442625999451, + "learning_rate": 6.713274406664297e-06, + "loss": 0.0221, + "num_input_tokens_seen": 171542368, + "step": 140980 + }, + { + "epoch": 15.701637153357835, + "grad_norm": 0.007482504937797785, + "learning_rate": 6.7116177163739216e-06, + "loss": 0.017, + "num_input_tokens_seen": 171548672, + "step": 140985 + }, + { + "epoch": 15.702194008241452, + "grad_norm": 0.0928434208035469, + "learning_rate": 6.709961198831971e-06, + "loss": 0.1215, + "num_input_tokens_seen": 171554016, + "step": 140990 + }, + { + "epoch": 15.70275086312507, + "grad_norm": 0.12028733640909195, + "learning_rate": 6.70830485405409e-06, + "loss": 0.0664, + "num_input_tokens_seen": 171560160, + "step": 140995 + }, + { + "epoch": 15.703307718008688, + "grad_norm": 0.7269108295440674, + "learning_rate": 6.706648682055916e-06, + "loss": 0.0514, + "num_input_tokens_seen": 171566368, + "step": 141000 + }, + { + "epoch": 15.703864572892304, + "grad_norm": 0.3425212502479553, + "learning_rate": 6.704992682853112e-06, + "loss": 0.0476, + "num_input_tokens_seen": 171572448, + "step": 141005 + }, + { + "epoch": 15.704421427775921, + "grad_norm": 0.00030627826345153153, + "learning_rate": 6.703336856461298e-06, + "loss": 0.0099, + "num_input_tokens_seen": 171578912, + "step": 141010 + }, + { + "epoch": 15.704978282659539, + "grad_norm": 1.2481434345245361, + "learning_rate": 6.701681202896137e-06, + "loss": 0.1089, + "num_input_tokens_seen": 171585248, + "step": 141015 + }, + { + "epoch": 15.705535137543157, + "grad_norm": 0.2897057831287384, + "learning_rate": 6.700025722173256e-06, + "loss": 0.0365, + "num_input_tokens_seen": 171590880, + "step": 141020 + }, + { + "epoch": 15.706091992426774, + "grad_norm": 0.01609247922897339, + "learning_rate": 6.698370414308297e-06, + "loss": 0.0153, + "num_input_tokens_seen": 171596864, + "step": 141025 + }, + { + "epoch": 15.70664884731039, + "grad_norm": 1.9425692558288574, + "learning_rate": 6.696715279316882e-06, + "loss": 0.1547, + "num_input_tokens_seen": 171602688, + "step": 141030 + }, + { + "epoch": 15.707205702194008, + "grad_norm": 0.0145026920363307, + "learning_rate": 6.695060317214663e-06, + "loss": 0.0039, + "num_input_tokens_seen": 171608064, + "step": 141035 + }, + { + "epoch": 15.707762557077626, + "grad_norm": 0.19337378442287445, + "learning_rate": 6.693405528017266e-06, + "loss": 0.0075, + "num_input_tokens_seen": 171614304, + "step": 141040 + }, + { + "epoch": 15.708319411961243, + "grad_norm": 0.07989487797021866, + "learning_rate": 6.691750911740319e-06, + "loss": 0.0182, + "num_input_tokens_seen": 171621088, + "step": 141045 + }, + { + "epoch": 15.708876266844861, + "grad_norm": 0.976411759853363, + "learning_rate": 6.690096468399448e-06, + "loss": 0.095, + "num_input_tokens_seen": 171627104, + "step": 141050 + }, + { + "epoch": 15.709433121728477, + "grad_norm": 0.0030392890330404043, + "learning_rate": 6.688442198010292e-06, + "loss": 0.0031, + "num_input_tokens_seen": 171633504, + "step": 141055 + }, + { + "epoch": 15.709989976612095, + "grad_norm": 0.0008381067309528589, + "learning_rate": 6.686788100588462e-06, + "loss": 0.015, + "num_input_tokens_seen": 171639520, + "step": 141060 + }, + { + "epoch": 15.710546831495712, + "grad_norm": 0.14152655005455017, + "learning_rate": 6.685134176149607e-06, + "loss": 0.0222, + "num_input_tokens_seen": 171645632, + "step": 141065 + }, + { + "epoch": 15.71110368637933, + "grad_norm": 0.000938237237278372, + "learning_rate": 6.683480424709315e-06, + "loss": 0.0239, + "num_input_tokens_seen": 171651712, + "step": 141070 + }, + { + "epoch": 15.711660541262948, + "grad_norm": 0.0025935873854905367, + "learning_rate": 6.681826846283237e-06, + "loss": 0.0118, + "num_input_tokens_seen": 171657696, + "step": 141075 + }, + { + "epoch": 15.712217396146563, + "grad_norm": 0.009872069582343102, + "learning_rate": 6.6801734408869725e-06, + "loss": 0.1899, + "num_input_tokens_seen": 171663616, + "step": 141080 + }, + { + "epoch": 15.712774251030181, + "grad_norm": 0.44558021426200867, + "learning_rate": 6.678520208536154e-06, + "loss": 0.1687, + "num_input_tokens_seen": 171669824, + "step": 141085 + }, + { + "epoch": 15.713331105913799, + "grad_norm": 2.9529967308044434, + "learning_rate": 6.676867149246391e-06, + "loss": 0.1141, + "num_input_tokens_seen": 171676064, + "step": 141090 + }, + { + "epoch": 15.713887960797416, + "grad_norm": 0.09378919005393982, + "learning_rate": 6.675214263033297e-06, + "loss": 0.1519, + "num_input_tokens_seen": 171682496, + "step": 141095 + }, + { + "epoch": 15.714444815681034, + "grad_norm": 0.04660886898636818, + "learning_rate": 6.67356154991248e-06, + "loss": 0.0475, + "num_input_tokens_seen": 171689024, + "step": 141100 + }, + { + "epoch": 15.71500167056465, + "grad_norm": 0.004765527322888374, + "learning_rate": 6.6719090098995655e-06, + "loss": 0.1209, + "num_input_tokens_seen": 171694816, + "step": 141105 + }, + { + "epoch": 15.715558525448268, + "grad_norm": 1.9874253273010254, + "learning_rate": 6.670256643010153e-06, + "loss": 0.0704, + "num_input_tokens_seen": 171701088, + "step": 141110 + }, + { + "epoch": 15.716115380331885, + "grad_norm": 2.340427875518799, + "learning_rate": 6.668604449259852e-06, + "loss": 0.0189, + "num_input_tokens_seen": 171707488, + "step": 141115 + }, + { + "epoch": 15.716672235215503, + "grad_norm": 0.30167415738105774, + "learning_rate": 6.666952428664269e-06, + "loss": 0.1311, + "num_input_tokens_seen": 171713568, + "step": 141120 + }, + { + "epoch": 15.71722909009912, + "grad_norm": 1.49746835231781, + "learning_rate": 6.665300581239e-06, + "loss": 0.0444, + "num_input_tokens_seen": 171719936, + "step": 141125 + }, + { + "epoch": 15.717785944982737, + "grad_norm": 0.37703245878219604, + "learning_rate": 6.663648906999667e-06, + "loss": 0.019, + "num_input_tokens_seen": 171726112, + "step": 141130 + }, + { + "epoch": 15.718342799866354, + "grad_norm": 0.10626839101314545, + "learning_rate": 6.661997405961859e-06, + "loss": 0.0161, + "num_input_tokens_seen": 171731744, + "step": 141135 + }, + { + "epoch": 15.718899654749972, + "grad_norm": 2.5836892127990723, + "learning_rate": 6.660346078141178e-06, + "loss": 0.1013, + "num_input_tokens_seen": 171737824, + "step": 141140 + }, + { + "epoch": 15.71945650963359, + "grad_norm": 0.0008008565055206418, + "learning_rate": 6.658694923553213e-06, + "loss": 0.0628, + "num_input_tokens_seen": 171743264, + "step": 141145 + }, + { + "epoch": 15.720013364517207, + "grad_norm": 0.013723223470151424, + "learning_rate": 6.657043942213578e-06, + "loss": 0.0214, + "num_input_tokens_seen": 171749440, + "step": 141150 + }, + { + "epoch": 15.720570219400823, + "grad_norm": 0.8940221071243286, + "learning_rate": 6.655393134137853e-06, + "loss": 0.0781, + "num_input_tokens_seen": 171755264, + "step": 141155 + }, + { + "epoch": 15.721127074284441, + "grad_norm": 0.5179032683372498, + "learning_rate": 6.653742499341642e-06, + "loss": 0.034, + "num_input_tokens_seen": 171761152, + "step": 141160 + }, + { + "epoch": 15.721683929168059, + "grad_norm": 0.6748848557472229, + "learning_rate": 6.652092037840532e-06, + "loss": 0.0283, + "num_input_tokens_seen": 171767552, + "step": 141165 + }, + { + "epoch": 15.722240784051676, + "grad_norm": 0.09167549759149551, + "learning_rate": 6.650441749650116e-06, + "loss": 0.0064, + "num_input_tokens_seen": 171773600, + "step": 141170 + }, + { + "epoch": 15.722797638935294, + "grad_norm": 0.41243401169776917, + "learning_rate": 6.648791634785967e-06, + "loss": 0.0079, + "num_input_tokens_seen": 171779648, + "step": 141175 + }, + { + "epoch": 15.723354493818912, + "grad_norm": 0.033271223306655884, + "learning_rate": 6.647141693263695e-06, + "loss": 0.005, + "num_input_tokens_seen": 171785312, + "step": 141180 + }, + { + "epoch": 15.723911348702527, + "grad_norm": 1.1819062232971191, + "learning_rate": 6.645491925098874e-06, + "loss": 0.0456, + "num_input_tokens_seen": 171791520, + "step": 141185 + }, + { + "epoch": 15.724468203586145, + "grad_norm": 0.43357473611831665, + "learning_rate": 6.643842330307085e-06, + "loss": 0.0286, + "num_input_tokens_seen": 171797216, + "step": 141190 + }, + { + "epoch": 15.725025058469763, + "grad_norm": 0.8247392773628235, + "learning_rate": 6.642192908903905e-06, + "loss": 0.0135, + "num_input_tokens_seen": 171803488, + "step": 141195 + }, + { + "epoch": 15.72558191335338, + "grad_norm": 1.964333415031433, + "learning_rate": 6.640543660904927e-06, + "loss": 0.0908, + "num_input_tokens_seen": 171809568, + "step": 141200 + }, + { + "epoch": 15.726138768236998, + "grad_norm": 0.05959715321660042, + "learning_rate": 6.6388945863257195e-06, + "loss": 0.0481, + "num_input_tokens_seen": 171815872, + "step": 141205 + }, + { + "epoch": 15.726695623120614, + "grad_norm": 0.13125988841056824, + "learning_rate": 6.637245685181875e-06, + "loss": 0.0206, + "num_input_tokens_seen": 171821888, + "step": 141210 + }, + { + "epoch": 15.727252478004232, + "grad_norm": 0.1628761738538742, + "learning_rate": 6.635596957488943e-06, + "loss": 0.0362, + "num_input_tokens_seen": 171827968, + "step": 141215 + }, + { + "epoch": 15.72780933288785, + "grad_norm": 0.0008563663577660918, + "learning_rate": 6.633948403262519e-06, + "loss": 0.0205, + "num_input_tokens_seen": 171834208, + "step": 141220 + }, + { + "epoch": 15.728366187771467, + "grad_norm": 0.46069225668907166, + "learning_rate": 6.632300022518159e-06, + "loss": 0.2159, + "num_input_tokens_seen": 171840448, + "step": 141225 + }, + { + "epoch": 15.728923042655085, + "grad_norm": 0.6597110629081726, + "learning_rate": 6.630651815271449e-06, + "loss": 0.0695, + "num_input_tokens_seen": 171846624, + "step": 141230 + }, + { + "epoch": 15.7294798975387, + "grad_norm": 1.2834477424621582, + "learning_rate": 6.629003781537951e-06, + "loss": 0.0585, + "num_input_tokens_seen": 171852800, + "step": 141235 + }, + { + "epoch": 15.730036752422318, + "grad_norm": 0.03365478664636612, + "learning_rate": 6.627355921333231e-06, + "loss": 0.0116, + "num_input_tokens_seen": 171858912, + "step": 141240 + }, + { + "epoch": 15.730593607305936, + "grad_norm": 0.08101347088813782, + "learning_rate": 6.625708234672845e-06, + "loss": 0.1191, + "num_input_tokens_seen": 171865312, + "step": 141245 + }, + { + "epoch": 15.731150462189554, + "grad_norm": 0.2143232822418213, + "learning_rate": 6.624060721572372e-06, + "loss": 0.0672, + "num_input_tokens_seen": 171871040, + "step": 141250 + }, + { + "epoch": 15.731707317073171, + "grad_norm": 0.005972791463136673, + "learning_rate": 6.622413382047371e-06, + "loss": 0.0002, + "num_input_tokens_seen": 171877536, + "step": 141255 + }, + { + "epoch": 15.732264171956787, + "grad_norm": 0.5264750123023987, + "learning_rate": 6.6207662161133996e-06, + "loss": 0.0164, + "num_input_tokens_seen": 171883840, + "step": 141260 + }, + { + "epoch": 15.732821026840405, + "grad_norm": 0.0528896190226078, + "learning_rate": 6.6191192237860074e-06, + "loss": 0.0554, + "num_input_tokens_seen": 171889920, + "step": 141265 + }, + { + "epoch": 15.733377881724023, + "grad_norm": 1.1424980163574219, + "learning_rate": 6.617472405080768e-06, + "loss": 0.0243, + "num_input_tokens_seen": 171896288, + "step": 141270 + }, + { + "epoch": 15.73393473660764, + "grad_norm": 0.2814401686191559, + "learning_rate": 6.615825760013223e-06, + "loss": 0.0576, + "num_input_tokens_seen": 171902400, + "step": 141275 + }, + { + "epoch": 15.734491591491258, + "grad_norm": 0.2635363042354584, + "learning_rate": 6.614179288598948e-06, + "loss": 0.0075, + "num_input_tokens_seen": 171908384, + "step": 141280 + }, + { + "epoch": 15.735048446374874, + "grad_norm": 0.38538604974746704, + "learning_rate": 6.612532990853465e-06, + "loss": 0.0165, + "num_input_tokens_seen": 171914176, + "step": 141285 + }, + { + "epoch": 15.735605301258492, + "grad_norm": 0.000428813073085621, + "learning_rate": 6.610886866792346e-06, + "loss": 0.0131, + "num_input_tokens_seen": 171920128, + "step": 141290 + }, + { + "epoch": 15.73616215614211, + "grad_norm": 0.07167880237102509, + "learning_rate": 6.609240916431128e-06, + "loss": 0.0069, + "num_input_tokens_seen": 171926560, + "step": 141295 + }, + { + "epoch": 15.736719011025727, + "grad_norm": 0.5367389917373657, + "learning_rate": 6.607595139785372e-06, + "loss": 0.0294, + "num_input_tokens_seen": 171932512, + "step": 141300 + }, + { + "epoch": 15.737275865909345, + "grad_norm": 2.818392753601074, + "learning_rate": 6.605949536870612e-06, + "loss": 0.0654, + "num_input_tokens_seen": 171938752, + "step": 141305 + }, + { + "epoch": 15.737832720792962, + "grad_norm": 0.0001952455349965021, + "learning_rate": 6.6043041077024e-06, + "loss": 0.0126, + "num_input_tokens_seen": 171944992, + "step": 141310 + }, + { + "epoch": 15.738389575676578, + "grad_norm": 1.114911675453186, + "learning_rate": 6.602658852296265e-06, + "loss": 0.0344, + "num_input_tokens_seen": 171950880, + "step": 141315 + }, + { + "epoch": 15.738946430560196, + "grad_norm": 1.810381531715393, + "learning_rate": 6.601013770667763e-06, + "loss": 0.0383, + "num_input_tokens_seen": 171956992, + "step": 141320 + }, + { + "epoch": 15.739503285443813, + "grad_norm": 0.10812768340110779, + "learning_rate": 6.599368862832428e-06, + "loss": 0.0484, + "num_input_tokens_seen": 171962944, + "step": 141325 + }, + { + "epoch": 15.740060140327431, + "grad_norm": 0.001224695472046733, + "learning_rate": 6.597724128805796e-06, + "loss": 0.0059, + "num_input_tokens_seen": 171968032, + "step": 141330 + }, + { + "epoch": 15.740616995211049, + "grad_norm": 0.004281435161828995, + "learning_rate": 6.596079568603395e-06, + "loss": 0.0006, + "num_input_tokens_seen": 171974080, + "step": 141335 + }, + { + "epoch": 15.741173850094665, + "grad_norm": 0.34553879499435425, + "learning_rate": 6.594435182240777e-06, + "loss": 0.0358, + "num_input_tokens_seen": 171980512, + "step": 141340 + }, + { + "epoch": 15.741730704978282, + "grad_norm": 0.5695245265960693, + "learning_rate": 6.592790969733456e-06, + "loss": 0.0248, + "num_input_tokens_seen": 171986784, + "step": 141345 + }, + { + "epoch": 15.7422875598619, + "grad_norm": 0.02989812195301056, + "learning_rate": 6.591146931096978e-06, + "loss": 0.1739, + "num_input_tokens_seen": 171992256, + "step": 141350 + }, + { + "epoch": 15.742844414745518, + "grad_norm": 0.0025478953029960394, + "learning_rate": 6.589503066346869e-06, + "loss": 0.0017, + "num_input_tokens_seen": 171998784, + "step": 141355 + }, + { + "epoch": 15.743401269629135, + "grad_norm": 0.1480880230665207, + "learning_rate": 6.587859375498653e-06, + "loss": 0.0591, + "num_input_tokens_seen": 172004800, + "step": 141360 + }, + { + "epoch": 15.743958124512751, + "grad_norm": 0.3146522045135498, + "learning_rate": 6.586215858567849e-06, + "loss": 0.061, + "num_input_tokens_seen": 172010944, + "step": 141365 + }, + { + "epoch": 15.744514979396369, + "grad_norm": 0.6326841711997986, + "learning_rate": 6.584572515569998e-06, + "loss": 0.0256, + "num_input_tokens_seen": 172016928, + "step": 141370 + }, + { + "epoch": 15.745071834279987, + "grad_norm": 0.010907550342381, + "learning_rate": 6.582929346520611e-06, + "loss": 0.0024, + "num_input_tokens_seen": 172023072, + "step": 141375 + }, + { + "epoch": 15.745628689163604, + "grad_norm": 1.6489460468292236, + "learning_rate": 6.581286351435215e-06, + "loss": 0.104, + "num_input_tokens_seen": 172029120, + "step": 141380 + }, + { + "epoch": 15.746185544047222, + "grad_norm": 0.00012191386485937983, + "learning_rate": 6.579643530329316e-06, + "loss": 0.0753, + "num_input_tokens_seen": 172035488, + "step": 141385 + }, + { + "epoch": 15.746742398930838, + "grad_norm": 0.019984547048807144, + "learning_rate": 6.578000883218449e-06, + "loss": 0.1212, + "num_input_tokens_seen": 172041664, + "step": 141390 + }, + { + "epoch": 15.747299253814456, + "grad_norm": 0.06135771796107292, + "learning_rate": 6.5763584101181195e-06, + "loss": 0.0504, + "num_input_tokens_seen": 172047936, + "step": 141395 + }, + { + "epoch": 15.747856108698073, + "grad_norm": 0.07154002040624619, + "learning_rate": 6.574716111043857e-06, + "loss": 0.0296, + "num_input_tokens_seen": 172053984, + "step": 141400 + }, + { + "epoch": 15.748412963581691, + "grad_norm": 0.5109158754348755, + "learning_rate": 6.57307398601115e-06, + "loss": 0.0215, + "num_input_tokens_seen": 172060032, + "step": 141405 + }, + { + "epoch": 15.748969818465309, + "grad_norm": 1.134468913078308, + "learning_rate": 6.571432035035527e-06, + "loss": 0.0381, + "num_input_tokens_seen": 172065760, + "step": 141410 + }, + { + "epoch": 15.749526673348925, + "grad_norm": 0.0005865836283192039, + "learning_rate": 6.569790258132488e-06, + "loss": 0.0044, + "num_input_tokens_seen": 172072000, + "step": 141415 + }, + { + "epoch": 15.750083528232542, + "grad_norm": 0.034605491906404495, + "learning_rate": 6.568148655317555e-06, + "loss": 0.0389, + "num_input_tokens_seen": 172078080, + "step": 141420 + }, + { + "epoch": 15.75064038311616, + "grad_norm": 0.3274131417274475, + "learning_rate": 6.566507226606222e-06, + "loss": 0.044, + "num_input_tokens_seen": 172084256, + "step": 141425 + }, + { + "epoch": 15.751197237999778, + "grad_norm": 0.10325666517019272, + "learning_rate": 6.564865972014e-06, + "loss": 0.0027, + "num_input_tokens_seen": 172090336, + "step": 141430 + }, + { + "epoch": 15.751754092883395, + "grad_norm": 0.4011023938655853, + "learning_rate": 6.56322489155638e-06, + "loss": 0.1232, + "num_input_tokens_seen": 172095680, + "step": 141435 + }, + { + "epoch": 15.752310947767011, + "grad_norm": 2.549748420715332, + "learning_rate": 6.561583985248878e-06, + "loss": 0.1899, + "num_input_tokens_seen": 172101856, + "step": 141440 + }, + { + "epoch": 15.752867802650629, + "grad_norm": 1.0876524448394775, + "learning_rate": 6.55994325310699e-06, + "loss": 0.1117, + "num_input_tokens_seen": 172107776, + "step": 141445 + }, + { + "epoch": 15.753424657534246, + "grad_norm": 0.2992497980594635, + "learning_rate": 6.558302695146212e-06, + "loss": 0.0284, + "num_input_tokens_seen": 172113344, + "step": 141450 + }, + { + "epoch": 15.753981512417864, + "grad_norm": 0.008065157569944859, + "learning_rate": 6.5566623113820335e-06, + "loss": 0.0073, + "num_input_tokens_seen": 172119264, + "step": 141455 + }, + { + "epoch": 15.754538367301482, + "grad_norm": 0.001306872465647757, + "learning_rate": 6.5550221018299605e-06, + "loss": 0.0775, + "num_input_tokens_seen": 172125472, + "step": 141460 + }, + { + "epoch": 15.755095222185098, + "grad_norm": 0.0014796714531257749, + "learning_rate": 6.553382066505476e-06, + "loss": 0.0143, + "num_input_tokens_seen": 172131488, + "step": 141465 + }, + { + "epoch": 15.755652077068715, + "grad_norm": 0.055652961134910583, + "learning_rate": 6.551742205424094e-06, + "loss": 0.0312, + "num_input_tokens_seen": 172137728, + "step": 141470 + }, + { + "epoch": 15.756208931952333, + "grad_norm": 0.0009655067115090787, + "learning_rate": 6.55010251860127e-06, + "loss": 0.0092, + "num_input_tokens_seen": 172144064, + "step": 141475 + }, + { + "epoch": 15.75676578683595, + "grad_norm": 1.175012230873108, + "learning_rate": 6.548463006052516e-06, + "loss": 0.0252, + "num_input_tokens_seen": 172150336, + "step": 141480 + }, + { + "epoch": 15.757322641719568, + "grad_norm": 0.89998459815979, + "learning_rate": 6.546823667793306e-06, + "loss": 0.1541, + "num_input_tokens_seen": 172155936, + "step": 141485 + }, + { + "epoch": 15.757879496603184, + "grad_norm": 0.9211565256118774, + "learning_rate": 6.5451845038391384e-06, + "loss": 0.0173, + "num_input_tokens_seen": 172162144, + "step": 141490 + }, + { + "epoch": 15.758436351486802, + "grad_norm": 0.01689228042960167, + "learning_rate": 6.54354551420549e-06, + "loss": 0.0169, + "num_input_tokens_seen": 172168288, + "step": 141495 + }, + { + "epoch": 15.75899320637042, + "grad_norm": 0.15740084648132324, + "learning_rate": 6.54190669890784e-06, + "loss": 0.0233, + "num_input_tokens_seen": 172173856, + "step": 141500 + }, + { + "epoch": 15.759550061254037, + "grad_norm": 0.008826547302305698, + "learning_rate": 6.540268057961662e-06, + "loss": 0.0122, + "num_input_tokens_seen": 172179904, + "step": 141505 + }, + { + "epoch": 15.760106916137655, + "grad_norm": 0.2572039067745209, + "learning_rate": 6.5386295913824505e-06, + "loss": 0.0058, + "num_input_tokens_seen": 172186240, + "step": 141510 + }, + { + "epoch": 15.760663771021271, + "grad_norm": 0.012413350865244865, + "learning_rate": 6.5369912991856715e-06, + "loss": 0.0491, + "num_input_tokens_seen": 172192608, + "step": 141515 + }, + { + "epoch": 15.761220625904889, + "grad_norm": 0.0005195965641178191, + "learning_rate": 6.535353181386802e-06, + "loss": 0.0186, + "num_input_tokens_seen": 172198752, + "step": 141520 + }, + { + "epoch": 15.761777480788506, + "grad_norm": 0.18754296004772186, + "learning_rate": 6.533715238001317e-06, + "loss": 0.0652, + "num_input_tokens_seen": 172204832, + "step": 141525 + }, + { + "epoch": 15.762334335672124, + "grad_norm": 0.13679172098636627, + "learning_rate": 6.5320774690446785e-06, + "loss": 0.0469, + "num_input_tokens_seen": 172211232, + "step": 141530 + }, + { + "epoch": 15.762891190555742, + "grad_norm": 0.1807628720998764, + "learning_rate": 6.5304398745323735e-06, + "loss": 0.0084, + "num_input_tokens_seen": 172217088, + "step": 141535 + }, + { + "epoch": 15.76344804543936, + "grad_norm": 0.627916693687439, + "learning_rate": 6.52880245447986e-06, + "loss": 0.0229, + "num_input_tokens_seen": 172223136, + "step": 141540 + }, + { + "epoch": 15.764004900322975, + "grad_norm": 0.1189325675368309, + "learning_rate": 6.527165208902605e-06, + "loss": 0.0809, + "num_input_tokens_seen": 172229184, + "step": 141545 + }, + { + "epoch": 15.764561755206593, + "grad_norm": 0.0010975704062730074, + "learning_rate": 6.52552813781607e-06, + "loss": 0.0125, + "num_input_tokens_seen": 172235520, + "step": 141550 + }, + { + "epoch": 15.76511861009021, + "grad_norm": 0.036753468215465546, + "learning_rate": 6.523891241235727e-06, + "loss": 0.0555, + "num_input_tokens_seen": 172241472, + "step": 141555 + }, + { + "epoch": 15.765675464973828, + "grad_norm": 0.5069890022277832, + "learning_rate": 6.522254519177029e-06, + "loss": 0.0356, + "num_input_tokens_seen": 172247328, + "step": 141560 + }, + { + "epoch": 15.766232319857446, + "grad_norm": 0.7253314256668091, + "learning_rate": 6.5206179716554484e-06, + "loss": 0.0394, + "num_input_tokens_seen": 172253504, + "step": 141565 + }, + { + "epoch": 15.766789174741062, + "grad_norm": 0.003715843427926302, + "learning_rate": 6.518981598686436e-06, + "loss": 0.0204, + "num_input_tokens_seen": 172260032, + "step": 141570 + }, + { + "epoch": 15.76734602962468, + "grad_norm": 0.4689863622188568, + "learning_rate": 6.517345400285452e-06, + "loss": 0.0039, + "num_input_tokens_seen": 172266368, + "step": 141575 + }, + { + "epoch": 15.767902884508297, + "grad_norm": 0.06958433240652084, + "learning_rate": 6.515709376467938e-06, + "loss": 0.0199, + "num_input_tokens_seen": 172272320, + "step": 141580 + }, + { + "epoch": 15.768459739391915, + "grad_norm": 0.0010079111671075225, + "learning_rate": 6.514073527249368e-06, + "loss": 0.0082, + "num_input_tokens_seen": 172278400, + "step": 141585 + }, + { + "epoch": 15.769016594275532, + "grad_norm": 0.45050913095474243, + "learning_rate": 6.512437852645181e-06, + "loss": 0.0733, + "num_input_tokens_seen": 172284512, + "step": 141590 + }, + { + "epoch": 15.769573449159148, + "grad_norm": 1.4810669422149658, + "learning_rate": 6.510802352670834e-06, + "loss": 0.0753, + "num_input_tokens_seen": 172291136, + "step": 141595 + }, + { + "epoch": 15.770130304042766, + "grad_norm": 0.75260329246521, + "learning_rate": 6.509167027341762e-06, + "loss": 0.0223, + "num_input_tokens_seen": 172296672, + "step": 141600 + }, + { + "epoch": 15.770687158926384, + "grad_norm": 0.06504041701555252, + "learning_rate": 6.507531876673431e-06, + "loss": 0.0081, + "num_input_tokens_seen": 172303008, + "step": 141605 + }, + { + "epoch": 15.771244013810001, + "grad_norm": 0.0037020074669271708, + "learning_rate": 6.505896900681269e-06, + "loss": 0.0305, + "num_input_tokens_seen": 172309024, + "step": 141610 + }, + { + "epoch": 15.771800868693619, + "grad_norm": 1.134451150894165, + "learning_rate": 6.5042620993807426e-06, + "loss": 0.077, + "num_input_tokens_seen": 172315136, + "step": 141615 + }, + { + "epoch": 15.772357723577235, + "grad_norm": 0.028475027531385422, + "learning_rate": 6.5026274727872645e-06, + "loss": 0.0591, + "num_input_tokens_seen": 172321024, + "step": 141620 + }, + { + "epoch": 15.772914578460853, + "grad_norm": 0.21540194749832153, + "learning_rate": 6.500993020916299e-06, + "loss": 0.0057, + "num_input_tokens_seen": 172327296, + "step": 141625 + }, + { + "epoch": 15.77347143334447, + "grad_norm": 0.037310514599084854, + "learning_rate": 6.499358743783266e-06, + "loss": 0.0174, + "num_input_tokens_seen": 172333408, + "step": 141630 + }, + { + "epoch": 15.774028288228088, + "grad_norm": 0.9940747618675232, + "learning_rate": 6.497724641403622e-06, + "loss": 0.0141, + "num_input_tokens_seen": 172339488, + "step": 141635 + }, + { + "epoch": 15.774585143111706, + "grad_norm": 0.0015163170173764229, + "learning_rate": 6.496090713792791e-06, + "loss": 0.0049, + "num_input_tokens_seen": 172345696, + "step": 141640 + }, + { + "epoch": 15.775141997995323, + "grad_norm": 1.6749404668807983, + "learning_rate": 6.494456960966205e-06, + "loss": 0.1088, + "num_input_tokens_seen": 172351744, + "step": 141645 + }, + { + "epoch": 15.77569885287894, + "grad_norm": 0.201043963432312, + "learning_rate": 6.492823382939298e-06, + "loss": 0.1263, + "num_input_tokens_seen": 172358080, + "step": 141650 + }, + { + "epoch": 15.776255707762557, + "grad_norm": 0.01378658227622509, + "learning_rate": 6.491189979727505e-06, + "loss": 0.1004, + "num_input_tokens_seen": 172363680, + "step": 141655 + }, + { + "epoch": 15.776812562646175, + "grad_norm": 4.110743999481201, + "learning_rate": 6.489556751346254e-06, + "loss": 0.0444, + "num_input_tokens_seen": 172369728, + "step": 141660 + }, + { + "epoch": 15.777369417529792, + "grad_norm": 3.323000192642212, + "learning_rate": 6.487923697810969e-06, + "loss": 0.2336, + "num_input_tokens_seen": 172375168, + "step": 141665 + }, + { + "epoch": 15.77792627241341, + "grad_norm": 0.002632371848449111, + "learning_rate": 6.486290819137067e-06, + "loss": 0.008, + "num_input_tokens_seen": 172381344, + "step": 141670 + }, + { + "epoch": 15.778483127297026, + "grad_norm": 0.13010083138942719, + "learning_rate": 6.484658115339992e-06, + "loss": 0.0092, + "num_input_tokens_seen": 172387520, + "step": 141675 + }, + { + "epoch": 15.779039982180644, + "grad_norm": 0.006589197553694248, + "learning_rate": 6.483025586435146e-06, + "loss": 0.0045, + "num_input_tokens_seen": 172393952, + "step": 141680 + }, + { + "epoch": 15.779596837064261, + "grad_norm": 1.3732250928878784, + "learning_rate": 6.481393232437974e-06, + "loss": 0.0404, + "num_input_tokens_seen": 172400032, + "step": 141685 + }, + { + "epoch": 15.780153691947879, + "grad_norm": 0.5854051113128662, + "learning_rate": 6.4797610533638644e-06, + "loss": 0.0837, + "num_input_tokens_seen": 172405856, + "step": 141690 + }, + { + "epoch": 15.780710546831497, + "grad_norm": 0.708454966545105, + "learning_rate": 6.478129049228257e-06, + "loss": 0.0148, + "num_input_tokens_seen": 172412192, + "step": 141695 + }, + { + "epoch": 15.781267401715112, + "grad_norm": 0.8842538595199585, + "learning_rate": 6.476497220046554e-06, + "loss": 0.0115, + "num_input_tokens_seen": 172418400, + "step": 141700 + }, + { + "epoch": 15.78182425659873, + "grad_norm": 0.527580976486206, + "learning_rate": 6.474865565834184e-06, + "loss": 0.0087, + "num_input_tokens_seen": 172424480, + "step": 141705 + }, + { + "epoch": 15.782381111482348, + "grad_norm": 0.012476812116801739, + "learning_rate": 6.47323408660655e-06, + "loss": 0.0303, + "num_input_tokens_seen": 172430656, + "step": 141710 + }, + { + "epoch": 15.782937966365965, + "grad_norm": 0.3065296709537506, + "learning_rate": 6.47160278237906e-06, + "loss": 0.0073, + "num_input_tokens_seen": 172436928, + "step": 141715 + }, + { + "epoch": 15.783494821249583, + "grad_norm": 0.1296452432870865, + "learning_rate": 6.4699716531671224e-06, + "loss": 0.0132, + "num_input_tokens_seen": 172442720, + "step": 141720 + }, + { + "epoch": 15.784051676133199, + "grad_norm": 0.0005217664293013513, + "learning_rate": 6.468340698986156e-06, + "loss": 0.0153, + "num_input_tokens_seen": 172448768, + "step": 141725 + }, + { + "epoch": 15.784608531016817, + "grad_norm": 0.06946127861738205, + "learning_rate": 6.46670991985156e-06, + "loss": 0.0045, + "num_input_tokens_seen": 172455296, + "step": 141730 + }, + { + "epoch": 15.785165385900434, + "grad_norm": 0.0001662466675043106, + "learning_rate": 6.465079315778736e-06, + "loss": 0.0874, + "num_input_tokens_seen": 172461504, + "step": 141735 + }, + { + "epoch": 15.785722240784052, + "grad_norm": 0.44886523485183716, + "learning_rate": 6.46344888678308e-06, + "loss": 0.0119, + "num_input_tokens_seen": 172467520, + "step": 141740 + }, + { + "epoch": 15.78627909566767, + "grad_norm": 0.8301903605461121, + "learning_rate": 6.461818632880007e-06, + "loss": 0.0521, + "num_input_tokens_seen": 172473856, + "step": 141745 + }, + { + "epoch": 15.786835950551286, + "grad_norm": 0.02198905684053898, + "learning_rate": 6.460188554084903e-06, + "loss": 0.0075, + "num_input_tokens_seen": 172480160, + "step": 141750 + }, + { + "epoch": 15.787392805434903, + "grad_norm": 0.005783408414572477, + "learning_rate": 6.458558650413179e-06, + "loss": 0.0173, + "num_input_tokens_seen": 172486304, + "step": 141755 + }, + { + "epoch": 15.787949660318521, + "grad_norm": 0.00017521007976029068, + "learning_rate": 6.456928921880226e-06, + "loss": 0.0615, + "num_input_tokens_seen": 172492480, + "step": 141760 + }, + { + "epoch": 15.788506515202139, + "grad_norm": 0.0006062561878934503, + "learning_rate": 6.455299368501433e-06, + "loss": 0.0512, + "num_input_tokens_seen": 172498656, + "step": 141765 + }, + { + "epoch": 15.789063370085756, + "grad_norm": 0.19740715622901917, + "learning_rate": 6.453669990292189e-06, + "loss": 0.0915, + "num_input_tokens_seen": 172504704, + "step": 141770 + }, + { + "epoch": 15.789620224969372, + "grad_norm": 0.6812277436256409, + "learning_rate": 6.452040787267899e-06, + "loss": 0.0189, + "num_input_tokens_seen": 172510848, + "step": 141775 + }, + { + "epoch": 15.79017707985299, + "grad_norm": 0.16981254518032074, + "learning_rate": 6.4504117594439445e-06, + "loss": 0.004, + "num_input_tokens_seen": 172517120, + "step": 141780 + }, + { + "epoch": 15.790733934736608, + "grad_norm": 0.00015837240789551288, + "learning_rate": 6.448782906835709e-06, + "loss": 0.106, + "num_input_tokens_seen": 172522976, + "step": 141785 + }, + { + "epoch": 15.791290789620225, + "grad_norm": 0.00034912509727291763, + "learning_rate": 6.44715422945858e-06, + "loss": 0.024, + "num_input_tokens_seen": 172529088, + "step": 141790 + }, + { + "epoch": 15.791847644503843, + "grad_norm": 0.22235043346881866, + "learning_rate": 6.445525727327948e-06, + "loss": 0.0073, + "num_input_tokens_seen": 172535232, + "step": 141795 + }, + { + "epoch": 15.792404499387459, + "grad_norm": 0.0003104920906480402, + "learning_rate": 6.443897400459184e-06, + "loss": 0.0159, + "num_input_tokens_seen": 172541248, + "step": 141800 + }, + { + "epoch": 15.792961354271077, + "grad_norm": 1.2584664821624756, + "learning_rate": 6.442269248867688e-06, + "loss": 0.1097, + "num_input_tokens_seen": 172547488, + "step": 141805 + }, + { + "epoch": 15.793518209154694, + "grad_norm": 8.645549678476527e-05, + "learning_rate": 6.440641272568818e-06, + "loss": 0.0621, + "num_input_tokens_seen": 172553824, + "step": 141810 + }, + { + "epoch": 15.794075064038312, + "grad_norm": 0.031235788017511368, + "learning_rate": 6.439013471577965e-06, + "loss": 0.1516, + "num_input_tokens_seen": 172560064, + "step": 141815 + }, + { + "epoch": 15.79463191892193, + "grad_norm": 0.002451415406540036, + "learning_rate": 6.437385845910493e-06, + "loss": 0.0081, + "num_input_tokens_seen": 172566272, + "step": 141820 + }, + { + "epoch": 15.795188773805545, + "grad_norm": 0.007578485645353794, + "learning_rate": 6.43575839558179e-06, + "loss": 0.0197, + "num_input_tokens_seen": 172572320, + "step": 141825 + }, + { + "epoch": 15.795745628689163, + "grad_norm": 0.00032360194018110633, + "learning_rate": 6.434131120607223e-06, + "loss": 0.0017, + "num_input_tokens_seen": 172577760, + "step": 141830 + }, + { + "epoch": 15.79630248357278, + "grad_norm": 0.07776622474193573, + "learning_rate": 6.432504021002164e-06, + "loss": 0.1116, + "num_input_tokens_seen": 172583936, + "step": 141835 + }, + { + "epoch": 15.796859338456398, + "grad_norm": 0.13351628184318542, + "learning_rate": 6.430877096781973e-06, + "loss": 0.0032, + "num_input_tokens_seen": 172590016, + "step": 141840 + }, + { + "epoch": 15.797416193340016, + "grad_norm": 1.4664726257324219, + "learning_rate": 6.429250347962032e-06, + "loss": 0.0948, + "num_input_tokens_seen": 172596320, + "step": 141845 + }, + { + "epoch": 15.797973048223632, + "grad_norm": 1.2446601390838623, + "learning_rate": 6.427623774557698e-06, + "loss": 0.0809, + "num_input_tokens_seen": 172602400, + "step": 141850 + }, + { + "epoch": 15.79852990310725, + "grad_norm": 0.002919760998338461, + "learning_rate": 6.4259973765843415e-06, + "loss": 0.006, + "num_input_tokens_seen": 172608864, + "step": 141855 + }, + { + "epoch": 15.799086757990867, + "grad_norm": 0.7716296315193176, + "learning_rate": 6.4243711540573094e-06, + "loss": 0.0276, + "num_input_tokens_seen": 172615264, + "step": 141860 + }, + { + "epoch": 15.799643612874485, + "grad_norm": 0.0071389335207641125, + "learning_rate": 6.422745106991984e-06, + "loss": 0.0136, + "num_input_tokens_seen": 172621184, + "step": 141865 + }, + { + "epoch": 15.800200467758103, + "grad_norm": 0.00039558683056384325, + "learning_rate": 6.421119235403708e-06, + "loss": 0.0028, + "num_input_tokens_seen": 172627488, + "step": 141870 + }, + { + "epoch": 15.80075732264172, + "grad_norm": 0.20926205813884735, + "learning_rate": 6.4194935393078606e-06, + "loss": 0.0094, + "num_input_tokens_seen": 172634144, + "step": 141875 + }, + { + "epoch": 15.801314177525336, + "grad_norm": 0.02120181731879711, + "learning_rate": 6.417868018719767e-06, + "loss": 0.0303, + "num_input_tokens_seen": 172640224, + "step": 141880 + }, + { + "epoch": 15.801871032408954, + "grad_norm": 0.00024724515969865024, + "learning_rate": 6.416242673654807e-06, + "loss": 0.0471, + "num_input_tokens_seen": 172646144, + "step": 141885 + }, + { + "epoch": 15.802427887292572, + "grad_norm": 0.0018411739729344845, + "learning_rate": 6.414617504128315e-06, + "loss": 0.004, + "num_input_tokens_seen": 172652224, + "step": 141890 + }, + { + "epoch": 15.80298474217619, + "grad_norm": 0.45662468671798706, + "learning_rate": 6.412992510155658e-06, + "loss": 0.009, + "num_input_tokens_seen": 172658272, + "step": 141895 + }, + { + "epoch": 15.803541597059807, + "grad_norm": 0.01346607506275177, + "learning_rate": 6.41136769175218e-06, + "loss": 0.0603, + "num_input_tokens_seen": 172664544, + "step": 141900 + }, + { + "epoch": 15.804098451943423, + "grad_norm": 0.5553565621376038, + "learning_rate": 6.4097430489332254e-06, + "loss": 0.019, + "num_input_tokens_seen": 172670656, + "step": 141905 + }, + { + "epoch": 15.80465530682704, + "grad_norm": 0.22673961520195007, + "learning_rate": 6.408118581714137e-06, + "loss": 0.0127, + "num_input_tokens_seen": 172676800, + "step": 141910 + }, + { + "epoch": 15.805212161710658, + "grad_norm": 0.0003517312288749963, + "learning_rate": 6.406494290110271e-06, + "loss": 0.0492, + "num_input_tokens_seen": 172683232, + "step": 141915 + }, + { + "epoch": 15.805769016594276, + "grad_norm": 0.0005669615929946303, + "learning_rate": 6.404870174136962e-06, + "loss": 0.0261, + "num_input_tokens_seen": 172689248, + "step": 141920 + }, + { + "epoch": 15.806325871477894, + "grad_norm": 0.0013317075790837407, + "learning_rate": 6.403246233809551e-06, + "loss": 0.0088, + "num_input_tokens_seen": 172695712, + "step": 141925 + }, + { + "epoch": 15.80688272636151, + "grad_norm": 0.0001602344127604738, + "learning_rate": 6.401622469143381e-06, + "loss": 0.0164, + "num_input_tokens_seen": 172701312, + "step": 141930 + }, + { + "epoch": 15.807439581245127, + "grad_norm": 0.6350103616714478, + "learning_rate": 6.399998880153782e-06, + "loss": 0.0225, + "num_input_tokens_seen": 172707328, + "step": 141935 + }, + { + "epoch": 15.807996436128745, + "grad_norm": 0.003256375901401043, + "learning_rate": 6.398375466856099e-06, + "loss": 0.0283, + "num_input_tokens_seen": 172713824, + "step": 141940 + }, + { + "epoch": 15.808553291012363, + "grad_norm": 0.5497205853462219, + "learning_rate": 6.396752229265665e-06, + "loss": 0.0978, + "num_input_tokens_seen": 172720096, + "step": 141945 + }, + { + "epoch": 15.80911014589598, + "grad_norm": 0.017522579059004784, + "learning_rate": 6.395129167397812e-06, + "loss": 0.0871, + "num_input_tokens_seen": 172726368, + "step": 141950 + }, + { + "epoch": 15.809667000779596, + "grad_norm": 0.08201701939105988, + "learning_rate": 6.393506281267861e-06, + "loss": 0.0968, + "num_input_tokens_seen": 172732032, + "step": 141955 + }, + { + "epoch": 15.810223855663214, + "grad_norm": 0.0010071106953546405, + "learning_rate": 6.3918835708911575e-06, + "loss": 0.0115, + "num_input_tokens_seen": 172738336, + "step": 141960 + }, + { + "epoch": 15.810780710546831, + "grad_norm": 0.05467378348112106, + "learning_rate": 6.390261036283016e-06, + "loss": 0.0685, + "num_input_tokens_seen": 172744288, + "step": 141965 + }, + { + "epoch": 15.811337565430449, + "grad_norm": 0.004463959485292435, + "learning_rate": 6.388638677458775e-06, + "loss": 0.0063, + "num_input_tokens_seen": 172750304, + "step": 141970 + }, + { + "epoch": 15.811894420314067, + "grad_norm": 0.23952646553516388, + "learning_rate": 6.387016494433754e-06, + "loss": 0.022, + "num_input_tokens_seen": 172756448, + "step": 141975 + }, + { + "epoch": 15.812451275197683, + "grad_norm": 0.9946975111961365, + "learning_rate": 6.385394487223276e-06, + "loss": 0.0436, + "num_input_tokens_seen": 172761856, + "step": 141980 + }, + { + "epoch": 15.8130081300813, + "grad_norm": 0.022669458761811256, + "learning_rate": 6.3837726558426514e-06, + "loss": 0.0117, + "num_input_tokens_seen": 172768448, + "step": 141985 + }, + { + "epoch": 15.813564984964918, + "grad_norm": 0.07455816864967346, + "learning_rate": 6.382151000307215e-06, + "loss": 0.0312, + "num_input_tokens_seen": 172774656, + "step": 141990 + }, + { + "epoch": 15.814121839848536, + "grad_norm": 0.1340796947479248, + "learning_rate": 6.3805295206322835e-06, + "loss": 0.0235, + "num_input_tokens_seen": 172780704, + "step": 141995 + }, + { + "epoch": 15.814678694732153, + "grad_norm": 0.35221391916275024, + "learning_rate": 6.3789082168331655e-06, + "loss": 0.0291, + "num_input_tokens_seen": 172787040, + "step": 142000 + }, + { + "epoch": 15.815235549615771, + "grad_norm": 0.020946547389030457, + "learning_rate": 6.377287088925171e-06, + "loss": 0.0164, + "num_input_tokens_seen": 172792608, + "step": 142005 + }, + { + "epoch": 15.815792404499387, + "grad_norm": 0.00130240258295089, + "learning_rate": 6.375666136923627e-06, + "loss": 0.006, + "num_input_tokens_seen": 172798752, + "step": 142010 + }, + { + "epoch": 15.816349259383005, + "grad_norm": 0.17626264691352844, + "learning_rate": 6.374045360843831e-06, + "loss": 0.0387, + "num_input_tokens_seen": 172804768, + "step": 142015 + }, + { + "epoch": 15.816906114266622, + "grad_norm": 0.0003256468626204878, + "learning_rate": 6.372424760701115e-06, + "loss": 0.0018, + "num_input_tokens_seen": 172811008, + "step": 142020 + }, + { + "epoch": 15.81746296915024, + "grad_norm": 0.013319136574864388, + "learning_rate": 6.370804336510755e-06, + "loss": 0.0494, + "num_input_tokens_seen": 172817120, + "step": 142025 + }, + { + "epoch": 15.818019824033858, + "grad_norm": 0.0003041742602363229, + "learning_rate": 6.3691840882880825e-06, + "loss": 0.0047, + "num_input_tokens_seen": 172823392, + "step": 142030 + }, + { + "epoch": 15.818576678917474, + "grad_norm": 0.9516947865486145, + "learning_rate": 6.367564016048386e-06, + "loss": 0.1166, + "num_input_tokens_seen": 172829568, + "step": 142035 + }, + { + "epoch": 15.819133533801091, + "grad_norm": 1.5521372556686401, + "learning_rate": 6.36594411980698e-06, + "loss": 0.1577, + "num_input_tokens_seen": 172835136, + "step": 142040 + }, + { + "epoch": 15.819690388684709, + "grad_norm": 0.15241186320781708, + "learning_rate": 6.364324399579163e-06, + "loss": 0.0709, + "num_input_tokens_seen": 172841184, + "step": 142045 + }, + { + "epoch": 15.820247243568327, + "grad_norm": 0.9527701735496521, + "learning_rate": 6.3627048553802335e-06, + "loss": 0.1125, + "num_input_tokens_seen": 172847488, + "step": 142050 + }, + { + "epoch": 15.820804098451944, + "grad_norm": 0.6226734519004822, + "learning_rate": 6.36108548722548e-06, + "loss": 0.0114, + "num_input_tokens_seen": 172853728, + "step": 142055 + }, + { + "epoch": 15.82136095333556, + "grad_norm": 0.009158533997833729, + "learning_rate": 6.3594662951302145e-06, + "loss": 0.0052, + "num_input_tokens_seen": 172859040, + "step": 142060 + }, + { + "epoch": 15.821917808219178, + "grad_norm": 0.152910053730011, + "learning_rate": 6.357847279109727e-06, + "loss": 0.0041, + "num_input_tokens_seen": 172865376, + "step": 142065 + }, + { + "epoch": 15.822474663102795, + "grad_norm": 0.009491564705967903, + "learning_rate": 6.356228439179304e-06, + "loss": 0.0529, + "num_input_tokens_seen": 172871456, + "step": 142070 + }, + { + "epoch": 15.823031517986413, + "grad_norm": 0.32740649580955505, + "learning_rate": 6.3546097753542365e-06, + "loss": 0.0235, + "num_input_tokens_seen": 172877568, + "step": 142075 + }, + { + "epoch": 15.82358837287003, + "grad_norm": 0.001411194447427988, + "learning_rate": 6.352991287649824e-06, + "loss": 0.0099, + "num_input_tokens_seen": 172883648, + "step": 142080 + }, + { + "epoch": 15.824145227753647, + "grad_norm": 0.12193974107503891, + "learning_rate": 6.351372976081341e-06, + "loss": 0.0048, + "num_input_tokens_seen": 172890016, + "step": 142085 + }, + { + "epoch": 15.824702082637264, + "grad_norm": 0.05098731070756912, + "learning_rate": 6.349754840664096e-06, + "loss": 0.0022, + "num_input_tokens_seen": 172896416, + "step": 142090 + }, + { + "epoch": 15.825258937520882, + "grad_norm": 1.0431997776031494, + "learning_rate": 6.348136881413344e-06, + "loss": 0.0319, + "num_input_tokens_seen": 172902592, + "step": 142095 + }, + { + "epoch": 15.8258157924045, + "grad_norm": 0.3764148950576782, + "learning_rate": 6.346519098344389e-06, + "loss": 0.0372, + "num_input_tokens_seen": 172908000, + "step": 142100 + }, + { + "epoch": 15.826372647288117, + "grad_norm": 0.7596630454063416, + "learning_rate": 6.344901491472499e-06, + "loss": 0.0755, + "num_input_tokens_seen": 172914240, + "step": 142105 + }, + { + "epoch": 15.826929502171733, + "grad_norm": 0.002581941429525614, + "learning_rate": 6.3432840608129705e-06, + "loss": 0.04, + "num_input_tokens_seen": 172920224, + "step": 142110 + }, + { + "epoch": 15.827486357055351, + "grad_norm": 0.04870964586734772, + "learning_rate": 6.341666806381069e-06, + "loss": 0.0782, + "num_input_tokens_seen": 172926592, + "step": 142115 + }, + { + "epoch": 15.828043211938969, + "grad_norm": 0.053732775151729584, + "learning_rate": 6.340049728192077e-06, + "loss": 0.0082, + "num_input_tokens_seen": 172932608, + "step": 142120 + }, + { + "epoch": 15.828600066822586, + "grad_norm": 0.17903786897659302, + "learning_rate": 6.338432826261253e-06, + "loss": 0.0192, + "num_input_tokens_seen": 172938720, + "step": 142125 + }, + { + "epoch": 15.829156921706204, + "grad_norm": 0.5218816995620728, + "learning_rate": 6.3368161006038926e-06, + "loss": 0.1128, + "num_input_tokens_seen": 172945248, + "step": 142130 + }, + { + "epoch": 15.82971377658982, + "grad_norm": 0.003094837535172701, + "learning_rate": 6.335199551235257e-06, + "loss": 0.0019, + "num_input_tokens_seen": 172951040, + "step": 142135 + }, + { + "epoch": 15.830270631473438, + "grad_norm": 0.9559074640274048, + "learning_rate": 6.333583178170616e-06, + "loss": 0.009, + "num_input_tokens_seen": 172957280, + "step": 142140 + }, + { + "epoch": 15.830827486357055, + "grad_norm": 0.00014083321730140597, + "learning_rate": 6.3319669814252276e-06, + "loss": 0.0187, + "num_input_tokens_seen": 172963104, + "step": 142145 + }, + { + "epoch": 15.831384341240673, + "grad_norm": 0.6200023293495178, + "learning_rate": 6.330350961014375e-06, + "loss": 0.1423, + "num_input_tokens_seen": 172969440, + "step": 142150 + }, + { + "epoch": 15.83194119612429, + "grad_norm": 0.06860477477312088, + "learning_rate": 6.32873511695331e-06, + "loss": 0.0293, + "num_input_tokens_seen": 172975392, + "step": 142155 + }, + { + "epoch": 15.832498051007907, + "grad_norm": 0.7344257235527039, + "learning_rate": 6.327119449257307e-06, + "loss": 0.0153, + "num_input_tokens_seen": 172981440, + "step": 142160 + }, + { + "epoch": 15.833054905891524, + "grad_norm": 0.0040089720860123634, + "learning_rate": 6.3255039579416225e-06, + "loss": 0.0087, + "num_input_tokens_seen": 172987520, + "step": 142165 + }, + { + "epoch": 15.833611760775142, + "grad_norm": 1.5281647443771362, + "learning_rate": 6.323888643021514e-06, + "loss": 0.1069, + "num_input_tokens_seen": 172993600, + "step": 142170 + }, + { + "epoch": 15.83416861565876, + "grad_norm": 0.41405579447746277, + "learning_rate": 6.322273504512233e-06, + "loss": 0.0103, + "num_input_tokens_seen": 173000032, + "step": 142175 + }, + { + "epoch": 15.834725470542377, + "grad_norm": 0.0026821671053767204, + "learning_rate": 6.32065854242905e-06, + "loss": 0.0482, + "num_input_tokens_seen": 173006112, + "step": 142180 + }, + { + "epoch": 15.835282325425993, + "grad_norm": 0.4372912049293518, + "learning_rate": 6.319043756787215e-06, + "loss": 0.0163, + "num_input_tokens_seen": 173012256, + "step": 142185 + }, + { + "epoch": 15.83583918030961, + "grad_norm": 0.00016370631055906415, + "learning_rate": 6.317429147601978e-06, + "loss": 0.0786, + "num_input_tokens_seen": 173018560, + "step": 142190 + }, + { + "epoch": 15.836396035193228, + "grad_norm": 0.0001411404082318768, + "learning_rate": 6.315814714888582e-06, + "loss": 0.0537, + "num_input_tokens_seen": 173024608, + "step": 142195 + }, + { + "epoch": 15.836952890076846, + "grad_norm": 0.00012099729792680591, + "learning_rate": 6.314200458662292e-06, + "loss": 0.0123, + "num_input_tokens_seen": 173030816, + "step": 142200 + }, + { + "epoch": 15.837509744960464, + "grad_norm": 1.2656071186065674, + "learning_rate": 6.3125863789383455e-06, + "loss": 0.0301, + "num_input_tokens_seen": 173036832, + "step": 142205 + }, + { + "epoch": 15.83806659984408, + "grad_norm": 0.0001376995351165533, + "learning_rate": 6.310972475732005e-06, + "loss": 0.0049, + "num_input_tokens_seen": 173042592, + "step": 142210 + }, + { + "epoch": 15.838623454727697, + "grad_norm": 0.12589077651500702, + "learning_rate": 6.309358749058489e-06, + "loss": 0.0361, + "num_input_tokens_seen": 173048576, + "step": 142215 + }, + { + "epoch": 15.839180309611315, + "grad_norm": 0.00017143739387392998, + "learning_rate": 6.30774519893306e-06, + "loss": 0.0236, + "num_input_tokens_seen": 173054816, + "step": 142220 + }, + { + "epoch": 15.839737164494933, + "grad_norm": 0.03737572208046913, + "learning_rate": 6.306131825370948e-06, + "loss": 0.0323, + "num_input_tokens_seen": 173060640, + "step": 142225 + }, + { + "epoch": 15.84029401937855, + "grad_norm": 0.2326950579881668, + "learning_rate": 6.304518628387407e-06, + "loss": 0.0307, + "num_input_tokens_seen": 173066688, + "step": 142230 + }, + { + "epoch": 15.840850874262168, + "grad_norm": 0.22176772356033325, + "learning_rate": 6.302905607997664e-06, + "loss": 0.0129, + "num_input_tokens_seen": 173073120, + "step": 142235 + }, + { + "epoch": 15.841407729145784, + "grad_norm": 0.0009276912896893919, + "learning_rate": 6.301292764216957e-06, + "loss": 0.0563, + "num_input_tokens_seen": 173078816, + "step": 142240 + }, + { + "epoch": 15.841964584029402, + "grad_norm": 0.7719852328300476, + "learning_rate": 6.299680097060515e-06, + "loss": 0.0711, + "num_input_tokens_seen": 173084960, + "step": 142245 + }, + { + "epoch": 15.84252143891302, + "grad_norm": 3.5734739303588867, + "learning_rate": 6.298067606543584e-06, + "loss": 0.1611, + "num_input_tokens_seen": 173090688, + "step": 142250 + }, + { + "epoch": 15.843078293796637, + "grad_norm": 0.04413691535592079, + "learning_rate": 6.296455292681386e-06, + "loss": 0.0586, + "num_input_tokens_seen": 173096960, + "step": 142255 + }, + { + "epoch": 15.843635148680255, + "grad_norm": 0.269296258687973, + "learning_rate": 6.294843155489155e-06, + "loss": 0.0229, + "num_input_tokens_seen": 173103168, + "step": 142260 + }, + { + "epoch": 15.84419200356387, + "grad_norm": 0.000301039544865489, + "learning_rate": 6.293231194982111e-06, + "loss": 0.0955, + "num_input_tokens_seen": 173109184, + "step": 142265 + }, + { + "epoch": 15.844748858447488, + "grad_norm": 0.737765908241272, + "learning_rate": 6.291619411175489e-06, + "loss": 0.022, + "num_input_tokens_seen": 173115328, + "step": 142270 + }, + { + "epoch": 15.845305713331106, + "grad_norm": 0.024333685636520386, + "learning_rate": 6.290007804084505e-06, + "loss": 0.055, + "num_input_tokens_seen": 173121472, + "step": 142275 + }, + { + "epoch": 15.845862568214724, + "grad_norm": 0.26456356048583984, + "learning_rate": 6.288396373724403e-06, + "loss": 0.0192, + "num_input_tokens_seen": 173127328, + "step": 142280 + }, + { + "epoch": 15.846419423098341, + "grad_norm": 0.046660590916872025, + "learning_rate": 6.286785120110375e-06, + "loss": 0.0099, + "num_input_tokens_seen": 173133856, + "step": 142285 + }, + { + "epoch": 15.846976277981957, + "grad_norm": 0.08848965167999268, + "learning_rate": 6.28517404325766e-06, + "loss": 0.0128, + "num_input_tokens_seen": 173139936, + "step": 142290 + }, + { + "epoch": 15.847533132865575, + "grad_norm": 0.0025977541226893663, + "learning_rate": 6.283563143181464e-06, + "loss": 0.0239, + "num_input_tokens_seen": 173146080, + "step": 142295 + }, + { + "epoch": 15.848089987749193, + "grad_norm": 0.1025371253490448, + "learning_rate": 6.281952419897017e-06, + "loss": 0.0062, + "num_input_tokens_seen": 173152000, + "step": 142300 + }, + { + "epoch": 15.84864684263281, + "grad_norm": 0.00510072335600853, + "learning_rate": 6.280341873419523e-06, + "loss": 0.0318, + "num_input_tokens_seen": 173158208, + "step": 142305 + }, + { + "epoch": 15.849203697516428, + "grad_norm": 0.7454574704170227, + "learning_rate": 6.278731503764202e-06, + "loss": 0.0191, + "num_input_tokens_seen": 173163776, + "step": 142310 + }, + { + "epoch": 15.849760552400044, + "grad_norm": 0.000141287237056531, + "learning_rate": 6.277121310946252e-06, + "loss": 0.0019, + "num_input_tokens_seen": 173170272, + "step": 142315 + }, + { + "epoch": 15.850317407283661, + "grad_norm": 0.5953167080879211, + "learning_rate": 6.275511294980899e-06, + "loss": 0.0463, + "num_input_tokens_seen": 173176512, + "step": 142320 + }, + { + "epoch": 15.85087426216728, + "grad_norm": 0.30960309505462646, + "learning_rate": 6.273901455883344e-06, + "loss": 0.0097, + "num_input_tokens_seen": 173182720, + "step": 142325 + }, + { + "epoch": 15.851431117050897, + "grad_norm": 0.30988115072250366, + "learning_rate": 6.272291793668791e-06, + "loss": 0.0509, + "num_input_tokens_seen": 173188736, + "step": 142330 + }, + { + "epoch": 15.851987971934514, + "grad_norm": 0.06315526366233826, + "learning_rate": 6.270682308352441e-06, + "loss": 0.0093, + "num_input_tokens_seen": 173194848, + "step": 142335 + }, + { + "epoch": 15.85254482681813, + "grad_norm": 0.27607089281082153, + "learning_rate": 6.269072999949508e-06, + "loss": 0.1192, + "num_input_tokens_seen": 173200864, + "step": 142340 + }, + { + "epoch": 15.853101681701748, + "grad_norm": 0.5134167671203613, + "learning_rate": 6.26746386847519e-06, + "loss": 0.1757, + "num_input_tokens_seen": 173206720, + "step": 142345 + }, + { + "epoch": 15.853658536585366, + "grad_norm": 0.22447213530540466, + "learning_rate": 6.2658549139446745e-06, + "loss": 0.0971, + "num_input_tokens_seen": 173212736, + "step": 142350 + }, + { + "epoch": 15.854215391468983, + "grad_norm": 1.07337486743927, + "learning_rate": 6.264246136373184e-06, + "loss": 0.0893, + "num_input_tokens_seen": 173219008, + "step": 142355 + }, + { + "epoch": 15.854772246352601, + "grad_norm": 0.0008714208379387856, + "learning_rate": 6.262637535775887e-06, + "loss": 0.0998, + "num_input_tokens_seen": 173224800, + "step": 142360 + }, + { + "epoch": 15.855329101236219, + "grad_norm": 0.004320171661674976, + "learning_rate": 6.261029112167996e-06, + "loss": 0.0237, + "num_input_tokens_seen": 173230144, + "step": 142365 + }, + { + "epoch": 15.855885956119835, + "grad_norm": 1.9429969787597656, + "learning_rate": 6.259420865564691e-06, + "loss": 0.0687, + "num_input_tokens_seen": 173236032, + "step": 142370 + }, + { + "epoch": 15.856442811003452, + "grad_norm": 0.03338616341352463, + "learning_rate": 6.257812795981177e-06, + "loss": 0.1167, + "num_input_tokens_seen": 173241888, + "step": 142375 + }, + { + "epoch": 15.85699966588707, + "grad_norm": 0.4867826998233795, + "learning_rate": 6.256204903432639e-06, + "loss": 0.0863, + "num_input_tokens_seen": 173247488, + "step": 142380 + }, + { + "epoch": 15.857556520770688, + "grad_norm": 0.3868301808834076, + "learning_rate": 6.254597187934263e-06, + "loss": 0.0099, + "num_input_tokens_seen": 173253600, + "step": 142385 + }, + { + "epoch": 15.858113375654305, + "grad_norm": 0.11235887557268143, + "learning_rate": 6.252989649501226e-06, + "loss": 0.0037, + "num_input_tokens_seen": 173259488, + "step": 142390 + }, + { + "epoch": 15.858670230537921, + "grad_norm": 0.12299497425556183, + "learning_rate": 6.25138228814873e-06, + "loss": 0.0069, + "num_input_tokens_seen": 173265632, + "step": 142395 + }, + { + "epoch": 15.859227085421539, + "grad_norm": 0.00010951946023851633, + "learning_rate": 6.24977510389195e-06, + "loss": 0.0673, + "num_input_tokens_seen": 173272064, + "step": 142400 + }, + { + "epoch": 15.859783940305157, + "grad_norm": 0.0012111369287595153, + "learning_rate": 6.248168096746066e-06, + "loss": 0.0252, + "num_input_tokens_seen": 173278304, + "step": 142405 + }, + { + "epoch": 15.860340795188774, + "grad_norm": 0.0022199940867722034, + "learning_rate": 6.24656126672625e-06, + "loss": 0.0034, + "num_input_tokens_seen": 173284640, + "step": 142410 + }, + { + "epoch": 15.860897650072392, + "grad_norm": 0.3234733045101166, + "learning_rate": 6.244954613847698e-06, + "loss": 0.0145, + "num_input_tokens_seen": 173290944, + "step": 142415 + }, + { + "epoch": 15.861454504956008, + "grad_norm": 0.031840238720178604, + "learning_rate": 6.243348138125566e-06, + "loss": 0.0428, + "num_input_tokens_seen": 173297120, + "step": 142420 + }, + { + "epoch": 15.862011359839626, + "grad_norm": 0.0875321552157402, + "learning_rate": 6.241741839575055e-06, + "loss": 0.0399, + "num_input_tokens_seen": 173302880, + "step": 142425 + }, + { + "epoch": 15.862568214723243, + "grad_norm": 1.1010856628417969, + "learning_rate": 6.240135718211304e-06, + "loss": 0.0852, + "num_input_tokens_seen": 173309120, + "step": 142430 + }, + { + "epoch": 15.86312506960686, + "grad_norm": 0.33101290464401245, + "learning_rate": 6.238529774049509e-06, + "loss": 0.0924, + "num_input_tokens_seen": 173315200, + "step": 142435 + }, + { + "epoch": 15.863681924490479, + "grad_norm": 0.04115995019674301, + "learning_rate": 6.236924007104827e-06, + "loss": 0.1196, + "num_input_tokens_seen": 173321472, + "step": 142440 + }, + { + "epoch": 15.864238779374094, + "grad_norm": 0.0016090198187157512, + "learning_rate": 6.235318417392436e-06, + "loss": 0.0004, + "num_input_tokens_seen": 173327712, + "step": 142445 + }, + { + "epoch": 15.864795634257712, + "grad_norm": 0.1496497243642807, + "learning_rate": 6.233713004927496e-06, + "loss": 0.0034, + "num_input_tokens_seen": 173333696, + "step": 142450 + }, + { + "epoch": 15.86535248914133, + "grad_norm": 0.01977682299911976, + "learning_rate": 6.232107769725173e-06, + "loss": 0.1198, + "num_input_tokens_seen": 173339648, + "step": 142455 + }, + { + "epoch": 15.865909344024947, + "grad_norm": 0.016794463619589806, + "learning_rate": 6.230502711800621e-06, + "loss": 0.0833, + "num_input_tokens_seen": 173345792, + "step": 142460 + }, + { + "epoch": 15.866466198908565, + "grad_norm": 0.06351731717586517, + "learning_rate": 6.228897831169017e-06, + "loss": 0.0098, + "num_input_tokens_seen": 173351968, + "step": 142465 + }, + { + "epoch": 15.867023053792181, + "grad_norm": 0.02215585857629776, + "learning_rate": 6.22729312784551e-06, + "loss": 0.0323, + "num_input_tokens_seen": 173358048, + "step": 142470 + }, + { + "epoch": 15.867579908675799, + "grad_norm": 0.04623425379395485, + "learning_rate": 6.225688601845262e-06, + "loss": 0.0503, + "num_input_tokens_seen": 173364256, + "step": 142475 + }, + { + "epoch": 15.868136763559416, + "grad_norm": 0.01851017028093338, + "learning_rate": 6.224084253183418e-06, + "loss": 0.0101, + "num_input_tokens_seen": 173370400, + "step": 142480 + }, + { + "epoch": 15.868693618443034, + "grad_norm": 0.09604306519031525, + "learning_rate": 6.222480081875149e-06, + "loss": 0.0147, + "num_input_tokens_seen": 173376480, + "step": 142485 + }, + { + "epoch": 15.869250473326652, + "grad_norm": 1.2983418703079224, + "learning_rate": 6.220876087935593e-06, + "loss": 0.0256, + "num_input_tokens_seen": 173382272, + "step": 142490 + }, + { + "epoch": 15.869807328210268, + "grad_norm": 0.024207867681980133, + "learning_rate": 6.219272271379922e-06, + "loss": 0.04, + "num_input_tokens_seen": 173388384, + "step": 142495 + }, + { + "epoch": 15.870364183093885, + "grad_norm": 0.15053951740264893, + "learning_rate": 6.217668632223256e-06, + "loss": 0.0823, + "num_input_tokens_seen": 173393952, + "step": 142500 + }, + { + "epoch": 15.870921037977503, + "grad_norm": 0.028253789991140366, + "learning_rate": 6.216065170480767e-06, + "loss": 0.0102, + "num_input_tokens_seen": 173399936, + "step": 142505 + }, + { + "epoch": 15.87147789286112, + "grad_norm": 0.31127405166625977, + "learning_rate": 6.214461886167583e-06, + "loss": 0.0124, + "num_input_tokens_seen": 173406048, + "step": 142510 + }, + { + "epoch": 15.872034747744738, + "grad_norm": 0.7927184104919434, + "learning_rate": 6.212858779298866e-06, + "loss": 0.0147, + "num_input_tokens_seen": 173412064, + "step": 142515 + }, + { + "epoch": 15.872591602628354, + "grad_norm": 0.09697054326534271, + "learning_rate": 6.211255849889749e-06, + "loss": 0.0027, + "num_input_tokens_seen": 173418016, + "step": 142520 + }, + { + "epoch": 15.873148457511972, + "grad_norm": 1.4655110836029053, + "learning_rate": 6.209653097955376e-06, + "loss": 0.1696, + "num_input_tokens_seen": 173424128, + "step": 142525 + }, + { + "epoch": 15.87370531239559, + "grad_norm": 0.3359767198562622, + "learning_rate": 6.208050523510872e-06, + "loss": 0.0461, + "num_input_tokens_seen": 173430272, + "step": 142530 + }, + { + "epoch": 15.874262167279207, + "grad_norm": 0.00014447284047491848, + "learning_rate": 6.206448126571399e-06, + "loss": 0.0559, + "num_input_tokens_seen": 173436448, + "step": 142535 + }, + { + "epoch": 15.874819022162825, + "grad_norm": 0.016442926600575447, + "learning_rate": 6.204845907152076e-06, + "loss": 0.0036, + "num_input_tokens_seen": 173442912, + "step": 142540 + }, + { + "epoch": 15.87537587704644, + "grad_norm": 0.1662673056125641, + "learning_rate": 6.2032438652680465e-06, + "loss": 0.0191, + "num_input_tokens_seen": 173449088, + "step": 142545 + }, + { + "epoch": 15.875932731930058, + "grad_norm": 0.7709329724311829, + "learning_rate": 6.201642000934426e-06, + "loss": 0.0146, + "num_input_tokens_seen": 173455424, + "step": 142550 + }, + { + "epoch": 15.876489586813676, + "grad_norm": 0.00583249144256115, + "learning_rate": 6.200040314166369e-06, + "loss": 0.0011, + "num_input_tokens_seen": 173461536, + "step": 142555 + }, + { + "epoch": 15.877046441697294, + "grad_norm": 0.00037827243795618415, + "learning_rate": 6.198438804978984e-06, + "loss": 0.0635, + "num_input_tokens_seen": 173467616, + "step": 142560 + }, + { + "epoch": 15.877603296580912, + "grad_norm": 1.7608740329742432, + "learning_rate": 6.196837473387418e-06, + "loss": 0.1298, + "num_input_tokens_seen": 173473280, + "step": 142565 + }, + { + "epoch": 15.878160151464527, + "grad_norm": 0.7892014980316162, + "learning_rate": 6.195236319406786e-06, + "loss": 0.061, + "num_input_tokens_seen": 173479424, + "step": 142570 + }, + { + "epoch": 15.878717006348145, + "grad_norm": 0.370299369096756, + "learning_rate": 6.193635343052212e-06, + "loss": 0.0222, + "num_input_tokens_seen": 173485632, + "step": 142575 + }, + { + "epoch": 15.879273861231763, + "grad_norm": 0.25800463557243347, + "learning_rate": 6.1920345443388125e-06, + "loss": 0.0454, + "num_input_tokens_seen": 173491520, + "step": 142580 + }, + { + "epoch": 15.87983071611538, + "grad_norm": 0.08128655701875687, + "learning_rate": 6.190433923281722e-06, + "loss": 0.0055, + "num_input_tokens_seen": 173497568, + "step": 142585 + }, + { + "epoch": 15.880387570998998, + "grad_norm": 2.0171120166778564, + "learning_rate": 6.188833479896056e-06, + "loss": 0.063, + "num_input_tokens_seen": 173503648, + "step": 142590 + }, + { + "epoch": 15.880944425882616, + "grad_norm": 0.7561386227607727, + "learning_rate": 6.187233214196924e-06, + "loss": 0.0851, + "num_input_tokens_seen": 173509536, + "step": 142595 + }, + { + "epoch": 15.881501280766232, + "grad_norm": 0.00013254335499368608, + "learning_rate": 6.185633126199445e-06, + "loss": 0.0268, + "num_input_tokens_seen": 173515712, + "step": 142600 + }, + { + "epoch": 15.88205813564985, + "grad_norm": 0.00868966430425644, + "learning_rate": 6.184033215918739e-06, + "loss": 0.0301, + "num_input_tokens_seen": 173521824, + "step": 142605 + }, + { + "epoch": 15.882614990533467, + "grad_norm": 0.03501018509268761, + "learning_rate": 6.182433483369907e-06, + "loss": 0.0512, + "num_input_tokens_seen": 173527968, + "step": 142610 + }, + { + "epoch": 15.883171845417085, + "grad_norm": 0.10444270819425583, + "learning_rate": 6.180833928568083e-06, + "loss": 0.0124, + "num_input_tokens_seen": 173533984, + "step": 142615 + }, + { + "epoch": 15.883728700300702, + "grad_norm": 2.8836207389831543, + "learning_rate": 6.179234551528346e-06, + "loss": 0.078, + "num_input_tokens_seen": 173540256, + "step": 142620 + }, + { + "epoch": 15.884285555184318, + "grad_norm": 0.675179123878479, + "learning_rate": 6.177635352265823e-06, + "loss": 0.2267, + "num_input_tokens_seen": 173546592, + "step": 142625 + }, + { + "epoch": 15.884842410067936, + "grad_norm": 0.009633727371692657, + "learning_rate": 6.17603633079561e-06, + "loss": 0.0044, + "num_input_tokens_seen": 173552576, + "step": 142630 + }, + { + "epoch": 15.885399264951554, + "grad_norm": 2.708049774169922, + "learning_rate": 6.17443748713282e-06, + "loss": 0.2147, + "num_input_tokens_seen": 173558272, + "step": 142635 + }, + { + "epoch": 15.885956119835171, + "grad_norm": 0.33874648809432983, + "learning_rate": 6.1728388212925505e-06, + "loss": 0.0135, + "num_input_tokens_seen": 173564480, + "step": 142640 + }, + { + "epoch": 15.886512974718789, + "grad_norm": 0.00024290622968692333, + "learning_rate": 6.171240333289905e-06, + "loss": 0.0116, + "num_input_tokens_seen": 173570624, + "step": 142645 + }, + { + "epoch": 15.887069829602405, + "grad_norm": 0.003298315918073058, + "learning_rate": 6.169642023139971e-06, + "loss": 0.0659, + "num_input_tokens_seen": 173576576, + "step": 142650 + }, + { + "epoch": 15.887626684486023, + "grad_norm": 0.4417308568954468, + "learning_rate": 6.168043890857861e-06, + "loss": 0.0405, + "num_input_tokens_seen": 173582464, + "step": 142655 + }, + { + "epoch": 15.88818353936964, + "grad_norm": 0.053470950573682785, + "learning_rate": 6.166445936458665e-06, + "loss": 0.0022, + "num_input_tokens_seen": 173588608, + "step": 142660 + }, + { + "epoch": 15.888740394253258, + "grad_norm": 0.3256808817386627, + "learning_rate": 6.164848159957476e-06, + "loss": 0.0504, + "num_input_tokens_seen": 173594912, + "step": 142665 + }, + { + "epoch": 15.889297249136876, + "grad_norm": 0.05197898671030998, + "learning_rate": 6.16325056136938e-06, + "loss": 0.0157, + "num_input_tokens_seen": 173600736, + "step": 142670 + }, + { + "epoch": 15.889854104020491, + "grad_norm": 0.11830657720565796, + "learning_rate": 6.161653140709484e-06, + "loss": 0.0155, + "num_input_tokens_seen": 173607328, + "step": 142675 + }, + { + "epoch": 15.89041095890411, + "grad_norm": 1.2518353462219238, + "learning_rate": 6.160055897992856e-06, + "loss": 0.0546, + "num_input_tokens_seen": 173613664, + "step": 142680 + }, + { + "epoch": 15.890967813787727, + "grad_norm": 0.1492989957332611, + "learning_rate": 6.158458833234609e-06, + "loss": 0.0036, + "num_input_tokens_seen": 173619872, + "step": 142685 + }, + { + "epoch": 15.891524668671345, + "grad_norm": 0.016814718022942543, + "learning_rate": 6.156861946449802e-06, + "loss": 0.003, + "num_input_tokens_seen": 173626208, + "step": 142690 + }, + { + "epoch": 15.892081523554962, + "grad_norm": 1.261973261833191, + "learning_rate": 6.155265237653538e-06, + "loss": 0.0188, + "num_input_tokens_seen": 173632256, + "step": 142695 + }, + { + "epoch": 15.89263837843858, + "grad_norm": 0.4624481797218323, + "learning_rate": 6.153668706860883e-06, + "loss": 0.0037, + "num_input_tokens_seen": 173638080, + "step": 142700 + }, + { + "epoch": 15.893195233322196, + "grad_norm": 0.0009432995575480163, + "learning_rate": 6.152072354086932e-06, + "loss": 0.0019, + "num_input_tokens_seen": 173644416, + "step": 142705 + }, + { + "epoch": 15.893752088205813, + "grad_norm": 0.006473949644714594, + "learning_rate": 6.150476179346762e-06, + "loss": 0.0536, + "num_input_tokens_seen": 173650336, + "step": 142710 + }, + { + "epoch": 15.894308943089431, + "grad_norm": 0.6316639184951782, + "learning_rate": 6.148880182655445e-06, + "loss": 0.0368, + "num_input_tokens_seen": 173656416, + "step": 142715 + }, + { + "epoch": 15.894865797973049, + "grad_norm": 1.759175419807434, + "learning_rate": 6.1472843640280495e-06, + "loss": 0.1036, + "num_input_tokens_seen": 173662400, + "step": 142720 + }, + { + "epoch": 15.895422652856666, + "grad_norm": 0.14422842860221863, + "learning_rate": 6.145688723479667e-06, + "loss": 0.0088, + "num_input_tokens_seen": 173668704, + "step": 142725 + }, + { + "epoch": 15.895979507740282, + "grad_norm": 0.1412813365459442, + "learning_rate": 6.144093261025358e-06, + "loss": 0.0179, + "num_input_tokens_seen": 173674976, + "step": 142730 + }, + { + "epoch": 15.8965363626239, + "grad_norm": 0.05269530043005943, + "learning_rate": 6.1424979766801974e-06, + "loss": 0.0011, + "num_input_tokens_seen": 173681184, + "step": 142735 + }, + { + "epoch": 15.897093217507518, + "grad_norm": 0.01134940329939127, + "learning_rate": 6.140902870459245e-06, + "loss": 0.0049, + "num_input_tokens_seen": 173687648, + "step": 142740 + }, + { + "epoch": 15.897650072391135, + "grad_norm": 0.7746991515159607, + "learning_rate": 6.1393079423775785e-06, + "loss": 0.009, + "num_input_tokens_seen": 173693408, + "step": 142745 + }, + { + "epoch": 15.898206927274753, + "grad_norm": 1.6006118059158325, + "learning_rate": 6.1377131924502566e-06, + "loss": 0.2121, + "num_input_tokens_seen": 173699456, + "step": 142750 + }, + { + "epoch": 15.898763782158369, + "grad_norm": 0.02573339268565178, + "learning_rate": 6.136118620692349e-06, + "loss": 0.0469, + "num_input_tokens_seen": 173705472, + "step": 142755 + }, + { + "epoch": 15.899320637041987, + "grad_norm": 0.5870285630226135, + "learning_rate": 6.134524227118924e-06, + "loss": 0.0833, + "num_input_tokens_seen": 173711232, + "step": 142760 + }, + { + "epoch": 15.899877491925604, + "grad_norm": 0.00031246180878952146, + "learning_rate": 6.132930011745017e-06, + "loss": 0.0005, + "num_input_tokens_seen": 173717344, + "step": 142765 + }, + { + "epoch": 15.900434346809222, + "grad_norm": 0.3168620467185974, + "learning_rate": 6.131335974585711e-06, + "loss": 0.1858, + "num_input_tokens_seen": 173723712, + "step": 142770 + }, + { + "epoch": 15.90099120169284, + "grad_norm": 0.001775007345713675, + "learning_rate": 6.129742115656045e-06, + "loss": 0.0379, + "num_input_tokens_seen": 173729920, + "step": 142775 + }, + { + "epoch": 15.901548056576456, + "grad_norm": 0.26753270626068115, + "learning_rate": 6.128148434971093e-06, + "loss": 0.0357, + "num_input_tokens_seen": 173735712, + "step": 142780 + }, + { + "epoch": 15.902104911460073, + "grad_norm": 2.0369250774383545, + "learning_rate": 6.1265549325458944e-06, + "loss": 0.1076, + "num_input_tokens_seen": 173741632, + "step": 142785 + }, + { + "epoch": 15.90266176634369, + "grad_norm": 0.3916079103946686, + "learning_rate": 6.1249616083955105e-06, + "loss": 0.034, + "num_input_tokens_seen": 173747744, + "step": 142790 + }, + { + "epoch": 15.903218621227309, + "grad_norm": 0.00012027794582536444, + "learning_rate": 6.123368462534976e-06, + "loss": 0.009, + "num_input_tokens_seen": 173753888, + "step": 142795 + }, + { + "epoch": 15.903775476110926, + "grad_norm": 0.09764820337295532, + "learning_rate": 6.12177549497936e-06, + "loss": 0.0379, + "num_input_tokens_seen": 173760032, + "step": 142800 + }, + { + "epoch": 15.904332330994542, + "grad_norm": 0.0013249993789941072, + "learning_rate": 6.120182705743696e-06, + "loss": 0.1039, + "num_input_tokens_seen": 173766496, + "step": 142805 + }, + { + "epoch": 15.90488918587816, + "grad_norm": 1.681553602218628, + "learning_rate": 6.118590094843035e-06, + "loss": 0.0853, + "num_input_tokens_seen": 173772672, + "step": 142810 + }, + { + "epoch": 15.905446040761777, + "grad_norm": 0.8410309553146362, + "learning_rate": 6.116997662292412e-06, + "loss": 0.037, + "num_input_tokens_seen": 173778720, + "step": 142815 + }, + { + "epoch": 15.906002895645395, + "grad_norm": 0.8911100029945374, + "learning_rate": 6.11540540810688e-06, + "loss": 0.059, + "num_input_tokens_seen": 173784768, + "step": 142820 + }, + { + "epoch": 15.906559750529013, + "grad_norm": 0.004678123630583286, + "learning_rate": 6.113813332301465e-06, + "loss": 0.0783, + "num_input_tokens_seen": 173791136, + "step": 142825 + }, + { + "epoch": 15.907116605412629, + "grad_norm": 0.08112756907939911, + "learning_rate": 6.112221434891233e-06, + "loss": 0.0292, + "num_input_tokens_seen": 173797664, + "step": 142830 + }, + { + "epoch": 15.907673460296246, + "grad_norm": 1.1502097845077515, + "learning_rate": 6.110629715891186e-06, + "loss": 0.0558, + "num_input_tokens_seen": 173803584, + "step": 142835 + }, + { + "epoch": 15.908230315179864, + "grad_norm": 0.08109799772500992, + "learning_rate": 6.109038175316384e-06, + "loss": 0.0195, + "num_input_tokens_seen": 173809472, + "step": 142840 + }, + { + "epoch": 15.908787170063482, + "grad_norm": 0.1010044738650322, + "learning_rate": 6.1074468131818415e-06, + "loss": 0.0094, + "num_input_tokens_seen": 173815616, + "step": 142845 + }, + { + "epoch": 15.9093440249471, + "grad_norm": 0.30839940905570984, + "learning_rate": 6.1058556295026075e-06, + "loss": 0.1029, + "num_input_tokens_seen": 173821856, + "step": 142850 + }, + { + "epoch": 15.909900879830715, + "grad_norm": 0.05522199347615242, + "learning_rate": 6.104264624293707e-06, + "loss": 0.0074, + "num_input_tokens_seen": 173828224, + "step": 142855 + }, + { + "epoch": 15.910457734714333, + "grad_norm": 0.07888347655534744, + "learning_rate": 6.1026737975701656e-06, + "loss": 0.0446, + "num_input_tokens_seen": 173834464, + "step": 142860 + }, + { + "epoch": 15.91101458959795, + "grad_norm": 0.002542819594964385, + "learning_rate": 6.101083149347004e-06, + "loss": 0.0071, + "num_input_tokens_seen": 173840416, + "step": 142865 + }, + { + "epoch": 15.911571444481568, + "grad_norm": 2.4856698513031006, + "learning_rate": 6.09949267963926e-06, + "loss": 0.0177, + "num_input_tokens_seen": 173847104, + "step": 142870 + }, + { + "epoch": 15.912128299365186, + "grad_norm": 0.004745726007968187, + "learning_rate": 6.097902388461948e-06, + "loss": 0.0111, + "num_input_tokens_seen": 173853184, + "step": 142875 + }, + { + "epoch": 15.912685154248802, + "grad_norm": 0.004846915137022734, + "learning_rate": 6.096312275830096e-06, + "loss": 0.044, + "num_input_tokens_seen": 173859616, + "step": 142880 + }, + { + "epoch": 15.91324200913242, + "grad_norm": 0.003632415784522891, + "learning_rate": 6.09472234175871e-06, + "loss": 0.0019, + "num_input_tokens_seen": 173866112, + "step": 142885 + }, + { + "epoch": 15.913798864016037, + "grad_norm": 0.009047015570104122, + "learning_rate": 6.093132586262825e-06, + "loss": 0.0059, + "num_input_tokens_seen": 173872224, + "step": 142890 + }, + { + "epoch": 15.914355718899655, + "grad_norm": 0.8042683601379395, + "learning_rate": 6.0915430093574476e-06, + "loss": 0.0495, + "num_input_tokens_seen": 173878368, + "step": 142895 + }, + { + "epoch": 15.914912573783273, + "grad_norm": 0.0013197582447901368, + "learning_rate": 6.0899536110576055e-06, + "loss": 0.0175, + "num_input_tokens_seen": 173884480, + "step": 142900 + }, + { + "epoch": 15.915469428666889, + "grad_norm": 0.06131105124950409, + "learning_rate": 6.08836439137829e-06, + "loss": 0.0279, + "num_input_tokens_seen": 173890752, + "step": 142905 + }, + { + "epoch": 15.916026283550506, + "grad_norm": 0.7175065875053406, + "learning_rate": 6.086775350334531e-06, + "loss": 0.0212, + "num_input_tokens_seen": 173896544, + "step": 142910 + }, + { + "epoch": 15.916583138434124, + "grad_norm": 0.005001173820346594, + "learning_rate": 6.085186487941324e-06, + "loss": 0.0093, + "num_input_tokens_seen": 173902880, + "step": 142915 + }, + { + "epoch": 15.917139993317742, + "grad_norm": 0.06836843490600586, + "learning_rate": 6.083597804213695e-06, + "loss": 0.0043, + "num_input_tokens_seen": 173909152, + "step": 142920 + }, + { + "epoch": 15.91769684820136, + "grad_norm": 0.9471378326416016, + "learning_rate": 6.082009299166638e-06, + "loss": 0.0101, + "num_input_tokens_seen": 173915520, + "step": 142925 + }, + { + "epoch": 15.918253703084975, + "grad_norm": 1.7878550291061401, + "learning_rate": 6.08042097281516e-06, + "loss": 0.1703, + "num_input_tokens_seen": 173921504, + "step": 142930 + }, + { + "epoch": 15.918810557968593, + "grad_norm": 0.00027455686358734965, + "learning_rate": 6.078832825174258e-06, + "loss": 0.0178, + "num_input_tokens_seen": 173927072, + "step": 142935 + }, + { + "epoch": 15.91936741285221, + "grad_norm": 0.30210769176483154, + "learning_rate": 6.077244856258946e-06, + "loss": 0.0064, + "num_input_tokens_seen": 173933248, + "step": 142940 + }, + { + "epoch": 15.919924267735828, + "grad_norm": 1.6780104637145996, + "learning_rate": 6.075657066084215e-06, + "loss": 0.0602, + "num_input_tokens_seen": 173939264, + "step": 142945 + }, + { + "epoch": 15.920481122619446, + "grad_norm": 0.3542199730873108, + "learning_rate": 6.074069454665069e-06, + "loss": 0.0168, + "num_input_tokens_seen": 173944800, + "step": 142950 + }, + { + "epoch": 15.921037977503063, + "grad_norm": 0.00029118472593836486, + "learning_rate": 6.072482022016488e-06, + "loss": 0.0399, + "num_input_tokens_seen": 173950752, + "step": 142955 + }, + { + "epoch": 15.92159483238668, + "grad_norm": 1.1455729007720947, + "learning_rate": 6.07089476815349e-06, + "loss": 0.1204, + "num_input_tokens_seen": 173957120, + "step": 142960 + }, + { + "epoch": 15.922151687270297, + "grad_norm": 0.01267267670482397, + "learning_rate": 6.069307693091048e-06, + "loss": 0.0869, + "num_input_tokens_seen": 173963040, + "step": 142965 + }, + { + "epoch": 15.922708542153915, + "grad_norm": 0.12974853813648224, + "learning_rate": 6.0677207968441674e-06, + "loss": 0.0326, + "num_input_tokens_seen": 173969120, + "step": 142970 + }, + { + "epoch": 15.923265397037532, + "grad_norm": 0.646885097026825, + "learning_rate": 6.0661340794278345e-06, + "loss": 0.0374, + "num_input_tokens_seen": 173975232, + "step": 142975 + }, + { + "epoch": 15.92382225192115, + "grad_norm": 0.6768960356712341, + "learning_rate": 6.064547540857035e-06, + "loss": 0.019, + "num_input_tokens_seen": 173981728, + "step": 142980 + }, + { + "epoch": 15.924379106804766, + "grad_norm": 0.3119460940361023, + "learning_rate": 6.062961181146745e-06, + "loss": 0.0047, + "num_input_tokens_seen": 173987968, + "step": 142985 + }, + { + "epoch": 15.924935961688384, + "grad_norm": 0.0005992593942210078, + "learning_rate": 6.06137500031197e-06, + "loss": 0.0335, + "num_input_tokens_seen": 173994048, + "step": 142990 + }, + { + "epoch": 15.925492816572001, + "grad_norm": 0.00013152560859452933, + "learning_rate": 6.059788998367677e-06, + "loss": 0.0058, + "num_input_tokens_seen": 174000160, + "step": 142995 + }, + { + "epoch": 15.926049671455619, + "grad_norm": 0.9777619242668152, + "learning_rate": 6.0582031753288555e-06, + "loss": 0.0882, + "num_input_tokens_seen": 174006432, + "step": 143000 + }, + { + "epoch": 15.926606526339237, + "grad_norm": 2.33029842376709, + "learning_rate": 6.056617531210471e-06, + "loss": 0.1294, + "num_input_tokens_seen": 174012896, + "step": 143005 + }, + { + "epoch": 15.927163381222853, + "grad_norm": 0.5882622003555298, + "learning_rate": 6.055032066027519e-06, + "loss": 0.0039, + "num_input_tokens_seen": 174019168, + "step": 143010 + }, + { + "epoch": 15.92772023610647, + "grad_norm": 0.17663075029850006, + "learning_rate": 6.053446779794961e-06, + "loss": 0.0076, + "num_input_tokens_seen": 174025440, + "step": 143015 + }, + { + "epoch": 15.928277090990088, + "grad_norm": 1.153956413269043, + "learning_rate": 6.0518616725277925e-06, + "loss": 0.0511, + "num_input_tokens_seen": 174031616, + "step": 143020 + }, + { + "epoch": 15.928833945873706, + "grad_norm": 0.004668923560529947, + "learning_rate": 6.050276744240957e-06, + "loss": 0.0122, + "num_input_tokens_seen": 174038016, + "step": 143025 + }, + { + "epoch": 15.929390800757323, + "grad_norm": 0.10047520697116852, + "learning_rate": 6.048691994949446e-06, + "loss": 0.0103, + "num_input_tokens_seen": 174044352, + "step": 143030 + }, + { + "epoch": 15.92994765564094, + "grad_norm": 0.00013493478763848543, + "learning_rate": 6.047107424668217e-06, + "loss": 0.0288, + "num_input_tokens_seen": 174050720, + "step": 143035 + }, + { + "epoch": 15.930504510524557, + "grad_norm": 1.801510214805603, + "learning_rate": 6.045523033412248e-06, + "loss": 0.0327, + "num_input_tokens_seen": 174056992, + "step": 143040 + }, + { + "epoch": 15.931061365408175, + "grad_norm": 0.009987646713852882, + "learning_rate": 6.0439388211965014e-06, + "loss": 0.0052, + "num_input_tokens_seen": 174063328, + "step": 143045 + }, + { + "epoch": 15.931618220291792, + "grad_norm": 0.12913139164447784, + "learning_rate": 6.042354788035942e-06, + "loss": 0.0736, + "num_input_tokens_seen": 174069664, + "step": 143050 + }, + { + "epoch": 15.93217507517541, + "grad_norm": 0.08340125530958176, + "learning_rate": 6.040770933945519e-06, + "loss": 0.0051, + "num_input_tokens_seen": 174075968, + "step": 143055 + }, + { + "epoch": 15.932731930059028, + "grad_norm": 0.034674908965826035, + "learning_rate": 6.039187258940216e-06, + "loss": 0.0051, + "num_input_tokens_seen": 174082016, + "step": 143060 + }, + { + "epoch": 15.933288784942643, + "grad_norm": 0.42231613397598267, + "learning_rate": 6.037603763034977e-06, + "loss": 0.0519, + "num_input_tokens_seen": 174088320, + "step": 143065 + }, + { + "epoch": 15.933845639826261, + "grad_norm": 0.020229101181030273, + "learning_rate": 6.036020446244764e-06, + "loss": 0.0184, + "num_input_tokens_seen": 174094496, + "step": 143070 + }, + { + "epoch": 15.934402494709879, + "grad_norm": 0.5194847583770752, + "learning_rate": 6.034437308584526e-06, + "loss": 0.0114, + "num_input_tokens_seen": 174100352, + "step": 143075 + }, + { + "epoch": 15.934959349593496, + "grad_norm": 0.22939980030059814, + "learning_rate": 6.032854350069228e-06, + "loss": 0.0748, + "num_input_tokens_seen": 174106432, + "step": 143080 + }, + { + "epoch": 15.935516204477114, + "grad_norm": 0.5856714248657227, + "learning_rate": 6.031271570713809e-06, + "loss": 0.0261, + "num_input_tokens_seen": 174112832, + "step": 143085 + }, + { + "epoch": 15.93607305936073, + "grad_norm": 0.005821684375405312, + "learning_rate": 6.0296889705332425e-06, + "loss": 0.0303, + "num_input_tokens_seen": 174119328, + "step": 143090 + }, + { + "epoch": 15.936629914244348, + "grad_norm": 0.6422731280326843, + "learning_rate": 6.028106549542447e-06, + "loss": 0.0215, + "num_input_tokens_seen": 174125248, + "step": 143095 + }, + { + "epoch": 15.937186769127965, + "grad_norm": 0.010957648046314716, + "learning_rate": 6.026524307756395e-06, + "loss": 0.0012, + "num_input_tokens_seen": 174131552, + "step": 143100 + }, + { + "epoch": 15.937743624011583, + "grad_norm": 0.09428348392248154, + "learning_rate": 6.024942245190013e-06, + "loss": 0.0231, + "num_input_tokens_seen": 174137792, + "step": 143105 + }, + { + "epoch": 15.9383004788952, + "grad_norm": 0.0005878471420146525, + "learning_rate": 6.023360361858263e-06, + "loss": 0.0455, + "num_input_tokens_seen": 174143584, + "step": 143110 + }, + { + "epoch": 15.938857333778817, + "grad_norm": 0.08017226308584213, + "learning_rate": 6.021778657776078e-06, + "loss": 0.0058, + "num_input_tokens_seen": 174149568, + "step": 143115 + }, + { + "epoch": 15.939414188662434, + "grad_norm": 0.004633078817278147, + "learning_rate": 6.020197132958396e-06, + "loss": 0.0025, + "num_input_tokens_seen": 174155904, + "step": 143120 + }, + { + "epoch": 15.939971043546052, + "grad_norm": 0.030261743813753128, + "learning_rate": 6.018615787420154e-06, + "loss": 0.044, + "num_input_tokens_seen": 174161920, + "step": 143125 + }, + { + "epoch": 15.94052789842967, + "grad_norm": 0.7258365154266357, + "learning_rate": 6.017034621176301e-06, + "loss": 0.0637, + "num_input_tokens_seen": 174168096, + "step": 143130 + }, + { + "epoch": 15.941084753313287, + "grad_norm": 0.0055247885175049305, + "learning_rate": 6.015453634241763e-06, + "loss": 0.0611, + "num_input_tokens_seen": 174174080, + "step": 143135 + }, + { + "epoch": 15.941641608196903, + "grad_norm": 0.5891032218933105, + "learning_rate": 6.013872826631475e-06, + "loss": 0.0618, + "num_input_tokens_seen": 174179968, + "step": 143140 + }, + { + "epoch": 15.942198463080521, + "grad_norm": 1.6968082189559937, + "learning_rate": 6.012292198360364e-06, + "loss": 0.0645, + "num_input_tokens_seen": 174185856, + "step": 143145 + }, + { + "epoch": 15.942755317964139, + "grad_norm": 0.001397275715135038, + "learning_rate": 6.0107117494433735e-06, + "loss": 0.0012, + "num_input_tokens_seen": 174191840, + "step": 143150 + }, + { + "epoch": 15.943312172847756, + "grad_norm": 0.9147666692733765, + "learning_rate": 6.0091314798954165e-06, + "loss": 0.0852, + "num_input_tokens_seen": 174197760, + "step": 143155 + }, + { + "epoch": 15.943869027731374, + "grad_norm": 0.2754665017127991, + "learning_rate": 6.007551389731436e-06, + "loss": 0.0817, + "num_input_tokens_seen": 174204032, + "step": 143160 + }, + { + "epoch": 15.94442588261499, + "grad_norm": 0.2939443588256836, + "learning_rate": 6.005971478966354e-06, + "loss": 0.0062, + "num_input_tokens_seen": 174210496, + "step": 143165 + }, + { + "epoch": 15.944982737498608, + "grad_norm": 0.3010275065898895, + "learning_rate": 6.004391747615077e-06, + "loss": 0.0799, + "num_input_tokens_seen": 174216704, + "step": 143170 + }, + { + "epoch": 15.945539592382225, + "grad_norm": 0.002833619713783264, + "learning_rate": 6.002812195692545e-06, + "loss": 0.0004, + "num_input_tokens_seen": 174222368, + "step": 143175 + }, + { + "epoch": 15.946096447265843, + "grad_norm": 1.1191751956939697, + "learning_rate": 6.001232823213665e-06, + "loss": 0.0657, + "num_input_tokens_seen": 174228320, + "step": 143180 + }, + { + "epoch": 15.94665330214946, + "grad_norm": 0.6122361421585083, + "learning_rate": 5.999653630193372e-06, + "loss": 0.0767, + "num_input_tokens_seen": 174233984, + "step": 143185 + }, + { + "epoch": 15.947210157033076, + "grad_norm": 0.5496432185173035, + "learning_rate": 5.99807461664657e-06, + "loss": 0.0165, + "num_input_tokens_seen": 174240160, + "step": 143190 + }, + { + "epoch": 15.947767011916694, + "grad_norm": 0.013644393533468246, + "learning_rate": 5.99649578258818e-06, + "loss": 0.0128, + "num_input_tokens_seen": 174245728, + "step": 143195 + }, + { + "epoch": 15.948323866800312, + "grad_norm": 0.3468526303768158, + "learning_rate": 5.994917128033103e-06, + "loss": 0.0189, + "num_input_tokens_seen": 174251488, + "step": 143200 + }, + { + "epoch": 15.94888072168393, + "grad_norm": 0.9161796569824219, + "learning_rate": 5.993338652996269e-06, + "loss": 0.0393, + "num_input_tokens_seen": 174257600, + "step": 143205 + }, + { + "epoch": 15.949437576567547, + "grad_norm": 0.3836192190647125, + "learning_rate": 5.991760357492579e-06, + "loss": 0.0623, + "num_input_tokens_seen": 174263872, + "step": 143210 + }, + { + "epoch": 15.949994431451163, + "grad_norm": 0.1302611529827118, + "learning_rate": 5.990182241536943e-06, + "loss": 0.0041, + "num_input_tokens_seen": 174270208, + "step": 143215 + }, + { + "epoch": 15.95055128633478, + "grad_norm": 0.275922030210495, + "learning_rate": 5.988604305144258e-06, + "loss": 0.0063, + "num_input_tokens_seen": 174276384, + "step": 143220 + }, + { + "epoch": 15.951108141218398, + "grad_norm": 0.00016992558084893972, + "learning_rate": 5.987026548329441e-06, + "loss": 0.0057, + "num_input_tokens_seen": 174282528, + "step": 143225 + }, + { + "epoch": 15.951664996102016, + "grad_norm": 0.1961209774017334, + "learning_rate": 5.985448971107388e-06, + "loss": 0.0021, + "num_input_tokens_seen": 174288928, + "step": 143230 + }, + { + "epoch": 15.952221850985634, + "grad_norm": 0.660484254360199, + "learning_rate": 5.983871573493016e-06, + "loss": 0.0264, + "num_input_tokens_seen": 174295264, + "step": 143235 + }, + { + "epoch": 15.95277870586925, + "grad_norm": 0.0158721674233675, + "learning_rate": 5.9822943555012e-06, + "loss": 0.0116, + "num_input_tokens_seen": 174301504, + "step": 143240 + }, + { + "epoch": 15.953335560752867, + "grad_norm": 0.030921682715415955, + "learning_rate": 5.980717317146855e-06, + "loss": 0.0721, + "num_input_tokens_seen": 174307136, + "step": 143245 + }, + { + "epoch": 15.953892415636485, + "grad_norm": 0.0022971290163695812, + "learning_rate": 5.979140458444868e-06, + "loss": 0.0105, + "num_input_tokens_seen": 174313408, + "step": 143250 + }, + { + "epoch": 15.954449270520103, + "grad_norm": 0.8132627010345459, + "learning_rate": 5.977563779410147e-06, + "loss": 0.0471, + "num_input_tokens_seen": 174319424, + "step": 143255 + }, + { + "epoch": 15.95500612540372, + "grad_norm": 0.1096142902970314, + "learning_rate": 5.975987280057574e-06, + "loss": 0.1106, + "num_input_tokens_seen": 174324928, + "step": 143260 + }, + { + "epoch": 15.955562980287336, + "grad_norm": 0.06030130013823509, + "learning_rate": 5.974410960402044e-06, + "loss": 0.002, + "num_input_tokens_seen": 174330912, + "step": 143265 + }, + { + "epoch": 15.956119835170954, + "grad_norm": 0.00024438559194095433, + "learning_rate": 5.972834820458437e-06, + "loss": 0.0155, + "num_input_tokens_seen": 174337056, + "step": 143270 + }, + { + "epoch": 15.956676690054572, + "grad_norm": 0.004848916083574295, + "learning_rate": 5.971258860241658e-06, + "loss": 0.0222, + "num_input_tokens_seen": 174343232, + "step": 143275 + }, + { + "epoch": 15.95723354493819, + "grad_norm": 0.020452698692679405, + "learning_rate": 5.969683079766586e-06, + "loss": 0.0595, + "num_input_tokens_seen": 174349280, + "step": 143280 + }, + { + "epoch": 15.957790399821807, + "grad_norm": 0.4615243971347809, + "learning_rate": 5.9681074790481015e-06, + "loss": 0.0364, + "num_input_tokens_seen": 174355104, + "step": 143285 + }, + { + "epoch": 15.958347254705425, + "grad_norm": 0.03925371915102005, + "learning_rate": 5.9665320581010845e-06, + "loss": 0.014, + "num_input_tokens_seen": 174360768, + "step": 143290 + }, + { + "epoch": 15.95890410958904, + "grad_norm": 0.006334769073873758, + "learning_rate": 5.964956816940428e-06, + "loss": 0.1728, + "num_input_tokens_seen": 174367008, + "step": 143295 + }, + { + "epoch": 15.959460964472658, + "grad_norm": 0.05165621265769005, + "learning_rate": 5.963381755580996e-06, + "loss": 0.0007, + "num_input_tokens_seen": 174373184, + "step": 143300 + }, + { + "epoch": 15.960017819356276, + "grad_norm": 2.103569269180298, + "learning_rate": 5.961806874037684e-06, + "loss": 0.2662, + "num_input_tokens_seen": 174379264, + "step": 143305 + }, + { + "epoch": 15.960574674239894, + "grad_norm": 0.15773625671863556, + "learning_rate": 5.960232172325361e-06, + "loss": 0.2493, + "num_input_tokens_seen": 174384960, + "step": 143310 + }, + { + "epoch": 15.961131529123511, + "grad_norm": 0.02628256380558014, + "learning_rate": 5.9586576504588984e-06, + "loss": 0.0913, + "num_input_tokens_seen": 174390976, + "step": 143315 + }, + { + "epoch": 15.961688384007127, + "grad_norm": 0.0005311015993356705, + "learning_rate": 5.957083308453163e-06, + "loss": 0.0605, + "num_input_tokens_seen": 174397088, + "step": 143320 + }, + { + "epoch": 15.962245238890745, + "grad_norm": 0.09977360814809799, + "learning_rate": 5.955509146323038e-06, + "loss": 0.0048, + "num_input_tokens_seen": 174403584, + "step": 143325 + }, + { + "epoch": 15.962802093774362, + "grad_norm": 0.6798955202102661, + "learning_rate": 5.953935164083388e-06, + "loss": 0.2144, + "num_input_tokens_seen": 174409696, + "step": 143330 + }, + { + "epoch": 15.96335894865798, + "grad_norm": 0.003132987767457962, + "learning_rate": 5.952361361749081e-06, + "loss": 0.0127, + "num_input_tokens_seen": 174416000, + "step": 143335 + }, + { + "epoch": 15.963915803541598, + "grad_norm": 6.164767546579242e-05, + "learning_rate": 5.950787739334973e-06, + "loss": 0.0056, + "num_input_tokens_seen": 174421568, + "step": 143340 + }, + { + "epoch": 15.964472658425214, + "grad_norm": 0.10003170371055603, + "learning_rate": 5.949214296855945e-06, + "loss": 0.0887, + "num_input_tokens_seen": 174427680, + "step": 143345 + }, + { + "epoch": 15.965029513308831, + "grad_norm": 0.9425152540206909, + "learning_rate": 5.9476410343268455e-06, + "loss": 0.0713, + "num_input_tokens_seen": 174433920, + "step": 143350 + }, + { + "epoch": 15.965586368192449, + "grad_norm": 1.0502647161483765, + "learning_rate": 5.946067951762552e-06, + "loss": 0.015, + "num_input_tokens_seen": 174439808, + "step": 143355 + }, + { + "epoch": 15.966143223076067, + "grad_norm": 0.6261163353919983, + "learning_rate": 5.9444950491779e-06, + "loss": 0.0174, + "num_input_tokens_seen": 174445728, + "step": 143360 + }, + { + "epoch": 15.966700077959684, + "grad_norm": 0.018570756539702415, + "learning_rate": 5.942922326587766e-06, + "loss": 0.1203, + "num_input_tokens_seen": 174451968, + "step": 143365 + }, + { + "epoch": 15.9672569328433, + "grad_norm": 3.177921772003174, + "learning_rate": 5.941349784006992e-06, + "loss": 0.0953, + "num_input_tokens_seen": 174458176, + "step": 143370 + }, + { + "epoch": 15.967813787726918, + "grad_norm": 0.009704536758363247, + "learning_rate": 5.939777421450446e-06, + "loss": 0.1324, + "num_input_tokens_seen": 174464160, + "step": 143375 + }, + { + "epoch": 15.968370642610536, + "grad_norm": 0.00020564740407280624, + "learning_rate": 5.938205238932973e-06, + "loss": 0.0756, + "num_input_tokens_seen": 174470592, + "step": 143380 + }, + { + "epoch": 15.968927497494153, + "grad_norm": 0.0001685360330156982, + "learning_rate": 5.936633236469425e-06, + "loss": 0.0017, + "num_input_tokens_seen": 174476512, + "step": 143385 + }, + { + "epoch": 15.969484352377771, + "grad_norm": 0.042897727340459824, + "learning_rate": 5.935061414074638e-06, + "loss": 0.0499, + "num_input_tokens_seen": 174482528, + "step": 143390 + }, + { + "epoch": 15.970041207261387, + "grad_norm": 0.04718097671866417, + "learning_rate": 5.933489771763481e-06, + "loss": 0.0649, + "num_input_tokens_seen": 174488864, + "step": 143395 + }, + { + "epoch": 15.970598062145005, + "grad_norm": 0.005790852475911379, + "learning_rate": 5.9319183095507876e-06, + "loss": 0.1114, + "num_input_tokens_seen": 174495040, + "step": 143400 + }, + { + "epoch": 15.971154917028622, + "grad_norm": 0.0001125583949033171, + "learning_rate": 5.930347027451405e-06, + "loss": 0.0722, + "num_input_tokens_seen": 174501280, + "step": 143405 + }, + { + "epoch": 15.97171177191224, + "grad_norm": 0.5384904146194458, + "learning_rate": 5.928775925480165e-06, + "loss": 0.0975, + "num_input_tokens_seen": 174507008, + "step": 143410 + }, + { + "epoch": 15.972268626795858, + "grad_norm": 0.06731544435024261, + "learning_rate": 5.9272050036519225e-06, + "loss": 0.0654, + "num_input_tokens_seen": 174513312, + "step": 143415 + }, + { + "epoch": 15.972825481679475, + "grad_norm": 1.6538821458816528, + "learning_rate": 5.925634261981502e-06, + "loss": 0.0285, + "num_input_tokens_seen": 174519488, + "step": 143420 + }, + { + "epoch": 15.973382336563091, + "grad_norm": 1.5392037630081177, + "learning_rate": 5.9240637004837615e-06, + "loss": 0.0688, + "num_input_tokens_seen": 174525760, + "step": 143425 + }, + { + "epoch": 15.973939191446709, + "grad_norm": 1.5185635089874268, + "learning_rate": 5.922493319173511e-06, + "loss": 0.0799, + "num_input_tokens_seen": 174531232, + "step": 143430 + }, + { + "epoch": 15.974496046330326, + "grad_norm": 0.00018318289949093014, + "learning_rate": 5.920923118065602e-06, + "loss": 0.0078, + "num_input_tokens_seen": 174537632, + "step": 143435 + }, + { + "epoch": 15.975052901213944, + "grad_norm": 0.026955045759677887, + "learning_rate": 5.919353097174851e-06, + "loss": 0.0023, + "num_input_tokens_seen": 174544064, + "step": 143440 + }, + { + "epoch": 15.975609756097562, + "grad_norm": 0.1284625083208084, + "learning_rate": 5.917783256516107e-06, + "loss": 0.0367, + "num_input_tokens_seen": 174549856, + "step": 143445 + }, + { + "epoch": 15.976166610981178, + "grad_norm": 1.8519443273544312, + "learning_rate": 5.916213596104189e-06, + "loss": 0.0799, + "num_input_tokens_seen": 174555776, + "step": 143450 + }, + { + "epoch": 15.976723465864795, + "grad_norm": 1.4822146892547607, + "learning_rate": 5.914644115953921e-06, + "loss": 0.122, + "num_input_tokens_seen": 174561728, + "step": 143455 + }, + { + "epoch": 15.977280320748413, + "grad_norm": 0.0012215405004099011, + "learning_rate": 5.9130748160801245e-06, + "loss": 0.0072, + "num_input_tokens_seen": 174568224, + "step": 143460 + }, + { + "epoch": 15.97783717563203, + "grad_norm": 0.04369307681918144, + "learning_rate": 5.911505696497635e-06, + "loss": 0.059, + "num_input_tokens_seen": 174574432, + "step": 143465 + }, + { + "epoch": 15.978394030515648, + "grad_norm": 0.11446060240268707, + "learning_rate": 5.909936757221268e-06, + "loss": 0.0184, + "num_input_tokens_seen": 174580352, + "step": 143470 + }, + { + "epoch": 15.978950885399264, + "grad_norm": 0.13646262884140015, + "learning_rate": 5.908367998265843e-06, + "loss": 0.007, + "num_input_tokens_seen": 174586304, + "step": 143475 + }, + { + "epoch": 15.979507740282882, + "grad_norm": 0.015651119872927666, + "learning_rate": 5.90679941964617e-06, + "loss": 0.0017, + "num_input_tokens_seen": 174592512, + "step": 143480 + }, + { + "epoch": 15.9800645951665, + "grad_norm": 0.06846904754638672, + "learning_rate": 5.905231021377081e-06, + "loss": 0.0047, + "num_input_tokens_seen": 174598592, + "step": 143485 + }, + { + "epoch": 15.980621450050117, + "grad_norm": 1.1073838472366333, + "learning_rate": 5.9036628034733785e-06, + "loss": 0.0526, + "num_input_tokens_seen": 174604800, + "step": 143490 + }, + { + "epoch": 15.981178304933735, + "grad_norm": 0.8355138301849365, + "learning_rate": 5.902094765949892e-06, + "loss": 0.0366, + "num_input_tokens_seen": 174610688, + "step": 143495 + }, + { + "epoch": 15.981735159817351, + "grad_norm": 2.5911648273468018, + "learning_rate": 5.900526908821408e-06, + "loss": 0.0164, + "num_input_tokens_seen": 174616736, + "step": 143500 + }, + { + "epoch": 15.982292014700969, + "grad_norm": 0.8833644986152649, + "learning_rate": 5.898959232102758e-06, + "loss": 0.0233, + "num_input_tokens_seen": 174622912, + "step": 143505 + }, + { + "epoch": 15.982848869584586, + "grad_norm": 0.1281275600194931, + "learning_rate": 5.8973917358087325e-06, + "loss": 0.0187, + "num_input_tokens_seen": 174629472, + "step": 143510 + }, + { + "epoch": 15.983405724468204, + "grad_norm": 1.346348762512207, + "learning_rate": 5.895824419954157e-06, + "loss": 0.0953, + "num_input_tokens_seen": 174635360, + "step": 143515 + }, + { + "epoch": 15.983962579351822, + "grad_norm": 0.42544278502464294, + "learning_rate": 5.8942572845538224e-06, + "loss": 0.0088, + "num_input_tokens_seen": 174641312, + "step": 143520 + }, + { + "epoch": 15.984519434235438, + "grad_norm": 0.30233559012413025, + "learning_rate": 5.892690329622538e-06, + "loss": 0.0164, + "num_input_tokens_seen": 174647264, + "step": 143525 + }, + { + "epoch": 15.985076289119055, + "grad_norm": 0.0399189218878746, + "learning_rate": 5.891123555175093e-06, + "loss": 0.0664, + "num_input_tokens_seen": 174652960, + "step": 143530 + }, + { + "epoch": 15.985633144002673, + "grad_norm": 0.007436907850205898, + "learning_rate": 5.889556961226303e-06, + "loss": 0.0154, + "num_input_tokens_seen": 174658944, + "step": 143535 + }, + { + "epoch": 15.98618999888629, + "grad_norm": 1.5814323425292969, + "learning_rate": 5.887990547790958e-06, + "loss": 0.0849, + "num_input_tokens_seen": 174665280, + "step": 143540 + }, + { + "epoch": 15.986746853769908, + "grad_norm": 0.2547690272331238, + "learning_rate": 5.886424314883857e-06, + "loss": 0.0334, + "num_input_tokens_seen": 174671520, + "step": 143545 + }, + { + "epoch": 15.987303708653524, + "grad_norm": 0.05619259551167488, + "learning_rate": 5.884858262519782e-06, + "loss": 0.0092, + "num_input_tokens_seen": 174677600, + "step": 143550 + }, + { + "epoch": 15.987860563537142, + "grad_norm": 0.011465881019830704, + "learning_rate": 5.883292390713544e-06, + "loss": 0.0456, + "num_input_tokens_seen": 174683872, + "step": 143555 + }, + { + "epoch": 15.98841741842076, + "grad_norm": 0.011214425787329674, + "learning_rate": 5.881726699479917e-06, + "loss": 0.0026, + "num_input_tokens_seen": 174690432, + "step": 143560 + }, + { + "epoch": 15.988974273304377, + "grad_norm": 0.02690954878926277, + "learning_rate": 5.880161188833708e-06, + "loss": 0.1096, + "num_input_tokens_seen": 174696512, + "step": 143565 + }, + { + "epoch": 15.989531128187995, + "grad_norm": 0.01536305621266365, + "learning_rate": 5.878595858789693e-06, + "loss": 0.0271, + "num_input_tokens_seen": 174702656, + "step": 143570 + }, + { + "epoch": 15.99008798307161, + "grad_norm": 0.0013916778843849897, + "learning_rate": 5.877030709362663e-06, + "loss": 0.035, + "num_input_tokens_seen": 174708928, + "step": 143575 + }, + { + "epoch": 15.990644837955228, + "grad_norm": 0.0018834193469956517, + "learning_rate": 5.875465740567396e-06, + "loss": 0.0065, + "num_input_tokens_seen": 174714976, + "step": 143580 + }, + { + "epoch": 15.991201692838846, + "grad_norm": 3.0032553672790527, + "learning_rate": 5.8739009524186704e-06, + "loss": 0.0198, + "num_input_tokens_seen": 174721376, + "step": 143585 + }, + { + "epoch": 15.991758547722464, + "grad_norm": 0.030767425894737244, + "learning_rate": 5.872336344931282e-06, + "loss": 0.0483, + "num_input_tokens_seen": 174727424, + "step": 143590 + }, + { + "epoch": 15.992315402606081, + "grad_norm": 2.3614206314086914, + "learning_rate": 5.870771918120002e-06, + "loss": 0.1256, + "num_input_tokens_seen": 174733312, + "step": 143595 + }, + { + "epoch": 15.992872257489697, + "grad_norm": 0.0011033023474738002, + "learning_rate": 5.869207671999607e-06, + "loss": 0.0251, + "num_input_tokens_seen": 174739424, + "step": 143600 + }, + { + "epoch": 15.993429112373315, + "grad_norm": 1.2606604099273682, + "learning_rate": 5.8676436065848665e-06, + "loss": 0.057, + "num_input_tokens_seen": 174745536, + "step": 143605 + }, + { + "epoch": 15.993985967256933, + "grad_norm": 0.02368025667965412, + "learning_rate": 5.866079721890566e-06, + "loss": 0.0297, + "num_input_tokens_seen": 174751712, + "step": 143610 + }, + { + "epoch": 15.99454282214055, + "grad_norm": 0.35410276055336, + "learning_rate": 5.864516017931473e-06, + "loss": 0.0307, + "num_input_tokens_seen": 174757664, + "step": 143615 + }, + { + "epoch": 15.995099677024168, + "grad_norm": 0.15496855974197388, + "learning_rate": 5.862952494722357e-06, + "loss": 0.0723, + "num_input_tokens_seen": 174763648, + "step": 143620 + }, + { + "epoch": 15.995656531907784, + "grad_norm": 0.003237155731767416, + "learning_rate": 5.861389152277979e-06, + "loss": 0.0302, + "num_input_tokens_seen": 174769728, + "step": 143625 + }, + { + "epoch": 15.996213386791402, + "grad_norm": 0.007233147043734789, + "learning_rate": 5.859825990613125e-06, + "loss": 0.003, + "num_input_tokens_seen": 174775744, + "step": 143630 + }, + { + "epoch": 15.99677024167502, + "grad_norm": 0.1286839097738266, + "learning_rate": 5.85826300974254e-06, + "loss": 0.0127, + "num_input_tokens_seen": 174781856, + "step": 143635 + }, + { + "epoch": 15.997327096558637, + "grad_norm": 3.1472904682159424, + "learning_rate": 5.85670020968101e-06, + "loss": 0.0179, + "num_input_tokens_seen": 174787584, + "step": 143640 + }, + { + "epoch": 15.997883951442255, + "grad_norm": 0.03463902696967125, + "learning_rate": 5.855137590443271e-06, + "loss": 0.0045, + "num_input_tokens_seen": 174794112, + "step": 143645 + }, + { + "epoch": 15.998440806325872, + "grad_norm": 0.000563771347515285, + "learning_rate": 5.853575152044102e-06, + "loss": 0.0022, + "num_input_tokens_seen": 174800192, + "step": 143650 + }, + { + "epoch": 15.998997661209488, + "grad_norm": 0.04975702613592148, + "learning_rate": 5.8520128944982515e-06, + "loss": 0.0519, + "num_input_tokens_seen": 174806368, + "step": 143655 + }, + { + "epoch": 15.999554516093106, + "grad_norm": 0.04671671241521835, + "learning_rate": 5.850450817820485e-06, + "loss": 0.0072, + "num_input_tokens_seen": 174812704, + "step": 143660 + }, + { + "epoch": 16.0, + "eval_loss": 0.08125761896371841, + "eval_runtime": 111.1922, + "eval_samples_per_second": 35.893, + "eval_steps_per_second": 8.975, + "num_input_tokens_seen": 174816592, + "step": 143664 + }, + { + "epoch": 16.00011137097672, + "grad_norm": 0.16450297832489014, + "learning_rate": 5.848888922025553e-06, + "loss": 0.0432, + "num_input_tokens_seen": 174817968, + "step": 143665 + }, + { + "epoch": 16.00066822586034, + "grad_norm": 0.10324287414550781, + "learning_rate": 5.847327207128209e-06, + "loss": 0.0465, + "num_input_tokens_seen": 174824272, + "step": 143670 + }, + { + "epoch": 16.001225080743957, + "grad_norm": 1.5602012872695923, + "learning_rate": 5.845765673143197e-06, + "loss": 0.0736, + "num_input_tokens_seen": 174830128, + "step": 143675 + }, + { + "epoch": 16.001781935627577, + "grad_norm": 0.020976804196834564, + "learning_rate": 5.8442043200852804e-06, + "loss": 0.0698, + "num_input_tokens_seen": 174836080, + "step": 143680 + }, + { + "epoch": 16.002338790511192, + "grad_norm": 1.5207483768463135, + "learning_rate": 5.842643147969204e-06, + "loss": 0.1858, + "num_input_tokens_seen": 174842160, + "step": 143685 + }, + { + "epoch": 16.00289564539481, + "grad_norm": 1.0178818702697754, + "learning_rate": 5.84108215680971e-06, + "loss": 0.0469, + "num_input_tokens_seen": 174848432, + "step": 143690 + }, + { + "epoch": 16.003452500278428, + "grad_norm": 0.06586151570081711, + "learning_rate": 5.839521346621537e-06, + "loss": 0.0425, + "num_input_tokens_seen": 174854512, + "step": 143695 + }, + { + "epoch": 16.004009355162044, + "grad_norm": 0.0023794241715222597, + "learning_rate": 5.837960717419444e-06, + "loss": 0.0005, + "num_input_tokens_seen": 174861072, + "step": 143700 + }, + { + "epoch": 16.004566210045663, + "grad_norm": 0.048557035624980927, + "learning_rate": 5.836400269218159e-06, + "loss": 0.1667, + "num_input_tokens_seen": 174867024, + "step": 143705 + }, + { + "epoch": 16.00512306492928, + "grad_norm": 1.650665283203125, + "learning_rate": 5.8348400020324325e-06, + "loss": 0.1253, + "num_input_tokens_seen": 174872976, + "step": 143710 + }, + { + "epoch": 16.0056799198129, + "grad_norm": 0.00013075588503852487, + "learning_rate": 5.8332799158769965e-06, + "loss": 0.001, + "num_input_tokens_seen": 174879312, + "step": 143715 + }, + { + "epoch": 16.006236774696514, + "grad_norm": 1.0610510110855103, + "learning_rate": 5.831720010766589e-06, + "loss": 0.0091, + "num_input_tokens_seen": 174885488, + "step": 143720 + }, + { + "epoch": 16.00679362958013, + "grad_norm": 0.015689626336097717, + "learning_rate": 5.830160286715936e-06, + "loss": 0.0134, + "num_input_tokens_seen": 174891664, + "step": 143725 + }, + { + "epoch": 16.00735048446375, + "grad_norm": 0.014204631559550762, + "learning_rate": 5.828600743739784e-06, + "loss": 0.0712, + "num_input_tokens_seen": 174897648, + "step": 143730 + }, + { + "epoch": 16.007907339347366, + "grad_norm": 0.03901144117116928, + "learning_rate": 5.82704138185286e-06, + "loss": 0.0196, + "num_input_tokens_seen": 174904112, + "step": 143735 + }, + { + "epoch": 16.008464194230985, + "grad_norm": 0.018057996407151222, + "learning_rate": 5.8254822010698875e-06, + "loss": 0.0494, + "num_input_tokens_seen": 174910256, + "step": 143740 + }, + { + "epoch": 16.0090210491146, + "grad_norm": 0.0022721290588378906, + "learning_rate": 5.823923201405596e-06, + "loss": 0.0094, + "num_input_tokens_seen": 174916496, + "step": 143745 + }, + { + "epoch": 16.009577903998217, + "grad_norm": 0.048152875155210495, + "learning_rate": 5.822364382874715e-06, + "loss": 0.0803, + "num_input_tokens_seen": 174922256, + "step": 143750 + }, + { + "epoch": 16.010134758881836, + "grad_norm": 0.1388707160949707, + "learning_rate": 5.820805745491964e-06, + "loss": 0.0031, + "num_input_tokens_seen": 174928336, + "step": 143755 + }, + { + "epoch": 16.010691613765452, + "grad_norm": 1.23391592502594, + "learning_rate": 5.819247289272081e-06, + "loss": 0.065, + "num_input_tokens_seen": 174934288, + "step": 143760 + }, + { + "epoch": 16.01124846864907, + "grad_norm": 0.5120510458946228, + "learning_rate": 5.817689014229763e-06, + "loss": 0.0862, + "num_input_tokens_seen": 174940720, + "step": 143765 + }, + { + "epoch": 16.011805323532688, + "grad_norm": 0.0009936789283528924, + "learning_rate": 5.816130920379745e-06, + "loss": 0.0015, + "num_input_tokens_seen": 174947024, + "step": 143770 + }, + { + "epoch": 16.012362178416303, + "grad_norm": 0.46989837288856506, + "learning_rate": 5.814573007736734e-06, + "loss": 0.0785, + "num_input_tokens_seen": 174953232, + "step": 143775 + }, + { + "epoch": 16.012919033299923, + "grad_norm": 0.3109969198703766, + "learning_rate": 5.813015276315461e-06, + "loss": 0.1491, + "num_input_tokens_seen": 174959728, + "step": 143780 + }, + { + "epoch": 16.01347588818354, + "grad_norm": 0.00807380024343729, + "learning_rate": 5.811457726130629e-06, + "loss": 0.0462, + "num_input_tokens_seen": 174965776, + "step": 143785 + }, + { + "epoch": 16.01403274306716, + "grad_norm": 0.00017883542750496417, + "learning_rate": 5.809900357196954e-06, + "loss": 0.1154, + "num_input_tokens_seen": 174971824, + "step": 143790 + }, + { + "epoch": 16.014589597950774, + "grad_norm": 0.00010903587826760486, + "learning_rate": 5.808343169529137e-06, + "loss": 0.0145, + "num_input_tokens_seen": 174978000, + "step": 143795 + }, + { + "epoch": 16.01514645283439, + "grad_norm": 0.10924234241247177, + "learning_rate": 5.806786163141903e-06, + "loss": 0.0061, + "num_input_tokens_seen": 174984176, + "step": 143800 + }, + { + "epoch": 16.01570330771801, + "grad_norm": 0.0029180380515754223, + "learning_rate": 5.80522933804995e-06, + "loss": 0.1625, + "num_input_tokens_seen": 174989616, + "step": 143805 + }, + { + "epoch": 16.016260162601625, + "grad_norm": 0.0003299087402410805, + "learning_rate": 5.803672694267984e-06, + "loss": 0.0043, + "num_input_tokens_seen": 174995152, + "step": 143810 + }, + { + "epoch": 16.016817017485245, + "grad_norm": 0.008814660832285881, + "learning_rate": 5.802116231810703e-06, + "loss": 0.1116, + "num_input_tokens_seen": 175001552, + "step": 143815 + }, + { + "epoch": 16.01737387236886, + "grad_norm": 0.028397487476468086, + "learning_rate": 5.800559950692822e-06, + "loss": 0.0154, + "num_input_tokens_seen": 175007792, + "step": 143820 + }, + { + "epoch": 16.017930727252477, + "grad_norm": 0.03438045084476471, + "learning_rate": 5.7990038509290275e-06, + "loss": 0.0661, + "num_input_tokens_seen": 175013968, + "step": 143825 + }, + { + "epoch": 16.018487582136096, + "grad_norm": 1.589232325553894, + "learning_rate": 5.7974479325340394e-06, + "loss": 0.1158, + "num_input_tokens_seen": 175020048, + "step": 143830 + }, + { + "epoch": 16.019044437019712, + "grad_norm": 1.0684864521026611, + "learning_rate": 5.795892195522526e-06, + "loss": 0.0266, + "num_input_tokens_seen": 175025872, + "step": 143835 + }, + { + "epoch": 16.01960129190333, + "grad_norm": 0.06531299650669098, + "learning_rate": 5.794336639909204e-06, + "loss": 0.0196, + "num_input_tokens_seen": 175032240, + "step": 143840 + }, + { + "epoch": 16.020158146786947, + "grad_norm": 0.0009469427750445902, + "learning_rate": 5.792781265708752e-06, + "loss": 0.0337, + "num_input_tokens_seen": 175038640, + "step": 143845 + }, + { + "epoch": 16.020715001670563, + "grad_norm": 0.19768312573432922, + "learning_rate": 5.791226072935879e-06, + "loss": 0.0492, + "num_input_tokens_seen": 175044944, + "step": 143850 + }, + { + "epoch": 16.021271856554183, + "grad_norm": 0.39171576499938965, + "learning_rate": 5.789671061605265e-06, + "loss": 0.0101, + "num_input_tokens_seen": 175050896, + "step": 143855 + }, + { + "epoch": 16.0218287114378, + "grad_norm": 0.7088553309440613, + "learning_rate": 5.7881162317315966e-06, + "loss": 0.0128, + "num_input_tokens_seen": 175057232, + "step": 143860 + }, + { + "epoch": 16.022385566321418, + "grad_norm": 0.06952648609876633, + "learning_rate": 5.786561583329558e-06, + "loss": 0.3531, + "num_input_tokens_seen": 175062960, + "step": 143865 + }, + { + "epoch": 16.022942421205034, + "grad_norm": 0.14521870017051697, + "learning_rate": 5.785007116413843e-06, + "loss": 0.0506, + "num_input_tokens_seen": 175069008, + "step": 143870 + }, + { + "epoch": 16.02349927608865, + "grad_norm": 1.0663201808929443, + "learning_rate": 5.783452830999134e-06, + "loss": 0.0395, + "num_input_tokens_seen": 175075184, + "step": 143875 + }, + { + "epoch": 16.02405613097227, + "grad_norm": 3.1794965267181396, + "learning_rate": 5.781898727100107e-06, + "loss": 0.0975, + "num_input_tokens_seen": 175081520, + "step": 143880 + }, + { + "epoch": 16.024612985855885, + "grad_norm": 0.020698759704828262, + "learning_rate": 5.780344804731438e-06, + "loss": 0.0227, + "num_input_tokens_seen": 175087856, + "step": 143885 + }, + { + "epoch": 16.025169840739505, + "grad_norm": 0.4980488121509552, + "learning_rate": 5.778791063907818e-06, + "loss": 0.0701, + "num_input_tokens_seen": 175094128, + "step": 143890 + }, + { + "epoch": 16.02572669562312, + "grad_norm": 0.019427593797445297, + "learning_rate": 5.777237504643907e-06, + "loss": 0.0206, + "num_input_tokens_seen": 175100048, + "step": 143895 + }, + { + "epoch": 16.026283550506736, + "grad_norm": 0.06296281516551971, + "learning_rate": 5.7756841269544035e-06, + "loss": 0.0143, + "num_input_tokens_seen": 175106096, + "step": 143900 + }, + { + "epoch": 16.026840405390356, + "grad_norm": 0.4611305296421051, + "learning_rate": 5.774130930853952e-06, + "loss": 0.0262, + "num_input_tokens_seen": 175112144, + "step": 143905 + }, + { + "epoch": 16.027397260273972, + "grad_norm": 0.5623427033424377, + "learning_rate": 5.772577916357247e-06, + "loss": 0.1098, + "num_input_tokens_seen": 175118160, + "step": 143910 + }, + { + "epoch": 16.02795411515759, + "grad_norm": 0.5702061057090759, + "learning_rate": 5.771025083478937e-06, + "loss": 0.0694, + "num_input_tokens_seen": 175124528, + "step": 143915 + }, + { + "epoch": 16.028510970041207, + "grad_norm": 0.0055732447654008865, + "learning_rate": 5.76947243223371e-06, + "loss": 0.0753, + "num_input_tokens_seen": 175130576, + "step": 143920 + }, + { + "epoch": 16.029067824924823, + "grad_norm": 0.38308408856391907, + "learning_rate": 5.767919962636223e-06, + "loss": 0.0265, + "num_input_tokens_seen": 175137040, + "step": 143925 + }, + { + "epoch": 16.029624679808443, + "grad_norm": 2.898359775543213, + "learning_rate": 5.766367674701142e-06, + "loss": 0.2146, + "num_input_tokens_seen": 175143056, + "step": 143930 + }, + { + "epoch": 16.03018153469206, + "grad_norm": 0.03745463490486145, + "learning_rate": 5.7648155684431185e-06, + "loss": 0.0436, + "num_input_tokens_seen": 175149168, + "step": 143935 + }, + { + "epoch": 16.030738389575678, + "grad_norm": 0.0967043936252594, + "learning_rate": 5.7632636438768314e-06, + "loss": 0.076, + "num_input_tokens_seen": 175154832, + "step": 143940 + }, + { + "epoch": 16.031295244459294, + "grad_norm": 1.1946102380752563, + "learning_rate": 5.761711901016931e-06, + "loss": 0.0801, + "num_input_tokens_seen": 175160784, + "step": 143945 + }, + { + "epoch": 16.03185209934291, + "grad_norm": 1.9196287393569946, + "learning_rate": 5.7601603398780764e-06, + "loss": 0.1621, + "num_input_tokens_seen": 175167024, + "step": 143950 + }, + { + "epoch": 16.03240895422653, + "grad_norm": 1.3669297695159912, + "learning_rate": 5.758608960474915e-06, + "loss": 0.1017, + "num_input_tokens_seen": 175172784, + "step": 143955 + }, + { + "epoch": 16.032965809110145, + "grad_norm": 0.08467627316713333, + "learning_rate": 5.757057762822113e-06, + "loss": 0.0183, + "num_input_tokens_seen": 175178800, + "step": 143960 + }, + { + "epoch": 16.033522663993764, + "grad_norm": 0.1143302470445633, + "learning_rate": 5.755506746934311e-06, + "loss": 0.0099, + "num_input_tokens_seen": 175184880, + "step": 143965 + }, + { + "epoch": 16.03407951887738, + "grad_norm": 0.6282379031181335, + "learning_rate": 5.753955912826173e-06, + "loss": 0.0237, + "num_input_tokens_seen": 175190864, + "step": 143970 + }, + { + "epoch": 16.034636373760996, + "grad_norm": 1.5847439765930176, + "learning_rate": 5.752405260512342e-06, + "loss": 0.0829, + "num_input_tokens_seen": 175197040, + "step": 143975 + }, + { + "epoch": 16.035193228644616, + "grad_norm": 0.9397304058074951, + "learning_rate": 5.750854790007465e-06, + "loss": 0.0897, + "num_input_tokens_seen": 175203088, + "step": 143980 + }, + { + "epoch": 16.03575008352823, + "grad_norm": 0.0007260633865371346, + "learning_rate": 5.749304501326186e-06, + "loss": 0.0194, + "num_input_tokens_seen": 175209168, + "step": 143985 + }, + { + "epoch": 16.03630693841185, + "grad_norm": 0.024443913251161575, + "learning_rate": 5.747754394483141e-06, + "loss": 0.0086, + "num_input_tokens_seen": 175215504, + "step": 143990 + }, + { + "epoch": 16.036863793295467, + "grad_norm": 0.0008269868558272719, + "learning_rate": 5.7462044694929886e-06, + "loss": 0.0921, + "num_input_tokens_seen": 175221712, + "step": 143995 + }, + { + "epoch": 16.037420648179083, + "grad_norm": 0.0011295631993561983, + "learning_rate": 5.744654726370361e-06, + "loss": 0.0386, + "num_input_tokens_seen": 175227760, + "step": 144000 + }, + { + "epoch": 16.037977503062702, + "grad_norm": 0.0004769874212797731, + "learning_rate": 5.743105165129897e-06, + "loss": 0.213, + "num_input_tokens_seen": 175234160, + "step": 144005 + }, + { + "epoch": 16.038534357946318, + "grad_norm": 2.854091167449951, + "learning_rate": 5.741555785786224e-06, + "loss": 0.1048, + "num_input_tokens_seen": 175240080, + "step": 144010 + }, + { + "epoch": 16.039091212829938, + "grad_norm": 0.05015980452299118, + "learning_rate": 5.740006588353997e-06, + "loss": 0.032, + "num_input_tokens_seen": 175246352, + "step": 144015 + }, + { + "epoch": 16.039648067713554, + "grad_norm": 0.009784809313714504, + "learning_rate": 5.738457572847836e-06, + "loss": 0.0056, + "num_input_tokens_seen": 175252656, + "step": 144020 + }, + { + "epoch": 16.04020492259717, + "grad_norm": 0.0817052349448204, + "learning_rate": 5.736908739282373e-06, + "loss": 0.099, + "num_input_tokens_seen": 175258736, + "step": 144025 + }, + { + "epoch": 16.04076177748079, + "grad_norm": 0.051484279334545135, + "learning_rate": 5.735360087672237e-06, + "loss": 0.0615, + "num_input_tokens_seen": 175265040, + "step": 144030 + }, + { + "epoch": 16.041318632364405, + "grad_norm": 0.006681842263787985, + "learning_rate": 5.733811618032064e-06, + "loss": 0.0222, + "num_input_tokens_seen": 175271152, + "step": 144035 + }, + { + "epoch": 16.041875487248024, + "grad_norm": 0.0328117311000824, + "learning_rate": 5.732263330376472e-06, + "loss": 0.0288, + "num_input_tokens_seen": 175277616, + "step": 144040 + }, + { + "epoch": 16.04243234213164, + "grad_norm": 0.7713791728019714, + "learning_rate": 5.730715224720101e-06, + "loss": 0.0642, + "num_input_tokens_seen": 175283632, + "step": 144045 + }, + { + "epoch": 16.042989197015256, + "grad_norm": 0.0002193265681853518, + "learning_rate": 5.729167301077551e-06, + "loss": 0.0112, + "num_input_tokens_seen": 175289840, + "step": 144050 + }, + { + "epoch": 16.043546051898876, + "grad_norm": 0.0078070685267448425, + "learning_rate": 5.727619559463462e-06, + "loss": 0.0299, + "num_input_tokens_seen": 175295856, + "step": 144055 + }, + { + "epoch": 16.04410290678249, + "grad_norm": 0.0004950582515448332, + "learning_rate": 5.7260719998924406e-06, + "loss": 0.0017, + "num_input_tokens_seen": 175302160, + "step": 144060 + }, + { + "epoch": 16.04465976166611, + "grad_norm": 0.9199122786521912, + "learning_rate": 5.724524622379118e-06, + "loss": 0.1557, + "num_input_tokens_seen": 175308400, + "step": 144065 + }, + { + "epoch": 16.045216616549727, + "grad_norm": 1.0250166654586792, + "learning_rate": 5.7229774269381035e-06, + "loss": 0.0533, + "num_input_tokens_seen": 175314576, + "step": 144070 + }, + { + "epoch": 16.045773471433346, + "grad_norm": 0.0003087028453592211, + "learning_rate": 5.721430413584009e-06, + "loss": 0.0135, + "num_input_tokens_seen": 175320368, + "step": 144075 + }, + { + "epoch": 16.046330326316962, + "grad_norm": 0.5771805047988892, + "learning_rate": 5.719883582331445e-06, + "loss": 0.0507, + "num_input_tokens_seen": 175326672, + "step": 144080 + }, + { + "epoch": 16.046887181200578, + "grad_norm": 0.05889574810862541, + "learning_rate": 5.718336933195034e-06, + "loss": 0.0938, + "num_input_tokens_seen": 175332976, + "step": 144085 + }, + { + "epoch": 16.047444036084197, + "grad_norm": 0.0004748845531139523, + "learning_rate": 5.716790466189381e-06, + "loss": 0.0104, + "num_input_tokens_seen": 175339088, + "step": 144090 + }, + { + "epoch": 16.048000890967813, + "grad_norm": 0.04674446955323219, + "learning_rate": 5.715244181329091e-06, + "loss": 0.0679, + "num_input_tokens_seen": 175345264, + "step": 144095 + }, + { + "epoch": 16.048557745851433, + "grad_norm": 0.005651551298797131, + "learning_rate": 5.713698078628763e-06, + "loss": 0.0041, + "num_input_tokens_seen": 175351184, + "step": 144100 + }, + { + "epoch": 16.04911460073505, + "grad_norm": 0.0002484604774508625, + "learning_rate": 5.712152158103012e-06, + "loss": 0.0428, + "num_input_tokens_seen": 175357264, + "step": 144105 + }, + { + "epoch": 16.049671455618665, + "grad_norm": 1.4566571712493896, + "learning_rate": 5.710606419766434e-06, + "loss": 0.0368, + "num_input_tokens_seen": 175363152, + "step": 144110 + }, + { + "epoch": 16.050228310502284, + "grad_norm": 0.2187858521938324, + "learning_rate": 5.709060863633639e-06, + "loss": 0.0144, + "num_input_tokens_seen": 175369328, + "step": 144115 + }, + { + "epoch": 16.0507851653859, + "grad_norm": 0.0006206392426975071, + "learning_rate": 5.7075154897192205e-06, + "loss": 0.0169, + "num_input_tokens_seen": 175375120, + "step": 144120 + }, + { + "epoch": 16.05134202026952, + "grad_norm": 0.0037971464917063713, + "learning_rate": 5.705970298037774e-06, + "loss": 0.0807, + "num_input_tokens_seen": 175381488, + "step": 144125 + }, + { + "epoch": 16.051898875153135, + "grad_norm": 0.10303275287151337, + "learning_rate": 5.70442528860389e-06, + "loss": 0.0033, + "num_input_tokens_seen": 175387376, + "step": 144130 + }, + { + "epoch": 16.05245573003675, + "grad_norm": 1.0105702877044678, + "learning_rate": 5.702880461432175e-06, + "loss": 0.0457, + "num_input_tokens_seen": 175393936, + "step": 144135 + }, + { + "epoch": 16.05301258492037, + "grad_norm": 0.038194239139556885, + "learning_rate": 5.701335816537215e-06, + "loss": 0.0054, + "num_input_tokens_seen": 175399280, + "step": 144140 + }, + { + "epoch": 16.053569439803987, + "grad_norm": 0.5941223502159119, + "learning_rate": 5.6997913539335975e-06, + "loss": 0.0114, + "num_input_tokens_seen": 175405360, + "step": 144145 + }, + { + "epoch": 16.054126294687606, + "grad_norm": 0.008319323882460594, + "learning_rate": 5.69824707363591e-06, + "loss": 0.0108, + "num_input_tokens_seen": 175411344, + "step": 144150 + }, + { + "epoch": 16.054683149571222, + "grad_norm": 0.05599946901202202, + "learning_rate": 5.696702975658749e-06, + "loss": 0.0035, + "num_input_tokens_seen": 175417584, + "step": 144155 + }, + { + "epoch": 16.055240004454838, + "grad_norm": 1.5536763668060303, + "learning_rate": 5.695159060016686e-06, + "loss": 0.0287, + "num_input_tokens_seen": 175423728, + "step": 144160 + }, + { + "epoch": 16.055796859338457, + "grad_norm": 0.01746593788266182, + "learning_rate": 5.6936153267243274e-06, + "loss": 0.0446, + "num_input_tokens_seen": 175429840, + "step": 144165 + }, + { + "epoch": 16.056353714222073, + "grad_norm": 0.005780518986284733, + "learning_rate": 5.692071775796226e-06, + "loss": 0.0138, + "num_input_tokens_seen": 175435824, + "step": 144170 + }, + { + "epoch": 16.056910569105693, + "grad_norm": 1.8417174816131592, + "learning_rate": 5.690528407246984e-06, + "loss": 0.0326, + "num_input_tokens_seen": 175442032, + "step": 144175 + }, + { + "epoch": 16.05746742398931, + "grad_norm": 0.0990457683801651, + "learning_rate": 5.688985221091162e-06, + "loss": 0.007, + "num_input_tokens_seen": 175448432, + "step": 144180 + }, + { + "epoch": 16.058024278872924, + "grad_norm": 0.00717745628207922, + "learning_rate": 5.687442217343356e-06, + "loss": 0.0733, + "num_input_tokens_seen": 175454672, + "step": 144185 + }, + { + "epoch": 16.058581133756544, + "grad_norm": 0.0005763964727520943, + "learning_rate": 5.68589939601813e-06, + "loss": 0.0074, + "num_input_tokens_seen": 175460560, + "step": 144190 + }, + { + "epoch": 16.05913798864016, + "grad_norm": 0.1470128446817398, + "learning_rate": 5.6843567571300576e-06, + "loss": 0.0711, + "num_input_tokens_seen": 175466576, + "step": 144195 + }, + { + "epoch": 16.05969484352378, + "grad_norm": 0.08159621059894562, + "learning_rate": 5.682814300693706e-06, + "loss": 0.0016, + "num_input_tokens_seen": 175472720, + "step": 144200 + }, + { + "epoch": 16.060251698407395, + "grad_norm": 0.07999612390995026, + "learning_rate": 5.681272026723655e-06, + "loss": 0.0011, + "num_input_tokens_seen": 175478512, + "step": 144205 + }, + { + "epoch": 16.06080855329101, + "grad_norm": 8.952752250479534e-05, + "learning_rate": 5.6797299352344704e-06, + "loss": 0.0008, + "num_input_tokens_seen": 175484720, + "step": 144210 + }, + { + "epoch": 16.06136540817463, + "grad_norm": 0.0023817666806280613, + "learning_rate": 5.678188026240714e-06, + "loss": 0.0263, + "num_input_tokens_seen": 175490768, + "step": 144215 + }, + { + "epoch": 16.061922263058246, + "grad_norm": 0.15310978889465332, + "learning_rate": 5.676646299756944e-06, + "loss": 0.0173, + "num_input_tokens_seen": 175496848, + "step": 144220 + }, + { + "epoch": 16.062479117941866, + "grad_norm": 0.0009683467214927077, + "learning_rate": 5.675104755797739e-06, + "loss": 0.0027, + "num_input_tokens_seen": 175502960, + "step": 144225 + }, + { + "epoch": 16.06303597282548, + "grad_norm": 0.0305156409740448, + "learning_rate": 5.673563394377646e-06, + "loss": 0.0585, + "num_input_tokens_seen": 175509072, + "step": 144230 + }, + { + "epoch": 16.063592827709098, + "grad_norm": 0.48810875415802, + "learning_rate": 5.672022215511244e-06, + "loss": 0.037, + "num_input_tokens_seen": 175515632, + "step": 144235 + }, + { + "epoch": 16.064149682592717, + "grad_norm": 0.8762224912643433, + "learning_rate": 5.670481219213064e-06, + "loss": 0.0213, + "num_input_tokens_seen": 175521712, + "step": 144240 + }, + { + "epoch": 16.064706537476333, + "grad_norm": 1.7777243852615356, + "learning_rate": 5.668940405497683e-06, + "loss": 0.053, + "num_input_tokens_seen": 175527344, + "step": 144245 + }, + { + "epoch": 16.065263392359952, + "grad_norm": 0.003020663745701313, + "learning_rate": 5.667399774379642e-06, + "loss": 0.006, + "num_input_tokens_seen": 175533520, + "step": 144250 + }, + { + "epoch": 16.06582024724357, + "grad_norm": 0.14402726292610168, + "learning_rate": 5.665859325873504e-06, + "loss": 0.0064, + "num_input_tokens_seen": 175539152, + "step": 144255 + }, + { + "epoch": 16.066377102127184, + "grad_norm": 0.12507101893424988, + "learning_rate": 5.664319059993814e-06, + "loss": 0.0536, + "num_input_tokens_seen": 175545072, + "step": 144260 + }, + { + "epoch": 16.066933957010804, + "grad_norm": 0.0005521551356650889, + "learning_rate": 5.662778976755123e-06, + "loss": 0.075, + "num_input_tokens_seen": 175551440, + "step": 144265 + }, + { + "epoch": 16.06749081189442, + "grad_norm": 0.06348524987697601, + "learning_rate": 5.6612390761719705e-06, + "loss": 0.0036, + "num_input_tokens_seen": 175557584, + "step": 144270 + }, + { + "epoch": 16.06804766677804, + "grad_norm": 9.840302664088085e-05, + "learning_rate": 5.659699358258916e-06, + "loss": 0.0134, + "num_input_tokens_seen": 175563856, + "step": 144275 + }, + { + "epoch": 16.068604521661655, + "grad_norm": 0.0028061210177838802, + "learning_rate": 5.658159823030496e-06, + "loss": 0.0334, + "num_input_tokens_seen": 175570352, + "step": 144280 + }, + { + "epoch": 16.06916137654527, + "grad_norm": 0.2381417453289032, + "learning_rate": 5.656620470501253e-06, + "loss": 0.0097, + "num_input_tokens_seen": 175576880, + "step": 144285 + }, + { + "epoch": 16.06971823142889, + "grad_norm": 1.8182268142700195, + "learning_rate": 5.655081300685722e-06, + "loss": 0.0648, + "num_input_tokens_seen": 175582608, + "step": 144290 + }, + { + "epoch": 16.070275086312506, + "grad_norm": 0.06043173372745514, + "learning_rate": 5.653542313598451e-06, + "loss": 0.0365, + "num_input_tokens_seen": 175589232, + "step": 144295 + }, + { + "epoch": 16.070831941196126, + "grad_norm": 0.19271251559257507, + "learning_rate": 5.652003509253967e-06, + "loss": 0.0195, + "num_input_tokens_seen": 175595344, + "step": 144300 + }, + { + "epoch": 16.07138879607974, + "grad_norm": 0.1653531938791275, + "learning_rate": 5.6504648876668205e-06, + "loss": 0.0102, + "num_input_tokens_seen": 175601712, + "step": 144305 + }, + { + "epoch": 16.071945650963357, + "grad_norm": 2.013388156890869, + "learning_rate": 5.648926448851533e-06, + "loss": 0.0643, + "num_input_tokens_seen": 175607952, + "step": 144310 + }, + { + "epoch": 16.072502505846977, + "grad_norm": 0.006278737913817167, + "learning_rate": 5.647388192822639e-06, + "loss": 0.0155, + "num_input_tokens_seen": 175614256, + "step": 144315 + }, + { + "epoch": 16.073059360730593, + "grad_norm": 0.015818379819393158, + "learning_rate": 5.645850119594662e-06, + "loss": 0.0391, + "num_input_tokens_seen": 175620560, + "step": 144320 + }, + { + "epoch": 16.073616215614212, + "grad_norm": 0.1783296912908554, + "learning_rate": 5.644312229182144e-06, + "loss": 0.0853, + "num_input_tokens_seen": 175626768, + "step": 144325 + }, + { + "epoch": 16.074173070497828, + "grad_norm": 1.323687195777893, + "learning_rate": 5.642774521599606e-06, + "loss": 0.1017, + "num_input_tokens_seen": 175633104, + "step": 144330 + }, + { + "epoch": 16.074729925381444, + "grad_norm": 0.03935398533940315, + "learning_rate": 5.641236996861571e-06, + "loss": 0.0252, + "num_input_tokens_seen": 175638256, + "step": 144335 + }, + { + "epoch": 16.075286780265063, + "grad_norm": 0.001287570921704173, + "learning_rate": 5.6396996549825525e-06, + "loss": 0.0546, + "num_input_tokens_seen": 175643824, + "step": 144340 + }, + { + "epoch": 16.07584363514868, + "grad_norm": 0.0017239198787137866, + "learning_rate": 5.638162495977092e-06, + "loss": 0.2426, + "num_input_tokens_seen": 175650224, + "step": 144345 + }, + { + "epoch": 16.0764004900323, + "grad_norm": 0.00013194530038163066, + "learning_rate": 5.636625519859698e-06, + "loss": 0.0133, + "num_input_tokens_seen": 175656240, + "step": 144350 + }, + { + "epoch": 16.076957344915915, + "grad_norm": 0.5243756771087646, + "learning_rate": 5.635088726644891e-06, + "loss": 0.087, + "num_input_tokens_seen": 175662128, + "step": 144355 + }, + { + "epoch": 16.07751419979953, + "grad_norm": 0.5581670999526978, + "learning_rate": 5.633552116347177e-06, + "loss": 0.0478, + "num_input_tokens_seen": 175668240, + "step": 144360 + }, + { + "epoch": 16.07807105468315, + "grad_norm": 0.06708139181137085, + "learning_rate": 5.6320156889810875e-06, + "loss": 0.0137, + "num_input_tokens_seen": 175674480, + "step": 144365 + }, + { + "epoch": 16.078627909566766, + "grad_norm": 0.056173183023929596, + "learning_rate": 5.6304794445611205e-06, + "loss": 0.1053, + "num_input_tokens_seen": 175680656, + "step": 144370 + }, + { + "epoch": 16.079184764450385, + "grad_norm": 0.06300602108240128, + "learning_rate": 5.628943383101801e-06, + "loss": 0.0097, + "num_input_tokens_seen": 175686800, + "step": 144375 + }, + { + "epoch": 16.079741619334, + "grad_norm": 0.013668899424374104, + "learning_rate": 5.627407504617629e-06, + "loss": 0.0218, + "num_input_tokens_seen": 175692912, + "step": 144380 + }, + { + "epoch": 16.080298474217617, + "grad_norm": 0.0005084577715024352, + "learning_rate": 5.625871809123115e-06, + "loss": 0.0323, + "num_input_tokens_seen": 175698608, + "step": 144385 + }, + { + "epoch": 16.080855329101237, + "grad_norm": 0.004440456163138151, + "learning_rate": 5.6243362966327565e-06, + "loss": 0.0343, + "num_input_tokens_seen": 175704592, + "step": 144390 + }, + { + "epoch": 16.081412183984853, + "grad_norm": 0.8615488409996033, + "learning_rate": 5.622800967161074e-06, + "loss": 0.0686, + "num_input_tokens_seen": 175710672, + "step": 144395 + }, + { + "epoch": 16.081969038868472, + "grad_norm": 0.00043736814404837787, + "learning_rate": 5.62126582072256e-06, + "loss": 0.0904, + "num_input_tokens_seen": 175717072, + "step": 144400 + }, + { + "epoch": 16.082525893752088, + "grad_norm": 0.08939111232757568, + "learning_rate": 5.619730857331718e-06, + "loss": 0.0713, + "num_input_tokens_seen": 175722928, + "step": 144405 + }, + { + "epoch": 16.083082748635707, + "grad_norm": 0.0001951921876752749, + "learning_rate": 5.618196077003043e-06, + "loss": 0.031, + "num_input_tokens_seen": 175729296, + "step": 144410 + }, + { + "epoch": 16.083639603519323, + "grad_norm": 0.07817628234624863, + "learning_rate": 5.616661479751029e-06, + "loss": 0.0088, + "num_input_tokens_seen": 175735568, + "step": 144415 + }, + { + "epoch": 16.08419645840294, + "grad_norm": 0.10530110448598862, + "learning_rate": 5.615127065590184e-06, + "loss": 0.1036, + "num_input_tokens_seen": 175741936, + "step": 144420 + }, + { + "epoch": 16.08475331328656, + "grad_norm": 0.0017710509710013866, + "learning_rate": 5.6135928345349945e-06, + "loss": 0.0177, + "num_input_tokens_seen": 175748048, + "step": 144425 + }, + { + "epoch": 16.085310168170174, + "grad_norm": 0.0289764441549778, + "learning_rate": 5.612058786599953e-06, + "loss": 0.0575, + "num_input_tokens_seen": 175753936, + "step": 144430 + }, + { + "epoch": 16.085867023053794, + "grad_norm": 0.11797340214252472, + "learning_rate": 5.61052492179954e-06, + "loss": 0.0097, + "num_input_tokens_seen": 175760208, + "step": 144435 + }, + { + "epoch": 16.08642387793741, + "grad_norm": 0.01739841140806675, + "learning_rate": 5.608991240148265e-06, + "loss": 0.0306, + "num_input_tokens_seen": 175766544, + "step": 144440 + }, + { + "epoch": 16.086980732821026, + "grad_norm": 0.012714557349681854, + "learning_rate": 5.607457741660593e-06, + "loss": 0.0025, + "num_input_tokens_seen": 175772624, + "step": 144445 + }, + { + "epoch": 16.087537587704645, + "grad_norm": 0.17344322800636292, + "learning_rate": 5.605924426351036e-06, + "loss": 0.0054, + "num_input_tokens_seen": 175778544, + "step": 144450 + }, + { + "epoch": 16.08809444258826, + "grad_norm": 0.011259474791586399, + "learning_rate": 5.604391294234046e-06, + "loss": 0.0435, + "num_input_tokens_seen": 175784464, + "step": 144455 + }, + { + "epoch": 16.08865129747188, + "grad_norm": 1.3406236171722412, + "learning_rate": 5.602858345324125e-06, + "loss": 0.1354, + "num_input_tokens_seen": 175790672, + "step": 144460 + }, + { + "epoch": 16.089208152355496, + "grad_norm": 0.025818930938839912, + "learning_rate": 5.601325579635744e-06, + "loss": 0.0219, + "num_input_tokens_seen": 175796784, + "step": 144465 + }, + { + "epoch": 16.089765007239112, + "grad_norm": 0.0036262799985706806, + "learning_rate": 5.5997929971833895e-06, + "loss": 0.0042, + "num_input_tokens_seen": 175802800, + "step": 144470 + }, + { + "epoch": 16.09032186212273, + "grad_norm": 0.047438450157642365, + "learning_rate": 5.598260597981534e-06, + "loss": 0.0242, + "num_input_tokens_seen": 175808624, + "step": 144475 + }, + { + "epoch": 16.090878717006348, + "grad_norm": 0.019788620993494987, + "learning_rate": 5.596728382044652e-06, + "loss": 0.028, + "num_input_tokens_seen": 175814768, + "step": 144480 + }, + { + "epoch": 16.091435571889967, + "grad_norm": 2.4320027828216553, + "learning_rate": 5.595196349387208e-06, + "loss": 0.0741, + "num_input_tokens_seen": 175821008, + "step": 144485 + }, + { + "epoch": 16.091992426773583, + "grad_norm": 0.27452901005744934, + "learning_rate": 5.5936645000236905e-06, + "loss": 0.0101, + "num_input_tokens_seen": 175827184, + "step": 144490 + }, + { + "epoch": 16.0925492816572, + "grad_norm": 0.621478259563446, + "learning_rate": 5.592132833968558e-06, + "loss": 0.0254, + "num_input_tokens_seen": 175833424, + "step": 144495 + }, + { + "epoch": 16.09310613654082, + "grad_norm": 0.0685613602399826, + "learning_rate": 5.59060135123628e-06, + "loss": 0.0092, + "num_input_tokens_seen": 175839760, + "step": 144500 + }, + { + "epoch": 16.093662991424434, + "grad_norm": 0.00013085949467495084, + "learning_rate": 5.589070051841317e-06, + "loss": 0.0238, + "num_input_tokens_seen": 175845840, + "step": 144505 + }, + { + "epoch": 16.094219846308054, + "grad_norm": 0.06142321974039078, + "learning_rate": 5.587538935798145e-06, + "loss": 0.0895, + "num_input_tokens_seen": 175851984, + "step": 144510 + }, + { + "epoch": 16.09477670119167, + "grad_norm": 0.00197350955568254, + "learning_rate": 5.586008003121215e-06, + "loss": 0.0054, + "num_input_tokens_seen": 175858320, + "step": 144515 + }, + { + "epoch": 16.095333556075285, + "grad_norm": 0.0009882799349725246, + "learning_rate": 5.584477253825002e-06, + "loss": 0.0046, + "num_input_tokens_seen": 175864464, + "step": 144520 + }, + { + "epoch": 16.095890410958905, + "grad_norm": 0.2970637083053589, + "learning_rate": 5.582946687923954e-06, + "loss": 0.122, + "num_input_tokens_seen": 175870640, + "step": 144525 + }, + { + "epoch": 16.09644726584252, + "grad_norm": 0.004627166781574488, + "learning_rate": 5.581416305432532e-06, + "loss": 0.0194, + "num_input_tokens_seen": 175876848, + "step": 144530 + }, + { + "epoch": 16.09700412072614, + "grad_norm": 0.06584323197603226, + "learning_rate": 5.579886106365184e-06, + "loss": 0.0828, + "num_input_tokens_seen": 175883120, + "step": 144535 + }, + { + "epoch": 16.097560975609756, + "grad_norm": 0.00027066137408837676, + "learning_rate": 5.578356090736378e-06, + "loss": 0.0196, + "num_input_tokens_seen": 175889360, + "step": 144540 + }, + { + "epoch": 16.098117830493372, + "grad_norm": 0.30129486322402954, + "learning_rate": 5.576826258560558e-06, + "loss": 0.004, + "num_input_tokens_seen": 175895472, + "step": 144545 + }, + { + "epoch": 16.09867468537699, + "grad_norm": 0.3950554430484772, + "learning_rate": 5.575296609852177e-06, + "loss": 0.0221, + "num_input_tokens_seen": 175901616, + "step": 144550 + }, + { + "epoch": 16.099231540260607, + "grad_norm": 0.04804876819252968, + "learning_rate": 5.573767144625675e-06, + "loss": 0.0074, + "num_input_tokens_seen": 175907600, + "step": 144555 + }, + { + "epoch": 16.099788395144227, + "grad_norm": 0.8240209221839905, + "learning_rate": 5.572237862895513e-06, + "loss": 0.0279, + "num_input_tokens_seen": 175913808, + "step": 144560 + }, + { + "epoch": 16.100345250027843, + "grad_norm": 0.0036366148851811886, + "learning_rate": 5.570708764676122e-06, + "loss": 0.0056, + "num_input_tokens_seen": 175920112, + "step": 144565 + }, + { + "epoch": 16.10090210491146, + "grad_norm": 2.561835765838623, + "learning_rate": 5.56917984998197e-06, + "loss": 0.0963, + "num_input_tokens_seen": 175926128, + "step": 144570 + }, + { + "epoch": 16.101458959795078, + "grad_norm": 0.0005302695208229125, + "learning_rate": 5.567651118827466e-06, + "loss": 0.0217, + "num_input_tokens_seen": 175932240, + "step": 144575 + }, + { + "epoch": 16.102015814678694, + "grad_norm": 0.026376420632004738, + "learning_rate": 5.566122571227072e-06, + "loss": 0.0112, + "num_input_tokens_seen": 175937872, + "step": 144580 + }, + { + "epoch": 16.102572669562313, + "grad_norm": 4.871184825897217, + "learning_rate": 5.564594207195215e-06, + "loss": 0.0626, + "num_input_tokens_seen": 175943920, + "step": 144585 + }, + { + "epoch": 16.10312952444593, + "grad_norm": 0.00018853656365536153, + "learning_rate": 5.563066026746344e-06, + "loss": 0.0711, + "num_input_tokens_seen": 175950352, + "step": 144590 + }, + { + "epoch": 16.103686379329545, + "grad_norm": 0.0005335830501280725, + "learning_rate": 5.561538029894886e-06, + "loss": 0.0035, + "num_input_tokens_seen": 175956464, + "step": 144595 + }, + { + "epoch": 16.104243234213165, + "grad_norm": 0.6041802167892456, + "learning_rate": 5.560010216655276e-06, + "loss": 0.0111, + "num_input_tokens_seen": 175962480, + "step": 144600 + }, + { + "epoch": 16.10480008909678, + "grad_norm": 0.23658393323421478, + "learning_rate": 5.558482587041938e-06, + "loss": 0.0456, + "num_input_tokens_seen": 175968656, + "step": 144605 + }, + { + "epoch": 16.1053569439804, + "grad_norm": 7.292348163900897e-05, + "learning_rate": 5.5569551410693145e-06, + "loss": 0.0079, + "num_input_tokens_seen": 175974896, + "step": 144610 + }, + { + "epoch": 16.105913798864016, + "grad_norm": 0.26150715351104736, + "learning_rate": 5.555427878751829e-06, + "loss": 0.0239, + "num_input_tokens_seen": 175981040, + "step": 144615 + }, + { + "epoch": 16.106470653747632, + "grad_norm": 0.22048017382621765, + "learning_rate": 5.553900800103904e-06, + "loss": 0.0723, + "num_input_tokens_seen": 175987568, + "step": 144620 + }, + { + "epoch": 16.10702750863125, + "grad_norm": 0.17180342972278595, + "learning_rate": 5.5523739051399596e-06, + "loss": 0.005, + "num_input_tokens_seen": 175993488, + "step": 144625 + }, + { + "epoch": 16.107584363514867, + "grad_norm": 0.0010616880608722568, + "learning_rate": 5.550847193874431e-06, + "loss": 0.0042, + "num_input_tokens_seen": 175999856, + "step": 144630 + }, + { + "epoch": 16.108141218398487, + "grad_norm": 0.5031474232673645, + "learning_rate": 5.549320666321725e-06, + "loss": 0.0188, + "num_input_tokens_seen": 176006064, + "step": 144635 + }, + { + "epoch": 16.108698073282103, + "grad_norm": 0.02948920987546444, + "learning_rate": 5.547794322496286e-06, + "loss": 0.0173, + "num_input_tokens_seen": 176012080, + "step": 144640 + }, + { + "epoch": 16.10925492816572, + "grad_norm": 1.6626002788543701, + "learning_rate": 5.546268162412499e-06, + "loss": 0.0452, + "num_input_tokens_seen": 176018160, + "step": 144645 + }, + { + "epoch": 16.109811783049338, + "grad_norm": 1.1716852188110352, + "learning_rate": 5.544742186084801e-06, + "loss": 0.0903, + "num_input_tokens_seen": 176024304, + "step": 144650 + }, + { + "epoch": 16.110368637932954, + "grad_norm": 0.0019391472451388836, + "learning_rate": 5.543216393527595e-06, + "loss": 0.0213, + "num_input_tokens_seen": 176030448, + "step": 144655 + }, + { + "epoch": 16.110925492816573, + "grad_norm": 0.08303599804639816, + "learning_rate": 5.541690784755305e-06, + "loss": 0.0671, + "num_input_tokens_seen": 176036240, + "step": 144660 + }, + { + "epoch": 16.11148234770019, + "grad_norm": 0.9433035850524902, + "learning_rate": 5.5401653597823376e-06, + "loss": 0.03, + "num_input_tokens_seen": 176042640, + "step": 144665 + }, + { + "epoch": 16.112039202583805, + "grad_norm": 0.0001281053846469149, + "learning_rate": 5.538640118623095e-06, + "loss": 0.0825, + "num_input_tokens_seen": 176048816, + "step": 144670 + }, + { + "epoch": 16.112596057467425, + "grad_norm": 0.9364301562309265, + "learning_rate": 5.5371150612919835e-06, + "loss": 0.1307, + "num_input_tokens_seen": 176055120, + "step": 144675 + }, + { + "epoch": 16.11315291235104, + "grad_norm": 0.00018616343731991947, + "learning_rate": 5.535590187803422e-06, + "loss": 0.0041, + "num_input_tokens_seen": 176061232, + "step": 144680 + }, + { + "epoch": 16.11370976723466, + "grad_norm": 0.22980855405330658, + "learning_rate": 5.534065498171806e-06, + "loss": 0.0142, + "num_input_tokens_seen": 176067664, + "step": 144685 + }, + { + "epoch": 16.114266622118276, + "grad_norm": 0.002805842785164714, + "learning_rate": 5.5325409924115365e-06, + "loss": 0.0456, + "num_input_tokens_seen": 176073936, + "step": 144690 + }, + { + "epoch": 16.11482347700189, + "grad_norm": 0.8400957584381104, + "learning_rate": 5.531016670537007e-06, + "loss": 0.0407, + "num_input_tokens_seen": 176080048, + "step": 144695 + }, + { + "epoch": 16.11538033188551, + "grad_norm": 1.412049651145935, + "learning_rate": 5.52949253256263e-06, + "loss": 0.0326, + "num_input_tokens_seen": 176085968, + "step": 144700 + }, + { + "epoch": 16.115937186769127, + "grad_norm": 1.3856315612792969, + "learning_rate": 5.527968578502787e-06, + "loss": 0.059, + "num_input_tokens_seen": 176092336, + "step": 144705 + }, + { + "epoch": 16.116494041652746, + "grad_norm": 0.008052259683609009, + "learning_rate": 5.5264448083718916e-06, + "loss": 0.003, + "num_input_tokens_seen": 176098416, + "step": 144710 + }, + { + "epoch": 16.117050896536362, + "grad_norm": 0.1471911072731018, + "learning_rate": 5.524921222184326e-06, + "loss": 0.0615, + "num_input_tokens_seen": 176104272, + "step": 144715 + }, + { + "epoch": 16.11760775141998, + "grad_norm": 1.414952278137207, + "learning_rate": 5.523397819954482e-06, + "loss": 0.0936, + "num_input_tokens_seen": 176110288, + "step": 144720 + }, + { + "epoch": 16.118164606303598, + "grad_norm": 0.4372987151145935, + "learning_rate": 5.521874601696744e-06, + "loss": 0.0195, + "num_input_tokens_seen": 176116528, + "step": 144725 + }, + { + "epoch": 16.118721461187214, + "grad_norm": 0.0034388971980661154, + "learning_rate": 5.520351567425511e-06, + "loss": 0.0208, + "num_input_tokens_seen": 176122992, + "step": 144730 + }, + { + "epoch": 16.119278316070833, + "grad_norm": 0.19732880592346191, + "learning_rate": 5.5188287171551666e-06, + "loss": 0.0032, + "num_input_tokens_seen": 176129200, + "step": 144735 + }, + { + "epoch": 16.11983517095445, + "grad_norm": 0.11924352496862411, + "learning_rate": 5.517306050900092e-06, + "loss": 0.0147, + "num_input_tokens_seen": 176135120, + "step": 144740 + }, + { + "epoch": 16.120392025838065, + "grad_norm": 0.0079946368932724, + "learning_rate": 5.515783568674662e-06, + "loss": 0.008, + "num_input_tokens_seen": 176141232, + "step": 144745 + }, + { + "epoch": 16.120948880721684, + "grad_norm": 0.49596890807151794, + "learning_rate": 5.514261270493276e-06, + "loss": 0.032, + "num_input_tokens_seen": 176147408, + "step": 144750 + }, + { + "epoch": 16.1215057356053, + "grad_norm": 0.06339191645383835, + "learning_rate": 5.512739156370297e-06, + "loss": 0.0765, + "num_input_tokens_seen": 176153616, + "step": 144755 + }, + { + "epoch": 16.12206259048892, + "grad_norm": 0.0011786929098889232, + "learning_rate": 5.511217226320125e-06, + "loss": 0.0528, + "num_input_tokens_seen": 176159696, + "step": 144760 + }, + { + "epoch": 16.122619445372536, + "grad_norm": 9.007407061289996e-05, + "learning_rate": 5.5096954803571045e-06, + "loss": 0.077, + "num_input_tokens_seen": 176165520, + "step": 144765 + }, + { + "epoch": 16.123176300256155, + "grad_norm": 0.12640923261642456, + "learning_rate": 5.5081739184956325e-06, + "loss": 0.0181, + "num_input_tokens_seen": 176171184, + "step": 144770 + }, + { + "epoch": 16.12373315513977, + "grad_norm": 1.7199383974075317, + "learning_rate": 5.506652540750068e-06, + "loss": 0.023, + "num_input_tokens_seen": 176177136, + "step": 144775 + }, + { + "epoch": 16.124290010023387, + "grad_norm": 0.0004278389969840646, + "learning_rate": 5.5051313471347955e-06, + "loss": 0.0053, + "num_input_tokens_seen": 176183504, + "step": 144780 + }, + { + "epoch": 16.124846864907006, + "grad_norm": 0.7140864729881287, + "learning_rate": 5.503610337664175e-06, + "loss": 0.0632, + "num_input_tokens_seen": 176189296, + "step": 144785 + }, + { + "epoch": 16.125403719790622, + "grad_norm": 0.0010029026307165623, + "learning_rate": 5.502089512352576e-06, + "loss": 0.0489, + "num_input_tokens_seen": 176195280, + "step": 144790 + }, + { + "epoch": 16.12596057467424, + "grad_norm": 2.110182523727417, + "learning_rate": 5.500568871214357e-06, + "loss": 0.2513, + "num_input_tokens_seen": 176200784, + "step": 144795 + }, + { + "epoch": 16.126517429557857, + "grad_norm": 9.337131632491946e-05, + "learning_rate": 5.499048414263894e-06, + "loss": 0.0005, + "num_input_tokens_seen": 176206928, + "step": 144800 + }, + { + "epoch": 16.127074284441473, + "grad_norm": 2.12485671043396, + "learning_rate": 5.49752814151554e-06, + "loss": 0.053, + "num_input_tokens_seen": 176213168, + "step": 144805 + }, + { + "epoch": 16.127631139325093, + "grad_norm": 0.001809512497857213, + "learning_rate": 5.4960080529836614e-06, + "loss": 0.0142, + "num_input_tokens_seen": 176219280, + "step": 144810 + }, + { + "epoch": 16.12818799420871, + "grad_norm": 0.006300991866737604, + "learning_rate": 5.4944881486826114e-06, + "loss": 0.0486, + "num_input_tokens_seen": 176225552, + "step": 144815 + }, + { + "epoch": 16.128744849092328, + "grad_norm": 0.4666033089160919, + "learning_rate": 5.492968428626741e-06, + "loss": 0.0964, + "num_input_tokens_seen": 176231248, + "step": 144820 + }, + { + "epoch": 16.129301703975944, + "grad_norm": 1.8585909605026245, + "learning_rate": 5.49144889283042e-06, + "loss": 0.0886, + "num_input_tokens_seen": 176237552, + "step": 144825 + }, + { + "epoch": 16.12985855885956, + "grad_norm": 0.7007522583007812, + "learning_rate": 5.489929541307995e-06, + "loss": 0.036, + "num_input_tokens_seen": 176243600, + "step": 144830 + }, + { + "epoch": 16.13041541374318, + "grad_norm": 0.165908545255661, + "learning_rate": 5.488410374073816e-06, + "loss": 0.0063, + "num_input_tokens_seen": 176249808, + "step": 144835 + }, + { + "epoch": 16.130972268626795, + "grad_norm": 0.05140930786728859, + "learning_rate": 5.486891391142227e-06, + "loss": 0.0259, + "num_input_tokens_seen": 176255920, + "step": 144840 + }, + { + "epoch": 16.131529123510415, + "grad_norm": 4.4185967445373535, + "learning_rate": 5.48537259252759e-06, + "loss": 0.0653, + "num_input_tokens_seen": 176261904, + "step": 144845 + }, + { + "epoch": 16.13208597839403, + "grad_norm": 3.778907060623169, + "learning_rate": 5.483853978244236e-06, + "loss": 0.0814, + "num_input_tokens_seen": 176267920, + "step": 144850 + }, + { + "epoch": 16.132642833277647, + "grad_norm": 0.037350066006183624, + "learning_rate": 5.48233554830653e-06, + "loss": 0.0153, + "num_input_tokens_seen": 176273904, + "step": 144855 + }, + { + "epoch": 16.133199688161266, + "grad_norm": 0.4355766475200653, + "learning_rate": 5.480817302728788e-06, + "loss": 0.0147, + "num_input_tokens_seen": 176280080, + "step": 144860 + }, + { + "epoch": 16.133756543044882, + "grad_norm": 0.07798027992248535, + "learning_rate": 5.479299241525373e-06, + "loss": 0.0205, + "num_input_tokens_seen": 176285392, + "step": 144865 + }, + { + "epoch": 16.1343133979285, + "grad_norm": 0.44276753067970276, + "learning_rate": 5.47778136471061e-06, + "loss": 0.0109, + "num_input_tokens_seen": 176291280, + "step": 144870 + }, + { + "epoch": 16.134870252812117, + "grad_norm": 0.2208276093006134, + "learning_rate": 5.476263672298851e-06, + "loss": 0.0032, + "num_input_tokens_seen": 176297296, + "step": 144875 + }, + { + "epoch": 16.135427107695733, + "grad_norm": 0.043802835047245026, + "learning_rate": 5.474746164304423e-06, + "loss": 0.0092, + "num_input_tokens_seen": 176303376, + "step": 144880 + }, + { + "epoch": 16.135983962579353, + "grad_norm": 0.9641945958137512, + "learning_rate": 5.4732288407416595e-06, + "loss": 0.0267, + "num_input_tokens_seen": 176309968, + "step": 144885 + }, + { + "epoch": 16.13654081746297, + "grad_norm": 0.0014958124374970794, + "learning_rate": 5.471711701624887e-06, + "loss": 0.0029, + "num_input_tokens_seen": 176315984, + "step": 144890 + }, + { + "epoch": 16.137097672346588, + "grad_norm": 1.3591631650924683, + "learning_rate": 5.470194746968452e-06, + "loss": 0.0414, + "num_input_tokens_seen": 176322128, + "step": 144895 + }, + { + "epoch": 16.137654527230204, + "grad_norm": 0.0005565026076510549, + "learning_rate": 5.468677976786674e-06, + "loss": 0.1198, + "num_input_tokens_seen": 176328304, + "step": 144900 + }, + { + "epoch": 16.13821138211382, + "grad_norm": 0.7172354459762573, + "learning_rate": 5.467161391093881e-06, + "loss": 0.021, + "num_input_tokens_seen": 176334192, + "step": 144905 + }, + { + "epoch": 16.13876823699744, + "grad_norm": 0.13540636003017426, + "learning_rate": 5.465644989904389e-06, + "loss": 0.0053, + "num_input_tokens_seen": 176339536, + "step": 144910 + }, + { + "epoch": 16.139325091881055, + "grad_norm": 0.3257613182067871, + "learning_rate": 5.464128773232541e-06, + "loss": 0.0196, + "num_input_tokens_seen": 176345712, + "step": 144915 + }, + { + "epoch": 16.139881946764675, + "grad_norm": 0.0004915525787509978, + "learning_rate": 5.462612741092638e-06, + "loss": 0.0701, + "num_input_tokens_seen": 176351824, + "step": 144920 + }, + { + "epoch": 16.14043880164829, + "grad_norm": 0.6086170673370361, + "learning_rate": 5.461096893499021e-06, + "loss": 0.0157, + "num_input_tokens_seen": 176357968, + "step": 144925 + }, + { + "epoch": 16.140995656531906, + "grad_norm": 0.0027676105964928865, + "learning_rate": 5.459581230465996e-06, + "loss": 0.0252, + "num_input_tokens_seen": 176363952, + "step": 144930 + }, + { + "epoch": 16.141552511415526, + "grad_norm": 0.09749610722064972, + "learning_rate": 5.458065752007882e-06, + "loss": 0.006, + "num_input_tokens_seen": 176370256, + "step": 144935 + }, + { + "epoch": 16.14210936629914, + "grad_norm": 0.003050258383154869, + "learning_rate": 5.45655045813899e-06, + "loss": 0.0859, + "num_input_tokens_seen": 176376208, + "step": 144940 + }, + { + "epoch": 16.14266622118276, + "grad_norm": 1.4942854642868042, + "learning_rate": 5.4550353488736384e-06, + "loss": 0.0853, + "num_input_tokens_seen": 176382192, + "step": 144945 + }, + { + "epoch": 16.143223076066377, + "grad_norm": 0.17350554466247559, + "learning_rate": 5.453520424226141e-06, + "loss": 0.0034, + "num_input_tokens_seen": 176388496, + "step": 144950 + }, + { + "epoch": 16.143779930949993, + "grad_norm": 0.001471437863074243, + "learning_rate": 5.452005684210804e-06, + "loss": 0.016, + "num_input_tokens_seen": 176394448, + "step": 144955 + }, + { + "epoch": 16.144336785833612, + "grad_norm": 0.010585376061499119, + "learning_rate": 5.450491128841925e-06, + "loss": 0.0006, + "num_input_tokens_seen": 176400336, + "step": 144960 + }, + { + "epoch": 16.14489364071723, + "grad_norm": 0.06894408911466599, + "learning_rate": 5.448976758133828e-06, + "loss": 0.0057, + "num_input_tokens_seen": 176406416, + "step": 144965 + }, + { + "epoch": 16.145450495600848, + "grad_norm": 0.015758952125906944, + "learning_rate": 5.447462572100803e-06, + "loss": 0.016, + "num_input_tokens_seen": 176412240, + "step": 144970 + }, + { + "epoch": 16.146007350484464, + "grad_norm": 0.09633377194404602, + "learning_rate": 5.4459485707571725e-06, + "loss": 0.1584, + "num_input_tokens_seen": 176418256, + "step": 144975 + }, + { + "epoch": 16.14656420536808, + "grad_norm": 0.9176510572433472, + "learning_rate": 5.444434754117211e-06, + "loss": 0.0437, + "num_input_tokens_seen": 176424240, + "step": 144980 + }, + { + "epoch": 16.1471210602517, + "grad_norm": 2.079521894454956, + "learning_rate": 5.442921122195238e-06, + "loss": 0.0779, + "num_input_tokens_seen": 176430640, + "step": 144985 + }, + { + "epoch": 16.147677915135315, + "grad_norm": 0.009586404077708721, + "learning_rate": 5.441407675005539e-06, + "loss": 0.0387, + "num_input_tokens_seen": 176436816, + "step": 144990 + }, + { + "epoch": 16.148234770018934, + "grad_norm": 0.14282876253128052, + "learning_rate": 5.439894412562419e-06, + "loss": 0.0142, + "num_input_tokens_seen": 176442896, + "step": 144995 + }, + { + "epoch": 16.14879162490255, + "grad_norm": 1.3999145030975342, + "learning_rate": 5.438381334880169e-06, + "loss": 0.0535, + "num_input_tokens_seen": 176449168, + "step": 145000 + }, + { + "epoch": 16.149348479786166, + "grad_norm": 0.010276451706886292, + "learning_rate": 5.436868441973078e-06, + "loss": 0.0097, + "num_input_tokens_seen": 176454832, + "step": 145005 + }, + { + "epoch": 16.149905334669786, + "grad_norm": 0.015291382558643818, + "learning_rate": 5.435355733855432e-06, + "loss": 0.0637, + "num_input_tokens_seen": 176460944, + "step": 145010 + }, + { + "epoch": 16.1504621895534, + "grad_norm": 0.006105233449488878, + "learning_rate": 5.433843210541531e-06, + "loss": 0.1168, + "num_input_tokens_seen": 176466736, + "step": 145015 + }, + { + "epoch": 16.15101904443702, + "grad_norm": 0.000565522990655154, + "learning_rate": 5.43233087204566e-06, + "loss": 0.0649, + "num_input_tokens_seen": 176472912, + "step": 145020 + }, + { + "epoch": 16.151575899320637, + "grad_norm": 0.7678704261779785, + "learning_rate": 5.4308187183821e-06, + "loss": 0.1037, + "num_input_tokens_seen": 176479024, + "step": 145025 + }, + { + "epoch": 16.152132754204253, + "grad_norm": 0.0465485118329525, + "learning_rate": 5.42930674956513e-06, + "loss": 0.1088, + "num_input_tokens_seen": 176485200, + "step": 145030 + }, + { + "epoch": 16.152689609087872, + "grad_norm": 0.6233301162719727, + "learning_rate": 5.427794965609042e-06, + "loss": 0.0167, + "num_input_tokens_seen": 176491472, + "step": 145035 + }, + { + "epoch": 16.153246463971488, + "grad_norm": 0.22995296120643616, + "learning_rate": 5.4262833665281065e-06, + "loss": 0.0101, + "num_input_tokens_seen": 176497360, + "step": 145040 + }, + { + "epoch": 16.153803318855108, + "grad_norm": 1.3716360330581665, + "learning_rate": 5.424771952336621e-06, + "loss": 0.08, + "num_input_tokens_seen": 176503568, + "step": 145045 + }, + { + "epoch": 16.154360173738723, + "grad_norm": 1.7689648866653442, + "learning_rate": 5.423260723048834e-06, + "loss": 0.0915, + "num_input_tokens_seen": 176509424, + "step": 145050 + }, + { + "epoch": 16.15491702862234, + "grad_norm": 0.42948606610298157, + "learning_rate": 5.421749678679039e-06, + "loss": 0.0161, + "num_input_tokens_seen": 176515472, + "step": 145055 + }, + { + "epoch": 16.15547388350596, + "grad_norm": 0.9437388181686401, + "learning_rate": 5.420238819241499e-06, + "loss": 0.0343, + "num_input_tokens_seen": 176521776, + "step": 145060 + }, + { + "epoch": 16.156030738389575, + "grad_norm": 0.14922769367694855, + "learning_rate": 5.418728144750498e-06, + "loss": 0.0089, + "num_input_tokens_seen": 176528272, + "step": 145065 + }, + { + "epoch": 16.156587593273194, + "grad_norm": 0.15356846153736115, + "learning_rate": 5.417217655220297e-06, + "loss": 0.011, + "num_input_tokens_seen": 176534384, + "step": 145070 + }, + { + "epoch": 16.15714444815681, + "grad_norm": 0.05784756317734718, + "learning_rate": 5.415707350665164e-06, + "loss": 0.011, + "num_input_tokens_seen": 176539888, + "step": 145075 + }, + { + "epoch": 16.157701303040426, + "grad_norm": 0.8886793255805969, + "learning_rate": 5.414197231099361e-06, + "loss": 0.0535, + "num_input_tokens_seen": 176546128, + "step": 145080 + }, + { + "epoch": 16.158258157924045, + "grad_norm": 0.5361716151237488, + "learning_rate": 5.412687296537161e-06, + "loss": 0.0364, + "num_input_tokens_seen": 176552176, + "step": 145085 + }, + { + "epoch": 16.15881501280766, + "grad_norm": 0.8417794704437256, + "learning_rate": 5.411177546992824e-06, + "loss": 0.0271, + "num_input_tokens_seen": 176558320, + "step": 145090 + }, + { + "epoch": 16.15937186769128, + "grad_norm": 0.12007812410593033, + "learning_rate": 5.409667982480609e-06, + "loss": 0.0695, + "num_input_tokens_seen": 176564240, + "step": 145095 + }, + { + "epoch": 16.159928722574897, + "grad_norm": 1.5835758447647095, + "learning_rate": 5.408158603014768e-06, + "loss": 0.0577, + "num_input_tokens_seen": 176570416, + "step": 145100 + }, + { + "epoch": 16.160485577458513, + "grad_norm": 0.0008477714145556092, + "learning_rate": 5.406649408609574e-06, + "loss": 0.0412, + "num_input_tokens_seen": 176575824, + "step": 145105 + }, + { + "epoch": 16.161042432342132, + "grad_norm": 0.5321840047836304, + "learning_rate": 5.405140399279266e-06, + "loss": 0.0272, + "num_input_tokens_seen": 176582000, + "step": 145110 + }, + { + "epoch": 16.161599287225748, + "grad_norm": 0.0704999566078186, + "learning_rate": 5.403631575038115e-06, + "loss": 0.0764, + "num_input_tokens_seen": 176587728, + "step": 145115 + }, + { + "epoch": 16.162156142109367, + "grad_norm": 0.029981529340147972, + "learning_rate": 5.4021229359003615e-06, + "loss": 0.0587, + "num_input_tokens_seen": 176593936, + "step": 145120 + }, + { + "epoch": 16.162712996992983, + "grad_norm": 0.03303287923336029, + "learning_rate": 5.400614481880259e-06, + "loss": 0.0247, + "num_input_tokens_seen": 176599984, + "step": 145125 + }, + { + "epoch": 16.163269851876603, + "grad_norm": 0.20502431690692902, + "learning_rate": 5.39910621299205e-06, + "loss": 0.0036, + "num_input_tokens_seen": 176606096, + "step": 145130 + }, + { + "epoch": 16.16382670676022, + "grad_norm": 0.013107175007462502, + "learning_rate": 5.3975981292499926e-06, + "loss": 0.013, + "num_input_tokens_seen": 176612208, + "step": 145135 + }, + { + "epoch": 16.164383561643834, + "grad_norm": 0.041095659136772156, + "learning_rate": 5.396090230668327e-06, + "loss": 0.0782, + "num_input_tokens_seen": 176618192, + "step": 145140 + }, + { + "epoch": 16.164940416527454, + "grad_norm": 0.13982649147510529, + "learning_rate": 5.394582517261296e-06, + "loss": 0.0644, + "num_input_tokens_seen": 176624144, + "step": 145145 + }, + { + "epoch": 16.16549727141107, + "grad_norm": 1.9217939376831055, + "learning_rate": 5.393074989043132e-06, + "loss": 0.09, + "num_input_tokens_seen": 176630064, + "step": 145150 + }, + { + "epoch": 16.16605412629469, + "grad_norm": 0.00042587355710566044, + "learning_rate": 5.391567646028093e-06, + "loss": 0.08, + "num_input_tokens_seen": 176635920, + "step": 145155 + }, + { + "epoch": 16.166610981178305, + "grad_norm": 2.461639642715454, + "learning_rate": 5.390060488230397e-06, + "loss": 0.0888, + "num_input_tokens_seen": 176641680, + "step": 145160 + }, + { + "epoch": 16.16716783606192, + "grad_norm": 0.4390181601047516, + "learning_rate": 5.388553515664307e-06, + "loss": 0.0072, + "num_input_tokens_seen": 176648080, + "step": 145165 + }, + { + "epoch": 16.16772469094554, + "grad_norm": 0.00012735134805552661, + "learning_rate": 5.387046728344028e-06, + "loss": 0.0014, + "num_input_tokens_seen": 176654000, + "step": 145170 + }, + { + "epoch": 16.168281545829156, + "grad_norm": 0.12689386308193207, + "learning_rate": 5.3855401262838115e-06, + "loss": 0.0363, + "num_input_tokens_seen": 176660208, + "step": 145175 + }, + { + "epoch": 16.168838400712776, + "grad_norm": 0.7282029986381531, + "learning_rate": 5.384033709497879e-06, + "loss": 0.0589, + "num_input_tokens_seen": 176666320, + "step": 145180 + }, + { + "epoch": 16.169395255596392, + "grad_norm": 0.017793122678995132, + "learning_rate": 5.382527478000468e-06, + "loss": 0.0096, + "num_input_tokens_seen": 176672624, + "step": 145185 + }, + { + "epoch": 16.169952110480008, + "grad_norm": 0.005766706541180611, + "learning_rate": 5.381021431805805e-06, + "loss": 0.0213, + "num_input_tokens_seen": 176678896, + "step": 145190 + }, + { + "epoch": 16.170508965363627, + "grad_norm": 1.9100682735443115, + "learning_rate": 5.379515570928112e-06, + "loss": 0.0533, + "num_input_tokens_seen": 176685008, + "step": 145195 + }, + { + "epoch": 16.171065820247243, + "grad_norm": 0.011668987572193146, + "learning_rate": 5.378009895381605e-06, + "loss": 0.0908, + "num_input_tokens_seen": 176691216, + "step": 145200 + }, + { + "epoch": 16.171622675130862, + "grad_norm": 0.5957117676734924, + "learning_rate": 5.376504405180527e-06, + "loss": 0.0165, + "num_input_tokens_seen": 176697264, + "step": 145205 + }, + { + "epoch": 16.17217953001448, + "grad_norm": 1.2576299905776978, + "learning_rate": 5.374999100339084e-06, + "loss": 0.0469, + "num_input_tokens_seen": 176703440, + "step": 145210 + }, + { + "epoch": 16.172736384898094, + "grad_norm": 0.16077426075935364, + "learning_rate": 5.373493980871497e-06, + "loss": 0.005, + "num_input_tokens_seen": 176709456, + "step": 145215 + }, + { + "epoch": 16.173293239781714, + "grad_norm": 0.6789652109146118, + "learning_rate": 5.371989046791987e-06, + "loss": 0.0676, + "num_input_tokens_seen": 176715600, + "step": 145220 + }, + { + "epoch": 16.17385009466533, + "grad_norm": 0.5400715470314026, + "learning_rate": 5.370484298114756e-06, + "loss": 0.0712, + "num_input_tokens_seen": 176720976, + "step": 145225 + }, + { + "epoch": 16.17440694954895, + "grad_norm": 1.1215801239013672, + "learning_rate": 5.368979734854035e-06, + "loss": 0.057, + "num_input_tokens_seen": 176727024, + "step": 145230 + }, + { + "epoch": 16.174963804432565, + "grad_norm": 1.341614007949829, + "learning_rate": 5.36747535702403e-06, + "loss": 0.0385, + "num_input_tokens_seen": 176733168, + "step": 145235 + }, + { + "epoch": 16.17552065931618, + "grad_norm": 0.0013009964022785425, + "learning_rate": 5.36597116463895e-06, + "loss": 0.0355, + "num_input_tokens_seen": 176738992, + "step": 145240 + }, + { + "epoch": 16.1760775141998, + "grad_norm": 0.7668381929397583, + "learning_rate": 5.364467157712994e-06, + "loss": 0.0271, + "num_input_tokens_seen": 176744912, + "step": 145245 + }, + { + "epoch": 16.176634369083416, + "grad_norm": 1.7226078510284424, + "learning_rate": 5.362963336260385e-06, + "loss": 0.1246, + "num_input_tokens_seen": 176751120, + "step": 145250 + }, + { + "epoch": 16.177191223967036, + "grad_norm": 0.038588088005781174, + "learning_rate": 5.361459700295312e-06, + "loss": 0.0025, + "num_input_tokens_seen": 176757424, + "step": 145255 + }, + { + "epoch": 16.17774807885065, + "grad_norm": 1.5234229564666748, + "learning_rate": 5.359956249831996e-06, + "loss": 0.1974, + "num_input_tokens_seen": 176763728, + "step": 145260 + }, + { + "epoch": 16.178304933734267, + "grad_norm": 0.07990224659442902, + "learning_rate": 5.358452984884627e-06, + "loss": 0.0028, + "num_input_tokens_seen": 176769776, + "step": 145265 + }, + { + "epoch": 16.178861788617887, + "grad_norm": 0.24255870282649994, + "learning_rate": 5.356949905467407e-06, + "loss": 0.0776, + "num_input_tokens_seen": 176776080, + "step": 145270 + }, + { + "epoch": 16.179418643501503, + "grad_norm": 0.12199676036834717, + "learning_rate": 5.355447011594525e-06, + "loss": 0.0062, + "num_input_tokens_seen": 176782000, + "step": 145275 + }, + { + "epoch": 16.179975498385122, + "grad_norm": 0.04250360280275345, + "learning_rate": 5.3539443032801926e-06, + "loss": 0.0483, + "num_input_tokens_seen": 176787984, + "step": 145280 + }, + { + "epoch": 16.180532353268738, + "grad_norm": 0.041582509875297546, + "learning_rate": 5.3524417805385945e-06, + "loss": 0.0028, + "num_input_tokens_seen": 176794128, + "step": 145285 + }, + { + "epoch": 16.181089208152354, + "grad_norm": 0.018991488963365555, + "learning_rate": 5.350939443383929e-06, + "loss": 0.098, + "num_input_tokens_seen": 176800368, + "step": 145290 + }, + { + "epoch": 16.181646063035974, + "grad_norm": 0.2139715552330017, + "learning_rate": 5.3494372918303725e-06, + "loss": 0.0374, + "num_input_tokens_seen": 176805968, + "step": 145295 + }, + { + "epoch": 16.18220291791959, + "grad_norm": 0.005185168236494064, + "learning_rate": 5.347935325892134e-06, + "loss": 0.1031, + "num_input_tokens_seen": 176812016, + "step": 145300 + }, + { + "epoch": 16.18275977280321, + "grad_norm": 0.008448653854429722, + "learning_rate": 5.346433545583382e-06, + "loss": 0.0342, + "num_input_tokens_seen": 176817936, + "step": 145305 + }, + { + "epoch": 16.183316627686825, + "grad_norm": 0.007111692801117897, + "learning_rate": 5.344931950918325e-06, + "loss": 0.0365, + "num_input_tokens_seen": 176823888, + "step": 145310 + }, + { + "epoch": 16.18387348257044, + "grad_norm": 0.09845884144306183, + "learning_rate": 5.34343054191112e-06, + "loss": 0.0266, + "num_input_tokens_seen": 176830448, + "step": 145315 + }, + { + "epoch": 16.18443033745406, + "grad_norm": 0.25884318351745605, + "learning_rate": 5.3419293185759725e-06, + "loss": 0.0204, + "num_input_tokens_seen": 176836880, + "step": 145320 + }, + { + "epoch": 16.184987192337676, + "grad_norm": 0.0366206131875515, + "learning_rate": 5.34042828092704e-06, + "loss": 0.0223, + "num_input_tokens_seen": 176842768, + "step": 145325 + }, + { + "epoch": 16.185544047221295, + "grad_norm": 0.05728546157479286, + "learning_rate": 5.3389274289785244e-06, + "loss": 0.0595, + "num_input_tokens_seen": 176848880, + "step": 145330 + }, + { + "epoch": 16.18610090210491, + "grad_norm": 0.36514168977737427, + "learning_rate": 5.3374267627445905e-06, + "loss": 0.0198, + "num_input_tokens_seen": 176854960, + "step": 145335 + }, + { + "epoch": 16.186657756988527, + "grad_norm": 0.2886893153190613, + "learning_rate": 5.335926282239412e-06, + "loss": 0.0528, + "num_input_tokens_seen": 176860592, + "step": 145340 + }, + { + "epoch": 16.187214611872147, + "grad_norm": 0.002834922866895795, + "learning_rate": 5.3344259874771595e-06, + "loss": 0.0147, + "num_input_tokens_seen": 176866608, + "step": 145345 + }, + { + "epoch": 16.187771466755763, + "grad_norm": 2.417590618133545, + "learning_rate": 5.332925878472017e-06, + "loss": 0.1324, + "num_input_tokens_seen": 176872720, + "step": 145350 + }, + { + "epoch": 16.188328321639382, + "grad_norm": 1.6224113702774048, + "learning_rate": 5.3314259552381456e-06, + "loss": 0.1192, + "num_input_tokens_seen": 176878960, + "step": 145355 + }, + { + "epoch": 16.188885176522998, + "grad_norm": 0.11482954770326614, + "learning_rate": 5.329926217789713e-06, + "loss": 0.0736, + "num_input_tokens_seen": 176884528, + "step": 145360 + }, + { + "epoch": 16.189442031406614, + "grad_norm": 0.3522694706916809, + "learning_rate": 5.3284266661408815e-06, + "loss": 0.0283, + "num_input_tokens_seen": 176890416, + "step": 145365 + }, + { + "epoch": 16.189998886290233, + "grad_norm": 0.36903148889541626, + "learning_rate": 5.326927300305826e-06, + "loss": 0.078, + "num_input_tokens_seen": 176896560, + "step": 145370 + }, + { + "epoch": 16.19055574117385, + "grad_norm": 0.02351776510477066, + "learning_rate": 5.325428120298698e-06, + "loss": 0.0097, + "num_input_tokens_seen": 176902544, + "step": 145375 + }, + { + "epoch": 16.19111259605747, + "grad_norm": 0.34730178117752075, + "learning_rate": 5.323929126133678e-06, + "loss": 0.0235, + "num_input_tokens_seen": 176908912, + "step": 145380 + }, + { + "epoch": 16.191669450941085, + "grad_norm": 0.0007096808403730392, + "learning_rate": 5.322430317824897e-06, + "loss": 0.0667, + "num_input_tokens_seen": 176914992, + "step": 145385 + }, + { + "epoch": 16.1922263058247, + "grad_norm": 0.491267591714859, + "learning_rate": 5.3209316953865355e-06, + "loss": 0.0623, + "num_input_tokens_seen": 176921168, + "step": 145390 + }, + { + "epoch": 16.19278316070832, + "grad_norm": 1.0065120458602905, + "learning_rate": 5.319433258832735e-06, + "loss": 0.0314, + "num_input_tokens_seen": 176927504, + "step": 145395 + }, + { + "epoch": 16.193340015591936, + "grad_norm": 1.2225008010864258, + "learning_rate": 5.317935008177658e-06, + "loss": 0.0395, + "num_input_tokens_seen": 176933616, + "step": 145400 + }, + { + "epoch": 16.193896870475555, + "grad_norm": 0.22490932047367096, + "learning_rate": 5.316436943435457e-06, + "loss": 0.0138, + "num_input_tokens_seen": 176939824, + "step": 145405 + }, + { + "epoch": 16.19445372535917, + "grad_norm": 0.8073627948760986, + "learning_rate": 5.314939064620278e-06, + "loss": 0.079, + "num_input_tokens_seen": 176945904, + "step": 145410 + }, + { + "epoch": 16.195010580242787, + "grad_norm": 3.233591079711914, + "learning_rate": 5.313441371746264e-06, + "loss": 0.0452, + "num_input_tokens_seen": 176952304, + "step": 145415 + }, + { + "epoch": 16.195567435126407, + "grad_norm": 0.0039611333049833775, + "learning_rate": 5.311943864827576e-06, + "loss": 0.0141, + "num_input_tokens_seen": 176958704, + "step": 145420 + }, + { + "epoch": 16.196124290010022, + "grad_norm": 0.021076472476124763, + "learning_rate": 5.310446543878353e-06, + "loss": 0.0978, + "num_input_tokens_seen": 176965328, + "step": 145425 + }, + { + "epoch": 16.196681144893642, + "grad_norm": 2.0354323387145996, + "learning_rate": 5.308949408912736e-06, + "loss": 0.1062, + "num_input_tokens_seen": 176971600, + "step": 145430 + }, + { + "epoch": 16.197237999777258, + "grad_norm": 0.006753923371434212, + "learning_rate": 5.307452459944862e-06, + "loss": 0.0041, + "num_input_tokens_seen": 176977584, + "step": 145435 + }, + { + "epoch": 16.197794854660874, + "grad_norm": 0.0005143245798535645, + "learning_rate": 5.305955696988885e-06, + "loss": 0.0449, + "num_input_tokens_seen": 176983792, + "step": 145440 + }, + { + "epoch": 16.198351709544493, + "grad_norm": 0.043449778109788895, + "learning_rate": 5.304459120058927e-06, + "loss": 0.0387, + "num_input_tokens_seen": 176989840, + "step": 145445 + }, + { + "epoch": 16.19890856442811, + "grad_norm": 0.26038748025894165, + "learning_rate": 5.3029627291691445e-06, + "loss": 0.0261, + "num_input_tokens_seen": 176996208, + "step": 145450 + }, + { + "epoch": 16.19946541931173, + "grad_norm": 0.08485332876443863, + "learning_rate": 5.301466524333648e-06, + "loss": 0.0103, + "num_input_tokens_seen": 177002096, + "step": 145455 + }, + { + "epoch": 16.200022274195344, + "grad_norm": 0.11684244871139526, + "learning_rate": 5.29997050556659e-06, + "loss": 0.0389, + "num_input_tokens_seen": 177007760, + "step": 145460 + }, + { + "epoch": 16.20057912907896, + "grad_norm": 0.034127864986658096, + "learning_rate": 5.298474672882086e-06, + "loss": 0.0026, + "num_input_tokens_seen": 177013968, + "step": 145465 + }, + { + "epoch": 16.20113598396258, + "grad_norm": 1.7714323997497559, + "learning_rate": 5.29697902629428e-06, + "loss": 0.0621, + "num_input_tokens_seen": 177020272, + "step": 145470 + }, + { + "epoch": 16.201692838846196, + "grad_norm": 0.004332059063017368, + "learning_rate": 5.295483565817294e-06, + "loss": 0.0571, + "num_input_tokens_seen": 177026352, + "step": 145475 + }, + { + "epoch": 16.202249693729815, + "grad_norm": 2.7838735580444336, + "learning_rate": 5.293988291465252e-06, + "loss": 0.0506, + "num_input_tokens_seen": 177032400, + "step": 145480 + }, + { + "epoch": 16.20280654861343, + "grad_norm": 0.003075835295021534, + "learning_rate": 5.2924932032522716e-06, + "loss": 0.0117, + "num_input_tokens_seen": 177038320, + "step": 145485 + }, + { + "epoch": 16.20336340349705, + "grad_norm": 0.02967989258468151, + "learning_rate": 5.290998301192488e-06, + "loss": 0.0035, + "num_input_tokens_seen": 177044688, + "step": 145490 + }, + { + "epoch": 16.203920258380666, + "grad_norm": 0.3447904884815216, + "learning_rate": 5.289503585300018e-06, + "loss": 0.0446, + "num_input_tokens_seen": 177050864, + "step": 145495 + }, + { + "epoch": 16.204477113264282, + "grad_norm": 1.1079003810882568, + "learning_rate": 5.288009055588977e-06, + "loss": 0.1223, + "num_input_tokens_seen": 177057136, + "step": 145500 + }, + { + "epoch": 16.2050339681479, + "grad_norm": 0.0477510504424572, + "learning_rate": 5.2865147120734785e-06, + "loss": 0.05, + "num_input_tokens_seen": 177063312, + "step": 145505 + }, + { + "epoch": 16.205590823031518, + "grad_norm": 0.05189109966158867, + "learning_rate": 5.285020554767647e-06, + "loss": 0.0094, + "num_input_tokens_seen": 177069232, + "step": 145510 + }, + { + "epoch": 16.206147677915137, + "grad_norm": 0.011205392889678478, + "learning_rate": 5.283526583685588e-06, + "loss": 0.0172, + "num_input_tokens_seen": 177075440, + "step": 145515 + }, + { + "epoch": 16.206704532798753, + "grad_norm": 0.6765117645263672, + "learning_rate": 5.2820327988414215e-06, + "loss": 0.012, + "num_input_tokens_seen": 177081392, + "step": 145520 + }, + { + "epoch": 16.20726138768237, + "grad_norm": 0.4298374652862549, + "learning_rate": 5.280539200249254e-06, + "loss": 0.013, + "num_input_tokens_seen": 177086800, + "step": 145525 + }, + { + "epoch": 16.20781824256599, + "grad_norm": 6.969412788748741e-05, + "learning_rate": 5.279045787923192e-06, + "loss": 0.0302, + "num_input_tokens_seen": 177093296, + "step": 145530 + }, + { + "epoch": 16.208375097449604, + "grad_norm": 1.6674093008041382, + "learning_rate": 5.277552561877336e-06, + "loss": 0.0279, + "num_input_tokens_seen": 177099664, + "step": 145535 + }, + { + "epoch": 16.208931952333224, + "grad_norm": 0.017075883224606514, + "learning_rate": 5.276059522125806e-06, + "loss": 0.0026, + "num_input_tokens_seen": 177105360, + "step": 145540 + }, + { + "epoch": 16.20948880721684, + "grad_norm": 0.25013187527656555, + "learning_rate": 5.2745666686826925e-06, + "loss": 0.0779, + "num_input_tokens_seen": 177111312, + "step": 145545 + }, + { + "epoch": 16.210045662100455, + "grad_norm": 0.0006234376924112439, + "learning_rate": 5.273074001562103e-06, + "loss": 0.0496, + "num_input_tokens_seen": 177117520, + "step": 145550 + }, + { + "epoch": 16.210602516984075, + "grad_norm": 1.5572272539138794, + "learning_rate": 5.27158152077813e-06, + "loss": 0.0874, + "num_input_tokens_seen": 177123600, + "step": 145555 + }, + { + "epoch": 16.21115937186769, + "grad_norm": 0.04756378382444382, + "learning_rate": 5.270089226344879e-06, + "loss": 0.0513, + "num_input_tokens_seen": 177129744, + "step": 145560 + }, + { + "epoch": 16.21171622675131, + "grad_norm": 0.03505998104810715, + "learning_rate": 5.268597118276436e-06, + "loss": 0.0646, + "num_input_tokens_seen": 177136240, + "step": 145565 + }, + { + "epoch": 16.212273081634926, + "grad_norm": 0.650416374206543, + "learning_rate": 5.267105196586919e-06, + "loss": 0.0296, + "num_input_tokens_seen": 177142608, + "step": 145570 + }, + { + "epoch": 16.212829936518542, + "grad_norm": 0.00025186126003973186, + "learning_rate": 5.265613461290386e-06, + "loss": 0.0213, + "num_input_tokens_seen": 177148656, + "step": 145575 + }, + { + "epoch": 16.21338679140216, + "grad_norm": 0.01710011251270771, + "learning_rate": 5.2641219124009515e-06, + "loss": 0.0201, + "num_input_tokens_seen": 177154064, + "step": 145580 + }, + { + "epoch": 16.213943646285777, + "grad_norm": 0.0032831779681146145, + "learning_rate": 5.2626305499326925e-06, + "loss": 0.0073, + "num_input_tokens_seen": 177159760, + "step": 145585 + }, + { + "epoch": 16.214500501169397, + "grad_norm": 0.7595800757408142, + "learning_rate": 5.2611393738997064e-06, + "loss": 0.0082, + "num_input_tokens_seen": 177165456, + "step": 145590 + }, + { + "epoch": 16.215057356053013, + "grad_norm": 0.02192230150103569, + "learning_rate": 5.2596483843160735e-06, + "loss": 0.0094, + "num_input_tokens_seen": 177171792, + "step": 145595 + }, + { + "epoch": 16.21561421093663, + "grad_norm": 0.45127442479133606, + "learning_rate": 5.25815758119588e-06, + "loss": 0.0256, + "num_input_tokens_seen": 177178032, + "step": 145600 + }, + { + "epoch": 16.216171065820248, + "grad_norm": 0.019958315417170525, + "learning_rate": 5.256666964553197e-06, + "loss": 0.0367, + "num_input_tokens_seen": 177184272, + "step": 145605 + }, + { + "epoch": 16.216727920703864, + "grad_norm": 0.27621835470199585, + "learning_rate": 5.255176534402118e-06, + "loss": 0.0098, + "num_input_tokens_seen": 177190128, + "step": 145610 + }, + { + "epoch": 16.217284775587483, + "grad_norm": 1.2738591432571411, + "learning_rate": 5.253686290756718e-06, + "loss": 0.0198, + "num_input_tokens_seen": 177196112, + "step": 145615 + }, + { + "epoch": 16.2178416304711, + "grad_norm": 0.00011712851119227707, + "learning_rate": 5.252196233631068e-06, + "loss": 0.0043, + "num_input_tokens_seen": 177201744, + "step": 145620 + }, + { + "epoch": 16.218398485354715, + "grad_norm": 1.0591330528259277, + "learning_rate": 5.250706363039243e-06, + "loss": 0.1098, + "num_input_tokens_seen": 177207792, + "step": 145625 + }, + { + "epoch": 16.218955340238335, + "grad_norm": 0.00017210021906066686, + "learning_rate": 5.249216678995325e-06, + "loss": 0.0028, + "num_input_tokens_seen": 177214000, + "step": 145630 + }, + { + "epoch": 16.21951219512195, + "grad_norm": 0.013533670455217361, + "learning_rate": 5.247727181513379e-06, + "loss": 0.0056, + "num_input_tokens_seen": 177220080, + "step": 145635 + }, + { + "epoch": 16.22006905000557, + "grad_norm": 0.5903714895248413, + "learning_rate": 5.246237870607476e-06, + "loss": 0.0149, + "num_input_tokens_seen": 177226064, + "step": 145640 + }, + { + "epoch": 16.220625904889186, + "grad_norm": 0.18522337079048157, + "learning_rate": 5.24474874629168e-06, + "loss": 0.0062, + "num_input_tokens_seen": 177232304, + "step": 145645 + }, + { + "epoch": 16.2211827597728, + "grad_norm": 0.7047907114028931, + "learning_rate": 5.243259808580056e-06, + "loss": 0.1133, + "num_input_tokens_seen": 177238544, + "step": 145650 + }, + { + "epoch": 16.22173961465642, + "grad_norm": 0.00019864371279254556, + "learning_rate": 5.241771057486677e-06, + "loss": 0.0296, + "num_input_tokens_seen": 177244784, + "step": 145655 + }, + { + "epoch": 16.222296469540037, + "grad_norm": 0.680315375328064, + "learning_rate": 5.240282493025594e-06, + "loss": 0.0548, + "num_input_tokens_seen": 177250896, + "step": 145660 + }, + { + "epoch": 16.222853324423657, + "grad_norm": 0.003264747792854905, + "learning_rate": 5.238794115210882e-06, + "loss": 0.065, + "num_input_tokens_seen": 177256912, + "step": 145665 + }, + { + "epoch": 16.223410179307272, + "grad_norm": 1.6370354890823364, + "learning_rate": 5.237305924056593e-06, + "loss": 0.0473, + "num_input_tokens_seen": 177262704, + "step": 145670 + }, + { + "epoch": 16.22396703419089, + "grad_norm": 0.051193177700042725, + "learning_rate": 5.23581791957678e-06, + "loss": 0.0333, + "num_input_tokens_seen": 177268880, + "step": 145675 + }, + { + "epoch": 16.224523889074508, + "grad_norm": 0.10804186761379242, + "learning_rate": 5.2343301017854975e-06, + "loss": 0.0148, + "num_input_tokens_seen": 177275024, + "step": 145680 + }, + { + "epoch": 16.225080743958124, + "grad_norm": 2.063678026199341, + "learning_rate": 5.2328424706968085e-06, + "loss": 0.0078, + "num_input_tokens_seen": 177281200, + "step": 145685 + }, + { + "epoch": 16.225637598841743, + "grad_norm": 0.022298764437437057, + "learning_rate": 5.231355026324758e-06, + "loss": 0.0126, + "num_input_tokens_seen": 177287120, + "step": 145690 + }, + { + "epoch": 16.22619445372536, + "grad_norm": 0.0008286081138066947, + "learning_rate": 5.229867768683399e-06, + "loss": 0.1971, + "num_input_tokens_seen": 177293200, + "step": 145695 + }, + { + "epoch": 16.226751308608975, + "grad_norm": 0.3948650658130646, + "learning_rate": 5.228380697786772e-06, + "loss": 0.0489, + "num_input_tokens_seen": 177299344, + "step": 145700 + }, + { + "epoch": 16.227308163492594, + "grad_norm": 0.45985347032546997, + "learning_rate": 5.226893813648939e-06, + "loss": 0.0395, + "num_input_tokens_seen": 177304880, + "step": 145705 + }, + { + "epoch": 16.22786501837621, + "grad_norm": 0.7785657048225403, + "learning_rate": 5.225407116283925e-06, + "loss": 0.038, + "num_input_tokens_seen": 177310992, + "step": 145710 + }, + { + "epoch": 16.22842187325983, + "grad_norm": 0.04351262375712395, + "learning_rate": 5.223920605705801e-06, + "loss": 0.0273, + "num_input_tokens_seen": 177317040, + "step": 145715 + }, + { + "epoch": 16.228978728143446, + "grad_norm": 0.9479683637619019, + "learning_rate": 5.222434281928576e-06, + "loss": 0.0723, + "num_input_tokens_seen": 177323248, + "step": 145720 + }, + { + "epoch": 16.22953558302706, + "grad_norm": 0.006587578449398279, + "learning_rate": 5.220948144966312e-06, + "loss": 0.0714, + "num_input_tokens_seen": 177329392, + "step": 145725 + }, + { + "epoch": 16.23009243791068, + "grad_norm": 0.006297338288277388, + "learning_rate": 5.21946219483303e-06, + "loss": 0.0153, + "num_input_tokens_seen": 177335664, + "step": 145730 + }, + { + "epoch": 16.230649292794297, + "grad_norm": 0.08167305588722229, + "learning_rate": 5.217976431542787e-06, + "loss": 0.0193, + "num_input_tokens_seen": 177341584, + "step": 145735 + }, + { + "epoch": 16.231206147677916, + "grad_norm": 0.015003698877990246, + "learning_rate": 5.216490855109601e-06, + "loss": 0.0469, + "num_input_tokens_seen": 177347600, + "step": 145740 + }, + { + "epoch": 16.231763002561532, + "grad_norm": 0.0023747447412461042, + "learning_rate": 5.215005465547513e-06, + "loss": 0.0038, + "num_input_tokens_seen": 177353808, + "step": 145745 + }, + { + "epoch": 16.232319857445148, + "grad_norm": 0.05529511347413063, + "learning_rate": 5.213520262870542e-06, + "loss": 0.0202, + "num_input_tokens_seen": 177358480, + "step": 145750 + }, + { + "epoch": 16.232876712328768, + "grad_norm": 0.013333039358258247, + "learning_rate": 5.2120352470927305e-06, + "loss": 0.0471, + "num_input_tokens_seen": 177364752, + "step": 145755 + }, + { + "epoch": 16.233433567212384, + "grad_norm": 0.759928822517395, + "learning_rate": 5.210550418228099e-06, + "loss": 0.1021, + "num_input_tokens_seen": 177370672, + "step": 145760 + }, + { + "epoch": 16.233990422096003, + "grad_norm": 0.8906694650650024, + "learning_rate": 5.209065776290676e-06, + "loss": 0.0228, + "num_input_tokens_seen": 177376240, + "step": 145765 + }, + { + "epoch": 16.23454727697962, + "grad_norm": 1.9517403841018677, + "learning_rate": 5.207581321294477e-06, + "loss": 0.0862, + "num_input_tokens_seen": 177382416, + "step": 145770 + }, + { + "epoch": 16.235104131863235, + "grad_norm": 0.6478530168533325, + "learning_rate": 5.206097053253533e-06, + "loss": 0.0144, + "num_input_tokens_seen": 177388720, + "step": 145775 + }, + { + "epoch": 16.235660986746854, + "grad_norm": 0.00042914951336570084, + "learning_rate": 5.204612972181857e-06, + "loss": 0.0063, + "num_input_tokens_seen": 177394672, + "step": 145780 + }, + { + "epoch": 16.23621784163047, + "grad_norm": 0.0009186224779114127, + "learning_rate": 5.203129078093485e-06, + "loss": 0.0532, + "num_input_tokens_seen": 177400848, + "step": 145785 + }, + { + "epoch": 16.23677469651409, + "grad_norm": 0.004224629607051611, + "learning_rate": 5.201645371002406e-06, + "loss": 0.0859, + "num_input_tokens_seen": 177407184, + "step": 145790 + }, + { + "epoch": 16.237331551397705, + "grad_norm": 0.002813376020640135, + "learning_rate": 5.200161850922655e-06, + "loss": 0.0067, + "num_input_tokens_seen": 177413296, + "step": 145795 + }, + { + "epoch": 16.23788840628132, + "grad_norm": 1.734106421470642, + "learning_rate": 5.19867851786823e-06, + "loss": 0.046, + "num_input_tokens_seen": 177419152, + "step": 145800 + }, + { + "epoch": 16.23844526116494, + "grad_norm": 0.00015487930795643479, + "learning_rate": 5.1971953718531615e-06, + "loss": 0.0022, + "num_input_tokens_seen": 177425296, + "step": 145805 + }, + { + "epoch": 16.239002116048557, + "grad_norm": 0.7716036438941956, + "learning_rate": 5.195712412891446e-06, + "loss": 0.0148, + "num_input_tokens_seen": 177431408, + "step": 145810 + }, + { + "epoch": 16.239558970932176, + "grad_norm": 0.053428176790475845, + "learning_rate": 5.194229640997095e-06, + "loss": 0.0097, + "num_input_tokens_seen": 177437552, + "step": 145815 + }, + { + "epoch": 16.240115825815792, + "grad_norm": 0.8371108174324036, + "learning_rate": 5.192747056184105e-06, + "loss": 0.135, + "num_input_tokens_seen": 177443568, + "step": 145820 + }, + { + "epoch": 16.24067268069941, + "grad_norm": 0.34693053364753723, + "learning_rate": 5.191264658466493e-06, + "loss": 0.022, + "num_input_tokens_seen": 177449936, + "step": 145825 + }, + { + "epoch": 16.241229535583027, + "grad_norm": 0.00278083817102015, + "learning_rate": 5.189782447858261e-06, + "loss": 0.0467, + "num_input_tokens_seen": 177455728, + "step": 145830 + }, + { + "epoch": 16.241786390466643, + "grad_norm": 0.07574081420898438, + "learning_rate": 5.188300424373405e-06, + "loss": 0.0127, + "num_input_tokens_seen": 177461936, + "step": 145835 + }, + { + "epoch": 16.242343245350263, + "grad_norm": 0.951007604598999, + "learning_rate": 5.186818588025916e-06, + "loss": 0.0188, + "num_input_tokens_seen": 177467664, + "step": 145840 + }, + { + "epoch": 16.24290010023388, + "grad_norm": 0.19981203973293304, + "learning_rate": 5.185336938829807e-06, + "loss": 0.0045, + "num_input_tokens_seen": 177474288, + "step": 145845 + }, + { + "epoch": 16.243456955117498, + "grad_norm": 2.231193780899048, + "learning_rate": 5.183855476799057e-06, + "loss": 0.0831, + "num_input_tokens_seen": 177480080, + "step": 145850 + }, + { + "epoch": 16.244013810001114, + "grad_norm": 1.6187381744384766, + "learning_rate": 5.182374201947685e-06, + "loss": 0.0714, + "num_input_tokens_seen": 177486256, + "step": 145855 + }, + { + "epoch": 16.24457066488473, + "grad_norm": 0.10358340293169022, + "learning_rate": 5.1808931142896506e-06, + "loss": 0.0044, + "num_input_tokens_seen": 177492368, + "step": 145860 + }, + { + "epoch": 16.24512751976835, + "grad_norm": 0.017824716866016388, + "learning_rate": 5.179412213838969e-06, + "loss": 0.0249, + "num_input_tokens_seen": 177498416, + "step": 145865 + }, + { + "epoch": 16.245684374651965, + "grad_norm": 0.46532005071640015, + "learning_rate": 5.177931500609609e-06, + "loss": 0.0055, + "num_input_tokens_seen": 177504528, + "step": 145870 + }, + { + "epoch": 16.246241229535585, + "grad_norm": 0.6150248050689697, + "learning_rate": 5.176450974615577e-06, + "loss": 0.0271, + "num_input_tokens_seen": 177510736, + "step": 145875 + }, + { + "epoch": 16.2467980844192, + "grad_norm": 0.003049326129257679, + "learning_rate": 5.174970635870846e-06, + "loss": 0.1004, + "num_input_tokens_seen": 177517296, + "step": 145880 + }, + { + "epoch": 16.247354939302816, + "grad_norm": 0.15078607201576233, + "learning_rate": 5.173490484389401e-06, + "loss": 0.0197, + "num_input_tokens_seen": 177523792, + "step": 145885 + }, + { + "epoch": 16.247911794186436, + "grad_norm": 0.005566669628024101, + "learning_rate": 5.172010520185216e-06, + "loss": 0.0964, + "num_input_tokens_seen": 177529840, + "step": 145890 + }, + { + "epoch": 16.248468649070052, + "grad_norm": 0.0034191044978797436, + "learning_rate": 5.1705307432722865e-06, + "loss": 0.0068, + "num_input_tokens_seen": 177536048, + "step": 145895 + }, + { + "epoch": 16.24902550395367, + "grad_norm": 0.6267634630203247, + "learning_rate": 5.169051153664578e-06, + "loss": 0.1282, + "num_input_tokens_seen": 177542192, + "step": 145900 + }, + { + "epoch": 16.249582358837287, + "grad_norm": 0.039182037115097046, + "learning_rate": 5.167571751376072e-06, + "loss": 0.0707, + "num_input_tokens_seen": 177548144, + "step": 145905 + }, + { + "epoch": 16.250139213720903, + "grad_norm": 0.1776621788740158, + "learning_rate": 5.166092536420733e-06, + "loss": 0.0226, + "num_input_tokens_seen": 177554384, + "step": 145910 + }, + { + "epoch": 16.250696068604523, + "grad_norm": 0.06453804671764374, + "learning_rate": 5.164613508812546e-06, + "loss": 0.0132, + "num_input_tokens_seen": 177559856, + "step": 145915 + }, + { + "epoch": 16.25125292348814, + "grad_norm": 0.20313748717308044, + "learning_rate": 5.163134668565472e-06, + "loss": 0.0054, + "num_input_tokens_seen": 177566160, + "step": 145920 + }, + { + "epoch": 16.251809778371758, + "grad_norm": 2.1125025749206543, + "learning_rate": 5.161656015693489e-06, + "loss": 0.0565, + "num_input_tokens_seen": 177571984, + "step": 145925 + }, + { + "epoch": 16.252366633255374, + "grad_norm": 0.0008314131409861147, + "learning_rate": 5.16017755021056e-06, + "loss": 0.055, + "num_input_tokens_seen": 177578032, + "step": 145930 + }, + { + "epoch": 16.25292348813899, + "grad_norm": 2.819319725036621, + "learning_rate": 5.158699272130649e-06, + "loss": 0.1169, + "num_input_tokens_seen": 177584080, + "step": 145935 + }, + { + "epoch": 16.25348034302261, + "grad_norm": 0.7048434019088745, + "learning_rate": 5.157221181467714e-06, + "loss": 0.0145, + "num_input_tokens_seen": 177590480, + "step": 145940 + }, + { + "epoch": 16.254037197906225, + "grad_norm": 0.0037481747567653656, + "learning_rate": 5.155743278235728e-06, + "loss": 0.0192, + "num_input_tokens_seen": 177596688, + "step": 145945 + }, + { + "epoch": 16.254594052789844, + "grad_norm": 0.042510680854320526, + "learning_rate": 5.154265562448649e-06, + "loss": 0.1381, + "num_input_tokens_seen": 177602832, + "step": 145950 + }, + { + "epoch": 16.25515090767346, + "grad_norm": 0.9152361750602722, + "learning_rate": 5.152788034120429e-06, + "loss": 0.0187, + "num_input_tokens_seen": 177608848, + "step": 145955 + }, + { + "epoch": 16.255707762557076, + "grad_norm": 0.0003903428732883185, + "learning_rate": 5.151310693265021e-06, + "loss": 0.1061, + "num_input_tokens_seen": 177615024, + "step": 145960 + }, + { + "epoch": 16.256264617440696, + "grad_norm": 0.0015663551166653633, + "learning_rate": 5.149833539896393e-06, + "loss": 0.0263, + "num_input_tokens_seen": 177621104, + "step": 145965 + }, + { + "epoch": 16.25682147232431, + "grad_norm": 2.4236271381378174, + "learning_rate": 5.148356574028481e-06, + "loss": 0.1135, + "num_input_tokens_seen": 177627120, + "step": 145970 + }, + { + "epoch": 16.25737832720793, + "grad_norm": 0.04609733074903488, + "learning_rate": 5.1468797956752634e-06, + "loss": 0.0032, + "num_input_tokens_seen": 177633072, + "step": 145975 + }, + { + "epoch": 16.257935182091547, + "grad_norm": 0.6673648953437805, + "learning_rate": 5.14540320485066e-06, + "loss": 0.1619, + "num_input_tokens_seen": 177638960, + "step": 145980 + }, + { + "epoch": 16.258492036975163, + "grad_norm": 0.022616934031248093, + "learning_rate": 5.143926801568633e-06, + "loss": 0.0115, + "num_input_tokens_seen": 177644784, + "step": 145985 + }, + { + "epoch": 16.259048891858782, + "grad_norm": 0.001684649963863194, + "learning_rate": 5.142450585843122e-06, + "loss": 0.0004, + "num_input_tokens_seen": 177650960, + "step": 145990 + }, + { + "epoch": 16.2596057467424, + "grad_norm": 0.009016308933496475, + "learning_rate": 5.14097455768808e-06, + "loss": 0.0053, + "num_input_tokens_seen": 177657104, + "step": 145995 + }, + { + "epoch": 16.260162601626018, + "grad_norm": 0.007983952760696411, + "learning_rate": 5.1394987171174445e-06, + "loss": 0.0261, + "num_input_tokens_seen": 177663440, + "step": 146000 + }, + { + "epoch": 16.260719456509634, + "grad_norm": 1.0567998886108398, + "learning_rate": 5.138023064145156e-06, + "loss": 0.1259, + "num_input_tokens_seen": 177669648, + "step": 146005 + }, + { + "epoch": 16.26127631139325, + "grad_norm": 0.09343479573726654, + "learning_rate": 5.136547598785146e-06, + "loss": 0.0065, + "num_input_tokens_seen": 177675312, + "step": 146010 + }, + { + "epoch": 16.26183316627687, + "grad_norm": 0.16400428116321564, + "learning_rate": 5.135072321051365e-06, + "loss": 0.0095, + "num_input_tokens_seen": 177681328, + "step": 146015 + }, + { + "epoch": 16.262390021160485, + "grad_norm": 1.11748206615448, + "learning_rate": 5.133597230957743e-06, + "loss": 0.088, + "num_input_tokens_seen": 177686928, + "step": 146020 + }, + { + "epoch": 16.262946876044104, + "grad_norm": 0.008364004082977772, + "learning_rate": 5.132122328518211e-06, + "loss": 0.04, + "num_input_tokens_seen": 177693136, + "step": 146025 + }, + { + "epoch": 16.26350373092772, + "grad_norm": 0.33351075649261475, + "learning_rate": 5.130647613746692e-06, + "loss": 0.0107, + "num_input_tokens_seen": 177699312, + "step": 146030 + }, + { + "epoch": 16.264060585811336, + "grad_norm": 0.031031427904963493, + "learning_rate": 5.129173086657135e-06, + "loss": 0.0109, + "num_input_tokens_seen": 177705424, + "step": 146035 + }, + { + "epoch": 16.264617440694956, + "grad_norm": 0.18228377401828766, + "learning_rate": 5.127698747263457e-06, + "loss": 0.028, + "num_input_tokens_seen": 177711504, + "step": 146040 + }, + { + "epoch": 16.26517429557857, + "grad_norm": 0.0009026466286741197, + "learning_rate": 5.126224595579587e-06, + "loss": 0.0053, + "num_input_tokens_seen": 177717680, + "step": 146045 + }, + { + "epoch": 16.26573115046219, + "grad_norm": 0.042178329080343246, + "learning_rate": 5.124750631619446e-06, + "loss": 0.0149, + "num_input_tokens_seen": 177723920, + "step": 146050 + }, + { + "epoch": 16.266288005345807, + "grad_norm": 0.00047667455510236323, + "learning_rate": 5.123276855396955e-06, + "loss": 0.0157, + "num_input_tokens_seen": 177729808, + "step": 146055 + }, + { + "epoch": 16.266844860229423, + "grad_norm": 0.119200699031353, + "learning_rate": 5.1218032669260455e-06, + "loss": 0.011, + "num_input_tokens_seen": 177736016, + "step": 146060 + }, + { + "epoch": 16.267401715113042, + "grad_norm": 1.731716275215149, + "learning_rate": 5.120329866220622e-06, + "loss": 0.0844, + "num_input_tokens_seen": 177742096, + "step": 146065 + }, + { + "epoch": 16.267958569996658, + "grad_norm": 0.4814808666706085, + "learning_rate": 5.118856653294618e-06, + "loss": 0.0434, + "num_input_tokens_seen": 177747728, + "step": 146070 + }, + { + "epoch": 16.268515424880277, + "grad_norm": 1.3475353717803955, + "learning_rate": 5.11738362816194e-06, + "loss": 0.0622, + "num_input_tokens_seen": 177754032, + "step": 146075 + }, + { + "epoch": 16.269072279763893, + "grad_norm": 0.6856057047843933, + "learning_rate": 5.1159107908365035e-06, + "loss": 0.0967, + "num_input_tokens_seen": 177759888, + "step": 146080 + }, + { + "epoch": 16.26962913464751, + "grad_norm": 0.028810717165470123, + "learning_rate": 5.114438141332215e-06, + "loss": 0.0016, + "num_input_tokens_seen": 177766256, + "step": 146085 + }, + { + "epoch": 16.27018598953113, + "grad_norm": 0.0029374537989497185, + "learning_rate": 5.112965679662998e-06, + "loss": 0.0853, + "num_input_tokens_seen": 177772240, + "step": 146090 + }, + { + "epoch": 16.270742844414745, + "grad_norm": 1.6031670570373535, + "learning_rate": 5.111493405842752e-06, + "loss": 0.0837, + "num_input_tokens_seen": 177778448, + "step": 146095 + }, + { + "epoch": 16.271299699298364, + "grad_norm": 1.2830579280853271, + "learning_rate": 5.110021319885386e-06, + "loss": 0.057, + "num_input_tokens_seen": 177784624, + "step": 146100 + }, + { + "epoch": 16.27185655418198, + "grad_norm": 0.2389483004808426, + "learning_rate": 5.1085494218047955e-06, + "loss": 0.0152, + "num_input_tokens_seen": 177790992, + "step": 146105 + }, + { + "epoch": 16.272413409065596, + "grad_norm": 0.012990059331059456, + "learning_rate": 5.107077711614899e-06, + "loss": 0.0079, + "num_input_tokens_seen": 177797040, + "step": 146110 + }, + { + "epoch": 16.272970263949215, + "grad_norm": 0.00025687593733891845, + "learning_rate": 5.105606189329587e-06, + "loss": 0.0125, + "num_input_tokens_seen": 177803120, + "step": 146115 + }, + { + "epoch": 16.27352711883283, + "grad_norm": 2.076186418533325, + "learning_rate": 5.104134854962778e-06, + "loss": 0.0516, + "num_input_tokens_seen": 177808816, + "step": 146120 + }, + { + "epoch": 16.27408397371645, + "grad_norm": 0.007590703200548887, + "learning_rate": 5.1026637085283405e-06, + "loss": 0.024, + "num_input_tokens_seen": 177815088, + "step": 146125 + }, + { + "epoch": 16.274640828600067, + "grad_norm": 9.858373960014433e-05, + "learning_rate": 5.10119275004019e-06, + "loss": 0.0492, + "num_input_tokens_seen": 177821200, + "step": 146130 + }, + { + "epoch": 16.275197683483682, + "grad_norm": 1.0772840976715088, + "learning_rate": 5.099721979512215e-06, + "loss": 0.0371, + "num_input_tokens_seen": 177827024, + "step": 146135 + }, + { + "epoch": 16.275754538367302, + "grad_norm": 0.039403285831213, + "learning_rate": 5.098251396958312e-06, + "loss": 0.0556, + "num_input_tokens_seen": 177832816, + "step": 146140 + }, + { + "epoch": 16.276311393250918, + "grad_norm": 0.19301044940948486, + "learning_rate": 5.096781002392373e-06, + "loss": 0.0383, + "num_input_tokens_seen": 177838512, + "step": 146145 + }, + { + "epoch": 16.276868248134537, + "grad_norm": 0.028642576187849045, + "learning_rate": 5.095310795828282e-06, + "loss": 0.1695, + "num_input_tokens_seen": 177844592, + "step": 146150 + }, + { + "epoch": 16.277425103018153, + "grad_norm": 0.008946195244789124, + "learning_rate": 5.093840777279921e-06, + "loss": 0.0992, + "num_input_tokens_seen": 177850704, + "step": 146155 + }, + { + "epoch": 16.27798195790177, + "grad_norm": 2.1279096603393555, + "learning_rate": 5.092370946761188e-06, + "loss": 0.1525, + "num_input_tokens_seen": 177856784, + "step": 146160 + }, + { + "epoch": 16.27853881278539, + "grad_norm": 0.01500422228127718, + "learning_rate": 5.090901304285964e-06, + "loss": 0.0687, + "num_input_tokens_seen": 177862800, + "step": 146165 + }, + { + "epoch": 16.279095667669004, + "grad_norm": 0.0001767926150932908, + "learning_rate": 5.089431849868126e-06, + "loss": 0.0119, + "num_input_tokens_seen": 177868784, + "step": 146170 + }, + { + "epoch": 16.279652522552624, + "grad_norm": 1.1216659545898438, + "learning_rate": 5.087962583521549e-06, + "loss": 0.0295, + "num_input_tokens_seen": 177875024, + "step": 146175 + }, + { + "epoch": 16.28020937743624, + "grad_norm": 0.00247824564576149, + "learning_rate": 5.086493505260126e-06, + "loss": 0.0596, + "num_input_tokens_seen": 177880944, + "step": 146180 + }, + { + "epoch": 16.280766232319856, + "grad_norm": 1.0591881275177002, + "learning_rate": 5.085024615097722e-06, + "loss": 0.0191, + "num_input_tokens_seen": 177886928, + "step": 146185 + }, + { + "epoch": 16.281323087203475, + "grad_norm": 0.00025229674065485597, + "learning_rate": 5.083555913048227e-06, + "loss": 0.0442, + "num_input_tokens_seen": 177892592, + "step": 146190 + }, + { + "epoch": 16.28187994208709, + "grad_norm": 0.8007199764251709, + "learning_rate": 5.08208739912549e-06, + "loss": 0.0286, + "num_input_tokens_seen": 177898576, + "step": 146195 + }, + { + "epoch": 16.28243679697071, + "grad_norm": 0.1282109022140503, + "learning_rate": 5.080619073343401e-06, + "loss": 0.0946, + "num_input_tokens_seen": 177903568, + "step": 146200 + }, + { + "epoch": 16.282993651854326, + "grad_norm": 1.1149710416793823, + "learning_rate": 5.079150935715821e-06, + "loss": 0.0894, + "num_input_tokens_seen": 177909872, + "step": 146205 + }, + { + "epoch": 16.283550506737946, + "grad_norm": 0.7038898468017578, + "learning_rate": 5.0776829862566235e-06, + "loss": 0.1246, + "num_input_tokens_seen": 177916080, + "step": 146210 + }, + { + "epoch": 16.28410736162156, + "grad_norm": 1.8641666173934937, + "learning_rate": 5.076215224979675e-06, + "loss": 0.0866, + "num_input_tokens_seen": 177921936, + "step": 146215 + }, + { + "epoch": 16.284664216505178, + "grad_norm": 0.007662974298000336, + "learning_rate": 5.074747651898834e-06, + "loss": 0.0175, + "num_input_tokens_seen": 177928048, + "step": 146220 + }, + { + "epoch": 16.285221071388797, + "grad_norm": 0.010204232297837734, + "learning_rate": 5.0732802670279604e-06, + "loss": 0.1405, + "num_input_tokens_seen": 177934224, + "step": 146225 + }, + { + "epoch": 16.285777926272413, + "grad_norm": 0.4732109308242798, + "learning_rate": 5.071813070380924e-06, + "loss": 0.0061, + "num_input_tokens_seen": 177940336, + "step": 146230 + }, + { + "epoch": 16.286334781156032, + "grad_norm": 0.2252376675605774, + "learning_rate": 5.070346061971581e-06, + "loss": 0.0974, + "num_input_tokens_seen": 177945936, + "step": 146235 + }, + { + "epoch": 16.28689163603965, + "grad_norm": 0.3587166666984558, + "learning_rate": 5.068879241813787e-06, + "loss": 0.0255, + "num_input_tokens_seen": 177952336, + "step": 146240 + }, + { + "epoch": 16.287448490923264, + "grad_norm": 0.3379691541194916, + "learning_rate": 5.06741260992139e-06, + "loss": 0.008, + "num_input_tokens_seen": 177958224, + "step": 146245 + }, + { + "epoch": 16.288005345806884, + "grad_norm": 1.0622323751449585, + "learning_rate": 5.065946166308258e-06, + "loss": 0.0294, + "num_input_tokens_seen": 177963984, + "step": 146250 + }, + { + "epoch": 16.2885622006905, + "grad_norm": 1.4382230043411255, + "learning_rate": 5.064479910988226e-06, + "loss": 0.0337, + "num_input_tokens_seen": 177970384, + "step": 146255 + }, + { + "epoch": 16.28911905557412, + "grad_norm": 0.012642906978726387, + "learning_rate": 5.063013843975162e-06, + "loss": 0.019, + "num_input_tokens_seen": 177976720, + "step": 146260 + }, + { + "epoch": 16.289675910457735, + "grad_norm": 0.15907375514507294, + "learning_rate": 5.0615479652829064e-06, + "loss": 0.0071, + "num_input_tokens_seen": 177983056, + "step": 146265 + }, + { + "epoch": 16.29023276534135, + "grad_norm": 1.3576288223266602, + "learning_rate": 5.060082274925304e-06, + "loss": 0.032, + "num_input_tokens_seen": 177989232, + "step": 146270 + }, + { + "epoch": 16.29078962022497, + "grad_norm": 0.041823893785476685, + "learning_rate": 5.058616772916192e-06, + "loss": 0.1165, + "num_input_tokens_seen": 177995024, + "step": 146275 + }, + { + "epoch": 16.291346475108586, + "grad_norm": 0.016272209584712982, + "learning_rate": 5.05715145926943e-06, + "loss": 0.0114, + "num_input_tokens_seen": 178000560, + "step": 146280 + }, + { + "epoch": 16.291903329992206, + "grad_norm": 0.0003902733442373574, + "learning_rate": 5.055686333998849e-06, + "loss": 0.0051, + "num_input_tokens_seen": 178006352, + "step": 146285 + }, + { + "epoch": 16.29246018487582, + "grad_norm": 1.3922146558761597, + "learning_rate": 5.054221397118292e-06, + "loss": 0.021, + "num_input_tokens_seen": 178012592, + "step": 146290 + }, + { + "epoch": 16.293017039759437, + "grad_norm": 0.36510366201400757, + "learning_rate": 5.052756648641585e-06, + "loss": 0.0524, + "num_input_tokens_seen": 178018608, + "step": 146295 + }, + { + "epoch": 16.293573894643057, + "grad_norm": 1.7934606075286865, + "learning_rate": 5.0512920885825794e-06, + "loss": 0.0463, + "num_input_tokens_seen": 178024976, + "step": 146300 + }, + { + "epoch": 16.294130749526673, + "grad_norm": 3.941988945007324, + "learning_rate": 5.049827716955105e-06, + "loss": 0.0716, + "num_input_tokens_seen": 178030608, + "step": 146305 + }, + { + "epoch": 16.294687604410292, + "grad_norm": 0.30150270462036133, + "learning_rate": 5.04836353377299e-06, + "loss": 0.0369, + "num_input_tokens_seen": 178036784, + "step": 146310 + }, + { + "epoch": 16.295244459293908, + "grad_norm": 2.5811927318573, + "learning_rate": 5.04689953905006e-06, + "loss": 0.1756, + "num_input_tokens_seen": 178042544, + "step": 146315 + }, + { + "epoch": 16.295801314177524, + "grad_norm": 0.0012092324905097485, + "learning_rate": 5.045435732800155e-06, + "loss": 0.0435, + "num_input_tokens_seen": 178048944, + "step": 146320 + }, + { + "epoch": 16.296358169061143, + "grad_norm": 0.0017354345181956887, + "learning_rate": 5.043972115037093e-06, + "loss": 0.0483, + "num_input_tokens_seen": 178055088, + "step": 146325 + }, + { + "epoch": 16.29691502394476, + "grad_norm": 0.26553624868392944, + "learning_rate": 5.042508685774708e-06, + "loss": 0.0797, + "num_input_tokens_seen": 178061264, + "step": 146330 + }, + { + "epoch": 16.29747187882838, + "grad_norm": 0.31391641497612, + "learning_rate": 5.041045445026818e-06, + "loss": 0.0181, + "num_input_tokens_seen": 178067312, + "step": 146335 + }, + { + "epoch": 16.298028733711995, + "grad_norm": 0.00014149100752547383, + "learning_rate": 5.039582392807246e-06, + "loss": 0.0239, + "num_input_tokens_seen": 178073488, + "step": 146340 + }, + { + "epoch": 16.29858558859561, + "grad_norm": 2.7079994678497314, + "learning_rate": 5.038119529129804e-06, + "loss": 0.081, + "num_input_tokens_seen": 178079440, + "step": 146345 + }, + { + "epoch": 16.29914244347923, + "grad_norm": 2.1586315631866455, + "learning_rate": 5.03665685400832e-06, + "loss": 0.018, + "num_input_tokens_seen": 178085648, + "step": 146350 + }, + { + "epoch": 16.299699298362846, + "grad_norm": 0.034303147345781326, + "learning_rate": 5.0351943674566084e-06, + "loss": 0.0395, + "num_input_tokens_seen": 178091824, + "step": 146355 + }, + { + "epoch": 16.300256153246465, + "grad_norm": 1.2418652772903442, + "learning_rate": 5.033732069488481e-06, + "loss": 0.0393, + "num_input_tokens_seen": 178097872, + "step": 146360 + }, + { + "epoch": 16.30081300813008, + "grad_norm": 0.13145986199378967, + "learning_rate": 5.032269960117744e-06, + "loss": 0.0141, + "num_input_tokens_seen": 178103888, + "step": 146365 + }, + { + "epoch": 16.301369863013697, + "grad_norm": 0.7040879726409912, + "learning_rate": 5.030808039358223e-06, + "loss": 0.0339, + "num_input_tokens_seen": 178110224, + "step": 146370 + }, + { + "epoch": 16.301926717897317, + "grad_norm": 0.754930317401886, + "learning_rate": 5.029346307223712e-06, + "loss": 0.0836, + "num_input_tokens_seen": 178116528, + "step": 146375 + }, + { + "epoch": 16.302483572780933, + "grad_norm": 0.29249462485313416, + "learning_rate": 5.027884763728039e-06, + "loss": 0.0116, + "num_input_tokens_seen": 178122640, + "step": 146380 + }, + { + "epoch": 16.303040427664552, + "grad_norm": 0.004500449635088444, + "learning_rate": 5.026423408884981e-06, + "loss": 0.0028, + "num_input_tokens_seen": 178128688, + "step": 146385 + }, + { + "epoch": 16.303597282548168, + "grad_norm": 1.7254984378814697, + "learning_rate": 5.0249622427083645e-06, + "loss": 0.0564, + "num_input_tokens_seen": 178134768, + "step": 146390 + }, + { + "epoch": 16.304154137431784, + "grad_norm": 0.9236000180244446, + "learning_rate": 5.023501265211974e-06, + "loss": 0.0506, + "num_input_tokens_seen": 178140432, + "step": 146395 + }, + { + "epoch": 16.304710992315403, + "grad_norm": 1.5068690776824951, + "learning_rate": 5.022040476409629e-06, + "loss": 0.068, + "num_input_tokens_seen": 178146480, + "step": 146400 + }, + { + "epoch": 16.30526784719902, + "grad_norm": 0.8354858756065369, + "learning_rate": 5.020579876315115e-06, + "loss": 0.037, + "num_input_tokens_seen": 178151952, + "step": 146405 + }, + { + "epoch": 16.30582470208264, + "grad_norm": 1.0801379680633545, + "learning_rate": 5.019119464942235e-06, + "loss": 0.0391, + "num_input_tokens_seen": 178158000, + "step": 146410 + }, + { + "epoch": 16.306381556966254, + "grad_norm": 1.135498046875, + "learning_rate": 5.0176592423047705e-06, + "loss": 0.0907, + "num_input_tokens_seen": 178164112, + "step": 146415 + }, + { + "epoch": 16.30693841184987, + "grad_norm": 0.0007675283122807741, + "learning_rate": 5.01619920841653e-06, + "loss": 0.0388, + "num_input_tokens_seen": 178170576, + "step": 146420 + }, + { + "epoch": 16.30749526673349, + "grad_norm": 0.003899324918165803, + "learning_rate": 5.014739363291302e-06, + "loss": 0.028, + "num_input_tokens_seen": 178176880, + "step": 146425 + }, + { + "epoch": 16.308052121617106, + "grad_norm": 0.056039612740278244, + "learning_rate": 5.0132797069428694e-06, + "loss": 0.0608, + "num_input_tokens_seen": 178183216, + "step": 146430 + }, + { + "epoch": 16.308608976500725, + "grad_norm": 1.395022988319397, + "learning_rate": 5.011820239385017e-06, + "loss": 0.1119, + "num_input_tokens_seen": 178189584, + "step": 146435 + }, + { + "epoch": 16.30916583138434, + "grad_norm": 0.00012426664761733264, + "learning_rate": 5.010360960631546e-06, + "loss": 0.0037, + "num_input_tokens_seen": 178195440, + "step": 146440 + }, + { + "epoch": 16.309722686267957, + "grad_norm": 0.06874821335077286, + "learning_rate": 5.008901870696223e-06, + "loss": 0.0564, + "num_input_tokens_seen": 178201264, + "step": 146445 + }, + { + "epoch": 16.310279541151576, + "grad_norm": 1.064538598060608, + "learning_rate": 5.007442969592852e-06, + "loss": 0.0597, + "num_input_tokens_seen": 178206544, + "step": 146450 + }, + { + "epoch": 16.310836396035192, + "grad_norm": 0.055469002574682236, + "learning_rate": 5.005984257335192e-06, + "loss": 0.0398, + "num_input_tokens_seen": 178212272, + "step": 146455 + }, + { + "epoch": 16.31139325091881, + "grad_norm": 0.013320324011147022, + "learning_rate": 5.004525733937024e-06, + "loss": 0.0086, + "num_input_tokens_seen": 178218448, + "step": 146460 + }, + { + "epoch": 16.311950105802428, + "grad_norm": 0.03867018595337868, + "learning_rate": 5.003067399412137e-06, + "loss": 0.1169, + "num_input_tokens_seen": 178224656, + "step": 146465 + }, + { + "epoch": 16.312506960686044, + "grad_norm": 0.1111975610256195, + "learning_rate": 5.001609253774292e-06, + "loss": 0.0415, + "num_input_tokens_seen": 178230640, + "step": 146470 + }, + { + "epoch": 16.313063815569663, + "grad_norm": 2.0943603515625, + "learning_rate": 5.000151297037279e-06, + "loss": 0.036, + "num_input_tokens_seen": 178236112, + "step": 146475 + }, + { + "epoch": 16.31362067045328, + "grad_norm": 0.0027558240108191967, + "learning_rate": 4.99869352921486e-06, + "loss": 0.008, + "num_input_tokens_seen": 178242160, + "step": 146480 + }, + { + "epoch": 16.3141775253369, + "grad_norm": 0.0012098511215299368, + "learning_rate": 4.997235950320803e-06, + "loss": 0.0201, + "num_input_tokens_seen": 178247856, + "step": 146485 + }, + { + "epoch": 16.314734380220514, + "grad_norm": 0.19224882125854492, + "learning_rate": 4.995778560368874e-06, + "loss": 0.0027, + "num_input_tokens_seen": 178254128, + "step": 146490 + }, + { + "epoch": 16.31529123510413, + "grad_norm": 0.0036690484266728163, + "learning_rate": 4.99432135937285e-06, + "loss": 0.0046, + "num_input_tokens_seen": 178260304, + "step": 146495 + }, + { + "epoch": 16.31584808998775, + "grad_norm": 0.008934303186833858, + "learning_rate": 4.992864347346488e-06, + "loss": 0.0487, + "num_input_tokens_seen": 178266352, + "step": 146500 + }, + { + "epoch": 16.316404944871366, + "grad_norm": 0.07938224822282791, + "learning_rate": 4.991407524303551e-06, + "loss": 0.0413, + "num_input_tokens_seen": 178272336, + "step": 146505 + }, + { + "epoch": 16.316961799754985, + "grad_norm": 0.007145457435399294, + "learning_rate": 4.989950890257797e-06, + "loss": 0.0272, + "num_input_tokens_seen": 178278320, + "step": 146510 + }, + { + "epoch": 16.3175186546386, + "grad_norm": 0.8424050807952881, + "learning_rate": 4.988494445222994e-06, + "loss": 0.0266, + "num_input_tokens_seen": 178284464, + "step": 146515 + }, + { + "epoch": 16.318075509522217, + "grad_norm": 0.5134621858596802, + "learning_rate": 4.987038189212887e-06, + "loss": 0.0324, + "num_input_tokens_seen": 178290288, + "step": 146520 + }, + { + "epoch": 16.318632364405836, + "grad_norm": 1.011915683746338, + "learning_rate": 4.9855821222412506e-06, + "loss": 0.0134, + "num_input_tokens_seen": 178296656, + "step": 146525 + }, + { + "epoch": 16.319189219289452, + "grad_norm": 0.15538661181926727, + "learning_rate": 4.9841262443218126e-06, + "loss": 0.0082, + "num_input_tokens_seen": 178302832, + "step": 146530 + }, + { + "epoch": 16.31974607417307, + "grad_norm": 9.31759350351058e-05, + "learning_rate": 4.982670555468346e-06, + "loss": 0.1039, + "num_input_tokens_seen": 178309072, + "step": 146535 + }, + { + "epoch": 16.320302929056687, + "grad_norm": 0.027879631146788597, + "learning_rate": 4.981215055694588e-06, + "loss": 0.0105, + "num_input_tokens_seen": 178314736, + "step": 146540 + }, + { + "epoch": 16.320859783940307, + "grad_norm": 0.0019671188201755285, + "learning_rate": 4.979759745014301e-06, + "loss": 0.0061, + "num_input_tokens_seen": 178320880, + "step": 146545 + }, + { + "epoch": 16.321416638823923, + "grad_norm": 0.0011160854483023286, + "learning_rate": 4.978304623441221e-06, + "loss": 0.0432, + "num_input_tokens_seen": 178327216, + "step": 146550 + }, + { + "epoch": 16.32197349370754, + "grad_norm": 0.26702752709388733, + "learning_rate": 4.976849690989094e-06, + "loss": 0.0102, + "num_input_tokens_seen": 178333104, + "step": 146555 + }, + { + "epoch": 16.322530348591158, + "grad_norm": 0.7045627236366272, + "learning_rate": 4.9753949476716576e-06, + "loss": 0.0688, + "num_input_tokens_seen": 178339088, + "step": 146560 + }, + { + "epoch": 16.323087203474774, + "grad_norm": 9.956313442671672e-05, + "learning_rate": 4.9739403935026686e-06, + "loss": 0.0058, + "num_input_tokens_seen": 178345168, + "step": 146565 + }, + { + "epoch": 16.323644058358393, + "grad_norm": 0.10774592310190201, + "learning_rate": 4.972486028495854e-06, + "loss": 0.0099, + "num_input_tokens_seen": 178351504, + "step": 146570 + }, + { + "epoch": 16.32420091324201, + "grad_norm": 1.2834638357162476, + "learning_rate": 4.971031852664957e-06, + "loss": 0.0269, + "num_input_tokens_seen": 178357552, + "step": 146575 + }, + { + "epoch": 16.324757768125625, + "grad_norm": 0.00012820513802580535, + "learning_rate": 4.969577866023703e-06, + "loss": 0.0176, + "num_input_tokens_seen": 178363728, + "step": 146580 + }, + { + "epoch": 16.325314623009245, + "grad_norm": 0.0016924787778407335, + "learning_rate": 4.9681240685858414e-06, + "loss": 0.0011, + "num_input_tokens_seen": 178370032, + "step": 146585 + }, + { + "epoch": 16.32587147789286, + "grad_norm": 0.013026539236307144, + "learning_rate": 4.966670460365088e-06, + "loss": 0.0629, + "num_input_tokens_seen": 178376112, + "step": 146590 + }, + { + "epoch": 16.32642833277648, + "grad_norm": 0.05934000015258789, + "learning_rate": 4.965217041375201e-06, + "loss": 0.0139, + "num_input_tokens_seen": 178381552, + "step": 146595 + }, + { + "epoch": 16.326985187660096, + "grad_norm": 1.3856747150421143, + "learning_rate": 4.963763811629873e-06, + "loss": 0.187, + "num_input_tokens_seen": 178387280, + "step": 146600 + }, + { + "epoch": 16.327542042543712, + "grad_norm": 1.039225459098816, + "learning_rate": 4.962310771142858e-06, + "loss": 0.0208, + "num_input_tokens_seen": 178393456, + "step": 146605 + }, + { + "epoch": 16.32809889742733, + "grad_norm": 0.07027141749858856, + "learning_rate": 4.960857919927863e-06, + "loss": 0.0567, + "num_input_tokens_seen": 178399312, + "step": 146610 + }, + { + "epoch": 16.328655752310947, + "grad_norm": 0.08527876436710358, + "learning_rate": 4.959405257998628e-06, + "loss": 0.095, + "num_input_tokens_seen": 178405232, + "step": 146615 + }, + { + "epoch": 16.329212607194567, + "grad_norm": 2.7592408657073975, + "learning_rate": 4.957952785368866e-06, + "loss": 0.1212, + "num_input_tokens_seen": 178411088, + "step": 146620 + }, + { + "epoch": 16.329769462078183, + "grad_norm": 0.0018474239623174071, + "learning_rate": 4.956500502052297e-06, + "loss": 0.0017, + "num_input_tokens_seen": 178417104, + "step": 146625 + }, + { + "epoch": 16.3303263169618, + "grad_norm": 0.002384322229772806, + "learning_rate": 4.955048408062635e-06, + "loss": 0.0166, + "num_input_tokens_seen": 178423152, + "step": 146630 + }, + { + "epoch": 16.330883171845418, + "grad_norm": 0.003043352160602808, + "learning_rate": 4.9535965034136045e-06, + "loss": 0.0029, + "num_input_tokens_seen": 178429552, + "step": 146635 + }, + { + "epoch": 16.331440026729034, + "grad_norm": 0.7141083478927612, + "learning_rate": 4.952144788118915e-06, + "loss": 0.0673, + "num_input_tokens_seen": 178435408, + "step": 146640 + }, + { + "epoch": 16.331996881612653, + "grad_norm": 1.8247389793395996, + "learning_rate": 4.950693262192283e-06, + "loss": 0.098, + "num_input_tokens_seen": 178441552, + "step": 146645 + }, + { + "epoch": 16.33255373649627, + "grad_norm": 0.012230145744979382, + "learning_rate": 4.949241925647408e-06, + "loss": 0.0252, + "num_input_tokens_seen": 178447920, + "step": 146650 + }, + { + "epoch": 16.333110591379885, + "grad_norm": 0.013065202161669731, + "learning_rate": 4.947790778498015e-06, + "loss": 0.0368, + "num_input_tokens_seen": 178454064, + "step": 146655 + }, + { + "epoch": 16.333667446263505, + "grad_norm": 0.2609037756919861, + "learning_rate": 4.946339820757798e-06, + "loss": 0.0247, + "num_input_tokens_seen": 178460048, + "step": 146660 + }, + { + "epoch": 16.33422430114712, + "grad_norm": 0.02314598672091961, + "learning_rate": 4.944889052440471e-06, + "loss": 0.0246, + "num_input_tokens_seen": 178465936, + "step": 146665 + }, + { + "epoch": 16.33478115603074, + "grad_norm": 1.5474047660827637, + "learning_rate": 4.943438473559739e-06, + "loss": 0.0402, + "num_input_tokens_seen": 178472112, + "step": 146670 + }, + { + "epoch": 16.335338010914356, + "grad_norm": 0.002970827044919133, + "learning_rate": 4.9419880841292986e-06, + "loss": 0.0324, + "num_input_tokens_seen": 178478544, + "step": 146675 + }, + { + "epoch": 16.33589486579797, + "grad_norm": 1.7163636684417725, + "learning_rate": 4.9405378841628406e-06, + "loss": 0.0701, + "num_input_tokens_seen": 178483920, + "step": 146680 + }, + { + "epoch": 16.33645172068159, + "grad_norm": 0.00018148864910472184, + "learning_rate": 4.9390878736740834e-06, + "loss": 0.0735, + "num_input_tokens_seen": 178490352, + "step": 146685 + }, + { + "epoch": 16.337008575565207, + "grad_norm": 1.312570571899414, + "learning_rate": 4.937638052676716e-06, + "loss": 0.0627, + "num_input_tokens_seen": 178496208, + "step": 146690 + }, + { + "epoch": 16.337565430448826, + "grad_norm": 0.0015997058944776654, + "learning_rate": 4.936188421184426e-06, + "loss": 0.1135, + "num_input_tokens_seen": 178501808, + "step": 146695 + }, + { + "epoch": 16.338122285332442, + "grad_norm": 0.9610899090766907, + "learning_rate": 4.934738979210909e-06, + "loss": 0.0168, + "num_input_tokens_seen": 178507792, + "step": 146700 + }, + { + "epoch": 16.33867914021606, + "grad_norm": 0.12378652393817902, + "learning_rate": 4.933289726769863e-06, + "loss": 0.0191, + "num_input_tokens_seen": 178514000, + "step": 146705 + }, + { + "epoch": 16.339235995099678, + "grad_norm": 0.046461787074804306, + "learning_rate": 4.9318406638749645e-06, + "loss": 0.0014, + "num_input_tokens_seen": 178519856, + "step": 146710 + }, + { + "epoch": 16.339792849983294, + "grad_norm": 0.6132500767707825, + "learning_rate": 4.930391790539926e-06, + "loss": 0.068, + "num_input_tokens_seen": 178525872, + "step": 146715 + }, + { + "epoch": 16.340349704866913, + "grad_norm": 0.00031510574626736343, + "learning_rate": 4.928943106778399e-06, + "loss": 0.0056, + "num_input_tokens_seen": 178532080, + "step": 146720 + }, + { + "epoch": 16.34090655975053, + "grad_norm": 0.015450142323970795, + "learning_rate": 4.927494612604097e-06, + "loss": 0.0264, + "num_input_tokens_seen": 178537904, + "step": 146725 + }, + { + "epoch": 16.341463414634145, + "grad_norm": 0.00024355475034099072, + "learning_rate": 4.926046308030679e-06, + "loss": 0.006, + "num_input_tokens_seen": 178543792, + "step": 146730 + }, + { + "epoch": 16.342020269517764, + "grad_norm": 0.17761030793190002, + "learning_rate": 4.924598193071847e-06, + "loss": 0.0422, + "num_input_tokens_seen": 178549808, + "step": 146735 + }, + { + "epoch": 16.34257712440138, + "grad_norm": 0.0016513385344296694, + "learning_rate": 4.923150267741266e-06, + "loss": 0.0077, + "num_input_tokens_seen": 178555696, + "step": 146740 + }, + { + "epoch": 16.343133979285, + "grad_norm": 0.07555469125509262, + "learning_rate": 4.921702532052616e-06, + "loss": 0.0211, + "num_input_tokens_seen": 178562032, + "step": 146745 + }, + { + "epoch": 16.343690834168616, + "grad_norm": 0.04499127343297005, + "learning_rate": 4.920254986019568e-06, + "loss": 0.0092, + "num_input_tokens_seen": 178568400, + "step": 146750 + }, + { + "epoch": 16.34424768905223, + "grad_norm": 0.00014814532187301666, + "learning_rate": 4.918807629655806e-06, + "loss": 0.0126, + "num_input_tokens_seen": 178574352, + "step": 146755 + }, + { + "epoch": 16.34480454393585, + "grad_norm": 0.0003445815818849951, + "learning_rate": 4.9173604629749905e-06, + "loss": 0.001, + "num_input_tokens_seen": 178580688, + "step": 146760 + }, + { + "epoch": 16.345361398819467, + "grad_norm": 0.005878667812794447, + "learning_rate": 4.9159134859908e-06, + "loss": 0.0142, + "num_input_tokens_seen": 178586704, + "step": 146765 + }, + { + "epoch": 16.345918253703086, + "grad_norm": 0.028118973597884178, + "learning_rate": 4.914466698716888e-06, + "loss": 0.0321, + "num_input_tokens_seen": 178593008, + "step": 146770 + }, + { + "epoch": 16.346475108586702, + "grad_norm": 0.2800801396369934, + "learning_rate": 4.913020101166938e-06, + "loss": 0.1379, + "num_input_tokens_seen": 178598992, + "step": 146775 + }, + { + "epoch": 16.347031963470318, + "grad_norm": 0.012329846620559692, + "learning_rate": 4.911573693354602e-06, + "loss": 0.005, + "num_input_tokens_seen": 178605296, + "step": 146780 + }, + { + "epoch": 16.347588818353938, + "grad_norm": 0.41382601857185364, + "learning_rate": 4.9101274752935575e-06, + "loss": 0.104, + "num_input_tokens_seen": 178611088, + "step": 146785 + }, + { + "epoch": 16.348145673237553, + "grad_norm": 0.05900000035762787, + "learning_rate": 4.908681446997443e-06, + "loss": 0.0358, + "num_input_tokens_seen": 178617520, + "step": 146790 + }, + { + "epoch": 16.348702528121173, + "grad_norm": 0.9227908849716187, + "learning_rate": 4.907235608479935e-06, + "loss": 0.095, + "num_input_tokens_seen": 178623760, + "step": 146795 + }, + { + "epoch": 16.34925938300479, + "grad_norm": 0.0002571569348219782, + "learning_rate": 4.905789959754678e-06, + "loss": 0.0415, + "num_input_tokens_seen": 178629968, + "step": 146800 + }, + { + "epoch": 16.349816237888405, + "grad_norm": 0.2789567708969116, + "learning_rate": 4.9043445008353395e-06, + "loss": 0.0152, + "num_input_tokens_seen": 178636176, + "step": 146805 + }, + { + "epoch": 16.350373092772024, + "grad_norm": 0.020102960988879204, + "learning_rate": 4.90289923173557e-06, + "loss": 0.0003, + "num_input_tokens_seen": 178642032, + "step": 146810 + }, + { + "epoch": 16.35092994765564, + "grad_norm": 1.8513978719711304, + "learning_rate": 4.9014541524690175e-06, + "loss": 0.0369, + "num_input_tokens_seen": 178647664, + "step": 146815 + }, + { + "epoch": 16.35148680253926, + "grad_norm": 0.700288712978363, + "learning_rate": 4.900009263049327e-06, + "loss": 0.1003, + "num_input_tokens_seen": 178653552, + "step": 146820 + }, + { + "epoch": 16.352043657422875, + "grad_norm": 2.1499786376953125, + "learning_rate": 4.898564563490157e-06, + "loss": 0.0522, + "num_input_tokens_seen": 178659792, + "step": 146825 + }, + { + "epoch": 16.35260051230649, + "grad_norm": 0.009968435391783714, + "learning_rate": 4.897120053805155e-06, + "loss": 0.0144, + "num_input_tokens_seen": 178665648, + "step": 146830 + }, + { + "epoch": 16.35315736719011, + "grad_norm": 0.7081744074821472, + "learning_rate": 4.895675734007957e-06, + "loss": 0.012, + "num_input_tokens_seen": 178671728, + "step": 146835 + }, + { + "epoch": 16.353714222073727, + "grad_norm": 0.03369877487421036, + "learning_rate": 4.894231604112201e-06, + "loss": 0.0604, + "num_input_tokens_seen": 178677904, + "step": 146840 + }, + { + "epoch": 16.354271076957346, + "grad_norm": 1.5793826580047607, + "learning_rate": 4.892787664131546e-06, + "loss": 0.044, + "num_input_tokens_seen": 178683856, + "step": 146845 + }, + { + "epoch": 16.354827931840962, + "grad_norm": 0.017810625955462456, + "learning_rate": 4.89134391407961e-06, + "loss": 0.0318, + "num_input_tokens_seen": 178689872, + "step": 146850 + }, + { + "epoch": 16.355384786724578, + "grad_norm": 0.5154350996017456, + "learning_rate": 4.889900353970059e-06, + "loss": 0.0951, + "num_input_tokens_seen": 178696144, + "step": 146855 + }, + { + "epoch": 16.355941641608197, + "grad_norm": 1.0495091676712036, + "learning_rate": 4.888456983816498e-06, + "loss": 0.087, + "num_input_tokens_seen": 178701424, + "step": 146860 + }, + { + "epoch": 16.356498496491813, + "grad_norm": 2.262747049331665, + "learning_rate": 4.887013803632575e-06, + "loss": 0.0494, + "num_input_tokens_seen": 178707952, + "step": 146865 + }, + { + "epoch": 16.357055351375433, + "grad_norm": 9.103722550207749e-05, + "learning_rate": 4.885570813431928e-06, + "loss": 0.0269, + "num_input_tokens_seen": 178714224, + "step": 146870 + }, + { + "epoch": 16.35761220625905, + "grad_norm": 0.015201929025352001, + "learning_rate": 4.884128013228171e-06, + "loss": 0.1098, + "num_input_tokens_seen": 178720336, + "step": 146875 + }, + { + "epoch": 16.358169061142668, + "grad_norm": 0.23590144515037537, + "learning_rate": 4.882685403034945e-06, + "loss": 0.1151, + "num_input_tokens_seen": 178726128, + "step": 146880 + }, + { + "epoch": 16.358725916026284, + "grad_norm": 7.51252009649761e-05, + "learning_rate": 4.881242982865875e-06, + "loss": 0.0805, + "num_input_tokens_seen": 178732528, + "step": 146885 + }, + { + "epoch": 16.3592827709099, + "grad_norm": 0.018459169194102287, + "learning_rate": 4.879800752734584e-06, + "loss": 0.0303, + "num_input_tokens_seen": 178738544, + "step": 146890 + }, + { + "epoch": 16.35983962579352, + "grad_norm": 0.015064109116792679, + "learning_rate": 4.87835871265469e-06, + "loss": 0.0221, + "num_input_tokens_seen": 178744656, + "step": 146895 + }, + { + "epoch": 16.360396480677135, + "grad_norm": 1.4289343357086182, + "learning_rate": 4.876916862639824e-06, + "loss": 0.1236, + "num_input_tokens_seen": 178750896, + "step": 146900 + }, + { + "epoch": 16.360953335560755, + "grad_norm": 0.010860737413167953, + "learning_rate": 4.8754752027035996e-06, + "loss": 0.0247, + "num_input_tokens_seen": 178756752, + "step": 146905 + }, + { + "epoch": 16.36151019044437, + "grad_norm": 0.13844996690750122, + "learning_rate": 4.874033732859637e-06, + "loss": 0.0147, + "num_input_tokens_seen": 178762736, + "step": 146910 + }, + { + "epoch": 16.362067045327986, + "grad_norm": 0.8709157109260559, + "learning_rate": 4.8725924531215435e-06, + "loss": 0.0553, + "num_input_tokens_seen": 178768880, + "step": 146915 + }, + { + "epoch": 16.362623900211606, + "grad_norm": 0.008406633511185646, + "learning_rate": 4.871151363502949e-06, + "loss": 0.0503, + "num_input_tokens_seen": 178774960, + "step": 146920 + }, + { + "epoch": 16.36318075509522, + "grad_norm": 0.12635113298892975, + "learning_rate": 4.869710464017446e-06, + "loss": 0.0105, + "num_input_tokens_seen": 178781168, + "step": 146925 + }, + { + "epoch": 16.36373760997884, + "grad_norm": 0.2045736163854599, + "learning_rate": 4.868269754678672e-06, + "loss": 0.0069, + "num_input_tokens_seen": 178787376, + "step": 146930 + }, + { + "epoch": 16.364294464862457, + "grad_norm": 0.05434799939393997, + "learning_rate": 4.866829235500206e-06, + "loss": 0.0176, + "num_input_tokens_seen": 178793552, + "step": 146935 + }, + { + "epoch": 16.364851319746073, + "grad_norm": 0.0031645651906728745, + "learning_rate": 4.865388906495675e-06, + "loss": 0.0476, + "num_input_tokens_seen": 178799664, + "step": 146940 + }, + { + "epoch": 16.365408174629692, + "grad_norm": 0.19401873648166656, + "learning_rate": 4.86394876767867e-06, + "loss": 0.0054, + "num_input_tokens_seen": 178805136, + "step": 146945 + }, + { + "epoch": 16.36596502951331, + "grad_norm": 0.01973675936460495, + "learning_rate": 4.862508819062805e-06, + "loss": 0.1239, + "num_input_tokens_seen": 178811024, + "step": 146950 + }, + { + "epoch": 16.366521884396928, + "grad_norm": 0.04164247214794159, + "learning_rate": 4.861069060661683e-06, + "loss": 0.0646, + "num_input_tokens_seen": 178817360, + "step": 146955 + }, + { + "epoch": 16.367078739280544, + "grad_norm": 2.4516303539276123, + "learning_rate": 4.859629492488895e-06, + "loss": 0.0553, + "num_input_tokens_seen": 178823792, + "step": 146960 + }, + { + "epoch": 16.36763559416416, + "grad_norm": 0.008333496749401093, + "learning_rate": 4.8581901145580396e-06, + "loss": 0.008, + "num_input_tokens_seen": 178829872, + "step": 146965 + }, + { + "epoch": 16.36819244904778, + "grad_norm": 0.7103255987167358, + "learning_rate": 4.8567509268827226e-06, + "loss": 0.0082, + "num_input_tokens_seen": 178836048, + "step": 146970 + }, + { + "epoch": 16.368749303931395, + "grad_norm": 0.0009124899515882134, + "learning_rate": 4.8553119294765275e-06, + "loss": 0.0039, + "num_input_tokens_seen": 178842416, + "step": 146975 + }, + { + "epoch": 16.369306158815014, + "grad_norm": 0.024409005418419838, + "learning_rate": 4.8538731223530534e-06, + "loss": 0.0021, + "num_input_tokens_seen": 178848720, + "step": 146980 + }, + { + "epoch": 16.36986301369863, + "grad_norm": 0.021259717643260956, + "learning_rate": 4.8524345055258826e-06, + "loss": 0.0215, + "num_input_tokens_seen": 178855120, + "step": 146985 + }, + { + "epoch": 16.370419868582246, + "grad_norm": 1.0190377235412598, + "learning_rate": 4.850996079008616e-06, + "loss": 0.0428, + "num_input_tokens_seen": 178861168, + "step": 146990 + }, + { + "epoch": 16.370976723465866, + "grad_norm": 1.624338984489441, + "learning_rate": 4.849557842814828e-06, + "loss": 0.0723, + "num_input_tokens_seen": 178867248, + "step": 146995 + }, + { + "epoch": 16.37153357834948, + "grad_norm": 0.2915842533111572, + "learning_rate": 4.848119796958123e-06, + "loss": 0.0331, + "num_input_tokens_seen": 178873424, + "step": 147000 + }, + { + "epoch": 16.3720904332331, + "grad_norm": 0.19297343492507935, + "learning_rate": 4.846681941452058e-06, + "loss": 0.0139, + "num_input_tokens_seen": 178879408, + "step": 147005 + }, + { + "epoch": 16.372647288116717, + "grad_norm": 0.8486462831497192, + "learning_rate": 4.8452442763102376e-06, + "loss": 0.076, + "num_input_tokens_seen": 178885264, + "step": 147010 + }, + { + "epoch": 16.373204143000333, + "grad_norm": 0.7375569343566895, + "learning_rate": 4.843806801546225e-06, + "loss": 0.1464, + "num_input_tokens_seen": 178891600, + "step": 147015 + }, + { + "epoch": 16.373760997883952, + "grad_norm": 0.9099459052085876, + "learning_rate": 4.842369517173612e-06, + "loss": 0.0473, + "num_input_tokens_seen": 178897584, + "step": 147020 + }, + { + "epoch": 16.374317852767568, + "grad_norm": 0.059947770088911057, + "learning_rate": 4.840932423205968e-06, + "loss": 0.0124, + "num_input_tokens_seen": 178903760, + "step": 147025 + }, + { + "epoch": 16.374874707651188, + "grad_norm": 1.6665737628936768, + "learning_rate": 4.83949551965687e-06, + "loss": 0.0468, + "num_input_tokens_seen": 178909936, + "step": 147030 + }, + { + "epoch": 16.375431562534803, + "grad_norm": 0.5670236945152283, + "learning_rate": 4.838058806539883e-06, + "loss": 0.0365, + "num_input_tokens_seen": 178916112, + "step": 147035 + }, + { + "epoch": 16.37598841741842, + "grad_norm": 0.3321693539619446, + "learning_rate": 4.836622283868589e-06, + "loss": 0.0493, + "num_input_tokens_seen": 178922256, + "step": 147040 + }, + { + "epoch": 16.37654527230204, + "grad_norm": 0.8101499080657959, + "learning_rate": 4.835185951656554e-06, + "loss": 0.0106, + "num_input_tokens_seen": 178927664, + "step": 147045 + }, + { + "epoch": 16.377102127185655, + "grad_norm": 0.048327066004276276, + "learning_rate": 4.833749809917343e-06, + "loss": 0.0681, + "num_input_tokens_seen": 178933616, + "step": 147050 + }, + { + "epoch": 16.377658982069274, + "grad_norm": 0.6768263578414917, + "learning_rate": 4.832313858664514e-06, + "loss": 0.1092, + "num_input_tokens_seen": 178939888, + "step": 147055 + }, + { + "epoch": 16.37821583695289, + "grad_norm": 1.239103078842163, + "learning_rate": 4.830878097911645e-06, + "loss": 0.0324, + "num_input_tokens_seen": 178945840, + "step": 147060 + }, + { + "epoch": 16.378772691836506, + "grad_norm": 2.2947981357574463, + "learning_rate": 4.829442527672287e-06, + "loss": 0.1397, + "num_input_tokens_seen": 178951728, + "step": 147065 + }, + { + "epoch": 16.379329546720125, + "grad_norm": 0.006584970746189356, + "learning_rate": 4.828007147960012e-06, + "loss": 0.0302, + "num_input_tokens_seen": 178957840, + "step": 147070 + }, + { + "epoch": 16.37988640160374, + "grad_norm": 0.10609707236289978, + "learning_rate": 4.826571958788367e-06, + "loss": 0.017, + "num_input_tokens_seen": 178964016, + "step": 147075 + }, + { + "epoch": 16.38044325648736, + "grad_norm": 0.0964762419462204, + "learning_rate": 4.825136960170918e-06, + "loss": 0.0068, + "num_input_tokens_seen": 178969936, + "step": 147080 + }, + { + "epoch": 16.381000111370977, + "grad_norm": 0.6849004626274109, + "learning_rate": 4.823702152121204e-06, + "loss": 0.113, + "num_input_tokens_seen": 178975856, + "step": 147085 + }, + { + "epoch": 16.381556966254593, + "grad_norm": 0.002563515678048134, + "learning_rate": 4.822267534652794e-06, + "loss": 0.0018, + "num_input_tokens_seen": 178982096, + "step": 147090 + }, + { + "epoch": 16.382113821138212, + "grad_norm": 0.568034827709198, + "learning_rate": 4.820833107779235e-06, + "loss": 0.0457, + "num_input_tokens_seen": 178987216, + "step": 147095 + }, + { + "epoch": 16.382670676021828, + "grad_norm": 0.9065548777580261, + "learning_rate": 4.819398871514075e-06, + "loss": 0.0168, + "num_input_tokens_seen": 178993424, + "step": 147100 + }, + { + "epoch": 16.383227530905447, + "grad_norm": 0.927598237991333, + "learning_rate": 4.817964825870855e-06, + "loss": 0.0367, + "num_input_tokens_seen": 178999088, + "step": 147105 + }, + { + "epoch": 16.383784385789063, + "grad_norm": 0.40409764647483826, + "learning_rate": 4.8165309708631315e-06, + "loss": 0.0481, + "num_input_tokens_seen": 179005520, + "step": 147110 + }, + { + "epoch": 16.38434124067268, + "grad_norm": 0.09097613394260406, + "learning_rate": 4.815097306504438e-06, + "loss": 0.0254, + "num_input_tokens_seen": 179011504, + "step": 147115 + }, + { + "epoch": 16.3848980955563, + "grad_norm": 0.037626948207616806, + "learning_rate": 4.813663832808335e-06, + "loss": 0.008, + "num_input_tokens_seen": 179017584, + "step": 147120 + }, + { + "epoch": 16.385454950439915, + "grad_norm": 0.00019690312910825014, + "learning_rate": 4.812230549788338e-06, + "loss": 0.0402, + "num_input_tokens_seen": 179023696, + "step": 147125 + }, + { + "epoch": 16.386011805323534, + "grad_norm": 1.8676952123641968, + "learning_rate": 4.810797457458002e-06, + "loss": 0.0507, + "num_input_tokens_seen": 179029712, + "step": 147130 + }, + { + "epoch": 16.38656866020715, + "grad_norm": 0.37319424748420715, + "learning_rate": 4.809364555830853e-06, + "loss": 0.0197, + "num_input_tokens_seen": 179035152, + "step": 147135 + }, + { + "epoch": 16.387125515090766, + "grad_norm": 0.6322132349014282, + "learning_rate": 4.807931844920441e-06, + "loss": 0.0302, + "num_input_tokens_seen": 179041200, + "step": 147140 + }, + { + "epoch": 16.387682369974385, + "grad_norm": 0.0022708484902977943, + "learning_rate": 4.806499324740291e-06, + "loss": 0.0907, + "num_input_tokens_seen": 179047152, + "step": 147145 + }, + { + "epoch": 16.388239224858, + "grad_norm": 0.004137354902923107, + "learning_rate": 4.8050669953039304e-06, + "loss": 0.0047, + "num_input_tokens_seen": 179053136, + "step": 147150 + }, + { + "epoch": 16.38879607974162, + "grad_norm": 0.0023421787191182375, + "learning_rate": 4.803634856624886e-06, + "loss": 0.0728, + "num_input_tokens_seen": 179059088, + "step": 147155 + }, + { + "epoch": 16.389352934625236, + "grad_norm": 0.0041858539916574955, + "learning_rate": 4.8022029087166995e-06, + "loss": 0.0459, + "num_input_tokens_seen": 179065296, + "step": 147160 + }, + { + "epoch": 16.389909789508852, + "grad_norm": 0.09357304126024246, + "learning_rate": 4.800771151592889e-06, + "loss": 0.0024, + "num_input_tokens_seen": 179071376, + "step": 147165 + }, + { + "epoch": 16.390466644392472, + "grad_norm": 0.0002587952185422182, + "learning_rate": 4.799339585266976e-06, + "loss": 0.0435, + "num_input_tokens_seen": 179077200, + "step": 147170 + }, + { + "epoch": 16.391023499276088, + "grad_norm": 0.0623449869453907, + "learning_rate": 4.79790820975248e-06, + "loss": 0.0173, + "num_input_tokens_seen": 179083344, + "step": 147175 + }, + { + "epoch": 16.391580354159707, + "grad_norm": 0.2733246088027954, + "learning_rate": 4.796477025062934e-06, + "loss": 0.013, + "num_input_tokens_seen": 179089776, + "step": 147180 + }, + { + "epoch": 16.392137209043323, + "grad_norm": 0.0003824357991106808, + "learning_rate": 4.795046031211842e-06, + "loss": 0.0265, + "num_input_tokens_seen": 179096208, + "step": 147185 + }, + { + "epoch": 16.39269406392694, + "grad_norm": 0.4666123688220978, + "learning_rate": 4.7936152282127415e-06, + "loss": 0.0119, + "num_input_tokens_seen": 179102160, + "step": 147190 + }, + { + "epoch": 16.39325091881056, + "grad_norm": 0.04548802599310875, + "learning_rate": 4.79218461607912e-06, + "loss": 0.0539, + "num_input_tokens_seen": 179108080, + "step": 147195 + }, + { + "epoch": 16.393807773694174, + "grad_norm": 0.07490735501050949, + "learning_rate": 4.790754194824515e-06, + "loss": 0.0224, + "num_input_tokens_seen": 179114224, + "step": 147200 + }, + { + "epoch": 16.394364628577794, + "grad_norm": 0.03647056967020035, + "learning_rate": 4.789323964462417e-06, + "loss": 0.0174, + "num_input_tokens_seen": 179120304, + "step": 147205 + }, + { + "epoch": 16.39492148346141, + "grad_norm": 0.001966463401913643, + "learning_rate": 4.787893925006356e-06, + "loss": 0.0089, + "num_input_tokens_seen": 179126000, + "step": 147210 + }, + { + "epoch": 16.395478338345026, + "grad_norm": 0.21927128732204437, + "learning_rate": 4.786464076469829e-06, + "loss": 0.0034, + "num_input_tokens_seen": 179132208, + "step": 147215 + }, + { + "epoch": 16.396035193228645, + "grad_norm": 0.005126396659761667, + "learning_rate": 4.785034418866346e-06, + "loss": 0.0004, + "num_input_tokens_seen": 179138448, + "step": 147220 + }, + { + "epoch": 16.39659204811226, + "grad_norm": 0.0008341953507624567, + "learning_rate": 4.7836049522094e-06, + "loss": 0.0065, + "num_input_tokens_seen": 179144560, + "step": 147225 + }, + { + "epoch": 16.39714890299588, + "grad_norm": 0.00041827658424153924, + "learning_rate": 4.78217567651251e-06, + "loss": 0.0364, + "num_input_tokens_seen": 179150416, + "step": 147230 + }, + { + "epoch": 16.397705757879496, + "grad_norm": 0.0018496069824323058, + "learning_rate": 4.780746591789168e-06, + "loss": 0.0766, + "num_input_tokens_seen": 179156528, + "step": 147235 + }, + { + "epoch": 16.398262612763112, + "grad_norm": 0.4724724590778351, + "learning_rate": 4.779317698052873e-06, + "loss": 0.0149, + "num_input_tokens_seen": 179162128, + "step": 147240 + }, + { + "epoch": 16.39881946764673, + "grad_norm": 0.0018511373782530427, + "learning_rate": 4.77788899531712e-06, + "loss": 0.0105, + "num_input_tokens_seen": 179168176, + "step": 147245 + }, + { + "epoch": 16.399376322530347, + "grad_norm": 0.22149989008903503, + "learning_rate": 4.776460483595411e-06, + "loss": 0.0169, + "num_input_tokens_seen": 179174000, + "step": 147250 + }, + { + "epoch": 16.399933177413967, + "grad_norm": 0.298179030418396, + "learning_rate": 4.77503216290123e-06, + "loss": 0.0865, + "num_input_tokens_seen": 179180048, + "step": 147255 + }, + { + "epoch": 16.400490032297583, + "grad_norm": 0.05305086076259613, + "learning_rate": 4.77360403324808e-06, + "loss": 0.0004, + "num_input_tokens_seen": 179186384, + "step": 147260 + }, + { + "epoch": 16.401046887181202, + "grad_norm": 0.12338309735059738, + "learning_rate": 4.7721760946494444e-06, + "loss": 0.0141, + "num_input_tokens_seen": 179192400, + "step": 147265 + }, + { + "epoch": 16.401603742064818, + "grad_norm": 0.005301797762513161, + "learning_rate": 4.770748347118812e-06, + "loss": 0.0051, + "num_input_tokens_seen": 179198544, + "step": 147270 + }, + { + "epoch": 16.402160596948434, + "grad_norm": 2.1749091148376465, + "learning_rate": 4.769320790669671e-06, + "loss": 0.1003, + "num_input_tokens_seen": 179204592, + "step": 147275 + }, + { + "epoch": 16.402717451832054, + "grad_norm": 0.031905386596918106, + "learning_rate": 4.767893425315495e-06, + "loss": 0.0141, + "num_input_tokens_seen": 179210576, + "step": 147280 + }, + { + "epoch": 16.40327430671567, + "grad_norm": 2.225114583969116, + "learning_rate": 4.766466251069782e-06, + "loss": 0.1073, + "num_input_tokens_seen": 179216528, + "step": 147285 + }, + { + "epoch": 16.40383116159929, + "grad_norm": 1.2255357503890991, + "learning_rate": 4.765039267946006e-06, + "loss": 0.0849, + "num_input_tokens_seen": 179222448, + "step": 147290 + }, + { + "epoch": 16.404388016482905, + "grad_norm": 0.012266218662261963, + "learning_rate": 4.763612475957646e-06, + "loss": 0.0215, + "num_input_tokens_seen": 179228592, + "step": 147295 + }, + { + "epoch": 16.40494487136652, + "grad_norm": 0.025549685582518578, + "learning_rate": 4.762185875118175e-06, + "loss": 0.1657, + "num_input_tokens_seen": 179234544, + "step": 147300 + }, + { + "epoch": 16.40550172625014, + "grad_norm": 4.113225936889648, + "learning_rate": 4.760759465441078e-06, + "loss": 0.0688, + "num_input_tokens_seen": 179240688, + "step": 147305 + }, + { + "epoch": 16.406058581133756, + "grad_norm": 0.39791715145111084, + "learning_rate": 4.759333246939823e-06, + "loss": 0.012, + "num_input_tokens_seen": 179246576, + "step": 147310 + }, + { + "epoch": 16.406615436017375, + "grad_norm": 0.004999995697289705, + "learning_rate": 4.75790721962788e-06, + "loss": 0.1386, + "num_input_tokens_seen": 179252880, + "step": 147315 + }, + { + "epoch": 16.40717229090099, + "grad_norm": 0.0023369123227894306, + "learning_rate": 4.756481383518718e-06, + "loss": 0.0741, + "num_input_tokens_seen": 179259408, + "step": 147320 + }, + { + "epoch": 16.407729145784607, + "grad_norm": 0.12032553553581238, + "learning_rate": 4.755055738625813e-06, + "loss": 0.0456, + "num_input_tokens_seen": 179265200, + "step": 147325 + }, + { + "epoch": 16.408286000668227, + "grad_norm": 0.9975178837776184, + "learning_rate": 4.753630284962621e-06, + "loss": 0.0945, + "num_input_tokens_seen": 179270256, + "step": 147330 + }, + { + "epoch": 16.408842855551843, + "grad_norm": 0.31449857354164124, + "learning_rate": 4.752205022542622e-06, + "loss": 0.0048, + "num_input_tokens_seen": 179276336, + "step": 147335 + }, + { + "epoch": 16.409399710435462, + "grad_norm": 1.4634195566177368, + "learning_rate": 4.750779951379258e-06, + "loss": 0.0295, + "num_input_tokens_seen": 179281808, + "step": 147340 + }, + { + "epoch": 16.409956565319078, + "grad_norm": 2.02384352684021, + "learning_rate": 4.749355071486009e-06, + "loss": 0.0448, + "num_input_tokens_seen": 179287664, + "step": 147345 + }, + { + "epoch": 16.410513420202694, + "grad_norm": 1.0226224660873413, + "learning_rate": 4.747930382876318e-06, + "loss": 0.0933, + "num_input_tokens_seen": 179293488, + "step": 147350 + }, + { + "epoch": 16.411070275086313, + "grad_norm": 0.5852212309837341, + "learning_rate": 4.746505885563654e-06, + "loss": 0.1341, + "num_input_tokens_seen": 179299216, + "step": 147355 + }, + { + "epoch": 16.41162712996993, + "grad_norm": 0.001968999160453677, + "learning_rate": 4.745081579561473e-06, + "loss": 0.0133, + "num_input_tokens_seen": 179304880, + "step": 147360 + }, + { + "epoch": 16.41218398485355, + "grad_norm": 0.07225101441144943, + "learning_rate": 4.743657464883222e-06, + "loss": 0.0409, + "num_input_tokens_seen": 179310832, + "step": 147365 + }, + { + "epoch": 16.412740839737165, + "grad_norm": 0.40023183822631836, + "learning_rate": 4.742233541542349e-06, + "loss": 0.0199, + "num_input_tokens_seen": 179316368, + "step": 147370 + }, + { + "epoch": 16.41329769462078, + "grad_norm": 0.00048660184256732464, + "learning_rate": 4.740809809552319e-06, + "loss": 0.0088, + "num_input_tokens_seen": 179322544, + "step": 147375 + }, + { + "epoch": 16.4138545495044, + "grad_norm": 0.05115550383925438, + "learning_rate": 4.739386268926568e-06, + "loss": 0.0018, + "num_input_tokens_seen": 179328720, + "step": 147380 + }, + { + "epoch": 16.414411404388016, + "grad_norm": 0.9955559968948364, + "learning_rate": 4.737962919678549e-06, + "loss": 0.0693, + "num_input_tokens_seen": 179334960, + "step": 147385 + }, + { + "epoch": 16.414968259271635, + "grad_norm": 0.033482905477285385, + "learning_rate": 4.736539761821696e-06, + "loss": 0.094, + "num_input_tokens_seen": 179341648, + "step": 147390 + }, + { + "epoch": 16.41552511415525, + "grad_norm": 3.1219327449798584, + "learning_rate": 4.735116795369468e-06, + "loss": 0.1344, + "num_input_tokens_seen": 179347792, + "step": 147395 + }, + { + "epoch": 16.416081969038867, + "grad_norm": 0.37967169284820557, + "learning_rate": 4.733694020335289e-06, + "loss": 0.0085, + "num_input_tokens_seen": 179353904, + "step": 147400 + }, + { + "epoch": 16.416638823922487, + "grad_norm": 0.00013757409760728478, + "learning_rate": 4.732271436732621e-06, + "loss": 0.0651, + "num_input_tokens_seen": 179360048, + "step": 147405 + }, + { + "epoch": 16.417195678806102, + "grad_norm": 0.0053700353018939495, + "learning_rate": 4.7308490445748755e-06, + "loss": 0.1685, + "num_input_tokens_seen": 179366320, + "step": 147410 + }, + { + "epoch": 16.417752533689722, + "grad_norm": 1.6522727012634277, + "learning_rate": 4.729426843875506e-06, + "loss": 0.0727, + "num_input_tokens_seen": 179372144, + "step": 147415 + }, + { + "epoch": 16.418309388573338, + "grad_norm": 0.5650029182434082, + "learning_rate": 4.7280048346479335e-06, + "loss": 0.0092, + "num_input_tokens_seen": 179378352, + "step": 147420 + }, + { + "epoch": 16.418866243456954, + "grad_norm": 0.20616444945335388, + "learning_rate": 4.726583016905606e-06, + "loss": 0.016, + "num_input_tokens_seen": 179384368, + "step": 147425 + }, + { + "epoch": 16.419423098340573, + "grad_norm": 0.011579194106161594, + "learning_rate": 4.725161390661942e-06, + "loss": 0.0013, + "num_input_tokens_seen": 179390800, + "step": 147430 + }, + { + "epoch": 16.41997995322419, + "grad_norm": 1.1798118352890015, + "learning_rate": 4.723739955930373e-06, + "loss": 0.1104, + "num_input_tokens_seen": 179396688, + "step": 147435 + }, + { + "epoch": 16.42053680810781, + "grad_norm": 0.0005839732475578785, + "learning_rate": 4.722318712724319e-06, + "loss": 0.0278, + "num_input_tokens_seen": 179402352, + "step": 147440 + }, + { + "epoch": 16.421093662991424, + "grad_norm": 0.5969187617301941, + "learning_rate": 4.720897661057216e-06, + "loss": 0.0514, + "num_input_tokens_seen": 179408464, + "step": 147445 + }, + { + "epoch": 16.42165051787504, + "grad_norm": 1.0851867198944092, + "learning_rate": 4.719476800942485e-06, + "loss": 0.0622, + "num_input_tokens_seen": 179414640, + "step": 147450 + }, + { + "epoch": 16.42220737275866, + "grad_norm": 0.05331894010305405, + "learning_rate": 4.718056132393542e-06, + "loss": 0.003, + "num_input_tokens_seen": 179420624, + "step": 147455 + }, + { + "epoch": 16.422764227642276, + "grad_norm": 0.4570174515247345, + "learning_rate": 4.716635655423804e-06, + "loss": 0.0108, + "num_input_tokens_seen": 179426608, + "step": 147460 + }, + { + "epoch": 16.423321082525895, + "grad_norm": 0.07957490533590317, + "learning_rate": 4.715215370046697e-06, + "loss": 0.0114, + "num_input_tokens_seen": 179432496, + "step": 147465 + }, + { + "epoch": 16.42387793740951, + "grad_norm": 1.9387366771697998, + "learning_rate": 4.713795276275626e-06, + "loss": 0.0455, + "num_input_tokens_seen": 179437680, + "step": 147470 + }, + { + "epoch": 16.424434792293127, + "grad_norm": 0.6066524386405945, + "learning_rate": 4.71237537412402e-06, + "loss": 0.022, + "num_input_tokens_seen": 179443888, + "step": 147475 + }, + { + "epoch": 16.424991647176746, + "grad_norm": 0.13006797432899475, + "learning_rate": 4.710955663605282e-06, + "loss": 0.0027, + "num_input_tokens_seen": 179450256, + "step": 147480 + }, + { + "epoch": 16.425548502060362, + "grad_norm": 0.0033373222686350346, + "learning_rate": 4.7095361447328235e-06, + "loss": 0.0172, + "num_input_tokens_seen": 179456592, + "step": 147485 + }, + { + "epoch": 16.42610535694398, + "grad_norm": 0.06443873792886734, + "learning_rate": 4.7081168175200445e-06, + "loss": 0.0954, + "num_input_tokens_seen": 179462128, + "step": 147490 + }, + { + "epoch": 16.426662211827598, + "grad_norm": 0.008395003154873848, + "learning_rate": 4.706697681980368e-06, + "loss": 0.0318, + "num_input_tokens_seen": 179468336, + "step": 147495 + }, + { + "epoch": 16.427219066711213, + "grad_norm": 0.006667459383606911, + "learning_rate": 4.7052787381271916e-06, + "loss": 0.0206, + "num_input_tokens_seen": 179474768, + "step": 147500 + }, + { + "epoch": 16.427775921594833, + "grad_norm": 0.028522882610559464, + "learning_rate": 4.703859985973916e-06, + "loss": 0.002, + "num_input_tokens_seen": 179481040, + "step": 147505 + }, + { + "epoch": 16.42833277647845, + "grad_norm": 0.6034667491912842, + "learning_rate": 4.702441425533938e-06, + "loss": 0.0114, + "num_input_tokens_seen": 179487504, + "step": 147510 + }, + { + "epoch": 16.42888963136207, + "grad_norm": 0.42569032311439514, + "learning_rate": 4.701023056820667e-06, + "loss": 0.0961, + "num_input_tokens_seen": 179492752, + "step": 147515 + }, + { + "epoch": 16.429446486245684, + "grad_norm": 0.0853554904460907, + "learning_rate": 4.6996048798474915e-06, + "loss": 0.0278, + "num_input_tokens_seen": 179498992, + "step": 147520 + }, + { + "epoch": 16.4300033411293, + "grad_norm": 0.0411258190870285, + "learning_rate": 4.698186894627826e-06, + "loss": 0.0029, + "num_input_tokens_seen": 179505392, + "step": 147525 + }, + { + "epoch": 16.43056019601292, + "grad_norm": 0.09516841918230057, + "learning_rate": 4.696769101175036e-06, + "loss": 0.0302, + "num_input_tokens_seen": 179511280, + "step": 147530 + }, + { + "epoch": 16.431117050896535, + "grad_norm": 0.010345936752855778, + "learning_rate": 4.695351499502537e-06, + "loss": 0.0074, + "num_input_tokens_seen": 179517232, + "step": 147535 + }, + { + "epoch": 16.431673905780155, + "grad_norm": 0.5722445249557495, + "learning_rate": 4.693934089623703e-06, + "loss": 0.009, + "num_input_tokens_seen": 179523184, + "step": 147540 + }, + { + "epoch": 16.43223076066377, + "grad_norm": 0.0002404821279924363, + "learning_rate": 4.692516871551939e-06, + "loss": 0.0251, + "num_input_tokens_seen": 179529552, + "step": 147545 + }, + { + "epoch": 16.432787615547387, + "grad_norm": 0.0014033311745151877, + "learning_rate": 4.69109984530062e-06, + "loss": 0.0209, + "num_input_tokens_seen": 179535824, + "step": 147550 + }, + { + "epoch": 16.433344470431006, + "grad_norm": 0.0007038066978566349, + "learning_rate": 4.689683010883136e-06, + "loss": 0.026, + "num_input_tokens_seen": 179541968, + "step": 147555 + }, + { + "epoch": 16.433901325314622, + "grad_norm": 0.015534764155745506, + "learning_rate": 4.688266368312863e-06, + "loss": 0.0038, + "num_input_tokens_seen": 179548240, + "step": 147560 + }, + { + "epoch": 16.43445818019824, + "grad_norm": 0.8230705857276917, + "learning_rate": 4.686849917603192e-06, + "loss": 0.1284, + "num_input_tokens_seen": 179553808, + "step": 147565 + }, + { + "epoch": 16.435015035081857, + "grad_norm": 1.5849817991256714, + "learning_rate": 4.685433658767499e-06, + "loss": 0.0512, + "num_input_tokens_seen": 179559984, + "step": 147570 + }, + { + "epoch": 16.435571889965473, + "grad_norm": 3.3280088901519775, + "learning_rate": 4.684017591819162e-06, + "loss": 0.1867, + "num_input_tokens_seen": 179566192, + "step": 147575 + }, + { + "epoch": 16.436128744849093, + "grad_norm": 1.5589100122451782, + "learning_rate": 4.682601716771548e-06, + "loss": 0.0332, + "num_input_tokens_seen": 179572144, + "step": 147580 + }, + { + "epoch": 16.43668559973271, + "grad_norm": 2.8456521034240723, + "learning_rate": 4.681186033638046e-06, + "loss": 0.037, + "num_input_tokens_seen": 179578224, + "step": 147585 + }, + { + "epoch": 16.437242454616328, + "grad_norm": 0.0014561201678588986, + "learning_rate": 4.679770542432013e-06, + "loss": 0.0021, + "num_input_tokens_seen": 179584272, + "step": 147590 + }, + { + "epoch": 16.437799309499944, + "grad_norm": 0.0072242338210344315, + "learning_rate": 4.678355243166843e-06, + "loss": 0.0033, + "num_input_tokens_seen": 179590224, + "step": 147595 + }, + { + "epoch": 16.438356164383563, + "grad_norm": 0.00975334458053112, + "learning_rate": 4.676940135855873e-06, + "loss": 0.0086, + "num_input_tokens_seen": 179596464, + "step": 147600 + }, + { + "epoch": 16.43891301926718, + "grad_norm": 0.009638410061597824, + "learning_rate": 4.675525220512495e-06, + "loss": 0.0802, + "num_input_tokens_seen": 179602672, + "step": 147605 + }, + { + "epoch": 16.439469874150795, + "grad_norm": 1.5494794845581055, + "learning_rate": 4.674110497150058e-06, + "loss": 0.1025, + "num_input_tokens_seen": 179608752, + "step": 147610 + }, + { + "epoch": 16.440026729034415, + "grad_norm": 0.021747983992099762, + "learning_rate": 4.672695965781937e-06, + "loss": 0.1509, + "num_input_tokens_seen": 179614704, + "step": 147615 + }, + { + "epoch": 16.44058358391803, + "grad_norm": 0.5156254172325134, + "learning_rate": 4.671281626421492e-06, + "loss": 0.0392, + "num_input_tokens_seen": 179620560, + "step": 147620 + }, + { + "epoch": 16.44114043880165, + "grad_norm": 0.1117154061794281, + "learning_rate": 4.669867479082077e-06, + "loss": 0.01, + "num_input_tokens_seen": 179626864, + "step": 147625 + }, + { + "epoch": 16.441697293685266, + "grad_norm": 0.008128893561661243, + "learning_rate": 4.668453523777045e-06, + "loss": 0.1298, + "num_input_tokens_seen": 179632944, + "step": 147630 + }, + { + "epoch": 16.442254148568882, + "grad_norm": 0.2494635283946991, + "learning_rate": 4.667039760519765e-06, + "loss": 0.0141, + "num_input_tokens_seen": 179639056, + "step": 147635 + }, + { + "epoch": 16.4428110034525, + "grad_norm": 0.00011988820187980309, + "learning_rate": 4.665626189323585e-06, + "loss": 0.0036, + "num_input_tokens_seen": 179644784, + "step": 147640 + }, + { + "epoch": 16.443367858336117, + "grad_norm": 0.04182182624936104, + "learning_rate": 4.66421281020186e-06, + "loss": 0.0045, + "num_input_tokens_seen": 179650416, + "step": 147645 + }, + { + "epoch": 16.443924713219737, + "grad_norm": 0.027373865246772766, + "learning_rate": 4.662799623167929e-06, + "loss": 0.0228, + "num_input_tokens_seen": 179656464, + "step": 147650 + }, + { + "epoch": 16.444481568103352, + "grad_norm": 0.006437855772674084, + "learning_rate": 4.661386628235157e-06, + "loss": 0.0364, + "num_input_tokens_seen": 179662576, + "step": 147655 + }, + { + "epoch": 16.44503842298697, + "grad_norm": 0.004207337740808725, + "learning_rate": 4.6599738254168746e-06, + "loss": 0.0015, + "num_input_tokens_seen": 179668816, + "step": 147660 + }, + { + "epoch": 16.445595277870588, + "grad_norm": 0.0874936431646347, + "learning_rate": 4.658561214726445e-06, + "loss": 0.0026, + "num_input_tokens_seen": 179674672, + "step": 147665 + }, + { + "epoch": 16.446152132754204, + "grad_norm": 0.13145512342453003, + "learning_rate": 4.6571487961771995e-06, + "loss": 0.0163, + "num_input_tokens_seen": 179680976, + "step": 147670 + }, + { + "epoch": 16.446708987637823, + "grad_norm": 0.38276779651641846, + "learning_rate": 4.655736569782484e-06, + "loss": 0.0067, + "num_input_tokens_seen": 179687088, + "step": 147675 + }, + { + "epoch": 16.44726584252144, + "grad_norm": 0.04847068339586258, + "learning_rate": 4.654324535555629e-06, + "loss": 0.0084, + "num_input_tokens_seen": 179693040, + "step": 147680 + }, + { + "epoch": 16.447822697405055, + "grad_norm": 0.006647735368460417, + "learning_rate": 4.6529126935099834e-06, + "loss": 0.0727, + "num_input_tokens_seen": 179698992, + "step": 147685 + }, + { + "epoch": 16.448379552288674, + "grad_norm": 0.00010665472655091435, + "learning_rate": 4.6515010436588814e-06, + "loss": 0.0007, + "num_input_tokens_seen": 179705136, + "step": 147690 + }, + { + "epoch": 16.44893640717229, + "grad_norm": 0.9222396612167358, + "learning_rate": 4.650089586015657e-06, + "loss": 0.0333, + "num_input_tokens_seen": 179711280, + "step": 147695 + }, + { + "epoch": 16.44949326205591, + "grad_norm": 0.4132348299026489, + "learning_rate": 4.648678320593638e-06, + "loss": 0.0458, + "num_input_tokens_seen": 179717328, + "step": 147700 + }, + { + "epoch": 16.450050116939526, + "grad_norm": 0.24815183877944946, + "learning_rate": 4.6472672474061504e-06, + "loss": 0.0046, + "num_input_tokens_seen": 179723344, + "step": 147705 + }, + { + "epoch": 16.45060697182314, + "grad_norm": 0.000736747111659497, + "learning_rate": 4.645856366466539e-06, + "loss": 0.0282, + "num_input_tokens_seen": 179729648, + "step": 147710 + }, + { + "epoch": 16.45116382670676, + "grad_norm": 0.7521888017654419, + "learning_rate": 4.6444456777881205e-06, + "loss": 0.127, + "num_input_tokens_seen": 179735600, + "step": 147715 + }, + { + "epoch": 16.451720681590377, + "grad_norm": 0.02936057560145855, + "learning_rate": 4.6430351813842225e-06, + "loss": 0.0148, + "num_input_tokens_seen": 179740912, + "step": 147720 + }, + { + "epoch": 16.452277536473996, + "grad_norm": 1.8668794631958008, + "learning_rate": 4.641624877268158e-06, + "loss": 0.0669, + "num_input_tokens_seen": 179746896, + "step": 147725 + }, + { + "epoch": 16.452834391357612, + "grad_norm": 1.2531218528747559, + "learning_rate": 4.640214765453266e-06, + "loss": 0.1045, + "num_input_tokens_seen": 179753008, + "step": 147730 + }, + { + "epoch": 16.453391246241228, + "grad_norm": 0.008507939986884594, + "learning_rate": 4.638804845952849e-06, + "loss": 0.1018, + "num_input_tokens_seen": 179758800, + "step": 147735 + }, + { + "epoch": 16.453948101124848, + "grad_norm": 0.04206683859229088, + "learning_rate": 4.637395118780247e-06, + "loss": 0.0019, + "num_input_tokens_seen": 179765104, + "step": 147740 + }, + { + "epoch": 16.454504956008464, + "grad_norm": 0.0022444294299930334, + "learning_rate": 4.635985583948749e-06, + "loss": 0.0128, + "num_input_tokens_seen": 179771568, + "step": 147745 + }, + { + "epoch": 16.455061810892083, + "grad_norm": 0.00013831810792908072, + "learning_rate": 4.634576241471692e-06, + "loss": 0.0403, + "num_input_tokens_seen": 179777680, + "step": 147750 + }, + { + "epoch": 16.4556186657757, + "grad_norm": 0.0827820897102356, + "learning_rate": 4.63316709136237e-06, + "loss": 0.0266, + "num_input_tokens_seen": 179783664, + "step": 147755 + }, + { + "epoch": 16.456175520659315, + "grad_norm": 0.05137863755226135, + "learning_rate": 4.6317581336341066e-06, + "loss": 0.0118, + "num_input_tokens_seen": 179789808, + "step": 147760 + }, + { + "epoch": 16.456732375542934, + "grad_norm": 1.6795156002044678, + "learning_rate": 4.63034936830021e-06, + "loss": 0.0398, + "num_input_tokens_seen": 179795792, + "step": 147765 + }, + { + "epoch": 16.45728923042655, + "grad_norm": 0.007795895915478468, + "learning_rate": 4.628940795373982e-06, + "loss": 0.1424, + "num_input_tokens_seen": 179802128, + "step": 147770 + }, + { + "epoch": 16.45784608531017, + "grad_norm": 0.04893215373158455, + "learning_rate": 4.627532414868724e-06, + "loss": 0.0175, + "num_input_tokens_seen": 179808432, + "step": 147775 + }, + { + "epoch": 16.458402940193785, + "grad_norm": 0.007754769641906023, + "learning_rate": 4.626124226797748e-06, + "loss": 0.0136, + "num_input_tokens_seen": 179814672, + "step": 147780 + }, + { + "epoch": 16.4589597950774, + "grad_norm": 0.11801730841398239, + "learning_rate": 4.624716231174356e-06, + "loss": 0.0388, + "num_input_tokens_seen": 179820816, + "step": 147785 + }, + { + "epoch": 16.45951664996102, + "grad_norm": 0.22410033643245697, + "learning_rate": 4.6233084280118414e-06, + "loss": 0.0868, + "num_input_tokens_seen": 179827056, + "step": 147790 + }, + { + "epoch": 16.460073504844637, + "grad_norm": 0.006952435243874788, + "learning_rate": 4.621900817323496e-06, + "loss": 0.0452, + "num_input_tokens_seen": 179833360, + "step": 147795 + }, + { + "epoch": 16.460630359728256, + "grad_norm": 0.011191168799996376, + "learning_rate": 4.620493399122633e-06, + "loss": 0.0054, + "num_input_tokens_seen": 179839600, + "step": 147800 + }, + { + "epoch": 16.461187214611872, + "grad_norm": 0.2442750632762909, + "learning_rate": 4.619086173422532e-06, + "loss": 0.0062, + "num_input_tokens_seen": 179845520, + "step": 147805 + }, + { + "epoch": 16.461744069495488, + "grad_norm": 0.0001079066059901379, + "learning_rate": 4.617679140236503e-06, + "loss": 0.0174, + "num_input_tokens_seen": 179851376, + "step": 147810 + }, + { + "epoch": 16.462300924379107, + "grad_norm": 0.0072451625019311905, + "learning_rate": 4.616272299577809e-06, + "loss": 0.0597, + "num_input_tokens_seen": 179857840, + "step": 147815 + }, + { + "epoch": 16.462857779262723, + "grad_norm": 0.010315886698663235, + "learning_rate": 4.614865651459766e-06, + "loss": 0.0014, + "num_input_tokens_seen": 179863952, + "step": 147820 + }, + { + "epoch": 16.463414634146343, + "grad_norm": 0.01160977128893137, + "learning_rate": 4.613459195895639e-06, + "loss": 0.014, + "num_input_tokens_seen": 179870096, + "step": 147825 + }, + { + "epoch": 16.46397148902996, + "grad_norm": 0.31485220789909363, + "learning_rate": 4.61205293289873e-06, + "loss": 0.0391, + "num_input_tokens_seen": 179876176, + "step": 147830 + }, + { + "epoch": 16.464528343913575, + "grad_norm": 0.2862667143344879, + "learning_rate": 4.610646862482315e-06, + "loss": 0.0051, + "num_input_tokens_seen": 179882544, + "step": 147835 + }, + { + "epoch": 16.465085198797194, + "grad_norm": 2.4000062942504883, + "learning_rate": 4.609240984659677e-06, + "loss": 0.1106, + "num_input_tokens_seen": 179887888, + "step": 147840 + }, + { + "epoch": 16.46564205368081, + "grad_norm": 0.000291516276774928, + "learning_rate": 4.607835299444088e-06, + "loss": 0.024, + "num_input_tokens_seen": 179893872, + "step": 147845 + }, + { + "epoch": 16.46619890856443, + "grad_norm": 0.2665040194988251, + "learning_rate": 4.606429806848842e-06, + "loss": 0.0031, + "num_input_tokens_seen": 179899696, + "step": 147850 + }, + { + "epoch": 16.466755763448045, + "grad_norm": 2.2191853523254395, + "learning_rate": 4.6050245068872015e-06, + "loss": 0.0708, + "num_input_tokens_seen": 179905680, + "step": 147855 + }, + { + "epoch": 16.46731261833166, + "grad_norm": 0.10846513509750366, + "learning_rate": 4.603619399572445e-06, + "loss": 0.138, + "num_input_tokens_seen": 179911408, + "step": 147860 + }, + { + "epoch": 16.46786947321528, + "grad_norm": 0.0077006202191114426, + "learning_rate": 4.602214484917841e-06, + "loss": 0.0089, + "num_input_tokens_seen": 179917776, + "step": 147865 + }, + { + "epoch": 16.468426328098897, + "grad_norm": 0.000632154697086662, + "learning_rate": 4.600809762936667e-06, + "loss": 0.0213, + "num_input_tokens_seen": 179924080, + "step": 147870 + }, + { + "epoch": 16.468983182982516, + "grad_norm": 0.398444265127182, + "learning_rate": 4.599405233642184e-06, + "loss": 0.0289, + "num_input_tokens_seen": 179930000, + "step": 147875 + }, + { + "epoch": 16.469540037866132, + "grad_norm": 0.19117900729179382, + "learning_rate": 4.598000897047669e-06, + "loss": 0.0041, + "num_input_tokens_seen": 179935920, + "step": 147880 + }, + { + "epoch": 16.470096892749748, + "grad_norm": 0.0966157466173172, + "learning_rate": 4.596596753166382e-06, + "loss": 0.1322, + "num_input_tokens_seen": 179942480, + "step": 147885 + }, + { + "epoch": 16.470653747633367, + "grad_norm": 0.008675385266542435, + "learning_rate": 4.595192802011583e-06, + "loss": 0.1079, + "num_input_tokens_seen": 179948624, + "step": 147890 + }, + { + "epoch": 16.471210602516983, + "grad_norm": 0.3304287791252136, + "learning_rate": 4.593789043596533e-06, + "loss": 0.0302, + "num_input_tokens_seen": 179954704, + "step": 147895 + }, + { + "epoch": 16.471767457400603, + "grad_norm": 0.020596981048583984, + "learning_rate": 4.592385477934499e-06, + "loss": 0.0044, + "num_input_tokens_seen": 179960880, + "step": 147900 + }, + { + "epoch": 16.47232431228422, + "grad_norm": 1.752038598060608, + "learning_rate": 4.590982105038735e-06, + "loss": 0.0785, + "num_input_tokens_seen": 179967088, + "step": 147905 + }, + { + "epoch": 16.472881167167834, + "grad_norm": 0.02777598425745964, + "learning_rate": 4.589578924922497e-06, + "loss": 0.0474, + "num_input_tokens_seen": 179972912, + "step": 147910 + }, + { + "epoch": 16.473438022051454, + "grad_norm": 0.1248648464679718, + "learning_rate": 4.588175937599032e-06, + "loss": 0.0199, + "num_input_tokens_seen": 179978928, + "step": 147915 + }, + { + "epoch": 16.47399487693507, + "grad_norm": 0.2094091922044754, + "learning_rate": 4.586773143081604e-06, + "loss": 0.0889, + "num_input_tokens_seen": 179985328, + "step": 147920 + }, + { + "epoch": 16.47455173181869, + "grad_norm": 0.012935444712638855, + "learning_rate": 4.585370541383454e-06, + "loss": 0.0082, + "num_input_tokens_seen": 179991440, + "step": 147925 + }, + { + "epoch": 16.475108586702305, + "grad_norm": 0.007734011393040419, + "learning_rate": 4.583968132517846e-06, + "loss": 0.1154, + "num_input_tokens_seen": 179997488, + "step": 147930 + }, + { + "epoch": 16.475665441585924, + "grad_norm": 2.018690347671509, + "learning_rate": 4.582565916498005e-06, + "loss": 0.0631, + "num_input_tokens_seen": 180003600, + "step": 147935 + }, + { + "epoch": 16.47622229646954, + "grad_norm": 0.005598028190433979, + "learning_rate": 4.5811638933371924e-06, + "loss": 0.0377, + "num_input_tokens_seen": 180009424, + "step": 147940 + }, + { + "epoch": 16.476779151353156, + "grad_norm": 0.1790800541639328, + "learning_rate": 4.57976206304864e-06, + "loss": 0.0876, + "num_input_tokens_seen": 180015088, + "step": 147945 + }, + { + "epoch": 16.477336006236776, + "grad_norm": 0.048988763242959976, + "learning_rate": 4.578360425645603e-06, + "loss": 0.0072, + "num_input_tokens_seen": 180021136, + "step": 147950 + }, + { + "epoch": 16.47789286112039, + "grad_norm": 0.16306085884571075, + "learning_rate": 4.57695898114131e-06, + "loss": 0.082, + "num_input_tokens_seen": 180027344, + "step": 147955 + }, + { + "epoch": 16.47844971600401, + "grad_norm": 0.12550212442874908, + "learning_rate": 4.575557729549007e-06, + "loss": 0.0142, + "num_input_tokens_seen": 180033008, + "step": 147960 + }, + { + "epoch": 16.479006570887627, + "grad_norm": 2.2361879348754883, + "learning_rate": 4.574156670881915e-06, + "loss": 0.035, + "num_input_tokens_seen": 180039312, + "step": 147965 + }, + { + "epoch": 16.479563425771243, + "grad_norm": 0.00019334055832587183, + "learning_rate": 4.572755805153287e-06, + "loss": 0.007, + "num_input_tokens_seen": 180045584, + "step": 147970 + }, + { + "epoch": 16.480120280654862, + "grad_norm": 0.19061513245105743, + "learning_rate": 4.571355132376343e-06, + "loss": 0.0498, + "num_input_tokens_seen": 180052016, + "step": 147975 + }, + { + "epoch": 16.48067713553848, + "grad_norm": 0.030840855091810226, + "learning_rate": 4.56995465256432e-06, + "loss": 0.0555, + "num_input_tokens_seen": 180057616, + "step": 147980 + }, + { + "epoch": 16.481233990422098, + "grad_norm": 1.0417847633361816, + "learning_rate": 4.568554365730435e-06, + "loss": 0.032, + "num_input_tokens_seen": 180063152, + "step": 147985 + }, + { + "epoch": 16.481790845305714, + "grad_norm": 1.0548033714294434, + "learning_rate": 4.56715427188793e-06, + "loss": 0.0691, + "num_input_tokens_seen": 180069136, + "step": 147990 + }, + { + "epoch": 16.48234770018933, + "grad_norm": 0.016807738691568375, + "learning_rate": 4.565754371050018e-06, + "loss": 0.0839, + "num_input_tokens_seen": 180075312, + "step": 147995 + }, + { + "epoch": 16.48290455507295, + "grad_norm": 0.44172152876853943, + "learning_rate": 4.564354663229942e-06, + "loss": 0.1082, + "num_input_tokens_seen": 180081040, + "step": 148000 + }, + { + "epoch": 16.483461409956565, + "grad_norm": 0.00013331575610209256, + "learning_rate": 4.562955148440895e-06, + "loss": 0.0303, + "num_input_tokens_seen": 180087024, + "step": 148005 + }, + { + "epoch": 16.484018264840184, + "grad_norm": 0.07198949158191681, + "learning_rate": 4.561555826696115e-06, + "loss": 0.0014, + "num_input_tokens_seen": 180093136, + "step": 148010 + }, + { + "epoch": 16.4845751197238, + "grad_norm": 0.04005012288689613, + "learning_rate": 4.56015669800881e-06, + "loss": 0.0384, + "num_input_tokens_seen": 180099152, + "step": 148015 + }, + { + "epoch": 16.485131974607416, + "grad_norm": 0.17585398256778717, + "learning_rate": 4.558757762392207e-06, + "loss": 0.0447, + "num_input_tokens_seen": 180104432, + "step": 148020 + }, + { + "epoch": 16.485688829491036, + "grad_norm": 1.0540220737457275, + "learning_rate": 4.557359019859517e-06, + "loss": 0.074, + "num_input_tokens_seen": 180110192, + "step": 148025 + }, + { + "epoch": 16.48624568437465, + "grad_norm": 0.1382216215133667, + "learning_rate": 4.555960470423948e-06, + "loss": 0.0178, + "num_input_tokens_seen": 180116080, + "step": 148030 + }, + { + "epoch": 16.48680253925827, + "grad_norm": 0.13204766809940338, + "learning_rate": 4.554562114098704e-06, + "loss": 0.1106, + "num_input_tokens_seen": 180122096, + "step": 148035 + }, + { + "epoch": 16.487359394141887, + "grad_norm": 2.5837323665618896, + "learning_rate": 4.553163950897008e-06, + "loss": 0.1527, + "num_input_tokens_seen": 180128112, + "step": 148040 + }, + { + "epoch": 16.487916249025503, + "grad_norm": 2.257716417312622, + "learning_rate": 4.551765980832059e-06, + "loss": 0.0451, + "num_input_tokens_seen": 180134320, + "step": 148045 + }, + { + "epoch": 16.488473103909122, + "grad_norm": 0.01305604912340641, + "learning_rate": 4.550368203917066e-06, + "loss": 0.0706, + "num_input_tokens_seen": 180140432, + "step": 148050 + }, + { + "epoch": 16.489029958792738, + "grad_norm": 0.23916766047477722, + "learning_rate": 4.548970620165222e-06, + "loss": 0.0379, + "num_input_tokens_seen": 180146224, + "step": 148055 + }, + { + "epoch": 16.489586813676357, + "grad_norm": 0.17769379913806915, + "learning_rate": 4.547573229589744e-06, + "loss": 0.0045, + "num_input_tokens_seen": 180152272, + "step": 148060 + }, + { + "epoch": 16.490143668559973, + "grad_norm": 1.4181208610534668, + "learning_rate": 4.546176032203814e-06, + "loss": 0.0135, + "num_input_tokens_seen": 180158448, + "step": 148065 + }, + { + "epoch": 16.49070052344359, + "grad_norm": 1.234069585800171, + "learning_rate": 4.544779028020646e-06, + "loss": 0.0382, + "num_input_tokens_seen": 180164240, + "step": 148070 + }, + { + "epoch": 16.49125737832721, + "grad_norm": 2.2402684688568115, + "learning_rate": 4.543382217053427e-06, + "loss": 0.1095, + "num_input_tokens_seen": 180170064, + "step": 148075 + }, + { + "epoch": 16.491814233210825, + "grad_norm": 0.6813637614250183, + "learning_rate": 4.5419855993153544e-06, + "loss": 0.06, + "num_input_tokens_seen": 180175952, + "step": 148080 + }, + { + "epoch": 16.492371088094444, + "grad_norm": 0.06880473345518112, + "learning_rate": 4.5405891748196095e-06, + "loss": 0.0047, + "num_input_tokens_seen": 180181968, + "step": 148085 + }, + { + "epoch": 16.49292794297806, + "grad_norm": 1.2007310390472412, + "learning_rate": 4.539192943579401e-06, + "loss": 0.0295, + "num_input_tokens_seen": 180187760, + "step": 148090 + }, + { + "epoch": 16.493484797861676, + "grad_norm": 2.3254916667938232, + "learning_rate": 4.537796905607908e-06, + "loss": 0.0386, + "num_input_tokens_seen": 180193872, + "step": 148095 + }, + { + "epoch": 16.494041652745295, + "grad_norm": 0.03551720455288887, + "learning_rate": 4.536401060918316e-06, + "loss": 0.0094, + "num_input_tokens_seen": 180200048, + "step": 148100 + }, + { + "epoch": 16.49459850762891, + "grad_norm": 0.15491439402103424, + "learning_rate": 4.53500540952381e-06, + "loss": 0.1134, + "num_input_tokens_seen": 180206256, + "step": 148105 + }, + { + "epoch": 16.49515536251253, + "grad_norm": 0.8482948541641235, + "learning_rate": 4.53360995143757e-06, + "loss": 0.0803, + "num_input_tokens_seen": 180212304, + "step": 148110 + }, + { + "epoch": 16.495712217396147, + "grad_norm": 0.09341666102409363, + "learning_rate": 4.532214686672787e-06, + "loss": 0.0233, + "num_input_tokens_seen": 180218160, + "step": 148115 + }, + { + "epoch": 16.496269072279762, + "grad_norm": 0.4336663484573364, + "learning_rate": 4.530819615242635e-06, + "loss": 0.0355, + "num_input_tokens_seen": 180224176, + "step": 148120 + }, + { + "epoch": 16.496825927163382, + "grad_norm": 0.9354171752929688, + "learning_rate": 4.529424737160293e-06, + "loss": 0.0514, + "num_input_tokens_seen": 180230576, + "step": 148125 + }, + { + "epoch": 16.497382782046998, + "grad_norm": 0.022747892886400223, + "learning_rate": 4.5280300524389255e-06, + "loss": 0.0176, + "num_input_tokens_seen": 180236688, + "step": 148130 + }, + { + "epoch": 16.497939636930617, + "grad_norm": 0.017926745116710663, + "learning_rate": 4.526635561091724e-06, + "loss": 0.1076, + "num_input_tokens_seen": 180242864, + "step": 148135 + }, + { + "epoch": 16.498496491814233, + "grad_norm": 3.0206282138824463, + "learning_rate": 4.525241263131846e-06, + "loss": 0.0802, + "num_input_tokens_seen": 180248816, + "step": 148140 + }, + { + "epoch": 16.49905334669785, + "grad_norm": 0.1315966099500656, + "learning_rate": 4.523847158572481e-06, + "loss": 0.0028, + "num_input_tokens_seen": 180254992, + "step": 148145 + }, + { + "epoch": 16.49961020158147, + "grad_norm": 0.052061792463064194, + "learning_rate": 4.522453247426772e-06, + "loss": 0.0555, + "num_input_tokens_seen": 180261392, + "step": 148150 + }, + { + "epoch": 16.500167056465084, + "grad_norm": 0.720606803894043, + "learning_rate": 4.521059529707905e-06, + "loss": 0.0096, + "num_input_tokens_seen": 180267792, + "step": 148155 + }, + { + "epoch": 16.500723911348704, + "grad_norm": 0.004123342223465443, + "learning_rate": 4.519666005429033e-06, + "loss": 0.0012, + "num_input_tokens_seen": 180273936, + "step": 148160 + }, + { + "epoch": 16.50128076623232, + "grad_norm": 0.00021481855947058648, + "learning_rate": 4.518272674603327e-06, + "loss": 0.01, + "num_input_tokens_seen": 180280112, + "step": 148165 + }, + { + "epoch": 16.501837621115936, + "grad_norm": 0.05169902369379997, + "learning_rate": 4.5168795372439486e-06, + "loss": 0.0067, + "num_input_tokens_seen": 180286320, + "step": 148170 + }, + { + "epoch": 16.502394475999555, + "grad_norm": 0.1554776281118393, + "learning_rate": 4.515486593364054e-06, + "loss": 0.0244, + "num_input_tokens_seen": 180292368, + "step": 148175 + }, + { + "epoch": 16.50295133088317, + "grad_norm": 0.526005208492279, + "learning_rate": 4.514093842976791e-06, + "loss": 0.0066, + "num_input_tokens_seen": 180298448, + "step": 148180 + }, + { + "epoch": 16.50350818576679, + "grad_norm": 0.2588523030281067, + "learning_rate": 4.512701286095333e-06, + "loss": 0.0554, + "num_input_tokens_seen": 180304528, + "step": 148185 + }, + { + "epoch": 16.504065040650406, + "grad_norm": 0.0053842985071241856, + "learning_rate": 4.511308922732826e-06, + "loss": 0.0411, + "num_input_tokens_seen": 180310800, + "step": 148190 + }, + { + "epoch": 16.504621895534022, + "grad_norm": 0.007185703609138727, + "learning_rate": 4.509916752902421e-06, + "loss": 0.0305, + "num_input_tokens_seen": 180317008, + "step": 148195 + }, + { + "epoch": 16.50517875041764, + "grad_norm": 0.015495174564421177, + "learning_rate": 4.508524776617262e-06, + "loss": 0.014, + "num_input_tokens_seen": 180322896, + "step": 148200 + }, + { + "epoch": 16.505735605301258, + "grad_norm": 0.00022918642207514495, + "learning_rate": 4.507132993890511e-06, + "loss": 0.0078, + "num_input_tokens_seen": 180329072, + "step": 148205 + }, + { + "epoch": 16.506292460184877, + "grad_norm": 0.0050284117460250854, + "learning_rate": 4.505741404735301e-06, + "loss": 0.0399, + "num_input_tokens_seen": 180335152, + "step": 148210 + }, + { + "epoch": 16.506849315068493, + "grad_norm": 0.04368409141898155, + "learning_rate": 4.50435000916479e-06, + "loss": 0.0792, + "num_input_tokens_seen": 180341104, + "step": 148215 + }, + { + "epoch": 16.50740616995211, + "grad_norm": 1.4323030710220337, + "learning_rate": 4.502958807192112e-06, + "loss": 0.1043, + "num_input_tokens_seen": 180346480, + "step": 148220 + }, + { + "epoch": 16.50796302483573, + "grad_norm": 0.8593956232070923, + "learning_rate": 4.501567798830411e-06, + "loss": 0.0224, + "num_input_tokens_seen": 180352560, + "step": 148225 + }, + { + "epoch": 16.508519879719344, + "grad_norm": 0.19861933588981628, + "learning_rate": 4.500176984092819e-06, + "loss": 0.032, + "num_input_tokens_seen": 180358704, + "step": 148230 + }, + { + "epoch": 16.509076734602964, + "grad_norm": 0.18088944256305695, + "learning_rate": 4.498786362992488e-06, + "loss": 0.0439, + "num_input_tokens_seen": 180364976, + "step": 148235 + }, + { + "epoch": 16.50963358948658, + "grad_norm": 2.053196430206299, + "learning_rate": 4.497395935542542e-06, + "loss": 0.0891, + "num_input_tokens_seen": 180371312, + "step": 148240 + }, + { + "epoch": 16.510190444370195, + "grad_norm": 0.00635703606531024, + "learning_rate": 4.49600570175612e-06, + "loss": 0.0159, + "num_input_tokens_seen": 180377456, + "step": 148245 + }, + { + "epoch": 16.510747299253815, + "grad_norm": 0.12727852165699005, + "learning_rate": 4.494615661646342e-06, + "loss": 0.06, + "num_input_tokens_seen": 180383504, + "step": 148250 + }, + { + "epoch": 16.51130415413743, + "grad_norm": 0.016827315092086792, + "learning_rate": 4.493225815226357e-06, + "loss": 0.0661, + "num_input_tokens_seen": 180388720, + "step": 148255 + }, + { + "epoch": 16.51186100902105, + "grad_norm": 0.7161129117012024, + "learning_rate": 4.491836162509283e-06, + "loss": 0.0092, + "num_input_tokens_seen": 180394928, + "step": 148260 + }, + { + "epoch": 16.512417863904666, + "grad_norm": 0.008369632065296173, + "learning_rate": 4.49044670350825e-06, + "loss": 0.108, + "num_input_tokens_seen": 180400784, + "step": 148265 + }, + { + "epoch": 16.512974718788286, + "grad_norm": 1.9741394519805908, + "learning_rate": 4.489057438236369e-06, + "loss": 0.0809, + "num_input_tokens_seen": 180407280, + "step": 148270 + }, + { + "epoch": 16.5135315736719, + "grad_norm": 0.005510380491614342, + "learning_rate": 4.487668366706782e-06, + "loss": 0.0521, + "num_input_tokens_seen": 180413648, + "step": 148275 + }, + { + "epoch": 16.514088428555517, + "grad_norm": 1.4090901613235474, + "learning_rate": 4.486279488932596e-06, + "loss": 0.027, + "num_input_tokens_seen": 180419632, + "step": 148280 + }, + { + "epoch": 16.514645283439137, + "grad_norm": 0.013105804100632668, + "learning_rate": 4.484890804926941e-06, + "loss": 0.0644, + "num_input_tokens_seen": 180425648, + "step": 148285 + }, + { + "epoch": 16.515202138322753, + "grad_norm": 0.6773641705513, + "learning_rate": 4.483502314702928e-06, + "loss": 0.0524, + "num_input_tokens_seen": 180431600, + "step": 148290 + }, + { + "epoch": 16.51575899320637, + "grad_norm": 1.5116095542907715, + "learning_rate": 4.482114018273675e-06, + "loss": 0.0682, + "num_input_tokens_seen": 180437776, + "step": 148295 + }, + { + "epoch": 16.516315848089988, + "grad_norm": 5.2145562171936035, + "learning_rate": 4.480725915652287e-06, + "loss": 0.1246, + "num_input_tokens_seen": 180443952, + "step": 148300 + }, + { + "epoch": 16.516872702973604, + "grad_norm": 0.00144767458550632, + "learning_rate": 4.479338006851888e-06, + "loss": 0.0097, + "num_input_tokens_seen": 180450096, + "step": 148305 + }, + { + "epoch": 16.517429557857223, + "grad_norm": 0.004833908285945654, + "learning_rate": 4.477950291885583e-06, + "loss": 0.0202, + "num_input_tokens_seen": 180456304, + "step": 148310 + }, + { + "epoch": 16.51798641274084, + "grad_norm": 0.020229671150445938, + "learning_rate": 4.476562770766479e-06, + "loss": 0.0073, + "num_input_tokens_seen": 180462448, + "step": 148315 + }, + { + "epoch": 16.51854326762446, + "grad_norm": 0.16655506193637848, + "learning_rate": 4.475175443507676e-06, + "loss": 0.086, + "num_input_tokens_seen": 180468464, + "step": 148320 + }, + { + "epoch": 16.519100122508075, + "grad_norm": 0.2837037742137909, + "learning_rate": 4.4737883101222925e-06, + "loss": 0.0139, + "num_input_tokens_seen": 180474480, + "step": 148325 + }, + { + "epoch": 16.51965697739169, + "grad_norm": 1.144059658050537, + "learning_rate": 4.4724013706234165e-06, + "loss": 0.0669, + "num_input_tokens_seen": 180480528, + "step": 148330 + }, + { + "epoch": 16.52021383227531, + "grad_norm": 0.00010695921810111031, + "learning_rate": 4.471014625024169e-06, + "loss": 0.0583, + "num_input_tokens_seen": 180486928, + "step": 148335 + }, + { + "epoch": 16.520770687158926, + "grad_norm": 0.04695823788642883, + "learning_rate": 4.469628073337623e-06, + "loss": 0.0354, + "num_input_tokens_seen": 180492784, + "step": 148340 + }, + { + "epoch": 16.521327542042545, + "grad_norm": 0.04858176410198212, + "learning_rate": 4.468241715576896e-06, + "loss": 0.032, + "num_input_tokens_seen": 180498576, + "step": 148345 + }, + { + "epoch": 16.52188439692616, + "grad_norm": 0.9745320677757263, + "learning_rate": 4.466855551755067e-06, + "loss": 0.0756, + "num_input_tokens_seen": 180504432, + "step": 148350 + }, + { + "epoch": 16.522441251809777, + "grad_norm": 0.9461808204650879, + "learning_rate": 4.465469581885248e-06, + "loss": 0.0918, + "num_input_tokens_seen": 180509712, + "step": 148355 + }, + { + "epoch": 16.522998106693397, + "grad_norm": 0.9621710181236267, + "learning_rate": 4.4640838059805175e-06, + "loss": 0.1561, + "num_input_tokens_seen": 180515632, + "step": 148360 + }, + { + "epoch": 16.523554961577013, + "grad_norm": 1.2361818552017212, + "learning_rate": 4.462698224053971e-06, + "loss": 0.0811, + "num_input_tokens_seen": 180521520, + "step": 148365 + }, + { + "epoch": 16.524111816460632, + "grad_norm": 0.01720314286649227, + "learning_rate": 4.461312836118687e-06, + "loss": 0.0489, + "num_input_tokens_seen": 180527632, + "step": 148370 + }, + { + "epoch": 16.524668671344248, + "grad_norm": 0.0046353633515536785, + "learning_rate": 4.459927642187764e-06, + "loss": 0.0839, + "num_input_tokens_seen": 180533552, + "step": 148375 + }, + { + "epoch": 16.525225526227864, + "grad_norm": 0.001283219433389604, + "learning_rate": 4.4585426422742795e-06, + "loss": 0.0014, + "num_input_tokens_seen": 180539120, + "step": 148380 + }, + { + "epoch": 16.525782381111483, + "grad_norm": 0.013636181131005287, + "learning_rate": 4.457157836391321e-06, + "loss": 0.0035, + "num_input_tokens_seen": 180544688, + "step": 148385 + }, + { + "epoch": 16.5263392359951, + "grad_norm": 0.04330505058169365, + "learning_rate": 4.455773224551957e-06, + "loss": 0.0003, + "num_input_tokens_seen": 180550672, + "step": 148390 + }, + { + "epoch": 16.52689609087872, + "grad_norm": 0.24148248136043549, + "learning_rate": 4.45438880676928e-06, + "loss": 0.0493, + "num_input_tokens_seen": 180556624, + "step": 148395 + }, + { + "epoch": 16.527452945762334, + "grad_norm": 0.21453408896923065, + "learning_rate": 4.453004583056358e-06, + "loss": 0.0188, + "num_input_tokens_seen": 180562832, + "step": 148400 + }, + { + "epoch": 16.52800980064595, + "grad_norm": 3.1595511436462402, + "learning_rate": 4.4516205534262805e-06, + "loss": 0.039, + "num_input_tokens_seen": 180568304, + "step": 148405 + }, + { + "epoch": 16.52856665552957, + "grad_norm": 0.8629342913627625, + "learning_rate": 4.450236717892098e-06, + "loss": 0.0074, + "num_input_tokens_seen": 180574192, + "step": 148410 + }, + { + "epoch": 16.529123510413186, + "grad_norm": 0.0001510449656052515, + "learning_rate": 4.448853076466899e-06, + "loss": 0.2049, + "num_input_tokens_seen": 180580560, + "step": 148415 + }, + { + "epoch": 16.529680365296805, + "grad_norm": 1.241856336593628, + "learning_rate": 4.447469629163742e-06, + "loss": 0.0979, + "num_input_tokens_seen": 180586832, + "step": 148420 + }, + { + "epoch": 16.53023722018042, + "grad_norm": 0.006475728005170822, + "learning_rate": 4.446086375995709e-06, + "loss": 0.0462, + "num_input_tokens_seen": 180592976, + "step": 148425 + }, + { + "epoch": 16.530794075064037, + "grad_norm": 0.00925683043897152, + "learning_rate": 4.444703316975857e-06, + "loss": 0.0045, + "num_input_tokens_seen": 180598928, + "step": 148430 + }, + { + "epoch": 16.531350929947656, + "grad_norm": 0.010573306120932102, + "learning_rate": 4.44332045211725e-06, + "loss": 0.0569, + "num_input_tokens_seen": 180605488, + "step": 148435 + }, + { + "epoch": 16.531907784831272, + "grad_norm": 0.9082740545272827, + "learning_rate": 4.441937781432945e-06, + "loss": 0.0196, + "num_input_tokens_seen": 180611760, + "step": 148440 + }, + { + "epoch": 16.53246463971489, + "grad_norm": 0.09725825488567352, + "learning_rate": 4.4405553049360146e-06, + "loss": 0.1205, + "num_input_tokens_seen": 180617584, + "step": 148445 + }, + { + "epoch": 16.533021494598508, + "grad_norm": 0.25866344571113586, + "learning_rate": 4.439173022639512e-06, + "loss": 0.0529, + "num_input_tokens_seen": 180623280, + "step": 148450 + }, + { + "epoch": 16.533578349482124, + "grad_norm": 0.6268676519393921, + "learning_rate": 4.437790934556491e-06, + "loss": 0.0815, + "num_input_tokens_seen": 180629648, + "step": 148455 + }, + { + "epoch": 16.534135204365743, + "grad_norm": 2.2287912368774414, + "learning_rate": 4.436409040700004e-06, + "loss": 0.0846, + "num_input_tokens_seen": 180635792, + "step": 148460 + }, + { + "epoch": 16.53469205924936, + "grad_norm": 0.06150602176785469, + "learning_rate": 4.435027341083114e-06, + "loss": 0.0055, + "num_input_tokens_seen": 180641808, + "step": 148465 + }, + { + "epoch": 16.53524891413298, + "grad_norm": 0.4031185805797577, + "learning_rate": 4.433645835718864e-06, + "loss": 0.0069, + "num_input_tokens_seen": 180648208, + "step": 148470 + }, + { + "epoch": 16.535805769016594, + "grad_norm": 1.0835683345794678, + "learning_rate": 4.4322645246203106e-06, + "loss": 0.0932, + "num_input_tokens_seen": 180654256, + "step": 148475 + }, + { + "epoch": 16.53636262390021, + "grad_norm": 0.163790762424469, + "learning_rate": 4.4308834078004985e-06, + "loss": 0.0211, + "num_input_tokens_seen": 180660144, + "step": 148480 + }, + { + "epoch": 16.53691947878383, + "grad_norm": 0.0010577929206192493, + "learning_rate": 4.429502485272471e-06, + "loss": 0.1005, + "num_input_tokens_seen": 180666224, + "step": 148485 + }, + { + "epoch": 16.537476333667446, + "grad_norm": 0.22672604024410248, + "learning_rate": 4.428121757049267e-06, + "loss": 0.1423, + "num_input_tokens_seen": 180672048, + "step": 148490 + }, + { + "epoch": 16.538033188551065, + "grad_norm": 0.26758721470832825, + "learning_rate": 4.4267412231439436e-06, + "loss": 0.0223, + "num_input_tokens_seen": 180678160, + "step": 148495 + }, + { + "epoch": 16.53859004343468, + "grad_norm": 0.0015966958599165082, + "learning_rate": 4.425360883569529e-06, + "loss": 0.0042, + "num_input_tokens_seen": 180684240, + "step": 148500 + }, + { + "epoch": 16.539146898318297, + "grad_norm": 0.3874513804912567, + "learning_rate": 4.423980738339068e-06, + "loss": 0.1247, + "num_input_tokens_seen": 180690352, + "step": 148505 + }, + { + "epoch": 16.539703753201916, + "grad_norm": 0.24055588245391846, + "learning_rate": 4.422600787465591e-06, + "loss": 0.1047, + "num_input_tokens_seen": 180696688, + "step": 148510 + }, + { + "epoch": 16.540260608085532, + "grad_norm": 0.10347659885883331, + "learning_rate": 4.421221030962133e-06, + "loss": 0.0204, + "num_input_tokens_seen": 180702320, + "step": 148515 + }, + { + "epoch": 16.54081746296915, + "grad_norm": 1.608237862586975, + "learning_rate": 4.4198414688417344e-06, + "loss": 0.0603, + "num_input_tokens_seen": 180708688, + "step": 148520 + }, + { + "epoch": 16.541374317852767, + "grad_norm": 0.10249673575162888, + "learning_rate": 4.418462101117421e-06, + "loss": 0.0381, + "num_input_tokens_seen": 180714736, + "step": 148525 + }, + { + "epoch": 16.541931172736383, + "grad_norm": 0.04096012935042381, + "learning_rate": 4.417082927802224e-06, + "loss": 0.0439, + "num_input_tokens_seen": 180720880, + "step": 148530 + }, + { + "epoch": 16.542488027620003, + "grad_norm": 0.11443165689706802, + "learning_rate": 4.415703948909161e-06, + "loss": 0.0275, + "num_input_tokens_seen": 180726416, + "step": 148535 + }, + { + "epoch": 16.54304488250362, + "grad_norm": 2.0998289585113525, + "learning_rate": 4.414325164451274e-06, + "loss": 0.0556, + "num_input_tokens_seen": 180732336, + "step": 148540 + }, + { + "epoch": 16.543601737387238, + "grad_norm": 0.15044021606445312, + "learning_rate": 4.412946574441573e-06, + "loss": 0.0124, + "num_input_tokens_seen": 180738192, + "step": 148545 + }, + { + "epoch": 16.544158592270854, + "grad_norm": 1.2045371532440186, + "learning_rate": 4.4115681788930995e-06, + "loss": 0.1155, + "num_input_tokens_seen": 180744208, + "step": 148550 + }, + { + "epoch": 16.54471544715447, + "grad_norm": 0.7837570309638977, + "learning_rate": 4.410189977818843e-06, + "loss": 0.0912, + "num_input_tokens_seen": 180750384, + "step": 148555 + }, + { + "epoch": 16.54527230203809, + "grad_norm": 1.4593071937561035, + "learning_rate": 4.408811971231849e-06, + "loss": 0.1707, + "num_input_tokens_seen": 180756304, + "step": 148560 + }, + { + "epoch": 16.545829156921705, + "grad_norm": 0.00016313616652041674, + "learning_rate": 4.407434159145116e-06, + "loss": 0.01, + "num_input_tokens_seen": 180762352, + "step": 148565 + }, + { + "epoch": 16.546386011805325, + "grad_norm": 1.6075611114501953, + "learning_rate": 4.406056541571671e-06, + "loss": 0.1889, + "num_input_tokens_seen": 180768464, + "step": 148570 + }, + { + "epoch": 16.54694286668894, + "grad_norm": 0.004053956363350153, + "learning_rate": 4.404679118524521e-06, + "loss": 0.0079, + "num_input_tokens_seen": 180774608, + "step": 148575 + }, + { + "epoch": 16.547499721572557, + "grad_norm": 0.03178726136684418, + "learning_rate": 4.403301890016679e-06, + "loss": 0.0033, + "num_input_tokens_seen": 180780688, + "step": 148580 + }, + { + "epoch": 16.548056576456176, + "grad_norm": 0.00033910400816239417, + "learning_rate": 4.401924856061146e-06, + "loss": 0.0878, + "num_input_tokens_seen": 180787184, + "step": 148585 + }, + { + "epoch": 16.548613431339792, + "grad_norm": 0.9776778221130371, + "learning_rate": 4.400548016670941e-06, + "loss": 0.1693, + "num_input_tokens_seen": 180793552, + "step": 148590 + }, + { + "epoch": 16.54917028622341, + "grad_norm": 0.10040059685707092, + "learning_rate": 4.399171371859062e-06, + "loss": 0.008, + "num_input_tokens_seen": 180799920, + "step": 148595 + }, + { + "epoch": 16.549727141107027, + "grad_norm": 0.3192201256752014, + "learning_rate": 4.397794921638518e-06, + "loss": 0.1066, + "num_input_tokens_seen": 180806288, + "step": 148600 + }, + { + "epoch": 16.550283995990643, + "grad_norm": 0.0008421333623118699, + "learning_rate": 4.3964186660223e-06, + "loss": 0.0352, + "num_input_tokens_seen": 180812336, + "step": 148605 + }, + { + "epoch": 16.550840850874263, + "grad_norm": 0.005189838353544474, + "learning_rate": 4.395042605023422e-06, + "loss": 0.0009, + "num_input_tokens_seen": 180818384, + "step": 148610 + }, + { + "epoch": 16.55139770575788, + "grad_norm": 0.09215757995843887, + "learning_rate": 4.393666738654867e-06, + "loss": 0.0364, + "num_input_tokens_seen": 180824720, + "step": 148615 + }, + { + "epoch": 16.551954560641498, + "grad_norm": 0.025295257568359375, + "learning_rate": 4.3922910669296465e-06, + "loss": 0.0484, + "num_input_tokens_seen": 180830864, + "step": 148620 + }, + { + "epoch": 16.552511415525114, + "grad_norm": 0.03545156493782997, + "learning_rate": 4.3909155898607475e-06, + "loss": 0.0898, + "num_input_tokens_seen": 180837168, + "step": 148625 + }, + { + "epoch": 16.55306827040873, + "grad_norm": 0.26369747519493103, + "learning_rate": 4.389540307461163e-06, + "loss": 0.116, + "num_input_tokens_seen": 180843184, + "step": 148630 + }, + { + "epoch": 16.55362512529235, + "grad_norm": 0.010189928114414215, + "learning_rate": 4.388165219743875e-06, + "loss": 0.0162, + "num_input_tokens_seen": 180849264, + "step": 148635 + }, + { + "epoch": 16.554181980175965, + "grad_norm": 0.6379824280738831, + "learning_rate": 4.386790326721887e-06, + "loss": 0.0465, + "num_input_tokens_seen": 180855440, + "step": 148640 + }, + { + "epoch": 16.554738835059585, + "grad_norm": 0.500226616859436, + "learning_rate": 4.385415628408182e-06, + "loss": 0.0082, + "num_input_tokens_seen": 180861488, + "step": 148645 + }, + { + "epoch": 16.5552956899432, + "grad_norm": 0.47725048661231995, + "learning_rate": 4.384041124815738e-06, + "loss": 0.0093, + "num_input_tokens_seen": 180867536, + "step": 148650 + }, + { + "epoch": 16.55585254482682, + "grad_norm": 1.6186507940292358, + "learning_rate": 4.382666815957539e-06, + "loss": 0.0633, + "num_input_tokens_seen": 180873232, + "step": 148655 + }, + { + "epoch": 16.556409399710436, + "grad_norm": 0.2314387559890747, + "learning_rate": 4.3812927018465785e-06, + "loss": 0.0859, + "num_input_tokens_seen": 180879376, + "step": 148660 + }, + { + "epoch": 16.55696625459405, + "grad_norm": 0.22847971320152283, + "learning_rate": 4.379918782495821e-06, + "loss": 0.0107, + "num_input_tokens_seen": 180885456, + "step": 148665 + }, + { + "epoch": 16.55752310947767, + "grad_norm": 0.0417516827583313, + "learning_rate": 4.3785450579182624e-06, + "loss": 0.0111, + "num_input_tokens_seen": 180891632, + "step": 148670 + }, + { + "epoch": 16.558079964361287, + "grad_norm": 0.07793831825256348, + "learning_rate": 4.377171528126853e-06, + "loss": 0.03, + "num_input_tokens_seen": 180897744, + "step": 148675 + }, + { + "epoch": 16.558636819244906, + "grad_norm": 0.007204643450677395, + "learning_rate": 4.3757981931345895e-06, + "loss": 0.027, + "num_input_tokens_seen": 180903728, + "step": 148680 + }, + { + "epoch": 16.559193674128522, + "grad_norm": 3.1363770961761475, + "learning_rate": 4.3744250529544315e-06, + "loss": 0.1142, + "num_input_tokens_seen": 180909872, + "step": 148685 + }, + { + "epoch": 16.55975052901214, + "grad_norm": 3.1220357418060303, + "learning_rate": 4.373052107599357e-06, + "loss": 0.0264, + "num_input_tokens_seen": 180915920, + "step": 148690 + }, + { + "epoch": 16.560307383895758, + "grad_norm": 1.8603885173797607, + "learning_rate": 4.371679357082331e-06, + "loss": 0.0707, + "num_input_tokens_seen": 180922096, + "step": 148695 + }, + { + "epoch": 16.560864238779374, + "grad_norm": 0.020095573738217354, + "learning_rate": 4.370306801416324e-06, + "loss": 0.1118, + "num_input_tokens_seen": 180927536, + "step": 148700 + }, + { + "epoch": 16.561421093662993, + "grad_norm": 0.25413015484809875, + "learning_rate": 4.368934440614289e-06, + "loss": 0.053, + "num_input_tokens_seen": 180933680, + "step": 148705 + }, + { + "epoch": 16.56197794854661, + "grad_norm": 1.7926950454711914, + "learning_rate": 4.367562274689205e-06, + "loss": 0.0576, + "num_input_tokens_seen": 180939760, + "step": 148710 + }, + { + "epoch": 16.562534803430225, + "grad_norm": 0.00788615271449089, + "learning_rate": 4.3661903036540245e-06, + "loss": 0.0121, + "num_input_tokens_seen": 180946000, + "step": 148715 + }, + { + "epoch": 16.563091658313844, + "grad_norm": 0.0006548145902343094, + "learning_rate": 4.364818527521708e-06, + "loss": 0.0824, + "num_input_tokens_seen": 180952304, + "step": 148720 + }, + { + "epoch": 16.56364851319746, + "grad_norm": 0.008240959607064724, + "learning_rate": 4.363446946305208e-06, + "loss": 0.0018, + "num_input_tokens_seen": 180958768, + "step": 148725 + }, + { + "epoch": 16.56420536808108, + "grad_norm": 0.02345559187233448, + "learning_rate": 4.362075560017489e-06, + "loss": 0.0072, + "num_input_tokens_seen": 180964752, + "step": 148730 + }, + { + "epoch": 16.564762222964696, + "grad_norm": 0.32135066390037537, + "learning_rate": 4.3607043686714974e-06, + "loss": 0.0107, + "num_input_tokens_seen": 180970384, + "step": 148735 + }, + { + "epoch": 16.56531907784831, + "grad_norm": 0.38809871673583984, + "learning_rate": 4.359333372280203e-06, + "loss": 0.0762, + "num_input_tokens_seen": 180976464, + "step": 148740 + }, + { + "epoch": 16.56587593273193, + "grad_norm": 0.0851505845785141, + "learning_rate": 4.357962570856527e-06, + "loss": 0.0096, + "num_input_tokens_seen": 180982512, + "step": 148745 + }, + { + "epoch": 16.566432787615547, + "grad_norm": 0.00039804057450965047, + "learning_rate": 4.356591964413439e-06, + "loss": 0.0387, + "num_input_tokens_seen": 180988624, + "step": 148750 + }, + { + "epoch": 16.566989642499166, + "grad_norm": 1.1473968029022217, + "learning_rate": 4.355221552963873e-06, + "loss": 0.0322, + "num_input_tokens_seen": 180994832, + "step": 148755 + }, + { + "epoch": 16.567546497382782, + "grad_norm": 0.11617586761713028, + "learning_rate": 4.353851336520787e-06, + "loss": 0.0479, + "num_input_tokens_seen": 181000912, + "step": 148760 + }, + { + "epoch": 16.568103352266398, + "grad_norm": 0.21893192827701569, + "learning_rate": 4.352481315097115e-06, + "loss": 0.0684, + "num_input_tokens_seen": 181006832, + "step": 148765 + }, + { + "epoch": 16.568660207150018, + "grad_norm": 0.015463544987142086, + "learning_rate": 4.3511114887058e-06, + "loss": 0.0246, + "num_input_tokens_seen": 181013424, + "step": 148770 + }, + { + "epoch": 16.569217062033633, + "grad_norm": 0.0020868508145213127, + "learning_rate": 4.349741857359774e-06, + "loss": 0.0434, + "num_input_tokens_seen": 181019824, + "step": 148775 + }, + { + "epoch": 16.569773916917253, + "grad_norm": 0.06098281219601631, + "learning_rate": 4.348372421071989e-06, + "loss": 0.0945, + "num_input_tokens_seen": 181026160, + "step": 148780 + }, + { + "epoch": 16.57033077180087, + "grad_norm": 0.0006338886450976133, + "learning_rate": 4.347003179855369e-06, + "loss": 0.1009, + "num_input_tokens_seen": 181032528, + "step": 148785 + }, + { + "epoch": 16.570887626684485, + "grad_norm": 0.3458172380924225, + "learning_rate": 4.345634133722853e-06, + "loss": 0.0091, + "num_input_tokens_seen": 181038576, + "step": 148790 + }, + { + "epoch": 16.571444481568104, + "grad_norm": 0.0020398972555994987, + "learning_rate": 4.344265282687366e-06, + "loss": 0.0007, + "num_input_tokens_seen": 181044720, + "step": 148795 + }, + { + "epoch": 16.57200133645172, + "grad_norm": 0.038710299879312515, + "learning_rate": 4.342896626761847e-06, + "loss": 0.0702, + "num_input_tokens_seen": 181050960, + "step": 148800 + }, + { + "epoch": 16.57255819133534, + "grad_norm": 2.4524106979370117, + "learning_rate": 4.341528165959213e-06, + "loss": 0.1376, + "num_input_tokens_seen": 181056848, + "step": 148805 + }, + { + "epoch": 16.573115046218955, + "grad_norm": 0.20905043184757233, + "learning_rate": 4.3401599002924095e-06, + "loss": 0.0165, + "num_input_tokens_seen": 181062512, + "step": 148810 + }, + { + "epoch": 16.57367190110257, + "grad_norm": 1.0530449151992798, + "learning_rate": 4.338791829774336e-06, + "loss": 0.0287, + "num_input_tokens_seen": 181068880, + "step": 148815 + }, + { + "epoch": 16.57422875598619, + "grad_norm": 0.8532304167747498, + "learning_rate": 4.3374239544179324e-06, + "loss": 0.0409, + "num_input_tokens_seen": 181074704, + "step": 148820 + }, + { + "epoch": 16.574785610869807, + "grad_norm": 0.00013411539839580655, + "learning_rate": 4.336056274236108e-06, + "loss": 0.0178, + "num_input_tokens_seen": 181081040, + "step": 148825 + }, + { + "epoch": 16.575342465753426, + "grad_norm": 0.09884544461965561, + "learning_rate": 4.334688789241795e-06, + "loss": 0.0507, + "num_input_tokens_seen": 181087216, + "step": 148830 + }, + { + "epoch": 16.575899320637042, + "grad_norm": 2.1199519634246826, + "learning_rate": 4.333321499447904e-06, + "loss": 0.1459, + "num_input_tokens_seen": 181093328, + "step": 148835 + }, + { + "epoch": 16.576456175520658, + "grad_norm": 0.7603487372398376, + "learning_rate": 4.331954404867347e-06, + "loss": 0.0053, + "num_input_tokens_seen": 181099344, + "step": 148840 + }, + { + "epoch": 16.577013030404277, + "grad_norm": 0.026572469621896744, + "learning_rate": 4.330587505513034e-06, + "loss": 0.0272, + "num_input_tokens_seen": 181105136, + "step": 148845 + }, + { + "epoch": 16.577569885287893, + "grad_norm": 0.04172220081090927, + "learning_rate": 4.329220801397887e-06, + "loss": 0.004, + "num_input_tokens_seen": 181111088, + "step": 148850 + }, + { + "epoch": 16.578126740171513, + "grad_norm": 2.0054049491882324, + "learning_rate": 4.3278542925348135e-06, + "loss": 0.2747, + "num_input_tokens_seen": 181117136, + "step": 148855 + }, + { + "epoch": 16.57868359505513, + "grad_norm": 0.01850931905210018, + "learning_rate": 4.326487978936719e-06, + "loss": 0.0392, + "num_input_tokens_seen": 181123280, + "step": 148860 + }, + { + "epoch": 16.579240449938744, + "grad_norm": 0.033458735793828964, + "learning_rate": 4.325121860616499e-06, + "loss": 0.0603, + "num_input_tokens_seen": 181129488, + "step": 148865 + }, + { + "epoch": 16.579797304822364, + "grad_norm": 0.9327202439308167, + "learning_rate": 4.3237559375870766e-06, + "loss": 0.0158, + "num_input_tokens_seen": 181135376, + "step": 148870 + }, + { + "epoch": 16.58035415970598, + "grad_norm": 3.087134838104248, + "learning_rate": 4.3223902098613375e-06, + "loss": 0.057, + "num_input_tokens_seen": 181141584, + "step": 148875 + }, + { + "epoch": 16.5809110145896, + "grad_norm": 2.1756033897399902, + "learning_rate": 4.321024677452196e-06, + "loss": 0.0872, + "num_input_tokens_seen": 181147728, + "step": 148880 + }, + { + "epoch": 16.581467869473215, + "grad_norm": 0.010495370253920555, + "learning_rate": 4.319659340372545e-06, + "loss": 0.0815, + "num_input_tokens_seen": 181153808, + "step": 148885 + }, + { + "epoch": 16.58202472435683, + "grad_norm": 0.5068423748016357, + "learning_rate": 4.318294198635278e-06, + "loss": 0.0103, + "num_input_tokens_seen": 181159408, + "step": 148890 + }, + { + "epoch": 16.58258157924045, + "grad_norm": 0.03147829696536064, + "learning_rate": 4.316929252253288e-06, + "loss": 0.0196, + "num_input_tokens_seen": 181165456, + "step": 148895 + }, + { + "epoch": 16.583138434124066, + "grad_norm": 0.08424684405326843, + "learning_rate": 4.315564501239477e-06, + "loss": 0.0093, + "num_input_tokens_seen": 181171664, + "step": 148900 + }, + { + "epoch": 16.583695289007686, + "grad_norm": 0.07046417146921158, + "learning_rate": 4.314199945606734e-06, + "loss": 0.0208, + "num_input_tokens_seen": 181177808, + "step": 148905 + }, + { + "epoch": 16.5842521438913, + "grad_norm": 0.0016424717614427209, + "learning_rate": 4.312835585367945e-06, + "loss": 0.032, + "num_input_tokens_seen": 181184048, + "step": 148910 + }, + { + "epoch": 16.584808998774918, + "grad_norm": 0.004900780972093344, + "learning_rate": 4.31147142053599e-06, + "loss": 0.0755, + "num_input_tokens_seen": 181190032, + "step": 148915 + }, + { + "epoch": 16.585365853658537, + "grad_norm": 1.4110527038574219, + "learning_rate": 4.310107451123768e-06, + "loss": 0.0934, + "num_input_tokens_seen": 181196176, + "step": 148920 + }, + { + "epoch": 16.585922708542153, + "grad_norm": 0.6099837422370911, + "learning_rate": 4.3087436771441615e-06, + "loss": 0.0261, + "num_input_tokens_seen": 181201936, + "step": 148925 + }, + { + "epoch": 16.586479563425772, + "grad_norm": 0.17085237801074982, + "learning_rate": 4.307380098610045e-06, + "loss": 0.0094, + "num_input_tokens_seen": 181208272, + "step": 148930 + }, + { + "epoch": 16.58703641830939, + "grad_norm": 0.2683280110359192, + "learning_rate": 4.306016715534303e-06, + "loss": 0.0861, + "num_input_tokens_seen": 181214704, + "step": 148935 + }, + { + "epoch": 16.587593273193004, + "grad_norm": 0.20869161188602448, + "learning_rate": 4.3046535279298085e-06, + "loss": 0.0087, + "num_input_tokens_seen": 181220976, + "step": 148940 + }, + { + "epoch": 16.588150128076624, + "grad_norm": 0.9177181720733643, + "learning_rate": 4.3032905358094484e-06, + "loss": 0.126, + "num_input_tokens_seen": 181227184, + "step": 148945 + }, + { + "epoch": 16.58870698296024, + "grad_norm": 0.7436919212341309, + "learning_rate": 4.3019277391860815e-06, + "loss": 0.0468, + "num_input_tokens_seen": 181232848, + "step": 148950 + }, + { + "epoch": 16.58926383784386, + "grad_norm": 0.009056329727172852, + "learning_rate": 4.300565138072607e-06, + "loss": 0.0647, + "num_input_tokens_seen": 181239088, + "step": 148955 + }, + { + "epoch": 16.589820692727475, + "grad_norm": 0.28936123847961426, + "learning_rate": 4.299202732481863e-06, + "loss": 0.1037, + "num_input_tokens_seen": 181245200, + "step": 148960 + }, + { + "epoch": 16.59037754761109, + "grad_norm": 0.07005687057971954, + "learning_rate": 4.297840522426741e-06, + "loss": 0.0448, + "num_input_tokens_seen": 181251472, + "step": 148965 + }, + { + "epoch": 16.59093440249471, + "grad_norm": 0.3490368723869324, + "learning_rate": 4.296478507920096e-06, + "loss": 0.0356, + "num_input_tokens_seen": 181257680, + "step": 148970 + }, + { + "epoch": 16.591491257378326, + "grad_norm": 0.3680177927017212, + "learning_rate": 4.295116688974807e-06, + "loss": 0.0188, + "num_input_tokens_seen": 181263856, + "step": 148975 + }, + { + "epoch": 16.592048112261946, + "grad_norm": 0.21626320481300354, + "learning_rate": 4.293755065603727e-06, + "loss": 0.0024, + "num_input_tokens_seen": 181270000, + "step": 148980 + }, + { + "epoch": 16.59260496714556, + "grad_norm": 0.03333945944905281, + "learning_rate": 4.292393637819722e-06, + "loss": 0.017, + "num_input_tokens_seen": 181276304, + "step": 148985 + }, + { + "epoch": 16.59316182202918, + "grad_norm": 1.3941137790679932, + "learning_rate": 4.291032405635642e-06, + "loss": 0.1355, + "num_input_tokens_seen": 181282512, + "step": 148990 + }, + { + "epoch": 16.593718676912797, + "grad_norm": 0.3284333646297455, + "learning_rate": 4.289671369064357e-06, + "loss": 0.0219, + "num_input_tokens_seen": 181288048, + "step": 148995 + }, + { + "epoch": 16.594275531796413, + "grad_norm": 0.09253913909196854, + "learning_rate": 4.288310528118722e-06, + "loss": 0.0048, + "num_input_tokens_seen": 181294320, + "step": 149000 + }, + { + "epoch": 16.594832386680032, + "grad_norm": 0.7205637097358704, + "learning_rate": 4.286949882811586e-06, + "loss": 0.0458, + "num_input_tokens_seen": 181300528, + "step": 149005 + }, + { + "epoch": 16.595389241563648, + "grad_norm": 0.07413122057914734, + "learning_rate": 4.285589433155798e-06, + "loss": 0.0188, + "num_input_tokens_seen": 181306288, + "step": 149010 + }, + { + "epoch": 16.595946096447264, + "grad_norm": 0.7858470678329468, + "learning_rate": 4.284229179164221e-06, + "loss": 0.0941, + "num_input_tokens_seen": 181312400, + "step": 149015 + }, + { + "epoch": 16.596502951330883, + "grad_norm": 0.001659745816141367, + "learning_rate": 4.282869120849689e-06, + "loss": 0.0123, + "num_input_tokens_seen": 181318416, + "step": 149020 + }, + { + "epoch": 16.5970598062145, + "grad_norm": 0.0332660973072052, + "learning_rate": 4.281509258225063e-06, + "loss": 0.0491, + "num_input_tokens_seen": 181324336, + "step": 149025 + }, + { + "epoch": 16.59761666109812, + "grad_norm": 2.9367520809173584, + "learning_rate": 4.280149591303182e-06, + "loss": 0.0323, + "num_input_tokens_seen": 181330832, + "step": 149030 + }, + { + "epoch": 16.598173515981735, + "grad_norm": 0.46816757321357727, + "learning_rate": 4.278790120096887e-06, + "loss": 0.0724, + "num_input_tokens_seen": 181336816, + "step": 149035 + }, + { + "epoch": 16.598730370865354, + "grad_norm": 0.7309841513633728, + "learning_rate": 4.277430844619018e-06, + "loss": 0.0182, + "num_input_tokens_seen": 181342960, + "step": 149040 + }, + { + "epoch": 16.59928722574897, + "grad_norm": 5.388643741607666, + "learning_rate": 4.2760717648824195e-06, + "loss": 0.0411, + "num_input_tokens_seen": 181349264, + "step": 149045 + }, + { + "epoch": 16.599844080632586, + "grad_norm": 0.5595243573188782, + "learning_rate": 4.274712880899931e-06, + "loss": 0.0054, + "num_input_tokens_seen": 181355280, + "step": 149050 + }, + { + "epoch": 16.600400935516205, + "grad_norm": 0.13113753497600555, + "learning_rate": 4.273354192684381e-06, + "loss": 0.0791, + "num_input_tokens_seen": 181361520, + "step": 149055 + }, + { + "epoch": 16.60095779039982, + "grad_norm": 0.17556250095367432, + "learning_rate": 4.271995700248602e-06, + "loss": 0.0092, + "num_input_tokens_seen": 181367408, + "step": 149060 + }, + { + "epoch": 16.60151464528344, + "grad_norm": 0.07494545727968216, + "learning_rate": 4.270637403605435e-06, + "loss": 0.0562, + "num_input_tokens_seen": 181373392, + "step": 149065 + }, + { + "epoch": 16.602071500167057, + "grad_norm": 0.00010239400580758229, + "learning_rate": 4.269279302767701e-06, + "loss": 0.0083, + "num_input_tokens_seen": 181379408, + "step": 149070 + }, + { + "epoch": 16.602628355050673, + "grad_norm": 0.026999671012163162, + "learning_rate": 4.267921397748245e-06, + "loss": 0.0307, + "num_input_tokens_seen": 181385072, + "step": 149075 + }, + { + "epoch": 16.603185209934292, + "grad_norm": 0.10477966070175171, + "learning_rate": 4.266563688559869e-06, + "loss": 0.0052, + "num_input_tokens_seen": 181391184, + "step": 149080 + }, + { + "epoch": 16.603742064817908, + "grad_norm": 1.749010443687439, + "learning_rate": 4.265206175215417e-06, + "loss": 0.0531, + "num_input_tokens_seen": 181397328, + "step": 149085 + }, + { + "epoch": 16.604298919701527, + "grad_norm": 0.028222961351275444, + "learning_rate": 4.263848857727701e-06, + "loss": 0.0057, + "num_input_tokens_seen": 181403568, + "step": 149090 + }, + { + "epoch": 16.604855774585143, + "grad_norm": 0.08833928406238556, + "learning_rate": 4.26249173610955e-06, + "loss": 0.071, + "num_input_tokens_seen": 181409584, + "step": 149095 + }, + { + "epoch": 16.60541262946876, + "grad_norm": 0.009605101309716702, + "learning_rate": 4.261134810373779e-06, + "loss": 0.0616, + "num_input_tokens_seen": 181415664, + "step": 149100 + }, + { + "epoch": 16.60596948435238, + "grad_norm": 1.0270156860351562, + "learning_rate": 4.259778080533205e-06, + "loss": 0.0096, + "num_input_tokens_seen": 181421744, + "step": 149105 + }, + { + "epoch": 16.606526339235995, + "grad_norm": 0.07757432758808136, + "learning_rate": 4.2584215466006385e-06, + "loss": 0.1122, + "num_input_tokens_seen": 181427632, + "step": 149110 + }, + { + "epoch": 16.607083194119614, + "grad_norm": 0.38443291187286377, + "learning_rate": 4.257065208588903e-06, + "loss": 0.0128, + "num_input_tokens_seen": 181433456, + "step": 149115 + }, + { + "epoch": 16.60764004900323, + "grad_norm": 0.8403485417366028, + "learning_rate": 4.255709066510808e-06, + "loss": 0.0751, + "num_input_tokens_seen": 181439472, + "step": 149120 + }, + { + "epoch": 16.608196903886846, + "grad_norm": 0.7172843813896179, + "learning_rate": 4.2543531203791595e-06, + "loss": 0.0661, + "num_input_tokens_seen": 181445840, + "step": 149125 + }, + { + "epoch": 16.608753758770465, + "grad_norm": 0.2832147181034088, + "learning_rate": 4.252997370206763e-06, + "loss": 0.0101, + "num_input_tokens_seen": 181452080, + "step": 149130 + }, + { + "epoch": 16.60931061365408, + "grad_norm": 0.0001828497915994376, + "learning_rate": 4.2516418160064325e-06, + "loss": 0.0629, + "num_input_tokens_seen": 181457648, + "step": 149135 + }, + { + "epoch": 16.6098674685377, + "grad_norm": 1.1417802572250366, + "learning_rate": 4.250286457790961e-06, + "loss": 0.0578, + "num_input_tokens_seen": 181463888, + "step": 149140 + }, + { + "epoch": 16.610424323421316, + "grad_norm": 0.44768133759498596, + "learning_rate": 4.248931295573174e-06, + "loss": 0.0137, + "num_input_tokens_seen": 181470320, + "step": 149145 + }, + { + "epoch": 16.610981178304932, + "grad_norm": 0.17901495099067688, + "learning_rate": 4.24757632936584e-06, + "loss": 0.0676, + "num_input_tokens_seen": 181476528, + "step": 149150 + }, + { + "epoch": 16.611538033188552, + "grad_norm": 0.5122520923614502, + "learning_rate": 4.246221559181784e-06, + "loss": 0.007, + "num_input_tokens_seen": 181482640, + "step": 149155 + }, + { + "epoch": 16.612094888072168, + "grad_norm": 0.021879611536860466, + "learning_rate": 4.244866985033785e-06, + "loss": 0.062, + "num_input_tokens_seen": 181488720, + "step": 149160 + }, + { + "epoch": 16.612651742955787, + "grad_norm": 0.8349094390869141, + "learning_rate": 4.243512606934655e-06, + "loss": 0.0247, + "num_input_tokens_seen": 181494576, + "step": 149165 + }, + { + "epoch": 16.613208597839403, + "grad_norm": 0.0719083771109581, + "learning_rate": 4.2421584248971745e-06, + "loss": 0.0383, + "num_input_tokens_seen": 181501040, + "step": 149170 + }, + { + "epoch": 16.61376545272302, + "grad_norm": 1.5178340673446655, + "learning_rate": 4.24080443893414e-06, + "loss": 0.0287, + "num_input_tokens_seen": 181507184, + "step": 149175 + }, + { + "epoch": 16.61432230760664, + "grad_norm": 1.8289414644241333, + "learning_rate": 4.2394506490583325e-06, + "loss": 0.0312, + "num_input_tokens_seen": 181513264, + "step": 149180 + }, + { + "epoch": 16.614879162490254, + "grad_norm": 0.02127503603696823, + "learning_rate": 4.238097055282556e-06, + "loss": 0.0149, + "num_input_tokens_seen": 181519600, + "step": 149185 + }, + { + "epoch": 16.615436017373874, + "grad_norm": 0.18805666267871857, + "learning_rate": 4.2367436576195825e-06, + "loss": 0.0091, + "num_input_tokens_seen": 181525744, + "step": 149190 + }, + { + "epoch": 16.61599287225749, + "grad_norm": 0.000929257192183286, + "learning_rate": 4.235390456082203e-06, + "loss": 0.0322, + "num_input_tokens_seen": 181531664, + "step": 149195 + }, + { + "epoch": 16.616549727141106, + "grad_norm": 0.19431595504283905, + "learning_rate": 4.234037450683193e-06, + "loss": 0.0122, + "num_input_tokens_seen": 181538288, + "step": 149200 + }, + { + "epoch": 16.617106582024725, + "grad_norm": 1.5859839916229248, + "learning_rate": 4.232684641435339e-06, + "loss": 0.0569, + "num_input_tokens_seen": 181544144, + "step": 149205 + }, + { + "epoch": 16.61766343690834, + "grad_norm": 0.3410511612892151, + "learning_rate": 4.231332028351412e-06, + "loss": 0.0655, + "num_input_tokens_seen": 181550032, + "step": 149210 + }, + { + "epoch": 16.61822029179196, + "grad_norm": 0.0032472440507262945, + "learning_rate": 4.2299796114441985e-06, + "loss": 0.002, + "num_input_tokens_seen": 181556272, + "step": 149215 + }, + { + "epoch": 16.618777146675576, + "grad_norm": 0.36739474534988403, + "learning_rate": 4.228627390726472e-06, + "loss": 0.2433, + "num_input_tokens_seen": 181562320, + "step": 149220 + }, + { + "epoch": 16.619334001559192, + "grad_norm": 0.0002498178801033646, + "learning_rate": 4.2272753662109976e-06, + "loss": 0.0171, + "num_input_tokens_seen": 181568528, + "step": 149225 + }, + { + "epoch": 16.61989085644281, + "grad_norm": 0.11137717962265015, + "learning_rate": 4.225923537910545e-06, + "loss": 0.005, + "num_input_tokens_seen": 181574544, + "step": 149230 + }, + { + "epoch": 16.620447711326428, + "grad_norm": 0.47709834575653076, + "learning_rate": 4.224571905837895e-06, + "loss": 0.0849, + "num_input_tokens_seen": 181580592, + "step": 149235 + }, + { + "epoch": 16.621004566210047, + "grad_norm": 0.02398817054927349, + "learning_rate": 4.223220470005809e-06, + "loss": 0.0574, + "num_input_tokens_seen": 181586960, + "step": 149240 + }, + { + "epoch": 16.621561421093663, + "grad_norm": 0.31680649518966675, + "learning_rate": 4.2218692304270526e-06, + "loss": 0.1442, + "num_input_tokens_seen": 181592784, + "step": 149245 + }, + { + "epoch": 16.62211827597728, + "grad_norm": 0.06855995208024979, + "learning_rate": 4.2205181871143805e-06, + "loss": 0.0052, + "num_input_tokens_seen": 181599184, + "step": 149250 + }, + { + "epoch": 16.622675130860898, + "grad_norm": 0.023984335362911224, + "learning_rate": 4.2191673400805705e-06, + "loss": 0.13, + "num_input_tokens_seen": 181605328, + "step": 149255 + }, + { + "epoch": 16.623231985744514, + "grad_norm": 0.010014047846198082, + "learning_rate": 4.217816689338372e-06, + "loss": 0.0352, + "num_input_tokens_seen": 181611312, + "step": 149260 + }, + { + "epoch": 16.623788840628134, + "grad_norm": 0.2769463062286377, + "learning_rate": 4.2164662349005454e-06, + "loss": 0.004, + "num_input_tokens_seen": 181617392, + "step": 149265 + }, + { + "epoch": 16.62434569551175, + "grad_norm": 0.7981797456741333, + "learning_rate": 4.215115976779843e-06, + "loss": 0.0241, + "num_input_tokens_seen": 181623600, + "step": 149270 + }, + { + "epoch": 16.624902550395365, + "grad_norm": 0.37093591690063477, + "learning_rate": 4.213765914989026e-06, + "loss": 0.01, + "num_input_tokens_seen": 181629616, + "step": 149275 + }, + { + "epoch": 16.625459405278985, + "grad_norm": 2.8240785598754883, + "learning_rate": 4.2124160495408405e-06, + "loss": 0.0248, + "num_input_tokens_seen": 181635568, + "step": 149280 + }, + { + "epoch": 16.6260162601626, + "grad_norm": 3.875296115875244, + "learning_rate": 4.211066380448042e-06, + "loss": 0.037, + "num_input_tokens_seen": 181641392, + "step": 149285 + }, + { + "epoch": 16.62657311504622, + "grad_norm": 0.00022660972899757326, + "learning_rate": 4.209716907723382e-06, + "loss": 0.0557, + "num_input_tokens_seen": 181647280, + "step": 149290 + }, + { + "epoch": 16.627129969929836, + "grad_norm": 0.02850065752863884, + "learning_rate": 4.208367631379601e-06, + "loss": 0.0035, + "num_input_tokens_seen": 181653392, + "step": 149295 + }, + { + "epoch": 16.627686824813452, + "grad_norm": 0.22739994525909424, + "learning_rate": 4.207018551429437e-06, + "loss": 0.0211, + "num_input_tokens_seen": 181659504, + "step": 149300 + }, + { + "epoch": 16.62824367969707, + "grad_norm": 0.3037932813167572, + "learning_rate": 4.2056696678856505e-06, + "loss": 0.0213, + "num_input_tokens_seen": 181665872, + "step": 149305 + }, + { + "epoch": 16.628800534580687, + "grad_norm": 0.00016646542644593865, + "learning_rate": 4.204320980760976e-06, + "loss": 0.0507, + "num_input_tokens_seen": 181671920, + "step": 149310 + }, + { + "epoch": 16.629357389464307, + "grad_norm": 0.028266876935958862, + "learning_rate": 4.2029724900681465e-06, + "loss": 0.029, + "num_input_tokens_seen": 181677712, + "step": 149315 + }, + { + "epoch": 16.629914244347923, + "grad_norm": 0.5858495235443115, + "learning_rate": 4.2016241958199e-06, + "loss": 0.0141, + "num_input_tokens_seen": 181683824, + "step": 149320 + }, + { + "epoch": 16.63047109923154, + "grad_norm": 0.2520063817501068, + "learning_rate": 4.200276098028985e-06, + "loss": 0.0086, + "num_input_tokens_seen": 181690032, + "step": 149325 + }, + { + "epoch": 16.631027954115158, + "grad_norm": 0.0001445674424758181, + "learning_rate": 4.198928196708124e-06, + "loss": 0.0128, + "num_input_tokens_seen": 181695920, + "step": 149330 + }, + { + "epoch": 16.631584808998774, + "grad_norm": 0.0007752556120976806, + "learning_rate": 4.197580491870051e-06, + "loss": 0.0432, + "num_input_tokens_seen": 181702064, + "step": 149335 + }, + { + "epoch": 16.632141663882393, + "grad_norm": 0.16428694128990173, + "learning_rate": 4.196232983527498e-06, + "loss": 0.0631, + "num_input_tokens_seen": 181708304, + "step": 149340 + }, + { + "epoch": 16.63269851876601, + "grad_norm": 0.7004917860031128, + "learning_rate": 4.194885671693186e-06, + "loss": 0.1001, + "num_input_tokens_seen": 181713840, + "step": 149345 + }, + { + "epoch": 16.633255373649625, + "grad_norm": 1.1318665742874146, + "learning_rate": 4.193538556379856e-06, + "loss": 0.0544, + "num_input_tokens_seen": 181719696, + "step": 149350 + }, + { + "epoch": 16.633812228533245, + "grad_norm": 0.7737678289413452, + "learning_rate": 4.1921916376002155e-06, + "loss": 0.0091, + "num_input_tokens_seen": 181725968, + "step": 149355 + }, + { + "epoch": 16.63436908341686, + "grad_norm": 0.0009804433211684227, + "learning_rate": 4.190844915367007e-06, + "loss": 0.032, + "num_input_tokens_seen": 181732144, + "step": 149360 + }, + { + "epoch": 16.63492593830048, + "grad_norm": 2.47640061378479, + "learning_rate": 4.189498389692931e-06, + "loss": 0.1213, + "num_input_tokens_seen": 181738320, + "step": 149365 + }, + { + "epoch": 16.635482793184096, + "grad_norm": 0.00020763570501003414, + "learning_rate": 4.188152060590719e-06, + "loss": 0.0585, + "num_input_tokens_seen": 181744112, + "step": 149370 + }, + { + "epoch": 16.636039648067715, + "grad_norm": 0.10908928513526917, + "learning_rate": 4.186805928073082e-06, + "loss": 0.0039, + "num_input_tokens_seen": 181750416, + "step": 149375 + }, + { + "epoch": 16.63659650295133, + "grad_norm": 0.0006260598311200738, + "learning_rate": 4.1854599921527435e-06, + "loss": 0.0333, + "num_input_tokens_seen": 181756592, + "step": 149380 + }, + { + "epoch": 16.637153357834947, + "grad_norm": 0.03728031739592552, + "learning_rate": 4.184114252842411e-06, + "loss": 0.0316, + "num_input_tokens_seen": 181762736, + "step": 149385 + }, + { + "epoch": 16.637710212718567, + "grad_norm": 0.013749450445175171, + "learning_rate": 4.182768710154797e-06, + "loss": 0.0528, + "num_input_tokens_seen": 181768688, + "step": 149390 + }, + { + "epoch": 16.638267067602182, + "grad_norm": 0.17121239006519318, + "learning_rate": 4.181423364102602e-06, + "loss": 0.144, + "num_input_tokens_seen": 181774960, + "step": 149395 + }, + { + "epoch": 16.638823922485802, + "grad_norm": 0.2805958092212677, + "learning_rate": 4.1800782146985514e-06, + "loss": 0.012, + "num_input_tokens_seen": 181781328, + "step": 149400 + }, + { + "epoch": 16.639380777369418, + "grad_norm": 0.3812238574028015, + "learning_rate": 4.1787332619553445e-06, + "loss": 0.0421, + "num_input_tokens_seen": 181787504, + "step": 149405 + }, + { + "epoch": 16.639937632253034, + "grad_norm": 1.2528953552246094, + "learning_rate": 4.177388505885682e-06, + "loss": 0.0565, + "num_input_tokens_seen": 181793840, + "step": 149410 + }, + { + "epoch": 16.640494487136653, + "grad_norm": 0.16091646254062653, + "learning_rate": 4.176043946502261e-06, + "loss": 0.0159, + "num_input_tokens_seen": 181800048, + "step": 149415 + }, + { + "epoch": 16.64105134202027, + "grad_norm": 2.70094895362854, + "learning_rate": 4.174699583817798e-06, + "loss": 0.0625, + "num_input_tokens_seen": 181805936, + "step": 149420 + }, + { + "epoch": 16.64160819690389, + "grad_norm": 0.042272232472896576, + "learning_rate": 4.173355417844974e-06, + "loss": 0.1049, + "num_input_tokens_seen": 181812176, + "step": 149425 + }, + { + "epoch": 16.642165051787504, + "grad_norm": 0.7403698563575745, + "learning_rate": 4.172011448596499e-06, + "loss": 0.0985, + "num_input_tokens_seen": 181818288, + "step": 149430 + }, + { + "epoch": 16.64272190667112, + "grad_norm": 0.0008653742261230946, + "learning_rate": 4.170667676085066e-06, + "loss": 0.0669, + "num_input_tokens_seen": 181824880, + "step": 149435 + }, + { + "epoch": 16.64327876155474, + "grad_norm": 0.005704711657017469, + "learning_rate": 4.169324100323363e-06, + "loss": 0.0065, + "num_input_tokens_seen": 181830640, + "step": 149440 + }, + { + "epoch": 16.643835616438356, + "grad_norm": 0.00014734243450220674, + "learning_rate": 4.167980721324078e-06, + "loss": 0.0625, + "num_input_tokens_seen": 181836240, + "step": 149445 + }, + { + "epoch": 16.644392471321975, + "grad_norm": 0.00021728797582909465, + "learning_rate": 4.16663753909991e-06, + "loss": 0.0072, + "num_input_tokens_seen": 181842512, + "step": 149450 + }, + { + "epoch": 16.64494932620559, + "grad_norm": 0.015444094315171242, + "learning_rate": 4.1652945536635425e-06, + "loss": 0.0197, + "num_input_tokens_seen": 181848688, + "step": 149455 + }, + { + "epoch": 16.645506181089207, + "grad_norm": 0.009208236820995808, + "learning_rate": 4.1639517650276596e-06, + "loss": 0.0331, + "num_input_tokens_seen": 181854896, + "step": 149460 + }, + { + "epoch": 16.646063035972826, + "grad_norm": 0.03416505828499794, + "learning_rate": 4.1626091732049395e-06, + "loss": 0.0102, + "num_input_tokens_seen": 181860720, + "step": 149465 + }, + { + "epoch": 16.646619890856442, + "grad_norm": 0.0827348604798317, + "learning_rate": 4.1612667782080786e-06, + "loss": 0.0481, + "num_input_tokens_seen": 181866864, + "step": 149470 + }, + { + "epoch": 16.64717674574006, + "grad_norm": 0.049947962164878845, + "learning_rate": 4.159924580049742e-06, + "loss": 0.073, + "num_input_tokens_seen": 181873040, + "step": 149475 + }, + { + "epoch": 16.647733600623678, + "grad_norm": 0.06412393599748611, + "learning_rate": 4.1585825787426255e-06, + "loss": 0.0012, + "num_input_tokens_seen": 181879184, + "step": 149480 + }, + { + "epoch": 16.648290455507293, + "grad_norm": 0.0026502187829464674, + "learning_rate": 4.157240774299384e-06, + "loss": 0.0076, + "num_input_tokens_seen": 181885424, + "step": 149485 + }, + { + "epoch": 16.648847310390913, + "grad_norm": 0.372941792011261, + "learning_rate": 4.155899166732707e-06, + "loss": 0.0582, + "num_input_tokens_seen": 181891440, + "step": 149490 + }, + { + "epoch": 16.64940416527453, + "grad_norm": 1.0948209762573242, + "learning_rate": 4.1545577560552575e-06, + "loss": 0.0948, + "num_input_tokens_seen": 181897392, + "step": 149495 + }, + { + "epoch": 16.64996102015815, + "grad_norm": 0.0006123717757873237, + "learning_rate": 4.153216542279717e-06, + "loss": 0.0205, + "num_input_tokens_seen": 181903952, + "step": 149500 + }, + { + "epoch": 16.650517875041764, + "grad_norm": 0.12569867074489594, + "learning_rate": 4.1518755254187494e-06, + "loss": 0.005, + "num_input_tokens_seen": 181909424, + "step": 149505 + }, + { + "epoch": 16.65107472992538, + "grad_norm": 0.10928329825401306, + "learning_rate": 4.150534705485018e-06, + "loss": 0.0161, + "num_input_tokens_seen": 181915984, + "step": 149510 + }, + { + "epoch": 16.651631584809, + "grad_norm": 0.018902426585555077, + "learning_rate": 4.149194082491187e-06, + "loss": 0.0565, + "num_input_tokens_seen": 181922320, + "step": 149515 + }, + { + "epoch": 16.652188439692615, + "grad_norm": 0.13438500463962555, + "learning_rate": 4.147853656449926e-06, + "loss": 0.0053, + "num_input_tokens_seen": 181928656, + "step": 149520 + }, + { + "epoch": 16.652745294576235, + "grad_norm": 0.5227128863334656, + "learning_rate": 4.146513427373896e-06, + "loss": 0.0231, + "num_input_tokens_seen": 181934416, + "step": 149525 + }, + { + "epoch": 16.65330214945985, + "grad_norm": 0.07497914135456085, + "learning_rate": 4.145173395275756e-06, + "loss": 0.0023, + "num_input_tokens_seen": 181940656, + "step": 149530 + }, + { + "epoch": 16.653859004343467, + "grad_norm": 0.004365939646959305, + "learning_rate": 4.143833560168154e-06, + "loss": 0.0057, + "num_input_tokens_seen": 181946864, + "step": 149535 + }, + { + "epoch": 16.654415859227086, + "grad_norm": 0.49924564361572266, + "learning_rate": 4.142493922063759e-06, + "loss": 0.0387, + "num_input_tokens_seen": 181952880, + "step": 149540 + }, + { + "epoch": 16.654972714110702, + "grad_norm": 0.0004484808596316725, + "learning_rate": 4.141154480975215e-06, + "loss": 0.0038, + "num_input_tokens_seen": 181959120, + "step": 149545 + }, + { + "epoch": 16.65552956899432, + "grad_norm": 0.13148978352546692, + "learning_rate": 4.13981523691519e-06, + "loss": 0.1178, + "num_input_tokens_seen": 181965104, + "step": 149550 + }, + { + "epoch": 16.656086423877937, + "grad_norm": 0.054277945309877396, + "learning_rate": 4.138476189896309e-06, + "loss": 0.0365, + "num_input_tokens_seen": 181971280, + "step": 149555 + }, + { + "epoch": 16.656643278761553, + "grad_norm": 0.0073066228069365025, + "learning_rate": 4.137137339931244e-06, + "loss": 0.2129, + "num_input_tokens_seen": 181977488, + "step": 149560 + }, + { + "epoch": 16.657200133645173, + "grad_norm": 0.6244258880615234, + "learning_rate": 4.135798687032625e-06, + "loss": 0.0275, + "num_input_tokens_seen": 181983408, + "step": 149565 + }, + { + "epoch": 16.65775698852879, + "grad_norm": 0.002754050539806485, + "learning_rate": 4.134460231213108e-06, + "loss": 0.0414, + "num_input_tokens_seen": 181989680, + "step": 149570 + }, + { + "epoch": 16.658313843412408, + "grad_norm": 0.9170108437538147, + "learning_rate": 4.133121972485332e-06, + "loss": 0.0058, + "num_input_tokens_seen": 181995632, + "step": 149575 + }, + { + "epoch": 16.658870698296024, + "grad_norm": 1.4311704635620117, + "learning_rate": 4.1317839108619385e-06, + "loss": 0.1931, + "num_input_tokens_seen": 182002000, + "step": 149580 + }, + { + "epoch": 16.65942755317964, + "grad_norm": 0.24084758758544922, + "learning_rate": 4.130446046355557e-06, + "loss": 0.0097, + "num_input_tokens_seen": 182008400, + "step": 149585 + }, + { + "epoch": 16.65998440806326, + "grad_norm": 0.5814893245697021, + "learning_rate": 4.129108378978841e-06, + "loss": 0.219, + "num_input_tokens_seen": 182014480, + "step": 149590 + }, + { + "epoch": 16.660541262946875, + "grad_norm": 0.051017191261053085, + "learning_rate": 4.127770908744416e-06, + "loss": 0.072, + "num_input_tokens_seen": 182020048, + "step": 149595 + }, + { + "epoch": 16.661098117830495, + "grad_norm": 0.8306474089622498, + "learning_rate": 4.126433635664919e-06, + "loss": 0.0056, + "num_input_tokens_seen": 182026384, + "step": 149600 + }, + { + "epoch": 16.66165497271411, + "grad_norm": 0.3676050007343292, + "learning_rate": 4.125096559752972e-06, + "loss": 0.0396, + "num_input_tokens_seen": 182032304, + "step": 149605 + }, + { + "epoch": 16.662211827597726, + "grad_norm": 0.07584737986326218, + "learning_rate": 4.123759681021222e-06, + "loss": 0.038, + "num_input_tokens_seen": 182038544, + "step": 149610 + }, + { + "epoch": 16.662768682481346, + "grad_norm": 0.21479089558124542, + "learning_rate": 4.122422999482278e-06, + "loss": 0.0711, + "num_input_tokens_seen": 182044432, + "step": 149615 + }, + { + "epoch": 16.663325537364962, + "grad_norm": 0.003932063467800617, + "learning_rate": 4.121086515148784e-06, + "loss": 0.0767, + "num_input_tokens_seen": 182050448, + "step": 149620 + }, + { + "epoch": 16.66388239224858, + "grad_norm": 0.24059170484542847, + "learning_rate": 4.119750228033353e-06, + "loss": 0.0039, + "num_input_tokens_seen": 182056880, + "step": 149625 + }, + { + "epoch": 16.664439247132197, + "grad_norm": 1.2128304243087769, + "learning_rate": 4.118414138148613e-06, + "loss": 0.0351, + "num_input_tokens_seen": 182063280, + "step": 149630 + }, + { + "epoch": 16.664996102015813, + "grad_norm": 0.1610054075717926, + "learning_rate": 4.117078245507175e-06, + "loss": 0.0259, + "num_input_tokens_seen": 182069360, + "step": 149635 + }, + { + "epoch": 16.665552956899433, + "grad_norm": 1.2484458684921265, + "learning_rate": 4.115742550121671e-06, + "loss": 0.0555, + "num_input_tokens_seen": 182075440, + "step": 149640 + }, + { + "epoch": 16.66610981178305, + "grad_norm": 0.35122907161712646, + "learning_rate": 4.114407052004707e-06, + "loss": 0.0168, + "num_input_tokens_seen": 182081488, + "step": 149645 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 1.930564522743225, + "learning_rate": 4.113071751168904e-06, + "loss": 0.1261, + "num_input_tokens_seen": 182087344, + "step": 149650 + }, + { + "epoch": 16.667223521550284, + "grad_norm": 0.6033247709274292, + "learning_rate": 4.111736647626868e-06, + "loss": 0.0162, + "num_input_tokens_seen": 182093584, + "step": 149655 + }, + { + "epoch": 16.6677803764339, + "grad_norm": 0.7641740441322327, + "learning_rate": 4.110401741391217e-06, + "loss": 0.0495, + "num_input_tokens_seen": 182099536, + "step": 149660 + }, + { + "epoch": 16.66833723131752, + "grad_norm": 0.08789876848459244, + "learning_rate": 4.109067032474556e-06, + "loss": 0.0031, + "num_input_tokens_seen": 182105744, + "step": 149665 + }, + { + "epoch": 16.668894086201135, + "grad_norm": 0.207245871424675, + "learning_rate": 4.1077325208895064e-06, + "loss": 0.0386, + "num_input_tokens_seen": 182111888, + "step": 149670 + }, + { + "epoch": 16.669450941084754, + "grad_norm": 0.00012347761366982013, + "learning_rate": 4.106398206648648e-06, + "loss": 0.0404, + "num_input_tokens_seen": 182118000, + "step": 149675 + }, + { + "epoch": 16.67000779596837, + "grad_norm": 0.0017831831937655807, + "learning_rate": 4.105064089764604e-06, + "loss": 0.02, + "num_input_tokens_seen": 182123888, + "step": 149680 + }, + { + "epoch": 16.670564650851986, + "grad_norm": 0.01269389595836401, + "learning_rate": 4.103730170249964e-06, + "loss": 0.0381, + "num_input_tokens_seen": 182130256, + "step": 149685 + }, + { + "epoch": 16.671121505735606, + "grad_norm": 1.0107492208480835, + "learning_rate": 4.102396448117341e-06, + "loss": 0.0181, + "num_input_tokens_seen": 182136656, + "step": 149690 + }, + { + "epoch": 16.67167836061922, + "grad_norm": 1.1547183990478516, + "learning_rate": 4.101062923379328e-06, + "loss": 0.0794, + "num_input_tokens_seen": 182142448, + "step": 149695 + }, + { + "epoch": 16.67223521550284, + "grad_norm": 0.022844096645712852, + "learning_rate": 4.099729596048518e-06, + "loss": 0.0626, + "num_input_tokens_seen": 182148848, + "step": 149700 + }, + { + "epoch": 16.672792070386457, + "grad_norm": 0.0006979690515436232, + "learning_rate": 4.0983964661375e-06, + "loss": 0.033, + "num_input_tokens_seen": 182154992, + "step": 149705 + }, + { + "epoch": 16.673348925270076, + "grad_norm": 1.4171298742294312, + "learning_rate": 4.097063533658882e-06, + "loss": 0.1127, + "num_input_tokens_seen": 182161232, + "step": 149710 + }, + { + "epoch": 16.673905780153692, + "grad_norm": 0.0016924587544053793, + "learning_rate": 4.095730798625244e-06, + "loss": 0.084, + "num_input_tokens_seen": 182167568, + "step": 149715 + }, + { + "epoch": 16.674462635037308, + "grad_norm": 0.003244840307161212, + "learning_rate": 4.094398261049178e-06, + "loss": 0.0068, + "num_input_tokens_seen": 182173936, + "step": 149720 + }, + { + "epoch": 16.675019489920928, + "grad_norm": 1.3405495882034302, + "learning_rate": 4.093065920943262e-06, + "loss": 0.168, + "num_input_tokens_seen": 182179920, + "step": 149725 + }, + { + "epoch": 16.675576344804544, + "grad_norm": 0.015058180317282677, + "learning_rate": 4.091733778320098e-06, + "loss": 0.0397, + "num_input_tokens_seen": 182186160, + "step": 149730 + }, + { + "epoch": 16.67613319968816, + "grad_norm": 0.6438291072845459, + "learning_rate": 4.09040183319225e-06, + "loss": 0.0531, + "num_input_tokens_seen": 182192432, + "step": 149735 + }, + { + "epoch": 16.67669005457178, + "grad_norm": 1.1638277769088745, + "learning_rate": 4.089070085572324e-06, + "loss": 0.0292, + "num_input_tokens_seen": 182198032, + "step": 149740 + }, + { + "epoch": 16.677246909455395, + "grad_norm": 0.5197248458862305, + "learning_rate": 4.087738535472879e-06, + "loss": 0.0328, + "num_input_tokens_seen": 182203856, + "step": 149745 + }, + { + "epoch": 16.677803764339014, + "grad_norm": 0.0386606827378273, + "learning_rate": 4.08640718290649e-06, + "loss": 0.0457, + "num_input_tokens_seen": 182209936, + "step": 149750 + }, + { + "epoch": 16.67836061922263, + "grad_norm": 1.6715785264968872, + "learning_rate": 4.085076027885748e-06, + "loss": 0.1605, + "num_input_tokens_seen": 182216016, + "step": 149755 + }, + { + "epoch": 16.67891747410625, + "grad_norm": 0.00021595926955342293, + "learning_rate": 4.083745070423211e-06, + "loss": 0.0259, + "num_input_tokens_seen": 182222032, + "step": 149760 + }, + { + "epoch": 16.679474328989865, + "grad_norm": 0.0014004649128764868, + "learning_rate": 4.0824143105314764e-06, + "loss": 0.0616, + "num_input_tokens_seen": 182227952, + "step": 149765 + }, + { + "epoch": 16.68003118387348, + "grad_norm": 0.14824379980564117, + "learning_rate": 4.081083748223083e-06, + "loss": 0.0842, + "num_input_tokens_seen": 182234352, + "step": 149770 + }, + { + "epoch": 16.6805880387571, + "grad_norm": 0.11350537091493607, + "learning_rate": 4.0797533835106206e-06, + "loss": 0.1131, + "num_input_tokens_seen": 182240592, + "step": 149775 + }, + { + "epoch": 16.681144893640717, + "grad_norm": 0.18145838379859924, + "learning_rate": 4.07842321640664e-06, + "loss": 0.0254, + "num_input_tokens_seen": 182246160, + "step": 149780 + }, + { + "epoch": 16.681701748524336, + "grad_norm": 0.0034416301641613245, + "learning_rate": 4.077093246923721e-06, + "loss": 0.0306, + "num_input_tokens_seen": 182252400, + "step": 149785 + }, + { + "epoch": 16.682258603407952, + "grad_norm": 1.3617147207260132, + "learning_rate": 4.075763475074421e-06, + "loss": 0.0184, + "num_input_tokens_seen": 182258544, + "step": 149790 + }, + { + "epoch": 16.682815458291568, + "grad_norm": 0.44001221656799316, + "learning_rate": 4.0744339008713e-06, + "loss": 0.0511, + "num_input_tokens_seen": 182264752, + "step": 149795 + }, + { + "epoch": 16.683372313175187, + "grad_norm": 0.000594018492847681, + "learning_rate": 4.073104524326906e-06, + "loss": 0.0164, + "num_input_tokens_seen": 182271088, + "step": 149800 + }, + { + "epoch": 16.683929168058803, + "grad_norm": 6.358677864074707, + "learning_rate": 4.071775345453815e-06, + "loss": 0.0681, + "num_input_tokens_seen": 182277040, + "step": 149805 + }, + { + "epoch": 16.684486022942423, + "grad_norm": 0.07601768523454666, + "learning_rate": 4.070446364264574e-06, + "loss": 0.0042, + "num_input_tokens_seen": 182283184, + "step": 149810 + }, + { + "epoch": 16.68504287782604, + "grad_norm": 0.0021511029917746782, + "learning_rate": 4.069117580771734e-06, + "loss": 0.0155, + "num_input_tokens_seen": 182289680, + "step": 149815 + }, + { + "epoch": 16.685599732709655, + "grad_norm": 0.7068884372711182, + "learning_rate": 4.067788994987842e-06, + "loss": 0.0393, + "num_input_tokens_seen": 182296144, + "step": 149820 + }, + { + "epoch": 16.686156587593274, + "grad_norm": 1.6264046430587769, + "learning_rate": 4.066460606925463e-06, + "loss": 0.0804, + "num_input_tokens_seen": 182302320, + "step": 149825 + }, + { + "epoch": 16.68671344247689, + "grad_norm": 0.001545761595480144, + "learning_rate": 4.065132416597125e-06, + "loss": 0.0347, + "num_input_tokens_seen": 182308464, + "step": 149830 + }, + { + "epoch": 16.68727029736051, + "grad_norm": 1.3288416862487793, + "learning_rate": 4.063804424015391e-06, + "loss": 0.029, + "num_input_tokens_seen": 182314928, + "step": 149835 + }, + { + "epoch": 16.687827152244125, + "grad_norm": 1.4565891027450562, + "learning_rate": 4.062476629192799e-06, + "loss": 0.0928, + "num_input_tokens_seen": 182321392, + "step": 149840 + }, + { + "epoch": 16.68838400712774, + "grad_norm": 1.576891303062439, + "learning_rate": 4.061149032141889e-06, + "loss": 0.0364, + "num_input_tokens_seen": 182327568, + "step": 149845 + }, + { + "epoch": 16.68894086201136, + "grad_norm": 1.1969506740570068, + "learning_rate": 4.059821632875196e-06, + "loss": 0.0116, + "num_input_tokens_seen": 182333616, + "step": 149850 + }, + { + "epoch": 16.689497716894977, + "grad_norm": 0.8067730069160461, + "learning_rate": 4.058494431405271e-06, + "loss": 0.0812, + "num_input_tokens_seen": 182339632, + "step": 149855 + }, + { + "epoch": 16.690054571778596, + "grad_norm": 0.008436344563961029, + "learning_rate": 4.057167427744646e-06, + "loss": 0.0095, + "num_input_tokens_seen": 182345904, + "step": 149860 + }, + { + "epoch": 16.690611426662212, + "grad_norm": 0.04950755089521408, + "learning_rate": 4.055840621905852e-06, + "loss": 0.1205, + "num_input_tokens_seen": 182352144, + "step": 149865 + }, + { + "epoch": 16.691168281545828, + "grad_norm": 0.0003505401546135545, + "learning_rate": 4.054514013901417e-06, + "loss": 0.028, + "num_input_tokens_seen": 182358384, + "step": 149870 + }, + { + "epoch": 16.691725136429447, + "grad_norm": 0.15473107993602753, + "learning_rate": 4.053187603743885e-06, + "loss": 0.0534, + "num_input_tokens_seen": 182364432, + "step": 149875 + }, + { + "epoch": 16.692281991313063, + "grad_norm": 2.1494038105010986, + "learning_rate": 4.051861391445774e-06, + "loss": 0.1981, + "num_input_tokens_seen": 182370672, + "step": 149880 + }, + { + "epoch": 16.692838846196683, + "grad_norm": 0.0002929746697191149, + "learning_rate": 4.050535377019626e-06, + "loss": 0.0375, + "num_input_tokens_seen": 182377040, + "step": 149885 + }, + { + "epoch": 16.6933957010803, + "grad_norm": 1.559545874595642, + "learning_rate": 4.049209560477943e-06, + "loss": 0.0186, + "num_input_tokens_seen": 182383088, + "step": 149890 + }, + { + "epoch": 16.693952555963914, + "grad_norm": 0.07163284718990326, + "learning_rate": 4.047883941833269e-06, + "loss": 0.0225, + "num_input_tokens_seen": 182389200, + "step": 149895 + }, + { + "epoch": 16.694509410847534, + "grad_norm": 1.5590784549713135, + "learning_rate": 4.046558521098112e-06, + "loss": 0.0949, + "num_input_tokens_seen": 182394352, + "step": 149900 + }, + { + "epoch": 16.69506626573115, + "grad_norm": 0.049088988453149796, + "learning_rate": 4.0452332982850005e-06, + "loss": 0.0173, + "num_input_tokens_seen": 182400368, + "step": 149905 + }, + { + "epoch": 16.69562312061477, + "grad_norm": 0.09076453000307083, + "learning_rate": 4.043908273406452e-06, + "loss": 0.0119, + "num_input_tokens_seen": 182406672, + "step": 149910 + }, + { + "epoch": 16.696179975498385, + "grad_norm": 0.2619282603263855, + "learning_rate": 4.042583446474979e-06, + "loss": 0.0122, + "num_input_tokens_seen": 182413264, + "step": 149915 + }, + { + "epoch": 16.696736830382, + "grad_norm": 0.009135962463915348, + "learning_rate": 4.041258817503088e-06, + "loss": 0.0081, + "num_input_tokens_seen": 182419088, + "step": 149920 + }, + { + "epoch": 16.69729368526562, + "grad_norm": 0.1191464215517044, + "learning_rate": 4.039934386503308e-06, + "loss": 0.0089, + "num_input_tokens_seen": 182425296, + "step": 149925 + }, + { + "epoch": 16.697850540149236, + "grad_norm": 0.005569090135395527, + "learning_rate": 4.03861015348814e-06, + "loss": 0.1001, + "num_input_tokens_seen": 182431440, + "step": 149930 + }, + { + "epoch": 16.698407395032856, + "grad_norm": 1.2468782663345337, + "learning_rate": 4.037286118470093e-06, + "loss": 0.0703, + "num_input_tokens_seen": 182437456, + "step": 149935 + }, + { + "epoch": 16.69896424991647, + "grad_norm": 0.33623161911964417, + "learning_rate": 4.035962281461667e-06, + "loss": 0.0019, + "num_input_tokens_seen": 182443568, + "step": 149940 + }, + { + "epoch": 16.699521104800088, + "grad_norm": 0.00045184732880443335, + "learning_rate": 4.03463864247538e-06, + "loss": 0.0131, + "num_input_tokens_seen": 182449808, + "step": 149945 + }, + { + "epoch": 16.700077959683707, + "grad_norm": 1.4859099388122559, + "learning_rate": 4.033315201523722e-06, + "loss": 0.1745, + "num_input_tokens_seen": 182455696, + "step": 149950 + }, + { + "epoch": 16.700634814567323, + "grad_norm": 0.00047048559645190835, + "learning_rate": 4.031991958619214e-06, + "loss": 0.0067, + "num_input_tokens_seen": 182461968, + "step": 149955 + }, + { + "epoch": 16.701191669450942, + "grad_norm": 0.012945204973220825, + "learning_rate": 4.030668913774324e-06, + "loss": 0.0007, + "num_input_tokens_seen": 182468272, + "step": 149960 + }, + { + "epoch": 16.70174852433456, + "grad_norm": 0.6314504146575928, + "learning_rate": 4.029346067001577e-06, + "loss": 0.0449, + "num_input_tokens_seen": 182474192, + "step": 149965 + }, + { + "epoch": 16.702305379218174, + "grad_norm": 0.4137194752693176, + "learning_rate": 4.028023418313451e-06, + "loss": 0.0623, + "num_input_tokens_seen": 182480368, + "step": 149970 + }, + { + "epoch": 16.702862234101794, + "grad_norm": 0.0065871975384652615, + "learning_rate": 4.026700967722452e-06, + "loss": 0.0016, + "num_input_tokens_seen": 182486768, + "step": 149975 + }, + { + "epoch": 16.70341908898541, + "grad_norm": 0.026678351685404778, + "learning_rate": 4.025378715241065e-06, + "loss": 0.0034, + "num_input_tokens_seen": 182493008, + "step": 149980 + }, + { + "epoch": 16.70397594386903, + "grad_norm": 0.005256936885416508, + "learning_rate": 4.024056660881781e-06, + "loss": 0.0247, + "num_input_tokens_seen": 182498992, + "step": 149985 + }, + { + "epoch": 16.704532798752645, + "grad_norm": 0.02426847629249096, + "learning_rate": 4.0227348046570825e-06, + "loss": 0.0481, + "num_input_tokens_seen": 182504944, + "step": 149990 + }, + { + "epoch": 16.70508965363626, + "grad_norm": 0.0005432447651401162, + "learning_rate": 4.021413146579467e-06, + "loss": 0.014, + "num_input_tokens_seen": 182511312, + "step": 149995 + }, + { + "epoch": 16.70564650851988, + "grad_norm": 0.04081199690699577, + "learning_rate": 4.0200916866614095e-06, + "loss": 0.0122, + "num_input_tokens_seen": 182517616, + "step": 150000 + }, + { + "epoch": 16.706203363403496, + "grad_norm": 0.09657975286245346, + "learning_rate": 4.018770424915397e-06, + "loss": 0.0442, + "num_input_tokens_seen": 182524112, + "step": 150005 + }, + { + "epoch": 16.706760218287116, + "grad_norm": 0.8455339074134827, + "learning_rate": 4.0174493613539e-06, + "loss": 0.0958, + "num_input_tokens_seen": 182530128, + "step": 150010 + }, + { + "epoch": 16.70731707317073, + "grad_norm": 0.012246214784681797, + "learning_rate": 4.016128495989413e-06, + "loss": 0.0913, + "num_input_tokens_seen": 182536336, + "step": 150015 + }, + { + "epoch": 16.707873928054347, + "grad_norm": 0.1344924420118332, + "learning_rate": 4.014807828834396e-06, + "loss": 0.1558, + "num_input_tokens_seen": 182542160, + "step": 150020 + }, + { + "epoch": 16.708430782937967, + "grad_norm": 0.06941758096218109, + "learning_rate": 4.0134873599013415e-06, + "loss": 0.0096, + "num_input_tokens_seen": 182548464, + "step": 150025 + }, + { + "epoch": 16.708987637821583, + "grad_norm": 2.513047933578491, + "learning_rate": 4.012167089202709e-06, + "loss": 0.073, + "num_input_tokens_seen": 182554736, + "step": 150030 + }, + { + "epoch": 16.709544492705202, + "grad_norm": 0.27685487270355225, + "learning_rate": 4.010847016750976e-06, + "loss": 0.0393, + "num_input_tokens_seen": 182561040, + "step": 150035 + }, + { + "epoch": 16.710101347588818, + "grad_norm": 0.01748201623558998, + "learning_rate": 4.009527142558603e-06, + "loss": 0.0006, + "num_input_tokens_seen": 182567344, + "step": 150040 + }, + { + "epoch": 16.710658202472437, + "grad_norm": 0.22047239542007446, + "learning_rate": 4.008207466638067e-06, + "loss": 0.006, + "num_input_tokens_seen": 182573680, + "step": 150045 + }, + { + "epoch": 16.711215057356053, + "grad_norm": 0.0333377905189991, + "learning_rate": 4.006887989001831e-06, + "loss": 0.0251, + "num_input_tokens_seen": 182579856, + "step": 150050 + }, + { + "epoch": 16.71177191223967, + "grad_norm": 0.2224433571100235, + "learning_rate": 4.00556870966236e-06, + "loss": 0.0995, + "num_input_tokens_seen": 182586064, + "step": 150055 + }, + { + "epoch": 16.71232876712329, + "grad_norm": 0.07514722645282745, + "learning_rate": 4.004249628632103e-06, + "loss": 0.0253, + "num_input_tokens_seen": 182592560, + "step": 150060 + }, + { + "epoch": 16.712885622006905, + "grad_norm": 0.167260080575943, + "learning_rate": 4.002930745923539e-06, + "loss": 0.0021, + "num_input_tokens_seen": 182598416, + "step": 150065 + }, + { + "epoch": 16.71344247689052, + "grad_norm": 0.3859797716140747, + "learning_rate": 4.001612061549109e-06, + "loss": 0.0437, + "num_input_tokens_seen": 182604464, + "step": 150070 + }, + { + "epoch": 16.71399933177414, + "grad_norm": 0.004145583137869835, + "learning_rate": 4.000293575521288e-06, + "loss": 0.0989, + "num_input_tokens_seen": 182610864, + "step": 150075 + }, + { + "epoch": 16.714556186657756, + "grad_norm": 0.0004807836376130581, + "learning_rate": 3.998975287852511e-06, + "loss": 0.0924, + "num_input_tokens_seen": 182617296, + "step": 150080 + }, + { + "epoch": 16.715113041541375, + "grad_norm": 0.695141077041626, + "learning_rate": 3.997657198555241e-06, + "loss": 0.0414, + "num_input_tokens_seen": 182623280, + "step": 150085 + }, + { + "epoch": 16.71566989642499, + "grad_norm": 0.5423758029937744, + "learning_rate": 3.996339307641919e-06, + "loss": 0.0792, + "num_input_tokens_seen": 182629264, + "step": 150090 + }, + { + "epoch": 16.71622675130861, + "grad_norm": 0.02893131412565708, + "learning_rate": 3.995021615125005e-06, + "loss": 0.0105, + "num_input_tokens_seen": 182635568, + "step": 150095 + }, + { + "epoch": 16.716783606192227, + "grad_norm": 0.00433710590004921, + "learning_rate": 3.9937041210169445e-06, + "loss": 0.1189, + "num_input_tokens_seen": 182641488, + "step": 150100 + }, + { + "epoch": 16.717340461075842, + "grad_norm": 3.291929006576538, + "learning_rate": 3.992386825330174e-06, + "loss": 0.1467, + "num_input_tokens_seen": 182647696, + "step": 150105 + }, + { + "epoch": 16.717897315959462, + "grad_norm": 0.029011987149715424, + "learning_rate": 3.99106972807714e-06, + "loss": 0.0017, + "num_input_tokens_seen": 182653744, + "step": 150110 + }, + { + "epoch": 16.718454170843078, + "grad_norm": 0.0014859894290566444, + "learning_rate": 3.9897528292702876e-06, + "loss": 0.0336, + "num_input_tokens_seen": 182660176, + "step": 150115 + }, + { + "epoch": 16.719011025726697, + "grad_norm": 0.036619286984205246, + "learning_rate": 3.988436128922052e-06, + "loss": 0.1129, + "num_input_tokens_seen": 182666512, + "step": 150120 + }, + { + "epoch": 16.719567880610313, + "grad_norm": 0.007158687338232994, + "learning_rate": 3.987119627044875e-06, + "loss": 0.0102, + "num_input_tokens_seen": 182672880, + "step": 150125 + }, + { + "epoch": 16.72012473549393, + "grad_norm": 0.5692189931869507, + "learning_rate": 3.98580332365118e-06, + "loss": 0.0674, + "num_input_tokens_seen": 182678992, + "step": 150130 + }, + { + "epoch": 16.72068159037755, + "grad_norm": 3.0250377655029297, + "learning_rate": 3.9844872187534135e-06, + "loss": 0.1079, + "num_input_tokens_seen": 182685360, + "step": 150135 + }, + { + "epoch": 16.721238445261164, + "grad_norm": 0.6794328689575195, + "learning_rate": 3.983171312363998e-06, + "loss": 0.0106, + "num_input_tokens_seen": 182691344, + "step": 150140 + }, + { + "epoch": 16.721795300144784, + "grad_norm": 0.8505963087081909, + "learning_rate": 3.98185560449538e-06, + "loss": 0.0647, + "num_input_tokens_seen": 182697296, + "step": 150145 + }, + { + "epoch": 16.7223521550284, + "grad_norm": 0.0021419087424874306, + "learning_rate": 3.980540095159963e-06, + "loss": 0.0187, + "num_input_tokens_seen": 182703280, + "step": 150150 + }, + { + "epoch": 16.722909009912016, + "grad_norm": 1.2013435363769531, + "learning_rate": 3.979224784370192e-06, + "loss": 0.0335, + "num_input_tokens_seen": 182709584, + "step": 150155 + }, + { + "epoch": 16.723465864795635, + "grad_norm": 0.017651550471782684, + "learning_rate": 3.977909672138483e-06, + "loss": 0.0089, + "num_input_tokens_seen": 182715888, + "step": 150160 + }, + { + "epoch": 16.72402271967925, + "grad_norm": 0.00014483000268228352, + "learning_rate": 3.976594758477253e-06, + "loss": 0.0038, + "num_input_tokens_seen": 182722000, + "step": 150165 + }, + { + "epoch": 16.72457957456287, + "grad_norm": 0.02260146103799343, + "learning_rate": 3.975280043398938e-06, + "loss": 0.0231, + "num_input_tokens_seen": 182728304, + "step": 150170 + }, + { + "epoch": 16.725136429446486, + "grad_norm": 0.019287770614027977, + "learning_rate": 3.973965526915946e-06, + "loss": 0.0392, + "num_input_tokens_seen": 182734320, + "step": 150175 + }, + { + "epoch": 16.725693284330102, + "grad_norm": 0.34154143929481506, + "learning_rate": 3.972651209040698e-06, + "loss": 0.101, + "num_input_tokens_seen": 182740144, + "step": 150180 + }, + { + "epoch": 16.72625013921372, + "grad_norm": 0.5149920582771301, + "learning_rate": 3.971337089785598e-06, + "loss": 0.0132, + "num_input_tokens_seen": 182746128, + "step": 150185 + }, + { + "epoch": 16.726806994097338, + "grad_norm": 1.5319949388504028, + "learning_rate": 3.970023169163073e-06, + "loss": 0.0433, + "num_input_tokens_seen": 182752304, + "step": 150190 + }, + { + "epoch": 16.727363848980957, + "grad_norm": 0.0001112075406126678, + "learning_rate": 3.968709447185529e-06, + "loss": 0.0057, + "num_input_tokens_seen": 182758672, + "step": 150195 + }, + { + "epoch": 16.727920703864573, + "grad_norm": 0.9902687668800354, + "learning_rate": 3.967395923865372e-06, + "loss": 0.0342, + "num_input_tokens_seen": 182765040, + "step": 150200 + }, + { + "epoch": 16.72847755874819, + "grad_norm": 0.24120976030826569, + "learning_rate": 3.966082599215007e-06, + "loss": 0.1648, + "num_input_tokens_seen": 182771120, + "step": 150205 + }, + { + "epoch": 16.72903441363181, + "grad_norm": 0.015193888917565346, + "learning_rate": 3.9647694732468485e-06, + "loss": 0.03, + "num_input_tokens_seen": 182777040, + "step": 150210 + }, + { + "epoch": 16.729591268515424, + "grad_norm": 0.0027893742080777884, + "learning_rate": 3.963456545973299e-06, + "loss": 0.009, + "num_input_tokens_seen": 182783152, + "step": 150215 + }, + { + "epoch": 16.730148123399044, + "grad_norm": 0.6431357860565186, + "learning_rate": 3.962143817406755e-06, + "loss": 0.034, + "num_input_tokens_seen": 182789424, + "step": 150220 + }, + { + "epoch": 16.73070497828266, + "grad_norm": 2.086444139480591, + "learning_rate": 3.96083128755961e-06, + "loss": 0.0541, + "num_input_tokens_seen": 182795760, + "step": 150225 + }, + { + "epoch": 16.731261833166275, + "grad_norm": 0.01026869285851717, + "learning_rate": 3.959518956444278e-06, + "loss": 0.0914, + "num_input_tokens_seen": 182801296, + "step": 150230 + }, + { + "epoch": 16.731818688049895, + "grad_norm": 0.01357769500464201, + "learning_rate": 3.95820682407314e-06, + "loss": 0.0953, + "num_input_tokens_seen": 182807280, + "step": 150235 + }, + { + "epoch": 16.73237554293351, + "grad_norm": 0.1018720343708992, + "learning_rate": 3.956894890458604e-06, + "loss": 0.0196, + "num_input_tokens_seen": 182813456, + "step": 150240 + }, + { + "epoch": 16.73293239781713, + "grad_norm": 0.030345171689987183, + "learning_rate": 3.955583155613052e-06, + "loss": 0.0029, + "num_input_tokens_seen": 182819728, + "step": 150245 + }, + { + "epoch": 16.733489252700746, + "grad_norm": 0.15793199837207794, + "learning_rate": 3.95427161954888e-06, + "loss": 0.0065, + "num_input_tokens_seen": 182826032, + "step": 150250 + }, + { + "epoch": 16.734046107584362, + "grad_norm": 0.23120373487472534, + "learning_rate": 3.95296028227847e-06, + "loss": 0.0068, + "num_input_tokens_seen": 182832464, + "step": 150255 + }, + { + "epoch": 16.73460296246798, + "grad_norm": 0.39772915840148926, + "learning_rate": 3.951649143814215e-06, + "loss": 0.0188, + "num_input_tokens_seen": 182838672, + "step": 150260 + }, + { + "epoch": 16.735159817351597, + "grad_norm": 0.019839348271489143, + "learning_rate": 3.9503382041685e-06, + "loss": 0.1009, + "num_input_tokens_seen": 182844912, + "step": 150265 + }, + { + "epoch": 16.735716672235217, + "grad_norm": 0.017818661406636238, + "learning_rate": 3.949027463353705e-06, + "loss": 0.0075, + "num_input_tokens_seen": 182850928, + "step": 150270 + }, + { + "epoch": 16.736273527118833, + "grad_norm": 0.000756981607992202, + "learning_rate": 3.947716921382205e-06, + "loss": 0.0136, + "num_input_tokens_seen": 182856976, + "step": 150275 + }, + { + "epoch": 16.73683038200245, + "grad_norm": 0.0014851647429168224, + "learning_rate": 3.94640657826639e-06, + "loss": 0.0037, + "num_input_tokens_seen": 182863536, + "step": 150280 + }, + { + "epoch": 16.737387236886068, + "grad_norm": 0.05467634275555611, + "learning_rate": 3.945096434018627e-06, + "loss": 0.0537, + "num_input_tokens_seen": 182869808, + "step": 150285 + }, + { + "epoch": 16.737944091769684, + "grad_norm": 0.38543710112571716, + "learning_rate": 3.94378648865131e-06, + "loss": 0.0118, + "num_input_tokens_seen": 182876176, + "step": 150290 + }, + { + "epoch": 16.738500946653303, + "grad_norm": 2.2757136821746826, + "learning_rate": 3.94247674217679e-06, + "loss": 0.0647, + "num_input_tokens_seen": 182882480, + "step": 150295 + }, + { + "epoch": 16.73905780153692, + "grad_norm": 0.449872225522995, + "learning_rate": 3.94116719460745e-06, + "loss": 0.0166, + "num_input_tokens_seen": 182888944, + "step": 150300 + }, + { + "epoch": 16.739614656420535, + "grad_norm": 1.1281394958496094, + "learning_rate": 3.939857845955655e-06, + "loss": 0.0241, + "num_input_tokens_seen": 182895024, + "step": 150305 + }, + { + "epoch": 16.740171511304155, + "grad_norm": 0.2535667419433594, + "learning_rate": 3.938548696233782e-06, + "loss": 0.0281, + "num_input_tokens_seen": 182901136, + "step": 150310 + }, + { + "epoch": 16.74072836618777, + "grad_norm": 0.16549056768417358, + "learning_rate": 3.937239745454188e-06, + "loss": 0.0672, + "num_input_tokens_seen": 182907664, + "step": 150315 + }, + { + "epoch": 16.74128522107139, + "grad_norm": 0.15768951177597046, + "learning_rate": 3.935930993629241e-06, + "loss": 0.0052, + "num_input_tokens_seen": 182913616, + "step": 150320 + }, + { + "epoch": 16.741842075955006, + "grad_norm": 0.3362935781478882, + "learning_rate": 3.934622440771296e-06, + "loss": 0.0615, + "num_input_tokens_seen": 182919088, + "step": 150325 + }, + { + "epoch": 16.742398930838622, + "grad_norm": 0.02185732126235962, + "learning_rate": 3.933314086892725e-06, + "loss": 0.0035, + "num_input_tokens_seen": 182925104, + "step": 150330 + }, + { + "epoch": 16.74295578572224, + "grad_norm": 0.009606041945517063, + "learning_rate": 3.932005932005883e-06, + "loss": 0.0043, + "num_input_tokens_seen": 182931536, + "step": 150335 + }, + { + "epoch": 16.743512640605857, + "grad_norm": 1.724044680595398, + "learning_rate": 3.930697976123121e-06, + "loss": 0.0503, + "num_input_tokens_seen": 182937616, + "step": 150340 + }, + { + "epoch": 16.744069495489477, + "grad_norm": 0.00020474247867241502, + "learning_rate": 3.929390219256793e-06, + "loss": 0.0171, + "num_input_tokens_seen": 182943952, + "step": 150345 + }, + { + "epoch": 16.744626350373093, + "grad_norm": 0.008525584824383259, + "learning_rate": 3.928082661419264e-06, + "loss": 0.002, + "num_input_tokens_seen": 182950352, + "step": 150350 + }, + { + "epoch": 16.74518320525671, + "grad_norm": 0.00036727797123603523, + "learning_rate": 3.926775302622868e-06, + "loss": 0.0107, + "num_input_tokens_seen": 182956592, + "step": 150355 + }, + { + "epoch": 16.745740060140328, + "grad_norm": 0.22221073508262634, + "learning_rate": 3.925468142879973e-06, + "loss": 0.0221, + "num_input_tokens_seen": 182962960, + "step": 150360 + }, + { + "epoch": 16.746296915023944, + "grad_norm": 0.01644127443432808, + "learning_rate": 3.924161182202906e-06, + "loss": 0.0049, + "num_input_tokens_seen": 182969104, + "step": 150365 + }, + { + "epoch": 16.746853769907563, + "grad_norm": 0.0005467559094540775, + "learning_rate": 3.92285442060403e-06, + "loss": 0.0962, + "num_input_tokens_seen": 182974800, + "step": 150370 + }, + { + "epoch": 16.74741062479118, + "grad_norm": 0.028487026691436768, + "learning_rate": 3.921547858095673e-06, + "loss": 0.124, + "num_input_tokens_seen": 182980976, + "step": 150375 + }, + { + "epoch": 16.747967479674795, + "grad_norm": 0.0019058826146647334, + "learning_rate": 3.920241494690191e-06, + "loss": 0.0095, + "num_input_tokens_seen": 182986960, + "step": 150380 + }, + { + "epoch": 16.748524334558414, + "grad_norm": 0.12046567350625992, + "learning_rate": 3.918935330399917e-06, + "loss": 0.0625, + "num_input_tokens_seen": 182993040, + "step": 150385 + }, + { + "epoch": 16.74908118944203, + "grad_norm": 0.0037054000422358513, + "learning_rate": 3.917629365237188e-06, + "loss": 0.0022, + "num_input_tokens_seen": 182998672, + "step": 150390 + }, + { + "epoch": 16.74963804432565, + "grad_norm": 0.6042073369026184, + "learning_rate": 3.916323599214333e-06, + "loss": 0.0557, + "num_input_tokens_seen": 183004688, + "step": 150395 + }, + { + "epoch": 16.750194899209266, + "grad_norm": 0.6155964136123657, + "learning_rate": 3.915018032343704e-06, + "loss": 0.1394, + "num_input_tokens_seen": 183010800, + "step": 150400 + }, + { + "epoch": 16.75075175409288, + "grad_norm": 0.07625414431095123, + "learning_rate": 3.913712664637617e-06, + "loss": 0.0062, + "num_input_tokens_seen": 183016528, + "step": 150405 + }, + { + "epoch": 16.7513086089765, + "grad_norm": 0.0003810207126662135, + "learning_rate": 3.912407496108411e-06, + "loss": 0.004, + "num_input_tokens_seen": 183022928, + "step": 150410 + }, + { + "epoch": 16.751865463860117, + "grad_norm": 0.13847674429416656, + "learning_rate": 3.911102526768407e-06, + "loss": 0.0018, + "num_input_tokens_seen": 183029040, + "step": 150415 + }, + { + "epoch": 16.752422318743736, + "grad_norm": 0.006220840848982334, + "learning_rate": 3.90979775662994e-06, + "loss": 0.0715, + "num_input_tokens_seen": 183034896, + "step": 150420 + }, + { + "epoch": 16.752979173627352, + "grad_norm": 0.013341319747269154, + "learning_rate": 3.908493185705323e-06, + "loss": 0.057, + "num_input_tokens_seen": 183040656, + "step": 150425 + }, + { + "epoch": 16.753536028510972, + "grad_norm": 0.11025539040565491, + "learning_rate": 3.907188814006893e-06, + "loss": 0.0595, + "num_input_tokens_seen": 183046576, + "step": 150430 + }, + { + "epoch": 16.754092883394588, + "grad_norm": 0.15363198518753052, + "learning_rate": 3.905884641546964e-06, + "loss": 0.0437, + "num_input_tokens_seen": 183052848, + "step": 150435 + }, + { + "epoch": 16.754649738278204, + "grad_norm": 0.25071704387664795, + "learning_rate": 3.9045806683378565e-06, + "loss": 0.0158, + "num_input_tokens_seen": 183058992, + "step": 150440 + }, + { + "epoch": 16.755206593161823, + "grad_norm": 8.543857984477654e-05, + "learning_rate": 3.9032768943918804e-06, + "loss": 0.015, + "num_input_tokens_seen": 183065200, + "step": 150445 + }, + { + "epoch": 16.75576344804544, + "grad_norm": 0.16413514316082, + "learning_rate": 3.901973319721358e-06, + "loss": 0.0798, + "num_input_tokens_seen": 183071632, + "step": 150450 + }, + { + "epoch": 16.75632030292906, + "grad_norm": 0.1323464959859848, + "learning_rate": 3.900669944338606e-06, + "loss": 0.0179, + "num_input_tokens_seen": 183077808, + "step": 150455 + }, + { + "epoch": 16.756877157812674, + "grad_norm": 0.0016655937070026994, + "learning_rate": 3.899366768255927e-06, + "loss": 0.0645, + "num_input_tokens_seen": 183084016, + "step": 150460 + }, + { + "epoch": 16.75743401269629, + "grad_norm": 0.0011838028440251946, + "learning_rate": 3.8980637914856316e-06, + "loss": 0.0512, + "num_input_tokens_seen": 183089904, + "step": 150465 + }, + { + "epoch": 16.75799086757991, + "grad_norm": 0.0036139176227152348, + "learning_rate": 3.896761014040035e-06, + "loss": 0.0377, + "num_input_tokens_seen": 183096080, + "step": 150470 + }, + { + "epoch": 16.758547722463526, + "grad_norm": 0.017594095319509506, + "learning_rate": 3.895458435931432e-06, + "loss": 0.0124, + "num_input_tokens_seen": 183102256, + "step": 150475 + }, + { + "epoch": 16.759104577347145, + "grad_norm": 0.043249428272247314, + "learning_rate": 3.8941560571721434e-06, + "loss": 0.0512, + "num_input_tokens_seen": 183108848, + "step": 150480 + }, + { + "epoch": 16.75966143223076, + "grad_norm": 3.449998617172241, + "learning_rate": 3.89285387777445e-06, + "loss": 0.1014, + "num_input_tokens_seen": 183115344, + "step": 150485 + }, + { + "epoch": 16.760218287114377, + "grad_norm": 1.661055326461792, + "learning_rate": 3.89155189775067e-06, + "loss": 0.1636, + "num_input_tokens_seen": 183121104, + "step": 150490 + }, + { + "epoch": 16.760775141997996, + "grad_norm": 0.32390427589416504, + "learning_rate": 3.890250117113084e-06, + "loss": 0.1082, + "num_input_tokens_seen": 183127376, + "step": 150495 + }, + { + "epoch": 16.761331996881612, + "grad_norm": 0.18075023591518402, + "learning_rate": 3.888948535874007e-06, + "loss": 0.0129, + "num_input_tokens_seen": 183133456, + "step": 150500 + }, + { + "epoch": 16.76188885176523, + "grad_norm": 0.0029345720540732145, + "learning_rate": 3.887647154045726e-06, + "loss": 0.0019, + "num_input_tokens_seen": 183139920, + "step": 150505 + }, + { + "epoch": 16.762445706648847, + "grad_norm": 0.003099943045526743, + "learning_rate": 3.886345971640532e-06, + "loss": 0.0044, + "num_input_tokens_seen": 183146128, + "step": 150510 + }, + { + "epoch": 16.763002561532463, + "grad_norm": 0.02051161788403988, + "learning_rate": 3.8850449886707105e-06, + "loss": 0.002, + "num_input_tokens_seen": 183152176, + "step": 150515 + }, + { + "epoch": 16.763559416416083, + "grad_norm": 0.048729922622442245, + "learning_rate": 3.883744205148559e-06, + "loss": 0.0039, + "num_input_tokens_seen": 183158096, + "step": 150520 + }, + { + "epoch": 16.7641162712997, + "grad_norm": 0.091005340218544, + "learning_rate": 3.882443621086365e-06, + "loss": 0.0017, + "num_input_tokens_seen": 183164496, + "step": 150525 + }, + { + "epoch": 16.764673126183318, + "grad_norm": 0.5181132555007935, + "learning_rate": 3.881143236496409e-06, + "loss": 0.0328, + "num_input_tokens_seen": 183170544, + "step": 150530 + }, + { + "epoch": 16.765229981066934, + "grad_norm": 0.15470369160175323, + "learning_rate": 3.879843051390969e-06, + "loss": 0.0764, + "num_input_tokens_seen": 183176880, + "step": 150535 + }, + { + "epoch": 16.76578683595055, + "grad_norm": 0.9523341655731201, + "learning_rate": 3.878543065782339e-06, + "loss": 0.0471, + "num_input_tokens_seen": 183183248, + "step": 150540 + }, + { + "epoch": 16.76634369083417, + "grad_norm": 1.6999292373657227, + "learning_rate": 3.877243279682788e-06, + "loss": 0.0511, + "num_input_tokens_seen": 183189424, + "step": 150545 + }, + { + "epoch": 16.766900545717785, + "grad_norm": 1.1610333919525146, + "learning_rate": 3.875943693104606e-06, + "loss": 0.0947, + "num_input_tokens_seen": 183195600, + "step": 150550 + }, + { + "epoch": 16.767457400601405, + "grad_norm": 1.1418471336364746, + "learning_rate": 3.874644306060049e-06, + "loss": 0.0557, + "num_input_tokens_seen": 183201744, + "step": 150555 + }, + { + "epoch": 16.76801425548502, + "grad_norm": 0.19241216778755188, + "learning_rate": 3.873345118561409e-06, + "loss": 0.0316, + "num_input_tokens_seen": 183207600, + "step": 150560 + }, + { + "epoch": 16.768571110368637, + "grad_norm": 0.0008561376016587019, + "learning_rate": 3.872046130620951e-06, + "loss": 0.0331, + "num_input_tokens_seen": 183213392, + "step": 150565 + }, + { + "epoch": 16.769127965252256, + "grad_norm": 1.0393227338790894, + "learning_rate": 3.870747342250939e-06, + "loss": 0.045, + "num_input_tokens_seen": 183219664, + "step": 150570 + }, + { + "epoch": 16.769684820135872, + "grad_norm": 0.3401104807853699, + "learning_rate": 3.8694487534636505e-06, + "loss": 0.0073, + "num_input_tokens_seen": 183225904, + "step": 150575 + }, + { + "epoch": 16.77024167501949, + "grad_norm": 1.0091534852981567, + "learning_rate": 3.86815036427135e-06, + "loss": 0.0584, + "num_input_tokens_seen": 183231952, + "step": 150580 + }, + { + "epoch": 16.770798529903107, + "grad_norm": 0.0002238065208075568, + "learning_rate": 3.866852174686297e-06, + "loss": 0.0085, + "num_input_tokens_seen": 183238192, + "step": 150585 + }, + { + "epoch": 16.771355384786723, + "grad_norm": 0.08875355124473572, + "learning_rate": 3.8655541847207544e-06, + "loss": 0.0075, + "num_input_tokens_seen": 183244048, + "step": 150590 + }, + { + "epoch": 16.771912239670343, + "grad_norm": 0.1953606903553009, + "learning_rate": 3.8642563943869895e-06, + "loss": 0.0091, + "num_input_tokens_seen": 183250032, + "step": 150595 + }, + { + "epoch": 16.77246909455396, + "grad_norm": 0.555296778678894, + "learning_rate": 3.862958803697256e-06, + "loss": 0.0208, + "num_input_tokens_seen": 183256016, + "step": 150600 + }, + { + "epoch": 16.773025949437578, + "grad_norm": 0.06578592211008072, + "learning_rate": 3.861661412663814e-06, + "loss": 0.0011, + "num_input_tokens_seen": 183262192, + "step": 150605 + }, + { + "epoch": 16.773582804321194, + "grad_norm": 1.7190340757369995, + "learning_rate": 3.860364221298907e-06, + "loss": 0.1576, + "num_input_tokens_seen": 183268528, + "step": 150610 + }, + { + "epoch": 16.77413965920481, + "grad_norm": 0.5156537294387817, + "learning_rate": 3.859067229614804e-06, + "loss": 0.0571, + "num_input_tokens_seen": 183274000, + "step": 150615 + }, + { + "epoch": 16.77469651408843, + "grad_norm": 0.14904440939426422, + "learning_rate": 3.857770437623742e-06, + "loss": 0.0022, + "num_input_tokens_seen": 183280272, + "step": 150620 + }, + { + "epoch": 16.775253368972045, + "grad_norm": 0.017440468072891235, + "learning_rate": 3.856473845337991e-06, + "loss": 0.0455, + "num_input_tokens_seen": 183286352, + "step": 150625 + }, + { + "epoch": 16.775810223855665, + "grad_norm": 0.0003608917468227446, + "learning_rate": 3.855177452769771e-06, + "loss": 0.008, + "num_input_tokens_seen": 183292336, + "step": 150630 + }, + { + "epoch": 16.77636707873928, + "grad_norm": 0.00013378815492615104, + "learning_rate": 3.853881259931344e-06, + "loss": 0.123, + "num_input_tokens_seen": 183298704, + "step": 150635 + }, + { + "epoch": 16.776923933622896, + "grad_norm": 0.05211276188492775, + "learning_rate": 3.852585266834949e-06, + "loss": 0.1084, + "num_input_tokens_seen": 183304816, + "step": 150640 + }, + { + "epoch": 16.777480788506516, + "grad_norm": 0.2975642681121826, + "learning_rate": 3.8512894734928335e-06, + "loss": 0.013, + "num_input_tokens_seen": 183311056, + "step": 150645 + }, + { + "epoch": 16.77803764339013, + "grad_norm": 0.38594651222229004, + "learning_rate": 3.849993879917232e-06, + "loss": 0.0133, + "num_input_tokens_seen": 183317200, + "step": 150650 + }, + { + "epoch": 16.77859449827375, + "grad_norm": 0.6385603547096252, + "learning_rate": 3.848698486120386e-06, + "loss": 0.0084, + "num_input_tokens_seen": 183323408, + "step": 150655 + }, + { + "epoch": 16.779151353157367, + "grad_norm": 1.8410656452178955, + "learning_rate": 3.847403292114521e-06, + "loss": 0.0502, + "num_input_tokens_seen": 183329840, + "step": 150660 + }, + { + "epoch": 16.779708208040983, + "grad_norm": 1.392316460609436, + "learning_rate": 3.8461082979118854e-06, + "loss": 0.0315, + "num_input_tokens_seen": 183336048, + "step": 150665 + }, + { + "epoch": 16.780265062924602, + "grad_norm": 0.5725911855697632, + "learning_rate": 3.844813503524705e-06, + "loss": 0.0342, + "num_input_tokens_seen": 183342160, + "step": 150670 + }, + { + "epoch": 16.78082191780822, + "grad_norm": 0.2887250781059265, + "learning_rate": 3.84351890896521e-06, + "loss": 0.0051, + "num_input_tokens_seen": 183348336, + "step": 150675 + }, + { + "epoch": 16.781378772691838, + "grad_norm": 0.0019207585137337446, + "learning_rate": 3.8422245142456235e-06, + "loss": 0.0399, + "num_input_tokens_seen": 183354800, + "step": 150680 + }, + { + "epoch": 16.781935627575454, + "grad_norm": 0.22035977244377136, + "learning_rate": 3.840930319378183e-06, + "loss": 0.0056, + "num_input_tokens_seen": 183360816, + "step": 150685 + }, + { + "epoch": 16.78249248245907, + "grad_norm": 1.5289921760559082, + "learning_rate": 3.839636324375104e-06, + "loss": 0.082, + "num_input_tokens_seen": 183366160, + "step": 150690 + }, + { + "epoch": 16.78304933734269, + "grad_norm": 1.6733680963516235, + "learning_rate": 3.838342529248626e-06, + "loss": 0.1029, + "num_input_tokens_seen": 183371696, + "step": 150695 + }, + { + "epoch": 16.783606192226305, + "grad_norm": 0.010082073509693146, + "learning_rate": 3.8370489340109425e-06, + "loss": 0.0244, + "num_input_tokens_seen": 183377584, + "step": 150700 + }, + { + "epoch": 16.784163047109924, + "grad_norm": 0.9241904616355896, + "learning_rate": 3.835755538674293e-06, + "loss": 0.0194, + "num_input_tokens_seen": 183383504, + "step": 150705 + }, + { + "epoch": 16.78471990199354, + "grad_norm": 0.0023816260509192944, + "learning_rate": 3.834462343250886e-06, + "loss": 0.0026, + "num_input_tokens_seen": 183389712, + "step": 150710 + }, + { + "epoch": 16.785276756877156, + "grad_norm": 0.04266171529889107, + "learning_rate": 3.8331693477529435e-06, + "loss": 0.0471, + "num_input_tokens_seen": 183396240, + "step": 150715 + }, + { + "epoch": 16.785833611760776, + "grad_norm": 1.2791764736175537, + "learning_rate": 3.831876552192676e-06, + "loss": 0.093, + "num_input_tokens_seen": 183401968, + "step": 150720 + }, + { + "epoch": 16.78639046664439, + "grad_norm": 0.0012677593622356653, + "learning_rate": 3.830583956582293e-06, + "loss": 0.08, + "num_input_tokens_seen": 183407760, + "step": 150725 + }, + { + "epoch": 16.78694732152801, + "grad_norm": 0.0011015881318598986, + "learning_rate": 3.829291560934001e-06, + "loss": 0.0249, + "num_input_tokens_seen": 183413776, + "step": 150730 + }, + { + "epoch": 16.787504176411627, + "grad_norm": 1.0829535722732544, + "learning_rate": 3.827999365260015e-06, + "loss": 0.0127, + "num_input_tokens_seen": 183419760, + "step": 150735 + }, + { + "epoch": 16.788061031295243, + "grad_norm": 0.23344029486179352, + "learning_rate": 3.826707369572541e-06, + "loss": 0.0108, + "num_input_tokens_seen": 183426064, + "step": 150740 + }, + { + "epoch": 16.788617886178862, + "grad_norm": 1.5360628366470337, + "learning_rate": 3.825415573883778e-06, + "loss": 0.0809, + "num_input_tokens_seen": 183432400, + "step": 150745 + }, + { + "epoch": 16.789174741062478, + "grad_norm": 0.004983730614185333, + "learning_rate": 3.824123978205924e-06, + "loss": 0.0427, + "num_input_tokens_seen": 183438576, + "step": 150750 + }, + { + "epoch": 16.789731595946098, + "grad_norm": 0.42241787910461426, + "learning_rate": 3.822832582551189e-06, + "loss": 0.0481, + "num_input_tokens_seen": 183444464, + "step": 150755 + }, + { + "epoch": 16.790288450829713, + "grad_norm": 0.40230634808540344, + "learning_rate": 3.821541386931765e-06, + "loss": 0.0089, + "num_input_tokens_seen": 183450832, + "step": 150760 + }, + { + "epoch": 16.790845305713333, + "grad_norm": 1.4992647171020508, + "learning_rate": 3.820250391359858e-06, + "loss": 0.2121, + "num_input_tokens_seen": 183456688, + "step": 150765 + }, + { + "epoch": 16.79140216059695, + "grad_norm": 1.0435857772827148, + "learning_rate": 3.818959595847646e-06, + "loss": 0.0971, + "num_input_tokens_seen": 183462672, + "step": 150770 + }, + { + "epoch": 16.791959015480565, + "grad_norm": 0.268436998128891, + "learning_rate": 3.8176690004073365e-06, + "loss": 0.0085, + "num_input_tokens_seen": 183468912, + "step": 150775 + }, + { + "epoch": 16.792515870364184, + "grad_norm": 0.00025207584258168936, + "learning_rate": 3.816378605051107e-06, + "loss": 0.001, + "num_input_tokens_seen": 183475216, + "step": 150780 + }, + { + "epoch": 16.7930727252478, + "grad_norm": 2.6476845741271973, + "learning_rate": 3.815088409791162e-06, + "loss": 0.0599, + "num_input_tokens_seen": 183481360, + "step": 150785 + }, + { + "epoch": 16.793629580131416, + "grad_norm": 0.30865776538848877, + "learning_rate": 3.813798414639677e-06, + "loss": 0.0048, + "num_input_tokens_seen": 183487312, + "step": 150790 + }, + { + "epoch": 16.794186435015035, + "grad_norm": 0.002195156179368496, + "learning_rate": 3.8125086196088426e-06, + "loss": 0.0006, + "num_input_tokens_seen": 183493456, + "step": 150795 + }, + { + "epoch": 16.79474328989865, + "grad_norm": 0.21483774483203888, + "learning_rate": 3.8112190247108326e-06, + "loss": 0.0087, + "num_input_tokens_seen": 183499824, + "step": 150800 + }, + { + "epoch": 16.79530014478227, + "grad_norm": 0.4731510281562805, + "learning_rate": 3.8099296299578396e-06, + "loss": 0.0624, + "num_input_tokens_seen": 183505520, + "step": 150805 + }, + { + "epoch": 16.795856999665887, + "grad_norm": 0.059209320694208145, + "learning_rate": 3.80864043536204e-06, + "loss": 0.0549, + "num_input_tokens_seen": 183511440, + "step": 150810 + }, + { + "epoch": 16.796413854549506, + "grad_norm": 0.03863459452986717, + "learning_rate": 3.8073514409356082e-06, + "loss": 0.0724, + "num_input_tokens_seen": 183518000, + "step": 150815 + }, + { + "epoch": 16.796970709433122, + "grad_norm": 0.15912136435508728, + "learning_rate": 3.806062646690717e-06, + "loss": 0.0938, + "num_input_tokens_seen": 183524080, + "step": 150820 + }, + { + "epoch": 16.797527564316738, + "grad_norm": 2.028301239013672, + "learning_rate": 3.8047740526395483e-06, + "loss": 0.0131, + "num_input_tokens_seen": 183530320, + "step": 150825 + }, + { + "epoch": 16.798084419200357, + "grad_norm": 0.3316415250301361, + "learning_rate": 3.8034856587942674e-06, + "loss": 0.1002, + "num_input_tokens_seen": 183536112, + "step": 150830 + }, + { + "epoch": 16.798641274083973, + "grad_norm": 0.008362330496311188, + "learning_rate": 3.802197465167051e-06, + "loss": 0.0969, + "num_input_tokens_seen": 183542064, + "step": 150835 + }, + { + "epoch": 16.799198128967593, + "grad_norm": 0.6097829341888428, + "learning_rate": 3.8009094717700614e-06, + "loss": 0.151, + "num_input_tokens_seen": 183548144, + "step": 150840 + }, + { + "epoch": 16.79975498385121, + "grad_norm": 0.1983129233121872, + "learning_rate": 3.799621678615467e-06, + "loss": 0.0287, + "num_input_tokens_seen": 183553520, + "step": 150845 + }, + { + "epoch": 16.800311838734824, + "grad_norm": 1.747872233390808, + "learning_rate": 3.798334085715427e-06, + "loss": 0.0461, + "num_input_tokens_seen": 183559760, + "step": 150850 + }, + { + "epoch": 16.800868693618444, + "grad_norm": 0.21420495212078094, + "learning_rate": 3.7970466930821123e-06, + "loss": 0.0549, + "num_input_tokens_seen": 183565936, + "step": 150855 + }, + { + "epoch": 16.80142554850206, + "grad_norm": 2.372068166732788, + "learning_rate": 3.7957595007276803e-06, + "loss": 0.1233, + "num_input_tokens_seen": 183572016, + "step": 150860 + }, + { + "epoch": 16.80198240338568, + "grad_norm": 0.0003083501069340855, + "learning_rate": 3.7944725086642873e-06, + "loss": 0.0627, + "num_input_tokens_seen": 183577936, + "step": 150865 + }, + { + "epoch": 16.802539258269295, + "grad_norm": 0.023082442581653595, + "learning_rate": 3.793185716904088e-06, + "loss": 0.0198, + "num_input_tokens_seen": 183583824, + "step": 150870 + }, + { + "epoch": 16.80309611315291, + "grad_norm": 0.00944427028298378, + "learning_rate": 3.791899125459242e-06, + "loss": 0.0316, + "num_input_tokens_seen": 183590352, + "step": 150875 + }, + { + "epoch": 16.80365296803653, + "grad_norm": 0.09145462512969971, + "learning_rate": 3.7906127343418972e-06, + "loss": 0.0064, + "num_input_tokens_seen": 183596368, + "step": 150880 + }, + { + "epoch": 16.804209822920146, + "grad_norm": 0.004359388258308172, + "learning_rate": 3.78932654356422e-06, + "loss": 0.0129, + "num_input_tokens_seen": 183602544, + "step": 150885 + }, + { + "epoch": 16.804766677803766, + "grad_norm": 0.12416402995586395, + "learning_rate": 3.788040553138333e-06, + "loss": 0.0497, + "num_input_tokens_seen": 183608688, + "step": 150890 + }, + { + "epoch": 16.80532353268738, + "grad_norm": 0.6714984178543091, + "learning_rate": 3.7867547630764056e-06, + "loss": 0.0585, + "num_input_tokens_seen": 183614160, + "step": 150895 + }, + { + "epoch": 16.805880387570998, + "grad_norm": 0.1403634250164032, + "learning_rate": 3.7854691733905685e-06, + "loss": 0.0295, + "num_input_tokens_seen": 183619952, + "step": 150900 + }, + { + "epoch": 16.806437242454617, + "grad_norm": 0.6387881636619568, + "learning_rate": 3.784183784092976e-06, + "loss": 0.0554, + "num_input_tokens_seen": 183625840, + "step": 150905 + }, + { + "epoch": 16.806994097338233, + "grad_norm": 0.009483441710472107, + "learning_rate": 3.782898595195769e-06, + "loss": 0.0122, + "num_input_tokens_seen": 183631696, + "step": 150910 + }, + { + "epoch": 16.807550952221852, + "grad_norm": 0.03326025605201721, + "learning_rate": 3.781613606711082e-06, + "loss": 0.0169, + "num_input_tokens_seen": 183637520, + "step": 150915 + }, + { + "epoch": 16.80810780710547, + "grad_norm": 0.03201696649193764, + "learning_rate": 3.780328818651049e-06, + "loss": 0.0084, + "num_input_tokens_seen": 183643568, + "step": 150920 + }, + { + "epoch": 16.808664661989084, + "grad_norm": 0.052348941564559937, + "learning_rate": 3.7790442310278146e-06, + "loss": 0.0482, + "num_input_tokens_seen": 183649232, + "step": 150925 + }, + { + "epoch": 16.809221516872704, + "grad_norm": 0.8228790163993835, + "learning_rate": 3.777759843853512e-06, + "loss": 0.0249, + "num_input_tokens_seen": 183655408, + "step": 150930 + }, + { + "epoch": 16.80977837175632, + "grad_norm": 0.010110700502991676, + "learning_rate": 3.7764756571402715e-06, + "loss": 0.0156, + "num_input_tokens_seen": 183661680, + "step": 150935 + }, + { + "epoch": 16.81033522663994, + "grad_norm": 0.006507131736725569, + "learning_rate": 3.7751916709002134e-06, + "loss": 0.0044, + "num_input_tokens_seen": 183667824, + "step": 150940 + }, + { + "epoch": 16.810892081523555, + "grad_norm": 0.20342712104320526, + "learning_rate": 3.7739078851454835e-06, + "loss": 0.0122, + "num_input_tokens_seen": 183674256, + "step": 150945 + }, + { + "epoch": 16.81144893640717, + "grad_norm": 0.0020781303755939007, + "learning_rate": 3.772624299888192e-06, + "loss": 0.0109, + "num_input_tokens_seen": 183680720, + "step": 150950 + }, + { + "epoch": 16.81200579129079, + "grad_norm": 1.3642432689666748, + "learning_rate": 3.771340915140484e-06, + "loss": 0.0598, + "num_input_tokens_seen": 183686864, + "step": 150955 + }, + { + "epoch": 16.812562646174406, + "grad_norm": 0.32983049750328064, + "learning_rate": 3.7700577309144588e-06, + "loss": 0.0459, + "num_input_tokens_seen": 183692688, + "step": 150960 + }, + { + "epoch": 16.813119501058026, + "grad_norm": 0.355535626411438, + "learning_rate": 3.7687747472222507e-06, + "loss": 0.0411, + "num_input_tokens_seen": 183698864, + "step": 150965 + }, + { + "epoch": 16.81367635594164, + "grad_norm": 0.14055392146110535, + "learning_rate": 3.76749196407597e-06, + "loss": 0.0029, + "num_input_tokens_seen": 183704720, + "step": 150970 + }, + { + "epoch": 16.814233210825257, + "grad_norm": 0.7113288044929504, + "learning_rate": 3.7662093814877454e-06, + "loss": 0.0311, + "num_input_tokens_seen": 183710768, + "step": 150975 + }, + { + "epoch": 16.814790065708877, + "grad_norm": 0.021334387362003326, + "learning_rate": 3.764926999469684e-06, + "loss": 0.0437, + "num_input_tokens_seen": 183716592, + "step": 150980 + }, + { + "epoch": 16.815346920592493, + "grad_norm": 0.32947981357574463, + "learning_rate": 3.7636448180339012e-06, + "loss": 0.0463, + "num_input_tokens_seen": 183722256, + "step": 150985 + }, + { + "epoch": 16.815903775476112, + "grad_norm": 0.19113244116306305, + "learning_rate": 3.7623628371925098e-06, + "loss": 0.1161, + "num_input_tokens_seen": 183728368, + "step": 150990 + }, + { + "epoch": 16.816460630359728, + "grad_norm": 0.03967214748263359, + "learning_rate": 3.7610810569576078e-06, + "loss": 0.0193, + "num_input_tokens_seen": 183734608, + "step": 150995 + }, + { + "epoch": 16.817017485243344, + "grad_norm": 0.1346108764410019, + "learning_rate": 3.7597994773413192e-06, + "loss": 0.0044, + "num_input_tokens_seen": 183740816, + "step": 151000 + }, + { + "epoch": 16.817574340126964, + "grad_norm": 0.00044001126661896706, + "learning_rate": 3.75851809835574e-06, + "loss": 0.0043, + "num_input_tokens_seen": 183746896, + "step": 151005 + }, + { + "epoch": 16.81813119501058, + "grad_norm": 0.9161257743835449, + "learning_rate": 3.757236920012977e-06, + "loss": 0.0232, + "num_input_tokens_seen": 183752496, + "step": 151010 + }, + { + "epoch": 16.8186880498942, + "grad_norm": 0.0031979521736502647, + "learning_rate": 3.755955942325126e-06, + "loss": 0.0065, + "num_input_tokens_seen": 183758704, + "step": 151015 + }, + { + "epoch": 16.819244904777815, + "grad_norm": 0.0026768625248223543, + "learning_rate": 3.7546751653042943e-06, + "loss": 0.0037, + "num_input_tokens_seen": 183764528, + "step": 151020 + }, + { + "epoch": 16.81980175966143, + "grad_norm": 0.014943509362637997, + "learning_rate": 3.753394588962575e-06, + "loss": 0.0647, + "num_input_tokens_seen": 183770256, + "step": 151025 + }, + { + "epoch": 16.82035861454505, + "grad_norm": 1.1193245649337769, + "learning_rate": 3.7521142133120747e-06, + "loss": 0.0563, + "num_input_tokens_seen": 183775760, + "step": 151030 + }, + { + "epoch": 16.820915469428666, + "grad_norm": 0.00332200457341969, + "learning_rate": 3.7508340383648698e-06, + "loss": 0.0275, + "num_input_tokens_seen": 183782160, + "step": 151035 + }, + { + "epoch": 16.821472324312285, + "grad_norm": 0.7474259734153748, + "learning_rate": 3.749554064133065e-06, + "loss": 0.0255, + "num_input_tokens_seen": 183788016, + "step": 151040 + }, + { + "epoch": 16.8220291791959, + "grad_norm": 0.022011198103427887, + "learning_rate": 3.748274290628745e-06, + "loss": 0.0574, + "num_input_tokens_seen": 183793968, + "step": 151045 + }, + { + "epoch": 16.822586034079517, + "grad_norm": 0.04802241921424866, + "learning_rate": 3.7469947178640055e-06, + "loss": 0.0524, + "num_input_tokens_seen": 183799888, + "step": 151050 + }, + { + "epoch": 16.823142888963137, + "grad_norm": 0.2505238950252533, + "learning_rate": 3.745715345850928e-06, + "loss": 0.0806, + "num_input_tokens_seen": 183805680, + "step": 151055 + }, + { + "epoch": 16.823699743846753, + "grad_norm": 0.07191110402345657, + "learning_rate": 3.744436174601598e-06, + "loss": 0.0695, + "num_input_tokens_seen": 183811376, + "step": 151060 + }, + { + "epoch": 16.824256598730372, + "grad_norm": 0.8332903981208801, + "learning_rate": 3.7431572041280923e-06, + "loss": 0.0667, + "num_input_tokens_seen": 183817296, + "step": 151065 + }, + { + "epoch": 16.824813453613988, + "grad_norm": 0.0006797648966312408, + "learning_rate": 3.7418784344425027e-06, + "loss": 0.0009, + "num_input_tokens_seen": 183823664, + "step": 151070 + }, + { + "epoch": 16.825370308497604, + "grad_norm": 0.0011636598501354456, + "learning_rate": 3.7405998655569043e-06, + "loss": 0.0589, + "num_input_tokens_seen": 183830096, + "step": 151075 + }, + { + "epoch": 16.825927163381223, + "grad_norm": 0.003255307674407959, + "learning_rate": 3.7393214974833724e-06, + "loss": 0.0581, + "num_input_tokens_seen": 183836016, + "step": 151080 + }, + { + "epoch": 16.82648401826484, + "grad_norm": 0.005477118771523237, + "learning_rate": 3.738043330233976e-06, + "loss": 0.0094, + "num_input_tokens_seen": 183842288, + "step": 151085 + }, + { + "epoch": 16.82704087314846, + "grad_norm": 0.44199520349502563, + "learning_rate": 3.736765363820802e-06, + "loss": 0.044, + "num_input_tokens_seen": 183848016, + "step": 151090 + }, + { + "epoch": 16.827597728032075, + "grad_norm": 0.0007695461390540004, + "learning_rate": 3.73548759825591e-06, + "loss": 0.0395, + "num_input_tokens_seen": 183854352, + "step": 151095 + }, + { + "epoch": 16.828154582915694, + "grad_norm": 0.008698204532265663, + "learning_rate": 3.734210033551383e-06, + "loss": 0.0002, + "num_input_tokens_seen": 183860720, + "step": 151100 + }, + { + "epoch": 16.82871143779931, + "grad_norm": 0.05755079165101051, + "learning_rate": 3.7329326697192717e-06, + "loss": 0.0102, + "num_input_tokens_seen": 183866736, + "step": 151105 + }, + { + "epoch": 16.829268292682926, + "grad_norm": 0.8490505218505859, + "learning_rate": 3.731655506771656e-06, + "loss": 0.0279, + "num_input_tokens_seen": 183872912, + "step": 151110 + }, + { + "epoch": 16.829825147566545, + "grad_norm": 0.6365557312965393, + "learning_rate": 3.730378544720586e-06, + "loss": 0.0889, + "num_input_tokens_seen": 183878960, + "step": 151115 + }, + { + "epoch": 16.83038200245016, + "grad_norm": 2.2677085399627686, + "learning_rate": 3.729101783578137e-06, + "loss": 0.1443, + "num_input_tokens_seen": 183884848, + "step": 151120 + }, + { + "epoch": 16.830938857333777, + "grad_norm": 0.029255904257297516, + "learning_rate": 3.7278252233563652e-06, + "loss": 0.0355, + "num_input_tokens_seen": 183890928, + "step": 151125 + }, + { + "epoch": 16.831495712217396, + "grad_norm": 0.00046405100147239864, + "learning_rate": 3.726548864067328e-06, + "loss": 0.1308, + "num_input_tokens_seen": 183897104, + "step": 151130 + }, + { + "epoch": 16.832052567101012, + "grad_norm": 0.31405094265937805, + "learning_rate": 3.7252727057230736e-06, + "loss": 0.0208, + "num_input_tokens_seen": 183903248, + "step": 151135 + }, + { + "epoch": 16.832609421984632, + "grad_norm": 0.0012832105858251452, + "learning_rate": 3.72399674833567e-06, + "loss": 0.0058, + "num_input_tokens_seen": 183909200, + "step": 151140 + }, + { + "epoch": 16.833166276868248, + "grad_norm": 0.00905117578804493, + "learning_rate": 3.7227209919171613e-06, + "loss": 0.0089, + "num_input_tokens_seen": 183915120, + "step": 151145 + }, + { + "epoch": 16.833723131751867, + "grad_norm": 0.0017664325423538685, + "learning_rate": 3.7214454364796037e-06, + "loss": 0.0103, + "num_input_tokens_seen": 183921296, + "step": 151150 + }, + { + "epoch": 16.834279986635483, + "grad_norm": 0.04220893606543541, + "learning_rate": 3.720170082035032e-06, + "loss": 0.0096, + "num_input_tokens_seen": 183927408, + "step": 151155 + }, + { + "epoch": 16.8348368415191, + "grad_norm": 0.0803665816783905, + "learning_rate": 3.7188949285955123e-06, + "loss": 0.0103, + "num_input_tokens_seen": 183933488, + "step": 151160 + }, + { + "epoch": 16.83539369640272, + "grad_norm": 0.032221075147390366, + "learning_rate": 3.717619976173073e-06, + "loss": 0.03, + "num_input_tokens_seen": 183939760, + "step": 151165 + }, + { + "epoch": 16.835950551286334, + "grad_norm": 0.040125854313373566, + "learning_rate": 3.716345224779769e-06, + "loss": 0.0236, + "num_input_tokens_seen": 183945552, + "step": 151170 + }, + { + "epoch": 16.836507406169954, + "grad_norm": 0.001913718064315617, + "learning_rate": 3.715070674427637e-06, + "loss": 0.0591, + "num_input_tokens_seen": 183952112, + "step": 151175 + }, + { + "epoch": 16.83706426105357, + "grad_norm": 0.7716726064682007, + "learning_rate": 3.7137963251287156e-06, + "loss": 0.078, + "num_input_tokens_seen": 183958032, + "step": 151180 + }, + { + "epoch": 16.837621115937186, + "grad_norm": 0.023715157061815262, + "learning_rate": 3.7125221768950364e-06, + "loss": 0.0023, + "num_input_tokens_seen": 183964208, + "step": 151185 + }, + { + "epoch": 16.838177970820805, + "grad_norm": 2.755751371383667, + "learning_rate": 3.711248229738648e-06, + "loss": 0.1103, + "num_input_tokens_seen": 183970480, + "step": 151190 + }, + { + "epoch": 16.83873482570442, + "grad_norm": 0.014316887594759464, + "learning_rate": 3.7099744836715742e-06, + "loss": 0.0016, + "num_input_tokens_seen": 183976784, + "step": 151195 + }, + { + "epoch": 16.83929168058804, + "grad_norm": 1.187981367111206, + "learning_rate": 3.7087009387058473e-06, + "loss": 0.0516, + "num_input_tokens_seen": 183983088, + "step": 151200 + }, + { + "epoch": 16.839848535471656, + "grad_norm": 0.06148815527558327, + "learning_rate": 3.7074275948534965e-06, + "loss": 0.0137, + "num_input_tokens_seen": 183989680, + "step": 151205 + }, + { + "epoch": 16.840405390355272, + "grad_norm": 0.00015833495126571506, + "learning_rate": 3.706154452126556e-06, + "loss": 0.1508, + "num_input_tokens_seen": 183995696, + "step": 151210 + }, + { + "epoch": 16.84096224523889, + "grad_norm": 0.6512703895568848, + "learning_rate": 3.704881510537045e-06, + "loss": 0.047, + "num_input_tokens_seen": 184001424, + "step": 151215 + }, + { + "epoch": 16.841519100122508, + "grad_norm": 0.000685687642544508, + "learning_rate": 3.703608770096992e-06, + "loss": 0.037, + "num_input_tokens_seen": 184007632, + "step": 151220 + }, + { + "epoch": 16.842075955006127, + "grad_norm": 0.37362444400787354, + "learning_rate": 3.7023362308184096e-06, + "loss": 0.0303, + "num_input_tokens_seen": 184013712, + "step": 151225 + }, + { + "epoch": 16.842632809889743, + "grad_norm": 0.008314759470522404, + "learning_rate": 3.70106389271333e-06, + "loss": 0.0238, + "num_input_tokens_seen": 184019952, + "step": 151230 + }, + { + "epoch": 16.84318966477336, + "grad_norm": 0.023167809471488, + "learning_rate": 3.6997917557937605e-06, + "loss": 0.1049, + "num_input_tokens_seen": 184026160, + "step": 151235 + }, + { + "epoch": 16.843746519656978, + "grad_norm": 0.777215838432312, + "learning_rate": 3.6985198200717303e-06, + "loss": 0.0483, + "num_input_tokens_seen": 184032208, + "step": 151240 + }, + { + "epoch": 16.844303374540594, + "grad_norm": 0.4943488538265228, + "learning_rate": 3.697248085559246e-06, + "loss": 0.0072, + "num_input_tokens_seen": 184038448, + "step": 151245 + }, + { + "epoch": 16.844860229424214, + "grad_norm": 0.033264849334955215, + "learning_rate": 3.695976552268321e-06, + "loss": 0.0153, + "num_input_tokens_seen": 184044720, + "step": 151250 + }, + { + "epoch": 16.84541708430783, + "grad_norm": 0.002096799435093999, + "learning_rate": 3.694705220210962e-06, + "loss": 0.0492, + "num_input_tokens_seen": 184050960, + "step": 151255 + }, + { + "epoch": 16.845973939191445, + "grad_norm": 0.0023570728953927755, + "learning_rate": 3.6934340893991863e-06, + "loss": 0.0091, + "num_input_tokens_seen": 184057456, + "step": 151260 + }, + { + "epoch": 16.846530794075065, + "grad_norm": 1.4386175870895386, + "learning_rate": 3.692163159844994e-06, + "loss": 0.0442, + "num_input_tokens_seen": 184063664, + "step": 151265 + }, + { + "epoch": 16.84708764895868, + "grad_norm": 0.44219914078712463, + "learning_rate": 3.6908924315603943e-06, + "loss": 0.0094, + "num_input_tokens_seen": 184069840, + "step": 151270 + }, + { + "epoch": 16.8476445038423, + "grad_norm": 2.4648470878601074, + "learning_rate": 3.6896219045573833e-06, + "loss": 0.0623, + "num_input_tokens_seen": 184075952, + "step": 151275 + }, + { + "epoch": 16.848201358725916, + "grad_norm": 1.972212553024292, + "learning_rate": 3.6883515788479706e-06, + "loss": 0.0429, + "num_input_tokens_seen": 184082160, + "step": 151280 + }, + { + "epoch": 16.848758213609532, + "grad_norm": 0.40944960713386536, + "learning_rate": 3.6870814544441494e-06, + "loss": 0.0712, + "num_input_tokens_seen": 184088144, + "step": 151285 + }, + { + "epoch": 16.84931506849315, + "grad_norm": 0.00018341081158723682, + "learning_rate": 3.6858115313579273e-06, + "loss": 0.0034, + "num_input_tokens_seen": 184094064, + "step": 151290 + }, + { + "epoch": 16.849871923376767, + "grad_norm": 0.1626955270767212, + "learning_rate": 3.684541809601283e-06, + "loss": 0.0041, + "num_input_tokens_seen": 184100144, + "step": 151295 + }, + { + "epoch": 16.850428778260387, + "grad_norm": 0.022301262244582176, + "learning_rate": 3.6832722891862237e-06, + "loss": 0.0157, + "num_input_tokens_seen": 184106384, + "step": 151300 + }, + { + "epoch": 16.850985633144003, + "grad_norm": 0.07573588937520981, + "learning_rate": 3.682002970124729e-06, + "loss": 0.0445, + "num_input_tokens_seen": 184112656, + "step": 151305 + }, + { + "epoch": 16.85154248802762, + "grad_norm": 1.1457902193069458, + "learning_rate": 3.6807338524288025e-06, + "loss": 0.0258, + "num_input_tokens_seen": 184118576, + "step": 151310 + }, + { + "epoch": 16.852099342911238, + "grad_norm": 0.010706265456974506, + "learning_rate": 3.6794649361104272e-06, + "loss": 0.0025, + "num_input_tokens_seen": 184124720, + "step": 151315 + }, + { + "epoch": 16.852656197794854, + "grad_norm": 3.7301244735717773, + "learning_rate": 3.6781962211815873e-06, + "loss": 0.0791, + "num_input_tokens_seen": 184130288, + "step": 151320 + }, + { + "epoch": 16.853213052678473, + "grad_norm": 0.17885974049568176, + "learning_rate": 3.67692770765426e-06, + "loss": 0.0028, + "num_input_tokens_seen": 184136272, + "step": 151325 + }, + { + "epoch": 16.85376990756209, + "grad_norm": 0.07366370409727097, + "learning_rate": 3.67565939554044e-06, + "loss": 0.0634, + "num_input_tokens_seen": 184142448, + "step": 151330 + }, + { + "epoch": 16.854326762445705, + "grad_norm": 0.0002387429412920028, + "learning_rate": 3.674391284852102e-06, + "loss": 0.0703, + "num_input_tokens_seen": 184148624, + "step": 151335 + }, + { + "epoch": 16.854883617329325, + "grad_norm": 0.022350972518324852, + "learning_rate": 3.673123375601223e-06, + "loss": 0.052, + "num_input_tokens_seen": 184154672, + "step": 151340 + }, + { + "epoch": 16.85544047221294, + "grad_norm": 0.07485480606555939, + "learning_rate": 3.6718556677997755e-06, + "loss": 0.0718, + "num_input_tokens_seen": 184160688, + "step": 151345 + }, + { + "epoch": 16.85599732709656, + "grad_norm": 0.6928832530975342, + "learning_rate": 3.6705881614597455e-06, + "loss": 0.0354, + "num_input_tokens_seen": 184166864, + "step": 151350 + }, + { + "epoch": 16.856554181980176, + "grad_norm": 0.00040539519977755845, + "learning_rate": 3.669320856593092e-06, + "loss": 0.0023, + "num_input_tokens_seen": 184173296, + "step": 151355 + }, + { + "epoch": 16.85711103686379, + "grad_norm": 0.7659522294998169, + "learning_rate": 3.668053753211806e-06, + "loss": 0.0655, + "num_input_tokens_seen": 184179344, + "step": 151360 + }, + { + "epoch": 16.85766789174741, + "grad_norm": 0.006341671105474234, + "learning_rate": 3.66678685132783e-06, + "loss": 0.006, + "num_input_tokens_seen": 184185584, + "step": 151365 + }, + { + "epoch": 16.858224746631027, + "grad_norm": 0.026066599413752556, + "learning_rate": 3.665520150953153e-06, + "loss": 0.121, + "num_input_tokens_seen": 184191696, + "step": 151370 + }, + { + "epoch": 16.858781601514647, + "grad_norm": 0.495808482170105, + "learning_rate": 3.6642536520997223e-06, + "loss": 0.026, + "num_input_tokens_seen": 184197776, + "step": 151375 + }, + { + "epoch": 16.859338456398262, + "grad_norm": 0.044857174158096313, + "learning_rate": 3.662987354779515e-06, + "loss": 0.0609, + "num_input_tokens_seen": 184203728, + "step": 151380 + }, + { + "epoch": 16.85989531128188, + "grad_norm": 0.007300965022295713, + "learning_rate": 3.6617212590044886e-06, + "loss": 0.1365, + "num_input_tokens_seen": 184209456, + "step": 151385 + }, + { + "epoch": 16.860452166165498, + "grad_norm": 0.035958725959062576, + "learning_rate": 3.6604553647866025e-06, + "loss": 0.0362, + "num_input_tokens_seen": 184215472, + "step": 151390 + }, + { + "epoch": 16.861009021049114, + "grad_norm": 0.6989887952804565, + "learning_rate": 3.659189672137811e-06, + "loss": 0.022, + "num_input_tokens_seen": 184221424, + "step": 151395 + }, + { + "epoch": 16.861565875932733, + "grad_norm": 0.05631590262055397, + "learning_rate": 3.6579241810700633e-06, + "loss": 0.0242, + "num_input_tokens_seen": 184227440, + "step": 151400 + }, + { + "epoch": 16.86212273081635, + "grad_norm": 0.003293851390480995, + "learning_rate": 3.656658891595327e-06, + "loss": 0.0049, + "num_input_tokens_seen": 184233488, + "step": 151405 + }, + { + "epoch": 16.862679585699965, + "grad_norm": 0.02825409360229969, + "learning_rate": 3.6553938037255488e-06, + "loss": 0.036, + "num_input_tokens_seen": 184239280, + "step": 151410 + }, + { + "epoch": 16.863236440583584, + "grad_norm": 0.024711454287171364, + "learning_rate": 3.6541289174726766e-06, + "loss": 0.1045, + "num_input_tokens_seen": 184245680, + "step": 151415 + }, + { + "epoch": 16.8637932954672, + "grad_norm": 0.0003719452943187207, + "learning_rate": 3.6528642328486547e-06, + "loss": 0.0246, + "num_input_tokens_seen": 184251856, + "step": 151420 + }, + { + "epoch": 16.86435015035082, + "grad_norm": 0.0028094530571252108, + "learning_rate": 3.6515997498654366e-06, + "loss": 0.0761, + "num_input_tokens_seen": 184257936, + "step": 151425 + }, + { + "epoch": 16.864907005234436, + "grad_norm": 0.01138477772474289, + "learning_rate": 3.65033546853496e-06, + "loss": 0.0119, + "num_input_tokens_seen": 184263888, + "step": 151430 + }, + { + "epoch": 16.86546386011805, + "grad_norm": 0.022903282195329666, + "learning_rate": 3.649071388869177e-06, + "loss": 0.0061, + "num_input_tokens_seen": 184270096, + "step": 151435 + }, + { + "epoch": 16.86602071500167, + "grad_norm": 0.005963170900940895, + "learning_rate": 3.6478075108800134e-06, + "loss": 0.0153, + "num_input_tokens_seen": 184276304, + "step": 151440 + }, + { + "epoch": 16.866577569885287, + "grad_norm": 0.35112226009368896, + "learning_rate": 3.646543834579419e-06, + "loss": 0.006, + "num_input_tokens_seen": 184282352, + "step": 151445 + }, + { + "epoch": 16.867134424768906, + "grad_norm": 0.010042439214885235, + "learning_rate": 3.6452803599793197e-06, + "loss": 0.007, + "num_input_tokens_seen": 184288560, + "step": 151450 + }, + { + "epoch": 16.867691279652522, + "grad_norm": 0.167665034532547, + "learning_rate": 3.644017087091664e-06, + "loss": 0.0064, + "num_input_tokens_seen": 184294256, + "step": 151455 + }, + { + "epoch": 16.868248134536138, + "grad_norm": 0.008460123091936111, + "learning_rate": 3.6427540159283763e-06, + "loss": 0.1535, + "num_input_tokens_seen": 184300464, + "step": 151460 + }, + { + "epoch": 16.868804989419758, + "grad_norm": 0.6951465606689453, + "learning_rate": 3.6414911465013885e-06, + "loss": 0.0375, + "num_input_tokens_seen": 184306480, + "step": 151465 + }, + { + "epoch": 16.869361844303373, + "grad_norm": 0.0015132392290979624, + "learning_rate": 3.6402284788226215e-06, + "loss": 0.0052, + "num_input_tokens_seen": 184312720, + "step": 151470 + }, + { + "epoch": 16.869918699186993, + "grad_norm": 0.012602088041603565, + "learning_rate": 3.638966012904016e-06, + "loss": 0.027, + "num_input_tokens_seen": 184318608, + "step": 151475 + }, + { + "epoch": 16.87047555407061, + "grad_norm": 1.5494569540023804, + "learning_rate": 3.6377037487574926e-06, + "loss": 0.116, + "num_input_tokens_seen": 184324496, + "step": 151480 + }, + { + "epoch": 16.87103240895423, + "grad_norm": 0.015265370719134808, + "learning_rate": 3.63644168639497e-06, + "loss": 0.0368, + "num_input_tokens_seen": 184330480, + "step": 151485 + }, + { + "epoch": 16.871589263837844, + "grad_norm": 0.020464148372411728, + "learning_rate": 3.6351798258283664e-06, + "loss": 0.0069, + "num_input_tokens_seen": 184336592, + "step": 151490 + }, + { + "epoch": 16.87214611872146, + "grad_norm": 0.38872116804122925, + "learning_rate": 3.633918167069614e-06, + "loss": 0.0106, + "num_input_tokens_seen": 184342640, + "step": 151495 + }, + { + "epoch": 16.87270297360508, + "grad_norm": 0.6957771182060242, + "learning_rate": 3.6326567101306166e-06, + "loss": 0.0879, + "num_input_tokens_seen": 184349136, + "step": 151500 + }, + { + "epoch": 16.873259828488695, + "grad_norm": 0.03159685060381889, + "learning_rate": 3.6313954550233063e-06, + "loss": 0.0525, + "num_input_tokens_seen": 184355088, + "step": 151505 + }, + { + "epoch": 16.873816683372315, + "grad_norm": 1.217049479484558, + "learning_rate": 3.6301344017595746e-06, + "loss": 0.015, + "num_input_tokens_seen": 184361296, + "step": 151510 + }, + { + "epoch": 16.87437353825593, + "grad_norm": 0.0010515818139538169, + "learning_rate": 3.6288735503513527e-06, + "loss": 0.0026, + "num_input_tokens_seen": 184367568, + "step": 151515 + }, + { + "epoch": 16.874930393139547, + "grad_norm": 1.0070748329162598, + "learning_rate": 3.6276129008105343e-06, + "loss": 0.0289, + "num_input_tokens_seen": 184374000, + "step": 151520 + }, + { + "epoch": 16.875487248023166, + "grad_norm": 1.6757900714874268, + "learning_rate": 3.626352453149043e-06, + "loss": 0.0559, + "num_input_tokens_seen": 184379760, + "step": 151525 + }, + { + "epoch": 16.876044102906782, + "grad_norm": 1.426182746887207, + "learning_rate": 3.6250922073787745e-06, + "loss": 0.0582, + "num_input_tokens_seen": 184386000, + "step": 151530 + }, + { + "epoch": 16.8766009577904, + "grad_norm": 0.006293763872236013, + "learning_rate": 3.623832163511637e-06, + "loss": 0.0589, + "num_input_tokens_seen": 184392336, + "step": 151535 + }, + { + "epoch": 16.877157812674017, + "grad_norm": 0.09186231344938278, + "learning_rate": 3.622572321559525e-06, + "loss": 0.0057, + "num_input_tokens_seen": 184398288, + "step": 151540 + }, + { + "epoch": 16.877714667557633, + "grad_norm": 0.0026005140971392393, + "learning_rate": 3.62131268153435e-06, + "loss": 0.0162, + "num_input_tokens_seen": 184404912, + "step": 151545 + }, + { + "epoch": 16.878271522441253, + "grad_norm": 0.02103549614548683, + "learning_rate": 3.6200532434480074e-06, + "loss": 0.0644, + "num_input_tokens_seen": 184411248, + "step": 151550 + }, + { + "epoch": 16.87882837732487, + "grad_norm": 0.003018310759216547, + "learning_rate": 3.618794007312387e-06, + "loss": 0.0997, + "num_input_tokens_seen": 184416816, + "step": 151555 + }, + { + "epoch": 16.879385232208488, + "grad_norm": 0.05172504857182503, + "learning_rate": 3.617534973139386e-06, + "loss": 0.007, + "num_input_tokens_seen": 184423024, + "step": 151560 + }, + { + "epoch": 16.879942087092104, + "grad_norm": 0.025134528055787086, + "learning_rate": 3.6162761409409023e-06, + "loss": 0.0844, + "num_input_tokens_seen": 184429232, + "step": 151565 + }, + { + "epoch": 16.88049894197572, + "grad_norm": 0.004980398342013359, + "learning_rate": 3.615017510728816e-06, + "loss": 0.0157, + "num_input_tokens_seen": 184435408, + "step": 151570 + }, + { + "epoch": 16.88105579685934, + "grad_norm": 0.5030084252357483, + "learning_rate": 3.613759082515031e-06, + "loss": 0.0288, + "num_input_tokens_seen": 184441072, + "step": 151575 + }, + { + "epoch": 16.881612651742955, + "grad_norm": 1.5862114429473877, + "learning_rate": 3.612500856311424e-06, + "loss": 0.1953, + "num_input_tokens_seen": 184447184, + "step": 151580 + }, + { + "epoch": 16.882169506626575, + "grad_norm": 0.02303207665681839, + "learning_rate": 3.6112428321298825e-06, + "loss": 0.0874, + "num_input_tokens_seen": 184453424, + "step": 151585 + }, + { + "epoch": 16.88272636151019, + "grad_norm": 0.33667248487472534, + "learning_rate": 3.6099850099822837e-06, + "loss": 0.0123, + "num_input_tokens_seen": 184459472, + "step": 151590 + }, + { + "epoch": 16.883283216393806, + "grad_norm": 0.0029158720280975103, + "learning_rate": 3.6087273898805174e-06, + "loss": 0.0084, + "num_input_tokens_seen": 184465936, + "step": 151595 + }, + { + "epoch": 16.883840071277426, + "grad_norm": 0.0038567299488931894, + "learning_rate": 3.607469971836461e-06, + "loss": 0.0269, + "num_input_tokens_seen": 184472176, + "step": 151600 + }, + { + "epoch": 16.884396926161042, + "grad_norm": 0.13896489143371582, + "learning_rate": 3.60621275586199e-06, + "loss": 0.0166, + "num_input_tokens_seen": 184478352, + "step": 151605 + }, + { + "epoch": 16.88495378104466, + "grad_norm": 0.03918902203440666, + "learning_rate": 3.604955741968974e-06, + "loss": 0.029, + "num_input_tokens_seen": 184484624, + "step": 151610 + }, + { + "epoch": 16.885510635928277, + "grad_norm": 0.30112698674201965, + "learning_rate": 3.603698930169297e-06, + "loss": 0.011, + "num_input_tokens_seen": 184490640, + "step": 151615 + }, + { + "epoch": 16.886067490811893, + "grad_norm": 0.005838451907038689, + "learning_rate": 3.602442320474822e-06, + "loss": 0.052, + "num_input_tokens_seen": 184496624, + "step": 151620 + }, + { + "epoch": 16.886624345695513, + "grad_norm": 0.027994517236948013, + "learning_rate": 3.601185912897434e-06, + "loss": 0.0346, + "num_input_tokens_seen": 184502640, + "step": 151625 + }, + { + "epoch": 16.88718120057913, + "grad_norm": 1.5793753862380981, + "learning_rate": 3.599929707448976e-06, + "loss": 0.186, + "num_input_tokens_seen": 184509040, + "step": 151630 + }, + { + "epoch": 16.887738055462748, + "grad_norm": 0.00018189268303103745, + "learning_rate": 3.5986737041413366e-06, + "loss": 0.0176, + "num_input_tokens_seen": 184515152, + "step": 151635 + }, + { + "epoch": 16.888294910346364, + "grad_norm": 0.056905150413513184, + "learning_rate": 3.5974179029863635e-06, + "loss": 0.0018, + "num_input_tokens_seen": 184521232, + "step": 151640 + }, + { + "epoch": 16.88885176522998, + "grad_norm": 0.13460546731948853, + "learning_rate": 3.5961623039959288e-06, + "loss": 0.0273, + "num_input_tokens_seen": 184527152, + "step": 151645 + }, + { + "epoch": 16.8894086201136, + "grad_norm": 0.0008267221273854375, + "learning_rate": 3.5949069071818913e-06, + "loss": 0.0822, + "num_input_tokens_seen": 184533680, + "step": 151650 + }, + { + "epoch": 16.889965474997215, + "grad_norm": 0.05893108993768692, + "learning_rate": 3.593651712556109e-06, + "loss": 0.0097, + "num_input_tokens_seen": 184539792, + "step": 151655 + }, + { + "epoch": 16.890522329880834, + "grad_norm": 3.4591944217681885, + "learning_rate": 3.592396720130431e-06, + "loss": 0.0811, + "num_input_tokens_seen": 184546000, + "step": 151660 + }, + { + "epoch": 16.89107918476445, + "grad_norm": 0.9323440790176392, + "learning_rate": 3.591141929916722e-06, + "loss": 0.029, + "num_input_tokens_seen": 184551696, + "step": 151665 + }, + { + "epoch": 16.891636039648066, + "grad_norm": 0.003764295717701316, + "learning_rate": 3.589887341926829e-06, + "loss": 0.0849, + "num_input_tokens_seen": 184557776, + "step": 151670 + }, + { + "epoch": 16.892192894531686, + "grad_norm": 1.2934542894363403, + "learning_rate": 3.588632956172605e-06, + "loss": 0.0249, + "num_input_tokens_seen": 184563760, + "step": 151675 + }, + { + "epoch": 16.8927497494153, + "grad_norm": 0.5753312110900879, + "learning_rate": 3.587378772665892e-06, + "loss": 0.0417, + "num_input_tokens_seen": 184569488, + "step": 151680 + }, + { + "epoch": 16.89330660429892, + "grad_norm": 0.14636246860027313, + "learning_rate": 3.5861247914185466e-06, + "loss": 0.0562, + "num_input_tokens_seen": 184575536, + "step": 151685 + }, + { + "epoch": 16.893863459182537, + "grad_norm": 0.12025529146194458, + "learning_rate": 3.5848710124424033e-06, + "loss": 0.0633, + "num_input_tokens_seen": 184581680, + "step": 151690 + }, + { + "epoch": 16.894420314066153, + "grad_norm": 3.423004627227783, + "learning_rate": 3.583617435749323e-06, + "loss": 0.0611, + "num_input_tokens_seen": 184587664, + "step": 151695 + }, + { + "epoch": 16.894977168949772, + "grad_norm": 0.018795333802700043, + "learning_rate": 3.5823640613511233e-06, + "loss": 0.0058, + "num_input_tokens_seen": 184594000, + "step": 151700 + }, + { + "epoch": 16.895534023833388, + "grad_norm": 0.17900192737579346, + "learning_rate": 3.581110889259659e-06, + "loss": 0.0041, + "num_input_tokens_seen": 184600176, + "step": 151705 + }, + { + "epoch": 16.896090878717008, + "grad_norm": 0.0744706466794014, + "learning_rate": 3.5798579194867595e-06, + "loss": 0.0068, + "num_input_tokens_seen": 184606160, + "step": 151710 + }, + { + "epoch": 16.896647733600624, + "grad_norm": 0.1128680408000946, + "learning_rate": 3.5786051520442676e-06, + "loss": 0.0085, + "num_input_tokens_seen": 184612016, + "step": 151715 + }, + { + "epoch": 16.89720458848424, + "grad_norm": 0.048410989344120026, + "learning_rate": 3.5773525869440106e-06, + "loss": 0.0099, + "num_input_tokens_seen": 184618352, + "step": 151720 + }, + { + "epoch": 16.89776144336786, + "grad_norm": 0.02177933230996132, + "learning_rate": 3.5761002241978257e-06, + "loss": 0.0008, + "num_input_tokens_seen": 184624368, + "step": 151725 + }, + { + "epoch": 16.898318298251475, + "grad_norm": 1.2957580089569092, + "learning_rate": 3.574848063817529e-06, + "loss": 0.0533, + "num_input_tokens_seen": 184630416, + "step": 151730 + }, + { + "epoch": 16.898875153135094, + "grad_norm": 0.15933090448379517, + "learning_rate": 3.5735961058149665e-06, + "loss": 0.1078, + "num_input_tokens_seen": 184636464, + "step": 151735 + }, + { + "epoch": 16.89943200801871, + "grad_norm": 0.00013348970969673246, + "learning_rate": 3.572344350201956e-06, + "loss": 0.0316, + "num_input_tokens_seen": 184642736, + "step": 151740 + }, + { + "epoch": 16.899988862902326, + "grad_norm": 0.042220957577228546, + "learning_rate": 3.5710927969903193e-06, + "loss": 0.0326, + "num_input_tokens_seen": 184648784, + "step": 151745 + }, + { + "epoch": 16.900545717785945, + "grad_norm": 0.4932604134082794, + "learning_rate": 3.569841446191874e-06, + "loss": 0.041, + "num_input_tokens_seen": 184654896, + "step": 151750 + }, + { + "epoch": 16.90110257266956, + "grad_norm": 0.00733205396682024, + "learning_rate": 3.5685902978184497e-06, + "loss": 0.005, + "num_input_tokens_seen": 184661168, + "step": 151755 + }, + { + "epoch": 16.90165942755318, + "grad_norm": 0.16174019873142242, + "learning_rate": 3.5673393518818573e-06, + "loss": 0.0402, + "num_input_tokens_seen": 184667152, + "step": 151760 + }, + { + "epoch": 16.902216282436797, + "grad_norm": 0.09301344305276871, + "learning_rate": 3.5660886083939277e-06, + "loss": 0.0037, + "num_input_tokens_seen": 184673296, + "step": 151765 + }, + { + "epoch": 16.902773137320413, + "grad_norm": 0.0003789440088439733, + "learning_rate": 3.564838067366452e-06, + "loss": 0.0643, + "num_input_tokens_seen": 184679344, + "step": 151770 + }, + { + "epoch": 16.903329992204032, + "grad_norm": 0.26981183886528015, + "learning_rate": 3.5635877288112602e-06, + "loss": 0.0156, + "num_input_tokens_seen": 184685360, + "step": 151775 + }, + { + "epoch": 16.903886847087648, + "grad_norm": 0.07943150401115417, + "learning_rate": 3.5623375927401503e-06, + "loss": 0.0016, + "num_input_tokens_seen": 184691408, + "step": 151780 + }, + { + "epoch": 16.904443701971267, + "grad_norm": 0.05773302540183067, + "learning_rate": 3.561087659164944e-06, + "loss": 0.0215, + "num_input_tokens_seen": 184697808, + "step": 151785 + }, + { + "epoch": 16.905000556854883, + "grad_norm": 8.772522414801642e-05, + "learning_rate": 3.55983792809744e-06, + "loss": 0.0048, + "num_input_tokens_seen": 184703760, + "step": 151790 + }, + { + "epoch": 16.9055574117385, + "grad_norm": 0.01674141176044941, + "learning_rate": 3.558588399549445e-06, + "loss": 0.05, + "num_input_tokens_seen": 184709808, + "step": 151795 + }, + { + "epoch": 16.90611426662212, + "grad_norm": 0.20749478042125702, + "learning_rate": 3.5573390735327617e-06, + "loss": 0.0853, + "num_input_tokens_seen": 184715792, + "step": 151800 + }, + { + "epoch": 16.906671121505735, + "grad_norm": 0.00019920092017855495, + "learning_rate": 3.5560899500591825e-06, + "loss": 0.0408, + "num_input_tokens_seen": 184722128, + "step": 151805 + }, + { + "epoch": 16.907227976389354, + "grad_norm": 0.019509214907884598, + "learning_rate": 3.554841029140524e-06, + "loss": 0.1848, + "num_input_tokens_seen": 184727696, + "step": 151810 + }, + { + "epoch": 16.90778483127297, + "grad_norm": 0.8837030529975891, + "learning_rate": 3.55359231078857e-06, + "loss": 0.0306, + "num_input_tokens_seen": 184733712, + "step": 151815 + }, + { + "epoch": 16.90834168615659, + "grad_norm": 1.6350291967391968, + "learning_rate": 3.55234379501512e-06, + "loss": 0.0523, + "num_input_tokens_seen": 184739760, + "step": 151820 + }, + { + "epoch": 16.908898541040205, + "grad_norm": 0.4681982696056366, + "learning_rate": 3.551095481831962e-06, + "loss": 0.0558, + "num_input_tokens_seen": 184745968, + "step": 151825 + }, + { + "epoch": 16.90945539592382, + "grad_norm": 1.2309045791625977, + "learning_rate": 3.5498473712508974e-06, + "loss": 0.0212, + "num_input_tokens_seen": 184751952, + "step": 151830 + }, + { + "epoch": 16.91001225080744, + "grad_norm": 0.07808125019073486, + "learning_rate": 3.5485994632837027e-06, + "loss": 0.0288, + "num_input_tokens_seen": 184758160, + "step": 151835 + }, + { + "epoch": 16.910569105691057, + "grad_norm": 0.0005960233975201845, + "learning_rate": 3.5473517579421856e-06, + "loss": 0.0069, + "num_input_tokens_seen": 184763376, + "step": 151840 + }, + { + "epoch": 16.911125960574672, + "grad_norm": 0.00012838061957154423, + "learning_rate": 3.5461042552381057e-06, + "loss": 0.0641, + "num_input_tokens_seen": 184769488, + "step": 151845 + }, + { + "epoch": 16.911682815458292, + "grad_norm": 0.03494254872202873, + "learning_rate": 3.544856955183268e-06, + "loss": 0.0122, + "num_input_tokens_seen": 184775696, + "step": 151850 + }, + { + "epoch": 16.912239670341908, + "grad_norm": 0.07050135731697083, + "learning_rate": 3.543609857789437e-06, + "loss": 0.0081, + "num_input_tokens_seen": 184781520, + "step": 151855 + }, + { + "epoch": 16.912796525225527, + "grad_norm": 0.2675477862358093, + "learning_rate": 3.5423629630684104e-06, + "loss": 0.0234, + "num_input_tokens_seen": 184787920, + "step": 151860 + }, + { + "epoch": 16.913353380109143, + "grad_norm": 0.6911801695823669, + "learning_rate": 3.5411162710319553e-06, + "loss": 0.0078, + "num_input_tokens_seen": 184794096, + "step": 151865 + }, + { + "epoch": 16.913910234992763, + "grad_norm": 0.0006046734633855522, + "learning_rate": 3.5398697816918486e-06, + "loss": 0.0209, + "num_input_tokens_seen": 184800368, + "step": 151870 + }, + { + "epoch": 16.91446708987638, + "grad_norm": 0.012794582173228264, + "learning_rate": 3.5386234950598616e-06, + "loss": 0.0023, + "num_input_tokens_seen": 184806576, + "step": 151875 + }, + { + "epoch": 16.915023944759994, + "grad_norm": 1.2523753643035889, + "learning_rate": 3.537377411147777e-06, + "loss": 0.1515, + "num_input_tokens_seen": 184812720, + "step": 151880 + }, + { + "epoch": 16.915580799643614, + "grad_norm": 0.0009258686914108694, + "learning_rate": 3.5361315299673542e-06, + "loss": 0.025, + "num_input_tokens_seen": 184818672, + "step": 151885 + }, + { + "epoch": 16.91613765452723, + "grad_norm": 0.037628788501024246, + "learning_rate": 3.53488585153037e-06, + "loss": 0.0074, + "num_input_tokens_seen": 184824912, + "step": 151890 + }, + { + "epoch": 16.91669450941085, + "grad_norm": 1.9247078895568848, + "learning_rate": 3.533640375848579e-06, + "loss": 0.0737, + "num_input_tokens_seen": 184830832, + "step": 151895 + }, + { + "epoch": 16.917251364294465, + "grad_norm": 1.6016349792480469, + "learning_rate": 3.532395102933758e-06, + "loss": 0.0782, + "num_input_tokens_seen": 184836400, + "step": 151900 + }, + { + "epoch": 16.91780821917808, + "grad_norm": 0.97316974401474, + "learning_rate": 3.5311500327976587e-06, + "loss": 0.1175, + "num_input_tokens_seen": 184842224, + "step": 151905 + }, + { + "epoch": 16.9183650740617, + "grad_norm": 0.4101376235485077, + "learning_rate": 3.5299051654520605e-06, + "loss": 0.0803, + "num_input_tokens_seen": 184848432, + "step": 151910 + }, + { + "epoch": 16.918921928945316, + "grad_norm": 0.10284069925546646, + "learning_rate": 3.5286605009086983e-06, + "loss": 0.0447, + "num_input_tokens_seen": 184854256, + "step": 151915 + }, + { + "epoch": 16.919478783828936, + "grad_norm": 0.0013776697451248765, + "learning_rate": 3.527416039179346e-06, + "loss": 0.1258, + "num_input_tokens_seen": 184860208, + "step": 151920 + }, + { + "epoch": 16.92003563871255, + "grad_norm": 1.750783085823059, + "learning_rate": 3.5261717802757473e-06, + "loss": 0.0938, + "num_input_tokens_seen": 184866768, + "step": 151925 + }, + { + "epoch": 16.920592493596168, + "grad_norm": 0.032625969499349594, + "learning_rate": 3.5249277242096674e-06, + "loss": 0.0021, + "num_input_tokens_seen": 184872784, + "step": 151930 + }, + { + "epoch": 16.921149348479787, + "grad_norm": 0.10365021973848343, + "learning_rate": 3.523683870992847e-06, + "loss": 0.1429, + "num_input_tokens_seen": 184879120, + "step": 151935 + }, + { + "epoch": 16.921706203363403, + "grad_norm": 0.09450352191925049, + "learning_rate": 3.522440220637041e-06, + "loss": 0.0293, + "num_input_tokens_seen": 184884656, + "step": 151940 + }, + { + "epoch": 16.922263058247022, + "grad_norm": 0.0010399650782346725, + "learning_rate": 3.5211967731539896e-06, + "loss": 0.0091, + "num_input_tokens_seen": 184890800, + "step": 151945 + }, + { + "epoch": 16.92281991313064, + "grad_norm": 0.0001142766559496522, + "learning_rate": 3.51995352855545e-06, + "loss": 0.1056, + "num_input_tokens_seen": 184896976, + "step": 151950 + }, + { + "epoch": 16.923376768014254, + "grad_norm": 0.6532533764839172, + "learning_rate": 3.518710486853155e-06, + "loss": 0.0194, + "num_input_tokens_seen": 184903120, + "step": 151955 + }, + { + "epoch": 16.923933622897874, + "grad_norm": 0.2223661094903946, + "learning_rate": 3.5174676480588533e-06, + "loss": 0.0075, + "num_input_tokens_seen": 184909392, + "step": 151960 + }, + { + "epoch": 16.92449047778149, + "grad_norm": 0.00028633562033064663, + "learning_rate": 3.5162250121842737e-06, + "loss": 0.0427, + "num_input_tokens_seen": 184915664, + "step": 151965 + }, + { + "epoch": 16.92504733266511, + "grad_norm": 0.10464734584093094, + "learning_rate": 3.5149825792411687e-06, + "loss": 0.0202, + "num_input_tokens_seen": 184921776, + "step": 151970 + }, + { + "epoch": 16.925604187548725, + "grad_norm": 0.008659489452838898, + "learning_rate": 3.5137403492412596e-06, + "loss": 0.1151, + "num_input_tokens_seen": 184928176, + "step": 151975 + }, + { + "epoch": 16.92616104243234, + "grad_norm": 0.7011662721633911, + "learning_rate": 3.5124983221962947e-06, + "loss": 0.0379, + "num_input_tokens_seen": 184934128, + "step": 151980 + }, + { + "epoch": 16.92671789731596, + "grad_norm": 0.02532205730676651, + "learning_rate": 3.5112564981179986e-06, + "loss": 0.0024, + "num_input_tokens_seen": 184940496, + "step": 151985 + }, + { + "epoch": 16.927274752199576, + "grad_norm": 0.27295276522636414, + "learning_rate": 3.5100148770181e-06, + "loss": 0.0062, + "num_input_tokens_seen": 184946672, + "step": 151990 + }, + { + "epoch": 16.927831607083196, + "grad_norm": 0.23736141622066498, + "learning_rate": 3.508773458908321e-06, + "loss": 0.0323, + "num_input_tokens_seen": 184952112, + "step": 151995 + }, + { + "epoch": 16.92838846196681, + "grad_norm": 0.28170886635780334, + "learning_rate": 3.5075322438004043e-06, + "loss": 0.0138, + "num_input_tokens_seen": 184958480, + "step": 152000 + }, + { + "epoch": 16.928945316850427, + "grad_norm": 0.02830997109413147, + "learning_rate": 3.5062912317060632e-06, + "loss": 0.0084, + "num_input_tokens_seen": 184964624, + "step": 152005 + }, + { + "epoch": 16.929502171734047, + "grad_norm": 0.010869085788726807, + "learning_rate": 3.5050504226370214e-06, + "loss": 0.0194, + "num_input_tokens_seen": 184970832, + "step": 152010 + }, + { + "epoch": 16.930059026617663, + "grad_norm": 0.4537069797515869, + "learning_rate": 3.503809816604997e-06, + "loss": 0.0403, + "num_input_tokens_seen": 184977104, + "step": 152015 + }, + { + "epoch": 16.930615881501282, + "grad_norm": 0.09333893656730652, + "learning_rate": 3.5025694136217146e-06, + "loss": 0.0017, + "num_input_tokens_seen": 184983152, + "step": 152020 + }, + { + "epoch": 16.931172736384898, + "grad_norm": 0.00022859545424580574, + "learning_rate": 3.501329213698881e-06, + "loss": 0.0372, + "num_input_tokens_seen": 184989232, + "step": 152025 + }, + { + "epoch": 16.931729591268514, + "grad_norm": 0.00023475605121348053, + "learning_rate": 3.500089216848232e-06, + "loss": 0.0534, + "num_input_tokens_seen": 184995344, + "step": 152030 + }, + { + "epoch": 16.932286446152133, + "grad_norm": 0.0019067280227318406, + "learning_rate": 3.4988494230814516e-06, + "loss": 0.0506, + "num_input_tokens_seen": 185001296, + "step": 152035 + }, + { + "epoch": 16.93284330103575, + "grad_norm": 0.14043095707893372, + "learning_rate": 3.497609832410273e-06, + "loss": 0.0145, + "num_input_tokens_seen": 185007600, + "step": 152040 + }, + { + "epoch": 16.93340015591937, + "grad_norm": 0.0011789958225563169, + "learning_rate": 3.4963704448463892e-06, + "loss": 0.136, + "num_input_tokens_seen": 185013872, + "step": 152045 + }, + { + "epoch": 16.933957010802985, + "grad_norm": 0.6192824840545654, + "learning_rate": 3.4951312604015213e-06, + "loss": 0.0148, + "num_input_tokens_seen": 185019952, + "step": 152050 + }, + { + "epoch": 16.9345138656866, + "grad_norm": 0.000451370287919417, + "learning_rate": 3.4938922790873657e-06, + "loss": 0.0525, + "num_input_tokens_seen": 185026000, + "step": 152055 + }, + { + "epoch": 16.93507072057022, + "grad_norm": 0.02260921522974968, + "learning_rate": 3.4926535009156324e-06, + "loss": 0.0242, + "num_input_tokens_seen": 185032272, + "step": 152060 + }, + { + "epoch": 16.935627575453836, + "grad_norm": 0.18092477321624756, + "learning_rate": 3.4914149258980095e-06, + "loss": 0.013, + "num_input_tokens_seen": 185038128, + "step": 152065 + }, + { + "epoch": 16.936184430337455, + "grad_norm": 0.06766659766435623, + "learning_rate": 3.4901765540462123e-06, + "loss": 0.0177, + "num_input_tokens_seen": 185043888, + "step": 152070 + }, + { + "epoch": 16.93674128522107, + "grad_norm": 1.047336459159851, + "learning_rate": 3.4889383853719287e-06, + "loss": 0.0086, + "num_input_tokens_seen": 185050224, + "step": 152075 + }, + { + "epoch": 16.937298140104687, + "grad_norm": 0.021232860162854195, + "learning_rate": 3.487700419886858e-06, + "loss": 0.0319, + "num_input_tokens_seen": 185056336, + "step": 152080 + }, + { + "epoch": 16.937854994988307, + "grad_norm": 0.15643689036369324, + "learning_rate": 3.4864626576026876e-06, + "loss": 0.0037, + "num_input_tokens_seen": 185062640, + "step": 152085 + }, + { + "epoch": 16.938411849871922, + "grad_norm": 0.042241476476192474, + "learning_rate": 3.485225098531117e-06, + "loss": 0.0116, + "num_input_tokens_seen": 185068976, + "step": 152090 + }, + { + "epoch": 16.938968704755542, + "grad_norm": 0.032670758664608, + "learning_rate": 3.483987742683828e-06, + "loss": 0.0231, + "num_input_tokens_seen": 185075376, + "step": 152095 + }, + { + "epoch": 16.939525559639158, + "grad_norm": 0.012081452645361423, + "learning_rate": 3.4827505900725256e-06, + "loss": 0.0034, + "num_input_tokens_seen": 185081712, + "step": 152100 + }, + { + "epoch": 16.940082414522774, + "grad_norm": 0.3985312581062317, + "learning_rate": 3.481513640708872e-06, + "loss": 0.0125, + "num_input_tokens_seen": 185087664, + "step": 152105 + }, + { + "epoch": 16.940639269406393, + "grad_norm": 0.0016041360795497894, + "learning_rate": 3.480276894604567e-06, + "loss": 0.084, + "num_input_tokens_seen": 185093552, + "step": 152110 + }, + { + "epoch": 16.94119612429001, + "grad_norm": 0.5647487640380859, + "learning_rate": 3.4790403517712815e-06, + "loss": 0.0594, + "num_input_tokens_seen": 185099920, + "step": 152115 + }, + { + "epoch": 16.94175297917363, + "grad_norm": 1.4376685619354248, + "learning_rate": 3.477804012220709e-06, + "loss": 0.0339, + "num_input_tokens_seen": 185105968, + "step": 152120 + }, + { + "epoch": 16.942309834057244, + "grad_norm": 0.011909677647054195, + "learning_rate": 3.4765678759645206e-06, + "loss": 0.0062, + "num_input_tokens_seen": 185112208, + "step": 152125 + }, + { + "epoch": 16.94286668894086, + "grad_norm": 0.01455886010080576, + "learning_rate": 3.4753319430143927e-06, + "loss": 0.0596, + "num_input_tokens_seen": 185117552, + "step": 152130 + }, + { + "epoch": 16.94342354382448, + "grad_norm": 0.0009533588890917599, + "learning_rate": 3.474096213381994e-06, + "loss": 0.036, + "num_input_tokens_seen": 185123312, + "step": 152135 + }, + { + "epoch": 16.943980398708096, + "grad_norm": 0.0006597902392968535, + "learning_rate": 3.4728606870790104e-06, + "loss": 0.0007, + "num_input_tokens_seen": 185129648, + "step": 152140 + }, + { + "epoch": 16.944537253591715, + "grad_norm": 0.03806447237730026, + "learning_rate": 3.471625364117101e-06, + "loss": 0.0484, + "num_input_tokens_seen": 185135280, + "step": 152145 + }, + { + "epoch": 16.94509410847533, + "grad_norm": 0.00042127250344492495, + "learning_rate": 3.470390244507943e-06, + "loss": 0.0153, + "num_input_tokens_seen": 185141424, + "step": 152150 + }, + { + "epoch": 16.945650963358947, + "grad_norm": 0.037910886108875275, + "learning_rate": 3.4691553282631904e-06, + "loss": 0.0522, + "num_input_tokens_seen": 185147536, + "step": 152155 + }, + { + "epoch": 16.946207818242566, + "grad_norm": 0.0017606986220926046, + "learning_rate": 3.467920615394521e-06, + "loss": 0.0022, + "num_input_tokens_seen": 185153520, + "step": 152160 + }, + { + "epoch": 16.946764673126182, + "grad_norm": 0.35894763469696045, + "learning_rate": 3.4666861059135886e-06, + "loss": 0.0067, + "num_input_tokens_seen": 185159472, + "step": 152165 + }, + { + "epoch": 16.9473215280098, + "grad_norm": 0.4919957220554352, + "learning_rate": 3.4654517998320674e-06, + "loss": 0.0054, + "num_input_tokens_seen": 185165840, + "step": 152170 + }, + { + "epoch": 16.947878382893418, + "grad_norm": 0.33339884877204895, + "learning_rate": 3.464217697161595e-06, + "loss": 0.108, + "num_input_tokens_seen": 185171760, + "step": 152175 + }, + { + "epoch": 16.948435237777034, + "grad_norm": 0.07695020735263824, + "learning_rate": 3.462983797913849e-06, + "loss": 0.0427, + "num_input_tokens_seen": 185177392, + "step": 152180 + }, + { + "epoch": 16.948992092660653, + "grad_norm": 0.01021474227309227, + "learning_rate": 3.4617501021004696e-06, + "loss": 0.0042, + "num_input_tokens_seen": 185183504, + "step": 152185 + }, + { + "epoch": 16.94954894754427, + "grad_norm": 0.000190533057320863, + "learning_rate": 3.460516609733122e-06, + "loss": 0.0068, + "num_input_tokens_seen": 185189584, + "step": 152190 + }, + { + "epoch": 16.95010580242789, + "grad_norm": 0.03797508031129837, + "learning_rate": 3.4592833208234534e-06, + "loss": 0.0091, + "num_input_tokens_seen": 185195696, + "step": 152195 + }, + { + "epoch": 16.950662657311504, + "grad_norm": 0.44279593229293823, + "learning_rate": 3.4580502353831117e-06, + "loss": 0.0585, + "num_input_tokens_seen": 185201136, + "step": 152200 + }, + { + "epoch": 16.951219512195124, + "grad_norm": 0.0008683640626259148, + "learning_rate": 3.4568173534237386e-06, + "loss": 0.068, + "num_input_tokens_seen": 185206672, + "step": 152205 + }, + { + "epoch": 16.95177636707874, + "grad_norm": 0.4191756546497345, + "learning_rate": 3.455584674956991e-06, + "loss": 0.1188, + "num_input_tokens_seen": 185213040, + "step": 152210 + }, + { + "epoch": 16.952333221962355, + "grad_norm": 0.09858513623476028, + "learning_rate": 3.4543521999945067e-06, + "loss": 0.0026, + "num_input_tokens_seen": 185218672, + "step": 152215 + }, + { + "epoch": 16.952890076845975, + "grad_norm": 0.6631307601928711, + "learning_rate": 3.4531199285479294e-06, + "loss": 0.0313, + "num_input_tokens_seen": 185224720, + "step": 152220 + }, + { + "epoch": 16.95344693172959, + "grad_norm": 0.008765674196183681, + "learning_rate": 3.451887860628897e-06, + "loss": 0.0041, + "num_input_tokens_seen": 185230512, + "step": 152225 + }, + { + "epoch": 16.95400378661321, + "grad_norm": 0.03383491560816765, + "learning_rate": 3.4506559962490415e-06, + "loss": 0.0244, + "num_input_tokens_seen": 185236848, + "step": 152230 + }, + { + "epoch": 16.954560641496826, + "grad_norm": 0.10392244905233383, + "learning_rate": 3.4494243354200096e-06, + "loss": 0.0216, + "num_input_tokens_seen": 185243568, + "step": 152235 + }, + { + "epoch": 16.955117496380442, + "grad_norm": 0.3446562588214874, + "learning_rate": 3.448192878153428e-06, + "loss": 0.021, + "num_input_tokens_seen": 185249136, + "step": 152240 + }, + { + "epoch": 16.95567435126406, + "grad_norm": 0.17810723185539246, + "learning_rate": 3.4469616244609425e-06, + "loss": 0.0041, + "num_input_tokens_seen": 185255312, + "step": 152245 + }, + { + "epoch": 16.956231206147677, + "grad_norm": 0.03726468235254288, + "learning_rate": 3.4457305743541585e-06, + "loss": 0.0071, + "num_input_tokens_seen": 185261616, + "step": 152250 + }, + { + "epoch": 16.956788061031297, + "grad_norm": 0.9589111804962158, + "learning_rate": 3.444499727844727e-06, + "loss": 0.0141, + "num_input_tokens_seen": 185267856, + "step": 152255 + }, + { + "epoch": 16.957344915914913, + "grad_norm": 0.15854987502098083, + "learning_rate": 3.443269084944259e-06, + "loss": 0.002, + "num_input_tokens_seen": 185274128, + "step": 152260 + }, + { + "epoch": 16.95790177079853, + "grad_norm": 0.00020187355403322726, + "learning_rate": 3.4420386456643916e-06, + "loss": 0.0126, + "num_input_tokens_seen": 185280048, + "step": 152265 + }, + { + "epoch": 16.958458625682148, + "grad_norm": 0.025940146297216415, + "learning_rate": 3.4408084100167407e-06, + "loss": 0.0784, + "num_input_tokens_seen": 185285616, + "step": 152270 + }, + { + "epoch": 16.959015480565764, + "grad_norm": 0.007709791883826256, + "learning_rate": 3.4395783780129255e-06, + "loss": 0.0103, + "num_input_tokens_seen": 185291824, + "step": 152275 + }, + { + "epoch": 16.959572335449383, + "grad_norm": 0.05081889405846596, + "learning_rate": 3.4383485496645634e-06, + "loss": 0.0189, + "num_input_tokens_seen": 185298224, + "step": 152280 + }, + { + "epoch": 16.960129190333, + "grad_norm": 0.044227804988622665, + "learning_rate": 3.437118924983279e-06, + "loss": 0.0171, + "num_input_tokens_seen": 185304400, + "step": 152285 + }, + { + "epoch": 16.960686045216615, + "grad_norm": 1.099434733390808, + "learning_rate": 3.43588950398068e-06, + "loss": 0.0217, + "num_input_tokens_seen": 185310448, + "step": 152290 + }, + { + "epoch": 16.961242900100235, + "grad_norm": 0.000536029867362231, + "learning_rate": 3.434660286668384e-06, + "loss": 0.0093, + "num_input_tokens_seen": 185316656, + "step": 152295 + }, + { + "epoch": 16.96179975498385, + "grad_norm": 0.3015196621417999, + "learning_rate": 3.4334312730579937e-06, + "loss": 0.0153, + "num_input_tokens_seen": 185322608, + "step": 152300 + }, + { + "epoch": 16.96235660986747, + "grad_norm": 0.10171904414892197, + "learning_rate": 3.43220246316113e-06, + "loss": 0.0574, + "num_input_tokens_seen": 185328752, + "step": 152305 + }, + { + "epoch": 16.962913464751086, + "grad_norm": 0.04639098420739174, + "learning_rate": 3.430973856989386e-06, + "loss": 0.0113, + "num_input_tokens_seen": 185334864, + "step": 152310 + }, + { + "epoch": 16.963470319634702, + "grad_norm": 0.038216203451156616, + "learning_rate": 3.4297454545543868e-06, + "loss": 0.0402, + "num_input_tokens_seen": 185340688, + "step": 152315 + }, + { + "epoch": 16.96402717451832, + "grad_norm": 0.015130246058106422, + "learning_rate": 3.4285172558677136e-06, + "loss": 0.0065, + "num_input_tokens_seen": 185346864, + "step": 152320 + }, + { + "epoch": 16.964584029401937, + "grad_norm": 0.7174017429351807, + "learning_rate": 3.427289260940983e-06, + "loss": 0.1317, + "num_input_tokens_seen": 185352592, + "step": 152325 + }, + { + "epoch": 16.965140884285557, + "grad_norm": 0.0019029553513973951, + "learning_rate": 3.4260614697857823e-06, + "loss": 0.0003, + "num_input_tokens_seen": 185358608, + "step": 152330 + }, + { + "epoch": 16.965697739169173, + "grad_norm": 0.2726655602455139, + "learning_rate": 3.4248338824137223e-06, + "loss": 0.0037, + "num_input_tokens_seen": 185364784, + "step": 152335 + }, + { + "epoch": 16.96625459405279, + "grad_norm": 0.40664488077163696, + "learning_rate": 3.423606498836393e-06, + "loss": 0.0122, + "num_input_tokens_seen": 185370832, + "step": 152340 + }, + { + "epoch": 16.966811448936408, + "grad_norm": 0.0007958993664942682, + "learning_rate": 3.422379319065386e-06, + "loss": 0.0353, + "num_input_tokens_seen": 185376528, + "step": 152345 + }, + { + "epoch": 16.967368303820024, + "grad_norm": 0.008133267052471638, + "learning_rate": 3.421152343112288e-06, + "loss": 0.0546, + "num_input_tokens_seen": 185382768, + "step": 152350 + }, + { + "epoch": 16.967925158703643, + "grad_norm": 0.012748876586556435, + "learning_rate": 3.4199255709886995e-06, + "loss": 0.024, + "num_input_tokens_seen": 185388880, + "step": 152355 + }, + { + "epoch": 16.96848201358726, + "grad_norm": 0.004517888184636831, + "learning_rate": 3.4186990027062048e-06, + "loss": 0.0023, + "num_input_tokens_seen": 185395184, + "step": 152360 + }, + { + "epoch": 16.969038868470875, + "grad_norm": 0.0004552869067993015, + "learning_rate": 3.4174726382763893e-06, + "loss": 0.0015, + "num_input_tokens_seen": 185401328, + "step": 152365 + }, + { + "epoch": 16.969595723354495, + "grad_norm": 2.1375412940979004, + "learning_rate": 3.41624647771083e-06, + "loss": 0.0757, + "num_input_tokens_seen": 185407632, + "step": 152370 + }, + { + "epoch": 16.97015257823811, + "grad_norm": 0.0018996804719790816, + "learning_rate": 3.41502052102112e-06, + "loss": 0.0914, + "num_input_tokens_seen": 185413712, + "step": 152375 + }, + { + "epoch": 16.97070943312173, + "grad_norm": 0.3644993305206299, + "learning_rate": 3.413794768218831e-06, + "loss": 0.0754, + "num_input_tokens_seen": 185420112, + "step": 152380 + }, + { + "epoch": 16.971266288005346, + "grad_norm": 1.0489094257354736, + "learning_rate": 3.4125692193155507e-06, + "loss": 0.0539, + "num_input_tokens_seen": 185426480, + "step": 152385 + }, + { + "epoch": 16.97182314288896, + "grad_norm": 0.6594207882881165, + "learning_rate": 3.4113438743228474e-06, + "loss": 0.0265, + "num_input_tokens_seen": 185432240, + "step": 152390 + }, + { + "epoch": 16.97237999777258, + "grad_norm": 0.005053953733295202, + "learning_rate": 3.4101187332523017e-06, + "loss": 0.0608, + "num_input_tokens_seen": 185438352, + "step": 152395 + }, + { + "epoch": 16.972936852656197, + "grad_norm": 1.3386280536651611, + "learning_rate": 3.4088937961154726e-06, + "loss": 0.015, + "num_input_tokens_seen": 185444336, + "step": 152400 + }, + { + "epoch": 16.973493707539816, + "grad_norm": 1.1200660467147827, + "learning_rate": 3.407669062923946e-06, + "loss": 0.0452, + "num_input_tokens_seen": 185450672, + "step": 152405 + }, + { + "epoch": 16.974050562423432, + "grad_norm": 0.005447758827358484, + "learning_rate": 3.4064445336892877e-06, + "loss": 0.0238, + "num_input_tokens_seen": 185456464, + "step": 152410 + }, + { + "epoch": 16.97460741730705, + "grad_norm": 0.07627296447753906, + "learning_rate": 3.405220208423057e-06, + "loss": 0.0166, + "num_input_tokens_seen": 185462640, + "step": 152415 + }, + { + "epoch": 16.975164272190668, + "grad_norm": 1.4086271524429321, + "learning_rate": 3.403996087136821e-06, + "loss": 0.12, + "num_input_tokens_seen": 185468528, + "step": 152420 + }, + { + "epoch": 16.975721127074284, + "grad_norm": 0.00446690758690238, + "learning_rate": 3.4027721698421473e-06, + "loss": 0.0588, + "num_input_tokens_seen": 185474512, + "step": 152425 + }, + { + "epoch": 16.976277981957903, + "grad_norm": 1.2421574592590332, + "learning_rate": 3.4015484565505904e-06, + "loss": 0.0362, + "num_input_tokens_seen": 185480528, + "step": 152430 + }, + { + "epoch": 16.97683483684152, + "grad_norm": 0.6636493802070618, + "learning_rate": 3.4003249472737223e-06, + "loss": 0.1419, + "num_input_tokens_seen": 185486448, + "step": 152435 + }, + { + "epoch": 16.977391691725135, + "grad_norm": 0.0690678283572197, + "learning_rate": 3.399101642023081e-06, + "loss": 0.0153, + "num_input_tokens_seen": 185492080, + "step": 152440 + }, + { + "epoch": 16.977948546608754, + "grad_norm": 0.00016279863484669477, + "learning_rate": 3.3978785408102342e-06, + "loss": 0.1181, + "num_input_tokens_seen": 185498576, + "step": 152445 + }, + { + "epoch": 16.97850540149237, + "grad_norm": 0.8219866156578064, + "learning_rate": 3.396655643646729e-06, + "loss": 0.0474, + "num_input_tokens_seen": 185504816, + "step": 152450 + }, + { + "epoch": 16.97906225637599, + "grad_norm": 0.007006362080574036, + "learning_rate": 3.395432950544125e-06, + "loss": 0.0829, + "num_input_tokens_seen": 185511056, + "step": 152455 + }, + { + "epoch": 16.979619111259606, + "grad_norm": 0.05166994035243988, + "learning_rate": 3.3942104615139637e-06, + "loss": 0.0118, + "num_input_tokens_seen": 185516816, + "step": 152460 + }, + { + "epoch": 16.98017596614322, + "grad_norm": 0.18010610342025757, + "learning_rate": 3.392988176567796e-06, + "loss": 0.094, + "num_input_tokens_seen": 185522768, + "step": 152465 + }, + { + "epoch": 16.98073282102684, + "grad_norm": 0.0691428929567337, + "learning_rate": 3.391766095717161e-06, + "loss": 0.0188, + "num_input_tokens_seen": 185529104, + "step": 152470 + }, + { + "epoch": 16.981289675910457, + "grad_norm": 0.0007131416932679713, + "learning_rate": 3.3905442189736124e-06, + "loss": 0.0684, + "num_input_tokens_seen": 185535184, + "step": 152475 + }, + { + "epoch": 16.981846530794076, + "grad_norm": 0.0018895692192018032, + "learning_rate": 3.389322546348686e-06, + "loss": 0.0092, + "num_input_tokens_seen": 185541360, + "step": 152480 + }, + { + "epoch": 16.982403385677692, + "grad_norm": 3.7237563133239746, + "learning_rate": 3.388101077853925e-06, + "loss": 0.0295, + "num_input_tokens_seen": 185547344, + "step": 152485 + }, + { + "epoch": 16.982960240561308, + "grad_norm": 0.24540282785892487, + "learning_rate": 3.3868798135008566e-06, + "loss": 0.004, + "num_input_tokens_seen": 185553392, + "step": 152490 + }, + { + "epoch": 16.983517095444927, + "grad_norm": 1.2136083841323853, + "learning_rate": 3.3856587533010324e-06, + "loss": 0.0267, + "num_input_tokens_seen": 185559408, + "step": 152495 + }, + { + "epoch": 16.984073950328543, + "grad_norm": 0.5593641996383667, + "learning_rate": 3.384437897265971e-06, + "loss": 0.0084, + "num_input_tokens_seen": 185565456, + "step": 152500 + }, + { + "epoch": 16.984630805212163, + "grad_norm": 2.559868812561035, + "learning_rate": 3.383217245407222e-06, + "loss": 0.0334, + "num_input_tokens_seen": 185571728, + "step": 152505 + }, + { + "epoch": 16.98518766009578, + "grad_norm": 0.3029593825340271, + "learning_rate": 3.3819967977362975e-06, + "loss": 0.0087, + "num_input_tokens_seen": 185577712, + "step": 152510 + }, + { + "epoch": 16.985744514979395, + "grad_norm": 1.0659817457199097, + "learning_rate": 3.380776554264736e-06, + "loss": 0.0461, + "num_input_tokens_seen": 185583984, + "step": 152515 + }, + { + "epoch": 16.986301369863014, + "grad_norm": 0.6594372987747192, + "learning_rate": 3.3795565150040566e-06, + "loss": 0.0112, + "num_input_tokens_seen": 185590096, + "step": 152520 + }, + { + "epoch": 16.98685822474663, + "grad_norm": 0.01621662825345993, + "learning_rate": 3.3783366799657934e-06, + "loss": 0.0087, + "num_input_tokens_seen": 185596272, + "step": 152525 + }, + { + "epoch": 16.98741507963025, + "grad_norm": 1.0253615379333496, + "learning_rate": 3.3771170491614603e-06, + "loss": 0.0459, + "num_input_tokens_seen": 185602352, + "step": 152530 + }, + { + "epoch": 16.987971934513865, + "grad_norm": 0.026584433391690254, + "learning_rate": 3.3758976226025833e-06, + "loss": 0.0366, + "num_input_tokens_seen": 185608464, + "step": 152535 + }, + { + "epoch": 16.988528789397485, + "grad_norm": 0.2595502734184265, + "learning_rate": 3.374678400300674e-06, + "loss": 0.1598, + "num_input_tokens_seen": 185614704, + "step": 152540 + }, + { + "epoch": 16.9890856442811, + "grad_norm": 0.13764041662216187, + "learning_rate": 3.3734593822672578e-06, + "loss": 0.0066, + "num_input_tokens_seen": 185621040, + "step": 152545 + }, + { + "epoch": 16.989642499164717, + "grad_norm": 0.9901371598243713, + "learning_rate": 3.372240568513843e-06, + "loss": 0.0251, + "num_input_tokens_seen": 185627216, + "step": 152550 + }, + { + "epoch": 16.990199354048336, + "grad_norm": 2.645385503768921, + "learning_rate": 3.3710219590519453e-06, + "loss": 0.1569, + "num_input_tokens_seen": 185633456, + "step": 152555 + }, + { + "epoch": 16.990756208931952, + "grad_norm": 0.08465829491615295, + "learning_rate": 3.3698035538930666e-06, + "loss": 0.156, + "num_input_tokens_seen": 185639696, + "step": 152560 + }, + { + "epoch": 16.99131306381557, + "grad_norm": 0.006127234548330307, + "learning_rate": 3.368585353048731e-06, + "loss": 0.052, + "num_input_tokens_seen": 185645936, + "step": 152565 + }, + { + "epoch": 16.991869918699187, + "grad_norm": 1.2955124378204346, + "learning_rate": 3.367367356530432e-06, + "loss": 0.0224, + "num_input_tokens_seen": 185652016, + "step": 152570 + }, + { + "epoch": 16.992426773582803, + "grad_norm": 0.0014181723818182945, + "learning_rate": 3.3661495643496853e-06, + "loss": 0.0088, + "num_input_tokens_seen": 185658064, + "step": 152575 + }, + { + "epoch": 16.992983628466423, + "grad_norm": 0.09569817781448364, + "learning_rate": 3.3649319765179875e-06, + "loss": 0.004, + "num_input_tokens_seen": 185664144, + "step": 152580 + }, + { + "epoch": 16.99354048335004, + "grad_norm": 1.3846632242202759, + "learning_rate": 3.3637145930468405e-06, + "loss": 0.1077, + "num_input_tokens_seen": 185670640, + "step": 152585 + }, + { + "epoch": 16.994097338233658, + "grad_norm": 0.18697838485240936, + "learning_rate": 3.3624974139477403e-06, + "loss": 0.0046, + "num_input_tokens_seen": 185676848, + "step": 152590 + }, + { + "epoch": 16.994654193117274, + "grad_norm": 0.012041154317557812, + "learning_rate": 3.3612804392321923e-06, + "loss": 0.1605, + "num_input_tokens_seen": 185682864, + "step": 152595 + }, + { + "epoch": 16.99521104800089, + "grad_norm": 0.15539966523647308, + "learning_rate": 3.3600636689116867e-06, + "loss": 0.0289, + "num_input_tokens_seen": 185688848, + "step": 152600 + }, + { + "epoch": 16.99576790288451, + "grad_norm": 0.8889099359512329, + "learning_rate": 3.3588471029977196e-06, + "loss": 0.1652, + "num_input_tokens_seen": 185695088, + "step": 152605 + }, + { + "epoch": 16.996324757768125, + "grad_norm": 0.01725788041949272, + "learning_rate": 3.3576307415017716e-06, + "loss": 0.0055, + "num_input_tokens_seen": 185701424, + "step": 152610 + }, + { + "epoch": 16.996881612651745, + "grad_norm": 0.08823935687541962, + "learning_rate": 3.3564145844353497e-06, + "loss": 0.0125, + "num_input_tokens_seen": 185707632, + "step": 152615 + }, + { + "epoch": 16.99743846753536, + "grad_norm": 2.7831625938415527, + "learning_rate": 3.3551986318099302e-06, + "loss": 0.1603, + "num_input_tokens_seen": 185713616, + "step": 152620 + }, + { + "epoch": 16.997995322418976, + "grad_norm": 1.4440760612487793, + "learning_rate": 3.3539828836370025e-06, + "loss": 0.0497, + "num_input_tokens_seen": 185719984, + "step": 152625 + }, + { + "epoch": 16.998552177302596, + "grad_norm": 0.12434684485197067, + "learning_rate": 3.352767339928048e-06, + "loss": 0.0937, + "num_input_tokens_seen": 185725872, + "step": 152630 + }, + { + "epoch": 16.99910903218621, + "grad_norm": 0.0010849999962374568, + "learning_rate": 3.351552000694544e-06, + "loss": 0.0376, + "num_input_tokens_seen": 185731856, + "step": 152635 + }, + { + "epoch": 16.99966588706983, + "grad_norm": 0.7773869037628174, + "learning_rate": 3.350336865947981e-06, + "loss": 0.0756, + "num_input_tokens_seen": 185738032, + "step": 152640 + }, + { + "epoch": 17.0, + "eval_loss": 0.0816754400730133, + "eval_runtime": 111.8883, + "eval_samples_per_second": 35.67, + "eval_steps_per_second": 8.92, + "num_input_tokens_seen": 185740864, + "step": 152643 + }, + { + "epoch": 17.000222741953447, + "grad_norm": 0.14904525876045227, + "learning_rate": 3.3491219356998253e-06, + "loss": 0.0977, + "num_input_tokens_seen": 185743328, + "step": 152645 + }, + { + "epoch": 17.000779596837063, + "grad_norm": 0.0012053472455590963, + "learning_rate": 3.3479072099615734e-06, + "loss": 0.0351, + "num_input_tokens_seen": 185748992, + "step": 152650 + }, + { + "epoch": 17.001336451720682, + "grad_norm": 0.001479665981605649, + "learning_rate": 3.346692688744671e-06, + "loss": 0.0906, + "num_input_tokens_seen": 185755232, + "step": 152655 + }, + { + "epoch": 17.0018933066043, + "grad_norm": 0.06686029583215714, + "learning_rate": 3.345478372060612e-06, + "loss": 0.0136, + "num_input_tokens_seen": 185761216, + "step": 152660 + }, + { + "epoch": 17.002450161487918, + "grad_norm": 0.015570646151900291, + "learning_rate": 3.344264259920854e-06, + "loss": 0.0684, + "num_input_tokens_seen": 185767456, + "step": 152665 + }, + { + "epoch": 17.003007016371534, + "grad_norm": 0.024645760655403137, + "learning_rate": 3.343050352336874e-06, + "loss": 0.0515, + "num_input_tokens_seen": 185773408, + "step": 152670 + }, + { + "epoch": 17.00356387125515, + "grad_norm": 0.04459632560610771, + "learning_rate": 3.3418366493201376e-06, + "loss": 0.0725, + "num_input_tokens_seen": 185779296, + "step": 152675 + }, + { + "epoch": 17.00412072613877, + "grad_norm": 0.02181817777454853, + "learning_rate": 3.3406231508821024e-06, + "loss": 0.0413, + "num_input_tokens_seen": 185785440, + "step": 152680 + }, + { + "epoch": 17.004677581022385, + "grad_norm": 0.006956351455301046, + "learning_rate": 3.3394098570342314e-06, + "loss": 0.0104, + "num_input_tokens_seen": 185791712, + "step": 152685 + }, + { + "epoch": 17.005234435906004, + "grad_norm": 0.06670542061328888, + "learning_rate": 3.3381967677879934e-06, + "loss": 0.0141, + "num_input_tokens_seen": 185797920, + "step": 152690 + }, + { + "epoch": 17.00579129078962, + "grad_norm": 0.000498702866025269, + "learning_rate": 3.3369838831548428e-06, + "loss": 0.0248, + "num_input_tokens_seen": 185804128, + "step": 152695 + }, + { + "epoch": 17.006348145673236, + "grad_norm": 0.020015349611639977, + "learning_rate": 3.3357712031462345e-06, + "loss": 0.0105, + "num_input_tokens_seen": 185810432, + "step": 152700 + }, + { + "epoch": 17.006905000556856, + "grad_norm": 0.0006099914317019284, + "learning_rate": 3.3345587277736207e-06, + "loss": 0.0282, + "num_input_tokens_seen": 185816672, + "step": 152705 + }, + { + "epoch": 17.00746185544047, + "grad_norm": 0.47717350721359253, + "learning_rate": 3.333346457048461e-06, + "loss": 0.0996, + "num_input_tokens_seen": 185822624, + "step": 152710 + }, + { + "epoch": 17.00801871032409, + "grad_norm": 1.1084423065185547, + "learning_rate": 3.3321343909822e-06, + "loss": 0.0543, + "num_input_tokens_seen": 185828736, + "step": 152715 + }, + { + "epoch": 17.008575565207707, + "grad_norm": 0.29398611187934875, + "learning_rate": 3.3309225295863e-06, + "loss": 0.0259, + "num_input_tokens_seen": 185834720, + "step": 152720 + }, + { + "epoch": 17.009132420091323, + "grad_norm": 1.7538657188415527, + "learning_rate": 3.3297108728721853e-06, + "loss": 0.0801, + "num_input_tokens_seen": 185840128, + "step": 152725 + }, + { + "epoch": 17.009689274974942, + "grad_norm": 0.006660863757133484, + "learning_rate": 3.3284994208513194e-06, + "loss": 0.0097, + "num_input_tokens_seen": 185845696, + "step": 152730 + }, + { + "epoch": 17.010246129858558, + "grad_norm": 0.17504122853279114, + "learning_rate": 3.3272881735351343e-06, + "loss": 0.0536, + "num_input_tokens_seen": 185851968, + "step": 152735 + }, + { + "epoch": 17.010802984742178, + "grad_norm": 0.00507623003795743, + "learning_rate": 3.326077130935079e-06, + "loss": 0.0697, + "num_input_tokens_seen": 185858144, + "step": 152740 + }, + { + "epoch": 17.011359839625793, + "grad_norm": 0.13891123235225677, + "learning_rate": 3.324866293062595e-06, + "loss": 0.0068, + "num_input_tokens_seen": 185864288, + "step": 152745 + }, + { + "epoch": 17.01191669450941, + "grad_norm": 0.0025807861238718033, + "learning_rate": 3.323655659929112e-06, + "loss": 0.0245, + "num_input_tokens_seen": 185870112, + "step": 152750 + }, + { + "epoch": 17.01247354939303, + "grad_norm": 0.0002800787042360753, + "learning_rate": 3.322445231546062e-06, + "loss": 0.0728, + "num_input_tokens_seen": 185876000, + "step": 152755 + }, + { + "epoch": 17.013030404276645, + "grad_norm": 1.07905113697052, + "learning_rate": 3.3212350079248915e-06, + "loss": 0.0878, + "num_input_tokens_seen": 185881888, + "step": 152760 + }, + { + "epoch": 17.013587259160264, + "grad_norm": 0.03195451945066452, + "learning_rate": 3.3200249890770223e-06, + "loss": 0.0078, + "num_input_tokens_seen": 185887776, + "step": 152765 + }, + { + "epoch": 17.01414411404388, + "grad_norm": 0.2709847390651703, + "learning_rate": 3.3188151750138896e-06, + "loss": 0.0767, + "num_input_tokens_seen": 185893824, + "step": 152770 + }, + { + "epoch": 17.014700968927496, + "grad_norm": 0.03585409000515938, + "learning_rate": 3.317605565746912e-06, + "loss": 0.0933, + "num_input_tokens_seen": 185899936, + "step": 152775 + }, + { + "epoch": 17.015257823811115, + "grad_norm": 0.0717037245631218, + "learning_rate": 3.316396161287527e-06, + "loss": 0.0075, + "num_input_tokens_seen": 185905728, + "step": 152780 + }, + { + "epoch": 17.01581467869473, + "grad_norm": 1.9634387493133545, + "learning_rate": 3.315186961647149e-06, + "loss": 0.0397, + "num_input_tokens_seen": 185911904, + "step": 152785 + }, + { + "epoch": 17.01637153357835, + "grad_norm": 0.014734347350895405, + "learning_rate": 3.3139779668372064e-06, + "loss": 0.0094, + "num_input_tokens_seen": 185918144, + "step": 152790 + }, + { + "epoch": 17.016928388461967, + "grad_norm": 0.4578298032283783, + "learning_rate": 3.3127691768691183e-06, + "loss": 0.127, + "num_input_tokens_seen": 185924096, + "step": 152795 + }, + { + "epoch": 17.017485243345583, + "grad_norm": 0.021183915436267853, + "learning_rate": 3.3115605917543007e-06, + "loss": 0.0083, + "num_input_tokens_seen": 185930240, + "step": 152800 + }, + { + "epoch": 17.018042098229202, + "grad_norm": 2.5308377742767334, + "learning_rate": 3.3103522115041642e-06, + "loss": 0.035, + "num_input_tokens_seen": 185936640, + "step": 152805 + }, + { + "epoch": 17.018598953112818, + "grad_norm": 0.00042806915007531643, + "learning_rate": 3.3091440361301325e-06, + "loss": 0.1005, + "num_input_tokens_seen": 185942464, + "step": 152810 + }, + { + "epoch": 17.019155807996437, + "grad_norm": 0.0023196388501673937, + "learning_rate": 3.307936065643616e-06, + "loss": 0.0055, + "num_input_tokens_seen": 185948704, + "step": 152815 + }, + { + "epoch": 17.019712662880053, + "grad_norm": 0.0001707628689473495, + "learning_rate": 3.306728300056022e-06, + "loss": 0.004, + "num_input_tokens_seen": 185954976, + "step": 152820 + }, + { + "epoch": 17.02026951776367, + "grad_norm": 0.7811126708984375, + "learning_rate": 3.3055207393787536e-06, + "loss": 0.0951, + "num_input_tokens_seen": 185961152, + "step": 152825 + }, + { + "epoch": 17.02082637264729, + "grad_norm": 0.011721830815076828, + "learning_rate": 3.304313383623228e-06, + "loss": 0.0038, + "num_input_tokens_seen": 185967296, + "step": 152830 + }, + { + "epoch": 17.021383227530904, + "grad_norm": 0.07572437822818756, + "learning_rate": 3.30310623280084e-06, + "loss": 0.0751, + "num_input_tokens_seen": 185973504, + "step": 152835 + }, + { + "epoch": 17.021940082414524, + "grad_norm": 0.14143621921539307, + "learning_rate": 3.3018992869230074e-06, + "loss": 0.0068, + "num_input_tokens_seen": 185979456, + "step": 152840 + }, + { + "epoch": 17.02249693729814, + "grad_norm": 0.0011175924446433783, + "learning_rate": 3.3006925460011107e-06, + "loss": 0.0071, + "num_input_tokens_seen": 185985664, + "step": 152845 + }, + { + "epoch": 17.023053792181756, + "grad_norm": 0.018846964463591576, + "learning_rate": 3.2994860100465625e-06, + "loss": 0.0015, + "num_input_tokens_seen": 185992096, + "step": 152850 + }, + { + "epoch": 17.023610647065375, + "grad_norm": 0.01726309396326542, + "learning_rate": 3.298279679070751e-06, + "loss": 0.0028, + "num_input_tokens_seen": 185998272, + "step": 152855 + }, + { + "epoch": 17.02416750194899, + "grad_norm": 0.8847387433052063, + "learning_rate": 3.2970735530850784e-06, + "loss": 0.0199, + "num_input_tokens_seen": 186004480, + "step": 152860 + }, + { + "epoch": 17.02472435683261, + "grad_norm": 0.10309360921382904, + "learning_rate": 3.2958676321009354e-06, + "loss": 0.0707, + "num_input_tokens_seen": 186010720, + "step": 152865 + }, + { + "epoch": 17.025281211716226, + "grad_norm": 0.0010029986733570695, + "learning_rate": 3.294661916129713e-06, + "loss": 0.0173, + "num_input_tokens_seen": 186016896, + "step": 152870 + }, + { + "epoch": 17.025838066599842, + "grad_norm": 0.12490807473659515, + "learning_rate": 3.2934564051827906e-06, + "loss": 0.0033, + "num_input_tokens_seen": 186022784, + "step": 152875 + }, + { + "epoch": 17.026394921483462, + "grad_norm": 0.007991373538970947, + "learning_rate": 3.2922510992715706e-06, + "loss": 0.0683, + "num_input_tokens_seen": 186028256, + "step": 152880 + }, + { + "epoch": 17.026951776367078, + "grad_norm": 0.00013814917474519461, + "learning_rate": 3.2910459984074297e-06, + "loss": 0.0874, + "num_input_tokens_seen": 186034624, + "step": 152885 + }, + { + "epoch": 17.027508631250697, + "grad_norm": 0.0058649745769798756, + "learning_rate": 3.289841102601754e-06, + "loss": 0.0032, + "num_input_tokens_seen": 186040992, + "step": 152890 + }, + { + "epoch": 17.028065486134313, + "grad_norm": 0.009077992290258408, + "learning_rate": 3.288636411865914e-06, + "loss": 0.0981, + "num_input_tokens_seen": 186047328, + "step": 152895 + }, + { + "epoch": 17.028622341017932, + "grad_norm": 0.0023378082551062107, + "learning_rate": 3.287431926211307e-06, + "loss": 0.0012, + "num_input_tokens_seen": 186053248, + "step": 152900 + }, + { + "epoch": 17.02917919590155, + "grad_norm": 0.5698365569114685, + "learning_rate": 3.286227645649295e-06, + "loss": 0.0406, + "num_input_tokens_seen": 186059232, + "step": 152905 + }, + { + "epoch": 17.029736050785164, + "grad_norm": 0.001694013481028378, + "learning_rate": 3.2850235701912703e-06, + "loss": 0.0122, + "num_input_tokens_seen": 186065504, + "step": 152910 + }, + { + "epoch": 17.030292905668784, + "grad_norm": 0.0106316227465868, + "learning_rate": 3.2838196998485837e-06, + "loss": 0.0183, + "num_input_tokens_seen": 186071616, + "step": 152915 + }, + { + "epoch": 17.0308497605524, + "grad_norm": 0.01482126023620367, + "learning_rate": 3.282616034632627e-06, + "loss": 0.0234, + "num_input_tokens_seen": 186077856, + "step": 152920 + }, + { + "epoch": 17.03140661543602, + "grad_norm": 0.0007025664672255516, + "learning_rate": 3.281412574554754e-06, + "loss": 0.0148, + "num_input_tokens_seen": 186083936, + "step": 152925 + }, + { + "epoch": 17.031963470319635, + "grad_norm": 1.0604512691497803, + "learning_rate": 3.2802093196263457e-06, + "loss": 0.0572, + "num_input_tokens_seen": 186090048, + "step": 152930 + }, + { + "epoch": 17.03252032520325, + "grad_norm": 0.9493423700332642, + "learning_rate": 3.2790062698587614e-06, + "loss": 0.0261, + "num_input_tokens_seen": 186095520, + "step": 152935 + }, + { + "epoch": 17.03307718008687, + "grad_norm": 0.0003668198478408158, + "learning_rate": 3.2778034252633678e-06, + "loss": 0.0486, + "num_input_tokens_seen": 186101824, + "step": 152940 + }, + { + "epoch": 17.033634034970486, + "grad_norm": 2.1091997623443604, + "learning_rate": 3.276600785851516e-06, + "loss": 0.147, + "num_input_tokens_seen": 186108128, + "step": 152945 + }, + { + "epoch": 17.034190889854106, + "grad_norm": 4.931766510009766, + "learning_rate": 3.2753983516345815e-06, + "loss": 0.0931, + "num_input_tokens_seen": 186114368, + "step": 152950 + }, + { + "epoch": 17.03474774473772, + "grad_norm": 0.18111518025398254, + "learning_rate": 3.274196122623915e-06, + "loss": 0.0293, + "num_input_tokens_seen": 186120256, + "step": 152955 + }, + { + "epoch": 17.035304599621337, + "grad_norm": 0.12187755852937698, + "learning_rate": 3.2729940988308726e-06, + "loss": 0.0686, + "num_input_tokens_seen": 186126400, + "step": 152960 + }, + { + "epoch": 17.035861454504957, + "grad_norm": 0.001066560740582645, + "learning_rate": 3.2717922802668e-06, + "loss": 0.0232, + "num_input_tokens_seen": 186132160, + "step": 152965 + }, + { + "epoch": 17.036418309388573, + "grad_norm": 0.22552305459976196, + "learning_rate": 3.2705906669430656e-06, + "loss": 0.0118, + "num_input_tokens_seen": 186138240, + "step": 152970 + }, + { + "epoch": 17.036975164272192, + "grad_norm": 1.192132592201233, + "learning_rate": 3.2693892588710084e-06, + "loss": 0.0627, + "num_input_tokens_seen": 186144544, + "step": 152975 + }, + { + "epoch": 17.037532019155808, + "grad_norm": 0.00042200752068310976, + "learning_rate": 3.2681880560619827e-06, + "loss": 0.0173, + "num_input_tokens_seen": 186150528, + "step": 152980 + }, + { + "epoch": 17.038088874039424, + "grad_norm": 0.8563569188117981, + "learning_rate": 3.2669870585273323e-06, + "loss": 0.0135, + "num_input_tokens_seen": 186156672, + "step": 152985 + }, + { + "epoch": 17.038645728923044, + "grad_norm": 1.5096123218536377, + "learning_rate": 3.2657862662784008e-06, + "loss": 0.0518, + "num_input_tokens_seen": 186162656, + "step": 152990 + }, + { + "epoch": 17.03920258380666, + "grad_norm": 0.0932169035077095, + "learning_rate": 3.264585679326526e-06, + "loss": 0.0103, + "num_input_tokens_seen": 186168320, + "step": 152995 + }, + { + "epoch": 17.03975943869028, + "grad_norm": 0.01393675897270441, + "learning_rate": 3.2633852976830583e-06, + "loss": 0.0059, + "num_input_tokens_seen": 186174432, + "step": 153000 + }, + { + "epoch": 17.040316293573895, + "grad_norm": 0.011701060459017754, + "learning_rate": 3.2621851213593347e-06, + "loss": 0.068, + "num_input_tokens_seen": 186180416, + "step": 153005 + }, + { + "epoch": 17.04087314845751, + "grad_norm": 0.014295591972768307, + "learning_rate": 3.2609851503666856e-06, + "loss": 0.0858, + "num_input_tokens_seen": 186186336, + "step": 153010 + }, + { + "epoch": 17.04143000334113, + "grad_norm": 1.778726577758789, + "learning_rate": 3.2597853847164434e-06, + "loss": 0.03, + "num_input_tokens_seen": 186192544, + "step": 153015 + }, + { + "epoch": 17.041986858224746, + "grad_norm": 0.0018462513107806444, + "learning_rate": 3.2585858244199545e-06, + "loss": 0.0532, + "num_input_tokens_seen": 186198368, + "step": 153020 + }, + { + "epoch": 17.042543713108365, + "grad_norm": 1.0083503723144531, + "learning_rate": 3.2573864694885325e-06, + "loss": 0.0628, + "num_input_tokens_seen": 186204928, + "step": 153025 + }, + { + "epoch": 17.04310056799198, + "grad_norm": 0.8738461136817932, + "learning_rate": 3.256187319933529e-06, + "loss": 0.0856, + "num_input_tokens_seen": 186210944, + "step": 153030 + }, + { + "epoch": 17.043657422875597, + "grad_norm": 0.09798284620046616, + "learning_rate": 3.2549883757662493e-06, + "loss": 0.0017, + "num_input_tokens_seen": 186216960, + "step": 153035 + }, + { + "epoch": 17.044214277759217, + "grad_norm": 0.02893679030239582, + "learning_rate": 3.2537896369980227e-06, + "loss": 0.141, + "num_input_tokens_seen": 186222912, + "step": 153040 + }, + { + "epoch": 17.044771132642833, + "grad_norm": 0.0006712118629366159, + "learning_rate": 3.252591103640179e-06, + "loss": 0.0099, + "num_input_tokens_seen": 186229056, + "step": 153045 + }, + { + "epoch": 17.045327987526452, + "grad_norm": 0.5239187479019165, + "learning_rate": 3.251392775704032e-06, + "loss": 0.0419, + "num_input_tokens_seen": 186235328, + "step": 153050 + }, + { + "epoch": 17.045884842410068, + "grad_norm": 0.350168913602829, + "learning_rate": 3.2501946532009164e-06, + "loss": 0.0795, + "num_input_tokens_seen": 186241120, + "step": 153055 + }, + { + "epoch": 17.046441697293684, + "grad_norm": 0.8267657160758972, + "learning_rate": 3.2489967361421264e-06, + "loss": 0.0207, + "num_input_tokens_seen": 186246976, + "step": 153060 + }, + { + "epoch": 17.046998552177303, + "grad_norm": 0.07617633044719696, + "learning_rate": 3.2477990245389945e-06, + "loss": 0.0686, + "num_input_tokens_seen": 186252928, + "step": 153065 + }, + { + "epoch": 17.04755540706092, + "grad_norm": 0.001481765997596085, + "learning_rate": 3.2466015184028224e-06, + "loss": 0.0159, + "num_input_tokens_seen": 186259392, + "step": 153070 + }, + { + "epoch": 17.04811226194454, + "grad_norm": 0.3482379615306854, + "learning_rate": 3.2454042177449323e-06, + "loss": 0.0646, + "num_input_tokens_seen": 186265248, + "step": 153075 + }, + { + "epoch": 17.048669116828155, + "grad_norm": 0.8449116349220276, + "learning_rate": 3.2442071225766312e-06, + "loss": 0.0428, + "num_input_tokens_seen": 186271072, + "step": 153080 + }, + { + "epoch": 17.04922597171177, + "grad_norm": 0.02459230087697506, + "learning_rate": 3.2430102329092217e-06, + "loss": 0.0126, + "num_input_tokens_seen": 186276736, + "step": 153085 + }, + { + "epoch": 17.04978282659539, + "grad_norm": 1.3426427841186523, + "learning_rate": 3.241813548754008e-06, + "loss": 0.2082, + "num_input_tokens_seen": 186283168, + "step": 153090 + }, + { + "epoch": 17.050339681479006, + "grad_norm": 2.561112642288208, + "learning_rate": 3.2406170701223038e-06, + "loss": 0.138, + "num_input_tokens_seen": 186289440, + "step": 153095 + }, + { + "epoch": 17.050896536362625, + "grad_norm": 0.3188828229904175, + "learning_rate": 3.2394207970254027e-06, + "loss": 0.0149, + "num_input_tokens_seen": 186295520, + "step": 153100 + }, + { + "epoch": 17.05145339124624, + "grad_norm": 0.002251107944175601, + "learning_rate": 3.2382247294746065e-06, + "loss": 0.1052, + "num_input_tokens_seen": 186301696, + "step": 153105 + }, + { + "epoch": 17.052010246129857, + "grad_norm": 0.26744115352630615, + "learning_rate": 3.2370288674812095e-06, + "loss": 0.1067, + "num_input_tokens_seen": 186307552, + "step": 153110 + }, + { + "epoch": 17.052567101013477, + "grad_norm": 1.3332843780517578, + "learning_rate": 3.235833211056516e-06, + "loss": 0.0518, + "num_input_tokens_seen": 186313664, + "step": 153115 + }, + { + "epoch": 17.053123955897092, + "grad_norm": 0.0060786595568060875, + "learning_rate": 3.2346377602118087e-06, + "loss": 0.0142, + "num_input_tokens_seen": 186319936, + "step": 153120 + }, + { + "epoch": 17.053680810780712, + "grad_norm": 0.003967768047004938, + "learning_rate": 3.2334425149583903e-06, + "loss": 0.0499, + "num_input_tokens_seen": 186326112, + "step": 153125 + }, + { + "epoch": 17.054237665664328, + "grad_norm": 1.9658514261245728, + "learning_rate": 3.232247475307548e-06, + "loss": 0.0641, + "num_input_tokens_seen": 186332320, + "step": 153130 + }, + { + "epoch": 17.054794520547944, + "grad_norm": 0.008525537326931953, + "learning_rate": 3.2310526412705685e-06, + "loss": 0.0113, + "num_input_tokens_seen": 186338464, + "step": 153135 + }, + { + "epoch": 17.055351375431563, + "grad_norm": 0.07596252113580704, + "learning_rate": 3.229858012858733e-06, + "loss": 0.0055, + "num_input_tokens_seen": 186344672, + "step": 153140 + }, + { + "epoch": 17.05590823031518, + "grad_norm": 1.1165294647216797, + "learning_rate": 3.2286635900833336e-06, + "loss": 0.0479, + "num_input_tokens_seen": 186350400, + "step": 153145 + }, + { + "epoch": 17.0564650851988, + "grad_norm": 1.1072560548782349, + "learning_rate": 3.227469372955652e-06, + "loss": 0.0329, + "num_input_tokens_seen": 186356384, + "step": 153150 + }, + { + "epoch": 17.057021940082414, + "grad_norm": 0.00047515478217974305, + "learning_rate": 3.2262753614869633e-06, + "loss": 0.0359, + "num_input_tokens_seen": 186362336, + "step": 153155 + }, + { + "epoch": 17.05757879496603, + "grad_norm": 0.4634445905685425, + "learning_rate": 3.2250815556885446e-06, + "loss": 0.079, + "num_input_tokens_seen": 186368768, + "step": 153160 + }, + { + "epoch": 17.05813564984965, + "grad_norm": 0.06219099834561348, + "learning_rate": 3.2238879555716835e-06, + "loss": 0.0333, + "num_input_tokens_seen": 186374720, + "step": 153165 + }, + { + "epoch": 17.058692504733266, + "grad_norm": 0.002949472051113844, + "learning_rate": 3.2226945611476435e-06, + "loss": 0.0257, + "num_input_tokens_seen": 186380832, + "step": 153170 + }, + { + "epoch": 17.059249359616885, + "grad_norm": 0.0478246696293354, + "learning_rate": 3.2215013724277045e-06, + "loss": 0.0579, + "num_input_tokens_seen": 186386624, + "step": 153175 + }, + { + "epoch": 17.0598062145005, + "grad_norm": 0.16200892627239227, + "learning_rate": 3.2203083894231266e-06, + "loss": 0.0599, + "num_input_tokens_seen": 186392832, + "step": 153180 + }, + { + "epoch": 17.060363069384117, + "grad_norm": 0.2503531575202942, + "learning_rate": 3.219115612145193e-06, + "loss": 0.0235, + "num_input_tokens_seen": 186399008, + "step": 153185 + }, + { + "epoch": 17.060919924267736, + "grad_norm": 0.12000411748886108, + "learning_rate": 3.2179230406051558e-06, + "loss": 0.018, + "num_input_tokens_seen": 186405024, + "step": 153190 + }, + { + "epoch": 17.061476779151352, + "grad_norm": 1.2455761432647705, + "learning_rate": 3.216730674814292e-06, + "loss": 0.0542, + "num_input_tokens_seen": 186411264, + "step": 153195 + }, + { + "epoch": 17.06203363403497, + "grad_norm": 0.045520685613155365, + "learning_rate": 3.2155385147838614e-06, + "loss": 0.0057, + "num_input_tokens_seen": 186417056, + "step": 153200 + }, + { + "epoch": 17.062590488918588, + "grad_norm": 0.06575237214565277, + "learning_rate": 3.2143465605251196e-06, + "loss": 0.1692, + "num_input_tokens_seen": 186423168, + "step": 153205 + }, + { + "epoch": 17.063147343802203, + "grad_norm": 0.014536666683852673, + "learning_rate": 3.2131548120493265e-06, + "loss": 0.1151, + "num_input_tokens_seen": 186429184, + "step": 153210 + }, + { + "epoch": 17.063704198685823, + "grad_norm": 0.14436805248260498, + "learning_rate": 3.211963269367746e-06, + "loss": 0.1053, + "num_input_tokens_seen": 186435072, + "step": 153215 + }, + { + "epoch": 17.06426105356944, + "grad_norm": 0.07864027470350266, + "learning_rate": 3.21077193249163e-06, + "loss": 0.0077, + "num_input_tokens_seen": 186441184, + "step": 153220 + }, + { + "epoch": 17.06481790845306, + "grad_norm": 0.0034043737687170506, + "learning_rate": 3.2095808014322272e-06, + "loss": 0.007, + "num_input_tokens_seen": 186447392, + "step": 153225 + }, + { + "epoch": 17.065374763336674, + "grad_norm": 0.001466490444727242, + "learning_rate": 3.2083898762007877e-06, + "loss": 0.1077, + "num_input_tokens_seen": 186453248, + "step": 153230 + }, + { + "epoch": 17.06593161822029, + "grad_norm": 0.2576158940792084, + "learning_rate": 3.207199156808571e-06, + "loss": 0.0612, + "num_input_tokens_seen": 186459584, + "step": 153235 + }, + { + "epoch": 17.06648847310391, + "grad_norm": 1.7334394454956055, + "learning_rate": 3.206008643266814e-06, + "loss": 0.0331, + "num_input_tokens_seen": 186465728, + "step": 153240 + }, + { + "epoch": 17.067045327987525, + "grad_norm": 0.09446511417627335, + "learning_rate": 3.204818335586776e-06, + "loss": 0.0035, + "num_input_tokens_seen": 186472128, + "step": 153245 + }, + { + "epoch": 17.067602182871145, + "grad_norm": 0.0009617937612347305, + "learning_rate": 3.203628233779679e-06, + "loss": 0.0224, + "num_input_tokens_seen": 186477856, + "step": 153250 + }, + { + "epoch": 17.06815903775476, + "grad_norm": 1.38716459274292, + "learning_rate": 3.2024383378567837e-06, + "loss": 0.0666, + "num_input_tokens_seen": 186484160, + "step": 153255 + }, + { + "epoch": 17.06871589263838, + "grad_norm": 0.020414434373378754, + "learning_rate": 3.2012486478293163e-06, + "loss": 0.0262, + "num_input_tokens_seen": 186490240, + "step": 153260 + }, + { + "epoch": 17.069272747521996, + "grad_norm": 0.038808856159448624, + "learning_rate": 3.200059163708527e-06, + "loss": 0.0951, + "num_input_tokens_seen": 186496480, + "step": 153265 + }, + { + "epoch": 17.069829602405612, + "grad_norm": 0.012593960389494896, + "learning_rate": 3.1988698855056426e-06, + "loss": 0.0016, + "num_input_tokens_seen": 186502944, + "step": 153270 + }, + { + "epoch": 17.07038645728923, + "grad_norm": 2.0616464614868164, + "learning_rate": 3.1976808132318986e-06, + "loss": 0.0459, + "num_input_tokens_seen": 186509280, + "step": 153275 + }, + { + "epoch": 17.070943312172847, + "grad_norm": 0.12183313816785812, + "learning_rate": 3.196491946898522e-06, + "loss": 0.0269, + "num_input_tokens_seen": 186515072, + "step": 153280 + }, + { + "epoch": 17.071500167056467, + "grad_norm": 0.0006250060978345573, + "learning_rate": 3.1953032865167545e-06, + "loss": 0.0032, + "num_input_tokens_seen": 186521152, + "step": 153285 + }, + { + "epoch": 17.072057021940083, + "grad_norm": 1.5499016046524048, + "learning_rate": 3.1941148320978163e-06, + "loss": 0.0964, + "num_input_tokens_seen": 186527328, + "step": 153290 + }, + { + "epoch": 17.0726138768237, + "grad_norm": 4.465760707855225, + "learning_rate": 3.192926583652933e-06, + "loss": 0.0696, + "num_input_tokens_seen": 186533504, + "step": 153295 + }, + { + "epoch": 17.073170731707318, + "grad_norm": 0.8195615410804749, + "learning_rate": 3.1917385411933258e-06, + "loss": 0.0627, + "num_input_tokens_seen": 186539520, + "step": 153300 + }, + { + "epoch": 17.073727586590934, + "grad_norm": 8.727383828954771e-05, + "learning_rate": 3.1905507047302245e-06, + "loss": 0.0028, + "num_input_tokens_seen": 186545696, + "step": 153305 + }, + { + "epoch": 17.074284441474553, + "grad_norm": 0.9761024117469788, + "learning_rate": 3.1893630742748427e-06, + "loss": 0.0678, + "num_input_tokens_seen": 186551840, + "step": 153310 + }, + { + "epoch": 17.07484129635817, + "grad_norm": 0.013702315278351307, + "learning_rate": 3.1881756498384096e-06, + "loss": 0.02, + "num_input_tokens_seen": 186557856, + "step": 153315 + }, + { + "epoch": 17.075398151241785, + "grad_norm": 0.02757621742784977, + "learning_rate": 3.1869884314321225e-06, + "loss": 0.0168, + "num_input_tokens_seen": 186564032, + "step": 153320 + }, + { + "epoch": 17.075955006125405, + "grad_norm": 0.10593082755804062, + "learning_rate": 3.185801419067211e-06, + "loss": 0.0159, + "num_input_tokens_seen": 186570080, + "step": 153325 + }, + { + "epoch": 17.07651186100902, + "grad_norm": 2.80267333984375, + "learning_rate": 3.184614612754877e-06, + "loss": 0.0707, + "num_input_tokens_seen": 186576288, + "step": 153330 + }, + { + "epoch": 17.07706871589264, + "grad_norm": 0.04838591068983078, + "learning_rate": 3.1834280125063396e-06, + "loss": 0.0399, + "num_input_tokens_seen": 186582688, + "step": 153335 + }, + { + "epoch": 17.077625570776256, + "grad_norm": 1.3774911165237427, + "learning_rate": 3.182241618332807e-06, + "loss": 0.0728, + "num_input_tokens_seen": 186588992, + "step": 153340 + }, + { + "epoch": 17.07818242565987, + "grad_norm": 0.8540313839912415, + "learning_rate": 3.1810554302454805e-06, + "loss": 0.0095, + "num_input_tokens_seen": 186595456, + "step": 153345 + }, + { + "epoch": 17.07873928054349, + "grad_norm": 1.5338170528411865, + "learning_rate": 3.1798694482555625e-06, + "loss": 0.0415, + "num_input_tokens_seen": 186601504, + "step": 153350 + }, + { + "epoch": 17.079296135427107, + "grad_norm": 0.053627803921699524, + "learning_rate": 3.1786836723742634e-06, + "loss": 0.0463, + "num_input_tokens_seen": 186607232, + "step": 153355 + }, + { + "epoch": 17.079852990310727, + "grad_norm": 0.0005285817314870656, + "learning_rate": 3.177498102612783e-06, + "loss": 0.0647, + "num_input_tokens_seen": 186613280, + "step": 153360 + }, + { + "epoch": 17.080409845194342, + "grad_norm": 0.12102869898080826, + "learning_rate": 3.1763127389823124e-06, + "loss": 0.0631, + "num_input_tokens_seen": 186619264, + "step": 153365 + }, + { + "epoch": 17.08096670007796, + "grad_norm": 0.3697260022163391, + "learning_rate": 3.17512758149405e-06, + "loss": 0.0592, + "num_input_tokens_seen": 186625504, + "step": 153370 + }, + { + "epoch": 17.081523554961578, + "grad_norm": 0.08717412501573563, + "learning_rate": 3.173942630159199e-06, + "loss": 0.0442, + "num_input_tokens_seen": 186631712, + "step": 153375 + }, + { + "epoch": 17.082080409845194, + "grad_norm": 0.009166492149233818, + "learning_rate": 3.1727578849889415e-06, + "loss": 0.0999, + "num_input_tokens_seen": 186637824, + "step": 153380 + }, + { + "epoch": 17.082637264728813, + "grad_norm": 1.0839732885360718, + "learning_rate": 3.1715733459944773e-06, + "loss": 0.0352, + "num_input_tokens_seen": 186643584, + "step": 153385 + }, + { + "epoch": 17.08319411961243, + "grad_norm": 0.03345583751797676, + "learning_rate": 3.170389013186992e-06, + "loss": 0.0025, + "num_input_tokens_seen": 186649984, + "step": 153390 + }, + { + "epoch": 17.083750974496045, + "grad_norm": 0.9062631726264954, + "learning_rate": 3.169204886577673e-06, + "loss": 0.0642, + "num_input_tokens_seen": 186656160, + "step": 153395 + }, + { + "epoch": 17.084307829379664, + "grad_norm": 1.1966110467910767, + "learning_rate": 3.1680209661776982e-06, + "loss": 0.0865, + "num_input_tokens_seen": 186662240, + "step": 153400 + }, + { + "epoch": 17.08486468426328, + "grad_norm": 0.00017148279584944248, + "learning_rate": 3.1668372519982644e-06, + "loss": 0.0016, + "num_input_tokens_seen": 186668352, + "step": 153405 + }, + { + "epoch": 17.0854215391469, + "grad_norm": 0.004160635638982058, + "learning_rate": 3.1656537440505424e-06, + "loss": 0.0788, + "num_input_tokens_seen": 186674720, + "step": 153410 + }, + { + "epoch": 17.085978394030516, + "grad_norm": 0.007692870683968067, + "learning_rate": 3.1644704423457157e-06, + "loss": 0.0034, + "num_input_tokens_seen": 186681120, + "step": 153415 + }, + { + "epoch": 17.08653524891413, + "grad_norm": 0.007014039438217878, + "learning_rate": 3.1632873468949525e-06, + "loss": 0.0173, + "num_input_tokens_seen": 186686912, + "step": 153420 + }, + { + "epoch": 17.08709210379775, + "grad_norm": 8.804583922028542e-05, + "learning_rate": 3.1621044577094415e-06, + "loss": 0.007, + "num_input_tokens_seen": 186693056, + "step": 153425 + }, + { + "epoch": 17.087648958681367, + "grad_norm": 0.040904249995946884, + "learning_rate": 3.1609217748003463e-06, + "loss": 0.0516, + "num_input_tokens_seen": 186698912, + "step": 153430 + }, + { + "epoch": 17.088205813564986, + "grad_norm": 0.47351405024528503, + "learning_rate": 3.1597392981788544e-06, + "loss": 0.0224, + "num_input_tokens_seen": 186705152, + "step": 153435 + }, + { + "epoch": 17.088762668448602, + "grad_norm": 0.43435654044151306, + "learning_rate": 3.158557027856113e-06, + "loss": 0.0519, + "num_input_tokens_seen": 186710976, + "step": 153440 + }, + { + "epoch": 17.089319523332218, + "grad_norm": 0.7763203978538513, + "learning_rate": 3.1573749638432988e-06, + "loss": 0.1114, + "num_input_tokens_seen": 186716832, + "step": 153445 + }, + { + "epoch": 17.089876378215838, + "grad_norm": 1.2060587406158447, + "learning_rate": 3.156193106151581e-06, + "loss": 0.0267, + "num_input_tokens_seen": 186723072, + "step": 153450 + }, + { + "epoch": 17.090433233099454, + "grad_norm": 0.0010518403723835945, + "learning_rate": 3.1550114547921144e-06, + "loss": 0.0368, + "num_input_tokens_seen": 186729280, + "step": 153455 + }, + { + "epoch": 17.090990087983073, + "grad_norm": 0.006547970697283745, + "learning_rate": 3.1538300097760763e-06, + "loss": 0.0015, + "num_input_tokens_seen": 186735840, + "step": 153460 + }, + { + "epoch": 17.09154694286669, + "grad_norm": 0.32262057065963745, + "learning_rate": 3.152648771114608e-06, + "loss": 0.0132, + "num_input_tokens_seen": 186741952, + "step": 153465 + }, + { + "epoch": 17.092103797750305, + "grad_norm": 0.00043924455530941486, + "learning_rate": 3.151467738818881e-06, + "loss": 0.0062, + "num_input_tokens_seen": 186748128, + "step": 153470 + }, + { + "epoch": 17.092660652633924, + "grad_norm": 0.0170891135931015, + "learning_rate": 3.1502869129000412e-06, + "loss": 0.0003, + "num_input_tokens_seen": 186754368, + "step": 153475 + }, + { + "epoch": 17.09321750751754, + "grad_norm": 0.5028834342956543, + "learning_rate": 3.149106293369253e-06, + "loss": 0.0253, + "num_input_tokens_seen": 186760608, + "step": 153480 + }, + { + "epoch": 17.09377436240116, + "grad_norm": 1.2627204656600952, + "learning_rate": 3.1479258802376595e-06, + "loss": 0.1294, + "num_input_tokens_seen": 186766656, + "step": 153485 + }, + { + "epoch": 17.094331217284775, + "grad_norm": 0.038886312395334244, + "learning_rate": 3.146745673516416e-06, + "loss": 0.0602, + "num_input_tokens_seen": 186772800, + "step": 153490 + }, + { + "epoch": 17.09488807216839, + "grad_norm": 0.0012216088362038136, + "learning_rate": 3.1455656732166633e-06, + "loss": 0.0008, + "num_input_tokens_seen": 186779264, + "step": 153495 + }, + { + "epoch": 17.09544492705201, + "grad_norm": 0.00020905422570649534, + "learning_rate": 3.1443858793495566e-06, + "loss": 0.0819, + "num_input_tokens_seen": 186785344, + "step": 153500 + }, + { + "epoch": 17.096001781935627, + "grad_norm": 0.06467006355524063, + "learning_rate": 3.1432062919262346e-06, + "loss": 0.0369, + "num_input_tokens_seen": 186791712, + "step": 153505 + }, + { + "epoch": 17.096558636819246, + "grad_norm": 0.11899074167013168, + "learning_rate": 3.1420269109578432e-06, + "loss": 0.0731, + "num_input_tokens_seen": 186797888, + "step": 153510 + }, + { + "epoch": 17.097115491702862, + "grad_norm": 0.005695046856999397, + "learning_rate": 3.1408477364555127e-06, + "loss": 0.0317, + "num_input_tokens_seen": 186804064, + "step": 153515 + }, + { + "epoch": 17.097672346586478, + "grad_norm": 0.07408491522073746, + "learning_rate": 3.1396687684303927e-06, + "loss": 0.009, + "num_input_tokens_seen": 186810304, + "step": 153520 + }, + { + "epoch": 17.098229201470097, + "grad_norm": 0.05247681587934494, + "learning_rate": 3.138490006893613e-06, + "loss": 0.0334, + "num_input_tokens_seen": 186816640, + "step": 153525 + }, + { + "epoch": 17.098786056353713, + "grad_norm": 0.1716262251138687, + "learning_rate": 3.1373114518563145e-06, + "loss": 0.0039, + "num_input_tokens_seen": 186822336, + "step": 153530 + }, + { + "epoch": 17.099342911237333, + "grad_norm": 0.16749432682991028, + "learning_rate": 3.136133103329625e-06, + "loss": 0.064, + "num_input_tokens_seen": 186828480, + "step": 153535 + }, + { + "epoch": 17.09989976612095, + "grad_norm": 0.15457524359226227, + "learning_rate": 3.1349549613246737e-06, + "loss": 0.0398, + "num_input_tokens_seen": 186834720, + "step": 153540 + }, + { + "epoch": 17.100456621004565, + "grad_norm": 0.7572508454322815, + "learning_rate": 3.133777025852588e-06, + "loss": 0.0726, + "num_input_tokens_seen": 186841088, + "step": 153545 + }, + { + "epoch": 17.101013475888184, + "grad_norm": 0.00021695694886147976, + "learning_rate": 3.1325992969245016e-06, + "loss": 0.0019, + "num_input_tokens_seen": 186846848, + "step": 153550 + }, + { + "epoch": 17.1015703307718, + "grad_norm": 0.05942579358816147, + "learning_rate": 3.131421774551535e-06, + "loss": 0.0075, + "num_input_tokens_seen": 186852864, + "step": 153555 + }, + { + "epoch": 17.10212718565542, + "grad_norm": 1.629948377609253, + "learning_rate": 3.1302444587448103e-06, + "loss": 0.0162, + "num_input_tokens_seen": 186859072, + "step": 153560 + }, + { + "epoch": 17.102684040539035, + "grad_norm": 1.236403226852417, + "learning_rate": 3.129067349515444e-06, + "loss": 0.0302, + "num_input_tokens_seen": 186865408, + "step": 153565 + }, + { + "epoch": 17.10324089542265, + "grad_norm": 0.09554552286863327, + "learning_rate": 3.127890446874565e-06, + "loss": 0.0405, + "num_input_tokens_seen": 186870848, + "step": 153570 + }, + { + "epoch": 17.10379775030627, + "grad_norm": 0.10813301056623459, + "learning_rate": 3.1267137508332765e-06, + "loss": 0.0237, + "num_input_tokens_seen": 186877120, + "step": 153575 + }, + { + "epoch": 17.104354605189886, + "grad_norm": 0.6616033911705017, + "learning_rate": 3.125537261402714e-06, + "loss": 0.024, + "num_input_tokens_seen": 186882720, + "step": 153580 + }, + { + "epoch": 17.104911460073506, + "grad_norm": 0.013538237661123276, + "learning_rate": 3.1243609785939656e-06, + "loss": 0.0124, + "num_input_tokens_seen": 186888768, + "step": 153585 + }, + { + "epoch": 17.105468314957122, + "grad_norm": 0.8362324833869934, + "learning_rate": 3.1231849024181608e-06, + "loss": 0.0265, + "num_input_tokens_seen": 186895168, + "step": 153590 + }, + { + "epoch": 17.10602516984074, + "grad_norm": 0.017798954620957375, + "learning_rate": 3.122009032886397e-06, + "loss": 0.0028, + "num_input_tokens_seen": 186901504, + "step": 153595 + }, + { + "epoch": 17.106582024724357, + "grad_norm": 0.11355229467153549, + "learning_rate": 3.1208333700097904e-06, + "loss": 0.0622, + "num_input_tokens_seen": 186907392, + "step": 153600 + }, + { + "epoch": 17.107138879607973, + "grad_norm": 0.020941441878676414, + "learning_rate": 3.1196579137994425e-06, + "loss": 0.0183, + "num_input_tokens_seen": 186913600, + "step": 153605 + }, + { + "epoch": 17.107695734491593, + "grad_norm": 2.1970622539520264, + "learning_rate": 3.118482664266456e-06, + "loss": 0.1011, + "num_input_tokens_seen": 186919616, + "step": 153610 + }, + { + "epoch": 17.10825258937521, + "grad_norm": 2.1623594760894775, + "learning_rate": 3.1173076214219247e-06, + "loss": 0.221, + "num_input_tokens_seen": 186925696, + "step": 153615 + }, + { + "epoch": 17.108809444258828, + "grad_norm": 0.027999818325042725, + "learning_rate": 3.1161327852769623e-06, + "loss": 0.047, + "num_input_tokens_seen": 186931872, + "step": 153620 + }, + { + "epoch": 17.109366299142444, + "grad_norm": 0.002147082006558776, + "learning_rate": 3.1149581558426595e-06, + "loss": 0.0064, + "num_input_tokens_seen": 186938112, + "step": 153625 + }, + { + "epoch": 17.10992315402606, + "grad_norm": 0.30927231907844543, + "learning_rate": 3.113783733130107e-06, + "loss": 0.0451, + "num_input_tokens_seen": 186944256, + "step": 153630 + }, + { + "epoch": 17.11048000890968, + "grad_norm": 0.0006998262251727283, + "learning_rate": 3.1126095171503998e-06, + "loss": 0.0373, + "num_input_tokens_seen": 186950624, + "step": 153635 + }, + { + "epoch": 17.111036863793295, + "grad_norm": 0.0014510354958474636, + "learning_rate": 3.111435507914637e-06, + "loss": 0.0207, + "num_input_tokens_seen": 186957088, + "step": 153640 + }, + { + "epoch": 17.111593718676914, + "grad_norm": 2.2705719470977783, + "learning_rate": 3.1102617054338977e-06, + "loss": 0.075, + "num_input_tokens_seen": 186962912, + "step": 153645 + }, + { + "epoch": 17.11215057356053, + "grad_norm": 0.9833014607429504, + "learning_rate": 3.1090881097192855e-06, + "loss": 0.0512, + "num_input_tokens_seen": 186968928, + "step": 153650 + }, + { + "epoch": 17.112707428444146, + "grad_norm": 0.00018608904792927206, + "learning_rate": 3.1079147207818626e-06, + "loss": 0.0064, + "num_input_tokens_seen": 186975232, + "step": 153655 + }, + { + "epoch": 17.113264283327766, + "grad_norm": 0.3494332730770111, + "learning_rate": 3.1067415386327293e-06, + "loss": 0.0109, + "num_input_tokens_seen": 186981248, + "step": 153660 + }, + { + "epoch": 17.11382113821138, + "grad_norm": 0.2933807373046875, + "learning_rate": 3.1055685632829594e-06, + "loss": 0.0333, + "num_input_tokens_seen": 186986880, + "step": 153665 + }, + { + "epoch": 17.114377993095, + "grad_norm": 0.0014047620352357626, + "learning_rate": 3.104395794743639e-06, + "loss": 0.0251, + "num_input_tokens_seen": 186993120, + "step": 153670 + }, + { + "epoch": 17.114934847978617, + "grad_norm": 0.5874342322349548, + "learning_rate": 3.103223233025845e-06, + "loss": 0.0072, + "num_input_tokens_seen": 186999424, + "step": 153675 + }, + { + "epoch": 17.115491702862233, + "grad_norm": 0.08178887516260147, + "learning_rate": 3.10205087814065e-06, + "loss": 0.024, + "num_input_tokens_seen": 187005760, + "step": 153680 + }, + { + "epoch": 17.116048557745852, + "grad_norm": 0.06793632358312607, + "learning_rate": 3.1008787300991244e-06, + "loss": 0.022, + "num_input_tokens_seen": 187012000, + "step": 153685 + }, + { + "epoch": 17.116605412629468, + "grad_norm": 0.18777383863925934, + "learning_rate": 3.0997067889123487e-06, + "loss": 0.0737, + "num_input_tokens_seen": 187018176, + "step": 153690 + }, + { + "epoch": 17.117162267513088, + "grad_norm": 0.7096526622772217, + "learning_rate": 3.0985350545913895e-06, + "loss": 0.0306, + "num_input_tokens_seen": 187023552, + "step": 153695 + }, + { + "epoch": 17.117719122396704, + "grad_norm": 0.0009137170272879303, + "learning_rate": 3.0973635271473123e-06, + "loss": 0.0024, + "num_input_tokens_seen": 187029568, + "step": 153700 + }, + { + "epoch": 17.11827597728032, + "grad_norm": 0.0028723019640892744, + "learning_rate": 3.0961922065911807e-06, + "loss": 0.0888, + "num_input_tokens_seen": 187035072, + "step": 153705 + }, + { + "epoch": 17.11883283216394, + "grad_norm": 0.5871937870979309, + "learning_rate": 3.0950210929340667e-06, + "loss": 0.0058, + "num_input_tokens_seen": 187041088, + "step": 153710 + }, + { + "epoch": 17.119389687047555, + "grad_norm": 2.0323281288146973, + "learning_rate": 3.093850186187025e-06, + "loss": 0.1361, + "num_input_tokens_seen": 187047104, + "step": 153715 + }, + { + "epoch": 17.119946541931174, + "grad_norm": 0.013182511553168297, + "learning_rate": 3.0926794863611303e-06, + "loss": 0.0158, + "num_input_tokens_seen": 187053056, + "step": 153720 + }, + { + "epoch": 17.12050339681479, + "grad_norm": 0.562954306602478, + "learning_rate": 3.091508993467418e-06, + "loss": 0.0112, + "num_input_tokens_seen": 187058944, + "step": 153725 + }, + { + "epoch": 17.121060251698406, + "grad_norm": 0.02790750004351139, + "learning_rate": 3.0903387075169597e-06, + "loss": 0.0031, + "num_input_tokens_seen": 187065248, + "step": 153730 + }, + { + "epoch": 17.121617106582026, + "grad_norm": 0.6113647818565369, + "learning_rate": 3.0891686285208026e-06, + "loss": 0.0096, + "num_input_tokens_seen": 187071584, + "step": 153735 + }, + { + "epoch": 17.12217396146564, + "grad_norm": 0.0026188630145043135, + "learning_rate": 3.087998756490007e-06, + "loss": 0.0406, + "num_input_tokens_seen": 187077376, + "step": 153740 + }, + { + "epoch": 17.12273081634926, + "grad_norm": 0.003082327079027891, + "learning_rate": 3.0868290914356197e-06, + "loss": 0.0122, + "num_input_tokens_seen": 187083712, + "step": 153745 + }, + { + "epoch": 17.123287671232877, + "grad_norm": 5.72188138961792, + "learning_rate": 3.085659633368687e-06, + "loss": 0.0268, + "num_input_tokens_seen": 187089600, + "step": 153750 + }, + { + "epoch": 17.123844526116493, + "grad_norm": 0.052917055785655975, + "learning_rate": 3.0844903823002536e-06, + "loss": 0.014, + "num_input_tokens_seen": 187095936, + "step": 153755 + }, + { + "epoch": 17.124401381000112, + "grad_norm": 0.14240732789039612, + "learning_rate": 3.0833213382413683e-06, + "loss": 0.0707, + "num_input_tokens_seen": 187102368, + "step": 153760 + }, + { + "epoch": 17.124958235883728, + "grad_norm": 9.432938531972468e-05, + "learning_rate": 3.0821525012030756e-06, + "loss": 0.0809, + "num_input_tokens_seen": 187108704, + "step": 153765 + }, + { + "epoch": 17.125515090767347, + "grad_norm": 2.3386764526367188, + "learning_rate": 3.0809838711964107e-06, + "loss": 0.1147, + "num_input_tokens_seen": 187114880, + "step": 153770 + }, + { + "epoch": 17.126071945650963, + "grad_norm": 1.8100528717041016, + "learning_rate": 3.079815448232412e-06, + "loss": 0.0729, + "num_input_tokens_seen": 187121248, + "step": 153775 + }, + { + "epoch": 17.12662880053458, + "grad_norm": 0.0014410940930247307, + "learning_rate": 3.078647232322121e-06, + "loss": 0.0125, + "num_input_tokens_seen": 187127360, + "step": 153780 + }, + { + "epoch": 17.1271856554182, + "grad_norm": 0.34401795268058777, + "learning_rate": 3.0774792234765648e-06, + "loss": 0.0889, + "num_input_tokens_seen": 187133344, + "step": 153785 + }, + { + "epoch": 17.127742510301815, + "grad_norm": 0.00015063685714267194, + "learning_rate": 3.0763114217067875e-06, + "loss": 0.0354, + "num_input_tokens_seen": 187139520, + "step": 153790 + }, + { + "epoch": 17.128299365185434, + "grad_norm": 1.0123361349105835, + "learning_rate": 3.075143827023816e-06, + "loss": 0.0389, + "num_input_tokens_seen": 187145632, + "step": 153795 + }, + { + "epoch": 17.12885622006905, + "grad_norm": 0.00033632919075898826, + "learning_rate": 3.0739764394386723e-06, + "loss": 0.0112, + "num_input_tokens_seen": 187151872, + "step": 153800 + }, + { + "epoch": 17.129413074952666, + "grad_norm": 0.003306551603600383, + "learning_rate": 3.0728092589623865e-06, + "loss": 0.0759, + "num_input_tokens_seen": 187158144, + "step": 153805 + }, + { + "epoch": 17.129969929836285, + "grad_norm": 0.10315621644258499, + "learning_rate": 3.0716422856059885e-06, + "loss": 0.0942, + "num_input_tokens_seen": 187163680, + "step": 153810 + }, + { + "epoch": 17.1305267847199, + "grad_norm": 0.0026911268942058086, + "learning_rate": 3.0704755193804973e-06, + "loss": 0.1204, + "num_input_tokens_seen": 187169984, + "step": 153815 + }, + { + "epoch": 17.13108363960352, + "grad_norm": 0.7440314292907715, + "learning_rate": 3.069308960296938e-06, + "loss": 0.0897, + "num_input_tokens_seen": 187176128, + "step": 153820 + }, + { + "epoch": 17.131640494487137, + "grad_norm": 0.000703530793543905, + "learning_rate": 3.0681426083663175e-06, + "loss": 0.0025, + "num_input_tokens_seen": 187182304, + "step": 153825 + }, + { + "epoch": 17.132197349370752, + "grad_norm": 0.1544286012649536, + "learning_rate": 3.066976463599666e-06, + "loss": 0.0712, + "num_input_tokens_seen": 187188416, + "step": 153830 + }, + { + "epoch": 17.132754204254372, + "grad_norm": 0.93026202917099, + "learning_rate": 3.0658105260079924e-06, + "loss": 0.0353, + "num_input_tokens_seen": 187194432, + "step": 153835 + }, + { + "epoch": 17.133311059137988, + "grad_norm": 0.2425350397825241, + "learning_rate": 3.06464479560232e-06, + "loss": 0.0991, + "num_input_tokens_seen": 187200352, + "step": 153840 + }, + { + "epoch": 17.133867914021607, + "grad_norm": 0.01458349172025919, + "learning_rate": 3.0634792723936405e-06, + "loss": 0.0058, + "num_input_tokens_seen": 187206528, + "step": 153845 + }, + { + "epoch": 17.134424768905223, + "grad_norm": 0.6275555491447449, + "learning_rate": 3.0623139563929815e-06, + "loss": 0.0095, + "num_input_tokens_seen": 187212640, + "step": 153850 + }, + { + "epoch": 17.13498162378884, + "grad_norm": 0.005258422810584307, + "learning_rate": 3.061148847611342e-06, + "loss": 0.0071, + "num_input_tokens_seen": 187218848, + "step": 153855 + }, + { + "epoch": 17.13553847867246, + "grad_norm": 0.039919719099998474, + "learning_rate": 3.0599839460597246e-06, + "loss": 0.0341, + "num_input_tokens_seen": 187224416, + "step": 153860 + }, + { + "epoch": 17.136095333556074, + "grad_norm": 0.4164402186870575, + "learning_rate": 3.058819251749148e-06, + "loss": 0.0269, + "num_input_tokens_seen": 187230048, + "step": 153865 + }, + { + "epoch": 17.136652188439694, + "grad_norm": 0.869364857673645, + "learning_rate": 3.057654764690593e-06, + "loss": 0.0436, + "num_input_tokens_seen": 187236160, + "step": 153870 + }, + { + "epoch": 17.13720904332331, + "grad_norm": 0.455336332321167, + "learning_rate": 3.056490484895072e-06, + "loss": 0.0483, + "num_input_tokens_seen": 187242048, + "step": 153875 + }, + { + "epoch": 17.137765898206926, + "grad_norm": 0.019591139629483223, + "learning_rate": 3.0553264123735715e-06, + "loss": 0.0431, + "num_input_tokens_seen": 187248192, + "step": 153880 + }, + { + "epoch": 17.138322753090545, + "grad_norm": 0.9753892421722412, + "learning_rate": 3.0541625471371042e-06, + "loss": 0.0492, + "num_input_tokens_seen": 187254080, + "step": 153885 + }, + { + "epoch": 17.13887960797416, + "grad_norm": 0.0008717236341908574, + "learning_rate": 3.052998889196654e-06, + "loss": 0.0154, + "num_input_tokens_seen": 187259712, + "step": 153890 + }, + { + "epoch": 17.13943646285778, + "grad_norm": 1.287703037261963, + "learning_rate": 3.051835438563211e-06, + "loss": 0.059, + "num_input_tokens_seen": 187266048, + "step": 153895 + }, + { + "epoch": 17.139993317741396, + "grad_norm": 0.0005409798468463123, + "learning_rate": 3.0506721952477613e-06, + "loss": 0.019, + "num_input_tokens_seen": 187272064, + "step": 153900 + }, + { + "epoch": 17.140550172625012, + "grad_norm": 1.9664742946624756, + "learning_rate": 3.0495091592613046e-06, + "loss": 0.0995, + "num_input_tokens_seen": 187278016, + "step": 153905 + }, + { + "epoch": 17.14110702750863, + "grad_norm": 1.4319961071014404, + "learning_rate": 3.048346330614821e-06, + "loss": 0.0341, + "num_input_tokens_seen": 187284320, + "step": 153910 + }, + { + "epoch": 17.141663882392248, + "grad_norm": 0.003616499714553356, + "learning_rate": 3.0471837093192928e-06, + "loss": 0.0026, + "num_input_tokens_seen": 187290272, + "step": 153915 + }, + { + "epoch": 17.142220737275867, + "grad_norm": 0.028320062905550003, + "learning_rate": 3.046021295385698e-06, + "loss": 0.0143, + "num_input_tokens_seen": 187296288, + "step": 153920 + }, + { + "epoch": 17.142777592159483, + "grad_norm": 0.017397526651620865, + "learning_rate": 3.044859088825025e-06, + "loss": 0.0112, + "num_input_tokens_seen": 187302656, + "step": 153925 + }, + { + "epoch": 17.1433344470431, + "grad_norm": 0.01524471677839756, + "learning_rate": 3.0436970896482454e-06, + "loss": 0.09, + "num_input_tokens_seen": 187308704, + "step": 153930 + }, + { + "epoch": 17.14389130192672, + "grad_norm": 0.01337223220616579, + "learning_rate": 3.042535297866342e-06, + "loss": 0.0507, + "num_input_tokens_seen": 187314528, + "step": 153935 + }, + { + "epoch": 17.144448156810334, + "grad_norm": 3.2656972408294678, + "learning_rate": 3.0413737134902835e-06, + "loss": 0.0826, + "num_input_tokens_seen": 187320960, + "step": 153940 + }, + { + "epoch": 17.145005011693954, + "grad_norm": 0.12108923494815826, + "learning_rate": 3.040212336531045e-06, + "loss": 0.1112, + "num_input_tokens_seen": 187326592, + "step": 153945 + }, + { + "epoch": 17.14556186657757, + "grad_norm": 0.5303776264190674, + "learning_rate": 3.0390511669995873e-06, + "loss": 0.1162, + "num_input_tokens_seen": 187332256, + "step": 153950 + }, + { + "epoch": 17.146118721461185, + "grad_norm": 1.655657410621643, + "learning_rate": 3.037890204906893e-06, + "loss": 0.04, + "num_input_tokens_seen": 187338464, + "step": 153955 + }, + { + "epoch": 17.146675576344805, + "grad_norm": 0.00036791045567952096, + "learning_rate": 3.0367294502639227e-06, + "loss": 0.0906, + "num_input_tokens_seen": 187344448, + "step": 153960 + }, + { + "epoch": 17.14723243122842, + "grad_norm": 0.018202053382992744, + "learning_rate": 3.035568903081637e-06, + "loss": 0.0891, + "num_input_tokens_seen": 187350720, + "step": 153965 + }, + { + "epoch": 17.14778928611204, + "grad_norm": 0.5400330424308777, + "learning_rate": 3.0344085633709966e-06, + "loss": 0.0166, + "num_input_tokens_seen": 187356704, + "step": 153970 + }, + { + "epoch": 17.148346140995656, + "grad_norm": 0.07639724016189575, + "learning_rate": 3.033248431142971e-06, + "loss": 0.043, + "num_input_tokens_seen": 187362208, + "step": 153975 + }, + { + "epoch": 17.148902995879276, + "grad_norm": 0.12311331927776337, + "learning_rate": 3.0320885064085093e-06, + "loss": 0.012, + "num_input_tokens_seen": 187368576, + "step": 153980 + }, + { + "epoch": 17.14945985076289, + "grad_norm": 0.08790599554777145, + "learning_rate": 3.03092878917858e-06, + "loss": 0.0398, + "num_input_tokens_seen": 187374752, + "step": 153985 + }, + { + "epoch": 17.150016705646507, + "grad_norm": 0.00012233320740051568, + "learning_rate": 3.029769279464123e-06, + "loss": 0.0395, + "num_input_tokens_seen": 187380832, + "step": 153990 + }, + { + "epoch": 17.150573560530127, + "grad_norm": 0.0006626583053730428, + "learning_rate": 3.0286099772760978e-06, + "loss": 0.0694, + "num_input_tokens_seen": 187387040, + "step": 153995 + }, + { + "epoch": 17.151130415413743, + "grad_norm": 0.06615153700113297, + "learning_rate": 3.0274508826254544e-06, + "loss": 0.0454, + "num_input_tokens_seen": 187393120, + "step": 154000 + }, + { + "epoch": 17.151687270297362, + "grad_norm": 0.2021174132823944, + "learning_rate": 3.0262919955231424e-06, + "loss": 0.1334, + "num_input_tokens_seen": 187398592, + "step": 154005 + }, + { + "epoch": 17.152244125180978, + "grad_norm": 0.00585087900981307, + "learning_rate": 3.025133315980111e-06, + "loss": 0.1281, + "num_input_tokens_seen": 187404768, + "step": 154010 + }, + { + "epoch": 17.152800980064594, + "grad_norm": 0.030835894867777824, + "learning_rate": 3.023974844007299e-06, + "loss": 0.0898, + "num_input_tokens_seen": 187411072, + "step": 154015 + }, + { + "epoch": 17.153357834948213, + "grad_norm": 0.0060435133054852486, + "learning_rate": 3.022816579615648e-06, + "loss": 0.1162, + "num_input_tokens_seen": 187417408, + "step": 154020 + }, + { + "epoch": 17.15391468983183, + "grad_norm": 0.007891476154327393, + "learning_rate": 3.021658522816104e-06, + "loss": 0.0372, + "num_input_tokens_seen": 187423200, + "step": 154025 + }, + { + "epoch": 17.15447154471545, + "grad_norm": 0.0005618186551146209, + "learning_rate": 3.0205006736196083e-06, + "loss": 0.0008, + "num_input_tokens_seen": 187429120, + "step": 154030 + }, + { + "epoch": 17.155028399599065, + "grad_norm": 0.8319634199142456, + "learning_rate": 3.0193430320370893e-06, + "loss": 0.044, + "num_input_tokens_seen": 187434976, + "step": 154035 + }, + { + "epoch": 17.15558525448268, + "grad_norm": 1.2084171772003174, + "learning_rate": 3.018185598079484e-06, + "loss": 0.0479, + "num_input_tokens_seen": 187440992, + "step": 154040 + }, + { + "epoch": 17.1561421093663, + "grad_norm": 1.085700511932373, + "learning_rate": 3.0170283717577326e-06, + "loss": 0.0967, + "num_input_tokens_seen": 187447104, + "step": 154045 + }, + { + "epoch": 17.156698964249916, + "grad_norm": 0.0004600991669576615, + "learning_rate": 3.0158713530827525e-06, + "loss": 0.0026, + "num_input_tokens_seen": 187453216, + "step": 154050 + }, + { + "epoch": 17.157255819133535, + "grad_norm": 0.022319389507174492, + "learning_rate": 3.0147145420654944e-06, + "loss": 0.0448, + "num_input_tokens_seen": 187459488, + "step": 154055 + }, + { + "epoch": 17.15781267401715, + "grad_norm": 1.0747599601745605, + "learning_rate": 3.01355793871686e-06, + "loss": 0.0728, + "num_input_tokens_seen": 187465472, + "step": 154060 + }, + { + "epoch": 17.158369528900767, + "grad_norm": 0.4693789482116699, + "learning_rate": 3.012401543047791e-06, + "loss": 0.0582, + "num_input_tokens_seen": 187471456, + "step": 154065 + }, + { + "epoch": 17.158926383784387, + "grad_norm": 0.0018208206165581942, + "learning_rate": 3.0112453550692004e-06, + "loss": 0.0206, + "num_input_tokens_seen": 187477536, + "step": 154070 + }, + { + "epoch": 17.159483238668003, + "grad_norm": 0.00012041170703014359, + "learning_rate": 3.010089374792019e-06, + "loss": 0.0323, + "num_input_tokens_seen": 187483552, + "step": 154075 + }, + { + "epoch": 17.160040093551622, + "grad_norm": 0.00891667976975441, + "learning_rate": 3.008933602227165e-06, + "loss": 0.0243, + "num_input_tokens_seen": 187489408, + "step": 154080 + }, + { + "epoch": 17.160596948435238, + "grad_norm": 1.0837371349334717, + "learning_rate": 3.0077780373855475e-06, + "loss": 0.0849, + "num_input_tokens_seen": 187495424, + "step": 154085 + }, + { + "epoch": 17.161153803318854, + "grad_norm": 0.0006301290122792125, + "learning_rate": 3.0066226802780846e-06, + "loss": 0.0498, + "num_input_tokens_seen": 187501760, + "step": 154090 + }, + { + "epoch": 17.161710658202473, + "grad_norm": 0.0013379747979342937, + "learning_rate": 3.0054675309156956e-06, + "loss": 0.0161, + "num_input_tokens_seen": 187507552, + "step": 154095 + }, + { + "epoch": 17.16226751308609, + "grad_norm": 0.33248040080070496, + "learning_rate": 3.0043125893092856e-06, + "loss": 0.0021, + "num_input_tokens_seen": 187513664, + "step": 154100 + }, + { + "epoch": 17.16282436796971, + "grad_norm": 0.021532351151108742, + "learning_rate": 3.0031578554697683e-06, + "loss": 0.1222, + "num_input_tokens_seen": 187519808, + "step": 154105 + }, + { + "epoch": 17.163381222853324, + "grad_norm": 0.8681522011756897, + "learning_rate": 3.002003329408043e-06, + "loss": 0.0412, + "num_input_tokens_seen": 187526016, + "step": 154110 + }, + { + "epoch": 17.16393807773694, + "grad_norm": 2.1799838542938232, + "learning_rate": 3.0008490111350236e-06, + "loss": 0.0199, + "num_input_tokens_seen": 187532064, + "step": 154115 + }, + { + "epoch": 17.16449493262056, + "grad_norm": 0.0018481137230992317, + "learning_rate": 2.9996949006616094e-06, + "loss": 0.0371, + "num_input_tokens_seen": 187538048, + "step": 154120 + }, + { + "epoch": 17.165051787504176, + "grad_norm": 0.003583215642720461, + "learning_rate": 2.9985409979987113e-06, + "loss": 0.1137, + "num_input_tokens_seen": 187544384, + "step": 154125 + }, + { + "epoch": 17.165608642387795, + "grad_norm": 0.008326424285769463, + "learning_rate": 2.997387303157209e-06, + "loss": 0.0664, + "num_input_tokens_seen": 187550816, + "step": 154130 + }, + { + "epoch": 17.16616549727141, + "grad_norm": 1.1123957633972168, + "learning_rate": 2.9962338161480195e-06, + "loss": 0.2434, + "num_input_tokens_seen": 187556768, + "step": 154135 + }, + { + "epoch": 17.166722352155027, + "grad_norm": 1.1512142419815063, + "learning_rate": 2.995080536982023e-06, + "loss": 0.1006, + "num_input_tokens_seen": 187562496, + "step": 154140 + }, + { + "epoch": 17.167279207038646, + "grad_norm": 0.0001569761661812663, + "learning_rate": 2.9939274656701295e-06, + "loss": 0.0095, + "num_input_tokens_seen": 187568576, + "step": 154145 + }, + { + "epoch": 17.167836061922262, + "grad_norm": 0.1271578073501587, + "learning_rate": 2.9927746022232194e-06, + "loss": 0.017, + "num_input_tokens_seen": 187574880, + "step": 154150 + }, + { + "epoch": 17.16839291680588, + "grad_norm": 0.1350499838590622, + "learning_rate": 2.991621946652187e-06, + "loss": 0.0277, + "num_input_tokens_seen": 187580928, + "step": 154155 + }, + { + "epoch": 17.168949771689498, + "grad_norm": 0.027975456789135933, + "learning_rate": 2.99046949896791e-06, + "loss": 0.0082, + "num_input_tokens_seen": 187587200, + "step": 154160 + }, + { + "epoch": 17.169506626573114, + "grad_norm": 0.00018826463201548904, + "learning_rate": 2.98931725918129e-06, + "loss": 0.0338, + "num_input_tokens_seen": 187593600, + "step": 154165 + }, + { + "epoch": 17.170063481456733, + "grad_norm": 0.010187603533267975, + "learning_rate": 2.9881652273032024e-06, + "loss": 0.0038, + "num_input_tokens_seen": 187599840, + "step": 154170 + }, + { + "epoch": 17.17062033634035, + "grad_norm": 0.04695357754826546, + "learning_rate": 2.9870134033445298e-06, + "loss": 0.0107, + "num_input_tokens_seen": 187605920, + "step": 154175 + }, + { + "epoch": 17.17117719122397, + "grad_norm": 0.01545715518295765, + "learning_rate": 2.9858617873161466e-06, + "loss": 0.0138, + "num_input_tokens_seen": 187612288, + "step": 154180 + }, + { + "epoch": 17.171734046107584, + "grad_norm": 3.4053540229797363, + "learning_rate": 2.9847103792289416e-06, + "loss": 0.0334, + "num_input_tokens_seen": 187618528, + "step": 154185 + }, + { + "epoch": 17.1722909009912, + "grad_norm": 0.07357673346996307, + "learning_rate": 2.9835591790937813e-06, + "loss": 0.0063, + "num_input_tokens_seen": 187624608, + "step": 154190 + }, + { + "epoch": 17.17284775587482, + "grad_norm": 0.03012380376458168, + "learning_rate": 2.982408186921548e-06, + "loss": 0.0134, + "num_input_tokens_seen": 187630496, + "step": 154195 + }, + { + "epoch": 17.173404610758435, + "grad_norm": 0.8465882539749146, + "learning_rate": 2.9812574027231116e-06, + "loss": 0.0359, + "num_input_tokens_seen": 187636512, + "step": 154200 + }, + { + "epoch": 17.173961465642055, + "grad_norm": 0.9376183748245239, + "learning_rate": 2.980106826509338e-06, + "loss": 0.0292, + "num_input_tokens_seen": 187642304, + "step": 154205 + }, + { + "epoch": 17.17451832052567, + "grad_norm": 1.248088002204895, + "learning_rate": 2.9789564582910905e-06, + "loss": 0.0261, + "num_input_tokens_seen": 187648320, + "step": 154210 + }, + { + "epoch": 17.175075175409287, + "grad_norm": 0.0002037527592619881, + "learning_rate": 2.97780629807925e-06, + "loss": 0.0149, + "num_input_tokens_seen": 187653728, + "step": 154215 + }, + { + "epoch": 17.175632030292906, + "grad_norm": 1.0331960916519165, + "learning_rate": 2.9766563458846736e-06, + "loss": 0.0159, + "num_input_tokens_seen": 187659872, + "step": 154220 + }, + { + "epoch": 17.176188885176522, + "grad_norm": 0.00014690632815472782, + "learning_rate": 2.975506601718223e-06, + "loss": 0.0596, + "num_input_tokens_seen": 187666016, + "step": 154225 + }, + { + "epoch": 17.17674574006014, + "grad_norm": 0.6691107153892517, + "learning_rate": 2.9743570655907494e-06, + "loss": 0.0149, + "num_input_tokens_seen": 187672480, + "step": 154230 + }, + { + "epoch": 17.177302594943757, + "grad_norm": 1.7403168678283691, + "learning_rate": 2.9732077375131285e-06, + "loss": 0.1015, + "num_input_tokens_seen": 187678560, + "step": 154235 + }, + { + "epoch": 17.177859449827373, + "grad_norm": 0.05795660614967346, + "learning_rate": 2.9720586174962012e-06, + "loss": 0.0385, + "num_input_tokens_seen": 187684640, + "step": 154240 + }, + { + "epoch": 17.178416304710993, + "grad_norm": 0.8919230103492737, + "learning_rate": 2.9709097055508397e-06, + "loss": 0.0224, + "num_input_tokens_seen": 187690944, + "step": 154245 + }, + { + "epoch": 17.17897315959461, + "grad_norm": 0.16168853640556335, + "learning_rate": 2.969761001687876e-06, + "loss": 0.0087, + "num_input_tokens_seen": 187697408, + "step": 154250 + }, + { + "epoch": 17.179530014478228, + "grad_norm": 0.37219467759132385, + "learning_rate": 2.968612505918175e-06, + "loss": 0.0566, + "num_input_tokens_seen": 187703488, + "step": 154255 + }, + { + "epoch": 17.180086869361844, + "grad_norm": 0.5244256258010864, + "learning_rate": 2.9674642182525746e-06, + "loss": 0.0092, + "num_input_tokens_seen": 187709856, + "step": 154260 + }, + { + "epoch": 17.18064372424546, + "grad_norm": 0.007562754210084677, + "learning_rate": 2.96631613870193e-06, + "loss": 0.0003, + "num_input_tokens_seen": 187716160, + "step": 154265 + }, + { + "epoch": 17.18120057912908, + "grad_norm": 0.1882098913192749, + "learning_rate": 2.965168267277091e-06, + "loss": 0.0502, + "num_input_tokens_seen": 187722400, + "step": 154270 + }, + { + "epoch": 17.181757434012695, + "grad_norm": 0.0004878579929936677, + "learning_rate": 2.964020603988879e-06, + "loss": 0.0821, + "num_input_tokens_seen": 187728000, + "step": 154275 + }, + { + "epoch": 17.182314288896315, + "grad_norm": 0.17126430571079254, + "learning_rate": 2.9628731488481528e-06, + "loss": 0.0068, + "num_input_tokens_seen": 187734240, + "step": 154280 + }, + { + "epoch": 17.18287114377993, + "grad_norm": 0.09664615243673325, + "learning_rate": 2.9617259018657416e-06, + "loss": 0.0511, + "num_input_tokens_seen": 187740288, + "step": 154285 + }, + { + "epoch": 17.183427998663547, + "grad_norm": 0.04666653275489807, + "learning_rate": 2.9605788630524904e-06, + "loss": 0.1929, + "num_input_tokens_seen": 187746400, + "step": 154290 + }, + { + "epoch": 17.183984853547166, + "grad_norm": 0.3943624496459961, + "learning_rate": 2.9594320324192294e-06, + "loss": 0.0697, + "num_input_tokens_seen": 187751872, + "step": 154295 + }, + { + "epoch": 17.184541708430782, + "grad_norm": 0.6970914602279663, + "learning_rate": 2.958285409976791e-06, + "loss": 0.138, + "num_input_tokens_seen": 187757216, + "step": 154300 + }, + { + "epoch": 17.1850985633144, + "grad_norm": 0.0001372525584883988, + "learning_rate": 2.957138995736003e-06, + "loss": 0.0809, + "num_input_tokens_seen": 187763680, + "step": 154305 + }, + { + "epoch": 17.185655418198017, + "grad_norm": 0.36017709970474243, + "learning_rate": 2.9559927897077013e-06, + "loss": 0.013, + "num_input_tokens_seen": 187770016, + "step": 154310 + }, + { + "epoch": 17.186212273081637, + "grad_norm": 2.0980169773101807, + "learning_rate": 2.95484679190271e-06, + "loss": 0.0345, + "num_input_tokens_seen": 187776608, + "step": 154315 + }, + { + "epoch": 17.186769127965253, + "grad_norm": 0.5919405221939087, + "learning_rate": 2.9537010023318516e-06, + "loss": 0.0894, + "num_input_tokens_seen": 187782784, + "step": 154320 + }, + { + "epoch": 17.18732598284887, + "grad_norm": 1.0406183004379272, + "learning_rate": 2.952555421005948e-06, + "loss": 0.032, + "num_input_tokens_seen": 187788992, + "step": 154325 + }, + { + "epoch": 17.187882837732488, + "grad_norm": 0.00657246820628643, + "learning_rate": 2.9514100479358265e-06, + "loss": 0.0089, + "num_input_tokens_seen": 187795072, + "step": 154330 + }, + { + "epoch": 17.188439692616104, + "grad_norm": 0.6525741815567017, + "learning_rate": 2.950264883132295e-06, + "loss": 0.0329, + "num_input_tokens_seen": 187801088, + "step": 154335 + }, + { + "epoch": 17.188996547499723, + "grad_norm": 1.3703131675720215, + "learning_rate": 2.9491199266061837e-06, + "loss": 0.0388, + "num_input_tokens_seen": 187807168, + "step": 154340 + }, + { + "epoch": 17.18955340238334, + "grad_norm": 0.00015740108210593462, + "learning_rate": 2.9479751783683034e-06, + "loss": 0.0112, + "num_input_tokens_seen": 187813216, + "step": 154345 + }, + { + "epoch": 17.190110257266955, + "grad_norm": 0.00027719224453903735, + "learning_rate": 2.9468306384294655e-06, + "loss": 0.0038, + "num_input_tokens_seen": 187819072, + "step": 154350 + }, + { + "epoch": 17.190667112150575, + "grad_norm": 0.3318957984447479, + "learning_rate": 2.9456863068004712e-06, + "loss": 0.0633, + "num_input_tokens_seen": 187824736, + "step": 154355 + }, + { + "epoch": 17.19122396703419, + "grad_norm": 0.028159648180007935, + "learning_rate": 2.944542183492149e-06, + "loss": 0.073, + "num_input_tokens_seen": 187831168, + "step": 154360 + }, + { + "epoch": 17.19178082191781, + "grad_norm": 1.7664586305618286, + "learning_rate": 2.943398268515296e-06, + "loss": 0.1412, + "num_input_tokens_seen": 187836736, + "step": 154365 + }, + { + "epoch": 17.192337676801426, + "grad_norm": 0.0002326503599761054, + "learning_rate": 2.942254561880717e-06, + "loss": 0.109, + "num_input_tokens_seen": 187842688, + "step": 154370 + }, + { + "epoch": 17.19289453168504, + "grad_norm": 0.7569829821586609, + "learning_rate": 2.9411110635992086e-06, + "loss": 0.0089, + "num_input_tokens_seen": 187848864, + "step": 154375 + }, + { + "epoch": 17.19345138656866, + "grad_norm": 1.4205302000045776, + "learning_rate": 2.939967773681587e-06, + "loss": 0.0701, + "num_input_tokens_seen": 187854976, + "step": 154380 + }, + { + "epoch": 17.194008241452277, + "grad_norm": 0.09794291108846664, + "learning_rate": 2.9388246921386393e-06, + "loss": 0.012, + "num_input_tokens_seen": 187861024, + "step": 154385 + }, + { + "epoch": 17.194565096335896, + "grad_norm": 0.004565669223666191, + "learning_rate": 2.9376818189811777e-06, + "loss": 0.017, + "num_input_tokens_seen": 187867232, + "step": 154390 + }, + { + "epoch": 17.195121951219512, + "grad_norm": 0.280501127243042, + "learning_rate": 2.936539154219975e-06, + "loss": 0.0221, + "num_input_tokens_seen": 187873376, + "step": 154395 + }, + { + "epoch": 17.19567880610313, + "grad_norm": 0.2746519446372986, + "learning_rate": 2.9353966978658444e-06, + "loss": 0.0489, + "num_input_tokens_seen": 187879680, + "step": 154400 + }, + { + "epoch": 17.196235660986748, + "grad_norm": 0.16499142348766327, + "learning_rate": 2.934254449929563e-06, + "loss": 0.0173, + "num_input_tokens_seen": 187885984, + "step": 154405 + }, + { + "epoch": 17.196792515870364, + "grad_norm": 0.4420205056667328, + "learning_rate": 2.9331124104219345e-06, + "loss": 0.0181, + "num_input_tokens_seen": 187891424, + "step": 154410 + }, + { + "epoch": 17.197349370753983, + "grad_norm": 0.09163641184568405, + "learning_rate": 2.9319705793537376e-06, + "loss": 0.0389, + "num_input_tokens_seen": 187896928, + "step": 154415 + }, + { + "epoch": 17.1979062256376, + "grad_norm": 0.022308913990855217, + "learning_rate": 2.9308289567357595e-06, + "loss": 0.0064, + "num_input_tokens_seen": 187902944, + "step": 154420 + }, + { + "epoch": 17.198463080521215, + "grad_norm": 0.0006421873695217073, + "learning_rate": 2.9296875425787768e-06, + "loss": 0.1429, + "num_input_tokens_seen": 187908928, + "step": 154425 + }, + { + "epoch": 17.199019935404834, + "grad_norm": 3.0384628772735596, + "learning_rate": 2.928546336893584e-06, + "loss": 0.1102, + "num_input_tokens_seen": 187915328, + "step": 154430 + }, + { + "epoch": 17.19957679028845, + "grad_norm": 0.7196670174598694, + "learning_rate": 2.927405339690956e-06, + "loss": 0.0452, + "num_input_tokens_seen": 187921952, + "step": 154435 + }, + { + "epoch": 17.20013364517207, + "grad_norm": 0.035581354051828384, + "learning_rate": 2.9262645509816665e-06, + "loss": 0.0028, + "num_input_tokens_seen": 187928160, + "step": 154440 + }, + { + "epoch": 17.200690500055686, + "grad_norm": 0.08487189561128616, + "learning_rate": 2.9251239707764887e-06, + "loss": 0.0575, + "num_input_tokens_seen": 187934272, + "step": 154445 + }, + { + "epoch": 17.2012473549393, + "grad_norm": 0.0004949980648234487, + "learning_rate": 2.923983599086208e-06, + "loss": 0.0188, + "num_input_tokens_seen": 187940480, + "step": 154450 + }, + { + "epoch": 17.20180420982292, + "grad_norm": 0.0032382141798734665, + "learning_rate": 2.9228434359215823e-06, + "loss": 0.0255, + "num_input_tokens_seen": 187946560, + "step": 154455 + }, + { + "epoch": 17.202361064706537, + "grad_norm": 0.030437899753451347, + "learning_rate": 2.9217034812933975e-06, + "loss": 0.0929, + "num_input_tokens_seen": 187952224, + "step": 154460 + }, + { + "epoch": 17.202917919590156, + "grad_norm": 0.001163118751719594, + "learning_rate": 2.920563735212403e-06, + "loss": 0.0079, + "num_input_tokens_seen": 187958464, + "step": 154465 + }, + { + "epoch": 17.203474774473772, + "grad_norm": 0.028624072670936584, + "learning_rate": 2.9194241976893797e-06, + "loss": 0.0411, + "num_input_tokens_seen": 187964512, + "step": 154470 + }, + { + "epoch": 17.204031629357388, + "grad_norm": 0.4923872947692871, + "learning_rate": 2.91828486873508e-06, + "loss": 0.0065, + "num_input_tokens_seen": 187970464, + "step": 154475 + }, + { + "epoch": 17.204588484241008, + "grad_norm": 0.023938067257404327, + "learning_rate": 2.9171457483602752e-06, + "loss": 0.0278, + "num_input_tokens_seen": 187976544, + "step": 154480 + }, + { + "epoch": 17.205145339124623, + "grad_norm": 0.870394766330719, + "learning_rate": 2.9160068365757244e-06, + "loss": 0.0491, + "num_input_tokens_seen": 187983008, + "step": 154485 + }, + { + "epoch": 17.205702194008243, + "grad_norm": 0.0607014037668705, + "learning_rate": 2.914868133392179e-06, + "loss": 0.0111, + "num_input_tokens_seen": 187989408, + "step": 154490 + }, + { + "epoch": 17.20625904889186, + "grad_norm": 0.013798967935144901, + "learning_rate": 2.913729638820395e-06, + "loss": 0.1791, + "num_input_tokens_seen": 187995488, + "step": 154495 + }, + { + "epoch": 17.206815903775475, + "grad_norm": 2.4586849212646484, + "learning_rate": 2.9125913528711355e-06, + "loss": 0.1261, + "num_input_tokens_seen": 188001664, + "step": 154500 + }, + { + "epoch": 17.207372758659094, + "grad_norm": 0.8026297092437744, + "learning_rate": 2.9114532755551454e-06, + "loss": 0.0175, + "num_input_tokens_seen": 188007776, + "step": 154505 + }, + { + "epoch": 17.20792961354271, + "grad_norm": 0.00023420796787831932, + "learning_rate": 2.9103154068831768e-06, + "loss": 0.1415, + "num_input_tokens_seen": 188013888, + "step": 154510 + }, + { + "epoch": 17.20848646842633, + "grad_norm": 0.0007683367584832013, + "learning_rate": 2.909177746865971e-06, + "loss": 0.0415, + "num_input_tokens_seen": 188020576, + "step": 154515 + }, + { + "epoch": 17.209043323309945, + "grad_norm": 0.050225336104631424, + "learning_rate": 2.908040295514289e-06, + "loss": 0.0218, + "num_input_tokens_seen": 188026560, + "step": 154520 + }, + { + "epoch": 17.20960017819356, + "grad_norm": 0.0015663978410884738, + "learning_rate": 2.906903052838858e-06, + "loss": 0.0226, + "num_input_tokens_seen": 188032736, + "step": 154525 + }, + { + "epoch": 17.21015703307718, + "grad_norm": 0.4068029820919037, + "learning_rate": 2.9057660188504337e-06, + "loss": 0.021, + "num_input_tokens_seen": 188038816, + "step": 154530 + }, + { + "epoch": 17.210713887960797, + "grad_norm": 0.808097779750824, + "learning_rate": 2.904629193559749e-06, + "loss": 0.0495, + "num_input_tokens_seen": 188043968, + "step": 154535 + }, + { + "epoch": 17.211270742844416, + "grad_norm": 0.8557231426239014, + "learning_rate": 2.9034925769775484e-06, + "loss": 0.0453, + "num_input_tokens_seen": 188049952, + "step": 154540 + }, + { + "epoch": 17.211827597728032, + "grad_norm": 0.7057152986526489, + "learning_rate": 2.902356169114556e-06, + "loss": 0.0181, + "num_input_tokens_seen": 188055712, + "step": 154545 + }, + { + "epoch": 17.212384452611648, + "grad_norm": 0.0015837997198104858, + "learning_rate": 2.9012199699815195e-06, + "loss": 0.0112, + "num_input_tokens_seen": 188061312, + "step": 154550 + }, + { + "epoch": 17.212941307495267, + "grad_norm": 0.02984689176082611, + "learning_rate": 2.900083979589166e-06, + "loss": 0.0146, + "num_input_tokens_seen": 188067424, + "step": 154555 + }, + { + "epoch": 17.213498162378883, + "grad_norm": 0.18979230523109436, + "learning_rate": 2.898948197948226e-06, + "loss": 0.0195, + "num_input_tokens_seen": 188073536, + "step": 154560 + }, + { + "epoch": 17.214055017262503, + "grad_norm": 0.00024373811902478337, + "learning_rate": 2.897812625069421e-06, + "loss": 0.0814, + "num_input_tokens_seen": 188079712, + "step": 154565 + }, + { + "epoch": 17.21461187214612, + "grad_norm": 0.01203244086354971, + "learning_rate": 2.8966772609634875e-06, + "loss": 0.0064, + "num_input_tokens_seen": 188086016, + "step": 154570 + }, + { + "epoch": 17.215168727029734, + "grad_norm": 0.07251213490962982, + "learning_rate": 2.8955421056411495e-06, + "loss": 0.0713, + "num_input_tokens_seen": 188092512, + "step": 154575 + }, + { + "epoch": 17.215725581913354, + "grad_norm": 0.0029183810111135244, + "learning_rate": 2.8944071591131246e-06, + "loss": 0.0022, + "num_input_tokens_seen": 188098816, + "step": 154580 + }, + { + "epoch": 17.21628243679697, + "grad_norm": 0.6309325098991394, + "learning_rate": 2.893272421390128e-06, + "loss": 0.0262, + "num_input_tokens_seen": 188105216, + "step": 154585 + }, + { + "epoch": 17.21683929168059, + "grad_norm": 2.2184863090515137, + "learning_rate": 2.8921378924828906e-06, + "loss": 0.1671, + "num_input_tokens_seen": 188111200, + "step": 154590 + }, + { + "epoch": 17.217396146564205, + "grad_norm": 0.004710946697741747, + "learning_rate": 2.8910035724021206e-06, + "loss": 0.0414, + "num_input_tokens_seen": 188117472, + "step": 154595 + }, + { + "epoch": 17.21795300144782, + "grad_norm": 0.0712418183684349, + "learning_rate": 2.8898694611585397e-06, + "loss": 0.1103, + "num_input_tokens_seen": 188123488, + "step": 154600 + }, + { + "epoch": 17.21850985633144, + "grad_norm": 0.10877327620983124, + "learning_rate": 2.888735558762856e-06, + "loss": 0.1182, + "num_input_tokens_seen": 188129728, + "step": 154605 + }, + { + "epoch": 17.219066711215056, + "grad_norm": 0.027591647580266, + "learning_rate": 2.887601865225778e-06, + "loss": 0.0725, + "num_input_tokens_seen": 188135776, + "step": 154610 + }, + { + "epoch": 17.219623566098676, + "grad_norm": 0.5247783064842224, + "learning_rate": 2.8864683805580133e-06, + "loss": 0.0821, + "num_input_tokens_seen": 188141888, + "step": 154615 + }, + { + "epoch": 17.22018042098229, + "grad_norm": 0.035748910158872604, + "learning_rate": 2.885335104770276e-06, + "loss": 0.0027, + "num_input_tokens_seen": 188147968, + "step": 154620 + }, + { + "epoch": 17.220737275865908, + "grad_norm": 0.052425503730773926, + "learning_rate": 2.884202037873268e-06, + "loss": 0.0102, + "num_input_tokens_seen": 188154080, + "step": 154625 + }, + { + "epoch": 17.221294130749527, + "grad_norm": 0.005499956663697958, + "learning_rate": 2.8830691798776897e-06, + "loss": 0.0212, + "num_input_tokens_seen": 188160192, + "step": 154630 + }, + { + "epoch": 17.221850985633143, + "grad_norm": 1.0988608598709106, + "learning_rate": 2.8819365307942383e-06, + "loss": 0.1017, + "num_input_tokens_seen": 188166336, + "step": 154635 + }, + { + "epoch": 17.222407840516762, + "grad_norm": 0.0006406582542695105, + "learning_rate": 2.8808040906336207e-06, + "loss": 0.0132, + "num_input_tokens_seen": 188172736, + "step": 154640 + }, + { + "epoch": 17.22296469540038, + "grad_norm": 1.1142710447311401, + "learning_rate": 2.879671859406527e-06, + "loss": 0.1296, + "num_input_tokens_seen": 188178816, + "step": 154645 + }, + { + "epoch": 17.223521550283994, + "grad_norm": 0.7295364141464233, + "learning_rate": 2.8785398371236643e-06, + "loss": 0.0677, + "num_input_tokens_seen": 188184608, + "step": 154650 + }, + { + "epoch": 17.224078405167614, + "grad_norm": 0.07891856133937836, + "learning_rate": 2.877408023795708e-06, + "loss": 0.107, + "num_input_tokens_seen": 188189888, + "step": 154655 + }, + { + "epoch": 17.22463526005123, + "grad_norm": 0.003287011757493019, + "learning_rate": 2.8762764194333603e-06, + "loss": 0.0033, + "num_input_tokens_seen": 188195968, + "step": 154660 + }, + { + "epoch": 17.22519211493485, + "grad_norm": 1.0834301710128784, + "learning_rate": 2.8751450240473018e-06, + "loss": 0.0332, + "num_input_tokens_seen": 188202048, + "step": 154665 + }, + { + "epoch": 17.225748969818465, + "grad_norm": 0.0022127837873995304, + "learning_rate": 2.874013837648229e-06, + "loss": 0.0076, + "num_input_tokens_seen": 188208160, + "step": 154670 + }, + { + "epoch": 17.226305824702084, + "grad_norm": 0.01754136197268963, + "learning_rate": 2.872882860246828e-06, + "loss": 0.09, + "num_input_tokens_seen": 188214080, + "step": 154675 + }, + { + "epoch": 17.2268626795857, + "grad_norm": 0.0011703444179147482, + "learning_rate": 2.8717520918537687e-06, + "loss": 0.0558, + "num_input_tokens_seen": 188220256, + "step": 154680 + }, + { + "epoch": 17.227419534469316, + "grad_norm": 8.516725938534364e-05, + "learning_rate": 2.870621532479742e-06, + "loss": 0.1376, + "num_input_tokens_seen": 188226400, + "step": 154685 + }, + { + "epoch": 17.227976389352936, + "grad_norm": 0.8711673021316528, + "learning_rate": 2.8694911821354196e-06, + "loss": 0.0689, + "num_input_tokens_seen": 188232608, + "step": 154690 + }, + { + "epoch": 17.22853324423655, + "grad_norm": 0.020807083696126938, + "learning_rate": 2.868361040831491e-06, + "loss": 0.1133, + "num_input_tokens_seen": 188238816, + "step": 154695 + }, + { + "epoch": 17.22909009912017, + "grad_norm": 0.1056402251124382, + "learning_rate": 2.8672311085786218e-06, + "loss": 0.2654, + "num_input_tokens_seen": 188244736, + "step": 154700 + }, + { + "epoch": 17.229646954003787, + "grad_norm": 0.0005820252117700875, + "learning_rate": 2.8661013853874903e-06, + "loss": 0.0074, + "num_input_tokens_seen": 188250848, + "step": 154705 + }, + { + "epoch": 17.230203808887403, + "grad_norm": 1.7555053234100342, + "learning_rate": 2.8649718712687566e-06, + "loss": 0.0519, + "num_input_tokens_seen": 188256640, + "step": 154710 + }, + { + "epoch": 17.230760663771022, + "grad_norm": 2.4448647499084473, + "learning_rate": 2.8638425662331048e-06, + "loss": 0.1519, + "num_input_tokens_seen": 188262720, + "step": 154715 + }, + { + "epoch": 17.231317518654638, + "grad_norm": 0.28561100363731384, + "learning_rate": 2.8627134702911922e-06, + "loss": 0.0217, + "num_input_tokens_seen": 188268928, + "step": 154720 + }, + { + "epoch": 17.231874373538258, + "grad_norm": 1.5220588445663452, + "learning_rate": 2.8615845834536886e-06, + "loss": 0.0587, + "num_input_tokens_seen": 188275136, + "step": 154725 + }, + { + "epoch": 17.232431228421873, + "grad_norm": 1.513444423675537, + "learning_rate": 2.860455905731252e-06, + "loss": 0.1442, + "num_input_tokens_seen": 188280640, + "step": 154730 + }, + { + "epoch": 17.23298808330549, + "grad_norm": 0.7196140289306641, + "learning_rate": 2.8593274371345514e-06, + "loss": 0.0939, + "num_input_tokens_seen": 188287072, + "step": 154735 + }, + { + "epoch": 17.23354493818911, + "grad_norm": 0.0011388311395421624, + "learning_rate": 2.858199177674237e-06, + "loss": 0.0047, + "num_input_tokens_seen": 188292864, + "step": 154740 + }, + { + "epoch": 17.234101793072725, + "grad_norm": 0.02780057303607464, + "learning_rate": 2.8570711273609746e-06, + "loss": 0.0049, + "num_input_tokens_seen": 188298944, + "step": 154745 + }, + { + "epoch": 17.234658647956344, + "grad_norm": 0.12373977154493332, + "learning_rate": 2.8559432862054175e-06, + "loss": 0.0055, + "num_input_tokens_seen": 188304736, + "step": 154750 + }, + { + "epoch": 17.23521550283996, + "grad_norm": 0.048254940658807755, + "learning_rate": 2.854815654218218e-06, + "loss": 0.058, + "num_input_tokens_seen": 188310336, + "step": 154755 + }, + { + "epoch": 17.235772357723576, + "grad_norm": 0.011733684688806534, + "learning_rate": 2.8536882314100203e-06, + "loss": 0.0585, + "num_input_tokens_seen": 188316512, + "step": 154760 + }, + { + "epoch": 17.236329212607195, + "grad_norm": 0.22313742339611053, + "learning_rate": 2.8525610177914857e-06, + "loss": 0.01, + "num_input_tokens_seen": 188322464, + "step": 154765 + }, + { + "epoch": 17.23688606749081, + "grad_norm": 0.8803347945213318, + "learning_rate": 2.851434013373258e-06, + "loss": 0.0246, + "num_input_tokens_seen": 188327936, + "step": 154770 + }, + { + "epoch": 17.23744292237443, + "grad_norm": 0.6030178666114807, + "learning_rate": 2.850307218165982e-06, + "loss": 0.0339, + "num_input_tokens_seen": 188333920, + "step": 154775 + }, + { + "epoch": 17.237999777258047, + "grad_norm": 0.6665524244308472, + "learning_rate": 2.849180632180293e-06, + "loss": 0.0552, + "num_input_tokens_seen": 188339776, + "step": 154780 + }, + { + "epoch": 17.238556632141663, + "grad_norm": 0.6693998575210571, + "learning_rate": 2.8480542554268462e-06, + "loss": 0.0179, + "num_input_tokens_seen": 188345984, + "step": 154785 + }, + { + "epoch": 17.239113487025282, + "grad_norm": 0.002322570187970996, + "learning_rate": 2.84692808791627e-06, + "loss": 0.0276, + "num_input_tokens_seen": 188351712, + "step": 154790 + }, + { + "epoch": 17.239670341908898, + "grad_norm": 0.0004645232984330505, + "learning_rate": 2.845802129659217e-06, + "loss": 0.0871, + "num_input_tokens_seen": 188357792, + "step": 154795 + }, + { + "epoch": 17.240227196792517, + "grad_norm": 0.0017544295405969024, + "learning_rate": 2.8446763806663e-06, + "loss": 0.0492, + "num_input_tokens_seen": 188364000, + "step": 154800 + }, + { + "epoch": 17.240784051676133, + "grad_norm": 0.9206299185752869, + "learning_rate": 2.8435508409481726e-06, + "loss": 0.019, + "num_input_tokens_seen": 188370336, + "step": 154805 + }, + { + "epoch": 17.24134090655975, + "grad_norm": 0.007201101165264845, + "learning_rate": 2.8424255105154535e-06, + "loss": 0.0111, + "num_input_tokens_seen": 188376128, + "step": 154810 + }, + { + "epoch": 17.24189776144337, + "grad_norm": 0.00045184732880443335, + "learning_rate": 2.8413003893787815e-06, + "loss": 0.0108, + "num_input_tokens_seen": 188382048, + "step": 154815 + }, + { + "epoch": 17.242454616326985, + "grad_norm": 0.012374875135719776, + "learning_rate": 2.8401754775487814e-06, + "loss": 0.0009, + "num_input_tokens_seen": 188388544, + "step": 154820 + }, + { + "epoch": 17.243011471210604, + "grad_norm": 0.8045138716697693, + "learning_rate": 2.8390507750360784e-06, + "loss": 0.0413, + "num_input_tokens_seen": 188394528, + "step": 154825 + }, + { + "epoch": 17.24356832609422, + "grad_norm": 0.012762190774083138, + "learning_rate": 2.8379262818512913e-06, + "loss": 0.1139, + "num_input_tokens_seen": 188400992, + "step": 154830 + }, + { + "epoch": 17.244125180977836, + "grad_norm": 0.5618361830711365, + "learning_rate": 2.836801998005051e-06, + "loss": 0.0403, + "num_input_tokens_seen": 188407360, + "step": 154835 + }, + { + "epoch": 17.244682035861455, + "grad_norm": 0.02977093867957592, + "learning_rate": 2.835677923507973e-06, + "loss": 0.0341, + "num_input_tokens_seen": 188413664, + "step": 154840 + }, + { + "epoch": 17.24523889074507, + "grad_norm": 0.02661065384745598, + "learning_rate": 2.8345540583706755e-06, + "loss": 0.0156, + "num_input_tokens_seen": 188419744, + "step": 154845 + }, + { + "epoch": 17.24579574562869, + "grad_norm": 0.7141703367233276, + "learning_rate": 2.833430402603765e-06, + "loss": 0.0604, + "num_input_tokens_seen": 188425920, + "step": 154850 + }, + { + "epoch": 17.246352600512306, + "grad_norm": 8.060125401243567e-05, + "learning_rate": 2.8323069562178728e-06, + "loss": 0.0015, + "num_input_tokens_seen": 188432128, + "step": 154855 + }, + { + "epoch": 17.246909455395922, + "grad_norm": 1.9616796970367432, + "learning_rate": 2.8311837192235955e-06, + "loss": 0.1039, + "num_input_tokens_seen": 188438080, + "step": 154860 + }, + { + "epoch": 17.247466310279542, + "grad_norm": 0.23383451998233795, + "learning_rate": 2.830060691631559e-06, + "loss": 0.01, + "num_input_tokens_seen": 188443968, + "step": 154865 + }, + { + "epoch": 17.248023165163158, + "grad_norm": 0.0009702139650471509, + "learning_rate": 2.82893787345235e-06, + "loss": 0.0702, + "num_input_tokens_seen": 188450336, + "step": 154870 + }, + { + "epoch": 17.248580020046777, + "grad_norm": 0.0005891002947464585, + "learning_rate": 2.8278152646965954e-06, + "loss": 0.0453, + "num_input_tokens_seen": 188456320, + "step": 154875 + }, + { + "epoch": 17.249136874930393, + "grad_norm": 0.7901275157928467, + "learning_rate": 2.8266928653748805e-06, + "loss": 0.1042, + "num_input_tokens_seen": 188462624, + "step": 154880 + }, + { + "epoch": 17.24969372981401, + "grad_norm": 0.1286904364824295, + "learning_rate": 2.825570675497824e-06, + "loss": 0.0121, + "num_input_tokens_seen": 188468800, + "step": 154885 + }, + { + "epoch": 17.25025058469763, + "grad_norm": 0.18157173693180084, + "learning_rate": 2.824448695076018e-06, + "loss": 0.0208, + "num_input_tokens_seen": 188474848, + "step": 154890 + }, + { + "epoch": 17.250807439581244, + "grad_norm": 0.00032075855415314436, + "learning_rate": 2.8233269241200593e-06, + "loss": 0.0566, + "num_input_tokens_seen": 188480672, + "step": 154895 + }, + { + "epoch": 17.251364294464864, + "grad_norm": 0.0011104721343144774, + "learning_rate": 2.82220536264054e-06, + "loss": 0.0286, + "num_input_tokens_seen": 188486144, + "step": 154900 + }, + { + "epoch": 17.25192114934848, + "grad_norm": 1.6489768028259277, + "learning_rate": 2.8210840106480672e-06, + "loss": 0.0901, + "num_input_tokens_seen": 188492128, + "step": 154905 + }, + { + "epoch": 17.252478004232096, + "grad_norm": 1.477171540260315, + "learning_rate": 2.8199628681532224e-06, + "loss": 0.1117, + "num_input_tokens_seen": 188497504, + "step": 154910 + }, + { + "epoch": 17.253034859115715, + "grad_norm": 0.01230932492762804, + "learning_rate": 2.8188419351666023e-06, + "loss": 0.0036, + "num_input_tokens_seen": 188503712, + "step": 154915 + }, + { + "epoch": 17.25359171399933, + "grad_norm": 0.226100355386734, + "learning_rate": 2.8177212116987845e-06, + "loss": 0.0173, + "num_input_tokens_seen": 188509760, + "step": 154920 + }, + { + "epoch": 17.25414856888295, + "grad_norm": 1.2818570137023926, + "learning_rate": 2.816600697760366e-06, + "loss": 0.0738, + "num_input_tokens_seen": 188515968, + "step": 154925 + }, + { + "epoch": 17.254705423766566, + "grad_norm": 0.00012187198444735259, + "learning_rate": 2.815480393361922e-06, + "loss": 0.0008, + "num_input_tokens_seen": 188522016, + "step": 154930 + }, + { + "epoch": 17.255262278650182, + "grad_norm": 0.011884802021086216, + "learning_rate": 2.814360298514046e-06, + "loss": 0.0463, + "num_input_tokens_seen": 188527968, + "step": 154935 + }, + { + "epoch": 17.2558191335338, + "grad_norm": 0.13554708659648895, + "learning_rate": 2.813240413227311e-06, + "loss": 0.0846, + "num_input_tokens_seen": 188534240, + "step": 154940 + }, + { + "epoch": 17.256375988417417, + "grad_norm": 0.0002949861518573016, + "learning_rate": 2.8121207375122973e-06, + "loss": 0.0098, + "num_input_tokens_seen": 188540352, + "step": 154945 + }, + { + "epoch": 17.256932843301037, + "grad_norm": 0.06839960068464279, + "learning_rate": 2.8110012713795736e-06, + "loss": 0.0229, + "num_input_tokens_seen": 188546336, + "step": 154950 + }, + { + "epoch": 17.257489698184653, + "grad_norm": 0.006611959543079138, + "learning_rate": 2.8098820148397266e-06, + "loss": 0.0134, + "num_input_tokens_seen": 188552608, + "step": 154955 + }, + { + "epoch": 17.25804655306827, + "grad_norm": 0.09598901867866516, + "learning_rate": 2.8087629679033197e-06, + "loss": 0.0021, + "num_input_tokens_seen": 188558688, + "step": 154960 + }, + { + "epoch": 17.258603407951888, + "grad_norm": 0.14527228474617004, + "learning_rate": 2.8076441305809276e-06, + "loss": 0.0051, + "num_input_tokens_seen": 188564960, + "step": 154965 + }, + { + "epoch": 17.259160262835504, + "grad_norm": 0.014286448247730732, + "learning_rate": 2.8065255028831116e-06, + "loss": 0.0457, + "num_input_tokens_seen": 188571200, + "step": 154970 + }, + { + "epoch": 17.259717117719124, + "grad_norm": 0.020217832177877426, + "learning_rate": 2.8054070848204494e-06, + "loss": 0.0686, + "num_input_tokens_seen": 188577088, + "step": 154975 + }, + { + "epoch": 17.26027397260274, + "grad_norm": 0.01212573517113924, + "learning_rate": 2.8042888764034938e-06, + "loss": 0.0093, + "num_input_tokens_seen": 188582880, + "step": 154980 + }, + { + "epoch": 17.260830827486355, + "grad_norm": 0.17198914289474487, + "learning_rate": 2.8031708776428216e-06, + "loss": 0.026, + "num_input_tokens_seen": 188589376, + "step": 154985 + }, + { + "epoch": 17.261387682369975, + "grad_norm": 0.0008703942876309156, + "learning_rate": 2.8020530885489755e-06, + "loss": 0.1171, + "num_input_tokens_seen": 188595808, + "step": 154990 + }, + { + "epoch": 17.26194453725359, + "grad_norm": 0.13626191020011902, + "learning_rate": 2.800935509132527e-06, + "loss": 0.0013, + "num_input_tokens_seen": 188602080, + "step": 154995 + }, + { + "epoch": 17.26250139213721, + "grad_norm": 0.7490716576576233, + "learning_rate": 2.799818139404023e-06, + "loss": 0.0147, + "num_input_tokens_seen": 188608320, + "step": 155000 + }, + { + "epoch": 17.263058247020826, + "grad_norm": 0.0011237134458497167, + "learning_rate": 2.7987009793740305e-06, + "loss": 0.0115, + "num_input_tokens_seen": 188614240, + "step": 155005 + }, + { + "epoch": 17.263615101904442, + "grad_norm": 0.010803998447954655, + "learning_rate": 2.7975840290530934e-06, + "loss": 0.0192, + "num_input_tokens_seen": 188620512, + "step": 155010 + }, + { + "epoch": 17.26417195678806, + "grad_norm": 0.016235297545790672, + "learning_rate": 2.7964672884517622e-06, + "loss": 0.0826, + "num_input_tokens_seen": 188626880, + "step": 155015 + }, + { + "epoch": 17.264728811671677, + "grad_norm": 2.4825520515441895, + "learning_rate": 2.795350757580581e-06, + "loss": 0.2463, + "num_input_tokens_seen": 188632800, + "step": 155020 + }, + { + "epoch": 17.265285666555297, + "grad_norm": 0.883150041103363, + "learning_rate": 2.7942344364501076e-06, + "loss": 0.0148, + "num_input_tokens_seen": 188638752, + "step": 155025 + }, + { + "epoch": 17.265842521438913, + "grad_norm": 0.031483087688684464, + "learning_rate": 2.7931183250708842e-06, + "loss": 0.0072, + "num_input_tokens_seen": 188644960, + "step": 155030 + }, + { + "epoch": 17.266399376322532, + "grad_norm": 0.21082785725593567, + "learning_rate": 2.7920024234534465e-06, + "loss": 0.0331, + "num_input_tokens_seen": 188651264, + "step": 155035 + }, + { + "epoch": 17.266956231206148, + "grad_norm": 0.25493162870407104, + "learning_rate": 2.7908867316083332e-06, + "loss": 0.0695, + "num_input_tokens_seen": 188656864, + "step": 155040 + }, + { + "epoch": 17.267513086089764, + "grad_norm": 0.7804623246192932, + "learning_rate": 2.789771249546097e-06, + "loss": 0.0954, + "num_input_tokens_seen": 188663072, + "step": 155045 + }, + { + "epoch": 17.268069940973383, + "grad_norm": 0.0002803584502544254, + "learning_rate": 2.7886559772772576e-06, + "loss": 0.0539, + "num_input_tokens_seen": 188668992, + "step": 155050 + }, + { + "epoch": 17.268626795857, + "grad_norm": 0.5479053854942322, + "learning_rate": 2.7875409148123698e-06, + "loss": 0.0088, + "num_input_tokens_seen": 188675136, + "step": 155055 + }, + { + "epoch": 17.26918365074062, + "grad_norm": 0.012122736312448978, + "learning_rate": 2.7864260621619425e-06, + "loss": 0.0208, + "num_input_tokens_seen": 188681280, + "step": 155060 + }, + { + "epoch": 17.269740505624235, + "grad_norm": 0.05167996510863304, + "learning_rate": 2.785311419336525e-06, + "loss": 0.122, + "num_input_tokens_seen": 188687488, + "step": 155065 + }, + { + "epoch": 17.27029736050785, + "grad_norm": 0.14905618131160736, + "learning_rate": 2.784196986346632e-06, + "loss": 0.0187, + "num_input_tokens_seen": 188693376, + "step": 155070 + }, + { + "epoch": 17.27085421539147, + "grad_norm": 1.0246440172195435, + "learning_rate": 2.7830827632028046e-06, + "loss": 0.0427, + "num_input_tokens_seen": 188699296, + "step": 155075 + }, + { + "epoch": 17.271411070275086, + "grad_norm": 0.17966681718826294, + "learning_rate": 2.781968749915559e-06, + "loss": 0.0039, + "num_input_tokens_seen": 188705600, + "step": 155080 + }, + { + "epoch": 17.271967925158705, + "grad_norm": 0.12086983770132065, + "learning_rate": 2.7808549464954204e-06, + "loss": 0.0149, + "num_input_tokens_seen": 188711456, + "step": 155085 + }, + { + "epoch": 17.27252478004232, + "grad_norm": 1.0731511116027832, + "learning_rate": 2.779741352952908e-06, + "loss": 0.0734, + "num_input_tokens_seen": 188717216, + "step": 155090 + }, + { + "epoch": 17.273081634925937, + "grad_norm": 0.027477823197841644, + "learning_rate": 2.778627969298539e-06, + "loss": 0.0235, + "num_input_tokens_seen": 188723488, + "step": 155095 + }, + { + "epoch": 17.273638489809557, + "grad_norm": 0.0014921913389116526, + "learning_rate": 2.777514795542832e-06, + "loss": 0.0692, + "num_input_tokens_seen": 188729408, + "step": 155100 + }, + { + "epoch": 17.274195344693172, + "grad_norm": 0.5416478514671326, + "learning_rate": 2.776401831696307e-06, + "loss": 0.031, + "num_input_tokens_seen": 188735360, + "step": 155105 + }, + { + "epoch": 17.274752199576792, + "grad_norm": 0.045946236699819565, + "learning_rate": 2.775289077769469e-06, + "loss": 0.0407, + "num_input_tokens_seen": 188741440, + "step": 155110 + }, + { + "epoch": 17.275309054460408, + "grad_norm": 0.10367811471223831, + "learning_rate": 2.7741765337728265e-06, + "loss": 0.0137, + "num_input_tokens_seen": 188747872, + "step": 155115 + }, + { + "epoch": 17.275865909344024, + "grad_norm": 0.03598871827125549, + "learning_rate": 2.7730641997169016e-06, + "loss": 0.0483, + "num_input_tokens_seen": 188753376, + "step": 155120 + }, + { + "epoch": 17.276422764227643, + "grad_norm": 0.10847394168376923, + "learning_rate": 2.7719520756121916e-06, + "loss": 0.0369, + "num_input_tokens_seen": 188759296, + "step": 155125 + }, + { + "epoch": 17.27697961911126, + "grad_norm": 0.008937016129493713, + "learning_rate": 2.7708401614692013e-06, + "loss": 0.011, + "num_input_tokens_seen": 188765632, + "step": 155130 + }, + { + "epoch": 17.27753647399488, + "grad_norm": 0.0028189285658299923, + "learning_rate": 2.769728457298432e-06, + "loss": 0.0132, + "num_input_tokens_seen": 188771808, + "step": 155135 + }, + { + "epoch": 17.278093328878494, + "grad_norm": 1.55157470703125, + "learning_rate": 2.7686169631103932e-06, + "loss": 0.0165, + "num_input_tokens_seen": 188777472, + "step": 155140 + }, + { + "epoch": 17.27865018376211, + "grad_norm": 1.5364854335784912, + "learning_rate": 2.7675056789155747e-06, + "loss": 0.0185, + "num_input_tokens_seen": 188783136, + "step": 155145 + }, + { + "epoch": 17.27920703864573, + "grad_norm": 0.45095062255859375, + "learning_rate": 2.766394604724479e-06, + "loss": 0.0453, + "num_input_tokens_seen": 188789120, + "step": 155150 + }, + { + "epoch": 17.279763893529346, + "grad_norm": 1.0808591842651367, + "learning_rate": 2.765283740547603e-06, + "loss": 0.063, + "num_input_tokens_seen": 188795040, + "step": 155155 + }, + { + "epoch": 17.280320748412965, + "grad_norm": 3.8588216304779053, + "learning_rate": 2.7641730863954358e-06, + "loss": 0.1265, + "num_input_tokens_seen": 188801248, + "step": 155160 + }, + { + "epoch": 17.28087760329658, + "grad_norm": 0.01389322429895401, + "learning_rate": 2.7630626422784605e-06, + "loss": 0.008, + "num_input_tokens_seen": 188807392, + "step": 155165 + }, + { + "epoch": 17.281434458180197, + "grad_norm": 1.6430832147598267, + "learning_rate": 2.761952408207183e-06, + "loss": 0.135, + "num_input_tokens_seen": 188813344, + "step": 155170 + }, + { + "epoch": 17.281991313063816, + "grad_norm": 1.7463164329528809, + "learning_rate": 2.7608423841920807e-06, + "loss": 0.0297, + "num_input_tokens_seen": 188819712, + "step": 155175 + }, + { + "epoch": 17.282548167947432, + "grad_norm": 0.011308721266686916, + "learning_rate": 2.7597325702436395e-06, + "loss": 0.0009, + "num_input_tokens_seen": 188826208, + "step": 155180 + }, + { + "epoch": 17.28310502283105, + "grad_norm": 0.11558951437473297, + "learning_rate": 2.7586229663723403e-06, + "loss": 0.0161, + "num_input_tokens_seen": 188832160, + "step": 155185 + }, + { + "epoch": 17.283661877714668, + "grad_norm": 1.5780867338180542, + "learning_rate": 2.757513572588669e-06, + "loss": 0.1499, + "num_input_tokens_seen": 188838272, + "step": 155190 + }, + { + "epoch": 17.284218732598283, + "grad_norm": 0.03772890567779541, + "learning_rate": 2.756404388903097e-06, + "loss": 0.0185, + "num_input_tokens_seen": 188844224, + "step": 155195 + }, + { + "epoch": 17.284775587481903, + "grad_norm": 0.27997326850891113, + "learning_rate": 2.7552954153261175e-06, + "loss": 0.0209, + "num_input_tokens_seen": 188849824, + "step": 155200 + }, + { + "epoch": 17.28533244236552, + "grad_norm": 0.12653830647468567, + "learning_rate": 2.754186651868185e-06, + "loss": 0.0086, + "num_input_tokens_seen": 188855744, + "step": 155205 + }, + { + "epoch": 17.28588929724914, + "grad_norm": 0.8673160076141357, + "learning_rate": 2.753078098539788e-06, + "loss": 0.0246, + "num_input_tokens_seen": 188861664, + "step": 155210 + }, + { + "epoch": 17.286446152132754, + "grad_norm": 0.0006074598641134799, + "learning_rate": 2.7519697553513884e-06, + "loss": 0.0003, + "num_input_tokens_seen": 188867936, + "step": 155215 + }, + { + "epoch": 17.28700300701637, + "grad_norm": 0.018669884651899338, + "learning_rate": 2.7508616223134608e-06, + "loss": 0.0035, + "num_input_tokens_seen": 188874080, + "step": 155220 + }, + { + "epoch": 17.28755986189999, + "grad_norm": 0.2238578200340271, + "learning_rate": 2.7497536994364746e-06, + "loss": 0.0156, + "num_input_tokens_seen": 188880608, + "step": 155225 + }, + { + "epoch": 17.288116716783605, + "grad_norm": 1.0985581874847412, + "learning_rate": 2.748645986730888e-06, + "loss": 0.0613, + "num_input_tokens_seen": 188886688, + "step": 155230 + }, + { + "epoch": 17.288673571667225, + "grad_norm": 0.018389442935585976, + "learning_rate": 2.7475384842071654e-06, + "loss": 0.0124, + "num_input_tokens_seen": 188892064, + "step": 155235 + }, + { + "epoch": 17.28923042655084, + "grad_norm": 0.0001840385521063581, + "learning_rate": 2.7464311918757756e-06, + "loss": 0.0436, + "num_input_tokens_seen": 188898176, + "step": 155240 + }, + { + "epoch": 17.289787281434457, + "grad_norm": 0.03606494143605232, + "learning_rate": 2.745324109747169e-06, + "loss": 0.0672, + "num_input_tokens_seen": 188904352, + "step": 155245 + }, + { + "epoch": 17.290344136318076, + "grad_norm": 0.003669114550575614, + "learning_rate": 2.744217237831809e-06, + "loss": 0.0064, + "num_input_tokens_seen": 188910496, + "step": 155250 + }, + { + "epoch": 17.290900991201692, + "grad_norm": 0.011787495575845242, + "learning_rate": 2.7431105761401427e-06, + "loss": 0.1122, + "num_input_tokens_seen": 188916800, + "step": 155255 + }, + { + "epoch": 17.29145784608531, + "grad_norm": 0.17186984419822693, + "learning_rate": 2.7420041246826343e-06, + "loss": 0.0735, + "num_input_tokens_seen": 188923136, + "step": 155260 + }, + { + "epoch": 17.292014700968927, + "grad_norm": 0.007326303515583277, + "learning_rate": 2.740897883469723e-06, + "loss": 0.0119, + "num_input_tokens_seen": 188929216, + "step": 155265 + }, + { + "epoch": 17.292571555852543, + "grad_norm": 0.18242496252059937, + "learning_rate": 2.7397918525118778e-06, + "loss": 0.0358, + "num_input_tokens_seen": 188935488, + "step": 155270 + }, + { + "epoch": 17.293128410736163, + "grad_norm": 0.8238592743873596, + "learning_rate": 2.738686031819521e-06, + "loss": 0.0168, + "num_input_tokens_seen": 188941248, + "step": 155275 + }, + { + "epoch": 17.29368526561978, + "grad_norm": 0.0009272801107726991, + "learning_rate": 2.737580421403116e-06, + "loss": 0.001, + "num_input_tokens_seen": 188947808, + "step": 155280 + }, + { + "epoch": 17.294242120503398, + "grad_norm": 0.0018833132926374674, + "learning_rate": 2.736475021273094e-06, + "loss": 0.0009, + "num_input_tokens_seen": 188954144, + "step": 155285 + }, + { + "epoch": 17.294798975387014, + "grad_norm": 0.00044945284025743604, + "learning_rate": 2.7353698314399074e-06, + "loss": 0.0835, + "num_input_tokens_seen": 188960512, + "step": 155290 + }, + { + "epoch": 17.29535583027063, + "grad_norm": 2.5659632682800293, + "learning_rate": 2.7342648519139925e-06, + "loss": 0.1175, + "num_input_tokens_seen": 188966912, + "step": 155295 + }, + { + "epoch": 17.29591268515425, + "grad_norm": 0.013307505287230015, + "learning_rate": 2.7331600827057853e-06, + "loss": 0.0473, + "num_input_tokens_seen": 188973024, + "step": 155300 + }, + { + "epoch": 17.296469540037865, + "grad_norm": 0.10155469924211502, + "learning_rate": 2.7320555238257133e-06, + "loss": 0.0204, + "num_input_tokens_seen": 188979200, + "step": 155305 + }, + { + "epoch": 17.297026394921485, + "grad_norm": 0.36418086290359497, + "learning_rate": 2.730951175284227e-06, + "loss": 0.0608, + "num_input_tokens_seen": 188984832, + "step": 155310 + }, + { + "epoch": 17.2975832498051, + "grad_norm": 0.09177248179912567, + "learning_rate": 2.7298470370917455e-06, + "loss": 0.0088, + "num_input_tokens_seen": 188990944, + "step": 155315 + }, + { + "epoch": 17.298140104688716, + "grad_norm": 0.39068636298179626, + "learning_rate": 2.7287431092587023e-06, + "loss": 0.11, + "num_input_tokens_seen": 188997088, + "step": 155320 + }, + { + "epoch": 17.298696959572336, + "grad_norm": 7.505541725549847e-05, + "learning_rate": 2.7276393917955166e-06, + "loss": 0.0113, + "num_input_tokens_seen": 189002848, + "step": 155325 + }, + { + "epoch": 17.299253814455952, + "grad_norm": 1.4788870811462402, + "learning_rate": 2.7265358847126277e-06, + "loss": 0.1307, + "num_input_tokens_seen": 189008960, + "step": 155330 + }, + { + "epoch": 17.29981066933957, + "grad_norm": 0.0036739453207701445, + "learning_rate": 2.725432588020449e-06, + "loss": 0.0017, + "num_input_tokens_seen": 189015392, + "step": 155335 + }, + { + "epoch": 17.300367524223187, + "grad_norm": 0.0003722640103660524, + "learning_rate": 2.7243295017294085e-06, + "loss": 0.0341, + "num_input_tokens_seen": 189021440, + "step": 155340 + }, + { + "epoch": 17.300924379106803, + "grad_norm": 0.19658693671226501, + "learning_rate": 2.7232266258499255e-06, + "loss": 0.0695, + "num_input_tokens_seen": 189027424, + "step": 155345 + }, + { + "epoch": 17.301481233990422, + "grad_norm": 0.0005457644583657384, + "learning_rate": 2.7221239603924116e-06, + "loss": 0.0068, + "num_input_tokens_seen": 189033600, + "step": 155350 + }, + { + "epoch": 17.30203808887404, + "grad_norm": 0.13771188259124756, + "learning_rate": 2.7210215053672826e-06, + "loss": 0.0203, + "num_input_tokens_seen": 189039616, + "step": 155355 + }, + { + "epoch": 17.302594943757658, + "grad_norm": 0.022465653717517853, + "learning_rate": 2.7199192607849615e-06, + "loss": 0.0466, + "num_input_tokens_seen": 189045696, + "step": 155360 + }, + { + "epoch": 17.303151798641274, + "grad_norm": 0.9269315004348755, + "learning_rate": 2.718817226655851e-06, + "loss": 0.0191, + "num_input_tokens_seen": 189051584, + "step": 155365 + }, + { + "epoch": 17.303708653524893, + "grad_norm": 0.2347479909658432, + "learning_rate": 2.7177154029903645e-06, + "loss": 0.0141, + "num_input_tokens_seen": 189057792, + "step": 155370 + }, + { + "epoch": 17.30426550840851, + "grad_norm": 0.5500423312187195, + "learning_rate": 2.7166137897989023e-06, + "loss": 0.0901, + "num_input_tokens_seen": 189063680, + "step": 155375 + }, + { + "epoch": 17.304822363292125, + "grad_norm": 0.13285396993160248, + "learning_rate": 2.7155123870918813e-06, + "loss": 0.1226, + "num_input_tokens_seen": 189069856, + "step": 155380 + }, + { + "epoch": 17.305379218175744, + "grad_norm": 0.5082648396492004, + "learning_rate": 2.714411194879693e-06, + "loss": 0.0453, + "num_input_tokens_seen": 189076096, + "step": 155385 + }, + { + "epoch": 17.30593607305936, + "grad_norm": 0.4220179319381714, + "learning_rate": 2.7133102131727596e-06, + "loss": 0.1349, + "num_input_tokens_seen": 189081664, + "step": 155390 + }, + { + "epoch": 17.30649292794298, + "grad_norm": 0.307583749294281, + "learning_rate": 2.7122094419814557e-06, + "loss": 0.0303, + "num_input_tokens_seen": 189087712, + "step": 155395 + }, + { + "epoch": 17.307049782826596, + "grad_norm": 0.03650518134236336, + "learning_rate": 2.7111088813161933e-06, + "loss": 0.0972, + "num_input_tokens_seen": 189094112, + "step": 155400 + }, + { + "epoch": 17.30760663771021, + "grad_norm": 0.2146405130624771, + "learning_rate": 2.7100085311873607e-06, + "loss": 0.0337, + "num_input_tokens_seen": 189100064, + "step": 155405 + }, + { + "epoch": 17.30816349259383, + "grad_norm": 0.00010611533070914447, + "learning_rate": 2.7089083916053635e-06, + "loss": 0.0033, + "num_input_tokens_seen": 189106496, + "step": 155410 + }, + { + "epoch": 17.308720347477447, + "grad_norm": 0.004354527220129967, + "learning_rate": 2.7078084625805828e-06, + "loss": 0.0079, + "num_input_tokens_seen": 189112928, + "step": 155415 + }, + { + "epoch": 17.309277202361066, + "grad_norm": 0.0027793576009571552, + "learning_rate": 2.706708744123415e-06, + "loss": 0.0117, + "num_input_tokens_seen": 189119136, + "step": 155420 + }, + { + "epoch": 17.309834057244682, + "grad_norm": 0.004282115492969751, + "learning_rate": 2.705609236244236e-06, + "loss": 0.0534, + "num_input_tokens_seen": 189125344, + "step": 155425 + }, + { + "epoch": 17.310390912128298, + "grad_norm": 0.008019868284463882, + "learning_rate": 2.7045099389534452e-06, + "loss": 0.0696, + "num_input_tokens_seen": 189130400, + "step": 155430 + }, + { + "epoch": 17.310947767011918, + "grad_norm": 0.020934967324137688, + "learning_rate": 2.7034108522614234e-06, + "loss": 0.0107, + "num_input_tokens_seen": 189136608, + "step": 155435 + }, + { + "epoch": 17.311504621895534, + "grad_norm": 2.4090383052825928, + "learning_rate": 2.7023119761785454e-06, + "loss": 0.1532, + "num_input_tokens_seen": 189142592, + "step": 155440 + }, + { + "epoch": 17.312061476779153, + "grad_norm": 0.00010175829083891585, + "learning_rate": 2.701213310715195e-06, + "loss": 0.019, + "num_input_tokens_seen": 189148672, + "step": 155445 + }, + { + "epoch": 17.31261833166277, + "grad_norm": 0.09516812860965729, + "learning_rate": 2.7001148558817524e-06, + "loss": 0.0127, + "num_input_tokens_seen": 189155200, + "step": 155450 + }, + { + "epoch": 17.313175186546385, + "grad_norm": 0.2993695139884949, + "learning_rate": 2.6990166116885875e-06, + "loss": 0.0243, + "num_input_tokens_seen": 189160992, + "step": 155455 + }, + { + "epoch": 17.313732041430004, + "grad_norm": 0.18538779020309448, + "learning_rate": 2.697918578146086e-06, + "loss": 0.0024, + "num_input_tokens_seen": 189167104, + "step": 155460 + }, + { + "epoch": 17.31428889631362, + "grad_norm": 0.3356943130493164, + "learning_rate": 2.6968207552646035e-06, + "loss": 0.0945, + "num_input_tokens_seen": 189173344, + "step": 155465 + }, + { + "epoch": 17.31484575119724, + "grad_norm": 0.017397016286849976, + "learning_rate": 2.6957231430545236e-06, + "loss": 0.0307, + "num_input_tokens_seen": 189179456, + "step": 155470 + }, + { + "epoch": 17.315402606080855, + "grad_norm": 0.025208815932273865, + "learning_rate": 2.6946257415262024e-06, + "loss": 0.0881, + "num_input_tokens_seen": 189185504, + "step": 155475 + }, + { + "epoch": 17.31595946096447, + "grad_norm": 0.7475447058677673, + "learning_rate": 2.6935285506900165e-06, + "loss": 0.0218, + "num_input_tokens_seen": 189191488, + "step": 155480 + }, + { + "epoch": 17.31651631584809, + "grad_norm": 0.9322310090065002, + "learning_rate": 2.692431570556325e-06, + "loss": 0.0264, + "num_input_tokens_seen": 189197856, + "step": 155485 + }, + { + "epoch": 17.317073170731707, + "grad_norm": 1.3776743412017822, + "learning_rate": 2.691334801135492e-06, + "loss": 0.1118, + "num_input_tokens_seen": 189204032, + "step": 155490 + }, + { + "epoch": 17.317630025615326, + "grad_norm": 0.012962089851498604, + "learning_rate": 2.690238242437873e-06, + "loss": 0.0135, + "num_input_tokens_seen": 189210272, + "step": 155495 + }, + { + "epoch": 17.318186880498942, + "grad_norm": 0.19430293142795563, + "learning_rate": 2.6891418944738234e-06, + "loss": 0.0531, + "num_input_tokens_seen": 189216192, + "step": 155500 + }, + { + "epoch": 17.318743735382558, + "grad_norm": 0.0001979868538910523, + "learning_rate": 2.68804575725371e-06, + "loss": 0.0133, + "num_input_tokens_seen": 189222336, + "step": 155505 + }, + { + "epoch": 17.319300590266177, + "grad_norm": 0.006127461325377226, + "learning_rate": 2.6869498307878797e-06, + "loss": 0.002, + "num_input_tokens_seen": 189228512, + "step": 155510 + }, + { + "epoch": 17.319857445149793, + "grad_norm": 1.4463292360305786, + "learning_rate": 2.6858541150866863e-06, + "loss": 0.1455, + "num_input_tokens_seen": 189234880, + "step": 155515 + }, + { + "epoch": 17.320414300033413, + "grad_norm": 0.00043696275679394603, + "learning_rate": 2.6847586101604705e-06, + "loss": 0.0065, + "num_input_tokens_seen": 189240960, + "step": 155520 + }, + { + "epoch": 17.32097115491703, + "grad_norm": 0.03155460208654404, + "learning_rate": 2.6836633160195967e-06, + "loss": 0.0081, + "num_input_tokens_seen": 189247360, + "step": 155525 + }, + { + "epoch": 17.321528009800645, + "grad_norm": 0.035912513732910156, + "learning_rate": 2.6825682326743957e-06, + "loss": 0.0212, + "num_input_tokens_seen": 189253248, + "step": 155530 + }, + { + "epoch": 17.322084864684264, + "grad_norm": 0.0005027193110436201, + "learning_rate": 2.6814733601352284e-06, + "loss": 0.0062, + "num_input_tokens_seen": 189258944, + "step": 155535 + }, + { + "epoch": 17.32264171956788, + "grad_norm": 0.0003361605922691524, + "learning_rate": 2.6803786984124168e-06, + "loss": 0.0093, + "num_input_tokens_seen": 189264928, + "step": 155540 + }, + { + "epoch": 17.3231985744515, + "grad_norm": 0.1699167788028717, + "learning_rate": 2.6792842475163145e-06, + "loss": 0.0612, + "num_input_tokens_seen": 189270912, + "step": 155545 + }, + { + "epoch": 17.323755429335115, + "grad_norm": 1.2259423732757568, + "learning_rate": 2.678190007457251e-06, + "loss": 0.1158, + "num_input_tokens_seen": 189277088, + "step": 155550 + }, + { + "epoch": 17.32431228421873, + "grad_norm": 0.023841911926865578, + "learning_rate": 2.6770959782455724e-06, + "loss": 0.0022, + "num_input_tokens_seen": 189283424, + "step": 155555 + }, + { + "epoch": 17.32486913910235, + "grad_norm": 0.0014961225679144263, + "learning_rate": 2.676002159891608e-06, + "loss": 0.0778, + "num_input_tokens_seen": 189289696, + "step": 155560 + }, + { + "epoch": 17.325425993985966, + "grad_norm": 1.4208028316497803, + "learning_rate": 2.674908552405686e-06, + "loss": 0.0186, + "num_input_tokens_seen": 189296352, + "step": 155565 + }, + { + "epoch": 17.325982848869586, + "grad_norm": 0.0023490514140576124, + "learning_rate": 2.6738151557981373e-06, + "loss": 0.0016, + "num_input_tokens_seen": 189302528, + "step": 155570 + }, + { + "epoch": 17.326539703753202, + "grad_norm": 1.2263598442077637, + "learning_rate": 2.672721970079295e-06, + "loss": 0.158, + "num_input_tokens_seen": 189308448, + "step": 155575 + }, + { + "epoch": 17.327096558636818, + "grad_norm": 0.9218297600746155, + "learning_rate": 2.6716289952594816e-06, + "loss": 0.1121, + "num_input_tokens_seen": 189314240, + "step": 155580 + }, + { + "epoch": 17.327653413520437, + "grad_norm": 1.2261663675308228, + "learning_rate": 2.6705362313490245e-06, + "loss": 0.0825, + "num_input_tokens_seen": 189320512, + "step": 155585 + }, + { + "epoch": 17.328210268404053, + "grad_norm": 0.1326584666967392, + "learning_rate": 2.6694436783582357e-06, + "loss": 0.0072, + "num_input_tokens_seen": 189326688, + "step": 155590 + }, + { + "epoch": 17.328767123287673, + "grad_norm": 0.012239077128469944, + "learning_rate": 2.6683513362974477e-06, + "loss": 0.0423, + "num_input_tokens_seen": 189332704, + "step": 155595 + }, + { + "epoch": 17.32932397817129, + "grad_norm": 0.23633413016796112, + "learning_rate": 2.6672592051769668e-06, + "loss": 0.0432, + "num_input_tokens_seen": 189338432, + "step": 155600 + }, + { + "epoch": 17.329880833054904, + "grad_norm": 0.13304071128368378, + "learning_rate": 2.6661672850071263e-06, + "loss": 0.005, + "num_input_tokens_seen": 189344384, + "step": 155605 + }, + { + "epoch": 17.330437687938524, + "grad_norm": 0.16766385734081268, + "learning_rate": 2.6650755757982203e-06, + "loss": 0.0169, + "num_input_tokens_seen": 189349568, + "step": 155610 + }, + { + "epoch": 17.33099454282214, + "grad_norm": 0.00016990168660413474, + "learning_rate": 2.6639840775605744e-06, + "loss": 0.0281, + "num_input_tokens_seen": 189355552, + "step": 155615 + }, + { + "epoch": 17.33155139770576, + "grad_norm": 0.00015082082245498896, + "learning_rate": 2.6628927903044887e-06, + "loss": 0.07, + "num_input_tokens_seen": 189362016, + "step": 155620 + }, + { + "epoch": 17.332108252589375, + "grad_norm": 0.7186796069145203, + "learning_rate": 2.6618017140402825e-06, + "loss": 0.0927, + "num_input_tokens_seen": 189367968, + "step": 155625 + }, + { + "epoch": 17.33266510747299, + "grad_norm": 0.00032322845072485507, + "learning_rate": 2.6607108487782557e-06, + "loss": 0.0091, + "num_input_tokens_seen": 189374144, + "step": 155630 + }, + { + "epoch": 17.33322196235661, + "grad_norm": 0.014686869457364082, + "learning_rate": 2.6596201945287113e-06, + "loss": 0.0009, + "num_input_tokens_seen": 189380320, + "step": 155635 + }, + { + "epoch": 17.333778817240226, + "grad_norm": 0.03933216258883476, + "learning_rate": 2.6585297513019495e-06, + "loss": 0.0409, + "num_input_tokens_seen": 189386144, + "step": 155640 + }, + { + "epoch": 17.334335672123846, + "grad_norm": 0.21269717812538147, + "learning_rate": 2.6574395191082792e-06, + "loss": 0.1792, + "num_input_tokens_seen": 189392288, + "step": 155645 + }, + { + "epoch": 17.33489252700746, + "grad_norm": 0.10672061145305634, + "learning_rate": 2.656349497957994e-06, + "loss": 0.0156, + "num_input_tokens_seen": 189398400, + "step": 155650 + }, + { + "epoch": 17.335449381891078, + "grad_norm": 0.04170091077685356, + "learning_rate": 2.655259687861386e-06, + "loss": 0.0129, + "num_input_tokens_seen": 189404608, + "step": 155655 + }, + { + "epoch": 17.336006236774697, + "grad_norm": 0.16056932508945465, + "learning_rate": 2.65417008882875e-06, + "loss": 0.0306, + "num_input_tokens_seen": 189410752, + "step": 155660 + }, + { + "epoch": 17.336563091658313, + "grad_norm": 0.11730378866195679, + "learning_rate": 2.653080700870386e-06, + "loss": 0.0154, + "num_input_tokens_seen": 189416992, + "step": 155665 + }, + { + "epoch": 17.337119946541932, + "grad_norm": 0.0017773605650290847, + "learning_rate": 2.651991523996575e-06, + "loss": 0.033, + "num_input_tokens_seen": 189423200, + "step": 155670 + }, + { + "epoch": 17.33767680142555, + "grad_norm": 1.236430048942566, + "learning_rate": 2.650902558217616e-06, + "loss": 0.0221, + "num_input_tokens_seen": 189429024, + "step": 155675 + }, + { + "epoch": 17.338233656309164, + "grad_norm": 0.047602493315935135, + "learning_rate": 2.6498138035437797e-06, + "loss": 0.0959, + "num_input_tokens_seen": 189435264, + "step": 155680 + }, + { + "epoch": 17.338790511192784, + "grad_norm": 0.3475986421108246, + "learning_rate": 2.6487252599853627e-06, + "loss": 0.0049, + "num_input_tokens_seen": 189441152, + "step": 155685 + }, + { + "epoch": 17.3393473660764, + "grad_norm": 3.852030038833618, + "learning_rate": 2.647636927552638e-06, + "loss": 0.0827, + "num_input_tokens_seen": 189447520, + "step": 155690 + }, + { + "epoch": 17.33990422096002, + "grad_norm": 0.00014968073810450733, + "learning_rate": 2.646548806255897e-06, + "loss": 0.025, + "num_input_tokens_seen": 189453664, + "step": 155695 + }, + { + "epoch": 17.340461075843635, + "grad_norm": 0.000245139526668936, + "learning_rate": 2.6454608961054115e-06, + "loss": 0.0121, + "num_input_tokens_seen": 189459456, + "step": 155700 + }, + { + "epoch": 17.341017930727254, + "grad_norm": 0.22934748232364655, + "learning_rate": 2.6443731971114603e-06, + "loss": 0.1424, + "num_input_tokens_seen": 189465504, + "step": 155705 + }, + { + "epoch": 17.34157478561087, + "grad_norm": 0.3779672086238861, + "learning_rate": 2.6432857092843073e-06, + "loss": 0.0142, + "num_input_tokens_seen": 189471616, + "step": 155710 + }, + { + "epoch": 17.342131640494486, + "grad_norm": 0.30151456594467163, + "learning_rate": 2.642198432634238e-06, + "loss": 0.0079, + "num_input_tokens_seen": 189477856, + "step": 155715 + }, + { + "epoch": 17.342688495378106, + "grad_norm": 0.00236211228184402, + "learning_rate": 2.6411113671715172e-06, + "loss": 0.0753, + "num_input_tokens_seen": 189483744, + "step": 155720 + }, + { + "epoch": 17.34324535026172, + "grad_norm": 0.019485225901007652, + "learning_rate": 2.640024512906414e-06, + "loss": 0.0387, + "num_input_tokens_seen": 189489920, + "step": 155725 + }, + { + "epoch": 17.343802205145337, + "grad_norm": 0.10214448720216751, + "learning_rate": 2.6389378698491894e-06, + "loss": 0.0908, + "num_input_tokens_seen": 189496000, + "step": 155730 + }, + { + "epoch": 17.344359060028957, + "grad_norm": 0.04287241771817207, + "learning_rate": 2.6378514380101165e-06, + "loss": 0.0379, + "num_input_tokens_seen": 189501888, + "step": 155735 + }, + { + "epoch": 17.344915914912573, + "grad_norm": 0.005739146843552589, + "learning_rate": 2.636765217399448e-06, + "loss": 0.002, + "num_input_tokens_seen": 189508160, + "step": 155740 + }, + { + "epoch": 17.345472769796192, + "grad_norm": 0.0035810780245810747, + "learning_rate": 2.6356792080274527e-06, + "loss": 0.0316, + "num_input_tokens_seen": 189514240, + "step": 155745 + }, + { + "epoch": 17.346029624679808, + "grad_norm": 2.8039438724517822, + "learning_rate": 2.634593409904387e-06, + "loss": 0.0463, + "num_input_tokens_seen": 189520032, + "step": 155750 + }, + { + "epoch": 17.346586479563427, + "grad_norm": 0.00019956067262683064, + "learning_rate": 2.6335078230405042e-06, + "loss": 0.0108, + "num_input_tokens_seen": 189526080, + "step": 155755 + }, + { + "epoch": 17.347143334447043, + "grad_norm": 1.1911700963974, + "learning_rate": 2.6324224474460537e-06, + "loss": 0.0846, + "num_input_tokens_seen": 189532384, + "step": 155760 + }, + { + "epoch": 17.34770018933066, + "grad_norm": 0.000997603521682322, + "learning_rate": 2.6313372831313023e-06, + "loss": 0.0159, + "num_input_tokens_seen": 189538592, + "step": 155765 + }, + { + "epoch": 17.34825704421428, + "grad_norm": 0.08147934079170227, + "learning_rate": 2.630252330106489e-06, + "loss": 0.0566, + "num_input_tokens_seen": 189544672, + "step": 155770 + }, + { + "epoch": 17.348813899097895, + "grad_norm": 0.025257574394345284, + "learning_rate": 2.6291675883818644e-06, + "loss": 0.0038, + "num_input_tokens_seen": 189550464, + "step": 155775 + }, + { + "epoch": 17.349370753981514, + "grad_norm": 0.6937605738639832, + "learning_rate": 2.628083057967673e-06, + "loss": 0.0333, + "num_input_tokens_seen": 189556608, + "step": 155780 + }, + { + "epoch": 17.34992760886513, + "grad_norm": 1.8843934535980225, + "learning_rate": 2.6269987388741646e-06, + "loss": 0.1248, + "num_input_tokens_seen": 189562752, + "step": 155785 + }, + { + "epoch": 17.350484463748746, + "grad_norm": 0.8369306325912476, + "learning_rate": 2.625914631111573e-06, + "loss": 0.0214, + "num_input_tokens_seen": 189568032, + "step": 155790 + }, + { + "epoch": 17.351041318632365, + "grad_norm": 0.7143216729164124, + "learning_rate": 2.6248307346901537e-06, + "loss": 0.0112, + "num_input_tokens_seen": 189574080, + "step": 155795 + }, + { + "epoch": 17.35159817351598, + "grad_norm": 0.047733500599861145, + "learning_rate": 2.6237470496201233e-06, + "loss": 0.0041, + "num_input_tokens_seen": 189580288, + "step": 155800 + }, + { + "epoch": 17.3521550283996, + "grad_norm": 1.8950711488723755, + "learning_rate": 2.6226635759117353e-06, + "loss": 0.2225, + "num_input_tokens_seen": 189586112, + "step": 155805 + }, + { + "epoch": 17.352711883283217, + "grad_norm": 0.03571758419275284, + "learning_rate": 2.6215803135752142e-06, + "loss": 0.062, + "num_input_tokens_seen": 189592352, + "step": 155810 + }, + { + "epoch": 17.353268738166832, + "grad_norm": 0.33329835534095764, + "learning_rate": 2.6204972626208025e-06, + "loss": 0.0439, + "num_input_tokens_seen": 189598496, + "step": 155815 + }, + { + "epoch": 17.353825593050452, + "grad_norm": 0.0005753404111601412, + "learning_rate": 2.619414423058722e-06, + "loss": 0.0003, + "num_input_tokens_seen": 189604864, + "step": 155820 + }, + { + "epoch": 17.354382447934068, + "grad_norm": 0.062453530728816986, + "learning_rate": 2.6183317948992037e-06, + "loss": 0.0566, + "num_input_tokens_seen": 189610848, + "step": 155825 + }, + { + "epoch": 17.354939302817687, + "grad_norm": 0.005765226669609547, + "learning_rate": 2.61724937815247e-06, + "loss": 0.0032, + "num_input_tokens_seen": 189617088, + "step": 155830 + }, + { + "epoch": 17.355496157701303, + "grad_norm": 0.020367145538330078, + "learning_rate": 2.6161671728287514e-06, + "loss": 0.0835, + "num_input_tokens_seen": 189623360, + "step": 155835 + }, + { + "epoch": 17.35605301258492, + "grad_norm": 0.00543197663500905, + "learning_rate": 2.6150851789382702e-06, + "loss": 0.0086, + "num_input_tokens_seen": 189629312, + "step": 155840 + }, + { + "epoch": 17.35660986746854, + "grad_norm": 0.004898298531770706, + "learning_rate": 2.614003396491241e-06, + "loss": 0.0131, + "num_input_tokens_seen": 189635136, + "step": 155845 + }, + { + "epoch": 17.357166722352154, + "grad_norm": 0.12670470774173737, + "learning_rate": 2.612921825497883e-06, + "loss": 0.0031, + "num_input_tokens_seen": 189641216, + "step": 155850 + }, + { + "epoch": 17.357723577235774, + "grad_norm": 0.013877220451831818, + "learning_rate": 2.611840465968418e-06, + "loss": 0.0167, + "num_input_tokens_seen": 189647104, + "step": 155855 + }, + { + "epoch": 17.35828043211939, + "grad_norm": 1.4199222326278687, + "learning_rate": 2.61075931791305e-06, + "loss": 0.0424, + "num_input_tokens_seen": 189653024, + "step": 155860 + }, + { + "epoch": 17.358837287003006, + "grad_norm": 0.013161306269466877, + "learning_rate": 2.6096783813420124e-06, + "loss": 0.0158, + "num_input_tokens_seen": 189659168, + "step": 155865 + }, + { + "epoch": 17.359394141886625, + "grad_norm": 0.03329254314303398, + "learning_rate": 2.608597656265488e-06, + "loss": 0.0297, + "num_input_tokens_seen": 189665568, + "step": 155870 + }, + { + "epoch": 17.35995099677024, + "grad_norm": 0.05645381659269333, + "learning_rate": 2.607517142693705e-06, + "loss": 0.1387, + "num_input_tokens_seen": 189671328, + "step": 155875 + }, + { + "epoch": 17.36050785165386, + "grad_norm": 0.003912025596946478, + "learning_rate": 2.606436840636858e-06, + "loss": 0.1207, + "num_input_tokens_seen": 189677728, + "step": 155880 + }, + { + "epoch": 17.361064706537476, + "grad_norm": 1.6078646183013916, + "learning_rate": 2.605356750105159e-06, + "loss": 0.0867, + "num_input_tokens_seen": 189683968, + "step": 155885 + }, + { + "epoch": 17.361621561421092, + "grad_norm": 0.07129522413015366, + "learning_rate": 2.60427687110881e-06, + "loss": 0.0318, + "num_input_tokens_seen": 189689760, + "step": 155890 + }, + { + "epoch": 17.36217841630471, + "grad_norm": 0.5093414783477783, + "learning_rate": 2.6031972036580087e-06, + "loss": 0.0145, + "num_input_tokens_seen": 189695872, + "step": 155895 + }, + { + "epoch": 17.362735271188328, + "grad_norm": 1.098125696182251, + "learning_rate": 2.60211774776295e-06, + "loss": 0.0212, + "num_input_tokens_seen": 189702272, + "step": 155900 + }, + { + "epoch": 17.363292126071947, + "grad_norm": 1.6155810356140137, + "learning_rate": 2.601038503433839e-06, + "loss": 0.1022, + "num_input_tokens_seen": 189708160, + "step": 155905 + }, + { + "epoch": 17.363848980955563, + "grad_norm": 0.36361417174339294, + "learning_rate": 2.5999594706808627e-06, + "loss": 0.088, + "num_input_tokens_seen": 189714144, + "step": 155910 + }, + { + "epoch": 17.36440583583918, + "grad_norm": 0.10660268366336823, + "learning_rate": 2.5988806495142183e-06, + "loss": 0.0084, + "num_input_tokens_seen": 189720320, + "step": 155915 + }, + { + "epoch": 17.3649626907228, + "grad_norm": 0.6255818009376526, + "learning_rate": 2.5978020399440917e-06, + "loss": 0.0753, + "num_input_tokens_seen": 189726176, + "step": 155920 + }, + { + "epoch": 17.365519545606414, + "grad_norm": 0.8697207570075989, + "learning_rate": 2.5967236419806695e-06, + "loss": 0.1255, + "num_input_tokens_seen": 189732352, + "step": 155925 + }, + { + "epoch": 17.366076400490034, + "grad_norm": 1.031156301498413, + "learning_rate": 2.5956454556341465e-06, + "loss": 0.0215, + "num_input_tokens_seen": 189738240, + "step": 155930 + }, + { + "epoch": 17.36663325537365, + "grad_norm": 0.41552817821502686, + "learning_rate": 2.5945674809146976e-06, + "loss": 0.0496, + "num_input_tokens_seen": 189744448, + "step": 155935 + }, + { + "epoch": 17.367190110257265, + "grad_norm": 0.8912890553474426, + "learning_rate": 2.59348971783252e-06, + "loss": 0.09, + "num_input_tokens_seen": 189750560, + "step": 155940 + }, + { + "epoch": 17.367746965140885, + "grad_norm": 0.000595135148614645, + "learning_rate": 2.592412166397773e-06, + "loss": 0.0827, + "num_input_tokens_seen": 189756544, + "step": 155945 + }, + { + "epoch": 17.3683038200245, + "grad_norm": 0.001009550062008202, + "learning_rate": 2.5913348266206507e-06, + "loss": 0.0292, + "num_input_tokens_seen": 189762784, + "step": 155950 + }, + { + "epoch": 17.36886067490812, + "grad_norm": 0.0016554746543988585, + "learning_rate": 2.5902576985113196e-06, + "loss": 0.0143, + "num_input_tokens_seen": 189768768, + "step": 155955 + }, + { + "epoch": 17.369417529791736, + "grad_norm": 0.19213850796222687, + "learning_rate": 2.5891807820799642e-06, + "loss": 0.0551, + "num_input_tokens_seen": 189773920, + "step": 155960 + }, + { + "epoch": 17.369974384675352, + "grad_norm": 0.0003216048062313348, + "learning_rate": 2.5881040773367503e-06, + "loss": 0.0134, + "num_input_tokens_seen": 189780256, + "step": 155965 + }, + { + "epoch": 17.37053123955897, + "grad_norm": 0.031483232975006104, + "learning_rate": 2.587027584291851e-06, + "loss": 0.0326, + "num_input_tokens_seen": 189786432, + "step": 155970 + }, + { + "epoch": 17.371088094442587, + "grad_norm": 2.614959955215454, + "learning_rate": 2.585951302955428e-06, + "loss": 0.1005, + "num_input_tokens_seen": 189792384, + "step": 155975 + }, + { + "epoch": 17.371644949326207, + "grad_norm": 0.0005508425529114902, + "learning_rate": 2.584875233337658e-06, + "loss": 0.0101, + "num_input_tokens_seen": 189798336, + "step": 155980 + }, + { + "epoch": 17.372201804209823, + "grad_norm": 0.04284873232245445, + "learning_rate": 2.5837993754487006e-06, + "loss": 0.0191, + "num_input_tokens_seen": 189804544, + "step": 155985 + }, + { + "epoch": 17.37275865909344, + "grad_norm": 0.009015639312565327, + "learning_rate": 2.5827237292987166e-06, + "loss": 0.1187, + "num_input_tokens_seen": 189810784, + "step": 155990 + }, + { + "epoch": 17.373315513977058, + "grad_norm": 0.9627234935760498, + "learning_rate": 2.581648294897862e-06, + "loss": 0.0242, + "num_input_tokens_seen": 189816704, + "step": 155995 + }, + { + "epoch": 17.373872368860674, + "grad_norm": 0.005353868473321199, + "learning_rate": 2.5805730722563067e-06, + "loss": 0.0012, + "num_input_tokens_seen": 189822720, + "step": 156000 + }, + { + "epoch": 17.374429223744293, + "grad_norm": 1.388198733329773, + "learning_rate": 2.5794980613841948e-06, + "loss": 0.1014, + "num_input_tokens_seen": 189828896, + "step": 156005 + }, + { + "epoch": 17.37498607862791, + "grad_norm": 0.08668991178274155, + "learning_rate": 2.5784232622916958e-06, + "loss": 0.0069, + "num_input_tokens_seen": 189835168, + "step": 156010 + }, + { + "epoch": 17.375542933511525, + "grad_norm": 0.004760266747325659, + "learning_rate": 2.577348674988944e-06, + "loss": 0.0138, + "num_input_tokens_seen": 189840672, + "step": 156015 + }, + { + "epoch": 17.376099788395145, + "grad_norm": 0.16559654474258423, + "learning_rate": 2.576274299486106e-06, + "loss": 0.1256, + "num_input_tokens_seen": 189846176, + "step": 156020 + }, + { + "epoch": 17.37665664327876, + "grad_norm": 0.21083778142929077, + "learning_rate": 2.5752001357933155e-06, + "loss": 0.0221, + "num_input_tokens_seen": 189852224, + "step": 156025 + }, + { + "epoch": 17.37721349816238, + "grad_norm": 0.04809324070811272, + "learning_rate": 2.57412618392073e-06, + "loss": 0.0017, + "num_input_tokens_seen": 189858336, + "step": 156030 + }, + { + "epoch": 17.377770353045996, + "grad_norm": 0.10241129994392395, + "learning_rate": 2.573052443878493e-06, + "loss": 0.0013, + "num_input_tokens_seen": 189864160, + "step": 156035 + }, + { + "epoch": 17.378327207929612, + "grad_norm": 0.16256816685199738, + "learning_rate": 2.5719789156767424e-06, + "loss": 0.012, + "num_input_tokens_seen": 189870336, + "step": 156040 + }, + { + "epoch": 17.37888406281323, + "grad_norm": 0.04324301332235336, + "learning_rate": 2.5709055993256125e-06, + "loss": 0.0454, + "num_input_tokens_seen": 189876544, + "step": 156045 + }, + { + "epoch": 17.379440917696847, + "grad_norm": 0.14542599022388458, + "learning_rate": 2.5698324948352586e-06, + "loss": 0.012, + "num_input_tokens_seen": 189882752, + "step": 156050 + }, + { + "epoch": 17.379997772580467, + "grad_norm": 0.38232746720314026, + "learning_rate": 2.5687596022158033e-06, + "loss": 0.0092, + "num_input_tokens_seen": 189888928, + "step": 156055 + }, + { + "epoch": 17.380554627464083, + "grad_norm": 0.0032073301263153553, + "learning_rate": 2.5676869214773885e-06, + "loss": 0.0231, + "num_input_tokens_seen": 189894624, + "step": 156060 + }, + { + "epoch": 17.3811114823477, + "grad_norm": 0.006292174570262432, + "learning_rate": 2.5666144526301366e-06, + "loss": 0.1258, + "num_input_tokens_seen": 189900736, + "step": 156065 + }, + { + "epoch": 17.381668337231318, + "grad_norm": 0.0013832340482622385, + "learning_rate": 2.5655421956841897e-06, + "loss": 0.0987, + "num_input_tokens_seen": 189906944, + "step": 156070 + }, + { + "epoch": 17.382225192114934, + "grad_norm": 0.0009440335561521351, + "learning_rate": 2.564470150649667e-06, + "loss": 0.0017, + "num_input_tokens_seen": 189913120, + "step": 156075 + }, + { + "epoch": 17.382782046998553, + "grad_norm": 0.00012387710739858449, + "learning_rate": 2.563398317536708e-06, + "loss": 0.0159, + "num_input_tokens_seen": 189919424, + "step": 156080 + }, + { + "epoch": 17.38333890188217, + "grad_norm": 0.0028799218125641346, + "learning_rate": 2.5623266963554187e-06, + "loss": 0.0102, + "num_input_tokens_seen": 189925760, + "step": 156085 + }, + { + "epoch": 17.38389575676579, + "grad_norm": 0.09490688145160675, + "learning_rate": 2.561255287115932e-06, + "loss": 0.0921, + "num_input_tokens_seen": 189931264, + "step": 156090 + }, + { + "epoch": 17.384452611649404, + "grad_norm": 0.10221287608146667, + "learning_rate": 2.560184089828366e-06, + "loss": 0.0958, + "num_input_tokens_seen": 189937152, + "step": 156095 + }, + { + "epoch": 17.38500946653302, + "grad_norm": 0.0004681394493672997, + "learning_rate": 2.5591131045028414e-06, + "loss": 0.01, + "num_input_tokens_seen": 189943264, + "step": 156100 + }, + { + "epoch": 17.38556632141664, + "grad_norm": 0.0462474450469017, + "learning_rate": 2.5580423311494766e-06, + "loss": 0.1327, + "num_input_tokens_seen": 189949376, + "step": 156105 + }, + { + "epoch": 17.386123176300256, + "grad_norm": 0.36938777565956116, + "learning_rate": 2.5569717697783795e-06, + "loss": 0.0104, + "num_input_tokens_seen": 189955712, + "step": 156110 + }, + { + "epoch": 17.386680031183875, + "grad_norm": 0.010023496113717556, + "learning_rate": 2.555901420399659e-06, + "loss": 0.0648, + "num_input_tokens_seen": 189962112, + "step": 156115 + }, + { + "epoch": 17.38723688606749, + "grad_norm": 0.43271514773368835, + "learning_rate": 2.55483128302344e-06, + "loss": 0.0152, + "num_input_tokens_seen": 189968128, + "step": 156120 + }, + { + "epoch": 17.387793740951107, + "grad_norm": 0.034555308520793915, + "learning_rate": 2.5537613576598204e-06, + "loss": 0.0638, + "num_input_tokens_seen": 189974464, + "step": 156125 + }, + { + "epoch": 17.388350595834726, + "grad_norm": 0.37856051325798035, + "learning_rate": 2.5526916443189082e-06, + "loss": 0.0536, + "num_input_tokens_seen": 189980672, + "step": 156130 + }, + { + "epoch": 17.388907450718342, + "grad_norm": 0.05041540041565895, + "learning_rate": 2.551622143010804e-06, + "loss": 0.0051, + "num_input_tokens_seen": 189986592, + "step": 156135 + }, + { + "epoch": 17.38946430560196, + "grad_norm": 0.8817465901374817, + "learning_rate": 2.550552853745616e-06, + "loss": 0.0672, + "num_input_tokens_seen": 189992096, + "step": 156140 + }, + { + "epoch": 17.390021160485578, + "grad_norm": 1.3004059791564941, + "learning_rate": 2.54948377653344e-06, + "loss": 0.0198, + "num_input_tokens_seen": 189998144, + "step": 156145 + }, + { + "epoch": 17.390578015369194, + "grad_norm": 0.002807839307934046, + "learning_rate": 2.548414911384381e-06, + "loss": 0.0074, + "num_input_tokens_seen": 190004288, + "step": 156150 + }, + { + "epoch": 17.391134870252813, + "grad_norm": 0.004046415910124779, + "learning_rate": 2.5473462583085335e-06, + "loss": 0.0106, + "num_input_tokens_seen": 190010304, + "step": 156155 + }, + { + "epoch": 17.39169172513643, + "grad_norm": 0.2874351441860199, + "learning_rate": 2.546277817315987e-06, + "loss": 0.1567, + "num_input_tokens_seen": 190016704, + "step": 156160 + }, + { + "epoch": 17.39224858002005, + "grad_norm": 0.009172631427645683, + "learning_rate": 2.5452095884168303e-06, + "loss": 0.0739, + "num_input_tokens_seen": 190022272, + "step": 156165 + }, + { + "epoch": 17.392805434903664, + "grad_norm": 0.00030484687886200845, + "learning_rate": 2.544141571621167e-06, + "loss": 0.0737, + "num_input_tokens_seen": 190028576, + "step": 156170 + }, + { + "epoch": 17.39336228978728, + "grad_norm": 0.8454943895339966, + "learning_rate": 2.5430737669390747e-06, + "loss": 0.1496, + "num_input_tokens_seen": 190034816, + "step": 156175 + }, + { + "epoch": 17.3939191446709, + "grad_norm": 3.6644644737243652, + "learning_rate": 2.5420061743806457e-06, + "loss": 0.0947, + "num_input_tokens_seen": 190041088, + "step": 156180 + }, + { + "epoch": 17.394475999554516, + "grad_norm": 1.5675233602523804, + "learning_rate": 2.5409387939559547e-06, + "loss": 0.1143, + "num_input_tokens_seen": 190047392, + "step": 156185 + }, + { + "epoch": 17.395032854438135, + "grad_norm": 0.06200800836086273, + "learning_rate": 2.539871625675097e-06, + "loss": 0.0196, + "num_input_tokens_seen": 190053728, + "step": 156190 + }, + { + "epoch": 17.39558970932175, + "grad_norm": 0.03275637701153755, + "learning_rate": 2.538804669548142e-06, + "loss": 0.0704, + "num_input_tokens_seen": 190059872, + "step": 156195 + }, + { + "epoch": 17.396146564205367, + "grad_norm": 0.6857030391693115, + "learning_rate": 2.5377379255851817e-06, + "loss": 0.0157, + "num_input_tokens_seen": 190065728, + "step": 156200 + }, + { + "epoch": 17.396703419088986, + "grad_norm": 0.35294023156166077, + "learning_rate": 2.536671393796272e-06, + "loss": 0.0399, + "num_input_tokens_seen": 190072288, + "step": 156205 + }, + { + "epoch": 17.397260273972602, + "grad_norm": 0.0004857824824284762, + "learning_rate": 2.535605074191505e-06, + "loss": 0.0173, + "num_input_tokens_seen": 190078144, + "step": 156210 + }, + { + "epoch": 17.39781712885622, + "grad_norm": 0.0009429403580725193, + "learning_rate": 2.5345389667809417e-06, + "loss": 0.0459, + "num_input_tokens_seen": 190084384, + "step": 156215 + }, + { + "epoch": 17.398373983739837, + "grad_norm": 0.23553572595119476, + "learning_rate": 2.53347307157466e-06, + "loss": 0.022, + "num_input_tokens_seen": 190090880, + "step": 156220 + }, + { + "epoch": 17.398930838623453, + "grad_norm": 0.5922023057937622, + "learning_rate": 2.532407388582725e-06, + "loss": 0.0419, + "num_input_tokens_seen": 190097376, + "step": 156225 + }, + { + "epoch": 17.399487693507073, + "grad_norm": 0.0001577961811563, + "learning_rate": 2.5313419178152024e-06, + "loss": 0.0103, + "num_input_tokens_seen": 190103648, + "step": 156230 + }, + { + "epoch": 17.40004454839069, + "grad_norm": 0.13892070949077606, + "learning_rate": 2.5302766592821546e-06, + "loss": 0.1181, + "num_input_tokens_seen": 190109504, + "step": 156235 + }, + { + "epoch": 17.400601403274308, + "grad_norm": 0.08642212301492691, + "learning_rate": 2.529211612993651e-06, + "loss": 0.0746, + "num_input_tokens_seen": 190114944, + "step": 156240 + }, + { + "epoch": 17.401158258157924, + "grad_norm": 0.005231016781181097, + "learning_rate": 2.5281467789597447e-06, + "loss": 0.1723, + "num_input_tokens_seen": 190121504, + "step": 156245 + }, + { + "epoch": 17.40171511304154, + "grad_norm": 2.5211222171783447, + "learning_rate": 2.5270821571904967e-06, + "loss": 0.1302, + "num_input_tokens_seen": 190127840, + "step": 156250 + }, + { + "epoch": 17.40227196792516, + "grad_norm": 0.11821804195642471, + "learning_rate": 2.5260177476959605e-06, + "loss": 0.0284, + "num_input_tokens_seen": 190133792, + "step": 156255 + }, + { + "epoch": 17.402828822808775, + "grad_norm": 0.3059734106063843, + "learning_rate": 2.524953550486195e-06, + "loss": 0.0116, + "num_input_tokens_seen": 190140128, + "step": 156260 + }, + { + "epoch": 17.403385677692395, + "grad_norm": 1.287925124168396, + "learning_rate": 2.523889565571244e-06, + "loss": 0.0706, + "num_input_tokens_seen": 190146240, + "step": 156265 + }, + { + "epoch": 17.40394253257601, + "grad_norm": 0.00021761259995400906, + "learning_rate": 2.5228257929611755e-06, + "loss": 0.0006, + "num_input_tokens_seen": 190152480, + "step": 156270 + }, + { + "epoch": 17.404499387459627, + "grad_norm": 0.0010071770520880818, + "learning_rate": 2.521762232666017e-06, + "loss": 0.0064, + "num_input_tokens_seen": 190158496, + "step": 156275 + }, + { + "epoch": 17.405056242343246, + "grad_norm": 0.5384019017219543, + "learning_rate": 2.520698884695824e-06, + "loss": 0.0363, + "num_input_tokens_seen": 190164096, + "step": 156280 + }, + { + "epoch": 17.405613097226862, + "grad_norm": 0.5299556851387024, + "learning_rate": 2.5196357490606395e-06, + "loss": 0.0434, + "num_input_tokens_seen": 190169568, + "step": 156285 + }, + { + "epoch": 17.40616995211048, + "grad_norm": 0.03955913707613945, + "learning_rate": 2.5185728257705098e-06, + "loss": 0.0782, + "num_input_tokens_seen": 190175872, + "step": 156290 + }, + { + "epoch": 17.406726806994097, + "grad_norm": 0.0004107641871087253, + "learning_rate": 2.5175101148354723e-06, + "loss": 0.0284, + "num_input_tokens_seen": 190182048, + "step": 156295 + }, + { + "epoch": 17.407283661877713, + "grad_norm": 0.015021944418549538, + "learning_rate": 2.516447616265563e-06, + "loss": 0.0195, + "num_input_tokens_seen": 190188256, + "step": 156300 + }, + { + "epoch": 17.407840516761333, + "grad_norm": 0.011826763860881329, + "learning_rate": 2.515385330070816e-06, + "loss": 0.0257, + "num_input_tokens_seen": 190194368, + "step": 156305 + }, + { + "epoch": 17.40839737164495, + "grad_norm": 0.25658074021339417, + "learning_rate": 2.5143232562612723e-06, + "loss": 0.0052, + "num_input_tokens_seen": 190200832, + "step": 156310 + }, + { + "epoch": 17.408954226528568, + "grad_norm": 0.0018520005978643894, + "learning_rate": 2.5132613948469606e-06, + "loss": 0.1203, + "num_input_tokens_seen": 190206592, + "step": 156315 + }, + { + "epoch": 17.409511081412184, + "grad_norm": 0.0018751086900010705, + "learning_rate": 2.512199745837912e-06, + "loss": 0.1436, + "num_input_tokens_seen": 190212192, + "step": 156320 + }, + { + "epoch": 17.4100679362958, + "grad_norm": 0.002596615580841899, + "learning_rate": 2.511138309244154e-06, + "loss": 0.0728, + "num_input_tokens_seen": 190218336, + "step": 156325 + }, + { + "epoch": 17.41062479117942, + "grad_norm": 1.6978890895843506, + "learning_rate": 2.510077085075707e-06, + "loss": 0.0346, + "num_input_tokens_seen": 190224384, + "step": 156330 + }, + { + "epoch": 17.411181646063035, + "grad_norm": 0.0027950063813477755, + "learning_rate": 2.5090160733426043e-06, + "loss": 0.0116, + "num_input_tokens_seen": 190230720, + "step": 156335 + }, + { + "epoch": 17.411738500946655, + "grad_norm": 0.44164010882377625, + "learning_rate": 2.507955274054863e-06, + "loss": 0.0132, + "num_input_tokens_seen": 190236832, + "step": 156340 + }, + { + "epoch": 17.41229535583027, + "grad_norm": 0.054663196206092834, + "learning_rate": 2.506894687222511e-06, + "loss": 0.2182, + "num_input_tokens_seen": 190242880, + "step": 156345 + }, + { + "epoch": 17.412852210713886, + "grad_norm": 2.206331253051758, + "learning_rate": 2.5058343128555513e-06, + "loss": 0.0358, + "num_input_tokens_seen": 190248480, + "step": 156350 + }, + { + "epoch": 17.413409065597506, + "grad_norm": 0.005321821663528681, + "learning_rate": 2.504774150964012e-06, + "loss": 0.0571, + "num_input_tokens_seen": 190254496, + "step": 156355 + }, + { + "epoch": 17.41396592048112, + "grad_norm": 0.3799037039279938, + "learning_rate": 2.5037142015578995e-06, + "loss": 0.0484, + "num_input_tokens_seen": 190260576, + "step": 156360 + }, + { + "epoch": 17.41452277536474, + "grad_norm": 0.06494558602571487, + "learning_rate": 2.5026544646472363e-06, + "loss": 0.0122, + "num_input_tokens_seen": 190266784, + "step": 156365 + }, + { + "epoch": 17.415079630248357, + "grad_norm": 0.0006044548936188221, + "learning_rate": 2.5015949402420246e-06, + "loss": 0.0099, + "num_input_tokens_seen": 190272896, + "step": 156370 + }, + { + "epoch": 17.415636485131973, + "grad_norm": 2.049657106399536, + "learning_rate": 2.500535628352277e-06, + "loss": 0.1678, + "num_input_tokens_seen": 190279392, + "step": 156375 + }, + { + "epoch": 17.416193340015592, + "grad_norm": 1.1892657279968262, + "learning_rate": 2.49947652898799e-06, + "loss": 0.0284, + "num_input_tokens_seen": 190285760, + "step": 156380 + }, + { + "epoch": 17.41675019489921, + "grad_norm": 1.441771149635315, + "learning_rate": 2.4984176421591786e-06, + "loss": 0.0313, + "num_input_tokens_seen": 190291488, + "step": 156385 + }, + { + "epoch": 17.417307049782828, + "grad_norm": 0.13686145842075348, + "learning_rate": 2.49735896787584e-06, + "loss": 0.019, + "num_input_tokens_seen": 190296608, + "step": 156390 + }, + { + "epoch": 17.417863904666444, + "grad_norm": 0.0006578559405170381, + "learning_rate": 2.496300506147975e-06, + "loss": 0.0154, + "num_input_tokens_seen": 190303168, + "step": 156395 + }, + { + "epoch": 17.41842075955006, + "grad_norm": 0.003171493299305439, + "learning_rate": 2.4952422569855777e-06, + "loss": 0.0008, + "num_input_tokens_seen": 190309472, + "step": 156400 + }, + { + "epoch": 17.41897761443368, + "grad_norm": 0.38956132531166077, + "learning_rate": 2.4941842203986515e-06, + "loss": 0.0058, + "num_input_tokens_seen": 190315936, + "step": 156405 + }, + { + "epoch": 17.419534469317295, + "grad_norm": 1.1989027261734009, + "learning_rate": 2.493126396397183e-06, + "loss": 0.0417, + "num_input_tokens_seen": 190322336, + "step": 156410 + }, + { + "epoch": 17.420091324200914, + "grad_norm": 0.004030772019177675, + "learning_rate": 2.4920687849911777e-06, + "loss": 0.1769, + "num_input_tokens_seen": 190328480, + "step": 156415 + }, + { + "epoch": 17.42064817908453, + "grad_norm": 1.019056797027588, + "learning_rate": 2.491011386190606e-06, + "loss": 0.091, + "num_input_tokens_seen": 190334848, + "step": 156420 + }, + { + "epoch": 17.42120503396815, + "grad_norm": 0.7685160040855408, + "learning_rate": 2.4899542000054678e-06, + "loss": 0.03, + "num_input_tokens_seen": 190340704, + "step": 156425 + }, + { + "epoch": 17.421761888851766, + "grad_norm": 0.001805819571018219, + "learning_rate": 2.4888972264457438e-06, + "loss": 0.0177, + "num_input_tokens_seen": 190346656, + "step": 156430 + }, + { + "epoch": 17.42231874373538, + "grad_norm": 0.38210761547088623, + "learning_rate": 2.4878404655214266e-06, + "loss": 0.1563, + "num_input_tokens_seen": 190352352, + "step": 156435 + }, + { + "epoch": 17.422875598619, + "grad_norm": 2.6133930683135986, + "learning_rate": 2.4867839172424937e-06, + "loss": 0.0776, + "num_input_tokens_seen": 190358304, + "step": 156440 + }, + { + "epoch": 17.423432453502617, + "grad_norm": 0.8717649579048157, + "learning_rate": 2.4857275816189212e-06, + "loss": 0.0521, + "num_input_tokens_seen": 190364320, + "step": 156445 + }, + { + "epoch": 17.423989308386236, + "grad_norm": 0.22083106637001038, + "learning_rate": 2.4846714586606867e-06, + "loss": 0.0216, + "num_input_tokens_seen": 190370528, + "step": 156450 + }, + { + "epoch": 17.424546163269852, + "grad_norm": 0.7039333581924438, + "learning_rate": 2.483615548377774e-06, + "loss": 0.0179, + "num_input_tokens_seen": 190376768, + "step": 156455 + }, + { + "epoch": 17.425103018153468, + "grad_norm": 0.14897701144218445, + "learning_rate": 2.482559850780153e-06, + "loss": 0.0212, + "num_input_tokens_seen": 190382784, + "step": 156460 + }, + { + "epoch": 17.425659873037088, + "grad_norm": 0.5333210825920105, + "learning_rate": 2.4815043658777933e-06, + "loss": 0.0193, + "num_input_tokens_seen": 190388576, + "step": 156465 + }, + { + "epoch": 17.426216727920703, + "grad_norm": 0.32059210538864136, + "learning_rate": 2.480449093680662e-06, + "loss": 0.0085, + "num_input_tokens_seen": 190394688, + "step": 156470 + }, + { + "epoch": 17.426773582804323, + "grad_norm": 0.14866667985916138, + "learning_rate": 2.479394034198737e-06, + "loss": 0.0649, + "num_input_tokens_seen": 190400384, + "step": 156475 + }, + { + "epoch": 17.42733043768794, + "grad_norm": 0.5320839285850525, + "learning_rate": 2.478339187441972e-06, + "loss": 0.0713, + "num_input_tokens_seen": 190406048, + "step": 156480 + }, + { + "epoch": 17.427887292571555, + "grad_norm": 0.11108770966529846, + "learning_rate": 2.477284553420345e-06, + "loss": 0.0031, + "num_input_tokens_seen": 190412448, + "step": 156485 + }, + { + "epoch": 17.428444147455174, + "grad_norm": 2.3325915336608887, + "learning_rate": 2.4762301321438086e-06, + "loss": 0.1557, + "num_input_tokens_seen": 190418880, + "step": 156490 + }, + { + "epoch": 17.42900100233879, + "grad_norm": 0.005292063113301992, + "learning_rate": 2.475175923622325e-06, + "loss": 0.0586, + "num_input_tokens_seen": 190425152, + "step": 156495 + }, + { + "epoch": 17.42955785722241, + "grad_norm": 0.04433300718665123, + "learning_rate": 2.474121927865844e-06, + "loss": 0.0013, + "num_input_tokens_seen": 190431008, + "step": 156500 + }, + { + "epoch": 17.430114712106025, + "grad_norm": 0.0009222206426784396, + "learning_rate": 2.4730681448843356e-06, + "loss": 0.006, + "num_input_tokens_seen": 190436832, + "step": 156505 + }, + { + "epoch": 17.43067156698964, + "grad_norm": 1.0894370079040527, + "learning_rate": 2.4720145746877475e-06, + "loss": 0.0263, + "num_input_tokens_seen": 190442912, + "step": 156510 + }, + { + "epoch": 17.43122842187326, + "grad_norm": 0.5448605418205261, + "learning_rate": 2.4709612172860303e-06, + "loss": 0.0143, + "num_input_tokens_seen": 190448896, + "step": 156515 + }, + { + "epoch": 17.431785276756877, + "grad_norm": 0.006409465800970793, + "learning_rate": 2.469908072689131e-06, + "loss": 0.0485, + "num_input_tokens_seen": 190454784, + "step": 156520 + }, + { + "epoch": 17.432342131640496, + "grad_norm": 1.0862818956375122, + "learning_rate": 2.4688551409070034e-06, + "loss": 0.0168, + "num_input_tokens_seen": 190460768, + "step": 156525 + }, + { + "epoch": 17.432898986524112, + "grad_norm": 0.06555699557065964, + "learning_rate": 2.4678024219495914e-06, + "loss": 0.0849, + "num_input_tokens_seen": 190466784, + "step": 156530 + }, + { + "epoch": 17.433455841407728, + "grad_norm": 0.4065611660480499, + "learning_rate": 2.466749915826838e-06, + "loss": 0.0101, + "num_input_tokens_seen": 190473184, + "step": 156535 + }, + { + "epoch": 17.434012696291347, + "grad_norm": 0.02659948542714119, + "learning_rate": 2.4656976225486796e-06, + "loss": 0.0089, + "num_input_tokens_seen": 190478912, + "step": 156540 + }, + { + "epoch": 17.434569551174963, + "grad_norm": 0.004346334375441074, + "learning_rate": 2.464645542125066e-06, + "loss": 0.0826, + "num_input_tokens_seen": 190485440, + "step": 156545 + }, + { + "epoch": 17.435126406058583, + "grad_norm": 0.07065954804420471, + "learning_rate": 2.4635936745659253e-06, + "loss": 0.0262, + "num_input_tokens_seen": 190491648, + "step": 156550 + }, + { + "epoch": 17.4356832609422, + "grad_norm": 0.11855864524841309, + "learning_rate": 2.462542019881206e-06, + "loss": 0.0075, + "num_input_tokens_seen": 190498272, + "step": 156555 + }, + { + "epoch": 17.436240115825814, + "grad_norm": 0.5516303777694702, + "learning_rate": 2.4614905780808302e-06, + "loss": 0.0151, + "num_input_tokens_seen": 190504352, + "step": 156560 + }, + { + "epoch": 17.436796970709434, + "grad_norm": 0.019150232896208763, + "learning_rate": 2.4604393491747366e-06, + "loss": 0.0379, + "num_input_tokens_seen": 190510432, + "step": 156565 + }, + { + "epoch": 17.43735382559305, + "grad_norm": 0.07021381705999374, + "learning_rate": 2.4593883331728464e-06, + "loss": 0.0047, + "num_input_tokens_seen": 190516480, + "step": 156570 + }, + { + "epoch": 17.43791068047667, + "grad_norm": 0.004265370778739452, + "learning_rate": 2.458337530085095e-06, + "loss": 0.0094, + "num_input_tokens_seen": 190522656, + "step": 156575 + }, + { + "epoch": 17.438467535360285, + "grad_norm": 0.0002367008273722604, + "learning_rate": 2.457286939921408e-06, + "loss": 0.0048, + "num_input_tokens_seen": 190528416, + "step": 156580 + }, + { + "epoch": 17.4390243902439, + "grad_norm": 1.040736436843872, + "learning_rate": 2.456236562691705e-06, + "loss": 0.1098, + "num_input_tokens_seen": 190534496, + "step": 156585 + }, + { + "epoch": 17.43958124512752, + "grad_norm": 0.12181970477104187, + "learning_rate": 2.4551863984059065e-06, + "loss": 0.0062, + "num_input_tokens_seen": 190540672, + "step": 156590 + }, + { + "epoch": 17.440138100011136, + "grad_norm": 0.02518637105822563, + "learning_rate": 2.4541364470739376e-06, + "loss": 0.0104, + "num_input_tokens_seen": 190546976, + "step": 156595 + }, + { + "epoch": 17.440694954894756, + "grad_norm": 0.7518842220306396, + "learning_rate": 2.4530867087057097e-06, + "loss": 0.0236, + "num_input_tokens_seen": 190552736, + "step": 156600 + }, + { + "epoch": 17.44125180977837, + "grad_norm": 0.0006481176824308932, + "learning_rate": 2.452037183311154e-06, + "loss": 0.0074, + "num_input_tokens_seen": 190558816, + "step": 156605 + }, + { + "epoch": 17.441808664661988, + "grad_norm": 0.09487815201282501, + "learning_rate": 2.4509878709001594e-06, + "loss": 0.0146, + "num_input_tokens_seen": 190564768, + "step": 156610 + }, + { + "epoch": 17.442365519545607, + "grad_norm": 0.2504230737686157, + "learning_rate": 2.4499387714826573e-06, + "loss": 0.0148, + "num_input_tokens_seen": 190570624, + "step": 156615 + }, + { + "epoch": 17.442922374429223, + "grad_norm": 0.06196403503417969, + "learning_rate": 2.448889885068545e-06, + "loss": 0.002, + "num_input_tokens_seen": 190576448, + "step": 156620 + }, + { + "epoch": 17.443479229312842, + "grad_norm": 0.023896291851997375, + "learning_rate": 2.4478412116677396e-06, + "loss": 0.002, + "num_input_tokens_seen": 190582496, + "step": 156625 + }, + { + "epoch": 17.44403608419646, + "grad_norm": 0.014855217188596725, + "learning_rate": 2.446792751290142e-06, + "loss": 0.0551, + "num_input_tokens_seen": 190588768, + "step": 156630 + }, + { + "epoch": 17.444592939080074, + "grad_norm": 0.1421976089477539, + "learning_rate": 2.445744503945657e-06, + "loss": 0.0217, + "num_input_tokens_seen": 190594816, + "step": 156635 + }, + { + "epoch": 17.445149793963694, + "grad_norm": 0.21512393653392792, + "learning_rate": 2.444696469644181e-06, + "loss": 0.1004, + "num_input_tokens_seen": 190600704, + "step": 156640 + }, + { + "epoch": 17.44570664884731, + "grad_norm": 0.03436139598488808, + "learning_rate": 2.4436486483956216e-06, + "loss": 0.0256, + "num_input_tokens_seen": 190606656, + "step": 156645 + }, + { + "epoch": 17.44626350373093, + "grad_norm": 1.6503067016601562, + "learning_rate": 2.4426010402098716e-06, + "loss": 0.0172, + "num_input_tokens_seen": 190612704, + "step": 156650 + }, + { + "epoch": 17.446820358614545, + "grad_norm": 0.013262761756777763, + "learning_rate": 2.4415536450968306e-06, + "loss": 0.0158, + "num_input_tokens_seen": 190618560, + "step": 156655 + }, + { + "epoch": 17.44737721349816, + "grad_norm": 0.008914891630411148, + "learning_rate": 2.4405064630663803e-06, + "loss": 0.1048, + "num_input_tokens_seen": 190624544, + "step": 156660 + }, + { + "epoch": 17.44793406838178, + "grad_norm": 0.0011782458750531077, + "learning_rate": 2.439459494128429e-06, + "loss": 0.038, + "num_input_tokens_seen": 190630912, + "step": 156665 + }, + { + "epoch": 17.448490923265396, + "grad_norm": 0.4303893744945526, + "learning_rate": 2.438412738292853e-06, + "loss": 0.0094, + "num_input_tokens_seen": 190637056, + "step": 156670 + }, + { + "epoch": 17.449047778149016, + "grad_norm": 0.9455899596214294, + "learning_rate": 2.4373661955695513e-06, + "loss": 0.0582, + "num_input_tokens_seen": 190643392, + "step": 156675 + }, + { + "epoch": 17.44960463303263, + "grad_norm": 0.00020065966236870736, + "learning_rate": 2.436319865968395e-06, + "loss": 0.0114, + "num_input_tokens_seen": 190649600, + "step": 156680 + }, + { + "epoch": 17.450161487916247, + "grad_norm": 0.014619330875575542, + "learning_rate": 2.435273749499281e-06, + "loss": 0.0024, + "num_input_tokens_seen": 190655680, + "step": 156685 + }, + { + "epoch": 17.450718342799867, + "grad_norm": 0.007980827242136002, + "learning_rate": 2.4342278461720797e-06, + "loss": 0.088, + "num_input_tokens_seen": 190661600, + "step": 156690 + }, + { + "epoch": 17.451275197683483, + "grad_norm": 0.5193905830383301, + "learning_rate": 2.43318215599668e-06, + "loss": 0.125, + "num_input_tokens_seen": 190667104, + "step": 156695 + }, + { + "epoch": 17.451832052567102, + "grad_norm": 1.1391654014587402, + "learning_rate": 2.432136678982955e-06, + "loss": 0.1195, + "num_input_tokens_seen": 190672928, + "step": 156700 + }, + { + "epoch": 17.452388907450718, + "grad_norm": 0.5166535377502441, + "learning_rate": 2.431091415140779e-06, + "loss": 0.0761, + "num_input_tokens_seen": 190679104, + "step": 156705 + }, + { + "epoch": 17.452945762334334, + "grad_norm": 0.07672131806612015, + "learning_rate": 2.430046364480024e-06, + "loss": 0.0096, + "num_input_tokens_seen": 190685248, + "step": 156710 + }, + { + "epoch": 17.453502617217953, + "grad_norm": 0.0034917148295789957, + "learning_rate": 2.4290015270105685e-06, + "loss": 0.051, + "num_input_tokens_seen": 190691616, + "step": 156715 + }, + { + "epoch": 17.45405947210157, + "grad_norm": 0.003299064701423049, + "learning_rate": 2.427956902742276e-06, + "loss": 0.0335, + "num_input_tokens_seen": 190697152, + "step": 156720 + }, + { + "epoch": 17.45461632698519, + "grad_norm": 0.001087755081243813, + "learning_rate": 2.4269124916850156e-06, + "loss": 0.1237, + "num_input_tokens_seen": 190703008, + "step": 156725 + }, + { + "epoch": 17.455173181868805, + "grad_norm": 0.02426191419363022, + "learning_rate": 2.4258682938486516e-06, + "loss": 0.0087, + "num_input_tokens_seen": 190709056, + "step": 156730 + }, + { + "epoch": 17.45573003675242, + "grad_norm": 0.8922557830810547, + "learning_rate": 2.424824309243043e-06, + "loss": 0.1975, + "num_input_tokens_seen": 190715072, + "step": 156735 + }, + { + "epoch": 17.45628689163604, + "grad_norm": 0.000652482092846185, + "learning_rate": 2.4237805378780593e-06, + "loss": 0.0764, + "num_input_tokens_seen": 190721184, + "step": 156740 + }, + { + "epoch": 17.456843746519656, + "grad_norm": 0.011396914720535278, + "learning_rate": 2.422736979763554e-06, + "loss": 0.0905, + "num_input_tokens_seen": 190727392, + "step": 156745 + }, + { + "epoch": 17.457400601403275, + "grad_norm": 0.20930622518062592, + "learning_rate": 2.4216936349093945e-06, + "loss": 0.0314, + "num_input_tokens_seen": 190733568, + "step": 156750 + }, + { + "epoch": 17.45795745628689, + "grad_norm": 0.2390635907649994, + "learning_rate": 2.4206505033254196e-06, + "loss": 0.0932, + "num_input_tokens_seen": 190739584, + "step": 156755 + }, + { + "epoch": 17.458514311170507, + "grad_norm": 0.032596077769994736, + "learning_rate": 2.419607585021494e-06, + "loss": 0.0171, + "num_input_tokens_seen": 190745984, + "step": 156760 + }, + { + "epoch": 17.459071166054127, + "grad_norm": 0.23332832753658295, + "learning_rate": 2.4185648800074625e-06, + "loss": 0.0522, + "num_input_tokens_seen": 190752128, + "step": 156765 + }, + { + "epoch": 17.459628020937743, + "grad_norm": 0.0786135271191597, + "learning_rate": 2.4175223882931806e-06, + "loss": 0.0336, + "num_input_tokens_seen": 190758336, + "step": 156770 + }, + { + "epoch": 17.460184875821362, + "grad_norm": 0.2381712645292282, + "learning_rate": 2.416480109888494e-06, + "loss": 0.0571, + "num_input_tokens_seen": 190764480, + "step": 156775 + }, + { + "epoch": 17.460741730704978, + "grad_norm": 0.3530511260032654, + "learning_rate": 2.415438044803248e-06, + "loss": 0.0099, + "num_input_tokens_seen": 190770208, + "step": 156780 + }, + { + "epoch": 17.461298585588594, + "grad_norm": 0.15989404916763306, + "learning_rate": 2.4143961930472773e-06, + "loss": 0.007, + "num_input_tokens_seen": 190776352, + "step": 156785 + }, + { + "epoch": 17.461855440472213, + "grad_norm": 1.458688735961914, + "learning_rate": 2.413354554630434e-06, + "loss": 0.0965, + "num_input_tokens_seen": 190782368, + "step": 156790 + }, + { + "epoch": 17.46241229535583, + "grad_norm": 0.0002209876838605851, + "learning_rate": 2.4123131295625547e-06, + "loss": 0.0015, + "num_input_tokens_seen": 190788608, + "step": 156795 + }, + { + "epoch": 17.46296915023945, + "grad_norm": 0.04288678988814354, + "learning_rate": 2.411271917853472e-06, + "loss": 0.0477, + "num_input_tokens_seen": 190794464, + "step": 156800 + }, + { + "epoch": 17.463526005123065, + "grad_norm": 0.009256704710423946, + "learning_rate": 2.4102309195130236e-06, + "loss": 0.0138, + "num_input_tokens_seen": 190800576, + "step": 156805 + }, + { + "epoch": 17.464082860006684, + "grad_norm": 0.0003148401156067848, + "learning_rate": 2.4091901345510425e-06, + "loss": 0.0371, + "num_input_tokens_seen": 190806720, + "step": 156810 + }, + { + "epoch": 17.4646397148903, + "grad_norm": 1.194105625152588, + "learning_rate": 2.4081495629773577e-06, + "loss": 0.0662, + "num_input_tokens_seen": 190813184, + "step": 156815 + }, + { + "epoch": 17.465196569773916, + "grad_norm": 9.49991590459831e-05, + "learning_rate": 2.407109204801811e-06, + "loss": 0.0159, + "num_input_tokens_seen": 190819296, + "step": 156820 + }, + { + "epoch": 17.465753424657535, + "grad_norm": 0.010339759290218353, + "learning_rate": 2.406069060034208e-06, + "loss": 0.0002, + "num_input_tokens_seen": 190825664, + "step": 156825 + }, + { + "epoch": 17.46631027954115, + "grad_norm": 0.003055356442928314, + "learning_rate": 2.4050291286843912e-06, + "loss": 0.1315, + "num_input_tokens_seen": 190831648, + "step": 156830 + }, + { + "epoch": 17.46686713442477, + "grad_norm": 1.6549180746078491, + "learning_rate": 2.4039894107621726e-06, + "loss": 0.0495, + "num_input_tokens_seen": 190837856, + "step": 156835 + }, + { + "epoch": 17.467423989308386, + "grad_norm": 0.5833747386932373, + "learning_rate": 2.40294990627738e-06, + "loss": 0.0211, + "num_input_tokens_seen": 190843872, + "step": 156840 + }, + { + "epoch": 17.467980844192002, + "grad_norm": 0.2428770214319229, + "learning_rate": 2.40191061523983e-06, + "loss": 0.0073, + "num_input_tokens_seen": 190850496, + "step": 156845 + }, + { + "epoch": 17.468537699075622, + "grad_norm": 0.0010976119665428996, + "learning_rate": 2.400871537659341e-06, + "loss": 0.0022, + "num_input_tokens_seen": 190856576, + "step": 156850 + }, + { + "epoch": 17.469094553959238, + "grad_norm": 0.17781229317188263, + "learning_rate": 2.399832673545721e-06, + "loss": 0.1011, + "num_input_tokens_seen": 190862720, + "step": 156855 + }, + { + "epoch": 17.469651408842857, + "grad_norm": 0.044886231422424316, + "learning_rate": 2.398794022908793e-06, + "loss": 0.0068, + "num_input_tokens_seen": 190868896, + "step": 156860 + }, + { + "epoch": 17.470208263726473, + "grad_norm": 0.0407334640622139, + "learning_rate": 2.3977555857583654e-06, + "loss": 0.0071, + "num_input_tokens_seen": 190874752, + "step": 156865 + }, + { + "epoch": 17.47076511861009, + "grad_norm": 0.002223446499556303, + "learning_rate": 2.396717362104242e-06, + "loss": 0.0268, + "num_input_tokens_seen": 190881056, + "step": 156870 + }, + { + "epoch": 17.47132197349371, + "grad_norm": 1.0487046241760254, + "learning_rate": 2.3956793519562286e-06, + "loss": 0.1168, + "num_input_tokens_seen": 190886976, + "step": 156875 + }, + { + "epoch": 17.471878828377324, + "grad_norm": 0.0013428129022940993, + "learning_rate": 2.3946415553241396e-06, + "loss": 0.0624, + "num_input_tokens_seen": 190892736, + "step": 156880 + }, + { + "epoch": 17.472435683260944, + "grad_norm": 0.00011777874897234142, + "learning_rate": 2.3936039722177674e-06, + "loss": 0.1481, + "num_input_tokens_seen": 190898912, + "step": 156885 + }, + { + "epoch": 17.47299253814456, + "grad_norm": 0.03984089195728302, + "learning_rate": 2.39256660264692e-06, + "loss": 0.0326, + "num_input_tokens_seen": 190904992, + "step": 156890 + }, + { + "epoch": 17.473549393028176, + "grad_norm": 0.8023115396499634, + "learning_rate": 2.3915294466213963e-06, + "loss": 0.0394, + "num_input_tokens_seen": 190910368, + "step": 156895 + }, + { + "epoch": 17.474106247911795, + "grad_norm": 0.023107431828975677, + "learning_rate": 2.39049250415099e-06, + "loss": 0.1122, + "num_input_tokens_seen": 190916448, + "step": 156900 + }, + { + "epoch": 17.47466310279541, + "grad_norm": 0.2865636646747589, + "learning_rate": 2.3894557752454917e-06, + "loss": 0.038, + "num_input_tokens_seen": 190922592, + "step": 156905 + }, + { + "epoch": 17.47521995767903, + "grad_norm": 0.6588441729545593, + "learning_rate": 2.3884192599147036e-06, + "loss": 0.0792, + "num_input_tokens_seen": 190928064, + "step": 156910 + }, + { + "epoch": 17.475776812562646, + "grad_norm": 0.0030607779044657946, + "learning_rate": 2.387382958168413e-06, + "loss": 0.0605, + "num_input_tokens_seen": 190934304, + "step": 156915 + }, + { + "epoch": 17.476333667446262, + "grad_norm": 0.5435815453529358, + "learning_rate": 2.386346870016404e-06, + "loss": 0.1445, + "num_input_tokens_seen": 190940672, + "step": 156920 + }, + { + "epoch": 17.47689052232988, + "grad_norm": 0.10339860618114471, + "learning_rate": 2.3853109954684647e-06, + "loss": 0.0289, + "num_input_tokens_seen": 190946752, + "step": 156925 + }, + { + "epoch": 17.477447377213498, + "grad_norm": 0.03328379988670349, + "learning_rate": 2.384275334534386e-06, + "loss": 0.1037, + "num_input_tokens_seen": 190952768, + "step": 156930 + }, + { + "epoch": 17.478004232097117, + "grad_norm": 0.023723427206277847, + "learning_rate": 2.3832398872239397e-06, + "loss": 0.051, + "num_input_tokens_seen": 190958688, + "step": 156935 + }, + { + "epoch": 17.478561086980733, + "grad_norm": 0.941173255443573, + "learning_rate": 2.3822046535469237e-06, + "loss": 0.022, + "num_input_tokens_seen": 190964544, + "step": 156940 + }, + { + "epoch": 17.47911794186435, + "grad_norm": 0.7537028193473816, + "learning_rate": 2.3811696335130974e-06, + "loss": 0.0128, + "num_input_tokens_seen": 190970688, + "step": 156945 + }, + { + "epoch": 17.479674796747968, + "grad_norm": 0.047102246433496475, + "learning_rate": 2.3801348271322495e-06, + "loss": 0.0506, + "num_input_tokens_seen": 190976800, + "step": 156950 + }, + { + "epoch": 17.480231651631584, + "grad_norm": 0.015603302977979183, + "learning_rate": 2.3791002344141445e-06, + "loss": 0.0582, + "num_input_tokens_seen": 190982912, + "step": 156955 + }, + { + "epoch": 17.480788506515204, + "grad_norm": 0.0007139651570469141, + "learning_rate": 2.378065855368569e-06, + "loss": 0.0012, + "num_input_tokens_seen": 190989216, + "step": 156960 + }, + { + "epoch": 17.48134536139882, + "grad_norm": 0.720188319683075, + "learning_rate": 2.3770316900052823e-06, + "loss": 0.0487, + "num_input_tokens_seen": 190995584, + "step": 156965 + }, + { + "epoch": 17.481902216282435, + "grad_norm": 0.011711006052792072, + "learning_rate": 2.37599773833406e-06, + "loss": 0.0209, + "num_input_tokens_seen": 191001632, + "step": 156970 + }, + { + "epoch": 17.482459071166055, + "grad_norm": 0.3262151777744293, + "learning_rate": 2.37496400036466e-06, + "loss": 0.0113, + "num_input_tokens_seen": 191007904, + "step": 156975 + }, + { + "epoch": 17.48301592604967, + "grad_norm": 0.2529565095901489, + "learning_rate": 2.373930476106856e-06, + "loss": 0.0137, + "num_input_tokens_seen": 191013920, + "step": 156980 + }, + { + "epoch": 17.48357278093329, + "grad_norm": 0.5494093894958496, + "learning_rate": 2.3728971655704063e-06, + "loss": 0.0076, + "num_input_tokens_seen": 191020192, + "step": 156985 + }, + { + "epoch": 17.484129635816906, + "grad_norm": 0.03153486177325249, + "learning_rate": 2.37186406876507e-06, + "loss": 0.0229, + "num_input_tokens_seen": 191026272, + "step": 156990 + }, + { + "epoch": 17.484686490700522, + "grad_norm": 0.10285325348377228, + "learning_rate": 2.370831185700603e-06, + "loss": 0.0047, + "num_input_tokens_seen": 191032640, + "step": 156995 + }, + { + "epoch": 17.48524334558414, + "grad_norm": 1.3315984010696411, + "learning_rate": 2.3697985163867704e-06, + "loss": 0.0211, + "num_input_tokens_seen": 191038752, + "step": 157000 + }, + { + "epoch": 17.485800200467757, + "grad_norm": 0.0029031841550022364, + "learning_rate": 2.368766060833319e-06, + "loss": 0.0317, + "num_input_tokens_seen": 191044608, + "step": 157005 + }, + { + "epoch": 17.486357055351377, + "grad_norm": 0.08288448303937912, + "learning_rate": 2.3677338190500107e-06, + "loss": 0.002, + "num_input_tokens_seen": 191050624, + "step": 157010 + }, + { + "epoch": 17.486913910234993, + "grad_norm": 0.0012507077772170305, + "learning_rate": 2.36670179104658e-06, + "loss": 0.0113, + "num_input_tokens_seen": 191056672, + "step": 157015 + }, + { + "epoch": 17.48747076511861, + "grad_norm": 0.16542181372642517, + "learning_rate": 2.3656699768327905e-06, + "loss": 0.078, + "num_input_tokens_seen": 191062688, + "step": 157020 + }, + { + "epoch": 17.488027620002228, + "grad_norm": 0.35473567247390747, + "learning_rate": 2.3646383764183767e-06, + "loss": 0.0668, + "num_input_tokens_seen": 191068800, + "step": 157025 + }, + { + "epoch": 17.488584474885844, + "grad_norm": 0.04651809483766556, + "learning_rate": 2.363606989813094e-06, + "loss": 0.0193, + "num_input_tokens_seen": 191075040, + "step": 157030 + }, + { + "epoch": 17.489141329769463, + "grad_norm": 0.046292051672935486, + "learning_rate": 2.3625758170266797e-06, + "loss": 0.0023, + "num_input_tokens_seen": 191081280, + "step": 157035 + }, + { + "epoch": 17.48969818465308, + "grad_norm": 1.186814308166504, + "learning_rate": 2.361544858068873e-06, + "loss": 0.0488, + "num_input_tokens_seen": 191087424, + "step": 157040 + }, + { + "epoch": 17.490255039536695, + "grad_norm": 0.0003981638001278043, + "learning_rate": 2.360514112949408e-06, + "loss": 0.0265, + "num_input_tokens_seen": 191093824, + "step": 157045 + }, + { + "epoch": 17.490811894420315, + "grad_norm": 0.5630770325660706, + "learning_rate": 2.359483581678029e-06, + "loss": 0.0286, + "num_input_tokens_seen": 191099872, + "step": 157050 + }, + { + "epoch": 17.49136874930393, + "grad_norm": 1.9628182649612427, + "learning_rate": 2.3584532642644706e-06, + "loss": 0.153, + "num_input_tokens_seen": 191105728, + "step": 157055 + }, + { + "epoch": 17.49192560418755, + "grad_norm": 0.05837767571210861, + "learning_rate": 2.3574231607184582e-06, + "loss": 0.0283, + "num_input_tokens_seen": 191111936, + "step": 157060 + }, + { + "epoch": 17.492482459071166, + "grad_norm": 0.8613378405570984, + "learning_rate": 2.35639327104972e-06, + "loss": 0.0187, + "num_input_tokens_seen": 191118368, + "step": 157065 + }, + { + "epoch": 17.49303931395478, + "grad_norm": 0.39760705828666687, + "learning_rate": 2.3553635952679954e-06, + "loss": 0.0051, + "num_input_tokens_seen": 191124672, + "step": 157070 + }, + { + "epoch": 17.4935961688384, + "grad_norm": 0.03532327711582184, + "learning_rate": 2.3543341333829988e-06, + "loss": 0.0182, + "num_input_tokens_seen": 191130720, + "step": 157075 + }, + { + "epoch": 17.494153023722017, + "grad_norm": 0.5737445950508118, + "learning_rate": 2.35330488540447e-06, + "loss": 0.1206, + "num_input_tokens_seen": 191136864, + "step": 157080 + }, + { + "epoch": 17.494709878605637, + "grad_norm": 0.44564875960350037, + "learning_rate": 2.352275851342109e-06, + "loss": 0.0145, + "num_input_tokens_seen": 191143008, + "step": 157085 + }, + { + "epoch": 17.495266733489252, + "grad_norm": 2.2789952754974365, + "learning_rate": 2.3512470312056554e-06, + "loss": 0.1423, + "num_input_tokens_seen": 191148832, + "step": 157090 + }, + { + "epoch": 17.49582358837287, + "grad_norm": 1.162169098854065, + "learning_rate": 2.350218425004813e-06, + "loss": 0.0532, + "num_input_tokens_seen": 191154720, + "step": 157095 + }, + { + "epoch": 17.496380443256488, + "grad_norm": 0.002170389052480459, + "learning_rate": 2.349190032749307e-06, + "loss": 0.0387, + "num_input_tokens_seen": 191160832, + "step": 157100 + }, + { + "epoch": 17.496937298140104, + "grad_norm": 0.0023097845260053873, + "learning_rate": 2.3481618544488522e-06, + "loss": 0.0644, + "num_input_tokens_seen": 191166976, + "step": 157105 + }, + { + "epoch": 17.497494153023723, + "grad_norm": 0.0254720039665699, + "learning_rate": 2.3471338901131567e-06, + "loss": 0.0072, + "num_input_tokens_seen": 191173120, + "step": 157110 + }, + { + "epoch": 17.49805100790734, + "grad_norm": 1.5031216144561768, + "learning_rate": 2.3461061397519245e-06, + "loss": 0.0948, + "num_input_tokens_seen": 191179232, + "step": 157115 + }, + { + "epoch": 17.498607862790955, + "grad_norm": 3.449559450149536, + "learning_rate": 2.345078603374876e-06, + "loss": 0.1088, + "num_input_tokens_seen": 191185152, + "step": 157120 + }, + { + "epoch": 17.499164717674574, + "grad_norm": 0.9625766277313232, + "learning_rate": 2.3440512809917075e-06, + "loss": 0.1196, + "num_input_tokens_seen": 191190528, + "step": 157125 + }, + { + "epoch": 17.49972157255819, + "grad_norm": 2.4241604804992676, + "learning_rate": 2.3430241726121294e-06, + "loss": 0.1683, + "num_input_tokens_seen": 191196160, + "step": 157130 + }, + { + "epoch": 17.50027842744181, + "grad_norm": 1.1600526571273804, + "learning_rate": 2.341997278245836e-06, + "loss": 0.1696, + "num_input_tokens_seen": 191202208, + "step": 157135 + }, + { + "epoch": 17.500835282325426, + "grad_norm": 0.8686661720275879, + "learning_rate": 2.3409705979025366e-06, + "loss": 0.0249, + "num_input_tokens_seen": 191208192, + "step": 157140 + }, + { + "epoch": 17.501392137209045, + "grad_norm": 1.7044761180877686, + "learning_rate": 2.339944131591923e-06, + "loss": 0.0786, + "num_input_tokens_seen": 191214304, + "step": 157145 + }, + { + "epoch": 17.50194899209266, + "grad_norm": 1.427522897720337, + "learning_rate": 2.338917879323685e-06, + "loss": 0.1263, + "num_input_tokens_seen": 191220384, + "step": 157150 + }, + { + "epoch": 17.502505846976277, + "grad_norm": 0.0010223889257758856, + "learning_rate": 2.337891841107537e-06, + "loss": 0.0427, + "num_input_tokens_seen": 191226272, + "step": 157155 + }, + { + "epoch": 17.503062701859896, + "grad_norm": 0.05676615610718727, + "learning_rate": 2.3368660169531433e-06, + "loss": 0.0236, + "num_input_tokens_seen": 191232416, + "step": 157160 + }, + { + "epoch": 17.503619556743512, + "grad_norm": 0.004222068004310131, + "learning_rate": 2.3358404068702154e-06, + "loss": 0.0452, + "num_input_tokens_seen": 191238432, + "step": 157165 + }, + { + "epoch": 17.50417641162713, + "grad_norm": 0.009795012883841991, + "learning_rate": 2.334815010868427e-06, + "loss": 0.0543, + "num_input_tokens_seen": 191244576, + "step": 157170 + }, + { + "epoch": 17.504733266510748, + "grad_norm": 0.004067368805408478, + "learning_rate": 2.333789828957475e-06, + "loss": 0.0042, + "num_input_tokens_seen": 191250912, + "step": 157175 + }, + { + "epoch": 17.505290121394363, + "grad_norm": 0.04501546919345856, + "learning_rate": 2.332764861147038e-06, + "loss": 0.0873, + "num_input_tokens_seen": 191256864, + "step": 157180 + }, + { + "epoch": 17.505846976277983, + "grad_norm": 0.01721237599849701, + "learning_rate": 2.3317401074467976e-06, + "loss": 0.0134, + "num_input_tokens_seen": 191262944, + "step": 157185 + }, + { + "epoch": 17.5064038311616, + "grad_norm": 0.007722113747149706, + "learning_rate": 2.330715567866429e-06, + "loss": 0.0737, + "num_input_tokens_seen": 191269248, + "step": 157190 + }, + { + "epoch": 17.50696068604522, + "grad_norm": 0.011693524196743965, + "learning_rate": 2.329691242415619e-06, + "loss": 0.0077, + "num_input_tokens_seen": 191275136, + "step": 157195 + }, + { + "epoch": 17.507517540928834, + "grad_norm": 0.0007738429703749716, + "learning_rate": 2.328667131104037e-06, + "loss": 0.0862, + "num_input_tokens_seen": 191281376, + "step": 157200 + }, + { + "epoch": 17.50807439581245, + "grad_norm": 0.07820466160774231, + "learning_rate": 2.3276432339413594e-06, + "loss": 0.0037, + "num_input_tokens_seen": 191287040, + "step": 157205 + }, + { + "epoch": 17.50863125069607, + "grad_norm": 0.0033412715420126915, + "learning_rate": 2.3266195509372507e-06, + "loss": 0.011, + "num_input_tokens_seen": 191292992, + "step": 157210 + }, + { + "epoch": 17.509188105579685, + "grad_norm": 0.00023395539028570056, + "learning_rate": 2.3255960821013917e-06, + "loss": 0.0071, + "num_input_tokens_seen": 191299232, + "step": 157215 + }, + { + "epoch": 17.509744960463305, + "grad_norm": 2.3229706287384033, + "learning_rate": 2.3245728274434386e-06, + "loss": 0.0357, + "num_input_tokens_seen": 191305248, + "step": 157220 + }, + { + "epoch": 17.51030181534692, + "grad_norm": 0.12026640772819519, + "learning_rate": 2.3235497869730726e-06, + "loss": 0.0778, + "num_input_tokens_seen": 191311424, + "step": 157225 + }, + { + "epoch": 17.510858670230537, + "grad_norm": 1.939406156539917, + "learning_rate": 2.322526960699936e-06, + "loss": 0.1023, + "num_input_tokens_seen": 191317408, + "step": 157230 + }, + { + "epoch": 17.511415525114156, + "grad_norm": 0.0018756578210741282, + "learning_rate": 2.3215043486337075e-06, + "loss": 0.0301, + "num_input_tokens_seen": 191323424, + "step": 157235 + }, + { + "epoch": 17.511972379997772, + "grad_norm": 0.6566323637962341, + "learning_rate": 2.320481950784034e-06, + "loss": 0.0158, + "num_input_tokens_seen": 191329984, + "step": 157240 + }, + { + "epoch": 17.51252923488139, + "grad_norm": 0.2353115975856781, + "learning_rate": 2.3194597671605837e-06, + "loss": 0.0038, + "num_input_tokens_seen": 191336192, + "step": 157245 + }, + { + "epoch": 17.513086089765007, + "grad_norm": 0.002286274218931794, + "learning_rate": 2.3184377977730098e-06, + "loss": 0.0018, + "num_input_tokens_seen": 191342432, + "step": 157250 + }, + { + "epoch": 17.513642944648623, + "grad_norm": 0.007413348648697138, + "learning_rate": 2.31741604263096e-06, + "loss": 0.0358, + "num_input_tokens_seen": 191348992, + "step": 157255 + }, + { + "epoch": 17.514199799532243, + "grad_norm": 0.0027304450049996376, + "learning_rate": 2.316394501744085e-06, + "loss": 0.0112, + "num_input_tokens_seen": 191354720, + "step": 157260 + }, + { + "epoch": 17.51475665441586, + "grad_norm": 0.00021665022359229624, + "learning_rate": 2.315373175122043e-06, + "loss": 0.0175, + "num_input_tokens_seen": 191360992, + "step": 157265 + }, + { + "epoch": 17.515313509299478, + "grad_norm": 0.06194458156824112, + "learning_rate": 2.3143520627744746e-06, + "loss": 0.064, + "num_input_tokens_seen": 191367360, + "step": 157270 + }, + { + "epoch": 17.515870364183094, + "grad_norm": 0.0002010047173826024, + "learning_rate": 2.3133311647110273e-06, + "loss": 0.009, + "num_input_tokens_seen": 191373728, + "step": 157275 + }, + { + "epoch": 17.51642721906671, + "grad_norm": 0.017403174191713333, + "learning_rate": 2.3123104809413405e-06, + "loss": 0.1063, + "num_input_tokens_seen": 191379648, + "step": 157280 + }, + { + "epoch": 17.51698407395033, + "grad_norm": 0.017551878467202187, + "learning_rate": 2.311290011475059e-06, + "loss": 0.0708, + "num_input_tokens_seen": 191385696, + "step": 157285 + }, + { + "epoch": 17.517540928833945, + "grad_norm": 0.630725085735321, + "learning_rate": 2.31026975632182e-06, + "loss": 0.0256, + "num_input_tokens_seen": 191392000, + "step": 157290 + }, + { + "epoch": 17.518097783717565, + "grad_norm": 0.00047601142432540655, + "learning_rate": 2.309249715491266e-06, + "loss": 0.0004, + "num_input_tokens_seen": 191398176, + "step": 157295 + }, + { + "epoch": 17.51865463860118, + "grad_norm": 0.641044557094574, + "learning_rate": 2.308229888993027e-06, + "loss": 0.0347, + "num_input_tokens_seen": 191404320, + "step": 157300 + }, + { + "epoch": 17.519211493484796, + "grad_norm": 0.000335106800775975, + "learning_rate": 2.3072102768367353e-06, + "loss": 0.0065, + "num_input_tokens_seen": 191410144, + "step": 157305 + }, + { + "epoch": 17.519768348368416, + "grad_norm": 0.026725564152002335, + "learning_rate": 2.3061908790320218e-06, + "loss": 0.0717, + "num_input_tokens_seen": 191416064, + "step": 157310 + }, + { + "epoch": 17.520325203252032, + "grad_norm": 0.0007548303110525012, + "learning_rate": 2.3051716955885205e-06, + "loss": 0.0297, + "num_input_tokens_seen": 191422336, + "step": 157315 + }, + { + "epoch": 17.52088205813565, + "grad_norm": 1.4357831478118896, + "learning_rate": 2.3041527265158568e-06, + "loss": 0.0197, + "num_input_tokens_seen": 191428416, + "step": 157320 + }, + { + "epoch": 17.521438913019267, + "grad_norm": 2.989777088165283, + "learning_rate": 2.3031339718236537e-06, + "loss": 0.1376, + "num_input_tokens_seen": 191434720, + "step": 157325 + }, + { + "epoch": 17.521995767902883, + "grad_norm": 0.006538944318890572, + "learning_rate": 2.3021154315215314e-06, + "loss": 0.0031, + "num_input_tokens_seen": 191440352, + "step": 157330 + }, + { + "epoch": 17.522552622786502, + "grad_norm": 0.2699151039123535, + "learning_rate": 2.3010971056191157e-06, + "loss": 0.0473, + "num_input_tokens_seen": 191446528, + "step": 157335 + }, + { + "epoch": 17.52310947767012, + "grad_norm": 0.49631184339523315, + "learning_rate": 2.3000789941260203e-06, + "loss": 0.0086, + "num_input_tokens_seen": 191452640, + "step": 157340 + }, + { + "epoch": 17.523666332553738, + "grad_norm": 0.10145928710699081, + "learning_rate": 2.2990610970518773e-06, + "loss": 0.0472, + "num_input_tokens_seen": 191458528, + "step": 157345 + }, + { + "epoch": 17.524223187437354, + "grad_norm": 0.0016848199302330613, + "learning_rate": 2.2980434144062785e-06, + "loss": 0.0262, + "num_input_tokens_seen": 191464704, + "step": 157350 + }, + { + "epoch": 17.52478004232097, + "grad_norm": 0.058914415538311005, + "learning_rate": 2.2970259461988525e-06, + "loss": 0.0457, + "num_input_tokens_seen": 191470432, + "step": 157355 + }, + { + "epoch": 17.52533689720459, + "grad_norm": 1.1701539754867554, + "learning_rate": 2.2960086924392026e-06, + "loss": 0.1058, + "num_input_tokens_seen": 191475936, + "step": 157360 + }, + { + "epoch": 17.525893752088205, + "grad_norm": 0.01148316077888012, + "learning_rate": 2.2949916531369438e-06, + "loss": 0.0886, + "num_input_tokens_seen": 191481920, + "step": 157365 + }, + { + "epoch": 17.526450606971824, + "grad_norm": 0.46141719818115234, + "learning_rate": 2.2939748283016786e-06, + "loss": 0.0183, + "num_input_tokens_seen": 191488128, + "step": 157370 + }, + { + "epoch": 17.52700746185544, + "grad_norm": 0.00027250731363892555, + "learning_rate": 2.292958217943017e-06, + "loss": 0.0173, + "num_input_tokens_seen": 191494240, + "step": 157375 + }, + { + "epoch": 17.527564316739056, + "grad_norm": 0.0008893389021977782, + "learning_rate": 2.2919418220705506e-06, + "loss": 0.0226, + "num_input_tokens_seen": 191500416, + "step": 157380 + }, + { + "epoch": 17.528121171622676, + "grad_norm": 0.8648817539215088, + "learning_rate": 2.2909256406938917e-06, + "loss": 0.0288, + "num_input_tokens_seen": 191506432, + "step": 157385 + }, + { + "epoch": 17.52867802650629, + "grad_norm": 0.06398230046033859, + "learning_rate": 2.289909673822632e-06, + "loss": 0.0057, + "num_input_tokens_seen": 191512480, + "step": 157390 + }, + { + "epoch": 17.52923488138991, + "grad_norm": 0.0044409180991351604, + "learning_rate": 2.2888939214663756e-06, + "loss": 0.0114, + "num_input_tokens_seen": 191518528, + "step": 157395 + }, + { + "epoch": 17.529791736273527, + "grad_norm": 0.011840597726404667, + "learning_rate": 2.2878783836347036e-06, + "loss": 0.0101, + "num_input_tokens_seen": 191524864, + "step": 157400 + }, + { + "epoch": 17.530348591157143, + "grad_norm": 0.10813474655151367, + "learning_rate": 2.2868630603372215e-06, + "loss": 0.0617, + "num_input_tokens_seen": 191530624, + "step": 157405 + }, + { + "epoch": 17.530905446040762, + "grad_norm": 1.9094809293746948, + "learning_rate": 2.2858479515835114e-06, + "loss": 0.1357, + "num_input_tokens_seen": 191536928, + "step": 157410 + }, + { + "epoch": 17.531462300924378, + "grad_norm": 3.219900369644165, + "learning_rate": 2.284833057383176e-06, + "loss": 0.0506, + "num_input_tokens_seen": 191543040, + "step": 157415 + }, + { + "epoch": 17.532019155807998, + "grad_norm": 0.8135883808135986, + "learning_rate": 2.2838183777457804e-06, + "loss": 0.0106, + "num_input_tokens_seen": 191549344, + "step": 157420 + }, + { + "epoch": 17.532576010691614, + "grad_norm": 0.04730585217475891, + "learning_rate": 2.2828039126809283e-06, + "loss": 0.0164, + "num_input_tokens_seen": 191555328, + "step": 157425 + }, + { + "epoch": 17.53313286557523, + "grad_norm": 1.3954553604125977, + "learning_rate": 2.281789662198186e-06, + "loss": 0.0174, + "num_input_tokens_seen": 191561728, + "step": 157430 + }, + { + "epoch": 17.53368972045885, + "grad_norm": 0.4449422359466553, + "learning_rate": 2.2807756263071466e-06, + "loss": 0.0805, + "num_input_tokens_seen": 191568160, + "step": 157435 + }, + { + "epoch": 17.534246575342465, + "grad_norm": 0.07441195845603943, + "learning_rate": 2.2797618050173858e-06, + "loss": 0.0021, + "num_input_tokens_seen": 191574400, + "step": 157440 + }, + { + "epoch": 17.534803430226084, + "grad_norm": 0.012885387055575848, + "learning_rate": 2.278748198338476e-06, + "loss": 0.001, + "num_input_tokens_seen": 191580352, + "step": 157445 + }, + { + "epoch": 17.5353602851097, + "grad_norm": 0.033123817294836044, + "learning_rate": 2.277734806279988e-06, + "loss": 0.0063, + "num_input_tokens_seen": 191586496, + "step": 157450 + }, + { + "epoch": 17.535917139993316, + "grad_norm": 0.00026692444225773215, + "learning_rate": 2.276721628851505e-06, + "loss": 0.0005, + "num_input_tokens_seen": 191592544, + "step": 157455 + }, + { + "epoch": 17.536473994876935, + "grad_norm": 0.2520824074745178, + "learning_rate": 2.2757086660625927e-06, + "loss": 0.087, + "num_input_tokens_seen": 191598464, + "step": 157460 + }, + { + "epoch": 17.53703084976055, + "grad_norm": 1.2479667663574219, + "learning_rate": 2.27469591792282e-06, + "loss": 0.0285, + "num_input_tokens_seen": 191604416, + "step": 157465 + }, + { + "epoch": 17.53758770464417, + "grad_norm": 0.001973297679796815, + "learning_rate": 2.2736833844417436e-06, + "loss": 0.0161, + "num_input_tokens_seen": 191610752, + "step": 157470 + }, + { + "epoch": 17.538144559527787, + "grad_norm": 0.2405344694852829, + "learning_rate": 2.2726710656289425e-06, + "loss": 0.0332, + "num_input_tokens_seen": 191616096, + "step": 157475 + }, + { + "epoch": 17.538701414411406, + "grad_norm": 0.33669859170913696, + "learning_rate": 2.2716589614939666e-06, + "loss": 0.0067, + "num_input_tokens_seen": 191622496, + "step": 157480 + }, + { + "epoch": 17.539258269295022, + "grad_norm": 0.019264081493020058, + "learning_rate": 2.2706470720463865e-06, + "loss": 0.0049, + "num_input_tokens_seen": 191628608, + "step": 157485 + }, + { + "epoch": 17.539815124178638, + "grad_norm": 0.006311220582574606, + "learning_rate": 2.2696353972957553e-06, + "loss": 0.0195, + "num_input_tokens_seen": 191635008, + "step": 157490 + }, + { + "epoch": 17.540371979062257, + "grad_norm": 0.14601624011993408, + "learning_rate": 2.2686239372516265e-06, + "loss": 0.0097, + "num_input_tokens_seen": 191641216, + "step": 157495 + }, + { + "epoch": 17.540928833945873, + "grad_norm": 0.006817779038101435, + "learning_rate": 2.267612691923554e-06, + "loss": 0.1227, + "num_input_tokens_seen": 191647392, + "step": 157500 + }, + { + "epoch": 17.54148568882949, + "grad_norm": 0.012134475633502007, + "learning_rate": 2.2666016613210967e-06, + "loss": 0.0061, + "num_input_tokens_seen": 191653632, + "step": 157505 + }, + { + "epoch": 17.54204254371311, + "grad_norm": 0.00019862489716615528, + "learning_rate": 2.2655908454538022e-06, + "loss": 0.0012, + "num_input_tokens_seen": 191659872, + "step": 157510 + }, + { + "epoch": 17.542599398596725, + "grad_norm": 1.852443814277649, + "learning_rate": 2.264580244331216e-06, + "loss": 0.0955, + "num_input_tokens_seen": 191664800, + "step": 157515 + }, + { + "epoch": 17.543156253480344, + "grad_norm": 0.8681797385215759, + "learning_rate": 2.26356985796288e-06, + "loss": 0.0187, + "num_input_tokens_seen": 191671136, + "step": 157520 + }, + { + "epoch": 17.54371310836396, + "grad_norm": 2.0054094791412354, + "learning_rate": 2.262559686358345e-06, + "loss": 0.0367, + "num_input_tokens_seen": 191677216, + "step": 157525 + }, + { + "epoch": 17.54426996324758, + "grad_norm": 0.011795489117503166, + "learning_rate": 2.261549729527154e-06, + "loss": 0.001, + "num_input_tokens_seen": 191683520, + "step": 157530 + }, + { + "epoch": 17.544826818131195, + "grad_norm": 0.4100848436355591, + "learning_rate": 2.260539987478841e-06, + "loss": 0.0141, + "num_input_tokens_seen": 191689088, + "step": 157535 + }, + { + "epoch": 17.54538367301481, + "grad_norm": 0.06524910032749176, + "learning_rate": 2.2595304602229442e-06, + "loss": 0.0018, + "num_input_tokens_seen": 191695072, + "step": 157540 + }, + { + "epoch": 17.54594052789843, + "grad_norm": 0.0037767074536532164, + "learning_rate": 2.258521147769005e-06, + "loss": 0.0458, + "num_input_tokens_seen": 191701088, + "step": 157545 + }, + { + "epoch": 17.546497382782047, + "grad_norm": 0.0002566455223131925, + "learning_rate": 2.2575120501265534e-06, + "loss": 0.0024, + "num_input_tokens_seen": 191707264, + "step": 157550 + }, + { + "epoch": 17.547054237665666, + "grad_norm": 0.006258106324821711, + "learning_rate": 2.2565031673051157e-06, + "loss": 0.0099, + "num_input_tokens_seen": 191713408, + "step": 157555 + }, + { + "epoch": 17.547611092549282, + "grad_norm": 0.036797232925891876, + "learning_rate": 2.25549449931424e-06, + "loss": 0.0505, + "num_input_tokens_seen": 191719200, + "step": 157560 + }, + { + "epoch": 17.548167947432898, + "grad_norm": 0.08792658895254135, + "learning_rate": 2.254486046163429e-06, + "loss": 0.1162, + "num_input_tokens_seen": 191725408, + "step": 157565 + }, + { + "epoch": 17.548724802316517, + "grad_norm": 0.036995965987443924, + "learning_rate": 2.2534778078622255e-06, + "loss": 0.0435, + "num_input_tokens_seen": 191731328, + "step": 157570 + }, + { + "epoch": 17.549281657200133, + "grad_norm": 0.010939800180494785, + "learning_rate": 2.252469784420144e-06, + "loss": 0.0099, + "num_input_tokens_seen": 191737600, + "step": 157575 + }, + { + "epoch": 17.549838512083753, + "grad_norm": 0.0010626247385516763, + "learning_rate": 2.2514619758467164e-06, + "loss": 0.0091, + "num_input_tokens_seen": 191743712, + "step": 157580 + }, + { + "epoch": 17.55039536696737, + "grad_norm": 0.2916213274002075, + "learning_rate": 2.250454382151457e-06, + "loss": 0.0578, + "num_input_tokens_seen": 191749632, + "step": 157585 + }, + { + "epoch": 17.550952221850984, + "grad_norm": 0.1624573916196823, + "learning_rate": 2.2494470033438797e-06, + "loss": 0.0104, + "num_input_tokens_seen": 191755872, + "step": 157590 + }, + { + "epoch": 17.551509076734604, + "grad_norm": 0.6996270418167114, + "learning_rate": 2.2484398394334997e-06, + "loss": 0.0099, + "num_input_tokens_seen": 191762368, + "step": 157595 + }, + { + "epoch": 17.55206593161822, + "grad_norm": 0.2014353722333908, + "learning_rate": 2.2474328904298374e-06, + "loss": 0.0238, + "num_input_tokens_seen": 191768384, + "step": 157600 + }, + { + "epoch": 17.55262278650184, + "grad_norm": 0.17029958963394165, + "learning_rate": 2.2464261563424014e-06, + "loss": 0.0133, + "num_input_tokens_seen": 191774496, + "step": 157605 + }, + { + "epoch": 17.553179641385455, + "grad_norm": 1.6837527751922607, + "learning_rate": 2.2454196371807015e-06, + "loss": 0.1381, + "num_input_tokens_seen": 191780704, + "step": 157610 + }, + { + "epoch": 17.55373649626907, + "grad_norm": 0.19099651277065277, + "learning_rate": 2.244413332954237e-06, + "loss": 0.015, + "num_input_tokens_seen": 191786976, + "step": 157615 + }, + { + "epoch": 17.55429335115269, + "grad_norm": 0.06838127970695496, + "learning_rate": 2.243407243672527e-06, + "loss": 0.0611, + "num_input_tokens_seen": 191793184, + "step": 157620 + }, + { + "epoch": 17.554850206036306, + "grad_norm": 0.004369885195046663, + "learning_rate": 2.2424013693450597e-06, + "loss": 0.046, + "num_input_tokens_seen": 191798816, + "step": 157625 + }, + { + "epoch": 17.555407060919926, + "grad_norm": 0.0016768211498856544, + "learning_rate": 2.241395709981356e-06, + "loss": 0.0178, + "num_input_tokens_seen": 191804800, + "step": 157630 + }, + { + "epoch": 17.55596391580354, + "grad_norm": 0.17152845859527588, + "learning_rate": 2.2403902655908943e-06, + "loss": 0.0437, + "num_input_tokens_seen": 191811040, + "step": 157635 + }, + { + "epoch": 17.556520770687158, + "grad_norm": 0.00446936534717679, + "learning_rate": 2.2393850361831866e-06, + "loss": 0.0013, + "num_input_tokens_seen": 191817376, + "step": 157640 + }, + { + "epoch": 17.557077625570777, + "grad_norm": 0.00035411500721238554, + "learning_rate": 2.2383800217677162e-06, + "loss": 0.0026, + "num_input_tokens_seen": 191823648, + "step": 157645 + }, + { + "epoch": 17.557634480454393, + "grad_norm": 0.02472238801419735, + "learning_rate": 2.237375222353991e-06, + "loss": 0.0085, + "num_input_tokens_seen": 191829504, + "step": 157650 + }, + { + "epoch": 17.558191335338012, + "grad_norm": 0.02267596498131752, + "learning_rate": 2.2363706379514903e-06, + "loss": 0.0021, + "num_input_tokens_seen": 191835520, + "step": 157655 + }, + { + "epoch": 17.55874819022163, + "grad_norm": 0.010674097575247288, + "learning_rate": 2.235366268569708e-06, + "loss": 0.0024, + "num_input_tokens_seen": 191841888, + "step": 157660 + }, + { + "epoch": 17.559305045105244, + "grad_norm": 0.0005556389805860817, + "learning_rate": 2.2343621142181247e-06, + "loss": 0.061, + "num_input_tokens_seen": 191847680, + "step": 157665 + }, + { + "epoch": 17.559861899988864, + "grad_norm": 0.03377815708518028, + "learning_rate": 2.233358174906236e-06, + "loss": 0.0176, + "num_input_tokens_seen": 191853856, + "step": 157670 + }, + { + "epoch": 17.56041875487248, + "grad_norm": 0.000264186441199854, + "learning_rate": 2.2323544506435203e-06, + "loss": 0.0075, + "num_input_tokens_seen": 191859776, + "step": 157675 + }, + { + "epoch": 17.5609756097561, + "grad_norm": 0.012172672897577286, + "learning_rate": 2.2313509414394586e-06, + "loss": 0.006, + "num_input_tokens_seen": 191866080, + "step": 157680 + }, + { + "epoch": 17.561532464639715, + "grad_norm": 0.06513559073209763, + "learning_rate": 2.230347647303524e-06, + "loss": 0.0025, + "num_input_tokens_seen": 191871776, + "step": 157685 + }, + { + "epoch": 17.56208931952333, + "grad_norm": 0.4853329658508301, + "learning_rate": 2.2293445682452007e-06, + "loss": 0.0254, + "num_input_tokens_seen": 191877888, + "step": 157690 + }, + { + "epoch": 17.56264617440695, + "grad_norm": 0.005383798386901617, + "learning_rate": 2.2283417042739592e-06, + "loss": 0.0055, + "num_input_tokens_seen": 191884160, + "step": 157695 + }, + { + "epoch": 17.563203029290566, + "grad_norm": 0.033806487917900085, + "learning_rate": 2.22733905539928e-06, + "loss": 0.0122, + "num_input_tokens_seen": 191889824, + "step": 157700 + }, + { + "epoch": 17.563759884174186, + "grad_norm": 0.652597963809967, + "learning_rate": 2.2263366216306257e-06, + "loss": 0.0097, + "num_input_tokens_seen": 191895840, + "step": 157705 + }, + { + "epoch": 17.5643167390578, + "grad_norm": 1.7507569789886475, + "learning_rate": 2.225334402977472e-06, + "loss": 0.0237, + "num_input_tokens_seen": 191902048, + "step": 157710 + }, + { + "epoch": 17.564873593941417, + "grad_norm": 0.2784700393676758, + "learning_rate": 2.2243323994492747e-06, + "loss": 0.0104, + "num_input_tokens_seen": 191908064, + "step": 157715 + }, + { + "epoch": 17.565430448825037, + "grad_norm": 0.16004301607608795, + "learning_rate": 2.22333061105551e-06, + "loss": 0.0083, + "num_input_tokens_seen": 191914272, + "step": 157720 + }, + { + "epoch": 17.565987303708653, + "grad_norm": 0.0004928259877488017, + "learning_rate": 2.2223290378056343e-06, + "loss": 0.0062, + "num_input_tokens_seen": 191920064, + "step": 157725 + }, + { + "epoch": 17.566544158592272, + "grad_norm": 0.11529388278722763, + "learning_rate": 2.221327679709112e-06, + "loss": 0.0547, + "num_input_tokens_seen": 191926080, + "step": 157730 + }, + { + "epoch": 17.567101013475888, + "grad_norm": 0.006898882333189249, + "learning_rate": 2.2203265367753967e-06, + "loss": 0.0392, + "num_input_tokens_seen": 191932256, + "step": 157735 + }, + { + "epoch": 17.567657868359504, + "grad_norm": 0.1928345113992691, + "learning_rate": 2.2193256090139503e-06, + "loss": 0.0544, + "num_input_tokens_seen": 191938080, + "step": 157740 + }, + { + "epoch": 17.568214723243123, + "grad_norm": 1.8718206882476807, + "learning_rate": 2.2183248964342205e-06, + "loss": 0.0746, + "num_input_tokens_seen": 191944288, + "step": 157745 + }, + { + "epoch": 17.56877157812674, + "grad_norm": 0.3022546172142029, + "learning_rate": 2.2173243990456754e-06, + "loss": 0.0055, + "num_input_tokens_seen": 191950688, + "step": 157750 + }, + { + "epoch": 17.56932843301036, + "grad_norm": 0.045912984758615494, + "learning_rate": 2.216324116857743e-06, + "loss": 0.0141, + "num_input_tokens_seen": 191956864, + "step": 157755 + }, + { + "epoch": 17.569885287893975, + "grad_norm": 1.218627691268921, + "learning_rate": 2.215324049879888e-06, + "loss": 0.0941, + "num_input_tokens_seen": 191962624, + "step": 157760 + }, + { + "epoch": 17.57044214277759, + "grad_norm": 0.001573200454004109, + "learning_rate": 2.2143241981215505e-06, + "loss": 0.0583, + "num_input_tokens_seen": 191968448, + "step": 157765 + }, + { + "epoch": 17.57099899766121, + "grad_norm": 0.1322707086801529, + "learning_rate": 2.213324561592178e-06, + "loss": 0.0354, + "num_input_tokens_seen": 191974912, + "step": 157770 + }, + { + "epoch": 17.571555852544826, + "grad_norm": 0.020370280370116234, + "learning_rate": 2.2123251403012135e-06, + "loss": 0.0225, + "num_input_tokens_seen": 191981408, + "step": 157775 + }, + { + "epoch": 17.572112707428445, + "grad_norm": 2.017580032348633, + "learning_rate": 2.2113259342580932e-06, + "loss": 0.0602, + "num_input_tokens_seen": 191987328, + "step": 157780 + }, + { + "epoch": 17.57266956231206, + "grad_norm": 2.3524341583251953, + "learning_rate": 2.2103269434722546e-06, + "loss": 0.0553, + "num_input_tokens_seen": 191993376, + "step": 157785 + }, + { + "epoch": 17.573226417195677, + "grad_norm": 0.00010962197848130018, + "learning_rate": 2.20932816795314e-06, + "loss": 0.0104, + "num_input_tokens_seen": 191999616, + "step": 157790 + }, + { + "epoch": 17.573783272079297, + "grad_norm": 0.0003857511037494987, + "learning_rate": 2.208329607710183e-06, + "loss": 0.0911, + "num_input_tokens_seen": 192005728, + "step": 157795 + }, + { + "epoch": 17.574340126962912, + "grad_norm": 0.23817682266235352, + "learning_rate": 2.2073312627528096e-06, + "loss": 0.0153, + "num_input_tokens_seen": 192011936, + "step": 157800 + }, + { + "epoch": 17.574896981846532, + "grad_norm": 0.00012494596012402326, + "learning_rate": 2.206333133090452e-06, + "loss": 0.0175, + "num_input_tokens_seen": 192018048, + "step": 157805 + }, + { + "epoch": 17.575453836730148, + "grad_norm": 0.014453648589551449, + "learning_rate": 2.2053352187325433e-06, + "loss": 0.0036, + "num_input_tokens_seen": 192024192, + "step": 157810 + }, + { + "epoch": 17.576010691613767, + "grad_norm": 0.1876339465379715, + "learning_rate": 2.2043375196885015e-06, + "loss": 0.068, + "num_input_tokens_seen": 192030336, + "step": 157815 + }, + { + "epoch": 17.576567546497383, + "grad_norm": 0.8130571842193604, + "learning_rate": 2.203340035967766e-06, + "loss": 0.1115, + "num_input_tokens_seen": 192036480, + "step": 157820 + }, + { + "epoch": 17.577124401381, + "grad_norm": 1.5260090827941895, + "learning_rate": 2.2023427675797376e-06, + "loss": 0.065, + "num_input_tokens_seen": 192041952, + "step": 157825 + }, + { + "epoch": 17.57768125626462, + "grad_norm": 0.40558165311813354, + "learning_rate": 2.2013457145338506e-06, + "loss": 0.0175, + "num_input_tokens_seen": 192047776, + "step": 157830 + }, + { + "epoch": 17.578238111148234, + "grad_norm": 1.0702840089797974, + "learning_rate": 2.200348876839517e-06, + "loss": 0.0557, + "num_input_tokens_seen": 192053792, + "step": 157835 + }, + { + "epoch": 17.57879496603185, + "grad_norm": 0.02198222652077675, + "learning_rate": 2.1993522545061567e-06, + "loss": 0.0446, + "num_input_tokens_seen": 192059936, + "step": 157840 + }, + { + "epoch": 17.57935182091547, + "grad_norm": 3.2306692600250244, + "learning_rate": 2.198355847543185e-06, + "loss": 0.0937, + "num_input_tokens_seen": 192065824, + "step": 157845 + }, + { + "epoch": 17.579908675799086, + "grad_norm": 0.0323953703045845, + "learning_rate": 2.19735965596001e-06, + "loss": 0.019, + "num_input_tokens_seen": 192072032, + "step": 157850 + }, + { + "epoch": 17.580465530682705, + "grad_norm": 0.35622191429138184, + "learning_rate": 2.1963636797660366e-06, + "loss": 0.017, + "num_input_tokens_seen": 192078144, + "step": 157855 + }, + { + "epoch": 17.58102238556632, + "grad_norm": 0.974923849105835, + "learning_rate": 2.1953679189706837e-06, + "loss": 0.0548, + "num_input_tokens_seen": 192084384, + "step": 157860 + }, + { + "epoch": 17.58157924044994, + "grad_norm": 0.009159822948276997, + "learning_rate": 2.1943723735833505e-06, + "loss": 0.0116, + "num_input_tokens_seen": 192090464, + "step": 157865 + }, + { + "epoch": 17.582136095333556, + "grad_norm": 0.0008349024574272335, + "learning_rate": 2.1933770436134427e-06, + "loss": 0.0127, + "num_input_tokens_seen": 192096352, + "step": 157870 + }, + { + "epoch": 17.582692950217172, + "grad_norm": 0.8645100593566895, + "learning_rate": 2.1923819290703558e-06, + "loss": 0.0433, + "num_input_tokens_seen": 192102144, + "step": 157875 + }, + { + "epoch": 17.58324980510079, + "grad_norm": 0.11184148490428925, + "learning_rate": 2.1913870299634984e-06, + "loss": 0.0458, + "num_input_tokens_seen": 192108128, + "step": 157880 + }, + { + "epoch": 17.583806659984408, + "grad_norm": 0.5825128555297852, + "learning_rate": 2.190392346302264e-06, + "loss": 0.0387, + "num_input_tokens_seen": 192114304, + "step": 157885 + }, + { + "epoch": 17.584363514868027, + "grad_norm": 0.002530049066990614, + "learning_rate": 2.18939787809605e-06, + "loss": 0.0037, + "num_input_tokens_seen": 192120448, + "step": 157890 + }, + { + "epoch": 17.584920369751643, + "grad_norm": 0.6605820655822754, + "learning_rate": 2.188403625354249e-06, + "loss": 0.0274, + "num_input_tokens_seen": 192126816, + "step": 157895 + }, + { + "epoch": 17.58547722463526, + "grad_norm": 0.3215995728969574, + "learning_rate": 2.1874095880862505e-06, + "loss": 0.1212, + "num_input_tokens_seen": 192132928, + "step": 157900 + }, + { + "epoch": 17.58603407951888, + "grad_norm": 0.010066238231956959, + "learning_rate": 2.1864157663014444e-06, + "loss": 0.0003, + "num_input_tokens_seen": 192139072, + "step": 157905 + }, + { + "epoch": 17.586590934402494, + "grad_norm": 1.1352421045303345, + "learning_rate": 2.1854221600092206e-06, + "loss": 0.0061, + "num_input_tokens_seen": 192145088, + "step": 157910 + }, + { + "epoch": 17.587147789286114, + "grad_norm": 0.09415610879659653, + "learning_rate": 2.184428769218966e-06, + "loss": 0.0053, + "num_input_tokens_seen": 192151328, + "step": 157915 + }, + { + "epoch": 17.58770464416973, + "grad_norm": 0.9608491063117981, + "learning_rate": 2.1834355939400587e-06, + "loss": 0.0308, + "num_input_tokens_seen": 192157440, + "step": 157920 + }, + { + "epoch": 17.588261499053345, + "grad_norm": 0.004931173287332058, + "learning_rate": 2.1824426341818803e-06, + "loss": 0.0412, + "num_input_tokens_seen": 192163552, + "step": 157925 + }, + { + "epoch": 17.588818353936965, + "grad_norm": 1.934401273727417, + "learning_rate": 2.1814498899538154e-06, + "loss": 0.1463, + "num_input_tokens_seen": 192168992, + "step": 157930 + }, + { + "epoch": 17.58937520882058, + "grad_norm": 2.8759493827819824, + "learning_rate": 2.180457361265234e-06, + "loss": 0.165, + "num_input_tokens_seen": 192175264, + "step": 157935 + }, + { + "epoch": 17.5899320637042, + "grad_norm": 0.2349458634853363, + "learning_rate": 2.179465048125526e-06, + "loss": 0.0086, + "num_input_tokens_seen": 192181024, + "step": 157940 + }, + { + "epoch": 17.590488918587816, + "grad_norm": 0.018061937764286995, + "learning_rate": 2.1784729505440445e-06, + "loss": 0.0023, + "num_input_tokens_seen": 192187040, + "step": 157945 + }, + { + "epoch": 17.591045773471432, + "grad_norm": 9.001364378491417e-05, + "learning_rate": 2.177481068530174e-06, + "loss": 0.0044, + "num_input_tokens_seen": 192193344, + "step": 157950 + }, + { + "epoch": 17.59160262835505, + "grad_norm": 0.2908813953399658, + "learning_rate": 2.1764894020932737e-06, + "loss": 0.048, + "num_input_tokens_seen": 192199360, + "step": 157955 + }, + { + "epoch": 17.592159483238667, + "grad_norm": 0.4205166697502136, + "learning_rate": 2.175497951242725e-06, + "loss": 0.0091, + "num_input_tokens_seen": 192205408, + "step": 157960 + }, + { + "epoch": 17.592716338122287, + "grad_norm": 0.1021459698677063, + "learning_rate": 2.174506715987887e-06, + "loss": 0.0043, + "num_input_tokens_seen": 192211040, + "step": 157965 + }, + { + "epoch": 17.593273193005903, + "grad_norm": 1.2198760509490967, + "learning_rate": 2.1735156963381104e-06, + "loss": 0.0575, + "num_input_tokens_seen": 192216992, + "step": 157970 + }, + { + "epoch": 17.59383004788952, + "grad_norm": 0.21917979419231415, + "learning_rate": 2.172524892302774e-06, + "loss": 0.0017, + "num_input_tokens_seen": 192223616, + "step": 157975 + }, + { + "epoch": 17.594386902773138, + "grad_norm": 0.0018841307610273361, + "learning_rate": 2.1715343038912234e-06, + "loss": 0.001, + "num_input_tokens_seen": 192230080, + "step": 157980 + }, + { + "epoch": 17.594943757656754, + "grad_norm": 0.03151007741689682, + "learning_rate": 2.1705439311128257e-06, + "loss": 0.0687, + "num_input_tokens_seen": 192236640, + "step": 157985 + }, + { + "epoch": 17.595500612540373, + "grad_norm": 0.5680813193321228, + "learning_rate": 2.169553773976929e-06, + "loss": 0.02, + "num_input_tokens_seen": 192242752, + "step": 157990 + }, + { + "epoch": 17.59605746742399, + "grad_norm": 0.0005814558244310319, + "learning_rate": 2.1685638324928927e-06, + "loss": 0.0164, + "num_input_tokens_seen": 192248768, + "step": 157995 + }, + { + "epoch": 17.596614322307605, + "grad_norm": 0.15284240245819092, + "learning_rate": 2.167574106670056e-06, + "loss": 0.2463, + "num_input_tokens_seen": 192254528, + "step": 158000 + }, + { + "epoch": 17.597171177191225, + "grad_norm": 0.06773746013641357, + "learning_rate": 2.1665845965177787e-06, + "loss": 0.0131, + "num_input_tokens_seen": 192260544, + "step": 158005 + }, + { + "epoch": 17.59772803207484, + "grad_norm": 0.00014949911565054208, + "learning_rate": 2.1655953020454033e-06, + "loss": 0.0013, + "num_input_tokens_seen": 192266720, + "step": 158010 + }, + { + "epoch": 17.59828488695846, + "grad_norm": 0.8322614431381226, + "learning_rate": 2.1646062232622776e-06, + "loss": 0.0315, + "num_input_tokens_seen": 192272256, + "step": 158015 + }, + { + "epoch": 17.598841741842076, + "grad_norm": 0.2459842413663864, + "learning_rate": 2.163617360177736e-06, + "loss": 0.0223, + "num_input_tokens_seen": 192278304, + "step": 158020 + }, + { + "epoch": 17.599398596725692, + "grad_norm": 0.07682785391807556, + "learning_rate": 2.162628712801129e-06, + "loss": 0.0832, + "num_input_tokens_seen": 192283936, + "step": 158025 + }, + { + "epoch": 17.59995545160931, + "grad_norm": 0.023503761738538742, + "learning_rate": 2.1616402811417858e-06, + "loss": 0.0322, + "num_input_tokens_seen": 192289824, + "step": 158030 + }, + { + "epoch": 17.600512306492927, + "grad_norm": 0.00017812804435379803, + "learning_rate": 2.1606520652090566e-06, + "loss": 0.0443, + "num_input_tokens_seen": 192295488, + "step": 158035 + }, + { + "epoch": 17.601069161376547, + "grad_norm": 0.026955783367156982, + "learning_rate": 2.1596640650122598e-06, + "loss": 0.096, + "num_input_tokens_seen": 192301504, + "step": 158040 + }, + { + "epoch": 17.601626016260163, + "grad_norm": 0.32172200083732605, + "learning_rate": 2.1586762805607397e-06, + "loss": 0.0074, + "num_input_tokens_seen": 192307648, + "step": 158045 + }, + { + "epoch": 17.60218287114378, + "grad_norm": 0.007130565587431192, + "learning_rate": 2.1576887118638143e-06, + "loss": 0.0106, + "num_input_tokens_seen": 192313888, + "step": 158050 + }, + { + "epoch": 17.602739726027398, + "grad_norm": 0.0001542506943223998, + "learning_rate": 2.156701358930829e-06, + "loss": 0.0271, + "num_input_tokens_seen": 192320256, + "step": 158055 + }, + { + "epoch": 17.603296580911014, + "grad_norm": 1.120367169380188, + "learning_rate": 2.155714221771099e-06, + "loss": 0.0168, + "num_input_tokens_seen": 192326688, + "step": 158060 + }, + { + "epoch": 17.603853435794633, + "grad_norm": 0.5285337567329407, + "learning_rate": 2.1547273003939523e-06, + "loss": 0.0107, + "num_input_tokens_seen": 192333088, + "step": 158065 + }, + { + "epoch": 17.60441029067825, + "grad_norm": 1.0886859893798828, + "learning_rate": 2.1537405948087038e-06, + "loss": 0.0645, + "num_input_tokens_seen": 192339232, + "step": 158070 + }, + { + "epoch": 17.604967145561865, + "grad_norm": 0.010079511441290379, + "learning_rate": 2.152754105024682e-06, + "loss": 0.0036, + "num_input_tokens_seen": 192345280, + "step": 158075 + }, + { + "epoch": 17.605524000445484, + "grad_norm": 0.008813794702291489, + "learning_rate": 2.1517678310512046e-06, + "loss": 0.0653, + "num_input_tokens_seen": 192351168, + "step": 158080 + }, + { + "epoch": 17.6060808553291, + "grad_norm": 0.3899611234664917, + "learning_rate": 2.1507817728975866e-06, + "loss": 0.0125, + "num_input_tokens_seen": 192357280, + "step": 158085 + }, + { + "epoch": 17.60663771021272, + "grad_norm": 0.09099826216697693, + "learning_rate": 2.1497959305731343e-06, + "loss": 0.0991, + "num_input_tokens_seen": 192363328, + "step": 158090 + }, + { + "epoch": 17.607194565096336, + "grad_norm": 2.35463809967041, + "learning_rate": 2.148810304087173e-06, + "loss": 0.0551, + "num_input_tokens_seen": 192369120, + "step": 158095 + }, + { + "epoch": 17.60775141997995, + "grad_norm": 0.001213885028846562, + "learning_rate": 2.147824893449002e-06, + "loss": 0.0036, + "num_input_tokens_seen": 192375104, + "step": 158100 + }, + { + "epoch": 17.60830827486357, + "grad_norm": 0.8922243714332581, + "learning_rate": 2.1468396986679377e-06, + "loss": 0.0239, + "num_input_tokens_seen": 192381088, + "step": 158105 + }, + { + "epoch": 17.608865129747187, + "grad_norm": 0.6132481098175049, + "learning_rate": 2.145854719753282e-06, + "loss": 0.0957, + "num_input_tokens_seen": 192387168, + "step": 158110 + }, + { + "epoch": 17.609421984630806, + "grad_norm": 3.5829570293426514, + "learning_rate": 2.1448699567143404e-06, + "loss": 0.0375, + "num_input_tokens_seen": 192393216, + "step": 158115 + }, + { + "epoch": 17.609978839514422, + "grad_norm": 1.0126516819000244, + "learning_rate": 2.143885409560406e-06, + "loss": 0.0986, + "num_input_tokens_seen": 192399360, + "step": 158120 + }, + { + "epoch": 17.61053569439804, + "grad_norm": 0.08355959504842758, + "learning_rate": 2.1429010783007934e-06, + "loss": 0.0026, + "num_input_tokens_seen": 192405600, + "step": 158125 + }, + { + "epoch": 17.611092549281658, + "grad_norm": 1.7579407691955566, + "learning_rate": 2.1419169629447925e-06, + "loss": 0.0914, + "num_input_tokens_seen": 192411776, + "step": 158130 + }, + { + "epoch": 17.611649404165274, + "grad_norm": 0.363419771194458, + "learning_rate": 2.1409330635016988e-06, + "loss": 0.1208, + "num_input_tokens_seen": 192417440, + "step": 158135 + }, + { + "epoch": 17.612206259048893, + "grad_norm": 0.005791991949081421, + "learning_rate": 2.1399493799808016e-06, + "loss": 0.0485, + "num_input_tokens_seen": 192423488, + "step": 158140 + }, + { + "epoch": 17.61276311393251, + "grad_norm": 1.7920783758163452, + "learning_rate": 2.1389659123914023e-06, + "loss": 0.0907, + "num_input_tokens_seen": 192429728, + "step": 158145 + }, + { + "epoch": 17.613319968816125, + "grad_norm": 1.8306127786636353, + "learning_rate": 2.137982660742782e-06, + "loss": 0.2006, + "num_input_tokens_seen": 192435840, + "step": 158150 + }, + { + "epoch": 17.613876823699744, + "grad_norm": 0.3748243749141693, + "learning_rate": 2.136999625044239e-06, + "loss": 0.0128, + "num_input_tokens_seen": 192441984, + "step": 158155 + }, + { + "epoch": 17.61443367858336, + "grad_norm": 0.02682310715317726, + "learning_rate": 2.1360168053050465e-06, + "loss": 0.062, + "num_input_tokens_seen": 192448544, + "step": 158160 + }, + { + "epoch": 17.61499053346698, + "grad_norm": 0.18181781470775604, + "learning_rate": 2.1350342015344945e-06, + "loss": 0.0652, + "num_input_tokens_seen": 192454016, + "step": 158165 + }, + { + "epoch": 17.615547388350596, + "grad_norm": 0.13571198284626007, + "learning_rate": 2.1340518137418607e-06, + "loss": 0.1741, + "num_input_tokens_seen": 192460256, + "step": 158170 + }, + { + "epoch": 17.61610424323421, + "grad_norm": 1.1140183210372925, + "learning_rate": 2.1330696419364333e-06, + "loss": 0.1838, + "num_input_tokens_seen": 192466720, + "step": 158175 + }, + { + "epoch": 17.61666109811783, + "grad_norm": 0.048965588212013245, + "learning_rate": 2.1320876861274817e-06, + "loss": 0.0232, + "num_input_tokens_seen": 192472832, + "step": 158180 + }, + { + "epoch": 17.617217953001447, + "grad_norm": 0.0018941133748739958, + "learning_rate": 2.1311059463242857e-06, + "loss": 0.0803, + "num_input_tokens_seen": 192479008, + "step": 158185 + }, + { + "epoch": 17.617774807885066, + "grad_norm": 1.2855312824249268, + "learning_rate": 2.1301244225361117e-06, + "loss": 0.0263, + "num_input_tokens_seen": 192485248, + "step": 158190 + }, + { + "epoch": 17.618331662768682, + "grad_norm": 0.05213621258735657, + "learning_rate": 2.129143114772239e-06, + "loss": 0.0361, + "num_input_tokens_seen": 192491552, + "step": 158195 + }, + { + "epoch": 17.6188885176523, + "grad_norm": 0.0008963937289081514, + "learning_rate": 2.1281620230419326e-06, + "loss": 0.0009, + "num_input_tokens_seen": 192497600, + "step": 158200 + }, + { + "epoch": 17.619445372535917, + "grad_norm": 0.09216273576021194, + "learning_rate": 2.1271811473544596e-06, + "loss": 0.0089, + "num_input_tokens_seen": 192503456, + "step": 158205 + }, + { + "epoch": 17.620002227419533, + "grad_norm": 9.097393922274932e-05, + "learning_rate": 2.126200487719085e-06, + "loss": 0.0045, + "num_input_tokens_seen": 192509856, + "step": 158210 + }, + { + "epoch": 17.620559082303153, + "grad_norm": 0.12793469429016113, + "learning_rate": 2.1252200441450737e-06, + "loss": 0.0025, + "num_input_tokens_seen": 192515520, + "step": 158215 + }, + { + "epoch": 17.62111593718677, + "grad_norm": 0.00010343507892685011, + "learning_rate": 2.124239816641685e-06, + "loss": 0.0893, + "num_input_tokens_seen": 192521920, + "step": 158220 + }, + { + "epoch": 17.621672792070388, + "grad_norm": 0.11459193378686905, + "learning_rate": 2.1232598052181862e-06, + "loss": 0.0732, + "num_input_tokens_seen": 192528096, + "step": 158225 + }, + { + "epoch": 17.622229646954004, + "grad_norm": 0.32708215713500977, + "learning_rate": 2.1222800098838177e-06, + "loss": 0.0089, + "num_input_tokens_seen": 192534368, + "step": 158230 + }, + { + "epoch": 17.62278650183762, + "grad_norm": 0.00028384520555846393, + "learning_rate": 2.121300430647849e-06, + "loss": 0.0068, + "num_input_tokens_seen": 192540736, + "step": 158235 + }, + { + "epoch": 17.62334335672124, + "grad_norm": 0.004223797935992479, + "learning_rate": 2.1203210675195236e-06, + "loss": 0.0015, + "num_input_tokens_seen": 192546912, + "step": 158240 + }, + { + "epoch": 17.623900211604855, + "grad_norm": 0.005555948242545128, + "learning_rate": 2.1193419205081e-06, + "loss": 0.0007, + "num_input_tokens_seen": 192553184, + "step": 158245 + }, + { + "epoch": 17.624457066488475, + "grad_norm": 0.0019447183003649116, + "learning_rate": 2.118362989622827e-06, + "loss": 0.0142, + "num_input_tokens_seen": 192559392, + "step": 158250 + }, + { + "epoch": 17.62501392137209, + "grad_norm": 2.206998109817505, + "learning_rate": 2.117384274872944e-06, + "loss": 0.0251, + "num_input_tokens_seen": 192566080, + "step": 158255 + }, + { + "epoch": 17.625570776255707, + "grad_norm": 1.2233693599700928, + "learning_rate": 2.1164057762676963e-06, + "loss": 0.0714, + "num_input_tokens_seen": 192572000, + "step": 158260 + }, + { + "epoch": 17.626127631139326, + "grad_norm": 0.6599626541137695, + "learning_rate": 2.1154274938163354e-06, + "loss": 0.0558, + "num_input_tokens_seen": 192578368, + "step": 158265 + }, + { + "epoch": 17.626684486022942, + "grad_norm": 0.00012347748270258307, + "learning_rate": 2.114449427528098e-06, + "loss": 0.0346, + "num_input_tokens_seen": 192584544, + "step": 158270 + }, + { + "epoch": 17.62724134090656, + "grad_norm": 2.219146490097046, + "learning_rate": 2.113471577412218e-06, + "loss": 0.0542, + "num_input_tokens_seen": 192590848, + "step": 158275 + }, + { + "epoch": 17.627798195790177, + "grad_norm": 0.009808862581849098, + "learning_rate": 2.1124939434779335e-06, + "loss": 0.0783, + "num_input_tokens_seen": 192597184, + "step": 158280 + }, + { + "epoch": 17.628355050673793, + "grad_norm": 0.30278804898262024, + "learning_rate": 2.111516525734483e-06, + "loss": 0.0516, + "num_input_tokens_seen": 192602912, + "step": 158285 + }, + { + "epoch": 17.628911905557413, + "grad_norm": 0.057116467505693436, + "learning_rate": 2.1105393241910935e-06, + "loss": 0.009, + "num_input_tokens_seen": 192609056, + "step": 158290 + }, + { + "epoch": 17.62946876044103, + "grad_norm": 0.014303554780781269, + "learning_rate": 2.1095623388570047e-06, + "loss": 0.03, + "num_input_tokens_seen": 192615296, + "step": 158295 + }, + { + "epoch": 17.630025615324648, + "grad_norm": 1.0628200769424438, + "learning_rate": 2.1085855697414364e-06, + "loss": 0.0207, + "num_input_tokens_seen": 192621568, + "step": 158300 + }, + { + "epoch": 17.630582470208264, + "grad_norm": 0.021024592220783234, + "learning_rate": 2.1076090168536204e-06, + "loss": 0.0636, + "num_input_tokens_seen": 192627776, + "step": 158305 + }, + { + "epoch": 17.63113932509188, + "grad_norm": 0.2644895613193512, + "learning_rate": 2.106632680202772e-06, + "loss": 0.0055, + "num_input_tokens_seen": 192634080, + "step": 158310 + }, + { + "epoch": 17.6316961799755, + "grad_norm": 0.0016887682722881436, + "learning_rate": 2.105656559798125e-06, + "loss": 0.0329, + "num_input_tokens_seen": 192640128, + "step": 158315 + }, + { + "epoch": 17.632253034859115, + "grad_norm": 0.0026228465139865875, + "learning_rate": 2.104680655648894e-06, + "loss": 0.0053, + "num_input_tokens_seen": 192646400, + "step": 158320 + }, + { + "epoch": 17.632809889742735, + "grad_norm": 0.04625188559293747, + "learning_rate": 2.1037049677643e-06, + "loss": 0.0168, + "num_input_tokens_seen": 192652544, + "step": 158325 + }, + { + "epoch": 17.63336674462635, + "grad_norm": 1.535530686378479, + "learning_rate": 2.1027294961535493e-06, + "loss": 0.0915, + "num_input_tokens_seen": 192659008, + "step": 158330 + }, + { + "epoch": 17.633923599509966, + "grad_norm": 0.0007015378214418888, + "learning_rate": 2.1017542408258677e-06, + "loss": 0.0862, + "num_input_tokens_seen": 192664960, + "step": 158335 + }, + { + "epoch": 17.634480454393586, + "grad_norm": 0.43926262855529785, + "learning_rate": 2.100779201790462e-06, + "loss": 0.0121, + "num_input_tokens_seen": 192671136, + "step": 158340 + }, + { + "epoch": 17.6350373092772, + "grad_norm": 1.5105087757110596, + "learning_rate": 2.0998043790565497e-06, + "loss": 0.1215, + "num_input_tokens_seen": 192677056, + "step": 158345 + }, + { + "epoch": 17.63559416416082, + "grad_norm": 2.1540207862854004, + "learning_rate": 2.0988297726333233e-06, + "loss": 0.0447, + "num_input_tokens_seen": 192683104, + "step": 158350 + }, + { + "epoch": 17.636151019044437, + "grad_norm": 0.4330384135246277, + "learning_rate": 2.0978553825300033e-06, + "loss": 0.0145, + "num_input_tokens_seen": 192688864, + "step": 158355 + }, + { + "epoch": 17.636707873928053, + "grad_norm": 0.8570874333381653, + "learning_rate": 2.0968812087557827e-06, + "loss": 0.0719, + "num_input_tokens_seen": 192695360, + "step": 158360 + }, + { + "epoch": 17.637264728811672, + "grad_norm": 0.0005666227079927921, + "learning_rate": 2.095907251319873e-06, + "loss": 0.0259, + "num_input_tokens_seen": 192701216, + "step": 158365 + }, + { + "epoch": 17.63782158369529, + "grad_norm": 1.0424139499664307, + "learning_rate": 2.09493351023147e-06, + "loss": 0.0735, + "num_input_tokens_seen": 192707392, + "step": 158370 + }, + { + "epoch": 17.638378438578908, + "grad_norm": 0.6371746063232422, + "learning_rate": 2.0939599854997717e-06, + "loss": 0.0783, + "num_input_tokens_seen": 192713888, + "step": 158375 + }, + { + "epoch": 17.638935293462524, + "grad_norm": 1.1065573692321777, + "learning_rate": 2.0929866771339735e-06, + "loss": 0.0296, + "num_input_tokens_seen": 192720096, + "step": 158380 + }, + { + "epoch": 17.63949214834614, + "grad_norm": 0.004477073438465595, + "learning_rate": 2.0920135851432655e-06, + "loss": 0.0183, + "num_input_tokens_seen": 192726432, + "step": 158385 + }, + { + "epoch": 17.64004900322976, + "grad_norm": 0.0018324061529710889, + "learning_rate": 2.0910407095368454e-06, + "loss": 0.0023, + "num_input_tokens_seen": 192732608, + "step": 158390 + }, + { + "epoch": 17.640605858113375, + "grad_norm": 0.1697455197572708, + "learning_rate": 2.090068050323901e-06, + "loss": 0.0497, + "num_input_tokens_seen": 192739168, + "step": 158395 + }, + { + "epoch": 17.641162712996994, + "grad_norm": 2.37739634513855, + "learning_rate": 2.089095607513619e-06, + "loss": 0.0714, + "num_input_tokens_seen": 192745120, + "step": 158400 + }, + { + "epoch": 17.64171956788061, + "grad_norm": 0.27477508783340454, + "learning_rate": 2.0881233811151787e-06, + "loss": 0.108, + "num_input_tokens_seen": 192751040, + "step": 158405 + }, + { + "epoch": 17.642276422764226, + "grad_norm": 1.0125969648361206, + "learning_rate": 2.087151371137777e-06, + "loss": 0.0446, + "num_input_tokens_seen": 192756832, + "step": 158410 + }, + { + "epoch": 17.642833277647846, + "grad_norm": 0.0013184115523472428, + "learning_rate": 2.0861795775905856e-06, + "loss": 0.0017, + "num_input_tokens_seen": 192763008, + "step": 158415 + }, + { + "epoch": 17.64339013253146, + "grad_norm": 0.013886960223317146, + "learning_rate": 2.085208000482788e-06, + "loss": 0.0305, + "num_input_tokens_seen": 192768736, + "step": 158420 + }, + { + "epoch": 17.64394698741508, + "grad_norm": 0.000608178204856813, + "learning_rate": 2.0842366398235556e-06, + "loss": 0.0036, + "num_input_tokens_seen": 192774400, + "step": 158425 + }, + { + "epoch": 17.644503842298697, + "grad_norm": 0.004155048634856939, + "learning_rate": 2.083265495622072e-06, + "loss": 0.0001, + "num_input_tokens_seen": 192780544, + "step": 158430 + }, + { + "epoch": 17.645060697182313, + "grad_norm": 0.5241397023200989, + "learning_rate": 2.0822945678874994e-06, + "loss": 0.0256, + "num_input_tokens_seen": 192786848, + "step": 158435 + }, + { + "epoch": 17.645617552065932, + "grad_norm": 2.8696141242980957, + "learning_rate": 2.081323856629025e-06, + "loss": 0.0513, + "num_input_tokens_seen": 192792384, + "step": 158440 + }, + { + "epoch": 17.646174406949548, + "grad_norm": 0.027598248794674873, + "learning_rate": 2.0803533618558054e-06, + "loss": 0.0032, + "num_input_tokens_seen": 192798176, + "step": 158445 + }, + { + "epoch": 17.646731261833168, + "grad_norm": 0.017745964229106903, + "learning_rate": 2.0793830835770133e-06, + "loss": 0.0053, + "num_input_tokens_seen": 192804256, + "step": 158450 + }, + { + "epoch": 17.647288116716783, + "grad_norm": 1.7987868785858154, + "learning_rate": 2.078413021801806e-06, + "loss": 0.0966, + "num_input_tokens_seen": 192810336, + "step": 158455 + }, + { + "epoch": 17.6478449716004, + "grad_norm": 0.1678764820098877, + "learning_rate": 2.0774431765393564e-06, + "loss": 0.0577, + "num_input_tokens_seen": 192816416, + "step": 158460 + }, + { + "epoch": 17.64840182648402, + "grad_norm": 1.241129994392395, + "learning_rate": 2.0764735477988213e-06, + "loss": 0.1056, + "num_input_tokens_seen": 192822592, + "step": 158465 + }, + { + "epoch": 17.648958681367635, + "grad_norm": 0.0008248382364399731, + "learning_rate": 2.0755041355893593e-06, + "loss": 0.0359, + "num_input_tokens_seen": 192828992, + "step": 158470 + }, + { + "epoch": 17.649515536251254, + "grad_norm": 0.9429009556770325, + "learning_rate": 2.074534939920125e-06, + "loss": 0.0436, + "num_input_tokens_seen": 192835008, + "step": 158475 + }, + { + "epoch": 17.65007239113487, + "grad_norm": 0.06604959815740585, + "learning_rate": 2.0735659608002777e-06, + "loss": 0.0038, + "num_input_tokens_seen": 192840640, + "step": 158480 + }, + { + "epoch": 17.650629246018486, + "grad_norm": 0.03022724948823452, + "learning_rate": 2.072597198238971e-06, + "loss": 0.0766, + "num_input_tokens_seen": 192846496, + "step": 158485 + }, + { + "epoch": 17.651186100902105, + "grad_norm": 0.17349766194820404, + "learning_rate": 2.07162865224535e-06, + "loss": 0.0594, + "num_input_tokens_seen": 192852512, + "step": 158490 + }, + { + "epoch": 17.65174295578572, + "grad_norm": 0.3811856508255005, + "learning_rate": 2.070660322828563e-06, + "loss": 0.0049, + "num_input_tokens_seen": 192858976, + "step": 158495 + }, + { + "epoch": 17.65229981066934, + "grad_norm": 0.14181464910507202, + "learning_rate": 2.0696922099977674e-06, + "loss": 0.0205, + "num_input_tokens_seen": 192865024, + "step": 158500 + }, + { + "epoch": 17.652856665552957, + "grad_norm": 0.021036190912127495, + "learning_rate": 2.068724313762091e-06, + "loss": 0.0552, + "num_input_tokens_seen": 192870720, + "step": 158505 + }, + { + "epoch": 17.653413520436573, + "grad_norm": 0.9230636954307556, + "learning_rate": 2.0677566341306937e-06, + "loss": 0.2792, + "num_input_tokens_seen": 192877088, + "step": 158510 + }, + { + "epoch": 17.653970375320192, + "grad_norm": 0.23392480611801147, + "learning_rate": 2.0667891711127067e-06, + "loss": 0.0058, + "num_input_tokens_seen": 192883328, + "step": 158515 + }, + { + "epoch": 17.654527230203808, + "grad_norm": 0.00024631695123389363, + "learning_rate": 2.0658219247172676e-06, + "loss": 0.0179, + "num_input_tokens_seen": 192889664, + "step": 158520 + }, + { + "epoch": 17.655084085087427, + "grad_norm": 0.02589126117527485, + "learning_rate": 2.0648548949535134e-06, + "loss": 0.0374, + "num_input_tokens_seen": 192894336, + "step": 158525 + }, + { + "epoch": 17.655640939971043, + "grad_norm": 0.00024146352370735258, + "learning_rate": 2.063888081830584e-06, + "loss": 0.0027, + "num_input_tokens_seen": 192900352, + "step": 158530 + }, + { + "epoch": 17.656197794854663, + "grad_norm": 0.00041307281935587525, + "learning_rate": 2.062921485357608e-06, + "loss": 0.0089, + "num_input_tokens_seen": 192905952, + "step": 158535 + }, + { + "epoch": 17.65675464973828, + "grad_norm": 0.9060661792755127, + "learning_rate": 2.0619551055437143e-06, + "loss": 0.0827, + "num_input_tokens_seen": 192911808, + "step": 158540 + }, + { + "epoch": 17.657311504621894, + "grad_norm": 0.11118757724761963, + "learning_rate": 2.060988942398029e-06, + "loss": 0.022, + "num_input_tokens_seen": 192918048, + "step": 158545 + }, + { + "epoch": 17.657868359505514, + "grad_norm": 0.030734345316886902, + "learning_rate": 2.0600229959296863e-06, + "loss": 0.1137, + "num_input_tokens_seen": 192923456, + "step": 158550 + }, + { + "epoch": 17.65842521438913, + "grad_norm": 0.11472073197364807, + "learning_rate": 2.0590572661477985e-06, + "loss": 0.0015, + "num_input_tokens_seen": 192929376, + "step": 158555 + }, + { + "epoch": 17.658982069272746, + "grad_norm": 1.4367495775222778, + "learning_rate": 2.058091753061506e-06, + "loss": 0.1825, + "num_input_tokens_seen": 192935424, + "step": 158560 + }, + { + "epoch": 17.659538924156365, + "grad_norm": 0.5971889495849609, + "learning_rate": 2.0571264566799087e-06, + "loss": 0.0209, + "num_input_tokens_seen": 192940768, + "step": 158565 + }, + { + "epoch": 17.66009577903998, + "grad_norm": 0.14881910383701324, + "learning_rate": 2.056161377012136e-06, + "loss": 0.012, + "num_input_tokens_seen": 192946656, + "step": 158570 + }, + { + "epoch": 17.6606526339236, + "grad_norm": 1.012454867362976, + "learning_rate": 2.0551965140673007e-06, + "loss": 0.0855, + "num_input_tokens_seen": 192952128, + "step": 158575 + }, + { + "epoch": 17.661209488807216, + "grad_norm": 0.0025682852137833834, + "learning_rate": 2.0542318678545197e-06, + "loss": 0.0056, + "num_input_tokens_seen": 192958752, + "step": 158580 + }, + { + "epoch": 17.661766343690836, + "grad_norm": 0.002036584308370948, + "learning_rate": 2.0532674383829027e-06, + "loss": 0.0523, + "num_input_tokens_seen": 192965152, + "step": 158585 + }, + { + "epoch": 17.66232319857445, + "grad_norm": 0.07273956388235092, + "learning_rate": 2.0523032256615585e-06, + "loss": 0.0114, + "num_input_tokens_seen": 192971296, + "step": 158590 + }, + { + "epoch": 17.662880053458068, + "grad_norm": 0.016715170815587044, + "learning_rate": 2.051339229699592e-06, + "loss": 0.0094, + "num_input_tokens_seen": 192977536, + "step": 158595 + }, + { + "epoch": 17.663436908341687, + "grad_norm": 1.3570823669433594, + "learning_rate": 2.0503754505061174e-06, + "loss": 0.0862, + "num_input_tokens_seen": 192983520, + "step": 158600 + }, + { + "epoch": 17.663993763225303, + "grad_norm": 0.04818534478545189, + "learning_rate": 2.049411888090233e-06, + "loss": 0.0222, + "num_input_tokens_seen": 192989088, + "step": 158605 + }, + { + "epoch": 17.664550618108922, + "grad_norm": 0.010405478999018669, + "learning_rate": 2.04844854246104e-06, + "loss": 0.0538, + "num_input_tokens_seen": 192995424, + "step": 158610 + }, + { + "epoch": 17.66510747299254, + "grad_norm": 0.08687316626310349, + "learning_rate": 2.047485413627637e-06, + "loss": 0.0078, + "num_input_tokens_seen": 193001376, + "step": 158615 + }, + { + "epoch": 17.665664327876154, + "grad_norm": 0.24023063480854034, + "learning_rate": 2.0465225015991267e-06, + "loss": 0.0023, + "num_input_tokens_seen": 193007712, + "step": 158620 + }, + { + "epoch": 17.666221182759774, + "grad_norm": 1.0140427350997925, + "learning_rate": 2.045559806384595e-06, + "loss": 0.0126, + "num_input_tokens_seen": 193013920, + "step": 158625 + }, + { + "epoch": 17.66677803764339, + "grad_norm": 0.053059156984090805, + "learning_rate": 2.044597327993153e-06, + "loss": 0.0431, + "num_input_tokens_seen": 193019744, + "step": 158630 + }, + { + "epoch": 17.66733489252701, + "grad_norm": 5.005162239074707, + "learning_rate": 2.0436350664338716e-06, + "loss": 0.0927, + "num_input_tokens_seen": 193025824, + "step": 158635 + }, + { + "epoch": 17.667891747410625, + "grad_norm": 0.7261634469032288, + "learning_rate": 2.0426730217158545e-06, + "loss": 0.052, + "num_input_tokens_seen": 193032032, + "step": 158640 + }, + { + "epoch": 17.66844860229424, + "grad_norm": 0.10547536611557007, + "learning_rate": 2.0417111938481783e-06, + "loss": 0.0123, + "num_input_tokens_seen": 193038176, + "step": 158645 + }, + { + "epoch": 17.66900545717786, + "grad_norm": 0.16188324987888336, + "learning_rate": 2.0407495828399376e-06, + "loss": 0.0033, + "num_input_tokens_seen": 193043808, + "step": 158650 + }, + { + "epoch": 17.669562312061476, + "grad_norm": 0.009534289129078388, + "learning_rate": 2.0397881887002145e-06, + "loss": 0.0038, + "num_input_tokens_seen": 193050176, + "step": 158655 + }, + { + "epoch": 17.670119166945096, + "grad_norm": 0.0035584077704697847, + "learning_rate": 2.038827011438085e-06, + "loss": 0.0692, + "num_input_tokens_seen": 193056320, + "step": 158660 + }, + { + "epoch": 17.67067602182871, + "grad_norm": 0.28093650937080383, + "learning_rate": 2.0378660510626256e-06, + "loss": 0.0108, + "num_input_tokens_seen": 193062656, + "step": 158665 + }, + { + "epoch": 17.671232876712327, + "grad_norm": 1.2616639137268066, + "learning_rate": 2.0369053075829232e-06, + "loss": 0.0495, + "num_input_tokens_seen": 193068736, + "step": 158670 + }, + { + "epoch": 17.671789731595947, + "grad_norm": 0.00017547080642543733, + "learning_rate": 2.035944781008048e-06, + "loss": 0.0109, + "num_input_tokens_seen": 193074752, + "step": 158675 + }, + { + "epoch": 17.672346586479563, + "grad_norm": 0.03468695282936096, + "learning_rate": 2.0349844713470735e-06, + "loss": 0.023, + "num_input_tokens_seen": 193081024, + "step": 158680 + }, + { + "epoch": 17.672903441363182, + "grad_norm": 0.0003752065822482109, + "learning_rate": 2.0340243786090676e-06, + "loss": 0.0166, + "num_input_tokens_seen": 193087168, + "step": 158685 + }, + { + "epoch": 17.673460296246798, + "grad_norm": 0.845180332660675, + "learning_rate": 2.033064502803103e-06, + "loss": 0.042, + "num_input_tokens_seen": 193093408, + "step": 158690 + }, + { + "epoch": 17.674017151130414, + "grad_norm": 0.01912664994597435, + "learning_rate": 2.03210484393824e-06, + "loss": 0.0165, + "num_input_tokens_seen": 193099744, + "step": 158695 + }, + { + "epoch": 17.674574006014034, + "grad_norm": 1.0867358446121216, + "learning_rate": 2.0311454020235544e-06, + "loss": 0.0566, + "num_input_tokens_seen": 193105920, + "step": 158700 + }, + { + "epoch": 17.67513086089765, + "grad_norm": 0.07199420779943466, + "learning_rate": 2.0301861770681025e-06, + "loss": 0.0482, + "num_input_tokens_seen": 193111808, + "step": 158705 + }, + { + "epoch": 17.67568771578127, + "grad_norm": 0.2465958446264267, + "learning_rate": 2.029227169080944e-06, + "loss": 0.0365, + "num_input_tokens_seen": 193118176, + "step": 158710 + }, + { + "epoch": 17.676244570664885, + "grad_norm": 0.01338814478367567, + "learning_rate": 2.0282683780711355e-06, + "loss": 0.0088, + "num_input_tokens_seen": 193124224, + "step": 158715 + }, + { + "epoch": 17.6768014255485, + "grad_norm": 0.039627231657505035, + "learning_rate": 2.0273098040477418e-06, + "loss": 0.0364, + "num_input_tokens_seen": 193129824, + "step": 158720 + }, + { + "epoch": 17.67735828043212, + "grad_norm": 0.06748378276824951, + "learning_rate": 2.026351447019811e-06, + "loss": 0.0394, + "num_input_tokens_seen": 193135168, + "step": 158725 + }, + { + "epoch": 17.677915135315736, + "grad_norm": 0.00014105821901466697, + "learning_rate": 2.025393306996398e-06, + "loss": 0.059, + "num_input_tokens_seen": 193141472, + "step": 158730 + }, + { + "epoch": 17.678471990199355, + "grad_norm": 0.01062546856701374, + "learning_rate": 2.0244353839865475e-06, + "loss": 0.11, + "num_input_tokens_seen": 193147520, + "step": 158735 + }, + { + "epoch": 17.67902884508297, + "grad_norm": 0.26327475905418396, + "learning_rate": 2.0234776779993163e-06, + "loss": 0.1381, + "num_input_tokens_seen": 193153312, + "step": 158740 + }, + { + "epoch": 17.679585699966587, + "grad_norm": 0.024350622668862343, + "learning_rate": 2.0225201890437446e-06, + "loss": 0.0068, + "num_input_tokens_seen": 193159680, + "step": 158745 + }, + { + "epoch": 17.680142554850207, + "grad_norm": 0.0016995379701256752, + "learning_rate": 2.0215629171288857e-06, + "loss": 0.0176, + "num_input_tokens_seen": 193165888, + "step": 158750 + }, + { + "epoch": 17.680699409733823, + "grad_norm": 2.5718438625335693, + "learning_rate": 2.0206058622637665e-06, + "loss": 0.166, + "num_input_tokens_seen": 193172224, + "step": 158755 + }, + { + "epoch": 17.681256264617442, + "grad_norm": 0.12329623848199844, + "learning_rate": 2.0196490244574402e-06, + "loss": 0.0995, + "num_input_tokens_seen": 193178208, + "step": 158760 + }, + { + "epoch": 17.681813119501058, + "grad_norm": 2.5227484703063965, + "learning_rate": 2.018692403718936e-06, + "loss": 0.133, + "num_input_tokens_seen": 193184384, + "step": 158765 + }, + { + "epoch": 17.682369974384674, + "grad_norm": 0.01737060211598873, + "learning_rate": 2.017736000057299e-06, + "loss": 0.01, + "num_input_tokens_seen": 193189792, + "step": 158770 + }, + { + "epoch": 17.682926829268293, + "grad_norm": 0.0011356427567079663, + "learning_rate": 2.0167798134815592e-06, + "loss": 0.0404, + "num_input_tokens_seen": 193195936, + "step": 158775 + }, + { + "epoch": 17.68348368415191, + "grad_norm": 0.0004047169641125947, + "learning_rate": 2.015823844000747e-06, + "loss": 0.0092, + "num_input_tokens_seen": 193202112, + "step": 158780 + }, + { + "epoch": 17.68404053903553, + "grad_norm": 1.4331897497177124, + "learning_rate": 2.0148680916238922e-06, + "loss": 0.0659, + "num_input_tokens_seen": 193207648, + "step": 158785 + }, + { + "epoch": 17.684597393919145, + "grad_norm": 0.9294108748435974, + "learning_rate": 2.0139125563600205e-06, + "loss": 0.0267, + "num_input_tokens_seen": 193213536, + "step": 158790 + }, + { + "epoch": 17.68515424880276, + "grad_norm": 2.05300235748291, + "learning_rate": 2.012957238218166e-06, + "loss": 0.0831, + "num_input_tokens_seen": 193219744, + "step": 158795 + }, + { + "epoch": 17.68571110368638, + "grad_norm": 0.008043301291763783, + "learning_rate": 2.0120021372073473e-06, + "loss": 0.007, + "num_input_tokens_seen": 193225856, + "step": 158800 + }, + { + "epoch": 17.686267958569996, + "grad_norm": 1.7185879945755005, + "learning_rate": 2.0110472533365843e-06, + "loss": 0.0908, + "num_input_tokens_seen": 193231840, + "step": 158805 + }, + { + "epoch": 17.686824813453615, + "grad_norm": 0.0007798403967171907, + "learning_rate": 2.010092586614895e-06, + "loss": 0.012, + "num_input_tokens_seen": 193238176, + "step": 158810 + }, + { + "epoch": 17.68738166833723, + "grad_norm": 0.039621368050575256, + "learning_rate": 2.0091381370513058e-06, + "loss": 0.0016, + "num_input_tokens_seen": 193244192, + "step": 158815 + }, + { + "epoch": 17.687938523220847, + "grad_norm": 0.08465876430273056, + "learning_rate": 2.0081839046548257e-06, + "loss": 0.0786, + "num_input_tokens_seen": 193250112, + "step": 158820 + }, + { + "epoch": 17.688495378104466, + "grad_norm": 0.16299420595169067, + "learning_rate": 2.00722988943447e-06, + "loss": 0.0306, + "num_input_tokens_seen": 193256384, + "step": 158825 + }, + { + "epoch": 17.689052232988082, + "grad_norm": 0.05821576714515686, + "learning_rate": 2.006276091399245e-06, + "loss": 0.0189, + "num_input_tokens_seen": 193261984, + "step": 158830 + }, + { + "epoch": 17.689609087871702, + "grad_norm": 0.0694909319281578, + "learning_rate": 2.005322510558166e-06, + "loss": 0.0232, + "num_input_tokens_seen": 193268160, + "step": 158835 + }, + { + "epoch": 17.690165942755318, + "grad_norm": 1.0130172967910767, + "learning_rate": 2.0043691469202377e-06, + "loss": 0.0416, + "num_input_tokens_seen": 193274048, + "step": 158840 + }, + { + "epoch": 17.690722797638934, + "grad_norm": 0.32644468545913696, + "learning_rate": 2.003416000494471e-06, + "loss": 0.014, + "num_input_tokens_seen": 193279936, + "step": 158845 + }, + { + "epoch": 17.691279652522553, + "grad_norm": 0.12238457798957825, + "learning_rate": 2.0024630712898647e-06, + "loss": 0.061, + "num_input_tokens_seen": 193285952, + "step": 158850 + }, + { + "epoch": 17.69183650740617, + "grad_norm": 0.0003221844381187111, + "learning_rate": 2.00151035931542e-06, + "loss": 0.0117, + "num_input_tokens_seen": 193292064, + "step": 158855 + }, + { + "epoch": 17.69239336228979, + "grad_norm": 0.32864099740982056, + "learning_rate": 2.00055786458013e-06, + "loss": 0.0176, + "num_input_tokens_seen": 193297920, + "step": 158860 + }, + { + "epoch": 17.692950217173404, + "grad_norm": 0.7427682280540466, + "learning_rate": 1.9996055870930037e-06, + "loss": 0.0088, + "num_input_tokens_seen": 193304224, + "step": 158865 + }, + { + "epoch": 17.69350707205702, + "grad_norm": 0.0008520129485987127, + "learning_rate": 1.9986535268630315e-06, + "loss": 0.022, + "num_input_tokens_seen": 193310496, + "step": 158870 + }, + { + "epoch": 17.69406392694064, + "grad_norm": 0.0003143989888485521, + "learning_rate": 1.9977016838992028e-06, + "loss": 0.1361, + "num_input_tokens_seen": 193316896, + "step": 158875 + }, + { + "epoch": 17.694620781824256, + "grad_norm": 1.5925710201263428, + "learning_rate": 1.996750058210506e-06, + "loss": 0.0196, + "num_input_tokens_seen": 193322912, + "step": 158880 + }, + { + "epoch": 17.695177636707875, + "grad_norm": 0.024038046598434448, + "learning_rate": 1.995798649805941e-06, + "loss": 0.0072, + "num_input_tokens_seen": 193329024, + "step": 158885 + }, + { + "epoch": 17.69573449159149, + "grad_norm": 0.05448736622929573, + "learning_rate": 1.994847458694485e-06, + "loss": 0.0071, + "num_input_tokens_seen": 193335520, + "step": 158890 + }, + { + "epoch": 17.696291346475107, + "grad_norm": 0.05833388492465019, + "learning_rate": 1.9938964848851325e-06, + "loss": 0.0344, + "num_input_tokens_seen": 193341504, + "step": 158895 + }, + { + "epoch": 17.696848201358726, + "grad_norm": 0.012871069833636284, + "learning_rate": 1.9929457283868525e-06, + "loss": 0.0735, + "num_input_tokens_seen": 193347744, + "step": 158900 + }, + { + "epoch": 17.697405056242342, + "grad_norm": 0.015136943198740482, + "learning_rate": 1.991995189208637e-06, + "loss": 0.041, + "num_input_tokens_seen": 193353856, + "step": 158905 + }, + { + "epoch": 17.69796191112596, + "grad_norm": 0.8723464608192444, + "learning_rate": 1.991044867359454e-06, + "loss": 0.0379, + "num_input_tokens_seen": 193359968, + "step": 158910 + }, + { + "epoch": 17.698518766009578, + "grad_norm": 0.09635407477617264, + "learning_rate": 1.9900947628482934e-06, + "loss": 0.0233, + "num_input_tokens_seen": 193365632, + "step": 158915 + }, + { + "epoch": 17.699075620893197, + "grad_norm": 0.22056496143341064, + "learning_rate": 1.9891448756841235e-06, + "loss": 0.0053, + "num_input_tokens_seen": 193372096, + "step": 158920 + }, + { + "epoch": 17.699632475776813, + "grad_norm": 0.05717942491173744, + "learning_rate": 1.9881952058759144e-06, + "loss": 0.0044, + "num_input_tokens_seen": 193378240, + "step": 158925 + }, + { + "epoch": 17.70018933066043, + "grad_norm": 0.05338272452354431, + "learning_rate": 1.9872457534326374e-06, + "loss": 0.0313, + "num_input_tokens_seen": 193384416, + "step": 158930 + }, + { + "epoch": 17.700746185544048, + "grad_norm": 0.09151862561702728, + "learning_rate": 1.986296518363262e-06, + "loss": 0.0488, + "num_input_tokens_seen": 193390592, + "step": 158935 + }, + { + "epoch": 17.701303040427664, + "grad_norm": 0.015274805016815662, + "learning_rate": 1.985347500676757e-06, + "loss": 0.0718, + "num_input_tokens_seen": 193396416, + "step": 158940 + }, + { + "epoch": 17.701859895311284, + "grad_norm": 0.0014232287649065256, + "learning_rate": 1.9843987003820842e-06, + "loss": 0.0429, + "num_input_tokens_seen": 193402656, + "step": 158945 + }, + { + "epoch": 17.7024167501949, + "grad_norm": 1.3796319961547852, + "learning_rate": 1.9834501174881975e-06, + "loss": 0.0242, + "num_input_tokens_seen": 193408512, + "step": 158950 + }, + { + "epoch": 17.702973605078515, + "grad_norm": 0.29062068462371826, + "learning_rate": 1.9825017520040735e-06, + "loss": 0.0166, + "num_input_tokens_seen": 193414784, + "step": 158955 + }, + { + "epoch": 17.703530459962135, + "grad_norm": 0.656682014465332, + "learning_rate": 1.9815536039386545e-06, + "loss": 0.0309, + "num_input_tokens_seen": 193420992, + "step": 158960 + }, + { + "epoch": 17.70408731484575, + "grad_norm": 2.107326030731201, + "learning_rate": 1.9806056733009143e-06, + "loss": 0.0964, + "num_input_tokens_seen": 193427200, + "step": 158965 + }, + { + "epoch": 17.70464416972937, + "grad_norm": 0.04078434407711029, + "learning_rate": 1.979657960099787e-06, + "loss": 0.0556, + "num_input_tokens_seen": 193433440, + "step": 158970 + }, + { + "epoch": 17.705201024612986, + "grad_norm": 0.006764487363398075, + "learning_rate": 1.9787104643442385e-06, + "loss": 0.1649, + "num_input_tokens_seen": 193439648, + "step": 158975 + }, + { + "epoch": 17.705757879496602, + "grad_norm": 0.001633667852729559, + "learning_rate": 1.9777631860432106e-06, + "loss": 0.1135, + "num_input_tokens_seen": 193445344, + "step": 158980 + }, + { + "epoch": 17.70631473438022, + "grad_norm": 0.00010253281652694568, + "learning_rate": 1.976816125205655e-06, + "loss": 0.0204, + "num_input_tokens_seen": 193451776, + "step": 158985 + }, + { + "epoch": 17.706871589263837, + "grad_norm": 0.0002065071021206677, + "learning_rate": 1.97586928184052e-06, + "loss": 0.0386, + "num_input_tokens_seen": 193457952, + "step": 158990 + }, + { + "epoch": 17.707428444147457, + "grad_norm": 2.209285020828247, + "learning_rate": 1.974922655956746e-06, + "loss": 0.1151, + "num_input_tokens_seen": 193463840, + "step": 158995 + }, + { + "epoch": 17.707985299031073, + "grad_norm": 0.00010627219307934865, + "learning_rate": 1.9739762475632722e-06, + "loss": 0.0058, + "num_input_tokens_seen": 193470016, + "step": 159000 + }, + { + "epoch": 17.70854215391469, + "grad_norm": 0.1859210878610611, + "learning_rate": 1.9730300566690423e-06, + "loss": 0.0398, + "num_input_tokens_seen": 193476128, + "step": 159005 + }, + { + "epoch": 17.709099008798308, + "grad_norm": 0.02335670031607151, + "learning_rate": 1.9720840832829934e-06, + "loss": 0.0845, + "num_input_tokens_seen": 193482400, + "step": 159010 + }, + { + "epoch": 17.709655863681924, + "grad_norm": 0.7819603681564331, + "learning_rate": 1.97113832741406e-06, + "loss": 0.0214, + "num_input_tokens_seen": 193488448, + "step": 159015 + }, + { + "epoch": 17.710212718565543, + "grad_norm": 0.6312242746353149, + "learning_rate": 1.970192789071171e-06, + "loss": 0.0147, + "num_input_tokens_seen": 193494464, + "step": 159020 + }, + { + "epoch": 17.71076957344916, + "grad_norm": 1.0676205158233643, + "learning_rate": 1.969247468263269e-06, + "loss": 0.0716, + "num_input_tokens_seen": 193499712, + "step": 159025 + }, + { + "epoch": 17.711326428332775, + "grad_norm": 0.05268416553735733, + "learning_rate": 1.9683023649992693e-06, + "loss": 0.001, + "num_input_tokens_seen": 193505856, + "step": 159030 + }, + { + "epoch": 17.711883283216395, + "grad_norm": 0.6006642580032349, + "learning_rate": 1.967357479288115e-06, + "loss": 0.0559, + "num_input_tokens_seen": 193512256, + "step": 159035 + }, + { + "epoch": 17.71244013810001, + "grad_norm": 0.27642834186553955, + "learning_rate": 1.9664128111387155e-06, + "loss": 0.0303, + "num_input_tokens_seen": 193518528, + "step": 159040 + }, + { + "epoch": 17.71299699298363, + "grad_norm": 3.6087186336517334, + "learning_rate": 1.965468360560005e-06, + "loss": 0.1444, + "num_input_tokens_seen": 193524608, + "step": 159045 + }, + { + "epoch": 17.713553847867246, + "grad_norm": 0.0039036686066538095, + "learning_rate": 1.9645241275608963e-06, + "loss": 0.0006, + "num_input_tokens_seen": 193530720, + "step": 159050 + }, + { + "epoch": 17.71411070275086, + "grad_norm": 0.0016195951029658318, + "learning_rate": 1.963580112150318e-06, + "loss": 0.0099, + "num_input_tokens_seen": 193536768, + "step": 159055 + }, + { + "epoch": 17.71466755763448, + "grad_norm": 0.43808111548423767, + "learning_rate": 1.9626363143371825e-06, + "loss": 0.025, + "num_input_tokens_seen": 193542880, + "step": 159060 + }, + { + "epoch": 17.715224412518097, + "grad_norm": 0.06285423785448074, + "learning_rate": 1.961692734130402e-06, + "loss": 0.1257, + "num_input_tokens_seen": 193549056, + "step": 159065 + }, + { + "epoch": 17.715781267401717, + "grad_norm": 0.020664365962147713, + "learning_rate": 1.9607493715388893e-06, + "loss": 0.0008, + "num_input_tokens_seen": 193555680, + "step": 159070 + }, + { + "epoch": 17.716338122285332, + "grad_norm": 0.0467105396091938, + "learning_rate": 1.9598062265715615e-06, + "loss": 0.0632, + "num_input_tokens_seen": 193561184, + "step": 159075 + }, + { + "epoch": 17.71689497716895, + "grad_norm": 9.222445805789903e-05, + "learning_rate": 1.9588632992373233e-06, + "loss": 0.0879, + "num_input_tokens_seen": 193567392, + "step": 159080 + }, + { + "epoch": 17.717451832052568, + "grad_norm": 0.616324782371521, + "learning_rate": 1.9579205895450813e-06, + "loss": 0.0518, + "num_input_tokens_seen": 193573664, + "step": 159085 + }, + { + "epoch": 17.718008686936184, + "grad_norm": 1.5183552503585815, + "learning_rate": 1.956978097503734e-06, + "loss": 0.0282, + "num_input_tokens_seen": 193579552, + "step": 159090 + }, + { + "epoch": 17.718565541819803, + "grad_norm": 0.0013355778064578772, + "learning_rate": 1.9560358231221985e-06, + "loss": 0.0343, + "num_input_tokens_seen": 193585216, + "step": 159095 + }, + { + "epoch": 17.71912239670342, + "grad_norm": 0.78947514295578, + "learning_rate": 1.9550937664093606e-06, + "loss": 0.0175, + "num_input_tokens_seen": 193591360, + "step": 159100 + }, + { + "epoch": 17.719679251587035, + "grad_norm": 0.152684286236763, + "learning_rate": 1.9541519273741286e-06, + "loss": 0.0024, + "num_input_tokens_seen": 193597600, + "step": 159105 + }, + { + "epoch": 17.720236106470654, + "grad_norm": 0.08561783283948898, + "learning_rate": 1.9532103060253992e-06, + "loss": 0.0025, + "num_input_tokens_seen": 193603904, + "step": 159110 + }, + { + "epoch": 17.72079296135427, + "grad_norm": 0.49892348051071167, + "learning_rate": 1.952268902372059e-06, + "loss": 0.0203, + "num_input_tokens_seen": 193610080, + "step": 159115 + }, + { + "epoch": 17.72134981623789, + "grad_norm": 0.12660811841487885, + "learning_rate": 1.9513277164230045e-06, + "loss": 0.0901, + "num_input_tokens_seen": 193616064, + "step": 159120 + }, + { + "epoch": 17.721906671121506, + "grad_norm": 0.03286973759531975, + "learning_rate": 1.9503867481871276e-06, + "loss": 0.0028, + "num_input_tokens_seen": 193622080, + "step": 159125 + }, + { + "epoch": 17.72246352600512, + "grad_norm": 1.0836784839630127, + "learning_rate": 1.949445997673316e-06, + "loss": 0.0591, + "num_input_tokens_seen": 193627936, + "step": 159130 + }, + { + "epoch": 17.72302038088874, + "grad_norm": 0.020864136517047882, + "learning_rate": 1.9485054648904544e-06, + "loss": 0.0649, + "num_input_tokens_seen": 193633792, + "step": 159135 + }, + { + "epoch": 17.723577235772357, + "grad_norm": 1.6774283647537231, + "learning_rate": 1.9475651498474216e-06, + "loss": 0.183, + "num_input_tokens_seen": 193639744, + "step": 159140 + }, + { + "epoch": 17.724134090655976, + "grad_norm": 0.008234485983848572, + "learning_rate": 1.946625052553111e-06, + "loss": 0.0032, + "num_input_tokens_seen": 193646144, + "step": 159145 + }, + { + "epoch": 17.724690945539592, + "grad_norm": 0.12293720990419388, + "learning_rate": 1.945685173016393e-06, + "loss": 0.119, + "num_input_tokens_seen": 193652256, + "step": 159150 + }, + { + "epoch": 17.725247800423208, + "grad_norm": 0.0452607236802578, + "learning_rate": 1.9447455112461573e-06, + "loss": 0.0069, + "num_input_tokens_seen": 193658368, + "step": 159155 + }, + { + "epoch": 17.725804655306828, + "grad_norm": 1.2195321321487427, + "learning_rate": 1.943806067251261e-06, + "loss": 0.0787, + "num_input_tokens_seen": 193664320, + "step": 159160 + }, + { + "epoch": 17.726361510190443, + "grad_norm": 4.440011501312256, + "learning_rate": 1.942866841040597e-06, + "loss": 0.09, + "num_input_tokens_seen": 193670400, + "step": 159165 + }, + { + "epoch": 17.726918365074063, + "grad_norm": 1.8245285749435425, + "learning_rate": 1.941927832623022e-06, + "loss": 0.0237, + "num_input_tokens_seen": 193676736, + "step": 159170 + }, + { + "epoch": 17.72747521995768, + "grad_norm": 0.015568542294204235, + "learning_rate": 1.9409890420074158e-06, + "loss": 0.0221, + "num_input_tokens_seen": 193682848, + "step": 159175 + }, + { + "epoch": 17.728032074841295, + "grad_norm": 0.020525190979242325, + "learning_rate": 1.940050469202645e-06, + "loss": 0.0091, + "num_input_tokens_seen": 193689024, + "step": 159180 + }, + { + "epoch": 17.728588929724914, + "grad_norm": 0.6320030689239502, + "learning_rate": 1.9391121142175726e-06, + "loss": 0.049, + "num_input_tokens_seen": 193694496, + "step": 159185 + }, + { + "epoch": 17.72914578460853, + "grad_norm": 0.31280970573425293, + "learning_rate": 1.9381739770610557e-06, + "loss": 0.1059, + "num_input_tokens_seen": 193700544, + "step": 159190 + }, + { + "epoch": 17.72970263949215, + "grad_norm": 0.022319704294204712, + "learning_rate": 1.93723605774197e-06, + "loss": 0.0486, + "num_input_tokens_seen": 193706560, + "step": 159195 + }, + { + "epoch": 17.730259494375765, + "grad_norm": 0.28426069021224976, + "learning_rate": 1.9362983562691646e-06, + "loss": 0.0035, + "num_input_tokens_seen": 193712512, + "step": 159200 + }, + { + "epoch": 17.73081634925938, + "grad_norm": 0.013569407165050507, + "learning_rate": 1.935360872651501e-06, + "loss": 0.0867, + "num_input_tokens_seen": 193718400, + "step": 159205 + }, + { + "epoch": 17.731373204143, + "grad_norm": 0.00022578010975848883, + "learning_rate": 1.9344236068978337e-06, + "loss": 0.0027, + "num_input_tokens_seen": 193724704, + "step": 159210 + }, + { + "epoch": 17.731930059026617, + "grad_norm": 0.01696552149951458, + "learning_rate": 1.9334865590170087e-06, + "loss": 0.121, + "num_input_tokens_seen": 193730880, + "step": 159215 + }, + { + "epoch": 17.732486913910236, + "grad_norm": 0.044607508927583694, + "learning_rate": 1.9325497290178906e-06, + "loss": 0.0166, + "num_input_tokens_seen": 193736928, + "step": 159220 + }, + { + "epoch": 17.733043768793852, + "grad_norm": 0.00018648382683750242, + "learning_rate": 1.93161311690932e-06, + "loss": 0.0264, + "num_input_tokens_seen": 193743008, + "step": 159225 + }, + { + "epoch": 17.733600623677468, + "grad_norm": 0.17703884840011597, + "learning_rate": 1.9306767227001477e-06, + "loss": 0.0436, + "num_input_tokens_seen": 193748416, + "step": 159230 + }, + { + "epoch": 17.734157478561087, + "grad_norm": 0.3638509511947632, + "learning_rate": 1.9297405463992086e-06, + "loss": 0.0391, + "num_input_tokens_seen": 193754112, + "step": 159235 + }, + { + "epoch": 17.734714333444703, + "grad_norm": 0.3385624885559082, + "learning_rate": 1.928804588015362e-06, + "loss": 0.058, + "num_input_tokens_seen": 193760064, + "step": 159240 + }, + { + "epoch": 17.735271188328323, + "grad_norm": 0.06321407854557037, + "learning_rate": 1.927868847557432e-06, + "loss": 0.0052, + "num_input_tokens_seen": 193766496, + "step": 159245 + }, + { + "epoch": 17.73582804321194, + "grad_norm": 0.0001402523776050657, + "learning_rate": 1.926933325034272e-06, + "loss": 0.0237, + "num_input_tokens_seen": 193772448, + "step": 159250 + }, + { + "epoch": 17.736384898095558, + "grad_norm": 0.034810710698366165, + "learning_rate": 1.925998020454714e-06, + "loss": 0.0939, + "num_input_tokens_seen": 193778080, + "step": 159255 + }, + { + "epoch": 17.736941752979174, + "grad_norm": 0.013649748638272285, + "learning_rate": 1.92506293382759e-06, + "loss": 0.1332, + "num_input_tokens_seen": 193783360, + "step": 159260 + }, + { + "epoch": 17.73749860786279, + "grad_norm": 1.1851335763931274, + "learning_rate": 1.9241280651617286e-06, + "loss": 0.0864, + "num_input_tokens_seen": 193789216, + "step": 159265 + }, + { + "epoch": 17.73805546274641, + "grad_norm": 0.00019061545026488602, + "learning_rate": 1.9231934144659707e-06, + "loss": 0.0109, + "num_input_tokens_seen": 193794752, + "step": 159270 + }, + { + "epoch": 17.738612317630025, + "grad_norm": 0.197231724858284, + "learning_rate": 1.922258981749142e-06, + "loss": 0.0355, + "num_input_tokens_seen": 193800800, + "step": 159275 + }, + { + "epoch": 17.73916917251364, + "grad_norm": 0.0006565427174791694, + "learning_rate": 1.9213247670200635e-06, + "loss": 0.0107, + "num_input_tokens_seen": 193806976, + "step": 159280 + }, + { + "epoch": 17.73972602739726, + "grad_norm": 0.2784213423728943, + "learning_rate": 1.9203907702875614e-06, + "loss": 0.078, + "num_input_tokens_seen": 193813088, + "step": 159285 + }, + { + "epoch": 17.740282882280876, + "grad_norm": 0.01076678279787302, + "learning_rate": 1.9194569915604617e-06, + "loss": 0.0293, + "num_input_tokens_seen": 193819296, + "step": 159290 + }, + { + "epoch": 17.740839737164496, + "grad_norm": 0.7520216703414917, + "learning_rate": 1.9185234308475797e-06, + "loss": 0.0104, + "num_input_tokens_seen": 193825344, + "step": 159295 + }, + { + "epoch": 17.741396592048112, + "grad_norm": 0.8401670455932617, + "learning_rate": 1.9175900881577448e-06, + "loss": 0.0238, + "num_input_tokens_seen": 193831200, + "step": 159300 + }, + { + "epoch": 17.74195344693173, + "grad_norm": 1.6623263359069824, + "learning_rate": 1.916656963499755e-06, + "loss": 0.0481, + "num_input_tokens_seen": 193837216, + "step": 159305 + }, + { + "epoch": 17.742510301815347, + "grad_norm": 1.7200602293014526, + "learning_rate": 1.91572405688244e-06, + "loss": 0.0345, + "num_input_tokens_seen": 193842976, + "step": 159310 + }, + { + "epoch": 17.743067156698963, + "grad_norm": 0.0230709221214056, + "learning_rate": 1.9147913683146e-06, + "loss": 0.0505, + "num_input_tokens_seen": 193849120, + "step": 159315 + }, + { + "epoch": 17.743624011582583, + "grad_norm": 0.8505390286445618, + "learning_rate": 1.913858897805057e-06, + "loss": 0.0159, + "num_input_tokens_seen": 193855584, + "step": 159320 + }, + { + "epoch": 17.7441808664662, + "grad_norm": 0.8132796883583069, + "learning_rate": 1.9129266453626118e-06, + "loss": 0.0644, + "num_input_tokens_seen": 193861056, + "step": 159325 + }, + { + "epoch": 17.744737721349818, + "grad_norm": 0.08082164078950882, + "learning_rate": 1.9119946109960733e-06, + "loss": 0.0707, + "num_input_tokens_seen": 193867040, + "step": 159330 + }, + { + "epoch": 17.745294576233434, + "grad_norm": 0.0016131311422213912, + "learning_rate": 1.9110627947142382e-06, + "loss": 0.0254, + "num_input_tokens_seen": 193873376, + "step": 159335 + }, + { + "epoch": 17.74585143111705, + "grad_norm": 0.010567670688033104, + "learning_rate": 1.9101311965259187e-06, + "loss": 0.0068, + "num_input_tokens_seen": 193879616, + "step": 159340 + }, + { + "epoch": 17.74640828600067, + "grad_norm": 0.009243525564670563, + "learning_rate": 1.909199816439908e-06, + "loss": 0.0016, + "num_input_tokens_seen": 193885824, + "step": 159345 + }, + { + "epoch": 17.746965140884285, + "grad_norm": 0.05398087576031685, + "learning_rate": 1.9082686544650063e-06, + "loss": 0.1104, + "num_input_tokens_seen": 193891904, + "step": 159350 + }, + { + "epoch": 17.747521995767904, + "grad_norm": 0.2289055585861206, + "learning_rate": 1.9073377106100021e-06, + "loss": 0.0243, + "num_input_tokens_seen": 193897888, + "step": 159355 + }, + { + "epoch": 17.74807885065152, + "grad_norm": 0.020097626373171806, + "learning_rate": 1.906406984883699e-06, + "loss": 0.0293, + "num_input_tokens_seen": 193904096, + "step": 159360 + }, + { + "epoch": 17.748635705535136, + "grad_norm": 0.046465035527944565, + "learning_rate": 1.905476477294882e-06, + "loss": 0.0368, + "num_input_tokens_seen": 193910240, + "step": 159365 + }, + { + "epoch": 17.749192560418756, + "grad_norm": 0.09873000532388687, + "learning_rate": 1.904546187852349e-06, + "loss": 0.0064, + "num_input_tokens_seen": 193916096, + "step": 159370 + }, + { + "epoch": 17.74974941530237, + "grad_norm": 0.9262967705726624, + "learning_rate": 1.903616116564874e-06, + "loss": 0.031, + "num_input_tokens_seen": 193921504, + "step": 159375 + }, + { + "epoch": 17.75030627018599, + "grad_norm": 0.003169835079461336, + "learning_rate": 1.902686263441253e-06, + "loss": 0.0212, + "num_input_tokens_seen": 193927776, + "step": 159380 + }, + { + "epoch": 17.750863125069607, + "grad_norm": 0.20338986814022064, + "learning_rate": 1.9017566284902616e-06, + "loss": 0.0382, + "num_input_tokens_seen": 193933792, + "step": 159385 + }, + { + "epoch": 17.751419979953223, + "grad_norm": 0.1007922887802124, + "learning_rate": 1.9008272117206876e-06, + "loss": 0.0265, + "num_input_tokens_seen": 193939488, + "step": 159390 + }, + { + "epoch": 17.751976834836842, + "grad_norm": 0.7680894136428833, + "learning_rate": 1.8998980131413102e-06, + "loss": 0.0867, + "num_input_tokens_seen": 193945600, + "step": 159395 + }, + { + "epoch": 17.752533689720458, + "grad_norm": 0.0020893909968435764, + "learning_rate": 1.8989690327609e-06, + "loss": 0.1788, + "num_input_tokens_seen": 193952000, + "step": 159400 + }, + { + "epoch": 17.753090544604078, + "grad_norm": 0.0024935961700975895, + "learning_rate": 1.8980402705882333e-06, + "loss": 0.004, + "num_input_tokens_seen": 193958304, + "step": 159405 + }, + { + "epoch": 17.753647399487694, + "grad_norm": 0.06463685631752014, + "learning_rate": 1.8971117266320892e-06, + "loss": 0.0297, + "num_input_tokens_seen": 193964864, + "step": 159410 + }, + { + "epoch": 17.75420425437131, + "grad_norm": 7.26086727809161e-05, + "learning_rate": 1.8961834009012357e-06, + "loss": 0.0086, + "num_input_tokens_seen": 193971168, + "step": 159415 + }, + { + "epoch": 17.75476110925493, + "grad_norm": 0.03235487639904022, + "learning_rate": 1.8952552934044409e-06, + "loss": 0.0108, + "num_input_tokens_seen": 193977312, + "step": 159420 + }, + { + "epoch": 17.755317964138545, + "grad_norm": 0.9305025935173035, + "learning_rate": 1.8943274041504643e-06, + "loss": 0.0404, + "num_input_tokens_seen": 193983136, + "step": 159425 + }, + { + "epoch": 17.755874819022164, + "grad_norm": 0.06046336144208908, + "learning_rate": 1.8933997331480825e-06, + "loss": 0.0383, + "num_input_tokens_seen": 193989600, + "step": 159430 + }, + { + "epoch": 17.75643167390578, + "grad_norm": 0.07157480716705322, + "learning_rate": 1.892472280406049e-06, + "loss": 0.014, + "num_input_tokens_seen": 193995936, + "step": 159435 + }, + { + "epoch": 17.756988528789396, + "grad_norm": 0.1515832394361496, + "learning_rate": 1.8915450459331324e-06, + "loss": 0.0989, + "num_input_tokens_seen": 194001728, + "step": 159440 + }, + { + "epoch": 17.757545383673015, + "grad_norm": 0.008156806230545044, + "learning_rate": 1.8906180297380865e-06, + "loss": 0.0292, + "num_input_tokens_seen": 194007808, + "step": 159445 + }, + { + "epoch": 17.75810223855663, + "grad_norm": 0.11317271739244461, + "learning_rate": 1.8896912318296683e-06, + "loss": 0.0029, + "num_input_tokens_seen": 194013344, + "step": 159450 + }, + { + "epoch": 17.75865909344025, + "grad_norm": 0.3703267276287079, + "learning_rate": 1.8887646522166292e-06, + "loss": 0.0623, + "num_input_tokens_seen": 194019392, + "step": 159455 + }, + { + "epoch": 17.759215948323867, + "grad_norm": 0.0021757844369858503, + "learning_rate": 1.8878382909077285e-06, + "loss": 0.0059, + "num_input_tokens_seen": 194025248, + "step": 159460 + }, + { + "epoch": 17.759772803207483, + "grad_norm": 0.00607285974547267, + "learning_rate": 1.8869121479117096e-06, + "loss": 0.0198, + "num_input_tokens_seen": 194031136, + "step": 159465 + }, + { + "epoch": 17.760329658091102, + "grad_norm": 0.016947630792856216, + "learning_rate": 1.8859862232373265e-06, + "loss": 0.0731, + "num_input_tokens_seen": 194037152, + "step": 159470 + }, + { + "epoch": 17.760886512974718, + "grad_norm": 0.0998961478471756, + "learning_rate": 1.8850605168933166e-06, + "loss": 0.0693, + "num_input_tokens_seen": 194043424, + "step": 159475 + }, + { + "epoch": 17.761443367858337, + "grad_norm": 0.006742959376424551, + "learning_rate": 1.8841350288884341e-06, + "loss": 0.0446, + "num_input_tokens_seen": 194049312, + "step": 159480 + }, + { + "epoch": 17.762000222741953, + "grad_norm": 0.16430915892124176, + "learning_rate": 1.8832097592314163e-06, + "loss": 0.0072, + "num_input_tokens_seen": 194055456, + "step": 159485 + }, + { + "epoch": 17.76255707762557, + "grad_norm": 0.6989993453025818, + "learning_rate": 1.8822847079310007e-06, + "loss": 0.0827, + "num_input_tokens_seen": 194061952, + "step": 159490 + }, + { + "epoch": 17.76311393250919, + "grad_norm": 0.0033127099741250277, + "learning_rate": 1.8813598749959276e-06, + "loss": 0.0442, + "num_input_tokens_seen": 194067648, + "step": 159495 + }, + { + "epoch": 17.763670787392805, + "grad_norm": 0.0004150323220528662, + "learning_rate": 1.8804352604349345e-06, + "loss": 0.0031, + "num_input_tokens_seen": 194073952, + "step": 159500 + }, + { + "epoch": 17.764227642276424, + "grad_norm": 0.0988537147641182, + "learning_rate": 1.8795108642567505e-06, + "loss": 0.0513, + "num_input_tokens_seen": 194080192, + "step": 159505 + }, + { + "epoch": 17.76478449716004, + "grad_norm": 0.6766723990440369, + "learning_rate": 1.878586686470113e-06, + "loss": 0.0163, + "num_input_tokens_seen": 194086400, + "step": 159510 + }, + { + "epoch": 17.765341352043656, + "grad_norm": 0.833942711353302, + "learning_rate": 1.8776627270837483e-06, + "loss": 0.0124, + "num_input_tokens_seen": 194092576, + "step": 159515 + }, + { + "epoch": 17.765898206927275, + "grad_norm": 0.6432357430458069, + "learning_rate": 1.8767389861063883e-06, + "loss": 0.0221, + "num_input_tokens_seen": 194098720, + "step": 159520 + }, + { + "epoch": 17.76645506181089, + "grad_norm": 0.008667507208883762, + "learning_rate": 1.8758154635467456e-06, + "loss": 0.0414, + "num_input_tokens_seen": 194104768, + "step": 159525 + }, + { + "epoch": 17.76701191669451, + "grad_norm": 0.07323603332042694, + "learning_rate": 1.8748921594135605e-06, + "loss": 0.2124, + "num_input_tokens_seen": 194111008, + "step": 159530 + }, + { + "epoch": 17.767568771578127, + "grad_norm": 0.10982681810855865, + "learning_rate": 1.8739690737155452e-06, + "loss": 0.0307, + "num_input_tokens_seen": 194116832, + "step": 159535 + }, + { + "epoch": 17.768125626461742, + "grad_norm": 0.08870720118284225, + "learning_rate": 1.8730462064614208e-06, + "loss": 0.0102, + "num_input_tokens_seen": 194123072, + "step": 159540 + }, + { + "epoch": 17.768682481345362, + "grad_norm": 0.6393855214118958, + "learning_rate": 1.8721235576598967e-06, + "loss": 0.052, + "num_input_tokens_seen": 194128864, + "step": 159545 + }, + { + "epoch": 17.769239336228978, + "grad_norm": 0.0662960410118103, + "learning_rate": 1.8712011273197021e-06, + "loss": 0.2065, + "num_input_tokens_seen": 194134688, + "step": 159550 + }, + { + "epoch": 17.769796191112597, + "grad_norm": 0.0035907267592847347, + "learning_rate": 1.8702789154495388e-06, + "loss": 0.0862, + "num_input_tokens_seen": 194140960, + "step": 159555 + }, + { + "epoch": 17.770353045996213, + "grad_norm": 0.00015762877592351288, + "learning_rate": 1.8693569220581326e-06, + "loss": 0.0229, + "num_input_tokens_seen": 194147008, + "step": 159560 + }, + { + "epoch": 17.77090990087983, + "grad_norm": 0.04792572930455208, + "learning_rate": 1.8684351471541711e-06, + "loss": 0.0188, + "num_input_tokens_seen": 194152960, + "step": 159565 + }, + { + "epoch": 17.77146675576345, + "grad_norm": 0.000742667296435684, + "learning_rate": 1.8675135907463782e-06, + "loss": 0.0301, + "num_input_tokens_seen": 194159040, + "step": 159570 + }, + { + "epoch": 17.772023610647064, + "grad_norm": 0.00043325129081495106, + "learning_rate": 1.8665922528434493e-06, + "loss": 0.0141, + "num_input_tokens_seen": 194165568, + "step": 159575 + }, + { + "epoch": 17.772580465530684, + "grad_norm": 1.082631230354309, + "learning_rate": 1.8656711334540916e-06, + "loss": 0.028, + "num_input_tokens_seen": 194172064, + "step": 159580 + }, + { + "epoch": 17.7731373204143, + "grad_norm": 0.33850058913230896, + "learning_rate": 1.8647502325870093e-06, + "loss": 0.0175, + "num_input_tokens_seen": 194178176, + "step": 159585 + }, + { + "epoch": 17.77369417529792, + "grad_norm": 0.0004467206308618188, + "learning_rate": 1.8638295502508923e-06, + "loss": 0.0022, + "num_input_tokens_seen": 194184448, + "step": 159590 + }, + { + "epoch": 17.774251030181535, + "grad_norm": 0.00013181463873479515, + "learning_rate": 1.8629090864544397e-06, + "loss": 0.0078, + "num_input_tokens_seen": 194190688, + "step": 159595 + }, + { + "epoch": 17.77480788506515, + "grad_norm": 0.00019059641635976732, + "learning_rate": 1.8619888412063525e-06, + "loss": 0.0047, + "num_input_tokens_seen": 194196640, + "step": 159600 + }, + { + "epoch": 17.77536473994877, + "grad_norm": 1.2351421117782593, + "learning_rate": 1.8610688145153181e-06, + "loss": 0.0303, + "num_input_tokens_seen": 194202816, + "step": 159605 + }, + { + "epoch": 17.775921594832386, + "grad_norm": 0.004301437176764011, + "learning_rate": 1.8601490063900272e-06, + "loss": 0.0523, + "num_input_tokens_seen": 194208608, + "step": 159610 + }, + { + "epoch": 17.776478449716002, + "grad_norm": 0.03541577607393265, + "learning_rate": 1.85922941683917e-06, + "loss": 0.0075, + "num_input_tokens_seen": 194214976, + "step": 159615 + }, + { + "epoch": 17.77703530459962, + "grad_norm": 0.0010677280370146036, + "learning_rate": 1.8583100458714254e-06, + "loss": 0.0031, + "num_input_tokens_seen": 194220800, + "step": 159620 + }, + { + "epoch": 17.777592159483238, + "grad_norm": 0.1511027067899704, + "learning_rate": 1.8573908934954864e-06, + "loss": 0.0057, + "num_input_tokens_seen": 194226720, + "step": 159625 + }, + { + "epoch": 17.778149014366857, + "grad_norm": 0.0035606161691248417, + "learning_rate": 1.8564719597200326e-06, + "loss": 0.0174, + "num_input_tokens_seen": 194232896, + "step": 159630 + }, + { + "epoch": 17.778705869250473, + "grad_norm": 1.443751335144043, + "learning_rate": 1.8555532445537428e-06, + "loss": 0.2137, + "num_input_tokens_seen": 194239296, + "step": 159635 + }, + { + "epoch": 17.779262724134092, + "grad_norm": 0.00015911002992652357, + "learning_rate": 1.8546347480052934e-06, + "loss": 0.0724, + "num_input_tokens_seen": 194245280, + "step": 159640 + }, + { + "epoch": 17.77981957901771, + "grad_norm": 0.022898374125361443, + "learning_rate": 1.8537164700833664e-06, + "loss": 0.0152, + "num_input_tokens_seen": 194251424, + "step": 159645 + }, + { + "epoch": 17.780376433901324, + "grad_norm": 0.002143479185178876, + "learning_rate": 1.8527984107966246e-06, + "loss": 0.0121, + "num_input_tokens_seen": 194257728, + "step": 159650 + }, + { + "epoch": 17.780933288784944, + "grad_norm": 0.06775100529193878, + "learning_rate": 1.851880570153755e-06, + "loss": 0.051, + "num_input_tokens_seen": 194264064, + "step": 159655 + }, + { + "epoch": 17.78149014366856, + "grad_norm": 0.5734238624572754, + "learning_rate": 1.8509629481634179e-06, + "loss": 0.0081, + "num_input_tokens_seen": 194270176, + "step": 159660 + }, + { + "epoch": 17.78204699855218, + "grad_norm": 0.01584302820265293, + "learning_rate": 1.8500455448342808e-06, + "loss": 0.0284, + "num_input_tokens_seen": 194276416, + "step": 159665 + }, + { + "epoch": 17.782603853435795, + "grad_norm": 0.06961263716220856, + "learning_rate": 1.8491283601750093e-06, + "loss": 0.0084, + "num_input_tokens_seen": 194282752, + "step": 159670 + }, + { + "epoch": 17.78316070831941, + "grad_norm": 0.05823935940861702, + "learning_rate": 1.8482113941942714e-06, + "loss": 0.0152, + "num_input_tokens_seen": 194288928, + "step": 159675 + }, + { + "epoch": 17.78371756320303, + "grad_norm": 0.14719341695308685, + "learning_rate": 1.847294646900727e-06, + "loss": 0.0145, + "num_input_tokens_seen": 194295200, + "step": 159680 + }, + { + "epoch": 17.784274418086646, + "grad_norm": 0.7884575128555298, + "learning_rate": 1.8463781183030327e-06, + "loss": 0.0208, + "num_input_tokens_seen": 194301152, + "step": 159685 + }, + { + "epoch": 17.784831272970266, + "grad_norm": 1.9638886451721191, + "learning_rate": 1.8454618084098457e-06, + "loss": 0.0435, + "num_input_tokens_seen": 194307200, + "step": 159690 + }, + { + "epoch": 17.78538812785388, + "grad_norm": 0.7828430533409119, + "learning_rate": 1.8445457172298259e-06, + "loss": 0.0582, + "num_input_tokens_seen": 194313408, + "step": 159695 + }, + { + "epoch": 17.785944982737497, + "grad_norm": 0.020871225744485855, + "learning_rate": 1.843629844771619e-06, + "loss": 0.0318, + "num_input_tokens_seen": 194319680, + "step": 159700 + }, + { + "epoch": 17.786501837621117, + "grad_norm": 0.11324813216924667, + "learning_rate": 1.84271419104389e-06, + "loss": 0.1758, + "num_input_tokens_seen": 194325888, + "step": 159705 + }, + { + "epoch": 17.787058692504733, + "grad_norm": 0.07466074079275131, + "learning_rate": 1.8417987560552685e-06, + "loss": 0.0495, + "num_input_tokens_seen": 194332096, + "step": 159710 + }, + { + "epoch": 17.787615547388352, + "grad_norm": 0.00031856284476816654, + "learning_rate": 1.840883539814417e-06, + "loss": 0.0224, + "num_input_tokens_seen": 194338080, + "step": 159715 + }, + { + "epoch": 17.788172402271968, + "grad_norm": 0.21620739996433258, + "learning_rate": 1.8399685423299729e-06, + "loss": 0.004, + "num_input_tokens_seen": 194344320, + "step": 159720 + }, + { + "epoch": 17.788729257155584, + "grad_norm": 0.26027819514274597, + "learning_rate": 1.839053763610582e-06, + "loss": 0.0838, + "num_input_tokens_seen": 194350016, + "step": 159725 + }, + { + "epoch": 17.789286112039203, + "grad_norm": 0.00024183948698919266, + "learning_rate": 1.8381392036648877e-06, + "loss": 0.0095, + "num_input_tokens_seen": 194356096, + "step": 159730 + }, + { + "epoch": 17.78984296692282, + "grad_norm": 0.225687175989151, + "learning_rate": 1.8372248625015243e-06, + "loss": 0.0693, + "num_input_tokens_seen": 194362336, + "step": 159735 + }, + { + "epoch": 17.79039982180644, + "grad_norm": 0.039373692125082016, + "learning_rate": 1.8363107401291241e-06, + "loss": 0.0071, + "num_input_tokens_seen": 194368544, + "step": 159740 + }, + { + "epoch": 17.790956676690055, + "grad_norm": 0.0006628444534726441, + "learning_rate": 1.8353968365563357e-06, + "loss": 0.0398, + "num_input_tokens_seen": 194374592, + "step": 159745 + }, + { + "epoch": 17.79151353157367, + "grad_norm": 0.07075691968202591, + "learning_rate": 1.8344831517917799e-06, + "loss": 0.0322, + "num_input_tokens_seen": 194380448, + "step": 159750 + }, + { + "epoch": 17.79207038645729, + "grad_norm": 0.2588668167591095, + "learning_rate": 1.8335696858440916e-06, + "loss": 0.0047, + "num_input_tokens_seen": 194386464, + "step": 159755 + }, + { + "epoch": 17.792627241340906, + "grad_norm": 0.08410980552434921, + "learning_rate": 1.8326564387218942e-06, + "loss": 0.0198, + "num_input_tokens_seen": 194392288, + "step": 159760 + }, + { + "epoch": 17.793184096224525, + "grad_norm": 1.5541722774505615, + "learning_rate": 1.8317434104338226e-06, + "loss": 0.1285, + "num_input_tokens_seen": 194398112, + "step": 159765 + }, + { + "epoch": 17.79374095110814, + "grad_norm": 1.8503717184066772, + "learning_rate": 1.8308306009884923e-06, + "loss": 0.0602, + "num_input_tokens_seen": 194404384, + "step": 159770 + }, + { + "epoch": 17.794297805991757, + "grad_norm": 0.411361426115036, + "learning_rate": 1.829918010394538e-06, + "loss": 0.0175, + "num_input_tokens_seen": 194410368, + "step": 159775 + }, + { + "epoch": 17.794854660875377, + "grad_norm": 1.100078821182251, + "learning_rate": 1.8290056386605609e-06, + "loss": 0.0234, + "num_input_tokens_seen": 194416608, + "step": 159780 + }, + { + "epoch": 17.795411515758992, + "grad_norm": 0.005799478385597467, + "learning_rate": 1.828093485795196e-06, + "loss": 0.0323, + "num_input_tokens_seen": 194422656, + "step": 159785 + }, + { + "epoch": 17.795968370642612, + "grad_norm": 0.00021654798183590174, + "learning_rate": 1.8271815518070502e-06, + "loss": 0.0588, + "num_input_tokens_seen": 194428192, + "step": 159790 + }, + { + "epoch": 17.796525225526228, + "grad_norm": 0.18852072954177856, + "learning_rate": 1.8262698367047444e-06, + "loss": 0.0043, + "num_input_tokens_seen": 194434432, + "step": 159795 + }, + { + "epoch": 17.797082080409844, + "grad_norm": 0.031962256878614426, + "learning_rate": 1.8253583404968854e-06, + "loss": 0.042, + "num_input_tokens_seen": 194440576, + "step": 159800 + }, + { + "epoch": 17.797638935293463, + "grad_norm": 0.017218366265296936, + "learning_rate": 1.8244470631920836e-06, + "loss": 0.0099, + "num_input_tokens_seen": 194446688, + "step": 159805 + }, + { + "epoch": 17.79819579017708, + "grad_norm": 0.09491617232561111, + "learning_rate": 1.8235360047989453e-06, + "loss": 0.0122, + "num_input_tokens_seen": 194452384, + "step": 159810 + }, + { + "epoch": 17.7987526450607, + "grad_norm": 0.25469550490379333, + "learning_rate": 1.8226251653260806e-06, + "loss": 0.1842, + "num_input_tokens_seen": 194458624, + "step": 159815 + }, + { + "epoch": 17.799309499944314, + "grad_norm": 0.0002676235744729638, + "learning_rate": 1.821714544782091e-06, + "loss": 0.0918, + "num_input_tokens_seen": 194464928, + "step": 159820 + }, + { + "epoch": 17.79986635482793, + "grad_norm": 0.15556669235229492, + "learning_rate": 1.8208041431755752e-06, + "loss": 0.0171, + "num_input_tokens_seen": 194470976, + "step": 159825 + }, + { + "epoch": 17.80042320971155, + "grad_norm": 1.1812012195587158, + "learning_rate": 1.8198939605151344e-06, + "loss": 0.0295, + "num_input_tokens_seen": 194477088, + "step": 159830 + }, + { + "epoch": 17.800980064595166, + "grad_norm": 0.5245797634124756, + "learning_rate": 1.8189839968093703e-06, + "loss": 0.0309, + "num_input_tokens_seen": 194483264, + "step": 159835 + }, + { + "epoch": 17.801536919478785, + "grad_norm": 0.00016287423204630613, + "learning_rate": 1.8180742520668703e-06, + "loss": 0.0549, + "num_input_tokens_seen": 194489568, + "step": 159840 + }, + { + "epoch": 17.8020937743624, + "grad_norm": 2.387549638748169, + "learning_rate": 1.8171647262962361e-06, + "loss": 0.0487, + "num_input_tokens_seen": 194495104, + "step": 159845 + }, + { + "epoch": 17.802650629246017, + "grad_norm": 0.338554322719574, + "learning_rate": 1.8162554195060523e-06, + "loss": 0.0751, + "num_input_tokens_seen": 194501344, + "step": 159850 + }, + { + "epoch": 17.803207484129636, + "grad_norm": 0.011976396664977074, + "learning_rate": 1.8153463317049146e-06, + "loss": 0.1308, + "num_input_tokens_seen": 194507232, + "step": 159855 + }, + { + "epoch": 17.803764339013252, + "grad_norm": 0.004823542665690184, + "learning_rate": 1.8144374629013999e-06, + "loss": 0.0873, + "num_input_tokens_seen": 194513760, + "step": 159860 + }, + { + "epoch": 17.80432119389687, + "grad_norm": 0.0046891565434634686, + "learning_rate": 1.8135288131041038e-06, + "loss": 0.0093, + "num_input_tokens_seen": 194520192, + "step": 159865 + }, + { + "epoch": 17.804878048780488, + "grad_norm": 0.6808540225028992, + "learning_rate": 1.8126203823216032e-06, + "loss": 0.0085, + "num_input_tokens_seen": 194526016, + "step": 159870 + }, + { + "epoch": 17.805434903664104, + "grad_norm": 0.003564742859452963, + "learning_rate": 1.8117121705624822e-06, + "loss": 0.0117, + "num_input_tokens_seen": 194532224, + "step": 159875 + }, + { + "epoch": 17.805991758547723, + "grad_norm": 0.7854744791984558, + "learning_rate": 1.8108041778353152e-06, + "loss": 0.1002, + "num_input_tokens_seen": 194538400, + "step": 159880 + }, + { + "epoch": 17.80654861343134, + "grad_norm": 0.007162455935031176, + "learning_rate": 1.8098964041486838e-06, + "loss": 0.0082, + "num_input_tokens_seen": 194544448, + "step": 159885 + }, + { + "epoch": 17.80710546831496, + "grad_norm": 1.6179728507995605, + "learning_rate": 1.8089888495111563e-06, + "loss": 0.0623, + "num_input_tokens_seen": 194550432, + "step": 159890 + }, + { + "epoch": 17.807662323198574, + "grad_norm": 0.7277178764343262, + "learning_rate": 1.8080815139313172e-06, + "loss": 0.0662, + "num_input_tokens_seen": 194556576, + "step": 159895 + }, + { + "epoch": 17.80821917808219, + "grad_norm": 0.0005511320196092129, + "learning_rate": 1.8071743974177213e-06, + "loss": 0.0489, + "num_input_tokens_seen": 194562816, + "step": 159900 + }, + { + "epoch": 17.80877603296581, + "grad_norm": 0.12166353315114975, + "learning_rate": 1.8062674999789502e-06, + "loss": 0.0029, + "num_input_tokens_seen": 194568832, + "step": 159905 + }, + { + "epoch": 17.809332887849425, + "grad_norm": 0.07498742640018463, + "learning_rate": 1.8053608216235613e-06, + "loss": 0.0141, + "num_input_tokens_seen": 194574976, + "step": 159910 + }, + { + "epoch": 17.809889742733045, + "grad_norm": 0.8537826538085938, + "learning_rate": 1.804454362360125e-06, + "loss": 0.0361, + "num_input_tokens_seen": 194580768, + "step": 159915 + }, + { + "epoch": 17.81044659761666, + "grad_norm": 0.014783281832933426, + "learning_rate": 1.8035481221972018e-06, + "loss": 0.0689, + "num_input_tokens_seen": 194586720, + "step": 159920 + }, + { + "epoch": 17.811003452500277, + "grad_norm": 0.0018562241457402706, + "learning_rate": 1.8026421011433508e-06, + "loss": 0.1098, + "num_input_tokens_seen": 194592768, + "step": 159925 + }, + { + "epoch": 17.811560307383896, + "grad_norm": 0.003929630853235722, + "learning_rate": 1.8017362992071295e-06, + "loss": 0.0005, + "num_input_tokens_seen": 194599008, + "step": 159930 + }, + { + "epoch": 17.812117162267512, + "grad_norm": 0.1791892647743225, + "learning_rate": 1.800830716397095e-06, + "loss": 0.0163, + "num_input_tokens_seen": 194605536, + "step": 159935 + }, + { + "epoch": 17.81267401715113, + "grad_norm": 0.021872764453291893, + "learning_rate": 1.799925352721804e-06, + "loss": 0.0009, + "num_input_tokens_seen": 194611936, + "step": 159940 + }, + { + "epoch": 17.813230872034747, + "grad_norm": 0.020628495141863823, + "learning_rate": 1.7990202081898056e-06, + "loss": 0.0307, + "num_input_tokens_seen": 194617888, + "step": 159945 + }, + { + "epoch": 17.813787726918363, + "grad_norm": 1.5461885929107666, + "learning_rate": 1.7981152828096425e-06, + "loss": 0.0852, + "num_input_tokens_seen": 194623680, + "step": 159950 + }, + { + "epoch": 17.814344581801983, + "grad_norm": 0.16355599462985992, + "learning_rate": 1.7972105765898777e-06, + "loss": 0.0075, + "num_input_tokens_seen": 194629824, + "step": 159955 + }, + { + "epoch": 17.8149014366856, + "grad_norm": 0.00043010906665585935, + "learning_rate": 1.7963060895390404e-06, + "loss": 0.0241, + "num_input_tokens_seen": 194636064, + "step": 159960 + }, + { + "epoch": 17.815458291569218, + "grad_norm": 0.006514870561659336, + "learning_rate": 1.7954018216656958e-06, + "loss": 0.0114, + "num_input_tokens_seen": 194642336, + "step": 159965 + }, + { + "epoch": 17.816015146452834, + "grad_norm": 0.010690070688724518, + "learning_rate": 1.7944977729783596e-06, + "loss": 0.0147, + "num_input_tokens_seen": 194648000, + "step": 159970 + }, + { + "epoch": 17.816572001336453, + "grad_norm": 0.07269321382045746, + "learning_rate": 1.7935939434855913e-06, + "loss": 0.0024, + "num_input_tokens_seen": 194653824, + "step": 159975 + }, + { + "epoch": 17.81712885622007, + "grad_norm": 1.937467336654663, + "learning_rate": 1.7926903331959149e-06, + "loss": 0.0922, + "num_input_tokens_seen": 194659936, + "step": 159980 + }, + { + "epoch": 17.817685711103685, + "grad_norm": 0.14811889827251434, + "learning_rate": 1.7917869421178763e-06, + "loss": 0.0154, + "num_input_tokens_seen": 194666080, + "step": 159985 + }, + { + "epoch": 17.818242565987305, + "grad_norm": 0.038647573441267014, + "learning_rate": 1.7908837702600017e-06, + "loss": 0.0017, + "num_input_tokens_seen": 194671808, + "step": 159990 + }, + { + "epoch": 17.81879942087092, + "grad_norm": 1.3905906677246094, + "learning_rate": 1.789980817630829e-06, + "loss": 0.042, + "num_input_tokens_seen": 194677888, + "step": 159995 + }, + { + "epoch": 17.81935627575454, + "grad_norm": 0.004842647351324558, + "learning_rate": 1.7890780842388765e-06, + "loss": 0.0035, + "num_input_tokens_seen": 194684288, + "step": 160000 + }, + { + "epoch": 17.819913130638156, + "grad_norm": 1.5602041482925415, + "learning_rate": 1.7881755700926817e-06, + "loss": 0.0111, + "num_input_tokens_seen": 194690432, + "step": 160005 + }, + { + "epoch": 17.820469985521772, + "grad_norm": 0.7930908799171448, + "learning_rate": 1.7872732752007654e-06, + "loss": 0.0241, + "num_input_tokens_seen": 194696128, + "step": 160010 + }, + { + "epoch": 17.82102684040539, + "grad_norm": 0.24785953760147095, + "learning_rate": 1.7863711995716515e-06, + "loss": 0.0111, + "num_input_tokens_seen": 194702304, + "step": 160015 + }, + { + "epoch": 17.821583695289007, + "grad_norm": 0.00028041511541232467, + "learning_rate": 1.7854693432138607e-06, + "loss": 0.0051, + "num_input_tokens_seen": 194708864, + "step": 160020 + }, + { + "epoch": 17.822140550172627, + "grad_norm": 1.328652262687683, + "learning_rate": 1.7845677061359062e-06, + "loss": 0.0783, + "num_input_tokens_seen": 194715008, + "step": 160025 + }, + { + "epoch": 17.822697405056243, + "grad_norm": 1.5052006244659424, + "learning_rate": 1.783666288346314e-06, + "loss": 0.1114, + "num_input_tokens_seen": 194720992, + "step": 160030 + }, + { + "epoch": 17.82325425993986, + "grad_norm": 0.3149445354938507, + "learning_rate": 1.782765089853594e-06, + "loss": 0.0125, + "num_input_tokens_seen": 194727136, + "step": 160035 + }, + { + "epoch": 17.823811114823478, + "grad_norm": 1.4296154975891113, + "learning_rate": 1.7818641106662593e-06, + "loss": 0.2122, + "num_input_tokens_seen": 194733216, + "step": 160040 + }, + { + "epoch": 17.824367969707094, + "grad_norm": 1.39310884475708, + "learning_rate": 1.7809633507928165e-06, + "loss": 0.0519, + "num_input_tokens_seen": 194739424, + "step": 160045 + }, + { + "epoch": 17.824924824590713, + "grad_norm": 0.030468614771962166, + "learning_rate": 1.780062810241781e-06, + "loss": 0.0104, + "num_input_tokens_seen": 194745728, + "step": 160050 + }, + { + "epoch": 17.82548167947433, + "grad_norm": 2.006237745285034, + "learning_rate": 1.7791624890216519e-06, + "loss": 0.2474, + "num_input_tokens_seen": 194751840, + "step": 160055 + }, + { + "epoch": 17.826038534357945, + "grad_norm": 0.010152017697691917, + "learning_rate": 1.7782623871409414e-06, + "loss": 0.0149, + "num_input_tokens_seen": 194758048, + "step": 160060 + }, + { + "epoch": 17.826595389241565, + "grad_norm": 0.00030623030033893883, + "learning_rate": 1.7773625046081488e-06, + "loss": 0.0314, + "num_input_tokens_seen": 194764096, + "step": 160065 + }, + { + "epoch": 17.82715224412518, + "grad_norm": 0.2908851206302643, + "learning_rate": 1.7764628414317723e-06, + "loss": 0.0246, + "num_input_tokens_seen": 194770240, + "step": 160070 + }, + { + "epoch": 17.8277090990088, + "grad_norm": 1.887160062789917, + "learning_rate": 1.7755633976203056e-06, + "loss": 0.0269, + "num_input_tokens_seen": 194776192, + "step": 160075 + }, + { + "epoch": 17.828265953892416, + "grad_norm": 0.0020810316782444715, + "learning_rate": 1.7746641731822555e-06, + "loss": 0.0082, + "num_input_tokens_seen": 194782304, + "step": 160080 + }, + { + "epoch": 17.82882280877603, + "grad_norm": 0.0037105896044522524, + "learning_rate": 1.773765168126107e-06, + "loss": 0.0113, + "num_input_tokens_seen": 194788704, + "step": 160085 + }, + { + "epoch": 17.82937966365965, + "grad_norm": 0.4507349729537964, + "learning_rate": 1.7728663824603586e-06, + "loss": 0.015, + "num_input_tokens_seen": 194794912, + "step": 160090 + }, + { + "epoch": 17.829936518543267, + "grad_norm": 0.05002728849649429, + "learning_rate": 1.77196781619349e-06, + "loss": 0.0908, + "num_input_tokens_seen": 194800768, + "step": 160095 + }, + { + "epoch": 17.830493373426886, + "grad_norm": 1.261502742767334, + "learning_rate": 1.7710694693339997e-06, + "loss": 0.0568, + "num_input_tokens_seen": 194806656, + "step": 160100 + }, + { + "epoch": 17.831050228310502, + "grad_norm": 0.0003932022664230317, + "learning_rate": 1.770171341890367e-06, + "loss": 0.0211, + "num_input_tokens_seen": 194812704, + "step": 160105 + }, + { + "epoch": 17.83160708319412, + "grad_norm": 0.026917433366179466, + "learning_rate": 1.7692734338710826e-06, + "loss": 0.0075, + "num_input_tokens_seen": 194818720, + "step": 160110 + }, + { + "epoch": 17.832163938077738, + "grad_norm": 0.07489760220050812, + "learning_rate": 1.7683757452846173e-06, + "loss": 0.0041, + "num_input_tokens_seen": 194824864, + "step": 160115 + }, + { + "epoch": 17.832720792961354, + "grad_norm": 0.0024353167973458767, + "learning_rate": 1.7674782761394587e-06, + "loss": 0.1012, + "num_input_tokens_seen": 194830784, + "step": 160120 + }, + { + "epoch": 17.833277647844973, + "grad_norm": 0.004416429437696934, + "learning_rate": 1.766581026444078e-06, + "loss": 0.0031, + "num_input_tokens_seen": 194837120, + "step": 160125 + }, + { + "epoch": 17.83383450272859, + "grad_norm": 0.6426443457603455, + "learning_rate": 1.765683996206957e-06, + "loss": 0.0299, + "num_input_tokens_seen": 194843392, + "step": 160130 + }, + { + "epoch": 17.834391357612205, + "grad_norm": 0.035353995859622955, + "learning_rate": 1.7647871854365644e-06, + "loss": 0.0162, + "num_input_tokens_seen": 194849536, + "step": 160135 + }, + { + "epoch": 17.834948212495824, + "grad_norm": 0.006342042703181505, + "learning_rate": 1.7638905941413763e-06, + "loss": 0.0234, + "num_input_tokens_seen": 194855648, + "step": 160140 + }, + { + "epoch": 17.83550506737944, + "grad_norm": 0.08681493252515793, + "learning_rate": 1.7629942223298502e-06, + "loss": 0.0111, + "num_input_tokens_seen": 194861760, + "step": 160145 + }, + { + "epoch": 17.83606192226306, + "grad_norm": 1.0647954940795898, + "learning_rate": 1.7620980700104679e-06, + "loss": 0.1042, + "num_input_tokens_seen": 194867328, + "step": 160150 + }, + { + "epoch": 17.836618777146676, + "grad_norm": 0.000524184841196984, + "learning_rate": 1.7612021371916838e-06, + "loss": 0.0358, + "num_input_tokens_seen": 194873184, + "step": 160155 + }, + { + "epoch": 17.83717563203029, + "grad_norm": 0.7562025785446167, + "learning_rate": 1.760306423881966e-06, + "loss": 0.0275, + "num_input_tokens_seen": 194879264, + "step": 160160 + }, + { + "epoch": 17.83773248691391, + "grad_norm": 0.107445128262043, + "learning_rate": 1.7594109300897693e-06, + "loss": 0.0233, + "num_input_tokens_seen": 194885248, + "step": 160165 + }, + { + "epoch": 17.838289341797527, + "grad_norm": 0.002949697896838188, + "learning_rate": 1.7585156558235616e-06, + "loss": 0.0206, + "num_input_tokens_seen": 194891328, + "step": 160170 + }, + { + "epoch": 17.838846196681146, + "grad_norm": 0.07902798801660538, + "learning_rate": 1.757620601091789e-06, + "loss": 0.0102, + "num_input_tokens_seen": 194897472, + "step": 160175 + }, + { + "epoch": 17.839403051564762, + "grad_norm": 0.039673708379268646, + "learning_rate": 1.7567257659029196e-06, + "loss": 0.1102, + "num_input_tokens_seen": 194903424, + "step": 160180 + }, + { + "epoch": 17.839959906448378, + "grad_norm": 0.15237146615982056, + "learning_rate": 1.7558311502653885e-06, + "loss": 0.0655, + "num_input_tokens_seen": 194909152, + "step": 160185 + }, + { + "epoch": 17.840516761331997, + "grad_norm": 1.1593350172042847, + "learning_rate": 1.754936754187661e-06, + "loss": 0.0913, + "num_input_tokens_seen": 194915168, + "step": 160190 + }, + { + "epoch": 17.841073616215613, + "grad_norm": 0.5586010813713074, + "learning_rate": 1.7540425776781748e-06, + "loss": 0.0119, + "num_input_tokens_seen": 194921632, + "step": 160195 + }, + { + "epoch": 17.841630471099233, + "grad_norm": 0.3551414906978607, + "learning_rate": 1.7531486207453845e-06, + "loss": 0.0683, + "num_input_tokens_seen": 194927904, + "step": 160200 + }, + { + "epoch": 17.84218732598285, + "grad_norm": 0.04730202630162239, + "learning_rate": 1.7522548833977303e-06, + "loss": 0.0245, + "num_input_tokens_seen": 194934112, + "step": 160205 + }, + { + "epoch": 17.842744180866465, + "grad_norm": 2.585251569747925, + "learning_rate": 1.7513613656436557e-06, + "loss": 0.0547, + "num_input_tokens_seen": 194940032, + "step": 160210 + }, + { + "epoch": 17.843301035750084, + "grad_norm": 0.23260146379470825, + "learning_rate": 1.7504680674915952e-06, + "loss": 0.0163, + "num_input_tokens_seen": 194946496, + "step": 160215 + }, + { + "epoch": 17.8438578906337, + "grad_norm": 0.001080540125258267, + "learning_rate": 1.7495749889499924e-06, + "loss": 0.0668, + "num_input_tokens_seen": 194953056, + "step": 160220 + }, + { + "epoch": 17.84441474551732, + "grad_norm": 0.14705868065357208, + "learning_rate": 1.748682130027285e-06, + "loss": 0.0403, + "num_input_tokens_seen": 194959168, + "step": 160225 + }, + { + "epoch": 17.844971600400935, + "grad_norm": 0.7554773092269897, + "learning_rate": 1.747789490731902e-06, + "loss": 0.0196, + "num_input_tokens_seen": 194965280, + "step": 160230 + }, + { + "epoch": 17.84552845528455, + "grad_norm": 0.4653404653072357, + "learning_rate": 1.7468970710722731e-06, + "loss": 0.1281, + "num_input_tokens_seen": 194971488, + "step": 160235 + }, + { + "epoch": 17.84608531016817, + "grad_norm": 0.0488494448363781, + "learning_rate": 1.746004871056836e-06, + "loss": 0.0105, + "num_input_tokens_seen": 194977632, + "step": 160240 + }, + { + "epoch": 17.846642165051787, + "grad_norm": 0.0016245665028691292, + "learning_rate": 1.745112890694009e-06, + "loss": 0.0995, + "num_input_tokens_seen": 194983712, + "step": 160245 + }, + { + "epoch": 17.847199019935406, + "grad_norm": 0.1731872856616974, + "learning_rate": 1.7442211299922267e-06, + "loss": 0.0809, + "num_input_tokens_seen": 194990240, + "step": 160250 + }, + { + "epoch": 17.847755874819022, + "grad_norm": 0.11132965236902237, + "learning_rate": 1.7433295889599078e-06, + "loss": 0.1138, + "num_input_tokens_seen": 194996352, + "step": 160255 + }, + { + "epoch": 17.848312729702638, + "grad_norm": 0.046741195023059845, + "learning_rate": 1.742438267605473e-06, + "loss": 0.1066, + "num_input_tokens_seen": 195002368, + "step": 160260 + }, + { + "epoch": 17.848869584586257, + "grad_norm": 0.052884556353092194, + "learning_rate": 1.7415471659373377e-06, + "loss": 0.0434, + "num_input_tokens_seen": 195008032, + "step": 160265 + }, + { + "epoch": 17.849426439469873, + "grad_norm": 1.766442894935608, + "learning_rate": 1.740656283963929e-06, + "loss": 0.1014, + "num_input_tokens_seen": 195013568, + "step": 160270 + }, + { + "epoch": 17.849983294353493, + "grad_norm": 0.7303229570388794, + "learning_rate": 1.7397656216936564e-06, + "loss": 0.059, + "num_input_tokens_seen": 195019808, + "step": 160275 + }, + { + "epoch": 17.85054014923711, + "grad_norm": 0.0036018795799463987, + "learning_rate": 1.7388751791349356e-06, + "loss": 0.0146, + "num_input_tokens_seen": 195025792, + "step": 160280 + }, + { + "epoch": 17.851097004120724, + "grad_norm": 1.223497986793518, + "learning_rate": 1.737984956296168e-06, + "loss": 0.0481, + "num_input_tokens_seen": 195031936, + "step": 160285 + }, + { + "epoch": 17.851653859004344, + "grad_norm": 0.10928186029195786, + "learning_rate": 1.737094953185775e-06, + "loss": 0.0378, + "num_input_tokens_seen": 195038016, + "step": 160290 + }, + { + "epoch": 17.85221071388796, + "grad_norm": 1.9700886011123657, + "learning_rate": 1.736205169812155e-06, + "loss": 0.0502, + "num_input_tokens_seen": 195044512, + "step": 160295 + }, + { + "epoch": 17.85276756877158, + "grad_norm": 0.2122548222541809, + "learning_rate": 1.735315606183724e-06, + "loss": 0.0051, + "num_input_tokens_seen": 195050880, + "step": 160300 + }, + { + "epoch": 17.853324423655195, + "grad_norm": 0.2624756097793579, + "learning_rate": 1.7344262623088664e-06, + "loss": 0.0082, + "num_input_tokens_seen": 195056832, + "step": 160305 + }, + { + "epoch": 17.853881278538815, + "grad_norm": 0.07001116871833801, + "learning_rate": 1.733537138195998e-06, + "loss": 0.014, + "num_input_tokens_seen": 195062624, + "step": 160310 + }, + { + "epoch": 17.85443813342243, + "grad_norm": 1.4203016757965088, + "learning_rate": 1.7326482338535095e-06, + "loss": 0.0767, + "num_input_tokens_seen": 195068384, + "step": 160315 + }, + { + "epoch": 17.854994988306046, + "grad_norm": 0.10656039416790009, + "learning_rate": 1.731759549289802e-06, + "loss": 0.1681, + "num_input_tokens_seen": 195074272, + "step": 160320 + }, + { + "epoch": 17.855551843189666, + "grad_norm": 0.06035078689455986, + "learning_rate": 1.7308710845132663e-06, + "loss": 0.1358, + "num_input_tokens_seen": 195080000, + "step": 160325 + }, + { + "epoch": 17.85610869807328, + "grad_norm": 0.7644681930541992, + "learning_rate": 1.7299828395322987e-06, + "loss": 0.0477, + "num_input_tokens_seen": 195086208, + "step": 160330 + }, + { + "epoch": 17.856665552956898, + "grad_norm": 0.7743719816207886, + "learning_rate": 1.7290948143552837e-06, + "loss": 0.0306, + "num_input_tokens_seen": 195092224, + "step": 160335 + }, + { + "epoch": 17.857222407840517, + "grad_norm": 1.2185454368591309, + "learning_rate": 1.728207008990615e-06, + "loss": 0.0692, + "num_input_tokens_seen": 195098144, + "step": 160340 + }, + { + "epoch": 17.857779262724133, + "grad_norm": 8.588387572672218e-05, + "learning_rate": 1.7273194234466744e-06, + "loss": 0.0704, + "num_input_tokens_seen": 195104512, + "step": 160345 + }, + { + "epoch": 17.858336117607752, + "grad_norm": 0.00199309759773314, + "learning_rate": 1.7264320577318498e-06, + "loss": 0.0316, + "num_input_tokens_seen": 195110528, + "step": 160350 + }, + { + "epoch": 17.85889297249137, + "grad_norm": 0.2970304489135742, + "learning_rate": 1.725544911854518e-06, + "loss": 0.0051, + "num_input_tokens_seen": 195116704, + "step": 160355 + }, + { + "epoch": 17.859449827374988, + "grad_norm": 1.0326948165893555, + "learning_rate": 1.7246579858230638e-06, + "loss": 0.017, + "num_input_tokens_seen": 195122144, + "step": 160360 + }, + { + "epoch": 17.860006682258604, + "grad_norm": 0.5554624199867249, + "learning_rate": 1.7237712796458582e-06, + "loss": 0.0526, + "num_input_tokens_seen": 195128384, + "step": 160365 + }, + { + "epoch": 17.86056353714222, + "grad_norm": 0.9533551931381226, + "learning_rate": 1.7228847933312893e-06, + "loss": 0.0205, + "num_input_tokens_seen": 195134368, + "step": 160370 + }, + { + "epoch": 17.86112039202584, + "grad_norm": 0.004677262622863054, + "learning_rate": 1.7219985268877165e-06, + "loss": 0.0139, + "num_input_tokens_seen": 195140480, + "step": 160375 + }, + { + "epoch": 17.861677246909455, + "grad_norm": 0.00023388145200442523, + "learning_rate": 1.7211124803235224e-06, + "loss": 0.0101, + "num_input_tokens_seen": 195146560, + "step": 160380 + }, + { + "epoch": 17.862234101793074, + "grad_norm": 0.00043958332389593124, + "learning_rate": 1.7202266536470668e-06, + "loss": 0.0074, + "num_input_tokens_seen": 195152672, + "step": 160385 + }, + { + "epoch": 17.86279095667669, + "grad_norm": 0.1157802939414978, + "learning_rate": 1.7193410468667237e-06, + "loss": 0.0426, + "num_input_tokens_seen": 195158336, + "step": 160390 + }, + { + "epoch": 17.863347811560306, + "grad_norm": 0.3865858316421509, + "learning_rate": 1.7184556599908586e-06, + "loss": 0.0101, + "num_input_tokens_seen": 195164864, + "step": 160395 + }, + { + "epoch": 17.863904666443926, + "grad_norm": 0.03929033875465393, + "learning_rate": 1.7175704930278313e-06, + "loss": 0.0744, + "num_input_tokens_seen": 195171072, + "step": 160400 + }, + { + "epoch": 17.86446152132754, + "grad_norm": 0.34470829367637634, + "learning_rate": 1.716685545986002e-06, + "loss": 0.0246, + "num_input_tokens_seen": 195177056, + "step": 160405 + }, + { + "epoch": 17.86501837621116, + "grad_norm": 1.1258959770202637, + "learning_rate": 1.7158008188737362e-06, + "loss": 0.0182, + "num_input_tokens_seen": 195183200, + "step": 160410 + }, + { + "epoch": 17.865575231094777, + "grad_norm": 1.5005929470062256, + "learning_rate": 1.7149163116993854e-06, + "loss": 0.0377, + "num_input_tokens_seen": 195189152, + "step": 160415 + }, + { + "epoch": 17.866132085978393, + "grad_norm": 0.8270615339279175, + "learning_rate": 1.714032024471307e-06, + "loss": 0.0456, + "num_input_tokens_seen": 195195040, + "step": 160420 + }, + { + "epoch": 17.866688940862012, + "grad_norm": 0.02599325403571129, + "learning_rate": 1.713147957197847e-06, + "loss": 0.0531, + "num_input_tokens_seen": 195201280, + "step": 160425 + }, + { + "epoch": 17.867245795745628, + "grad_norm": 0.0005445117130875587, + "learning_rate": 1.7122641098873653e-06, + "loss": 0.0055, + "num_input_tokens_seen": 195207456, + "step": 160430 + }, + { + "epoch": 17.867802650629248, + "grad_norm": 0.008897312916815281, + "learning_rate": 1.7113804825482082e-06, + "loss": 0.0265, + "num_input_tokens_seen": 195213760, + "step": 160435 + }, + { + "epoch": 17.868359505512863, + "grad_norm": 0.8050091862678528, + "learning_rate": 1.7104970751887217e-06, + "loss": 0.0222, + "num_input_tokens_seen": 195220000, + "step": 160440 + }, + { + "epoch": 17.86891636039648, + "grad_norm": 0.7504594922065735, + "learning_rate": 1.7096138878172491e-06, + "loss": 0.0177, + "num_input_tokens_seen": 195226176, + "step": 160445 + }, + { + "epoch": 17.8694732152801, + "grad_norm": 0.0070579820312559605, + "learning_rate": 1.708730920442128e-06, + "loss": 0.0047, + "num_input_tokens_seen": 195232480, + "step": 160450 + }, + { + "epoch": 17.870030070163715, + "grad_norm": 0.5694994330406189, + "learning_rate": 1.7078481730717078e-06, + "loss": 0.0265, + "num_input_tokens_seen": 195238880, + "step": 160455 + }, + { + "epoch": 17.870586925047334, + "grad_norm": 0.15881454944610596, + "learning_rate": 1.7069656457143202e-06, + "loss": 0.0068, + "num_input_tokens_seen": 195244864, + "step": 160460 + }, + { + "epoch": 17.87114377993095, + "grad_norm": 0.13302209973335266, + "learning_rate": 1.7060833383783086e-06, + "loss": 0.0082, + "num_input_tokens_seen": 195251008, + "step": 160465 + }, + { + "epoch": 17.871700634814566, + "grad_norm": 0.011693391017615795, + "learning_rate": 1.7052012510720001e-06, + "loss": 0.0089, + "num_input_tokens_seen": 195256800, + "step": 160470 + }, + { + "epoch": 17.872257489698185, + "grad_norm": 0.15591931343078613, + "learning_rate": 1.7043193838037318e-06, + "loss": 0.0208, + "num_input_tokens_seen": 195262816, + "step": 160475 + }, + { + "epoch": 17.8728143445818, + "grad_norm": 0.0870043933391571, + "learning_rate": 1.7034377365818254e-06, + "loss": 0.03, + "num_input_tokens_seen": 195268704, + "step": 160480 + }, + { + "epoch": 17.87337119946542, + "grad_norm": 0.028392214328050613, + "learning_rate": 1.7025563094146214e-06, + "loss": 0.0006, + "num_input_tokens_seen": 195274880, + "step": 160485 + }, + { + "epoch": 17.873928054349037, + "grad_norm": 0.6275810599327087, + "learning_rate": 1.7016751023104349e-06, + "loss": 0.0144, + "num_input_tokens_seen": 195280768, + "step": 160490 + }, + { + "epoch": 17.874484909232653, + "grad_norm": 0.3046953082084656, + "learning_rate": 1.7007941152775958e-06, + "loss": 0.0153, + "num_input_tokens_seen": 195286720, + "step": 160495 + }, + { + "epoch": 17.875041764116272, + "grad_norm": 0.11243367940187454, + "learning_rate": 1.6999133483244195e-06, + "loss": 0.0654, + "num_input_tokens_seen": 195292928, + "step": 160500 + }, + { + "epoch": 17.875598618999888, + "grad_norm": 1.3297559022903442, + "learning_rate": 1.6990328014592327e-06, + "loss": 0.049, + "num_input_tokens_seen": 195298464, + "step": 160505 + }, + { + "epoch": 17.876155473883507, + "grad_norm": 0.12783902883529663, + "learning_rate": 1.6981524746903455e-06, + "loss": 0.0128, + "num_input_tokens_seen": 195304256, + "step": 160510 + }, + { + "epoch": 17.876712328767123, + "grad_norm": 0.03573981672525406, + "learning_rate": 1.6972723680260843e-06, + "loss": 0.0328, + "num_input_tokens_seen": 195310816, + "step": 160515 + }, + { + "epoch": 17.87726918365074, + "grad_norm": 0.05322062969207764, + "learning_rate": 1.6963924814747483e-06, + "loss": 0.0461, + "num_input_tokens_seen": 195316960, + "step": 160520 + }, + { + "epoch": 17.87782603853436, + "grad_norm": 0.0005125583265908062, + "learning_rate": 1.6955128150446587e-06, + "loss": 0.0175, + "num_input_tokens_seen": 195323136, + "step": 160525 + }, + { + "epoch": 17.878382893417974, + "grad_norm": 0.03335520252585411, + "learning_rate": 1.6946333687441197e-06, + "loss": 0.0245, + "num_input_tokens_seen": 195329184, + "step": 160530 + }, + { + "epoch": 17.878939748301594, + "grad_norm": 1.0141743421554565, + "learning_rate": 1.6937541425814442e-06, + "loss": 0.0181, + "num_input_tokens_seen": 195335200, + "step": 160535 + }, + { + "epoch": 17.87949660318521, + "grad_norm": 0.012862036004662514, + "learning_rate": 1.6928751365649309e-06, + "loss": 0.064, + "num_input_tokens_seen": 195341504, + "step": 160540 + }, + { + "epoch": 17.880053458068826, + "grad_norm": 0.1983317881822586, + "learning_rate": 1.6919963507028874e-06, + "loss": 0.0321, + "num_input_tokens_seen": 195347360, + "step": 160545 + }, + { + "epoch": 17.880610312952445, + "grad_norm": 0.5369365811347961, + "learning_rate": 1.6911177850036097e-06, + "loss": 0.0177, + "num_input_tokens_seen": 195353536, + "step": 160550 + }, + { + "epoch": 17.88116716783606, + "grad_norm": 0.8504006862640381, + "learning_rate": 1.6902394394754023e-06, + "loss": 0.0308, + "num_input_tokens_seen": 195359552, + "step": 160555 + }, + { + "epoch": 17.88172402271968, + "grad_norm": 0.044092774391174316, + "learning_rate": 1.6893613141265585e-06, + "loss": 0.011, + "num_input_tokens_seen": 195365600, + "step": 160560 + }, + { + "epoch": 17.882280877603296, + "grad_norm": 0.4763029217720032, + "learning_rate": 1.6884834089653717e-06, + "loss": 0.0077, + "num_input_tokens_seen": 195371584, + "step": 160565 + }, + { + "epoch": 17.882837732486912, + "grad_norm": 0.9379106760025024, + "learning_rate": 1.6876057240001353e-06, + "loss": 0.0112, + "num_input_tokens_seen": 195377664, + "step": 160570 + }, + { + "epoch": 17.883394587370532, + "grad_norm": 0.008980144746601582, + "learning_rate": 1.6867282592391426e-06, + "loss": 0.0107, + "num_input_tokens_seen": 195383808, + "step": 160575 + }, + { + "epoch": 17.883951442254148, + "grad_norm": 0.5388281345367432, + "learning_rate": 1.685851014690676e-06, + "loss": 0.0117, + "num_input_tokens_seen": 195390016, + "step": 160580 + }, + { + "epoch": 17.884508297137767, + "grad_norm": 0.15567773580551147, + "learning_rate": 1.6849739903630312e-06, + "loss": 0.0439, + "num_input_tokens_seen": 195395936, + "step": 160585 + }, + { + "epoch": 17.885065152021383, + "grad_norm": 0.0002571528893895447, + "learning_rate": 1.6840971862644827e-06, + "loss": 0.0029, + "num_input_tokens_seen": 195402176, + "step": 160590 + }, + { + "epoch": 17.885622006905, + "grad_norm": 0.421268105506897, + "learning_rate": 1.6832206024033181e-06, + "loss": 0.0399, + "num_input_tokens_seen": 195408096, + "step": 160595 + }, + { + "epoch": 17.88617886178862, + "grad_norm": 0.15141433477401733, + "learning_rate": 1.682344238787814e-06, + "loss": 0.0846, + "num_input_tokens_seen": 195413920, + "step": 160600 + }, + { + "epoch": 17.886735716672234, + "grad_norm": 0.14636507630348206, + "learning_rate": 1.681468095426253e-06, + "loss": 0.1164, + "num_input_tokens_seen": 195419808, + "step": 160605 + }, + { + "epoch": 17.887292571555854, + "grad_norm": 0.06455065310001373, + "learning_rate": 1.6805921723269086e-06, + "loss": 0.0349, + "num_input_tokens_seen": 195426048, + "step": 160610 + }, + { + "epoch": 17.88784942643947, + "grad_norm": 4.58384895324707, + "learning_rate": 1.679716469498052e-06, + "loss": 0.0565, + "num_input_tokens_seen": 195431872, + "step": 160615 + }, + { + "epoch": 17.888406281323086, + "grad_norm": 0.11131355911493301, + "learning_rate": 1.6788409869479577e-06, + "loss": 0.0081, + "num_input_tokens_seen": 195437856, + "step": 160620 + }, + { + "epoch": 17.888963136206705, + "grad_norm": 0.14136624336242676, + "learning_rate": 1.6779657246848963e-06, + "loss": 0.0019, + "num_input_tokens_seen": 195444064, + "step": 160625 + }, + { + "epoch": 17.88951999109032, + "grad_norm": 0.161908358335495, + "learning_rate": 1.6770906827171333e-06, + "loss": 0.0159, + "num_input_tokens_seen": 195449184, + "step": 160630 + }, + { + "epoch": 17.89007684597394, + "grad_norm": 0.0018067726632580161, + "learning_rate": 1.6762158610529349e-06, + "loss": 0.0025, + "num_input_tokens_seen": 195455328, + "step": 160635 + }, + { + "epoch": 17.890633700857556, + "grad_norm": 0.029080931097269058, + "learning_rate": 1.6753412597005635e-06, + "loss": 0.0326, + "num_input_tokens_seen": 195461472, + "step": 160640 + }, + { + "epoch": 17.891190555741176, + "grad_norm": 0.005773029290139675, + "learning_rate": 1.674466878668282e-06, + "loss": 0.0149, + "num_input_tokens_seen": 195467616, + "step": 160645 + }, + { + "epoch": 17.89174741062479, + "grad_norm": 0.004241396673023701, + "learning_rate": 1.6735927179643452e-06, + "loss": 0.0028, + "num_input_tokens_seen": 195473472, + "step": 160650 + }, + { + "epoch": 17.892304265508407, + "grad_norm": 0.0025182508397847414, + "learning_rate": 1.672718777597021e-06, + "loss": 0.0361, + "num_input_tokens_seen": 195479712, + "step": 160655 + }, + { + "epoch": 17.892861120392027, + "grad_norm": 0.00038251353544183075, + "learning_rate": 1.6718450575745531e-06, + "loss": 0.1906, + "num_input_tokens_seen": 195485888, + "step": 160660 + }, + { + "epoch": 17.893417975275643, + "grad_norm": 1.5825895071029663, + "learning_rate": 1.6709715579052015e-06, + "loss": 0.0662, + "num_input_tokens_seen": 195491904, + "step": 160665 + }, + { + "epoch": 17.89397483015926, + "grad_norm": 0.0008427840075455606, + "learning_rate": 1.6700982785972097e-06, + "loss": 0.0036, + "num_input_tokens_seen": 195498272, + "step": 160670 + }, + { + "epoch": 17.894531685042878, + "grad_norm": 0.0019993665628135204, + "learning_rate": 1.6692252196588349e-06, + "loss": 0.0073, + "num_input_tokens_seen": 195503744, + "step": 160675 + }, + { + "epoch": 17.895088539926494, + "grad_norm": 0.03956003859639168, + "learning_rate": 1.668352381098323e-06, + "loss": 0.0731, + "num_input_tokens_seen": 195509920, + "step": 160680 + }, + { + "epoch": 17.895645394810114, + "grad_norm": 0.003943763207644224, + "learning_rate": 1.6674797629239126e-06, + "loss": 0.0017, + "num_input_tokens_seen": 195516288, + "step": 160685 + }, + { + "epoch": 17.89620224969373, + "grad_norm": 0.00010494811431271955, + "learning_rate": 1.6666073651438463e-06, + "loss": 0.0024, + "num_input_tokens_seen": 195522240, + "step": 160690 + }, + { + "epoch": 17.89675910457735, + "grad_norm": 0.06968039274215698, + "learning_rate": 1.6657351877663734e-06, + "loss": 0.0134, + "num_input_tokens_seen": 195527968, + "step": 160695 + }, + { + "epoch": 17.897315959460965, + "grad_norm": 1.28202486038208, + "learning_rate": 1.6648632307997208e-06, + "loss": 0.0484, + "num_input_tokens_seen": 195534208, + "step": 160700 + }, + { + "epoch": 17.89787281434458, + "grad_norm": 0.004942065570503473, + "learning_rate": 1.66399149425214e-06, + "loss": 0.0085, + "num_input_tokens_seen": 195540128, + "step": 160705 + }, + { + "epoch": 17.8984296692282, + "grad_norm": 0.5515185594558716, + "learning_rate": 1.6631199781318469e-06, + "loss": 0.12, + "num_input_tokens_seen": 195546592, + "step": 160710 + }, + { + "epoch": 17.898986524111816, + "grad_norm": 0.015166930854320526, + "learning_rate": 1.6622486824470872e-06, + "loss": 0.025, + "num_input_tokens_seen": 195552672, + "step": 160715 + }, + { + "epoch": 17.899543378995435, + "grad_norm": 0.8876647353172302, + "learning_rate": 1.6613776072060828e-06, + "loss": 0.0414, + "num_input_tokens_seen": 195558880, + "step": 160720 + }, + { + "epoch": 17.90010023387905, + "grad_norm": 0.039407361298799515, + "learning_rate": 1.660506752417071e-06, + "loss": 0.0052, + "num_input_tokens_seen": 195565152, + "step": 160725 + }, + { + "epoch": 17.900657088762667, + "grad_norm": 0.5672886371612549, + "learning_rate": 1.6596361180882703e-06, + "loss": 0.0531, + "num_input_tokens_seen": 195571488, + "step": 160730 + }, + { + "epoch": 17.901213943646287, + "grad_norm": 0.01877303048968315, + "learning_rate": 1.6587657042279048e-06, + "loss": 0.0519, + "num_input_tokens_seen": 195577728, + "step": 160735 + }, + { + "epoch": 17.901770798529903, + "grad_norm": 0.21064768731594086, + "learning_rate": 1.6578955108441957e-06, + "loss": 0.0817, + "num_input_tokens_seen": 195583968, + "step": 160740 + }, + { + "epoch": 17.902327653413522, + "grad_norm": 0.19762566685676575, + "learning_rate": 1.6570255379453698e-06, + "loss": 0.0488, + "num_input_tokens_seen": 195590080, + "step": 160745 + }, + { + "epoch": 17.902884508297138, + "grad_norm": 0.02996329590678215, + "learning_rate": 1.6561557855396398e-06, + "loss": 0.0066, + "num_input_tokens_seen": 195596192, + "step": 160750 + }, + { + "epoch": 17.903441363180754, + "grad_norm": 0.16647453606128693, + "learning_rate": 1.6552862536352214e-06, + "loss": 0.0504, + "num_input_tokens_seen": 195602112, + "step": 160755 + }, + { + "epoch": 17.903998218064373, + "grad_norm": 0.02927066944539547, + "learning_rate": 1.6544169422403221e-06, + "loss": 0.0211, + "num_input_tokens_seen": 195608320, + "step": 160760 + }, + { + "epoch": 17.90455507294799, + "grad_norm": 0.011014345102012157, + "learning_rate": 1.653547851363163e-06, + "loss": 0.0662, + "num_input_tokens_seen": 195614656, + "step": 160765 + }, + { + "epoch": 17.90511192783161, + "grad_norm": 0.00372475478798151, + "learning_rate": 1.6526789810119459e-06, + "loss": 0.0153, + "num_input_tokens_seen": 195620768, + "step": 160770 + }, + { + "epoch": 17.905668782715225, + "grad_norm": 1.7840321063995361, + "learning_rate": 1.651810331194889e-06, + "loss": 0.1193, + "num_input_tokens_seen": 195626944, + "step": 160775 + }, + { + "epoch": 17.90622563759884, + "grad_norm": 0.00010903157090069726, + "learning_rate": 1.6509419019201833e-06, + "loss": 0.0209, + "num_input_tokens_seen": 195633056, + "step": 160780 + }, + { + "epoch": 17.90678249248246, + "grad_norm": 0.0014743885258212686, + "learning_rate": 1.6500736931960414e-06, + "loss": 0.0003, + "num_input_tokens_seen": 195639232, + "step": 160785 + }, + { + "epoch": 17.907339347366076, + "grad_norm": 0.1399238556623459, + "learning_rate": 1.649205705030657e-06, + "loss": 0.0733, + "num_input_tokens_seen": 195644000, + "step": 160790 + }, + { + "epoch": 17.907896202249695, + "grad_norm": 0.006305244751274586, + "learning_rate": 1.6483379374322371e-06, + "loss": 0.0252, + "num_input_tokens_seen": 195650368, + "step": 160795 + }, + { + "epoch": 17.90845305713331, + "grad_norm": 3.131804943084717, + "learning_rate": 1.6474703904089755e-06, + "loss": 0.0255, + "num_input_tokens_seen": 195656224, + "step": 160800 + }, + { + "epoch": 17.909009912016927, + "grad_norm": 0.006106254644691944, + "learning_rate": 1.6466030639690627e-06, + "loss": 0.0194, + "num_input_tokens_seen": 195662432, + "step": 160805 + }, + { + "epoch": 17.909566766900546, + "grad_norm": 0.00508269015699625, + "learning_rate": 1.645735958120695e-06, + "loss": 0.0127, + "num_input_tokens_seen": 195668416, + "step": 160810 + }, + { + "epoch": 17.910123621784162, + "grad_norm": 0.09403613209724426, + "learning_rate": 1.6448690728720627e-06, + "loss": 0.0929, + "num_input_tokens_seen": 195674368, + "step": 160815 + }, + { + "epoch": 17.910680476667782, + "grad_norm": 0.0005370652652345598, + "learning_rate": 1.6440024082313542e-06, + "loss": 0.0368, + "num_input_tokens_seen": 195680448, + "step": 160820 + }, + { + "epoch": 17.911237331551398, + "grad_norm": 0.15719765424728394, + "learning_rate": 1.6431359642067574e-06, + "loss": 0.1266, + "num_input_tokens_seen": 195686688, + "step": 160825 + }, + { + "epoch": 17.911794186435014, + "grad_norm": 1.4293628931045532, + "learning_rate": 1.6422697408064485e-06, + "loss": 0.0655, + "num_input_tokens_seen": 195692896, + "step": 160830 + }, + { + "epoch": 17.912351041318633, + "grad_norm": 0.20946785807609558, + "learning_rate": 1.6414037380386216e-06, + "loss": 0.1032, + "num_input_tokens_seen": 195699296, + "step": 160835 + }, + { + "epoch": 17.91290789620225, + "grad_norm": 0.09523671120405197, + "learning_rate": 1.6405379559114504e-06, + "loss": 0.0027, + "num_input_tokens_seen": 195705536, + "step": 160840 + }, + { + "epoch": 17.91346475108587, + "grad_norm": 0.0012504594633355737, + "learning_rate": 1.6396723944331089e-06, + "loss": 0.0026, + "num_input_tokens_seen": 195711968, + "step": 160845 + }, + { + "epoch": 17.914021605969484, + "grad_norm": 0.8031535744667053, + "learning_rate": 1.638807053611785e-06, + "loss": 0.0117, + "num_input_tokens_seen": 195718368, + "step": 160850 + }, + { + "epoch": 17.9145784608531, + "grad_norm": 1.2367830276489258, + "learning_rate": 1.637941933455639e-06, + "loss": 0.0274, + "num_input_tokens_seen": 195724512, + "step": 160855 + }, + { + "epoch": 17.91513531573672, + "grad_norm": 0.22868438065052032, + "learning_rate": 1.6370770339728504e-06, + "loss": 0.003, + "num_input_tokens_seen": 195730848, + "step": 160860 + }, + { + "epoch": 17.915692170620336, + "grad_norm": 0.3608000874519348, + "learning_rate": 1.6362123551715847e-06, + "loss": 0.039, + "num_input_tokens_seen": 195737216, + "step": 160865 + }, + { + "epoch": 17.916249025503955, + "grad_norm": 3.3042778968811035, + "learning_rate": 1.6353478970600161e-06, + "loss": 0.0817, + "num_input_tokens_seen": 195743616, + "step": 160870 + }, + { + "epoch": 17.91680588038757, + "grad_norm": 0.7546052932739258, + "learning_rate": 1.6344836596463049e-06, + "loss": 0.0844, + "num_input_tokens_seen": 195750176, + "step": 160875 + }, + { + "epoch": 17.917362735271187, + "grad_norm": 0.04930175840854645, + "learning_rate": 1.633619642938619e-06, + "loss": 0.0661, + "num_input_tokens_seen": 195756160, + "step": 160880 + }, + { + "epoch": 17.917919590154806, + "grad_norm": 0.010956630110740662, + "learning_rate": 1.6327558469451081e-06, + "loss": 0.0666, + "num_input_tokens_seen": 195762688, + "step": 160885 + }, + { + "epoch": 17.918476445038422, + "grad_norm": 0.0034405505284667015, + "learning_rate": 1.631892271673946e-06, + "loss": 0.0116, + "num_input_tokens_seen": 195768768, + "step": 160890 + }, + { + "epoch": 17.91903329992204, + "grad_norm": 0.007307565771043301, + "learning_rate": 1.6310289171332843e-06, + "loss": 0.016, + "num_input_tokens_seen": 195774944, + "step": 160895 + }, + { + "epoch": 17.919590154805658, + "grad_norm": 0.5763212442398071, + "learning_rate": 1.630165783331275e-06, + "loss": 0.0492, + "num_input_tokens_seen": 195780960, + "step": 160900 + }, + { + "epoch": 17.920147009689273, + "grad_norm": 0.0886845514178276, + "learning_rate": 1.6293028702760726e-06, + "loss": 0.002, + "num_input_tokens_seen": 195787232, + "step": 160905 + }, + { + "epoch": 17.920703864572893, + "grad_norm": 0.8142114281654358, + "learning_rate": 1.6284401779758318e-06, + "loss": 0.0172, + "num_input_tokens_seen": 195793248, + "step": 160910 + }, + { + "epoch": 17.92126071945651, + "grad_norm": 1.305718183517456, + "learning_rate": 1.6275777064386933e-06, + "loss": 0.0851, + "num_input_tokens_seen": 195799424, + "step": 160915 + }, + { + "epoch": 17.92181757434013, + "grad_norm": 0.03441334143280983, + "learning_rate": 1.626715455672817e-06, + "loss": 0.0028, + "num_input_tokens_seen": 195805408, + "step": 160920 + }, + { + "epoch": 17.922374429223744, + "grad_norm": 0.3621963858604431, + "learning_rate": 1.625853425686333e-06, + "loss": 0.0043, + "num_input_tokens_seen": 195811712, + "step": 160925 + }, + { + "epoch": 17.92293128410736, + "grad_norm": 1.9529175758361816, + "learning_rate": 1.6249916164873925e-06, + "loss": 0.024, + "num_input_tokens_seen": 195817664, + "step": 160930 + }, + { + "epoch": 17.92348813899098, + "grad_norm": 0.0007829822134226561, + "learning_rate": 1.624130028084131e-06, + "loss": 0.0027, + "num_input_tokens_seen": 195823712, + "step": 160935 + }, + { + "epoch": 17.924044993874595, + "grad_norm": 1.7168192863464355, + "learning_rate": 1.6232686604846947e-06, + "loss": 0.0623, + "num_input_tokens_seen": 195829888, + "step": 160940 + }, + { + "epoch": 17.924601848758215, + "grad_norm": 0.0005478115053847432, + "learning_rate": 1.622407513697216e-06, + "loss": 0.0863, + "num_input_tokens_seen": 195836096, + "step": 160945 + }, + { + "epoch": 17.92515870364183, + "grad_norm": 0.09231776744127274, + "learning_rate": 1.6215465877298247e-06, + "loss": 0.0269, + "num_input_tokens_seen": 195842368, + "step": 160950 + }, + { + "epoch": 17.925715558525447, + "grad_norm": 0.010766491293907166, + "learning_rate": 1.6206858825906556e-06, + "loss": 0.0041, + "num_input_tokens_seen": 195848448, + "step": 160955 + }, + { + "epoch": 17.926272413409066, + "grad_norm": 0.14886337518692017, + "learning_rate": 1.6198253982878414e-06, + "loss": 0.0088, + "num_input_tokens_seen": 195854400, + "step": 160960 + }, + { + "epoch": 17.926829268292682, + "grad_norm": 0.924272894859314, + "learning_rate": 1.6189651348295087e-06, + "loss": 0.0397, + "num_input_tokens_seen": 195860512, + "step": 160965 + }, + { + "epoch": 17.9273861231763, + "grad_norm": 0.11641369760036469, + "learning_rate": 1.6181050922237817e-06, + "loss": 0.0092, + "num_input_tokens_seen": 195866272, + "step": 160970 + }, + { + "epoch": 17.927942978059917, + "grad_norm": 0.5867459774017334, + "learning_rate": 1.6172452704787844e-06, + "loss": 0.0341, + "num_input_tokens_seen": 195872032, + "step": 160975 + }, + { + "epoch": 17.928499832943533, + "grad_norm": 0.050973691046237946, + "learning_rate": 1.6163856696026408e-06, + "loss": 0.0011, + "num_input_tokens_seen": 195878208, + "step": 160980 + }, + { + "epoch": 17.929056687827153, + "grad_norm": 0.01918768137693405, + "learning_rate": 1.6155262896034667e-06, + "loss": 0.0081, + "num_input_tokens_seen": 195884512, + "step": 160985 + }, + { + "epoch": 17.92961354271077, + "grad_norm": 0.6367854475975037, + "learning_rate": 1.614667130489389e-06, + "loss": 0.0166, + "num_input_tokens_seen": 195890880, + "step": 160990 + }, + { + "epoch": 17.930170397594388, + "grad_norm": 0.032058753073215485, + "learning_rate": 1.6138081922685094e-06, + "loss": 0.0145, + "num_input_tokens_seen": 195896896, + "step": 160995 + }, + { + "epoch": 17.930727252478004, + "grad_norm": 0.002490764716640115, + "learning_rate": 1.6129494749489521e-06, + "loss": 0.0024, + "num_input_tokens_seen": 195903360, + "step": 161000 + }, + { + "epoch": 17.93128410736162, + "grad_norm": 0.06259581446647644, + "learning_rate": 1.612090978538819e-06, + "loss": 0.0927, + "num_input_tokens_seen": 195909376, + "step": 161005 + }, + { + "epoch": 17.93184096224524, + "grad_norm": 0.262515664100647, + "learning_rate": 1.611232703046231e-06, + "loss": 0.0087, + "num_input_tokens_seen": 195915712, + "step": 161010 + }, + { + "epoch": 17.932397817128855, + "grad_norm": 0.8660910129547119, + "learning_rate": 1.6103746484792875e-06, + "loss": 0.0281, + "num_input_tokens_seen": 195922080, + "step": 161015 + }, + { + "epoch": 17.932954672012475, + "grad_norm": 0.20237651467323303, + "learning_rate": 1.6095168148460932e-06, + "loss": 0.0154, + "num_input_tokens_seen": 195927648, + "step": 161020 + }, + { + "epoch": 17.93351152689609, + "grad_norm": 0.13366225361824036, + "learning_rate": 1.6086592021547525e-06, + "loss": 0.007, + "num_input_tokens_seen": 195934048, + "step": 161025 + }, + { + "epoch": 17.93406838177971, + "grad_norm": 0.00026482073008082807, + "learning_rate": 1.6078018104133674e-06, + "loss": 0.0339, + "num_input_tokens_seen": 195940160, + "step": 161030 + }, + { + "epoch": 17.934625236663326, + "grad_norm": 0.040734149515628815, + "learning_rate": 1.606944639630037e-06, + "loss": 0.0052, + "num_input_tokens_seen": 195946432, + "step": 161035 + }, + { + "epoch": 17.93518209154694, + "grad_norm": 1.8350489139556885, + "learning_rate": 1.6060876898128546e-06, + "loss": 0.0848, + "num_input_tokens_seen": 195952416, + "step": 161040 + }, + { + "epoch": 17.93573894643056, + "grad_norm": 0.8174899816513062, + "learning_rate": 1.6052309609699169e-06, + "loss": 0.0579, + "num_input_tokens_seen": 195958560, + "step": 161045 + }, + { + "epoch": 17.936295801314177, + "grad_norm": 3.343146800994873, + "learning_rate": 1.6043744531093168e-06, + "loss": 0.1127, + "num_input_tokens_seen": 195964256, + "step": 161050 + }, + { + "epoch": 17.936852656197797, + "grad_norm": 2.0017762184143066, + "learning_rate": 1.603518166239143e-06, + "loss": 0.1686, + "num_input_tokens_seen": 195970272, + "step": 161055 + }, + { + "epoch": 17.937409511081412, + "grad_norm": 0.4621984362602234, + "learning_rate": 1.6026621003674857e-06, + "loss": 0.0561, + "num_input_tokens_seen": 195976512, + "step": 161060 + }, + { + "epoch": 17.93796636596503, + "grad_norm": 0.11734786629676819, + "learning_rate": 1.6018062555024333e-06, + "loss": 0.0415, + "num_input_tokens_seen": 195982208, + "step": 161065 + }, + { + "epoch": 17.938523220848648, + "grad_norm": 0.01512849424034357, + "learning_rate": 1.600950631652065e-06, + "loss": 0.0063, + "num_input_tokens_seen": 195987744, + "step": 161070 + }, + { + "epoch": 17.939080075732264, + "grad_norm": 0.0005697428132407367, + "learning_rate": 1.6000952288244635e-06, + "loss": 0.0326, + "num_input_tokens_seen": 195994176, + "step": 161075 + }, + { + "epoch": 17.939636930615883, + "grad_norm": 0.024190982803702354, + "learning_rate": 1.5992400470277113e-06, + "loss": 0.0418, + "num_input_tokens_seen": 196000608, + "step": 161080 + }, + { + "epoch": 17.9401937854995, + "grad_norm": 1.0485073328018188, + "learning_rate": 1.598385086269888e-06, + "loss": 0.0703, + "num_input_tokens_seen": 196006944, + "step": 161085 + }, + { + "epoch": 17.940750640383115, + "grad_norm": 0.0016633181367069483, + "learning_rate": 1.5975303465590647e-06, + "loss": 0.0251, + "num_input_tokens_seen": 196012960, + "step": 161090 + }, + { + "epoch": 17.941307495266734, + "grad_norm": 0.5204823613166809, + "learning_rate": 1.5966758279033155e-06, + "loss": 0.0242, + "num_input_tokens_seen": 196018528, + "step": 161095 + }, + { + "epoch": 17.94186435015035, + "grad_norm": 0.034071434289216995, + "learning_rate": 1.595821530310715e-06, + "loss": 0.0572, + "num_input_tokens_seen": 196024512, + "step": 161100 + }, + { + "epoch": 17.94242120503397, + "grad_norm": 0.25827932357788086, + "learning_rate": 1.5949674537893284e-06, + "loss": 0.0112, + "num_input_tokens_seen": 196030688, + "step": 161105 + }, + { + "epoch": 17.942978059917586, + "grad_norm": 0.5370241403579712, + "learning_rate": 1.5941135983472328e-06, + "loss": 0.018, + "num_input_tokens_seen": 196036832, + "step": 161110 + }, + { + "epoch": 17.9435349148012, + "grad_norm": 0.0856914296746254, + "learning_rate": 1.59325996399248e-06, + "loss": 0.0078, + "num_input_tokens_seen": 196043008, + "step": 161115 + }, + { + "epoch": 17.94409176968482, + "grad_norm": 0.069643035531044, + "learning_rate": 1.5924065507331443e-06, + "loss": 0.0015, + "num_input_tokens_seen": 196049376, + "step": 161120 + }, + { + "epoch": 17.944648624568437, + "grad_norm": 0.149322047829628, + "learning_rate": 1.5915533585772775e-06, + "loss": 0.136, + "num_input_tokens_seen": 196054752, + "step": 161125 + }, + { + "epoch": 17.945205479452056, + "grad_norm": 0.02075106091797352, + "learning_rate": 1.590700387532948e-06, + "loss": 0.1452, + "num_input_tokens_seen": 196060736, + "step": 161130 + }, + { + "epoch": 17.945762334335672, + "grad_norm": 0.007270926143974066, + "learning_rate": 1.5898476376082104e-06, + "loss": 0.0164, + "num_input_tokens_seen": 196066912, + "step": 161135 + }, + { + "epoch": 17.946319189219288, + "grad_norm": 0.008000658825039864, + "learning_rate": 1.5889951088111143e-06, + "loss": 0.0644, + "num_input_tokens_seen": 196072704, + "step": 161140 + }, + { + "epoch": 17.946876044102908, + "grad_norm": 0.07929853349924088, + "learning_rate": 1.588142801149714e-06, + "loss": 0.0061, + "num_input_tokens_seen": 196078720, + "step": 161145 + }, + { + "epoch": 17.947432898986523, + "grad_norm": 1.0777195692062378, + "learning_rate": 1.587290714632067e-06, + "loss": 0.0725, + "num_input_tokens_seen": 196085184, + "step": 161150 + }, + { + "epoch": 17.947989753870143, + "grad_norm": 0.9880234003067017, + "learning_rate": 1.586438849266217e-06, + "loss": 0.0053, + "num_input_tokens_seen": 196091648, + "step": 161155 + }, + { + "epoch": 17.94854660875376, + "grad_norm": 1.9275785684585571, + "learning_rate": 1.58558720506021e-06, + "loss": 0.0298, + "num_input_tokens_seen": 196098016, + "step": 161160 + }, + { + "epoch": 17.949103463637375, + "grad_norm": 0.017688684165477753, + "learning_rate": 1.584735782022087e-06, + "loss": 0.0138, + "num_input_tokens_seen": 196103936, + "step": 161165 + }, + { + "epoch": 17.949660318520994, + "grad_norm": 0.7023215889930725, + "learning_rate": 1.5838845801598973e-06, + "loss": 0.2332, + "num_input_tokens_seen": 196110144, + "step": 161170 + }, + { + "epoch": 17.95021717340461, + "grad_norm": 1.165177583694458, + "learning_rate": 1.5830335994816758e-06, + "loss": 0.0685, + "num_input_tokens_seen": 196115360, + "step": 161175 + }, + { + "epoch": 17.95077402828823, + "grad_norm": 0.00045643869088962674, + "learning_rate": 1.5821828399954718e-06, + "loss": 0.0164, + "num_input_tokens_seen": 196121792, + "step": 161180 + }, + { + "epoch": 17.951330883171845, + "grad_norm": 0.01387594360858202, + "learning_rate": 1.581332301709304e-06, + "loss": 0.1246, + "num_input_tokens_seen": 196127840, + "step": 161185 + }, + { + "epoch": 17.95188773805546, + "grad_norm": 0.528451681137085, + "learning_rate": 1.5804819846312185e-06, + "loss": 0.0166, + "num_input_tokens_seen": 196133824, + "step": 161190 + }, + { + "epoch": 17.95244459293908, + "grad_norm": 0.006231278646737337, + "learning_rate": 1.5796318887692424e-06, + "loss": 0.057, + "num_input_tokens_seen": 196139872, + "step": 161195 + }, + { + "epoch": 17.953001447822697, + "grad_norm": 2.779020071029663, + "learning_rate": 1.5787820141314108e-06, + "loss": 0.0743, + "num_input_tokens_seen": 196145344, + "step": 161200 + }, + { + "epoch": 17.953558302706316, + "grad_norm": 0.10071143507957458, + "learning_rate": 1.577932360725745e-06, + "loss": 0.0249, + "num_input_tokens_seen": 196151456, + "step": 161205 + }, + { + "epoch": 17.954115157589932, + "grad_norm": 0.9451107382774353, + "learning_rate": 1.5770829285602778e-06, + "loss": 0.1425, + "num_input_tokens_seen": 196157344, + "step": 161210 + }, + { + "epoch": 17.954672012473548, + "grad_norm": 2.0226876735687256, + "learning_rate": 1.576233717643022e-06, + "loss": 0.0614, + "num_input_tokens_seen": 196163680, + "step": 161215 + }, + { + "epoch": 17.955228867357167, + "grad_norm": 0.21408089995384216, + "learning_rate": 1.57538472798201e-06, + "loss": 0.0333, + "num_input_tokens_seen": 196169216, + "step": 161220 + }, + { + "epoch": 17.955785722240783, + "grad_norm": 0.5936877727508545, + "learning_rate": 1.574535959585255e-06, + "loss": 0.1447, + "num_input_tokens_seen": 196174880, + "step": 161225 + }, + { + "epoch": 17.956342577124403, + "grad_norm": 0.057677485048770905, + "learning_rate": 1.573687412460778e-06, + "loss": 0.1197, + "num_input_tokens_seen": 196180960, + "step": 161230 + }, + { + "epoch": 17.95689943200802, + "grad_norm": 1.3323839902877808, + "learning_rate": 1.5728390866165872e-06, + "loss": 0.0141, + "num_input_tokens_seen": 196187264, + "step": 161235 + }, + { + "epoch": 17.957456286891635, + "grad_norm": 0.37000539898872375, + "learning_rate": 1.5719909820607058e-06, + "loss": 0.0125, + "num_input_tokens_seen": 196193504, + "step": 161240 + }, + { + "epoch": 17.958013141775254, + "grad_norm": 2.2136359214782715, + "learning_rate": 1.5711430988011366e-06, + "loss": 0.1072, + "num_input_tokens_seen": 196199552, + "step": 161245 + }, + { + "epoch": 17.95856999665887, + "grad_norm": 0.0012187723768875003, + "learning_rate": 1.5702954368458922e-06, + "loss": 0.0009, + "num_input_tokens_seen": 196205472, + "step": 161250 + }, + { + "epoch": 17.95912685154249, + "grad_norm": 0.6674113273620605, + "learning_rate": 1.5694479962029857e-06, + "loss": 0.0657, + "num_input_tokens_seen": 196211712, + "step": 161255 + }, + { + "epoch": 17.959683706426105, + "grad_norm": 0.0006462449673563242, + "learning_rate": 1.568600776880408e-06, + "loss": 0.0184, + "num_input_tokens_seen": 196217984, + "step": 161260 + }, + { + "epoch": 17.96024056130972, + "grad_norm": 0.002058988669887185, + "learning_rate": 1.5677537788861719e-06, + "loss": 0.0304, + "num_input_tokens_seen": 196224384, + "step": 161265 + }, + { + "epoch": 17.96079741619334, + "grad_norm": 0.00128641736228019, + "learning_rate": 1.5669070022282684e-06, + "loss": 0.0595, + "num_input_tokens_seen": 196230432, + "step": 161270 + }, + { + "epoch": 17.961354271076956, + "grad_norm": 0.008353245444595814, + "learning_rate": 1.5660604469147105e-06, + "loss": 0.0076, + "num_input_tokens_seen": 196236544, + "step": 161275 + }, + { + "epoch": 17.961911125960576, + "grad_norm": 0.0002296970778843388, + "learning_rate": 1.5652141129534836e-06, + "loss": 0.0612, + "num_input_tokens_seen": 196242624, + "step": 161280 + }, + { + "epoch": 17.962467980844192, + "grad_norm": 0.029904183000326157, + "learning_rate": 1.5643680003525868e-06, + "loss": 0.0015, + "num_input_tokens_seen": 196249056, + "step": 161285 + }, + { + "epoch": 17.963024835727808, + "grad_norm": 0.08572633564472198, + "learning_rate": 1.5635221091200053e-06, + "loss": 0.0075, + "num_input_tokens_seen": 196254944, + "step": 161290 + }, + { + "epoch": 17.963581690611427, + "grad_norm": 0.45951855182647705, + "learning_rate": 1.5626764392637411e-06, + "loss": 0.0097, + "num_input_tokens_seen": 196260768, + "step": 161295 + }, + { + "epoch": 17.964138545495043, + "grad_norm": 0.631680965423584, + "learning_rate": 1.5618309907917738e-06, + "loss": 0.0231, + "num_input_tokens_seen": 196266496, + "step": 161300 + }, + { + "epoch": 17.964695400378663, + "grad_norm": 0.026857761666178703, + "learning_rate": 1.5609857637120888e-06, + "loss": 0.0055, + "num_input_tokens_seen": 196272384, + "step": 161305 + }, + { + "epoch": 17.96525225526228, + "grad_norm": 0.00428675627335906, + "learning_rate": 1.5601407580326715e-06, + "loss": 0.0506, + "num_input_tokens_seen": 196277920, + "step": 161310 + }, + { + "epoch": 17.965809110145894, + "grad_norm": 1.2261918783187866, + "learning_rate": 1.5592959737615071e-06, + "loss": 0.0267, + "num_input_tokens_seen": 196284096, + "step": 161315 + }, + { + "epoch": 17.966365965029514, + "grad_norm": 0.007921753451228142, + "learning_rate": 1.5584514109065695e-06, + "loss": 0.0062, + "num_input_tokens_seen": 196290144, + "step": 161320 + }, + { + "epoch": 17.96692281991313, + "grad_norm": 0.004792972467839718, + "learning_rate": 1.557607069475847e-06, + "loss": 0.0079, + "num_input_tokens_seen": 196296480, + "step": 161325 + }, + { + "epoch": 17.96747967479675, + "grad_norm": 2.04689359664917, + "learning_rate": 1.5567629494773001e-06, + "loss": 0.0471, + "num_input_tokens_seen": 196303104, + "step": 161330 + }, + { + "epoch": 17.968036529680365, + "grad_norm": 0.3450002074241638, + "learning_rate": 1.555919050918911e-06, + "loss": 0.0074, + "num_input_tokens_seen": 196309536, + "step": 161335 + }, + { + "epoch": 17.96859338456398, + "grad_norm": 0.0007410293910652399, + "learning_rate": 1.5550753738086482e-06, + "loss": 0.0431, + "num_input_tokens_seen": 196315680, + "step": 161340 + }, + { + "epoch": 17.9691502394476, + "grad_norm": 0.02499472163617611, + "learning_rate": 1.5542319181544861e-06, + "loss": 0.0008, + "num_input_tokens_seen": 196322112, + "step": 161345 + }, + { + "epoch": 17.969707094331216, + "grad_norm": 1.3624539375305176, + "learning_rate": 1.553388683964388e-06, + "loss": 0.0551, + "num_input_tokens_seen": 196328224, + "step": 161350 + }, + { + "epoch": 17.970263949214836, + "grad_norm": 2.533043384552002, + "learning_rate": 1.5525456712463165e-06, + "loss": 0.0925, + "num_input_tokens_seen": 196334208, + "step": 161355 + }, + { + "epoch": 17.97082080409845, + "grad_norm": 0.002138364128768444, + "learning_rate": 1.551702880008235e-06, + "loss": 0.0115, + "num_input_tokens_seen": 196340416, + "step": 161360 + }, + { + "epoch": 17.97137765898207, + "grad_norm": 0.026435870677232742, + "learning_rate": 1.5508603102581092e-06, + "loss": 0.0201, + "num_input_tokens_seen": 196346816, + "step": 161365 + }, + { + "epoch": 17.971934513865687, + "grad_norm": 2.4063332080841064, + "learning_rate": 1.550017962003897e-06, + "loss": 0.041, + "num_input_tokens_seen": 196352800, + "step": 161370 + }, + { + "epoch": 17.972491368749303, + "grad_norm": 1.7844918966293335, + "learning_rate": 1.5491758352535496e-06, + "loss": 0.0266, + "num_input_tokens_seen": 196358944, + "step": 161375 + }, + { + "epoch": 17.973048223632922, + "grad_norm": 0.0015983541961759329, + "learning_rate": 1.5483339300150252e-06, + "loss": 0.012, + "num_input_tokens_seen": 196365120, + "step": 161380 + }, + { + "epoch": 17.973605078516538, + "grad_norm": 0.0027311285957694054, + "learning_rate": 1.5474922462962754e-06, + "loss": 0.1244, + "num_input_tokens_seen": 196371040, + "step": 161385 + }, + { + "epoch": 17.974161933400154, + "grad_norm": 0.04649442434310913, + "learning_rate": 1.5466507841052497e-06, + "loss": 0.01, + "num_input_tokens_seen": 196377120, + "step": 161390 + }, + { + "epoch": 17.974718788283774, + "grad_norm": 0.0703933835029602, + "learning_rate": 1.5458095434499025e-06, + "loss": 0.0201, + "num_input_tokens_seen": 196383168, + "step": 161395 + }, + { + "epoch": 17.97527564316739, + "grad_norm": 0.3487522304058075, + "learning_rate": 1.5449685243381751e-06, + "loss": 0.0615, + "num_input_tokens_seen": 196389280, + "step": 161400 + }, + { + "epoch": 17.97583249805101, + "grad_norm": 0.0012563341297209263, + "learning_rate": 1.5441277267780107e-06, + "loss": 0.0522, + "num_input_tokens_seen": 196395232, + "step": 161405 + }, + { + "epoch": 17.976389352934625, + "grad_norm": 0.0021458605770021677, + "learning_rate": 1.5432871507773478e-06, + "loss": 0.0161, + "num_input_tokens_seen": 196401120, + "step": 161410 + }, + { + "epoch": 17.976946207818244, + "grad_norm": 0.24858081340789795, + "learning_rate": 1.542446796344138e-06, + "loss": 0.0313, + "num_input_tokens_seen": 196407360, + "step": 161415 + }, + { + "epoch": 17.97750306270186, + "grad_norm": 0.017398426309227943, + "learning_rate": 1.5416066634863086e-06, + "loss": 0.0245, + "num_input_tokens_seen": 196413600, + "step": 161420 + }, + { + "epoch": 17.978059917585476, + "grad_norm": 1.0230305194854736, + "learning_rate": 1.5407667522118002e-06, + "loss": 0.0908, + "num_input_tokens_seen": 196420032, + "step": 161425 + }, + { + "epoch": 17.978616772469096, + "grad_norm": 0.5573607683181763, + "learning_rate": 1.5399270625285428e-06, + "loss": 0.0049, + "num_input_tokens_seen": 196426208, + "step": 161430 + }, + { + "epoch": 17.97917362735271, + "grad_norm": 0.1063244566321373, + "learning_rate": 1.5390875944444715e-06, + "loss": 0.037, + "num_input_tokens_seen": 196432416, + "step": 161435 + }, + { + "epoch": 17.97973048223633, + "grad_norm": 0.00010412983829155564, + "learning_rate": 1.5382483479675163e-06, + "loss": 0.0022, + "num_input_tokens_seen": 196438048, + "step": 161440 + }, + { + "epoch": 17.980287337119947, + "grad_norm": 0.46794816851615906, + "learning_rate": 1.5374093231056014e-06, + "loss": 0.0041, + "num_input_tokens_seen": 196444000, + "step": 161445 + }, + { + "epoch": 17.980844192003563, + "grad_norm": 4.25663948059082, + "learning_rate": 1.5365705198666508e-06, + "loss": 0.0447, + "num_input_tokens_seen": 196449984, + "step": 161450 + }, + { + "epoch": 17.981401046887182, + "grad_norm": 1.4122973680496216, + "learning_rate": 1.5357319382585915e-06, + "loss": 0.0378, + "num_input_tokens_seen": 196456096, + "step": 161455 + }, + { + "epoch": 17.981957901770798, + "grad_norm": 0.03229602053761482, + "learning_rate": 1.5348935782893425e-06, + "loss": 0.0492, + "num_input_tokens_seen": 196462432, + "step": 161460 + }, + { + "epoch": 17.982514756654417, + "grad_norm": 0.000524786242749542, + "learning_rate": 1.534055439966825e-06, + "loss": 0.043, + "num_input_tokens_seen": 196468672, + "step": 161465 + }, + { + "epoch": 17.983071611538033, + "grad_norm": 0.035115569829940796, + "learning_rate": 1.5332175232989577e-06, + "loss": 0.0834, + "num_input_tokens_seen": 196474656, + "step": 161470 + }, + { + "epoch": 17.98362846642165, + "grad_norm": 0.007578513119369745, + "learning_rate": 1.532379828293648e-06, + "loss": 0.0442, + "num_input_tokens_seen": 196480736, + "step": 161475 + }, + { + "epoch": 17.98418532130527, + "grad_norm": 0.0022712629288434982, + "learning_rate": 1.531542354958812e-06, + "loss": 0.0043, + "num_input_tokens_seen": 196487296, + "step": 161480 + }, + { + "epoch": 17.984742176188885, + "grad_norm": 0.00034917102311737835, + "learning_rate": 1.5307051033023628e-06, + "loss": 0.0244, + "num_input_tokens_seen": 196493408, + "step": 161485 + }, + { + "epoch": 17.985299031072504, + "grad_norm": 1.307535171508789, + "learning_rate": 1.5298680733322079e-06, + "loss": 0.0655, + "num_input_tokens_seen": 196499360, + "step": 161490 + }, + { + "epoch": 17.98585588595612, + "grad_norm": 1.6025805473327637, + "learning_rate": 1.529031265056255e-06, + "loss": 0.0414, + "num_input_tokens_seen": 196505280, + "step": 161495 + }, + { + "epoch": 17.986412740839736, + "grad_norm": 1.3437989950180054, + "learning_rate": 1.5281946784824003e-06, + "loss": 0.2399, + "num_input_tokens_seen": 196511392, + "step": 161500 + }, + { + "epoch": 17.986969595723355, + "grad_norm": 0.9253344535827637, + "learning_rate": 1.527358313618557e-06, + "loss": 0.0384, + "num_input_tokens_seen": 196517504, + "step": 161505 + }, + { + "epoch": 17.98752645060697, + "grad_norm": 0.46483227610588074, + "learning_rate": 1.5265221704726163e-06, + "loss": 0.1371, + "num_input_tokens_seen": 196523744, + "step": 161510 + }, + { + "epoch": 17.98808330549059, + "grad_norm": 0.35214751958847046, + "learning_rate": 1.5256862490524881e-06, + "loss": 0.0046, + "num_input_tokens_seen": 196530048, + "step": 161515 + }, + { + "epoch": 17.988640160374207, + "grad_norm": 0.019261211156845093, + "learning_rate": 1.5248505493660526e-06, + "loss": 0.0038, + "num_input_tokens_seen": 196535904, + "step": 161520 + }, + { + "epoch": 17.989197015257822, + "grad_norm": 0.00804815348237753, + "learning_rate": 1.524015071421217e-06, + "loss": 0.0691, + "num_input_tokens_seen": 196541408, + "step": 161525 + }, + { + "epoch": 17.989753870141442, + "grad_norm": 0.5521477460861206, + "learning_rate": 1.5231798152258614e-06, + "loss": 0.0637, + "num_input_tokens_seen": 196547488, + "step": 161530 + }, + { + "epoch": 17.990310725025058, + "grad_norm": 0.11673127859830856, + "learning_rate": 1.5223447807878876e-06, + "loss": 0.0119, + "num_input_tokens_seen": 196553824, + "step": 161535 + }, + { + "epoch": 17.990867579908677, + "grad_norm": 0.0001969452714547515, + "learning_rate": 1.5215099681151756e-06, + "loss": 0.048, + "num_input_tokens_seen": 196559264, + "step": 161540 + }, + { + "epoch": 17.991424434792293, + "grad_norm": 1.8859806060791016, + "learning_rate": 1.5206753772156136e-06, + "loss": 0.0696, + "num_input_tokens_seen": 196565120, + "step": 161545 + }, + { + "epoch": 17.99198128967591, + "grad_norm": 2.330843687057495, + "learning_rate": 1.519841008097081e-06, + "loss": 0.0578, + "num_input_tokens_seen": 196571296, + "step": 161550 + }, + { + "epoch": 17.99253814455953, + "grad_norm": 0.016823576763272285, + "learning_rate": 1.5190068607674634e-06, + "loss": 0.0569, + "num_input_tokens_seen": 196576896, + "step": 161555 + }, + { + "epoch": 17.993094999443144, + "grad_norm": 0.2573314309120178, + "learning_rate": 1.5181729352346407e-06, + "loss": 0.0236, + "num_input_tokens_seen": 196583008, + "step": 161560 + }, + { + "epoch": 17.993651854326764, + "grad_norm": 1.0310455560684204, + "learning_rate": 1.5173392315064871e-06, + "loss": 0.0515, + "num_input_tokens_seen": 196589280, + "step": 161565 + }, + { + "epoch": 17.99420870921038, + "grad_norm": 0.000321726081892848, + "learning_rate": 1.516505749590874e-06, + "loss": 0.1218, + "num_input_tokens_seen": 196595392, + "step": 161570 + }, + { + "epoch": 17.994765564093996, + "grad_norm": 0.04453492537140846, + "learning_rate": 1.515672489495684e-06, + "loss": 0.1021, + "num_input_tokens_seen": 196601248, + "step": 161575 + }, + { + "epoch": 17.995322418977615, + "grad_norm": 1.0402288436889648, + "learning_rate": 1.5148394512287778e-06, + "loss": 0.0202, + "num_input_tokens_seen": 196607392, + "step": 161580 + }, + { + "epoch": 17.99587927386123, + "grad_norm": 0.45382604002952576, + "learning_rate": 1.5140066347980376e-06, + "loss": 0.0078, + "num_input_tokens_seen": 196613376, + "step": 161585 + }, + { + "epoch": 17.99643612874485, + "grad_norm": 0.4344104528427124, + "learning_rate": 1.5131740402113153e-06, + "loss": 0.0078, + "num_input_tokens_seen": 196619488, + "step": 161590 + }, + { + "epoch": 17.996992983628466, + "grad_norm": 1.3212685585021973, + "learning_rate": 1.5123416674764829e-06, + "loss": 0.1208, + "num_input_tokens_seen": 196625600, + "step": 161595 + }, + { + "epoch": 17.997549838512082, + "grad_norm": 0.031838979572057724, + "learning_rate": 1.5115095166013977e-06, + "loss": 0.1097, + "num_input_tokens_seen": 196631968, + "step": 161600 + }, + { + "epoch": 17.9981066933957, + "grad_norm": 0.0014587034238502383, + "learning_rate": 1.5106775875939284e-06, + "loss": 0.003, + "num_input_tokens_seen": 196638336, + "step": 161605 + }, + { + "epoch": 17.998663548279318, + "grad_norm": 1.6836484670639038, + "learning_rate": 1.509845880461927e-06, + "loss": 0.0663, + "num_input_tokens_seen": 196644096, + "step": 161610 + }, + { + "epoch": 17.999220403162937, + "grad_norm": 0.043489690870046616, + "learning_rate": 1.509014395213254e-06, + "loss": 0.0489, + "num_input_tokens_seen": 196649504, + "step": 161615 + }, + { + "epoch": 17.999777258046553, + "grad_norm": 0.008189303800463676, + "learning_rate": 1.5081831318557533e-06, + "loss": 0.0044, + "num_input_tokens_seen": 196655872, + "step": 161620 + }, + { + "epoch": 18.0, + "eval_loss": 0.08222608268260956, + "eval_runtime": 111.6403, + "eval_samples_per_second": 35.749, + "eval_steps_per_second": 8.939, + "num_input_tokens_seen": 196657440, + "step": 161622 + }, + { + "epoch": 18.00033411293017, + "grad_norm": 0.24660848081111908, + "learning_rate": 1.5073520903972904e-06, + "loss": 0.143, + "num_input_tokens_seen": 196661024, + "step": 161625 + }, + { + "epoch": 18.00089096781379, + "grad_norm": 0.002071922179311514, + "learning_rate": 1.5065212708457094e-06, + "loss": 0.0309, + "num_input_tokens_seen": 196667200, + "step": 161630 + }, + { + "epoch": 18.001447822697404, + "grad_norm": 0.14074023067951202, + "learning_rate": 1.5056906732088565e-06, + "loss": 0.031, + "num_input_tokens_seen": 196673152, + "step": 161635 + }, + { + "epoch": 18.002004677581024, + "grad_norm": 0.08736500144004822, + "learning_rate": 1.5048602974945758e-06, + "loss": 0.0035, + "num_input_tokens_seen": 196679168, + "step": 161640 + }, + { + "epoch": 18.00256153246464, + "grad_norm": 0.04847481846809387, + "learning_rate": 1.5040301437107162e-06, + "loss": 0.0272, + "num_input_tokens_seen": 196685888, + "step": 161645 + }, + { + "epoch": 18.003118387348255, + "grad_norm": 0.7922660708427429, + "learning_rate": 1.5032002118651134e-06, + "loss": 0.0455, + "num_input_tokens_seen": 196692096, + "step": 161650 + }, + { + "epoch": 18.003675242231875, + "grad_norm": 0.003430429147556424, + "learning_rate": 1.5023705019656138e-06, + "loss": 0.0948, + "num_input_tokens_seen": 196698144, + "step": 161655 + }, + { + "epoch": 18.00423209711549, + "grad_norm": 1.0798189640045166, + "learning_rate": 1.5015410140200553e-06, + "loss": 0.1087, + "num_input_tokens_seen": 196703776, + "step": 161660 + }, + { + "epoch": 18.00478895199911, + "grad_norm": 2.717160940170288, + "learning_rate": 1.5007117480362597e-06, + "loss": 0.0336, + "num_input_tokens_seen": 196709696, + "step": 161665 + }, + { + "epoch": 18.005345806882726, + "grad_norm": 0.6342307329177856, + "learning_rate": 1.4998827040220736e-06, + "loss": 0.0192, + "num_input_tokens_seen": 196715936, + "step": 161670 + }, + { + "epoch": 18.005902661766342, + "grad_norm": 0.00551657984033227, + "learning_rate": 1.4990538819853183e-06, + "loss": 0.0601, + "num_input_tokens_seen": 196722016, + "step": 161675 + }, + { + "epoch": 18.00645951664996, + "grad_norm": 0.04257502406835556, + "learning_rate": 1.4982252819338322e-06, + "loss": 0.0318, + "num_input_tokens_seen": 196728128, + "step": 161680 + }, + { + "epoch": 18.007016371533577, + "grad_norm": 1.1560322046279907, + "learning_rate": 1.4973969038754392e-06, + "loss": 0.075, + "num_input_tokens_seen": 196733568, + "step": 161685 + }, + { + "epoch": 18.007573226417197, + "grad_norm": 0.45229968428611755, + "learning_rate": 1.4965687478179613e-06, + "loss": 0.0287, + "num_input_tokens_seen": 196739072, + "step": 161690 + }, + { + "epoch": 18.008130081300813, + "grad_norm": 0.03142351657152176, + "learning_rate": 1.495740813769217e-06, + "loss": 0.1023, + "num_input_tokens_seen": 196744544, + "step": 161695 + }, + { + "epoch": 18.00868693618443, + "grad_norm": 0.0006399208214133978, + "learning_rate": 1.494913101737036e-06, + "loss": 0.0092, + "num_input_tokens_seen": 196750656, + "step": 161700 + }, + { + "epoch": 18.009243791068048, + "grad_norm": 0.4116226136684418, + "learning_rate": 1.4940856117292346e-06, + "loss": 0.0062, + "num_input_tokens_seen": 196756960, + "step": 161705 + }, + { + "epoch": 18.009800645951664, + "grad_norm": 0.0004980099620297551, + "learning_rate": 1.4932583437536257e-06, + "loss": 0.068, + "num_input_tokens_seen": 196763008, + "step": 161710 + }, + { + "epoch": 18.010357500835283, + "grad_norm": 0.8327639102935791, + "learning_rate": 1.4924312978180227e-06, + "loss": 0.0423, + "num_input_tokens_seen": 196768928, + "step": 161715 + }, + { + "epoch": 18.0109143557189, + "grad_norm": 0.04424422234296799, + "learning_rate": 1.4916044739302415e-06, + "loss": 0.0358, + "num_input_tokens_seen": 196775008, + "step": 161720 + }, + { + "epoch": 18.011471210602515, + "grad_norm": 0.4938645362854004, + "learning_rate": 1.4907778720980898e-06, + "loss": 0.0396, + "num_input_tokens_seen": 196781056, + "step": 161725 + }, + { + "epoch": 18.012028065486135, + "grad_norm": 0.00481944065541029, + "learning_rate": 1.4899514923293806e-06, + "loss": 0.0044, + "num_input_tokens_seen": 196786976, + "step": 161730 + }, + { + "epoch": 18.01258492036975, + "grad_norm": 1.5333871841430664, + "learning_rate": 1.4891253346319106e-06, + "loss": 0.0921, + "num_input_tokens_seen": 196793024, + "step": 161735 + }, + { + "epoch": 18.01314177525337, + "grad_norm": 0.008970309048891068, + "learning_rate": 1.488299399013493e-06, + "loss": 0.0273, + "num_input_tokens_seen": 196799456, + "step": 161740 + }, + { + "epoch": 18.013698630136986, + "grad_norm": 0.21041357517242432, + "learning_rate": 1.4874736854819189e-06, + "loss": 0.1129, + "num_input_tokens_seen": 196805760, + "step": 161745 + }, + { + "epoch": 18.014255485020605, + "grad_norm": 0.00012164546205895022, + "learning_rate": 1.4866481940449984e-06, + "loss": 0.0045, + "num_input_tokens_seen": 196811808, + "step": 161750 + }, + { + "epoch": 18.01481233990422, + "grad_norm": 0.0010259241098538041, + "learning_rate": 1.4858229247105225e-06, + "loss": 0.0757, + "num_input_tokens_seen": 196818080, + "step": 161755 + }, + { + "epoch": 18.015369194787837, + "grad_norm": 0.8150874972343445, + "learning_rate": 1.4849978774862911e-06, + "loss": 0.0249, + "num_input_tokens_seen": 196824032, + "step": 161760 + }, + { + "epoch": 18.015926049671457, + "grad_norm": 0.000692468136548996, + "learning_rate": 1.4841730523800889e-06, + "loss": 0.0013, + "num_input_tokens_seen": 196830432, + "step": 161765 + }, + { + "epoch": 18.016482904555073, + "grad_norm": 0.0025362514425069094, + "learning_rate": 1.4833484493997156e-06, + "loss": 0.0934, + "num_input_tokens_seen": 196836448, + "step": 161770 + }, + { + "epoch": 18.017039759438692, + "grad_norm": 0.052420634776353836, + "learning_rate": 1.4825240685529595e-06, + "loss": 0.0325, + "num_input_tokens_seen": 196842720, + "step": 161775 + }, + { + "epoch": 18.017596614322308, + "grad_norm": 0.2393207550048828, + "learning_rate": 1.4816999098476059e-06, + "loss": 0.068, + "num_input_tokens_seen": 196848864, + "step": 161780 + }, + { + "epoch": 18.018153469205924, + "grad_norm": 0.00014006921264808625, + "learning_rate": 1.4808759732914347e-06, + "loss": 0.0239, + "num_input_tokens_seen": 196854656, + "step": 161785 + }, + { + "epoch": 18.018710324089543, + "grad_norm": 0.026013007387518883, + "learning_rate": 1.4800522588922366e-06, + "loss": 0.0172, + "num_input_tokens_seen": 196860480, + "step": 161790 + }, + { + "epoch": 18.01926717897316, + "grad_norm": 1.5617872476577759, + "learning_rate": 1.4792287666577864e-06, + "loss": 0.0416, + "num_input_tokens_seen": 196866976, + "step": 161795 + }, + { + "epoch": 18.01982403385678, + "grad_norm": 0.408117413520813, + "learning_rate": 1.4784054965958666e-06, + "loss": 0.0195, + "num_input_tokens_seen": 196873312, + "step": 161800 + }, + { + "epoch": 18.020380888740394, + "grad_norm": 0.02628946118056774, + "learning_rate": 1.4775824487142515e-06, + "loss": 0.124, + "num_input_tokens_seen": 196879296, + "step": 161805 + }, + { + "epoch": 18.02093774362401, + "grad_norm": 0.7002246975898743, + "learning_rate": 1.4767596230207182e-06, + "loss": 0.017, + "num_input_tokens_seen": 196884640, + "step": 161810 + }, + { + "epoch": 18.02149459850763, + "grad_norm": 0.013234087266027927, + "learning_rate": 1.4759370195230327e-06, + "loss": 0.0012, + "num_input_tokens_seen": 196890944, + "step": 161815 + }, + { + "epoch": 18.022051453391246, + "grad_norm": 0.042653147131204605, + "learning_rate": 1.475114638228972e-06, + "loss": 0.1012, + "num_input_tokens_seen": 196897056, + "step": 161820 + }, + { + "epoch": 18.022608308274865, + "grad_norm": 1.6590237617492676, + "learning_rate": 1.4742924791463024e-06, + "loss": 0.0533, + "num_input_tokens_seen": 196902848, + "step": 161825 + }, + { + "epoch": 18.02316516315848, + "grad_norm": 0.0170514527708292, + "learning_rate": 1.4734705422827898e-06, + "loss": 0.0798, + "num_input_tokens_seen": 196909472, + "step": 161830 + }, + { + "epoch": 18.023722018042097, + "grad_norm": 0.3936236798763275, + "learning_rate": 1.472648827646192e-06, + "loss": 0.0086, + "num_input_tokens_seen": 196915424, + "step": 161835 + }, + { + "epoch": 18.024278872925716, + "grad_norm": 1.6165560483932495, + "learning_rate": 1.4718273352442773e-06, + "loss": 0.0374, + "num_input_tokens_seen": 196921472, + "step": 161840 + }, + { + "epoch": 18.024835727809332, + "grad_norm": 0.22385774552822113, + "learning_rate": 1.471006065084804e-06, + "loss": 0.0528, + "num_input_tokens_seen": 196927584, + "step": 161845 + }, + { + "epoch": 18.02539258269295, + "grad_norm": 0.2565036118030548, + "learning_rate": 1.470185017175535e-06, + "loss": 0.0019, + "num_input_tokens_seen": 196933568, + "step": 161850 + }, + { + "epoch": 18.025949437576568, + "grad_norm": 0.025411969050765038, + "learning_rate": 1.4693641915242112e-06, + "loss": 0.0301, + "num_input_tokens_seen": 196939744, + "step": 161855 + }, + { + "epoch": 18.026506292460184, + "grad_norm": 0.0114527503028512, + "learning_rate": 1.4685435881386016e-06, + "loss": 0.0093, + "num_input_tokens_seen": 196945280, + "step": 161860 + }, + { + "epoch": 18.027063147343803, + "grad_norm": 0.0612245574593544, + "learning_rate": 1.4677232070264473e-06, + "loss": 0.0065, + "num_input_tokens_seen": 196951072, + "step": 161865 + }, + { + "epoch": 18.02762000222742, + "grad_norm": 1.8021081686019897, + "learning_rate": 1.4669030481955032e-06, + "loss": 0.0245, + "num_input_tokens_seen": 196957152, + "step": 161870 + }, + { + "epoch": 18.02817685711104, + "grad_norm": 1.4237955808639526, + "learning_rate": 1.4660831116535156e-06, + "loss": 0.1086, + "num_input_tokens_seen": 196963040, + "step": 161875 + }, + { + "epoch": 18.028733711994654, + "grad_norm": 1.420351266860962, + "learning_rate": 1.465263397408226e-06, + "loss": 0.0246, + "num_input_tokens_seen": 196968768, + "step": 161880 + }, + { + "epoch": 18.02929056687827, + "grad_norm": 0.08954101800918579, + "learning_rate": 1.4644439054673776e-06, + "loss": 0.0429, + "num_input_tokens_seen": 196975232, + "step": 161885 + }, + { + "epoch": 18.02984742176189, + "grad_norm": 0.035494111478328705, + "learning_rate": 1.4636246358387146e-06, + "loss": 0.0163, + "num_input_tokens_seen": 196981024, + "step": 161890 + }, + { + "epoch": 18.030404276645505, + "grad_norm": 0.05085032060742378, + "learning_rate": 1.4628055885299751e-06, + "loss": 0.0646, + "num_input_tokens_seen": 196986432, + "step": 161895 + }, + { + "epoch": 18.030961131529125, + "grad_norm": 0.15263508260250092, + "learning_rate": 1.4619867635488948e-06, + "loss": 0.037, + "num_input_tokens_seen": 196992384, + "step": 161900 + }, + { + "epoch": 18.03151798641274, + "grad_norm": 0.03190992400050163, + "learning_rate": 1.4611681609032035e-06, + "loss": 0.0508, + "num_input_tokens_seen": 196998368, + "step": 161905 + }, + { + "epoch": 18.032074841296357, + "grad_norm": 0.002547372365370393, + "learning_rate": 1.4603497806006423e-06, + "loss": 0.0175, + "num_input_tokens_seen": 197004512, + "step": 161910 + }, + { + "epoch": 18.032631696179976, + "grad_norm": 1.3049607276916504, + "learning_rate": 1.4595316226489353e-06, + "loss": 0.0712, + "num_input_tokens_seen": 197010400, + "step": 161915 + }, + { + "epoch": 18.033188551063592, + "grad_norm": 0.017261503264307976, + "learning_rate": 1.4587136870558156e-06, + "loss": 0.0074, + "num_input_tokens_seen": 197016512, + "step": 161920 + }, + { + "epoch": 18.03374540594721, + "grad_norm": 0.0939614400267601, + "learning_rate": 1.4578959738290015e-06, + "loss": 0.0332, + "num_input_tokens_seen": 197022560, + "step": 161925 + }, + { + "epoch": 18.034302260830827, + "grad_norm": 0.015987327322363853, + "learning_rate": 1.4570784829762235e-06, + "loss": 0.0321, + "num_input_tokens_seen": 197028896, + "step": 161930 + }, + { + "epoch": 18.034859115714443, + "grad_norm": 0.00308361672796309, + "learning_rate": 1.4562612145052002e-06, + "loss": 0.0044, + "num_input_tokens_seen": 197035264, + "step": 161935 + }, + { + "epoch": 18.035415970598063, + "grad_norm": 0.0021150961983948946, + "learning_rate": 1.4554441684236558e-06, + "loss": 0.0268, + "num_input_tokens_seen": 197041344, + "step": 161940 + }, + { + "epoch": 18.03597282548168, + "grad_norm": 0.06609688699245453, + "learning_rate": 1.4546273447393038e-06, + "loss": 0.0645, + "num_input_tokens_seen": 197047520, + "step": 161945 + }, + { + "epoch": 18.036529680365298, + "grad_norm": 0.007533097639679909, + "learning_rate": 1.4538107434598602e-06, + "loss": 0.017, + "num_input_tokens_seen": 197053504, + "step": 161950 + }, + { + "epoch": 18.037086535248914, + "grad_norm": 0.9583175182342529, + "learning_rate": 1.4529943645930355e-06, + "loss": 0.069, + "num_input_tokens_seen": 197059520, + "step": 161955 + }, + { + "epoch": 18.03764339013253, + "grad_norm": 0.009341382421553135, + "learning_rate": 1.4521782081465485e-06, + "loss": 0.0051, + "num_input_tokens_seen": 197065472, + "step": 161960 + }, + { + "epoch": 18.03820024501615, + "grad_norm": 0.0007797262514941394, + "learning_rate": 1.4513622741281069e-06, + "loss": 0.0185, + "num_input_tokens_seen": 197071648, + "step": 161965 + }, + { + "epoch": 18.038757099899765, + "grad_norm": 0.08733434975147247, + "learning_rate": 1.4505465625454128e-06, + "loss": 0.0367, + "num_input_tokens_seen": 197078208, + "step": 161970 + }, + { + "epoch": 18.039313954783385, + "grad_norm": 0.0500505231320858, + "learning_rate": 1.4497310734061714e-06, + "loss": 0.0034, + "num_input_tokens_seen": 197084256, + "step": 161975 + }, + { + "epoch": 18.039870809667, + "grad_norm": 0.00016016987501643598, + "learning_rate": 1.4489158067180901e-06, + "loss": 0.0377, + "num_input_tokens_seen": 197090368, + "step": 161980 + }, + { + "epoch": 18.040427664550617, + "grad_norm": 0.015360953286290169, + "learning_rate": 1.4481007624888659e-06, + "loss": 0.0011, + "num_input_tokens_seen": 197096384, + "step": 161985 + }, + { + "epoch": 18.040984519434236, + "grad_norm": 0.07537873089313507, + "learning_rate": 1.4472859407262064e-06, + "loss": 0.0325, + "num_input_tokens_seen": 197102816, + "step": 161990 + }, + { + "epoch": 18.041541374317852, + "grad_norm": 0.001976647414267063, + "learning_rate": 1.446471341437794e-06, + "loss": 0.0139, + "num_input_tokens_seen": 197108704, + "step": 161995 + }, + { + "epoch": 18.04209822920147, + "grad_norm": 0.17911961674690247, + "learning_rate": 1.445656964631334e-06, + "loss": 0.0832, + "num_input_tokens_seen": 197114336, + "step": 162000 + }, + { + "epoch": 18.042655084085087, + "grad_norm": 0.3784949779510498, + "learning_rate": 1.4448428103145118e-06, + "loss": 0.025, + "num_input_tokens_seen": 197120704, + "step": 162005 + }, + { + "epoch": 18.043211938968703, + "grad_norm": 0.01033647358417511, + "learning_rate": 1.444028878495024e-06, + "loss": 0.0391, + "num_input_tokens_seen": 197127136, + "step": 162010 + }, + { + "epoch": 18.043768793852323, + "grad_norm": 0.02201470546424389, + "learning_rate": 1.4432151691805562e-06, + "loss": 0.007, + "num_input_tokens_seen": 197133376, + "step": 162015 + }, + { + "epoch": 18.04432564873594, + "grad_norm": 1.283697485923767, + "learning_rate": 1.4424016823787966e-06, + "loss": 0.0141, + "num_input_tokens_seen": 197139296, + "step": 162020 + }, + { + "epoch": 18.044882503619558, + "grad_norm": 0.0713229849934578, + "learning_rate": 1.4415884180974226e-06, + "loss": 0.0013, + "num_input_tokens_seen": 197145472, + "step": 162025 + }, + { + "epoch": 18.045439358503174, + "grad_norm": 1.3656086921691895, + "learning_rate": 1.440775376344125e-06, + "loss": 0.1903, + "num_input_tokens_seen": 197151744, + "step": 162030 + }, + { + "epoch": 18.04599621338679, + "grad_norm": 2.4904472827911377, + "learning_rate": 1.4399625571265785e-06, + "loss": 0.0797, + "num_input_tokens_seen": 197157504, + "step": 162035 + }, + { + "epoch": 18.04655306827041, + "grad_norm": 0.22620555758476257, + "learning_rate": 1.4391499604524599e-06, + "loss": 0.0734, + "num_input_tokens_seen": 197163584, + "step": 162040 + }, + { + "epoch": 18.047109923154025, + "grad_norm": 0.04073447734117508, + "learning_rate": 1.438337586329444e-06, + "loss": 0.0437, + "num_input_tokens_seen": 197169568, + "step": 162045 + }, + { + "epoch": 18.047666778037645, + "grad_norm": 2.383121967315674, + "learning_rate": 1.4375254347652133e-06, + "loss": 0.1853, + "num_input_tokens_seen": 197175552, + "step": 162050 + }, + { + "epoch": 18.04822363292126, + "grad_norm": 0.17704534530639648, + "learning_rate": 1.4367135057674257e-06, + "loss": 0.0396, + "num_input_tokens_seen": 197181536, + "step": 162055 + }, + { + "epoch": 18.048780487804876, + "grad_norm": 0.03432253748178482, + "learning_rate": 1.435901799343764e-06, + "loss": 0.0024, + "num_input_tokens_seen": 197187488, + "step": 162060 + }, + { + "epoch": 18.049337342688496, + "grad_norm": 0.00027023989241570234, + "learning_rate": 1.4350903155018858e-06, + "loss": 0.0121, + "num_input_tokens_seen": 197193280, + "step": 162065 + }, + { + "epoch": 18.04989419757211, + "grad_norm": 0.03456970676779747, + "learning_rate": 1.4342790542494627e-06, + "loss": 0.0042, + "num_input_tokens_seen": 197199232, + "step": 162070 + }, + { + "epoch": 18.05045105245573, + "grad_norm": 0.0031794735696166754, + "learning_rate": 1.4334680155941526e-06, + "loss": 0.0696, + "num_input_tokens_seen": 197205472, + "step": 162075 + }, + { + "epoch": 18.051007907339347, + "grad_norm": 0.890670895576477, + "learning_rate": 1.4326571995436134e-06, + "loss": 0.0197, + "num_input_tokens_seen": 197210784, + "step": 162080 + }, + { + "epoch": 18.051564762222966, + "grad_norm": 0.00023909704759716988, + "learning_rate": 1.4318466061055165e-06, + "loss": 0.0055, + "num_input_tokens_seen": 197216864, + "step": 162085 + }, + { + "epoch": 18.052121617106582, + "grad_norm": 0.43912968039512634, + "learning_rate": 1.4310362352875084e-06, + "loss": 0.0091, + "num_input_tokens_seen": 197222848, + "step": 162090 + }, + { + "epoch": 18.0526784719902, + "grad_norm": 0.08834002912044525, + "learning_rate": 1.4302260870972444e-06, + "loss": 0.0295, + "num_input_tokens_seen": 197228416, + "step": 162095 + }, + { + "epoch": 18.053235326873818, + "grad_norm": 0.9583242535591125, + "learning_rate": 1.4294161615423767e-06, + "loss": 0.1134, + "num_input_tokens_seen": 197234304, + "step": 162100 + }, + { + "epoch": 18.053792181757434, + "grad_norm": 0.005671784747391939, + "learning_rate": 1.428606458630563e-06, + "loss": 0.0027, + "num_input_tokens_seen": 197240640, + "step": 162105 + }, + { + "epoch": 18.054349036641053, + "grad_norm": 0.08430641889572144, + "learning_rate": 1.4277969783694444e-06, + "loss": 0.0895, + "num_input_tokens_seen": 197247072, + "step": 162110 + }, + { + "epoch": 18.05490589152467, + "grad_norm": 0.05542290583252907, + "learning_rate": 1.4269877207666705e-06, + "loss": 0.0344, + "num_input_tokens_seen": 197252992, + "step": 162115 + }, + { + "epoch": 18.055462746408285, + "grad_norm": 0.038895510137081146, + "learning_rate": 1.4261786858298793e-06, + "loss": 0.0183, + "num_input_tokens_seen": 197259168, + "step": 162120 + }, + { + "epoch": 18.056019601291904, + "grad_norm": 1.337435245513916, + "learning_rate": 1.4253698735667203e-06, + "loss": 0.0968, + "num_input_tokens_seen": 197265344, + "step": 162125 + }, + { + "epoch": 18.05657645617552, + "grad_norm": 0.330761194229126, + "learning_rate": 1.4245612839848293e-06, + "loss": 0.0237, + "num_input_tokens_seen": 197271616, + "step": 162130 + }, + { + "epoch": 18.05713331105914, + "grad_norm": 0.8180575966835022, + "learning_rate": 1.4237529170918502e-06, + "loss": 0.0057, + "num_input_tokens_seen": 197278112, + "step": 162135 + }, + { + "epoch": 18.057690165942756, + "grad_norm": 1.2819740772247314, + "learning_rate": 1.42294477289541e-06, + "loss": 0.0703, + "num_input_tokens_seen": 197284640, + "step": 162140 + }, + { + "epoch": 18.05824702082637, + "grad_norm": 0.9464848637580872, + "learning_rate": 1.4221368514031469e-06, + "loss": 0.0828, + "num_input_tokens_seen": 197290784, + "step": 162145 + }, + { + "epoch": 18.05880387570999, + "grad_norm": 0.00042065209709107876, + "learning_rate": 1.4213291526226858e-06, + "loss": 0.0141, + "num_input_tokens_seen": 197296640, + "step": 162150 + }, + { + "epoch": 18.059360730593607, + "grad_norm": 0.015619885176420212, + "learning_rate": 1.4205216765616702e-06, + "loss": 0.0014, + "num_input_tokens_seen": 197302624, + "step": 162155 + }, + { + "epoch": 18.059917585477226, + "grad_norm": 0.00020496113575063646, + "learning_rate": 1.4197144232277166e-06, + "loss": 0.0051, + "num_input_tokens_seen": 197308256, + "step": 162160 + }, + { + "epoch": 18.060474440360842, + "grad_norm": 9.373826469527557e-05, + "learning_rate": 1.4189073926284519e-06, + "loss": 0.0055, + "num_input_tokens_seen": 197314560, + "step": 162165 + }, + { + "epoch": 18.061031295244458, + "grad_norm": 0.002703926293179393, + "learning_rate": 1.418100584771495e-06, + "loss": 0.011, + "num_input_tokens_seen": 197320640, + "step": 162170 + }, + { + "epoch": 18.061588150128078, + "grad_norm": 0.5794833898544312, + "learning_rate": 1.417293999664479e-06, + "loss": 0.1879, + "num_input_tokens_seen": 197326112, + "step": 162175 + }, + { + "epoch": 18.062145005011693, + "grad_norm": 0.0008070892072282732, + "learning_rate": 1.4164876373150116e-06, + "loss": 0.0881, + "num_input_tokens_seen": 197332192, + "step": 162180 + }, + { + "epoch": 18.062701859895313, + "grad_norm": 0.7975095510482788, + "learning_rate": 1.4156814977307143e-06, + "loss": 0.0536, + "num_input_tokens_seen": 197338496, + "step": 162185 + }, + { + "epoch": 18.06325871477893, + "grad_norm": 0.792625904083252, + "learning_rate": 1.4148755809191978e-06, + "loss": 0.0316, + "num_input_tokens_seen": 197344224, + "step": 162190 + }, + { + "epoch": 18.063815569662545, + "grad_norm": 0.2808479964733124, + "learning_rate": 1.4140698868880808e-06, + "loss": 0.0607, + "num_input_tokens_seen": 197350080, + "step": 162195 + }, + { + "epoch": 18.064372424546164, + "grad_norm": 0.0012821987038478255, + "learning_rate": 1.4132644156449687e-06, + "loss": 0.072, + "num_input_tokens_seen": 197356192, + "step": 162200 + }, + { + "epoch": 18.06492927942978, + "grad_norm": 0.0028670672327280045, + "learning_rate": 1.4124591671974718e-06, + "loss": 0.0027, + "num_input_tokens_seen": 197362304, + "step": 162205 + }, + { + "epoch": 18.0654861343134, + "grad_norm": 0.990931510925293, + "learning_rate": 1.411654141553198e-06, + "loss": 0.0354, + "num_input_tokens_seen": 197368448, + "step": 162210 + }, + { + "epoch": 18.066042989197015, + "grad_norm": 2.45247745513916, + "learning_rate": 1.4108493387197497e-06, + "loss": 0.074, + "num_input_tokens_seen": 197373920, + "step": 162215 + }, + { + "epoch": 18.06659984408063, + "grad_norm": 0.18007256090641022, + "learning_rate": 1.4100447587047261e-06, + "loss": 0.1042, + "num_input_tokens_seen": 197380128, + "step": 162220 + }, + { + "epoch": 18.06715669896425, + "grad_norm": 0.7139544486999512, + "learning_rate": 1.4092404015157295e-06, + "loss": 0.0698, + "num_input_tokens_seen": 197386144, + "step": 162225 + }, + { + "epoch": 18.067713553847867, + "grad_norm": 2.6102237701416016, + "learning_rate": 1.4084362671603623e-06, + "loss": 0.0619, + "num_input_tokens_seen": 197392352, + "step": 162230 + }, + { + "epoch": 18.068270408731486, + "grad_norm": 0.5612345337867737, + "learning_rate": 1.407632355646213e-06, + "loss": 0.0874, + "num_input_tokens_seen": 197398208, + "step": 162235 + }, + { + "epoch": 18.068827263615102, + "grad_norm": 0.08292379975318909, + "learning_rate": 1.406828666980875e-06, + "loss": 0.0231, + "num_input_tokens_seen": 197404608, + "step": 162240 + }, + { + "epoch": 18.069384118498718, + "grad_norm": 0.00011301787162665278, + "learning_rate": 1.4060252011719456e-06, + "loss": 0.1461, + "num_input_tokens_seen": 197410880, + "step": 162245 + }, + { + "epoch": 18.069940973382337, + "grad_norm": 0.0033442561980336905, + "learning_rate": 1.4052219582270098e-06, + "loss": 0.0422, + "num_input_tokens_seen": 197417184, + "step": 162250 + }, + { + "epoch": 18.070497828265953, + "grad_norm": 1.7009479999542236, + "learning_rate": 1.404418938153662e-06, + "loss": 0.1494, + "num_input_tokens_seen": 197423200, + "step": 162255 + }, + { + "epoch": 18.071054683149573, + "grad_norm": 1.1416513919830322, + "learning_rate": 1.4036161409594767e-06, + "loss": 0.0268, + "num_input_tokens_seen": 197429504, + "step": 162260 + }, + { + "epoch": 18.07161153803319, + "grad_norm": 0.06938107311725616, + "learning_rate": 1.402813566652042e-06, + "loss": 0.0709, + "num_input_tokens_seen": 197435584, + "step": 162265 + }, + { + "epoch": 18.072168392916804, + "grad_norm": 0.001518812496215105, + "learning_rate": 1.402011215238938e-06, + "loss": 0.1105, + "num_input_tokens_seen": 197441952, + "step": 162270 + }, + { + "epoch": 18.072725247800424, + "grad_norm": 0.8114731907844543, + "learning_rate": 1.4012090867277478e-06, + "loss": 0.0158, + "num_input_tokens_seen": 197447968, + "step": 162275 + }, + { + "epoch": 18.07328210268404, + "grad_norm": 0.1822328418493271, + "learning_rate": 1.4004071811260456e-06, + "loss": 0.037, + "num_input_tokens_seen": 197454048, + "step": 162280 + }, + { + "epoch": 18.07383895756766, + "grad_norm": 0.170811265707016, + "learning_rate": 1.399605498441403e-06, + "loss": 0.1005, + "num_input_tokens_seen": 197460288, + "step": 162285 + }, + { + "epoch": 18.074395812451275, + "grad_norm": 0.004869658499956131, + "learning_rate": 1.3988040386813921e-06, + "loss": 0.0011, + "num_input_tokens_seen": 197466400, + "step": 162290 + }, + { + "epoch": 18.07495266733489, + "grad_norm": 0.304801344871521, + "learning_rate": 1.3980028018535901e-06, + "loss": 0.0384, + "num_input_tokens_seen": 197472672, + "step": 162295 + }, + { + "epoch": 18.07550952221851, + "grad_norm": 0.005174596328288317, + "learning_rate": 1.3972017879655601e-06, + "loss": 0.0007, + "num_input_tokens_seen": 197479104, + "step": 162300 + }, + { + "epoch": 18.076066377102126, + "grad_norm": 0.6226804256439209, + "learning_rate": 1.3964009970248714e-06, + "loss": 0.0167, + "num_input_tokens_seen": 197485216, + "step": 162305 + }, + { + "epoch": 18.076623231985746, + "grad_norm": 0.5481879711151123, + "learning_rate": 1.3956004290390817e-06, + "loss": 0.0379, + "num_input_tokens_seen": 197491232, + "step": 162310 + }, + { + "epoch": 18.07718008686936, + "grad_norm": 0.045562464743852615, + "learning_rate": 1.39480008401576e-06, + "loss": 0.1315, + "num_input_tokens_seen": 197497472, + "step": 162315 + }, + { + "epoch": 18.077736941752978, + "grad_norm": 0.4084007441997528, + "learning_rate": 1.3939999619624612e-06, + "loss": 0.0058, + "num_input_tokens_seen": 197503648, + "step": 162320 + }, + { + "epoch": 18.078293796636597, + "grad_norm": 0.11354973167181015, + "learning_rate": 1.3932000628867515e-06, + "loss": 0.0049, + "num_input_tokens_seen": 197509152, + "step": 162325 + }, + { + "epoch": 18.078850651520213, + "grad_norm": 0.04492305591702461, + "learning_rate": 1.3924003867961749e-06, + "loss": 0.0146, + "num_input_tokens_seen": 197514464, + "step": 162330 + }, + { + "epoch": 18.079407506403832, + "grad_norm": 1.774658203125, + "learning_rate": 1.3916009336982949e-06, + "loss": 0.0322, + "num_input_tokens_seen": 197520608, + "step": 162335 + }, + { + "epoch": 18.07996436128745, + "grad_norm": 0.27476269006729126, + "learning_rate": 1.3908017036006527e-06, + "loss": 0.0411, + "num_input_tokens_seen": 197526912, + "step": 162340 + }, + { + "epoch": 18.080521216171064, + "grad_norm": 0.6991767883300781, + "learning_rate": 1.3900026965108088e-06, + "loss": 0.0802, + "num_input_tokens_seen": 197532960, + "step": 162345 + }, + { + "epoch": 18.081078071054684, + "grad_norm": 0.7469959855079651, + "learning_rate": 1.389203912436307e-06, + "loss": 0.0168, + "num_input_tokens_seen": 197539264, + "step": 162350 + }, + { + "epoch": 18.0816349259383, + "grad_norm": 0.07623478770256042, + "learning_rate": 1.388405351384689e-06, + "loss": 0.033, + "num_input_tokens_seen": 197545568, + "step": 162355 + }, + { + "epoch": 18.08219178082192, + "grad_norm": 0.008610943332314491, + "learning_rate": 1.3876070133634955e-06, + "loss": 0.0137, + "num_input_tokens_seen": 197551744, + "step": 162360 + }, + { + "epoch": 18.082748635705535, + "grad_norm": 0.005178239196538925, + "learning_rate": 1.3868088983802791e-06, + "loss": 0.0162, + "num_input_tokens_seen": 197557728, + "step": 162365 + }, + { + "epoch": 18.08330549058915, + "grad_norm": 0.0023428478743880987, + "learning_rate": 1.386011006442567e-06, + "loss": 0.018, + "num_input_tokens_seen": 197563936, + "step": 162370 + }, + { + "epoch": 18.08386234547277, + "grad_norm": 0.019100019708275795, + "learning_rate": 1.385213337557903e-06, + "loss": 0.0044, + "num_input_tokens_seen": 197569824, + "step": 162375 + }, + { + "epoch": 18.084419200356386, + "grad_norm": 0.13145729899406433, + "learning_rate": 1.3844158917338117e-06, + "loss": 0.0354, + "num_input_tokens_seen": 197575840, + "step": 162380 + }, + { + "epoch": 18.084976055240006, + "grad_norm": 0.024925842881202698, + "learning_rate": 1.3836186689778402e-06, + "loss": 0.0108, + "num_input_tokens_seen": 197582080, + "step": 162385 + }, + { + "epoch": 18.08553291012362, + "grad_norm": 0.1389768272638321, + "learning_rate": 1.382821669297507e-06, + "loss": 0.0103, + "num_input_tokens_seen": 197588224, + "step": 162390 + }, + { + "epoch": 18.086089765007237, + "grad_norm": 0.02097950130701065, + "learning_rate": 1.382024892700351e-06, + "loss": 0.0343, + "num_input_tokens_seen": 197594112, + "step": 162395 + }, + { + "epoch": 18.086646619890857, + "grad_norm": 0.4076865315437317, + "learning_rate": 1.3812283391938852e-06, + "loss": 0.0107, + "num_input_tokens_seen": 197599936, + "step": 162400 + }, + { + "epoch": 18.087203474774473, + "grad_norm": 0.142483189702034, + "learning_rate": 1.3804320087856453e-06, + "loss": 0.0055, + "num_input_tokens_seen": 197605952, + "step": 162405 + }, + { + "epoch": 18.087760329658092, + "grad_norm": 0.1880614161491394, + "learning_rate": 1.379635901483145e-06, + "loss": 0.0073, + "num_input_tokens_seen": 197612192, + "step": 162410 + }, + { + "epoch": 18.088317184541708, + "grad_norm": 1.926904320716858, + "learning_rate": 1.3788400172939086e-06, + "loss": 0.0943, + "num_input_tokens_seen": 197617984, + "step": 162415 + }, + { + "epoch": 18.088874039425324, + "grad_norm": 0.0038088152650743723, + "learning_rate": 1.378044356225458e-06, + "loss": 0.0551, + "num_input_tokens_seen": 197623936, + "step": 162420 + }, + { + "epoch": 18.089430894308943, + "grad_norm": 1.067314863204956, + "learning_rate": 1.377248918285301e-06, + "loss": 0.0361, + "num_input_tokens_seen": 197629920, + "step": 162425 + }, + { + "epoch": 18.08998774919256, + "grad_norm": 0.026268187910318375, + "learning_rate": 1.376453703480951e-06, + "loss": 0.0018, + "num_input_tokens_seen": 197636384, + "step": 162430 + }, + { + "epoch": 18.09054460407618, + "grad_norm": 0.025332147255539894, + "learning_rate": 1.3756587118199271e-06, + "loss": 0.0003, + "num_input_tokens_seen": 197642848, + "step": 162435 + }, + { + "epoch": 18.091101458959795, + "grad_norm": 0.0001641090348130092, + "learning_rate": 1.3748639433097342e-06, + "loss": 0.0005, + "num_input_tokens_seen": 197649024, + "step": 162440 + }, + { + "epoch": 18.091658313843414, + "grad_norm": 0.43894457817077637, + "learning_rate": 1.3740693979578777e-06, + "loss": 0.0604, + "num_input_tokens_seen": 197655136, + "step": 162445 + }, + { + "epoch": 18.09221516872703, + "grad_norm": 0.13292308151721954, + "learning_rate": 1.3732750757718627e-06, + "loss": 0.0408, + "num_input_tokens_seen": 197661184, + "step": 162450 + }, + { + "epoch": 18.092772023610646, + "grad_norm": 0.48943156003952026, + "learning_rate": 1.3724809767591967e-06, + "loss": 0.0205, + "num_input_tokens_seen": 197667200, + "step": 162455 + }, + { + "epoch": 18.093328878494265, + "grad_norm": 0.5007269978523254, + "learning_rate": 1.3716871009273742e-06, + "loss": 0.0214, + "num_input_tokens_seen": 197672992, + "step": 162460 + }, + { + "epoch": 18.09388573337788, + "grad_norm": 0.01804294064640999, + "learning_rate": 1.3708934482839026e-06, + "loss": 0.0532, + "num_input_tokens_seen": 197679264, + "step": 162465 + }, + { + "epoch": 18.0944425882615, + "grad_norm": 0.004107200540602207, + "learning_rate": 1.3701000188362734e-06, + "loss": 0.02, + "num_input_tokens_seen": 197685184, + "step": 162470 + }, + { + "epoch": 18.094999443145117, + "grad_norm": 0.21403545141220093, + "learning_rate": 1.3693068125919806e-06, + "loss": 0.0107, + "num_input_tokens_seen": 197691136, + "step": 162475 + }, + { + "epoch": 18.095556298028733, + "grad_norm": 2.2355434894561768, + "learning_rate": 1.3685138295585153e-06, + "loss": 0.0367, + "num_input_tokens_seen": 197697184, + "step": 162480 + }, + { + "epoch": 18.096113152912352, + "grad_norm": 0.09584714472293854, + "learning_rate": 1.3677210697433717e-06, + "loss": 0.2003, + "num_input_tokens_seen": 197703424, + "step": 162485 + }, + { + "epoch": 18.096670007795968, + "grad_norm": 0.0023413985036313534, + "learning_rate": 1.3669285331540383e-06, + "loss": 0.0168, + "num_input_tokens_seen": 197709696, + "step": 162490 + }, + { + "epoch": 18.097226862679587, + "grad_norm": 0.13805870711803436, + "learning_rate": 1.3661362197979977e-06, + "loss": 0.1174, + "num_input_tokens_seen": 197715680, + "step": 162495 + }, + { + "epoch": 18.097783717563203, + "grad_norm": 0.017908846959471703, + "learning_rate": 1.3653441296827386e-06, + "loss": 0.018, + "num_input_tokens_seen": 197722208, + "step": 162500 + }, + { + "epoch": 18.09834057244682, + "grad_norm": 4.105159282684326, + "learning_rate": 1.3645522628157326e-06, + "loss": 0.0409, + "num_input_tokens_seen": 197728192, + "step": 162505 + }, + { + "epoch": 18.09889742733044, + "grad_norm": 0.4834304451942444, + "learning_rate": 1.363760619204474e-06, + "loss": 0.0105, + "num_input_tokens_seen": 197733952, + "step": 162510 + }, + { + "epoch": 18.099454282214055, + "grad_norm": 1.974392056465149, + "learning_rate": 1.3629691988564314e-06, + "loss": 0.0886, + "num_input_tokens_seen": 197740000, + "step": 162515 + }, + { + "epoch": 18.100011137097674, + "grad_norm": 0.026902319863438606, + "learning_rate": 1.3621780017790825e-06, + "loss": 0.0197, + "num_input_tokens_seen": 197745984, + "step": 162520 + }, + { + "epoch": 18.10056799198129, + "grad_norm": 0.003284451551735401, + "learning_rate": 1.3613870279798963e-06, + "loss": 0.0149, + "num_input_tokens_seen": 197752160, + "step": 162525 + }, + { + "epoch": 18.101124846864906, + "grad_norm": 1.9268733263015747, + "learning_rate": 1.3605962774663527e-06, + "loss": 0.0813, + "num_input_tokens_seen": 197758048, + "step": 162530 + }, + { + "epoch": 18.101681701748525, + "grad_norm": 0.0010680491104722023, + "learning_rate": 1.3598057502459155e-06, + "loss": 0.016, + "num_input_tokens_seen": 197764128, + "step": 162535 + }, + { + "epoch": 18.10223855663214, + "grad_norm": 0.10642915964126587, + "learning_rate": 1.3590154463260562e-06, + "loss": 0.0406, + "num_input_tokens_seen": 197770304, + "step": 162540 + }, + { + "epoch": 18.10279541151576, + "grad_norm": 0.0007261877181008458, + "learning_rate": 1.358225365714233e-06, + "loss": 0.102, + "num_input_tokens_seen": 197776544, + "step": 162545 + }, + { + "epoch": 18.103352266399376, + "grad_norm": 0.00010895860032178462, + "learning_rate": 1.3574355084179146e-06, + "loss": 0.0173, + "num_input_tokens_seen": 197782400, + "step": 162550 + }, + { + "epoch": 18.103909121282992, + "grad_norm": 0.26810845732688904, + "learning_rate": 1.3566458744445566e-06, + "loss": 0.0139, + "num_input_tokens_seen": 197788224, + "step": 162555 + }, + { + "epoch": 18.104465976166612, + "grad_norm": 2.72416615486145, + "learning_rate": 1.3558564638016275e-06, + "loss": 0.3136, + "num_input_tokens_seen": 197793760, + "step": 162560 + }, + { + "epoch": 18.105022831050228, + "grad_norm": 1.225946307182312, + "learning_rate": 1.3550672764965744e-06, + "loss": 0.1145, + "num_input_tokens_seen": 197799680, + "step": 162565 + }, + { + "epoch": 18.105579685933847, + "grad_norm": 1.1011179685592651, + "learning_rate": 1.3542783125368552e-06, + "loss": 0.0329, + "num_input_tokens_seen": 197805952, + "step": 162570 + }, + { + "epoch": 18.106136540817463, + "grad_norm": 0.9132959246635437, + "learning_rate": 1.3534895719299196e-06, + "loss": 0.0781, + "num_input_tokens_seen": 197811584, + "step": 162575 + }, + { + "epoch": 18.10669339570108, + "grad_norm": 0.0011518847895786166, + "learning_rate": 1.3527010546832225e-06, + "loss": 0.025, + "num_input_tokens_seen": 197817472, + "step": 162580 + }, + { + "epoch": 18.1072502505847, + "grad_norm": 0.026222536340355873, + "learning_rate": 1.351912760804208e-06, + "loss": 0.0039, + "num_input_tokens_seen": 197822976, + "step": 162585 + }, + { + "epoch": 18.107807105468314, + "grad_norm": 0.03080509416759014, + "learning_rate": 1.351124690300329e-06, + "loss": 0.1326, + "num_input_tokens_seen": 197828864, + "step": 162590 + }, + { + "epoch": 18.108363960351934, + "grad_norm": 0.012869639322161674, + "learning_rate": 1.3503368431790176e-06, + "loss": 0.0122, + "num_input_tokens_seen": 197835072, + "step": 162595 + }, + { + "epoch": 18.10892081523555, + "grad_norm": 0.05912568047642708, + "learning_rate": 1.3495492194477267e-06, + "loss": 0.0416, + "num_input_tokens_seen": 197841120, + "step": 162600 + }, + { + "epoch": 18.109477670119166, + "grad_norm": 0.5690122246742249, + "learning_rate": 1.3487618191138861e-06, + "loss": 0.0743, + "num_input_tokens_seen": 197846304, + "step": 162605 + }, + { + "epoch": 18.110034525002785, + "grad_norm": 0.14170122146606445, + "learning_rate": 1.347974642184946e-06, + "loss": 0.0356, + "num_input_tokens_seen": 197852320, + "step": 162610 + }, + { + "epoch": 18.1105913798864, + "grad_norm": 2.1638600826263428, + "learning_rate": 1.3471876886683331e-06, + "loss": 0.1061, + "num_input_tokens_seen": 197857792, + "step": 162615 + }, + { + "epoch": 18.11114823477002, + "grad_norm": 0.09354809671640396, + "learning_rate": 1.3464009585714838e-06, + "loss": 0.0117, + "num_input_tokens_seen": 197863840, + "step": 162620 + }, + { + "epoch": 18.111705089653636, + "grad_norm": 0.002843884751200676, + "learning_rate": 1.3456144519018249e-06, + "loss": 0.0009, + "num_input_tokens_seen": 197869792, + "step": 162625 + }, + { + "epoch": 18.112261944537252, + "grad_norm": 0.00014556932728737593, + "learning_rate": 1.3448281686667923e-06, + "loss": 0.0605, + "num_input_tokens_seen": 197876064, + "step": 162630 + }, + { + "epoch": 18.11281879942087, + "grad_norm": 0.3400875926017761, + "learning_rate": 1.3440421088738109e-06, + "loss": 0.0156, + "num_input_tokens_seen": 197882112, + "step": 162635 + }, + { + "epoch": 18.113375654304487, + "grad_norm": 0.3720933496952057, + "learning_rate": 1.343256272530305e-06, + "loss": 0.0358, + "num_input_tokens_seen": 197888064, + "step": 162640 + }, + { + "epoch": 18.113932509188107, + "grad_norm": 0.003513916628435254, + "learning_rate": 1.3424706596436909e-06, + "loss": 0.0121, + "num_input_tokens_seen": 197894144, + "step": 162645 + }, + { + "epoch": 18.114489364071723, + "grad_norm": 0.6445779800415039, + "learning_rate": 1.3416852702213989e-06, + "loss": 0.0119, + "num_input_tokens_seen": 197900256, + "step": 162650 + }, + { + "epoch": 18.11504621895534, + "grad_norm": 0.08949082344770432, + "learning_rate": 1.3409001042708425e-06, + "loss": 0.0701, + "num_input_tokens_seen": 197906464, + "step": 162655 + }, + { + "epoch": 18.115603073838958, + "grad_norm": 1.037630319595337, + "learning_rate": 1.340115161799449e-06, + "loss": 0.0527, + "num_input_tokens_seen": 197912480, + "step": 162660 + }, + { + "epoch": 18.116159928722574, + "grad_norm": 0.3497202694416046, + "learning_rate": 1.3393304428146125e-06, + "loss": 0.0302, + "num_input_tokens_seen": 197918496, + "step": 162665 + }, + { + "epoch": 18.116716783606194, + "grad_norm": 0.009104114957153797, + "learning_rate": 1.3385459473237632e-06, + "loss": 0.0036, + "num_input_tokens_seen": 197923904, + "step": 162670 + }, + { + "epoch": 18.11727363848981, + "grad_norm": 0.0006124608335085213, + "learning_rate": 1.337761675334301e-06, + "loss": 0.0122, + "num_input_tokens_seen": 197929952, + "step": 162675 + }, + { + "epoch": 18.117830493373425, + "grad_norm": 2.0125339031219482, + "learning_rate": 1.3369776268536388e-06, + "loss": 0.2592, + "num_input_tokens_seen": 197936128, + "step": 162680 + }, + { + "epoch": 18.118387348257045, + "grad_norm": 0.0003347570600453764, + "learning_rate": 1.3361938018891822e-06, + "loss": 0.0028, + "num_input_tokens_seen": 197942272, + "step": 162685 + }, + { + "epoch": 18.11894420314066, + "grad_norm": 0.002164155477657914, + "learning_rate": 1.3354102004483366e-06, + "loss": 0.0007, + "num_input_tokens_seen": 197948544, + "step": 162690 + }, + { + "epoch": 18.11950105802428, + "grad_norm": 0.00019709129992406815, + "learning_rate": 1.3346268225384957e-06, + "loss": 0.0049, + "num_input_tokens_seen": 197954848, + "step": 162695 + }, + { + "epoch": 18.120057912907896, + "grad_norm": 0.7185685038566589, + "learning_rate": 1.3338436681670674e-06, + "loss": 0.028, + "num_input_tokens_seen": 197960448, + "step": 162700 + }, + { + "epoch": 18.120614767791512, + "grad_norm": 0.055004481226205826, + "learning_rate": 1.333060737341449e-06, + "loss": 0.0061, + "num_input_tokens_seen": 197966656, + "step": 162705 + }, + { + "epoch": 18.12117162267513, + "grad_norm": 0.1284942477941513, + "learning_rate": 1.3322780300690313e-06, + "loss": 0.0778, + "num_input_tokens_seen": 197972992, + "step": 162710 + }, + { + "epoch": 18.121728477558747, + "grad_norm": 0.12740777432918549, + "learning_rate": 1.3314955463572088e-06, + "loss": 0.0279, + "num_input_tokens_seen": 197978944, + "step": 162715 + }, + { + "epoch": 18.122285332442367, + "grad_norm": 0.002038130769506097, + "learning_rate": 1.3307132862133754e-06, + "loss": 0.0013, + "num_input_tokens_seen": 197985184, + "step": 162720 + }, + { + "epoch": 18.122842187325983, + "grad_norm": 0.3349359333515167, + "learning_rate": 1.3299312496449168e-06, + "loss": 0.0522, + "num_input_tokens_seen": 197991072, + "step": 162725 + }, + { + "epoch": 18.1233990422096, + "grad_norm": 0.0029115984216332436, + "learning_rate": 1.3291494366592272e-06, + "loss": 0.0519, + "num_input_tokens_seen": 197996832, + "step": 162730 + }, + { + "epoch": 18.123955897093218, + "grad_norm": 0.0010813025292009115, + "learning_rate": 1.3283678472636784e-06, + "loss": 0.0893, + "num_input_tokens_seen": 198003104, + "step": 162735 + }, + { + "epoch": 18.124512751976834, + "grad_norm": 0.027019301429390907, + "learning_rate": 1.3275864814656674e-06, + "loss": 0.0505, + "num_input_tokens_seen": 198009056, + "step": 162740 + }, + { + "epoch": 18.125069606860453, + "grad_norm": 0.17578047513961792, + "learning_rate": 1.3268053392725605e-06, + "loss": 0.0034, + "num_input_tokens_seen": 198015072, + "step": 162745 + }, + { + "epoch": 18.12562646174407, + "grad_norm": 0.0009412086801603436, + "learning_rate": 1.3260244206917489e-06, + "loss": 0.0042, + "num_input_tokens_seen": 198021504, + "step": 162750 + }, + { + "epoch": 18.126183316627685, + "grad_norm": 0.9111068844795227, + "learning_rate": 1.3252437257306044e-06, + "loss": 0.0309, + "num_input_tokens_seen": 198027552, + "step": 162755 + }, + { + "epoch": 18.126740171511305, + "grad_norm": 0.004344055894762278, + "learning_rate": 1.324463254396502e-06, + "loss": 0.0599, + "num_input_tokens_seen": 198033472, + "step": 162760 + }, + { + "epoch": 18.12729702639492, + "grad_norm": 1.1573636531829834, + "learning_rate": 1.3236830066968075e-06, + "loss": 0.0797, + "num_input_tokens_seen": 198039232, + "step": 162765 + }, + { + "epoch": 18.12785388127854, + "grad_norm": 0.43932756781578064, + "learning_rate": 1.3229029826388988e-06, + "loss": 0.0439, + "num_input_tokens_seen": 198045376, + "step": 162770 + }, + { + "epoch": 18.128410736162156, + "grad_norm": 2.520371198654175, + "learning_rate": 1.322123182230145e-06, + "loss": 0.0952, + "num_input_tokens_seen": 198051328, + "step": 162775 + }, + { + "epoch": 18.12896759104577, + "grad_norm": 0.34611353278160095, + "learning_rate": 1.3213436054779039e-06, + "loss": 0.0453, + "num_input_tokens_seen": 198057312, + "step": 162780 + }, + { + "epoch": 18.12952444592939, + "grad_norm": 0.0002717097522690892, + "learning_rate": 1.3205642523895418e-06, + "loss": 0.0212, + "num_input_tokens_seen": 198063840, + "step": 162785 + }, + { + "epoch": 18.130081300813007, + "grad_norm": 0.04943627864122391, + "learning_rate": 1.319785122972425e-06, + "loss": 0.0035, + "num_input_tokens_seen": 198069760, + "step": 162790 + }, + { + "epoch": 18.130638155696627, + "grad_norm": 0.09016117453575134, + "learning_rate": 1.3190062172339062e-06, + "loss": 0.1485, + "num_input_tokens_seen": 198075968, + "step": 162795 + }, + { + "epoch": 18.131195010580242, + "grad_norm": 0.00025130825815722346, + "learning_rate": 1.3182275351813516e-06, + "loss": 0.1222, + "num_input_tokens_seen": 198082368, + "step": 162800 + }, + { + "epoch": 18.131751865463862, + "grad_norm": 0.0002853104961104691, + "learning_rate": 1.317449076822111e-06, + "loss": 0.0116, + "num_input_tokens_seen": 198088160, + "step": 162805 + }, + { + "epoch": 18.132308720347478, + "grad_norm": 0.01292977761477232, + "learning_rate": 1.3166708421635366e-06, + "loss": 0.034, + "num_input_tokens_seen": 198093952, + "step": 162810 + }, + { + "epoch": 18.132865575231094, + "grad_norm": 1.937321424484253, + "learning_rate": 1.315892831212978e-06, + "loss": 0.0484, + "num_input_tokens_seen": 198099968, + "step": 162815 + }, + { + "epoch": 18.133422430114713, + "grad_norm": 0.024636395275592804, + "learning_rate": 1.3151150439777909e-06, + "loss": 0.0741, + "num_input_tokens_seen": 198106368, + "step": 162820 + }, + { + "epoch": 18.13397928499833, + "grad_norm": 0.04489880055189133, + "learning_rate": 1.314337480465319e-06, + "loss": 0.022, + "num_input_tokens_seen": 198112832, + "step": 162825 + }, + { + "epoch": 18.13453613988195, + "grad_norm": 0.00023393338778987527, + "learning_rate": 1.313560140682904e-06, + "loss": 0.0077, + "num_input_tokens_seen": 198118656, + "step": 162830 + }, + { + "epoch": 18.135092994765564, + "grad_norm": 1.5457578897476196, + "learning_rate": 1.3127830246378897e-06, + "loss": 0.0617, + "num_input_tokens_seen": 198125120, + "step": 162835 + }, + { + "epoch": 18.13564984964918, + "grad_norm": 0.0014362650690600276, + "learning_rate": 1.3120061323376205e-06, + "loss": 0.0081, + "num_input_tokens_seen": 198131328, + "step": 162840 + }, + { + "epoch": 18.1362067045328, + "grad_norm": 0.014089515432715416, + "learning_rate": 1.3112294637894263e-06, + "loss": 0.0335, + "num_input_tokens_seen": 198137248, + "step": 162845 + }, + { + "epoch": 18.136763559416416, + "grad_norm": 1.1810591220855713, + "learning_rate": 1.310453019000657e-06, + "loss": 0.0655, + "num_input_tokens_seen": 198143616, + "step": 162850 + }, + { + "epoch": 18.137320414300035, + "grad_norm": 0.0010842113988474011, + "learning_rate": 1.3096767979786345e-06, + "loss": 0.0527, + "num_input_tokens_seen": 198149984, + "step": 162855 + }, + { + "epoch": 18.13787726918365, + "grad_norm": 0.6024448871612549, + "learning_rate": 1.3089008007306947e-06, + "loss": 0.0809, + "num_input_tokens_seen": 198156192, + "step": 162860 + }, + { + "epoch": 18.138434124067267, + "grad_norm": 0.014509365893900394, + "learning_rate": 1.3081250272641649e-06, + "loss": 0.0046, + "num_input_tokens_seen": 198162400, + "step": 162865 + }, + { + "epoch": 18.138990978950886, + "grad_norm": 0.38155311346054077, + "learning_rate": 1.307349477586381e-06, + "loss": 0.0576, + "num_input_tokens_seen": 198168608, + "step": 162870 + }, + { + "epoch": 18.139547833834502, + "grad_norm": 0.08103925734758377, + "learning_rate": 1.306574151704662e-06, + "loss": 0.0599, + "num_input_tokens_seen": 198174656, + "step": 162875 + }, + { + "epoch": 18.14010468871812, + "grad_norm": 0.3056624233722687, + "learning_rate": 1.3057990496263357e-06, + "loss": 0.0455, + "num_input_tokens_seen": 198180320, + "step": 162880 + }, + { + "epoch": 18.140661543601738, + "grad_norm": 0.0010682620340958238, + "learning_rate": 1.3050241713587152e-06, + "loss": 0.0127, + "num_input_tokens_seen": 198186688, + "step": 162885 + }, + { + "epoch": 18.141218398485353, + "grad_norm": 0.5986088514328003, + "learning_rate": 1.3042495169091285e-06, + "loss": 0.1264, + "num_input_tokens_seen": 198192608, + "step": 162890 + }, + { + "epoch": 18.141775253368973, + "grad_norm": 2.5420000553131104, + "learning_rate": 1.3034750862848916e-06, + "loss": 0.114, + "num_input_tokens_seen": 198198464, + "step": 162895 + }, + { + "epoch": 18.14233210825259, + "grad_norm": 0.05097053945064545, + "learning_rate": 1.3027008794933155e-06, + "loss": 0.043, + "num_input_tokens_seen": 198204384, + "step": 162900 + }, + { + "epoch": 18.14288896313621, + "grad_norm": 0.5205267071723938, + "learning_rate": 1.3019268965417165e-06, + "loss": 0.0303, + "num_input_tokens_seen": 198210400, + "step": 162905 + }, + { + "epoch": 18.143445818019824, + "grad_norm": 0.000614285294432193, + "learning_rate": 1.3011531374374026e-06, + "loss": 0.0285, + "num_input_tokens_seen": 198216768, + "step": 162910 + }, + { + "epoch": 18.14400267290344, + "grad_norm": 0.00017231833771802485, + "learning_rate": 1.3003796021876875e-06, + "loss": 0.0104, + "num_input_tokens_seen": 198223200, + "step": 162915 + }, + { + "epoch": 18.14455952778706, + "grad_norm": 0.02744438871741295, + "learning_rate": 1.2996062907998735e-06, + "loss": 0.0207, + "num_input_tokens_seen": 198229184, + "step": 162920 + }, + { + "epoch": 18.145116382670675, + "grad_norm": 1.2142175436019897, + "learning_rate": 1.2988332032812662e-06, + "loss": 0.1498, + "num_input_tokens_seen": 198235232, + "step": 162925 + }, + { + "epoch": 18.145673237554295, + "grad_norm": 0.00011700517643475905, + "learning_rate": 1.2980603396391678e-06, + "loss": 0.0126, + "num_input_tokens_seen": 198241568, + "step": 162930 + }, + { + "epoch": 18.14623009243791, + "grad_norm": 0.033533353358507156, + "learning_rate": 1.297287699880878e-06, + "loss": 0.0049, + "num_input_tokens_seen": 198247776, + "step": 162935 + }, + { + "epoch": 18.146786947321527, + "grad_norm": 0.16803227365016937, + "learning_rate": 1.296515284013697e-06, + "loss": 0.0085, + "num_input_tokens_seen": 198253952, + "step": 162940 + }, + { + "epoch": 18.147343802205146, + "grad_norm": 0.34016814827919006, + "learning_rate": 1.2957430920449266e-06, + "loss": 0.0594, + "num_input_tokens_seen": 198260064, + "step": 162945 + }, + { + "epoch": 18.147900657088762, + "grad_norm": 0.016680007800459862, + "learning_rate": 1.2949711239818447e-06, + "loss": 0.0471, + "num_input_tokens_seen": 198266048, + "step": 162950 + }, + { + "epoch": 18.14845751197238, + "grad_norm": 0.5585816502571106, + "learning_rate": 1.2941993798317565e-06, + "loss": 0.0136, + "num_input_tokens_seen": 198272064, + "step": 162955 + }, + { + "epoch": 18.149014366855997, + "grad_norm": 1.7106389999389648, + "learning_rate": 1.2934278596019477e-06, + "loss": 0.035, + "num_input_tokens_seen": 198278272, + "step": 162960 + }, + { + "epoch": 18.149571221739613, + "grad_norm": 1.584906816482544, + "learning_rate": 1.2926565632997074e-06, + "loss": 0.0967, + "num_input_tokens_seen": 198284224, + "step": 162965 + }, + { + "epoch": 18.150128076623233, + "grad_norm": 0.034338708966970444, + "learning_rate": 1.2918854909323181e-06, + "loss": 0.0126, + "num_input_tokens_seen": 198290688, + "step": 162970 + }, + { + "epoch": 18.15068493150685, + "grad_norm": 0.00046374337398447096, + "learning_rate": 1.2911146425070686e-06, + "loss": 0.0051, + "num_input_tokens_seen": 198296640, + "step": 162975 + }, + { + "epoch": 18.151241786390468, + "grad_norm": 0.09927720576524734, + "learning_rate": 1.290344018031231e-06, + "loss": 0.0216, + "num_input_tokens_seen": 198302816, + "step": 162980 + }, + { + "epoch": 18.151798641274084, + "grad_norm": 1.047486424446106, + "learning_rate": 1.2895736175120937e-06, + "loss": 0.1045, + "num_input_tokens_seen": 198308832, + "step": 162985 + }, + { + "epoch": 18.1523554961577, + "grad_norm": 0.4531489908695221, + "learning_rate": 1.288803440956929e-06, + "loss": 0.0042, + "num_input_tokens_seen": 198314944, + "step": 162990 + }, + { + "epoch": 18.15291235104132, + "grad_norm": 0.9112651944160461, + "learning_rate": 1.2880334883730137e-06, + "loss": 0.092, + "num_input_tokens_seen": 198320224, + "step": 162995 + }, + { + "epoch": 18.153469205924935, + "grad_norm": 0.0008714733412489295, + "learning_rate": 1.287263759767618e-06, + "loss": 0.0407, + "num_input_tokens_seen": 198326048, + "step": 163000 + }, + { + "epoch": 18.154026060808555, + "grad_norm": 0.015601450577378273, + "learning_rate": 1.2864942551480157e-06, + "loss": 0.0199, + "num_input_tokens_seen": 198331968, + "step": 163005 + }, + { + "epoch": 18.15458291569217, + "grad_norm": 0.0027355540078133345, + "learning_rate": 1.2857249745214712e-06, + "loss": 0.0228, + "num_input_tokens_seen": 198337760, + "step": 163010 + }, + { + "epoch": 18.155139770575786, + "grad_norm": 0.011990751139819622, + "learning_rate": 1.2849559178952586e-06, + "loss": 0.0177, + "num_input_tokens_seen": 198343968, + "step": 163015 + }, + { + "epoch": 18.155696625459406, + "grad_norm": 0.07171895354986191, + "learning_rate": 1.2841870852766363e-06, + "loss": 0.0012, + "num_input_tokens_seen": 198350176, + "step": 163020 + }, + { + "epoch": 18.156253480343022, + "grad_norm": 0.1345161348581314, + "learning_rate": 1.283418476672868e-06, + "loss": 0.0138, + "num_input_tokens_seen": 198356672, + "step": 163025 + }, + { + "epoch": 18.15681033522664, + "grad_norm": 0.003032241016626358, + "learning_rate": 1.2826500920912087e-06, + "loss": 0.0001, + "num_input_tokens_seen": 198362880, + "step": 163030 + }, + { + "epoch": 18.157367190110257, + "grad_norm": 0.08436311036348343, + "learning_rate": 1.281881931538928e-06, + "loss": 0.0595, + "num_input_tokens_seen": 198368896, + "step": 163035 + }, + { + "epoch": 18.157924044993873, + "grad_norm": 0.018602760508656502, + "learning_rate": 1.2811139950232726e-06, + "loss": 0.0156, + "num_input_tokens_seen": 198375072, + "step": 163040 + }, + { + "epoch": 18.158480899877492, + "grad_norm": 0.2514418959617615, + "learning_rate": 1.2803462825514979e-06, + "loss": 0.0651, + "num_input_tokens_seen": 198380992, + "step": 163045 + }, + { + "epoch": 18.15903775476111, + "grad_norm": 0.17002485692501068, + "learning_rate": 1.2795787941308562e-06, + "loss": 0.023, + "num_input_tokens_seen": 198387360, + "step": 163050 + }, + { + "epoch": 18.159594609644728, + "grad_norm": 0.4587302803993225, + "learning_rate": 1.2788115297685976e-06, + "loss": 0.0067, + "num_input_tokens_seen": 198393504, + "step": 163055 + }, + { + "epoch": 18.160151464528344, + "grad_norm": 0.0006043343455530703, + "learning_rate": 1.2780444894719689e-06, + "loss": 0.0202, + "num_input_tokens_seen": 198399616, + "step": 163060 + }, + { + "epoch": 18.16070831941196, + "grad_norm": 0.005375794600695372, + "learning_rate": 1.27727767324822e-06, + "loss": 0.0763, + "num_input_tokens_seen": 198405280, + "step": 163065 + }, + { + "epoch": 18.16126517429558, + "grad_norm": 0.00014701252803206444, + "learning_rate": 1.2765110811045838e-06, + "loss": 0.005, + "num_input_tokens_seen": 198411392, + "step": 163070 + }, + { + "epoch": 18.161822029179195, + "grad_norm": 0.360930472612381, + "learning_rate": 1.2757447130483103e-06, + "loss": 0.0355, + "num_input_tokens_seen": 198417792, + "step": 163075 + }, + { + "epoch": 18.162378884062814, + "grad_norm": 0.031153744086623192, + "learning_rate": 1.2749785690866324e-06, + "loss": 0.0191, + "num_input_tokens_seen": 198423904, + "step": 163080 + }, + { + "epoch": 18.16293573894643, + "grad_norm": 2.1055262088775635, + "learning_rate": 1.2742126492267942e-06, + "loss": 0.1563, + "num_input_tokens_seen": 198429824, + "step": 163085 + }, + { + "epoch": 18.163492593830046, + "grad_norm": 0.002373615512624383, + "learning_rate": 1.2734469534760263e-06, + "loss": 0.0062, + "num_input_tokens_seen": 198436256, + "step": 163090 + }, + { + "epoch": 18.164049448713666, + "grad_norm": 0.0022109155543148518, + "learning_rate": 1.2726814818415617e-06, + "loss": 0.0725, + "num_input_tokens_seen": 198442336, + "step": 163095 + }, + { + "epoch": 18.16460630359728, + "grad_norm": 0.3260737657546997, + "learning_rate": 1.2719162343306252e-06, + "loss": 0.0431, + "num_input_tokens_seen": 198448096, + "step": 163100 + }, + { + "epoch": 18.1651631584809, + "grad_norm": 1.0153638124465942, + "learning_rate": 1.2711512109504553e-06, + "loss": 0.0208, + "num_input_tokens_seen": 198453696, + "step": 163105 + }, + { + "epoch": 18.165720013364517, + "grad_norm": 2.380403995513916, + "learning_rate": 1.2703864117082742e-06, + "loss": 0.0832, + "num_input_tokens_seen": 198459648, + "step": 163110 + }, + { + "epoch": 18.166276868248133, + "grad_norm": 0.04423213005065918, + "learning_rate": 1.2696218366113038e-06, + "loss": 0.1137, + "num_input_tokens_seen": 198466080, + "step": 163115 + }, + { + "epoch": 18.166833723131752, + "grad_norm": 0.19086280465126038, + "learning_rate": 1.2688574856667635e-06, + "loss": 0.0905, + "num_input_tokens_seen": 198472320, + "step": 163120 + }, + { + "epoch": 18.167390578015368, + "grad_norm": 0.009337813593447208, + "learning_rate": 1.2680933588818833e-06, + "loss": 0.0216, + "num_input_tokens_seen": 198478368, + "step": 163125 + }, + { + "epoch": 18.167947432898988, + "grad_norm": 2.615748167037964, + "learning_rate": 1.2673294562638688e-06, + "loss": 0.0569, + "num_input_tokens_seen": 198484320, + "step": 163130 + }, + { + "epoch": 18.168504287782604, + "grad_norm": 0.14970268309116364, + "learning_rate": 1.2665657778199503e-06, + "loss": 0.0228, + "num_input_tokens_seen": 198490784, + "step": 163135 + }, + { + "epoch": 18.169061142666223, + "grad_norm": 0.003668429795652628, + "learning_rate": 1.2658023235573274e-06, + "loss": 0.0077, + "num_input_tokens_seen": 198497088, + "step": 163140 + }, + { + "epoch": 18.16961799754984, + "grad_norm": 1.8616228103637695, + "learning_rate": 1.2650390934832168e-06, + "loss": 0.1901, + "num_input_tokens_seen": 198503136, + "step": 163145 + }, + { + "epoch": 18.170174852433455, + "grad_norm": 0.0005492084892466664, + "learning_rate": 1.2642760876048293e-06, + "loss": 0.0018, + "num_input_tokens_seen": 198508704, + "step": 163150 + }, + { + "epoch": 18.170731707317074, + "grad_norm": 0.013706383295357227, + "learning_rate": 1.2635133059293702e-06, + "loss": 0.0038, + "num_input_tokens_seen": 198514752, + "step": 163155 + }, + { + "epoch": 18.17128856220069, + "grad_norm": 0.0007460081251338124, + "learning_rate": 1.2627507484640477e-06, + "loss": 0.009, + "num_input_tokens_seen": 198520960, + "step": 163160 + }, + { + "epoch": 18.17184541708431, + "grad_norm": 0.118230901658535, + "learning_rate": 1.2619884152160615e-06, + "loss": 0.0393, + "num_input_tokens_seen": 198527168, + "step": 163165 + }, + { + "epoch": 18.172402271967925, + "grad_norm": 0.0032746761571615934, + "learning_rate": 1.2612263061926087e-06, + "loss": 0.0659, + "num_input_tokens_seen": 198532480, + "step": 163170 + }, + { + "epoch": 18.17295912685154, + "grad_norm": 0.4403669834136963, + "learning_rate": 1.2604644214008976e-06, + "loss": 0.0057, + "num_input_tokens_seen": 198538752, + "step": 163175 + }, + { + "epoch": 18.17351598173516, + "grad_norm": 0.0004726488550659269, + "learning_rate": 1.2597027608481193e-06, + "loss": 0.0318, + "num_input_tokens_seen": 198545344, + "step": 163180 + }, + { + "epoch": 18.174072836618777, + "grad_norm": 0.004218921530991793, + "learning_rate": 1.258941324541471e-06, + "loss": 0.1914, + "num_input_tokens_seen": 198550944, + "step": 163185 + }, + { + "epoch": 18.174629691502396, + "grad_norm": 0.711249053478241, + "learning_rate": 1.258180112488136e-06, + "loss": 0.0242, + "num_input_tokens_seen": 198557120, + "step": 163190 + }, + { + "epoch": 18.175186546386012, + "grad_norm": 0.011135881766676903, + "learning_rate": 1.2574191246953166e-06, + "loss": 0.0137, + "num_input_tokens_seen": 198563360, + "step": 163195 + }, + { + "epoch": 18.175743401269628, + "grad_norm": 0.00020478277292568237, + "learning_rate": 1.2566583611701933e-06, + "loss": 0.0061, + "num_input_tokens_seen": 198569728, + "step": 163200 + }, + { + "epoch": 18.176300256153247, + "grad_norm": 1.7264678478240967, + "learning_rate": 1.2558978219199573e-06, + "loss": 0.0755, + "num_input_tokens_seen": 198576032, + "step": 163205 + }, + { + "epoch": 18.176857111036863, + "grad_norm": 0.3130232095718384, + "learning_rate": 1.2551375069517895e-06, + "loss": 0.0593, + "num_input_tokens_seen": 198582176, + "step": 163210 + }, + { + "epoch": 18.177413965920483, + "grad_norm": 0.005548287648707628, + "learning_rate": 1.2543774162728728e-06, + "loss": 0.0555, + "num_input_tokens_seen": 198587616, + "step": 163215 + }, + { + "epoch": 18.1779708208041, + "grad_norm": 0.000762469251640141, + "learning_rate": 1.2536175498903817e-06, + "loss": 0.0058, + "num_input_tokens_seen": 198593760, + "step": 163220 + }, + { + "epoch": 18.178527675687715, + "grad_norm": 0.6492173075675964, + "learning_rate": 1.2528579078115e-06, + "loss": 0.015, + "num_input_tokens_seen": 198599104, + "step": 163225 + }, + { + "epoch": 18.179084530571334, + "grad_norm": 0.5259151458740234, + "learning_rate": 1.2520984900434046e-06, + "loss": 0.0256, + "num_input_tokens_seen": 198605408, + "step": 163230 + }, + { + "epoch": 18.17964138545495, + "grad_norm": 1.3601995706558228, + "learning_rate": 1.2513392965932625e-06, + "loss": 0.0551, + "num_input_tokens_seen": 198611744, + "step": 163235 + }, + { + "epoch": 18.18019824033857, + "grad_norm": 0.00010303266026312485, + "learning_rate": 1.2505803274682454e-06, + "loss": 0.0018, + "num_input_tokens_seen": 198617856, + "step": 163240 + }, + { + "epoch": 18.180755095222185, + "grad_norm": 0.01439129002392292, + "learning_rate": 1.2498215826755283e-06, + "loss": 0.0003, + "num_input_tokens_seen": 198624448, + "step": 163245 + }, + { + "epoch": 18.1813119501058, + "grad_norm": 1.17857027053833, + "learning_rate": 1.2490630622222721e-06, + "loss": 0.037, + "num_input_tokens_seen": 198630432, + "step": 163250 + }, + { + "epoch": 18.18186880498942, + "grad_norm": 0.0013206881703808904, + "learning_rate": 1.2483047661156517e-06, + "loss": 0.007, + "num_input_tokens_seen": 198636640, + "step": 163255 + }, + { + "epoch": 18.182425659873036, + "grad_norm": 1.1918013095855713, + "learning_rate": 1.247546694362814e-06, + "loss": 0.0332, + "num_input_tokens_seen": 198642624, + "step": 163260 + }, + { + "epoch": 18.182982514756656, + "grad_norm": 0.007909374311566353, + "learning_rate": 1.246788846970931e-06, + "loss": 0.0202, + "num_input_tokens_seen": 198648704, + "step": 163265 + }, + { + "epoch": 18.183539369640272, + "grad_norm": 0.15364231169223785, + "learning_rate": 1.2460312239471555e-06, + "loss": 0.0017, + "num_input_tokens_seen": 198654912, + "step": 163270 + }, + { + "epoch": 18.184096224523888, + "grad_norm": 0.003871982218697667, + "learning_rate": 1.2452738252986513e-06, + "loss": 0.0548, + "num_input_tokens_seen": 198660736, + "step": 163275 + }, + { + "epoch": 18.184653079407507, + "grad_norm": 0.8962801098823547, + "learning_rate": 1.244516651032565e-06, + "loss": 0.1125, + "num_input_tokens_seen": 198666432, + "step": 163280 + }, + { + "epoch": 18.185209934291123, + "grad_norm": 0.05237922817468643, + "learning_rate": 1.2437597011560526e-06, + "loss": 0.0077, + "num_input_tokens_seen": 198672352, + "step": 163285 + }, + { + "epoch": 18.185766789174743, + "grad_norm": 0.9812874794006348, + "learning_rate": 1.2430029756762606e-06, + "loss": 0.0195, + "num_input_tokens_seen": 198678848, + "step": 163290 + }, + { + "epoch": 18.18632364405836, + "grad_norm": 0.03375956043601036, + "learning_rate": 1.242246474600342e-06, + "loss": 0.0407, + "num_input_tokens_seen": 198684992, + "step": 163295 + }, + { + "epoch": 18.186880498941974, + "grad_norm": 0.02301456406712532, + "learning_rate": 1.2414901979354382e-06, + "loss": 0.0203, + "num_input_tokens_seen": 198690784, + "step": 163300 + }, + { + "epoch": 18.187437353825594, + "grad_norm": 0.04424024373292923, + "learning_rate": 1.2407341456886961e-06, + "loss": 0.001, + "num_input_tokens_seen": 198696704, + "step": 163305 + }, + { + "epoch": 18.18799420870921, + "grad_norm": 1.3385297060012817, + "learning_rate": 1.2399783178672548e-06, + "loss": 0.0869, + "num_input_tokens_seen": 198701984, + "step": 163310 + }, + { + "epoch": 18.18855106359283, + "grad_norm": 0.26892465353012085, + "learning_rate": 1.2392227144782525e-06, + "loss": 0.0102, + "num_input_tokens_seen": 198707968, + "step": 163315 + }, + { + "epoch": 18.189107918476445, + "grad_norm": 0.009776030667126179, + "learning_rate": 1.2384673355288311e-06, + "loss": 0.0024, + "num_input_tokens_seen": 198714112, + "step": 163320 + }, + { + "epoch": 18.18966477336006, + "grad_norm": 0.6414109468460083, + "learning_rate": 1.2377121810261238e-06, + "loss": 0.0071, + "num_input_tokens_seen": 198720384, + "step": 163325 + }, + { + "epoch": 18.19022162824368, + "grad_norm": 0.07379945367574692, + "learning_rate": 1.236957250977261e-06, + "loss": 0.1242, + "num_input_tokens_seen": 198726496, + "step": 163330 + }, + { + "epoch": 18.190778483127296, + "grad_norm": 0.0007870817207731307, + "learning_rate": 1.2362025453893755e-06, + "loss": 0.0376, + "num_input_tokens_seen": 198731872, + "step": 163335 + }, + { + "epoch": 18.191335338010916, + "grad_norm": 0.7249992489814758, + "learning_rate": 1.2354480642695982e-06, + "loss": 0.0264, + "num_input_tokens_seen": 198737760, + "step": 163340 + }, + { + "epoch": 18.19189219289453, + "grad_norm": 0.0007110239239409566, + "learning_rate": 1.234693807625048e-06, + "loss": 0.025, + "num_input_tokens_seen": 198743680, + "step": 163345 + }, + { + "epoch": 18.192449047778148, + "grad_norm": 0.6733055114746094, + "learning_rate": 1.2339397754628613e-06, + "loss": 0.0414, + "num_input_tokens_seen": 198749952, + "step": 163350 + }, + { + "epoch": 18.193005902661767, + "grad_norm": 0.0024230172857642174, + "learning_rate": 1.2331859677901542e-06, + "loss": 0.0028, + "num_input_tokens_seen": 198756224, + "step": 163355 + }, + { + "epoch": 18.193562757545383, + "grad_norm": 0.00016763292660471052, + "learning_rate": 1.2324323846140462e-06, + "loss": 0.0667, + "num_input_tokens_seen": 198762528, + "step": 163360 + }, + { + "epoch": 18.194119612429002, + "grad_norm": 0.012578717432916164, + "learning_rate": 1.2316790259416538e-06, + "loss": 0.0163, + "num_input_tokens_seen": 198768512, + "step": 163365 + }, + { + "epoch": 18.19467646731262, + "grad_norm": 0.08583737164735794, + "learning_rate": 1.2309258917800993e-06, + "loss": 0.0027, + "num_input_tokens_seen": 198774944, + "step": 163370 + }, + { + "epoch": 18.195233322196234, + "grad_norm": 0.0015499243745580316, + "learning_rate": 1.2301729821364931e-06, + "loss": 0.0308, + "num_input_tokens_seen": 198781280, + "step": 163375 + }, + { + "epoch": 18.195790177079854, + "grad_norm": 0.568513035774231, + "learning_rate": 1.229420297017947e-06, + "loss": 0.2851, + "num_input_tokens_seen": 198787680, + "step": 163380 + }, + { + "epoch": 18.19634703196347, + "grad_norm": 0.06592785567045212, + "learning_rate": 1.2286678364315656e-06, + "loss": 0.0583, + "num_input_tokens_seen": 198793696, + "step": 163385 + }, + { + "epoch": 18.19690388684709, + "grad_norm": 0.00011168452329002321, + "learning_rate": 1.2279156003844689e-06, + "loss": 0.0138, + "num_input_tokens_seen": 198800128, + "step": 163390 + }, + { + "epoch": 18.197460741730705, + "grad_norm": 0.02940167672932148, + "learning_rate": 1.2271635888837534e-06, + "loss": 0.0334, + "num_input_tokens_seen": 198806144, + "step": 163395 + }, + { + "epoch": 18.19801759661432, + "grad_norm": 0.05020708963274956, + "learning_rate": 1.226411801936525e-06, + "loss": 0.0022, + "num_input_tokens_seen": 198812256, + "step": 163400 + }, + { + "epoch": 18.19857445149794, + "grad_norm": 0.3192374110221863, + "learning_rate": 1.225660239549878e-06, + "loss": 0.0311, + "num_input_tokens_seen": 198818304, + "step": 163405 + }, + { + "epoch": 18.199131306381556, + "grad_norm": 0.5536450147628784, + "learning_rate": 1.2249089017309256e-06, + "loss": 0.0184, + "num_input_tokens_seen": 198824576, + "step": 163410 + }, + { + "epoch": 18.199688161265176, + "grad_norm": 1.6807363033294678, + "learning_rate": 1.2241577884867488e-06, + "loss": 0.0457, + "num_input_tokens_seen": 198830880, + "step": 163415 + }, + { + "epoch": 18.20024501614879, + "grad_norm": 0.8824822306632996, + "learning_rate": 1.2234068998244586e-06, + "loss": 0.0514, + "num_input_tokens_seen": 198837024, + "step": 163420 + }, + { + "epoch": 18.200801871032407, + "grad_norm": 0.916802704334259, + "learning_rate": 1.2226562357511352e-06, + "loss": 0.0703, + "num_input_tokens_seen": 198841792, + "step": 163425 + }, + { + "epoch": 18.201358725916027, + "grad_norm": 1.0135773420333862, + "learning_rate": 1.2219057962738783e-06, + "loss": 0.156, + "num_input_tokens_seen": 198847232, + "step": 163430 + }, + { + "epoch": 18.201915580799643, + "grad_norm": 1.8524973392486572, + "learning_rate": 1.221155581399766e-06, + "loss": 0.0298, + "num_input_tokens_seen": 198853312, + "step": 163435 + }, + { + "epoch": 18.202472435683262, + "grad_norm": 0.1411495953798294, + "learning_rate": 1.2204055911358925e-06, + "loss": 0.0755, + "num_input_tokens_seen": 198859680, + "step": 163440 + }, + { + "epoch": 18.203029290566878, + "grad_norm": 0.0007868013344705105, + "learning_rate": 1.2196558254893437e-06, + "loss": 0.03, + "num_input_tokens_seen": 198865888, + "step": 163445 + }, + { + "epoch": 18.203586145450494, + "grad_norm": 0.2133917510509491, + "learning_rate": 1.2189062844671944e-06, + "loss": 0.0259, + "num_input_tokens_seen": 198872128, + "step": 163450 + }, + { + "epoch": 18.204143000334113, + "grad_norm": 0.09689521044492722, + "learning_rate": 1.2181569680765282e-06, + "loss": 0.115, + "num_input_tokens_seen": 198878368, + "step": 163455 + }, + { + "epoch": 18.20469985521773, + "grad_norm": 0.7270209193229675, + "learning_rate": 1.217407876324425e-06, + "loss": 0.1498, + "num_input_tokens_seen": 198884864, + "step": 163460 + }, + { + "epoch": 18.20525671010135, + "grad_norm": 1.6173943281173706, + "learning_rate": 1.2166590092179547e-06, + "loss": 0.0665, + "num_input_tokens_seen": 198890624, + "step": 163465 + }, + { + "epoch": 18.205813564984965, + "grad_norm": 0.00033495426760055125, + "learning_rate": 1.215910366764203e-06, + "loss": 0.0069, + "num_input_tokens_seen": 198896896, + "step": 163470 + }, + { + "epoch": 18.20637041986858, + "grad_norm": 1.1662143468856812, + "learning_rate": 1.2151619489702255e-06, + "loss": 0.0345, + "num_input_tokens_seen": 198903200, + "step": 163475 + }, + { + "epoch": 18.2069272747522, + "grad_norm": 0.2612380385398865, + "learning_rate": 1.2144137558431023e-06, + "loss": 0.0098, + "num_input_tokens_seen": 198908992, + "step": 163480 + }, + { + "epoch": 18.207484129635816, + "grad_norm": 0.02385745383799076, + "learning_rate": 1.213665787389895e-06, + "loss": 0.056, + "num_input_tokens_seen": 198914880, + "step": 163485 + }, + { + "epoch": 18.208040984519435, + "grad_norm": 0.0006027176859788597, + "learning_rate": 1.212918043617675e-06, + "loss": 0.1369, + "num_input_tokens_seen": 198921024, + "step": 163490 + }, + { + "epoch": 18.20859783940305, + "grad_norm": 0.005170244257897139, + "learning_rate": 1.2121705245335042e-06, + "loss": 0.004, + "num_input_tokens_seen": 198926848, + "step": 163495 + }, + { + "epoch": 18.209154694286667, + "grad_norm": 0.01622394658625126, + "learning_rate": 1.21142323014444e-06, + "loss": 0.0872, + "num_input_tokens_seen": 198933120, + "step": 163500 + }, + { + "epoch": 18.209711549170287, + "grad_norm": 2.569695472717285, + "learning_rate": 1.2106761604575384e-06, + "loss": 0.0417, + "num_input_tokens_seen": 198939360, + "step": 163505 + }, + { + "epoch": 18.210268404053902, + "grad_norm": 0.0008279667817987502, + "learning_rate": 1.209929315479863e-06, + "loss": 0.0741, + "num_input_tokens_seen": 198945056, + "step": 163510 + }, + { + "epoch": 18.210825258937522, + "grad_norm": 2.090581178665161, + "learning_rate": 1.2091826952184665e-06, + "loss": 0.2265, + "num_input_tokens_seen": 198950880, + "step": 163515 + }, + { + "epoch": 18.211382113821138, + "grad_norm": 0.0040022204630076885, + "learning_rate": 1.208436299680399e-06, + "loss": 0.0184, + "num_input_tokens_seen": 198956832, + "step": 163520 + }, + { + "epoch": 18.211938968704757, + "grad_norm": 0.1199425458908081, + "learning_rate": 1.20769012887271e-06, + "loss": 0.0134, + "num_input_tokens_seen": 198962656, + "step": 163525 + }, + { + "epoch": 18.212495823588373, + "grad_norm": 0.0008703714120201766, + "learning_rate": 1.2069441828024526e-06, + "loss": 0.0235, + "num_input_tokens_seen": 198968672, + "step": 163530 + }, + { + "epoch": 18.21305267847199, + "grad_norm": 0.0003315513313282281, + "learning_rate": 1.206198461476668e-06, + "loss": 0.0221, + "num_input_tokens_seen": 198974848, + "step": 163535 + }, + { + "epoch": 18.21360953335561, + "grad_norm": 0.026456652209162712, + "learning_rate": 1.2054529649024094e-06, + "loss": 0.0189, + "num_input_tokens_seen": 198980768, + "step": 163540 + }, + { + "epoch": 18.214166388239224, + "grad_norm": 0.263801246881485, + "learning_rate": 1.2047076930867014e-06, + "loss": 0.0053, + "num_input_tokens_seen": 198986848, + "step": 163545 + }, + { + "epoch": 18.214723243122844, + "grad_norm": 0.007687598932534456, + "learning_rate": 1.2039626460365993e-06, + "loss": 0.0045, + "num_input_tokens_seen": 198992832, + "step": 163550 + }, + { + "epoch": 18.21528009800646, + "grad_norm": 0.013165388256311417, + "learning_rate": 1.2032178237591312e-06, + "loss": 0.0832, + "num_input_tokens_seen": 198998848, + "step": 163555 + }, + { + "epoch": 18.215836952890076, + "grad_norm": 0.1720888465642929, + "learning_rate": 1.2024732262613414e-06, + "loss": 0.0366, + "num_input_tokens_seen": 199004992, + "step": 163560 + }, + { + "epoch": 18.216393807773695, + "grad_norm": 0.12018278986215591, + "learning_rate": 1.2017288535502574e-06, + "loss": 0.018, + "num_input_tokens_seen": 199011168, + "step": 163565 + }, + { + "epoch": 18.21695066265731, + "grad_norm": 0.0022159868385642767, + "learning_rate": 1.20098470563291e-06, + "loss": 0.0331, + "num_input_tokens_seen": 199016640, + "step": 163570 + }, + { + "epoch": 18.21750751754093, + "grad_norm": 0.18098658323287964, + "learning_rate": 1.2002407825163264e-06, + "loss": 0.0466, + "num_input_tokens_seen": 199022944, + "step": 163575 + }, + { + "epoch": 18.218064372424546, + "grad_norm": 0.042153555899858475, + "learning_rate": 1.1994970842075404e-06, + "loss": 0.0086, + "num_input_tokens_seen": 199028576, + "step": 163580 + }, + { + "epoch": 18.218621227308162, + "grad_norm": 0.8143454790115356, + "learning_rate": 1.198753610713574e-06, + "loss": 0.0274, + "num_input_tokens_seen": 199034528, + "step": 163585 + }, + { + "epoch": 18.21917808219178, + "grad_norm": 0.14552414417266846, + "learning_rate": 1.198010362041449e-06, + "loss": 0.0242, + "num_input_tokens_seen": 199040864, + "step": 163590 + }, + { + "epoch": 18.219734937075398, + "grad_norm": 0.05298464745283127, + "learning_rate": 1.1972673381981797e-06, + "loss": 0.0104, + "num_input_tokens_seen": 199046752, + "step": 163595 + }, + { + "epoch": 18.220291791959017, + "grad_norm": 0.6235712766647339, + "learning_rate": 1.1965245391907964e-06, + "loss": 0.0424, + "num_input_tokens_seen": 199052512, + "step": 163600 + }, + { + "epoch": 18.220848646842633, + "grad_norm": 0.001250468660145998, + "learning_rate": 1.1957819650263075e-06, + "loss": 0.0065, + "num_input_tokens_seen": 199058528, + "step": 163605 + }, + { + "epoch": 18.22140550172625, + "grad_norm": 0.46607860922813416, + "learning_rate": 1.1950396157117322e-06, + "loss": 0.0145, + "num_input_tokens_seen": 199064768, + "step": 163610 + }, + { + "epoch": 18.22196235660987, + "grad_norm": 0.6172298192977905, + "learning_rate": 1.1942974912540788e-06, + "loss": 0.0376, + "num_input_tokens_seen": 199069920, + "step": 163615 + }, + { + "epoch": 18.222519211493484, + "grad_norm": 0.6054064035415649, + "learning_rate": 1.1935555916603586e-06, + "loss": 0.0484, + "num_input_tokens_seen": 199076000, + "step": 163620 + }, + { + "epoch": 18.223076066377104, + "grad_norm": 0.0003090626560151577, + "learning_rate": 1.1928139169375769e-06, + "loss": 0.0569, + "num_input_tokens_seen": 199081760, + "step": 163625 + }, + { + "epoch": 18.22363292126072, + "grad_norm": 0.021225420758128166, + "learning_rate": 1.1920724670927448e-06, + "loss": 0.1762, + "num_input_tokens_seen": 199087264, + "step": 163630 + }, + { + "epoch": 18.224189776144335, + "grad_norm": 0.7771808505058289, + "learning_rate": 1.1913312421328622e-06, + "loss": 0.0556, + "num_input_tokens_seen": 199093536, + "step": 163635 + }, + { + "epoch": 18.224746631027955, + "grad_norm": 0.40693140029907227, + "learning_rate": 1.1905902420649317e-06, + "loss": 0.0663, + "num_input_tokens_seen": 199099360, + "step": 163640 + }, + { + "epoch": 18.22530348591157, + "grad_norm": 3.4142513275146484, + "learning_rate": 1.1898494668959481e-06, + "loss": 0.069, + "num_input_tokens_seen": 199105440, + "step": 163645 + }, + { + "epoch": 18.22586034079519, + "grad_norm": 0.0024029945489019156, + "learning_rate": 1.1891089166329167e-06, + "loss": 0.0472, + "num_input_tokens_seen": 199111040, + "step": 163650 + }, + { + "epoch": 18.226417195678806, + "grad_norm": 1.4518009424209595, + "learning_rate": 1.1883685912828262e-06, + "loss": 0.0451, + "num_input_tokens_seen": 199117280, + "step": 163655 + }, + { + "epoch": 18.226974050562422, + "grad_norm": 1.0108768939971924, + "learning_rate": 1.1876284908526768e-06, + "loss": 0.089, + "num_input_tokens_seen": 199123296, + "step": 163660 + }, + { + "epoch": 18.22753090544604, + "grad_norm": 0.2781830132007599, + "learning_rate": 1.186888615349449e-06, + "loss": 0.005, + "num_input_tokens_seen": 199129568, + "step": 163665 + }, + { + "epoch": 18.228087760329657, + "grad_norm": 2.1536312103271484, + "learning_rate": 1.18614896478014e-06, + "loss": 0.2578, + "num_input_tokens_seen": 199135360, + "step": 163670 + }, + { + "epoch": 18.228644615213277, + "grad_norm": 0.27572739124298096, + "learning_rate": 1.18540953915173e-06, + "loss": 0.0526, + "num_input_tokens_seen": 199141440, + "step": 163675 + }, + { + "epoch": 18.229201470096893, + "grad_norm": 0.008200558833777905, + "learning_rate": 1.184670338471211e-06, + "loss": 0.045, + "num_input_tokens_seen": 199147968, + "step": 163680 + }, + { + "epoch": 18.22975832498051, + "grad_norm": 0.005545841064304113, + "learning_rate": 1.1839313627455578e-06, + "loss": 0.0596, + "num_input_tokens_seen": 199154400, + "step": 163685 + }, + { + "epoch": 18.230315179864128, + "grad_norm": 0.032708581537008286, + "learning_rate": 1.1831926119817567e-06, + "loss": 0.0175, + "num_input_tokens_seen": 199160320, + "step": 163690 + }, + { + "epoch": 18.230872034747744, + "grad_norm": 0.5965166091918945, + "learning_rate": 1.1824540861867794e-06, + "loss": 0.0854, + "num_input_tokens_seen": 199166400, + "step": 163695 + }, + { + "epoch": 18.231428889631363, + "grad_norm": 0.02348192408680916, + "learning_rate": 1.1817157853676098e-06, + "loss": 0.0066, + "num_input_tokens_seen": 199172704, + "step": 163700 + }, + { + "epoch": 18.23198574451498, + "grad_norm": 0.0033028663601726294, + "learning_rate": 1.1809777095312169e-06, + "loss": 0.1735, + "num_input_tokens_seen": 199178432, + "step": 163705 + }, + { + "epoch": 18.232542599398595, + "grad_norm": 1.1666796207427979, + "learning_rate": 1.1802398586845732e-06, + "loss": 0.0122, + "num_input_tokens_seen": 199184704, + "step": 163710 + }, + { + "epoch": 18.233099454282215, + "grad_norm": 0.7150675654411316, + "learning_rate": 1.1795022328346478e-06, + "loss": 0.019, + "num_input_tokens_seen": 199191008, + "step": 163715 + }, + { + "epoch": 18.23365630916583, + "grad_norm": 0.36863744258880615, + "learning_rate": 1.178764831988405e-06, + "loss": 0.0576, + "num_input_tokens_seen": 199196480, + "step": 163720 + }, + { + "epoch": 18.23421316404945, + "grad_norm": 0.05556374788284302, + "learning_rate": 1.1780276561528163e-06, + "loss": 0.0311, + "num_input_tokens_seen": 199202400, + "step": 163725 + }, + { + "epoch": 18.234770018933066, + "grad_norm": 0.5024293065071106, + "learning_rate": 1.1772907053348436e-06, + "loss": 0.1014, + "num_input_tokens_seen": 199208736, + "step": 163730 + }, + { + "epoch": 18.235326873816682, + "grad_norm": 0.10388154536485672, + "learning_rate": 1.1765539795414448e-06, + "loss": 0.0139, + "num_input_tokens_seen": 199214592, + "step": 163735 + }, + { + "epoch": 18.2358837287003, + "grad_norm": 0.09898555278778076, + "learning_rate": 1.1758174787795783e-06, + "loss": 0.0127, + "num_input_tokens_seen": 199220064, + "step": 163740 + }, + { + "epoch": 18.236440583583917, + "grad_norm": 0.056208327412605286, + "learning_rate": 1.1750812030562081e-06, + "loss": 0.0624, + "num_input_tokens_seen": 199225760, + "step": 163745 + }, + { + "epoch": 18.236997438467537, + "grad_norm": 1.696359395980835, + "learning_rate": 1.1743451523782784e-06, + "loss": 0.0327, + "num_input_tokens_seen": 199232096, + "step": 163750 + }, + { + "epoch": 18.237554293351153, + "grad_norm": 0.0001577952498337254, + "learning_rate": 1.1736093267527531e-06, + "loss": 0.0012, + "num_input_tokens_seen": 199238496, + "step": 163755 + }, + { + "epoch": 18.23811114823477, + "grad_norm": 0.23745959997177124, + "learning_rate": 1.1728737261865768e-06, + "loss": 0.0724, + "num_input_tokens_seen": 199244704, + "step": 163760 + }, + { + "epoch": 18.238668003118388, + "grad_norm": 0.022205747663974762, + "learning_rate": 1.1721383506866968e-06, + "loss": 0.0173, + "num_input_tokens_seen": 199250688, + "step": 163765 + }, + { + "epoch": 18.239224858002004, + "grad_norm": 0.036700453609228134, + "learning_rate": 1.1714032002600572e-06, + "loss": 0.0016, + "num_input_tokens_seen": 199257024, + "step": 163770 + }, + { + "epoch": 18.239781712885623, + "grad_norm": 0.027443228289484978, + "learning_rate": 1.170668274913611e-06, + "loss": 0.0056, + "num_input_tokens_seen": 199263200, + "step": 163775 + }, + { + "epoch": 18.24033856776924, + "grad_norm": 0.021377110853791237, + "learning_rate": 1.1699335746542917e-06, + "loss": 0.0167, + "num_input_tokens_seen": 199269664, + "step": 163780 + }, + { + "epoch": 18.240895422652855, + "grad_norm": 0.002628515474498272, + "learning_rate": 1.1691990994890433e-06, + "loss": 0.0028, + "num_input_tokens_seen": 199276096, + "step": 163785 + }, + { + "epoch": 18.241452277536474, + "grad_norm": 0.05531160160899162, + "learning_rate": 1.1684648494247997e-06, + "loss": 0.0082, + "num_input_tokens_seen": 199281984, + "step": 163790 + }, + { + "epoch": 18.24200913242009, + "grad_norm": 1.4627127647399902, + "learning_rate": 1.167730824468502e-06, + "loss": 0.142, + "num_input_tokens_seen": 199287776, + "step": 163795 + }, + { + "epoch": 18.24256598730371, + "grad_norm": 0.006113426294177771, + "learning_rate": 1.1669970246270784e-06, + "loss": 0.0656, + "num_input_tokens_seen": 199293856, + "step": 163800 + }, + { + "epoch": 18.243122842187326, + "grad_norm": 0.16467201709747314, + "learning_rate": 1.1662634499074675e-06, + "loss": 0.0127, + "num_input_tokens_seen": 199300256, + "step": 163805 + }, + { + "epoch": 18.24367969707094, + "grad_norm": 0.0006791016203351319, + "learning_rate": 1.1655301003165892e-06, + "loss": 0.0254, + "num_input_tokens_seen": 199306592, + "step": 163810 + }, + { + "epoch": 18.24423655195456, + "grad_norm": 0.016985446214675903, + "learning_rate": 1.1647969758613764e-06, + "loss": 0.0088, + "num_input_tokens_seen": 199312608, + "step": 163815 + }, + { + "epoch": 18.244793406838177, + "grad_norm": 0.22333493828773499, + "learning_rate": 1.1640640765487487e-06, + "loss": 0.003, + "num_input_tokens_seen": 199318656, + "step": 163820 + }, + { + "epoch": 18.245350261721796, + "grad_norm": 0.021129323169589043, + "learning_rate": 1.1633314023856367e-06, + "loss": 0.0067, + "num_input_tokens_seen": 199324672, + "step": 163825 + }, + { + "epoch": 18.245907116605412, + "grad_norm": 0.007050797808915377, + "learning_rate": 1.16259895337896e-06, + "loss": 0.0445, + "num_input_tokens_seen": 199330592, + "step": 163830 + }, + { + "epoch": 18.246463971489028, + "grad_norm": 0.030856497585773468, + "learning_rate": 1.1618667295356295e-06, + "loss": 0.0548, + "num_input_tokens_seen": 199336192, + "step": 163835 + }, + { + "epoch": 18.247020826372648, + "grad_norm": 0.0013470710255205631, + "learning_rate": 1.1611347308625675e-06, + "loss": 0.0181, + "num_input_tokens_seen": 199342112, + "step": 163840 + }, + { + "epoch": 18.247577681256264, + "grad_norm": 0.021261749789118767, + "learning_rate": 1.160402957366688e-06, + "loss": 0.0721, + "num_input_tokens_seen": 199348416, + "step": 163845 + }, + { + "epoch": 18.248134536139883, + "grad_norm": 0.0031681139953434467, + "learning_rate": 1.159671409054905e-06, + "loss": 0.0461, + "num_input_tokens_seen": 199354528, + "step": 163850 + }, + { + "epoch": 18.2486913910235, + "grad_norm": 0.7185750603675842, + "learning_rate": 1.1589400859341237e-06, + "loss": 0.0282, + "num_input_tokens_seen": 199361088, + "step": 163855 + }, + { + "epoch": 18.24924824590712, + "grad_norm": 0.6401146054267883, + "learning_rate": 1.1582089880112528e-06, + "loss": 0.017, + "num_input_tokens_seen": 199367616, + "step": 163860 + }, + { + "epoch": 18.249805100790734, + "grad_norm": 0.006390445865690708, + "learning_rate": 1.1574781152932007e-06, + "loss": 0.0413, + "num_input_tokens_seen": 199373440, + "step": 163865 + }, + { + "epoch": 18.25036195567435, + "grad_norm": 0.22876425087451935, + "learning_rate": 1.15674746778687e-06, + "loss": 0.0825, + "num_input_tokens_seen": 199379392, + "step": 163870 + }, + { + "epoch": 18.25091881055797, + "grad_norm": 0.36224326491355896, + "learning_rate": 1.1560170454991664e-06, + "loss": 0.0064, + "num_input_tokens_seen": 199384736, + "step": 163875 + }, + { + "epoch": 18.251475665441586, + "grad_norm": 0.01276963297277689, + "learning_rate": 1.155286848436979e-06, + "loss": 0.0162, + "num_input_tokens_seen": 199390944, + "step": 163880 + }, + { + "epoch": 18.252032520325205, + "grad_norm": 8.399475336773321e-05, + "learning_rate": 1.1545568766072157e-06, + "loss": 0.0362, + "num_input_tokens_seen": 199397120, + "step": 163885 + }, + { + "epoch": 18.25258937520882, + "grad_norm": 0.12924489378929138, + "learning_rate": 1.153827130016763e-06, + "loss": 0.0026, + "num_input_tokens_seen": 199403104, + "step": 163890 + }, + { + "epoch": 18.253146230092437, + "grad_norm": 0.10477586090564728, + "learning_rate": 1.1530976086725236e-06, + "loss": 0.1932, + "num_input_tokens_seen": 199409312, + "step": 163895 + }, + { + "epoch": 18.253703084976056, + "grad_norm": 0.06577005237340927, + "learning_rate": 1.1523683125813812e-06, + "loss": 0.0033, + "num_input_tokens_seen": 199415392, + "step": 163900 + }, + { + "epoch": 18.254259939859672, + "grad_norm": 0.12877507507801056, + "learning_rate": 1.1516392417502269e-06, + "loss": 0.016, + "num_input_tokens_seen": 199421568, + "step": 163905 + }, + { + "epoch": 18.25481679474329, + "grad_norm": 0.332050085067749, + "learning_rate": 1.1509103961859446e-06, + "loss": 0.0087, + "num_input_tokens_seen": 199427712, + "step": 163910 + }, + { + "epoch": 18.255373649626907, + "grad_norm": 0.2553102374076843, + "learning_rate": 1.1501817758954232e-06, + "loss": 0.004, + "num_input_tokens_seen": 199433792, + "step": 163915 + }, + { + "epoch": 18.255930504510523, + "grad_norm": 1.7094074487686157, + "learning_rate": 1.149453380885543e-06, + "loss": 0.022, + "num_input_tokens_seen": 199439968, + "step": 163920 + }, + { + "epoch": 18.256487359394143, + "grad_norm": 0.8658322691917419, + "learning_rate": 1.1487252111631847e-06, + "loss": 0.1426, + "num_input_tokens_seen": 199446400, + "step": 163925 + }, + { + "epoch": 18.25704421427776, + "grad_norm": 0.04661022499203682, + "learning_rate": 1.1479972667352234e-06, + "loss": 0.1481, + "num_input_tokens_seen": 199452384, + "step": 163930 + }, + { + "epoch": 18.257601069161378, + "grad_norm": 0.09612280130386353, + "learning_rate": 1.1472695476085427e-06, + "loss": 0.0464, + "num_input_tokens_seen": 199458624, + "step": 163935 + }, + { + "epoch": 18.258157924044994, + "grad_norm": 0.020985309034585953, + "learning_rate": 1.1465420537900062e-06, + "loss": 0.002, + "num_input_tokens_seen": 199465120, + "step": 163940 + }, + { + "epoch": 18.25871477892861, + "grad_norm": 0.00027039815904572606, + "learning_rate": 1.1458147852864975e-06, + "loss": 0.0109, + "num_input_tokens_seen": 199470784, + "step": 163945 + }, + { + "epoch": 18.25927163381223, + "grad_norm": 0.0819920152425766, + "learning_rate": 1.1450877421048723e-06, + "loss": 0.0388, + "num_input_tokens_seen": 199477056, + "step": 163950 + }, + { + "epoch": 18.259828488695845, + "grad_norm": 0.18802949786186218, + "learning_rate": 1.1443609242520109e-06, + "loss": 0.0869, + "num_input_tokens_seen": 199483072, + "step": 163955 + }, + { + "epoch": 18.260385343579465, + "grad_norm": 0.06684543192386627, + "learning_rate": 1.1436343317347692e-06, + "loss": 0.1008, + "num_input_tokens_seen": 199489216, + "step": 163960 + }, + { + "epoch": 18.26094219846308, + "grad_norm": 0.09705670177936554, + "learning_rate": 1.1429079645600167e-06, + "loss": 0.0353, + "num_input_tokens_seen": 199495360, + "step": 163965 + }, + { + "epoch": 18.261499053346697, + "grad_norm": 0.0011390295112505555, + "learning_rate": 1.1421818227346143e-06, + "loss": 0.0452, + "num_input_tokens_seen": 199501408, + "step": 163970 + }, + { + "epoch": 18.262055908230316, + "grad_norm": 0.08433670550584793, + "learning_rate": 1.1414559062654207e-06, + "loss": 0.0022, + "num_input_tokens_seen": 199507616, + "step": 163975 + }, + { + "epoch": 18.262612763113932, + "grad_norm": 0.5277582406997681, + "learning_rate": 1.1407302151592858e-06, + "loss": 0.0619, + "num_input_tokens_seen": 199513696, + "step": 163980 + }, + { + "epoch": 18.26316961799755, + "grad_norm": 0.612495481967926, + "learning_rate": 1.140004749423071e-06, + "loss": 0.0169, + "num_input_tokens_seen": 199519968, + "step": 163985 + }, + { + "epoch": 18.263726472881167, + "grad_norm": 0.3172290325164795, + "learning_rate": 1.1392795090636316e-06, + "loss": 0.0085, + "num_input_tokens_seen": 199525888, + "step": 163990 + }, + { + "epoch": 18.264283327764783, + "grad_norm": 0.25212809443473816, + "learning_rate": 1.1385544940878124e-06, + "loss": 0.0099, + "num_input_tokens_seen": 199531744, + "step": 163995 + }, + { + "epoch": 18.264840182648403, + "grad_norm": 0.2821693420410156, + "learning_rate": 1.1378297045024605e-06, + "loss": 0.048, + "num_input_tokens_seen": 199538208, + "step": 164000 + }, + { + "epoch": 18.26539703753202, + "grad_norm": 0.027317097410559654, + "learning_rate": 1.1371051403144261e-06, + "loss": 0.0024, + "num_input_tokens_seen": 199544480, + "step": 164005 + }, + { + "epoch": 18.265953892415638, + "grad_norm": 1.616485595703125, + "learning_rate": 1.1363808015305511e-06, + "loss": 0.0272, + "num_input_tokens_seen": 199550880, + "step": 164010 + }, + { + "epoch": 18.266510747299254, + "grad_norm": 0.033560656011104584, + "learning_rate": 1.1356566881576824e-06, + "loss": 0.061, + "num_input_tokens_seen": 199557120, + "step": 164015 + }, + { + "epoch": 18.26706760218287, + "grad_norm": 0.0005352164153009653, + "learning_rate": 1.1349328002026566e-06, + "loss": 0.0265, + "num_input_tokens_seen": 199563232, + "step": 164020 + }, + { + "epoch": 18.26762445706649, + "grad_norm": 0.16764533519744873, + "learning_rate": 1.1342091376723096e-06, + "loss": 0.0622, + "num_input_tokens_seen": 199569408, + "step": 164025 + }, + { + "epoch": 18.268181311950105, + "grad_norm": 0.0037776483222842216, + "learning_rate": 1.133485700573475e-06, + "loss": 0.0108, + "num_input_tokens_seen": 199575136, + "step": 164030 + }, + { + "epoch": 18.268738166833725, + "grad_norm": 1.4950979948043823, + "learning_rate": 1.1327624889129917e-06, + "loss": 0.0741, + "num_input_tokens_seen": 199581344, + "step": 164035 + }, + { + "epoch": 18.26929502171734, + "grad_norm": 0.018724767491221428, + "learning_rate": 1.1320395026976905e-06, + "loss": 0.024, + "num_input_tokens_seen": 199587200, + "step": 164040 + }, + { + "epoch": 18.269851876600956, + "grad_norm": 0.037795886397361755, + "learning_rate": 1.1313167419343963e-06, + "loss": 0.0501, + "num_input_tokens_seen": 199593152, + "step": 164045 + }, + { + "epoch": 18.270408731484576, + "grad_norm": 0.005892637185752392, + "learning_rate": 1.1305942066299396e-06, + "loss": 0.0027, + "num_input_tokens_seen": 199599744, + "step": 164050 + }, + { + "epoch": 18.27096558636819, + "grad_norm": 0.0019019866595044732, + "learning_rate": 1.1298718967911458e-06, + "loss": 0.0061, + "num_input_tokens_seen": 199605664, + "step": 164055 + }, + { + "epoch": 18.27152244125181, + "grad_norm": 0.0023735768627375364, + "learning_rate": 1.1291498124248317e-06, + "loss": 0.0014, + "num_input_tokens_seen": 199612000, + "step": 164060 + }, + { + "epoch": 18.272079296135427, + "grad_norm": 0.5844466686248779, + "learning_rate": 1.1284279535378305e-06, + "loss": 0.0885, + "num_input_tokens_seen": 199618176, + "step": 164065 + }, + { + "epoch": 18.272636151019043, + "grad_norm": 0.7010443806648254, + "learning_rate": 1.1277063201369454e-06, + "loss": 0.0232, + "num_input_tokens_seen": 199623872, + "step": 164070 + }, + { + "epoch": 18.273193005902662, + "grad_norm": 0.0003483795444481075, + "learning_rate": 1.126984912229004e-06, + "loss": 0.0103, + "num_input_tokens_seen": 199629952, + "step": 164075 + }, + { + "epoch": 18.27374986078628, + "grad_norm": 0.0007279947167262435, + "learning_rate": 1.1262637298208145e-06, + "loss": 0.0679, + "num_input_tokens_seen": 199636320, + "step": 164080 + }, + { + "epoch": 18.274306715669898, + "grad_norm": 1.8614088296890259, + "learning_rate": 1.1255427729191942e-06, + "loss": 0.0529, + "num_input_tokens_seen": 199642656, + "step": 164085 + }, + { + "epoch": 18.274863570553514, + "grad_norm": 0.01238261442631483, + "learning_rate": 1.1248220415309512e-06, + "loss": 0.1175, + "num_input_tokens_seen": 199648576, + "step": 164090 + }, + { + "epoch": 18.27542042543713, + "grad_norm": 3.184844970703125, + "learning_rate": 1.1241015356628915e-06, + "loss": 0.0581, + "num_input_tokens_seen": 199654080, + "step": 164095 + }, + { + "epoch": 18.27597728032075, + "grad_norm": 0.27976033091545105, + "learning_rate": 1.1233812553218177e-06, + "loss": 0.042, + "num_input_tokens_seen": 199659488, + "step": 164100 + }, + { + "epoch": 18.276534135204365, + "grad_norm": 0.40656062960624695, + "learning_rate": 1.1226612005145409e-06, + "loss": 0.0766, + "num_input_tokens_seen": 199665472, + "step": 164105 + }, + { + "epoch": 18.277090990087984, + "grad_norm": 0.2999727427959442, + "learning_rate": 1.1219413712478616e-06, + "loss": 0.014, + "num_input_tokens_seen": 199671392, + "step": 164110 + }, + { + "epoch": 18.2776478449716, + "grad_norm": 0.004881339147686958, + "learning_rate": 1.121221767528574e-06, + "loss": 0.0342, + "num_input_tokens_seen": 199677856, + "step": 164115 + }, + { + "epoch": 18.278204699855216, + "grad_norm": 0.017883146181702614, + "learning_rate": 1.1205023893634758e-06, + "loss": 0.0074, + "num_input_tokens_seen": 199684064, + "step": 164120 + }, + { + "epoch": 18.278761554738836, + "grad_norm": 0.0011283293133601546, + "learning_rate": 1.1197832367593697e-06, + "loss": 0.0228, + "num_input_tokens_seen": 199690336, + "step": 164125 + }, + { + "epoch": 18.27931840962245, + "grad_norm": 1.1544549465179443, + "learning_rate": 1.119064309723042e-06, + "loss": 0.0871, + "num_input_tokens_seen": 199696288, + "step": 164130 + }, + { + "epoch": 18.27987526450607, + "grad_norm": 0.17843106389045715, + "learning_rate": 1.1183456082612843e-06, + "loss": 0.0111, + "num_input_tokens_seen": 199702432, + "step": 164135 + }, + { + "epoch": 18.280432119389687, + "grad_norm": 0.8397817611694336, + "learning_rate": 1.1176271323808856e-06, + "loss": 0.0102, + "num_input_tokens_seen": 199708864, + "step": 164140 + }, + { + "epoch": 18.280988974273303, + "grad_norm": 0.009718083776533604, + "learning_rate": 1.1169088820886298e-06, + "loss": 0.0016, + "num_input_tokens_seen": 199715104, + "step": 164145 + }, + { + "epoch": 18.281545829156922, + "grad_norm": 0.8909704089164734, + "learning_rate": 1.116190857391311e-06, + "loss": 0.0184, + "num_input_tokens_seen": 199721504, + "step": 164150 + }, + { + "epoch": 18.282102684040538, + "grad_norm": 1.3657491207122803, + "learning_rate": 1.115473058295699e-06, + "loss": 0.1322, + "num_input_tokens_seen": 199727776, + "step": 164155 + }, + { + "epoch": 18.282659538924158, + "grad_norm": 1.817272663116455, + "learning_rate": 1.114755484808583e-06, + "loss": 0.0922, + "num_input_tokens_seen": 199733888, + "step": 164160 + }, + { + "epoch": 18.283216393807773, + "grad_norm": 1.388956904411316, + "learning_rate": 1.1140381369367374e-06, + "loss": 0.0351, + "num_input_tokens_seen": 199740224, + "step": 164165 + }, + { + "epoch": 18.28377324869139, + "grad_norm": 0.007127529010176659, + "learning_rate": 1.1133210146869382e-06, + "loss": 0.0652, + "num_input_tokens_seen": 199746400, + "step": 164170 + }, + { + "epoch": 18.28433010357501, + "grad_norm": 0.09443720430135727, + "learning_rate": 1.1126041180659602e-06, + "loss": 0.033, + "num_input_tokens_seen": 199752608, + "step": 164175 + }, + { + "epoch": 18.284886958458625, + "grad_norm": 0.0015313726617023349, + "learning_rate": 1.1118874470805757e-06, + "loss": 0.0034, + "num_input_tokens_seen": 199758880, + "step": 164180 + }, + { + "epoch": 18.285443813342244, + "grad_norm": 0.0017161545110866427, + "learning_rate": 1.1111710017375516e-06, + "loss": 0.0043, + "num_input_tokens_seen": 199765184, + "step": 164185 + }, + { + "epoch": 18.28600066822586, + "grad_norm": 2.650259494781494, + "learning_rate": 1.1104547820436572e-06, + "loss": 0.1064, + "num_input_tokens_seen": 199771168, + "step": 164190 + }, + { + "epoch": 18.28655752310948, + "grad_norm": 0.26362305879592896, + "learning_rate": 1.109738788005657e-06, + "loss": 0.0117, + "num_input_tokens_seen": 199777408, + "step": 164195 + }, + { + "epoch": 18.287114377993095, + "grad_norm": 0.0008446648134849966, + "learning_rate": 1.1090230196303148e-06, + "loss": 0.0803, + "num_input_tokens_seen": 199783520, + "step": 164200 + }, + { + "epoch": 18.28767123287671, + "grad_norm": 0.1574123352766037, + "learning_rate": 1.108307476924389e-06, + "loss": 0.0027, + "num_input_tokens_seen": 199789504, + "step": 164205 + }, + { + "epoch": 18.28822808776033, + "grad_norm": 0.22083251178264618, + "learning_rate": 1.1075921598946464e-06, + "loss": 0.0078, + "num_input_tokens_seen": 199795904, + "step": 164210 + }, + { + "epoch": 18.288784942643947, + "grad_norm": 0.25201159715652466, + "learning_rate": 1.1068770685478319e-06, + "loss": 0.0306, + "num_input_tokens_seen": 199801408, + "step": 164215 + }, + { + "epoch": 18.289341797527566, + "grad_norm": 0.12123408913612366, + "learning_rate": 1.106162202890712e-06, + "loss": 0.0646, + "num_input_tokens_seen": 199807616, + "step": 164220 + }, + { + "epoch": 18.289898652411182, + "grad_norm": 0.08772322535514832, + "learning_rate": 1.1054475629300286e-06, + "loss": 0.1176, + "num_input_tokens_seen": 199813504, + "step": 164225 + }, + { + "epoch": 18.290455507294798, + "grad_norm": 0.002129890024662018, + "learning_rate": 1.1047331486725405e-06, + "loss": 0.0019, + "num_input_tokens_seen": 199819456, + "step": 164230 + }, + { + "epoch": 18.291012362178417, + "grad_norm": 0.03267032280564308, + "learning_rate": 1.1040189601249917e-06, + "loss": 0.065, + "num_input_tokens_seen": 199825920, + "step": 164235 + }, + { + "epoch": 18.291569217062033, + "grad_norm": 2.0538887977600098, + "learning_rate": 1.1033049972941272e-06, + "loss": 0.0566, + "num_input_tokens_seen": 199831552, + "step": 164240 + }, + { + "epoch": 18.292126071945653, + "grad_norm": 0.04276460036635399, + "learning_rate": 1.1025912601866917e-06, + "loss": 0.0079, + "num_input_tokens_seen": 199837760, + "step": 164245 + }, + { + "epoch": 18.29268292682927, + "grad_norm": 0.8845638632774353, + "learning_rate": 1.1018777488094323e-06, + "loss": 0.0248, + "num_input_tokens_seen": 199844032, + "step": 164250 + }, + { + "epoch": 18.293239781712884, + "grad_norm": 0.0031262943521142006, + "learning_rate": 1.1011644631690827e-06, + "loss": 0.0061, + "num_input_tokens_seen": 199850144, + "step": 164255 + }, + { + "epoch": 18.293796636596504, + "grad_norm": 2.605699062347412, + "learning_rate": 1.1004514032723818e-06, + "loss": 0.1375, + "num_input_tokens_seen": 199855840, + "step": 164260 + }, + { + "epoch": 18.29435349148012, + "grad_norm": 0.8442564010620117, + "learning_rate": 1.0997385691260631e-06, + "loss": 0.0232, + "num_input_tokens_seen": 199862048, + "step": 164265 + }, + { + "epoch": 18.29491034636374, + "grad_norm": 1.1193819046020508, + "learning_rate": 1.0990259607368659e-06, + "loss": 0.0824, + "num_input_tokens_seen": 199868480, + "step": 164270 + }, + { + "epoch": 18.295467201247355, + "grad_norm": 0.07789935916662216, + "learning_rate": 1.0983135781115151e-06, + "loss": 0.0348, + "num_input_tokens_seen": 199874240, + "step": 164275 + }, + { + "epoch": 18.29602405613097, + "grad_norm": 0.000892659998498857, + "learning_rate": 1.097601421256747e-06, + "loss": 0.0295, + "num_input_tokens_seen": 199880480, + "step": 164280 + }, + { + "epoch": 18.29658091101459, + "grad_norm": 0.256207674741745, + "learning_rate": 1.0968894901792758e-06, + "loss": 0.0466, + "num_input_tokens_seen": 199886304, + "step": 164285 + }, + { + "epoch": 18.297137765898206, + "grad_norm": 0.784691572189331, + "learning_rate": 1.0961777848858407e-06, + "loss": 0.0195, + "num_input_tokens_seen": 199892608, + "step": 164290 + }, + { + "epoch": 18.297694620781826, + "grad_norm": 0.018589483574032784, + "learning_rate": 1.0954663053831526e-06, + "loss": 0.0902, + "num_input_tokens_seen": 199898720, + "step": 164295 + }, + { + "epoch": 18.29825147566544, + "grad_norm": 3.853691339492798, + "learning_rate": 1.0947550516779425e-06, + "loss": 0.1102, + "num_input_tokens_seen": 199905024, + "step": 164300 + }, + { + "epoch": 18.298808330549058, + "grad_norm": 0.6289278864860535, + "learning_rate": 1.0940440237769217e-06, + "loss": 0.0888, + "num_input_tokens_seen": 199910880, + "step": 164305 + }, + { + "epoch": 18.299365185432677, + "grad_norm": 0.5207861065864563, + "learning_rate": 1.093333221686807e-06, + "loss": 0.055, + "num_input_tokens_seen": 199917088, + "step": 164310 + }, + { + "epoch": 18.299922040316293, + "grad_norm": 0.0070322975516319275, + "learning_rate": 1.0926226454143124e-06, + "loss": 0.0258, + "num_input_tokens_seen": 199923520, + "step": 164315 + }, + { + "epoch": 18.300478895199912, + "grad_norm": 0.04348224028944969, + "learning_rate": 1.0919122949661548e-06, + "loss": 0.0112, + "num_input_tokens_seen": 199929856, + "step": 164320 + }, + { + "epoch": 18.30103575008353, + "grad_norm": 0.8795772194862366, + "learning_rate": 1.09120217034904e-06, + "loss": 0.0426, + "num_input_tokens_seen": 199936064, + "step": 164325 + }, + { + "epoch": 18.301592604967144, + "grad_norm": 0.020773977041244507, + "learning_rate": 1.0904922715696765e-06, + "loss": 0.1782, + "num_input_tokens_seen": 199942272, + "step": 164330 + }, + { + "epoch": 18.302149459850764, + "grad_norm": 0.021327761933207512, + "learning_rate": 1.0897825986347643e-06, + "loss": 0.0585, + "num_input_tokens_seen": 199948736, + "step": 164335 + }, + { + "epoch": 18.30270631473438, + "grad_norm": 0.3660065829753876, + "learning_rate": 1.0890731515510178e-06, + "loss": 0.0103, + "num_input_tokens_seen": 199954432, + "step": 164340 + }, + { + "epoch": 18.303263169618, + "grad_norm": 0.1160799041390419, + "learning_rate": 1.0883639303251286e-06, + "loss": 0.0638, + "num_input_tokens_seen": 199960608, + "step": 164345 + }, + { + "epoch": 18.303820024501615, + "grad_norm": 0.012779978103935719, + "learning_rate": 1.0876549349638055e-06, + "loss": 0.0578, + "num_input_tokens_seen": 199966720, + "step": 164350 + }, + { + "epoch": 18.30437687938523, + "grad_norm": 0.7294833064079285, + "learning_rate": 1.0869461654737318e-06, + "loss": 0.0314, + "num_input_tokens_seen": 199972768, + "step": 164355 + }, + { + "epoch": 18.30493373426885, + "grad_norm": 0.018583958968520164, + "learning_rate": 1.086237621861616e-06, + "loss": 0.1181, + "num_input_tokens_seen": 199978208, + "step": 164360 + }, + { + "epoch": 18.305490589152466, + "grad_norm": 1.632887601852417, + "learning_rate": 1.0855293041341419e-06, + "loss": 0.0809, + "num_input_tokens_seen": 199984608, + "step": 164365 + }, + { + "epoch": 18.306047444036086, + "grad_norm": 1.6854679584503174, + "learning_rate": 1.0848212122980068e-06, + "loss": 0.0167, + "num_input_tokens_seen": 199990816, + "step": 164370 + }, + { + "epoch": 18.3066042989197, + "grad_norm": 1.3216978311538696, + "learning_rate": 1.084113346359894e-06, + "loss": 0.0943, + "num_input_tokens_seen": 199996512, + "step": 164375 + }, + { + "epoch": 18.307161153803317, + "grad_norm": 0.0007157613290473819, + "learning_rate": 1.083405706326493e-06, + "loss": 0.0047, + "num_input_tokens_seen": 200002688, + "step": 164380 + }, + { + "epoch": 18.307718008686937, + "grad_norm": 0.7456879019737244, + "learning_rate": 1.0826982922044843e-06, + "loss": 0.0128, + "num_input_tokens_seen": 200008512, + "step": 164385 + }, + { + "epoch": 18.308274863570553, + "grad_norm": 0.44507020711898804, + "learning_rate": 1.0819911040005543e-06, + "loss": 0.006, + "num_input_tokens_seen": 200014656, + "step": 164390 + }, + { + "epoch": 18.308831718454172, + "grad_norm": 0.00022292106586974114, + "learning_rate": 1.081284141721381e-06, + "loss": 0.0554, + "num_input_tokens_seen": 200020672, + "step": 164395 + }, + { + "epoch": 18.309388573337788, + "grad_norm": 0.2122349590063095, + "learning_rate": 1.080577405373645e-06, + "loss": 0.085, + "num_input_tokens_seen": 200026848, + "step": 164400 + }, + { + "epoch": 18.309945428221404, + "grad_norm": 0.00016824591148179024, + "learning_rate": 1.0798708949640136e-06, + "loss": 0.1191, + "num_input_tokens_seen": 200033280, + "step": 164405 + }, + { + "epoch": 18.310502283105023, + "grad_norm": 0.295248806476593, + "learning_rate": 1.0791646104991698e-06, + "loss": 0.0122, + "num_input_tokens_seen": 200039296, + "step": 164410 + }, + { + "epoch": 18.31105913798864, + "grad_norm": 0.0006610738928429782, + "learning_rate": 1.0784585519857782e-06, + "loss": 0.003, + "num_input_tokens_seen": 200045184, + "step": 164415 + }, + { + "epoch": 18.31161599287226, + "grad_norm": 1.0994805097579956, + "learning_rate": 1.0777527194305138e-06, + "loss": 0.0553, + "num_input_tokens_seen": 200051008, + "step": 164420 + }, + { + "epoch": 18.312172847755875, + "grad_norm": 0.029508953914046288, + "learning_rate": 1.0770471128400433e-06, + "loss": 0.0094, + "num_input_tokens_seen": 200057120, + "step": 164425 + }, + { + "epoch": 18.31272970263949, + "grad_norm": 3.4893929958343506, + "learning_rate": 1.0763417322210256e-06, + "loss": 0.0745, + "num_input_tokens_seen": 200063168, + "step": 164430 + }, + { + "epoch": 18.31328655752311, + "grad_norm": 0.17183063924312592, + "learning_rate": 1.0756365775801275e-06, + "loss": 0.0204, + "num_input_tokens_seen": 200068992, + "step": 164435 + }, + { + "epoch": 18.313843412406726, + "grad_norm": 0.007954391650855541, + "learning_rate": 1.0749316489240129e-06, + "loss": 0.0144, + "num_input_tokens_seen": 200075040, + "step": 164440 + }, + { + "epoch": 18.314400267290345, + "grad_norm": 0.33928751945495605, + "learning_rate": 1.0742269462593352e-06, + "loss": 0.0072, + "num_input_tokens_seen": 200081216, + "step": 164445 + }, + { + "epoch": 18.31495712217396, + "grad_norm": 0.08368758857250214, + "learning_rate": 1.0735224695927554e-06, + "loss": 0.0019, + "num_input_tokens_seen": 200087392, + "step": 164450 + }, + { + "epoch": 18.315513977057577, + "grad_norm": 0.9485157132148743, + "learning_rate": 1.0728182189309211e-06, + "loss": 0.0602, + "num_input_tokens_seen": 200093472, + "step": 164455 + }, + { + "epoch": 18.316070831941197, + "grad_norm": 0.0005988547927699983, + "learning_rate": 1.0721141942804936e-06, + "loss": 0.01, + "num_input_tokens_seen": 200099488, + "step": 164460 + }, + { + "epoch": 18.316627686824813, + "grad_norm": 0.0009642955265007913, + "learning_rate": 1.071410395648112e-06, + "loss": 0.0103, + "num_input_tokens_seen": 200105568, + "step": 164465 + }, + { + "epoch": 18.317184541708432, + "grad_norm": 0.03718866407871246, + "learning_rate": 1.0707068230404404e-06, + "loss": 0.0429, + "num_input_tokens_seen": 200111840, + "step": 164470 + }, + { + "epoch": 18.317741396592048, + "grad_norm": 0.002561207627877593, + "learning_rate": 1.0700034764641042e-06, + "loss": 0.0103, + "num_input_tokens_seen": 200118080, + "step": 164475 + }, + { + "epoch": 18.318298251475664, + "grad_norm": 0.08275788277387619, + "learning_rate": 1.0693003559257647e-06, + "loss": 0.0316, + "num_input_tokens_seen": 200123872, + "step": 164480 + }, + { + "epoch": 18.318855106359283, + "grad_norm": 0.26273369789123535, + "learning_rate": 1.06859746143205e-06, + "loss": 0.0411, + "num_input_tokens_seen": 200129888, + "step": 164485 + }, + { + "epoch": 18.3194119612429, + "grad_norm": 0.20905360579490662, + "learning_rate": 1.06789479298961e-06, + "loss": 0.0254, + "num_input_tokens_seen": 200136032, + "step": 164490 + }, + { + "epoch": 18.31996881612652, + "grad_norm": 0.13054108619689941, + "learning_rate": 1.0671923506050785e-06, + "loss": 0.0376, + "num_input_tokens_seen": 200142112, + "step": 164495 + }, + { + "epoch": 18.320525671010135, + "grad_norm": 0.030205463990569115, + "learning_rate": 1.0664901342850891e-06, + "loss": 0.0265, + "num_input_tokens_seen": 200147488, + "step": 164500 + }, + { + "epoch": 18.32108252589375, + "grad_norm": 0.11569730192422867, + "learning_rate": 1.0657881440362755e-06, + "loss": 0.0743, + "num_input_tokens_seen": 200153728, + "step": 164505 + }, + { + "epoch": 18.32163938077737, + "grad_norm": 1.9662892818450928, + "learning_rate": 1.0650863798652683e-06, + "loss": 0.0423, + "num_input_tokens_seen": 200159776, + "step": 164510 + }, + { + "epoch": 18.322196235660986, + "grad_norm": 0.005537518300116062, + "learning_rate": 1.0643848417786984e-06, + "loss": 0.0217, + "num_input_tokens_seen": 200165920, + "step": 164515 + }, + { + "epoch": 18.322753090544605, + "grad_norm": 0.0001101857706089504, + "learning_rate": 1.0636835297831882e-06, + "loss": 0.0319, + "num_input_tokens_seen": 200172160, + "step": 164520 + }, + { + "epoch": 18.32330994542822, + "grad_norm": 0.15812864899635315, + "learning_rate": 1.062982443885363e-06, + "loss": 0.0253, + "num_input_tokens_seen": 200178144, + "step": 164525 + }, + { + "epoch": 18.323866800311837, + "grad_norm": 0.6248987317085266, + "learning_rate": 1.0622815840918481e-06, + "loss": 0.0112, + "num_input_tokens_seen": 200184000, + "step": 164530 + }, + { + "epoch": 18.324423655195456, + "grad_norm": 0.1966719925403595, + "learning_rate": 1.0615809504092633e-06, + "loss": 0.0369, + "num_input_tokens_seen": 200190336, + "step": 164535 + }, + { + "epoch": 18.324980510079072, + "grad_norm": 0.352683961391449, + "learning_rate": 1.060880542844228e-06, + "loss": 0.0074, + "num_input_tokens_seen": 200196448, + "step": 164540 + }, + { + "epoch": 18.325537364962692, + "grad_norm": 0.0003689397999551147, + "learning_rate": 1.060180361403354e-06, + "loss": 0.012, + "num_input_tokens_seen": 200202368, + "step": 164545 + }, + { + "epoch": 18.326094219846308, + "grad_norm": 0.03501318767666817, + "learning_rate": 1.0594804060932522e-06, + "loss": 0.0677, + "num_input_tokens_seen": 200208672, + "step": 164550 + }, + { + "epoch": 18.326651074729924, + "grad_norm": 0.008714108727872372, + "learning_rate": 1.0587806769205426e-06, + "loss": 0.0152, + "num_input_tokens_seen": 200214752, + "step": 164555 + }, + { + "epoch": 18.327207929613543, + "grad_norm": 0.009554191492497921, + "learning_rate": 1.0580811738918284e-06, + "loss": 0.0113, + "num_input_tokens_seen": 200220992, + "step": 164560 + }, + { + "epoch": 18.32776478449716, + "grad_norm": 0.006191419903188944, + "learning_rate": 1.0573818970137233e-06, + "loss": 0.0297, + "num_input_tokens_seen": 200226784, + "step": 164565 + }, + { + "epoch": 18.32832163938078, + "grad_norm": 2.234020948410034, + "learning_rate": 1.056682846292828e-06, + "loss": 0.144, + "num_input_tokens_seen": 200232640, + "step": 164570 + }, + { + "epoch": 18.328878494264394, + "grad_norm": 0.0030700520146638155, + "learning_rate": 1.0559840217357452e-06, + "loss": 0.0072, + "num_input_tokens_seen": 200238720, + "step": 164575 + }, + { + "epoch": 18.329435349148014, + "grad_norm": 0.0002740764757618308, + "learning_rate": 1.0552854233490754e-06, + "loss": 0.0599, + "num_input_tokens_seen": 200244800, + "step": 164580 + }, + { + "epoch": 18.32999220403163, + "grad_norm": 0.036956921219825745, + "learning_rate": 1.0545870511394218e-06, + "loss": 0.0534, + "num_input_tokens_seen": 200250560, + "step": 164585 + }, + { + "epoch": 18.330549058915246, + "grad_norm": 0.0014332868158817291, + "learning_rate": 1.053888905113376e-06, + "loss": 0.0137, + "num_input_tokens_seen": 200256768, + "step": 164590 + }, + { + "epoch": 18.331105913798865, + "grad_norm": 2.0950329303741455, + "learning_rate": 1.0531909852775385e-06, + "loss": 0.1748, + "num_input_tokens_seen": 200262720, + "step": 164595 + }, + { + "epoch": 18.33166276868248, + "grad_norm": 0.06129145994782448, + "learning_rate": 1.0524932916384928e-06, + "loss": 0.0155, + "num_input_tokens_seen": 200268992, + "step": 164600 + }, + { + "epoch": 18.3322196235661, + "grad_norm": 0.07008644938468933, + "learning_rate": 1.0517958242028364e-06, + "loss": 0.0585, + "num_input_tokens_seen": 200275232, + "step": 164605 + }, + { + "epoch": 18.332776478449716, + "grad_norm": 0.025307979434728622, + "learning_rate": 1.051098582977153e-06, + "loss": 0.0036, + "num_input_tokens_seen": 200281792, + "step": 164610 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 1.8253679275512695, + "learning_rate": 1.0504015679680373e-06, + "loss": 0.0919, + "num_input_tokens_seen": 200287648, + "step": 164615 + }, + { + "epoch": 18.33389018821695, + "grad_norm": 0.0007417221204377711, + "learning_rate": 1.0497047791820619e-06, + "loss": 0.0064, + "num_input_tokens_seen": 200294080, + "step": 164620 + }, + { + "epoch": 18.334447043100567, + "grad_norm": 0.847926676273346, + "learning_rate": 1.0490082166258159e-06, + "loss": 0.03, + "num_input_tokens_seen": 200299616, + "step": 164625 + }, + { + "epoch": 18.335003897984187, + "grad_norm": 0.11980632692575455, + "learning_rate": 1.0483118803058745e-06, + "loss": 0.0284, + "num_input_tokens_seen": 200305824, + "step": 164630 + }, + { + "epoch": 18.335560752867803, + "grad_norm": 0.039723336696624756, + "learning_rate": 1.0476157702288187e-06, + "loss": 0.0087, + "num_input_tokens_seen": 200312288, + "step": 164635 + }, + { + "epoch": 18.33611760775142, + "grad_norm": 0.13681814074516296, + "learning_rate": 1.0469198864012236e-06, + "loss": 0.0036, + "num_input_tokens_seen": 200318624, + "step": 164640 + }, + { + "epoch": 18.336674462635038, + "grad_norm": 0.016338424757122993, + "learning_rate": 1.0462242288296593e-06, + "loss": 0.0362, + "num_input_tokens_seen": 200324992, + "step": 164645 + }, + { + "epoch": 18.337231317518654, + "grad_norm": 0.00041413266444578767, + "learning_rate": 1.045528797520695e-06, + "loss": 0.0071, + "num_input_tokens_seen": 200331136, + "step": 164650 + }, + { + "epoch": 18.337788172402274, + "grad_norm": 0.00038008022238500416, + "learning_rate": 1.0448335924809093e-06, + "loss": 0.0136, + "num_input_tokens_seen": 200337568, + "step": 164655 + }, + { + "epoch": 18.33834502728589, + "grad_norm": 0.01162934210151434, + "learning_rate": 1.0441386137168608e-06, + "loss": 0.0234, + "num_input_tokens_seen": 200343808, + "step": 164660 + }, + { + "epoch": 18.338901882169505, + "grad_norm": 0.039524976164102554, + "learning_rate": 1.043443861235116e-06, + "loss": 0.0247, + "num_input_tokens_seen": 200349600, + "step": 164665 + }, + { + "epoch": 18.339458737053125, + "grad_norm": 0.00030308368150144815, + "learning_rate": 1.0427493350422368e-06, + "loss": 0.1054, + "num_input_tokens_seen": 200355456, + "step": 164670 + }, + { + "epoch": 18.34001559193674, + "grad_norm": 0.0025949112605303526, + "learning_rate": 1.0420550351447844e-06, + "loss": 0.0176, + "num_input_tokens_seen": 200361344, + "step": 164675 + }, + { + "epoch": 18.34057244682036, + "grad_norm": 1.1896288394927979, + "learning_rate": 1.0413609615493147e-06, + "loss": 0.0216, + "num_input_tokens_seen": 200367168, + "step": 164680 + }, + { + "epoch": 18.341129301703976, + "grad_norm": 0.13272258639335632, + "learning_rate": 1.0406671142623947e-06, + "loss": 0.0044, + "num_input_tokens_seen": 200373312, + "step": 164685 + }, + { + "epoch": 18.341686156587592, + "grad_norm": 0.05684221535921097, + "learning_rate": 1.0399734932905608e-06, + "loss": 0.0246, + "num_input_tokens_seen": 200379232, + "step": 164690 + }, + { + "epoch": 18.34224301147121, + "grad_norm": 0.0001802892074920237, + "learning_rate": 1.0392800986403772e-06, + "loss": 0.0593, + "num_input_tokens_seen": 200385504, + "step": 164695 + }, + { + "epoch": 18.342799866354827, + "grad_norm": 0.0070604984648525715, + "learning_rate": 1.0385869303183888e-06, + "loss": 0.0513, + "num_input_tokens_seen": 200391840, + "step": 164700 + }, + { + "epoch": 18.343356721238447, + "grad_norm": 0.4846598505973816, + "learning_rate": 1.0378939883311457e-06, + "loss": 0.1296, + "num_input_tokens_seen": 200397856, + "step": 164705 + }, + { + "epoch": 18.343913576122063, + "grad_norm": 0.6327977180480957, + "learning_rate": 1.0372012726851926e-06, + "loss": 0.0104, + "num_input_tokens_seen": 200403776, + "step": 164710 + }, + { + "epoch": 18.34447043100568, + "grad_norm": 0.0002427388826617971, + "learning_rate": 1.0365087833870718e-06, + "loss": 0.0765, + "num_input_tokens_seen": 200409888, + "step": 164715 + }, + { + "epoch": 18.345027285889298, + "grad_norm": 0.8732783794403076, + "learning_rate": 1.0358165204433223e-06, + "loss": 0.0242, + "num_input_tokens_seen": 200415904, + "step": 164720 + }, + { + "epoch": 18.345584140772914, + "grad_norm": 0.24557355046272278, + "learning_rate": 1.035124483860489e-06, + "loss": 0.0126, + "num_input_tokens_seen": 200422016, + "step": 164725 + }, + { + "epoch": 18.346140995656533, + "grad_norm": 0.03671622648835182, + "learning_rate": 1.0344326736451027e-06, + "loss": 0.0256, + "num_input_tokens_seen": 200428160, + "step": 164730 + }, + { + "epoch": 18.34669785054015, + "grad_norm": 0.006876513361930847, + "learning_rate": 1.0337410898037026e-06, + "loss": 0.0047, + "num_input_tokens_seen": 200434048, + "step": 164735 + }, + { + "epoch": 18.347254705423765, + "grad_norm": 0.26739799976348877, + "learning_rate": 1.0330497323428168e-06, + "loss": 0.0176, + "num_input_tokens_seen": 200440000, + "step": 164740 + }, + { + "epoch": 18.347811560307385, + "grad_norm": 0.03346124291419983, + "learning_rate": 1.0323586012689818e-06, + "loss": 0.0017, + "num_input_tokens_seen": 200446048, + "step": 164745 + }, + { + "epoch": 18.348368415191, + "grad_norm": 0.004859721753746271, + "learning_rate": 1.0316676965887173e-06, + "loss": 0.0096, + "num_input_tokens_seen": 200452640, + "step": 164750 + }, + { + "epoch": 18.34892527007462, + "grad_norm": 0.004735453054308891, + "learning_rate": 1.0309770183085572e-06, + "loss": 0.0099, + "num_input_tokens_seen": 200458880, + "step": 164755 + }, + { + "epoch": 18.349482124958236, + "grad_norm": 0.28296154737472534, + "learning_rate": 1.0302865664350265e-06, + "loss": 0.0108, + "num_input_tokens_seen": 200464928, + "step": 164760 + }, + { + "epoch": 18.35003897984185, + "grad_norm": 0.009916429407894611, + "learning_rate": 1.0295963409746394e-06, + "loss": 0.0161, + "num_input_tokens_seen": 200470880, + "step": 164765 + }, + { + "epoch": 18.35059583472547, + "grad_norm": 8.157639240380377e-05, + "learning_rate": 1.028906341933919e-06, + "loss": 0.0496, + "num_input_tokens_seen": 200476928, + "step": 164770 + }, + { + "epoch": 18.351152689609087, + "grad_norm": 1.1778945922851562, + "learning_rate": 1.0282165693193846e-06, + "loss": 0.0468, + "num_input_tokens_seen": 200483456, + "step": 164775 + }, + { + "epoch": 18.351709544492707, + "grad_norm": 0.010241009294986725, + "learning_rate": 1.0275270231375533e-06, + "loss": 0.0563, + "num_input_tokens_seen": 200489824, + "step": 164780 + }, + { + "epoch": 18.352266399376322, + "grad_norm": 0.02710328996181488, + "learning_rate": 1.026837703394934e-06, + "loss": 0.0697, + "num_input_tokens_seen": 200496000, + "step": 164785 + }, + { + "epoch": 18.35282325425994, + "grad_norm": 0.0006758363451808691, + "learning_rate": 1.026148610098035e-06, + "loss": 0.0053, + "num_input_tokens_seen": 200501856, + "step": 164790 + }, + { + "epoch": 18.353380109143558, + "grad_norm": 0.04858539626002312, + "learning_rate": 1.0254597432533763e-06, + "loss": 0.1046, + "num_input_tokens_seen": 200507776, + "step": 164795 + }, + { + "epoch": 18.353936964027174, + "grad_norm": 0.27282941341400146, + "learning_rate": 1.024771102867453e-06, + "loss": 0.0337, + "num_input_tokens_seen": 200513952, + "step": 164800 + }, + { + "epoch": 18.354493818910793, + "grad_norm": 1.2707829475402832, + "learning_rate": 1.0240826889467814e-06, + "loss": 0.0733, + "num_input_tokens_seen": 200519616, + "step": 164805 + }, + { + "epoch": 18.35505067379441, + "grad_norm": 0.006815802305936813, + "learning_rate": 1.023394501497854e-06, + "loss": 0.002, + "num_input_tokens_seen": 200525792, + "step": 164810 + }, + { + "epoch": 18.355607528678025, + "grad_norm": 0.09130387008190155, + "learning_rate": 1.0227065405271768e-06, + "loss": 0.0289, + "num_input_tokens_seen": 200531712, + "step": 164815 + }, + { + "epoch": 18.356164383561644, + "grad_norm": 0.8686988353729248, + "learning_rate": 1.0220188060412445e-06, + "loss": 0.0709, + "num_input_tokens_seen": 200537600, + "step": 164820 + }, + { + "epoch": 18.35672123844526, + "grad_norm": 0.001131541095674038, + "learning_rate": 1.0213312980465573e-06, + "loss": 0.048, + "num_input_tokens_seen": 200543712, + "step": 164825 + }, + { + "epoch": 18.35727809332888, + "grad_norm": 0.037996556609869, + "learning_rate": 1.0206440165496073e-06, + "loss": 0.0717, + "num_input_tokens_seen": 200549856, + "step": 164830 + }, + { + "epoch": 18.357834948212496, + "grad_norm": 1.2956422567367554, + "learning_rate": 1.0199569615568865e-06, + "loss": 0.1248, + "num_input_tokens_seen": 200555872, + "step": 164835 + }, + { + "epoch": 18.35839180309611, + "grad_norm": 0.5889474153518677, + "learning_rate": 1.0192701330748816e-06, + "loss": 0.0117, + "num_input_tokens_seen": 200562176, + "step": 164840 + }, + { + "epoch": 18.35894865797973, + "grad_norm": 3.1313862800598145, + "learning_rate": 1.0185835311100871e-06, + "loss": 0.0252, + "num_input_tokens_seen": 200568352, + "step": 164845 + }, + { + "epoch": 18.359505512863347, + "grad_norm": 0.003378105815500021, + "learning_rate": 1.0178971556689843e-06, + "loss": 0.0128, + "num_input_tokens_seen": 200574496, + "step": 164850 + }, + { + "epoch": 18.360062367746966, + "grad_norm": 1.513442873954773, + "learning_rate": 1.0172110067580565e-06, + "loss": 0.0801, + "num_input_tokens_seen": 200580288, + "step": 164855 + }, + { + "epoch": 18.360619222630582, + "grad_norm": 0.03505754843354225, + "learning_rate": 1.0165250843837848e-06, + "loss": 0.0124, + "num_input_tokens_seen": 200586304, + "step": 164860 + }, + { + "epoch": 18.361176077514198, + "grad_norm": 0.007174776401370764, + "learning_rate": 1.01583938855265e-06, + "loss": 0.0106, + "num_input_tokens_seen": 200592288, + "step": 164865 + }, + { + "epoch": 18.361732932397818, + "grad_norm": 0.04102673754096031, + "learning_rate": 1.0151539192711251e-06, + "loss": 0.0453, + "num_input_tokens_seen": 200598368, + "step": 164870 + }, + { + "epoch": 18.362289787281433, + "grad_norm": 0.0022074466105550528, + "learning_rate": 1.0144686765456934e-06, + "loss": 0.0172, + "num_input_tokens_seen": 200604544, + "step": 164875 + }, + { + "epoch": 18.362846642165053, + "grad_norm": 0.4670813977718353, + "learning_rate": 1.0137836603828165e-06, + "loss": 0.0805, + "num_input_tokens_seen": 200610720, + "step": 164880 + }, + { + "epoch": 18.36340349704867, + "grad_norm": 0.01484699733555317, + "learning_rate": 1.0130988707889727e-06, + "loss": 0.0671, + "num_input_tokens_seen": 200616864, + "step": 164885 + }, + { + "epoch": 18.363960351932285, + "grad_norm": 0.03729065880179405, + "learning_rate": 1.012414307770626e-06, + "loss": 0.0045, + "num_input_tokens_seen": 200622752, + "step": 164890 + }, + { + "epoch": 18.364517206815904, + "grad_norm": 0.00969737209379673, + "learning_rate": 1.0117299713342466e-06, + "loss": 0.0162, + "num_input_tokens_seen": 200628608, + "step": 164895 + }, + { + "epoch": 18.36507406169952, + "grad_norm": 0.005991088692098856, + "learning_rate": 1.0110458614862983e-06, + "loss": 0.0135, + "num_input_tokens_seen": 200634976, + "step": 164900 + }, + { + "epoch": 18.36563091658314, + "grad_norm": 0.05827593803405762, + "learning_rate": 1.0103619782332403e-06, + "loss": 0.0084, + "num_input_tokens_seen": 200640832, + "step": 164905 + }, + { + "epoch": 18.366187771466755, + "grad_norm": 0.34875044226646423, + "learning_rate": 1.0096783215815308e-06, + "loss": 0.0068, + "num_input_tokens_seen": 200646784, + "step": 164910 + }, + { + "epoch": 18.366744626350375, + "grad_norm": 0.02765456773340702, + "learning_rate": 1.008994891537632e-06, + "loss": 0.0005, + "num_input_tokens_seen": 200652928, + "step": 164915 + }, + { + "epoch": 18.36730148123399, + "grad_norm": 0.27992427349090576, + "learning_rate": 1.008311688107999e-06, + "loss": 0.0078, + "num_input_tokens_seen": 200659104, + "step": 164920 + }, + { + "epoch": 18.367858336117607, + "grad_norm": 0.6014091968536377, + "learning_rate": 1.0076287112990856e-06, + "loss": 0.0152, + "num_input_tokens_seen": 200665024, + "step": 164925 + }, + { + "epoch": 18.368415191001226, + "grad_norm": 0.024701658636331558, + "learning_rate": 1.0069459611173365e-06, + "loss": 0.006, + "num_input_tokens_seen": 200671136, + "step": 164930 + }, + { + "epoch": 18.368972045884842, + "grad_norm": 0.035550083965063095, + "learning_rate": 1.0062634375692077e-06, + "loss": 0.0743, + "num_input_tokens_seen": 200676768, + "step": 164935 + }, + { + "epoch": 18.36952890076846, + "grad_norm": 0.3027935326099396, + "learning_rate": 1.0055811406611437e-06, + "loss": 0.0173, + "num_input_tokens_seen": 200682944, + "step": 164940 + }, + { + "epoch": 18.370085755652077, + "grad_norm": 0.7891919612884521, + "learning_rate": 1.0048990703995926e-06, + "loss": 0.0848, + "num_input_tokens_seen": 200688800, + "step": 164945 + }, + { + "epoch": 18.370642610535693, + "grad_norm": 0.01581716537475586, + "learning_rate": 1.0042172267909933e-06, + "loss": 0.015, + "num_input_tokens_seen": 200694560, + "step": 164950 + }, + { + "epoch": 18.371199465419313, + "grad_norm": 1.1599209308624268, + "learning_rate": 1.0035356098417853e-06, + "loss": 0.0843, + "num_input_tokens_seen": 200700704, + "step": 164955 + }, + { + "epoch": 18.37175632030293, + "grad_norm": 0.028519395738840103, + "learning_rate": 1.0028542195584107e-06, + "loss": 0.0172, + "num_input_tokens_seen": 200706592, + "step": 164960 + }, + { + "epoch": 18.372313175186548, + "grad_norm": 0.0192510187625885, + "learning_rate": 1.0021730559473031e-06, + "loss": 0.0252, + "num_input_tokens_seen": 200712672, + "step": 164965 + }, + { + "epoch": 18.372870030070164, + "grad_norm": 0.13215571641921997, + "learning_rate": 1.001492119014899e-06, + "loss": 0.0572, + "num_input_tokens_seen": 200718752, + "step": 164970 + }, + { + "epoch": 18.37342688495378, + "grad_norm": 1.862673282623291, + "learning_rate": 1.0008114087676296e-06, + "loss": 0.0491, + "num_input_tokens_seen": 200724896, + "step": 164975 + }, + { + "epoch": 18.3739837398374, + "grad_norm": 0.03959767520427704, + "learning_rate": 1.0001309252119228e-06, + "loss": 0.1704, + "num_input_tokens_seen": 200730880, + "step": 164980 + }, + { + "epoch": 18.374540594721015, + "grad_norm": 2.2741870880126953, + "learning_rate": 9.99450668354207e-07, + "loss": 0.0836, + "num_input_tokens_seen": 200737184, + "step": 164985 + }, + { + "epoch": 18.375097449604635, + "grad_norm": 1.590514063835144, + "learning_rate": 9.987706382009104e-07, + "loss": 0.1403, + "num_input_tokens_seen": 200743040, + "step": 164990 + }, + { + "epoch": 18.37565430448825, + "grad_norm": 0.009972630999982357, + "learning_rate": 9.980908347584556e-07, + "loss": 0.0027, + "num_input_tokens_seen": 200749312, + "step": 164995 + }, + { + "epoch": 18.376211159371866, + "grad_norm": 0.07528532296419144, + "learning_rate": 9.974112580332623e-07, + "loss": 0.0023, + "num_input_tokens_seen": 200755136, + "step": 165000 + }, + { + "epoch": 18.376768014255486, + "grad_norm": 0.0038646443281322718, + "learning_rate": 9.96731908031745e-07, + "loss": 0.0024, + "num_input_tokens_seen": 200761536, + "step": 165005 + }, + { + "epoch": 18.377324869139102, + "grad_norm": 0.0032012134324759245, + "learning_rate": 9.960527847603318e-07, + "loss": 0.0035, + "num_input_tokens_seen": 200767712, + "step": 165010 + }, + { + "epoch": 18.37788172402272, + "grad_norm": 0.001444136956706643, + "learning_rate": 9.953738882254287e-07, + "loss": 0.017, + "num_input_tokens_seen": 200773856, + "step": 165015 + }, + { + "epoch": 18.378438578906337, + "grad_norm": 0.06946295499801636, + "learning_rate": 9.946952184334558e-07, + "loss": 0.0071, + "num_input_tokens_seen": 200779968, + "step": 165020 + }, + { + "epoch": 18.378995433789953, + "grad_norm": 0.01659437268972397, + "learning_rate": 9.940167753908158e-07, + "loss": 0.0015, + "num_input_tokens_seen": 200786240, + "step": 165025 + }, + { + "epoch": 18.379552288673572, + "grad_norm": 0.6039819717407227, + "learning_rate": 9.933385591039207e-07, + "loss": 0.0094, + "num_input_tokens_seen": 200792064, + "step": 165030 + }, + { + "epoch": 18.38010914355719, + "grad_norm": 0.10288787633180618, + "learning_rate": 9.926605695791734e-07, + "loss": 0.082, + "num_input_tokens_seen": 200797856, + "step": 165035 + }, + { + "epoch": 18.380665998440808, + "grad_norm": 0.0025628365110605955, + "learning_rate": 9.919828068229885e-07, + "loss": 0.0399, + "num_input_tokens_seen": 200804000, + "step": 165040 + }, + { + "epoch": 18.381222853324424, + "grad_norm": 1.1398051977157593, + "learning_rate": 9.91305270841758e-07, + "loss": 0.072, + "num_input_tokens_seen": 200810240, + "step": 165045 + }, + { + "epoch": 18.38177970820804, + "grad_norm": 0.4075916111469269, + "learning_rate": 9.906279616418852e-07, + "loss": 0.0262, + "num_input_tokens_seen": 200816384, + "step": 165050 + }, + { + "epoch": 18.38233656309166, + "grad_norm": 0.003484775312244892, + "learning_rate": 9.899508792297618e-07, + "loss": 0.0978, + "num_input_tokens_seen": 200822880, + "step": 165055 + }, + { + "epoch": 18.382893417975275, + "grad_norm": 0.8975268006324768, + "learning_rate": 9.892740236117942e-07, + "loss": 0.0377, + "num_input_tokens_seen": 200828832, + "step": 165060 + }, + { + "epoch": 18.383450272858894, + "grad_norm": 1.4366530179977417, + "learning_rate": 9.885973947943717e-07, + "loss": 0.052, + "num_input_tokens_seen": 200835008, + "step": 165065 + }, + { + "epoch": 18.38400712774251, + "grad_norm": 2.549232244491577, + "learning_rate": 9.879209927838835e-07, + "loss": 0.1092, + "num_input_tokens_seen": 200841568, + "step": 165070 + }, + { + "epoch": 18.384563982626126, + "grad_norm": 0.42051100730895996, + "learning_rate": 9.872448175867161e-07, + "loss": 0.0819, + "num_input_tokens_seen": 200847776, + "step": 165075 + }, + { + "epoch": 18.385120837509746, + "grad_norm": 0.00022998616623226553, + "learning_rate": 9.865688692092617e-07, + "loss": 0.0009, + "num_input_tokens_seen": 200853856, + "step": 165080 + }, + { + "epoch": 18.38567769239336, + "grad_norm": 0.006113087292760611, + "learning_rate": 9.858931476579042e-07, + "loss": 0.0069, + "num_input_tokens_seen": 200860000, + "step": 165085 + }, + { + "epoch": 18.38623454727698, + "grad_norm": 1.358099341392517, + "learning_rate": 9.8521765293903e-07, + "loss": 0.0565, + "num_input_tokens_seen": 200866080, + "step": 165090 + }, + { + "epoch": 18.386791402160597, + "grad_norm": 2.2302067279815674, + "learning_rate": 9.845423850590092e-07, + "loss": 0.1356, + "num_input_tokens_seen": 200872416, + "step": 165095 + }, + { + "epoch": 18.387348257044213, + "grad_norm": 1.6428627967834473, + "learning_rate": 9.83867344024228e-07, + "loss": 0.1015, + "num_input_tokens_seen": 200878688, + "step": 165100 + }, + { + "epoch": 18.387905111927832, + "grad_norm": 0.06374067813158035, + "learning_rate": 9.831925298410593e-07, + "loss": 0.0052, + "num_input_tokens_seen": 200884832, + "step": 165105 + }, + { + "epoch": 18.388461966811448, + "grad_norm": 0.32005125284194946, + "learning_rate": 9.825179425158814e-07, + "loss": 0.0078, + "num_input_tokens_seen": 200891072, + "step": 165110 + }, + { + "epoch": 18.389018821695068, + "grad_norm": 3.13730788230896, + "learning_rate": 9.81843582055067e-07, + "loss": 0.0587, + "num_input_tokens_seen": 200897248, + "step": 165115 + }, + { + "epoch": 18.389575676578684, + "grad_norm": 0.3435332775115967, + "learning_rate": 9.811694484649802e-07, + "loss": 0.0084, + "num_input_tokens_seen": 200903520, + "step": 165120 + }, + { + "epoch": 18.3901325314623, + "grad_norm": 0.0006452189409174025, + "learning_rate": 9.804955417519884e-07, + "loss": 0.0082, + "num_input_tokens_seen": 200909632, + "step": 165125 + }, + { + "epoch": 18.39068938634592, + "grad_norm": 0.06895674020051956, + "learning_rate": 9.798218619224641e-07, + "loss": 0.0561, + "num_input_tokens_seen": 200915616, + "step": 165130 + }, + { + "epoch": 18.391246241229535, + "grad_norm": 0.13449668884277344, + "learning_rate": 9.79148408982769e-07, + "loss": 0.0217, + "num_input_tokens_seen": 200921952, + "step": 165135 + }, + { + "epoch": 18.391803096113154, + "grad_norm": 0.013054556213319302, + "learning_rate": 9.784751829392592e-07, + "loss": 0.0053, + "num_input_tokens_seen": 200927968, + "step": 165140 + }, + { + "epoch": 18.39235995099677, + "grad_norm": 1.4873642921447754, + "learning_rate": 9.77802183798296e-07, + "loss": 0.1615, + "num_input_tokens_seen": 200934016, + "step": 165145 + }, + { + "epoch": 18.392916805880386, + "grad_norm": 0.7581313848495483, + "learning_rate": 9.771294115662383e-07, + "loss": 0.0342, + "num_input_tokens_seen": 200940416, + "step": 165150 + }, + { + "epoch": 18.393473660764005, + "grad_norm": 0.016522694379091263, + "learning_rate": 9.764568662494395e-07, + "loss": 0.0075, + "num_input_tokens_seen": 200946336, + "step": 165155 + }, + { + "epoch": 18.39403051564762, + "grad_norm": 0.3825637102127075, + "learning_rate": 9.757845478542554e-07, + "loss": 0.0271, + "num_input_tokens_seen": 200952128, + "step": 165160 + }, + { + "epoch": 18.39458737053124, + "grad_norm": 0.02740604802966118, + "learning_rate": 9.751124563870312e-07, + "loss": 0.0008, + "num_input_tokens_seen": 200958656, + "step": 165165 + }, + { + "epoch": 18.395144225414857, + "grad_norm": 0.0002168452920159325, + "learning_rate": 9.744405918541227e-07, + "loss": 0.0021, + "num_input_tokens_seen": 200964832, + "step": 165170 + }, + { + "epoch": 18.395701080298473, + "grad_norm": 0.011902380734682083, + "learning_rate": 9.737689542618667e-07, + "loss": 0.0092, + "num_input_tokens_seen": 200971040, + "step": 165175 + }, + { + "epoch": 18.396257935182092, + "grad_norm": 2.258819580078125, + "learning_rate": 9.730975436166134e-07, + "loss": 0.0353, + "num_input_tokens_seen": 200977376, + "step": 165180 + }, + { + "epoch": 18.396814790065708, + "grad_norm": 0.02021317556500435, + "learning_rate": 9.72426359924708e-07, + "loss": 0.0572, + "num_input_tokens_seen": 200983392, + "step": 165185 + }, + { + "epoch": 18.397371644949327, + "grad_norm": 0.001855572103522718, + "learning_rate": 9.71755403192484e-07, + "loss": 0.0464, + "num_input_tokens_seen": 200989056, + "step": 165190 + }, + { + "epoch": 18.397928499832943, + "grad_norm": 0.052654653787612915, + "learning_rate": 9.710846734262785e-07, + "loss": 0.0307, + "num_input_tokens_seen": 200994880, + "step": 165195 + }, + { + "epoch": 18.39848535471656, + "grad_norm": 0.5848392844200134, + "learning_rate": 9.704141706324304e-07, + "loss": 0.0191, + "num_input_tokens_seen": 201001216, + "step": 165200 + }, + { + "epoch": 18.39904220960018, + "grad_norm": 0.019009916111826897, + "learning_rate": 9.697438948172737e-07, + "loss": 0.0038, + "num_input_tokens_seen": 201007680, + "step": 165205 + }, + { + "epoch": 18.399599064483795, + "grad_norm": 0.08122435957193375, + "learning_rate": 9.690738459871424e-07, + "loss": 0.0057, + "num_input_tokens_seen": 201013600, + "step": 165210 + }, + { + "epoch": 18.400155919367414, + "grad_norm": 1.2366591691970825, + "learning_rate": 9.68404024148356e-07, + "loss": 0.0481, + "num_input_tokens_seen": 201019840, + "step": 165215 + }, + { + "epoch": 18.40071277425103, + "grad_norm": 2.026125907897949, + "learning_rate": 9.677344293072516e-07, + "loss": 0.1567, + "num_input_tokens_seen": 201026080, + "step": 165220 + }, + { + "epoch": 18.401269629134646, + "grad_norm": 0.013954835012555122, + "learning_rate": 9.670650614701459e-07, + "loss": 0.0042, + "num_input_tokens_seen": 201032096, + "step": 165225 + }, + { + "epoch": 18.401826484018265, + "grad_norm": 0.07794390618801117, + "learning_rate": 9.663959206433704e-07, + "loss": 0.0014, + "num_input_tokens_seen": 201038208, + "step": 165230 + }, + { + "epoch": 18.40238333890188, + "grad_norm": 1.0312080383300781, + "learning_rate": 9.65727006833239e-07, + "loss": 0.0677, + "num_input_tokens_seen": 201044352, + "step": 165235 + }, + { + "epoch": 18.4029401937855, + "grad_norm": 0.6344930529594421, + "learning_rate": 9.650583200460721e-07, + "loss": 0.0603, + "num_input_tokens_seen": 201050496, + "step": 165240 + }, + { + "epoch": 18.403497048669117, + "grad_norm": 1.2922990322113037, + "learning_rate": 9.643898602881812e-07, + "loss": 0.0537, + "num_input_tokens_seen": 201056864, + "step": 165245 + }, + { + "epoch": 18.404053903552736, + "grad_norm": 0.34095799922943115, + "learning_rate": 9.63721627565889e-07, + "loss": 0.0142, + "num_input_tokens_seen": 201062976, + "step": 165250 + }, + { + "epoch": 18.404610758436352, + "grad_norm": 0.0009676244226284325, + "learning_rate": 9.630536218855068e-07, + "loss": 0.0057, + "num_input_tokens_seen": 201069248, + "step": 165255 + }, + { + "epoch": 18.405167613319968, + "grad_norm": 0.00010229439794784412, + "learning_rate": 9.623858432533383e-07, + "loss": 0.0579, + "num_input_tokens_seen": 201075296, + "step": 165260 + }, + { + "epoch": 18.405724468203587, + "grad_norm": 0.00245285639539361, + "learning_rate": 9.617182916756894e-07, + "loss": 0.0703, + "num_input_tokens_seen": 201081536, + "step": 165265 + }, + { + "epoch": 18.406281323087203, + "grad_norm": 0.22027675807476044, + "learning_rate": 9.610509671588774e-07, + "loss": 0.0516, + "num_input_tokens_seen": 201087840, + "step": 165270 + }, + { + "epoch": 18.40683817797082, + "grad_norm": 0.026959894225001335, + "learning_rate": 9.603838697091944e-07, + "loss": 0.0056, + "num_input_tokens_seen": 201093824, + "step": 165275 + }, + { + "epoch": 18.40739503285444, + "grad_norm": 0.73162841796875, + "learning_rate": 9.59716999332952e-07, + "loss": 0.0114, + "num_input_tokens_seen": 201099872, + "step": 165280 + }, + { + "epoch": 18.407951887738054, + "grad_norm": 8.231880201492459e-05, + "learning_rate": 9.590503560364366e-07, + "loss": 0.0476, + "num_input_tokens_seen": 201106208, + "step": 165285 + }, + { + "epoch": 18.408508742621674, + "grad_norm": 0.9793319702148438, + "learning_rate": 9.583839398259548e-07, + "loss": 0.052, + "num_input_tokens_seen": 201112320, + "step": 165290 + }, + { + "epoch": 18.40906559750529, + "grad_norm": 1.1783236265182495, + "learning_rate": 9.577177507077955e-07, + "loss": 0.0278, + "num_input_tokens_seen": 201118880, + "step": 165295 + }, + { + "epoch": 18.40962245238891, + "grad_norm": 0.0002148822823073715, + "learning_rate": 9.570517886882568e-07, + "loss": 0.1037, + "num_input_tokens_seen": 201124800, + "step": 165300 + }, + { + "epoch": 18.410179307272525, + "grad_norm": 0.0004110265872441232, + "learning_rate": 9.56386053773628e-07, + "loss": 0.0595, + "num_input_tokens_seen": 201130976, + "step": 165305 + }, + { + "epoch": 18.41073616215614, + "grad_norm": 0.004025522153824568, + "learning_rate": 9.557205459701957e-07, + "loss": 0.1239, + "num_input_tokens_seen": 201137120, + "step": 165310 + }, + { + "epoch": 18.41129301703976, + "grad_norm": 0.043566957116127014, + "learning_rate": 9.550552652842437e-07, + "loss": 0.0534, + "num_input_tokens_seen": 201143232, + "step": 165315 + }, + { + "epoch": 18.411849871923376, + "grad_norm": 0.07678207755088806, + "learning_rate": 9.543902117220643e-07, + "loss": 0.0047, + "num_input_tokens_seen": 201149376, + "step": 165320 + }, + { + "epoch": 18.412406726806996, + "grad_norm": 0.9405696988105774, + "learning_rate": 9.537253852899302e-07, + "loss": 0.0252, + "num_input_tokens_seen": 201155648, + "step": 165325 + }, + { + "epoch": 18.41296358169061, + "grad_norm": 0.10015320777893066, + "learning_rate": 9.530607859941281e-07, + "loss": 0.0097, + "num_input_tokens_seen": 201161792, + "step": 165330 + }, + { + "epoch": 18.413520436574228, + "grad_norm": 0.004747914150357246, + "learning_rate": 9.523964138409308e-07, + "loss": 0.0521, + "num_input_tokens_seen": 201168192, + "step": 165335 + }, + { + "epoch": 18.414077291457847, + "grad_norm": 0.1264391392469406, + "learning_rate": 9.517322688366164e-07, + "loss": 0.0287, + "num_input_tokens_seen": 201174496, + "step": 165340 + }, + { + "epoch": 18.414634146341463, + "grad_norm": 0.004479716066271067, + "learning_rate": 9.510683509874579e-07, + "loss": 0.0174, + "num_input_tokens_seen": 201180704, + "step": 165345 + }, + { + "epoch": 18.415191001225082, + "grad_norm": 0.06403161585330963, + "learning_rate": 9.504046602997308e-07, + "loss": 0.0067, + "num_input_tokens_seen": 201186880, + "step": 165350 + }, + { + "epoch": 18.4157478561087, + "grad_norm": 0.20883537828922272, + "learning_rate": 9.497411967796938e-07, + "loss": 0.1241, + "num_input_tokens_seen": 201193088, + "step": 165355 + }, + { + "epoch": 18.416304710992314, + "grad_norm": 0.001506704487837851, + "learning_rate": 9.490779604336226e-07, + "loss": 0.0233, + "num_input_tokens_seen": 201198720, + "step": 165360 + }, + { + "epoch": 18.416861565875934, + "grad_norm": 0.1327463984489441, + "learning_rate": 9.484149512677814e-07, + "loss": 0.0067, + "num_input_tokens_seen": 201204512, + "step": 165365 + }, + { + "epoch": 18.41741842075955, + "grad_norm": 0.09749966114759445, + "learning_rate": 9.477521692884267e-07, + "loss": 0.0499, + "num_input_tokens_seen": 201210496, + "step": 165370 + }, + { + "epoch": 18.41797527564317, + "grad_norm": 0.0017039590748026967, + "learning_rate": 9.470896145018254e-07, + "loss": 0.0171, + "num_input_tokens_seen": 201216768, + "step": 165375 + }, + { + "epoch": 18.418532130526785, + "grad_norm": 0.4137694537639618, + "learning_rate": 9.464272869142337e-07, + "loss": 0.0171, + "num_input_tokens_seen": 201223168, + "step": 165380 + }, + { + "epoch": 18.4190889854104, + "grad_norm": 0.18869221210479736, + "learning_rate": 9.457651865319078e-07, + "loss": 0.0578, + "num_input_tokens_seen": 201229184, + "step": 165385 + }, + { + "epoch": 18.41964584029402, + "grad_norm": 0.5753006339073181, + "learning_rate": 9.451033133610981e-07, + "loss": 0.0162, + "num_input_tokens_seen": 201235232, + "step": 165390 + }, + { + "epoch": 18.420202695177636, + "grad_norm": 0.7428058981895447, + "learning_rate": 9.444416674080636e-07, + "loss": 0.0394, + "num_input_tokens_seen": 201241344, + "step": 165395 + }, + { + "epoch": 18.420759550061256, + "grad_norm": 0.003041778225451708, + "learning_rate": 9.43780248679052e-07, + "loss": 0.0139, + "num_input_tokens_seen": 201247424, + "step": 165400 + }, + { + "epoch": 18.42131640494487, + "grad_norm": 0.5935202240943909, + "learning_rate": 9.431190571803083e-07, + "loss": 0.043, + "num_input_tokens_seen": 201253440, + "step": 165405 + }, + { + "epoch": 18.421873259828487, + "grad_norm": 0.02723187580704689, + "learning_rate": 9.424580929180749e-07, + "loss": 0.0019, + "num_input_tokens_seen": 201259680, + "step": 165410 + }, + { + "epoch": 18.422430114712107, + "grad_norm": 0.0005513999494723976, + "learning_rate": 9.417973558986048e-07, + "loss": 0.0044, + "num_input_tokens_seen": 201265856, + "step": 165415 + }, + { + "epoch": 18.422986969595723, + "grad_norm": 0.25704729557037354, + "learning_rate": 9.411368461281294e-07, + "loss": 0.1516, + "num_input_tokens_seen": 201272576, + "step": 165420 + }, + { + "epoch": 18.423543824479342, + "grad_norm": 0.8511970639228821, + "learning_rate": 9.404765636128965e-07, + "loss": 0.0939, + "num_input_tokens_seen": 201278880, + "step": 165425 + }, + { + "epoch": 18.424100679362958, + "grad_norm": 0.07152042537927628, + "learning_rate": 9.398165083591343e-07, + "loss": 0.009, + "num_input_tokens_seen": 201285344, + "step": 165430 + }, + { + "epoch": 18.424657534246574, + "grad_norm": 2.853929042816162, + "learning_rate": 9.39156680373085e-07, + "loss": 0.0994, + "num_input_tokens_seen": 201291584, + "step": 165435 + }, + { + "epoch": 18.425214389130193, + "grad_norm": 2.727323293685913, + "learning_rate": 9.384970796609771e-07, + "loss": 0.2166, + "num_input_tokens_seen": 201297440, + "step": 165440 + }, + { + "epoch": 18.42577124401381, + "grad_norm": 0.042979031801223755, + "learning_rate": 9.378377062290417e-07, + "loss": 0.0072, + "num_input_tokens_seen": 201303648, + "step": 165445 + }, + { + "epoch": 18.42632809889743, + "grad_norm": 0.100858673453331, + "learning_rate": 9.371785600835098e-07, + "loss": 0.0059, + "num_input_tokens_seen": 201309760, + "step": 165450 + }, + { + "epoch": 18.426884953781045, + "grad_norm": 0.4845860004425049, + "learning_rate": 9.365196412306043e-07, + "loss": 0.0441, + "num_input_tokens_seen": 201315936, + "step": 165455 + }, + { + "epoch": 18.42744180866466, + "grad_norm": 0.6139291524887085, + "learning_rate": 9.358609496765452e-07, + "loss": 0.069, + "num_input_tokens_seen": 201321984, + "step": 165460 + }, + { + "epoch": 18.42799866354828, + "grad_norm": 0.7942217588424683, + "learning_rate": 9.352024854275637e-07, + "loss": 0.0199, + "num_input_tokens_seen": 201327552, + "step": 165465 + }, + { + "epoch": 18.428555518431896, + "grad_norm": 0.27248579263687134, + "learning_rate": 9.34544248489877e-07, + "loss": 0.0179, + "num_input_tokens_seen": 201333536, + "step": 165470 + }, + { + "epoch": 18.429112373315515, + "grad_norm": 1.17194664478302, + "learning_rate": 9.338862388696995e-07, + "loss": 0.0891, + "num_input_tokens_seen": 201339584, + "step": 165475 + }, + { + "epoch": 18.42966922819913, + "grad_norm": 1.7016992568969727, + "learning_rate": 9.33228456573243e-07, + "loss": 0.0581, + "num_input_tokens_seen": 201344608, + "step": 165480 + }, + { + "epoch": 18.430226083082747, + "grad_norm": 0.07435711473226547, + "learning_rate": 9.325709016067302e-07, + "loss": 0.009, + "num_input_tokens_seen": 201350176, + "step": 165485 + }, + { + "epoch": 18.430782937966367, + "grad_norm": 0.005618687719106674, + "learning_rate": 9.319135739763646e-07, + "loss": 0.0472, + "num_input_tokens_seen": 201356288, + "step": 165490 + }, + { + "epoch": 18.431339792849982, + "grad_norm": 0.05977275222539902, + "learning_rate": 9.312564736883661e-07, + "loss": 0.0103, + "num_input_tokens_seen": 201362528, + "step": 165495 + }, + { + "epoch": 18.431896647733602, + "grad_norm": 0.0007642085547558963, + "learning_rate": 9.30599600748927e-07, + "loss": 0.0696, + "num_input_tokens_seen": 201368832, + "step": 165500 + }, + { + "epoch": 18.432453502617218, + "grad_norm": 0.45372116565704346, + "learning_rate": 9.299429551642591e-07, + "loss": 0.0169, + "num_input_tokens_seen": 201374528, + "step": 165505 + }, + { + "epoch": 18.433010357500834, + "grad_norm": 4.356781959533691, + "learning_rate": 9.292865369405656e-07, + "loss": 0.0999, + "num_input_tokens_seen": 201380576, + "step": 165510 + }, + { + "epoch": 18.433567212384453, + "grad_norm": 0.023740816861391068, + "learning_rate": 9.286303460840446e-07, + "loss": 0.0143, + "num_input_tokens_seen": 201386656, + "step": 165515 + }, + { + "epoch": 18.43412406726807, + "grad_norm": 0.4778543710708618, + "learning_rate": 9.279743826008991e-07, + "loss": 0.0142, + "num_input_tokens_seen": 201392576, + "step": 165520 + }, + { + "epoch": 18.43468092215169, + "grad_norm": 0.024913296103477478, + "learning_rate": 9.273186464973216e-07, + "loss": 0.0062, + "num_input_tokens_seen": 201398560, + "step": 165525 + }, + { + "epoch": 18.435237777035304, + "grad_norm": 0.015119267627596855, + "learning_rate": 9.266631377795015e-07, + "loss": 0.0063, + "num_input_tokens_seen": 201404352, + "step": 165530 + }, + { + "epoch": 18.43579463191892, + "grad_norm": 0.4753657579421997, + "learning_rate": 9.260078564536395e-07, + "loss": 0.1865, + "num_input_tokens_seen": 201410592, + "step": 165535 + }, + { + "epoch": 18.43635148680254, + "grad_norm": 0.7901867032051086, + "learning_rate": 9.253528025259195e-07, + "loss": 0.0222, + "num_input_tokens_seen": 201416928, + "step": 165540 + }, + { + "epoch": 18.436908341686156, + "grad_norm": 0.4570878744125366, + "learning_rate": 9.246979760025309e-07, + "loss": 0.025, + "num_input_tokens_seen": 201423072, + "step": 165545 + }, + { + "epoch": 18.437465196569775, + "grad_norm": 0.5914701819419861, + "learning_rate": 9.240433768896578e-07, + "loss": 0.0117, + "num_input_tokens_seen": 201428960, + "step": 165550 + }, + { + "epoch": 18.43802205145339, + "grad_norm": 0.7680831551551819, + "learning_rate": 9.233890051934841e-07, + "loss": 0.0264, + "num_input_tokens_seen": 201435520, + "step": 165555 + }, + { + "epoch": 18.438578906337007, + "grad_norm": 0.0010617005173116922, + "learning_rate": 9.227348609201908e-07, + "loss": 0.1023, + "num_input_tokens_seen": 201441696, + "step": 165560 + }, + { + "epoch": 18.439135761220626, + "grad_norm": 0.645671546459198, + "learning_rate": 9.220809440759592e-07, + "loss": 0.0113, + "num_input_tokens_seen": 201447328, + "step": 165565 + }, + { + "epoch": 18.439692616104242, + "grad_norm": 0.4735051691532135, + "learning_rate": 9.21427254666965e-07, + "loss": 0.012, + "num_input_tokens_seen": 201453408, + "step": 165570 + }, + { + "epoch": 18.44024947098786, + "grad_norm": 0.0009970178361982107, + "learning_rate": 9.207737926993781e-07, + "loss": 0.0502, + "num_input_tokens_seen": 201459808, + "step": 165575 + }, + { + "epoch": 18.440806325871478, + "grad_norm": 0.09281452745199203, + "learning_rate": 9.20120558179377e-07, + "loss": 0.0166, + "num_input_tokens_seen": 201466016, + "step": 165580 + }, + { + "epoch": 18.441363180755094, + "grad_norm": 0.05654230713844299, + "learning_rate": 9.19467551113129e-07, + "loss": 0.0155, + "num_input_tokens_seen": 201472192, + "step": 165585 + }, + { + "epoch": 18.441920035638713, + "grad_norm": 0.035144682973623276, + "learning_rate": 9.188147715068041e-07, + "loss": 0.0027, + "num_input_tokens_seen": 201478336, + "step": 165590 + }, + { + "epoch": 18.44247689052233, + "grad_norm": 0.0927412360906601, + "learning_rate": 9.181622193665668e-07, + "loss": 0.0013, + "num_input_tokens_seen": 201484800, + "step": 165595 + }, + { + "epoch": 18.44303374540595, + "grad_norm": 0.035032909363508224, + "learning_rate": 9.175098946985789e-07, + "loss": 0.1123, + "num_input_tokens_seen": 201491072, + "step": 165600 + }, + { + "epoch": 18.443590600289564, + "grad_norm": 9.539885650156066e-05, + "learning_rate": 9.168577975090076e-07, + "loss": 0.0702, + "num_input_tokens_seen": 201497248, + "step": 165605 + }, + { + "epoch": 18.44414745517318, + "grad_norm": 0.0002243208873551339, + "learning_rate": 9.162059278040063e-07, + "loss": 0.0192, + "num_input_tokens_seen": 201503456, + "step": 165610 + }, + { + "epoch": 18.4447043100568, + "grad_norm": 0.06954450905323029, + "learning_rate": 9.155542855897425e-07, + "loss": 0.0019, + "num_input_tokens_seen": 201509152, + "step": 165615 + }, + { + "epoch": 18.445261164940415, + "grad_norm": 0.061290740966796875, + "learning_rate": 9.149028708723583e-07, + "loss": 0.0371, + "num_input_tokens_seen": 201515232, + "step": 165620 + }, + { + "epoch": 18.445818019824035, + "grad_norm": 0.01308382023125887, + "learning_rate": 9.142516836580156e-07, + "loss": 0.0995, + "num_input_tokens_seen": 201521152, + "step": 165625 + }, + { + "epoch": 18.44637487470765, + "grad_norm": 0.6643208861351013, + "learning_rate": 9.136007239528593e-07, + "loss": 0.0612, + "num_input_tokens_seen": 201527168, + "step": 165630 + }, + { + "epoch": 18.44693172959127, + "grad_norm": 0.052322614938020706, + "learning_rate": 9.129499917630458e-07, + "loss": 0.0061, + "num_input_tokens_seen": 201533600, + "step": 165635 + }, + { + "epoch": 18.447488584474886, + "grad_norm": 0.4361187815666199, + "learning_rate": 9.122994870947171e-07, + "loss": 0.0099, + "num_input_tokens_seen": 201539872, + "step": 165640 + }, + { + "epoch": 18.448045439358502, + "grad_norm": 0.00406294222921133, + "learning_rate": 9.116492099540186e-07, + "loss": 0.0118, + "num_input_tokens_seen": 201546048, + "step": 165645 + }, + { + "epoch": 18.44860229424212, + "grad_norm": 0.055025868117809296, + "learning_rate": 9.109991603470896e-07, + "loss": 0.002, + "num_input_tokens_seen": 201552000, + "step": 165650 + }, + { + "epoch": 18.449159149125737, + "grad_norm": 0.33401980996131897, + "learning_rate": 9.103493382800781e-07, + "loss": 0.0074, + "num_input_tokens_seen": 201558208, + "step": 165655 + }, + { + "epoch": 18.449716004009357, + "grad_norm": 0.0005723483627662063, + "learning_rate": 9.096997437591153e-07, + "loss": 0.0356, + "num_input_tokens_seen": 201564352, + "step": 165660 + }, + { + "epoch": 18.450272858892973, + "grad_norm": 1.5427342653274536, + "learning_rate": 9.090503767903408e-07, + "loss": 0.0407, + "num_input_tokens_seen": 201570528, + "step": 165665 + }, + { + "epoch": 18.45082971377659, + "grad_norm": 0.04120525345206261, + "learning_rate": 9.084012373798828e-07, + "loss": 0.0728, + "num_input_tokens_seen": 201576704, + "step": 165670 + }, + { + "epoch": 18.451386568660208, + "grad_norm": 0.15679287910461426, + "learning_rate": 9.077523255338783e-07, + "loss": 0.0121, + "num_input_tokens_seen": 201583040, + "step": 165675 + }, + { + "epoch": 18.451943423543824, + "grad_norm": 0.00012339981913100928, + "learning_rate": 9.071036412584555e-07, + "loss": 0.1411, + "num_input_tokens_seen": 201589088, + "step": 165680 + }, + { + "epoch": 18.452500278427443, + "grad_norm": 0.0006536889122799039, + "learning_rate": 9.064551845597457e-07, + "loss": 0.0081, + "num_input_tokens_seen": 201595200, + "step": 165685 + }, + { + "epoch": 18.45305713331106, + "grad_norm": 0.1334650069475174, + "learning_rate": 9.058069554438664e-07, + "loss": 0.0149, + "num_input_tokens_seen": 201601248, + "step": 165690 + }, + { + "epoch": 18.453613988194675, + "grad_norm": 1.621643304824829, + "learning_rate": 9.051589539169458e-07, + "loss": 0.1017, + "num_input_tokens_seen": 201607200, + "step": 165695 + }, + { + "epoch": 18.454170843078295, + "grad_norm": 0.24259425699710846, + "learning_rate": 9.045111799850986e-07, + "loss": 0.0298, + "num_input_tokens_seen": 201613472, + "step": 165700 + }, + { + "epoch": 18.45472769796191, + "grad_norm": 0.46434837579727173, + "learning_rate": 9.038636336544532e-07, + "loss": 0.003, + "num_input_tokens_seen": 201619840, + "step": 165705 + }, + { + "epoch": 18.45528455284553, + "grad_norm": 0.02788105234503746, + "learning_rate": 9.032163149311213e-07, + "loss": 0.0357, + "num_input_tokens_seen": 201625824, + "step": 165710 + }, + { + "epoch": 18.455841407729146, + "grad_norm": 0.03905349597334862, + "learning_rate": 9.025692238212174e-07, + "loss": 0.0185, + "num_input_tokens_seen": 201632192, + "step": 165715 + }, + { + "epoch": 18.456398262612762, + "grad_norm": 0.16156676411628723, + "learning_rate": 9.019223603308508e-07, + "loss": 0.0137, + "num_input_tokens_seen": 201637984, + "step": 165720 + }, + { + "epoch": 18.45695511749638, + "grad_norm": 0.1028846874833107, + "learning_rate": 9.012757244661385e-07, + "loss": 0.0215, + "num_input_tokens_seen": 201643520, + "step": 165725 + }, + { + "epoch": 18.457511972379997, + "grad_norm": 0.0015768335433676839, + "learning_rate": 9.006293162331813e-07, + "loss": 0.0069, + "num_input_tokens_seen": 201649408, + "step": 165730 + }, + { + "epoch": 18.458068827263617, + "grad_norm": 0.04355816915631294, + "learning_rate": 8.999831356380911e-07, + "loss": 0.0398, + "num_input_tokens_seen": 201655808, + "step": 165735 + }, + { + "epoch": 18.458625682147233, + "grad_norm": 0.2940022945404053, + "learning_rate": 8.993371826869656e-07, + "loss": 0.006, + "num_input_tokens_seen": 201661664, + "step": 165740 + }, + { + "epoch": 18.45918253703085, + "grad_norm": 0.9084712862968445, + "learning_rate": 8.986914573859112e-07, + "loss": 0.093, + "num_input_tokens_seen": 201667616, + "step": 165745 + }, + { + "epoch": 18.459739391914468, + "grad_norm": 0.7401853203773499, + "learning_rate": 8.980459597410257e-07, + "loss": 0.0203, + "num_input_tokens_seen": 201673920, + "step": 165750 + }, + { + "epoch": 18.460296246798084, + "grad_norm": 0.010046499781310558, + "learning_rate": 8.97400689758407e-07, + "loss": 0.0415, + "num_input_tokens_seen": 201680128, + "step": 165755 + }, + { + "epoch": 18.460853101681703, + "grad_norm": 2.371110677719116, + "learning_rate": 8.967556474441474e-07, + "loss": 0.1112, + "num_input_tokens_seen": 201686112, + "step": 165760 + }, + { + "epoch": 18.46140995656532, + "grad_norm": 0.24317766726016998, + "learning_rate": 8.961108328043449e-07, + "loss": 0.0963, + "num_input_tokens_seen": 201691104, + "step": 165765 + }, + { + "epoch": 18.461966811448935, + "grad_norm": 0.004136190749704838, + "learning_rate": 8.954662458450864e-07, + "loss": 0.0059, + "num_input_tokens_seen": 201697248, + "step": 165770 + }, + { + "epoch": 18.462523666332554, + "grad_norm": 0.7131898403167725, + "learning_rate": 8.948218865724584e-07, + "loss": 0.1022, + "num_input_tokens_seen": 201703232, + "step": 165775 + }, + { + "epoch": 18.46308052121617, + "grad_norm": 0.1962917596101761, + "learning_rate": 8.941777549925535e-07, + "loss": 0.0292, + "num_input_tokens_seen": 201709440, + "step": 165780 + }, + { + "epoch": 18.46363737609979, + "grad_norm": 0.13292603194713593, + "learning_rate": 8.935338511114527e-07, + "loss": 0.0374, + "num_input_tokens_seen": 201715840, + "step": 165785 + }, + { + "epoch": 18.464194230983406, + "grad_norm": 0.24952316284179688, + "learning_rate": 8.928901749352376e-07, + "loss": 0.0369, + "num_input_tokens_seen": 201721984, + "step": 165790 + }, + { + "epoch": 18.46475108586702, + "grad_norm": 0.00017122137069236487, + "learning_rate": 8.92246726469989e-07, + "loss": 0.086, + "num_input_tokens_seen": 201727904, + "step": 165795 + }, + { + "epoch": 18.46530794075064, + "grad_norm": 0.03555876389145851, + "learning_rate": 8.916035057217859e-07, + "loss": 0.0013, + "num_input_tokens_seen": 201734144, + "step": 165800 + }, + { + "epoch": 18.465864795634257, + "grad_norm": 0.6336386203765869, + "learning_rate": 8.909605126967036e-07, + "loss": 0.081, + "num_input_tokens_seen": 201740224, + "step": 165805 + }, + { + "epoch": 18.466421650517876, + "grad_norm": 1.0593000650405884, + "learning_rate": 8.903177474008151e-07, + "loss": 0.0188, + "num_input_tokens_seen": 201746272, + "step": 165810 + }, + { + "epoch": 18.466978505401492, + "grad_norm": 0.6249818801879883, + "learning_rate": 8.896752098401879e-07, + "loss": 0.0376, + "num_input_tokens_seen": 201751936, + "step": 165815 + }, + { + "epoch": 18.46753536028511, + "grad_norm": 0.05987918749451637, + "learning_rate": 8.890329000208975e-07, + "loss": 0.0274, + "num_input_tokens_seen": 201757760, + "step": 165820 + }, + { + "epoch": 18.468092215168728, + "grad_norm": 0.002448001643642783, + "learning_rate": 8.883908179490086e-07, + "loss": 0.001, + "num_input_tokens_seen": 201763616, + "step": 165825 + }, + { + "epoch": 18.468649070052344, + "grad_norm": 0.5979921817779541, + "learning_rate": 8.877489636305885e-07, + "loss": 0.0813, + "num_input_tokens_seen": 201768864, + "step": 165830 + }, + { + "epoch": 18.469205924935963, + "grad_norm": 0.07695028930902481, + "learning_rate": 8.871073370716937e-07, + "loss": 0.0251, + "num_input_tokens_seen": 201775200, + "step": 165835 + }, + { + "epoch": 18.46976277981958, + "grad_norm": 0.043338049203157425, + "learning_rate": 8.864659382783941e-07, + "loss": 0.1054, + "num_input_tokens_seen": 201781184, + "step": 165840 + }, + { + "epoch": 18.470319634703195, + "grad_norm": 0.4216785728931427, + "learning_rate": 8.858247672567377e-07, + "loss": 0.0236, + "num_input_tokens_seen": 201787008, + "step": 165845 + }, + { + "epoch": 18.470876489586814, + "grad_norm": 0.5421313643455505, + "learning_rate": 8.851838240127891e-07, + "loss": 0.0796, + "num_input_tokens_seen": 201792992, + "step": 165850 + }, + { + "epoch": 18.47143334447043, + "grad_norm": 1.0231711864471436, + "learning_rate": 8.845431085526018e-07, + "loss": 0.0586, + "num_input_tokens_seen": 201799136, + "step": 165855 + }, + { + "epoch": 18.47199019935405, + "grad_norm": 0.15729668736457825, + "learning_rate": 8.839026208822238e-07, + "loss": 0.217, + "num_input_tokens_seen": 201805376, + "step": 165860 + }, + { + "epoch": 18.472547054237666, + "grad_norm": 0.04217798262834549, + "learning_rate": 8.832623610077057e-07, + "loss": 0.0928, + "num_input_tokens_seen": 201811392, + "step": 165865 + }, + { + "epoch": 18.47310390912128, + "grad_norm": 1.0769844055175781, + "learning_rate": 8.826223289350982e-07, + "loss": 0.0529, + "num_input_tokens_seen": 201817376, + "step": 165870 + }, + { + "epoch": 18.4736607640049, + "grad_norm": 0.11311798542737961, + "learning_rate": 8.819825246704466e-07, + "loss": 0.0016, + "num_input_tokens_seen": 201823680, + "step": 165875 + }, + { + "epoch": 18.474217618888517, + "grad_norm": 1.9875282049179077, + "learning_rate": 8.813429482197933e-07, + "loss": 0.0379, + "num_input_tokens_seen": 201829728, + "step": 165880 + }, + { + "epoch": 18.474774473772136, + "grad_norm": 0.004095681477338076, + "learning_rate": 8.80703599589175e-07, + "loss": 0.0025, + "num_input_tokens_seen": 201836128, + "step": 165885 + }, + { + "epoch": 18.475331328655752, + "grad_norm": 0.002691853092983365, + "learning_rate": 8.800644787846396e-07, + "loss": 0.001, + "num_input_tokens_seen": 201842368, + "step": 165890 + }, + { + "epoch": 18.475888183539368, + "grad_norm": 0.001443962100893259, + "learning_rate": 8.794255858122158e-07, + "loss": 0.0014, + "num_input_tokens_seen": 201848128, + "step": 165895 + }, + { + "epoch": 18.476445038422987, + "grad_norm": 0.00209405436180532, + "learning_rate": 8.787869206779487e-07, + "loss": 0.0003, + "num_input_tokens_seen": 201854592, + "step": 165900 + }, + { + "epoch": 18.477001893306603, + "grad_norm": 0.032759685069322586, + "learning_rate": 8.781484833878584e-07, + "loss": 0.0056, + "num_input_tokens_seen": 201860832, + "step": 165905 + }, + { + "epoch": 18.477558748190223, + "grad_norm": 0.3607700765132904, + "learning_rate": 8.775102739479846e-07, + "loss": 0.0354, + "num_input_tokens_seen": 201867008, + "step": 165910 + }, + { + "epoch": 18.47811560307384, + "grad_norm": 1.242144227027893, + "learning_rate": 8.768722923643502e-07, + "loss": 0.0697, + "num_input_tokens_seen": 201872704, + "step": 165915 + }, + { + "epoch": 18.478672457957455, + "grad_norm": 0.0559835247695446, + "learning_rate": 8.762345386429865e-07, + "loss": 0.0036, + "num_input_tokens_seen": 201878720, + "step": 165920 + }, + { + "epoch": 18.479229312841074, + "grad_norm": 0.002082236809656024, + "learning_rate": 8.755970127899166e-07, + "loss": 0.0024, + "num_input_tokens_seen": 201884832, + "step": 165925 + }, + { + "epoch": 18.47978616772469, + "grad_norm": 2.854861259460449, + "learning_rate": 8.749597148111604e-07, + "loss": 0.069, + "num_input_tokens_seen": 201890880, + "step": 165930 + }, + { + "epoch": 18.48034302260831, + "grad_norm": 1.8108958005905151, + "learning_rate": 8.743226447127356e-07, + "loss": 0.1042, + "num_input_tokens_seen": 201896928, + "step": 165935 + }, + { + "epoch": 18.480899877491925, + "grad_norm": 0.04335762932896614, + "learning_rate": 8.73685802500665e-07, + "loss": 0.0048, + "num_input_tokens_seen": 201902880, + "step": 165940 + }, + { + "epoch": 18.48145673237554, + "grad_norm": 0.005653929430991411, + "learning_rate": 8.730491881809633e-07, + "loss": 0.0435, + "num_input_tokens_seen": 201908832, + "step": 165945 + }, + { + "epoch": 18.48201358725916, + "grad_norm": 1.8223875761032104, + "learning_rate": 8.724128017596394e-07, + "loss": 0.0683, + "num_input_tokens_seen": 201915008, + "step": 165950 + }, + { + "epoch": 18.482570442142777, + "grad_norm": 0.41269397735595703, + "learning_rate": 8.717766432427055e-07, + "loss": 0.0353, + "num_input_tokens_seen": 201921376, + "step": 165955 + }, + { + "epoch": 18.483127297026396, + "grad_norm": 7.126353739295155e-05, + "learning_rate": 8.711407126361759e-07, + "loss": 0.0879, + "num_input_tokens_seen": 201927392, + "step": 165960 + }, + { + "epoch": 18.483684151910012, + "grad_norm": 1.4331480264663696, + "learning_rate": 8.705050099460516e-07, + "loss": 0.0533, + "num_input_tokens_seen": 201933504, + "step": 165965 + }, + { + "epoch": 18.48424100679363, + "grad_norm": 0.23298214375972748, + "learning_rate": 8.698695351783415e-07, + "loss": 0.0211, + "num_input_tokens_seen": 201939712, + "step": 165970 + }, + { + "epoch": 18.484797861677247, + "grad_norm": 1.6350442171096802, + "learning_rate": 8.692342883390464e-07, + "loss": 0.0419, + "num_input_tokens_seen": 201945696, + "step": 165975 + }, + { + "epoch": 18.485354716560863, + "grad_norm": 0.43327611684799194, + "learning_rate": 8.685992694341671e-07, + "loss": 0.037, + "num_input_tokens_seen": 201951200, + "step": 165980 + }, + { + "epoch": 18.485911571444483, + "grad_norm": 0.2172122746706009, + "learning_rate": 8.679644784696988e-07, + "loss": 0.0345, + "num_input_tokens_seen": 201957216, + "step": 165985 + }, + { + "epoch": 18.4864684263281, + "grad_norm": 0.00011483825073810294, + "learning_rate": 8.673299154516423e-07, + "loss": 0.0633, + "num_input_tokens_seen": 201963520, + "step": 165990 + }, + { + "epoch": 18.487025281211718, + "grad_norm": 0.004501135088503361, + "learning_rate": 8.666955803859928e-07, + "loss": 0.1175, + "num_input_tokens_seen": 201970048, + "step": 165995 + }, + { + "epoch": 18.487582136095334, + "grad_norm": 0.11684533208608627, + "learning_rate": 8.660614732787343e-07, + "loss": 0.0247, + "num_input_tokens_seen": 201976032, + "step": 166000 + }, + { + "epoch": 18.48813899097895, + "grad_norm": 0.00746524753049016, + "learning_rate": 8.654275941358592e-07, + "loss": 0.0136, + "num_input_tokens_seen": 201982176, + "step": 166005 + }, + { + "epoch": 18.48869584586257, + "grad_norm": 0.12022868543863297, + "learning_rate": 8.647939429633628e-07, + "loss": 0.0055, + "num_input_tokens_seen": 201988448, + "step": 166010 + }, + { + "epoch": 18.489252700746185, + "grad_norm": 0.020441707223653793, + "learning_rate": 8.641605197672182e-07, + "loss": 0.0132, + "num_input_tokens_seen": 201994304, + "step": 166015 + }, + { + "epoch": 18.489809555629805, + "grad_norm": 0.18667997419834137, + "learning_rate": 8.635273245534203e-07, + "loss": 0.0792, + "num_input_tokens_seen": 202000384, + "step": 166020 + }, + { + "epoch": 18.49036641051342, + "grad_norm": 0.046790193766355515, + "learning_rate": 8.628943573279425e-07, + "loss": 0.0908, + "num_input_tokens_seen": 202006656, + "step": 166025 + }, + { + "epoch": 18.490923265397036, + "grad_norm": 0.5672747492790222, + "learning_rate": 8.622616180967658e-07, + "loss": 0.0073, + "num_input_tokens_seen": 202012864, + "step": 166030 + }, + { + "epoch": 18.491480120280656, + "grad_norm": 1.0716674327850342, + "learning_rate": 8.616291068658633e-07, + "loss": 0.0394, + "num_input_tokens_seen": 202019168, + "step": 166035 + }, + { + "epoch": 18.49203697516427, + "grad_norm": 1.8080406188964844, + "learning_rate": 8.609968236412163e-07, + "loss": 0.0725, + "num_input_tokens_seen": 202025312, + "step": 166040 + }, + { + "epoch": 18.49259383004789, + "grad_norm": 0.9636914134025574, + "learning_rate": 8.603647684287952e-07, + "loss": 0.0283, + "num_input_tokens_seen": 202031488, + "step": 166045 + }, + { + "epoch": 18.493150684931507, + "grad_norm": 0.004158083349466324, + "learning_rate": 8.597329412345701e-07, + "loss": 0.0357, + "num_input_tokens_seen": 202037632, + "step": 166050 + }, + { + "epoch": 18.493707539815123, + "grad_norm": 0.2556919455528259, + "learning_rate": 8.591013420645055e-07, + "loss": 0.0058, + "num_input_tokens_seen": 202043616, + "step": 166055 + }, + { + "epoch": 18.494264394698742, + "grad_norm": 0.0070774671621620655, + "learning_rate": 8.58469970924572e-07, + "loss": 0.0046, + "num_input_tokens_seen": 202049856, + "step": 166060 + }, + { + "epoch": 18.49482124958236, + "grad_norm": 1.0811854600906372, + "learning_rate": 8.578388278207311e-07, + "loss": 0.0494, + "num_input_tokens_seen": 202055872, + "step": 166065 + }, + { + "epoch": 18.495378104465978, + "grad_norm": 0.006036320701241493, + "learning_rate": 8.572079127589449e-07, + "loss": 0.0016, + "num_input_tokens_seen": 202061600, + "step": 166070 + }, + { + "epoch": 18.495934959349594, + "grad_norm": 0.0002901947300415486, + "learning_rate": 8.565772257451699e-07, + "loss": 0.0794, + "num_input_tokens_seen": 202067584, + "step": 166075 + }, + { + "epoch": 18.49649181423321, + "grad_norm": 0.014590884558856487, + "learning_rate": 8.559467667853705e-07, + "loss": 0.0031, + "num_input_tokens_seen": 202074016, + "step": 166080 + }, + { + "epoch": 18.49704866911683, + "grad_norm": 0.0037567655090242624, + "learning_rate": 8.553165358854947e-07, + "loss": 0.0274, + "num_input_tokens_seen": 202080160, + "step": 166085 + }, + { + "epoch": 18.497605524000445, + "grad_norm": 0.00026228505885228515, + "learning_rate": 8.546865330515019e-07, + "loss": 0.0229, + "num_input_tokens_seen": 202086720, + "step": 166090 + }, + { + "epoch": 18.498162378884064, + "grad_norm": 0.003649576101452112, + "learning_rate": 8.540567582893372e-07, + "loss": 0.1371, + "num_input_tokens_seen": 202092800, + "step": 166095 + }, + { + "epoch": 18.49871923376768, + "grad_norm": 0.4169968366622925, + "learning_rate": 8.534272116049513e-07, + "loss": 0.0767, + "num_input_tokens_seen": 202099104, + "step": 166100 + }, + { + "epoch": 18.499276088651296, + "grad_norm": 0.003047086065635085, + "learning_rate": 8.527978930042923e-07, + "loss": 0.0008, + "num_input_tokens_seen": 202105056, + "step": 166105 + }, + { + "epoch": 18.499832943534916, + "grad_norm": 0.5839412808418274, + "learning_rate": 8.521688024933028e-07, + "loss": 0.1119, + "num_input_tokens_seen": 202111168, + "step": 166110 + }, + { + "epoch": 18.50038979841853, + "grad_norm": 0.005098224151879549, + "learning_rate": 8.515399400779278e-07, + "loss": 0.0122, + "num_input_tokens_seen": 202116832, + "step": 166115 + }, + { + "epoch": 18.50094665330215, + "grad_norm": 0.17264337837696075, + "learning_rate": 8.509113057641072e-07, + "loss": 0.1956, + "num_input_tokens_seen": 202122848, + "step": 166120 + }, + { + "epoch": 18.501503508185767, + "grad_norm": 0.015507727861404419, + "learning_rate": 8.502828995577722e-07, + "loss": 0.0582, + "num_input_tokens_seen": 202128384, + "step": 166125 + }, + { + "epoch": 18.502060363069383, + "grad_norm": 0.020982077345252037, + "learning_rate": 8.496547214648654e-07, + "loss": 0.0261, + "num_input_tokens_seen": 202134848, + "step": 166130 + }, + { + "epoch": 18.502617217953002, + "grad_norm": 0.01384524255990982, + "learning_rate": 8.490267714913208e-07, + "loss": 0.0204, + "num_input_tokens_seen": 202140704, + "step": 166135 + }, + { + "epoch": 18.503174072836618, + "grad_norm": 0.018421830609440804, + "learning_rate": 8.483990496430671e-07, + "loss": 0.0099, + "num_input_tokens_seen": 202146624, + "step": 166140 + }, + { + "epoch": 18.503730927720238, + "grad_norm": 0.0001066552140400745, + "learning_rate": 8.477715559260302e-07, + "loss": 0.0303, + "num_input_tokens_seen": 202153024, + "step": 166145 + }, + { + "epoch": 18.504287782603853, + "grad_norm": 0.000470559811219573, + "learning_rate": 8.471442903461468e-07, + "loss": 0.0006, + "num_input_tokens_seen": 202159296, + "step": 166150 + }, + { + "epoch": 18.50484463748747, + "grad_norm": 0.05036739632487297, + "learning_rate": 8.465172529093318e-07, + "loss": 0.0607, + "num_input_tokens_seen": 202165696, + "step": 166155 + }, + { + "epoch": 18.50540149237109, + "grad_norm": 0.14054663479328156, + "learning_rate": 8.458904436215164e-07, + "loss": 0.038, + "num_input_tokens_seen": 202171808, + "step": 166160 + }, + { + "epoch": 18.505958347254705, + "grad_norm": 0.7016712427139282, + "learning_rate": 8.452638624886183e-07, + "loss": 0.1063, + "num_input_tokens_seen": 202177760, + "step": 166165 + }, + { + "epoch": 18.506515202138324, + "grad_norm": 0.1382678896188736, + "learning_rate": 8.446375095165548e-07, + "loss": 0.0592, + "num_input_tokens_seen": 202183424, + "step": 166170 + }, + { + "epoch": 18.50707205702194, + "grad_norm": 1.6197295188903809, + "learning_rate": 8.44011384711238e-07, + "loss": 0.0399, + "num_input_tokens_seen": 202189600, + "step": 166175 + }, + { + "epoch": 18.507628911905556, + "grad_norm": 1.8995698690414429, + "learning_rate": 8.433854880785936e-07, + "loss": 0.0702, + "num_input_tokens_seen": 202195936, + "step": 166180 + }, + { + "epoch": 18.508185766789175, + "grad_norm": 0.0001535547780804336, + "learning_rate": 8.427598196245251e-07, + "loss": 0.0327, + "num_input_tokens_seen": 202202016, + "step": 166185 + }, + { + "epoch": 18.50874262167279, + "grad_norm": 1.2216744422912598, + "learning_rate": 8.421343793549446e-07, + "loss": 0.0246, + "num_input_tokens_seen": 202207488, + "step": 166190 + }, + { + "epoch": 18.50929947655641, + "grad_norm": 0.03504493460059166, + "learning_rate": 8.415091672757613e-07, + "loss": 0.1187, + "num_input_tokens_seen": 202213600, + "step": 166195 + }, + { + "epoch": 18.509856331440027, + "grad_norm": 1.749341607093811, + "learning_rate": 8.40884183392876e-07, + "loss": 0.1331, + "num_input_tokens_seen": 202219520, + "step": 166200 + }, + { + "epoch": 18.510413186323643, + "grad_norm": 0.6288855671882629, + "learning_rate": 8.402594277121978e-07, + "loss": 0.0138, + "num_input_tokens_seen": 202225376, + "step": 166205 + }, + { + "epoch": 18.510970041207262, + "grad_norm": 0.0006194799789227545, + "learning_rate": 8.396349002396247e-07, + "loss": 0.1631, + "num_input_tokens_seen": 202231488, + "step": 166210 + }, + { + "epoch": 18.511526896090878, + "grad_norm": 0.799598753452301, + "learning_rate": 8.390106009810578e-07, + "loss": 0.0076, + "num_input_tokens_seen": 202237632, + "step": 166215 + }, + { + "epoch": 18.512083750974497, + "grad_norm": 0.3924311399459839, + "learning_rate": 8.383865299423921e-07, + "loss": 0.0201, + "num_input_tokens_seen": 202243712, + "step": 166220 + }, + { + "epoch": 18.512640605858113, + "grad_norm": 0.0100981704890728, + "learning_rate": 8.37762687129523e-07, + "loss": 0.0012, + "num_input_tokens_seen": 202249728, + "step": 166225 + }, + { + "epoch": 18.51319746074173, + "grad_norm": 0.005524435546249151, + "learning_rate": 8.37139072548343e-07, + "loss": 0.1067, + "num_input_tokens_seen": 202255840, + "step": 166230 + }, + { + "epoch": 18.51375431562535, + "grad_norm": 0.3319413363933563, + "learning_rate": 8.365156862047502e-07, + "loss": 0.0619, + "num_input_tokens_seen": 202261984, + "step": 166235 + }, + { + "epoch": 18.514311170508964, + "grad_norm": 0.18098634481430054, + "learning_rate": 8.358925281046203e-07, + "loss": 0.0429, + "num_input_tokens_seen": 202268320, + "step": 166240 + }, + { + "epoch": 18.514868025392584, + "grad_norm": 0.7477153539657593, + "learning_rate": 8.35269598253846e-07, + "loss": 0.0168, + "num_input_tokens_seen": 202274496, + "step": 166245 + }, + { + "epoch": 18.5154248802762, + "grad_norm": 0.14318019151687622, + "learning_rate": 8.346468966583087e-07, + "loss": 0.0389, + "num_input_tokens_seen": 202280640, + "step": 166250 + }, + { + "epoch": 18.515981735159816, + "grad_norm": 0.0003083078481722623, + "learning_rate": 8.34024423323898e-07, + "loss": 0.0323, + "num_input_tokens_seen": 202287168, + "step": 166255 + }, + { + "epoch": 18.516538590043435, + "grad_norm": 1.2734973430633545, + "learning_rate": 8.334021782564843e-07, + "loss": 0.0942, + "num_input_tokens_seen": 202293120, + "step": 166260 + }, + { + "epoch": 18.51709544492705, + "grad_norm": 0.016984501853585243, + "learning_rate": 8.327801614619518e-07, + "loss": 0.0434, + "num_input_tokens_seen": 202298912, + "step": 166265 + }, + { + "epoch": 18.51765229981067, + "grad_norm": 0.021407214924693108, + "learning_rate": 8.321583729461679e-07, + "loss": 0.1045, + "num_input_tokens_seen": 202305088, + "step": 166270 + }, + { + "epoch": 18.518209154694286, + "grad_norm": 0.24298694729804993, + "learning_rate": 8.31536812715017e-07, + "loss": 0.0082, + "num_input_tokens_seen": 202311104, + "step": 166275 + }, + { + "epoch": 18.518766009577902, + "grad_norm": 0.006519087124615908, + "learning_rate": 8.309154807743608e-07, + "loss": 0.0051, + "num_input_tokens_seen": 202317312, + "step": 166280 + }, + { + "epoch": 18.51932286446152, + "grad_norm": 0.0015988517552614212, + "learning_rate": 8.302943771300753e-07, + "loss": 0.0023, + "num_input_tokens_seen": 202323008, + "step": 166285 + }, + { + "epoch": 18.519879719345138, + "grad_norm": 0.3330318331718445, + "learning_rate": 8.296735017880197e-07, + "loss": 0.0049, + "num_input_tokens_seen": 202329440, + "step": 166290 + }, + { + "epoch": 18.520436574228757, + "grad_norm": 0.5416011214256287, + "learning_rate": 8.290528547540643e-07, + "loss": 0.025, + "num_input_tokens_seen": 202335648, + "step": 166295 + }, + { + "epoch": 18.520993429112373, + "grad_norm": 0.027901073917746544, + "learning_rate": 8.284324360340684e-07, + "loss": 0.0027, + "num_input_tokens_seen": 202341888, + "step": 166300 + }, + { + "epoch": 18.521550283995992, + "grad_norm": 0.0004720434080809355, + "learning_rate": 8.278122456338993e-07, + "loss": 0.0996, + "num_input_tokens_seen": 202347520, + "step": 166305 + }, + { + "epoch": 18.52210713887961, + "grad_norm": 0.46438172459602356, + "learning_rate": 8.271922835594054e-07, + "loss": 0.0087, + "num_input_tokens_seen": 202353856, + "step": 166310 + }, + { + "epoch": 18.522663993763224, + "grad_norm": 0.00015166631783358753, + "learning_rate": 8.265725498164484e-07, + "loss": 0.064, + "num_input_tokens_seen": 202360128, + "step": 166315 + }, + { + "epoch": 18.523220848646844, + "grad_norm": 0.022298987954854965, + "learning_rate": 8.259530444108793e-07, + "loss": 0.0013, + "num_input_tokens_seen": 202366144, + "step": 166320 + }, + { + "epoch": 18.52377770353046, + "grad_norm": 0.09475316107273102, + "learning_rate": 8.253337673485545e-07, + "loss": 0.0053, + "num_input_tokens_seen": 202372288, + "step": 166325 + }, + { + "epoch": 18.524334558414076, + "grad_norm": 0.0023100802209228277, + "learning_rate": 8.247147186353193e-07, + "loss": 0.1034, + "num_input_tokens_seen": 202378272, + "step": 166330 + }, + { + "epoch": 18.524891413297695, + "grad_norm": 0.38088566064834595, + "learning_rate": 8.240958982770247e-07, + "loss": 0.0398, + "num_input_tokens_seen": 202384384, + "step": 166335 + }, + { + "epoch": 18.52544826818131, + "grad_norm": 0.007490532007068396, + "learning_rate": 8.234773062795104e-07, + "loss": 0.0822, + "num_input_tokens_seen": 202389728, + "step": 166340 + }, + { + "epoch": 18.52600512306493, + "grad_norm": 2.871122360229492, + "learning_rate": 8.228589426486244e-07, + "loss": 0.0405, + "num_input_tokens_seen": 202395680, + "step": 166345 + }, + { + "epoch": 18.526561977948546, + "grad_norm": 1.3580107688903809, + "learning_rate": 8.222408073902066e-07, + "loss": 0.1117, + "num_input_tokens_seen": 202401760, + "step": 166350 + }, + { + "epoch": 18.527118832832166, + "grad_norm": 1.6716018915176392, + "learning_rate": 8.216229005100967e-07, + "loss": 0.0248, + "num_input_tokens_seen": 202408032, + "step": 166355 + }, + { + "epoch": 18.52767568771578, + "grad_norm": 0.3035937547683716, + "learning_rate": 8.210052220141262e-07, + "loss": 0.0281, + "num_input_tokens_seen": 202414240, + "step": 166360 + }, + { + "epoch": 18.528232542599397, + "grad_norm": 0.011487841606140137, + "learning_rate": 8.203877719081349e-07, + "loss": 0.0466, + "num_input_tokens_seen": 202420864, + "step": 166365 + }, + { + "epoch": 18.528789397483017, + "grad_norm": 1.6689480543136597, + "learning_rate": 8.197705501979514e-07, + "loss": 0.0721, + "num_input_tokens_seen": 202426944, + "step": 166370 + }, + { + "epoch": 18.529346252366633, + "grad_norm": 0.9404417276382446, + "learning_rate": 8.191535568894127e-07, + "loss": 0.0272, + "num_input_tokens_seen": 202432928, + "step": 166375 + }, + { + "epoch": 18.529903107250252, + "grad_norm": 0.5380134582519531, + "learning_rate": 8.185367919883391e-07, + "loss": 0.0392, + "num_input_tokens_seen": 202439168, + "step": 166380 + }, + { + "epoch": 18.530459962133868, + "grad_norm": 0.01511361077427864, + "learning_rate": 8.179202555005622e-07, + "loss": 0.0671, + "num_input_tokens_seen": 202444928, + "step": 166385 + }, + { + "epoch": 18.531016817017484, + "grad_norm": 0.030009858310222626, + "learning_rate": 8.173039474318966e-07, + "loss": 0.0153, + "num_input_tokens_seen": 202451360, + "step": 166390 + }, + { + "epoch": 18.531573671901103, + "grad_norm": 0.0010257362155243754, + "learning_rate": 8.166878677881767e-07, + "loss": 0.088, + "num_input_tokens_seen": 202456640, + "step": 166395 + }, + { + "epoch": 18.53213052678472, + "grad_norm": 1.4566650390625, + "learning_rate": 8.160720165752117e-07, + "loss": 0.0443, + "num_input_tokens_seen": 202462048, + "step": 166400 + }, + { + "epoch": 18.53268738166834, + "grad_norm": 6.787635356886312e-05, + "learning_rate": 8.154563937988247e-07, + "loss": 0.0246, + "num_input_tokens_seen": 202468320, + "step": 166405 + }, + { + "epoch": 18.533244236551955, + "grad_norm": 0.12214475125074387, + "learning_rate": 8.148409994648249e-07, + "loss": 0.0046, + "num_input_tokens_seen": 202474656, + "step": 166410 + }, + { + "epoch": 18.53380109143557, + "grad_norm": 4.2251434326171875, + "learning_rate": 8.142258335790298e-07, + "loss": 0.0409, + "num_input_tokens_seen": 202481056, + "step": 166415 + }, + { + "epoch": 18.53435794631919, + "grad_norm": 0.6667953729629517, + "learning_rate": 8.136108961472488e-07, + "loss": 0.0379, + "num_input_tokens_seen": 202486912, + "step": 166420 + }, + { + "epoch": 18.534914801202806, + "grad_norm": 0.08880376070737839, + "learning_rate": 8.129961871752939e-07, + "loss": 0.0041, + "num_input_tokens_seen": 202493184, + "step": 166425 + }, + { + "epoch": 18.535471656086425, + "grad_norm": 0.00028260520775802433, + "learning_rate": 8.123817066689659e-07, + "loss": 0.0119, + "num_input_tokens_seen": 202499232, + "step": 166430 + }, + { + "epoch": 18.53602851097004, + "grad_norm": 2.428041458129883, + "learning_rate": 8.117674546340714e-07, + "loss": 0.1213, + "num_input_tokens_seen": 202505504, + "step": 166435 + }, + { + "epoch": 18.536585365853657, + "grad_norm": 0.108942411839962, + "learning_rate": 8.111534310764113e-07, + "loss": 0.0633, + "num_input_tokens_seen": 202511680, + "step": 166440 + }, + { + "epoch": 18.537142220737277, + "grad_norm": 0.1160808652639389, + "learning_rate": 8.105396360017892e-07, + "loss": 0.1582, + "num_input_tokens_seen": 202517504, + "step": 166445 + }, + { + "epoch": 18.537699075620893, + "grad_norm": 0.045448437333106995, + "learning_rate": 8.099260694160004e-07, + "loss": 0.0265, + "num_input_tokens_seen": 202523648, + "step": 166450 + }, + { + "epoch": 18.538255930504512, + "grad_norm": 0.0001389301905874163, + "learning_rate": 8.093127313248406e-07, + "loss": 0.0097, + "num_input_tokens_seen": 202530048, + "step": 166455 + }, + { + "epoch": 18.538812785388128, + "grad_norm": 0.04736221954226494, + "learning_rate": 8.086996217341019e-07, + "loss": 0.0048, + "num_input_tokens_seen": 202535968, + "step": 166460 + }, + { + "epoch": 18.539369640271744, + "grad_norm": 0.012687706388533115, + "learning_rate": 8.080867406495773e-07, + "loss": 0.0101, + "num_input_tokens_seen": 202542112, + "step": 166465 + }, + { + "epoch": 18.539926495155363, + "grad_norm": 0.3202953040599823, + "learning_rate": 8.074740880770565e-07, + "loss": 0.0315, + "num_input_tokens_seen": 202548256, + "step": 166470 + }, + { + "epoch": 18.54048335003898, + "grad_norm": 0.03962638974189758, + "learning_rate": 8.068616640223264e-07, + "loss": 0.044, + "num_input_tokens_seen": 202554592, + "step": 166475 + }, + { + "epoch": 18.5410402049226, + "grad_norm": 1.0473300218582153, + "learning_rate": 8.062494684911687e-07, + "loss": 0.0607, + "num_input_tokens_seen": 202560704, + "step": 166480 + }, + { + "epoch": 18.541597059806215, + "grad_norm": 0.06199674308300018, + "learning_rate": 8.056375014893703e-07, + "loss": 0.0112, + "num_input_tokens_seen": 202566944, + "step": 166485 + }, + { + "epoch": 18.54215391468983, + "grad_norm": 0.269857794046402, + "learning_rate": 8.05025763022707e-07, + "loss": 0.2354, + "num_input_tokens_seen": 202573056, + "step": 166490 + }, + { + "epoch": 18.54271076957345, + "grad_norm": 0.1995534598827362, + "learning_rate": 8.044142530969661e-07, + "loss": 0.0023, + "num_input_tokens_seen": 202579360, + "step": 166495 + }, + { + "epoch": 18.543267624457066, + "grad_norm": 2.007336139678955, + "learning_rate": 8.038029717179124e-07, + "loss": 0.1333, + "num_input_tokens_seen": 202585504, + "step": 166500 + }, + { + "epoch": 18.543824479340685, + "grad_norm": 0.01575261726975441, + "learning_rate": 8.031919188913273e-07, + "loss": 0.0067, + "num_input_tokens_seen": 202591776, + "step": 166505 + }, + { + "epoch": 18.5443813342243, + "grad_norm": 0.004456921014934778, + "learning_rate": 8.025810946229784e-07, + "loss": 0.0262, + "num_input_tokens_seen": 202597632, + "step": 166510 + }, + { + "epoch": 18.544938189107917, + "grad_norm": 0.16229486465454102, + "learning_rate": 8.019704989186416e-07, + "loss": 0.024, + "num_input_tokens_seen": 202603712, + "step": 166515 + }, + { + "epoch": 18.545495043991536, + "grad_norm": 0.014344567432999611, + "learning_rate": 8.013601317840791e-07, + "loss": 0.0783, + "num_input_tokens_seen": 202609728, + "step": 166520 + }, + { + "epoch": 18.546051898875152, + "grad_norm": 0.09155365824699402, + "learning_rate": 8.007499932250583e-07, + "loss": 0.025, + "num_input_tokens_seen": 202615680, + "step": 166525 + }, + { + "epoch": 18.546608753758772, + "grad_norm": 0.612923264503479, + "learning_rate": 8.001400832473388e-07, + "loss": 0.0223, + "num_input_tokens_seen": 202621760, + "step": 166530 + }, + { + "epoch": 18.547165608642388, + "grad_norm": 0.0784548744559288, + "learning_rate": 7.995304018566879e-07, + "loss": 0.0051, + "num_input_tokens_seen": 202628160, + "step": 166535 + }, + { + "epoch": 18.547722463526004, + "grad_norm": 0.005533725488930941, + "learning_rate": 7.989209490588595e-07, + "loss": 0.1549, + "num_input_tokens_seen": 202634240, + "step": 166540 + }, + { + "epoch": 18.548279318409623, + "grad_norm": 0.3518883287906647, + "learning_rate": 7.983117248596156e-07, + "loss": 0.0073, + "num_input_tokens_seen": 202640704, + "step": 166545 + }, + { + "epoch": 18.54883617329324, + "grad_norm": 9.907694038702175e-05, + "learning_rate": 7.977027292647016e-07, + "loss": 0.0433, + "num_input_tokens_seen": 202647232, + "step": 166550 + }, + { + "epoch": 18.54939302817686, + "grad_norm": 0.0018191972048953176, + "learning_rate": 7.970939622798823e-07, + "loss": 0.0328, + "num_input_tokens_seen": 202653536, + "step": 166555 + }, + { + "epoch": 18.549949883060474, + "grad_norm": 0.024706775322556496, + "learning_rate": 7.964854239108949e-07, + "loss": 0.109, + "num_input_tokens_seen": 202659168, + "step": 166560 + }, + { + "epoch": 18.55050673794409, + "grad_norm": 1.6265813112258911, + "learning_rate": 7.958771141635013e-07, + "loss": 0.0329, + "num_input_tokens_seen": 202664704, + "step": 166565 + }, + { + "epoch": 18.55106359282771, + "grad_norm": 0.08935891836881638, + "learning_rate": 7.952690330434359e-07, + "loss": 0.0697, + "num_input_tokens_seen": 202670816, + "step": 166570 + }, + { + "epoch": 18.551620447711326, + "grad_norm": 0.9437646269798279, + "learning_rate": 7.946611805564497e-07, + "loss": 0.0882, + "num_input_tokens_seen": 202676896, + "step": 166575 + }, + { + "epoch": 18.552177302594945, + "grad_norm": 0.0011422340758144855, + "learning_rate": 7.940535567082797e-07, + "loss": 0.0052, + "num_input_tokens_seen": 202682752, + "step": 166580 + }, + { + "epoch": 18.55273415747856, + "grad_norm": 0.041762929409742355, + "learning_rate": 7.934461615046684e-07, + "loss": 0.0108, + "num_input_tokens_seen": 202688800, + "step": 166585 + }, + { + "epoch": 18.553291012362177, + "grad_norm": 0.05228198319673538, + "learning_rate": 7.928389949513504e-07, + "loss": 0.0571, + "num_input_tokens_seen": 202695168, + "step": 166590 + }, + { + "epoch": 18.553847867245796, + "grad_norm": 0.34025976061820984, + "learning_rate": 7.922320570540653e-07, + "loss": 0.0148, + "num_input_tokens_seen": 202701280, + "step": 166595 + }, + { + "epoch": 18.554404722129412, + "grad_norm": 0.12357760965824127, + "learning_rate": 7.91625347818542e-07, + "loss": 0.0127, + "num_input_tokens_seen": 202707552, + "step": 166600 + }, + { + "epoch": 18.55496157701303, + "grad_norm": 0.0023435105103999376, + "learning_rate": 7.91018867250512e-07, + "loss": 0.002, + "num_input_tokens_seen": 202713792, + "step": 166605 + }, + { + "epoch": 18.555518431896648, + "grad_norm": 0.014142679050564766, + "learning_rate": 7.90412615355704e-07, + "loss": 0.003, + "num_input_tokens_seen": 202720064, + "step": 166610 + }, + { + "epoch": 18.556075286780263, + "grad_norm": 0.01376661378890276, + "learning_rate": 7.898065921398495e-07, + "loss": 0.0885, + "num_input_tokens_seen": 202726304, + "step": 166615 + }, + { + "epoch": 18.556632141663883, + "grad_norm": 0.051247138530015945, + "learning_rate": 7.892007976086663e-07, + "loss": 0.0419, + "num_input_tokens_seen": 202732608, + "step": 166620 + }, + { + "epoch": 18.5571889965475, + "grad_norm": 0.0003377799002919346, + "learning_rate": 7.885952317678747e-07, + "loss": 0.0882, + "num_input_tokens_seen": 202738688, + "step": 166625 + }, + { + "epoch": 18.557745851431118, + "grad_norm": 0.0009115203865803778, + "learning_rate": 7.879898946232034e-07, + "loss": 0.0362, + "num_input_tokens_seen": 202744704, + "step": 166630 + }, + { + "epoch": 18.558302706314734, + "grad_norm": 0.0039931172505021095, + "learning_rate": 7.873847861803646e-07, + "loss": 0.016, + "num_input_tokens_seen": 202750784, + "step": 166635 + }, + { + "epoch": 18.55885956119835, + "grad_norm": 0.0011438264045864344, + "learning_rate": 7.867799064450787e-07, + "loss": 0.0219, + "num_input_tokens_seen": 202756928, + "step": 166640 + }, + { + "epoch": 18.55941641608197, + "grad_norm": 0.34283533692359924, + "learning_rate": 7.861752554230494e-07, + "loss": 0.0341, + "num_input_tokens_seen": 202762656, + "step": 166645 + }, + { + "epoch": 18.559973270965585, + "grad_norm": 0.25686997175216675, + "learning_rate": 7.855708331200001e-07, + "loss": 0.0708, + "num_input_tokens_seen": 202768896, + "step": 166650 + }, + { + "epoch": 18.560530125849205, + "grad_norm": 3.1206769943237305, + "learning_rate": 7.849666395416289e-07, + "loss": 0.1273, + "num_input_tokens_seen": 202775072, + "step": 166655 + }, + { + "epoch": 18.56108698073282, + "grad_norm": 0.20235997438430786, + "learning_rate": 7.843626746936534e-07, + "loss": 0.0039, + "num_input_tokens_seen": 202781536, + "step": 166660 + }, + { + "epoch": 18.561643835616437, + "grad_norm": 0.007958456873893738, + "learning_rate": 7.837589385817746e-07, + "loss": 0.0098, + "num_input_tokens_seen": 202787872, + "step": 166665 + }, + { + "epoch": 18.562200690500056, + "grad_norm": 1.4723576307296753, + "learning_rate": 7.831554312116934e-07, + "loss": 0.0278, + "num_input_tokens_seen": 202793792, + "step": 166670 + }, + { + "epoch": 18.562757545383672, + "grad_norm": 0.02439807541668415, + "learning_rate": 7.825521525891083e-07, + "loss": 0.0465, + "num_input_tokens_seen": 202799968, + "step": 166675 + }, + { + "epoch": 18.56331440026729, + "grad_norm": 0.048894573003053665, + "learning_rate": 7.819491027197228e-07, + "loss": 0.0733, + "num_input_tokens_seen": 202806144, + "step": 166680 + }, + { + "epoch": 18.563871255150907, + "grad_norm": 0.0030230414122343063, + "learning_rate": 7.813462816092326e-07, + "loss": 0.0976, + "num_input_tokens_seen": 202811840, + "step": 166685 + }, + { + "epoch": 18.564428110034527, + "grad_norm": 0.0007128646248020232, + "learning_rate": 7.807436892633274e-07, + "loss": 0.0008, + "num_input_tokens_seen": 202818080, + "step": 166690 + }, + { + "epoch": 18.564984964918143, + "grad_norm": 0.03248872980475426, + "learning_rate": 7.801413256877027e-07, + "loss": 0.0869, + "num_input_tokens_seen": 202824192, + "step": 166695 + }, + { + "epoch": 18.56554181980176, + "grad_norm": 0.028527239337563515, + "learning_rate": 7.795391908880511e-07, + "loss": 0.0048, + "num_input_tokens_seen": 202830592, + "step": 166700 + }, + { + "epoch": 18.566098674685378, + "grad_norm": 0.004328243434429169, + "learning_rate": 7.789372848700516e-07, + "loss": 0.0733, + "num_input_tokens_seen": 202836608, + "step": 166705 + }, + { + "epoch": 18.566655529568994, + "grad_norm": 0.21259592473506927, + "learning_rate": 7.783356076393994e-07, + "loss": 0.1232, + "num_input_tokens_seen": 202842592, + "step": 166710 + }, + { + "epoch": 18.567212384452613, + "grad_norm": 0.00016173157200682908, + "learning_rate": 7.777341592017734e-07, + "loss": 0.0694, + "num_input_tokens_seen": 202848832, + "step": 166715 + }, + { + "epoch": 18.56776923933623, + "grad_norm": 1.0560349225997925, + "learning_rate": 7.771329395628524e-07, + "loss": 0.0397, + "num_input_tokens_seen": 202854976, + "step": 166720 + }, + { + "epoch": 18.568326094219845, + "grad_norm": 0.6555972099304199, + "learning_rate": 7.76531948728318e-07, + "loss": 0.0171, + "num_input_tokens_seen": 202861088, + "step": 166725 + }, + { + "epoch": 18.568882949103465, + "grad_norm": 0.030309872701764107, + "learning_rate": 7.759311867038488e-07, + "loss": 0.0329, + "num_input_tokens_seen": 202867200, + "step": 166730 + }, + { + "epoch": 18.56943980398708, + "grad_norm": 6.456954479217529, + "learning_rate": 7.753306534951182e-07, + "loss": 0.1721, + "num_input_tokens_seen": 202873408, + "step": 166735 + }, + { + "epoch": 18.5699966588707, + "grad_norm": 3.4387080669403076, + "learning_rate": 7.747303491077967e-07, + "loss": 0.1187, + "num_input_tokens_seen": 202878560, + "step": 166740 + }, + { + "epoch": 18.570553513754316, + "grad_norm": 0.280587762594223, + "learning_rate": 7.741302735475548e-07, + "loss": 0.013, + "num_input_tokens_seen": 202884672, + "step": 166745 + }, + { + "epoch": 18.57111036863793, + "grad_norm": 0.5991088151931763, + "learning_rate": 7.735304268200627e-07, + "loss": 0.0077, + "num_input_tokens_seen": 202890400, + "step": 166750 + }, + { + "epoch": 18.57166722352155, + "grad_norm": 1.9963167905807495, + "learning_rate": 7.729308089309856e-07, + "loss": 0.0546, + "num_input_tokens_seen": 202896512, + "step": 166755 + }, + { + "epoch": 18.572224078405167, + "grad_norm": 0.011680022813379765, + "learning_rate": 7.723314198859883e-07, + "loss": 0.0038, + "num_input_tokens_seen": 202902208, + "step": 166760 + }, + { + "epoch": 18.572780933288787, + "grad_norm": 1.7717602252960205, + "learning_rate": 7.7173225969073e-07, + "loss": 0.0997, + "num_input_tokens_seen": 202908512, + "step": 166765 + }, + { + "epoch": 18.573337788172402, + "grad_norm": 0.8559625148773193, + "learning_rate": 7.711333283508731e-07, + "loss": 0.1126, + "num_input_tokens_seen": 202914560, + "step": 166770 + }, + { + "epoch": 18.57389464305602, + "grad_norm": 0.0025423578917980194, + "learning_rate": 7.705346258720713e-07, + "loss": 0.0995, + "num_input_tokens_seen": 202919840, + "step": 166775 + }, + { + "epoch": 18.574451497939638, + "grad_norm": 0.1596532016992569, + "learning_rate": 7.699361522599868e-07, + "loss": 0.0327, + "num_input_tokens_seen": 202925984, + "step": 166780 + }, + { + "epoch": 18.575008352823254, + "grad_norm": 0.0028643484693020582, + "learning_rate": 7.693379075202651e-07, + "loss": 0.0579, + "num_input_tokens_seen": 202932320, + "step": 166785 + }, + { + "epoch": 18.575565207706873, + "grad_norm": 0.5084545612335205, + "learning_rate": 7.687398916585625e-07, + "loss": 0.04, + "num_input_tokens_seen": 202938528, + "step": 166790 + }, + { + "epoch": 18.57612206259049, + "grad_norm": 0.0022792210802435875, + "learning_rate": 7.681421046805221e-07, + "loss": 0.0046, + "num_input_tokens_seen": 202944672, + "step": 166795 + }, + { + "epoch": 18.576678917474105, + "grad_norm": 0.05451173707842827, + "learning_rate": 7.675445465917974e-07, + "loss": 0.0136, + "num_input_tokens_seen": 202950688, + "step": 166800 + }, + { + "epoch": 18.577235772357724, + "grad_norm": 0.10765254497528076, + "learning_rate": 7.669472173980257e-07, + "loss": 0.0655, + "num_input_tokens_seen": 202956736, + "step": 166805 + }, + { + "epoch": 18.57779262724134, + "grad_norm": 0.08600029349327087, + "learning_rate": 7.663501171048554e-07, + "loss": 0.1032, + "num_input_tokens_seen": 202962976, + "step": 166810 + }, + { + "epoch": 18.57834948212496, + "grad_norm": 0.184197336435318, + "learning_rate": 7.657532457179206e-07, + "loss": 0.2313, + "num_input_tokens_seen": 202968992, + "step": 166815 + }, + { + "epoch": 18.578906337008576, + "grad_norm": 0.00017127043975051492, + "learning_rate": 7.65156603242867e-07, + "loss": 0.0492, + "num_input_tokens_seen": 202974880, + "step": 166820 + }, + { + "epoch": 18.57946319189219, + "grad_norm": 0.00013187974400352687, + "learning_rate": 7.645601896853205e-07, + "loss": 0.0018, + "num_input_tokens_seen": 202980960, + "step": 166825 + }, + { + "epoch": 18.58002004677581, + "grad_norm": 0.0073034814558923244, + "learning_rate": 7.639640050509267e-07, + "loss": 0.0009, + "num_input_tokens_seen": 202987104, + "step": 166830 + }, + { + "epoch": 18.580576901659427, + "grad_norm": 0.9598471522331238, + "learning_rate": 7.63368049345306e-07, + "loss": 0.0424, + "num_input_tokens_seen": 202993472, + "step": 166835 + }, + { + "epoch": 18.581133756543046, + "grad_norm": 0.08289288729429245, + "learning_rate": 7.627723225740929e-07, + "loss": 0.0033, + "num_input_tokens_seen": 202999744, + "step": 166840 + }, + { + "epoch": 18.581690611426662, + "grad_norm": 0.0019126241095364094, + "learning_rate": 7.621768247429134e-07, + "loss": 0.0204, + "num_input_tokens_seen": 203006112, + "step": 166845 + }, + { + "epoch": 18.582247466310278, + "grad_norm": 0.3706720173358917, + "learning_rate": 7.615815558573936e-07, + "loss": 0.0096, + "num_input_tokens_seen": 203012192, + "step": 166850 + }, + { + "epoch": 18.582804321193898, + "grad_norm": 0.22750581800937653, + "learning_rate": 7.609865159231566e-07, + "loss": 0.0046, + "num_input_tokens_seen": 203018464, + "step": 166855 + }, + { + "epoch": 18.583361176077513, + "grad_norm": 0.4538353383541107, + "learning_rate": 7.603917049458203e-07, + "loss": 0.0137, + "num_input_tokens_seen": 203024640, + "step": 166860 + }, + { + "epoch": 18.583918030961133, + "grad_norm": 1.1208654642105103, + "learning_rate": 7.597971229310025e-07, + "loss": 0.096, + "num_input_tokens_seen": 203031168, + "step": 166865 + }, + { + "epoch": 18.58447488584475, + "grad_norm": 0.05050608515739441, + "learning_rate": 7.592027698843263e-07, + "loss": 0.0071, + "num_input_tokens_seen": 203037056, + "step": 166870 + }, + { + "epoch": 18.585031740728365, + "grad_norm": 0.003929475788027048, + "learning_rate": 7.586086458114011e-07, + "loss": 0.0222, + "num_input_tokens_seen": 203042752, + "step": 166875 + }, + { + "epoch": 18.585588595611984, + "grad_norm": 0.02595321647822857, + "learning_rate": 7.580147507178364e-07, + "loss": 0.0158, + "num_input_tokens_seen": 203048896, + "step": 166880 + }, + { + "epoch": 18.5861454504956, + "grad_norm": 0.08162946254014969, + "learning_rate": 7.574210846092444e-07, + "loss": 0.0855, + "num_input_tokens_seen": 203054720, + "step": 166885 + }, + { + "epoch": 18.58670230537922, + "grad_norm": 0.5264891982078552, + "learning_rate": 7.568276474912372e-07, + "loss": 0.0126, + "num_input_tokens_seen": 203060832, + "step": 166890 + }, + { + "epoch": 18.587259160262835, + "grad_norm": 0.3996035158634186, + "learning_rate": 7.562344393694104e-07, + "loss": 0.021, + "num_input_tokens_seen": 203067040, + "step": 166895 + }, + { + "epoch": 18.58781601514645, + "grad_norm": 0.9999592304229736, + "learning_rate": 7.556414602493788e-07, + "loss": 0.0114, + "num_input_tokens_seen": 203073440, + "step": 166900 + }, + { + "epoch": 18.58837287003007, + "grad_norm": 0.38361483812332153, + "learning_rate": 7.550487101367354e-07, + "loss": 0.0078, + "num_input_tokens_seen": 203079552, + "step": 166905 + }, + { + "epoch": 18.588929724913687, + "grad_norm": 0.0027733948081731796, + "learning_rate": 7.54456189037081e-07, + "loss": 0.0276, + "num_input_tokens_seen": 203084896, + "step": 166910 + }, + { + "epoch": 18.589486579797306, + "grad_norm": 1.1236096620559692, + "learning_rate": 7.538638969560114e-07, + "loss": 0.048, + "num_input_tokens_seen": 203090880, + "step": 166915 + }, + { + "epoch": 18.590043434680922, + "grad_norm": 9.61031109909527e-05, + "learning_rate": 7.532718338991273e-07, + "loss": 0.062, + "num_input_tokens_seen": 203096768, + "step": 166920 + }, + { + "epoch": 18.590600289564538, + "grad_norm": 0.002159812953323126, + "learning_rate": 7.526799998720135e-07, + "loss": 0.0757, + "num_input_tokens_seen": 203103168, + "step": 166925 + }, + { + "epoch": 18.591157144448157, + "grad_norm": 0.038124073296785355, + "learning_rate": 7.520883948802682e-07, + "loss": 0.0994, + "num_input_tokens_seen": 203108928, + "step": 166930 + }, + { + "epoch": 18.591713999331773, + "grad_norm": 0.0841456800699234, + "learning_rate": 7.514970189294701e-07, + "loss": 0.0619, + "num_input_tokens_seen": 203115264, + "step": 166935 + }, + { + "epoch": 18.592270854215393, + "grad_norm": 0.0004112065944354981, + "learning_rate": 7.509058720252121e-07, + "loss": 0.0078, + "num_input_tokens_seen": 203121376, + "step": 166940 + }, + { + "epoch": 18.59282770909901, + "grad_norm": 4.621971130371094, + "learning_rate": 7.503149541730758e-07, + "loss": 0.116, + "num_input_tokens_seen": 203127584, + "step": 166945 + }, + { + "epoch": 18.593384563982625, + "grad_norm": 0.8135185241699219, + "learning_rate": 7.497242653786457e-07, + "loss": 0.0433, + "num_input_tokens_seen": 203133504, + "step": 166950 + }, + { + "epoch": 18.593941418866244, + "grad_norm": 0.012283800169825554, + "learning_rate": 7.49133805647495e-07, + "loss": 0.0494, + "num_input_tokens_seen": 203139328, + "step": 166955 + }, + { + "epoch": 18.59449827374986, + "grad_norm": 0.7888373136520386, + "learning_rate": 7.485435749852083e-07, + "loss": 0.0179, + "num_input_tokens_seen": 203144928, + "step": 166960 + }, + { + "epoch": 18.59505512863348, + "grad_norm": 0.1981586366891861, + "learning_rate": 7.47953573397353e-07, + "loss": 0.0322, + "num_input_tokens_seen": 203150656, + "step": 166965 + }, + { + "epoch": 18.595611983517095, + "grad_norm": 0.5629431009292603, + "learning_rate": 7.473638008895112e-07, + "loss": 0.0166, + "num_input_tokens_seen": 203156608, + "step": 166970 + }, + { + "epoch": 18.59616883840071, + "grad_norm": 0.08536433428525925, + "learning_rate": 7.467742574672476e-07, + "loss": 0.012, + "num_input_tokens_seen": 203162560, + "step": 166975 + }, + { + "epoch": 18.59672569328433, + "grad_norm": 0.31514260172843933, + "learning_rate": 7.461849431361329e-07, + "loss": 0.0407, + "num_input_tokens_seen": 203168384, + "step": 166980 + }, + { + "epoch": 18.597282548167946, + "grad_norm": 0.23753845691680908, + "learning_rate": 7.455958579017319e-07, + "loss": 0.0158, + "num_input_tokens_seen": 203174240, + "step": 166985 + }, + { + "epoch": 18.597839403051566, + "grad_norm": 0.0013324549654498696, + "learning_rate": 7.450070017696098e-07, + "loss": 0.0442, + "num_input_tokens_seen": 203180480, + "step": 166990 + }, + { + "epoch": 18.598396257935182, + "grad_norm": 0.0009567155502736568, + "learning_rate": 7.444183747453342e-07, + "loss": 0.0116, + "num_input_tokens_seen": 203186336, + "step": 166995 + }, + { + "epoch": 18.598953112818798, + "grad_norm": 0.046456463634967804, + "learning_rate": 7.438299768344564e-07, + "loss": 0.0618, + "num_input_tokens_seen": 203192128, + "step": 167000 + }, + { + "epoch": 18.599509967702417, + "grad_norm": 0.05748237296938896, + "learning_rate": 7.432418080425385e-07, + "loss": 0.0712, + "num_input_tokens_seen": 203198464, + "step": 167005 + }, + { + "epoch": 18.600066822586033, + "grad_norm": 0.0024821069091558456, + "learning_rate": 7.426538683751344e-07, + "loss": 0.0067, + "num_input_tokens_seen": 203204480, + "step": 167010 + }, + { + "epoch": 18.600623677469653, + "grad_norm": 0.12129238247871399, + "learning_rate": 7.420661578378036e-07, + "loss": 0.0957, + "num_input_tokens_seen": 203210688, + "step": 167015 + }, + { + "epoch": 18.60118053235327, + "grad_norm": 0.00028564853710122406, + "learning_rate": 7.414786764360887e-07, + "loss": 0.0151, + "num_input_tokens_seen": 203217024, + "step": 167020 + }, + { + "epoch": 18.601737387236888, + "grad_norm": 0.0009646362741477787, + "learning_rate": 7.408914241755466e-07, + "loss": 0.0131, + "num_input_tokens_seen": 203222944, + "step": 167025 + }, + { + "epoch": 18.602294242120504, + "grad_norm": 0.2634969651699066, + "learning_rate": 7.403044010617172e-07, + "loss": 0.0055, + "num_input_tokens_seen": 203228896, + "step": 167030 + }, + { + "epoch": 18.60285109700412, + "grad_norm": 1.2438530921936035, + "learning_rate": 7.397176071001544e-07, + "loss": 0.0603, + "num_input_tokens_seen": 203235232, + "step": 167035 + }, + { + "epoch": 18.60340795188774, + "grad_norm": 0.07477377355098724, + "learning_rate": 7.391310422963898e-07, + "loss": 0.0063, + "num_input_tokens_seen": 203241312, + "step": 167040 + }, + { + "epoch": 18.603964806771355, + "grad_norm": 0.0017702507320791483, + "learning_rate": 7.385447066559775e-07, + "loss": 0.0002, + "num_input_tokens_seen": 203247680, + "step": 167045 + }, + { + "epoch": 18.60452166165497, + "grad_norm": 0.12497135996818542, + "learning_rate": 7.379586001844407e-07, + "loss": 0.0136, + "num_input_tokens_seen": 203253600, + "step": 167050 + }, + { + "epoch": 18.60507851653859, + "grad_norm": 1.0054693222045898, + "learning_rate": 7.373727228873279e-07, + "loss": 0.1545, + "num_input_tokens_seen": 203259392, + "step": 167055 + }, + { + "epoch": 18.605635371422206, + "grad_norm": 0.19635291397571564, + "learning_rate": 7.367870747701649e-07, + "loss": 0.0053, + "num_input_tokens_seen": 203265504, + "step": 167060 + }, + { + "epoch": 18.606192226305826, + "grad_norm": 1.9835659265518188, + "learning_rate": 7.362016558384921e-07, + "loss": 0.0658, + "num_input_tokens_seen": 203271552, + "step": 167065 + }, + { + "epoch": 18.60674908118944, + "grad_norm": 0.08159036189317703, + "learning_rate": 7.356164660978326e-07, + "loss": 0.0299, + "num_input_tokens_seen": 203277696, + "step": 167070 + }, + { + "epoch": 18.60730593607306, + "grad_norm": 0.0001430109841749072, + "learning_rate": 7.350315055537155e-07, + "loss": 0.0417, + "num_input_tokens_seen": 203283680, + "step": 167075 + }, + { + "epoch": 18.607862790956677, + "grad_norm": 0.103945292532444, + "learning_rate": 7.34446774211664e-07, + "loss": 0.0861, + "num_input_tokens_seen": 203289568, + "step": 167080 + }, + { + "epoch": 18.608419645840293, + "grad_norm": 9.272810711991042e-05, + "learning_rate": 7.338622720772071e-07, + "loss": 0.0176, + "num_input_tokens_seen": 203295936, + "step": 167085 + }, + { + "epoch": 18.608976500723912, + "grad_norm": 0.035409118980169296, + "learning_rate": 7.332779991558652e-07, + "loss": 0.0282, + "num_input_tokens_seen": 203302048, + "step": 167090 + }, + { + "epoch": 18.609533355607528, + "grad_norm": 0.46742406487464905, + "learning_rate": 7.326939554531509e-07, + "loss": 0.0077, + "num_input_tokens_seen": 203308160, + "step": 167095 + }, + { + "epoch": 18.610090210491148, + "grad_norm": 2.7525475025177, + "learning_rate": 7.321101409745846e-07, + "loss": 0.1447, + "num_input_tokens_seen": 203314688, + "step": 167100 + }, + { + "epoch": 18.610647065374764, + "grad_norm": 0.16751332581043243, + "learning_rate": 7.315265557256839e-07, + "loss": 0.0665, + "num_input_tokens_seen": 203320800, + "step": 167105 + }, + { + "epoch": 18.61120392025838, + "grad_norm": 0.36294451355934143, + "learning_rate": 7.309431997119532e-07, + "loss": 0.0156, + "num_input_tokens_seen": 203326560, + "step": 167110 + }, + { + "epoch": 18.611760775142, + "grad_norm": 0.22328250110149384, + "learning_rate": 7.303600729389127e-07, + "loss": 0.1068, + "num_input_tokens_seen": 203332800, + "step": 167115 + }, + { + "epoch": 18.612317630025615, + "grad_norm": 9.251327719539404e-05, + "learning_rate": 7.297771754120664e-07, + "loss": 0.0336, + "num_input_tokens_seen": 203338816, + "step": 167120 + }, + { + "epoch": 18.612874484909234, + "grad_norm": 0.14729291200637817, + "learning_rate": 7.291945071369182e-07, + "loss": 0.0787, + "num_input_tokens_seen": 203344992, + "step": 167125 + }, + { + "epoch": 18.61343133979285, + "grad_norm": 0.001913408632390201, + "learning_rate": 7.286120681189723e-07, + "loss": 0.0141, + "num_input_tokens_seen": 203350880, + "step": 167130 + }, + { + "epoch": 18.613988194676466, + "grad_norm": 1.5646556615829468, + "learning_rate": 7.280298583637324e-07, + "loss": 0.0248, + "num_input_tokens_seen": 203357120, + "step": 167135 + }, + { + "epoch": 18.614545049560085, + "grad_norm": 0.02713116444647312, + "learning_rate": 7.274478778766968e-07, + "loss": 0.0825, + "num_input_tokens_seen": 203363040, + "step": 167140 + }, + { + "epoch": 18.6151019044437, + "grad_norm": 0.04459955915808678, + "learning_rate": 7.268661266633641e-07, + "loss": 0.0989, + "num_input_tokens_seen": 203368992, + "step": 167145 + }, + { + "epoch": 18.61565875932732, + "grad_norm": 1.3027852773666382, + "learning_rate": 7.262846047292243e-07, + "loss": 0.004, + "num_input_tokens_seen": 203374880, + "step": 167150 + }, + { + "epoch": 18.616215614210937, + "grad_norm": 1.784497618675232, + "learning_rate": 7.257033120797757e-07, + "loss": 0.1342, + "num_input_tokens_seen": 203381056, + "step": 167155 + }, + { + "epoch": 18.616772469094553, + "grad_norm": 0.09220721572637558, + "learning_rate": 7.251222487205083e-07, + "loss": 0.0073, + "num_input_tokens_seen": 203387296, + "step": 167160 + }, + { + "epoch": 18.617329323978172, + "grad_norm": 0.007207820657640696, + "learning_rate": 7.245414146569124e-07, + "loss": 0.0624, + "num_input_tokens_seen": 203393344, + "step": 167165 + }, + { + "epoch": 18.617886178861788, + "grad_norm": 0.32917770743370056, + "learning_rate": 7.239608098944694e-07, + "loss": 0.0749, + "num_input_tokens_seen": 203399712, + "step": 167170 + }, + { + "epoch": 18.618443033745407, + "grad_norm": 0.031769439578056335, + "learning_rate": 7.233804344386669e-07, + "loss": 0.0133, + "num_input_tokens_seen": 203406080, + "step": 167175 + }, + { + "epoch": 18.618999888629023, + "grad_norm": 0.0005392468883655965, + "learning_rate": 7.228002882949835e-07, + "loss": 0.0109, + "num_input_tokens_seen": 203412224, + "step": 167180 + }, + { + "epoch": 18.61955674351264, + "grad_norm": 0.4701911509037018, + "learning_rate": 7.222203714689041e-07, + "loss": 0.0149, + "num_input_tokens_seen": 203418176, + "step": 167185 + }, + { + "epoch": 18.62011359839626, + "grad_norm": 0.0001521606754977256, + "learning_rate": 7.216406839659073e-07, + "loss": 0.0109, + "num_input_tokens_seen": 203424416, + "step": 167190 + }, + { + "epoch": 18.620670453279875, + "grad_norm": 0.0020698141306638718, + "learning_rate": 7.21061225791464e-07, + "loss": 0.0818, + "num_input_tokens_seen": 203430336, + "step": 167195 + }, + { + "epoch": 18.621227308163494, + "grad_norm": 0.13459324836730957, + "learning_rate": 7.204819969510446e-07, + "loss": 0.18, + "num_input_tokens_seen": 203436288, + "step": 167200 + }, + { + "epoch": 18.62178416304711, + "grad_norm": 0.575178325176239, + "learning_rate": 7.199029974501309e-07, + "loss": 0.0204, + "num_input_tokens_seen": 203442528, + "step": 167205 + }, + { + "epoch": 18.622341017930726, + "grad_norm": 0.07794389873743057, + "learning_rate": 7.193242272941853e-07, + "loss": 0.0098, + "num_input_tokens_seen": 203448736, + "step": 167210 + }, + { + "epoch": 18.622897872814345, + "grad_norm": 0.00011152945808134973, + "learning_rate": 7.187456864886755e-07, + "loss": 0.0883, + "num_input_tokens_seen": 203455168, + "step": 167215 + }, + { + "epoch": 18.62345472769796, + "grad_norm": 0.056429702788591385, + "learning_rate": 7.181673750390639e-07, + "loss": 0.0401, + "num_input_tokens_seen": 203461184, + "step": 167220 + }, + { + "epoch": 18.62401158258158, + "grad_norm": 0.553823709487915, + "learning_rate": 7.175892929508182e-07, + "loss": 0.0165, + "num_input_tokens_seen": 203467456, + "step": 167225 + }, + { + "epoch": 18.624568437465197, + "grad_norm": 1.6534391641616821, + "learning_rate": 7.170114402293926e-07, + "loss": 0.1044, + "num_input_tokens_seen": 203473152, + "step": 167230 + }, + { + "epoch": 18.625125292348812, + "grad_norm": 0.017057735472917557, + "learning_rate": 7.164338168802576e-07, + "loss": 0.001, + "num_input_tokens_seen": 203479104, + "step": 167235 + }, + { + "epoch": 18.625682147232432, + "grad_norm": 0.46822771430015564, + "learning_rate": 7.158564229088532e-07, + "loss": 0.0163, + "num_input_tokens_seen": 203485312, + "step": 167240 + }, + { + "epoch": 18.626239002116048, + "grad_norm": 1.619694709777832, + "learning_rate": 7.152792583206447e-07, + "loss": 0.0581, + "num_input_tokens_seen": 203491552, + "step": 167245 + }, + { + "epoch": 18.626795856999667, + "grad_norm": 0.02436203882098198, + "learning_rate": 7.147023231210748e-07, + "loss": 0.0225, + "num_input_tokens_seen": 203497248, + "step": 167250 + }, + { + "epoch": 18.627352711883283, + "grad_norm": 0.6099691987037659, + "learning_rate": 7.141256173156058e-07, + "loss": 0.0688, + "num_input_tokens_seen": 203503360, + "step": 167255 + }, + { + "epoch": 18.6279095667669, + "grad_norm": 0.05937772989273071, + "learning_rate": 7.13549140909675e-07, + "loss": 0.001, + "num_input_tokens_seen": 203509408, + "step": 167260 + }, + { + "epoch": 18.62846642165052, + "grad_norm": 0.013416150584816933, + "learning_rate": 7.129728939087311e-07, + "loss": 0.0111, + "num_input_tokens_seen": 203515648, + "step": 167265 + }, + { + "epoch": 18.629023276534134, + "grad_norm": 0.17874430119991302, + "learning_rate": 7.123968763182137e-07, + "loss": 0.0316, + "num_input_tokens_seen": 203521664, + "step": 167270 + }, + { + "epoch": 18.629580131417754, + "grad_norm": 0.00015038302808534354, + "learning_rate": 7.118210881435689e-07, + "loss": 0.0079, + "num_input_tokens_seen": 203527648, + "step": 167275 + }, + { + "epoch": 18.63013698630137, + "grad_norm": 0.00032684768666513264, + "learning_rate": 7.112455293902337e-07, + "loss": 0.0032, + "num_input_tokens_seen": 203534176, + "step": 167280 + }, + { + "epoch": 18.630693841184986, + "grad_norm": 0.00016932355356402695, + "learning_rate": 7.106702000636456e-07, + "loss": 0.0014, + "num_input_tokens_seen": 203540576, + "step": 167285 + }, + { + "epoch": 18.631250696068605, + "grad_norm": 0.6718215346336365, + "learning_rate": 7.100951001692336e-07, + "loss": 0.0118, + "num_input_tokens_seen": 203546336, + "step": 167290 + }, + { + "epoch": 18.63180755095222, + "grad_norm": 0.003707967000082135, + "learning_rate": 7.095202297124376e-07, + "loss": 0.008, + "num_input_tokens_seen": 203552160, + "step": 167295 + }, + { + "epoch": 18.63236440583584, + "grad_norm": 0.022605154663324356, + "learning_rate": 7.089455886986813e-07, + "loss": 0.0074, + "num_input_tokens_seen": 203558240, + "step": 167300 + }, + { + "epoch": 18.632921260719456, + "grad_norm": 0.09278473258018494, + "learning_rate": 7.083711771333989e-07, + "loss": 0.1415, + "num_input_tokens_seen": 203564288, + "step": 167305 + }, + { + "epoch": 18.633478115603072, + "grad_norm": 0.05758326128125191, + "learning_rate": 7.077969950220115e-07, + "loss": 0.0033, + "num_input_tokens_seen": 203570560, + "step": 167310 + }, + { + "epoch": 18.63403497048669, + "grad_norm": 0.0001602582196937874, + "learning_rate": 7.072230423699422e-07, + "loss": 0.0601, + "num_input_tokens_seen": 203576768, + "step": 167315 + }, + { + "epoch": 18.634591825370308, + "grad_norm": 1.2264432907104492, + "learning_rate": 7.066493191826146e-07, + "loss": 0.1229, + "num_input_tokens_seen": 203583200, + "step": 167320 + }, + { + "epoch": 18.635148680253927, + "grad_norm": 0.9200353622436523, + "learning_rate": 7.060758254654492e-07, + "loss": 0.131, + "num_input_tokens_seen": 203589344, + "step": 167325 + }, + { + "epoch": 18.635705535137543, + "grad_norm": 0.046480942517519, + "learning_rate": 7.055025612238642e-07, + "loss": 0.0032, + "num_input_tokens_seen": 203595840, + "step": 167330 + }, + { + "epoch": 18.63626239002116, + "grad_norm": 0.11341878026723862, + "learning_rate": 7.049295264632689e-07, + "loss": 0.0728, + "num_input_tokens_seen": 203601408, + "step": 167335 + }, + { + "epoch": 18.63681924490478, + "grad_norm": 0.5909733772277832, + "learning_rate": 7.043567211890784e-07, + "loss": 0.0081, + "num_input_tokens_seen": 203607552, + "step": 167340 + }, + { + "epoch": 18.637376099788394, + "grad_norm": 0.0004716605180874467, + "learning_rate": 7.037841454067051e-07, + "loss": 0.0702, + "num_input_tokens_seen": 203613760, + "step": 167345 + }, + { + "epoch": 18.637932954672014, + "grad_norm": 0.8043716549873352, + "learning_rate": 7.032117991215587e-07, + "loss": 0.1203, + "num_input_tokens_seen": 203619584, + "step": 167350 + }, + { + "epoch": 18.63848980955563, + "grad_norm": 0.5270408391952515, + "learning_rate": 7.026396823390402e-07, + "loss": 0.011, + "num_input_tokens_seen": 203625792, + "step": 167355 + }, + { + "epoch": 18.63904666443925, + "grad_norm": 0.08293920010328293, + "learning_rate": 7.020677950645566e-07, + "loss": 0.0489, + "num_input_tokens_seen": 203631648, + "step": 167360 + }, + { + "epoch": 18.639603519322865, + "grad_norm": 0.0010621737455949187, + "learning_rate": 7.014961373035089e-07, + "loss": 0.0303, + "num_input_tokens_seen": 203637760, + "step": 167365 + }, + { + "epoch": 18.64016037420648, + "grad_norm": 0.0009686198900453746, + "learning_rate": 7.009247090612986e-07, + "loss": 0.0626, + "num_input_tokens_seen": 203643840, + "step": 167370 + }, + { + "epoch": 18.6407172290901, + "grad_norm": 0.040051985532045364, + "learning_rate": 7.00353510343324e-07, + "loss": 0.0051, + "num_input_tokens_seen": 203649952, + "step": 167375 + }, + { + "epoch": 18.641274083973716, + "grad_norm": 0.0008350508287549019, + "learning_rate": 6.99782541154978e-07, + "loss": 0.0035, + "num_input_tokens_seen": 203656256, + "step": 167380 + }, + { + "epoch": 18.641830938857332, + "grad_norm": 0.04504949226975441, + "learning_rate": 6.992118015016564e-07, + "loss": 0.0019, + "num_input_tokens_seen": 203662432, + "step": 167385 + }, + { + "epoch": 18.64238779374095, + "grad_norm": 5.765984058380127, + "learning_rate": 6.986412913887463e-07, + "loss": 0.0355, + "num_input_tokens_seen": 203668544, + "step": 167390 + }, + { + "epoch": 18.642944648624567, + "grad_norm": 0.17699487507343292, + "learning_rate": 6.980710108216409e-07, + "loss": 0.0058, + "num_input_tokens_seen": 203674336, + "step": 167395 + }, + { + "epoch": 18.643501503508187, + "grad_norm": 0.013434586115181446, + "learning_rate": 6.975009598057247e-07, + "loss": 0.0064, + "num_input_tokens_seen": 203680384, + "step": 167400 + }, + { + "epoch": 18.644058358391803, + "grad_norm": 0.18636803328990936, + "learning_rate": 6.96931138346385e-07, + "loss": 0.008, + "num_input_tokens_seen": 203686752, + "step": 167405 + }, + { + "epoch": 18.644615213275422, + "grad_norm": 0.0026249217335134745, + "learning_rate": 6.96361546448998e-07, + "loss": 0.0012, + "num_input_tokens_seen": 203692928, + "step": 167410 + }, + { + "epoch": 18.645172068159038, + "grad_norm": 0.3769357204437256, + "learning_rate": 6.957921841189485e-07, + "loss": 0.083, + "num_input_tokens_seen": 203698848, + "step": 167415 + }, + { + "epoch": 18.645728923042654, + "grad_norm": 1.4842852354049683, + "learning_rate": 6.952230513616182e-07, + "loss": 0.0901, + "num_input_tokens_seen": 203704768, + "step": 167420 + }, + { + "epoch": 18.646285777926273, + "grad_norm": 0.20057235658168793, + "learning_rate": 6.946541481823749e-07, + "loss": 0.0385, + "num_input_tokens_seen": 203710272, + "step": 167425 + }, + { + "epoch": 18.64684263280989, + "grad_norm": 0.010171339847147465, + "learning_rate": 6.940854745865977e-07, + "loss": 0.0175, + "num_input_tokens_seen": 203716576, + "step": 167430 + }, + { + "epoch": 18.64739948769351, + "grad_norm": 0.05829472467303276, + "learning_rate": 6.935170305796546e-07, + "loss": 0.1385, + "num_input_tokens_seen": 203722624, + "step": 167435 + }, + { + "epoch": 18.647956342577125, + "grad_norm": 1.500463604927063, + "learning_rate": 6.929488161669217e-07, + "loss": 0.0278, + "num_input_tokens_seen": 203728896, + "step": 167440 + }, + { + "epoch": 18.64851319746074, + "grad_norm": 0.03741512820124626, + "learning_rate": 6.923808313537561e-07, + "loss": 0.0183, + "num_input_tokens_seen": 203735008, + "step": 167445 + }, + { + "epoch": 18.64907005234436, + "grad_norm": 0.032474029809236526, + "learning_rate": 6.918130761455338e-07, + "loss": 0.0286, + "num_input_tokens_seen": 203741248, + "step": 167450 + }, + { + "epoch": 18.649626907227976, + "grad_norm": 0.43065083026885986, + "learning_rate": 6.91245550547609e-07, + "loss": 0.0074, + "num_input_tokens_seen": 203747488, + "step": 167455 + }, + { + "epoch": 18.650183762111595, + "grad_norm": 0.03185045346617699, + "learning_rate": 6.906782545653467e-07, + "loss": 0.0108, + "num_input_tokens_seen": 203753632, + "step": 167460 + }, + { + "epoch": 18.65074061699521, + "grad_norm": 0.004888698924332857, + "learning_rate": 6.901111882041039e-07, + "loss": 0.0557, + "num_input_tokens_seen": 203759968, + "step": 167465 + }, + { + "epoch": 18.651297471878827, + "grad_norm": 0.7233700752258301, + "learning_rate": 6.895443514692374e-07, + "loss": 0.0853, + "num_input_tokens_seen": 203766240, + "step": 167470 + }, + { + "epoch": 18.651854326762447, + "grad_norm": 0.0384666733443737, + "learning_rate": 6.889777443661039e-07, + "loss": 0.0013, + "num_input_tokens_seen": 203772448, + "step": 167475 + }, + { + "epoch": 18.652411181646062, + "grad_norm": 0.3580639362335205, + "learning_rate": 6.884113669000547e-07, + "loss": 0.1402, + "num_input_tokens_seen": 203778720, + "step": 167480 + }, + { + "epoch": 18.652968036529682, + "grad_norm": 0.011429390870034695, + "learning_rate": 6.878452190764329e-07, + "loss": 0.0173, + "num_input_tokens_seen": 203784864, + "step": 167485 + }, + { + "epoch": 18.653524891413298, + "grad_norm": 0.3661695420742035, + "learning_rate": 6.872793009005951e-07, + "loss": 0.1562, + "num_input_tokens_seen": 203791040, + "step": 167490 + }, + { + "epoch": 18.654081746296914, + "grad_norm": 4.7833353164605796e-05, + "learning_rate": 6.867136123778817e-07, + "loss": 0.0016, + "num_input_tokens_seen": 203797312, + "step": 167495 + }, + { + "epoch": 18.654638601180533, + "grad_norm": 3.1656534671783447, + "learning_rate": 6.861481535136411e-07, + "loss": 0.0758, + "num_input_tokens_seen": 203803648, + "step": 167500 + }, + { + "epoch": 18.65519545606415, + "grad_norm": 0.0005955674569122493, + "learning_rate": 6.85582924313205e-07, + "loss": 0.002, + "num_input_tokens_seen": 203809952, + "step": 167505 + }, + { + "epoch": 18.65575231094777, + "grad_norm": 0.007830696180462837, + "learning_rate": 6.850179247819249e-07, + "loss": 0.0146, + "num_input_tokens_seen": 203816320, + "step": 167510 + }, + { + "epoch": 18.656309165831384, + "grad_norm": 0.8942387104034424, + "learning_rate": 6.84453154925127e-07, + "loss": 0.0749, + "num_input_tokens_seen": 203822496, + "step": 167515 + }, + { + "epoch": 18.656866020715, + "grad_norm": 0.6733975410461426, + "learning_rate": 6.838886147481516e-07, + "loss": 0.0619, + "num_input_tokens_seen": 203828352, + "step": 167520 + }, + { + "epoch": 18.65742287559862, + "grad_norm": 0.09747898578643799, + "learning_rate": 6.833243042563303e-07, + "loss": 0.0124, + "num_input_tokens_seen": 203834272, + "step": 167525 + }, + { + "epoch": 18.657979730482236, + "grad_norm": 0.0459955558180809, + "learning_rate": 6.827602234549952e-07, + "loss": 0.0434, + "num_input_tokens_seen": 203840320, + "step": 167530 + }, + { + "epoch": 18.658536585365855, + "grad_norm": 0.42917969822883606, + "learning_rate": 6.821963723494667e-07, + "loss": 0.0134, + "num_input_tokens_seen": 203846336, + "step": 167535 + }, + { + "epoch": 18.65909344024947, + "grad_norm": 1.2769473791122437, + "learning_rate": 6.816327509450826e-07, + "loss": 0.1304, + "num_input_tokens_seen": 203852384, + "step": 167540 + }, + { + "epoch": 18.659650295133087, + "grad_norm": 0.0023479321971535683, + "learning_rate": 6.810693592471579e-07, + "loss": 0.0279, + "num_input_tokens_seen": 203858432, + "step": 167545 + }, + { + "epoch": 18.660207150016706, + "grad_norm": 0.2155439555644989, + "learning_rate": 6.805061972610188e-07, + "loss": 0.055, + "num_input_tokens_seen": 203864800, + "step": 167550 + }, + { + "epoch": 18.660764004900322, + "grad_norm": 0.01087107788771391, + "learning_rate": 6.799432649919807e-07, + "loss": 0.0213, + "num_input_tokens_seen": 203871040, + "step": 167555 + }, + { + "epoch": 18.66132085978394, + "grad_norm": 0.0018395251827314496, + "learning_rate": 6.793805624453642e-07, + "loss": 0.0206, + "num_input_tokens_seen": 203877536, + "step": 167560 + }, + { + "epoch": 18.661877714667558, + "grad_norm": 2.0321249961853027, + "learning_rate": 6.788180896264817e-07, + "loss": 0.0894, + "num_input_tokens_seen": 203883712, + "step": 167565 + }, + { + "epoch": 18.662434569551174, + "grad_norm": 0.3821246325969696, + "learning_rate": 6.782558465406541e-07, + "loss": 0.0395, + "num_input_tokens_seen": 203889632, + "step": 167570 + }, + { + "epoch": 18.662991424434793, + "grad_norm": 0.0034023558255285025, + "learning_rate": 6.776938331931825e-07, + "loss": 0.0135, + "num_input_tokens_seen": 203896128, + "step": 167575 + }, + { + "epoch": 18.66354827931841, + "grad_norm": 0.060376979410648346, + "learning_rate": 6.771320495893796e-07, + "loss": 0.1053, + "num_input_tokens_seen": 203902208, + "step": 167580 + }, + { + "epoch": 18.66410513420203, + "grad_norm": 0.029226897284388542, + "learning_rate": 6.765704957345492e-07, + "loss": 0.0506, + "num_input_tokens_seen": 203908256, + "step": 167585 + }, + { + "epoch": 18.664661989085644, + "grad_norm": 4.730256080627441, + "learning_rate": 6.760091716340011e-07, + "loss": 0.0516, + "num_input_tokens_seen": 203914016, + "step": 167590 + }, + { + "epoch": 18.66521884396926, + "grad_norm": 0.0007305228500626981, + "learning_rate": 6.754480772930338e-07, + "loss": 0.0034, + "num_input_tokens_seen": 203920224, + "step": 167595 + }, + { + "epoch": 18.66577569885288, + "grad_norm": 0.01528224442154169, + "learning_rate": 6.748872127169487e-07, + "loss": 0.0578, + "num_input_tokens_seen": 203926496, + "step": 167600 + }, + { + "epoch": 18.666332553736495, + "grad_norm": 0.3376277983188629, + "learning_rate": 6.743265779110413e-07, + "loss": 0.0086, + "num_input_tokens_seen": 203932736, + "step": 167605 + }, + { + "epoch": 18.666889408620115, + "grad_norm": 0.0170343779027462, + "learning_rate": 6.737661728806105e-07, + "loss": 0.0375, + "num_input_tokens_seen": 203938848, + "step": 167610 + }, + { + "epoch": 18.66744626350373, + "grad_norm": 0.014054505154490471, + "learning_rate": 6.732059976309463e-07, + "loss": 0.0265, + "num_input_tokens_seen": 203944256, + "step": 167615 + }, + { + "epoch": 18.668003118387347, + "grad_norm": 0.0002730968699324876, + "learning_rate": 6.726460521673445e-07, + "loss": 0.0008, + "num_input_tokens_seen": 203950272, + "step": 167620 + }, + { + "epoch": 18.668559973270966, + "grad_norm": 1.992362380027771, + "learning_rate": 6.720863364950869e-07, + "loss": 0.049, + "num_input_tokens_seen": 203956224, + "step": 167625 + }, + { + "epoch": 18.669116828154582, + "grad_norm": 0.2356473207473755, + "learning_rate": 6.715268506194694e-07, + "loss": 0.1537, + "num_input_tokens_seen": 203962016, + "step": 167630 + }, + { + "epoch": 18.6696736830382, + "grad_norm": 0.03212633728981018, + "learning_rate": 6.709675945457683e-07, + "loss": 0.042, + "num_input_tokens_seen": 203968192, + "step": 167635 + }, + { + "epoch": 18.670230537921817, + "grad_norm": 0.9258667826652527, + "learning_rate": 6.704085682792765e-07, + "loss": 0.0311, + "num_input_tokens_seen": 203973984, + "step": 167640 + }, + { + "epoch": 18.670787392805433, + "grad_norm": 0.17291101813316345, + "learning_rate": 6.698497718252622e-07, + "loss": 0.0058, + "num_input_tokens_seen": 203980352, + "step": 167645 + }, + { + "epoch": 18.671344247689053, + "grad_norm": 0.45601987838745117, + "learning_rate": 6.692912051890127e-07, + "loss": 0.041, + "num_input_tokens_seen": 203986048, + "step": 167650 + }, + { + "epoch": 18.67190110257267, + "grad_norm": 0.07479816675186157, + "learning_rate": 6.687328683757987e-07, + "loss": 0.0102, + "num_input_tokens_seen": 203992000, + "step": 167655 + }, + { + "epoch": 18.672457957456288, + "grad_norm": 0.7873214483261108, + "learning_rate": 6.681747613908995e-07, + "loss": 0.1109, + "num_input_tokens_seen": 203998400, + "step": 167660 + }, + { + "epoch": 18.673014812339904, + "grad_norm": 0.0006965980282984674, + "learning_rate": 6.67616884239583e-07, + "loss": 0.0978, + "num_input_tokens_seen": 204004416, + "step": 167665 + }, + { + "epoch": 18.67357166722352, + "grad_norm": 0.0003550292458385229, + "learning_rate": 6.670592369271229e-07, + "loss": 0.0383, + "num_input_tokens_seen": 204009984, + "step": 167670 + }, + { + "epoch": 18.67412852210714, + "grad_norm": 0.0005435145576484501, + "learning_rate": 6.665018194587786e-07, + "loss": 0.0005, + "num_input_tokens_seen": 204015936, + "step": 167675 + }, + { + "epoch": 18.674685376990755, + "grad_norm": 0.0023308901581913233, + "learning_rate": 6.659446318398211e-07, + "loss": 0.0024, + "num_input_tokens_seen": 204022496, + "step": 167680 + }, + { + "epoch": 18.675242231874375, + "grad_norm": 1.0462216138839722, + "learning_rate": 6.653876740755155e-07, + "loss": 0.0415, + "num_input_tokens_seen": 204028608, + "step": 167685 + }, + { + "epoch": 18.67579908675799, + "grad_norm": 0.09713470190763474, + "learning_rate": 6.648309461711189e-07, + "loss": 0.094, + "num_input_tokens_seen": 204034688, + "step": 167690 + }, + { + "epoch": 18.676355941641607, + "grad_norm": 0.016615187749266624, + "learning_rate": 6.64274448131888e-07, + "loss": 0.0026, + "num_input_tokens_seen": 204041184, + "step": 167695 + }, + { + "epoch": 18.676912796525226, + "grad_norm": 1.5318856239318848, + "learning_rate": 6.637181799630854e-07, + "loss": 0.1322, + "num_input_tokens_seen": 204047296, + "step": 167700 + }, + { + "epoch": 18.677469651408842, + "grad_norm": 0.010564649477601051, + "learning_rate": 6.631621416699596e-07, + "loss": 0.0787, + "num_input_tokens_seen": 204053120, + "step": 167705 + }, + { + "epoch": 18.67802650629246, + "grad_norm": 0.0691150426864624, + "learning_rate": 6.626063332577704e-07, + "loss": 0.0317, + "num_input_tokens_seen": 204059616, + "step": 167710 + }, + { + "epoch": 18.678583361176077, + "grad_norm": 0.35889488458633423, + "learning_rate": 6.620507547317606e-07, + "loss": 0.0695, + "num_input_tokens_seen": 204065728, + "step": 167715 + }, + { + "epoch": 18.679140216059693, + "grad_norm": 1.2739393711090088, + "learning_rate": 6.614954060971818e-07, + "loss": 0.0593, + "num_input_tokens_seen": 204071776, + "step": 167720 + }, + { + "epoch": 18.679697070943313, + "grad_norm": 0.03002127632498741, + "learning_rate": 6.60940287359274e-07, + "loss": 0.0516, + "num_input_tokens_seen": 204077792, + "step": 167725 + }, + { + "epoch": 18.68025392582693, + "grad_norm": 0.1495513617992401, + "learning_rate": 6.603853985232916e-07, + "loss": 0.0076, + "num_input_tokens_seen": 204083168, + "step": 167730 + }, + { + "epoch": 18.680810780710548, + "grad_norm": 0.980302095413208, + "learning_rate": 6.598307395944664e-07, + "loss": 0.0925, + "num_input_tokens_seen": 204088992, + "step": 167735 + }, + { + "epoch": 18.681367635594164, + "grad_norm": 1.4711600542068481, + "learning_rate": 6.592763105780442e-07, + "loss": 0.1043, + "num_input_tokens_seen": 204094976, + "step": 167740 + }, + { + "epoch": 18.681924490477783, + "grad_norm": 0.5465044379234314, + "learning_rate": 6.587221114792513e-07, + "loss": 0.0098, + "num_input_tokens_seen": 204101344, + "step": 167745 + }, + { + "epoch": 18.6824813453614, + "grad_norm": 0.009394438937306404, + "learning_rate": 6.581681423033364e-07, + "loss": 0.0148, + "num_input_tokens_seen": 204106656, + "step": 167750 + }, + { + "epoch": 18.683038200245015, + "grad_norm": 0.04129067435860634, + "learning_rate": 6.576144030555259e-07, + "loss": 0.0139, + "num_input_tokens_seen": 204112224, + "step": 167755 + }, + { + "epoch": 18.683595055128634, + "grad_norm": 0.7008761167526245, + "learning_rate": 6.570608937410488e-07, + "loss": 0.0133, + "num_input_tokens_seen": 204118080, + "step": 167760 + }, + { + "epoch": 18.68415191001225, + "grad_norm": 0.13523417711257935, + "learning_rate": 6.565076143651316e-07, + "loss": 0.0032, + "num_input_tokens_seen": 204123680, + "step": 167765 + }, + { + "epoch": 18.68470876489587, + "grad_norm": 0.7071389555931091, + "learning_rate": 6.559545649330062e-07, + "loss": 0.0822, + "num_input_tokens_seen": 204129568, + "step": 167770 + }, + { + "epoch": 18.685265619779486, + "grad_norm": 0.11004706472158432, + "learning_rate": 6.554017454498934e-07, + "loss": 0.0103, + "num_input_tokens_seen": 204135712, + "step": 167775 + }, + { + "epoch": 18.6858224746631, + "grad_norm": 0.06571441143751144, + "learning_rate": 6.548491559210168e-07, + "loss": 0.0033, + "num_input_tokens_seen": 204141728, + "step": 167780 + }, + { + "epoch": 18.68637932954672, + "grad_norm": 0.3933399021625519, + "learning_rate": 6.542967963515944e-07, + "loss": 0.028, + "num_input_tokens_seen": 204147808, + "step": 167785 + }, + { + "epoch": 18.686936184430337, + "grad_norm": 0.0006190312560647726, + "learning_rate": 6.537446667468472e-07, + "loss": 0.02, + "num_input_tokens_seen": 204153472, + "step": 167790 + }, + { + "epoch": 18.687493039313956, + "grad_norm": 0.05951850116252899, + "learning_rate": 6.53192767111982e-07, + "loss": 0.0148, + "num_input_tokens_seen": 204159296, + "step": 167795 + }, + { + "epoch": 18.688049894197572, + "grad_norm": 0.1262446641921997, + "learning_rate": 6.526410974522196e-07, + "loss": 0.014, + "num_input_tokens_seen": 204165280, + "step": 167800 + }, + { + "epoch": 18.68860674908119, + "grad_norm": 0.007781852502375841, + "learning_rate": 6.520896577727698e-07, + "loss": 0.0081, + "num_input_tokens_seen": 204171200, + "step": 167805 + }, + { + "epoch": 18.689163603964808, + "grad_norm": 0.3380439281463623, + "learning_rate": 6.515384480788422e-07, + "loss": 0.0035, + "num_input_tokens_seen": 204177184, + "step": 167810 + }, + { + "epoch": 18.689720458848424, + "grad_norm": 0.8144857883453369, + "learning_rate": 6.509874683756384e-07, + "loss": 0.0206, + "num_input_tokens_seen": 204183328, + "step": 167815 + }, + { + "epoch": 18.690277313732043, + "grad_norm": 1.3865410089492798, + "learning_rate": 6.504367186683652e-07, + "loss": 0.1198, + "num_input_tokens_seen": 204189440, + "step": 167820 + }, + { + "epoch": 18.69083416861566, + "grad_norm": 0.04858780279755592, + "learning_rate": 6.498861989622268e-07, + "loss": 0.0024, + "num_input_tokens_seen": 204195008, + "step": 167825 + }, + { + "epoch": 18.691391023499275, + "grad_norm": 0.5639957189559937, + "learning_rate": 6.493359092624274e-07, + "loss": 0.0787, + "num_input_tokens_seen": 204201088, + "step": 167830 + }, + { + "epoch": 18.691947878382894, + "grad_norm": 1.3498265743255615, + "learning_rate": 6.487858495741545e-07, + "loss": 0.0422, + "num_input_tokens_seen": 204207488, + "step": 167835 + }, + { + "epoch": 18.69250473326651, + "grad_norm": 0.023413347080349922, + "learning_rate": 6.482360199026094e-07, + "loss": 0.0104, + "num_input_tokens_seen": 204213472, + "step": 167840 + }, + { + "epoch": 18.69306158815013, + "grad_norm": 0.3691786229610443, + "learning_rate": 6.476864202529853e-07, + "loss": 0.0049, + "num_input_tokens_seen": 204219456, + "step": 167845 + }, + { + "epoch": 18.693618443033746, + "grad_norm": 0.24710533022880554, + "learning_rate": 6.471370506304725e-07, + "loss": 0.0877, + "num_input_tokens_seen": 204225760, + "step": 167850 + }, + { + "epoch": 18.69417529791736, + "grad_norm": 0.016129575669765472, + "learning_rate": 6.465879110402667e-07, + "loss": 0.0665, + "num_input_tokens_seen": 204231424, + "step": 167855 + }, + { + "epoch": 18.69473215280098, + "grad_norm": 0.02010916918516159, + "learning_rate": 6.460390014875445e-07, + "loss": 0.0467, + "num_input_tokens_seen": 204237696, + "step": 167860 + }, + { + "epoch": 18.695289007684597, + "grad_norm": 0.04673105478286743, + "learning_rate": 6.454903219774988e-07, + "loss": 0.0303, + "num_input_tokens_seen": 204243104, + "step": 167865 + }, + { + "epoch": 18.695845862568216, + "grad_norm": 0.21421514451503754, + "learning_rate": 6.449418725153062e-07, + "loss": 0.0957, + "num_input_tokens_seen": 204249344, + "step": 167870 + }, + { + "epoch": 18.696402717451832, + "grad_norm": 0.0005873159971088171, + "learning_rate": 6.44393653106154e-07, + "loss": 0.0172, + "num_input_tokens_seen": 204255584, + "step": 167875 + }, + { + "epoch": 18.696959572335448, + "grad_norm": 0.004836241248995066, + "learning_rate": 6.43845663755216e-07, + "loss": 0.1064, + "num_input_tokens_seen": 204261856, + "step": 167880 + }, + { + "epoch": 18.697516427219067, + "grad_norm": 0.0002132454828824848, + "learning_rate": 6.432979044676712e-07, + "loss": 0.0717, + "num_input_tokens_seen": 204267936, + "step": 167885 + }, + { + "epoch": 18.698073282102683, + "grad_norm": 0.14096699655056, + "learning_rate": 6.42750375248688e-07, + "loss": 0.1643, + "num_input_tokens_seen": 204273952, + "step": 167890 + }, + { + "epoch": 18.698630136986303, + "grad_norm": 0.001880323514342308, + "learning_rate": 6.422030761034453e-07, + "loss": 0.052, + "num_input_tokens_seen": 204279840, + "step": 167895 + }, + { + "epoch": 18.69918699186992, + "grad_norm": 0.8074657320976257, + "learning_rate": 6.416560070371114e-07, + "loss": 0.0437, + "num_input_tokens_seen": 204286080, + "step": 167900 + }, + { + "epoch": 18.699743846753535, + "grad_norm": 0.0001909381680889055, + "learning_rate": 6.411091680548487e-07, + "loss": 0.0289, + "num_input_tokens_seen": 204291616, + "step": 167905 + }, + { + "epoch": 18.700300701637154, + "grad_norm": 1.0279135704040527, + "learning_rate": 6.405625591618253e-07, + "loss": 0.0493, + "num_input_tokens_seen": 204298112, + "step": 167910 + }, + { + "epoch": 18.70085755652077, + "grad_norm": 1.0049163103103638, + "learning_rate": 6.400161803632065e-07, + "loss": 0.052, + "num_input_tokens_seen": 204304416, + "step": 167915 + }, + { + "epoch": 18.70141441140439, + "grad_norm": 1.209425687789917, + "learning_rate": 6.394700316641522e-07, + "loss": 0.1108, + "num_input_tokens_seen": 204310688, + "step": 167920 + }, + { + "epoch": 18.701971266288005, + "grad_norm": 0.06752906739711761, + "learning_rate": 6.389241130698193e-07, + "loss": 0.0847, + "num_input_tokens_seen": 204317056, + "step": 167925 + }, + { + "epoch": 18.70252812117162, + "grad_norm": 0.043889086693525314, + "learning_rate": 6.383784245853674e-07, + "loss": 0.0503, + "num_input_tokens_seen": 204323264, + "step": 167930 + }, + { + "epoch": 18.70308497605524, + "grad_norm": 0.03901002183556557, + "learning_rate": 6.37832966215951e-07, + "loss": 0.0635, + "num_input_tokens_seen": 204329504, + "step": 167935 + }, + { + "epoch": 18.703641830938857, + "grad_norm": 0.013361633755266666, + "learning_rate": 6.372877379667159e-07, + "loss": 0.001, + "num_input_tokens_seen": 204335584, + "step": 167940 + }, + { + "epoch": 18.704198685822476, + "grad_norm": 0.00030193704878911376, + "learning_rate": 6.367427398428216e-07, + "loss": 0.0059, + "num_input_tokens_seen": 204341408, + "step": 167945 + }, + { + "epoch": 18.704755540706092, + "grad_norm": 3.1702725887298584, + "learning_rate": 6.361979718494115e-07, + "loss": 0.2513, + "num_input_tokens_seen": 204347744, + "step": 167950 + }, + { + "epoch": 18.705312395589708, + "grad_norm": 0.0027670031413435936, + "learning_rate": 6.356534339916315e-07, + "loss": 0.0251, + "num_input_tokens_seen": 204353728, + "step": 167955 + }, + { + "epoch": 18.705869250473327, + "grad_norm": 0.000837460276670754, + "learning_rate": 6.351091262746217e-07, + "loss": 0.0204, + "num_input_tokens_seen": 204359584, + "step": 167960 + }, + { + "epoch": 18.706426105356943, + "grad_norm": 1.7127596139907837, + "learning_rate": 6.345650487035309e-07, + "loss": 0.0785, + "num_input_tokens_seen": 204365664, + "step": 167965 + }, + { + "epoch": 18.706982960240563, + "grad_norm": 0.00043814218952320516, + "learning_rate": 6.340212012834912e-07, + "loss": 0.046, + "num_input_tokens_seen": 204371392, + "step": 167970 + }, + { + "epoch": 18.70753981512418, + "grad_norm": 0.38611936569213867, + "learning_rate": 6.334775840196483e-07, + "loss": 0.0443, + "num_input_tokens_seen": 204377440, + "step": 167975 + }, + { + "epoch": 18.708096670007794, + "grad_norm": 0.0008386022527702153, + "learning_rate": 6.32934196917126e-07, + "loss": 0.0043, + "num_input_tokens_seen": 204383520, + "step": 167980 + }, + { + "epoch": 18.708653524891414, + "grad_norm": 0.0005059769609943032, + "learning_rate": 6.323910399810646e-07, + "loss": 0.028, + "num_input_tokens_seen": 204389728, + "step": 167985 + }, + { + "epoch": 18.70921037977503, + "grad_norm": 1.066839575767517, + "learning_rate": 6.318481132165904e-07, + "loss": 0.0872, + "num_input_tokens_seen": 204395616, + "step": 167990 + }, + { + "epoch": 18.70976723465865, + "grad_norm": 0.009185904636979103, + "learning_rate": 6.313054166288385e-07, + "loss": 0.002, + "num_input_tokens_seen": 204401632, + "step": 167995 + }, + { + "epoch": 18.710324089542265, + "grad_norm": 1.2156835794448853, + "learning_rate": 6.307629502229296e-07, + "loss": 0.0167, + "num_input_tokens_seen": 204407680, + "step": 168000 + }, + { + "epoch": 18.71088094442588, + "grad_norm": 2.0036654472351074, + "learning_rate": 6.3022071400399e-07, + "loss": 0.0315, + "num_input_tokens_seen": 204413792, + "step": 168005 + }, + { + "epoch": 18.7114377993095, + "grad_norm": 1.9979639053344727, + "learning_rate": 6.296787079771382e-07, + "loss": 0.0385, + "num_input_tokens_seen": 204419808, + "step": 168010 + }, + { + "epoch": 18.711994654193116, + "grad_norm": 0.017689652740955353, + "learning_rate": 6.291369321474977e-07, + "loss": 0.057, + "num_input_tokens_seen": 204426112, + "step": 168015 + }, + { + "epoch": 18.712551509076736, + "grad_norm": 0.00029154165531508625, + "learning_rate": 6.285953865201838e-07, + "loss": 0.1406, + "num_input_tokens_seen": 204431808, + "step": 168020 + }, + { + "epoch": 18.71310836396035, + "grad_norm": 0.0011608207132667303, + "learning_rate": 6.280540711003119e-07, + "loss": 0.0527, + "num_input_tokens_seen": 204437824, + "step": 168025 + }, + { + "epoch": 18.713665218843968, + "grad_norm": 0.005061089526861906, + "learning_rate": 6.275129858929946e-07, + "loss": 0.0091, + "num_input_tokens_seen": 204443872, + "step": 168030 + }, + { + "epoch": 18.714222073727587, + "grad_norm": 3.8759829998016357, + "learning_rate": 6.269721309033472e-07, + "loss": 0.0646, + "num_input_tokens_seen": 204450112, + "step": 168035 + }, + { + "epoch": 18.714778928611203, + "grad_norm": 0.009153082966804504, + "learning_rate": 6.264315061364739e-07, + "loss": 0.0015, + "num_input_tokens_seen": 204456000, + "step": 168040 + }, + { + "epoch": 18.715335783494822, + "grad_norm": 0.5825026035308838, + "learning_rate": 6.258911115974847e-07, + "loss": 0.0071, + "num_input_tokens_seen": 204461600, + "step": 168045 + }, + { + "epoch": 18.71589263837844, + "grad_norm": 0.0672246664762497, + "learning_rate": 6.253509472914781e-07, + "loss": 0.0252, + "num_input_tokens_seen": 204467776, + "step": 168050 + }, + { + "epoch": 18.716449493262054, + "grad_norm": 0.0033873305656015873, + "learning_rate": 6.24811013223564e-07, + "loss": 0.0014, + "num_input_tokens_seen": 204474080, + "step": 168055 + }, + { + "epoch": 18.717006348145674, + "grad_norm": 1.6206252574920654, + "learning_rate": 6.242713093988356e-07, + "loss": 0.09, + "num_input_tokens_seen": 204480128, + "step": 168060 + }, + { + "epoch": 18.71756320302929, + "grad_norm": 0.5222076177597046, + "learning_rate": 6.23731835822397e-07, + "loss": 0.0759, + "num_input_tokens_seen": 204486496, + "step": 168065 + }, + { + "epoch": 18.71812005791291, + "grad_norm": 2.6914196014404297, + "learning_rate": 6.231925924993415e-07, + "loss": 0.117, + "num_input_tokens_seen": 204492640, + "step": 168070 + }, + { + "epoch": 18.718676912796525, + "grad_norm": 0.10937304049730301, + "learning_rate": 6.226535794347622e-07, + "loss": 0.0041, + "num_input_tokens_seen": 204498624, + "step": 168075 + }, + { + "epoch": 18.719233767680144, + "grad_norm": 0.0008838983485475183, + "learning_rate": 6.221147966337492e-07, + "loss": 0.0835, + "num_input_tokens_seen": 204505120, + "step": 168080 + }, + { + "epoch": 18.71979062256376, + "grad_norm": 0.3673645853996277, + "learning_rate": 6.215762441013934e-07, + "loss": 0.0214, + "num_input_tokens_seen": 204511552, + "step": 168085 + }, + { + "epoch": 18.720347477447376, + "grad_norm": 0.21294885873794556, + "learning_rate": 6.21037921842782e-07, + "loss": 0.0744, + "num_input_tokens_seen": 204517792, + "step": 168090 + }, + { + "epoch": 18.720904332330996, + "grad_norm": 1.4962093830108643, + "learning_rate": 6.204998298629999e-07, + "loss": 0.1073, + "num_input_tokens_seen": 204524224, + "step": 168095 + }, + { + "epoch": 18.72146118721461, + "grad_norm": 0.024780569598078728, + "learning_rate": 6.199619681671292e-07, + "loss": 0.0349, + "num_input_tokens_seen": 204530624, + "step": 168100 + }, + { + "epoch": 18.722018042098227, + "grad_norm": 0.024584371596574783, + "learning_rate": 6.194243367602493e-07, + "loss": 0.0127, + "num_input_tokens_seen": 204536800, + "step": 168105 + }, + { + "epoch": 18.722574896981847, + "grad_norm": 0.0002484155702404678, + "learning_rate": 6.188869356474391e-07, + "loss": 0.0011, + "num_input_tokens_seen": 204542848, + "step": 168110 + }, + { + "epoch": 18.723131751865463, + "grad_norm": 1.1863850355148315, + "learning_rate": 6.183497648337811e-07, + "loss": 0.0535, + "num_input_tokens_seen": 204548736, + "step": 168115 + }, + { + "epoch": 18.723688606749082, + "grad_norm": 2.0578489303588867, + "learning_rate": 6.178128243243403e-07, + "loss": 0.0812, + "num_input_tokens_seen": 204554528, + "step": 168120 + }, + { + "epoch": 18.724245461632698, + "grad_norm": 1.5198112726211548, + "learning_rate": 6.172761141241934e-07, + "loss": 0.0996, + "num_input_tokens_seen": 204560032, + "step": 168125 + }, + { + "epoch": 18.724802316516318, + "grad_norm": 0.7405012845993042, + "learning_rate": 6.167396342384057e-07, + "loss": 0.0172, + "num_input_tokens_seen": 204565888, + "step": 168130 + }, + { + "epoch": 18.725359171399933, + "grad_norm": 0.07579399645328522, + "learning_rate": 6.162033846720483e-07, + "loss": 0.0062, + "num_input_tokens_seen": 204571936, + "step": 168135 + }, + { + "epoch": 18.72591602628355, + "grad_norm": 0.009384295903146267, + "learning_rate": 6.156673654301892e-07, + "loss": 0.0119, + "num_input_tokens_seen": 204578080, + "step": 168140 + }, + { + "epoch": 18.72647288116717, + "grad_norm": 1.2023431062698364, + "learning_rate": 6.151315765178855e-07, + "loss": 0.0203, + "num_input_tokens_seen": 204584192, + "step": 168145 + }, + { + "epoch": 18.727029736050785, + "grad_norm": 0.03859579935669899, + "learning_rate": 6.145960179402e-07, + "loss": 0.0008, + "num_input_tokens_seen": 204590336, + "step": 168150 + }, + { + "epoch": 18.727586590934404, + "grad_norm": 0.0011772478464990854, + "learning_rate": 6.14060689702195e-07, + "loss": 0.0119, + "num_input_tokens_seen": 204596480, + "step": 168155 + }, + { + "epoch": 18.72814344581802, + "grad_norm": 0.28572261333465576, + "learning_rate": 6.135255918089222e-07, + "loss": 0.0664, + "num_input_tokens_seen": 204602496, + "step": 168160 + }, + { + "epoch": 18.728700300701636, + "grad_norm": 0.03658794239163399, + "learning_rate": 6.129907242654415e-07, + "loss": 0.0815, + "num_input_tokens_seen": 204608704, + "step": 168165 + }, + { + "epoch": 18.729257155585255, + "grad_norm": 1.120063066482544, + "learning_rate": 6.124560870767987e-07, + "loss": 0.0464, + "num_input_tokens_seen": 204615200, + "step": 168170 + }, + { + "epoch": 18.72981401046887, + "grad_norm": 0.008126717992126942, + "learning_rate": 6.119216802480482e-07, + "loss": 0.0552, + "num_input_tokens_seen": 204620992, + "step": 168175 + }, + { + "epoch": 18.73037086535249, + "grad_norm": 0.007177433930337429, + "learning_rate": 6.113875037842359e-07, + "loss": 0.041, + "num_input_tokens_seen": 204627328, + "step": 168180 + }, + { + "epoch": 18.730927720236107, + "grad_norm": 0.0027664434164762497, + "learning_rate": 6.108535576904107e-07, + "loss": 0.0062, + "num_input_tokens_seen": 204632928, + "step": 168185 + }, + { + "epoch": 18.731484575119723, + "grad_norm": 1.3266648054122925, + "learning_rate": 6.103198419716127e-07, + "loss": 0.0181, + "num_input_tokens_seen": 204639360, + "step": 168190 + }, + { + "epoch": 18.732041430003342, + "grad_norm": 3.0092716217041016, + "learning_rate": 6.097863566328854e-07, + "loss": 0.0813, + "num_input_tokens_seen": 204645824, + "step": 168195 + }, + { + "epoch": 18.732598284886958, + "grad_norm": 1.603511929512024, + "learning_rate": 6.092531016792635e-07, + "loss": 0.1082, + "num_input_tokens_seen": 204651904, + "step": 168200 + }, + { + "epoch": 18.733155139770577, + "grad_norm": 0.015022173523902893, + "learning_rate": 6.087200771157931e-07, + "loss": 0.0008, + "num_input_tokens_seen": 204658144, + "step": 168205 + }, + { + "epoch": 18.733711994654193, + "grad_norm": 9.60665347520262e-05, + "learning_rate": 6.081872829475005e-07, + "loss": 0.0398, + "num_input_tokens_seen": 204664416, + "step": 168210 + }, + { + "epoch": 18.73426884953781, + "grad_norm": 0.6675561666488647, + "learning_rate": 6.076547191794207e-07, + "loss": 0.0113, + "num_input_tokens_seen": 204670688, + "step": 168215 + }, + { + "epoch": 18.73482570442143, + "grad_norm": 0.0972597524523735, + "learning_rate": 6.071223858165859e-07, + "loss": 0.0977, + "num_input_tokens_seen": 204676768, + "step": 168220 + }, + { + "epoch": 18.735382559305044, + "grad_norm": 0.8805760741233826, + "learning_rate": 6.065902828640225e-07, + "loss": 0.0754, + "num_input_tokens_seen": 204683104, + "step": 168225 + }, + { + "epoch": 18.735939414188664, + "grad_norm": 0.08428440243005753, + "learning_rate": 6.06058410326757e-07, + "loss": 0.0049, + "num_input_tokens_seen": 204689376, + "step": 168230 + }, + { + "epoch": 18.73649626907228, + "grad_norm": 0.21040913462638855, + "learning_rate": 6.055267682098187e-07, + "loss": 0.0032, + "num_input_tokens_seen": 204695424, + "step": 168235 + }, + { + "epoch": 18.737053123955896, + "grad_norm": 0.00033274912857450545, + "learning_rate": 6.049953565182231e-07, + "loss": 0.0131, + "num_input_tokens_seen": 204701504, + "step": 168240 + }, + { + "epoch": 18.737609978839515, + "grad_norm": 0.03586537390947342, + "learning_rate": 6.044641752569857e-07, + "loss": 0.0032, + "num_input_tokens_seen": 204707200, + "step": 168245 + }, + { + "epoch": 18.73816683372313, + "grad_norm": 0.0004524302785284817, + "learning_rate": 6.039332244311357e-07, + "loss": 0.0141, + "num_input_tokens_seen": 204713312, + "step": 168250 + }, + { + "epoch": 18.73872368860675, + "grad_norm": 0.34592539072036743, + "learning_rate": 6.034025040456775e-07, + "loss": 0.1757, + "num_input_tokens_seen": 204719456, + "step": 168255 + }, + { + "epoch": 18.739280543490366, + "grad_norm": 1.2852144241333008, + "learning_rate": 6.028720141056349e-07, + "loss": 0.0573, + "num_input_tokens_seen": 204725792, + "step": 168260 + }, + { + "epoch": 18.739837398373982, + "grad_norm": 1.0810766220092773, + "learning_rate": 6.023417546160065e-07, + "loss": 0.0473, + "num_input_tokens_seen": 204732128, + "step": 168265 + }, + { + "epoch": 18.7403942532576, + "grad_norm": 0.35654330253601074, + "learning_rate": 6.018117255818106e-07, + "loss": 0.0498, + "num_input_tokens_seen": 204738112, + "step": 168270 + }, + { + "epoch": 18.740951108141218, + "grad_norm": 0.0022701332345604897, + "learning_rate": 6.012819270080461e-07, + "loss": 0.0871, + "num_input_tokens_seen": 204744064, + "step": 168275 + }, + { + "epoch": 18.741507963024837, + "grad_norm": 0.9502679109573364, + "learning_rate": 6.007523588997282e-07, + "loss": 0.0292, + "num_input_tokens_seen": 204750496, + "step": 168280 + }, + { + "epoch": 18.742064817908453, + "grad_norm": 0.055348820984363556, + "learning_rate": 6.002230212618503e-07, + "loss": 0.0568, + "num_input_tokens_seen": 204756800, + "step": 168285 + }, + { + "epoch": 18.74262167279207, + "grad_norm": 0.043393153697252274, + "learning_rate": 5.99693914099414e-07, + "loss": 0.0766, + "num_input_tokens_seen": 204763200, + "step": 168290 + }, + { + "epoch": 18.74317852767569, + "grad_norm": 0.034441329538822174, + "learning_rate": 5.991650374174151e-07, + "loss": 0.0489, + "num_input_tokens_seen": 204769088, + "step": 168295 + }, + { + "epoch": 18.743735382559304, + "grad_norm": 0.11397663503885269, + "learning_rate": 5.986363912208582e-07, + "loss": 0.0237, + "num_input_tokens_seen": 204775200, + "step": 168300 + }, + { + "epoch": 18.744292237442924, + "grad_norm": 0.03418080881237984, + "learning_rate": 5.981079755147279e-07, + "loss": 0.0265, + "num_input_tokens_seen": 204781344, + "step": 168305 + }, + { + "epoch": 18.74484909232654, + "grad_norm": 0.03751600533723831, + "learning_rate": 5.975797903040176e-07, + "loss": 0.0685, + "num_input_tokens_seen": 204787488, + "step": 168310 + }, + { + "epoch": 18.745405947210156, + "grad_norm": 0.0011069817701354623, + "learning_rate": 5.970518355937149e-07, + "loss": 0.1397, + "num_input_tokens_seen": 204793760, + "step": 168315 + }, + { + "epoch": 18.745962802093775, + "grad_norm": 0.011572751216590405, + "learning_rate": 5.965241113888131e-07, + "loss": 0.0021, + "num_input_tokens_seen": 204800064, + "step": 168320 + }, + { + "epoch": 18.74651965697739, + "grad_norm": 0.14158828556537628, + "learning_rate": 5.959966176942889e-07, + "loss": 0.043, + "num_input_tokens_seen": 204805952, + "step": 168325 + }, + { + "epoch": 18.74707651186101, + "grad_norm": 0.0008667794172652066, + "learning_rate": 5.954693545151296e-07, + "loss": 0.0646, + "num_input_tokens_seen": 204812384, + "step": 168330 + }, + { + "epoch": 18.747633366744626, + "grad_norm": 0.7914035320281982, + "learning_rate": 5.949423218563177e-07, + "loss": 0.016, + "num_input_tokens_seen": 204818560, + "step": 168335 + }, + { + "epoch": 18.748190221628242, + "grad_norm": 0.19504323601722717, + "learning_rate": 5.944155197228268e-07, + "loss": 0.0501, + "num_input_tokens_seen": 204824736, + "step": 168340 + }, + { + "epoch": 18.74874707651186, + "grad_norm": 1.6041183471679688, + "learning_rate": 5.938889481196335e-07, + "loss": 0.0236, + "num_input_tokens_seen": 204830336, + "step": 168345 + }, + { + "epoch": 18.749303931395477, + "grad_norm": 0.03972560912370682, + "learning_rate": 5.933626070517145e-07, + "loss": 0.048, + "num_input_tokens_seen": 204836416, + "step": 168350 + }, + { + "epoch": 18.749860786279097, + "grad_norm": 0.07539905607700348, + "learning_rate": 5.928364965240408e-07, + "loss": 0.0118, + "num_input_tokens_seen": 204842368, + "step": 168355 + }, + { + "epoch": 18.750417641162713, + "grad_norm": 0.0007931039435788989, + "learning_rate": 5.923106165415831e-07, + "loss": 0.127, + "num_input_tokens_seen": 204848384, + "step": 168360 + }, + { + "epoch": 18.75097449604633, + "grad_norm": 0.0003545787767507136, + "learning_rate": 5.917849671093018e-07, + "loss": 0.016, + "num_input_tokens_seen": 204854528, + "step": 168365 + }, + { + "epoch": 18.751531350929948, + "grad_norm": 0.05005738139152527, + "learning_rate": 5.912595482321676e-07, + "loss": 0.0022, + "num_input_tokens_seen": 204860672, + "step": 168370 + }, + { + "epoch": 18.752088205813564, + "grad_norm": 0.0447314977645874, + "learning_rate": 5.907343599151432e-07, + "loss": 0.0162, + "num_input_tokens_seen": 204866848, + "step": 168375 + }, + { + "epoch": 18.752645060697184, + "grad_norm": 0.04599280655384064, + "learning_rate": 5.902094021631943e-07, + "loss": 0.2059, + "num_input_tokens_seen": 204873120, + "step": 168380 + }, + { + "epoch": 18.7532019155808, + "grad_norm": 1.5367100238800049, + "learning_rate": 5.896846749812667e-07, + "loss": 0.1304, + "num_input_tokens_seen": 204879136, + "step": 168385 + }, + { + "epoch": 18.753758770464415, + "grad_norm": 0.00040594226447865367, + "learning_rate": 5.891601783743289e-07, + "loss": 0.0424, + "num_input_tokens_seen": 204885632, + "step": 168390 + }, + { + "epoch": 18.754315625348035, + "grad_norm": 0.14933046698570251, + "learning_rate": 5.886359123473295e-07, + "loss": 0.0153, + "num_input_tokens_seen": 204891616, + "step": 168395 + }, + { + "epoch": 18.75487248023165, + "grad_norm": 0.09531886130571365, + "learning_rate": 5.88111876905223e-07, + "loss": 0.0114, + "num_input_tokens_seen": 204897728, + "step": 168400 + }, + { + "epoch": 18.75542933511527, + "grad_norm": 0.39120036363601685, + "learning_rate": 5.875880720529581e-07, + "loss": 0.1162, + "num_input_tokens_seen": 204902816, + "step": 168405 + }, + { + "epoch": 18.755986189998886, + "grad_norm": 1.1750104427337646, + "learning_rate": 5.870644977954837e-07, + "loss": 0.0398, + "num_input_tokens_seen": 204909120, + "step": 168410 + }, + { + "epoch": 18.756543044882502, + "grad_norm": 0.0010314035462215543, + "learning_rate": 5.86541154137743e-07, + "loss": 0.0008, + "num_input_tokens_seen": 204915168, + "step": 168415 + }, + { + "epoch": 18.75709989976612, + "grad_norm": 0.008878033608198166, + "learning_rate": 5.860180410846794e-07, + "loss": 0.0558, + "num_input_tokens_seen": 204921312, + "step": 168420 + }, + { + "epoch": 18.757656754649737, + "grad_norm": 0.9925149083137512, + "learning_rate": 5.854951586412388e-07, + "loss": 0.0232, + "num_input_tokens_seen": 204927232, + "step": 168425 + }, + { + "epoch": 18.758213609533357, + "grad_norm": 0.43266063928604126, + "learning_rate": 5.849725068123563e-07, + "loss": 0.0479, + "num_input_tokens_seen": 204933088, + "step": 168430 + }, + { + "epoch": 18.758770464416973, + "grad_norm": 0.046019360423088074, + "learning_rate": 5.844500856029666e-07, + "loss": 0.0715, + "num_input_tokens_seen": 204938624, + "step": 168435 + }, + { + "epoch": 18.75932731930059, + "grad_norm": 0.00041306091588921845, + "learning_rate": 5.839278950180105e-07, + "loss": 0.055, + "num_input_tokens_seen": 204944928, + "step": 168440 + }, + { + "epoch": 18.759884174184208, + "grad_norm": 0.08744176477193832, + "learning_rate": 5.834059350624144e-07, + "loss": 0.0499, + "num_input_tokens_seen": 204950784, + "step": 168445 + }, + { + "epoch": 18.760441029067824, + "grad_norm": 1.4806771278381348, + "learning_rate": 5.82884205741116e-07, + "loss": 0.1882, + "num_input_tokens_seen": 204956736, + "step": 168450 + }, + { + "epoch": 18.760997883951443, + "grad_norm": 2.576345920562744, + "learning_rate": 5.823627070590337e-07, + "loss": 0.1078, + "num_input_tokens_seen": 204962976, + "step": 168455 + }, + { + "epoch": 18.76155473883506, + "grad_norm": 0.1197885274887085, + "learning_rate": 5.818414390211024e-07, + "loss": 0.096, + "num_input_tokens_seen": 204968384, + "step": 168460 + }, + { + "epoch": 18.76211159371868, + "grad_norm": 0.3698744475841522, + "learning_rate": 5.813204016322405e-07, + "loss": 0.0108, + "num_input_tokens_seen": 204974400, + "step": 168465 + }, + { + "epoch": 18.762668448602295, + "grad_norm": 0.0006502855685539544, + "learning_rate": 5.807995948973716e-07, + "loss": 0.0558, + "num_input_tokens_seen": 204980576, + "step": 168470 + }, + { + "epoch": 18.76322530348591, + "grad_norm": 0.034284964203834534, + "learning_rate": 5.802790188214141e-07, + "loss": 0.0135, + "num_input_tokens_seen": 204986560, + "step": 168475 + }, + { + "epoch": 18.76378215836953, + "grad_norm": 0.7359388470649719, + "learning_rate": 5.797586734092891e-07, + "loss": 0.022, + "num_input_tokens_seen": 204992320, + "step": 168480 + }, + { + "epoch": 18.764339013253146, + "grad_norm": 0.07320238649845123, + "learning_rate": 5.792385586659038e-07, + "loss": 0.0176, + "num_input_tokens_seen": 204998624, + "step": 168485 + }, + { + "epoch": 18.764895868136765, + "grad_norm": 0.028587879613041878, + "learning_rate": 5.787186745961792e-07, + "loss": 0.087, + "num_input_tokens_seen": 205004576, + "step": 168490 + }, + { + "epoch": 18.76545272302038, + "grad_norm": 1.7408617734909058, + "learning_rate": 5.781990212050226e-07, + "loss": 0.0942, + "num_input_tokens_seen": 205010368, + "step": 168495 + }, + { + "epoch": 18.766009577903997, + "grad_norm": 0.061614990234375, + "learning_rate": 5.776795984973438e-07, + "loss": 0.0045, + "num_input_tokens_seen": 205016512, + "step": 168500 + }, + { + "epoch": 18.766566432787616, + "grad_norm": 0.09918782114982605, + "learning_rate": 5.771604064780444e-07, + "loss": 0.0017, + "num_input_tokens_seen": 205022400, + "step": 168505 + }, + { + "epoch": 18.767123287671232, + "grad_norm": 0.0149383544921875, + "learning_rate": 5.766414451520347e-07, + "loss": 0.1041, + "num_input_tokens_seen": 205028288, + "step": 168510 + }, + { + "epoch": 18.767680142554852, + "grad_norm": 1.9872207641601562, + "learning_rate": 5.761227145242132e-07, + "loss": 0.1301, + "num_input_tokens_seen": 205034304, + "step": 168515 + }, + { + "epoch": 18.768236997438468, + "grad_norm": 0.6024624705314636, + "learning_rate": 5.756042145994816e-07, + "loss": 0.0199, + "num_input_tokens_seen": 205040128, + "step": 168520 + }, + { + "epoch": 18.768793852322084, + "grad_norm": 0.9522197842597961, + "learning_rate": 5.750859453827362e-07, + "loss": 0.0433, + "num_input_tokens_seen": 205046016, + "step": 168525 + }, + { + "epoch": 18.769350707205703, + "grad_norm": 0.022689616307616234, + "learning_rate": 5.745679068788728e-07, + "loss": 0.0197, + "num_input_tokens_seen": 205052448, + "step": 168530 + }, + { + "epoch": 18.76990756208932, + "grad_norm": 0.4585655927658081, + "learning_rate": 5.740500990927849e-07, + "loss": 0.0064, + "num_input_tokens_seen": 205058656, + "step": 168535 + }, + { + "epoch": 18.77046441697294, + "grad_norm": 0.2567666471004486, + "learning_rate": 5.73532522029363e-07, + "loss": 0.0065, + "num_input_tokens_seen": 205064768, + "step": 168540 + }, + { + "epoch": 18.771021271856554, + "grad_norm": 0.00012656024773605168, + "learning_rate": 5.730151756935003e-07, + "loss": 0.0767, + "num_input_tokens_seen": 205070848, + "step": 168545 + }, + { + "epoch": 18.77157812674017, + "grad_norm": 0.2108515501022339, + "learning_rate": 5.724980600900764e-07, + "loss": 0.0121, + "num_input_tokens_seen": 205077088, + "step": 168550 + }, + { + "epoch": 18.77213498162379, + "grad_norm": 0.005533372517675161, + "learning_rate": 5.71981175223979e-07, + "loss": 0.0526, + "num_input_tokens_seen": 205083424, + "step": 168555 + }, + { + "epoch": 18.772691836507406, + "grad_norm": 0.04556230828166008, + "learning_rate": 5.714645211000902e-07, + "loss": 0.1267, + "num_input_tokens_seen": 205089216, + "step": 168560 + }, + { + "epoch": 18.773248691391025, + "grad_norm": 0.012466911226511002, + "learning_rate": 5.709480977232922e-07, + "loss": 0.0536, + "num_input_tokens_seen": 205095296, + "step": 168565 + }, + { + "epoch": 18.77380554627464, + "grad_norm": 0.22325095534324646, + "learning_rate": 5.704319050984647e-07, + "loss": 0.0033, + "num_input_tokens_seen": 205101472, + "step": 168570 + }, + { + "epoch": 18.774362401158257, + "grad_norm": 0.00010851437400560826, + "learning_rate": 5.699159432304757e-07, + "loss": 0.0231, + "num_input_tokens_seen": 205107392, + "step": 168575 + }, + { + "epoch": 18.774919256041876, + "grad_norm": 0.03847287595272064, + "learning_rate": 5.694002121242048e-07, + "loss": 0.096, + "num_input_tokens_seen": 205113440, + "step": 168580 + }, + { + "epoch": 18.775476110925492, + "grad_norm": 0.6610494256019592, + "learning_rate": 5.688847117845231e-07, + "loss": 0.0158, + "num_input_tokens_seen": 205119424, + "step": 168585 + }, + { + "epoch": 18.77603296580911, + "grad_norm": 0.05269888415932655, + "learning_rate": 5.683694422162988e-07, + "loss": 0.0278, + "num_input_tokens_seen": 205124768, + "step": 168590 + }, + { + "epoch": 18.776589820692728, + "grad_norm": 0.0002841429668478668, + "learning_rate": 5.678544034244004e-07, + "loss": 0.0031, + "num_input_tokens_seen": 205131104, + "step": 168595 + }, + { + "epoch": 18.777146675576343, + "grad_norm": 0.001734620425850153, + "learning_rate": 5.673395954136934e-07, + "loss": 0.032, + "num_input_tokens_seen": 205137568, + "step": 168600 + }, + { + "epoch": 18.777703530459963, + "grad_norm": 0.0016406840877607465, + "learning_rate": 5.66825018189035e-07, + "loss": 0.0777, + "num_input_tokens_seen": 205143712, + "step": 168605 + }, + { + "epoch": 18.77826038534358, + "grad_norm": 0.699457049369812, + "learning_rate": 5.663106717552907e-07, + "loss": 0.104, + "num_input_tokens_seen": 205149408, + "step": 168610 + }, + { + "epoch": 18.7788172402272, + "grad_norm": 2.932995557785034, + "learning_rate": 5.657965561173207e-07, + "loss": 0.0956, + "num_input_tokens_seen": 205155488, + "step": 168615 + }, + { + "epoch": 18.779374095110814, + "grad_norm": 0.005908027291297913, + "learning_rate": 5.652826712799764e-07, + "loss": 0.0456, + "num_input_tokens_seen": 205161568, + "step": 168620 + }, + { + "epoch": 18.77993094999443, + "grad_norm": 0.1052805706858635, + "learning_rate": 5.647690172481124e-07, + "loss": 0.0296, + "num_input_tokens_seen": 205167616, + "step": 168625 + }, + { + "epoch": 18.78048780487805, + "grad_norm": 0.08927810937166214, + "learning_rate": 5.642555940265859e-07, + "loss": 0.0078, + "num_input_tokens_seen": 205173984, + "step": 168630 + }, + { + "epoch": 18.781044659761665, + "grad_norm": 0.004734812304377556, + "learning_rate": 5.637424016202403e-07, + "loss": 0.009, + "num_input_tokens_seen": 205180384, + "step": 168635 + }, + { + "epoch": 18.781601514645285, + "grad_norm": 0.03606735169887543, + "learning_rate": 5.632294400339299e-07, + "loss": 0.1976, + "num_input_tokens_seen": 205186688, + "step": 168640 + }, + { + "epoch": 18.7821583695289, + "grad_norm": 0.006152179557830095, + "learning_rate": 5.627167092724899e-07, + "loss": 0.1193, + "num_input_tokens_seen": 205192896, + "step": 168645 + }, + { + "epoch": 18.782715224412517, + "grad_norm": 0.02186848595738411, + "learning_rate": 5.622042093407748e-07, + "loss": 0.1266, + "num_input_tokens_seen": 205199232, + "step": 168650 + }, + { + "epoch": 18.783272079296136, + "grad_norm": 1.7230486869812012, + "learning_rate": 5.616919402436166e-07, + "loss": 0.0974, + "num_input_tokens_seen": 205205600, + "step": 168655 + }, + { + "epoch": 18.783828934179752, + "grad_norm": 0.00013619328092318028, + "learning_rate": 5.611799019858587e-07, + "loss": 0.0042, + "num_input_tokens_seen": 205211648, + "step": 168660 + }, + { + "epoch": 18.78438578906337, + "grad_norm": 0.1419302076101303, + "learning_rate": 5.606680945723364e-07, + "loss": 0.1198, + "num_input_tokens_seen": 205217728, + "step": 168665 + }, + { + "epoch": 18.784942643946987, + "grad_norm": 1.7140693664550781, + "learning_rate": 5.601565180078844e-07, + "loss": 0.0714, + "num_input_tokens_seen": 205223360, + "step": 168670 + }, + { + "epoch": 18.785499498830603, + "grad_norm": 0.0008874970953911543, + "learning_rate": 5.596451722973379e-07, + "loss": 0.005, + "num_input_tokens_seen": 205229280, + "step": 168675 + }, + { + "epoch": 18.786056353714223, + "grad_norm": 0.8521568775177002, + "learning_rate": 5.591340574455178e-07, + "loss": 0.0203, + "num_input_tokens_seen": 205235232, + "step": 168680 + }, + { + "epoch": 18.78661320859784, + "grad_norm": 0.06207692623138428, + "learning_rate": 5.586231734572622e-07, + "loss": 0.0203, + "num_input_tokens_seen": 205241824, + "step": 168685 + }, + { + "epoch": 18.787170063481458, + "grad_norm": 0.47775182127952576, + "learning_rate": 5.581125203373949e-07, + "loss": 0.0752, + "num_input_tokens_seen": 205247936, + "step": 168690 + }, + { + "epoch": 18.787726918365074, + "grad_norm": 0.4363601505756378, + "learning_rate": 5.576020980907342e-07, + "loss": 0.0058, + "num_input_tokens_seen": 205254240, + "step": 168695 + }, + { + "epoch": 18.78828377324869, + "grad_norm": 0.5214213132858276, + "learning_rate": 5.570919067221042e-07, + "loss": 0.0295, + "num_input_tokens_seen": 205260704, + "step": 168700 + }, + { + "epoch": 18.78884062813231, + "grad_norm": 0.5447692275047302, + "learning_rate": 5.565819462363258e-07, + "loss": 0.0928, + "num_input_tokens_seen": 205266496, + "step": 168705 + }, + { + "epoch": 18.789397483015925, + "grad_norm": 0.056285321712493896, + "learning_rate": 5.560722166382148e-07, + "loss": 0.0342, + "num_input_tokens_seen": 205272544, + "step": 168710 + }, + { + "epoch": 18.789954337899545, + "grad_norm": 0.005170990247279406, + "learning_rate": 5.555627179325868e-07, + "loss": 0.0003, + "num_input_tokens_seen": 205278880, + "step": 168715 + }, + { + "epoch": 18.79051119278316, + "grad_norm": 0.7026339769363403, + "learning_rate": 5.550534501242516e-07, + "loss": 0.0668, + "num_input_tokens_seen": 205284864, + "step": 168720 + }, + { + "epoch": 18.791068047666776, + "grad_norm": 0.00170718168374151, + "learning_rate": 5.545444132180222e-07, + "loss": 0.0082, + "num_input_tokens_seen": 205291296, + "step": 168725 + }, + { + "epoch": 18.791624902550396, + "grad_norm": 0.00594880897551775, + "learning_rate": 5.540356072187031e-07, + "loss": 0.0063, + "num_input_tokens_seen": 205297632, + "step": 168730 + }, + { + "epoch": 18.79218175743401, + "grad_norm": 0.00712926359847188, + "learning_rate": 5.53527032131107e-07, + "loss": 0.104, + "num_input_tokens_seen": 205304032, + "step": 168735 + }, + { + "epoch": 18.79273861231763, + "grad_norm": 1.9155439138412476, + "learning_rate": 5.530186879600358e-07, + "loss": 0.1403, + "num_input_tokens_seen": 205309664, + "step": 168740 + }, + { + "epoch": 18.793295467201247, + "grad_norm": 0.0228586383163929, + "learning_rate": 5.525105747102882e-07, + "loss": 0.0187, + "num_input_tokens_seen": 205315808, + "step": 168745 + }, + { + "epoch": 18.793852322084863, + "grad_norm": 0.01805867813527584, + "learning_rate": 5.520026923866633e-07, + "loss": 0.107, + "num_input_tokens_seen": 205322112, + "step": 168750 + }, + { + "epoch": 18.794409176968482, + "grad_norm": 2.0586788654327393, + "learning_rate": 5.514950409939629e-07, + "loss": 0.0514, + "num_input_tokens_seen": 205328160, + "step": 168755 + }, + { + "epoch": 18.7949660318521, + "grad_norm": 0.8518295288085938, + "learning_rate": 5.509876205369774e-07, + "loss": 0.0493, + "num_input_tokens_seen": 205334016, + "step": 168760 + }, + { + "epoch": 18.795522886735718, + "grad_norm": 0.10027417540550232, + "learning_rate": 5.504804310205031e-07, + "loss": 0.0098, + "num_input_tokens_seen": 205340128, + "step": 168765 + }, + { + "epoch": 18.796079741619334, + "grad_norm": 0.1750432550907135, + "learning_rate": 5.499734724493305e-07, + "loss": 0.0046, + "num_input_tokens_seen": 205346272, + "step": 168770 + }, + { + "epoch": 18.79663659650295, + "grad_norm": 0.0985073372721672, + "learning_rate": 5.494667448282475e-07, + "loss": 0.0081, + "num_input_tokens_seen": 205352288, + "step": 168775 + }, + { + "epoch": 18.79719345138657, + "grad_norm": 0.001417840481735766, + "learning_rate": 5.489602481620365e-07, + "loss": 0.0013, + "num_input_tokens_seen": 205358880, + "step": 168780 + }, + { + "epoch": 18.797750306270185, + "grad_norm": 0.00014873073087073863, + "learning_rate": 5.484539824554935e-07, + "loss": 0.0089, + "num_input_tokens_seen": 205365216, + "step": 168785 + }, + { + "epoch": 18.798307161153804, + "grad_norm": 0.00281133851967752, + "learning_rate": 5.47947947713387e-07, + "loss": 0.1172, + "num_input_tokens_seen": 205371136, + "step": 168790 + }, + { + "epoch": 18.79886401603742, + "grad_norm": 0.11640909314155579, + "learning_rate": 5.474421439405048e-07, + "loss": 0.0193, + "num_input_tokens_seen": 205377024, + "step": 168795 + }, + { + "epoch": 18.79942087092104, + "grad_norm": 0.6223099827766418, + "learning_rate": 5.46936571141618e-07, + "loss": 0.0266, + "num_input_tokens_seen": 205383456, + "step": 168800 + }, + { + "epoch": 18.799977725804656, + "grad_norm": 0.0570513978600502, + "learning_rate": 5.464312293215119e-07, + "loss": 0.0034, + "num_input_tokens_seen": 205389696, + "step": 168805 + }, + { + "epoch": 18.80053458068827, + "grad_norm": 7.435159204760566e-05, + "learning_rate": 5.459261184849545e-07, + "loss": 0.0116, + "num_input_tokens_seen": 205395968, + "step": 168810 + }, + { + "epoch": 18.80109143557189, + "grad_norm": 0.0026426322292536497, + "learning_rate": 5.454212386367175e-07, + "loss": 0.0237, + "num_input_tokens_seen": 205402080, + "step": 168815 + }, + { + "epoch": 18.801648290455507, + "grad_norm": 0.7275961637496948, + "learning_rate": 5.449165897815661e-07, + "loss": 0.0245, + "num_input_tokens_seen": 205408448, + "step": 168820 + }, + { + "epoch": 18.802205145339123, + "grad_norm": 0.004083594772964716, + "learning_rate": 5.444121719242745e-07, + "loss": 0.0005, + "num_input_tokens_seen": 205414816, + "step": 168825 + }, + { + "epoch": 18.802762000222742, + "grad_norm": 0.020855950191617012, + "learning_rate": 5.439079850696028e-07, + "loss": 0.0731, + "num_input_tokens_seen": 205420640, + "step": 168830 + }, + { + "epoch": 18.803318855106358, + "grad_norm": 1.103941559791565, + "learning_rate": 5.434040292223136e-07, + "loss": 0.081, + "num_input_tokens_seen": 205426432, + "step": 168835 + }, + { + "epoch": 18.803875709989978, + "grad_norm": 0.001681437948718667, + "learning_rate": 5.429003043871644e-07, + "loss": 0.0522, + "num_input_tokens_seen": 205432704, + "step": 168840 + }, + { + "epoch": 18.804432564873593, + "grad_norm": 0.02382761612534523, + "learning_rate": 5.423968105689209e-07, + "loss": 0.0013, + "num_input_tokens_seen": 205438816, + "step": 168845 + }, + { + "epoch": 18.804989419757213, + "grad_norm": 0.002004459733143449, + "learning_rate": 5.418935477723319e-07, + "loss": 0.1186, + "num_input_tokens_seen": 205445120, + "step": 168850 + }, + { + "epoch": 18.80554627464083, + "grad_norm": 0.42150968313217163, + "learning_rate": 5.413905160021576e-07, + "loss": 0.0486, + "num_input_tokens_seen": 205450784, + "step": 168855 + }, + { + "epoch": 18.806103129524445, + "grad_norm": 0.1838817596435547, + "learning_rate": 5.408877152631414e-07, + "loss": 0.0137, + "num_input_tokens_seen": 205457024, + "step": 168860 + }, + { + "epoch": 18.806659984408064, + "grad_norm": 2.4935147762298584, + "learning_rate": 5.403851455600406e-07, + "loss": 0.1137, + "num_input_tokens_seen": 205463104, + "step": 168865 + }, + { + "epoch": 18.80721683929168, + "grad_norm": 0.5623168349266052, + "learning_rate": 5.398828068975931e-07, + "loss": 0.1999, + "num_input_tokens_seen": 205469024, + "step": 168870 + }, + { + "epoch": 18.8077736941753, + "grad_norm": 0.9462267756462097, + "learning_rate": 5.393806992805561e-07, + "loss": 0.0419, + "num_input_tokens_seen": 205475008, + "step": 168875 + }, + { + "epoch": 18.808330549058915, + "grad_norm": 0.05824322998523712, + "learning_rate": 5.388788227136621e-07, + "loss": 0.0931, + "num_input_tokens_seen": 205480864, + "step": 168880 + }, + { + "epoch": 18.80888740394253, + "grad_norm": 0.15738214552402496, + "learning_rate": 5.383771772016599e-07, + "loss": 0.0219, + "num_input_tokens_seen": 205486976, + "step": 168885 + }, + { + "epoch": 18.80944425882615, + "grad_norm": 0.2939186692237854, + "learning_rate": 5.378757627492764e-07, + "loss": 0.0108, + "num_input_tokens_seen": 205492896, + "step": 168890 + }, + { + "epoch": 18.810001113709767, + "grad_norm": 1.005574107170105, + "learning_rate": 5.373745793612605e-07, + "loss": 0.0286, + "num_input_tokens_seen": 205498976, + "step": 168895 + }, + { + "epoch": 18.810557968593386, + "grad_norm": 0.4652775824069977, + "learning_rate": 5.368736270423391e-07, + "loss": 0.0365, + "num_input_tokens_seen": 205505504, + "step": 168900 + }, + { + "epoch": 18.811114823477002, + "grad_norm": 0.5712301731109619, + "learning_rate": 5.363729057972472e-07, + "loss": 0.0318, + "num_input_tokens_seen": 205512064, + "step": 168905 + }, + { + "epoch": 18.811671678360618, + "grad_norm": 0.24310605227947235, + "learning_rate": 5.358724156307116e-07, + "loss": 0.027, + "num_input_tokens_seen": 205518208, + "step": 168910 + }, + { + "epoch": 18.812228533244237, + "grad_norm": 0.23894573748111725, + "learning_rate": 5.353721565474617e-07, + "loss": 0.0064, + "num_input_tokens_seen": 205524256, + "step": 168915 + }, + { + "epoch": 18.812785388127853, + "grad_norm": 0.34597668051719666, + "learning_rate": 5.348721285522218e-07, + "loss": 0.0155, + "num_input_tokens_seen": 205529984, + "step": 168920 + }, + { + "epoch": 18.813342243011473, + "grad_norm": 0.3934684693813324, + "learning_rate": 5.343723316497184e-07, + "loss": 0.0173, + "num_input_tokens_seen": 205536288, + "step": 168925 + }, + { + "epoch": 18.81389909789509, + "grad_norm": 0.04454776272177696, + "learning_rate": 5.338727658446674e-07, + "loss": 0.0069, + "num_input_tokens_seen": 205542688, + "step": 168930 + }, + { + "epoch": 18.814455952778705, + "grad_norm": 0.0004817942972294986, + "learning_rate": 5.333734311417926e-07, + "loss": 0.12, + "num_input_tokens_seen": 205548832, + "step": 168935 + }, + { + "epoch": 18.815012807662324, + "grad_norm": 1.9695830345153809, + "learning_rate": 5.328743275458043e-07, + "loss": 0.1214, + "num_input_tokens_seen": 205555040, + "step": 168940 + }, + { + "epoch": 18.81556966254594, + "grad_norm": 0.5127013921737671, + "learning_rate": 5.323754550614235e-07, + "loss": 0.0107, + "num_input_tokens_seen": 205561088, + "step": 168945 + }, + { + "epoch": 18.81612651742956, + "grad_norm": 1.8321850299835205, + "learning_rate": 5.318768136933578e-07, + "loss": 0.0443, + "num_input_tokens_seen": 205567136, + "step": 168950 + }, + { + "epoch": 18.816683372313175, + "grad_norm": 1.630531907081604, + "learning_rate": 5.313784034463226e-07, + "loss": 0.1204, + "num_input_tokens_seen": 205573088, + "step": 168955 + }, + { + "epoch": 18.81724022719679, + "grad_norm": 1.8679559230804443, + "learning_rate": 5.308802243250171e-07, + "loss": 0.0537, + "num_input_tokens_seen": 205579200, + "step": 168960 + }, + { + "epoch": 18.81779708208041, + "grad_norm": 0.029435675591230392, + "learning_rate": 5.30382276334157e-07, + "loss": 0.0039, + "num_input_tokens_seen": 205585568, + "step": 168965 + }, + { + "epoch": 18.818353936964026, + "grad_norm": 0.007410069461911917, + "learning_rate": 5.298845594784358e-07, + "loss": 0.035, + "num_input_tokens_seen": 205591552, + "step": 168970 + }, + { + "epoch": 18.818910791847646, + "grad_norm": 0.010868466459214687, + "learning_rate": 5.293870737625662e-07, + "loss": 0.0023, + "num_input_tokens_seen": 205597632, + "step": 168975 + }, + { + "epoch": 18.819467646731262, + "grad_norm": 0.2774079740047455, + "learning_rate": 5.288898191912362e-07, + "loss": 0.1628, + "num_input_tokens_seen": 205603712, + "step": 168980 + }, + { + "epoch": 18.820024501614878, + "grad_norm": 0.2249784767627716, + "learning_rate": 5.283927957691504e-07, + "loss": 0.0218, + "num_input_tokens_seen": 205610048, + "step": 168985 + }, + { + "epoch": 18.820581356498497, + "grad_norm": 0.005085436627268791, + "learning_rate": 5.278960035009994e-07, + "loss": 0.1056, + "num_input_tokens_seen": 205616256, + "step": 168990 + }, + { + "epoch": 18.821138211382113, + "grad_norm": 0.9673934578895569, + "learning_rate": 5.273994423914797e-07, + "loss": 0.0315, + "num_input_tokens_seen": 205622240, + "step": 168995 + }, + { + "epoch": 18.821695066265733, + "grad_norm": 9.232218144461513e-05, + "learning_rate": 5.269031124452789e-07, + "loss": 0.069, + "num_input_tokens_seen": 205628704, + "step": 169000 + }, + { + "epoch": 18.82225192114935, + "grad_norm": 0.00472613051533699, + "learning_rate": 5.264070136670851e-07, + "loss": 0.0344, + "num_input_tokens_seen": 205635136, + "step": 169005 + }, + { + "epoch": 18.822808776032964, + "grad_norm": 1.6325714588165283, + "learning_rate": 5.259111460615834e-07, + "loss": 0.022, + "num_input_tokens_seen": 205641248, + "step": 169010 + }, + { + "epoch": 18.823365630916584, + "grad_norm": 0.014771564863622189, + "learning_rate": 5.254155096334618e-07, + "loss": 0.0768, + "num_input_tokens_seen": 205647360, + "step": 169015 + }, + { + "epoch": 18.8239224858002, + "grad_norm": 0.8717780113220215, + "learning_rate": 5.249201043873996e-07, + "loss": 0.0303, + "num_input_tokens_seen": 205653472, + "step": 169020 + }, + { + "epoch": 18.82447934068382, + "grad_norm": 0.001884551951661706, + "learning_rate": 5.244249303280741e-07, + "loss": 0.014, + "num_input_tokens_seen": 205659456, + "step": 169025 + }, + { + "epoch": 18.825036195567435, + "grad_norm": 0.0026728243101388216, + "learning_rate": 5.239299874601644e-07, + "loss": 0.1386, + "num_input_tokens_seen": 205665440, + "step": 169030 + }, + { + "epoch": 18.82559305045105, + "grad_norm": 3.0655763149261475, + "learning_rate": 5.234352757883476e-07, + "loss": 0.052, + "num_input_tokens_seen": 205671584, + "step": 169035 + }, + { + "epoch": 18.82614990533467, + "grad_norm": 0.012129023671150208, + "learning_rate": 5.229407953172922e-07, + "loss": 0.0772, + "num_input_tokens_seen": 205677280, + "step": 169040 + }, + { + "epoch": 18.826706760218286, + "grad_norm": 2.3679211139678955, + "learning_rate": 5.224465460516775e-07, + "loss": 0.1061, + "num_input_tokens_seen": 205683328, + "step": 169045 + }, + { + "epoch": 18.827263615101906, + "grad_norm": 0.046697042882442474, + "learning_rate": 5.219525279961585e-07, + "loss": 0.0625, + "num_input_tokens_seen": 205689376, + "step": 169050 + }, + { + "epoch": 18.82782046998552, + "grad_norm": 0.000474092666991055, + "learning_rate": 5.214587411554145e-07, + "loss": 0.0097, + "num_input_tokens_seen": 205695360, + "step": 169055 + }, + { + "epoch": 18.828377324869138, + "grad_norm": 0.0010679835686460137, + "learning_rate": 5.20965185534103e-07, + "loss": 0.0439, + "num_input_tokens_seen": 205701472, + "step": 169060 + }, + { + "epoch": 18.828934179752757, + "grad_norm": 1.4803520441055298, + "learning_rate": 5.204718611368869e-07, + "loss": 0.0325, + "num_input_tokens_seen": 205707584, + "step": 169065 + }, + { + "epoch": 18.829491034636373, + "grad_norm": 0.0950891524553299, + "learning_rate": 5.199787679684292e-07, + "loss": 0.022, + "num_input_tokens_seen": 205713568, + "step": 169070 + }, + { + "epoch": 18.830047889519992, + "grad_norm": 0.9327082633972168, + "learning_rate": 5.194859060333845e-07, + "loss": 0.0388, + "num_input_tokens_seen": 205719936, + "step": 169075 + }, + { + "epoch": 18.830604744403608, + "grad_norm": 0.3943819999694824, + "learning_rate": 5.189932753364074e-07, + "loss": 0.0138, + "num_input_tokens_seen": 205725760, + "step": 169080 + }, + { + "epoch": 18.831161599287224, + "grad_norm": 1.25072181224823, + "learning_rate": 5.185008758821525e-07, + "loss": 0.1807, + "num_input_tokens_seen": 205731648, + "step": 169085 + }, + { + "epoch": 18.831718454170844, + "grad_norm": 0.03095822036266327, + "learning_rate": 5.180087076752716e-07, + "loss": 0.0282, + "num_input_tokens_seen": 205737952, + "step": 169090 + }, + { + "epoch": 18.83227530905446, + "grad_norm": 1.4591566324234009, + "learning_rate": 5.175167707204137e-07, + "loss": 0.0512, + "num_input_tokens_seen": 205744192, + "step": 169095 + }, + { + "epoch": 18.83283216393808, + "grad_norm": 1.121219277381897, + "learning_rate": 5.170250650222253e-07, + "loss": 0.0391, + "num_input_tokens_seen": 205750048, + "step": 169100 + }, + { + "epoch": 18.833389018821695, + "grad_norm": 0.046483319252729416, + "learning_rate": 5.165335905853497e-07, + "loss": 0.0035, + "num_input_tokens_seen": 205756288, + "step": 169105 + }, + { + "epoch": 18.83394587370531, + "grad_norm": 0.5284407734870911, + "learning_rate": 5.160423474144305e-07, + "loss": 0.0658, + "num_input_tokens_seen": 205762560, + "step": 169110 + }, + { + "epoch": 18.83450272858893, + "grad_norm": 0.0005295316805131733, + "learning_rate": 5.155513355141056e-07, + "loss": 0.0162, + "num_input_tokens_seen": 205768736, + "step": 169115 + }, + { + "epoch": 18.835059583472546, + "grad_norm": 0.010963229462504387, + "learning_rate": 5.150605548890186e-07, + "loss": 0.0993, + "num_input_tokens_seen": 205775104, + "step": 169120 + }, + { + "epoch": 18.835616438356166, + "grad_norm": 0.006297523155808449, + "learning_rate": 5.145700055437991e-07, + "loss": 0.0635, + "num_input_tokens_seen": 205781088, + "step": 169125 + }, + { + "epoch": 18.83617329323978, + "grad_norm": 0.0007890909910202026, + "learning_rate": 5.14079687483085e-07, + "loss": 0.0107, + "num_input_tokens_seen": 205787168, + "step": 169130 + }, + { + "epoch": 18.8367301481234, + "grad_norm": 0.0006292031612247229, + "learning_rate": 5.135896007115032e-07, + "loss": 0.0557, + "num_input_tokens_seen": 205793216, + "step": 169135 + }, + { + "epoch": 18.837287003007017, + "grad_norm": 0.022846661508083344, + "learning_rate": 5.130997452336889e-07, + "loss": 0.0333, + "num_input_tokens_seen": 205799264, + "step": 169140 + }, + { + "epoch": 18.837843857890633, + "grad_norm": 1.1791326999664307, + "learning_rate": 5.126101210542661e-07, + "loss": 0.0173, + "num_input_tokens_seen": 205805408, + "step": 169145 + }, + { + "epoch": 18.838400712774252, + "grad_norm": 0.10215412825345993, + "learning_rate": 5.12120728177859e-07, + "loss": 0.0874, + "num_input_tokens_seen": 205811616, + "step": 169150 + }, + { + "epoch": 18.838957567657868, + "grad_norm": 0.10571512579917908, + "learning_rate": 5.116315666090887e-07, + "loss": 0.0161, + "num_input_tokens_seen": 205817856, + "step": 169155 + }, + { + "epoch": 18.839514422541484, + "grad_norm": 0.006592242978513241, + "learning_rate": 5.111426363525795e-07, + "loss": 0.0063, + "num_input_tokens_seen": 205823808, + "step": 169160 + }, + { + "epoch": 18.840071277425103, + "grad_norm": 0.0007717109401710331, + "learning_rate": 5.106539374129499e-07, + "loss": 0.0028, + "num_input_tokens_seen": 205829984, + "step": 169165 + }, + { + "epoch": 18.84062813230872, + "grad_norm": 0.00018397398525848985, + "learning_rate": 5.101654697948127e-07, + "loss": 0.0044, + "num_input_tokens_seen": 205836256, + "step": 169170 + }, + { + "epoch": 18.84118498719234, + "grad_norm": 0.015860799700021744, + "learning_rate": 5.09677233502781e-07, + "loss": 0.0615, + "num_input_tokens_seen": 205842496, + "step": 169175 + }, + { + "epoch": 18.841741842075955, + "grad_norm": 0.0028853381518274546, + "learning_rate": 5.091892285414735e-07, + "loss": 0.0128, + "num_input_tokens_seen": 205848704, + "step": 169180 + }, + { + "epoch": 18.842298696959574, + "grad_norm": 0.2544040381908417, + "learning_rate": 5.087014549154917e-07, + "loss": 0.0476, + "num_input_tokens_seen": 205855232, + "step": 169185 + }, + { + "epoch": 18.84285555184319, + "grad_norm": 0.007500781212002039, + "learning_rate": 5.082139126294516e-07, + "loss": 0.0017, + "num_input_tokens_seen": 205861568, + "step": 169190 + }, + { + "epoch": 18.843412406726806, + "grad_norm": 0.00274753849953413, + "learning_rate": 5.077266016879495e-07, + "loss": 0.0467, + "num_input_tokens_seen": 205867808, + "step": 169195 + }, + { + "epoch": 18.843969261610425, + "grad_norm": 0.00013285850582178682, + "learning_rate": 5.072395220955956e-07, + "loss": 0.0056, + "num_input_tokens_seen": 205874400, + "step": 169200 + }, + { + "epoch": 18.84452611649404, + "grad_norm": 0.08269284665584564, + "learning_rate": 5.067526738569834e-07, + "loss": 0.0039, + "num_input_tokens_seen": 205880320, + "step": 169205 + }, + { + "epoch": 18.84508297137766, + "grad_norm": 1.4356287717819214, + "learning_rate": 5.062660569767203e-07, + "loss": 0.0679, + "num_input_tokens_seen": 205886272, + "step": 169210 + }, + { + "epoch": 18.845639826261277, + "grad_norm": 0.008033793419599533, + "learning_rate": 5.05779671459397e-07, + "loss": 0.1055, + "num_input_tokens_seen": 205892608, + "step": 169215 + }, + { + "epoch": 18.846196681144892, + "grad_norm": 0.0037691572215408087, + "learning_rate": 5.052935173096102e-07, + "loss": 0.1142, + "num_input_tokens_seen": 205898112, + "step": 169220 + }, + { + "epoch": 18.846753536028512, + "grad_norm": 0.06627354770898819, + "learning_rate": 5.048075945319475e-07, + "loss": 0.0037, + "num_input_tokens_seen": 205904384, + "step": 169225 + }, + { + "epoch": 18.847310390912128, + "grad_norm": 0.0005166399059817195, + "learning_rate": 5.043219031310053e-07, + "loss": 0.0182, + "num_input_tokens_seen": 205910496, + "step": 169230 + }, + { + "epoch": 18.847867245795747, + "grad_norm": 3.2660391330718994, + "learning_rate": 5.038364431113662e-07, + "loss": 0.1075, + "num_input_tokens_seen": 205916736, + "step": 169235 + }, + { + "epoch": 18.848424100679363, + "grad_norm": 0.17629174888134003, + "learning_rate": 5.033512144776209e-07, + "loss": 0.0198, + "num_input_tokens_seen": 205922848, + "step": 169240 + }, + { + "epoch": 18.84898095556298, + "grad_norm": 0.5617374181747437, + "learning_rate": 5.028662172343462e-07, + "loss": 0.0551, + "num_input_tokens_seen": 205929024, + "step": 169245 + }, + { + "epoch": 18.8495378104466, + "grad_norm": 2.4522197246551514, + "learning_rate": 5.023814513861302e-07, + "loss": 0.173, + "num_input_tokens_seen": 205935104, + "step": 169250 + }, + { + "epoch": 18.850094665330214, + "grad_norm": 0.07460660487413406, + "learning_rate": 5.018969169375443e-07, + "loss": 0.0035, + "num_input_tokens_seen": 205941216, + "step": 169255 + }, + { + "epoch": 18.850651520213834, + "grad_norm": 0.10962363332509995, + "learning_rate": 5.014126138931763e-07, + "loss": 0.0717, + "num_input_tokens_seen": 205947328, + "step": 169260 + }, + { + "epoch": 18.85120837509745, + "grad_norm": 0.6738699674606323, + "learning_rate": 5.009285422575866e-07, + "loss": 0.0156, + "num_input_tokens_seen": 205953440, + "step": 169265 + }, + { + "epoch": 18.851765229981066, + "grad_norm": 1.5647281408309937, + "learning_rate": 5.004447020353603e-07, + "loss": 0.0384, + "num_input_tokens_seen": 205959488, + "step": 169270 + }, + { + "epoch": 18.852322084864685, + "grad_norm": 1.8332232236862183, + "learning_rate": 4.999610932310578e-07, + "loss": 0.067, + "num_input_tokens_seen": 205965632, + "step": 169275 + }, + { + "epoch": 18.8528789397483, + "grad_norm": 3.1130869388580322, + "learning_rate": 4.994777158492559e-07, + "loss": 0.0939, + "num_input_tokens_seen": 205971872, + "step": 169280 + }, + { + "epoch": 18.85343579463192, + "grad_norm": 0.006199406459927559, + "learning_rate": 4.989945698945148e-07, + "loss": 0.0191, + "num_input_tokens_seen": 205978144, + "step": 169285 + }, + { + "epoch": 18.853992649515536, + "grad_norm": 0.010044647380709648, + "learning_rate": 4.985116553714031e-07, + "loss": 0.0094, + "num_input_tokens_seen": 205984288, + "step": 169290 + }, + { + "epoch": 18.854549504399152, + "grad_norm": 0.00013410545943770558, + "learning_rate": 4.980289722844727e-07, + "loss": 0.0234, + "num_input_tokens_seen": 205990400, + "step": 169295 + }, + { + "epoch": 18.85510635928277, + "grad_norm": 0.1268230378627777, + "learning_rate": 4.975465206382951e-07, + "loss": 0.0139, + "num_input_tokens_seen": 205996320, + "step": 169300 + }, + { + "epoch": 18.855663214166388, + "grad_norm": 0.006360685918480158, + "learning_rate": 4.970643004374192e-07, + "loss": 0.0846, + "num_input_tokens_seen": 206002592, + "step": 169305 + }, + { + "epoch": 18.856220069050007, + "grad_norm": 0.04186410456895828, + "learning_rate": 4.965823116864055e-07, + "loss": 0.009, + "num_input_tokens_seen": 206008832, + "step": 169310 + }, + { + "epoch": 18.856776923933623, + "grad_norm": 0.0008299249457195401, + "learning_rate": 4.961005543897973e-07, + "loss": 0.0078, + "num_input_tokens_seen": 206014976, + "step": 169315 + }, + { + "epoch": 18.85733377881724, + "grad_norm": 0.3039889931678772, + "learning_rate": 4.956190285521578e-07, + "loss": 0.0098, + "num_input_tokens_seen": 206021152, + "step": 169320 + }, + { + "epoch": 18.85789063370086, + "grad_norm": 1.041464924812317, + "learning_rate": 4.951377341780251e-07, + "loss": 0.0365, + "num_input_tokens_seen": 206027520, + "step": 169325 + }, + { + "epoch": 18.858447488584474, + "grad_norm": 1.7085555791854858, + "learning_rate": 4.946566712719508e-07, + "loss": 0.0235, + "num_input_tokens_seen": 206033856, + "step": 169330 + }, + { + "epoch": 18.859004343468094, + "grad_norm": 0.6602206230163574, + "learning_rate": 4.941758398384789e-07, + "loss": 0.0304, + "num_input_tokens_seen": 206039360, + "step": 169335 + }, + { + "epoch": 18.85956119835171, + "grad_norm": 0.06911924481391907, + "learning_rate": 4.93695239882147e-07, + "loss": 0.0309, + "num_input_tokens_seen": 206045216, + "step": 169340 + }, + { + "epoch": 18.860118053235325, + "grad_norm": 1.740329384803772, + "learning_rate": 4.932148714074991e-07, + "loss": 0.1702, + "num_input_tokens_seen": 206050848, + "step": 169345 + }, + { + "epoch": 18.860674908118945, + "grad_norm": 0.2631925642490387, + "learning_rate": 4.92734734419073e-07, + "loss": 0.0092, + "num_input_tokens_seen": 206057152, + "step": 169350 + }, + { + "epoch": 18.86123176300256, + "grad_norm": 0.0031695140060037374, + "learning_rate": 4.922548289214012e-07, + "loss": 0.003, + "num_input_tokens_seen": 206063296, + "step": 169355 + }, + { + "epoch": 18.86178861788618, + "grad_norm": 0.0005074512446299195, + "learning_rate": 4.917751549190164e-07, + "loss": 0.1147, + "num_input_tokens_seen": 206069312, + "step": 169360 + }, + { + "epoch": 18.862345472769796, + "grad_norm": 0.011250898241996765, + "learning_rate": 4.912957124164508e-07, + "loss": 0.0034, + "num_input_tokens_seen": 206075552, + "step": 169365 + }, + { + "epoch": 18.862902327653412, + "grad_norm": 0.0006950985407456756, + "learning_rate": 4.90816501418237e-07, + "loss": 0.0366, + "num_input_tokens_seen": 206081696, + "step": 169370 + }, + { + "epoch": 18.86345918253703, + "grad_norm": 0.006615920923650265, + "learning_rate": 4.903375219288936e-07, + "loss": 0.0277, + "num_input_tokens_seen": 206087904, + "step": 169375 + }, + { + "epoch": 18.864016037420647, + "grad_norm": 0.004630311857908964, + "learning_rate": 4.898587739529531e-07, + "loss": 0.009, + "num_input_tokens_seen": 206094080, + "step": 169380 + }, + { + "epoch": 18.864572892304267, + "grad_norm": 0.1277254819869995, + "learning_rate": 4.893802574949285e-07, + "loss": 0.1164, + "num_input_tokens_seen": 206100192, + "step": 169385 + }, + { + "epoch": 18.865129747187883, + "grad_norm": 1.1772639751434326, + "learning_rate": 4.889019725593497e-07, + "loss": 0.0586, + "num_input_tokens_seen": 206106336, + "step": 169390 + }, + { + "epoch": 18.8656866020715, + "grad_norm": 2.589238166809082, + "learning_rate": 4.884239191507239e-07, + "loss": 0.0451, + "num_input_tokens_seen": 206112224, + "step": 169395 + }, + { + "epoch": 18.866243456955118, + "grad_norm": 0.06292901933193207, + "learning_rate": 4.879460972735784e-07, + "loss": 0.0334, + "num_input_tokens_seen": 206118560, + "step": 169400 + }, + { + "epoch": 18.866800311838734, + "grad_norm": 0.03151250630617142, + "learning_rate": 4.874685069324203e-07, + "loss": 0.0664, + "num_input_tokens_seen": 206124640, + "step": 169405 + }, + { + "epoch": 18.867357166722353, + "grad_norm": 0.0018150914693251252, + "learning_rate": 4.869911481317601e-07, + "loss": 0.0023, + "num_input_tokens_seen": 206130720, + "step": 169410 + }, + { + "epoch": 18.86791402160597, + "grad_norm": 0.029225360602140427, + "learning_rate": 4.865140208761054e-07, + "loss": 0.0047, + "num_input_tokens_seen": 206137216, + "step": 169415 + }, + { + "epoch": 18.868470876489585, + "grad_norm": 0.00034236718784086406, + "learning_rate": 4.860371251699691e-07, + "loss": 0.0068, + "num_input_tokens_seen": 206143456, + "step": 169420 + }, + { + "epoch": 18.869027731373205, + "grad_norm": 0.09461873769760132, + "learning_rate": 4.855604610178505e-07, + "loss": 0.0069, + "num_input_tokens_seen": 206149568, + "step": 169425 + }, + { + "epoch": 18.86958458625682, + "grad_norm": 0.9792585372924805, + "learning_rate": 4.850840284242541e-07, + "loss": 0.0158, + "num_input_tokens_seen": 206155616, + "step": 169430 + }, + { + "epoch": 18.87014144114044, + "grad_norm": 0.11654182523488998, + "learning_rate": 4.846078273936794e-07, + "loss": 0.0445, + "num_input_tokens_seen": 206162144, + "step": 169435 + }, + { + "epoch": 18.870698296024056, + "grad_norm": 0.005817152559757233, + "learning_rate": 4.841318579306281e-07, + "loss": 0.0028, + "num_input_tokens_seen": 206168640, + "step": 169440 + }, + { + "epoch": 18.871255150907672, + "grad_norm": 0.04793446883559227, + "learning_rate": 4.836561200395912e-07, + "loss": 0.0113, + "num_input_tokens_seen": 206174848, + "step": 169445 + }, + { + "epoch": 18.87181200579129, + "grad_norm": 1.6266889572143555, + "learning_rate": 4.831806137250649e-07, + "loss": 0.0601, + "num_input_tokens_seen": 206180928, + "step": 169450 + }, + { + "epoch": 18.872368860674907, + "grad_norm": 0.0018866433529183269, + "learning_rate": 4.827053389915404e-07, + "loss": 0.1705, + "num_input_tokens_seen": 206186944, + "step": 169455 + }, + { + "epoch": 18.872925715558527, + "grad_norm": 0.1563398689031601, + "learning_rate": 4.822302958435054e-07, + "loss": 0.0018, + "num_input_tokens_seen": 206193216, + "step": 169460 + }, + { + "epoch": 18.873482570442143, + "grad_norm": 0.2088073194026947, + "learning_rate": 4.817554842854483e-07, + "loss": 0.0235, + "num_input_tokens_seen": 206199360, + "step": 169465 + }, + { + "epoch": 18.87403942532576, + "grad_norm": 1.0411168336868286, + "learning_rate": 4.812809043218569e-07, + "loss": 0.0168, + "num_input_tokens_seen": 206205536, + "step": 169470 + }, + { + "epoch": 18.874596280209378, + "grad_norm": 0.236202210187912, + "learning_rate": 4.808065559572112e-07, + "loss": 0.0418, + "num_input_tokens_seen": 206211584, + "step": 169475 + }, + { + "epoch": 18.875153135092994, + "grad_norm": 0.23957239091396332, + "learning_rate": 4.803324391959907e-07, + "loss": 0.0946, + "num_input_tokens_seen": 206217504, + "step": 169480 + }, + { + "epoch": 18.875709989976613, + "grad_norm": 0.04259364306926727, + "learning_rate": 4.798585540426781e-07, + "loss": 0.0991, + "num_input_tokens_seen": 206223552, + "step": 169485 + }, + { + "epoch": 18.87626684486023, + "grad_norm": 1.0020490884780884, + "learning_rate": 4.79384900501742e-07, + "loss": 0.0777, + "num_input_tokens_seen": 206229760, + "step": 169490 + }, + { + "epoch": 18.876823699743845, + "grad_norm": 0.22821924090385437, + "learning_rate": 4.789114785776649e-07, + "loss": 0.0077, + "num_input_tokens_seen": 206236064, + "step": 169495 + }, + { + "epoch": 18.877380554627464, + "grad_norm": 0.019571403041481972, + "learning_rate": 4.784382882749127e-07, + "loss": 0.0017, + "num_input_tokens_seen": 206242176, + "step": 169500 + }, + { + "epoch": 18.87793740951108, + "grad_norm": 0.01123524084687233, + "learning_rate": 4.779653295979569e-07, + "loss": 0.1032, + "num_input_tokens_seen": 206248192, + "step": 169505 + }, + { + "epoch": 18.8784942643947, + "grad_norm": 0.006282103713601828, + "learning_rate": 4.77492602551266e-07, + "loss": 0.0006, + "num_input_tokens_seen": 206253952, + "step": 169510 + }, + { + "epoch": 18.879051119278316, + "grad_norm": 0.15728135406970978, + "learning_rate": 4.77020107139306e-07, + "loss": 0.0331, + "num_input_tokens_seen": 206260064, + "step": 169515 + }, + { + "epoch": 18.879607974161935, + "grad_norm": 0.059290848672389984, + "learning_rate": 4.7654784336653437e-07, + "loss": 0.0081, + "num_input_tokens_seen": 206266016, + "step": 169520 + }, + { + "epoch": 18.88016482904555, + "grad_norm": 2.6009104251861572, + "learning_rate": 4.760758112374225e-07, + "loss": 0.1489, + "num_input_tokens_seen": 206272096, + "step": 169525 + }, + { + "epoch": 18.880721683929167, + "grad_norm": 0.007317108102142811, + "learning_rate": 4.756040107564169e-07, + "loss": 0.0012, + "num_input_tokens_seen": 206278400, + "step": 169530 + }, + { + "epoch": 18.881278538812786, + "grad_norm": 0.49887609481811523, + "learning_rate": 4.7513244192798347e-07, + "loss": 0.0108, + "num_input_tokens_seen": 206284128, + "step": 169535 + }, + { + "epoch": 18.881835393696402, + "grad_norm": 0.016185805201530457, + "learning_rate": 4.7466110475657134e-07, + "loss": 0.0845, + "num_input_tokens_seen": 206289728, + "step": 169540 + }, + { + "epoch": 18.88239224858002, + "grad_norm": 0.005900051910430193, + "learning_rate": 4.7418999924663533e-07, + "loss": 0.0653, + "num_input_tokens_seen": 206296064, + "step": 169545 + }, + { + "epoch": 18.882949103463638, + "grad_norm": 2.142897129058838, + "learning_rate": 4.7371912540262466e-07, + "loss": 0.0652, + "num_input_tokens_seen": 206302112, + "step": 169550 + }, + { + "epoch": 18.883505958347254, + "grad_norm": 0.0018923977622762322, + "learning_rate": 4.732484832289885e-07, + "loss": 0.0425, + "num_input_tokens_seen": 206308320, + "step": 169555 + }, + { + "epoch": 18.884062813230873, + "grad_norm": 0.0004606013826560229, + "learning_rate": 4.7277807273016783e-07, + "loss": 0.0084, + "num_input_tokens_seen": 206314208, + "step": 169560 + }, + { + "epoch": 18.88461966811449, + "grad_norm": 0.6052801012992859, + "learning_rate": 4.7230789391061183e-07, + "loss": 0.1193, + "num_input_tokens_seen": 206320576, + "step": 169565 + }, + { + "epoch": 18.88517652299811, + "grad_norm": 0.507366955280304, + "learning_rate": 4.7183794677475577e-07, + "loss": 0.065, + "num_input_tokens_seen": 206326560, + "step": 169570 + }, + { + "epoch": 18.885733377881724, + "grad_norm": 0.33350640535354614, + "learning_rate": 4.713682313270462e-07, + "loss": 0.036, + "num_input_tokens_seen": 206331968, + "step": 169575 + }, + { + "epoch": 18.88629023276534, + "grad_norm": 0.38593000173568726, + "learning_rate": 4.708987475719101e-07, + "loss": 0.0113, + "num_input_tokens_seen": 206338176, + "step": 169580 + }, + { + "epoch": 18.88684708764896, + "grad_norm": 0.23178188502788544, + "learning_rate": 4.704294955137939e-07, + "loss": 0.0186, + "num_input_tokens_seen": 206344192, + "step": 169585 + }, + { + "epoch": 18.887403942532575, + "grad_norm": 0.001838016789406538, + "learning_rate": 4.6996047515711904e-07, + "loss": 0.0591, + "num_input_tokens_seen": 206350496, + "step": 169590 + }, + { + "epoch": 18.887960797416195, + "grad_norm": 0.2666119933128357, + "learning_rate": 4.694916865063237e-07, + "loss": 0.0036, + "num_input_tokens_seen": 206356768, + "step": 169595 + }, + { + "epoch": 18.88851765229981, + "grad_norm": 0.05917269363999367, + "learning_rate": 4.6902312956583206e-07, + "loss": 0.0468, + "num_input_tokens_seen": 206362944, + "step": 169600 + }, + { + "epoch": 18.889074507183427, + "grad_norm": 0.0016600607195869088, + "learning_rate": 4.6855480434007113e-07, + "loss": 0.0006, + "num_input_tokens_seen": 206369344, + "step": 169605 + }, + { + "epoch": 18.889631362067046, + "grad_norm": 0.393215537071228, + "learning_rate": 4.6808671083346246e-07, + "loss": 0.008, + "num_input_tokens_seen": 206375840, + "step": 169610 + }, + { + "epoch": 18.890188216950662, + "grad_norm": 0.06202550604939461, + "learning_rate": 4.676188490504302e-07, + "loss": 0.0012, + "num_input_tokens_seen": 206382240, + "step": 169615 + }, + { + "epoch": 18.89074507183428, + "grad_norm": 0.007490172050893307, + "learning_rate": 4.671512189953958e-07, + "loss": 0.0219, + "num_input_tokens_seen": 206388128, + "step": 169620 + }, + { + "epoch": 18.891301926717897, + "grad_norm": 0.3269073963165283, + "learning_rate": 4.666838206727697e-07, + "loss": 0.1525, + "num_input_tokens_seen": 206394016, + "step": 169625 + }, + { + "epoch": 18.891858781601513, + "grad_norm": 0.0720212310552597, + "learning_rate": 4.662166540869706e-07, + "loss": 0.0053, + "num_input_tokens_seen": 206400384, + "step": 169630 + }, + { + "epoch": 18.892415636485133, + "grad_norm": 0.9057499170303345, + "learning_rate": 4.657497192424143e-07, + "loss": 0.0563, + "num_input_tokens_seen": 206406176, + "step": 169635 + }, + { + "epoch": 18.89297249136875, + "grad_norm": 2.176405906677246, + "learning_rate": 4.6528301614350843e-07, + "loss": 0.0864, + "num_input_tokens_seen": 206412480, + "step": 169640 + }, + { + "epoch": 18.893529346252368, + "grad_norm": 0.008662041276693344, + "learning_rate": 4.6481654479466065e-07, + "loss": 0.0061, + "num_input_tokens_seen": 206418752, + "step": 169645 + }, + { + "epoch": 18.894086201135984, + "grad_norm": 0.16801020503044128, + "learning_rate": 4.643503052002757e-07, + "loss": 0.0018, + "num_input_tokens_seen": 206424992, + "step": 169650 + }, + { + "epoch": 18.8946430560196, + "grad_norm": 0.13450752198696136, + "learning_rate": 4.6388429736476115e-07, + "loss": 0.0549, + "num_input_tokens_seen": 206430752, + "step": 169655 + }, + { + "epoch": 18.89519991090322, + "grad_norm": 1.9262608289718628, + "learning_rate": 4.634185212925163e-07, + "loss": 0.0987, + "num_input_tokens_seen": 206437088, + "step": 169660 + }, + { + "epoch": 18.895756765786835, + "grad_norm": 1.2065002918243408, + "learning_rate": 4.6295297698794317e-07, + "loss": 0.0793, + "num_input_tokens_seen": 206443200, + "step": 169665 + }, + { + "epoch": 18.896313620670455, + "grad_norm": 0.017051592469215393, + "learning_rate": 4.6248766445543824e-07, + "loss": 0.0003, + "num_input_tokens_seen": 206449248, + "step": 169670 + }, + { + "epoch": 18.89687047555407, + "grad_norm": 0.17324937880039215, + "learning_rate": 4.6202258369939797e-07, + "loss": 0.0298, + "num_input_tokens_seen": 206455136, + "step": 169675 + }, + { + "epoch": 18.897427330437687, + "grad_norm": 0.48967617750167847, + "learning_rate": 4.615577347242106e-07, + "loss": 0.0066, + "num_input_tokens_seen": 206461184, + "step": 169680 + }, + { + "epoch": 18.897984185321306, + "grad_norm": 0.0005918223178014159, + "learning_rate": 4.6109311753427253e-07, + "loss": 0.1033, + "num_input_tokens_seen": 206467456, + "step": 169685 + }, + { + "epoch": 18.898541040204922, + "grad_norm": 0.2993960976600647, + "learning_rate": 4.606287321339692e-07, + "loss": 0.0065, + "num_input_tokens_seen": 206473536, + "step": 169690 + }, + { + "epoch": 18.89909789508854, + "grad_norm": 0.18263818323612213, + "learning_rate": 4.6016457852768866e-07, + "loss": 0.0203, + "num_input_tokens_seen": 206479776, + "step": 169695 + }, + { + "epoch": 18.899654749972157, + "grad_norm": 0.2202058732509613, + "learning_rate": 4.5970065671981365e-07, + "loss": 0.0388, + "num_input_tokens_seen": 206485536, + "step": 169700 + }, + { + "epoch": 18.900211604855773, + "grad_norm": 1.349770426750183, + "learning_rate": 4.592369667147295e-07, + "loss": 0.1039, + "num_input_tokens_seen": 206491648, + "step": 169705 + }, + { + "epoch": 18.900768459739393, + "grad_norm": 0.020013893023133278, + "learning_rate": 4.587735085168104e-07, + "loss": 0.0049, + "num_input_tokens_seen": 206497856, + "step": 169710 + }, + { + "epoch": 18.90132531462301, + "grad_norm": 0.03320617601275444, + "learning_rate": 4.583102821304419e-07, + "loss": 0.0542, + "num_input_tokens_seen": 206504160, + "step": 169715 + }, + { + "epoch": 18.901882169506628, + "grad_norm": 0.08645232021808624, + "learning_rate": 4.5784728755998983e-07, + "loss": 0.0816, + "num_input_tokens_seen": 206509600, + "step": 169720 + }, + { + "epoch": 18.902439024390244, + "grad_norm": 0.016067985445261, + "learning_rate": 4.5738452480983685e-07, + "loss": 0.0009, + "num_input_tokens_seen": 206515904, + "step": 169725 + }, + { + "epoch": 18.90299587927386, + "grad_norm": 0.04384758323431015, + "learning_rate": 4.5692199388434885e-07, + "loss": 0.0485, + "num_input_tokens_seen": 206522080, + "step": 169730 + }, + { + "epoch": 18.90355273415748, + "grad_norm": 0.000148735023685731, + "learning_rate": 4.564596947878974e-07, + "loss": 0.1723, + "num_input_tokens_seen": 206527968, + "step": 169735 + }, + { + "epoch": 18.904109589041095, + "grad_norm": 0.0011766162933781743, + "learning_rate": 4.5599762752484843e-07, + "loss": 0.08, + "num_input_tokens_seen": 206533760, + "step": 169740 + }, + { + "epoch": 18.904666443924715, + "grad_norm": 0.011859670281410217, + "learning_rate": 4.555357920995651e-07, + "loss": 0.08, + "num_input_tokens_seen": 206539616, + "step": 169745 + }, + { + "epoch": 18.90522329880833, + "grad_norm": 0.0001380154280923307, + "learning_rate": 4.550741885164106e-07, + "loss": 0.0425, + "num_input_tokens_seen": 206545824, + "step": 169750 + }, + { + "epoch": 18.905780153691946, + "grad_norm": 0.2804212272167206, + "learning_rate": 4.546128167797453e-07, + "loss": 0.0159, + "num_input_tokens_seen": 206552192, + "step": 169755 + }, + { + "epoch": 18.906337008575566, + "grad_norm": 0.0021440633572638035, + "learning_rate": 4.541516768939297e-07, + "loss": 0.0235, + "num_input_tokens_seen": 206558208, + "step": 169760 + }, + { + "epoch": 18.90689386345918, + "grad_norm": 0.009468570351600647, + "learning_rate": 4.5369076886331574e-07, + "loss": 0.0062, + "num_input_tokens_seen": 206564160, + "step": 169765 + }, + { + "epoch": 18.9074507183428, + "grad_norm": 0.009303784929215908, + "learning_rate": 4.532300926922584e-07, + "loss": 0.004, + "num_input_tokens_seen": 206570624, + "step": 169770 + }, + { + "epoch": 18.908007573226417, + "grad_norm": 1.1115152835845947, + "learning_rate": 4.527696483851096e-07, + "loss": 0.0412, + "num_input_tokens_seen": 206576608, + "step": 169775 + }, + { + "epoch": 18.908564428110033, + "grad_norm": 0.015909936279058456, + "learning_rate": 4.5230943594621597e-07, + "loss": 0.0446, + "num_input_tokens_seen": 206582624, + "step": 169780 + }, + { + "epoch": 18.909121282993652, + "grad_norm": 0.046595051884651184, + "learning_rate": 4.518494553799324e-07, + "loss": 0.0018, + "num_input_tokens_seen": 206588576, + "step": 169785 + }, + { + "epoch": 18.90967813787727, + "grad_norm": 0.5144041180610657, + "learning_rate": 4.5138970669059423e-07, + "loss": 0.0117, + "num_input_tokens_seen": 206594912, + "step": 169790 + }, + { + "epoch": 18.910234992760888, + "grad_norm": 0.00010123615356860682, + "learning_rate": 4.5093018988255076e-07, + "loss": 0.0236, + "num_input_tokens_seen": 206600992, + "step": 169795 + }, + { + "epoch": 18.910791847644504, + "grad_norm": 0.03104368783533573, + "learning_rate": 4.5047090496013745e-07, + "loss": 0.0082, + "num_input_tokens_seen": 206607072, + "step": 169800 + }, + { + "epoch": 18.91134870252812, + "grad_norm": 1.7815523147583008, + "learning_rate": 4.5001185192769524e-07, + "loss": 0.1265, + "num_input_tokens_seen": 206612352, + "step": 169805 + }, + { + "epoch": 18.91190555741174, + "grad_norm": 0.00023764312209095806, + "learning_rate": 4.495530307895623e-07, + "loss": 0.026, + "num_input_tokens_seen": 206618464, + "step": 169810 + }, + { + "epoch": 18.912462412295355, + "grad_norm": 0.00030742009403184056, + "learning_rate": 4.490944415500714e-07, + "loss": 0.0419, + "num_input_tokens_seen": 206624672, + "step": 169815 + }, + { + "epoch": 18.913019267178974, + "grad_norm": 0.029526636004447937, + "learning_rate": 4.486360842135495e-07, + "loss": 0.0009, + "num_input_tokens_seen": 206630912, + "step": 169820 + }, + { + "epoch": 18.91357612206259, + "grad_norm": 0.00016270802007056773, + "learning_rate": 4.481779587843321e-07, + "loss": 0.0166, + "num_input_tokens_seen": 206637184, + "step": 169825 + }, + { + "epoch": 18.914132976946206, + "grad_norm": 0.060013547539711, + "learning_rate": 4.4772006526674625e-07, + "loss": 0.0018, + "num_input_tokens_seen": 206643456, + "step": 169830 + }, + { + "epoch": 18.914689831829826, + "grad_norm": 0.20844538509845734, + "learning_rate": 4.4726240366511354e-07, + "loss": 0.0673, + "num_input_tokens_seen": 206649472, + "step": 169835 + }, + { + "epoch": 18.91524668671344, + "grad_norm": 0.2401028722524643, + "learning_rate": 4.468049739837582e-07, + "loss": 0.0107, + "num_input_tokens_seen": 206655488, + "step": 169840 + }, + { + "epoch": 18.91580354159706, + "grad_norm": 0.8610845804214478, + "learning_rate": 4.4634777622700187e-07, + "loss": 0.0174, + "num_input_tokens_seen": 206661536, + "step": 169845 + }, + { + "epoch": 18.916360396480677, + "grad_norm": 0.44477593898773193, + "learning_rate": 4.4589081039916047e-07, + "loss": 0.0312, + "num_input_tokens_seen": 206667584, + "step": 169850 + }, + { + "epoch": 18.916917251364296, + "grad_norm": 2.7132949829101562, + "learning_rate": 4.4543407650455836e-07, + "loss": 0.2483, + "num_input_tokens_seen": 206673760, + "step": 169855 + }, + { + "epoch": 18.917474106247912, + "grad_norm": 0.007497263606637716, + "learning_rate": 4.4497757454750044e-07, + "loss": 0.0214, + "num_input_tokens_seen": 206679968, + "step": 169860 + }, + { + "epoch": 18.918030961131528, + "grad_norm": 0.008559366688132286, + "learning_rate": 4.445213045323027e-07, + "loss": 0.0014, + "num_input_tokens_seen": 206686112, + "step": 169865 + }, + { + "epoch": 18.918587816015147, + "grad_norm": 1.4778553247451782, + "learning_rate": 4.440652664632755e-07, + "loss": 0.0438, + "num_input_tokens_seen": 206692288, + "step": 169870 + }, + { + "epoch": 18.919144670898763, + "grad_norm": 0.6622416973114014, + "learning_rate": 4.436094603447266e-07, + "loss": 0.0221, + "num_input_tokens_seen": 206698304, + "step": 169875 + }, + { + "epoch": 18.91970152578238, + "grad_norm": 0.0018269732827320695, + "learning_rate": 4.431538861809581e-07, + "loss": 0.0318, + "num_input_tokens_seen": 206704256, + "step": 169880 + }, + { + "epoch": 18.920258380666, + "grad_norm": 0.008285341784358025, + "learning_rate": 4.426985439762804e-07, + "loss": 0.0165, + "num_input_tokens_seen": 206710560, + "step": 169885 + }, + { + "epoch": 18.920815235549615, + "grad_norm": 0.008289029821753502, + "learning_rate": 4.4224343373498736e-07, + "loss": 0.0037, + "num_input_tokens_seen": 206716928, + "step": 169890 + }, + { + "epoch": 18.921372090433234, + "grad_norm": 0.6341497898101807, + "learning_rate": 4.417885554613782e-07, + "loss": 0.0424, + "num_input_tokens_seen": 206723168, + "step": 169895 + }, + { + "epoch": 18.92192894531685, + "grad_norm": 0.0990942195057869, + "learning_rate": 4.4133390915975236e-07, + "loss": 0.0302, + "num_input_tokens_seen": 206729248, + "step": 169900 + }, + { + "epoch": 18.92248580020047, + "grad_norm": 1.008005142211914, + "learning_rate": 4.4087949483440636e-07, + "loss": 0.0221, + "num_input_tokens_seen": 206735424, + "step": 169905 + }, + { + "epoch": 18.923042655084085, + "grad_norm": 1.4314079284667969, + "learning_rate": 4.404253124896285e-07, + "loss": 0.0571, + "num_input_tokens_seen": 206741504, + "step": 169910 + }, + { + "epoch": 18.9235995099677, + "grad_norm": 0.4760587513446808, + "learning_rate": 4.399713621297097e-07, + "loss": 0.0107, + "num_input_tokens_seen": 206747456, + "step": 169915 + }, + { + "epoch": 18.92415636485132, + "grad_norm": 0.09574143588542938, + "learning_rate": 4.39517643758941e-07, + "loss": 0.0015, + "num_input_tokens_seen": 206753824, + "step": 169920 + }, + { + "epoch": 18.924713219734937, + "grad_norm": 0.029062828049063683, + "learning_rate": 4.390641573816023e-07, + "loss": 0.0132, + "num_input_tokens_seen": 206759936, + "step": 169925 + }, + { + "epoch": 18.925270074618556, + "grad_norm": 1.862719178199768, + "learning_rate": 4.3861090300198473e-07, + "loss": 0.0892, + "num_input_tokens_seen": 206766208, + "step": 169930 + }, + { + "epoch": 18.925826929502172, + "grad_norm": 0.004660699516534805, + "learning_rate": 4.3815788062435967e-07, + "loss": 0.0232, + "num_input_tokens_seen": 206772320, + "step": 169935 + }, + { + "epoch": 18.926383784385788, + "grad_norm": 0.0173567496240139, + "learning_rate": 4.377050902530155e-07, + "loss": 0.0449, + "num_input_tokens_seen": 206778400, + "step": 169940 + }, + { + "epoch": 18.926940639269407, + "grad_norm": 1.4303114414215088, + "learning_rate": 4.372525318922266e-07, + "loss": 0.0569, + "num_input_tokens_seen": 206784544, + "step": 169945 + }, + { + "epoch": 18.927497494153023, + "grad_norm": 0.7363033294677734, + "learning_rate": 4.3680020554626446e-07, + "loss": 0.0067, + "num_input_tokens_seen": 206790944, + "step": 169950 + }, + { + "epoch": 18.928054349036643, + "grad_norm": 0.9444904327392578, + "learning_rate": 4.363481112194062e-07, + "loss": 0.0255, + "num_input_tokens_seen": 206797184, + "step": 169955 + }, + { + "epoch": 18.92861120392026, + "grad_norm": 0.07261057943105698, + "learning_rate": 4.3589624891592073e-07, + "loss": 0.0323, + "num_input_tokens_seen": 206803136, + "step": 169960 + }, + { + "epoch": 18.929168058803874, + "grad_norm": 0.000488394231069833, + "learning_rate": 4.3544461864007126e-07, + "loss": 0.003, + "num_input_tokens_seen": 206809600, + "step": 169965 + }, + { + "epoch": 18.929724913687494, + "grad_norm": 0.01055452786386013, + "learning_rate": 4.349932203961321e-07, + "loss": 0.0044, + "num_input_tokens_seen": 206815744, + "step": 169970 + }, + { + "epoch": 18.93028176857111, + "grad_norm": 2.0964996814727783, + "learning_rate": 4.345420541883638e-07, + "loss": 0.0444, + "num_input_tokens_seen": 206821632, + "step": 169975 + }, + { + "epoch": 18.93083862345473, + "grad_norm": 0.8581411838531494, + "learning_rate": 4.3409112002102683e-07, + "loss": 0.043, + "num_input_tokens_seen": 206827744, + "step": 169980 + }, + { + "epoch": 18.931395478338345, + "grad_norm": 0.0397479273378849, + "learning_rate": 4.3364041789837885e-07, + "loss": 0.0147, + "num_input_tokens_seen": 206833952, + "step": 169985 + }, + { + "epoch": 18.93195233322196, + "grad_norm": 0.0021758261136710644, + "learning_rate": 4.331899478246804e-07, + "loss": 0.081, + "num_input_tokens_seen": 206840224, + "step": 169990 + }, + { + "epoch": 18.93250918810558, + "grad_norm": 0.0003950317914132029, + "learning_rate": 4.3273970980418356e-07, + "loss": 0.0796, + "num_input_tokens_seen": 206846016, + "step": 169995 + }, + { + "epoch": 18.933066042989196, + "grad_norm": 0.8792131543159485, + "learning_rate": 4.3228970384114887e-07, + "loss": 0.0262, + "num_input_tokens_seen": 206852256, + "step": 170000 + }, + { + "epoch": 18.933622897872816, + "grad_norm": 0.0016149005386978388, + "learning_rate": 4.318399299398146e-07, + "loss": 0.0199, + "num_input_tokens_seen": 206857696, + "step": 170005 + }, + { + "epoch": 18.93417975275643, + "grad_norm": 0.0027526484336704016, + "learning_rate": 4.3139038810443845e-07, + "loss": 0.0262, + "num_input_tokens_seen": 206864032, + "step": 170010 + }, + { + "epoch": 18.934736607640048, + "grad_norm": 1.317991018295288, + "learning_rate": 4.3094107833926424e-07, + "loss": 0.106, + "num_input_tokens_seen": 206869920, + "step": 170015 + }, + { + "epoch": 18.935293462523667, + "grad_norm": 0.0027167778462171555, + "learning_rate": 4.304920006485358e-07, + "loss": 0.0159, + "num_input_tokens_seen": 206875872, + "step": 170020 + }, + { + "epoch": 18.935850317407283, + "grad_norm": 0.15163880586624146, + "learning_rate": 4.3004315503649697e-07, + "loss": 0.003, + "num_input_tokens_seen": 206882208, + "step": 170025 + }, + { + "epoch": 18.936407172290902, + "grad_norm": 0.8887814879417419, + "learning_rate": 4.29594541507386e-07, + "loss": 0.1348, + "num_input_tokens_seen": 206888480, + "step": 170030 + }, + { + "epoch": 18.93696402717452, + "grad_norm": 0.010146865621209145, + "learning_rate": 4.291461600654356e-07, + "loss": 0.033, + "num_input_tokens_seen": 206894592, + "step": 170035 + }, + { + "epoch": 18.937520882058134, + "grad_norm": 0.03657647222280502, + "learning_rate": 4.2869801071488967e-07, + "loss": 0.0971, + "num_input_tokens_seen": 206900160, + "step": 170040 + }, + { + "epoch": 18.938077736941754, + "grad_norm": 4.14870023727417, + "learning_rate": 4.2825009345997537e-07, + "loss": 0.0382, + "num_input_tokens_seen": 206906304, + "step": 170045 + }, + { + "epoch": 18.93863459182537, + "grad_norm": 2.6239049434661865, + "learning_rate": 4.2780240830492536e-07, + "loss": 0.1781, + "num_input_tokens_seen": 206912288, + "step": 170050 + }, + { + "epoch": 18.93919144670899, + "grad_norm": 0.04385000094771385, + "learning_rate": 4.2735495525396965e-07, + "loss": 0.006, + "num_input_tokens_seen": 206918240, + "step": 170055 + }, + { + "epoch": 18.939748301592605, + "grad_norm": 0.10306061059236526, + "learning_rate": 4.2690773431133256e-07, + "loss": 0.0686, + "num_input_tokens_seen": 206924448, + "step": 170060 + }, + { + "epoch": 18.94030515647622, + "grad_norm": 0.0029680971056222916, + "learning_rate": 4.264607454812386e-07, + "loss": 0.009, + "num_input_tokens_seen": 206930624, + "step": 170065 + }, + { + "epoch": 18.94086201135984, + "grad_norm": 0.12991978228092194, + "learning_rate": 4.260139887679121e-07, + "loss": 0.0697, + "num_input_tokens_seen": 206936800, + "step": 170070 + }, + { + "epoch": 18.941418866243456, + "grad_norm": 0.002603587694466114, + "learning_rate": 4.255674641755747e-07, + "loss": 0.0329, + "num_input_tokens_seen": 206942976, + "step": 170075 + }, + { + "epoch": 18.941975721127076, + "grad_norm": 0.05130919814109802, + "learning_rate": 4.2512117170843967e-07, + "loss": 0.0165, + "num_input_tokens_seen": 206949056, + "step": 170080 + }, + { + "epoch": 18.94253257601069, + "grad_norm": 1.0830034017562866, + "learning_rate": 4.2467511137072034e-07, + "loss": 0.1118, + "num_input_tokens_seen": 206954624, + "step": 170085 + }, + { + "epoch": 18.943089430894307, + "grad_norm": 0.7146207690238953, + "learning_rate": 4.2422928316663835e-07, + "loss": 0.0583, + "num_input_tokens_seen": 206960064, + "step": 170090 + }, + { + "epoch": 18.943646285777927, + "grad_norm": 1.4496463537216187, + "learning_rate": 4.2378368710039864e-07, + "loss": 0.0471, + "num_input_tokens_seen": 206965696, + "step": 170095 + }, + { + "epoch": 18.944203140661543, + "grad_norm": 0.03903974965214729, + "learning_rate": 4.233383231762145e-07, + "loss": 0.0369, + "num_input_tokens_seen": 206971488, + "step": 170100 + }, + { + "epoch": 18.944759995545162, + "grad_norm": 0.4108138978481293, + "learning_rate": 4.228931913982853e-07, + "loss": 0.009, + "num_input_tokens_seen": 206977376, + "step": 170105 + }, + { + "epoch": 18.945316850428778, + "grad_norm": 0.47841233015060425, + "learning_rate": 4.2244829177082446e-07, + "loss": 0.0443, + "num_input_tokens_seen": 206983808, + "step": 170110 + }, + { + "epoch": 18.945873705312394, + "grad_norm": 1.2954422235488892, + "learning_rate": 4.220036242980313e-07, + "loss": 0.0335, + "num_input_tokens_seen": 206989952, + "step": 170115 + }, + { + "epoch": 18.946430560196013, + "grad_norm": 0.0480181910097599, + "learning_rate": 4.215591889841053e-07, + "loss": 0.0278, + "num_input_tokens_seen": 206995712, + "step": 170120 + }, + { + "epoch": 18.94698741507963, + "grad_norm": 0.030509566888213158, + "learning_rate": 4.2111498583324306e-07, + "loss": 0.0015, + "num_input_tokens_seen": 207001600, + "step": 170125 + }, + { + "epoch": 18.94754426996325, + "grad_norm": 0.8909785151481628, + "learning_rate": 4.2067101484964397e-07, + "loss": 0.0045, + "num_input_tokens_seen": 207008160, + "step": 170130 + }, + { + "epoch": 18.948101124846865, + "grad_norm": 0.004645465407520533, + "learning_rate": 4.2022727603749647e-07, + "loss": 0.0886, + "num_input_tokens_seen": 207014176, + "step": 170135 + }, + { + "epoch": 18.94865797973048, + "grad_norm": 0.4215911328792572, + "learning_rate": 4.197837694009971e-07, + "loss": 0.0778, + "num_input_tokens_seen": 207020128, + "step": 170140 + }, + { + "epoch": 18.9492148346141, + "grad_norm": 0.033397626131772995, + "learning_rate": 4.1934049494433415e-07, + "loss": 0.0207, + "num_input_tokens_seen": 207026400, + "step": 170145 + }, + { + "epoch": 18.949771689497716, + "grad_norm": 0.5255991220474243, + "learning_rate": 4.18897452671696e-07, + "loss": 0.0591, + "num_input_tokens_seen": 207032480, + "step": 170150 + }, + { + "epoch": 18.950328544381335, + "grad_norm": 1.3734952211380005, + "learning_rate": 4.1845464258725985e-07, + "loss": 0.0693, + "num_input_tokens_seen": 207038496, + "step": 170155 + }, + { + "epoch": 18.95088539926495, + "grad_norm": 1.3312995433807373, + "learning_rate": 4.180120646952196e-07, + "loss": 0.0601, + "num_input_tokens_seen": 207044512, + "step": 170160 + }, + { + "epoch": 18.951442254148567, + "grad_norm": 0.29081571102142334, + "learning_rate": 4.1756971899974683e-07, + "loss": 0.0157, + "num_input_tokens_seen": 207050624, + "step": 170165 + }, + { + "epoch": 18.951999109032187, + "grad_norm": 0.27628955245018005, + "learning_rate": 4.171276055050244e-07, + "loss": 0.0023, + "num_input_tokens_seen": 207056864, + "step": 170170 + }, + { + "epoch": 18.952555963915803, + "grad_norm": 0.007580580655485392, + "learning_rate": 4.166857242152267e-07, + "loss": 0.0411, + "num_input_tokens_seen": 207062752, + "step": 170175 + }, + { + "epoch": 18.953112818799422, + "grad_norm": 1.2518060207366943, + "learning_rate": 4.1624407513452814e-07, + "loss": 0.0329, + "num_input_tokens_seen": 207068768, + "step": 170180 + }, + { + "epoch": 18.953669673683038, + "grad_norm": 1.9683772325515747, + "learning_rate": 4.158026582670976e-07, + "loss": 0.1053, + "num_input_tokens_seen": 207075040, + "step": 170185 + }, + { + "epoch": 18.954226528566657, + "grad_norm": 0.029693271964788437, + "learning_rate": 4.153614736171152e-07, + "loss": 0.002, + "num_input_tokens_seen": 207081344, + "step": 170190 + }, + { + "epoch": 18.954783383450273, + "grad_norm": 0.7350958585739136, + "learning_rate": 4.14920521188733e-07, + "loss": 0.124, + "num_input_tokens_seen": 207087296, + "step": 170195 + }, + { + "epoch": 18.95534023833389, + "grad_norm": 0.8711121082305908, + "learning_rate": 4.1447980098612836e-07, + "loss": 0.0483, + "num_input_tokens_seen": 207093664, + "step": 170200 + }, + { + "epoch": 18.95589709321751, + "grad_norm": 0.0002776084002107382, + "learning_rate": 4.1403931301345625e-07, + "loss": 0.0134, + "num_input_tokens_seen": 207099808, + "step": 170205 + }, + { + "epoch": 18.956453948101124, + "grad_norm": 0.009648945182561874, + "learning_rate": 4.135990572748827e-07, + "loss": 0.042, + "num_input_tokens_seen": 207105856, + "step": 170210 + }, + { + "epoch": 18.95701080298474, + "grad_norm": 0.04705776274204254, + "learning_rate": 4.1315903377456553e-07, + "loss": 0.0366, + "num_input_tokens_seen": 207112192, + "step": 170215 + }, + { + "epoch": 18.95756765786836, + "grad_norm": 0.6910872459411621, + "learning_rate": 4.1271924251665707e-07, + "loss": 0.0151, + "num_input_tokens_seen": 207118176, + "step": 170220 + }, + { + "epoch": 18.958124512751976, + "grad_norm": 1.807418704032898, + "learning_rate": 4.1227968350531497e-07, + "loss": 0.1108, + "num_input_tokens_seen": 207124256, + "step": 170225 + }, + { + "epoch": 18.958681367635595, + "grad_norm": 0.07901239395141602, + "learning_rate": 4.1184035674469155e-07, + "loss": 0.0343, + "num_input_tokens_seen": 207130528, + "step": 170230 + }, + { + "epoch": 18.95923822251921, + "grad_norm": 0.029110858216881752, + "learning_rate": 4.1140126223893626e-07, + "loss": 0.0039, + "num_input_tokens_seen": 207136576, + "step": 170235 + }, + { + "epoch": 18.95979507740283, + "grad_norm": 0.0010984207037836313, + "learning_rate": 4.1096239999219575e-07, + "loss": 0.0127, + "num_input_tokens_seen": 207142624, + "step": 170240 + }, + { + "epoch": 18.960351932286446, + "grad_norm": 0.0662916973233223, + "learning_rate": 4.1052377000861397e-07, + "loss": 0.0034, + "num_input_tokens_seen": 207148800, + "step": 170245 + }, + { + "epoch": 18.960908787170062, + "grad_norm": 0.3300737738609314, + "learning_rate": 4.100853722923376e-07, + "loss": 0.2081, + "num_input_tokens_seen": 207154784, + "step": 170250 + }, + { + "epoch": 18.961465642053682, + "grad_norm": 0.026680562645196915, + "learning_rate": 4.096472068475049e-07, + "loss": 0.0074, + "num_input_tokens_seen": 207160992, + "step": 170255 + }, + { + "epoch": 18.962022496937298, + "grad_norm": 0.00041855184826999903, + "learning_rate": 4.092092736782599e-07, + "loss": 0.0827, + "num_input_tokens_seen": 207166272, + "step": 170260 + }, + { + "epoch": 18.962579351820917, + "grad_norm": 0.00034440940362401307, + "learning_rate": 4.087715727887298e-07, + "loss": 0.0245, + "num_input_tokens_seen": 207172160, + "step": 170265 + }, + { + "epoch": 18.963136206704533, + "grad_norm": 0.009193120524287224, + "learning_rate": 4.0833410418305575e-07, + "loss": 0.0114, + "num_input_tokens_seen": 207178272, + "step": 170270 + }, + { + "epoch": 18.96369306158815, + "grad_norm": 0.00048232355038635433, + "learning_rate": 4.0789686786536773e-07, + "loss": 0.0623, + "num_input_tokens_seen": 207184608, + "step": 170275 + }, + { + "epoch": 18.96424991647177, + "grad_norm": 0.0038394087459892035, + "learning_rate": 4.074598638397986e-07, + "loss": 0.0102, + "num_input_tokens_seen": 207190944, + "step": 170280 + }, + { + "epoch": 18.964806771355384, + "grad_norm": 2.0130362510681152, + "learning_rate": 4.0702309211047564e-07, + "loss": 0.0666, + "num_input_tokens_seen": 207196864, + "step": 170285 + }, + { + "epoch": 18.965363626239004, + "grad_norm": 0.0902988389134407, + "learning_rate": 4.0658655268152046e-07, + "loss": 0.0157, + "num_input_tokens_seen": 207203040, + "step": 170290 + }, + { + "epoch": 18.96592048112262, + "grad_norm": 9.344614954898134e-05, + "learning_rate": 4.061502455570604e-07, + "loss": 0.0327, + "num_input_tokens_seen": 207208736, + "step": 170295 + }, + { + "epoch": 18.966477336006236, + "grad_norm": 0.00017649243818596005, + "learning_rate": 4.057141707412143e-07, + "loss": 0.0817, + "num_input_tokens_seen": 207214976, + "step": 170300 + }, + { + "epoch": 18.967034190889855, + "grad_norm": 0.017565594986081123, + "learning_rate": 4.05278328238104e-07, + "loss": 0.0019, + "num_input_tokens_seen": 207221088, + "step": 170305 + }, + { + "epoch": 18.96759104577347, + "grad_norm": 0.004453999921679497, + "learning_rate": 4.048427180518455e-07, + "loss": 0.1173, + "num_input_tokens_seen": 207226944, + "step": 170310 + }, + { + "epoch": 18.96814790065709, + "grad_norm": 0.13082535564899445, + "learning_rate": 4.044073401865522e-07, + "loss": 0.0363, + "num_input_tokens_seen": 207232928, + "step": 170315 + }, + { + "epoch": 18.968704755540706, + "grad_norm": 0.007772121112793684, + "learning_rate": 4.0397219464633484e-07, + "loss": 0.0588, + "num_input_tokens_seen": 207239040, + "step": 170320 + }, + { + "epoch": 18.969261610424322, + "grad_norm": 0.6170415878295898, + "learning_rate": 4.0353728143530946e-07, + "loss": 0.0104, + "num_input_tokens_seen": 207245152, + "step": 170325 + }, + { + "epoch": 18.96981846530794, + "grad_norm": 8.936734957387671e-05, + "learning_rate": 4.0310260055757554e-07, + "loss": 0.007, + "num_input_tokens_seen": 207251200, + "step": 170330 + }, + { + "epoch": 18.970375320191557, + "grad_norm": 0.008470756933093071, + "learning_rate": 4.0266815201725206e-07, + "loss": 0.0049, + "num_input_tokens_seen": 207257408, + "step": 170335 + }, + { + "epoch": 18.970932175075177, + "grad_norm": 2.402287244796753, + "learning_rate": 4.022339358184302e-07, + "loss": 0.0382, + "num_input_tokens_seen": 207263616, + "step": 170340 + }, + { + "epoch": 18.971489029958793, + "grad_norm": 0.002152453176677227, + "learning_rate": 4.017999519652149e-07, + "loss": 0.121, + "num_input_tokens_seen": 207269504, + "step": 170345 + }, + { + "epoch": 18.97204588484241, + "grad_norm": 0.5800235867500305, + "learning_rate": 4.013662004617086e-07, + "loss": 0.0042, + "num_input_tokens_seen": 207275584, + "step": 170350 + }, + { + "epoch": 18.972602739726028, + "grad_norm": 0.0015107891522347927, + "learning_rate": 4.009326813120079e-07, + "loss": 0.0285, + "num_input_tokens_seen": 207281152, + "step": 170355 + }, + { + "epoch": 18.973159594609644, + "grad_norm": 0.22137947380542755, + "learning_rate": 4.004993945202068e-07, + "loss": 0.0382, + "num_input_tokens_seen": 207287072, + "step": 170360 + }, + { + "epoch": 18.973716449493264, + "grad_norm": 0.4575843811035156, + "learning_rate": 4.0006634009039643e-07, + "loss": 0.0084, + "num_input_tokens_seen": 207293312, + "step": 170365 + }, + { + "epoch": 18.97427330437688, + "grad_norm": 0.15920504927635193, + "learning_rate": 3.996335180266653e-07, + "loss": 0.0078, + "num_input_tokens_seen": 207299488, + "step": 170370 + }, + { + "epoch": 18.974830159260495, + "grad_norm": 6.480058073066175e-05, + "learning_rate": 3.9920092833310995e-07, + "loss": 0.0102, + "num_input_tokens_seen": 207306080, + "step": 170375 + }, + { + "epoch": 18.975387014144115, + "grad_norm": 0.0006028510397300124, + "learning_rate": 3.987685710138106e-07, + "loss": 0.0005, + "num_input_tokens_seen": 207312416, + "step": 170380 + }, + { + "epoch": 18.97594386902773, + "grad_norm": 0.00016862867050804198, + "learning_rate": 3.983364460728528e-07, + "loss": 0.0784, + "num_input_tokens_seen": 207318656, + "step": 170385 + }, + { + "epoch": 18.97650072391135, + "grad_norm": 0.12433238327503204, + "learning_rate": 3.979045535143139e-07, + "loss": 0.0187, + "num_input_tokens_seen": 207324928, + "step": 170390 + }, + { + "epoch": 18.977057578794966, + "grad_norm": 0.0016539456555619836, + "learning_rate": 3.9747289334227943e-07, + "loss": 0.0104, + "num_input_tokens_seen": 207331040, + "step": 170395 + }, + { + "epoch": 18.977614433678582, + "grad_norm": 0.01351210381835699, + "learning_rate": 3.97041465560824e-07, + "loss": 0.0091, + "num_input_tokens_seen": 207337632, + "step": 170400 + }, + { + "epoch": 18.9781712885622, + "grad_norm": 1.0269827842712402, + "learning_rate": 3.966102701740276e-07, + "loss": 0.0087, + "num_input_tokens_seen": 207343680, + "step": 170405 + }, + { + "epoch": 18.978728143445817, + "grad_norm": 0.0036218457389622927, + "learning_rate": 3.961793071859565e-07, + "loss": 0.0138, + "num_input_tokens_seen": 207349856, + "step": 170410 + }, + { + "epoch": 18.979284998329437, + "grad_norm": 1.2309082746505737, + "learning_rate": 3.957485766006824e-07, + "loss": 0.1585, + "num_input_tokens_seen": 207355968, + "step": 170415 + }, + { + "epoch": 18.979841853213053, + "grad_norm": 0.003487677313387394, + "learning_rate": 3.953180784222771e-07, + "loss": 0.0026, + "num_input_tokens_seen": 207362016, + "step": 170420 + }, + { + "epoch": 18.98039870809667, + "grad_norm": 0.052855584770441055, + "learning_rate": 3.9488781265480667e-07, + "loss": 0.0089, + "num_input_tokens_seen": 207368128, + "step": 170425 + }, + { + "epoch": 18.980955562980288, + "grad_norm": 0.5294414758682251, + "learning_rate": 3.9445777930233183e-07, + "loss": 0.1112, + "num_input_tokens_seen": 207374080, + "step": 170430 + }, + { + "epoch": 18.981512417863904, + "grad_norm": 0.0005268188542686403, + "learning_rate": 3.940279783689188e-07, + "loss": 0.0033, + "num_input_tokens_seen": 207380256, + "step": 170435 + }, + { + "epoch": 18.982069272747523, + "grad_norm": 0.7826695442199707, + "learning_rate": 3.935984098586226e-07, + "loss": 0.0421, + "num_input_tokens_seen": 207386400, + "step": 170440 + }, + { + "epoch": 18.98262612763114, + "grad_norm": 0.5293316841125488, + "learning_rate": 3.931690737755067e-07, + "loss": 0.0157, + "num_input_tokens_seen": 207392384, + "step": 170445 + }, + { + "epoch": 18.983182982514755, + "grad_norm": 0.008396771736443043, + "learning_rate": 3.927399701236234e-07, + "loss": 0.022, + "num_input_tokens_seen": 207398304, + "step": 170450 + }, + { + "epoch": 18.983739837398375, + "grad_norm": 0.09417817741632462, + "learning_rate": 3.9231109890702777e-07, + "loss": 0.0526, + "num_input_tokens_seen": 207404576, + "step": 170455 + }, + { + "epoch": 18.98429669228199, + "grad_norm": 0.04363362863659859, + "learning_rate": 3.918824601297638e-07, + "loss": 0.1257, + "num_input_tokens_seen": 207410880, + "step": 170460 + }, + { + "epoch": 18.98485354716561, + "grad_norm": 0.0018916039261966944, + "learning_rate": 3.914540537958894e-07, + "loss": 0.012, + "num_input_tokens_seen": 207417152, + "step": 170465 + }, + { + "epoch": 18.985410402049226, + "grad_norm": 0.20576393604278564, + "learning_rate": 3.9102587990944573e-07, + "loss": 0.0049, + "num_input_tokens_seen": 207423008, + "step": 170470 + }, + { + "epoch": 18.98596725693284, + "grad_norm": 1.0282365083694458, + "learning_rate": 3.905979384744796e-07, + "loss": 0.2815, + "num_input_tokens_seen": 207428928, + "step": 170475 + }, + { + "epoch": 18.98652411181646, + "grad_norm": 0.0006944837514311075, + "learning_rate": 3.901702294950349e-07, + "loss": 0.002, + "num_input_tokens_seen": 207435328, + "step": 170480 + }, + { + "epoch": 18.987080966700077, + "grad_norm": 0.0012631132267415524, + "learning_rate": 3.897427529751474e-07, + "loss": 0.1274, + "num_input_tokens_seen": 207441088, + "step": 170485 + }, + { + "epoch": 18.987637821583697, + "grad_norm": 1.0412310361862183, + "learning_rate": 3.8931550891885547e-07, + "loss": 0.0129, + "num_input_tokens_seen": 207447264, + "step": 170490 + }, + { + "epoch": 18.988194676467312, + "grad_norm": 0.2768738567829132, + "learning_rate": 3.8888849733020037e-07, + "loss": 0.0053, + "num_input_tokens_seen": 207453696, + "step": 170495 + }, + { + "epoch": 18.98875153135093, + "grad_norm": 0.2585415542125702, + "learning_rate": 3.8846171821320943e-07, + "loss": 0.0099, + "num_input_tokens_seen": 207459808, + "step": 170500 + }, + { + "epoch": 18.989308386234548, + "grad_norm": 0.004933722782880068, + "learning_rate": 3.880351715719155e-07, + "loss": 0.0174, + "num_input_tokens_seen": 207465120, + "step": 170505 + }, + { + "epoch": 18.989865241118164, + "grad_norm": 0.06471966952085495, + "learning_rate": 3.876088574103487e-07, + "loss": 0.0026, + "num_input_tokens_seen": 207471296, + "step": 170510 + }, + { + "epoch": 18.990422096001783, + "grad_norm": 0.42000117897987366, + "learning_rate": 3.871827757325336e-07, + "loss": 0.049, + "num_input_tokens_seen": 207477344, + "step": 170515 + }, + { + "epoch": 18.9909789508854, + "grad_norm": 0.02776557393372059, + "learning_rate": 3.867569265424975e-07, + "loss": 0.0041, + "num_input_tokens_seen": 207483584, + "step": 170520 + }, + { + "epoch": 18.991535805769015, + "grad_norm": 0.01107095181941986, + "learning_rate": 3.8633130984426503e-07, + "loss": 0.1098, + "num_input_tokens_seen": 207489632, + "step": 170525 + }, + { + "epoch": 18.992092660652634, + "grad_norm": 1.894648551940918, + "learning_rate": 3.8590592564184957e-07, + "loss": 0.0909, + "num_input_tokens_seen": 207495744, + "step": 170530 + }, + { + "epoch": 18.99264951553625, + "grad_norm": 0.7901095747947693, + "learning_rate": 3.854807739392757e-07, + "loss": 0.0216, + "num_input_tokens_seen": 207501952, + "step": 170535 + }, + { + "epoch": 18.99320637041987, + "grad_norm": 0.77436363697052, + "learning_rate": 3.8505585474055416e-07, + "loss": 0.0484, + "num_input_tokens_seen": 207508000, + "step": 170540 + }, + { + "epoch": 18.993763225303486, + "grad_norm": 0.0033104955218732357, + "learning_rate": 3.846311680497039e-07, + "loss": 0.0206, + "num_input_tokens_seen": 207514080, + "step": 170545 + }, + { + "epoch": 18.9943200801871, + "grad_norm": 0.0009864537278190255, + "learning_rate": 3.8420671387073283e-07, + "loss": 0.0133, + "num_input_tokens_seen": 207520256, + "step": 170550 + }, + { + "epoch": 18.99487693507072, + "grad_norm": 1.7459684610366821, + "learning_rate": 3.837824922076516e-07, + "loss": 0.1054, + "num_input_tokens_seen": 207525920, + "step": 170555 + }, + { + "epoch": 18.995433789954337, + "grad_norm": 0.0074253869242966175, + "learning_rate": 3.8335850306446544e-07, + "loss": 0.0023, + "num_input_tokens_seen": 207532224, + "step": 170560 + }, + { + "epoch": 18.995990644837956, + "grad_norm": 0.00531452801078558, + "learning_rate": 3.829347464451821e-07, + "loss": 0.0964, + "num_input_tokens_seen": 207538656, + "step": 170565 + }, + { + "epoch": 18.996547499721572, + "grad_norm": 0.00020926003344357014, + "learning_rate": 3.825112223538041e-07, + "loss": 0.007, + "num_input_tokens_seen": 207544672, + "step": 170570 + }, + { + "epoch": 18.99710435460519, + "grad_norm": 0.0004015789891127497, + "learning_rate": 3.8208793079432813e-07, + "loss": 0.0221, + "num_input_tokens_seen": 207550624, + "step": 170575 + }, + { + "epoch": 18.997661209488808, + "grad_norm": 0.3318268656730652, + "learning_rate": 3.816648717707566e-07, + "loss": 0.0658, + "num_input_tokens_seen": 207556448, + "step": 170580 + }, + { + "epoch": 18.998218064372423, + "grad_norm": 0.002743798540905118, + "learning_rate": 3.812420452870835e-07, + "loss": 0.0964, + "num_input_tokens_seen": 207562496, + "step": 170585 + }, + { + "epoch": 18.998774919256043, + "grad_norm": 0.9561820030212402, + "learning_rate": 3.808194513473029e-07, + "loss": 0.0231, + "num_input_tokens_seen": 207568256, + "step": 170590 + }, + { + "epoch": 18.99933177413966, + "grad_norm": 0.01074955053627491, + "learning_rate": 3.803970899554116e-07, + "loss": 0.0461, + "num_input_tokens_seen": 207574688, + "step": 170595 + }, + { + "epoch": 18.99988862902328, + "grad_norm": 1.2395508289337158, + "learning_rate": 3.7997496111538963e-07, + "loss": 0.0566, + "num_input_tokens_seen": 207580864, + "step": 170600 + }, + { + "epoch": 19.0, + "eval_loss": 0.0823851004242897, + "eval_runtime": 111.5914, + "eval_samples_per_second": 35.764, + "eval_steps_per_second": 8.943, + "num_input_tokens_seen": 207581424, + "step": 170601 + }, + { + "epoch": 19.000445483906894, + "grad_norm": 6.170627602841705e-05, + "learning_rate": 3.795530648312312e-07, + "loss": 0.0015, + "num_input_tokens_seen": 207586320, + "step": 170605 + }, + { + "epoch": 19.00100233879051, + "grad_norm": 0.14985333383083344, + "learning_rate": 3.7913140110691634e-07, + "loss": 0.0033, + "num_input_tokens_seen": 207592432, + "step": 170610 + }, + { + "epoch": 19.00155919367413, + "grad_norm": 0.0070755211636424065, + "learning_rate": 3.7870996994643637e-07, + "loss": 0.0016, + "num_input_tokens_seen": 207598512, + "step": 170615 + }, + { + "epoch": 19.002116048557745, + "grad_norm": 0.002411436289548874, + "learning_rate": 3.782887713537658e-07, + "loss": 0.0113, + "num_input_tokens_seen": 207604784, + "step": 170620 + }, + { + "epoch": 19.002672903441365, + "grad_norm": 7.638434908585623e-05, + "learning_rate": 3.778678053328849e-07, + "loss": 0.0115, + "num_input_tokens_seen": 207610864, + "step": 170625 + }, + { + "epoch": 19.00322975832498, + "grad_norm": 0.02699178084731102, + "learning_rate": 3.774470718877654e-07, + "loss": 0.1286, + "num_input_tokens_seen": 207617360, + "step": 170630 + }, + { + "epoch": 19.003786613208597, + "grad_norm": 0.23940350115299225, + "learning_rate": 3.7702657102239024e-07, + "loss": 0.059, + "num_input_tokens_seen": 207623152, + "step": 170635 + }, + { + "epoch": 19.004343468092216, + "grad_norm": 0.08691345155239105, + "learning_rate": 3.766063027407257e-07, + "loss": 0.0983, + "num_input_tokens_seen": 207629104, + "step": 170640 + }, + { + "epoch": 19.004900322975832, + "grad_norm": 0.18510137498378754, + "learning_rate": 3.7618626704674086e-07, + "loss": 0.056, + "num_input_tokens_seen": 207635376, + "step": 170645 + }, + { + "epoch": 19.00545717785945, + "grad_norm": 1.1758145093917847, + "learning_rate": 3.7576646394440475e-07, + "loss": 0.0702, + "num_input_tokens_seen": 207641552, + "step": 170650 + }, + { + "epoch": 19.006014032743067, + "grad_norm": 0.3629456162452698, + "learning_rate": 3.7534689343768356e-07, + "loss": 0.0122, + "num_input_tokens_seen": 207647952, + "step": 170655 + }, + { + "epoch": 19.006570887626683, + "grad_norm": 0.11583296954631805, + "learning_rate": 3.7492755553054095e-07, + "loss": 0.1268, + "num_input_tokens_seen": 207653648, + "step": 170660 + }, + { + "epoch": 19.007127742510303, + "grad_norm": 0.22069190442562103, + "learning_rate": 3.7450845022693746e-07, + "loss": 0.0077, + "num_input_tokens_seen": 207659792, + "step": 170665 + }, + { + "epoch": 19.00768459739392, + "grad_norm": 6.78322758176364e-05, + "learning_rate": 3.740895775308284e-07, + "loss": 0.045, + "num_input_tokens_seen": 207665616, + "step": 170670 + }, + { + "epoch": 19.008241452277538, + "grad_norm": 0.09064657241106033, + "learning_rate": 3.736709374461772e-07, + "loss": 0.0274, + "num_input_tokens_seen": 207671440, + "step": 170675 + }, + { + "epoch": 19.008798307161154, + "grad_norm": 0.006865877192467451, + "learning_rate": 3.7325252997693074e-07, + "loss": 0.0018, + "num_input_tokens_seen": 207677296, + "step": 170680 + }, + { + "epoch": 19.00935516204477, + "grad_norm": 0.0006339069223031402, + "learning_rate": 3.728343551270469e-07, + "loss": 0.0266, + "num_input_tokens_seen": 207683120, + "step": 170685 + }, + { + "epoch": 19.00991201692839, + "grad_norm": 0.002660145051777363, + "learning_rate": 3.724164129004726e-07, + "loss": 0.0424, + "num_input_tokens_seen": 207689136, + "step": 170690 + }, + { + "epoch": 19.010468871812005, + "grad_norm": 0.00496946694329381, + "learning_rate": 3.719987033011574e-07, + "loss": 0.0729, + "num_input_tokens_seen": 207695440, + "step": 170695 + }, + { + "epoch": 19.011025726695625, + "grad_norm": 1.8230712413787842, + "learning_rate": 3.715812263330426e-07, + "loss": 0.077, + "num_input_tokens_seen": 207701744, + "step": 170700 + }, + { + "epoch": 19.01158258157924, + "grad_norm": 0.0002291987038915977, + "learning_rate": 3.7116398200007786e-07, + "loss": 0.1171, + "num_input_tokens_seen": 207707600, + "step": 170705 + }, + { + "epoch": 19.012139436462856, + "grad_norm": 0.06806232035160065, + "learning_rate": 3.707469703062044e-07, + "loss": 0.0134, + "num_input_tokens_seen": 207713584, + "step": 170710 + }, + { + "epoch": 19.012696291346476, + "grad_norm": 0.3828909695148468, + "learning_rate": 3.703301912553553e-07, + "loss": 0.0077, + "num_input_tokens_seen": 207719920, + "step": 170715 + }, + { + "epoch": 19.01325314623009, + "grad_norm": 0.01669255830347538, + "learning_rate": 3.699136448514717e-07, + "loss": 0.0744, + "num_input_tokens_seen": 207725936, + "step": 170720 + }, + { + "epoch": 19.01381000111371, + "grad_norm": 0.0007650941843166947, + "learning_rate": 3.6949733109848396e-07, + "loss": 0.0113, + "num_input_tokens_seen": 207731984, + "step": 170725 + }, + { + "epoch": 19.014366855997327, + "grad_norm": 0.013336866162717342, + "learning_rate": 3.6908125000033045e-07, + "loss": 0.0086, + "num_input_tokens_seen": 207738000, + "step": 170730 + }, + { + "epoch": 19.014923710880943, + "grad_norm": 2.1005523204803467, + "learning_rate": 3.686654015609359e-07, + "loss": 0.0603, + "num_input_tokens_seen": 207744112, + "step": 170735 + }, + { + "epoch": 19.015480565764562, + "grad_norm": 0.0009808044414967299, + "learning_rate": 3.68249785784236e-07, + "loss": 0.0728, + "num_input_tokens_seen": 207750448, + "step": 170740 + }, + { + "epoch": 19.01603742064818, + "grad_norm": 0.0822821632027626, + "learning_rate": 3.67834402674147e-07, + "loss": 0.0848, + "num_input_tokens_seen": 207756240, + "step": 170745 + }, + { + "epoch": 19.016594275531798, + "grad_norm": 0.504538893699646, + "learning_rate": 3.6741925223459925e-07, + "loss": 0.0583, + "num_input_tokens_seen": 207762384, + "step": 170750 + }, + { + "epoch": 19.017151130415414, + "grad_norm": 0.0012251166626811028, + "learning_rate": 3.67004334469509e-07, + "loss": 0.044, + "num_input_tokens_seen": 207768752, + "step": 170755 + }, + { + "epoch": 19.01770798529903, + "grad_norm": 5.847363471984863, + "learning_rate": 3.665896493828008e-07, + "loss": 0.0959, + "num_input_tokens_seen": 207774704, + "step": 170760 + }, + { + "epoch": 19.01826484018265, + "grad_norm": 0.629004180431366, + "learning_rate": 3.661751969783911e-07, + "loss": 0.0665, + "num_input_tokens_seen": 207781008, + "step": 170765 + }, + { + "epoch": 19.018821695066265, + "grad_norm": 0.04192230477929115, + "learning_rate": 3.6576097726019053e-07, + "loss": 0.0258, + "num_input_tokens_seen": 207787280, + "step": 170770 + }, + { + "epoch": 19.019378549949884, + "grad_norm": 1.1547611951828003, + "learning_rate": 3.653469902321127e-07, + "loss": 0.016, + "num_input_tokens_seen": 207793392, + "step": 170775 + }, + { + "epoch": 19.0199354048335, + "grad_norm": 0.11084407567977905, + "learning_rate": 3.6493323589807114e-07, + "loss": 0.0488, + "num_input_tokens_seen": 207799184, + "step": 170780 + }, + { + "epoch": 19.020492259717116, + "grad_norm": 0.06867194920778275, + "learning_rate": 3.6451971426197385e-07, + "loss": 0.046, + "num_input_tokens_seen": 207805168, + "step": 170785 + }, + { + "epoch": 19.021049114600736, + "grad_norm": 0.000580069434363395, + "learning_rate": 3.64106425327726e-07, + "loss": 0.0182, + "num_input_tokens_seen": 207811248, + "step": 170790 + }, + { + "epoch": 19.02160596948435, + "grad_norm": 0.12870684266090393, + "learning_rate": 3.6369336909922725e-07, + "loss": 0.003, + "num_input_tokens_seen": 207817264, + "step": 170795 + }, + { + "epoch": 19.02216282436797, + "grad_norm": 0.000950491230469197, + "learning_rate": 3.632805455803856e-07, + "loss": 0.0375, + "num_input_tokens_seen": 207823536, + "step": 170800 + }, + { + "epoch": 19.022719679251587, + "grad_norm": 0.00942537747323513, + "learning_rate": 3.628679547750952e-07, + "loss": 0.0434, + "num_input_tokens_seen": 207829712, + "step": 170805 + }, + { + "epoch": 19.023276534135203, + "grad_norm": 0.5759425759315491, + "learning_rate": 3.6245559668726114e-07, + "loss": 0.0154, + "num_input_tokens_seen": 207836112, + "step": 170810 + }, + { + "epoch": 19.023833389018822, + "grad_norm": 0.21785904467105865, + "learning_rate": 3.6204347132076653e-07, + "loss": 0.0671, + "num_input_tokens_seen": 207842032, + "step": 170815 + }, + { + "epoch": 19.024390243902438, + "grad_norm": 2.1952052116394043, + "learning_rate": 3.6163157867951656e-07, + "loss": 0.0789, + "num_input_tokens_seen": 207848240, + "step": 170820 + }, + { + "epoch": 19.024947098786058, + "grad_norm": 0.37972626090049744, + "learning_rate": 3.612199187673915e-07, + "loss": 0.0659, + "num_input_tokens_seen": 207854320, + "step": 170825 + }, + { + "epoch": 19.025503953669674, + "grad_norm": 0.5061957240104675, + "learning_rate": 3.608084915882881e-07, + "loss": 0.0241, + "num_input_tokens_seen": 207860336, + "step": 170830 + }, + { + "epoch": 19.02606080855329, + "grad_norm": 0.005762102548032999, + "learning_rate": 3.6039729714608673e-07, + "loss": 0.0601, + "num_input_tokens_seen": 207866256, + "step": 170835 + }, + { + "epoch": 19.02661766343691, + "grad_norm": 0.10073935985565186, + "learning_rate": 3.5998633544467586e-07, + "loss": 0.0113, + "num_input_tokens_seen": 207871888, + "step": 170840 + }, + { + "epoch": 19.027174518320525, + "grad_norm": 0.030143508687615395, + "learning_rate": 3.59575606487933e-07, + "loss": 0.0036, + "num_input_tokens_seen": 207877968, + "step": 170845 + }, + { + "epoch": 19.027731373204144, + "grad_norm": 0.001179926679469645, + "learning_rate": 3.5916511027974386e-07, + "loss": 0.0522, + "num_input_tokens_seen": 207884240, + "step": 170850 + }, + { + "epoch": 19.02828822808776, + "grad_norm": 0.08886440098285675, + "learning_rate": 3.587548468239804e-07, + "loss": 0.0156, + "num_input_tokens_seen": 207890512, + "step": 170855 + }, + { + "epoch": 19.028845082971376, + "grad_norm": 0.03632627800107002, + "learning_rate": 3.583448161245201e-07, + "loss": 0.031, + "num_input_tokens_seen": 207896368, + "step": 170860 + }, + { + "epoch": 19.029401937854995, + "grad_norm": 0.01261903252452612, + "learning_rate": 3.57935018185232e-07, + "loss": 0.0063, + "num_input_tokens_seen": 207902320, + "step": 170865 + }, + { + "epoch": 19.02995879273861, + "grad_norm": 0.009200314991176128, + "learning_rate": 3.5752545300999363e-07, + "loss": 0.0099, + "num_input_tokens_seen": 207908560, + "step": 170870 + }, + { + "epoch": 19.03051564762223, + "grad_norm": 0.03861595317721367, + "learning_rate": 3.571161206026685e-07, + "loss": 0.0058, + "num_input_tokens_seen": 207914512, + "step": 170875 + }, + { + "epoch": 19.031072502505847, + "grad_norm": 0.037221889942884445, + "learning_rate": 3.567070209671286e-07, + "loss": 0.008, + "num_input_tokens_seen": 207920656, + "step": 170880 + }, + { + "epoch": 19.031629357389463, + "grad_norm": 0.003992788959294558, + "learning_rate": 3.562981541072319e-07, + "loss": 0.0141, + "num_input_tokens_seen": 207926768, + "step": 170885 + }, + { + "epoch": 19.032186212273082, + "grad_norm": 0.5001674294471741, + "learning_rate": 3.558895200268475e-07, + "loss": 0.0027, + "num_input_tokens_seen": 207932624, + "step": 170890 + }, + { + "epoch": 19.032743067156698, + "grad_norm": 0.0006187534891068935, + "learning_rate": 3.5548111872982517e-07, + "loss": 0.0003, + "num_input_tokens_seen": 207938832, + "step": 170895 + }, + { + "epoch": 19.033299922040317, + "grad_norm": 0.00016894354484975338, + "learning_rate": 3.550729502200312e-07, + "loss": 0.0043, + "num_input_tokens_seen": 207944880, + "step": 170900 + }, + { + "epoch": 19.033856776923933, + "grad_norm": 0.0029288458172231913, + "learning_rate": 3.5466501450132085e-07, + "loss": 0.0489, + "num_input_tokens_seen": 207951056, + "step": 170905 + }, + { + "epoch": 19.03441363180755, + "grad_norm": 0.003136592684313655, + "learning_rate": 3.5425731157754103e-07, + "loss": 0.02, + "num_input_tokens_seen": 207957456, + "step": 170910 + }, + { + "epoch": 19.03497048669117, + "grad_norm": 0.0008422248647548258, + "learning_rate": 3.5384984145254706e-07, + "loss": 0.0058, + "num_input_tokens_seen": 207963504, + "step": 170915 + }, + { + "epoch": 19.035527341574785, + "grad_norm": 0.0042561315931379795, + "learning_rate": 3.534426041301914e-07, + "loss": 0.0394, + "num_input_tokens_seen": 207969392, + "step": 170920 + }, + { + "epoch": 19.036084196458404, + "grad_norm": 0.04389147832989693, + "learning_rate": 3.5303559961431256e-07, + "loss": 0.0751, + "num_input_tokens_seen": 207975504, + "step": 170925 + }, + { + "epoch": 19.03664105134202, + "grad_norm": 0.015518673695623875, + "learning_rate": 3.5262882790876305e-07, + "loss": 0.1297, + "num_input_tokens_seen": 207981392, + "step": 170930 + }, + { + "epoch": 19.03719790622564, + "grad_norm": 1.3442543745040894, + "learning_rate": 3.5222228901737874e-07, + "loss": 0.077, + "num_input_tokens_seen": 207987504, + "step": 170935 + }, + { + "epoch": 19.037754761109255, + "grad_norm": 0.0002115145034622401, + "learning_rate": 3.5181598294400373e-07, + "loss": 0.0835, + "num_input_tokens_seen": 207993616, + "step": 170940 + }, + { + "epoch": 19.03831161599287, + "grad_norm": 0.4652162790298462, + "learning_rate": 3.514099096924711e-07, + "loss": 0.101, + "num_input_tokens_seen": 207999440, + "step": 170945 + }, + { + "epoch": 19.03886847087649, + "grad_norm": 0.03612707555294037, + "learning_rate": 3.51004069266625e-07, + "loss": 0.0978, + "num_input_tokens_seen": 208004976, + "step": 170950 + }, + { + "epoch": 19.039425325760106, + "grad_norm": 0.11330603808164597, + "learning_rate": 3.505984616702901e-07, + "loss": 0.0116, + "num_input_tokens_seen": 208010768, + "step": 170955 + }, + { + "epoch": 19.039982180643726, + "grad_norm": 0.5381419062614441, + "learning_rate": 3.50193086907305e-07, + "loss": 0.0711, + "num_input_tokens_seen": 208016752, + "step": 170960 + }, + { + "epoch": 19.040539035527342, + "grad_norm": 0.003567100502550602, + "learning_rate": 3.497879449814917e-07, + "loss": 0.0007, + "num_input_tokens_seen": 208023056, + "step": 170965 + }, + { + "epoch": 19.041095890410958, + "grad_norm": 0.4244297444820404, + "learning_rate": 3.493830358966832e-07, + "loss": 0.0201, + "num_input_tokens_seen": 208029264, + "step": 170970 + }, + { + "epoch": 19.041652745294577, + "grad_norm": 5.28012228012085, + "learning_rate": 3.4897835965670144e-07, + "loss": 0.0808, + "num_input_tokens_seen": 208035472, + "step": 170975 + }, + { + "epoch": 19.042209600178193, + "grad_norm": 0.13016322255134583, + "learning_rate": 3.485739162653684e-07, + "loss": 0.017, + "num_input_tokens_seen": 208041616, + "step": 170980 + }, + { + "epoch": 19.042766455061813, + "grad_norm": 0.029287995770573616, + "learning_rate": 3.481697057265032e-07, + "loss": 0.086, + "num_input_tokens_seen": 208047760, + "step": 170985 + }, + { + "epoch": 19.04332330994543, + "grad_norm": 0.3026455342769623, + "learning_rate": 3.4776572804392783e-07, + "loss": 0.0206, + "num_input_tokens_seen": 208053968, + "step": 170990 + }, + { + "epoch": 19.043880164829044, + "grad_norm": 1.7167528867721558, + "learning_rate": 3.4736198322145587e-07, + "loss": 0.1654, + "num_input_tokens_seen": 208059312, + "step": 170995 + }, + { + "epoch": 19.044437019712664, + "grad_norm": 0.13805347681045532, + "learning_rate": 3.469584712629037e-07, + "loss": 0.0649, + "num_input_tokens_seen": 208065392, + "step": 171000 + }, + { + "epoch": 19.04499387459628, + "grad_norm": 7.526877743657678e-05, + "learning_rate": 3.465551921720767e-07, + "loss": 0.0206, + "num_input_tokens_seen": 208071696, + "step": 171005 + }, + { + "epoch": 19.0455507294799, + "grad_norm": 2.6788904666900635, + "learning_rate": 3.461521459527911e-07, + "loss": 0.0768, + "num_input_tokens_seen": 208077968, + "step": 171010 + }, + { + "epoch": 19.046107584363515, + "grad_norm": 0.0800284892320633, + "learning_rate": 3.457493326088468e-07, + "loss": 0.0039, + "num_input_tokens_seen": 208083664, + "step": 171015 + }, + { + "epoch": 19.04666443924713, + "grad_norm": 0.004640620667487383, + "learning_rate": 3.453467521440573e-07, + "loss": 0.0496, + "num_input_tokens_seen": 208089904, + "step": 171020 + }, + { + "epoch": 19.04722129413075, + "grad_norm": 0.007025213446468115, + "learning_rate": 3.4494440456221966e-07, + "loss": 0.0126, + "num_input_tokens_seen": 208095760, + "step": 171025 + }, + { + "epoch": 19.047778149014366, + "grad_norm": 0.22894984483718872, + "learning_rate": 3.4454228986713355e-07, + "loss": 0.0515, + "num_input_tokens_seen": 208101680, + "step": 171030 + }, + { + "epoch": 19.048335003897986, + "grad_norm": 1.3952637910842896, + "learning_rate": 3.441404080625987e-07, + "loss": 0.0319, + "num_input_tokens_seen": 208107664, + "step": 171035 + }, + { + "epoch": 19.0488918587816, + "grad_norm": 1.8386595249176025, + "learning_rate": 3.4373875915241493e-07, + "loss": 0.1852, + "num_input_tokens_seen": 208113808, + "step": 171040 + }, + { + "epoch": 19.049448713665218, + "grad_norm": 2.0985491275787354, + "learning_rate": 3.4333734314037083e-07, + "loss": 0.0569, + "num_input_tokens_seen": 208119664, + "step": 171045 + }, + { + "epoch": 19.050005568548837, + "grad_norm": 0.0012864398304373026, + "learning_rate": 3.4293616003026054e-07, + "loss": 0.0096, + "num_input_tokens_seen": 208125936, + "step": 171050 + }, + { + "epoch": 19.050562423432453, + "grad_norm": 0.001988158794119954, + "learning_rate": 3.425352098258727e-07, + "loss": 0.063, + "num_input_tokens_seen": 208131984, + "step": 171055 + }, + { + "epoch": 19.051119278316072, + "grad_norm": 0.05380317196249962, + "learning_rate": 3.421344925309933e-07, + "loss": 0.0024, + "num_input_tokens_seen": 208138448, + "step": 171060 + }, + { + "epoch": 19.051676133199688, + "grad_norm": 0.013448266312479973, + "learning_rate": 3.417340081494108e-07, + "loss": 0.063, + "num_input_tokens_seen": 208144432, + "step": 171065 + }, + { + "epoch": 19.052232988083304, + "grad_norm": 1.0554192066192627, + "learning_rate": 3.413337566849084e-07, + "loss": 0.0795, + "num_input_tokens_seen": 208150352, + "step": 171070 + }, + { + "epoch": 19.052789842966924, + "grad_norm": 0.26876887679100037, + "learning_rate": 3.4093373814126363e-07, + "loss": 0.1181, + "num_input_tokens_seen": 208156528, + "step": 171075 + }, + { + "epoch": 19.05334669785054, + "grad_norm": 0.005998596549034119, + "learning_rate": 3.40533952522254e-07, + "loss": 0.2019, + "num_input_tokens_seen": 208163056, + "step": 171080 + }, + { + "epoch": 19.05390355273416, + "grad_norm": 0.011695628054440022, + "learning_rate": 3.401343998316597e-07, + "loss": 0.0201, + "num_input_tokens_seen": 208169328, + "step": 171085 + }, + { + "epoch": 19.054460407617775, + "grad_norm": 0.00017257413128390908, + "learning_rate": 3.3973508007325293e-07, + "loss": 0.0157, + "num_input_tokens_seen": 208175696, + "step": 171090 + }, + { + "epoch": 19.05501726250139, + "grad_norm": 0.006361557170748711, + "learning_rate": 3.3933599325080835e-07, + "loss": 0.0195, + "num_input_tokens_seen": 208181680, + "step": 171095 + }, + { + "epoch": 19.05557411738501, + "grad_norm": 1.0741502046585083, + "learning_rate": 3.389371393680896e-07, + "loss": 0.0655, + "num_input_tokens_seen": 208187568, + "step": 171100 + }, + { + "epoch": 19.056130972268626, + "grad_norm": 0.014771200716495514, + "learning_rate": 3.3853851842886865e-07, + "loss": 0.1055, + "num_input_tokens_seen": 208194032, + "step": 171105 + }, + { + "epoch": 19.056687827152246, + "grad_norm": 0.010734982788562775, + "learning_rate": 3.3814013043690915e-07, + "loss": 0.0338, + "num_input_tokens_seen": 208200432, + "step": 171110 + }, + { + "epoch": 19.05724468203586, + "grad_norm": 0.06459090858697891, + "learning_rate": 3.377419753959776e-07, + "loss": 0.035, + "num_input_tokens_seen": 208206672, + "step": 171115 + }, + { + "epoch": 19.057801536919477, + "grad_norm": 0.00028268329333513975, + "learning_rate": 3.373440533098293e-07, + "loss": 0.051, + "num_input_tokens_seen": 208212752, + "step": 171120 + }, + { + "epoch": 19.058358391803097, + "grad_norm": 0.10799907147884369, + "learning_rate": 3.369463641822279e-07, + "loss": 0.0065, + "num_input_tokens_seen": 208218704, + "step": 171125 + }, + { + "epoch": 19.058915246686713, + "grad_norm": 0.00013687514001503587, + "learning_rate": 3.3654890801692593e-07, + "loss": 0.1078, + "num_input_tokens_seen": 208224912, + "step": 171130 + }, + { + "epoch": 19.059472101570332, + "grad_norm": 0.9147528409957886, + "learning_rate": 3.3615168481768153e-07, + "loss": 0.0272, + "num_input_tokens_seen": 208231152, + "step": 171135 + }, + { + "epoch": 19.060028956453948, + "grad_norm": 1.0954099893569946, + "learning_rate": 3.3575469458824173e-07, + "loss": 0.0194, + "num_input_tokens_seen": 208237264, + "step": 171140 + }, + { + "epoch": 19.060585811337564, + "grad_norm": 0.011864791624248028, + "learning_rate": 3.3535793733236455e-07, + "loss": 0.0024, + "num_input_tokens_seen": 208243536, + "step": 171145 + }, + { + "epoch": 19.061142666221183, + "grad_norm": 0.21015247702598572, + "learning_rate": 3.349614130537887e-07, + "loss": 0.0117, + "num_input_tokens_seen": 208249904, + "step": 171150 + }, + { + "epoch": 19.0616995211048, + "grad_norm": 0.8378127217292786, + "learning_rate": 3.34565121756264e-07, + "loss": 0.1288, + "num_input_tokens_seen": 208256080, + "step": 171155 + }, + { + "epoch": 19.06225637598842, + "grad_norm": 0.00030689858249388635, + "learning_rate": 3.3416906344353183e-07, + "loss": 0.0388, + "num_input_tokens_seen": 208261776, + "step": 171160 + }, + { + "epoch": 19.062813230872035, + "grad_norm": 0.30663201212882996, + "learning_rate": 3.3377323811933646e-07, + "loss": 0.0725, + "num_input_tokens_seen": 208267792, + "step": 171165 + }, + { + "epoch": 19.06337008575565, + "grad_norm": 0.5782406330108643, + "learning_rate": 3.333776457874166e-07, + "loss": 0.0403, + "num_input_tokens_seen": 208273232, + "step": 171170 + }, + { + "epoch": 19.06392694063927, + "grad_norm": 0.05777130648493767, + "learning_rate": 3.329822864515081e-07, + "loss": 0.0108, + "num_input_tokens_seen": 208279216, + "step": 171175 + }, + { + "epoch": 19.064483795522886, + "grad_norm": 0.5908457636833191, + "learning_rate": 3.325871601153413e-07, + "loss": 0.007, + "num_input_tokens_seen": 208285296, + "step": 171180 + }, + { + "epoch": 19.065040650406505, + "grad_norm": 0.01114470325410366, + "learning_rate": 3.321922667826521e-07, + "loss": 0.132, + "num_input_tokens_seen": 208291792, + "step": 171185 + }, + { + "epoch": 19.06559750529012, + "grad_norm": 0.2144857496023178, + "learning_rate": 3.3179760645717374e-07, + "loss": 0.0058, + "num_input_tokens_seen": 208297520, + "step": 171190 + }, + { + "epoch": 19.066154360173737, + "grad_norm": 0.012373515404760838, + "learning_rate": 3.3140317914262807e-07, + "loss": 0.0047, + "num_input_tokens_seen": 208303632, + "step": 171195 + }, + { + "epoch": 19.066711215057357, + "grad_norm": 0.0003755617653951049, + "learning_rate": 3.3100898484274555e-07, + "loss": 0.0018, + "num_input_tokens_seen": 208309872, + "step": 171200 + }, + { + "epoch": 19.067268069940972, + "grad_norm": 0.21664682030677795, + "learning_rate": 3.306150235612454e-07, + "loss": 0.0323, + "num_input_tokens_seen": 208315888, + "step": 171205 + }, + { + "epoch": 19.067824924824592, + "grad_norm": 1.2515528202056885, + "learning_rate": 3.302212953018524e-07, + "loss": 0.159, + "num_input_tokens_seen": 208321904, + "step": 171210 + }, + { + "epoch": 19.068381779708208, + "grad_norm": 0.013764345087110996, + "learning_rate": 3.2982780006828583e-07, + "loss": 0.0442, + "num_input_tokens_seen": 208327376, + "step": 171215 + }, + { + "epoch": 19.068938634591824, + "grad_norm": 0.00513429893180728, + "learning_rate": 3.2943453786425937e-07, + "loss": 0.0186, + "num_input_tokens_seen": 208333552, + "step": 171220 + }, + { + "epoch": 19.069495489475443, + "grad_norm": 0.0017428986029699445, + "learning_rate": 3.290415086934895e-07, + "loss": 0.0274, + "num_input_tokens_seen": 208339344, + "step": 171225 + }, + { + "epoch": 19.07005234435906, + "grad_norm": 0.17890328168869019, + "learning_rate": 3.286487125596871e-07, + "loss": 0.0031, + "num_input_tokens_seen": 208345744, + "step": 171230 + }, + { + "epoch": 19.07060919924268, + "grad_norm": 0.03182831034064293, + "learning_rate": 3.282561494665659e-07, + "loss": 0.0067, + "num_input_tokens_seen": 208351952, + "step": 171235 + }, + { + "epoch": 19.071166054126294, + "grad_norm": 0.24781690537929535, + "learning_rate": 3.278638194178313e-07, + "loss": 0.0065, + "num_input_tokens_seen": 208357872, + "step": 171240 + }, + { + "epoch": 19.07172290900991, + "grad_norm": 2.3008368015289307, + "learning_rate": 3.2747172241719137e-07, + "loss": 0.0505, + "num_input_tokens_seen": 208363824, + "step": 171245 + }, + { + "epoch": 19.07227976389353, + "grad_norm": 0.07103972882032394, + "learning_rate": 3.27079858468346e-07, + "loss": 0.0642, + "num_input_tokens_seen": 208370192, + "step": 171250 + }, + { + "epoch": 19.072836618777146, + "grad_norm": 0.0009482090827077627, + "learning_rate": 3.2668822757500053e-07, + "loss": 0.0077, + "num_input_tokens_seen": 208376432, + "step": 171255 + }, + { + "epoch": 19.073393473660765, + "grad_norm": 0.7149523496627808, + "learning_rate": 3.2629682974085196e-07, + "loss": 0.0995, + "num_input_tokens_seen": 208382320, + "step": 171260 + }, + { + "epoch": 19.07395032854438, + "grad_norm": 0.0030250530689954758, + "learning_rate": 3.259056649696002e-07, + "loss": 0.0048, + "num_input_tokens_seen": 208388432, + "step": 171265 + }, + { + "epoch": 19.074507183427997, + "grad_norm": 0.00021639312035404146, + "learning_rate": 3.255147332649339e-07, + "loss": 0.0099, + "num_input_tokens_seen": 208394736, + "step": 171270 + }, + { + "epoch": 19.075064038311616, + "grad_norm": 0.46719661355018616, + "learning_rate": 3.2512403463055283e-07, + "loss": 0.0539, + "num_input_tokens_seen": 208400624, + "step": 171275 + }, + { + "epoch": 19.075620893195232, + "grad_norm": 2.3931972980499268, + "learning_rate": 3.2473356907014306e-07, + "loss": 0.1474, + "num_input_tokens_seen": 208406192, + "step": 171280 + }, + { + "epoch": 19.07617774807885, + "grad_norm": 0.4325648248195648, + "learning_rate": 3.24343336587396e-07, + "loss": 0.0468, + "num_input_tokens_seen": 208412432, + "step": 171285 + }, + { + "epoch": 19.076734602962468, + "grad_norm": 0.07971368730068207, + "learning_rate": 3.2395333718599485e-07, + "loss": 0.0874, + "num_input_tokens_seen": 208418576, + "step": 171290 + }, + { + "epoch": 19.077291457846087, + "grad_norm": 0.16228412091732025, + "learning_rate": 3.2356357086962554e-07, + "loss": 0.0098, + "num_input_tokens_seen": 208424912, + "step": 171295 + }, + { + "epoch": 19.077848312729703, + "grad_norm": 0.04023374244570732, + "learning_rate": 3.231740376419656e-07, + "loss": 0.1228, + "num_input_tokens_seen": 208431152, + "step": 171300 + }, + { + "epoch": 19.07840516761332, + "grad_norm": 0.7595098614692688, + "learning_rate": 3.227847375067011e-07, + "loss": 0.0166, + "num_input_tokens_seen": 208437200, + "step": 171305 + }, + { + "epoch": 19.07896202249694, + "grad_norm": 0.09134899824857712, + "learning_rate": 3.2239567046750405e-07, + "loss": 0.0513, + "num_input_tokens_seen": 208443408, + "step": 171310 + }, + { + "epoch": 19.079518877380554, + "grad_norm": 0.5670238137245178, + "learning_rate": 3.2200683652805486e-07, + "loss": 0.0656, + "num_input_tokens_seen": 208449072, + "step": 171315 + }, + { + "epoch": 19.080075732264174, + "grad_norm": 3.439643383026123, + "learning_rate": 3.2161823569201723e-07, + "loss": 0.1466, + "num_input_tokens_seen": 208454736, + "step": 171320 + }, + { + "epoch": 19.08063258714779, + "grad_norm": 0.7408185601234436, + "learning_rate": 3.212298679630715e-07, + "loss": 0.0315, + "num_input_tokens_seen": 208460784, + "step": 171325 + }, + { + "epoch": 19.081189442031405, + "grad_norm": 0.9105406999588013, + "learning_rate": 3.208417333448788e-07, + "loss": 0.0772, + "num_input_tokens_seen": 208467024, + "step": 171330 + }, + { + "epoch": 19.081746296915025, + "grad_norm": 0.1726660281419754, + "learning_rate": 3.20453831841111e-07, + "loss": 0.0037, + "num_input_tokens_seen": 208473104, + "step": 171335 + }, + { + "epoch": 19.08230315179864, + "grad_norm": 0.038310419768095016, + "learning_rate": 3.200661634554264e-07, + "loss": 0.031, + "num_input_tokens_seen": 208479600, + "step": 171340 + }, + { + "epoch": 19.08286000668226, + "grad_norm": 1.863416314125061, + "learning_rate": 3.1967872819149425e-07, + "loss": 0.0715, + "num_input_tokens_seen": 208485584, + "step": 171345 + }, + { + "epoch": 19.083416861565876, + "grad_norm": 0.7004078030586243, + "learning_rate": 3.192915260529672e-07, + "loss": 0.0128, + "num_input_tokens_seen": 208491760, + "step": 171350 + }, + { + "epoch": 19.083973716449492, + "grad_norm": 0.21393664181232452, + "learning_rate": 3.189045570435062e-07, + "loss": 0.0554, + "num_input_tokens_seen": 208497936, + "step": 171355 + }, + { + "epoch": 19.08453057133311, + "grad_norm": 0.05608675256371498, + "learning_rate": 3.1851782116676666e-07, + "loss": 0.0471, + "num_input_tokens_seen": 208503952, + "step": 171360 + }, + { + "epoch": 19.085087426216727, + "grad_norm": 0.000518023909535259, + "learning_rate": 3.181313184264012e-07, + "loss": 0.0083, + "num_input_tokens_seen": 208510320, + "step": 171365 + }, + { + "epoch": 19.085644281100347, + "grad_norm": 0.33130383491516113, + "learning_rate": 3.1774504882605695e-07, + "loss": 0.0295, + "num_input_tokens_seen": 208516560, + "step": 171370 + }, + { + "epoch": 19.086201135983963, + "grad_norm": 0.005220526363700628, + "learning_rate": 3.1735901236938926e-07, + "loss": 0.0014, + "num_input_tokens_seen": 208522480, + "step": 171375 + }, + { + "epoch": 19.08675799086758, + "grad_norm": 0.04775729775428772, + "learning_rate": 3.169732090600397e-07, + "loss": 0.0021, + "num_input_tokens_seen": 208528304, + "step": 171380 + }, + { + "epoch": 19.087314845751198, + "grad_norm": 0.022796630859375, + "learning_rate": 3.165876389016553e-07, + "loss": 0.0621, + "num_input_tokens_seen": 208534448, + "step": 171385 + }, + { + "epoch": 19.087871700634814, + "grad_norm": 0.29379919171333313, + "learning_rate": 3.162023018978749e-07, + "loss": 0.0182, + "num_input_tokens_seen": 208540624, + "step": 171390 + }, + { + "epoch": 19.088428555518433, + "grad_norm": 0.37713685631752014, + "learning_rate": 3.1581719805234e-07, + "loss": 0.0463, + "num_input_tokens_seen": 208546704, + "step": 171395 + }, + { + "epoch": 19.08898541040205, + "grad_norm": 0.8820170760154724, + "learning_rate": 3.1543232736868934e-07, + "loss": 0.1141, + "num_input_tokens_seen": 208552912, + "step": 171400 + }, + { + "epoch": 19.089542265285665, + "grad_norm": 0.28234031796455383, + "learning_rate": 3.1504768985055887e-07, + "loss": 0.0071, + "num_input_tokens_seen": 208559024, + "step": 171405 + }, + { + "epoch": 19.090099120169285, + "grad_norm": 0.0339309386909008, + "learning_rate": 3.146632855015763e-07, + "loss": 0.0038, + "num_input_tokens_seen": 208565456, + "step": 171410 + }, + { + "epoch": 19.0906559750529, + "grad_norm": 1.3646451234817505, + "learning_rate": 3.142791143253804e-07, + "loss": 0.0438, + "num_input_tokens_seen": 208571856, + "step": 171415 + }, + { + "epoch": 19.09121282993652, + "grad_norm": 0.0001494756870670244, + "learning_rate": 3.1389517632559316e-07, + "loss": 0.068, + "num_input_tokens_seen": 208578064, + "step": 171420 + }, + { + "epoch": 19.091769684820136, + "grad_norm": 0.15007862448692322, + "learning_rate": 3.1351147150584516e-07, + "loss": 0.1209, + "num_input_tokens_seen": 208584336, + "step": 171425 + }, + { + "epoch": 19.092326539703752, + "grad_norm": 1.1348315477371216, + "learning_rate": 3.1312799986976116e-07, + "loss": 0.0857, + "num_input_tokens_seen": 208590512, + "step": 171430 + }, + { + "epoch": 19.09288339458737, + "grad_norm": 0.0009881508303806186, + "learning_rate": 3.1274476142096054e-07, + "loss": 0.0417, + "num_input_tokens_seen": 208596816, + "step": 171435 + }, + { + "epoch": 19.093440249470987, + "grad_norm": 0.0873173177242279, + "learning_rate": 3.123617561630626e-07, + "loss": 0.0606, + "num_input_tokens_seen": 208602864, + "step": 171440 + }, + { + "epoch": 19.093997104354607, + "grad_norm": 0.5293592214584351, + "learning_rate": 3.1197898409969227e-07, + "loss": 0.0129, + "num_input_tokens_seen": 208609168, + "step": 171445 + }, + { + "epoch": 19.094553959238223, + "grad_norm": 0.7834160923957825, + "learning_rate": 3.1159644523445775e-07, + "loss": 0.0259, + "num_input_tokens_seen": 208614736, + "step": 171450 + }, + { + "epoch": 19.09511081412184, + "grad_norm": 0.038243819028139114, + "learning_rate": 3.1121413957097556e-07, + "loss": 0.0008, + "num_input_tokens_seen": 208620848, + "step": 171455 + }, + { + "epoch": 19.095667669005458, + "grad_norm": 0.004924136213958263, + "learning_rate": 3.10832067112854e-07, + "loss": 0.09, + "num_input_tokens_seen": 208626640, + "step": 171460 + }, + { + "epoch": 19.096224523889074, + "grad_norm": 0.2696656584739685, + "learning_rate": 3.1045022786370394e-07, + "loss": 0.0175, + "num_input_tokens_seen": 208632784, + "step": 171465 + }, + { + "epoch": 19.096781378772693, + "grad_norm": 0.2851199209690094, + "learning_rate": 3.100686218271337e-07, + "loss": 0.0126, + "num_input_tokens_seen": 208638704, + "step": 171470 + }, + { + "epoch": 19.09733823365631, + "grad_norm": 0.009466511197388172, + "learning_rate": 3.0968724900674597e-07, + "loss": 0.0265, + "num_input_tokens_seen": 208644208, + "step": 171475 + }, + { + "epoch": 19.097895088539925, + "grad_norm": 0.07067693769931793, + "learning_rate": 3.0930610940614613e-07, + "loss": 0.0046, + "num_input_tokens_seen": 208650224, + "step": 171480 + }, + { + "epoch": 19.098451943423544, + "grad_norm": 0.09344586730003357, + "learning_rate": 3.089252030289314e-07, + "loss": 0.0166, + "num_input_tokens_seen": 208656176, + "step": 171485 + }, + { + "epoch": 19.09900879830716, + "grad_norm": 0.0031822004821151495, + "learning_rate": 3.0854452987869874e-07, + "loss": 0.0138, + "num_input_tokens_seen": 208662224, + "step": 171490 + }, + { + "epoch": 19.09956565319078, + "grad_norm": 0.6222872138023376, + "learning_rate": 3.0816408995904547e-07, + "loss": 0.0303, + "num_input_tokens_seen": 208668080, + "step": 171495 + }, + { + "epoch": 19.100122508074396, + "grad_norm": 0.8451666831970215, + "learning_rate": 3.077838832735658e-07, + "loss": 0.0638, + "num_input_tokens_seen": 208673744, + "step": 171500 + }, + { + "epoch": 19.10067936295801, + "grad_norm": 1.1714385747909546, + "learning_rate": 3.074039098258513e-07, + "loss": 0.1112, + "num_input_tokens_seen": 208679696, + "step": 171505 + }, + { + "epoch": 19.10123621784163, + "grad_norm": 0.001243076752871275, + "learning_rate": 3.070241696194881e-07, + "loss": 0.0012, + "num_input_tokens_seen": 208685648, + "step": 171510 + }, + { + "epoch": 19.101793072725247, + "grad_norm": 0.5395377278327942, + "learning_rate": 3.066446626580649e-07, + "loss": 0.0379, + "num_input_tokens_seen": 208691728, + "step": 171515 + }, + { + "epoch": 19.102349927608866, + "grad_norm": 0.5279502868652344, + "learning_rate": 3.0626538894516775e-07, + "loss": 0.0643, + "num_input_tokens_seen": 208697712, + "step": 171520 + }, + { + "epoch": 19.102906782492482, + "grad_norm": 0.7950231432914734, + "learning_rate": 3.058863484843827e-07, + "loss": 0.0507, + "num_input_tokens_seen": 208703856, + "step": 171525 + }, + { + "epoch": 19.103463637376098, + "grad_norm": 0.043736595660448074, + "learning_rate": 3.055075412792818e-07, + "loss": 0.0603, + "num_input_tokens_seen": 208709712, + "step": 171530 + }, + { + "epoch": 19.104020492259718, + "grad_norm": 0.0003260670055169612, + "learning_rate": 3.0512896733344563e-07, + "loss": 0.0261, + "num_input_tokens_seen": 208716496, + "step": 171535 + }, + { + "epoch": 19.104577347143334, + "grad_norm": 0.5338280200958252, + "learning_rate": 3.047506266504546e-07, + "loss": 0.0071, + "num_input_tokens_seen": 208722800, + "step": 171540 + }, + { + "epoch": 19.105134202026953, + "grad_norm": 0.37111932039260864, + "learning_rate": 3.043725192338753e-07, + "loss": 0.0813, + "num_input_tokens_seen": 208728688, + "step": 171545 + }, + { + "epoch": 19.10569105691057, + "grad_norm": 0.5498689413070679, + "learning_rate": 3.0399464508728825e-07, + "loss": 0.0077, + "num_input_tokens_seen": 208734832, + "step": 171550 + }, + { + "epoch": 19.106247911794185, + "grad_norm": 0.08548815548419952, + "learning_rate": 3.0361700421425444e-07, + "loss": 0.0013, + "num_input_tokens_seen": 208741040, + "step": 171555 + }, + { + "epoch": 19.106804766677804, + "grad_norm": 0.044240206480026245, + "learning_rate": 3.03239596618346e-07, + "loss": 0.0263, + "num_input_tokens_seen": 208746736, + "step": 171560 + }, + { + "epoch": 19.10736162156142, + "grad_norm": 0.11016719788312912, + "learning_rate": 3.02862422303124e-07, + "loss": 0.0057, + "num_input_tokens_seen": 208752944, + "step": 171565 + }, + { + "epoch": 19.10791847644504, + "grad_norm": 0.015375959686934948, + "learning_rate": 3.0248548127215504e-07, + "loss": 0.019, + "num_input_tokens_seen": 208758992, + "step": 171570 + }, + { + "epoch": 19.108475331328655, + "grad_norm": 1.1608622074127197, + "learning_rate": 3.021087735290001e-07, + "loss": 0.0345, + "num_input_tokens_seen": 208764944, + "step": 171575 + }, + { + "epoch": 19.10903218621227, + "grad_norm": 0.040849801152944565, + "learning_rate": 3.017322990772148e-07, + "loss": 0.0149, + "num_input_tokens_seen": 208771632, + "step": 171580 + }, + { + "epoch": 19.10958904109589, + "grad_norm": 2.991391658782959, + "learning_rate": 3.0135605792035173e-07, + "loss": 0.0765, + "num_input_tokens_seen": 208777392, + "step": 171585 + }, + { + "epoch": 19.110145895979507, + "grad_norm": 2.306758403778076, + "learning_rate": 3.0098005006197196e-07, + "loss": 0.0341, + "num_input_tokens_seen": 208783696, + "step": 171590 + }, + { + "epoch": 19.110702750863126, + "grad_norm": 0.006267842836678028, + "learning_rate": 3.0060427550562544e-07, + "loss": 0.1047, + "num_input_tokens_seen": 208789456, + "step": 171595 + }, + { + "epoch": 19.111259605746742, + "grad_norm": 0.000428812752943486, + "learning_rate": 3.002287342548593e-07, + "loss": 0.0139, + "num_input_tokens_seen": 208795728, + "step": 171600 + }, + { + "epoch": 19.111816460630358, + "grad_norm": 0.05770677700638771, + "learning_rate": 2.998534263132208e-07, + "loss": 0.0432, + "num_input_tokens_seen": 208801904, + "step": 171605 + }, + { + "epoch": 19.112373315513977, + "grad_norm": 0.13228009641170502, + "learning_rate": 2.9947835168425696e-07, + "loss": 0.004, + "num_input_tokens_seen": 208807888, + "step": 171610 + }, + { + "epoch": 19.112930170397593, + "grad_norm": 0.9531916379928589, + "learning_rate": 2.991035103715095e-07, + "loss": 0.0674, + "num_input_tokens_seen": 208814192, + "step": 171615 + }, + { + "epoch": 19.113487025281213, + "grad_norm": 0.004103296902030706, + "learning_rate": 2.9872890237852003e-07, + "loss": 0.0228, + "num_input_tokens_seen": 208820272, + "step": 171620 + }, + { + "epoch": 19.11404388016483, + "grad_norm": 0.0872260257601738, + "learning_rate": 2.9835452770882457e-07, + "loss": 0.012, + "num_input_tokens_seen": 208826768, + "step": 171625 + }, + { + "epoch": 19.114600735048448, + "grad_norm": 0.21938811242580414, + "learning_rate": 2.979803863659647e-07, + "loss": 0.0087, + "num_input_tokens_seen": 208833072, + "step": 171630 + }, + { + "epoch": 19.115157589932064, + "grad_norm": 0.8658510446548462, + "learning_rate": 2.9760647835346824e-07, + "loss": 0.0448, + "num_input_tokens_seen": 208839280, + "step": 171635 + }, + { + "epoch": 19.11571444481568, + "grad_norm": 0.021588362753391266, + "learning_rate": 2.9723280367487114e-07, + "loss": 0.0318, + "num_input_tokens_seen": 208845456, + "step": 171640 + }, + { + "epoch": 19.1162712996993, + "grad_norm": 0.003509674221277237, + "learning_rate": 2.9685936233370127e-07, + "loss": 0.0396, + "num_input_tokens_seen": 208851664, + "step": 171645 + }, + { + "epoch": 19.116828154582915, + "grad_norm": 0.30672284960746765, + "learning_rate": 2.9648615433348624e-07, + "loss": 0.0621, + "num_input_tokens_seen": 208857744, + "step": 171650 + }, + { + "epoch": 19.117385009466535, + "grad_norm": 0.16526737809181213, + "learning_rate": 2.961131796777511e-07, + "loss": 0.0076, + "num_input_tokens_seen": 208863088, + "step": 171655 + }, + { + "epoch": 19.11794186435015, + "grad_norm": 0.936322808265686, + "learning_rate": 2.9574043837002076e-07, + "loss": 0.0284, + "num_input_tokens_seen": 208869104, + "step": 171660 + }, + { + "epoch": 19.118498719233767, + "grad_norm": 0.0896170362830162, + "learning_rate": 2.953679304138146e-07, + "loss": 0.0714, + "num_input_tokens_seen": 208875216, + "step": 171665 + }, + { + "epoch": 19.119055574117386, + "grad_norm": 1.3015985488891602, + "learning_rate": 2.9499565581264933e-07, + "loss": 0.1131, + "num_input_tokens_seen": 208881424, + "step": 171670 + }, + { + "epoch": 19.119612429001002, + "grad_norm": 0.3807185888290405, + "learning_rate": 2.946236145700443e-07, + "loss": 0.0033, + "num_input_tokens_seen": 208887632, + "step": 171675 + }, + { + "epoch": 19.12016928388462, + "grad_norm": 0.21784064173698425, + "learning_rate": 2.9425180668951337e-07, + "loss": 0.0053, + "num_input_tokens_seen": 208893968, + "step": 171680 + }, + { + "epoch": 19.120726138768237, + "grad_norm": 1.6097354888916016, + "learning_rate": 2.938802321745676e-07, + "loss": 0.0327, + "num_input_tokens_seen": 208900176, + "step": 171685 + }, + { + "epoch": 19.121282993651853, + "grad_norm": 0.6943201422691345, + "learning_rate": 2.9350889102871816e-07, + "loss": 0.0599, + "num_input_tokens_seen": 208906128, + "step": 171690 + }, + { + "epoch": 19.121839848535473, + "grad_norm": 0.029692521318793297, + "learning_rate": 2.9313778325547323e-07, + "loss": 0.0575, + "num_input_tokens_seen": 208912080, + "step": 171695 + }, + { + "epoch": 19.12239670341909, + "grad_norm": 0.12193629145622253, + "learning_rate": 2.927669088583329e-07, + "loss": 0.0837, + "num_input_tokens_seen": 208918192, + "step": 171700 + }, + { + "epoch": 19.122953558302708, + "grad_norm": 0.005457218270748854, + "learning_rate": 2.923962678408054e-07, + "loss": 0.1357, + "num_input_tokens_seen": 208924304, + "step": 171705 + }, + { + "epoch": 19.123510413186324, + "grad_norm": 0.003406622214242816, + "learning_rate": 2.9202586020639357e-07, + "loss": 0.0429, + "num_input_tokens_seen": 208930608, + "step": 171710 + }, + { + "epoch": 19.12406726806994, + "grad_norm": 0.0001621795236133039, + "learning_rate": 2.916556859585917e-07, + "loss": 0.0053, + "num_input_tokens_seen": 208936624, + "step": 171715 + }, + { + "epoch": 19.12462412295356, + "grad_norm": 0.05050050467252731, + "learning_rate": 2.912857451008971e-07, + "loss": 0.0191, + "num_input_tokens_seen": 208942800, + "step": 171720 + }, + { + "epoch": 19.125180977837175, + "grad_norm": 0.018720563501119614, + "learning_rate": 2.9091603763680417e-07, + "loss": 0.0126, + "num_input_tokens_seen": 208948656, + "step": 171725 + }, + { + "epoch": 19.125737832720795, + "grad_norm": 0.8547779321670532, + "learning_rate": 2.9054656356980456e-07, + "loss": 0.0316, + "num_input_tokens_seen": 208954800, + "step": 171730 + }, + { + "epoch": 19.12629468760441, + "grad_norm": 0.011717543937265873, + "learning_rate": 2.901773229033927e-07, + "loss": 0.0464, + "num_input_tokens_seen": 208960592, + "step": 171735 + }, + { + "epoch": 19.126851542488026, + "grad_norm": 0.03699200972914696, + "learning_rate": 2.8980831564105195e-07, + "loss": 0.0139, + "num_input_tokens_seen": 208967120, + "step": 171740 + }, + { + "epoch": 19.127408397371646, + "grad_norm": 1.0548160076141357, + "learning_rate": 2.894395417862683e-07, + "loss": 0.023, + "num_input_tokens_seen": 208973040, + "step": 171745 + }, + { + "epoch": 19.12796525225526, + "grad_norm": 0.25795862078666687, + "learning_rate": 2.8907100134252796e-07, + "loss": 0.0192, + "num_input_tokens_seen": 208979312, + "step": 171750 + }, + { + "epoch": 19.12852210713888, + "grad_norm": 0.11004916578531265, + "learning_rate": 2.8870269431330866e-07, + "loss": 0.016, + "num_input_tokens_seen": 208985424, + "step": 171755 + }, + { + "epoch": 19.129078962022497, + "grad_norm": 0.2276725322008133, + "learning_rate": 2.8833462070209096e-07, + "loss": 0.0034, + "num_input_tokens_seen": 208991664, + "step": 171760 + }, + { + "epoch": 19.129635816906113, + "grad_norm": 0.3015928268432617, + "learning_rate": 2.8796678051235257e-07, + "loss": 0.0551, + "num_input_tokens_seen": 208997712, + "step": 171765 + }, + { + "epoch": 19.130192671789732, + "grad_norm": 0.04559328779578209, + "learning_rate": 2.8759917374756584e-07, + "loss": 0.0581, + "num_input_tokens_seen": 209003824, + "step": 171770 + }, + { + "epoch": 19.13074952667335, + "grad_norm": 0.00048785164835862815, + "learning_rate": 2.872318004112029e-07, + "loss": 0.2319, + "num_input_tokens_seen": 209009968, + "step": 171775 + }, + { + "epoch": 19.131306381556968, + "grad_norm": 0.04343722388148308, + "learning_rate": 2.8686466050673876e-07, + "loss": 0.013, + "num_input_tokens_seen": 209016112, + "step": 171780 + }, + { + "epoch": 19.131863236440584, + "grad_norm": 0.6924721598625183, + "learning_rate": 2.8649775403763456e-07, + "loss": 0.0345, + "num_input_tokens_seen": 209022192, + "step": 171785 + }, + { + "epoch": 19.1324200913242, + "grad_norm": 0.1480938047170639, + "learning_rate": 2.8613108100735975e-07, + "loss": 0.0329, + "num_input_tokens_seen": 209027344, + "step": 171790 + }, + { + "epoch": 19.13297694620782, + "grad_norm": 0.00020035951456520706, + "learning_rate": 2.857646414193782e-07, + "loss": 0.0077, + "num_input_tokens_seen": 209033584, + "step": 171795 + }, + { + "epoch": 19.133533801091435, + "grad_norm": 1.2738265991210938, + "learning_rate": 2.8539843527714827e-07, + "loss": 0.1615, + "num_input_tokens_seen": 209039824, + "step": 171800 + }, + { + "epoch": 19.134090655975054, + "grad_norm": 0.00738160964101553, + "learning_rate": 2.8503246258413387e-07, + "loss": 0.0062, + "num_input_tokens_seen": 209046192, + "step": 171805 + }, + { + "epoch": 19.13464751085867, + "grad_norm": 0.00016679789405316114, + "learning_rate": 2.8466672334378774e-07, + "loss": 0.0376, + "num_input_tokens_seen": 209052016, + "step": 171810 + }, + { + "epoch": 19.135204365742286, + "grad_norm": 0.11874783784151077, + "learning_rate": 2.843012175595655e-07, + "loss": 0.0275, + "num_input_tokens_seen": 209057904, + "step": 171815 + }, + { + "epoch": 19.135761220625906, + "grad_norm": 0.00022275773517321795, + "learning_rate": 2.8393594523492273e-07, + "loss": 0.113, + "num_input_tokens_seen": 209064368, + "step": 171820 + }, + { + "epoch": 19.13631807550952, + "grad_norm": 0.46491140127182007, + "learning_rate": 2.835709063733039e-07, + "loss": 0.0282, + "num_input_tokens_seen": 209070704, + "step": 171825 + }, + { + "epoch": 19.13687493039314, + "grad_norm": 0.0016014527063816786, + "learning_rate": 2.832061009781617e-07, + "loss": 0.0635, + "num_input_tokens_seen": 209076624, + "step": 171830 + }, + { + "epoch": 19.137431785276757, + "grad_norm": 0.007148748729377985, + "learning_rate": 2.828415290529407e-07, + "loss": 0.0032, + "num_input_tokens_seen": 209082800, + "step": 171835 + }, + { + "epoch": 19.137988640160373, + "grad_norm": 0.003759172512218356, + "learning_rate": 2.8247719060108533e-07, + "loss": 0.0063, + "num_input_tokens_seen": 209088752, + "step": 171840 + }, + { + "epoch": 19.138545495043992, + "grad_norm": 0.48739540576934814, + "learning_rate": 2.8211308562603453e-07, + "loss": 0.0681, + "num_input_tokens_seen": 209094640, + "step": 171845 + }, + { + "epoch": 19.139102349927608, + "grad_norm": 0.0021745823323726654, + "learning_rate": 2.8174921413123e-07, + "loss": 0.0279, + "num_input_tokens_seen": 209100848, + "step": 171850 + }, + { + "epoch": 19.139659204811228, + "grad_norm": 1.6529808044433594, + "learning_rate": 2.8138557612010784e-07, + "loss": 0.0527, + "num_input_tokens_seen": 209106960, + "step": 171855 + }, + { + "epoch": 19.140216059694843, + "grad_norm": 2.974902629852295, + "learning_rate": 2.810221715961042e-07, + "loss": 0.0616, + "num_input_tokens_seen": 209113296, + "step": 171860 + }, + { + "epoch": 19.14077291457846, + "grad_norm": 0.2239532470703125, + "learning_rate": 2.8065900056264973e-07, + "loss": 0.0206, + "num_input_tokens_seen": 209119248, + "step": 171865 + }, + { + "epoch": 19.14132976946208, + "grad_norm": 1.7136907577514648, + "learning_rate": 2.802960630231777e-07, + "loss": 0.1219, + "num_input_tokens_seen": 209125360, + "step": 171870 + }, + { + "epoch": 19.141886624345695, + "grad_norm": 0.09760517627000809, + "learning_rate": 2.7993335898111037e-07, + "loss": 0.0807, + "num_input_tokens_seen": 209131376, + "step": 171875 + }, + { + "epoch": 19.142443479229314, + "grad_norm": 0.5095779299736023, + "learning_rate": 2.795708884398812e-07, + "loss": 0.0692, + "num_input_tokens_seen": 209137520, + "step": 171880 + }, + { + "epoch": 19.14300033411293, + "grad_norm": 0.3703300952911377, + "learning_rate": 2.7920865140290964e-07, + "loss": 0.0116, + "num_input_tokens_seen": 209143536, + "step": 171885 + }, + { + "epoch": 19.143557188996546, + "grad_norm": 0.2519131004810333, + "learning_rate": 2.788466478736179e-07, + "loss": 0.1363, + "num_input_tokens_seen": 209149840, + "step": 171890 + }, + { + "epoch": 19.144114043880165, + "grad_norm": 1.0024642944335938, + "learning_rate": 2.7848487785542556e-07, + "loss": 0.0761, + "num_input_tokens_seen": 209156080, + "step": 171895 + }, + { + "epoch": 19.14467089876378, + "grad_norm": 0.0011610179208219051, + "learning_rate": 2.78123341351752e-07, + "loss": 0.0008, + "num_input_tokens_seen": 209162192, + "step": 171900 + }, + { + "epoch": 19.1452277536474, + "grad_norm": 0.004659797064960003, + "learning_rate": 2.7776203836600844e-07, + "loss": 0.044, + "num_input_tokens_seen": 209168496, + "step": 171905 + }, + { + "epoch": 19.145784608531017, + "grad_norm": 0.45628541707992554, + "learning_rate": 2.774009689016116e-07, + "loss": 0.0316, + "num_input_tokens_seen": 209174512, + "step": 171910 + }, + { + "epoch": 19.146341463414632, + "grad_norm": 0.03412278741598129, + "learning_rate": 2.7704013296196706e-07, + "loss": 0.174, + "num_input_tokens_seen": 209180688, + "step": 171915 + }, + { + "epoch": 19.146898318298252, + "grad_norm": 0.026585225015878677, + "learning_rate": 2.766795305504888e-07, + "loss": 0.0772, + "num_input_tokens_seen": 209186800, + "step": 171920 + }, + { + "epoch": 19.147455173181868, + "grad_norm": 1.8396615982055664, + "learning_rate": 2.763191616705796e-07, + "loss": 0.0288, + "num_input_tokens_seen": 209192720, + "step": 171925 + }, + { + "epoch": 19.148012028065487, + "grad_norm": 0.016355760395526886, + "learning_rate": 2.7595902632564505e-07, + "loss": 0.0021, + "num_input_tokens_seen": 209198832, + "step": 171930 + }, + { + "epoch": 19.148568882949103, + "grad_norm": 9.24856067285873e-05, + "learning_rate": 2.755991245190853e-07, + "loss": 0.0751, + "num_input_tokens_seen": 209204816, + "step": 171935 + }, + { + "epoch": 19.14912573783272, + "grad_norm": 0.04495207220315933, + "learning_rate": 2.7523945625430037e-07, + "loss": 0.0309, + "num_input_tokens_seen": 209210960, + "step": 171940 + }, + { + "epoch": 19.14968259271634, + "grad_norm": 1.3107072114944458, + "learning_rate": 2.748800215346875e-07, + "loss": 0.1369, + "num_input_tokens_seen": 209217232, + "step": 171945 + }, + { + "epoch": 19.150239447599954, + "grad_norm": 0.01707073301076889, + "learning_rate": 2.7452082036364126e-07, + "loss": 0.0046, + "num_input_tokens_seen": 209223760, + "step": 171950 + }, + { + "epoch": 19.150796302483574, + "grad_norm": 0.01991991139948368, + "learning_rate": 2.7416185274455886e-07, + "loss": 0.056, + "num_input_tokens_seen": 209229872, + "step": 171955 + }, + { + "epoch": 19.15135315736719, + "grad_norm": 0.012737481854856014, + "learning_rate": 2.738031186808265e-07, + "loss": 0.008, + "num_input_tokens_seen": 209235952, + "step": 171960 + }, + { + "epoch": 19.151910012250806, + "grad_norm": 1.277430772781372, + "learning_rate": 2.734446181758332e-07, + "loss": 0.0718, + "num_input_tokens_seen": 209241456, + "step": 171965 + }, + { + "epoch": 19.152466867134425, + "grad_norm": 0.021348342299461365, + "learning_rate": 2.730863512329651e-07, + "loss": 0.073, + "num_input_tokens_seen": 209247440, + "step": 171970 + }, + { + "epoch": 19.15302372201804, + "grad_norm": 0.2856217622756958, + "learning_rate": 2.727283178556084e-07, + "loss": 0.0045, + "num_input_tokens_seen": 209253872, + "step": 171975 + }, + { + "epoch": 19.15358057690166, + "grad_norm": 0.8495399355888367, + "learning_rate": 2.7237051804714365e-07, + "loss": 0.0232, + "num_input_tokens_seen": 209259952, + "step": 171980 + }, + { + "epoch": 19.154137431785276, + "grad_norm": 0.039785925298929214, + "learning_rate": 2.7201295181095154e-07, + "loss": 0.0422, + "num_input_tokens_seen": 209266000, + "step": 171985 + }, + { + "epoch": 19.154694286668896, + "grad_norm": 0.09933488816022873, + "learning_rate": 2.7165561915040995e-07, + "loss": 0.0759, + "num_input_tokens_seen": 209271760, + "step": 171990 + }, + { + "epoch": 19.15525114155251, + "grad_norm": 0.026930756866931915, + "learning_rate": 2.7129852006889113e-07, + "loss": 0.0027, + "num_input_tokens_seen": 209278320, + "step": 171995 + }, + { + "epoch": 19.155807996436128, + "grad_norm": 0.01570165529847145, + "learning_rate": 2.7094165456977014e-07, + "loss": 0.006, + "num_input_tokens_seen": 209284400, + "step": 172000 + }, + { + "epoch": 19.156364851319747, + "grad_norm": 0.18288609385490417, + "learning_rate": 2.705850226564194e-07, + "loss": 0.005, + "num_input_tokens_seen": 209290000, + "step": 172005 + }, + { + "epoch": 19.156921706203363, + "grad_norm": 0.007184979971498251, + "learning_rate": 2.7022862433220276e-07, + "loss": 0.1089, + "num_input_tokens_seen": 209296048, + "step": 172010 + }, + { + "epoch": 19.157478561086982, + "grad_norm": 0.5660102963447571, + "learning_rate": 2.6987245960049257e-07, + "loss": 0.0107, + "num_input_tokens_seen": 209302192, + "step": 172015 + }, + { + "epoch": 19.1580354159706, + "grad_norm": 0.020094571635127068, + "learning_rate": 2.6951652846465003e-07, + "loss": 0.0524, + "num_input_tokens_seen": 209308336, + "step": 172020 + }, + { + "epoch": 19.158592270854214, + "grad_norm": 1.5056545734405518, + "learning_rate": 2.6916083092803635e-07, + "loss": 0.0424, + "num_input_tokens_seen": 209313680, + "step": 172025 + }, + { + "epoch": 19.159149125737834, + "grad_norm": 0.009267273359000683, + "learning_rate": 2.6880536699401546e-07, + "loss": 0.012, + "num_input_tokens_seen": 209320016, + "step": 172030 + }, + { + "epoch": 19.15970598062145, + "grad_norm": 0.042092349380254745, + "learning_rate": 2.6845013666594034e-07, + "loss": 0.0012, + "num_input_tokens_seen": 209326096, + "step": 172035 + }, + { + "epoch": 19.16026283550507, + "grad_norm": 4.748037815093994, + "learning_rate": 2.680951399471665e-07, + "loss": 0.0215, + "num_input_tokens_seen": 209332144, + "step": 172040 + }, + { + "epoch": 19.160819690388685, + "grad_norm": 0.025572920218110085, + "learning_rate": 2.6774037684105245e-07, + "loss": 0.0037, + "num_input_tokens_seen": 209338192, + "step": 172045 + }, + { + "epoch": 19.1613765452723, + "grad_norm": 0.5310491919517517, + "learning_rate": 2.6738584735094273e-07, + "loss": 0.0646, + "num_input_tokens_seen": 209343792, + "step": 172050 + }, + { + "epoch": 19.16193340015592, + "grad_norm": 0.014818099327385426, + "learning_rate": 2.67031551480193e-07, + "loss": 0.0134, + "num_input_tokens_seen": 209349968, + "step": 172055 + }, + { + "epoch": 19.162490255039536, + "grad_norm": 0.045459214597940445, + "learning_rate": 2.666774892321422e-07, + "loss": 0.0006, + "num_input_tokens_seen": 209356144, + "step": 172060 + }, + { + "epoch": 19.163047109923156, + "grad_norm": 0.41043588519096375, + "learning_rate": 2.6632366061014044e-07, + "loss": 0.037, + "num_input_tokens_seen": 209362128, + "step": 172065 + }, + { + "epoch": 19.16360396480677, + "grad_norm": 1.1040359735488892, + "learning_rate": 2.6597006561752404e-07, + "loss": 0.0739, + "num_input_tokens_seen": 209368208, + "step": 172070 + }, + { + "epoch": 19.164160819690387, + "grad_norm": 0.9171571135520935, + "learning_rate": 2.6561670425764294e-07, + "loss": 0.0148, + "num_input_tokens_seen": 209373968, + "step": 172075 + }, + { + "epoch": 19.164717674574007, + "grad_norm": 0.014196457341313362, + "learning_rate": 2.652635765338252e-07, + "loss": 0.0186, + "num_input_tokens_seen": 209380208, + "step": 172080 + }, + { + "epoch": 19.165274529457623, + "grad_norm": 0.012628700584173203, + "learning_rate": 2.6491068244941243e-07, + "loss": 0.0021, + "num_input_tokens_seen": 209386096, + "step": 172085 + }, + { + "epoch": 19.165831384341242, + "grad_norm": 0.011262445710599422, + "learning_rate": 2.645580220077326e-07, + "loss": 0.0104, + "num_input_tokens_seen": 209392432, + "step": 172090 + }, + { + "epoch": 19.166388239224858, + "grad_norm": 0.028506526723504066, + "learning_rate": 2.6420559521212195e-07, + "loss": 0.0137, + "num_input_tokens_seen": 209398704, + "step": 172095 + }, + { + "epoch": 19.166945094108474, + "grad_norm": 0.0009037668351083994, + "learning_rate": 2.6385340206590835e-07, + "loss": 0.0149, + "num_input_tokens_seen": 209404784, + "step": 172100 + }, + { + "epoch": 19.167501948992093, + "grad_norm": 0.005283468868583441, + "learning_rate": 2.635014425724169e-07, + "loss": 0.0093, + "num_input_tokens_seen": 209410832, + "step": 172105 + }, + { + "epoch": 19.16805880387571, + "grad_norm": 0.048391032963991165, + "learning_rate": 2.6314971673497e-07, + "loss": 0.012, + "num_input_tokens_seen": 209417008, + "step": 172110 + }, + { + "epoch": 19.16861565875933, + "grad_norm": 1.7367364168167114, + "learning_rate": 2.6279822455689554e-07, + "loss": 0.07, + "num_input_tokens_seen": 209423088, + "step": 172115 + }, + { + "epoch": 19.169172513642945, + "grad_norm": 0.0017045405693352222, + "learning_rate": 2.6244696604151296e-07, + "loss": 0.0018, + "num_input_tokens_seen": 209429296, + "step": 172120 + }, + { + "epoch": 19.16972936852656, + "grad_norm": 0.12133920937776566, + "learning_rate": 2.6209594119213644e-07, + "loss": 0.0814, + "num_input_tokens_seen": 209435408, + "step": 172125 + }, + { + "epoch": 19.17028622341018, + "grad_norm": 1.2879658937454224, + "learning_rate": 2.6174515001207986e-07, + "loss": 0.019, + "num_input_tokens_seen": 209441424, + "step": 172130 + }, + { + "epoch": 19.170843078293796, + "grad_norm": 0.005371930077672005, + "learning_rate": 2.613945925046657e-07, + "loss": 0.0036, + "num_input_tokens_seen": 209447472, + "step": 172135 + }, + { + "epoch": 19.171399933177415, + "grad_norm": 0.6646022200584412, + "learning_rate": 2.610442686731968e-07, + "loss": 0.0385, + "num_input_tokens_seen": 209453808, + "step": 172140 + }, + { + "epoch": 19.17195678806103, + "grad_norm": 0.3894234299659729, + "learning_rate": 2.606941785209871e-07, + "loss": 0.0361, + "num_input_tokens_seen": 209459856, + "step": 172145 + }, + { + "epoch": 19.172513642944647, + "grad_norm": 0.011022320948541164, + "learning_rate": 2.6034432205133964e-07, + "loss": 0.0066, + "num_input_tokens_seen": 209465808, + "step": 172150 + }, + { + "epoch": 19.173070497828267, + "grad_norm": 0.23705196380615234, + "learning_rate": 2.599946992675628e-07, + "loss": 0.117, + "num_input_tokens_seen": 209472048, + "step": 172155 + }, + { + "epoch": 19.173627352711883, + "grad_norm": 0.06696861237287521, + "learning_rate": 2.596453101729568e-07, + "loss": 0.0384, + "num_input_tokens_seen": 209477904, + "step": 172160 + }, + { + "epoch": 19.174184207595502, + "grad_norm": 0.6411721706390381, + "learning_rate": 2.592961547708217e-07, + "loss": 0.0115, + "num_input_tokens_seen": 209484240, + "step": 172165 + }, + { + "epoch": 19.174741062479118, + "grad_norm": 0.19768334925174713, + "learning_rate": 2.5894723306445767e-07, + "loss": 0.0667, + "num_input_tokens_seen": 209490000, + "step": 172170 + }, + { + "epoch": 19.175297917362734, + "grad_norm": 0.0540362186729908, + "learning_rate": 2.585985450571593e-07, + "loss": 0.1027, + "num_input_tokens_seen": 209495760, + "step": 172175 + }, + { + "epoch": 19.175854772246353, + "grad_norm": 0.00040901792817749083, + "learning_rate": 2.582500907522184e-07, + "loss": 0.0382, + "num_input_tokens_seen": 209501808, + "step": 172180 + }, + { + "epoch": 19.17641162712997, + "grad_norm": 0.5147989988327026, + "learning_rate": 2.5790187015292953e-07, + "loss": 0.1605, + "num_input_tokens_seen": 209507376, + "step": 172185 + }, + { + "epoch": 19.17696848201359, + "grad_norm": 0.00019571308803278953, + "learning_rate": 2.57553883262579e-07, + "loss": 0.0207, + "num_input_tokens_seen": 209513360, + "step": 172190 + }, + { + "epoch": 19.177525336897205, + "grad_norm": 0.0760369524359703, + "learning_rate": 2.572061300844586e-07, + "loss": 0.0089, + "num_input_tokens_seen": 209519120, + "step": 172195 + }, + { + "epoch": 19.17808219178082, + "grad_norm": 0.7351414561271667, + "learning_rate": 2.5685861062184625e-07, + "loss": 0.1105, + "num_input_tokens_seen": 209524976, + "step": 172200 + }, + { + "epoch": 19.17863904666444, + "grad_norm": 0.04027029499411583, + "learning_rate": 2.5651132487803096e-07, + "loss": 0.0982, + "num_input_tokens_seen": 209531152, + "step": 172205 + }, + { + "epoch": 19.179195901548056, + "grad_norm": 0.8787940144538879, + "learning_rate": 2.5616427285628797e-07, + "loss": 0.0482, + "num_input_tokens_seen": 209537520, + "step": 172210 + }, + { + "epoch": 19.179752756431675, + "grad_norm": 0.0012016419786959887, + "learning_rate": 2.558174545599007e-07, + "loss": 0.1841, + "num_input_tokens_seen": 209543536, + "step": 172215 + }, + { + "epoch": 19.18030961131529, + "grad_norm": 0.003473857883363962, + "learning_rate": 2.5547086999213877e-07, + "loss": 0.0081, + "num_input_tokens_seen": 209549296, + "step": 172220 + }, + { + "epoch": 19.180866466198907, + "grad_norm": 0.8217276930809021, + "learning_rate": 2.551245191562829e-07, + "loss": 0.1141, + "num_input_tokens_seen": 209555216, + "step": 172225 + }, + { + "epoch": 19.181423321082526, + "grad_norm": 0.01334469672292471, + "learning_rate": 2.547784020555971e-07, + "loss": 0.1296, + "num_input_tokens_seen": 209561488, + "step": 172230 + }, + { + "epoch": 19.181980175966142, + "grad_norm": 0.03724943846464157, + "learning_rate": 2.5443251869335937e-07, + "loss": 0.1119, + "num_input_tokens_seen": 209567088, + "step": 172235 + }, + { + "epoch": 19.182537030849762, + "grad_norm": 0.0013925401726737618, + "learning_rate": 2.5408686907283096e-07, + "loss": 0.0118, + "num_input_tokens_seen": 209573328, + "step": 172240 + }, + { + "epoch": 19.183093885733378, + "grad_norm": 0.0023477417416870594, + "learning_rate": 2.537414531972787e-07, + "loss": 0.0491, + "num_input_tokens_seen": 209579248, + "step": 172245 + }, + { + "epoch": 19.183650740616994, + "grad_norm": 1.9783461093902588, + "learning_rate": 2.533962710699611e-07, + "loss": 0.0264, + "num_input_tokens_seen": 209584976, + "step": 172250 + }, + { + "epoch": 19.184207595500613, + "grad_norm": 0.010245706886053085, + "learning_rate": 2.53051322694145e-07, + "loss": 0.023, + "num_input_tokens_seen": 209591376, + "step": 172255 + }, + { + "epoch": 19.18476445038423, + "grad_norm": 1.4248806238174438, + "learning_rate": 2.527066080730861e-07, + "loss": 0.0421, + "num_input_tokens_seen": 209597552, + "step": 172260 + }, + { + "epoch": 19.18532130526785, + "grad_norm": 0.0441189743578434, + "learning_rate": 2.5236212721004295e-07, + "loss": 0.306, + "num_input_tokens_seen": 209603856, + "step": 172265 + }, + { + "epoch": 19.185878160151464, + "grad_norm": 0.005700273439288139, + "learning_rate": 2.5201788010826287e-07, + "loss": 0.0341, + "num_input_tokens_seen": 209609776, + "step": 172270 + }, + { + "epoch": 19.18643501503508, + "grad_norm": 0.16593515872955322, + "learning_rate": 2.5167386677100446e-07, + "loss": 0.0063, + "num_input_tokens_seen": 209615760, + "step": 172275 + }, + { + "epoch": 19.1869918699187, + "grad_norm": 0.08689676970243454, + "learning_rate": 2.513300872015123e-07, + "loss": 0.0156, + "num_input_tokens_seen": 209621904, + "step": 172280 + }, + { + "epoch": 19.187548724802316, + "grad_norm": 2.018937587738037, + "learning_rate": 2.509865414030366e-07, + "loss": 0.0823, + "num_input_tokens_seen": 209627920, + "step": 172285 + }, + { + "epoch": 19.188105579685935, + "grad_norm": 0.010181864723563194, + "learning_rate": 2.506432293788219e-07, + "loss": 0.0132, + "num_input_tokens_seen": 209633744, + "step": 172290 + }, + { + "epoch": 19.18866243456955, + "grad_norm": 0.07790467143058777, + "learning_rate": 2.503001511321101e-07, + "loss": 0.0253, + "num_input_tokens_seen": 209640048, + "step": 172295 + }, + { + "epoch": 19.189219289453167, + "grad_norm": 0.08227459341287613, + "learning_rate": 2.4995730666614315e-07, + "loss": 0.0173, + "num_input_tokens_seen": 209646480, + "step": 172300 + }, + { + "epoch": 19.189776144336786, + "grad_norm": 0.04571571573615074, + "learning_rate": 2.496146959841572e-07, + "loss": 0.0029, + "num_input_tokens_seen": 209652688, + "step": 172305 + }, + { + "epoch": 19.190332999220402, + "grad_norm": 0.1121981218457222, + "learning_rate": 2.492723190893914e-07, + "loss": 0.0042, + "num_input_tokens_seen": 209658576, + "step": 172310 + }, + { + "epoch": 19.19088985410402, + "grad_norm": 0.0941990315914154, + "learning_rate": 2.489301759850793e-07, + "loss": 0.0929, + "num_input_tokens_seen": 209664272, + "step": 172315 + }, + { + "epoch": 19.191446708987637, + "grad_norm": 0.4615735411643982, + "learning_rate": 2.4858826667445156e-07, + "loss": 0.0108, + "num_input_tokens_seen": 209670416, + "step": 172320 + }, + { + "epoch": 19.192003563871253, + "grad_norm": 0.06954462826251984, + "learning_rate": 2.48246591160739e-07, + "loss": 0.0081, + "num_input_tokens_seen": 209676560, + "step": 172325 + }, + { + "epoch": 19.192560418754873, + "grad_norm": 0.014782670885324478, + "learning_rate": 2.47905149447164e-07, + "loss": 0.0723, + "num_input_tokens_seen": 209682416, + "step": 172330 + }, + { + "epoch": 19.19311727363849, + "grad_norm": 1.5946508646011353, + "learning_rate": 2.4756394153696296e-07, + "loss": 0.0389, + "num_input_tokens_seen": 209688560, + "step": 172335 + }, + { + "epoch": 19.193674128522108, + "grad_norm": 0.00012916579726152122, + "learning_rate": 2.4722296743334426e-07, + "loss": 0.0891, + "num_input_tokens_seen": 209694640, + "step": 172340 + }, + { + "epoch": 19.194230983405724, + "grad_norm": 0.48297953605651855, + "learning_rate": 2.4688222713954156e-07, + "loss": 0.022, + "num_input_tokens_seen": 209700464, + "step": 172345 + }, + { + "epoch": 19.194787838289344, + "grad_norm": 1.2958943843841553, + "learning_rate": 2.4654172065876614e-07, + "loss": 0.1092, + "num_input_tokens_seen": 209706320, + "step": 172350 + }, + { + "epoch": 19.19534469317296, + "grad_norm": 0.02343902550637722, + "learning_rate": 2.4620144799423486e-07, + "loss": 0.0115, + "num_input_tokens_seen": 209712880, + "step": 172355 + }, + { + "epoch": 19.195901548056575, + "grad_norm": 1.0415329933166504, + "learning_rate": 2.4586140914916735e-07, + "loss": 0.0236, + "num_input_tokens_seen": 209719216, + "step": 172360 + }, + { + "epoch": 19.196458402940195, + "grad_norm": 2.0896925926208496, + "learning_rate": 2.4552160412676937e-07, + "loss": 0.1047, + "num_input_tokens_seen": 209725232, + "step": 172365 + }, + { + "epoch": 19.19701525782381, + "grad_norm": 0.014532475732266903, + "learning_rate": 2.4518203293025233e-07, + "loss": 0.0238, + "num_input_tokens_seen": 209731312, + "step": 172370 + }, + { + "epoch": 19.19757211270743, + "grad_norm": 0.11271727830171585, + "learning_rate": 2.448426955628219e-07, + "loss": 0.0165, + "num_input_tokens_seen": 209737264, + "step": 172375 + }, + { + "epoch": 19.198128967591046, + "grad_norm": 0.00015652687579859048, + "learning_rate": 2.4450359202768946e-07, + "loss": 0.0667, + "num_input_tokens_seen": 209743376, + "step": 172380 + }, + { + "epoch": 19.198685822474662, + "grad_norm": 0.010814175941050053, + "learning_rate": 2.4416472232805243e-07, + "loss": 0.1117, + "num_input_tokens_seen": 209749360, + "step": 172385 + }, + { + "epoch": 19.19924267735828, + "grad_norm": 0.7884610891342163, + "learning_rate": 2.4382608646711656e-07, + "loss": 0.0136, + "num_input_tokens_seen": 209755248, + "step": 172390 + }, + { + "epoch": 19.199799532241897, + "grad_norm": 0.0005117445252835751, + "learning_rate": 2.434876844480738e-07, + "loss": 0.016, + "num_input_tokens_seen": 209761136, + "step": 172395 + }, + { + "epoch": 19.200356387125517, + "grad_norm": 0.03521144390106201, + "learning_rate": 2.4314951627412707e-07, + "loss": 0.0064, + "num_input_tokens_seen": 209767056, + "step": 172400 + }, + { + "epoch": 19.200913242009133, + "grad_norm": 1.9905093908309937, + "learning_rate": 2.428115819484655e-07, + "loss": 0.1586, + "num_input_tokens_seen": 209773136, + "step": 172405 + }, + { + "epoch": 19.20147009689275, + "grad_norm": 1.4496210813522339, + "learning_rate": 2.4247388147428665e-07, + "loss": 0.0566, + "num_input_tokens_seen": 209778576, + "step": 172410 + }, + { + "epoch": 19.202026951776368, + "grad_norm": 0.006151101086288691, + "learning_rate": 2.421364148547739e-07, + "loss": 0.0915, + "num_input_tokens_seen": 209784592, + "step": 172415 + }, + { + "epoch": 19.202583806659984, + "grad_norm": 0.0002498069661669433, + "learning_rate": 2.41799182093122e-07, + "loss": 0.0022, + "num_input_tokens_seen": 209790736, + "step": 172420 + }, + { + "epoch": 19.203140661543603, + "grad_norm": 0.7342391014099121, + "learning_rate": 2.41462183192509e-07, + "loss": 0.0558, + "num_input_tokens_seen": 209797008, + "step": 172425 + }, + { + "epoch": 19.20369751642722, + "grad_norm": 1.2877599000930786, + "learning_rate": 2.41125418156124e-07, + "loss": 0.0164, + "num_input_tokens_seen": 209803056, + "step": 172430 + }, + { + "epoch": 19.204254371310835, + "grad_norm": 0.35632064938545227, + "learning_rate": 2.407888869871477e-07, + "loss": 0.0705, + "num_input_tokens_seen": 209809040, + "step": 172435 + }, + { + "epoch": 19.204811226194455, + "grad_norm": 0.08799941092729568, + "learning_rate": 2.404525896887555e-07, + "loss": 0.0068, + "num_input_tokens_seen": 209815024, + "step": 172440 + }, + { + "epoch": 19.20536808107807, + "grad_norm": 0.370372474193573, + "learning_rate": 2.401165262641225e-07, + "loss": 0.0996, + "num_input_tokens_seen": 209820784, + "step": 172445 + }, + { + "epoch": 19.20592493596169, + "grad_norm": 0.22187452018260956, + "learning_rate": 2.3978069671642955e-07, + "loss": 0.1438, + "num_input_tokens_seen": 209827312, + "step": 172450 + }, + { + "epoch": 19.206481790845306, + "grad_norm": 0.09778905659914017, + "learning_rate": 2.3944510104884633e-07, + "loss": 0.0991, + "num_input_tokens_seen": 209833008, + "step": 172455 + }, + { + "epoch": 19.20703864572892, + "grad_norm": 0.026100432500243187, + "learning_rate": 2.391097392645425e-07, + "loss": 0.0966, + "num_input_tokens_seen": 209838928, + "step": 172460 + }, + { + "epoch": 19.20759550061254, + "grad_norm": 0.0356157161295414, + "learning_rate": 2.387746113666822e-07, + "loss": 0.0392, + "num_input_tokens_seen": 209845392, + "step": 172465 + }, + { + "epoch": 19.208152355496157, + "grad_norm": 0.0037621399387717247, + "learning_rate": 2.3843971735843516e-07, + "loss": 0.0279, + "num_input_tokens_seen": 209851056, + "step": 172470 + }, + { + "epoch": 19.208709210379777, + "grad_norm": 0.5805186033248901, + "learning_rate": 2.3810505724296271e-07, + "loss": 0.0156, + "num_input_tokens_seen": 209857712, + "step": 172475 + }, + { + "epoch": 19.209266065263392, + "grad_norm": 0.0007762937457300723, + "learning_rate": 2.3777063102342901e-07, + "loss": 0.0528, + "num_input_tokens_seen": 209862832, + "step": 172480 + }, + { + "epoch": 19.20982292014701, + "grad_norm": 0.05500403419137001, + "learning_rate": 2.3743643870298982e-07, + "loss": 0.0054, + "num_input_tokens_seen": 209868880, + "step": 172485 + }, + { + "epoch": 19.210379775030628, + "grad_norm": 0.6701152920722961, + "learning_rate": 2.3710248028480376e-07, + "loss": 0.1522, + "num_input_tokens_seen": 209875152, + "step": 172490 + }, + { + "epoch": 19.210936629914244, + "grad_norm": 0.0004025939851999283, + "learning_rate": 2.3676875577202106e-07, + "loss": 0.0174, + "num_input_tokens_seen": 209881200, + "step": 172495 + }, + { + "epoch": 19.211493484797863, + "grad_norm": 0.3956989645957947, + "learning_rate": 2.3643526516780034e-07, + "loss": 0.0645, + "num_input_tokens_seen": 209886512, + "step": 172500 + }, + { + "epoch": 19.21205033968148, + "grad_norm": 0.00302890595048666, + "learning_rate": 2.3610200847528907e-07, + "loss": 0.0146, + "num_input_tokens_seen": 209892656, + "step": 172505 + }, + { + "epoch": 19.212607194565095, + "grad_norm": 0.050640836358070374, + "learning_rate": 2.3576898569763473e-07, + "loss": 0.006, + "num_input_tokens_seen": 209898928, + "step": 172510 + }, + { + "epoch": 19.213164049448714, + "grad_norm": 0.12579034268856049, + "learning_rate": 2.3543619683798202e-07, + "loss": 0.0655, + "num_input_tokens_seen": 209905296, + "step": 172515 + }, + { + "epoch": 19.21372090433233, + "grad_norm": 1.680892825126648, + "learning_rate": 2.3510364189947565e-07, + "loss": 0.0642, + "num_input_tokens_seen": 209911536, + "step": 172520 + }, + { + "epoch": 19.21427775921595, + "grad_norm": 0.05102837085723877, + "learning_rate": 2.3477132088525756e-07, + "loss": 0.0382, + "num_input_tokens_seen": 209917936, + "step": 172525 + }, + { + "epoch": 19.214834614099566, + "grad_norm": 0.003045754274353385, + "learning_rate": 2.3443923379846688e-07, + "loss": 0.0667, + "num_input_tokens_seen": 209923952, + "step": 172530 + }, + { + "epoch": 19.21539146898318, + "grad_norm": 0.0480315238237381, + "learning_rate": 2.3410738064223448e-07, + "loss": 0.0785, + "num_input_tokens_seen": 209929712, + "step": 172535 + }, + { + "epoch": 19.2159483238668, + "grad_norm": 0.48875153064727783, + "learning_rate": 2.3377576141970503e-07, + "loss": 0.0124, + "num_input_tokens_seen": 209935856, + "step": 172540 + }, + { + "epoch": 19.216505178750417, + "grad_norm": 0.0001240470155607909, + "learning_rate": 2.3344437613400384e-07, + "loss": 0.0232, + "num_input_tokens_seen": 209941936, + "step": 172545 + }, + { + "epoch": 19.217062033634036, + "grad_norm": 0.621955394744873, + "learning_rate": 2.3311322478826447e-07, + "loss": 0.0355, + "num_input_tokens_seen": 209948048, + "step": 172550 + }, + { + "epoch": 19.217618888517652, + "grad_norm": 0.00013929464330431074, + "learning_rate": 2.3278230738561225e-07, + "loss": 0.0025, + "num_input_tokens_seen": 209954160, + "step": 172555 + }, + { + "epoch": 19.218175743401268, + "grad_norm": 0.1667054444551468, + "learning_rate": 2.324516239291752e-07, + "loss": 0.0021, + "num_input_tokens_seen": 209960144, + "step": 172560 + }, + { + "epoch": 19.218732598284888, + "grad_norm": 0.004456885624676943, + "learning_rate": 2.3212117442207582e-07, + "loss": 0.1682, + "num_input_tokens_seen": 209966288, + "step": 172565 + }, + { + "epoch": 19.219289453168503, + "grad_norm": 0.0617869570851326, + "learning_rate": 2.3179095886743386e-07, + "loss": 0.0153, + "num_input_tokens_seen": 209972656, + "step": 172570 + }, + { + "epoch": 19.219846308052123, + "grad_norm": 0.3642893433570862, + "learning_rate": 2.3146097726837178e-07, + "loss": 0.1621, + "num_input_tokens_seen": 209978768, + "step": 172575 + }, + { + "epoch": 19.22040316293574, + "grad_norm": 0.0022116811014711857, + "learning_rate": 2.311312296280066e-07, + "loss": 0.006, + "num_input_tokens_seen": 209985008, + "step": 172580 + }, + { + "epoch": 19.220960017819355, + "grad_norm": 0.14259473979473114, + "learning_rate": 2.3080171594944966e-07, + "loss": 0.01, + "num_input_tokens_seen": 209990736, + "step": 172585 + }, + { + "epoch": 19.221516872702974, + "grad_norm": 0.7492812275886536, + "learning_rate": 2.3047243623581516e-07, + "loss": 0.022, + "num_input_tokens_seen": 209996752, + "step": 172590 + }, + { + "epoch": 19.22207372758659, + "grad_norm": 0.18665538728237152, + "learning_rate": 2.301433904902145e-07, + "loss": 0.0117, + "num_input_tokens_seen": 210002864, + "step": 172595 + }, + { + "epoch": 19.22263058247021, + "grad_norm": 0.4565773606300354, + "learning_rate": 2.298145787157535e-07, + "loss": 0.0089, + "num_input_tokens_seen": 210008816, + "step": 172600 + }, + { + "epoch": 19.223187437353825, + "grad_norm": 0.004886697046458721, + "learning_rate": 2.2948600091553808e-07, + "loss": 0.0004, + "num_input_tokens_seen": 210014992, + "step": 172605 + }, + { + "epoch": 19.22374429223744, + "grad_norm": 0.0003306918079033494, + "learning_rate": 2.2915765709267678e-07, + "loss": 0.0003, + "num_input_tokens_seen": 210020944, + "step": 172610 + }, + { + "epoch": 19.22430114712106, + "grad_norm": 0.0018277409253641963, + "learning_rate": 2.288295472502644e-07, + "loss": 0.0257, + "num_input_tokens_seen": 210027056, + "step": 172615 + }, + { + "epoch": 19.224858002004677, + "grad_norm": 0.8667604327201843, + "learning_rate": 2.2850167139140677e-07, + "loss": 0.0383, + "num_input_tokens_seen": 210033136, + "step": 172620 + }, + { + "epoch": 19.225414856888296, + "grad_norm": 0.030241191387176514, + "learning_rate": 2.281740295191931e-07, + "loss": 0.1464, + "num_input_tokens_seen": 210038480, + "step": 172625 + }, + { + "epoch": 19.225971711771912, + "grad_norm": 0.029287783429026604, + "learning_rate": 2.2784662163672644e-07, + "loss": 0.1359, + "num_input_tokens_seen": 210044912, + "step": 172630 + }, + { + "epoch": 19.226528566655528, + "grad_norm": 0.0012164224172011018, + "learning_rate": 2.2751944774709322e-07, + "loss": 0.0033, + "num_input_tokens_seen": 210051472, + "step": 172635 + }, + { + "epoch": 19.227085421539147, + "grad_norm": 1.2175661325454712, + "learning_rate": 2.2719250785338543e-07, + "loss": 0.0752, + "num_input_tokens_seen": 210057232, + "step": 172640 + }, + { + "epoch": 19.227642276422763, + "grad_norm": 0.015596064738929272, + "learning_rate": 2.26865801958695e-07, + "loss": 0.0374, + "num_input_tokens_seen": 210063440, + "step": 172645 + }, + { + "epoch": 19.228199131306383, + "grad_norm": 0.41314876079559326, + "learning_rate": 2.2653933006610284e-07, + "loss": 0.0815, + "num_input_tokens_seen": 210069296, + "step": 172650 + }, + { + "epoch": 19.22875598619, + "grad_norm": 0.05321803689002991, + "learning_rate": 2.2621309217869534e-07, + "loss": 0.0389, + "num_input_tokens_seen": 210075504, + "step": 172655 + }, + { + "epoch": 19.229312841073614, + "grad_norm": 0.08388432115316391, + "learning_rate": 2.258870882995534e-07, + "loss": 0.0124, + "num_input_tokens_seen": 210081648, + "step": 172660 + }, + { + "epoch": 19.229869695957234, + "grad_norm": 0.8686538934707642, + "learning_rate": 2.2556131843175787e-07, + "loss": 0.0965, + "num_input_tokens_seen": 210087408, + "step": 172665 + }, + { + "epoch": 19.23042655084085, + "grad_norm": 1.6890417337417603, + "learning_rate": 2.2523578257838406e-07, + "loss": 0.1266, + "num_input_tokens_seen": 210093328, + "step": 172670 + }, + { + "epoch": 19.23098340572447, + "grad_norm": 1.7145214080810547, + "learning_rate": 2.2491048074250732e-07, + "loss": 0.0614, + "num_input_tokens_seen": 210099632, + "step": 172675 + }, + { + "epoch": 19.231540260608085, + "grad_norm": 0.01879783906042576, + "learning_rate": 2.2458541292720015e-07, + "loss": 0.0003, + "num_input_tokens_seen": 210105616, + "step": 172680 + }, + { + "epoch": 19.232097115491705, + "grad_norm": 0.29667428135871887, + "learning_rate": 2.2426057913553235e-07, + "loss": 0.0078, + "num_input_tokens_seen": 210111952, + "step": 172685 + }, + { + "epoch": 19.23265397037532, + "grad_norm": 0.1565333604812622, + "learning_rate": 2.2393597937057642e-07, + "loss": 0.0239, + "num_input_tokens_seen": 210117584, + "step": 172690 + }, + { + "epoch": 19.233210825258936, + "grad_norm": 1.1257423162460327, + "learning_rate": 2.2361161363539385e-07, + "loss": 0.0277, + "num_input_tokens_seen": 210123696, + "step": 172695 + }, + { + "epoch": 19.233767680142556, + "grad_norm": 0.00699350330978632, + "learning_rate": 2.2328748193304883e-07, + "loss": 0.036, + "num_input_tokens_seen": 210129168, + "step": 172700 + }, + { + "epoch": 19.234324535026172, + "grad_norm": 2.6108992099761963, + "learning_rate": 2.2296358426660556e-07, + "loss": 0.1107, + "num_input_tokens_seen": 210135312, + "step": 172705 + }, + { + "epoch": 19.23488138990979, + "grad_norm": 0.06871972978115082, + "learning_rate": 2.2263992063912277e-07, + "loss": 0.0527, + "num_input_tokens_seen": 210141232, + "step": 172710 + }, + { + "epoch": 19.235438244793407, + "grad_norm": 0.002536361338570714, + "learning_rate": 2.2231649105365625e-07, + "loss": 0.0418, + "num_input_tokens_seen": 210147376, + "step": 172715 + }, + { + "epoch": 19.235995099677023, + "grad_norm": 0.003409485798329115, + "learning_rate": 2.2199329551326198e-07, + "loss": 0.0052, + "num_input_tokens_seen": 210153680, + "step": 172720 + }, + { + "epoch": 19.236551954560642, + "grad_norm": 0.00012084357149433345, + "learning_rate": 2.2167033402099302e-07, + "loss": 0.0345, + "num_input_tokens_seen": 210159920, + "step": 172725 + }, + { + "epoch": 19.23710880944426, + "grad_norm": 0.9224587082862854, + "learning_rate": 2.2134760657989972e-07, + "loss": 0.0197, + "num_input_tokens_seen": 210166352, + "step": 172730 + }, + { + "epoch": 19.237665664327878, + "grad_norm": 0.00022722911671735346, + "learning_rate": 2.2102511319303242e-07, + "loss": 0.0668, + "num_input_tokens_seen": 210172400, + "step": 172735 + }, + { + "epoch": 19.238222519211494, + "grad_norm": 0.051620081067085266, + "learning_rate": 2.207028538634359e-07, + "loss": 0.0374, + "num_input_tokens_seen": 210178416, + "step": 172740 + }, + { + "epoch": 19.23877937409511, + "grad_norm": 0.10639438033103943, + "learning_rate": 2.2038082859414944e-07, + "loss": 0.0588, + "num_input_tokens_seen": 210184656, + "step": 172745 + }, + { + "epoch": 19.23933622897873, + "grad_norm": 0.027351517230272293, + "learning_rate": 2.200590373882233e-07, + "loss": 0.1181, + "num_input_tokens_seen": 210190800, + "step": 172750 + }, + { + "epoch": 19.239893083862345, + "grad_norm": 0.003403623588383198, + "learning_rate": 2.1973748024868845e-07, + "loss": 0.1171, + "num_input_tokens_seen": 210196816, + "step": 172755 + }, + { + "epoch": 19.240449938745964, + "grad_norm": 0.01104552298784256, + "learning_rate": 2.1941615717858964e-07, + "loss": 0.0176, + "num_input_tokens_seen": 210202544, + "step": 172760 + }, + { + "epoch": 19.24100679362958, + "grad_norm": 0.2773706614971161, + "learning_rate": 2.190950681809606e-07, + "loss": 0.0233, + "num_input_tokens_seen": 210208752, + "step": 172765 + }, + { + "epoch": 19.241563648513196, + "grad_norm": 0.00035680303699336946, + "learning_rate": 2.1877421325883217e-07, + "loss": 0.0012, + "num_input_tokens_seen": 210214800, + "step": 172770 + }, + { + "epoch": 19.242120503396816, + "grad_norm": 0.8371683359146118, + "learning_rate": 2.1845359241523533e-07, + "loss": 0.0309, + "num_input_tokens_seen": 210220976, + "step": 172775 + }, + { + "epoch": 19.24267735828043, + "grad_norm": 1.7615323066711426, + "learning_rate": 2.181332056531954e-07, + "loss": 0.057, + "num_input_tokens_seen": 210227184, + "step": 172780 + }, + { + "epoch": 19.24323421316405, + "grad_norm": 0.11206009238958359, + "learning_rate": 2.1781305297574606e-07, + "loss": 0.1187, + "num_input_tokens_seen": 210233456, + "step": 172785 + }, + { + "epoch": 19.243791068047667, + "grad_norm": 0.13869354128837585, + "learning_rate": 2.1749313438590714e-07, + "loss": 0.0042, + "num_input_tokens_seen": 210239504, + "step": 172790 + }, + { + "epoch": 19.244347922931283, + "grad_norm": 1.4163198471069336, + "learning_rate": 2.171734498867012e-07, + "loss": 0.0501, + "num_input_tokens_seen": 210245520, + "step": 172795 + }, + { + "epoch": 19.244904777814902, + "grad_norm": 2.4384493827819824, + "learning_rate": 2.1685399948114527e-07, + "loss": 0.1081, + "num_input_tokens_seen": 210251856, + "step": 172800 + }, + { + "epoch": 19.245461632698518, + "grad_norm": 0.00016067776596173644, + "learning_rate": 2.1653478317226194e-07, + "loss": 0.0242, + "num_input_tokens_seen": 210258384, + "step": 172805 + }, + { + "epoch": 19.246018487582138, + "grad_norm": 0.023363426327705383, + "learning_rate": 2.1621580096306272e-07, + "loss": 0.0042, + "num_input_tokens_seen": 210264496, + "step": 172810 + }, + { + "epoch": 19.246575342465754, + "grad_norm": 0.11197542399168015, + "learning_rate": 2.158970528565618e-07, + "loss": 0.0263, + "num_input_tokens_seen": 210270096, + "step": 172815 + }, + { + "epoch": 19.24713219734937, + "grad_norm": 0.001737029291689396, + "learning_rate": 2.1557853885577072e-07, + "loss": 0.0027, + "num_input_tokens_seen": 210276208, + "step": 172820 + }, + { + "epoch": 19.24768905223299, + "grad_norm": 1.5944294929504395, + "learning_rate": 2.152602589636954e-07, + "loss": 0.1897, + "num_input_tokens_seen": 210282352, + "step": 172825 + }, + { + "epoch": 19.248245907116605, + "grad_norm": 0.016807524487376213, + "learning_rate": 2.1494221318334451e-07, + "loss": 0.1263, + "num_input_tokens_seen": 210288464, + "step": 172830 + }, + { + "epoch": 19.248802762000224, + "grad_norm": 0.9343055486679077, + "learning_rate": 2.14624401517724e-07, + "loss": 0.0392, + "num_input_tokens_seen": 210294576, + "step": 172835 + }, + { + "epoch": 19.24935961688384, + "grad_norm": 0.013836231082677841, + "learning_rate": 2.1430682396983148e-07, + "loss": 0.0055, + "num_input_tokens_seen": 210301008, + "step": 172840 + }, + { + "epoch": 19.249916471767456, + "grad_norm": 0.029856083914637566, + "learning_rate": 2.139894805426701e-07, + "loss": 0.0742, + "num_input_tokens_seen": 210307056, + "step": 172845 + }, + { + "epoch": 19.250473326651075, + "grad_norm": 1.508406162261963, + "learning_rate": 2.1367237123923467e-07, + "loss": 0.0645, + "num_input_tokens_seen": 210313136, + "step": 172850 + }, + { + "epoch": 19.25103018153469, + "grad_norm": 0.009628077037632465, + "learning_rate": 2.133554960625228e-07, + "loss": 0.0026, + "num_input_tokens_seen": 210318832, + "step": 172855 + }, + { + "epoch": 19.25158703641831, + "grad_norm": 0.35100117325782776, + "learning_rate": 2.1303885501552933e-07, + "loss": 0.0212, + "num_input_tokens_seen": 210324816, + "step": 172860 + }, + { + "epoch": 19.252143891301927, + "grad_norm": 0.2037363499403, + "learning_rate": 2.1272244810124077e-07, + "loss": 0.0481, + "num_input_tokens_seen": 210330864, + "step": 172865 + }, + { + "epoch": 19.252700746185543, + "grad_norm": 0.47708144783973694, + "learning_rate": 2.124062753226491e-07, + "loss": 0.0403, + "num_input_tokens_seen": 210337072, + "step": 172870 + }, + { + "epoch": 19.253257601069162, + "grad_norm": 0.10744422674179077, + "learning_rate": 2.1209033668273814e-07, + "loss": 0.0321, + "num_input_tokens_seen": 210343408, + "step": 172875 + }, + { + "epoch": 19.253814455952778, + "grad_norm": 0.0470392070710659, + "learning_rate": 2.1177463218449433e-07, + "loss": 0.0062, + "num_input_tokens_seen": 210349584, + "step": 172880 + }, + { + "epoch": 19.254371310836397, + "grad_norm": 0.47990190982818604, + "learning_rate": 2.1145916183090143e-07, + "loss": 0.0119, + "num_input_tokens_seen": 210355728, + "step": 172885 + }, + { + "epoch": 19.254928165720013, + "grad_norm": 0.07142123579978943, + "learning_rate": 2.11143925624932e-07, + "loss": 0.1937, + "num_input_tokens_seen": 210361744, + "step": 172890 + }, + { + "epoch": 19.25548502060363, + "grad_norm": 0.002190479077398777, + "learning_rate": 2.1082892356957261e-07, + "loss": 0.0462, + "num_input_tokens_seen": 210367952, + "step": 172895 + }, + { + "epoch": 19.25604187548725, + "grad_norm": 0.0023999817203730345, + "learning_rate": 2.1051415566779308e-07, + "loss": 0.0356, + "num_input_tokens_seen": 210374192, + "step": 172900 + }, + { + "epoch": 19.256598730370865, + "grad_norm": 0.0003311213804408908, + "learning_rate": 2.1019962192256882e-07, + "loss": 0.0036, + "num_input_tokens_seen": 210380144, + "step": 172905 + }, + { + "epoch": 19.257155585254484, + "grad_norm": 0.8933529257774353, + "learning_rate": 2.0988532233686964e-07, + "loss": 0.0147, + "num_input_tokens_seen": 210386736, + "step": 172910 + }, + { + "epoch": 19.2577124401381, + "grad_norm": 0.06200946494936943, + "learning_rate": 2.095712569136682e-07, + "loss": 0.0155, + "num_input_tokens_seen": 210392912, + "step": 172915 + }, + { + "epoch": 19.258269295021716, + "grad_norm": 0.06334187090396881, + "learning_rate": 2.0925742565592322e-07, + "loss": 0.0572, + "num_input_tokens_seen": 210398768, + "step": 172920 + }, + { + "epoch": 19.258826149905335, + "grad_norm": 0.394867867231369, + "learning_rate": 2.0894382856660732e-07, + "loss": 0.028, + "num_input_tokens_seen": 210404912, + "step": 172925 + }, + { + "epoch": 19.25938300478895, + "grad_norm": 0.09345347434282303, + "learning_rate": 2.0863046564867927e-07, + "loss": 0.0443, + "num_input_tokens_seen": 210411184, + "step": 172930 + }, + { + "epoch": 19.25993985967257, + "grad_norm": 0.00016238637908827513, + "learning_rate": 2.083173369050978e-07, + "loss": 0.0037, + "num_input_tokens_seen": 210417392, + "step": 172935 + }, + { + "epoch": 19.260496714556187, + "grad_norm": 1.9668611288070679, + "learning_rate": 2.0800444233882165e-07, + "loss": 0.0735, + "num_input_tokens_seen": 210423408, + "step": 172940 + }, + { + "epoch": 19.261053569439802, + "grad_norm": 0.015617563389241695, + "learning_rate": 2.076917819528068e-07, + "loss": 0.0048, + "num_input_tokens_seen": 210429456, + "step": 172945 + }, + { + "epoch": 19.261610424323422, + "grad_norm": 0.1647137701511383, + "learning_rate": 2.0737935575000645e-07, + "loss": 0.0048, + "num_input_tokens_seen": 210435568, + "step": 172950 + }, + { + "epoch": 19.262167279207038, + "grad_norm": 1.3155291080474854, + "learning_rate": 2.0706716373337377e-07, + "loss": 0.0639, + "num_input_tokens_seen": 210441744, + "step": 172955 + }, + { + "epoch": 19.262724134090657, + "grad_norm": 0.049676623195409775, + "learning_rate": 2.0675520590585084e-07, + "loss": 0.0012, + "num_input_tokens_seen": 210447792, + "step": 172960 + }, + { + "epoch": 19.263280988974273, + "grad_norm": 0.00011954798537772149, + "learning_rate": 2.064434822703909e-07, + "loss": 0.0715, + "num_input_tokens_seen": 210453872, + "step": 172965 + }, + { + "epoch": 19.26383784385789, + "grad_norm": 0.04907538369297981, + "learning_rate": 2.0613199282993877e-07, + "loss": 0.0276, + "num_input_tokens_seen": 210460080, + "step": 172970 + }, + { + "epoch": 19.26439469874151, + "grad_norm": 0.00024668173864483833, + "learning_rate": 2.0582073758743103e-07, + "loss": 0.0058, + "num_input_tokens_seen": 210466640, + "step": 172975 + }, + { + "epoch": 19.264951553625124, + "grad_norm": 0.6985459327697754, + "learning_rate": 2.055097165458153e-07, + "loss": 0.0315, + "num_input_tokens_seen": 210472528, + "step": 172980 + }, + { + "epoch": 19.265508408508744, + "grad_norm": 0.03094552457332611, + "learning_rate": 2.0519892970802258e-07, + "loss": 0.0335, + "num_input_tokens_seen": 210478704, + "step": 172985 + }, + { + "epoch": 19.26606526339236, + "grad_norm": 0.00035850235144607723, + "learning_rate": 2.0488837707698938e-07, + "loss": 0.0108, + "num_input_tokens_seen": 210484656, + "step": 172990 + }, + { + "epoch": 19.266622118275976, + "grad_norm": 0.07588788866996765, + "learning_rate": 2.0457805865565506e-07, + "loss": 0.0154, + "num_input_tokens_seen": 210490896, + "step": 172995 + }, + { + "epoch": 19.267178973159595, + "grad_norm": 0.13373272120952606, + "learning_rate": 2.04267974446945e-07, + "loss": 0.0043, + "num_input_tokens_seen": 210497072, + "step": 173000 + }, + { + "epoch": 19.26773582804321, + "grad_norm": 0.11666528880596161, + "learning_rate": 2.0395812445379026e-07, + "loss": 0.0042, + "num_input_tokens_seen": 210503088, + "step": 173005 + }, + { + "epoch": 19.26829268292683, + "grad_norm": 0.011537919752299786, + "learning_rate": 2.0364850867911622e-07, + "loss": 0.1053, + "num_input_tokens_seen": 210509296, + "step": 173010 + }, + { + "epoch": 19.268849537810446, + "grad_norm": 0.001241635880433023, + "learning_rate": 2.0333912712584835e-07, + "loss": 0.0395, + "num_input_tokens_seen": 210515600, + "step": 173015 + }, + { + "epoch": 19.269406392694062, + "grad_norm": 0.0003106976510025561, + "learning_rate": 2.0302997979690929e-07, + "loss": 0.0872, + "num_input_tokens_seen": 210521776, + "step": 173020 + }, + { + "epoch": 19.26996324757768, + "grad_norm": 0.0039694360457360744, + "learning_rate": 2.0272106669522173e-07, + "loss": 0.005, + "num_input_tokens_seen": 210527856, + "step": 173025 + }, + { + "epoch": 19.270520102461298, + "grad_norm": 0.29799529910087585, + "learning_rate": 2.0241238782369997e-07, + "loss": 0.0072, + "num_input_tokens_seen": 210534256, + "step": 173030 + }, + { + "epoch": 19.271076957344917, + "grad_norm": 0.0011338507756590843, + "learning_rate": 2.021039431852584e-07, + "loss": 0.0923, + "num_input_tokens_seen": 210540336, + "step": 173035 + }, + { + "epoch": 19.271633812228533, + "grad_norm": 0.019210437312722206, + "learning_rate": 2.0179573278281406e-07, + "loss": 0.0952, + "num_input_tokens_seen": 210546768, + "step": 173040 + }, + { + "epoch": 19.27219066711215, + "grad_norm": 0.01490616425871849, + "learning_rate": 2.0148775661927855e-07, + "loss": 0.0134, + "num_input_tokens_seen": 210552784, + "step": 173045 + }, + { + "epoch": 19.27274752199577, + "grad_norm": 0.21909181773662567, + "learning_rate": 2.0118001469755787e-07, + "loss": 0.0357, + "num_input_tokens_seen": 210558736, + "step": 173050 + }, + { + "epoch": 19.273304376879384, + "grad_norm": 0.40245872735977173, + "learning_rate": 2.008725070205608e-07, + "loss": 0.0172, + "num_input_tokens_seen": 210564112, + "step": 173055 + }, + { + "epoch": 19.273861231763004, + "grad_norm": 0.1523568481206894, + "learning_rate": 2.005652335911906e-07, + "loss": 0.0498, + "num_input_tokens_seen": 210569808, + "step": 173060 + }, + { + "epoch": 19.27441808664662, + "grad_norm": 0.5580509901046753, + "learning_rate": 2.002581944123505e-07, + "loss": 0.0893, + "num_input_tokens_seen": 210575312, + "step": 173065 + }, + { + "epoch": 19.27497494153024, + "grad_norm": 0.00011322419595671818, + "learning_rate": 1.9995138948694092e-07, + "loss": 0.0013, + "num_input_tokens_seen": 210581456, + "step": 173070 + }, + { + "epoch": 19.275531796413855, + "grad_norm": 1.0199226140975952, + "learning_rate": 1.9964481881786512e-07, + "loss": 0.0734, + "num_input_tokens_seen": 210587600, + "step": 173075 + }, + { + "epoch": 19.27608865129747, + "grad_norm": 0.02159092016518116, + "learning_rate": 1.9933848240800689e-07, + "loss": 0.1265, + "num_input_tokens_seen": 210593488, + "step": 173080 + }, + { + "epoch": 19.27664550618109, + "grad_norm": 0.08089014887809753, + "learning_rate": 1.990323802602695e-07, + "loss": 0.1031, + "num_input_tokens_seen": 210599568, + "step": 173085 + }, + { + "epoch": 19.277202361064706, + "grad_norm": 2.600672483444214, + "learning_rate": 1.9872651237754226e-07, + "loss": 0.127, + "num_input_tokens_seen": 210605744, + "step": 173090 + }, + { + "epoch": 19.277759215948326, + "grad_norm": 0.17901313304901123, + "learning_rate": 1.9842087876271175e-07, + "loss": 0.012, + "num_input_tokens_seen": 210611856, + "step": 173095 + }, + { + "epoch": 19.27831607083194, + "grad_norm": 0.32342395186424255, + "learning_rate": 1.9811547941867014e-07, + "loss": 0.0105, + "num_input_tokens_seen": 210618320, + "step": 173100 + }, + { + "epoch": 19.278872925715557, + "grad_norm": 0.14541548490524292, + "learning_rate": 1.9781031434829566e-07, + "loss": 0.0838, + "num_input_tokens_seen": 210624368, + "step": 173105 + }, + { + "epoch": 19.279429780599177, + "grad_norm": 0.0006585364462807775, + "learning_rate": 1.9750538355447212e-07, + "loss": 0.0901, + "num_input_tokens_seen": 210630096, + "step": 173110 + }, + { + "epoch": 19.279986635482793, + "grad_norm": 0.1414911448955536, + "learning_rate": 1.972006870400861e-07, + "loss": 0.0554, + "num_input_tokens_seen": 210636240, + "step": 173115 + }, + { + "epoch": 19.280543490366412, + "grad_norm": 0.015773097053170204, + "learning_rate": 1.9689622480801028e-07, + "loss": 0.0768, + "num_input_tokens_seen": 210641680, + "step": 173120 + }, + { + "epoch": 19.281100345250028, + "grad_norm": 0.33705559372901917, + "learning_rate": 1.9659199686112017e-07, + "loss": 0.0567, + "num_input_tokens_seen": 210647856, + "step": 173125 + }, + { + "epoch": 19.281657200133644, + "grad_norm": 1.8879910707473755, + "learning_rate": 1.9628800320229124e-07, + "loss": 0.1646, + "num_input_tokens_seen": 210654192, + "step": 173130 + }, + { + "epoch": 19.282214055017263, + "grad_norm": 0.941717267036438, + "learning_rate": 1.959842438343934e-07, + "loss": 0.0236, + "num_input_tokens_seen": 210660144, + "step": 173135 + }, + { + "epoch": 19.28277090990088, + "grad_norm": 0.450211763381958, + "learning_rate": 1.956807187602966e-07, + "loss": 0.0666, + "num_input_tokens_seen": 210666096, + "step": 173140 + }, + { + "epoch": 19.2833277647845, + "grad_norm": 0.00942733883857727, + "learning_rate": 1.953774279828735e-07, + "loss": 0.0099, + "num_input_tokens_seen": 210672336, + "step": 173145 + }, + { + "epoch": 19.283884619668115, + "grad_norm": 0.1454337239265442, + "learning_rate": 1.9507437150497742e-07, + "loss": 0.0708, + "num_input_tokens_seen": 210678480, + "step": 173150 + }, + { + "epoch": 19.28444147455173, + "grad_norm": 0.017400024458765984, + "learning_rate": 1.9477154932948104e-07, + "loss": 0.012, + "num_input_tokens_seen": 210684528, + "step": 173155 + }, + { + "epoch": 19.28499832943535, + "grad_norm": 0.4155585467815399, + "learning_rate": 1.9446896145923766e-07, + "loss": 0.1023, + "num_input_tokens_seen": 210690416, + "step": 173160 + }, + { + "epoch": 19.285555184318966, + "grad_norm": 0.8768965601921082, + "learning_rate": 1.941666078971116e-07, + "loss": 0.1139, + "num_input_tokens_seen": 210696336, + "step": 173165 + }, + { + "epoch": 19.286112039202585, + "grad_norm": 0.0020008038263767958, + "learning_rate": 1.9386448864595896e-07, + "loss": 0.0156, + "num_input_tokens_seen": 210702224, + "step": 173170 + }, + { + "epoch": 19.2866688940862, + "grad_norm": 0.22549225389957428, + "learning_rate": 1.9356260370862468e-07, + "loss": 0.0421, + "num_input_tokens_seen": 210708528, + "step": 173175 + }, + { + "epoch": 19.287225748969817, + "grad_norm": 1.3592071533203125, + "learning_rate": 1.9326095308797031e-07, + "loss": 0.0253, + "num_input_tokens_seen": 210714544, + "step": 173180 + }, + { + "epoch": 19.287782603853437, + "grad_norm": 0.018021980300545692, + "learning_rate": 1.929595367868381e-07, + "loss": 0.0024, + "num_input_tokens_seen": 210720624, + "step": 173185 + }, + { + "epoch": 19.288339458737052, + "grad_norm": 1.1489918231964111, + "learning_rate": 1.9265835480807848e-07, + "loss": 0.0676, + "num_input_tokens_seen": 210726704, + "step": 173190 + }, + { + "epoch": 19.288896313620672, + "grad_norm": 0.7564038038253784, + "learning_rate": 1.9235740715453642e-07, + "loss": 0.022, + "num_input_tokens_seen": 210732720, + "step": 173195 + }, + { + "epoch": 19.289453168504288, + "grad_norm": 1.9460426568984985, + "learning_rate": 1.9205669382905688e-07, + "loss": 0.1612, + "num_input_tokens_seen": 210738832, + "step": 173200 + }, + { + "epoch": 19.290010023387904, + "grad_norm": 1.5991039276123047, + "learning_rate": 1.917562148344737e-07, + "loss": 0.0634, + "num_input_tokens_seen": 210745040, + "step": 173205 + }, + { + "epoch": 19.290566878271523, + "grad_norm": 0.9320402145385742, + "learning_rate": 1.9145597017363182e-07, + "loss": 0.0597, + "num_input_tokens_seen": 210751344, + "step": 173210 + }, + { + "epoch": 19.29112373315514, + "grad_norm": 0.038745179772377014, + "learning_rate": 1.911559598493623e-07, + "loss": 0.044, + "num_input_tokens_seen": 210757616, + "step": 173215 + }, + { + "epoch": 19.29168058803876, + "grad_norm": 0.0002576776023488492, + "learning_rate": 1.9085618386450454e-07, + "loss": 0.0305, + "num_input_tokens_seen": 210763760, + "step": 173220 + }, + { + "epoch": 19.292237442922374, + "grad_norm": 0.05079781636595726, + "learning_rate": 1.9055664222188407e-07, + "loss": 0.011, + "num_input_tokens_seen": 210769648, + "step": 173225 + }, + { + "epoch": 19.29279429780599, + "grad_norm": 0.03908469155430794, + "learning_rate": 1.9025733492433474e-07, + "loss": 0.1112, + "num_input_tokens_seen": 210775440, + "step": 173230 + }, + { + "epoch": 19.29335115268961, + "grad_norm": 0.009029621258378029, + "learning_rate": 1.8995826197467926e-07, + "loss": 0.0435, + "num_input_tokens_seen": 210781936, + "step": 173235 + }, + { + "epoch": 19.293908007573226, + "grad_norm": 0.006357393227517605, + "learning_rate": 1.896594233757487e-07, + "loss": 0.0738, + "num_input_tokens_seen": 210788560, + "step": 173240 + }, + { + "epoch": 19.294464862456845, + "grad_norm": 0.004390592686831951, + "learning_rate": 1.893608191303603e-07, + "loss": 0.045, + "num_input_tokens_seen": 210794736, + "step": 173245 + }, + { + "epoch": 19.29502171734046, + "grad_norm": 0.024266179651021957, + "learning_rate": 1.8906244924133953e-07, + "loss": 0.0352, + "num_input_tokens_seen": 210800656, + "step": 173250 + }, + { + "epoch": 19.295578572224077, + "grad_norm": 1.5057553052902222, + "learning_rate": 1.8876431371149805e-07, + "loss": 0.0295, + "num_input_tokens_seen": 210807312, + "step": 173255 + }, + { + "epoch": 19.296135427107696, + "grad_norm": 0.029859602451324463, + "learning_rate": 1.884664125436586e-07, + "loss": 0.0149, + "num_input_tokens_seen": 210813456, + "step": 173260 + }, + { + "epoch": 19.296692281991312, + "grad_norm": 0.04425117373466492, + "learning_rate": 1.881687457406356e-07, + "loss": 0.0041, + "num_input_tokens_seen": 210819600, + "step": 173265 + }, + { + "epoch": 19.29724913687493, + "grad_norm": 0.01530058030039072, + "learning_rate": 1.8787131330523235e-07, + "loss": 0.0043, + "num_input_tokens_seen": 210825968, + "step": 173270 + }, + { + "epoch": 19.297805991758548, + "grad_norm": 0.06561684608459473, + "learning_rate": 1.8757411524026603e-07, + "loss": 0.0131, + "num_input_tokens_seen": 210832304, + "step": 173275 + }, + { + "epoch": 19.298362846642164, + "grad_norm": 0.11887781322002411, + "learning_rate": 1.8727715154854275e-07, + "loss": 0.0162, + "num_input_tokens_seen": 210838320, + "step": 173280 + }, + { + "epoch": 19.298919701525783, + "grad_norm": 0.05387800559401512, + "learning_rate": 1.8698042223286306e-07, + "loss": 0.0735, + "num_input_tokens_seen": 210843792, + "step": 173285 + }, + { + "epoch": 19.2994765564094, + "grad_norm": 1.1664665937423706, + "learning_rate": 1.8668392729603855e-07, + "loss": 0.0799, + "num_input_tokens_seen": 210849328, + "step": 173290 + }, + { + "epoch": 19.30003341129302, + "grad_norm": 0.0026901729870587587, + "learning_rate": 1.863876667408587e-07, + "loss": 0.0236, + "num_input_tokens_seen": 210855664, + "step": 173295 + }, + { + "epoch": 19.300590266176634, + "grad_norm": 0.14075620472431183, + "learning_rate": 1.8609164057013239e-07, + "loss": 0.0472, + "num_input_tokens_seen": 210861744, + "step": 173300 + }, + { + "epoch": 19.30114712106025, + "grad_norm": 0.08033876866102219, + "learning_rate": 1.8579584878664623e-07, + "loss": 0.1159, + "num_input_tokens_seen": 210867792, + "step": 173305 + }, + { + "epoch": 19.30170397594387, + "grad_norm": 1.9830100536346436, + "learning_rate": 1.8550029139320358e-07, + "loss": 0.1796, + "num_input_tokens_seen": 210873840, + "step": 173310 + }, + { + "epoch": 19.302260830827485, + "grad_norm": 0.0007858345634303987, + "learning_rate": 1.8520496839258827e-07, + "loss": 0.0095, + "num_input_tokens_seen": 210880016, + "step": 173315 + }, + { + "epoch": 19.302817685711105, + "grad_norm": 0.16348396241664886, + "learning_rate": 1.8490987978759534e-07, + "loss": 0.0339, + "num_input_tokens_seen": 210886544, + "step": 173320 + }, + { + "epoch": 19.30337454059472, + "grad_norm": 0.09644827246665955, + "learning_rate": 1.8461502558100862e-07, + "loss": 0.0078, + "num_input_tokens_seen": 210892624, + "step": 173325 + }, + { + "epoch": 19.303931395478337, + "grad_norm": 0.20340237021446228, + "learning_rate": 1.843204057756176e-07, + "loss": 0.0063, + "num_input_tokens_seen": 210898704, + "step": 173330 + }, + { + "epoch": 19.304488250361956, + "grad_norm": 0.010004137642681599, + "learning_rate": 1.8402602037420058e-07, + "loss": 0.0107, + "num_input_tokens_seen": 210904496, + "step": 173335 + }, + { + "epoch": 19.305045105245572, + "grad_norm": 0.3614256978034973, + "learning_rate": 1.8373186937954146e-07, + "loss": 0.0303, + "num_input_tokens_seen": 210909936, + "step": 173340 + }, + { + "epoch": 19.30560196012919, + "grad_norm": 0.17185448110103607, + "learning_rate": 1.83437952794413e-07, + "loss": 0.1662, + "num_input_tokens_seen": 210915824, + "step": 173345 + }, + { + "epoch": 19.306158815012807, + "grad_norm": 0.21365317702293396, + "learning_rate": 1.8314427062159911e-07, + "loss": 0.0368, + "num_input_tokens_seen": 210921840, + "step": 173350 + }, + { + "epoch": 19.306715669896423, + "grad_norm": 0.00012909204815514386, + "learning_rate": 1.8285082286386978e-07, + "loss": 0.0398, + "num_input_tokens_seen": 210928208, + "step": 173355 + }, + { + "epoch": 19.307272524780043, + "grad_norm": 0.00015921029262244701, + "learning_rate": 1.8255760952399782e-07, + "loss": 0.0308, + "num_input_tokens_seen": 210934064, + "step": 173360 + }, + { + "epoch": 19.30782937966366, + "grad_norm": 0.30445411801338196, + "learning_rate": 1.8226463060475318e-07, + "loss": 0.0101, + "num_input_tokens_seen": 210940272, + "step": 173365 + }, + { + "epoch": 19.308386234547278, + "grad_norm": 0.010272563435137272, + "learning_rate": 1.8197188610890315e-07, + "loss": 0.0543, + "num_input_tokens_seen": 210946480, + "step": 173370 + }, + { + "epoch": 19.308943089430894, + "grad_norm": 0.12226156890392303, + "learning_rate": 1.8167937603920938e-07, + "loss": 0.0215, + "num_input_tokens_seen": 210952848, + "step": 173375 + }, + { + "epoch": 19.30949994431451, + "grad_norm": 0.0006982243503443897, + "learning_rate": 1.8138710039844186e-07, + "loss": 0.0922, + "num_input_tokens_seen": 210959056, + "step": 173380 + }, + { + "epoch": 19.31005679919813, + "grad_norm": 1.6626611948013306, + "learning_rate": 1.8109505918935675e-07, + "loss": 0.1697, + "num_input_tokens_seen": 210964848, + "step": 173385 + }, + { + "epoch": 19.310613654081745, + "grad_norm": 0.040230363607406616, + "learning_rate": 1.8080325241471019e-07, + "loss": 0.0332, + "num_input_tokens_seen": 210970640, + "step": 173390 + }, + { + "epoch": 19.311170508965365, + "grad_norm": 0.0009027541964314878, + "learning_rate": 1.8051168007726383e-07, + "loss": 0.0057, + "num_input_tokens_seen": 210976624, + "step": 173395 + }, + { + "epoch": 19.31172736384898, + "grad_norm": 0.3146514594554901, + "learning_rate": 1.8022034217977102e-07, + "loss": 0.0285, + "num_input_tokens_seen": 210982832, + "step": 173400 + }, + { + "epoch": 19.3122842187326, + "grad_norm": 1.3843432664871216, + "learning_rate": 1.7992923872498234e-07, + "loss": 0.0716, + "num_input_tokens_seen": 210988112, + "step": 173405 + }, + { + "epoch": 19.312841073616216, + "grad_norm": 0.004190241917967796, + "learning_rate": 1.7963836971564562e-07, + "loss": 0.0361, + "num_input_tokens_seen": 210994640, + "step": 173410 + }, + { + "epoch": 19.313397928499832, + "grad_norm": 0.0006104967906139791, + "learning_rate": 1.7934773515451143e-07, + "loss": 0.035, + "num_input_tokens_seen": 211000752, + "step": 173415 + }, + { + "epoch": 19.31395478338345, + "grad_norm": 0.0008584287716075778, + "learning_rate": 1.79057335044322e-07, + "loss": 0.0671, + "num_input_tokens_seen": 211006928, + "step": 173420 + }, + { + "epoch": 19.314511638267067, + "grad_norm": 1.4564378261566162, + "learning_rate": 1.7876716938782235e-07, + "loss": 0.0439, + "num_input_tokens_seen": 211013232, + "step": 173425 + }, + { + "epoch": 19.315068493150687, + "grad_norm": 0.8353103995323181, + "learning_rate": 1.7847723818775476e-07, + "loss": 0.074, + "num_input_tokens_seen": 211019184, + "step": 173430 + }, + { + "epoch": 19.315625348034303, + "grad_norm": 0.005508281756192446, + "learning_rate": 1.7818754144685867e-07, + "loss": 0.0044, + "num_input_tokens_seen": 211025168, + "step": 173435 + }, + { + "epoch": 19.31618220291792, + "grad_norm": 1.0038501024246216, + "learning_rate": 1.7789807916786527e-07, + "loss": 0.1215, + "num_input_tokens_seen": 211031376, + "step": 173440 + }, + { + "epoch": 19.316739057801538, + "grad_norm": 0.01779252104461193, + "learning_rate": 1.7760885135351124e-07, + "loss": 0.0157, + "num_input_tokens_seen": 211037808, + "step": 173445 + }, + { + "epoch": 19.317295912685154, + "grad_norm": 0.001980713102966547, + "learning_rate": 1.773198580065305e-07, + "loss": 0.0172, + "num_input_tokens_seen": 211043952, + "step": 173450 + }, + { + "epoch": 19.317852767568773, + "grad_norm": 2.011782646179199, + "learning_rate": 1.7703109912965142e-07, + "loss": 0.06, + "num_input_tokens_seen": 211049936, + "step": 173455 + }, + { + "epoch": 19.31840962245239, + "grad_norm": 0.00926226656883955, + "learning_rate": 1.7674257472559963e-07, + "loss": 0.0251, + "num_input_tokens_seen": 211056144, + "step": 173460 + }, + { + "epoch": 19.318966477336005, + "grad_norm": 1.1612334251403809, + "learning_rate": 1.7645428479710348e-07, + "loss": 0.0457, + "num_input_tokens_seen": 211062064, + "step": 173465 + }, + { + "epoch": 19.319523332219624, + "grad_norm": 0.29959866404533386, + "learning_rate": 1.76166229346883e-07, + "loss": 0.0045, + "num_input_tokens_seen": 211068080, + "step": 173470 + }, + { + "epoch": 19.32008018710324, + "grad_norm": 0.02578572742640972, + "learning_rate": 1.75878408377661e-07, + "loss": 0.0014, + "num_input_tokens_seen": 211074448, + "step": 173475 + }, + { + "epoch": 19.32063704198686, + "grad_norm": 0.32655978202819824, + "learning_rate": 1.7559082189216036e-07, + "loss": 0.0102, + "num_input_tokens_seen": 211080400, + "step": 173480 + }, + { + "epoch": 19.321193896870476, + "grad_norm": 0.0003642434603534639, + "learning_rate": 1.7530346989309e-07, + "loss": 0.0232, + "num_input_tokens_seen": 211086832, + "step": 173485 + }, + { + "epoch": 19.32175075175409, + "grad_norm": 0.008250869810581207, + "learning_rate": 1.7501635238316993e-07, + "loss": 0.0016, + "num_input_tokens_seen": 211092688, + "step": 173490 + }, + { + "epoch": 19.32230760663771, + "grad_norm": 0.9344041347503662, + "learning_rate": 1.7472946936510636e-07, + "loss": 0.0511, + "num_input_tokens_seen": 211098864, + "step": 173495 + }, + { + "epoch": 19.322864461521327, + "grad_norm": 0.1677372008562088, + "learning_rate": 1.7444282084161657e-07, + "loss": 0.0154, + "num_input_tokens_seen": 211105136, + "step": 173500 + }, + { + "epoch": 19.323421316404946, + "grad_norm": 0.08454371243715286, + "learning_rate": 1.7415640681540114e-07, + "loss": 0.0107, + "num_input_tokens_seen": 211111440, + "step": 173505 + }, + { + "epoch": 19.323978171288562, + "grad_norm": 0.7630237340927124, + "learning_rate": 1.738702272891718e-07, + "loss": 0.0142, + "num_input_tokens_seen": 211117808, + "step": 173510 + }, + { + "epoch": 19.324535026172178, + "grad_norm": 0.08741728961467743, + "learning_rate": 1.7358428226562362e-07, + "loss": 0.0209, + "num_input_tokens_seen": 211123664, + "step": 173515 + }, + { + "epoch": 19.325091881055798, + "grad_norm": 0.009658767841756344, + "learning_rate": 1.7329857174746555e-07, + "loss": 0.0443, + "num_input_tokens_seen": 211129648, + "step": 173520 + }, + { + "epoch": 19.325648735939414, + "grad_norm": 0.004334995523095131, + "learning_rate": 1.7301309573739543e-07, + "loss": 0.0052, + "num_input_tokens_seen": 211136016, + "step": 173525 + }, + { + "epoch": 19.326205590823033, + "grad_norm": 0.3253844678401947, + "learning_rate": 1.7272785423810555e-07, + "loss": 0.0646, + "num_input_tokens_seen": 211142768, + "step": 173530 + }, + { + "epoch": 19.32676244570665, + "grad_norm": 0.00648887362331152, + "learning_rate": 1.72442847252291e-07, + "loss": 0.0089, + "num_input_tokens_seen": 211148880, + "step": 173535 + }, + { + "epoch": 19.327319300590265, + "grad_norm": 0.18543541431427002, + "learning_rate": 1.7215807478264677e-07, + "loss": 0.0045, + "num_input_tokens_seen": 211155120, + "step": 173540 + }, + { + "epoch": 19.327876155473884, + "grad_norm": 0.0023047730792313814, + "learning_rate": 1.7187353683185968e-07, + "loss": 0.073, + "num_input_tokens_seen": 211161200, + "step": 173545 + }, + { + "epoch": 19.3284330103575, + "grad_norm": 0.741168200969696, + "learning_rate": 1.71589233402622e-07, + "loss": 0.0353, + "num_input_tokens_seen": 211167248, + "step": 173550 + }, + { + "epoch": 19.32898986524112, + "grad_norm": 1.6335018873214722, + "learning_rate": 1.7130516449761213e-07, + "loss": 0.0445, + "num_input_tokens_seen": 211173008, + "step": 173555 + }, + { + "epoch": 19.329546720124736, + "grad_norm": 0.9797447323799133, + "learning_rate": 1.710213301195196e-07, + "loss": 0.1171, + "num_input_tokens_seen": 211178992, + "step": 173560 + }, + { + "epoch": 19.33010357500835, + "grad_norm": 0.009007495827972889, + "learning_rate": 1.707377302710228e-07, + "loss": 0.0139, + "num_input_tokens_seen": 211185200, + "step": 173565 + }, + { + "epoch": 19.33066042989197, + "grad_norm": 0.04406942427158356, + "learning_rate": 1.7045436495480293e-07, + "loss": 0.0024, + "num_input_tokens_seen": 211191472, + "step": 173570 + }, + { + "epoch": 19.331217284775587, + "grad_norm": 0.8625627756118774, + "learning_rate": 1.7017123417353285e-07, + "loss": 0.0169, + "num_input_tokens_seen": 211197776, + "step": 173575 + }, + { + "epoch": 19.331774139659206, + "grad_norm": 0.0075558447279036045, + "learning_rate": 1.69888337929891e-07, + "loss": 0.024, + "num_input_tokens_seen": 211203536, + "step": 173580 + }, + { + "epoch": 19.332330994542822, + "grad_norm": 0.029819443821907043, + "learning_rate": 1.6960567622654466e-07, + "loss": 0.0365, + "num_input_tokens_seen": 211209552, + "step": 173585 + }, + { + "epoch": 19.332887849426438, + "grad_norm": 0.0016511440044268966, + "learning_rate": 1.693232490661667e-07, + "loss": 0.0271, + "num_input_tokens_seen": 211215504, + "step": 173590 + }, + { + "epoch": 19.333444704310057, + "grad_norm": 1.1141064167022705, + "learning_rate": 1.6904105645142444e-07, + "loss": 0.023, + "num_input_tokens_seen": 211221488, + "step": 173595 + }, + { + "epoch": 19.334001559193673, + "grad_norm": 0.037768661975860596, + "learning_rate": 1.6875909838498515e-07, + "loss": 0.0801, + "num_input_tokens_seen": 211227440, + "step": 173600 + }, + { + "epoch": 19.334558414077293, + "grad_norm": 0.9997033476829529, + "learning_rate": 1.6847737486951065e-07, + "loss": 0.0305, + "num_input_tokens_seen": 211233520, + "step": 173605 + }, + { + "epoch": 19.33511526896091, + "grad_norm": 0.07481613010168076, + "learning_rate": 1.6819588590766265e-07, + "loss": 0.0565, + "num_input_tokens_seen": 211239664, + "step": 173610 + }, + { + "epoch": 19.335672123844525, + "grad_norm": 0.1497759073972702, + "learning_rate": 1.679146315020974e-07, + "loss": 0.141, + "num_input_tokens_seen": 211245904, + "step": 173615 + }, + { + "epoch": 19.336228978728144, + "grad_norm": 0.14262725412845612, + "learning_rate": 1.6763361165547387e-07, + "loss": 0.0201, + "num_input_tokens_seen": 211252144, + "step": 173620 + }, + { + "epoch": 19.33678583361176, + "grad_norm": 1.1372995376586914, + "learning_rate": 1.6735282637044825e-07, + "loss": 0.0413, + "num_input_tokens_seen": 211257808, + "step": 173625 + }, + { + "epoch": 19.33734268849538, + "grad_norm": 0.022744745016098022, + "learning_rate": 1.6707227564966844e-07, + "loss": 0.0102, + "num_input_tokens_seen": 211263984, + "step": 173630 + }, + { + "epoch": 19.337899543378995, + "grad_norm": 0.20441469550132751, + "learning_rate": 1.6679195949578785e-07, + "loss": 0.0157, + "num_input_tokens_seen": 211270352, + "step": 173635 + }, + { + "epoch": 19.33845639826261, + "grad_norm": 0.007239634171128273, + "learning_rate": 1.665118779114516e-07, + "loss": 0.0213, + "num_input_tokens_seen": 211276688, + "step": 173640 + }, + { + "epoch": 19.33901325314623, + "grad_norm": 0.49814629554748535, + "learning_rate": 1.6623203089930762e-07, + "loss": 0.0206, + "num_input_tokens_seen": 211282576, + "step": 173645 + }, + { + "epoch": 19.339570108029847, + "grad_norm": 3.045882225036621, + "learning_rate": 1.6595241846200092e-07, + "loss": 0.1166, + "num_input_tokens_seen": 211288208, + "step": 173650 + }, + { + "epoch": 19.340126962913466, + "grad_norm": 0.3817148208618164, + "learning_rate": 1.6567304060216836e-07, + "loss": 0.0436, + "num_input_tokens_seen": 211294224, + "step": 173655 + }, + { + "epoch": 19.340683817797082, + "grad_norm": 0.33857956528663635, + "learning_rate": 1.6539389732245226e-07, + "loss": 0.007, + "num_input_tokens_seen": 211300048, + "step": 173660 + }, + { + "epoch": 19.341240672680698, + "grad_norm": 0.3435801565647125, + "learning_rate": 1.6511498862548657e-07, + "loss": 0.171, + "num_input_tokens_seen": 211306320, + "step": 173665 + }, + { + "epoch": 19.341797527564317, + "grad_norm": 0.0010745411273092031, + "learning_rate": 1.6483631451390813e-07, + "loss": 0.1088, + "num_input_tokens_seen": 211312496, + "step": 173670 + }, + { + "epoch": 19.342354382447933, + "grad_norm": 0.09441564977169037, + "learning_rate": 1.6455787499034815e-07, + "loss": 0.1115, + "num_input_tokens_seen": 211318768, + "step": 173675 + }, + { + "epoch": 19.342911237331553, + "grad_norm": 0.02913186326622963, + "learning_rate": 1.6427967005743506e-07, + "loss": 0.0185, + "num_input_tokens_seen": 211324688, + "step": 173680 + }, + { + "epoch": 19.34346809221517, + "grad_norm": 0.00012169970432296395, + "learning_rate": 1.640016997178001e-07, + "loss": 0.0631, + "num_input_tokens_seen": 211330704, + "step": 173685 + }, + { + "epoch": 19.344024947098784, + "grad_norm": 0.2758377492427826, + "learning_rate": 1.637239639740662e-07, + "loss": 0.118, + "num_input_tokens_seen": 211336784, + "step": 173690 + }, + { + "epoch": 19.344581801982404, + "grad_norm": 0.07813702523708344, + "learning_rate": 1.63446462828859e-07, + "loss": 0.0178, + "num_input_tokens_seen": 211343312, + "step": 173695 + }, + { + "epoch": 19.34513865686602, + "grad_norm": 0.007829247042536736, + "learning_rate": 1.6316919628479865e-07, + "loss": 0.0168, + "num_input_tokens_seen": 211349584, + "step": 173700 + }, + { + "epoch": 19.34569551174964, + "grad_norm": 0.0003461351152509451, + "learning_rate": 1.6289216434450528e-07, + "loss": 0.033, + "num_input_tokens_seen": 211355440, + "step": 173705 + }, + { + "epoch": 19.346252366633255, + "grad_norm": 0.000142036093166098, + "learning_rate": 1.6261536701059065e-07, + "loss": 0.0205, + "num_input_tokens_seen": 211361712, + "step": 173710 + }, + { + "epoch": 19.34680922151687, + "grad_norm": 0.3031826317310333, + "learning_rate": 1.623388042856777e-07, + "loss": 0.0329, + "num_input_tokens_seen": 211367696, + "step": 173715 + }, + { + "epoch": 19.34736607640049, + "grad_norm": 0.026630524545907974, + "learning_rate": 1.6206247617237268e-07, + "loss": 0.0103, + "num_input_tokens_seen": 211373808, + "step": 173720 + }, + { + "epoch": 19.347922931284106, + "grad_norm": 0.019634250551462173, + "learning_rate": 1.6178638267328738e-07, + "loss": 0.0181, + "num_input_tokens_seen": 211380016, + "step": 173725 + }, + { + "epoch": 19.348479786167726, + "grad_norm": 0.112217478454113, + "learning_rate": 1.6151052379103082e-07, + "loss": 0.0142, + "num_input_tokens_seen": 211386224, + "step": 173730 + }, + { + "epoch": 19.34903664105134, + "grad_norm": 2.070234537124634, + "learning_rate": 1.6123489952820647e-07, + "loss": 0.0452, + "num_input_tokens_seen": 211392176, + "step": 173735 + }, + { + "epoch": 19.34959349593496, + "grad_norm": 0.022245626896619797, + "learning_rate": 1.609595098874178e-07, + "loss": 0.0393, + "num_input_tokens_seen": 211398288, + "step": 173740 + }, + { + "epoch": 19.350150350818577, + "grad_norm": 0.007245040964335203, + "learning_rate": 1.606843548712683e-07, + "loss": 0.0586, + "num_input_tokens_seen": 211404464, + "step": 173745 + }, + { + "epoch": 19.350707205702193, + "grad_norm": 0.03280997276306152, + "learning_rate": 1.604094344823559e-07, + "loss": 0.041, + "num_input_tokens_seen": 211410352, + "step": 173750 + }, + { + "epoch": 19.351264060585812, + "grad_norm": 0.059992264956235886, + "learning_rate": 1.601347487232785e-07, + "loss": 0.1814, + "num_input_tokens_seen": 211416240, + "step": 173755 + }, + { + "epoch": 19.35182091546943, + "grad_norm": 0.2876071631908417, + "learning_rate": 1.5986029759662846e-07, + "loss": 0.0838, + "num_input_tokens_seen": 211421712, + "step": 173760 + }, + { + "epoch": 19.352377770353048, + "grad_norm": 0.04215341806411743, + "learning_rate": 1.5958608110500094e-07, + "loss": 0.0399, + "num_input_tokens_seen": 211427856, + "step": 173765 + }, + { + "epoch": 19.352934625236664, + "grad_norm": 2.188089370727539, + "learning_rate": 1.5931209925098278e-07, + "loss": 0.1659, + "num_input_tokens_seen": 211433552, + "step": 173770 + }, + { + "epoch": 19.35349148012028, + "grad_norm": 3.613664388656616, + "learning_rate": 1.5903835203716633e-07, + "loss": 0.2542, + "num_input_tokens_seen": 211439376, + "step": 173775 + }, + { + "epoch": 19.3540483350039, + "grad_norm": 0.028233559802174568, + "learning_rate": 1.5876483946613285e-07, + "loss": 0.0232, + "num_input_tokens_seen": 211445520, + "step": 173780 + }, + { + "epoch": 19.354605189887515, + "grad_norm": 0.0008470414904877543, + "learning_rate": 1.5849156154046918e-07, + "loss": 0.0886, + "num_input_tokens_seen": 211451344, + "step": 173785 + }, + { + "epoch": 19.355162044771134, + "grad_norm": 0.21745868027210236, + "learning_rate": 1.582185182627538e-07, + "loss": 0.1068, + "num_input_tokens_seen": 211456880, + "step": 173790 + }, + { + "epoch": 19.35571889965475, + "grad_norm": 0.4407241642475128, + "learning_rate": 1.5794570963557076e-07, + "loss": 0.0101, + "num_input_tokens_seen": 211462704, + "step": 173795 + }, + { + "epoch": 19.356275754538366, + "grad_norm": 0.012146156281232834, + "learning_rate": 1.5767313566149022e-07, + "loss": 0.0988, + "num_input_tokens_seen": 211468720, + "step": 173800 + }, + { + "epoch": 19.356832609421986, + "grad_norm": 0.03724154084920883, + "learning_rate": 1.574007963430907e-07, + "loss": 0.0981, + "num_input_tokens_seen": 211474800, + "step": 173805 + }, + { + "epoch": 19.3573894643056, + "grad_norm": 0.13027241826057434, + "learning_rate": 1.5712869168294508e-07, + "loss": 0.0112, + "num_input_tokens_seen": 211480944, + "step": 173810 + }, + { + "epoch": 19.35794631918922, + "grad_norm": 0.0005451480392366648, + "learning_rate": 1.568568216836236e-07, + "loss": 0.0088, + "num_input_tokens_seen": 211487088, + "step": 173815 + }, + { + "epoch": 19.358503174072837, + "grad_norm": 0.0023186856415122747, + "learning_rate": 1.565851863476908e-07, + "loss": 0.0481, + "num_input_tokens_seen": 211493136, + "step": 173820 + }, + { + "epoch": 19.359060028956453, + "grad_norm": 1.8032101392745972, + "learning_rate": 1.5631378567771693e-07, + "loss": 0.0763, + "num_input_tokens_seen": 211499696, + "step": 173825 + }, + { + "epoch": 19.359616883840072, + "grad_norm": 0.00024125898198690265, + "learning_rate": 1.5604261967626376e-07, + "loss": 0.0028, + "num_input_tokens_seen": 211505904, + "step": 173830 + }, + { + "epoch": 19.360173738723688, + "grad_norm": 0.02240462601184845, + "learning_rate": 1.5577168834589316e-07, + "loss": 0.1064, + "num_input_tokens_seen": 211512080, + "step": 173835 + }, + { + "epoch": 19.360730593607308, + "grad_norm": 0.028062406927347183, + "learning_rate": 1.5550099168916422e-07, + "loss": 0.0413, + "num_input_tokens_seen": 211518160, + "step": 173840 + }, + { + "epoch": 19.361287448490923, + "grad_norm": 1.9326460361480713, + "learning_rate": 1.552305297086304e-07, + "loss": 0.1067, + "num_input_tokens_seen": 211524112, + "step": 173845 + }, + { + "epoch": 19.36184430337454, + "grad_norm": 0.05033039674162865, + "learning_rate": 1.5496030240685077e-07, + "loss": 0.0751, + "num_input_tokens_seen": 211530032, + "step": 173850 + }, + { + "epoch": 19.36240115825816, + "grad_norm": 0.027067653834819794, + "learning_rate": 1.5469030978637888e-07, + "loss": 0.0338, + "num_input_tokens_seen": 211535952, + "step": 173855 + }, + { + "epoch": 19.362958013141775, + "grad_norm": 1.1050071716308594, + "learning_rate": 1.5442055184976269e-07, + "loss": 0.0178, + "num_input_tokens_seen": 211542288, + "step": 173860 + }, + { + "epoch": 19.363514868025394, + "grad_norm": 0.05000618100166321, + "learning_rate": 1.5415102859954732e-07, + "loss": 0.0034, + "num_input_tokens_seen": 211548208, + "step": 173865 + }, + { + "epoch": 19.36407172290901, + "grad_norm": 1.8382068872451782, + "learning_rate": 1.538817400382836e-07, + "loss": 0.0962, + "num_input_tokens_seen": 211554288, + "step": 173870 + }, + { + "epoch": 19.364628577792626, + "grad_norm": 0.04834376648068428, + "learning_rate": 1.5361268616851388e-07, + "loss": 0.0503, + "num_input_tokens_seen": 211560656, + "step": 173875 + }, + { + "epoch": 19.365185432676245, + "grad_norm": 0.01324508897960186, + "learning_rate": 1.5334386699277504e-07, + "loss": 0.0048, + "num_input_tokens_seen": 211566736, + "step": 173880 + }, + { + "epoch": 19.36574228755986, + "grad_norm": 0.44904541969299316, + "learning_rate": 1.5307528251361503e-07, + "loss": 0.1127, + "num_input_tokens_seen": 211572688, + "step": 173885 + }, + { + "epoch": 19.36629914244348, + "grad_norm": 1.7348445653915405, + "learning_rate": 1.528069327335624e-07, + "loss": 0.1013, + "num_input_tokens_seen": 211578576, + "step": 173890 + }, + { + "epoch": 19.366855997327097, + "grad_norm": 0.9957280158996582, + "learning_rate": 1.5253881765515953e-07, + "loss": 0.1035, + "num_input_tokens_seen": 211584752, + "step": 173895 + }, + { + "epoch": 19.367412852210713, + "grad_norm": 1.1602630615234375, + "learning_rate": 1.5227093728092945e-07, + "loss": 0.0545, + "num_input_tokens_seen": 211590960, + "step": 173900 + }, + { + "epoch": 19.367969707094332, + "grad_norm": 0.007694633211940527, + "learning_rate": 1.5200329161341177e-07, + "loss": 0.0098, + "num_input_tokens_seen": 211597264, + "step": 173905 + }, + { + "epoch": 19.368526561977948, + "grad_norm": 0.005252427887171507, + "learning_rate": 1.5173588065513222e-07, + "loss": 0.0184, + "num_input_tokens_seen": 211603344, + "step": 173910 + }, + { + "epoch": 19.369083416861567, + "grad_norm": 0.004854429513216019, + "learning_rate": 1.5146870440861383e-07, + "loss": 0.0179, + "num_input_tokens_seen": 211609520, + "step": 173915 + }, + { + "epoch": 19.369640271745183, + "grad_norm": 0.3027311861515045, + "learning_rate": 1.5120176287637956e-07, + "loss": 0.0316, + "num_input_tokens_seen": 211615408, + "step": 173920 + }, + { + "epoch": 19.3701971266288, + "grad_norm": 0.011945446953177452, + "learning_rate": 1.5093505606095515e-07, + "loss": 0.0823, + "num_input_tokens_seen": 211621488, + "step": 173925 + }, + { + "epoch": 19.37075398151242, + "grad_norm": 0.3261510133743286, + "learning_rate": 1.5066858396485807e-07, + "loss": 0.008, + "num_input_tokens_seen": 211627760, + "step": 173930 + }, + { + "epoch": 19.371310836396034, + "grad_norm": 0.0005122713628225029, + "learning_rate": 1.504023465906057e-07, + "loss": 0.1006, + "num_input_tokens_seen": 211633712, + "step": 173935 + }, + { + "epoch": 19.371867691279654, + "grad_norm": 0.05875016376376152, + "learning_rate": 1.5013634394070996e-07, + "loss": 0.0341, + "num_input_tokens_seen": 211639952, + "step": 173940 + }, + { + "epoch": 19.37242454616327, + "grad_norm": 1.2288604974746704, + "learning_rate": 1.4987057601768827e-07, + "loss": 0.1314, + "num_input_tokens_seen": 211645520, + "step": 173945 + }, + { + "epoch": 19.372981401046886, + "grad_norm": 1.6146575212478638, + "learning_rate": 1.4960504282404698e-07, + "loss": 0.0517, + "num_input_tokens_seen": 211651440, + "step": 173950 + }, + { + "epoch": 19.373538255930505, + "grad_norm": 0.48413699865341187, + "learning_rate": 1.4933974436229792e-07, + "loss": 0.0198, + "num_input_tokens_seen": 211657712, + "step": 173955 + }, + { + "epoch": 19.37409511081412, + "grad_norm": 0.62566739320755, + "learning_rate": 1.4907468063494189e-07, + "loss": 0.0066, + "num_input_tokens_seen": 211664016, + "step": 173960 + }, + { + "epoch": 19.37465196569774, + "grad_norm": 0.9913741946220398, + "learning_rate": 1.4880985164448803e-07, + "loss": 0.1253, + "num_input_tokens_seen": 211670128, + "step": 173965 + }, + { + "epoch": 19.375208820581356, + "grad_norm": 0.05229758098721504, + "learning_rate": 1.4854525739343427e-07, + "loss": 0.0229, + "num_input_tokens_seen": 211676592, + "step": 173970 + }, + { + "epoch": 19.375765675464972, + "grad_norm": 1.4674686193466187, + "learning_rate": 1.4828089788428424e-07, + "loss": 0.0329, + "num_input_tokens_seen": 211682416, + "step": 173975 + }, + { + "epoch": 19.37632253034859, + "grad_norm": 0.0009550146642141044, + "learning_rate": 1.4801677311953032e-07, + "loss": 0.0111, + "num_input_tokens_seen": 211688752, + "step": 173980 + }, + { + "epoch": 19.376879385232208, + "grad_norm": 0.0002570381329860538, + "learning_rate": 1.477528831016678e-07, + "loss": 0.0806, + "num_input_tokens_seen": 211695184, + "step": 173985 + }, + { + "epoch": 19.377436240115827, + "grad_norm": 0.6331967711448669, + "learning_rate": 1.4748922783318907e-07, + "loss": 0.0134, + "num_input_tokens_seen": 211700976, + "step": 173990 + }, + { + "epoch": 19.377993094999443, + "grad_norm": 1.2944585084915161, + "learning_rate": 1.472258073165894e-07, + "loss": 0.1184, + "num_input_tokens_seen": 211707088, + "step": 173995 + }, + { + "epoch": 19.37854994988306, + "grad_norm": 0.6326460242271423, + "learning_rate": 1.469626215543529e-07, + "loss": 0.1147, + "num_input_tokens_seen": 211712880, + "step": 174000 + }, + { + "epoch": 19.37910680476668, + "grad_norm": 0.0020653442479670048, + "learning_rate": 1.466996705489665e-07, + "loss": 0.0043, + "num_input_tokens_seen": 211719184, + "step": 174005 + }, + { + "epoch": 19.379663659650294, + "grad_norm": 1.1236240863800049, + "learning_rate": 1.4643695430291428e-07, + "loss": 0.0266, + "num_input_tokens_seen": 211724944, + "step": 174010 + }, + { + "epoch": 19.380220514533914, + "grad_norm": 0.34760457277297974, + "learning_rate": 1.461744728186748e-07, + "loss": 0.0531, + "num_input_tokens_seen": 211731088, + "step": 174015 + }, + { + "epoch": 19.38077736941753, + "grad_norm": 0.2598136067390442, + "learning_rate": 1.4591222609873224e-07, + "loss": 0.0287, + "num_input_tokens_seen": 211737584, + "step": 174020 + }, + { + "epoch": 19.381334224301145, + "grad_norm": 0.1582208275794983, + "learning_rate": 1.4565021414555956e-07, + "loss": 0.0423, + "num_input_tokens_seen": 211743792, + "step": 174025 + }, + { + "epoch": 19.381891079184765, + "grad_norm": 2.5492918491363525, + "learning_rate": 1.4538843696163817e-07, + "loss": 0.0902, + "num_input_tokens_seen": 211749040, + "step": 174030 + }, + { + "epoch": 19.38244793406838, + "grad_norm": 0.000287520611891523, + "learning_rate": 1.4512689454942997e-07, + "loss": 0.0013, + "num_input_tokens_seen": 211755216, + "step": 174035 + }, + { + "epoch": 19.383004788952, + "grad_norm": 1.3895341157913208, + "learning_rate": 1.4486558691141627e-07, + "loss": 0.0611, + "num_input_tokens_seen": 211761296, + "step": 174040 + }, + { + "epoch": 19.383561643835616, + "grad_norm": 1.4802348613739014, + "learning_rate": 1.4460451405005626e-07, + "loss": 0.1114, + "num_input_tokens_seen": 211767120, + "step": 174045 + }, + { + "epoch": 19.384118498719232, + "grad_norm": 0.2739701569080353, + "learning_rate": 1.4434367596782572e-07, + "loss": 0.0063, + "num_input_tokens_seen": 211773296, + "step": 174050 + }, + { + "epoch": 19.38467535360285, + "grad_norm": 0.03951647877693176, + "learning_rate": 1.4408307266718102e-07, + "loss": 0.0203, + "num_input_tokens_seen": 211779248, + "step": 174055 + }, + { + "epoch": 19.385232208486467, + "grad_norm": 0.4752456545829773, + "learning_rate": 1.4382270415058408e-07, + "loss": 0.0501, + "num_input_tokens_seen": 211785424, + "step": 174060 + }, + { + "epoch": 19.385789063370087, + "grad_norm": 0.02141399309039116, + "learning_rate": 1.435625704204968e-07, + "loss": 0.0054, + "num_input_tokens_seen": 211791760, + "step": 174065 + }, + { + "epoch": 19.386345918253703, + "grad_norm": 0.0754004567861557, + "learning_rate": 1.4330267147937837e-07, + "loss": 0.0193, + "num_input_tokens_seen": 211798096, + "step": 174070 + }, + { + "epoch": 19.38690277313732, + "grad_norm": 2.0084097385406494, + "learning_rate": 1.4304300732967956e-07, + "loss": 0.1013, + "num_input_tokens_seen": 211804464, + "step": 174075 + }, + { + "epoch": 19.387459628020938, + "grad_norm": 0.3081264793872833, + "learning_rate": 1.42783577973854e-07, + "loss": 0.0139, + "num_input_tokens_seen": 211810768, + "step": 174080 + }, + { + "epoch": 19.388016482904554, + "grad_norm": 1.3328980207443237, + "learning_rate": 1.4252438341435248e-07, + "loss": 0.0417, + "num_input_tokens_seen": 211816912, + "step": 174085 + }, + { + "epoch": 19.388573337788173, + "grad_norm": 0.0006275965133681893, + "learning_rate": 1.4226542365362304e-07, + "loss": 0.2071, + "num_input_tokens_seen": 211823184, + "step": 174090 + }, + { + "epoch": 19.38913019267179, + "grad_norm": 0.5044459104537964, + "learning_rate": 1.4200669869411375e-07, + "loss": 0.0631, + "num_input_tokens_seen": 211829520, + "step": 174095 + }, + { + "epoch": 19.389687047555405, + "grad_norm": 0.050294797867536545, + "learning_rate": 1.417482085382671e-07, + "loss": 0.0684, + "num_input_tokens_seen": 211835568, + "step": 174100 + }, + { + "epoch": 19.390243902439025, + "grad_norm": 0.8824687004089355, + "learning_rate": 1.4148995318852277e-07, + "loss": 0.1501, + "num_input_tokens_seen": 211842288, + "step": 174105 + }, + { + "epoch": 19.39080075732264, + "grad_norm": 0.001741555635817349, + "learning_rate": 1.4123193264732603e-07, + "loss": 0.0038, + "num_input_tokens_seen": 211848336, + "step": 174110 + }, + { + "epoch": 19.39135761220626, + "grad_norm": 0.04284663870930672, + "learning_rate": 1.4097414691710552e-07, + "loss": 0.0018, + "num_input_tokens_seen": 211854512, + "step": 174115 + }, + { + "epoch": 19.391914467089876, + "grad_norm": 0.0012815827503800392, + "learning_rate": 1.4071659600030373e-07, + "loss": 0.0598, + "num_input_tokens_seen": 211861072, + "step": 174120 + }, + { + "epoch": 19.392471321973495, + "grad_norm": 0.025098972022533417, + "learning_rate": 1.4045927989935204e-07, + "loss": 0.0047, + "num_input_tokens_seen": 211867216, + "step": 174125 + }, + { + "epoch": 19.39302817685711, + "grad_norm": 0.5875474810600281, + "learning_rate": 1.4020219861667906e-07, + "loss": 0.1791, + "num_input_tokens_seen": 211873296, + "step": 174130 + }, + { + "epoch": 19.393585031740727, + "grad_norm": 0.0096711665391922, + "learning_rate": 1.399453521547106e-07, + "loss": 0.0033, + "num_input_tokens_seen": 211879440, + "step": 174135 + }, + { + "epoch": 19.394141886624347, + "grad_norm": 0.8890275955200195, + "learning_rate": 1.396887405158781e-07, + "loss": 0.0189, + "num_input_tokens_seen": 211885584, + "step": 174140 + }, + { + "epoch": 19.394698741507963, + "grad_norm": 0.006776955910027027, + "learning_rate": 1.3943236370260183e-07, + "loss": 0.0839, + "num_input_tokens_seen": 211891472, + "step": 174145 + }, + { + "epoch": 19.395255596391582, + "grad_norm": 0.04512069746851921, + "learning_rate": 1.391762217173076e-07, + "loss": 0.0937, + "num_input_tokens_seen": 211897776, + "step": 174150 + }, + { + "epoch": 19.395812451275198, + "grad_norm": 0.17406165599822998, + "learning_rate": 1.389203145624074e-07, + "loss": 0.0136, + "num_input_tokens_seen": 211904080, + "step": 174155 + }, + { + "epoch": 19.396369306158814, + "grad_norm": 0.21800942718982697, + "learning_rate": 1.3866464224032705e-07, + "loss": 0.0056, + "num_input_tokens_seen": 211910352, + "step": 174160 + }, + { + "epoch": 19.396926161042433, + "grad_norm": 0.030596943572163582, + "learning_rate": 1.3840920475347575e-07, + "loss": 0.0047, + "num_input_tokens_seen": 211916656, + "step": 174165 + }, + { + "epoch": 19.39748301592605, + "grad_norm": 0.034140266478061676, + "learning_rate": 1.38154002104271e-07, + "loss": 0.0059, + "num_input_tokens_seen": 211922704, + "step": 174170 + }, + { + "epoch": 19.39803987080967, + "grad_norm": 0.2756344676017761, + "learning_rate": 1.378990342951192e-07, + "loss": 0.0557, + "num_input_tokens_seen": 211928848, + "step": 174175 + }, + { + "epoch": 19.398596725693285, + "grad_norm": 0.26817217469215393, + "learning_rate": 1.3764430132842953e-07, + "loss": 0.0153, + "num_input_tokens_seen": 211935056, + "step": 174180 + }, + { + "epoch": 19.3991535805769, + "grad_norm": 0.6956409811973572, + "learning_rate": 1.3738980320660842e-07, + "loss": 0.0715, + "num_input_tokens_seen": 211940784, + "step": 174185 + }, + { + "epoch": 19.39971043546052, + "grad_norm": 0.23956571519374847, + "learning_rate": 1.3713553993206228e-07, + "loss": 0.0527, + "num_input_tokens_seen": 211945840, + "step": 174190 + }, + { + "epoch": 19.400267290344136, + "grad_norm": 0.0019959043711423874, + "learning_rate": 1.368815115071892e-07, + "loss": 0.0057, + "num_input_tokens_seen": 211951760, + "step": 174195 + }, + { + "epoch": 19.400824145227755, + "grad_norm": 0.06392200291156769, + "learning_rate": 1.3662771793439e-07, + "loss": 0.0053, + "num_input_tokens_seen": 211958000, + "step": 174200 + }, + { + "epoch": 19.40138100011137, + "grad_norm": 0.0062215086072683334, + "learning_rate": 1.3637415921606277e-07, + "loss": 0.0069, + "num_input_tokens_seen": 211964176, + "step": 174205 + }, + { + "epoch": 19.401937854994987, + "grad_norm": 0.0009579365141689777, + "learning_rate": 1.3612083535460284e-07, + "loss": 0.0077, + "num_input_tokens_seen": 211970448, + "step": 174210 + }, + { + "epoch": 19.402494709878606, + "grad_norm": 1.2217222452163696, + "learning_rate": 1.3586774635239997e-07, + "loss": 0.1658, + "num_input_tokens_seen": 211976272, + "step": 174215 + }, + { + "epoch": 19.403051564762222, + "grad_norm": 0.00018378367531113327, + "learning_rate": 1.3561489221184942e-07, + "loss": 0.0813, + "num_input_tokens_seen": 211982352, + "step": 174220 + }, + { + "epoch": 19.403608419645842, + "grad_norm": 0.6331471800804138, + "learning_rate": 1.3536227293533544e-07, + "loss": 0.112, + "num_input_tokens_seen": 211988336, + "step": 174225 + }, + { + "epoch": 19.404165274529458, + "grad_norm": 1.3540010452270508, + "learning_rate": 1.3510988852524777e-07, + "loss": 0.108, + "num_input_tokens_seen": 211994704, + "step": 174230 + }, + { + "epoch": 19.404722129413074, + "grad_norm": 0.03760722279548645, + "learning_rate": 1.3485773898396504e-07, + "loss": 0.0166, + "num_input_tokens_seen": 212001136, + "step": 174235 + }, + { + "epoch": 19.405278984296693, + "grad_norm": 0.07810736447572708, + "learning_rate": 1.3460582431387704e-07, + "loss": 0.0029, + "num_input_tokens_seen": 212006928, + "step": 174240 + }, + { + "epoch": 19.40583583918031, + "grad_norm": 2.256850481033325, + "learning_rate": 1.3435414451735685e-07, + "loss": 0.0784, + "num_input_tokens_seen": 212012432, + "step": 174245 + }, + { + "epoch": 19.40639269406393, + "grad_norm": 0.0014994750963523984, + "learning_rate": 1.341026995967831e-07, + "loss": 0.0265, + "num_input_tokens_seen": 212018480, + "step": 174250 + }, + { + "epoch": 19.406949548947544, + "grad_norm": 2.0387356281280518, + "learning_rate": 1.3385148955453174e-07, + "loss": 0.0532, + "num_input_tokens_seen": 212024400, + "step": 174255 + }, + { + "epoch": 19.40750640383116, + "grad_norm": 0.01696569286286831, + "learning_rate": 1.336005143929786e-07, + "loss": 0.1211, + "num_input_tokens_seen": 212030768, + "step": 174260 + }, + { + "epoch": 19.40806325871478, + "grad_norm": 0.29473504424095154, + "learning_rate": 1.3334977411448845e-07, + "loss": 0.0279, + "num_input_tokens_seen": 212036880, + "step": 174265 + }, + { + "epoch": 19.408620113598396, + "grad_norm": 0.04965096712112427, + "learning_rate": 1.3309926872143163e-07, + "loss": 0.03, + "num_input_tokens_seen": 212042928, + "step": 174270 + }, + { + "epoch": 19.409176968482015, + "grad_norm": 0.08397696912288666, + "learning_rate": 1.328489982161757e-07, + "loss": 0.0174, + "num_input_tokens_seen": 212048688, + "step": 174275 + }, + { + "epoch": 19.40973382336563, + "grad_norm": 0.15501394867897034, + "learning_rate": 1.325989626010854e-07, + "loss": 0.0211, + "num_input_tokens_seen": 212054640, + "step": 174280 + }, + { + "epoch": 19.410290678249247, + "grad_norm": 0.012241998687386513, + "learning_rate": 1.3234916187851999e-07, + "loss": 0.0518, + "num_input_tokens_seen": 212060880, + "step": 174285 + }, + { + "epoch": 19.410847533132866, + "grad_norm": 0.5192545056343079, + "learning_rate": 1.320995960508442e-07, + "loss": 0.097, + "num_input_tokens_seen": 212066736, + "step": 174290 + }, + { + "epoch": 19.411404388016482, + "grad_norm": 0.043312475085258484, + "learning_rate": 1.3185026512040622e-07, + "loss": 0.0024, + "num_input_tokens_seen": 212073008, + "step": 174295 + }, + { + "epoch": 19.4119612429001, + "grad_norm": 0.02665349468588829, + "learning_rate": 1.3160116908957076e-07, + "loss": 0.0117, + "num_input_tokens_seen": 212078992, + "step": 174300 + }, + { + "epoch": 19.412518097783718, + "grad_norm": 0.031725022941827774, + "learning_rate": 1.313523079606832e-07, + "loss": 0.0075, + "num_input_tokens_seen": 212084912, + "step": 174305 + }, + { + "epoch": 19.413074952667333, + "grad_norm": 1.01784086227417, + "learning_rate": 1.311036817361e-07, + "loss": 0.0491, + "num_input_tokens_seen": 212091120, + "step": 174310 + }, + { + "epoch": 19.413631807550953, + "grad_norm": 0.0299467071890831, + "learning_rate": 1.3085529041816646e-07, + "loss": 0.0038, + "num_input_tokens_seen": 212096752, + "step": 174315 + }, + { + "epoch": 19.41418866243457, + "grad_norm": 0.4801594018936157, + "learning_rate": 1.3060713400922798e-07, + "loss": 0.022, + "num_input_tokens_seen": 212103152, + "step": 174320 + }, + { + "epoch": 19.414745517318188, + "grad_norm": 0.3619450032711029, + "learning_rate": 1.3035921251163263e-07, + "loss": 0.1749, + "num_input_tokens_seen": 212109072, + "step": 174325 + }, + { + "epoch": 19.415302372201804, + "grad_norm": 0.0001250208733836189, + "learning_rate": 1.3011152592771746e-07, + "loss": 0.0185, + "num_input_tokens_seen": 212115152, + "step": 174330 + }, + { + "epoch": 19.41585922708542, + "grad_norm": 0.02848399057984352, + "learning_rate": 1.2986407425982506e-07, + "loss": 0.0411, + "num_input_tokens_seen": 212121296, + "step": 174335 + }, + { + "epoch": 19.41641608196904, + "grad_norm": 1.3335379362106323, + "learning_rate": 1.2961685751029518e-07, + "loss": 0.0893, + "num_input_tokens_seen": 212127024, + "step": 174340 + }, + { + "epoch": 19.416972936852655, + "grad_norm": 0.000410452950745821, + "learning_rate": 1.2936987568145653e-07, + "loss": 0.0161, + "num_input_tokens_seen": 212132624, + "step": 174345 + }, + { + "epoch": 19.417529791736275, + "grad_norm": 0.2724461257457733, + "learning_rate": 1.2912312877564615e-07, + "loss": 0.0149, + "num_input_tokens_seen": 212137776, + "step": 174350 + }, + { + "epoch": 19.41808664661989, + "grad_norm": 0.0018864456797018647, + "learning_rate": 1.2887661679519268e-07, + "loss": 0.008, + "num_input_tokens_seen": 212143824, + "step": 174355 + }, + { + "epoch": 19.418643501503507, + "grad_norm": 0.0002462489646859467, + "learning_rate": 1.2863033974242765e-07, + "loss": 0.0031, + "num_input_tokens_seen": 212150192, + "step": 174360 + }, + { + "epoch": 19.419200356387126, + "grad_norm": 0.0009358524694107473, + "learning_rate": 1.2838429761967418e-07, + "loss": 0.0029, + "num_input_tokens_seen": 212156208, + "step": 174365 + }, + { + "epoch": 19.419757211270742, + "grad_norm": 0.036642275750637054, + "learning_rate": 1.2813849042926095e-07, + "loss": 0.0241, + "num_input_tokens_seen": 212162320, + "step": 174370 + }, + { + "epoch": 19.42031406615436, + "grad_norm": 0.3074492812156677, + "learning_rate": 1.278929181735028e-07, + "loss": 0.0118, + "num_input_tokens_seen": 212168528, + "step": 174375 + }, + { + "epoch": 19.420870921037977, + "grad_norm": 0.8886306881904602, + "learning_rate": 1.276475808547256e-07, + "loss": 0.0079, + "num_input_tokens_seen": 212175024, + "step": 174380 + }, + { + "epoch": 19.421427775921593, + "grad_norm": 0.8927417993545532, + "learning_rate": 1.2740247847524422e-07, + "loss": 0.025, + "num_input_tokens_seen": 212181232, + "step": 174385 + }, + { + "epoch": 19.421984630805213, + "grad_norm": 0.19996607303619385, + "learning_rate": 1.2715761103737345e-07, + "loss": 0.1714, + "num_input_tokens_seen": 212187376, + "step": 174390 + }, + { + "epoch": 19.42254148568883, + "grad_norm": 0.0026665818877518177, + "learning_rate": 1.269129785434253e-07, + "loss": 0.042, + "num_input_tokens_seen": 212193872, + "step": 174395 + }, + { + "epoch": 19.423098340572448, + "grad_norm": 0.0006170601700432599, + "learning_rate": 1.2666858099571467e-07, + "loss": 0.0541, + "num_input_tokens_seen": 212200240, + "step": 174400 + }, + { + "epoch": 19.423655195456064, + "grad_norm": 0.0015049686189740896, + "learning_rate": 1.2642441839654795e-07, + "loss": 0.0145, + "num_input_tokens_seen": 212206544, + "step": 174405 + }, + { + "epoch": 19.42421205033968, + "grad_norm": 0.002823788905516267, + "learning_rate": 1.261804907482289e-07, + "loss": 0.0387, + "num_input_tokens_seen": 212212432, + "step": 174410 + }, + { + "epoch": 19.4247689052233, + "grad_norm": 0.049929242581129074, + "learning_rate": 1.2593679805306403e-07, + "loss": 0.0028, + "num_input_tokens_seen": 212218544, + "step": 174415 + }, + { + "epoch": 19.425325760106915, + "grad_norm": 1.0318410396575928, + "learning_rate": 1.2569334031335423e-07, + "loss": 0.0173, + "num_input_tokens_seen": 212224464, + "step": 174420 + }, + { + "epoch": 19.425882614990535, + "grad_norm": 0.04068361595273018, + "learning_rate": 1.2545011753140322e-07, + "loss": 0.0489, + "num_input_tokens_seen": 212230416, + "step": 174425 + }, + { + "epoch": 19.42643946987415, + "grad_norm": 1.31309974193573, + "learning_rate": 1.2520712970950088e-07, + "loss": 0.0493, + "num_input_tokens_seen": 212236400, + "step": 174430 + }, + { + "epoch": 19.426996324757766, + "grad_norm": 0.11816742271184921, + "learning_rate": 1.249643768499509e-07, + "loss": 0.0286, + "num_input_tokens_seen": 212242672, + "step": 174435 + }, + { + "epoch": 19.427553179641386, + "grad_norm": 0.00027087744092568755, + "learning_rate": 1.2472185895503752e-07, + "loss": 0.0418, + "num_input_tokens_seen": 212248912, + "step": 174440 + }, + { + "epoch": 19.428110034525, + "grad_norm": 1.1986260414123535, + "learning_rate": 1.2447957602705895e-07, + "loss": 0.1022, + "num_input_tokens_seen": 212255088, + "step": 174445 + }, + { + "epoch": 19.42866688940862, + "grad_norm": 2.6041793823242188, + "learning_rate": 1.242375280682967e-07, + "loss": 0.0431, + "num_input_tokens_seen": 212260528, + "step": 174450 + }, + { + "epoch": 19.429223744292237, + "grad_norm": 0.002310351002961397, + "learning_rate": 1.2399571508104612e-07, + "loss": 0.0043, + "num_input_tokens_seen": 212266768, + "step": 174455 + }, + { + "epoch": 19.429780599175857, + "grad_norm": 0.3426320254802704, + "learning_rate": 1.237541370675832e-07, + "loss": 0.0092, + "num_input_tokens_seen": 212272688, + "step": 174460 + }, + { + "epoch": 19.430337454059472, + "grad_norm": 0.003343836171552539, + "learning_rate": 1.23512794030195e-07, + "loss": 0.121, + "num_input_tokens_seen": 212278544, + "step": 174465 + }, + { + "epoch": 19.43089430894309, + "grad_norm": 0.12309783697128296, + "learning_rate": 1.2327168597115746e-07, + "loss": 0.0052, + "num_input_tokens_seen": 212284560, + "step": 174470 + }, + { + "epoch": 19.431451163826708, + "grad_norm": 0.05571059137582779, + "learning_rate": 1.2303081289274932e-07, + "loss": 0.0061, + "num_input_tokens_seen": 212290576, + "step": 174475 + }, + { + "epoch": 19.432008018710324, + "grad_norm": 0.089602030813694, + "learning_rate": 1.227901747972493e-07, + "loss": 0.0033, + "num_input_tokens_seen": 212296592, + "step": 174480 + }, + { + "epoch": 19.432564873593943, + "grad_norm": 2.189134359359741, + "learning_rate": 1.2254977168692504e-07, + "loss": 0.0857, + "num_input_tokens_seen": 212302576, + "step": 174485 + }, + { + "epoch": 19.43312172847756, + "grad_norm": 0.041864290833473206, + "learning_rate": 1.2230960356404975e-07, + "loss": 0.0029, + "num_input_tokens_seen": 212309008, + "step": 174490 + }, + { + "epoch": 19.433678583361175, + "grad_norm": 0.4277186393737793, + "learning_rate": 1.22069670430891e-07, + "loss": 0.0079, + "num_input_tokens_seen": 212315504, + "step": 174495 + }, + { + "epoch": 19.434235438244794, + "grad_norm": 0.013040086254477501, + "learning_rate": 1.2182997228971648e-07, + "loss": 0.0241, + "num_input_tokens_seen": 212321840, + "step": 174500 + }, + { + "epoch": 19.43479229312841, + "grad_norm": 0.03922652453184128, + "learning_rate": 1.2159050914279103e-07, + "loss": 0.0023, + "num_input_tokens_seen": 212328368, + "step": 174505 + }, + { + "epoch": 19.43534914801203, + "grad_norm": 0.40562474727630615, + "learning_rate": 1.213512809923767e-07, + "loss": 0.0214, + "num_input_tokens_seen": 212334384, + "step": 174510 + }, + { + "epoch": 19.435906002895646, + "grad_norm": 0.033103927969932556, + "learning_rate": 1.2111228784073003e-07, + "loss": 0.0077, + "num_input_tokens_seen": 212340272, + "step": 174515 + }, + { + "epoch": 19.43646285777926, + "grad_norm": 0.0738966315984726, + "learning_rate": 1.2087352969011034e-07, + "loss": 0.0024, + "num_input_tokens_seen": 212346256, + "step": 174520 + }, + { + "epoch": 19.43701971266288, + "grad_norm": 0.009418082423508167, + "learning_rate": 1.2063500654277137e-07, + "loss": 0.0683, + "num_input_tokens_seen": 212352240, + "step": 174525 + }, + { + "epoch": 19.437576567546497, + "grad_norm": 0.0020984902512282133, + "learning_rate": 1.2039671840097245e-07, + "loss": 0.0831, + "num_input_tokens_seen": 212358352, + "step": 174530 + }, + { + "epoch": 19.438133422430116, + "grad_norm": 0.6267776489257812, + "learning_rate": 1.2015866526695618e-07, + "loss": 0.022, + "num_input_tokens_seen": 212364112, + "step": 174535 + }, + { + "epoch": 19.438690277313732, + "grad_norm": 0.170449897646904, + "learning_rate": 1.1992084714297636e-07, + "loss": 0.0075, + "num_input_tokens_seen": 212369968, + "step": 174540 + }, + { + "epoch": 19.439247132197348, + "grad_norm": 0.5862420797348022, + "learning_rate": 1.196832640312756e-07, + "loss": 0.0155, + "num_input_tokens_seen": 212376176, + "step": 174545 + }, + { + "epoch": 19.439803987080968, + "grad_norm": 0.43164974451065063, + "learning_rate": 1.1944591593410214e-07, + "loss": 0.0099, + "num_input_tokens_seen": 212382320, + "step": 174550 + }, + { + "epoch": 19.440360841964583, + "grad_norm": 0.1749524623155594, + "learning_rate": 1.1920880285369584e-07, + "loss": 0.0123, + "num_input_tokens_seen": 212388080, + "step": 174555 + }, + { + "epoch": 19.440917696848203, + "grad_norm": 0.032385967671871185, + "learning_rate": 1.1897192479229657e-07, + "loss": 0.1129, + "num_input_tokens_seen": 212394096, + "step": 174560 + }, + { + "epoch": 19.44147455173182, + "grad_norm": 0.10115131735801697, + "learning_rate": 1.1873528175214143e-07, + "loss": 0.0076, + "num_input_tokens_seen": 212400080, + "step": 174565 + }, + { + "epoch": 19.442031406615435, + "grad_norm": 0.04410111904144287, + "learning_rate": 1.1849887373546476e-07, + "loss": 0.0051, + "num_input_tokens_seen": 212406128, + "step": 174570 + }, + { + "epoch": 19.442588261499054, + "grad_norm": 0.02807098813354969, + "learning_rate": 1.1826270074450363e-07, + "loss": 0.0103, + "num_input_tokens_seen": 212412528, + "step": 174575 + }, + { + "epoch": 19.44314511638267, + "grad_norm": 0.009059225209057331, + "learning_rate": 1.1802676278148406e-07, + "loss": 0.0202, + "num_input_tokens_seen": 212418544, + "step": 174580 + }, + { + "epoch": 19.44370197126629, + "grad_norm": 0.1417350172996521, + "learning_rate": 1.1779105984863759e-07, + "loss": 0.0794, + "num_input_tokens_seen": 212424880, + "step": 174585 + }, + { + "epoch": 19.444258826149905, + "grad_norm": 0.16085369884967804, + "learning_rate": 1.1755559194818744e-07, + "loss": 0.1877, + "num_input_tokens_seen": 212430992, + "step": 174590 + }, + { + "epoch": 19.44481568103352, + "grad_norm": 0.37024950981140137, + "learning_rate": 1.1732035908236517e-07, + "loss": 0.0186, + "num_input_tokens_seen": 212436368, + "step": 174595 + }, + { + "epoch": 19.44537253591714, + "grad_norm": 0.00014473043847829103, + "learning_rate": 1.1708536125338565e-07, + "loss": 0.0177, + "num_input_tokens_seen": 212441520, + "step": 174600 + }, + { + "epoch": 19.445929390800757, + "grad_norm": 0.3900294005870819, + "learning_rate": 1.1685059846346935e-07, + "loss": 0.0072, + "num_input_tokens_seen": 212447760, + "step": 174605 + }, + { + "epoch": 19.446486245684376, + "grad_norm": 0.022078227251768112, + "learning_rate": 1.166160707148367e-07, + "loss": 0.1391, + "num_input_tokens_seen": 212453680, + "step": 174610 + }, + { + "epoch": 19.447043100567992, + "grad_norm": 0.46747955679893494, + "learning_rate": 1.1638177800969984e-07, + "loss": 0.0752, + "num_input_tokens_seen": 212459536, + "step": 174615 + }, + { + "epoch": 19.447599955451608, + "grad_norm": 1.3003603219985962, + "learning_rate": 1.1614772035027644e-07, + "loss": 0.0329, + "num_input_tokens_seen": 212465808, + "step": 174620 + }, + { + "epoch": 19.448156810335227, + "grad_norm": 0.31630825996398926, + "learning_rate": 1.1591389773877026e-07, + "loss": 0.0825, + "num_input_tokens_seen": 212472016, + "step": 174625 + }, + { + "epoch": 19.448713665218843, + "grad_norm": 0.17787012457847595, + "learning_rate": 1.1568031017739623e-07, + "loss": 0.006, + "num_input_tokens_seen": 212478000, + "step": 174630 + }, + { + "epoch": 19.449270520102463, + "grad_norm": 0.15587164461612701, + "learning_rate": 1.154469576683609e-07, + "loss": 0.0082, + "num_input_tokens_seen": 212483952, + "step": 174635 + }, + { + "epoch": 19.44982737498608, + "grad_norm": 0.21589408814907074, + "learning_rate": 1.152138402138625e-07, + "loss": 0.0037, + "num_input_tokens_seen": 212490288, + "step": 174640 + }, + { + "epoch": 19.450384229869695, + "grad_norm": 0.00020299719471950084, + "learning_rate": 1.1498095781610762e-07, + "loss": 0.0101, + "num_input_tokens_seen": 212496432, + "step": 174645 + }, + { + "epoch": 19.450941084753314, + "grad_norm": 0.0001275737740797922, + "learning_rate": 1.1474831047729728e-07, + "loss": 0.0093, + "num_input_tokens_seen": 212502608, + "step": 174650 + }, + { + "epoch": 19.45149793963693, + "grad_norm": 0.11814593523740768, + "learning_rate": 1.1451589819962693e-07, + "loss": 0.0693, + "num_input_tokens_seen": 212508848, + "step": 174655 + }, + { + "epoch": 19.45205479452055, + "grad_norm": 0.4984751045703888, + "learning_rate": 1.1428372098528927e-07, + "loss": 0.0954, + "num_input_tokens_seen": 212514960, + "step": 174660 + }, + { + "epoch": 19.452611649404165, + "grad_norm": 0.0010618118103593588, + "learning_rate": 1.1405177883647977e-07, + "loss": 0.0412, + "num_input_tokens_seen": 212521200, + "step": 174665 + }, + { + "epoch": 19.45316850428778, + "grad_norm": 0.013068463653326035, + "learning_rate": 1.138200717553911e-07, + "loss": 0.1114, + "num_input_tokens_seen": 212527344, + "step": 174670 + }, + { + "epoch": 19.4537253591714, + "grad_norm": 0.013409423641860485, + "learning_rate": 1.1358859974421043e-07, + "loss": 0.0641, + "num_input_tokens_seen": 212533232, + "step": 174675 + }, + { + "epoch": 19.454282214055016, + "grad_norm": 0.14786073565483093, + "learning_rate": 1.1335736280512488e-07, + "loss": 0.0796, + "num_input_tokens_seen": 212539152, + "step": 174680 + }, + { + "epoch": 19.454839068938636, + "grad_norm": 0.04302085191011429, + "learning_rate": 1.1312636094031604e-07, + "loss": 0.0022, + "num_input_tokens_seen": 212545232, + "step": 174685 + }, + { + "epoch": 19.455395923822252, + "grad_norm": 0.4578135907649994, + "learning_rate": 1.1289559415196826e-07, + "loss": 0.0065, + "num_input_tokens_seen": 212551376, + "step": 174690 + }, + { + "epoch": 19.455952778705868, + "grad_norm": 0.0004365680506452918, + "learning_rate": 1.1266506244226316e-07, + "loss": 0.0738, + "num_input_tokens_seen": 212557552, + "step": 174695 + }, + { + "epoch": 19.456509633589487, + "grad_norm": 0.041696153581142426, + "learning_rate": 1.124347658133712e-07, + "loss": 0.0226, + "num_input_tokens_seen": 212563696, + "step": 174700 + }, + { + "epoch": 19.457066488473103, + "grad_norm": 0.19144763052463531, + "learning_rate": 1.1220470426747676e-07, + "loss": 0.0557, + "num_input_tokens_seen": 212569936, + "step": 174705 + }, + { + "epoch": 19.457623343356723, + "grad_norm": 0.016637051478028297, + "learning_rate": 1.1197487780674476e-07, + "loss": 0.0764, + "num_input_tokens_seen": 212575760, + "step": 174710 + }, + { + "epoch": 19.45818019824034, + "grad_norm": 0.2600686550140381, + "learning_rate": 1.1174528643335402e-07, + "loss": 0.0518, + "num_input_tokens_seen": 212581712, + "step": 174715 + }, + { + "epoch": 19.458737053123954, + "grad_norm": 0.0003026639751624316, + "learning_rate": 1.1151593014946671e-07, + "loss": 0.0332, + "num_input_tokens_seen": 212587856, + "step": 174720 + }, + { + "epoch": 19.459293908007574, + "grad_norm": 0.028180807828903198, + "learning_rate": 1.112868089572533e-07, + "loss": 0.0122, + "num_input_tokens_seen": 212594352, + "step": 174725 + }, + { + "epoch": 19.45985076289119, + "grad_norm": 0.13041137158870697, + "learning_rate": 1.1105792285887595e-07, + "loss": 0.0027, + "num_input_tokens_seen": 212600528, + "step": 174730 + }, + { + "epoch": 19.46040761777481, + "grad_norm": 0.0009567787637934089, + "learning_rate": 1.1082927185649683e-07, + "loss": 0.0522, + "num_input_tokens_seen": 212606480, + "step": 174735 + }, + { + "epoch": 19.460964472658425, + "grad_norm": 0.003601845819503069, + "learning_rate": 1.1060085595227531e-07, + "loss": 0.0036, + "num_input_tokens_seen": 212612368, + "step": 174740 + }, + { + "epoch": 19.46152132754204, + "grad_norm": 0.1906535029411316, + "learning_rate": 1.103726751483708e-07, + "loss": 0.0088, + "num_input_tokens_seen": 212618608, + "step": 174745 + }, + { + "epoch": 19.46207818242566, + "grad_norm": 0.024584773927927017, + "learning_rate": 1.101447294469371e-07, + "loss": 0.0324, + "num_input_tokens_seen": 212624336, + "step": 174750 + }, + { + "epoch": 19.462635037309276, + "grad_norm": 0.04611542820930481, + "learning_rate": 1.0991701885012806e-07, + "loss": 0.0084, + "num_input_tokens_seen": 212630736, + "step": 174755 + }, + { + "epoch": 19.463191892192896, + "grad_norm": 1.7853329181671143, + "learning_rate": 1.0968954336009473e-07, + "loss": 0.0515, + "num_input_tokens_seen": 212636784, + "step": 174760 + }, + { + "epoch": 19.46374874707651, + "grad_norm": 0.0004105621192138642, + "learning_rate": 1.0946230297898541e-07, + "loss": 0.0364, + "num_input_tokens_seen": 212642704, + "step": 174765 + }, + { + "epoch": 19.464305601960127, + "grad_norm": 3.0297482013702393, + "learning_rate": 1.0923529770894558e-07, + "loss": 0.0875, + "num_input_tokens_seen": 212648784, + "step": 174770 + }, + { + "epoch": 19.464862456843747, + "grad_norm": 1.314355731010437, + "learning_rate": 1.0900852755212354e-07, + "loss": 0.0895, + "num_input_tokens_seen": 212655312, + "step": 174775 + }, + { + "epoch": 19.465419311727363, + "grad_norm": 2.677208423614502, + "learning_rate": 1.0878199251065369e-07, + "loss": 0.1103, + "num_input_tokens_seen": 212661712, + "step": 174780 + }, + { + "epoch": 19.465976166610982, + "grad_norm": 0.0028706511948257685, + "learning_rate": 1.085556925866843e-07, + "loss": 0.0007, + "num_input_tokens_seen": 212667536, + "step": 174785 + }, + { + "epoch": 19.466533021494598, + "grad_norm": 2.136357307434082, + "learning_rate": 1.0832962778234701e-07, + "loss": 0.0691, + "num_input_tokens_seen": 212673616, + "step": 174790 + }, + { + "epoch": 19.467089876378218, + "grad_norm": 0.02584274671971798, + "learning_rate": 1.0810379809978177e-07, + "loss": 0.0027, + "num_input_tokens_seen": 212679920, + "step": 174795 + }, + { + "epoch": 19.467646731261834, + "grad_norm": 0.006765368394553661, + "learning_rate": 1.0787820354111467e-07, + "loss": 0.0447, + "num_input_tokens_seen": 212685936, + "step": 174800 + }, + { + "epoch": 19.46820358614545, + "grad_norm": 0.15259312093257904, + "learning_rate": 1.0765284410848565e-07, + "loss": 0.0391, + "num_input_tokens_seen": 212691952, + "step": 174805 + }, + { + "epoch": 19.46876044102907, + "grad_norm": 1.747632384300232, + "learning_rate": 1.0742771980401801e-07, + "loss": 0.0763, + "num_input_tokens_seen": 212698192, + "step": 174810 + }, + { + "epoch": 19.469317295912685, + "grad_norm": 0.283942848443985, + "learning_rate": 1.0720283062983782e-07, + "loss": 0.0104, + "num_input_tokens_seen": 212703856, + "step": 174815 + }, + { + "epoch": 19.4698741507963, + "grad_norm": 0.630915105342865, + "learning_rate": 1.0697817658807119e-07, + "loss": 0.0097, + "num_input_tokens_seen": 212710064, + "step": 174820 + }, + { + "epoch": 19.47043100567992, + "grad_norm": 4.561835765838623, + "learning_rate": 1.0675375768083862e-07, + "loss": 0.0632, + "num_input_tokens_seen": 212716144, + "step": 174825 + }, + { + "epoch": 19.470987860563536, + "grad_norm": 0.6369342803955078, + "learning_rate": 1.0652957391026064e-07, + "loss": 0.0129, + "num_input_tokens_seen": 212722000, + "step": 174830 + }, + { + "epoch": 19.471544715447155, + "grad_norm": 0.16369210183620453, + "learning_rate": 1.0630562527845778e-07, + "loss": 0.0546, + "num_input_tokens_seen": 212728208, + "step": 174835 + }, + { + "epoch": 19.47210157033077, + "grad_norm": 0.9734081029891968, + "learning_rate": 1.0608191178754223e-07, + "loss": 0.0907, + "num_input_tokens_seen": 212734480, + "step": 174840 + }, + { + "epoch": 19.47265842521439, + "grad_norm": 0.9544011950492859, + "learning_rate": 1.0585843343962621e-07, + "loss": 0.0357, + "num_input_tokens_seen": 212740656, + "step": 174845 + }, + { + "epoch": 19.473215280098007, + "grad_norm": 0.15443754196166992, + "learning_rate": 1.056351902368219e-07, + "loss": 0.0195, + "num_input_tokens_seen": 212746768, + "step": 174850 + }, + { + "epoch": 19.473772134981623, + "grad_norm": 0.021618416532874107, + "learning_rate": 1.0541218218123872e-07, + "loss": 0.0487, + "num_input_tokens_seen": 212752752, + "step": 174855 + }, + { + "epoch": 19.474328989865242, + "grad_norm": 0.01415413711220026, + "learning_rate": 1.0518940927498333e-07, + "loss": 0.0051, + "num_input_tokens_seen": 212758928, + "step": 174860 + }, + { + "epoch": 19.474885844748858, + "grad_norm": 0.4356295168399811, + "learning_rate": 1.0496687152015961e-07, + "loss": 0.0216, + "num_input_tokens_seen": 212764752, + "step": 174865 + }, + { + "epoch": 19.475442699632477, + "grad_norm": 0.021539999172091484, + "learning_rate": 1.0474456891886863e-07, + "loss": 0.0206, + "num_input_tokens_seen": 212770896, + "step": 174870 + }, + { + "epoch": 19.475999554516093, + "grad_norm": 0.0003571481502149254, + "learning_rate": 1.0452250147321152e-07, + "loss": 0.0331, + "num_input_tokens_seen": 212777200, + "step": 174875 + }, + { + "epoch": 19.47655640939971, + "grad_norm": 0.007079604547470808, + "learning_rate": 1.043006691852838e-07, + "loss": 0.1169, + "num_input_tokens_seen": 212783024, + "step": 174880 + }, + { + "epoch": 19.47711326428333, + "grad_norm": 0.006583824288100004, + "learning_rate": 1.0407907205718381e-07, + "loss": 0.002, + "num_input_tokens_seen": 212789360, + "step": 174885 + }, + { + "epoch": 19.477670119166945, + "grad_norm": 0.0117545360699296, + "learning_rate": 1.0385771009100432e-07, + "loss": 0.0667, + "num_input_tokens_seen": 212795696, + "step": 174890 + }, + { + "epoch": 19.478226974050564, + "grad_norm": 0.8825770616531372, + "learning_rate": 1.0363658328883252e-07, + "loss": 0.0121, + "num_input_tokens_seen": 212801712, + "step": 174895 + }, + { + "epoch": 19.47878382893418, + "grad_norm": 0.001093168742954731, + "learning_rate": 1.0341569165276121e-07, + "loss": 0.0095, + "num_input_tokens_seen": 212807920, + "step": 174900 + }, + { + "epoch": 19.479340683817796, + "grad_norm": 0.00031733192736282945, + "learning_rate": 1.0319503518487483e-07, + "loss": 0.0031, + "num_input_tokens_seen": 212814256, + "step": 174905 + }, + { + "epoch": 19.479897538701415, + "grad_norm": 0.0778631865978241, + "learning_rate": 1.029746138872606e-07, + "loss": 0.0115, + "num_input_tokens_seen": 212819984, + "step": 174910 + }, + { + "epoch": 19.48045439358503, + "grad_norm": 0.0684080421924591, + "learning_rate": 1.027544277619974e-07, + "loss": 0.0228, + "num_input_tokens_seen": 212826000, + "step": 174915 + }, + { + "epoch": 19.48101124846865, + "grad_norm": 0.004469567909836769, + "learning_rate": 1.0253447681116412e-07, + "loss": 0.0006, + "num_input_tokens_seen": 212832048, + "step": 174920 + }, + { + "epoch": 19.481568103352267, + "grad_norm": 0.00036748734419234097, + "learning_rate": 1.0231476103684246e-07, + "loss": 0.0445, + "num_input_tokens_seen": 212838032, + "step": 174925 + }, + { + "epoch": 19.482124958235882, + "grad_norm": 0.09775517880916595, + "learning_rate": 1.0209528044110294e-07, + "loss": 0.0064, + "num_input_tokens_seen": 212844624, + "step": 174930 + }, + { + "epoch": 19.482681813119502, + "grad_norm": 0.017738206312060356, + "learning_rate": 1.0187603502602449e-07, + "loss": 0.0272, + "num_input_tokens_seen": 212850736, + "step": 174935 + }, + { + "epoch": 19.483238668003118, + "grad_norm": 0.1254841387271881, + "learning_rate": 1.0165702479367212e-07, + "loss": 0.0362, + "num_input_tokens_seen": 212856272, + "step": 174940 + }, + { + "epoch": 19.483795522886737, + "grad_norm": 0.011859564110636711, + "learning_rate": 1.0143824974611915e-07, + "loss": 0.0823, + "num_input_tokens_seen": 212862096, + "step": 174945 + }, + { + "epoch": 19.484352377770353, + "grad_norm": 0.00012085712660336867, + "learning_rate": 1.0121970988543061e-07, + "loss": 0.0019, + "num_input_tokens_seen": 212868080, + "step": 174950 + }, + { + "epoch": 19.48490923265397, + "grad_norm": 0.34440916776657104, + "learning_rate": 1.0100140521366874e-07, + "loss": 0.0847, + "num_input_tokens_seen": 212874480, + "step": 174955 + }, + { + "epoch": 19.48546608753759, + "grad_norm": 0.00024994442355819046, + "learning_rate": 1.0078333573289855e-07, + "loss": 0.0025, + "num_input_tokens_seen": 212880912, + "step": 174960 + }, + { + "epoch": 19.486022942421204, + "grad_norm": 0.13246086239814758, + "learning_rate": 1.0056550144517674e-07, + "loss": 0.0096, + "num_input_tokens_seen": 212887024, + "step": 174965 + }, + { + "epoch": 19.486579797304824, + "grad_norm": 0.006648743525147438, + "learning_rate": 1.0034790235256552e-07, + "loss": 0.0044, + "num_input_tokens_seen": 212893264, + "step": 174970 + }, + { + "epoch": 19.48713665218844, + "grad_norm": 4.348289489746094, + "learning_rate": 1.0013053845711606e-07, + "loss": 0.1175, + "num_input_tokens_seen": 212899600, + "step": 174975 + }, + { + "epoch": 19.487693507072056, + "grad_norm": 0.5526319742202759, + "learning_rate": 9.991340976088503e-08, + "loss": 0.0729, + "num_input_tokens_seen": 212905488, + "step": 174980 + }, + { + "epoch": 19.488250361955675, + "grad_norm": 0.07143232971429825, + "learning_rate": 9.969651626591803e-08, + "loss": 0.0046, + "num_input_tokens_seen": 212911504, + "step": 174985 + }, + { + "epoch": 19.48880721683929, + "grad_norm": 0.06217920780181885, + "learning_rate": 9.947985797427173e-08, + "loss": 0.0034, + "num_input_tokens_seen": 212917616, + "step": 174990 + }, + { + "epoch": 19.48936407172291, + "grad_norm": 0.5526428818702698, + "learning_rate": 9.92634348879834e-08, + "loss": 0.0175, + "num_input_tokens_seen": 212923664, + "step": 174995 + }, + { + "epoch": 19.489920926606526, + "grad_norm": 0.03884949907660484, + "learning_rate": 9.904724700910417e-08, + "loss": 0.0513, + "num_input_tokens_seen": 212929776, + "step": 175000 + }, + { + "epoch": 19.490477781490142, + "grad_norm": 1.3067364692687988, + "learning_rate": 9.883129433967686e-08, + "loss": 0.0563, + "num_input_tokens_seen": 212936144, + "step": 175005 + }, + { + "epoch": 19.49103463637376, + "grad_norm": 0.0668465793132782, + "learning_rate": 9.861557688173595e-08, + "loss": 0.0093, + "num_input_tokens_seen": 212942320, + "step": 175010 + }, + { + "epoch": 19.491591491257378, + "grad_norm": 0.18124808371067047, + "learning_rate": 9.84000946373187e-08, + "loss": 0.0339, + "num_input_tokens_seen": 212948624, + "step": 175015 + }, + { + "epoch": 19.492148346140997, + "grad_norm": 2.646181583404541, + "learning_rate": 9.818484760846791e-08, + "loss": 0.1009, + "num_input_tokens_seen": 212954320, + "step": 175020 + }, + { + "epoch": 19.492705201024613, + "grad_norm": 0.00977886002510786, + "learning_rate": 9.796983579720975e-08, + "loss": 0.0273, + "num_input_tokens_seen": 212960624, + "step": 175025 + }, + { + "epoch": 19.49326205590823, + "grad_norm": 2.494748592376709, + "learning_rate": 9.77550592055787e-08, + "loss": 0.068, + "num_input_tokens_seen": 212966608, + "step": 175030 + }, + { + "epoch": 19.49381891079185, + "grad_norm": 1.486476182937622, + "learning_rate": 9.754051783560092e-08, + "loss": 0.0543, + "num_input_tokens_seen": 212972976, + "step": 175035 + }, + { + "epoch": 19.494375765675464, + "grad_norm": 0.011319007724523544, + "learning_rate": 9.732621168930533e-08, + "loss": 0.007, + "num_input_tokens_seen": 212979280, + "step": 175040 + }, + { + "epoch": 19.494932620559084, + "grad_norm": 0.126164972782135, + "learning_rate": 9.711214076871534e-08, + "loss": 0.0203, + "num_input_tokens_seen": 212985232, + "step": 175045 + }, + { + "epoch": 19.4954894754427, + "grad_norm": 0.009137352928519249, + "learning_rate": 9.68983050758543e-08, + "loss": 0.0158, + "num_input_tokens_seen": 212991280, + "step": 175050 + }, + { + "epoch": 19.496046330326315, + "grad_norm": 0.000997709110379219, + "learning_rate": 9.668470461274004e-08, + "loss": 0.086, + "num_input_tokens_seen": 212997392, + "step": 175055 + }, + { + "epoch": 19.496603185209935, + "grad_norm": 1.6988924741744995, + "learning_rate": 9.647133938139042e-08, + "loss": 0.0664, + "num_input_tokens_seen": 213003312, + "step": 175060 + }, + { + "epoch": 19.49716004009355, + "grad_norm": 3.2049548625946045, + "learning_rate": 9.625820938382046e-08, + "loss": 0.0275, + "num_input_tokens_seen": 213009712, + "step": 175065 + }, + { + "epoch": 19.49771689497717, + "grad_norm": 0.043627455830574036, + "learning_rate": 9.604531462204802e-08, + "loss": 0.002, + "num_input_tokens_seen": 213016048, + "step": 175070 + }, + { + "epoch": 19.498273749860786, + "grad_norm": 0.7583791017532349, + "learning_rate": 9.583265509807705e-08, + "loss": 0.048, + "num_input_tokens_seen": 213022000, + "step": 175075 + }, + { + "epoch": 19.498830604744402, + "grad_norm": 0.07774844020605087, + "learning_rate": 9.56202308139198e-08, + "loss": 0.1079, + "num_input_tokens_seen": 213028528, + "step": 175080 + }, + { + "epoch": 19.49938745962802, + "grad_norm": 0.023988084867596626, + "learning_rate": 9.540804177158302e-08, + "loss": 0.0327, + "num_input_tokens_seen": 213034768, + "step": 175085 + }, + { + "epoch": 19.499944314511637, + "grad_norm": 0.22200465202331543, + "learning_rate": 9.519608797307067e-08, + "loss": 0.0043, + "num_input_tokens_seen": 213041296, + "step": 175090 + }, + { + "epoch": 19.500501169395257, + "grad_norm": 2.5057427883148193, + "learning_rate": 9.498436942038391e-08, + "loss": 0.0743, + "num_input_tokens_seen": 213047632, + "step": 175095 + }, + { + "epoch": 19.501058024278873, + "grad_norm": 1.4257582426071167, + "learning_rate": 9.477288611552393e-08, + "loss": 0.0455, + "num_input_tokens_seen": 213053808, + "step": 175100 + }, + { + "epoch": 19.50161487916249, + "grad_norm": 0.04696323350071907, + "learning_rate": 9.456163806048912e-08, + "loss": 0.02, + "num_input_tokens_seen": 213059888, + "step": 175105 + }, + { + "epoch": 19.502171734046108, + "grad_norm": 0.002309588249772787, + "learning_rate": 9.435062525727235e-08, + "loss": 0.0018, + "num_input_tokens_seen": 213066096, + "step": 175110 + }, + { + "epoch": 19.502728588929724, + "grad_norm": 0.003747127251699567, + "learning_rate": 9.413984770786644e-08, + "loss": 0.0814, + "num_input_tokens_seen": 213072240, + "step": 175115 + }, + { + "epoch": 19.503285443813343, + "grad_norm": 2.2929742336273193, + "learning_rate": 9.392930541426425e-08, + "loss": 0.2066, + "num_input_tokens_seen": 213078064, + "step": 175120 + }, + { + "epoch": 19.50384229869696, + "grad_norm": 1.0759154558181763, + "learning_rate": 9.371899837845588e-08, + "loss": 0.0213, + "num_input_tokens_seen": 213084208, + "step": 175125 + }, + { + "epoch": 19.504399153580575, + "grad_norm": 0.6497594714164734, + "learning_rate": 9.350892660242582e-08, + "loss": 0.021, + "num_input_tokens_seen": 213089584, + "step": 175130 + }, + { + "epoch": 19.504956008464195, + "grad_norm": 0.055115241557359695, + "learning_rate": 9.329909008815862e-08, + "loss": 0.048, + "num_input_tokens_seen": 213095888, + "step": 175135 + }, + { + "epoch": 19.50551286334781, + "grad_norm": 0.062228742986917496, + "learning_rate": 9.308948883763602e-08, + "loss": 0.1021, + "num_input_tokens_seen": 213101936, + "step": 175140 + }, + { + "epoch": 19.50606971823143, + "grad_norm": 0.021876072511076927, + "learning_rate": 9.288012285283976e-08, + "loss": 0.0006, + "num_input_tokens_seen": 213107952, + "step": 175145 + }, + { + "epoch": 19.506626573115046, + "grad_norm": 0.007715430110692978, + "learning_rate": 9.267099213574326e-08, + "loss": 0.0047, + "num_input_tokens_seen": 213114192, + "step": 175150 + }, + { + "epoch": 19.507183427998662, + "grad_norm": 0.48657339811325073, + "learning_rate": 9.246209668832551e-08, + "loss": 0.0146, + "num_input_tokens_seen": 213120272, + "step": 175155 + }, + { + "epoch": 19.50774028288228, + "grad_norm": 0.002611669013276696, + "learning_rate": 9.22534365125599e-08, + "loss": 0.0136, + "num_input_tokens_seen": 213126640, + "step": 175160 + }, + { + "epoch": 19.508297137765897, + "grad_norm": 0.025570284575223923, + "learning_rate": 9.204501161041711e-08, + "loss": 0.0107, + "num_input_tokens_seen": 213132496, + "step": 175165 + }, + { + "epoch": 19.508853992649517, + "grad_norm": 0.004703671205788851, + "learning_rate": 9.183682198386501e-08, + "loss": 0.0192, + "num_input_tokens_seen": 213138800, + "step": 175170 + }, + { + "epoch": 19.509410847533132, + "grad_norm": 1.127587914466858, + "learning_rate": 9.162886763486588e-08, + "loss": 0.0469, + "num_input_tokens_seen": 213145008, + "step": 175175 + }, + { + "epoch": 19.509967702416752, + "grad_norm": 1.1884727478027344, + "learning_rate": 9.142114856539318e-08, + "loss": 0.048, + "num_input_tokens_seen": 213151184, + "step": 175180 + }, + { + "epoch": 19.510524557300368, + "grad_norm": 0.10699497908353806, + "learning_rate": 9.12136647774009e-08, + "loss": 0.0552, + "num_input_tokens_seen": 213157040, + "step": 175185 + }, + { + "epoch": 19.511081412183984, + "grad_norm": 0.13393625617027283, + "learning_rate": 9.100641627285412e-08, + "loss": 0.0029, + "num_input_tokens_seen": 213163152, + "step": 175190 + }, + { + "epoch": 19.511638267067603, + "grad_norm": 0.07253597676753998, + "learning_rate": 9.07994030537096e-08, + "loss": 0.0196, + "num_input_tokens_seen": 213169424, + "step": 175195 + }, + { + "epoch": 19.51219512195122, + "grad_norm": 0.07084645330905914, + "learning_rate": 9.059262512191857e-08, + "loss": 0.0066, + "num_input_tokens_seen": 213175568, + "step": 175200 + }, + { + "epoch": 19.51275197683484, + "grad_norm": 0.15356767177581787, + "learning_rate": 9.038608247944059e-08, + "loss": 0.0389, + "num_input_tokens_seen": 213181552, + "step": 175205 + }, + { + "epoch": 19.513308831718454, + "grad_norm": 1.1081444025039673, + "learning_rate": 9.017977512822129e-08, + "loss": 0.1017, + "num_input_tokens_seen": 213187760, + "step": 175210 + }, + { + "epoch": 19.51386568660207, + "grad_norm": 0.14662151038646698, + "learning_rate": 8.997370307021191e-08, + "loss": 0.0736, + "num_input_tokens_seen": 213193744, + "step": 175215 + }, + { + "epoch": 19.51442254148569, + "grad_norm": 0.24162432551383972, + "learning_rate": 8.976786630735811e-08, + "loss": 0.0617, + "num_input_tokens_seen": 213199664, + "step": 175220 + }, + { + "epoch": 19.514979396369306, + "grad_norm": 0.003969577141106129, + "learning_rate": 8.956226484160557e-08, + "loss": 0.0387, + "num_input_tokens_seen": 213205872, + "step": 175225 + }, + { + "epoch": 19.515536251252925, + "grad_norm": 2.36199688911438, + "learning_rate": 8.935689867489438e-08, + "loss": 0.0317, + "num_input_tokens_seen": 213211824, + "step": 175230 + }, + { + "epoch": 19.51609310613654, + "grad_norm": 0.30610185861587524, + "learning_rate": 8.915176780916468e-08, + "loss": 0.0173, + "num_input_tokens_seen": 213217872, + "step": 175235 + }, + { + "epoch": 19.516649961020157, + "grad_norm": 0.0031042618211358786, + "learning_rate": 8.894687224635378e-08, + "loss": 0.0841, + "num_input_tokens_seen": 213224144, + "step": 175240 + }, + { + "epoch": 19.517206815903776, + "grad_norm": 0.019314728677272797, + "learning_rate": 8.874221198840182e-08, + "loss": 0.001, + "num_input_tokens_seen": 213230288, + "step": 175245 + }, + { + "epoch": 19.517763670787392, + "grad_norm": 0.00019796937704086304, + "learning_rate": 8.853778703723503e-08, + "loss": 0.0121, + "num_input_tokens_seen": 213236752, + "step": 175250 + }, + { + "epoch": 19.51832052567101, + "grad_norm": 0.0005477660452015698, + "learning_rate": 8.833359739479075e-08, + "loss": 0.0093, + "num_input_tokens_seen": 213243024, + "step": 175255 + }, + { + "epoch": 19.518877380554628, + "grad_norm": 0.0005118926055729389, + "learning_rate": 8.812964306299244e-08, + "loss": 0.017, + "num_input_tokens_seen": 213248752, + "step": 175260 + }, + { + "epoch": 19.519434235438244, + "grad_norm": 0.035653986036777496, + "learning_rate": 8.79259240437691e-08, + "loss": 0.0171, + "num_input_tokens_seen": 213254544, + "step": 175265 + }, + { + "epoch": 19.519991090321863, + "grad_norm": 0.06655050069093704, + "learning_rate": 8.77224403390442e-08, + "loss": 0.0043, + "num_input_tokens_seen": 213260784, + "step": 175270 + }, + { + "epoch": 19.52054794520548, + "grad_norm": 0.03508344292640686, + "learning_rate": 8.751919195074121e-08, + "loss": 0.0153, + "num_input_tokens_seen": 213267248, + "step": 175275 + }, + { + "epoch": 19.5211048000891, + "grad_norm": 0.31405478715896606, + "learning_rate": 8.731617888077526e-08, + "loss": 0.016, + "num_input_tokens_seen": 213273680, + "step": 175280 + }, + { + "epoch": 19.521661654972714, + "grad_norm": 0.5756339430809021, + "learning_rate": 8.711340113107258e-08, + "loss": 0.0816, + "num_input_tokens_seen": 213279856, + "step": 175285 + }, + { + "epoch": 19.52221850985633, + "grad_norm": 0.13217546045780182, + "learning_rate": 8.691085870354276e-08, + "loss": 0.0073, + "num_input_tokens_seen": 213286256, + "step": 175290 + }, + { + "epoch": 19.52277536473995, + "grad_norm": 0.002992327092215419, + "learning_rate": 8.670855160009816e-08, + "loss": 0.093, + "num_input_tokens_seen": 213292240, + "step": 175295 + }, + { + "epoch": 19.523332219623565, + "grad_norm": 0.4914257228374481, + "learning_rate": 8.650647982265114e-08, + "loss": 0.0087, + "num_input_tokens_seen": 213298416, + "step": 175300 + }, + { + "epoch": 19.523889074507185, + "grad_norm": 0.011736290529370308, + "learning_rate": 8.630464337311128e-08, + "loss": 0.0567, + "num_input_tokens_seen": 213304592, + "step": 175305 + }, + { + "epoch": 19.5244459293908, + "grad_norm": 0.24902239441871643, + "learning_rate": 8.610304225338539e-08, + "loss": 0.052, + "num_input_tokens_seen": 213310608, + "step": 175310 + }, + { + "epoch": 19.525002784274417, + "grad_norm": 0.8470370769500732, + "learning_rate": 8.59016764653775e-08, + "loss": 0.0093, + "num_input_tokens_seen": 213316720, + "step": 175315 + }, + { + "epoch": 19.525559639158036, + "grad_norm": 0.0014351783320307732, + "learning_rate": 8.570054601098886e-08, + "loss": 0.0068, + "num_input_tokens_seen": 213322736, + "step": 175320 + }, + { + "epoch": 19.526116494041652, + "grad_norm": 0.7564605474472046, + "learning_rate": 8.549965089211798e-08, + "loss": 0.1106, + "num_input_tokens_seen": 213329136, + "step": 175325 + }, + { + "epoch": 19.52667334892527, + "grad_norm": 0.00047025797539390624, + "learning_rate": 8.529899111066608e-08, + "loss": 0.0804, + "num_input_tokens_seen": 213334960, + "step": 175330 + }, + { + "epoch": 19.527230203808887, + "grad_norm": 0.015651434659957886, + "learning_rate": 8.509856666852611e-08, + "loss": 0.1032, + "num_input_tokens_seen": 213340976, + "step": 175335 + }, + { + "epoch": 19.527787058692503, + "grad_norm": 0.13119491934776306, + "learning_rate": 8.489837756759101e-08, + "loss": 0.0125, + "num_input_tokens_seen": 213346992, + "step": 175340 + }, + { + "epoch": 19.528343913576123, + "grad_norm": 0.003194274613633752, + "learning_rate": 8.469842380975368e-08, + "loss": 0.0143, + "num_input_tokens_seen": 213353072, + "step": 175345 + }, + { + "epoch": 19.52890076845974, + "grad_norm": 0.0002512486244086176, + "learning_rate": 8.449870539689875e-08, + "loss": 0.021, + "num_input_tokens_seen": 213359568, + "step": 175350 + }, + { + "epoch": 19.529457623343358, + "grad_norm": 0.5567751526832581, + "learning_rate": 8.429922233091636e-08, + "loss": 0.0075, + "num_input_tokens_seen": 213366000, + "step": 175355 + }, + { + "epoch": 19.530014478226974, + "grad_norm": 0.04651211202144623, + "learning_rate": 8.409997461369112e-08, + "loss": 0.0047, + "num_input_tokens_seen": 213372048, + "step": 175360 + }, + { + "epoch": 19.53057133311059, + "grad_norm": 0.72078937292099, + "learning_rate": 8.390096224710487e-08, + "loss": 0.0155, + "num_input_tokens_seen": 213378384, + "step": 175365 + }, + { + "epoch": 19.53112818799421, + "grad_norm": 0.14937013387680054, + "learning_rate": 8.370218523303386e-08, + "loss": 0.0195, + "num_input_tokens_seen": 213384496, + "step": 175370 + }, + { + "epoch": 19.531685042877825, + "grad_norm": 0.0010790169471874833, + "learning_rate": 8.350364357335993e-08, + "loss": 0.0139, + "num_input_tokens_seen": 213390448, + "step": 175375 + }, + { + "epoch": 19.532241897761445, + "grad_norm": 0.11484099179506302, + "learning_rate": 8.33053372699566e-08, + "loss": 0.0187, + "num_input_tokens_seen": 213396688, + "step": 175380 + }, + { + "epoch": 19.53279875264506, + "grad_norm": 0.008923470042645931, + "learning_rate": 8.310726632469734e-08, + "loss": 0.0024, + "num_input_tokens_seen": 213402928, + "step": 175385 + }, + { + "epoch": 19.533355607528677, + "grad_norm": 0.0001741087471600622, + "learning_rate": 8.29094307394529e-08, + "loss": 0.0091, + "num_input_tokens_seen": 213408944, + "step": 175390 + }, + { + "epoch": 19.533912462412296, + "grad_norm": 0.49476584792137146, + "learning_rate": 8.2711830516094e-08, + "loss": 0.0041, + "num_input_tokens_seen": 213415088, + "step": 175395 + }, + { + "epoch": 19.534469317295912, + "grad_norm": 0.02792472019791603, + "learning_rate": 8.251446565648303e-08, + "loss": 0.0215, + "num_input_tokens_seen": 213421168, + "step": 175400 + }, + { + "epoch": 19.53502617217953, + "grad_norm": 0.0006182205979712307, + "learning_rate": 8.231733616248516e-08, + "loss": 0.0007, + "num_input_tokens_seen": 213427312, + "step": 175405 + }, + { + "epoch": 19.535583027063147, + "grad_norm": 0.0013330609071999788, + "learning_rate": 8.212044203596559e-08, + "loss": 0.0482, + "num_input_tokens_seen": 213433264, + "step": 175410 + }, + { + "epoch": 19.536139881946763, + "grad_norm": 0.004146179184317589, + "learning_rate": 8.192378327878392e-08, + "loss": 0.0894, + "num_input_tokens_seen": 213439216, + "step": 175415 + }, + { + "epoch": 19.536696736830383, + "grad_norm": 0.2517470419406891, + "learning_rate": 8.172735989279423e-08, + "loss": 0.0204, + "num_input_tokens_seen": 213445264, + "step": 175420 + }, + { + "epoch": 19.537253591714, + "grad_norm": 0.08787254244089127, + "learning_rate": 8.153117187985337e-08, + "loss": 0.0703, + "num_input_tokens_seen": 213451376, + "step": 175425 + }, + { + "epoch": 19.537810446597618, + "grad_norm": 0.13508139550685883, + "learning_rate": 8.133521924181542e-08, + "loss": 0.0067, + "num_input_tokens_seen": 213457616, + "step": 175430 + }, + { + "epoch": 19.538367301481234, + "grad_norm": 0.34445154666900635, + "learning_rate": 8.113950198053167e-08, + "loss": 0.1066, + "num_input_tokens_seen": 213463792, + "step": 175435 + }, + { + "epoch": 19.53892415636485, + "grad_norm": 0.0007717859116382897, + "learning_rate": 8.094402009784785e-08, + "loss": 0.0265, + "num_input_tokens_seen": 213470160, + "step": 175440 + }, + { + "epoch": 19.53948101124847, + "grad_norm": 0.006048997864127159, + "learning_rate": 8.074877359561528e-08, + "loss": 0.0028, + "num_input_tokens_seen": 213476400, + "step": 175445 + }, + { + "epoch": 19.540037866132085, + "grad_norm": 0.00016279284318443388, + "learning_rate": 8.055376247567415e-08, + "loss": 0.0048, + "num_input_tokens_seen": 213482608, + "step": 175450 + }, + { + "epoch": 19.540594721015704, + "grad_norm": 0.00018939927394967526, + "learning_rate": 8.035898673986741e-08, + "loss": 0.1319, + "num_input_tokens_seen": 213488528, + "step": 175455 + }, + { + "epoch": 19.54115157589932, + "grad_norm": 8.563903975300491e-05, + "learning_rate": 8.016444639003529e-08, + "loss": 0.0134, + "num_input_tokens_seen": 213493968, + "step": 175460 + }, + { + "epoch": 19.541708430782936, + "grad_norm": 0.006039277650415897, + "learning_rate": 7.997014142801795e-08, + "loss": 0.0294, + "num_input_tokens_seen": 213499888, + "step": 175465 + }, + { + "epoch": 19.542265285666556, + "grad_norm": 0.002088721841573715, + "learning_rate": 7.977607185564451e-08, + "loss": 0.0208, + "num_input_tokens_seen": 213505488, + "step": 175470 + }, + { + "epoch": 19.54282214055017, + "grad_norm": 0.030525943264365196, + "learning_rate": 7.958223767475514e-08, + "loss": 0.0005, + "num_input_tokens_seen": 213511632, + "step": 175475 + }, + { + "epoch": 19.54337899543379, + "grad_norm": 0.390730619430542, + "learning_rate": 7.938863888717618e-08, + "loss": 0.0249, + "num_input_tokens_seen": 213517968, + "step": 175480 + }, + { + "epoch": 19.543935850317407, + "grad_norm": 0.007062618155032396, + "learning_rate": 7.91952754947367e-08, + "loss": 0.015, + "num_input_tokens_seen": 213523824, + "step": 175485 + }, + { + "epoch": 19.544492705201023, + "grad_norm": 2.8239784240722656, + "learning_rate": 7.900214749926304e-08, + "loss": 0.0647, + "num_input_tokens_seen": 213529744, + "step": 175490 + }, + { + "epoch": 19.545049560084642, + "grad_norm": 0.0005260013276711106, + "learning_rate": 7.880925490258151e-08, + "loss": 0.004, + "num_input_tokens_seen": 213535792, + "step": 175495 + }, + { + "epoch": 19.54560641496826, + "grad_norm": 0.36997565627098083, + "learning_rate": 7.861659770651564e-08, + "loss": 0.0408, + "num_input_tokens_seen": 213542000, + "step": 175500 + }, + { + "epoch": 19.546163269851878, + "grad_norm": 0.008319905959069729, + "learning_rate": 7.842417591288065e-08, + "loss": 0.0404, + "num_input_tokens_seen": 213547984, + "step": 175505 + }, + { + "epoch": 19.546720124735494, + "grad_norm": 1.7771461009979248, + "learning_rate": 7.823198952349453e-08, + "loss": 0.0674, + "num_input_tokens_seen": 213553520, + "step": 175510 + }, + { + "epoch": 19.547276979619113, + "grad_norm": 0.7960441708564758, + "learning_rate": 7.804003854017805e-08, + "loss": 0.0352, + "num_input_tokens_seen": 213559504, + "step": 175515 + }, + { + "epoch": 19.54783383450273, + "grad_norm": 2.1316792964935303, + "learning_rate": 7.78483229647381e-08, + "loss": 0.0426, + "num_input_tokens_seen": 213565296, + "step": 175520 + }, + { + "epoch": 19.548390689386345, + "grad_norm": 0.5922355651855469, + "learning_rate": 7.765684279898711e-08, + "loss": 0.0099, + "num_input_tokens_seen": 213571472, + "step": 175525 + }, + { + "epoch": 19.548947544269964, + "grad_norm": 1.9191014766693115, + "learning_rate": 7.746559804473753e-08, + "loss": 0.1455, + "num_input_tokens_seen": 213576848, + "step": 175530 + }, + { + "epoch": 19.54950439915358, + "grad_norm": 0.021815458312630653, + "learning_rate": 7.727458870379067e-08, + "loss": 0.0131, + "num_input_tokens_seen": 213582960, + "step": 175535 + }, + { + "epoch": 19.550061254037196, + "grad_norm": 0.5217623114585876, + "learning_rate": 7.708381477795346e-08, + "loss": 0.0213, + "num_input_tokens_seen": 213588560, + "step": 175540 + }, + { + "epoch": 19.550618108920816, + "grad_norm": 0.28197893500328064, + "learning_rate": 7.68932762690272e-08, + "loss": 0.1358, + "num_input_tokens_seen": 213594576, + "step": 175545 + }, + { + "epoch": 19.55117496380443, + "grad_norm": 1.9547488689422607, + "learning_rate": 7.670297317881325e-08, + "loss": 0.077, + "num_input_tokens_seen": 213600976, + "step": 175550 + }, + { + "epoch": 19.55173181868805, + "grad_norm": 0.250621497631073, + "learning_rate": 7.65129055091074e-08, + "loss": 0.0061, + "num_input_tokens_seen": 213607216, + "step": 175555 + }, + { + "epoch": 19.552288673571667, + "grad_norm": 0.32740482687950134, + "learning_rate": 7.63230732617054e-08, + "loss": 0.013, + "num_input_tokens_seen": 213613424, + "step": 175560 + }, + { + "epoch": 19.552845528455286, + "grad_norm": 0.240581676363945, + "learning_rate": 7.61334764384003e-08, + "loss": 0.1061, + "num_input_tokens_seen": 213619056, + "step": 175565 + }, + { + "epoch": 19.553402383338902, + "grad_norm": 0.017236124724149704, + "learning_rate": 7.594411504098231e-08, + "loss": 0.0223, + "num_input_tokens_seen": 213624976, + "step": 175570 + }, + { + "epoch": 19.553959238222518, + "grad_norm": 1.1389957666397095, + "learning_rate": 7.575498907124445e-08, + "loss": 0.1419, + "num_input_tokens_seen": 213630864, + "step": 175575 + }, + { + "epoch": 19.554516093106137, + "grad_norm": 0.0004352623946033418, + "learning_rate": 7.556609853096586e-08, + "loss": 0.0566, + "num_input_tokens_seen": 213637200, + "step": 175580 + }, + { + "epoch": 19.555072947989753, + "grad_norm": 0.11871124804019928, + "learning_rate": 7.537744342193675e-08, + "loss": 0.0084, + "num_input_tokens_seen": 213643344, + "step": 175585 + }, + { + "epoch": 19.555629802873373, + "grad_norm": 0.22381697595119476, + "learning_rate": 7.518902374593629e-08, + "loss": 0.0205, + "num_input_tokens_seen": 213649296, + "step": 175590 + }, + { + "epoch": 19.55618665775699, + "grad_norm": 0.032149434089660645, + "learning_rate": 7.500083950474357e-08, + "loss": 0.0604, + "num_input_tokens_seen": 213655312, + "step": 175595 + }, + { + "epoch": 19.556743512640605, + "grad_norm": 0.02157505229115486, + "learning_rate": 7.481289070014053e-08, + "loss": 0.004, + "num_input_tokens_seen": 213661200, + "step": 175600 + }, + { + "epoch": 19.557300367524224, + "grad_norm": 0.006583079230040312, + "learning_rate": 7.462517733389795e-08, + "loss": 0.0089, + "num_input_tokens_seen": 213667440, + "step": 175605 + }, + { + "epoch": 19.55785722240784, + "grad_norm": 0.07598913460969925, + "learning_rate": 7.443769940778944e-08, + "loss": 0.0079, + "num_input_tokens_seen": 213673008, + "step": 175610 + }, + { + "epoch": 19.55841407729146, + "grad_norm": 0.11622578650712967, + "learning_rate": 7.425045692358856e-08, + "loss": 0.0993, + "num_input_tokens_seen": 213678928, + "step": 175615 + }, + { + "epoch": 19.558970932175075, + "grad_norm": 0.017964845523238182, + "learning_rate": 7.406344988306057e-08, + "loss": 0.0211, + "num_input_tokens_seen": 213684784, + "step": 175620 + }, + { + "epoch": 19.55952778705869, + "grad_norm": 0.9564290642738342, + "learning_rate": 7.387667828797629e-08, + "loss": 0.0477, + "num_input_tokens_seen": 213690384, + "step": 175625 + }, + { + "epoch": 19.56008464194231, + "grad_norm": 0.00012490341032389551, + "learning_rate": 7.369014214009262e-08, + "loss": 0.0554, + "num_input_tokens_seen": 213696880, + "step": 175630 + }, + { + "epoch": 19.560641496825927, + "grad_norm": 0.01134216133505106, + "learning_rate": 7.350384144118039e-08, + "loss": 0.0149, + "num_input_tokens_seen": 213703056, + "step": 175635 + }, + { + "epoch": 19.561198351709546, + "grad_norm": 0.14647439122200012, + "learning_rate": 7.331777619299373e-08, + "loss": 0.0051, + "num_input_tokens_seen": 213709040, + "step": 175640 + }, + { + "epoch": 19.561755206593162, + "grad_norm": 0.08070448040962219, + "learning_rate": 7.313194639729237e-08, + "loss": 0.0733, + "num_input_tokens_seen": 213715344, + "step": 175645 + }, + { + "epoch": 19.562312061476778, + "grad_norm": 0.018660912290215492, + "learning_rate": 7.294635205583045e-08, + "loss": 0.0108, + "num_input_tokens_seen": 213721360, + "step": 175650 + }, + { + "epoch": 19.562868916360397, + "grad_norm": 0.18274827301502228, + "learning_rate": 7.276099317035934e-08, + "loss": 0.124, + "num_input_tokens_seen": 213727280, + "step": 175655 + }, + { + "epoch": 19.563425771244013, + "grad_norm": 0.010093322955071926, + "learning_rate": 7.257586974263597e-08, + "loss": 0.1299, + "num_input_tokens_seen": 213732912, + "step": 175660 + }, + { + "epoch": 19.563982626127633, + "grad_norm": 1.8668231964111328, + "learning_rate": 7.239098177440063e-08, + "loss": 0.0362, + "num_input_tokens_seen": 213739152, + "step": 175665 + }, + { + "epoch": 19.56453948101125, + "grad_norm": 1.7674870491027832, + "learning_rate": 7.220632926740745e-08, + "loss": 0.0685, + "num_input_tokens_seen": 213745488, + "step": 175670 + }, + { + "epoch": 19.565096335894864, + "grad_norm": 0.05152876675128937, + "learning_rate": 7.202191222339671e-08, + "loss": 0.1396, + "num_input_tokens_seen": 213751728, + "step": 175675 + }, + { + "epoch": 19.565653190778484, + "grad_norm": 0.0010471688583493233, + "learning_rate": 7.183773064411147e-08, + "loss": 0.0126, + "num_input_tokens_seen": 213757808, + "step": 175680 + }, + { + "epoch": 19.5662100456621, + "grad_norm": 1.0791599750518799, + "learning_rate": 7.165378453128924e-08, + "loss": 0.0719, + "num_input_tokens_seen": 213763952, + "step": 175685 + }, + { + "epoch": 19.56676690054572, + "grad_norm": 1.0345768928527832, + "learning_rate": 7.147007388667027e-08, + "loss": 0.0589, + "num_input_tokens_seen": 213770256, + "step": 175690 + }, + { + "epoch": 19.567323755429335, + "grad_norm": 0.003373397048562765, + "learning_rate": 7.128659871198929e-08, + "loss": 0.0001, + "num_input_tokens_seen": 213776176, + "step": 175695 + }, + { + "epoch": 19.56788061031295, + "grad_norm": 0.03662582114338875, + "learning_rate": 7.110335900897825e-08, + "loss": 0.0145, + "num_input_tokens_seen": 213781936, + "step": 175700 + }, + { + "epoch": 19.56843746519657, + "grad_norm": 0.1492973268032074, + "learning_rate": 7.092035477936632e-08, + "loss": 0.0897, + "num_input_tokens_seen": 213788016, + "step": 175705 + }, + { + "epoch": 19.568994320080186, + "grad_norm": 0.9884299039840698, + "learning_rate": 7.073758602488823e-08, + "loss": 0.0167, + "num_input_tokens_seen": 213794032, + "step": 175710 + }, + { + "epoch": 19.569551174963806, + "grad_norm": 1.7485593557357788, + "learning_rate": 7.055505274726482e-08, + "loss": 0.0681, + "num_input_tokens_seen": 213800080, + "step": 175715 + }, + { + "epoch": 19.57010802984742, + "grad_norm": 1.9167089462280273, + "learning_rate": 7.037275494822248e-08, + "loss": 0.0778, + "num_input_tokens_seen": 213805680, + "step": 175720 + }, + { + "epoch": 19.570664884731038, + "grad_norm": 0.16424140334129333, + "learning_rate": 7.019069262948208e-08, + "loss": 0.0719, + "num_input_tokens_seen": 213811984, + "step": 175725 + }, + { + "epoch": 19.571221739614657, + "grad_norm": 0.3636218011379242, + "learning_rate": 7.000886579276721e-08, + "loss": 0.0248, + "num_input_tokens_seen": 213818320, + "step": 175730 + }, + { + "epoch": 19.571778594498273, + "grad_norm": 1.0844199657440186, + "learning_rate": 6.982727443978765e-08, + "loss": 0.0214, + "num_input_tokens_seen": 213824336, + "step": 175735 + }, + { + "epoch": 19.572335449381892, + "grad_norm": 0.04303579777479172, + "learning_rate": 6.9645918572267e-08, + "loss": 0.0051, + "num_input_tokens_seen": 213830416, + "step": 175740 + }, + { + "epoch": 19.57289230426551, + "grad_norm": 0.10443346947431564, + "learning_rate": 6.94647981919122e-08, + "loss": 0.0064, + "num_input_tokens_seen": 213836336, + "step": 175745 + }, + { + "epoch": 19.573449159149124, + "grad_norm": 0.023656602948904037, + "learning_rate": 6.928391330043583e-08, + "loss": 0.0031, + "num_input_tokens_seen": 213842160, + "step": 175750 + }, + { + "epoch": 19.574006014032744, + "grad_norm": 0.001717213774099946, + "learning_rate": 6.910326389954758e-08, + "loss": 0.0021, + "num_input_tokens_seen": 213847952, + "step": 175755 + }, + { + "epoch": 19.57456286891636, + "grad_norm": 1.2360512018203735, + "learning_rate": 6.892284999095444e-08, + "loss": 0.0144, + "num_input_tokens_seen": 213854256, + "step": 175760 + }, + { + "epoch": 19.57511972379998, + "grad_norm": 3.4065608978271484, + "learning_rate": 6.874267157636061e-08, + "loss": 0.1033, + "num_input_tokens_seen": 213860496, + "step": 175765 + }, + { + "epoch": 19.575676578683595, + "grad_norm": 2.6038570404052734, + "learning_rate": 6.856272865746472e-08, + "loss": 0.0274, + "num_input_tokens_seen": 213866416, + "step": 175770 + }, + { + "epoch": 19.57623343356721, + "grad_norm": 0.0012091894168406725, + "learning_rate": 6.838302123596818e-08, + "loss": 0.0014, + "num_input_tokens_seen": 213872976, + "step": 175775 + }, + { + "epoch": 19.57679028845083, + "grad_norm": 0.1828063726425171, + "learning_rate": 6.820354931356965e-08, + "loss": 0.0501, + "num_input_tokens_seen": 213878992, + "step": 175780 + }, + { + "epoch": 19.577347143334446, + "grad_norm": 1.2664432525634766, + "learning_rate": 6.802431289196498e-08, + "loss": 0.1672, + "num_input_tokens_seen": 213885136, + "step": 175785 + }, + { + "epoch": 19.577903998218066, + "grad_norm": 0.003987124189734459, + "learning_rate": 6.78453119728445e-08, + "loss": 0.0103, + "num_input_tokens_seen": 213891120, + "step": 175790 + }, + { + "epoch": 19.57846085310168, + "grad_norm": 0.061006467789411545, + "learning_rate": 6.766654655790128e-08, + "loss": 0.0275, + "num_input_tokens_seen": 213897328, + "step": 175795 + }, + { + "epoch": 19.579017707985297, + "grad_norm": 0.5027507543563843, + "learning_rate": 6.74880166488201e-08, + "loss": 0.1242, + "num_input_tokens_seen": 213903408, + "step": 175800 + }, + { + "epoch": 19.579574562868917, + "grad_norm": 1.3863415718078613, + "learning_rate": 6.730972224729126e-08, + "loss": 0.1121, + "num_input_tokens_seen": 213909360, + "step": 175805 + }, + { + "epoch": 19.580131417752533, + "grad_norm": 0.000795510713942349, + "learning_rate": 6.713166335499955e-08, + "loss": 0.0099, + "num_input_tokens_seen": 213915408, + "step": 175810 + }, + { + "epoch": 19.580688272636152, + "grad_norm": 0.6817761063575745, + "learning_rate": 6.695383997362414e-08, + "loss": 0.1633, + "num_input_tokens_seen": 213921680, + "step": 175815 + }, + { + "epoch": 19.581245127519768, + "grad_norm": 0.02595527097582817, + "learning_rate": 6.677625210484706e-08, + "loss": 0.0187, + "num_input_tokens_seen": 213927664, + "step": 175820 + }, + { + "epoch": 19.581801982403384, + "grad_norm": 0.4825791120529175, + "learning_rate": 6.659889975034194e-08, + "loss": 0.0044, + "num_input_tokens_seen": 213933520, + "step": 175825 + }, + { + "epoch": 19.582358837287003, + "grad_norm": 0.5389423370361328, + "learning_rate": 6.64217829117908e-08, + "loss": 0.0142, + "num_input_tokens_seen": 213939792, + "step": 175830 + }, + { + "epoch": 19.58291569217062, + "grad_norm": 0.663303017616272, + "learning_rate": 6.624490159085894e-08, + "loss": 0.0321, + "num_input_tokens_seen": 213945744, + "step": 175835 + }, + { + "epoch": 19.58347254705424, + "grad_norm": 0.18059952557086945, + "learning_rate": 6.606825578922004e-08, + "loss": 0.0249, + "num_input_tokens_seen": 213951824, + "step": 175840 + }, + { + "epoch": 19.584029401937855, + "grad_norm": 0.010064193978905678, + "learning_rate": 6.589184550854499e-08, + "loss": 0.0107, + "num_input_tokens_seen": 213957936, + "step": 175845 + }, + { + "epoch": 19.584586256821474, + "grad_norm": 0.00016728760965634137, + "learning_rate": 6.571567075049634e-08, + "loss": 0.0139, + "num_input_tokens_seen": 213964144, + "step": 175850 + }, + { + "epoch": 19.58514311170509, + "grad_norm": 0.09412830322980881, + "learning_rate": 6.55397315167422e-08, + "loss": 0.017, + "num_input_tokens_seen": 213970320, + "step": 175855 + }, + { + "epoch": 19.585699966588706, + "grad_norm": 0.010930228047072887, + "learning_rate": 6.536402780894235e-08, + "loss": 0.0328, + "num_input_tokens_seen": 213975504, + "step": 175860 + }, + { + "epoch": 19.586256821472325, + "grad_norm": 1.2158070802688599, + "learning_rate": 6.518855962875658e-08, + "loss": 0.0667, + "num_input_tokens_seen": 213980880, + "step": 175865 + }, + { + "epoch": 19.58681367635594, + "grad_norm": 0.1658453494310379, + "learning_rate": 6.50133269778419e-08, + "loss": 0.023, + "num_input_tokens_seen": 213986928, + "step": 175870 + }, + { + "epoch": 19.587370531239557, + "grad_norm": 0.08225233852863312, + "learning_rate": 6.483832985785254e-08, + "loss": 0.0056, + "num_input_tokens_seen": 213993200, + "step": 175875 + }, + { + "epoch": 19.587927386123177, + "grad_norm": 2.2041234970092773, + "learning_rate": 6.466356827044551e-08, + "loss": 0.0516, + "num_input_tokens_seen": 213999408, + "step": 175880 + }, + { + "epoch": 19.588484241006793, + "grad_norm": 1.949076771736145, + "learning_rate": 6.44890422172667e-08, + "loss": 0.028, + "num_input_tokens_seen": 214005328, + "step": 175885 + }, + { + "epoch": 19.589041095890412, + "grad_norm": 0.016489986330270767, + "learning_rate": 6.43147516999676e-08, + "loss": 0.0162, + "num_input_tokens_seen": 214011248, + "step": 175890 + }, + { + "epoch": 19.589597950774028, + "grad_norm": 0.12797755002975464, + "learning_rate": 6.414069672019407e-08, + "loss": 0.0882, + "num_input_tokens_seen": 214017296, + "step": 175895 + }, + { + "epoch": 19.590154805657647, + "grad_norm": 0.007175221107900143, + "learning_rate": 6.396687727958928e-08, + "loss": 0.0077, + "num_input_tokens_seen": 214023504, + "step": 175900 + }, + { + "epoch": 19.590711660541263, + "grad_norm": 0.19600236415863037, + "learning_rate": 6.379329337979634e-08, + "loss": 0.0561, + "num_input_tokens_seen": 214029776, + "step": 175905 + }, + { + "epoch": 19.59126851542488, + "grad_norm": 0.030596014112234116, + "learning_rate": 6.361994502245561e-08, + "loss": 0.049, + "num_input_tokens_seen": 214036176, + "step": 175910 + }, + { + "epoch": 19.5918253703085, + "grad_norm": 0.6649450659751892, + "learning_rate": 6.344683220919911e-08, + "loss": 0.0498, + "num_input_tokens_seen": 214042320, + "step": 175915 + }, + { + "epoch": 19.592382225192114, + "grad_norm": 1.4754496812820435, + "learning_rate": 6.327395494166999e-08, + "loss": 0.185, + "num_input_tokens_seen": 214048464, + "step": 175920 + }, + { + "epoch": 19.592939080075734, + "grad_norm": 0.6908103227615356, + "learning_rate": 6.310131322149471e-08, + "loss": 0.0202, + "num_input_tokens_seen": 214054256, + "step": 175925 + }, + { + "epoch": 19.59349593495935, + "grad_norm": 1.1691004037857056, + "learning_rate": 6.292890705030807e-08, + "loss": 0.024, + "num_input_tokens_seen": 214060432, + "step": 175930 + }, + { + "epoch": 19.594052789842966, + "grad_norm": 0.008408388122916222, + "learning_rate": 6.275673642973934e-08, + "loss": 0.0029, + "num_input_tokens_seen": 214066480, + "step": 175935 + }, + { + "epoch": 19.594609644726585, + "grad_norm": 0.1764957755804062, + "learning_rate": 6.258480136140943e-08, + "loss": 0.0796, + "num_input_tokens_seen": 214071472, + "step": 175940 + }, + { + "epoch": 19.5951664996102, + "grad_norm": 0.00017590151401236653, + "learning_rate": 6.241310184694482e-08, + "loss": 0.0134, + "num_input_tokens_seen": 214077680, + "step": 175945 + }, + { + "epoch": 19.59572335449382, + "grad_norm": 0.10459613800048828, + "learning_rate": 6.2241637887972e-08, + "loss": 0.0245, + "num_input_tokens_seen": 214083536, + "step": 175950 + }, + { + "epoch": 19.596280209377436, + "grad_norm": 2.1424219608306885, + "learning_rate": 6.207040948610354e-08, + "loss": 0.022, + "num_input_tokens_seen": 214089680, + "step": 175955 + }, + { + "epoch": 19.596837064261052, + "grad_norm": 0.00011373622692190111, + "learning_rate": 6.189941664296317e-08, + "loss": 0.0366, + "num_input_tokens_seen": 214095696, + "step": 175960 + }, + { + "epoch": 19.59739391914467, + "grad_norm": 0.06337322294712067, + "learning_rate": 6.172865936015792e-08, + "loss": 0.0038, + "num_input_tokens_seen": 214101936, + "step": 175965 + }, + { + "epoch": 19.597950774028288, + "grad_norm": 1.100514531135559, + "learning_rate": 6.155813763930873e-08, + "loss": 0.0283, + "num_input_tokens_seen": 214108432, + "step": 175970 + }, + { + "epoch": 19.598507628911907, + "grad_norm": 0.9780095219612122, + "learning_rate": 6.138785148202264e-08, + "loss": 0.0883, + "num_input_tokens_seen": 214114608, + "step": 175975 + }, + { + "epoch": 19.599064483795523, + "grad_norm": 0.0006034639663994312, + "learning_rate": 6.121780088990947e-08, + "loss": 0.0984, + "num_input_tokens_seen": 214120656, + "step": 175980 + }, + { + "epoch": 19.59962133867914, + "grad_norm": 1.0422558784484863, + "learning_rate": 6.104798586457627e-08, + "loss": 0.0516, + "num_input_tokens_seen": 214126704, + "step": 175985 + }, + { + "epoch": 19.60017819356276, + "grad_norm": 0.000308910763124004, + "learning_rate": 6.087840640762455e-08, + "loss": 0.0247, + "num_input_tokens_seen": 214132784, + "step": 175990 + }, + { + "epoch": 19.600735048446374, + "grad_norm": 0.2445206344127655, + "learning_rate": 6.070906252065578e-08, + "loss": 0.0107, + "num_input_tokens_seen": 214138896, + "step": 175995 + }, + { + "epoch": 19.601291903329994, + "grad_norm": 0.016935769468545914, + "learning_rate": 6.053995420527148e-08, + "loss": 0.058, + "num_input_tokens_seen": 214145008, + "step": 176000 + }, + { + "epoch": 19.60184875821361, + "grad_norm": 0.46214523911476135, + "learning_rate": 6.037108146306759e-08, + "loss": 0.0141, + "num_input_tokens_seen": 214151024, + "step": 176005 + }, + { + "epoch": 19.602405613097226, + "grad_norm": 0.016814321279525757, + "learning_rate": 6.020244429564282e-08, + "loss": 0.1213, + "num_input_tokens_seen": 214157136, + "step": 176010 + }, + { + "epoch": 19.602962467980845, + "grad_norm": 0.16879285871982574, + "learning_rate": 6.00340427045848e-08, + "loss": 0.0122, + "num_input_tokens_seen": 214163024, + "step": 176015 + }, + { + "epoch": 19.60351932286446, + "grad_norm": 0.21899771690368652, + "learning_rate": 5.986587669148669e-08, + "loss": 0.0027, + "num_input_tokens_seen": 214169200, + "step": 176020 + }, + { + "epoch": 19.60407617774808, + "grad_norm": 0.007132383994758129, + "learning_rate": 5.96979462579389e-08, + "loss": 0.0068, + "num_input_tokens_seen": 214175312, + "step": 176025 + }, + { + "epoch": 19.604633032631696, + "grad_norm": 0.016933172941207886, + "learning_rate": 5.953025140552626e-08, + "loss": 0.0447, + "num_input_tokens_seen": 214181328, + "step": 176030 + }, + { + "epoch": 19.605189887515312, + "grad_norm": 0.002527048112824559, + "learning_rate": 5.9362792135830845e-08, + "loss": 0.0709, + "num_input_tokens_seen": 214187536, + "step": 176035 + }, + { + "epoch": 19.60574674239893, + "grad_norm": 0.0021041224244982004, + "learning_rate": 5.919556845043472e-08, + "loss": 0.0165, + "num_input_tokens_seen": 214193520, + "step": 176040 + }, + { + "epoch": 19.606303597282547, + "grad_norm": 0.06983668357133865, + "learning_rate": 5.902858035091996e-08, + "loss": 0.001, + "num_input_tokens_seen": 214199920, + "step": 176045 + }, + { + "epoch": 19.606860452166167, + "grad_norm": 9.793930075829849e-05, + "learning_rate": 5.886182783886307e-08, + "loss": 0.0253, + "num_input_tokens_seen": 214205936, + "step": 176050 + }, + { + "epoch": 19.607417307049783, + "grad_norm": 0.016219506040215492, + "learning_rate": 5.8695310915840574e-08, + "loss": 0.0005, + "num_input_tokens_seen": 214212176, + "step": 176055 + }, + { + "epoch": 19.6079741619334, + "grad_norm": 1.316981315612793, + "learning_rate": 5.852902958342066e-08, + "loss": 0.0459, + "num_input_tokens_seen": 214218416, + "step": 176060 + }, + { + "epoch": 19.608531016817018, + "grad_norm": 0.010507012717425823, + "learning_rate": 5.8362983843177064e-08, + "loss": 0.0209, + "num_input_tokens_seen": 214224848, + "step": 176065 + }, + { + "epoch": 19.609087871700634, + "grad_norm": 0.02730707824230194, + "learning_rate": 5.819717369667799e-08, + "loss": 0.0022, + "num_input_tokens_seen": 214230832, + "step": 176070 + }, + { + "epoch": 19.609644726584254, + "grad_norm": 0.9798731803894043, + "learning_rate": 5.803159914549161e-08, + "loss": 0.0272, + "num_input_tokens_seen": 214237328, + "step": 176075 + }, + { + "epoch": 19.61020158146787, + "grad_norm": 2.042384147644043, + "learning_rate": 5.78662601911778e-08, + "loss": 0.0702, + "num_input_tokens_seen": 214243504, + "step": 176080 + }, + { + "epoch": 19.610758436351485, + "grad_norm": 0.00018209261179435998, + "learning_rate": 5.770115683530197e-08, + "loss": 0.0037, + "num_input_tokens_seen": 214249744, + "step": 176085 + }, + { + "epoch": 19.611315291235105, + "grad_norm": 0.2533537745475769, + "learning_rate": 5.753628907942121e-08, + "loss": 0.0317, + "num_input_tokens_seen": 214255920, + "step": 176090 + }, + { + "epoch": 19.61187214611872, + "grad_norm": 0.8594213128089905, + "learning_rate": 5.7371656925095387e-08, + "loss": 0.123, + "num_input_tokens_seen": 214261872, + "step": 176095 + }, + { + "epoch": 19.61242900100234, + "grad_norm": 0.02781769260764122, + "learning_rate": 5.720726037387603e-08, + "loss": 0.009, + "num_input_tokens_seen": 214268176, + "step": 176100 + }, + { + "epoch": 19.612985855885956, + "grad_norm": 1.6436042785644531, + "learning_rate": 5.7043099427320226e-08, + "loss": 0.0249, + "num_input_tokens_seen": 214274768, + "step": 176105 + }, + { + "epoch": 19.613542710769572, + "grad_norm": 0.09730938822031021, + "learning_rate": 5.6879174086973964e-08, + "loss": 0.0079, + "num_input_tokens_seen": 214280880, + "step": 176110 + }, + { + "epoch": 19.61409956565319, + "grad_norm": 0.1286221146583557, + "learning_rate": 5.671548435438878e-08, + "loss": 0.0909, + "num_input_tokens_seen": 214286608, + "step": 176115 + }, + { + "epoch": 19.614656420536807, + "grad_norm": 0.00019852971308864653, + "learning_rate": 5.655203023110789e-08, + "loss": 0.0011, + "num_input_tokens_seen": 214292912, + "step": 176120 + }, + { + "epoch": 19.615213275420427, + "grad_norm": 3.1901462078094482, + "learning_rate": 5.6388811718680045e-08, + "loss": 0.107, + "num_input_tokens_seen": 214298992, + "step": 176125 + }, + { + "epoch": 19.615770130304043, + "grad_norm": 0.00012843747390434146, + "learning_rate": 5.622582881864291e-08, + "loss": 0.0711, + "num_input_tokens_seen": 214305424, + "step": 176130 + }, + { + "epoch": 19.61632698518766, + "grad_norm": 0.2273448407649994, + "learning_rate": 5.60630815325397e-08, + "loss": 0.0696, + "num_input_tokens_seen": 214311184, + "step": 176135 + }, + { + "epoch": 19.616883840071278, + "grad_norm": 1.5616846084594727, + "learning_rate": 5.590056986190251e-08, + "loss": 0.084, + "num_input_tokens_seen": 214317232, + "step": 176140 + }, + { + "epoch": 19.617440694954894, + "grad_norm": 1.0719730854034424, + "learning_rate": 5.5738293808271777e-08, + "loss": 0.0815, + "num_input_tokens_seen": 214323024, + "step": 176145 + }, + { + "epoch": 19.617997549838513, + "grad_norm": 1.5750421285629272, + "learning_rate": 5.557625337317685e-08, + "loss": 0.0857, + "num_input_tokens_seen": 214328944, + "step": 176150 + }, + { + "epoch": 19.61855440472213, + "grad_norm": 3.43827748298645, + "learning_rate": 5.541444855814981e-08, + "loss": 0.1297, + "num_input_tokens_seen": 214334960, + "step": 176155 + }, + { + "epoch": 19.619111259605745, + "grad_norm": 0.01391550712287426, + "learning_rate": 5.5252879364717236e-08, + "loss": 0.0182, + "num_input_tokens_seen": 214341200, + "step": 176160 + }, + { + "epoch": 19.619668114489365, + "grad_norm": 0.022963471710681915, + "learning_rate": 5.509154579440845e-08, + "loss": 0.0399, + "num_input_tokens_seen": 214346864, + "step": 176165 + }, + { + "epoch": 19.62022496937298, + "grad_norm": 2.0412092208862305, + "learning_rate": 5.493044784874446e-08, + "loss": 0.0991, + "num_input_tokens_seen": 214352816, + "step": 176170 + }, + { + "epoch": 19.6207818242566, + "grad_norm": 1.8276324272155762, + "learning_rate": 5.4769585529249046e-08, + "loss": 0.0979, + "num_input_tokens_seen": 214358736, + "step": 176175 + }, + { + "epoch": 19.621338679140216, + "grad_norm": 0.011175431311130524, + "learning_rate": 5.460895883744044e-08, + "loss": 0.0202, + "num_input_tokens_seen": 214364752, + "step": 176180 + }, + { + "epoch": 19.62189553402383, + "grad_norm": 0.0016132856253534555, + "learning_rate": 5.444856777483409e-08, + "loss": 0.037, + "num_input_tokens_seen": 214370672, + "step": 176185 + }, + { + "epoch": 19.62245238890745, + "grad_norm": 0.00018026075849775225, + "learning_rate": 5.428841234294824e-08, + "loss": 0.0169, + "num_input_tokens_seen": 214376880, + "step": 176190 + }, + { + "epoch": 19.623009243791067, + "grad_norm": 1.6581772565841675, + "learning_rate": 5.412849254329555e-08, + "loss": 0.0303, + "num_input_tokens_seen": 214382864, + "step": 176195 + }, + { + "epoch": 19.623566098674686, + "grad_norm": 0.003760996740311384, + "learning_rate": 5.396880837738594e-08, + "loss": 0.0227, + "num_input_tokens_seen": 214389136, + "step": 176200 + }, + { + "epoch": 19.624122953558302, + "grad_norm": 0.012940099462866783, + "learning_rate": 5.380935984672653e-08, + "loss": 0.0127, + "num_input_tokens_seen": 214395792, + "step": 176205 + }, + { + "epoch": 19.62467980844192, + "grad_norm": 0.007893750444054604, + "learning_rate": 5.3650146952821666e-08, + "loss": 0.0494, + "num_input_tokens_seen": 214401904, + "step": 176210 + }, + { + "epoch": 19.625236663325538, + "grad_norm": 0.0010317724663764238, + "learning_rate": 5.349116969718127e-08, + "loss": 0.0035, + "num_input_tokens_seen": 214407792, + "step": 176215 + }, + { + "epoch": 19.625793518209154, + "grad_norm": 0.0020131233613938093, + "learning_rate": 5.333242808130412e-08, + "loss": 0.0431, + "num_input_tokens_seen": 214413968, + "step": 176220 + }, + { + "epoch": 19.626350373092773, + "grad_norm": 0.011192656122148037, + "learning_rate": 5.317392210668626e-08, + "loss": 0.0127, + "num_input_tokens_seen": 214419952, + "step": 176225 + }, + { + "epoch": 19.62690722797639, + "grad_norm": 0.8256153464317322, + "learning_rate": 5.301565177482925e-08, + "loss": 0.0748, + "num_input_tokens_seen": 214426192, + "step": 176230 + }, + { + "epoch": 19.62746408286001, + "grad_norm": 0.009812665171921253, + "learning_rate": 5.2857617087226364e-08, + "loss": 0.0067, + "num_input_tokens_seen": 214432400, + "step": 176235 + }, + { + "epoch": 19.628020937743624, + "grad_norm": 0.7311552166938782, + "learning_rate": 5.269981804537083e-08, + "loss": 0.0377, + "num_input_tokens_seen": 214438352, + "step": 176240 + }, + { + "epoch": 19.62857779262724, + "grad_norm": 0.06676868349313736, + "learning_rate": 5.254225465075313e-08, + "loss": 0.0339, + "num_input_tokens_seen": 214444624, + "step": 176245 + }, + { + "epoch": 19.62913464751086, + "grad_norm": 0.02359028533101082, + "learning_rate": 5.2384926904860964e-08, + "loss": 0.0293, + "num_input_tokens_seen": 214450480, + "step": 176250 + }, + { + "epoch": 19.629691502394476, + "grad_norm": 0.1062992662191391, + "learning_rate": 5.222783480917925e-08, + "loss": 0.0209, + "num_input_tokens_seen": 214456880, + "step": 176255 + }, + { + "epoch": 19.630248357278095, + "grad_norm": 0.016023578122258186, + "learning_rate": 5.20709783651957e-08, + "loss": 0.1098, + "num_input_tokens_seen": 214463024, + "step": 176260 + }, + { + "epoch": 19.63080521216171, + "grad_norm": 0.3347950279712677, + "learning_rate": 5.1914357574389673e-08, + "loss": 0.0666, + "num_input_tokens_seen": 214469328, + "step": 176265 + }, + { + "epoch": 19.631362067045327, + "grad_norm": 0.022701498121023178, + "learning_rate": 5.1757972438240544e-08, + "loss": 0.0115, + "num_input_tokens_seen": 214475440, + "step": 176270 + }, + { + "epoch": 19.631918921928946, + "grad_norm": 1.1892635822296143, + "learning_rate": 5.160182295822491e-08, + "loss": 0.1337, + "num_input_tokens_seen": 214481648, + "step": 176275 + }, + { + "epoch": 19.632475776812562, + "grad_norm": 1.9358086585998535, + "learning_rate": 5.1445909135816595e-08, + "loss": 0.1547, + "num_input_tokens_seen": 214488080, + "step": 176280 + }, + { + "epoch": 19.63303263169618, + "grad_norm": 0.02133994735777378, + "learning_rate": 5.129023097249219e-08, + "loss": 0.0257, + "num_input_tokens_seen": 214494128, + "step": 176285 + }, + { + "epoch": 19.633589486579798, + "grad_norm": 0.00010901812493102625, + "learning_rate": 5.1134788469719976e-08, + "loss": 0.0187, + "num_input_tokens_seen": 214500368, + "step": 176290 + }, + { + "epoch": 19.634146341463413, + "grad_norm": 0.005678082350641489, + "learning_rate": 5.0979581628970985e-08, + "loss": 0.0518, + "num_input_tokens_seen": 214506384, + "step": 176295 + }, + { + "epoch": 19.634703196347033, + "grad_norm": 0.00032638697302900255, + "learning_rate": 5.082461045170517e-08, + "loss": 0.0021, + "num_input_tokens_seen": 214512560, + "step": 176300 + }, + { + "epoch": 19.63526005123065, + "grad_norm": 0.0002659741439856589, + "learning_rate": 5.0669874939390795e-08, + "loss": 0.0057, + "num_input_tokens_seen": 214518736, + "step": 176305 + }, + { + "epoch": 19.635816906114268, + "grad_norm": 0.2175906002521515, + "learning_rate": 5.0515375093487804e-08, + "loss": 0.0422, + "num_input_tokens_seen": 214524624, + "step": 176310 + }, + { + "epoch": 19.636373760997884, + "grad_norm": 0.11056005209684372, + "learning_rate": 5.036111091545614e-08, + "loss": 0.0274, + "num_input_tokens_seen": 214530864, + "step": 176315 + }, + { + "epoch": 19.6369306158815, + "grad_norm": 0.11766396462917328, + "learning_rate": 5.020708240675576e-08, + "loss": 0.0025, + "num_input_tokens_seen": 214536816, + "step": 176320 + }, + { + "epoch": 19.63748747076512, + "grad_norm": 0.1515631079673767, + "learning_rate": 5.005328956883548e-08, + "loss": 0.0434, + "num_input_tokens_seen": 214543280, + "step": 176325 + }, + { + "epoch": 19.638044325648735, + "grad_norm": 0.2594628632068634, + "learning_rate": 4.989973240315249e-08, + "loss": 0.0082, + "num_input_tokens_seen": 214549616, + "step": 176330 + }, + { + "epoch": 19.638601180532355, + "grad_norm": 0.4211615324020386, + "learning_rate": 4.974641091115839e-08, + "loss": 0.0327, + "num_input_tokens_seen": 214555664, + "step": 176335 + }, + { + "epoch": 19.63915803541597, + "grad_norm": 0.00038121946272440255, + "learning_rate": 4.9593325094296486e-08, + "loss": 0.0144, + "num_input_tokens_seen": 214562160, + "step": 176340 + }, + { + "epoch": 19.639714890299587, + "grad_norm": 2.76021671295166, + "learning_rate": 4.944047495401838e-08, + "loss": 0.1035, + "num_input_tokens_seen": 214568336, + "step": 176345 + }, + { + "epoch": 19.640271745183206, + "grad_norm": 0.7122589349746704, + "learning_rate": 4.928786049176182e-08, + "loss": 0.0192, + "num_input_tokens_seen": 214574352, + "step": 176350 + }, + { + "epoch": 19.640828600066822, + "grad_norm": 1.441838264465332, + "learning_rate": 4.9135481708972864e-08, + "loss": 0.0387, + "num_input_tokens_seen": 214580432, + "step": 176355 + }, + { + "epoch": 19.64138545495044, + "grad_norm": 0.7109785676002502, + "learning_rate": 4.8983338607092036e-08, + "loss": 0.0196, + "num_input_tokens_seen": 214586704, + "step": 176360 + }, + { + "epoch": 19.641942309834057, + "grad_norm": 0.0005975837702862918, + "learning_rate": 4.8831431187551515e-08, + "loss": 0.0523, + "num_input_tokens_seen": 214593072, + "step": 176365 + }, + { + "epoch": 19.642499164717673, + "grad_norm": 0.0038173228967934847, + "learning_rate": 4.867975945178904e-08, + "loss": 0.0089, + "num_input_tokens_seen": 214599472, + "step": 176370 + }, + { + "epoch": 19.643056019601293, + "grad_norm": 0.008466055616736412, + "learning_rate": 4.852832340123681e-08, + "loss": 0.0497, + "num_input_tokens_seen": 214605648, + "step": 176375 + }, + { + "epoch": 19.64361287448491, + "grad_norm": 3.1260199546813965, + "learning_rate": 4.837712303732422e-08, + "loss": 0.1013, + "num_input_tokens_seen": 214611920, + "step": 176380 + }, + { + "epoch": 19.644169729368528, + "grad_norm": 2.0213444232940674, + "learning_rate": 4.82261583614807e-08, + "loss": 0.0611, + "num_input_tokens_seen": 214618032, + "step": 176385 + }, + { + "epoch": 19.644726584252144, + "grad_norm": 0.007761539425700903, + "learning_rate": 4.807542937513565e-08, + "loss": 0.0023, + "num_input_tokens_seen": 214624656, + "step": 176390 + }, + { + "epoch": 19.64528343913576, + "grad_norm": 1.038814902305603, + "learning_rate": 4.792493607970738e-08, + "loss": 0.0468, + "num_input_tokens_seen": 214630864, + "step": 176395 + }, + { + "epoch": 19.64584029401938, + "grad_norm": 0.001137334736995399, + "learning_rate": 4.777467847661699e-08, + "loss": 0.0384, + "num_input_tokens_seen": 214636912, + "step": 176400 + }, + { + "epoch": 19.646397148902995, + "grad_norm": 0.003434652229771018, + "learning_rate": 4.762465656728832e-08, + "loss": 0.0143, + "num_input_tokens_seen": 214642928, + "step": 176405 + }, + { + "epoch": 19.646954003786615, + "grad_norm": 0.043486155569553375, + "learning_rate": 4.7474870353136915e-08, + "loss": 0.0048, + "num_input_tokens_seen": 214649488, + "step": 176410 + }, + { + "epoch": 19.64751085867023, + "grad_norm": 0.3966144025325775, + "learning_rate": 4.732531983557553e-08, + "loss": 0.0068, + "num_input_tokens_seen": 214655312, + "step": 176415 + }, + { + "epoch": 19.648067713553846, + "grad_norm": 1.1908035278320312, + "learning_rate": 4.7176005016019706e-08, + "loss": 0.0288, + "num_input_tokens_seen": 214661104, + "step": 176420 + }, + { + "epoch": 19.648624568437466, + "grad_norm": 0.8457236289978027, + "learning_rate": 4.702692589587665e-08, + "loss": 0.009, + "num_input_tokens_seen": 214666864, + "step": 176425 + }, + { + "epoch": 19.64918142332108, + "grad_norm": 0.9895032644271851, + "learning_rate": 4.687808247655634e-08, + "loss": 0.033, + "num_input_tokens_seen": 214673104, + "step": 176430 + }, + { + "epoch": 19.6497382782047, + "grad_norm": 0.0017590905772522092, + "learning_rate": 4.6729474759465986e-08, + "loss": 0.0286, + "num_input_tokens_seen": 214679376, + "step": 176435 + }, + { + "epoch": 19.650295133088317, + "grad_norm": 0.0011482021072879434, + "learning_rate": 4.6581102746007246e-08, + "loss": 0.1193, + "num_input_tokens_seen": 214685456, + "step": 176440 + }, + { + "epoch": 19.650851987971933, + "grad_norm": 0.2824413478374481, + "learning_rate": 4.643296643758177e-08, + "loss": 0.0174, + "num_input_tokens_seen": 214691632, + "step": 176445 + }, + { + "epoch": 19.651408842855552, + "grad_norm": 1.10953950881958, + "learning_rate": 4.6285065835591224e-08, + "loss": 0.1115, + "num_input_tokens_seen": 214697776, + "step": 176450 + }, + { + "epoch": 19.65196569773917, + "grad_norm": 1.572509765625, + "learning_rate": 4.613740094142893e-08, + "loss": 0.1507, + "num_input_tokens_seen": 214703696, + "step": 176455 + }, + { + "epoch": 19.652522552622788, + "grad_norm": 0.4359455704689026, + "learning_rate": 4.598997175649378e-08, + "loss": 0.1388, + "num_input_tokens_seen": 214709712, + "step": 176460 + }, + { + "epoch": 19.653079407506404, + "grad_norm": 0.03538412228226662, + "learning_rate": 4.584277828217354e-08, + "loss": 0.0263, + "num_input_tokens_seen": 214715536, + "step": 176465 + }, + { + "epoch": 19.65363626239002, + "grad_norm": 0.1273830085992813, + "learning_rate": 4.5695820519861545e-08, + "loss": 0.0597, + "num_input_tokens_seen": 214721264, + "step": 176470 + }, + { + "epoch": 19.65419311727364, + "grad_norm": 0.0001616156514501199, + "learning_rate": 4.554909847094835e-08, + "loss": 0.0338, + "num_input_tokens_seen": 214727600, + "step": 176475 + }, + { + "epoch": 19.654749972157255, + "grad_norm": 0.0014079903485253453, + "learning_rate": 4.5402612136813405e-08, + "loss": 0.0059, + "num_input_tokens_seen": 214733776, + "step": 176480 + }, + { + "epoch": 19.655306827040874, + "grad_norm": 0.001500167534686625, + "learning_rate": 4.525636151884727e-08, + "loss": 0.0025, + "num_input_tokens_seen": 214739920, + "step": 176485 + }, + { + "epoch": 19.65586368192449, + "grad_norm": 0.02431872859597206, + "learning_rate": 4.511034661842661e-08, + "loss": 0.0004, + "num_input_tokens_seen": 214746320, + "step": 176490 + }, + { + "epoch": 19.656420536808106, + "grad_norm": 0.015057290904223919, + "learning_rate": 4.496456743693089e-08, + "loss": 0.0033, + "num_input_tokens_seen": 214752400, + "step": 176495 + }, + { + "epoch": 19.656977391691726, + "grad_norm": 0.0005040448158979416, + "learning_rate": 4.481902397574233e-08, + "loss": 0.0844, + "num_input_tokens_seen": 214758416, + "step": 176500 + }, + { + "epoch": 19.65753424657534, + "grad_norm": 0.1144004613161087, + "learning_rate": 4.467371623622929e-08, + "loss": 0.0025, + "num_input_tokens_seen": 214764688, + "step": 176505 + }, + { + "epoch": 19.65809110145896, + "grad_norm": 0.00026546287699602544, + "learning_rate": 4.4528644219765656e-08, + "loss": 0.0171, + "num_input_tokens_seen": 214770736, + "step": 176510 + }, + { + "epoch": 19.658647956342577, + "grad_norm": 0.5337136387825012, + "learning_rate": 4.438380792772534e-08, + "loss": 0.0282, + "num_input_tokens_seen": 214777072, + "step": 176515 + }, + { + "epoch": 19.659204811226193, + "grad_norm": 1.880983591079712, + "learning_rate": 4.423920736147391e-08, + "loss": 0.0375, + "num_input_tokens_seen": 214783184, + "step": 176520 + }, + { + "epoch": 19.659761666109812, + "grad_norm": 0.019628683105111122, + "learning_rate": 4.409484252237417e-08, + "loss": 0.0608, + "num_input_tokens_seen": 214789392, + "step": 176525 + }, + { + "epoch": 19.660318520993428, + "grad_norm": 1.107215404510498, + "learning_rate": 4.395071341179724e-08, + "loss": 0.0261, + "num_input_tokens_seen": 214795568, + "step": 176530 + }, + { + "epoch": 19.660875375877048, + "grad_norm": 2.278038501739502, + "learning_rate": 4.3806820031097596e-08, + "loss": 0.109, + "num_input_tokens_seen": 214801648, + "step": 176535 + }, + { + "epoch": 19.661432230760663, + "grad_norm": 0.023285256698727608, + "learning_rate": 4.366316238163804e-08, + "loss": 0.0371, + "num_input_tokens_seen": 214807760, + "step": 176540 + }, + { + "epoch": 19.66198908564428, + "grad_norm": 0.05935201793909073, + "learning_rate": 4.351974046477303e-08, + "loss": 0.0261, + "num_input_tokens_seen": 214813456, + "step": 176545 + }, + { + "epoch": 19.6625459405279, + "grad_norm": 1.0839145183563232, + "learning_rate": 4.33765542818626e-08, + "loss": 0.1655, + "num_input_tokens_seen": 214819344, + "step": 176550 + }, + { + "epoch": 19.663102795411515, + "grad_norm": 0.29820695519447327, + "learning_rate": 4.32336038342529e-08, + "loss": 0.0082, + "num_input_tokens_seen": 214825360, + "step": 176555 + }, + { + "epoch": 19.663659650295134, + "grad_norm": 0.2492164671421051, + "learning_rate": 4.30908891232984e-08, + "loss": 0.0117, + "num_input_tokens_seen": 214831376, + "step": 176560 + }, + { + "epoch": 19.66421650517875, + "grad_norm": 0.6610007882118225, + "learning_rate": 4.294841015034245e-08, + "loss": 0.0235, + "num_input_tokens_seen": 214837328, + "step": 176565 + }, + { + "epoch": 19.66477336006237, + "grad_norm": 0.10159053653478622, + "learning_rate": 4.280616691673678e-08, + "loss": 0.0159, + "num_input_tokens_seen": 214843184, + "step": 176570 + }, + { + "epoch": 19.665330214945985, + "grad_norm": 0.48769402503967285, + "learning_rate": 4.266415942382196e-08, + "loss": 0.0199, + "num_input_tokens_seen": 214849136, + "step": 176575 + }, + { + "epoch": 19.6658870698296, + "grad_norm": 0.41481298208236694, + "learning_rate": 4.252238767293859e-08, + "loss": 0.0129, + "num_input_tokens_seen": 214854800, + "step": 176580 + }, + { + "epoch": 19.66644392471322, + "grad_norm": 1.3241724967956543, + "learning_rate": 4.2380851665427266e-08, + "loss": 0.0683, + "num_input_tokens_seen": 214861072, + "step": 176585 + }, + { + "epoch": 19.667000779596837, + "grad_norm": 0.025366298854351044, + "learning_rate": 4.223955140262581e-08, + "loss": 0.0401, + "num_input_tokens_seen": 214867088, + "step": 176590 + }, + { + "epoch": 19.667557634480453, + "grad_norm": 1.9399861097335815, + "learning_rate": 4.209848688586371e-08, + "loss": 0.117, + "num_input_tokens_seen": 214873232, + "step": 176595 + }, + { + "epoch": 19.668114489364072, + "grad_norm": 0.0027607933152467012, + "learning_rate": 4.1957658116481557e-08, + "loss": 0.0376, + "num_input_tokens_seen": 214879600, + "step": 176600 + }, + { + "epoch": 19.668671344247688, + "grad_norm": 0.7116089463233948, + "learning_rate": 4.18170650958033e-08, + "loss": 0.0407, + "num_input_tokens_seen": 214885904, + "step": 176605 + }, + { + "epoch": 19.669228199131307, + "grad_norm": 2.0039777755737305, + "learning_rate": 4.16767078251612e-08, + "loss": 0.0928, + "num_input_tokens_seen": 214891920, + "step": 176610 + }, + { + "epoch": 19.669785054014923, + "grad_norm": 0.40465137362480164, + "learning_rate": 4.1536586305876426e-08, + "loss": 0.0097, + "num_input_tokens_seen": 214897744, + "step": 176615 + }, + { + "epoch": 19.670341908898543, + "grad_norm": 0.5947532057762146, + "learning_rate": 4.139670053927569e-08, + "loss": 0.067, + "num_input_tokens_seen": 214903408, + "step": 176620 + }, + { + "epoch": 19.67089876378216, + "grad_norm": 0.011594383046030998, + "learning_rate": 4.125705052668016e-08, + "loss": 0.1073, + "num_input_tokens_seen": 214909200, + "step": 176625 + }, + { + "epoch": 19.671455618665775, + "grad_norm": 1.4287891387939453, + "learning_rate": 4.1117636269408233e-08, + "loss": 0.0697, + "num_input_tokens_seen": 214915216, + "step": 176630 + }, + { + "epoch": 19.672012473549394, + "grad_norm": 0.027531061321496964, + "learning_rate": 4.0978457768775515e-08, + "loss": 0.0205, + "num_input_tokens_seen": 214921296, + "step": 176635 + }, + { + "epoch": 19.67256932843301, + "grad_norm": 1.4245468378067017, + "learning_rate": 4.0839515026100395e-08, + "loss": 0.0721, + "num_input_tokens_seen": 214926960, + "step": 176640 + }, + { + "epoch": 19.67312618331663, + "grad_norm": 0.0011644130572676659, + "learning_rate": 4.070080804269016e-08, + "loss": 0.001, + "num_input_tokens_seen": 214933040, + "step": 176645 + }, + { + "epoch": 19.673683038200245, + "grad_norm": 0.6513500213623047, + "learning_rate": 4.056233681986044e-08, + "loss": 0.0419, + "num_input_tokens_seen": 214939216, + "step": 176650 + }, + { + "epoch": 19.67423989308386, + "grad_norm": 0.7159112095832825, + "learning_rate": 4.042410135891572e-08, + "loss": 0.0056, + "num_input_tokens_seen": 214945552, + "step": 176655 + }, + { + "epoch": 19.67479674796748, + "grad_norm": 0.03429736569523811, + "learning_rate": 4.028610166116331e-08, + "loss": 0.0023, + "num_input_tokens_seen": 214951888, + "step": 176660 + }, + { + "epoch": 19.675353602851096, + "grad_norm": 1.6044697761535645, + "learning_rate": 4.014833772790494e-08, + "loss": 0.0348, + "num_input_tokens_seen": 214958000, + "step": 176665 + }, + { + "epoch": 19.675910457734716, + "grad_norm": 0.010576273314654827, + "learning_rate": 4.001080956044234e-08, + "loss": 0.0662, + "num_input_tokens_seen": 214963920, + "step": 176670 + }, + { + "epoch": 19.676467312618332, + "grad_norm": 0.14712581038475037, + "learning_rate": 3.9873517160077255e-08, + "loss": 0.0047, + "num_input_tokens_seen": 214969712, + "step": 176675 + }, + { + "epoch": 19.677024167501948, + "grad_norm": 0.007481050677597523, + "learning_rate": 3.9736460528105866e-08, + "loss": 0.0071, + "num_input_tokens_seen": 214975504, + "step": 176680 + }, + { + "epoch": 19.677581022385567, + "grad_norm": 0.2841718792915344, + "learning_rate": 3.959963966581881e-08, + "loss": 0.0203, + "num_input_tokens_seen": 214981104, + "step": 176685 + }, + { + "epoch": 19.678137877269183, + "grad_norm": 0.3238447308540344, + "learning_rate": 3.94630545745095e-08, + "loss": 0.0787, + "num_input_tokens_seen": 214987376, + "step": 176690 + }, + { + "epoch": 19.678694732152803, + "grad_norm": 0.026314612478017807, + "learning_rate": 3.9326705255474104e-08, + "loss": 0.0411, + "num_input_tokens_seen": 214993616, + "step": 176695 + }, + { + "epoch": 19.67925158703642, + "grad_norm": 0.00218438683077693, + "learning_rate": 3.919059170999218e-08, + "loss": 0.0134, + "num_input_tokens_seen": 214999728, + "step": 176700 + }, + { + "epoch": 19.679808441920034, + "grad_norm": 0.025469928979873657, + "learning_rate": 3.9054713939354336e-08, + "loss": 0.0236, + "num_input_tokens_seen": 215005456, + "step": 176705 + }, + { + "epoch": 19.680365296803654, + "grad_norm": 0.03283395245671272, + "learning_rate": 3.891907194484568e-08, + "loss": 0.0165, + "num_input_tokens_seen": 215011280, + "step": 176710 + }, + { + "epoch": 19.68092215168727, + "grad_norm": 0.5572851896286011, + "learning_rate": 3.878366572774295e-08, + "loss": 0.016, + "num_input_tokens_seen": 215017552, + "step": 176715 + }, + { + "epoch": 19.68147900657089, + "grad_norm": 0.00012941603199578822, + "learning_rate": 3.864849528932568e-08, + "loss": 0.0207, + "num_input_tokens_seen": 215023696, + "step": 176720 + }, + { + "epoch": 19.682035861454505, + "grad_norm": 0.005431815516203642, + "learning_rate": 3.851356063087341e-08, + "loss": 0.0307, + "num_input_tokens_seen": 215029840, + "step": 176725 + }, + { + "epoch": 19.68259271633812, + "grad_norm": 0.2549150586128235, + "learning_rate": 3.837886175366012e-08, + "loss": 0.0096, + "num_input_tokens_seen": 215035920, + "step": 176730 + }, + { + "epoch": 19.68314957122174, + "grad_norm": 0.11283930391073227, + "learning_rate": 3.824439865895701e-08, + "loss": 0.0727, + "num_input_tokens_seen": 215041840, + "step": 176735 + }, + { + "epoch": 19.683706426105356, + "grad_norm": 0.0424654483795166, + "learning_rate": 3.811017134803252e-08, + "loss": 0.0378, + "num_input_tokens_seen": 215048112, + "step": 176740 + }, + { + "epoch": 19.684263280988976, + "grad_norm": 0.007691101636737585, + "learning_rate": 3.7976179822160615e-08, + "loss": 0.0166, + "num_input_tokens_seen": 215054000, + "step": 176745 + }, + { + "epoch": 19.68482013587259, + "grad_norm": 0.0834570899605751, + "learning_rate": 3.7842424082598635e-08, + "loss": 0.0034, + "num_input_tokens_seen": 215060208, + "step": 176750 + }, + { + "epoch": 19.685376990756208, + "grad_norm": 0.0036375478375703096, + "learning_rate": 3.770890413061778e-08, + "loss": 0.0003, + "num_input_tokens_seen": 215066224, + "step": 176755 + }, + { + "epoch": 19.685933845639827, + "grad_norm": 0.052176207304000854, + "learning_rate": 3.757561996747539e-08, + "loss": 0.0015, + "num_input_tokens_seen": 215072368, + "step": 176760 + }, + { + "epoch": 19.686490700523443, + "grad_norm": 0.005468982271850109, + "learning_rate": 3.744257159442877e-08, + "loss": 0.0035, + "num_input_tokens_seen": 215078544, + "step": 176765 + }, + { + "epoch": 19.687047555407062, + "grad_norm": 1.8071444034576416, + "learning_rate": 3.730975901273803e-08, + "loss": 0.0673, + "num_input_tokens_seen": 215084976, + "step": 176770 + }, + { + "epoch": 19.687604410290678, + "grad_norm": 1.0238230228424072, + "learning_rate": 3.717718222365496e-08, + "loss": 0.1142, + "num_input_tokens_seen": 215091056, + "step": 176775 + }, + { + "epoch": 19.688161265174294, + "grad_norm": 0.010073870420455933, + "learning_rate": 3.70448412284341e-08, + "loss": 0.0499, + "num_input_tokens_seen": 215096400, + "step": 176780 + }, + { + "epoch": 19.688718120057914, + "grad_norm": 0.5070759654045105, + "learning_rate": 3.691273602832446e-08, + "loss": 0.0394, + "num_input_tokens_seen": 215102544, + "step": 176785 + }, + { + "epoch": 19.68927497494153, + "grad_norm": 0.49811360239982605, + "learning_rate": 3.678086662457503e-08, + "loss": 0.0448, + "num_input_tokens_seen": 215108176, + "step": 176790 + }, + { + "epoch": 19.68983182982515, + "grad_norm": 0.00604339549317956, + "learning_rate": 3.664923301843204e-08, + "loss": 0.0257, + "num_input_tokens_seen": 215114640, + "step": 176795 + }, + { + "epoch": 19.690388684708765, + "grad_norm": 0.45489221811294556, + "learning_rate": 3.6517835211136164e-08, + "loss": 0.0099, + "num_input_tokens_seen": 215121008, + "step": 176800 + }, + { + "epoch": 19.69094553959238, + "grad_norm": 0.0035909090656787157, + "learning_rate": 3.638667320392808e-08, + "loss": 0.0027, + "num_input_tokens_seen": 215127248, + "step": 176805 + }, + { + "epoch": 19.691502394476, + "grad_norm": 3.9306304454803467, + "learning_rate": 3.6255746998048455e-08, + "loss": 0.0809, + "num_input_tokens_seen": 215133712, + "step": 176810 + }, + { + "epoch": 19.692059249359616, + "grad_norm": 0.21351833641529083, + "learning_rate": 3.6125056594735196e-08, + "loss": 0.0136, + "num_input_tokens_seen": 215139984, + "step": 176815 + }, + { + "epoch": 19.692616104243235, + "grad_norm": 3.794365644454956, + "learning_rate": 3.599460199522065e-08, + "loss": 0.0834, + "num_input_tokens_seen": 215146192, + "step": 176820 + }, + { + "epoch": 19.69317295912685, + "grad_norm": 0.009020755998790264, + "learning_rate": 3.586438320073993e-08, + "loss": 0.0028, + "num_input_tokens_seen": 215152272, + "step": 176825 + }, + { + "epoch": 19.693729814010467, + "grad_norm": 0.34627971053123474, + "learning_rate": 3.5734400212519835e-08, + "loss": 0.007, + "num_input_tokens_seen": 215158608, + "step": 176830 + }, + { + "epoch": 19.694286668894087, + "grad_norm": 0.6246809363365173, + "learning_rate": 3.560465303178717e-08, + "loss": 0.151, + "num_input_tokens_seen": 215164880, + "step": 176835 + }, + { + "epoch": 19.694843523777703, + "grad_norm": 0.053050946444272995, + "learning_rate": 3.5475141659771506e-08, + "loss": 0.0411, + "num_input_tokens_seen": 215170992, + "step": 176840 + }, + { + "epoch": 19.695400378661322, + "grad_norm": 0.8221216797828674, + "learning_rate": 3.534586609769408e-08, + "loss": 0.0511, + "num_input_tokens_seen": 215176112, + "step": 176845 + }, + { + "epoch": 19.695957233544938, + "grad_norm": 0.028115330263972282, + "learning_rate": 3.521682634677892e-08, + "loss": 0.0116, + "num_input_tokens_seen": 215182384, + "step": 176850 + }, + { + "epoch": 19.696514088428554, + "grad_norm": 0.001813395763747394, + "learning_rate": 3.508802240823894e-08, + "loss": 0.0154, + "num_input_tokens_seen": 215188784, + "step": 176855 + }, + { + "epoch": 19.697070943312173, + "grad_norm": 4.167116165161133, + "learning_rate": 3.495945428329539e-08, + "loss": 0.1429, + "num_input_tokens_seen": 215194960, + "step": 176860 + }, + { + "epoch": 19.69762779819579, + "grad_norm": 0.9055350422859192, + "learning_rate": 3.483112197316119e-08, + "loss": 0.0532, + "num_input_tokens_seen": 215201200, + "step": 176865 + }, + { + "epoch": 19.69818465307941, + "grad_norm": 0.017096983268857002, + "learning_rate": 3.4703025479049245e-08, + "loss": 0.0039, + "num_input_tokens_seen": 215207472, + "step": 176870 + }, + { + "epoch": 19.698741507963025, + "grad_norm": 0.21764466166496277, + "learning_rate": 3.457516480216971e-08, + "loss": 0.0104, + "num_input_tokens_seen": 215213488, + "step": 176875 + }, + { + "epoch": 19.69929836284664, + "grad_norm": 0.13151022791862488, + "learning_rate": 3.444753994372718e-08, + "loss": 0.0022, + "num_input_tokens_seen": 215219760, + "step": 176880 + }, + { + "epoch": 19.69985521773026, + "grad_norm": 0.7141143083572388, + "learning_rate": 3.432015090493179e-08, + "loss": 0.0583, + "num_input_tokens_seen": 215225936, + "step": 176885 + }, + { + "epoch": 19.700412072613876, + "grad_norm": 0.00011097906826762483, + "learning_rate": 3.419299768698259e-08, + "loss": 0.0026, + "num_input_tokens_seen": 215231952, + "step": 176890 + }, + { + "epoch": 19.700968927497495, + "grad_norm": 0.02136225812137127, + "learning_rate": 3.406608029108693e-08, + "loss": 0.0016, + "num_input_tokens_seen": 215238096, + "step": 176895 + }, + { + "epoch": 19.70152578238111, + "grad_norm": 0.5369585156440735, + "learning_rate": 3.393939871843554e-08, + "loss": 0.1128, + "num_input_tokens_seen": 215243888, + "step": 176900 + }, + { + "epoch": 19.70208263726473, + "grad_norm": 0.36475223302841187, + "learning_rate": 3.381295297023024e-08, + "loss": 0.0892, + "num_input_tokens_seen": 215250096, + "step": 176905 + }, + { + "epoch": 19.702639492148347, + "grad_norm": 0.05959373712539673, + "learning_rate": 3.3686743047664506e-08, + "loss": 0.004, + "num_input_tokens_seen": 215255984, + "step": 176910 + }, + { + "epoch": 19.703196347031962, + "grad_norm": 0.659191906452179, + "learning_rate": 3.3560768951931834e-08, + "loss": 0.0208, + "num_input_tokens_seen": 215262032, + "step": 176915 + }, + { + "epoch": 19.703753201915582, + "grad_norm": 1.1042543649673462, + "learning_rate": 3.3435030684217383e-08, + "loss": 0.1057, + "num_input_tokens_seen": 215268016, + "step": 176920 + }, + { + "epoch": 19.704310056799198, + "grad_norm": 2.0853350162506104, + "learning_rate": 3.330952824571188e-08, + "loss": 0.0941, + "num_input_tokens_seen": 215274096, + "step": 176925 + }, + { + "epoch": 19.704866911682814, + "grad_norm": 0.058734092861413956, + "learning_rate": 3.3184261637603245e-08, + "loss": 0.0205, + "num_input_tokens_seen": 215280272, + "step": 176930 + }, + { + "epoch": 19.705423766566433, + "grad_norm": 0.0006276373751461506, + "learning_rate": 3.30592308610711e-08, + "loss": 0.07, + "num_input_tokens_seen": 215286320, + "step": 176935 + }, + { + "epoch": 19.70598062145005, + "grad_norm": 0.09215644001960754, + "learning_rate": 3.2934435917297836e-08, + "loss": 0.0066, + "num_input_tokens_seen": 215292368, + "step": 176940 + }, + { + "epoch": 19.70653747633367, + "grad_norm": 0.06345754861831665, + "learning_rate": 3.2809876807463056e-08, + "loss": 0.1484, + "num_input_tokens_seen": 215298576, + "step": 176945 + }, + { + "epoch": 19.707094331217284, + "grad_norm": 1.3524634838104248, + "learning_rate": 3.268555353274083e-08, + "loss": 0.0169, + "num_input_tokens_seen": 215304912, + "step": 176950 + }, + { + "epoch": 19.707651186100904, + "grad_norm": 0.12552852928638458, + "learning_rate": 3.256146609430799e-08, + "loss": 0.0229, + "num_input_tokens_seen": 215311056, + "step": 176955 + }, + { + "epoch": 19.70820804098452, + "grad_norm": 0.14696525037288666, + "learning_rate": 3.243761449333582e-08, + "loss": 0.0119, + "num_input_tokens_seen": 215317040, + "step": 176960 + }, + { + "epoch": 19.708764895868136, + "grad_norm": 0.04670986160635948, + "learning_rate": 3.2313998730992835e-08, + "loss": 0.0971, + "num_input_tokens_seen": 215323376, + "step": 176965 + }, + { + "epoch": 19.709321750751755, + "grad_norm": 0.004063499625772238, + "learning_rate": 3.219061880844754e-08, + "loss": 0.0079, + "num_input_tokens_seen": 215329232, + "step": 176970 + }, + { + "epoch": 19.70987860563537, + "grad_norm": 0.35464969277381897, + "learning_rate": 3.2067474726868444e-08, + "loss": 0.0486, + "num_input_tokens_seen": 215335568, + "step": 176975 + }, + { + "epoch": 19.71043546051899, + "grad_norm": 0.013313676230609417, + "learning_rate": 3.194456648741295e-08, + "loss": 0.0386, + "num_input_tokens_seen": 215341616, + "step": 176980 + }, + { + "epoch": 19.710992315402606, + "grad_norm": 0.4503885805606842, + "learning_rate": 3.182189409124958e-08, + "loss": 0.0081, + "num_input_tokens_seen": 215348208, + "step": 176985 + }, + { + "epoch": 19.711549170286222, + "grad_norm": 0.011739186942577362, + "learning_rate": 3.169945753952741e-08, + "loss": 0.0337, + "num_input_tokens_seen": 215354448, + "step": 176990 + }, + { + "epoch": 19.71210602516984, + "grad_norm": 0.00033821415854617953, + "learning_rate": 3.1577256833412175e-08, + "loss": 0.0002, + "num_input_tokens_seen": 215360688, + "step": 176995 + }, + { + "epoch": 19.712662880053458, + "grad_norm": 0.0008248700178228319, + "learning_rate": 3.145529197405017e-08, + "loss": 0.0699, + "num_input_tokens_seen": 215366288, + "step": 177000 + }, + { + "epoch": 19.713219734937077, + "grad_norm": 0.0008287824457511306, + "learning_rate": 3.1333562962601594e-08, + "loss": 0.0504, + "num_input_tokens_seen": 215372752, + "step": 177005 + }, + { + "epoch": 19.713776589820693, + "grad_norm": 0.16429872810840607, + "learning_rate": 3.1212069800209966e-08, + "loss": 0.0053, + "num_input_tokens_seen": 215379024, + "step": 177010 + }, + { + "epoch": 19.71433344470431, + "grad_norm": 0.0028649303130805492, + "learning_rate": 3.109081248802437e-08, + "loss": 0.0005, + "num_input_tokens_seen": 215385328, + "step": 177015 + }, + { + "epoch": 19.71489029958793, + "grad_norm": 0.05529779940843582, + "learning_rate": 3.096979102719111e-08, + "loss": 0.0111, + "num_input_tokens_seen": 215391568, + "step": 177020 + }, + { + "epoch": 19.715447154471544, + "grad_norm": 0.017062371596693993, + "learning_rate": 3.0849005418853715e-08, + "loss": 0.0476, + "num_input_tokens_seen": 215398000, + "step": 177025 + }, + { + "epoch": 19.716004009355164, + "grad_norm": 0.265949547290802, + "learning_rate": 3.072845566415295e-08, + "loss": 0.0068, + "num_input_tokens_seen": 215404144, + "step": 177030 + }, + { + "epoch": 19.71656086423878, + "grad_norm": 0.22228685021400452, + "learning_rate": 3.0608141764223996e-08, + "loss": 0.0068, + "num_input_tokens_seen": 215410544, + "step": 177035 + }, + { + "epoch": 19.717117719122395, + "grad_norm": 0.000774337793700397, + "learning_rate": 3.04880637202104e-08, + "loss": 0.0204, + "num_input_tokens_seen": 215416688, + "step": 177040 + }, + { + "epoch": 19.717674574006015, + "grad_norm": 0.1976633369922638, + "learning_rate": 3.0368221533239036e-08, + "loss": 0.0126, + "num_input_tokens_seen": 215422832, + "step": 177045 + }, + { + "epoch": 19.71823142888963, + "grad_norm": 1.1937060356140137, + "learning_rate": 3.024861520444511e-08, + "loss": 0.0154, + "num_input_tokens_seen": 215428848, + "step": 177050 + }, + { + "epoch": 19.71878828377325, + "grad_norm": 0.0018018411938101053, + "learning_rate": 3.0129244734961035e-08, + "loss": 0.0366, + "num_input_tokens_seen": 215434992, + "step": 177055 + }, + { + "epoch": 19.719345138656866, + "grad_norm": 0.014453739859163761, + "learning_rate": 3.001011012591093e-08, + "loss": 0.0028, + "num_input_tokens_seen": 215441040, + "step": 177060 + }, + { + "epoch": 19.719901993540482, + "grad_norm": 0.0009770331671461463, + "learning_rate": 2.9891211378421654e-08, + "loss": 0.002, + "num_input_tokens_seen": 215446640, + "step": 177065 + }, + { + "epoch": 19.7204588484241, + "grad_norm": 1.1485861539840698, + "learning_rate": 2.9772548493611775e-08, + "loss": 0.0268, + "num_input_tokens_seen": 215452816, + "step": 177070 + }, + { + "epoch": 19.721015703307717, + "grad_norm": 0.02252834476530552, + "learning_rate": 2.965412147261093e-08, + "loss": 0.0873, + "num_input_tokens_seen": 215458992, + "step": 177075 + }, + { + "epoch": 19.721572558191337, + "grad_norm": 0.30297601222991943, + "learning_rate": 2.9535930316529348e-08, + "loss": 0.0125, + "num_input_tokens_seen": 215465520, + "step": 177080 + }, + { + "epoch": 19.722129413074953, + "grad_norm": 0.03140881285071373, + "learning_rate": 2.9417975026488353e-08, + "loss": 0.0177, + "num_input_tokens_seen": 215471632, + "step": 177085 + }, + { + "epoch": 19.72268626795857, + "grad_norm": 1.796599268913269, + "learning_rate": 2.9300255603600945e-08, + "loss": 0.0301, + "num_input_tokens_seen": 215477552, + "step": 177090 + }, + { + "epoch": 19.723243122842188, + "grad_norm": 0.04447835311293602, + "learning_rate": 2.9182772048977348e-08, + "loss": 0.02, + "num_input_tokens_seen": 215483888, + "step": 177095 + }, + { + "epoch": 19.723799977725804, + "grad_norm": 0.0010049246484413743, + "learning_rate": 2.9065524363730556e-08, + "loss": 0.008, + "num_input_tokens_seen": 215489232, + "step": 177100 + }, + { + "epoch": 19.724356832609423, + "grad_norm": 0.00048777094343677163, + "learning_rate": 2.8948512548965246e-08, + "loss": 0.0635, + "num_input_tokens_seen": 215495760, + "step": 177105 + }, + { + "epoch": 19.72491368749304, + "grad_norm": 0.20281024277210236, + "learning_rate": 2.8831736605786087e-08, + "loss": 0.084, + "num_input_tokens_seen": 215501936, + "step": 177110 + }, + { + "epoch": 19.725470542376655, + "grad_norm": 0.010881275869905949, + "learning_rate": 2.8715196535300526e-08, + "loss": 0.0008, + "num_input_tokens_seen": 215508208, + "step": 177115 + }, + { + "epoch": 19.726027397260275, + "grad_norm": 0.025806544348597527, + "learning_rate": 2.859889233860491e-08, + "loss": 0.02, + "num_input_tokens_seen": 215514128, + "step": 177120 + }, + { + "epoch": 19.72658425214389, + "grad_norm": 0.3790726661682129, + "learning_rate": 2.8482824016801135e-08, + "loss": 0.106, + "num_input_tokens_seen": 215519984, + "step": 177125 + }, + { + "epoch": 19.72714110702751, + "grad_norm": 0.08096778392791748, + "learning_rate": 2.836699157098277e-08, + "loss": 0.0533, + "num_input_tokens_seen": 215526448, + "step": 177130 + }, + { + "epoch": 19.727697961911126, + "grad_norm": 2.084683656692505, + "learning_rate": 2.8251395002246162e-08, + "loss": 0.0734, + "num_input_tokens_seen": 215532592, + "step": 177135 + }, + { + "epoch": 19.728254816794742, + "grad_norm": 0.019450001418590546, + "learning_rate": 2.8136034311679328e-08, + "loss": 0.0507, + "num_input_tokens_seen": 215538544, + "step": 177140 + }, + { + "epoch": 19.72881167167836, + "grad_norm": 0.002756464760750532, + "learning_rate": 2.8020909500378613e-08, + "loss": 0.0563, + "num_input_tokens_seen": 215544592, + "step": 177145 + }, + { + "epoch": 19.729368526561977, + "grad_norm": 0.004167813807725906, + "learning_rate": 2.7906020569426483e-08, + "loss": 0.0316, + "num_input_tokens_seen": 215549808, + "step": 177150 + }, + { + "epoch": 19.729925381445597, + "grad_norm": 0.0019502331269904971, + "learning_rate": 2.7791367519908186e-08, + "loss": 0.0016, + "num_input_tokens_seen": 215556176, + "step": 177155 + }, + { + "epoch": 19.730482236329212, + "grad_norm": 1.4683573246002197, + "learning_rate": 2.767695035290896e-08, + "loss": 0.1475, + "num_input_tokens_seen": 215562352, + "step": 177160 + }, + { + "epoch": 19.73103909121283, + "grad_norm": 1.29171621799469, + "learning_rate": 2.7562769069505723e-08, + "loss": 0.0431, + "num_input_tokens_seen": 215568112, + "step": 177165 + }, + { + "epoch": 19.731595946096448, + "grad_norm": 0.005373157560825348, + "learning_rate": 2.7448823670783718e-08, + "loss": 0.0378, + "num_input_tokens_seen": 215574256, + "step": 177170 + }, + { + "epoch": 19.732152800980064, + "grad_norm": 0.647213339805603, + "learning_rate": 2.7335114157811535e-08, + "loss": 0.0226, + "num_input_tokens_seen": 215580240, + "step": 177175 + }, + { + "epoch": 19.732709655863683, + "grad_norm": 0.7457994818687439, + "learning_rate": 2.7221640531668868e-08, + "loss": 0.0159, + "num_input_tokens_seen": 215586640, + "step": 177180 + }, + { + "epoch": 19.7332665107473, + "grad_norm": 2.308709144592285, + "learning_rate": 2.710840279342708e-08, + "loss": 0.2398, + "num_input_tokens_seen": 215592176, + "step": 177185 + }, + { + "epoch": 19.733823365630915, + "grad_norm": 1.8710048198699951, + "learning_rate": 2.6995400944151984e-08, + "loss": 0.0563, + "num_input_tokens_seen": 215598224, + "step": 177190 + }, + { + "epoch": 19.734380220514534, + "grad_norm": 0.28059470653533936, + "learning_rate": 2.688263498491217e-08, + "loss": 0.0077, + "num_input_tokens_seen": 215604336, + "step": 177195 + }, + { + "epoch": 19.73493707539815, + "grad_norm": 0.13218747079372406, + "learning_rate": 2.6770104916776228e-08, + "loss": 0.0334, + "num_input_tokens_seen": 215610576, + "step": 177200 + }, + { + "epoch": 19.73549393028177, + "grad_norm": 0.0025834725238382816, + "learning_rate": 2.6657810740804423e-08, + "loss": 0.0011, + "num_input_tokens_seen": 215616720, + "step": 177205 + }, + { + "epoch": 19.736050785165386, + "grad_norm": 0.027648411691188812, + "learning_rate": 2.6545752458059793e-08, + "loss": 0.0323, + "num_input_tokens_seen": 215622768, + "step": 177210 + }, + { + "epoch": 19.736607640049, + "grad_norm": 0.10388734191656113, + "learning_rate": 2.6433930069597046e-08, + "loss": 0.0042, + "num_input_tokens_seen": 215628720, + "step": 177215 + }, + { + "epoch": 19.73716449493262, + "grad_norm": 0.04667210951447487, + "learning_rate": 2.6322343576473673e-08, + "loss": 0.0134, + "num_input_tokens_seen": 215634864, + "step": 177220 + }, + { + "epoch": 19.737721349816237, + "grad_norm": 0.7289323806762695, + "learning_rate": 2.621099297974716e-08, + "loss": 0.0144, + "num_input_tokens_seen": 215641008, + "step": 177225 + }, + { + "epoch": 19.738278204699856, + "grad_norm": 0.7682905197143555, + "learning_rate": 2.6099878280463898e-08, + "loss": 0.043, + "num_input_tokens_seen": 215647216, + "step": 177230 + }, + { + "epoch": 19.738835059583472, + "grad_norm": 0.05963994935154915, + "learning_rate": 2.5988999479675812e-08, + "loss": 0.0223, + "num_input_tokens_seen": 215653136, + "step": 177235 + }, + { + "epoch": 19.739391914467088, + "grad_norm": 1.030496597290039, + "learning_rate": 2.5878356578432073e-08, + "loss": 0.0494, + "num_input_tokens_seen": 215659248, + "step": 177240 + }, + { + "epoch": 19.739948769350708, + "grad_norm": 0.06071767210960388, + "learning_rate": 2.5767949577773508e-08, + "loss": 0.0018, + "num_input_tokens_seen": 215665552, + "step": 177245 + }, + { + "epoch": 19.740505624234324, + "grad_norm": 0.2245669662952423, + "learning_rate": 2.5657778478749284e-08, + "loss": 0.0756, + "num_input_tokens_seen": 215671376, + "step": 177250 + }, + { + "epoch": 19.741062479117943, + "grad_norm": 1.0966298580169678, + "learning_rate": 2.5547843282394678e-08, + "loss": 0.0657, + "num_input_tokens_seen": 215677424, + "step": 177255 + }, + { + "epoch": 19.74161933400156, + "grad_norm": 0.12310110777616501, + "learning_rate": 2.5438143989747754e-08, + "loss": 0.0103, + "num_input_tokens_seen": 215683504, + "step": 177260 + }, + { + "epoch": 19.742176188885175, + "grad_norm": 0.00041858479380607605, + "learning_rate": 2.5328680601849343e-08, + "loss": 0.0713, + "num_input_tokens_seen": 215689616, + "step": 177265 + }, + { + "epoch": 19.742733043768794, + "grad_norm": 1.1509416103363037, + "learning_rate": 2.521945311973195e-08, + "loss": 0.0503, + "num_input_tokens_seen": 215695632, + "step": 177270 + }, + { + "epoch": 19.74328989865241, + "grad_norm": 0.0020485222339630127, + "learning_rate": 2.511046154442531e-08, + "loss": 0.0058, + "num_input_tokens_seen": 215700624, + "step": 177275 + }, + { + "epoch": 19.74384675353603, + "grad_norm": 0.061861611902713776, + "learning_rate": 2.5001705876959157e-08, + "loss": 0.0025, + "num_input_tokens_seen": 215706672, + "step": 177280 + }, + { + "epoch": 19.744403608419645, + "grad_norm": 0.006963819265365601, + "learning_rate": 2.4893186118360446e-08, + "loss": 0.067, + "num_input_tokens_seen": 215712944, + "step": 177285 + }, + { + "epoch": 19.744960463303265, + "grad_norm": 0.0007267093169502914, + "learning_rate": 2.4784902269658906e-08, + "loss": 0.02, + "num_input_tokens_seen": 215719088, + "step": 177290 + }, + { + "epoch": 19.74551731818688, + "grad_norm": 0.0038484889082610607, + "learning_rate": 2.467685433187039e-08, + "loss": 0.2428, + "num_input_tokens_seen": 215725104, + "step": 177295 + }, + { + "epoch": 19.746074173070497, + "grad_norm": 0.6876802444458008, + "learning_rate": 2.456904230602186e-08, + "loss": 0.0362, + "num_input_tokens_seen": 215731120, + "step": 177300 + }, + { + "epoch": 19.746631027954116, + "grad_norm": 0.16082993149757385, + "learning_rate": 2.4461466193126393e-08, + "loss": 0.0023, + "num_input_tokens_seen": 215737264, + "step": 177305 + }, + { + "epoch": 19.747187882837732, + "grad_norm": 0.0003026487829629332, + "learning_rate": 2.4354125994202613e-08, + "loss": 0.0502, + "num_input_tokens_seen": 215743472, + "step": 177310 + }, + { + "epoch": 19.74774473772135, + "grad_norm": 0.13885925710201263, + "learning_rate": 2.4247021710263608e-08, + "loss": 0.0016, + "num_input_tokens_seen": 215750064, + "step": 177315 + }, + { + "epoch": 19.748301592604967, + "grad_norm": 1.4638036489486694, + "learning_rate": 2.414015334232522e-08, + "loss": 0.1044, + "num_input_tokens_seen": 215755312, + "step": 177320 + }, + { + "epoch": 19.748858447488583, + "grad_norm": 0.05802585929632187, + "learning_rate": 2.4033520891389437e-08, + "loss": 0.1582, + "num_input_tokens_seen": 215761456, + "step": 177325 + }, + { + "epoch": 19.749415302372203, + "grad_norm": 1.1032649278640747, + "learning_rate": 2.3927124358469332e-08, + "loss": 0.0326, + "num_input_tokens_seen": 215767088, + "step": 177330 + }, + { + "epoch": 19.74997215725582, + "grad_norm": 0.01923094503581524, + "learning_rate": 2.3820963744566882e-08, + "loss": 0.0553, + "num_input_tokens_seen": 215773264, + "step": 177335 + }, + { + "epoch": 19.750529012139438, + "grad_norm": 0.18083910644054413, + "learning_rate": 2.371503905068684e-08, + "loss": 0.023, + "num_input_tokens_seen": 215779056, + "step": 177340 + }, + { + "epoch": 19.751085867023054, + "grad_norm": 0.0004724199534393847, + "learning_rate": 2.36093502778284e-08, + "loss": 0.04, + "num_input_tokens_seen": 215785488, + "step": 177345 + }, + { + "epoch": 19.75164272190667, + "grad_norm": 0.14792408049106598, + "learning_rate": 2.3503897426990775e-08, + "loss": 0.0082, + "num_input_tokens_seen": 215791792, + "step": 177350 + }, + { + "epoch": 19.75219957679029, + "grad_norm": 0.2568512260913849, + "learning_rate": 2.3398680499170377e-08, + "loss": 0.0963, + "num_input_tokens_seen": 215798000, + "step": 177355 + }, + { + "epoch": 19.752756431673905, + "grad_norm": 0.0739399790763855, + "learning_rate": 2.3293699495360865e-08, + "loss": 0.0522, + "num_input_tokens_seen": 215804304, + "step": 177360 + }, + { + "epoch": 19.753313286557525, + "grad_norm": 0.005798896308988333, + "learning_rate": 2.318895441655311e-08, + "loss": 0.0429, + "num_input_tokens_seen": 215810416, + "step": 177365 + }, + { + "epoch": 19.75387014144114, + "grad_norm": 0.9139528274536133, + "learning_rate": 2.308444526373521e-08, + "loss": 0.1099, + "num_input_tokens_seen": 215815856, + "step": 177370 + }, + { + "epoch": 19.754426996324757, + "grad_norm": 0.16397294402122498, + "learning_rate": 2.2980172037895265e-08, + "loss": 0.0081, + "num_input_tokens_seen": 215822128, + "step": 177375 + }, + { + "epoch": 19.754983851208376, + "grad_norm": 0.025227302685379982, + "learning_rate": 2.287613474002137e-08, + "loss": 0.0276, + "num_input_tokens_seen": 215828464, + "step": 177380 + }, + { + "epoch": 19.755540706091992, + "grad_norm": 0.04432055354118347, + "learning_rate": 2.2772333371090525e-08, + "loss": 0.0123, + "num_input_tokens_seen": 215834512, + "step": 177385 + }, + { + "epoch": 19.75609756097561, + "grad_norm": 0.02327822521328926, + "learning_rate": 2.266876793209083e-08, + "loss": 0.0051, + "num_input_tokens_seen": 215840848, + "step": 177390 + }, + { + "epoch": 19.756654415859227, + "grad_norm": 0.929536759853363, + "learning_rate": 2.2565438423993725e-08, + "loss": 0.0483, + "num_input_tokens_seen": 215846960, + "step": 177395 + }, + { + "epoch": 19.757211270742843, + "grad_norm": 1.614730954170227, + "learning_rate": 2.2462344847776205e-08, + "loss": 0.186, + "num_input_tokens_seen": 215853104, + "step": 177400 + }, + { + "epoch": 19.757768125626463, + "grad_norm": 2.4267961978912354, + "learning_rate": 2.2359487204415273e-08, + "loss": 0.1133, + "num_input_tokens_seen": 215858992, + "step": 177405 + }, + { + "epoch": 19.75832498051008, + "grad_norm": 0.13763056695461273, + "learning_rate": 2.2256865494879597e-08, + "loss": 0.0009, + "num_input_tokens_seen": 215865264, + "step": 177410 + }, + { + "epoch": 19.758881835393698, + "grad_norm": 0.5641576647758484, + "learning_rate": 2.215447972014062e-08, + "loss": 0.007, + "num_input_tokens_seen": 215871216, + "step": 177415 + }, + { + "epoch": 19.759438690277314, + "grad_norm": 0.5506904721260071, + "learning_rate": 2.2052329881167012e-08, + "loss": 0.0667, + "num_input_tokens_seen": 215877680, + "step": 177420 + }, + { + "epoch": 19.75999554516093, + "grad_norm": 0.010534653440117836, + "learning_rate": 2.195041597891634e-08, + "loss": 0.0095, + "num_input_tokens_seen": 215883920, + "step": 177425 + }, + { + "epoch": 19.76055240004455, + "grad_norm": 0.051714036613702774, + "learning_rate": 2.184873801436005e-08, + "loss": 0.0097, + "num_input_tokens_seen": 215890256, + "step": 177430 + }, + { + "epoch": 19.761109254928165, + "grad_norm": 0.0003134125145152211, + "learning_rate": 2.174729598845293e-08, + "loss": 0.0413, + "num_input_tokens_seen": 215896432, + "step": 177435 + }, + { + "epoch": 19.761666109811785, + "grad_norm": 0.0016674456419423223, + "learning_rate": 2.1646089902152557e-08, + "loss": 0.1018, + "num_input_tokens_seen": 215902320, + "step": 177440 + }, + { + "epoch": 19.7622229646954, + "grad_norm": 0.012557754293084145, + "learning_rate": 2.1545119756419262e-08, + "loss": 0.0062, + "num_input_tokens_seen": 215908624, + "step": 177445 + }, + { + "epoch": 19.762779819579016, + "grad_norm": 0.0009144449722953141, + "learning_rate": 2.144438555220507e-08, + "loss": 0.0347, + "num_input_tokens_seen": 215914512, + "step": 177450 + }, + { + "epoch": 19.763336674462636, + "grad_norm": 0.5875910520553589, + "learning_rate": 2.1343887290461993e-08, + "loss": 0.013, + "num_input_tokens_seen": 215920688, + "step": 177455 + }, + { + "epoch": 19.76389352934625, + "grad_norm": 0.08506828546524048, + "learning_rate": 2.12436249721365e-08, + "loss": 0.0055, + "num_input_tokens_seen": 215926832, + "step": 177460 + }, + { + "epoch": 19.76445038422987, + "grad_norm": 0.4054498076438904, + "learning_rate": 2.114359859817783e-08, + "loss": 0.0763, + "num_input_tokens_seen": 215932720, + "step": 177465 + }, + { + "epoch": 19.765007239113487, + "grad_norm": 0.0002678809396456927, + "learning_rate": 2.104380816953244e-08, + "loss": 0.0075, + "num_input_tokens_seen": 215938736, + "step": 177470 + }, + { + "epoch": 19.765564093997103, + "grad_norm": 0.016674935817718506, + "learning_rate": 2.094425368713848e-08, + "loss": 0.1595, + "num_input_tokens_seen": 215944816, + "step": 177475 + }, + { + "epoch": 19.766120948880722, + "grad_norm": 8.060621621552855e-05, + "learning_rate": 2.0844935151942413e-08, + "loss": 0.0225, + "num_input_tokens_seen": 215950928, + "step": 177480 + }, + { + "epoch": 19.76667780376434, + "grad_norm": 0.0050897118635475636, + "learning_rate": 2.074585256487682e-08, + "loss": 0.0119, + "num_input_tokens_seen": 215957264, + "step": 177485 + }, + { + "epoch": 19.767234658647958, + "grad_norm": 0.00015803801943548024, + "learning_rate": 2.064700592687985e-08, + "loss": 0.0485, + "num_input_tokens_seen": 215963120, + "step": 177490 + }, + { + "epoch": 19.767791513531574, + "grad_norm": 0.3016637861728668, + "learning_rate": 2.0548395238884076e-08, + "loss": 0.02, + "num_input_tokens_seen": 215968944, + "step": 177495 + }, + { + "epoch": 19.76834836841519, + "grad_norm": 0.39789384603500366, + "learning_rate": 2.04500205018221e-08, + "loss": 0.0017, + "num_input_tokens_seen": 215975216, + "step": 177500 + }, + { + "epoch": 19.76890522329881, + "grad_norm": 0.3331925570964813, + "learning_rate": 2.0351881716623723e-08, + "loss": 0.0054, + "num_input_tokens_seen": 215981552, + "step": 177505 + }, + { + "epoch": 19.769462078182425, + "grad_norm": 0.09126333147287369, + "learning_rate": 2.0253978884215987e-08, + "loss": 0.1098, + "num_input_tokens_seen": 215987696, + "step": 177510 + }, + { + "epoch": 19.770018933066044, + "grad_norm": 0.0221183393150568, + "learning_rate": 2.0156312005520374e-08, + "loss": 0.0088, + "num_input_tokens_seen": 215993744, + "step": 177515 + }, + { + "epoch": 19.77057578794966, + "grad_norm": 0.22036480903625488, + "learning_rate": 2.005888108146392e-08, + "loss": 0.0018, + "num_input_tokens_seen": 215999664, + "step": 177520 + }, + { + "epoch": 19.771132642833276, + "grad_norm": 0.11297199875116348, + "learning_rate": 1.996168611296534e-08, + "loss": 0.0303, + "num_input_tokens_seen": 216005424, + "step": 177525 + }, + { + "epoch": 19.771689497716896, + "grad_norm": 0.06773687899112701, + "learning_rate": 1.986472710094056e-08, + "loss": 0.0046, + "num_input_tokens_seen": 216011536, + "step": 177530 + }, + { + "epoch": 19.77224635260051, + "grad_norm": 3.37062931060791, + "learning_rate": 1.9768004046308297e-08, + "loss": 0.2449, + "num_input_tokens_seen": 216017616, + "step": 177535 + }, + { + "epoch": 19.77280320748413, + "grad_norm": 1.8924996852874756, + "learning_rate": 1.9671516949981706e-08, + "loss": 0.0845, + "num_input_tokens_seen": 216023568, + "step": 177540 + }, + { + "epoch": 19.773360062367747, + "grad_norm": 0.0493718683719635, + "learning_rate": 1.9575265812868393e-08, + "loss": 0.102, + "num_input_tokens_seen": 216028976, + "step": 177545 + }, + { + "epoch": 19.773916917251363, + "grad_norm": 0.002245688810944557, + "learning_rate": 1.9479250635884295e-08, + "loss": 0.0047, + "num_input_tokens_seen": 216034960, + "step": 177550 + }, + { + "epoch": 19.774473772134982, + "grad_norm": 0.04227421432733536, + "learning_rate": 1.9383471419931466e-08, + "loss": 0.0264, + "num_input_tokens_seen": 216041264, + "step": 177555 + }, + { + "epoch": 19.775030627018598, + "grad_norm": 0.00012763596896547824, + "learning_rate": 1.9287928165914736e-08, + "loss": 0.1097, + "num_input_tokens_seen": 216047280, + "step": 177560 + }, + { + "epoch": 19.775587481902217, + "grad_norm": 1.7636464834213257, + "learning_rate": 1.919262087473894e-08, + "loss": 0.0745, + "num_input_tokens_seen": 216053456, + "step": 177565 + }, + { + "epoch": 19.776144336785833, + "grad_norm": 0.0009305339190177619, + "learning_rate": 1.9097549547303363e-08, + "loss": 0.0343, + "num_input_tokens_seen": 216059152, + "step": 177570 + }, + { + "epoch": 19.77670119166945, + "grad_norm": 0.7998013496398926, + "learning_rate": 1.90027141845045e-08, + "loss": 0.0561, + "num_input_tokens_seen": 216065616, + "step": 177575 + }, + { + "epoch": 19.77725804655307, + "grad_norm": 0.026424260810017586, + "learning_rate": 1.890811478724164e-08, + "loss": 0.0421, + "num_input_tokens_seen": 216071760, + "step": 177580 + }, + { + "epoch": 19.777814901436685, + "grad_norm": 1.2604469060897827, + "learning_rate": 1.8813751356402952e-08, + "loss": 0.0526, + "num_input_tokens_seen": 216077808, + "step": 177585 + }, + { + "epoch": 19.778371756320304, + "grad_norm": 0.1685624122619629, + "learning_rate": 1.8719623892884952e-08, + "loss": 0.0113, + "num_input_tokens_seen": 216083696, + "step": 177590 + }, + { + "epoch": 19.77892861120392, + "grad_norm": 1.094205617904663, + "learning_rate": 1.8625732397575813e-08, + "loss": 0.0429, + "num_input_tokens_seen": 216089776, + "step": 177595 + }, + { + "epoch": 19.779485466087536, + "grad_norm": 0.00013517889601644129, + "learning_rate": 1.8532076871360936e-08, + "loss": 0.0125, + "num_input_tokens_seen": 216095984, + "step": 177600 + }, + { + "epoch": 19.780042320971155, + "grad_norm": 0.3933420181274414, + "learning_rate": 1.8438657315122955e-08, + "loss": 0.043, + "num_input_tokens_seen": 216102448, + "step": 177605 + }, + { + "epoch": 19.78059917585477, + "grad_norm": 0.6295027136802673, + "learning_rate": 1.834547372975004e-08, + "loss": 0.0088, + "num_input_tokens_seen": 216108752, + "step": 177610 + }, + { + "epoch": 19.78115603073839, + "grad_norm": 1.3372068405151367, + "learning_rate": 1.8252526116116497e-08, + "loss": 0.0766, + "num_input_tokens_seen": 216114928, + "step": 177615 + }, + { + "epoch": 19.781712885622007, + "grad_norm": 0.026515671983361244, + "learning_rate": 1.815981447510495e-08, + "loss": 0.0694, + "num_input_tokens_seen": 216120944, + "step": 177620 + }, + { + "epoch": 19.782269740505626, + "grad_norm": 2.4638001918792725, + "learning_rate": 1.806733880758693e-08, + "loss": 0.1712, + "num_input_tokens_seen": 216126640, + "step": 177625 + }, + { + "epoch": 19.782826595389242, + "grad_norm": 0.0007311736699193716, + "learning_rate": 1.797509911443951e-08, + "loss": 0.0484, + "num_input_tokens_seen": 216132784, + "step": 177630 + }, + { + "epoch": 19.783383450272858, + "grad_norm": 1.65728759765625, + "learning_rate": 1.7883095396531436e-08, + "loss": 0.0258, + "num_input_tokens_seen": 216138832, + "step": 177635 + }, + { + "epoch": 19.783940305156477, + "grad_norm": 0.09970246255397797, + "learning_rate": 1.7791327654734236e-08, + "loss": 0.0145, + "num_input_tokens_seen": 216144784, + "step": 177640 + }, + { + "epoch": 19.784497160040093, + "grad_norm": 0.1159071996808052, + "learning_rate": 1.7699795889913885e-08, + "loss": 0.0238, + "num_input_tokens_seen": 216150960, + "step": 177645 + }, + { + "epoch": 19.78505401492371, + "grad_norm": 0.002059898804873228, + "learning_rate": 1.760850010293358e-08, + "loss": 0.0296, + "num_input_tokens_seen": 216157008, + "step": 177650 + }, + { + "epoch": 19.78561086980733, + "grad_norm": 1.0134670734405518, + "learning_rate": 1.7517440294656516e-08, + "loss": 0.0521, + "num_input_tokens_seen": 216162960, + "step": 177655 + }, + { + "epoch": 19.786167724690944, + "grad_norm": 0.43584468960762024, + "learning_rate": 1.7426616465943124e-08, + "loss": 0.0736, + "num_input_tokens_seen": 216168848, + "step": 177660 + }, + { + "epoch": 19.786724579574564, + "grad_norm": 0.014632286503911018, + "learning_rate": 1.733602861764827e-08, + "loss": 0.0021, + "num_input_tokens_seen": 216174960, + "step": 177665 + }, + { + "epoch": 19.78728143445818, + "grad_norm": 0.000839485670439899, + "learning_rate": 1.7245676750635152e-08, + "loss": 0.0089, + "num_input_tokens_seen": 216180624, + "step": 177670 + }, + { + "epoch": 19.7878382893418, + "grad_norm": 2.110180139541626, + "learning_rate": 1.7155560865747545e-08, + "loss": 0.0268, + "num_input_tokens_seen": 216186736, + "step": 177675 + }, + { + "epoch": 19.788395144225415, + "grad_norm": 0.9656549692153931, + "learning_rate": 1.7065680963845864e-08, + "loss": 0.1037, + "num_input_tokens_seen": 216193104, + "step": 177680 + }, + { + "epoch": 19.78895199910903, + "grad_norm": 0.003174195298925042, + "learning_rate": 1.697603704577111e-08, + "loss": 0.0132, + "num_input_tokens_seen": 216199408, + "step": 177685 + }, + { + "epoch": 19.78950885399265, + "grad_norm": 0.2910655736923218, + "learning_rate": 1.688662911237815e-08, + "loss": 0.033, + "num_input_tokens_seen": 216204976, + "step": 177690 + }, + { + "epoch": 19.790065708876266, + "grad_norm": 0.09636563062667847, + "learning_rate": 1.679745716450243e-08, + "loss": 0.017, + "num_input_tokens_seen": 216211152, + "step": 177695 + }, + { + "epoch": 19.790622563759886, + "grad_norm": 0.0021683142986148596, + "learning_rate": 1.6708521202993266e-08, + "loss": 0.0811, + "num_input_tokens_seen": 216217168, + "step": 177700 + }, + { + "epoch": 19.7911794186435, + "grad_norm": 0.10242278128862381, + "learning_rate": 1.6619821228688882e-08, + "loss": 0.0249, + "num_input_tokens_seen": 216223344, + "step": 177705 + }, + { + "epoch": 19.791736273527118, + "grad_norm": 4.12600564956665, + "learning_rate": 1.6531357242427492e-08, + "loss": 0.0331, + "num_input_tokens_seen": 216229200, + "step": 177710 + }, + { + "epoch": 19.792293128410737, + "grad_norm": 0.03031780570745468, + "learning_rate": 1.6443129245041766e-08, + "loss": 0.0031, + "num_input_tokens_seen": 216235504, + "step": 177715 + }, + { + "epoch": 19.792849983294353, + "grad_norm": 0.04194365069270134, + "learning_rate": 1.6355137237367147e-08, + "loss": 0.0482, + "num_input_tokens_seen": 216241392, + "step": 177720 + }, + { + "epoch": 19.793406838177972, + "grad_norm": 0.0087435869500041, + "learning_rate": 1.62673812202363e-08, + "loss": 0.0142, + "num_input_tokens_seen": 216247472, + "step": 177725 + }, + { + "epoch": 19.79396369306159, + "grad_norm": 0.0189976766705513, + "learning_rate": 1.617986119447634e-08, + "loss": 0.0716, + "num_input_tokens_seen": 216253232, + "step": 177730 + }, + { + "epoch": 19.794520547945204, + "grad_norm": 0.00016094859165605158, + "learning_rate": 1.609257716091439e-08, + "loss": 0.0223, + "num_input_tokens_seen": 216259440, + "step": 177735 + }, + { + "epoch": 19.795077402828824, + "grad_norm": 0.21006231009960175, + "learning_rate": 1.600552912037201e-08, + "loss": 0.0093, + "num_input_tokens_seen": 216265616, + "step": 177740 + }, + { + "epoch": 19.79563425771244, + "grad_norm": 0.06014315411448479, + "learning_rate": 1.5918717073676316e-08, + "loss": 0.0621, + "num_input_tokens_seen": 216271472, + "step": 177745 + }, + { + "epoch": 19.79619111259606, + "grad_norm": 0.5290378332138062, + "learning_rate": 1.5832141021646097e-08, + "loss": 0.0262, + "num_input_tokens_seen": 216277712, + "step": 177750 + }, + { + "epoch": 19.796747967479675, + "grad_norm": 2.3787755966186523, + "learning_rate": 1.574580096509737e-08, + "loss": 0.077, + "num_input_tokens_seen": 216284080, + "step": 177755 + }, + { + "epoch": 19.79730482236329, + "grad_norm": 0.003262293292209506, + "learning_rate": 1.5659696904846144e-08, + "loss": 0.0015, + "num_input_tokens_seen": 216290224, + "step": 177760 + }, + { + "epoch": 19.79786167724691, + "grad_norm": 2.235325574874878, + "learning_rate": 1.5573828841708438e-08, + "loss": 0.1634, + "num_input_tokens_seen": 216295760, + "step": 177765 + }, + { + "epoch": 19.798418532130526, + "grad_norm": 1.0818277597427368, + "learning_rate": 1.5488196776491937e-08, + "loss": 0.0619, + "num_input_tokens_seen": 216301808, + "step": 177770 + }, + { + "epoch": 19.798975387014146, + "grad_norm": 0.12651120126247406, + "learning_rate": 1.5402800710007102e-08, + "loss": 0.0488, + "num_input_tokens_seen": 216308080, + "step": 177775 + }, + { + "epoch": 19.79953224189776, + "grad_norm": 0.17424163222312927, + "learning_rate": 1.531764064305885e-08, + "loss": 0.0834, + "num_input_tokens_seen": 216313648, + "step": 177780 + }, + { + "epoch": 19.800089096781377, + "grad_norm": 0.16793103516101837, + "learning_rate": 1.5232716576452087e-08, + "loss": 0.0143, + "num_input_tokens_seen": 216319792, + "step": 177785 + }, + { + "epoch": 19.800645951664997, + "grad_norm": 0.028303686529397964, + "learning_rate": 1.514802851099173e-08, + "loss": 0.0013, + "num_input_tokens_seen": 216325936, + "step": 177790 + }, + { + "epoch": 19.801202806548613, + "grad_norm": 0.009051090106368065, + "learning_rate": 1.506357644747436e-08, + "loss": 0.0429, + "num_input_tokens_seen": 216331696, + "step": 177795 + }, + { + "epoch": 19.801759661432232, + "grad_norm": 1.3639897108078003, + "learning_rate": 1.4979360386699337e-08, + "loss": 0.089, + "num_input_tokens_seen": 216337040, + "step": 177800 + }, + { + "epoch": 19.802316516315848, + "grad_norm": 0.012567208148539066, + "learning_rate": 1.489538032946325e-08, + "loss": 0.0057, + "num_input_tokens_seen": 216343216, + "step": 177805 + }, + { + "epoch": 19.802873371199464, + "grad_norm": 0.3396633267402649, + "learning_rate": 1.4811636276557128e-08, + "loss": 0.0036, + "num_input_tokens_seen": 216349552, + "step": 177810 + }, + { + "epoch": 19.803430226083083, + "grad_norm": 0.2346287965774536, + "learning_rate": 1.472812822877201e-08, + "loss": 0.0725, + "num_input_tokens_seen": 216356176, + "step": 177815 + }, + { + "epoch": 19.8039870809667, + "grad_norm": 0.0028296958189457655, + "learning_rate": 1.4644856186898925e-08, + "loss": 0.0285, + "num_input_tokens_seen": 216362096, + "step": 177820 + }, + { + "epoch": 19.80454393585032, + "grad_norm": 0.027769070118665695, + "learning_rate": 1.456182015172336e-08, + "loss": 0.0827, + "num_input_tokens_seen": 216368048, + "step": 177825 + }, + { + "epoch": 19.805100790733935, + "grad_norm": 1.256334662437439, + "learning_rate": 1.447902012402802e-08, + "loss": 0.0133, + "num_input_tokens_seen": 216374032, + "step": 177830 + }, + { + "epoch": 19.80565764561755, + "grad_norm": 0.029780283570289612, + "learning_rate": 1.4396456104598388e-08, + "loss": 0.0014, + "num_input_tokens_seen": 216380464, + "step": 177835 + }, + { + "epoch": 19.80621450050117, + "grad_norm": 0.10108339786529541, + "learning_rate": 1.4314128094211621e-08, + "loss": 0.0622, + "num_input_tokens_seen": 216386544, + "step": 177840 + }, + { + "epoch": 19.806771355384786, + "grad_norm": 0.0033869906328618526, + "learning_rate": 1.4232036093644874e-08, + "loss": 0.0809, + "num_input_tokens_seen": 216392752, + "step": 177845 + }, + { + "epoch": 19.807328210268405, + "grad_norm": 0.7248917818069458, + "learning_rate": 1.4150180103675303e-08, + "loss": 0.0428, + "num_input_tokens_seen": 216398960, + "step": 177850 + }, + { + "epoch": 19.80788506515202, + "grad_norm": 0.6840599179267883, + "learning_rate": 1.4068560125077291e-08, + "loss": 0.0125, + "num_input_tokens_seen": 216405200, + "step": 177855 + }, + { + "epoch": 19.808441920035637, + "grad_norm": 1.2025443315505981, + "learning_rate": 1.3987176158616888e-08, + "loss": 0.0493, + "num_input_tokens_seen": 216411376, + "step": 177860 + }, + { + "epoch": 19.808998774919257, + "grad_norm": 1.651742935180664, + "learning_rate": 1.3906028205068478e-08, + "loss": 0.0354, + "num_input_tokens_seen": 216417424, + "step": 177865 + }, + { + "epoch": 19.809555629802873, + "grad_norm": 0.4588966369628906, + "learning_rate": 1.3825116265195337e-08, + "loss": 0.0139, + "num_input_tokens_seen": 216423696, + "step": 177870 + }, + { + "epoch": 19.810112484686492, + "grad_norm": 0.018343526870012283, + "learning_rate": 1.374444033976352e-08, + "loss": 0.0231, + "num_input_tokens_seen": 216429616, + "step": 177875 + }, + { + "epoch": 19.810669339570108, + "grad_norm": 0.020480895414948463, + "learning_rate": 1.366400042953353e-08, + "loss": 0.0131, + "num_input_tokens_seen": 216436176, + "step": 177880 + }, + { + "epoch": 19.811226194453724, + "grad_norm": 0.23379696905612946, + "learning_rate": 1.3583796535265868e-08, + "loss": 0.01, + "num_input_tokens_seen": 216442480, + "step": 177885 + }, + { + "epoch": 19.811783049337343, + "grad_norm": 0.2993669807910919, + "learning_rate": 1.3503828657718266e-08, + "loss": 0.1184, + "num_input_tokens_seen": 216448656, + "step": 177890 + }, + { + "epoch": 19.81233990422096, + "grad_norm": 0.2442711889743805, + "learning_rate": 1.342409679764567e-08, + "loss": 0.0064, + "num_input_tokens_seen": 216454736, + "step": 177895 + }, + { + "epoch": 19.81289675910458, + "grad_norm": 6.134520053863525, + "learning_rate": 1.3344600955800257e-08, + "loss": 0.0649, + "num_input_tokens_seen": 216460912, + "step": 177900 + }, + { + "epoch": 19.813453613988194, + "grad_norm": 0.5883973240852356, + "learning_rate": 1.3265341132934206e-08, + "loss": 0.0885, + "num_input_tokens_seen": 216466832, + "step": 177905 + }, + { + "epoch": 19.81401046887181, + "grad_norm": 0.0023826616816222668, + "learning_rate": 1.3186317329796915e-08, + "loss": 0.0331, + "num_input_tokens_seen": 216472528, + "step": 177910 + }, + { + "epoch": 19.81456732375543, + "grad_norm": 0.0002758461341727525, + "learning_rate": 1.310752954713501e-08, + "loss": 0.037, + "num_input_tokens_seen": 216478768, + "step": 177915 + }, + { + "epoch": 19.815124178639046, + "grad_norm": 0.012568664737045765, + "learning_rate": 1.3028977785689567e-08, + "loss": 0.1462, + "num_input_tokens_seen": 216484784, + "step": 177920 + }, + { + "epoch": 19.815681033522665, + "grad_norm": 0.003621497191488743, + "learning_rate": 1.295066204620443e-08, + "loss": 0.0019, + "num_input_tokens_seen": 216490864, + "step": 177925 + }, + { + "epoch": 19.81623788840628, + "grad_norm": 0.0009367743041366339, + "learning_rate": 1.287258232942068e-08, + "loss": 0.0087, + "num_input_tokens_seen": 216496816, + "step": 177930 + }, + { + "epoch": 19.816794743289897, + "grad_norm": 0.0013187339063733816, + "learning_rate": 1.2794738636076608e-08, + "loss": 0.0347, + "num_input_tokens_seen": 216502896, + "step": 177935 + }, + { + "epoch": 19.817351598173516, + "grad_norm": 3.275829553604126, + "learning_rate": 1.271713096690219e-08, + "loss": 0.0287, + "num_input_tokens_seen": 216508912, + "step": 177940 + }, + { + "epoch": 19.817908453057132, + "grad_norm": 0.9785517454147339, + "learning_rate": 1.2639759322635725e-08, + "loss": 0.109, + "num_input_tokens_seen": 216515024, + "step": 177945 + }, + { + "epoch": 19.818465307940752, + "grad_norm": 0.0039822799153625965, + "learning_rate": 1.2562623704007181e-08, + "loss": 0.0411, + "num_input_tokens_seen": 216520400, + "step": 177950 + }, + { + "epoch": 19.819022162824368, + "grad_norm": 0.021559439599514008, + "learning_rate": 1.2485724111740982e-08, + "loss": 0.002, + "num_input_tokens_seen": 216526736, + "step": 177955 + }, + { + "epoch": 19.819579017707984, + "grad_norm": 0.03755269572138786, + "learning_rate": 1.2409060546569873e-08, + "loss": 0.0236, + "num_input_tokens_seen": 216532496, + "step": 177960 + }, + { + "epoch": 19.820135872591603, + "grad_norm": 0.0005335069727152586, + "learning_rate": 1.23326330092155e-08, + "loss": 0.0286, + "num_input_tokens_seen": 216538544, + "step": 177965 + }, + { + "epoch": 19.82069272747522, + "grad_norm": 0.30184653401374817, + "learning_rate": 1.2256441500396732e-08, + "loss": 0.0051, + "num_input_tokens_seen": 216544560, + "step": 177970 + }, + { + "epoch": 19.82124958235884, + "grad_norm": 0.05790971219539642, + "learning_rate": 1.2180486020835214e-08, + "loss": 0.1185, + "num_input_tokens_seen": 216550896, + "step": 177975 + }, + { + "epoch": 19.821806437242454, + "grad_norm": 0.0035950937308371067, + "learning_rate": 1.210476657125259e-08, + "loss": 0.101, + "num_input_tokens_seen": 216556976, + "step": 177980 + }, + { + "epoch": 19.82236329212607, + "grad_norm": 0.10807640105485916, + "learning_rate": 1.2029283152356629e-08, + "loss": 0.0211, + "num_input_tokens_seen": 216562960, + "step": 177985 + }, + { + "epoch": 19.82292014700969, + "grad_norm": 0.31550833582878113, + "learning_rate": 1.19540357648662e-08, + "loss": 0.0034, + "num_input_tokens_seen": 216569296, + "step": 177990 + }, + { + "epoch": 19.823477001893306, + "grad_norm": 0.17624203860759735, + "learning_rate": 1.187902440948907e-08, + "loss": 0.0114, + "num_input_tokens_seen": 216575728, + "step": 177995 + }, + { + "epoch": 19.824033856776925, + "grad_norm": 0.8948548436164856, + "learning_rate": 1.1804249086935782e-08, + "loss": 0.0179, + "num_input_tokens_seen": 216582288, + "step": 178000 + }, + { + "epoch": 19.82459071166054, + "grad_norm": 0.0001464840315748006, + "learning_rate": 1.1729709797911326e-08, + "loss": 0.0279, + "num_input_tokens_seen": 216588336, + "step": 178005 + }, + { + "epoch": 19.82514756654416, + "grad_norm": 0.0019497487228363752, + "learning_rate": 1.1655406543117919e-08, + "loss": 0.0339, + "num_input_tokens_seen": 216593936, + "step": 178010 + }, + { + "epoch": 19.825704421427776, + "grad_norm": 0.7348816394805908, + "learning_rate": 1.158133932326333e-08, + "loss": 0.1203, + "num_input_tokens_seen": 216600016, + "step": 178015 + }, + { + "epoch": 19.826261276311392, + "grad_norm": 0.006327931769192219, + "learning_rate": 1.1507508139041446e-08, + "loss": 0.1483, + "num_input_tokens_seen": 216606096, + "step": 178020 + }, + { + "epoch": 19.82681813119501, + "grad_norm": 0.6351482272148132, + "learning_rate": 1.1433912991148931e-08, + "loss": 0.0687, + "num_input_tokens_seen": 216611984, + "step": 178025 + }, + { + "epoch": 19.827374986078627, + "grad_norm": 0.0004539898654911667, + "learning_rate": 1.1360553880288005e-08, + "loss": 0.0123, + "num_input_tokens_seen": 216618128, + "step": 178030 + }, + { + "epoch": 19.827931840962247, + "grad_norm": 0.020720461383461952, + "learning_rate": 1.1287430807144229e-08, + "loss": 0.029, + "num_input_tokens_seen": 216624304, + "step": 178035 + }, + { + "epoch": 19.828488695845863, + "grad_norm": 0.2734065651893616, + "learning_rate": 1.121454377241149e-08, + "loss": 0.0333, + "num_input_tokens_seen": 216630128, + "step": 178040 + }, + { + "epoch": 19.82904555072948, + "grad_norm": 0.7904155850410461, + "learning_rate": 1.1141892776780905e-08, + "loss": 0.0626, + "num_input_tokens_seen": 216636208, + "step": 178045 + }, + { + "epoch": 19.829602405613098, + "grad_norm": 4.342410087585449, + "learning_rate": 1.1069477820932484e-08, + "loss": 0.0654, + "num_input_tokens_seen": 216642288, + "step": 178050 + }, + { + "epoch": 19.830159260496714, + "grad_norm": 0.039151113480329514, + "learning_rate": 1.0997298905554564e-08, + "loss": 0.0084, + "num_input_tokens_seen": 216648912, + "step": 178055 + }, + { + "epoch": 19.830716115380334, + "grad_norm": 0.5435516834259033, + "learning_rate": 1.0925356031329937e-08, + "loss": 0.1965, + "num_input_tokens_seen": 216654800, + "step": 178060 + }, + { + "epoch": 19.83127297026395, + "grad_norm": 0.29662182927131653, + "learning_rate": 1.0853649198935834e-08, + "loss": 0.0189, + "num_input_tokens_seen": 216661072, + "step": 178065 + }, + { + "epoch": 19.831829825147565, + "grad_norm": 0.2021685391664505, + "learning_rate": 1.0782178409046716e-08, + "loss": 0.0576, + "num_input_tokens_seen": 216667216, + "step": 178070 + }, + { + "epoch": 19.832386680031185, + "grad_norm": 0.8151760101318359, + "learning_rate": 1.0710943662345375e-08, + "loss": 0.0252, + "num_input_tokens_seen": 216673296, + "step": 178075 + }, + { + "epoch": 19.8329435349148, + "grad_norm": 0.028020337224006653, + "learning_rate": 1.0639944959497939e-08, + "loss": 0.0141, + "num_input_tokens_seen": 216679280, + "step": 178080 + }, + { + "epoch": 19.83350038979842, + "grad_norm": 1.5288816690444946, + "learning_rate": 1.0569182301176094e-08, + "loss": 0.0715, + "num_input_tokens_seen": 216685488, + "step": 178085 + }, + { + "epoch": 19.834057244682036, + "grad_norm": 0.31626659631729126, + "learning_rate": 1.0498655688051528e-08, + "loss": 0.1273, + "num_input_tokens_seen": 216691920, + "step": 178090 + }, + { + "epoch": 19.834614099565652, + "grad_norm": 1.242067813873291, + "learning_rate": 1.0428365120787598e-08, + "loss": 0.0766, + "num_input_tokens_seen": 216698064, + "step": 178095 + }, + { + "epoch": 19.83517095444927, + "grad_norm": 0.3767615854740143, + "learning_rate": 1.0358310600050435e-08, + "loss": 0.0166, + "num_input_tokens_seen": 216704272, + "step": 178100 + }, + { + "epoch": 19.835727809332887, + "grad_norm": 0.3175889253616333, + "learning_rate": 1.0288492126497851e-08, + "loss": 0.131, + "num_input_tokens_seen": 216710608, + "step": 178105 + }, + { + "epoch": 19.836284664216507, + "grad_norm": 0.0008583086309954524, + "learning_rate": 1.02189097007932e-08, + "loss": 0.0693, + "num_input_tokens_seen": 216716656, + "step": 178110 + }, + { + "epoch": 19.836841519100123, + "grad_norm": 0.6953304409980774, + "learning_rate": 1.0149563323591515e-08, + "loss": 0.0172, + "num_input_tokens_seen": 216722384, + "step": 178115 + }, + { + "epoch": 19.83739837398374, + "grad_norm": 0.7686234712600708, + "learning_rate": 1.0080452995550604e-08, + "loss": 0.0398, + "num_input_tokens_seen": 216728496, + "step": 178120 + }, + { + "epoch": 19.837955228867358, + "grad_norm": 0.11148153245449066, + "learning_rate": 1.0011578717319946e-08, + "loss": 0.0041, + "num_input_tokens_seen": 216734672, + "step": 178125 + }, + { + "epoch": 19.838512083750974, + "grad_norm": 0.0023374261800199747, + "learning_rate": 9.942940489554576e-09, + "loss": 0.0037, + "num_input_tokens_seen": 216740464, + "step": 178130 + }, + { + "epoch": 19.839068938634593, + "grad_norm": 0.46351858973503113, + "learning_rate": 9.874538312895642e-09, + "loss": 0.1015, + "num_input_tokens_seen": 216746576, + "step": 178135 + }, + { + "epoch": 19.83962579351821, + "grad_norm": 0.014336907304823399, + "learning_rate": 9.806372187995405e-09, + "loss": 0.0306, + "num_input_tokens_seen": 216752752, + "step": 178140 + }, + { + "epoch": 19.840182648401825, + "grad_norm": 0.7953476905822754, + "learning_rate": 9.738442115495016e-09, + "loss": 0.1265, + "num_input_tokens_seen": 216758640, + "step": 178145 + }, + { + "epoch": 19.840739503285445, + "grad_norm": 0.11820036172866821, + "learning_rate": 9.670748096038407e-09, + "loss": 0.0195, + "num_input_tokens_seen": 216764208, + "step": 178150 + }, + { + "epoch": 19.84129635816906, + "grad_norm": 0.00030434824293479323, + "learning_rate": 9.603290130261178e-09, + "loss": 0.1045, + "num_input_tokens_seen": 216770352, + "step": 178155 + }, + { + "epoch": 19.84185321305268, + "grad_norm": 0.7801375985145569, + "learning_rate": 9.536068218804484e-09, + "loss": 0.0608, + "num_input_tokens_seen": 216776720, + "step": 178160 + }, + { + "epoch": 19.842410067936296, + "grad_norm": 0.18316172063350677, + "learning_rate": 9.469082362301151e-09, + "loss": 0.006, + "num_input_tokens_seen": 216782896, + "step": 178165 + }, + { + "epoch": 19.84296692281991, + "grad_norm": 0.00346741103567183, + "learning_rate": 9.402332561386784e-09, + "loss": 0.0054, + "num_input_tokens_seen": 216789072, + "step": 178170 + }, + { + "epoch": 19.84352377770353, + "grad_norm": 2.0204129219055176, + "learning_rate": 9.335818816685883e-09, + "loss": 0.1766, + "num_input_tokens_seen": 216795184, + "step": 178175 + }, + { + "epoch": 19.844080632587147, + "grad_norm": 1.5552055835723877, + "learning_rate": 9.269541128831272e-09, + "loss": 0.0901, + "num_input_tokens_seen": 216801104, + "step": 178180 + }, + { + "epoch": 19.844637487470767, + "grad_norm": 0.36292916536331177, + "learning_rate": 9.20349949845023e-09, + "loss": 0.0597, + "num_input_tokens_seen": 216806896, + "step": 178185 + }, + { + "epoch": 19.845194342354382, + "grad_norm": 0.29740607738494873, + "learning_rate": 9.137693926161705e-09, + "loss": 0.0406, + "num_input_tokens_seen": 216812816, + "step": 178190 + }, + { + "epoch": 19.845751197238, + "grad_norm": 0.014344875700771809, + "learning_rate": 9.072124412592975e-09, + "loss": 0.0344, + "num_input_tokens_seen": 216818736, + "step": 178195 + }, + { + "epoch": 19.846308052121618, + "grad_norm": 0.05381295830011368, + "learning_rate": 9.006790958357435e-09, + "loss": 0.0204, + "num_input_tokens_seen": 216824976, + "step": 178200 + }, + { + "epoch": 19.846864907005234, + "grad_norm": 0.4268200695514679, + "learning_rate": 8.941693564076815e-09, + "loss": 0.0261, + "num_input_tokens_seen": 216830992, + "step": 178205 + }, + { + "epoch": 19.847421761888853, + "grad_norm": 0.06381329894065857, + "learning_rate": 8.876832230364507e-09, + "loss": 0.0107, + "num_input_tokens_seen": 216836976, + "step": 178210 + }, + { + "epoch": 19.84797861677247, + "grad_norm": 0.4664113223552704, + "learning_rate": 8.812206957831138e-09, + "loss": 0.0159, + "num_input_tokens_seen": 216843344, + "step": 178215 + }, + { + "epoch": 19.848535471656085, + "grad_norm": 0.00045573097304441035, + "learning_rate": 8.747817747090103e-09, + "loss": 0.0307, + "num_input_tokens_seen": 216849488, + "step": 178220 + }, + { + "epoch": 19.849092326539704, + "grad_norm": 1.7882522344589233, + "learning_rate": 8.683664598749252e-09, + "loss": 0.0734, + "num_input_tokens_seen": 216855440, + "step": 178225 + }, + { + "epoch": 19.84964918142332, + "grad_norm": 1.6412221193313599, + "learning_rate": 8.619747513413656e-09, + "loss": 0.0241, + "num_input_tokens_seen": 216861296, + "step": 178230 + }, + { + "epoch": 19.85020603630694, + "grad_norm": 0.004479329567402601, + "learning_rate": 8.556066491688385e-09, + "loss": 0.0081, + "num_input_tokens_seen": 216867280, + "step": 178235 + }, + { + "epoch": 19.850762891190556, + "grad_norm": 1.0003987550735474, + "learning_rate": 8.492621534172962e-09, + "loss": 0.0169, + "num_input_tokens_seen": 216873520, + "step": 178240 + }, + { + "epoch": 19.85131974607417, + "grad_norm": 1.5380569696426392, + "learning_rate": 8.429412641466905e-09, + "loss": 0.0899, + "num_input_tokens_seen": 216879088, + "step": 178245 + }, + { + "epoch": 19.85187660095779, + "grad_norm": 0.05221856012940407, + "learning_rate": 8.366439814169736e-09, + "loss": 0.0181, + "num_input_tokens_seen": 216885328, + "step": 178250 + }, + { + "epoch": 19.852433455841407, + "grad_norm": 1.4417109489440918, + "learning_rate": 8.303703052872646e-09, + "loss": 0.0629, + "num_input_tokens_seen": 216891728, + "step": 178255 + }, + { + "epoch": 19.852990310725026, + "grad_norm": 1.7034095525741577, + "learning_rate": 8.241202358169608e-09, + "loss": 0.0538, + "num_input_tokens_seen": 216897904, + "step": 178260 + }, + { + "epoch": 19.853547165608642, + "grad_norm": 1.9097766876220703, + "learning_rate": 8.178937730651815e-09, + "loss": 0.0822, + "num_input_tokens_seen": 216903920, + "step": 178265 + }, + { + "epoch": 19.854104020492258, + "grad_norm": 0.00019464862998574972, + "learning_rate": 8.116909170910458e-09, + "loss": 0.1044, + "num_input_tokens_seen": 216909712, + "step": 178270 + }, + { + "epoch": 19.854660875375878, + "grad_norm": 1.941117525100708, + "learning_rate": 8.055116679522857e-09, + "loss": 0.0596, + "num_input_tokens_seen": 216915440, + "step": 178275 + }, + { + "epoch": 19.855217730259493, + "grad_norm": 0.8462857007980347, + "learning_rate": 7.993560257082977e-09, + "loss": 0.0686, + "num_input_tokens_seen": 216921328, + "step": 178280 + }, + { + "epoch": 19.855774585143113, + "grad_norm": 0.10776568949222565, + "learning_rate": 7.932239904162586e-09, + "loss": 0.0235, + "num_input_tokens_seen": 216927184, + "step": 178285 + }, + { + "epoch": 19.85633144002673, + "grad_norm": 0.24654649198055267, + "learning_rate": 7.871155621347326e-09, + "loss": 0.0741, + "num_input_tokens_seen": 216933168, + "step": 178290 + }, + { + "epoch": 19.856888294910345, + "grad_norm": 0.5749701857566833, + "learning_rate": 7.810307409214513e-09, + "loss": 0.0829, + "num_input_tokens_seen": 216939472, + "step": 178295 + }, + { + "epoch": 19.857445149793964, + "grad_norm": 0.11094324290752411, + "learning_rate": 7.749695268333134e-09, + "loss": 0.0331, + "num_input_tokens_seen": 216945232, + "step": 178300 + }, + { + "epoch": 19.85800200467758, + "grad_norm": 0.0015844093868508935, + "learning_rate": 7.68931919928051e-09, + "loss": 0.0058, + "num_input_tokens_seen": 216951280, + "step": 178305 + }, + { + "epoch": 19.8585588595612, + "grad_norm": 2.9124186038970947, + "learning_rate": 7.6291792026284e-09, + "loss": 0.0766, + "num_input_tokens_seen": 216957584, + "step": 178310 + }, + { + "epoch": 19.859115714444815, + "grad_norm": 0.18522578477859497, + "learning_rate": 7.569275278940246e-09, + "loss": 0.0128, + "num_input_tokens_seen": 216963696, + "step": 178315 + }, + { + "epoch": 19.85967256932843, + "grad_norm": 0.217122882604599, + "learning_rate": 7.509607428782262e-09, + "loss": 0.0072, + "num_input_tokens_seen": 216969808, + "step": 178320 + }, + { + "epoch": 19.86022942421205, + "grad_norm": 1.7076252698898315, + "learning_rate": 7.450175652720659e-09, + "loss": 0.0979, + "num_input_tokens_seen": 216975952, + "step": 178325 + }, + { + "epoch": 19.860786279095667, + "grad_norm": 0.8679513335227966, + "learning_rate": 7.3909799513161015e-09, + "loss": 0.1014, + "num_input_tokens_seen": 216981904, + "step": 178330 + }, + { + "epoch": 19.861343133979286, + "grad_norm": 0.02448047138750553, + "learning_rate": 7.332020325129252e-09, + "loss": 0.1621, + "num_input_tokens_seen": 216988080, + "step": 178335 + }, + { + "epoch": 19.861899988862902, + "grad_norm": 0.0424184687435627, + "learning_rate": 7.2732967747124455e-09, + "loss": 0.0018, + "num_input_tokens_seen": 216994128, + "step": 178340 + }, + { + "epoch": 19.86245684374652, + "grad_norm": 0.007607418578118086, + "learning_rate": 7.214809300626346e-09, + "loss": 0.0858, + "num_input_tokens_seen": 217000048, + "step": 178345 + }, + { + "epoch": 19.863013698630137, + "grad_norm": 0.00012017277913400903, + "learning_rate": 7.156557903417738e-09, + "loss": 0.0991, + "num_input_tokens_seen": 217006032, + "step": 178350 + }, + { + "epoch": 19.863570553513753, + "grad_norm": 0.05718433856964111, + "learning_rate": 7.098542583638957e-09, + "loss": 0.006, + "num_input_tokens_seen": 217012304, + "step": 178355 + }, + { + "epoch": 19.864127408397373, + "grad_norm": 0.39701834321022034, + "learning_rate": 7.040763341839563e-09, + "loss": 0.0467, + "num_input_tokens_seen": 217017808, + "step": 178360 + }, + { + "epoch": 19.86468426328099, + "grad_norm": 1.6691676378250122, + "learning_rate": 6.983220178566341e-09, + "loss": 0.0343, + "num_input_tokens_seen": 217024240, + "step": 178365 + }, + { + "epoch": 19.865241118164604, + "grad_norm": 1.6804155111312866, + "learning_rate": 6.92591309435775e-09, + "loss": 0.1798, + "num_input_tokens_seen": 217030000, + "step": 178370 + }, + { + "epoch": 19.865797973048224, + "grad_norm": 0.1191939264535904, + "learning_rate": 6.868842089757799e-09, + "loss": 0.1926, + "num_input_tokens_seen": 217035376, + "step": 178375 + }, + { + "epoch": 19.86635482793184, + "grad_norm": 0.2525589168071747, + "learning_rate": 6.812007165307721e-09, + "loss": 0.0033, + "num_input_tokens_seen": 217041776, + "step": 178380 + }, + { + "epoch": 19.86691168281546, + "grad_norm": 2.588818073272705, + "learning_rate": 6.755408321540424e-09, + "loss": 0.0458, + "num_input_tokens_seen": 217047856, + "step": 178385 + }, + { + "epoch": 19.867468537699075, + "grad_norm": 0.14355216920375824, + "learning_rate": 6.6990455589943655e-09, + "loss": 0.0029, + "num_input_tokens_seen": 217053968, + "step": 178390 + }, + { + "epoch": 19.868025392582695, + "grad_norm": 0.054485470056533813, + "learning_rate": 6.642918878199677e-09, + "loss": 0.0145, + "num_input_tokens_seen": 217060208, + "step": 178395 + }, + { + "epoch": 19.86858224746631, + "grad_norm": 4.510334014892578, + "learning_rate": 6.587028279686491e-09, + "loss": 0.0518, + "num_input_tokens_seen": 217066480, + "step": 178400 + }, + { + "epoch": 19.869139102349926, + "grad_norm": 1.3038181066513062, + "learning_rate": 6.531373763982162e-09, + "loss": 0.083, + "num_input_tokens_seen": 217072528, + "step": 178405 + }, + { + "epoch": 19.869695957233546, + "grad_norm": 0.510107159614563, + "learning_rate": 6.4759553316168235e-09, + "loss": 0.0123, + "num_input_tokens_seen": 217078672, + "step": 178410 + }, + { + "epoch": 19.87025281211716, + "grad_norm": 0.19884254038333893, + "learning_rate": 6.4207729831067266e-09, + "loss": 0.02, + "num_input_tokens_seen": 217084816, + "step": 178415 + }, + { + "epoch": 19.87080966700078, + "grad_norm": 0.4839189648628235, + "learning_rate": 6.365826718979229e-09, + "loss": 0.0839, + "num_input_tokens_seen": 217091152, + "step": 178420 + }, + { + "epoch": 19.871366521884397, + "grad_norm": 1.176249623298645, + "learning_rate": 6.311116539750583e-09, + "loss": 0.0738, + "num_input_tokens_seen": 217097264, + "step": 178425 + }, + { + "epoch": 19.871923376768013, + "grad_norm": 1.163881778717041, + "learning_rate": 6.256642445937044e-09, + "loss": 0.0325, + "num_input_tokens_seen": 217103536, + "step": 178430 + }, + { + "epoch": 19.872480231651632, + "grad_norm": 0.2951185703277588, + "learning_rate": 6.202404438054865e-09, + "loss": 0.0033, + "num_input_tokens_seen": 217110032, + "step": 178435 + }, + { + "epoch": 19.87303708653525, + "grad_norm": 1.7557350397109985, + "learning_rate": 6.148402516617524e-09, + "loss": 0.0468, + "num_input_tokens_seen": 217116624, + "step": 178440 + }, + { + "epoch": 19.873593941418868, + "grad_norm": 0.6912878155708313, + "learning_rate": 6.0946366821301725e-09, + "loss": 0.0744, + "num_input_tokens_seen": 217122864, + "step": 178445 + }, + { + "epoch": 19.874150796302484, + "grad_norm": 0.0006833836087025702, + "learning_rate": 6.041106935106289e-09, + "loss": 0.0375, + "num_input_tokens_seen": 217129136, + "step": 178450 + }, + { + "epoch": 19.8747076511861, + "grad_norm": 0.0013110453728586435, + "learning_rate": 5.987813276048249e-09, + "loss": 0.0036, + "num_input_tokens_seen": 217135088, + "step": 178455 + }, + { + "epoch": 19.87526450606972, + "grad_norm": 0.1785406917333603, + "learning_rate": 5.934755705458428e-09, + "loss": 0.0639, + "num_input_tokens_seen": 217141072, + "step": 178460 + }, + { + "epoch": 19.875821360953335, + "grad_norm": 1.541457176208496, + "learning_rate": 5.881934223841978e-09, + "loss": 0.1186, + "num_input_tokens_seen": 217147152, + "step": 178465 + }, + { + "epoch": 19.876378215836954, + "grad_norm": 0.0002524421433918178, + "learning_rate": 5.829348831695725e-09, + "loss": 0.0203, + "num_input_tokens_seen": 217153232, + "step": 178470 + }, + { + "epoch": 19.87693507072057, + "grad_norm": 0.006541735026985407, + "learning_rate": 5.776999529513716e-09, + "loss": 0.0033, + "num_input_tokens_seen": 217158832, + "step": 178475 + }, + { + "epoch": 19.877491925604186, + "grad_norm": 0.0003918818256352097, + "learning_rate": 5.724886317795553e-09, + "loss": 0.0793, + "num_input_tokens_seen": 217164976, + "step": 178480 + }, + { + "epoch": 19.878048780487806, + "grad_norm": 0.17050278186798096, + "learning_rate": 5.673009197029733e-09, + "loss": 0.0033, + "num_input_tokens_seen": 217171120, + "step": 178485 + }, + { + "epoch": 19.87860563537142, + "grad_norm": 0.000367176893632859, + "learning_rate": 5.62136816770753e-09, + "loss": 0.009, + "num_input_tokens_seen": 217177328, + "step": 178490 + }, + { + "epoch": 19.87916249025504, + "grad_norm": 0.4259343147277832, + "learning_rate": 5.5699632303174436e-09, + "loss": 0.0047, + "num_input_tokens_seen": 217183728, + "step": 178495 + }, + { + "epoch": 19.879719345138657, + "grad_norm": 1.7179641723632812, + "learning_rate": 5.5187943853424186e-09, + "loss": 0.05, + "num_input_tokens_seen": 217189648, + "step": 178500 + }, + { + "epoch": 19.880276200022273, + "grad_norm": 0.02243558131158352, + "learning_rate": 5.467861633268179e-09, + "loss": 0.0054, + "num_input_tokens_seen": 217195888, + "step": 178505 + }, + { + "epoch": 19.880833054905892, + "grad_norm": 0.004374175798147917, + "learning_rate": 5.417164974577671e-09, + "loss": 0.0099, + "num_input_tokens_seen": 217202064, + "step": 178510 + }, + { + "epoch": 19.881389909789508, + "grad_norm": 1.1027233600616455, + "learning_rate": 5.3667044097455155e-09, + "loss": 0.0159, + "num_input_tokens_seen": 217207728, + "step": 178515 + }, + { + "epoch": 19.881946764673128, + "grad_norm": 0.10248663276433945, + "learning_rate": 5.316479939249108e-09, + "loss": 0.0846, + "num_input_tokens_seen": 217213648, + "step": 178520 + }, + { + "epoch": 19.882503619556744, + "grad_norm": 0.00014048133743926883, + "learning_rate": 5.266491563565845e-09, + "loss": 0.0849, + "num_input_tokens_seen": 217219952, + "step": 178525 + }, + { + "epoch": 19.88306047444036, + "grad_norm": 0.005381626542657614, + "learning_rate": 5.216739283164795e-09, + "loss": 0.0232, + "num_input_tokens_seen": 217226448, + "step": 178530 + }, + { + "epoch": 19.88361732932398, + "grad_norm": 0.6605765223503113, + "learning_rate": 5.167223098517804e-09, + "loss": 0.015, + "num_input_tokens_seen": 217232848, + "step": 178535 + }, + { + "epoch": 19.884174184207595, + "grad_norm": 0.024604296311736107, + "learning_rate": 5.117943010091164e-09, + "loss": 0.0043, + "num_input_tokens_seen": 217239248, + "step": 178540 + }, + { + "epoch": 19.884731039091214, + "grad_norm": 0.11083868145942688, + "learning_rate": 5.06889901835117e-09, + "loss": 0.0344, + "num_input_tokens_seen": 217245392, + "step": 178545 + }, + { + "epoch": 19.88528789397483, + "grad_norm": 0.9063509702682495, + "learning_rate": 5.0200911237641145e-09, + "loss": 0.0621, + "num_input_tokens_seen": 217251600, + "step": 178550 + }, + { + "epoch": 19.885844748858446, + "grad_norm": 1.1770957708358765, + "learning_rate": 4.97151932678519e-09, + "loss": 0.0616, + "num_input_tokens_seen": 217257488, + "step": 178555 + }, + { + "epoch": 19.886401603742065, + "grad_norm": 1.7172703742980957, + "learning_rate": 4.923183627875139e-09, + "loss": 0.0933, + "num_input_tokens_seen": 217263856, + "step": 178560 + }, + { + "epoch": 19.88695845862568, + "grad_norm": 0.004901634529232979, + "learning_rate": 4.875084027491928e-09, + "loss": 0.0188, + "num_input_tokens_seen": 217269808, + "step": 178565 + }, + { + "epoch": 19.8875153135093, + "grad_norm": 0.3774230182170868, + "learning_rate": 4.827220526090748e-09, + "loss": 0.0387, + "num_input_tokens_seen": 217275760, + "step": 178570 + }, + { + "epoch": 19.888072168392917, + "grad_norm": 0.016439933329820633, + "learning_rate": 4.77959312412124e-09, + "loss": 0.0495, + "num_input_tokens_seen": 217282064, + "step": 178575 + }, + { + "epoch": 19.888629023276533, + "grad_norm": 0.001380539033561945, + "learning_rate": 4.732201822033045e-09, + "loss": 0.0057, + "num_input_tokens_seen": 217288080, + "step": 178580 + }, + { + "epoch": 19.889185878160152, + "grad_norm": 0.15558642148971558, + "learning_rate": 4.685046620278577e-09, + "loss": 0.0491, + "num_input_tokens_seen": 217293776, + "step": 178585 + }, + { + "epoch": 19.889742733043768, + "grad_norm": 0.04086868464946747, + "learning_rate": 4.638127519296376e-09, + "loss": 0.0346, + "num_input_tokens_seen": 217299920, + "step": 178590 + }, + { + "epoch": 19.890299587927387, + "grad_norm": 0.5451751351356506, + "learning_rate": 4.591444519533306e-09, + "loss": 0.1127, + "num_input_tokens_seen": 217306384, + "step": 178595 + }, + { + "epoch": 19.890856442811003, + "grad_norm": 0.10862891376018524, + "learning_rate": 4.54499762143068e-09, + "loss": 0.0771, + "num_input_tokens_seen": 217312464, + "step": 178600 + }, + { + "epoch": 19.89141329769462, + "grad_norm": 0.08835744112730026, + "learning_rate": 4.498786825427037e-09, + "loss": 0.0585, + "num_input_tokens_seen": 217318416, + "step": 178605 + }, + { + "epoch": 19.89197015257824, + "grad_norm": 0.008895836770534515, + "learning_rate": 4.452812131958139e-09, + "loss": 0.012, + "num_input_tokens_seen": 217324752, + "step": 178610 + }, + { + "epoch": 19.892527007461855, + "grad_norm": 0.064173623919487, + "learning_rate": 4.4070735414597494e-09, + "loss": 0.0088, + "num_input_tokens_seen": 217331088, + "step": 178615 + }, + { + "epoch": 19.893083862345474, + "grad_norm": 0.00034982600482180715, + "learning_rate": 4.361571054362079e-09, + "loss": 0.0626, + "num_input_tokens_seen": 217337296, + "step": 178620 + }, + { + "epoch": 19.89364071722909, + "grad_norm": 1.6647768020629883, + "learning_rate": 4.316304671092564e-09, + "loss": 0.0526, + "num_input_tokens_seen": 217343184, + "step": 178625 + }, + { + "epoch": 19.894197572112706, + "grad_norm": 1.457300066947937, + "learning_rate": 4.2712743920841905e-09, + "loss": 0.0366, + "num_input_tokens_seen": 217349520, + "step": 178630 + }, + { + "epoch": 19.894754426996325, + "grad_norm": 0.24822396039962769, + "learning_rate": 4.226480217761619e-09, + "loss": 0.0157, + "num_input_tokens_seen": 217355664, + "step": 178635 + }, + { + "epoch": 19.89531128187994, + "grad_norm": 2.1368801593780518, + "learning_rate": 4.181922148543959e-09, + "loss": 0.1212, + "num_input_tokens_seen": 217361712, + "step": 178640 + }, + { + "epoch": 19.89586813676356, + "grad_norm": 0.12072654813528061, + "learning_rate": 4.137600184855872e-09, + "loss": 0.029, + "num_input_tokens_seen": 217367600, + "step": 178645 + }, + { + "epoch": 19.896424991647176, + "grad_norm": 0.47699108719825745, + "learning_rate": 4.09351432711369e-09, + "loss": 0.0132, + "num_input_tokens_seen": 217373744, + "step": 178650 + }, + { + "epoch": 19.896981846530792, + "grad_norm": 0.3636360764503479, + "learning_rate": 4.049664575733747e-09, + "loss": 0.0065, + "num_input_tokens_seen": 217379984, + "step": 178655 + }, + { + "epoch": 19.897538701414412, + "grad_norm": 0.6411072611808777, + "learning_rate": 4.006050931132377e-09, + "loss": 0.014, + "num_input_tokens_seen": 217386096, + "step": 178660 + }, + { + "epoch": 19.898095556298028, + "grad_norm": 1.4375042915344238, + "learning_rate": 3.962673393717586e-09, + "loss": 0.1222, + "num_input_tokens_seen": 217392208, + "step": 178665 + }, + { + "epoch": 19.898652411181647, + "grad_norm": 1.5399855375289917, + "learning_rate": 3.9195319639057095e-09, + "loss": 0.0875, + "num_input_tokens_seen": 217397904, + "step": 178670 + }, + { + "epoch": 19.899209266065263, + "grad_norm": 1.5729212760925293, + "learning_rate": 3.876626642099202e-09, + "loss": 0.0988, + "num_input_tokens_seen": 217404080, + "step": 178675 + }, + { + "epoch": 19.899766120948883, + "grad_norm": 0.00798195879906416, + "learning_rate": 3.833957428703294e-09, + "loss": 0.055, + "num_input_tokens_seen": 217409968, + "step": 178680 + }, + { + "epoch": 19.9003229758325, + "grad_norm": 0.003563063917681575, + "learning_rate": 3.791524324123219e-09, + "loss": 0.0072, + "num_input_tokens_seen": 217416752, + "step": 178685 + }, + { + "epoch": 19.900879830716114, + "grad_norm": 0.2837289273738861, + "learning_rate": 3.7493273287586565e-09, + "loss": 0.0198, + "num_input_tokens_seen": 217422928, + "step": 178690 + }, + { + "epoch": 19.901436685599734, + "grad_norm": 0.19070576131343842, + "learning_rate": 3.7073664430065104e-09, + "loss": 0.013, + "num_input_tokens_seen": 217429296, + "step": 178695 + }, + { + "epoch": 19.90199354048335, + "grad_norm": 0.034485187381505966, + "learning_rate": 3.6656416672664617e-09, + "loss": 0.0341, + "num_input_tokens_seen": 217435632, + "step": 178700 + }, + { + "epoch": 19.902550395366966, + "grad_norm": 0.4141295254230499, + "learning_rate": 3.6241530019326395e-09, + "loss": 0.0099, + "num_input_tokens_seen": 217441264, + "step": 178705 + }, + { + "epoch": 19.903107250250585, + "grad_norm": 0.0017938005039468408, + "learning_rate": 3.5829004473936224e-09, + "loss": 0.0398, + "num_input_tokens_seen": 217447440, + "step": 178710 + }, + { + "epoch": 19.9036641051342, + "grad_norm": 1.076826810836792, + "learning_rate": 3.5418840040435386e-09, + "loss": 0.0851, + "num_input_tokens_seen": 217453552, + "step": 178715 + }, + { + "epoch": 19.90422096001782, + "grad_norm": 0.008522551506757736, + "learning_rate": 3.50110367226264e-09, + "loss": 0.0051, + "num_input_tokens_seen": 217459536, + "step": 178720 + }, + { + "epoch": 19.904777814901436, + "grad_norm": 0.01438820455223322, + "learning_rate": 3.460559452445056e-09, + "loss": 0.0244, + "num_input_tokens_seen": 217465712, + "step": 178725 + }, + { + "epoch": 19.905334669785056, + "grad_norm": 0.12045129388570786, + "learning_rate": 3.4202513449682616e-09, + "loss": 0.0563, + "num_input_tokens_seen": 217471600, + "step": 178730 + }, + { + "epoch": 19.90589152466867, + "grad_norm": 2.515244483947754, + "learning_rate": 3.380179350212509e-09, + "loss": 0.1133, + "num_input_tokens_seen": 217477424, + "step": 178735 + }, + { + "epoch": 19.906448379552288, + "grad_norm": 1.0570627450942993, + "learning_rate": 3.3403434685580493e-09, + "loss": 0.0237, + "num_input_tokens_seen": 217483312, + "step": 178740 + }, + { + "epoch": 19.907005234435907, + "grad_norm": 0.028785929083824158, + "learning_rate": 3.3007437003823583e-09, + "loss": 0.0483, + "num_input_tokens_seen": 217489488, + "step": 178745 + }, + { + "epoch": 19.907562089319523, + "grad_norm": 0.0005376693443395197, + "learning_rate": 3.261380046057361e-09, + "loss": 0.0098, + "num_input_tokens_seen": 217495536, + "step": 178750 + }, + { + "epoch": 19.908118944203142, + "grad_norm": 1.4661710262298584, + "learning_rate": 3.2222525059549813e-09, + "loss": 0.0524, + "num_input_tokens_seen": 217501872, + "step": 178755 + }, + { + "epoch": 19.908675799086758, + "grad_norm": 0.02823571488261223, + "learning_rate": 3.183361080447145e-09, + "loss": 0.0096, + "num_input_tokens_seen": 217508400, + "step": 178760 + }, + { + "epoch": 19.909232653970374, + "grad_norm": 0.072870172560215, + "learning_rate": 3.1447057699002246e-09, + "loss": 0.0116, + "num_input_tokens_seen": 217514512, + "step": 178765 + }, + { + "epoch": 19.909789508853994, + "grad_norm": 0.4035452902317047, + "learning_rate": 3.1062865746750435e-09, + "loss": 0.0515, + "num_input_tokens_seen": 217520624, + "step": 178770 + }, + { + "epoch": 19.91034636373761, + "grad_norm": 0.044361695647239685, + "learning_rate": 3.0681034951407507e-09, + "loss": 0.0163, + "num_input_tokens_seen": 217526768, + "step": 178775 + }, + { + "epoch": 19.91090321862123, + "grad_norm": 0.5748197436332703, + "learning_rate": 3.0301565316553926e-09, + "loss": 0.0693, + "num_input_tokens_seen": 217533232, + "step": 178780 + }, + { + "epoch": 19.911460073504845, + "grad_norm": 0.32271745800971985, + "learning_rate": 2.9924456845770167e-09, + "loss": 0.1423, + "num_input_tokens_seen": 217539056, + "step": 178785 + }, + { + "epoch": 19.91201692838846, + "grad_norm": 0.01236222218722105, + "learning_rate": 2.9549709542636695e-09, + "loss": 0.01, + "num_input_tokens_seen": 217545200, + "step": 178790 + }, + { + "epoch": 19.91257378327208, + "grad_norm": 0.002386291278526187, + "learning_rate": 2.917732341067847e-09, + "loss": 0.0301, + "num_input_tokens_seen": 217551024, + "step": 178795 + }, + { + "epoch": 19.913130638155696, + "grad_norm": 2.713209390640259, + "learning_rate": 2.8807298453392696e-09, + "loss": 0.0923, + "num_input_tokens_seen": 217557168, + "step": 178800 + }, + { + "epoch": 19.913687493039316, + "grad_norm": 0.16052064299583435, + "learning_rate": 2.8439634674304326e-09, + "loss": 0.0022, + "num_input_tokens_seen": 217563088, + "step": 178805 + }, + { + "epoch": 19.91424434792293, + "grad_norm": 0.5559470653533936, + "learning_rate": 2.8074332076882814e-09, + "loss": 0.0438, + "num_input_tokens_seen": 217569200, + "step": 178810 + }, + { + "epoch": 19.914801202806547, + "grad_norm": 1.2961390018463135, + "learning_rate": 2.7711390664569846e-09, + "loss": 0.039, + "num_input_tokens_seen": 217575216, + "step": 178815 + }, + { + "epoch": 19.915358057690167, + "grad_norm": 0.00021217115863692015, + "learning_rate": 2.735081044077936e-09, + "loss": 0.0483, + "num_input_tokens_seen": 217581424, + "step": 178820 + }, + { + "epoch": 19.915914912573783, + "grad_norm": 0.3186814785003662, + "learning_rate": 2.699259140895305e-09, + "loss": 0.0056, + "num_input_tokens_seen": 217587312, + "step": 178825 + }, + { + "epoch": 19.916471767457402, + "grad_norm": 0.08320736140012741, + "learning_rate": 2.663673357247709e-09, + "loss": 0.0127, + "num_input_tokens_seen": 217593360, + "step": 178830 + }, + { + "epoch": 19.917028622341018, + "grad_norm": 8.475183858536184e-05, + "learning_rate": 2.6283236934654397e-09, + "loss": 0.0796, + "num_input_tokens_seen": 217599408, + "step": 178835 + }, + { + "epoch": 19.917585477224634, + "grad_norm": 0.3724362850189209, + "learning_rate": 2.593210149887115e-09, + "loss": 0.0289, + "num_input_tokens_seen": 217605488, + "step": 178840 + }, + { + "epoch": 19.918142332108253, + "grad_norm": 1.8927446603775024, + "learning_rate": 2.5583327268458025e-09, + "loss": 0.0878, + "num_input_tokens_seen": 217611568, + "step": 178845 + }, + { + "epoch": 19.91869918699187, + "grad_norm": 0.003784729167819023, + "learning_rate": 2.5236914246662413e-09, + "loss": 0.0171, + "num_input_tokens_seen": 217617296, + "step": 178850 + }, + { + "epoch": 19.91925604187549, + "grad_norm": 0.27372440695762634, + "learning_rate": 2.4892862436787232e-09, + "loss": 0.0141, + "num_input_tokens_seen": 217623472, + "step": 178855 + }, + { + "epoch": 19.919812896759105, + "grad_norm": 0.013763008639216423, + "learning_rate": 2.455117184207989e-09, + "loss": 0.0546, + "num_input_tokens_seen": 217629968, + "step": 178860 + }, + { + "epoch": 19.92036975164272, + "grad_norm": 0.19122228026390076, + "learning_rate": 2.4211842465760027e-09, + "loss": 0.0061, + "num_input_tokens_seen": 217636176, + "step": 178865 + }, + { + "epoch": 19.92092660652634, + "grad_norm": 0.024618875235319138, + "learning_rate": 2.387487431104729e-09, + "loss": 0.0065, + "num_input_tokens_seen": 217641776, + "step": 178870 + }, + { + "epoch": 19.921483461409956, + "grad_norm": 0.002964814193546772, + "learning_rate": 2.3540267381105817e-09, + "loss": 0.0111, + "num_input_tokens_seen": 217648048, + "step": 178875 + }, + { + "epoch": 19.922040316293575, + "grad_norm": 3.3387084007263184, + "learning_rate": 2.3208021679099744e-09, + "loss": 0.1961, + "num_input_tokens_seen": 217653936, + "step": 178880 + }, + { + "epoch": 19.92259717117719, + "grad_norm": 0.01360471360385418, + "learning_rate": 2.2878137208193205e-09, + "loss": 0.0149, + "num_input_tokens_seen": 217659920, + "step": 178885 + }, + { + "epoch": 19.923154026060807, + "grad_norm": 1.1428934335708618, + "learning_rate": 2.2550613971439318e-09, + "loss": 0.0115, + "num_input_tokens_seen": 217666000, + "step": 178890 + }, + { + "epoch": 19.923710880944427, + "grad_norm": 0.0016291304491460323, + "learning_rate": 2.2225451972002208e-09, + "loss": 0.0683, + "num_input_tokens_seen": 217672400, + "step": 178895 + }, + { + "epoch": 19.924267735828042, + "grad_norm": 0.024572599679231644, + "learning_rate": 2.1902651212935e-09, + "loss": 0.0823, + "num_input_tokens_seen": 217678672, + "step": 178900 + }, + { + "epoch": 19.924824590711662, + "grad_norm": 0.6327639818191528, + "learning_rate": 2.158221169726304e-09, + "loss": 0.1039, + "num_input_tokens_seen": 217684688, + "step": 178905 + }, + { + "epoch": 19.925381445595278, + "grad_norm": 2.1981873512268066, + "learning_rate": 2.12641334280117e-09, + "loss": 0.2617, + "num_input_tokens_seen": 217690576, + "step": 178910 + }, + { + "epoch": 19.925938300478894, + "grad_norm": 0.011471699923276901, + "learning_rate": 2.0948416408206327e-09, + "loss": 0.0121, + "num_input_tokens_seen": 217696784, + "step": 178915 + }, + { + "epoch": 19.926495155362513, + "grad_norm": 0.055025871843099594, + "learning_rate": 2.0635060640844527e-09, + "loss": 0.0283, + "num_input_tokens_seen": 217702768, + "step": 178920 + }, + { + "epoch": 19.92705201024613, + "grad_norm": 0.10884993523359299, + "learning_rate": 2.0324066128840637e-09, + "loss": 0.0095, + "num_input_tokens_seen": 217708912, + "step": 178925 + }, + { + "epoch": 19.92760886512975, + "grad_norm": 0.6410996913909912, + "learning_rate": 2.00154328751645e-09, + "loss": 0.048, + "num_input_tokens_seen": 217714384, + "step": 178930 + }, + { + "epoch": 19.928165720013364, + "grad_norm": 0.0008072317577898502, + "learning_rate": 1.9709160882730448e-09, + "loss": 0.0008, + "num_input_tokens_seen": 217720784, + "step": 178935 + }, + { + "epoch": 19.92872257489698, + "grad_norm": 0.25879889726638794, + "learning_rate": 1.940525015442507e-09, + "loss": 0.0269, + "num_input_tokens_seen": 217726704, + "step": 178940 + }, + { + "epoch": 19.9292794297806, + "grad_norm": 0.04907174035906792, + "learning_rate": 1.9103700693107187e-09, + "loss": 0.0182, + "num_input_tokens_seen": 217732976, + "step": 178945 + }, + { + "epoch": 19.929836284664216, + "grad_norm": 0.02885921113193035, + "learning_rate": 1.8804512501635618e-09, + "loss": 0.0046, + "num_input_tokens_seen": 217739280, + "step": 178950 + }, + { + "epoch": 19.930393139547835, + "grad_norm": 0.0031263900455087423, + "learning_rate": 1.850768558284144e-09, + "loss": 0.0034, + "num_input_tokens_seen": 217744816, + "step": 178955 + }, + { + "epoch": 19.93094999443145, + "grad_norm": 0.035870958119630814, + "learning_rate": 1.821321993952796e-09, + "loss": 0.0085, + "num_input_tokens_seen": 217750768, + "step": 178960 + }, + { + "epoch": 19.931506849315067, + "grad_norm": 0.0016126298578456044, + "learning_rate": 1.7921115574470738e-09, + "loss": 0.0159, + "num_input_tokens_seen": 217756784, + "step": 178965 + }, + { + "epoch": 19.932063704198686, + "grad_norm": 0.2025289088487625, + "learning_rate": 1.7631372490445331e-09, + "loss": 0.0108, + "num_input_tokens_seen": 217762704, + "step": 178970 + }, + { + "epoch": 19.932620559082302, + "grad_norm": 0.003931635990738869, + "learning_rate": 1.734399069014403e-09, + "loss": 0.1109, + "num_input_tokens_seen": 217768720, + "step": 178975 + }, + { + "epoch": 19.93317741396592, + "grad_norm": 0.8159840703010559, + "learning_rate": 1.7058970176314637e-09, + "loss": 0.0151, + "num_input_tokens_seen": 217774928, + "step": 178980 + }, + { + "epoch": 19.933734268849538, + "grad_norm": 0.33742910623550415, + "learning_rate": 1.67763109516772e-09, + "loss": 0.1059, + "num_input_tokens_seen": 217780784, + "step": 178985 + }, + { + "epoch": 19.934291123733153, + "grad_norm": 0.00011893608461832628, + "learning_rate": 1.649601301884074e-09, + "loss": 0.007, + "num_input_tokens_seen": 217786768, + "step": 178990 + }, + { + "epoch": 19.934847978616773, + "grad_norm": 0.6624374389648438, + "learning_rate": 1.6218076380497549e-09, + "loss": 0.0184, + "num_input_tokens_seen": 217792656, + "step": 178995 + }, + { + "epoch": 19.93540483350039, + "grad_norm": 3.670651435852051, + "learning_rate": 1.5942501039256652e-09, + "loss": 0.0565, + "num_input_tokens_seen": 217798640, + "step": 179000 + }, + { + "epoch": 19.93596168838401, + "grad_norm": 0.00014691728574689478, + "learning_rate": 1.5669286997727072e-09, + "loss": 0.0001, + "num_input_tokens_seen": 217804752, + "step": 179005 + }, + { + "epoch": 19.936518543267624, + "grad_norm": 0.05267663300037384, + "learning_rate": 1.5398434258462324e-09, + "loss": 0.0776, + "num_input_tokens_seen": 217810736, + "step": 179010 + }, + { + "epoch": 19.93707539815124, + "grad_norm": 0.12271305173635483, + "learning_rate": 1.512994282407143e-09, + "loss": 0.0281, + "num_input_tokens_seen": 217816528, + "step": 179015 + }, + { + "epoch": 19.93763225303486, + "grad_norm": 0.0038527254946529865, + "learning_rate": 1.4863812697052392e-09, + "loss": 0.0029, + "num_input_tokens_seen": 217822640, + "step": 179020 + }, + { + "epoch": 19.938189107918475, + "grad_norm": 0.09951590746641159, + "learning_rate": 1.460004387993097e-09, + "loss": 0.0018, + "num_input_tokens_seen": 217828880, + "step": 179025 + }, + { + "epoch": 19.938745962802095, + "grad_norm": 0.011383906938135624, + "learning_rate": 1.4338636375177405e-09, + "loss": 0.0163, + "num_input_tokens_seen": 217835376, + "step": 179030 + }, + { + "epoch": 19.93930281768571, + "grad_norm": 0.15595243871212006, + "learning_rate": 1.4079590185289705e-09, + "loss": 0.0875, + "num_input_tokens_seen": 217841552, + "step": 179035 + }, + { + "epoch": 19.939859672569327, + "grad_norm": 0.7349945306777954, + "learning_rate": 1.382290531273811e-09, + "loss": 0.0225, + "num_input_tokens_seen": 217847408, + "step": 179040 + }, + { + "epoch": 19.940416527452946, + "grad_norm": 0.3917217552661896, + "learning_rate": 1.356858175988185e-09, + "loss": 0.0371, + "num_input_tokens_seen": 217853360, + "step": 179045 + }, + { + "epoch": 19.940973382336562, + "grad_norm": 0.00279355701059103, + "learning_rate": 1.331661952916341e-09, + "loss": 0.0279, + "num_input_tokens_seen": 217859472, + "step": 179050 + }, + { + "epoch": 19.94153023722018, + "grad_norm": 0.5003112554550171, + "learning_rate": 1.3067018622942018e-09, + "loss": 0.0272, + "num_input_tokens_seen": 217865744, + "step": 179055 + }, + { + "epoch": 19.942087092103797, + "grad_norm": 0.02805885300040245, + "learning_rate": 1.2819779043604651e-09, + "loss": 0.0165, + "num_input_tokens_seen": 217871856, + "step": 179060 + }, + { + "epoch": 19.942643946987417, + "grad_norm": 0.03873773291707039, + "learning_rate": 1.257490079348278e-09, + "loss": 0.0019, + "num_input_tokens_seen": 217877840, + "step": 179065 + }, + { + "epoch": 19.943200801871033, + "grad_norm": 0.0644797831773758, + "learning_rate": 1.233238387485236e-09, + "loss": 0.0113, + "num_input_tokens_seen": 217883728, + "step": 179070 + }, + { + "epoch": 19.94375765675465, + "grad_norm": 0.1753464788198471, + "learning_rate": 1.209222829004486e-09, + "loss": 0.0266, + "num_input_tokens_seen": 217889808, + "step": 179075 + }, + { + "epoch": 19.944314511638268, + "grad_norm": 0.7089464664459229, + "learning_rate": 1.1854434041308482e-09, + "loss": 0.0357, + "num_input_tokens_seen": 217895696, + "step": 179080 + }, + { + "epoch": 19.944871366521884, + "grad_norm": 0.10391726344823837, + "learning_rate": 1.1619001130891428e-09, + "loss": 0.0683, + "num_input_tokens_seen": 217901328, + "step": 179085 + }, + { + "epoch": 19.945428221405503, + "grad_norm": 7.484584057237953e-05, + "learning_rate": 1.1385929561041897e-09, + "loss": 0.0152, + "num_input_tokens_seen": 217907344, + "step": 179090 + }, + { + "epoch": 19.94598507628912, + "grad_norm": 0.22796107828617096, + "learning_rate": 1.1155219333897072e-09, + "loss": 0.0769, + "num_input_tokens_seen": 217913456, + "step": 179095 + }, + { + "epoch": 19.946541931172735, + "grad_norm": 0.02005443349480629, + "learning_rate": 1.092687045170515e-09, + "loss": 0.0127, + "num_input_tokens_seen": 217919472, + "step": 179100 + }, + { + "epoch": 19.947098786056355, + "grad_norm": 0.4459858536720276, + "learning_rate": 1.0700882916603316e-09, + "loss": 0.0238, + "num_input_tokens_seen": 217925168, + "step": 179105 + }, + { + "epoch": 19.94765564093997, + "grad_norm": 0.02102719433605671, + "learning_rate": 1.047725673070099e-09, + "loss": 0.0094, + "num_input_tokens_seen": 217931120, + "step": 179110 + }, + { + "epoch": 19.94821249582359, + "grad_norm": 1.6385247707366943, + "learning_rate": 1.0255991896163108e-09, + "loss": 0.0499, + "num_input_tokens_seen": 217937008, + "step": 179115 + }, + { + "epoch": 19.948769350707206, + "grad_norm": 0.0037032263353466988, + "learning_rate": 1.0037088415015827e-09, + "loss": 0.0514, + "num_input_tokens_seen": 217943440, + "step": 179120 + }, + { + "epoch": 19.949326205590822, + "grad_norm": 0.09230536967515945, + "learning_rate": 9.820546289368571e-10, + "loss": 0.0208, + "num_input_tokens_seen": 217949904, + "step": 179125 + }, + { + "epoch": 19.94988306047444, + "grad_norm": 0.0012525159399956465, + "learning_rate": 9.606365521247495e-10, + "loss": 0.013, + "num_input_tokens_seen": 217956176, + "step": 179130 + }, + { + "epoch": 19.950439915358057, + "grad_norm": 0.23959216475486755, + "learning_rate": 9.39454611267876e-10, + "loss": 0.0146, + "num_input_tokens_seen": 217961936, + "step": 179135 + }, + { + "epoch": 19.950996770241677, + "grad_norm": 0.0564199835062027, + "learning_rate": 9.185088065688519e-10, + "loss": 0.0223, + "num_input_tokens_seen": 217968144, + "step": 179140 + }, + { + "epoch": 19.951553625125293, + "grad_norm": 0.06601887941360474, + "learning_rate": 8.977991382219664e-10, + "loss": 0.011, + "num_input_tokens_seen": 217974288, + "step": 179145 + }, + { + "epoch": 19.95211048000891, + "grad_norm": 0.007648613769561052, + "learning_rate": 8.773256064242841e-10, + "loss": 0.0878, + "num_input_tokens_seen": 217979728, + "step": 179150 + }, + { + "epoch": 19.952667334892528, + "grad_norm": 0.0001025174860842526, + "learning_rate": 8.570882113673185e-10, + "loss": 0.0074, + "num_input_tokens_seen": 217986096, + "step": 179155 + }, + { + "epoch": 19.953224189776144, + "grad_norm": 0.007713718339800835, + "learning_rate": 8.370869532481341e-10, + "loss": 0.0108, + "num_input_tokens_seen": 217992080, + "step": 179160 + }, + { + "epoch": 19.953781044659763, + "grad_norm": 0.0006006716284900904, + "learning_rate": 8.173218322499176e-10, + "loss": 0.0331, + "num_input_tokens_seen": 217998384, + "step": 179165 + }, + { + "epoch": 19.95433789954338, + "grad_norm": 0.43695124983787537, + "learning_rate": 7.977928485586317e-10, + "loss": 0.0252, + "num_input_tokens_seen": 218004528, + "step": 179170 + }, + { + "epoch": 19.954894754426995, + "grad_norm": 1.0151838064193726, + "learning_rate": 7.785000023657895e-10, + "loss": 0.01, + "num_input_tokens_seen": 218010800, + "step": 179175 + }, + { + "epoch": 19.955451609310614, + "grad_norm": 0.5804868340492249, + "learning_rate": 7.594432938462515e-10, + "loss": 0.1013, + "num_input_tokens_seen": 218016784, + "step": 179180 + }, + { + "epoch": 19.95600846419423, + "grad_norm": 0.9406203627586365, + "learning_rate": 7.406227231832041e-10, + "loss": 0.0223, + "num_input_tokens_seen": 218022736, + "step": 179185 + }, + { + "epoch": 19.95656531907785, + "grad_norm": 0.011920885182917118, + "learning_rate": 7.220382905542833e-10, + "loss": 0.0229, + "num_input_tokens_seen": 218028688, + "step": 179190 + }, + { + "epoch": 19.957122173961466, + "grad_norm": 0.12130916118621826, + "learning_rate": 7.03689996134349e-10, + "loss": 0.031, + "num_input_tokens_seen": 218034640, + "step": 179195 + }, + { + "epoch": 19.95767902884508, + "grad_norm": 0.13509707152843475, + "learning_rate": 6.855778400982615e-10, + "loss": 0.0104, + "num_input_tokens_seen": 218040752, + "step": 179200 + }, + { + "epoch": 19.9582358837287, + "grad_norm": 2.517005681991577, + "learning_rate": 6.677018226125542e-10, + "loss": 0.2333, + "num_input_tokens_seen": 218046736, + "step": 179205 + }, + { + "epoch": 19.958792738612317, + "grad_norm": 0.9968018531799316, + "learning_rate": 6.500619438548627e-10, + "loss": 0.0218, + "num_input_tokens_seen": 218052688, + "step": 179210 + }, + { + "epoch": 19.959349593495936, + "grad_norm": 0.0035284440964460373, + "learning_rate": 6.326582039833939e-10, + "loss": 0.0045, + "num_input_tokens_seen": 218058640, + "step": 179215 + }, + { + "epoch": 19.959906448379552, + "grad_norm": 0.00018149860261473805, + "learning_rate": 6.154906031646812e-10, + "loss": 0.0721, + "num_input_tokens_seen": 218064528, + "step": 179220 + }, + { + "epoch": 19.960463303263168, + "grad_norm": 1.4854300022125244, + "learning_rate": 5.985591415624825e-10, + "loss": 0.0466, + "num_input_tokens_seen": 218070928, + "step": 179225 + }, + { + "epoch": 19.961020158146788, + "grad_norm": 0.02628578245639801, + "learning_rate": 5.818638193377801e-10, + "loss": 0.0153, + "num_input_tokens_seen": 218076944, + "step": 179230 + }, + { + "epoch": 19.961577013030404, + "grad_norm": 0.04504351690411568, + "learning_rate": 5.654046366460053e-10, + "loss": 0.045, + "num_input_tokens_seen": 218082864, + "step": 179235 + }, + { + "epoch": 19.962133867914023, + "grad_norm": 0.2085273563861847, + "learning_rate": 5.491815936425892e-10, + "loss": 0.0302, + "num_input_tokens_seen": 218088784, + "step": 179240 + }, + { + "epoch": 19.96269072279764, + "grad_norm": 1.591854214668274, + "learning_rate": 5.331946904829633e-10, + "loss": 0.1043, + "num_input_tokens_seen": 218095248, + "step": 179245 + }, + { + "epoch": 19.963247577681255, + "grad_norm": 0.012273375876247883, + "learning_rate": 5.174439273142318e-10, + "loss": 0.0036, + "num_input_tokens_seen": 218101648, + "step": 179250 + }, + { + "epoch": 19.963804432564874, + "grad_norm": 0.7266348004341125, + "learning_rate": 5.019293042890505e-10, + "loss": 0.0149, + "num_input_tokens_seen": 218107824, + "step": 179255 + }, + { + "epoch": 19.96436128744849, + "grad_norm": 0.005817605182528496, + "learning_rate": 4.866508215517484e-10, + "loss": 0.0539, + "num_input_tokens_seen": 218114256, + "step": 179260 + }, + { + "epoch": 19.96491814233211, + "grad_norm": 0.3005451261997223, + "learning_rate": 4.716084792466546e-10, + "loss": 0.064, + "num_input_tokens_seen": 218120080, + "step": 179265 + }, + { + "epoch": 19.965474997215725, + "grad_norm": 0.4729657471179962, + "learning_rate": 4.56802277518098e-10, + "loss": 0.0066, + "num_input_tokens_seen": 218126128, + "step": 179270 + }, + { + "epoch": 19.96603185209934, + "grad_norm": 0.014383314177393913, + "learning_rate": 4.422322165020809e-10, + "loss": 0.0025, + "num_input_tokens_seen": 218132080, + "step": 179275 + }, + { + "epoch": 19.96658870698296, + "grad_norm": 0.7603710889816284, + "learning_rate": 4.2789829634015677e-10, + "loss": 0.0301, + "num_input_tokens_seen": 218137488, + "step": 179280 + }, + { + "epoch": 19.967145561866577, + "grad_norm": 3.7244932651519775, + "learning_rate": 4.1380051716555236e-10, + "loss": 0.0862, + "num_input_tokens_seen": 218143408, + "step": 179285 + }, + { + "epoch": 19.967702416750196, + "grad_norm": 0.003820643061771989, + "learning_rate": 3.9993887911149443e-10, + "loss": 0.1053, + "num_input_tokens_seen": 218149744, + "step": 179290 + }, + { + "epoch": 19.968259271633812, + "grad_norm": 0.02987886406481266, + "learning_rate": 3.8631338231120973e-10, + "loss": 0.1138, + "num_input_tokens_seen": 218156176, + "step": 179295 + }, + { + "epoch": 19.968816126517428, + "grad_norm": 0.2499290406703949, + "learning_rate": 3.729240268895984e-10, + "loss": 0.0093, + "num_input_tokens_seen": 218161904, + "step": 179300 + }, + { + "epoch": 19.969372981401047, + "grad_norm": 0.002668224973604083, + "learning_rate": 3.5977081297711156e-10, + "loss": 0.0028, + "num_input_tokens_seen": 218168048, + "step": 179305 + }, + { + "epoch": 19.969929836284663, + "grad_norm": 0.20038670301437378, + "learning_rate": 3.4685374069309826e-10, + "loss": 0.1258, + "num_input_tokens_seen": 218174000, + "step": 179310 + }, + { + "epoch": 19.970486691168283, + "grad_norm": 0.5115211606025696, + "learning_rate": 3.341728101652342e-10, + "loss": 0.0224, + "num_input_tokens_seen": 218180144, + "step": 179315 + }, + { + "epoch": 19.9710435460519, + "grad_norm": 0.3842436373233795, + "learning_rate": 3.217280215100926e-10, + "loss": 0.0127, + "num_input_tokens_seen": 218186320, + "step": 179320 + }, + { + "epoch": 19.971600400935515, + "grad_norm": 0.002553005935624242, + "learning_rate": 3.095193748442471e-10, + "loss": 0.0145, + "num_input_tokens_seen": 218191920, + "step": 179325 + }, + { + "epoch": 19.972157255819134, + "grad_norm": 0.1464196890592575, + "learning_rate": 2.97546870284271e-10, + "loss": 0.0057, + "num_input_tokens_seen": 218198480, + "step": 179330 + }, + { + "epoch": 19.97271411070275, + "grad_norm": 0.00023176423565018922, + "learning_rate": 2.8581050794396216e-10, + "loss": 0.0187, + "num_input_tokens_seen": 218204880, + "step": 179335 + }, + { + "epoch": 19.97327096558637, + "grad_norm": 0.5076847076416016, + "learning_rate": 2.7431028793434287e-10, + "loss": 0.0672, + "num_input_tokens_seen": 218211184, + "step": 179340 + }, + { + "epoch": 19.973827820469985, + "grad_norm": 0.007158211898058653, + "learning_rate": 2.6304621036365994e-10, + "loss": 0.0069, + "num_input_tokens_seen": 218216976, + "step": 179345 + }, + { + "epoch": 19.9743846753536, + "grad_norm": 2.4181406497955322, + "learning_rate": 2.5201827533460897e-10, + "loss": 0.1951, + "num_input_tokens_seen": 218223184, + "step": 179350 + }, + { + "epoch": 19.97494153023722, + "grad_norm": 0.009538110345602036, + "learning_rate": 2.4122648295821225e-10, + "loss": 0.0046, + "num_input_tokens_seen": 218229328, + "step": 179355 + }, + { + "epoch": 19.975498385120837, + "grad_norm": 0.04393555596470833, + "learning_rate": 2.3067083333161433e-10, + "loss": 0.0077, + "num_input_tokens_seen": 218235344, + "step": 179360 + }, + { + "epoch": 19.976055240004456, + "grad_norm": 0.8274234533309937, + "learning_rate": 2.2035132655751079e-10, + "loss": 0.0363, + "num_input_tokens_seen": 218241424, + "step": 179365 + }, + { + "epoch": 19.976612094888072, + "grad_norm": 0.000975161325186491, + "learning_rate": 2.102679627302706e-10, + "loss": 0.0015, + "num_input_tokens_seen": 218247792, + "step": 179370 + }, + { + "epoch": 19.977168949771688, + "grad_norm": 1.0158473253250122, + "learning_rate": 2.0042074194426275e-10, + "loss": 0.0192, + "num_input_tokens_seen": 218254064, + "step": 179375 + }, + { + "epoch": 19.977725804655307, + "grad_norm": 0.01481655240058899, + "learning_rate": 1.9080966429940727e-10, + "loss": 0.0059, + "num_input_tokens_seen": 218260560, + "step": 179380 + }, + { + "epoch": 19.978282659538923, + "grad_norm": 1.61356782913208, + "learning_rate": 1.8143472987897093e-10, + "loss": 0.1687, + "num_input_tokens_seen": 218266864, + "step": 179385 + }, + { + "epoch": 19.978839514422543, + "grad_norm": 0.0036764973774552345, + "learning_rate": 1.722959387745471e-10, + "loss": 0.0015, + "num_input_tokens_seen": 218273104, + "step": 179390 + }, + { + "epoch": 19.97939636930616, + "grad_norm": 0.00027715039323084056, + "learning_rate": 1.6339329107217803e-10, + "loss": 0.0526, + "num_input_tokens_seen": 218279248, + "step": 179395 + }, + { + "epoch": 19.979953224189778, + "grad_norm": 9.842878353083506e-05, + "learning_rate": 1.547267868579061e-10, + "loss": 0.0118, + "num_input_tokens_seen": 218285392, + "step": 179400 + }, + { + "epoch": 19.980510079073394, + "grad_norm": 0.5177279114723206, + "learning_rate": 1.4629642620944683e-10, + "loss": 0.0143, + "num_input_tokens_seen": 218291664, + "step": 179405 + }, + { + "epoch": 19.98106693395701, + "grad_norm": 0.04375282675027847, + "learning_rate": 1.38102209210067e-10, + "loss": 0.0387, + "num_input_tokens_seen": 218297616, + "step": 179410 + }, + { + "epoch": 19.98162378884063, + "grad_norm": 0.5268539190292358, + "learning_rate": 1.3014413593470664e-10, + "loss": 0.0191, + "num_input_tokens_seen": 218303792, + "step": 179415 + }, + { + "epoch": 19.982180643724245, + "grad_norm": 0.022219981998205185, + "learning_rate": 1.2242220646108138e-10, + "loss": 0.004, + "num_input_tokens_seen": 218309712, + "step": 179420 + }, + { + "epoch": 19.98273749860786, + "grad_norm": 0.11435728520154953, + "learning_rate": 1.149364208613557e-10, + "loss": 0.0148, + "num_input_tokens_seen": 218315600, + "step": 179425 + }, + { + "epoch": 19.98329435349148, + "grad_norm": 0.2219865769147873, + "learning_rate": 1.0768677920214299e-10, + "loss": 0.091, + "num_input_tokens_seen": 218322064, + "step": 179430 + }, + { + "epoch": 19.983851208375096, + "grad_norm": 0.005378384608775377, + "learning_rate": 1.006732815583833e-10, + "loss": 0.0362, + "num_input_tokens_seen": 218328272, + "step": 179435 + }, + { + "epoch": 19.984408063258716, + "grad_norm": 1.6727334260940552, + "learning_rate": 9.389592799391444e-11, + "loss": 0.0429, + "num_input_tokens_seen": 218334320, + "step": 179440 + }, + { + "epoch": 19.98496491814233, + "grad_norm": 0.7033886909484863, + "learning_rate": 8.735471856979871e-11, + "loss": 0.0426, + "num_input_tokens_seen": 218340368, + "step": 179445 + }, + { + "epoch": 19.98552177302595, + "grad_norm": 0.021992135792970657, + "learning_rate": 8.104965335264946e-11, + "loss": 0.0132, + "num_input_tokens_seen": 218346416, + "step": 179450 + }, + { + "epoch": 19.986078627909567, + "grad_norm": 1.5006135702133179, + "learning_rate": 7.498073239797787e-11, + "loss": 0.1091, + "num_input_tokens_seen": 218352784, + "step": 179455 + }, + { + "epoch": 19.986635482793183, + "grad_norm": 0.002663541352376342, + "learning_rate": 6.914795576407063e-11, + "loss": 0.079, + "num_input_tokens_seen": 218358960, + "step": 179460 + }, + { + "epoch": 19.987192337676802, + "grad_norm": 0.007803441025316715, + "learning_rate": 6.355132350921445e-11, + "loss": 0.0015, + "num_input_tokens_seen": 218365488, + "step": 179465 + }, + { + "epoch": 19.98774919256042, + "grad_norm": 1.7205487489700317, + "learning_rate": 5.81908356805938e-11, + "loss": 0.0221, + "num_input_tokens_seen": 218371664, + "step": 179470 + }, + { + "epoch": 19.988306047444038, + "grad_norm": 0.958049476146698, + "learning_rate": 5.3066492333719855e-11, + "loss": 0.0565, + "num_input_tokens_seen": 218378160, + "step": 179475 + }, + { + "epoch": 19.988862902327654, + "grad_norm": 0.0009360251133330166, + "learning_rate": 4.817829351577707e-11, + "loss": 0.0066, + "num_input_tokens_seen": 218384592, + "step": 179480 + }, + { + "epoch": 19.98941975721127, + "grad_norm": 0.0006838940898887813, + "learning_rate": 4.3526239271174385e-11, + "loss": 0.1397, + "num_input_tokens_seen": 218390416, + "step": 179485 + }, + { + "epoch": 19.98997661209489, + "grad_norm": 0.051063135266304016, + "learning_rate": 3.911032964709627e-11, + "loss": 0.0264, + "num_input_tokens_seen": 218396432, + "step": 179490 + }, + { + "epoch": 19.990533466978505, + "grad_norm": 1.729561686515808, + "learning_rate": 3.4930564682400526e-11, + "loss": 0.0703, + "num_input_tokens_seen": 218402736, + "step": 179495 + }, + { + "epoch": 19.991090321862124, + "grad_norm": 0.46684345602989197, + "learning_rate": 3.098694441594496e-11, + "loss": 0.0154, + "num_input_tokens_seen": 218409200, + "step": 179500 + }, + { + "epoch": 19.99164717674574, + "grad_norm": 0.8294164538383484, + "learning_rate": 2.7279468886587388e-11, + "loss": 0.0072, + "num_input_tokens_seen": 218415120, + "step": 179505 + }, + { + "epoch": 19.992204031629356, + "grad_norm": 1.3308014869689941, + "learning_rate": 2.3808138130410052e-11, + "loss": 0.1276, + "num_input_tokens_seen": 218421552, + "step": 179510 + }, + { + "epoch": 19.992760886512976, + "grad_norm": 1.736026406288147, + "learning_rate": 2.0572952177944083e-11, + "loss": 0.0564, + "num_input_tokens_seen": 218426512, + "step": 179515 + }, + { + "epoch": 19.99331774139659, + "grad_norm": 0.040462296456098557, + "learning_rate": 1.7573911062496173e-11, + "loss": 0.0048, + "num_input_tokens_seen": 218432944, + "step": 179520 + }, + { + "epoch": 19.99387459628021, + "grad_norm": 0.0313887894153595, + "learning_rate": 1.481101480904634e-11, + "loss": 0.0252, + "num_input_tokens_seen": 218438928, + "step": 179525 + }, + { + "epoch": 19.994431451163827, + "grad_norm": 1.960821509361267, + "learning_rate": 1.2284263448125721e-11, + "loss": 0.0623, + "num_input_tokens_seen": 218445136, + "step": 179530 + }, + { + "epoch": 19.994988306047443, + "grad_norm": 0.07547677308320999, + "learning_rate": 9.993656996387657e-12, + "loss": 0.0017, + "num_input_tokens_seen": 218451248, + "step": 179535 + }, + { + "epoch": 19.995545160931062, + "grad_norm": 2.5324976444244385, + "learning_rate": 7.939195484363283e-12, + "loss": 0.0514, + "num_input_tokens_seen": 218457552, + "step": 179540 + }, + { + "epoch": 19.996102015814678, + "grad_norm": 0.5890628099441528, + "learning_rate": 6.120878923154827e-12, + "loss": 0.0054, + "num_input_tokens_seen": 218463696, + "step": 179545 + }, + { + "epoch": 19.996658870698298, + "grad_norm": 0.002541125752031803, + "learning_rate": 4.538707337742309e-12, + "loss": 0.0319, + "num_input_tokens_seen": 218470064, + "step": 179550 + }, + { + "epoch": 19.997215725581913, + "grad_norm": 1.7552800178527832, + "learning_rate": 3.1926807364524025e-12, + "loss": 0.0541, + "num_input_tokens_seen": 218476080, + "step": 179555 + }, + { + "epoch": 19.99777258046553, + "grad_norm": 0.17984606325626373, + "learning_rate": 2.0827991331628936e-12, + "loss": 0.0111, + "num_input_tokens_seen": 218482000, + "step": 179560 + }, + { + "epoch": 19.99832943534915, + "grad_norm": 0.02408224157989025, + "learning_rate": 1.2090625417515712e-12, + "loss": 0.0356, + "num_input_tokens_seen": 218488176, + "step": 179565 + }, + { + "epoch": 19.998886290232765, + "grad_norm": 0.2497536987066269, + "learning_rate": 5.714709705451071e-13, + "loss": 0.0086, + "num_input_tokens_seen": 218494480, + "step": 179570 + }, + { + "epoch": 19.999443145116384, + "grad_norm": 0.047501225024461746, + "learning_rate": 1.7002442231905947e-13, + "loss": 0.0248, + "num_input_tokens_seen": 218500752, + "step": 179575 + }, + { + "epoch": 20.0, + "grad_norm": 0.019318455830216408, + "learning_rate": 4.7228998489856624e-15, + "loss": 0.0165, + "num_input_tokens_seen": 218506144, + "step": 179580 + }, + { + "epoch": 20.0, + "eval_loss": 0.08236955851316452, + "eval_runtime": 111.6072, + "eval_samples_per_second": 35.759, + "eval_steps_per_second": 8.942, + "num_input_tokens_seen": 218506144, + "step": 179580 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 218506144, + "step": 179580, + "total_flos": 9.839490666870866e+18, + "train_loss": 0.07811526285345054, + "train_runtime": 67479.3134, + "train_samples_per_second": 10.644, + "train_steps_per_second": 2.661 + } + ], + "logging_steps": 5, + "max_steps": 179580, + "num_input_tokens_seen": 218506144, + "num_train_epochs": 20, + "save_steps": 8979, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.839490666870866e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}