{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.991614255765199, "eval_steps": 1000, "global_step": 38000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005241090146750524, "grad_norm": 1.6935213804244995, "learning_rate": 4.9988207547169815e-05, "loss": 0.3559, "step": 10 }, { "epoch": 0.0010482180293501049, "grad_norm": 2.7643861770629883, "learning_rate": 4.997510482180294e-05, "loss": 0.2081, "step": 20 }, { "epoch": 0.0015723270440251573, "grad_norm": 1.5143877267837524, "learning_rate": 4.996200209643606e-05, "loss": 0.219, "step": 30 }, { "epoch": 0.0020964360587002098, "grad_norm": 2.633585214614868, "learning_rate": 4.9948899371069186e-05, "loss": 0.2276, "step": 40 }, { "epoch": 0.002620545073375262, "grad_norm": 1.973532795906067, "learning_rate": 4.993579664570231e-05, "loss": 0.2041, "step": 50 }, { "epoch": 0.0031446540880503146, "grad_norm": 2.134101629257202, "learning_rate": 4.992269392033543e-05, "loss": 0.2058, "step": 60 }, { "epoch": 0.003668763102725367, "grad_norm": 1.7121070623397827, "learning_rate": 4.9909591194968556e-05, "loss": 0.2332, "step": 70 }, { "epoch": 0.0041928721174004195, "grad_norm": 1.5422728061676025, "learning_rate": 4.989648846960168e-05, "loss": 0.1932, "step": 80 }, { "epoch": 0.0047169811320754715, "grad_norm": 1.2396705150604248, "learning_rate": 4.98833857442348e-05, "loss": 0.1739, "step": 90 }, { "epoch": 0.005241090146750524, "grad_norm": 1.5578513145446777, "learning_rate": 4.9870283018867926e-05, "loss": 0.1871, "step": 100 }, { "epoch": 0.005765199161425576, "grad_norm": 1.3153727054595947, "learning_rate": 4.985718029350105e-05, "loss": 0.2039, "step": 110 }, { "epoch": 0.006289308176100629, "grad_norm": 1.5129348039627075, "learning_rate": 4.984407756813417e-05, "loss": 0.1398, "step": 120 }, { "epoch": 0.006813417190775681, "grad_norm": 1.8399486541748047, "learning_rate": 4.9830974842767296e-05, "loss": 0.181, "step": 130 }, { "epoch": 0.007337526205450734, "grad_norm": 1.6457992792129517, "learning_rate": 4.981787211740042e-05, "loss": 0.1897, "step": 140 }, { "epoch": 0.007861635220125786, "grad_norm": 1.87934148311615, "learning_rate": 4.980476939203354e-05, "loss": 0.1606, "step": 150 }, { "epoch": 0.008385744234800839, "grad_norm": 1.4629708528518677, "learning_rate": 4.979166666666667e-05, "loss": 0.1798, "step": 160 }, { "epoch": 0.00890985324947589, "grad_norm": 1.632991909980774, "learning_rate": 4.9778563941299796e-05, "loss": 0.1524, "step": 170 }, { "epoch": 0.009433962264150943, "grad_norm": 1.5290257930755615, "learning_rate": 4.976546121593292e-05, "loss": 0.151, "step": 180 }, { "epoch": 0.009958071278825996, "grad_norm": 1.6157230138778687, "learning_rate": 4.975235849056604e-05, "loss": 0.1807, "step": 190 }, { "epoch": 0.010482180293501049, "grad_norm": 1.7142693996429443, "learning_rate": 4.9739255765199167e-05, "loss": 0.1754, "step": 200 }, { "epoch": 0.0110062893081761, "grad_norm": 1.5730280876159668, "learning_rate": 4.972615303983228e-05, "loss": 0.172, "step": 210 }, { "epoch": 0.011530398322851153, "grad_norm": 1.9272352457046509, "learning_rate": 4.9713050314465407e-05, "loss": 0.1684, "step": 220 }, { "epoch": 0.012054507337526206, "grad_norm": 1.6200827360153198, "learning_rate": 4.969994758909853e-05, "loss": 0.2375, "step": 230 }, { "epoch": 0.012578616352201259, "grad_norm": 1.1623066663742065, "learning_rate": 4.968684486373166e-05, "loss": 0.144, "step": 240 }, { "epoch": 0.01310272536687631, "grad_norm": 1.6734042167663574, "learning_rate": 4.9673742138364784e-05, "loss": 0.1523, "step": 250 }, { "epoch": 0.013626834381551363, "grad_norm": 1.4891057014465332, "learning_rate": 4.966063941299791e-05, "loss": 0.1881, "step": 260 }, { "epoch": 0.014150943396226415, "grad_norm": 1.4071035385131836, "learning_rate": 4.964753668763103e-05, "loss": 0.146, "step": 270 }, { "epoch": 0.014675052410901468, "grad_norm": 3.614590644836426, "learning_rate": 4.9634433962264154e-05, "loss": 0.1603, "step": 280 }, { "epoch": 0.01519916142557652, "grad_norm": 1.7132965326309204, "learning_rate": 4.962133123689728e-05, "loss": 0.1538, "step": 290 }, { "epoch": 0.015723270440251572, "grad_norm": 1.7030532360076904, "learning_rate": 4.96082285115304e-05, "loss": 0.188, "step": 300 }, { "epoch": 0.016247379454926623, "grad_norm": 0.9326625466346741, "learning_rate": 4.9595125786163524e-05, "loss": 0.1669, "step": 310 }, { "epoch": 0.016771488469601678, "grad_norm": 1.9342471361160278, "learning_rate": 4.9582023060796654e-05, "loss": 0.1618, "step": 320 }, { "epoch": 0.01729559748427673, "grad_norm": 1.786618947982788, "learning_rate": 4.956892033542977e-05, "loss": 0.1645, "step": 330 }, { "epoch": 0.01781970649895178, "grad_norm": 2.348848819732666, "learning_rate": 4.9555817610062894e-05, "loss": 0.1511, "step": 340 }, { "epoch": 0.018343815513626835, "grad_norm": 1.2310869693756104, "learning_rate": 4.954271488469602e-05, "loss": 0.162, "step": 350 }, { "epoch": 0.018867924528301886, "grad_norm": 1.5017716884613037, "learning_rate": 4.952961215932914e-05, "loss": 0.1844, "step": 360 }, { "epoch": 0.01939203354297694, "grad_norm": 2.7584903240203857, "learning_rate": 4.9516509433962264e-05, "loss": 0.1623, "step": 370 }, { "epoch": 0.019916142557651992, "grad_norm": 1.2648184299468994, "learning_rate": 4.950340670859539e-05, "loss": 0.198, "step": 380 }, { "epoch": 0.020440251572327043, "grad_norm": 1.4309797286987305, "learning_rate": 4.949030398322851e-05, "loss": 0.1776, "step": 390 }, { "epoch": 0.020964360587002098, "grad_norm": 2.9164257049560547, "learning_rate": 4.947720125786164e-05, "loss": 0.19, "step": 400 }, { "epoch": 0.02148846960167715, "grad_norm": 1.5874290466308594, "learning_rate": 4.9464098532494764e-05, "loss": 0.1611, "step": 410 }, { "epoch": 0.0220125786163522, "grad_norm": 3.8210947513580322, "learning_rate": 4.945099580712789e-05, "loss": 0.1678, "step": 420 }, { "epoch": 0.022536687631027254, "grad_norm": 1.253749966621399, "learning_rate": 4.943789308176101e-05, "loss": 0.1717, "step": 430 }, { "epoch": 0.023060796645702306, "grad_norm": 2.2594664096832275, "learning_rate": 4.9424790356394135e-05, "loss": 0.1781, "step": 440 }, { "epoch": 0.02358490566037736, "grad_norm": 1.278881549835205, "learning_rate": 4.941168763102725e-05, "loss": 0.1582, "step": 450 }, { "epoch": 0.02410901467505241, "grad_norm": 1.303890585899353, "learning_rate": 4.9398584905660375e-05, "loss": 0.1413, "step": 460 }, { "epoch": 0.024633123689727462, "grad_norm": 1.44670832157135, "learning_rate": 4.9385482180293505e-05, "loss": 0.1731, "step": 470 }, { "epoch": 0.025157232704402517, "grad_norm": 1.9329112768173218, "learning_rate": 4.937237945492663e-05, "loss": 0.1769, "step": 480 }, { "epoch": 0.025681341719077568, "grad_norm": 1.873005747795105, "learning_rate": 4.935927672955975e-05, "loss": 0.1945, "step": 490 }, { "epoch": 0.02620545073375262, "grad_norm": 1.2180646657943726, "learning_rate": 4.9346174004192875e-05, "loss": 0.1456, "step": 500 }, { "epoch": 0.026729559748427674, "grad_norm": 1.3814204931259155, "learning_rate": 4.9333071278826e-05, "loss": 0.1732, "step": 510 }, { "epoch": 0.027253668763102725, "grad_norm": 1.440902590751648, "learning_rate": 4.931996855345912e-05, "loss": 0.2116, "step": 520 }, { "epoch": 0.027777777777777776, "grad_norm": 1.0152875185012817, "learning_rate": 4.9306865828092245e-05, "loss": 0.1555, "step": 530 }, { "epoch": 0.02830188679245283, "grad_norm": 1.1150940656661987, "learning_rate": 4.929376310272537e-05, "loss": 0.1619, "step": 540 }, { "epoch": 0.028825995807127882, "grad_norm": 1.3169711828231812, "learning_rate": 4.928066037735849e-05, "loss": 0.1839, "step": 550 }, { "epoch": 0.029350104821802937, "grad_norm": 1.2143508195877075, "learning_rate": 4.926755765199162e-05, "loss": 0.171, "step": 560 }, { "epoch": 0.029874213836477988, "grad_norm": 0.9038965702056885, "learning_rate": 4.925445492662474e-05, "loss": 0.1504, "step": 570 }, { "epoch": 0.03039832285115304, "grad_norm": 1.3861496448516846, "learning_rate": 4.924135220125786e-05, "loss": 0.1926, "step": 580 }, { "epoch": 0.030922431865828093, "grad_norm": 1.5401298999786377, "learning_rate": 4.9228249475890985e-05, "loss": 0.172, "step": 590 }, { "epoch": 0.031446540880503145, "grad_norm": 2.0027623176574707, "learning_rate": 4.921514675052411e-05, "loss": 0.1753, "step": 600 }, { "epoch": 0.0319706498951782, "grad_norm": 1.0361213684082031, "learning_rate": 4.920204402515723e-05, "loss": 0.1549, "step": 610 }, { "epoch": 0.03249475890985325, "grad_norm": 1.3739137649536133, "learning_rate": 4.9188941299790356e-05, "loss": 0.194, "step": 620 }, { "epoch": 0.0330188679245283, "grad_norm": 2.229529857635498, "learning_rate": 4.9175838574423486e-05, "loss": 0.1705, "step": 630 }, { "epoch": 0.033542976939203356, "grad_norm": 1.585580825805664, "learning_rate": 4.916273584905661e-05, "loss": 0.1404, "step": 640 }, { "epoch": 0.034067085953878404, "grad_norm": 1.1737724542617798, "learning_rate": 4.914963312368973e-05, "loss": 0.1814, "step": 650 }, { "epoch": 0.03459119496855346, "grad_norm": 1.4785288572311401, "learning_rate": 4.9136530398322856e-05, "loss": 0.1425, "step": 660 }, { "epoch": 0.03511530398322851, "grad_norm": 2.3371455669403076, "learning_rate": 4.912342767295598e-05, "loss": 0.1601, "step": 670 }, { "epoch": 0.03563941299790356, "grad_norm": 1.0585002899169922, "learning_rate": 4.91103249475891e-05, "loss": 0.1261, "step": 680 }, { "epoch": 0.036163522012578615, "grad_norm": 1.873073935508728, "learning_rate": 4.909722222222222e-05, "loss": 0.1793, "step": 690 }, { "epoch": 0.03668763102725367, "grad_norm": 1.6408770084381104, "learning_rate": 4.908411949685535e-05, "loss": 0.1635, "step": 700 }, { "epoch": 0.037211740041928724, "grad_norm": 2.1603291034698486, "learning_rate": 4.907101677148847e-05, "loss": 0.1348, "step": 710 }, { "epoch": 0.03773584905660377, "grad_norm": 3.0273585319519043, "learning_rate": 4.9057914046121596e-05, "loss": 0.1864, "step": 720 }, { "epoch": 0.03825995807127883, "grad_norm": 1.1323351860046387, "learning_rate": 4.904481132075472e-05, "loss": 0.1577, "step": 730 }, { "epoch": 0.03878406708595388, "grad_norm": 1.803604006767273, "learning_rate": 4.903170859538784e-05, "loss": 0.1612, "step": 740 }, { "epoch": 0.03930817610062893, "grad_norm": 2.119659900665283, "learning_rate": 4.9018605870020966e-05, "loss": 0.1484, "step": 750 }, { "epoch": 0.039832285115303984, "grad_norm": 2.6356945037841797, "learning_rate": 4.900550314465409e-05, "loss": 0.158, "step": 760 }, { "epoch": 0.04035639412997904, "grad_norm": 1.738153100013733, "learning_rate": 4.899240041928721e-05, "loss": 0.1704, "step": 770 }, { "epoch": 0.040880503144654086, "grad_norm": 2.1323139667510986, "learning_rate": 4.8979297693920336e-05, "loss": 0.173, "step": 780 }, { "epoch": 0.04140461215932914, "grad_norm": 2.8090837001800537, "learning_rate": 4.896619496855347e-05, "loss": 0.1349, "step": 790 }, { "epoch": 0.041928721174004195, "grad_norm": 1.0705924034118652, "learning_rate": 4.895309224318658e-05, "loss": 0.147, "step": 800 }, { "epoch": 0.04245283018867924, "grad_norm": 1.817993402481079, "learning_rate": 4.893998951781971e-05, "loss": 0.2037, "step": 810 }, { "epoch": 0.0429769392033543, "grad_norm": 1.405219316482544, "learning_rate": 4.892688679245283e-05, "loss": 0.1496, "step": 820 }, { "epoch": 0.04350104821802935, "grad_norm": 2.2118489742279053, "learning_rate": 4.8913784067085953e-05, "loss": 0.1513, "step": 830 }, { "epoch": 0.0440251572327044, "grad_norm": 1.1836580038070679, "learning_rate": 4.890068134171908e-05, "loss": 0.1458, "step": 840 }, { "epoch": 0.044549266247379454, "grad_norm": 1.7217124700546265, "learning_rate": 4.88875786163522e-05, "loss": 0.1554, "step": 850 }, { "epoch": 0.04507337526205451, "grad_norm": 1.640258550643921, "learning_rate": 4.887447589098533e-05, "loss": 0.12, "step": 860 }, { "epoch": 0.04559748427672956, "grad_norm": 1.1371960639953613, "learning_rate": 4.8861373165618454e-05, "loss": 0.1992, "step": 870 }, { "epoch": 0.04612159329140461, "grad_norm": 2.01666259765625, "learning_rate": 4.884827044025158e-05, "loss": 0.1374, "step": 880 }, { "epoch": 0.046645702306079666, "grad_norm": 2.8290281295776367, "learning_rate": 4.88351677148847e-05, "loss": 0.1368, "step": 890 }, { "epoch": 0.04716981132075472, "grad_norm": 1.649423599243164, "learning_rate": 4.8822064989517824e-05, "loss": 0.1858, "step": 900 }, { "epoch": 0.04769392033542977, "grad_norm": 1.3996362686157227, "learning_rate": 4.880896226415095e-05, "loss": 0.1751, "step": 910 }, { "epoch": 0.04821802935010482, "grad_norm": 1.836682915687561, "learning_rate": 4.8795859538784064e-05, "loss": 0.1383, "step": 920 }, { "epoch": 0.04874213836477988, "grad_norm": 2.5748958587646484, "learning_rate": 4.8782756813417194e-05, "loss": 0.1296, "step": 930 }, { "epoch": 0.049266247379454925, "grad_norm": 1.4079227447509766, "learning_rate": 4.876965408805032e-05, "loss": 0.1476, "step": 940 }, { "epoch": 0.04979035639412998, "grad_norm": 0.9867343902587891, "learning_rate": 4.875655136268344e-05, "loss": 0.1841, "step": 950 }, { "epoch": 0.050314465408805034, "grad_norm": 1.5223592519760132, "learning_rate": 4.8743448637316564e-05, "loss": 0.1666, "step": 960 }, { "epoch": 0.05083857442348008, "grad_norm": 1.3690940141677856, "learning_rate": 4.873034591194969e-05, "loss": 0.1901, "step": 970 }, { "epoch": 0.051362683438155136, "grad_norm": 1.6786198616027832, "learning_rate": 4.871724318658281e-05, "loss": 0.156, "step": 980 }, { "epoch": 0.05188679245283019, "grad_norm": 1.3634241819381714, "learning_rate": 4.8704140461215934e-05, "loss": 0.1808, "step": 990 }, { "epoch": 0.05241090146750524, "grad_norm": 1.5819250345230103, "learning_rate": 4.869103773584906e-05, "loss": 0.1346, "step": 1000 }, { "epoch": 0.05241090146750524, "eval_loss": 0.32801321148872375, "eval_runtime": 267.484, "eval_samples_per_second": 7.443, "eval_steps_per_second": 1.241, "step": 1000 }, { "epoch": 0.05293501048218029, "grad_norm": 1.4934093952178955, "learning_rate": 4.867793501048218e-05, "loss": 0.1558, "step": 1010 }, { "epoch": 0.05345911949685535, "grad_norm": 2.1169118881225586, "learning_rate": 4.866483228511531e-05, "loss": 0.1664, "step": 1020 }, { "epoch": 0.053983228511530396, "grad_norm": 1.9853368997573853, "learning_rate": 4.8651729559748435e-05, "loss": 0.1532, "step": 1030 }, { "epoch": 0.05450733752620545, "grad_norm": 2.548008680343628, "learning_rate": 4.863862683438155e-05, "loss": 0.1607, "step": 1040 }, { "epoch": 0.055031446540880505, "grad_norm": 1.3498486280441284, "learning_rate": 4.8625524109014675e-05, "loss": 0.1667, "step": 1050 }, { "epoch": 0.05555555555555555, "grad_norm": 3.581585168838501, "learning_rate": 4.86124213836478e-05, "loss": 0.2109, "step": 1060 }, { "epoch": 0.05607966457023061, "grad_norm": 1.8366062641143799, "learning_rate": 4.859931865828092e-05, "loss": 0.1457, "step": 1070 }, { "epoch": 0.05660377358490566, "grad_norm": 2.5987987518310547, "learning_rate": 4.8586215932914045e-05, "loss": 0.1583, "step": 1080 }, { "epoch": 0.05712788259958071, "grad_norm": 1.100653886795044, "learning_rate": 4.8573113207547175e-05, "loss": 0.1689, "step": 1090 }, { "epoch": 0.057651991614255764, "grad_norm": 0.548283040523529, "learning_rate": 4.85600104821803e-05, "loss": 0.1453, "step": 1100 }, { "epoch": 0.05817610062893082, "grad_norm": 1.1596165895462036, "learning_rate": 4.854690775681342e-05, "loss": 0.1452, "step": 1110 }, { "epoch": 0.05870020964360587, "grad_norm": 1.637635350227356, "learning_rate": 4.8533805031446545e-05, "loss": 0.1465, "step": 1120 }, { "epoch": 0.05922431865828092, "grad_norm": 0.9369329810142517, "learning_rate": 4.852070230607967e-05, "loss": 0.1459, "step": 1130 }, { "epoch": 0.059748427672955975, "grad_norm": 1.2311546802520752, "learning_rate": 4.850759958071279e-05, "loss": 0.1766, "step": 1140 }, { "epoch": 0.06027253668763103, "grad_norm": 2.20399808883667, "learning_rate": 4.8494496855345915e-05, "loss": 0.1597, "step": 1150 }, { "epoch": 0.06079664570230608, "grad_norm": 1.283770203590393, "learning_rate": 4.848139412997903e-05, "loss": 0.1612, "step": 1160 }, { "epoch": 0.06132075471698113, "grad_norm": 1.9280977249145508, "learning_rate": 4.846829140461216e-05, "loss": 0.139, "step": 1170 }, { "epoch": 0.06184486373165619, "grad_norm": 1.577441692352295, "learning_rate": 4.8455188679245285e-05, "loss": 0.1676, "step": 1180 }, { "epoch": 0.062368972746331235, "grad_norm": 1.5173856019973755, "learning_rate": 4.844208595387841e-05, "loss": 0.1791, "step": 1190 }, { "epoch": 0.06289308176100629, "grad_norm": 1.5214680433273315, "learning_rate": 4.842898322851153e-05, "loss": 0.1491, "step": 1200 }, { "epoch": 0.06341719077568134, "grad_norm": 1.152410864830017, "learning_rate": 4.8415880503144656e-05, "loss": 0.1435, "step": 1210 }, { "epoch": 0.0639412997903564, "grad_norm": 1.456242322921753, "learning_rate": 4.840277777777778e-05, "loss": 0.1374, "step": 1220 }, { "epoch": 0.06446540880503145, "grad_norm": 0.6337676644325256, "learning_rate": 4.83896750524109e-05, "loss": 0.1484, "step": 1230 }, { "epoch": 0.0649895178197065, "grad_norm": 1.5528064966201782, "learning_rate": 4.8376572327044026e-05, "loss": 0.1796, "step": 1240 }, { "epoch": 0.06551362683438156, "grad_norm": 1.5369819402694702, "learning_rate": 4.8363469601677156e-05, "loss": 0.151, "step": 1250 }, { "epoch": 0.0660377358490566, "grad_norm": 2.3839173316955566, "learning_rate": 4.835036687631028e-05, "loss": 0.1574, "step": 1260 }, { "epoch": 0.06656184486373165, "grad_norm": 1.5514627695083618, "learning_rate": 4.83372641509434e-05, "loss": 0.1378, "step": 1270 }, { "epoch": 0.06708595387840671, "grad_norm": 1.8946847915649414, "learning_rate": 4.832416142557652e-05, "loss": 0.1413, "step": 1280 }, { "epoch": 0.06761006289308176, "grad_norm": 2.3054966926574707, "learning_rate": 4.831105870020964e-05, "loss": 0.1899, "step": 1290 }, { "epoch": 0.06813417190775681, "grad_norm": 1.0668176412582397, "learning_rate": 4.8297955974842766e-05, "loss": 0.134, "step": 1300 }, { "epoch": 0.06865828092243187, "grad_norm": 2.0305306911468506, "learning_rate": 4.828485324947589e-05, "loss": 0.211, "step": 1310 }, { "epoch": 0.06918238993710692, "grad_norm": 1.6836894750595093, "learning_rate": 4.827175052410901e-05, "loss": 0.1703, "step": 1320 }, { "epoch": 0.06970649895178196, "grad_norm": 1.3076380491256714, "learning_rate": 4.825864779874214e-05, "loss": 0.1626, "step": 1330 }, { "epoch": 0.07023060796645703, "grad_norm": 1.258353352546692, "learning_rate": 4.8245545073375266e-05, "loss": 0.1685, "step": 1340 }, { "epoch": 0.07075471698113207, "grad_norm": 1.0019201040267944, "learning_rate": 4.823244234800839e-05, "loss": 0.1468, "step": 1350 }, { "epoch": 0.07127882599580712, "grad_norm": 2.594862222671509, "learning_rate": 4.821933962264151e-05, "loss": 0.1298, "step": 1360 }, { "epoch": 0.07180293501048218, "grad_norm": 0.9913552403450012, "learning_rate": 4.8206236897274637e-05, "loss": 0.1482, "step": 1370 }, { "epoch": 0.07232704402515723, "grad_norm": 1.8441811800003052, "learning_rate": 4.819313417190776e-05, "loss": 0.1885, "step": 1380 }, { "epoch": 0.07285115303983228, "grad_norm": 1.8914382457733154, "learning_rate": 4.818003144654088e-05, "loss": 0.1218, "step": 1390 }, { "epoch": 0.07337526205450734, "grad_norm": 1.9815479516983032, "learning_rate": 4.816692872117401e-05, "loss": 0.122, "step": 1400 }, { "epoch": 0.07389937106918239, "grad_norm": 1.6056983470916748, "learning_rate": 4.815382599580713e-05, "loss": 0.192, "step": 1410 }, { "epoch": 0.07442348008385745, "grad_norm": 2.7241697311401367, "learning_rate": 4.8140723270440253e-05, "loss": 0.1282, "step": 1420 }, { "epoch": 0.0749475890985325, "grad_norm": 2.503002405166626, "learning_rate": 4.812762054507338e-05, "loss": 0.1728, "step": 1430 }, { "epoch": 0.07547169811320754, "grad_norm": 1.2872258424758911, "learning_rate": 4.81145178197065e-05, "loss": 0.1376, "step": 1440 }, { "epoch": 0.0759958071278826, "grad_norm": 1.5114333629608154, "learning_rate": 4.8101415094339624e-05, "loss": 0.1729, "step": 1450 }, { "epoch": 0.07651991614255765, "grad_norm": 1.2313517332077026, "learning_rate": 4.808831236897275e-05, "loss": 0.1507, "step": 1460 }, { "epoch": 0.0770440251572327, "grad_norm": 1.3203717470169067, "learning_rate": 4.807520964360587e-05, "loss": 0.1681, "step": 1470 }, { "epoch": 0.07756813417190776, "grad_norm": 0.7284913063049316, "learning_rate": 4.8062106918238994e-05, "loss": 0.1493, "step": 1480 }, { "epoch": 0.07809224318658281, "grad_norm": 1.1101444959640503, "learning_rate": 4.8049004192872124e-05, "loss": 0.1573, "step": 1490 }, { "epoch": 0.07861635220125786, "grad_norm": 0.9905779957771301, "learning_rate": 4.803590146750525e-05, "loss": 0.1423, "step": 1500 }, { "epoch": 0.07914046121593292, "grad_norm": 2.1272692680358887, "learning_rate": 4.802279874213837e-05, "loss": 0.1638, "step": 1510 }, { "epoch": 0.07966457023060797, "grad_norm": 1.397460699081421, "learning_rate": 4.800969601677149e-05, "loss": 0.1299, "step": 1520 }, { "epoch": 0.08018867924528301, "grad_norm": 0.9754815101623535, "learning_rate": 4.799659329140461e-05, "loss": 0.1495, "step": 1530 }, { "epoch": 0.08071278825995808, "grad_norm": 2.096616268157959, "learning_rate": 4.7983490566037734e-05, "loss": 0.133, "step": 1540 }, { "epoch": 0.08123689727463312, "grad_norm": 1.4818449020385742, "learning_rate": 4.797038784067086e-05, "loss": 0.1158, "step": 1550 }, { "epoch": 0.08176100628930817, "grad_norm": 1.9018614292144775, "learning_rate": 4.795728511530399e-05, "loss": 0.1637, "step": 1560 }, { "epoch": 0.08228511530398323, "grad_norm": 1.2683446407318115, "learning_rate": 4.794418238993711e-05, "loss": 0.1492, "step": 1570 }, { "epoch": 0.08280922431865828, "grad_norm": 1.5261001586914062, "learning_rate": 4.7931079664570234e-05, "loss": 0.1528, "step": 1580 }, { "epoch": 0.08333333333333333, "grad_norm": 1.243240237236023, "learning_rate": 4.791797693920336e-05, "loss": 0.1795, "step": 1590 }, { "epoch": 0.08385744234800839, "grad_norm": 1.667392373085022, "learning_rate": 4.790487421383648e-05, "loss": 0.1549, "step": 1600 }, { "epoch": 0.08438155136268344, "grad_norm": 1.0932554006576538, "learning_rate": 4.7891771488469605e-05, "loss": 0.1494, "step": 1610 }, { "epoch": 0.08490566037735849, "grad_norm": 1.1366685628890991, "learning_rate": 4.787866876310273e-05, "loss": 0.1842, "step": 1620 }, { "epoch": 0.08542976939203355, "grad_norm": 1.2293801307678223, "learning_rate": 4.786556603773585e-05, "loss": 0.1567, "step": 1630 }, { "epoch": 0.0859538784067086, "grad_norm": 2.5766963958740234, "learning_rate": 4.7852463312368975e-05, "loss": 0.1495, "step": 1640 }, { "epoch": 0.08647798742138364, "grad_norm": 1.4142640829086304, "learning_rate": 4.78393605870021e-05, "loss": 0.1468, "step": 1650 }, { "epoch": 0.0870020964360587, "grad_norm": 1.6835039854049683, "learning_rate": 4.782625786163522e-05, "loss": 0.1554, "step": 1660 }, { "epoch": 0.08752620545073375, "grad_norm": 1.5390868186950684, "learning_rate": 4.7813155136268345e-05, "loss": 0.1953, "step": 1670 }, { "epoch": 0.0880503144654088, "grad_norm": 2.5762722492218018, "learning_rate": 4.780005241090147e-05, "loss": 0.1474, "step": 1680 }, { "epoch": 0.08857442348008386, "grad_norm": 1.0517301559448242, "learning_rate": 4.778694968553459e-05, "loss": 0.1648, "step": 1690 }, { "epoch": 0.08909853249475891, "grad_norm": 1.400057315826416, "learning_rate": 4.7773846960167715e-05, "loss": 0.1446, "step": 1700 }, { "epoch": 0.08962264150943396, "grad_norm": 0.9269096851348877, "learning_rate": 4.776074423480084e-05, "loss": 0.1373, "step": 1710 }, { "epoch": 0.09014675052410902, "grad_norm": 1.1477895975112915, "learning_rate": 4.774764150943397e-05, "loss": 0.1643, "step": 1720 }, { "epoch": 0.09067085953878407, "grad_norm": 1.0856636762619019, "learning_rate": 4.773453878406709e-05, "loss": 0.1464, "step": 1730 }, { "epoch": 0.09119496855345911, "grad_norm": 1.5389838218688965, "learning_rate": 4.7721436058700215e-05, "loss": 0.1532, "step": 1740 }, { "epoch": 0.09171907756813417, "grad_norm": 6.00367546081543, "learning_rate": 4.770833333333334e-05, "loss": 0.1527, "step": 1750 }, { "epoch": 0.09224318658280922, "grad_norm": 1.1369270086288452, "learning_rate": 4.7695230607966455e-05, "loss": 0.1323, "step": 1760 }, { "epoch": 0.09276729559748427, "grad_norm": 2.0414113998413086, "learning_rate": 4.768212788259958e-05, "loss": 0.1486, "step": 1770 }, { "epoch": 0.09329140461215933, "grad_norm": 2.0718820095062256, "learning_rate": 4.76690251572327e-05, "loss": 0.1626, "step": 1780 }, { "epoch": 0.09381551362683438, "grad_norm": 2.329056739807129, "learning_rate": 4.765592243186583e-05, "loss": 0.1432, "step": 1790 }, { "epoch": 0.09433962264150944, "grad_norm": 1.9668174982070923, "learning_rate": 4.7642819706498956e-05, "loss": 0.1597, "step": 1800 }, { "epoch": 0.09486373165618449, "grad_norm": 2.178865671157837, "learning_rate": 4.762971698113208e-05, "loss": 0.1787, "step": 1810 }, { "epoch": 0.09538784067085954, "grad_norm": 2.251453161239624, "learning_rate": 4.76166142557652e-05, "loss": 0.1425, "step": 1820 }, { "epoch": 0.0959119496855346, "grad_norm": 2.2333481311798096, "learning_rate": 4.7603511530398326e-05, "loss": 0.1417, "step": 1830 }, { "epoch": 0.09643605870020965, "grad_norm": 3.560199022293091, "learning_rate": 4.759040880503145e-05, "loss": 0.1252, "step": 1840 }, { "epoch": 0.09696016771488469, "grad_norm": 2.0870866775512695, "learning_rate": 4.757730607966457e-05, "loss": 0.1617, "step": 1850 }, { "epoch": 0.09748427672955975, "grad_norm": 1.6968202590942383, "learning_rate": 4.7564203354297696e-05, "loss": 0.1438, "step": 1860 }, { "epoch": 0.0980083857442348, "grad_norm": 1.149673581123352, "learning_rate": 4.755110062893082e-05, "loss": 0.1519, "step": 1870 }, { "epoch": 0.09853249475890985, "grad_norm": 1.1046313047409058, "learning_rate": 4.753799790356394e-05, "loss": 0.1318, "step": 1880 }, { "epoch": 0.09905660377358491, "grad_norm": 1.192717432975769, "learning_rate": 4.7524895178197066e-05, "loss": 0.1514, "step": 1890 }, { "epoch": 0.09958071278825996, "grad_norm": 1.7590820789337158, "learning_rate": 4.751179245283019e-05, "loss": 0.1653, "step": 1900 }, { "epoch": 0.100104821802935, "grad_norm": 1.865303635597229, "learning_rate": 4.749868972746331e-05, "loss": 0.1605, "step": 1910 }, { "epoch": 0.10062893081761007, "grad_norm": 1.695488452911377, "learning_rate": 4.7485587002096436e-05, "loss": 0.1473, "step": 1920 }, { "epoch": 0.10115303983228512, "grad_norm": 1.4552061557769775, "learning_rate": 4.747248427672956e-05, "loss": 0.1402, "step": 1930 }, { "epoch": 0.10167714884696016, "grad_norm": 3.4640729427337646, "learning_rate": 4.745938155136268e-05, "loss": 0.1738, "step": 1940 }, { "epoch": 0.10220125786163523, "grad_norm": 1.700392484664917, "learning_rate": 4.744627882599581e-05, "loss": 0.1367, "step": 1950 }, { "epoch": 0.10272536687631027, "grad_norm": 1.9521784782409668, "learning_rate": 4.743317610062894e-05, "loss": 0.1459, "step": 1960 }, { "epoch": 0.10324947589098532, "grad_norm": 2.07822322845459, "learning_rate": 4.742007337526206e-05, "loss": 0.1851, "step": 1970 }, { "epoch": 0.10377358490566038, "grad_norm": 1.2537726163864136, "learning_rate": 4.7406970649895183e-05, "loss": 0.1588, "step": 1980 }, { "epoch": 0.10429769392033543, "grad_norm": 3.269179105758667, "learning_rate": 4.739386792452831e-05, "loss": 0.1697, "step": 1990 }, { "epoch": 0.10482180293501048, "grad_norm": 1.4735572338104248, "learning_rate": 4.7380765199161423e-05, "loss": 0.1634, "step": 2000 }, { "epoch": 0.10482180293501048, "eval_loss": 0.3126026391983032, "eval_runtime": 267.0186, "eval_samples_per_second": 7.456, "eval_steps_per_second": 1.243, "step": 2000 }, { "epoch": 0.10534591194968554, "grad_norm": 1.6849536895751953, "learning_rate": 4.736766247379455e-05, "loss": 0.1624, "step": 2010 }, { "epoch": 0.10587002096436059, "grad_norm": 1.844832181930542, "learning_rate": 4.735455974842768e-05, "loss": 0.144, "step": 2020 }, { "epoch": 0.10639412997903563, "grad_norm": 2.9537103176116943, "learning_rate": 4.73414570230608e-05, "loss": 0.1877, "step": 2030 }, { "epoch": 0.1069182389937107, "grad_norm": 0.9054603576660156, "learning_rate": 4.7328354297693924e-05, "loss": 0.1407, "step": 2040 }, { "epoch": 0.10744234800838574, "grad_norm": 1.7040249109268188, "learning_rate": 4.731525157232705e-05, "loss": 0.1432, "step": 2050 }, { "epoch": 0.10796645702306079, "grad_norm": 4.041100025177002, "learning_rate": 4.730214884696017e-05, "loss": 0.1575, "step": 2060 }, { "epoch": 0.10849056603773585, "grad_norm": 1.2942545413970947, "learning_rate": 4.7289046121593294e-05, "loss": 0.1458, "step": 2070 }, { "epoch": 0.1090146750524109, "grad_norm": 2.1114304065704346, "learning_rate": 4.727594339622642e-05, "loss": 0.1529, "step": 2080 }, { "epoch": 0.10953878406708595, "grad_norm": 1.232842206954956, "learning_rate": 4.726284067085954e-05, "loss": 0.158, "step": 2090 }, { "epoch": 0.11006289308176101, "grad_norm": 1.1483700275421143, "learning_rate": 4.7249737945492664e-05, "loss": 0.171, "step": 2100 }, { "epoch": 0.11058700209643606, "grad_norm": 2.2358574867248535, "learning_rate": 4.7236635220125794e-05, "loss": 0.138, "step": 2110 }, { "epoch": 0.1111111111111111, "grad_norm": 0.9957813620567322, "learning_rate": 4.722353249475891e-05, "loss": 0.1364, "step": 2120 }, { "epoch": 0.11163522012578617, "grad_norm": 1.6623884439468384, "learning_rate": 4.7210429769392034e-05, "loss": 0.1433, "step": 2130 }, { "epoch": 0.11215932914046121, "grad_norm": 1.312627911567688, "learning_rate": 4.719732704402516e-05, "loss": 0.1549, "step": 2140 }, { "epoch": 0.11268343815513626, "grad_norm": 1.5365536212921143, "learning_rate": 4.718422431865828e-05, "loss": 0.1608, "step": 2150 }, { "epoch": 0.11320754716981132, "grad_norm": 2.1387362480163574, "learning_rate": 4.7171121593291404e-05, "loss": 0.1678, "step": 2160 }, { "epoch": 0.11373165618448637, "grad_norm": 1.675684928894043, "learning_rate": 4.715801886792453e-05, "loss": 0.1411, "step": 2170 }, { "epoch": 0.11425576519916142, "grad_norm": 2.3778798580169678, "learning_rate": 4.714491614255766e-05, "loss": 0.1283, "step": 2180 }, { "epoch": 0.11477987421383648, "grad_norm": 1.5061962604522705, "learning_rate": 4.713181341719078e-05, "loss": 0.1551, "step": 2190 }, { "epoch": 0.11530398322851153, "grad_norm": 1.1273947954177856, "learning_rate": 4.7118710691823905e-05, "loss": 0.1316, "step": 2200 }, { "epoch": 0.11582809224318659, "grad_norm": 1.9337362051010132, "learning_rate": 4.710560796645703e-05, "loss": 0.1611, "step": 2210 }, { "epoch": 0.11635220125786164, "grad_norm": 5.833995342254639, "learning_rate": 4.709250524109015e-05, "loss": 0.1447, "step": 2220 }, { "epoch": 0.11687631027253668, "grad_norm": 1.9764273166656494, "learning_rate": 4.7079402515723275e-05, "loss": 0.154, "step": 2230 }, { "epoch": 0.11740041928721175, "grad_norm": 1.5811856985092163, "learning_rate": 4.706629979035639e-05, "loss": 0.1399, "step": 2240 }, { "epoch": 0.1179245283018868, "grad_norm": 1.6124383211135864, "learning_rate": 4.7053197064989515e-05, "loss": 0.1416, "step": 2250 }, { "epoch": 0.11844863731656184, "grad_norm": 1.4890698194503784, "learning_rate": 4.7040094339622645e-05, "loss": 0.1595, "step": 2260 }, { "epoch": 0.1189727463312369, "grad_norm": 1.7862485647201538, "learning_rate": 4.702699161425577e-05, "loss": 0.167, "step": 2270 }, { "epoch": 0.11949685534591195, "grad_norm": 1.266352891921997, "learning_rate": 4.701388888888889e-05, "loss": 0.1934, "step": 2280 }, { "epoch": 0.120020964360587, "grad_norm": 2.252445697784424, "learning_rate": 4.7000786163522015e-05, "loss": 0.1665, "step": 2290 }, { "epoch": 0.12054507337526206, "grad_norm": 0.7538189888000488, "learning_rate": 4.698768343815514e-05, "loss": 0.1292, "step": 2300 }, { "epoch": 0.12106918238993711, "grad_norm": 1.1450074911117554, "learning_rate": 4.697458071278826e-05, "loss": 0.1515, "step": 2310 }, { "epoch": 0.12159329140461216, "grad_norm": 1.5450239181518555, "learning_rate": 4.6961477987421385e-05, "loss": 0.1658, "step": 2320 }, { "epoch": 0.12211740041928722, "grad_norm": 2.0828919410705566, "learning_rate": 4.694837526205451e-05, "loss": 0.1334, "step": 2330 }, { "epoch": 0.12264150943396226, "grad_norm": 3.190901756286621, "learning_rate": 4.693527253668764e-05, "loss": 0.1313, "step": 2340 }, { "epoch": 0.12316561844863731, "grad_norm": 1.8129808902740479, "learning_rate": 4.692216981132076e-05, "loss": 0.1695, "step": 2350 }, { "epoch": 0.12368972746331237, "grad_norm": 1.8840081691741943, "learning_rate": 4.690906708595388e-05, "loss": 0.1409, "step": 2360 }, { "epoch": 0.12421383647798742, "grad_norm": 1.7952953577041626, "learning_rate": 4.6895964360587e-05, "loss": 0.1132, "step": 2370 }, { "epoch": 0.12473794549266247, "grad_norm": 2.009758949279785, "learning_rate": 4.6882861635220126e-05, "loss": 0.1673, "step": 2380 }, { "epoch": 0.12526205450733752, "grad_norm": 1.2804065942764282, "learning_rate": 4.686975890985325e-05, "loss": 0.1489, "step": 2390 }, { "epoch": 0.12578616352201258, "grad_norm": 1.7180591821670532, "learning_rate": 4.685665618448637e-05, "loss": 0.1653, "step": 2400 }, { "epoch": 0.12631027253668764, "grad_norm": 3.489091634750366, "learning_rate": 4.6843553459119496e-05, "loss": 0.128, "step": 2410 }, { "epoch": 0.12683438155136267, "grad_norm": 2.4423317909240723, "learning_rate": 4.6830450733752626e-05, "loss": 0.1706, "step": 2420 }, { "epoch": 0.12735849056603774, "grad_norm": 0.7794014811515808, "learning_rate": 4.681734800838575e-05, "loss": 0.143, "step": 2430 }, { "epoch": 0.1278825995807128, "grad_norm": 1.549364447593689, "learning_rate": 4.680424528301887e-05, "loss": 0.163, "step": 2440 }, { "epoch": 0.12840670859538783, "grad_norm": 4.103890895843506, "learning_rate": 4.6791142557651996e-05, "loss": 0.1499, "step": 2450 }, { "epoch": 0.1289308176100629, "grad_norm": 1.770443081855774, "learning_rate": 4.677803983228512e-05, "loss": 0.1629, "step": 2460 }, { "epoch": 0.12945492662473795, "grad_norm": 2.194091796875, "learning_rate": 4.6764937106918236e-05, "loss": 0.198, "step": 2470 }, { "epoch": 0.129979035639413, "grad_norm": 1.8205286264419556, "learning_rate": 4.675183438155136e-05, "loss": 0.1464, "step": 2480 }, { "epoch": 0.13050314465408805, "grad_norm": 1.9129751920700073, "learning_rate": 4.673873165618449e-05, "loss": 0.1567, "step": 2490 }, { "epoch": 0.1310272536687631, "grad_norm": 1.2685729265213013, "learning_rate": 4.672562893081761e-05, "loss": 0.1614, "step": 2500 }, { "epoch": 0.13155136268343814, "grad_norm": 2.170300006866455, "learning_rate": 4.6712526205450736e-05, "loss": 0.153, "step": 2510 }, { "epoch": 0.1320754716981132, "grad_norm": 0.7259665727615356, "learning_rate": 4.669942348008386e-05, "loss": 0.1171, "step": 2520 }, { "epoch": 0.13259958071278827, "grad_norm": 1.8095953464508057, "learning_rate": 4.668632075471698e-05, "loss": 0.1621, "step": 2530 }, { "epoch": 0.1331236897274633, "grad_norm": 3.30016827583313, "learning_rate": 4.6673218029350107e-05, "loss": 0.1727, "step": 2540 }, { "epoch": 0.13364779874213836, "grad_norm": 1.0500829219818115, "learning_rate": 4.666011530398323e-05, "loss": 0.177, "step": 2550 }, { "epoch": 0.13417190775681342, "grad_norm": 2.164457321166992, "learning_rate": 4.664701257861635e-05, "loss": 0.1632, "step": 2560 }, { "epoch": 0.13469601677148846, "grad_norm": 1.6930001974105835, "learning_rate": 4.663390985324948e-05, "loss": 0.1496, "step": 2570 }, { "epoch": 0.13522012578616352, "grad_norm": 2.6686787605285645, "learning_rate": 4.662080712788261e-05, "loss": 0.1436, "step": 2580 }, { "epoch": 0.13574423480083858, "grad_norm": 1.6781806945800781, "learning_rate": 4.6607704402515723e-05, "loss": 0.1369, "step": 2590 }, { "epoch": 0.13626834381551362, "grad_norm": 1.3258206844329834, "learning_rate": 4.659460167714885e-05, "loss": 0.1494, "step": 2600 }, { "epoch": 0.13679245283018868, "grad_norm": 2.0583879947662354, "learning_rate": 4.658149895178197e-05, "loss": 0.15, "step": 2610 }, { "epoch": 0.13731656184486374, "grad_norm": 3.495466709136963, "learning_rate": 4.6568396226415094e-05, "loss": 0.1532, "step": 2620 }, { "epoch": 0.13784067085953877, "grad_norm": 2.6582422256469727, "learning_rate": 4.655529350104822e-05, "loss": 0.1372, "step": 2630 }, { "epoch": 0.13836477987421383, "grad_norm": 2.5871691703796387, "learning_rate": 4.654219077568134e-05, "loss": 0.1544, "step": 2640 }, { "epoch": 0.1388888888888889, "grad_norm": 3.1617019176483154, "learning_rate": 4.652908805031447e-05, "loss": 0.1149, "step": 2650 }, { "epoch": 0.13941299790356393, "grad_norm": 1.8542845249176025, "learning_rate": 4.6515985324947594e-05, "loss": 0.1354, "step": 2660 }, { "epoch": 0.139937106918239, "grad_norm": 1.4777790307998657, "learning_rate": 4.650288259958072e-05, "loss": 0.1347, "step": 2670 }, { "epoch": 0.14046121593291405, "grad_norm": 2.647198438644409, "learning_rate": 4.648977987421384e-05, "loss": 0.1698, "step": 2680 }, { "epoch": 0.14098532494758909, "grad_norm": 2.088172197341919, "learning_rate": 4.6476677148846964e-05, "loss": 0.1414, "step": 2690 }, { "epoch": 0.14150943396226415, "grad_norm": 1.4307177066802979, "learning_rate": 4.646357442348009e-05, "loss": 0.1309, "step": 2700 }, { "epoch": 0.1420335429769392, "grad_norm": 1.9990031719207764, "learning_rate": 4.6450471698113204e-05, "loss": 0.1448, "step": 2710 }, { "epoch": 0.14255765199161424, "grad_norm": 2.125288724899292, "learning_rate": 4.6437368972746334e-05, "loss": 0.1224, "step": 2720 }, { "epoch": 0.1430817610062893, "grad_norm": 1.175098180770874, "learning_rate": 4.642426624737946e-05, "loss": 0.1375, "step": 2730 }, { "epoch": 0.14360587002096437, "grad_norm": 2.440237522125244, "learning_rate": 4.641116352201258e-05, "loss": 0.1617, "step": 2740 }, { "epoch": 0.1441299790356394, "grad_norm": 1.116590976715088, "learning_rate": 4.6398060796645704e-05, "loss": 0.121, "step": 2750 }, { "epoch": 0.14465408805031446, "grad_norm": 2.4094574451446533, "learning_rate": 4.638495807127883e-05, "loss": 0.1326, "step": 2760 }, { "epoch": 0.14517819706498952, "grad_norm": 1.5828850269317627, "learning_rate": 4.637185534591195e-05, "loss": 0.1741, "step": 2770 }, { "epoch": 0.14570230607966456, "grad_norm": 1.1513792276382446, "learning_rate": 4.6358752620545075e-05, "loss": 0.1224, "step": 2780 }, { "epoch": 0.14622641509433962, "grad_norm": 1.0948734283447266, "learning_rate": 4.63456498951782e-05, "loss": 0.1681, "step": 2790 }, { "epoch": 0.14675052410901468, "grad_norm": 1.7512544393539429, "learning_rate": 4.633254716981132e-05, "loss": 0.1803, "step": 2800 }, { "epoch": 0.14727463312368974, "grad_norm": 1.3149117231369019, "learning_rate": 4.631944444444445e-05, "loss": 0.1433, "step": 2810 }, { "epoch": 0.14779874213836477, "grad_norm": 1.12627112865448, "learning_rate": 4.6306341719077575e-05, "loss": 0.1586, "step": 2820 }, { "epoch": 0.14832285115303984, "grad_norm": 1.3142722845077515, "learning_rate": 4.629323899371069e-05, "loss": 0.137, "step": 2830 }, { "epoch": 0.1488469601677149, "grad_norm": 1.5012706518173218, "learning_rate": 4.6280136268343815e-05, "loss": 0.1382, "step": 2840 }, { "epoch": 0.14937106918238993, "grad_norm": 2.6549742221832275, "learning_rate": 4.626703354297694e-05, "loss": 0.1624, "step": 2850 }, { "epoch": 0.149895178197065, "grad_norm": 1.5423760414123535, "learning_rate": 4.625393081761006e-05, "loss": 0.136, "step": 2860 }, { "epoch": 0.15041928721174005, "grad_norm": 1.6354541778564453, "learning_rate": 4.6240828092243185e-05, "loss": 0.1606, "step": 2870 }, { "epoch": 0.1509433962264151, "grad_norm": 1.840997338294983, "learning_rate": 4.6227725366876315e-05, "loss": 0.1657, "step": 2880 }, { "epoch": 0.15146750524109015, "grad_norm": 1.7014777660369873, "learning_rate": 4.621462264150944e-05, "loss": 0.152, "step": 2890 }, { "epoch": 0.1519916142557652, "grad_norm": 1.1734257936477661, "learning_rate": 4.620151991614256e-05, "loss": 0.1468, "step": 2900 }, { "epoch": 0.15251572327044025, "grad_norm": 1.7266418933868408, "learning_rate": 4.6188417190775685e-05, "loss": 0.1644, "step": 2910 }, { "epoch": 0.1530398322851153, "grad_norm": 1.7747575044631958, "learning_rate": 4.617531446540881e-05, "loss": 0.1536, "step": 2920 }, { "epoch": 0.15356394129979037, "grad_norm": 2.43758487701416, "learning_rate": 4.616221174004193e-05, "loss": 0.1576, "step": 2930 }, { "epoch": 0.1540880503144654, "grad_norm": 1.5235657691955566, "learning_rate": 4.6149109014675056e-05, "loss": 0.1342, "step": 2940 }, { "epoch": 0.15461215932914046, "grad_norm": 2.2934274673461914, "learning_rate": 4.613600628930818e-05, "loss": 0.1715, "step": 2950 }, { "epoch": 0.15513626834381553, "grad_norm": 1.945422887802124, "learning_rate": 4.61229035639413e-05, "loss": 0.1629, "step": 2960 }, { "epoch": 0.15566037735849056, "grad_norm": 1.4969581365585327, "learning_rate": 4.6109800838574426e-05, "loss": 0.1411, "step": 2970 }, { "epoch": 0.15618448637316562, "grad_norm": 1.6622493267059326, "learning_rate": 4.609669811320755e-05, "loss": 0.1424, "step": 2980 }, { "epoch": 0.15670859538784068, "grad_norm": 1.6840288639068604, "learning_rate": 4.608359538784067e-05, "loss": 0.1623, "step": 2990 }, { "epoch": 0.15723270440251572, "grad_norm": 1.4131284952163696, "learning_rate": 4.6070492662473796e-05, "loss": 0.1682, "step": 3000 }, { "epoch": 0.15723270440251572, "eval_loss": 0.32990387082099915, "eval_runtime": 267.4782, "eval_samples_per_second": 7.444, "eval_steps_per_second": 1.241, "step": 3000 }, { "epoch": 0.15775681341719078, "grad_norm": 1.9176369905471802, "learning_rate": 4.605738993710692e-05, "loss": 0.1506, "step": 3010 }, { "epoch": 0.15828092243186584, "grad_norm": 2.0427310466766357, "learning_rate": 4.604428721174004e-05, "loss": 0.139, "step": 3020 }, { "epoch": 0.15880503144654087, "grad_norm": 0.8098589181900024, "learning_rate": 4.6031184486373166e-05, "loss": 0.1411, "step": 3030 }, { "epoch": 0.15932914046121593, "grad_norm": 1.4167261123657227, "learning_rate": 4.6018081761006296e-05, "loss": 0.1425, "step": 3040 }, { "epoch": 0.159853249475891, "grad_norm": 1.541549801826477, "learning_rate": 4.600497903563942e-05, "loss": 0.1243, "step": 3050 }, { "epoch": 0.16037735849056603, "grad_norm": 2.3950603008270264, "learning_rate": 4.599187631027254e-05, "loss": 0.1456, "step": 3060 }, { "epoch": 0.1609014675052411, "grad_norm": 1.3509130477905273, "learning_rate": 4.597877358490566e-05, "loss": 0.1447, "step": 3070 }, { "epoch": 0.16142557651991615, "grad_norm": 1.6379474401474, "learning_rate": 4.596567085953878e-05, "loss": 0.1296, "step": 3080 }, { "epoch": 0.1619496855345912, "grad_norm": 1.4416191577911377, "learning_rate": 4.5952568134171906e-05, "loss": 0.1413, "step": 3090 }, { "epoch": 0.16247379454926625, "grad_norm": 1.3093364238739014, "learning_rate": 4.593946540880503e-05, "loss": 0.1566, "step": 3100 }, { "epoch": 0.1629979035639413, "grad_norm": 1.1312339305877686, "learning_rate": 4.592636268343816e-05, "loss": 0.144, "step": 3110 }, { "epoch": 0.16352201257861634, "grad_norm": 1.5782580375671387, "learning_rate": 4.591325995807128e-05, "loss": 0.1596, "step": 3120 }, { "epoch": 0.1640461215932914, "grad_norm": 1.804060697555542, "learning_rate": 4.5900157232704407e-05, "loss": 0.1431, "step": 3130 }, { "epoch": 0.16457023060796647, "grad_norm": 1.9302703142166138, "learning_rate": 4.588705450733753e-05, "loss": 0.1696, "step": 3140 }, { "epoch": 0.1650943396226415, "grad_norm": 1.2807234525680542, "learning_rate": 4.587395178197065e-05, "loss": 0.1502, "step": 3150 }, { "epoch": 0.16561844863731656, "grad_norm": 1.3993560075759888, "learning_rate": 4.586084905660378e-05, "loss": 0.1489, "step": 3160 }, { "epoch": 0.16614255765199162, "grad_norm": 2.2172915935516357, "learning_rate": 4.58477463312369e-05, "loss": 0.14, "step": 3170 }, { "epoch": 0.16666666666666666, "grad_norm": 2.70487117767334, "learning_rate": 4.5834643605870024e-05, "loss": 0.1605, "step": 3180 }, { "epoch": 0.16719077568134172, "grad_norm": 1.2218254804611206, "learning_rate": 4.582154088050315e-05, "loss": 0.1503, "step": 3190 }, { "epoch": 0.16771488469601678, "grad_norm": 1.8308320045471191, "learning_rate": 4.580843815513627e-05, "loss": 0.1308, "step": 3200 }, { "epoch": 0.16823899371069181, "grad_norm": 2.082028865814209, "learning_rate": 4.5795335429769394e-05, "loss": 0.1643, "step": 3210 }, { "epoch": 0.16876310272536688, "grad_norm": 1.1031092405319214, "learning_rate": 4.578223270440252e-05, "loss": 0.1329, "step": 3220 }, { "epoch": 0.16928721174004194, "grad_norm": 1.128424048423767, "learning_rate": 4.576912997903564e-05, "loss": 0.1344, "step": 3230 }, { "epoch": 0.16981132075471697, "grad_norm": 1.0885006189346313, "learning_rate": 4.5756027253668764e-05, "loss": 0.1365, "step": 3240 }, { "epoch": 0.17033542976939203, "grad_norm": 1.0524739027023315, "learning_rate": 4.574292452830189e-05, "loss": 0.1377, "step": 3250 }, { "epoch": 0.1708595387840671, "grad_norm": 1.4963139295578003, "learning_rate": 4.572982180293501e-05, "loss": 0.1555, "step": 3260 }, { "epoch": 0.17138364779874213, "grad_norm": 1.3145508766174316, "learning_rate": 4.571671907756814e-05, "loss": 0.1345, "step": 3270 }, { "epoch": 0.1719077568134172, "grad_norm": 1.2556843757629395, "learning_rate": 4.5703616352201264e-05, "loss": 0.1298, "step": 3280 }, { "epoch": 0.17243186582809225, "grad_norm": 2.102116584777832, "learning_rate": 4.569051362683439e-05, "loss": 0.1717, "step": 3290 }, { "epoch": 0.17295597484276728, "grad_norm": 2.472163438796997, "learning_rate": 4.567741090146751e-05, "loss": 0.1381, "step": 3300 }, { "epoch": 0.17348008385744235, "grad_norm": 1.828572392463684, "learning_rate": 4.566430817610063e-05, "loss": 0.1226, "step": 3310 }, { "epoch": 0.1740041928721174, "grad_norm": 1.566004991531372, "learning_rate": 4.565120545073375e-05, "loss": 0.1356, "step": 3320 }, { "epoch": 0.17452830188679244, "grad_norm": 1.1402428150177002, "learning_rate": 4.5638102725366874e-05, "loss": 0.1192, "step": 3330 }, { "epoch": 0.1750524109014675, "grad_norm": 1.3995243310928345, "learning_rate": 4.5625e-05, "loss": 0.1352, "step": 3340 }, { "epoch": 0.17557651991614256, "grad_norm": 2.771331310272217, "learning_rate": 4.561189727463313e-05, "loss": 0.1585, "step": 3350 }, { "epoch": 0.1761006289308176, "grad_norm": 2.0721607208251953, "learning_rate": 4.559879454926625e-05, "loss": 0.156, "step": 3360 }, { "epoch": 0.17662473794549266, "grad_norm": 3.02065110206604, "learning_rate": 4.5585691823899375e-05, "loss": 0.1046, "step": 3370 }, { "epoch": 0.17714884696016772, "grad_norm": 1.377319574356079, "learning_rate": 4.55725890985325e-05, "loss": 0.1795, "step": 3380 }, { "epoch": 0.17767295597484276, "grad_norm": 2.061204195022583, "learning_rate": 4.555948637316562e-05, "loss": 0.1209, "step": 3390 }, { "epoch": 0.17819706498951782, "grad_norm": 1.6569091081619263, "learning_rate": 4.5546383647798745e-05, "loss": 0.1244, "step": 3400 }, { "epoch": 0.17872117400419288, "grad_norm": 1.5188376903533936, "learning_rate": 4.553328092243187e-05, "loss": 0.1589, "step": 3410 }, { "epoch": 0.1792452830188679, "grad_norm": 1.463887095451355, "learning_rate": 4.552017819706499e-05, "loss": 0.1109, "step": 3420 }, { "epoch": 0.17976939203354297, "grad_norm": 1.8769468069076538, "learning_rate": 4.5507075471698115e-05, "loss": 0.1313, "step": 3430 }, { "epoch": 0.18029350104821804, "grad_norm": 1.344223976135254, "learning_rate": 4.549397274633124e-05, "loss": 0.1406, "step": 3440 }, { "epoch": 0.18081761006289307, "grad_norm": 1.0123755931854248, "learning_rate": 4.548087002096436e-05, "loss": 0.1449, "step": 3450 }, { "epoch": 0.18134171907756813, "grad_norm": 0.779967725276947, "learning_rate": 4.5467767295597485e-05, "loss": 0.1201, "step": 3460 }, { "epoch": 0.1818658280922432, "grad_norm": 1.4109629392623901, "learning_rate": 4.545466457023061e-05, "loss": 0.1449, "step": 3470 }, { "epoch": 0.18238993710691823, "grad_norm": 0.7555325031280518, "learning_rate": 4.544156184486373e-05, "loss": 0.1368, "step": 3480 }, { "epoch": 0.1829140461215933, "grad_norm": 2.1541240215301514, "learning_rate": 4.5428459119496855e-05, "loss": 0.1483, "step": 3490 }, { "epoch": 0.18343815513626835, "grad_norm": 1.714505910873413, "learning_rate": 4.541535639412998e-05, "loss": 0.1706, "step": 3500 }, { "epoch": 0.18396226415094338, "grad_norm": 3.6619317531585693, "learning_rate": 4.540225366876311e-05, "loss": 0.1669, "step": 3510 }, { "epoch": 0.18448637316561844, "grad_norm": 1.6405800580978394, "learning_rate": 4.538915094339623e-05, "loss": 0.1463, "step": 3520 }, { "epoch": 0.1850104821802935, "grad_norm": 1.3726412057876587, "learning_rate": 4.5376048218029356e-05, "loss": 0.1274, "step": 3530 }, { "epoch": 0.18553459119496854, "grad_norm": 1.2834166288375854, "learning_rate": 4.536294549266248e-05, "loss": 0.1536, "step": 3540 }, { "epoch": 0.1860587002096436, "grad_norm": 1.3806772232055664, "learning_rate": 4.5349842767295596e-05, "loss": 0.1745, "step": 3550 }, { "epoch": 0.18658280922431866, "grad_norm": 1.5404632091522217, "learning_rate": 4.533674004192872e-05, "loss": 0.1519, "step": 3560 }, { "epoch": 0.1871069182389937, "grad_norm": 1.23709237575531, "learning_rate": 4.532363731656184e-05, "loss": 0.1706, "step": 3570 }, { "epoch": 0.18763102725366876, "grad_norm": 1.5122413635253906, "learning_rate": 4.531053459119497e-05, "loss": 0.1296, "step": 3580 }, { "epoch": 0.18815513626834382, "grad_norm": 1.3612315654754639, "learning_rate": 4.5297431865828096e-05, "loss": 0.1705, "step": 3590 }, { "epoch": 0.18867924528301888, "grad_norm": 2.4789493083953857, "learning_rate": 4.528432914046122e-05, "loss": 0.1162, "step": 3600 }, { "epoch": 0.18920335429769392, "grad_norm": 1.856713891029358, "learning_rate": 4.527122641509434e-05, "loss": 0.1527, "step": 3610 }, { "epoch": 0.18972746331236898, "grad_norm": 2.436396598815918, "learning_rate": 4.5258123689727466e-05, "loss": 0.1303, "step": 3620 }, { "epoch": 0.19025157232704404, "grad_norm": 1.4787251949310303, "learning_rate": 4.524502096436059e-05, "loss": 0.1531, "step": 3630 }, { "epoch": 0.19077568134171907, "grad_norm": 1.5141669511795044, "learning_rate": 4.523191823899371e-05, "loss": 0.1397, "step": 3640 }, { "epoch": 0.19129979035639413, "grad_norm": 1.4852555990219116, "learning_rate": 4.5218815513626836e-05, "loss": 0.1645, "step": 3650 }, { "epoch": 0.1918238993710692, "grad_norm": 2.069603204727173, "learning_rate": 4.520571278825996e-05, "loss": 0.1361, "step": 3660 }, { "epoch": 0.19234800838574423, "grad_norm": 1.8766626119613647, "learning_rate": 4.519261006289308e-05, "loss": 0.1536, "step": 3670 }, { "epoch": 0.1928721174004193, "grad_norm": 1.3918403387069702, "learning_rate": 4.5179507337526206e-05, "loss": 0.1456, "step": 3680 }, { "epoch": 0.19339622641509435, "grad_norm": 3.741128444671631, "learning_rate": 4.516640461215933e-05, "loss": 0.1348, "step": 3690 }, { "epoch": 0.19392033542976939, "grad_norm": 2.8250186443328857, "learning_rate": 4.515330188679245e-05, "loss": 0.1424, "step": 3700 }, { "epoch": 0.19444444444444445, "grad_norm": 1.4397534132003784, "learning_rate": 4.5140199161425577e-05, "loss": 0.1534, "step": 3710 }, { "epoch": 0.1949685534591195, "grad_norm": 1.3303152322769165, "learning_rate": 4.51270964360587e-05, "loss": 0.126, "step": 3720 }, { "epoch": 0.19549266247379454, "grad_norm": 1.1962412595748901, "learning_rate": 4.511399371069182e-05, "loss": 0.1284, "step": 3730 }, { "epoch": 0.1960167714884696, "grad_norm": 2.205249309539795, "learning_rate": 4.5100890985324953e-05, "loss": 0.1575, "step": 3740 }, { "epoch": 0.19654088050314467, "grad_norm": 1.817156195640564, "learning_rate": 4.508778825995808e-05, "loss": 0.1523, "step": 3750 }, { "epoch": 0.1970649895178197, "grad_norm": 1.2243177890777588, "learning_rate": 4.50746855345912e-05, "loss": 0.1379, "step": 3760 }, { "epoch": 0.19758909853249476, "grad_norm": 1.650865077972412, "learning_rate": 4.5061582809224324e-05, "loss": 0.1374, "step": 3770 }, { "epoch": 0.19811320754716982, "grad_norm": 2.2793819904327393, "learning_rate": 4.504848008385745e-05, "loss": 0.1408, "step": 3780 }, { "epoch": 0.19863731656184486, "grad_norm": 2.1652002334594727, "learning_rate": 4.5035377358490564e-05, "loss": 0.1281, "step": 3790 }, { "epoch": 0.19916142557651992, "grad_norm": 1.8161773681640625, "learning_rate": 4.502227463312369e-05, "loss": 0.1492, "step": 3800 }, { "epoch": 0.19968553459119498, "grad_norm": 1.5104507207870483, "learning_rate": 4.500917190775682e-05, "loss": 0.1419, "step": 3810 }, { "epoch": 0.20020964360587, "grad_norm": 2.545625925064087, "learning_rate": 4.499606918238994e-05, "loss": 0.1451, "step": 3820 }, { "epoch": 0.20073375262054508, "grad_norm": 1.7046788930892944, "learning_rate": 4.4982966457023064e-05, "loss": 0.1623, "step": 3830 }, { "epoch": 0.20125786163522014, "grad_norm": 1.6505851745605469, "learning_rate": 4.496986373165619e-05, "loss": 0.1357, "step": 3840 }, { "epoch": 0.20178197064989517, "grad_norm": 7.002503395080566, "learning_rate": 4.495676100628931e-05, "loss": 0.1187, "step": 3850 }, { "epoch": 0.20230607966457023, "grad_norm": 1.687934398651123, "learning_rate": 4.4943658280922434e-05, "loss": 0.1638, "step": 3860 }, { "epoch": 0.2028301886792453, "grad_norm": 1.5736196041107178, "learning_rate": 4.493055555555556e-05, "loss": 0.1493, "step": 3870 }, { "epoch": 0.20335429769392033, "grad_norm": 1.3231173753738403, "learning_rate": 4.491745283018868e-05, "loss": 0.1393, "step": 3880 }, { "epoch": 0.2038784067085954, "grad_norm": 1.744145154953003, "learning_rate": 4.4904350104821804e-05, "loss": 0.1424, "step": 3890 }, { "epoch": 0.20440251572327045, "grad_norm": 1.9657684564590454, "learning_rate": 4.4891247379454934e-05, "loss": 0.1211, "step": 3900 }, { "epoch": 0.20492662473794548, "grad_norm": 1.672458529472351, "learning_rate": 4.487814465408805e-05, "loss": 0.1592, "step": 3910 }, { "epoch": 0.20545073375262055, "grad_norm": 2.555405378341675, "learning_rate": 4.4865041928721174e-05, "loss": 0.1473, "step": 3920 }, { "epoch": 0.2059748427672956, "grad_norm": 5.1308183670043945, "learning_rate": 4.48519392033543e-05, "loss": 0.153, "step": 3930 }, { "epoch": 0.20649895178197064, "grad_norm": 0.9765024185180664, "learning_rate": 4.483883647798742e-05, "loss": 0.1173, "step": 3940 }, { "epoch": 0.2070230607966457, "grad_norm": 2.002351999282837, "learning_rate": 4.4825733752620545e-05, "loss": 0.1367, "step": 3950 }, { "epoch": 0.20754716981132076, "grad_norm": 4.004027843475342, "learning_rate": 4.481263102725367e-05, "loss": 0.1373, "step": 3960 }, { "epoch": 0.2080712788259958, "grad_norm": 3.0619328022003174, "learning_rate": 4.47995283018868e-05, "loss": 0.1148, "step": 3970 }, { "epoch": 0.20859538784067086, "grad_norm": 1.905088186264038, "learning_rate": 4.478642557651992e-05, "loss": 0.1673, "step": 3980 }, { "epoch": 0.20911949685534592, "grad_norm": 1.8251434564590454, "learning_rate": 4.4773322851153045e-05, "loss": 0.1535, "step": 3990 }, { "epoch": 0.20964360587002095, "grad_norm": 2.9351112842559814, "learning_rate": 4.476022012578617e-05, "loss": 0.1167, "step": 4000 }, { "epoch": 0.20964360587002095, "eval_loss": 0.30853426456451416, "eval_runtime": 267.8685, "eval_samples_per_second": 7.433, "eval_steps_per_second": 1.239, "step": 4000 }, { "epoch": 0.21016771488469602, "grad_norm": 1.5548582077026367, "learning_rate": 4.474711740041929e-05, "loss": 0.1359, "step": 4010 }, { "epoch": 0.21069182389937108, "grad_norm": 1.887024164199829, "learning_rate": 4.473401467505241e-05, "loss": 0.145, "step": 4020 }, { "epoch": 0.2112159329140461, "grad_norm": 1.1486566066741943, "learning_rate": 4.472091194968553e-05, "loss": 0.1225, "step": 4030 }, { "epoch": 0.21174004192872117, "grad_norm": 6.734325885772705, "learning_rate": 4.470780922431866e-05, "loss": 0.1309, "step": 4040 }, { "epoch": 0.21226415094339623, "grad_norm": 1.878401756286621, "learning_rate": 4.4694706498951785e-05, "loss": 0.1526, "step": 4050 }, { "epoch": 0.21278825995807127, "grad_norm": 1.8532344102859497, "learning_rate": 4.468160377358491e-05, "loss": 0.1794, "step": 4060 }, { "epoch": 0.21331236897274633, "grad_norm": 1.9898567199707031, "learning_rate": 4.466850104821803e-05, "loss": 0.1432, "step": 4070 }, { "epoch": 0.2138364779874214, "grad_norm": 1.4500483274459839, "learning_rate": 4.4655398322851155e-05, "loss": 0.1246, "step": 4080 }, { "epoch": 0.21436058700209643, "grad_norm": 2.1289262771606445, "learning_rate": 4.464229559748428e-05, "loss": 0.1444, "step": 4090 }, { "epoch": 0.2148846960167715, "grad_norm": 1.5706223249435425, "learning_rate": 4.46291928721174e-05, "loss": 0.135, "step": 4100 }, { "epoch": 0.21540880503144655, "grad_norm": 1.5430450439453125, "learning_rate": 4.4616090146750525e-05, "loss": 0.1527, "step": 4110 }, { "epoch": 0.21593291404612158, "grad_norm": 1.4780341386795044, "learning_rate": 4.460298742138365e-05, "loss": 0.1401, "step": 4120 }, { "epoch": 0.21645702306079664, "grad_norm": 1.447033166885376, "learning_rate": 4.458988469601678e-05, "loss": 0.1461, "step": 4130 }, { "epoch": 0.2169811320754717, "grad_norm": 1.8528786897659302, "learning_rate": 4.4576781970649896e-05, "loss": 0.1243, "step": 4140 }, { "epoch": 0.21750524109014674, "grad_norm": 1.6013343334197998, "learning_rate": 4.456367924528302e-05, "loss": 0.1052, "step": 4150 }, { "epoch": 0.2180293501048218, "grad_norm": 1.5129047632217407, "learning_rate": 4.455057651991614e-05, "loss": 0.1346, "step": 4160 }, { "epoch": 0.21855345911949686, "grad_norm": 2.5807998180389404, "learning_rate": 4.4537473794549266e-05, "loss": 0.14, "step": 4170 }, { "epoch": 0.2190775681341719, "grad_norm": 2.2867794036865234, "learning_rate": 4.452437106918239e-05, "loss": 0.1632, "step": 4180 }, { "epoch": 0.21960167714884696, "grad_norm": 1.180045247077942, "learning_rate": 4.451126834381551e-05, "loss": 0.1398, "step": 4190 }, { "epoch": 0.22012578616352202, "grad_norm": 1.8197094202041626, "learning_rate": 4.449816561844864e-05, "loss": 0.1282, "step": 4200 }, { "epoch": 0.22064989517819705, "grad_norm": 1.807565689086914, "learning_rate": 4.4485062893081766e-05, "loss": 0.1569, "step": 4210 }, { "epoch": 0.22117400419287211, "grad_norm": 2.2830843925476074, "learning_rate": 4.447196016771489e-05, "loss": 0.1479, "step": 4220 }, { "epoch": 0.22169811320754718, "grad_norm": 1.467529058456421, "learning_rate": 4.445885744234801e-05, "loss": 0.1247, "step": 4230 }, { "epoch": 0.2222222222222222, "grad_norm": 1.474076271057129, "learning_rate": 4.4445754716981136e-05, "loss": 0.1556, "step": 4240 }, { "epoch": 0.22274633123689727, "grad_norm": 2.1999564170837402, "learning_rate": 4.443265199161426e-05, "loss": 0.1816, "step": 4250 }, { "epoch": 0.22327044025157233, "grad_norm": 1.921810507774353, "learning_rate": 4.4419549266247376e-05, "loss": 0.1245, "step": 4260 }, { "epoch": 0.22379454926624737, "grad_norm": 3.8777430057525635, "learning_rate": 4.4406446540880506e-05, "loss": 0.1503, "step": 4270 }, { "epoch": 0.22431865828092243, "grad_norm": 2.411684989929199, "learning_rate": 4.439334381551363e-05, "loss": 0.1776, "step": 4280 }, { "epoch": 0.2248427672955975, "grad_norm": 1.9826992750167847, "learning_rate": 4.438024109014675e-05, "loss": 0.124, "step": 4290 }, { "epoch": 0.22536687631027252, "grad_norm": 1.5890358686447144, "learning_rate": 4.4367138364779877e-05, "loss": 0.1429, "step": 4300 }, { "epoch": 0.22589098532494759, "grad_norm": 1.2754069566726685, "learning_rate": 4.4354035639413e-05, "loss": 0.1331, "step": 4310 }, { "epoch": 0.22641509433962265, "grad_norm": 1.54972505569458, "learning_rate": 4.434093291404612e-05, "loss": 0.1105, "step": 4320 }, { "epoch": 0.22693920335429768, "grad_norm": 2.839402437210083, "learning_rate": 4.432783018867925e-05, "loss": 0.1604, "step": 4330 }, { "epoch": 0.22746331236897274, "grad_norm": 1.7896904945373535, "learning_rate": 4.431472746331237e-05, "loss": 0.1081, "step": 4340 }, { "epoch": 0.2279874213836478, "grad_norm": 2.3019604682922363, "learning_rate": 4.4301624737945494e-05, "loss": 0.0951, "step": 4350 }, { "epoch": 0.22851153039832284, "grad_norm": 1.953927755355835, "learning_rate": 4.4288522012578624e-05, "loss": 0.1508, "step": 4360 }, { "epoch": 0.2290356394129979, "grad_norm": 2.379554510116577, "learning_rate": 4.427541928721175e-05, "loss": 0.1446, "step": 4370 }, { "epoch": 0.22955974842767296, "grad_norm": 1.1505978107452393, "learning_rate": 4.4262316561844864e-05, "loss": 0.1378, "step": 4380 }, { "epoch": 0.23008385744234802, "grad_norm": 1.7770054340362549, "learning_rate": 4.424921383647799e-05, "loss": 0.1432, "step": 4390 }, { "epoch": 0.23060796645702306, "grad_norm": 2.470569133758545, "learning_rate": 4.423611111111111e-05, "loss": 0.1286, "step": 4400 }, { "epoch": 0.23113207547169812, "grad_norm": 2.54297137260437, "learning_rate": 4.4223008385744234e-05, "loss": 0.1375, "step": 4410 }, { "epoch": 0.23165618448637318, "grad_norm": 1.6648815870285034, "learning_rate": 4.420990566037736e-05, "loss": 0.1307, "step": 4420 }, { "epoch": 0.2321802935010482, "grad_norm": 2.62874174118042, "learning_rate": 4.419680293501048e-05, "loss": 0.1549, "step": 4430 }, { "epoch": 0.23270440251572327, "grad_norm": 1.2740308046340942, "learning_rate": 4.418370020964361e-05, "loss": 0.1303, "step": 4440 }, { "epoch": 0.23322851153039834, "grad_norm": 2.0457019805908203, "learning_rate": 4.4170597484276734e-05, "loss": 0.1381, "step": 4450 }, { "epoch": 0.23375262054507337, "grad_norm": 1.8146222829818726, "learning_rate": 4.415749475890986e-05, "loss": 0.1178, "step": 4460 }, { "epoch": 0.23427672955974843, "grad_norm": 1.4437885284423828, "learning_rate": 4.414439203354298e-05, "loss": 0.1612, "step": 4470 }, { "epoch": 0.2348008385744235, "grad_norm": 1.3668901920318604, "learning_rate": 4.4131289308176104e-05, "loss": 0.1306, "step": 4480 }, { "epoch": 0.23532494758909853, "grad_norm": 1.2944235801696777, "learning_rate": 4.411818658280923e-05, "loss": 0.1164, "step": 4490 }, { "epoch": 0.2358490566037736, "grad_norm": 1.7875624895095825, "learning_rate": 4.4105083857442344e-05, "loss": 0.1303, "step": 4500 }, { "epoch": 0.23637316561844865, "grad_norm": 1.772647500038147, "learning_rate": 4.4091981132075474e-05, "loss": 0.1428, "step": 4510 }, { "epoch": 0.23689727463312368, "grad_norm": 1.3781625032424927, "learning_rate": 4.40788784067086e-05, "loss": 0.1246, "step": 4520 }, { "epoch": 0.23742138364779874, "grad_norm": 2.086047649383545, "learning_rate": 4.406577568134172e-05, "loss": 0.1437, "step": 4530 }, { "epoch": 0.2379454926624738, "grad_norm": 1.900099754333496, "learning_rate": 4.4052672955974845e-05, "loss": 0.1324, "step": 4540 }, { "epoch": 0.23846960167714884, "grad_norm": 5.876873016357422, "learning_rate": 4.403957023060797e-05, "loss": 0.1661, "step": 4550 }, { "epoch": 0.2389937106918239, "grad_norm": 2.2485744953155518, "learning_rate": 4.402646750524109e-05, "loss": 0.1262, "step": 4560 }, { "epoch": 0.23951781970649896, "grad_norm": 2.297826051712036, "learning_rate": 4.4013364779874215e-05, "loss": 0.1435, "step": 4570 }, { "epoch": 0.240041928721174, "grad_norm": 1.4973938465118408, "learning_rate": 4.400026205450734e-05, "loss": 0.1309, "step": 4580 }, { "epoch": 0.24056603773584906, "grad_norm": 3.2034554481506348, "learning_rate": 4.398715932914046e-05, "loss": 0.1406, "step": 4590 }, { "epoch": 0.24109014675052412, "grad_norm": 1.022011160850525, "learning_rate": 4.397405660377359e-05, "loss": 0.1335, "step": 4600 }, { "epoch": 0.24161425576519915, "grad_norm": 1.525408387184143, "learning_rate": 4.3960953878406715e-05, "loss": 0.1115, "step": 4610 }, { "epoch": 0.24213836477987422, "grad_norm": 1.0966118574142456, "learning_rate": 4.394785115303983e-05, "loss": 0.1334, "step": 4620 }, { "epoch": 0.24266247379454928, "grad_norm": 2.1737682819366455, "learning_rate": 4.3934748427672955e-05, "loss": 0.118, "step": 4630 }, { "epoch": 0.2431865828092243, "grad_norm": 1.6387145519256592, "learning_rate": 4.392164570230608e-05, "loss": 0.1034, "step": 4640 }, { "epoch": 0.24371069182389937, "grad_norm": 2.735123634338379, "learning_rate": 4.39085429769392e-05, "loss": 0.1379, "step": 4650 }, { "epoch": 0.24423480083857443, "grad_norm": 3.40956974029541, "learning_rate": 4.3895440251572325e-05, "loss": 0.1388, "step": 4660 }, { "epoch": 0.24475890985324947, "grad_norm": 2.1785049438476562, "learning_rate": 4.3882337526205455e-05, "loss": 0.1496, "step": 4670 }, { "epoch": 0.24528301886792453, "grad_norm": 1.1879709959030151, "learning_rate": 4.386923480083858e-05, "loss": 0.1515, "step": 4680 }, { "epoch": 0.2458071278825996, "grad_norm": 1.3993653059005737, "learning_rate": 4.38561320754717e-05, "loss": 0.1063, "step": 4690 }, { "epoch": 0.24633123689727462, "grad_norm": 2.238290309906006, "learning_rate": 4.3843029350104826e-05, "loss": 0.1606, "step": 4700 }, { "epoch": 0.2468553459119497, "grad_norm": 1.1759029626846313, "learning_rate": 4.382992662473795e-05, "loss": 0.1201, "step": 4710 }, { "epoch": 0.24737945492662475, "grad_norm": 1.6260344982147217, "learning_rate": 4.381682389937107e-05, "loss": 0.1332, "step": 4720 }, { "epoch": 0.24790356394129978, "grad_norm": 2.897883653640747, "learning_rate": 4.3803721174004196e-05, "loss": 0.1343, "step": 4730 }, { "epoch": 0.24842767295597484, "grad_norm": 2.3788511753082275, "learning_rate": 4.379061844863732e-05, "loss": 0.1348, "step": 4740 }, { "epoch": 0.2489517819706499, "grad_norm": 1.6649222373962402, "learning_rate": 4.377751572327044e-05, "loss": 0.1316, "step": 4750 }, { "epoch": 0.24947589098532494, "grad_norm": 0.7832605242729187, "learning_rate": 4.3764412997903566e-05, "loss": 0.1674, "step": 4760 }, { "epoch": 0.25, "grad_norm": 1.33087158203125, "learning_rate": 4.375131027253669e-05, "loss": 0.1434, "step": 4770 }, { "epoch": 0.25052410901467503, "grad_norm": 2.168484687805176, "learning_rate": 4.373820754716981e-05, "loss": 0.0985, "step": 4780 }, { "epoch": 0.2510482180293501, "grad_norm": 1.5246626138687134, "learning_rate": 4.3725104821802936e-05, "loss": 0.1304, "step": 4790 }, { "epoch": 0.25157232704402516, "grad_norm": 1.0635180473327637, "learning_rate": 4.371200209643606e-05, "loss": 0.1401, "step": 4800 }, { "epoch": 0.2520964360587002, "grad_norm": 3.1194510459899902, "learning_rate": 4.369889937106918e-05, "loss": 0.1053, "step": 4810 }, { "epoch": 0.2526205450733753, "grad_norm": 1.4997973442077637, "learning_rate": 4.3685796645702306e-05, "loss": 0.1227, "step": 4820 }, { "epoch": 0.2531446540880503, "grad_norm": 1.7188799381256104, "learning_rate": 4.3672693920335436e-05, "loss": 0.1178, "step": 4830 }, { "epoch": 0.25366876310272535, "grad_norm": 1.7366551160812378, "learning_rate": 4.365959119496856e-05, "loss": 0.1291, "step": 4840 }, { "epoch": 0.25419287211740044, "grad_norm": 1.3258930444717407, "learning_rate": 4.364648846960168e-05, "loss": 0.1202, "step": 4850 }, { "epoch": 0.25471698113207547, "grad_norm": 2.1996500492095947, "learning_rate": 4.36333857442348e-05, "loss": 0.1506, "step": 4860 }, { "epoch": 0.2552410901467505, "grad_norm": 1.105446219444275, "learning_rate": 4.362028301886792e-05, "loss": 0.147, "step": 4870 }, { "epoch": 0.2557651991614256, "grad_norm": 1.894164800643921, "learning_rate": 4.3607180293501046e-05, "loss": 0.126, "step": 4880 }, { "epoch": 0.2562893081761006, "grad_norm": 1.4393502473831177, "learning_rate": 4.359407756813417e-05, "loss": 0.1275, "step": 4890 }, { "epoch": 0.25681341719077566, "grad_norm": 3.0993599891662598, "learning_rate": 4.35809748427673e-05, "loss": 0.1365, "step": 4900 }, { "epoch": 0.25733752620545075, "grad_norm": 1.72710382938385, "learning_rate": 4.3567872117400423e-05, "loss": 0.1429, "step": 4910 }, { "epoch": 0.2578616352201258, "grad_norm": 1.2111247777938843, "learning_rate": 4.355476939203355e-05, "loss": 0.137, "step": 4920 }, { "epoch": 0.2583857442348008, "grad_norm": 2.8836333751678467, "learning_rate": 4.354166666666667e-05, "loss": 0.1212, "step": 4930 }, { "epoch": 0.2589098532494759, "grad_norm": 1.2956396341323853, "learning_rate": 4.3528563941299794e-05, "loss": 0.1161, "step": 4940 }, { "epoch": 0.25943396226415094, "grad_norm": 1.8439433574676514, "learning_rate": 4.351546121593292e-05, "loss": 0.1302, "step": 4950 }, { "epoch": 0.259958071278826, "grad_norm": 1.5384888648986816, "learning_rate": 4.350235849056604e-05, "loss": 0.1121, "step": 4960 }, { "epoch": 0.26048218029350106, "grad_norm": 3.545708179473877, "learning_rate": 4.3489255765199164e-05, "loss": 0.1298, "step": 4970 }, { "epoch": 0.2610062893081761, "grad_norm": 1.2400144338607788, "learning_rate": 4.347615303983229e-05, "loss": 0.144, "step": 4980 }, { "epoch": 0.26153039832285113, "grad_norm": 1.563647747039795, "learning_rate": 4.346305031446541e-05, "loss": 0.1564, "step": 4990 }, { "epoch": 0.2620545073375262, "grad_norm": 1.5921865701675415, "learning_rate": 4.3449947589098534e-05, "loss": 0.1594, "step": 5000 }, { "epoch": 0.2620545073375262, "eval_loss": 0.30120959877967834, "eval_runtime": 267.2437, "eval_samples_per_second": 7.45, "eval_steps_per_second": 1.242, "step": 5000 }, { "epoch": 0.26257861635220126, "grad_norm": 0.8612467646598816, "learning_rate": 4.343684486373166e-05, "loss": 0.114, "step": 5010 }, { "epoch": 0.2631027253668763, "grad_norm": 2.6393580436706543, "learning_rate": 4.342374213836478e-05, "loss": 0.1402, "step": 5020 }, { "epoch": 0.2636268343815514, "grad_norm": 1.4097959995269775, "learning_rate": 4.3410639412997904e-05, "loss": 0.1563, "step": 5030 }, { "epoch": 0.2641509433962264, "grad_norm": 1.2873144149780273, "learning_rate": 4.339753668763103e-05, "loss": 0.1428, "step": 5040 }, { "epoch": 0.26467505241090145, "grad_norm": 4.058903217315674, "learning_rate": 4.338443396226415e-05, "loss": 0.1603, "step": 5050 }, { "epoch": 0.26519916142557654, "grad_norm": 2.0691044330596924, "learning_rate": 4.337133123689728e-05, "loss": 0.1526, "step": 5060 }, { "epoch": 0.26572327044025157, "grad_norm": 1.3779879808425903, "learning_rate": 4.3358228511530404e-05, "loss": 0.1273, "step": 5070 }, { "epoch": 0.2662473794549266, "grad_norm": 1.724474549293518, "learning_rate": 4.334512578616353e-05, "loss": 0.1202, "step": 5080 }, { "epoch": 0.2667714884696017, "grad_norm": 0.8896052837371826, "learning_rate": 4.333202306079665e-05, "loss": 0.1302, "step": 5090 }, { "epoch": 0.2672955974842767, "grad_norm": 1.071658968925476, "learning_rate": 4.331892033542977e-05, "loss": 0.1432, "step": 5100 }, { "epoch": 0.26781970649895176, "grad_norm": 2.4866292476654053, "learning_rate": 4.330581761006289e-05, "loss": 0.1108, "step": 5110 }, { "epoch": 0.26834381551362685, "grad_norm": 2.052104949951172, "learning_rate": 4.3292714884696015e-05, "loss": 0.1243, "step": 5120 }, { "epoch": 0.2688679245283019, "grad_norm": 1.5990567207336426, "learning_rate": 4.3279612159329145e-05, "loss": 0.1131, "step": 5130 }, { "epoch": 0.2693920335429769, "grad_norm": 2.5835673809051514, "learning_rate": 4.326650943396227e-05, "loss": 0.1157, "step": 5140 }, { "epoch": 0.269916142557652, "grad_norm": 3.6429848670959473, "learning_rate": 4.325340670859539e-05, "loss": 0.1523, "step": 5150 }, { "epoch": 0.27044025157232704, "grad_norm": 2.0268514156341553, "learning_rate": 4.3240303983228515e-05, "loss": 0.1389, "step": 5160 }, { "epoch": 0.2709643605870021, "grad_norm": 0.8983651399612427, "learning_rate": 4.322720125786164e-05, "loss": 0.1175, "step": 5170 }, { "epoch": 0.27148846960167716, "grad_norm": 2.746926784515381, "learning_rate": 4.321409853249476e-05, "loss": 0.1333, "step": 5180 }, { "epoch": 0.2720125786163522, "grad_norm": 1.4841305017471313, "learning_rate": 4.3200995807127885e-05, "loss": 0.1648, "step": 5190 }, { "epoch": 0.27253668763102723, "grad_norm": 1.7416741847991943, "learning_rate": 4.318789308176101e-05, "loss": 0.1382, "step": 5200 }, { "epoch": 0.2730607966457023, "grad_norm": 1.7636288404464722, "learning_rate": 4.317479035639413e-05, "loss": 0.1399, "step": 5210 }, { "epoch": 0.27358490566037735, "grad_norm": 2.511547803878784, "learning_rate": 4.3161687631027255e-05, "loss": 0.1647, "step": 5220 }, { "epoch": 0.2741090146750524, "grad_norm": 3.5610642433166504, "learning_rate": 4.314858490566038e-05, "loss": 0.1673, "step": 5230 }, { "epoch": 0.2746331236897275, "grad_norm": 1.4131566286087036, "learning_rate": 4.31354821802935e-05, "loss": 0.1264, "step": 5240 }, { "epoch": 0.2751572327044025, "grad_norm": 1.3859107494354248, "learning_rate": 4.3122379454926625e-05, "loss": 0.1355, "step": 5250 }, { "epoch": 0.27568134171907754, "grad_norm": 2.9283275604248047, "learning_rate": 4.310927672955975e-05, "loss": 0.152, "step": 5260 }, { "epoch": 0.27620545073375263, "grad_norm": 5.7555437088012695, "learning_rate": 4.309617400419287e-05, "loss": 0.1556, "step": 5270 }, { "epoch": 0.27672955974842767, "grad_norm": 1.091763973236084, "learning_rate": 4.3083071278825995e-05, "loss": 0.1106, "step": 5280 }, { "epoch": 0.2772536687631027, "grad_norm": 2.568847179412842, "learning_rate": 4.3069968553459126e-05, "loss": 0.1428, "step": 5290 }, { "epoch": 0.2777777777777778, "grad_norm": 3.7078545093536377, "learning_rate": 4.305686582809225e-05, "loss": 0.1443, "step": 5300 }, { "epoch": 0.2783018867924528, "grad_norm": 5.091422080993652, "learning_rate": 4.304376310272537e-05, "loss": 0.1279, "step": 5310 }, { "epoch": 0.27882599580712786, "grad_norm": 6.574954986572266, "learning_rate": 4.3030660377358496e-05, "loss": 0.1408, "step": 5320 }, { "epoch": 0.27935010482180295, "grad_norm": 3.3132095336914062, "learning_rate": 4.301755765199162e-05, "loss": 0.1368, "step": 5330 }, { "epoch": 0.279874213836478, "grad_norm": 1.6334365606307983, "learning_rate": 4.3004454926624736e-05, "loss": 0.1311, "step": 5340 }, { "epoch": 0.280398322851153, "grad_norm": 1.6367018222808838, "learning_rate": 4.299135220125786e-05, "loss": 0.1481, "step": 5350 }, { "epoch": 0.2809224318658281, "grad_norm": 1.8219107389450073, "learning_rate": 4.297824947589099e-05, "loss": 0.1447, "step": 5360 }, { "epoch": 0.28144654088050314, "grad_norm": 2.499232292175293, "learning_rate": 4.296514675052411e-05, "loss": 0.151, "step": 5370 }, { "epoch": 0.28197064989517817, "grad_norm": 2.656902313232422, "learning_rate": 4.2952044025157236e-05, "loss": 0.1118, "step": 5380 }, { "epoch": 0.28249475890985326, "grad_norm": 4.296574592590332, "learning_rate": 4.293894129979036e-05, "loss": 0.1419, "step": 5390 }, { "epoch": 0.2830188679245283, "grad_norm": 2.389521360397339, "learning_rate": 4.292583857442348e-05, "loss": 0.1075, "step": 5400 }, { "epoch": 0.28354297693920333, "grad_norm": 2.1653847694396973, "learning_rate": 4.2912735849056606e-05, "loss": 0.1642, "step": 5410 }, { "epoch": 0.2840670859538784, "grad_norm": 1.6562432050704956, "learning_rate": 4.289963312368973e-05, "loss": 0.1238, "step": 5420 }, { "epoch": 0.28459119496855345, "grad_norm": 1.8484896421432495, "learning_rate": 4.288653039832285e-05, "loss": 0.1343, "step": 5430 }, { "epoch": 0.2851153039832285, "grad_norm": 6.8420729637146, "learning_rate": 4.2873427672955976e-05, "loss": 0.1346, "step": 5440 }, { "epoch": 0.2856394129979036, "grad_norm": 1.6548465490341187, "learning_rate": 4.2860324947589107e-05, "loss": 0.1245, "step": 5450 }, { "epoch": 0.2861635220125786, "grad_norm": 1.1709433794021606, "learning_rate": 4.284722222222222e-05, "loss": 0.1375, "step": 5460 }, { "epoch": 0.28668763102725364, "grad_norm": 1.1040029525756836, "learning_rate": 4.2834119496855347e-05, "loss": 0.1303, "step": 5470 }, { "epoch": 0.28721174004192873, "grad_norm": 3.498873472213745, "learning_rate": 4.282101677148847e-05, "loss": 0.119, "step": 5480 }, { "epoch": 0.28773584905660377, "grad_norm": 1.783225417137146, "learning_rate": 4.280791404612159e-05, "loss": 0.1121, "step": 5490 }, { "epoch": 0.2882599580712788, "grad_norm": 1.912412166595459, "learning_rate": 4.279481132075472e-05, "loss": 0.1234, "step": 5500 }, { "epoch": 0.2887840670859539, "grad_norm": 1.2715930938720703, "learning_rate": 4.278170859538784e-05, "loss": 0.1299, "step": 5510 }, { "epoch": 0.2893081761006289, "grad_norm": 3.9322400093078613, "learning_rate": 4.2768605870020963e-05, "loss": 0.1656, "step": 5520 }, { "epoch": 0.28983228511530396, "grad_norm": 2.730623245239258, "learning_rate": 4.2755503144654094e-05, "loss": 0.1322, "step": 5530 }, { "epoch": 0.29035639412997905, "grad_norm": 2.107334852218628, "learning_rate": 4.274240041928722e-05, "loss": 0.1209, "step": 5540 }, { "epoch": 0.2908805031446541, "grad_norm": 3.0399837493896484, "learning_rate": 4.272929769392034e-05, "loss": 0.1487, "step": 5550 }, { "epoch": 0.2914046121593291, "grad_norm": 2.08225679397583, "learning_rate": 4.2716194968553464e-05, "loss": 0.1239, "step": 5560 }, { "epoch": 0.2919287211740042, "grad_norm": 2.5326271057128906, "learning_rate": 4.270309224318658e-05, "loss": 0.1231, "step": 5570 }, { "epoch": 0.29245283018867924, "grad_norm": 2.3042261600494385, "learning_rate": 4.2689989517819704e-05, "loss": 0.098, "step": 5580 }, { "epoch": 0.2929769392033543, "grad_norm": 1.0774246454238892, "learning_rate": 4.267688679245283e-05, "loss": 0.1391, "step": 5590 }, { "epoch": 0.29350104821802936, "grad_norm": 2.216782569885254, "learning_rate": 4.266378406708596e-05, "loss": 0.1151, "step": 5600 }, { "epoch": 0.2940251572327044, "grad_norm": 3.898007869720459, "learning_rate": 4.265068134171908e-05, "loss": 0.1371, "step": 5610 }, { "epoch": 0.2945492662473795, "grad_norm": 1.5311529636383057, "learning_rate": 4.2637578616352204e-05, "loss": 0.1396, "step": 5620 }, { "epoch": 0.2950733752620545, "grad_norm": 2.2003555297851562, "learning_rate": 4.262447589098533e-05, "loss": 0.1196, "step": 5630 }, { "epoch": 0.29559748427672955, "grad_norm": 2.0077884197235107, "learning_rate": 4.261137316561845e-05, "loss": 0.1381, "step": 5640 }, { "epoch": 0.29612159329140464, "grad_norm": 2.7949230670928955, "learning_rate": 4.2598270440251574e-05, "loss": 0.1265, "step": 5650 }, { "epoch": 0.2966457023060797, "grad_norm": 2.4836373329162598, "learning_rate": 4.25851677148847e-05, "loss": 0.1308, "step": 5660 }, { "epoch": 0.2971698113207547, "grad_norm": 0.6496070027351379, "learning_rate": 4.257206498951782e-05, "loss": 0.1099, "step": 5670 }, { "epoch": 0.2976939203354298, "grad_norm": 1.4952316284179688, "learning_rate": 4.2558962264150944e-05, "loss": 0.1284, "step": 5680 }, { "epoch": 0.29821802935010483, "grad_norm": 2.3443973064422607, "learning_rate": 4.254585953878407e-05, "loss": 0.1258, "step": 5690 }, { "epoch": 0.29874213836477986, "grad_norm": 1.2094248533248901, "learning_rate": 4.253275681341719e-05, "loss": 0.17, "step": 5700 }, { "epoch": 0.29926624737945495, "grad_norm": 1.8085112571716309, "learning_rate": 4.2519654088050315e-05, "loss": 0.1549, "step": 5710 }, { "epoch": 0.29979035639413, "grad_norm": 1.5376954078674316, "learning_rate": 4.250655136268344e-05, "loss": 0.1595, "step": 5720 }, { "epoch": 0.300314465408805, "grad_norm": 1.7698854207992554, "learning_rate": 4.249344863731656e-05, "loss": 0.1736, "step": 5730 }, { "epoch": 0.3008385744234801, "grad_norm": 1.0237462520599365, "learning_rate": 4.2480345911949685e-05, "loss": 0.1194, "step": 5740 }, { "epoch": 0.30136268343815514, "grad_norm": 5.32814359664917, "learning_rate": 4.246724318658281e-05, "loss": 0.1446, "step": 5750 }, { "epoch": 0.3018867924528302, "grad_norm": 1.3462707996368408, "learning_rate": 4.245414046121594e-05, "loss": 0.1287, "step": 5760 }, { "epoch": 0.30241090146750527, "grad_norm": 2.194868326187134, "learning_rate": 4.244103773584906e-05, "loss": 0.1232, "step": 5770 }, { "epoch": 0.3029350104821803, "grad_norm": 1.084516167640686, "learning_rate": 4.2427935010482185e-05, "loss": 0.1053, "step": 5780 }, { "epoch": 0.30345911949685533, "grad_norm": 4.030144691467285, "learning_rate": 4.241483228511531e-05, "loss": 0.145, "step": 5790 }, { "epoch": 0.3039832285115304, "grad_norm": 5.405220985412598, "learning_rate": 4.240172955974843e-05, "loss": 0.1405, "step": 5800 }, { "epoch": 0.30450733752620546, "grad_norm": 5.14114236831665, "learning_rate": 4.238862683438155e-05, "loss": 0.1424, "step": 5810 }, { "epoch": 0.3050314465408805, "grad_norm": 1.4198951721191406, "learning_rate": 4.237552410901467e-05, "loss": 0.1208, "step": 5820 }, { "epoch": 0.3055555555555556, "grad_norm": 1.4760233163833618, "learning_rate": 4.23624213836478e-05, "loss": 0.1174, "step": 5830 }, { "epoch": 0.3060796645702306, "grad_norm": 1.518790602684021, "learning_rate": 4.2349318658280925e-05, "loss": 0.1227, "step": 5840 }, { "epoch": 0.30660377358490565, "grad_norm": 2.099006175994873, "learning_rate": 4.233621593291405e-05, "loss": 0.1212, "step": 5850 }, { "epoch": 0.30712788259958074, "grad_norm": 3.2384157180786133, "learning_rate": 4.232311320754717e-05, "loss": 0.1491, "step": 5860 }, { "epoch": 0.30765199161425577, "grad_norm": 1.7764296531677246, "learning_rate": 4.2310010482180296e-05, "loss": 0.14, "step": 5870 }, { "epoch": 0.3081761006289308, "grad_norm": 0.9582472443580627, "learning_rate": 4.229690775681342e-05, "loss": 0.1425, "step": 5880 }, { "epoch": 0.3087002096436059, "grad_norm": 1.730102300643921, "learning_rate": 4.228380503144654e-05, "loss": 0.1646, "step": 5890 }, { "epoch": 0.30922431865828093, "grad_norm": 1.567408561706543, "learning_rate": 4.2270702306079666e-05, "loss": 0.1575, "step": 5900 }, { "epoch": 0.30974842767295596, "grad_norm": 1.3192992210388184, "learning_rate": 4.225759958071279e-05, "loss": 0.1209, "step": 5910 }, { "epoch": 0.31027253668763105, "grad_norm": 2.4381003379821777, "learning_rate": 4.224449685534592e-05, "loss": 0.1604, "step": 5920 }, { "epoch": 0.3107966457023061, "grad_norm": 5.242435455322266, "learning_rate": 4.2231394129979036e-05, "loss": 0.1444, "step": 5930 }, { "epoch": 0.3113207547169811, "grad_norm": 3.918074369430542, "learning_rate": 4.221829140461216e-05, "loss": 0.114, "step": 5940 }, { "epoch": 0.3118448637316562, "grad_norm": 1.4617007970809937, "learning_rate": 4.220518867924528e-05, "loss": 0.1442, "step": 5950 }, { "epoch": 0.31236897274633124, "grad_norm": 2.0521883964538574, "learning_rate": 4.2192085953878406e-05, "loss": 0.1241, "step": 5960 }, { "epoch": 0.3128930817610063, "grad_norm": 1.4939581155776978, "learning_rate": 4.217898322851153e-05, "loss": 0.1282, "step": 5970 }, { "epoch": 0.31341719077568136, "grad_norm": 3.2475051879882812, "learning_rate": 4.216588050314465e-05, "loss": 0.15, "step": 5980 }, { "epoch": 0.3139412997903564, "grad_norm": 2.8565642833709717, "learning_rate": 4.215277777777778e-05, "loss": 0.1143, "step": 5990 }, { "epoch": 0.31446540880503143, "grad_norm": 1.7236065864562988, "learning_rate": 4.2139675052410906e-05, "loss": 0.1419, "step": 6000 }, { "epoch": 0.31446540880503143, "eval_loss": 0.30979835987091064, "eval_runtime": 267.3439, "eval_samples_per_second": 7.447, "eval_steps_per_second": 1.242, "step": 6000 }, { "epoch": 0.3149895178197065, "grad_norm": 1.8495186567306519, "learning_rate": 4.212657232704403e-05, "loss": 0.1558, "step": 6010 }, { "epoch": 0.31551362683438156, "grad_norm": 0.4863542318344116, "learning_rate": 4.211346960167715e-05, "loss": 0.108, "step": 6020 }, { "epoch": 0.3160377358490566, "grad_norm": 2.037104368209839, "learning_rate": 4.2100366876310276e-05, "loss": 0.1544, "step": 6030 }, { "epoch": 0.3165618448637317, "grad_norm": 1.5306822061538696, "learning_rate": 4.20872641509434e-05, "loss": 0.109, "step": 6040 }, { "epoch": 0.3170859538784067, "grad_norm": 0.885473906993866, "learning_rate": 4.2074161425576516e-05, "loss": 0.1273, "step": 6050 }, { "epoch": 0.31761006289308175, "grad_norm": 1.3081514835357666, "learning_rate": 4.206105870020965e-05, "loss": 0.1408, "step": 6060 }, { "epoch": 0.31813417190775684, "grad_norm": 2.302015542984009, "learning_rate": 4.204795597484277e-05, "loss": 0.1196, "step": 6070 }, { "epoch": 0.31865828092243187, "grad_norm": 1.8867014646530151, "learning_rate": 4.2034853249475893e-05, "loss": 0.1052, "step": 6080 }, { "epoch": 0.3191823899371069, "grad_norm": 1.1797665357589722, "learning_rate": 4.202175052410902e-05, "loss": 0.1478, "step": 6090 }, { "epoch": 0.319706498951782, "grad_norm": 1.2745922803878784, "learning_rate": 4.200864779874214e-05, "loss": 0.1041, "step": 6100 }, { "epoch": 0.320230607966457, "grad_norm": 2.031139850616455, "learning_rate": 4.1995545073375264e-05, "loss": 0.1437, "step": 6110 }, { "epoch": 0.32075471698113206, "grad_norm": 1.2220163345336914, "learning_rate": 4.198244234800839e-05, "loss": 0.1411, "step": 6120 }, { "epoch": 0.32127882599580715, "grad_norm": 2.0426888465881348, "learning_rate": 4.196933962264151e-05, "loss": 0.1147, "step": 6130 }, { "epoch": 0.3218029350104822, "grad_norm": 3.2713701725006104, "learning_rate": 4.1956236897274634e-05, "loss": 0.1457, "step": 6140 }, { "epoch": 0.3223270440251572, "grad_norm": 1.5383789539337158, "learning_rate": 4.1943134171907764e-05, "loss": 0.1035, "step": 6150 }, { "epoch": 0.3228511530398323, "grad_norm": 2.0623626708984375, "learning_rate": 4.193003144654089e-05, "loss": 0.1259, "step": 6160 }, { "epoch": 0.32337526205450734, "grad_norm": 2.215529441833496, "learning_rate": 4.1916928721174004e-05, "loss": 0.0988, "step": 6170 }, { "epoch": 0.3238993710691824, "grad_norm": 1.0296365022659302, "learning_rate": 4.190382599580713e-05, "loss": 0.159, "step": 6180 }, { "epoch": 0.32442348008385746, "grad_norm": 1.9438577890396118, "learning_rate": 4.189072327044025e-05, "loss": 0.1413, "step": 6190 }, { "epoch": 0.3249475890985325, "grad_norm": 0.9789099097251892, "learning_rate": 4.1877620545073374e-05, "loss": 0.129, "step": 6200 }, { "epoch": 0.32547169811320753, "grad_norm": 0.9686060547828674, "learning_rate": 4.18645178197065e-05, "loss": 0.1672, "step": 6210 }, { "epoch": 0.3259958071278826, "grad_norm": 1.2769176959991455, "learning_rate": 4.185141509433963e-05, "loss": 0.1302, "step": 6220 }, { "epoch": 0.32651991614255765, "grad_norm": 1.8056713342666626, "learning_rate": 4.183831236897275e-05, "loss": 0.1302, "step": 6230 }, { "epoch": 0.3270440251572327, "grad_norm": 1.526102066040039, "learning_rate": 4.1825209643605874e-05, "loss": 0.1469, "step": 6240 }, { "epoch": 0.3275681341719078, "grad_norm": 0.9519234895706177, "learning_rate": 4.1812106918239e-05, "loss": 0.1064, "step": 6250 }, { "epoch": 0.3280922431865828, "grad_norm": 3.5756402015686035, "learning_rate": 4.179900419287212e-05, "loss": 0.1175, "step": 6260 }, { "epoch": 0.32861635220125784, "grad_norm": 2.9796833992004395, "learning_rate": 4.1785901467505245e-05, "loss": 0.1332, "step": 6270 }, { "epoch": 0.32914046121593293, "grad_norm": 2.4725258350372314, "learning_rate": 4.177279874213837e-05, "loss": 0.1116, "step": 6280 }, { "epoch": 0.32966457023060797, "grad_norm": 1.5295337438583374, "learning_rate": 4.175969601677149e-05, "loss": 0.0934, "step": 6290 }, { "epoch": 0.330188679245283, "grad_norm": 2.7749555110931396, "learning_rate": 4.1746593291404615e-05, "loss": 0.1517, "step": 6300 }, { "epoch": 0.3307127882599581, "grad_norm": 1.2054258584976196, "learning_rate": 4.173349056603774e-05, "loss": 0.1343, "step": 6310 }, { "epoch": 0.3312368972746331, "grad_norm": 2.1348698139190674, "learning_rate": 4.172038784067086e-05, "loss": 0.1256, "step": 6320 }, { "epoch": 0.33176100628930816, "grad_norm": 3.1444623470306396, "learning_rate": 4.1707285115303985e-05, "loss": 0.1095, "step": 6330 }, { "epoch": 0.33228511530398325, "grad_norm": 1.7551017999649048, "learning_rate": 4.169418238993711e-05, "loss": 0.1224, "step": 6340 }, { "epoch": 0.3328092243186583, "grad_norm": 6.100649833679199, "learning_rate": 4.168107966457023e-05, "loss": 0.1533, "step": 6350 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8948473930358887, "learning_rate": 4.1667976939203355e-05, "loss": 0.1126, "step": 6360 }, { "epoch": 0.3338574423480084, "grad_norm": 1.9172160625457764, "learning_rate": 4.165487421383648e-05, "loss": 0.1407, "step": 6370 }, { "epoch": 0.33438155136268344, "grad_norm": 1.2122879028320312, "learning_rate": 4.164177148846961e-05, "loss": 0.131, "step": 6380 }, { "epoch": 0.33490566037735847, "grad_norm": 0.9031748175621033, "learning_rate": 4.162866876310273e-05, "loss": 0.1635, "step": 6390 }, { "epoch": 0.33542976939203356, "grad_norm": 1.341102123260498, "learning_rate": 4.1615566037735855e-05, "loss": 0.1267, "step": 6400 }, { "epoch": 0.3359538784067086, "grad_norm": 2.564326286315918, "learning_rate": 4.160246331236897e-05, "loss": 0.1287, "step": 6410 }, { "epoch": 0.33647798742138363, "grad_norm": 2.386312246322632, "learning_rate": 4.1589360587002095e-05, "loss": 0.1269, "step": 6420 }, { "epoch": 0.3370020964360587, "grad_norm": 1.0762248039245605, "learning_rate": 4.157625786163522e-05, "loss": 0.1315, "step": 6430 }, { "epoch": 0.33752620545073375, "grad_norm": 0.7837504148483276, "learning_rate": 4.156315513626834e-05, "loss": 0.1281, "step": 6440 }, { "epoch": 0.3380503144654088, "grad_norm": 0.790789783000946, "learning_rate": 4.155005241090147e-05, "loss": 0.1336, "step": 6450 }, { "epoch": 0.3385744234800839, "grad_norm": 1.4993865489959717, "learning_rate": 4.1536949685534596e-05, "loss": 0.1262, "step": 6460 }, { "epoch": 0.3390985324947589, "grad_norm": 1.7968735694885254, "learning_rate": 4.152384696016772e-05, "loss": 0.1357, "step": 6470 }, { "epoch": 0.33962264150943394, "grad_norm": 1.5854703187942505, "learning_rate": 4.151074423480084e-05, "loss": 0.123, "step": 6480 }, { "epoch": 0.34014675052410903, "grad_norm": 1.7488397359848022, "learning_rate": 4.1497641509433966e-05, "loss": 0.126, "step": 6490 }, { "epoch": 0.34067085953878407, "grad_norm": 1.2105824947357178, "learning_rate": 4.148453878406709e-05, "loss": 0.1363, "step": 6500 }, { "epoch": 0.3411949685534591, "grad_norm": 1.638002634048462, "learning_rate": 4.147143605870021e-05, "loss": 0.1188, "step": 6510 }, { "epoch": 0.3417190775681342, "grad_norm": 1.4952040910720825, "learning_rate": 4.1458333333333336e-05, "loss": 0.1114, "step": 6520 }, { "epoch": 0.3422431865828092, "grad_norm": 1.3935679197311401, "learning_rate": 4.144523060796646e-05, "loss": 0.1283, "step": 6530 }, { "epoch": 0.34276729559748426, "grad_norm": 1.1780931949615479, "learning_rate": 4.143212788259958e-05, "loss": 0.1093, "step": 6540 }, { "epoch": 0.34329140461215935, "grad_norm": 1.7428357601165771, "learning_rate": 4.1419025157232706e-05, "loss": 0.1586, "step": 6550 }, { "epoch": 0.3438155136268344, "grad_norm": 1.624427080154419, "learning_rate": 4.140592243186583e-05, "loss": 0.114, "step": 6560 }, { "epoch": 0.3443396226415094, "grad_norm": 2.378514051437378, "learning_rate": 4.139281970649895e-05, "loss": 0.1634, "step": 6570 }, { "epoch": 0.3448637316561845, "grad_norm": 0.7246071696281433, "learning_rate": 4.1379716981132076e-05, "loss": 0.1095, "step": 6580 }, { "epoch": 0.34538784067085954, "grad_norm": 1.7475792169570923, "learning_rate": 4.13666142557652e-05, "loss": 0.1379, "step": 6590 }, { "epoch": 0.34591194968553457, "grad_norm": 1.2346446514129639, "learning_rate": 4.135351153039832e-05, "loss": 0.1196, "step": 6600 }, { "epoch": 0.34643605870020966, "grad_norm": 1.024915099143982, "learning_rate": 4.1340408805031446e-05, "loss": 0.1486, "step": 6610 }, { "epoch": 0.3469601677148847, "grad_norm": 11.363612174987793, "learning_rate": 4.1327306079664577e-05, "loss": 0.1026, "step": 6620 }, { "epoch": 0.3474842767295597, "grad_norm": 1.540850281715393, "learning_rate": 4.13142033542977e-05, "loss": 0.1658, "step": 6630 }, { "epoch": 0.3480083857442348, "grad_norm": 1.565014123916626, "learning_rate": 4.130110062893082e-05, "loss": 0.1321, "step": 6640 }, { "epoch": 0.34853249475890985, "grad_norm": 2.709578275680542, "learning_rate": 4.128799790356394e-05, "loss": 0.1416, "step": 6650 }, { "epoch": 0.3490566037735849, "grad_norm": 4.274057388305664, "learning_rate": 4.127489517819706e-05, "loss": 0.1303, "step": 6660 }, { "epoch": 0.34958071278826, "grad_norm": 1.6477665901184082, "learning_rate": 4.126179245283019e-05, "loss": 0.1158, "step": 6670 }, { "epoch": 0.350104821802935, "grad_norm": 2.525834798812866, "learning_rate": 4.124868972746331e-05, "loss": 0.1274, "step": 6680 }, { "epoch": 0.35062893081761004, "grad_norm": 1.99843430519104, "learning_rate": 4.123558700209644e-05, "loss": 0.1149, "step": 6690 }, { "epoch": 0.35115303983228513, "grad_norm": 1.540343165397644, "learning_rate": 4.1222484276729564e-05, "loss": 0.1383, "step": 6700 }, { "epoch": 0.35167714884696016, "grad_norm": 1.622219443321228, "learning_rate": 4.120938155136269e-05, "loss": 0.1072, "step": 6710 }, { "epoch": 0.3522012578616352, "grad_norm": 1.3897455930709839, "learning_rate": 4.119627882599581e-05, "loss": 0.1388, "step": 6720 }, { "epoch": 0.3527253668763103, "grad_norm": 2.0670998096466064, "learning_rate": 4.1183176100628934e-05, "loss": 0.1493, "step": 6730 }, { "epoch": 0.3532494758909853, "grad_norm": 1.3831391334533691, "learning_rate": 4.117007337526206e-05, "loss": 0.1381, "step": 6740 }, { "epoch": 0.35377358490566035, "grad_norm": 2.390122413635254, "learning_rate": 4.115697064989518e-05, "loss": 0.1364, "step": 6750 }, { "epoch": 0.35429769392033544, "grad_norm": 1.3669023513793945, "learning_rate": 4.1143867924528304e-05, "loss": 0.1264, "step": 6760 }, { "epoch": 0.3548218029350105, "grad_norm": 1.4862803220748901, "learning_rate": 4.113076519916143e-05, "loss": 0.1375, "step": 6770 }, { "epoch": 0.3553459119496855, "grad_norm": 2.1498663425445557, "learning_rate": 4.111766247379455e-05, "loss": 0.1314, "step": 6780 }, { "epoch": 0.3558700209643606, "grad_norm": 0.7945308089256287, "learning_rate": 4.1104559748427674e-05, "loss": 0.1302, "step": 6790 }, { "epoch": 0.35639412997903563, "grad_norm": 1.1641976833343506, "learning_rate": 4.10914570230608e-05, "loss": 0.1176, "step": 6800 }, { "epoch": 0.35691823899371067, "grad_norm": 1.3097875118255615, "learning_rate": 4.107835429769392e-05, "loss": 0.1252, "step": 6810 }, { "epoch": 0.35744234800838576, "grad_norm": 1.769045114517212, "learning_rate": 4.1065251572327044e-05, "loss": 0.15, "step": 6820 }, { "epoch": 0.3579664570230608, "grad_norm": 1.7525954246520996, "learning_rate": 4.105214884696017e-05, "loss": 0.1611, "step": 6830 }, { "epoch": 0.3584905660377358, "grad_norm": 1.655821681022644, "learning_rate": 4.103904612159329e-05, "loss": 0.1194, "step": 6840 }, { "epoch": 0.3590146750524109, "grad_norm": 1.8870012760162354, "learning_rate": 4.102594339622642e-05, "loss": 0.1501, "step": 6850 }, { "epoch": 0.35953878406708595, "grad_norm": 1.5264195203781128, "learning_rate": 4.1012840670859545e-05, "loss": 0.1432, "step": 6860 }, { "epoch": 0.360062893081761, "grad_norm": 1.597440242767334, "learning_rate": 4.099973794549267e-05, "loss": 0.1274, "step": 6870 }, { "epoch": 0.36058700209643607, "grad_norm": 1.3905103206634521, "learning_rate": 4.098663522012579e-05, "loss": 0.1331, "step": 6880 }, { "epoch": 0.3611111111111111, "grad_norm": 1.5292747020721436, "learning_rate": 4.097353249475891e-05, "loss": 0.1253, "step": 6890 }, { "epoch": 0.36163522012578614, "grad_norm": 1.0919030904769897, "learning_rate": 4.096042976939203e-05, "loss": 0.1233, "step": 6900 }, { "epoch": 0.36215932914046123, "grad_norm": 1.8438016176223755, "learning_rate": 4.0947327044025155e-05, "loss": 0.1309, "step": 6910 }, { "epoch": 0.36268343815513626, "grad_norm": 2.1451685428619385, "learning_rate": 4.0934224318658285e-05, "loss": 0.1617, "step": 6920 }, { "epoch": 0.3632075471698113, "grad_norm": 4.021899700164795, "learning_rate": 4.092112159329141e-05, "loss": 0.1233, "step": 6930 }, { "epoch": 0.3637316561844864, "grad_norm": 1.0114710330963135, "learning_rate": 4.090801886792453e-05, "loss": 0.1142, "step": 6940 }, { "epoch": 0.3642557651991614, "grad_norm": 1.8664952516555786, "learning_rate": 4.0894916142557655e-05, "loss": 0.1364, "step": 6950 }, { "epoch": 0.36477987421383645, "grad_norm": 0.8328051567077637, "learning_rate": 4.088181341719078e-05, "loss": 0.125, "step": 6960 }, { "epoch": 0.36530398322851154, "grad_norm": 1.7279052734375, "learning_rate": 4.08687106918239e-05, "loss": 0.1379, "step": 6970 }, { "epoch": 0.3658280922431866, "grad_norm": 2.0779712200164795, "learning_rate": 4.0855607966457025e-05, "loss": 0.1433, "step": 6980 }, { "epoch": 0.3663522012578616, "grad_norm": 3.7102408409118652, "learning_rate": 4.084250524109015e-05, "loss": 0.1306, "step": 6990 }, { "epoch": 0.3668763102725367, "grad_norm": 2.5546956062316895, "learning_rate": 4.082940251572327e-05, "loss": 0.1186, "step": 7000 }, { "epoch": 0.3668763102725367, "eval_loss": 0.2957610487937927, "eval_runtime": 268.2309, "eval_samples_per_second": 7.423, "eval_steps_per_second": 1.238, "step": 7000 }, { "epoch": 0.36740041928721173, "grad_norm": 1.1695574522018433, "learning_rate": 4.0816299790356395e-05, "loss": 0.0949, "step": 7010 }, { "epoch": 0.36792452830188677, "grad_norm": 2.4255287647247314, "learning_rate": 4.080319706498952e-05, "loss": 0.1318, "step": 7020 }, { "epoch": 0.36844863731656186, "grad_norm": 1.4534999132156372, "learning_rate": 4.079009433962264e-05, "loss": 0.1263, "step": 7030 }, { "epoch": 0.3689727463312369, "grad_norm": 1.4702773094177246, "learning_rate": 4.0776991614255766e-05, "loss": 0.1283, "step": 7040 }, { "epoch": 0.3694968553459119, "grad_norm": 1.4011280536651611, "learning_rate": 4.076388888888889e-05, "loss": 0.1062, "step": 7050 }, { "epoch": 0.370020964360587, "grad_norm": 1.7604784965515137, "learning_rate": 4.075078616352201e-05, "loss": 0.1452, "step": 7060 }, { "epoch": 0.37054507337526205, "grad_norm": 1.455108404159546, "learning_rate": 4.0737683438155136e-05, "loss": 0.1604, "step": 7070 }, { "epoch": 0.3710691823899371, "grad_norm": 1.530150055885315, "learning_rate": 4.0724580712788266e-05, "loss": 0.1598, "step": 7080 }, { "epoch": 0.37159329140461217, "grad_norm": 1.792792558670044, "learning_rate": 4.071147798742139e-05, "loss": 0.1255, "step": 7090 }, { "epoch": 0.3721174004192872, "grad_norm": 2.6770427227020264, "learning_rate": 4.069837526205451e-05, "loss": 0.1279, "step": 7100 }, { "epoch": 0.37264150943396224, "grad_norm": 1.6971386671066284, "learning_rate": 4.0685272536687636e-05, "loss": 0.1409, "step": 7110 }, { "epoch": 0.3731656184486373, "grad_norm": 1.3991681337356567, "learning_rate": 4.067216981132076e-05, "loss": 0.1354, "step": 7120 }, { "epoch": 0.37368972746331236, "grad_norm": 1.0325071811676025, "learning_rate": 4.0659067085953876e-05, "loss": 0.1231, "step": 7130 }, { "epoch": 0.3742138364779874, "grad_norm": 2.683823823928833, "learning_rate": 4.0645964360587e-05, "loss": 0.1249, "step": 7140 }, { "epoch": 0.3747379454926625, "grad_norm": 2.177319288253784, "learning_rate": 4.063286163522013e-05, "loss": 0.1291, "step": 7150 }, { "epoch": 0.3752620545073375, "grad_norm": 3.0221738815307617, "learning_rate": 4.061975890985325e-05, "loss": 0.1233, "step": 7160 }, { "epoch": 0.3757861635220126, "grad_norm": 1.712924599647522, "learning_rate": 4.0606656184486376e-05, "loss": 0.1215, "step": 7170 }, { "epoch": 0.37631027253668764, "grad_norm": 2.8734965324401855, "learning_rate": 4.05935534591195e-05, "loss": 0.143, "step": 7180 }, { "epoch": 0.3768343815513627, "grad_norm": 1.5283716917037964, "learning_rate": 4.058045073375262e-05, "loss": 0.1228, "step": 7190 }, { "epoch": 0.37735849056603776, "grad_norm": 1.5394947528839111, "learning_rate": 4.0567348008385746e-05, "loss": 0.1597, "step": 7200 }, { "epoch": 0.3778825995807128, "grad_norm": 2.937851667404175, "learning_rate": 4.055424528301887e-05, "loss": 0.1796, "step": 7210 }, { "epoch": 0.37840670859538783, "grad_norm": 1.1585332155227661, "learning_rate": 4.054114255765199e-05, "loss": 0.1573, "step": 7220 }, { "epoch": 0.3789308176100629, "grad_norm": 1.674102783203125, "learning_rate": 4.0528039832285117e-05, "loss": 0.1102, "step": 7230 }, { "epoch": 0.37945492662473795, "grad_norm": 2.403864860534668, "learning_rate": 4.051493710691824e-05, "loss": 0.1669, "step": 7240 }, { "epoch": 0.379979035639413, "grad_norm": 5.073932647705078, "learning_rate": 4.050183438155136e-05, "loss": 0.124, "step": 7250 }, { "epoch": 0.3805031446540881, "grad_norm": 2.1044859886169434, "learning_rate": 4.048873165618449e-05, "loss": 0.157, "step": 7260 }, { "epoch": 0.3810272536687631, "grad_norm": 1.426034927368164, "learning_rate": 4.047562893081761e-05, "loss": 0.1508, "step": 7270 }, { "epoch": 0.38155136268343814, "grad_norm": 2.211362838745117, "learning_rate": 4.0462526205450734e-05, "loss": 0.1546, "step": 7280 }, { "epoch": 0.38207547169811323, "grad_norm": 1.7680007219314575, "learning_rate": 4.044942348008386e-05, "loss": 0.1362, "step": 7290 }, { "epoch": 0.38259958071278827, "grad_norm": 1.0435234308242798, "learning_rate": 4.043632075471698e-05, "loss": 0.1438, "step": 7300 }, { "epoch": 0.3831236897274633, "grad_norm": 2.113070487976074, "learning_rate": 4.042321802935011e-05, "loss": 0.1372, "step": 7310 }, { "epoch": 0.3836477987421384, "grad_norm": 1.3755215406417847, "learning_rate": 4.0410115303983234e-05, "loss": 0.1439, "step": 7320 }, { "epoch": 0.3841719077568134, "grad_norm": 2.1089391708374023, "learning_rate": 4.039701257861636e-05, "loss": 0.138, "step": 7330 }, { "epoch": 0.38469601677148846, "grad_norm": 2.3198318481445312, "learning_rate": 4.038390985324948e-05, "loss": 0.1078, "step": 7340 }, { "epoch": 0.38522012578616355, "grad_norm": 1.23020339012146, "learning_rate": 4.0370807127882604e-05, "loss": 0.1405, "step": 7350 }, { "epoch": 0.3857442348008386, "grad_norm": 3.249234676361084, "learning_rate": 4.035770440251572e-05, "loss": 0.19, "step": 7360 }, { "epoch": 0.3862683438155136, "grad_norm": 1.7448391914367676, "learning_rate": 4.0344601677148844e-05, "loss": 0.1153, "step": 7370 }, { "epoch": 0.3867924528301887, "grad_norm": 1.3847689628601074, "learning_rate": 4.0331498951781974e-05, "loss": 0.1377, "step": 7380 }, { "epoch": 0.38731656184486374, "grad_norm": 0.9152111411094666, "learning_rate": 4.03183962264151e-05, "loss": 0.1111, "step": 7390 }, { "epoch": 0.38784067085953877, "grad_norm": 1.5579804182052612, "learning_rate": 4.030529350104822e-05, "loss": 0.1232, "step": 7400 }, { "epoch": 0.38836477987421386, "grad_norm": 5.35411262512207, "learning_rate": 4.0292190775681344e-05, "loss": 0.1371, "step": 7410 }, { "epoch": 0.3888888888888889, "grad_norm": 1.0708833932876587, "learning_rate": 4.027908805031447e-05, "loss": 0.0993, "step": 7420 }, { "epoch": 0.38941299790356393, "grad_norm": 1.5841659307479858, "learning_rate": 4.026598532494759e-05, "loss": 0.1205, "step": 7430 }, { "epoch": 0.389937106918239, "grad_norm": 1.9003546237945557, "learning_rate": 4.0252882599580714e-05, "loss": 0.1076, "step": 7440 }, { "epoch": 0.39046121593291405, "grad_norm": 1.3129006624221802, "learning_rate": 4.023977987421384e-05, "loss": 0.1503, "step": 7450 }, { "epoch": 0.3909853249475891, "grad_norm": 2.282996892929077, "learning_rate": 4.022667714884696e-05, "loss": 0.1119, "step": 7460 }, { "epoch": 0.3915094339622642, "grad_norm": 16.443607330322266, "learning_rate": 4.021357442348009e-05, "loss": 0.1448, "step": 7470 }, { "epoch": 0.3920335429769392, "grad_norm": 1.9104045629501343, "learning_rate": 4.020047169811321e-05, "loss": 0.1364, "step": 7480 }, { "epoch": 0.39255765199161424, "grad_norm": 2.893193483352661, "learning_rate": 4.018736897274633e-05, "loss": 0.1242, "step": 7490 }, { "epoch": 0.39308176100628933, "grad_norm": 2.8008060455322266, "learning_rate": 4.0174266247379455e-05, "loss": 0.1374, "step": 7500 }, { "epoch": 0.39360587002096437, "grad_norm": 2.654747724533081, "learning_rate": 4.016116352201258e-05, "loss": 0.146, "step": 7510 }, { "epoch": 0.3941299790356394, "grad_norm": 1.9274426698684692, "learning_rate": 4.01480607966457e-05, "loss": 0.1346, "step": 7520 }, { "epoch": 0.3946540880503145, "grad_norm": 5.771965026855469, "learning_rate": 4.0134958071278825e-05, "loss": 0.129, "step": 7530 }, { "epoch": 0.3951781970649895, "grad_norm": 1.6615891456604004, "learning_rate": 4.012185534591195e-05, "loss": 0.11, "step": 7540 }, { "epoch": 0.39570230607966456, "grad_norm": 3.1531715393066406, "learning_rate": 4.010875262054508e-05, "loss": 0.1413, "step": 7550 }, { "epoch": 0.39622641509433965, "grad_norm": 1.0126038789749146, "learning_rate": 4.00956498951782e-05, "loss": 0.1173, "step": 7560 }, { "epoch": 0.3967505241090147, "grad_norm": 1.4275016784667969, "learning_rate": 4.0082547169811325e-05, "loss": 0.0979, "step": 7570 }, { "epoch": 0.3972746331236897, "grad_norm": 1.2405078411102295, "learning_rate": 4.006944444444445e-05, "loss": 0.1297, "step": 7580 }, { "epoch": 0.3977987421383648, "grad_norm": 1.497534990310669, "learning_rate": 4.005634171907757e-05, "loss": 0.1095, "step": 7590 }, { "epoch": 0.39832285115303984, "grad_norm": 2.914259195327759, "learning_rate": 4.004323899371069e-05, "loss": 0.1186, "step": 7600 }, { "epoch": 0.39884696016771487, "grad_norm": 2.3535306453704834, "learning_rate": 4.003013626834381e-05, "loss": 0.1537, "step": 7610 }, { "epoch": 0.39937106918238996, "grad_norm": 3.1690330505371094, "learning_rate": 4.001703354297694e-05, "loss": 0.1411, "step": 7620 }, { "epoch": 0.399895178197065, "grad_norm": 3.2396535873413086, "learning_rate": 4.0003930817610066e-05, "loss": 0.1346, "step": 7630 }, { "epoch": 0.40041928721174, "grad_norm": 2.383460760116577, "learning_rate": 3.999082809224319e-05, "loss": 0.1222, "step": 7640 }, { "epoch": 0.4009433962264151, "grad_norm": 1.5564273595809937, "learning_rate": 3.997772536687631e-05, "loss": 0.1819, "step": 7650 }, { "epoch": 0.40146750524109015, "grad_norm": 3.6901915073394775, "learning_rate": 3.9964622641509436e-05, "loss": 0.1093, "step": 7660 }, { "epoch": 0.4019916142557652, "grad_norm": 1.2570509910583496, "learning_rate": 3.995151991614256e-05, "loss": 0.1441, "step": 7670 }, { "epoch": 0.4025157232704403, "grad_norm": 1.0235661268234253, "learning_rate": 3.993841719077568e-05, "loss": 0.1207, "step": 7680 }, { "epoch": 0.4030398322851153, "grad_norm": 1.4244946241378784, "learning_rate": 3.9925314465408806e-05, "loss": 0.1355, "step": 7690 }, { "epoch": 0.40356394129979034, "grad_norm": 1.821220874786377, "learning_rate": 3.991221174004193e-05, "loss": 0.1222, "step": 7700 }, { "epoch": 0.40408805031446543, "grad_norm": 1.3305425643920898, "learning_rate": 3.989910901467506e-05, "loss": 0.1227, "step": 7710 }, { "epoch": 0.40461215932914046, "grad_norm": 0.676930844783783, "learning_rate": 3.9886006289308176e-05, "loss": 0.1164, "step": 7720 }, { "epoch": 0.4051362683438155, "grad_norm": 2.514782190322876, "learning_rate": 3.98729035639413e-05, "loss": 0.1203, "step": 7730 }, { "epoch": 0.4056603773584906, "grad_norm": 2.546602487564087, "learning_rate": 3.985980083857442e-05, "loss": 0.135, "step": 7740 }, { "epoch": 0.4061844863731656, "grad_norm": 1.8678992986679077, "learning_rate": 3.9846698113207546e-05, "loss": 0.1605, "step": 7750 }, { "epoch": 0.40670859538784065, "grad_norm": 1.5499653816223145, "learning_rate": 3.983359538784067e-05, "loss": 0.1584, "step": 7760 }, { "epoch": 0.40723270440251574, "grad_norm": 1.3117645978927612, "learning_rate": 3.982049266247379e-05, "loss": 0.143, "step": 7770 }, { "epoch": 0.4077568134171908, "grad_norm": 3.2754364013671875, "learning_rate": 3.980738993710692e-05, "loss": 0.1416, "step": 7780 }, { "epoch": 0.4082809224318658, "grad_norm": 1.771898627281189, "learning_rate": 3.9794287211740047e-05, "loss": 0.1443, "step": 7790 }, { "epoch": 0.4088050314465409, "grad_norm": 1.7889633178710938, "learning_rate": 3.978118448637317e-05, "loss": 0.1185, "step": 7800 }, { "epoch": 0.40932914046121593, "grad_norm": 2.0119128227233887, "learning_rate": 3.976808176100629e-05, "loss": 0.1344, "step": 7810 }, { "epoch": 0.40985324947589097, "grad_norm": 0.9358534216880798, "learning_rate": 3.975497903563942e-05, "loss": 0.0939, "step": 7820 }, { "epoch": 0.41037735849056606, "grad_norm": 1.441988229751587, "learning_rate": 3.974187631027254e-05, "loss": 0.1183, "step": 7830 }, { "epoch": 0.4109014675052411, "grad_norm": 2.3082668781280518, "learning_rate": 3.972877358490566e-05, "loss": 0.1241, "step": 7840 }, { "epoch": 0.4114255765199161, "grad_norm": 1.446537971496582, "learning_rate": 3.971567085953879e-05, "loss": 0.1556, "step": 7850 }, { "epoch": 0.4119496855345912, "grad_norm": 1.0463677644729614, "learning_rate": 3.970256813417191e-05, "loss": 0.1298, "step": 7860 }, { "epoch": 0.41247379454926625, "grad_norm": 1.2671233415603638, "learning_rate": 3.9689465408805034e-05, "loss": 0.1444, "step": 7870 }, { "epoch": 0.4129979035639413, "grad_norm": 1.6452945470809937, "learning_rate": 3.967636268343816e-05, "loss": 0.1183, "step": 7880 }, { "epoch": 0.41352201257861637, "grad_norm": 1.7226898670196533, "learning_rate": 3.966325995807128e-05, "loss": 0.1098, "step": 7890 }, { "epoch": 0.4140461215932914, "grad_norm": 1.7937055826187134, "learning_rate": 3.9650157232704404e-05, "loss": 0.1162, "step": 7900 }, { "epoch": 0.41457023060796644, "grad_norm": 1.9490100145339966, "learning_rate": 3.963705450733753e-05, "loss": 0.143, "step": 7910 }, { "epoch": 0.41509433962264153, "grad_norm": 2.1928627490997314, "learning_rate": 3.962395178197065e-05, "loss": 0.1508, "step": 7920 }, { "epoch": 0.41561844863731656, "grad_norm": 2.3016629219055176, "learning_rate": 3.9610849056603774e-05, "loss": 0.1206, "step": 7930 }, { "epoch": 0.4161425576519916, "grad_norm": 1.2063487768173218, "learning_rate": 3.9597746331236904e-05, "loss": 0.1042, "step": 7940 }, { "epoch": 0.4166666666666667, "grad_norm": 1.34829843044281, "learning_rate": 3.958464360587003e-05, "loss": 0.1465, "step": 7950 }, { "epoch": 0.4171907756813417, "grad_norm": 2.4825642108917236, "learning_rate": 3.9571540880503144e-05, "loss": 0.1397, "step": 7960 }, { "epoch": 0.41771488469601675, "grad_norm": 1.801196813583374, "learning_rate": 3.955843815513627e-05, "loss": 0.1388, "step": 7970 }, { "epoch": 0.41823899371069184, "grad_norm": 1.4181357622146606, "learning_rate": 3.954533542976939e-05, "loss": 0.1759, "step": 7980 }, { "epoch": 0.4187631027253669, "grad_norm": 1.4561703205108643, "learning_rate": 3.9532232704402514e-05, "loss": 0.1653, "step": 7990 }, { "epoch": 0.4192872117400419, "grad_norm": 1.21798574924469, "learning_rate": 3.951912997903564e-05, "loss": 0.1136, "step": 8000 }, { "epoch": 0.4192872117400419, "eval_loss": 0.29459360241889954, "eval_runtime": 267.455, "eval_samples_per_second": 7.444, "eval_steps_per_second": 1.241, "step": 8000 }, { "epoch": 0.419811320754717, "grad_norm": 1.3732337951660156, "learning_rate": 3.950602725366877e-05, "loss": 0.1248, "step": 8010 }, { "epoch": 0.42033542976939203, "grad_norm": 2.136629581451416, "learning_rate": 3.949292452830189e-05, "loss": 0.1281, "step": 8020 }, { "epoch": 0.42085953878406707, "grad_norm": 0.736299991607666, "learning_rate": 3.9479821802935015e-05, "loss": 0.108, "step": 8030 }, { "epoch": 0.42138364779874216, "grad_norm": 2.7431278228759766, "learning_rate": 3.946671907756814e-05, "loss": 0.1553, "step": 8040 }, { "epoch": 0.4219077568134172, "grad_norm": 3.224233627319336, "learning_rate": 3.945361635220126e-05, "loss": 0.1563, "step": 8050 }, { "epoch": 0.4224318658280922, "grad_norm": 1.2465264797210693, "learning_rate": 3.9440513626834385e-05, "loss": 0.1183, "step": 8060 }, { "epoch": 0.4229559748427673, "grad_norm": 1.5469056367874146, "learning_rate": 3.942741090146751e-05, "loss": 0.1435, "step": 8070 }, { "epoch": 0.42348008385744235, "grad_norm": 2.5857508182525635, "learning_rate": 3.941430817610063e-05, "loss": 0.1432, "step": 8080 }, { "epoch": 0.4240041928721174, "grad_norm": 1.5477666854858398, "learning_rate": 3.9401205450733755e-05, "loss": 0.1425, "step": 8090 }, { "epoch": 0.42452830188679247, "grad_norm": 1.2745437622070312, "learning_rate": 3.938810272536688e-05, "loss": 0.1156, "step": 8100 }, { "epoch": 0.4250524109014675, "grad_norm": 1.5633890628814697, "learning_rate": 3.9375e-05, "loss": 0.1352, "step": 8110 }, { "epoch": 0.42557651991614254, "grad_norm": 1.2198878526687622, "learning_rate": 3.9361897274633125e-05, "loss": 0.1607, "step": 8120 }, { "epoch": 0.4261006289308176, "grad_norm": 1.835465908050537, "learning_rate": 3.934879454926625e-05, "loss": 0.1214, "step": 8130 }, { "epoch": 0.42662473794549266, "grad_norm": 1.3060351610183716, "learning_rate": 3.933569182389937e-05, "loss": 0.1388, "step": 8140 }, { "epoch": 0.4271488469601677, "grad_norm": 2.6293728351593018, "learning_rate": 3.9322589098532495e-05, "loss": 0.1443, "step": 8150 }, { "epoch": 0.4276729559748428, "grad_norm": 2.6649739742279053, "learning_rate": 3.930948637316562e-05, "loss": 0.1182, "step": 8160 }, { "epoch": 0.4281970649895178, "grad_norm": 2.201756238937378, "learning_rate": 3.929638364779875e-05, "loss": 0.1498, "step": 8170 }, { "epoch": 0.42872117400419285, "grad_norm": 2.9758079051971436, "learning_rate": 3.928328092243187e-05, "loss": 0.1713, "step": 8180 }, { "epoch": 0.42924528301886794, "grad_norm": 2.305769920349121, "learning_rate": 3.9270178197064995e-05, "loss": 0.1184, "step": 8190 }, { "epoch": 0.429769392033543, "grad_norm": 1.409645438194275, "learning_rate": 3.925707547169811e-05, "loss": 0.1198, "step": 8200 }, { "epoch": 0.430293501048218, "grad_norm": 1.2932605743408203, "learning_rate": 3.9243972746331235e-05, "loss": 0.1263, "step": 8210 }, { "epoch": 0.4308176100628931, "grad_norm": 2.1385724544525146, "learning_rate": 3.923087002096436e-05, "loss": 0.116, "step": 8220 }, { "epoch": 0.43134171907756813, "grad_norm": 1.7936503887176514, "learning_rate": 3.921776729559748e-05, "loss": 0.1194, "step": 8230 }, { "epoch": 0.43186582809224316, "grad_norm": 1.0538984537124634, "learning_rate": 3.920466457023061e-05, "loss": 0.1398, "step": 8240 }, { "epoch": 0.43238993710691825, "grad_norm": 1.157038927078247, "learning_rate": 3.9191561844863736e-05, "loss": 0.12, "step": 8250 }, { "epoch": 0.4329140461215933, "grad_norm": 1.3017743825912476, "learning_rate": 3.917845911949686e-05, "loss": 0.1097, "step": 8260 }, { "epoch": 0.4334381551362683, "grad_norm": 2.975079298019409, "learning_rate": 3.916535639412998e-05, "loss": 0.1581, "step": 8270 }, { "epoch": 0.4339622641509434, "grad_norm": 0.8380312323570251, "learning_rate": 3.9152253668763106e-05, "loss": 0.1059, "step": 8280 }, { "epoch": 0.43448637316561844, "grad_norm": 2.567601203918457, "learning_rate": 3.913915094339623e-05, "loss": 0.1374, "step": 8290 }, { "epoch": 0.4350104821802935, "grad_norm": 2.6411819458007812, "learning_rate": 3.912604821802935e-05, "loss": 0.1206, "step": 8300 }, { "epoch": 0.43553459119496857, "grad_norm": 1.1850274801254272, "learning_rate": 3.9112945492662476e-05, "loss": 0.1311, "step": 8310 }, { "epoch": 0.4360587002096436, "grad_norm": 1.537529468536377, "learning_rate": 3.90998427672956e-05, "loss": 0.1189, "step": 8320 }, { "epoch": 0.43658280922431864, "grad_norm": 3.455749034881592, "learning_rate": 3.908674004192872e-05, "loss": 0.1454, "step": 8330 }, { "epoch": 0.4371069182389937, "grad_norm": 1.6670117378234863, "learning_rate": 3.9073637316561846e-05, "loss": 0.1372, "step": 8340 }, { "epoch": 0.43763102725366876, "grad_norm": 1.6339397430419922, "learning_rate": 3.906053459119497e-05, "loss": 0.1367, "step": 8350 }, { "epoch": 0.4381551362683438, "grad_norm": 2.6764585971832275, "learning_rate": 3.904743186582809e-05, "loss": 0.1438, "step": 8360 }, { "epoch": 0.4386792452830189, "grad_norm": 1.5620914697647095, "learning_rate": 3.9034329140461216e-05, "loss": 0.1225, "step": 8370 }, { "epoch": 0.4392033542976939, "grad_norm": 2.1984214782714844, "learning_rate": 3.902122641509434e-05, "loss": 0.1207, "step": 8380 }, { "epoch": 0.43972746331236895, "grad_norm": 1.5800156593322754, "learning_rate": 3.900812368972746e-05, "loss": 0.1137, "step": 8390 }, { "epoch": 0.44025157232704404, "grad_norm": 4.698128700256348, "learning_rate": 3.899502096436059e-05, "loss": 0.1242, "step": 8400 }, { "epoch": 0.44077568134171907, "grad_norm": 1.764726996421814, "learning_rate": 3.898191823899372e-05, "loss": 0.1459, "step": 8410 }, { "epoch": 0.4412997903563941, "grad_norm": 1.1937742233276367, "learning_rate": 3.896881551362684e-05, "loss": 0.1184, "step": 8420 }, { "epoch": 0.4418238993710692, "grad_norm": 1.3434696197509766, "learning_rate": 3.8955712788259964e-05, "loss": 0.1189, "step": 8430 }, { "epoch": 0.44234800838574423, "grad_norm": 0.9694046378135681, "learning_rate": 3.894261006289308e-05, "loss": 0.1254, "step": 8440 }, { "epoch": 0.44287211740041926, "grad_norm": 4.838512897491455, "learning_rate": 3.8929507337526204e-05, "loss": 0.135, "step": 8450 }, { "epoch": 0.44339622641509435, "grad_norm": 3.3563590049743652, "learning_rate": 3.891640461215933e-05, "loss": 0.103, "step": 8460 }, { "epoch": 0.4439203354297694, "grad_norm": 1.2375174760818481, "learning_rate": 3.890330188679246e-05, "loss": 0.1489, "step": 8470 }, { "epoch": 0.4444444444444444, "grad_norm": 0.8426742553710938, "learning_rate": 3.889019916142558e-05, "loss": 0.1292, "step": 8480 }, { "epoch": 0.4449685534591195, "grad_norm": 2.4112813472747803, "learning_rate": 3.8877096436058704e-05, "loss": 0.1292, "step": 8490 }, { "epoch": 0.44549266247379454, "grad_norm": 2.111625909805298, "learning_rate": 3.886399371069183e-05, "loss": 0.1141, "step": 8500 }, { "epoch": 0.4460167714884696, "grad_norm": 1.7435418367385864, "learning_rate": 3.885089098532495e-05, "loss": 0.1245, "step": 8510 }, { "epoch": 0.44654088050314467, "grad_norm": 2.7497286796569824, "learning_rate": 3.8837788259958074e-05, "loss": 0.1417, "step": 8520 }, { "epoch": 0.4470649895178197, "grad_norm": 2.3329012393951416, "learning_rate": 3.88246855345912e-05, "loss": 0.1388, "step": 8530 }, { "epoch": 0.44758909853249473, "grad_norm": 1.873579978942871, "learning_rate": 3.881158280922432e-05, "loss": 0.1279, "step": 8540 }, { "epoch": 0.4481132075471698, "grad_norm": 1.848961353302002, "learning_rate": 3.8798480083857444e-05, "loss": 0.1293, "step": 8550 }, { "epoch": 0.44863731656184486, "grad_norm": 1.7882370948791504, "learning_rate": 3.878537735849057e-05, "loss": 0.1059, "step": 8560 }, { "epoch": 0.4491614255765199, "grad_norm": 1.3296475410461426, "learning_rate": 3.877227463312369e-05, "loss": 0.1222, "step": 8570 }, { "epoch": 0.449685534591195, "grad_norm": 3.1347081661224365, "learning_rate": 3.8759171907756814e-05, "loss": 0.1358, "step": 8580 }, { "epoch": 0.45020964360587, "grad_norm": 1.8556910753250122, "learning_rate": 3.874606918238994e-05, "loss": 0.0963, "step": 8590 }, { "epoch": 0.45073375262054505, "grad_norm": 1.9654614925384521, "learning_rate": 3.873296645702306e-05, "loss": 0.1124, "step": 8600 }, { "epoch": 0.45125786163522014, "grad_norm": 2.273122549057007, "learning_rate": 3.8719863731656184e-05, "loss": 0.1212, "step": 8610 }, { "epoch": 0.45178197064989517, "grad_norm": 2.0771892070770264, "learning_rate": 3.870676100628931e-05, "loss": 0.142, "step": 8620 }, { "epoch": 0.4523060796645702, "grad_norm": 7.753453731536865, "learning_rate": 3.869365828092243e-05, "loss": 0.1598, "step": 8630 }, { "epoch": 0.4528301886792453, "grad_norm": 1.923572301864624, "learning_rate": 3.868055555555556e-05, "loss": 0.1393, "step": 8640 }, { "epoch": 0.4533542976939203, "grad_norm": 2.978624105453491, "learning_rate": 3.8667452830188685e-05, "loss": 0.1199, "step": 8650 }, { "epoch": 0.45387840670859536, "grad_norm": 1.3282644748687744, "learning_rate": 3.865435010482181e-05, "loss": 0.1248, "step": 8660 }, { "epoch": 0.45440251572327045, "grad_norm": 2.1446094512939453, "learning_rate": 3.864124737945493e-05, "loss": 0.1349, "step": 8670 }, { "epoch": 0.4549266247379455, "grad_norm": 1.1796557903289795, "learning_rate": 3.862814465408805e-05, "loss": 0.1175, "step": 8680 }, { "epoch": 0.4554507337526205, "grad_norm": 2.4166107177734375, "learning_rate": 3.861504192872117e-05, "loss": 0.0966, "step": 8690 }, { "epoch": 0.4559748427672956, "grad_norm": 1.9842512607574463, "learning_rate": 3.8601939203354295e-05, "loss": 0.101, "step": 8700 }, { "epoch": 0.45649895178197064, "grad_norm": 1.7683640718460083, "learning_rate": 3.8588836477987425e-05, "loss": 0.145, "step": 8710 }, { "epoch": 0.4570230607966457, "grad_norm": 1.3694344758987427, "learning_rate": 3.857573375262055e-05, "loss": 0.105, "step": 8720 }, { "epoch": 0.45754716981132076, "grad_norm": 1.667144775390625, "learning_rate": 3.856263102725367e-05, "loss": 0.118, "step": 8730 }, { "epoch": 0.4580712788259958, "grad_norm": 2.1638240814208984, "learning_rate": 3.8549528301886795e-05, "loss": 0.1249, "step": 8740 }, { "epoch": 0.4585953878406709, "grad_norm": 1.1001627445220947, "learning_rate": 3.853642557651992e-05, "loss": 0.1162, "step": 8750 }, { "epoch": 0.4591194968553459, "grad_norm": 1.8323266506195068, "learning_rate": 3.852332285115304e-05, "loss": 0.1246, "step": 8760 }, { "epoch": 0.45964360587002095, "grad_norm": 1.4820311069488525, "learning_rate": 3.8510220125786165e-05, "loss": 0.1362, "step": 8770 }, { "epoch": 0.46016771488469604, "grad_norm": 1.7225009202957153, "learning_rate": 3.849711740041929e-05, "loss": 0.1262, "step": 8780 }, { "epoch": 0.4606918238993711, "grad_norm": 1.7169979810714722, "learning_rate": 3.848401467505241e-05, "loss": 0.1545, "step": 8790 }, { "epoch": 0.4612159329140461, "grad_norm": 2.1871843338012695, "learning_rate": 3.8470911949685536e-05, "loss": 0.1133, "step": 8800 }, { "epoch": 0.4617400419287212, "grad_norm": 1.6585966348648071, "learning_rate": 3.845780922431866e-05, "loss": 0.1199, "step": 8810 }, { "epoch": 0.46226415094339623, "grad_norm": 5.046018123626709, "learning_rate": 3.844470649895178e-05, "loss": 0.1433, "step": 8820 }, { "epoch": 0.46278825995807127, "grad_norm": 2.51607084274292, "learning_rate": 3.8431603773584906e-05, "loss": 0.1336, "step": 8830 }, { "epoch": 0.46331236897274636, "grad_norm": 1.3547303676605225, "learning_rate": 3.841850104821803e-05, "loss": 0.1539, "step": 8840 }, { "epoch": 0.4638364779874214, "grad_norm": 1.824432373046875, "learning_rate": 3.840539832285115e-05, "loss": 0.1383, "step": 8850 }, { "epoch": 0.4643605870020964, "grad_norm": 1.8877265453338623, "learning_rate": 3.8392295597484276e-05, "loss": 0.112, "step": 8860 }, { "epoch": 0.4648846960167715, "grad_norm": 1.1183520555496216, "learning_rate": 3.8379192872117406e-05, "loss": 0.116, "step": 8870 }, { "epoch": 0.46540880503144655, "grad_norm": 0.9367240071296692, "learning_rate": 3.836609014675053e-05, "loss": 0.1138, "step": 8880 }, { "epoch": 0.4659329140461216, "grad_norm": 1.4263917207717896, "learning_rate": 3.835298742138365e-05, "loss": 0.1587, "step": 8890 }, { "epoch": 0.46645702306079667, "grad_norm": 1.440211296081543, "learning_rate": 3.8339884696016776e-05, "loss": 0.0893, "step": 8900 }, { "epoch": 0.4669811320754717, "grad_norm": 1.3915868997573853, "learning_rate": 3.832678197064989e-05, "loss": 0.097, "step": 8910 }, { "epoch": 0.46750524109014674, "grad_norm": 2.4701268672943115, "learning_rate": 3.8313679245283016e-05, "loss": 0.1198, "step": 8920 }, { "epoch": 0.46802935010482183, "grad_norm": 1.6906559467315674, "learning_rate": 3.830057651991614e-05, "loss": 0.1052, "step": 8930 }, { "epoch": 0.46855345911949686, "grad_norm": 1.4248497486114502, "learning_rate": 3.828747379454927e-05, "loss": 0.1307, "step": 8940 }, { "epoch": 0.4690775681341719, "grad_norm": 1.064107894897461, "learning_rate": 3.827437106918239e-05, "loss": 0.1089, "step": 8950 }, { "epoch": 0.469601677148847, "grad_norm": 1.1374626159667969, "learning_rate": 3.8261268343815517e-05, "loss": 0.1344, "step": 8960 }, { "epoch": 0.470125786163522, "grad_norm": 3.8050456047058105, "learning_rate": 3.824816561844864e-05, "loss": 0.1233, "step": 8970 }, { "epoch": 0.47064989517819705, "grad_norm": 2.235957145690918, "learning_rate": 3.823506289308176e-05, "loss": 0.1109, "step": 8980 }, { "epoch": 0.47117400419287214, "grad_norm": 1.165960669517517, "learning_rate": 3.822196016771489e-05, "loss": 0.1275, "step": 8990 }, { "epoch": 0.4716981132075472, "grad_norm": 2.664379835128784, "learning_rate": 3.820885744234801e-05, "loss": 0.1637, "step": 9000 }, { "epoch": 0.4716981132075472, "eval_loss": 0.29463252425193787, "eval_runtime": 267.3008, "eval_samples_per_second": 7.449, "eval_steps_per_second": 1.242, "step": 9000 }, { "epoch": 0.4722222222222222, "grad_norm": 2.2115066051483154, "learning_rate": 3.8195754716981133e-05, "loss": 0.1339, "step": 9010 }, { "epoch": 0.4727463312368973, "grad_norm": 1.608081579208374, "learning_rate": 3.818265199161426e-05, "loss": 0.0984, "step": 9020 }, { "epoch": 0.47327044025157233, "grad_norm": 2.9085326194763184, "learning_rate": 3.816954926624738e-05, "loss": 0.1239, "step": 9030 }, { "epoch": 0.47379454926624737, "grad_norm": 13.745386123657227, "learning_rate": 3.8156446540880504e-05, "loss": 0.115, "step": 9040 }, { "epoch": 0.47431865828092246, "grad_norm": 3.2827248573303223, "learning_rate": 3.814334381551363e-05, "loss": 0.1289, "step": 9050 }, { "epoch": 0.4748427672955975, "grad_norm": 2.124379873275757, "learning_rate": 3.813024109014675e-05, "loss": 0.1247, "step": 9060 }, { "epoch": 0.4753668763102725, "grad_norm": 1.5814337730407715, "learning_rate": 3.8117138364779874e-05, "loss": 0.1247, "step": 9070 }, { "epoch": 0.4758909853249476, "grad_norm": 1.8060868978500366, "learning_rate": 3.8104035639413e-05, "loss": 0.1498, "step": 9080 }, { "epoch": 0.47641509433962265, "grad_norm": 1.223948359489441, "learning_rate": 3.809093291404612e-05, "loss": 0.0915, "step": 9090 }, { "epoch": 0.4769392033542977, "grad_norm": 0.885719895362854, "learning_rate": 3.807783018867925e-05, "loss": 0.122, "step": 9100 }, { "epoch": 0.47746331236897277, "grad_norm": 1.971203088760376, "learning_rate": 3.8064727463312374e-05, "loss": 0.1585, "step": 9110 }, { "epoch": 0.4779874213836478, "grad_norm": 2.219496965408325, "learning_rate": 3.80516247379455e-05, "loss": 0.1119, "step": 9120 }, { "epoch": 0.47851153039832284, "grad_norm": 1.7364963293075562, "learning_rate": 3.803852201257862e-05, "loss": 0.1005, "step": 9130 }, { "epoch": 0.4790356394129979, "grad_norm": 1.9840654134750366, "learning_rate": 3.8025419287211744e-05, "loss": 0.1209, "step": 9140 }, { "epoch": 0.47955974842767296, "grad_norm": 2.238281726837158, "learning_rate": 3.801231656184486e-05, "loss": 0.1569, "step": 9150 }, { "epoch": 0.480083857442348, "grad_norm": 1.8940962553024292, "learning_rate": 3.7999213836477984e-05, "loss": 0.0958, "step": 9160 }, { "epoch": 0.4806079664570231, "grad_norm": 2.0582616329193115, "learning_rate": 3.7986111111111114e-05, "loss": 0.1442, "step": 9170 }, { "epoch": 0.4811320754716981, "grad_norm": 2.1361806392669678, "learning_rate": 3.797300838574424e-05, "loss": 0.1104, "step": 9180 }, { "epoch": 0.48165618448637315, "grad_norm": 2.00907826423645, "learning_rate": 3.795990566037736e-05, "loss": 0.1137, "step": 9190 }, { "epoch": 0.48218029350104824, "grad_norm": 1.998676061630249, "learning_rate": 3.7946802935010485e-05, "loss": 0.1456, "step": 9200 }, { "epoch": 0.4827044025157233, "grad_norm": 6.387622356414795, "learning_rate": 3.793370020964361e-05, "loss": 0.1616, "step": 9210 }, { "epoch": 0.4832285115303983, "grad_norm": 2.027569055557251, "learning_rate": 3.792059748427673e-05, "loss": 0.1425, "step": 9220 }, { "epoch": 0.4837526205450734, "grad_norm": 2.2389369010925293, "learning_rate": 3.7907494758909855e-05, "loss": 0.1355, "step": 9230 }, { "epoch": 0.48427672955974843, "grad_norm": 0.3307386636734009, "learning_rate": 3.789439203354298e-05, "loss": 0.1269, "step": 9240 }, { "epoch": 0.48480083857442346, "grad_norm": 2.1012308597564697, "learning_rate": 3.78812893081761e-05, "loss": 0.1235, "step": 9250 }, { "epoch": 0.48532494758909855, "grad_norm": 0.8590827584266663, "learning_rate": 3.786818658280923e-05, "loss": 0.1244, "step": 9260 }, { "epoch": 0.4858490566037736, "grad_norm": 1.6406866312026978, "learning_rate": 3.785508385744235e-05, "loss": 0.1256, "step": 9270 }, { "epoch": 0.4863731656184486, "grad_norm": 1.5975079536437988, "learning_rate": 3.784198113207547e-05, "loss": 0.1074, "step": 9280 }, { "epoch": 0.4868972746331237, "grad_norm": 1.5340465307235718, "learning_rate": 3.7828878406708595e-05, "loss": 0.0975, "step": 9290 }, { "epoch": 0.48742138364779874, "grad_norm": 3.123337984085083, "learning_rate": 3.781577568134172e-05, "loss": 0.156, "step": 9300 }, { "epoch": 0.4879454926624738, "grad_norm": 6.452347755432129, "learning_rate": 3.780267295597484e-05, "loss": 0.1253, "step": 9310 }, { "epoch": 0.48846960167714887, "grad_norm": 1.0351651906967163, "learning_rate": 3.7789570230607965e-05, "loss": 0.1547, "step": 9320 }, { "epoch": 0.4889937106918239, "grad_norm": 1.8313533067703247, "learning_rate": 3.7776467505241095e-05, "loss": 0.1104, "step": 9330 }, { "epoch": 0.48951781970649894, "grad_norm": 1.1199965476989746, "learning_rate": 3.776336477987422e-05, "loss": 0.0976, "step": 9340 }, { "epoch": 0.490041928721174, "grad_norm": 2.7808194160461426, "learning_rate": 3.775026205450734e-05, "loss": 0.1449, "step": 9350 }, { "epoch": 0.49056603773584906, "grad_norm": 4.0185041427612305, "learning_rate": 3.7737159329140465e-05, "loss": 0.142, "step": 9360 }, { "epoch": 0.4910901467505241, "grad_norm": 4.457677364349365, "learning_rate": 3.772405660377359e-05, "loss": 0.1096, "step": 9370 }, { "epoch": 0.4916142557651992, "grad_norm": 1.8629597425460815, "learning_rate": 3.771095387840671e-05, "loss": 0.1369, "step": 9380 }, { "epoch": 0.4921383647798742, "grad_norm": 2.3238027095794678, "learning_rate": 3.769785115303983e-05, "loss": 0.1253, "step": 9390 }, { "epoch": 0.49266247379454925, "grad_norm": 2.06125807762146, "learning_rate": 3.768474842767296e-05, "loss": 0.1438, "step": 9400 }, { "epoch": 0.49318658280922434, "grad_norm": 1.5867494344711304, "learning_rate": 3.767164570230608e-05, "loss": 0.1318, "step": 9410 }, { "epoch": 0.4937106918238994, "grad_norm": 1.5369681119918823, "learning_rate": 3.7658542976939206e-05, "loss": 0.1474, "step": 9420 }, { "epoch": 0.4942348008385744, "grad_norm": 1.043910264968872, "learning_rate": 3.764544025157233e-05, "loss": 0.1205, "step": 9430 }, { "epoch": 0.4947589098532495, "grad_norm": 1.8726791143417358, "learning_rate": 3.763233752620545e-05, "loss": 0.1045, "step": 9440 }, { "epoch": 0.49528301886792453, "grad_norm": 3.224243402481079, "learning_rate": 3.7619234800838576e-05, "loss": 0.1327, "step": 9450 }, { "epoch": 0.49580712788259956, "grad_norm": 1.566446304321289, "learning_rate": 3.76061320754717e-05, "loss": 0.1272, "step": 9460 }, { "epoch": 0.49633123689727465, "grad_norm": 2.9973807334899902, "learning_rate": 3.759302935010482e-05, "loss": 0.0983, "step": 9470 }, { "epoch": 0.4968553459119497, "grad_norm": 1.0932589769363403, "learning_rate": 3.7579926624737946e-05, "loss": 0.1243, "step": 9480 }, { "epoch": 0.4973794549266247, "grad_norm": 1.4678490161895752, "learning_rate": 3.7566823899371076e-05, "loss": 0.0928, "step": 9490 }, { "epoch": 0.4979035639412998, "grad_norm": 1.9472997188568115, "learning_rate": 3.75537211740042e-05, "loss": 0.1297, "step": 9500 }, { "epoch": 0.49842767295597484, "grad_norm": 1.9206911325454712, "learning_rate": 3.7540618448637316e-05, "loss": 0.1146, "step": 9510 }, { "epoch": 0.4989517819706499, "grad_norm": 2.2353034019470215, "learning_rate": 3.752751572327044e-05, "loss": 0.1539, "step": 9520 }, { "epoch": 0.49947589098532497, "grad_norm": 1.8114569187164307, "learning_rate": 3.751441299790356e-05, "loss": 0.1193, "step": 9530 }, { "epoch": 0.5, "grad_norm": 2.5642435550689697, "learning_rate": 3.7501310272536686e-05, "loss": 0.1495, "step": 9540 }, { "epoch": 0.500524109014675, "grad_norm": 1.5244758129119873, "learning_rate": 3.748820754716981e-05, "loss": 0.1157, "step": 9550 }, { "epoch": 0.5010482180293501, "grad_norm": 1.0185999870300293, "learning_rate": 3.747510482180294e-05, "loss": 0.1053, "step": 9560 }, { "epoch": 0.5015723270440252, "grad_norm": 1.3741650581359863, "learning_rate": 3.746200209643606e-05, "loss": 0.1021, "step": 9570 }, { "epoch": 0.5020964360587002, "grad_norm": 13.233023643493652, "learning_rate": 3.744889937106919e-05, "loss": 0.1031, "step": 9580 }, { "epoch": 0.5026205450733753, "grad_norm": 2.069478750228882, "learning_rate": 3.743579664570231e-05, "loss": 0.1639, "step": 9590 }, { "epoch": 0.5031446540880503, "grad_norm": 1.6018750667572021, "learning_rate": 3.7422693920335433e-05, "loss": 0.1165, "step": 9600 }, { "epoch": 0.5036687631027253, "grad_norm": 1.1766276359558105, "learning_rate": 3.740959119496856e-05, "loss": 0.1308, "step": 9610 }, { "epoch": 0.5041928721174004, "grad_norm": 7.346364498138428, "learning_rate": 3.739648846960168e-05, "loss": 0.1188, "step": 9620 }, { "epoch": 0.5047169811320755, "grad_norm": 2.2837276458740234, "learning_rate": 3.73833857442348e-05, "loss": 0.0947, "step": 9630 }, { "epoch": 0.5052410901467506, "grad_norm": 2.5841448307037354, "learning_rate": 3.737028301886793e-05, "loss": 0.1177, "step": 9640 }, { "epoch": 0.5057651991614256, "grad_norm": 2.1917734146118164, "learning_rate": 3.735718029350105e-05, "loss": 0.1451, "step": 9650 }, { "epoch": 0.5062893081761006, "grad_norm": 1.490285038948059, "learning_rate": 3.7344077568134174e-05, "loss": 0.1132, "step": 9660 }, { "epoch": 0.5068134171907757, "grad_norm": 1.1655572652816772, "learning_rate": 3.73309748427673e-05, "loss": 0.1181, "step": 9670 }, { "epoch": 0.5073375262054507, "grad_norm": 1.4635337591171265, "learning_rate": 3.731787211740042e-05, "loss": 0.1121, "step": 9680 }, { "epoch": 0.5078616352201258, "grad_norm": 1.9814307689666748, "learning_rate": 3.7304769392033544e-05, "loss": 0.1067, "step": 9690 }, { "epoch": 0.5083857442348009, "grad_norm": 1.5174517631530762, "learning_rate": 3.729166666666667e-05, "loss": 0.1324, "step": 9700 }, { "epoch": 0.5089098532494759, "grad_norm": 1.8455206155776978, "learning_rate": 3.727856394129979e-05, "loss": 0.0979, "step": 9710 }, { "epoch": 0.5094339622641509, "grad_norm": 1.3721349239349365, "learning_rate": 3.7265461215932914e-05, "loss": 0.1358, "step": 9720 }, { "epoch": 0.509958071278826, "grad_norm": 1.6334538459777832, "learning_rate": 3.7252358490566044e-05, "loss": 0.1105, "step": 9730 }, { "epoch": 0.510482180293501, "grad_norm": 1.7805728912353516, "learning_rate": 3.723925576519917e-05, "loss": 0.1211, "step": 9740 }, { "epoch": 0.5110062893081762, "grad_norm": 1.514751672744751, "learning_rate": 3.7226153039832284e-05, "loss": 0.1012, "step": 9750 }, { "epoch": 0.5115303983228512, "grad_norm": 2.340724468231201, "learning_rate": 3.721305031446541e-05, "loss": 0.1427, "step": 9760 }, { "epoch": 0.5120545073375262, "grad_norm": 1.5210148096084595, "learning_rate": 3.719994758909853e-05, "loss": 0.1124, "step": 9770 }, { "epoch": 0.5125786163522013, "grad_norm": 2.119563341140747, "learning_rate": 3.7186844863731654e-05, "loss": 0.1263, "step": 9780 }, { "epoch": 0.5131027253668763, "grad_norm": 1.4220565557479858, "learning_rate": 3.717374213836478e-05, "loss": 0.1007, "step": 9790 }, { "epoch": 0.5136268343815513, "grad_norm": 1.1034489870071411, "learning_rate": 3.716063941299791e-05, "loss": 0.1312, "step": 9800 }, { "epoch": 0.5141509433962265, "grad_norm": 2.0566272735595703, "learning_rate": 3.714753668763103e-05, "loss": 0.1019, "step": 9810 }, { "epoch": 0.5146750524109015, "grad_norm": 1.0858715772628784, "learning_rate": 3.7134433962264155e-05, "loss": 0.1345, "step": 9820 }, { "epoch": 0.5151991614255765, "grad_norm": 0.9479877948760986, "learning_rate": 3.712133123689728e-05, "loss": 0.1138, "step": 9830 }, { "epoch": 0.5157232704402516, "grad_norm": 2.655113935470581, "learning_rate": 3.71082285115304e-05, "loss": 0.1238, "step": 9840 }, { "epoch": 0.5162473794549266, "grad_norm": 1.081669569015503, "learning_rate": 3.7095125786163525e-05, "loss": 0.1447, "step": 9850 }, { "epoch": 0.5167714884696016, "grad_norm": 1.6646862030029297, "learning_rate": 3.708202306079665e-05, "loss": 0.1348, "step": 9860 }, { "epoch": 0.5172955974842768, "grad_norm": 2.4982075691223145, "learning_rate": 3.706892033542977e-05, "loss": 0.1408, "step": 9870 }, { "epoch": 0.5178197064989518, "grad_norm": 1.4935152530670166, "learning_rate": 3.7055817610062895e-05, "loss": 0.1067, "step": 9880 }, { "epoch": 0.5183438155136268, "grad_norm": 1.6594271659851074, "learning_rate": 3.704271488469602e-05, "loss": 0.1184, "step": 9890 }, { "epoch": 0.5188679245283019, "grad_norm": 2.2178964614868164, "learning_rate": 3.702961215932914e-05, "loss": 0.1538, "step": 9900 }, { "epoch": 0.5193920335429769, "grad_norm": 2.592712640762329, "learning_rate": 3.7016509433962265e-05, "loss": 0.1141, "step": 9910 }, { "epoch": 0.519916142557652, "grad_norm": 9.270172119140625, "learning_rate": 3.700340670859539e-05, "loss": 0.1254, "step": 9920 }, { "epoch": 0.5204402515723271, "grad_norm": 1.9585696458816528, "learning_rate": 3.699030398322851e-05, "loss": 0.1527, "step": 9930 }, { "epoch": 0.5209643605870021, "grad_norm": 1.2968515157699585, "learning_rate": 3.6977201257861635e-05, "loss": 0.1091, "step": 9940 }, { "epoch": 0.5214884696016772, "grad_norm": 3.7071094512939453, "learning_rate": 3.696409853249476e-05, "loss": 0.1466, "step": 9950 }, { "epoch": 0.5220125786163522, "grad_norm": 1.7539535760879517, "learning_rate": 3.695099580712789e-05, "loss": 0.1164, "step": 9960 }, { "epoch": 0.5225366876310272, "grad_norm": 1.4288002252578735, "learning_rate": 3.693789308176101e-05, "loss": 0.0937, "step": 9970 }, { "epoch": 0.5230607966457023, "grad_norm": 2.288053035736084, "learning_rate": 3.6924790356394136e-05, "loss": 0.1354, "step": 9980 }, { "epoch": 0.5235849056603774, "grad_norm": 1.5926620960235596, "learning_rate": 3.691168763102725e-05, "loss": 0.1133, "step": 9990 }, { "epoch": 0.5241090146750524, "grad_norm": 1.2741729021072388, "learning_rate": 3.6898584905660376e-05, "loss": 0.0885, "step": 10000 }, { "epoch": 0.5241090146750524, "eval_loss": 0.29621487855911255, "eval_runtime": 267.7799, "eval_samples_per_second": 7.435, "eval_steps_per_second": 1.24, "step": 10000 }, { "epoch": 0.5246331236897275, "grad_norm": 2.342545509338379, "learning_rate": 3.68854821802935e-05, "loss": 0.133, "step": 10010 }, { "epoch": 0.5251572327044025, "grad_norm": 2.749216318130493, "learning_rate": 3.687237945492662e-05, "loss": 0.1033, "step": 10020 }, { "epoch": 0.5256813417190775, "grad_norm": 1.3036613464355469, "learning_rate": 3.685927672955975e-05, "loss": 0.1188, "step": 10030 }, { "epoch": 0.5262054507337526, "grad_norm": 1.6999369859695435, "learning_rate": 3.6846174004192876e-05, "loss": 0.1453, "step": 10040 }, { "epoch": 0.5267295597484277, "grad_norm": 1.395107626914978, "learning_rate": 3.6833071278826e-05, "loss": 0.1173, "step": 10050 }, { "epoch": 0.5272536687631028, "grad_norm": 2.4702157974243164, "learning_rate": 3.681996855345912e-05, "loss": 0.1151, "step": 10060 }, { "epoch": 0.5277777777777778, "grad_norm": 1.3825613260269165, "learning_rate": 3.6806865828092246e-05, "loss": 0.1106, "step": 10070 }, { "epoch": 0.5283018867924528, "grad_norm": 1.9985954761505127, "learning_rate": 3.679376310272537e-05, "loss": 0.1264, "step": 10080 }, { "epoch": 0.5288259958071279, "grad_norm": 1.4469823837280273, "learning_rate": 3.678066037735849e-05, "loss": 0.1173, "step": 10090 }, { "epoch": 0.5293501048218029, "grad_norm": 1.9220649003982544, "learning_rate": 3.6767557651991616e-05, "loss": 0.1465, "step": 10100 }, { "epoch": 0.529874213836478, "grad_norm": 2.985271453857422, "learning_rate": 3.675445492662474e-05, "loss": 0.1341, "step": 10110 }, { "epoch": 0.5303983228511531, "grad_norm": 1.2226923704147339, "learning_rate": 3.674135220125786e-05, "loss": 0.1246, "step": 10120 }, { "epoch": 0.5309224318658281, "grad_norm": 1.7916375398635864, "learning_rate": 3.6728249475890986e-05, "loss": 0.1169, "step": 10130 }, { "epoch": 0.5314465408805031, "grad_norm": 1.7915583848953247, "learning_rate": 3.671514675052411e-05, "loss": 0.12, "step": 10140 }, { "epoch": 0.5319706498951782, "grad_norm": 1.5447636842727661, "learning_rate": 3.670204402515723e-05, "loss": 0.1112, "step": 10150 }, { "epoch": 0.5324947589098532, "grad_norm": 3.285203695297241, "learning_rate": 3.668894129979036e-05, "loss": 0.1622, "step": 10160 }, { "epoch": 0.5330188679245284, "grad_norm": 1.0093086957931519, "learning_rate": 3.667583857442348e-05, "loss": 0.1437, "step": 10170 }, { "epoch": 0.5335429769392034, "grad_norm": 1.082940697669983, "learning_rate": 3.6662735849056603e-05, "loss": 0.1059, "step": 10180 }, { "epoch": 0.5340670859538784, "grad_norm": 0.6458982825279236, "learning_rate": 3.6649633123689734e-05, "loss": 0.1267, "step": 10190 }, { "epoch": 0.5345911949685535, "grad_norm": 1.6019344329833984, "learning_rate": 3.663653039832286e-05, "loss": 0.1192, "step": 10200 }, { "epoch": 0.5351153039832285, "grad_norm": 1.5945937633514404, "learning_rate": 3.662342767295598e-05, "loss": 0.1058, "step": 10210 }, { "epoch": 0.5356394129979035, "grad_norm": 0.926892876625061, "learning_rate": 3.6610324947589104e-05, "loss": 0.1209, "step": 10220 }, { "epoch": 0.5361635220125787, "grad_norm": 1.6717281341552734, "learning_rate": 3.659722222222222e-05, "loss": 0.1327, "step": 10230 }, { "epoch": 0.5366876310272537, "grad_norm": 1.7691363096237183, "learning_rate": 3.6584119496855344e-05, "loss": 0.1286, "step": 10240 }, { "epoch": 0.5372117400419287, "grad_norm": 2.906761646270752, "learning_rate": 3.657101677148847e-05, "loss": 0.1253, "step": 10250 }, { "epoch": 0.5377358490566038, "grad_norm": 2.2023632526397705, "learning_rate": 3.65579140461216e-05, "loss": 0.1193, "step": 10260 }, { "epoch": 0.5382599580712788, "grad_norm": 1.8191137313842773, "learning_rate": 3.654481132075472e-05, "loss": 0.1465, "step": 10270 }, { "epoch": 0.5387840670859538, "grad_norm": 2.309532642364502, "learning_rate": 3.6531708595387844e-05, "loss": 0.1244, "step": 10280 }, { "epoch": 0.539308176100629, "grad_norm": 1.8280638456344604, "learning_rate": 3.651860587002097e-05, "loss": 0.1307, "step": 10290 }, { "epoch": 0.539832285115304, "grad_norm": 2.2038843631744385, "learning_rate": 3.650550314465409e-05, "loss": 0.12, "step": 10300 }, { "epoch": 0.540356394129979, "grad_norm": 1.8919661045074463, "learning_rate": 3.6492400419287214e-05, "loss": 0.1278, "step": 10310 }, { "epoch": 0.5408805031446541, "grad_norm": 1.6221542358398438, "learning_rate": 3.647929769392034e-05, "loss": 0.1022, "step": 10320 }, { "epoch": 0.5414046121593291, "grad_norm": 2.0718319416046143, "learning_rate": 3.646619496855346e-05, "loss": 0.138, "step": 10330 }, { "epoch": 0.5419287211740041, "grad_norm": 1.4710763692855835, "learning_rate": 3.6453092243186584e-05, "loss": 0.1091, "step": 10340 }, { "epoch": 0.5424528301886793, "grad_norm": 1.6080454587936401, "learning_rate": 3.643998951781971e-05, "loss": 0.1248, "step": 10350 }, { "epoch": 0.5429769392033543, "grad_norm": 0.9281677603721619, "learning_rate": 3.642688679245283e-05, "loss": 0.1445, "step": 10360 }, { "epoch": 0.5435010482180294, "grad_norm": 1.4223753213882446, "learning_rate": 3.6413784067085955e-05, "loss": 0.1557, "step": 10370 }, { "epoch": 0.5440251572327044, "grad_norm": 1.4216049909591675, "learning_rate": 3.640068134171908e-05, "loss": 0.1083, "step": 10380 }, { "epoch": 0.5445492662473794, "grad_norm": 1.84040105342865, "learning_rate": 3.63875786163522e-05, "loss": 0.1137, "step": 10390 }, { "epoch": 0.5450733752620545, "grad_norm": 1.6752604246139526, "learning_rate": 3.6374475890985325e-05, "loss": 0.1421, "step": 10400 }, { "epoch": 0.5455974842767296, "grad_norm": 2.0979623794555664, "learning_rate": 3.636137316561845e-05, "loss": 0.1228, "step": 10410 }, { "epoch": 0.5461215932914046, "grad_norm": 0.7087647318840027, "learning_rate": 3.634827044025158e-05, "loss": 0.1235, "step": 10420 }, { "epoch": 0.5466457023060797, "grad_norm": 1.492990255355835, "learning_rate": 3.63351677148847e-05, "loss": 0.1434, "step": 10430 }, { "epoch": 0.5471698113207547, "grad_norm": 2.378920793533325, "learning_rate": 3.6322064989517825e-05, "loss": 0.1569, "step": 10440 }, { "epoch": 0.5476939203354297, "grad_norm": 1.9160252809524536, "learning_rate": 3.630896226415095e-05, "loss": 0.1054, "step": 10450 }, { "epoch": 0.5482180293501048, "grad_norm": 0.846349835395813, "learning_rate": 3.6295859538784065e-05, "loss": 0.1133, "step": 10460 }, { "epoch": 0.5487421383647799, "grad_norm": 1.7348216772079468, "learning_rate": 3.628275681341719e-05, "loss": 0.1288, "step": 10470 }, { "epoch": 0.549266247379455, "grad_norm": 1.307425618171692, "learning_rate": 3.626965408805031e-05, "loss": 0.1251, "step": 10480 }, { "epoch": 0.54979035639413, "grad_norm": 1.8501406908035278, "learning_rate": 3.625655136268344e-05, "loss": 0.1204, "step": 10490 }, { "epoch": 0.550314465408805, "grad_norm": 1.3911058902740479, "learning_rate": 3.6243448637316565e-05, "loss": 0.1249, "step": 10500 }, { "epoch": 0.55083857442348, "grad_norm": 0.7849336862564087, "learning_rate": 3.623034591194969e-05, "loss": 0.0963, "step": 10510 }, { "epoch": 0.5513626834381551, "grad_norm": 2.364187002182007, "learning_rate": 3.621724318658281e-05, "loss": 0.1067, "step": 10520 }, { "epoch": 0.5518867924528302, "grad_norm": 2.4732234477996826, "learning_rate": 3.6204140461215935e-05, "loss": 0.1011, "step": 10530 }, { "epoch": 0.5524109014675053, "grad_norm": 2.1230411529541016, "learning_rate": 3.619103773584906e-05, "loss": 0.1204, "step": 10540 }, { "epoch": 0.5529350104821803, "grad_norm": 1.2235745191574097, "learning_rate": 3.617793501048218e-05, "loss": 0.1249, "step": 10550 }, { "epoch": 0.5534591194968553, "grad_norm": 1.4653679132461548, "learning_rate": 3.6164832285115306e-05, "loss": 0.1377, "step": 10560 }, { "epoch": 0.5539832285115304, "grad_norm": 2.0891177654266357, "learning_rate": 3.615172955974843e-05, "loss": 0.1404, "step": 10570 }, { "epoch": 0.5545073375262054, "grad_norm": 1.4860715866088867, "learning_rate": 3.613862683438155e-05, "loss": 0.1067, "step": 10580 }, { "epoch": 0.5550314465408805, "grad_norm": 1.492263674736023, "learning_rate": 3.6125524109014676e-05, "loss": 0.1387, "step": 10590 }, { "epoch": 0.5555555555555556, "grad_norm": 2.3158419132232666, "learning_rate": 3.61124213836478e-05, "loss": 0.1267, "step": 10600 }, { "epoch": 0.5560796645702306, "grad_norm": 1.954278588294983, "learning_rate": 3.609931865828092e-05, "loss": 0.1284, "step": 10610 }, { "epoch": 0.5566037735849056, "grad_norm": 2.4319417476654053, "learning_rate": 3.6086215932914046e-05, "loss": 0.1373, "step": 10620 }, { "epoch": 0.5571278825995807, "grad_norm": 2.4689600467681885, "learning_rate": 3.607311320754717e-05, "loss": 0.1314, "step": 10630 }, { "epoch": 0.5576519916142557, "grad_norm": 1.7527562379837036, "learning_rate": 3.606001048218029e-05, "loss": 0.1118, "step": 10640 }, { "epoch": 0.5581761006289309, "grad_norm": 1.3557283878326416, "learning_rate": 3.604690775681342e-05, "loss": 0.1176, "step": 10650 }, { "epoch": 0.5587002096436059, "grad_norm": 1.7744745016098022, "learning_rate": 3.6033805031446546e-05, "loss": 0.1599, "step": 10660 }, { "epoch": 0.5592243186582809, "grad_norm": 1.2975720167160034, "learning_rate": 3.602070230607967e-05, "loss": 0.1449, "step": 10670 }, { "epoch": 0.559748427672956, "grad_norm": 1.860067367553711, "learning_rate": 3.600759958071279e-05, "loss": 0.1395, "step": 10680 }, { "epoch": 0.560272536687631, "grad_norm": 1.673621416091919, "learning_rate": 3.5994496855345916e-05, "loss": 0.1146, "step": 10690 }, { "epoch": 0.560796645702306, "grad_norm": 1.0844975709915161, "learning_rate": 3.598139412997903e-05, "loss": 0.123, "step": 10700 }, { "epoch": 0.5613207547169812, "grad_norm": 1.365915060043335, "learning_rate": 3.5968291404612156e-05, "loss": 0.1111, "step": 10710 }, { "epoch": 0.5618448637316562, "grad_norm": 1.3708051443099976, "learning_rate": 3.595518867924528e-05, "loss": 0.1357, "step": 10720 }, { "epoch": 0.5623689727463312, "grad_norm": 1.8915815353393555, "learning_rate": 3.594208595387841e-05, "loss": 0.1265, "step": 10730 }, { "epoch": 0.5628930817610063, "grad_norm": 1.4480313062667847, "learning_rate": 3.592898322851153e-05, "loss": 0.1442, "step": 10740 }, { "epoch": 0.5634171907756813, "grad_norm": 2.1300652027130127, "learning_rate": 3.591588050314466e-05, "loss": 0.1274, "step": 10750 }, { "epoch": 0.5639412997903563, "grad_norm": 1.2599786520004272, "learning_rate": 3.590277777777778e-05, "loss": 0.1179, "step": 10760 }, { "epoch": 0.5644654088050315, "grad_norm": 2.037057638168335, "learning_rate": 3.5889675052410903e-05, "loss": 0.1286, "step": 10770 }, { "epoch": 0.5649895178197065, "grad_norm": 0.6876270174980164, "learning_rate": 3.587657232704403e-05, "loss": 0.1271, "step": 10780 }, { "epoch": 0.5655136268343816, "grad_norm": 2.1114234924316406, "learning_rate": 3.586346960167715e-05, "loss": 0.1341, "step": 10790 }, { "epoch": 0.5660377358490566, "grad_norm": 2.157926082611084, "learning_rate": 3.5850366876310274e-05, "loss": 0.0956, "step": 10800 }, { "epoch": 0.5665618448637316, "grad_norm": 0.7500534653663635, "learning_rate": 3.58372641509434e-05, "loss": 0.1256, "step": 10810 }, { "epoch": 0.5670859538784067, "grad_norm": 2.3284528255462646, "learning_rate": 3.582416142557652e-05, "loss": 0.1552, "step": 10820 }, { "epoch": 0.5676100628930818, "grad_norm": 1.4946759939193726, "learning_rate": 3.5811058700209644e-05, "loss": 0.1305, "step": 10830 }, { "epoch": 0.5681341719077568, "grad_norm": 1.227433443069458, "learning_rate": 3.579795597484277e-05, "loss": 0.1475, "step": 10840 }, { "epoch": 0.5686582809224319, "grad_norm": 1.4880372285842896, "learning_rate": 3.578485324947589e-05, "loss": 0.1095, "step": 10850 }, { "epoch": 0.5691823899371069, "grad_norm": 1.418043613433838, "learning_rate": 3.5771750524109014e-05, "loss": 0.1169, "step": 10860 }, { "epoch": 0.5697064989517819, "grad_norm": 1.149854302406311, "learning_rate": 3.575864779874214e-05, "loss": 0.1301, "step": 10870 }, { "epoch": 0.570230607966457, "grad_norm": 1.978639841079712, "learning_rate": 3.574554507337526e-05, "loss": 0.1101, "step": 10880 }, { "epoch": 0.5707547169811321, "grad_norm": 1.548750638961792, "learning_rate": 3.573244234800839e-05, "loss": 0.0986, "step": 10890 }, { "epoch": 0.5712788259958071, "grad_norm": 1.443969488143921, "learning_rate": 3.5719339622641514e-05, "loss": 0.1083, "step": 10900 }, { "epoch": 0.5718029350104822, "grad_norm": 1.9009684324264526, "learning_rate": 3.570623689727464e-05, "loss": 0.1396, "step": 10910 }, { "epoch": 0.5723270440251572, "grad_norm": 1.0748625993728638, "learning_rate": 3.569313417190776e-05, "loss": 0.1182, "step": 10920 }, { "epoch": 0.5728511530398323, "grad_norm": 1.075340747833252, "learning_rate": 3.5680031446540884e-05, "loss": 0.1138, "step": 10930 }, { "epoch": 0.5733752620545073, "grad_norm": 0.9983365535736084, "learning_rate": 3.5666928721174e-05, "loss": 0.1029, "step": 10940 }, { "epoch": 0.5738993710691824, "grad_norm": 2.230050802230835, "learning_rate": 3.5653825995807124e-05, "loss": 0.1359, "step": 10950 }, { "epoch": 0.5744234800838575, "grad_norm": 1.0450794696807861, "learning_rate": 3.5640723270440255e-05, "loss": 0.0988, "step": 10960 }, { "epoch": 0.5749475890985325, "grad_norm": 1.466576337814331, "learning_rate": 3.562762054507338e-05, "loss": 0.1257, "step": 10970 }, { "epoch": 0.5754716981132075, "grad_norm": 2.0776283740997314, "learning_rate": 3.56145178197065e-05, "loss": 0.1116, "step": 10980 }, { "epoch": 0.5759958071278826, "grad_norm": 1.320647120475769, "learning_rate": 3.5601415094339625e-05, "loss": 0.1229, "step": 10990 }, { "epoch": 0.5765199161425576, "grad_norm": 1.7118204832077026, "learning_rate": 3.558831236897275e-05, "loss": 0.1438, "step": 11000 }, { "epoch": 0.5765199161425576, "eval_loss": 0.28642794489860535, "eval_runtime": 267.1324, "eval_samples_per_second": 7.453, "eval_steps_per_second": 1.243, "step": 11000 }, { "epoch": 0.5770440251572327, "grad_norm": 0.9733932018280029, "learning_rate": 3.557520964360587e-05, "loss": 0.1234, "step": 11010 }, { "epoch": 0.5775681341719078, "grad_norm": 1.7534810304641724, "learning_rate": 3.5562106918238995e-05, "loss": 0.1142, "step": 11020 }, { "epoch": 0.5780922431865828, "grad_norm": 1.526125192642212, "learning_rate": 3.554900419287212e-05, "loss": 0.1058, "step": 11030 }, { "epoch": 0.5786163522012578, "grad_norm": 1.795940637588501, "learning_rate": 3.553590146750524e-05, "loss": 0.1426, "step": 11040 }, { "epoch": 0.5791404612159329, "grad_norm": 1.329445242881775, "learning_rate": 3.552279874213837e-05, "loss": 0.157, "step": 11050 }, { "epoch": 0.5796645702306079, "grad_norm": 2.718548059463501, "learning_rate": 3.550969601677149e-05, "loss": 0.1365, "step": 11060 }, { "epoch": 0.5801886792452831, "grad_norm": 1.8392747640609741, "learning_rate": 3.549659329140461e-05, "loss": 0.1152, "step": 11070 }, { "epoch": 0.5807127882599581, "grad_norm": 1.6081047058105469, "learning_rate": 3.5483490566037735e-05, "loss": 0.1195, "step": 11080 }, { "epoch": 0.5812368972746331, "grad_norm": 1.4167786836624146, "learning_rate": 3.547038784067086e-05, "loss": 0.1254, "step": 11090 }, { "epoch": 0.5817610062893082, "grad_norm": 1.563456416130066, "learning_rate": 3.545728511530398e-05, "loss": 0.1209, "step": 11100 }, { "epoch": 0.5822851153039832, "grad_norm": 2.8495681285858154, "learning_rate": 3.5444182389937105e-05, "loss": 0.1335, "step": 11110 }, { "epoch": 0.5828092243186582, "grad_norm": 2.373828649520874, "learning_rate": 3.5431079664570236e-05, "loss": 0.1343, "step": 11120 }, { "epoch": 0.5833333333333334, "grad_norm": 1.545283555984497, "learning_rate": 3.541797693920336e-05, "loss": 0.1264, "step": 11130 }, { "epoch": 0.5838574423480084, "grad_norm": 1.7932777404785156, "learning_rate": 3.540487421383648e-05, "loss": 0.1819, "step": 11140 }, { "epoch": 0.5843815513626834, "grad_norm": 1.548215389251709, "learning_rate": 3.5391771488469606e-05, "loss": 0.1277, "step": 11150 }, { "epoch": 0.5849056603773585, "grad_norm": 1.0522581338882446, "learning_rate": 3.537866876310273e-05, "loss": 0.1297, "step": 11160 }, { "epoch": 0.5854297693920335, "grad_norm": 1.84707510471344, "learning_rate": 3.536556603773585e-05, "loss": 0.1104, "step": 11170 }, { "epoch": 0.5859538784067087, "grad_norm": 1.651559829711914, "learning_rate": 3.535246331236897e-05, "loss": 0.1137, "step": 11180 }, { "epoch": 0.5864779874213837, "grad_norm": 1.496799111366272, "learning_rate": 3.53393605870021e-05, "loss": 0.1152, "step": 11190 }, { "epoch": 0.5870020964360587, "grad_norm": 1.6322681903839111, "learning_rate": 3.532625786163522e-05, "loss": 0.1173, "step": 11200 }, { "epoch": 0.5875262054507338, "grad_norm": 1.550951361656189, "learning_rate": 3.5313155136268346e-05, "loss": 0.1068, "step": 11210 }, { "epoch": 0.5880503144654088, "grad_norm": 2.4538066387176514, "learning_rate": 3.530005241090147e-05, "loss": 0.1406, "step": 11220 }, { "epoch": 0.5885744234800838, "grad_norm": 1.4006059169769287, "learning_rate": 3.528694968553459e-05, "loss": 0.1141, "step": 11230 }, { "epoch": 0.589098532494759, "grad_norm": 1.2701703310012817, "learning_rate": 3.5273846960167716e-05, "loss": 0.1179, "step": 11240 }, { "epoch": 0.589622641509434, "grad_norm": 1.4434823989868164, "learning_rate": 3.526074423480084e-05, "loss": 0.1044, "step": 11250 }, { "epoch": 0.590146750524109, "grad_norm": 2.028015613555908, "learning_rate": 3.524764150943396e-05, "loss": 0.1306, "step": 11260 }, { "epoch": 0.5906708595387841, "grad_norm": 1.5259170532226562, "learning_rate": 3.5234538784067086e-05, "loss": 0.0971, "step": 11270 }, { "epoch": 0.5911949685534591, "grad_norm": 1.5181983709335327, "learning_rate": 3.5221436058700216e-05, "loss": 0.1125, "step": 11280 }, { "epoch": 0.5917190775681341, "grad_norm": 1.5904532670974731, "learning_rate": 3.520833333333334e-05, "loss": 0.141, "step": 11290 }, { "epoch": 0.5922431865828093, "grad_norm": 1.1001319885253906, "learning_rate": 3.5195230607966456e-05, "loss": 0.0911, "step": 11300 }, { "epoch": 0.5927672955974843, "grad_norm": 1.2055481672286987, "learning_rate": 3.518212788259958e-05, "loss": 0.1525, "step": 11310 }, { "epoch": 0.5932914046121593, "grad_norm": 1.7501437664031982, "learning_rate": 3.51690251572327e-05, "loss": 0.1298, "step": 11320 }, { "epoch": 0.5938155136268344, "grad_norm": 1.5523266792297363, "learning_rate": 3.515592243186583e-05, "loss": 0.1222, "step": 11330 }, { "epoch": 0.5943396226415094, "grad_norm": 1.746579647064209, "learning_rate": 3.514281970649895e-05, "loss": 0.1188, "step": 11340 }, { "epoch": 0.5948637316561844, "grad_norm": 2.078474283218384, "learning_rate": 3.512971698113208e-05, "loss": 0.1608, "step": 11350 }, { "epoch": 0.5953878406708596, "grad_norm": 1.7152198553085327, "learning_rate": 3.5116614255765204e-05, "loss": 0.1422, "step": 11360 }, { "epoch": 0.5959119496855346, "grad_norm": 3.1155312061309814, "learning_rate": 3.510351153039833e-05, "loss": 0.1311, "step": 11370 }, { "epoch": 0.5964360587002097, "grad_norm": 2.6921157836914062, "learning_rate": 3.509040880503145e-05, "loss": 0.1245, "step": 11380 }, { "epoch": 0.5969601677148847, "grad_norm": 1.7192963361740112, "learning_rate": 3.5077306079664574e-05, "loss": 0.1133, "step": 11390 }, { "epoch": 0.5974842767295597, "grad_norm": 2.002624273300171, "learning_rate": 3.50642033542977e-05, "loss": 0.1914, "step": 11400 }, { "epoch": 0.5980083857442348, "grad_norm": 2.982755184173584, "learning_rate": 3.505110062893082e-05, "loss": 0.1263, "step": 11410 }, { "epoch": 0.5985324947589099, "grad_norm": 1.5957186222076416, "learning_rate": 3.5037997903563944e-05, "loss": 0.1042, "step": 11420 }, { "epoch": 0.5990566037735849, "grad_norm": 1.106004238128662, "learning_rate": 3.502489517819707e-05, "loss": 0.1075, "step": 11430 }, { "epoch": 0.59958071278826, "grad_norm": 0.960927426815033, "learning_rate": 3.501179245283019e-05, "loss": 0.1113, "step": 11440 }, { "epoch": 0.600104821802935, "grad_norm": 1.5358039140701294, "learning_rate": 3.4998689727463314e-05, "loss": 0.1175, "step": 11450 }, { "epoch": 0.60062893081761, "grad_norm": 2.1409425735473633, "learning_rate": 3.498558700209644e-05, "loss": 0.1279, "step": 11460 }, { "epoch": 0.6011530398322851, "grad_norm": 1.1909631490707397, "learning_rate": 3.497248427672956e-05, "loss": 0.1417, "step": 11470 }, { "epoch": 0.6016771488469602, "grad_norm": 1.6865551471710205, "learning_rate": 3.4959381551362684e-05, "loss": 0.1386, "step": 11480 }, { "epoch": 0.6022012578616353, "grad_norm": 1.3004859685897827, "learning_rate": 3.494627882599581e-05, "loss": 0.147, "step": 11490 }, { "epoch": 0.6027253668763103, "grad_norm": 1.1438429355621338, "learning_rate": 3.493317610062893e-05, "loss": 0.1137, "step": 11500 }, { "epoch": 0.6032494758909853, "grad_norm": 1.17679762840271, "learning_rate": 3.492007337526206e-05, "loss": 0.1251, "step": 11510 }, { "epoch": 0.6037735849056604, "grad_norm": 1.6681784391403198, "learning_rate": 3.4906970649895184e-05, "loss": 0.1186, "step": 11520 }, { "epoch": 0.6042976939203354, "grad_norm": 1.6447416543960571, "learning_rate": 3.489386792452831e-05, "loss": 0.1433, "step": 11530 }, { "epoch": 0.6048218029350105, "grad_norm": 1.9654818773269653, "learning_rate": 3.4880765199161424e-05, "loss": 0.1179, "step": 11540 }, { "epoch": 0.6053459119496856, "grad_norm": 1.8266123533248901, "learning_rate": 3.486766247379455e-05, "loss": 0.1201, "step": 11550 }, { "epoch": 0.6058700209643606, "grad_norm": 1.8127845525741577, "learning_rate": 3.485455974842767e-05, "loss": 0.12, "step": 11560 }, { "epoch": 0.6063941299790356, "grad_norm": 1.7763968706130981, "learning_rate": 3.4841457023060795e-05, "loss": 0.1157, "step": 11570 }, { "epoch": 0.6069182389937107, "grad_norm": 1.6913458108901978, "learning_rate": 3.4828354297693925e-05, "loss": 0.136, "step": 11580 }, { "epoch": 0.6074423480083857, "grad_norm": 2.5830845832824707, "learning_rate": 3.481525157232705e-05, "loss": 0.1056, "step": 11590 }, { "epoch": 0.6079664570230608, "grad_norm": 2.0478758811950684, "learning_rate": 3.480214884696017e-05, "loss": 0.154, "step": 11600 }, { "epoch": 0.6084905660377359, "grad_norm": 1.382096529006958, "learning_rate": 3.4789046121593295e-05, "loss": 0.1139, "step": 11610 }, { "epoch": 0.6090146750524109, "grad_norm": 1.7384464740753174, "learning_rate": 3.477594339622642e-05, "loss": 0.1505, "step": 11620 }, { "epoch": 0.609538784067086, "grad_norm": 0.7147114276885986, "learning_rate": 3.476284067085954e-05, "loss": 0.1231, "step": 11630 }, { "epoch": 0.610062893081761, "grad_norm": 1.229019284248352, "learning_rate": 3.4749737945492665e-05, "loss": 0.1408, "step": 11640 }, { "epoch": 0.610587002096436, "grad_norm": 1.091243028640747, "learning_rate": 3.473663522012579e-05, "loss": 0.094, "step": 11650 }, { "epoch": 0.6111111111111112, "grad_norm": 1.5786937475204468, "learning_rate": 3.472353249475891e-05, "loss": 0.1862, "step": 11660 }, { "epoch": 0.6116352201257862, "grad_norm": 1.2560791969299316, "learning_rate": 3.4710429769392035e-05, "loss": 0.1067, "step": 11670 }, { "epoch": 0.6121593291404612, "grad_norm": 1.329825520515442, "learning_rate": 3.469732704402516e-05, "loss": 0.1226, "step": 11680 }, { "epoch": 0.6126834381551363, "grad_norm": 2.251732349395752, "learning_rate": 3.468422431865828e-05, "loss": 0.144, "step": 11690 }, { "epoch": 0.6132075471698113, "grad_norm": 1.7934879064559937, "learning_rate": 3.4671121593291405e-05, "loss": 0.1501, "step": 11700 }, { "epoch": 0.6137316561844863, "grad_norm": 1.642490267753601, "learning_rate": 3.465801886792453e-05, "loss": 0.126, "step": 11710 }, { "epoch": 0.6142557651991615, "grad_norm": 1.3452117443084717, "learning_rate": 3.464491614255765e-05, "loss": 0.1075, "step": 11720 }, { "epoch": 0.6147798742138365, "grad_norm": 1.608151912689209, "learning_rate": 3.4631813417190776e-05, "loss": 0.1224, "step": 11730 }, { "epoch": 0.6153039832285115, "grad_norm": 0.9076985120773315, "learning_rate": 3.4618710691823906e-05, "loss": 0.1444, "step": 11740 }, { "epoch": 0.6158280922431866, "grad_norm": 0.8261033296585083, "learning_rate": 3.460560796645703e-05, "loss": 0.119, "step": 11750 }, { "epoch": 0.6163522012578616, "grad_norm": 2.104185104370117, "learning_rate": 3.459250524109015e-05, "loss": 0.1415, "step": 11760 }, { "epoch": 0.6168763102725366, "grad_norm": 0.9446232318878174, "learning_rate": 3.4579402515723276e-05, "loss": 0.1156, "step": 11770 }, { "epoch": 0.6174004192872118, "grad_norm": 2.3821616172790527, "learning_rate": 3.456629979035639e-05, "loss": 0.1542, "step": 11780 }, { "epoch": 0.6179245283018868, "grad_norm": 2.2270455360412598, "learning_rate": 3.4553197064989516e-05, "loss": 0.1382, "step": 11790 }, { "epoch": 0.6184486373165619, "grad_norm": 2.193085193634033, "learning_rate": 3.454009433962264e-05, "loss": 0.1374, "step": 11800 }, { "epoch": 0.6189727463312369, "grad_norm": 1.7484729290008545, "learning_rate": 3.452699161425576e-05, "loss": 0.1238, "step": 11810 }, { "epoch": 0.6194968553459119, "grad_norm": 2.2514584064483643, "learning_rate": 3.451388888888889e-05, "loss": 0.1478, "step": 11820 }, { "epoch": 0.620020964360587, "grad_norm": 1.5764837265014648, "learning_rate": 3.4500786163522016e-05, "loss": 0.1077, "step": 11830 }, { "epoch": 0.6205450733752621, "grad_norm": 1.4490690231323242, "learning_rate": 3.448768343815514e-05, "loss": 0.1195, "step": 11840 }, { "epoch": 0.6210691823899371, "grad_norm": 1.7251434326171875, "learning_rate": 3.447458071278826e-05, "loss": 0.1091, "step": 11850 }, { "epoch": 0.6215932914046122, "grad_norm": 1.4244098663330078, "learning_rate": 3.4461477987421386e-05, "loss": 0.1146, "step": 11860 }, { "epoch": 0.6221174004192872, "grad_norm": 0.8161481022834778, "learning_rate": 3.444837526205451e-05, "loss": 0.129, "step": 11870 }, { "epoch": 0.6226415094339622, "grad_norm": 1.3293105363845825, "learning_rate": 3.443527253668763e-05, "loss": 0.1029, "step": 11880 }, { "epoch": 0.6231656184486373, "grad_norm": 13.429025650024414, "learning_rate": 3.4422169811320757e-05, "loss": 0.1325, "step": 11890 }, { "epoch": 0.6236897274633124, "grad_norm": 1.104237675666809, "learning_rate": 3.440906708595388e-05, "loss": 0.1125, "step": 11900 }, { "epoch": 0.6242138364779874, "grad_norm": 1.6588174104690552, "learning_rate": 3.4395964360587e-05, "loss": 0.1143, "step": 11910 }, { "epoch": 0.6247379454926625, "grad_norm": 2.6823654174804688, "learning_rate": 3.438286163522013e-05, "loss": 0.1536, "step": 11920 }, { "epoch": 0.6252620545073375, "grad_norm": 1.737794280052185, "learning_rate": 3.436975890985325e-05, "loss": 0.1477, "step": 11930 }, { "epoch": 0.6257861635220126, "grad_norm": 1.0251657962799072, "learning_rate": 3.4356656184486373e-05, "loss": 0.1084, "step": 11940 }, { "epoch": 0.6263102725366876, "grad_norm": 2.0638644695281982, "learning_rate": 3.43435534591195e-05, "loss": 0.1675, "step": 11950 }, { "epoch": 0.6268343815513627, "grad_norm": 1.1078050136566162, "learning_rate": 3.433045073375262e-05, "loss": 0.1051, "step": 11960 }, { "epoch": 0.6273584905660378, "grad_norm": 1.5060793161392212, "learning_rate": 3.4317348008385744e-05, "loss": 0.1129, "step": 11970 }, { "epoch": 0.6278825995807128, "grad_norm": 1.2375575304031372, "learning_rate": 3.4304245283018874e-05, "loss": 0.1262, "step": 11980 }, { "epoch": 0.6284067085953878, "grad_norm": 0.9460673332214355, "learning_rate": 3.4291142557652e-05, "loss": 0.1568, "step": 11990 }, { "epoch": 0.6289308176100629, "grad_norm": 2.6357972621917725, "learning_rate": 3.427803983228512e-05, "loss": 0.1254, "step": 12000 }, { "epoch": 0.6289308176100629, "eval_loss": 0.2810102105140686, "eval_runtime": 267.5223, "eval_samples_per_second": 7.442, "eval_steps_per_second": 1.241, "step": 12000 }, { "epoch": 0.6294549266247379, "grad_norm": 1.6299917697906494, "learning_rate": 3.426493710691824e-05, "loss": 0.1376, "step": 12010 }, { "epoch": 0.629979035639413, "grad_norm": 1.9334322214126587, "learning_rate": 3.425183438155136e-05, "loss": 0.0876, "step": 12020 }, { "epoch": 0.6305031446540881, "grad_norm": 1.3271043300628662, "learning_rate": 3.4238731656184484e-05, "loss": 0.1258, "step": 12030 }, { "epoch": 0.6310272536687631, "grad_norm": 1.4171603918075562, "learning_rate": 3.422562893081761e-05, "loss": 0.1236, "step": 12040 }, { "epoch": 0.6315513626834381, "grad_norm": 2.808084011077881, "learning_rate": 3.421252620545074e-05, "loss": 0.1272, "step": 12050 }, { "epoch": 0.6320754716981132, "grad_norm": 1.4960280656814575, "learning_rate": 3.419942348008386e-05, "loss": 0.1629, "step": 12060 }, { "epoch": 0.6325995807127882, "grad_norm": 2.7985665798187256, "learning_rate": 3.4186320754716984e-05, "loss": 0.1066, "step": 12070 }, { "epoch": 0.6331236897274634, "grad_norm": 0.7567781805992126, "learning_rate": 3.417321802935011e-05, "loss": 0.0965, "step": 12080 }, { "epoch": 0.6336477987421384, "grad_norm": 1.7532941102981567, "learning_rate": 3.416011530398323e-05, "loss": 0.1472, "step": 12090 }, { "epoch": 0.6341719077568134, "grad_norm": 1.5983638763427734, "learning_rate": 3.4147012578616354e-05, "loss": 0.1119, "step": 12100 }, { "epoch": 0.6346960167714885, "grad_norm": 1.1211780309677124, "learning_rate": 3.413390985324948e-05, "loss": 0.1433, "step": 12110 }, { "epoch": 0.6352201257861635, "grad_norm": 1.1015443801879883, "learning_rate": 3.41208071278826e-05, "loss": 0.1333, "step": 12120 }, { "epoch": 0.6357442348008385, "grad_norm": 2.756619453430176, "learning_rate": 3.4107704402515725e-05, "loss": 0.1155, "step": 12130 }, { "epoch": 0.6362683438155137, "grad_norm": 1.5897578001022339, "learning_rate": 3.409460167714885e-05, "loss": 0.112, "step": 12140 }, { "epoch": 0.6367924528301887, "grad_norm": 1.2167946100234985, "learning_rate": 3.408149895178197e-05, "loss": 0.1093, "step": 12150 }, { "epoch": 0.6373165618448637, "grad_norm": 1.0480616092681885, "learning_rate": 3.4068396226415095e-05, "loss": 0.1167, "step": 12160 }, { "epoch": 0.6378406708595388, "grad_norm": 1.782877802848816, "learning_rate": 3.405529350104822e-05, "loss": 0.1219, "step": 12170 }, { "epoch": 0.6383647798742138, "grad_norm": 2.3879051208496094, "learning_rate": 3.404219077568134e-05, "loss": 0.114, "step": 12180 }, { "epoch": 0.6388888888888888, "grad_norm": 1.9370415210723877, "learning_rate": 3.4029088050314465e-05, "loss": 0.1186, "step": 12190 }, { "epoch": 0.639412997903564, "grad_norm": 1.4608991146087646, "learning_rate": 3.401598532494759e-05, "loss": 0.1196, "step": 12200 }, { "epoch": 0.639937106918239, "grad_norm": 2.2498528957366943, "learning_rate": 3.400288259958072e-05, "loss": 0.125, "step": 12210 }, { "epoch": 0.640461215932914, "grad_norm": 2.014331579208374, "learning_rate": 3.398977987421384e-05, "loss": 0.1699, "step": 12220 }, { "epoch": 0.6409853249475891, "grad_norm": 1.862320065498352, "learning_rate": 3.3976677148846965e-05, "loss": 0.1182, "step": 12230 }, { "epoch": 0.6415094339622641, "grad_norm": 0.9242523312568665, "learning_rate": 3.396357442348009e-05, "loss": 0.1136, "step": 12240 }, { "epoch": 0.6420335429769392, "grad_norm": 1.7910327911376953, "learning_rate": 3.3950471698113205e-05, "loss": 0.1173, "step": 12250 }, { "epoch": 0.6425576519916143, "grad_norm": 1.989595651626587, "learning_rate": 3.393736897274633e-05, "loss": 0.1306, "step": 12260 }, { "epoch": 0.6430817610062893, "grad_norm": 1.5238025188446045, "learning_rate": 3.392426624737945e-05, "loss": 0.1331, "step": 12270 }, { "epoch": 0.6436058700209644, "grad_norm": 3.9870452880859375, "learning_rate": 3.391116352201258e-05, "loss": 0.129, "step": 12280 }, { "epoch": 0.6441299790356394, "grad_norm": 1.6588159799575806, "learning_rate": 3.3898060796645705e-05, "loss": 0.1239, "step": 12290 }, { "epoch": 0.6446540880503144, "grad_norm": 5.81835412979126, "learning_rate": 3.388495807127883e-05, "loss": 0.1182, "step": 12300 }, { "epoch": 0.6451781970649895, "grad_norm": 1.9480522871017456, "learning_rate": 3.387185534591195e-05, "loss": 0.1077, "step": 12310 }, { "epoch": 0.6457023060796646, "grad_norm": 1.4687237739562988, "learning_rate": 3.3858752620545076e-05, "loss": 0.1014, "step": 12320 }, { "epoch": 0.6462264150943396, "grad_norm": 1.8134074211120605, "learning_rate": 3.38456498951782e-05, "loss": 0.0997, "step": 12330 }, { "epoch": 0.6467505241090147, "grad_norm": 0.7567766308784485, "learning_rate": 3.383254716981132e-05, "loss": 0.1545, "step": 12340 }, { "epoch": 0.6472746331236897, "grad_norm": 1.324652075767517, "learning_rate": 3.3819444444444446e-05, "loss": 0.1063, "step": 12350 }, { "epoch": 0.6477987421383647, "grad_norm": 1.9718384742736816, "learning_rate": 3.380634171907757e-05, "loss": 0.1065, "step": 12360 }, { "epoch": 0.6483228511530398, "grad_norm": 1.6241679191589355, "learning_rate": 3.379323899371069e-05, "loss": 0.1323, "step": 12370 }, { "epoch": 0.6488469601677149, "grad_norm": 1.7419811487197876, "learning_rate": 3.3780136268343816e-05, "loss": 0.1247, "step": 12380 }, { "epoch": 0.64937106918239, "grad_norm": 2.216460943222046, "learning_rate": 3.376703354297694e-05, "loss": 0.0928, "step": 12390 }, { "epoch": 0.649895178197065, "grad_norm": 1.5031715631484985, "learning_rate": 3.375393081761006e-05, "loss": 0.1595, "step": 12400 }, { "epoch": 0.65041928721174, "grad_norm": 1.9624732732772827, "learning_rate": 3.3740828092243186e-05, "loss": 0.1122, "step": 12410 }, { "epoch": 0.6509433962264151, "grad_norm": 1.3125239610671997, "learning_rate": 3.372772536687631e-05, "loss": 0.1352, "step": 12420 }, { "epoch": 0.6514675052410901, "grad_norm": 2.7664361000061035, "learning_rate": 3.371462264150943e-05, "loss": 0.1173, "step": 12430 }, { "epoch": 0.6519916142557652, "grad_norm": 1.2956434488296509, "learning_rate": 3.370151991614256e-05, "loss": 0.1383, "step": 12440 }, { "epoch": 0.6525157232704403, "grad_norm": 1.1372560262680054, "learning_rate": 3.3688417190775686e-05, "loss": 0.1002, "step": 12450 }, { "epoch": 0.6530398322851153, "grad_norm": 4.1682353019714355, "learning_rate": 3.367531446540881e-05, "loss": 0.1352, "step": 12460 }, { "epoch": 0.6535639412997903, "grad_norm": 1.243196964263916, "learning_rate": 3.366221174004193e-05, "loss": 0.1224, "step": 12470 }, { "epoch": 0.6540880503144654, "grad_norm": 1.4892295598983765, "learning_rate": 3.3649109014675057e-05, "loss": 0.1316, "step": 12480 }, { "epoch": 0.6546121593291404, "grad_norm": 1.3253917694091797, "learning_rate": 3.363600628930817e-05, "loss": 0.1142, "step": 12490 }, { "epoch": 0.6551362683438156, "grad_norm": 1.1935123205184937, "learning_rate": 3.3622903563941297e-05, "loss": 0.1323, "step": 12500 }, { "epoch": 0.6556603773584906, "grad_norm": 1.5510022640228271, "learning_rate": 3.360980083857443e-05, "loss": 0.1209, "step": 12510 }, { "epoch": 0.6561844863731656, "grad_norm": 2.739384174346924, "learning_rate": 3.359669811320755e-05, "loss": 0.086, "step": 12520 }, { "epoch": 0.6567085953878407, "grad_norm": 1.8388934135437012, "learning_rate": 3.3583595387840674e-05, "loss": 0.1352, "step": 12530 }, { "epoch": 0.6572327044025157, "grad_norm": 1.7754712104797363, "learning_rate": 3.35704926624738e-05, "loss": 0.1606, "step": 12540 }, { "epoch": 0.6577568134171907, "grad_norm": 1.7802470922470093, "learning_rate": 3.355738993710692e-05, "loss": 0.1453, "step": 12550 }, { "epoch": 0.6582809224318659, "grad_norm": 2.5520379543304443, "learning_rate": 3.3544287211740044e-05, "loss": 0.1198, "step": 12560 }, { "epoch": 0.6588050314465409, "grad_norm": 1.5761022567749023, "learning_rate": 3.353118448637317e-05, "loss": 0.1368, "step": 12570 }, { "epoch": 0.6593291404612159, "grad_norm": 1.0065501928329468, "learning_rate": 3.351808176100629e-05, "loss": 0.135, "step": 12580 }, { "epoch": 0.659853249475891, "grad_norm": 1.7206062078475952, "learning_rate": 3.3504979035639414e-05, "loss": 0.107, "step": 12590 }, { "epoch": 0.660377358490566, "grad_norm": 1.093377947807312, "learning_rate": 3.3491876310272544e-05, "loss": 0.1611, "step": 12600 }, { "epoch": 0.660901467505241, "grad_norm": 1.6093024015426636, "learning_rate": 3.347877358490566e-05, "loss": 0.1176, "step": 12610 }, { "epoch": 0.6614255765199162, "grad_norm": 0.8911046981811523, "learning_rate": 3.3465670859538784e-05, "loss": 0.1103, "step": 12620 }, { "epoch": 0.6619496855345912, "grad_norm": 2.1810872554779053, "learning_rate": 3.345256813417191e-05, "loss": 0.1163, "step": 12630 }, { "epoch": 0.6624737945492662, "grad_norm": 1.4465842247009277, "learning_rate": 3.343946540880503e-05, "loss": 0.1093, "step": 12640 }, { "epoch": 0.6629979035639413, "grad_norm": 1.8797701597213745, "learning_rate": 3.3426362683438154e-05, "loss": 0.1508, "step": 12650 }, { "epoch": 0.6635220125786163, "grad_norm": 2.4389729499816895, "learning_rate": 3.341325995807128e-05, "loss": 0.1191, "step": 12660 }, { "epoch": 0.6640461215932913, "grad_norm": 1.7194629907608032, "learning_rate": 3.340015723270441e-05, "loss": 0.1318, "step": 12670 }, { "epoch": 0.6645702306079665, "grad_norm": 1.6836072206497192, "learning_rate": 3.338705450733753e-05, "loss": 0.1239, "step": 12680 }, { "epoch": 0.6650943396226415, "grad_norm": 0.973906934261322, "learning_rate": 3.3373951781970654e-05, "loss": 0.1049, "step": 12690 }, { "epoch": 0.6656184486373166, "grad_norm": 2.0951108932495117, "learning_rate": 3.336084905660378e-05, "loss": 0.1372, "step": 12700 }, { "epoch": 0.6661425576519916, "grad_norm": 1.2338130474090576, "learning_rate": 3.33477463312369e-05, "loss": 0.0986, "step": 12710 }, { "epoch": 0.6666666666666666, "grad_norm": 1.3110014200210571, "learning_rate": 3.3334643605870025e-05, "loss": 0.1211, "step": 12720 }, { "epoch": 0.6671907756813418, "grad_norm": 1.1576558351516724, "learning_rate": 3.332154088050314e-05, "loss": 0.1298, "step": 12730 }, { "epoch": 0.6677148846960168, "grad_norm": 1.0870299339294434, "learning_rate": 3.3308438155136265e-05, "loss": 0.1313, "step": 12740 }, { "epoch": 0.6682389937106918, "grad_norm": 1.5776509046554565, "learning_rate": 3.3295335429769395e-05, "loss": 0.0887, "step": 12750 }, { "epoch": 0.6687631027253669, "grad_norm": 1.6563470363616943, "learning_rate": 3.328223270440252e-05, "loss": 0.1193, "step": 12760 }, { "epoch": 0.6692872117400419, "grad_norm": 1.5006121397018433, "learning_rate": 3.326912997903564e-05, "loss": 0.1447, "step": 12770 }, { "epoch": 0.6698113207547169, "grad_norm": 2.2964420318603516, "learning_rate": 3.3256027253668765e-05, "loss": 0.1217, "step": 12780 }, { "epoch": 0.6703354297693921, "grad_norm": 1.14298677444458, "learning_rate": 3.324292452830189e-05, "loss": 0.1285, "step": 12790 }, { "epoch": 0.6708595387840671, "grad_norm": 1.4093611240386963, "learning_rate": 3.322982180293501e-05, "loss": 0.1114, "step": 12800 }, { "epoch": 0.6713836477987422, "grad_norm": 5.533634662628174, "learning_rate": 3.3216719077568135e-05, "loss": 0.1171, "step": 12810 }, { "epoch": 0.6719077568134172, "grad_norm": 2.222405433654785, "learning_rate": 3.320361635220126e-05, "loss": 0.1288, "step": 12820 }, { "epoch": 0.6724318658280922, "grad_norm": 1.3084903955459595, "learning_rate": 3.319051362683439e-05, "loss": 0.1397, "step": 12830 }, { "epoch": 0.6729559748427673, "grad_norm": 1.6426416635513306, "learning_rate": 3.317741090146751e-05, "loss": 0.1266, "step": 12840 }, { "epoch": 0.6734800838574424, "grad_norm": 0.9996996521949768, "learning_rate": 3.316430817610063e-05, "loss": 0.1363, "step": 12850 }, { "epoch": 0.6740041928721174, "grad_norm": 0.8206360936164856, "learning_rate": 3.315120545073375e-05, "loss": 0.1161, "step": 12860 }, { "epoch": 0.6745283018867925, "grad_norm": 2.556631326675415, "learning_rate": 3.3138102725366875e-05, "loss": 0.1186, "step": 12870 }, { "epoch": 0.6750524109014675, "grad_norm": 1.9859380722045898, "learning_rate": 3.3125e-05, "loss": 0.1142, "step": 12880 }, { "epoch": 0.6755765199161425, "grad_norm": 2.254906177520752, "learning_rate": 3.311189727463312e-05, "loss": 0.1166, "step": 12890 }, { "epoch": 0.6761006289308176, "grad_norm": 2.126537799835205, "learning_rate": 3.3098794549266246e-05, "loss": 0.0938, "step": 12900 }, { "epoch": 0.6766247379454927, "grad_norm": 1.368828535079956, "learning_rate": 3.3085691823899376e-05, "loss": 0.1353, "step": 12910 }, { "epoch": 0.6771488469601677, "grad_norm": 1.1199910640716553, "learning_rate": 3.30725890985325e-05, "loss": 0.1056, "step": 12920 }, { "epoch": 0.6776729559748428, "grad_norm": 2.5403850078582764, "learning_rate": 3.305948637316562e-05, "loss": 0.1151, "step": 12930 }, { "epoch": 0.6781970649895178, "grad_norm": 5.520144939422607, "learning_rate": 3.3046383647798746e-05, "loss": 0.1185, "step": 12940 }, { "epoch": 0.6787211740041929, "grad_norm": 2.1471378803253174, "learning_rate": 3.303328092243187e-05, "loss": 0.1044, "step": 12950 }, { "epoch": 0.6792452830188679, "grad_norm": 1.0975602865219116, "learning_rate": 3.302017819706499e-05, "loss": 0.1319, "step": 12960 }, { "epoch": 0.679769392033543, "grad_norm": 1.5250160694122314, "learning_rate": 3.300707547169811e-05, "loss": 0.1387, "step": 12970 }, { "epoch": 0.6802935010482181, "grad_norm": 1.1736054420471191, "learning_rate": 3.299397274633124e-05, "loss": 0.1229, "step": 12980 }, { "epoch": 0.6808176100628931, "grad_norm": 1.3668029308319092, "learning_rate": 3.298087002096436e-05, "loss": 0.1206, "step": 12990 }, { "epoch": 0.6813417190775681, "grad_norm": 1.4439727067947388, "learning_rate": 3.2967767295597486e-05, "loss": 0.1131, "step": 13000 }, { "epoch": 0.6813417190775681, "eval_loss": 0.28294411301612854, "eval_runtime": 267.3683, "eval_samples_per_second": 7.447, "eval_steps_per_second": 1.242, "step": 13000 }, { "epoch": 0.6818658280922432, "grad_norm": 0.6069740056991577, "learning_rate": 3.295466457023061e-05, "loss": 0.148, "step": 13010 }, { "epoch": 0.6823899371069182, "grad_norm": 3.8916685581207275, "learning_rate": 3.294156184486373e-05, "loss": 0.1325, "step": 13020 }, { "epoch": 0.6829140461215933, "grad_norm": 0.8557778596878052, "learning_rate": 3.2928459119496856e-05, "loss": 0.1107, "step": 13030 }, { "epoch": 0.6834381551362684, "grad_norm": 1.2309248447418213, "learning_rate": 3.291535639412998e-05, "loss": 0.1091, "step": 13040 }, { "epoch": 0.6839622641509434, "grad_norm": 0.9375829100608826, "learning_rate": 3.29022536687631e-05, "loss": 0.1033, "step": 13050 }, { "epoch": 0.6844863731656184, "grad_norm": 1.7269313335418701, "learning_rate": 3.2889150943396227e-05, "loss": 0.1076, "step": 13060 }, { "epoch": 0.6850104821802935, "grad_norm": 1.9099595546722412, "learning_rate": 3.287604821802936e-05, "loss": 0.1242, "step": 13070 }, { "epoch": 0.6855345911949685, "grad_norm": 1.4554835557937622, "learning_rate": 3.286294549266248e-05, "loss": 0.1332, "step": 13080 }, { "epoch": 0.6860587002096437, "grad_norm": 1.0192896127700806, "learning_rate": 3.28498427672956e-05, "loss": 0.1317, "step": 13090 }, { "epoch": 0.6865828092243187, "grad_norm": 1.4376505613327026, "learning_rate": 3.283674004192872e-05, "loss": 0.0972, "step": 13100 }, { "epoch": 0.6871069182389937, "grad_norm": 2.5238263607025146, "learning_rate": 3.2823637316561843e-05, "loss": 0.1481, "step": 13110 }, { "epoch": 0.6876310272536688, "grad_norm": 1.4387770891189575, "learning_rate": 3.281053459119497e-05, "loss": 0.1134, "step": 13120 }, { "epoch": 0.6881551362683438, "grad_norm": 1.2158639430999756, "learning_rate": 3.279743186582809e-05, "loss": 0.0965, "step": 13130 }, { "epoch": 0.6886792452830188, "grad_norm": 1.5913608074188232, "learning_rate": 3.278432914046122e-05, "loss": 0.1234, "step": 13140 }, { "epoch": 0.689203354297694, "grad_norm": 1.5430805683135986, "learning_rate": 3.2771226415094344e-05, "loss": 0.1157, "step": 13150 }, { "epoch": 0.689727463312369, "grad_norm": 2.6382107734680176, "learning_rate": 3.275812368972747e-05, "loss": 0.1237, "step": 13160 }, { "epoch": 0.690251572327044, "grad_norm": 1.8848053216934204, "learning_rate": 3.274502096436059e-05, "loss": 0.1117, "step": 13170 }, { "epoch": 0.6907756813417191, "grad_norm": 2.1337838172912598, "learning_rate": 3.2731918238993714e-05, "loss": 0.1371, "step": 13180 }, { "epoch": 0.6912997903563941, "grad_norm": 2.070481538772583, "learning_rate": 3.271881551362684e-05, "loss": 0.1269, "step": 13190 }, { "epoch": 0.6918238993710691, "grad_norm": 2.1413769721984863, "learning_rate": 3.270571278825996e-05, "loss": 0.1789, "step": 13200 }, { "epoch": 0.6923480083857443, "grad_norm": 1.6787424087524414, "learning_rate": 3.2692610062893084e-05, "loss": 0.1399, "step": 13210 }, { "epoch": 0.6928721174004193, "grad_norm": 2.589695453643799, "learning_rate": 3.267950733752621e-05, "loss": 0.1223, "step": 13220 }, { "epoch": 0.6933962264150944, "grad_norm": 2.0658650398254395, "learning_rate": 3.266640461215933e-05, "loss": 0.1583, "step": 13230 }, { "epoch": 0.6939203354297694, "grad_norm": 1.8367148637771606, "learning_rate": 3.2653301886792454e-05, "loss": 0.124, "step": 13240 }, { "epoch": 0.6944444444444444, "grad_norm": 1.4144045114517212, "learning_rate": 3.264019916142558e-05, "loss": 0.1161, "step": 13250 }, { "epoch": 0.6949685534591195, "grad_norm": 1.424119472503662, "learning_rate": 3.26270964360587e-05, "loss": 0.1167, "step": 13260 }, { "epoch": 0.6954926624737946, "grad_norm": 1.1759669780731201, "learning_rate": 3.2613993710691824e-05, "loss": 0.1342, "step": 13270 }, { "epoch": 0.6960167714884696, "grad_norm": 2.510288715362549, "learning_rate": 3.260089098532495e-05, "loss": 0.1149, "step": 13280 }, { "epoch": 0.6965408805031447, "grad_norm": 1.3063350915908813, "learning_rate": 3.258778825995807e-05, "loss": 0.1246, "step": 13290 }, { "epoch": 0.6970649895178197, "grad_norm": 1.7909855842590332, "learning_rate": 3.25746855345912e-05, "loss": 0.1009, "step": 13300 }, { "epoch": 0.6975890985324947, "grad_norm": 1.8222286701202393, "learning_rate": 3.2561582809224325e-05, "loss": 0.087, "step": 13310 }, { "epoch": 0.6981132075471698, "grad_norm": 1.255692720413208, "learning_rate": 3.254848008385745e-05, "loss": 0.1172, "step": 13320 }, { "epoch": 0.6986373165618449, "grad_norm": 1.0486621856689453, "learning_rate": 3.2535377358490565e-05, "loss": 0.1173, "step": 13330 }, { "epoch": 0.69916142557652, "grad_norm": 1.5670002698898315, "learning_rate": 3.252227463312369e-05, "loss": 0.1241, "step": 13340 }, { "epoch": 0.699685534591195, "grad_norm": 1.8074911832809448, "learning_rate": 3.250917190775681e-05, "loss": 0.124, "step": 13350 }, { "epoch": 0.70020964360587, "grad_norm": 1.7373123168945312, "learning_rate": 3.2496069182389935e-05, "loss": 0.1177, "step": 13360 }, { "epoch": 0.700733752620545, "grad_norm": 1.53976571559906, "learning_rate": 3.2482966457023065e-05, "loss": 0.1335, "step": 13370 }, { "epoch": 0.7012578616352201, "grad_norm": 1.9045443534851074, "learning_rate": 3.246986373165619e-05, "loss": 0.1128, "step": 13380 }, { "epoch": 0.7017819706498952, "grad_norm": 1.210325002670288, "learning_rate": 3.245676100628931e-05, "loss": 0.0985, "step": 13390 }, { "epoch": 0.7023060796645703, "grad_norm": 1.382736325263977, "learning_rate": 3.2443658280922435e-05, "loss": 0.1309, "step": 13400 }, { "epoch": 0.7028301886792453, "grad_norm": 1.24323570728302, "learning_rate": 3.243055555555556e-05, "loss": 0.1584, "step": 13410 }, { "epoch": 0.7033542976939203, "grad_norm": 2.1430201530456543, "learning_rate": 3.241745283018868e-05, "loss": 0.1114, "step": 13420 }, { "epoch": 0.7038784067085954, "grad_norm": 1.5643491744995117, "learning_rate": 3.2404350104821805e-05, "loss": 0.1459, "step": 13430 }, { "epoch": 0.7044025157232704, "grad_norm": 1.205976963043213, "learning_rate": 3.239124737945493e-05, "loss": 0.1064, "step": 13440 }, { "epoch": 0.7049266247379455, "grad_norm": 2.065765142440796, "learning_rate": 3.237814465408805e-05, "loss": 0.1261, "step": 13450 }, { "epoch": 0.7054507337526206, "grad_norm": 2.1515004634857178, "learning_rate": 3.2365041928721175e-05, "loss": 0.1169, "step": 13460 }, { "epoch": 0.7059748427672956, "grad_norm": 2.8570148944854736, "learning_rate": 3.23519392033543e-05, "loss": 0.1242, "step": 13470 }, { "epoch": 0.7064989517819706, "grad_norm": 1.6113414764404297, "learning_rate": 3.233883647798742e-05, "loss": 0.135, "step": 13480 }, { "epoch": 0.7070230607966457, "grad_norm": 1.531532645225525, "learning_rate": 3.2325733752620546e-05, "loss": 0.1511, "step": 13490 }, { "epoch": 0.7075471698113207, "grad_norm": 1.5004215240478516, "learning_rate": 3.231263102725367e-05, "loss": 0.1191, "step": 13500 }, { "epoch": 0.7080712788259959, "grad_norm": 2.088381767272949, "learning_rate": 3.229952830188679e-05, "loss": 0.1127, "step": 13510 }, { "epoch": 0.7085953878406709, "grad_norm": 1.1922684907913208, "learning_rate": 3.2286425576519916e-05, "loss": 0.123, "step": 13520 }, { "epoch": 0.7091194968553459, "grad_norm": 1.5962071418762207, "learning_rate": 3.2273322851153046e-05, "loss": 0.1197, "step": 13530 }, { "epoch": 0.709643605870021, "grad_norm": 1.5418871641159058, "learning_rate": 3.226022012578617e-05, "loss": 0.1137, "step": 13540 }, { "epoch": 0.710167714884696, "grad_norm": 1.5913938283920288, "learning_rate": 3.224711740041929e-05, "loss": 0.1222, "step": 13550 }, { "epoch": 0.710691823899371, "grad_norm": 2.4484283924102783, "learning_rate": 3.223401467505241e-05, "loss": 0.1021, "step": 13560 }, { "epoch": 0.7112159329140462, "grad_norm": 6.387475490570068, "learning_rate": 3.222091194968553e-05, "loss": 0.1205, "step": 13570 }, { "epoch": 0.7117400419287212, "grad_norm": 1.4460914134979248, "learning_rate": 3.2207809224318656e-05, "loss": 0.1423, "step": 13580 }, { "epoch": 0.7122641509433962, "grad_norm": 1.3883821964263916, "learning_rate": 3.219470649895178e-05, "loss": 0.1155, "step": 13590 }, { "epoch": 0.7127882599580713, "grad_norm": 2.029080390930176, "learning_rate": 3.218160377358491e-05, "loss": 0.1194, "step": 13600 }, { "epoch": 0.7133123689727463, "grad_norm": 2.9844512939453125, "learning_rate": 3.216850104821803e-05, "loss": 0.168, "step": 13610 }, { "epoch": 0.7138364779874213, "grad_norm": 11.359793663024902, "learning_rate": 3.2155398322851156e-05, "loss": 0.1327, "step": 13620 }, { "epoch": 0.7143605870020965, "grad_norm": 1.7264271974563599, "learning_rate": 3.214229559748428e-05, "loss": 0.1268, "step": 13630 }, { "epoch": 0.7148846960167715, "grad_norm": 2.224581718444824, "learning_rate": 3.21291928721174e-05, "loss": 0.1148, "step": 13640 }, { "epoch": 0.7154088050314465, "grad_norm": 2.717209577560425, "learning_rate": 3.2116090146750527e-05, "loss": 0.1401, "step": 13650 }, { "epoch": 0.7159329140461216, "grad_norm": 1.0973457098007202, "learning_rate": 3.210298742138365e-05, "loss": 0.1241, "step": 13660 }, { "epoch": 0.7164570230607966, "grad_norm": 2.276607036590576, "learning_rate": 3.208988469601677e-05, "loss": 0.1436, "step": 13670 }, { "epoch": 0.7169811320754716, "grad_norm": 1.4400893449783325, "learning_rate": 3.20767819706499e-05, "loss": 0.1324, "step": 13680 }, { "epoch": 0.7175052410901468, "grad_norm": 1.5834757089614868, "learning_rate": 3.206367924528302e-05, "loss": 0.1471, "step": 13690 }, { "epoch": 0.7180293501048218, "grad_norm": 1.8143632411956787, "learning_rate": 3.2050576519916144e-05, "loss": 0.1093, "step": 13700 }, { "epoch": 0.7185534591194969, "grad_norm": 1.4229177236557007, "learning_rate": 3.203747379454927e-05, "loss": 0.1209, "step": 13710 }, { "epoch": 0.7190775681341719, "grad_norm": 1.990929126739502, "learning_rate": 3.202437106918239e-05, "loss": 0.1138, "step": 13720 }, { "epoch": 0.7196016771488469, "grad_norm": 3.4440460205078125, "learning_rate": 3.2011268343815514e-05, "loss": 0.107, "step": 13730 }, { "epoch": 0.720125786163522, "grad_norm": 1.457261085510254, "learning_rate": 3.199816561844864e-05, "loss": 0.0956, "step": 13740 }, { "epoch": 0.7206498951781971, "grad_norm": 1.8652387857437134, "learning_rate": 3.198506289308176e-05, "loss": 0.1358, "step": 13750 }, { "epoch": 0.7211740041928721, "grad_norm": 1.6599705219268799, "learning_rate": 3.197196016771489e-05, "loss": 0.141, "step": 13760 }, { "epoch": 0.7216981132075472, "grad_norm": 1.345238208770752, "learning_rate": 3.1958857442348014e-05, "loss": 0.1321, "step": 13770 }, { "epoch": 0.7222222222222222, "grad_norm": 1.5770273208618164, "learning_rate": 3.194575471698114e-05, "loss": 0.0967, "step": 13780 }, { "epoch": 0.7227463312368972, "grad_norm": 1.219248652458191, "learning_rate": 3.193265199161426e-05, "loss": 0.1167, "step": 13790 }, { "epoch": 0.7232704402515723, "grad_norm": 2.964763641357422, "learning_rate": 3.191954926624738e-05, "loss": 0.1202, "step": 13800 }, { "epoch": 0.7237945492662474, "grad_norm": 1.5197694301605225, "learning_rate": 3.19064465408805e-05, "loss": 0.115, "step": 13810 }, { "epoch": 0.7243186582809225, "grad_norm": 2.9044370651245117, "learning_rate": 3.1893343815513624e-05, "loss": 0.1271, "step": 13820 }, { "epoch": 0.7248427672955975, "grad_norm": 1.1253836154937744, "learning_rate": 3.188024109014675e-05, "loss": 0.0998, "step": 13830 }, { "epoch": 0.7253668763102725, "grad_norm": 3.362396717071533, "learning_rate": 3.186713836477988e-05, "loss": 0.2023, "step": 13840 }, { "epoch": 0.7258909853249476, "grad_norm": 2.730292558670044, "learning_rate": 3.1854035639413e-05, "loss": 0.1528, "step": 13850 }, { "epoch": 0.7264150943396226, "grad_norm": 2.7650363445281982, "learning_rate": 3.1840932914046124e-05, "loss": 0.1211, "step": 13860 }, { "epoch": 0.7269392033542977, "grad_norm": 1.6592358350753784, "learning_rate": 3.182783018867925e-05, "loss": 0.1398, "step": 13870 }, { "epoch": 0.7274633123689728, "grad_norm": 1.5524930953979492, "learning_rate": 3.181472746331237e-05, "loss": 0.1147, "step": 13880 }, { "epoch": 0.7279874213836478, "grad_norm": 1.3090453147888184, "learning_rate": 3.1801624737945495e-05, "loss": 0.1217, "step": 13890 }, { "epoch": 0.7285115303983228, "grad_norm": 1.784716010093689, "learning_rate": 3.178852201257862e-05, "loss": 0.1172, "step": 13900 }, { "epoch": 0.7290356394129979, "grad_norm": 1.9261764287948608, "learning_rate": 3.177541928721174e-05, "loss": 0.1311, "step": 13910 }, { "epoch": 0.7295597484276729, "grad_norm": 1.0017802715301514, "learning_rate": 3.1762316561844865e-05, "loss": 0.0981, "step": 13920 }, { "epoch": 0.730083857442348, "grad_norm": 1.6215063333511353, "learning_rate": 3.174921383647799e-05, "loss": 0.1409, "step": 13930 }, { "epoch": 0.7306079664570231, "grad_norm": 3.0000991821289062, "learning_rate": 3.173611111111111e-05, "loss": 0.1398, "step": 13940 }, { "epoch": 0.7311320754716981, "grad_norm": 1.7880263328552246, "learning_rate": 3.1723008385744235e-05, "loss": 0.1173, "step": 13950 }, { "epoch": 0.7316561844863732, "grad_norm": 1.4684351682662964, "learning_rate": 3.170990566037736e-05, "loss": 0.1675, "step": 13960 }, { "epoch": 0.7321802935010482, "grad_norm": 1.1320511102676392, "learning_rate": 3.169680293501048e-05, "loss": 0.1285, "step": 13970 }, { "epoch": 0.7327044025157232, "grad_norm": 1.8147705793380737, "learning_rate": 3.1683700209643605e-05, "loss": 0.1178, "step": 13980 }, { "epoch": 0.7332285115303984, "grad_norm": 2.41721773147583, "learning_rate": 3.167059748427673e-05, "loss": 0.1055, "step": 13990 }, { "epoch": 0.7337526205450734, "grad_norm": 1.2500311136245728, "learning_rate": 3.165749475890986e-05, "loss": 0.0891, "step": 14000 }, { "epoch": 0.7337526205450734, "eval_loss": 0.2849844992160797, "eval_runtime": 267.9508, "eval_samples_per_second": 7.43, "eval_steps_per_second": 1.239, "step": 14000 }, { "epoch": 0.7342767295597484, "grad_norm": 1.110494613647461, "learning_rate": 3.164439203354298e-05, "loss": 0.109, "step": 14010 }, { "epoch": 0.7348008385744235, "grad_norm": 1.408320665359497, "learning_rate": 3.1631289308176105e-05, "loss": 0.1335, "step": 14020 }, { "epoch": 0.7353249475890985, "grad_norm": 1.6937309503555298, "learning_rate": 3.161818658280923e-05, "loss": 0.1337, "step": 14030 }, { "epoch": 0.7358490566037735, "grad_norm": 1.3139739036560059, "learning_rate": 3.1605083857442345e-05, "loss": 0.1279, "step": 14040 }, { "epoch": 0.7363731656184487, "grad_norm": 1.5453401803970337, "learning_rate": 3.159198113207547e-05, "loss": 0.1439, "step": 14050 }, { "epoch": 0.7368972746331237, "grad_norm": 2.1473937034606934, "learning_rate": 3.157887840670859e-05, "loss": 0.1426, "step": 14060 }, { "epoch": 0.7374213836477987, "grad_norm": 1.6203043460845947, "learning_rate": 3.156577568134172e-05, "loss": 0.138, "step": 14070 }, { "epoch": 0.7379454926624738, "grad_norm": 1.2558566331863403, "learning_rate": 3.1552672955974846e-05, "loss": 0.1303, "step": 14080 }, { "epoch": 0.7384696016771488, "grad_norm": 0.9717229604721069, "learning_rate": 3.153957023060797e-05, "loss": 0.1371, "step": 14090 }, { "epoch": 0.7389937106918238, "grad_norm": 1.5402069091796875, "learning_rate": 3.152646750524109e-05, "loss": 0.1568, "step": 14100 }, { "epoch": 0.739517819706499, "grad_norm": 1.4260953664779663, "learning_rate": 3.1513364779874216e-05, "loss": 0.1152, "step": 14110 }, { "epoch": 0.740041928721174, "grad_norm": 0.9325925707817078, "learning_rate": 3.150026205450734e-05, "loss": 0.1202, "step": 14120 }, { "epoch": 0.7405660377358491, "grad_norm": 1.3378041982650757, "learning_rate": 3.148715932914046e-05, "loss": 0.113, "step": 14130 }, { "epoch": 0.7410901467505241, "grad_norm": 0.816106915473938, "learning_rate": 3.1474056603773586e-05, "loss": 0.1068, "step": 14140 }, { "epoch": 0.7416142557651991, "grad_norm": 1.1117491722106934, "learning_rate": 3.146095387840671e-05, "loss": 0.1038, "step": 14150 }, { "epoch": 0.7421383647798742, "grad_norm": 2.233264446258545, "learning_rate": 3.144785115303983e-05, "loss": 0.1321, "step": 14160 }, { "epoch": 0.7426624737945493, "grad_norm": 1.4176737070083618, "learning_rate": 3.1434748427672956e-05, "loss": 0.1154, "step": 14170 }, { "epoch": 0.7431865828092243, "grad_norm": 1.8627464771270752, "learning_rate": 3.142164570230608e-05, "loss": 0.1307, "step": 14180 }, { "epoch": 0.7437106918238994, "grad_norm": 1.6727312803268433, "learning_rate": 3.14085429769392e-05, "loss": 0.1341, "step": 14190 }, { "epoch": 0.7442348008385744, "grad_norm": 1.7155283689498901, "learning_rate": 3.1395440251572326e-05, "loss": 0.1261, "step": 14200 }, { "epoch": 0.7447589098532494, "grad_norm": 1.3220230340957642, "learning_rate": 3.138233752620545e-05, "loss": 0.1047, "step": 14210 }, { "epoch": 0.7452830188679245, "grad_norm": 1.772420048713684, "learning_rate": 3.136923480083857e-05, "loss": 0.1299, "step": 14220 }, { "epoch": 0.7458071278825996, "grad_norm": 1.4345825910568237, "learning_rate": 3.13561320754717e-05, "loss": 0.1324, "step": 14230 }, { "epoch": 0.7463312368972747, "grad_norm": 1.821257472038269, "learning_rate": 3.134302935010483e-05, "loss": 0.1312, "step": 14240 }, { "epoch": 0.7468553459119497, "grad_norm": 1.6807703971862793, "learning_rate": 3.132992662473795e-05, "loss": 0.0969, "step": 14250 }, { "epoch": 0.7473794549266247, "grad_norm": 1.5309361219406128, "learning_rate": 3.1316823899371073e-05, "loss": 0.1293, "step": 14260 }, { "epoch": 0.7479035639412998, "grad_norm": 1.249865174293518, "learning_rate": 3.13037211740042e-05, "loss": 0.1112, "step": 14270 }, { "epoch": 0.7484276729559748, "grad_norm": 3.1915624141693115, "learning_rate": 3.1290618448637313e-05, "loss": 0.1542, "step": 14280 }, { "epoch": 0.7489517819706499, "grad_norm": 1.123343825340271, "learning_rate": 3.127751572327044e-05, "loss": 0.0977, "step": 14290 }, { "epoch": 0.749475890985325, "grad_norm": 1.7678483724594116, "learning_rate": 3.126441299790357e-05, "loss": 0.1502, "step": 14300 }, { "epoch": 0.75, "grad_norm": 1.4714158773422241, "learning_rate": 3.125131027253669e-05, "loss": 0.1462, "step": 14310 }, { "epoch": 0.750524109014675, "grad_norm": 1.335870623588562, "learning_rate": 3.1238207547169814e-05, "loss": 0.1237, "step": 14320 }, { "epoch": 0.7510482180293501, "grad_norm": 1.0049090385437012, "learning_rate": 3.122510482180294e-05, "loss": 0.1159, "step": 14330 }, { "epoch": 0.7515723270440252, "grad_norm": 3.0360207557678223, "learning_rate": 3.121200209643606e-05, "loss": 0.1274, "step": 14340 }, { "epoch": 0.7520964360587002, "grad_norm": 1.8040392398834229, "learning_rate": 3.1198899371069184e-05, "loss": 0.1272, "step": 14350 }, { "epoch": 0.7526205450733753, "grad_norm": 1.7428536415100098, "learning_rate": 3.118579664570231e-05, "loss": 0.139, "step": 14360 }, { "epoch": 0.7531446540880503, "grad_norm": 2.1202545166015625, "learning_rate": 3.117269392033543e-05, "loss": 0.1295, "step": 14370 }, { "epoch": 0.7536687631027253, "grad_norm": 1.1606621742248535, "learning_rate": 3.1159591194968554e-05, "loss": 0.0937, "step": 14380 }, { "epoch": 0.7541928721174004, "grad_norm": 0.8252704739570618, "learning_rate": 3.1146488469601684e-05, "loss": 0.1126, "step": 14390 }, { "epoch": 0.7547169811320755, "grad_norm": 0.904585063457489, "learning_rate": 3.11333857442348e-05, "loss": 0.0978, "step": 14400 }, { "epoch": 0.7552410901467506, "grad_norm": 1.936915397644043, "learning_rate": 3.1120283018867924e-05, "loss": 0.1144, "step": 14410 }, { "epoch": 0.7557651991614256, "grad_norm": 1.9841164350509644, "learning_rate": 3.110718029350105e-05, "loss": 0.141, "step": 14420 }, { "epoch": 0.7562893081761006, "grad_norm": 1.5874981880187988, "learning_rate": 3.109407756813417e-05, "loss": 0.139, "step": 14430 }, { "epoch": 0.7568134171907757, "grad_norm": 1.0045322179794312, "learning_rate": 3.1080974842767294e-05, "loss": 0.1067, "step": 14440 }, { "epoch": 0.7573375262054507, "grad_norm": 1.535852074623108, "learning_rate": 3.106787211740042e-05, "loss": 0.1262, "step": 14450 }, { "epoch": 0.7578616352201258, "grad_norm": 1.0694433450698853, "learning_rate": 3.105476939203355e-05, "loss": 0.1029, "step": 14460 }, { "epoch": 0.7583857442348009, "grad_norm": 2.2281646728515625, "learning_rate": 3.104166666666667e-05, "loss": 0.1058, "step": 14470 }, { "epoch": 0.7589098532494759, "grad_norm": 2.186168909072876, "learning_rate": 3.1028563941299795e-05, "loss": 0.1099, "step": 14480 }, { "epoch": 0.7594339622641509, "grad_norm": 1.6012275218963623, "learning_rate": 3.101546121593292e-05, "loss": 0.1249, "step": 14490 }, { "epoch": 0.759958071278826, "grad_norm": 1.6102908849716187, "learning_rate": 3.100235849056604e-05, "loss": 0.1223, "step": 14500 }, { "epoch": 0.760482180293501, "grad_norm": 2.9501163959503174, "learning_rate": 3.0989255765199165e-05, "loss": 0.1348, "step": 14510 }, { "epoch": 0.7610062893081762, "grad_norm": 1.4406819343566895, "learning_rate": 3.097615303983228e-05, "loss": 0.1277, "step": 14520 }, { "epoch": 0.7615303983228512, "grad_norm": 1.3746488094329834, "learning_rate": 3.096305031446541e-05, "loss": 0.0998, "step": 14530 }, { "epoch": 0.7620545073375262, "grad_norm": 0.9767179489135742, "learning_rate": 3.0949947589098535e-05, "loss": 0.1141, "step": 14540 }, { "epoch": 0.7625786163522013, "grad_norm": 1.8980849981307983, "learning_rate": 3.093684486373166e-05, "loss": 0.1409, "step": 14550 }, { "epoch": 0.7631027253668763, "grad_norm": 1.578115463256836, "learning_rate": 3.092374213836478e-05, "loss": 0.0995, "step": 14560 }, { "epoch": 0.7636268343815513, "grad_norm": 1.580794095993042, "learning_rate": 3.0910639412997905e-05, "loss": 0.1562, "step": 14570 }, { "epoch": 0.7641509433962265, "grad_norm": 1.6007169485092163, "learning_rate": 3.089753668763103e-05, "loss": 0.1027, "step": 14580 }, { "epoch": 0.7646750524109015, "grad_norm": 1.7334402799606323, "learning_rate": 3.088443396226415e-05, "loss": 0.1173, "step": 14590 }, { "epoch": 0.7651991614255765, "grad_norm": 1.2498724460601807, "learning_rate": 3.0871331236897275e-05, "loss": 0.1007, "step": 14600 }, { "epoch": 0.7657232704402516, "grad_norm": 1.0881195068359375, "learning_rate": 3.08582285115304e-05, "loss": 0.0993, "step": 14610 }, { "epoch": 0.7662473794549266, "grad_norm": 2.333465814590454, "learning_rate": 3.084512578616353e-05, "loss": 0.1076, "step": 14620 }, { "epoch": 0.7667714884696016, "grad_norm": 1.2131460905075073, "learning_rate": 3.083202306079665e-05, "loss": 0.1103, "step": 14630 }, { "epoch": 0.7672955974842768, "grad_norm": 2.7174482345581055, "learning_rate": 3.081892033542977e-05, "loss": 0.122, "step": 14640 }, { "epoch": 0.7678197064989518, "grad_norm": 0.7355225682258606, "learning_rate": 3.080581761006289e-05, "loss": 0.1053, "step": 14650 }, { "epoch": 0.7683438155136268, "grad_norm": 1.972083568572998, "learning_rate": 3.0792714884696016e-05, "loss": 0.1474, "step": 14660 }, { "epoch": 0.7688679245283019, "grad_norm": 1.4130945205688477, "learning_rate": 3.077961215932914e-05, "loss": 0.115, "step": 14670 }, { "epoch": 0.7693920335429769, "grad_norm": 1.699841022491455, "learning_rate": 3.076650943396226e-05, "loss": 0.1161, "step": 14680 }, { "epoch": 0.769916142557652, "grad_norm": 1.4874564409255981, "learning_rate": 3.075340670859539e-05, "loss": 0.1268, "step": 14690 }, { "epoch": 0.7704402515723271, "grad_norm": 1.0537846088409424, "learning_rate": 3.0740303983228516e-05, "loss": 0.1153, "step": 14700 }, { "epoch": 0.7709643605870021, "grad_norm": 1.64293372631073, "learning_rate": 3.072720125786164e-05, "loss": 0.1495, "step": 14710 }, { "epoch": 0.7714884696016772, "grad_norm": 1.6864267587661743, "learning_rate": 3.071409853249476e-05, "loss": 0.1056, "step": 14720 }, { "epoch": 0.7720125786163522, "grad_norm": 1.3438204526901245, "learning_rate": 3.0700995807127886e-05, "loss": 0.0935, "step": 14730 }, { "epoch": 0.7725366876310272, "grad_norm": 1.8872520923614502, "learning_rate": 3.068789308176101e-05, "loss": 0.108, "step": 14740 }, { "epoch": 0.7730607966457023, "grad_norm": 1.7647202014923096, "learning_rate": 3.067479035639413e-05, "loss": 0.1238, "step": 14750 }, { "epoch": 0.7735849056603774, "grad_norm": 3.869321346282959, "learning_rate": 3.066168763102725e-05, "loss": 0.1109, "step": 14760 }, { "epoch": 0.7741090146750524, "grad_norm": 1.3759615421295166, "learning_rate": 3.064858490566038e-05, "loss": 0.1415, "step": 14770 }, { "epoch": 0.7746331236897275, "grad_norm": 2.167872428894043, "learning_rate": 3.06354821802935e-05, "loss": 0.1456, "step": 14780 }, { "epoch": 0.7751572327044025, "grad_norm": 1.0983874797821045, "learning_rate": 3.0622379454926626e-05, "loss": 0.1033, "step": 14790 }, { "epoch": 0.7756813417190775, "grad_norm": 1.2786545753479004, "learning_rate": 3.060927672955975e-05, "loss": 0.1094, "step": 14800 }, { "epoch": 0.7762054507337526, "grad_norm": 1.4020774364471436, "learning_rate": 3.059617400419287e-05, "loss": 0.1365, "step": 14810 }, { "epoch": 0.7767295597484277, "grad_norm": 1.3768659830093384, "learning_rate": 3.0583071278825997e-05, "loss": 0.1133, "step": 14820 }, { "epoch": 0.7772536687631028, "grad_norm": 1.1481125354766846, "learning_rate": 3.056996855345912e-05, "loss": 0.1176, "step": 14830 }, { "epoch": 0.7777777777777778, "grad_norm": 1.4727671146392822, "learning_rate": 3.055686582809224e-05, "loss": 0.1541, "step": 14840 }, { "epoch": 0.7783018867924528, "grad_norm": 1.5166109800338745, "learning_rate": 3.0543763102725373e-05, "loss": 0.11, "step": 14850 }, { "epoch": 0.7788259958071279, "grad_norm": 1.597730040550232, "learning_rate": 3.05306603773585e-05, "loss": 0.1268, "step": 14860 }, { "epoch": 0.7793501048218029, "grad_norm": 1.0425934791564941, "learning_rate": 3.0517557651991617e-05, "loss": 0.1093, "step": 14870 }, { "epoch": 0.779874213836478, "grad_norm": 1.4101150035858154, "learning_rate": 3.0504454926624737e-05, "loss": 0.1286, "step": 14880 }, { "epoch": 0.7803983228511531, "grad_norm": 0.8775603175163269, "learning_rate": 3.049135220125786e-05, "loss": 0.0974, "step": 14890 }, { "epoch": 0.7809224318658281, "grad_norm": 1.4296807050704956, "learning_rate": 3.0478249475890987e-05, "loss": 0.0968, "step": 14900 }, { "epoch": 0.7814465408805031, "grad_norm": 0.9347816109657288, "learning_rate": 3.046514675052411e-05, "loss": 0.1099, "step": 14910 }, { "epoch": 0.7819706498951782, "grad_norm": 1.1797919273376465, "learning_rate": 3.0452044025157234e-05, "loss": 0.1267, "step": 14920 }, { "epoch": 0.7824947589098532, "grad_norm": 1.7169650793075562, "learning_rate": 3.0438941299790357e-05, "loss": 0.118, "step": 14930 }, { "epoch": 0.7830188679245284, "grad_norm": 1.5158791542053223, "learning_rate": 3.042583857442348e-05, "loss": 0.116, "step": 14940 }, { "epoch": 0.7835429769392034, "grad_norm": 1.524572491645813, "learning_rate": 3.0412735849056607e-05, "loss": 0.1425, "step": 14950 }, { "epoch": 0.7840670859538784, "grad_norm": 1.839971899986267, "learning_rate": 3.039963312368973e-05, "loss": 0.1026, "step": 14960 }, { "epoch": 0.7845911949685535, "grad_norm": 0.864676296710968, "learning_rate": 3.0386530398322854e-05, "loss": 0.0963, "step": 14970 }, { "epoch": 0.7851153039832285, "grad_norm": 2.310966730117798, "learning_rate": 3.0373427672955977e-05, "loss": 0.1307, "step": 14980 }, { "epoch": 0.7856394129979035, "grad_norm": 1.5302140712738037, "learning_rate": 3.0360324947589104e-05, "loss": 0.1279, "step": 14990 }, { "epoch": 0.7861635220125787, "grad_norm": 1.6324013471603394, "learning_rate": 3.034722222222222e-05, "loss": 0.1314, "step": 15000 }, { "epoch": 0.7861635220125787, "eval_loss": 0.2777167558670044, "eval_runtime": 268.1288, "eval_samples_per_second": 7.426, "eval_steps_per_second": 1.238, "step": 15000 }, { "epoch": 0.7866876310272537, "grad_norm": 1.9445385932922363, "learning_rate": 3.0334119496855344e-05, "loss": 0.1347, "step": 15010 }, { "epoch": 0.7872117400419287, "grad_norm": 1.0624724626541138, "learning_rate": 3.032101677148847e-05, "loss": 0.1104, "step": 15020 }, { "epoch": 0.7877358490566038, "grad_norm": 1.1631520986557007, "learning_rate": 3.0307914046121594e-05, "loss": 0.1151, "step": 15030 }, { "epoch": 0.7882599580712788, "grad_norm": 0.9908153414726257, "learning_rate": 3.0294811320754718e-05, "loss": 0.1304, "step": 15040 }, { "epoch": 0.7887840670859538, "grad_norm": 2.5934977531433105, "learning_rate": 3.028170859538784e-05, "loss": 0.1065, "step": 15050 }, { "epoch": 0.789308176100629, "grad_norm": 1.1737087965011597, "learning_rate": 3.0268605870020965e-05, "loss": 0.112, "step": 15060 }, { "epoch": 0.789832285115304, "grad_norm": 1.6975194215774536, "learning_rate": 3.025550314465409e-05, "loss": 0.1253, "step": 15070 }, { "epoch": 0.790356394129979, "grad_norm": 3.0375587940216064, "learning_rate": 3.0242400419287215e-05, "loss": 0.119, "step": 15080 }, { "epoch": 0.7908805031446541, "grad_norm": 1.3421461582183838, "learning_rate": 3.0229297693920338e-05, "loss": 0.1191, "step": 15090 }, { "epoch": 0.7914046121593291, "grad_norm": 3.2138166427612305, "learning_rate": 3.021619496855346e-05, "loss": 0.1506, "step": 15100 }, { "epoch": 0.7919287211740041, "grad_norm": 1.5473613739013672, "learning_rate": 3.020309224318658e-05, "loss": 0.1389, "step": 15110 }, { "epoch": 0.7924528301886793, "grad_norm": 4.1486592292785645, "learning_rate": 3.0189989517819705e-05, "loss": 0.1511, "step": 15120 }, { "epoch": 0.7929769392033543, "grad_norm": 1.7993791103363037, "learning_rate": 3.0176886792452828e-05, "loss": 0.1299, "step": 15130 }, { "epoch": 0.7935010482180294, "grad_norm": 1.2573460340499878, "learning_rate": 3.0163784067085955e-05, "loss": 0.1098, "step": 15140 }, { "epoch": 0.7940251572327044, "grad_norm": 2.687364339828491, "learning_rate": 3.015068134171908e-05, "loss": 0.121, "step": 15150 }, { "epoch": 0.7945492662473794, "grad_norm": 1.0059900283813477, "learning_rate": 3.0137578616352202e-05, "loss": 0.1137, "step": 15160 }, { "epoch": 0.7950733752620545, "grad_norm": 2.1174488067626953, "learning_rate": 3.0124475890985325e-05, "loss": 0.0791, "step": 15170 }, { "epoch": 0.7955974842767296, "grad_norm": 2.5065529346466064, "learning_rate": 3.0111373165618452e-05, "loss": 0.0929, "step": 15180 }, { "epoch": 0.7961215932914046, "grad_norm": 1.4558613300323486, "learning_rate": 3.0098270440251575e-05, "loss": 0.1228, "step": 15190 }, { "epoch": 0.7966457023060797, "grad_norm": 1.318332552909851, "learning_rate": 3.00851677148847e-05, "loss": 0.0896, "step": 15200 }, { "epoch": 0.7971698113207547, "grad_norm": 1.5867081880569458, "learning_rate": 3.0072064989517822e-05, "loss": 0.1312, "step": 15210 }, { "epoch": 0.7976939203354297, "grad_norm": 2.399853229522705, "learning_rate": 3.0058962264150946e-05, "loss": 0.1074, "step": 15220 }, { "epoch": 0.7982180293501048, "grad_norm": 1.729591965675354, "learning_rate": 3.0045859538784066e-05, "loss": 0.0967, "step": 15230 }, { "epoch": 0.7987421383647799, "grad_norm": 2.1603991985321045, "learning_rate": 3.003275681341719e-05, "loss": 0.1126, "step": 15240 }, { "epoch": 0.799266247379455, "grad_norm": 1.2216774225234985, "learning_rate": 3.0019654088050316e-05, "loss": 0.1125, "step": 15250 }, { "epoch": 0.79979035639413, "grad_norm": 4.094639301300049, "learning_rate": 3.000655136268344e-05, "loss": 0.1394, "step": 15260 }, { "epoch": 0.800314465408805, "grad_norm": 2.448464870452881, "learning_rate": 2.9993448637316562e-05, "loss": 0.0978, "step": 15270 }, { "epoch": 0.80083857442348, "grad_norm": 1.6635816097259521, "learning_rate": 2.9980345911949686e-05, "loss": 0.1443, "step": 15280 }, { "epoch": 0.8013626834381551, "grad_norm": 1.4908186197280884, "learning_rate": 2.996724318658281e-05, "loss": 0.1262, "step": 15290 }, { "epoch": 0.8018867924528302, "grad_norm": 1.3856154680252075, "learning_rate": 2.9954140461215936e-05, "loss": 0.137, "step": 15300 }, { "epoch": 0.8024109014675053, "grad_norm": 2.067502498626709, "learning_rate": 2.994103773584906e-05, "loss": 0.1272, "step": 15310 }, { "epoch": 0.8029350104821803, "grad_norm": 12.453585624694824, "learning_rate": 2.9927935010482183e-05, "loss": 0.0965, "step": 15320 }, { "epoch": 0.8034591194968553, "grad_norm": 2.868530511856079, "learning_rate": 2.9914832285115306e-05, "loss": 0.0958, "step": 15330 }, { "epoch": 0.8039832285115304, "grad_norm": 1.3196369409561157, "learning_rate": 2.9901729559748433e-05, "loss": 0.1208, "step": 15340 }, { "epoch": 0.8045073375262054, "grad_norm": 0.8990975022315979, "learning_rate": 2.988862683438155e-05, "loss": 0.1065, "step": 15350 }, { "epoch": 0.8050314465408805, "grad_norm": 1.5096759796142578, "learning_rate": 2.9875524109014673e-05, "loss": 0.1133, "step": 15360 }, { "epoch": 0.8055555555555556, "grad_norm": 1.556361198425293, "learning_rate": 2.98624213836478e-05, "loss": 0.1362, "step": 15370 }, { "epoch": 0.8060796645702306, "grad_norm": 2.255713939666748, "learning_rate": 2.9849318658280923e-05, "loss": 0.12, "step": 15380 }, { "epoch": 0.8066037735849056, "grad_norm": 1.21890127658844, "learning_rate": 2.9836215932914046e-05, "loss": 0.0996, "step": 15390 }, { "epoch": 0.8071278825995807, "grad_norm": 1.7154406309127808, "learning_rate": 2.982311320754717e-05, "loss": 0.1089, "step": 15400 }, { "epoch": 0.8076519916142557, "grad_norm": 20.269657135009766, "learning_rate": 2.9810010482180297e-05, "loss": 0.127, "step": 15410 }, { "epoch": 0.8081761006289309, "grad_norm": 1.4827377796173096, "learning_rate": 2.979690775681342e-05, "loss": 0.0965, "step": 15420 }, { "epoch": 0.8087002096436059, "grad_norm": 2.1818010807037354, "learning_rate": 2.9783805031446543e-05, "loss": 0.1361, "step": 15430 }, { "epoch": 0.8092243186582809, "grad_norm": 3.504404306411743, "learning_rate": 2.9770702306079667e-05, "loss": 0.1108, "step": 15440 }, { "epoch": 0.809748427672956, "grad_norm": 1.8620023727416992, "learning_rate": 2.975759958071279e-05, "loss": 0.1077, "step": 15450 }, { "epoch": 0.810272536687631, "grad_norm": 0.9601500034332275, "learning_rate": 2.9744496855345917e-05, "loss": 0.1188, "step": 15460 }, { "epoch": 0.810796645702306, "grad_norm": 1.2247925996780396, "learning_rate": 2.9731394129979034e-05, "loss": 0.0988, "step": 15470 }, { "epoch": 0.8113207547169812, "grad_norm": 2.1443095207214355, "learning_rate": 2.9718291404612157e-05, "loss": 0.147, "step": 15480 }, { "epoch": 0.8118448637316562, "grad_norm": 1.4585750102996826, "learning_rate": 2.9705188679245284e-05, "loss": 0.1855, "step": 15490 }, { "epoch": 0.8123689727463312, "grad_norm": 1.9782541990280151, "learning_rate": 2.9692085953878407e-05, "loss": 0.1283, "step": 15500 }, { "epoch": 0.8128930817610063, "grad_norm": 1.5811617374420166, "learning_rate": 2.967898322851153e-05, "loss": 0.1051, "step": 15510 }, { "epoch": 0.8134171907756813, "grad_norm": 3.186302423477173, "learning_rate": 2.9665880503144654e-05, "loss": 0.11, "step": 15520 }, { "epoch": 0.8139412997903563, "grad_norm": 1.5484328269958496, "learning_rate": 2.965277777777778e-05, "loss": 0.1241, "step": 15530 }, { "epoch": 0.8144654088050315, "grad_norm": 0.7247985601425171, "learning_rate": 2.9639675052410904e-05, "loss": 0.1159, "step": 15540 }, { "epoch": 0.8149895178197065, "grad_norm": 2.027569055557251, "learning_rate": 2.9626572327044027e-05, "loss": 0.1339, "step": 15550 }, { "epoch": 0.8155136268343816, "grad_norm": 3.1215574741363525, "learning_rate": 2.961346960167715e-05, "loss": 0.1233, "step": 15560 }, { "epoch": 0.8160377358490566, "grad_norm": 1.5289641618728638, "learning_rate": 2.9600366876310278e-05, "loss": 0.1208, "step": 15570 }, { "epoch": 0.8165618448637316, "grad_norm": 2.011868953704834, "learning_rate": 2.95872641509434e-05, "loss": 0.1041, "step": 15580 }, { "epoch": 0.8170859538784067, "grad_norm": 1.1236916780471802, "learning_rate": 2.9574161425576518e-05, "loss": 0.1069, "step": 15590 }, { "epoch": 0.8176100628930818, "grad_norm": 1.9092012643814087, "learning_rate": 2.9561058700209644e-05, "loss": 0.1264, "step": 15600 }, { "epoch": 0.8181341719077568, "grad_norm": 1.2404230833053589, "learning_rate": 2.9547955974842768e-05, "loss": 0.0975, "step": 15610 }, { "epoch": 0.8186582809224319, "grad_norm": 1.748572826385498, "learning_rate": 2.953485324947589e-05, "loss": 0.1142, "step": 15620 }, { "epoch": 0.8191823899371069, "grad_norm": 1.5801773071289062, "learning_rate": 2.9521750524109014e-05, "loss": 0.1174, "step": 15630 }, { "epoch": 0.8197064989517819, "grad_norm": 0.7898293137550354, "learning_rate": 2.9508647798742138e-05, "loss": 0.1337, "step": 15640 }, { "epoch": 0.820230607966457, "grad_norm": 1.4395421743392944, "learning_rate": 2.9495545073375265e-05, "loss": 0.1178, "step": 15650 }, { "epoch": 0.8207547169811321, "grad_norm": 0.7716788649559021, "learning_rate": 2.9482442348008388e-05, "loss": 0.0857, "step": 15660 }, { "epoch": 0.8212788259958071, "grad_norm": 2.1449007987976074, "learning_rate": 2.946933962264151e-05, "loss": 0.1222, "step": 15670 }, { "epoch": 0.8218029350104822, "grad_norm": 1.2264463901519775, "learning_rate": 2.9456236897274635e-05, "loss": 0.1064, "step": 15680 }, { "epoch": 0.8223270440251572, "grad_norm": 1.507369041442871, "learning_rate": 2.944313417190776e-05, "loss": 0.1101, "step": 15690 }, { "epoch": 0.8228511530398323, "grad_norm": 2.119400978088379, "learning_rate": 2.9430031446540885e-05, "loss": 0.1166, "step": 15700 }, { "epoch": 0.8233752620545073, "grad_norm": 1.4604309797286987, "learning_rate": 2.9416928721174e-05, "loss": 0.1165, "step": 15710 }, { "epoch": 0.8238993710691824, "grad_norm": 2.91054105758667, "learning_rate": 2.940382599580713e-05, "loss": 0.1082, "step": 15720 }, { "epoch": 0.8244234800838575, "grad_norm": 1.6494545936584473, "learning_rate": 2.9390723270440252e-05, "loss": 0.1083, "step": 15730 }, { "epoch": 0.8249475890985325, "grad_norm": 1.565700650215149, "learning_rate": 2.9377620545073375e-05, "loss": 0.1166, "step": 15740 }, { "epoch": 0.8254716981132075, "grad_norm": 1.6435893774032593, "learning_rate": 2.93645178197065e-05, "loss": 0.1176, "step": 15750 }, { "epoch": 0.8259958071278826, "grad_norm": 1.2347391843795776, "learning_rate": 2.9351415094339625e-05, "loss": 0.1166, "step": 15760 }, { "epoch": 0.8265199161425576, "grad_norm": 1.281410574913025, "learning_rate": 2.933831236897275e-05, "loss": 0.1275, "step": 15770 }, { "epoch": 0.8270440251572327, "grad_norm": 1.275313377380371, "learning_rate": 2.9325209643605872e-05, "loss": 0.1434, "step": 15780 }, { "epoch": 0.8275681341719078, "grad_norm": 2.0496749877929688, "learning_rate": 2.9312106918238995e-05, "loss": 0.1286, "step": 15790 }, { "epoch": 0.8280922431865828, "grad_norm": 0.8086955547332764, "learning_rate": 2.929900419287212e-05, "loss": 0.1114, "step": 15800 }, { "epoch": 0.8286163522012578, "grad_norm": 2.2521004676818848, "learning_rate": 2.9285901467505246e-05, "loss": 0.1293, "step": 15810 }, { "epoch": 0.8291404612159329, "grad_norm": 1.0213623046875, "learning_rate": 2.927279874213837e-05, "loss": 0.1008, "step": 15820 }, { "epoch": 0.8296645702306079, "grad_norm": 10.51981258392334, "learning_rate": 2.925969601677149e-05, "loss": 0.1318, "step": 15830 }, { "epoch": 0.8301886792452831, "grad_norm": 1.4294241666793823, "learning_rate": 2.9246593291404612e-05, "loss": 0.1175, "step": 15840 }, { "epoch": 0.8307127882599581, "grad_norm": 2.056034564971924, "learning_rate": 2.9233490566037736e-05, "loss": 0.1179, "step": 15850 }, { "epoch": 0.8312368972746331, "grad_norm": 2.3232133388519287, "learning_rate": 2.922038784067086e-05, "loss": 0.1132, "step": 15860 }, { "epoch": 0.8317610062893082, "grad_norm": 1.936185598373413, "learning_rate": 2.9207285115303983e-05, "loss": 0.1514, "step": 15870 }, { "epoch": 0.8322851153039832, "grad_norm": 2.3033699989318848, "learning_rate": 2.919418238993711e-05, "loss": 0.1453, "step": 15880 }, { "epoch": 0.8328092243186582, "grad_norm": 1.7244473695755005, "learning_rate": 2.9181079664570233e-05, "loss": 0.1109, "step": 15890 }, { "epoch": 0.8333333333333334, "grad_norm": 1.8862981796264648, "learning_rate": 2.9167976939203356e-05, "loss": 0.1223, "step": 15900 }, { "epoch": 0.8338574423480084, "grad_norm": 2.6670587062835693, "learning_rate": 2.915487421383648e-05, "loss": 0.1297, "step": 15910 }, { "epoch": 0.8343815513626834, "grad_norm": 0.8241652250289917, "learning_rate": 2.9141771488469606e-05, "loss": 0.1257, "step": 15920 }, { "epoch": 0.8349056603773585, "grad_norm": 1.6652482748031616, "learning_rate": 2.912866876310273e-05, "loss": 0.1129, "step": 15930 }, { "epoch": 0.8354297693920335, "grad_norm": 1.059584379196167, "learning_rate": 2.9115566037735853e-05, "loss": 0.111, "step": 15940 }, { "epoch": 0.8359538784067087, "grad_norm": 1.6806567907333374, "learning_rate": 2.9102463312368973e-05, "loss": 0.1272, "step": 15950 }, { "epoch": 0.8364779874213837, "grad_norm": 0.8284962177276611, "learning_rate": 2.9089360587002096e-05, "loss": 0.1192, "step": 15960 }, { "epoch": 0.8370020964360587, "grad_norm": 1.719659686088562, "learning_rate": 2.907625786163522e-05, "loss": 0.186, "step": 15970 }, { "epoch": 0.8375262054507338, "grad_norm": 1.34307861328125, "learning_rate": 2.9063155136268343e-05, "loss": 0.1316, "step": 15980 }, { "epoch": 0.8380503144654088, "grad_norm": 1.9360815286636353, "learning_rate": 2.905005241090147e-05, "loss": 0.144, "step": 15990 }, { "epoch": 0.8385744234800838, "grad_norm": 2.661608934402466, "learning_rate": 2.9036949685534593e-05, "loss": 0.118, "step": 16000 }, { "epoch": 0.8385744234800838, "eval_loss": 0.2744086682796478, "eval_runtime": 267.8021, "eval_samples_per_second": 7.435, "eval_steps_per_second": 1.24, "step": 16000 }, { "epoch": 0.839098532494759, "grad_norm": 2.3948934078216553, "learning_rate": 2.9023846960167717e-05, "loss": 0.0907, "step": 16010 }, { "epoch": 0.839622641509434, "grad_norm": 1.7479071617126465, "learning_rate": 2.901074423480084e-05, "loss": 0.133, "step": 16020 }, { "epoch": 0.840146750524109, "grad_norm": 5.435430526733398, "learning_rate": 2.8997641509433963e-05, "loss": 0.1126, "step": 16030 }, { "epoch": 0.8406708595387841, "grad_norm": 1.9367077350616455, "learning_rate": 2.898453878406709e-05, "loss": 0.1004, "step": 16040 }, { "epoch": 0.8411949685534591, "grad_norm": 1.109445571899414, "learning_rate": 2.8971436058700214e-05, "loss": 0.1029, "step": 16050 }, { "epoch": 0.8417190775681341, "grad_norm": 1.7349110841751099, "learning_rate": 2.8958333333333337e-05, "loss": 0.0916, "step": 16060 }, { "epoch": 0.8422431865828093, "grad_norm": 2.7323551177978516, "learning_rate": 2.8945230607966457e-05, "loss": 0.1462, "step": 16070 }, { "epoch": 0.8427672955974843, "grad_norm": 1.405383825302124, "learning_rate": 2.893212788259958e-05, "loss": 0.1439, "step": 16080 }, { "epoch": 0.8432914046121593, "grad_norm": 1.6317007541656494, "learning_rate": 2.8919025157232704e-05, "loss": 0.1094, "step": 16090 }, { "epoch": 0.8438155136268344, "grad_norm": 2.196056842803955, "learning_rate": 2.8905922431865827e-05, "loss": 0.1118, "step": 16100 }, { "epoch": 0.8443396226415094, "grad_norm": 1.5937583446502686, "learning_rate": 2.8892819706498954e-05, "loss": 0.1333, "step": 16110 }, { "epoch": 0.8448637316561844, "grad_norm": 1.4908620119094849, "learning_rate": 2.8879716981132077e-05, "loss": 0.145, "step": 16120 }, { "epoch": 0.8453878406708596, "grad_norm": 1.6847023963928223, "learning_rate": 2.88666142557652e-05, "loss": 0.0794, "step": 16130 }, { "epoch": 0.8459119496855346, "grad_norm": 3.2816317081451416, "learning_rate": 2.8853511530398324e-05, "loss": 0.1317, "step": 16140 }, { "epoch": 0.8464360587002097, "grad_norm": 1.2879031896591187, "learning_rate": 2.8840408805031447e-05, "loss": 0.1105, "step": 16150 }, { "epoch": 0.8469601677148847, "grad_norm": 1.481139063835144, "learning_rate": 2.8827306079664574e-05, "loss": 0.1356, "step": 16160 }, { "epoch": 0.8474842767295597, "grad_norm": 1.0397553443908691, "learning_rate": 2.8814203354297698e-05, "loss": 0.1308, "step": 16170 }, { "epoch": 0.8480083857442348, "grad_norm": 1.932265281677246, "learning_rate": 2.880110062893082e-05, "loss": 0.0992, "step": 16180 }, { "epoch": 0.8485324947589099, "grad_norm": 2.0606870651245117, "learning_rate": 2.878799790356394e-05, "loss": 0.1139, "step": 16190 }, { "epoch": 0.8490566037735849, "grad_norm": 1.4234727621078491, "learning_rate": 2.8774895178197064e-05, "loss": 0.1026, "step": 16200 }, { "epoch": 0.84958071278826, "grad_norm": 1.9039376974105835, "learning_rate": 2.8761792452830188e-05, "loss": 0.095, "step": 16210 }, { "epoch": 0.850104821802935, "grad_norm": 1.5415253639221191, "learning_rate": 2.874868972746331e-05, "loss": 0.0932, "step": 16220 }, { "epoch": 0.85062893081761, "grad_norm": 1.4809905290603638, "learning_rate": 2.8735587002096438e-05, "loss": 0.0962, "step": 16230 }, { "epoch": 0.8511530398322851, "grad_norm": 3.9822001457214355, "learning_rate": 2.872248427672956e-05, "loss": 0.1441, "step": 16240 }, { "epoch": 0.8516771488469602, "grad_norm": 1.1046079397201538, "learning_rate": 2.8709381551362685e-05, "loss": 0.1435, "step": 16250 }, { "epoch": 0.8522012578616353, "grad_norm": 1.510729193687439, "learning_rate": 2.8696278825995808e-05, "loss": 0.1209, "step": 16260 }, { "epoch": 0.8527253668763103, "grad_norm": 1.1853828430175781, "learning_rate": 2.8683176100628935e-05, "loss": 0.1369, "step": 16270 }, { "epoch": 0.8532494758909853, "grad_norm": 1.8410040140151978, "learning_rate": 2.8670073375262058e-05, "loss": 0.108, "step": 16280 }, { "epoch": 0.8537735849056604, "grad_norm": 1.6348074674606323, "learning_rate": 2.865697064989518e-05, "loss": 0.1342, "step": 16290 }, { "epoch": 0.8542976939203354, "grad_norm": 2.014404535293579, "learning_rate": 2.8643867924528305e-05, "loss": 0.1126, "step": 16300 }, { "epoch": 0.8548218029350105, "grad_norm": 2.791364908218384, "learning_rate": 2.8630765199161425e-05, "loss": 0.1488, "step": 16310 }, { "epoch": 0.8553459119496856, "grad_norm": 1.9733411073684692, "learning_rate": 2.861766247379455e-05, "loss": 0.1033, "step": 16320 }, { "epoch": 0.8558700209643606, "grad_norm": 1.3962713479995728, "learning_rate": 2.8604559748427672e-05, "loss": 0.1143, "step": 16330 }, { "epoch": 0.8563941299790356, "grad_norm": 2.352142572402954, "learning_rate": 2.85914570230608e-05, "loss": 0.1312, "step": 16340 }, { "epoch": 0.8569182389937107, "grad_norm": 1.4277571439743042, "learning_rate": 2.8578354297693922e-05, "loss": 0.1372, "step": 16350 }, { "epoch": 0.8574423480083857, "grad_norm": 0.8835510015487671, "learning_rate": 2.8565251572327045e-05, "loss": 0.1469, "step": 16360 }, { "epoch": 0.8579664570230608, "grad_norm": 1.7008707523345947, "learning_rate": 2.855214884696017e-05, "loss": 0.1068, "step": 16370 }, { "epoch": 0.8584905660377359, "grad_norm": 1.7003710269927979, "learning_rate": 2.8539046121593292e-05, "loss": 0.1413, "step": 16380 }, { "epoch": 0.8590146750524109, "grad_norm": 1.588629961013794, "learning_rate": 2.852594339622642e-05, "loss": 0.1328, "step": 16390 }, { "epoch": 0.859538784067086, "grad_norm": 1.6589187383651733, "learning_rate": 2.8512840670859542e-05, "loss": 0.0913, "step": 16400 }, { "epoch": 0.860062893081761, "grad_norm": 1.504050374031067, "learning_rate": 2.8499737945492666e-05, "loss": 0.1071, "step": 16410 }, { "epoch": 0.860587002096436, "grad_norm": 2.1353161334991455, "learning_rate": 2.848663522012579e-05, "loss": 0.1046, "step": 16420 }, { "epoch": 0.8611111111111112, "grad_norm": 1.3967875242233276, "learning_rate": 2.847353249475891e-05, "loss": 0.0898, "step": 16430 }, { "epoch": 0.8616352201257862, "grad_norm": 0.9945201873779297, "learning_rate": 2.8460429769392032e-05, "loss": 0.1462, "step": 16440 }, { "epoch": 0.8621593291404612, "grad_norm": 1.9785001277923584, "learning_rate": 2.8447327044025156e-05, "loss": 0.1323, "step": 16450 }, { "epoch": 0.8626834381551363, "grad_norm": 1.7696077823638916, "learning_rate": 2.8434224318658283e-05, "loss": 0.124, "step": 16460 }, { "epoch": 0.8632075471698113, "grad_norm": 3.277092218399048, "learning_rate": 2.8421121593291406e-05, "loss": 0.1498, "step": 16470 }, { "epoch": 0.8637316561844863, "grad_norm": 1.6171987056732178, "learning_rate": 2.840801886792453e-05, "loss": 0.1278, "step": 16480 }, { "epoch": 0.8642557651991615, "grad_norm": 1.6969029903411865, "learning_rate": 2.8394916142557653e-05, "loss": 0.1134, "step": 16490 }, { "epoch": 0.8647798742138365, "grad_norm": 1.8987184762954712, "learning_rate": 2.838181341719078e-05, "loss": 0.1448, "step": 16500 }, { "epoch": 0.8653039832285115, "grad_norm": 1.8838675022125244, "learning_rate": 2.8368710691823903e-05, "loss": 0.1366, "step": 16510 }, { "epoch": 0.8658280922431866, "grad_norm": 1.2478837966918945, "learning_rate": 2.8355607966457026e-05, "loss": 0.1336, "step": 16520 }, { "epoch": 0.8663522012578616, "grad_norm": 1.7950494289398193, "learning_rate": 2.834250524109015e-05, "loss": 0.1083, "step": 16530 }, { "epoch": 0.8668763102725366, "grad_norm": 2.6232709884643555, "learning_rate": 2.8329402515723273e-05, "loss": 0.1079, "step": 16540 }, { "epoch": 0.8674004192872118, "grad_norm": 1.988641619682312, "learning_rate": 2.8316299790356393e-05, "loss": 0.12, "step": 16550 }, { "epoch": 0.8679245283018868, "grad_norm": 1.4371179342269897, "learning_rate": 2.8303197064989516e-05, "loss": 0.1194, "step": 16560 }, { "epoch": 0.8684486373165619, "grad_norm": 1.8544507026672363, "learning_rate": 2.829009433962264e-05, "loss": 0.1214, "step": 16570 }, { "epoch": 0.8689727463312369, "grad_norm": 1.6521706581115723, "learning_rate": 2.8276991614255767e-05, "loss": 0.0754, "step": 16580 }, { "epoch": 0.8694968553459119, "grad_norm": 2.2902579307556152, "learning_rate": 2.826388888888889e-05, "loss": 0.0884, "step": 16590 }, { "epoch": 0.870020964360587, "grad_norm": 1.0637155771255493, "learning_rate": 2.8250786163522013e-05, "loss": 0.1322, "step": 16600 }, { "epoch": 0.8705450733752621, "grad_norm": 3.798917531967163, "learning_rate": 2.8237683438155137e-05, "loss": 0.0991, "step": 16610 }, { "epoch": 0.8710691823899371, "grad_norm": 57.41527557373047, "learning_rate": 2.8224580712788264e-05, "loss": 0.279, "step": 16620 }, { "epoch": 0.8715932914046122, "grad_norm": 2.0917751789093018, "learning_rate": 2.8211477987421387e-05, "loss": 0.1624, "step": 16630 }, { "epoch": 0.8721174004192872, "grad_norm": 1.0532481670379639, "learning_rate": 2.819837526205451e-05, "loss": 0.1086, "step": 16640 }, { "epoch": 0.8726415094339622, "grad_norm": 1.6892848014831543, "learning_rate": 2.8185272536687634e-05, "loss": 0.105, "step": 16650 }, { "epoch": 0.8731656184486373, "grad_norm": 1.1963019371032715, "learning_rate": 2.817216981132076e-05, "loss": 0.1088, "step": 16660 }, { "epoch": 0.8736897274633124, "grad_norm": 3.1233057975769043, "learning_rate": 2.8159067085953877e-05, "loss": 0.129, "step": 16670 }, { "epoch": 0.8742138364779874, "grad_norm": 2.250922203063965, "learning_rate": 2.8145964360587e-05, "loss": 0.0974, "step": 16680 }, { "epoch": 0.8747379454926625, "grad_norm": 1.731714129447937, "learning_rate": 2.8132861635220127e-05, "loss": 0.1156, "step": 16690 }, { "epoch": 0.8752620545073375, "grad_norm": 1.5934561491012573, "learning_rate": 2.811975890985325e-05, "loss": 0.1097, "step": 16700 }, { "epoch": 0.8757861635220126, "grad_norm": 1.4221527576446533, "learning_rate": 2.8106656184486374e-05, "loss": 0.1084, "step": 16710 }, { "epoch": 0.8763102725366876, "grad_norm": 2.553152322769165, "learning_rate": 2.8093553459119497e-05, "loss": 0.126, "step": 16720 }, { "epoch": 0.8768343815513627, "grad_norm": 1.8152376413345337, "learning_rate": 2.808045073375262e-05, "loss": 0.1058, "step": 16730 }, { "epoch": 0.8773584905660378, "grad_norm": 1.7730140686035156, "learning_rate": 2.8067348008385748e-05, "loss": 0.1215, "step": 16740 }, { "epoch": 0.8778825995807128, "grad_norm": 1.7322098016738892, "learning_rate": 2.805424528301887e-05, "loss": 0.113, "step": 16750 }, { "epoch": 0.8784067085953878, "grad_norm": 2.0963635444641113, "learning_rate": 2.8041142557651994e-05, "loss": 0.1191, "step": 16760 }, { "epoch": 0.8789308176100629, "grad_norm": 1.5918827056884766, "learning_rate": 2.8028039832285118e-05, "loss": 0.1084, "step": 16770 }, { "epoch": 0.8794549266247379, "grad_norm": 1.8807268142700195, "learning_rate": 2.8014937106918238e-05, "loss": 0.0942, "step": 16780 }, { "epoch": 0.879979035639413, "grad_norm": 1.7885172367095947, "learning_rate": 2.800183438155136e-05, "loss": 0.094, "step": 16790 }, { "epoch": 0.8805031446540881, "grad_norm": 1.7495297193527222, "learning_rate": 2.7988731656184484e-05, "loss": 0.1317, "step": 16800 }, { "epoch": 0.8810272536687631, "grad_norm": 1.4859161376953125, "learning_rate": 2.797562893081761e-05, "loss": 0.108, "step": 16810 }, { "epoch": 0.8815513626834381, "grad_norm": 2.0737903118133545, "learning_rate": 2.7962526205450735e-05, "loss": 0.1156, "step": 16820 }, { "epoch": 0.8820754716981132, "grad_norm": 2.136467933654785, "learning_rate": 2.7949423480083858e-05, "loss": 0.1188, "step": 16830 }, { "epoch": 0.8825995807127882, "grad_norm": 2.7223691940307617, "learning_rate": 2.793632075471698e-05, "loss": 0.0917, "step": 16840 }, { "epoch": 0.8831236897274634, "grad_norm": 1.7403265237808228, "learning_rate": 2.7923218029350108e-05, "loss": 0.1266, "step": 16850 }, { "epoch": 0.8836477987421384, "grad_norm": 3.2215659618377686, "learning_rate": 2.791011530398323e-05, "loss": 0.1222, "step": 16860 }, { "epoch": 0.8841719077568134, "grad_norm": 6.004558086395264, "learning_rate": 2.7897012578616355e-05, "loss": 0.1328, "step": 16870 }, { "epoch": 0.8846960167714885, "grad_norm": 2.02972149848938, "learning_rate": 2.788390985324948e-05, "loss": 0.1054, "step": 16880 }, { "epoch": 0.8852201257861635, "grad_norm": 1.8536604642868042, "learning_rate": 2.7870807127882602e-05, "loss": 0.1491, "step": 16890 }, { "epoch": 0.8857442348008385, "grad_norm": 3.0676920413970947, "learning_rate": 2.7857704402515722e-05, "loss": 0.1047, "step": 16900 }, { "epoch": 0.8862683438155137, "grad_norm": 1.6882869005203247, "learning_rate": 2.7844601677148845e-05, "loss": 0.1023, "step": 16910 }, { "epoch": 0.8867924528301887, "grad_norm": 1.2589284181594849, "learning_rate": 2.7831498951781972e-05, "loss": 0.1274, "step": 16920 }, { "epoch": 0.8873165618448637, "grad_norm": 1.274383783340454, "learning_rate": 2.7818396226415095e-05, "loss": 0.116, "step": 16930 }, { "epoch": 0.8878406708595388, "grad_norm": 1.9524480104446411, "learning_rate": 2.780529350104822e-05, "loss": 0.1014, "step": 16940 }, { "epoch": 0.8883647798742138, "grad_norm": 1.2366316318511963, "learning_rate": 2.7792190775681342e-05, "loss": 0.1031, "step": 16950 }, { "epoch": 0.8888888888888888, "grad_norm": 2.1134531497955322, "learning_rate": 2.7779088050314465e-05, "loss": 0.1009, "step": 16960 }, { "epoch": 0.889412997903564, "grad_norm": 1.4316993951797485, "learning_rate": 2.7765985324947592e-05, "loss": 0.1101, "step": 16970 }, { "epoch": 0.889937106918239, "grad_norm": 1.7305949926376343, "learning_rate": 2.7752882599580716e-05, "loss": 0.1318, "step": 16980 }, { "epoch": 0.890461215932914, "grad_norm": 1.0645042657852173, "learning_rate": 2.773977987421384e-05, "loss": 0.1161, "step": 16990 }, { "epoch": 0.8909853249475891, "grad_norm": 0.9148305058479309, "learning_rate": 2.7726677148846962e-05, "loss": 0.1117, "step": 17000 }, { "epoch": 0.8909853249475891, "eval_loss": 0.2766527533531189, "eval_runtime": 267.9233, "eval_samples_per_second": 7.431, "eval_steps_per_second": 1.239, "step": 17000 }, { "epoch": 0.8915094339622641, "grad_norm": 2.0542149543762207, "learning_rate": 2.771357442348009e-05, "loss": 0.1624, "step": 17010 }, { "epoch": 0.8920335429769392, "grad_norm": 1.9283877611160278, "learning_rate": 2.7700471698113206e-05, "loss": 0.1331, "step": 17020 }, { "epoch": 0.8925576519916143, "grad_norm": 1.7226213216781616, "learning_rate": 2.768736897274633e-05, "loss": 0.1475, "step": 17030 }, { "epoch": 0.8930817610062893, "grad_norm": 2.5038414001464844, "learning_rate": 2.7674266247379456e-05, "loss": 0.1141, "step": 17040 }, { "epoch": 0.8936058700209644, "grad_norm": 1.534328818321228, "learning_rate": 2.766116352201258e-05, "loss": 0.0955, "step": 17050 }, { "epoch": 0.8941299790356394, "grad_norm": 1.3931151628494263, "learning_rate": 2.7648060796645703e-05, "loss": 0.0959, "step": 17060 }, { "epoch": 0.8946540880503144, "grad_norm": 2.5487723350524902, "learning_rate": 2.7634958071278826e-05, "loss": 0.1464, "step": 17070 }, { "epoch": 0.8951781970649895, "grad_norm": 2.670689105987549, "learning_rate": 2.762185534591195e-05, "loss": 0.0956, "step": 17080 }, { "epoch": 0.8957023060796646, "grad_norm": 2.335292339324951, "learning_rate": 2.7608752620545076e-05, "loss": 0.1242, "step": 17090 }, { "epoch": 0.8962264150943396, "grad_norm": 2.065664529800415, "learning_rate": 2.75956498951782e-05, "loss": 0.1469, "step": 17100 }, { "epoch": 0.8967505241090147, "grad_norm": 2.1072165966033936, "learning_rate": 2.7582547169811323e-05, "loss": 0.0981, "step": 17110 }, { "epoch": 0.8972746331236897, "grad_norm": 2.1046767234802246, "learning_rate": 2.7569444444444446e-05, "loss": 0.1114, "step": 17120 }, { "epoch": 0.8977987421383647, "grad_norm": 1.2484157085418701, "learning_rate": 2.7556341719077573e-05, "loss": 0.1279, "step": 17130 }, { "epoch": 0.8983228511530398, "grad_norm": 1.4223566055297852, "learning_rate": 2.754323899371069e-05, "loss": 0.1233, "step": 17140 }, { "epoch": 0.8988469601677149, "grad_norm": 3.491486072540283, "learning_rate": 2.7530136268343813e-05, "loss": 0.1275, "step": 17150 }, { "epoch": 0.89937106918239, "grad_norm": 1.536576747894287, "learning_rate": 2.751703354297694e-05, "loss": 0.1498, "step": 17160 }, { "epoch": 0.899895178197065, "grad_norm": 1.4447746276855469, "learning_rate": 2.7503930817610063e-05, "loss": 0.1133, "step": 17170 }, { "epoch": 0.90041928721174, "grad_norm": 1.4244399070739746, "learning_rate": 2.7490828092243187e-05, "loss": 0.1195, "step": 17180 }, { "epoch": 0.9009433962264151, "grad_norm": 4.854112148284912, "learning_rate": 2.747772536687631e-05, "loss": 0.1334, "step": 17190 }, { "epoch": 0.9014675052410901, "grad_norm": 3.5608410835266113, "learning_rate": 2.7464622641509437e-05, "loss": 0.0988, "step": 17200 }, { "epoch": 0.9019916142557652, "grad_norm": 1.3616375923156738, "learning_rate": 2.745151991614256e-05, "loss": 0.1249, "step": 17210 }, { "epoch": 0.9025157232704403, "grad_norm": 1.1115492582321167, "learning_rate": 2.7438417190775684e-05, "loss": 0.1261, "step": 17220 }, { "epoch": 0.9030398322851153, "grad_norm": 1.767063856124878, "learning_rate": 2.7425314465408807e-05, "loss": 0.118, "step": 17230 }, { "epoch": 0.9035639412997903, "grad_norm": 4.163326263427734, "learning_rate": 2.741221174004193e-05, "loss": 0.1086, "step": 17240 }, { "epoch": 0.9040880503144654, "grad_norm": 14.82337474822998, "learning_rate": 2.7399109014675057e-05, "loss": 0.1362, "step": 17250 }, { "epoch": 0.9046121593291404, "grad_norm": 2.8704612255096436, "learning_rate": 2.7386006289308174e-05, "loss": 0.1427, "step": 17260 }, { "epoch": 0.9051362683438156, "grad_norm": 3.1805038452148438, "learning_rate": 2.73729035639413e-05, "loss": 0.1222, "step": 17270 }, { "epoch": 0.9056603773584906, "grad_norm": 2.0406835079193115, "learning_rate": 2.7359800838574424e-05, "loss": 0.1003, "step": 17280 }, { "epoch": 0.9061844863731656, "grad_norm": 2.228046178817749, "learning_rate": 2.7346698113207547e-05, "loss": 0.1191, "step": 17290 }, { "epoch": 0.9067085953878407, "grad_norm": 1.1091362237930298, "learning_rate": 2.733359538784067e-05, "loss": 0.109, "step": 17300 }, { "epoch": 0.9072327044025157, "grad_norm": 3.2433698177337646, "learning_rate": 2.7320492662473794e-05, "loss": 0.1341, "step": 17310 }, { "epoch": 0.9077568134171907, "grad_norm": 1.1009184122085571, "learning_rate": 2.730738993710692e-05, "loss": 0.0792, "step": 17320 }, { "epoch": 0.9082809224318659, "grad_norm": 1.9047338962554932, "learning_rate": 2.7294287211740044e-05, "loss": 0.1178, "step": 17330 }, { "epoch": 0.9088050314465409, "grad_norm": 2.6728718280792236, "learning_rate": 2.7281184486373168e-05, "loss": 0.141, "step": 17340 }, { "epoch": 0.9093291404612159, "grad_norm": 2.257340669631958, "learning_rate": 2.726808176100629e-05, "loss": 0.1055, "step": 17350 }, { "epoch": 0.909853249475891, "grad_norm": 1.4239635467529297, "learning_rate": 2.7254979035639418e-05, "loss": 0.1329, "step": 17360 }, { "epoch": 0.910377358490566, "grad_norm": 1.608129620552063, "learning_rate": 2.724187631027254e-05, "loss": 0.1188, "step": 17370 }, { "epoch": 0.910901467505241, "grad_norm": 1.944496989250183, "learning_rate": 2.7228773584905658e-05, "loss": 0.0997, "step": 17380 }, { "epoch": 0.9114255765199162, "grad_norm": 2.178675413131714, "learning_rate": 2.7215670859538785e-05, "loss": 0.1327, "step": 17390 }, { "epoch": 0.9119496855345912, "grad_norm": 1.738000512123108, "learning_rate": 2.7202568134171908e-05, "loss": 0.1098, "step": 17400 }, { "epoch": 0.9124737945492662, "grad_norm": 1.5480185747146606, "learning_rate": 2.718946540880503e-05, "loss": 0.1324, "step": 17410 }, { "epoch": 0.9129979035639413, "grad_norm": 1.7627813816070557, "learning_rate": 2.7176362683438155e-05, "loss": 0.1138, "step": 17420 }, { "epoch": 0.9135220125786163, "grad_norm": 1.8327022790908813, "learning_rate": 2.716325995807128e-05, "loss": 0.1371, "step": 17430 }, { "epoch": 0.9140461215932913, "grad_norm": 1.486836552619934, "learning_rate": 2.7150157232704405e-05, "loss": 0.1221, "step": 17440 }, { "epoch": 0.9145702306079665, "grad_norm": 1.3775781393051147, "learning_rate": 2.7137054507337528e-05, "loss": 0.1166, "step": 17450 }, { "epoch": 0.9150943396226415, "grad_norm": 1.868113398551941, "learning_rate": 2.712395178197065e-05, "loss": 0.1362, "step": 17460 }, { "epoch": 0.9156184486373166, "grad_norm": 0.9045465588569641, "learning_rate": 2.7110849056603775e-05, "loss": 0.0987, "step": 17470 }, { "epoch": 0.9161425576519916, "grad_norm": 1.4192860126495361, "learning_rate": 2.7097746331236902e-05, "loss": 0.1006, "step": 17480 }, { "epoch": 0.9166666666666666, "grad_norm": 1.904269814491272, "learning_rate": 2.7084643605870025e-05, "loss": 0.097, "step": 17490 }, { "epoch": 0.9171907756813418, "grad_norm": 1.5991660356521606, "learning_rate": 2.7071540880503142e-05, "loss": 0.0964, "step": 17500 }, { "epoch": 0.9177148846960168, "grad_norm": 1.5102187395095825, "learning_rate": 2.705843815513627e-05, "loss": 0.122, "step": 17510 }, { "epoch": 0.9182389937106918, "grad_norm": 1.7714667320251465, "learning_rate": 2.7045335429769392e-05, "loss": 0.101, "step": 17520 }, { "epoch": 0.9187631027253669, "grad_norm": 1.6487922668457031, "learning_rate": 2.7032232704402515e-05, "loss": 0.1212, "step": 17530 }, { "epoch": 0.9192872117400419, "grad_norm": 1.3789714574813843, "learning_rate": 2.701912997903564e-05, "loss": 0.1159, "step": 17540 }, { "epoch": 0.9198113207547169, "grad_norm": 8.510631561279297, "learning_rate": 2.7006027253668765e-05, "loss": 0.1262, "step": 17550 }, { "epoch": 0.9203354297693921, "grad_norm": 2.0327212810516357, "learning_rate": 2.699292452830189e-05, "loss": 0.1335, "step": 17560 }, { "epoch": 0.9208595387840671, "grad_norm": 1.652156114578247, "learning_rate": 2.6979821802935012e-05, "loss": 0.1267, "step": 17570 }, { "epoch": 0.9213836477987422, "grad_norm": 1.6591423749923706, "learning_rate": 2.6966719077568136e-05, "loss": 0.1372, "step": 17580 }, { "epoch": 0.9219077568134172, "grad_norm": 1.7691068649291992, "learning_rate": 2.6953616352201262e-05, "loss": 0.1142, "step": 17590 }, { "epoch": 0.9224318658280922, "grad_norm": 1.4638423919677734, "learning_rate": 2.6940513626834386e-05, "loss": 0.1348, "step": 17600 }, { "epoch": 0.9229559748427673, "grad_norm": 1.6467829942703247, "learning_rate": 2.692741090146751e-05, "loss": 0.1239, "step": 17610 }, { "epoch": 0.9234800838574424, "grad_norm": 1.6024755239486694, "learning_rate": 2.691430817610063e-05, "loss": 0.1158, "step": 17620 }, { "epoch": 0.9240041928721174, "grad_norm": 1.2841496467590332, "learning_rate": 2.6901205450733753e-05, "loss": 0.1175, "step": 17630 }, { "epoch": 0.9245283018867925, "grad_norm": 2.259477376937866, "learning_rate": 2.6888102725366876e-05, "loss": 0.1128, "step": 17640 }, { "epoch": 0.9250524109014675, "grad_norm": 1.0841439962387085, "learning_rate": 2.6875e-05, "loss": 0.1331, "step": 17650 }, { "epoch": 0.9255765199161425, "grad_norm": 1.8893989324569702, "learning_rate": 2.6861897274633123e-05, "loss": 0.1246, "step": 17660 }, { "epoch": 0.9261006289308176, "grad_norm": 1.761985421180725, "learning_rate": 2.684879454926625e-05, "loss": 0.1142, "step": 17670 }, { "epoch": 0.9266247379454927, "grad_norm": 5.233839511871338, "learning_rate": 2.6835691823899373e-05, "loss": 0.1055, "step": 17680 }, { "epoch": 0.9271488469601677, "grad_norm": 2.6796579360961914, "learning_rate": 2.6822589098532496e-05, "loss": 0.1113, "step": 17690 }, { "epoch": 0.9276729559748428, "grad_norm": 3.1834607124328613, "learning_rate": 2.680948637316562e-05, "loss": 0.1402, "step": 17700 }, { "epoch": 0.9281970649895178, "grad_norm": 3.362410068511963, "learning_rate": 2.6796383647798746e-05, "loss": 0.127, "step": 17710 }, { "epoch": 0.9287211740041929, "grad_norm": 1.9332648515701294, "learning_rate": 2.678328092243187e-05, "loss": 0.0974, "step": 17720 }, { "epoch": 0.9292452830188679, "grad_norm": 1.9798800945281982, "learning_rate": 2.6770178197064993e-05, "loss": 0.1189, "step": 17730 }, { "epoch": 0.929769392033543, "grad_norm": 1.7517541646957397, "learning_rate": 2.6757075471698113e-05, "loss": 0.1375, "step": 17740 }, { "epoch": 0.9302935010482181, "grad_norm": 1.3065685033798218, "learning_rate": 2.6743972746331237e-05, "loss": 0.1313, "step": 17750 }, { "epoch": 0.9308176100628931, "grad_norm": 3.3601229190826416, "learning_rate": 2.673087002096436e-05, "loss": 0.1445, "step": 17760 }, { "epoch": 0.9313417190775681, "grad_norm": 1.3460884094238281, "learning_rate": 2.6717767295597483e-05, "loss": 0.106, "step": 17770 }, { "epoch": 0.9318658280922432, "grad_norm": 1.8291536569595337, "learning_rate": 2.670466457023061e-05, "loss": 0.1067, "step": 17780 }, { "epoch": 0.9323899371069182, "grad_norm": 1.4517912864685059, "learning_rate": 2.6691561844863734e-05, "loss": 0.1144, "step": 17790 }, { "epoch": 0.9329140461215933, "grad_norm": 1.0053421258926392, "learning_rate": 2.6678459119496857e-05, "loss": 0.1039, "step": 17800 }, { "epoch": 0.9334381551362684, "grad_norm": 4.46645975112915, "learning_rate": 2.666535639412998e-05, "loss": 0.0918, "step": 17810 }, { "epoch": 0.9339622641509434, "grad_norm": 2.079599142074585, "learning_rate": 2.6652253668763104e-05, "loss": 0.1066, "step": 17820 }, { "epoch": 0.9344863731656184, "grad_norm": 1.3694299459457397, "learning_rate": 2.663915094339623e-05, "loss": 0.1326, "step": 17830 }, { "epoch": 0.9350104821802935, "grad_norm": 2.696722984313965, "learning_rate": 2.6626048218029354e-05, "loss": 0.1249, "step": 17840 }, { "epoch": 0.9355345911949685, "grad_norm": 3.4780609607696533, "learning_rate": 2.6612945492662477e-05, "loss": 0.1351, "step": 17850 }, { "epoch": 0.9360587002096437, "grad_norm": 1.8707832098007202, "learning_rate": 2.6599842767295597e-05, "loss": 0.0928, "step": 17860 }, { "epoch": 0.9365828092243187, "grad_norm": 1.8133983612060547, "learning_rate": 2.658674004192872e-05, "loss": 0.1317, "step": 17870 }, { "epoch": 0.9371069182389937, "grad_norm": 3.4762990474700928, "learning_rate": 2.6573637316561844e-05, "loss": 0.0866, "step": 17880 }, { "epoch": 0.9376310272536688, "grad_norm": 4.619908809661865, "learning_rate": 2.6560534591194967e-05, "loss": 0.1165, "step": 17890 }, { "epoch": 0.9381551362683438, "grad_norm": 1.6376959085464478, "learning_rate": 2.6547431865828094e-05, "loss": 0.1235, "step": 17900 }, { "epoch": 0.9386792452830188, "grad_norm": 1.5230728387832642, "learning_rate": 2.6534329140461218e-05, "loss": 0.1079, "step": 17910 }, { "epoch": 0.939203354297694, "grad_norm": 2.2401304244995117, "learning_rate": 2.652122641509434e-05, "loss": 0.0961, "step": 17920 }, { "epoch": 0.939727463312369, "grad_norm": 1.3648204803466797, "learning_rate": 2.6508123689727464e-05, "loss": 0.105, "step": 17930 }, { "epoch": 0.940251572327044, "grad_norm": 2.788958787918091, "learning_rate": 2.649502096436059e-05, "loss": 0.1171, "step": 17940 }, { "epoch": 0.9407756813417191, "grad_norm": 2.2564330101013184, "learning_rate": 2.6481918238993714e-05, "loss": 0.1015, "step": 17950 }, { "epoch": 0.9412997903563941, "grad_norm": 2.0158989429473877, "learning_rate": 2.6468815513626838e-05, "loss": 0.0915, "step": 17960 }, { "epoch": 0.9418238993710691, "grad_norm": 2.8656909465789795, "learning_rate": 2.645571278825996e-05, "loss": 0.0965, "step": 17970 }, { "epoch": 0.9423480083857443, "grad_norm": 2.881559371948242, "learning_rate": 2.644261006289308e-05, "loss": 0.1194, "step": 17980 }, { "epoch": 0.9428721174004193, "grad_norm": 1.544648289680481, "learning_rate": 2.6429507337526205e-05, "loss": 0.1258, "step": 17990 }, { "epoch": 0.9433962264150944, "grad_norm": 1.1653016805648804, "learning_rate": 2.6416404612159328e-05, "loss": 0.1141, "step": 18000 }, { "epoch": 0.9433962264150944, "eval_loss": 0.273366242647171, "eval_runtime": 268.2749, "eval_samples_per_second": 7.421, "eval_steps_per_second": 1.238, "step": 18000 }, { "epoch": 0.9439203354297694, "grad_norm": 2.2898740768432617, "learning_rate": 2.6403301886792455e-05, "loss": 0.1076, "step": 18010 }, { "epoch": 0.9444444444444444, "grad_norm": 2.067246437072754, "learning_rate": 2.6390199161425578e-05, "loss": 0.1208, "step": 18020 }, { "epoch": 0.9449685534591195, "grad_norm": 2.2256276607513428, "learning_rate": 2.63770964360587e-05, "loss": 0.135, "step": 18030 }, { "epoch": 0.9454926624737946, "grad_norm": 1.5676586627960205, "learning_rate": 2.6363993710691825e-05, "loss": 0.1077, "step": 18040 }, { "epoch": 0.9460167714884696, "grad_norm": 1.3694120645523071, "learning_rate": 2.635089098532495e-05, "loss": 0.0976, "step": 18050 }, { "epoch": 0.9465408805031447, "grad_norm": 1.5198019742965698, "learning_rate": 2.6337788259958075e-05, "loss": 0.1529, "step": 18060 }, { "epoch": 0.9470649895178197, "grad_norm": 0.8670737147331238, "learning_rate": 2.63246855345912e-05, "loss": 0.1139, "step": 18070 }, { "epoch": 0.9475890985324947, "grad_norm": 1.631770133972168, "learning_rate": 2.6311582809224322e-05, "loss": 0.0935, "step": 18080 }, { "epoch": 0.9481132075471698, "grad_norm": 2.3997673988342285, "learning_rate": 2.6298480083857445e-05, "loss": 0.1175, "step": 18090 }, { "epoch": 0.9486373165618449, "grad_norm": 2.676593065261841, "learning_rate": 2.6285377358490565e-05, "loss": 0.1232, "step": 18100 }, { "epoch": 0.94916142557652, "grad_norm": 1.3629957437515259, "learning_rate": 2.627227463312369e-05, "loss": 0.1248, "step": 18110 }, { "epoch": 0.949685534591195, "grad_norm": 1.683488130569458, "learning_rate": 2.6259171907756812e-05, "loss": 0.1111, "step": 18120 }, { "epoch": 0.95020964360587, "grad_norm": 1.4218518733978271, "learning_rate": 2.624606918238994e-05, "loss": 0.0971, "step": 18130 }, { "epoch": 0.950733752620545, "grad_norm": 1.9305870532989502, "learning_rate": 2.6232966457023062e-05, "loss": 0.1405, "step": 18140 }, { "epoch": 0.9512578616352201, "grad_norm": 1.6784343719482422, "learning_rate": 2.6219863731656186e-05, "loss": 0.1172, "step": 18150 }, { "epoch": 0.9517819706498952, "grad_norm": 2.8489601612091064, "learning_rate": 2.620676100628931e-05, "loss": 0.1533, "step": 18160 }, { "epoch": 0.9523060796645703, "grad_norm": 1.829404354095459, "learning_rate": 2.6193658280922432e-05, "loss": 0.1015, "step": 18170 }, { "epoch": 0.9528301886792453, "grad_norm": 2.171243667602539, "learning_rate": 2.618055555555556e-05, "loss": 0.1176, "step": 18180 }, { "epoch": 0.9533542976939203, "grad_norm": 1.988754153251648, "learning_rate": 2.6167452830188682e-05, "loss": 0.1135, "step": 18190 }, { "epoch": 0.9538784067085954, "grad_norm": 1.1377837657928467, "learning_rate": 2.6154350104821806e-05, "loss": 0.1367, "step": 18200 }, { "epoch": 0.9544025157232704, "grad_norm": 1.9834413528442383, "learning_rate": 2.614124737945493e-05, "loss": 0.1373, "step": 18210 }, { "epoch": 0.9549266247379455, "grad_norm": 1.2570122480392456, "learning_rate": 2.612814465408805e-05, "loss": 0.1191, "step": 18220 }, { "epoch": 0.9554507337526206, "grad_norm": 1.0435508489608765, "learning_rate": 2.6115041928721173e-05, "loss": 0.116, "step": 18230 }, { "epoch": 0.9559748427672956, "grad_norm": 2.252382516860962, "learning_rate": 2.6101939203354296e-05, "loss": 0.1292, "step": 18240 }, { "epoch": 0.9564989517819706, "grad_norm": 2.4033634662628174, "learning_rate": 2.6088836477987423e-05, "loss": 0.1072, "step": 18250 }, { "epoch": 0.9570230607966457, "grad_norm": 0.9558582305908203, "learning_rate": 2.6075733752620546e-05, "loss": 0.1053, "step": 18260 }, { "epoch": 0.9575471698113207, "grad_norm": 2.276141405105591, "learning_rate": 2.606263102725367e-05, "loss": 0.1108, "step": 18270 }, { "epoch": 0.9580712788259959, "grad_norm": 2.3497204780578613, "learning_rate": 2.6049528301886793e-05, "loss": 0.1623, "step": 18280 }, { "epoch": 0.9585953878406709, "grad_norm": 1.9832830429077148, "learning_rate": 2.603642557651992e-05, "loss": 0.1167, "step": 18290 }, { "epoch": 0.9591194968553459, "grad_norm": 1.21957528591156, "learning_rate": 2.6023322851153043e-05, "loss": 0.1158, "step": 18300 }, { "epoch": 0.959643605870021, "grad_norm": 2.1913418769836426, "learning_rate": 2.6010220125786166e-05, "loss": 0.1035, "step": 18310 }, { "epoch": 0.960167714884696, "grad_norm": 1.9338150024414062, "learning_rate": 2.599711740041929e-05, "loss": 0.1179, "step": 18320 }, { "epoch": 0.960691823899371, "grad_norm": 1.4277621507644653, "learning_rate": 2.598401467505241e-05, "loss": 0.1137, "step": 18330 }, { "epoch": 0.9612159329140462, "grad_norm": 2.319413185119629, "learning_rate": 2.5970911949685533e-05, "loss": 0.1376, "step": 18340 }, { "epoch": 0.9617400419287212, "grad_norm": 2.9515397548675537, "learning_rate": 2.5957809224318657e-05, "loss": 0.112, "step": 18350 }, { "epoch": 0.9622641509433962, "grad_norm": 1.2524621486663818, "learning_rate": 2.5944706498951783e-05, "loss": 0.1294, "step": 18360 }, { "epoch": 0.9627882599580713, "grad_norm": 1.4624210596084595, "learning_rate": 2.5931603773584907e-05, "loss": 0.0921, "step": 18370 }, { "epoch": 0.9633123689727463, "grad_norm": 2.1402573585510254, "learning_rate": 2.591850104821803e-05, "loss": 0.099, "step": 18380 }, { "epoch": 0.9638364779874213, "grad_norm": 1.677443265914917, "learning_rate": 2.5905398322851154e-05, "loss": 0.1414, "step": 18390 }, { "epoch": 0.9643605870020965, "grad_norm": 2.3793933391571045, "learning_rate": 2.5892295597484277e-05, "loss": 0.0931, "step": 18400 }, { "epoch": 0.9648846960167715, "grad_norm": 1.8992564678192139, "learning_rate": 2.5879192872117404e-05, "loss": 0.1172, "step": 18410 }, { "epoch": 0.9654088050314465, "grad_norm": 1.8893632888793945, "learning_rate": 2.5866090146750527e-05, "loss": 0.1063, "step": 18420 }, { "epoch": 0.9659329140461216, "grad_norm": 3.0469095706939697, "learning_rate": 2.585298742138365e-05, "loss": 0.0926, "step": 18430 }, { "epoch": 0.9664570230607966, "grad_norm": 0.6239261627197266, "learning_rate": 2.5839884696016774e-05, "loss": 0.12, "step": 18440 }, { "epoch": 0.9669811320754716, "grad_norm": 1.5373907089233398, "learning_rate": 2.5826781970649894e-05, "loss": 0.131, "step": 18450 }, { "epoch": 0.9675052410901468, "grad_norm": 1.107908844947815, "learning_rate": 2.5813679245283017e-05, "loss": 0.1001, "step": 18460 }, { "epoch": 0.9680293501048218, "grad_norm": 0.9736654758453369, "learning_rate": 2.580057651991614e-05, "loss": 0.0895, "step": 18470 }, { "epoch": 0.9685534591194969, "grad_norm": 2.5150935649871826, "learning_rate": 2.5787473794549267e-05, "loss": 0.1471, "step": 18480 }, { "epoch": 0.9690775681341719, "grad_norm": 2.4384822845458984, "learning_rate": 2.577437106918239e-05, "loss": 0.1279, "step": 18490 }, { "epoch": 0.9696016771488469, "grad_norm": 2.5240912437438965, "learning_rate": 2.5761268343815514e-05, "loss": 0.1529, "step": 18500 }, { "epoch": 0.970125786163522, "grad_norm": 1.687264084815979, "learning_rate": 2.5748165618448638e-05, "loss": 0.1279, "step": 18510 }, { "epoch": 0.9706498951781971, "grad_norm": 2.4330201148986816, "learning_rate": 2.5735062893081764e-05, "loss": 0.1216, "step": 18520 }, { "epoch": 0.9711740041928721, "grad_norm": 1.3596785068511963, "learning_rate": 2.5721960167714888e-05, "loss": 0.1052, "step": 18530 }, { "epoch": 0.9716981132075472, "grad_norm": 3.4725117683410645, "learning_rate": 2.570885744234801e-05, "loss": 0.1065, "step": 18540 }, { "epoch": 0.9722222222222222, "grad_norm": 0.9534229636192322, "learning_rate": 2.5695754716981135e-05, "loss": 0.089, "step": 18550 }, { "epoch": 0.9727463312368972, "grad_norm": 1.0076327323913574, "learning_rate": 2.5682651991614258e-05, "loss": 0.0991, "step": 18560 }, { "epoch": 0.9732704402515723, "grad_norm": 2.910527229309082, "learning_rate": 2.5669549266247378e-05, "loss": 0.1067, "step": 18570 }, { "epoch": 0.9737945492662474, "grad_norm": 1.0768955945968628, "learning_rate": 2.56564465408805e-05, "loss": 0.1105, "step": 18580 }, { "epoch": 0.9743186582809225, "grad_norm": 2.1621615886688232, "learning_rate": 2.5643343815513625e-05, "loss": 0.1246, "step": 18590 }, { "epoch": 0.9748427672955975, "grad_norm": 2.638805627822876, "learning_rate": 2.563024109014675e-05, "loss": 0.1277, "step": 18600 }, { "epoch": 0.9753668763102725, "grad_norm": 1.5452163219451904, "learning_rate": 2.5617138364779875e-05, "loss": 0.0956, "step": 18610 }, { "epoch": 0.9758909853249476, "grad_norm": 2.066392183303833, "learning_rate": 2.5604035639412998e-05, "loss": 0.0932, "step": 18620 }, { "epoch": 0.9764150943396226, "grad_norm": 2.3010566234588623, "learning_rate": 2.559093291404612e-05, "loss": 0.1238, "step": 18630 }, { "epoch": 0.9769392033542977, "grad_norm": 3.2208967208862305, "learning_rate": 2.557783018867925e-05, "loss": 0.1268, "step": 18640 }, { "epoch": 0.9774633123689728, "grad_norm": 6.718996047973633, "learning_rate": 2.5564727463312372e-05, "loss": 0.1182, "step": 18650 }, { "epoch": 0.9779874213836478, "grad_norm": 2.7678651809692383, "learning_rate": 2.5551624737945495e-05, "loss": 0.1413, "step": 18660 }, { "epoch": 0.9785115303983228, "grad_norm": 1.6215708255767822, "learning_rate": 2.553852201257862e-05, "loss": 0.0962, "step": 18670 }, { "epoch": 0.9790356394129979, "grad_norm": 1.7245888710021973, "learning_rate": 2.5525419287211745e-05, "loss": 0.1187, "step": 18680 }, { "epoch": 0.9795597484276729, "grad_norm": 1.980368733406067, "learning_rate": 2.5512316561844862e-05, "loss": 0.0917, "step": 18690 }, { "epoch": 0.980083857442348, "grad_norm": 1.9294238090515137, "learning_rate": 2.5499213836477985e-05, "loss": 0.1395, "step": 18700 }, { "epoch": 0.9806079664570231, "grad_norm": 1.6551685333251953, "learning_rate": 2.5486111111111112e-05, "loss": 0.1034, "step": 18710 }, { "epoch": 0.9811320754716981, "grad_norm": 1.7868574857711792, "learning_rate": 2.5473008385744235e-05, "loss": 0.0993, "step": 18720 }, { "epoch": 0.9816561844863732, "grad_norm": 2.7772812843322754, "learning_rate": 2.545990566037736e-05, "loss": 0.1022, "step": 18730 }, { "epoch": 0.9821802935010482, "grad_norm": 3.1440446376800537, "learning_rate": 2.5446802935010482e-05, "loss": 0.1154, "step": 18740 }, { "epoch": 0.9827044025157232, "grad_norm": 1.7626208066940308, "learning_rate": 2.5433700209643606e-05, "loss": 0.1015, "step": 18750 }, { "epoch": 0.9832285115303984, "grad_norm": 1.3496103286743164, "learning_rate": 2.5420597484276732e-05, "loss": 0.1373, "step": 18760 }, { "epoch": 0.9837526205450734, "grad_norm": 2.246685028076172, "learning_rate": 2.5407494758909856e-05, "loss": 0.1137, "step": 18770 }, { "epoch": 0.9842767295597484, "grad_norm": 1.2018938064575195, "learning_rate": 2.539439203354298e-05, "loss": 0.1291, "step": 18780 }, { "epoch": 0.9848008385744235, "grad_norm": 1.1192766427993774, "learning_rate": 2.5381289308176103e-05, "loss": 0.1051, "step": 18790 }, { "epoch": 0.9853249475890985, "grad_norm": 1.578484296798706, "learning_rate": 2.536818658280923e-05, "loss": 0.1074, "step": 18800 }, { "epoch": 0.9858490566037735, "grad_norm": 1.7460689544677734, "learning_rate": 2.5355083857442346e-05, "loss": 0.1506, "step": 18810 }, { "epoch": 0.9863731656184487, "grad_norm": 2.330599784851074, "learning_rate": 2.534198113207547e-05, "loss": 0.0993, "step": 18820 }, { "epoch": 0.9868972746331237, "grad_norm": 1.5016928911209106, "learning_rate": 2.5328878406708596e-05, "loss": 0.114, "step": 18830 }, { "epoch": 0.9874213836477987, "grad_norm": 2.1551918983459473, "learning_rate": 2.531577568134172e-05, "loss": 0.0984, "step": 18840 }, { "epoch": 0.9879454926624738, "grad_norm": 1.67074716091156, "learning_rate": 2.5302672955974843e-05, "loss": 0.1162, "step": 18850 }, { "epoch": 0.9884696016771488, "grad_norm": 1.218502163887024, "learning_rate": 2.5289570230607966e-05, "loss": 0.1058, "step": 18860 }, { "epoch": 0.9889937106918238, "grad_norm": 2.4353883266448975, "learning_rate": 2.5276467505241093e-05, "loss": 0.1061, "step": 18870 }, { "epoch": 0.989517819706499, "grad_norm": 1.4544498920440674, "learning_rate": 2.5263364779874216e-05, "loss": 0.1033, "step": 18880 }, { "epoch": 0.990041928721174, "grad_norm": 1.7678455114364624, "learning_rate": 2.525026205450734e-05, "loss": 0.1332, "step": 18890 }, { "epoch": 0.9905660377358491, "grad_norm": 1.480412244796753, "learning_rate": 2.5237159329140463e-05, "loss": 0.1034, "step": 18900 }, { "epoch": 0.9910901467505241, "grad_norm": 1.789129376411438, "learning_rate": 2.5224056603773587e-05, "loss": 0.115, "step": 18910 }, { "epoch": 0.9916142557651991, "grad_norm": 1.5032306909561157, "learning_rate": 2.5210953878406713e-05, "loss": 0.1095, "step": 18920 }, { "epoch": 0.9921383647798742, "grad_norm": 1.3824424743652344, "learning_rate": 2.519785115303983e-05, "loss": 0.1232, "step": 18930 }, { "epoch": 0.9926624737945493, "grad_norm": 1.3315826654434204, "learning_rate": 2.5184748427672957e-05, "loss": 0.0899, "step": 18940 }, { "epoch": 0.9931865828092243, "grad_norm": 4.2224297523498535, "learning_rate": 2.517164570230608e-05, "loss": 0.1064, "step": 18950 }, { "epoch": 0.9937106918238994, "grad_norm": 1.537833571434021, "learning_rate": 2.5158542976939203e-05, "loss": 0.092, "step": 18960 }, { "epoch": 0.9942348008385744, "grad_norm": 1.8726469278335571, "learning_rate": 2.5145440251572327e-05, "loss": 0.1214, "step": 18970 }, { "epoch": 0.9947589098532494, "grad_norm": 2.2182183265686035, "learning_rate": 2.513233752620545e-05, "loss": 0.1118, "step": 18980 }, { "epoch": 0.9952830188679245, "grad_norm": 2.326605796813965, "learning_rate": 2.5119234800838577e-05, "loss": 0.1268, "step": 18990 }, { "epoch": 0.9958071278825996, "grad_norm": 1.908422589302063, "learning_rate": 2.51061320754717e-05, "loss": 0.119, "step": 19000 }, { "epoch": 0.9958071278825996, "eval_loss": 0.2677764594554901, "eval_runtime": 268.2087, "eval_samples_per_second": 7.423, "eval_steps_per_second": 1.238, "step": 19000 }, { "epoch": 0.9963312368972747, "grad_norm": 2.0410823822021484, "learning_rate": 2.5093029350104824e-05, "loss": 0.1056, "step": 19010 }, { "epoch": 0.9968553459119497, "grad_norm": 2.298825740814209, "learning_rate": 2.5079926624737947e-05, "loss": 0.1308, "step": 19020 }, { "epoch": 0.9973794549266247, "grad_norm": 2.703629493713379, "learning_rate": 2.5066823899371074e-05, "loss": 0.1018, "step": 19030 }, { "epoch": 0.9979035639412998, "grad_norm": 0.8350358009338379, "learning_rate": 2.5053721174004197e-05, "loss": 0.0902, "step": 19040 }, { "epoch": 0.9984276729559748, "grad_norm": 1.1665393114089966, "learning_rate": 2.5040618448637314e-05, "loss": 0.1118, "step": 19050 }, { "epoch": 0.9989517819706499, "grad_norm": 1.3827977180480957, "learning_rate": 2.502751572327044e-05, "loss": 0.1178, "step": 19060 }, { "epoch": 0.999475890985325, "grad_norm": 3.110260248184204, "learning_rate": 2.5014412997903564e-05, "loss": 0.1189, "step": 19070 }, { "epoch": 1.0, "grad_norm": 1.3243497610092163, "learning_rate": 2.5001310272536688e-05, "loss": 0.096, "step": 19080 }, { "epoch": 1.000524109014675, "grad_norm": 1.87785804271698, "learning_rate": 2.498820754716981e-05, "loss": 0.0894, "step": 19090 }, { "epoch": 1.00104821802935, "grad_norm": 2.367110252380371, "learning_rate": 2.4975104821802938e-05, "loss": 0.079, "step": 19100 }, { "epoch": 1.001572327044025, "grad_norm": 0.9960259199142456, "learning_rate": 2.496200209643606e-05, "loss": 0.0941, "step": 19110 }, { "epoch": 1.0020964360587001, "grad_norm": 1.7719085216522217, "learning_rate": 2.4948899371069184e-05, "loss": 0.0812, "step": 19120 }, { "epoch": 1.0026205450733752, "grad_norm": 1.4971020221710205, "learning_rate": 2.4935796645702308e-05, "loss": 0.0978, "step": 19130 }, { "epoch": 1.0031446540880504, "grad_norm": 1.7672542333602905, "learning_rate": 2.492269392033543e-05, "loss": 0.0786, "step": 19140 }, { "epoch": 1.0036687631027255, "grad_norm": 2.371225118637085, "learning_rate": 2.4909591194968555e-05, "loss": 0.0678, "step": 19150 }, { "epoch": 1.0041928721174005, "grad_norm": 1.103453278541565, "learning_rate": 2.4896488469601678e-05, "loss": 0.0737, "step": 19160 }, { "epoch": 1.0047169811320755, "grad_norm": 2.200951337814331, "learning_rate": 2.48833857442348e-05, "loss": 0.0708, "step": 19170 }, { "epoch": 1.0052410901467506, "grad_norm": 1.470166563987732, "learning_rate": 2.4870283018867928e-05, "loss": 0.094, "step": 19180 }, { "epoch": 1.0057651991614256, "grad_norm": 1.8624029159545898, "learning_rate": 2.485718029350105e-05, "loss": 0.0968, "step": 19190 }, { "epoch": 1.0062893081761006, "grad_norm": 2.2916412353515625, "learning_rate": 2.484407756813417e-05, "loss": 0.1011, "step": 19200 }, { "epoch": 1.0068134171907757, "grad_norm": 0.8032910823822021, "learning_rate": 2.4830974842767295e-05, "loss": 0.0769, "step": 19210 }, { "epoch": 1.0073375262054507, "grad_norm": 1.478871464729309, "learning_rate": 2.481787211740042e-05, "loss": 0.0966, "step": 19220 }, { "epoch": 1.0078616352201257, "grad_norm": 0.7236099243164062, "learning_rate": 2.4804769392033545e-05, "loss": 0.0725, "step": 19230 }, { "epoch": 1.0083857442348008, "grad_norm": 1.7173033952713013, "learning_rate": 2.479166666666667e-05, "loss": 0.1062, "step": 19240 }, { "epoch": 1.0089098532494758, "grad_norm": 0.6187518239021301, "learning_rate": 2.4778563941299792e-05, "loss": 0.0739, "step": 19250 }, { "epoch": 1.009433962264151, "grad_norm": 1.4748426675796509, "learning_rate": 2.4765461215932915e-05, "loss": 0.0902, "step": 19260 }, { "epoch": 1.009958071278826, "grad_norm": 1.5630605220794678, "learning_rate": 2.475235849056604e-05, "loss": 0.0821, "step": 19270 }, { "epoch": 1.0104821802935011, "grad_norm": 1.2467037439346313, "learning_rate": 2.4739255765199162e-05, "loss": 0.0869, "step": 19280 }, { "epoch": 1.0110062893081762, "grad_norm": 1.1149821281433105, "learning_rate": 2.4726153039832285e-05, "loss": 0.0956, "step": 19290 }, { "epoch": 1.0115303983228512, "grad_norm": 1.2000000476837158, "learning_rate": 2.4713050314465412e-05, "loss": 0.0988, "step": 19300 }, { "epoch": 1.0120545073375262, "grad_norm": 1.31313157081604, "learning_rate": 2.4699947589098536e-05, "loss": 0.0645, "step": 19310 }, { "epoch": 1.0125786163522013, "grad_norm": 2.025057554244995, "learning_rate": 2.4686844863731656e-05, "loss": 0.096, "step": 19320 }, { "epoch": 1.0131027253668763, "grad_norm": 0.7827894687652588, "learning_rate": 2.467374213836478e-05, "loss": 0.1035, "step": 19330 }, { "epoch": 1.0136268343815513, "grad_norm": 0.8256064057350159, "learning_rate": 2.4660639412997906e-05, "loss": 0.0851, "step": 19340 }, { "epoch": 1.0141509433962264, "grad_norm": 6.706319808959961, "learning_rate": 2.464753668763103e-05, "loss": 0.069, "step": 19350 }, { "epoch": 1.0146750524109014, "grad_norm": 0.8227464556694031, "learning_rate": 2.4634433962264152e-05, "loss": 0.0946, "step": 19360 }, { "epoch": 1.0151991614255764, "grad_norm": 1.1776174306869507, "learning_rate": 2.4621331236897276e-05, "loss": 0.063, "step": 19370 }, { "epoch": 1.0157232704402517, "grad_norm": 1.0992966890335083, "learning_rate": 2.46082285115304e-05, "loss": 0.071, "step": 19380 }, { "epoch": 1.0162473794549267, "grad_norm": 1.3341543674468994, "learning_rate": 2.4595125786163523e-05, "loss": 0.061, "step": 19390 }, { "epoch": 1.0167714884696017, "grad_norm": 3.2466483116149902, "learning_rate": 2.4582023060796646e-05, "loss": 0.0904, "step": 19400 }, { "epoch": 1.0172955974842768, "grad_norm": 3.0862956047058105, "learning_rate": 2.456892033542977e-05, "loss": 0.1189, "step": 19410 }, { "epoch": 1.0178197064989518, "grad_norm": 1.4831918478012085, "learning_rate": 2.4555817610062896e-05, "loss": 0.0899, "step": 19420 }, { "epoch": 1.0183438155136268, "grad_norm": 1.3484694957733154, "learning_rate": 2.454271488469602e-05, "loss": 0.0898, "step": 19430 }, { "epoch": 1.0188679245283019, "grad_norm": 3.4956963062286377, "learning_rate": 2.452961215932914e-05, "loss": 0.1003, "step": 19440 }, { "epoch": 1.019392033542977, "grad_norm": 2.407393217086792, "learning_rate": 2.4516509433962266e-05, "loss": 0.0911, "step": 19450 }, { "epoch": 1.019916142557652, "grad_norm": 1.5907275676727295, "learning_rate": 2.450340670859539e-05, "loss": 0.0957, "step": 19460 }, { "epoch": 1.020440251572327, "grad_norm": 1.243753433227539, "learning_rate": 2.4490303983228513e-05, "loss": 0.0647, "step": 19470 }, { "epoch": 1.020964360587002, "grad_norm": 3.1798362731933594, "learning_rate": 2.4477201257861636e-05, "loss": 0.0972, "step": 19480 }, { "epoch": 1.021488469601677, "grad_norm": 1.9811335802078247, "learning_rate": 2.446409853249476e-05, "loss": 0.0893, "step": 19490 }, { "epoch": 1.0220125786163523, "grad_norm": 1.0129342079162598, "learning_rate": 2.4450995807127883e-05, "loss": 0.1057, "step": 19500 }, { "epoch": 1.0225366876310273, "grad_norm": 1.6326560974121094, "learning_rate": 2.4437893081761007e-05, "loss": 0.1077, "step": 19510 }, { "epoch": 1.0230607966457024, "grad_norm": 1.4604363441467285, "learning_rate": 2.442479035639413e-05, "loss": 0.0887, "step": 19520 }, { "epoch": 1.0235849056603774, "grad_norm": 2.1132311820983887, "learning_rate": 2.4411687631027257e-05, "loss": 0.0953, "step": 19530 }, { "epoch": 1.0241090146750524, "grad_norm": 1.9144420623779297, "learning_rate": 2.439858490566038e-05, "loss": 0.0488, "step": 19540 }, { "epoch": 1.0246331236897275, "grad_norm": 2.6526668071746826, "learning_rate": 2.4385482180293504e-05, "loss": 0.0924, "step": 19550 }, { "epoch": 1.0251572327044025, "grad_norm": 1.5566918849945068, "learning_rate": 2.4372379454926624e-05, "loss": 0.1057, "step": 19560 }, { "epoch": 1.0256813417190775, "grad_norm": 3.5535433292388916, "learning_rate": 2.435927672955975e-05, "loss": 0.0897, "step": 19570 }, { "epoch": 1.0262054507337526, "grad_norm": 0.8449379205703735, "learning_rate": 2.4346174004192874e-05, "loss": 0.0778, "step": 19580 }, { "epoch": 1.0267295597484276, "grad_norm": 2.4703757762908936, "learning_rate": 2.4333071278825997e-05, "loss": 0.0718, "step": 19590 }, { "epoch": 1.0272536687631026, "grad_norm": 1.603649616241455, "learning_rate": 2.431996855345912e-05, "loss": 0.0697, "step": 19600 }, { "epoch": 1.0277777777777777, "grad_norm": 1.5522819757461548, "learning_rate": 2.4306865828092247e-05, "loss": 0.0935, "step": 19610 }, { "epoch": 1.028301886792453, "grad_norm": 1.6726840734481812, "learning_rate": 2.4293763102725367e-05, "loss": 0.0782, "step": 19620 }, { "epoch": 1.028825995807128, "grad_norm": 0.7730293869972229, "learning_rate": 2.428066037735849e-05, "loss": 0.0712, "step": 19630 }, { "epoch": 1.029350104821803, "grad_norm": 1.2800456285476685, "learning_rate": 2.4267557651991614e-05, "loss": 0.0679, "step": 19640 }, { "epoch": 1.029874213836478, "grad_norm": 3.403644561767578, "learning_rate": 2.425445492662474e-05, "loss": 0.0874, "step": 19650 }, { "epoch": 1.030398322851153, "grad_norm": 0.9466264843940735, "learning_rate": 2.4241352201257864e-05, "loss": 0.0763, "step": 19660 }, { "epoch": 1.030922431865828, "grad_norm": 1.6146653890609741, "learning_rate": 2.4228249475890988e-05, "loss": 0.0892, "step": 19670 }, { "epoch": 1.0314465408805031, "grad_norm": 0.8680115938186646, "learning_rate": 2.4215146750524108e-05, "loss": 0.0987, "step": 19680 }, { "epoch": 1.0319706498951782, "grad_norm": 0.9922559857368469, "learning_rate": 2.4202044025157234e-05, "loss": 0.0901, "step": 19690 }, { "epoch": 1.0324947589098532, "grad_norm": 0.8490608930587769, "learning_rate": 2.4188941299790358e-05, "loss": 0.0958, "step": 19700 }, { "epoch": 1.0330188679245282, "grad_norm": 1.9773913621902466, "learning_rate": 2.417583857442348e-05, "loss": 0.0909, "step": 19710 }, { "epoch": 1.0335429769392033, "grad_norm": 1.3205777406692505, "learning_rate": 2.4162735849056605e-05, "loss": 0.0607, "step": 19720 }, { "epoch": 1.0340670859538783, "grad_norm": 22.19045066833496, "learning_rate": 2.414963312368973e-05, "loss": 0.0704, "step": 19730 }, { "epoch": 1.0345911949685536, "grad_norm": 0.9581673741340637, "learning_rate": 2.413653039832285e-05, "loss": 0.1178, "step": 19740 }, { "epoch": 1.0351153039832286, "grad_norm": 1.1050844192504883, "learning_rate": 2.4123427672955975e-05, "loss": 0.0761, "step": 19750 }, { "epoch": 1.0356394129979036, "grad_norm": 0.8335663676261902, "learning_rate": 2.4110324947589098e-05, "loss": 0.1002, "step": 19760 }, { "epoch": 1.0361635220125787, "grad_norm": 1.0287737846374512, "learning_rate": 2.4097222222222225e-05, "loss": 0.1094, "step": 19770 }, { "epoch": 1.0366876310272537, "grad_norm": 2.1343629360198975, "learning_rate": 2.4084119496855348e-05, "loss": 0.0879, "step": 19780 }, { "epoch": 1.0372117400419287, "grad_norm": 0.8107201457023621, "learning_rate": 2.407101677148847e-05, "loss": 0.083, "step": 19790 }, { "epoch": 1.0377358490566038, "grad_norm": 1.4638867378234863, "learning_rate": 2.4057914046121595e-05, "loss": 0.0891, "step": 19800 }, { "epoch": 1.0382599580712788, "grad_norm": 1.2282848358154297, "learning_rate": 2.404481132075472e-05, "loss": 0.0603, "step": 19810 }, { "epoch": 1.0387840670859538, "grad_norm": 0.7536243796348572, "learning_rate": 2.4031708595387842e-05, "loss": 0.0688, "step": 19820 }, { "epoch": 1.0393081761006289, "grad_norm": 2.29469895362854, "learning_rate": 2.4018605870020965e-05, "loss": 0.0838, "step": 19830 }, { "epoch": 1.039832285115304, "grad_norm": 1.947985291481018, "learning_rate": 2.400550314465409e-05, "loss": 0.0972, "step": 19840 }, { "epoch": 1.040356394129979, "grad_norm": 1.8855829238891602, "learning_rate": 2.3992400419287215e-05, "loss": 0.0648, "step": 19850 }, { "epoch": 1.0408805031446542, "grad_norm": 2.420112133026123, "learning_rate": 2.3979297693920335e-05, "loss": 0.0595, "step": 19860 }, { "epoch": 1.0414046121593292, "grad_norm": 1.4276695251464844, "learning_rate": 2.396619496855346e-05, "loss": 0.1112, "step": 19870 }, { "epoch": 1.0419287211740043, "grad_norm": 1.6597774028778076, "learning_rate": 2.3953092243186585e-05, "loss": 0.0573, "step": 19880 }, { "epoch": 1.0424528301886793, "grad_norm": 2.0908398628234863, "learning_rate": 2.393998951781971e-05, "loss": 0.1037, "step": 19890 }, { "epoch": 1.0429769392033543, "grad_norm": 1.0306899547576904, "learning_rate": 2.3926886792452832e-05, "loss": 0.0821, "step": 19900 }, { "epoch": 1.0435010482180294, "grad_norm": 1.2301905155181885, "learning_rate": 2.3913784067085952e-05, "loss": 0.0606, "step": 19910 }, { "epoch": 1.0440251572327044, "grad_norm": 0.8270147442817688, "learning_rate": 2.390068134171908e-05, "loss": 0.0677, "step": 19920 }, { "epoch": 1.0445492662473794, "grad_norm": 1.662843942642212, "learning_rate": 2.3887578616352202e-05, "loss": 0.0812, "step": 19930 }, { "epoch": 1.0450733752620545, "grad_norm": 2.4776611328125, "learning_rate": 2.3874475890985326e-05, "loss": 0.082, "step": 19940 }, { "epoch": 1.0455974842767295, "grad_norm": 0.6950798034667969, "learning_rate": 2.386137316561845e-05, "loss": 0.0864, "step": 19950 }, { "epoch": 1.0461215932914045, "grad_norm": 1.1080272197723389, "learning_rate": 2.3848270440251576e-05, "loss": 0.0813, "step": 19960 }, { "epoch": 1.0466457023060796, "grad_norm": 2.492471218109131, "learning_rate": 2.3835167714884696e-05, "loss": 0.0527, "step": 19970 }, { "epoch": 1.0471698113207548, "grad_norm": 1.7215197086334229, "learning_rate": 2.382206498951782e-05, "loss": 0.0744, "step": 19980 }, { "epoch": 1.0476939203354299, "grad_norm": 1.311968445777893, "learning_rate": 2.3808962264150943e-05, "loss": 0.081, "step": 19990 }, { "epoch": 1.0482180293501049, "grad_norm": 3.416196584701538, "learning_rate": 2.379585953878407e-05, "loss": 0.0817, "step": 20000 }, { "epoch": 1.0482180293501049, "eval_loss": 0.2798672318458557, "eval_runtime": 268.0531, "eval_samples_per_second": 7.428, "eval_steps_per_second": 1.239, "step": 20000 }, { "epoch": 1.04874213836478, "grad_norm": 1.2927495241165161, "learning_rate": 2.3782756813417193e-05, "loss": 0.079, "step": 20010 }, { "epoch": 1.049266247379455, "grad_norm": 0.9687632918357849, "learning_rate": 2.3769654088050316e-05, "loss": 0.1144, "step": 20020 }, { "epoch": 1.04979035639413, "grad_norm": 1.7993711233139038, "learning_rate": 2.375655136268344e-05, "loss": 0.0949, "step": 20030 }, { "epoch": 1.050314465408805, "grad_norm": 2.492044687271118, "learning_rate": 2.3743448637316563e-05, "loss": 0.081, "step": 20040 }, { "epoch": 1.05083857442348, "grad_norm": 1.8582724332809448, "learning_rate": 2.3730345911949686e-05, "loss": 0.0852, "step": 20050 }, { "epoch": 1.051362683438155, "grad_norm": 1.9378348588943481, "learning_rate": 2.371724318658281e-05, "loss": 0.0818, "step": 20060 }, { "epoch": 1.0518867924528301, "grad_norm": 1.2538186311721802, "learning_rate": 2.3704140461215933e-05, "loss": 0.0735, "step": 20070 }, { "epoch": 1.0524109014675052, "grad_norm": 1.1138423681259155, "learning_rate": 2.369103773584906e-05, "loss": 0.0776, "step": 20080 }, { "epoch": 1.0529350104821802, "grad_norm": 1.8726630210876465, "learning_rate": 2.367793501048218e-05, "loss": 0.0932, "step": 20090 }, { "epoch": 1.0534591194968554, "grad_norm": 0.9272988438606262, "learning_rate": 2.3664832285115303e-05, "loss": 0.0687, "step": 20100 }, { "epoch": 1.0539832285115305, "grad_norm": 1.6455448865890503, "learning_rate": 2.365172955974843e-05, "loss": 0.0869, "step": 20110 }, { "epoch": 1.0545073375262055, "grad_norm": 1.239667534828186, "learning_rate": 2.3638626834381553e-05, "loss": 0.0855, "step": 20120 }, { "epoch": 1.0550314465408805, "grad_norm": 1.8769423961639404, "learning_rate": 2.3625524109014677e-05, "loss": 0.0896, "step": 20130 }, { "epoch": 1.0555555555555556, "grad_norm": 1.3619462251663208, "learning_rate": 2.36124213836478e-05, "loss": 0.0884, "step": 20140 }, { "epoch": 1.0560796645702306, "grad_norm": 2.014624834060669, "learning_rate": 2.3599318658280924e-05, "loss": 0.0861, "step": 20150 }, { "epoch": 1.0566037735849056, "grad_norm": 1.067657709121704, "learning_rate": 2.3586215932914047e-05, "loss": 0.1005, "step": 20160 }, { "epoch": 1.0571278825995807, "grad_norm": 1.8703769445419312, "learning_rate": 2.357311320754717e-05, "loss": 0.1162, "step": 20170 }, { "epoch": 1.0576519916142557, "grad_norm": 0.8814947009086609, "learning_rate": 2.3560010482180294e-05, "loss": 0.064, "step": 20180 }, { "epoch": 1.0581761006289307, "grad_norm": 1.0151050090789795, "learning_rate": 2.354690775681342e-05, "loss": 0.098, "step": 20190 }, { "epoch": 1.0587002096436058, "grad_norm": 0.915666937828064, "learning_rate": 2.3533805031446544e-05, "loss": 0.0796, "step": 20200 }, { "epoch": 1.0592243186582808, "grad_norm": 1.6964950561523438, "learning_rate": 2.3520702306079664e-05, "loss": 0.0945, "step": 20210 }, { "epoch": 1.059748427672956, "grad_norm": 1.1933314800262451, "learning_rate": 2.3507599580712787e-05, "loss": 0.1041, "step": 20220 }, { "epoch": 1.060272536687631, "grad_norm": 1.2950000762939453, "learning_rate": 2.3494496855345914e-05, "loss": 0.0987, "step": 20230 }, { "epoch": 1.0607966457023061, "grad_norm": 3.0018699169158936, "learning_rate": 2.3481394129979037e-05, "loss": 0.0951, "step": 20240 }, { "epoch": 1.0613207547169812, "grad_norm": 1.342100739479065, "learning_rate": 2.346829140461216e-05, "loss": 0.1001, "step": 20250 }, { "epoch": 1.0618448637316562, "grad_norm": 1.8974846601486206, "learning_rate": 2.3455188679245284e-05, "loss": 0.0742, "step": 20260 }, { "epoch": 1.0623689727463312, "grad_norm": 7.072523593902588, "learning_rate": 2.3442085953878408e-05, "loss": 0.0828, "step": 20270 }, { "epoch": 1.0628930817610063, "grad_norm": 1.7849225997924805, "learning_rate": 2.342898322851153e-05, "loss": 0.1056, "step": 20280 }, { "epoch": 1.0634171907756813, "grad_norm": 1.6293972730636597, "learning_rate": 2.3415880503144654e-05, "loss": 0.0775, "step": 20290 }, { "epoch": 1.0639412997903563, "grad_norm": 1.087839961051941, "learning_rate": 2.3402777777777778e-05, "loss": 0.0821, "step": 20300 }, { "epoch": 1.0644654088050314, "grad_norm": 1.9795023202896118, "learning_rate": 2.3389675052410905e-05, "loss": 0.0805, "step": 20310 }, { "epoch": 1.0649895178197064, "grad_norm": 1.3084776401519775, "learning_rate": 2.3376572327044028e-05, "loss": 0.0841, "step": 20320 }, { "epoch": 1.0655136268343814, "grad_norm": 2.340204954147339, "learning_rate": 2.3363469601677148e-05, "loss": 0.0879, "step": 20330 }, { "epoch": 1.0660377358490567, "grad_norm": 0.9186223745346069, "learning_rate": 2.335036687631027e-05, "loss": 0.0893, "step": 20340 }, { "epoch": 1.0665618448637317, "grad_norm": 1.2676678895950317, "learning_rate": 2.3337264150943398e-05, "loss": 0.1021, "step": 20350 }, { "epoch": 1.0670859538784068, "grad_norm": 0.9018216133117676, "learning_rate": 2.332416142557652e-05, "loss": 0.1051, "step": 20360 }, { "epoch": 1.0676100628930818, "grad_norm": 1.2806211709976196, "learning_rate": 2.3311058700209645e-05, "loss": 0.0798, "step": 20370 }, { "epoch": 1.0681341719077568, "grad_norm": 1.302931785583496, "learning_rate": 2.3297955974842768e-05, "loss": 0.0808, "step": 20380 }, { "epoch": 1.0686582809224319, "grad_norm": 1.1825281381607056, "learning_rate": 2.328485324947589e-05, "loss": 0.0655, "step": 20390 }, { "epoch": 1.069182389937107, "grad_norm": 1.1505693197250366, "learning_rate": 2.3271750524109015e-05, "loss": 0.0667, "step": 20400 }, { "epoch": 1.069706498951782, "grad_norm": 1.1411997079849243, "learning_rate": 2.325864779874214e-05, "loss": 0.063, "step": 20410 }, { "epoch": 1.070230607966457, "grad_norm": 3.213791608810425, "learning_rate": 2.3245545073375262e-05, "loss": 0.0878, "step": 20420 }, { "epoch": 1.070754716981132, "grad_norm": 1.0483222007751465, "learning_rate": 2.323244234800839e-05, "loss": 0.0997, "step": 20430 }, { "epoch": 1.071278825995807, "grad_norm": 1.048044204711914, "learning_rate": 2.3219339622641512e-05, "loss": 0.0718, "step": 20440 }, { "epoch": 1.0718029350104823, "grad_norm": 2.3207290172576904, "learning_rate": 2.3206236897274632e-05, "loss": 0.0635, "step": 20450 }, { "epoch": 1.0723270440251573, "grad_norm": 1.414986491203308, "learning_rate": 2.319313417190776e-05, "loss": 0.0951, "step": 20460 }, { "epoch": 1.0728511530398324, "grad_norm": 1.8669378757476807, "learning_rate": 2.3180031446540882e-05, "loss": 0.0832, "step": 20470 }, { "epoch": 1.0733752620545074, "grad_norm": 1.9529931545257568, "learning_rate": 2.3166928721174006e-05, "loss": 0.0792, "step": 20480 }, { "epoch": 1.0738993710691824, "grad_norm": 1.9123833179473877, "learning_rate": 2.315382599580713e-05, "loss": 0.0816, "step": 20490 }, { "epoch": 1.0744234800838575, "grad_norm": 1.0265159606933594, "learning_rate": 2.3140723270440252e-05, "loss": 0.0734, "step": 20500 }, { "epoch": 1.0749475890985325, "grad_norm": 1.1382958889007568, "learning_rate": 2.3127620545073376e-05, "loss": 0.0709, "step": 20510 }, { "epoch": 1.0754716981132075, "grad_norm": 1.5899590253829956, "learning_rate": 2.31145178197065e-05, "loss": 0.0658, "step": 20520 }, { "epoch": 1.0759958071278826, "grad_norm": 2.3364782333374023, "learning_rate": 2.3101415094339622e-05, "loss": 0.0805, "step": 20530 }, { "epoch": 1.0765199161425576, "grad_norm": 1.562552571296692, "learning_rate": 2.308831236897275e-05, "loss": 0.0868, "step": 20540 }, { "epoch": 1.0770440251572326, "grad_norm": 1.4748271703720093, "learning_rate": 2.3075209643605873e-05, "loss": 0.0733, "step": 20550 }, { "epoch": 1.0775681341719077, "grad_norm": 0.6530401706695557, "learning_rate": 2.3062106918238996e-05, "loss": 0.0909, "step": 20560 }, { "epoch": 1.0780922431865827, "grad_norm": 1.4629088640213013, "learning_rate": 2.3049004192872116e-05, "loss": 0.072, "step": 20570 }, { "epoch": 1.078616352201258, "grad_norm": 1.0382362604141235, "learning_rate": 2.3035901467505243e-05, "loss": 0.0942, "step": 20580 }, { "epoch": 1.079140461215933, "grad_norm": 1.4376634359359741, "learning_rate": 2.3022798742138366e-05, "loss": 0.0751, "step": 20590 }, { "epoch": 1.079664570230608, "grad_norm": 1.0215072631835938, "learning_rate": 2.300969601677149e-05, "loss": 0.0652, "step": 20600 }, { "epoch": 1.080188679245283, "grad_norm": 2.8767483234405518, "learning_rate": 2.2996593291404613e-05, "loss": 0.0791, "step": 20610 }, { "epoch": 1.080712788259958, "grad_norm": 1.6112250089645386, "learning_rate": 2.298349056603774e-05, "loss": 0.0639, "step": 20620 }, { "epoch": 1.0812368972746331, "grad_norm": 4.758137226104736, "learning_rate": 2.297038784067086e-05, "loss": 0.0753, "step": 20630 }, { "epoch": 1.0817610062893082, "grad_norm": 11.462589263916016, "learning_rate": 2.2957285115303983e-05, "loss": 0.0713, "step": 20640 }, { "epoch": 1.0822851153039832, "grad_norm": 2.255251169204712, "learning_rate": 2.2944182389937106e-05, "loss": 0.0757, "step": 20650 }, { "epoch": 1.0828092243186582, "grad_norm": 1.9338898658752441, "learning_rate": 2.2931079664570233e-05, "loss": 0.07, "step": 20660 }, { "epoch": 1.0833333333333333, "grad_norm": 3.076637029647827, "learning_rate": 2.2917976939203357e-05, "loss": 0.095, "step": 20670 }, { "epoch": 1.0838574423480083, "grad_norm": 2.129887580871582, "learning_rate": 2.290487421383648e-05, "loss": 0.0727, "step": 20680 }, { "epoch": 1.0843815513626835, "grad_norm": 1.7468769550323486, "learning_rate": 2.28917714884696e-05, "loss": 0.0819, "step": 20690 }, { "epoch": 1.0849056603773586, "grad_norm": 1.341432809829712, "learning_rate": 2.2878668763102727e-05, "loss": 0.0719, "step": 20700 }, { "epoch": 1.0854297693920336, "grad_norm": 2.184701442718506, "learning_rate": 2.286556603773585e-05, "loss": 0.0972, "step": 20710 }, { "epoch": 1.0859538784067087, "grad_norm": 1.3120720386505127, "learning_rate": 2.2852463312368974e-05, "loss": 0.0849, "step": 20720 }, { "epoch": 1.0864779874213837, "grad_norm": 2.175135850906372, "learning_rate": 2.2839360587002097e-05, "loss": 0.0775, "step": 20730 }, { "epoch": 1.0870020964360587, "grad_norm": 1.1797727346420288, "learning_rate": 2.2826257861635224e-05, "loss": 0.0982, "step": 20740 }, { "epoch": 1.0875262054507338, "grad_norm": 1.645690679550171, "learning_rate": 2.2813155136268344e-05, "loss": 0.0795, "step": 20750 }, { "epoch": 1.0880503144654088, "grad_norm": 1.927145004272461, "learning_rate": 2.2800052410901467e-05, "loss": 0.0973, "step": 20760 }, { "epoch": 1.0885744234800838, "grad_norm": 1.39384126663208, "learning_rate": 2.278694968553459e-05, "loss": 0.0593, "step": 20770 }, { "epoch": 1.0890985324947589, "grad_norm": 1.5645813941955566, "learning_rate": 2.2773846960167717e-05, "loss": 0.0879, "step": 20780 }, { "epoch": 1.0896226415094339, "grad_norm": 2.0848135948181152, "learning_rate": 2.276074423480084e-05, "loss": 0.0821, "step": 20790 }, { "epoch": 1.090146750524109, "grad_norm": 0.7393643260002136, "learning_rate": 2.2747641509433964e-05, "loss": 0.0967, "step": 20800 }, { "epoch": 1.090670859538784, "grad_norm": 1.7958664894104004, "learning_rate": 2.2734538784067087e-05, "loss": 0.0828, "step": 20810 }, { "epoch": 1.0911949685534592, "grad_norm": 1.8084328174591064, "learning_rate": 2.272143605870021e-05, "loss": 0.0715, "step": 20820 }, { "epoch": 1.0917190775681342, "grad_norm": 1.6376843452453613, "learning_rate": 2.2708333333333334e-05, "loss": 0.073, "step": 20830 }, { "epoch": 1.0922431865828093, "grad_norm": 2.854077100753784, "learning_rate": 2.2695230607966458e-05, "loss": 0.0707, "step": 20840 }, { "epoch": 1.0927672955974843, "grad_norm": 1.1942236423492432, "learning_rate": 2.268212788259958e-05, "loss": 0.0696, "step": 20850 }, { "epoch": 1.0932914046121593, "grad_norm": 2.580146312713623, "learning_rate": 2.2669025157232708e-05, "loss": 0.0894, "step": 20860 }, { "epoch": 1.0938155136268344, "grad_norm": 1.7355183362960815, "learning_rate": 2.2655922431865828e-05, "loss": 0.0898, "step": 20870 }, { "epoch": 1.0943396226415094, "grad_norm": 0.6533396244049072, "learning_rate": 2.264281970649895e-05, "loss": 0.052, "step": 20880 }, { "epoch": 1.0948637316561844, "grad_norm": 1.6988980770111084, "learning_rate": 2.2629716981132078e-05, "loss": 0.0753, "step": 20890 }, { "epoch": 1.0953878406708595, "grad_norm": 3.65659761428833, "learning_rate": 2.26166142557652e-05, "loss": 0.099, "step": 20900 }, { "epoch": 1.0959119496855345, "grad_norm": 1.2218525409698486, "learning_rate": 2.2603511530398325e-05, "loss": 0.083, "step": 20910 }, { "epoch": 1.0964360587002095, "grad_norm": 1.5201767683029175, "learning_rate": 2.2590408805031448e-05, "loss": 0.1023, "step": 20920 }, { "epoch": 1.0969601677148848, "grad_norm": 1.712514877319336, "learning_rate": 2.257730607966457e-05, "loss": 0.0958, "step": 20930 }, { "epoch": 1.0974842767295598, "grad_norm": 1.532578945159912, "learning_rate": 2.2564203354297695e-05, "loss": 0.0693, "step": 20940 }, { "epoch": 1.0980083857442349, "grad_norm": 0.4898047149181366, "learning_rate": 2.2551100628930818e-05, "loss": 0.0621, "step": 20950 }, { "epoch": 1.09853249475891, "grad_norm": 1.3494205474853516, "learning_rate": 2.253799790356394e-05, "loss": 0.0647, "step": 20960 }, { "epoch": 1.099056603773585, "grad_norm": 1.0484117269515991, "learning_rate": 2.252489517819707e-05, "loss": 0.0789, "step": 20970 }, { "epoch": 1.09958071278826, "grad_norm": 1.8230509757995605, "learning_rate": 2.2511792452830192e-05, "loss": 0.0807, "step": 20980 }, { "epoch": 1.100104821802935, "grad_norm": 1.3117787837982178, "learning_rate": 2.2498689727463312e-05, "loss": 0.0842, "step": 20990 }, { "epoch": 1.10062893081761, "grad_norm": 1.3283014297485352, "learning_rate": 2.2485587002096435e-05, "loss": 0.078, "step": 21000 }, { "epoch": 1.10062893081761, "eval_loss": 0.2784247398376465, "eval_runtime": 267.8534, "eval_samples_per_second": 7.433, "eval_steps_per_second": 1.239, "step": 21000 }, { "epoch": 1.101153039832285, "grad_norm": 2.8443620204925537, "learning_rate": 2.2472484276729562e-05, "loss": 0.0587, "step": 21010 }, { "epoch": 1.10167714884696, "grad_norm": 2.1663818359375, "learning_rate": 2.2459381551362685e-05, "loss": 0.0754, "step": 21020 }, { "epoch": 1.1022012578616351, "grad_norm": 1.3457231521606445, "learning_rate": 2.244627882599581e-05, "loss": 0.1037, "step": 21030 }, { "epoch": 1.1027253668763102, "grad_norm": 3.6146819591522217, "learning_rate": 2.2433176100628932e-05, "loss": 0.0905, "step": 21040 }, { "epoch": 1.1032494758909852, "grad_norm": 1.658570408821106, "learning_rate": 2.2420073375262055e-05, "loss": 0.0779, "step": 21050 }, { "epoch": 1.1037735849056605, "grad_norm": 1.4389158487319946, "learning_rate": 2.240697064989518e-05, "loss": 0.0776, "step": 21060 }, { "epoch": 1.1042976939203355, "grad_norm": 3.1745357513427734, "learning_rate": 2.2393867924528302e-05, "loss": 0.0852, "step": 21070 }, { "epoch": 1.1048218029350105, "grad_norm": 1.5652964115142822, "learning_rate": 2.2380765199161426e-05, "loss": 0.0839, "step": 21080 }, { "epoch": 1.1053459119496856, "grad_norm": 2.5231242179870605, "learning_rate": 2.2367662473794552e-05, "loss": 0.0819, "step": 21090 }, { "epoch": 1.1058700209643606, "grad_norm": 2.037729263305664, "learning_rate": 2.2354559748427676e-05, "loss": 0.0995, "step": 21100 }, { "epoch": 1.1063941299790356, "grad_norm": 1.0101507902145386, "learning_rate": 2.2341457023060796e-05, "loss": 0.0935, "step": 21110 }, { "epoch": 1.1069182389937107, "grad_norm": 26.269018173217773, "learning_rate": 2.2328354297693923e-05, "loss": 0.0915, "step": 21120 }, { "epoch": 1.1074423480083857, "grad_norm": 1.1649367809295654, "learning_rate": 2.2315251572327046e-05, "loss": 0.0649, "step": 21130 }, { "epoch": 1.1079664570230607, "grad_norm": 1.7164911031723022, "learning_rate": 2.230214884696017e-05, "loss": 0.0964, "step": 21140 }, { "epoch": 1.1084905660377358, "grad_norm": 1.4876567125320435, "learning_rate": 2.2289046121593293e-05, "loss": 0.1082, "step": 21150 }, { "epoch": 1.1090146750524108, "grad_norm": 1.605036735534668, "learning_rate": 2.2275943396226416e-05, "loss": 0.0751, "step": 21160 }, { "epoch": 1.109538784067086, "grad_norm": 3.23136830329895, "learning_rate": 2.226284067085954e-05, "loss": 0.0933, "step": 21170 }, { "epoch": 1.110062893081761, "grad_norm": 2.443599224090576, "learning_rate": 2.2249737945492663e-05, "loss": 0.0781, "step": 21180 }, { "epoch": 1.1105870020964361, "grad_norm": 2.5603668689727783, "learning_rate": 2.2236635220125786e-05, "loss": 0.1169, "step": 21190 }, { "epoch": 1.1111111111111112, "grad_norm": 1.8467520475387573, "learning_rate": 2.2223532494758913e-05, "loss": 0.0748, "step": 21200 }, { "epoch": 1.1116352201257862, "grad_norm": 1.695407509803772, "learning_rate": 2.2210429769392036e-05, "loss": 0.0884, "step": 21210 }, { "epoch": 1.1121593291404612, "grad_norm": 1.1145198345184326, "learning_rate": 2.219732704402516e-05, "loss": 0.0885, "step": 21220 }, { "epoch": 1.1126834381551363, "grad_norm": 1.6497974395751953, "learning_rate": 2.218422431865828e-05, "loss": 0.1082, "step": 21230 }, { "epoch": 1.1132075471698113, "grad_norm": 1.1065045595169067, "learning_rate": 2.2171121593291407e-05, "loss": 0.075, "step": 21240 }, { "epoch": 1.1137316561844863, "grad_norm": 1.636327862739563, "learning_rate": 2.215801886792453e-05, "loss": 0.093, "step": 21250 }, { "epoch": 1.1142557651991614, "grad_norm": 1.5162497758865356, "learning_rate": 2.2144916142557653e-05, "loss": 0.0896, "step": 21260 }, { "epoch": 1.1147798742138364, "grad_norm": 1.259658694267273, "learning_rate": 2.2131813417190777e-05, "loss": 0.0985, "step": 21270 }, { "epoch": 1.1153039832285114, "grad_norm": 1.8518342971801758, "learning_rate": 2.2118710691823903e-05, "loss": 0.0906, "step": 21280 }, { "epoch": 1.1158280922431867, "grad_norm": 1.3462951183319092, "learning_rate": 2.2105607966457023e-05, "loss": 0.0759, "step": 21290 }, { "epoch": 1.1163522012578617, "grad_norm": 2.863619565963745, "learning_rate": 2.2092505241090147e-05, "loss": 0.1001, "step": 21300 }, { "epoch": 1.1168763102725368, "grad_norm": 1.411542534828186, "learning_rate": 2.207940251572327e-05, "loss": 0.0834, "step": 21310 }, { "epoch": 1.1174004192872118, "grad_norm": 1.7012072801589966, "learning_rate": 2.2066299790356397e-05, "loss": 0.1045, "step": 21320 }, { "epoch": 1.1179245283018868, "grad_norm": 1.516890525817871, "learning_rate": 2.205319706498952e-05, "loss": 0.079, "step": 21330 }, { "epoch": 1.1184486373165619, "grad_norm": 2.6436009407043457, "learning_rate": 2.2040094339622644e-05, "loss": 0.094, "step": 21340 }, { "epoch": 1.118972746331237, "grad_norm": 1.700116515159607, "learning_rate": 2.2026991614255764e-05, "loss": 0.09, "step": 21350 }, { "epoch": 1.119496855345912, "grad_norm": 1.6481711864471436, "learning_rate": 2.201388888888889e-05, "loss": 0.0853, "step": 21360 }, { "epoch": 1.120020964360587, "grad_norm": 0.7776415348052979, "learning_rate": 2.2000786163522014e-05, "loss": 0.0777, "step": 21370 }, { "epoch": 1.120545073375262, "grad_norm": 2.0679523944854736, "learning_rate": 2.1987683438155137e-05, "loss": 0.0697, "step": 21380 }, { "epoch": 1.121069182389937, "grad_norm": 1.485421895980835, "learning_rate": 2.197458071278826e-05, "loss": 0.104, "step": 21390 }, { "epoch": 1.121593291404612, "grad_norm": 1.4850335121154785, "learning_rate": 2.1961477987421387e-05, "loss": 0.0851, "step": 21400 }, { "epoch": 1.1221174004192873, "grad_norm": 2.6674656867980957, "learning_rate": 2.1948375262054507e-05, "loss": 0.0735, "step": 21410 }, { "epoch": 1.1226415094339623, "grad_norm": 1.8950225114822388, "learning_rate": 2.193527253668763e-05, "loss": 0.0873, "step": 21420 }, { "epoch": 1.1231656184486374, "grad_norm": 2.049248218536377, "learning_rate": 2.1922169811320754e-05, "loss": 0.1157, "step": 21430 }, { "epoch": 1.1236897274633124, "grad_norm": 1.7911266088485718, "learning_rate": 2.190906708595388e-05, "loss": 0.0687, "step": 21440 }, { "epoch": 1.1242138364779874, "grad_norm": 1.8410648107528687, "learning_rate": 2.1895964360587004e-05, "loss": 0.0876, "step": 21450 }, { "epoch": 1.1247379454926625, "grad_norm": 1.8042258024215698, "learning_rate": 2.1882861635220128e-05, "loss": 0.1086, "step": 21460 }, { "epoch": 1.1252620545073375, "grad_norm": 0.6935757398605347, "learning_rate": 2.186975890985325e-05, "loss": 0.0694, "step": 21470 }, { "epoch": 1.1257861635220126, "grad_norm": 1.8044699430465698, "learning_rate": 2.1856656184486375e-05, "loss": 0.0844, "step": 21480 }, { "epoch": 1.1263102725366876, "grad_norm": 1.7842930555343628, "learning_rate": 2.1843553459119498e-05, "loss": 0.0659, "step": 21490 }, { "epoch": 1.1268343815513626, "grad_norm": 2.891422748565674, "learning_rate": 2.183045073375262e-05, "loss": 0.101, "step": 21500 }, { "epoch": 1.1273584905660377, "grad_norm": 3.2450954914093018, "learning_rate": 2.1817348008385745e-05, "loss": 0.1052, "step": 21510 }, { "epoch": 1.1278825995807127, "grad_norm": 1.5685365200042725, "learning_rate": 2.1804245283018868e-05, "loss": 0.083, "step": 21520 }, { "epoch": 1.1284067085953877, "grad_norm": 1.3027487993240356, "learning_rate": 2.179114255765199e-05, "loss": 0.0593, "step": 21530 }, { "epoch": 1.128930817610063, "grad_norm": 1.3013249635696411, "learning_rate": 2.1778039832285115e-05, "loss": 0.089, "step": 21540 }, { "epoch": 1.129454926624738, "grad_norm": 6.376953601837158, "learning_rate": 2.176493710691824e-05, "loss": 0.076, "step": 21550 }, { "epoch": 1.129979035639413, "grad_norm": 1.9800182580947876, "learning_rate": 2.1751834381551365e-05, "loss": 0.0849, "step": 21560 }, { "epoch": 1.130503144654088, "grad_norm": 1.376758098602295, "learning_rate": 2.173873165618449e-05, "loss": 0.0911, "step": 21570 }, { "epoch": 1.131027253668763, "grad_norm": 1.5990701913833618, "learning_rate": 2.172562893081761e-05, "loss": 0.0957, "step": 21580 }, { "epoch": 1.1315513626834381, "grad_norm": 1.919875144958496, "learning_rate": 2.1712526205450735e-05, "loss": 0.0947, "step": 21590 }, { "epoch": 1.1320754716981132, "grad_norm": 1.7069398164749146, "learning_rate": 2.169942348008386e-05, "loss": 0.082, "step": 21600 }, { "epoch": 1.1325995807127882, "grad_norm": 2.2045705318450928, "learning_rate": 2.1686320754716982e-05, "loss": 0.0742, "step": 21610 }, { "epoch": 1.1331236897274632, "grad_norm": 0.6748783588409424, "learning_rate": 2.1673218029350105e-05, "loss": 0.0829, "step": 21620 }, { "epoch": 1.1336477987421383, "grad_norm": 1.7201391458511353, "learning_rate": 2.1660115303983232e-05, "loss": 0.1016, "step": 21630 }, { "epoch": 1.1341719077568135, "grad_norm": 1.3689556121826172, "learning_rate": 2.1647012578616352e-05, "loss": 0.0736, "step": 21640 }, { "epoch": 1.1346960167714886, "grad_norm": 1.6909518241882324, "learning_rate": 2.1633909853249475e-05, "loss": 0.0841, "step": 21650 }, { "epoch": 1.1352201257861636, "grad_norm": 1.656886100769043, "learning_rate": 2.16208071278826e-05, "loss": 0.0975, "step": 21660 }, { "epoch": 1.1357442348008386, "grad_norm": 1.7160608768463135, "learning_rate": 2.1607704402515726e-05, "loss": 0.0857, "step": 21670 }, { "epoch": 1.1362683438155137, "grad_norm": 2.022963047027588, "learning_rate": 2.159460167714885e-05, "loss": 0.0886, "step": 21680 }, { "epoch": 1.1367924528301887, "grad_norm": 0.8972265720367432, "learning_rate": 2.1581498951781972e-05, "loss": 0.0882, "step": 21690 }, { "epoch": 1.1373165618448637, "grad_norm": 1.2054320573806763, "learning_rate": 2.1568396226415092e-05, "loss": 0.073, "step": 21700 }, { "epoch": 1.1378406708595388, "grad_norm": 7.229464054107666, "learning_rate": 2.155529350104822e-05, "loss": 0.078, "step": 21710 }, { "epoch": 1.1383647798742138, "grad_norm": 1.8779710531234741, "learning_rate": 2.1542190775681343e-05, "loss": 0.0914, "step": 21720 }, { "epoch": 1.1388888888888888, "grad_norm": 0.6816754341125488, "learning_rate": 2.1529088050314466e-05, "loss": 0.0648, "step": 21730 }, { "epoch": 1.1394129979035639, "grad_norm": 1.756729245185852, "learning_rate": 2.151598532494759e-05, "loss": 0.1165, "step": 21740 }, { "epoch": 1.139937106918239, "grad_norm": 1.6944255828857422, "learning_rate": 2.1502882599580716e-05, "loss": 0.0831, "step": 21750 }, { "epoch": 1.140461215932914, "grad_norm": 1.633293628692627, "learning_rate": 2.1489779874213836e-05, "loss": 0.0777, "step": 21760 }, { "epoch": 1.140985324947589, "grad_norm": 1.755721092224121, "learning_rate": 2.147667714884696e-05, "loss": 0.0714, "step": 21770 }, { "epoch": 1.1415094339622642, "grad_norm": 1.9764751195907593, "learning_rate": 2.1463574423480083e-05, "loss": 0.0815, "step": 21780 }, { "epoch": 1.1420335429769393, "grad_norm": 2.773207187652588, "learning_rate": 2.145047169811321e-05, "loss": 0.1026, "step": 21790 }, { "epoch": 1.1425576519916143, "grad_norm": 1.6087942123413086, "learning_rate": 2.1437368972746333e-05, "loss": 0.0758, "step": 21800 }, { "epoch": 1.1430817610062893, "grad_norm": 2.314055919647217, "learning_rate": 2.1424266247379456e-05, "loss": 0.0797, "step": 21810 }, { "epoch": 1.1436058700209644, "grad_norm": 1.288299560546875, "learning_rate": 2.141116352201258e-05, "loss": 0.0582, "step": 21820 }, { "epoch": 1.1441299790356394, "grad_norm": 2.209122657775879, "learning_rate": 2.1398060796645703e-05, "loss": 0.0791, "step": 21830 }, { "epoch": 1.1446540880503144, "grad_norm": 0.9858881831169128, "learning_rate": 2.1384958071278827e-05, "loss": 0.0837, "step": 21840 }, { "epoch": 1.1451781970649895, "grad_norm": 1.1568701267242432, "learning_rate": 2.137185534591195e-05, "loss": 0.081, "step": 21850 }, { "epoch": 1.1457023060796645, "grad_norm": 1.5264705419540405, "learning_rate": 2.1358752620545073e-05, "loss": 0.0735, "step": 21860 }, { "epoch": 1.1462264150943395, "grad_norm": 0.5113538503646851, "learning_rate": 2.13456498951782e-05, "loss": 0.083, "step": 21870 }, { "epoch": 1.1467505241090148, "grad_norm": 0.6113923788070679, "learning_rate": 2.133254716981132e-05, "loss": 0.0741, "step": 21880 }, { "epoch": 1.1472746331236898, "grad_norm": 0.6499518752098083, "learning_rate": 2.1319444444444444e-05, "loss": 0.0761, "step": 21890 }, { "epoch": 1.1477987421383649, "grad_norm": 0.9743715524673462, "learning_rate": 2.130634171907757e-05, "loss": 0.0725, "step": 21900 }, { "epoch": 1.14832285115304, "grad_norm": 2.16135835647583, "learning_rate": 2.1293238993710694e-05, "loss": 0.0935, "step": 21910 }, { "epoch": 1.148846960167715, "grad_norm": 2.5390031337738037, "learning_rate": 2.1280136268343817e-05, "loss": 0.1021, "step": 21920 }, { "epoch": 1.14937106918239, "grad_norm": 1.8435341119766235, "learning_rate": 2.126703354297694e-05, "loss": 0.0847, "step": 21930 }, { "epoch": 1.149895178197065, "grad_norm": 0.9867782592773438, "learning_rate": 2.1253930817610064e-05, "loss": 0.0768, "step": 21940 }, { "epoch": 1.15041928721174, "grad_norm": 1.5365256071090698, "learning_rate": 2.1240828092243187e-05, "loss": 0.1035, "step": 21950 }, { "epoch": 1.150943396226415, "grad_norm": 1.4743176698684692, "learning_rate": 2.122772536687631e-05, "loss": 0.0555, "step": 21960 }, { "epoch": 1.15146750524109, "grad_norm": 1.6891299486160278, "learning_rate": 2.1214622641509434e-05, "loss": 0.0775, "step": 21970 }, { "epoch": 1.1519916142557651, "grad_norm": 1.8441011905670166, "learning_rate": 2.120151991614256e-05, "loss": 0.0716, "step": 21980 }, { "epoch": 1.1525157232704402, "grad_norm": 1.4782497882843018, "learning_rate": 2.1188417190775684e-05, "loss": 0.086, "step": 21990 }, { "epoch": 1.1530398322851152, "grad_norm": 1.208433985710144, "learning_rate": 2.1175314465408804e-05, "loss": 0.0837, "step": 22000 }, { "epoch": 1.1530398322851152, "eval_loss": 0.2847573161125183, "eval_runtime": 267.8045, "eval_samples_per_second": 7.435, "eval_steps_per_second": 1.24, "step": 22000 }, { "epoch": 1.1535639412997905, "grad_norm": 1.218479037284851, "learning_rate": 2.1162211740041928e-05, "loss": 0.091, "step": 22010 }, { "epoch": 1.1540880503144655, "grad_norm": 1.2849527597427368, "learning_rate": 2.1149109014675054e-05, "loss": 0.0958, "step": 22020 }, { "epoch": 1.1546121593291405, "grad_norm": 1.1848913431167603, "learning_rate": 2.1136006289308178e-05, "loss": 0.0752, "step": 22030 }, { "epoch": 1.1551362683438156, "grad_norm": 1.1825535297393799, "learning_rate": 2.11229035639413e-05, "loss": 0.0515, "step": 22040 }, { "epoch": 1.1556603773584906, "grad_norm": 1.4824421405792236, "learning_rate": 2.1109800838574424e-05, "loss": 0.0732, "step": 22050 }, { "epoch": 1.1561844863731656, "grad_norm": 2.042879581451416, "learning_rate": 2.1096698113207548e-05, "loss": 0.079, "step": 22060 }, { "epoch": 1.1567085953878407, "grad_norm": 2.0270042419433594, "learning_rate": 2.108359538784067e-05, "loss": 0.0895, "step": 22070 }, { "epoch": 1.1572327044025157, "grad_norm": 1.9607288837432861, "learning_rate": 2.1070492662473795e-05, "loss": 0.0703, "step": 22080 }, { "epoch": 1.1577568134171907, "grad_norm": 0.7053709030151367, "learning_rate": 2.1057389937106918e-05, "loss": 0.0902, "step": 22090 }, { "epoch": 1.1582809224318658, "grad_norm": 1.8536524772644043, "learning_rate": 2.1044287211740045e-05, "loss": 0.0696, "step": 22100 }, { "epoch": 1.1588050314465408, "grad_norm": 2.1766324043273926, "learning_rate": 2.1031184486373168e-05, "loss": 0.0847, "step": 22110 }, { "epoch": 1.159329140461216, "grad_norm": 1.659189224243164, "learning_rate": 2.1018081761006288e-05, "loss": 0.0708, "step": 22120 }, { "epoch": 1.159853249475891, "grad_norm": 0.9984623193740845, "learning_rate": 2.1004979035639415e-05, "loss": 0.0747, "step": 22130 }, { "epoch": 1.1603773584905661, "grad_norm": 2.5320563316345215, "learning_rate": 2.099187631027254e-05, "loss": 0.0808, "step": 22140 }, { "epoch": 1.1609014675052411, "grad_norm": 1.5325194597244263, "learning_rate": 2.0978773584905662e-05, "loss": 0.1024, "step": 22150 }, { "epoch": 1.1614255765199162, "grad_norm": 2.04860782623291, "learning_rate": 2.0965670859538785e-05, "loss": 0.0838, "step": 22160 }, { "epoch": 1.1619496855345912, "grad_norm": 3.146354913711548, "learning_rate": 2.095256813417191e-05, "loss": 0.0903, "step": 22170 }, { "epoch": 1.1624737945492662, "grad_norm": 1.1269997358322144, "learning_rate": 2.0939465408805032e-05, "loss": 0.0882, "step": 22180 }, { "epoch": 1.1629979035639413, "grad_norm": 0.8249778747558594, "learning_rate": 2.0926362683438155e-05, "loss": 0.0923, "step": 22190 }, { "epoch": 1.1635220125786163, "grad_norm": 1.9373500347137451, "learning_rate": 2.091325995807128e-05, "loss": 0.1042, "step": 22200 }, { "epoch": 1.1640461215932913, "grad_norm": 1.7245064973831177, "learning_rate": 2.0900157232704405e-05, "loss": 0.1104, "step": 22210 }, { "epoch": 1.1645702306079664, "grad_norm": 1.0767366886138916, "learning_rate": 2.088705450733753e-05, "loss": 0.0754, "step": 22220 }, { "epoch": 1.1650943396226414, "grad_norm": 0.576208233833313, "learning_rate": 2.0873951781970652e-05, "loss": 0.0851, "step": 22230 }, { "epoch": 1.1656184486373165, "grad_norm": 1.783109188079834, "learning_rate": 2.0860849056603772e-05, "loss": 0.0959, "step": 22240 }, { "epoch": 1.1661425576519917, "grad_norm": 1.835574746131897, "learning_rate": 2.08477463312369e-05, "loss": 0.0939, "step": 22250 }, { "epoch": 1.1666666666666667, "grad_norm": 1.7771759033203125, "learning_rate": 2.0834643605870022e-05, "loss": 0.0882, "step": 22260 }, { "epoch": 1.1671907756813418, "grad_norm": 2.202103614807129, "learning_rate": 2.0821540880503146e-05, "loss": 0.0955, "step": 22270 }, { "epoch": 1.1677148846960168, "grad_norm": 1.0116591453552246, "learning_rate": 2.080843815513627e-05, "loss": 0.0803, "step": 22280 }, { "epoch": 1.1682389937106918, "grad_norm": 1.7534410953521729, "learning_rate": 2.0795335429769396e-05, "loss": 0.0853, "step": 22290 }, { "epoch": 1.1687631027253669, "grad_norm": 1.1506248712539673, "learning_rate": 2.0782232704402516e-05, "loss": 0.0913, "step": 22300 }, { "epoch": 1.169287211740042, "grad_norm": 1.104394555091858, "learning_rate": 2.076912997903564e-05, "loss": 0.0895, "step": 22310 }, { "epoch": 1.169811320754717, "grad_norm": 1.3719996213912964, "learning_rate": 2.0756027253668763e-05, "loss": 0.1063, "step": 22320 }, { "epoch": 1.170335429769392, "grad_norm": 1.9209152460098267, "learning_rate": 2.074292452830189e-05, "loss": 0.0718, "step": 22330 }, { "epoch": 1.170859538784067, "grad_norm": 2.760194778442383, "learning_rate": 2.0729821802935013e-05, "loss": 0.0533, "step": 22340 }, { "epoch": 1.171383647798742, "grad_norm": 3.66623854637146, "learning_rate": 2.0716719077568136e-05, "loss": 0.0881, "step": 22350 }, { "epoch": 1.1719077568134173, "grad_norm": 1.5722193717956543, "learning_rate": 2.0703616352201256e-05, "loss": 0.0609, "step": 22360 }, { "epoch": 1.1724318658280923, "grad_norm": 1.63033127784729, "learning_rate": 2.0690513626834383e-05, "loss": 0.1026, "step": 22370 }, { "epoch": 1.1729559748427674, "grad_norm": 1.509321928024292, "learning_rate": 2.0677410901467506e-05, "loss": 0.1055, "step": 22380 }, { "epoch": 1.1734800838574424, "grad_norm": 2.2437334060668945, "learning_rate": 2.066430817610063e-05, "loss": 0.0891, "step": 22390 }, { "epoch": 1.1740041928721174, "grad_norm": 1.9515290260314941, "learning_rate": 2.0651205450733753e-05, "loss": 0.0835, "step": 22400 }, { "epoch": 1.1745283018867925, "grad_norm": 1.6707795858383179, "learning_rate": 2.063810272536688e-05, "loss": 0.0925, "step": 22410 }, { "epoch": 1.1750524109014675, "grad_norm": 1.5614157915115356, "learning_rate": 2.0625e-05, "loss": 0.0949, "step": 22420 }, { "epoch": 1.1755765199161425, "grad_norm": 1.81149160861969, "learning_rate": 2.0611897274633123e-05, "loss": 0.1014, "step": 22430 }, { "epoch": 1.1761006289308176, "grad_norm": 1.1202183961868286, "learning_rate": 2.0598794549266247e-05, "loss": 0.0803, "step": 22440 }, { "epoch": 1.1766247379454926, "grad_norm": 0.9992743730545044, "learning_rate": 2.0585691823899373e-05, "loss": 0.0877, "step": 22450 }, { "epoch": 1.1771488469601676, "grad_norm": 1.73166823387146, "learning_rate": 2.0572589098532497e-05, "loss": 0.0762, "step": 22460 }, { "epoch": 1.1776729559748427, "grad_norm": 1.5337498188018799, "learning_rate": 2.055948637316562e-05, "loss": 0.0799, "step": 22470 }, { "epoch": 1.1781970649895177, "grad_norm": 2.315154552459717, "learning_rate": 2.0546383647798744e-05, "loss": 0.0857, "step": 22480 }, { "epoch": 1.178721174004193, "grad_norm": 22.706295013427734, "learning_rate": 2.0533280922431867e-05, "loss": 0.0877, "step": 22490 }, { "epoch": 1.179245283018868, "grad_norm": 1.9468846321105957, "learning_rate": 2.052017819706499e-05, "loss": 0.1001, "step": 22500 }, { "epoch": 1.179769392033543, "grad_norm": 1.4240168333053589, "learning_rate": 2.0507075471698114e-05, "loss": 0.075, "step": 22510 }, { "epoch": 1.180293501048218, "grad_norm": 0.939274251461029, "learning_rate": 2.0493972746331237e-05, "loss": 0.0663, "step": 22520 }, { "epoch": 1.180817610062893, "grad_norm": 1.8253507614135742, "learning_rate": 2.0480870020964364e-05, "loss": 0.0856, "step": 22530 }, { "epoch": 1.1813417190775681, "grad_norm": 1.9873319864273071, "learning_rate": 2.0467767295597484e-05, "loss": 0.0866, "step": 22540 }, { "epoch": 1.1818658280922432, "grad_norm": 1.9492064714431763, "learning_rate": 2.0454664570230607e-05, "loss": 0.0869, "step": 22550 }, { "epoch": 1.1823899371069182, "grad_norm": 1.099589228630066, "learning_rate": 2.0441561844863734e-05, "loss": 0.0808, "step": 22560 }, { "epoch": 1.1829140461215932, "grad_norm": 2.4824342727661133, "learning_rate": 2.0428459119496857e-05, "loss": 0.0929, "step": 22570 }, { "epoch": 1.1834381551362683, "grad_norm": 1.5264191627502441, "learning_rate": 2.041535639412998e-05, "loss": 0.0813, "step": 22580 }, { "epoch": 1.1839622641509433, "grad_norm": 2.196829319000244, "learning_rate": 2.0402253668763104e-05, "loss": 0.0803, "step": 22590 }, { "epoch": 1.1844863731656186, "grad_norm": 1.7362253665924072, "learning_rate": 2.0389150943396228e-05, "loss": 0.0687, "step": 22600 }, { "epoch": 1.1850104821802936, "grad_norm": 1.0077215433120728, "learning_rate": 2.037604821802935e-05, "loss": 0.1056, "step": 22610 }, { "epoch": 1.1855345911949686, "grad_norm": 1.3866441249847412, "learning_rate": 2.0362945492662474e-05, "loss": 0.094, "step": 22620 }, { "epoch": 1.1860587002096437, "grad_norm": 1.4912267923355103, "learning_rate": 2.0349842767295598e-05, "loss": 0.0996, "step": 22630 }, { "epoch": 1.1865828092243187, "grad_norm": 1.5430445671081543, "learning_rate": 2.0336740041928725e-05, "loss": 0.0834, "step": 22640 }, { "epoch": 1.1871069182389937, "grad_norm": 1.434836506843567, "learning_rate": 2.0323637316561848e-05, "loss": 0.0909, "step": 22650 }, { "epoch": 1.1876310272536688, "grad_norm": 1.989877462387085, "learning_rate": 2.0310534591194968e-05, "loss": 0.0932, "step": 22660 }, { "epoch": 1.1881551362683438, "grad_norm": 1.966412901878357, "learning_rate": 2.029743186582809e-05, "loss": 0.1094, "step": 22670 }, { "epoch": 1.1886792452830188, "grad_norm": 2.0110154151916504, "learning_rate": 2.0284329140461218e-05, "loss": 0.0736, "step": 22680 }, { "epoch": 1.1892033542976939, "grad_norm": 2.2693698406219482, "learning_rate": 2.027122641509434e-05, "loss": 0.0827, "step": 22690 }, { "epoch": 1.189727463312369, "grad_norm": 0.8845059275627136, "learning_rate": 2.0258123689727465e-05, "loss": 0.0772, "step": 22700 }, { "epoch": 1.190251572327044, "grad_norm": 2.433473587036133, "learning_rate": 2.0245020964360588e-05, "loss": 0.0976, "step": 22710 }, { "epoch": 1.190775681341719, "grad_norm": 1.9096776247024536, "learning_rate": 2.023191823899371e-05, "loss": 0.0674, "step": 22720 }, { "epoch": 1.1912997903563942, "grad_norm": 1.502175211906433, "learning_rate": 2.0218815513626835e-05, "loss": 0.0808, "step": 22730 }, { "epoch": 1.1918238993710693, "grad_norm": 0.8952791094779968, "learning_rate": 2.020571278825996e-05, "loss": 0.1002, "step": 22740 }, { "epoch": 1.1923480083857443, "grad_norm": 1.0224030017852783, "learning_rate": 2.0192610062893082e-05, "loss": 0.0841, "step": 22750 }, { "epoch": 1.1928721174004193, "grad_norm": 1.8299604654312134, "learning_rate": 2.017950733752621e-05, "loss": 0.0828, "step": 22760 }, { "epoch": 1.1933962264150944, "grad_norm": 1.035951852798462, "learning_rate": 2.0166404612159332e-05, "loss": 0.0696, "step": 22770 }, { "epoch": 1.1939203354297694, "grad_norm": 1.6324563026428223, "learning_rate": 2.0153301886792452e-05, "loss": 0.0792, "step": 22780 }, { "epoch": 1.1944444444444444, "grad_norm": 0.7720912098884583, "learning_rate": 2.0140199161425575e-05, "loss": 0.0821, "step": 22790 }, { "epoch": 1.1949685534591195, "grad_norm": 1.4368795156478882, "learning_rate": 2.0127096436058702e-05, "loss": 0.0815, "step": 22800 }, { "epoch": 1.1954926624737945, "grad_norm": 2.1467883586883545, "learning_rate": 2.0113993710691825e-05, "loss": 0.0792, "step": 22810 }, { "epoch": 1.1960167714884695, "grad_norm": 4.250417709350586, "learning_rate": 2.010089098532495e-05, "loss": 0.0691, "step": 22820 }, { "epoch": 1.1965408805031448, "grad_norm": 2.1994504928588867, "learning_rate": 2.0087788259958072e-05, "loss": 0.0837, "step": 22830 }, { "epoch": 1.1970649895178198, "grad_norm": 1.6909440755844116, "learning_rate": 2.0074685534591196e-05, "loss": 0.0784, "step": 22840 }, { "epoch": 1.1975890985324948, "grad_norm": 1.3120115995407104, "learning_rate": 2.006158280922432e-05, "loss": 0.0911, "step": 22850 }, { "epoch": 1.1981132075471699, "grad_norm": 1.8463976383209229, "learning_rate": 2.0048480083857442e-05, "loss": 0.0702, "step": 22860 }, { "epoch": 1.198637316561845, "grad_norm": 1.0547575950622559, "learning_rate": 2.0035377358490566e-05, "loss": 0.0837, "step": 22870 }, { "epoch": 1.19916142557652, "grad_norm": 2.4690308570861816, "learning_rate": 2.0022274633123693e-05, "loss": 0.0826, "step": 22880 }, { "epoch": 1.199685534591195, "grad_norm": 2.1016459465026855, "learning_rate": 2.0009171907756816e-05, "loss": 0.0828, "step": 22890 }, { "epoch": 1.20020964360587, "grad_norm": 1.5869488716125488, "learning_rate": 1.9996069182389936e-05, "loss": 0.0722, "step": 22900 }, { "epoch": 1.200733752620545, "grad_norm": 1.17249596118927, "learning_rate": 1.9982966457023063e-05, "loss": 0.0758, "step": 22910 }, { "epoch": 1.20125786163522, "grad_norm": 3.9294803142547607, "learning_rate": 1.9969863731656186e-05, "loss": 0.0719, "step": 22920 }, { "epoch": 1.2017819706498951, "grad_norm": 1.0598368644714355, "learning_rate": 1.995676100628931e-05, "loss": 0.0832, "step": 22930 }, { "epoch": 1.2023060796645701, "grad_norm": 1.414456844329834, "learning_rate": 1.9943658280922433e-05, "loss": 0.0702, "step": 22940 }, { "epoch": 1.2028301886792452, "grad_norm": 1.4275473356246948, "learning_rate": 1.9930555555555556e-05, "loss": 0.0924, "step": 22950 }, { "epoch": 1.2033542976939202, "grad_norm": 1.5400974750518799, "learning_rate": 1.991745283018868e-05, "loss": 0.0635, "step": 22960 }, { "epoch": 1.2038784067085955, "grad_norm": 2.163780450820923, "learning_rate": 1.9904350104821803e-05, "loss": 0.0882, "step": 22970 }, { "epoch": 1.2044025157232705, "grad_norm": 2.687192440032959, "learning_rate": 1.9891247379454926e-05, "loss": 0.0728, "step": 22980 }, { "epoch": 1.2049266247379455, "grad_norm": 1.8120397329330444, "learning_rate": 1.9878144654088053e-05, "loss": 0.0655, "step": 22990 }, { "epoch": 1.2054507337526206, "grad_norm": 1.232093334197998, "learning_rate": 1.9865041928721177e-05, "loss": 0.0635, "step": 23000 }, { "epoch": 1.2054507337526206, "eval_loss": 0.28128090500831604, "eval_runtime": 267.6553, "eval_samples_per_second": 7.439, "eval_steps_per_second": 1.24, "step": 23000 }, { "epoch": 1.2059748427672956, "grad_norm": 2.507798433303833, "learning_rate": 1.98519392033543e-05, "loss": 0.1134, "step": 23010 }, { "epoch": 1.2064989517819706, "grad_norm": 0.5228186249732971, "learning_rate": 1.983883647798742e-05, "loss": 0.0916, "step": 23020 }, { "epoch": 1.2070230607966457, "grad_norm": 1.7124361991882324, "learning_rate": 1.9825733752620547e-05, "loss": 0.0849, "step": 23030 }, { "epoch": 1.2075471698113207, "grad_norm": 5.533046722412109, "learning_rate": 1.981263102725367e-05, "loss": 0.0556, "step": 23040 }, { "epoch": 1.2080712788259957, "grad_norm": 0.8444564938545227, "learning_rate": 1.9799528301886793e-05, "loss": 0.0688, "step": 23050 }, { "epoch": 1.2085953878406708, "grad_norm": 1.8386211395263672, "learning_rate": 1.9786425576519917e-05, "loss": 0.0891, "step": 23060 }, { "epoch": 1.209119496855346, "grad_norm": 1.8787667751312256, "learning_rate": 1.977332285115304e-05, "loss": 0.0927, "step": 23070 }, { "epoch": 1.209643605870021, "grad_norm": 0.8088480830192566, "learning_rate": 1.9760220125786164e-05, "loss": 0.0783, "step": 23080 }, { "epoch": 1.210167714884696, "grad_norm": 1.171616792678833, "learning_rate": 1.9747117400419287e-05, "loss": 0.0905, "step": 23090 }, { "epoch": 1.2106918238993711, "grad_norm": 2.413273572921753, "learning_rate": 1.973401467505241e-05, "loss": 0.0936, "step": 23100 }, { "epoch": 1.2112159329140462, "grad_norm": 2.519268035888672, "learning_rate": 1.9720911949685537e-05, "loss": 0.1114, "step": 23110 }, { "epoch": 1.2117400419287212, "grad_norm": 2.1220171451568604, "learning_rate": 1.970780922431866e-05, "loss": 0.0773, "step": 23120 }, { "epoch": 1.2122641509433962, "grad_norm": 2.4642395973205566, "learning_rate": 1.969470649895178e-05, "loss": 0.0851, "step": 23130 }, { "epoch": 1.2127882599580713, "grad_norm": 0.9276864528656006, "learning_rate": 1.9681603773584907e-05, "loss": 0.0707, "step": 23140 }, { "epoch": 1.2133123689727463, "grad_norm": 0.9606568217277527, "learning_rate": 1.966850104821803e-05, "loss": 0.0991, "step": 23150 }, { "epoch": 1.2138364779874213, "grad_norm": 2.05163311958313, "learning_rate": 1.9655398322851154e-05, "loss": 0.0769, "step": 23160 }, { "epoch": 1.2143605870020964, "grad_norm": 1.6464990377426147, "learning_rate": 1.9642295597484278e-05, "loss": 0.1088, "step": 23170 }, { "epoch": 1.2148846960167714, "grad_norm": 3.4842240810394287, "learning_rate": 1.96291928721174e-05, "loss": 0.0954, "step": 23180 }, { "epoch": 1.2154088050314464, "grad_norm": 1.8171097040176392, "learning_rate": 1.9616090146750524e-05, "loss": 0.1078, "step": 23190 }, { "epoch": 1.2159329140461215, "grad_norm": 1.6532925367355347, "learning_rate": 1.9602987421383648e-05, "loss": 0.0726, "step": 23200 }, { "epoch": 1.2164570230607967, "grad_norm": 1.8835959434509277, "learning_rate": 1.958988469601677e-05, "loss": 0.1119, "step": 23210 }, { "epoch": 1.2169811320754718, "grad_norm": 0.9576271176338196, "learning_rate": 1.9576781970649898e-05, "loss": 0.0839, "step": 23220 }, { "epoch": 1.2175052410901468, "grad_norm": 1.8635886907577515, "learning_rate": 1.956367924528302e-05, "loss": 0.076, "step": 23230 }, { "epoch": 1.2180293501048218, "grad_norm": 0.8350629806518555, "learning_rate": 1.9550576519916145e-05, "loss": 0.0756, "step": 23240 }, { "epoch": 1.2185534591194969, "grad_norm": 1.33456552028656, "learning_rate": 1.9537473794549265e-05, "loss": 0.0774, "step": 23250 }, { "epoch": 1.219077568134172, "grad_norm": 2.3194878101348877, "learning_rate": 1.952437106918239e-05, "loss": 0.0833, "step": 23260 }, { "epoch": 1.219601677148847, "grad_norm": 2.020064115524292, "learning_rate": 1.9511268343815515e-05, "loss": 0.0428, "step": 23270 }, { "epoch": 1.220125786163522, "grad_norm": 1.8238146305084229, "learning_rate": 1.9498165618448638e-05, "loss": 0.0772, "step": 23280 }, { "epoch": 1.220649895178197, "grad_norm": 1.7014821767807007, "learning_rate": 1.948506289308176e-05, "loss": 0.084, "step": 23290 }, { "epoch": 1.221174004192872, "grad_norm": 1.2988556623458862, "learning_rate": 1.9471960167714888e-05, "loss": 0.0781, "step": 23300 }, { "epoch": 1.2216981132075473, "grad_norm": 1.7309902906417847, "learning_rate": 1.9458857442348008e-05, "loss": 0.0977, "step": 23310 }, { "epoch": 1.2222222222222223, "grad_norm": 1.721279263496399, "learning_rate": 1.944575471698113e-05, "loss": 0.0726, "step": 23320 }, { "epoch": 1.2227463312368974, "grad_norm": 1.762795090675354, "learning_rate": 1.9432651991614255e-05, "loss": 0.0805, "step": 23330 }, { "epoch": 1.2232704402515724, "grad_norm": 1.2574833631515503, "learning_rate": 1.9419549266247382e-05, "loss": 0.0626, "step": 23340 }, { "epoch": 1.2237945492662474, "grad_norm": 1.1116647720336914, "learning_rate": 1.9406446540880505e-05, "loss": 0.079, "step": 23350 }, { "epoch": 1.2243186582809225, "grad_norm": 1.4834880828857422, "learning_rate": 1.939334381551363e-05, "loss": 0.0835, "step": 23360 }, { "epoch": 1.2248427672955975, "grad_norm": 1.7493497133255005, "learning_rate": 1.938024109014675e-05, "loss": 0.0874, "step": 23370 }, { "epoch": 1.2253668763102725, "grad_norm": 1.5207605361938477, "learning_rate": 1.9367138364779875e-05, "loss": 0.0804, "step": 23380 }, { "epoch": 1.2258909853249476, "grad_norm": 1.1050654649734497, "learning_rate": 1.9354035639413e-05, "loss": 0.0602, "step": 23390 }, { "epoch": 1.2264150943396226, "grad_norm": 1.671478033065796, "learning_rate": 1.9340932914046122e-05, "loss": 0.0931, "step": 23400 }, { "epoch": 1.2269392033542976, "grad_norm": 2.356992721557617, "learning_rate": 1.9327830188679246e-05, "loss": 0.0736, "step": 23410 }, { "epoch": 1.2274633123689727, "grad_norm": 1.6068631410598755, "learning_rate": 1.9314727463312372e-05, "loss": 0.0843, "step": 23420 }, { "epoch": 1.2279874213836477, "grad_norm": 1.5799087285995483, "learning_rate": 1.9301624737945492e-05, "loss": 0.0627, "step": 23430 }, { "epoch": 1.2285115303983227, "grad_norm": 1.4050095081329346, "learning_rate": 1.9288522012578616e-05, "loss": 0.0867, "step": 23440 }, { "epoch": 1.229035639412998, "grad_norm": 1.1417391300201416, "learning_rate": 1.927541928721174e-05, "loss": 0.082, "step": 23450 }, { "epoch": 1.229559748427673, "grad_norm": 1.4258673191070557, "learning_rate": 1.9262316561844866e-05, "loss": 0.0891, "step": 23460 }, { "epoch": 1.230083857442348, "grad_norm": 1.9718081951141357, "learning_rate": 1.924921383647799e-05, "loss": 0.1071, "step": 23470 }, { "epoch": 1.230607966457023, "grad_norm": 1.9824044704437256, "learning_rate": 1.9236111111111113e-05, "loss": 0.0804, "step": 23480 }, { "epoch": 1.2311320754716981, "grad_norm": 1.7826849222183228, "learning_rate": 1.9223008385744236e-05, "loss": 0.0713, "step": 23490 }, { "epoch": 1.2316561844863732, "grad_norm": 1.1761234998703003, "learning_rate": 1.920990566037736e-05, "loss": 0.0805, "step": 23500 }, { "epoch": 1.2321802935010482, "grad_norm": 1.566415548324585, "learning_rate": 1.9196802935010483e-05, "loss": 0.0932, "step": 23510 }, { "epoch": 1.2327044025157232, "grad_norm": 1.5760856866836548, "learning_rate": 1.9183700209643606e-05, "loss": 0.0546, "step": 23520 }, { "epoch": 1.2332285115303983, "grad_norm": 1.3049871921539307, "learning_rate": 1.917059748427673e-05, "loss": 0.0666, "step": 23530 }, { "epoch": 1.2337526205450733, "grad_norm": 1.3135899305343628, "learning_rate": 1.9157494758909856e-05, "loss": 0.1063, "step": 23540 }, { "epoch": 1.2342767295597485, "grad_norm": 2.9520959854125977, "learning_rate": 1.9144392033542976e-05, "loss": 0.1013, "step": 23550 }, { "epoch": 1.2348008385744236, "grad_norm": 1.6666831970214844, "learning_rate": 1.91312893081761e-05, "loss": 0.0893, "step": 23560 }, { "epoch": 1.2353249475890986, "grad_norm": 1.646226406097412, "learning_rate": 1.9118186582809226e-05, "loss": 0.093, "step": 23570 }, { "epoch": 1.2358490566037736, "grad_norm": 1.1616284847259521, "learning_rate": 1.910508385744235e-05, "loss": 0.0635, "step": 23580 }, { "epoch": 1.2363731656184487, "grad_norm": 1.1960272789001465, "learning_rate": 1.9091981132075473e-05, "loss": 0.0731, "step": 23590 }, { "epoch": 1.2368972746331237, "grad_norm": 1.1174589395523071, "learning_rate": 1.9078878406708597e-05, "loss": 0.0949, "step": 23600 }, { "epoch": 1.2374213836477987, "grad_norm": 2.366077423095703, "learning_rate": 1.906577568134172e-05, "loss": 0.0673, "step": 23610 }, { "epoch": 1.2379454926624738, "grad_norm": 1.1007031202316284, "learning_rate": 1.9052672955974843e-05, "loss": 0.0662, "step": 23620 }, { "epoch": 1.2384696016771488, "grad_norm": 2.171886682510376, "learning_rate": 1.9039570230607967e-05, "loss": 0.1205, "step": 23630 }, { "epoch": 1.2389937106918238, "grad_norm": 2.20706844329834, "learning_rate": 1.902646750524109e-05, "loss": 0.0727, "step": 23640 }, { "epoch": 1.2395178197064989, "grad_norm": 1.7492408752441406, "learning_rate": 1.9013364779874217e-05, "loss": 0.0795, "step": 23650 }, { "epoch": 1.240041928721174, "grad_norm": 2.009042978286743, "learning_rate": 1.900026205450734e-05, "loss": 0.0908, "step": 23660 }, { "epoch": 1.240566037735849, "grad_norm": 1.2455140352249146, "learning_rate": 1.898715932914046e-05, "loss": 0.073, "step": 23670 }, { "epoch": 1.2410901467505242, "grad_norm": 1.2918400764465332, "learning_rate": 1.8974056603773584e-05, "loss": 0.0594, "step": 23680 }, { "epoch": 1.2416142557651992, "grad_norm": 2.6020805835723877, "learning_rate": 1.896095387840671e-05, "loss": 0.0691, "step": 23690 }, { "epoch": 1.2421383647798743, "grad_norm": 1.4150261878967285, "learning_rate": 1.8947851153039834e-05, "loss": 0.0718, "step": 23700 }, { "epoch": 1.2426624737945493, "grad_norm": 2.6102347373962402, "learning_rate": 1.8934748427672957e-05, "loss": 0.0836, "step": 23710 }, { "epoch": 1.2431865828092243, "grad_norm": 0.48219212889671326, "learning_rate": 1.892164570230608e-05, "loss": 0.0774, "step": 23720 }, { "epoch": 1.2437106918238994, "grad_norm": 1.1697911024093628, "learning_rate": 1.8908542976939204e-05, "loss": 0.0608, "step": 23730 }, { "epoch": 1.2442348008385744, "grad_norm": 1.2900214195251465, "learning_rate": 1.8895440251572327e-05, "loss": 0.0761, "step": 23740 }, { "epoch": 1.2447589098532494, "grad_norm": 1.660508394241333, "learning_rate": 1.888233752620545e-05, "loss": 0.0781, "step": 23750 }, { "epoch": 1.2452830188679245, "grad_norm": 1.1662437915802002, "learning_rate": 1.8869234800838574e-05, "loss": 0.0885, "step": 23760 }, { "epoch": 1.2458071278825995, "grad_norm": 1.7090445756912231, "learning_rate": 1.88561320754717e-05, "loss": 0.0704, "step": 23770 }, { "epoch": 1.2463312368972745, "grad_norm": 1.6148638725280762, "learning_rate": 1.8843029350104824e-05, "loss": 0.0778, "step": 23780 }, { "epoch": 1.2468553459119498, "grad_norm": 1.092518925666809, "learning_rate": 1.8829926624737944e-05, "loss": 0.0762, "step": 23790 }, { "epoch": 1.2473794549266248, "grad_norm": 1.5229153633117676, "learning_rate": 1.8816823899371068e-05, "loss": 0.0889, "step": 23800 }, { "epoch": 1.2479035639412999, "grad_norm": 1.1291695833206177, "learning_rate": 1.8803721174004195e-05, "loss": 0.0985, "step": 23810 }, { "epoch": 1.248427672955975, "grad_norm": 1.2589600086212158, "learning_rate": 1.8790618448637318e-05, "loss": 0.0932, "step": 23820 }, { "epoch": 1.24895178197065, "grad_norm": 1.1041934490203857, "learning_rate": 1.877751572327044e-05, "loss": 0.076, "step": 23830 }, { "epoch": 1.249475890985325, "grad_norm": 1.4280002117156982, "learning_rate": 1.8764412997903565e-05, "loss": 0.0906, "step": 23840 }, { "epoch": 1.25, "grad_norm": 1.893744945526123, "learning_rate": 1.8751310272536688e-05, "loss": 0.0997, "step": 23850 }, { "epoch": 1.250524109014675, "grad_norm": 1.3034499883651733, "learning_rate": 1.873820754716981e-05, "loss": 0.0927, "step": 23860 }, { "epoch": 1.25104821802935, "grad_norm": 1.0884954929351807, "learning_rate": 1.8725104821802935e-05, "loss": 0.0619, "step": 23870 }, { "epoch": 1.251572327044025, "grad_norm": 1.8677185773849487, "learning_rate": 1.8712002096436058e-05, "loss": 0.0795, "step": 23880 }, { "epoch": 1.2520964360587001, "grad_norm": 1.7047679424285889, "learning_rate": 1.8698899371069185e-05, "loss": 0.0926, "step": 23890 }, { "epoch": 1.2526205450733752, "grad_norm": 1.151652216911316, "learning_rate": 1.868579664570231e-05, "loss": 0.0899, "step": 23900 }, { "epoch": 1.2531446540880502, "grad_norm": 1.0031532049179077, "learning_rate": 1.867269392033543e-05, "loss": 0.0753, "step": 23910 }, { "epoch": 1.2536687631027252, "grad_norm": 2.4875776767730713, "learning_rate": 1.8659591194968555e-05, "loss": 0.0919, "step": 23920 }, { "epoch": 1.2541928721174005, "grad_norm": 2.6797757148742676, "learning_rate": 1.864648846960168e-05, "loss": 0.0719, "step": 23930 }, { "epoch": 1.2547169811320755, "grad_norm": 1.4403328895568848, "learning_rate": 1.8633385744234802e-05, "loss": 0.102, "step": 23940 }, { "epoch": 1.2552410901467506, "grad_norm": 2.064087390899658, "learning_rate": 1.8620283018867925e-05, "loss": 0.0838, "step": 23950 }, { "epoch": 1.2557651991614256, "grad_norm": 6.239688873291016, "learning_rate": 1.860718029350105e-05, "loss": 0.0872, "step": 23960 }, { "epoch": 1.2562893081761006, "grad_norm": 1.3345998525619507, "learning_rate": 1.8594077568134172e-05, "loss": 0.0869, "step": 23970 }, { "epoch": 1.2568134171907757, "grad_norm": 1.0408798456192017, "learning_rate": 1.8580974842767295e-05, "loss": 0.0642, "step": 23980 }, { "epoch": 1.2573375262054507, "grad_norm": 0.3426121175289154, "learning_rate": 1.856787211740042e-05, "loss": 0.0875, "step": 23990 }, { "epoch": 1.2578616352201257, "grad_norm": 1.971558690071106, "learning_rate": 1.8554769392033546e-05, "loss": 0.0705, "step": 24000 }, { "epoch": 1.2578616352201257, "eval_loss": 0.2765878736972809, "eval_runtime": 267.3744, "eval_samples_per_second": 7.446, "eval_steps_per_second": 1.242, "step": 24000 }, { "epoch": 1.2583857442348008, "grad_norm": 1.6661800146102905, "learning_rate": 1.854166666666667e-05, "loss": 0.0872, "step": 24010 }, { "epoch": 1.258909853249476, "grad_norm": 1.5502612590789795, "learning_rate": 1.8528563941299792e-05, "loss": 0.0711, "step": 24020 }, { "epoch": 1.259433962264151, "grad_norm": 2.2991933822631836, "learning_rate": 1.8515461215932912e-05, "loss": 0.0798, "step": 24030 }, { "epoch": 1.259958071278826, "grad_norm": 1.4546477794647217, "learning_rate": 1.850235849056604e-05, "loss": 0.0913, "step": 24040 }, { "epoch": 1.2604821802935011, "grad_norm": 0.8518606424331665, "learning_rate": 1.8489255765199163e-05, "loss": 0.0892, "step": 24050 }, { "epoch": 1.2610062893081762, "grad_norm": 0.5437707901000977, "learning_rate": 1.8476153039832286e-05, "loss": 0.0515, "step": 24060 }, { "epoch": 1.2615303983228512, "grad_norm": 2.1086854934692383, "learning_rate": 1.846305031446541e-05, "loss": 0.0687, "step": 24070 }, { "epoch": 1.2620545073375262, "grad_norm": 1.66475248336792, "learning_rate": 1.8449947589098536e-05, "loss": 0.0899, "step": 24080 }, { "epoch": 1.2625786163522013, "grad_norm": 1.212482213973999, "learning_rate": 1.8436844863731656e-05, "loss": 0.0922, "step": 24090 }, { "epoch": 1.2631027253668763, "grad_norm": 1.6632353067398071, "learning_rate": 1.842374213836478e-05, "loss": 0.0613, "step": 24100 }, { "epoch": 1.2636268343815513, "grad_norm": 0.9513425827026367, "learning_rate": 1.8410639412997903e-05, "loss": 0.0865, "step": 24110 }, { "epoch": 1.2641509433962264, "grad_norm": 1.3721930980682373, "learning_rate": 1.839753668763103e-05, "loss": 0.0868, "step": 24120 }, { "epoch": 1.2646750524109014, "grad_norm": 0.6984042525291443, "learning_rate": 1.8384433962264153e-05, "loss": 0.0884, "step": 24130 }, { "epoch": 1.2651991614255764, "grad_norm": 1.0884439945220947, "learning_rate": 1.8371331236897276e-05, "loss": 0.0847, "step": 24140 }, { "epoch": 1.2657232704402515, "grad_norm": 0.779158353805542, "learning_rate": 1.83582285115304e-05, "loss": 0.0633, "step": 24150 }, { "epoch": 1.2662473794549265, "grad_norm": 5.8341898918151855, "learning_rate": 1.8345125786163523e-05, "loss": 0.0853, "step": 24160 }, { "epoch": 1.2667714884696017, "grad_norm": 0.932140052318573, "learning_rate": 1.8332023060796647e-05, "loss": 0.1061, "step": 24170 }, { "epoch": 1.2672955974842768, "grad_norm": 1.941718578338623, "learning_rate": 1.831892033542977e-05, "loss": 0.0707, "step": 24180 }, { "epoch": 1.2678197064989518, "grad_norm": 1.0429099798202515, "learning_rate": 1.8305817610062893e-05, "loss": 0.077, "step": 24190 }, { "epoch": 1.2683438155136268, "grad_norm": 2.249950885772705, "learning_rate": 1.829271488469602e-05, "loss": 0.0978, "step": 24200 }, { "epoch": 1.2688679245283019, "grad_norm": 1.7662737369537354, "learning_rate": 1.827961215932914e-05, "loss": 0.0831, "step": 24210 }, { "epoch": 1.269392033542977, "grad_norm": 1.8380582332611084, "learning_rate": 1.8266509433962263e-05, "loss": 0.0703, "step": 24220 }, { "epoch": 1.269916142557652, "grad_norm": 1.9762569665908813, "learning_rate": 1.825340670859539e-05, "loss": 0.0581, "step": 24230 }, { "epoch": 1.270440251572327, "grad_norm": 1.7735848426818848, "learning_rate": 1.8240303983228514e-05, "loss": 0.084, "step": 24240 }, { "epoch": 1.270964360587002, "grad_norm": 1.975049376487732, "learning_rate": 1.8227201257861637e-05, "loss": 0.1041, "step": 24250 }, { "epoch": 1.2714884696016773, "grad_norm": 1.864380955696106, "learning_rate": 1.821409853249476e-05, "loss": 0.0721, "step": 24260 }, { "epoch": 1.2720125786163523, "grad_norm": 1.9561831951141357, "learning_rate": 1.8200995807127884e-05, "loss": 0.0863, "step": 24270 }, { "epoch": 1.2725366876310273, "grad_norm": 2.3323402404785156, "learning_rate": 1.8187893081761007e-05, "loss": 0.0783, "step": 24280 }, { "epoch": 1.2730607966457024, "grad_norm": 1.369814395904541, "learning_rate": 1.817479035639413e-05, "loss": 0.0649, "step": 24290 }, { "epoch": 1.2735849056603774, "grad_norm": 3.996958017349243, "learning_rate": 1.8161687631027254e-05, "loss": 0.104, "step": 24300 }, { "epoch": 1.2741090146750524, "grad_norm": 1.671249508857727, "learning_rate": 1.814858490566038e-05, "loss": 0.0811, "step": 24310 }, { "epoch": 1.2746331236897275, "grad_norm": 1.0019627809524536, "learning_rate": 1.8135482180293504e-05, "loss": 0.0614, "step": 24320 }, { "epoch": 1.2751572327044025, "grad_norm": 2.198514223098755, "learning_rate": 1.8122379454926624e-05, "loss": 0.0905, "step": 24330 }, { "epoch": 1.2756813417190775, "grad_norm": 1.8543412685394287, "learning_rate": 1.8109276729559747e-05, "loss": 0.0843, "step": 24340 }, { "epoch": 1.2762054507337526, "grad_norm": 1.5628409385681152, "learning_rate": 1.8096174004192874e-05, "loss": 0.1025, "step": 24350 }, { "epoch": 1.2767295597484276, "grad_norm": 0.930385947227478, "learning_rate": 1.8083071278825998e-05, "loss": 0.0841, "step": 24360 }, { "epoch": 1.2772536687631026, "grad_norm": 1.0186971426010132, "learning_rate": 1.806996855345912e-05, "loss": 0.0483, "step": 24370 }, { "epoch": 1.2777777777777777, "grad_norm": 1.3639895915985107, "learning_rate": 1.8056865828092244e-05, "loss": 0.0903, "step": 24380 }, { "epoch": 1.2783018867924527, "grad_norm": 1.24062180519104, "learning_rate": 1.8043763102725368e-05, "loss": 0.0912, "step": 24390 }, { "epoch": 1.2788259958071277, "grad_norm": 0.5283358097076416, "learning_rate": 1.803066037735849e-05, "loss": 0.0629, "step": 24400 }, { "epoch": 1.279350104821803, "grad_norm": 1.4935684204101562, "learning_rate": 1.8017557651991615e-05, "loss": 0.0711, "step": 24410 }, { "epoch": 1.279874213836478, "grad_norm": 4.116866111755371, "learning_rate": 1.8004454926624738e-05, "loss": 0.083, "step": 24420 }, { "epoch": 1.280398322851153, "grad_norm": 2.2552740573883057, "learning_rate": 1.7991352201257865e-05, "loss": 0.0794, "step": 24430 }, { "epoch": 1.280922431865828, "grad_norm": 0.832618772983551, "learning_rate": 1.7978249475890988e-05, "loss": 0.0798, "step": 24440 }, { "epoch": 1.2814465408805031, "grad_norm": 1.6791424751281738, "learning_rate": 1.7965146750524108e-05, "loss": 0.066, "step": 24450 }, { "epoch": 1.2819706498951782, "grad_norm": 1.4778084754943848, "learning_rate": 1.795204402515723e-05, "loss": 0.0689, "step": 24460 }, { "epoch": 1.2824947589098532, "grad_norm": 1.879032850265503, "learning_rate": 1.7938941299790358e-05, "loss": 0.0602, "step": 24470 }, { "epoch": 1.2830188679245282, "grad_norm": 1.6691385507583618, "learning_rate": 1.792583857442348e-05, "loss": 0.0647, "step": 24480 }, { "epoch": 1.2835429769392033, "grad_norm": 1.581830620765686, "learning_rate": 1.7912735849056605e-05, "loss": 0.069, "step": 24490 }, { "epoch": 1.2840670859538785, "grad_norm": 1.7334755659103394, "learning_rate": 1.789963312368973e-05, "loss": 0.0885, "step": 24500 }, { "epoch": 1.2845911949685536, "grad_norm": 1.2646406888961792, "learning_rate": 1.7886530398322852e-05, "loss": 0.0844, "step": 24510 }, { "epoch": 1.2851153039832286, "grad_norm": 1.4649100303649902, "learning_rate": 1.7873427672955975e-05, "loss": 0.072, "step": 24520 }, { "epoch": 1.2856394129979036, "grad_norm": 1.4135819673538208, "learning_rate": 1.78603249475891e-05, "loss": 0.0782, "step": 24530 }, { "epoch": 1.2861635220125787, "grad_norm": 4.786865234375, "learning_rate": 1.7847222222222222e-05, "loss": 0.0969, "step": 24540 }, { "epoch": 1.2866876310272537, "grad_norm": 1.6510841846466064, "learning_rate": 1.783411949685535e-05, "loss": 0.081, "step": 24550 }, { "epoch": 1.2872117400419287, "grad_norm": 1.9891088008880615, "learning_rate": 1.7821016771488472e-05, "loss": 0.0773, "step": 24560 }, { "epoch": 1.2877358490566038, "grad_norm": 1.993718147277832, "learning_rate": 1.7807914046121592e-05, "loss": 0.1071, "step": 24570 }, { "epoch": 1.2882599580712788, "grad_norm": 1.368506669998169, "learning_rate": 1.779481132075472e-05, "loss": 0.0825, "step": 24580 }, { "epoch": 1.2887840670859538, "grad_norm": 2.430133104324341, "learning_rate": 1.7781708595387842e-05, "loss": 0.057, "step": 24590 }, { "epoch": 1.2893081761006289, "grad_norm": 0.8209651708602905, "learning_rate": 1.7768605870020966e-05, "loss": 0.0897, "step": 24600 }, { "epoch": 1.289832285115304, "grad_norm": 1.1338999271392822, "learning_rate": 1.775550314465409e-05, "loss": 0.0922, "step": 24610 }, { "epoch": 1.290356394129979, "grad_norm": 1.7733038663864136, "learning_rate": 1.7742400419287212e-05, "loss": 0.087, "step": 24620 }, { "epoch": 1.290880503144654, "grad_norm": 1.529729962348938, "learning_rate": 1.7729297693920336e-05, "loss": 0.0581, "step": 24630 }, { "epoch": 1.291404612159329, "grad_norm": 1.0193239450454712, "learning_rate": 1.771619496855346e-05, "loss": 0.0697, "step": 24640 }, { "epoch": 1.2919287211740043, "grad_norm": 2.4770092964172363, "learning_rate": 1.7703092243186583e-05, "loss": 0.1183, "step": 24650 }, { "epoch": 1.2924528301886793, "grad_norm": 1.4490543603897095, "learning_rate": 1.768998951781971e-05, "loss": 0.0778, "step": 24660 }, { "epoch": 1.2929769392033543, "grad_norm": 1.66744863986969, "learning_rate": 1.7676886792452833e-05, "loss": 0.0871, "step": 24670 }, { "epoch": 1.2935010482180294, "grad_norm": 3.002897262573242, "learning_rate": 1.7663784067085953e-05, "loss": 0.0937, "step": 24680 }, { "epoch": 1.2940251572327044, "grad_norm": 1.3816642761230469, "learning_rate": 1.7650681341719076e-05, "loss": 0.0908, "step": 24690 }, { "epoch": 1.2945492662473794, "grad_norm": 2.319898843765259, "learning_rate": 1.7637578616352203e-05, "loss": 0.0796, "step": 24700 }, { "epoch": 1.2950733752620545, "grad_norm": 1.7644799947738647, "learning_rate": 1.7624475890985326e-05, "loss": 0.0802, "step": 24710 }, { "epoch": 1.2955974842767295, "grad_norm": 1.809232234954834, "learning_rate": 1.761137316561845e-05, "loss": 0.1001, "step": 24720 }, { "epoch": 1.2961215932914047, "grad_norm": 1.5217998027801514, "learning_rate": 1.7598270440251573e-05, "loss": 0.0938, "step": 24730 }, { "epoch": 1.2966457023060798, "grad_norm": 1.3758409023284912, "learning_rate": 1.7585167714884696e-05, "loss": 0.0849, "step": 24740 }, { "epoch": 1.2971698113207548, "grad_norm": 3.6694116592407227, "learning_rate": 1.757206498951782e-05, "loss": 0.0688, "step": 24750 }, { "epoch": 1.2976939203354299, "grad_norm": 1.149685025215149, "learning_rate": 1.7558962264150943e-05, "loss": 0.0924, "step": 24760 }, { "epoch": 1.2982180293501049, "grad_norm": 4.056723117828369, "learning_rate": 1.7545859538784067e-05, "loss": 0.0835, "step": 24770 }, { "epoch": 1.29874213836478, "grad_norm": 1.304752230644226, "learning_rate": 1.7532756813417193e-05, "loss": 0.0758, "step": 24780 }, { "epoch": 1.299266247379455, "grad_norm": 1.4952155351638794, "learning_rate": 1.7519654088050317e-05, "loss": 0.101, "step": 24790 }, { "epoch": 1.29979035639413, "grad_norm": 2.3801004886627197, "learning_rate": 1.7506551362683437e-05, "loss": 0.0961, "step": 24800 }, { "epoch": 1.300314465408805, "grad_norm": 2.32028865814209, "learning_rate": 1.749344863731656e-05, "loss": 0.0858, "step": 24810 }, { "epoch": 1.30083857442348, "grad_norm": 1.2908284664154053, "learning_rate": 1.7480345911949687e-05, "loss": 0.0753, "step": 24820 }, { "epoch": 1.301362683438155, "grad_norm": 1.50250244140625, "learning_rate": 1.746724318658281e-05, "loss": 0.0786, "step": 24830 }, { "epoch": 1.3018867924528301, "grad_norm": 1.1071337461471558, "learning_rate": 1.7454140461215934e-05, "loss": 0.0938, "step": 24840 }, { "epoch": 1.3024109014675052, "grad_norm": 1.892912745475769, "learning_rate": 1.7441037735849057e-05, "loss": 0.0845, "step": 24850 }, { "epoch": 1.3029350104821802, "grad_norm": 1.4982322454452515, "learning_rate": 1.742793501048218e-05, "loss": 0.0852, "step": 24860 }, { "epoch": 1.3034591194968552, "grad_norm": 1.102202296257019, "learning_rate": 1.7414832285115304e-05, "loss": 0.0562, "step": 24870 }, { "epoch": 1.3039832285115305, "grad_norm": 1.2208045721054077, "learning_rate": 1.7401729559748427e-05, "loss": 0.0956, "step": 24880 }, { "epoch": 1.3045073375262055, "grad_norm": 1.5137170553207397, "learning_rate": 1.738862683438155e-05, "loss": 0.0797, "step": 24890 }, { "epoch": 1.3050314465408805, "grad_norm": 2.021153688430786, "learning_rate": 1.7375524109014677e-05, "loss": 0.0793, "step": 24900 }, { "epoch": 1.3055555555555556, "grad_norm": 1.4623279571533203, "learning_rate": 1.73624213836478e-05, "loss": 0.0784, "step": 24910 }, { "epoch": 1.3060796645702306, "grad_norm": 2.375816583633423, "learning_rate": 1.734931865828092e-05, "loss": 0.1034, "step": 24920 }, { "epoch": 1.3066037735849056, "grad_norm": 4.277223110198975, "learning_rate": 1.7336215932914048e-05, "loss": 0.1049, "step": 24930 }, { "epoch": 1.3071278825995807, "grad_norm": 0.9970260262489319, "learning_rate": 1.732311320754717e-05, "loss": 0.0733, "step": 24940 }, { "epoch": 1.3076519916142557, "grad_norm": 0.7952342629432678, "learning_rate": 1.7310010482180294e-05, "loss": 0.0653, "step": 24950 }, { "epoch": 1.3081761006289307, "grad_norm": 2.167339324951172, "learning_rate": 1.7296907756813418e-05, "loss": 0.0685, "step": 24960 }, { "epoch": 1.308700209643606, "grad_norm": 1.8290060758590698, "learning_rate": 1.728380503144654e-05, "loss": 0.0777, "step": 24970 }, { "epoch": 1.309224318658281, "grad_norm": 1.786184310913086, "learning_rate": 1.7270702306079664e-05, "loss": 0.0852, "step": 24980 }, { "epoch": 1.309748427672956, "grad_norm": 0.8602116703987122, "learning_rate": 1.7257599580712788e-05, "loss": 0.078, "step": 24990 }, { "epoch": 1.310272536687631, "grad_norm": 2.8858346939086914, "learning_rate": 1.724449685534591e-05, "loss": 0.089, "step": 25000 }, { "epoch": 1.310272536687631, "eval_loss": 0.2705696225166321, "eval_runtime": 267.3452, "eval_samples_per_second": 7.447, "eval_steps_per_second": 1.242, "step": 25000 }, { "epoch": 1.3107966457023061, "grad_norm": 1.337947130203247, "learning_rate": 1.7231394129979038e-05, "loss": 0.0964, "step": 25010 }, { "epoch": 1.3113207547169812, "grad_norm": 0.7933491468429565, "learning_rate": 1.721829140461216e-05, "loss": 0.0623, "step": 25020 }, { "epoch": 1.3118448637316562, "grad_norm": 1.4174140691757202, "learning_rate": 1.7205188679245285e-05, "loss": 0.0571, "step": 25030 }, { "epoch": 1.3123689727463312, "grad_norm": 1.281467318534851, "learning_rate": 1.7192085953878405e-05, "loss": 0.064, "step": 25040 }, { "epoch": 1.3128930817610063, "grad_norm": 0.8671842217445374, "learning_rate": 1.717898322851153e-05, "loss": 0.0589, "step": 25050 }, { "epoch": 1.3134171907756813, "grad_norm": 1.0511995553970337, "learning_rate": 1.7165880503144655e-05, "loss": 0.0584, "step": 25060 }, { "epoch": 1.3139412997903563, "grad_norm": 1.4137508869171143, "learning_rate": 1.715277777777778e-05, "loss": 0.0886, "step": 25070 }, { "epoch": 1.3144654088050314, "grad_norm": 1.7218070030212402, "learning_rate": 1.7139675052410902e-05, "loss": 0.0705, "step": 25080 }, { "epoch": 1.3149895178197064, "grad_norm": 1.2758680582046509, "learning_rate": 1.712657232704403e-05, "loss": 0.0654, "step": 25090 }, { "epoch": 1.3155136268343814, "grad_norm": 1.9731104373931885, "learning_rate": 1.711346960167715e-05, "loss": 0.0936, "step": 25100 }, { "epoch": 1.3160377358490565, "grad_norm": 1.5953030586242676, "learning_rate": 1.7100366876310272e-05, "loss": 0.0706, "step": 25110 }, { "epoch": 1.3165618448637317, "grad_norm": 2.0026612281799316, "learning_rate": 1.7087264150943395e-05, "loss": 0.0643, "step": 25120 }, { "epoch": 1.3170859538784068, "grad_norm": 2.5946147441864014, "learning_rate": 1.7074161425576522e-05, "loss": 0.0718, "step": 25130 }, { "epoch": 1.3176100628930818, "grad_norm": 2.603461265563965, "learning_rate": 1.7061058700209645e-05, "loss": 0.0671, "step": 25140 }, { "epoch": 1.3181341719077568, "grad_norm": 2.067505359649658, "learning_rate": 1.704795597484277e-05, "loss": 0.0665, "step": 25150 }, { "epoch": 1.3186582809224319, "grad_norm": 2.5327792167663574, "learning_rate": 1.7034853249475892e-05, "loss": 0.0844, "step": 25160 }, { "epoch": 1.319182389937107, "grad_norm": 1.8760923147201538, "learning_rate": 1.7021750524109016e-05, "loss": 0.107, "step": 25170 }, { "epoch": 1.319706498951782, "grad_norm": 1.9224241971969604, "learning_rate": 1.700864779874214e-05, "loss": 0.0915, "step": 25180 }, { "epoch": 1.320230607966457, "grad_norm": 1.2848577499389648, "learning_rate": 1.6995545073375262e-05, "loss": 0.0901, "step": 25190 }, { "epoch": 1.320754716981132, "grad_norm": 1.1218456029891968, "learning_rate": 1.6982442348008386e-05, "loss": 0.0721, "step": 25200 }, { "epoch": 1.3212788259958073, "grad_norm": 1.2322758436203003, "learning_rate": 1.6969339622641513e-05, "loss": 0.0756, "step": 25210 }, { "epoch": 1.3218029350104823, "grad_norm": 1.3583430051803589, "learning_rate": 1.6956236897274633e-05, "loss": 0.0769, "step": 25220 }, { "epoch": 1.3223270440251573, "grad_norm": 1.2304691076278687, "learning_rate": 1.6943134171907756e-05, "loss": 0.1051, "step": 25230 }, { "epoch": 1.3228511530398324, "grad_norm": 0.6938934326171875, "learning_rate": 1.6930031446540883e-05, "loss": 0.0658, "step": 25240 }, { "epoch": 1.3233752620545074, "grad_norm": 1.2099896669387817, "learning_rate": 1.6916928721174006e-05, "loss": 0.1008, "step": 25250 }, { "epoch": 1.3238993710691824, "grad_norm": 1.8550902605056763, "learning_rate": 1.690382599580713e-05, "loss": 0.0807, "step": 25260 }, { "epoch": 1.3244234800838575, "grad_norm": 1.730497121810913, "learning_rate": 1.6890723270440253e-05, "loss": 0.0948, "step": 25270 }, { "epoch": 1.3249475890985325, "grad_norm": 1.381489872932434, "learning_rate": 1.6877620545073376e-05, "loss": 0.0846, "step": 25280 }, { "epoch": 1.3254716981132075, "grad_norm": 2.28286075592041, "learning_rate": 1.68645178197065e-05, "loss": 0.093, "step": 25290 }, { "epoch": 1.3259958071278826, "grad_norm": 0.8911481499671936, "learning_rate": 1.6851415094339623e-05, "loss": 0.068, "step": 25300 }, { "epoch": 1.3265199161425576, "grad_norm": 1.4973480701446533, "learning_rate": 1.6838312368972746e-05, "loss": 0.0768, "step": 25310 }, { "epoch": 1.3270440251572326, "grad_norm": 1.591711163520813, "learning_rate": 1.6825209643605873e-05, "loss": 0.0908, "step": 25320 }, { "epoch": 1.3275681341719077, "grad_norm": 2.0032050609588623, "learning_rate": 1.6812106918238997e-05, "loss": 0.0747, "step": 25330 }, { "epoch": 1.3280922431865827, "grad_norm": 0.906340479850769, "learning_rate": 1.6799004192872117e-05, "loss": 0.0652, "step": 25340 }, { "epoch": 1.3286163522012577, "grad_norm": 1.9054814577102661, "learning_rate": 1.678590146750524e-05, "loss": 0.0761, "step": 25350 }, { "epoch": 1.329140461215933, "grad_norm": 0.8013094067573547, "learning_rate": 1.6772798742138367e-05, "loss": 0.0736, "step": 25360 }, { "epoch": 1.329664570230608, "grad_norm": 1.0076137781143188, "learning_rate": 1.675969601677149e-05, "loss": 0.0819, "step": 25370 }, { "epoch": 1.330188679245283, "grad_norm": 1.4720983505249023, "learning_rate": 1.6746593291404613e-05, "loss": 0.0625, "step": 25380 }, { "epoch": 1.330712788259958, "grad_norm": 1.1966739892959595, "learning_rate": 1.6733490566037737e-05, "loss": 0.0645, "step": 25390 }, { "epoch": 1.3312368972746331, "grad_norm": 1.5404307842254639, "learning_rate": 1.672038784067086e-05, "loss": 0.0765, "step": 25400 }, { "epoch": 1.3317610062893082, "grad_norm": 1.782375693321228, "learning_rate": 1.6707285115303984e-05, "loss": 0.0719, "step": 25410 }, { "epoch": 1.3322851153039832, "grad_norm": 1.7037994861602783, "learning_rate": 1.6694182389937107e-05, "loss": 0.0644, "step": 25420 }, { "epoch": 1.3328092243186582, "grad_norm": 1.1350616216659546, "learning_rate": 1.668107966457023e-05, "loss": 0.0615, "step": 25430 }, { "epoch": 1.3333333333333333, "grad_norm": 1.5969865322113037, "learning_rate": 1.6667976939203357e-05, "loss": 0.0851, "step": 25440 }, { "epoch": 1.3338574423480085, "grad_norm": 2.3378772735595703, "learning_rate": 1.665487421383648e-05, "loss": 0.0882, "step": 25450 }, { "epoch": 1.3343815513626835, "grad_norm": 1.8046290874481201, "learning_rate": 1.66417714884696e-05, "loss": 0.0719, "step": 25460 }, { "epoch": 1.3349056603773586, "grad_norm": 1.4188587665557861, "learning_rate": 1.6628668763102724e-05, "loss": 0.085, "step": 25470 }, { "epoch": 1.3354297693920336, "grad_norm": 1.3640601634979248, "learning_rate": 1.661556603773585e-05, "loss": 0.0713, "step": 25480 }, { "epoch": 1.3359538784067087, "grad_norm": 1.4131982326507568, "learning_rate": 1.6602463312368974e-05, "loss": 0.0978, "step": 25490 }, { "epoch": 1.3364779874213837, "grad_norm": 1.2081284523010254, "learning_rate": 1.6589360587002097e-05, "loss": 0.0988, "step": 25500 }, { "epoch": 1.3370020964360587, "grad_norm": 1.0797348022460938, "learning_rate": 1.657625786163522e-05, "loss": 0.0692, "step": 25510 }, { "epoch": 1.3375262054507338, "grad_norm": 2.361949920654297, "learning_rate": 1.6563155136268344e-05, "loss": 0.0553, "step": 25520 }, { "epoch": 1.3380503144654088, "grad_norm": 1.3106904029846191, "learning_rate": 1.6550052410901468e-05, "loss": 0.0726, "step": 25530 }, { "epoch": 1.3385744234800838, "grad_norm": 1.1975557804107666, "learning_rate": 1.653694968553459e-05, "loss": 0.0744, "step": 25540 }, { "epoch": 1.3390985324947589, "grad_norm": 1.9830306768417358, "learning_rate": 1.6523846960167714e-05, "loss": 0.0855, "step": 25550 }, { "epoch": 1.3396226415094339, "grad_norm": 1.101518988609314, "learning_rate": 1.651074423480084e-05, "loss": 0.0719, "step": 25560 }, { "epoch": 1.340146750524109, "grad_norm": 2.719447612762451, "learning_rate": 1.6497641509433965e-05, "loss": 0.0897, "step": 25570 }, { "epoch": 1.340670859538784, "grad_norm": 2.419497013092041, "learning_rate": 1.6484538784067085e-05, "loss": 0.0667, "step": 25580 }, { "epoch": 1.341194968553459, "grad_norm": 1.612613558769226, "learning_rate": 1.647143605870021e-05, "loss": 0.0958, "step": 25590 }, { "epoch": 1.3417190775681342, "grad_norm": 1.1840425729751587, "learning_rate": 1.6458333333333335e-05, "loss": 0.0588, "step": 25600 }, { "epoch": 1.3422431865828093, "grad_norm": 1.41465163230896, "learning_rate": 1.6445230607966458e-05, "loss": 0.0773, "step": 25610 }, { "epoch": 1.3427672955974843, "grad_norm": 1.5764625072479248, "learning_rate": 1.643212788259958e-05, "loss": 0.0812, "step": 25620 }, { "epoch": 1.3432914046121593, "grad_norm": 1.6776331663131714, "learning_rate": 1.6419025157232705e-05, "loss": 0.0829, "step": 25630 }, { "epoch": 1.3438155136268344, "grad_norm": 1.4592304229736328, "learning_rate": 1.6405922431865828e-05, "loss": 0.1036, "step": 25640 }, { "epoch": 1.3443396226415094, "grad_norm": 1.5961178541183472, "learning_rate": 1.639281970649895e-05, "loss": 0.083, "step": 25650 }, { "epoch": 1.3448637316561844, "grad_norm": 1.031274437904358, "learning_rate": 1.6379716981132075e-05, "loss": 0.1249, "step": 25660 }, { "epoch": 1.3453878406708595, "grad_norm": 1.7729008197784424, "learning_rate": 1.6366614255765202e-05, "loss": 0.0676, "step": 25670 }, { "epoch": 1.3459119496855345, "grad_norm": 1.2640308141708374, "learning_rate": 1.6353511530398325e-05, "loss": 0.0984, "step": 25680 }, { "epoch": 1.3464360587002098, "grad_norm": 1.1028263568878174, "learning_rate": 1.634040880503145e-05, "loss": 0.0775, "step": 25690 }, { "epoch": 1.3469601677148848, "grad_norm": 1.7719197273254395, "learning_rate": 1.632730607966457e-05, "loss": 0.0797, "step": 25700 }, { "epoch": 1.3474842767295598, "grad_norm": 1.8258262872695923, "learning_rate": 1.6314203354297695e-05, "loss": 0.0782, "step": 25710 }, { "epoch": 1.3480083857442349, "grad_norm": 0.8588356971740723, "learning_rate": 1.630110062893082e-05, "loss": 0.0887, "step": 25720 }, { "epoch": 1.34853249475891, "grad_norm": 1.2232142686843872, "learning_rate": 1.6287997903563942e-05, "loss": 0.0933, "step": 25730 }, { "epoch": 1.349056603773585, "grad_norm": 2.0187501907348633, "learning_rate": 1.6274895178197065e-05, "loss": 0.0824, "step": 25740 }, { "epoch": 1.34958071278826, "grad_norm": 1.4330048561096191, "learning_rate": 1.6261792452830192e-05, "loss": 0.0874, "step": 25750 }, { "epoch": 1.350104821802935, "grad_norm": 2.3740146160125732, "learning_rate": 1.6248689727463312e-05, "loss": 0.0845, "step": 25760 }, { "epoch": 1.35062893081761, "grad_norm": 1.3695255517959595, "learning_rate": 1.6235587002096436e-05, "loss": 0.0757, "step": 25770 }, { "epoch": 1.351153039832285, "grad_norm": 0.8165902495384216, "learning_rate": 1.622248427672956e-05, "loss": 0.0834, "step": 25780 }, { "epoch": 1.35167714884696, "grad_norm": 1.2380322217941284, "learning_rate": 1.6209381551362686e-05, "loss": 0.0839, "step": 25790 }, { "epoch": 1.3522012578616351, "grad_norm": 1.3144499063491821, "learning_rate": 1.619627882599581e-05, "loss": 0.0785, "step": 25800 }, { "epoch": 1.3527253668763102, "grad_norm": 2.314235210418701, "learning_rate": 1.6183176100628933e-05, "loss": 0.0764, "step": 25810 }, { "epoch": 1.3532494758909852, "grad_norm": 1.4942843914031982, "learning_rate": 1.6170073375262053e-05, "loss": 0.0837, "step": 25820 }, { "epoch": 1.3537735849056602, "grad_norm": 1.46351158618927, "learning_rate": 1.615697064989518e-05, "loss": 0.0997, "step": 25830 }, { "epoch": 1.3542976939203355, "grad_norm": 1.4346762895584106, "learning_rate": 1.6143867924528303e-05, "loss": 0.0796, "step": 25840 }, { "epoch": 1.3548218029350105, "grad_norm": 1.5181978940963745, "learning_rate": 1.6130765199161426e-05, "loss": 0.0859, "step": 25850 }, { "epoch": 1.3553459119496856, "grad_norm": 0.9744179248809814, "learning_rate": 1.611766247379455e-05, "loss": 0.0965, "step": 25860 }, { "epoch": 1.3558700209643606, "grad_norm": 1.5921379327774048, "learning_rate": 1.6104559748427676e-05, "loss": 0.08, "step": 25870 }, { "epoch": 1.3563941299790356, "grad_norm": 1.3599023818969727, "learning_rate": 1.6091457023060796e-05, "loss": 0.0823, "step": 25880 }, { "epoch": 1.3569182389937107, "grad_norm": 4.234800338745117, "learning_rate": 1.607835429769392e-05, "loss": 0.1235, "step": 25890 }, { "epoch": 1.3574423480083857, "grad_norm": 1.4063974618911743, "learning_rate": 1.6065251572327043e-05, "loss": 0.1067, "step": 25900 }, { "epoch": 1.3579664570230607, "grad_norm": 2.220942974090576, "learning_rate": 1.605214884696017e-05, "loss": 0.1032, "step": 25910 }, { "epoch": 1.3584905660377358, "grad_norm": 1.3111824989318848, "learning_rate": 1.6039046121593293e-05, "loss": 0.07, "step": 25920 }, { "epoch": 1.359014675052411, "grad_norm": 1.5388332605361938, "learning_rate": 1.6025943396226417e-05, "loss": 0.0916, "step": 25930 }, { "epoch": 1.359538784067086, "grad_norm": 1.8049447536468506, "learning_rate": 1.601284067085954e-05, "loss": 0.075, "step": 25940 }, { "epoch": 1.360062893081761, "grad_norm": 0.8838228583335876, "learning_rate": 1.5999737945492663e-05, "loss": 0.0648, "step": 25950 }, { "epoch": 1.3605870020964361, "grad_norm": 1.5289140939712524, "learning_rate": 1.5986635220125787e-05, "loss": 0.0704, "step": 25960 }, { "epoch": 1.3611111111111112, "grad_norm": 1.8726352453231812, "learning_rate": 1.597353249475891e-05, "loss": 0.0682, "step": 25970 }, { "epoch": 1.3616352201257862, "grad_norm": 2.143695116043091, "learning_rate": 1.5960429769392034e-05, "loss": 0.1031, "step": 25980 }, { "epoch": 1.3621593291404612, "grad_norm": 1.7918883562088013, "learning_rate": 1.594732704402516e-05, "loss": 0.0792, "step": 25990 }, { "epoch": 1.3626834381551363, "grad_norm": 1.1076338291168213, "learning_rate": 1.593422431865828e-05, "loss": 0.0886, "step": 26000 }, { "epoch": 1.3626834381551363, "eval_loss": 0.2706705927848816, "eval_runtime": 268.5851, "eval_samples_per_second": 7.413, "eval_steps_per_second": 1.236, "step": 26000 }, { "epoch": 1.3632075471698113, "grad_norm": 1.611303448677063, "learning_rate": 1.5921121593291404e-05, "loss": 0.0804, "step": 26010 }, { "epoch": 1.3637316561844863, "grad_norm": 1.3968373537063599, "learning_rate": 1.590801886792453e-05, "loss": 0.0903, "step": 26020 }, { "epoch": 1.3642557651991614, "grad_norm": 6.171770095825195, "learning_rate": 1.5894916142557654e-05, "loss": 0.0706, "step": 26030 }, { "epoch": 1.3647798742138364, "grad_norm": 1.0653380155563354, "learning_rate": 1.5881813417190777e-05, "loss": 0.0819, "step": 26040 }, { "epoch": 1.3653039832285114, "grad_norm": 2.0599710941314697, "learning_rate": 1.58687106918239e-05, "loss": 0.0788, "step": 26050 }, { "epoch": 1.3658280922431865, "grad_norm": 3.233006477355957, "learning_rate": 1.5855607966457024e-05, "loss": 0.0782, "step": 26060 }, { "epoch": 1.3663522012578615, "grad_norm": 1.5495176315307617, "learning_rate": 1.5842505241090147e-05, "loss": 0.1035, "step": 26070 }, { "epoch": 1.3668763102725368, "grad_norm": 0.6416164636611938, "learning_rate": 1.582940251572327e-05, "loss": 0.0433, "step": 26080 }, { "epoch": 1.3674004192872118, "grad_norm": 1.9599913358688354, "learning_rate": 1.5816299790356394e-05, "loss": 0.074, "step": 26090 }, { "epoch": 1.3679245283018868, "grad_norm": 1.8246488571166992, "learning_rate": 1.580319706498952e-05, "loss": 0.1008, "step": 26100 }, { "epoch": 1.3684486373165619, "grad_norm": 1.8829816579818726, "learning_rate": 1.5790094339622644e-05, "loss": 0.0777, "step": 26110 }, { "epoch": 1.368972746331237, "grad_norm": 11.83426570892334, "learning_rate": 1.5776991614255764e-05, "loss": 0.0897, "step": 26120 }, { "epoch": 1.369496855345912, "grad_norm": 1.663411021232605, "learning_rate": 1.5763888888888888e-05, "loss": 0.0669, "step": 26130 }, { "epoch": 1.370020964360587, "grad_norm": 1.9873604774475098, "learning_rate": 1.5750786163522014e-05, "loss": 0.0922, "step": 26140 }, { "epoch": 1.370545073375262, "grad_norm": 1.4869024753570557, "learning_rate": 1.5737683438155138e-05, "loss": 0.0694, "step": 26150 }, { "epoch": 1.371069182389937, "grad_norm": 1.3856655359268188, "learning_rate": 1.572458071278826e-05, "loss": 0.0631, "step": 26160 }, { "epoch": 1.3715932914046123, "grad_norm": 1.4655511379241943, "learning_rate": 1.5711477987421385e-05, "loss": 0.0503, "step": 26170 }, { "epoch": 1.3721174004192873, "grad_norm": 1.414749026298523, "learning_rate": 1.5698375262054508e-05, "loss": 0.0912, "step": 26180 }, { "epoch": 1.3726415094339623, "grad_norm": 1.7459259033203125, "learning_rate": 1.568527253668763e-05, "loss": 0.0796, "step": 26190 }, { "epoch": 1.3731656184486374, "grad_norm": 1.1313443183898926, "learning_rate": 1.5672169811320755e-05, "loss": 0.0664, "step": 26200 }, { "epoch": 1.3736897274633124, "grad_norm": 1.0263671875, "learning_rate": 1.5659067085953878e-05, "loss": 0.0659, "step": 26210 }, { "epoch": 1.3742138364779874, "grad_norm": 1.0881389379501343, "learning_rate": 1.5645964360587005e-05, "loss": 0.0584, "step": 26220 }, { "epoch": 1.3747379454926625, "grad_norm": 1.92306387424469, "learning_rate": 1.563286163522013e-05, "loss": 0.0642, "step": 26230 }, { "epoch": 1.3752620545073375, "grad_norm": 1.1980657577514648, "learning_rate": 1.561975890985325e-05, "loss": 0.1216, "step": 26240 }, { "epoch": 1.3757861635220126, "grad_norm": 4.562958717346191, "learning_rate": 1.5606656184486375e-05, "loss": 0.0944, "step": 26250 }, { "epoch": 1.3763102725366876, "grad_norm": 1.993789553642273, "learning_rate": 1.55935534591195e-05, "loss": 0.0762, "step": 26260 }, { "epoch": 1.3768343815513626, "grad_norm": 1.6461892127990723, "learning_rate": 1.5580450733752622e-05, "loss": 0.084, "step": 26270 }, { "epoch": 1.3773584905660377, "grad_norm": 1.9939159154891968, "learning_rate": 1.5567348008385745e-05, "loss": 0.0708, "step": 26280 }, { "epoch": 1.3778825995807127, "grad_norm": 1.601395606994629, "learning_rate": 1.555424528301887e-05, "loss": 0.0748, "step": 26290 }, { "epoch": 1.3784067085953877, "grad_norm": 1.2121330499649048, "learning_rate": 1.5541142557651992e-05, "loss": 0.091, "step": 26300 }, { "epoch": 1.378930817610063, "grad_norm": 1.6086260080337524, "learning_rate": 1.5528039832285115e-05, "loss": 0.0841, "step": 26310 }, { "epoch": 1.379454926624738, "grad_norm": 1.4790680408477783, "learning_rate": 1.551493710691824e-05, "loss": 0.0791, "step": 26320 }, { "epoch": 1.379979035639413, "grad_norm": 1.49806547164917, "learning_rate": 1.5501834381551366e-05, "loss": 0.0889, "step": 26330 }, { "epoch": 1.380503144654088, "grad_norm": 1.4938971996307373, "learning_rate": 1.548873165618449e-05, "loss": 0.0731, "step": 26340 }, { "epoch": 1.381027253668763, "grad_norm": 1.3213557004928589, "learning_rate": 1.547562893081761e-05, "loss": 0.0913, "step": 26350 }, { "epoch": 1.3815513626834381, "grad_norm": 2.0190086364746094, "learning_rate": 1.5462526205450732e-05, "loss": 0.0914, "step": 26360 }, { "epoch": 1.3820754716981132, "grad_norm": 2.5714008808135986, "learning_rate": 1.544942348008386e-05, "loss": 0.1062, "step": 26370 }, { "epoch": 1.3825995807127882, "grad_norm": 0.9055834412574768, "learning_rate": 1.5436320754716982e-05, "loss": 0.0648, "step": 26380 }, { "epoch": 1.3831236897274632, "grad_norm": 1.8946198225021362, "learning_rate": 1.5423218029350106e-05, "loss": 0.0936, "step": 26390 }, { "epoch": 1.3836477987421385, "grad_norm": 1.461883544921875, "learning_rate": 1.541011530398323e-05, "loss": 0.0617, "step": 26400 }, { "epoch": 1.3841719077568135, "grad_norm": 1.4220163822174072, "learning_rate": 1.5397012578616353e-05, "loss": 0.0623, "step": 26410 }, { "epoch": 1.3846960167714886, "grad_norm": 1.1401937007904053, "learning_rate": 1.5383909853249476e-05, "loss": 0.0677, "step": 26420 }, { "epoch": 1.3852201257861636, "grad_norm": 2.134997844696045, "learning_rate": 1.53708071278826e-05, "loss": 0.0569, "step": 26430 }, { "epoch": 1.3857442348008386, "grad_norm": 1.6183842420578003, "learning_rate": 1.5357704402515723e-05, "loss": 0.0739, "step": 26440 }, { "epoch": 1.3862683438155137, "grad_norm": 1.135965347290039, "learning_rate": 1.534460167714885e-05, "loss": 0.0897, "step": 26450 }, { "epoch": 1.3867924528301887, "grad_norm": 1.706466794013977, "learning_rate": 1.5331498951781973e-05, "loss": 0.0835, "step": 26460 }, { "epoch": 1.3873165618448637, "grad_norm": 2.735337734222412, "learning_rate": 1.5318396226415093e-05, "loss": 0.0755, "step": 26470 }, { "epoch": 1.3878406708595388, "grad_norm": 1.0616151094436646, "learning_rate": 1.5305293501048216e-05, "loss": 0.077, "step": 26480 }, { "epoch": 1.3883647798742138, "grad_norm": 1.5158982276916504, "learning_rate": 1.5292190775681343e-05, "loss": 0.0793, "step": 26490 }, { "epoch": 1.3888888888888888, "grad_norm": 1.1948658227920532, "learning_rate": 1.5279088050314467e-05, "loss": 0.083, "step": 26500 }, { "epoch": 1.3894129979035639, "grad_norm": 1.381131649017334, "learning_rate": 1.526598532494759e-05, "loss": 0.0961, "step": 26510 }, { "epoch": 1.389937106918239, "grad_norm": 0.8586825132369995, "learning_rate": 1.5252882599580715e-05, "loss": 0.079, "step": 26520 }, { "epoch": 1.390461215932914, "grad_norm": 0.9948753118515015, "learning_rate": 1.5239779874213837e-05, "loss": 0.0779, "step": 26530 }, { "epoch": 1.390985324947589, "grad_norm": 1.9307538270950317, "learning_rate": 1.522667714884696e-05, "loss": 0.0889, "step": 26540 }, { "epoch": 1.3915094339622642, "grad_norm": 0.7694650888442993, "learning_rate": 1.5213574423480083e-05, "loss": 0.0905, "step": 26550 }, { "epoch": 1.3920335429769393, "grad_norm": 1.3657408952713013, "learning_rate": 1.5200471698113209e-05, "loss": 0.0698, "step": 26560 }, { "epoch": 1.3925576519916143, "grad_norm": 1.0885812044143677, "learning_rate": 1.5187368972746332e-05, "loss": 0.056, "step": 26570 }, { "epoch": 1.3930817610062893, "grad_norm": 2.0228495597839355, "learning_rate": 1.5174266247379457e-05, "loss": 0.0907, "step": 26580 }, { "epoch": 1.3936058700209644, "grad_norm": 1.1174840927124023, "learning_rate": 1.5161163522012579e-05, "loss": 0.072, "step": 26590 }, { "epoch": 1.3941299790356394, "grad_norm": 1.6065596342086792, "learning_rate": 1.5148060796645702e-05, "loss": 0.0919, "step": 26600 }, { "epoch": 1.3946540880503144, "grad_norm": 2.008131742477417, "learning_rate": 1.5134958071278827e-05, "loss": 0.0917, "step": 26610 }, { "epoch": 1.3951781970649895, "grad_norm": 2.5115885734558105, "learning_rate": 1.512185534591195e-05, "loss": 0.0812, "step": 26620 }, { "epoch": 1.3957023060796645, "grad_norm": 1.995683193206787, "learning_rate": 1.5108752620545074e-05, "loss": 0.0776, "step": 26630 }, { "epoch": 1.3962264150943398, "grad_norm": 1.5423414707183838, "learning_rate": 1.5095649895178199e-05, "loss": 0.082, "step": 26640 }, { "epoch": 1.3967505241090148, "grad_norm": 1.428948998451233, "learning_rate": 1.508254716981132e-05, "loss": 0.0762, "step": 26650 }, { "epoch": 1.3972746331236898, "grad_norm": 0.9008168578147888, "learning_rate": 1.5069444444444444e-05, "loss": 0.0711, "step": 26660 }, { "epoch": 1.3977987421383649, "grad_norm": 1.6736418008804321, "learning_rate": 1.5056341719077569e-05, "loss": 0.1039, "step": 26670 }, { "epoch": 1.39832285115304, "grad_norm": 1.5996124744415283, "learning_rate": 1.5043238993710693e-05, "loss": 0.0765, "step": 26680 }, { "epoch": 1.398846960167715, "grad_norm": 1.6097357273101807, "learning_rate": 1.5030136268343818e-05, "loss": 0.0807, "step": 26690 }, { "epoch": 1.39937106918239, "grad_norm": 1.1372876167297363, "learning_rate": 1.5017033542976941e-05, "loss": 0.0734, "step": 26700 }, { "epoch": 1.399895178197065, "grad_norm": 2.229891538619995, "learning_rate": 1.5003930817610063e-05, "loss": 0.073, "step": 26710 }, { "epoch": 1.40041928721174, "grad_norm": 1.2906321287155151, "learning_rate": 1.4990828092243186e-05, "loss": 0.0706, "step": 26720 }, { "epoch": 1.400943396226415, "grad_norm": 1.1182019710540771, "learning_rate": 1.4977725366876311e-05, "loss": 0.0711, "step": 26730 }, { "epoch": 1.40146750524109, "grad_norm": 0.8318687677383423, "learning_rate": 1.4964622641509435e-05, "loss": 0.0461, "step": 26740 }, { "epoch": 1.4019916142557651, "grad_norm": 1.398024320602417, "learning_rate": 1.495151991614256e-05, "loss": 0.0826, "step": 26750 }, { "epoch": 1.4025157232704402, "grad_norm": 1.820096731185913, "learning_rate": 1.4938417190775683e-05, "loss": 0.099, "step": 26760 }, { "epoch": 1.4030398322851152, "grad_norm": 1.1774033308029175, "learning_rate": 1.4925314465408805e-05, "loss": 0.0826, "step": 26770 }, { "epoch": 1.4035639412997902, "grad_norm": 2.464606761932373, "learning_rate": 1.4912211740041928e-05, "loss": 0.0767, "step": 26780 }, { "epoch": 1.4040880503144655, "grad_norm": 1.0393379926681519, "learning_rate": 1.4899109014675053e-05, "loss": 0.0867, "step": 26790 }, { "epoch": 1.4046121593291405, "grad_norm": 2.6681644916534424, "learning_rate": 1.4886006289308177e-05, "loss": 0.0782, "step": 26800 }, { "epoch": 1.4051362683438156, "grad_norm": 2.1190297603607178, "learning_rate": 1.4872903563941302e-05, "loss": 0.0724, "step": 26810 }, { "epoch": 1.4056603773584906, "grad_norm": 1.8009228706359863, "learning_rate": 1.4859800838574425e-05, "loss": 0.1171, "step": 26820 }, { "epoch": 1.4061844863731656, "grad_norm": 1.9393926858901978, "learning_rate": 1.4846698113207547e-05, "loss": 0.0797, "step": 26830 }, { "epoch": 1.4067085953878407, "grad_norm": 1.5478100776672363, "learning_rate": 1.483359538784067e-05, "loss": 0.0877, "step": 26840 }, { "epoch": 1.4072327044025157, "grad_norm": 1.9180935621261597, "learning_rate": 1.4820492662473795e-05, "loss": 0.0857, "step": 26850 }, { "epoch": 1.4077568134171907, "grad_norm": 1.5819123983383179, "learning_rate": 1.4807389937106919e-05, "loss": 0.0828, "step": 26860 }, { "epoch": 1.4082809224318658, "grad_norm": 1.8015711307525635, "learning_rate": 1.4794287211740044e-05, "loss": 0.0774, "step": 26870 }, { "epoch": 1.408805031446541, "grad_norm": 1.995409369468689, "learning_rate": 1.4781184486373167e-05, "loss": 0.0711, "step": 26880 }, { "epoch": 1.409329140461216, "grad_norm": 1.8823596239089966, "learning_rate": 1.4768081761006289e-05, "loss": 0.1001, "step": 26890 }, { "epoch": 1.409853249475891, "grad_norm": 1.262195110321045, "learning_rate": 1.4754979035639414e-05, "loss": 0.0789, "step": 26900 }, { "epoch": 1.4103773584905661, "grad_norm": 1.5328476428985596, "learning_rate": 1.4741876310272537e-05, "loss": 0.0712, "step": 26910 }, { "epoch": 1.4109014675052411, "grad_norm": 1.3686559200286865, "learning_rate": 1.472877358490566e-05, "loss": 0.0917, "step": 26920 }, { "epoch": 1.4114255765199162, "grad_norm": 1.4035519361495972, "learning_rate": 1.4715670859538786e-05, "loss": 0.0922, "step": 26930 }, { "epoch": 1.4119496855345912, "grad_norm": 1.396806001663208, "learning_rate": 1.4702568134171909e-05, "loss": 0.0707, "step": 26940 }, { "epoch": 1.4124737945492662, "grad_norm": 1.1253314018249512, "learning_rate": 1.468946540880503e-05, "loss": 0.0688, "step": 26950 }, { "epoch": 1.4129979035639413, "grad_norm": 1.334794282913208, "learning_rate": 1.4676362683438156e-05, "loss": 0.0707, "step": 26960 }, { "epoch": 1.4135220125786163, "grad_norm": 2.1766927242279053, "learning_rate": 1.466325995807128e-05, "loss": 0.0645, "step": 26970 }, { "epoch": 1.4140461215932913, "grad_norm": 1.0007305145263672, "learning_rate": 1.4650157232704404e-05, "loss": 0.0826, "step": 26980 }, { "epoch": 1.4145702306079664, "grad_norm": 1.4699190855026245, "learning_rate": 1.4637054507337528e-05, "loss": 0.0646, "step": 26990 }, { "epoch": 1.4150943396226414, "grad_norm": 1.193003535270691, "learning_rate": 1.4623951781970651e-05, "loss": 0.0765, "step": 27000 }, { "epoch": 1.4150943396226414, "eval_loss": 0.2722998857498169, "eval_runtime": 268.1874, "eval_samples_per_second": 7.424, "eval_steps_per_second": 1.238, "step": 27000 }, { "epoch": 1.4156184486373165, "grad_norm": 1.677714228630066, "learning_rate": 1.4610849056603773e-05, "loss": 0.0874, "step": 27010 }, { "epoch": 1.4161425576519915, "grad_norm": 1.2611427307128906, "learning_rate": 1.4597746331236898e-05, "loss": 0.0665, "step": 27020 }, { "epoch": 1.4166666666666667, "grad_norm": 1.3466812372207642, "learning_rate": 1.4584643605870021e-05, "loss": 0.0796, "step": 27030 }, { "epoch": 1.4171907756813418, "grad_norm": 2.360762119293213, "learning_rate": 1.4571540880503146e-05, "loss": 0.077, "step": 27040 }, { "epoch": 1.4177148846960168, "grad_norm": 1.946505069732666, "learning_rate": 1.455843815513627e-05, "loss": 0.081, "step": 27050 }, { "epoch": 1.4182389937106918, "grad_norm": 1.1225969791412354, "learning_rate": 1.4545335429769395e-05, "loss": 0.0652, "step": 27060 }, { "epoch": 1.4187631027253669, "grad_norm": 2.124420642852783, "learning_rate": 1.4532232704402515e-05, "loss": 0.0759, "step": 27070 }, { "epoch": 1.419287211740042, "grad_norm": 1.7057894468307495, "learning_rate": 1.451912997903564e-05, "loss": 0.0684, "step": 27080 }, { "epoch": 1.419811320754717, "grad_norm": 0.9108951687812805, "learning_rate": 1.4506027253668763e-05, "loss": 0.0756, "step": 27090 }, { "epoch": 1.420335429769392, "grad_norm": 0.9875414371490479, "learning_rate": 1.4492924528301888e-05, "loss": 0.0938, "step": 27100 }, { "epoch": 1.420859538784067, "grad_norm": 0.9708214402198792, "learning_rate": 1.4479821802935012e-05, "loss": 0.0862, "step": 27110 }, { "epoch": 1.4213836477987423, "grad_norm": 1.5186963081359863, "learning_rate": 1.4466719077568137e-05, "loss": 0.0742, "step": 27120 }, { "epoch": 1.4219077568134173, "grad_norm": 1.439297080039978, "learning_rate": 1.4453616352201257e-05, "loss": 0.0982, "step": 27130 }, { "epoch": 1.4224318658280923, "grad_norm": 1.1225006580352783, "learning_rate": 1.4440513626834382e-05, "loss": 0.0685, "step": 27140 }, { "epoch": 1.4229559748427674, "grad_norm": 1.1998291015625, "learning_rate": 1.4427410901467505e-05, "loss": 0.0683, "step": 27150 }, { "epoch": 1.4234800838574424, "grad_norm": 1.8755435943603516, "learning_rate": 1.441430817610063e-05, "loss": 0.1194, "step": 27160 }, { "epoch": 1.4240041928721174, "grad_norm": 1.7551946640014648, "learning_rate": 1.4401205450733754e-05, "loss": 0.078, "step": 27170 }, { "epoch": 1.4245283018867925, "grad_norm": 1.9918137788772583, "learning_rate": 1.4388102725366879e-05, "loss": 0.0836, "step": 27180 }, { "epoch": 1.4250524109014675, "grad_norm": 1.413087248802185, "learning_rate": 1.4374999999999999e-05, "loss": 0.0777, "step": 27190 }, { "epoch": 1.4255765199161425, "grad_norm": 1.1328896284103394, "learning_rate": 1.4361897274633124e-05, "loss": 0.0809, "step": 27200 }, { "epoch": 1.4261006289308176, "grad_norm": 1.0863498449325562, "learning_rate": 1.4348794549266247e-05, "loss": 0.0732, "step": 27210 }, { "epoch": 1.4266247379454926, "grad_norm": 1.7177821397781372, "learning_rate": 1.4335691823899372e-05, "loss": 0.1035, "step": 27220 }, { "epoch": 1.4271488469601676, "grad_norm": 1.144360065460205, "learning_rate": 1.4322589098532496e-05, "loss": 0.0773, "step": 27230 }, { "epoch": 1.4276729559748427, "grad_norm": 1.9664690494537354, "learning_rate": 1.430948637316562e-05, "loss": 0.0734, "step": 27240 }, { "epoch": 1.4281970649895177, "grad_norm": 1.9434707164764404, "learning_rate": 1.4296383647798742e-05, "loss": 0.0664, "step": 27250 }, { "epoch": 1.4287211740041927, "grad_norm": 2.2035915851593018, "learning_rate": 1.4283280922431866e-05, "loss": 0.0783, "step": 27260 }, { "epoch": 1.429245283018868, "grad_norm": 1.1164571046829224, "learning_rate": 1.427017819706499e-05, "loss": 0.0771, "step": 27270 }, { "epoch": 1.429769392033543, "grad_norm": 0.8932652473449707, "learning_rate": 1.4257075471698114e-05, "loss": 0.067, "step": 27280 }, { "epoch": 1.430293501048218, "grad_norm": 2.819211006164551, "learning_rate": 1.4243972746331238e-05, "loss": 0.0996, "step": 27290 }, { "epoch": 1.430817610062893, "grad_norm": 1.9603825807571411, "learning_rate": 1.4230870020964363e-05, "loss": 0.0952, "step": 27300 }, { "epoch": 1.4313417190775681, "grad_norm": 1.5057666301727295, "learning_rate": 1.4217767295597484e-05, "loss": 0.1054, "step": 27310 }, { "epoch": 1.4318658280922432, "grad_norm": 1.899045467376709, "learning_rate": 1.4204664570230608e-05, "loss": 0.0817, "step": 27320 }, { "epoch": 1.4323899371069182, "grad_norm": 1.6118180751800537, "learning_rate": 1.4191561844863733e-05, "loss": 0.0736, "step": 27330 }, { "epoch": 1.4329140461215932, "grad_norm": 1.7266743183135986, "learning_rate": 1.4178459119496856e-05, "loss": 0.0748, "step": 27340 }, { "epoch": 1.4334381551362683, "grad_norm": 1.0827453136444092, "learning_rate": 1.416535639412998e-05, "loss": 0.0719, "step": 27350 }, { "epoch": 1.4339622641509435, "grad_norm": 1.9279223680496216, "learning_rate": 1.4152253668763105e-05, "loss": 0.0724, "step": 27360 }, { "epoch": 1.4344863731656186, "grad_norm": 1.8519426584243774, "learning_rate": 1.4139150943396226e-05, "loss": 0.0606, "step": 27370 }, { "epoch": 1.4350104821802936, "grad_norm": 1.5819141864776611, "learning_rate": 1.412604821802935e-05, "loss": 0.0722, "step": 27380 }, { "epoch": 1.4355345911949686, "grad_norm": 0.8837409019470215, "learning_rate": 1.4112945492662475e-05, "loss": 0.0627, "step": 27390 }, { "epoch": 1.4360587002096437, "grad_norm": 1.7745553255081177, "learning_rate": 1.4099842767295598e-05, "loss": 0.0779, "step": 27400 }, { "epoch": 1.4365828092243187, "grad_norm": 0.6781327128410339, "learning_rate": 1.4086740041928723e-05, "loss": 0.0984, "step": 27410 }, { "epoch": 1.4371069182389937, "grad_norm": 1.6918905973434448, "learning_rate": 1.4073637316561847e-05, "loss": 0.0595, "step": 27420 }, { "epoch": 1.4376310272536688, "grad_norm": 1.5333856344223022, "learning_rate": 1.4060534591194968e-05, "loss": 0.0992, "step": 27430 }, { "epoch": 1.4381551362683438, "grad_norm": 2.154278039932251, "learning_rate": 1.4047431865828092e-05, "loss": 0.0761, "step": 27440 }, { "epoch": 1.4386792452830188, "grad_norm": 0.8082470893859863, "learning_rate": 1.4034329140461217e-05, "loss": 0.0637, "step": 27450 }, { "epoch": 1.4392033542976939, "grad_norm": 2.1959385871887207, "learning_rate": 1.402122641509434e-05, "loss": 0.0699, "step": 27460 }, { "epoch": 1.439727463312369, "grad_norm": 1.508762001991272, "learning_rate": 1.4008123689727465e-05, "loss": 0.0652, "step": 27470 }, { "epoch": 1.440251572327044, "grad_norm": 2.946122407913208, "learning_rate": 1.3995020964360589e-05, "loss": 0.0814, "step": 27480 }, { "epoch": 1.440775681341719, "grad_norm": 0.6886641383171082, "learning_rate": 1.398191823899371e-05, "loss": 0.0743, "step": 27490 }, { "epoch": 1.441299790356394, "grad_norm": 1.3334239721298218, "learning_rate": 1.3968815513626834e-05, "loss": 0.0892, "step": 27500 }, { "epoch": 1.4418238993710693, "grad_norm": 1.2078886032104492, "learning_rate": 1.3955712788259959e-05, "loss": 0.119, "step": 27510 }, { "epoch": 1.4423480083857443, "grad_norm": 1.5522276163101196, "learning_rate": 1.3942610062893082e-05, "loss": 0.0822, "step": 27520 }, { "epoch": 1.4428721174004193, "grad_norm": 1.9133297204971313, "learning_rate": 1.3929507337526207e-05, "loss": 0.0736, "step": 27530 }, { "epoch": 1.4433962264150944, "grad_norm": 1.6485482454299927, "learning_rate": 1.391640461215933e-05, "loss": 0.0839, "step": 27540 }, { "epoch": 1.4439203354297694, "grad_norm": 1.8468326330184937, "learning_rate": 1.3903301886792452e-05, "loss": 0.0764, "step": 27550 }, { "epoch": 1.4444444444444444, "grad_norm": 1.209517478942871, "learning_rate": 1.3890199161425576e-05, "loss": 0.084, "step": 27560 }, { "epoch": 1.4449685534591195, "grad_norm": 1.0626308917999268, "learning_rate": 1.3877096436058701e-05, "loss": 0.0803, "step": 27570 }, { "epoch": 1.4454926624737945, "grad_norm": 1.577529788017273, "learning_rate": 1.3863993710691824e-05, "loss": 0.0854, "step": 27580 }, { "epoch": 1.4460167714884695, "grad_norm": 2.4681644439697266, "learning_rate": 1.385089098532495e-05, "loss": 0.0867, "step": 27590 }, { "epoch": 1.4465408805031448, "grad_norm": 1.8343055248260498, "learning_rate": 1.3837788259958073e-05, "loss": 0.0517, "step": 27600 }, { "epoch": 1.4470649895178198, "grad_norm": 1.8408375978469849, "learning_rate": 1.3824685534591194e-05, "loss": 0.0743, "step": 27610 }, { "epoch": 1.4475890985324948, "grad_norm": 1.6427656412124634, "learning_rate": 1.381158280922432e-05, "loss": 0.0783, "step": 27620 }, { "epoch": 1.4481132075471699, "grad_norm": 0.9956682920455933, "learning_rate": 1.3798480083857443e-05, "loss": 0.0651, "step": 27630 }, { "epoch": 1.448637316561845, "grad_norm": 0.8190504908561707, "learning_rate": 1.3785377358490566e-05, "loss": 0.0778, "step": 27640 }, { "epoch": 1.44916142557652, "grad_norm": 0.7166125178337097, "learning_rate": 1.3772274633123691e-05, "loss": 0.0917, "step": 27650 }, { "epoch": 1.449685534591195, "grad_norm": 1.4953032732009888, "learning_rate": 1.3759171907756815e-05, "loss": 0.0706, "step": 27660 }, { "epoch": 1.45020964360587, "grad_norm": 1.7364864349365234, "learning_rate": 1.3746069182389936e-05, "loss": 0.09, "step": 27670 }, { "epoch": 1.450733752620545, "grad_norm": 0.8806532621383667, "learning_rate": 1.3732966457023062e-05, "loss": 0.0785, "step": 27680 }, { "epoch": 1.45125786163522, "grad_norm": 1.4088395833969116, "learning_rate": 1.3719863731656185e-05, "loss": 0.069, "step": 27690 }, { "epoch": 1.4517819706498951, "grad_norm": 1.0472676753997803, "learning_rate": 1.370676100628931e-05, "loss": 0.0824, "step": 27700 }, { "epoch": 1.4523060796645701, "grad_norm": 1.0019946098327637, "learning_rate": 1.3693658280922433e-05, "loss": 0.0882, "step": 27710 }, { "epoch": 1.4528301886792452, "grad_norm": 2.3907127380371094, "learning_rate": 1.3680555555555557e-05, "loss": 0.1072, "step": 27720 }, { "epoch": 1.4533542976939202, "grad_norm": 2.1289756298065186, "learning_rate": 1.3667452830188678e-05, "loss": 0.0681, "step": 27730 }, { "epoch": 1.4538784067085953, "grad_norm": 1.460325002670288, "learning_rate": 1.3654350104821804e-05, "loss": 0.0673, "step": 27740 }, { "epoch": 1.4544025157232705, "grad_norm": 1.5463610887527466, "learning_rate": 1.3641247379454927e-05, "loss": 0.0916, "step": 27750 }, { "epoch": 1.4549266247379455, "grad_norm": 0.9027696847915649, "learning_rate": 1.3628144654088052e-05, "loss": 0.0592, "step": 27760 }, { "epoch": 1.4554507337526206, "grad_norm": 1.820248007774353, "learning_rate": 1.3615041928721175e-05, "loss": 0.0787, "step": 27770 }, { "epoch": 1.4559748427672956, "grad_norm": 1.1869231462478638, "learning_rate": 1.36019392033543e-05, "loss": 0.0861, "step": 27780 }, { "epoch": 1.4564989517819706, "grad_norm": 1.538305401802063, "learning_rate": 1.358883647798742e-05, "loss": 0.0573, "step": 27790 }, { "epoch": 1.4570230607966457, "grad_norm": 1.3371244668960571, "learning_rate": 1.3575733752620546e-05, "loss": 0.0701, "step": 27800 }, { "epoch": 1.4575471698113207, "grad_norm": 0.7081040143966675, "learning_rate": 1.3562631027253669e-05, "loss": 0.0964, "step": 27810 }, { "epoch": 1.4580712788259957, "grad_norm": 0.6923393607139587, "learning_rate": 1.3549528301886794e-05, "loss": 0.0888, "step": 27820 }, { "epoch": 1.458595387840671, "grad_norm": 1.5953444242477417, "learning_rate": 1.3536425576519917e-05, "loss": 0.0716, "step": 27830 }, { "epoch": 1.459119496855346, "grad_norm": 1.727294921875, "learning_rate": 1.3523322851153039e-05, "loss": 0.066, "step": 27840 }, { "epoch": 1.459643605870021, "grad_norm": 3.990200996398926, "learning_rate": 1.3510220125786163e-05, "loss": 0.1102, "step": 27850 }, { "epoch": 1.460167714884696, "grad_norm": 2.4308507442474365, "learning_rate": 1.3497117400419288e-05, "loss": 0.0796, "step": 27860 }, { "epoch": 1.4606918238993711, "grad_norm": 1.9697445631027222, "learning_rate": 1.3484014675052411e-05, "loss": 0.0841, "step": 27870 }, { "epoch": 1.4612159329140462, "grad_norm": 0.9104553461074829, "learning_rate": 1.3470911949685536e-05, "loss": 0.089, "step": 27880 }, { "epoch": 1.4617400419287212, "grad_norm": 0.9098082780838013, "learning_rate": 1.345780922431866e-05, "loss": 0.0986, "step": 27890 }, { "epoch": 1.4622641509433962, "grad_norm": 1.175636887550354, "learning_rate": 1.3444706498951781e-05, "loss": 0.0793, "step": 27900 }, { "epoch": 1.4627882599580713, "grad_norm": 0.9262112379074097, "learning_rate": 1.3431603773584906e-05, "loss": 0.0646, "step": 27910 }, { "epoch": 1.4633123689727463, "grad_norm": 1.373189091682434, "learning_rate": 1.341850104821803e-05, "loss": 0.0697, "step": 27920 }, { "epoch": 1.4638364779874213, "grad_norm": 1.5659918785095215, "learning_rate": 1.3405398322851153e-05, "loss": 0.0755, "step": 27930 }, { "epoch": 1.4643605870020964, "grad_norm": 3.091872453689575, "learning_rate": 1.3392295597484278e-05, "loss": 0.0906, "step": 27940 }, { "epoch": 1.4648846960167714, "grad_norm": 1.3412938117980957, "learning_rate": 1.3379192872117401e-05, "loss": 0.0876, "step": 27950 }, { "epoch": 1.4654088050314464, "grad_norm": 1.137826681137085, "learning_rate": 1.3366090146750523e-05, "loss": 0.0773, "step": 27960 }, { "epoch": 1.4659329140461215, "grad_norm": 3.942056894302368, "learning_rate": 1.3352987421383648e-05, "loss": 0.0677, "step": 27970 }, { "epoch": 1.4664570230607967, "grad_norm": 1.7244807481765747, "learning_rate": 1.3339884696016772e-05, "loss": 0.0704, "step": 27980 }, { "epoch": 1.4669811320754718, "grad_norm": 1.608075737953186, "learning_rate": 1.3326781970649897e-05, "loss": 0.0703, "step": 27990 }, { "epoch": 1.4675052410901468, "grad_norm": 2.1725499629974365, "learning_rate": 1.331367924528302e-05, "loss": 0.0709, "step": 28000 }, { "epoch": 1.4675052410901468, "eval_loss": 0.271745502948761, "eval_runtime": 267.6865, "eval_samples_per_second": 7.438, "eval_steps_per_second": 1.24, "step": 28000 }, { "epoch": 1.4680293501048218, "grad_norm": 1.803655743598938, "learning_rate": 1.3300576519916143e-05, "loss": 0.0864, "step": 28010 }, { "epoch": 1.4685534591194969, "grad_norm": 1.6521601676940918, "learning_rate": 1.3287473794549265e-05, "loss": 0.0755, "step": 28020 }, { "epoch": 1.469077568134172, "grad_norm": 2.3128561973571777, "learning_rate": 1.327437106918239e-05, "loss": 0.081, "step": 28030 }, { "epoch": 1.469601677148847, "grad_norm": 1.0363951921463013, "learning_rate": 1.3261268343815514e-05, "loss": 0.0575, "step": 28040 }, { "epoch": 1.470125786163522, "grad_norm": 0.7471582889556885, "learning_rate": 1.3248165618448639e-05, "loss": 0.0583, "step": 28050 }, { "epoch": 1.470649895178197, "grad_norm": 1.6395134925842285, "learning_rate": 1.3235062893081762e-05, "loss": 0.0674, "step": 28060 }, { "epoch": 1.4711740041928723, "grad_norm": 1.7214020490646362, "learning_rate": 1.3221960167714887e-05, "loss": 0.09, "step": 28070 }, { "epoch": 1.4716981132075473, "grad_norm": 1.6279518604278564, "learning_rate": 1.3208857442348007e-05, "loss": 0.0857, "step": 28080 }, { "epoch": 1.4722222222222223, "grad_norm": 1.4659122228622437, "learning_rate": 1.3195754716981132e-05, "loss": 0.0843, "step": 28090 }, { "epoch": 1.4727463312368974, "grad_norm": 0.8685888648033142, "learning_rate": 1.3182651991614256e-05, "loss": 0.0554, "step": 28100 }, { "epoch": 1.4732704402515724, "grad_norm": 1.3319734334945679, "learning_rate": 1.316954926624738e-05, "loss": 0.079, "step": 28110 }, { "epoch": 1.4737945492662474, "grad_norm": 1.8119728565216064, "learning_rate": 1.3156446540880504e-05, "loss": 0.0726, "step": 28120 }, { "epoch": 1.4743186582809225, "grad_norm": 1.3160663843154907, "learning_rate": 1.314334381551363e-05, "loss": 0.0875, "step": 28130 }, { "epoch": 1.4748427672955975, "grad_norm": 2.143087387084961, "learning_rate": 1.313024109014675e-05, "loss": 0.0828, "step": 28140 }, { "epoch": 1.4753668763102725, "grad_norm": 1.9717251062393188, "learning_rate": 1.3117138364779874e-05, "loss": 0.0709, "step": 28150 }, { "epoch": 1.4758909853249476, "grad_norm": 1.5455591678619385, "learning_rate": 1.3104035639412998e-05, "loss": 0.1053, "step": 28160 }, { "epoch": 1.4764150943396226, "grad_norm": 1.7881935834884644, "learning_rate": 1.3090932914046123e-05, "loss": 0.082, "step": 28170 }, { "epoch": 1.4769392033542976, "grad_norm": 1.2156028747558594, "learning_rate": 1.3077830188679246e-05, "loss": 0.0662, "step": 28180 }, { "epoch": 1.4774633123689727, "grad_norm": 1.5937763452529907, "learning_rate": 1.3064727463312371e-05, "loss": 0.0686, "step": 28190 }, { "epoch": 1.4779874213836477, "grad_norm": 2.669137477874756, "learning_rate": 1.3051624737945491e-05, "loss": 0.0726, "step": 28200 }, { "epoch": 1.4785115303983227, "grad_norm": 1.6879535913467407, "learning_rate": 1.3038522012578616e-05, "loss": 0.0885, "step": 28210 }, { "epoch": 1.479035639412998, "grad_norm": 1.1833429336547852, "learning_rate": 1.302541928721174e-05, "loss": 0.0692, "step": 28220 }, { "epoch": 1.479559748427673, "grad_norm": 2.149759531021118, "learning_rate": 1.3012316561844865e-05, "loss": 0.1053, "step": 28230 }, { "epoch": 1.480083857442348, "grad_norm": 2.0585362911224365, "learning_rate": 1.2999213836477988e-05, "loss": 0.0833, "step": 28240 }, { "epoch": 1.480607966457023, "grad_norm": 1.4494343996047974, "learning_rate": 1.2986111111111113e-05, "loss": 0.0934, "step": 28250 }, { "epoch": 1.4811320754716981, "grad_norm": 1.7021030187606812, "learning_rate": 1.2973008385744235e-05, "loss": 0.0583, "step": 28260 }, { "epoch": 1.4816561844863732, "grad_norm": 2.6101348400115967, "learning_rate": 1.2959905660377358e-05, "loss": 0.0768, "step": 28270 }, { "epoch": 1.4821802935010482, "grad_norm": 0.9507656693458557, "learning_rate": 1.2946802935010482e-05, "loss": 0.0526, "step": 28280 }, { "epoch": 1.4827044025157232, "grad_norm": 1.1151862144470215, "learning_rate": 1.2933700209643607e-05, "loss": 0.0898, "step": 28290 }, { "epoch": 1.4832285115303983, "grad_norm": 1.3400394916534424, "learning_rate": 1.292059748427673e-05, "loss": 0.0771, "step": 28300 }, { "epoch": 1.4837526205450735, "grad_norm": 1.4885073900222778, "learning_rate": 1.2907494758909855e-05, "loss": 0.0877, "step": 28310 }, { "epoch": 1.4842767295597485, "grad_norm": 1.9722825288772583, "learning_rate": 1.2894392033542977e-05, "loss": 0.0913, "step": 28320 }, { "epoch": 1.4848008385744236, "grad_norm": 1.2791171073913574, "learning_rate": 1.28812893081761e-05, "loss": 0.0747, "step": 28330 }, { "epoch": 1.4853249475890986, "grad_norm": 1.2421149015426636, "learning_rate": 1.2868186582809225e-05, "loss": 0.0727, "step": 28340 }, { "epoch": 1.4858490566037736, "grad_norm": 1.6803362369537354, "learning_rate": 1.2855083857442349e-05, "loss": 0.09, "step": 28350 }, { "epoch": 1.4863731656184487, "grad_norm": 1.8740354776382446, "learning_rate": 1.2841981132075472e-05, "loss": 0.0651, "step": 28360 }, { "epoch": 1.4868972746331237, "grad_norm": 1.3395440578460693, "learning_rate": 1.2828878406708597e-05, "loss": 0.0775, "step": 28370 }, { "epoch": 1.4874213836477987, "grad_norm": 1.6447713375091553, "learning_rate": 1.2815775681341719e-05, "loss": 0.0763, "step": 28380 }, { "epoch": 1.4879454926624738, "grad_norm": 2.1082639694213867, "learning_rate": 1.2802672955974842e-05, "loss": 0.0919, "step": 28390 }, { "epoch": 1.4884696016771488, "grad_norm": 1.401515245437622, "learning_rate": 1.2789570230607967e-05, "loss": 0.0709, "step": 28400 }, { "epoch": 1.4889937106918238, "grad_norm": 1.3581801652908325, "learning_rate": 1.277646750524109e-05, "loss": 0.0765, "step": 28410 }, { "epoch": 1.4895178197064989, "grad_norm": 1.3556405305862427, "learning_rate": 1.2763364779874216e-05, "loss": 0.0641, "step": 28420 }, { "epoch": 1.490041928721174, "grad_norm": 1.4136556386947632, "learning_rate": 1.275026205450734e-05, "loss": 0.0687, "step": 28430 }, { "epoch": 1.490566037735849, "grad_norm": 1.5147451162338257, "learning_rate": 1.2737159329140461e-05, "loss": 0.0774, "step": 28440 }, { "epoch": 1.491090146750524, "grad_norm": 1.2595840692520142, "learning_rate": 1.2724056603773584e-05, "loss": 0.0548, "step": 28450 }, { "epoch": 1.4916142557651992, "grad_norm": 1.0852960348129272, "learning_rate": 1.271095387840671e-05, "loss": 0.0611, "step": 28460 }, { "epoch": 1.4921383647798743, "grad_norm": 1.0651873350143433, "learning_rate": 1.2697851153039833e-05, "loss": 0.0732, "step": 28470 }, { "epoch": 1.4926624737945493, "grad_norm": 1.559222936630249, "learning_rate": 1.2684748427672958e-05, "loss": 0.0612, "step": 28480 }, { "epoch": 1.4931865828092243, "grad_norm": 3.197054862976074, "learning_rate": 1.2671645702306081e-05, "loss": 0.0671, "step": 28490 }, { "epoch": 1.4937106918238994, "grad_norm": 0.6540378928184509, "learning_rate": 1.2658542976939203e-05, "loss": 0.068, "step": 28500 }, { "epoch": 1.4942348008385744, "grad_norm": 1.2630752325057983, "learning_rate": 1.2645440251572326e-05, "loss": 0.0583, "step": 28510 }, { "epoch": 1.4947589098532494, "grad_norm": 1.6068637371063232, "learning_rate": 1.2632337526205451e-05, "loss": 0.093, "step": 28520 }, { "epoch": 1.4952830188679245, "grad_norm": 1.573815941810608, "learning_rate": 1.2619234800838575e-05, "loss": 0.0651, "step": 28530 }, { "epoch": 1.4958071278825995, "grad_norm": 1.224945068359375, "learning_rate": 1.26061320754717e-05, "loss": 0.0812, "step": 28540 }, { "epoch": 1.4963312368972748, "grad_norm": 2.006347179412842, "learning_rate": 1.2593029350104823e-05, "loss": 0.1058, "step": 28550 }, { "epoch": 1.4968553459119498, "grad_norm": 1.5014920234680176, "learning_rate": 1.2579926624737945e-05, "loss": 0.0671, "step": 28560 }, { "epoch": 1.4973794549266248, "grad_norm": 1.4988354444503784, "learning_rate": 1.2566823899371068e-05, "loss": 0.0572, "step": 28570 }, { "epoch": 1.4979035639412999, "grad_norm": 1.3243881464004517, "learning_rate": 1.2553721174004193e-05, "loss": 0.0719, "step": 28580 }, { "epoch": 1.498427672955975, "grad_norm": 1.867492437362671, "learning_rate": 1.2540618448637317e-05, "loss": 0.0725, "step": 28590 }, { "epoch": 1.49895178197065, "grad_norm": 2.1830546855926514, "learning_rate": 1.2527515723270442e-05, "loss": 0.0958, "step": 28600 }, { "epoch": 1.499475890985325, "grad_norm": 1.920746922492981, "learning_rate": 1.2514412997903565e-05, "loss": 0.1057, "step": 28610 }, { "epoch": 1.5, "grad_norm": 1.0297828912734985, "learning_rate": 1.2501310272536687e-05, "loss": 0.0722, "step": 28620 }, { "epoch": 1.500524109014675, "grad_norm": 5.875576496124268, "learning_rate": 1.2488207547169812e-05, "loss": 0.1083, "step": 28630 }, { "epoch": 1.50104821802935, "grad_norm": 1.3598127365112305, "learning_rate": 1.2475104821802935e-05, "loss": 0.0681, "step": 28640 }, { "epoch": 1.501572327044025, "grad_norm": 1.3286088705062866, "learning_rate": 1.2462002096436059e-05, "loss": 0.0812, "step": 28650 }, { "epoch": 1.5020964360587001, "grad_norm": 1.2798917293548584, "learning_rate": 1.2448899371069182e-05, "loss": 0.0801, "step": 28660 }, { "epoch": 1.5026205450733752, "grad_norm": 1.671739935874939, "learning_rate": 1.2435796645702307e-05, "loss": 0.0794, "step": 28670 }, { "epoch": 1.5031446540880502, "grad_norm": 1.265716791152954, "learning_rate": 1.242269392033543e-05, "loss": 0.0814, "step": 28680 }, { "epoch": 1.5036687631027252, "grad_norm": 1.810259222984314, "learning_rate": 1.2409591194968554e-05, "loss": 0.0834, "step": 28690 }, { "epoch": 1.5041928721174003, "grad_norm": 1.5818339586257935, "learning_rate": 1.2396488469601677e-05, "loss": 0.0725, "step": 28700 }, { "epoch": 1.5047169811320755, "grad_norm": 1.0720511674880981, "learning_rate": 1.2383385744234802e-05, "loss": 0.0663, "step": 28710 }, { "epoch": 1.5052410901467506, "grad_norm": 0.9712188839912415, "learning_rate": 1.2370283018867924e-05, "loss": 0.0672, "step": 28720 }, { "epoch": 1.5057651991614256, "grad_norm": 1.4916762113571167, "learning_rate": 1.235718029350105e-05, "loss": 0.0621, "step": 28730 }, { "epoch": 1.5062893081761006, "grad_norm": 0.9293265342712402, "learning_rate": 1.2344077568134173e-05, "loss": 0.0785, "step": 28740 }, { "epoch": 1.5068134171907757, "grad_norm": 1.805554747581482, "learning_rate": 1.2330974842767296e-05, "loss": 0.0687, "step": 28750 }, { "epoch": 1.5073375262054507, "grad_norm": 0.8354784250259399, "learning_rate": 1.231787211740042e-05, "loss": 0.0841, "step": 28760 }, { "epoch": 1.507861635220126, "grad_norm": 2.4125020503997803, "learning_rate": 1.2304769392033544e-05, "loss": 0.0648, "step": 28770 }, { "epoch": 1.508385744234801, "grad_norm": 2.2017788887023926, "learning_rate": 1.2291666666666666e-05, "loss": 0.0821, "step": 28780 }, { "epoch": 1.508909853249476, "grad_norm": 1.4541821479797363, "learning_rate": 1.2278563941299791e-05, "loss": 0.0816, "step": 28790 }, { "epoch": 1.509433962264151, "grad_norm": 0.7428493499755859, "learning_rate": 1.2265461215932915e-05, "loss": 0.058, "step": 28800 }, { "epoch": 1.509958071278826, "grad_norm": 1.1366612911224365, "learning_rate": 1.2252358490566038e-05, "loss": 0.0588, "step": 28810 }, { "epoch": 1.5104821802935011, "grad_norm": 1.2058767080307007, "learning_rate": 1.2239255765199161e-05, "loss": 0.0732, "step": 28820 }, { "epoch": 1.5110062893081762, "grad_norm": 3.127432107925415, "learning_rate": 1.2226153039832286e-05, "loss": 0.1002, "step": 28830 }, { "epoch": 1.5115303983228512, "grad_norm": 3.9987447261810303, "learning_rate": 1.2213050314465408e-05, "loss": 0.0848, "step": 28840 }, { "epoch": 1.5120545073375262, "grad_norm": 0.9082832932472229, "learning_rate": 1.2199947589098533e-05, "loss": 0.0777, "step": 28850 }, { "epoch": 1.5125786163522013, "grad_norm": 1.5602792501449585, "learning_rate": 1.2186844863731657e-05, "loss": 0.0609, "step": 28860 }, { "epoch": 1.5131027253668763, "grad_norm": 1.9996236562728882, "learning_rate": 1.217374213836478e-05, "loss": 0.0764, "step": 28870 }, { "epoch": 1.5136268343815513, "grad_norm": 1.3080304861068726, "learning_rate": 1.2160639412997903e-05, "loss": 0.0757, "step": 28880 }, { "epoch": 1.5141509433962264, "grad_norm": 1.3440593481063843, "learning_rate": 1.2147536687631028e-05, "loss": 0.071, "step": 28890 }, { "epoch": 1.5146750524109014, "grad_norm": 1.1804187297821045, "learning_rate": 1.213443396226415e-05, "loss": 0.1036, "step": 28900 }, { "epoch": 1.5151991614255764, "grad_norm": 0.9023783802986145, "learning_rate": 1.2121331236897275e-05, "loss": 0.0663, "step": 28910 }, { "epoch": 1.5157232704402515, "grad_norm": 1.4749324321746826, "learning_rate": 1.2108228511530399e-05, "loss": 0.0619, "step": 28920 }, { "epoch": 1.5162473794549265, "grad_norm": 1.2634544372558594, "learning_rate": 1.2095125786163522e-05, "loss": 0.1205, "step": 28930 }, { "epoch": 1.5167714884696015, "grad_norm": 2.068763256072998, "learning_rate": 1.2082023060796645e-05, "loss": 0.0663, "step": 28940 }, { "epoch": 1.5172955974842768, "grad_norm": 1.1468791961669922, "learning_rate": 1.206892033542977e-05, "loss": 0.0826, "step": 28950 }, { "epoch": 1.5178197064989518, "grad_norm": 1.9512981176376343, "learning_rate": 1.2055817610062894e-05, "loss": 0.0921, "step": 28960 }, { "epoch": 1.5183438155136268, "grad_norm": 1.7221571207046509, "learning_rate": 1.2042714884696017e-05, "loss": 0.07, "step": 28970 }, { "epoch": 1.5188679245283019, "grad_norm": 1.6264697313308716, "learning_rate": 1.202961215932914e-05, "loss": 0.0596, "step": 28980 }, { "epoch": 1.519392033542977, "grad_norm": 2.5485575199127197, "learning_rate": 1.2016509433962264e-05, "loss": 0.0852, "step": 28990 }, { "epoch": 1.519916142557652, "grad_norm": 0.7181016802787781, "learning_rate": 1.2003406708595389e-05, "loss": 0.0684, "step": 29000 }, { "epoch": 1.519916142557652, "eval_loss": 0.2725262939929962, "eval_runtime": 267.0455, "eval_samples_per_second": 7.456, "eval_steps_per_second": 1.243, "step": 29000 }, { "epoch": 1.5204402515723272, "grad_norm": 1.1129300594329834, "learning_rate": 1.1990303983228512e-05, "loss": 0.1032, "step": 29010 }, { "epoch": 1.5209643605870022, "grad_norm": 1.3954484462738037, "learning_rate": 1.1977201257861636e-05, "loss": 0.0645, "step": 29020 }, { "epoch": 1.5214884696016773, "grad_norm": 2.2983057498931885, "learning_rate": 1.196409853249476e-05, "loss": 0.0875, "step": 29030 }, { "epoch": 1.5220125786163523, "grad_norm": 1.7596241235733032, "learning_rate": 1.1950995807127884e-05, "loss": 0.0701, "step": 29040 }, { "epoch": 1.5225366876310273, "grad_norm": 2.6219558715820312, "learning_rate": 1.1937893081761006e-05, "loss": 0.0652, "step": 29050 }, { "epoch": 1.5230607966457024, "grad_norm": 1.7040820121765137, "learning_rate": 1.1924790356394131e-05, "loss": 0.099, "step": 29060 }, { "epoch": 1.5235849056603774, "grad_norm": 0.6065346002578735, "learning_rate": 1.1911687631027254e-05, "loss": 0.0608, "step": 29070 }, { "epoch": 1.5241090146750524, "grad_norm": 1.5256175994873047, "learning_rate": 1.1898584905660378e-05, "loss": 0.0973, "step": 29080 }, { "epoch": 1.5246331236897275, "grad_norm": 1.1144390106201172, "learning_rate": 1.1885482180293501e-05, "loss": 0.084, "step": 29090 }, { "epoch": 1.5251572327044025, "grad_norm": 1.670206069946289, "learning_rate": 1.1872379454926626e-05, "loss": 0.0827, "step": 29100 }, { "epoch": 1.5256813417190775, "grad_norm": 3.055248975753784, "learning_rate": 1.1859276729559748e-05, "loss": 0.076, "step": 29110 }, { "epoch": 1.5262054507337526, "grad_norm": 2.054408311843872, "learning_rate": 1.1846174004192873e-05, "loss": 0.0893, "step": 29120 }, { "epoch": 1.5267295597484276, "grad_norm": 1.0369834899902344, "learning_rate": 1.1833071278825997e-05, "loss": 0.088, "step": 29130 }, { "epoch": 1.5272536687631026, "grad_norm": 1.8977478742599487, "learning_rate": 1.181996855345912e-05, "loss": 0.0666, "step": 29140 }, { "epoch": 1.5277777777777777, "grad_norm": 1.332196831703186, "learning_rate": 1.1806865828092243e-05, "loss": 0.0837, "step": 29150 }, { "epoch": 1.5283018867924527, "grad_norm": 2.6005096435546875, "learning_rate": 1.1793763102725368e-05, "loss": 0.0783, "step": 29160 }, { "epoch": 1.5288259958071277, "grad_norm": 2.069912910461426, "learning_rate": 1.178066037735849e-05, "loss": 0.0706, "step": 29170 }, { "epoch": 1.5293501048218028, "grad_norm": 0.627423107624054, "learning_rate": 1.1767557651991615e-05, "loss": 0.0632, "step": 29180 }, { "epoch": 1.529874213836478, "grad_norm": 2.5008325576782227, "learning_rate": 1.1754454926624739e-05, "loss": 0.0732, "step": 29190 }, { "epoch": 1.530398322851153, "grad_norm": 1.6689064502716064, "learning_rate": 1.1741352201257862e-05, "loss": 0.091, "step": 29200 }, { "epoch": 1.530922431865828, "grad_norm": 1.0986666679382324, "learning_rate": 1.1728249475890985e-05, "loss": 0.0808, "step": 29210 }, { "epoch": 1.5314465408805031, "grad_norm": 1.1996750831604004, "learning_rate": 1.171514675052411e-05, "loss": 0.0848, "step": 29220 }, { "epoch": 1.5319706498951782, "grad_norm": 1.5958057641983032, "learning_rate": 1.1702044025157232e-05, "loss": 0.0674, "step": 29230 }, { "epoch": 1.5324947589098532, "grad_norm": 2.4333527088165283, "learning_rate": 1.1688941299790357e-05, "loss": 0.0807, "step": 29240 }, { "epoch": 1.5330188679245285, "grad_norm": 1.3615031242370605, "learning_rate": 1.167583857442348e-05, "loss": 0.0838, "step": 29250 }, { "epoch": 1.5335429769392035, "grad_norm": 1.0076712369918823, "learning_rate": 1.1662735849056604e-05, "loss": 0.0667, "step": 29260 }, { "epoch": 1.5340670859538785, "grad_norm": 1.245064377784729, "learning_rate": 1.1649633123689727e-05, "loss": 0.0921, "step": 29270 }, { "epoch": 1.5345911949685536, "grad_norm": 1.793381690979004, "learning_rate": 1.1636530398322852e-05, "loss": 0.0726, "step": 29280 }, { "epoch": 1.5351153039832286, "grad_norm": 1.2510024309158325, "learning_rate": 1.1623427672955974e-05, "loss": 0.0705, "step": 29290 }, { "epoch": 1.5356394129979036, "grad_norm": 1.0669950246810913, "learning_rate": 1.1610324947589099e-05, "loss": 0.0773, "step": 29300 }, { "epoch": 1.5361635220125787, "grad_norm": 2.63558292388916, "learning_rate": 1.1597222222222223e-05, "loss": 0.084, "step": 29310 }, { "epoch": 1.5366876310272537, "grad_norm": 1.210523009300232, "learning_rate": 1.1584119496855346e-05, "loss": 0.0802, "step": 29320 }, { "epoch": 1.5372117400419287, "grad_norm": 1.076084017753601, "learning_rate": 1.157101677148847e-05, "loss": 0.0855, "step": 29330 }, { "epoch": 1.5377358490566038, "grad_norm": 2.554025411605835, "learning_rate": 1.1557914046121594e-05, "loss": 0.088, "step": 29340 }, { "epoch": 1.5382599580712788, "grad_norm": 1.6165305376052856, "learning_rate": 1.1544811320754718e-05, "loss": 0.0707, "step": 29350 }, { "epoch": 1.5387840670859538, "grad_norm": 1.7850388288497925, "learning_rate": 1.1531708595387841e-05, "loss": 0.0816, "step": 29360 }, { "epoch": 1.5393081761006289, "grad_norm": 1.9576303958892822, "learning_rate": 1.1518605870020965e-05, "loss": 0.0886, "step": 29370 }, { "epoch": 1.539832285115304, "grad_norm": 1.1999833583831787, "learning_rate": 1.1505503144654088e-05, "loss": 0.0605, "step": 29380 }, { "epoch": 1.540356394129979, "grad_norm": 1.0975391864776611, "learning_rate": 1.1492400419287213e-05, "loss": 0.0951, "step": 29390 }, { "epoch": 1.540880503144654, "grad_norm": 1.9649990797042847, "learning_rate": 1.1479297693920336e-05, "loss": 0.078, "step": 29400 }, { "epoch": 1.541404612159329, "grad_norm": 1.9952278137207031, "learning_rate": 1.146619496855346e-05, "loss": 0.0705, "step": 29410 }, { "epoch": 1.541928721174004, "grad_norm": 1.1087632179260254, "learning_rate": 1.1453092243186583e-05, "loss": 0.0881, "step": 29420 }, { "epoch": 1.5424528301886793, "grad_norm": 1.3009109497070312, "learning_rate": 1.1439989517819708e-05, "loss": 0.0798, "step": 29430 }, { "epoch": 1.5429769392033543, "grad_norm": 0.8007161021232605, "learning_rate": 1.142688679245283e-05, "loss": 0.0661, "step": 29440 }, { "epoch": 1.5435010482180294, "grad_norm": 0.6858960390090942, "learning_rate": 1.1413784067085955e-05, "loss": 0.0712, "step": 29450 }, { "epoch": 1.5440251572327044, "grad_norm": 0.7989315390586853, "learning_rate": 1.1400681341719078e-05, "loss": 0.0719, "step": 29460 }, { "epoch": 1.5445492662473794, "grad_norm": 1.4100242853164673, "learning_rate": 1.1387578616352202e-05, "loss": 0.0808, "step": 29470 }, { "epoch": 1.5450733752620545, "grad_norm": 1.6161035299301147, "learning_rate": 1.1374475890985325e-05, "loss": 0.0812, "step": 29480 }, { "epoch": 1.5455974842767297, "grad_norm": 0.731370210647583, "learning_rate": 1.136137316561845e-05, "loss": 0.0706, "step": 29490 }, { "epoch": 1.5461215932914047, "grad_norm": 2.8555400371551514, "learning_rate": 1.1348270440251572e-05, "loss": 0.097, "step": 29500 }, { "epoch": 1.5466457023060798, "grad_norm": 0.6062478423118591, "learning_rate": 1.1335167714884697e-05, "loss": 0.0681, "step": 29510 }, { "epoch": 1.5471698113207548, "grad_norm": 1.8448848724365234, "learning_rate": 1.132206498951782e-05, "loss": 0.0881, "step": 29520 }, { "epoch": 1.5476939203354299, "grad_norm": 1.489322304725647, "learning_rate": 1.1308962264150944e-05, "loss": 0.0973, "step": 29530 }, { "epoch": 1.5482180293501049, "grad_norm": 1.3408927917480469, "learning_rate": 1.1295859538784067e-05, "loss": 0.0577, "step": 29540 }, { "epoch": 1.54874213836478, "grad_norm": 1.461776852607727, "learning_rate": 1.1282756813417192e-05, "loss": 0.0777, "step": 29550 }, { "epoch": 1.549266247379455, "grad_norm": 1.5345159769058228, "learning_rate": 1.1269654088050314e-05, "loss": 0.0886, "step": 29560 }, { "epoch": 1.54979035639413, "grad_norm": 0.965441882610321, "learning_rate": 1.1256551362683439e-05, "loss": 0.1009, "step": 29570 }, { "epoch": 1.550314465408805, "grad_norm": 1.252944827079773, "learning_rate": 1.1243448637316562e-05, "loss": 0.1045, "step": 29580 }, { "epoch": 1.55083857442348, "grad_norm": 1.1413624286651611, "learning_rate": 1.1230345911949686e-05, "loss": 0.0585, "step": 29590 }, { "epoch": 1.551362683438155, "grad_norm": 0.5949219465255737, "learning_rate": 1.121724318658281e-05, "loss": 0.0833, "step": 29600 }, { "epoch": 1.5518867924528301, "grad_norm": 1.4133487939834595, "learning_rate": 1.1204140461215934e-05, "loss": 0.0636, "step": 29610 }, { "epoch": 1.5524109014675052, "grad_norm": 1.1068850755691528, "learning_rate": 1.1191037735849056e-05, "loss": 0.0931, "step": 29620 }, { "epoch": 1.5529350104821802, "grad_norm": 4.534146785736084, "learning_rate": 1.1177935010482181e-05, "loss": 0.0539, "step": 29630 }, { "epoch": 1.5534591194968552, "grad_norm": 1.7356497049331665, "learning_rate": 1.1164832285115304e-05, "loss": 0.0625, "step": 29640 }, { "epoch": 1.5539832285115303, "grad_norm": 2.3491108417510986, "learning_rate": 1.1151729559748428e-05, "loss": 0.0854, "step": 29650 }, { "epoch": 1.5545073375262053, "grad_norm": 0.9400236010551453, "learning_rate": 1.1138626834381551e-05, "loss": 0.064, "step": 29660 }, { "epoch": 1.5550314465408805, "grad_norm": 2.1760456562042236, "learning_rate": 1.1125524109014676e-05, "loss": 0.0894, "step": 29670 }, { "epoch": 1.5555555555555556, "grad_norm": 2.121522903442383, "learning_rate": 1.11124213836478e-05, "loss": 0.0675, "step": 29680 }, { "epoch": 1.5560796645702306, "grad_norm": 1.169234037399292, "learning_rate": 1.1099318658280923e-05, "loss": 0.1041, "step": 29690 }, { "epoch": 1.5566037735849056, "grad_norm": 2.202223062515259, "learning_rate": 1.1086215932914046e-05, "loss": 0.101, "step": 29700 }, { "epoch": 1.5571278825995807, "grad_norm": 1.6702232360839844, "learning_rate": 1.107311320754717e-05, "loss": 0.0904, "step": 29710 }, { "epoch": 1.5576519916142557, "grad_norm": 1.469577670097351, "learning_rate": 1.1060010482180295e-05, "loss": 0.0834, "step": 29720 }, { "epoch": 1.558176100628931, "grad_norm": 1.481614589691162, "learning_rate": 1.1046907756813418e-05, "loss": 0.0655, "step": 29730 }, { "epoch": 1.558700209643606, "grad_norm": 1.0949363708496094, "learning_rate": 1.1033805031446542e-05, "loss": 0.0814, "step": 29740 }, { "epoch": 1.559224318658281, "grad_norm": 1.97059965133667, "learning_rate": 1.1020702306079665e-05, "loss": 0.0542, "step": 29750 }, { "epoch": 1.559748427672956, "grad_norm": 1.6910370588302612, "learning_rate": 1.100759958071279e-05, "loss": 0.0948, "step": 29760 }, { "epoch": 1.560272536687631, "grad_norm": 1.5514713525772095, "learning_rate": 1.0994496855345912e-05, "loss": 0.0924, "step": 29770 }, { "epoch": 1.5607966457023061, "grad_norm": 2.059285879135132, "learning_rate": 1.0981394129979037e-05, "loss": 0.0719, "step": 29780 }, { "epoch": 1.5613207547169812, "grad_norm": 1.273655891418457, "learning_rate": 1.096829140461216e-05, "loss": 0.1041, "step": 29790 }, { "epoch": 1.5618448637316562, "grad_norm": 1.5605663061141968, "learning_rate": 1.0955188679245284e-05, "loss": 0.07, "step": 29800 }, { "epoch": 1.5623689727463312, "grad_norm": 0.9228988885879517, "learning_rate": 1.0942085953878407e-05, "loss": 0.0642, "step": 29810 }, { "epoch": 1.5628930817610063, "grad_norm": 5.248297214508057, "learning_rate": 1.0928983228511532e-05, "loss": 0.0842, "step": 29820 }, { "epoch": 1.5634171907756813, "grad_norm": 1.027686595916748, "learning_rate": 1.0915880503144654e-05, "loss": 0.0771, "step": 29830 }, { "epoch": 1.5639412997903563, "grad_norm": 1.1501569747924805, "learning_rate": 1.0902777777777779e-05, "loss": 0.0803, "step": 29840 }, { "epoch": 1.5644654088050314, "grad_norm": 1.298979640007019, "learning_rate": 1.0889675052410902e-05, "loss": 0.0667, "step": 29850 }, { "epoch": 1.5649895178197064, "grad_norm": 1.156478762626648, "learning_rate": 1.0876572327044026e-05, "loss": 0.088, "step": 29860 }, { "epoch": 1.5655136268343814, "grad_norm": 1.7162197828292847, "learning_rate": 1.0863469601677149e-05, "loss": 0.0829, "step": 29870 }, { "epoch": 1.5660377358490565, "grad_norm": 1.008570909500122, "learning_rate": 1.0850366876310274e-05, "loss": 0.0697, "step": 29880 }, { "epoch": 1.5665618448637315, "grad_norm": 1.2989729642868042, "learning_rate": 1.0837264150943396e-05, "loss": 0.0591, "step": 29890 }, { "epoch": 1.5670859538784065, "grad_norm": 1.0060633420944214, "learning_rate": 1.0824161425576521e-05, "loss": 0.08, "step": 29900 }, { "epoch": 1.5676100628930818, "grad_norm": 1.4241963624954224, "learning_rate": 1.0811058700209644e-05, "loss": 0.0745, "step": 29910 }, { "epoch": 1.5681341719077568, "grad_norm": 2.18135404586792, "learning_rate": 1.0797955974842768e-05, "loss": 0.0789, "step": 29920 }, { "epoch": 1.5686582809224319, "grad_norm": 1.7818392515182495, "learning_rate": 1.0784853249475891e-05, "loss": 0.0737, "step": 29930 }, { "epoch": 1.569182389937107, "grad_norm": 1.8234585523605347, "learning_rate": 1.0771750524109016e-05, "loss": 0.0653, "step": 29940 }, { "epoch": 1.569706498951782, "grad_norm": 1.3113237619400024, "learning_rate": 1.0758647798742138e-05, "loss": 0.0976, "step": 29950 }, { "epoch": 1.570230607966457, "grad_norm": 1.1958644390106201, "learning_rate": 1.0745545073375263e-05, "loss": 0.0658, "step": 29960 }, { "epoch": 1.5707547169811322, "grad_norm": 2.510767936706543, "learning_rate": 1.0732442348008386e-05, "loss": 0.0733, "step": 29970 }, { "epoch": 1.5712788259958073, "grad_norm": 3.316685199737549, "learning_rate": 1.071933962264151e-05, "loss": 0.0893, "step": 29980 }, { "epoch": 1.5718029350104823, "grad_norm": 1.476149082183838, "learning_rate": 1.0706236897274633e-05, "loss": 0.0912, "step": 29990 }, { "epoch": 1.5723270440251573, "grad_norm": 1.2044668197631836, "learning_rate": 1.0693134171907758e-05, "loss": 0.0814, "step": 30000 }, { "epoch": 1.5723270440251573, "eval_loss": 0.26660144329071045, "eval_runtime": 267.6774, "eval_samples_per_second": 7.438, "eval_steps_per_second": 1.24, "step": 30000 }, { "epoch": 1.5728511530398324, "grad_norm": 1.601665735244751, "learning_rate": 1.0680031446540882e-05, "loss": 0.0705, "step": 30010 }, { "epoch": 1.5733752620545074, "grad_norm": 1.4815196990966797, "learning_rate": 1.0666928721174005e-05, "loss": 0.0904, "step": 30020 }, { "epoch": 1.5738993710691824, "grad_norm": 1.331384539604187, "learning_rate": 1.0653825995807128e-05, "loss": 0.0812, "step": 30030 }, { "epoch": 1.5744234800838575, "grad_norm": 2.9445841312408447, "learning_rate": 1.0640723270440252e-05, "loss": 0.0734, "step": 30040 }, { "epoch": 1.5749475890985325, "grad_norm": 3.5984508991241455, "learning_rate": 1.0627620545073377e-05, "loss": 0.0739, "step": 30050 }, { "epoch": 1.5754716981132075, "grad_norm": 2.2922754287719727, "learning_rate": 1.06145178197065e-05, "loss": 0.0612, "step": 30060 }, { "epoch": 1.5759958071278826, "grad_norm": 1.01918625831604, "learning_rate": 1.0601415094339624e-05, "loss": 0.0866, "step": 30070 }, { "epoch": 1.5765199161425576, "grad_norm": 1.2766294479370117, "learning_rate": 1.0588312368972747e-05, "loss": 0.0795, "step": 30080 }, { "epoch": 1.5770440251572326, "grad_norm": 1.4242372512817383, "learning_rate": 1.0575209643605872e-05, "loss": 0.0847, "step": 30090 }, { "epoch": 1.5775681341719077, "grad_norm": 1.5070456266403198, "learning_rate": 1.0562106918238994e-05, "loss": 0.0705, "step": 30100 }, { "epoch": 1.5780922431865827, "grad_norm": 0.5492226481437683, "learning_rate": 1.0549004192872119e-05, "loss": 0.0839, "step": 30110 }, { "epoch": 1.5786163522012577, "grad_norm": 2.577894687652588, "learning_rate": 1.0535901467505242e-05, "loss": 0.1087, "step": 30120 }, { "epoch": 1.5791404612159328, "grad_norm": 1.4646040201187134, "learning_rate": 1.0522798742138366e-05, "loss": 0.0812, "step": 30130 }, { "epoch": 1.5796645702306078, "grad_norm": 1.1385252475738525, "learning_rate": 1.0509696016771489e-05, "loss": 0.071, "step": 30140 }, { "epoch": 1.580188679245283, "grad_norm": 0.8355076909065247, "learning_rate": 1.0496593291404614e-05, "loss": 0.0766, "step": 30150 }, { "epoch": 1.580712788259958, "grad_norm": 1.4133363962173462, "learning_rate": 1.0483490566037736e-05, "loss": 0.062, "step": 30160 }, { "epoch": 1.5812368972746331, "grad_norm": 3.577280282974243, "learning_rate": 1.047038784067086e-05, "loss": 0.0829, "step": 30170 }, { "epoch": 1.5817610062893082, "grad_norm": 1.477343201637268, "learning_rate": 1.0457285115303984e-05, "loss": 0.0647, "step": 30180 }, { "epoch": 1.5822851153039832, "grad_norm": 1.4361555576324463, "learning_rate": 1.0444182389937108e-05, "loss": 0.0882, "step": 30190 }, { "epoch": 1.5828092243186582, "grad_norm": 2.002866506576538, "learning_rate": 1.0431079664570231e-05, "loss": 0.0856, "step": 30200 }, { "epoch": 1.5833333333333335, "grad_norm": 1.5531564950942993, "learning_rate": 1.0417976939203356e-05, "loss": 0.0849, "step": 30210 }, { "epoch": 1.5838574423480085, "grad_norm": 1.7375233173370361, "learning_rate": 1.0404874213836478e-05, "loss": 0.0721, "step": 30220 }, { "epoch": 1.5843815513626835, "grad_norm": 1.705243706703186, "learning_rate": 1.0391771488469603e-05, "loss": 0.0718, "step": 30230 }, { "epoch": 1.5849056603773586, "grad_norm": 0.8574244976043701, "learning_rate": 1.0378668763102724e-05, "loss": 0.0639, "step": 30240 }, { "epoch": 1.5854297693920336, "grad_norm": 1.3724231719970703, "learning_rate": 1.036556603773585e-05, "loss": 0.0738, "step": 30250 }, { "epoch": 1.5859538784067087, "grad_norm": 4.6184983253479, "learning_rate": 1.0352463312368973e-05, "loss": 0.0857, "step": 30260 }, { "epoch": 1.5864779874213837, "grad_norm": 6.087825298309326, "learning_rate": 1.0339360587002096e-05, "loss": 0.0729, "step": 30270 }, { "epoch": 1.5870020964360587, "grad_norm": 1.5708401203155518, "learning_rate": 1.032625786163522e-05, "loss": 0.0693, "step": 30280 }, { "epoch": 1.5875262054507338, "grad_norm": 1.4279801845550537, "learning_rate": 1.0313155136268345e-05, "loss": 0.0889, "step": 30290 }, { "epoch": 1.5880503144654088, "grad_norm": 1.2791606187820435, "learning_rate": 1.0300052410901468e-05, "loss": 0.0741, "step": 30300 }, { "epoch": 1.5885744234800838, "grad_norm": 1.5464717149734497, "learning_rate": 1.0286949685534592e-05, "loss": 0.0785, "step": 30310 }, { "epoch": 1.5890985324947589, "grad_norm": 4.76186990737915, "learning_rate": 1.0273846960167715e-05, "loss": 0.0658, "step": 30320 }, { "epoch": 1.5896226415094339, "grad_norm": 1.987302541732788, "learning_rate": 1.0260744234800838e-05, "loss": 0.0644, "step": 30330 }, { "epoch": 1.590146750524109, "grad_norm": 2.006908416748047, "learning_rate": 1.0247641509433962e-05, "loss": 0.055, "step": 30340 }, { "epoch": 1.590670859538784, "grad_norm": 1.641992449760437, "learning_rate": 1.0234538784067087e-05, "loss": 0.087, "step": 30350 }, { "epoch": 1.591194968553459, "grad_norm": 1.6064209938049316, "learning_rate": 1.022143605870021e-05, "loss": 0.1012, "step": 30360 }, { "epoch": 1.591719077568134, "grad_norm": 1.9769614934921265, "learning_rate": 1.0208333333333334e-05, "loss": 0.0708, "step": 30370 }, { "epoch": 1.5922431865828093, "grad_norm": 1.4612163305282593, "learning_rate": 1.0195230607966457e-05, "loss": 0.0763, "step": 30380 }, { "epoch": 1.5927672955974843, "grad_norm": 0.8201736807823181, "learning_rate": 1.018212788259958e-05, "loss": 0.0629, "step": 30390 }, { "epoch": 1.5932914046121593, "grad_norm": 1.8059312105178833, "learning_rate": 1.0169025157232705e-05, "loss": 0.0862, "step": 30400 }, { "epoch": 1.5938155136268344, "grad_norm": 2.156587600708008, "learning_rate": 1.0155922431865829e-05, "loss": 0.0975, "step": 30410 }, { "epoch": 1.5943396226415094, "grad_norm": 1.717282772064209, "learning_rate": 1.0142819706498952e-05, "loss": 0.0653, "step": 30420 }, { "epoch": 1.5948637316561844, "grad_norm": 0.8761667013168335, "learning_rate": 1.0129716981132076e-05, "loss": 0.0925, "step": 30430 }, { "epoch": 1.5953878406708597, "grad_norm": 1.7277803421020508, "learning_rate": 1.01166142557652e-05, "loss": 0.0752, "step": 30440 }, { "epoch": 1.5959119496855347, "grad_norm": 1.5075651407241821, "learning_rate": 1.0103511530398322e-05, "loss": 0.0645, "step": 30450 }, { "epoch": 1.5964360587002098, "grad_norm": 2.044546604156494, "learning_rate": 1.0090408805031447e-05, "loss": 0.0839, "step": 30460 }, { "epoch": 1.5969601677148848, "grad_norm": 0.9201863408088684, "learning_rate": 1.007730607966457e-05, "loss": 0.0731, "step": 30470 }, { "epoch": 1.5974842767295598, "grad_norm": 1.1442701816558838, "learning_rate": 1.0064203354297694e-05, "loss": 0.0771, "step": 30480 }, { "epoch": 1.5980083857442349, "grad_norm": 1.5111795663833618, "learning_rate": 1.0051100628930818e-05, "loss": 0.0885, "step": 30490 }, { "epoch": 1.59853249475891, "grad_norm": 1.7600494623184204, "learning_rate": 1.0037997903563943e-05, "loss": 0.0768, "step": 30500 }, { "epoch": 1.599056603773585, "grad_norm": 1.3260642290115356, "learning_rate": 1.0024895178197064e-05, "loss": 0.0724, "step": 30510 }, { "epoch": 1.59958071278826, "grad_norm": 1.7742761373519897, "learning_rate": 1.001179245283019e-05, "loss": 0.0708, "step": 30520 }, { "epoch": 1.600104821802935, "grad_norm": 1.3558989763259888, "learning_rate": 9.998689727463313e-06, "loss": 0.0828, "step": 30530 }, { "epoch": 1.60062893081761, "grad_norm": 2.178793430328369, "learning_rate": 9.985587002096436e-06, "loss": 0.0779, "step": 30540 }, { "epoch": 1.601153039832285, "grad_norm": 2.13956618309021, "learning_rate": 9.97248427672956e-06, "loss": 0.0604, "step": 30550 }, { "epoch": 1.60167714884696, "grad_norm": 1.9969738721847534, "learning_rate": 9.959381551362685e-06, "loss": 0.0816, "step": 30560 }, { "epoch": 1.6022012578616351, "grad_norm": 1.623021125793457, "learning_rate": 9.946278825995806e-06, "loss": 0.105, "step": 30570 }, { "epoch": 1.6027253668763102, "grad_norm": 1.9130661487579346, "learning_rate": 9.933176100628931e-06, "loss": 0.0778, "step": 30580 }, { "epoch": 1.6032494758909852, "grad_norm": 1.505086898803711, "learning_rate": 9.920073375262055e-06, "loss": 0.07, "step": 30590 }, { "epoch": 1.6037735849056602, "grad_norm": 2.169313907623291, "learning_rate": 9.906970649895178e-06, "loss": 0.0929, "step": 30600 }, { "epoch": 1.6042976939203353, "grad_norm": 1.09574294090271, "learning_rate": 9.893867924528302e-06, "loss": 0.0498, "step": 30610 }, { "epoch": 1.6048218029350105, "grad_norm": 1.1906548738479614, "learning_rate": 9.880765199161427e-06, "loss": 0.0773, "step": 30620 }, { "epoch": 1.6053459119496856, "grad_norm": 2.843764543533325, "learning_rate": 9.867662473794548e-06, "loss": 0.0815, "step": 30630 }, { "epoch": 1.6058700209643606, "grad_norm": 1.0723826885223389, "learning_rate": 9.854559748427673e-06, "loss": 0.0642, "step": 30640 }, { "epoch": 1.6063941299790356, "grad_norm": 0.8873287439346313, "learning_rate": 9.841457023060797e-06, "loss": 0.0595, "step": 30650 }, { "epoch": 1.6069182389937107, "grad_norm": 1.246100664138794, "learning_rate": 9.82835429769392e-06, "loss": 0.0818, "step": 30660 }, { "epoch": 1.6074423480083857, "grad_norm": 1.3966413736343384, "learning_rate": 9.815251572327044e-06, "loss": 0.0646, "step": 30670 }, { "epoch": 1.607966457023061, "grad_norm": 2.654226541519165, "learning_rate": 9.802148846960169e-06, "loss": 0.0825, "step": 30680 }, { "epoch": 1.608490566037736, "grad_norm": 0.9575130939483643, "learning_rate": 9.789046121593292e-06, "loss": 0.0607, "step": 30690 }, { "epoch": 1.609014675052411, "grad_norm": 1.2333629131317139, "learning_rate": 9.775943396226415e-06, "loss": 0.0654, "step": 30700 }, { "epoch": 1.609538784067086, "grad_norm": 1.891493558883667, "learning_rate": 9.762840670859539e-06, "loss": 0.0772, "step": 30710 }, { "epoch": 1.610062893081761, "grad_norm": 1.4982374906539917, "learning_rate": 9.749737945492662e-06, "loss": 0.0706, "step": 30720 }, { "epoch": 1.6105870020964361, "grad_norm": 1.1844661235809326, "learning_rate": 9.736635220125787e-06, "loss": 0.0801, "step": 30730 }, { "epoch": 1.6111111111111112, "grad_norm": 0.8028875589370728, "learning_rate": 9.72353249475891e-06, "loss": 0.0731, "step": 30740 }, { "epoch": 1.6116352201257862, "grad_norm": 1.5980677604675293, "learning_rate": 9.710429769392034e-06, "loss": 0.0789, "step": 30750 }, { "epoch": 1.6121593291404612, "grad_norm": 4.673587322235107, "learning_rate": 9.697327044025157e-06, "loss": 0.0885, "step": 30760 }, { "epoch": 1.6126834381551363, "grad_norm": 1.1823970079421997, "learning_rate": 9.684224318658283e-06, "loss": 0.0787, "step": 30770 }, { "epoch": 1.6132075471698113, "grad_norm": 1.0304325819015503, "learning_rate": 9.671121593291404e-06, "loss": 0.0987, "step": 30780 }, { "epoch": 1.6137316561844863, "grad_norm": 1.4936891794204712, "learning_rate": 9.65801886792453e-06, "loss": 0.0712, "step": 30790 }, { "epoch": 1.6142557651991614, "grad_norm": 0.8515013456344604, "learning_rate": 9.644916142557653e-06, "loss": 0.0764, "step": 30800 }, { "epoch": 1.6147798742138364, "grad_norm": 1.0494325160980225, "learning_rate": 9.631813417190776e-06, "loss": 0.095, "step": 30810 }, { "epoch": 1.6153039832285114, "grad_norm": 2.069692611694336, "learning_rate": 9.6187106918239e-06, "loss": 0.0568, "step": 30820 }, { "epoch": 1.6158280922431865, "grad_norm": 1.360653281211853, "learning_rate": 9.605607966457025e-06, "loss": 0.0832, "step": 30830 }, { "epoch": 1.6163522012578615, "grad_norm": 2.063434362411499, "learning_rate": 9.592505241090146e-06, "loss": 0.0716, "step": 30840 }, { "epoch": 1.6168763102725365, "grad_norm": 1.8808234930038452, "learning_rate": 9.579402515723271e-06, "loss": 0.0713, "step": 30850 }, { "epoch": 1.6174004192872118, "grad_norm": 2.8570516109466553, "learning_rate": 9.566299790356395e-06, "loss": 0.0763, "step": 30860 }, { "epoch": 1.6179245283018868, "grad_norm": 1.3575539588928223, "learning_rate": 9.553197064989518e-06, "loss": 0.0777, "step": 30870 }, { "epoch": 1.6184486373165619, "grad_norm": 1.710089087486267, "learning_rate": 9.540094339622641e-06, "loss": 0.0806, "step": 30880 }, { "epoch": 1.618972746331237, "grad_norm": 2.1114864349365234, "learning_rate": 9.526991614255767e-06, "loss": 0.0818, "step": 30890 }, { "epoch": 1.619496855345912, "grad_norm": 2.2566704750061035, "learning_rate": 9.513888888888888e-06, "loss": 0.0857, "step": 30900 }, { "epoch": 1.620020964360587, "grad_norm": 2.0487608909606934, "learning_rate": 9.500786163522013e-06, "loss": 0.0779, "step": 30910 }, { "epoch": 1.6205450733752622, "grad_norm": 1.2093197107315063, "learning_rate": 9.487683438155137e-06, "loss": 0.0942, "step": 30920 }, { "epoch": 1.6210691823899372, "grad_norm": 0.9803887605667114, "learning_rate": 9.47458071278826e-06, "loss": 0.0695, "step": 30930 }, { "epoch": 1.6215932914046123, "grad_norm": 1.6742619276046753, "learning_rate": 9.461477987421383e-06, "loss": 0.0709, "step": 30940 }, { "epoch": 1.6221174004192873, "grad_norm": 1.9753679037094116, "learning_rate": 9.448375262054509e-06, "loss": 0.0818, "step": 30950 }, { "epoch": 1.6226415094339623, "grad_norm": 1.4718005657196045, "learning_rate": 9.43527253668763e-06, "loss": 0.0865, "step": 30960 }, { "epoch": 1.6231656184486374, "grad_norm": 1.4376145601272583, "learning_rate": 9.422169811320755e-06, "loss": 0.0652, "step": 30970 }, { "epoch": 1.6236897274633124, "grad_norm": 1.3017988204956055, "learning_rate": 9.409067085953879e-06, "loss": 0.0706, "step": 30980 }, { "epoch": 1.6242138364779874, "grad_norm": 2.7291808128356934, "learning_rate": 9.395964360587002e-06, "loss": 0.0871, "step": 30990 }, { "epoch": 1.6247379454926625, "grad_norm": 1.0201388597488403, "learning_rate": 9.382861635220125e-06, "loss": 0.0584, "step": 31000 }, { "epoch": 1.6247379454926625, "eval_loss": 0.26702582836151123, "eval_runtime": 267.8655, "eval_samples_per_second": 7.433, "eval_steps_per_second": 1.239, "step": 31000 }, { "epoch": 1.6252620545073375, "grad_norm": 3.665487051010132, "learning_rate": 9.36975890985325e-06, "loss": 0.0844, "step": 31010 }, { "epoch": 1.6257861635220126, "grad_norm": 1.0894783735275269, "learning_rate": 9.356656184486374e-06, "loss": 0.0604, "step": 31020 }, { "epoch": 1.6263102725366876, "grad_norm": 1.6628109216690063, "learning_rate": 9.343553459119497e-06, "loss": 0.0645, "step": 31030 }, { "epoch": 1.6268343815513626, "grad_norm": 0.9611374735832214, "learning_rate": 9.33045073375262e-06, "loss": 0.0919, "step": 31040 }, { "epoch": 1.6273584905660377, "grad_norm": 1.490185022354126, "learning_rate": 9.317348008385744e-06, "loss": 0.1212, "step": 31050 }, { "epoch": 1.6278825995807127, "grad_norm": 2.0159690380096436, "learning_rate": 9.30424528301887e-06, "loss": 0.0724, "step": 31060 }, { "epoch": 1.6284067085953877, "grad_norm": 1.2287893295288086, "learning_rate": 9.291142557651993e-06, "loss": 0.0487, "step": 31070 }, { "epoch": 1.6289308176100628, "grad_norm": 1.0067164897918701, "learning_rate": 9.278039832285116e-06, "loss": 0.0842, "step": 31080 }, { "epoch": 1.6294549266247378, "grad_norm": 2.069939374923706, "learning_rate": 9.26493710691824e-06, "loss": 0.0636, "step": 31090 }, { "epoch": 1.629979035639413, "grad_norm": 1.2028172016143799, "learning_rate": 9.251834381551364e-06, "loss": 0.1012, "step": 31100 }, { "epoch": 1.630503144654088, "grad_norm": 3.1692330837249756, "learning_rate": 9.238731656184486e-06, "loss": 0.0742, "step": 31110 }, { "epoch": 1.631027253668763, "grad_norm": 1.7992483377456665, "learning_rate": 9.225628930817611e-06, "loss": 0.0736, "step": 31120 }, { "epoch": 1.6315513626834381, "grad_norm": 1.6034457683563232, "learning_rate": 9.212526205450735e-06, "loss": 0.0703, "step": 31130 }, { "epoch": 1.6320754716981132, "grad_norm": 1.4430203437805176, "learning_rate": 9.199423480083858e-06, "loss": 0.1016, "step": 31140 }, { "epoch": 1.6325995807127882, "grad_norm": 1.9012161493301392, "learning_rate": 9.186320754716981e-06, "loss": 0.0933, "step": 31150 }, { "epoch": 1.6331236897274635, "grad_norm": 1.7923085689544678, "learning_rate": 9.173218029350106e-06, "loss": 0.0862, "step": 31160 }, { "epoch": 1.6336477987421385, "grad_norm": 0.35635894536972046, "learning_rate": 9.160115303983228e-06, "loss": 0.0741, "step": 31170 }, { "epoch": 1.6341719077568135, "grad_norm": 1.7451974153518677, "learning_rate": 9.147012578616353e-06, "loss": 0.0802, "step": 31180 }, { "epoch": 1.6346960167714886, "grad_norm": 1.976898431777954, "learning_rate": 9.133909853249477e-06, "loss": 0.0759, "step": 31190 }, { "epoch": 1.6352201257861636, "grad_norm": 2.0606613159179688, "learning_rate": 9.1208071278826e-06, "loss": 0.0785, "step": 31200 }, { "epoch": 1.6357442348008386, "grad_norm": 1.5760445594787598, "learning_rate": 9.107704402515723e-06, "loss": 0.0753, "step": 31210 }, { "epoch": 1.6362683438155137, "grad_norm": 1.2342818975448608, "learning_rate": 9.094601677148848e-06, "loss": 0.0619, "step": 31220 }, { "epoch": 1.6367924528301887, "grad_norm": 0.7712125778198242, "learning_rate": 9.08149895178197e-06, "loss": 0.0603, "step": 31230 }, { "epoch": 1.6373165618448637, "grad_norm": 0.5956855416297913, "learning_rate": 9.068396226415095e-06, "loss": 0.0714, "step": 31240 }, { "epoch": 1.6378406708595388, "grad_norm": 2.2633252143859863, "learning_rate": 9.055293501048219e-06, "loss": 0.0811, "step": 31250 }, { "epoch": 1.6383647798742138, "grad_norm": 1.7489700317382812, "learning_rate": 9.042190775681342e-06, "loss": 0.0803, "step": 31260 }, { "epoch": 1.6388888888888888, "grad_norm": 1.7314835786819458, "learning_rate": 9.029088050314465e-06, "loss": 0.0852, "step": 31270 }, { "epoch": 1.6394129979035639, "grad_norm": 1.1030824184417725, "learning_rate": 9.01598532494759e-06, "loss": 0.0644, "step": 31280 }, { "epoch": 1.639937106918239, "grad_norm": 0.5983395576477051, "learning_rate": 9.002882599580712e-06, "loss": 0.0758, "step": 31290 }, { "epoch": 1.640461215932914, "grad_norm": 0.8161409497261047, "learning_rate": 8.989779874213837e-06, "loss": 0.0666, "step": 31300 }, { "epoch": 1.640985324947589, "grad_norm": 2.6947455406188965, "learning_rate": 8.97667714884696e-06, "loss": 0.0755, "step": 31310 }, { "epoch": 1.641509433962264, "grad_norm": 1.2531763315200806, "learning_rate": 8.963574423480084e-06, "loss": 0.0996, "step": 31320 }, { "epoch": 1.642033542976939, "grad_norm": 1.8375675678253174, "learning_rate": 8.950471698113207e-06, "loss": 0.067, "step": 31330 }, { "epoch": 1.6425576519916143, "grad_norm": 2.1205642223358154, "learning_rate": 8.937368972746332e-06, "loss": 0.0675, "step": 31340 }, { "epoch": 1.6430817610062893, "grad_norm": 1.071904182434082, "learning_rate": 8.924266247379456e-06, "loss": 0.0656, "step": 31350 }, { "epoch": 1.6436058700209644, "grad_norm": 1.9515632390975952, "learning_rate": 8.91116352201258e-06, "loss": 0.0793, "step": 31360 }, { "epoch": 1.6441299790356394, "grad_norm": 1.9191174507141113, "learning_rate": 8.898060796645703e-06, "loss": 0.0645, "step": 31370 }, { "epoch": 1.6446540880503144, "grad_norm": 1.7642149925231934, "learning_rate": 8.884958071278826e-06, "loss": 0.0743, "step": 31380 }, { "epoch": 1.6451781970649895, "grad_norm": 3.854020595550537, "learning_rate": 8.87185534591195e-06, "loss": 0.0908, "step": 31390 }, { "epoch": 1.6457023060796647, "grad_norm": 1.7950454950332642, "learning_rate": 8.858752620545074e-06, "loss": 0.0827, "step": 31400 }, { "epoch": 1.6462264150943398, "grad_norm": 39.41985321044922, "learning_rate": 8.845649895178198e-06, "loss": 0.0732, "step": 31410 }, { "epoch": 1.6467505241090148, "grad_norm": 1.0624687671661377, "learning_rate": 8.832547169811321e-06, "loss": 0.073, "step": 31420 }, { "epoch": 1.6472746331236898, "grad_norm": 1.975717544555664, "learning_rate": 8.819444444444445e-06, "loss": 0.1077, "step": 31430 }, { "epoch": 1.6477987421383649, "grad_norm": 1.54432213306427, "learning_rate": 8.806341719077568e-06, "loss": 0.0686, "step": 31440 }, { "epoch": 1.64832285115304, "grad_norm": 1.4304097890853882, "learning_rate": 8.793238993710693e-06, "loss": 0.0734, "step": 31450 }, { "epoch": 1.648846960167715, "grad_norm": 1.6963316202163696, "learning_rate": 8.780136268343816e-06, "loss": 0.0625, "step": 31460 }, { "epoch": 1.64937106918239, "grad_norm": 2.143986463546753, "learning_rate": 8.76703354297694e-06, "loss": 0.1018, "step": 31470 }, { "epoch": 1.649895178197065, "grad_norm": 2.237755537033081, "learning_rate": 8.753930817610063e-06, "loss": 0.0835, "step": 31480 }, { "epoch": 1.65041928721174, "grad_norm": 1.3473063707351685, "learning_rate": 8.740828092243188e-06, "loss": 0.0719, "step": 31490 }, { "epoch": 1.650943396226415, "grad_norm": 2.768249273300171, "learning_rate": 8.72772536687631e-06, "loss": 0.0754, "step": 31500 }, { "epoch": 1.65146750524109, "grad_norm": 2.2245090007781982, "learning_rate": 8.714622641509435e-06, "loss": 0.0825, "step": 31510 }, { "epoch": 1.6519916142557651, "grad_norm": 0.9257903695106506, "learning_rate": 8.701519916142558e-06, "loss": 0.1143, "step": 31520 }, { "epoch": 1.6525157232704402, "grad_norm": 2.393784999847412, "learning_rate": 8.688417190775682e-06, "loss": 0.0977, "step": 31530 }, { "epoch": 1.6530398322851152, "grad_norm": 0.9498158693313599, "learning_rate": 8.675314465408805e-06, "loss": 0.0735, "step": 31540 }, { "epoch": 1.6535639412997902, "grad_norm": 1.3141714334487915, "learning_rate": 8.66221174004193e-06, "loss": 0.0538, "step": 31550 }, { "epoch": 1.6540880503144653, "grad_norm": 1.4560273885726929, "learning_rate": 8.649109014675052e-06, "loss": 0.0747, "step": 31560 }, { "epoch": 1.6546121593291403, "grad_norm": 2.20974063873291, "learning_rate": 8.636006289308177e-06, "loss": 0.0816, "step": 31570 }, { "epoch": 1.6551362683438156, "grad_norm": 1.07097327709198, "learning_rate": 8.6229035639413e-06, "loss": 0.085, "step": 31580 }, { "epoch": 1.6556603773584906, "grad_norm": 2.0411245822906494, "learning_rate": 8.609800838574424e-06, "loss": 0.1071, "step": 31590 }, { "epoch": 1.6561844863731656, "grad_norm": 2.4444754123687744, "learning_rate": 8.596698113207547e-06, "loss": 0.1079, "step": 31600 }, { "epoch": 1.6567085953878407, "grad_norm": 1.7422720193862915, "learning_rate": 8.583595387840672e-06, "loss": 0.0527, "step": 31610 }, { "epoch": 1.6572327044025157, "grad_norm": 1.8651151657104492, "learning_rate": 8.570492662473794e-06, "loss": 0.0854, "step": 31620 }, { "epoch": 1.6577568134171907, "grad_norm": 1.7437045574188232, "learning_rate": 8.557389937106919e-06, "loss": 0.0758, "step": 31630 }, { "epoch": 1.658280922431866, "grad_norm": 1.2671029567718506, "learning_rate": 8.544287211740042e-06, "loss": 0.0868, "step": 31640 }, { "epoch": 1.658805031446541, "grad_norm": 1.625141978263855, "learning_rate": 8.531184486373166e-06, "loss": 0.0598, "step": 31650 }, { "epoch": 1.659329140461216, "grad_norm": 1.3473106622695923, "learning_rate": 8.51808176100629e-06, "loss": 0.0527, "step": 31660 }, { "epoch": 1.659853249475891, "grad_norm": 1.9228523969650269, "learning_rate": 8.504979035639414e-06, "loss": 0.0719, "step": 31670 }, { "epoch": 1.6603773584905661, "grad_norm": 2.128453016281128, "learning_rate": 8.491876310272536e-06, "loss": 0.1052, "step": 31680 }, { "epoch": 1.6609014675052411, "grad_norm": 1.360034465789795, "learning_rate": 8.478773584905661e-06, "loss": 0.0727, "step": 31690 }, { "epoch": 1.6614255765199162, "grad_norm": 1.688598394393921, "learning_rate": 8.465670859538784e-06, "loss": 0.0994, "step": 31700 }, { "epoch": 1.6619496855345912, "grad_norm": 1.1820000410079956, "learning_rate": 8.452568134171908e-06, "loss": 0.073, "step": 31710 }, { "epoch": 1.6624737945492662, "grad_norm": 1.195401906967163, "learning_rate": 8.439465408805031e-06, "loss": 0.0593, "step": 31720 }, { "epoch": 1.6629979035639413, "grad_norm": 1.9621461629867554, "learning_rate": 8.426362683438156e-06, "loss": 0.0728, "step": 31730 }, { "epoch": 1.6635220125786163, "grad_norm": 2.7265822887420654, "learning_rate": 8.41325995807128e-06, "loss": 0.0825, "step": 31740 }, { "epoch": 1.6640461215932913, "grad_norm": 0.510811984539032, "learning_rate": 8.400157232704403e-06, "loss": 0.0595, "step": 31750 }, { "epoch": 1.6645702306079664, "grad_norm": 1.9401181936264038, "learning_rate": 8.387054507337526e-06, "loss": 0.095, "step": 31760 }, { "epoch": 1.6650943396226414, "grad_norm": 0.9660150408744812, "learning_rate": 8.37395178197065e-06, "loss": 0.0613, "step": 31770 }, { "epoch": 1.6656184486373165, "grad_norm": 1.6908096075057983, "learning_rate": 8.360849056603775e-06, "loss": 0.076, "step": 31780 }, { "epoch": 1.6661425576519915, "grad_norm": 2.0633528232574463, "learning_rate": 8.347746331236898e-06, "loss": 0.0697, "step": 31790 }, { "epoch": 1.6666666666666665, "grad_norm": 1.6293288469314575, "learning_rate": 8.334643605870022e-06, "loss": 0.0751, "step": 31800 }, { "epoch": 1.6671907756813418, "grad_norm": 2.2690975666046143, "learning_rate": 8.321540880503145e-06, "loss": 0.0903, "step": 31810 }, { "epoch": 1.6677148846960168, "grad_norm": 1.2514418363571167, "learning_rate": 8.308438155136269e-06, "loss": 0.0881, "step": 31820 }, { "epoch": 1.6682389937106918, "grad_norm": 1.7849570512771606, "learning_rate": 8.295335429769392e-06, "loss": 0.0651, "step": 31830 }, { "epoch": 1.6687631027253669, "grad_norm": 1.8932183980941772, "learning_rate": 8.282232704402517e-06, "loss": 0.0693, "step": 31840 }, { "epoch": 1.669287211740042, "grad_norm": 1.2900819778442383, "learning_rate": 8.269129979035639e-06, "loss": 0.0743, "step": 31850 }, { "epoch": 1.669811320754717, "grad_norm": 2.7592389583587646, "learning_rate": 8.256027253668764e-06, "loss": 0.0761, "step": 31860 }, { "epoch": 1.6703354297693922, "grad_norm": 2.449463367462158, "learning_rate": 8.242924528301887e-06, "loss": 0.0778, "step": 31870 }, { "epoch": 1.6708595387840672, "grad_norm": 1.5721780061721802, "learning_rate": 8.22982180293501e-06, "loss": 0.0703, "step": 31880 }, { "epoch": 1.6713836477987423, "grad_norm": 1.907634973526001, "learning_rate": 8.216719077568134e-06, "loss": 0.0777, "step": 31890 }, { "epoch": 1.6719077568134173, "grad_norm": 1.0545610189437866, "learning_rate": 8.203616352201259e-06, "loss": 0.0707, "step": 31900 }, { "epoch": 1.6724318658280923, "grad_norm": 2.518768072128296, "learning_rate": 8.19051362683438e-06, "loss": 0.0736, "step": 31910 }, { "epoch": 1.6729559748427674, "grad_norm": 1.4451135396957397, "learning_rate": 8.177410901467506e-06, "loss": 0.0736, "step": 31920 }, { "epoch": 1.6734800838574424, "grad_norm": 1.5687718391418457, "learning_rate": 8.164308176100629e-06, "loss": 0.0707, "step": 31930 }, { "epoch": 1.6740041928721174, "grad_norm": 1.0529969930648804, "learning_rate": 8.151205450733753e-06, "loss": 0.0878, "step": 31940 }, { "epoch": 1.6745283018867925, "grad_norm": 1.8931552171707153, "learning_rate": 8.138102725366876e-06, "loss": 0.0829, "step": 31950 }, { "epoch": 1.6750524109014675, "grad_norm": 1.5445560216903687, "learning_rate": 8.125000000000001e-06, "loss": 0.0655, "step": 31960 }, { "epoch": 1.6755765199161425, "grad_norm": 1.209445834159851, "learning_rate": 8.111897274633123e-06, "loss": 0.0687, "step": 31970 }, { "epoch": 1.6761006289308176, "grad_norm": 1.4542816877365112, "learning_rate": 8.098794549266248e-06, "loss": 0.0859, "step": 31980 }, { "epoch": 1.6766247379454926, "grad_norm": 1.4635159969329834, "learning_rate": 8.085691823899371e-06, "loss": 0.0772, "step": 31990 }, { "epoch": 1.6771488469601676, "grad_norm": 1.0781625509262085, "learning_rate": 8.072589098532495e-06, "loss": 0.0818, "step": 32000 }, { "epoch": 1.6771488469601676, "eval_loss": 0.2654711604118347, "eval_runtime": 266.8065, "eval_samples_per_second": 7.462, "eval_steps_per_second": 1.244, "step": 32000 }, { "epoch": 1.6776729559748427, "grad_norm": 1.2641340494155884, "learning_rate": 8.059486373165618e-06, "loss": 0.0691, "step": 32010 }, { "epoch": 1.6781970649895177, "grad_norm": 0.6858422756195068, "learning_rate": 8.046383647798743e-06, "loss": 0.0607, "step": 32020 }, { "epoch": 1.6787211740041927, "grad_norm": 1.2624872922897339, "learning_rate": 8.033280922431866e-06, "loss": 0.0675, "step": 32030 }, { "epoch": 1.6792452830188678, "grad_norm": 1.3456783294677734, "learning_rate": 8.02017819706499e-06, "loss": 0.0799, "step": 32040 }, { "epoch": 1.679769392033543, "grad_norm": 1.8370672464370728, "learning_rate": 8.007075471698113e-06, "loss": 0.0857, "step": 32050 }, { "epoch": 1.680293501048218, "grad_norm": 0.7102406024932861, "learning_rate": 7.993972746331237e-06, "loss": 0.0552, "step": 32060 }, { "epoch": 1.680817610062893, "grad_norm": 1.2570568323135376, "learning_rate": 7.980870020964362e-06, "loss": 0.0611, "step": 32070 }, { "epoch": 1.6813417190775681, "grad_norm": 1.3005815744400024, "learning_rate": 7.967767295597485e-06, "loss": 0.0898, "step": 32080 }, { "epoch": 1.6818658280922432, "grad_norm": 1.8173960447311401, "learning_rate": 7.954664570230608e-06, "loss": 0.0866, "step": 32090 }, { "epoch": 1.6823899371069182, "grad_norm": 0.9965292811393738, "learning_rate": 7.941561844863732e-06, "loss": 0.0599, "step": 32100 }, { "epoch": 1.6829140461215935, "grad_norm": 2.850111246109009, "learning_rate": 7.928459119496857e-06, "loss": 0.0999, "step": 32110 }, { "epoch": 1.6834381551362685, "grad_norm": 1.4791312217712402, "learning_rate": 7.915356394129979e-06, "loss": 0.0638, "step": 32120 }, { "epoch": 1.6839622641509435, "grad_norm": 1.24795401096344, "learning_rate": 7.902253668763104e-06, "loss": 0.0735, "step": 32130 }, { "epoch": 1.6844863731656186, "grad_norm": 1.7736852169036865, "learning_rate": 7.889150943396227e-06, "loss": 0.0833, "step": 32140 }, { "epoch": 1.6850104821802936, "grad_norm": 2.36114501953125, "learning_rate": 7.87604821802935e-06, "loss": 0.0716, "step": 32150 }, { "epoch": 1.6855345911949686, "grad_norm": 2.078085422515869, "learning_rate": 7.862945492662474e-06, "loss": 0.0707, "step": 32160 }, { "epoch": 1.6860587002096437, "grad_norm": 2.08408784866333, "learning_rate": 7.849842767295599e-06, "loss": 0.0796, "step": 32170 }, { "epoch": 1.6865828092243187, "grad_norm": 1.5705268383026123, "learning_rate": 7.83674004192872e-06, "loss": 0.0968, "step": 32180 }, { "epoch": 1.6871069182389937, "grad_norm": 2.4469356536865234, "learning_rate": 7.823637316561846e-06, "loss": 0.0784, "step": 32190 }, { "epoch": 1.6876310272536688, "grad_norm": 2.0996434688568115, "learning_rate": 7.810534591194969e-06, "loss": 0.0739, "step": 32200 }, { "epoch": 1.6881551362683438, "grad_norm": 1.6528130769729614, "learning_rate": 7.797431865828092e-06, "loss": 0.0769, "step": 32210 }, { "epoch": 1.6886792452830188, "grad_norm": 1.4783482551574707, "learning_rate": 7.784329140461216e-06, "loss": 0.075, "step": 32220 }, { "epoch": 1.6892033542976939, "grad_norm": 1.4927986860275269, "learning_rate": 7.771226415094341e-06, "loss": 0.0874, "step": 32230 }, { "epoch": 1.689727463312369, "grad_norm": 1.3704826831817627, "learning_rate": 7.758123689727463e-06, "loss": 0.0762, "step": 32240 }, { "epoch": 1.690251572327044, "grad_norm": 1.3907643556594849, "learning_rate": 7.745020964360588e-06, "loss": 0.0639, "step": 32250 }, { "epoch": 1.690775681341719, "grad_norm": 3.422758102416992, "learning_rate": 7.731918238993711e-06, "loss": 0.0658, "step": 32260 }, { "epoch": 1.691299790356394, "grad_norm": 1.4427984952926636, "learning_rate": 7.718815513626834e-06, "loss": 0.1001, "step": 32270 }, { "epoch": 1.691823899371069, "grad_norm": 2.1378979682922363, "learning_rate": 7.705712788259958e-06, "loss": 0.0795, "step": 32280 }, { "epoch": 1.6923480083857443, "grad_norm": 1.7514784336090088, "learning_rate": 7.692610062893083e-06, "loss": 0.064, "step": 32290 }, { "epoch": 1.6928721174004193, "grad_norm": 1.7054566144943237, "learning_rate": 7.679507337526205e-06, "loss": 0.0541, "step": 32300 }, { "epoch": 1.6933962264150944, "grad_norm": 0.8755590319633484, "learning_rate": 7.66640461215933e-06, "loss": 0.06, "step": 32310 }, { "epoch": 1.6939203354297694, "grad_norm": 1.724295735359192, "learning_rate": 7.653301886792453e-06, "loss": 0.0478, "step": 32320 }, { "epoch": 1.6944444444444444, "grad_norm": 1.4551647901535034, "learning_rate": 7.640199161425576e-06, "loss": 0.0612, "step": 32330 }, { "epoch": 1.6949685534591195, "grad_norm": 1.4933216571807861, "learning_rate": 7.627096436058701e-06, "loss": 0.0804, "step": 32340 }, { "epoch": 1.6954926624737947, "grad_norm": 2.513120651245117, "learning_rate": 7.613993710691825e-06, "loss": 0.0713, "step": 32350 }, { "epoch": 1.6960167714884697, "grad_norm": 1.3239364624023438, "learning_rate": 7.600890985324947e-06, "loss": 0.0798, "step": 32360 }, { "epoch": 1.6965408805031448, "grad_norm": 2.5568461418151855, "learning_rate": 7.587788259958072e-06, "loss": 0.0703, "step": 32370 }, { "epoch": 1.6970649895178198, "grad_norm": 1.7434065341949463, "learning_rate": 7.574685534591196e-06, "loss": 0.0681, "step": 32380 }, { "epoch": 1.6975890985324948, "grad_norm": 3.6494479179382324, "learning_rate": 7.561582809224318e-06, "loss": 0.0823, "step": 32390 }, { "epoch": 1.6981132075471699, "grad_norm": 1.6936825513839722, "learning_rate": 7.548480083857443e-06, "loss": 0.0895, "step": 32400 }, { "epoch": 1.698637316561845, "grad_norm": 1.0573102235794067, "learning_rate": 7.535377358490567e-06, "loss": 0.073, "step": 32410 }, { "epoch": 1.69916142557652, "grad_norm": 0.9793390035629272, "learning_rate": 7.522274633123689e-06, "loss": 0.0593, "step": 32420 }, { "epoch": 1.699685534591195, "grad_norm": 1.1624884605407715, "learning_rate": 7.509171907756814e-06, "loss": 0.0583, "step": 32430 }, { "epoch": 1.70020964360587, "grad_norm": 1.6360301971435547, "learning_rate": 7.496069182389938e-06, "loss": 0.0801, "step": 32440 }, { "epoch": 1.700733752620545, "grad_norm": 2.4007883071899414, "learning_rate": 7.48296645702306e-06, "loss": 0.0979, "step": 32450 }, { "epoch": 1.70125786163522, "grad_norm": 1.2194461822509766, "learning_rate": 7.469863731656185e-06, "loss": 0.0702, "step": 32460 }, { "epoch": 1.7017819706498951, "grad_norm": 0.7657409906387329, "learning_rate": 7.456761006289309e-06, "loss": 0.0647, "step": 32470 }, { "epoch": 1.7023060796645701, "grad_norm": 3.282167434692383, "learning_rate": 7.4436582809224314e-06, "loss": 0.095, "step": 32480 }, { "epoch": 1.7028301886792452, "grad_norm": 1.2034971714019775, "learning_rate": 7.430555555555556e-06, "loss": 0.0901, "step": 32490 }, { "epoch": 1.7033542976939202, "grad_norm": 2.2986252307891846, "learning_rate": 7.41745283018868e-06, "loss": 0.0612, "step": 32500 }, { "epoch": 1.7038784067085953, "grad_norm": 0.8976192474365234, "learning_rate": 7.4043501048218024e-06, "loss": 0.0573, "step": 32510 }, { "epoch": 1.7044025157232703, "grad_norm": 1.3209949731826782, "learning_rate": 7.391247379454927e-06, "loss": 0.069, "step": 32520 }, { "epoch": 1.7049266247379455, "grad_norm": 3.4305360317230225, "learning_rate": 7.378144654088051e-06, "loss": 0.0876, "step": 32530 }, { "epoch": 1.7054507337526206, "grad_norm": 2.0367486476898193, "learning_rate": 7.365041928721174e-06, "loss": 0.1007, "step": 32540 }, { "epoch": 1.7059748427672956, "grad_norm": 1.563010334968567, "learning_rate": 7.351939203354298e-06, "loss": 0.0704, "step": 32550 }, { "epoch": 1.7064989517819706, "grad_norm": 1.1888530254364014, "learning_rate": 7.338836477987422e-06, "loss": 0.0697, "step": 32560 }, { "epoch": 1.7070230607966457, "grad_norm": 2.30206561088562, "learning_rate": 7.325733752620545e-06, "loss": 0.0774, "step": 32570 }, { "epoch": 1.7075471698113207, "grad_norm": 1.4187016487121582, "learning_rate": 7.3126310272536695e-06, "loss": 0.0823, "step": 32580 }, { "epoch": 1.708071278825996, "grad_norm": 1.6031312942504883, "learning_rate": 7.299528301886793e-06, "loss": 0.0951, "step": 32590 }, { "epoch": 1.708595387840671, "grad_norm": 1.295620083808899, "learning_rate": 7.286425576519916e-06, "loss": 0.0878, "step": 32600 }, { "epoch": 1.709119496855346, "grad_norm": 1.4431999921798706, "learning_rate": 7.2733228511530405e-06, "loss": 0.0847, "step": 32610 }, { "epoch": 1.709643605870021, "grad_norm": 1.8405075073242188, "learning_rate": 7.260220125786165e-06, "loss": 0.0621, "step": 32620 }, { "epoch": 1.710167714884696, "grad_norm": 0.9919503927230835, "learning_rate": 7.247117400419287e-06, "loss": 0.064, "step": 32630 }, { "epoch": 1.7106918238993711, "grad_norm": 1.1123491525650024, "learning_rate": 7.2340146750524115e-06, "loss": 0.0621, "step": 32640 }, { "epoch": 1.7112159329140462, "grad_norm": 1.684038758277893, "learning_rate": 7.220911949685536e-06, "loss": 0.0957, "step": 32650 }, { "epoch": 1.7117400419287212, "grad_norm": 1.1205902099609375, "learning_rate": 7.207809224318658e-06, "loss": 0.094, "step": 32660 }, { "epoch": 1.7122641509433962, "grad_norm": 2.6448683738708496, "learning_rate": 7.1947064989517825e-06, "loss": 0.0735, "step": 32670 }, { "epoch": 1.7127882599580713, "grad_norm": 1.614770531654358, "learning_rate": 7.181603773584907e-06, "loss": 0.0694, "step": 32680 }, { "epoch": 1.7133123689727463, "grad_norm": 1.2186577320098877, "learning_rate": 7.168501048218029e-06, "loss": 0.0793, "step": 32690 }, { "epoch": 1.7138364779874213, "grad_norm": 2.874343156814575, "learning_rate": 7.1553983228511535e-06, "loss": 0.0806, "step": 32700 }, { "epoch": 1.7143605870020964, "grad_norm": 1.3548649549484253, "learning_rate": 7.142295597484278e-06, "loss": 0.075, "step": 32710 }, { "epoch": 1.7148846960167714, "grad_norm": 0.8531060218811035, "learning_rate": 7.1291928721174e-06, "loss": 0.0579, "step": 32720 }, { "epoch": 1.7154088050314464, "grad_norm": 1.160235047340393, "learning_rate": 7.1160901467505245e-06, "loss": 0.0667, "step": 32730 }, { "epoch": 1.7159329140461215, "grad_norm": 1.3317363262176514, "learning_rate": 7.102987421383649e-06, "loss": 0.0869, "step": 32740 }, { "epoch": 1.7164570230607965, "grad_norm": 1.2752031087875366, "learning_rate": 7.089884696016771e-06, "loss": 0.081, "step": 32750 }, { "epoch": 1.7169811320754715, "grad_norm": 1.738019585609436, "learning_rate": 7.0767819706498955e-06, "loss": 0.0777, "step": 32760 }, { "epoch": 1.7175052410901468, "grad_norm": 1.576297402381897, "learning_rate": 7.06367924528302e-06, "loss": 0.0893, "step": 32770 }, { "epoch": 1.7180293501048218, "grad_norm": 1.2655184268951416, "learning_rate": 7.050576519916142e-06, "loss": 0.0678, "step": 32780 }, { "epoch": 1.7185534591194969, "grad_norm": 3.7932636737823486, "learning_rate": 7.0374737945492665e-06, "loss": 0.0686, "step": 32790 }, { "epoch": 1.719077568134172, "grad_norm": 1.3198883533477783, "learning_rate": 7.024371069182391e-06, "loss": 0.076, "step": 32800 }, { "epoch": 1.719601677148847, "grad_norm": 1.3575078248977661, "learning_rate": 7.011268343815513e-06, "loss": 0.0739, "step": 32810 }, { "epoch": 1.720125786163522, "grad_norm": 0.8939098715782166, "learning_rate": 6.9981656184486375e-06, "loss": 0.0742, "step": 32820 }, { "epoch": 1.7206498951781972, "grad_norm": 2.349097728729248, "learning_rate": 6.985062893081762e-06, "loss": 0.0636, "step": 32830 }, { "epoch": 1.7211740041928723, "grad_norm": 1.1938402652740479, "learning_rate": 6.971960167714884e-06, "loss": 0.0707, "step": 32840 }, { "epoch": 1.7216981132075473, "grad_norm": 1.7119457721710205, "learning_rate": 6.9588574423480085e-06, "loss": 0.0935, "step": 32850 }, { "epoch": 1.7222222222222223, "grad_norm": 1.0587613582611084, "learning_rate": 6.945754716981133e-06, "loss": 0.0601, "step": 32860 }, { "epoch": 1.7227463312368974, "grad_norm": 1.8793436288833618, "learning_rate": 6.932651991614255e-06, "loss": 0.0823, "step": 32870 }, { "epoch": 1.7232704402515724, "grad_norm": 2.296818733215332, "learning_rate": 6.9195492662473795e-06, "loss": 0.0805, "step": 32880 }, { "epoch": 1.7237945492662474, "grad_norm": 1.4315125942230225, "learning_rate": 6.906446540880504e-06, "loss": 0.0796, "step": 32890 }, { "epoch": 1.7243186582809225, "grad_norm": 1.7421220541000366, "learning_rate": 6.893343815513627e-06, "loss": 0.1007, "step": 32900 }, { "epoch": 1.7248427672955975, "grad_norm": 1.3139567375183105, "learning_rate": 6.8802410901467506e-06, "loss": 0.0681, "step": 32910 }, { "epoch": 1.7253668763102725, "grad_norm": 0.7437646389007568, "learning_rate": 6.867138364779875e-06, "loss": 0.0792, "step": 32920 }, { "epoch": 1.7258909853249476, "grad_norm": 1.4558409452438354, "learning_rate": 6.854035639412998e-06, "loss": 0.0786, "step": 32930 }, { "epoch": 1.7264150943396226, "grad_norm": 0.9515263438224792, "learning_rate": 6.840932914046122e-06, "loss": 0.0832, "step": 32940 }, { "epoch": 1.7269392033542976, "grad_norm": 1.8956114053726196, "learning_rate": 6.827830188679246e-06, "loss": 0.062, "step": 32950 }, { "epoch": 1.7274633123689727, "grad_norm": 0.8939383029937744, "learning_rate": 6.814727463312369e-06, "loss": 0.0942, "step": 32960 }, { "epoch": 1.7279874213836477, "grad_norm": 1.30724036693573, "learning_rate": 6.801624737945493e-06, "loss": 0.0728, "step": 32970 }, { "epoch": 1.7285115303983227, "grad_norm": 1.8635292053222656, "learning_rate": 6.788522012578618e-06, "loss": 0.0693, "step": 32980 }, { "epoch": 1.7290356394129978, "grad_norm": 1.9945436716079712, "learning_rate": 6.77541928721174e-06, "loss": 0.0585, "step": 32990 }, { "epoch": 1.7295597484276728, "grad_norm": 1.2557333707809448, "learning_rate": 6.762316561844864e-06, "loss": 0.0793, "step": 33000 }, { "epoch": 1.7295597484276728, "eval_loss": 0.2656969130039215, "eval_runtime": 268.277, "eval_samples_per_second": 7.421, "eval_steps_per_second": 1.238, "step": 33000 }, { "epoch": 1.730083857442348, "grad_norm": 0.8602995872497559, "learning_rate": 6.749213836477989e-06, "loss": 0.0698, "step": 33010 }, { "epoch": 1.730607966457023, "grad_norm": 1.8230172395706177, "learning_rate": 6.736111111111111e-06, "loss": 0.065, "step": 33020 }, { "epoch": 1.7311320754716981, "grad_norm": 1.202260136604309, "learning_rate": 6.723008385744235e-06, "loss": 0.082, "step": 33030 }, { "epoch": 1.7316561844863732, "grad_norm": 1.9522544145584106, "learning_rate": 6.70990566037736e-06, "loss": 0.0736, "step": 33040 }, { "epoch": 1.7321802935010482, "grad_norm": 1.8593506813049316, "learning_rate": 6.696802935010482e-06, "loss": 0.0876, "step": 33050 }, { "epoch": 1.7327044025157232, "grad_norm": 1.674597144126892, "learning_rate": 6.683700209643606e-06, "loss": 0.0905, "step": 33060 }, { "epoch": 1.7332285115303985, "grad_norm": 0.780911922454834, "learning_rate": 6.670597484276731e-06, "loss": 0.0694, "step": 33070 }, { "epoch": 1.7337526205450735, "grad_norm": 1.7897731065750122, "learning_rate": 6.657494758909853e-06, "loss": 0.0852, "step": 33080 }, { "epoch": 1.7342767295597485, "grad_norm": 1.1359392404556274, "learning_rate": 6.644392033542977e-06, "loss": 0.0464, "step": 33090 }, { "epoch": 1.7348008385744236, "grad_norm": 1.3630025386810303, "learning_rate": 6.631289308176102e-06, "loss": 0.0614, "step": 33100 }, { "epoch": 1.7353249475890986, "grad_norm": 1.1544018983840942, "learning_rate": 6.618186582809224e-06, "loss": 0.0774, "step": 33110 }, { "epoch": 1.7358490566037736, "grad_norm": 2.56064510345459, "learning_rate": 6.605083857442348e-06, "loss": 0.0788, "step": 33120 }, { "epoch": 1.7363731656184487, "grad_norm": 1.8875850439071655, "learning_rate": 6.591981132075473e-06, "loss": 0.0934, "step": 33130 }, { "epoch": 1.7368972746331237, "grad_norm": 1.464888334274292, "learning_rate": 6.578878406708595e-06, "loss": 0.0748, "step": 33140 }, { "epoch": 1.7374213836477987, "grad_norm": 1.6824110746383667, "learning_rate": 6.5657756813417194e-06, "loss": 0.0636, "step": 33150 }, { "epoch": 1.7379454926624738, "grad_norm": 1.7789379358291626, "learning_rate": 6.552672955974844e-06, "loss": 0.0628, "step": 33160 }, { "epoch": 1.7384696016771488, "grad_norm": 1.5505372285842896, "learning_rate": 6.539570230607966e-06, "loss": 0.0688, "step": 33170 }, { "epoch": 1.7389937106918238, "grad_norm": 1.3862566947937012, "learning_rate": 6.5264675052410904e-06, "loss": 0.0884, "step": 33180 }, { "epoch": 1.7395178197064989, "grad_norm": 2.0535497665405273, "learning_rate": 6.513364779874215e-06, "loss": 0.0731, "step": 33190 }, { "epoch": 1.740041928721174, "grad_norm": 2.2445552349090576, "learning_rate": 6.500262054507337e-06, "loss": 0.0587, "step": 33200 }, { "epoch": 1.740566037735849, "grad_norm": 2.1130218505859375, "learning_rate": 6.4871593291404614e-06, "loss": 0.0546, "step": 33210 }, { "epoch": 1.741090146750524, "grad_norm": 7.03181266784668, "learning_rate": 6.474056603773586e-06, "loss": 0.0894, "step": 33220 }, { "epoch": 1.741614255765199, "grad_norm": 1.9390767812728882, "learning_rate": 6.460953878406708e-06, "loss": 0.068, "step": 33230 }, { "epoch": 1.742138364779874, "grad_norm": 1.0925933122634888, "learning_rate": 6.4478511530398324e-06, "loss": 0.0773, "step": 33240 }, { "epoch": 1.7426624737945493, "grad_norm": 2.1912240982055664, "learning_rate": 6.434748427672957e-06, "loss": 0.095, "step": 33250 }, { "epoch": 1.7431865828092243, "grad_norm": 1.3166011571884155, "learning_rate": 6.42164570230608e-06, "loss": 0.0768, "step": 33260 }, { "epoch": 1.7437106918238994, "grad_norm": 0.6529295444488525, "learning_rate": 6.4085429769392034e-06, "loss": 0.0857, "step": 33270 }, { "epoch": 1.7442348008385744, "grad_norm": 1.3718055486679077, "learning_rate": 6.395440251572328e-06, "loss": 0.0739, "step": 33280 }, { "epoch": 1.7447589098532494, "grad_norm": 1.3556514978408813, "learning_rate": 6.382337526205451e-06, "loss": 0.051, "step": 33290 }, { "epoch": 1.7452830188679245, "grad_norm": 0.8113279938697815, "learning_rate": 6.369234800838575e-06, "loss": 0.0675, "step": 33300 }, { "epoch": 1.7458071278825997, "grad_norm": 1.9193611145019531, "learning_rate": 6.356132075471699e-06, "loss": 0.1029, "step": 33310 }, { "epoch": 1.7463312368972748, "grad_norm": 1.2385278940200806, "learning_rate": 6.343029350104822e-06, "loss": 0.0942, "step": 33320 }, { "epoch": 1.7468553459119498, "grad_norm": 1.3271894454956055, "learning_rate": 6.329926624737946e-06, "loss": 0.0754, "step": 33330 }, { "epoch": 1.7473794549266248, "grad_norm": 2.492251396179199, "learning_rate": 6.3168238993710705e-06, "loss": 0.0789, "step": 33340 }, { "epoch": 1.7479035639412999, "grad_norm": 1.195293664932251, "learning_rate": 6.303721174004193e-06, "loss": 0.0594, "step": 33350 }, { "epoch": 1.748427672955975, "grad_norm": 1.112662672996521, "learning_rate": 6.290618448637317e-06, "loss": 0.0632, "step": 33360 }, { "epoch": 1.74895178197065, "grad_norm": 0.9955492615699768, "learning_rate": 6.2775157232704415e-06, "loss": 0.0686, "step": 33370 }, { "epoch": 1.749475890985325, "grad_norm": 1.3373956680297852, "learning_rate": 6.264412997903564e-06, "loss": 0.0725, "step": 33380 }, { "epoch": 1.75, "grad_norm": 1.872594952583313, "learning_rate": 6.251310272536688e-06, "loss": 0.0652, "step": 33390 }, { "epoch": 1.750524109014675, "grad_norm": 1.3409441709518433, "learning_rate": 6.238207547169812e-06, "loss": 0.0834, "step": 33400 }, { "epoch": 1.75104821802935, "grad_norm": 0.4260615110397339, "learning_rate": 6.225104821802935e-06, "loss": 0.0771, "step": 33410 }, { "epoch": 1.751572327044025, "grad_norm": 2.4701662063598633, "learning_rate": 6.212002096436059e-06, "loss": 0.0763, "step": 33420 }, { "epoch": 1.7520964360587001, "grad_norm": 0.7983285188674927, "learning_rate": 6.198899371069183e-06, "loss": 0.057, "step": 33430 }, { "epoch": 1.7526205450733752, "grad_norm": 1.0928597450256348, "learning_rate": 6.185796645702306e-06, "loss": 0.0618, "step": 33440 }, { "epoch": 1.7531446540880502, "grad_norm": 1.12595534324646, "learning_rate": 6.17269392033543e-06, "loss": 0.066, "step": 33450 }, { "epoch": 1.7536687631027252, "grad_norm": 1.363580346107483, "learning_rate": 6.159591194968554e-06, "loss": 0.0926, "step": 33460 }, { "epoch": 1.7541928721174003, "grad_norm": 2.26957106590271, "learning_rate": 6.146488469601677e-06, "loss": 0.0755, "step": 33470 }, { "epoch": 1.7547169811320755, "grad_norm": 1.3690286874771118, "learning_rate": 6.133385744234801e-06, "loss": 0.102, "step": 33480 }, { "epoch": 1.7552410901467506, "grad_norm": 1.2949638366699219, "learning_rate": 6.120283018867925e-06, "loss": 0.0681, "step": 33490 }, { "epoch": 1.7557651991614256, "grad_norm": 2.4410061836242676, "learning_rate": 6.107180293501048e-06, "loss": 0.0868, "step": 33500 }, { "epoch": 1.7562893081761006, "grad_norm": 1.083730936050415, "learning_rate": 6.094077568134172e-06, "loss": 0.0666, "step": 33510 }, { "epoch": 1.7568134171907757, "grad_norm": 1.5479590892791748, "learning_rate": 6.080974842767296e-06, "loss": 0.055, "step": 33520 }, { "epoch": 1.7573375262054507, "grad_norm": 1.6351350545883179, "learning_rate": 6.067872117400419e-06, "loss": 0.086, "step": 33530 }, { "epoch": 1.757861635220126, "grad_norm": 0.9401642680168152, "learning_rate": 6.054769392033543e-06, "loss": 0.0827, "step": 33540 }, { "epoch": 1.758385744234801, "grad_norm": 3.313169240951538, "learning_rate": 6.041666666666667e-06, "loss": 0.0769, "step": 33550 }, { "epoch": 1.758909853249476, "grad_norm": 0.568518340587616, "learning_rate": 6.02856394129979e-06, "loss": 0.082, "step": 33560 }, { "epoch": 1.759433962264151, "grad_norm": 1.2666873931884766, "learning_rate": 6.015461215932914e-06, "loss": 0.0736, "step": 33570 }, { "epoch": 1.759958071278826, "grad_norm": 2.380655527114868, "learning_rate": 6.002358490566038e-06, "loss": 0.0792, "step": 33580 }, { "epoch": 1.7604821802935011, "grad_norm": 2.7970130443573, "learning_rate": 5.989255765199162e-06, "loss": 0.0904, "step": 33590 }, { "epoch": 1.7610062893081762, "grad_norm": 1.3964394330978394, "learning_rate": 5.976153039832285e-06, "loss": 0.0889, "step": 33600 }, { "epoch": 1.7615303983228512, "grad_norm": 0.5676695704460144, "learning_rate": 5.9630503144654096e-06, "loss": 0.0706, "step": 33610 }, { "epoch": 1.7620545073375262, "grad_norm": 1.2144248485565186, "learning_rate": 5.949947589098533e-06, "loss": 0.081, "step": 33620 }, { "epoch": 1.7625786163522013, "grad_norm": 1.4676384925842285, "learning_rate": 5.936844863731657e-06, "loss": 0.0731, "step": 33630 }, { "epoch": 1.7631027253668763, "grad_norm": 1.430888295173645, "learning_rate": 5.9237421383647806e-06, "loss": 0.0725, "step": 33640 }, { "epoch": 1.7636268343815513, "grad_norm": 2.2190182209014893, "learning_rate": 5.910639412997904e-06, "loss": 0.0925, "step": 33650 }, { "epoch": 1.7641509433962264, "grad_norm": 1.8190736770629883, "learning_rate": 5.897536687631028e-06, "loss": 0.0853, "step": 33660 }, { "epoch": 1.7646750524109014, "grad_norm": 1.6736314296722412, "learning_rate": 5.8844339622641516e-06, "loss": 0.067, "step": 33670 }, { "epoch": 1.7651991614255764, "grad_norm": 2.305516004562378, "learning_rate": 5.871331236897275e-06, "loss": 0.0833, "step": 33680 }, { "epoch": 1.7657232704402515, "grad_norm": 1.2158180475234985, "learning_rate": 5.858228511530399e-06, "loss": 0.0962, "step": 33690 }, { "epoch": 1.7662473794549265, "grad_norm": 0.7084839344024658, "learning_rate": 5.8451257861635226e-06, "loss": 0.083, "step": 33700 }, { "epoch": 1.7667714884696015, "grad_norm": 1.0288736820220947, "learning_rate": 5.832023060796646e-06, "loss": 0.0964, "step": 33710 }, { "epoch": 1.7672955974842768, "grad_norm": 1.8042439222335815, "learning_rate": 5.81892033542977e-06, "loss": 0.068, "step": 33720 }, { "epoch": 1.7678197064989518, "grad_norm": 1.5784512758255005, "learning_rate": 5.8058176100628936e-06, "loss": 0.0823, "step": 33730 }, { "epoch": 1.7683438155136268, "grad_norm": 2.227677822113037, "learning_rate": 5.792714884696017e-06, "loss": 0.0743, "step": 33740 }, { "epoch": 1.7688679245283019, "grad_norm": 0.7300745844841003, "learning_rate": 5.779612159329141e-06, "loss": 0.0675, "step": 33750 }, { "epoch": 1.769392033542977, "grad_norm": 2.08028244972229, "learning_rate": 5.7665094339622646e-06, "loss": 0.0863, "step": 33760 }, { "epoch": 1.769916142557652, "grad_norm": 1.086308240890503, "learning_rate": 5.753406708595388e-06, "loss": 0.0839, "step": 33770 }, { "epoch": 1.7704402515723272, "grad_norm": 0.8018303513526917, "learning_rate": 5.740303983228512e-06, "loss": 0.0626, "step": 33780 }, { "epoch": 1.7709643605870022, "grad_norm": 2.3211405277252197, "learning_rate": 5.7272012578616356e-06, "loss": 0.0845, "step": 33790 }, { "epoch": 1.7714884696016773, "grad_norm": 1.7425010204315186, "learning_rate": 5.714098532494759e-06, "loss": 0.0585, "step": 33800 }, { "epoch": 1.7720125786163523, "grad_norm": 0.9054259657859802, "learning_rate": 5.700995807127882e-06, "loss": 0.0753, "step": 33810 }, { "epoch": 1.7725366876310273, "grad_norm": 2.504011392593384, "learning_rate": 5.687893081761007e-06, "loss": 0.0862, "step": 33820 }, { "epoch": 1.7730607966457024, "grad_norm": 1.1530929803848267, "learning_rate": 5.67479035639413e-06, "loss": 0.0875, "step": 33830 }, { "epoch": 1.7735849056603774, "grad_norm": 1.3565245866775513, "learning_rate": 5.661687631027253e-06, "loss": 0.0625, "step": 33840 }, { "epoch": 1.7741090146750524, "grad_norm": 0.9649665355682373, "learning_rate": 5.648584905660378e-06, "loss": 0.0637, "step": 33850 }, { "epoch": 1.7746331236897275, "grad_norm": 2.0053367614746094, "learning_rate": 5.635482180293501e-06, "loss": 0.0853, "step": 33860 }, { "epoch": 1.7751572327044025, "grad_norm": 0.9277843832969666, "learning_rate": 5.622379454926624e-06, "loss": 0.0709, "step": 33870 }, { "epoch": 1.7756813417190775, "grad_norm": 1.4471749067306519, "learning_rate": 5.609276729559749e-06, "loss": 0.0639, "step": 33880 }, { "epoch": 1.7762054507337526, "grad_norm": 1.230859398841858, "learning_rate": 5.596174004192872e-06, "loss": 0.0806, "step": 33890 }, { "epoch": 1.7767295597484276, "grad_norm": 1.3443126678466797, "learning_rate": 5.583071278825995e-06, "loss": 0.0909, "step": 33900 }, { "epoch": 1.7772536687631026, "grad_norm": 1.3505141735076904, "learning_rate": 5.56996855345912e-06, "loss": 0.0836, "step": 33910 }, { "epoch": 1.7777777777777777, "grad_norm": 2.4082603454589844, "learning_rate": 5.556865828092243e-06, "loss": 0.1045, "step": 33920 }, { "epoch": 1.7783018867924527, "grad_norm": 0.7405912280082703, "learning_rate": 5.543763102725367e-06, "loss": 0.0627, "step": 33930 }, { "epoch": 1.7788259958071277, "grad_norm": 1.2767046689987183, "learning_rate": 5.530660377358491e-06, "loss": 0.1103, "step": 33940 }, { "epoch": 1.7793501048218028, "grad_norm": 0.9530086517333984, "learning_rate": 5.517557651991615e-06, "loss": 0.0908, "step": 33950 }, { "epoch": 1.779874213836478, "grad_norm": 1.1291282176971436, "learning_rate": 5.504454926624738e-06, "loss": 0.0711, "step": 33960 }, { "epoch": 1.780398322851153, "grad_norm": 1.126333475112915, "learning_rate": 5.4913522012578624e-06, "loss": 0.0604, "step": 33970 }, { "epoch": 1.780922431865828, "grad_norm": 1.3223693370819092, "learning_rate": 5.478249475890986e-06, "loss": 0.0724, "step": 33980 }, { "epoch": 1.7814465408805031, "grad_norm": 1.7418841123580933, "learning_rate": 5.465146750524109e-06, "loss": 0.0801, "step": 33990 }, { "epoch": 1.7819706498951782, "grad_norm": 0.8617030382156372, "learning_rate": 5.4520440251572334e-06, "loss": 0.07, "step": 34000 }, { "epoch": 1.7819706498951782, "eval_loss": 0.26762381196022034, "eval_runtime": 267.2636, "eval_samples_per_second": 7.45, "eval_steps_per_second": 1.242, "step": 34000 }, { "epoch": 1.7824947589098532, "grad_norm": 1.1304025650024414, "learning_rate": 5.438941299790357e-06, "loss": 0.0816, "step": 34010 }, { "epoch": 1.7830188679245285, "grad_norm": 1.0686378479003906, "learning_rate": 5.42583857442348e-06, "loss": 0.0719, "step": 34020 }, { "epoch": 1.7835429769392035, "grad_norm": 1.211351752281189, "learning_rate": 5.4127358490566045e-06, "loss": 0.0626, "step": 34030 }, { "epoch": 1.7840670859538785, "grad_norm": 1.4797831773757935, "learning_rate": 5.399633123689728e-06, "loss": 0.091, "step": 34040 }, { "epoch": 1.7845911949685536, "grad_norm": 1.6055877208709717, "learning_rate": 5.386530398322851e-06, "loss": 0.0821, "step": 34050 }, { "epoch": 1.7851153039832286, "grad_norm": 1.0258854627609253, "learning_rate": 5.3734276729559755e-06, "loss": 0.0681, "step": 34060 }, { "epoch": 1.7856394129979036, "grad_norm": 2.157062530517578, "learning_rate": 5.360324947589099e-06, "loss": 0.0851, "step": 34070 }, { "epoch": 1.7861635220125787, "grad_norm": 1.3092892169952393, "learning_rate": 5.347222222222222e-06, "loss": 0.0657, "step": 34080 }, { "epoch": 1.7866876310272537, "grad_norm": 1.9904484748840332, "learning_rate": 5.3341194968553465e-06, "loss": 0.0895, "step": 34090 }, { "epoch": 1.7872117400419287, "grad_norm": 1.3187782764434814, "learning_rate": 5.32101677148847e-06, "loss": 0.0639, "step": 34100 }, { "epoch": 1.7877358490566038, "grad_norm": 2.1087183952331543, "learning_rate": 5.307914046121593e-06, "loss": 0.0806, "step": 34110 }, { "epoch": 1.7882599580712788, "grad_norm": 1.045880913734436, "learning_rate": 5.2948113207547175e-06, "loss": 0.0628, "step": 34120 }, { "epoch": 1.7887840670859538, "grad_norm": 1.3817152976989746, "learning_rate": 5.281708595387841e-06, "loss": 0.0764, "step": 34130 }, { "epoch": 1.7893081761006289, "grad_norm": 3.6602184772491455, "learning_rate": 5.268605870020964e-06, "loss": 0.08, "step": 34140 }, { "epoch": 1.789832285115304, "grad_norm": 1.980810284614563, "learning_rate": 5.2555031446540885e-06, "loss": 0.0749, "step": 34150 }, { "epoch": 1.790356394129979, "grad_norm": 1.0341511964797974, "learning_rate": 5.242400419287212e-06, "loss": 0.0796, "step": 34160 }, { "epoch": 1.790880503144654, "grad_norm": 1.0620462894439697, "learning_rate": 5.229297693920335e-06, "loss": 0.0776, "step": 34170 }, { "epoch": 1.791404612159329, "grad_norm": 0.9034409523010254, "learning_rate": 5.2161949685534595e-06, "loss": 0.0947, "step": 34180 }, { "epoch": 1.791928721174004, "grad_norm": 1.7526142597198486, "learning_rate": 5.203092243186583e-06, "loss": 0.1004, "step": 34190 }, { "epoch": 1.7924528301886793, "grad_norm": 1.0870200395584106, "learning_rate": 5.189989517819706e-06, "loss": 0.0603, "step": 34200 }, { "epoch": 1.7929769392033543, "grad_norm": 1.2750418186187744, "learning_rate": 5.1768867924528305e-06, "loss": 0.0767, "step": 34210 }, { "epoch": 1.7935010482180294, "grad_norm": 1.42095947265625, "learning_rate": 5.163784067085954e-06, "loss": 0.0771, "step": 34220 }, { "epoch": 1.7940251572327044, "grad_norm": 1.6726841926574707, "learning_rate": 5.150681341719077e-06, "loss": 0.068, "step": 34230 }, { "epoch": 1.7945492662473794, "grad_norm": 1.2818461656570435, "learning_rate": 5.1375786163522015e-06, "loss": 0.0809, "step": 34240 }, { "epoch": 1.7950733752620545, "grad_norm": 1.0451738834381104, "learning_rate": 5.124475890985325e-06, "loss": 0.0714, "step": 34250 }, { "epoch": 1.7955974842767297, "grad_norm": 1.672234296798706, "learning_rate": 5.111373165618449e-06, "loss": 0.0773, "step": 34260 }, { "epoch": 1.7961215932914047, "grad_norm": 3.063178300857544, "learning_rate": 5.0982704402515725e-06, "loss": 0.0802, "step": 34270 }, { "epoch": 1.7966457023060798, "grad_norm": 1.3467538356781006, "learning_rate": 5.085167714884697e-06, "loss": 0.0771, "step": 34280 }, { "epoch": 1.7971698113207548, "grad_norm": 1.362091302871704, "learning_rate": 5.07206498951782e-06, "loss": 0.0961, "step": 34290 }, { "epoch": 1.7976939203354299, "grad_norm": 0.9203101396560669, "learning_rate": 5.0589622641509435e-06, "loss": 0.0744, "step": 34300 }, { "epoch": 1.7982180293501049, "grad_norm": 2.0424604415893555, "learning_rate": 5.045859538784068e-06, "loss": 0.0767, "step": 34310 }, { "epoch": 1.79874213836478, "grad_norm": 1.1854127645492554, "learning_rate": 5.032756813417191e-06, "loss": 0.0632, "step": 34320 }, { "epoch": 1.799266247379455, "grad_norm": 0.7411090731620789, "learning_rate": 5.019654088050315e-06, "loss": 0.0641, "step": 34330 }, { "epoch": 1.79979035639413, "grad_norm": 2.001063823699951, "learning_rate": 5.006551362683439e-06, "loss": 0.0785, "step": 34340 }, { "epoch": 1.800314465408805, "grad_norm": 1.6688907146453857, "learning_rate": 4.993448637316562e-06, "loss": 0.0846, "step": 34350 }, { "epoch": 1.80083857442348, "grad_norm": 1.111572265625, "learning_rate": 4.980345911949686e-06, "loss": 0.0814, "step": 34360 }, { "epoch": 1.801362683438155, "grad_norm": 0.7180928587913513, "learning_rate": 4.96724318658281e-06, "loss": 0.0586, "step": 34370 }, { "epoch": 1.8018867924528301, "grad_norm": 0.7584537863731384, "learning_rate": 4.954140461215933e-06, "loss": 0.0548, "step": 34380 }, { "epoch": 1.8024109014675052, "grad_norm": 1.2297812700271606, "learning_rate": 4.941037735849057e-06, "loss": 0.0519, "step": 34390 }, { "epoch": 1.8029350104821802, "grad_norm": 1.7759953737258911, "learning_rate": 4.927935010482181e-06, "loss": 0.0741, "step": 34400 }, { "epoch": 1.8034591194968552, "grad_norm": 1.6569321155548096, "learning_rate": 4.914832285115304e-06, "loss": 0.0868, "step": 34410 }, { "epoch": 1.8039832285115303, "grad_norm": 2.512075424194336, "learning_rate": 4.901729559748428e-06, "loss": 0.0547, "step": 34420 }, { "epoch": 1.8045073375262053, "grad_norm": 0.7744259834289551, "learning_rate": 4.888626834381552e-06, "loss": 0.0586, "step": 34430 }, { "epoch": 1.8050314465408805, "grad_norm": 1.7868129014968872, "learning_rate": 4.875524109014675e-06, "loss": 0.0717, "step": 34440 }, { "epoch": 1.8055555555555556, "grad_norm": 1.1555416584014893, "learning_rate": 4.862421383647799e-06, "loss": 0.0676, "step": 34450 }, { "epoch": 1.8060796645702306, "grad_norm": 2.247636079788208, "learning_rate": 4.849318658280923e-06, "loss": 0.0798, "step": 34460 }, { "epoch": 1.8066037735849056, "grad_norm": 1.6758849620819092, "learning_rate": 4.836215932914046e-06, "loss": 0.0498, "step": 34470 }, { "epoch": 1.8071278825995807, "grad_norm": 0.8932026624679565, "learning_rate": 4.82311320754717e-06, "loss": 0.0837, "step": 34480 }, { "epoch": 1.8076519916142557, "grad_norm": 1.1528186798095703, "learning_rate": 4.810010482180294e-06, "loss": 0.1109, "step": 34490 }, { "epoch": 1.808176100628931, "grad_norm": 1.6238465309143066, "learning_rate": 4.796907756813417e-06, "loss": 0.065, "step": 34500 }, { "epoch": 1.808700209643606, "grad_norm": 0.6329945921897888, "learning_rate": 4.783805031446541e-06, "loss": 0.0633, "step": 34510 }, { "epoch": 1.809224318658281, "grad_norm": 1.354620099067688, "learning_rate": 4.770702306079665e-06, "loss": 0.0651, "step": 34520 }, { "epoch": 1.809748427672956, "grad_norm": 2.1168293952941895, "learning_rate": 4.757599580712788e-06, "loss": 0.0778, "step": 34530 }, { "epoch": 1.810272536687631, "grad_norm": 0.7592911720275879, "learning_rate": 4.744496855345912e-06, "loss": 0.0913, "step": 34540 }, { "epoch": 1.8107966457023061, "grad_norm": 1.3963457345962524, "learning_rate": 4.731394129979036e-06, "loss": 0.1037, "step": 34550 }, { "epoch": 1.8113207547169812, "grad_norm": 1.0729615688323975, "learning_rate": 4.718291404612159e-06, "loss": 0.0816, "step": 34560 }, { "epoch": 1.8118448637316562, "grad_norm": 2.7409613132476807, "learning_rate": 4.705188679245283e-06, "loss": 0.0704, "step": 34570 }, { "epoch": 1.8123689727463312, "grad_norm": 1.1228454113006592, "learning_rate": 4.692085953878407e-06, "loss": 0.0866, "step": 34580 }, { "epoch": 1.8128930817610063, "grad_norm": 0.7433912754058838, "learning_rate": 4.67898322851153e-06, "loss": 0.0466, "step": 34590 }, { "epoch": 1.8134171907756813, "grad_norm": 1.5128268003463745, "learning_rate": 4.665880503144654e-06, "loss": 0.0657, "step": 34600 }, { "epoch": 1.8139412997903563, "grad_norm": 0.7765136361122131, "learning_rate": 4.652777777777778e-06, "loss": 0.1025, "step": 34610 }, { "epoch": 1.8144654088050314, "grad_norm": 0.8656555414199829, "learning_rate": 4.639675052410902e-06, "loss": 0.082, "step": 34620 }, { "epoch": 1.8149895178197064, "grad_norm": 0.8403626084327698, "learning_rate": 4.626572327044025e-06, "loss": 0.0711, "step": 34630 }, { "epoch": 1.8155136268343814, "grad_norm": 1.6570936441421509, "learning_rate": 4.61346960167715e-06, "loss": 0.059, "step": 34640 }, { "epoch": 1.8160377358490565, "grad_norm": 1.7300814390182495, "learning_rate": 4.600366876310273e-06, "loss": 0.0747, "step": 34650 }, { "epoch": 1.8165618448637315, "grad_norm": 1.816053032875061, "learning_rate": 4.587264150943396e-06, "loss": 0.0811, "step": 34660 }, { "epoch": 1.8170859538784065, "grad_norm": 1.2429102659225464, "learning_rate": 4.574161425576521e-06, "loss": 0.096, "step": 34670 }, { "epoch": 1.8176100628930818, "grad_norm": 1.878833293914795, "learning_rate": 4.561058700209644e-06, "loss": 0.0667, "step": 34680 }, { "epoch": 1.8181341719077568, "grad_norm": 1.124155879020691, "learning_rate": 4.547955974842767e-06, "loss": 0.073, "step": 34690 }, { "epoch": 1.8186582809224319, "grad_norm": 0.4943144917488098, "learning_rate": 4.534853249475892e-06, "loss": 0.0642, "step": 34700 }, { "epoch": 1.819182389937107, "grad_norm": 2.3582496643066406, "learning_rate": 4.521750524109015e-06, "loss": 0.0892, "step": 34710 }, { "epoch": 1.819706498951782, "grad_norm": 1.2020031213760376, "learning_rate": 4.508647798742138e-06, "loss": 0.0636, "step": 34720 }, { "epoch": 1.820230607966457, "grad_norm": 1.8882160186767578, "learning_rate": 4.495545073375263e-06, "loss": 0.0623, "step": 34730 }, { "epoch": 1.8207547169811322, "grad_norm": 1.6447032690048218, "learning_rate": 4.482442348008386e-06, "loss": 0.094, "step": 34740 }, { "epoch": 1.8212788259958073, "grad_norm": 3.1155807971954346, "learning_rate": 4.469339622641509e-06, "loss": 0.116, "step": 34750 }, { "epoch": 1.8218029350104823, "grad_norm": 1.1563801765441895, "learning_rate": 4.456236897274634e-06, "loss": 0.0752, "step": 34760 }, { "epoch": 1.8223270440251573, "grad_norm": 0.9092895984649658, "learning_rate": 4.443134171907757e-06, "loss": 0.0844, "step": 34770 }, { "epoch": 1.8228511530398324, "grad_norm": 0.7180648446083069, "learning_rate": 4.43003144654088e-06, "loss": 0.0711, "step": 34780 }, { "epoch": 1.8233752620545074, "grad_norm": 0.9739916920661926, "learning_rate": 4.416928721174005e-06, "loss": 0.0666, "step": 34790 }, { "epoch": 1.8238993710691824, "grad_norm": 1.4060016870498657, "learning_rate": 4.403825995807128e-06, "loss": 0.0791, "step": 34800 }, { "epoch": 1.8244234800838575, "grad_norm": 1.233502984046936, "learning_rate": 4.390723270440251e-06, "loss": 0.0613, "step": 34810 }, { "epoch": 1.8249475890985325, "grad_norm": 2.0533065795898438, "learning_rate": 4.377620545073376e-06, "loss": 0.0712, "step": 34820 }, { "epoch": 1.8254716981132075, "grad_norm": 2.9354019165039062, "learning_rate": 4.364517819706499e-06, "loss": 0.0782, "step": 34830 }, { "epoch": 1.8259958071278826, "grad_norm": 1.3234208822250366, "learning_rate": 4.351415094339622e-06, "loss": 0.0762, "step": 34840 }, { "epoch": 1.8265199161425576, "grad_norm": 1.6237906217575073, "learning_rate": 4.338312368972747e-06, "loss": 0.0761, "step": 34850 }, { "epoch": 1.8270440251572326, "grad_norm": 1.6208617687225342, "learning_rate": 4.32520964360587e-06, "loss": 0.0788, "step": 34860 }, { "epoch": 1.8275681341719077, "grad_norm": 1.607041597366333, "learning_rate": 4.312106918238993e-06, "loss": 0.072, "step": 34870 }, { "epoch": 1.8280922431865827, "grad_norm": 2.776214599609375, "learning_rate": 4.299004192872118e-06, "loss": 0.0779, "step": 34880 }, { "epoch": 1.8286163522012577, "grad_norm": 1.4961811304092407, "learning_rate": 4.285901467505241e-06, "loss": 0.1001, "step": 34890 }, { "epoch": 1.8291404612159328, "grad_norm": 1.2335947751998901, "learning_rate": 4.272798742138364e-06, "loss": 0.0771, "step": 34900 }, { "epoch": 1.8296645702306078, "grad_norm": 1.5569696426391602, "learning_rate": 4.259696016771489e-06, "loss": 0.078, "step": 34910 }, { "epoch": 1.830188679245283, "grad_norm": 2.003178596496582, "learning_rate": 4.246593291404612e-06, "loss": 0.0807, "step": 34920 }, { "epoch": 1.830712788259958, "grad_norm": 1.8837343454360962, "learning_rate": 4.233490566037735e-06, "loss": 0.0776, "step": 34930 }, { "epoch": 1.8312368972746331, "grad_norm": 1.3460261821746826, "learning_rate": 4.22038784067086e-06, "loss": 0.0827, "step": 34940 }, { "epoch": 1.8317610062893082, "grad_norm": 1.528942584991455, "learning_rate": 4.207285115303983e-06, "loss": 0.0697, "step": 34950 }, { "epoch": 1.8322851153039832, "grad_norm": 1.4546736478805542, "learning_rate": 4.194182389937107e-06, "loss": 0.0843, "step": 34960 }, { "epoch": 1.8328092243186582, "grad_norm": 1.3445278406143188, "learning_rate": 4.181079664570231e-06, "loss": 0.0685, "step": 34970 }, { "epoch": 1.8333333333333335, "grad_norm": 1.9412113428115845, "learning_rate": 4.167976939203355e-06, "loss": 0.0894, "step": 34980 }, { "epoch": 1.8338574423480085, "grad_norm": 2.7959461212158203, "learning_rate": 4.154874213836478e-06, "loss": 0.0892, "step": 34990 }, { "epoch": 1.8343815513626835, "grad_norm": 0.8164100646972656, "learning_rate": 4.1417714884696025e-06, "loss": 0.0567, "step": 35000 }, { "epoch": 1.8343815513626835, "eval_loss": 0.26678207516670227, "eval_runtime": 266.7648, "eval_samples_per_second": 7.464, "eval_steps_per_second": 1.245, "step": 35000 }, { "epoch": 1.8349056603773586, "grad_norm": 2.1443021297454834, "learning_rate": 4.128668763102726e-06, "loss": 0.0722, "step": 35010 }, { "epoch": 1.8354297693920336, "grad_norm": 2.0307836532592773, "learning_rate": 4.115566037735849e-06, "loss": 0.0967, "step": 35020 }, { "epoch": 1.8359538784067087, "grad_norm": 1.3431941270828247, "learning_rate": 4.1024633123689735e-06, "loss": 0.0752, "step": 35030 }, { "epoch": 1.8364779874213837, "grad_norm": 1.4859864711761475, "learning_rate": 4.089360587002097e-06, "loss": 0.0913, "step": 35040 }, { "epoch": 1.8370020964360587, "grad_norm": 0.9566323161125183, "learning_rate": 4.07625786163522e-06, "loss": 0.07, "step": 35050 }, { "epoch": 1.8375262054507338, "grad_norm": 1.1632527112960815, "learning_rate": 4.0631551362683445e-06, "loss": 0.0664, "step": 35060 }, { "epoch": 1.8380503144654088, "grad_norm": 1.5765260457992554, "learning_rate": 4.050052410901468e-06, "loss": 0.0641, "step": 35070 }, { "epoch": 1.8385744234800838, "grad_norm": 1.8369210958480835, "learning_rate": 4.036949685534591e-06, "loss": 0.061, "step": 35080 }, { "epoch": 1.8390985324947589, "grad_norm": 1.4045135974884033, "learning_rate": 4.0238469601677155e-06, "loss": 0.0725, "step": 35090 }, { "epoch": 1.8396226415094339, "grad_norm": 1.1947506666183472, "learning_rate": 4.010744234800839e-06, "loss": 0.0726, "step": 35100 }, { "epoch": 1.840146750524109, "grad_norm": 1.765573501586914, "learning_rate": 3.997641509433962e-06, "loss": 0.0864, "step": 35110 }, { "epoch": 1.840670859538784, "grad_norm": 1.8163455724716187, "learning_rate": 3.9845387840670865e-06, "loss": 0.0513, "step": 35120 }, { "epoch": 1.841194968553459, "grad_norm": 1.3820146322250366, "learning_rate": 3.97143605870021e-06, "loss": 0.0593, "step": 35130 }, { "epoch": 1.841719077568134, "grad_norm": 1.7238701581954956, "learning_rate": 3.958333333333333e-06, "loss": 0.0754, "step": 35140 }, { "epoch": 1.8422431865828093, "grad_norm": 2.048316240310669, "learning_rate": 3.9452306079664575e-06, "loss": 0.0596, "step": 35150 }, { "epoch": 1.8427672955974843, "grad_norm": 1.101599097251892, "learning_rate": 3.932127882599581e-06, "loss": 0.0716, "step": 35160 }, { "epoch": 1.8432914046121593, "grad_norm": 1.935444712638855, "learning_rate": 3.919025157232704e-06, "loss": 0.0899, "step": 35170 }, { "epoch": 1.8438155136268344, "grad_norm": 0.8457840085029602, "learning_rate": 3.9059224318658285e-06, "loss": 0.0795, "step": 35180 }, { "epoch": 1.8443396226415094, "grad_norm": 1.2010316848754883, "learning_rate": 3.892819706498952e-06, "loss": 0.0894, "step": 35190 }, { "epoch": 1.8448637316561844, "grad_norm": 1.988616943359375, "learning_rate": 3.879716981132075e-06, "loss": 0.0726, "step": 35200 }, { "epoch": 1.8453878406708597, "grad_norm": 1.7617828845977783, "learning_rate": 3.8666142557651995e-06, "loss": 0.104, "step": 35210 }, { "epoch": 1.8459119496855347, "grad_norm": 1.2007869482040405, "learning_rate": 3.853511530398323e-06, "loss": 0.0668, "step": 35220 }, { "epoch": 1.8464360587002098, "grad_norm": 1.926151156425476, "learning_rate": 3.840408805031446e-06, "loss": 0.079, "step": 35230 }, { "epoch": 1.8469601677148848, "grad_norm": 1.7576385736465454, "learning_rate": 3.8273060796645705e-06, "loss": 0.0704, "step": 35240 }, { "epoch": 1.8474842767295598, "grad_norm": 1.9205641746520996, "learning_rate": 3.8142033542976943e-06, "loss": 0.0807, "step": 35250 }, { "epoch": 1.8480083857442349, "grad_norm": 1.8392775058746338, "learning_rate": 3.8011006289308177e-06, "loss": 0.0826, "step": 35260 }, { "epoch": 1.84853249475891, "grad_norm": 1.0592628717422485, "learning_rate": 3.787997903563942e-06, "loss": 0.0955, "step": 35270 }, { "epoch": 1.849056603773585, "grad_norm": 1.6646348237991333, "learning_rate": 3.7748951781970653e-06, "loss": 0.0465, "step": 35280 }, { "epoch": 1.84958071278826, "grad_norm": 1.4347747564315796, "learning_rate": 3.7617924528301887e-06, "loss": 0.0815, "step": 35290 }, { "epoch": 1.850104821802935, "grad_norm": 2.0240726470947266, "learning_rate": 3.748689727463313e-06, "loss": 0.0875, "step": 35300 }, { "epoch": 1.85062893081761, "grad_norm": 2.382476568222046, "learning_rate": 3.7355870020964363e-06, "loss": 0.0775, "step": 35310 }, { "epoch": 1.851153039832285, "grad_norm": 0.916605532169342, "learning_rate": 3.7224842767295597e-06, "loss": 0.0718, "step": 35320 }, { "epoch": 1.85167714884696, "grad_norm": 1.1663122177124023, "learning_rate": 3.709381551362684e-06, "loss": 0.0927, "step": 35330 }, { "epoch": 1.8522012578616351, "grad_norm": 1.4649229049682617, "learning_rate": 3.6962788259958073e-06, "loss": 0.0861, "step": 35340 }, { "epoch": 1.8527253668763102, "grad_norm": 1.1901835203170776, "learning_rate": 3.6831761006289307e-06, "loss": 0.0751, "step": 35350 }, { "epoch": 1.8532494758909852, "grad_norm": 2.1648807525634766, "learning_rate": 3.670073375262055e-06, "loss": 0.0753, "step": 35360 }, { "epoch": 1.8537735849056602, "grad_norm": 2.470012664794922, "learning_rate": 3.6569706498951783e-06, "loss": 0.074, "step": 35370 }, { "epoch": 1.8542976939203353, "grad_norm": 1.3808513879776, "learning_rate": 3.643867924528302e-06, "loss": 0.0745, "step": 35380 }, { "epoch": 1.8548218029350105, "grad_norm": 1.5830243825912476, "learning_rate": 3.6307651991614255e-06, "loss": 0.0838, "step": 35390 }, { "epoch": 1.8553459119496856, "grad_norm": 1.371849775314331, "learning_rate": 3.6176624737945498e-06, "loss": 0.0821, "step": 35400 }, { "epoch": 1.8558700209643606, "grad_norm": 1.2144567966461182, "learning_rate": 3.604559748427673e-06, "loss": 0.0717, "step": 35410 }, { "epoch": 1.8563941299790356, "grad_norm": 1.3262728452682495, "learning_rate": 3.5914570230607965e-06, "loss": 0.0682, "step": 35420 }, { "epoch": 1.8569182389937107, "grad_norm": 1.0475879907608032, "learning_rate": 3.5783542976939208e-06, "loss": 0.0571, "step": 35430 }, { "epoch": 1.8574423480083857, "grad_norm": 1.328346848487854, "learning_rate": 3.565251572327044e-06, "loss": 0.071, "step": 35440 }, { "epoch": 1.857966457023061, "grad_norm": 1.3281270265579224, "learning_rate": 3.5521488469601675e-06, "loss": 0.076, "step": 35450 }, { "epoch": 1.858490566037736, "grad_norm": 1.2660882472991943, "learning_rate": 3.5390461215932918e-06, "loss": 0.0712, "step": 35460 }, { "epoch": 1.859014675052411, "grad_norm": 1.620882272720337, "learning_rate": 3.525943396226415e-06, "loss": 0.0778, "step": 35470 }, { "epoch": 1.859538784067086, "grad_norm": 0.6216922402381897, "learning_rate": 3.5128406708595385e-06, "loss": 0.0676, "step": 35480 }, { "epoch": 1.860062893081761, "grad_norm": 1.293033242225647, "learning_rate": 3.4997379454926628e-06, "loss": 0.0877, "step": 35490 }, { "epoch": 1.8605870020964361, "grad_norm": 2.2693710327148438, "learning_rate": 3.486635220125786e-06, "loss": 0.0581, "step": 35500 }, { "epoch": 1.8611111111111112, "grad_norm": 0.49999281764030457, "learning_rate": 3.4735324947589095e-06, "loss": 0.0728, "step": 35510 }, { "epoch": 1.8616352201257862, "grad_norm": 1.4117684364318848, "learning_rate": 3.4604297693920338e-06, "loss": 0.0569, "step": 35520 }, { "epoch": 1.8621593291404612, "grad_norm": 1.4921469688415527, "learning_rate": 3.447327044025157e-06, "loss": 0.0561, "step": 35530 }, { "epoch": 1.8626834381551363, "grad_norm": 2.2984938621520996, "learning_rate": 3.434224318658281e-06, "loss": 0.0652, "step": 35540 }, { "epoch": 1.8632075471698113, "grad_norm": 1.0348659753799438, "learning_rate": 3.4211215932914048e-06, "loss": 0.0534, "step": 35550 }, { "epoch": 1.8637316561844863, "grad_norm": 2.2559142112731934, "learning_rate": 3.4080188679245286e-06, "loss": 0.0671, "step": 35560 }, { "epoch": 1.8642557651991614, "grad_norm": 1.69614839553833, "learning_rate": 3.394916142557652e-06, "loss": 0.083, "step": 35570 }, { "epoch": 1.8647798742138364, "grad_norm": 3.1688826084136963, "learning_rate": 3.381813417190776e-06, "loss": 0.0827, "step": 35580 }, { "epoch": 1.8653039832285114, "grad_norm": 0.9188790321350098, "learning_rate": 3.3687106918238996e-06, "loss": 0.0764, "step": 35590 }, { "epoch": 1.8658280922431865, "grad_norm": 1.3099772930145264, "learning_rate": 3.355607966457023e-06, "loss": 0.0641, "step": 35600 }, { "epoch": 1.8663522012578615, "grad_norm": 1.3244524002075195, "learning_rate": 3.342505241090147e-06, "loss": 0.0829, "step": 35610 }, { "epoch": 1.8668763102725365, "grad_norm": 1.8879936933517456, "learning_rate": 3.3294025157232706e-06, "loss": 0.0788, "step": 35620 }, { "epoch": 1.8674004192872118, "grad_norm": 2.605762481689453, "learning_rate": 3.316299790356394e-06, "loss": 0.1098, "step": 35630 }, { "epoch": 1.8679245283018868, "grad_norm": 1.6959515810012817, "learning_rate": 3.303197064989518e-06, "loss": 0.0705, "step": 35640 }, { "epoch": 1.8684486373165619, "grad_norm": 1.204658031463623, "learning_rate": 3.2900943396226416e-06, "loss": 0.0673, "step": 35650 }, { "epoch": 1.868972746331237, "grad_norm": 1.229408860206604, "learning_rate": 3.276991614255765e-06, "loss": 0.0807, "step": 35660 }, { "epoch": 1.869496855345912, "grad_norm": 1.9810676574707031, "learning_rate": 3.2638888888888892e-06, "loss": 0.0809, "step": 35670 }, { "epoch": 1.870020964360587, "grad_norm": 1.1754933595657349, "learning_rate": 3.2507861635220126e-06, "loss": 0.0774, "step": 35680 }, { "epoch": 1.8705450733752622, "grad_norm": 0.6706782579421997, "learning_rate": 3.237683438155136e-06, "loss": 0.0621, "step": 35690 }, { "epoch": 1.8710691823899372, "grad_norm": 2.3637070655822754, "learning_rate": 3.2245807127882602e-06, "loss": 0.0837, "step": 35700 }, { "epoch": 1.8715932914046123, "grad_norm": 1.4699527025222778, "learning_rate": 3.2114779874213836e-06, "loss": 0.0628, "step": 35710 }, { "epoch": 1.8721174004192873, "grad_norm": 1.398445963859558, "learning_rate": 3.1983752620545074e-06, "loss": 0.071, "step": 35720 }, { "epoch": 1.8726415094339623, "grad_norm": 1.279449462890625, "learning_rate": 3.1852725366876312e-06, "loss": 0.0708, "step": 35730 }, { "epoch": 1.8731656184486374, "grad_norm": 1.7630802392959595, "learning_rate": 3.172169811320755e-06, "loss": 0.1008, "step": 35740 }, { "epoch": 1.8736897274633124, "grad_norm": 4.057922840118408, "learning_rate": 3.1590670859538784e-06, "loss": 0.101, "step": 35750 }, { "epoch": 1.8742138364779874, "grad_norm": 2.732259750366211, "learning_rate": 3.1459643605870026e-06, "loss": 0.0978, "step": 35760 }, { "epoch": 1.8747379454926625, "grad_norm": 1.7538576126098633, "learning_rate": 3.132861635220126e-06, "loss": 0.1018, "step": 35770 }, { "epoch": 1.8752620545073375, "grad_norm": 2.2278151512145996, "learning_rate": 3.11975890985325e-06, "loss": 0.0765, "step": 35780 }, { "epoch": 1.8757861635220126, "grad_norm": 2.4491899013519287, "learning_rate": 3.1066561844863732e-06, "loss": 0.0764, "step": 35790 }, { "epoch": 1.8763102725366876, "grad_norm": 3.7896568775177, "learning_rate": 3.093553459119497e-06, "loss": 0.0558, "step": 35800 }, { "epoch": 1.8768343815513626, "grad_norm": 2.0129201412200928, "learning_rate": 3.080450733752621e-06, "loss": 0.0875, "step": 35810 }, { "epoch": 1.8773584905660377, "grad_norm": 0.8520331978797913, "learning_rate": 3.0673480083857442e-06, "loss": 0.0805, "step": 35820 }, { "epoch": 1.8778825995807127, "grad_norm": 1.552331566810608, "learning_rate": 3.054245283018868e-06, "loss": 0.0633, "step": 35830 }, { "epoch": 1.8784067085953877, "grad_norm": 1.4037585258483887, "learning_rate": 3.041142557651992e-06, "loss": 0.0824, "step": 35840 }, { "epoch": 1.8789308176100628, "grad_norm": 1.717598795890808, "learning_rate": 3.0280398322851152e-06, "loss": 0.0905, "step": 35850 }, { "epoch": 1.8794549266247378, "grad_norm": 1.6586337089538574, "learning_rate": 3.014937106918239e-06, "loss": 0.0653, "step": 35860 }, { "epoch": 1.879979035639413, "grad_norm": 1.139183759689331, "learning_rate": 3.001834381551363e-06, "loss": 0.0795, "step": 35870 }, { "epoch": 1.880503144654088, "grad_norm": 1.106190800666809, "learning_rate": 2.9887316561844862e-06, "loss": 0.0605, "step": 35880 }, { "epoch": 1.881027253668763, "grad_norm": 0.8567935228347778, "learning_rate": 2.97562893081761e-06, "loss": 0.071, "step": 35890 }, { "epoch": 1.8815513626834381, "grad_norm": 1.2470753192901611, "learning_rate": 2.962526205450734e-06, "loss": 0.099, "step": 35900 }, { "epoch": 1.8820754716981132, "grad_norm": 1.275889277458191, "learning_rate": 2.9494234800838577e-06, "loss": 0.0688, "step": 35910 }, { "epoch": 1.8825995807127882, "grad_norm": 1.2817095518112183, "learning_rate": 2.9363207547169815e-06, "loss": 0.069, "step": 35920 }, { "epoch": 1.8831236897274635, "grad_norm": 1.6084591150283813, "learning_rate": 2.9232180293501053e-06, "loss": 0.0632, "step": 35930 }, { "epoch": 1.8836477987421385, "grad_norm": 2.0670547485351562, "learning_rate": 2.9101153039832287e-06, "loss": 0.0693, "step": 35940 }, { "epoch": 1.8841719077568135, "grad_norm": 1.5240498781204224, "learning_rate": 2.8970125786163525e-06, "loss": 0.0782, "step": 35950 }, { "epoch": 1.8846960167714886, "grad_norm": 1.8212049007415771, "learning_rate": 2.8839098532494763e-06, "loss": 0.0494, "step": 35960 }, { "epoch": 1.8852201257861636, "grad_norm": 1.2207962274551392, "learning_rate": 2.8708071278825997e-06, "loss": 0.0916, "step": 35970 }, { "epoch": 1.8857442348008386, "grad_norm": 1.3317270278930664, "learning_rate": 2.8577044025157235e-06, "loss": 0.0724, "step": 35980 }, { "epoch": 1.8862683438155137, "grad_norm": 0.759463906288147, "learning_rate": 2.844601677148847e-06, "loss": 0.0547, "step": 35990 }, { "epoch": 1.8867924528301887, "grad_norm": 1.6556220054626465, "learning_rate": 2.8314989517819707e-06, "loss": 0.067, "step": 36000 }, { "epoch": 1.8867924528301887, "eval_loss": 0.2678377032279968, "eval_runtime": 268.3326, "eval_samples_per_second": 7.42, "eval_steps_per_second": 1.237, "step": 36000 }, { "epoch": 1.8873165618448637, "grad_norm": 2.3549275398254395, "learning_rate": 2.8183962264150945e-06, "loss": 0.0642, "step": 36010 }, { "epoch": 1.8878406708595388, "grad_norm": 2.4621994495391846, "learning_rate": 2.805293501048218e-06, "loss": 0.1005, "step": 36020 }, { "epoch": 1.8883647798742138, "grad_norm": 2.111743211746216, "learning_rate": 2.7921907756813417e-06, "loss": 0.0768, "step": 36030 }, { "epoch": 1.8888888888888888, "grad_norm": 1.8729252815246582, "learning_rate": 2.7790880503144655e-06, "loss": 0.0833, "step": 36040 }, { "epoch": 1.8894129979035639, "grad_norm": 1.3784617185592651, "learning_rate": 2.765985324947589e-06, "loss": 0.0807, "step": 36050 }, { "epoch": 1.889937106918239, "grad_norm": 1.35466730594635, "learning_rate": 2.7528825995807127e-06, "loss": 0.078, "step": 36060 }, { "epoch": 1.890461215932914, "grad_norm": 0.8412442207336426, "learning_rate": 2.7397798742138365e-06, "loss": 0.0695, "step": 36070 }, { "epoch": 1.890985324947589, "grad_norm": 1.1234647035598755, "learning_rate": 2.7266771488469603e-06, "loss": 0.0729, "step": 36080 }, { "epoch": 1.891509433962264, "grad_norm": 2.3850042819976807, "learning_rate": 2.713574423480084e-06, "loss": 0.0747, "step": 36090 }, { "epoch": 1.892033542976939, "grad_norm": 1.2947684526443481, "learning_rate": 2.700471698113208e-06, "loss": 0.0747, "step": 36100 }, { "epoch": 1.8925576519916143, "grad_norm": 1.541504144668579, "learning_rate": 2.6873689727463313e-06, "loss": 0.0769, "step": 36110 }, { "epoch": 1.8930817610062893, "grad_norm": 1.178537130355835, "learning_rate": 2.674266247379455e-06, "loss": 0.0664, "step": 36120 }, { "epoch": 1.8936058700209644, "grad_norm": 1.822480320930481, "learning_rate": 2.661163522012579e-06, "loss": 0.0994, "step": 36130 }, { "epoch": 1.8941299790356394, "grad_norm": 1.35443115234375, "learning_rate": 2.6480607966457023e-06, "loss": 0.0581, "step": 36140 }, { "epoch": 1.8946540880503144, "grad_norm": 1.701316237449646, "learning_rate": 2.634958071278826e-06, "loss": 0.0724, "step": 36150 }, { "epoch": 1.8951781970649895, "grad_norm": 1.102696418762207, "learning_rate": 2.62185534591195e-06, "loss": 0.0811, "step": 36160 }, { "epoch": 1.8957023060796647, "grad_norm": 1.3935344219207764, "learning_rate": 2.6087526205450733e-06, "loss": 0.1073, "step": 36170 }, { "epoch": 1.8962264150943398, "grad_norm": 1.8276008367538452, "learning_rate": 2.595649895178197e-06, "loss": 0.0756, "step": 36180 }, { "epoch": 1.8967505241090148, "grad_norm": 1.2710710763931274, "learning_rate": 2.582547169811321e-06, "loss": 0.0697, "step": 36190 }, { "epoch": 1.8972746331236898, "grad_norm": 1.8811373710632324, "learning_rate": 2.5694444444444443e-06, "loss": 0.0652, "step": 36200 }, { "epoch": 1.8977987421383649, "grad_norm": 3.026477575302124, "learning_rate": 2.556341719077568e-06, "loss": 0.0877, "step": 36210 }, { "epoch": 1.89832285115304, "grad_norm": 1.3149288892745972, "learning_rate": 2.543238993710692e-06, "loss": 0.0545, "step": 36220 }, { "epoch": 1.898846960167715, "grad_norm": 1.6216425895690918, "learning_rate": 2.5301362683438157e-06, "loss": 0.0902, "step": 36230 }, { "epoch": 1.89937106918239, "grad_norm": 0.9983451962471008, "learning_rate": 2.5170335429769395e-06, "loss": 0.0807, "step": 36240 }, { "epoch": 1.899895178197065, "grad_norm": 1.5166202783584595, "learning_rate": 2.5039308176100634e-06, "loss": 0.0674, "step": 36250 }, { "epoch": 1.90041928721174, "grad_norm": 0.7801737189292908, "learning_rate": 2.4908280922431867e-06, "loss": 0.0646, "step": 36260 }, { "epoch": 1.900943396226415, "grad_norm": 2.308824062347412, "learning_rate": 2.4777253668763106e-06, "loss": 0.0764, "step": 36270 }, { "epoch": 1.90146750524109, "grad_norm": 2.3885467052459717, "learning_rate": 2.4646226415094344e-06, "loss": 0.0765, "step": 36280 }, { "epoch": 1.9019916142557651, "grad_norm": 1.0932997465133667, "learning_rate": 2.4515199161425577e-06, "loss": 0.0733, "step": 36290 }, { "epoch": 1.9025157232704402, "grad_norm": 2.306108236312866, "learning_rate": 2.4384171907756816e-06, "loss": 0.0751, "step": 36300 }, { "epoch": 1.9030398322851152, "grad_norm": 1.6138864755630493, "learning_rate": 2.4253144654088054e-06, "loss": 0.0709, "step": 36310 }, { "epoch": 1.9035639412997902, "grad_norm": 1.8217240571975708, "learning_rate": 2.4122117400419288e-06, "loss": 0.0731, "step": 36320 }, { "epoch": 1.9040880503144653, "grad_norm": 1.2665067911148071, "learning_rate": 2.3991090146750526e-06, "loss": 0.0814, "step": 36330 }, { "epoch": 1.9046121593291403, "grad_norm": 1.3618237972259521, "learning_rate": 2.3860062893081764e-06, "loss": 0.0744, "step": 36340 }, { "epoch": 1.9051362683438156, "grad_norm": 1.7993749380111694, "learning_rate": 2.3729035639412998e-06, "loss": 0.0607, "step": 36350 }, { "epoch": 1.9056603773584906, "grad_norm": 0.970848560333252, "learning_rate": 2.3598008385744236e-06, "loss": 0.0651, "step": 36360 }, { "epoch": 1.9061844863731656, "grad_norm": 1.9030567407608032, "learning_rate": 2.3466981132075474e-06, "loss": 0.0804, "step": 36370 }, { "epoch": 1.9067085953878407, "grad_norm": 1.25066077709198, "learning_rate": 2.3335953878406708e-06, "loss": 0.0843, "step": 36380 }, { "epoch": 1.9072327044025157, "grad_norm": 1.5621609687805176, "learning_rate": 2.3204926624737946e-06, "loss": 0.0665, "step": 36390 }, { "epoch": 1.9077568134171907, "grad_norm": 1.3874887228012085, "learning_rate": 2.3073899371069184e-06, "loss": 0.0767, "step": 36400 }, { "epoch": 1.908280922431866, "grad_norm": 2.122204542160034, "learning_rate": 2.294287211740042e-06, "loss": 0.0669, "step": 36410 }, { "epoch": 1.908805031446541, "grad_norm": 2.125373601913452, "learning_rate": 2.281184486373166e-06, "loss": 0.0722, "step": 36420 }, { "epoch": 1.909329140461216, "grad_norm": 1.2056728601455688, "learning_rate": 2.2680817610062894e-06, "loss": 0.0659, "step": 36430 }, { "epoch": 1.909853249475891, "grad_norm": 2.644113063812256, "learning_rate": 2.254979035639413e-06, "loss": 0.1034, "step": 36440 }, { "epoch": 1.9103773584905661, "grad_norm": 2.2098119258880615, "learning_rate": 2.241876310272537e-06, "loss": 0.0757, "step": 36450 }, { "epoch": 1.9109014675052411, "grad_norm": 2.2728121280670166, "learning_rate": 2.2287735849056604e-06, "loss": 0.1028, "step": 36460 }, { "epoch": 1.9114255765199162, "grad_norm": 1.3554298877716064, "learning_rate": 2.215670859538784e-06, "loss": 0.0924, "step": 36470 }, { "epoch": 1.9119496855345912, "grad_norm": 0.8044809699058533, "learning_rate": 2.202568134171908e-06, "loss": 0.0501, "step": 36480 }, { "epoch": 1.9124737945492662, "grad_norm": 1.018548846244812, "learning_rate": 2.1894654088050314e-06, "loss": 0.0684, "step": 36490 }, { "epoch": 1.9129979035639413, "grad_norm": 1.5749702453613281, "learning_rate": 2.176362683438155e-06, "loss": 0.0729, "step": 36500 }, { "epoch": 1.9135220125786163, "grad_norm": 1.1555321216583252, "learning_rate": 2.163259958071279e-06, "loss": 0.0863, "step": 36510 }, { "epoch": 1.9140461215932913, "grad_norm": 1.5626978874206543, "learning_rate": 2.1501572327044024e-06, "loss": 0.0809, "step": 36520 }, { "epoch": 1.9145702306079664, "grad_norm": 1.4471033811569214, "learning_rate": 2.137054507337526e-06, "loss": 0.0663, "step": 36530 }, { "epoch": 1.9150943396226414, "grad_norm": 1.3958340883255005, "learning_rate": 2.12395178197065e-06, "loss": 0.061, "step": 36540 }, { "epoch": 1.9156184486373165, "grad_norm": 2.147571086883545, "learning_rate": 2.1108490566037734e-06, "loss": 0.0585, "step": 36550 }, { "epoch": 1.9161425576519915, "grad_norm": 1.4060626029968262, "learning_rate": 2.097746331236897e-06, "loss": 0.0629, "step": 36560 }, { "epoch": 1.9166666666666665, "grad_norm": 2.2379798889160156, "learning_rate": 2.084643605870021e-06, "loss": 0.0847, "step": 36570 }, { "epoch": 1.9171907756813418, "grad_norm": 1.4189167022705078, "learning_rate": 2.071540880503145e-06, "loss": 0.0864, "step": 36580 }, { "epoch": 1.9177148846960168, "grad_norm": 1.625536561012268, "learning_rate": 2.0584381551362686e-06, "loss": 0.0726, "step": 36590 }, { "epoch": 1.9182389937106918, "grad_norm": 1.1209425926208496, "learning_rate": 2.0453354297693924e-06, "loss": 0.0614, "step": 36600 }, { "epoch": 1.9187631027253669, "grad_norm": 0.8219720125198364, "learning_rate": 2.032232704402516e-06, "loss": 0.0652, "step": 36610 }, { "epoch": 1.919287211740042, "grad_norm": 1.4849026203155518, "learning_rate": 2.0191299790356396e-06, "loss": 0.0791, "step": 36620 }, { "epoch": 1.919811320754717, "grad_norm": 0.8608081340789795, "learning_rate": 2.0060272536687634e-06, "loss": 0.103, "step": 36630 }, { "epoch": 1.9203354297693922, "grad_norm": 2.357111930847168, "learning_rate": 1.992924528301887e-06, "loss": 0.0639, "step": 36640 }, { "epoch": 1.9208595387840672, "grad_norm": 1.304469347000122, "learning_rate": 1.9798218029350106e-06, "loss": 0.0917, "step": 36650 }, { "epoch": 1.9213836477987423, "grad_norm": 1.540612816810608, "learning_rate": 1.9667190775681344e-06, "loss": 0.0776, "step": 36660 }, { "epoch": 1.9219077568134173, "grad_norm": 1.7036010026931763, "learning_rate": 1.953616352201258e-06, "loss": 0.0675, "step": 36670 }, { "epoch": 1.9224318658280923, "grad_norm": 0.8985329866409302, "learning_rate": 1.9405136268343816e-06, "loss": 0.0686, "step": 36680 }, { "epoch": 1.9229559748427674, "grad_norm": 1.7493515014648438, "learning_rate": 1.9274109014675054e-06, "loss": 0.063, "step": 36690 }, { "epoch": 1.9234800838574424, "grad_norm": 1.0596920251846313, "learning_rate": 1.914308176100629e-06, "loss": 0.0765, "step": 36700 }, { "epoch": 1.9240041928721174, "grad_norm": 1.8956947326660156, "learning_rate": 1.9012054507337529e-06, "loss": 0.0779, "step": 36710 }, { "epoch": 1.9245283018867925, "grad_norm": 1.4802206754684448, "learning_rate": 1.8881027253668767e-06, "loss": 0.068, "step": 36720 }, { "epoch": 1.9250524109014675, "grad_norm": 1.4648950099945068, "learning_rate": 1.875e-06, "loss": 0.0762, "step": 36730 }, { "epoch": 1.9255765199161425, "grad_norm": 2.2809669971466064, "learning_rate": 1.8618972746331239e-06, "loss": 0.0818, "step": 36740 }, { "epoch": 1.9261006289308176, "grad_norm": 0.834746241569519, "learning_rate": 1.8487945492662477e-06, "loss": 0.0509, "step": 36750 }, { "epoch": 1.9266247379454926, "grad_norm": 1.7022600173950195, "learning_rate": 1.835691823899371e-06, "loss": 0.0865, "step": 36760 }, { "epoch": 1.9271488469601676, "grad_norm": 1.0883166790008545, "learning_rate": 1.8225890985324949e-06, "loss": 0.0809, "step": 36770 }, { "epoch": 1.9276729559748427, "grad_norm": 1.4769940376281738, "learning_rate": 1.8094863731656185e-06, "loss": 0.0755, "step": 36780 }, { "epoch": 1.9281970649895177, "grad_norm": 0.7859780192375183, "learning_rate": 1.7963836477987423e-06, "loss": 0.0564, "step": 36790 }, { "epoch": 1.9287211740041927, "grad_norm": 0.8886245489120483, "learning_rate": 1.783280922431866e-06, "loss": 0.0916, "step": 36800 }, { "epoch": 1.9292452830188678, "grad_norm": 1.6430957317352295, "learning_rate": 1.7701781970649895e-06, "loss": 0.0777, "step": 36810 }, { "epoch": 1.929769392033543, "grad_norm": 2.1593551635742188, "learning_rate": 1.7570754716981133e-06, "loss": 0.0844, "step": 36820 }, { "epoch": 1.930293501048218, "grad_norm": 0.8387053608894348, "learning_rate": 1.743972746331237e-06, "loss": 0.0669, "step": 36830 }, { "epoch": 1.930817610062893, "grad_norm": 1.2902204990386963, "learning_rate": 1.7308700209643605e-06, "loss": 0.0739, "step": 36840 }, { "epoch": 1.9313417190775681, "grad_norm": 1.5502692461013794, "learning_rate": 1.7177672955974843e-06, "loss": 0.1027, "step": 36850 }, { "epoch": 1.9318658280922432, "grad_norm": 1.5457940101623535, "learning_rate": 1.704664570230608e-06, "loss": 0.0823, "step": 36860 }, { "epoch": 1.9323899371069182, "grad_norm": 3.4082517623901367, "learning_rate": 1.6915618448637317e-06, "loss": 0.0633, "step": 36870 }, { "epoch": 1.9329140461215935, "grad_norm": 2.142735242843628, "learning_rate": 1.6784591194968555e-06, "loss": 0.0952, "step": 36880 }, { "epoch": 1.9334381551362685, "grad_norm": 0.6079370975494385, "learning_rate": 1.6653563941299793e-06, "loss": 0.0664, "step": 36890 }, { "epoch": 1.9339622641509435, "grad_norm": 1.949703574180603, "learning_rate": 1.6522536687631027e-06, "loss": 0.0689, "step": 36900 }, { "epoch": 1.9344863731656186, "grad_norm": 2.3975839614868164, "learning_rate": 1.6391509433962265e-06, "loss": 0.0702, "step": 36910 }, { "epoch": 1.9350104821802936, "grad_norm": 1.2329648733139038, "learning_rate": 1.6260482180293503e-06, "loss": 0.0612, "step": 36920 }, { "epoch": 1.9355345911949686, "grad_norm": 0.8647230863571167, "learning_rate": 1.6129454926624737e-06, "loss": 0.0533, "step": 36930 }, { "epoch": 1.9360587002096437, "grad_norm": 1.8352750539779663, "learning_rate": 1.5998427672955975e-06, "loss": 0.09, "step": 36940 }, { "epoch": 1.9365828092243187, "grad_norm": 1.6736629009246826, "learning_rate": 1.5867400419287213e-06, "loss": 0.0873, "step": 36950 }, { "epoch": 1.9371069182389937, "grad_norm": 3.0347659587860107, "learning_rate": 1.573637316561845e-06, "loss": 0.0882, "step": 36960 }, { "epoch": 1.9376310272536688, "grad_norm": 1.4697365760803223, "learning_rate": 1.5605345911949687e-06, "loss": 0.0721, "step": 36970 }, { "epoch": 1.9381551362683438, "grad_norm": 0.8763979077339172, "learning_rate": 1.5474318658280923e-06, "loss": 0.0745, "step": 36980 }, { "epoch": 1.9386792452830188, "grad_norm": 1.6568057537078857, "learning_rate": 1.5343291404612161e-06, "loss": 0.0761, "step": 36990 }, { "epoch": 1.9392033542976939, "grad_norm": 2.1250786781311035, "learning_rate": 1.5212264150943397e-06, "loss": 0.0767, "step": 37000 }, { "epoch": 1.9392033542976939, "eval_loss": 0.266403466463089, "eval_runtime": 267.5874, "eval_samples_per_second": 7.441, "eval_steps_per_second": 1.241, "step": 37000 }, { "epoch": 1.939727463312369, "grad_norm": 1.6555750370025635, "learning_rate": 1.5081236897274633e-06, "loss": 0.0722, "step": 37010 }, { "epoch": 1.940251572327044, "grad_norm": 1.4497612714767456, "learning_rate": 1.4950209643605871e-06, "loss": 0.076, "step": 37020 }, { "epoch": 1.940775681341719, "grad_norm": 2.2555832862854004, "learning_rate": 1.4819182389937107e-06, "loss": 0.076, "step": 37030 }, { "epoch": 1.941299790356394, "grad_norm": 1.407079815864563, "learning_rate": 1.4688155136268343e-06, "loss": 0.094, "step": 37040 }, { "epoch": 1.941823899371069, "grad_norm": 1.192366361618042, "learning_rate": 1.4557127882599581e-06, "loss": 0.0895, "step": 37050 }, { "epoch": 1.9423480083857443, "grad_norm": 1.538325548171997, "learning_rate": 1.442610062893082e-06, "loss": 0.0702, "step": 37060 }, { "epoch": 1.9428721174004193, "grad_norm": 1.4339028596878052, "learning_rate": 1.4295073375262055e-06, "loss": 0.0673, "step": 37070 }, { "epoch": 1.9433962264150944, "grad_norm": 1.572577953338623, "learning_rate": 1.4164046121593291e-06, "loss": 0.0884, "step": 37080 }, { "epoch": 1.9439203354297694, "grad_norm": 1.6123378276824951, "learning_rate": 1.403301886792453e-06, "loss": 0.0561, "step": 37090 }, { "epoch": 1.9444444444444444, "grad_norm": 2.3728435039520264, "learning_rate": 1.3901991614255765e-06, "loss": 0.0495, "step": 37100 }, { "epoch": 1.9449685534591195, "grad_norm": 0.9430110454559326, "learning_rate": 1.3770964360587001e-06, "loss": 0.0809, "step": 37110 }, { "epoch": 1.9454926624737947, "grad_norm": 1.7856323719024658, "learning_rate": 1.363993710691824e-06, "loss": 0.099, "step": 37120 }, { "epoch": 1.9460167714884697, "grad_norm": 3.6460683345794678, "learning_rate": 1.3508909853249477e-06, "loss": 0.0939, "step": 37130 }, { "epoch": 1.9465408805031448, "grad_norm": 1.259628176689148, "learning_rate": 1.3377882599580713e-06, "loss": 0.0723, "step": 37140 }, { "epoch": 1.9470649895178198, "grad_norm": 2.077188014984131, "learning_rate": 1.3246855345911952e-06, "loss": 0.0742, "step": 37150 }, { "epoch": 1.9475890985324948, "grad_norm": 0.9879984855651855, "learning_rate": 1.3115828092243188e-06, "loss": 0.0766, "step": 37160 }, { "epoch": 1.9481132075471699, "grad_norm": 8.38661003112793, "learning_rate": 1.2984800838574423e-06, "loss": 0.0837, "step": 37170 }, { "epoch": 1.948637316561845, "grad_norm": 1.6310886144638062, "learning_rate": 1.2853773584905662e-06, "loss": 0.0631, "step": 37180 }, { "epoch": 1.94916142557652, "grad_norm": 1.455169916152954, "learning_rate": 1.2722746331236898e-06, "loss": 0.0777, "step": 37190 }, { "epoch": 1.949685534591195, "grad_norm": 1.819744348526001, "learning_rate": 1.2591719077568134e-06, "loss": 0.0678, "step": 37200 }, { "epoch": 1.95020964360587, "grad_norm": 1.5065569877624512, "learning_rate": 1.2460691823899372e-06, "loss": 0.0725, "step": 37210 }, { "epoch": 1.950733752620545, "grad_norm": 2.0217177867889404, "learning_rate": 1.232966457023061e-06, "loss": 0.0674, "step": 37220 }, { "epoch": 1.95125786163522, "grad_norm": 0.7243431210517883, "learning_rate": 1.2198637316561846e-06, "loss": 0.0547, "step": 37230 }, { "epoch": 1.9517819706498951, "grad_norm": 1.3811225891113281, "learning_rate": 1.2067610062893084e-06, "loss": 0.0559, "step": 37240 }, { "epoch": 1.9523060796645701, "grad_norm": 1.1925405263900757, "learning_rate": 1.193658280922432e-06, "loss": 0.0649, "step": 37250 }, { "epoch": 1.9528301886792452, "grad_norm": 1.6529408693313599, "learning_rate": 1.1805555555555556e-06, "loss": 0.09, "step": 37260 }, { "epoch": 1.9533542976939202, "grad_norm": 3.579072952270508, "learning_rate": 1.1674528301886792e-06, "loss": 0.0808, "step": 37270 }, { "epoch": 1.9538784067085953, "grad_norm": 1.4058364629745483, "learning_rate": 1.154350104821803e-06, "loss": 0.072, "step": 37280 }, { "epoch": 1.9544025157232703, "grad_norm": 1.150667667388916, "learning_rate": 1.1412473794549266e-06, "loss": 0.0564, "step": 37290 }, { "epoch": 1.9549266247379455, "grad_norm": 1.4494599103927612, "learning_rate": 1.1281446540880504e-06, "loss": 0.0755, "step": 37300 }, { "epoch": 1.9554507337526206, "grad_norm": 1.234850525856018, "learning_rate": 1.1150419287211742e-06, "loss": 0.0717, "step": 37310 }, { "epoch": 1.9559748427672956, "grad_norm": 0.9447284936904907, "learning_rate": 1.1019392033542978e-06, "loss": 0.07, "step": 37320 }, { "epoch": 1.9564989517819706, "grad_norm": 2.047245740890503, "learning_rate": 1.0888364779874214e-06, "loss": 0.1194, "step": 37330 }, { "epoch": 1.9570230607966457, "grad_norm": 1.7440499067306519, "learning_rate": 1.0757337526205452e-06, "loss": 0.0658, "step": 37340 }, { "epoch": 1.9575471698113207, "grad_norm": 1.5569164752960205, "learning_rate": 1.0626310272536688e-06, "loss": 0.0636, "step": 37350 }, { "epoch": 1.958071278825996, "grad_norm": 1.9636361598968506, "learning_rate": 1.0495283018867924e-06, "loss": 0.0676, "step": 37360 }, { "epoch": 1.958595387840671, "grad_norm": 1.25540030002594, "learning_rate": 1.0364255765199162e-06, "loss": 0.0699, "step": 37370 }, { "epoch": 1.959119496855346, "grad_norm": 1.2568477392196655, "learning_rate": 1.0233228511530398e-06, "loss": 0.0866, "step": 37380 }, { "epoch": 1.959643605870021, "grad_norm": 1.4221638441085815, "learning_rate": 1.0102201257861636e-06, "loss": 0.0492, "step": 37390 }, { "epoch": 1.960167714884696, "grad_norm": 0.9008753299713135, "learning_rate": 9.971174004192874e-07, "loss": 0.0714, "step": 37400 }, { "epoch": 1.9606918238993711, "grad_norm": 1.7107698917388916, "learning_rate": 9.84014675052411e-07, "loss": 0.0567, "step": 37410 }, { "epoch": 1.9612159329140462, "grad_norm": 1.0695396661758423, "learning_rate": 9.709119496855346e-07, "loss": 0.078, "step": 37420 }, { "epoch": 1.9617400419287212, "grad_norm": 2.351242780685425, "learning_rate": 9.578092243186584e-07, "loss": 0.0725, "step": 37430 }, { "epoch": 1.9622641509433962, "grad_norm": 1.5893714427947998, "learning_rate": 9.44706498951782e-07, "loss": 0.0753, "step": 37440 }, { "epoch": 1.9627882599580713, "grad_norm": 0.7627315521240234, "learning_rate": 9.316037735849057e-07, "loss": 0.0902, "step": 37450 }, { "epoch": 1.9633123689727463, "grad_norm": 1.9830408096313477, "learning_rate": 9.185010482180295e-07, "loss": 0.0628, "step": 37460 }, { "epoch": 1.9638364779874213, "grad_norm": 1.4400569200515747, "learning_rate": 9.053983228511531e-07, "loss": 0.0667, "step": 37470 }, { "epoch": 1.9643605870020964, "grad_norm": 1.388247013092041, "learning_rate": 8.922955974842767e-07, "loss": 0.074, "step": 37480 }, { "epoch": 1.9648846960167714, "grad_norm": 1.9638195037841797, "learning_rate": 8.791928721174004e-07, "loss": 0.0686, "step": 37490 }, { "epoch": 1.9654088050314464, "grad_norm": 1.6383897066116333, "learning_rate": 8.660901467505242e-07, "loss": 0.0732, "step": 37500 }, { "epoch": 1.9659329140461215, "grad_norm": 0.9515641927719116, "learning_rate": 8.529874213836478e-07, "loss": 0.0531, "step": 37510 }, { "epoch": 1.9664570230607965, "grad_norm": 1.4345340728759766, "learning_rate": 8.398846960167714e-07, "loss": 0.0805, "step": 37520 }, { "epoch": 1.9669811320754715, "grad_norm": 1.228147029876709, "learning_rate": 8.267819706498952e-07, "loss": 0.082, "step": 37530 }, { "epoch": 1.9675052410901468, "grad_norm": 1.4178779125213623, "learning_rate": 8.136792452830189e-07, "loss": 0.0908, "step": 37540 }, { "epoch": 1.9680293501048218, "grad_norm": 2.3367421627044678, "learning_rate": 8.005765199161425e-07, "loss": 0.0769, "step": 37550 }, { "epoch": 1.9685534591194969, "grad_norm": 2.3791344165802, "learning_rate": 7.874737945492663e-07, "loss": 0.0594, "step": 37560 }, { "epoch": 1.969077568134172, "grad_norm": 1.5467840433120728, "learning_rate": 7.743710691823899e-07, "loss": 0.0895, "step": 37570 }, { "epoch": 1.969601677148847, "grad_norm": 1.9167823791503906, "learning_rate": 7.612683438155136e-07, "loss": 0.0867, "step": 37580 }, { "epoch": 1.970125786163522, "grad_norm": 1.9124726057052612, "learning_rate": 7.481656184486373e-07, "loss": 0.0531, "step": 37590 }, { "epoch": 1.9706498951781972, "grad_norm": 2.3825502395629883, "learning_rate": 7.35062893081761e-07, "loss": 0.0647, "step": 37600 }, { "epoch": 1.9711740041928723, "grad_norm": 1.4810495376586914, "learning_rate": 7.219601677148848e-07, "loss": 0.0654, "step": 37610 }, { "epoch": 1.9716981132075473, "grad_norm": 1.6239418983459473, "learning_rate": 7.088574423480085e-07, "loss": 0.0627, "step": 37620 }, { "epoch": 1.9722222222222223, "grad_norm": 2.5030767917633057, "learning_rate": 6.957547169811322e-07, "loss": 0.0503, "step": 37630 }, { "epoch": 1.9727463312368974, "grad_norm": 0.7092583775520325, "learning_rate": 6.826519916142558e-07, "loss": 0.0614, "step": 37640 }, { "epoch": 1.9732704402515724, "grad_norm": 1.160536527633667, "learning_rate": 6.695492662473795e-07, "loss": 0.0859, "step": 37650 }, { "epoch": 1.9737945492662474, "grad_norm": 1.278551697731018, "learning_rate": 6.564465408805032e-07, "loss": 0.0718, "step": 37660 }, { "epoch": 1.9743186582809225, "grad_norm": 1.3502774238586426, "learning_rate": 6.433438155136269e-07, "loss": 0.0755, "step": 37670 }, { "epoch": 1.9748427672955975, "grad_norm": 2.126868724822998, "learning_rate": 6.302410901467506e-07, "loss": 0.0829, "step": 37680 }, { "epoch": 1.9753668763102725, "grad_norm": 2.7387828826904297, "learning_rate": 6.171383647798743e-07, "loss": 0.0774, "step": 37690 }, { "epoch": 1.9758909853249476, "grad_norm": 2.5403170585632324, "learning_rate": 6.040356394129979e-07, "loss": 0.0854, "step": 37700 }, { "epoch": 1.9764150943396226, "grad_norm": 1.5051779747009277, "learning_rate": 5.909329140461217e-07, "loss": 0.0823, "step": 37710 }, { "epoch": 1.9769392033542976, "grad_norm": 1.1935973167419434, "learning_rate": 5.778301886792453e-07, "loss": 0.0647, "step": 37720 }, { "epoch": 1.9774633123689727, "grad_norm": 1.5811327695846558, "learning_rate": 5.64727463312369e-07, "loss": 0.1096, "step": 37730 }, { "epoch": 1.9779874213836477, "grad_norm": 5.325149059295654, "learning_rate": 5.516247379454927e-07, "loss": 0.082, "step": 37740 }, { "epoch": 1.9785115303983227, "grad_norm": 0.6052076816558838, "learning_rate": 5.385220125786164e-07, "loss": 0.0502, "step": 37750 }, { "epoch": 1.9790356394129978, "grad_norm": 1.2876312732696533, "learning_rate": 5.254192872117401e-07, "loss": 0.0864, "step": 37760 }, { "epoch": 1.9795597484276728, "grad_norm": 1.1572834253311157, "learning_rate": 5.123165618448638e-07, "loss": 0.0712, "step": 37770 }, { "epoch": 1.980083857442348, "grad_norm": 9.827251434326172, "learning_rate": 4.992138364779874e-07, "loss": 0.0594, "step": 37780 }, { "epoch": 1.980607966457023, "grad_norm": 0.8759055137634277, "learning_rate": 4.861111111111111e-07, "loss": 0.0716, "step": 37790 }, { "epoch": 1.9811320754716981, "grad_norm": 1.3479708433151245, "learning_rate": 4.7300838574423485e-07, "loss": 0.0778, "step": 37800 }, { "epoch": 1.9816561844863732, "grad_norm": 3.9129786491394043, "learning_rate": 4.599056603773585e-07, "loss": 0.076, "step": 37810 }, { "epoch": 1.9821802935010482, "grad_norm": 1.9781731367111206, "learning_rate": 4.468029350104822e-07, "loss": 0.0679, "step": 37820 }, { "epoch": 1.9827044025157232, "grad_norm": 17.147493362426758, "learning_rate": 4.3370020964360585e-07, "loss": 0.0607, "step": 37830 }, { "epoch": 1.9832285115303985, "grad_norm": 1.202863335609436, "learning_rate": 4.2059748427672955e-07, "loss": 0.0794, "step": 37840 }, { "epoch": 1.9837526205450735, "grad_norm": 1.6506856679916382, "learning_rate": 4.074947589098533e-07, "loss": 0.0636, "step": 37850 }, { "epoch": 1.9842767295597485, "grad_norm": 1.0308916568756104, "learning_rate": 3.943920335429769e-07, "loss": 0.0728, "step": 37860 }, { "epoch": 1.9848008385744236, "grad_norm": 1.316379427909851, "learning_rate": 3.8128930817610066e-07, "loss": 0.0817, "step": 37870 }, { "epoch": 1.9853249475890986, "grad_norm": 1.3974709510803223, "learning_rate": 3.681865828092243e-07, "loss": 0.0527, "step": 37880 }, { "epoch": 1.9858490566037736, "grad_norm": 1.2634168863296509, "learning_rate": 3.55083857442348e-07, "loss": 0.0741, "step": 37890 }, { "epoch": 1.9863731656184487, "grad_norm": 1.9009695053100586, "learning_rate": 3.419811320754717e-07, "loss": 0.0895, "step": 37900 }, { "epoch": 1.9868972746331237, "grad_norm": 1.6653246879577637, "learning_rate": 3.288784067085954e-07, "loss": 0.0794, "step": 37910 }, { "epoch": 1.9874213836477987, "grad_norm": 1.7431325912475586, "learning_rate": 3.1577568134171907e-07, "loss": 0.0872, "step": 37920 }, { "epoch": 1.9879454926624738, "grad_norm": 0.8424640893936157, "learning_rate": 3.026729559748428e-07, "loss": 0.0714, "step": 37930 }, { "epoch": 1.9884696016771488, "grad_norm": 2.0467424392700195, "learning_rate": 2.895702306079665e-07, "loss": 0.0872, "step": 37940 }, { "epoch": 1.9889937106918238, "grad_norm": 1.6486082077026367, "learning_rate": 2.764675052410902e-07, "loss": 0.0946, "step": 37950 }, { "epoch": 1.9895178197064989, "grad_norm": 1.3245718479156494, "learning_rate": 2.6336477987421383e-07, "loss": 0.0832, "step": 37960 }, { "epoch": 1.990041928721174, "grad_norm": 1.2091447114944458, "learning_rate": 2.5026205450733754e-07, "loss": 0.0847, "step": 37970 }, { "epoch": 1.990566037735849, "grad_norm": 0.769639790058136, "learning_rate": 2.3715932914046124e-07, "loss": 0.0586, "step": 37980 }, { "epoch": 1.991090146750524, "grad_norm": 0.9816944599151611, "learning_rate": 2.2405660377358492e-07, "loss": 0.066, "step": 37990 }, { "epoch": 1.991614255765199, "grad_norm": 1.0189626216888428, "learning_rate": 2.109538784067086e-07, "loss": 0.0697, "step": 38000 }, { "epoch": 1.991614255765199, "eval_loss": 0.2673446834087372, "eval_runtime": 267.8679, "eval_samples_per_second": 7.433, "eval_steps_per_second": 1.239, "step": 38000 } ], "logging_steps": 10, "max_steps": 38160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.669752402333338e+19, "train_batch_size": 6, "trial_name": null, "trial_params": null }