diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26938 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.991614255765199, + "eval_steps": 1000, + "global_step": 38000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005241090146750524, + "grad_norm": 1.6935213804244995, + "learning_rate": 4.9988207547169815e-05, + "loss": 0.3559, + "step": 10 + }, + { + "epoch": 0.0010482180293501049, + "grad_norm": 2.7643861770629883, + "learning_rate": 4.997510482180294e-05, + "loss": 0.2081, + "step": 20 + }, + { + "epoch": 0.0015723270440251573, + "grad_norm": 1.5143877267837524, + "learning_rate": 4.996200209643606e-05, + "loss": 0.219, + "step": 30 + }, + { + "epoch": 0.0020964360587002098, + "grad_norm": 2.633585214614868, + "learning_rate": 4.9948899371069186e-05, + "loss": 0.2276, + "step": 40 + }, + { + "epoch": 0.002620545073375262, + "grad_norm": 1.973532795906067, + "learning_rate": 4.993579664570231e-05, + "loss": 0.2041, + "step": 50 + }, + { + "epoch": 0.0031446540880503146, + "grad_norm": 2.134101629257202, + "learning_rate": 4.992269392033543e-05, + "loss": 0.2058, + "step": 60 + }, + { + "epoch": 0.003668763102725367, + "grad_norm": 1.7121070623397827, + "learning_rate": 4.9909591194968556e-05, + "loss": 0.2332, + "step": 70 + }, + { + "epoch": 0.0041928721174004195, + "grad_norm": 1.5422728061676025, + "learning_rate": 4.989648846960168e-05, + "loss": 0.1932, + "step": 80 + }, + { + "epoch": 0.0047169811320754715, + "grad_norm": 1.2396705150604248, + "learning_rate": 4.98833857442348e-05, + "loss": 0.1739, + "step": 90 + }, + { + "epoch": 0.005241090146750524, + "grad_norm": 1.5578513145446777, + "learning_rate": 4.9870283018867926e-05, + "loss": 0.1871, + "step": 100 + }, + { + "epoch": 0.005765199161425576, + "grad_norm": 1.3153727054595947, + "learning_rate": 4.985718029350105e-05, + "loss": 0.2039, + "step": 110 + }, + { + "epoch": 0.006289308176100629, + "grad_norm": 1.5129348039627075, + "learning_rate": 4.984407756813417e-05, + "loss": 0.1398, + "step": 120 + }, + { + "epoch": 0.006813417190775681, + "grad_norm": 1.8399486541748047, + "learning_rate": 4.9830974842767296e-05, + "loss": 0.181, + "step": 130 + }, + { + "epoch": 0.007337526205450734, + "grad_norm": 1.6457992792129517, + "learning_rate": 4.981787211740042e-05, + "loss": 0.1897, + "step": 140 + }, + { + "epoch": 0.007861635220125786, + "grad_norm": 1.87934148311615, + "learning_rate": 4.980476939203354e-05, + "loss": 0.1606, + "step": 150 + }, + { + "epoch": 0.008385744234800839, + "grad_norm": 1.4629708528518677, + "learning_rate": 4.979166666666667e-05, + "loss": 0.1798, + "step": 160 + }, + { + "epoch": 0.00890985324947589, + "grad_norm": 1.632991909980774, + "learning_rate": 4.9778563941299796e-05, + "loss": 0.1524, + "step": 170 + }, + { + "epoch": 0.009433962264150943, + "grad_norm": 1.5290257930755615, + "learning_rate": 4.976546121593292e-05, + "loss": 0.151, + "step": 180 + }, + { + "epoch": 0.009958071278825996, + "grad_norm": 1.6157230138778687, + "learning_rate": 4.975235849056604e-05, + "loss": 0.1807, + "step": 190 + }, + { + "epoch": 0.010482180293501049, + "grad_norm": 1.7142693996429443, + "learning_rate": 4.9739255765199167e-05, + "loss": 0.1754, + "step": 200 + }, + { + "epoch": 0.0110062893081761, + "grad_norm": 1.5730280876159668, + "learning_rate": 4.972615303983228e-05, + "loss": 0.172, + "step": 210 + }, + { + "epoch": 0.011530398322851153, + "grad_norm": 1.9272352457046509, + "learning_rate": 4.9713050314465407e-05, + "loss": 0.1684, + "step": 220 + }, + { + "epoch": 0.012054507337526206, + "grad_norm": 1.6200827360153198, + "learning_rate": 4.969994758909853e-05, + "loss": 0.2375, + "step": 230 + }, + { + "epoch": 0.012578616352201259, + "grad_norm": 1.1623066663742065, + "learning_rate": 4.968684486373166e-05, + "loss": 0.144, + "step": 240 + }, + { + "epoch": 0.01310272536687631, + "grad_norm": 1.6734042167663574, + "learning_rate": 4.9673742138364784e-05, + "loss": 0.1523, + "step": 250 + }, + { + "epoch": 0.013626834381551363, + "grad_norm": 1.4891057014465332, + "learning_rate": 4.966063941299791e-05, + "loss": 0.1881, + "step": 260 + }, + { + "epoch": 0.014150943396226415, + "grad_norm": 1.4071035385131836, + "learning_rate": 4.964753668763103e-05, + "loss": 0.146, + "step": 270 + }, + { + "epoch": 0.014675052410901468, + "grad_norm": 3.614590644836426, + "learning_rate": 4.9634433962264154e-05, + "loss": 0.1603, + "step": 280 + }, + { + "epoch": 0.01519916142557652, + "grad_norm": 1.7132965326309204, + "learning_rate": 4.962133123689728e-05, + "loss": 0.1538, + "step": 290 + }, + { + "epoch": 0.015723270440251572, + "grad_norm": 1.7030532360076904, + "learning_rate": 4.96082285115304e-05, + "loss": 0.188, + "step": 300 + }, + { + "epoch": 0.016247379454926623, + "grad_norm": 0.9326625466346741, + "learning_rate": 4.9595125786163524e-05, + "loss": 0.1669, + "step": 310 + }, + { + "epoch": 0.016771488469601678, + "grad_norm": 1.9342471361160278, + "learning_rate": 4.9582023060796654e-05, + "loss": 0.1618, + "step": 320 + }, + { + "epoch": 0.01729559748427673, + "grad_norm": 1.786618947982788, + "learning_rate": 4.956892033542977e-05, + "loss": 0.1645, + "step": 330 + }, + { + "epoch": 0.01781970649895178, + "grad_norm": 2.348848819732666, + "learning_rate": 4.9555817610062894e-05, + "loss": 0.1511, + "step": 340 + }, + { + "epoch": 0.018343815513626835, + "grad_norm": 1.2310869693756104, + "learning_rate": 4.954271488469602e-05, + "loss": 0.162, + "step": 350 + }, + { + "epoch": 0.018867924528301886, + "grad_norm": 1.5017716884613037, + "learning_rate": 4.952961215932914e-05, + "loss": 0.1844, + "step": 360 + }, + { + "epoch": 0.01939203354297694, + "grad_norm": 2.7584903240203857, + "learning_rate": 4.9516509433962264e-05, + "loss": 0.1623, + "step": 370 + }, + { + "epoch": 0.019916142557651992, + "grad_norm": 1.2648184299468994, + "learning_rate": 4.950340670859539e-05, + "loss": 0.198, + "step": 380 + }, + { + "epoch": 0.020440251572327043, + "grad_norm": 1.4309797286987305, + "learning_rate": 4.949030398322851e-05, + "loss": 0.1776, + "step": 390 + }, + { + "epoch": 0.020964360587002098, + "grad_norm": 2.9164257049560547, + "learning_rate": 4.947720125786164e-05, + "loss": 0.19, + "step": 400 + }, + { + "epoch": 0.02148846960167715, + "grad_norm": 1.5874290466308594, + "learning_rate": 4.9464098532494764e-05, + "loss": 0.1611, + "step": 410 + }, + { + "epoch": 0.0220125786163522, + "grad_norm": 3.8210947513580322, + "learning_rate": 4.945099580712789e-05, + "loss": 0.1678, + "step": 420 + }, + { + "epoch": 0.022536687631027254, + "grad_norm": 1.253749966621399, + "learning_rate": 4.943789308176101e-05, + "loss": 0.1717, + "step": 430 + }, + { + "epoch": 0.023060796645702306, + "grad_norm": 2.2594664096832275, + "learning_rate": 4.9424790356394135e-05, + "loss": 0.1781, + "step": 440 + }, + { + "epoch": 0.02358490566037736, + "grad_norm": 1.278881549835205, + "learning_rate": 4.941168763102725e-05, + "loss": 0.1582, + "step": 450 + }, + { + "epoch": 0.02410901467505241, + "grad_norm": 1.303890585899353, + "learning_rate": 4.9398584905660375e-05, + "loss": 0.1413, + "step": 460 + }, + { + "epoch": 0.024633123689727462, + "grad_norm": 1.44670832157135, + "learning_rate": 4.9385482180293505e-05, + "loss": 0.1731, + "step": 470 + }, + { + "epoch": 0.025157232704402517, + "grad_norm": 1.9329112768173218, + "learning_rate": 4.937237945492663e-05, + "loss": 0.1769, + "step": 480 + }, + { + "epoch": 0.025681341719077568, + "grad_norm": 1.873005747795105, + "learning_rate": 4.935927672955975e-05, + "loss": 0.1945, + "step": 490 + }, + { + "epoch": 0.02620545073375262, + "grad_norm": 1.2180646657943726, + "learning_rate": 4.9346174004192875e-05, + "loss": 0.1456, + "step": 500 + }, + { + "epoch": 0.026729559748427674, + "grad_norm": 1.3814204931259155, + "learning_rate": 4.9333071278826e-05, + "loss": 0.1732, + "step": 510 + }, + { + "epoch": 0.027253668763102725, + "grad_norm": 1.440902590751648, + "learning_rate": 4.931996855345912e-05, + "loss": 0.2116, + "step": 520 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 1.0152875185012817, + "learning_rate": 4.9306865828092245e-05, + "loss": 0.1555, + "step": 530 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 1.1150940656661987, + "learning_rate": 4.929376310272537e-05, + "loss": 0.1619, + "step": 540 + }, + { + "epoch": 0.028825995807127882, + "grad_norm": 1.3169711828231812, + "learning_rate": 4.928066037735849e-05, + "loss": 0.1839, + "step": 550 + }, + { + "epoch": 0.029350104821802937, + "grad_norm": 1.2143508195877075, + "learning_rate": 4.926755765199162e-05, + "loss": 0.171, + "step": 560 + }, + { + "epoch": 0.029874213836477988, + "grad_norm": 0.9038965702056885, + "learning_rate": 4.925445492662474e-05, + "loss": 0.1504, + "step": 570 + }, + { + "epoch": 0.03039832285115304, + "grad_norm": 1.3861496448516846, + "learning_rate": 4.924135220125786e-05, + "loss": 0.1926, + "step": 580 + }, + { + "epoch": 0.030922431865828093, + "grad_norm": 1.5401298999786377, + "learning_rate": 4.9228249475890985e-05, + "loss": 0.172, + "step": 590 + }, + { + "epoch": 0.031446540880503145, + "grad_norm": 2.0027623176574707, + "learning_rate": 4.921514675052411e-05, + "loss": 0.1753, + "step": 600 + }, + { + "epoch": 0.0319706498951782, + "grad_norm": 1.0361213684082031, + "learning_rate": 4.920204402515723e-05, + "loss": 0.1549, + "step": 610 + }, + { + "epoch": 0.03249475890985325, + "grad_norm": 1.3739137649536133, + "learning_rate": 4.9188941299790356e-05, + "loss": 0.194, + "step": 620 + }, + { + "epoch": 0.0330188679245283, + "grad_norm": 2.229529857635498, + "learning_rate": 4.9175838574423486e-05, + "loss": 0.1705, + "step": 630 + }, + { + "epoch": 0.033542976939203356, + "grad_norm": 1.585580825805664, + "learning_rate": 4.916273584905661e-05, + "loss": 0.1404, + "step": 640 + }, + { + "epoch": 0.034067085953878404, + "grad_norm": 1.1737724542617798, + "learning_rate": 4.914963312368973e-05, + "loss": 0.1814, + "step": 650 + }, + { + "epoch": 0.03459119496855346, + "grad_norm": 1.4785288572311401, + "learning_rate": 4.9136530398322856e-05, + "loss": 0.1425, + "step": 660 + }, + { + "epoch": 0.03511530398322851, + "grad_norm": 2.3371455669403076, + "learning_rate": 4.912342767295598e-05, + "loss": 0.1601, + "step": 670 + }, + { + "epoch": 0.03563941299790356, + "grad_norm": 1.0585002899169922, + "learning_rate": 4.91103249475891e-05, + "loss": 0.1261, + "step": 680 + }, + { + "epoch": 0.036163522012578615, + "grad_norm": 1.873073935508728, + "learning_rate": 4.909722222222222e-05, + "loss": 0.1793, + "step": 690 + }, + { + "epoch": 0.03668763102725367, + "grad_norm": 1.6408770084381104, + "learning_rate": 4.908411949685535e-05, + "loss": 0.1635, + "step": 700 + }, + { + "epoch": 0.037211740041928724, + "grad_norm": 2.1603291034698486, + "learning_rate": 4.907101677148847e-05, + "loss": 0.1348, + "step": 710 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 3.0273585319519043, + "learning_rate": 4.9057914046121596e-05, + "loss": 0.1864, + "step": 720 + }, + { + "epoch": 0.03825995807127883, + "grad_norm": 1.1323351860046387, + "learning_rate": 4.904481132075472e-05, + "loss": 0.1577, + "step": 730 + }, + { + "epoch": 0.03878406708595388, + "grad_norm": 1.803604006767273, + "learning_rate": 4.903170859538784e-05, + "loss": 0.1612, + "step": 740 + }, + { + "epoch": 0.03930817610062893, + "grad_norm": 2.119659900665283, + "learning_rate": 4.9018605870020966e-05, + "loss": 0.1484, + "step": 750 + }, + { + "epoch": 0.039832285115303984, + "grad_norm": 2.6356945037841797, + "learning_rate": 4.900550314465409e-05, + "loss": 0.158, + "step": 760 + }, + { + "epoch": 0.04035639412997904, + "grad_norm": 1.738153100013733, + "learning_rate": 4.899240041928721e-05, + "loss": 0.1704, + "step": 770 + }, + { + "epoch": 0.040880503144654086, + "grad_norm": 2.1323139667510986, + "learning_rate": 4.8979297693920336e-05, + "loss": 0.173, + "step": 780 + }, + { + "epoch": 0.04140461215932914, + "grad_norm": 2.8090837001800537, + "learning_rate": 4.896619496855347e-05, + "loss": 0.1349, + "step": 790 + }, + { + "epoch": 0.041928721174004195, + "grad_norm": 1.0705924034118652, + "learning_rate": 4.895309224318658e-05, + "loss": 0.147, + "step": 800 + }, + { + "epoch": 0.04245283018867924, + "grad_norm": 1.817993402481079, + "learning_rate": 4.893998951781971e-05, + "loss": 0.2037, + "step": 810 + }, + { + "epoch": 0.0429769392033543, + "grad_norm": 1.405219316482544, + "learning_rate": 4.892688679245283e-05, + "loss": 0.1496, + "step": 820 + }, + { + "epoch": 0.04350104821802935, + "grad_norm": 2.2118489742279053, + "learning_rate": 4.8913784067085953e-05, + "loss": 0.1513, + "step": 830 + }, + { + "epoch": 0.0440251572327044, + "grad_norm": 1.1836580038070679, + "learning_rate": 4.890068134171908e-05, + "loss": 0.1458, + "step": 840 + }, + { + "epoch": 0.044549266247379454, + "grad_norm": 1.7217124700546265, + "learning_rate": 4.88875786163522e-05, + "loss": 0.1554, + "step": 850 + }, + { + "epoch": 0.04507337526205451, + "grad_norm": 1.640258550643921, + "learning_rate": 4.887447589098533e-05, + "loss": 0.12, + "step": 860 + }, + { + "epoch": 0.04559748427672956, + "grad_norm": 1.1371960639953613, + "learning_rate": 4.8861373165618454e-05, + "loss": 0.1992, + "step": 870 + }, + { + "epoch": 0.04612159329140461, + "grad_norm": 2.01666259765625, + "learning_rate": 4.884827044025158e-05, + "loss": 0.1374, + "step": 880 + }, + { + "epoch": 0.046645702306079666, + "grad_norm": 2.8290281295776367, + "learning_rate": 4.88351677148847e-05, + "loss": 0.1368, + "step": 890 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 1.649423599243164, + "learning_rate": 4.8822064989517824e-05, + "loss": 0.1858, + "step": 900 + }, + { + "epoch": 0.04769392033542977, + "grad_norm": 1.3996362686157227, + "learning_rate": 4.880896226415095e-05, + "loss": 0.1751, + "step": 910 + }, + { + "epoch": 0.04821802935010482, + "grad_norm": 1.836682915687561, + "learning_rate": 4.8795859538784064e-05, + "loss": 0.1383, + "step": 920 + }, + { + "epoch": 0.04874213836477988, + "grad_norm": 2.5748958587646484, + "learning_rate": 4.8782756813417194e-05, + "loss": 0.1296, + "step": 930 + }, + { + "epoch": 0.049266247379454925, + "grad_norm": 1.4079227447509766, + "learning_rate": 4.876965408805032e-05, + "loss": 0.1476, + "step": 940 + }, + { + "epoch": 0.04979035639412998, + "grad_norm": 0.9867343902587891, + "learning_rate": 4.875655136268344e-05, + "loss": 0.1841, + "step": 950 + }, + { + "epoch": 0.050314465408805034, + "grad_norm": 1.5223592519760132, + "learning_rate": 4.8743448637316564e-05, + "loss": 0.1666, + "step": 960 + }, + { + "epoch": 0.05083857442348008, + "grad_norm": 1.3690940141677856, + "learning_rate": 4.873034591194969e-05, + "loss": 0.1901, + "step": 970 + }, + { + "epoch": 0.051362683438155136, + "grad_norm": 1.6786198616027832, + "learning_rate": 4.871724318658281e-05, + "loss": 0.156, + "step": 980 + }, + { + "epoch": 0.05188679245283019, + "grad_norm": 1.3634241819381714, + "learning_rate": 4.8704140461215934e-05, + "loss": 0.1808, + "step": 990 + }, + { + "epoch": 0.05241090146750524, + "grad_norm": 1.5819250345230103, + "learning_rate": 4.869103773584906e-05, + "loss": 0.1346, + "step": 1000 + }, + { + "epoch": 0.05241090146750524, + "eval_loss": 0.32801321148872375, + "eval_runtime": 267.484, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 1.241, + "step": 1000 + }, + { + "epoch": 0.05293501048218029, + "grad_norm": 1.4934093952178955, + "learning_rate": 4.867793501048218e-05, + "loss": 0.1558, + "step": 1010 + }, + { + "epoch": 0.05345911949685535, + "grad_norm": 2.1169118881225586, + "learning_rate": 4.866483228511531e-05, + "loss": 0.1664, + "step": 1020 + }, + { + "epoch": 0.053983228511530396, + "grad_norm": 1.9853368997573853, + "learning_rate": 4.8651729559748435e-05, + "loss": 0.1532, + "step": 1030 + }, + { + "epoch": 0.05450733752620545, + "grad_norm": 2.548008680343628, + "learning_rate": 4.863862683438155e-05, + "loss": 0.1607, + "step": 1040 + }, + { + "epoch": 0.055031446540880505, + "grad_norm": 1.3498486280441284, + "learning_rate": 4.8625524109014675e-05, + "loss": 0.1667, + "step": 1050 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 3.581585168838501, + "learning_rate": 4.86124213836478e-05, + "loss": 0.2109, + "step": 1060 + }, + { + "epoch": 0.05607966457023061, + "grad_norm": 1.8366062641143799, + "learning_rate": 4.859931865828092e-05, + "loss": 0.1457, + "step": 1070 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 2.5987987518310547, + "learning_rate": 4.8586215932914045e-05, + "loss": 0.1583, + "step": 1080 + }, + { + "epoch": 0.05712788259958071, + "grad_norm": 1.100653886795044, + "learning_rate": 4.8573113207547175e-05, + "loss": 0.1689, + "step": 1090 + }, + { + "epoch": 0.057651991614255764, + "grad_norm": 0.548283040523529, + "learning_rate": 4.85600104821803e-05, + "loss": 0.1453, + "step": 1100 + }, + { + "epoch": 0.05817610062893082, + "grad_norm": 1.1596165895462036, + "learning_rate": 4.854690775681342e-05, + "loss": 0.1452, + "step": 1110 + }, + { + "epoch": 0.05870020964360587, + "grad_norm": 1.637635350227356, + "learning_rate": 4.8533805031446545e-05, + "loss": 0.1465, + "step": 1120 + }, + { + "epoch": 0.05922431865828092, + "grad_norm": 0.9369329810142517, + "learning_rate": 4.852070230607967e-05, + "loss": 0.1459, + "step": 1130 + }, + { + "epoch": 0.059748427672955975, + "grad_norm": 1.2311546802520752, + "learning_rate": 4.850759958071279e-05, + "loss": 0.1766, + "step": 1140 + }, + { + "epoch": 0.06027253668763103, + "grad_norm": 2.20399808883667, + "learning_rate": 4.8494496855345915e-05, + "loss": 0.1597, + "step": 1150 + }, + { + "epoch": 0.06079664570230608, + "grad_norm": 1.283770203590393, + "learning_rate": 4.848139412997903e-05, + "loss": 0.1612, + "step": 1160 + }, + { + "epoch": 0.06132075471698113, + "grad_norm": 1.9280977249145508, + "learning_rate": 4.846829140461216e-05, + "loss": 0.139, + "step": 1170 + }, + { + "epoch": 0.06184486373165619, + "grad_norm": 1.577441692352295, + "learning_rate": 4.8455188679245285e-05, + "loss": 0.1676, + "step": 1180 + }, + { + "epoch": 0.062368972746331235, + "grad_norm": 1.5173856019973755, + "learning_rate": 4.844208595387841e-05, + "loss": 0.1791, + "step": 1190 + }, + { + "epoch": 0.06289308176100629, + "grad_norm": 1.5214680433273315, + "learning_rate": 4.842898322851153e-05, + "loss": 0.1491, + "step": 1200 + }, + { + "epoch": 0.06341719077568134, + "grad_norm": 1.152410864830017, + "learning_rate": 4.8415880503144656e-05, + "loss": 0.1435, + "step": 1210 + }, + { + "epoch": 0.0639412997903564, + "grad_norm": 1.456242322921753, + "learning_rate": 4.840277777777778e-05, + "loss": 0.1374, + "step": 1220 + }, + { + "epoch": 0.06446540880503145, + "grad_norm": 0.6337676644325256, + "learning_rate": 4.83896750524109e-05, + "loss": 0.1484, + "step": 1230 + }, + { + "epoch": 0.0649895178197065, + "grad_norm": 1.5528064966201782, + "learning_rate": 4.8376572327044026e-05, + "loss": 0.1796, + "step": 1240 + }, + { + "epoch": 0.06551362683438156, + "grad_norm": 1.5369819402694702, + "learning_rate": 4.8363469601677156e-05, + "loss": 0.151, + "step": 1250 + }, + { + "epoch": 0.0660377358490566, + "grad_norm": 2.3839173316955566, + "learning_rate": 4.835036687631028e-05, + "loss": 0.1574, + "step": 1260 + }, + { + "epoch": 0.06656184486373165, + "grad_norm": 1.5514627695083618, + "learning_rate": 4.83372641509434e-05, + "loss": 0.1378, + "step": 1270 + }, + { + "epoch": 0.06708595387840671, + "grad_norm": 1.8946847915649414, + "learning_rate": 4.832416142557652e-05, + "loss": 0.1413, + "step": 1280 + }, + { + "epoch": 0.06761006289308176, + "grad_norm": 2.3054966926574707, + "learning_rate": 4.831105870020964e-05, + "loss": 0.1899, + "step": 1290 + }, + { + "epoch": 0.06813417190775681, + "grad_norm": 1.0668176412582397, + "learning_rate": 4.8297955974842766e-05, + "loss": 0.134, + "step": 1300 + }, + { + "epoch": 0.06865828092243187, + "grad_norm": 2.0305306911468506, + "learning_rate": 4.828485324947589e-05, + "loss": 0.211, + "step": 1310 + }, + { + "epoch": 0.06918238993710692, + "grad_norm": 1.6836894750595093, + "learning_rate": 4.827175052410901e-05, + "loss": 0.1703, + "step": 1320 + }, + { + "epoch": 0.06970649895178196, + "grad_norm": 1.3076380491256714, + "learning_rate": 4.825864779874214e-05, + "loss": 0.1626, + "step": 1330 + }, + { + "epoch": 0.07023060796645703, + "grad_norm": 1.258353352546692, + "learning_rate": 4.8245545073375266e-05, + "loss": 0.1685, + "step": 1340 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 1.0019201040267944, + "learning_rate": 4.823244234800839e-05, + "loss": 0.1468, + "step": 1350 + }, + { + "epoch": 0.07127882599580712, + "grad_norm": 2.594862222671509, + "learning_rate": 4.821933962264151e-05, + "loss": 0.1298, + "step": 1360 + }, + { + "epoch": 0.07180293501048218, + "grad_norm": 0.9913552403450012, + "learning_rate": 4.8206236897274637e-05, + "loss": 0.1482, + "step": 1370 + }, + { + "epoch": 0.07232704402515723, + "grad_norm": 1.8441811800003052, + "learning_rate": 4.819313417190776e-05, + "loss": 0.1885, + "step": 1380 + }, + { + "epoch": 0.07285115303983228, + "grad_norm": 1.8914382457733154, + "learning_rate": 4.818003144654088e-05, + "loss": 0.1218, + "step": 1390 + }, + { + "epoch": 0.07337526205450734, + "grad_norm": 1.9815479516983032, + "learning_rate": 4.816692872117401e-05, + "loss": 0.122, + "step": 1400 + }, + { + "epoch": 0.07389937106918239, + "grad_norm": 1.6056983470916748, + "learning_rate": 4.815382599580713e-05, + "loss": 0.192, + "step": 1410 + }, + { + "epoch": 0.07442348008385745, + "grad_norm": 2.7241697311401367, + "learning_rate": 4.8140723270440253e-05, + "loss": 0.1282, + "step": 1420 + }, + { + "epoch": 0.0749475890985325, + "grad_norm": 2.503002405166626, + "learning_rate": 4.812762054507338e-05, + "loss": 0.1728, + "step": 1430 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 1.2872258424758911, + "learning_rate": 4.81145178197065e-05, + "loss": 0.1376, + "step": 1440 + }, + { + "epoch": 0.0759958071278826, + "grad_norm": 1.5114333629608154, + "learning_rate": 4.8101415094339624e-05, + "loss": 0.1729, + "step": 1450 + }, + { + "epoch": 0.07651991614255765, + "grad_norm": 1.2313517332077026, + "learning_rate": 4.808831236897275e-05, + "loss": 0.1507, + "step": 1460 + }, + { + "epoch": 0.0770440251572327, + "grad_norm": 1.3203717470169067, + "learning_rate": 4.807520964360587e-05, + "loss": 0.1681, + "step": 1470 + }, + { + "epoch": 0.07756813417190776, + "grad_norm": 0.7284913063049316, + "learning_rate": 4.8062106918238994e-05, + "loss": 0.1493, + "step": 1480 + }, + { + "epoch": 0.07809224318658281, + "grad_norm": 1.1101444959640503, + "learning_rate": 4.8049004192872124e-05, + "loss": 0.1573, + "step": 1490 + }, + { + "epoch": 0.07861635220125786, + "grad_norm": 0.9905779957771301, + "learning_rate": 4.803590146750525e-05, + "loss": 0.1423, + "step": 1500 + }, + { + "epoch": 0.07914046121593292, + "grad_norm": 2.1272692680358887, + "learning_rate": 4.802279874213837e-05, + "loss": 0.1638, + "step": 1510 + }, + { + "epoch": 0.07966457023060797, + "grad_norm": 1.397460699081421, + "learning_rate": 4.800969601677149e-05, + "loss": 0.1299, + "step": 1520 + }, + { + "epoch": 0.08018867924528301, + "grad_norm": 0.9754815101623535, + "learning_rate": 4.799659329140461e-05, + "loss": 0.1495, + "step": 1530 + }, + { + "epoch": 0.08071278825995808, + "grad_norm": 2.096616268157959, + "learning_rate": 4.7983490566037734e-05, + "loss": 0.133, + "step": 1540 + }, + { + "epoch": 0.08123689727463312, + "grad_norm": 1.4818449020385742, + "learning_rate": 4.797038784067086e-05, + "loss": 0.1158, + "step": 1550 + }, + { + "epoch": 0.08176100628930817, + "grad_norm": 1.9018614292144775, + "learning_rate": 4.795728511530399e-05, + "loss": 0.1637, + "step": 1560 + }, + { + "epoch": 0.08228511530398323, + "grad_norm": 1.2683446407318115, + "learning_rate": 4.794418238993711e-05, + "loss": 0.1492, + "step": 1570 + }, + { + "epoch": 0.08280922431865828, + "grad_norm": 1.5261001586914062, + "learning_rate": 4.7931079664570234e-05, + "loss": 0.1528, + "step": 1580 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 1.243240237236023, + "learning_rate": 4.791797693920336e-05, + "loss": 0.1795, + "step": 1590 + }, + { + "epoch": 0.08385744234800839, + "grad_norm": 1.667392373085022, + "learning_rate": 4.790487421383648e-05, + "loss": 0.1549, + "step": 1600 + }, + { + "epoch": 0.08438155136268344, + "grad_norm": 1.0932554006576538, + "learning_rate": 4.7891771488469605e-05, + "loss": 0.1494, + "step": 1610 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 1.1366685628890991, + "learning_rate": 4.787866876310273e-05, + "loss": 0.1842, + "step": 1620 + }, + { + "epoch": 0.08542976939203355, + "grad_norm": 1.2293801307678223, + "learning_rate": 4.786556603773585e-05, + "loss": 0.1567, + "step": 1630 + }, + { + "epoch": 0.0859538784067086, + "grad_norm": 2.5766963958740234, + "learning_rate": 4.7852463312368975e-05, + "loss": 0.1495, + "step": 1640 + }, + { + "epoch": 0.08647798742138364, + "grad_norm": 1.4142640829086304, + "learning_rate": 4.78393605870021e-05, + "loss": 0.1468, + "step": 1650 + }, + { + "epoch": 0.0870020964360587, + "grad_norm": 1.6835039854049683, + "learning_rate": 4.782625786163522e-05, + "loss": 0.1554, + "step": 1660 + }, + { + "epoch": 0.08752620545073375, + "grad_norm": 1.5390868186950684, + "learning_rate": 4.7813155136268345e-05, + "loss": 0.1953, + "step": 1670 + }, + { + "epoch": 0.0880503144654088, + "grad_norm": 2.5762722492218018, + "learning_rate": 4.780005241090147e-05, + "loss": 0.1474, + "step": 1680 + }, + { + "epoch": 0.08857442348008386, + "grad_norm": 1.0517301559448242, + "learning_rate": 4.778694968553459e-05, + "loss": 0.1648, + "step": 1690 + }, + { + "epoch": 0.08909853249475891, + "grad_norm": 1.400057315826416, + "learning_rate": 4.7773846960167715e-05, + "loss": 0.1446, + "step": 1700 + }, + { + "epoch": 0.08962264150943396, + "grad_norm": 0.9269096851348877, + "learning_rate": 4.776074423480084e-05, + "loss": 0.1373, + "step": 1710 + }, + { + "epoch": 0.09014675052410902, + "grad_norm": 1.1477895975112915, + "learning_rate": 4.774764150943397e-05, + "loss": 0.1643, + "step": 1720 + }, + { + "epoch": 0.09067085953878407, + "grad_norm": 1.0856636762619019, + "learning_rate": 4.773453878406709e-05, + "loss": 0.1464, + "step": 1730 + }, + { + "epoch": 0.09119496855345911, + "grad_norm": 1.5389838218688965, + "learning_rate": 4.7721436058700215e-05, + "loss": 0.1532, + "step": 1740 + }, + { + "epoch": 0.09171907756813417, + "grad_norm": 6.00367546081543, + "learning_rate": 4.770833333333334e-05, + "loss": 0.1527, + "step": 1750 + }, + { + "epoch": 0.09224318658280922, + "grad_norm": 1.1369270086288452, + "learning_rate": 4.7695230607966455e-05, + "loss": 0.1323, + "step": 1760 + }, + { + "epoch": 0.09276729559748427, + "grad_norm": 2.0414113998413086, + "learning_rate": 4.768212788259958e-05, + "loss": 0.1486, + "step": 1770 + }, + { + "epoch": 0.09329140461215933, + "grad_norm": 2.0718820095062256, + "learning_rate": 4.76690251572327e-05, + "loss": 0.1626, + "step": 1780 + }, + { + "epoch": 0.09381551362683438, + "grad_norm": 2.329056739807129, + "learning_rate": 4.765592243186583e-05, + "loss": 0.1432, + "step": 1790 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 1.9668174982070923, + "learning_rate": 4.7642819706498956e-05, + "loss": 0.1597, + "step": 1800 + }, + { + "epoch": 0.09486373165618449, + "grad_norm": 2.178865671157837, + "learning_rate": 4.762971698113208e-05, + "loss": 0.1787, + "step": 1810 + }, + { + "epoch": 0.09538784067085954, + "grad_norm": 2.251453161239624, + "learning_rate": 4.76166142557652e-05, + "loss": 0.1425, + "step": 1820 + }, + { + "epoch": 0.0959119496855346, + "grad_norm": 2.2333481311798096, + "learning_rate": 4.7603511530398326e-05, + "loss": 0.1417, + "step": 1830 + }, + { + "epoch": 0.09643605870020965, + "grad_norm": 3.560199022293091, + "learning_rate": 4.759040880503145e-05, + "loss": 0.1252, + "step": 1840 + }, + { + "epoch": 0.09696016771488469, + "grad_norm": 2.0870866775512695, + "learning_rate": 4.757730607966457e-05, + "loss": 0.1617, + "step": 1850 + }, + { + "epoch": 0.09748427672955975, + "grad_norm": 1.6968202590942383, + "learning_rate": 4.7564203354297696e-05, + "loss": 0.1438, + "step": 1860 + }, + { + "epoch": 0.0980083857442348, + "grad_norm": 1.149673581123352, + "learning_rate": 4.755110062893082e-05, + "loss": 0.1519, + "step": 1870 + }, + { + "epoch": 0.09853249475890985, + "grad_norm": 1.1046313047409058, + "learning_rate": 4.753799790356394e-05, + "loss": 0.1318, + "step": 1880 + }, + { + "epoch": 0.09905660377358491, + "grad_norm": 1.192717432975769, + "learning_rate": 4.7524895178197066e-05, + "loss": 0.1514, + "step": 1890 + }, + { + "epoch": 0.09958071278825996, + "grad_norm": 1.7590820789337158, + "learning_rate": 4.751179245283019e-05, + "loss": 0.1653, + "step": 1900 + }, + { + "epoch": 0.100104821802935, + "grad_norm": 1.865303635597229, + "learning_rate": 4.749868972746331e-05, + "loss": 0.1605, + "step": 1910 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 1.695488452911377, + "learning_rate": 4.7485587002096436e-05, + "loss": 0.1473, + "step": 1920 + }, + { + "epoch": 0.10115303983228512, + "grad_norm": 1.4552061557769775, + "learning_rate": 4.747248427672956e-05, + "loss": 0.1402, + "step": 1930 + }, + { + "epoch": 0.10167714884696016, + "grad_norm": 3.4640729427337646, + "learning_rate": 4.745938155136268e-05, + "loss": 0.1738, + "step": 1940 + }, + { + "epoch": 0.10220125786163523, + "grad_norm": 1.700392484664917, + "learning_rate": 4.744627882599581e-05, + "loss": 0.1367, + "step": 1950 + }, + { + "epoch": 0.10272536687631027, + "grad_norm": 1.9521784782409668, + "learning_rate": 4.743317610062894e-05, + "loss": 0.1459, + "step": 1960 + }, + { + "epoch": 0.10324947589098532, + "grad_norm": 2.07822322845459, + "learning_rate": 4.742007337526206e-05, + "loss": 0.1851, + "step": 1970 + }, + { + "epoch": 0.10377358490566038, + "grad_norm": 1.2537726163864136, + "learning_rate": 4.7406970649895183e-05, + "loss": 0.1588, + "step": 1980 + }, + { + "epoch": 0.10429769392033543, + "grad_norm": 3.269179105758667, + "learning_rate": 4.739386792452831e-05, + "loss": 0.1697, + "step": 1990 + }, + { + "epoch": 0.10482180293501048, + "grad_norm": 1.4735572338104248, + "learning_rate": 4.7380765199161423e-05, + "loss": 0.1634, + "step": 2000 + }, + { + "epoch": 0.10482180293501048, + "eval_loss": 0.3126026391983032, + "eval_runtime": 267.0186, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 1.243, + "step": 2000 + }, + { + "epoch": 0.10534591194968554, + "grad_norm": 1.6849536895751953, + "learning_rate": 4.736766247379455e-05, + "loss": 0.1624, + "step": 2010 + }, + { + "epoch": 0.10587002096436059, + "grad_norm": 1.844832181930542, + "learning_rate": 4.735455974842768e-05, + "loss": 0.144, + "step": 2020 + }, + { + "epoch": 0.10639412997903563, + "grad_norm": 2.9537103176116943, + "learning_rate": 4.73414570230608e-05, + "loss": 0.1877, + "step": 2030 + }, + { + "epoch": 0.1069182389937107, + "grad_norm": 0.9054603576660156, + "learning_rate": 4.7328354297693924e-05, + "loss": 0.1407, + "step": 2040 + }, + { + "epoch": 0.10744234800838574, + "grad_norm": 1.7040249109268188, + "learning_rate": 4.731525157232705e-05, + "loss": 0.1432, + "step": 2050 + }, + { + "epoch": 0.10796645702306079, + "grad_norm": 4.041100025177002, + "learning_rate": 4.730214884696017e-05, + "loss": 0.1575, + "step": 2060 + }, + { + "epoch": 0.10849056603773585, + "grad_norm": 1.2942545413970947, + "learning_rate": 4.7289046121593294e-05, + "loss": 0.1458, + "step": 2070 + }, + { + "epoch": 0.1090146750524109, + "grad_norm": 2.1114304065704346, + "learning_rate": 4.727594339622642e-05, + "loss": 0.1529, + "step": 2080 + }, + { + "epoch": 0.10953878406708595, + "grad_norm": 1.232842206954956, + "learning_rate": 4.726284067085954e-05, + "loss": 0.158, + "step": 2090 + }, + { + "epoch": 0.11006289308176101, + "grad_norm": 1.1483700275421143, + "learning_rate": 4.7249737945492664e-05, + "loss": 0.171, + "step": 2100 + }, + { + "epoch": 0.11058700209643606, + "grad_norm": 2.2358574867248535, + "learning_rate": 4.7236635220125794e-05, + "loss": 0.138, + "step": 2110 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.9957813620567322, + "learning_rate": 4.722353249475891e-05, + "loss": 0.1364, + "step": 2120 + }, + { + "epoch": 0.11163522012578617, + "grad_norm": 1.6623884439468384, + "learning_rate": 4.7210429769392034e-05, + "loss": 0.1433, + "step": 2130 + }, + { + "epoch": 0.11215932914046121, + "grad_norm": 1.312627911567688, + "learning_rate": 4.719732704402516e-05, + "loss": 0.1549, + "step": 2140 + }, + { + "epoch": 0.11268343815513626, + "grad_norm": 1.5365536212921143, + "learning_rate": 4.718422431865828e-05, + "loss": 0.1608, + "step": 2150 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 2.1387362480163574, + "learning_rate": 4.7171121593291404e-05, + "loss": 0.1678, + "step": 2160 + }, + { + "epoch": 0.11373165618448637, + "grad_norm": 1.675684928894043, + "learning_rate": 4.715801886792453e-05, + "loss": 0.1411, + "step": 2170 + }, + { + "epoch": 0.11425576519916142, + "grad_norm": 2.3778798580169678, + "learning_rate": 4.714491614255766e-05, + "loss": 0.1283, + "step": 2180 + }, + { + "epoch": 0.11477987421383648, + "grad_norm": 1.5061962604522705, + "learning_rate": 4.713181341719078e-05, + "loss": 0.1551, + "step": 2190 + }, + { + "epoch": 0.11530398322851153, + "grad_norm": 1.1273947954177856, + "learning_rate": 4.7118710691823905e-05, + "loss": 0.1316, + "step": 2200 + }, + { + "epoch": 0.11582809224318659, + "grad_norm": 1.9337362051010132, + "learning_rate": 4.710560796645703e-05, + "loss": 0.1611, + "step": 2210 + }, + { + "epoch": 0.11635220125786164, + "grad_norm": 5.833995342254639, + "learning_rate": 4.709250524109015e-05, + "loss": 0.1447, + "step": 2220 + }, + { + "epoch": 0.11687631027253668, + "grad_norm": 1.9764273166656494, + "learning_rate": 4.7079402515723275e-05, + "loss": 0.154, + "step": 2230 + }, + { + "epoch": 0.11740041928721175, + "grad_norm": 1.5811856985092163, + "learning_rate": 4.706629979035639e-05, + "loss": 0.1399, + "step": 2240 + }, + { + "epoch": 0.1179245283018868, + "grad_norm": 1.6124383211135864, + "learning_rate": 4.7053197064989515e-05, + "loss": 0.1416, + "step": 2250 + }, + { + "epoch": 0.11844863731656184, + "grad_norm": 1.4890698194503784, + "learning_rate": 4.7040094339622645e-05, + "loss": 0.1595, + "step": 2260 + }, + { + "epoch": 0.1189727463312369, + "grad_norm": 1.7862485647201538, + "learning_rate": 4.702699161425577e-05, + "loss": 0.167, + "step": 2270 + }, + { + "epoch": 0.11949685534591195, + "grad_norm": 1.266352891921997, + "learning_rate": 4.701388888888889e-05, + "loss": 0.1934, + "step": 2280 + }, + { + "epoch": 0.120020964360587, + "grad_norm": 2.252445697784424, + "learning_rate": 4.7000786163522015e-05, + "loss": 0.1665, + "step": 2290 + }, + { + "epoch": 0.12054507337526206, + "grad_norm": 0.7538189888000488, + "learning_rate": 4.698768343815514e-05, + "loss": 0.1292, + "step": 2300 + }, + { + "epoch": 0.12106918238993711, + "grad_norm": 1.1450074911117554, + "learning_rate": 4.697458071278826e-05, + "loss": 0.1515, + "step": 2310 + }, + { + "epoch": 0.12159329140461216, + "grad_norm": 1.5450239181518555, + "learning_rate": 4.6961477987421385e-05, + "loss": 0.1658, + "step": 2320 + }, + { + "epoch": 0.12211740041928722, + "grad_norm": 2.0828919410705566, + "learning_rate": 4.694837526205451e-05, + "loss": 0.1334, + "step": 2330 + }, + { + "epoch": 0.12264150943396226, + "grad_norm": 3.190901756286621, + "learning_rate": 4.693527253668764e-05, + "loss": 0.1313, + "step": 2340 + }, + { + "epoch": 0.12316561844863731, + "grad_norm": 1.8129808902740479, + "learning_rate": 4.692216981132076e-05, + "loss": 0.1695, + "step": 2350 + }, + { + "epoch": 0.12368972746331237, + "grad_norm": 1.8840081691741943, + "learning_rate": 4.690906708595388e-05, + "loss": 0.1409, + "step": 2360 + }, + { + "epoch": 0.12421383647798742, + "grad_norm": 1.7952953577041626, + "learning_rate": 4.6895964360587e-05, + "loss": 0.1132, + "step": 2370 + }, + { + "epoch": 0.12473794549266247, + "grad_norm": 2.009758949279785, + "learning_rate": 4.6882861635220126e-05, + "loss": 0.1673, + "step": 2380 + }, + { + "epoch": 0.12526205450733752, + "grad_norm": 1.2804065942764282, + "learning_rate": 4.686975890985325e-05, + "loss": 0.1489, + "step": 2390 + }, + { + "epoch": 0.12578616352201258, + "grad_norm": 1.7180591821670532, + "learning_rate": 4.685665618448637e-05, + "loss": 0.1653, + "step": 2400 + }, + { + "epoch": 0.12631027253668764, + "grad_norm": 3.489091634750366, + "learning_rate": 4.6843553459119496e-05, + "loss": 0.128, + "step": 2410 + }, + { + "epoch": 0.12683438155136267, + "grad_norm": 2.4423317909240723, + "learning_rate": 4.6830450733752626e-05, + "loss": 0.1706, + "step": 2420 + }, + { + "epoch": 0.12735849056603774, + "grad_norm": 0.7794014811515808, + "learning_rate": 4.681734800838575e-05, + "loss": 0.143, + "step": 2430 + }, + { + "epoch": 0.1278825995807128, + "grad_norm": 1.549364447593689, + "learning_rate": 4.680424528301887e-05, + "loss": 0.163, + "step": 2440 + }, + { + "epoch": 0.12840670859538783, + "grad_norm": 4.103890895843506, + "learning_rate": 4.6791142557651996e-05, + "loss": 0.1499, + "step": 2450 + }, + { + "epoch": 0.1289308176100629, + "grad_norm": 1.770443081855774, + "learning_rate": 4.677803983228512e-05, + "loss": 0.1629, + "step": 2460 + }, + { + "epoch": 0.12945492662473795, + "grad_norm": 2.194091796875, + "learning_rate": 4.6764937106918236e-05, + "loss": 0.198, + "step": 2470 + }, + { + "epoch": 0.129979035639413, + "grad_norm": 1.8205286264419556, + "learning_rate": 4.675183438155136e-05, + "loss": 0.1464, + "step": 2480 + }, + { + "epoch": 0.13050314465408805, + "grad_norm": 1.9129751920700073, + "learning_rate": 4.673873165618449e-05, + "loss": 0.1567, + "step": 2490 + }, + { + "epoch": 0.1310272536687631, + "grad_norm": 1.2685729265213013, + "learning_rate": 4.672562893081761e-05, + "loss": 0.1614, + "step": 2500 + }, + { + "epoch": 0.13155136268343814, + "grad_norm": 2.170300006866455, + "learning_rate": 4.6712526205450736e-05, + "loss": 0.153, + "step": 2510 + }, + { + "epoch": 0.1320754716981132, + "grad_norm": 0.7259665727615356, + "learning_rate": 4.669942348008386e-05, + "loss": 0.1171, + "step": 2520 + }, + { + "epoch": 0.13259958071278827, + "grad_norm": 1.8095953464508057, + "learning_rate": 4.668632075471698e-05, + "loss": 0.1621, + "step": 2530 + }, + { + "epoch": 0.1331236897274633, + "grad_norm": 3.30016827583313, + "learning_rate": 4.6673218029350107e-05, + "loss": 0.1727, + "step": 2540 + }, + { + "epoch": 0.13364779874213836, + "grad_norm": 1.0500829219818115, + "learning_rate": 4.666011530398323e-05, + "loss": 0.177, + "step": 2550 + }, + { + "epoch": 0.13417190775681342, + "grad_norm": 2.164457321166992, + "learning_rate": 4.664701257861635e-05, + "loss": 0.1632, + "step": 2560 + }, + { + "epoch": 0.13469601677148846, + "grad_norm": 1.6930001974105835, + "learning_rate": 4.663390985324948e-05, + "loss": 0.1496, + "step": 2570 + }, + { + "epoch": 0.13522012578616352, + "grad_norm": 2.6686787605285645, + "learning_rate": 4.662080712788261e-05, + "loss": 0.1436, + "step": 2580 + }, + { + "epoch": 0.13574423480083858, + "grad_norm": 1.6781806945800781, + "learning_rate": 4.6607704402515723e-05, + "loss": 0.1369, + "step": 2590 + }, + { + "epoch": 0.13626834381551362, + "grad_norm": 1.3258206844329834, + "learning_rate": 4.659460167714885e-05, + "loss": 0.1494, + "step": 2600 + }, + { + "epoch": 0.13679245283018868, + "grad_norm": 2.0583879947662354, + "learning_rate": 4.658149895178197e-05, + "loss": 0.15, + "step": 2610 + }, + { + "epoch": 0.13731656184486374, + "grad_norm": 3.495466709136963, + "learning_rate": 4.6568396226415094e-05, + "loss": 0.1532, + "step": 2620 + }, + { + "epoch": 0.13784067085953877, + "grad_norm": 2.6582422256469727, + "learning_rate": 4.655529350104822e-05, + "loss": 0.1372, + "step": 2630 + }, + { + "epoch": 0.13836477987421383, + "grad_norm": 2.5871691703796387, + "learning_rate": 4.654219077568134e-05, + "loss": 0.1544, + "step": 2640 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 3.1617019176483154, + "learning_rate": 4.652908805031447e-05, + "loss": 0.1149, + "step": 2650 + }, + { + "epoch": 0.13941299790356393, + "grad_norm": 1.8542845249176025, + "learning_rate": 4.6515985324947594e-05, + "loss": 0.1354, + "step": 2660 + }, + { + "epoch": 0.139937106918239, + "grad_norm": 1.4777790307998657, + "learning_rate": 4.650288259958072e-05, + "loss": 0.1347, + "step": 2670 + }, + { + "epoch": 0.14046121593291405, + "grad_norm": 2.647198438644409, + "learning_rate": 4.648977987421384e-05, + "loss": 0.1698, + "step": 2680 + }, + { + "epoch": 0.14098532494758909, + "grad_norm": 2.088172197341919, + "learning_rate": 4.6476677148846964e-05, + "loss": 0.1414, + "step": 2690 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 1.4307177066802979, + "learning_rate": 4.646357442348009e-05, + "loss": 0.1309, + "step": 2700 + }, + { + "epoch": 0.1420335429769392, + "grad_norm": 1.9990031719207764, + "learning_rate": 4.6450471698113204e-05, + "loss": 0.1448, + "step": 2710 + }, + { + "epoch": 0.14255765199161424, + "grad_norm": 2.125288724899292, + "learning_rate": 4.6437368972746334e-05, + "loss": 0.1224, + "step": 2720 + }, + { + "epoch": 0.1430817610062893, + "grad_norm": 1.175098180770874, + "learning_rate": 4.642426624737946e-05, + "loss": 0.1375, + "step": 2730 + }, + { + "epoch": 0.14360587002096437, + "grad_norm": 2.440237522125244, + "learning_rate": 4.641116352201258e-05, + "loss": 0.1617, + "step": 2740 + }, + { + "epoch": 0.1441299790356394, + "grad_norm": 1.116590976715088, + "learning_rate": 4.6398060796645704e-05, + "loss": 0.121, + "step": 2750 + }, + { + "epoch": 0.14465408805031446, + "grad_norm": 2.4094574451446533, + "learning_rate": 4.638495807127883e-05, + "loss": 0.1326, + "step": 2760 + }, + { + "epoch": 0.14517819706498952, + "grad_norm": 1.5828850269317627, + "learning_rate": 4.637185534591195e-05, + "loss": 0.1741, + "step": 2770 + }, + { + "epoch": 0.14570230607966456, + "grad_norm": 1.1513792276382446, + "learning_rate": 4.6358752620545075e-05, + "loss": 0.1224, + "step": 2780 + }, + { + "epoch": 0.14622641509433962, + "grad_norm": 1.0948734283447266, + "learning_rate": 4.63456498951782e-05, + "loss": 0.1681, + "step": 2790 + }, + { + "epoch": 0.14675052410901468, + "grad_norm": 1.7512544393539429, + "learning_rate": 4.633254716981132e-05, + "loss": 0.1803, + "step": 2800 + }, + { + "epoch": 0.14727463312368974, + "grad_norm": 1.3149117231369019, + "learning_rate": 4.631944444444445e-05, + "loss": 0.1433, + "step": 2810 + }, + { + "epoch": 0.14779874213836477, + "grad_norm": 1.12627112865448, + "learning_rate": 4.6306341719077575e-05, + "loss": 0.1586, + "step": 2820 + }, + { + "epoch": 0.14832285115303984, + "grad_norm": 1.3142722845077515, + "learning_rate": 4.629323899371069e-05, + "loss": 0.137, + "step": 2830 + }, + { + "epoch": 0.1488469601677149, + "grad_norm": 1.5012706518173218, + "learning_rate": 4.6280136268343815e-05, + "loss": 0.1382, + "step": 2840 + }, + { + "epoch": 0.14937106918238993, + "grad_norm": 2.6549742221832275, + "learning_rate": 4.626703354297694e-05, + "loss": 0.1624, + "step": 2850 + }, + { + "epoch": 0.149895178197065, + "grad_norm": 1.5423760414123535, + "learning_rate": 4.625393081761006e-05, + "loss": 0.136, + "step": 2860 + }, + { + "epoch": 0.15041928721174005, + "grad_norm": 1.6354541778564453, + "learning_rate": 4.6240828092243185e-05, + "loss": 0.1606, + "step": 2870 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 1.840997338294983, + "learning_rate": 4.6227725366876315e-05, + "loss": 0.1657, + "step": 2880 + }, + { + "epoch": 0.15146750524109015, + "grad_norm": 1.7014777660369873, + "learning_rate": 4.621462264150944e-05, + "loss": 0.152, + "step": 2890 + }, + { + "epoch": 0.1519916142557652, + "grad_norm": 1.1734257936477661, + "learning_rate": 4.620151991614256e-05, + "loss": 0.1468, + "step": 2900 + }, + { + "epoch": 0.15251572327044025, + "grad_norm": 1.7266418933868408, + "learning_rate": 4.6188417190775685e-05, + "loss": 0.1644, + "step": 2910 + }, + { + "epoch": 0.1530398322851153, + "grad_norm": 1.7747575044631958, + "learning_rate": 4.617531446540881e-05, + "loss": 0.1536, + "step": 2920 + }, + { + "epoch": 0.15356394129979037, + "grad_norm": 2.43758487701416, + "learning_rate": 4.616221174004193e-05, + "loss": 0.1576, + "step": 2930 + }, + { + "epoch": 0.1540880503144654, + "grad_norm": 1.5235657691955566, + "learning_rate": 4.6149109014675056e-05, + "loss": 0.1342, + "step": 2940 + }, + { + "epoch": 0.15461215932914046, + "grad_norm": 2.2934274673461914, + "learning_rate": 4.613600628930818e-05, + "loss": 0.1715, + "step": 2950 + }, + { + "epoch": 0.15513626834381553, + "grad_norm": 1.945422887802124, + "learning_rate": 4.61229035639413e-05, + "loss": 0.1629, + "step": 2960 + }, + { + "epoch": 0.15566037735849056, + "grad_norm": 1.4969581365585327, + "learning_rate": 4.6109800838574426e-05, + "loss": 0.1411, + "step": 2970 + }, + { + "epoch": 0.15618448637316562, + "grad_norm": 1.6622493267059326, + "learning_rate": 4.609669811320755e-05, + "loss": 0.1424, + "step": 2980 + }, + { + "epoch": 0.15670859538784068, + "grad_norm": 1.6840288639068604, + "learning_rate": 4.608359538784067e-05, + "loss": 0.1623, + "step": 2990 + }, + { + "epoch": 0.15723270440251572, + "grad_norm": 1.4131284952163696, + "learning_rate": 4.6070492662473796e-05, + "loss": 0.1682, + "step": 3000 + }, + { + "epoch": 0.15723270440251572, + "eval_loss": 0.32990387082099915, + "eval_runtime": 267.4782, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 1.241, + "step": 3000 + }, + { + "epoch": 0.15775681341719078, + "grad_norm": 1.9176369905471802, + "learning_rate": 4.605738993710692e-05, + "loss": 0.1506, + "step": 3010 + }, + { + "epoch": 0.15828092243186584, + "grad_norm": 2.0427310466766357, + "learning_rate": 4.604428721174004e-05, + "loss": 0.139, + "step": 3020 + }, + { + "epoch": 0.15880503144654087, + "grad_norm": 0.8098589181900024, + "learning_rate": 4.6031184486373166e-05, + "loss": 0.1411, + "step": 3030 + }, + { + "epoch": 0.15932914046121593, + "grad_norm": 1.4167261123657227, + "learning_rate": 4.6018081761006296e-05, + "loss": 0.1425, + "step": 3040 + }, + { + "epoch": 0.159853249475891, + "grad_norm": 1.541549801826477, + "learning_rate": 4.600497903563942e-05, + "loss": 0.1243, + "step": 3050 + }, + { + "epoch": 0.16037735849056603, + "grad_norm": 2.3950603008270264, + "learning_rate": 4.599187631027254e-05, + "loss": 0.1456, + "step": 3060 + }, + { + "epoch": 0.1609014675052411, + "grad_norm": 1.3509130477905273, + "learning_rate": 4.597877358490566e-05, + "loss": 0.1447, + "step": 3070 + }, + { + "epoch": 0.16142557651991615, + "grad_norm": 1.6379474401474, + "learning_rate": 4.596567085953878e-05, + "loss": 0.1296, + "step": 3080 + }, + { + "epoch": 0.1619496855345912, + "grad_norm": 1.4416191577911377, + "learning_rate": 4.5952568134171906e-05, + "loss": 0.1413, + "step": 3090 + }, + { + "epoch": 0.16247379454926625, + "grad_norm": 1.3093364238739014, + "learning_rate": 4.593946540880503e-05, + "loss": 0.1566, + "step": 3100 + }, + { + "epoch": 0.1629979035639413, + "grad_norm": 1.1312339305877686, + "learning_rate": 4.592636268343816e-05, + "loss": 0.144, + "step": 3110 + }, + { + "epoch": 0.16352201257861634, + "grad_norm": 1.5782580375671387, + "learning_rate": 4.591325995807128e-05, + "loss": 0.1596, + "step": 3120 + }, + { + "epoch": 0.1640461215932914, + "grad_norm": 1.804060697555542, + "learning_rate": 4.5900157232704407e-05, + "loss": 0.1431, + "step": 3130 + }, + { + "epoch": 0.16457023060796647, + "grad_norm": 1.9302703142166138, + "learning_rate": 4.588705450733753e-05, + "loss": 0.1696, + "step": 3140 + }, + { + "epoch": 0.1650943396226415, + "grad_norm": 1.2807234525680542, + "learning_rate": 4.587395178197065e-05, + "loss": 0.1502, + "step": 3150 + }, + { + "epoch": 0.16561844863731656, + "grad_norm": 1.3993560075759888, + "learning_rate": 4.586084905660378e-05, + "loss": 0.1489, + "step": 3160 + }, + { + "epoch": 0.16614255765199162, + "grad_norm": 2.2172915935516357, + "learning_rate": 4.58477463312369e-05, + "loss": 0.14, + "step": 3170 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.70487117767334, + "learning_rate": 4.5834643605870024e-05, + "loss": 0.1605, + "step": 3180 + }, + { + "epoch": 0.16719077568134172, + "grad_norm": 1.2218254804611206, + "learning_rate": 4.582154088050315e-05, + "loss": 0.1503, + "step": 3190 + }, + { + "epoch": 0.16771488469601678, + "grad_norm": 1.8308320045471191, + "learning_rate": 4.580843815513627e-05, + "loss": 0.1308, + "step": 3200 + }, + { + "epoch": 0.16823899371069181, + "grad_norm": 2.082028865814209, + "learning_rate": 4.5795335429769394e-05, + "loss": 0.1643, + "step": 3210 + }, + { + "epoch": 0.16876310272536688, + "grad_norm": 1.1031092405319214, + "learning_rate": 4.578223270440252e-05, + "loss": 0.1329, + "step": 3220 + }, + { + "epoch": 0.16928721174004194, + "grad_norm": 1.128424048423767, + "learning_rate": 4.576912997903564e-05, + "loss": 0.1344, + "step": 3230 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 1.0885006189346313, + "learning_rate": 4.5756027253668764e-05, + "loss": 0.1365, + "step": 3240 + }, + { + "epoch": 0.17033542976939203, + "grad_norm": 1.0524739027023315, + "learning_rate": 4.574292452830189e-05, + "loss": 0.1377, + "step": 3250 + }, + { + "epoch": 0.1708595387840671, + "grad_norm": 1.4963139295578003, + "learning_rate": 4.572982180293501e-05, + "loss": 0.1555, + "step": 3260 + }, + { + "epoch": 0.17138364779874213, + "grad_norm": 1.3145508766174316, + "learning_rate": 4.571671907756814e-05, + "loss": 0.1345, + "step": 3270 + }, + { + "epoch": 0.1719077568134172, + "grad_norm": 1.2556843757629395, + "learning_rate": 4.5703616352201264e-05, + "loss": 0.1298, + "step": 3280 + }, + { + "epoch": 0.17243186582809225, + "grad_norm": 2.102116584777832, + "learning_rate": 4.569051362683439e-05, + "loss": 0.1717, + "step": 3290 + }, + { + "epoch": 0.17295597484276728, + "grad_norm": 2.472163438796997, + "learning_rate": 4.567741090146751e-05, + "loss": 0.1381, + "step": 3300 + }, + { + "epoch": 0.17348008385744235, + "grad_norm": 1.828572392463684, + "learning_rate": 4.566430817610063e-05, + "loss": 0.1226, + "step": 3310 + }, + { + "epoch": 0.1740041928721174, + "grad_norm": 1.566004991531372, + "learning_rate": 4.565120545073375e-05, + "loss": 0.1356, + "step": 3320 + }, + { + "epoch": 0.17452830188679244, + "grad_norm": 1.1402428150177002, + "learning_rate": 4.5638102725366874e-05, + "loss": 0.1192, + "step": 3330 + }, + { + "epoch": 0.1750524109014675, + "grad_norm": 1.3995243310928345, + "learning_rate": 4.5625e-05, + "loss": 0.1352, + "step": 3340 + }, + { + "epoch": 0.17557651991614256, + "grad_norm": 2.771331310272217, + "learning_rate": 4.561189727463313e-05, + "loss": 0.1585, + "step": 3350 + }, + { + "epoch": 0.1761006289308176, + "grad_norm": 2.0721607208251953, + "learning_rate": 4.559879454926625e-05, + "loss": 0.156, + "step": 3360 + }, + { + "epoch": 0.17662473794549266, + "grad_norm": 3.02065110206604, + "learning_rate": 4.5585691823899375e-05, + "loss": 0.1046, + "step": 3370 + }, + { + "epoch": 0.17714884696016772, + "grad_norm": 1.377319574356079, + "learning_rate": 4.55725890985325e-05, + "loss": 0.1795, + "step": 3380 + }, + { + "epoch": 0.17767295597484276, + "grad_norm": 2.061204195022583, + "learning_rate": 4.555948637316562e-05, + "loss": 0.1209, + "step": 3390 + }, + { + "epoch": 0.17819706498951782, + "grad_norm": 1.6569091081619263, + "learning_rate": 4.5546383647798745e-05, + "loss": 0.1244, + "step": 3400 + }, + { + "epoch": 0.17872117400419288, + "grad_norm": 1.5188376903533936, + "learning_rate": 4.553328092243187e-05, + "loss": 0.1589, + "step": 3410 + }, + { + "epoch": 0.1792452830188679, + "grad_norm": 1.463887095451355, + "learning_rate": 4.552017819706499e-05, + "loss": 0.1109, + "step": 3420 + }, + { + "epoch": 0.17976939203354297, + "grad_norm": 1.8769468069076538, + "learning_rate": 4.5507075471698115e-05, + "loss": 0.1313, + "step": 3430 + }, + { + "epoch": 0.18029350104821804, + "grad_norm": 1.344223976135254, + "learning_rate": 4.549397274633124e-05, + "loss": 0.1406, + "step": 3440 + }, + { + "epoch": 0.18081761006289307, + "grad_norm": 1.0123755931854248, + "learning_rate": 4.548087002096436e-05, + "loss": 0.1449, + "step": 3450 + }, + { + "epoch": 0.18134171907756813, + "grad_norm": 0.779967725276947, + "learning_rate": 4.5467767295597485e-05, + "loss": 0.1201, + "step": 3460 + }, + { + "epoch": 0.1818658280922432, + "grad_norm": 1.4109629392623901, + "learning_rate": 4.545466457023061e-05, + "loss": 0.1449, + "step": 3470 + }, + { + "epoch": 0.18238993710691823, + "grad_norm": 0.7555325031280518, + "learning_rate": 4.544156184486373e-05, + "loss": 0.1368, + "step": 3480 + }, + { + "epoch": 0.1829140461215933, + "grad_norm": 2.1541240215301514, + "learning_rate": 4.5428459119496855e-05, + "loss": 0.1483, + "step": 3490 + }, + { + "epoch": 0.18343815513626835, + "grad_norm": 1.714505910873413, + "learning_rate": 4.541535639412998e-05, + "loss": 0.1706, + "step": 3500 + }, + { + "epoch": 0.18396226415094338, + "grad_norm": 3.6619317531585693, + "learning_rate": 4.540225366876311e-05, + "loss": 0.1669, + "step": 3510 + }, + { + "epoch": 0.18448637316561844, + "grad_norm": 1.6405800580978394, + "learning_rate": 4.538915094339623e-05, + "loss": 0.1463, + "step": 3520 + }, + { + "epoch": 0.1850104821802935, + "grad_norm": 1.3726412057876587, + "learning_rate": 4.5376048218029356e-05, + "loss": 0.1274, + "step": 3530 + }, + { + "epoch": 0.18553459119496854, + "grad_norm": 1.2834166288375854, + "learning_rate": 4.536294549266248e-05, + "loss": 0.1536, + "step": 3540 + }, + { + "epoch": 0.1860587002096436, + "grad_norm": 1.3806772232055664, + "learning_rate": 4.5349842767295596e-05, + "loss": 0.1745, + "step": 3550 + }, + { + "epoch": 0.18658280922431866, + "grad_norm": 1.5404632091522217, + "learning_rate": 4.533674004192872e-05, + "loss": 0.1519, + "step": 3560 + }, + { + "epoch": 0.1871069182389937, + "grad_norm": 1.23709237575531, + "learning_rate": 4.532363731656184e-05, + "loss": 0.1706, + "step": 3570 + }, + { + "epoch": 0.18763102725366876, + "grad_norm": 1.5122413635253906, + "learning_rate": 4.531053459119497e-05, + "loss": 0.1296, + "step": 3580 + }, + { + "epoch": 0.18815513626834382, + "grad_norm": 1.3612315654754639, + "learning_rate": 4.5297431865828096e-05, + "loss": 0.1705, + "step": 3590 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 2.4789493083953857, + "learning_rate": 4.528432914046122e-05, + "loss": 0.1162, + "step": 3600 + }, + { + "epoch": 0.18920335429769392, + "grad_norm": 1.856713891029358, + "learning_rate": 4.527122641509434e-05, + "loss": 0.1527, + "step": 3610 + }, + { + "epoch": 0.18972746331236898, + "grad_norm": 2.436396598815918, + "learning_rate": 4.5258123689727466e-05, + "loss": 0.1303, + "step": 3620 + }, + { + "epoch": 0.19025157232704404, + "grad_norm": 1.4787251949310303, + "learning_rate": 4.524502096436059e-05, + "loss": 0.1531, + "step": 3630 + }, + { + "epoch": 0.19077568134171907, + "grad_norm": 1.5141669511795044, + "learning_rate": 4.523191823899371e-05, + "loss": 0.1397, + "step": 3640 + }, + { + "epoch": 0.19129979035639413, + "grad_norm": 1.4852555990219116, + "learning_rate": 4.5218815513626836e-05, + "loss": 0.1645, + "step": 3650 + }, + { + "epoch": 0.1918238993710692, + "grad_norm": 2.069603204727173, + "learning_rate": 4.520571278825996e-05, + "loss": 0.1361, + "step": 3660 + }, + { + "epoch": 0.19234800838574423, + "grad_norm": 1.8766626119613647, + "learning_rate": 4.519261006289308e-05, + "loss": 0.1536, + "step": 3670 + }, + { + "epoch": 0.1928721174004193, + "grad_norm": 1.3918403387069702, + "learning_rate": 4.5179507337526206e-05, + "loss": 0.1456, + "step": 3680 + }, + { + "epoch": 0.19339622641509435, + "grad_norm": 3.741128444671631, + "learning_rate": 4.516640461215933e-05, + "loss": 0.1348, + "step": 3690 + }, + { + "epoch": 0.19392033542976939, + "grad_norm": 2.8250186443328857, + "learning_rate": 4.515330188679245e-05, + "loss": 0.1424, + "step": 3700 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 1.4397534132003784, + "learning_rate": 4.5140199161425577e-05, + "loss": 0.1534, + "step": 3710 + }, + { + "epoch": 0.1949685534591195, + "grad_norm": 1.3303152322769165, + "learning_rate": 4.51270964360587e-05, + "loss": 0.126, + "step": 3720 + }, + { + "epoch": 0.19549266247379454, + "grad_norm": 1.1962412595748901, + "learning_rate": 4.511399371069182e-05, + "loss": 0.1284, + "step": 3730 + }, + { + "epoch": 0.1960167714884696, + "grad_norm": 2.205249309539795, + "learning_rate": 4.5100890985324953e-05, + "loss": 0.1575, + "step": 3740 + }, + { + "epoch": 0.19654088050314467, + "grad_norm": 1.817156195640564, + "learning_rate": 4.508778825995808e-05, + "loss": 0.1523, + "step": 3750 + }, + { + "epoch": 0.1970649895178197, + "grad_norm": 1.2243177890777588, + "learning_rate": 4.50746855345912e-05, + "loss": 0.1379, + "step": 3760 + }, + { + "epoch": 0.19758909853249476, + "grad_norm": 1.650865077972412, + "learning_rate": 4.5061582809224324e-05, + "loss": 0.1374, + "step": 3770 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 2.2793819904327393, + "learning_rate": 4.504848008385745e-05, + "loss": 0.1408, + "step": 3780 + }, + { + "epoch": 0.19863731656184486, + "grad_norm": 2.1652002334594727, + "learning_rate": 4.5035377358490564e-05, + "loss": 0.1281, + "step": 3790 + }, + { + "epoch": 0.19916142557651992, + "grad_norm": 1.8161773681640625, + "learning_rate": 4.502227463312369e-05, + "loss": 0.1492, + "step": 3800 + }, + { + "epoch": 0.19968553459119498, + "grad_norm": 1.5104507207870483, + "learning_rate": 4.500917190775682e-05, + "loss": 0.1419, + "step": 3810 + }, + { + "epoch": 0.20020964360587, + "grad_norm": 2.545625925064087, + "learning_rate": 4.499606918238994e-05, + "loss": 0.1451, + "step": 3820 + }, + { + "epoch": 0.20073375262054508, + "grad_norm": 1.7046788930892944, + "learning_rate": 4.4982966457023064e-05, + "loss": 0.1623, + "step": 3830 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 1.6505851745605469, + "learning_rate": 4.496986373165619e-05, + "loss": 0.1357, + "step": 3840 + }, + { + "epoch": 0.20178197064989517, + "grad_norm": 7.002503395080566, + "learning_rate": 4.495676100628931e-05, + "loss": 0.1187, + "step": 3850 + }, + { + "epoch": 0.20230607966457023, + "grad_norm": 1.687934398651123, + "learning_rate": 4.4943658280922434e-05, + "loss": 0.1638, + "step": 3860 + }, + { + "epoch": 0.2028301886792453, + "grad_norm": 1.5736196041107178, + "learning_rate": 4.493055555555556e-05, + "loss": 0.1493, + "step": 3870 + }, + { + "epoch": 0.20335429769392033, + "grad_norm": 1.3231173753738403, + "learning_rate": 4.491745283018868e-05, + "loss": 0.1393, + "step": 3880 + }, + { + "epoch": 0.2038784067085954, + "grad_norm": 1.744145154953003, + "learning_rate": 4.4904350104821804e-05, + "loss": 0.1424, + "step": 3890 + }, + { + "epoch": 0.20440251572327045, + "grad_norm": 1.9657684564590454, + "learning_rate": 4.4891247379454934e-05, + "loss": 0.1211, + "step": 3900 + }, + { + "epoch": 0.20492662473794548, + "grad_norm": 1.672458529472351, + "learning_rate": 4.487814465408805e-05, + "loss": 0.1592, + "step": 3910 + }, + { + "epoch": 0.20545073375262055, + "grad_norm": 2.555405378341675, + "learning_rate": 4.4865041928721174e-05, + "loss": 0.1473, + "step": 3920 + }, + { + "epoch": 0.2059748427672956, + "grad_norm": 5.1308183670043945, + "learning_rate": 4.48519392033543e-05, + "loss": 0.153, + "step": 3930 + }, + { + "epoch": 0.20649895178197064, + "grad_norm": 0.9765024185180664, + "learning_rate": 4.483883647798742e-05, + "loss": 0.1173, + "step": 3940 + }, + { + "epoch": 0.2070230607966457, + "grad_norm": 2.002351999282837, + "learning_rate": 4.4825733752620545e-05, + "loss": 0.1367, + "step": 3950 + }, + { + "epoch": 0.20754716981132076, + "grad_norm": 4.004027843475342, + "learning_rate": 4.481263102725367e-05, + "loss": 0.1373, + "step": 3960 + }, + { + "epoch": 0.2080712788259958, + "grad_norm": 3.0619328022003174, + "learning_rate": 4.47995283018868e-05, + "loss": 0.1148, + "step": 3970 + }, + { + "epoch": 0.20859538784067086, + "grad_norm": 1.905088186264038, + "learning_rate": 4.478642557651992e-05, + "loss": 0.1673, + "step": 3980 + }, + { + "epoch": 0.20911949685534592, + "grad_norm": 1.8251434564590454, + "learning_rate": 4.4773322851153045e-05, + "loss": 0.1535, + "step": 3990 + }, + { + "epoch": 0.20964360587002095, + "grad_norm": 2.9351112842559814, + "learning_rate": 4.476022012578617e-05, + "loss": 0.1167, + "step": 4000 + }, + { + "epoch": 0.20964360587002095, + "eval_loss": 0.30853426456451416, + "eval_runtime": 267.8685, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 1.239, + "step": 4000 + }, + { + "epoch": 0.21016771488469602, + "grad_norm": 1.5548582077026367, + "learning_rate": 4.474711740041929e-05, + "loss": 0.1359, + "step": 4010 + }, + { + "epoch": 0.21069182389937108, + "grad_norm": 1.887024164199829, + "learning_rate": 4.473401467505241e-05, + "loss": 0.145, + "step": 4020 + }, + { + "epoch": 0.2112159329140461, + "grad_norm": 1.1486566066741943, + "learning_rate": 4.472091194968553e-05, + "loss": 0.1225, + "step": 4030 + }, + { + "epoch": 0.21174004192872117, + "grad_norm": 6.734325885772705, + "learning_rate": 4.470780922431866e-05, + "loss": 0.1309, + "step": 4040 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 1.878401756286621, + "learning_rate": 4.4694706498951785e-05, + "loss": 0.1526, + "step": 4050 + }, + { + "epoch": 0.21278825995807127, + "grad_norm": 1.8532344102859497, + "learning_rate": 4.468160377358491e-05, + "loss": 0.1794, + "step": 4060 + }, + { + "epoch": 0.21331236897274633, + "grad_norm": 1.9898567199707031, + "learning_rate": 4.466850104821803e-05, + "loss": 0.1432, + "step": 4070 + }, + { + "epoch": 0.2138364779874214, + "grad_norm": 1.4500483274459839, + "learning_rate": 4.4655398322851155e-05, + "loss": 0.1246, + "step": 4080 + }, + { + "epoch": 0.21436058700209643, + "grad_norm": 2.1289262771606445, + "learning_rate": 4.464229559748428e-05, + "loss": 0.1444, + "step": 4090 + }, + { + "epoch": 0.2148846960167715, + "grad_norm": 1.5706223249435425, + "learning_rate": 4.46291928721174e-05, + "loss": 0.135, + "step": 4100 + }, + { + "epoch": 0.21540880503144655, + "grad_norm": 1.5430450439453125, + "learning_rate": 4.4616090146750525e-05, + "loss": 0.1527, + "step": 4110 + }, + { + "epoch": 0.21593291404612158, + "grad_norm": 1.4780341386795044, + "learning_rate": 4.460298742138365e-05, + "loss": 0.1401, + "step": 4120 + }, + { + "epoch": 0.21645702306079664, + "grad_norm": 1.447033166885376, + "learning_rate": 4.458988469601678e-05, + "loss": 0.1461, + "step": 4130 + }, + { + "epoch": 0.2169811320754717, + "grad_norm": 1.8528786897659302, + "learning_rate": 4.4576781970649896e-05, + "loss": 0.1243, + "step": 4140 + }, + { + "epoch": 0.21750524109014674, + "grad_norm": 1.6013343334197998, + "learning_rate": 4.456367924528302e-05, + "loss": 0.1052, + "step": 4150 + }, + { + "epoch": 0.2180293501048218, + "grad_norm": 1.5129047632217407, + "learning_rate": 4.455057651991614e-05, + "loss": 0.1346, + "step": 4160 + }, + { + "epoch": 0.21855345911949686, + "grad_norm": 2.5807998180389404, + "learning_rate": 4.4537473794549266e-05, + "loss": 0.14, + "step": 4170 + }, + { + "epoch": 0.2190775681341719, + "grad_norm": 2.2867794036865234, + "learning_rate": 4.452437106918239e-05, + "loss": 0.1632, + "step": 4180 + }, + { + "epoch": 0.21960167714884696, + "grad_norm": 1.180045247077942, + "learning_rate": 4.451126834381551e-05, + "loss": 0.1398, + "step": 4190 + }, + { + "epoch": 0.22012578616352202, + "grad_norm": 1.8197094202041626, + "learning_rate": 4.449816561844864e-05, + "loss": 0.1282, + "step": 4200 + }, + { + "epoch": 0.22064989517819705, + "grad_norm": 1.807565689086914, + "learning_rate": 4.4485062893081766e-05, + "loss": 0.1569, + "step": 4210 + }, + { + "epoch": 0.22117400419287211, + "grad_norm": 2.2830843925476074, + "learning_rate": 4.447196016771489e-05, + "loss": 0.1479, + "step": 4220 + }, + { + "epoch": 0.22169811320754718, + "grad_norm": 1.467529058456421, + "learning_rate": 4.445885744234801e-05, + "loss": 0.1247, + "step": 4230 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.474076271057129, + "learning_rate": 4.4445754716981136e-05, + "loss": 0.1556, + "step": 4240 + }, + { + "epoch": 0.22274633123689727, + "grad_norm": 2.1999564170837402, + "learning_rate": 4.443265199161426e-05, + "loss": 0.1816, + "step": 4250 + }, + { + "epoch": 0.22327044025157233, + "grad_norm": 1.921810507774353, + "learning_rate": 4.4419549266247376e-05, + "loss": 0.1245, + "step": 4260 + }, + { + "epoch": 0.22379454926624737, + "grad_norm": 3.8777430057525635, + "learning_rate": 4.4406446540880506e-05, + "loss": 0.1503, + "step": 4270 + }, + { + "epoch": 0.22431865828092243, + "grad_norm": 2.411684989929199, + "learning_rate": 4.439334381551363e-05, + "loss": 0.1776, + "step": 4280 + }, + { + "epoch": 0.2248427672955975, + "grad_norm": 1.9826992750167847, + "learning_rate": 4.438024109014675e-05, + "loss": 0.124, + "step": 4290 + }, + { + "epoch": 0.22536687631027252, + "grad_norm": 1.5890358686447144, + "learning_rate": 4.4367138364779877e-05, + "loss": 0.1429, + "step": 4300 + }, + { + "epoch": 0.22589098532494759, + "grad_norm": 1.2754069566726685, + "learning_rate": 4.4354035639413e-05, + "loss": 0.1331, + "step": 4310 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 1.54972505569458, + "learning_rate": 4.434093291404612e-05, + "loss": 0.1105, + "step": 4320 + }, + { + "epoch": 0.22693920335429768, + "grad_norm": 2.839402437210083, + "learning_rate": 4.432783018867925e-05, + "loss": 0.1604, + "step": 4330 + }, + { + "epoch": 0.22746331236897274, + "grad_norm": 1.7896904945373535, + "learning_rate": 4.431472746331237e-05, + "loss": 0.1081, + "step": 4340 + }, + { + "epoch": 0.2279874213836478, + "grad_norm": 2.3019604682922363, + "learning_rate": 4.4301624737945494e-05, + "loss": 0.0951, + "step": 4350 + }, + { + "epoch": 0.22851153039832284, + "grad_norm": 1.953927755355835, + "learning_rate": 4.4288522012578624e-05, + "loss": 0.1508, + "step": 4360 + }, + { + "epoch": 0.2290356394129979, + "grad_norm": 2.379554510116577, + "learning_rate": 4.427541928721175e-05, + "loss": 0.1446, + "step": 4370 + }, + { + "epoch": 0.22955974842767296, + "grad_norm": 1.1505978107452393, + "learning_rate": 4.4262316561844864e-05, + "loss": 0.1378, + "step": 4380 + }, + { + "epoch": 0.23008385744234802, + "grad_norm": 1.7770054340362549, + "learning_rate": 4.424921383647799e-05, + "loss": 0.1432, + "step": 4390 + }, + { + "epoch": 0.23060796645702306, + "grad_norm": 2.470569133758545, + "learning_rate": 4.423611111111111e-05, + "loss": 0.1286, + "step": 4400 + }, + { + "epoch": 0.23113207547169812, + "grad_norm": 2.54297137260437, + "learning_rate": 4.4223008385744234e-05, + "loss": 0.1375, + "step": 4410 + }, + { + "epoch": 0.23165618448637318, + "grad_norm": 1.6648815870285034, + "learning_rate": 4.420990566037736e-05, + "loss": 0.1307, + "step": 4420 + }, + { + "epoch": 0.2321802935010482, + "grad_norm": 2.62874174118042, + "learning_rate": 4.419680293501048e-05, + "loss": 0.1549, + "step": 4430 + }, + { + "epoch": 0.23270440251572327, + "grad_norm": 1.2740308046340942, + "learning_rate": 4.418370020964361e-05, + "loss": 0.1303, + "step": 4440 + }, + { + "epoch": 0.23322851153039834, + "grad_norm": 2.0457019805908203, + "learning_rate": 4.4170597484276734e-05, + "loss": 0.1381, + "step": 4450 + }, + { + "epoch": 0.23375262054507337, + "grad_norm": 1.8146222829818726, + "learning_rate": 4.415749475890986e-05, + "loss": 0.1178, + "step": 4460 + }, + { + "epoch": 0.23427672955974843, + "grad_norm": 1.4437885284423828, + "learning_rate": 4.414439203354298e-05, + "loss": 0.1612, + "step": 4470 + }, + { + "epoch": 0.2348008385744235, + "grad_norm": 1.3668901920318604, + "learning_rate": 4.4131289308176104e-05, + "loss": 0.1306, + "step": 4480 + }, + { + "epoch": 0.23532494758909853, + "grad_norm": 1.2944235801696777, + "learning_rate": 4.411818658280923e-05, + "loss": 0.1164, + "step": 4490 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 1.7875624895095825, + "learning_rate": 4.4105083857442344e-05, + "loss": 0.1303, + "step": 4500 + }, + { + "epoch": 0.23637316561844865, + "grad_norm": 1.772647500038147, + "learning_rate": 4.4091981132075474e-05, + "loss": 0.1428, + "step": 4510 + }, + { + "epoch": 0.23689727463312368, + "grad_norm": 1.3781625032424927, + "learning_rate": 4.40788784067086e-05, + "loss": 0.1246, + "step": 4520 + }, + { + "epoch": 0.23742138364779874, + "grad_norm": 2.086047649383545, + "learning_rate": 4.406577568134172e-05, + "loss": 0.1437, + "step": 4530 + }, + { + "epoch": 0.2379454926624738, + "grad_norm": 1.900099754333496, + "learning_rate": 4.4052672955974845e-05, + "loss": 0.1324, + "step": 4540 + }, + { + "epoch": 0.23846960167714884, + "grad_norm": 5.876873016357422, + "learning_rate": 4.403957023060797e-05, + "loss": 0.1661, + "step": 4550 + }, + { + "epoch": 0.2389937106918239, + "grad_norm": 2.2485744953155518, + "learning_rate": 4.402646750524109e-05, + "loss": 0.1262, + "step": 4560 + }, + { + "epoch": 0.23951781970649896, + "grad_norm": 2.297826051712036, + "learning_rate": 4.4013364779874215e-05, + "loss": 0.1435, + "step": 4570 + }, + { + "epoch": 0.240041928721174, + "grad_norm": 1.4973938465118408, + "learning_rate": 4.400026205450734e-05, + "loss": 0.1309, + "step": 4580 + }, + { + "epoch": 0.24056603773584906, + "grad_norm": 3.2034554481506348, + "learning_rate": 4.398715932914046e-05, + "loss": 0.1406, + "step": 4590 + }, + { + "epoch": 0.24109014675052412, + "grad_norm": 1.022011160850525, + "learning_rate": 4.397405660377359e-05, + "loss": 0.1335, + "step": 4600 + }, + { + "epoch": 0.24161425576519915, + "grad_norm": 1.525408387184143, + "learning_rate": 4.3960953878406715e-05, + "loss": 0.1115, + "step": 4610 + }, + { + "epoch": 0.24213836477987422, + "grad_norm": 1.0966118574142456, + "learning_rate": 4.394785115303983e-05, + "loss": 0.1334, + "step": 4620 + }, + { + "epoch": 0.24266247379454928, + "grad_norm": 2.1737682819366455, + "learning_rate": 4.3934748427672955e-05, + "loss": 0.118, + "step": 4630 + }, + { + "epoch": 0.2431865828092243, + "grad_norm": 1.6387145519256592, + "learning_rate": 4.392164570230608e-05, + "loss": 0.1034, + "step": 4640 + }, + { + "epoch": 0.24371069182389937, + "grad_norm": 2.735123634338379, + "learning_rate": 4.39085429769392e-05, + "loss": 0.1379, + "step": 4650 + }, + { + "epoch": 0.24423480083857443, + "grad_norm": 3.40956974029541, + "learning_rate": 4.3895440251572325e-05, + "loss": 0.1388, + "step": 4660 + }, + { + "epoch": 0.24475890985324947, + "grad_norm": 2.1785049438476562, + "learning_rate": 4.3882337526205455e-05, + "loss": 0.1496, + "step": 4670 + }, + { + "epoch": 0.24528301886792453, + "grad_norm": 1.1879709959030151, + "learning_rate": 4.386923480083858e-05, + "loss": 0.1515, + "step": 4680 + }, + { + "epoch": 0.2458071278825996, + "grad_norm": 1.3993653059005737, + "learning_rate": 4.38561320754717e-05, + "loss": 0.1063, + "step": 4690 + }, + { + "epoch": 0.24633123689727462, + "grad_norm": 2.238290309906006, + "learning_rate": 4.3843029350104826e-05, + "loss": 0.1606, + "step": 4700 + }, + { + "epoch": 0.2468553459119497, + "grad_norm": 1.1759029626846313, + "learning_rate": 4.382992662473795e-05, + "loss": 0.1201, + "step": 4710 + }, + { + "epoch": 0.24737945492662475, + "grad_norm": 1.6260344982147217, + "learning_rate": 4.381682389937107e-05, + "loss": 0.1332, + "step": 4720 + }, + { + "epoch": 0.24790356394129978, + "grad_norm": 2.897883653640747, + "learning_rate": 4.3803721174004196e-05, + "loss": 0.1343, + "step": 4730 + }, + { + "epoch": 0.24842767295597484, + "grad_norm": 2.3788511753082275, + "learning_rate": 4.379061844863732e-05, + "loss": 0.1348, + "step": 4740 + }, + { + "epoch": 0.2489517819706499, + "grad_norm": 1.6649222373962402, + "learning_rate": 4.377751572327044e-05, + "loss": 0.1316, + "step": 4750 + }, + { + "epoch": 0.24947589098532494, + "grad_norm": 0.7832605242729187, + "learning_rate": 4.3764412997903566e-05, + "loss": 0.1674, + "step": 4760 + }, + { + "epoch": 0.25, + "grad_norm": 1.33087158203125, + "learning_rate": 4.375131027253669e-05, + "loss": 0.1434, + "step": 4770 + }, + { + "epoch": 0.25052410901467503, + "grad_norm": 2.168484687805176, + "learning_rate": 4.373820754716981e-05, + "loss": 0.0985, + "step": 4780 + }, + { + "epoch": 0.2510482180293501, + "grad_norm": 1.5246626138687134, + "learning_rate": 4.3725104821802936e-05, + "loss": 0.1304, + "step": 4790 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 1.0635180473327637, + "learning_rate": 4.371200209643606e-05, + "loss": 0.1401, + "step": 4800 + }, + { + "epoch": 0.2520964360587002, + "grad_norm": 3.1194510459899902, + "learning_rate": 4.369889937106918e-05, + "loss": 0.1053, + "step": 4810 + }, + { + "epoch": 0.2526205450733753, + "grad_norm": 1.4997973442077637, + "learning_rate": 4.3685796645702306e-05, + "loss": 0.1227, + "step": 4820 + }, + { + "epoch": 0.2531446540880503, + "grad_norm": 1.7188799381256104, + "learning_rate": 4.3672693920335436e-05, + "loss": 0.1178, + "step": 4830 + }, + { + "epoch": 0.25366876310272535, + "grad_norm": 1.7366551160812378, + "learning_rate": 4.365959119496856e-05, + "loss": 0.1291, + "step": 4840 + }, + { + "epoch": 0.25419287211740044, + "grad_norm": 1.3258930444717407, + "learning_rate": 4.364648846960168e-05, + "loss": 0.1202, + "step": 4850 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 2.1996500492095947, + "learning_rate": 4.36333857442348e-05, + "loss": 0.1506, + "step": 4860 + }, + { + "epoch": 0.2552410901467505, + "grad_norm": 1.105446219444275, + "learning_rate": 4.362028301886792e-05, + "loss": 0.147, + "step": 4870 + }, + { + "epoch": 0.2557651991614256, + "grad_norm": 1.894164800643921, + "learning_rate": 4.3607180293501046e-05, + "loss": 0.126, + "step": 4880 + }, + { + "epoch": 0.2562893081761006, + "grad_norm": 1.4393502473831177, + "learning_rate": 4.359407756813417e-05, + "loss": 0.1275, + "step": 4890 + }, + { + "epoch": 0.25681341719077566, + "grad_norm": 3.0993599891662598, + "learning_rate": 4.35809748427673e-05, + "loss": 0.1365, + "step": 4900 + }, + { + "epoch": 0.25733752620545075, + "grad_norm": 1.72710382938385, + "learning_rate": 4.3567872117400423e-05, + "loss": 0.1429, + "step": 4910 + }, + { + "epoch": 0.2578616352201258, + "grad_norm": 1.2111247777938843, + "learning_rate": 4.355476939203355e-05, + "loss": 0.137, + "step": 4920 + }, + { + "epoch": 0.2583857442348008, + "grad_norm": 2.8836333751678467, + "learning_rate": 4.354166666666667e-05, + "loss": 0.1212, + "step": 4930 + }, + { + "epoch": 0.2589098532494759, + "grad_norm": 1.2956396341323853, + "learning_rate": 4.3528563941299794e-05, + "loss": 0.1161, + "step": 4940 + }, + { + "epoch": 0.25943396226415094, + "grad_norm": 1.8439433574676514, + "learning_rate": 4.351546121593292e-05, + "loss": 0.1302, + "step": 4950 + }, + { + "epoch": 0.259958071278826, + "grad_norm": 1.5384888648986816, + "learning_rate": 4.350235849056604e-05, + "loss": 0.1121, + "step": 4960 + }, + { + "epoch": 0.26048218029350106, + "grad_norm": 3.545708179473877, + "learning_rate": 4.3489255765199164e-05, + "loss": 0.1298, + "step": 4970 + }, + { + "epoch": 0.2610062893081761, + "grad_norm": 1.2400144338607788, + "learning_rate": 4.347615303983229e-05, + "loss": 0.144, + "step": 4980 + }, + { + "epoch": 0.26153039832285113, + "grad_norm": 1.563647747039795, + "learning_rate": 4.346305031446541e-05, + "loss": 0.1564, + "step": 4990 + }, + { + "epoch": 0.2620545073375262, + "grad_norm": 1.5921865701675415, + "learning_rate": 4.3449947589098534e-05, + "loss": 0.1594, + "step": 5000 + }, + { + "epoch": 0.2620545073375262, + "eval_loss": 0.30120959877967834, + "eval_runtime": 267.2437, + "eval_samples_per_second": 7.45, + "eval_steps_per_second": 1.242, + "step": 5000 + }, + { + "epoch": 0.26257861635220126, + "grad_norm": 0.8612467646598816, + "learning_rate": 4.343684486373166e-05, + "loss": 0.114, + "step": 5010 + }, + { + "epoch": 0.2631027253668763, + "grad_norm": 2.6393580436706543, + "learning_rate": 4.342374213836478e-05, + "loss": 0.1402, + "step": 5020 + }, + { + "epoch": 0.2636268343815514, + "grad_norm": 1.4097959995269775, + "learning_rate": 4.3410639412997904e-05, + "loss": 0.1563, + "step": 5030 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 1.2873144149780273, + "learning_rate": 4.339753668763103e-05, + "loss": 0.1428, + "step": 5040 + }, + { + "epoch": 0.26467505241090145, + "grad_norm": 4.058903217315674, + "learning_rate": 4.338443396226415e-05, + "loss": 0.1603, + "step": 5050 + }, + { + "epoch": 0.26519916142557654, + "grad_norm": 2.0691044330596924, + "learning_rate": 4.337133123689728e-05, + "loss": 0.1526, + "step": 5060 + }, + { + "epoch": 0.26572327044025157, + "grad_norm": 1.3779879808425903, + "learning_rate": 4.3358228511530404e-05, + "loss": 0.1273, + "step": 5070 + }, + { + "epoch": 0.2662473794549266, + "grad_norm": 1.724474549293518, + "learning_rate": 4.334512578616353e-05, + "loss": 0.1202, + "step": 5080 + }, + { + "epoch": 0.2667714884696017, + "grad_norm": 0.8896052837371826, + "learning_rate": 4.333202306079665e-05, + "loss": 0.1302, + "step": 5090 + }, + { + "epoch": 0.2672955974842767, + "grad_norm": 1.071658968925476, + "learning_rate": 4.331892033542977e-05, + "loss": 0.1432, + "step": 5100 + }, + { + "epoch": 0.26781970649895176, + "grad_norm": 2.4866292476654053, + "learning_rate": 4.330581761006289e-05, + "loss": 0.1108, + "step": 5110 + }, + { + "epoch": 0.26834381551362685, + "grad_norm": 2.052104949951172, + "learning_rate": 4.3292714884696015e-05, + "loss": 0.1243, + "step": 5120 + }, + { + "epoch": 0.2688679245283019, + "grad_norm": 1.5990567207336426, + "learning_rate": 4.3279612159329145e-05, + "loss": 0.1131, + "step": 5130 + }, + { + "epoch": 0.2693920335429769, + "grad_norm": 2.5835673809051514, + "learning_rate": 4.326650943396227e-05, + "loss": 0.1157, + "step": 5140 + }, + { + "epoch": 0.269916142557652, + "grad_norm": 3.6429848670959473, + "learning_rate": 4.325340670859539e-05, + "loss": 0.1523, + "step": 5150 + }, + { + "epoch": 0.27044025157232704, + "grad_norm": 2.0268514156341553, + "learning_rate": 4.3240303983228515e-05, + "loss": 0.1389, + "step": 5160 + }, + { + "epoch": 0.2709643605870021, + "grad_norm": 0.8983651399612427, + "learning_rate": 4.322720125786164e-05, + "loss": 0.1175, + "step": 5170 + }, + { + "epoch": 0.27148846960167716, + "grad_norm": 2.746926784515381, + "learning_rate": 4.321409853249476e-05, + "loss": 0.1333, + "step": 5180 + }, + { + "epoch": 0.2720125786163522, + "grad_norm": 1.4841305017471313, + "learning_rate": 4.3200995807127885e-05, + "loss": 0.1648, + "step": 5190 + }, + { + "epoch": 0.27253668763102723, + "grad_norm": 1.7416741847991943, + "learning_rate": 4.318789308176101e-05, + "loss": 0.1382, + "step": 5200 + }, + { + "epoch": 0.2730607966457023, + "grad_norm": 1.7636288404464722, + "learning_rate": 4.317479035639413e-05, + "loss": 0.1399, + "step": 5210 + }, + { + "epoch": 0.27358490566037735, + "grad_norm": 2.511547803878784, + "learning_rate": 4.3161687631027255e-05, + "loss": 0.1647, + "step": 5220 + }, + { + "epoch": 0.2741090146750524, + "grad_norm": 3.5610642433166504, + "learning_rate": 4.314858490566038e-05, + "loss": 0.1673, + "step": 5230 + }, + { + "epoch": 0.2746331236897275, + "grad_norm": 1.4131566286087036, + "learning_rate": 4.31354821802935e-05, + "loss": 0.1264, + "step": 5240 + }, + { + "epoch": 0.2751572327044025, + "grad_norm": 1.3859107494354248, + "learning_rate": 4.3122379454926625e-05, + "loss": 0.1355, + "step": 5250 + }, + { + "epoch": 0.27568134171907754, + "grad_norm": 2.9283275604248047, + "learning_rate": 4.310927672955975e-05, + "loss": 0.152, + "step": 5260 + }, + { + "epoch": 0.27620545073375263, + "grad_norm": 5.7555437088012695, + "learning_rate": 4.309617400419287e-05, + "loss": 0.1556, + "step": 5270 + }, + { + "epoch": 0.27672955974842767, + "grad_norm": 1.091763973236084, + "learning_rate": 4.3083071278825995e-05, + "loss": 0.1106, + "step": 5280 + }, + { + "epoch": 0.2772536687631027, + "grad_norm": 2.568847179412842, + "learning_rate": 4.3069968553459126e-05, + "loss": 0.1428, + "step": 5290 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 3.7078545093536377, + "learning_rate": 4.305686582809225e-05, + "loss": 0.1443, + "step": 5300 + }, + { + "epoch": 0.2783018867924528, + "grad_norm": 5.091422080993652, + "learning_rate": 4.304376310272537e-05, + "loss": 0.1279, + "step": 5310 + }, + { + "epoch": 0.27882599580712786, + "grad_norm": 6.574954986572266, + "learning_rate": 4.3030660377358496e-05, + "loss": 0.1408, + "step": 5320 + }, + { + "epoch": 0.27935010482180295, + "grad_norm": 3.3132095336914062, + "learning_rate": 4.301755765199162e-05, + "loss": 0.1368, + "step": 5330 + }, + { + "epoch": 0.279874213836478, + "grad_norm": 1.6334365606307983, + "learning_rate": 4.3004454926624736e-05, + "loss": 0.1311, + "step": 5340 + }, + { + "epoch": 0.280398322851153, + "grad_norm": 1.6367018222808838, + "learning_rate": 4.299135220125786e-05, + "loss": 0.1481, + "step": 5350 + }, + { + "epoch": 0.2809224318658281, + "grad_norm": 1.8219107389450073, + "learning_rate": 4.297824947589099e-05, + "loss": 0.1447, + "step": 5360 + }, + { + "epoch": 0.28144654088050314, + "grad_norm": 2.499232292175293, + "learning_rate": 4.296514675052411e-05, + "loss": 0.151, + "step": 5370 + }, + { + "epoch": 0.28197064989517817, + "grad_norm": 2.656902313232422, + "learning_rate": 4.2952044025157236e-05, + "loss": 0.1118, + "step": 5380 + }, + { + "epoch": 0.28249475890985326, + "grad_norm": 4.296574592590332, + "learning_rate": 4.293894129979036e-05, + "loss": 0.1419, + "step": 5390 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 2.389521360397339, + "learning_rate": 4.292583857442348e-05, + "loss": 0.1075, + "step": 5400 + }, + { + "epoch": 0.28354297693920333, + "grad_norm": 2.1653847694396973, + "learning_rate": 4.2912735849056606e-05, + "loss": 0.1642, + "step": 5410 + }, + { + "epoch": 0.2840670859538784, + "grad_norm": 1.6562432050704956, + "learning_rate": 4.289963312368973e-05, + "loss": 0.1238, + "step": 5420 + }, + { + "epoch": 0.28459119496855345, + "grad_norm": 1.8484896421432495, + "learning_rate": 4.288653039832285e-05, + "loss": 0.1343, + "step": 5430 + }, + { + "epoch": 0.2851153039832285, + "grad_norm": 6.8420729637146, + "learning_rate": 4.2873427672955976e-05, + "loss": 0.1346, + "step": 5440 + }, + { + "epoch": 0.2856394129979036, + "grad_norm": 1.6548465490341187, + "learning_rate": 4.2860324947589107e-05, + "loss": 0.1245, + "step": 5450 + }, + { + "epoch": 0.2861635220125786, + "grad_norm": 1.1709433794021606, + "learning_rate": 4.284722222222222e-05, + "loss": 0.1375, + "step": 5460 + }, + { + "epoch": 0.28668763102725364, + "grad_norm": 1.1040029525756836, + "learning_rate": 4.2834119496855347e-05, + "loss": 0.1303, + "step": 5470 + }, + { + "epoch": 0.28721174004192873, + "grad_norm": 3.498873472213745, + "learning_rate": 4.282101677148847e-05, + "loss": 0.119, + "step": 5480 + }, + { + "epoch": 0.28773584905660377, + "grad_norm": 1.783225417137146, + "learning_rate": 4.280791404612159e-05, + "loss": 0.1121, + "step": 5490 + }, + { + "epoch": 0.2882599580712788, + "grad_norm": 1.912412166595459, + "learning_rate": 4.279481132075472e-05, + "loss": 0.1234, + "step": 5500 + }, + { + "epoch": 0.2887840670859539, + "grad_norm": 1.2715930938720703, + "learning_rate": 4.278170859538784e-05, + "loss": 0.1299, + "step": 5510 + }, + { + "epoch": 0.2893081761006289, + "grad_norm": 3.9322400093078613, + "learning_rate": 4.2768605870020963e-05, + "loss": 0.1656, + "step": 5520 + }, + { + "epoch": 0.28983228511530396, + "grad_norm": 2.730623245239258, + "learning_rate": 4.2755503144654094e-05, + "loss": 0.1322, + "step": 5530 + }, + { + "epoch": 0.29035639412997905, + "grad_norm": 2.107334852218628, + "learning_rate": 4.274240041928722e-05, + "loss": 0.1209, + "step": 5540 + }, + { + "epoch": 0.2908805031446541, + "grad_norm": 3.0399837493896484, + "learning_rate": 4.272929769392034e-05, + "loss": 0.1487, + "step": 5550 + }, + { + "epoch": 0.2914046121593291, + "grad_norm": 2.08225679397583, + "learning_rate": 4.2716194968553464e-05, + "loss": 0.1239, + "step": 5560 + }, + { + "epoch": 0.2919287211740042, + "grad_norm": 2.5326271057128906, + "learning_rate": 4.270309224318658e-05, + "loss": 0.1231, + "step": 5570 + }, + { + "epoch": 0.29245283018867924, + "grad_norm": 2.3042261600494385, + "learning_rate": 4.2689989517819704e-05, + "loss": 0.098, + "step": 5580 + }, + { + "epoch": 0.2929769392033543, + "grad_norm": 1.0774246454238892, + "learning_rate": 4.267688679245283e-05, + "loss": 0.1391, + "step": 5590 + }, + { + "epoch": 0.29350104821802936, + "grad_norm": 2.216782569885254, + "learning_rate": 4.266378406708596e-05, + "loss": 0.1151, + "step": 5600 + }, + { + "epoch": 0.2940251572327044, + "grad_norm": 3.898007869720459, + "learning_rate": 4.265068134171908e-05, + "loss": 0.1371, + "step": 5610 + }, + { + "epoch": 0.2945492662473795, + "grad_norm": 1.5311529636383057, + "learning_rate": 4.2637578616352204e-05, + "loss": 0.1396, + "step": 5620 + }, + { + "epoch": 0.2950733752620545, + "grad_norm": 2.2003555297851562, + "learning_rate": 4.262447589098533e-05, + "loss": 0.1196, + "step": 5630 + }, + { + "epoch": 0.29559748427672955, + "grad_norm": 2.0077884197235107, + "learning_rate": 4.261137316561845e-05, + "loss": 0.1381, + "step": 5640 + }, + { + "epoch": 0.29612159329140464, + "grad_norm": 2.7949230670928955, + "learning_rate": 4.2598270440251574e-05, + "loss": 0.1265, + "step": 5650 + }, + { + "epoch": 0.2966457023060797, + "grad_norm": 2.4836373329162598, + "learning_rate": 4.25851677148847e-05, + "loss": 0.1308, + "step": 5660 + }, + { + "epoch": 0.2971698113207547, + "grad_norm": 0.6496070027351379, + "learning_rate": 4.257206498951782e-05, + "loss": 0.1099, + "step": 5670 + }, + { + "epoch": 0.2976939203354298, + "grad_norm": 1.4952316284179688, + "learning_rate": 4.2558962264150944e-05, + "loss": 0.1284, + "step": 5680 + }, + { + "epoch": 0.29821802935010483, + "grad_norm": 2.3443973064422607, + "learning_rate": 4.254585953878407e-05, + "loss": 0.1258, + "step": 5690 + }, + { + "epoch": 0.29874213836477986, + "grad_norm": 1.2094248533248901, + "learning_rate": 4.253275681341719e-05, + "loss": 0.17, + "step": 5700 + }, + { + "epoch": 0.29926624737945495, + "grad_norm": 1.8085112571716309, + "learning_rate": 4.2519654088050315e-05, + "loss": 0.1549, + "step": 5710 + }, + { + "epoch": 0.29979035639413, + "grad_norm": 1.5376954078674316, + "learning_rate": 4.250655136268344e-05, + "loss": 0.1595, + "step": 5720 + }, + { + "epoch": 0.300314465408805, + "grad_norm": 1.7698854207992554, + "learning_rate": 4.249344863731656e-05, + "loss": 0.1736, + "step": 5730 + }, + { + "epoch": 0.3008385744234801, + "grad_norm": 1.0237462520599365, + "learning_rate": 4.2480345911949685e-05, + "loss": 0.1194, + "step": 5740 + }, + { + "epoch": 0.30136268343815514, + "grad_norm": 5.32814359664917, + "learning_rate": 4.246724318658281e-05, + "loss": 0.1446, + "step": 5750 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 1.3462707996368408, + "learning_rate": 4.245414046121594e-05, + "loss": 0.1287, + "step": 5760 + }, + { + "epoch": 0.30241090146750527, + "grad_norm": 2.194868326187134, + "learning_rate": 4.244103773584906e-05, + "loss": 0.1232, + "step": 5770 + }, + { + "epoch": 0.3029350104821803, + "grad_norm": 1.084516167640686, + "learning_rate": 4.2427935010482185e-05, + "loss": 0.1053, + "step": 5780 + }, + { + "epoch": 0.30345911949685533, + "grad_norm": 4.030144691467285, + "learning_rate": 4.241483228511531e-05, + "loss": 0.145, + "step": 5790 + }, + { + "epoch": 0.3039832285115304, + "grad_norm": 5.405220985412598, + "learning_rate": 4.240172955974843e-05, + "loss": 0.1405, + "step": 5800 + }, + { + "epoch": 0.30450733752620546, + "grad_norm": 5.14114236831665, + "learning_rate": 4.238862683438155e-05, + "loss": 0.1424, + "step": 5810 + }, + { + "epoch": 0.3050314465408805, + "grad_norm": 1.4198951721191406, + "learning_rate": 4.237552410901467e-05, + "loss": 0.1208, + "step": 5820 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 1.4760233163833618, + "learning_rate": 4.23624213836478e-05, + "loss": 0.1174, + "step": 5830 + }, + { + "epoch": 0.3060796645702306, + "grad_norm": 1.518790602684021, + "learning_rate": 4.2349318658280925e-05, + "loss": 0.1227, + "step": 5840 + }, + { + "epoch": 0.30660377358490565, + "grad_norm": 2.099006175994873, + "learning_rate": 4.233621593291405e-05, + "loss": 0.1212, + "step": 5850 + }, + { + "epoch": 0.30712788259958074, + "grad_norm": 3.2384157180786133, + "learning_rate": 4.232311320754717e-05, + "loss": 0.1491, + "step": 5860 + }, + { + "epoch": 0.30765199161425577, + "grad_norm": 1.7764296531677246, + "learning_rate": 4.2310010482180296e-05, + "loss": 0.14, + "step": 5870 + }, + { + "epoch": 0.3081761006289308, + "grad_norm": 0.9582472443580627, + "learning_rate": 4.229690775681342e-05, + "loss": 0.1425, + "step": 5880 + }, + { + "epoch": 0.3087002096436059, + "grad_norm": 1.730102300643921, + "learning_rate": 4.228380503144654e-05, + "loss": 0.1646, + "step": 5890 + }, + { + "epoch": 0.30922431865828093, + "grad_norm": 1.567408561706543, + "learning_rate": 4.2270702306079666e-05, + "loss": 0.1575, + "step": 5900 + }, + { + "epoch": 0.30974842767295596, + "grad_norm": 1.3192992210388184, + "learning_rate": 4.225759958071279e-05, + "loss": 0.1209, + "step": 5910 + }, + { + "epoch": 0.31027253668763105, + "grad_norm": 2.4381003379821777, + "learning_rate": 4.224449685534592e-05, + "loss": 0.1604, + "step": 5920 + }, + { + "epoch": 0.3107966457023061, + "grad_norm": 5.242435455322266, + "learning_rate": 4.2231394129979036e-05, + "loss": 0.1444, + "step": 5930 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 3.918074369430542, + "learning_rate": 4.221829140461216e-05, + "loss": 0.114, + "step": 5940 + }, + { + "epoch": 0.3118448637316562, + "grad_norm": 1.4617007970809937, + "learning_rate": 4.220518867924528e-05, + "loss": 0.1442, + "step": 5950 + }, + { + "epoch": 0.31236897274633124, + "grad_norm": 2.0521883964538574, + "learning_rate": 4.2192085953878406e-05, + "loss": 0.1241, + "step": 5960 + }, + { + "epoch": 0.3128930817610063, + "grad_norm": 1.4939581155776978, + "learning_rate": 4.217898322851153e-05, + "loss": 0.1282, + "step": 5970 + }, + { + "epoch": 0.31341719077568136, + "grad_norm": 3.2475051879882812, + "learning_rate": 4.216588050314465e-05, + "loss": 0.15, + "step": 5980 + }, + { + "epoch": 0.3139412997903564, + "grad_norm": 2.8565642833709717, + "learning_rate": 4.215277777777778e-05, + "loss": 0.1143, + "step": 5990 + }, + { + "epoch": 0.31446540880503143, + "grad_norm": 1.7236065864562988, + "learning_rate": 4.2139675052410906e-05, + "loss": 0.1419, + "step": 6000 + }, + { + "epoch": 0.31446540880503143, + "eval_loss": 0.30979835987091064, + "eval_runtime": 267.3439, + "eval_samples_per_second": 7.447, + "eval_steps_per_second": 1.242, + "step": 6000 + }, + { + "epoch": 0.3149895178197065, + "grad_norm": 1.8495186567306519, + "learning_rate": 4.212657232704403e-05, + "loss": 0.1558, + "step": 6010 + }, + { + "epoch": 0.31551362683438156, + "grad_norm": 0.4863542318344116, + "learning_rate": 4.211346960167715e-05, + "loss": 0.108, + "step": 6020 + }, + { + "epoch": 0.3160377358490566, + "grad_norm": 2.037104368209839, + "learning_rate": 4.2100366876310276e-05, + "loss": 0.1544, + "step": 6030 + }, + { + "epoch": 0.3165618448637317, + "grad_norm": 1.5306822061538696, + "learning_rate": 4.20872641509434e-05, + "loss": 0.109, + "step": 6040 + }, + { + "epoch": 0.3170859538784067, + "grad_norm": 0.885473906993866, + "learning_rate": 4.2074161425576516e-05, + "loss": 0.1273, + "step": 6050 + }, + { + "epoch": 0.31761006289308175, + "grad_norm": 1.3081514835357666, + "learning_rate": 4.206105870020965e-05, + "loss": 0.1408, + "step": 6060 + }, + { + "epoch": 0.31813417190775684, + "grad_norm": 2.302015542984009, + "learning_rate": 4.204795597484277e-05, + "loss": 0.1196, + "step": 6070 + }, + { + "epoch": 0.31865828092243187, + "grad_norm": 1.8867014646530151, + "learning_rate": 4.2034853249475893e-05, + "loss": 0.1052, + "step": 6080 + }, + { + "epoch": 0.3191823899371069, + "grad_norm": 1.1797665357589722, + "learning_rate": 4.202175052410902e-05, + "loss": 0.1478, + "step": 6090 + }, + { + "epoch": 0.319706498951782, + "grad_norm": 1.2745922803878784, + "learning_rate": 4.200864779874214e-05, + "loss": 0.1041, + "step": 6100 + }, + { + "epoch": 0.320230607966457, + "grad_norm": 2.031139850616455, + "learning_rate": 4.1995545073375264e-05, + "loss": 0.1437, + "step": 6110 + }, + { + "epoch": 0.32075471698113206, + "grad_norm": 1.2220163345336914, + "learning_rate": 4.198244234800839e-05, + "loss": 0.1411, + "step": 6120 + }, + { + "epoch": 0.32127882599580715, + "grad_norm": 2.0426888465881348, + "learning_rate": 4.196933962264151e-05, + "loss": 0.1147, + "step": 6130 + }, + { + "epoch": 0.3218029350104822, + "grad_norm": 3.2713701725006104, + "learning_rate": 4.1956236897274634e-05, + "loss": 0.1457, + "step": 6140 + }, + { + "epoch": 0.3223270440251572, + "grad_norm": 1.5383789539337158, + "learning_rate": 4.1943134171907764e-05, + "loss": 0.1035, + "step": 6150 + }, + { + "epoch": 0.3228511530398323, + "grad_norm": 2.0623626708984375, + "learning_rate": 4.193003144654089e-05, + "loss": 0.1259, + "step": 6160 + }, + { + "epoch": 0.32337526205450734, + "grad_norm": 2.215529441833496, + "learning_rate": 4.1916928721174004e-05, + "loss": 0.0988, + "step": 6170 + }, + { + "epoch": 0.3238993710691824, + "grad_norm": 1.0296365022659302, + "learning_rate": 4.190382599580713e-05, + "loss": 0.159, + "step": 6180 + }, + { + "epoch": 0.32442348008385746, + "grad_norm": 1.9438577890396118, + "learning_rate": 4.189072327044025e-05, + "loss": 0.1413, + "step": 6190 + }, + { + "epoch": 0.3249475890985325, + "grad_norm": 0.9789099097251892, + "learning_rate": 4.1877620545073374e-05, + "loss": 0.129, + "step": 6200 + }, + { + "epoch": 0.32547169811320753, + "grad_norm": 0.9686060547828674, + "learning_rate": 4.18645178197065e-05, + "loss": 0.1672, + "step": 6210 + }, + { + "epoch": 0.3259958071278826, + "grad_norm": 1.2769176959991455, + "learning_rate": 4.185141509433963e-05, + "loss": 0.1302, + "step": 6220 + }, + { + "epoch": 0.32651991614255765, + "grad_norm": 1.8056713342666626, + "learning_rate": 4.183831236897275e-05, + "loss": 0.1302, + "step": 6230 + }, + { + "epoch": 0.3270440251572327, + "grad_norm": 1.526102066040039, + "learning_rate": 4.1825209643605874e-05, + "loss": 0.1469, + "step": 6240 + }, + { + "epoch": 0.3275681341719078, + "grad_norm": 0.9519234895706177, + "learning_rate": 4.1812106918239e-05, + "loss": 0.1064, + "step": 6250 + }, + { + "epoch": 0.3280922431865828, + "grad_norm": 3.5756402015686035, + "learning_rate": 4.179900419287212e-05, + "loss": 0.1175, + "step": 6260 + }, + { + "epoch": 0.32861635220125784, + "grad_norm": 2.9796833992004395, + "learning_rate": 4.1785901467505245e-05, + "loss": 0.1332, + "step": 6270 + }, + { + "epoch": 0.32914046121593293, + "grad_norm": 2.4725258350372314, + "learning_rate": 4.177279874213837e-05, + "loss": 0.1116, + "step": 6280 + }, + { + "epoch": 0.32966457023060797, + "grad_norm": 1.5295337438583374, + "learning_rate": 4.175969601677149e-05, + "loss": 0.0934, + "step": 6290 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 2.7749555110931396, + "learning_rate": 4.1746593291404615e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.3307127882599581, + "grad_norm": 1.2054258584976196, + "learning_rate": 4.173349056603774e-05, + "loss": 0.1343, + "step": 6310 + }, + { + "epoch": 0.3312368972746331, + "grad_norm": 2.1348698139190674, + "learning_rate": 4.172038784067086e-05, + "loss": 0.1256, + "step": 6320 + }, + { + "epoch": 0.33176100628930816, + "grad_norm": 3.1444623470306396, + "learning_rate": 4.1707285115303985e-05, + "loss": 0.1095, + "step": 6330 + }, + { + "epoch": 0.33228511530398325, + "grad_norm": 1.7551017999649048, + "learning_rate": 4.169418238993711e-05, + "loss": 0.1224, + "step": 6340 + }, + { + "epoch": 0.3328092243186583, + "grad_norm": 6.100649833679199, + "learning_rate": 4.168107966457023e-05, + "loss": 0.1533, + "step": 6350 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.8948473930358887, + "learning_rate": 4.1667976939203355e-05, + "loss": 0.1126, + "step": 6360 + }, + { + "epoch": 0.3338574423480084, + "grad_norm": 1.9172160625457764, + "learning_rate": 4.165487421383648e-05, + "loss": 0.1407, + "step": 6370 + }, + { + "epoch": 0.33438155136268344, + "grad_norm": 1.2122879028320312, + "learning_rate": 4.164177148846961e-05, + "loss": 0.131, + "step": 6380 + }, + { + "epoch": 0.33490566037735847, + "grad_norm": 0.9031748175621033, + "learning_rate": 4.162866876310273e-05, + "loss": 0.1635, + "step": 6390 + }, + { + "epoch": 0.33542976939203356, + "grad_norm": 1.341102123260498, + "learning_rate": 4.1615566037735855e-05, + "loss": 0.1267, + "step": 6400 + }, + { + "epoch": 0.3359538784067086, + "grad_norm": 2.564326286315918, + "learning_rate": 4.160246331236897e-05, + "loss": 0.1287, + "step": 6410 + }, + { + "epoch": 0.33647798742138363, + "grad_norm": 2.386312246322632, + "learning_rate": 4.1589360587002095e-05, + "loss": 0.1269, + "step": 6420 + }, + { + "epoch": 0.3370020964360587, + "grad_norm": 1.0762248039245605, + "learning_rate": 4.157625786163522e-05, + "loss": 0.1315, + "step": 6430 + }, + { + "epoch": 0.33752620545073375, + "grad_norm": 0.7837504148483276, + "learning_rate": 4.156315513626834e-05, + "loss": 0.1281, + "step": 6440 + }, + { + "epoch": 0.3380503144654088, + "grad_norm": 0.790789783000946, + "learning_rate": 4.155005241090147e-05, + "loss": 0.1336, + "step": 6450 + }, + { + "epoch": 0.3385744234800839, + "grad_norm": 1.4993865489959717, + "learning_rate": 4.1536949685534596e-05, + "loss": 0.1262, + "step": 6460 + }, + { + "epoch": 0.3390985324947589, + "grad_norm": 1.7968735694885254, + "learning_rate": 4.152384696016772e-05, + "loss": 0.1357, + "step": 6470 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 1.5854703187942505, + "learning_rate": 4.151074423480084e-05, + "loss": 0.123, + "step": 6480 + }, + { + "epoch": 0.34014675052410903, + "grad_norm": 1.7488397359848022, + "learning_rate": 4.1497641509433966e-05, + "loss": 0.126, + "step": 6490 + }, + { + "epoch": 0.34067085953878407, + "grad_norm": 1.2105824947357178, + "learning_rate": 4.148453878406709e-05, + "loss": 0.1363, + "step": 6500 + }, + { + "epoch": 0.3411949685534591, + "grad_norm": 1.638002634048462, + "learning_rate": 4.147143605870021e-05, + "loss": 0.1188, + "step": 6510 + }, + { + "epoch": 0.3417190775681342, + "grad_norm": 1.4952040910720825, + "learning_rate": 4.1458333333333336e-05, + "loss": 0.1114, + "step": 6520 + }, + { + "epoch": 0.3422431865828092, + "grad_norm": 1.3935679197311401, + "learning_rate": 4.144523060796646e-05, + "loss": 0.1283, + "step": 6530 + }, + { + "epoch": 0.34276729559748426, + "grad_norm": 1.1780931949615479, + "learning_rate": 4.143212788259958e-05, + "loss": 0.1093, + "step": 6540 + }, + { + "epoch": 0.34329140461215935, + "grad_norm": 1.7428357601165771, + "learning_rate": 4.1419025157232706e-05, + "loss": 0.1586, + "step": 6550 + }, + { + "epoch": 0.3438155136268344, + "grad_norm": 1.624427080154419, + "learning_rate": 4.140592243186583e-05, + "loss": 0.114, + "step": 6560 + }, + { + "epoch": 0.3443396226415094, + "grad_norm": 2.378514051437378, + "learning_rate": 4.139281970649895e-05, + "loss": 0.1634, + "step": 6570 + }, + { + "epoch": 0.3448637316561845, + "grad_norm": 0.7246071696281433, + "learning_rate": 4.1379716981132076e-05, + "loss": 0.1095, + "step": 6580 + }, + { + "epoch": 0.34538784067085954, + "grad_norm": 1.7475792169570923, + "learning_rate": 4.13666142557652e-05, + "loss": 0.1379, + "step": 6590 + }, + { + "epoch": 0.34591194968553457, + "grad_norm": 1.2346446514129639, + "learning_rate": 4.135351153039832e-05, + "loss": 0.1196, + "step": 6600 + }, + { + "epoch": 0.34643605870020966, + "grad_norm": 1.024915099143982, + "learning_rate": 4.1340408805031446e-05, + "loss": 0.1486, + "step": 6610 + }, + { + "epoch": 0.3469601677148847, + "grad_norm": 11.363612174987793, + "learning_rate": 4.1327306079664577e-05, + "loss": 0.1026, + "step": 6620 + }, + { + "epoch": 0.3474842767295597, + "grad_norm": 1.540850281715393, + "learning_rate": 4.13142033542977e-05, + "loss": 0.1658, + "step": 6630 + }, + { + "epoch": 0.3480083857442348, + "grad_norm": 1.565014123916626, + "learning_rate": 4.130110062893082e-05, + "loss": 0.1321, + "step": 6640 + }, + { + "epoch": 0.34853249475890985, + "grad_norm": 2.709578275680542, + "learning_rate": 4.128799790356394e-05, + "loss": 0.1416, + "step": 6650 + }, + { + "epoch": 0.3490566037735849, + "grad_norm": 4.274057388305664, + "learning_rate": 4.127489517819706e-05, + "loss": 0.1303, + "step": 6660 + }, + { + "epoch": 0.34958071278826, + "grad_norm": 1.6477665901184082, + "learning_rate": 4.126179245283019e-05, + "loss": 0.1158, + "step": 6670 + }, + { + "epoch": 0.350104821802935, + "grad_norm": 2.525834798812866, + "learning_rate": 4.124868972746331e-05, + "loss": 0.1274, + "step": 6680 + }, + { + "epoch": 0.35062893081761004, + "grad_norm": 1.99843430519104, + "learning_rate": 4.123558700209644e-05, + "loss": 0.1149, + "step": 6690 + }, + { + "epoch": 0.35115303983228513, + "grad_norm": 1.540343165397644, + "learning_rate": 4.1222484276729564e-05, + "loss": 0.1383, + "step": 6700 + }, + { + "epoch": 0.35167714884696016, + "grad_norm": 1.622219443321228, + "learning_rate": 4.120938155136269e-05, + "loss": 0.1072, + "step": 6710 + }, + { + "epoch": 0.3522012578616352, + "grad_norm": 1.3897455930709839, + "learning_rate": 4.119627882599581e-05, + "loss": 0.1388, + "step": 6720 + }, + { + "epoch": 0.3527253668763103, + "grad_norm": 2.0670998096466064, + "learning_rate": 4.1183176100628934e-05, + "loss": 0.1493, + "step": 6730 + }, + { + "epoch": 0.3532494758909853, + "grad_norm": 1.3831391334533691, + "learning_rate": 4.117007337526206e-05, + "loss": 0.1381, + "step": 6740 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 2.390122413635254, + "learning_rate": 4.115697064989518e-05, + "loss": 0.1364, + "step": 6750 + }, + { + "epoch": 0.35429769392033544, + "grad_norm": 1.3669023513793945, + "learning_rate": 4.1143867924528304e-05, + "loss": 0.1264, + "step": 6760 + }, + { + "epoch": 0.3548218029350105, + "grad_norm": 1.4862803220748901, + "learning_rate": 4.113076519916143e-05, + "loss": 0.1375, + "step": 6770 + }, + { + "epoch": 0.3553459119496855, + "grad_norm": 2.1498663425445557, + "learning_rate": 4.111766247379455e-05, + "loss": 0.1314, + "step": 6780 + }, + { + "epoch": 0.3558700209643606, + "grad_norm": 0.7945308089256287, + "learning_rate": 4.1104559748427674e-05, + "loss": 0.1302, + "step": 6790 + }, + { + "epoch": 0.35639412997903563, + "grad_norm": 1.1641976833343506, + "learning_rate": 4.10914570230608e-05, + "loss": 0.1176, + "step": 6800 + }, + { + "epoch": 0.35691823899371067, + "grad_norm": 1.3097875118255615, + "learning_rate": 4.107835429769392e-05, + "loss": 0.1252, + "step": 6810 + }, + { + "epoch": 0.35744234800838576, + "grad_norm": 1.769045114517212, + "learning_rate": 4.1065251572327044e-05, + "loss": 0.15, + "step": 6820 + }, + { + "epoch": 0.3579664570230608, + "grad_norm": 1.7525954246520996, + "learning_rate": 4.105214884696017e-05, + "loss": 0.1611, + "step": 6830 + }, + { + "epoch": 0.3584905660377358, + "grad_norm": 1.655821681022644, + "learning_rate": 4.103904612159329e-05, + "loss": 0.1194, + "step": 6840 + }, + { + "epoch": 0.3590146750524109, + "grad_norm": 1.8870012760162354, + "learning_rate": 4.102594339622642e-05, + "loss": 0.1501, + "step": 6850 + }, + { + "epoch": 0.35953878406708595, + "grad_norm": 1.5264195203781128, + "learning_rate": 4.1012840670859545e-05, + "loss": 0.1432, + "step": 6860 + }, + { + "epoch": 0.360062893081761, + "grad_norm": 1.597440242767334, + "learning_rate": 4.099973794549267e-05, + "loss": 0.1274, + "step": 6870 + }, + { + "epoch": 0.36058700209643607, + "grad_norm": 1.3905103206634521, + "learning_rate": 4.098663522012579e-05, + "loss": 0.1331, + "step": 6880 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 1.5292747020721436, + "learning_rate": 4.097353249475891e-05, + "loss": 0.1253, + "step": 6890 + }, + { + "epoch": 0.36163522012578614, + "grad_norm": 1.0919030904769897, + "learning_rate": 4.096042976939203e-05, + "loss": 0.1233, + "step": 6900 + }, + { + "epoch": 0.36215932914046123, + "grad_norm": 1.8438016176223755, + "learning_rate": 4.0947327044025155e-05, + "loss": 0.1309, + "step": 6910 + }, + { + "epoch": 0.36268343815513626, + "grad_norm": 2.1451685428619385, + "learning_rate": 4.0934224318658285e-05, + "loss": 0.1617, + "step": 6920 + }, + { + "epoch": 0.3632075471698113, + "grad_norm": 4.021899700164795, + "learning_rate": 4.092112159329141e-05, + "loss": 0.1233, + "step": 6930 + }, + { + "epoch": 0.3637316561844864, + "grad_norm": 1.0114710330963135, + "learning_rate": 4.090801886792453e-05, + "loss": 0.1142, + "step": 6940 + }, + { + "epoch": 0.3642557651991614, + "grad_norm": 1.8664952516555786, + "learning_rate": 4.0894916142557655e-05, + "loss": 0.1364, + "step": 6950 + }, + { + "epoch": 0.36477987421383645, + "grad_norm": 0.8328051567077637, + "learning_rate": 4.088181341719078e-05, + "loss": 0.125, + "step": 6960 + }, + { + "epoch": 0.36530398322851154, + "grad_norm": 1.7279052734375, + "learning_rate": 4.08687106918239e-05, + "loss": 0.1379, + "step": 6970 + }, + { + "epoch": 0.3658280922431866, + "grad_norm": 2.0779712200164795, + "learning_rate": 4.0855607966457025e-05, + "loss": 0.1433, + "step": 6980 + }, + { + "epoch": 0.3663522012578616, + "grad_norm": 3.7102408409118652, + "learning_rate": 4.084250524109015e-05, + "loss": 0.1306, + "step": 6990 + }, + { + "epoch": 0.3668763102725367, + "grad_norm": 2.5546956062316895, + "learning_rate": 4.082940251572327e-05, + "loss": 0.1186, + "step": 7000 + }, + { + "epoch": 0.3668763102725367, + "eval_loss": 0.2957610487937927, + "eval_runtime": 268.2309, + "eval_samples_per_second": 7.423, + "eval_steps_per_second": 1.238, + "step": 7000 + }, + { + "epoch": 0.36740041928721173, + "grad_norm": 1.1695574522018433, + "learning_rate": 4.0816299790356395e-05, + "loss": 0.0949, + "step": 7010 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 2.4255287647247314, + "learning_rate": 4.080319706498952e-05, + "loss": 0.1318, + "step": 7020 + }, + { + "epoch": 0.36844863731656186, + "grad_norm": 1.4534999132156372, + "learning_rate": 4.079009433962264e-05, + "loss": 0.1263, + "step": 7030 + }, + { + "epoch": 0.3689727463312369, + "grad_norm": 1.4702773094177246, + "learning_rate": 4.0776991614255766e-05, + "loss": 0.1283, + "step": 7040 + }, + { + "epoch": 0.3694968553459119, + "grad_norm": 1.4011280536651611, + "learning_rate": 4.076388888888889e-05, + "loss": 0.1062, + "step": 7050 + }, + { + "epoch": 0.370020964360587, + "grad_norm": 1.7604784965515137, + "learning_rate": 4.075078616352201e-05, + "loss": 0.1452, + "step": 7060 + }, + { + "epoch": 0.37054507337526205, + "grad_norm": 1.455108404159546, + "learning_rate": 4.0737683438155136e-05, + "loss": 0.1604, + "step": 7070 + }, + { + "epoch": 0.3710691823899371, + "grad_norm": 1.530150055885315, + "learning_rate": 4.0724580712788266e-05, + "loss": 0.1598, + "step": 7080 + }, + { + "epoch": 0.37159329140461217, + "grad_norm": 1.792792558670044, + "learning_rate": 4.071147798742139e-05, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.3721174004192872, + "grad_norm": 2.6770427227020264, + "learning_rate": 4.069837526205451e-05, + "loss": 0.1279, + "step": 7100 + }, + { + "epoch": 0.37264150943396224, + "grad_norm": 1.6971386671066284, + "learning_rate": 4.0685272536687636e-05, + "loss": 0.1409, + "step": 7110 + }, + { + "epoch": 0.3731656184486373, + "grad_norm": 1.3991681337356567, + "learning_rate": 4.067216981132076e-05, + "loss": 0.1354, + "step": 7120 + }, + { + "epoch": 0.37368972746331236, + "grad_norm": 1.0325071811676025, + "learning_rate": 4.0659067085953876e-05, + "loss": 0.1231, + "step": 7130 + }, + { + "epoch": 0.3742138364779874, + "grad_norm": 2.683823823928833, + "learning_rate": 4.0645964360587e-05, + "loss": 0.1249, + "step": 7140 + }, + { + "epoch": 0.3747379454926625, + "grad_norm": 2.177319288253784, + "learning_rate": 4.063286163522013e-05, + "loss": 0.1291, + "step": 7150 + }, + { + "epoch": 0.3752620545073375, + "grad_norm": 3.0221738815307617, + "learning_rate": 4.061975890985325e-05, + "loss": 0.1233, + "step": 7160 + }, + { + "epoch": 0.3757861635220126, + "grad_norm": 1.712924599647522, + "learning_rate": 4.0606656184486376e-05, + "loss": 0.1215, + "step": 7170 + }, + { + "epoch": 0.37631027253668764, + "grad_norm": 2.8734965324401855, + "learning_rate": 4.05935534591195e-05, + "loss": 0.143, + "step": 7180 + }, + { + "epoch": 0.3768343815513627, + "grad_norm": 1.5283716917037964, + "learning_rate": 4.058045073375262e-05, + "loss": 0.1228, + "step": 7190 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 1.5394947528839111, + "learning_rate": 4.0567348008385746e-05, + "loss": 0.1597, + "step": 7200 + }, + { + "epoch": 0.3778825995807128, + "grad_norm": 2.937851667404175, + "learning_rate": 4.055424528301887e-05, + "loss": 0.1796, + "step": 7210 + }, + { + "epoch": 0.37840670859538783, + "grad_norm": 1.1585332155227661, + "learning_rate": 4.054114255765199e-05, + "loss": 0.1573, + "step": 7220 + }, + { + "epoch": 0.3789308176100629, + "grad_norm": 1.674102783203125, + "learning_rate": 4.0528039832285117e-05, + "loss": 0.1102, + "step": 7230 + }, + { + "epoch": 0.37945492662473795, + "grad_norm": 2.403864860534668, + "learning_rate": 4.051493710691824e-05, + "loss": 0.1669, + "step": 7240 + }, + { + "epoch": 0.379979035639413, + "grad_norm": 5.073932647705078, + "learning_rate": 4.050183438155136e-05, + "loss": 0.124, + "step": 7250 + }, + { + "epoch": 0.3805031446540881, + "grad_norm": 2.1044859886169434, + "learning_rate": 4.048873165618449e-05, + "loss": 0.157, + "step": 7260 + }, + { + "epoch": 0.3810272536687631, + "grad_norm": 1.426034927368164, + "learning_rate": 4.047562893081761e-05, + "loss": 0.1508, + "step": 7270 + }, + { + "epoch": 0.38155136268343814, + "grad_norm": 2.211362838745117, + "learning_rate": 4.0462526205450734e-05, + "loss": 0.1546, + "step": 7280 + }, + { + "epoch": 0.38207547169811323, + "grad_norm": 1.7680007219314575, + "learning_rate": 4.044942348008386e-05, + "loss": 0.1362, + "step": 7290 + }, + { + "epoch": 0.38259958071278827, + "grad_norm": 1.0435234308242798, + "learning_rate": 4.043632075471698e-05, + "loss": 0.1438, + "step": 7300 + }, + { + "epoch": 0.3831236897274633, + "grad_norm": 2.113070487976074, + "learning_rate": 4.042321802935011e-05, + "loss": 0.1372, + "step": 7310 + }, + { + "epoch": 0.3836477987421384, + "grad_norm": 1.3755215406417847, + "learning_rate": 4.0410115303983234e-05, + "loss": 0.1439, + "step": 7320 + }, + { + "epoch": 0.3841719077568134, + "grad_norm": 2.1089391708374023, + "learning_rate": 4.039701257861636e-05, + "loss": 0.138, + "step": 7330 + }, + { + "epoch": 0.38469601677148846, + "grad_norm": 2.3198318481445312, + "learning_rate": 4.038390985324948e-05, + "loss": 0.1078, + "step": 7340 + }, + { + "epoch": 0.38522012578616355, + "grad_norm": 1.23020339012146, + "learning_rate": 4.0370807127882604e-05, + "loss": 0.1405, + "step": 7350 + }, + { + "epoch": 0.3857442348008386, + "grad_norm": 3.249234676361084, + "learning_rate": 4.035770440251572e-05, + "loss": 0.19, + "step": 7360 + }, + { + "epoch": 0.3862683438155136, + "grad_norm": 1.7448391914367676, + "learning_rate": 4.0344601677148844e-05, + "loss": 0.1153, + "step": 7370 + }, + { + "epoch": 0.3867924528301887, + "grad_norm": 1.3847689628601074, + "learning_rate": 4.0331498951781974e-05, + "loss": 0.1377, + "step": 7380 + }, + { + "epoch": 0.38731656184486374, + "grad_norm": 0.9152111411094666, + "learning_rate": 4.03183962264151e-05, + "loss": 0.1111, + "step": 7390 + }, + { + "epoch": 0.38784067085953877, + "grad_norm": 1.5579804182052612, + "learning_rate": 4.030529350104822e-05, + "loss": 0.1232, + "step": 7400 + }, + { + "epoch": 0.38836477987421386, + "grad_norm": 5.35411262512207, + "learning_rate": 4.0292190775681344e-05, + "loss": 0.1371, + "step": 7410 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.0708833932876587, + "learning_rate": 4.027908805031447e-05, + "loss": 0.0993, + "step": 7420 + }, + { + "epoch": 0.38941299790356393, + "grad_norm": 1.5841659307479858, + "learning_rate": 4.026598532494759e-05, + "loss": 0.1205, + "step": 7430 + }, + { + "epoch": 0.389937106918239, + "grad_norm": 1.9003546237945557, + "learning_rate": 4.0252882599580714e-05, + "loss": 0.1076, + "step": 7440 + }, + { + "epoch": 0.39046121593291405, + "grad_norm": 1.3129006624221802, + "learning_rate": 4.023977987421384e-05, + "loss": 0.1503, + "step": 7450 + }, + { + "epoch": 0.3909853249475891, + "grad_norm": 2.282996892929077, + "learning_rate": 4.022667714884696e-05, + "loss": 0.1119, + "step": 7460 + }, + { + "epoch": 0.3915094339622642, + "grad_norm": 16.443607330322266, + "learning_rate": 4.021357442348009e-05, + "loss": 0.1448, + "step": 7470 + }, + { + "epoch": 0.3920335429769392, + "grad_norm": 1.9104045629501343, + "learning_rate": 4.020047169811321e-05, + "loss": 0.1364, + "step": 7480 + }, + { + "epoch": 0.39255765199161424, + "grad_norm": 2.893193483352661, + "learning_rate": 4.018736897274633e-05, + "loss": 0.1242, + "step": 7490 + }, + { + "epoch": 0.39308176100628933, + "grad_norm": 2.8008060455322266, + "learning_rate": 4.0174266247379455e-05, + "loss": 0.1374, + "step": 7500 + }, + { + "epoch": 0.39360587002096437, + "grad_norm": 2.654747724533081, + "learning_rate": 4.016116352201258e-05, + "loss": 0.146, + "step": 7510 + }, + { + "epoch": 0.3941299790356394, + "grad_norm": 1.9274426698684692, + "learning_rate": 4.01480607966457e-05, + "loss": 0.1346, + "step": 7520 + }, + { + "epoch": 0.3946540880503145, + "grad_norm": 5.771965026855469, + "learning_rate": 4.0134958071278825e-05, + "loss": 0.129, + "step": 7530 + }, + { + "epoch": 0.3951781970649895, + "grad_norm": 1.6615891456604004, + "learning_rate": 4.012185534591195e-05, + "loss": 0.11, + "step": 7540 + }, + { + "epoch": 0.39570230607966456, + "grad_norm": 3.1531715393066406, + "learning_rate": 4.010875262054508e-05, + "loss": 0.1413, + "step": 7550 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 1.0126038789749146, + "learning_rate": 4.00956498951782e-05, + "loss": 0.1173, + "step": 7560 + }, + { + "epoch": 0.3967505241090147, + "grad_norm": 1.4275016784667969, + "learning_rate": 4.0082547169811325e-05, + "loss": 0.0979, + "step": 7570 + }, + { + "epoch": 0.3972746331236897, + "grad_norm": 1.2405078411102295, + "learning_rate": 4.006944444444445e-05, + "loss": 0.1297, + "step": 7580 + }, + { + "epoch": 0.3977987421383648, + "grad_norm": 1.497534990310669, + "learning_rate": 4.005634171907757e-05, + "loss": 0.1095, + "step": 7590 + }, + { + "epoch": 0.39832285115303984, + "grad_norm": 2.914259195327759, + "learning_rate": 4.004323899371069e-05, + "loss": 0.1186, + "step": 7600 + }, + { + "epoch": 0.39884696016771487, + "grad_norm": 2.3535306453704834, + "learning_rate": 4.003013626834381e-05, + "loss": 0.1537, + "step": 7610 + }, + { + "epoch": 0.39937106918238996, + "grad_norm": 3.1690330505371094, + "learning_rate": 4.001703354297694e-05, + "loss": 0.1411, + "step": 7620 + }, + { + "epoch": 0.399895178197065, + "grad_norm": 3.2396535873413086, + "learning_rate": 4.0003930817610066e-05, + "loss": 0.1346, + "step": 7630 + }, + { + "epoch": 0.40041928721174, + "grad_norm": 2.383460760116577, + "learning_rate": 3.999082809224319e-05, + "loss": 0.1222, + "step": 7640 + }, + { + "epoch": 0.4009433962264151, + "grad_norm": 1.5564273595809937, + "learning_rate": 3.997772536687631e-05, + "loss": 0.1819, + "step": 7650 + }, + { + "epoch": 0.40146750524109015, + "grad_norm": 3.6901915073394775, + "learning_rate": 3.9964622641509436e-05, + "loss": 0.1093, + "step": 7660 + }, + { + "epoch": 0.4019916142557652, + "grad_norm": 1.2570509910583496, + "learning_rate": 3.995151991614256e-05, + "loss": 0.1441, + "step": 7670 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 1.0235661268234253, + "learning_rate": 3.993841719077568e-05, + "loss": 0.1207, + "step": 7680 + }, + { + "epoch": 0.4030398322851153, + "grad_norm": 1.4244946241378784, + "learning_rate": 3.9925314465408806e-05, + "loss": 0.1355, + "step": 7690 + }, + { + "epoch": 0.40356394129979034, + "grad_norm": 1.821220874786377, + "learning_rate": 3.991221174004193e-05, + "loss": 0.1222, + "step": 7700 + }, + { + "epoch": 0.40408805031446543, + "grad_norm": 1.3305425643920898, + "learning_rate": 3.989910901467506e-05, + "loss": 0.1227, + "step": 7710 + }, + { + "epoch": 0.40461215932914046, + "grad_norm": 0.676930844783783, + "learning_rate": 3.9886006289308176e-05, + "loss": 0.1164, + "step": 7720 + }, + { + "epoch": 0.4051362683438155, + "grad_norm": 2.514782190322876, + "learning_rate": 3.98729035639413e-05, + "loss": 0.1203, + "step": 7730 + }, + { + "epoch": 0.4056603773584906, + "grad_norm": 2.546602487564087, + "learning_rate": 3.985980083857442e-05, + "loss": 0.135, + "step": 7740 + }, + { + "epoch": 0.4061844863731656, + "grad_norm": 1.8678992986679077, + "learning_rate": 3.9846698113207546e-05, + "loss": 0.1605, + "step": 7750 + }, + { + "epoch": 0.40670859538784065, + "grad_norm": 1.5499653816223145, + "learning_rate": 3.983359538784067e-05, + "loss": 0.1584, + "step": 7760 + }, + { + "epoch": 0.40723270440251574, + "grad_norm": 1.3117645978927612, + "learning_rate": 3.982049266247379e-05, + "loss": 0.143, + "step": 7770 + }, + { + "epoch": 0.4077568134171908, + "grad_norm": 3.2754364013671875, + "learning_rate": 3.980738993710692e-05, + "loss": 0.1416, + "step": 7780 + }, + { + "epoch": 0.4082809224318658, + "grad_norm": 1.771898627281189, + "learning_rate": 3.9794287211740047e-05, + "loss": 0.1443, + "step": 7790 + }, + { + "epoch": 0.4088050314465409, + "grad_norm": 1.7889633178710938, + "learning_rate": 3.978118448637317e-05, + "loss": 0.1185, + "step": 7800 + }, + { + "epoch": 0.40932914046121593, + "grad_norm": 2.0119128227233887, + "learning_rate": 3.976808176100629e-05, + "loss": 0.1344, + "step": 7810 + }, + { + "epoch": 0.40985324947589097, + "grad_norm": 0.9358534216880798, + "learning_rate": 3.975497903563942e-05, + "loss": 0.0939, + "step": 7820 + }, + { + "epoch": 0.41037735849056606, + "grad_norm": 1.441988229751587, + "learning_rate": 3.974187631027254e-05, + "loss": 0.1183, + "step": 7830 + }, + { + "epoch": 0.4109014675052411, + "grad_norm": 2.3082668781280518, + "learning_rate": 3.972877358490566e-05, + "loss": 0.1241, + "step": 7840 + }, + { + "epoch": 0.4114255765199161, + "grad_norm": 1.446537971496582, + "learning_rate": 3.971567085953879e-05, + "loss": 0.1556, + "step": 7850 + }, + { + "epoch": 0.4119496855345912, + "grad_norm": 1.0463677644729614, + "learning_rate": 3.970256813417191e-05, + "loss": 0.1298, + "step": 7860 + }, + { + "epoch": 0.41247379454926625, + "grad_norm": 1.2671233415603638, + "learning_rate": 3.9689465408805034e-05, + "loss": 0.1444, + "step": 7870 + }, + { + "epoch": 0.4129979035639413, + "grad_norm": 1.6452945470809937, + "learning_rate": 3.967636268343816e-05, + "loss": 0.1183, + "step": 7880 + }, + { + "epoch": 0.41352201257861637, + "grad_norm": 1.7226898670196533, + "learning_rate": 3.966325995807128e-05, + "loss": 0.1098, + "step": 7890 + }, + { + "epoch": 0.4140461215932914, + "grad_norm": 1.7937055826187134, + "learning_rate": 3.9650157232704404e-05, + "loss": 0.1162, + "step": 7900 + }, + { + "epoch": 0.41457023060796644, + "grad_norm": 1.9490100145339966, + "learning_rate": 3.963705450733753e-05, + "loss": 0.143, + "step": 7910 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 2.1928627490997314, + "learning_rate": 3.962395178197065e-05, + "loss": 0.1508, + "step": 7920 + }, + { + "epoch": 0.41561844863731656, + "grad_norm": 2.3016629219055176, + "learning_rate": 3.9610849056603774e-05, + "loss": 0.1206, + "step": 7930 + }, + { + "epoch": 0.4161425576519916, + "grad_norm": 1.2063487768173218, + "learning_rate": 3.9597746331236904e-05, + "loss": 0.1042, + "step": 7940 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 1.34829843044281, + "learning_rate": 3.958464360587003e-05, + "loss": 0.1465, + "step": 7950 + }, + { + "epoch": 0.4171907756813417, + "grad_norm": 2.4825642108917236, + "learning_rate": 3.9571540880503144e-05, + "loss": 0.1397, + "step": 7960 + }, + { + "epoch": 0.41771488469601675, + "grad_norm": 1.801196813583374, + "learning_rate": 3.955843815513627e-05, + "loss": 0.1388, + "step": 7970 + }, + { + "epoch": 0.41823899371069184, + "grad_norm": 1.4181357622146606, + "learning_rate": 3.954533542976939e-05, + "loss": 0.1759, + "step": 7980 + }, + { + "epoch": 0.4187631027253669, + "grad_norm": 1.4561703205108643, + "learning_rate": 3.9532232704402514e-05, + "loss": 0.1653, + "step": 7990 + }, + { + "epoch": 0.4192872117400419, + "grad_norm": 1.21798574924469, + "learning_rate": 3.951912997903564e-05, + "loss": 0.1136, + "step": 8000 + }, + { + "epoch": 0.4192872117400419, + "eval_loss": 0.29459360241889954, + "eval_runtime": 267.455, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 1.241, + "step": 8000 + }, + { + "epoch": 0.419811320754717, + "grad_norm": 1.3732337951660156, + "learning_rate": 3.950602725366877e-05, + "loss": 0.1248, + "step": 8010 + }, + { + "epoch": 0.42033542976939203, + "grad_norm": 2.136629581451416, + "learning_rate": 3.949292452830189e-05, + "loss": 0.1281, + "step": 8020 + }, + { + "epoch": 0.42085953878406707, + "grad_norm": 0.736299991607666, + "learning_rate": 3.9479821802935015e-05, + "loss": 0.108, + "step": 8030 + }, + { + "epoch": 0.42138364779874216, + "grad_norm": 2.7431278228759766, + "learning_rate": 3.946671907756814e-05, + "loss": 0.1553, + "step": 8040 + }, + { + "epoch": 0.4219077568134172, + "grad_norm": 3.224233627319336, + "learning_rate": 3.945361635220126e-05, + "loss": 0.1563, + "step": 8050 + }, + { + "epoch": 0.4224318658280922, + "grad_norm": 1.2465264797210693, + "learning_rate": 3.9440513626834385e-05, + "loss": 0.1183, + "step": 8060 + }, + { + "epoch": 0.4229559748427673, + "grad_norm": 1.5469056367874146, + "learning_rate": 3.942741090146751e-05, + "loss": 0.1435, + "step": 8070 + }, + { + "epoch": 0.42348008385744235, + "grad_norm": 2.5857508182525635, + "learning_rate": 3.941430817610063e-05, + "loss": 0.1432, + "step": 8080 + }, + { + "epoch": 0.4240041928721174, + "grad_norm": 1.5477666854858398, + "learning_rate": 3.9401205450733755e-05, + "loss": 0.1425, + "step": 8090 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 1.2745437622070312, + "learning_rate": 3.938810272536688e-05, + "loss": 0.1156, + "step": 8100 + }, + { + "epoch": 0.4250524109014675, + "grad_norm": 1.5633890628814697, + "learning_rate": 3.9375e-05, + "loss": 0.1352, + "step": 8110 + }, + { + "epoch": 0.42557651991614254, + "grad_norm": 1.2198878526687622, + "learning_rate": 3.9361897274633125e-05, + "loss": 0.1607, + "step": 8120 + }, + { + "epoch": 0.4261006289308176, + "grad_norm": 1.835465908050537, + "learning_rate": 3.934879454926625e-05, + "loss": 0.1214, + "step": 8130 + }, + { + "epoch": 0.42662473794549266, + "grad_norm": 1.3060351610183716, + "learning_rate": 3.933569182389937e-05, + "loss": 0.1388, + "step": 8140 + }, + { + "epoch": 0.4271488469601677, + "grad_norm": 2.6293728351593018, + "learning_rate": 3.9322589098532495e-05, + "loss": 0.1443, + "step": 8150 + }, + { + "epoch": 0.4276729559748428, + "grad_norm": 2.6649739742279053, + "learning_rate": 3.930948637316562e-05, + "loss": 0.1182, + "step": 8160 + }, + { + "epoch": 0.4281970649895178, + "grad_norm": 2.201756238937378, + "learning_rate": 3.929638364779875e-05, + "loss": 0.1498, + "step": 8170 + }, + { + "epoch": 0.42872117400419285, + "grad_norm": 2.9758079051971436, + "learning_rate": 3.928328092243187e-05, + "loss": 0.1713, + "step": 8180 + }, + { + "epoch": 0.42924528301886794, + "grad_norm": 2.305769920349121, + "learning_rate": 3.9270178197064995e-05, + "loss": 0.1184, + "step": 8190 + }, + { + "epoch": 0.429769392033543, + "grad_norm": 1.409645438194275, + "learning_rate": 3.925707547169811e-05, + "loss": 0.1198, + "step": 8200 + }, + { + "epoch": 0.430293501048218, + "grad_norm": 1.2932605743408203, + "learning_rate": 3.9243972746331235e-05, + "loss": 0.1263, + "step": 8210 + }, + { + "epoch": 0.4308176100628931, + "grad_norm": 2.1385724544525146, + "learning_rate": 3.923087002096436e-05, + "loss": 0.116, + "step": 8220 + }, + { + "epoch": 0.43134171907756813, + "grad_norm": 1.7936503887176514, + "learning_rate": 3.921776729559748e-05, + "loss": 0.1194, + "step": 8230 + }, + { + "epoch": 0.43186582809224316, + "grad_norm": 1.0538984537124634, + "learning_rate": 3.920466457023061e-05, + "loss": 0.1398, + "step": 8240 + }, + { + "epoch": 0.43238993710691825, + "grad_norm": 1.157038927078247, + "learning_rate": 3.9191561844863736e-05, + "loss": 0.12, + "step": 8250 + }, + { + "epoch": 0.4329140461215933, + "grad_norm": 1.3017743825912476, + "learning_rate": 3.917845911949686e-05, + "loss": 0.1097, + "step": 8260 + }, + { + "epoch": 0.4334381551362683, + "grad_norm": 2.975079298019409, + "learning_rate": 3.916535639412998e-05, + "loss": 0.1581, + "step": 8270 + }, + { + "epoch": 0.4339622641509434, + "grad_norm": 0.8380312323570251, + "learning_rate": 3.9152253668763106e-05, + "loss": 0.1059, + "step": 8280 + }, + { + "epoch": 0.43448637316561844, + "grad_norm": 2.567601203918457, + "learning_rate": 3.913915094339623e-05, + "loss": 0.1374, + "step": 8290 + }, + { + "epoch": 0.4350104821802935, + "grad_norm": 2.6411819458007812, + "learning_rate": 3.912604821802935e-05, + "loss": 0.1206, + "step": 8300 + }, + { + "epoch": 0.43553459119496857, + "grad_norm": 1.1850274801254272, + "learning_rate": 3.9112945492662476e-05, + "loss": 0.1311, + "step": 8310 + }, + { + "epoch": 0.4360587002096436, + "grad_norm": 1.537529468536377, + "learning_rate": 3.90998427672956e-05, + "loss": 0.1189, + "step": 8320 + }, + { + "epoch": 0.43658280922431864, + "grad_norm": 3.455749034881592, + "learning_rate": 3.908674004192872e-05, + "loss": 0.1454, + "step": 8330 + }, + { + "epoch": 0.4371069182389937, + "grad_norm": 1.6670117378234863, + "learning_rate": 3.9073637316561846e-05, + "loss": 0.1372, + "step": 8340 + }, + { + "epoch": 0.43763102725366876, + "grad_norm": 1.6339397430419922, + "learning_rate": 3.906053459119497e-05, + "loss": 0.1367, + "step": 8350 + }, + { + "epoch": 0.4381551362683438, + "grad_norm": 2.6764585971832275, + "learning_rate": 3.904743186582809e-05, + "loss": 0.1438, + "step": 8360 + }, + { + "epoch": 0.4386792452830189, + "grad_norm": 1.5620914697647095, + "learning_rate": 3.9034329140461216e-05, + "loss": 0.1225, + "step": 8370 + }, + { + "epoch": 0.4392033542976939, + "grad_norm": 2.1984214782714844, + "learning_rate": 3.902122641509434e-05, + "loss": 0.1207, + "step": 8380 + }, + { + "epoch": 0.43972746331236895, + "grad_norm": 1.5800156593322754, + "learning_rate": 3.900812368972746e-05, + "loss": 0.1137, + "step": 8390 + }, + { + "epoch": 0.44025157232704404, + "grad_norm": 4.698128700256348, + "learning_rate": 3.899502096436059e-05, + "loss": 0.1242, + "step": 8400 + }, + { + "epoch": 0.44077568134171907, + "grad_norm": 1.764726996421814, + "learning_rate": 3.898191823899372e-05, + "loss": 0.1459, + "step": 8410 + }, + { + "epoch": 0.4412997903563941, + "grad_norm": 1.1937742233276367, + "learning_rate": 3.896881551362684e-05, + "loss": 0.1184, + "step": 8420 + }, + { + "epoch": 0.4418238993710692, + "grad_norm": 1.3434696197509766, + "learning_rate": 3.8955712788259964e-05, + "loss": 0.1189, + "step": 8430 + }, + { + "epoch": 0.44234800838574423, + "grad_norm": 0.9694046378135681, + "learning_rate": 3.894261006289308e-05, + "loss": 0.1254, + "step": 8440 + }, + { + "epoch": 0.44287211740041926, + "grad_norm": 4.838512897491455, + "learning_rate": 3.8929507337526204e-05, + "loss": 0.135, + "step": 8450 + }, + { + "epoch": 0.44339622641509435, + "grad_norm": 3.3563590049743652, + "learning_rate": 3.891640461215933e-05, + "loss": 0.103, + "step": 8460 + }, + { + "epoch": 0.4439203354297694, + "grad_norm": 1.2375174760818481, + "learning_rate": 3.890330188679246e-05, + "loss": 0.1489, + "step": 8470 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.8426742553710938, + "learning_rate": 3.889019916142558e-05, + "loss": 0.1292, + "step": 8480 + }, + { + "epoch": 0.4449685534591195, + "grad_norm": 2.4112813472747803, + "learning_rate": 3.8877096436058704e-05, + "loss": 0.1292, + "step": 8490 + }, + { + "epoch": 0.44549266247379454, + "grad_norm": 2.111625909805298, + "learning_rate": 3.886399371069183e-05, + "loss": 0.1141, + "step": 8500 + }, + { + "epoch": 0.4460167714884696, + "grad_norm": 1.7435418367385864, + "learning_rate": 3.885089098532495e-05, + "loss": 0.1245, + "step": 8510 + }, + { + "epoch": 0.44654088050314467, + "grad_norm": 2.7497286796569824, + "learning_rate": 3.8837788259958074e-05, + "loss": 0.1417, + "step": 8520 + }, + { + "epoch": 0.4470649895178197, + "grad_norm": 2.3329012393951416, + "learning_rate": 3.88246855345912e-05, + "loss": 0.1388, + "step": 8530 + }, + { + "epoch": 0.44758909853249473, + "grad_norm": 1.873579978942871, + "learning_rate": 3.881158280922432e-05, + "loss": 0.1279, + "step": 8540 + }, + { + "epoch": 0.4481132075471698, + "grad_norm": 1.848961353302002, + "learning_rate": 3.8798480083857444e-05, + "loss": 0.1293, + "step": 8550 + }, + { + "epoch": 0.44863731656184486, + "grad_norm": 1.7882370948791504, + "learning_rate": 3.878537735849057e-05, + "loss": 0.1059, + "step": 8560 + }, + { + "epoch": 0.4491614255765199, + "grad_norm": 1.3296475410461426, + "learning_rate": 3.877227463312369e-05, + "loss": 0.1222, + "step": 8570 + }, + { + "epoch": 0.449685534591195, + "grad_norm": 3.1347081661224365, + "learning_rate": 3.8759171907756814e-05, + "loss": 0.1358, + "step": 8580 + }, + { + "epoch": 0.45020964360587, + "grad_norm": 1.8556910753250122, + "learning_rate": 3.874606918238994e-05, + "loss": 0.0963, + "step": 8590 + }, + { + "epoch": 0.45073375262054505, + "grad_norm": 1.9654614925384521, + "learning_rate": 3.873296645702306e-05, + "loss": 0.1124, + "step": 8600 + }, + { + "epoch": 0.45125786163522014, + "grad_norm": 2.273122549057007, + "learning_rate": 3.8719863731656184e-05, + "loss": 0.1212, + "step": 8610 + }, + { + "epoch": 0.45178197064989517, + "grad_norm": 2.0771892070770264, + "learning_rate": 3.870676100628931e-05, + "loss": 0.142, + "step": 8620 + }, + { + "epoch": 0.4523060796645702, + "grad_norm": 7.753453731536865, + "learning_rate": 3.869365828092243e-05, + "loss": 0.1598, + "step": 8630 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 1.923572301864624, + "learning_rate": 3.868055555555556e-05, + "loss": 0.1393, + "step": 8640 + }, + { + "epoch": 0.4533542976939203, + "grad_norm": 2.978624105453491, + "learning_rate": 3.8667452830188685e-05, + "loss": 0.1199, + "step": 8650 + }, + { + "epoch": 0.45387840670859536, + "grad_norm": 1.3282644748687744, + "learning_rate": 3.865435010482181e-05, + "loss": 0.1248, + "step": 8660 + }, + { + "epoch": 0.45440251572327045, + "grad_norm": 2.1446094512939453, + "learning_rate": 3.864124737945493e-05, + "loss": 0.1349, + "step": 8670 + }, + { + "epoch": 0.4549266247379455, + "grad_norm": 1.1796557903289795, + "learning_rate": 3.862814465408805e-05, + "loss": 0.1175, + "step": 8680 + }, + { + "epoch": 0.4554507337526205, + "grad_norm": 2.4166107177734375, + "learning_rate": 3.861504192872117e-05, + "loss": 0.0966, + "step": 8690 + }, + { + "epoch": 0.4559748427672956, + "grad_norm": 1.9842512607574463, + "learning_rate": 3.8601939203354295e-05, + "loss": 0.101, + "step": 8700 + }, + { + "epoch": 0.45649895178197064, + "grad_norm": 1.7683640718460083, + "learning_rate": 3.8588836477987425e-05, + "loss": 0.145, + "step": 8710 + }, + { + "epoch": 0.4570230607966457, + "grad_norm": 1.3694344758987427, + "learning_rate": 3.857573375262055e-05, + "loss": 0.105, + "step": 8720 + }, + { + "epoch": 0.45754716981132076, + "grad_norm": 1.667144775390625, + "learning_rate": 3.856263102725367e-05, + "loss": 0.118, + "step": 8730 + }, + { + "epoch": 0.4580712788259958, + "grad_norm": 2.1638240814208984, + "learning_rate": 3.8549528301886795e-05, + "loss": 0.1249, + "step": 8740 + }, + { + "epoch": 0.4585953878406709, + "grad_norm": 1.1001627445220947, + "learning_rate": 3.853642557651992e-05, + "loss": 0.1162, + "step": 8750 + }, + { + "epoch": 0.4591194968553459, + "grad_norm": 1.8323266506195068, + "learning_rate": 3.852332285115304e-05, + "loss": 0.1246, + "step": 8760 + }, + { + "epoch": 0.45964360587002095, + "grad_norm": 1.4820311069488525, + "learning_rate": 3.8510220125786165e-05, + "loss": 0.1362, + "step": 8770 + }, + { + "epoch": 0.46016771488469604, + "grad_norm": 1.7225009202957153, + "learning_rate": 3.849711740041929e-05, + "loss": 0.1262, + "step": 8780 + }, + { + "epoch": 0.4606918238993711, + "grad_norm": 1.7169979810714722, + "learning_rate": 3.848401467505241e-05, + "loss": 0.1545, + "step": 8790 + }, + { + "epoch": 0.4612159329140461, + "grad_norm": 2.1871843338012695, + "learning_rate": 3.8470911949685536e-05, + "loss": 0.1133, + "step": 8800 + }, + { + "epoch": 0.4617400419287212, + "grad_norm": 1.6585966348648071, + "learning_rate": 3.845780922431866e-05, + "loss": 0.1199, + "step": 8810 + }, + { + "epoch": 0.46226415094339623, + "grad_norm": 5.046018123626709, + "learning_rate": 3.844470649895178e-05, + "loss": 0.1433, + "step": 8820 + }, + { + "epoch": 0.46278825995807127, + "grad_norm": 2.51607084274292, + "learning_rate": 3.8431603773584906e-05, + "loss": 0.1336, + "step": 8830 + }, + { + "epoch": 0.46331236897274636, + "grad_norm": 1.3547303676605225, + "learning_rate": 3.841850104821803e-05, + "loss": 0.1539, + "step": 8840 + }, + { + "epoch": 0.4638364779874214, + "grad_norm": 1.824432373046875, + "learning_rate": 3.840539832285115e-05, + "loss": 0.1383, + "step": 8850 + }, + { + "epoch": 0.4643605870020964, + "grad_norm": 1.8877265453338623, + "learning_rate": 3.8392295597484276e-05, + "loss": 0.112, + "step": 8860 + }, + { + "epoch": 0.4648846960167715, + "grad_norm": 1.1183520555496216, + "learning_rate": 3.8379192872117406e-05, + "loss": 0.116, + "step": 8870 + }, + { + "epoch": 0.46540880503144655, + "grad_norm": 0.9367240071296692, + "learning_rate": 3.836609014675053e-05, + "loss": 0.1138, + "step": 8880 + }, + { + "epoch": 0.4659329140461216, + "grad_norm": 1.4263917207717896, + "learning_rate": 3.835298742138365e-05, + "loss": 0.1587, + "step": 8890 + }, + { + "epoch": 0.46645702306079667, + "grad_norm": 1.440211296081543, + "learning_rate": 3.8339884696016776e-05, + "loss": 0.0893, + "step": 8900 + }, + { + "epoch": 0.4669811320754717, + "grad_norm": 1.3915868997573853, + "learning_rate": 3.832678197064989e-05, + "loss": 0.097, + "step": 8910 + }, + { + "epoch": 0.46750524109014674, + "grad_norm": 2.4701268672943115, + "learning_rate": 3.8313679245283016e-05, + "loss": 0.1198, + "step": 8920 + }, + { + "epoch": 0.46802935010482183, + "grad_norm": 1.6906559467315674, + "learning_rate": 3.830057651991614e-05, + "loss": 0.1052, + "step": 8930 + }, + { + "epoch": 0.46855345911949686, + "grad_norm": 1.4248497486114502, + "learning_rate": 3.828747379454927e-05, + "loss": 0.1307, + "step": 8940 + }, + { + "epoch": 0.4690775681341719, + "grad_norm": 1.064107894897461, + "learning_rate": 3.827437106918239e-05, + "loss": 0.1089, + "step": 8950 + }, + { + "epoch": 0.469601677148847, + "grad_norm": 1.1374626159667969, + "learning_rate": 3.8261268343815517e-05, + "loss": 0.1344, + "step": 8960 + }, + { + "epoch": 0.470125786163522, + "grad_norm": 3.8050456047058105, + "learning_rate": 3.824816561844864e-05, + "loss": 0.1233, + "step": 8970 + }, + { + "epoch": 0.47064989517819705, + "grad_norm": 2.235957145690918, + "learning_rate": 3.823506289308176e-05, + "loss": 0.1109, + "step": 8980 + }, + { + "epoch": 0.47117400419287214, + "grad_norm": 1.165960669517517, + "learning_rate": 3.822196016771489e-05, + "loss": 0.1275, + "step": 8990 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 2.664379835128784, + "learning_rate": 3.820885744234801e-05, + "loss": 0.1637, + "step": 9000 + }, + { + "epoch": 0.4716981132075472, + "eval_loss": 0.29463252425193787, + "eval_runtime": 267.3008, + "eval_samples_per_second": 7.449, + "eval_steps_per_second": 1.242, + "step": 9000 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 2.2115066051483154, + "learning_rate": 3.8195754716981133e-05, + "loss": 0.1339, + "step": 9010 + }, + { + "epoch": 0.4727463312368973, + "grad_norm": 1.608081579208374, + "learning_rate": 3.818265199161426e-05, + "loss": 0.0984, + "step": 9020 + }, + { + "epoch": 0.47327044025157233, + "grad_norm": 2.9085326194763184, + "learning_rate": 3.816954926624738e-05, + "loss": 0.1239, + "step": 9030 + }, + { + "epoch": 0.47379454926624737, + "grad_norm": 13.745386123657227, + "learning_rate": 3.8156446540880504e-05, + "loss": 0.115, + "step": 9040 + }, + { + "epoch": 0.47431865828092246, + "grad_norm": 3.2827248573303223, + "learning_rate": 3.814334381551363e-05, + "loss": 0.1289, + "step": 9050 + }, + { + "epoch": 0.4748427672955975, + "grad_norm": 2.124379873275757, + "learning_rate": 3.813024109014675e-05, + "loss": 0.1247, + "step": 9060 + }, + { + "epoch": 0.4753668763102725, + "grad_norm": 1.5814337730407715, + "learning_rate": 3.8117138364779874e-05, + "loss": 0.1247, + "step": 9070 + }, + { + "epoch": 0.4758909853249476, + "grad_norm": 1.8060868978500366, + "learning_rate": 3.8104035639413e-05, + "loss": 0.1498, + "step": 9080 + }, + { + "epoch": 0.47641509433962265, + "grad_norm": 1.223948359489441, + "learning_rate": 3.809093291404612e-05, + "loss": 0.0915, + "step": 9090 + }, + { + "epoch": 0.4769392033542977, + "grad_norm": 0.885719895362854, + "learning_rate": 3.807783018867925e-05, + "loss": 0.122, + "step": 9100 + }, + { + "epoch": 0.47746331236897277, + "grad_norm": 1.971203088760376, + "learning_rate": 3.8064727463312374e-05, + "loss": 0.1585, + "step": 9110 + }, + { + "epoch": 0.4779874213836478, + "grad_norm": 2.219496965408325, + "learning_rate": 3.80516247379455e-05, + "loss": 0.1119, + "step": 9120 + }, + { + "epoch": 0.47851153039832284, + "grad_norm": 1.7364963293075562, + "learning_rate": 3.803852201257862e-05, + "loss": 0.1005, + "step": 9130 + }, + { + "epoch": 0.4790356394129979, + "grad_norm": 1.9840654134750366, + "learning_rate": 3.8025419287211744e-05, + "loss": 0.1209, + "step": 9140 + }, + { + "epoch": 0.47955974842767296, + "grad_norm": 2.238281726837158, + "learning_rate": 3.801231656184486e-05, + "loss": 0.1569, + "step": 9150 + }, + { + "epoch": 0.480083857442348, + "grad_norm": 1.8940962553024292, + "learning_rate": 3.7999213836477984e-05, + "loss": 0.0958, + "step": 9160 + }, + { + "epoch": 0.4806079664570231, + "grad_norm": 2.0582616329193115, + "learning_rate": 3.7986111111111114e-05, + "loss": 0.1442, + "step": 9170 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 2.1361806392669678, + "learning_rate": 3.797300838574424e-05, + "loss": 0.1104, + "step": 9180 + }, + { + "epoch": 0.48165618448637315, + "grad_norm": 2.00907826423645, + "learning_rate": 3.795990566037736e-05, + "loss": 0.1137, + "step": 9190 + }, + { + "epoch": 0.48218029350104824, + "grad_norm": 1.998676061630249, + "learning_rate": 3.7946802935010485e-05, + "loss": 0.1456, + "step": 9200 + }, + { + "epoch": 0.4827044025157233, + "grad_norm": 6.387622356414795, + "learning_rate": 3.793370020964361e-05, + "loss": 0.1616, + "step": 9210 + }, + { + "epoch": 0.4832285115303983, + "grad_norm": 2.027569055557251, + "learning_rate": 3.792059748427673e-05, + "loss": 0.1425, + "step": 9220 + }, + { + "epoch": 0.4837526205450734, + "grad_norm": 2.2389369010925293, + "learning_rate": 3.7907494758909855e-05, + "loss": 0.1355, + "step": 9230 + }, + { + "epoch": 0.48427672955974843, + "grad_norm": 0.3307386636734009, + "learning_rate": 3.789439203354298e-05, + "loss": 0.1269, + "step": 9240 + }, + { + "epoch": 0.48480083857442346, + "grad_norm": 2.1012308597564697, + "learning_rate": 3.78812893081761e-05, + "loss": 0.1235, + "step": 9250 + }, + { + "epoch": 0.48532494758909855, + "grad_norm": 0.8590827584266663, + "learning_rate": 3.786818658280923e-05, + "loss": 0.1244, + "step": 9260 + }, + { + "epoch": 0.4858490566037736, + "grad_norm": 1.6406866312026978, + "learning_rate": 3.785508385744235e-05, + "loss": 0.1256, + "step": 9270 + }, + { + "epoch": 0.4863731656184486, + "grad_norm": 1.5975079536437988, + "learning_rate": 3.784198113207547e-05, + "loss": 0.1074, + "step": 9280 + }, + { + "epoch": 0.4868972746331237, + "grad_norm": 1.5340465307235718, + "learning_rate": 3.7828878406708595e-05, + "loss": 0.0975, + "step": 9290 + }, + { + "epoch": 0.48742138364779874, + "grad_norm": 3.123337984085083, + "learning_rate": 3.781577568134172e-05, + "loss": 0.156, + "step": 9300 + }, + { + "epoch": 0.4879454926624738, + "grad_norm": 6.452347755432129, + "learning_rate": 3.780267295597484e-05, + "loss": 0.1253, + "step": 9310 + }, + { + "epoch": 0.48846960167714887, + "grad_norm": 1.0351651906967163, + "learning_rate": 3.7789570230607965e-05, + "loss": 0.1547, + "step": 9320 + }, + { + "epoch": 0.4889937106918239, + "grad_norm": 1.8313533067703247, + "learning_rate": 3.7776467505241095e-05, + "loss": 0.1104, + "step": 9330 + }, + { + "epoch": 0.48951781970649894, + "grad_norm": 1.1199965476989746, + "learning_rate": 3.776336477987422e-05, + "loss": 0.0976, + "step": 9340 + }, + { + "epoch": 0.490041928721174, + "grad_norm": 2.7808194160461426, + "learning_rate": 3.775026205450734e-05, + "loss": 0.1449, + "step": 9350 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 4.0185041427612305, + "learning_rate": 3.7737159329140465e-05, + "loss": 0.142, + "step": 9360 + }, + { + "epoch": 0.4910901467505241, + "grad_norm": 4.457677364349365, + "learning_rate": 3.772405660377359e-05, + "loss": 0.1096, + "step": 9370 + }, + { + "epoch": 0.4916142557651992, + "grad_norm": 1.8629597425460815, + "learning_rate": 3.771095387840671e-05, + "loss": 0.1369, + "step": 9380 + }, + { + "epoch": 0.4921383647798742, + "grad_norm": 2.3238027095794678, + "learning_rate": 3.769785115303983e-05, + "loss": 0.1253, + "step": 9390 + }, + { + "epoch": 0.49266247379454925, + "grad_norm": 2.06125807762146, + "learning_rate": 3.768474842767296e-05, + "loss": 0.1438, + "step": 9400 + }, + { + "epoch": 0.49318658280922434, + "grad_norm": 1.5867494344711304, + "learning_rate": 3.767164570230608e-05, + "loss": 0.1318, + "step": 9410 + }, + { + "epoch": 0.4937106918238994, + "grad_norm": 1.5369681119918823, + "learning_rate": 3.7658542976939206e-05, + "loss": 0.1474, + "step": 9420 + }, + { + "epoch": 0.4942348008385744, + "grad_norm": 1.043910264968872, + "learning_rate": 3.764544025157233e-05, + "loss": 0.1205, + "step": 9430 + }, + { + "epoch": 0.4947589098532495, + "grad_norm": 1.8726791143417358, + "learning_rate": 3.763233752620545e-05, + "loss": 0.1045, + "step": 9440 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 3.224243402481079, + "learning_rate": 3.7619234800838576e-05, + "loss": 0.1327, + "step": 9450 + }, + { + "epoch": 0.49580712788259956, + "grad_norm": 1.566446304321289, + "learning_rate": 3.76061320754717e-05, + "loss": 0.1272, + "step": 9460 + }, + { + "epoch": 0.49633123689727465, + "grad_norm": 2.9973807334899902, + "learning_rate": 3.759302935010482e-05, + "loss": 0.0983, + "step": 9470 + }, + { + "epoch": 0.4968553459119497, + "grad_norm": 1.0932589769363403, + "learning_rate": 3.7579926624737946e-05, + "loss": 0.1243, + "step": 9480 + }, + { + "epoch": 0.4973794549266247, + "grad_norm": 1.4678490161895752, + "learning_rate": 3.7566823899371076e-05, + "loss": 0.0928, + "step": 9490 + }, + { + "epoch": 0.4979035639412998, + "grad_norm": 1.9472997188568115, + "learning_rate": 3.75537211740042e-05, + "loss": 0.1297, + "step": 9500 + }, + { + "epoch": 0.49842767295597484, + "grad_norm": 1.9206911325454712, + "learning_rate": 3.7540618448637316e-05, + "loss": 0.1146, + "step": 9510 + }, + { + "epoch": 0.4989517819706499, + "grad_norm": 2.2353034019470215, + "learning_rate": 3.752751572327044e-05, + "loss": 0.1539, + "step": 9520 + }, + { + "epoch": 0.49947589098532497, + "grad_norm": 1.8114569187164307, + "learning_rate": 3.751441299790356e-05, + "loss": 0.1193, + "step": 9530 + }, + { + "epoch": 0.5, + "grad_norm": 2.5642435550689697, + "learning_rate": 3.7501310272536686e-05, + "loss": 0.1495, + "step": 9540 + }, + { + "epoch": 0.500524109014675, + "grad_norm": 1.5244758129119873, + "learning_rate": 3.748820754716981e-05, + "loss": 0.1157, + "step": 9550 + }, + { + "epoch": 0.5010482180293501, + "grad_norm": 1.0185999870300293, + "learning_rate": 3.747510482180294e-05, + "loss": 0.1053, + "step": 9560 + }, + { + "epoch": 0.5015723270440252, + "grad_norm": 1.3741650581359863, + "learning_rate": 3.746200209643606e-05, + "loss": 0.1021, + "step": 9570 + }, + { + "epoch": 0.5020964360587002, + "grad_norm": 13.233023643493652, + "learning_rate": 3.744889937106919e-05, + "loss": 0.1031, + "step": 9580 + }, + { + "epoch": 0.5026205450733753, + "grad_norm": 2.069478750228882, + "learning_rate": 3.743579664570231e-05, + "loss": 0.1639, + "step": 9590 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 1.6018750667572021, + "learning_rate": 3.7422693920335433e-05, + "loss": 0.1165, + "step": 9600 + }, + { + "epoch": 0.5036687631027253, + "grad_norm": 1.1766276359558105, + "learning_rate": 3.740959119496856e-05, + "loss": 0.1308, + "step": 9610 + }, + { + "epoch": 0.5041928721174004, + "grad_norm": 7.346364498138428, + "learning_rate": 3.739648846960168e-05, + "loss": 0.1188, + "step": 9620 + }, + { + "epoch": 0.5047169811320755, + "grad_norm": 2.2837276458740234, + "learning_rate": 3.73833857442348e-05, + "loss": 0.0947, + "step": 9630 + }, + { + "epoch": 0.5052410901467506, + "grad_norm": 2.5841448307037354, + "learning_rate": 3.737028301886793e-05, + "loss": 0.1177, + "step": 9640 + }, + { + "epoch": 0.5057651991614256, + "grad_norm": 2.1917734146118164, + "learning_rate": 3.735718029350105e-05, + "loss": 0.1451, + "step": 9650 + }, + { + "epoch": 0.5062893081761006, + "grad_norm": 1.490285038948059, + "learning_rate": 3.7344077568134174e-05, + "loss": 0.1132, + "step": 9660 + }, + { + "epoch": 0.5068134171907757, + "grad_norm": 1.1655572652816772, + "learning_rate": 3.73309748427673e-05, + "loss": 0.1181, + "step": 9670 + }, + { + "epoch": 0.5073375262054507, + "grad_norm": 1.4635337591171265, + "learning_rate": 3.731787211740042e-05, + "loss": 0.1121, + "step": 9680 + }, + { + "epoch": 0.5078616352201258, + "grad_norm": 1.9814307689666748, + "learning_rate": 3.7304769392033544e-05, + "loss": 0.1067, + "step": 9690 + }, + { + "epoch": 0.5083857442348009, + "grad_norm": 1.5174517631530762, + "learning_rate": 3.729166666666667e-05, + "loss": 0.1324, + "step": 9700 + }, + { + "epoch": 0.5089098532494759, + "grad_norm": 1.8455206155776978, + "learning_rate": 3.727856394129979e-05, + "loss": 0.0979, + "step": 9710 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 1.3721349239349365, + "learning_rate": 3.7265461215932914e-05, + "loss": 0.1358, + "step": 9720 + }, + { + "epoch": 0.509958071278826, + "grad_norm": 1.6334538459777832, + "learning_rate": 3.7252358490566044e-05, + "loss": 0.1105, + "step": 9730 + }, + { + "epoch": 0.510482180293501, + "grad_norm": 1.7805728912353516, + "learning_rate": 3.723925576519917e-05, + "loss": 0.1211, + "step": 9740 + }, + { + "epoch": 0.5110062893081762, + "grad_norm": 1.514751672744751, + "learning_rate": 3.7226153039832284e-05, + "loss": 0.1012, + "step": 9750 + }, + { + "epoch": 0.5115303983228512, + "grad_norm": 2.340724468231201, + "learning_rate": 3.721305031446541e-05, + "loss": 0.1427, + "step": 9760 + }, + { + "epoch": 0.5120545073375262, + "grad_norm": 1.5210148096084595, + "learning_rate": 3.719994758909853e-05, + "loss": 0.1124, + "step": 9770 + }, + { + "epoch": 0.5125786163522013, + "grad_norm": 2.119563341140747, + "learning_rate": 3.7186844863731654e-05, + "loss": 0.1263, + "step": 9780 + }, + { + "epoch": 0.5131027253668763, + "grad_norm": 1.4220565557479858, + "learning_rate": 3.717374213836478e-05, + "loss": 0.1007, + "step": 9790 + }, + { + "epoch": 0.5136268343815513, + "grad_norm": 1.1034489870071411, + "learning_rate": 3.716063941299791e-05, + "loss": 0.1312, + "step": 9800 + }, + { + "epoch": 0.5141509433962265, + "grad_norm": 2.0566272735595703, + "learning_rate": 3.714753668763103e-05, + "loss": 0.1019, + "step": 9810 + }, + { + "epoch": 0.5146750524109015, + "grad_norm": 1.0858715772628784, + "learning_rate": 3.7134433962264155e-05, + "loss": 0.1345, + "step": 9820 + }, + { + "epoch": 0.5151991614255765, + "grad_norm": 0.9479877948760986, + "learning_rate": 3.712133123689728e-05, + "loss": 0.1138, + "step": 9830 + }, + { + "epoch": 0.5157232704402516, + "grad_norm": 2.655113935470581, + "learning_rate": 3.71082285115304e-05, + "loss": 0.1238, + "step": 9840 + }, + { + "epoch": 0.5162473794549266, + "grad_norm": 1.081669569015503, + "learning_rate": 3.7095125786163525e-05, + "loss": 0.1447, + "step": 9850 + }, + { + "epoch": 0.5167714884696016, + "grad_norm": 1.6646862030029297, + "learning_rate": 3.708202306079665e-05, + "loss": 0.1348, + "step": 9860 + }, + { + "epoch": 0.5172955974842768, + "grad_norm": 2.4982075691223145, + "learning_rate": 3.706892033542977e-05, + "loss": 0.1408, + "step": 9870 + }, + { + "epoch": 0.5178197064989518, + "grad_norm": 1.4935152530670166, + "learning_rate": 3.7055817610062895e-05, + "loss": 0.1067, + "step": 9880 + }, + { + "epoch": 0.5183438155136268, + "grad_norm": 1.6594271659851074, + "learning_rate": 3.704271488469602e-05, + "loss": 0.1184, + "step": 9890 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 2.2178964614868164, + "learning_rate": 3.702961215932914e-05, + "loss": 0.1538, + "step": 9900 + }, + { + "epoch": 0.5193920335429769, + "grad_norm": 2.592712640762329, + "learning_rate": 3.7016509433962265e-05, + "loss": 0.1141, + "step": 9910 + }, + { + "epoch": 0.519916142557652, + "grad_norm": 9.270172119140625, + "learning_rate": 3.700340670859539e-05, + "loss": 0.1254, + "step": 9920 + }, + { + "epoch": 0.5204402515723271, + "grad_norm": 1.9585696458816528, + "learning_rate": 3.699030398322851e-05, + "loss": 0.1527, + "step": 9930 + }, + { + "epoch": 0.5209643605870021, + "grad_norm": 1.2968515157699585, + "learning_rate": 3.6977201257861635e-05, + "loss": 0.1091, + "step": 9940 + }, + { + "epoch": 0.5214884696016772, + "grad_norm": 3.7071094512939453, + "learning_rate": 3.696409853249476e-05, + "loss": 0.1466, + "step": 9950 + }, + { + "epoch": 0.5220125786163522, + "grad_norm": 1.7539535760879517, + "learning_rate": 3.695099580712789e-05, + "loss": 0.1164, + "step": 9960 + }, + { + "epoch": 0.5225366876310272, + "grad_norm": 1.4288002252578735, + "learning_rate": 3.693789308176101e-05, + "loss": 0.0937, + "step": 9970 + }, + { + "epoch": 0.5230607966457023, + "grad_norm": 2.288053035736084, + "learning_rate": 3.6924790356394136e-05, + "loss": 0.1354, + "step": 9980 + }, + { + "epoch": 0.5235849056603774, + "grad_norm": 1.5926620960235596, + "learning_rate": 3.691168763102725e-05, + "loss": 0.1133, + "step": 9990 + }, + { + "epoch": 0.5241090146750524, + "grad_norm": 1.2741729021072388, + "learning_rate": 3.6898584905660376e-05, + "loss": 0.0885, + "step": 10000 + }, + { + "epoch": 0.5241090146750524, + "eval_loss": 0.29621487855911255, + "eval_runtime": 267.7799, + "eval_samples_per_second": 7.435, + "eval_steps_per_second": 1.24, + "step": 10000 + }, + { + "epoch": 0.5246331236897275, + "grad_norm": 2.342545509338379, + "learning_rate": 3.68854821802935e-05, + "loss": 0.133, + "step": 10010 + }, + { + "epoch": 0.5251572327044025, + "grad_norm": 2.749216318130493, + "learning_rate": 3.687237945492662e-05, + "loss": 0.1033, + "step": 10020 + }, + { + "epoch": 0.5256813417190775, + "grad_norm": 1.3036613464355469, + "learning_rate": 3.685927672955975e-05, + "loss": 0.1188, + "step": 10030 + }, + { + "epoch": 0.5262054507337526, + "grad_norm": 1.6999369859695435, + "learning_rate": 3.6846174004192876e-05, + "loss": 0.1453, + "step": 10040 + }, + { + "epoch": 0.5267295597484277, + "grad_norm": 1.395107626914978, + "learning_rate": 3.6833071278826e-05, + "loss": 0.1173, + "step": 10050 + }, + { + "epoch": 0.5272536687631028, + "grad_norm": 2.4702157974243164, + "learning_rate": 3.681996855345912e-05, + "loss": 0.1151, + "step": 10060 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 1.3825613260269165, + "learning_rate": 3.6806865828092246e-05, + "loss": 0.1106, + "step": 10070 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 1.9985954761505127, + "learning_rate": 3.679376310272537e-05, + "loss": 0.1264, + "step": 10080 + }, + { + "epoch": 0.5288259958071279, + "grad_norm": 1.4469823837280273, + "learning_rate": 3.678066037735849e-05, + "loss": 0.1173, + "step": 10090 + }, + { + "epoch": 0.5293501048218029, + "grad_norm": 1.9220649003982544, + "learning_rate": 3.6767557651991616e-05, + "loss": 0.1465, + "step": 10100 + }, + { + "epoch": 0.529874213836478, + "grad_norm": 2.985271453857422, + "learning_rate": 3.675445492662474e-05, + "loss": 0.1341, + "step": 10110 + }, + { + "epoch": 0.5303983228511531, + "grad_norm": 1.2226923704147339, + "learning_rate": 3.674135220125786e-05, + "loss": 0.1246, + "step": 10120 + }, + { + "epoch": 0.5309224318658281, + "grad_norm": 1.7916375398635864, + "learning_rate": 3.6728249475890986e-05, + "loss": 0.1169, + "step": 10130 + }, + { + "epoch": 0.5314465408805031, + "grad_norm": 1.7915583848953247, + "learning_rate": 3.671514675052411e-05, + "loss": 0.12, + "step": 10140 + }, + { + "epoch": 0.5319706498951782, + "grad_norm": 1.5447636842727661, + "learning_rate": 3.670204402515723e-05, + "loss": 0.1112, + "step": 10150 + }, + { + "epoch": 0.5324947589098532, + "grad_norm": 3.285203695297241, + "learning_rate": 3.668894129979036e-05, + "loss": 0.1622, + "step": 10160 + }, + { + "epoch": 0.5330188679245284, + "grad_norm": 1.0093086957931519, + "learning_rate": 3.667583857442348e-05, + "loss": 0.1437, + "step": 10170 + }, + { + "epoch": 0.5335429769392034, + "grad_norm": 1.082940697669983, + "learning_rate": 3.6662735849056603e-05, + "loss": 0.1059, + "step": 10180 + }, + { + "epoch": 0.5340670859538784, + "grad_norm": 0.6458982825279236, + "learning_rate": 3.6649633123689734e-05, + "loss": 0.1267, + "step": 10190 + }, + { + "epoch": 0.5345911949685535, + "grad_norm": 1.6019344329833984, + "learning_rate": 3.663653039832286e-05, + "loss": 0.1192, + "step": 10200 + }, + { + "epoch": 0.5351153039832285, + "grad_norm": 1.5945937633514404, + "learning_rate": 3.662342767295598e-05, + "loss": 0.1058, + "step": 10210 + }, + { + "epoch": 0.5356394129979035, + "grad_norm": 0.926892876625061, + "learning_rate": 3.6610324947589104e-05, + "loss": 0.1209, + "step": 10220 + }, + { + "epoch": 0.5361635220125787, + "grad_norm": 1.6717281341552734, + "learning_rate": 3.659722222222222e-05, + "loss": 0.1327, + "step": 10230 + }, + { + "epoch": 0.5366876310272537, + "grad_norm": 1.7691363096237183, + "learning_rate": 3.6584119496855344e-05, + "loss": 0.1286, + "step": 10240 + }, + { + "epoch": 0.5372117400419287, + "grad_norm": 2.906761646270752, + "learning_rate": 3.657101677148847e-05, + "loss": 0.1253, + "step": 10250 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 2.2023632526397705, + "learning_rate": 3.65579140461216e-05, + "loss": 0.1193, + "step": 10260 + }, + { + "epoch": 0.5382599580712788, + "grad_norm": 1.8191137313842773, + "learning_rate": 3.654481132075472e-05, + "loss": 0.1465, + "step": 10270 + }, + { + "epoch": 0.5387840670859538, + "grad_norm": 2.309532642364502, + "learning_rate": 3.6531708595387844e-05, + "loss": 0.1244, + "step": 10280 + }, + { + "epoch": 0.539308176100629, + "grad_norm": 1.8280638456344604, + "learning_rate": 3.651860587002097e-05, + "loss": 0.1307, + "step": 10290 + }, + { + "epoch": 0.539832285115304, + "grad_norm": 2.2038843631744385, + "learning_rate": 3.650550314465409e-05, + "loss": 0.12, + "step": 10300 + }, + { + "epoch": 0.540356394129979, + "grad_norm": 1.8919661045074463, + "learning_rate": 3.6492400419287214e-05, + "loss": 0.1278, + "step": 10310 + }, + { + "epoch": 0.5408805031446541, + "grad_norm": 1.6221542358398438, + "learning_rate": 3.647929769392034e-05, + "loss": 0.1022, + "step": 10320 + }, + { + "epoch": 0.5414046121593291, + "grad_norm": 2.0718319416046143, + "learning_rate": 3.646619496855346e-05, + "loss": 0.138, + "step": 10330 + }, + { + "epoch": 0.5419287211740041, + "grad_norm": 1.4710763692855835, + "learning_rate": 3.6453092243186584e-05, + "loss": 0.1091, + "step": 10340 + }, + { + "epoch": 0.5424528301886793, + "grad_norm": 1.6080454587936401, + "learning_rate": 3.643998951781971e-05, + "loss": 0.1248, + "step": 10350 + }, + { + "epoch": 0.5429769392033543, + "grad_norm": 0.9281677603721619, + "learning_rate": 3.642688679245283e-05, + "loss": 0.1445, + "step": 10360 + }, + { + "epoch": 0.5435010482180294, + "grad_norm": 1.4223753213882446, + "learning_rate": 3.6413784067085955e-05, + "loss": 0.1557, + "step": 10370 + }, + { + "epoch": 0.5440251572327044, + "grad_norm": 1.4216049909591675, + "learning_rate": 3.640068134171908e-05, + "loss": 0.1083, + "step": 10380 + }, + { + "epoch": 0.5445492662473794, + "grad_norm": 1.84040105342865, + "learning_rate": 3.63875786163522e-05, + "loss": 0.1137, + "step": 10390 + }, + { + "epoch": 0.5450733752620545, + "grad_norm": 1.6752604246139526, + "learning_rate": 3.6374475890985325e-05, + "loss": 0.1421, + "step": 10400 + }, + { + "epoch": 0.5455974842767296, + "grad_norm": 2.0979623794555664, + "learning_rate": 3.636137316561845e-05, + "loss": 0.1228, + "step": 10410 + }, + { + "epoch": 0.5461215932914046, + "grad_norm": 0.7087647318840027, + "learning_rate": 3.634827044025158e-05, + "loss": 0.1235, + "step": 10420 + }, + { + "epoch": 0.5466457023060797, + "grad_norm": 1.492990255355835, + "learning_rate": 3.63351677148847e-05, + "loss": 0.1434, + "step": 10430 + }, + { + "epoch": 0.5471698113207547, + "grad_norm": 2.378920793533325, + "learning_rate": 3.6322064989517825e-05, + "loss": 0.1569, + "step": 10440 + }, + { + "epoch": 0.5476939203354297, + "grad_norm": 1.9160252809524536, + "learning_rate": 3.630896226415095e-05, + "loss": 0.1054, + "step": 10450 + }, + { + "epoch": 0.5482180293501048, + "grad_norm": 0.846349835395813, + "learning_rate": 3.6295859538784065e-05, + "loss": 0.1133, + "step": 10460 + }, + { + "epoch": 0.5487421383647799, + "grad_norm": 1.7348216772079468, + "learning_rate": 3.628275681341719e-05, + "loss": 0.1288, + "step": 10470 + }, + { + "epoch": 0.549266247379455, + "grad_norm": 1.307425618171692, + "learning_rate": 3.626965408805031e-05, + "loss": 0.1251, + "step": 10480 + }, + { + "epoch": 0.54979035639413, + "grad_norm": 1.8501406908035278, + "learning_rate": 3.625655136268344e-05, + "loss": 0.1204, + "step": 10490 + }, + { + "epoch": 0.550314465408805, + "grad_norm": 1.3911058902740479, + "learning_rate": 3.6243448637316565e-05, + "loss": 0.1249, + "step": 10500 + }, + { + "epoch": 0.55083857442348, + "grad_norm": 0.7849336862564087, + "learning_rate": 3.623034591194969e-05, + "loss": 0.0963, + "step": 10510 + }, + { + "epoch": 0.5513626834381551, + "grad_norm": 2.364187002182007, + "learning_rate": 3.621724318658281e-05, + "loss": 0.1067, + "step": 10520 + }, + { + "epoch": 0.5518867924528302, + "grad_norm": 2.4732234477996826, + "learning_rate": 3.6204140461215935e-05, + "loss": 0.1011, + "step": 10530 + }, + { + "epoch": 0.5524109014675053, + "grad_norm": 2.1230411529541016, + "learning_rate": 3.619103773584906e-05, + "loss": 0.1204, + "step": 10540 + }, + { + "epoch": 0.5529350104821803, + "grad_norm": 1.2235745191574097, + "learning_rate": 3.617793501048218e-05, + "loss": 0.1249, + "step": 10550 + }, + { + "epoch": 0.5534591194968553, + "grad_norm": 1.4653679132461548, + "learning_rate": 3.6164832285115306e-05, + "loss": 0.1377, + "step": 10560 + }, + { + "epoch": 0.5539832285115304, + "grad_norm": 2.0891177654266357, + "learning_rate": 3.615172955974843e-05, + "loss": 0.1404, + "step": 10570 + }, + { + "epoch": 0.5545073375262054, + "grad_norm": 1.4860715866088867, + "learning_rate": 3.613862683438155e-05, + "loss": 0.1067, + "step": 10580 + }, + { + "epoch": 0.5550314465408805, + "grad_norm": 1.492263674736023, + "learning_rate": 3.6125524109014676e-05, + "loss": 0.1387, + "step": 10590 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.3158419132232666, + "learning_rate": 3.61124213836478e-05, + "loss": 0.1267, + "step": 10600 + }, + { + "epoch": 0.5560796645702306, + "grad_norm": 1.954278588294983, + "learning_rate": 3.609931865828092e-05, + "loss": 0.1284, + "step": 10610 + }, + { + "epoch": 0.5566037735849056, + "grad_norm": 2.4319417476654053, + "learning_rate": 3.6086215932914046e-05, + "loss": 0.1373, + "step": 10620 + }, + { + "epoch": 0.5571278825995807, + "grad_norm": 2.4689600467681885, + "learning_rate": 3.607311320754717e-05, + "loss": 0.1314, + "step": 10630 + }, + { + "epoch": 0.5576519916142557, + "grad_norm": 1.7527562379837036, + "learning_rate": 3.606001048218029e-05, + "loss": 0.1118, + "step": 10640 + }, + { + "epoch": 0.5581761006289309, + "grad_norm": 1.3557283878326416, + "learning_rate": 3.604690775681342e-05, + "loss": 0.1176, + "step": 10650 + }, + { + "epoch": 0.5587002096436059, + "grad_norm": 1.7744745016098022, + "learning_rate": 3.6033805031446546e-05, + "loss": 0.1599, + "step": 10660 + }, + { + "epoch": 0.5592243186582809, + "grad_norm": 1.2975720167160034, + "learning_rate": 3.602070230607967e-05, + "loss": 0.1449, + "step": 10670 + }, + { + "epoch": 0.559748427672956, + "grad_norm": 1.860067367553711, + "learning_rate": 3.600759958071279e-05, + "loss": 0.1395, + "step": 10680 + }, + { + "epoch": 0.560272536687631, + "grad_norm": 1.673621416091919, + "learning_rate": 3.5994496855345916e-05, + "loss": 0.1146, + "step": 10690 + }, + { + "epoch": 0.560796645702306, + "grad_norm": 1.0844975709915161, + "learning_rate": 3.598139412997903e-05, + "loss": 0.123, + "step": 10700 + }, + { + "epoch": 0.5613207547169812, + "grad_norm": 1.365915060043335, + "learning_rate": 3.5968291404612156e-05, + "loss": 0.1111, + "step": 10710 + }, + { + "epoch": 0.5618448637316562, + "grad_norm": 1.3708051443099976, + "learning_rate": 3.595518867924528e-05, + "loss": 0.1357, + "step": 10720 + }, + { + "epoch": 0.5623689727463312, + "grad_norm": 1.8915815353393555, + "learning_rate": 3.594208595387841e-05, + "loss": 0.1265, + "step": 10730 + }, + { + "epoch": 0.5628930817610063, + "grad_norm": 1.4480313062667847, + "learning_rate": 3.592898322851153e-05, + "loss": 0.1442, + "step": 10740 + }, + { + "epoch": 0.5634171907756813, + "grad_norm": 2.1300652027130127, + "learning_rate": 3.591588050314466e-05, + "loss": 0.1274, + "step": 10750 + }, + { + "epoch": 0.5639412997903563, + "grad_norm": 1.2599786520004272, + "learning_rate": 3.590277777777778e-05, + "loss": 0.1179, + "step": 10760 + }, + { + "epoch": 0.5644654088050315, + "grad_norm": 2.037057638168335, + "learning_rate": 3.5889675052410903e-05, + "loss": 0.1286, + "step": 10770 + }, + { + "epoch": 0.5649895178197065, + "grad_norm": 0.6876270174980164, + "learning_rate": 3.587657232704403e-05, + "loss": 0.1271, + "step": 10780 + }, + { + "epoch": 0.5655136268343816, + "grad_norm": 2.1114234924316406, + "learning_rate": 3.586346960167715e-05, + "loss": 0.1341, + "step": 10790 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 2.157926082611084, + "learning_rate": 3.5850366876310274e-05, + "loss": 0.0956, + "step": 10800 + }, + { + "epoch": 0.5665618448637316, + "grad_norm": 0.7500534653663635, + "learning_rate": 3.58372641509434e-05, + "loss": 0.1256, + "step": 10810 + }, + { + "epoch": 0.5670859538784067, + "grad_norm": 2.3284528255462646, + "learning_rate": 3.582416142557652e-05, + "loss": 0.1552, + "step": 10820 + }, + { + "epoch": 0.5676100628930818, + "grad_norm": 1.4946759939193726, + "learning_rate": 3.5811058700209644e-05, + "loss": 0.1305, + "step": 10830 + }, + { + "epoch": 0.5681341719077568, + "grad_norm": 1.227433443069458, + "learning_rate": 3.579795597484277e-05, + "loss": 0.1475, + "step": 10840 + }, + { + "epoch": 0.5686582809224319, + "grad_norm": 1.4880372285842896, + "learning_rate": 3.578485324947589e-05, + "loss": 0.1095, + "step": 10850 + }, + { + "epoch": 0.5691823899371069, + "grad_norm": 1.418043613433838, + "learning_rate": 3.5771750524109014e-05, + "loss": 0.1169, + "step": 10860 + }, + { + "epoch": 0.5697064989517819, + "grad_norm": 1.149854302406311, + "learning_rate": 3.575864779874214e-05, + "loss": 0.1301, + "step": 10870 + }, + { + "epoch": 0.570230607966457, + "grad_norm": 1.978639841079712, + "learning_rate": 3.574554507337526e-05, + "loss": 0.1101, + "step": 10880 + }, + { + "epoch": 0.5707547169811321, + "grad_norm": 1.548750638961792, + "learning_rate": 3.573244234800839e-05, + "loss": 0.0986, + "step": 10890 + }, + { + "epoch": 0.5712788259958071, + "grad_norm": 1.443969488143921, + "learning_rate": 3.5719339622641514e-05, + "loss": 0.1083, + "step": 10900 + }, + { + "epoch": 0.5718029350104822, + "grad_norm": 1.9009684324264526, + "learning_rate": 3.570623689727464e-05, + "loss": 0.1396, + "step": 10910 + }, + { + "epoch": 0.5723270440251572, + "grad_norm": 1.0748625993728638, + "learning_rate": 3.569313417190776e-05, + "loss": 0.1182, + "step": 10920 + }, + { + "epoch": 0.5728511530398323, + "grad_norm": 1.075340747833252, + "learning_rate": 3.5680031446540884e-05, + "loss": 0.1138, + "step": 10930 + }, + { + "epoch": 0.5733752620545073, + "grad_norm": 0.9983365535736084, + "learning_rate": 3.5666928721174e-05, + "loss": 0.1029, + "step": 10940 + }, + { + "epoch": 0.5738993710691824, + "grad_norm": 2.230050802230835, + "learning_rate": 3.5653825995807124e-05, + "loss": 0.1359, + "step": 10950 + }, + { + "epoch": 0.5744234800838575, + "grad_norm": 1.0450794696807861, + "learning_rate": 3.5640723270440255e-05, + "loss": 0.0988, + "step": 10960 + }, + { + "epoch": 0.5749475890985325, + "grad_norm": 1.466576337814331, + "learning_rate": 3.562762054507338e-05, + "loss": 0.1257, + "step": 10970 + }, + { + "epoch": 0.5754716981132075, + "grad_norm": 2.0776283740997314, + "learning_rate": 3.56145178197065e-05, + "loss": 0.1116, + "step": 10980 + }, + { + "epoch": 0.5759958071278826, + "grad_norm": 1.320647120475769, + "learning_rate": 3.5601415094339625e-05, + "loss": 0.1229, + "step": 10990 + }, + { + "epoch": 0.5765199161425576, + "grad_norm": 1.7118204832077026, + "learning_rate": 3.558831236897275e-05, + "loss": 0.1438, + "step": 11000 + }, + { + "epoch": 0.5765199161425576, + "eval_loss": 0.28642794489860535, + "eval_runtime": 267.1324, + "eval_samples_per_second": 7.453, + "eval_steps_per_second": 1.243, + "step": 11000 + }, + { + "epoch": 0.5770440251572327, + "grad_norm": 0.9733932018280029, + "learning_rate": 3.557520964360587e-05, + "loss": 0.1234, + "step": 11010 + }, + { + "epoch": 0.5775681341719078, + "grad_norm": 1.7534810304641724, + "learning_rate": 3.5562106918238995e-05, + "loss": 0.1142, + "step": 11020 + }, + { + "epoch": 0.5780922431865828, + "grad_norm": 1.526125192642212, + "learning_rate": 3.554900419287212e-05, + "loss": 0.1058, + "step": 11030 + }, + { + "epoch": 0.5786163522012578, + "grad_norm": 1.795940637588501, + "learning_rate": 3.553590146750524e-05, + "loss": 0.1426, + "step": 11040 + }, + { + "epoch": 0.5791404612159329, + "grad_norm": 1.329445242881775, + "learning_rate": 3.552279874213837e-05, + "loss": 0.157, + "step": 11050 + }, + { + "epoch": 0.5796645702306079, + "grad_norm": 2.718548059463501, + "learning_rate": 3.550969601677149e-05, + "loss": 0.1365, + "step": 11060 + }, + { + "epoch": 0.5801886792452831, + "grad_norm": 1.8392747640609741, + "learning_rate": 3.549659329140461e-05, + "loss": 0.1152, + "step": 11070 + }, + { + "epoch": 0.5807127882599581, + "grad_norm": 1.6081047058105469, + "learning_rate": 3.5483490566037735e-05, + "loss": 0.1195, + "step": 11080 + }, + { + "epoch": 0.5812368972746331, + "grad_norm": 1.4167786836624146, + "learning_rate": 3.547038784067086e-05, + "loss": 0.1254, + "step": 11090 + }, + { + "epoch": 0.5817610062893082, + "grad_norm": 1.563456416130066, + "learning_rate": 3.545728511530398e-05, + "loss": 0.1209, + "step": 11100 + }, + { + "epoch": 0.5822851153039832, + "grad_norm": 2.8495681285858154, + "learning_rate": 3.5444182389937105e-05, + "loss": 0.1335, + "step": 11110 + }, + { + "epoch": 0.5828092243186582, + "grad_norm": 2.373828649520874, + "learning_rate": 3.5431079664570236e-05, + "loss": 0.1343, + "step": 11120 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 1.545283555984497, + "learning_rate": 3.541797693920336e-05, + "loss": 0.1264, + "step": 11130 + }, + { + "epoch": 0.5838574423480084, + "grad_norm": 1.7932777404785156, + "learning_rate": 3.540487421383648e-05, + "loss": 0.1819, + "step": 11140 + }, + { + "epoch": 0.5843815513626834, + "grad_norm": 1.548215389251709, + "learning_rate": 3.5391771488469606e-05, + "loss": 0.1277, + "step": 11150 + }, + { + "epoch": 0.5849056603773585, + "grad_norm": 1.0522581338882446, + "learning_rate": 3.537866876310273e-05, + "loss": 0.1297, + "step": 11160 + }, + { + "epoch": 0.5854297693920335, + "grad_norm": 1.84707510471344, + "learning_rate": 3.536556603773585e-05, + "loss": 0.1104, + "step": 11170 + }, + { + "epoch": 0.5859538784067087, + "grad_norm": 1.651559829711914, + "learning_rate": 3.535246331236897e-05, + "loss": 0.1137, + "step": 11180 + }, + { + "epoch": 0.5864779874213837, + "grad_norm": 1.496799111366272, + "learning_rate": 3.53393605870021e-05, + "loss": 0.1152, + "step": 11190 + }, + { + "epoch": 0.5870020964360587, + "grad_norm": 1.6322681903839111, + "learning_rate": 3.532625786163522e-05, + "loss": 0.1173, + "step": 11200 + }, + { + "epoch": 0.5875262054507338, + "grad_norm": 1.550951361656189, + "learning_rate": 3.5313155136268346e-05, + "loss": 0.1068, + "step": 11210 + }, + { + "epoch": 0.5880503144654088, + "grad_norm": 2.4538066387176514, + "learning_rate": 3.530005241090147e-05, + "loss": 0.1406, + "step": 11220 + }, + { + "epoch": 0.5885744234800838, + "grad_norm": 1.4006059169769287, + "learning_rate": 3.528694968553459e-05, + "loss": 0.1141, + "step": 11230 + }, + { + "epoch": 0.589098532494759, + "grad_norm": 1.2701703310012817, + "learning_rate": 3.5273846960167716e-05, + "loss": 0.1179, + "step": 11240 + }, + { + "epoch": 0.589622641509434, + "grad_norm": 1.4434823989868164, + "learning_rate": 3.526074423480084e-05, + "loss": 0.1044, + "step": 11250 + }, + { + "epoch": 0.590146750524109, + "grad_norm": 2.028015613555908, + "learning_rate": 3.524764150943396e-05, + "loss": 0.1306, + "step": 11260 + }, + { + "epoch": 0.5906708595387841, + "grad_norm": 1.5259170532226562, + "learning_rate": 3.5234538784067086e-05, + "loss": 0.0971, + "step": 11270 + }, + { + "epoch": 0.5911949685534591, + "grad_norm": 1.5181983709335327, + "learning_rate": 3.5221436058700216e-05, + "loss": 0.1125, + "step": 11280 + }, + { + "epoch": 0.5917190775681341, + "grad_norm": 1.5904532670974731, + "learning_rate": 3.520833333333334e-05, + "loss": 0.141, + "step": 11290 + }, + { + "epoch": 0.5922431865828093, + "grad_norm": 1.1001319885253906, + "learning_rate": 3.5195230607966456e-05, + "loss": 0.0911, + "step": 11300 + }, + { + "epoch": 0.5927672955974843, + "grad_norm": 1.2055481672286987, + "learning_rate": 3.518212788259958e-05, + "loss": 0.1525, + "step": 11310 + }, + { + "epoch": 0.5932914046121593, + "grad_norm": 1.7501437664031982, + "learning_rate": 3.51690251572327e-05, + "loss": 0.1298, + "step": 11320 + }, + { + "epoch": 0.5938155136268344, + "grad_norm": 1.5523266792297363, + "learning_rate": 3.515592243186583e-05, + "loss": 0.1222, + "step": 11330 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 1.746579647064209, + "learning_rate": 3.514281970649895e-05, + "loss": 0.1188, + "step": 11340 + }, + { + "epoch": 0.5948637316561844, + "grad_norm": 2.078474283218384, + "learning_rate": 3.512971698113208e-05, + "loss": 0.1608, + "step": 11350 + }, + { + "epoch": 0.5953878406708596, + "grad_norm": 1.7152198553085327, + "learning_rate": 3.5116614255765204e-05, + "loss": 0.1422, + "step": 11360 + }, + { + "epoch": 0.5959119496855346, + "grad_norm": 3.1155312061309814, + "learning_rate": 3.510351153039833e-05, + "loss": 0.1311, + "step": 11370 + }, + { + "epoch": 0.5964360587002097, + "grad_norm": 2.6921157836914062, + "learning_rate": 3.509040880503145e-05, + "loss": 0.1245, + "step": 11380 + }, + { + "epoch": 0.5969601677148847, + "grad_norm": 1.7192963361740112, + "learning_rate": 3.5077306079664574e-05, + "loss": 0.1133, + "step": 11390 + }, + { + "epoch": 0.5974842767295597, + "grad_norm": 2.002624273300171, + "learning_rate": 3.50642033542977e-05, + "loss": 0.1914, + "step": 11400 + }, + { + "epoch": 0.5980083857442348, + "grad_norm": 2.982755184173584, + "learning_rate": 3.505110062893082e-05, + "loss": 0.1263, + "step": 11410 + }, + { + "epoch": 0.5985324947589099, + "grad_norm": 1.5957186222076416, + "learning_rate": 3.5037997903563944e-05, + "loss": 0.1042, + "step": 11420 + }, + { + "epoch": 0.5990566037735849, + "grad_norm": 1.106004238128662, + "learning_rate": 3.502489517819707e-05, + "loss": 0.1075, + "step": 11430 + }, + { + "epoch": 0.59958071278826, + "grad_norm": 0.960927426815033, + "learning_rate": 3.501179245283019e-05, + "loss": 0.1113, + "step": 11440 + }, + { + "epoch": 0.600104821802935, + "grad_norm": 1.5358039140701294, + "learning_rate": 3.4998689727463314e-05, + "loss": 0.1175, + "step": 11450 + }, + { + "epoch": 0.60062893081761, + "grad_norm": 2.1409425735473633, + "learning_rate": 3.498558700209644e-05, + "loss": 0.1279, + "step": 11460 + }, + { + "epoch": 0.6011530398322851, + "grad_norm": 1.1909631490707397, + "learning_rate": 3.497248427672956e-05, + "loss": 0.1417, + "step": 11470 + }, + { + "epoch": 0.6016771488469602, + "grad_norm": 1.6865551471710205, + "learning_rate": 3.4959381551362684e-05, + "loss": 0.1386, + "step": 11480 + }, + { + "epoch": 0.6022012578616353, + "grad_norm": 1.3004859685897827, + "learning_rate": 3.494627882599581e-05, + "loss": 0.147, + "step": 11490 + }, + { + "epoch": 0.6027253668763103, + "grad_norm": 1.1438429355621338, + "learning_rate": 3.493317610062893e-05, + "loss": 0.1137, + "step": 11500 + }, + { + "epoch": 0.6032494758909853, + "grad_norm": 1.17679762840271, + "learning_rate": 3.492007337526206e-05, + "loss": 0.1251, + "step": 11510 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 1.6681784391403198, + "learning_rate": 3.4906970649895184e-05, + "loss": 0.1186, + "step": 11520 + }, + { + "epoch": 0.6042976939203354, + "grad_norm": 1.6447416543960571, + "learning_rate": 3.489386792452831e-05, + "loss": 0.1433, + "step": 11530 + }, + { + "epoch": 0.6048218029350105, + "grad_norm": 1.9654818773269653, + "learning_rate": 3.4880765199161424e-05, + "loss": 0.1179, + "step": 11540 + }, + { + "epoch": 0.6053459119496856, + "grad_norm": 1.8266123533248901, + "learning_rate": 3.486766247379455e-05, + "loss": 0.1201, + "step": 11550 + }, + { + "epoch": 0.6058700209643606, + "grad_norm": 1.8127845525741577, + "learning_rate": 3.485455974842767e-05, + "loss": 0.12, + "step": 11560 + }, + { + "epoch": 0.6063941299790356, + "grad_norm": 1.7763968706130981, + "learning_rate": 3.4841457023060795e-05, + "loss": 0.1157, + "step": 11570 + }, + { + "epoch": 0.6069182389937107, + "grad_norm": 1.6913458108901978, + "learning_rate": 3.4828354297693925e-05, + "loss": 0.136, + "step": 11580 + }, + { + "epoch": 0.6074423480083857, + "grad_norm": 2.5830845832824707, + "learning_rate": 3.481525157232705e-05, + "loss": 0.1056, + "step": 11590 + }, + { + "epoch": 0.6079664570230608, + "grad_norm": 2.0478758811950684, + "learning_rate": 3.480214884696017e-05, + "loss": 0.154, + "step": 11600 + }, + { + "epoch": 0.6084905660377359, + "grad_norm": 1.382096529006958, + "learning_rate": 3.4789046121593295e-05, + "loss": 0.1139, + "step": 11610 + }, + { + "epoch": 0.6090146750524109, + "grad_norm": 1.7384464740753174, + "learning_rate": 3.477594339622642e-05, + "loss": 0.1505, + "step": 11620 + }, + { + "epoch": 0.609538784067086, + "grad_norm": 0.7147114276885986, + "learning_rate": 3.476284067085954e-05, + "loss": 0.1231, + "step": 11630 + }, + { + "epoch": 0.610062893081761, + "grad_norm": 1.229019284248352, + "learning_rate": 3.4749737945492665e-05, + "loss": 0.1408, + "step": 11640 + }, + { + "epoch": 0.610587002096436, + "grad_norm": 1.091243028640747, + "learning_rate": 3.473663522012579e-05, + "loss": 0.094, + "step": 11650 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 1.5786937475204468, + "learning_rate": 3.472353249475891e-05, + "loss": 0.1862, + "step": 11660 + }, + { + "epoch": 0.6116352201257862, + "grad_norm": 1.2560791969299316, + "learning_rate": 3.4710429769392035e-05, + "loss": 0.1067, + "step": 11670 + }, + { + "epoch": 0.6121593291404612, + "grad_norm": 1.329825520515442, + "learning_rate": 3.469732704402516e-05, + "loss": 0.1226, + "step": 11680 + }, + { + "epoch": 0.6126834381551363, + "grad_norm": 2.251732349395752, + "learning_rate": 3.468422431865828e-05, + "loss": 0.144, + "step": 11690 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 1.7934879064559937, + "learning_rate": 3.4671121593291405e-05, + "loss": 0.1501, + "step": 11700 + }, + { + "epoch": 0.6137316561844863, + "grad_norm": 1.642490267753601, + "learning_rate": 3.465801886792453e-05, + "loss": 0.126, + "step": 11710 + }, + { + "epoch": 0.6142557651991615, + "grad_norm": 1.3452117443084717, + "learning_rate": 3.464491614255765e-05, + "loss": 0.1075, + "step": 11720 + }, + { + "epoch": 0.6147798742138365, + "grad_norm": 1.608151912689209, + "learning_rate": 3.4631813417190776e-05, + "loss": 0.1224, + "step": 11730 + }, + { + "epoch": 0.6153039832285115, + "grad_norm": 0.9076985120773315, + "learning_rate": 3.4618710691823906e-05, + "loss": 0.1444, + "step": 11740 + }, + { + "epoch": 0.6158280922431866, + "grad_norm": 0.8261033296585083, + "learning_rate": 3.460560796645703e-05, + "loss": 0.119, + "step": 11750 + }, + { + "epoch": 0.6163522012578616, + "grad_norm": 2.104185104370117, + "learning_rate": 3.459250524109015e-05, + "loss": 0.1415, + "step": 11760 + }, + { + "epoch": 0.6168763102725366, + "grad_norm": 0.9446232318878174, + "learning_rate": 3.4579402515723276e-05, + "loss": 0.1156, + "step": 11770 + }, + { + "epoch": 0.6174004192872118, + "grad_norm": 2.3821616172790527, + "learning_rate": 3.456629979035639e-05, + "loss": 0.1542, + "step": 11780 + }, + { + "epoch": 0.6179245283018868, + "grad_norm": 2.2270455360412598, + "learning_rate": 3.4553197064989516e-05, + "loss": 0.1382, + "step": 11790 + }, + { + "epoch": 0.6184486373165619, + "grad_norm": 2.193085193634033, + "learning_rate": 3.454009433962264e-05, + "loss": 0.1374, + "step": 11800 + }, + { + "epoch": 0.6189727463312369, + "grad_norm": 1.7484729290008545, + "learning_rate": 3.452699161425576e-05, + "loss": 0.1238, + "step": 11810 + }, + { + "epoch": 0.6194968553459119, + "grad_norm": 2.2514584064483643, + "learning_rate": 3.451388888888889e-05, + "loss": 0.1478, + "step": 11820 + }, + { + "epoch": 0.620020964360587, + "grad_norm": 1.5764837265014648, + "learning_rate": 3.4500786163522016e-05, + "loss": 0.1077, + "step": 11830 + }, + { + "epoch": 0.6205450733752621, + "grad_norm": 1.4490690231323242, + "learning_rate": 3.448768343815514e-05, + "loss": 0.1195, + "step": 11840 + }, + { + "epoch": 0.6210691823899371, + "grad_norm": 1.7251434326171875, + "learning_rate": 3.447458071278826e-05, + "loss": 0.1091, + "step": 11850 + }, + { + "epoch": 0.6215932914046122, + "grad_norm": 1.4244098663330078, + "learning_rate": 3.4461477987421386e-05, + "loss": 0.1146, + "step": 11860 + }, + { + "epoch": 0.6221174004192872, + "grad_norm": 0.8161481022834778, + "learning_rate": 3.444837526205451e-05, + "loss": 0.129, + "step": 11870 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 1.3293105363845825, + "learning_rate": 3.443527253668763e-05, + "loss": 0.1029, + "step": 11880 + }, + { + "epoch": 0.6231656184486373, + "grad_norm": 13.429025650024414, + "learning_rate": 3.4422169811320757e-05, + "loss": 0.1325, + "step": 11890 + }, + { + "epoch": 0.6236897274633124, + "grad_norm": 1.104237675666809, + "learning_rate": 3.440906708595388e-05, + "loss": 0.1125, + "step": 11900 + }, + { + "epoch": 0.6242138364779874, + "grad_norm": 1.6588174104690552, + "learning_rate": 3.4395964360587e-05, + "loss": 0.1143, + "step": 11910 + }, + { + "epoch": 0.6247379454926625, + "grad_norm": 2.6823654174804688, + "learning_rate": 3.438286163522013e-05, + "loss": 0.1536, + "step": 11920 + }, + { + "epoch": 0.6252620545073375, + "grad_norm": 1.737794280052185, + "learning_rate": 3.436975890985325e-05, + "loss": 0.1477, + "step": 11930 + }, + { + "epoch": 0.6257861635220126, + "grad_norm": 1.0251657962799072, + "learning_rate": 3.4356656184486373e-05, + "loss": 0.1084, + "step": 11940 + }, + { + "epoch": 0.6263102725366876, + "grad_norm": 2.0638644695281982, + "learning_rate": 3.43435534591195e-05, + "loss": 0.1675, + "step": 11950 + }, + { + "epoch": 0.6268343815513627, + "grad_norm": 1.1078050136566162, + "learning_rate": 3.433045073375262e-05, + "loss": 0.1051, + "step": 11960 + }, + { + "epoch": 0.6273584905660378, + "grad_norm": 1.5060793161392212, + "learning_rate": 3.4317348008385744e-05, + "loss": 0.1129, + "step": 11970 + }, + { + "epoch": 0.6278825995807128, + "grad_norm": 1.2375575304031372, + "learning_rate": 3.4304245283018874e-05, + "loss": 0.1262, + "step": 11980 + }, + { + "epoch": 0.6284067085953878, + "grad_norm": 0.9460673332214355, + "learning_rate": 3.4291142557652e-05, + "loss": 0.1568, + "step": 11990 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 2.6357972621917725, + "learning_rate": 3.427803983228512e-05, + "loss": 0.1254, + "step": 12000 + }, + { + "epoch": 0.6289308176100629, + "eval_loss": 0.2810102105140686, + "eval_runtime": 267.5223, + "eval_samples_per_second": 7.442, + "eval_steps_per_second": 1.241, + "step": 12000 + }, + { + "epoch": 0.6294549266247379, + "grad_norm": 1.6299917697906494, + "learning_rate": 3.426493710691824e-05, + "loss": 0.1376, + "step": 12010 + }, + { + "epoch": 0.629979035639413, + "grad_norm": 1.9334322214126587, + "learning_rate": 3.425183438155136e-05, + "loss": 0.0876, + "step": 12020 + }, + { + "epoch": 0.6305031446540881, + "grad_norm": 1.3271043300628662, + "learning_rate": 3.4238731656184484e-05, + "loss": 0.1258, + "step": 12030 + }, + { + "epoch": 0.6310272536687631, + "grad_norm": 1.4171603918075562, + "learning_rate": 3.422562893081761e-05, + "loss": 0.1236, + "step": 12040 + }, + { + "epoch": 0.6315513626834381, + "grad_norm": 2.808084011077881, + "learning_rate": 3.421252620545074e-05, + "loss": 0.1272, + "step": 12050 + }, + { + "epoch": 0.6320754716981132, + "grad_norm": 1.4960280656814575, + "learning_rate": 3.419942348008386e-05, + "loss": 0.1629, + "step": 12060 + }, + { + "epoch": 0.6325995807127882, + "grad_norm": 2.7985665798187256, + "learning_rate": 3.4186320754716984e-05, + "loss": 0.1066, + "step": 12070 + }, + { + "epoch": 0.6331236897274634, + "grad_norm": 0.7567781805992126, + "learning_rate": 3.417321802935011e-05, + "loss": 0.0965, + "step": 12080 + }, + { + "epoch": 0.6336477987421384, + "grad_norm": 1.7532941102981567, + "learning_rate": 3.416011530398323e-05, + "loss": 0.1472, + "step": 12090 + }, + { + "epoch": 0.6341719077568134, + "grad_norm": 1.5983638763427734, + "learning_rate": 3.4147012578616354e-05, + "loss": 0.1119, + "step": 12100 + }, + { + "epoch": 0.6346960167714885, + "grad_norm": 1.1211780309677124, + "learning_rate": 3.413390985324948e-05, + "loss": 0.1433, + "step": 12110 + }, + { + "epoch": 0.6352201257861635, + "grad_norm": 1.1015443801879883, + "learning_rate": 3.41208071278826e-05, + "loss": 0.1333, + "step": 12120 + }, + { + "epoch": 0.6357442348008385, + "grad_norm": 2.756619453430176, + "learning_rate": 3.4107704402515725e-05, + "loss": 0.1155, + "step": 12130 + }, + { + "epoch": 0.6362683438155137, + "grad_norm": 1.5897578001022339, + "learning_rate": 3.409460167714885e-05, + "loss": 0.112, + "step": 12140 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 1.2167946100234985, + "learning_rate": 3.408149895178197e-05, + "loss": 0.1093, + "step": 12150 + }, + { + "epoch": 0.6373165618448637, + "grad_norm": 1.0480616092681885, + "learning_rate": 3.4068396226415095e-05, + "loss": 0.1167, + "step": 12160 + }, + { + "epoch": 0.6378406708595388, + "grad_norm": 1.782877802848816, + "learning_rate": 3.405529350104822e-05, + "loss": 0.1219, + "step": 12170 + }, + { + "epoch": 0.6383647798742138, + "grad_norm": 2.3879051208496094, + "learning_rate": 3.404219077568134e-05, + "loss": 0.114, + "step": 12180 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 1.9370415210723877, + "learning_rate": 3.4029088050314465e-05, + "loss": 0.1186, + "step": 12190 + }, + { + "epoch": 0.639412997903564, + "grad_norm": 1.4608991146087646, + "learning_rate": 3.401598532494759e-05, + "loss": 0.1196, + "step": 12200 + }, + { + "epoch": 0.639937106918239, + "grad_norm": 2.2498528957366943, + "learning_rate": 3.400288259958072e-05, + "loss": 0.125, + "step": 12210 + }, + { + "epoch": 0.640461215932914, + "grad_norm": 2.014331579208374, + "learning_rate": 3.398977987421384e-05, + "loss": 0.1699, + "step": 12220 + }, + { + "epoch": 0.6409853249475891, + "grad_norm": 1.862320065498352, + "learning_rate": 3.3976677148846965e-05, + "loss": 0.1182, + "step": 12230 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.9242523312568665, + "learning_rate": 3.396357442348009e-05, + "loss": 0.1136, + "step": 12240 + }, + { + "epoch": 0.6420335429769392, + "grad_norm": 1.7910327911376953, + "learning_rate": 3.3950471698113205e-05, + "loss": 0.1173, + "step": 12250 + }, + { + "epoch": 0.6425576519916143, + "grad_norm": 1.989595651626587, + "learning_rate": 3.393736897274633e-05, + "loss": 0.1306, + "step": 12260 + }, + { + "epoch": 0.6430817610062893, + "grad_norm": 1.5238025188446045, + "learning_rate": 3.392426624737945e-05, + "loss": 0.1331, + "step": 12270 + }, + { + "epoch": 0.6436058700209644, + "grad_norm": 3.9870452880859375, + "learning_rate": 3.391116352201258e-05, + "loss": 0.129, + "step": 12280 + }, + { + "epoch": 0.6441299790356394, + "grad_norm": 1.6588159799575806, + "learning_rate": 3.3898060796645705e-05, + "loss": 0.1239, + "step": 12290 + }, + { + "epoch": 0.6446540880503144, + "grad_norm": 5.81835412979126, + "learning_rate": 3.388495807127883e-05, + "loss": 0.1182, + "step": 12300 + }, + { + "epoch": 0.6451781970649895, + "grad_norm": 1.9480522871017456, + "learning_rate": 3.387185534591195e-05, + "loss": 0.1077, + "step": 12310 + }, + { + "epoch": 0.6457023060796646, + "grad_norm": 1.4687237739562988, + "learning_rate": 3.3858752620545076e-05, + "loss": 0.1014, + "step": 12320 + }, + { + "epoch": 0.6462264150943396, + "grad_norm": 1.8134074211120605, + "learning_rate": 3.38456498951782e-05, + "loss": 0.0997, + "step": 12330 + }, + { + "epoch": 0.6467505241090147, + "grad_norm": 0.7567766308784485, + "learning_rate": 3.383254716981132e-05, + "loss": 0.1545, + "step": 12340 + }, + { + "epoch": 0.6472746331236897, + "grad_norm": 1.324652075767517, + "learning_rate": 3.3819444444444446e-05, + "loss": 0.1063, + "step": 12350 + }, + { + "epoch": 0.6477987421383647, + "grad_norm": 1.9718384742736816, + "learning_rate": 3.380634171907757e-05, + "loss": 0.1065, + "step": 12360 + }, + { + "epoch": 0.6483228511530398, + "grad_norm": 1.6241679191589355, + "learning_rate": 3.379323899371069e-05, + "loss": 0.1323, + "step": 12370 + }, + { + "epoch": 0.6488469601677149, + "grad_norm": 1.7419811487197876, + "learning_rate": 3.3780136268343816e-05, + "loss": 0.1247, + "step": 12380 + }, + { + "epoch": 0.64937106918239, + "grad_norm": 2.216460943222046, + "learning_rate": 3.376703354297694e-05, + "loss": 0.0928, + "step": 12390 + }, + { + "epoch": 0.649895178197065, + "grad_norm": 1.5031715631484985, + "learning_rate": 3.375393081761006e-05, + "loss": 0.1595, + "step": 12400 + }, + { + "epoch": 0.65041928721174, + "grad_norm": 1.9624732732772827, + "learning_rate": 3.3740828092243186e-05, + "loss": 0.1122, + "step": 12410 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 1.3125239610671997, + "learning_rate": 3.372772536687631e-05, + "loss": 0.1352, + "step": 12420 + }, + { + "epoch": 0.6514675052410901, + "grad_norm": 2.7664361000061035, + "learning_rate": 3.371462264150943e-05, + "loss": 0.1173, + "step": 12430 + }, + { + "epoch": 0.6519916142557652, + "grad_norm": 1.2956434488296509, + "learning_rate": 3.370151991614256e-05, + "loss": 0.1383, + "step": 12440 + }, + { + "epoch": 0.6525157232704403, + "grad_norm": 1.1372560262680054, + "learning_rate": 3.3688417190775686e-05, + "loss": 0.1002, + "step": 12450 + }, + { + "epoch": 0.6530398322851153, + "grad_norm": 4.1682353019714355, + "learning_rate": 3.367531446540881e-05, + "loss": 0.1352, + "step": 12460 + }, + { + "epoch": 0.6535639412997903, + "grad_norm": 1.243196964263916, + "learning_rate": 3.366221174004193e-05, + "loss": 0.1224, + "step": 12470 + }, + { + "epoch": 0.6540880503144654, + "grad_norm": 1.4892295598983765, + "learning_rate": 3.3649109014675057e-05, + "loss": 0.1316, + "step": 12480 + }, + { + "epoch": 0.6546121593291404, + "grad_norm": 1.3253917694091797, + "learning_rate": 3.363600628930817e-05, + "loss": 0.1142, + "step": 12490 + }, + { + "epoch": 0.6551362683438156, + "grad_norm": 1.1935123205184937, + "learning_rate": 3.3622903563941297e-05, + "loss": 0.1323, + "step": 12500 + }, + { + "epoch": 0.6556603773584906, + "grad_norm": 1.5510022640228271, + "learning_rate": 3.360980083857443e-05, + "loss": 0.1209, + "step": 12510 + }, + { + "epoch": 0.6561844863731656, + "grad_norm": 2.739384174346924, + "learning_rate": 3.359669811320755e-05, + "loss": 0.086, + "step": 12520 + }, + { + "epoch": 0.6567085953878407, + "grad_norm": 1.8388934135437012, + "learning_rate": 3.3583595387840674e-05, + "loss": 0.1352, + "step": 12530 + }, + { + "epoch": 0.6572327044025157, + "grad_norm": 1.7754712104797363, + "learning_rate": 3.35704926624738e-05, + "loss": 0.1606, + "step": 12540 + }, + { + "epoch": 0.6577568134171907, + "grad_norm": 1.7802470922470093, + "learning_rate": 3.355738993710692e-05, + "loss": 0.1453, + "step": 12550 + }, + { + "epoch": 0.6582809224318659, + "grad_norm": 2.5520379543304443, + "learning_rate": 3.3544287211740044e-05, + "loss": 0.1198, + "step": 12560 + }, + { + "epoch": 0.6588050314465409, + "grad_norm": 1.5761022567749023, + "learning_rate": 3.353118448637317e-05, + "loss": 0.1368, + "step": 12570 + }, + { + "epoch": 0.6593291404612159, + "grad_norm": 1.0065501928329468, + "learning_rate": 3.351808176100629e-05, + "loss": 0.135, + "step": 12580 + }, + { + "epoch": 0.659853249475891, + "grad_norm": 1.7206062078475952, + "learning_rate": 3.3504979035639414e-05, + "loss": 0.107, + "step": 12590 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 1.093377947807312, + "learning_rate": 3.3491876310272544e-05, + "loss": 0.1611, + "step": 12600 + }, + { + "epoch": 0.660901467505241, + "grad_norm": 1.6093024015426636, + "learning_rate": 3.347877358490566e-05, + "loss": 0.1176, + "step": 12610 + }, + { + "epoch": 0.6614255765199162, + "grad_norm": 0.8911046981811523, + "learning_rate": 3.3465670859538784e-05, + "loss": 0.1103, + "step": 12620 + }, + { + "epoch": 0.6619496855345912, + "grad_norm": 2.1810872554779053, + "learning_rate": 3.345256813417191e-05, + "loss": 0.1163, + "step": 12630 + }, + { + "epoch": 0.6624737945492662, + "grad_norm": 1.4465842247009277, + "learning_rate": 3.343946540880503e-05, + "loss": 0.1093, + "step": 12640 + }, + { + "epoch": 0.6629979035639413, + "grad_norm": 1.8797701597213745, + "learning_rate": 3.3426362683438154e-05, + "loss": 0.1508, + "step": 12650 + }, + { + "epoch": 0.6635220125786163, + "grad_norm": 2.4389729499816895, + "learning_rate": 3.341325995807128e-05, + "loss": 0.1191, + "step": 12660 + }, + { + "epoch": 0.6640461215932913, + "grad_norm": 1.7194629907608032, + "learning_rate": 3.340015723270441e-05, + "loss": 0.1318, + "step": 12670 + }, + { + "epoch": 0.6645702306079665, + "grad_norm": 1.6836072206497192, + "learning_rate": 3.338705450733753e-05, + "loss": 0.1239, + "step": 12680 + }, + { + "epoch": 0.6650943396226415, + "grad_norm": 0.973906934261322, + "learning_rate": 3.3373951781970654e-05, + "loss": 0.1049, + "step": 12690 + }, + { + "epoch": 0.6656184486373166, + "grad_norm": 2.0951108932495117, + "learning_rate": 3.336084905660378e-05, + "loss": 0.1372, + "step": 12700 + }, + { + "epoch": 0.6661425576519916, + "grad_norm": 1.2338130474090576, + "learning_rate": 3.33477463312369e-05, + "loss": 0.0986, + "step": 12710 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.3110014200210571, + "learning_rate": 3.3334643605870025e-05, + "loss": 0.1211, + "step": 12720 + }, + { + "epoch": 0.6671907756813418, + "grad_norm": 1.1576558351516724, + "learning_rate": 3.332154088050314e-05, + "loss": 0.1298, + "step": 12730 + }, + { + "epoch": 0.6677148846960168, + "grad_norm": 1.0870299339294434, + "learning_rate": 3.3308438155136265e-05, + "loss": 0.1313, + "step": 12740 + }, + { + "epoch": 0.6682389937106918, + "grad_norm": 1.5776509046554565, + "learning_rate": 3.3295335429769395e-05, + "loss": 0.0887, + "step": 12750 + }, + { + "epoch": 0.6687631027253669, + "grad_norm": 1.6563470363616943, + "learning_rate": 3.328223270440252e-05, + "loss": 0.1193, + "step": 12760 + }, + { + "epoch": 0.6692872117400419, + "grad_norm": 1.5006121397018433, + "learning_rate": 3.326912997903564e-05, + "loss": 0.1447, + "step": 12770 + }, + { + "epoch": 0.6698113207547169, + "grad_norm": 2.2964420318603516, + "learning_rate": 3.3256027253668765e-05, + "loss": 0.1217, + "step": 12780 + }, + { + "epoch": 0.6703354297693921, + "grad_norm": 1.14298677444458, + "learning_rate": 3.324292452830189e-05, + "loss": 0.1285, + "step": 12790 + }, + { + "epoch": 0.6708595387840671, + "grad_norm": 1.4093611240386963, + "learning_rate": 3.322982180293501e-05, + "loss": 0.1114, + "step": 12800 + }, + { + "epoch": 0.6713836477987422, + "grad_norm": 5.533634662628174, + "learning_rate": 3.3216719077568135e-05, + "loss": 0.1171, + "step": 12810 + }, + { + "epoch": 0.6719077568134172, + "grad_norm": 2.222405433654785, + "learning_rate": 3.320361635220126e-05, + "loss": 0.1288, + "step": 12820 + }, + { + "epoch": 0.6724318658280922, + "grad_norm": 1.3084903955459595, + "learning_rate": 3.319051362683439e-05, + "loss": 0.1397, + "step": 12830 + }, + { + "epoch": 0.6729559748427673, + "grad_norm": 1.6426416635513306, + "learning_rate": 3.317741090146751e-05, + "loss": 0.1266, + "step": 12840 + }, + { + "epoch": 0.6734800838574424, + "grad_norm": 0.9996996521949768, + "learning_rate": 3.316430817610063e-05, + "loss": 0.1363, + "step": 12850 + }, + { + "epoch": 0.6740041928721174, + "grad_norm": 0.8206360936164856, + "learning_rate": 3.315120545073375e-05, + "loss": 0.1161, + "step": 12860 + }, + { + "epoch": 0.6745283018867925, + "grad_norm": 2.556631326675415, + "learning_rate": 3.3138102725366875e-05, + "loss": 0.1186, + "step": 12870 + }, + { + "epoch": 0.6750524109014675, + "grad_norm": 1.9859380722045898, + "learning_rate": 3.3125e-05, + "loss": 0.1142, + "step": 12880 + }, + { + "epoch": 0.6755765199161425, + "grad_norm": 2.254906177520752, + "learning_rate": 3.311189727463312e-05, + "loss": 0.1166, + "step": 12890 + }, + { + "epoch": 0.6761006289308176, + "grad_norm": 2.126537799835205, + "learning_rate": 3.3098794549266246e-05, + "loss": 0.0938, + "step": 12900 + }, + { + "epoch": 0.6766247379454927, + "grad_norm": 1.368828535079956, + "learning_rate": 3.3085691823899376e-05, + "loss": 0.1353, + "step": 12910 + }, + { + "epoch": 0.6771488469601677, + "grad_norm": 1.1199910640716553, + "learning_rate": 3.30725890985325e-05, + "loss": 0.1056, + "step": 12920 + }, + { + "epoch": 0.6776729559748428, + "grad_norm": 2.5403850078582764, + "learning_rate": 3.305948637316562e-05, + "loss": 0.1151, + "step": 12930 + }, + { + "epoch": 0.6781970649895178, + "grad_norm": 5.520144939422607, + "learning_rate": 3.3046383647798746e-05, + "loss": 0.1185, + "step": 12940 + }, + { + "epoch": 0.6787211740041929, + "grad_norm": 2.1471378803253174, + "learning_rate": 3.303328092243187e-05, + "loss": 0.1044, + "step": 12950 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 1.0975602865219116, + "learning_rate": 3.302017819706499e-05, + "loss": 0.1319, + "step": 12960 + }, + { + "epoch": 0.679769392033543, + "grad_norm": 1.5250160694122314, + "learning_rate": 3.300707547169811e-05, + "loss": 0.1387, + "step": 12970 + }, + { + "epoch": 0.6802935010482181, + "grad_norm": 1.1736054420471191, + "learning_rate": 3.299397274633124e-05, + "loss": 0.1229, + "step": 12980 + }, + { + "epoch": 0.6808176100628931, + "grad_norm": 1.3668029308319092, + "learning_rate": 3.298087002096436e-05, + "loss": 0.1206, + "step": 12990 + }, + { + "epoch": 0.6813417190775681, + "grad_norm": 1.4439727067947388, + "learning_rate": 3.2967767295597486e-05, + "loss": 0.1131, + "step": 13000 + }, + { + "epoch": 0.6813417190775681, + "eval_loss": 0.28294411301612854, + "eval_runtime": 267.3683, + "eval_samples_per_second": 7.447, + "eval_steps_per_second": 1.242, + "step": 13000 + }, + { + "epoch": 0.6818658280922432, + "grad_norm": 0.6069740056991577, + "learning_rate": 3.295466457023061e-05, + "loss": 0.148, + "step": 13010 + }, + { + "epoch": 0.6823899371069182, + "grad_norm": 3.8916685581207275, + "learning_rate": 3.294156184486373e-05, + "loss": 0.1325, + "step": 13020 + }, + { + "epoch": 0.6829140461215933, + "grad_norm": 0.8557778596878052, + "learning_rate": 3.2928459119496856e-05, + "loss": 0.1107, + "step": 13030 + }, + { + "epoch": 0.6834381551362684, + "grad_norm": 1.2309248447418213, + "learning_rate": 3.291535639412998e-05, + "loss": 0.1091, + "step": 13040 + }, + { + "epoch": 0.6839622641509434, + "grad_norm": 0.9375829100608826, + "learning_rate": 3.29022536687631e-05, + "loss": 0.1033, + "step": 13050 + }, + { + "epoch": 0.6844863731656184, + "grad_norm": 1.7269313335418701, + "learning_rate": 3.2889150943396227e-05, + "loss": 0.1076, + "step": 13060 + }, + { + "epoch": 0.6850104821802935, + "grad_norm": 1.9099595546722412, + "learning_rate": 3.287604821802936e-05, + "loss": 0.1242, + "step": 13070 + }, + { + "epoch": 0.6855345911949685, + "grad_norm": 1.4554835557937622, + "learning_rate": 3.286294549266248e-05, + "loss": 0.1332, + "step": 13080 + }, + { + "epoch": 0.6860587002096437, + "grad_norm": 1.0192896127700806, + "learning_rate": 3.28498427672956e-05, + "loss": 0.1317, + "step": 13090 + }, + { + "epoch": 0.6865828092243187, + "grad_norm": 1.4376505613327026, + "learning_rate": 3.283674004192872e-05, + "loss": 0.0972, + "step": 13100 + }, + { + "epoch": 0.6871069182389937, + "grad_norm": 2.5238263607025146, + "learning_rate": 3.2823637316561843e-05, + "loss": 0.1481, + "step": 13110 + }, + { + "epoch": 0.6876310272536688, + "grad_norm": 1.4387770891189575, + "learning_rate": 3.281053459119497e-05, + "loss": 0.1134, + "step": 13120 + }, + { + "epoch": 0.6881551362683438, + "grad_norm": 1.2158639430999756, + "learning_rate": 3.279743186582809e-05, + "loss": 0.0965, + "step": 13130 + }, + { + "epoch": 0.6886792452830188, + "grad_norm": 1.5913608074188232, + "learning_rate": 3.278432914046122e-05, + "loss": 0.1234, + "step": 13140 + }, + { + "epoch": 0.689203354297694, + "grad_norm": 1.5430805683135986, + "learning_rate": 3.2771226415094344e-05, + "loss": 0.1157, + "step": 13150 + }, + { + "epoch": 0.689727463312369, + "grad_norm": 2.6382107734680176, + "learning_rate": 3.275812368972747e-05, + "loss": 0.1237, + "step": 13160 + }, + { + "epoch": 0.690251572327044, + "grad_norm": 1.8848053216934204, + "learning_rate": 3.274502096436059e-05, + "loss": 0.1117, + "step": 13170 + }, + { + "epoch": 0.6907756813417191, + "grad_norm": 2.1337838172912598, + "learning_rate": 3.2731918238993714e-05, + "loss": 0.1371, + "step": 13180 + }, + { + "epoch": 0.6912997903563941, + "grad_norm": 2.070481538772583, + "learning_rate": 3.271881551362684e-05, + "loss": 0.1269, + "step": 13190 + }, + { + "epoch": 0.6918238993710691, + "grad_norm": 2.1413769721984863, + "learning_rate": 3.270571278825996e-05, + "loss": 0.1789, + "step": 13200 + }, + { + "epoch": 0.6923480083857443, + "grad_norm": 1.6787424087524414, + "learning_rate": 3.2692610062893084e-05, + "loss": 0.1399, + "step": 13210 + }, + { + "epoch": 0.6928721174004193, + "grad_norm": 2.589695453643799, + "learning_rate": 3.267950733752621e-05, + "loss": 0.1223, + "step": 13220 + }, + { + "epoch": 0.6933962264150944, + "grad_norm": 2.0658650398254395, + "learning_rate": 3.266640461215933e-05, + "loss": 0.1583, + "step": 13230 + }, + { + "epoch": 0.6939203354297694, + "grad_norm": 1.8367148637771606, + "learning_rate": 3.2653301886792454e-05, + "loss": 0.124, + "step": 13240 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 1.4144045114517212, + "learning_rate": 3.264019916142558e-05, + "loss": 0.1161, + "step": 13250 + }, + { + "epoch": 0.6949685534591195, + "grad_norm": 1.424119472503662, + "learning_rate": 3.26270964360587e-05, + "loss": 0.1167, + "step": 13260 + }, + { + "epoch": 0.6954926624737946, + "grad_norm": 1.1759669780731201, + "learning_rate": 3.2613993710691824e-05, + "loss": 0.1342, + "step": 13270 + }, + { + "epoch": 0.6960167714884696, + "grad_norm": 2.510288715362549, + "learning_rate": 3.260089098532495e-05, + "loss": 0.1149, + "step": 13280 + }, + { + "epoch": 0.6965408805031447, + "grad_norm": 1.3063350915908813, + "learning_rate": 3.258778825995807e-05, + "loss": 0.1246, + "step": 13290 + }, + { + "epoch": 0.6970649895178197, + "grad_norm": 1.7909855842590332, + "learning_rate": 3.25746855345912e-05, + "loss": 0.1009, + "step": 13300 + }, + { + "epoch": 0.6975890985324947, + "grad_norm": 1.8222286701202393, + "learning_rate": 3.2561582809224325e-05, + "loss": 0.087, + "step": 13310 + }, + { + "epoch": 0.6981132075471698, + "grad_norm": 1.255692720413208, + "learning_rate": 3.254848008385745e-05, + "loss": 0.1172, + "step": 13320 + }, + { + "epoch": 0.6986373165618449, + "grad_norm": 1.0486621856689453, + "learning_rate": 3.2535377358490565e-05, + "loss": 0.1173, + "step": 13330 + }, + { + "epoch": 0.69916142557652, + "grad_norm": 1.5670002698898315, + "learning_rate": 3.252227463312369e-05, + "loss": 0.1241, + "step": 13340 + }, + { + "epoch": 0.699685534591195, + "grad_norm": 1.8074911832809448, + "learning_rate": 3.250917190775681e-05, + "loss": 0.124, + "step": 13350 + }, + { + "epoch": 0.70020964360587, + "grad_norm": 1.7373123168945312, + "learning_rate": 3.2496069182389935e-05, + "loss": 0.1177, + "step": 13360 + }, + { + "epoch": 0.700733752620545, + "grad_norm": 1.53976571559906, + "learning_rate": 3.2482966457023065e-05, + "loss": 0.1335, + "step": 13370 + }, + { + "epoch": 0.7012578616352201, + "grad_norm": 1.9045443534851074, + "learning_rate": 3.246986373165619e-05, + "loss": 0.1128, + "step": 13380 + }, + { + "epoch": 0.7017819706498952, + "grad_norm": 1.210325002670288, + "learning_rate": 3.245676100628931e-05, + "loss": 0.0985, + "step": 13390 + }, + { + "epoch": 0.7023060796645703, + "grad_norm": 1.382736325263977, + "learning_rate": 3.2443658280922435e-05, + "loss": 0.1309, + "step": 13400 + }, + { + "epoch": 0.7028301886792453, + "grad_norm": 1.24323570728302, + "learning_rate": 3.243055555555556e-05, + "loss": 0.1584, + "step": 13410 + }, + { + "epoch": 0.7033542976939203, + "grad_norm": 2.1430201530456543, + "learning_rate": 3.241745283018868e-05, + "loss": 0.1114, + "step": 13420 + }, + { + "epoch": 0.7038784067085954, + "grad_norm": 1.5643491744995117, + "learning_rate": 3.2404350104821805e-05, + "loss": 0.1459, + "step": 13430 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 1.205976963043213, + "learning_rate": 3.239124737945493e-05, + "loss": 0.1064, + "step": 13440 + }, + { + "epoch": 0.7049266247379455, + "grad_norm": 2.065765142440796, + "learning_rate": 3.237814465408805e-05, + "loss": 0.1261, + "step": 13450 + }, + { + "epoch": 0.7054507337526206, + "grad_norm": 2.1515004634857178, + "learning_rate": 3.2365041928721175e-05, + "loss": 0.1169, + "step": 13460 + }, + { + "epoch": 0.7059748427672956, + "grad_norm": 2.8570148944854736, + "learning_rate": 3.23519392033543e-05, + "loss": 0.1242, + "step": 13470 + }, + { + "epoch": 0.7064989517819706, + "grad_norm": 1.6113414764404297, + "learning_rate": 3.233883647798742e-05, + "loss": 0.135, + "step": 13480 + }, + { + "epoch": 0.7070230607966457, + "grad_norm": 1.531532645225525, + "learning_rate": 3.2325733752620546e-05, + "loss": 0.1511, + "step": 13490 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 1.5004215240478516, + "learning_rate": 3.231263102725367e-05, + "loss": 0.1191, + "step": 13500 + }, + { + "epoch": 0.7080712788259959, + "grad_norm": 2.088381767272949, + "learning_rate": 3.229952830188679e-05, + "loss": 0.1127, + "step": 13510 + }, + { + "epoch": 0.7085953878406709, + "grad_norm": 1.1922684907913208, + "learning_rate": 3.2286425576519916e-05, + "loss": 0.123, + "step": 13520 + }, + { + "epoch": 0.7091194968553459, + "grad_norm": 1.5962071418762207, + "learning_rate": 3.2273322851153046e-05, + "loss": 0.1197, + "step": 13530 + }, + { + "epoch": 0.709643605870021, + "grad_norm": 1.5418871641159058, + "learning_rate": 3.226022012578617e-05, + "loss": 0.1137, + "step": 13540 + }, + { + "epoch": 0.710167714884696, + "grad_norm": 1.5913938283920288, + "learning_rate": 3.224711740041929e-05, + "loss": 0.1222, + "step": 13550 + }, + { + "epoch": 0.710691823899371, + "grad_norm": 2.4484283924102783, + "learning_rate": 3.223401467505241e-05, + "loss": 0.1021, + "step": 13560 + }, + { + "epoch": 0.7112159329140462, + "grad_norm": 6.387475490570068, + "learning_rate": 3.222091194968553e-05, + "loss": 0.1205, + "step": 13570 + }, + { + "epoch": 0.7117400419287212, + "grad_norm": 1.4460914134979248, + "learning_rate": 3.2207809224318656e-05, + "loss": 0.1423, + "step": 13580 + }, + { + "epoch": 0.7122641509433962, + "grad_norm": 1.3883821964263916, + "learning_rate": 3.219470649895178e-05, + "loss": 0.1155, + "step": 13590 + }, + { + "epoch": 0.7127882599580713, + "grad_norm": 2.029080390930176, + "learning_rate": 3.218160377358491e-05, + "loss": 0.1194, + "step": 13600 + }, + { + "epoch": 0.7133123689727463, + "grad_norm": 2.9844512939453125, + "learning_rate": 3.216850104821803e-05, + "loss": 0.168, + "step": 13610 + }, + { + "epoch": 0.7138364779874213, + "grad_norm": 11.359793663024902, + "learning_rate": 3.2155398322851156e-05, + "loss": 0.1327, + "step": 13620 + }, + { + "epoch": 0.7143605870020965, + "grad_norm": 1.7264271974563599, + "learning_rate": 3.214229559748428e-05, + "loss": 0.1268, + "step": 13630 + }, + { + "epoch": 0.7148846960167715, + "grad_norm": 2.224581718444824, + "learning_rate": 3.21291928721174e-05, + "loss": 0.1148, + "step": 13640 + }, + { + "epoch": 0.7154088050314465, + "grad_norm": 2.717209577560425, + "learning_rate": 3.2116090146750527e-05, + "loss": 0.1401, + "step": 13650 + }, + { + "epoch": 0.7159329140461216, + "grad_norm": 1.0973457098007202, + "learning_rate": 3.210298742138365e-05, + "loss": 0.1241, + "step": 13660 + }, + { + "epoch": 0.7164570230607966, + "grad_norm": 2.276607036590576, + "learning_rate": 3.208988469601677e-05, + "loss": 0.1436, + "step": 13670 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 1.4400893449783325, + "learning_rate": 3.20767819706499e-05, + "loss": 0.1324, + "step": 13680 + }, + { + "epoch": 0.7175052410901468, + "grad_norm": 1.5834757089614868, + "learning_rate": 3.206367924528302e-05, + "loss": 0.1471, + "step": 13690 + }, + { + "epoch": 0.7180293501048218, + "grad_norm": 1.8143632411956787, + "learning_rate": 3.2050576519916144e-05, + "loss": 0.1093, + "step": 13700 + }, + { + "epoch": 0.7185534591194969, + "grad_norm": 1.4229177236557007, + "learning_rate": 3.203747379454927e-05, + "loss": 0.1209, + "step": 13710 + }, + { + "epoch": 0.7190775681341719, + "grad_norm": 1.990929126739502, + "learning_rate": 3.202437106918239e-05, + "loss": 0.1138, + "step": 13720 + }, + { + "epoch": 0.7196016771488469, + "grad_norm": 3.4440460205078125, + "learning_rate": 3.2011268343815514e-05, + "loss": 0.107, + "step": 13730 + }, + { + "epoch": 0.720125786163522, + "grad_norm": 1.457261085510254, + "learning_rate": 3.199816561844864e-05, + "loss": 0.0956, + "step": 13740 + }, + { + "epoch": 0.7206498951781971, + "grad_norm": 1.8652387857437134, + "learning_rate": 3.198506289308176e-05, + "loss": 0.1358, + "step": 13750 + }, + { + "epoch": 0.7211740041928721, + "grad_norm": 1.6599705219268799, + "learning_rate": 3.197196016771489e-05, + "loss": 0.141, + "step": 13760 + }, + { + "epoch": 0.7216981132075472, + "grad_norm": 1.345238208770752, + "learning_rate": 3.1958857442348014e-05, + "loss": 0.1321, + "step": 13770 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 1.5770273208618164, + "learning_rate": 3.194575471698114e-05, + "loss": 0.0967, + "step": 13780 + }, + { + "epoch": 0.7227463312368972, + "grad_norm": 1.219248652458191, + "learning_rate": 3.193265199161426e-05, + "loss": 0.1167, + "step": 13790 + }, + { + "epoch": 0.7232704402515723, + "grad_norm": 2.964763641357422, + "learning_rate": 3.191954926624738e-05, + "loss": 0.1202, + "step": 13800 + }, + { + "epoch": 0.7237945492662474, + "grad_norm": 1.5197694301605225, + "learning_rate": 3.19064465408805e-05, + "loss": 0.115, + "step": 13810 + }, + { + "epoch": 0.7243186582809225, + "grad_norm": 2.9044370651245117, + "learning_rate": 3.1893343815513624e-05, + "loss": 0.1271, + "step": 13820 + }, + { + "epoch": 0.7248427672955975, + "grad_norm": 1.1253836154937744, + "learning_rate": 3.188024109014675e-05, + "loss": 0.0998, + "step": 13830 + }, + { + "epoch": 0.7253668763102725, + "grad_norm": 3.362396717071533, + "learning_rate": 3.186713836477988e-05, + "loss": 0.2023, + "step": 13840 + }, + { + "epoch": 0.7258909853249476, + "grad_norm": 2.730292558670044, + "learning_rate": 3.1854035639413e-05, + "loss": 0.1528, + "step": 13850 + }, + { + "epoch": 0.7264150943396226, + "grad_norm": 2.7650363445281982, + "learning_rate": 3.1840932914046124e-05, + "loss": 0.1211, + "step": 13860 + }, + { + "epoch": 0.7269392033542977, + "grad_norm": 1.6592358350753784, + "learning_rate": 3.182783018867925e-05, + "loss": 0.1398, + "step": 13870 + }, + { + "epoch": 0.7274633123689728, + "grad_norm": 1.5524930953979492, + "learning_rate": 3.181472746331237e-05, + "loss": 0.1147, + "step": 13880 + }, + { + "epoch": 0.7279874213836478, + "grad_norm": 1.3090453147888184, + "learning_rate": 3.1801624737945495e-05, + "loss": 0.1217, + "step": 13890 + }, + { + "epoch": 0.7285115303983228, + "grad_norm": 1.784716010093689, + "learning_rate": 3.178852201257862e-05, + "loss": 0.1172, + "step": 13900 + }, + { + "epoch": 0.7290356394129979, + "grad_norm": 1.9261764287948608, + "learning_rate": 3.177541928721174e-05, + "loss": 0.1311, + "step": 13910 + }, + { + "epoch": 0.7295597484276729, + "grad_norm": 1.0017802715301514, + "learning_rate": 3.1762316561844865e-05, + "loss": 0.0981, + "step": 13920 + }, + { + "epoch": 0.730083857442348, + "grad_norm": 1.6215063333511353, + "learning_rate": 3.174921383647799e-05, + "loss": 0.1409, + "step": 13930 + }, + { + "epoch": 0.7306079664570231, + "grad_norm": 3.0000991821289062, + "learning_rate": 3.173611111111111e-05, + "loss": 0.1398, + "step": 13940 + }, + { + "epoch": 0.7311320754716981, + "grad_norm": 1.7880263328552246, + "learning_rate": 3.1723008385744235e-05, + "loss": 0.1173, + "step": 13950 + }, + { + "epoch": 0.7316561844863732, + "grad_norm": 1.4684351682662964, + "learning_rate": 3.170990566037736e-05, + "loss": 0.1675, + "step": 13960 + }, + { + "epoch": 0.7321802935010482, + "grad_norm": 1.1320511102676392, + "learning_rate": 3.169680293501048e-05, + "loss": 0.1285, + "step": 13970 + }, + { + "epoch": 0.7327044025157232, + "grad_norm": 1.8147705793380737, + "learning_rate": 3.1683700209643605e-05, + "loss": 0.1178, + "step": 13980 + }, + { + "epoch": 0.7332285115303984, + "grad_norm": 2.41721773147583, + "learning_rate": 3.167059748427673e-05, + "loss": 0.1055, + "step": 13990 + }, + { + "epoch": 0.7337526205450734, + "grad_norm": 1.2500311136245728, + "learning_rate": 3.165749475890986e-05, + "loss": 0.0891, + "step": 14000 + }, + { + "epoch": 0.7337526205450734, + "eval_loss": 0.2849844992160797, + "eval_runtime": 267.9508, + "eval_samples_per_second": 7.43, + "eval_steps_per_second": 1.239, + "step": 14000 + }, + { + "epoch": 0.7342767295597484, + "grad_norm": 1.110494613647461, + "learning_rate": 3.164439203354298e-05, + "loss": 0.109, + "step": 14010 + }, + { + "epoch": 0.7348008385744235, + "grad_norm": 1.408320665359497, + "learning_rate": 3.1631289308176105e-05, + "loss": 0.1335, + "step": 14020 + }, + { + "epoch": 0.7353249475890985, + "grad_norm": 1.6937309503555298, + "learning_rate": 3.161818658280923e-05, + "loss": 0.1337, + "step": 14030 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 1.3139739036560059, + "learning_rate": 3.1605083857442345e-05, + "loss": 0.1279, + "step": 14040 + }, + { + "epoch": 0.7363731656184487, + "grad_norm": 1.5453401803970337, + "learning_rate": 3.159198113207547e-05, + "loss": 0.1439, + "step": 14050 + }, + { + "epoch": 0.7368972746331237, + "grad_norm": 2.1473937034606934, + "learning_rate": 3.157887840670859e-05, + "loss": 0.1426, + "step": 14060 + }, + { + "epoch": 0.7374213836477987, + "grad_norm": 1.6203043460845947, + "learning_rate": 3.156577568134172e-05, + "loss": 0.138, + "step": 14070 + }, + { + "epoch": 0.7379454926624738, + "grad_norm": 1.2558566331863403, + "learning_rate": 3.1552672955974846e-05, + "loss": 0.1303, + "step": 14080 + }, + { + "epoch": 0.7384696016771488, + "grad_norm": 0.9717229604721069, + "learning_rate": 3.153957023060797e-05, + "loss": 0.1371, + "step": 14090 + }, + { + "epoch": 0.7389937106918238, + "grad_norm": 1.5402069091796875, + "learning_rate": 3.152646750524109e-05, + "loss": 0.1568, + "step": 14100 + }, + { + "epoch": 0.739517819706499, + "grad_norm": 1.4260953664779663, + "learning_rate": 3.1513364779874216e-05, + "loss": 0.1152, + "step": 14110 + }, + { + "epoch": 0.740041928721174, + "grad_norm": 0.9325925707817078, + "learning_rate": 3.150026205450734e-05, + "loss": 0.1202, + "step": 14120 + }, + { + "epoch": 0.7405660377358491, + "grad_norm": 1.3378041982650757, + "learning_rate": 3.148715932914046e-05, + "loss": 0.113, + "step": 14130 + }, + { + "epoch": 0.7410901467505241, + "grad_norm": 0.816106915473938, + "learning_rate": 3.1474056603773586e-05, + "loss": 0.1068, + "step": 14140 + }, + { + "epoch": 0.7416142557651991, + "grad_norm": 1.1117491722106934, + "learning_rate": 3.146095387840671e-05, + "loss": 0.1038, + "step": 14150 + }, + { + "epoch": 0.7421383647798742, + "grad_norm": 2.233264446258545, + "learning_rate": 3.144785115303983e-05, + "loss": 0.1321, + "step": 14160 + }, + { + "epoch": 0.7426624737945493, + "grad_norm": 1.4176737070083618, + "learning_rate": 3.1434748427672956e-05, + "loss": 0.1154, + "step": 14170 + }, + { + "epoch": 0.7431865828092243, + "grad_norm": 1.8627464771270752, + "learning_rate": 3.142164570230608e-05, + "loss": 0.1307, + "step": 14180 + }, + { + "epoch": 0.7437106918238994, + "grad_norm": 1.6727312803268433, + "learning_rate": 3.14085429769392e-05, + "loss": 0.1341, + "step": 14190 + }, + { + "epoch": 0.7442348008385744, + "grad_norm": 1.7155283689498901, + "learning_rate": 3.1395440251572326e-05, + "loss": 0.1261, + "step": 14200 + }, + { + "epoch": 0.7447589098532494, + "grad_norm": 1.3220230340957642, + "learning_rate": 3.138233752620545e-05, + "loss": 0.1047, + "step": 14210 + }, + { + "epoch": 0.7452830188679245, + "grad_norm": 1.772420048713684, + "learning_rate": 3.136923480083857e-05, + "loss": 0.1299, + "step": 14220 + }, + { + "epoch": 0.7458071278825996, + "grad_norm": 1.4345825910568237, + "learning_rate": 3.13561320754717e-05, + "loss": 0.1324, + "step": 14230 + }, + { + "epoch": 0.7463312368972747, + "grad_norm": 1.821257472038269, + "learning_rate": 3.134302935010483e-05, + "loss": 0.1312, + "step": 14240 + }, + { + "epoch": 0.7468553459119497, + "grad_norm": 1.6807703971862793, + "learning_rate": 3.132992662473795e-05, + "loss": 0.0969, + "step": 14250 + }, + { + "epoch": 0.7473794549266247, + "grad_norm": 1.5309361219406128, + "learning_rate": 3.1316823899371073e-05, + "loss": 0.1293, + "step": 14260 + }, + { + "epoch": 0.7479035639412998, + "grad_norm": 1.249865174293518, + "learning_rate": 3.13037211740042e-05, + "loss": 0.1112, + "step": 14270 + }, + { + "epoch": 0.7484276729559748, + "grad_norm": 3.1915624141693115, + "learning_rate": 3.1290618448637313e-05, + "loss": 0.1542, + "step": 14280 + }, + { + "epoch": 0.7489517819706499, + "grad_norm": 1.123343825340271, + "learning_rate": 3.127751572327044e-05, + "loss": 0.0977, + "step": 14290 + }, + { + "epoch": 0.749475890985325, + "grad_norm": 1.7678483724594116, + "learning_rate": 3.126441299790357e-05, + "loss": 0.1502, + "step": 14300 + }, + { + "epoch": 0.75, + "grad_norm": 1.4714158773422241, + "learning_rate": 3.125131027253669e-05, + "loss": 0.1462, + "step": 14310 + }, + { + "epoch": 0.750524109014675, + "grad_norm": 1.335870623588562, + "learning_rate": 3.1238207547169814e-05, + "loss": 0.1237, + "step": 14320 + }, + { + "epoch": 0.7510482180293501, + "grad_norm": 1.0049090385437012, + "learning_rate": 3.122510482180294e-05, + "loss": 0.1159, + "step": 14330 + }, + { + "epoch": 0.7515723270440252, + "grad_norm": 3.0360207557678223, + "learning_rate": 3.121200209643606e-05, + "loss": 0.1274, + "step": 14340 + }, + { + "epoch": 0.7520964360587002, + "grad_norm": 1.8040392398834229, + "learning_rate": 3.1198899371069184e-05, + "loss": 0.1272, + "step": 14350 + }, + { + "epoch": 0.7526205450733753, + "grad_norm": 1.7428536415100098, + "learning_rate": 3.118579664570231e-05, + "loss": 0.139, + "step": 14360 + }, + { + "epoch": 0.7531446540880503, + "grad_norm": 2.1202545166015625, + "learning_rate": 3.117269392033543e-05, + "loss": 0.1295, + "step": 14370 + }, + { + "epoch": 0.7536687631027253, + "grad_norm": 1.1606621742248535, + "learning_rate": 3.1159591194968554e-05, + "loss": 0.0937, + "step": 14380 + }, + { + "epoch": 0.7541928721174004, + "grad_norm": 0.8252704739570618, + "learning_rate": 3.1146488469601684e-05, + "loss": 0.1126, + "step": 14390 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.904585063457489, + "learning_rate": 3.11333857442348e-05, + "loss": 0.0978, + "step": 14400 + }, + { + "epoch": 0.7552410901467506, + "grad_norm": 1.936915397644043, + "learning_rate": 3.1120283018867924e-05, + "loss": 0.1144, + "step": 14410 + }, + { + "epoch": 0.7557651991614256, + "grad_norm": 1.9841164350509644, + "learning_rate": 3.110718029350105e-05, + "loss": 0.141, + "step": 14420 + }, + { + "epoch": 0.7562893081761006, + "grad_norm": 1.5874981880187988, + "learning_rate": 3.109407756813417e-05, + "loss": 0.139, + "step": 14430 + }, + { + "epoch": 0.7568134171907757, + "grad_norm": 1.0045322179794312, + "learning_rate": 3.1080974842767294e-05, + "loss": 0.1067, + "step": 14440 + }, + { + "epoch": 0.7573375262054507, + "grad_norm": 1.535852074623108, + "learning_rate": 3.106787211740042e-05, + "loss": 0.1262, + "step": 14450 + }, + { + "epoch": 0.7578616352201258, + "grad_norm": 1.0694433450698853, + "learning_rate": 3.105476939203355e-05, + "loss": 0.1029, + "step": 14460 + }, + { + "epoch": 0.7583857442348009, + "grad_norm": 2.2281646728515625, + "learning_rate": 3.104166666666667e-05, + "loss": 0.1058, + "step": 14470 + }, + { + "epoch": 0.7589098532494759, + "grad_norm": 2.186168909072876, + "learning_rate": 3.1028563941299795e-05, + "loss": 0.1099, + "step": 14480 + }, + { + "epoch": 0.7594339622641509, + "grad_norm": 1.6012275218963623, + "learning_rate": 3.101546121593292e-05, + "loss": 0.1249, + "step": 14490 + }, + { + "epoch": 0.759958071278826, + "grad_norm": 1.6102908849716187, + "learning_rate": 3.100235849056604e-05, + "loss": 0.1223, + "step": 14500 + }, + { + "epoch": 0.760482180293501, + "grad_norm": 2.9501163959503174, + "learning_rate": 3.0989255765199165e-05, + "loss": 0.1348, + "step": 14510 + }, + { + "epoch": 0.7610062893081762, + "grad_norm": 1.4406819343566895, + "learning_rate": 3.097615303983228e-05, + "loss": 0.1277, + "step": 14520 + }, + { + "epoch": 0.7615303983228512, + "grad_norm": 1.3746488094329834, + "learning_rate": 3.096305031446541e-05, + "loss": 0.0998, + "step": 14530 + }, + { + "epoch": 0.7620545073375262, + "grad_norm": 0.9767179489135742, + "learning_rate": 3.0949947589098535e-05, + "loss": 0.1141, + "step": 14540 + }, + { + "epoch": 0.7625786163522013, + "grad_norm": 1.8980849981307983, + "learning_rate": 3.093684486373166e-05, + "loss": 0.1409, + "step": 14550 + }, + { + "epoch": 0.7631027253668763, + "grad_norm": 1.578115463256836, + "learning_rate": 3.092374213836478e-05, + "loss": 0.0995, + "step": 14560 + }, + { + "epoch": 0.7636268343815513, + "grad_norm": 1.580794095993042, + "learning_rate": 3.0910639412997905e-05, + "loss": 0.1562, + "step": 14570 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 1.6007169485092163, + "learning_rate": 3.089753668763103e-05, + "loss": 0.1027, + "step": 14580 + }, + { + "epoch": 0.7646750524109015, + "grad_norm": 1.7334402799606323, + "learning_rate": 3.088443396226415e-05, + "loss": 0.1173, + "step": 14590 + }, + { + "epoch": 0.7651991614255765, + "grad_norm": 1.2498724460601807, + "learning_rate": 3.0871331236897275e-05, + "loss": 0.1007, + "step": 14600 + }, + { + "epoch": 0.7657232704402516, + "grad_norm": 1.0881195068359375, + "learning_rate": 3.08582285115304e-05, + "loss": 0.0993, + "step": 14610 + }, + { + "epoch": 0.7662473794549266, + "grad_norm": 2.333465814590454, + "learning_rate": 3.084512578616353e-05, + "loss": 0.1076, + "step": 14620 + }, + { + "epoch": 0.7667714884696016, + "grad_norm": 1.2131460905075073, + "learning_rate": 3.083202306079665e-05, + "loss": 0.1103, + "step": 14630 + }, + { + "epoch": 0.7672955974842768, + "grad_norm": 2.7174482345581055, + "learning_rate": 3.081892033542977e-05, + "loss": 0.122, + "step": 14640 + }, + { + "epoch": 0.7678197064989518, + "grad_norm": 0.7355225682258606, + "learning_rate": 3.080581761006289e-05, + "loss": 0.1053, + "step": 14650 + }, + { + "epoch": 0.7683438155136268, + "grad_norm": 1.972083568572998, + "learning_rate": 3.0792714884696016e-05, + "loss": 0.1474, + "step": 14660 + }, + { + "epoch": 0.7688679245283019, + "grad_norm": 1.4130945205688477, + "learning_rate": 3.077961215932914e-05, + "loss": 0.115, + "step": 14670 + }, + { + "epoch": 0.7693920335429769, + "grad_norm": 1.699841022491455, + "learning_rate": 3.076650943396226e-05, + "loss": 0.1161, + "step": 14680 + }, + { + "epoch": 0.769916142557652, + "grad_norm": 1.4874564409255981, + "learning_rate": 3.075340670859539e-05, + "loss": 0.1268, + "step": 14690 + }, + { + "epoch": 0.7704402515723271, + "grad_norm": 1.0537846088409424, + "learning_rate": 3.0740303983228516e-05, + "loss": 0.1153, + "step": 14700 + }, + { + "epoch": 0.7709643605870021, + "grad_norm": 1.64293372631073, + "learning_rate": 3.072720125786164e-05, + "loss": 0.1495, + "step": 14710 + }, + { + "epoch": 0.7714884696016772, + "grad_norm": 1.6864267587661743, + "learning_rate": 3.071409853249476e-05, + "loss": 0.1056, + "step": 14720 + }, + { + "epoch": 0.7720125786163522, + "grad_norm": 1.3438204526901245, + "learning_rate": 3.0700995807127886e-05, + "loss": 0.0935, + "step": 14730 + }, + { + "epoch": 0.7725366876310272, + "grad_norm": 1.8872520923614502, + "learning_rate": 3.068789308176101e-05, + "loss": 0.108, + "step": 14740 + }, + { + "epoch": 0.7730607966457023, + "grad_norm": 1.7647202014923096, + "learning_rate": 3.067479035639413e-05, + "loss": 0.1238, + "step": 14750 + }, + { + "epoch": 0.7735849056603774, + "grad_norm": 3.869321346282959, + "learning_rate": 3.066168763102725e-05, + "loss": 0.1109, + "step": 14760 + }, + { + "epoch": 0.7741090146750524, + "grad_norm": 1.3759615421295166, + "learning_rate": 3.064858490566038e-05, + "loss": 0.1415, + "step": 14770 + }, + { + "epoch": 0.7746331236897275, + "grad_norm": 2.167872428894043, + "learning_rate": 3.06354821802935e-05, + "loss": 0.1456, + "step": 14780 + }, + { + "epoch": 0.7751572327044025, + "grad_norm": 1.0983874797821045, + "learning_rate": 3.0622379454926626e-05, + "loss": 0.1033, + "step": 14790 + }, + { + "epoch": 0.7756813417190775, + "grad_norm": 1.2786545753479004, + "learning_rate": 3.060927672955975e-05, + "loss": 0.1094, + "step": 14800 + }, + { + "epoch": 0.7762054507337526, + "grad_norm": 1.4020774364471436, + "learning_rate": 3.059617400419287e-05, + "loss": 0.1365, + "step": 14810 + }, + { + "epoch": 0.7767295597484277, + "grad_norm": 1.3768659830093384, + "learning_rate": 3.0583071278825997e-05, + "loss": 0.1133, + "step": 14820 + }, + { + "epoch": 0.7772536687631028, + "grad_norm": 1.1481125354766846, + "learning_rate": 3.056996855345912e-05, + "loss": 0.1176, + "step": 14830 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.4727671146392822, + "learning_rate": 3.055686582809224e-05, + "loss": 0.1541, + "step": 14840 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 1.5166109800338745, + "learning_rate": 3.0543763102725373e-05, + "loss": 0.11, + "step": 14850 + }, + { + "epoch": 0.7788259958071279, + "grad_norm": 1.597730040550232, + "learning_rate": 3.05306603773585e-05, + "loss": 0.1268, + "step": 14860 + }, + { + "epoch": 0.7793501048218029, + "grad_norm": 1.0425934791564941, + "learning_rate": 3.0517557651991617e-05, + "loss": 0.1093, + "step": 14870 + }, + { + "epoch": 0.779874213836478, + "grad_norm": 1.4101150035858154, + "learning_rate": 3.0504454926624737e-05, + "loss": 0.1286, + "step": 14880 + }, + { + "epoch": 0.7803983228511531, + "grad_norm": 0.8775603175163269, + "learning_rate": 3.049135220125786e-05, + "loss": 0.0974, + "step": 14890 + }, + { + "epoch": 0.7809224318658281, + "grad_norm": 1.4296807050704956, + "learning_rate": 3.0478249475890987e-05, + "loss": 0.0968, + "step": 14900 + }, + { + "epoch": 0.7814465408805031, + "grad_norm": 0.9347816109657288, + "learning_rate": 3.046514675052411e-05, + "loss": 0.1099, + "step": 14910 + }, + { + "epoch": 0.7819706498951782, + "grad_norm": 1.1797919273376465, + "learning_rate": 3.0452044025157234e-05, + "loss": 0.1267, + "step": 14920 + }, + { + "epoch": 0.7824947589098532, + "grad_norm": 1.7169650793075562, + "learning_rate": 3.0438941299790357e-05, + "loss": 0.118, + "step": 14930 + }, + { + "epoch": 0.7830188679245284, + "grad_norm": 1.5158791542053223, + "learning_rate": 3.042583857442348e-05, + "loss": 0.116, + "step": 14940 + }, + { + "epoch": 0.7835429769392034, + "grad_norm": 1.524572491645813, + "learning_rate": 3.0412735849056607e-05, + "loss": 0.1425, + "step": 14950 + }, + { + "epoch": 0.7840670859538784, + "grad_norm": 1.839971899986267, + "learning_rate": 3.039963312368973e-05, + "loss": 0.1026, + "step": 14960 + }, + { + "epoch": 0.7845911949685535, + "grad_norm": 0.864676296710968, + "learning_rate": 3.0386530398322854e-05, + "loss": 0.0963, + "step": 14970 + }, + { + "epoch": 0.7851153039832285, + "grad_norm": 2.310966730117798, + "learning_rate": 3.0373427672955977e-05, + "loss": 0.1307, + "step": 14980 + }, + { + "epoch": 0.7856394129979035, + "grad_norm": 1.5302140712738037, + "learning_rate": 3.0360324947589104e-05, + "loss": 0.1279, + "step": 14990 + }, + { + "epoch": 0.7861635220125787, + "grad_norm": 1.6324013471603394, + "learning_rate": 3.034722222222222e-05, + "loss": 0.1314, + "step": 15000 + }, + { + "epoch": 0.7861635220125787, + "eval_loss": 0.2777167558670044, + "eval_runtime": 268.1288, + "eval_samples_per_second": 7.426, + "eval_steps_per_second": 1.238, + "step": 15000 + }, + { + "epoch": 0.7866876310272537, + "grad_norm": 1.9445385932922363, + "learning_rate": 3.0334119496855344e-05, + "loss": 0.1347, + "step": 15010 + }, + { + "epoch": 0.7872117400419287, + "grad_norm": 1.0624724626541138, + "learning_rate": 3.032101677148847e-05, + "loss": 0.1104, + "step": 15020 + }, + { + "epoch": 0.7877358490566038, + "grad_norm": 1.1631520986557007, + "learning_rate": 3.0307914046121594e-05, + "loss": 0.1151, + "step": 15030 + }, + { + "epoch": 0.7882599580712788, + "grad_norm": 0.9908153414726257, + "learning_rate": 3.0294811320754718e-05, + "loss": 0.1304, + "step": 15040 + }, + { + "epoch": 0.7887840670859538, + "grad_norm": 2.5934977531433105, + "learning_rate": 3.028170859538784e-05, + "loss": 0.1065, + "step": 15050 + }, + { + "epoch": 0.789308176100629, + "grad_norm": 1.1737087965011597, + "learning_rate": 3.0268605870020965e-05, + "loss": 0.112, + "step": 15060 + }, + { + "epoch": 0.789832285115304, + "grad_norm": 1.6975194215774536, + "learning_rate": 3.025550314465409e-05, + "loss": 0.1253, + "step": 15070 + }, + { + "epoch": 0.790356394129979, + "grad_norm": 3.0375587940216064, + "learning_rate": 3.0242400419287215e-05, + "loss": 0.119, + "step": 15080 + }, + { + "epoch": 0.7908805031446541, + "grad_norm": 1.3421461582183838, + "learning_rate": 3.0229297693920338e-05, + "loss": 0.1191, + "step": 15090 + }, + { + "epoch": 0.7914046121593291, + "grad_norm": 3.2138166427612305, + "learning_rate": 3.021619496855346e-05, + "loss": 0.1506, + "step": 15100 + }, + { + "epoch": 0.7919287211740041, + "grad_norm": 1.5473613739013672, + "learning_rate": 3.020309224318658e-05, + "loss": 0.1389, + "step": 15110 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 4.1486592292785645, + "learning_rate": 3.0189989517819705e-05, + "loss": 0.1511, + "step": 15120 + }, + { + "epoch": 0.7929769392033543, + "grad_norm": 1.7993791103363037, + "learning_rate": 3.0176886792452828e-05, + "loss": 0.1299, + "step": 15130 + }, + { + "epoch": 0.7935010482180294, + "grad_norm": 1.2573460340499878, + "learning_rate": 3.0163784067085955e-05, + "loss": 0.1098, + "step": 15140 + }, + { + "epoch": 0.7940251572327044, + "grad_norm": 2.687364339828491, + "learning_rate": 3.015068134171908e-05, + "loss": 0.121, + "step": 15150 + }, + { + "epoch": 0.7945492662473794, + "grad_norm": 1.0059900283813477, + "learning_rate": 3.0137578616352202e-05, + "loss": 0.1137, + "step": 15160 + }, + { + "epoch": 0.7950733752620545, + "grad_norm": 2.1174488067626953, + "learning_rate": 3.0124475890985325e-05, + "loss": 0.0791, + "step": 15170 + }, + { + "epoch": 0.7955974842767296, + "grad_norm": 2.5065529346466064, + "learning_rate": 3.0111373165618452e-05, + "loss": 0.0929, + "step": 15180 + }, + { + "epoch": 0.7961215932914046, + "grad_norm": 1.4558613300323486, + "learning_rate": 3.0098270440251575e-05, + "loss": 0.1228, + "step": 15190 + }, + { + "epoch": 0.7966457023060797, + "grad_norm": 1.318332552909851, + "learning_rate": 3.00851677148847e-05, + "loss": 0.0896, + "step": 15200 + }, + { + "epoch": 0.7971698113207547, + "grad_norm": 1.5867081880569458, + "learning_rate": 3.0072064989517822e-05, + "loss": 0.1312, + "step": 15210 + }, + { + "epoch": 0.7976939203354297, + "grad_norm": 2.399853229522705, + "learning_rate": 3.0058962264150946e-05, + "loss": 0.1074, + "step": 15220 + }, + { + "epoch": 0.7982180293501048, + "grad_norm": 1.729591965675354, + "learning_rate": 3.0045859538784066e-05, + "loss": 0.0967, + "step": 15230 + }, + { + "epoch": 0.7987421383647799, + "grad_norm": 2.1603991985321045, + "learning_rate": 3.003275681341719e-05, + "loss": 0.1126, + "step": 15240 + }, + { + "epoch": 0.799266247379455, + "grad_norm": 1.2216774225234985, + "learning_rate": 3.0019654088050316e-05, + "loss": 0.1125, + "step": 15250 + }, + { + "epoch": 0.79979035639413, + "grad_norm": 4.094639301300049, + "learning_rate": 3.000655136268344e-05, + "loss": 0.1394, + "step": 15260 + }, + { + "epoch": 0.800314465408805, + "grad_norm": 2.448464870452881, + "learning_rate": 2.9993448637316562e-05, + "loss": 0.0978, + "step": 15270 + }, + { + "epoch": 0.80083857442348, + "grad_norm": 1.6635816097259521, + "learning_rate": 2.9980345911949686e-05, + "loss": 0.1443, + "step": 15280 + }, + { + "epoch": 0.8013626834381551, + "grad_norm": 1.4908186197280884, + "learning_rate": 2.996724318658281e-05, + "loss": 0.1262, + "step": 15290 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 1.3856154680252075, + "learning_rate": 2.9954140461215936e-05, + "loss": 0.137, + "step": 15300 + }, + { + "epoch": 0.8024109014675053, + "grad_norm": 2.067502498626709, + "learning_rate": 2.994103773584906e-05, + "loss": 0.1272, + "step": 15310 + }, + { + "epoch": 0.8029350104821803, + "grad_norm": 12.453585624694824, + "learning_rate": 2.9927935010482183e-05, + "loss": 0.0965, + "step": 15320 + }, + { + "epoch": 0.8034591194968553, + "grad_norm": 2.868530511856079, + "learning_rate": 2.9914832285115306e-05, + "loss": 0.0958, + "step": 15330 + }, + { + "epoch": 0.8039832285115304, + "grad_norm": 1.3196369409561157, + "learning_rate": 2.9901729559748433e-05, + "loss": 0.1208, + "step": 15340 + }, + { + "epoch": 0.8045073375262054, + "grad_norm": 0.8990975022315979, + "learning_rate": 2.988862683438155e-05, + "loss": 0.1065, + "step": 15350 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 1.5096759796142578, + "learning_rate": 2.9875524109014673e-05, + "loss": 0.1133, + "step": 15360 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 1.556361198425293, + "learning_rate": 2.98624213836478e-05, + "loss": 0.1362, + "step": 15370 + }, + { + "epoch": 0.8060796645702306, + "grad_norm": 2.255713939666748, + "learning_rate": 2.9849318658280923e-05, + "loss": 0.12, + "step": 15380 + }, + { + "epoch": 0.8066037735849056, + "grad_norm": 1.21890127658844, + "learning_rate": 2.9836215932914046e-05, + "loss": 0.0996, + "step": 15390 + }, + { + "epoch": 0.8071278825995807, + "grad_norm": 1.7154406309127808, + "learning_rate": 2.982311320754717e-05, + "loss": 0.1089, + "step": 15400 + }, + { + "epoch": 0.8076519916142557, + "grad_norm": 20.269657135009766, + "learning_rate": 2.9810010482180297e-05, + "loss": 0.127, + "step": 15410 + }, + { + "epoch": 0.8081761006289309, + "grad_norm": 1.4827377796173096, + "learning_rate": 2.979690775681342e-05, + "loss": 0.0965, + "step": 15420 + }, + { + "epoch": 0.8087002096436059, + "grad_norm": 2.1818010807037354, + "learning_rate": 2.9783805031446543e-05, + "loss": 0.1361, + "step": 15430 + }, + { + "epoch": 0.8092243186582809, + "grad_norm": 3.504404306411743, + "learning_rate": 2.9770702306079667e-05, + "loss": 0.1108, + "step": 15440 + }, + { + "epoch": 0.809748427672956, + "grad_norm": 1.8620023727416992, + "learning_rate": 2.975759958071279e-05, + "loss": 0.1077, + "step": 15450 + }, + { + "epoch": 0.810272536687631, + "grad_norm": 0.9601500034332275, + "learning_rate": 2.9744496855345917e-05, + "loss": 0.1188, + "step": 15460 + }, + { + "epoch": 0.810796645702306, + "grad_norm": 1.2247925996780396, + "learning_rate": 2.9731394129979034e-05, + "loss": 0.0988, + "step": 15470 + }, + { + "epoch": 0.8113207547169812, + "grad_norm": 2.1443095207214355, + "learning_rate": 2.9718291404612157e-05, + "loss": 0.147, + "step": 15480 + }, + { + "epoch": 0.8118448637316562, + "grad_norm": 1.4585750102996826, + "learning_rate": 2.9705188679245284e-05, + "loss": 0.1855, + "step": 15490 + }, + { + "epoch": 0.8123689727463312, + "grad_norm": 1.9782541990280151, + "learning_rate": 2.9692085953878407e-05, + "loss": 0.1283, + "step": 15500 + }, + { + "epoch": 0.8128930817610063, + "grad_norm": 1.5811617374420166, + "learning_rate": 2.967898322851153e-05, + "loss": 0.1051, + "step": 15510 + }, + { + "epoch": 0.8134171907756813, + "grad_norm": 3.186302423477173, + "learning_rate": 2.9665880503144654e-05, + "loss": 0.11, + "step": 15520 + }, + { + "epoch": 0.8139412997903563, + "grad_norm": 1.5484328269958496, + "learning_rate": 2.965277777777778e-05, + "loss": 0.1241, + "step": 15530 + }, + { + "epoch": 0.8144654088050315, + "grad_norm": 0.7247985601425171, + "learning_rate": 2.9639675052410904e-05, + "loss": 0.1159, + "step": 15540 + }, + { + "epoch": 0.8149895178197065, + "grad_norm": 2.027569055557251, + "learning_rate": 2.9626572327044027e-05, + "loss": 0.1339, + "step": 15550 + }, + { + "epoch": 0.8155136268343816, + "grad_norm": 3.1215574741363525, + "learning_rate": 2.961346960167715e-05, + "loss": 0.1233, + "step": 15560 + }, + { + "epoch": 0.8160377358490566, + "grad_norm": 1.5289641618728638, + "learning_rate": 2.9600366876310278e-05, + "loss": 0.1208, + "step": 15570 + }, + { + "epoch": 0.8165618448637316, + "grad_norm": 2.011868953704834, + "learning_rate": 2.95872641509434e-05, + "loss": 0.1041, + "step": 15580 + }, + { + "epoch": 0.8170859538784067, + "grad_norm": 1.1236916780471802, + "learning_rate": 2.9574161425576518e-05, + "loss": 0.1069, + "step": 15590 + }, + { + "epoch": 0.8176100628930818, + "grad_norm": 1.9092012643814087, + "learning_rate": 2.9561058700209644e-05, + "loss": 0.1264, + "step": 15600 + }, + { + "epoch": 0.8181341719077568, + "grad_norm": 1.2404230833053589, + "learning_rate": 2.9547955974842768e-05, + "loss": 0.0975, + "step": 15610 + }, + { + "epoch": 0.8186582809224319, + "grad_norm": 1.748572826385498, + "learning_rate": 2.953485324947589e-05, + "loss": 0.1142, + "step": 15620 + }, + { + "epoch": 0.8191823899371069, + "grad_norm": 1.5801773071289062, + "learning_rate": 2.9521750524109014e-05, + "loss": 0.1174, + "step": 15630 + }, + { + "epoch": 0.8197064989517819, + "grad_norm": 0.7898293137550354, + "learning_rate": 2.9508647798742138e-05, + "loss": 0.1337, + "step": 15640 + }, + { + "epoch": 0.820230607966457, + "grad_norm": 1.4395421743392944, + "learning_rate": 2.9495545073375265e-05, + "loss": 0.1178, + "step": 15650 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 0.7716788649559021, + "learning_rate": 2.9482442348008388e-05, + "loss": 0.0857, + "step": 15660 + }, + { + "epoch": 0.8212788259958071, + "grad_norm": 2.1449007987976074, + "learning_rate": 2.946933962264151e-05, + "loss": 0.1222, + "step": 15670 + }, + { + "epoch": 0.8218029350104822, + "grad_norm": 1.2264463901519775, + "learning_rate": 2.9456236897274635e-05, + "loss": 0.1064, + "step": 15680 + }, + { + "epoch": 0.8223270440251572, + "grad_norm": 1.507369041442871, + "learning_rate": 2.944313417190776e-05, + "loss": 0.1101, + "step": 15690 + }, + { + "epoch": 0.8228511530398323, + "grad_norm": 2.119400978088379, + "learning_rate": 2.9430031446540885e-05, + "loss": 0.1166, + "step": 15700 + }, + { + "epoch": 0.8233752620545073, + "grad_norm": 1.4604309797286987, + "learning_rate": 2.9416928721174e-05, + "loss": 0.1165, + "step": 15710 + }, + { + "epoch": 0.8238993710691824, + "grad_norm": 2.91054105758667, + "learning_rate": 2.940382599580713e-05, + "loss": 0.1082, + "step": 15720 + }, + { + "epoch": 0.8244234800838575, + "grad_norm": 1.6494545936584473, + "learning_rate": 2.9390723270440252e-05, + "loss": 0.1083, + "step": 15730 + }, + { + "epoch": 0.8249475890985325, + "grad_norm": 1.565700650215149, + "learning_rate": 2.9377620545073375e-05, + "loss": 0.1166, + "step": 15740 + }, + { + "epoch": 0.8254716981132075, + "grad_norm": 1.6435893774032593, + "learning_rate": 2.93645178197065e-05, + "loss": 0.1176, + "step": 15750 + }, + { + "epoch": 0.8259958071278826, + "grad_norm": 1.2347391843795776, + "learning_rate": 2.9351415094339625e-05, + "loss": 0.1166, + "step": 15760 + }, + { + "epoch": 0.8265199161425576, + "grad_norm": 1.281410574913025, + "learning_rate": 2.933831236897275e-05, + "loss": 0.1275, + "step": 15770 + }, + { + "epoch": 0.8270440251572327, + "grad_norm": 1.275313377380371, + "learning_rate": 2.9325209643605872e-05, + "loss": 0.1434, + "step": 15780 + }, + { + "epoch": 0.8275681341719078, + "grad_norm": 2.0496749877929688, + "learning_rate": 2.9312106918238995e-05, + "loss": 0.1286, + "step": 15790 + }, + { + "epoch": 0.8280922431865828, + "grad_norm": 0.8086955547332764, + "learning_rate": 2.929900419287212e-05, + "loss": 0.1114, + "step": 15800 + }, + { + "epoch": 0.8286163522012578, + "grad_norm": 2.2521004676818848, + "learning_rate": 2.9285901467505246e-05, + "loss": 0.1293, + "step": 15810 + }, + { + "epoch": 0.8291404612159329, + "grad_norm": 1.0213623046875, + "learning_rate": 2.927279874213837e-05, + "loss": 0.1008, + "step": 15820 + }, + { + "epoch": 0.8296645702306079, + "grad_norm": 10.51981258392334, + "learning_rate": 2.925969601677149e-05, + "loss": 0.1318, + "step": 15830 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 1.4294241666793823, + "learning_rate": 2.9246593291404612e-05, + "loss": 0.1175, + "step": 15840 + }, + { + "epoch": 0.8307127882599581, + "grad_norm": 2.056034564971924, + "learning_rate": 2.9233490566037736e-05, + "loss": 0.1179, + "step": 15850 + }, + { + "epoch": 0.8312368972746331, + "grad_norm": 2.3232133388519287, + "learning_rate": 2.922038784067086e-05, + "loss": 0.1132, + "step": 15860 + }, + { + "epoch": 0.8317610062893082, + "grad_norm": 1.936185598373413, + "learning_rate": 2.9207285115303983e-05, + "loss": 0.1514, + "step": 15870 + }, + { + "epoch": 0.8322851153039832, + "grad_norm": 2.3033699989318848, + "learning_rate": 2.919418238993711e-05, + "loss": 0.1453, + "step": 15880 + }, + { + "epoch": 0.8328092243186582, + "grad_norm": 1.7244473695755005, + "learning_rate": 2.9181079664570233e-05, + "loss": 0.1109, + "step": 15890 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.8862981796264648, + "learning_rate": 2.9167976939203356e-05, + "loss": 0.1223, + "step": 15900 + }, + { + "epoch": 0.8338574423480084, + "grad_norm": 2.6670587062835693, + "learning_rate": 2.915487421383648e-05, + "loss": 0.1297, + "step": 15910 + }, + { + "epoch": 0.8343815513626834, + "grad_norm": 0.8241652250289917, + "learning_rate": 2.9141771488469606e-05, + "loss": 0.1257, + "step": 15920 + }, + { + "epoch": 0.8349056603773585, + "grad_norm": 1.6652482748031616, + "learning_rate": 2.912866876310273e-05, + "loss": 0.1129, + "step": 15930 + }, + { + "epoch": 0.8354297693920335, + "grad_norm": 1.059584379196167, + "learning_rate": 2.9115566037735853e-05, + "loss": 0.111, + "step": 15940 + }, + { + "epoch": 0.8359538784067087, + "grad_norm": 1.6806567907333374, + "learning_rate": 2.9102463312368973e-05, + "loss": 0.1272, + "step": 15950 + }, + { + "epoch": 0.8364779874213837, + "grad_norm": 0.8284962177276611, + "learning_rate": 2.9089360587002096e-05, + "loss": 0.1192, + "step": 15960 + }, + { + "epoch": 0.8370020964360587, + "grad_norm": 1.719659686088562, + "learning_rate": 2.907625786163522e-05, + "loss": 0.186, + "step": 15970 + }, + { + "epoch": 0.8375262054507338, + "grad_norm": 1.34307861328125, + "learning_rate": 2.9063155136268343e-05, + "loss": 0.1316, + "step": 15980 + }, + { + "epoch": 0.8380503144654088, + "grad_norm": 1.9360815286636353, + "learning_rate": 2.905005241090147e-05, + "loss": 0.144, + "step": 15990 + }, + { + "epoch": 0.8385744234800838, + "grad_norm": 2.661608934402466, + "learning_rate": 2.9036949685534593e-05, + "loss": 0.118, + "step": 16000 + }, + { + "epoch": 0.8385744234800838, + "eval_loss": 0.2744086682796478, + "eval_runtime": 267.8021, + "eval_samples_per_second": 7.435, + "eval_steps_per_second": 1.24, + "step": 16000 + }, + { + "epoch": 0.839098532494759, + "grad_norm": 2.3948934078216553, + "learning_rate": 2.9023846960167717e-05, + "loss": 0.0907, + "step": 16010 + }, + { + "epoch": 0.839622641509434, + "grad_norm": 1.7479071617126465, + "learning_rate": 2.901074423480084e-05, + "loss": 0.133, + "step": 16020 + }, + { + "epoch": 0.840146750524109, + "grad_norm": 5.435430526733398, + "learning_rate": 2.8997641509433963e-05, + "loss": 0.1126, + "step": 16030 + }, + { + "epoch": 0.8406708595387841, + "grad_norm": 1.9367077350616455, + "learning_rate": 2.898453878406709e-05, + "loss": 0.1004, + "step": 16040 + }, + { + "epoch": 0.8411949685534591, + "grad_norm": 1.109445571899414, + "learning_rate": 2.8971436058700214e-05, + "loss": 0.1029, + "step": 16050 + }, + { + "epoch": 0.8417190775681341, + "grad_norm": 1.7349110841751099, + "learning_rate": 2.8958333333333337e-05, + "loss": 0.0916, + "step": 16060 + }, + { + "epoch": 0.8422431865828093, + "grad_norm": 2.7323551177978516, + "learning_rate": 2.8945230607966457e-05, + "loss": 0.1462, + "step": 16070 + }, + { + "epoch": 0.8427672955974843, + "grad_norm": 1.405383825302124, + "learning_rate": 2.893212788259958e-05, + "loss": 0.1439, + "step": 16080 + }, + { + "epoch": 0.8432914046121593, + "grad_norm": 1.6317007541656494, + "learning_rate": 2.8919025157232704e-05, + "loss": 0.1094, + "step": 16090 + }, + { + "epoch": 0.8438155136268344, + "grad_norm": 2.196056842803955, + "learning_rate": 2.8905922431865827e-05, + "loss": 0.1118, + "step": 16100 + }, + { + "epoch": 0.8443396226415094, + "grad_norm": 1.5937583446502686, + "learning_rate": 2.8892819706498954e-05, + "loss": 0.1333, + "step": 16110 + }, + { + "epoch": 0.8448637316561844, + "grad_norm": 1.4908620119094849, + "learning_rate": 2.8879716981132077e-05, + "loss": 0.145, + "step": 16120 + }, + { + "epoch": 0.8453878406708596, + "grad_norm": 1.6847023963928223, + "learning_rate": 2.88666142557652e-05, + "loss": 0.0794, + "step": 16130 + }, + { + "epoch": 0.8459119496855346, + "grad_norm": 3.2816317081451416, + "learning_rate": 2.8853511530398324e-05, + "loss": 0.1317, + "step": 16140 + }, + { + "epoch": 0.8464360587002097, + "grad_norm": 1.2879031896591187, + "learning_rate": 2.8840408805031447e-05, + "loss": 0.1105, + "step": 16150 + }, + { + "epoch": 0.8469601677148847, + "grad_norm": 1.481139063835144, + "learning_rate": 2.8827306079664574e-05, + "loss": 0.1356, + "step": 16160 + }, + { + "epoch": 0.8474842767295597, + "grad_norm": 1.0397553443908691, + "learning_rate": 2.8814203354297698e-05, + "loss": 0.1308, + "step": 16170 + }, + { + "epoch": 0.8480083857442348, + "grad_norm": 1.932265281677246, + "learning_rate": 2.880110062893082e-05, + "loss": 0.0992, + "step": 16180 + }, + { + "epoch": 0.8485324947589099, + "grad_norm": 2.0606870651245117, + "learning_rate": 2.878799790356394e-05, + "loss": 0.1139, + "step": 16190 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 1.4234727621078491, + "learning_rate": 2.8774895178197064e-05, + "loss": 0.1026, + "step": 16200 + }, + { + "epoch": 0.84958071278826, + "grad_norm": 1.9039376974105835, + "learning_rate": 2.8761792452830188e-05, + "loss": 0.095, + "step": 16210 + }, + { + "epoch": 0.850104821802935, + "grad_norm": 1.5415253639221191, + "learning_rate": 2.874868972746331e-05, + "loss": 0.0932, + "step": 16220 + }, + { + "epoch": 0.85062893081761, + "grad_norm": 1.4809905290603638, + "learning_rate": 2.8735587002096438e-05, + "loss": 0.0962, + "step": 16230 + }, + { + "epoch": 0.8511530398322851, + "grad_norm": 3.9822001457214355, + "learning_rate": 2.872248427672956e-05, + "loss": 0.1441, + "step": 16240 + }, + { + "epoch": 0.8516771488469602, + "grad_norm": 1.1046079397201538, + "learning_rate": 2.8709381551362685e-05, + "loss": 0.1435, + "step": 16250 + }, + { + "epoch": 0.8522012578616353, + "grad_norm": 1.510729193687439, + "learning_rate": 2.8696278825995808e-05, + "loss": 0.1209, + "step": 16260 + }, + { + "epoch": 0.8527253668763103, + "grad_norm": 1.1853828430175781, + "learning_rate": 2.8683176100628935e-05, + "loss": 0.1369, + "step": 16270 + }, + { + "epoch": 0.8532494758909853, + "grad_norm": 1.8410040140151978, + "learning_rate": 2.8670073375262058e-05, + "loss": 0.108, + "step": 16280 + }, + { + "epoch": 0.8537735849056604, + "grad_norm": 1.6348074674606323, + "learning_rate": 2.865697064989518e-05, + "loss": 0.1342, + "step": 16290 + }, + { + "epoch": 0.8542976939203354, + "grad_norm": 2.014404535293579, + "learning_rate": 2.8643867924528305e-05, + "loss": 0.1126, + "step": 16300 + }, + { + "epoch": 0.8548218029350105, + "grad_norm": 2.791364908218384, + "learning_rate": 2.8630765199161425e-05, + "loss": 0.1488, + "step": 16310 + }, + { + "epoch": 0.8553459119496856, + "grad_norm": 1.9733411073684692, + "learning_rate": 2.861766247379455e-05, + "loss": 0.1033, + "step": 16320 + }, + { + "epoch": 0.8558700209643606, + "grad_norm": 1.3962713479995728, + "learning_rate": 2.8604559748427672e-05, + "loss": 0.1143, + "step": 16330 + }, + { + "epoch": 0.8563941299790356, + "grad_norm": 2.352142572402954, + "learning_rate": 2.85914570230608e-05, + "loss": 0.1312, + "step": 16340 + }, + { + "epoch": 0.8569182389937107, + "grad_norm": 1.4277571439743042, + "learning_rate": 2.8578354297693922e-05, + "loss": 0.1372, + "step": 16350 + }, + { + "epoch": 0.8574423480083857, + "grad_norm": 0.8835510015487671, + "learning_rate": 2.8565251572327045e-05, + "loss": 0.1469, + "step": 16360 + }, + { + "epoch": 0.8579664570230608, + "grad_norm": 1.7008707523345947, + "learning_rate": 2.855214884696017e-05, + "loss": 0.1068, + "step": 16370 + }, + { + "epoch": 0.8584905660377359, + "grad_norm": 1.7003710269927979, + "learning_rate": 2.8539046121593292e-05, + "loss": 0.1413, + "step": 16380 + }, + { + "epoch": 0.8590146750524109, + "grad_norm": 1.588629961013794, + "learning_rate": 2.852594339622642e-05, + "loss": 0.1328, + "step": 16390 + }, + { + "epoch": 0.859538784067086, + "grad_norm": 1.6589187383651733, + "learning_rate": 2.8512840670859542e-05, + "loss": 0.0913, + "step": 16400 + }, + { + "epoch": 0.860062893081761, + "grad_norm": 1.504050374031067, + "learning_rate": 2.8499737945492666e-05, + "loss": 0.1071, + "step": 16410 + }, + { + "epoch": 0.860587002096436, + "grad_norm": 2.1353161334991455, + "learning_rate": 2.848663522012579e-05, + "loss": 0.1046, + "step": 16420 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 1.3967875242233276, + "learning_rate": 2.847353249475891e-05, + "loss": 0.0898, + "step": 16430 + }, + { + "epoch": 0.8616352201257862, + "grad_norm": 0.9945201873779297, + "learning_rate": 2.8460429769392032e-05, + "loss": 0.1462, + "step": 16440 + }, + { + "epoch": 0.8621593291404612, + "grad_norm": 1.9785001277923584, + "learning_rate": 2.8447327044025156e-05, + "loss": 0.1323, + "step": 16450 + }, + { + "epoch": 0.8626834381551363, + "grad_norm": 1.7696077823638916, + "learning_rate": 2.8434224318658283e-05, + "loss": 0.124, + "step": 16460 + }, + { + "epoch": 0.8632075471698113, + "grad_norm": 3.277092218399048, + "learning_rate": 2.8421121593291406e-05, + "loss": 0.1498, + "step": 16470 + }, + { + "epoch": 0.8637316561844863, + "grad_norm": 1.6171987056732178, + "learning_rate": 2.840801886792453e-05, + "loss": 0.1278, + "step": 16480 + }, + { + "epoch": 0.8642557651991615, + "grad_norm": 1.6969029903411865, + "learning_rate": 2.8394916142557653e-05, + "loss": 0.1134, + "step": 16490 + }, + { + "epoch": 0.8647798742138365, + "grad_norm": 1.8987184762954712, + "learning_rate": 2.838181341719078e-05, + "loss": 0.1448, + "step": 16500 + }, + { + "epoch": 0.8653039832285115, + "grad_norm": 1.8838675022125244, + "learning_rate": 2.8368710691823903e-05, + "loss": 0.1366, + "step": 16510 + }, + { + "epoch": 0.8658280922431866, + "grad_norm": 1.2478837966918945, + "learning_rate": 2.8355607966457026e-05, + "loss": 0.1336, + "step": 16520 + }, + { + "epoch": 0.8663522012578616, + "grad_norm": 1.7950494289398193, + "learning_rate": 2.834250524109015e-05, + "loss": 0.1083, + "step": 16530 + }, + { + "epoch": 0.8668763102725366, + "grad_norm": 2.6232709884643555, + "learning_rate": 2.8329402515723273e-05, + "loss": 0.1079, + "step": 16540 + }, + { + "epoch": 0.8674004192872118, + "grad_norm": 1.988641619682312, + "learning_rate": 2.8316299790356393e-05, + "loss": 0.12, + "step": 16550 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 1.4371179342269897, + "learning_rate": 2.8303197064989516e-05, + "loss": 0.1194, + "step": 16560 + }, + { + "epoch": 0.8684486373165619, + "grad_norm": 1.8544507026672363, + "learning_rate": 2.829009433962264e-05, + "loss": 0.1214, + "step": 16570 + }, + { + "epoch": 0.8689727463312369, + "grad_norm": 1.6521706581115723, + "learning_rate": 2.8276991614255767e-05, + "loss": 0.0754, + "step": 16580 + }, + { + "epoch": 0.8694968553459119, + "grad_norm": 2.2902579307556152, + "learning_rate": 2.826388888888889e-05, + "loss": 0.0884, + "step": 16590 + }, + { + "epoch": 0.870020964360587, + "grad_norm": 1.0637155771255493, + "learning_rate": 2.8250786163522013e-05, + "loss": 0.1322, + "step": 16600 + }, + { + "epoch": 0.8705450733752621, + "grad_norm": 3.798917531967163, + "learning_rate": 2.8237683438155137e-05, + "loss": 0.0991, + "step": 16610 + }, + { + "epoch": 0.8710691823899371, + "grad_norm": 57.41527557373047, + "learning_rate": 2.8224580712788264e-05, + "loss": 0.279, + "step": 16620 + }, + { + "epoch": 0.8715932914046122, + "grad_norm": 2.0917751789093018, + "learning_rate": 2.8211477987421387e-05, + "loss": 0.1624, + "step": 16630 + }, + { + "epoch": 0.8721174004192872, + "grad_norm": 1.0532481670379639, + "learning_rate": 2.819837526205451e-05, + "loss": 0.1086, + "step": 16640 + }, + { + "epoch": 0.8726415094339622, + "grad_norm": 1.6892848014831543, + "learning_rate": 2.8185272536687634e-05, + "loss": 0.105, + "step": 16650 + }, + { + "epoch": 0.8731656184486373, + "grad_norm": 1.1963019371032715, + "learning_rate": 2.817216981132076e-05, + "loss": 0.1088, + "step": 16660 + }, + { + "epoch": 0.8736897274633124, + "grad_norm": 3.1233057975769043, + "learning_rate": 2.8159067085953877e-05, + "loss": 0.129, + "step": 16670 + }, + { + "epoch": 0.8742138364779874, + "grad_norm": 2.250922203063965, + "learning_rate": 2.8145964360587e-05, + "loss": 0.0974, + "step": 16680 + }, + { + "epoch": 0.8747379454926625, + "grad_norm": 1.731714129447937, + "learning_rate": 2.8132861635220127e-05, + "loss": 0.1156, + "step": 16690 + }, + { + "epoch": 0.8752620545073375, + "grad_norm": 1.5934561491012573, + "learning_rate": 2.811975890985325e-05, + "loss": 0.1097, + "step": 16700 + }, + { + "epoch": 0.8757861635220126, + "grad_norm": 1.4221527576446533, + "learning_rate": 2.8106656184486374e-05, + "loss": 0.1084, + "step": 16710 + }, + { + "epoch": 0.8763102725366876, + "grad_norm": 2.553152322769165, + "learning_rate": 2.8093553459119497e-05, + "loss": 0.126, + "step": 16720 + }, + { + "epoch": 0.8768343815513627, + "grad_norm": 1.8152376413345337, + "learning_rate": 2.808045073375262e-05, + "loss": 0.1058, + "step": 16730 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 1.7730140686035156, + "learning_rate": 2.8067348008385748e-05, + "loss": 0.1215, + "step": 16740 + }, + { + "epoch": 0.8778825995807128, + "grad_norm": 1.7322098016738892, + "learning_rate": 2.805424528301887e-05, + "loss": 0.113, + "step": 16750 + }, + { + "epoch": 0.8784067085953878, + "grad_norm": 2.0963635444641113, + "learning_rate": 2.8041142557651994e-05, + "loss": 0.1191, + "step": 16760 + }, + { + "epoch": 0.8789308176100629, + "grad_norm": 1.5918827056884766, + "learning_rate": 2.8028039832285118e-05, + "loss": 0.1084, + "step": 16770 + }, + { + "epoch": 0.8794549266247379, + "grad_norm": 1.8807268142700195, + "learning_rate": 2.8014937106918238e-05, + "loss": 0.0942, + "step": 16780 + }, + { + "epoch": 0.879979035639413, + "grad_norm": 1.7885172367095947, + "learning_rate": 2.800183438155136e-05, + "loss": 0.094, + "step": 16790 + }, + { + "epoch": 0.8805031446540881, + "grad_norm": 1.7495297193527222, + "learning_rate": 2.7988731656184484e-05, + "loss": 0.1317, + "step": 16800 + }, + { + "epoch": 0.8810272536687631, + "grad_norm": 1.4859161376953125, + "learning_rate": 2.797562893081761e-05, + "loss": 0.108, + "step": 16810 + }, + { + "epoch": 0.8815513626834381, + "grad_norm": 2.0737903118133545, + "learning_rate": 2.7962526205450735e-05, + "loss": 0.1156, + "step": 16820 + }, + { + "epoch": 0.8820754716981132, + "grad_norm": 2.136467933654785, + "learning_rate": 2.7949423480083858e-05, + "loss": 0.1188, + "step": 16830 + }, + { + "epoch": 0.8825995807127882, + "grad_norm": 2.7223691940307617, + "learning_rate": 2.793632075471698e-05, + "loss": 0.0917, + "step": 16840 + }, + { + "epoch": 0.8831236897274634, + "grad_norm": 1.7403265237808228, + "learning_rate": 2.7923218029350108e-05, + "loss": 0.1266, + "step": 16850 + }, + { + "epoch": 0.8836477987421384, + "grad_norm": 3.2215659618377686, + "learning_rate": 2.791011530398323e-05, + "loss": 0.1222, + "step": 16860 + }, + { + "epoch": 0.8841719077568134, + "grad_norm": 6.004558086395264, + "learning_rate": 2.7897012578616355e-05, + "loss": 0.1328, + "step": 16870 + }, + { + "epoch": 0.8846960167714885, + "grad_norm": 2.02972149848938, + "learning_rate": 2.788390985324948e-05, + "loss": 0.1054, + "step": 16880 + }, + { + "epoch": 0.8852201257861635, + "grad_norm": 1.8536604642868042, + "learning_rate": 2.7870807127882602e-05, + "loss": 0.1491, + "step": 16890 + }, + { + "epoch": 0.8857442348008385, + "grad_norm": 3.0676920413970947, + "learning_rate": 2.7857704402515722e-05, + "loss": 0.1047, + "step": 16900 + }, + { + "epoch": 0.8862683438155137, + "grad_norm": 1.6882869005203247, + "learning_rate": 2.7844601677148845e-05, + "loss": 0.1023, + "step": 16910 + }, + { + "epoch": 0.8867924528301887, + "grad_norm": 1.2589284181594849, + "learning_rate": 2.7831498951781972e-05, + "loss": 0.1274, + "step": 16920 + }, + { + "epoch": 0.8873165618448637, + "grad_norm": 1.274383783340454, + "learning_rate": 2.7818396226415095e-05, + "loss": 0.116, + "step": 16930 + }, + { + "epoch": 0.8878406708595388, + "grad_norm": 1.9524480104446411, + "learning_rate": 2.780529350104822e-05, + "loss": 0.1014, + "step": 16940 + }, + { + "epoch": 0.8883647798742138, + "grad_norm": 1.2366316318511963, + "learning_rate": 2.7792190775681342e-05, + "loss": 0.1031, + "step": 16950 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.1134531497955322, + "learning_rate": 2.7779088050314465e-05, + "loss": 0.1009, + "step": 16960 + }, + { + "epoch": 0.889412997903564, + "grad_norm": 1.4316993951797485, + "learning_rate": 2.7765985324947592e-05, + "loss": 0.1101, + "step": 16970 + }, + { + "epoch": 0.889937106918239, + "grad_norm": 1.7305949926376343, + "learning_rate": 2.7752882599580716e-05, + "loss": 0.1318, + "step": 16980 + }, + { + "epoch": 0.890461215932914, + "grad_norm": 1.0645042657852173, + "learning_rate": 2.773977987421384e-05, + "loss": 0.1161, + "step": 16990 + }, + { + "epoch": 0.8909853249475891, + "grad_norm": 0.9148305058479309, + "learning_rate": 2.7726677148846962e-05, + "loss": 0.1117, + "step": 17000 + }, + { + "epoch": 0.8909853249475891, + "eval_loss": 0.2766527533531189, + "eval_runtime": 267.9233, + "eval_samples_per_second": 7.431, + "eval_steps_per_second": 1.239, + "step": 17000 + }, + { + "epoch": 0.8915094339622641, + "grad_norm": 2.0542149543762207, + "learning_rate": 2.771357442348009e-05, + "loss": 0.1624, + "step": 17010 + }, + { + "epoch": 0.8920335429769392, + "grad_norm": 1.9283877611160278, + "learning_rate": 2.7700471698113206e-05, + "loss": 0.1331, + "step": 17020 + }, + { + "epoch": 0.8925576519916143, + "grad_norm": 1.7226213216781616, + "learning_rate": 2.768736897274633e-05, + "loss": 0.1475, + "step": 17030 + }, + { + "epoch": 0.8930817610062893, + "grad_norm": 2.5038414001464844, + "learning_rate": 2.7674266247379456e-05, + "loss": 0.1141, + "step": 17040 + }, + { + "epoch": 0.8936058700209644, + "grad_norm": 1.534328818321228, + "learning_rate": 2.766116352201258e-05, + "loss": 0.0955, + "step": 17050 + }, + { + "epoch": 0.8941299790356394, + "grad_norm": 1.3931151628494263, + "learning_rate": 2.7648060796645703e-05, + "loss": 0.0959, + "step": 17060 + }, + { + "epoch": 0.8946540880503144, + "grad_norm": 2.5487723350524902, + "learning_rate": 2.7634958071278826e-05, + "loss": 0.1464, + "step": 17070 + }, + { + "epoch": 0.8951781970649895, + "grad_norm": 2.670689105987549, + "learning_rate": 2.762185534591195e-05, + "loss": 0.0956, + "step": 17080 + }, + { + "epoch": 0.8957023060796646, + "grad_norm": 2.335292339324951, + "learning_rate": 2.7608752620545076e-05, + "loss": 0.1242, + "step": 17090 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 2.065664529800415, + "learning_rate": 2.75956498951782e-05, + "loss": 0.1469, + "step": 17100 + }, + { + "epoch": 0.8967505241090147, + "grad_norm": 2.1072165966033936, + "learning_rate": 2.7582547169811323e-05, + "loss": 0.0981, + "step": 17110 + }, + { + "epoch": 0.8972746331236897, + "grad_norm": 2.1046767234802246, + "learning_rate": 2.7569444444444446e-05, + "loss": 0.1114, + "step": 17120 + }, + { + "epoch": 0.8977987421383647, + "grad_norm": 1.2484157085418701, + "learning_rate": 2.7556341719077573e-05, + "loss": 0.1279, + "step": 17130 + }, + { + "epoch": 0.8983228511530398, + "grad_norm": 1.4223566055297852, + "learning_rate": 2.754323899371069e-05, + "loss": 0.1233, + "step": 17140 + }, + { + "epoch": 0.8988469601677149, + "grad_norm": 3.491486072540283, + "learning_rate": 2.7530136268343813e-05, + "loss": 0.1275, + "step": 17150 + }, + { + "epoch": 0.89937106918239, + "grad_norm": 1.536576747894287, + "learning_rate": 2.751703354297694e-05, + "loss": 0.1498, + "step": 17160 + }, + { + "epoch": 0.899895178197065, + "grad_norm": 1.4447746276855469, + "learning_rate": 2.7503930817610063e-05, + "loss": 0.1133, + "step": 17170 + }, + { + "epoch": 0.90041928721174, + "grad_norm": 1.4244399070739746, + "learning_rate": 2.7490828092243187e-05, + "loss": 0.1195, + "step": 17180 + }, + { + "epoch": 0.9009433962264151, + "grad_norm": 4.854112148284912, + "learning_rate": 2.747772536687631e-05, + "loss": 0.1334, + "step": 17190 + }, + { + "epoch": 0.9014675052410901, + "grad_norm": 3.5608410835266113, + "learning_rate": 2.7464622641509437e-05, + "loss": 0.0988, + "step": 17200 + }, + { + "epoch": 0.9019916142557652, + "grad_norm": 1.3616375923156738, + "learning_rate": 2.745151991614256e-05, + "loss": 0.1249, + "step": 17210 + }, + { + "epoch": 0.9025157232704403, + "grad_norm": 1.1115492582321167, + "learning_rate": 2.7438417190775684e-05, + "loss": 0.1261, + "step": 17220 + }, + { + "epoch": 0.9030398322851153, + "grad_norm": 1.767063856124878, + "learning_rate": 2.7425314465408807e-05, + "loss": 0.118, + "step": 17230 + }, + { + "epoch": 0.9035639412997903, + "grad_norm": 4.163326263427734, + "learning_rate": 2.741221174004193e-05, + "loss": 0.1086, + "step": 17240 + }, + { + "epoch": 0.9040880503144654, + "grad_norm": 14.82337474822998, + "learning_rate": 2.7399109014675057e-05, + "loss": 0.1362, + "step": 17250 + }, + { + "epoch": 0.9046121593291404, + "grad_norm": 2.8704612255096436, + "learning_rate": 2.7386006289308174e-05, + "loss": 0.1427, + "step": 17260 + }, + { + "epoch": 0.9051362683438156, + "grad_norm": 3.1805038452148438, + "learning_rate": 2.73729035639413e-05, + "loss": 0.1222, + "step": 17270 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 2.0406835079193115, + "learning_rate": 2.7359800838574424e-05, + "loss": 0.1003, + "step": 17280 + }, + { + "epoch": 0.9061844863731656, + "grad_norm": 2.228046178817749, + "learning_rate": 2.7346698113207547e-05, + "loss": 0.1191, + "step": 17290 + }, + { + "epoch": 0.9067085953878407, + "grad_norm": 1.1091362237930298, + "learning_rate": 2.733359538784067e-05, + "loss": 0.109, + "step": 17300 + }, + { + "epoch": 0.9072327044025157, + "grad_norm": 3.2433698177337646, + "learning_rate": 2.7320492662473794e-05, + "loss": 0.1341, + "step": 17310 + }, + { + "epoch": 0.9077568134171907, + "grad_norm": 1.1009184122085571, + "learning_rate": 2.730738993710692e-05, + "loss": 0.0792, + "step": 17320 + }, + { + "epoch": 0.9082809224318659, + "grad_norm": 1.9047338962554932, + "learning_rate": 2.7294287211740044e-05, + "loss": 0.1178, + "step": 17330 + }, + { + "epoch": 0.9088050314465409, + "grad_norm": 2.6728718280792236, + "learning_rate": 2.7281184486373168e-05, + "loss": 0.141, + "step": 17340 + }, + { + "epoch": 0.9093291404612159, + "grad_norm": 2.257340669631958, + "learning_rate": 2.726808176100629e-05, + "loss": 0.1055, + "step": 17350 + }, + { + "epoch": 0.909853249475891, + "grad_norm": 1.4239635467529297, + "learning_rate": 2.7254979035639418e-05, + "loss": 0.1329, + "step": 17360 + }, + { + "epoch": 0.910377358490566, + "grad_norm": 1.608129620552063, + "learning_rate": 2.724187631027254e-05, + "loss": 0.1188, + "step": 17370 + }, + { + "epoch": 0.910901467505241, + "grad_norm": 1.944496989250183, + "learning_rate": 2.7228773584905658e-05, + "loss": 0.0997, + "step": 17380 + }, + { + "epoch": 0.9114255765199162, + "grad_norm": 2.178675413131714, + "learning_rate": 2.7215670859538785e-05, + "loss": 0.1327, + "step": 17390 + }, + { + "epoch": 0.9119496855345912, + "grad_norm": 1.738000512123108, + "learning_rate": 2.7202568134171908e-05, + "loss": 0.1098, + "step": 17400 + }, + { + "epoch": 0.9124737945492662, + "grad_norm": 1.5480185747146606, + "learning_rate": 2.718946540880503e-05, + "loss": 0.1324, + "step": 17410 + }, + { + "epoch": 0.9129979035639413, + "grad_norm": 1.7627813816070557, + "learning_rate": 2.7176362683438155e-05, + "loss": 0.1138, + "step": 17420 + }, + { + "epoch": 0.9135220125786163, + "grad_norm": 1.8327022790908813, + "learning_rate": 2.716325995807128e-05, + "loss": 0.1371, + "step": 17430 + }, + { + "epoch": 0.9140461215932913, + "grad_norm": 1.486836552619934, + "learning_rate": 2.7150157232704405e-05, + "loss": 0.1221, + "step": 17440 + }, + { + "epoch": 0.9145702306079665, + "grad_norm": 1.3775781393051147, + "learning_rate": 2.7137054507337528e-05, + "loss": 0.1166, + "step": 17450 + }, + { + "epoch": 0.9150943396226415, + "grad_norm": 1.868113398551941, + "learning_rate": 2.712395178197065e-05, + "loss": 0.1362, + "step": 17460 + }, + { + "epoch": 0.9156184486373166, + "grad_norm": 0.9045465588569641, + "learning_rate": 2.7110849056603775e-05, + "loss": 0.0987, + "step": 17470 + }, + { + "epoch": 0.9161425576519916, + "grad_norm": 1.4192860126495361, + "learning_rate": 2.7097746331236902e-05, + "loss": 0.1006, + "step": 17480 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 1.904269814491272, + "learning_rate": 2.7084643605870025e-05, + "loss": 0.097, + "step": 17490 + }, + { + "epoch": 0.9171907756813418, + "grad_norm": 1.5991660356521606, + "learning_rate": 2.7071540880503142e-05, + "loss": 0.0964, + "step": 17500 + }, + { + "epoch": 0.9177148846960168, + "grad_norm": 1.5102187395095825, + "learning_rate": 2.705843815513627e-05, + "loss": 0.122, + "step": 17510 + }, + { + "epoch": 0.9182389937106918, + "grad_norm": 1.7714667320251465, + "learning_rate": 2.7045335429769392e-05, + "loss": 0.101, + "step": 17520 + }, + { + "epoch": 0.9187631027253669, + "grad_norm": 1.6487922668457031, + "learning_rate": 2.7032232704402515e-05, + "loss": 0.1212, + "step": 17530 + }, + { + "epoch": 0.9192872117400419, + "grad_norm": 1.3789714574813843, + "learning_rate": 2.701912997903564e-05, + "loss": 0.1159, + "step": 17540 + }, + { + "epoch": 0.9198113207547169, + "grad_norm": 8.510631561279297, + "learning_rate": 2.7006027253668765e-05, + "loss": 0.1262, + "step": 17550 + }, + { + "epoch": 0.9203354297693921, + "grad_norm": 2.0327212810516357, + "learning_rate": 2.699292452830189e-05, + "loss": 0.1335, + "step": 17560 + }, + { + "epoch": 0.9208595387840671, + "grad_norm": 1.652156114578247, + "learning_rate": 2.6979821802935012e-05, + "loss": 0.1267, + "step": 17570 + }, + { + "epoch": 0.9213836477987422, + "grad_norm": 1.6591423749923706, + "learning_rate": 2.6966719077568136e-05, + "loss": 0.1372, + "step": 17580 + }, + { + "epoch": 0.9219077568134172, + "grad_norm": 1.7691068649291992, + "learning_rate": 2.6953616352201262e-05, + "loss": 0.1142, + "step": 17590 + }, + { + "epoch": 0.9224318658280922, + "grad_norm": 1.4638423919677734, + "learning_rate": 2.6940513626834386e-05, + "loss": 0.1348, + "step": 17600 + }, + { + "epoch": 0.9229559748427673, + "grad_norm": 1.6467829942703247, + "learning_rate": 2.692741090146751e-05, + "loss": 0.1239, + "step": 17610 + }, + { + "epoch": 0.9234800838574424, + "grad_norm": 1.6024755239486694, + "learning_rate": 2.691430817610063e-05, + "loss": 0.1158, + "step": 17620 + }, + { + "epoch": 0.9240041928721174, + "grad_norm": 1.2841496467590332, + "learning_rate": 2.6901205450733753e-05, + "loss": 0.1175, + "step": 17630 + }, + { + "epoch": 0.9245283018867925, + "grad_norm": 2.259477376937866, + "learning_rate": 2.6888102725366876e-05, + "loss": 0.1128, + "step": 17640 + }, + { + "epoch": 0.9250524109014675, + "grad_norm": 1.0841439962387085, + "learning_rate": 2.6875e-05, + "loss": 0.1331, + "step": 17650 + }, + { + "epoch": 0.9255765199161425, + "grad_norm": 1.8893989324569702, + "learning_rate": 2.6861897274633123e-05, + "loss": 0.1246, + "step": 17660 + }, + { + "epoch": 0.9261006289308176, + "grad_norm": 1.761985421180725, + "learning_rate": 2.684879454926625e-05, + "loss": 0.1142, + "step": 17670 + }, + { + "epoch": 0.9266247379454927, + "grad_norm": 5.233839511871338, + "learning_rate": 2.6835691823899373e-05, + "loss": 0.1055, + "step": 17680 + }, + { + "epoch": 0.9271488469601677, + "grad_norm": 2.6796579360961914, + "learning_rate": 2.6822589098532496e-05, + "loss": 0.1113, + "step": 17690 + }, + { + "epoch": 0.9276729559748428, + "grad_norm": 3.1834607124328613, + "learning_rate": 2.680948637316562e-05, + "loss": 0.1402, + "step": 17700 + }, + { + "epoch": 0.9281970649895178, + "grad_norm": 3.362410068511963, + "learning_rate": 2.6796383647798746e-05, + "loss": 0.127, + "step": 17710 + }, + { + "epoch": 0.9287211740041929, + "grad_norm": 1.9332648515701294, + "learning_rate": 2.678328092243187e-05, + "loss": 0.0974, + "step": 17720 + }, + { + "epoch": 0.9292452830188679, + "grad_norm": 1.9798800945281982, + "learning_rate": 2.6770178197064993e-05, + "loss": 0.1189, + "step": 17730 + }, + { + "epoch": 0.929769392033543, + "grad_norm": 1.7517541646957397, + "learning_rate": 2.6757075471698113e-05, + "loss": 0.1375, + "step": 17740 + }, + { + "epoch": 0.9302935010482181, + "grad_norm": 1.3065685033798218, + "learning_rate": 2.6743972746331237e-05, + "loss": 0.1313, + "step": 17750 + }, + { + "epoch": 0.9308176100628931, + "grad_norm": 3.3601229190826416, + "learning_rate": 2.673087002096436e-05, + "loss": 0.1445, + "step": 17760 + }, + { + "epoch": 0.9313417190775681, + "grad_norm": 1.3460884094238281, + "learning_rate": 2.6717767295597483e-05, + "loss": 0.106, + "step": 17770 + }, + { + "epoch": 0.9318658280922432, + "grad_norm": 1.8291536569595337, + "learning_rate": 2.670466457023061e-05, + "loss": 0.1067, + "step": 17780 + }, + { + "epoch": 0.9323899371069182, + "grad_norm": 1.4517912864685059, + "learning_rate": 2.6691561844863734e-05, + "loss": 0.1144, + "step": 17790 + }, + { + "epoch": 0.9329140461215933, + "grad_norm": 1.0053421258926392, + "learning_rate": 2.6678459119496857e-05, + "loss": 0.1039, + "step": 17800 + }, + { + "epoch": 0.9334381551362684, + "grad_norm": 4.46645975112915, + "learning_rate": 2.666535639412998e-05, + "loss": 0.0918, + "step": 17810 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 2.079599142074585, + "learning_rate": 2.6652253668763104e-05, + "loss": 0.1066, + "step": 17820 + }, + { + "epoch": 0.9344863731656184, + "grad_norm": 1.3694299459457397, + "learning_rate": 2.663915094339623e-05, + "loss": 0.1326, + "step": 17830 + }, + { + "epoch": 0.9350104821802935, + "grad_norm": 2.696722984313965, + "learning_rate": 2.6626048218029354e-05, + "loss": 0.1249, + "step": 17840 + }, + { + "epoch": 0.9355345911949685, + "grad_norm": 3.4780609607696533, + "learning_rate": 2.6612945492662477e-05, + "loss": 0.1351, + "step": 17850 + }, + { + "epoch": 0.9360587002096437, + "grad_norm": 1.8707832098007202, + "learning_rate": 2.6599842767295597e-05, + "loss": 0.0928, + "step": 17860 + }, + { + "epoch": 0.9365828092243187, + "grad_norm": 1.8133983612060547, + "learning_rate": 2.658674004192872e-05, + "loss": 0.1317, + "step": 17870 + }, + { + "epoch": 0.9371069182389937, + "grad_norm": 3.4762990474700928, + "learning_rate": 2.6573637316561844e-05, + "loss": 0.0866, + "step": 17880 + }, + { + "epoch": 0.9376310272536688, + "grad_norm": 4.619908809661865, + "learning_rate": 2.6560534591194967e-05, + "loss": 0.1165, + "step": 17890 + }, + { + "epoch": 0.9381551362683438, + "grad_norm": 1.6376959085464478, + "learning_rate": 2.6547431865828094e-05, + "loss": 0.1235, + "step": 17900 + }, + { + "epoch": 0.9386792452830188, + "grad_norm": 1.5230728387832642, + "learning_rate": 2.6534329140461218e-05, + "loss": 0.1079, + "step": 17910 + }, + { + "epoch": 0.939203354297694, + "grad_norm": 2.2401304244995117, + "learning_rate": 2.652122641509434e-05, + "loss": 0.0961, + "step": 17920 + }, + { + "epoch": 0.939727463312369, + "grad_norm": 1.3648204803466797, + "learning_rate": 2.6508123689727464e-05, + "loss": 0.105, + "step": 17930 + }, + { + "epoch": 0.940251572327044, + "grad_norm": 2.788958787918091, + "learning_rate": 2.649502096436059e-05, + "loss": 0.1171, + "step": 17940 + }, + { + "epoch": 0.9407756813417191, + "grad_norm": 2.2564330101013184, + "learning_rate": 2.6481918238993714e-05, + "loss": 0.1015, + "step": 17950 + }, + { + "epoch": 0.9412997903563941, + "grad_norm": 2.0158989429473877, + "learning_rate": 2.6468815513626838e-05, + "loss": 0.0915, + "step": 17960 + }, + { + "epoch": 0.9418238993710691, + "grad_norm": 2.8656909465789795, + "learning_rate": 2.645571278825996e-05, + "loss": 0.0965, + "step": 17970 + }, + { + "epoch": 0.9423480083857443, + "grad_norm": 2.881559371948242, + "learning_rate": 2.644261006289308e-05, + "loss": 0.1194, + "step": 17980 + }, + { + "epoch": 0.9428721174004193, + "grad_norm": 1.544648289680481, + "learning_rate": 2.6429507337526205e-05, + "loss": 0.1258, + "step": 17990 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 1.1653016805648804, + "learning_rate": 2.6416404612159328e-05, + "loss": 0.1141, + "step": 18000 + }, + { + "epoch": 0.9433962264150944, + "eval_loss": 0.273366242647171, + "eval_runtime": 268.2749, + "eval_samples_per_second": 7.421, + "eval_steps_per_second": 1.238, + "step": 18000 + }, + { + "epoch": 0.9439203354297694, + "grad_norm": 2.2898740768432617, + "learning_rate": 2.6403301886792455e-05, + "loss": 0.1076, + "step": 18010 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 2.067246437072754, + "learning_rate": 2.6390199161425578e-05, + "loss": 0.1208, + "step": 18020 + }, + { + "epoch": 0.9449685534591195, + "grad_norm": 2.2256276607513428, + "learning_rate": 2.63770964360587e-05, + "loss": 0.135, + "step": 18030 + }, + { + "epoch": 0.9454926624737946, + "grad_norm": 1.5676586627960205, + "learning_rate": 2.6363993710691825e-05, + "loss": 0.1077, + "step": 18040 + }, + { + "epoch": 0.9460167714884696, + "grad_norm": 1.3694120645523071, + "learning_rate": 2.635089098532495e-05, + "loss": 0.0976, + "step": 18050 + }, + { + "epoch": 0.9465408805031447, + "grad_norm": 1.5198019742965698, + "learning_rate": 2.6337788259958075e-05, + "loss": 0.1529, + "step": 18060 + }, + { + "epoch": 0.9470649895178197, + "grad_norm": 0.8670737147331238, + "learning_rate": 2.63246855345912e-05, + "loss": 0.1139, + "step": 18070 + }, + { + "epoch": 0.9475890985324947, + "grad_norm": 1.631770133972168, + "learning_rate": 2.6311582809224322e-05, + "loss": 0.0935, + "step": 18080 + }, + { + "epoch": 0.9481132075471698, + "grad_norm": 2.3997673988342285, + "learning_rate": 2.6298480083857445e-05, + "loss": 0.1175, + "step": 18090 + }, + { + "epoch": 0.9486373165618449, + "grad_norm": 2.676593065261841, + "learning_rate": 2.6285377358490565e-05, + "loss": 0.1232, + "step": 18100 + }, + { + "epoch": 0.94916142557652, + "grad_norm": 1.3629957437515259, + "learning_rate": 2.627227463312369e-05, + "loss": 0.1248, + "step": 18110 + }, + { + "epoch": 0.949685534591195, + "grad_norm": 1.683488130569458, + "learning_rate": 2.6259171907756812e-05, + "loss": 0.1111, + "step": 18120 + }, + { + "epoch": 0.95020964360587, + "grad_norm": 1.4218518733978271, + "learning_rate": 2.624606918238994e-05, + "loss": 0.0971, + "step": 18130 + }, + { + "epoch": 0.950733752620545, + "grad_norm": 1.9305870532989502, + "learning_rate": 2.6232966457023062e-05, + "loss": 0.1405, + "step": 18140 + }, + { + "epoch": 0.9512578616352201, + "grad_norm": 1.6784343719482422, + "learning_rate": 2.6219863731656186e-05, + "loss": 0.1172, + "step": 18150 + }, + { + "epoch": 0.9517819706498952, + "grad_norm": 2.8489601612091064, + "learning_rate": 2.620676100628931e-05, + "loss": 0.1533, + "step": 18160 + }, + { + "epoch": 0.9523060796645703, + "grad_norm": 1.829404354095459, + "learning_rate": 2.6193658280922432e-05, + "loss": 0.1015, + "step": 18170 + }, + { + "epoch": 0.9528301886792453, + "grad_norm": 2.171243667602539, + "learning_rate": 2.618055555555556e-05, + "loss": 0.1176, + "step": 18180 + }, + { + "epoch": 0.9533542976939203, + "grad_norm": 1.988754153251648, + "learning_rate": 2.6167452830188682e-05, + "loss": 0.1135, + "step": 18190 + }, + { + "epoch": 0.9538784067085954, + "grad_norm": 1.1377837657928467, + "learning_rate": 2.6154350104821806e-05, + "loss": 0.1367, + "step": 18200 + }, + { + "epoch": 0.9544025157232704, + "grad_norm": 1.9834413528442383, + "learning_rate": 2.614124737945493e-05, + "loss": 0.1373, + "step": 18210 + }, + { + "epoch": 0.9549266247379455, + "grad_norm": 1.2570122480392456, + "learning_rate": 2.612814465408805e-05, + "loss": 0.1191, + "step": 18220 + }, + { + "epoch": 0.9554507337526206, + "grad_norm": 1.0435508489608765, + "learning_rate": 2.6115041928721173e-05, + "loss": 0.116, + "step": 18230 + }, + { + "epoch": 0.9559748427672956, + "grad_norm": 2.252382516860962, + "learning_rate": 2.6101939203354296e-05, + "loss": 0.1292, + "step": 18240 + }, + { + "epoch": 0.9564989517819706, + "grad_norm": 2.4033634662628174, + "learning_rate": 2.6088836477987423e-05, + "loss": 0.1072, + "step": 18250 + }, + { + "epoch": 0.9570230607966457, + "grad_norm": 0.9558582305908203, + "learning_rate": 2.6075733752620546e-05, + "loss": 0.1053, + "step": 18260 + }, + { + "epoch": 0.9575471698113207, + "grad_norm": 2.276141405105591, + "learning_rate": 2.606263102725367e-05, + "loss": 0.1108, + "step": 18270 + }, + { + "epoch": 0.9580712788259959, + "grad_norm": 2.3497204780578613, + "learning_rate": 2.6049528301886793e-05, + "loss": 0.1623, + "step": 18280 + }, + { + "epoch": 0.9585953878406709, + "grad_norm": 1.9832830429077148, + "learning_rate": 2.603642557651992e-05, + "loss": 0.1167, + "step": 18290 + }, + { + "epoch": 0.9591194968553459, + "grad_norm": 1.21957528591156, + "learning_rate": 2.6023322851153043e-05, + "loss": 0.1158, + "step": 18300 + }, + { + "epoch": 0.959643605870021, + "grad_norm": 2.1913418769836426, + "learning_rate": 2.6010220125786166e-05, + "loss": 0.1035, + "step": 18310 + }, + { + "epoch": 0.960167714884696, + "grad_norm": 1.9338150024414062, + "learning_rate": 2.599711740041929e-05, + "loss": 0.1179, + "step": 18320 + }, + { + "epoch": 0.960691823899371, + "grad_norm": 1.4277621507644653, + "learning_rate": 2.598401467505241e-05, + "loss": 0.1137, + "step": 18330 + }, + { + "epoch": 0.9612159329140462, + "grad_norm": 2.319413185119629, + "learning_rate": 2.5970911949685533e-05, + "loss": 0.1376, + "step": 18340 + }, + { + "epoch": 0.9617400419287212, + "grad_norm": 2.9515397548675537, + "learning_rate": 2.5957809224318657e-05, + "loss": 0.112, + "step": 18350 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 1.2524621486663818, + "learning_rate": 2.5944706498951783e-05, + "loss": 0.1294, + "step": 18360 + }, + { + "epoch": 0.9627882599580713, + "grad_norm": 1.4624210596084595, + "learning_rate": 2.5931603773584907e-05, + "loss": 0.0921, + "step": 18370 + }, + { + "epoch": 0.9633123689727463, + "grad_norm": 2.1402573585510254, + "learning_rate": 2.591850104821803e-05, + "loss": 0.099, + "step": 18380 + }, + { + "epoch": 0.9638364779874213, + "grad_norm": 1.677443265914917, + "learning_rate": 2.5905398322851154e-05, + "loss": 0.1414, + "step": 18390 + }, + { + "epoch": 0.9643605870020965, + "grad_norm": 2.3793933391571045, + "learning_rate": 2.5892295597484277e-05, + "loss": 0.0931, + "step": 18400 + }, + { + "epoch": 0.9648846960167715, + "grad_norm": 1.8992564678192139, + "learning_rate": 2.5879192872117404e-05, + "loss": 0.1172, + "step": 18410 + }, + { + "epoch": 0.9654088050314465, + "grad_norm": 1.8893632888793945, + "learning_rate": 2.5866090146750527e-05, + "loss": 0.1063, + "step": 18420 + }, + { + "epoch": 0.9659329140461216, + "grad_norm": 3.0469095706939697, + "learning_rate": 2.585298742138365e-05, + "loss": 0.0926, + "step": 18430 + }, + { + "epoch": 0.9664570230607966, + "grad_norm": 0.6239261627197266, + "learning_rate": 2.5839884696016774e-05, + "loss": 0.12, + "step": 18440 + }, + { + "epoch": 0.9669811320754716, + "grad_norm": 1.5373907089233398, + "learning_rate": 2.5826781970649894e-05, + "loss": 0.131, + "step": 18450 + }, + { + "epoch": 0.9675052410901468, + "grad_norm": 1.107908844947815, + "learning_rate": 2.5813679245283017e-05, + "loss": 0.1001, + "step": 18460 + }, + { + "epoch": 0.9680293501048218, + "grad_norm": 0.9736654758453369, + "learning_rate": 2.580057651991614e-05, + "loss": 0.0895, + "step": 18470 + }, + { + "epoch": 0.9685534591194969, + "grad_norm": 2.5150935649871826, + "learning_rate": 2.5787473794549267e-05, + "loss": 0.1471, + "step": 18480 + }, + { + "epoch": 0.9690775681341719, + "grad_norm": 2.4384822845458984, + "learning_rate": 2.577437106918239e-05, + "loss": 0.1279, + "step": 18490 + }, + { + "epoch": 0.9696016771488469, + "grad_norm": 2.5240912437438965, + "learning_rate": 2.5761268343815514e-05, + "loss": 0.1529, + "step": 18500 + }, + { + "epoch": 0.970125786163522, + "grad_norm": 1.687264084815979, + "learning_rate": 2.5748165618448638e-05, + "loss": 0.1279, + "step": 18510 + }, + { + "epoch": 0.9706498951781971, + "grad_norm": 2.4330201148986816, + "learning_rate": 2.5735062893081764e-05, + "loss": 0.1216, + "step": 18520 + }, + { + "epoch": 0.9711740041928721, + "grad_norm": 1.3596785068511963, + "learning_rate": 2.5721960167714888e-05, + "loss": 0.1052, + "step": 18530 + }, + { + "epoch": 0.9716981132075472, + "grad_norm": 3.4725117683410645, + "learning_rate": 2.570885744234801e-05, + "loss": 0.1065, + "step": 18540 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.9534229636192322, + "learning_rate": 2.5695754716981135e-05, + "loss": 0.089, + "step": 18550 + }, + { + "epoch": 0.9727463312368972, + "grad_norm": 1.0076327323913574, + "learning_rate": 2.5682651991614258e-05, + "loss": 0.0991, + "step": 18560 + }, + { + "epoch": 0.9732704402515723, + "grad_norm": 2.910527229309082, + "learning_rate": 2.5669549266247378e-05, + "loss": 0.1067, + "step": 18570 + }, + { + "epoch": 0.9737945492662474, + "grad_norm": 1.0768955945968628, + "learning_rate": 2.56564465408805e-05, + "loss": 0.1105, + "step": 18580 + }, + { + "epoch": 0.9743186582809225, + "grad_norm": 2.1621615886688232, + "learning_rate": 2.5643343815513625e-05, + "loss": 0.1246, + "step": 18590 + }, + { + "epoch": 0.9748427672955975, + "grad_norm": 2.638805627822876, + "learning_rate": 2.563024109014675e-05, + "loss": 0.1277, + "step": 18600 + }, + { + "epoch": 0.9753668763102725, + "grad_norm": 1.5452163219451904, + "learning_rate": 2.5617138364779875e-05, + "loss": 0.0956, + "step": 18610 + }, + { + "epoch": 0.9758909853249476, + "grad_norm": 2.066392183303833, + "learning_rate": 2.5604035639412998e-05, + "loss": 0.0932, + "step": 18620 + }, + { + "epoch": 0.9764150943396226, + "grad_norm": 2.3010566234588623, + "learning_rate": 2.559093291404612e-05, + "loss": 0.1238, + "step": 18630 + }, + { + "epoch": 0.9769392033542977, + "grad_norm": 3.2208967208862305, + "learning_rate": 2.557783018867925e-05, + "loss": 0.1268, + "step": 18640 + }, + { + "epoch": 0.9774633123689728, + "grad_norm": 6.718996047973633, + "learning_rate": 2.5564727463312372e-05, + "loss": 0.1182, + "step": 18650 + }, + { + "epoch": 0.9779874213836478, + "grad_norm": 2.7678651809692383, + "learning_rate": 2.5551624737945495e-05, + "loss": 0.1413, + "step": 18660 + }, + { + "epoch": 0.9785115303983228, + "grad_norm": 1.6215708255767822, + "learning_rate": 2.553852201257862e-05, + "loss": 0.0962, + "step": 18670 + }, + { + "epoch": 0.9790356394129979, + "grad_norm": 1.7245888710021973, + "learning_rate": 2.5525419287211745e-05, + "loss": 0.1187, + "step": 18680 + }, + { + "epoch": 0.9795597484276729, + "grad_norm": 1.980368733406067, + "learning_rate": 2.5512316561844862e-05, + "loss": 0.0917, + "step": 18690 + }, + { + "epoch": 0.980083857442348, + "grad_norm": 1.9294238090515137, + "learning_rate": 2.5499213836477985e-05, + "loss": 0.1395, + "step": 18700 + }, + { + "epoch": 0.9806079664570231, + "grad_norm": 1.6551685333251953, + "learning_rate": 2.5486111111111112e-05, + "loss": 0.1034, + "step": 18710 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 1.7868574857711792, + "learning_rate": 2.5473008385744235e-05, + "loss": 0.0993, + "step": 18720 + }, + { + "epoch": 0.9816561844863732, + "grad_norm": 2.7772812843322754, + "learning_rate": 2.545990566037736e-05, + "loss": 0.1022, + "step": 18730 + }, + { + "epoch": 0.9821802935010482, + "grad_norm": 3.1440446376800537, + "learning_rate": 2.5446802935010482e-05, + "loss": 0.1154, + "step": 18740 + }, + { + "epoch": 0.9827044025157232, + "grad_norm": 1.7626208066940308, + "learning_rate": 2.5433700209643606e-05, + "loss": 0.1015, + "step": 18750 + }, + { + "epoch": 0.9832285115303984, + "grad_norm": 1.3496103286743164, + "learning_rate": 2.5420597484276732e-05, + "loss": 0.1373, + "step": 18760 + }, + { + "epoch": 0.9837526205450734, + "grad_norm": 2.246685028076172, + "learning_rate": 2.5407494758909856e-05, + "loss": 0.1137, + "step": 18770 + }, + { + "epoch": 0.9842767295597484, + "grad_norm": 1.2018938064575195, + "learning_rate": 2.539439203354298e-05, + "loss": 0.1291, + "step": 18780 + }, + { + "epoch": 0.9848008385744235, + "grad_norm": 1.1192766427993774, + "learning_rate": 2.5381289308176103e-05, + "loss": 0.1051, + "step": 18790 + }, + { + "epoch": 0.9853249475890985, + "grad_norm": 1.578484296798706, + "learning_rate": 2.536818658280923e-05, + "loss": 0.1074, + "step": 18800 + }, + { + "epoch": 0.9858490566037735, + "grad_norm": 1.7460689544677734, + "learning_rate": 2.5355083857442346e-05, + "loss": 0.1506, + "step": 18810 + }, + { + "epoch": 0.9863731656184487, + "grad_norm": 2.330599784851074, + "learning_rate": 2.534198113207547e-05, + "loss": 0.0993, + "step": 18820 + }, + { + "epoch": 0.9868972746331237, + "grad_norm": 1.5016928911209106, + "learning_rate": 2.5328878406708596e-05, + "loss": 0.114, + "step": 18830 + }, + { + "epoch": 0.9874213836477987, + "grad_norm": 2.1551918983459473, + "learning_rate": 2.531577568134172e-05, + "loss": 0.0984, + "step": 18840 + }, + { + "epoch": 0.9879454926624738, + "grad_norm": 1.67074716091156, + "learning_rate": 2.5302672955974843e-05, + "loss": 0.1162, + "step": 18850 + }, + { + "epoch": 0.9884696016771488, + "grad_norm": 1.218502163887024, + "learning_rate": 2.5289570230607966e-05, + "loss": 0.1058, + "step": 18860 + }, + { + "epoch": 0.9889937106918238, + "grad_norm": 2.4353883266448975, + "learning_rate": 2.5276467505241093e-05, + "loss": 0.1061, + "step": 18870 + }, + { + "epoch": 0.989517819706499, + "grad_norm": 1.4544498920440674, + "learning_rate": 2.5263364779874216e-05, + "loss": 0.1033, + "step": 18880 + }, + { + "epoch": 0.990041928721174, + "grad_norm": 1.7678455114364624, + "learning_rate": 2.525026205450734e-05, + "loss": 0.1332, + "step": 18890 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 1.480412244796753, + "learning_rate": 2.5237159329140463e-05, + "loss": 0.1034, + "step": 18900 + }, + { + "epoch": 0.9910901467505241, + "grad_norm": 1.789129376411438, + "learning_rate": 2.5224056603773587e-05, + "loss": 0.115, + "step": 18910 + }, + { + "epoch": 0.9916142557651991, + "grad_norm": 1.5032306909561157, + "learning_rate": 2.5210953878406713e-05, + "loss": 0.1095, + "step": 18920 + }, + { + "epoch": 0.9921383647798742, + "grad_norm": 1.3824424743652344, + "learning_rate": 2.519785115303983e-05, + "loss": 0.1232, + "step": 18930 + }, + { + "epoch": 0.9926624737945493, + "grad_norm": 1.3315826654434204, + "learning_rate": 2.5184748427672957e-05, + "loss": 0.0899, + "step": 18940 + }, + { + "epoch": 0.9931865828092243, + "grad_norm": 4.2224297523498535, + "learning_rate": 2.517164570230608e-05, + "loss": 0.1064, + "step": 18950 + }, + { + "epoch": 0.9937106918238994, + "grad_norm": 1.537833571434021, + "learning_rate": 2.5158542976939203e-05, + "loss": 0.092, + "step": 18960 + }, + { + "epoch": 0.9942348008385744, + "grad_norm": 1.8726469278335571, + "learning_rate": 2.5145440251572327e-05, + "loss": 0.1214, + "step": 18970 + }, + { + "epoch": 0.9947589098532494, + "grad_norm": 2.2182183265686035, + "learning_rate": 2.513233752620545e-05, + "loss": 0.1118, + "step": 18980 + }, + { + "epoch": 0.9952830188679245, + "grad_norm": 2.326605796813965, + "learning_rate": 2.5119234800838577e-05, + "loss": 0.1268, + "step": 18990 + }, + { + "epoch": 0.9958071278825996, + "grad_norm": 1.908422589302063, + "learning_rate": 2.51061320754717e-05, + "loss": 0.119, + "step": 19000 + }, + { + "epoch": 0.9958071278825996, + "eval_loss": 0.2677764594554901, + "eval_runtime": 268.2087, + "eval_samples_per_second": 7.423, + "eval_steps_per_second": 1.238, + "step": 19000 + }, + { + "epoch": 0.9963312368972747, + "grad_norm": 2.0410823822021484, + "learning_rate": 2.5093029350104824e-05, + "loss": 0.1056, + "step": 19010 + }, + { + "epoch": 0.9968553459119497, + "grad_norm": 2.298825740814209, + "learning_rate": 2.5079926624737947e-05, + "loss": 0.1308, + "step": 19020 + }, + { + "epoch": 0.9973794549266247, + "grad_norm": 2.703629493713379, + "learning_rate": 2.5066823899371074e-05, + "loss": 0.1018, + "step": 19030 + }, + { + "epoch": 0.9979035639412998, + "grad_norm": 0.8350358009338379, + "learning_rate": 2.5053721174004197e-05, + "loss": 0.0902, + "step": 19040 + }, + { + "epoch": 0.9984276729559748, + "grad_norm": 1.1665393114089966, + "learning_rate": 2.5040618448637314e-05, + "loss": 0.1118, + "step": 19050 + }, + { + "epoch": 0.9989517819706499, + "grad_norm": 1.3827977180480957, + "learning_rate": 2.502751572327044e-05, + "loss": 0.1178, + "step": 19060 + }, + { + "epoch": 0.999475890985325, + "grad_norm": 3.110260248184204, + "learning_rate": 2.5014412997903564e-05, + "loss": 0.1189, + "step": 19070 + }, + { + "epoch": 1.0, + "grad_norm": 1.3243497610092163, + "learning_rate": 2.5001310272536688e-05, + "loss": 0.096, + "step": 19080 + }, + { + "epoch": 1.000524109014675, + "grad_norm": 1.87785804271698, + "learning_rate": 2.498820754716981e-05, + "loss": 0.0894, + "step": 19090 + }, + { + "epoch": 1.00104821802935, + "grad_norm": 2.367110252380371, + "learning_rate": 2.4975104821802938e-05, + "loss": 0.079, + "step": 19100 + }, + { + "epoch": 1.001572327044025, + "grad_norm": 0.9960259199142456, + "learning_rate": 2.496200209643606e-05, + "loss": 0.0941, + "step": 19110 + }, + { + "epoch": 1.0020964360587001, + "grad_norm": 1.7719085216522217, + "learning_rate": 2.4948899371069184e-05, + "loss": 0.0812, + "step": 19120 + }, + { + "epoch": 1.0026205450733752, + "grad_norm": 1.4971020221710205, + "learning_rate": 2.4935796645702308e-05, + "loss": 0.0978, + "step": 19130 + }, + { + "epoch": 1.0031446540880504, + "grad_norm": 1.7672542333602905, + "learning_rate": 2.492269392033543e-05, + "loss": 0.0786, + "step": 19140 + }, + { + "epoch": 1.0036687631027255, + "grad_norm": 2.371225118637085, + "learning_rate": 2.4909591194968555e-05, + "loss": 0.0678, + "step": 19150 + }, + { + "epoch": 1.0041928721174005, + "grad_norm": 1.103453278541565, + "learning_rate": 2.4896488469601678e-05, + "loss": 0.0737, + "step": 19160 + }, + { + "epoch": 1.0047169811320755, + "grad_norm": 2.200951337814331, + "learning_rate": 2.48833857442348e-05, + "loss": 0.0708, + "step": 19170 + }, + { + "epoch": 1.0052410901467506, + "grad_norm": 1.470166563987732, + "learning_rate": 2.4870283018867928e-05, + "loss": 0.094, + "step": 19180 + }, + { + "epoch": 1.0057651991614256, + "grad_norm": 1.8624029159545898, + "learning_rate": 2.485718029350105e-05, + "loss": 0.0968, + "step": 19190 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 2.2916412353515625, + "learning_rate": 2.484407756813417e-05, + "loss": 0.1011, + "step": 19200 + }, + { + "epoch": 1.0068134171907757, + "grad_norm": 0.8032910823822021, + "learning_rate": 2.4830974842767295e-05, + "loss": 0.0769, + "step": 19210 + }, + { + "epoch": 1.0073375262054507, + "grad_norm": 1.478871464729309, + "learning_rate": 2.481787211740042e-05, + "loss": 0.0966, + "step": 19220 + }, + { + "epoch": 1.0078616352201257, + "grad_norm": 0.7236099243164062, + "learning_rate": 2.4804769392033545e-05, + "loss": 0.0725, + "step": 19230 + }, + { + "epoch": 1.0083857442348008, + "grad_norm": 1.7173033952713013, + "learning_rate": 2.479166666666667e-05, + "loss": 0.1062, + "step": 19240 + }, + { + "epoch": 1.0089098532494758, + "grad_norm": 0.6187518239021301, + "learning_rate": 2.4778563941299792e-05, + "loss": 0.0739, + "step": 19250 + }, + { + "epoch": 1.009433962264151, + "grad_norm": 1.4748426675796509, + "learning_rate": 2.4765461215932915e-05, + "loss": 0.0902, + "step": 19260 + }, + { + "epoch": 1.009958071278826, + "grad_norm": 1.5630605220794678, + "learning_rate": 2.475235849056604e-05, + "loss": 0.0821, + "step": 19270 + }, + { + "epoch": 1.0104821802935011, + "grad_norm": 1.2467037439346313, + "learning_rate": 2.4739255765199162e-05, + "loss": 0.0869, + "step": 19280 + }, + { + "epoch": 1.0110062893081762, + "grad_norm": 1.1149821281433105, + "learning_rate": 2.4726153039832285e-05, + "loss": 0.0956, + "step": 19290 + }, + { + "epoch": 1.0115303983228512, + "grad_norm": 1.2000000476837158, + "learning_rate": 2.4713050314465412e-05, + "loss": 0.0988, + "step": 19300 + }, + { + "epoch": 1.0120545073375262, + "grad_norm": 1.31313157081604, + "learning_rate": 2.4699947589098536e-05, + "loss": 0.0645, + "step": 19310 + }, + { + "epoch": 1.0125786163522013, + "grad_norm": 2.025057554244995, + "learning_rate": 2.4686844863731656e-05, + "loss": 0.096, + "step": 19320 + }, + { + "epoch": 1.0131027253668763, + "grad_norm": 0.7827894687652588, + "learning_rate": 2.467374213836478e-05, + "loss": 0.1035, + "step": 19330 + }, + { + "epoch": 1.0136268343815513, + "grad_norm": 0.8256064057350159, + "learning_rate": 2.4660639412997906e-05, + "loss": 0.0851, + "step": 19340 + }, + { + "epoch": 1.0141509433962264, + "grad_norm": 6.706319808959961, + "learning_rate": 2.464753668763103e-05, + "loss": 0.069, + "step": 19350 + }, + { + "epoch": 1.0146750524109014, + "grad_norm": 0.8227464556694031, + "learning_rate": 2.4634433962264152e-05, + "loss": 0.0946, + "step": 19360 + }, + { + "epoch": 1.0151991614255764, + "grad_norm": 1.1776174306869507, + "learning_rate": 2.4621331236897276e-05, + "loss": 0.063, + "step": 19370 + }, + { + "epoch": 1.0157232704402517, + "grad_norm": 1.0992966890335083, + "learning_rate": 2.46082285115304e-05, + "loss": 0.071, + "step": 19380 + }, + { + "epoch": 1.0162473794549267, + "grad_norm": 1.3341543674468994, + "learning_rate": 2.4595125786163523e-05, + "loss": 0.061, + "step": 19390 + }, + { + "epoch": 1.0167714884696017, + "grad_norm": 3.2466483116149902, + "learning_rate": 2.4582023060796646e-05, + "loss": 0.0904, + "step": 19400 + }, + { + "epoch": 1.0172955974842768, + "grad_norm": 3.0862956047058105, + "learning_rate": 2.456892033542977e-05, + "loss": 0.1189, + "step": 19410 + }, + { + "epoch": 1.0178197064989518, + "grad_norm": 1.4831918478012085, + "learning_rate": 2.4555817610062896e-05, + "loss": 0.0899, + "step": 19420 + }, + { + "epoch": 1.0183438155136268, + "grad_norm": 1.3484694957733154, + "learning_rate": 2.454271488469602e-05, + "loss": 0.0898, + "step": 19430 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 3.4956963062286377, + "learning_rate": 2.452961215932914e-05, + "loss": 0.1003, + "step": 19440 + }, + { + "epoch": 1.019392033542977, + "grad_norm": 2.407393217086792, + "learning_rate": 2.4516509433962266e-05, + "loss": 0.0911, + "step": 19450 + }, + { + "epoch": 1.019916142557652, + "grad_norm": 1.5907275676727295, + "learning_rate": 2.450340670859539e-05, + "loss": 0.0957, + "step": 19460 + }, + { + "epoch": 1.020440251572327, + "grad_norm": 1.243753433227539, + "learning_rate": 2.4490303983228513e-05, + "loss": 0.0647, + "step": 19470 + }, + { + "epoch": 1.020964360587002, + "grad_norm": 3.1798362731933594, + "learning_rate": 2.4477201257861636e-05, + "loss": 0.0972, + "step": 19480 + }, + { + "epoch": 1.021488469601677, + "grad_norm": 1.9811335802078247, + "learning_rate": 2.446409853249476e-05, + "loss": 0.0893, + "step": 19490 + }, + { + "epoch": 1.0220125786163523, + "grad_norm": 1.0129342079162598, + "learning_rate": 2.4450995807127883e-05, + "loss": 0.1057, + "step": 19500 + }, + { + "epoch": 1.0225366876310273, + "grad_norm": 1.6326560974121094, + "learning_rate": 2.4437893081761007e-05, + "loss": 0.1077, + "step": 19510 + }, + { + "epoch": 1.0230607966457024, + "grad_norm": 1.4604363441467285, + "learning_rate": 2.442479035639413e-05, + "loss": 0.0887, + "step": 19520 + }, + { + "epoch": 1.0235849056603774, + "grad_norm": 2.1132311820983887, + "learning_rate": 2.4411687631027257e-05, + "loss": 0.0953, + "step": 19530 + }, + { + "epoch": 1.0241090146750524, + "grad_norm": 1.9144420623779297, + "learning_rate": 2.439858490566038e-05, + "loss": 0.0488, + "step": 19540 + }, + { + "epoch": 1.0246331236897275, + "grad_norm": 2.6526668071746826, + "learning_rate": 2.4385482180293504e-05, + "loss": 0.0924, + "step": 19550 + }, + { + "epoch": 1.0251572327044025, + "grad_norm": 1.5566918849945068, + "learning_rate": 2.4372379454926624e-05, + "loss": 0.1057, + "step": 19560 + }, + { + "epoch": 1.0256813417190775, + "grad_norm": 3.5535433292388916, + "learning_rate": 2.435927672955975e-05, + "loss": 0.0897, + "step": 19570 + }, + { + "epoch": 1.0262054507337526, + "grad_norm": 0.8449379205703735, + "learning_rate": 2.4346174004192874e-05, + "loss": 0.0778, + "step": 19580 + }, + { + "epoch": 1.0267295597484276, + "grad_norm": 2.4703757762908936, + "learning_rate": 2.4333071278825997e-05, + "loss": 0.0718, + "step": 19590 + }, + { + "epoch": 1.0272536687631026, + "grad_norm": 1.603649616241455, + "learning_rate": 2.431996855345912e-05, + "loss": 0.0697, + "step": 19600 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 1.5522819757461548, + "learning_rate": 2.4306865828092247e-05, + "loss": 0.0935, + "step": 19610 + }, + { + "epoch": 1.028301886792453, + "grad_norm": 1.6726840734481812, + "learning_rate": 2.4293763102725367e-05, + "loss": 0.0782, + "step": 19620 + }, + { + "epoch": 1.028825995807128, + "grad_norm": 0.7730293869972229, + "learning_rate": 2.428066037735849e-05, + "loss": 0.0712, + "step": 19630 + }, + { + "epoch": 1.029350104821803, + "grad_norm": 1.2800456285476685, + "learning_rate": 2.4267557651991614e-05, + "loss": 0.0679, + "step": 19640 + }, + { + "epoch": 1.029874213836478, + "grad_norm": 3.403644561767578, + "learning_rate": 2.425445492662474e-05, + "loss": 0.0874, + "step": 19650 + }, + { + "epoch": 1.030398322851153, + "grad_norm": 0.9466264843940735, + "learning_rate": 2.4241352201257864e-05, + "loss": 0.0763, + "step": 19660 + }, + { + "epoch": 1.030922431865828, + "grad_norm": 1.6146653890609741, + "learning_rate": 2.4228249475890988e-05, + "loss": 0.0892, + "step": 19670 + }, + { + "epoch": 1.0314465408805031, + "grad_norm": 0.8680115938186646, + "learning_rate": 2.4215146750524108e-05, + "loss": 0.0987, + "step": 19680 + }, + { + "epoch": 1.0319706498951782, + "grad_norm": 0.9922559857368469, + "learning_rate": 2.4202044025157234e-05, + "loss": 0.0901, + "step": 19690 + }, + { + "epoch": 1.0324947589098532, + "grad_norm": 0.8490608930587769, + "learning_rate": 2.4188941299790358e-05, + "loss": 0.0958, + "step": 19700 + }, + { + "epoch": 1.0330188679245282, + "grad_norm": 1.9773913621902466, + "learning_rate": 2.417583857442348e-05, + "loss": 0.0909, + "step": 19710 + }, + { + "epoch": 1.0335429769392033, + "grad_norm": 1.3205777406692505, + "learning_rate": 2.4162735849056605e-05, + "loss": 0.0607, + "step": 19720 + }, + { + "epoch": 1.0340670859538783, + "grad_norm": 22.19045066833496, + "learning_rate": 2.414963312368973e-05, + "loss": 0.0704, + "step": 19730 + }, + { + "epoch": 1.0345911949685536, + "grad_norm": 0.9581673741340637, + "learning_rate": 2.413653039832285e-05, + "loss": 0.1178, + "step": 19740 + }, + { + "epoch": 1.0351153039832286, + "grad_norm": 1.1050844192504883, + "learning_rate": 2.4123427672955975e-05, + "loss": 0.0761, + "step": 19750 + }, + { + "epoch": 1.0356394129979036, + "grad_norm": 0.8335663676261902, + "learning_rate": 2.4110324947589098e-05, + "loss": 0.1002, + "step": 19760 + }, + { + "epoch": 1.0361635220125787, + "grad_norm": 1.0287737846374512, + "learning_rate": 2.4097222222222225e-05, + "loss": 0.1094, + "step": 19770 + }, + { + "epoch": 1.0366876310272537, + "grad_norm": 2.1343629360198975, + "learning_rate": 2.4084119496855348e-05, + "loss": 0.0879, + "step": 19780 + }, + { + "epoch": 1.0372117400419287, + "grad_norm": 0.8107201457023621, + "learning_rate": 2.407101677148847e-05, + "loss": 0.083, + "step": 19790 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 1.4638867378234863, + "learning_rate": 2.4057914046121595e-05, + "loss": 0.0891, + "step": 19800 + }, + { + "epoch": 1.0382599580712788, + "grad_norm": 1.2282848358154297, + "learning_rate": 2.404481132075472e-05, + "loss": 0.0603, + "step": 19810 + }, + { + "epoch": 1.0387840670859538, + "grad_norm": 0.7536243796348572, + "learning_rate": 2.4031708595387842e-05, + "loss": 0.0688, + "step": 19820 + }, + { + "epoch": 1.0393081761006289, + "grad_norm": 2.29469895362854, + "learning_rate": 2.4018605870020965e-05, + "loss": 0.0838, + "step": 19830 + }, + { + "epoch": 1.039832285115304, + "grad_norm": 1.947985291481018, + "learning_rate": 2.400550314465409e-05, + "loss": 0.0972, + "step": 19840 + }, + { + "epoch": 1.040356394129979, + "grad_norm": 1.8855829238891602, + "learning_rate": 2.3992400419287215e-05, + "loss": 0.0648, + "step": 19850 + }, + { + "epoch": 1.0408805031446542, + "grad_norm": 2.420112133026123, + "learning_rate": 2.3979297693920335e-05, + "loss": 0.0595, + "step": 19860 + }, + { + "epoch": 1.0414046121593292, + "grad_norm": 1.4276695251464844, + "learning_rate": 2.396619496855346e-05, + "loss": 0.1112, + "step": 19870 + }, + { + "epoch": 1.0419287211740043, + "grad_norm": 1.6597774028778076, + "learning_rate": 2.3953092243186585e-05, + "loss": 0.0573, + "step": 19880 + }, + { + "epoch": 1.0424528301886793, + "grad_norm": 2.0908398628234863, + "learning_rate": 2.393998951781971e-05, + "loss": 0.1037, + "step": 19890 + }, + { + "epoch": 1.0429769392033543, + "grad_norm": 1.0306899547576904, + "learning_rate": 2.3926886792452832e-05, + "loss": 0.0821, + "step": 19900 + }, + { + "epoch": 1.0435010482180294, + "grad_norm": 1.2301905155181885, + "learning_rate": 2.3913784067085952e-05, + "loss": 0.0606, + "step": 19910 + }, + { + "epoch": 1.0440251572327044, + "grad_norm": 0.8270147442817688, + "learning_rate": 2.390068134171908e-05, + "loss": 0.0677, + "step": 19920 + }, + { + "epoch": 1.0445492662473794, + "grad_norm": 1.662843942642212, + "learning_rate": 2.3887578616352202e-05, + "loss": 0.0812, + "step": 19930 + }, + { + "epoch": 1.0450733752620545, + "grad_norm": 2.4776611328125, + "learning_rate": 2.3874475890985326e-05, + "loss": 0.082, + "step": 19940 + }, + { + "epoch": 1.0455974842767295, + "grad_norm": 0.6950798034667969, + "learning_rate": 2.386137316561845e-05, + "loss": 0.0864, + "step": 19950 + }, + { + "epoch": 1.0461215932914045, + "grad_norm": 1.1080272197723389, + "learning_rate": 2.3848270440251576e-05, + "loss": 0.0813, + "step": 19960 + }, + { + "epoch": 1.0466457023060796, + "grad_norm": 2.492471218109131, + "learning_rate": 2.3835167714884696e-05, + "loss": 0.0527, + "step": 19970 + }, + { + "epoch": 1.0471698113207548, + "grad_norm": 1.7215197086334229, + "learning_rate": 2.382206498951782e-05, + "loss": 0.0744, + "step": 19980 + }, + { + "epoch": 1.0476939203354299, + "grad_norm": 1.311968445777893, + "learning_rate": 2.3808962264150943e-05, + "loss": 0.081, + "step": 19990 + }, + { + "epoch": 1.0482180293501049, + "grad_norm": 3.416196584701538, + "learning_rate": 2.379585953878407e-05, + "loss": 0.0817, + "step": 20000 + }, + { + "epoch": 1.0482180293501049, + "eval_loss": 0.2798672318458557, + "eval_runtime": 268.0531, + "eval_samples_per_second": 7.428, + "eval_steps_per_second": 1.239, + "step": 20000 + }, + { + "epoch": 1.04874213836478, + "grad_norm": 1.2927495241165161, + "learning_rate": 2.3782756813417193e-05, + "loss": 0.079, + "step": 20010 + }, + { + "epoch": 1.049266247379455, + "grad_norm": 0.9687632918357849, + "learning_rate": 2.3769654088050316e-05, + "loss": 0.1144, + "step": 20020 + }, + { + "epoch": 1.04979035639413, + "grad_norm": 1.7993711233139038, + "learning_rate": 2.375655136268344e-05, + "loss": 0.0949, + "step": 20030 + }, + { + "epoch": 1.050314465408805, + "grad_norm": 2.492044687271118, + "learning_rate": 2.3743448637316563e-05, + "loss": 0.081, + "step": 20040 + }, + { + "epoch": 1.05083857442348, + "grad_norm": 1.8582724332809448, + "learning_rate": 2.3730345911949686e-05, + "loss": 0.0852, + "step": 20050 + }, + { + "epoch": 1.051362683438155, + "grad_norm": 1.9378348588943481, + "learning_rate": 2.371724318658281e-05, + "loss": 0.0818, + "step": 20060 + }, + { + "epoch": 1.0518867924528301, + "grad_norm": 1.2538186311721802, + "learning_rate": 2.3704140461215933e-05, + "loss": 0.0735, + "step": 20070 + }, + { + "epoch": 1.0524109014675052, + "grad_norm": 1.1138423681259155, + "learning_rate": 2.369103773584906e-05, + "loss": 0.0776, + "step": 20080 + }, + { + "epoch": 1.0529350104821802, + "grad_norm": 1.8726630210876465, + "learning_rate": 2.367793501048218e-05, + "loss": 0.0932, + "step": 20090 + }, + { + "epoch": 1.0534591194968554, + "grad_norm": 0.9272988438606262, + "learning_rate": 2.3664832285115303e-05, + "loss": 0.0687, + "step": 20100 + }, + { + "epoch": 1.0539832285115305, + "grad_norm": 1.6455448865890503, + "learning_rate": 2.365172955974843e-05, + "loss": 0.0869, + "step": 20110 + }, + { + "epoch": 1.0545073375262055, + "grad_norm": 1.239667534828186, + "learning_rate": 2.3638626834381553e-05, + "loss": 0.0855, + "step": 20120 + }, + { + "epoch": 1.0550314465408805, + "grad_norm": 1.8769423961639404, + "learning_rate": 2.3625524109014677e-05, + "loss": 0.0896, + "step": 20130 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 1.3619462251663208, + "learning_rate": 2.36124213836478e-05, + "loss": 0.0884, + "step": 20140 + }, + { + "epoch": 1.0560796645702306, + "grad_norm": 2.014624834060669, + "learning_rate": 2.3599318658280924e-05, + "loss": 0.0861, + "step": 20150 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 1.067657709121704, + "learning_rate": 2.3586215932914047e-05, + "loss": 0.1005, + "step": 20160 + }, + { + "epoch": 1.0571278825995807, + "grad_norm": 1.8703769445419312, + "learning_rate": 2.357311320754717e-05, + "loss": 0.1162, + "step": 20170 + }, + { + "epoch": 1.0576519916142557, + "grad_norm": 0.8814947009086609, + "learning_rate": 2.3560010482180294e-05, + "loss": 0.064, + "step": 20180 + }, + { + "epoch": 1.0581761006289307, + "grad_norm": 1.0151050090789795, + "learning_rate": 2.354690775681342e-05, + "loss": 0.098, + "step": 20190 + }, + { + "epoch": 1.0587002096436058, + "grad_norm": 0.915666937828064, + "learning_rate": 2.3533805031446544e-05, + "loss": 0.0796, + "step": 20200 + }, + { + "epoch": 1.0592243186582808, + "grad_norm": 1.6964950561523438, + "learning_rate": 2.3520702306079664e-05, + "loss": 0.0945, + "step": 20210 + }, + { + "epoch": 1.059748427672956, + "grad_norm": 1.1933314800262451, + "learning_rate": 2.3507599580712787e-05, + "loss": 0.1041, + "step": 20220 + }, + { + "epoch": 1.060272536687631, + "grad_norm": 1.2950000762939453, + "learning_rate": 2.3494496855345914e-05, + "loss": 0.0987, + "step": 20230 + }, + { + "epoch": 1.0607966457023061, + "grad_norm": 3.0018699169158936, + "learning_rate": 2.3481394129979037e-05, + "loss": 0.0951, + "step": 20240 + }, + { + "epoch": 1.0613207547169812, + "grad_norm": 1.342100739479065, + "learning_rate": 2.346829140461216e-05, + "loss": 0.1001, + "step": 20250 + }, + { + "epoch": 1.0618448637316562, + "grad_norm": 1.8974846601486206, + "learning_rate": 2.3455188679245284e-05, + "loss": 0.0742, + "step": 20260 + }, + { + "epoch": 1.0623689727463312, + "grad_norm": 7.072523593902588, + "learning_rate": 2.3442085953878408e-05, + "loss": 0.0828, + "step": 20270 + }, + { + "epoch": 1.0628930817610063, + "grad_norm": 1.7849225997924805, + "learning_rate": 2.342898322851153e-05, + "loss": 0.1056, + "step": 20280 + }, + { + "epoch": 1.0634171907756813, + "grad_norm": 1.6293972730636597, + "learning_rate": 2.3415880503144654e-05, + "loss": 0.0775, + "step": 20290 + }, + { + "epoch": 1.0639412997903563, + "grad_norm": 1.087839961051941, + "learning_rate": 2.3402777777777778e-05, + "loss": 0.0821, + "step": 20300 + }, + { + "epoch": 1.0644654088050314, + "grad_norm": 1.9795023202896118, + "learning_rate": 2.3389675052410905e-05, + "loss": 0.0805, + "step": 20310 + }, + { + "epoch": 1.0649895178197064, + "grad_norm": 1.3084776401519775, + "learning_rate": 2.3376572327044028e-05, + "loss": 0.0841, + "step": 20320 + }, + { + "epoch": 1.0655136268343814, + "grad_norm": 2.340204954147339, + "learning_rate": 2.3363469601677148e-05, + "loss": 0.0879, + "step": 20330 + }, + { + "epoch": 1.0660377358490567, + "grad_norm": 0.9186223745346069, + "learning_rate": 2.335036687631027e-05, + "loss": 0.0893, + "step": 20340 + }, + { + "epoch": 1.0665618448637317, + "grad_norm": 1.2676678895950317, + "learning_rate": 2.3337264150943398e-05, + "loss": 0.1021, + "step": 20350 + }, + { + "epoch": 1.0670859538784068, + "grad_norm": 0.9018216133117676, + "learning_rate": 2.332416142557652e-05, + "loss": 0.1051, + "step": 20360 + }, + { + "epoch": 1.0676100628930818, + "grad_norm": 1.2806211709976196, + "learning_rate": 2.3311058700209645e-05, + "loss": 0.0798, + "step": 20370 + }, + { + "epoch": 1.0681341719077568, + "grad_norm": 1.302931785583496, + "learning_rate": 2.3297955974842768e-05, + "loss": 0.0808, + "step": 20380 + }, + { + "epoch": 1.0686582809224319, + "grad_norm": 1.1825281381607056, + "learning_rate": 2.328485324947589e-05, + "loss": 0.0655, + "step": 20390 + }, + { + "epoch": 1.069182389937107, + "grad_norm": 1.1505693197250366, + "learning_rate": 2.3271750524109015e-05, + "loss": 0.0667, + "step": 20400 + }, + { + "epoch": 1.069706498951782, + "grad_norm": 1.1411997079849243, + "learning_rate": 2.325864779874214e-05, + "loss": 0.063, + "step": 20410 + }, + { + "epoch": 1.070230607966457, + "grad_norm": 3.213791608810425, + "learning_rate": 2.3245545073375262e-05, + "loss": 0.0878, + "step": 20420 + }, + { + "epoch": 1.070754716981132, + "grad_norm": 1.0483222007751465, + "learning_rate": 2.323244234800839e-05, + "loss": 0.0997, + "step": 20430 + }, + { + "epoch": 1.071278825995807, + "grad_norm": 1.048044204711914, + "learning_rate": 2.3219339622641512e-05, + "loss": 0.0718, + "step": 20440 + }, + { + "epoch": 1.0718029350104823, + "grad_norm": 2.3207290172576904, + "learning_rate": 2.3206236897274632e-05, + "loss": 0.0635, + "step": 20450 + }, + { + "epoch": 1.0723270440251573, + "grad_norm": 1.414986491203308, + "learning_rate": 2.319313417190776e-05, + "loss": 0.0951, + "step": 20460 + }, + { + "epoch": 1.0728511530398324, + "grad_norm": 1.8669378757476807, + "learning_rate": 2.3180031446540882e-05, + "loss": 0.0832, + "step": 20470 + }, + { + "epoch": 1.0733752620545074, + "grad_norm": 1.9529931545257568, + "learning_rate": 2.3166928721174006e-05, + "loss": 0.0792, + "step": 20480 + }, + { + "epoch": 1.0738993710691824, + "grad_norm": 1.9123833179473877, + "learning_rate": 2.315382599580713e-05, + "loss": 0.0816, + "step": 20490 + }, + { + "epoch": 1.0744234800838575, + "grad_norm": 1.0265159606933594, + "learning_rate": 2.3140723270440252e-05, + "loss": 0.0734, + "step": 20500 + }, + { + "epoch": 1.0749475890985325, + "grad_norm": 1.1382958889007568, + "learning_rate": 2.3127620545073376e-05, + "loss": 0.0709, + "step": 20510 + }, + { + "epoch": 1.0754716981132075, + "grad_norm": 1.5899590253829956, + "learning_rate": 2.31145178197065e-05, + "loss": 0.0658, + "step": 20520 + }, + { + "epoch": 1.0759958071278826, + "grad_norm": 2.3364782333374023, + "learning_rate": 2.3101415094339622e-05, + "loss": 0.0805, + "step": 20530 + }, + { + "epoch": 1.0765199161425576, + "grad_norm": 1.562552571296692, + "learning_rate": 2.308831236897275e-05, + "loss": 0.0868, + "step": 20540 + }, + { + "epoch": 1.0770440251572326, + "grad_norm": 1.4748271703720093, + "learning_rate": 2.3075209643605873e-05, + "loss": 0.0733, + "step": 20550 + }, + { + "epoch": 1.0775681341719077, + "grad_norm": 0.6530401706695557, + "learning_rate": 2.3062106918238996e-05, + "loss": 0.0909, + "step": 20560 + }, + { + "epoch": 1.0780922431865827, + "grad_norm": 1.4629088640213013, + "learning_rate": 2.3049004192872116e-05, + "loss": 0.072, + "step": 20570 + }, + { + "epoch": 1.078616352201258, + "grad_norm": 1.0382362604141235, + "learning_rate": 2.3035901467505243e-05, + "loss": 0.0942, + "step": 20580 + }, + { + "epoch": 1.079140461215933, + "grad_norm": 1.4376634359359741, + "learning_rate": 2.3022798742138366e-05, + "loss": 0.0751, + "step": 20590 + }, + { + "epoch": 1.079664570230608, + "grad_norm": 1.0215072631835938, + "learning_rate": 2.300969601677149e-05, + "loss": 0.0652, + "step": 20600 + }, + { + "epoch": 1.080188679245283, + "grad_norm": 2.8767483234405518, + "learning_rate": 2.2996593291404613e-05, + "loss": 0.0791, + "step": 20610 + }, + { + "epoch": 1.080712788259958, + "grad_norm": 1.6112250089645386, + "learning_rate": 2.298349056603774e-05, + "loss": 0.0639, + "step": 20620 + }, + { + "epoch": 1.0812368972746331, + "grad_norm": 4.758137226104736, + "learning_rate": 2.297038784067086e-05, + "loss": 0.0753, + "step": 20630 + }, + { + "epoch": 1.0817610062893082, + "grad_norm": 11.462589263916016, + "learning_rate": 2.2957285115303983e-05, + "loss": 0.0713, + "step": 20640 + }, + { + "epoch": 1.0822851153039832, + "grad_norm": 2.255251169204712, + "learning_rate": 2.2944182389937106e-05, + "loss": 0.0757, + "step": 20650 + }, + { + "epoch": 1.0828092243186582, + "grad_norm": 1.9338898658752441, + "learning_rate": 2.2931079664570233e-05, + "loss": 0.07, + "step": 20660 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 3.076637029647827, + "learning_rate": 2.2917976939203357e-05, + "loss": 0.095, + "step": 20670 + }, + { + "epoch": 1.0838574423480083, + "grad_norm": 2.129887580871582, + "learning_rate": 2.290487421383648e-05, + "loss": 0.0727, + "step": 20680 + }, + { + "epoch": 1.0843815513626835, + "grad_norm": 1.7468769550323486, + "learning_rate": 2.28917714884696e-05, + "loss": 0.0819, + "step": 20690 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 1.341432809829712, + "learning_rate": 2.2878668763102727e-05, + "loss": 0.0719, + "step": 20700 + }, + { + "epoch": 1.0854297693920336, + "grad_norm": 2.184701442718506, + "learning_rate": 2.286556603773585e-05, + "loss": 0.0972, + "step": 20710 + }, + { + "epoch": 1.0859538784067087, + "grad_norm": 1.3120720386505127, + "learning_rate": 2.2852463312368974e-05, + "loss": 0.0849, + "step": 20720 + }, + { + "epoch": 1.0864779874213837, + "grad_norm": 2.175135850906372, + "learning_rate": 2.2839360587002097e-05, + "loss": 0.0775, + "step": 20730 + }, + { + "epoch": 1.0870020964360587, + "grad_norm": 1.1797727346420288, + "learning_rate": 2.2826257861635224e-05, + "loss": 0.0982, + "step": 20740 + }, + { + "epoch": 1.0875262054507338, + "grad_norm": 1.645690679550171, + "learning_rate": 2.2813155136268344e-05, + "loss": 0.0795, + "step": 20750 + }, + { + "epoch": 1.0880503144654088, + "grad_norm": 1.927145004272461, + "learning_rate": 2.2800052410901467e-05, + "loss": 0.0973, + "step": 20760 + }, + { + "epoch": 1.0885744234800838, + "grad_norm": 1.39384126663208, + "learning_rate": 2.278694968553459e-05, + "loss": 0.0593, + "step": 20770 + }, + { + "epoch": 1.0890985324947589, + "grad_norm": 1.5645813941955566, + "learning_rate": 2.2773846960167717e-05, + "loss": 0.0879, + "step": 20780 + }, + { + "epoch": 1.0896226415094339, + "grad_norm": 2.0848135948181152, + "learning_rate": 2.276074423480084e-05, + "loss": 0.0821, + "step": 20790 + }, + { + "epoch": 1.090146750524109, + "grad_norm": 0.7393643260002136, + "learning_rate": 2.2747641509433964e-05, + "loss": 0.0967, + "step": 20800 + }, + { + "epoch": 1.090670859538784, + "grad_norm": 1.7958664894104004, + "learning_rate": 2.2734538784067087e-05, + "loss": 0.0828, + "step": 20810 + }, + { + "epoch": 1.0911949685534592, + "grad_norm": 1.8084328174591064, + "learning_rate": 2.272143605870021e-05, + "loss": 0.0715, + "step": 20820 + }, + { + "epoch": 1.0917190775681342, + "grad_norm": 1.6376843452453613, + "learning_rate": 2.2708333333333334e-05, + "loss": 0.073, + "step": 20830 + }, + { + "epoch": 1.0922431865828093, + "grad_norm": 2.854077100753784, + "learning_rate": 2.2695230607966458e-05, + "loss": 0.0707, + "step": 20840 + }, + { + "epoch": 1.0927672955974843, + "grad_norm": 1.1942236423492432, + "learning_rate": 2.268212788259958e-05, + "loss": 0.0696, + "step": 20850 + }, + { + "epoch": 1.0932914046121593, + "grad_norm": 2.580146312713623, + "learning_rate": 2.2669025157232708e-05, + "loss": 0.0894, + "step": 20860 + }, + { + "epoch": 1.0938155136268344, + "grad_norm": 1.7355183362960815, + "learning_rate": 2.2655922431865828e-05, + "loss": 0.0898, + "step": 20870 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.6533396244049072, + "learning_rate": 2.264281970649895e-05, + "loss": 0.052, + "step": 20880 + }, + { + "epoch": 1.0948637316561844, + "grad_norm": 1.6988980770111084, + "learning_rate": 2.2629716981132078e-05, + "loss": 0.0753, + "step": 20890 + }, + { + "epoch": 1.0953878406708595, + "grad_norm": 3.65659761428833, + "learning_rate": 2.26166142557652e-05, + "loss": 0.099, + "step": 20900 + }, + { + "epoch": 1.0959119496855345, + "grad_norm": 1.2218525409698486, + "learning_rate": 2.2603511530398325e-05, + "loss": 0.083, + "step": 20910 + }, + { + "epoch": 1.0964360587002095, + "grad_norm": 1.5201767683029175, + "learning_rate": 2.2590408805031448e-05, + "loss": 0.1023, + "step": 20920 + }, + { + "epoch": 1.0969601677148848, + "grad_norm": 1.712514877319336, + "learning_rate": 2.257730607966457e-05, + "loss": 0.0958, + "step": 20930 + }, + { + "epoch": 1.0974842767295598, + "grad_norm": 1.532578945159912, + "learning_rate": 2.2564203354297695e-05, + "loss": 0.0693, + "step": 20940 + }, + { + "epoch": 1.0980083857442349, + "grad_norm": 0.4898047149181366, + "learning_rate": 2.2551100628930818e-05, + "loss": 0.0621, + "step": 20950 + }, + { + "epoch": 1.09853249475891, + "grad_norm": 1.3494205474853516, + "learning_rate": 2.253799790356394e-05, + "loss": 0.0647, + "step": 20960 + }, + { + "epoch": 1.099056603773585, + "grad_norm": 1.0484117269515991, + "learning_rate": 2.252489517819707e-05, + "loss": 0.0789, + "step": 20970 + }, + { + "epoch": 1.09958071278826, + "grad_norm": 1.8230509757995605, + "learning_rate": 2.2511792452830192e-05, + "loss": 0.0807, + "step": 20980 + }, + { + "epoch": 1.100104821802935, + "grad_norm": 1.3117787837982178, + "learning_rate": 2.2498689727463312e-05, + "loss": 0.0842, + "step": 20990 + }, + { + "epoch": 1.10062893081761, + "grad_norm": 1.3283014297485352, + "learning_rate": 2.2485587002096435e-05, + "loss": 0.078, + "step": 21000 + }, + { + "epoch": 1.10062893081761, + "eval_loss": 0.2784247398376465, + "eval_runtime": 267.8534, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 1.239, + "step": 21000 + }, + { + "epoch": 1.101153039832285, + "grad_norm": 2.8443620204925537, + "learning_rate": 2.2472484276729562e-05, + "loss": 0.0587, + "step": 21010 + }, + { + "epoch": 1.10167714884696, + "grad_norm": 2.1663818359375, + "learning_rate": 2.2459381551362685e-05, + "loss": 0.0754, + "step": 21020 + }, + { + "epoch": 1.1022012578616351, + "grad_norm": 1.3457231521606445, + "learning_rate": 2.244627882599581e-05, + "loss": 0.1037, + "step": 21030 + }, + { + "epoch": 1.1027253668763102, + "grad_norm": 3.6146819591522217, + "learning_rate": 2.2433176100628932e-05, + "loss": 0.0905, + "step": 21040 + }, + { + "epoch": 1.1032494758909852, + "grad_norm": 1.658570408821106, + "learning_rate": 2.2420073375262055e-05, + "loss": 0.0779, + "step": 21050 + }, + { + "epoch": 1.1037735849056605, + "grad_norm": 1.4389158487319946, + "learning_rate": 2.240697064989518e-05, + "loss": 0.0776, + "step": 21060 + }, + { + "epoch": 1.1042976939203355, + "grad_norm": 3.1745357513427734, + "learning_rate": 2.2393867924528302e-05, + "loss": 0.0852, + "step": 21070 + }, + { + "epoch": 1.1048218029350105, + "grad_norm": 1.5652964115142822, + "learning_rate": 2.2380765199161426e-05, + "loss": 0.0839, + "step": 21080 + }, + { + "epoch": 1.1053459119496856, + "grad_norm": 2.5231242179870605, + "learning_rate": 2.2367662473794552e-05, + "loss": 0.0819, + "step": 21090 + }, + { + "epoch": 1.1058700209643606, + "grad_norm": 2.037729263305664, + "learning_rate": 2.2354559748427676e-05, + "loss": 0.0995, + "step": 21100 + }, + { + "epoch": 1.1063941299790356, + "grad_norm": 1.0101507902145386, + "learning_rate": 2.2341457023060796e-05, + "loss": 0.0935, + "step": 21110 + }, + { + "epoch": 1.1069182389937107, + "grad_norm": 26.269018173217773, + "learning_rate": 2.2328354297693923e-05, + "loss": 0.0915, + "step": 21120 + }, + { + "epoch": 1.1074423480083857, + "grad_norm": 1.1649367809295654, + "learning_rate": 2.2315251572327046e-05, + "loss": 0.0649, + "step": 21130 + }, + { + "epoch": 1.1079664570230607, + "grad_norm": 1.7164911031723022, + "learning_rate": 2.230214884696017e-05, + "loss": 0.0964, + "step": 21140 + }, + { + "epoch": 1.1084905660377358, + "grad_norm": 1.4876567125320435, + "learning_rate": 2.2289046121593293e-05, + "loss": 0.1082, + "step": 21150 + }, + { + "epoch": 1.1090146750524108, + "grad_norm": 1.605036735534668, + "learning_rate": 2.2275943396226416e-05, + "loss": 0.0751, + "step": 21160 + }, + { + "epoch": 1.109538784067086, + "grad_norm": 3.23136830329895, + "learning_rate": 2.226284067085954e-05, + "loss": 0.0933, + "step": 21170 + }, + { + "epoch": 1.110062893081761, + "grad_norm": 2.443599224090576, + "learning_rate": 2.2249737945492663e-05, + "loss": 0.0781, + "step": 21180 + }, + { + "epoch": 1.1105870020964361, + "grad_norm": 2.5603668689727783, + "learning_rate": 2.2236635220125786e-05, + "loss": 0.1169, + "step": 21190 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.8467520475387573, + "learning_rate": 2.2223532494758913e-05, + "loss": 0.0748, + "step": 21200 + }, + { + "epoch": 1.1116352201257862, + "grad_norm": 1.695407509803772, + "learning_rate": 2.2210429769392036e-05, + "loss": 0.0884, + "step": 21210 + }, + { + "epoch": 1.1121593291404612, + "grad_norm": 1.1145198345184326, + "learning_rate": 2.219732704402516e-05, + "loss": 0.0885, + "step": 21220 + }, + { + "epoch": 1.1126834381551363, + "grad_norm": 1.6497974395751953, + "learning_rate": 2.218422431865828e-05, + "loss": 0.1082, + "step": 21230 + }, + { + "epoch": 1.1132075471698113, + "grad_norm": 1.1065045595169067, + "learning_rate": 2.2171121593291407e-05, + "loss": 0.075, + "step": 21240 + }, + { + "epoch": 1.1137316561844863, + "grad_norm": 1.636327862739563, + "learning_rate": 2.215801886792453e-05, + "loss": 0.093, + "step": 21250 + }, + { + "epoch": 1.1142557651991614, + "grad_norm": 1.5162497758865356, + "learning_rate": 2.2144916142557653e-05, + "loss": 0.0896, + "step": 21260 + }, + { + "epoch": 1.1147798742138364, + "grad_norm": 1.259658694267273, + "learning_rate": 2.2131813417190777e-05, + "loss": 0.0985, + "step": 21270 + }, + { + "epoch": 1.1153039832285114, + "grad_norm": 1.8518342971801758, + "learning_rate": 2.2118710691823903e-05, + "loss": 0.0906, + "step": 21280 + }, + { + "epoch": 1.1158280922431867, + "grad_norm": 1.3462951183319092, + "learning_rate": 2.2105607966457023e-05, + "loss": 0.0759, + "step": 21290 + }, + { + "epoch": 1.1163522012578617, + "grad_norm": 2.863619565963745, + "learning_rate": 2.2092505241090147e-05, + "loss": 0.1001, + "step": 21300 + }, + { + "epoch": 1.1168763102725368, + "grad_norm": 1.411542534828186, + "learning_rate": 2.207940251572327e-05, + "loss": 0.0834, + "step": 21310 + }, + { + "epoch": 1.1174004192872118, + "grad_norm": 1.7012072801589966, + "learning_rate": 2.2066299790356397e-05, + "loss": 0.1045, + "step": 21320 + }, + { + "epoch": 1.1179245283018868, + "grad_norm": 1.516890525817871, + "learning_rate": 2.205319706498952e-05, + "loss": 0.079, + "step": 21330 + }, + { + "epoch": 1.1184486373165619, + "grad_norm": 2.6436009407043457, + "learning_rate": 2.2040094339622644e-05, + "loss": 0.094, + "step": 21340 + }, + { + "epoch": 1.118972746331237, + "grad_norm": 1.700116515159607, + "learning_rate": 2.2026991614255764e-05, + "loss": 0.09, + "step": 21350 + }, + { + "epoch": 1.119496855345912, + "grad_norm": 1.6481711864471436, + "learning_rate": 2.201388888888889e-05, + "loss": 0.0853, + "step": 21360 + }, + { + "epoch": 1.120020964360587, + "grad_norm": 0.7776415348052979, + "learning_rate": 2.2000786163522014e-05, + "loss": 0.0777, + "step": 21370 + }, + { + "epoch": 1.120545073375262, + "grad_norm": 2.0679523944854736, + "learning_rate": 2.1987683438155137e-05, + "loss": 0.0697, + "step": 21380 + }, + { + "epoch": 1.121069182389937, + "grad_norm": 1.485421895980835, + "learning_rate": 2.197458071278826e-05, + "loss": 0.104, + "step": 21390 + }, + { + "epoch": 1.121593291404612, + "grad_norm": 1.4850335121154785, + "learning_rate": 2.1961477987421387e-05, + "loss": 0.0851, + "step": 21400 + }, + { + "epoch": 1.1221174004192873, + "grad_norm": 2.6674656867980957, + "learning_rate": 2.1948375262054507e-05, + "loss": 0.0735, + "step": 21410 + }, + { + "epoch": 1.1226415094339623, + "grad_norm": 1.8950225114822388, + "learning_rate": 2.193527253668763e-05, + "loss": 0.0873, + "step": 21420 + }, + { + "epoch": 1.1231656184486374, + "grad_norm": 2.049248218536377, + "learning_rate": 2.1922169811320754e-05, + "loss": 0.1157, + "step": 21430 + }, + { + "epoch": 1.1236897274633124, + "grad_norm": 1.7911266088485718, + "learning_rate": 2.190906708595388e-05, + "loss": 0.0687, + "step": 21440 + }, + { + "epoch": 1.1242138364779874, + "grad_norm": 1.8410648107528687, + "learning_rate": 2.1895964360587004e-05, + "loss": 0.0876, + "step": 21450 + }, + { + "epoch": 1.1247379454926625, + "grad_norm": 1.8042258024215698, + "learning_rate": 2.1882861635220128e-05, + "loss": 0.1086, + "step": 21460 + }, + { + "epoch": 1.1252620545073375, + "grad_norm": 0.6935757398605347, + "learning_rate": 2.186975890985325e-05, + "loss": 0.0694, + "step": 21470 + }, + { + "epoch": 1.1257861635220126, + "grad_norm": 1.8044699430465698, + "learning_rate": 2.1856656184486375e-05, + "loss": 0.0844, + "step": 21480 + }, + { + "epoch": 1.1263102725366876, + "grad_norm": 1.7842930555343628, + "learning_rate": 2.1843553459119498e-05, + "loss": 0.0659, + "step": 21490 + }, + { + "epoch": 1.1268343815513626, + "grad_norm": 2.891422748565674, + "learning_rate": 2.183045073375262e-05, + "loss": 0.101, + "step": 21500 + }, + { + "epoch": 1.1273584905660377, + "grad_norm": 3.2450954914093018, + "learning_rate": 2.1817348008385745e-05, + "loss": 0.1052, + "step": 21510 + }, + { + "epoch": 1.1278825995807127, + "grad_norm": 1.5685365200042725, + "learning_rate": 2.1804245283018868e-05, + "loss": 0.083, + "step": 21520 + }, + { + "epoch": 1.1284067085953877, + "grad_norm": 1.3027487993240356, + "learning_rate": 2.179114255765199e-05, + "loss": 0.0593, + "step": 21530 + }, + { + "epoch": 1.128930817610063, + "grad_norm": 1.3013249635696411, + "learning_rate": 2.1778039832285115e-05, + "loss": 0.089, + "step": 21540 + }, + { + "epoch": 1.129454926624738, + "grad_norm": 6.376953601837158, + "learning_rate": 2.176493710691824e-05, + "loss": 0.076, + "step": 21550 + }, + { + "epoch": 1.129979035639413, + "grad_norm": 1.9800182580947876, + "learning_rate": 2.1751834381551365e-05, + "loss": 0.0849, + "step": 21560 + }, + { + "epoch": 1.130503144654088, + "grad_norm": 1.376758098602295, + "learning_rate": 2.173873165618449e-05, + "loss": 0.0911, + "step": 21570 + }, + { + "epoch": 1.131027253668763, + "grad_norm": 1.5990701913833618, + "learning_rate": 2.172562893081761e-05, + "loss": 0.0957, + "step": 21580 + }, + { + "epoch": 1.1315513626834381, + "grad_norm": 1.919875144958496, + "learning_rate": 2.1712526205450735e-05, + "loss": 0.0947, + "step": 21590 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 1.7069398164749146, + "learning_rate": 2.169942348008386e-05, + "loss": 0.082, + "step": 21600 + }, + { + "epoch": 1.1325995807127882, + "grad_norm": 2.2045705318450928, + "learning_rate": 2.1686320754716982e-05, + "loss": 0.0742, + "step": 21610 + }, + { + "epoch": 1.1331236897274632, + "grad_norm": 0.6748783588409424, + "learning_rate": 2.1673218029350105e-05, + "loss": 0.0829, + "step": 21620 + }, + { + "epoch": 1.1336477987421383, + "grad_norm": 1.7201391458511353, + "learning_rate": 2.1660115303983232e-05, + "loss": 0.1016, + "step": 21630 + }, + { + "epoch": 1.1341719077568135, + "grad_norm": 1.3689556121826172, + "learning_rate": 2.1647012578616352e-05, + "loss": 0.0736, + "step": 21640 + }, + { + "epoch": 1.1346960167714886, + "grad_norm": 1.6909518241882324, + "learning_rate": 2.1633909853249475e-05, + "loss": 0.0841, + "step": 21650 + }, + { + "epoch": 1.1352201257861636, + "grad_norm": 1.656886100769043, + "learning_rate": 2.16208071278826e-05, + "loss": 0.0975, + "step": 21660 + }, + { + "epoch": 1.1357442348008386, + "grad_norm": 1.7160608768463135, + "learning_rate": 2.1607704402515726e-05, + "loss": 0.0857, + "step": 21670 + }, + { + "epoch": 1.1362683438155137, + "grad_norm": 2.022963047027588, + "learning_rate": 2.159460167714885e-05, + "loss": 0.0886, + "step": 21680 + }, + { + "epoch": 1.1367924528301887, + "grad_norm": 0.8972265720367432, + "learning_rate": 2.1581498951781972e-05, + "loss": 0.0882, + "step": 21690 + }, + { + "epoch": 1.1373165618448637, + "grad_norm": 1.2054320573806763, + "learning_rate": 2.1568396226415092e-05, + "loss": 0.073, + "step": 21700 + }, + { + "epoch": 1.1378406708595388, + "grad_norm": 7.229464054107666, + "learning_rate": 2.155529350104822e-05, + "loss": 0.078, + "step": 21710 + }, + { + "epoch": 1.1383647798742138, + "grad_norm": 1.8779710531234741, + "learning_rate": 2.1542190775681343e-05, + "loss": 0.0914, + "step": 21720 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.6816754341125488, + "learning_rate": 2.1529088050314466e-05, + "loss": 0.0648, + "step": 21730 + }, + { + "epoch": 1.1394129979035639, + "grad_norm": 1.756729245185852, + "learning_rate": 2.151598532494759e-05, + "loss": 0.1165, + "step": 21740 + }, + { + "epoch": 1.139937106918239, + "grad_norm": 1.6944255828857422, + "learning_rate": 2.1502882599580716e-05, + "loss": 0.0831, + "step": 21750 + }, + { + "epoch": 1.140461215932914, + "grad_norm": 1.633293628692627, + "learning_rate": 2.1489779874213836e-05, + "loss": 0.0777, + "step": 21760 + }, + { + "epoch": 1.140985324947589, + "grad_norm": 1.755721092224121, + "learning_rate": 2.147667714884696e-05, + "loss": 0.0714, + "step": 21770 + }, + { + "epoch": 1.1415094339622642, + "grad_norm": 1.9764751195907593, + "learning_rate": 2.1463574423480083e-05, + "loss": 0.0815, + "step": 21780 + }, + { + "epoch": 1.1420335429769393, + "grad_norm": 2.773207187652588, + "learning_rate": 2.145047169811321e-05, + "loss": 0.1026, + "step": 21790 + }, + { + "epoch": 1.1425576519916143, + "grad_norm": 1.6087942123413086, + "learning_rate": 2.1437368972746333e-05, + "loss": 0.0758, + "step": 21800 + }, + { + "epoch": 1.1430817610062893, + "grad_norm": 2.314055919647217, + "learning_rate": 2.1424266247379456e-05, + "loss": 0.0797, + "step": 21810 + }, + { + "epoch": 1.1436058700209644, + "grad_norm": 1.288299560546875, + "learning_rate": 2.141116352201258e-05, + "loss": 0.0582, + "step": 21820 + }, + { + "epoch": 1.1441299790356394, + "grad_norm": 2.209122657775879, + "learning_rate": 2.1398060796645703e-05, + "loss": 0.0791, + "step": 21830 + }, + { + "epoch": 1.1446540880503144, + "grad_norm": 0.9858881831169128, + "learning_rate": 2.1384958071278827e-05, + "loss": 0.0837, + "step": 21840 + }, + { + "epoch": 1.1451781970649895, + "grad_norm": 1.1568701267242432, + "learning_rate": 2.137185534591195e-05, + "loss": 0.081, + "step": 21850 + }, + { + "epoch": 1.1457023060796645, + "grad_norm": 1.5264705419540405, + "learning_rate": 2.1358752620545073e-05, + "loss": 0.0735, + "step": 21860 + }, + { + "epoch": 1.1462264150943395, + "grad_norm": 0.5113538503646851, + "learning_rate": 2.13456498951782e-05, + "loss": 0.083, + "step": 21870 + }, + { + "epoch": 1.1467505241090148, + "grad_norm": 0.6113923788070679, + "learning_rate": 2.133254716981132e-05, + "loss": 0.0741, + "step": 21880 + }, + { + "epoch": 1.1472746331236898, + "grad_norm": 0.6499518752098083, + "learning_rate": 2.1319444444444444e-05, + "loss": 0.0761, + "step": 21890 + }, + { + "epoch": 1.1477987421383649, + "grad_norm": 0.9743715524673462, + "learning_rate": 2.130634171907757e-05, + "loss": 0.0725, + "step": 21900 + }, + { + "epoch": 1.14832285115304, + "grad_norm": 2.16135835647583, + "learning_rate": 2.1293238993710694e-05, + "loss": 0.0935, + "step": 21910 + }, + { + "epoch": 1.148846960167715, + "grad_norm": 2.5390031337738037, + "learning_rate": 2.1280136268343817e-05, + "loss": 0.1021, + "step": 21920 + }, + { + "epoch": 1.14937106918239, + "grad_norm": 1.8435341119766235, + "learning_rate": 2.126703354297694e-05, + "loss": 0.0847, + "step": 21930 + }, + { + "epoch": 1.149895178197065, + "grad_norm": 0.9867782592773438, + "learning_rate": 2.1253930817610064e-05, + "loss": 0.0768, + "step": 21940 + }, + { + "epoch": 1.15041928721174, + "grad_norm": 1.5365256071090698, + "learning_rate": 2.1240828092243187e-05, + "loss": 0.1035, + "step": 21950 + }, + { + "epoch": 1.150943396226415, + "grad_norm": 1.4743176698684692, + "learning_rate": 2.122772536687631e-05, + "loss": 0.0555, + "step": 21960 + }, + { + "epoch": 1.15146750524109, + "grad_norm": 1.6891299486160278, + "learning_rate": 2.1214622641509434e-05, + "loss": 0.0775, + "step": 21970 + }, + { + "epoch": 1.1519916142557651, + "grad_norm": 1.8441011905670166, + "learning_rate": 2.120151991614256e-05, + "loss": 0.0716, + "step": 21980 + }, + { + "epoch": 1.1525157232704402, + "grad_norm": 1.4782497882843018, + "learning_rate": 2.1188417190775684e-05, + "loss": 0.086, + "step": 21990 + }, + { + "epoch": 1.1530398322851152, + "grad_norm": 1.208433985710144, + "learning_rate": 2.1175314465408804e-05, + "loss": 0.0837, + "step": 22000 + }, + { + "epoch": 1.1530398322851152, + "eval_loss": 0.2847573161125183, + "eval_runtime": 267.8045, + "eval_samples_per_second": 7.435, + "eval_steps_per_second": 1.24, + "step": 22000 + }, + { + "epoch": 1.1535639412997905, + "grad_norm": 1.218479037284851, + "learning_rate": 2.1162211740041928e-05, + "loss": 0.091, + "step": 22010 + }, + { + "epoch": 1.1540880503144655, + "grad_norm": 1.2849527597427368, + "learning_rate": 2.1149109014675054e-05, + "loss": 0.0958, + "step": 22020 + }, + { + "epoch": 1.1546121593291405, + "grad_norm": 1.1848913431167603, + "learning_rate": 2.1136006289308178e-05, + "loss": 0.0752, + "step": 22030 + }, + { + "epoch": 1.1551362683438156, + "grad_norm": 1.1825535297393799, + "learning_rate": 2.11229035639413e-05, + "loss": 0.0515, + "step": 22040 + }, + { + "epoch": 1.1556603773584906, + "grad_norm": 1.4824421405792236, + "learning_rate": 2.1109800838574424e-05, + "loss": 0.0732, + "step": 22050 + }, + { + "epoch": 1.1561844863731656, + "grad_norm": 2.042879581451416, + "learning_rate": 2.1096698113207548e-05, + "loss": 0.079, + "step": 22060 + }, + { + "epoch": 1.1567085953878407, + "grad_norm": 2.0270042419433594, + "learning_rate": 2.108359538784067e-05, + "loss": 0.0895, + "step": 22070 + }, + { + "epoch": 1.1572327044025157, + "grad_norm": 1.9607288837432861, + "learning_rate": 2.1070492662473795e-05, + "loss": 0.0703, + "step": 22080 + }, + { + "epoch": 1.1577568134171907, + "grad_norm": 0.7053709030151367, + "learning_rate": 2.1057389937106918e-05, + "loss": 0.0902, + "step": 22090 + }, + { + "epoch": 1.1582809224318658, + "grad_norm": 1.8536524772644043, + "learning_rate": 2.1044287211740045e-05, + "loss": 0.0696, + "step": 22100 + }, + { + "epoch": 1.1588050314465408, + "grad_norm": 2.1766324043273926, + "learning_rate": 2.1031184486373168e-05, + "loss": 0.0847, + "step": 22110 + }, + { + "epoch": 1.159329140461216, + "grad_norm": 1.659189224243164, + "learning_rate": 2.1018081761006288e-05, + "loss": 0.0708, + "step": 22120 + }, + { + "epoch": 1.159853249475891, + "grad_norm": 0.9984623193740845, + "learning_rate": 2.1004979035639415e-05, + "loss": 0.0747, + "step": 22130 + }, + { + "epoch": 1.1603773584905661, + "grad_norm": 2.5320563316345215, + "learning_rate": 2.099187631027254e-05, + "loss": 0.0808, + "step": 22140 + }, + { + "epoch": 1.1609014675052411, + "grad_norm": 1.5325194597244263, + "learning_rate": 2.0978773584905662e-05, + "loss": 0.1024, + "step": 22150 + }, + { + "epoch": 1.1614255765199162, + "grad_norm": 2.04860782623291, + "learning_rate": 2.0965670859538785e-05, + "loss": 0.0838, + "step": 22160 + }, + { + "epoch": 1.1619496855345912, + "grad_norm": 3.146354913711548, + "learning_rate": 2.095256813417191e-05, + "loss": 0.0903, + "step": 22170 + }, + { + "epoch": 1.1624737945492662, + "grad_norm": 1.1269997358322144, + "learning_rate": 2.0939465408805032e-05, + "loss": 0.0882, + "step": 22180 + }, + { + "epoch": 1.1629979035639413, + "grad_norm": 0.8249778747558594, + "learning_rate": 2.0926362683438155e-05, + "loss": 0.0923, + "step": 22190 + }, + { + "epoch": 1.1635220125786163, + "grad_norm": 1.9373500347137451, + "learning_rate": 2.091325995807128e-05, + "loss": 0.1042, + "step": 22200 + }, + { + "epoch": 1.1640461215932913, + "grad_norm": 1.7245064973831177, + "learning_rate": 2.0900157232704405e-05, + "loss": 0.1104, + "step": 22210 + }, + { + "epoch": 1.1645702306079664, + "grad_norm": 1.0767366886138916, + "learning_rate": 2.088705450733753e-05, + "loss": 0.0754, + "step": 22220 + }, + { + "epoch": 1.1650943396226414, + "grad_norm": 0.576208233833313, + "learning_rate": 2.0873951781970652e-05, + "loss": 0.0851, + "step": 22230 + }, + { + "epoch": 1.1656184486373165, + "grad_norm": 1.783109188079834, + "learning_rate": 2.0860849056603772e-05, + "loss": 0.0959, + "step": 22240 + }, + { + "epoch": 1.1661425576519917, + "grad_norm": 1.835574746131897, + "learning_rate": 2.08477463312369e-05, + "loss": 0.0939, + "step": 22250 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 1.7771759033203125, + "learning_rate": 2.0834643605870022e-05, + "loss": 0.0882, + "step": 22260 + }, + { + "epoch": 1.1671907756813418, + "grad_norm": 2.202103614807129, + "learning_rate": 2.0821540880503146e-05, + "loss": 0.0955, + "step": 22270 + }, + { + "epoch": 1.1677148846960168, + "grad_norm": 1.0116591453552246, + "learning_rate": 2.080843815513627e-05, + "loss": 0.0803, + "step": 22280 + }, + { + "epoch": 1.1682389937106918, + "grad_norm": 1.7534410953521729, + "learning_rate": 2.0795335429769396e-05, + "loss": 0.0853, + "step": 22290 + }, + { + "epoch": 1.1687631027253669, + "grad_norm": 1.1506248712539673, + "learning_rate": 2.0782232704402516e-05, + "loss": 0.0913, + "step": 22300 + }, + { + "epoch": 1.169287211740042, + "grad_norm": 1.104394555091858, + "learning_rate": 2.076912997903564e-05, + "loss": 0.0895, + "step": 22310 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 1.3719996213912964, + "learning_rate": 2.0756027253668763e-05, + "loss": 0.1063, + "step": 22320 + }, + { + "epoch": 1.170335429769392, + "grad_norm": 1.9209152460098267, + "learning_rate": 2.074292452830189e-05, + "loss": 0.0718, + "step": 22330 + }, + { + "epoch": 1.170859538784067, + "grad_norm": 2.760194778442383, + "learning_rate": 2.0729821802935013e-05, + "loss": 0.0533, + "step": 22340 + }, + { + "epoch": 1.171383647798742, + "grad_norm": 3.66623854637146, + "learning_rate": 2.0716719077568136e-05, + "loss": 0.0881, + "step": 22350 + }, + { + "epoch": 1.1719077568134173, + "grad_norm": 1.5722193717956543, + "learning_rate": 2.0703616352201256e-05, + "loss": 0.0609, + "step": 22360 + }, + { + "epoch": 1.1724318658280923, + "grad_norm": 1.63033127784729, + "learning_rate": 2.0690513626834383e-05, + "loss": 0.1026, + "step": 22370 + }, + { + "epoch": 1.1729559748427674, + "grad_norm": 1.509321928024292, + "learning_rate": 2.0677410901467506e-05, + "loss": 0.1055, + "step": 22380 + }, + { + "epoch": 1.1734800838574424, + "grad_norm": 2.2437334060668945, + "learning_rate": 2.066430817610063e-05, + "loss": 0.0891, + "step": 22390 + }, + { + "epoch": 1.1740041928721174, + "grad_norm": 1.9515290260314941, + "learning_rate": 2.0651205450733753e-05, + "loss": 0.0835, + "step": 22400 + }, + { + "epoch": 1.1745283018867925, + "grad_norm": 1.6707795858383179, + "learning_rate": 2.063810272536688e-05, + "loss": 0.0925, + "step": 22410 + }, + { + "epoch": 1.1750524109014675, + "grad_norm": 1.5614157915115356, + "learning_rate": 2.0625e-05, + "loss": 0.0949, + "step": 22420 + }, + { + "epoch": 1.1755765199161425, + "grad_norm": 1.81149160861969, + "learning_rate": 2.0611897274633123e-05, + "loss": 0.1014, + "step": 22430 + }, + { + "epoch": 1.1761006289308176, + "grad_norm": 1.1202183961868286, + "learning_rate": 2.0598794549266247e-05, + "loss": 0.0803, + "step": 22440 + }, + { + "epoch": 1.1766247379454926, + "grad_norm": 0.9992743730545044, + "learning_rate": 2.0585691823899373e-05, + "loss": 0.0877, + "step": 22450 + }, + { + "epoch": 1.1771488469601676, + "grad_norm": 1.73166823387146, + "learning_rate": 2.0572589098532497e-05, + "loss": 0.0762, + "step": 22460 + }, + { + "epoch": 1.1776729559748427, + "grad_norm": 1.5337498188018799, + "learning_rate": 2.055948637316562e-05, + "loss": 0.0799, + "step": 22470 + }, + { + "epoch": 1.1781970649895177, + "grad_norm": 2.315154552459717, + "learning_rate": 2.0546383647798744e-05, + "loss": 0.0857, + "step": 22480 + }, + { + "epoch": 1.178721174004193, + "grad_norm": 22.706295013427734, + "learning_rate": 2.0533280922431867e-05, + "loss": 0.0877, + "step": 22490 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 1.9468846321105957, + "learning_rate": 2.052017819706499e-05, + "loss": 0.1001, + "step": 22500 + }, + { + "epoch": 1.179769392033543, + "grad_norm": 1.4240168333053589, + "learning_rate": 2.0507075471698114e-05, + "loss": 0.075, + "step": 22510 + }, + { + "epoch": 1.180293501048218, + "grad_norm": 0.939274251461029, + "learning_rate": 2.0493972746331237e-05, + "loss": 0.0663, + "step": 22520 + }, + { + "epoch": 1.180817610062893, + "grad_norm": 1.8253507614135742, + "learning_rate": 2.0480870020964364e-05, + "loss": 0.0856, + "step": 22530 + }, + { + "epoch": 1.1813417190775681, + "grad_norm": 1.9873319864273071, + "learning_rate": 2.0467767295597484e-05, + "loss": 0.0866, + "step": 22540 + }, + { + "epoch": 1.1818658280922432, + "grad_norm": 1.9492064714431763, + "learning_rate": 2.0454664570230607e-05, + "loss": 0.0869, + "step": 22550 + }, + { + "epoch": 1.1823899371069182, + "grad_norm": 1.099589228630066, + "learning_rate": 2.0441561844863734e-05, + "loss": 0.0808, + "step": 22560 + }, + { + "epoch": 1.1829140461215932, + "grad_norm": 2.4824342727661133, + "learning_rate": 2.0428459119496857e-05, + "loss": 0.0929, + "step": 22570 + }, + { + "epoch": 1.1834381551362683, + "grad_norm": 1.5264191627502441, + "learning_rate": 2.041535639412998e-05, + "loss": 0.0813, + "step": 22580 + }, + { + "epoch": 1.1839622641509433, + "grad_norm": 2.196829319000244, + "learning_rate": 2.0402253668763104e-05, + "loss": 0.0803, + "step": 22590 + }, + { + "epoch": 1.1844863731656186, + "grad_norm": 1.7362253665924072, + "learning_rate": 2.0389150943396228e-05, + "loss": 0.0687, + "step": 22600 + }, + { + "epoch": 1.1850104821802936, + "grad_norm": 1.0077215433120728, + "learning_rate": 2.037604821802935e-05, + "loss": 0.1056, + "step": 22610 + }, + { + "epoch": 1.1855345911949686, + "grad_norm": 1.3866441249847412, + "learning_rate": 2.0362945492662474e-05, + "loss": 0.094, + "step": 22620 + }, + { + "epoch": 1.1860587002096437, + "grad_norm": 1.4912267923355103, + "learning_rate": 2.0349842767295598e-05, + "loss": 0.0996, + "step": 22630 + }, + { + "epoch": 1.1865828092243187, + "grad_norm": 1.5430445671081543, + "learning_rate": 2.0336740041928725e-05, + "loss": 0.0834, + "step": 22640 + }, + { + "epoch": 1.1871069182389937, + "grad_norm": 1.434836506843567, + "learning_rate": 2.0323637316561848e-05, + "loss": 0.0909, + "step": 22650 + }, + { + "epoch": 1.1876310272536688, + "grad_norm": 1.989877462387085, + "learning_rate": 2.0310534591194968e-05, + "loss": 0.0932, + "step": 22660 + }, + { + "epoch": 1.1881551362683438, + "grad_norm": 1.966412901878357, + "learning_rate": 2.029743186582809e-05, + "loss": 0.1094, + "step": 22670 + }, + { + "epoch": 1.1886792452830188, + "grad_norm": 2.0110154151916504, + "learning_rate": 2.0284329140461218e-05, + "loss": 0.0736, + "step": 22680 + }, + { + "epoch": 1.1892033542976939, + "grad_norm": 2.2693698406219482, + "learning_rate": 2.027122641509434e-05, + "loss": 0.0827, + "step": 22690 + }, + { + "epoch": 1.189727463312369, + "grad_norm": 0.8845059275627136, + "learning_rate": 2.0258123689727465e-05, + "loss": 0.0772, + "step": 22700 + }, + { + "epoch": 1.190251572327044, + "grad_norm": 2.433473587036133, + "learning_rate": 2.0245020964360588e-05, + "loss": 0.0976, + "step": 22710 + }, + { + "epoch": 1.190775681341719, + "grad_norm": 1.9096776247024536, + "learning_rate": 2.023191823899371e-05, + "loss": 0.0674, + "step": 22720 + }, + { + "epoch": 1.1912997903563942, + "grad_norm": 1.502175211906433, + "learning_rate": 2.0218815513626835e-05, + "loss": 0.0808, + "step": 22730 + }, + { + "epoch": 1.1918238993710693, + "grad_norm": 0.8952791094779968, + "learning_rate": 2.020571278825996e-05, + "loss": 0.1002, + "step": 22740 + }, + { + "epoch": 1.1923480083857443, + "grad_norm": 1.0224030017852783, + "learning_rate": 2.0192610062893082e-05, + "loss": 0.0841, + "step": 22750 + }, + { + "epoch": 1.1928721174004193, + "grad_norm": 1.8299604654312134, + "learning_rate": 2.017950733752621e-05, + "loss": 0.0828, + "step": 22760 + }, + { + "epoch": 1.1933962264150944, + "grad_norm": 1.035951852798462, + "learning_rate": 2.0166404612159332e-05, + "loss": 0.0696, + "step": 22770 + }, + { + "epoch": 1.1939203354297694, + "grad_norm": 1.6324563026428223, + "learning_rate": 2.0153301886792452e-05, + "loss": 0.0792, + "step": 22780 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.7720912098884583, + "learning_rate": 2.0140199161425575e-05, + "loss": 0.0821, + "step": 22790 + }, + { + "epoch": 1.1949685534591195, + "grad_norm": 1.4368795156478882, + "learning_rate": 2.0127096436058702e-05, + "loss": 0.0815, + "step": 22800 + }, + { + "epoch": 1.1954926624737945, + "grad_norm": 2.1467883586883545, + "learning_rate": 2.0113993710691825e-05, + "loss": 0.0792, + "step": 22810 + }, + { + "epoch": 1.1960167714884695, + "grad_norm": 4.250417709350586, + "learning_rate": 2.010089098532495e-05, + "loss": 0.0691, + "step": 22820 + }, + { + "epoch": 1.1965408805031448, + "grad_norm": 2.1994504928588867, + "learning_rate": 2.0087788259958072e-05, + "loss": 0.0837, + "step": 22830 + }, + { + "epoch": 1.1970649895178198, + "grad_norm": 1.6909440755844116, + "learning_rate": 2.0074685534591196e-05, + "loss": 0.0784, + "step": 22840 + }, + { + "epoch": 1.1975890985324948, + "grad_norm": 1.3120115995407104, + "learning_rate": 2.006158280922432e-05, + "loss": 0.0911, + "step": 22850 + }, + { + "epoch": 1.1981132075471699, + "grad_norm": 1.8463976383209229, + "learning_rate": 2.0048480083857442e-05, + "loss": 0.0702, + "step": 22860 + }, + { + "epoch": 1.198637316561845, + "grad_norm": 1.0547575950622559, + "learning_rate": 2.0035377358490566e-05, + "loss": 0.0837, + "step": 22870 + }, + { + "epoch": 1.19916142557652, + "grad_norm": 2.4690308570861816, + "learning_rate": 2.0022274633123693e-05, + "loss": 0.0826, + "step": 22880 + }, + { + "epoch": 1.199685534591195, + "grad_norm": 2.1016459465026855, + "learning_rate": 2.0009171907756816e-05, + "loss": 0.0828, + "step": 22890 + }, + { + "epoch": 1.20020964360587, + "grad_norm": 1.5869488716125488, + "learning_rate": 1.9996069182389936e-05, + "loss": 0.0722, + "step": 22900 + }, + { + "epoch": 1.200733752620545, + "grad_norm": 1.17249596118927, + "learning_rate": 1.9982966457023063e-05, + "loss": 0.0758, + "step": 22910 + }, + { + "epoch": 1.20125786163522, + "grad_norm": 3.9294803142547607, + "learning_rate": 1.9969863731656186e-05, + "loss": 0.0719, + "step": 22920 + }, + { + "epoch": 1.2017819706498951, + "grad_norm": 1.0598368644714355, + "learning_rate": 1.995676100628931e-05, + "loss": 0.0832, + "step": 22930 + }, + { + "epoch": 1.2023060796645701, + "grad_norm": 1.414456844329834, + "learning_rate": 1.9943658280922433e-05, + "loss": 0.0702, + "step": 22940 + }, + { + "epoch": 1.2028301886792452, + "grad_norm": 1.4275473356246948, + "learning_rate": 1.9930555555555556e-05, + "loss": 0.0924, + "step": 22950 + }, + { + "epoch": 1.2033542976939202, + "grad_norm": 1.5400974750518799, + "learning_rate": 1.991745283018868e-05, + "loss": 0.0635, + "step": 22960 + }, + { + "epoch": 1.2038784067085955, + "grad_norm": 2.163780450820923, + "learning_rate": 1.9904350104821803e-05, + "loss": 0.0882, + "step": 22970 + }, + { + "epoch": 1.2044025157232705, + "grad_norm": 2.687192440032959, + "learning_rate": 1.9891247379454926e-05, + "loss": 0.0728, + "step": 22980 + }, + { + "epoch": 1.2049266247379455, + "grad_norm": 1.8120397329330444, + "learning_rate": 1.9878144654088053e-05, + "loss": 0.0655, + "step": 22990 + }, + { + "epoch": 1.2054507337526206, + "grad_norm": 1.232093334197998, + "learning_rate": 1.9865041928721177e-05, + "loss": 0.0635, + "step": 23000 + }, + { + "epoch": 1.2054507337526206, + "eval_loss": 0.28128090500831604, + "eval_runtime": 267.6553, + "eval_samples_per_second": 7.439, + "eval_steps_per_second": 1.24, + "step": 23000 + }, + { + "epoch": 1.2059748427672956, + "grad_norm": 2.507798433303833, + "learning_rate": 1.98519392033543e-05, + "loss": 0.1134, + "step": 23010 + }, + { + "epoch": 1.2064989517819706, + "grad_norm": 0.5228186249732971, + "learning_rate": 1.983883647798742e-05, + "loss": 0.0916, + "step": 23020 + }, + { + "epoch": 1.2070230607966457, + "grad_norm": 1.7124361991882324, + "learning_rate": 1.9825733752620547e-05, + "loss": 0.0849, + "step": 23030 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 5.533046722412109, + "learning_rate": 1.981263102725367e-05, + "loss": 0.0556, + "step": 23040 + }, + { + "epoch": 1.2080712788259957, + "grad_norm": 0.8444564938545227, + "learning_rate": 1.9799528301886793e-05, + "loss": 0.0688, + "step": 23050 + }, + { + "epoch": 1.2085953878406708, + "grad_norm": 1.8386211395263672, + "learning_rate": 1.9786425576519917e-05, + "loss": 0.0891, + "step": 23060 + }, + { + "epoch": 1.209119496855346, + "grad_norm": 1.8787667751312256, + "learning_rate": 1.977332285115304e-05, + "loss": 0.0927, + "step": 23070 + }, + { + "epoch": 1.209643605870021, + "grad_norm": 0.8088480830192566, + "learning_rate": 1.9760220125786164e-05, + "loss": 0.0783, + "step": 23080 + }, + { + "epoch": 1.210167714884696, + "grad_norm": 1.171616792678833, + "learning_rate": 1.9747117400419287e-05, + "loss": 0.0905, + "step": 23090 + }, + { + "epoch": 1.2106918238993711, + "grad_norm": 2.413273572921753, + "learning_rate": 1.973401467505241e-05, + "loss": 0.0936, + "step": 23100 + }, + { + "epoch": 1.2112159329140462, + "grad_norm": 2.519268035888672, + "learning_rate": 1.9720911949685537e-05, + "loss": 0.1114, + "step": 23110 + }, + { + "epoch": 1.2117400419287212, + "grad_norm": 2.1220171451568604, + "learning_rate": 1.970780922431866e-05, + "loss": 0.0773, + "step": 23120 + }, + { + "epoch": 1.2122641509433962, + "grad_norm": 2.4642395973205566, + "learning_rate": 1.969470649895178e-05, + "loss": 0.0851, + "step": 23130 + }, + { + "epoch": 1.2127882599580713, + "grad_norm": 0.9276864528656006, + "learning_rate": 1.9681603773584907e-05, + "loss": 0.0707, + "step": 23140 + }, + { + "epoch": 1.2133123689727463, + "grad_norm": 0.9606568217277527, + "learning_rate": 1.966850104821803e-05, + "loss": 0.0991, + "step": 23150 + }, + { + "epoch": 1.2138364779874213, + "grad_norm": 2.05163311958313, + "learning_rate": 1.9655398322851154e-05, + "loss": 0.0769, + "step": 23160 + }, + { + "epoch": 1.2143605870020964, + "grad_norm": 1.6464990377426147, + "learning_rate": 1.9642295597484278e-05, + "loss": 0.1088, + "step": 23170 + }, + { + "epoch": 1.2148846960167714, + "grad_norm": 3.4842240810394287, + "learning_rate": 1.96291928721174e-05, + "loss": 0.0954, + "step": 23180 + }, + { + "epoch": 1.2154088050314464, + "grad_norm": 1.8171097040176392, + "learning_rate": 1.9616090146750524e-05, + "loss": 0.1078, + "step": 23190 + }, + { + "epoch": 1.2159329140461215, + "grad_norm": 1.6532925367355347, + "learning_rate": 1.9602987421383648e-05, + "loss": 0.0726, + "step": 23200 + }, + { + "epoch": 1.2164570230607967, + "grad_norm": 1.8835959434509277, + "learning_rate": 1.958988469601677e-05, + "loss": 0.1119, + "step": 23210 + }, + { + "epoch": 1.2169811320754718, + "grad_norm": 0.9576271176338196, + "learning_rate": 1.9576781970649898e-05, + "loss": 0.0839, + "step": 23220 + }, + { + "epoch": 1.2175052410901468, + "grad_norm": 1.8635886907577515, + "learning_rate": 1.956367924528302e-05, + "loss": 0.076, + "step": 23230 + }, + { + "epoch": 1.2180293501048218, + "grad_norm": 0.8350629806518555, + "learning_rate": 1.9550576519916145e-05, + "loss": 0.0756, + "step": 23240 + }, + { + "epoch": 1.2185534591194969, + "grad_norm": 1.33456552028656, + "learning_rate": 1.9537473794549265e-05, + "loss": 0.0774, + "step": 23250 + }, + { + "epoch": 1.219077568134172, + "grad_norm": 2.3194878101348877, + "learning_rate": 1.952437106918239e-05, + "loss": 0.0833, + "step": 23260 + }, + { + "epoch": 1.219601677148847, + "grad_norm": 2.020064115524292, + "learning_rate": 1.9511268343815515e-05, + "loss": 0.0428, + "step": 23270 + }, + { + "epoch": 1.220125786163522, + "grad_norm": 1.8238146305084229, + "learning_rate": 1.9498165618448638e-05, + "loss": 0.0772, + "step": 23280 + }, + { + "epoch": 1.220649895178197, + "grad_norm": 1.7014821767807007, + "learning_rate": 1.948506289308176e-05, + "loss": 0.084, + "step": 23290 + }, + { + "epoch": 1.221174004192872, + "grad_norm": 1.2988556623458862, + "learning_rate": 1.9471960167714888e-05, + "loss": 0.0781, + "step": 23300 + }, + { + "epoch": 1.2216981132075473, + "grad_norm": 1.7309902906417847, + "learning_rate": 1.9458857442348008e-05, + "loss": 0.0977, + "step": 23310 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.721279263496399, + "learning_rate": 1.944575471698113e-05, + "loss": 0.0726, + "step": 23320 + }, + { + "epoch": 1.2227463312368974, + "grad_norm": 1.762795090675354, + "learning_rate": 1.9432651991614255e-05, + "loss": 0.0805, + "step": 23330 + }, + { + "epoch": 1.2232704402515724, + "grad_norm": 1.2574833631515503, + "learning_rate": 1.9419549266247382e-05, + "loss": 0.0626, + "step": 23340 + }, + { + "epoch": 1.2237945492662474, + "grad_norm": 1.1116647720336914, + "learning_rate": 1.9406446540880505e-05, + "loss": 0.079, + "step": 23350 + }, + { + "epoch": 1.2243186582809225, + "grad_norm": 1.4834880828857422, + "learning_rate": 1.939334381551363e-05, + "loss": 0.0835, + "step": 23360 + }, + { + "epoch": 1.2248427672955975, + "grad_norm": 1.7493497133255005, + "learning_rate": 1.938024109014675e-05, + "loss": 0.0874, + "step": 23370 + }, + { + "epoch": 1.2253668763102725, + "grad_norm": 1.5207605361938477, + "learning_rate": 1.9367138364779875e-05, + "loss": 0.0804, + "step": 23380 + }, + { + "epoch": 1.2258909853249476, + "grad_norm": 1.1050654649734497, + "learning_rate": 1.9354035639413e-05, + "loss": 0.0602, + "step": 23390 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 1.671478033065796, + "learning_rate": 1.9340932914046122e-05, + "loss": 0.0931, + "step": 23400 + }, + { + "epoch": 1.2269392033542976, + "grad_norm": 2.356992721557617, + "learning_rate": 1.9327830188679246e-05, + "loss": 0.0736, + "step": 23410 + }, + { + "epoch": 1.2274633123689727, + "grad_norm": 1.6068631410598755, + "learning_rate": 1.9314727463312372e-05, + "loss": 0.0843, + "step": 23420 + }, + { + "epoch": 1.2279874213836477, + "grad_norm": 1.5799087285995483, + "learning_rate": 1.9301624737945492e-05, + "loss": 0.0627, + "step": 23430 + }, + { + "epoch": 1.2285115303983227, + "grad_norm": 1.4050095081329346, + "learning_rate": 1.9288522012578616e-05, + "loss": 0.0867, + "step": 23440 + }, + { + "epoch": 1.229035639412998, + "grad_norm": 1.1417391300201416, + "learning_rate": 1.927541928721174e-05, + "loss": 0.082, + "step": 23450 + }, + { + "epoch": 1.229559748427673, + "grad_norm": 1.4258673191070557, + "learning_rate": 1.9262316561844866e-05, + "loss": 0.0891, + "step": 23460 + }, + { + "epoch": 1.230083857442348, + "grad_norm": 1.9718081951141357, + "learning_rate": 1.924921383647799e-05, + "loss": 0.1071, + "step": 23470 + }, + { + "epoch": 1.230607966457023, + "grad_norm": 1.9824044704437256, + "learning_rate": 1.9236111111111113e-05, + "loss": 0.0804, + "step": 23480 + }, + { + "epoch": 1.2311320754716981, + "grad_norm": 1.7826849222183228, + "learning_rate": 1.9223008385744236e-05, + "loss": 0.0713, + "step": 23490 + }, + { + "epoch": 1.2316561844863732, + "grad_norm": 1.1761234998703003, + "learning_rate": 1.920990566037736e-05, + "loss": 0.0805, + "step": 23500 + }, + { + "epoch": 1.2321802935010482, + "grad_norm": 1.566415548324585, + "learning_rate": 1.9196802935010483e-05, + "loss": 0.0932, + "step": 23510 + }, + { + "epoch": 1.2327044025157232, + "grad_norm": 1.5760856866836548, + "learning_rate": 1.9183700209643606e-05, + "loss": 0.0546, + "step": 23520 + }, + { + "epoch": 1.2332285115303983, + "grad_norm": 1.3049871921539307, + "learning_rate": 1.917059748427673e-05, + "loss": 0.0666, + "step": 23530 + }, + { + "epoch": 1.2337526205450733, + "grad_norm": 1.3135899305343628, + "learning_rate": 1.9157494758909856e-05, + "loss": 0.1063, + "step": 23540 + }, + { + "epoch": 1.2342767295597485, + "grad_norm": 2.9520959854125977, + "learning_rate": 1.9144392033542976e-05, + "loss": 0.1013, + "step": 23550 + }, + { + "epoch": 1.2348008385744236, + "grad_norm": 1.6666831970214844, + "learning_rate": 1.91312893081761e-05, + "loss": 0.0893, + "step": 23560 + }, + { + "epoch": 1.2353249475890986, + "grad_norm": 1.646226406097412, + "learning_rate": 1.9118186582809226e-05, + "loss": 0.093, + "step": 23570 + }, + { + "epoch": 1.2358490566037736, + "grad_norm": 1.1616284847259521, + "learning_rate": 1.910508385744235e-05, + "loss": 0.0635, + "step": 23580 + }, + { + "epoch": 1.2363731656184487, + "grad_norm": 1.1960272789001465, + "learning_rate": 1.9091981132075473e-05, + "loss": 0.0731, + "step": 23590 + }, + { + "epoch": 1.2368972746331237, + "grad_norm": 1.1174589395523071, + "learning_rate": 1.9078878406708597e-05, + "loss": 0.0949, + "step": 23600 + }, + { + "epoch": 1.2374213836477987, + "grad_norm": 2.366077423095703, + "learning_rate": 1.906577568134172e-05, + "loss": 0.0673, + "step": 23610 + }, + { + "epoch": 1.2379454926624738, + "grad_norm": 1.1007031202316284, + "learning_rate": 1.9052672955974843e-05, + "loss": 0.0662, + "step": 23620 + }, + { + "epoch": 1.2384696016771488, + "grad_norm": 2.171886682510376, + "learning_rate": 1.9039570230607967e-05, + "loss": 0.1205, + "step": 23630 + }, + { + "epoch": 1.2389937106918238, + "grad_norm": 2.20706844329834, + "learning_rate": 1.902646750524109e-05, + "loss": 0.0727, + "step": 23640 + }, + { + "epoch": 1.2395178197064989, + "grad_norm": 1.7492408752441406, + "learning_rate": 1.9013364779874217e-05, + "loss": 0.0795, + "step": 23650 + }, + { + "epoch": 1.240041928721174, + "grad_norm": 2.009042978286743, + "learning_rate": 1.900026205450734e-05, + "loss": 0.0908, + "step": 23660 + }, + { + "epoch": 1.240566037735849, + "grad_norm": 1.2455140352249146, + "learning_rate": 1.898715932914046e-05, + "loss": 0.073, + "step": 23670 + }, + { + "epoch": 1.2410901467505242, + "grad_norm": 1.2918400764465332, + "learning_rate": 1.8974056603773584e-05, + "loss": 0.0594, + "step": 23680 + }, + { + "epoch": 1.2416142557651992, + "grad_norm": 2.6020805835723877, + "learning_rate": 1.896095387840671e-05, + "loss": 0.0691, + "step": 23690 + }, + { + "epoch": 1.2421383647798743, + "grad_norm": 1.4150261878967285, + "learning_rate": 1.8947851153039834e-05, + "loss": 0.0718, + "step": 23700 + }, + { + "epoch": 1.2426624737945493, + "grad_norm": 2.6102347373962402, + "learning_rate": 1.8934748427672957e-05, + "loss": 0.0836, + "step": 23710 + }, + { + "epoch": 1.2431865828092243, + "grad_norm": 0.48219212889671326, + "learning_rate": 1.892164570230608e-05, + "loss": 0.0774, + "step": 23720 + }, + { + "epoch": 1.2437106918238994, + "grad_norm": 1.1697911024093628, + "learning_rate": 1.8908542976939204e-05, + "loss": 0.0608, + "step": 23730 + }, + { + "epoch": 1.2442348008385744, + "grad_norm": 1.2900214195251465, + "learning_rate": 1.8895440251572327e-05, + "loss": 0.0761, + "step": 23740 + }, + { + "epoch": 1.2447589098532494, + "grad_norm": 1.660508394241333, + "learning_rate": 1.888233752620545e-05, + "loss": 0.0781, + "step": 23750 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 1.1662437915802002, + "learning_rate": 1.8869234800838574e-05, + "loss": 0.0885, + "step": 23760 + }, + { + "epoch": 1.2458071278825995, + "grad_norm": 1.7090445756912231, + "learning_rate": 1.88561320754717e-05, + "loss": 0.0704, + "step": 23770 + }, + { + "epoch": 1.2463312368972745, + "grad_norm": 1.6148638725280762, + "learning_rate": 1.8843029350104824e-05, + "loss": 0.0778, + "step": 23780 + }, + { + "epoch": 1.2468553459119498, + "grad_norm": 1.092518925666809, + "learning_rate": 1.8829926624737944e-05, + "loss": 0.0762, + "step": 23790 + }, + { + "epoch": 1.2473794549266248, + "grad_norm": 1.5229153633117676, + "learning_rate": 1.8816823899371068e-05, + "loss": 0.0889, + "step": 23800 + }, + { + "epoch": 1.2479035639412999, + "grad_norm": 1.1291695833206177, + "learning_rate": 1.8803721174004195e-05, + "loss": 0.0985, + "step": 23810 + }, + { + "epoch": 1.248427672955975, + "grad_norm": 1.2589600086212158, + "learning_rate": 1.8790618448637318e-05, + "loss": 0.0932, + "step": 23820 + }, + { + "epoch": 1.24895178197065, + "grad_norm": 1.1041934490203857, + "learning_rate": 1.877751572327044e-05, + "loss": 0.076, + "step": 23830 + }, + { + "epoch": 1.249475890985325, + "grad_norm": 1.4280002117156982, + "learning_rate": 1.8764412997903565e-05, + "loss": 0.0906, + "step": 23840 + }, + { + "epoch": 1.25, + "grad_norm": 1.893744945526123, + "learning_rate": 1.8751310272536688e-05, + "loss": 0.0997, + "step": 23850 + }, + { + "epoch": 1.250524109014675, + "grad_norm": 1.3034499883651733, + "learning_rate": 1.873820754716981e-05, + "loss": 0.0927, + "step": 23860 + }, + { + "epoch": 1.25104821802935, + "grad_norm": 1.0884954929351807, + "learning_rate": 1.8725104821802935e-05, + "loss": 0.0619, + "step": 23870 + }, + { + "epoch": 1.251572327044025, + "grad_norm": 1.8677185773849487, + "learning_rate": 1.8712002096436058e-05, + "loss": 0.0795, + "step": 23880 + }, + { + "epoch": 1.2520964360587001, + "grad_norm": 1.7047679424285889, + "learning_rate": 1.8698899371069185e-05, + "loss": 0.0926, + "step": 23890 + }, + { + "epoch": 1.2526205450733752, + "grad_norm": 1.151652216911316, + "learning_rate": 1.868579664570231e-05, + "loss": 0.0899, + "step": 23900 + }, + { + "epoch": 1.2531446540880502, + "grad_norm": 1.0031532049179077, + "learning_rate": 1.867269392033543e-05, + "loss": 0.0753, + "step": 23910 + }, + { + "epoch": 1.2536687631027252, + "grad_norm": 2.4875776767730713, + "learning_rate": 1.8659591194968555e-05, + "loss": 0.0919, + "step": 23920 + }, + { + "epoch": 1.2541928721174005, + "grad_norm": 2.6797757148742676, + "learning_rate": 1.864648846960168e-05, + "loss": 0.0719, + "step": 23930 + }, + { + "epoch": 1.2547169811320755, + "grad_norm": 1.4403328895568848, + "learning_rate": 1.8633385744234802e-05, + "loss": 0.102, + "step": 23940 + }, + { + "epoch": 1.2552410901467506, + "grad_norm": 2.064087390899658, + "learning_rate": 1.8620283018867925e-05, + "loss": 0.0838, + "step": 23950 + }, + { + "epoch": 1.2557651991614256, + "grad_norm": 6.239688873291016, + "learning_rate": 1.860718029350105e-05, + "loss": 0.0872, + "step": 23960 + }, + { + "epoch": 1.2562893081761006, + "grad_norm": 1.3345998525619507, + "learning_rate": 1.8594077568134172e-05, + "loss": 0.0869, + "step": 23970 + }, + { + "epoch": 1.2568134171907757, + "grad_norm": 1.0408798456192017, + "learning_rate": 1.8580974842767295e-05, + "loss": 0.0642, + "step": 23980 + }, + { + "epoch": 1.2573375262054507, + "grad_norm": 0.3426121175289154, + "learning_rate": 1.856787211740042e-05, + "loss": 0.0875, + "step": 23990 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 1.971558690071106, + "learning_rate": 1.8554769392033546e-05, + "loss": 0.0705, + "step": 24000 + }, + { + "epoch": 1.2578616352201257, + "eval_loss": 0.2765878736972809, + "eval_runtime": 267.3744, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 1.242, + "step": 24000 + }, + { + "epoch": 1.2583857442348008, + "grad_norm": 1.6661800146102905, + "learning_rate": 1.854166666666667e-05, + "loss": 0.0872, + "step": 24010 + }, + { + "epoch": 1.258909853249476, + "grad_norm": 1.5502612590789795, + "learning_rate": 1.8528563941299792e-05, + "loss": 0.0711, + "step": 24020 + }, + { + "epoch": 1.259433962264151, + "grad_norm": 2.2991933822631836, + "learning_rate": 1.8515461215932912e-05, + "loss": 0.0798, + "step": 24030 + }, + { + "epoch": 1.259958071278826, + "grad_norm": 1.4546477794647217, + "learning_rate": 1.850235849056604e-05, + "loss": 0.0913, + "step": 24040 + }, + { + "epoch": 1.2604821802935011, + "grad_norm": 0.8518606424331665, + "learning_rate": 1.8489255765199163e-05, + "loss": 0.0892, + "step": 24050 + }, + { + "epoch": 1.2610062893081762, + "grad_norm": 0.5437707901000977, + "learning_rate": 1.8476153039832286e-05, + "loss": 0.0515, + "step": 24060 + }, + { + "epoch": 1.2615303983228512, + "grad_norm": 2.1086854934692383, + "learning_rate": 1.846305031446541e-05, + "loss": 0.0687, + "step": 24070 + }, + { + "epoch": 1.2620545073375262, + "grad_norm": 1.66475248336792, + "learning_rate": 1.8449947589098536e-05, + "loss": 0.0899, + "step": 24080 + }, + { + "epoch": 1.2625786163522013, + "grad_norm": 1.212482213973999, + "learning_rate": 1.8436844863731656e-05, + "loss": 0.0922, + "step": 24090 + }, + { + "epoch": 1.2631027253668763, + "grad_norm": 1.6632353067398071, + "learning_rate": 1.842374213836478e-05, + "loss": 0.0613, + "step": 24100 + }, + { + "epoch": 1.2636268343815513, + "grad_norm": 0.9513425827026367, + "learning_rate": 1.8410639412997903e-05, + "loss": 0.0865, + "step": 24110 + }, + { + "epoch": 1.2641509433962264, + "grad_norm": 1.3721930980682373, + "learning_rate": 1.839753668763103e-05, + "loss": 0.0868, + "step": 24120 + }, + { + "epoch": 1.2646750524109014, + "grad_norm": 0.6984042525291443, + "learning_rate": 1.8384433962264153e-05, + "loss": 0.0884, + "step": 24130 + }, + { + "epoch": 1.2651991614255764, + "grad_norm": 1.0884439945220947, + "learning_rate": 1.8371331236897276e-05, + "loss": 0.0847, + "step": 24140 + }, + { + "epoch": 1.2657232704402515, + "grad_norm": 0.779158353805542, + "learning_rate": 1.83582285115304e-05, + "loss": 0.0633, + "step": 24150 + }, + { + "epoch": 1.2662473794549265, + "grad_norm": 5.8341898918151855, + "learning_rate": 1.8345125786163523e-05, + "loss": 0.0853, + "step": 24160 + }, + { + "epoch": 1.2667714884696017, + "grad_norm": 0.932140052318573, + "learning_rate": 1.8332023060796647e-05, + "loss": 0.1061, + "step": 24170 + }, + { + "epoch": 1.2672955974842768, + "grad_norm": 1.941718578338623, + "learning_rate": 1.831892033542977e-05, + "loss": 0.0707, + "step": 24180 + }, + { + "epoch": 1.2678197064989518, + "grad_norm": 1.0429099798202515, + "learning_rate": 1.8305817610062893e-05, + "loss": 0.077, + "step": 24190 + }, + { + "epoch": 1.2683438155136268, + "grad_norm": 2.249950885772705, + "learning_rate": 1.829271488469602e-05, + "loss": 0.0978, + "step": 24200 + }, + { + "epoch": 1.2688679245283019, + "grad_norm": 1.7662737369537354, + "learning_rate": 1.827961215932914e-05, + "loss": 0.0831, + "step": 24210 + }, + { + "epoch": 1.269392033542977, + "grad_norm": 1.8380582332611084, + "learning_rate": 1.8266509433962263e-05, + "loss": 0.0703, + "step": 24220 + }, + { + "epoch": 1.269916142557652, + "grad_norm": 1.9762569665908813, + "learning_rate": 1.825340670859539e-05, + "loss": 0.0581, + "step": 24230 + }, + { + "epoch": 1.270440251572327, + "grad_norm": 1.7735848426818848, + "learning_rate": 1.8240303983228514e-05, + "loss": 0.084, + "step": 24240 + }, + { + "epoch": 1.270964360587002, + "grad_norm": 1.975049376487732, + "learning_rate": 1.8227201257861637e-05, + "loss": 0.1041, + "step": 24250 + }, + { + "epoch": 1.2714884696016773, + "grad_norm": 1.864380955696106, + "learning_rate": 1.821409853249476e-05, + "loss": 0.0721, + "step": 24260 + }, + { + "epoch": 1.2720125786163523, + "grad_norm": 1.9561831951141357, + "learning_rate": 1.8200995807127884e-05, + "loss": 0.0863, + "step": 24270 + }, + { + "epoch": 1.2725366876310273, + "grad_norm": 2.3323402404785156, + "learning_rate": 1.8187893081761007e-05, + "loss": 0.0783, + "step": 24280 + }, + { + "epoch": 1.2730607966457024, + "grad_norm": 1.369814395904541, + "learning_rate": 1.817479035639413e-05, + "loss": 0.0649, + "step": 24290 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 3.996958017349243, + "learning_rate": 1.8161687631027254e-05, + "loss": 0.104, + "step": 24300 + }, + { + "epoch": 1.2741090146750524, + "grad_norm": 1.671249508857727, + "learning_rate": 1.814858490566038e-05, + "loss": 0.0811, + "step": 24310 + }, + { + "epoch": 1.2746331236897275, + "grad_norm": 1.0019627809524536, + "learning_rate": 1.8135482180293504e-05, + "loss": 0.0614, + "step": 24320 + }, + { + "epoch": 1.2751572327044025, + "grad_norm": 2.198514223098755, + "learning_rate": 1.8122379454926624e-05, + "loss": 0.0905, + "step": 24330 + }, + { + "epoch": 1.2756813417190775, + "grad_norm": 1.8543412685394287, + "learning_rate": 1.8109276729559747e-05, + "loss": 0.0843, + "step": 24340 + }, + { + "epoch": 1.2762054507337526, + "grad_norm": 1.5628409385681152, + "learning_rate": 1.8096174004192874e-05, + "loss": 0.1025, + "step": 24350 + }, + { + "epoch": 1.2767295597484276, + "grad_norm": 0.930385947227478, + "learning_rate": 1.8083071278825998e-05, + "loss": 0.0841, + "step": 24360 + }, + { + "epoch": 1.2772536687631026, + "grad_norm": 1.0186971426010132, + "learning_rate": 1.806996855345912e-05, + "loss": 0.0483, + "step": 24370 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 1.3639895915985107, + "learning_rate": 1.8056865828092244e-05, + "loss": 0.0903, + "step": 24380 + }, + { + "epoch": 1.2783018867924527, + "grad_norm": 1.24062180519104, + "learning_rate": 1.8043763102725368e-05, + "loss": 0.0912, + "step": 24390 + }, + { + "epoch": 1.2788259958071277, + "grad_norm": 0.5283358097076416, + "learning_rate": 1.803066037735849e-05, + "loss": 0.0629, + "step": 24400 + }, + { + "epoch": 1.279350104821803, + "grad_norm": 1.4935684204101562, + "learning_rate": 1.8017557651991615e-05, + "loss": 0.0711, + "step": 24410 + }, + { + "epoch": 1.279874213836478, + "grad_norm": 4.116866111755371, + "learning_rate": 1.8004454926624738e-05, + "loss": 0.083, + "step": 24420 + }, + { + "epoch": 1.280398322851153, + "grad_norm": 2.2552740573883057, + "learning_rate": 1.7991352201257865e-05, + "loss": 0.0794, + "step": 24430 + }, + { + "epoch": 1.280922431865828, + "grad_norm": 0.832618772983551, + "learning_rate": 1.7978249475890988e-05, + "loss": 0.0798, + "step": 24440 + }, + { + "epoch": 1.2814465408805031, + "grad_norm": 1.6791424751281738, + "learning_rate": 1.7965146750524108e-05, + "loss": 0.066, + "step": 24450 + }, + { + "epoch": 1.2819706498951782, + "grad_norm": 1.4778084754943848, + "learning_rate": 1.795204402515723e-05, + "loss": 0.0689, + "step": 24460 + }, + { + "epoch": 1.2824947589098532, + "grad_norm": 1.879032850265503, + "learning_rate": 1.7938941299790358e-05, + "loss": 0.0602, + "step": 24470 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 1.6691385507583618, + "learning_rate": 1.792583857442348e-05, + "loss": 0.0647, + "step": 24480 + }, + { + "epoch": 1.2835429769392033, + "grad_norm": 1.581830620765686, + "learning_rate": 1.7912735849056605e-05, + "loss": 0.069, + "step": 24490 + }, + { + "epoch": 1.2840670859538785, + "grad_norm": 1.7334755659103394, + "learning_rate": 1.789963312368973e-05, + "loss": 0.0885, + "step": 24500 + }, + { + "epoch": 1.2845911949685536, + "grad_norm": 1.2646406888961792, + "learning_rate": 1.7886530398322852e-05, + "loss": 0.0844, + "step": 24510 + }, + { + "epoch": 1.2851153039832286, + "grad_norm": 1.4649100303649902, + "learning_rate": 1.7873427672955975e-05, + "loss": 0.072, + "step": 24520 + }, + { + "epoch": 1.2856394129979036, + "grad_norm": 1.4135819673538208, + "learning_rate": 1.78603249475891e-05, + "loss": 0.0782, + "step": 24530 + }, + { + "epoch": 1.2861635220125787, + "grad_norm": 4.786865234375, + "learning_rate": 1.7847222222222222e-05, + "loss": 0.0969, + "step": 24540 + }, + { + "epoch": 1.2866876310272537, + "grad_norm": 1.6510841846466064, + "learning_rate": 1.783411949685535e-05, + "loss": 0.081, + "step": 24550 + }, + { + "epoch": 1.2872117400419287, + "grad_norm": 1.9891088008880615, + "learning_rate": 1.7821016771488472e-05, + "loss": 0.0773, + "step": 24560 + }, + { + "epoch": 1.2877358490566038, + "grad_norm": 1.993718147277832, + "learning_rate": 1.7807914046121592e-05, + "loss": 0.1071, + "step": 24570 + }, + { + "epoch": 1.2882599580712788, + "grad_norm": 1.368506669998169, + "learning_rate": 1.779481132075472e-05, + "loss": 0.0825, + "step": 24580 + }, + { + "epoch": 1.2887840670859538, + "grad_norm": 2.430133104324341, + "learning_rate": 1.7781708595387842e-05, + "loss": 0.057, + "step": 24590 + }, + { + "epoch": 1.2893081761006289, + "grad_norm": 0.8209651708602905, + "learning_rate": 1.7768605870020966e-05, + "loss": 0.0897, + "step": 24600 + }, + { + "epoch": 1.289832285115304, + "grad_norm": 1.1338999271392822, + "learning_rate": 1.775550314465409e-05, + "loss": 0.0922, + "step": 24610 + }, + { + "epoch": 1.290356394129979, + "grad_norm": 1.7733038663864136, + "learning_rate": 1.7742400419287212e-05, + "loss": 0.087, + "step": 24620 + }, + { + "epoch": 1.290880503144654, + "grad_norm": 1.529729962348938, + "learning_rate": 1.7729297693920336e-05, + "loss": 0.0581, + "step": 24630 + }, + { + "epoch": 1.291404612159329, + "grad_norm": 1.0193239450454712, + "learning_rate": 1.771619496855346e-05, + "loss": 0.0697, + "step": 24640 + }, + { + "epoch": 1.2919287211740043, + "grad_norm": 2.4770092964172363, + "learning_rate": 1.7703092243186583e-05, + "loss": 0.1183, + "step": 24650 + }, + { + "epoch": 1.2924528301886793, + "grad_norm": 1.4490543603897095, + "learning_rate": 1.768998951781971e-05, + "loss": 0.0778, + "step": 24660 + }, + { + "epoch": 1.2929769392033543, + "grad_norm": 1.66744863986969, + "learning_rate": 1.7676886792452833e-05, + "loss": 0.0871, + "step": 24670 + }, + { + "epoch": 1.2935010482180294, + "grad_norm": 3.002897262573242, + "learning_rate": 1.7663784067085953e-05, + "loss": 0.0937, + "step": 24680 + }, + { + "epoch": 1.2940251572327044, + "grad_norm": 1.3816642761230469, + "learning_rate": 1.7650681341719076e-05, + "loss": 0.0908, + "step": 24690 + }, + { + "epoch": 1.2945492662473794, + "grad_norm": 2.319898843765259, + "learning_rate": 1.7637578616352203e-05, + "loss": 0.0796, + "step": 24700 + }, + { + "epoch": 1.2950733752620545, + "grad_norm": 1.7644799947738647, + "learning_rate": 1.7624475890985326e-05, + "loss": 0.0802, + "step": 24710 + }, + { + "epoch": 1.2955974842767295, + "grad_norm": 1.809232234954834, + "learning_rate": 1.761137316561845e-05, + "loss": 0.1001, + "step": 24720 + }, + { + "epoch": 1.2961215932914047, + "grad_norm": 1.5217998027801514, + "learning_rate": 1.7598270440251573e-05, + "loss": 0.0938, + "step": 24730 + }, + { + "epoch": 1.2966457023060798, + "grad_norm": 1.3758409023284912, + "learning_rate": 1.7585167714884696e-05, + "loss": 0.0849, + "step": 24740 + }, + { + "epoch": 1.2971698113207548, + "grad_norm": 3.6694116592407227, + "learning_rate": 1.757206498951782e-05, + "loss": 0.0688, + "step": 24750 + }, + { + "epoch": 1.2976939203354299, + "grad_norm": 1.149685025215149, + "learning_rate": 1.7558962264150943e-05, + "loss": 0.0924, + "step": 24760 + }, + { + "epoch": 1.2982180293501049, + "grad_norm": 4.056723117828369, + "learning_rate": 1.7545859538784067e-05, + "loss": 0.0835, + "step": 24770 + }, + { + "epoch": 1.29874213836478, + "grad_norm": 1.304752230644226, + "learning_rate": 1.7532756813417193e-05, + "loss": 0.0758, + "step": 24780 + }, + { + "epoch": 1.299266247379455, + "grad_norm": 1.4952155351638794, + "learning_rate": 1.7519654088050317e-05, + "loss": 0.101, + "step": 24790 + }, + { + "epoch": 1.29979035639413, + "grad_norm": 2.3801004886627197, + "learning_rate": 1.7506551362683437e-05, + "loss": 0.0961, + "step": 24800 + }, + { + "epoch": 1.300314465408805, + "grad_norm": 2.32028865814209, + "learning_rate": 1.749344863731656e-05, + "loss": 0.0858, + "step": 24810 + }, + { + "epoch": 1.30083857442348, + "grad_norm": 1.2908284664154053, + "learning_rate": 1.7480345911949687e-05, + "loss": 0.0753, + "step": 24820 + }, + { + "epoch": 1.301362683438155, + "grad_norm": 1.50250244140625, + "learning_rate": 1.746724318658281e-05, + "loss": 0.0786, + "step": 24830 + }, + { + "epoch": 1.3018867924528301, + "grad_norm": 1.1071337461471558, + "learning_rate": 1.7454140461215934e-05, + "loss": 0.0938, + "step": 24840 + }, + { + "epoch": 1.3024109014675052, + "grad_norm": 1.892912745475769, + "learning_rate": 1.7441037735849057e-05, + "loss": 0.0845, + "step": 24850 + }, + { + "epoch": 1.3029350104821802, + "grad_norm": 1.4982322454452515, + "learning_rate": 1.742793501048218e-05, + "loss": 0.0852, + "step": 24860 + }, + { + "epoch": 1.3034591194968552, + "grad_norm": 1.102202296257019, + "learning_rate": 1.7414832285115304e-05, + "loss": 0.0562, + "step": 24870 + }, + { + "epoch": 1.3039832285115305, + "grad_norm": 1.2208045721054077, + "learning_rate": 1.7401729559748427e-05, + "loss": 0.0956, + "step": 24880 + }, + { + "epoch": 1.3045073375262055, + "grad_norm": 1.5137170553207397, + "learning_rate": 1.738862683438155e-05, + "loss": 0.0797, + "step": 24890 + }, + { + "epoch": 1.3050314465408805, + "grad_norm": 2.021153688430786, + "learning_rate": 1.7375524109014677e-05, + "loss": 0.0793, + "step": 24900 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 1.4623279571533203, + "learning_rate": 1.73624213836478e-05, + "loss": 0.0784, + "step": 24910 + }, + { + "epoch": 1.3060796645702306, + "grad_norm": 2.375816583633423, + "learning_rate": 1.734931865828092e-05, + "loss": 0.1034, + "step": 24920 + }, + { + "epoch": 1.3066037735849056, + "grad_norm": 4.277223110198975, + "learning_rate": 1.7336215932914048e-05, + "loss": 0.1049, + "step": 24930 + }, + { + "epoch": 1.3071278825995807, + "grad_norm": 0.9970260262489319, + "learning_rate": 1.732311320754717e-05, + "loss": 0.0733, + "step": 24940 + }, + { + "epoch": 1.3076519916142557, + "grad_norm": 0.7952342629432678, + "learning_rate": 1.7310010482180294e-05, + "loss": 0.0653, + "step": 24950 + }, + { + "epoch": 1.3081761006289307, + "grad_norm": 2.167339324951172, + "learning_rate": 1.7296907756813418e-05, + "loss": 0.0685, + "step": 24960 + }, + { + "epoch": 1.308700209643606, + "grad_norm": 1.8290060758590698, + "learning_rate": 1.728380503144654e-05, + "loss": 0.0777, + "step": 24970 + }, + { + "epoch": 1.309224318658281, + "grad_norm": 1.786184310913086, + "learning_rate": 1.7270702306079664e-05, + "loss": 0.0852, + "step": 24980 + }, + { + "epoch": 1.309748427672956, + "grad_norm": 0.8602116703987122, + "learning_rate": 1.7257599580712788e-05, + "loss": 0.078, + "step": 24990 + }, + { + "epoch": 1.310272536687631, + "grad_norm": 2.8858346939086914, + "learning_rate": 1.724449685534591e-05, + "loss": 0.089, + "step": 25000 + }, + { + "epoch": 1.310272536687631, + "eval_loss": 0.2705696225166321, + "eval_runtime": 267.3452, + "eval_samples_per_second": 7.447, + "eval_steps_per_second": 1.242, + "step": 25000 + }, + { + "epoch": 1.3107966457023061, + "grad_norm": 1.337947130203247, + "learning_rate": 1.7231394129979038e-05, + "loss": 0.0964, + "step": 25010 + }, + { + "epoch": 1.3113207547169812, + "grad_norm": 0.7933491468429565, + "learning_rate": 1.721829140461216e-05, + "loss": 0.0623, + "step": 25020 + }, + { + "epoch": 1.3118448637316562, + "grad_norm": 1.4174140691757202, + "learning_rate": 1.7205188679245285e-05, + "loss": 0.0571, + "step": 25030 + }, + { + "epoch": 1.3123689727463312, + "grad_norm": 1.281467318534851, + "learning_rate": 1.7192085953878405e-05, + "loss": 0.064, + "step": 25040 + }, + { + "epoch": 1.3128930817610063, + "grad_norm": 0.8671842217445374, + "learning_rate": 1.717898322851153e-05, + "loss": 0.0589, + "step": 25050 + }, + { + "epoch": 1.3134171907756813, + "grad_norm": 1.0511995553970337, + "learning_rate": 1.7165880503144655e-05, + "loss": 0.0584, + "step": 25060 + }, + { + "epoch": 1.3139412997903563, + "grad_norm": 1.4137508869171143, + "learning_rate": 1.715277777777778e-05, + "loss": 0.0886, + "step": 25070 + }, + { + "epoch": 1.3144654088050314, + "grad_norm": 1.7218070030212402, + "learning_rate": 1.7139675052410902e-05, + "loss": 0.0705, + "step": 25080 + }, + { + "epoch": 1.3149895178197064, + "grad_norm": 1.2758680582046509, + "learning_rate": 1.712657232704403e-05, + "loss": 0.0654, + "step": 25090 + }, + { + "epoch": 1.3155136268343814, + "grad_norm": 1.9731104373931885, + "learning_rate": 1.711346960167715e-05, + "loss": 0.0936, + "step": 25100 + }, + { + "epoch": 1.3160377358490565, + "grad_norm": 1.5953030586242676, + "learning_rate": 1.7100366876310272e-05, + "loss": 0.0706, + "step": 25110 + }, + { + "epoch": 1.3165618448637317, + "grad_norm": 2.0026612281799316, + "learning_rate": 1.7087264150943395e-05, + "loss": 0.0643, + "step": 25120 + }, + { + "epoch": 1.3170859538784068, + "grad_norm": 2.5946147441864014, + "learning_rate": 1.7074161425576522e-05, + "loss": 0.0718, + "step": 25130 + }, + { + "epoch": 1.3176100628930818, + "grad_norm": 2.603461265563965, + "learning_rate": 1.7061058700209645e-05, + "loss": 0.0671, + "step": 25140 + }, + { + "epoch": 1.3181341719077568, + "grad_norm": 2.067505359649658, + "learning_rate": 1.704795597484277e-05, + "loss": 0.0665, + "step": 25150 + }, + { + "epoch": 1.3186582809224319, + "grad_norm": 2.5327792167663574, + "learning_rate": 1.7034853249475892e-05, + "loss": 0.0844, + "step": 25160 + }, + { + "epoch": 1.319182389937107, + "grad_norm": 1.8760923147201538, + "learning_rate": 1.7021750524109016e-05, + "loss": 0.107, + "step": 25170 + }, + { + "epoch": 1.319706498951782, + "grad_norm": 1.9224241971969604, + "learning_rate": 1.700864779874214e-05, + "loss": 0.0915, + "step": 25180 + }, + { + "epoch": 1.320230607966457, + "grad_norm": 1.2848577499389648, + "learning_rate": 1.6995545073375262e-05, + "loss": 0.0901, + "step": 25190 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 1.1218456029891968, + "learning_rate": 1.6982442348008386e-05, + "loss": 0.0721, + "step": 25200 + }, + { + "epoch": 1.3212788259958073, + "grad_norm": 1.2322758436203003, + "learning_rate": 1.6969339622641513e-05, + "loss": 0.0756, + "step": 25210 + }, + { + "epoch": 1.3218029350104823, + "grad_norm": 1.3583430051803589, + "learning_rate": 1.6956236897274633e-05, + "loss": 0.0769, + "step": 25220 + }, + { + "epoch": 1.3223270440251573, + "grad_norm": 1.2304691076278687, + "learning_rate": 1.6943134171907756e-05, + "loss": 0.1051, + "step": 25230 + }, + { + "epoch": 1.3228511530398324, + "grad_norm": 0.6938934326171875, + "learning_rate": 1.6930031446540883e-05, + "loss": 0.0658, + "step": 25240 + }, + { + "epoch": 1.3233752620545074, + "grad_norm": 1.2099896669387817, + "learning_rate": 1.6916928721174006e-05, + "loss": 0.1008, + "step": 25250 + }, + { + "epoch": 1.3238993710691824, + "grad_norm": 1.8550902605056763, + "learning_rate": 1.690382599580713e-05, + "loss": 0.0807, + "step": 25260 + }, + { + "epoch": 1.3244234800838575, + "grad_norm": 1.730497121810913, + "learning_rate": 1.6890723270440253e-05, + "loss": 0.0948, + "step": 25270 + }, + { + "epoch": 1.3249475890985325, + "grad_norm": 1.381489872932434, + "learning_rate": 1.6877620545073376e-05, + "loss": 0.0846, + "step": 25280 + }, + { + "epoch": 1.3254716981132075, + "grad_norm": 2.28286075592041, + "learning_rate": 1.68645178197065e-05, + "loss": 0.093, + "step": 25290 + }, + { + "epoch": 1.3259958071278826, + "grad_norm": 0.8911481499671936, + "learning_rate": 1.6851415094339623e-05, + "loss": 0.068, + "step": 25300 + }, + { + "epoch": 1.3265199161425576, + "grad_norm": 1.4973480701446533, + "learning_rate": 1.6838312368972746e-05, + "loss": 0.0768, + "step": 25310 + }, + { + "epoch": 1.3270440251572326, + "grad_norm": 1.591711163520813, + "learning_rate": 1.6825209643605873e-05, + "loss": 0.0908, + "step": 25320 + }, + { + "epoch": 1.3275681341719077, + "grad_norm": 2.0032050609588623, + "learning_rate": 1.6812106918238997e-05, + "loss": 0.0747, + "step": 25330 + }, + { + "epoch": 1.3280922431865827, + "grad_norm": 0.906340479850769, + "learning_rate": 1.6799004192872117e-05, + "loss": 0.0652, + "step": 25340 + }, + { + "epoch": 1.3286163522012577, + "grad_norm": 1.9054814577102661, + "learning_rate": 1.678590146750524e-05, + "loss": 0.0761, + "step": 25350 + }, + { + "epoch": 1.329140461215933, + "grad_norm": 0.8013094067573547, + "learning_rate": 1.6772798742138367e-05, + "loss": 0.0736, + "step": 25360 + }, + { + "epoch": 1.329664570230608, + "grad_norm": 1.0076137781143188, + "learning_rate": 1.675969601677149e-05, + "loss": 0.0819, + "step": 25370 + }, + { + "epoch": 1.330188679245283, + "grad_norm": 1.4720983505249023, + "learning_rate": 1.6746593291404613e-05, + "loss": 0.0625, + "step": 25380 + }, + { + "epoch": 1.330712788259958, + "grad_norm": 1.1966739892959595, + "learning_rate": 1.6733490566037737e-05, + "loss": 0.0645, + "step": 25390 + }, + { + "epoch": 1.3312368972746331, + "grad_norm": 1.5404307842254639, + "learning_rate": 1.672038784067086e-05, + "loss": 0.0765, + "step": 25400 + }, + { + "epoch": 1.3317610062893082, + "grad_norm": 1.782375693321228, + "learning_rate": 1.6707285115303984e-05, + "loss": 0.0719, + "step": 25410 + }, + { + "epoch": 1.3322851153039832, + "grad_norm": 1.7037994861602783, + "learning_rate": 1.6694182389937107e-05, + "loss": 0.0644, + "step": 25420 + }, + { + "epoch": 1.3328092243186582, + "grad_norm": 1.1350616216659546, + "learning_rate": 1.668107966457023e-05, + "loss": 0.0615, + "step": 25430 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.5969865322113037, + "learning_rate": 1.6667976939203357e-05, + "loss": 0.0851, + "step": 25440 + }, + { + "epoch": 1.3338574423480085, + "grad_norm": 2.3378772735595703, + "learning_rate": 1.665487421383648e-05, + "loss": 0.0882, + "step": 25450 + }, + { + "epoch": 1.3343815513626835, + "grad_norm": 1.8046290874481201, + "learning_rate": 1.66417714884696e-05, + "loss": 0.0719, + "step": 25460 + }, + { + "epoch": 1.3349056603773586, + "grad_norm": 1.4188587665557861, + "learning_rate": 1.6628668763102724e-05, + "loss": 0.085, + "step": 25470 + }, + { + "epoch": 1.3354297693920336, + "grad_norm": 1.3640601634979248, + "learning_rate": 1.661556603773585e-05, + "loss": 0.0713, + "step": 25480 + }, + { + "epoch": 1.3359538784067087, + "grad_norm": 1.4131982326507568, + "learning_rate": 1.6602463312368974e-05, + "loss": 0.0978, + "step": 25490 + }, + { + "epoch": 1.3364779874213837, + "grad_norm": 1.2081284523010254, + "learning_rate": 1.6589360587002097e-05, + "loss": 0.0988, + "step": 25500 + }, + { + "epoch": 1.3370020964360587, + "grad_norm": 1.0797348022460938, + "learning_rate": 1.657625786163522e-05, + "loss": 0.0692, + "step": 25510 + }, + { + "epoch": 1.3375262054507338, + "grad_norm": 2.361949920654297, + "learning_rate": 1.6563155136268344e-05, + "loss": 0.0553, + "step": 25520 + }, + { + "epoch": 1.3380503144654088, + "grad_norm": 1.3106904029846191, + "learning_rate": 1.6550052410901468e-05, + "loss": 0.0726, + "step": 25530 + }, + { + "epoch": 1.3385744234800838, + "grad_norm": 1.1975557804107666, + "learning_rate": 1.653694968553459e-05, + "loss": 0.0744, + "step": 25540 + }, + { + "epoch": 1.3390985324947589, + "grad_norm": 1.9830306768417358, + "learning_rate": 1.6523846960167714e-05, + "loss": 0.0855, + "step": 25550 + }, + { + "epoch": 1.3396226415094339, + "grad_norm": 1.101518988609314, + "learning_rate": 1.651074423480084e-05, + "loss": 0.0719, + "step": 25560 + }, + { + "epoch": 1.340146750524109, + "grad_norm": 2.719447612762451, + "learning_rate": 1.6497641509433965e-05, + "loss": 0.0897, + "step": 25570 + }, + { + "epoch": 1.340670859538784, + "grad_norm": 2.419497013092041, + "learning_rate": 1.6484538784067085e-05, + "loss": 0.0667, + "step": 25580 + }, + { + "epoch": 1.341194968553459, + "grad_norm": 1.612613558769226, + "learning_rate": 1.647143605870021e-05, + "loss": 0.0958, + "step": 25590 + }, + { + "epoch": 1.3417190775681342, + "grad_norm": 1.1840425729751587, + "learning_rate": 1.6458333333333335e-05, + "loss": 0.0588, + "step": 25600 + }, + { + "epoch": 1.3422431865828093, + "grad_norm": 1.41465163230896, + "learning_rate": 1.6445230607966458e-05, + "loss": 0.0773, + "step": 25610 + }, + { + "epoch": 1.3427672955974843, + "grad_norm": 1.5764625072479248, + "learning_rate": 1.643212788259958e-05, + "loss": 0.0812, + "step": 25620 + }, + { + "epoch": 1.3432914046121593, + "grad_norm": 1.6776331663131714, + "learning_rate": 1.6419025157232705e-05, + "loss": 0.0829, + "step": 25630 + }, + { + "epoch": 1.3438155136268344, + "grad_norm": 1.4592304229736328, + "learning_rate": 1.6405922431865828e-05, + "loss": 0.1036, + "step": 25640 + }, + { + "epoch": 1.3443396226415094, + "grad_norm": 1.5961178541183472, + "learning_rate": 1.639281970649895e-05, + "loss": 0.083, + "step": 25650 + }, + { + "epoch": 1.3448637316561844, + "grad_norm": 1.031274437904358, + "learning_rate": 1.6379716981132075e-05, + "loss": 0.1249, + "step": 25660 + }, + { + "epoch": 1.3453878406708595, + "grad_norm": 1.7729008197784424, + "learning_rate": 1.6366614255765202e-05, + "loss": 0.0676, + "step": 25670 + }, + { + "epoch": 1.3459119496855345, + "grad_norm": 1.2640308141708374, + "learning_rate": 1.6353511530398325e-05, + "loss": 0.0984, + "step": 25680 + }, + { + "epoch": 1.3464360587002098, + "grad_norm": 1.1028263568878174, + "learning_rate": 1.634040880503145e-05, + "loss": 0.0775, + "step": 25690 + }, + { + "epoch": 1.3469601677148848, + "grad_norm": 1.7719197273254395, + "learning_rate": 1.632730607966457e-05, + "loss": 0.0797, + "step": 25700 + }, + { + "epoch": 1.3474842767295598, + "grad_norm": 1.8258262872695923, + "learning_rate": 1.6314203354297695e-05, + "loss": 0.0782, + "step": 25710 + }, + { + "epoch": 1.3480083857442349, + "grad_norm": 0.8588356971740723, + "learning_rate": 1.630110062893082e-05, + "loss": 0.0887, + "step": 25720 + }, + { + "epoch": 1.34853249475891, + "grad_norm": 1.2232142686843872, + "learning_rate": 1.6287997903563942e-05, + "loss": 0.0933, + "step": 25730 + }, + { + "epoch": 1.349056603773585, + "grad_norm": 2.0187501907348633, + "learning_rate": 1.6274895178197065e-05, + "loss": 0.0824, + "step": 25740 + }, + { + "epoch": 1.34958071278826, + "grad_norm": 1.4330048561096191, + "learning_rate": 1.6261792452830192e-05, + "loss": 0.0874, + "step": 25750 + }, + { + "epoch": 1.350104821802935, + "grad_norm": 2.3740146160125732, + "learning_rate": 1.6248689727463312e-05, + "loss": 0.0845, + "step": 25760 + }, + { + "epoch": 1.35062893081761, + "grad_norm": 1.3695255517959595, + "learning_rate": 1.6235587002096436e-05, + "loss": 0.0757, + "step": 25770 + }, + { + "epoch": 1.351153039832285, + "grad_norm": 0.8165902495384216, + "learning_rate": 1.622248427672956e-05, + "loss": 0.0834, + "step": 25780 + }, + { + "epoch": 1.35167714884696, + "grad_norm": 1.2380322217941284, + "learning_rate": 1.6209381551362686e-05, + "loss": 0.0839, + "step": 25790 + }, + { + "epoch": 1.3522012578616351, + "grad_norm": 1.3144499063491821, + "learning_rate": 1.619627882599581e-05, + "loss": 0.0785, + "step": 25800 + }, + { + "epoch": 1.3527253668763102, + "grad_norm": 2.314235210418701, + "learning_rate": 1.6183176100628933e-05, + "loss": 0.0764, + "step": 25810 + }, + { + "epoch": 1.3532494758909852, + "grad_norm": 1.4942843914031982, + "learning_rate": 1.6170073375262053e-05, + "loss": 0.0837, + "step": 25820 + }, + { + "epoch": 1.3537735849056602, + "grad_norm": 1.46351158618927, + "learning_rate": 1.615697064989518e-05, + "loss": 0.0997, + "step": 25830 + }, + { + "epoch": 1.3542976939203355, + "grad_norm": 1.4346762895584106, + "learning_rate": 1.6143867924528303e-05, + "loss": 0.0796, + "step": 25840 + }, + { + "epoch": 1.3548218029350105, + "grad_norm": 1.5181978940963745, + "learning_rate": 1.6130765199161426e-05, + "loss": 0.0859, + "step": 25850 + }, + { + "epoch": 1.3553459119496856, + "grad_norm": 0.9744179248809814, + "learning_rate": 1.611766247379455e-05, + "loss": 0.0965, + "step": 25860 + }, + { + "epoch": 1.3558700209643606, + "grad_norm": 1.5921379327774048, + "learning_rate": 1.6104559748427676e-05, + "loss": 0.08, + "step": 25870 + }, + { + "epoch": 1.3563941299790356, + "grad_norm": 1.3599023818969727, + "learning_rate": 1.6091457023060796e-05, + "loss": 0.0823, + "step": 25880 + }, + { + "epoch": 1.3569182389937107, + "grad_norm": 4.234800338745117, + "learning_rate": 1.607835429769392e-05, + "loss": 0.1235, + "step": 25890 + }, + { + "epoch": 1.3574423480083857, + "grad_norm": 1.4063974618911743, + "learning_rate": 1.6065251572327043e-05, + "loss": 0.1067, + "step": 25900 + }, + { + "epoch": 1.3579664570230607, + "grad_norm": 2.220942974090576, + "learning_rate": 1.605214884696017e-05, + "loss": 0.1032, + "step": 25910 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 1.3111824989318848, + "learning_rate": 1.6039046121593293e-05, + "loss": 0.07, + "step": 25920 + }, + { + "epoch": 1.359014675052411, + "grad_norm": 1.5388332605361938, + "learning_rate": 1.6025943396226417e-05, + "loss": 0.0916, + "step": 25930 + }, + { + "epoch": 1.359538784067086, + "grad_norm": 1.8049447536468506, + "learning_rate": 1.601284067085954e-05, + "loss": 0.075, + "step": 25940 + }, + { + "epoch": 1.360062893081761, + "grad_norm": 0.8838228583335876, + "learning_rate": 1.5999737945492663e-05, + "loss": 0.0648, + "step": 25950 + }, + { + "epoch": 1.3605870020964361, + "grad_norm": 1.5289140939712524, + "learning_rate": 1.5986635220125787e-05, + "loss": 0.0704, + "step": 25960 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 1.8726352453231812, + "learning_rate": 1.597353249475891e-05, + "loss": 0.0682, + "step": 25970 + }, + { + "epoch": 1.3616352201257862, + "grad_norm": 2.143695116043091, + "learning_rate": 1.5960429769392034e-05, + "loss": 0.1031, + "step": 25980 + }, + { + "epoch": 1.3621593291404612, + "grad_norm": 1.7918883562088013, + "learning_rate": 1.594732704402516e-05, + "loss": 0.0792, + "step": 25990 + }, + { + "epoch": 1.3626834381551363, + "grad_norm": 1.1076338291168213, + "learning_rate": 1.593422431865828e-05, + "loss": 0.0886, + "step": 26000 + }, + { + "epoch": 1.3626834381551363, + "eval_loss": 0.2706705927848816, + "eval_runtime": 268.5851, + "eval_samples_per_second": 7.413, + "eval_steps_per_second": 1.236, + "step": 26000 + }, + { + "epoch": 1.3632075471698113, + "grad_norm": 1.611303448677063, + "learning_rate": 1.5921121593291404e-05, + "loss": 0.0804, + "step": 26010 + }, + { + "epoch": 1.3637316561844863, + "grad_norm": 1.3968373537063599, + "learning_rate": 1.590801886792453e-05, + "loss": 0.0903, + "step": 26020 + }, + { + "epoch": 1.3642557651991614, + "grad_norm": 6.171770095825195, + "learning_rate": 1.5894916142557654e-05, + "loss": 0.0706, + "step": 26030 + }, + { + "epoch": 1.3647798742138364, + "grad_norm": 1.0653380155563354, + "learning_rate": 1.5881813417190777e-05, + "loss": 0.0819, + "step": 26040 + }, + { + "epoch": 1.3653039832285114, + "grad_norm": 2.0599710941314697, + "learning_rate": 1.58687106918239e-05, + "loss": 0.0788, + "step": 26050 + }, + { + "epoch": 1.3658280922431865, + "grad_norm": 3.233006477355957, + "learning_rate": 1.5855607966457024e-05, + "loss": 0.0782, + "step": 26060 + }, + { + "epoch": 1.3663522012578615, + "grad_norm": 1.5495176315307617, + "learning_rate": 1.5842505241090147e-05, + "loss": 0.1035, + "step": 26070 + }, + { + "epoch": 1.3668763102725368, + "grad_norm": 0.6416164636611938, + "learning_rate": 1.582940251572327e-05, + "loss": 0.0433, + "step": 26080 + }, + { + "epoch": 1.3674004192872118, + "grad_norm": 1.9599913358688354, + "learning_rate": 1.5816299790356394e-05, + "loss": 0.074, + "step": 26090 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 1.8246488571166992, + "learning_rate": 1.580319706498952e-05, + "loss": 0.1008, + "step": 26100 + }, + { + "epoch": 1.3684486373165619, + "grad_norm": 1.8829816579818726, + "learning_rate": 1.5790094339622644e-05, + "loss": 0.0777, + "step": 26110 + }, + { + "epoch": 1.368972746331237, + "grad_norm": 11.83426570892334, + "learning_rate": 1.5776991614255764e-05, + "loss": 0.0897, + "step": 26120 + }, + { + "epoch": 1.369496855345912, + "grad_norm": 1.663411021232605, + "learning_rate": 1.5763888888888888e-05, + "loss": 0.0669, + "step": 26130 + }, + { + "epoch": 1.370020964360587, + "grad_norm": 1.9873604774475098, + "learning_rate": 1.5750786163522014e-05, + "loss": 0.0922, + "step": 26140 + }, + { + "epoch": 1.370545073375262, + "grad_norm": 1.4869024753570557, + "learning_rate": 1.5737683438155138e-05, + "loss": 0.0694, + "step": 26150 + }, + { + "epoch": 1.371069182389937, + "grad_norm": 1.3856655359268188, + "learning_rate": 1.572458071278826e-05, + "loss": 0.0631, + "step": 26160 + }, + { + "epoch": 1.3715932914046123, + "grad_norm": 1.4655511379241943, + "learning_rate": 1.5711477987421385e-05, + "loss": 0.0503, + "step": 26170 + }, + { + "epoch": 1.3721174004192873, + "grad_norm": 1.414749026298523, + "learning_rate": 1.5698375262054508e-05, + "loss": 0.0912, + "step": 26180 + }, + { + "epoch": 1.3726415094339623, + "grad_norm": 1.7459259033203125, + "learning_rate": 1.568527253668763e-05, + "loss": 0.0796, + "step": 26190 + }, + { + "epoch": 1.3731656184486374, + "grad_norm": 1.1313443183898926, + "learning_rate": 1.5672169811320755e-05, + "loss": 0.0664, + "step": 26200 + }, + { + "epoch": 1.3736897274633124, + "grad_norm": 1.0263671875, + "learning_rate": 1.5659067085953878e-05, + "loss": 0.0659, + "step": 26210 + }, + { + "epoch": 1.3742138364779874, + "grad_norm": 1.0881389379501343, + "learning_rate": 1.5645964360587005e-05, + "loss": 0.0584, + "step": 26220 + }, + { + "epoch": 1.3747379454926625, + "grad_norm": 1.92306387424469, + "learning_rate": 1.563286163522013e-05, + "loss": 0.0642, + "step": 26230 + }, + { + "epoch": 1.3752620545073375, + "grad_norm": 1.1980657577514648, + "learning_rate": 1.561975890985325e-05, + "loss": 0.1216, + "step": 26240 + }, + { + "epoch": 1.3757861635220126, + "grad_norm": 4.562958717346191, + "learning_rate": 1.5606656184486375e-05, + "loss": 0.0944, + "step": 26250 + }, + { + "epoch": 1.3763102725366876, + "grad_norm": 1.993789553642273, + "learning_rate": 1.55935534591195e-05, + "loss": 0.0762, + "step": 26260 + }, + { + "epoch": 1.3768343815513626, + "grad_norm": 1.6461892127990723, + "learning_rate": 1.5580450733752622e-05, + "loss": 0.084, + "step": 26270 + }, + { + "epoch": 1.3773584905660377, + "grad_norm": 1.9939159154891968, + "learning_rate": 1.5567348008385745e-05, + "loss": 0.0708, + "step": 26280 + }, + { + "epoch": 1.3778825995807127, + "grad_norm": 1.601395606994629, + "learning_rate": 1.555424528301887e-05, + "loss": 0.0748, + "step": 26290 + }, + { + "epoch": 1.3784067085953877, + "grad_norm": 1.2121330499649048, + "learning_rate": 1.5541142557651992e-05, + "loss": 0.091, + "step": 26300 + }, + { + "epoch": 1.378930817610063, + "grad_norm": 1.6086260080337524, + "learning_rate": 1.5528039832285115e-05, + "loss": 0.0841, + "step": 26310 + }, + { + "epoch": 1.379454926624738, + "grad_norm": 1.4790680408477783, + "learning_rate": 1.551493710691824e-05, + "loss": 0.0791, + "step": 26320 + }, + { + "epoch": 1.379979035639413, + "grad_norm": 1.49806547164917, + "learning_rate": 1.5501834381551366e-05, + "loss": 0.0889, + "step": 26330 + }, + { + "epoch": 1.380503144654088, + "grad_norm": 1.4938971996307373, + "learning_rate": 1.548873165618449e-05, + "loss": 0.0731, + "step": 26340 + }, + { + "epoch": 1.381027253668763, + "grad_norm": 1.3213557004928589, + "learning_rate": 1.547562893081761e-05, + "loss": 0.0913, + "step": 26350 + }, + { + "epoch": 1.3815513626834381, + "grad_norm": 2.0190086364746094, + "learning_rate": 1.5462526205450732e-05, + "loss": 0.0914, + "step": 26360 + }, + { + "epoch": 1.3820754716981132, + "grad_norm": 2.5714008808135986, + "learning_rate": 1.544942348008386e-05, + "loss": 0.1062, + "step": 26370 + }, + { + "epoch": 1.3825995807127882, + "grad_norm": 0.9055834412574768, + "learning_rate": 1.5436320754716982e-05, + "loss": 0.0648, + "step": 26380 + }, + { + "epoch": 1.3831236897274632, + "grad_norm": 1.8946198225021362, + "learning_rate": 1.5423218029350106e-05, + "loss": 0.0936, + "step": 26390 + }, + { + "epoch": 1.3836477987421385, + "grad_norm": 1.461883544921875, + "learning_rate": 1.541011530398323e-05, + "loss": 0.0617, + "step": 26400 + }, + { + "epoch": 1.3841719077568135, + "grad_norm": 1.4220163822174072, + "learning_rate": 1.5397012578616353e-05, + "loss": 0.0623, + "step": 26410 + }, + { + "epoch": 1.3846960167714886, + "grad_norm": 1.1401937007904053, + "learning_rate": 1.5383909853249476e-05, + "loss": 0.0677, + "step": 26420 + }, + { + "epoch": 1.3852201257861636, + "grad_norm": 2.134997844696045, + "learning_rate": 1.53708071278826e-05, + "loss": 0.0569, + "step": 26430 + }, + { + "epoch": 1.3857442348008386, + "grad_norm": 1.6183842420578003, + "learning_rate": 1.5357704402515723e-05, + "loss": 0.0739, + "step": 26440 + }, + { + "epoch": 1.3862683438155137, + "grad_norm": 1.135965347290039, + "learning_rate": 1.534460167714885e-05, + "loss": 0.0897, + "step": 26450 + }, + { + "epoch": 1.3867924528301887, + "grad_norm": 1.706466794013977, + "learning_rate": 1.5331498951781973e-05, + "loss": 0.0835, + "step": 26460 + }, + { + "epoch": 1.3873165618448637, + "grad_norm": 2.735337734222412, + "learning_rate": 1.5318396226415093e-05, + "loss": 0.0755, + "step": 26470 + }, + { + "epoch": 1.3878406708595388, + "grad_norm": 1.0616151094436646, + "learning_rate": 1.5305293501048216e-05, + "loss": 0.077, + "step": 26480 + }, + { + "epoch": 1.3883647798742138, + "grad_norm": 1.5158982276916504, + "learning_rate": 1.5292190775681343e-05, + "loss": 0.0793, + "step": 26490 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 1.1948658227920532, + "learning_rate": 1.5279088050314467e-05, + "loss": 0.083, + "step": 26500 + }, + { + "epoch": 1.3894129979035639, + "grad_norm": 1.381131649017334, + "learning_rate": 1.526598532494759e-05, + "loss": 0.0961, + "step": 26510 + }, + { + "epoch": 1.389937106918239, + "grad_norm": 0.8586825132369995, + "learning_rate": 1.5252882599580715e-05, + "loss": 0.079, + "step": 26520 + }, + { + "epoch": 1.390461215932914, + "grad_norm": 0.9948753118515015, + "learning_rate": 1.5239779874213837e-05, + "loss": 0.0779, + "step": 26530 + }, + { + "epoch": 1.390985324947589, + "grad_norm": 1.9307538270950317, + "learning_rate": 1.522667714884696e-05, + "loss": 0.0889, + "step": 26540 + }, + { + "epoch": 1.3915094339622642, + "grad_norm": 0.7694650888442993, + "learning_rate": 1.5213574423480083e-05, + "loss": 0.0905, + "step": 26550 + }, + { + "epoch": 1.3920335429769393, + "grad_norm": 1.3657408952713013, + "learning_rate": 1.5200471698113209e-05, + "loss": 0.0698, + "step": 26560 + }, + { + "epoch": 1.3925576519916143, + "grad_norm": 1.0885812044143677, + "learning_rate": 1.5187368972746332e-05, + "loss": 0.056, + "step": 26570 + }, + { + "epoch": 1.3930817610062893, + "grad_norm": 2.0228495597839355, + "learning_rate": 1.5174266247379457e-05, + "loss": 0.0907, + "step": 26580 + }, + { + "epoch": 1.3936058700209644, + "grad_norm": 1.1174840927124023, + "learning_rate": 1.5161163522012579e-05, + "loss": 0.072, + "step": 26590 + }, + { + "epoch": 1.3941299790356394, + "grad_norm": 1.6065596342086792, + "learning_rate": 1.5148060796645702e-05, + "loss": 0.0919, + "step": 26600 + }, + { + "epoch": 1.3946540880503144, + "grad_norm": 2.008131742477417, + "learning_rate": 1.5134958071278827e-05, + "loss": 0.0917, + "step": 26610 + }, + { + "epoch": 1.3951781970649895, + "grad_norm": 2.5115885734558105, + "learning_rate": 1.512185534591195e-05, + "loss": 0.0812, + "step": 26620 + }, + { + "epoch": 1.3957023060796645, + "grad_norm": 1.995683193206787, + "learning_rate": 1.5108752620545074e-05, + "loss": 0.0776, + "step": 26630 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 1.5423414707183838, + "learning_rate": 1.5095649895178199e-05, + "loss": 0.082, + "step": 26640 + }, + { + "epoch": 1.3967505241090148, + "grad_norm": 1.428948998451233, + "learning_rate": 1.508254716981132e-05, + "loss": 0.0762, + "step": 26650 + }, + { + "epoch": 1.3972746331236898, + "grad_norm": 0.9008168578147888, + "learning_rate": 1.5069444444444444e-05, + "loss": 0.0711, + "step": 26660 + }, + { + "epoch": 1.3977987421383649, + "grad_norm": 1.6736418008804321, + "learning_rate": 1.5056341719077569e-05, + "loss": 0.1039, + "step": 26670 + }, + { + "epoch": 1.39832285115304, + "grad_norm": 1.5996124744415283, + "learning_rate": 1.5043238993710693e-05, + "loss": 0.0765, + "step": 26680 + }, + { + "epoch": 1.398846960167715, + "grad_norm": 1.6097357273101807, + "learning_rate": 1.5030136268343818e-05, + "loss": 0.0807, + "step": 26690 + }, + { + "epoch": 1.39937106918239, + "grad_norm": 1.1372876167297363, + "learning_rate": 1.5017033542976941e-05, + "loss": 0.0734, + "step": 26700 + }, + { + "epoch": 1.399895178197065, + "grad_norm": 2.229891538619995, + "learning_rate": 1.5003930817610063e-05, + "loss": 0.073, + "step": 26710 + }, + { + "epoch": 1.40041928721174, + "grad_norm": 1.2906321287155151, + "learning_rate": 1.4990828092243186e-05, + "loss": 0.0706, + "step": 26720 + }, + { + "epoch": 1.400943396226415, + "grad_norm": 1.1182019710540771, + "learning_rate": 1.4977725366876311e-05, + "loss": 0.0711, + "step": 26730 + }, + { + "epoch": 1.40146750524109, + "grad_norm": 0.8318687677383423, + "learning_rate": 1.4964622641509435e-05, + "loss": 0.0461, + "step": 26740 + }, + { + "epoch": 1.4019916142557651, + "grad_norm": 1.398024320602417, + "learning_rate": 1.495151991614256e-05, + "loss": 0.0826, + "step": 26750 + }, + { + "epoch": 1.4025157232704402, + "grad_norm": 1.820096731185913, + "learning_rate": 1.4938417190775683e-05, + "loss": 0.099, + "step": 26760 + }, + { + "epoch": 1.4030398322851152, + "grad_norm": 1.1774033308029175, + "learning_rate": 1.4925314465408805e-05, + "loss": 0.0826, + "step": 26770 + }, + { + "epoch": 1.4035639412997902, + "grad_norm": 2.464606761932373, + "learning_rate": 1.4912211740041928e-05, + "loss": 0.0767, + "step": 26780 + }, + { + "epoch": 1.4040880503144655, + "grad_norm": 1.0393379926681519, + "learning_rate": 1.4899109014675053e-05, + "loss": 0.0867, + "step": 26790 + }, + { + "epoch": 1.4046121593291405, + "grad_norm": 2.6681644916534424, + "learning_rate": 1.4886006289308177e-05, + "loss": 0.0782, + "step": 26800 + }, + { + "epoch": 1.4051362683438156, + "grad_norm": 2.1190297603607178, + "learning_rate": 1.4872903563941302e-05, + "loss": 0.0724, + "step": 26810 + }, + { + "epoch": 1.4056603773584906, + "grad_norm": 1.8009228706359863, + "learning_rate": 1.4859800838574425e-05, + "loss": 0.1171, + "step": 26820 + }, + { + "epoch": 1.4061844863731656, + "grad_norm": 1.9393926858901978, + "learning_rate": 1.4846698113207547e-05, + "loss": 0.0797, + "step": 26830 + }, + { + "epoch": 1.4067085953878407, + "grad_norm": 1.5478100776672363, + "learning_rate": 1.483359538784067e-05, + "loss": 0.0877, + "step": 26840 + }, + { + "epoch": 1.4072327044025157, + "grad_norm": 1.9180935621261597, + "learning_rate": 1.4820492662473795e-05, + "loss": 0.0857, + "step": 26850 + }, + { + "epoch": 1.4077568134171907, + "grad_norm": 1.5819123983383179, + "learning_rate": 1.4807389937106919e-05, + "loss": 0.0828, + "step": 26860 + }, + { + "epoch": 1.4082809224318658, + "grad_norm": 1.8015711307525635, + "learning_rate": 1.4794287211740044e-05, + "loss": 0.0774, + "step": 26870 + }, + { + "epoch": 1.408805031446541, + "grad_norm": 1.995409369468689, + "learning_rate": 1.4781184486373167e-05, + "loss": 0.0711, + "step": 26880 + }, + { + "epoch": 1.409329140461216, + "grad_norm": 1.8823596239089966, + "learning_rate": 1.4768081761006289e-05, + "loss": 0.1001, + "step": 26890 + }, + { + "epoch": 1.409853249475891, + "grad_norm": 1.262195110321045, + "learning_rate": 1.4754979035639414e-05, + "loss": 0.0789, + "step": 26900 + }, + { + "epoch": 1.4103773584905661, + "grad_norm": 1.5328476428985596, + "learning_rate": 1.4741876310272537e-05, + "loss": 0.0712, + "step": 26910 + }, + { + "epoch": 1.4109014675052411, + "grad_norm": 1.3686559200286865, + "learning_rate": 1.472877358490566e-05, + "loss": 0.0917, + "step": 26920 + }, + { + "epoch": 1.4114255765199162, + "grad_norm": 1.4035519361495972, + "learning_rate": 1.4715670859538786e-05, + "loss": 0.0922, + "step": 26930 + }, + { + "epoch": 1.4119496855345912, + "grad_norm": 1.396806001663208, + "learning_rate": 1.4702568134171909e-05, + "loss": 0.0707, + "step": 26940 + }, + { + "epoch": 1.4124737945492662, + "grad_norm": 1.1253314018249512, + "learning_rate": 1.468946540880503e-05, + "loss": 0.0688, + "step": 26950 + }, + { + "epoch": 1.4129979035639413, + "grad_norm": 1.334794282913208, + "learning_rate": 1.4676362683438156e-05, + "loss": 0.0707, + "step": 26960 + }, + { + "epoch": 1.4135220125786163, + "grad_norm": 2.1766927242279053, + "learning_rate": 1.466325995807128e-05, + "loss": 0.0645, + "step": 26970 + }, + { + "epoch": 1.4140461215932913, + "grad_norm": 1.0007305145263672, + "learning_rate": 1.4650157232704404e-05, + "loss": 0.0826, + "step": 26980 + }, + { + "epoch": 1.4145702306079664, + "grad_norm": 1.4699190855026245, + "learning_rate": 1.4637054507337528e-05, + "loss": 0.0646, + "step": 26990 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 1.193003535270691, + "learning_rate": 1.4623951781970651e-05, + "loss": 0.0765, + "step": 27000 + }, + { + "epoch": 1.4150943396226414, + "eval_loss": 0.2722998857498169, + "eval_runtime": 268.1874, + "eval_samples_per_second": 7.424, + "eval_steps_per_second": 1.238, + "step": 27000 + }, + { + "epoch": 1.4156184486373165, + "grad_norm": 1.677714228630066, + "learning_rate": 1.4610849056603773e-05, + "loss": 0.0874, + "step": 27010 + }, + { + "epoch": 1.4161425576519915, + "grad_norm": 1.2611427307128906, + "learning_rate": 1.4597746331236898e-05, + "loss": 0.0665, + "step": 27020 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 1.3466812372207642, + "learning_rate": 1.4584643605870021e-05, + "loss": 0.0796, + "step": 27030 + }, + { + "epoch": 1.4171907756813418, + "grad_norm": 2.360762119293213, + "learning_rate": 1.4571540880503146e-05, + "loss": 0.077, + "step": 27040 + }, + { + "epoch": 1.4177148846960168, + "grad_norm": 1.946505069732666, + "learning_rate": 1.455843815513627e-05, + "loss": 0.081, + "step": 27050 + }, + { + "epoch": 1.4182389937106918, + "grad_norm": 1.1225969791412354, + "learning_rate": 1.4545335429769395e-05, + "loss": 0.0652, + "step": 27060 + }, + { + "epoch": 1.4187631027253669, + "grad_norm": 2.124420642852783, + "learning_rate": 1.4532232704402515e-05, + "loss": 0.0759, + "step": 27070 + }, + { + "epoch": 1.419287211740042, + "grad_norm": 1.7057894468307495, + "learning_rate": 1.451912997903564e-05, + "loss": 0.0684, + "step": 27080 + }, + { + "epoch": 1.419811320754717, + "grad_norm": 0.9108951687812805, + "learning_rate": 1.4506027253668763e-05, + "loss": 0.0756, + "step": 27090 + }, + { + "epoch": 1.420335429769392, + "grad_norm": 0.9875414371490479, + "learning_rate": 1.4492924528301888e-05, + "loss": 0.0938, + "step": 27100 + }, + { + "epoch": 1.420859538784067, + "grad_norm": 0.9708214402198792, + "learning_rate": 1.4479821802935012e-05, + "loss": 0.0862, + "step": 27110 + }, + { + "epoch": 1.4213836477987423, + "grad_norm": 1.5186963081359863, + "learning_rate": 1.4466719077568137e-05, + "loss": 0.0742, + "step": 27120 + }, + { + "epoch": 1.4219077568134173, + "grad_norm": 1.439297080039978, + "learning_rate": 1.4453616352201257e-05, + "loss": 0.0982, + "step": 27130 + }, + { + "epoch": 1.4224318658280923, + "grad_norm": 1.1225006580352783, + "learning_rate": 1.4440513626834382e-05, + "loss": 0.0685, + "step": 27140 + }, + { + "epoch": 1.4229559748427674, + "grad_norm": 1.1998291015625, + "learning_rate": 1.4427410901467505e-05, + "loss": 0.0683, + "step": 27150 + }, + { + "epoch": 1.4234800838574424, + "grad_norm": 1.8755435943603516, + "learning_rate": 1.441430817610063e-05, + "loss": 0.1194, + "step": 27160 + }, + { + "epoch": 1.4240041928721174, + "grad_norm": 1.7551946640014648, + "learning_rate": 1.4401205450733754e-05, + "loss": 0.078, + "step": 27170 + }, + { + "epoch": 1.4245283018867925, + "grad_norm": 1.9918137788772583, + "learning_rate": 1.4388102725366879e-05, + "loss": 0.0836, + "step": 27180 + }, + { + "epoch": 1.4250524109014675, + "grad_norm": 1.413087248802185, + "learning_rate": 1.4374999999999999e-05, + "loss": 0.0777, + "step": 27190 + }, + { + "epoch": 1.4255765199161425, + "grad_norm": 1.1328896284103394, + "learning_rate": 1.4361897274633124e-05, + "loss": 0.0809, + "step": 27200 + }, + { + "epoch": 1.4261006289308176, + "grad_norm": 1.0863498449325562, + "learning_rate": 1.4348794549266247e-05, + "loss": 0.0732, + "step": 27210 + }, + { + "epoch": 1.4266247379454926, + "grad_norm": 1.7177821397781372, + "learning_rate": 1.4335691823899372e-05, + "loss": 0.1035, + "step": 27220 + }, + { + "epoch": 1.4271488469601676, + "grad_norm": 1.144360065460205, + "learning_rate": 1.4322589098532496e-05, + "loss": 0.0773, + "step": 27230 + }, + { + "epoch": 1.4276729559748427, + "grad_norm": 1.9664690494537354, + "learning_rate": 1.430948637316562e-05, + "loss": 0.0734, + "step": 27240 + }, + { + "epoch": 1.4281970649895177, + "grad_norm": 1.9434707164764404, + "learning_rate": 1.4296383647798742e-05, + "loss": 0.0664, + "step": 27250 + }, + { + "epoch": 1.4287211740041927, + "grad_norm": 2.2035915851593018, + "learning_rate": 1.4283280922431866e-05, + "loss": 0.0783, + "step": 27260 + }, + { + "epoch": 1.429245283018868, + "grad_norm": 1.1164571046829224, + "learning_rate": 1.427017819706499e-05, + "loss": 0.0771, + "step": 27270 + }, + { + "epoch": 1.429769392033543, + "grad_norm": 0.8932652473449707, + "learning_rate": 1.4257075471698114e-05, + "loss": 0.067, + "step": 27280 + }, + { + "epoch": 1.430293501048218, + "grad_norm": 2.819211006164551, + "learning_rate": 1.4243972746331238e-05, + "loss": 0.0996, + "step": 27290 + }, + { + "epoch": 1.430817610062893, + "grad_norm": 1.9603825807571411, + "learning_rate": 1.4230870020964363e-05, + "loss": 0.0952, + "step": 27300 + }, + { + "epoch": 1.4313417190775681, + "grad_norm": 1.5057666301727295, + "learning_rate": 1.4217767295597484e-05, + "loss": 0.1054, + "step": 27310 + }, + { + "epoch": 1.4318658280922432, + "grad_norm": 1.899045467376709, + "learning_rate": 1.4204664570230608e-05, + "loss": 0.0817, + "step": 27320 + }, + { + "epoch": 1.4323899371069182, + "grad_norm": 1.6118180751800537, + "learning_rate": 1.4191561844863733e-05, + "loss": 0.0736, + "step": 27330 + }, + { + "epoch": 1.4329140461215932, + "grad_norm": 1.7266743183135986, + "learning_rate": 1.4178459119496856e-05, + "loss": 0.0748, + "step": 27340 + }, + { + "epoch": 1.4334381551362683, + "grad_norm": 1.0827453136444092, + "learning_rate": 1.416535639412998e-05, + "loss": 0.0719, + "step": 27350 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 1.9279223680496216, + "learning_rate": 1.4152253668763105e-05, + "loss": 0.0724, + "step": 27360 + }, + { + "epoch": 1.4344863731656186, + "grad_norm": 1.8519426584243774, + "learning_rate": 1.4139150943396226e-05, + "loss": 0.0606, + "step": 27370 + }, + { + "epoch": 1.4350104821802936, + "grad_norm": 1.5819141864776611, + "learning_rate": 1.412604821802935e-05, + "loss": 0.0722, + "step": 27380 + }, + { + "epoch": 1.4355345911949686, + "grad_norm": 0.8837409019470215, + "learning_rate": 1.4112945492662475e-05, + "loss": 0.0627, + "step": 27390 + }, + { + "epoch": 1.4360587002096437, + "grad_norm": 1.7745553255081177, + "learning_rate": 1.4099842767295598e-05, + "loss": 0.0779, + "step": 27400 + }, + { + "epoch": 1.4365828092243187, + "grad_norm": 0.6781327128410339, + "learning_rate": 1.4086740041928723e-05, + "loss": 0.0984, + "step": 27410 + }, + { + "epoch": 1.4371069182389937, + "grad_norm": 1.6918905973434448, + "learning_rate": 1.4073637316561847e-05, + "loss": 0.0595, + "step": 27420 + }, + { + "epoch": 1.4376310272536688, + "grad_norm": 1.5333856344223022, + "learning_rate": 1.4060534591194968e-05, + "loss": 0.0992, + "step": 27430 + }, + { + "epoch": 1.4381551362683438, + "grad_norm": 2.154278039932251, + "learning_rate": 1.4047431865828092e-05, + "loss": 0.0761, + "step": 27440 + }, + { + "epoch": 1.4386792452830188, + "grad_norm": 0.8082470893859863, + "learning_rate": 1.4034329140461217e-05, + "loss": 0.0637, + "step": 27450 + }, + { + "epoch": 1.4392033542976939, + "grad_norm": 2.1959385871887207, + "learning_rate": 1.402122641509434e-05, + "loss": 0.0699, + "step": 27460 + }, + { + "epoch": 1.439727463312369, + "grad_norm": 1.508762001991272, + "learning_rate": 1.4008123689727465e-05, + "loss": 0.0652, + "step": 27470 + }, + { + "epoch": 1.440251572327044, + "grad_norm": 2.946122407913208, + "learning_rate": 1.3995020964360589e-05, + "loss": 0.0814, + "step": 27480 + }, + { + "epoch": 1.440775681341719, + "grad_norm": 0.6886641383171082, + "learning_rate": 1.398191823899371e-05, + "loss": 0.0743, + "step": 27490 + }, + { + "epoch": 1.441299790356394, + "grad_norm": 1.3334239721298218, + "learning_rate": 1.3968815513626834e-05, + "loss": 0.0892, + "step": 27500 + }, + { + "epoch": 1.4418238993710693, + "grad_norm": 1.2078886032104492, + "learning_rate": 1.3955712788259959e-05, + "loss": 0.119, + "step": 27510 + }, + { + "epoch": 1.4423480083857443, + "grad_norm": 1.5522276163101196, + "learning_rate": 1.3942610062893082e-05, + "loss": 0.0822, + "step": 27520 + }, + { + "epoch": 1.4428721174004193, + "grad_norm": 1.9133297204971313, + "learning_rate": 1.3929507337526207e-05, + "loss": 0.0736, + "step": 27530 + }, + { + "epoch": 1.4433962264150944, + "grad_norm": 1.6485482454299927, + "learning_rate": 1.391640461215933e-05, + "loss": 0.0839, + "step": 27540 + }, + { + "epoch": 1.4439203354297694, + "grad_norm": 1.8468326330184937, + "learning_rate": 1.3903301886792452e-05, + "loss": 0.0764, + "step": 27550 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 1.209517478942871, + "learning_rate": 1.3890199161425576e-05, + "loss": 0.084, + "step": 27560 + }, + { + "epoch": 1.4449685534591195, + "grad_norm": 1.0626308917999268, + "learning_rate": 1.3877096436058701e-05, + "loss": 0.0803, + "step": 27570 + }, + { + "epoch": 1.4454926624737945, + "grad_norm": 1.577529788017273, + "learning_rate": 1.3863993710691824e-05, + "loss": 0.0854, + "step": 27580 + }, + { + "epoch": 1.4460167714884695, + "grad_norm": 2.4681644439697266, + "learning_rate": 1.385089098532495e-05, + "loss": 0.0867, + "step": 27590 + }, + { + "epoch": 1.4465408805031448, + "grad_norm": 1.8343055248260498, + "learning_rate": 1.3837788259958073e-05, + "loss": 0.0517, + "step": 27600 + }, + { + "epoch": 1.4470649895178198, + "grad_norm": 1.8408375978469849, + "learning_rate": 1.3824685534591194e-05, + "loss": 0.0743, + "step": 27610 + }, + { + "epoch": 1.4475890985324948, + "grad_norm": 1.6427656412124634, + "learning_rate": 1.381158280922432e-05, + "loss": 0.0783, + "step": 27620 + }, + { + "epoch": 1.4481132075471699, + "grad_norm": 0.9956682920455933, + "learning_rate": 1.3798480083857443e-05, + "loss": 0.0651, + "step": 27630 + }, + { + "epoch": 1.448637316561845, + "grad_norm": 0.8190504908561707, + "learning_rate": 1.3785377358490566e-05, + "loss": 0.0778, + "step": 27640 + }, + { + "epoch": 1.44916142557652, + "grad_norm": 0.7166125178337097, + "learning_rate": 1.3772274633123691e-05, + "loss": 0.0917, + "step": 27650 + }, + { + "epoch": 1.449685534591195, + "grad_norm": 1.4953032732009888, + "learning_rate": 1.3759171907756815e-05, + "loss": 0.0706, + "step": 27660 + }, + { + "epoch": 1.45020964360587, + "grad_norm": 1.7364864349365234, + "learning_rate": 1.3746069182389936e-05, + "loss": 0.09, + "step": 27670 + }, + { + "epoch": 1.450733752620545, + "grad_norm": 0.8806532621383667, + "learning_rate": 1.3732966457023062e-05, + "loss": 0.0785, + "step": 27680 + }, + { + "epoch": 1.45125786163522, + "grad_norm": 1.4088395833969116, + "learning_rate": 1.3719863731656185e-05, + "loss": 0.069, + "step": 27690 + }, + { + "epoch": 1.4517819706498951, + "grad_norm": 1.0472676753997803, + "learning_rate": 1.370676100628931e-05, + "loss": 0.0824, + "step": 27700 + }, + { + "epoch": 1.4523060796645701, + "grad_norm": 1.0019946098327637, + "learning_rate": 1.3693658280922433e-05, + "loss": 0.0882, + "step": 27710 + }, + { + "epoch": 1.4528301886792452, + "grad_norm": 2.3907127380371094, + "learning_rate": 1.3680555555555557e-05, + "loss": 0.1072, + "step": 27720 + }, + { + "epoch": 1.4533542976939202, + "grad_norm": 2.1289756298065186, + "learning_rate": 1.3667452830188678e-05, + "loss": 0.0681, + "step": 27730 + }, + { + "epoch": 1.4538784067085953, + "grad_norm": 1.460325002670288, + "learning_rate": 1.3654350104821804e-05, + "loss": 0.0673, + "step": 27740 + }, + { + "epoch": 1.4544025157232705, + "grad_norm": 1.5463610887527466, + "learning_rate": 1.3641247379454927e-05, + "loss": 0.0916, + "step": 27750 + }, + { + "epoch": 1.4549266247379455, + "grad_norm": 0.9027696847915649, + "learning_rate": 1.3628144654088052e-05, + "loss": 0.0592, + "step": 27760 + }, + { + "epoch": 1.4554507337526206, + "grad_norm": 1.820248007774353, + "learning_rate": 1.3615041928721175e-05, + "loss": 0.0787, + "step": 27770 + }, + { + "epoch": 1.4559748427672956, + "grad_norm": 1.1869231462478638, + "learning_rate": 1.36019392033543e-05, + "loss": 0.0861, + "step": 27780 + }, + { + "epoch": 1.4564989517819706, + "grad_norm": 1.538305401802063, + "learning_rate": 1.358883647798742e-05, + "loss": 0.0573, + "step": 27790 + }, + { + "epoch": 1.4570230607966457, + "grad_norm": 1.3371244668960571, + "learning_rate": 1.3575733752620546e-05, + "loss": 0.0701, + "step": 27800 + }, + { + "epoch": 1.4575471698113207, + "grad_norm": 0.7081040143966675, + "learning_rate": 1.3562631027253669e-05, + "loss": 0.0964, + "step": 27810 + }, + { + "epoch": 1.4580712788259957, + "grad_norm": 0.6923393607139587, + "learning_rate": 1.3549528301886794e-05, + "loss": 0.0888, + "step": 27820 + }, + { + "epoch": 1.458595387840671, + "grad_norm": 1.5953444242477417, + "learning_rate": 1.3536425576519917e-05, + "loss": 0.0716, + "step": 27830 + }, + { + "epoch": 1.459119496855346, + "grad_norm": 1.727294921875, + "learning_rate": 1.3523322851153039e-05, + "loss": 0.066, + "step": 27840 + }, + { + "epoch": 1.459643605870021, + "grad_norm": 3.990200996398926, + "learning_rate": 1.3510220125786163e-05, + "loss": 0.1102, + "step": 27850 + }, + { + "epoch": 1.460167714884696, + "grad_norm": 2.4308507442474365, + "learning_rate": 1.3497117400419288e-05, + "loss": 0.0796, + "step": 27860 + }, + { + "epoch": 1.4606918238993711, + "grad_norm": 1.9697445631027222, + "learning_rate": 1.3484014675052411e-05, + "loss": 0.0841, + "step": 27870 + }, + { + "epoch": 1.4612159329140462, + "grad_norm": 0.9104553461074829, + "learning_rate": 1.3470911949685536e-05, + "loss": 0.089, + "step": 27880 + }, + { + "epoch": 1.4617400419287212, + "grad_norm": 0.9098082780838013, + "learning_rate": 1.345780922431866e-05, + "loss": 0.0986, + "step": 27890 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 1.175636887550354, + "learning_rate": 1.3444706498951781e-05, + "loss": 0.0793, + "step": 27900 + }, + { + "epoch": 1.4627882599580713, + "grad_norm": 0.9262112379074097, + "learning_rate": 1.3431603773584906e-05, + "loss": 0.0646, + "step": 27910 + }, + { + "epoch": 1.4633123689727463, + "grad_norm": 1.373189091682434, + "learning_rate": 1.341850104821803e-05, + "loss": 0.0697, + "step": 27920 + }, + { + "epoch": 1.4638364779874213, + "grad_norm": 1.5659918785095215, + "learning_rate": 1.3405398322851153e-05, + "loss": 0.0755, + "step": 27930 + }, + { + "epoch": 1.4643605870020964, + "grad_norm": 3.091872453689575, + "learning_rate": 1.3392295597484278e-05, + "loss": 0.0906, + "step": 27940 + }, + { + "epoch": 1.4648846960167714, + "grad_norm": 1.3412938117980957, + "learning_rate": 1.3379192872117401e-05, + "loss": 0.0876, + "step": 27950 + }, + { + "epoch": 1.4654088050314464, + "grad_norm": 1.137826681137085, + "learning_rate": 1.3366090146750523e-05, + "loss": 0.0773, + "step": 27960 + }, + { + "epoch": 1.4659329140461215, + "grad_norm": 3.942056894302368, + "learning_rate": 1.3352987421383648e-05, + "loss": 0.0677, + "step": 27970 + }, + { + "epoch": 1.4664570230607967, + "grad_norm": 1.7244807481765747, + "learning_rate": 1.3339884696016772e-05, + "loss": 0.0704, + "step": 27980 + }, + { + "epoch": 1.4669811320754718, + "grad_norm": 1.608075737953186, + "learning_rate": 1.3326781970649897e-05, + "loss": 0.0703, + "step": 27990 + }, + { + "epoch": 1.4675052410901468, + "grad_norm": 2.1725499629974365, + "learning_rate": 1.331367924528302e-05, + "loss": 0.0709, + "step": 28000 + }, + { + "epoch": 1.4675052410901468, + "eval_loss": 0.271745502948761, + "eval_runtime": 267.6865, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 1.24, + "step": 28000 + }, + { + "epoch": 1.4680293501048218, + "grad_norm": 1.803655743598938, + "learning_rate": 1.3300576519916143e-05, + "loss": 0.0864, + "step": 28010 + }, + { + "epoch": 1.4685534591194969, + "grad_norm": 1.6521601676940918, + "learning_rate": 1.3287473794549265e-05, + "loss": 0.0755, + "step": 28020 + }, + { + "epoch": 1.469077568134172, + "grad_norm": 2.3128561973571777, + "learning_rate": 1.327437106918239e-05, + "loss": 0.081, + "step": 28030 + }, + { + "epoch": 1.469601677148847, + "grad_norm": 1.0363951921463013, + "learning_rate": 1.3261268343815514e-05, + "loss": 0.0575, + "step": 28040 + }, + { + "epoch": 1.470125786163522, + "grad_norm": 0.7471582889556885, + "learning_rate": 1.3248165618448639e-05, + "loss": 0.0583, + "step": 28050 + }, + { + "epoch": 1.470649895178197, + "grad_norm": 1.6395134925842285, + "learning_rate": 1.3235062893081762e-05, + "loss": 0.0674, + "step": 28060 + }, + { + "epoch": 1.4711740041928723, + "grad_norm": 1.7214020490646362, + "learning_rate": 1.3221960167714887e-05, + "loss": 0.09, + "step": 28070 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 1.6279518604278564, + "learning_rate": 1.3208857442348007e-05, + "loss": 0.0857, + "step": 28080 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 1.4659122228622437, + "learning_rate": 1.3195754716981132e-05, + "loss": 0.0843, + "step": 28090 + }, + { + "epoch": 1.4727463312368974, + "grad_norm": 0.8685888648033142, + "learning_rate": 1.3182651991614256e-05, + "loss": 0.0554, + "step": 28100 + }, + { + "epoch": 1.4732704402515724, + "grad_norm": 1.3319734334945679, + "learning_rate": 1.316954926624738e-05, + "loss": 0.079, + "step": 28110 + }, + { + "epoch": 1.4737945492662474, + "grad_norm": 1.8119728565216064, + "learning_rate": 1.3156446540880504e-05, + "loss": 0.0726, + "step": 28120 + }, + { + "epoch": 1.4743186582809225, + "grad_norm": 1.3160663843154907, + "learning_rate": 1.314334381551363e-05, + "loss": 0.0875, + "step": 28130 + }, + { + "epoch": 1.4748427672955975, + "grad_norm": 2.143087387084961, + "learning_rate": 1.313024109014675e-05, + "loss": 0.0828, + "step": 28140 + }, + { + "epoch": 1.4753668763102725, + "grad_norm": 1.9717251062393188, + "learning_rate": 1.3117138364779874e-05, + "loss": 0.0709, + "step": 28150 + }, + { + "epoch": 1.4758909853249476, + "grad_norm": 1.5455591678619385, + "learning_rate": 1.3104035639412998e-05, + "loss": 0.1053, + "step": 28160 + }, + { + "epoch": 1.4764150943396226, + "grad_norm": 1.7881935834884644, + "learning_rate": 1.3090932914046123e-05, + "loss": 0.082, + "step": 28170 + }, + { + "epoch": 1.4769392033542976, + "grad_norm": 1.2156028747558594, + "learning_rate": 1.3077830188679246e-05, + "loss": 0.0662, + "step": 28180 + }, + { + "epoch": 1.4774633123689727, + "grad_norm": 1.5937763452529907, + "learning_rate": 1.3064727463312371e-05, + "loss": 0.0686, + "step": 28190 + }, + { + "epoch": 1.4779874213836477, + "grad_norm": 2.669137477874756, + "learning_rate": 1.3051624737945491e-05, + "loss": 0.0726, + "step": 28200 + }, + { + "epoch": 1.4785115303983227, + "grad_norm": 1.6879535913467407, + "learning_rate": 1.3038522012578616e-05, + "loss": 0.0885, + "step": 28210 + }, + { + "epoch": 1.479035639412998, + "grad_norm": 1.1833429336547852, + "learning_rate": 1.302541928721174e-05, + "loss": 0.0692, + "step": 28220 + }, + { + "epoch": 1.479559748427673, + "grad_norm": 2.149759531021118, + "learning_rate": 1.3012316561844865e-05, + "loss": 0.1053, + "step": 28230 + }, + { + "epoch": 1.480083857442348, + "grad_norm": 2.0585362911224365, + "learning_rate": 1.2999213836477988e-05, + "loss": 0.0833, + "step": 28240 + }, + { + "epoch": 1.480607966457023, + "grad_norm": 1.4494343996047974, + "learning_rate": 1.2986111111111113e-05, + "loss": 0.0934, + "step": 28250 + }, + { + "epoch": 1.4811320754716981, + "grad_norm": 1.7021030187606812, + "learning_rate": 1.2973008385744235e-05, + "loss": 0.0583, + "step": 28260 + }, + { + "epoch": 1.4816561844863732, + "grad_norm": 2.6101348400115967, + "learning_rate": 1.2959905660377358e-05, + "loss": 0.0768, + "step": 28270 + }, + { + "epoch": 1.4821802935010482, + "grad_norm": 0.9507656693458557, + "learning_rate": 1.2946802935010482e-05, + "loss": 0.0526, + "step": 28280 + }, + { + "epoch": 1.4827044025157232, + "grad_norm": 1.1151862144470215, + "learning_rate": 1.2933700209643607e-05, + "loss": 0.0898, + "step": 28290 + }, + { + "epoch": 1.4832285115303983, + "grad_norm": 1.3400394916534424, + "learning_rate": 1.292059748427673e-05, + "loss": 0.0771, + "step": 28300 + }, + { + "epoch": 1.4837526205450735, + "grad_norm": 1.4885073900222778, + "learning_rate": 1.2907494758909855e-05, + "loss": 0.0877, + "step": 28310 + }, + { + "epoch": 1.4842767295597485, + "grad_norm": 1.9722825288772583, + "learning_rate": 1.2894392033542977e-05, + "loss": 0.0913, + "step": 28320 + }, + { + "epoch": 1.4848008385744236, + "grad_norm": 1.2791171073913574, + "learning_rate": 1.28812893081761e-05, + "loss": 0.0747, + "step": 28330 + }, + { + "epoch": 1.4853249475890986, + "grad_norm": 1.2421149015426636, + "learning_rate": 1.2868186582809225e-05, + "loss": 0.0727, + "step": 28340 + }, + { + "epoch": 1.4858490566037736, + "grad_norm": 1.6803362369537354, + "learning_rate": 1.2855083857442349e-05, + "loss": 0.09, + "step": 28350 + }, + { + "epoch": 1.4863731656184487, + "grad_norm": 1.8740354776382446, + "learning_rate": 1.2841981132075472e-05, + "loss": 0.0651, + "step": 28360 + }, + { + "epoch": 1.4868972746331237, + "grad_norm": 1.3395440578460693, + "learning_rate": 1.2828878406708597e-05, + "loss": 0.0775, + "step": 28370 + }, + { + "epoch": 1.4874213836477987, + "grad_norm": 1.6447713375091553, + "learning_rate": 1.2815775681341719e-05, + "loss": 0.0763, + "step": 28380 + }, + { + "epoch": 1.4879454926624738, + "grad_norm": 2.1082639694213867, + "learning_rate": 1.2802672955974842e-05, + "loss": 0.0919, + "step": 28390 + }, + { + "epoch": 1.4884696016771488, + "grad_norm": 1.401515245437622, + "learning_rate": 1.2789570230607967e-05, + "loss": 0.0709, + "step": 28400 + }, + { + "epoch": 1.4889937106918238, + "grad_norm": 1.3581801652908325, + "learning_rate": 1.277646750524109e-05, + "loss": 0.0765, + "step": 28410 + }, + { + "epoch": 1.4895178197064989, + "grad_norm": 1.3556405305862427, + "learning_rate": 1.2763364779874216e-05, + "loss": 0.0641, + "step": 28420 + }, + { + "epoch": 1.490041928721174, + "grad_norm": 1.4136556386947632, + "learning_rate": 1.275026205450734e-05, + "loss": 0.0687, + "step": 28430 + }, + { + "epoch": 1.490566037735849, + "grad_norm": 1.5147451162338257, + "learning_rate": 1.2737159329140461e-05, + "loss": 0.0774, + "step": 28440 + }, + { + "epoch": 1.491090146750524, + "grad_norm": 1.2595840692520142, + "learning_rate": 1.2724056603773584e-05, + "loss": 0.0548, + "step": 28450 + }, + { + "epoch": 1.4916142557651992, + "grad_norm": 1.0852960348129272, + "learning_rate": 1.271095387840671e-05, + "loss": 0.0611, + "step": 28460 + }, + { + "epoch": 1.4921383647798743, + "grad_norm": 1.0651873350143433, + "learning_rate": 1.2697851153039833e-05, + "loss": 0.0732, + "step": 28470 + }, + { + "epoch": 1.4926624737945493, + "grad_norm": 1.559222936630249, + "learning_rate": 1.2684748427672958e-05, + "loss": 0.0612, + "step": 28480 + }, + { + "epoch": 1.4931865828092243, + "grad_norm": 3.197054862976074, + "learning_rate": 1.2671645702306081e-05, + "loss": 0.0671, + "step": 28490 + }, + { + "epoch": 1.4937106918238994, + "grad_norm": 0.6540378928184509, + "learning_rate": 1.2658542976939203e-05, + "loss": 0.068, + "step": 28500 + }, + { + "epoch": 1.4942348008385744, + "grad_norm": 1.2630752325057983, + "learning_rate": 1.2645440251572326e-05, + "loss": 0.0583, + "step": 28510 + }, + { + "epoch": 1.4947589098532494, + "grad_norm": 1.6068637371063232, + "learning_rate": 1.2632337526205451e-05, + "loss": 0.093, + "step": 28520 + }, + { + "epoch": 1.4952830188679245, + "grad_norm": 1.573815941810608, + "learning_rate": 1.2619234800838575e-05, + "loss": 0.0651, + "step": 28530 + }, + { + "epoch": 1.4958071278825995, + "grad_norm": 1.224945068359375, + "learning_rate": 1.26061320754717e-05, + "loss": 0.0812, + "step": 28540 + }, + { + "epoch": 1.4963312368972748, + "grad_norm": 2.006347179412842, + "learning_rate": 1.2593029350104823e-05, + "loss": 0.1058, + "step": 28550 + }, + { + "epoch": 1.4968553459119498, + "grad_norm": 1.5014920234680176, + "learning_rate": 1.2579926624737945e-05, + "loss": 0.0671, + "step": 28560 + }, + { + "epoch": 1.4973794549266248, + "grad_norm": 1.4988354444503784, + "learning_rate": 1.2566823899371068e-05, + "loss": 0.0572, + "step": 28570 + }, + { + "epoch": 1.4979035639412999, + "grad_norm": 1.3243881464004517, + "learning_rate": 1.2553721174004193e-05, + "loss": 0.0719, + "step": 28580 + }, + { + "epoch": 1.498427672955975, + "grad_norm": 1.867492437362671, + "learning_rate": 1.2540618448637317e-05, + "loss": 0.0725, + "step": 28590 + }, + { + "epoch": 1.49895178197065, + "grad_norm": 2.1830546855926514, + "learning_rate": 1.2527515723270442e-05, + "loss": 0.0958, + "step": 28600 + }, + { + "epoch": 1.499475890985325, + "grad_norm": 1.920746922492981, + "learning_rate": 1.2514412997903565e-05, + "loss": 0.1057, + "step": 28610 + }, + { + "epoch": 1.5, + "grad_norm": 1.0297828912734985, + "learning_rate": 1.2501310272536687e-05, + "loss": 0.0722, + "step": 28620 + }, + { + "epoch": 1.500524109014675, + "grad_norm": 5.875576496124268, + "learning_rate": 1.2488207547169812e-05, + "loss": 0.1083, + "step": 28630 + }, + { + "epoch": 1.50104821802935, + "grad_norm": 1.3598127365112305, + "learning_rate": 1.2475104821802935e-05, + "loss": 0.0681, + "step": 28640 + }, + { + "epoch": 1.501572327044025, + "grad_norm": 1.3286088705062866, + "learning_rate": 1.2462002096436059e-05, + "loss": 0.0812, + "step": 28650 + }, + { + "epoch": 1.5020964360587001, + "grad_norm": 1.2798917293548584, + "learning_rate": 1.2448899371069182e-05, + "loss": 0.0801, + "step": 28660 + }, + { + "epoch": 1.5026205450733752, + "grad_norm": 1.671739935874939, + "learning_rate": 1.2435796645702307e-05, + "loss": 0.0794, + "step": 28670 + }, + { + "epoch": 1.5031446540880502, + "grad_norm": 1.265716791152954, + "learning_rate": 1.242269392033543e-05, + "loss": 0.0814, + "step": 28680 + }, + { + "epoch": 1.5036687631027252, + "grad_norm": 1.810259222984314, + "learning_rate": 1.2409591194968554e-05, + "loss": 0.0834, + "step": 28690 + }, + { + "epoch": 1.5041928721174003, + "grad_norm": 1.5818339586257935, + "learning_rate": 1.2396488469601677e-05, + "loss": 0.0725, + "step": 28700 + }, + { + "epoch": 1.5047169811320755, + "grad_norm": 1.0720511674880981, + "learning_rate": 1.2383385744234802e-05, + "loss": 0.0663, + "step": 28710 + }, + { + "epoch": 1.5052410901467506, + "grad_norm": 0.9712188839912415, + "learning_rate": 1.2370283018867924e-05, + "loss": 0.0672, + "step": 28720 + }, + { + "epoch": 1.5057651991614256, + "grad_norm": 1.4916762113571167, + "learning_rate": 1.235718029350105e-05, + "loss": 0.0621, + "step": 28730 + }, + { + "epoch": 1.5062893081761006, + "grad_norm": 0.9293265342712402, + "learning_rate": 1.2344077568134173e-05, + "loss": 0.0785, + "step": 28740 + }, + { + "epoch": 1.5068134171907757, + "grad_norm": 1.805554747581482, + "learning_rate": 1.2330974842767296e-05, + "loss": 0.0687, + "step": 28750 + }, + { + "epoch": 1.5073375262054507, + "grad_norm": 0.8354784250259399, + "learning_rate": 1.231787211740042e-05, + "loss": 0.0841, + "step": 28760 + }, + { + "epoch": 1.507861635220126, + "grad_norm": 2.4125020503997803, + "learning_rate": 1.2304769392033544e-05, + "loss": 0.0648, + "step": 28770 + }, + { + "epoch": 1.508385744234801, + "grad_norm": 2.2017788887023926, + "learning_rate": 1.2291666666666666e-05, + "loss": 0.0821, + "step": 28780 + }, + { + "epoch": 1.508909853249476, + "grad_norm": 1.4541821479797363, + "learning_rate": 1.2278563941299791e-05, + "loss": 0.0816, + "step": 28790 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.7428493499755859, + "learning_rate": 1.2265461215932915e-05, + "loss": 0.058, + "step": 28800 + }, + { + "epoch": 1.509958071278826, + "grad_norm": 1.1366612911224365, + "learning_rate": 1.2252358490566038e-05, + "loss": 0.0588, + "step": 28810 + }, + { + "epoch": 1.5104821802935011, + "grad_norm": 1.2058767080307007, + "learning_rate": 1.2239255765199161e-05, + "loss": 0.0732, + "step": 28820 + }, + { + "epoch": 1.5110062893081762, + "grad_norm": 3.127432107925415, + "learning_rate": 1.2226153039832286e-05, + "loss": 0.1002, + "step": 28830 + }, + { + "epoch": 1.5115303983228512, + "grad_norm": 3.9987447261810303, + "learning_rate": 1.2213050314465408e-05, + "loss": 0.0848, + "step": 28840 + }, + { + "epoch": 1.5120545073375262, + "grad_norm": 0.9082832932472229, + "learning_rate": 1.2199947589098533e-05, + "loss": 0.0777, + "step": 28850 + }, + { + "epoch": 1.5125786163522013, + "grad_norm": 1.5602792501449585, + "learning_rate": 1.2186844863731657e-05, + "loss": 0.0609, + "step": 28860 + }, + { + "epoch": 1.5131027253668763, + "grad_norm": 1.9996236562728882, + "learning_rate": 1.217374213836478e-05, + "loss": 0.0764, + "step": 28870 + }, + { + "epoch": 1.5136268343815513, + "grad_norm": 1.3080304861068726, + "learning_rate": 1.2160639412997903e-05, + "loss": 0.0757, + "step": 28880 + }, + { + "epoch": 1.5141509433962264, + "grad_norm": 1.3440593481063843, + "learning_rate": 1.2147536687631028e-05, + "loss": 0.071, + "step": 28890 + }, + { + "epoch": 1.5146750524109014, + "grad_norm": 1.1804187297821045, + "learning_rate": 1.213443396226415e-05, + "loss": 0.1036, + "step": 28900 + }, + { + "epoch": 1.5151991614255764, + "grad_norm": 0.9023783802986145, + "learning_rate": 1.2121331236897275e-05, + "loss": 0.0663, + "step": 28910 + }, + { + "epoch": 1.5157232704402515, + "grad_norm": 1.4749324321746826, + "learning_rate": 1.2108228511530399e-05, + "loss": 0.0619, + "step": 28920 + }, + { + "epoch": 1.5162473794549265, + "grad_norm": 1.2634544372558594, + "learning_rate": 1.2095125786163522e-05, + "loss": 0.1205, + "step": 28930 + }, + { + "epoch": 1.5167714884696015, + "grad_norm": 2.068763256072998, + "learning_rate": 1.2082023060796645e-05, + "loss": 0.0663, + "step": 28940 + }, + { + "epoch": 1.5172955974842768, + "grad_norm": 1.1468791961669922, + "learning_rate": 1.206892033542977e-05, + "loss": 0.0826, + "step": 28950 + }, + { + "epoch": 1.5178197064989518, + "grad_norm": 1.9512981176376343, + "learning_rate": 1.2055817610062894e-05, + "loss": 0.0921, + "step": 28960 + }, + { + "epoch": 1.5183438155136268, + "grad_norm": 1.7221571207046509, + "learning_rate": 1.2042714884696017e-05, + "loss": 0.07, + "step": 28970 + }, + { + "epoch": 1.5188679245283019, + "grad_norm": 1.6264697313308716, + "learning_rate": 1.202961215932914e-05, + "loss": 0.0596, + "step": 28980 + }, + { + "epoch": 1.519392033542977, + "grad_norm": 2.5485575199127197, + "learning_rate": 1.2016509433962264e-05, + "loss": 0.0852, + "step": 28990 + }, + { + "epoch": 1.519916142557652, + "grad_norm": 0.7181016802787781, + "learning_rate": 1.2003406708595389e-05, + "loss": 0.0684, + "step": 29000 + }, + { + "epoch": 1.519916142557652, + "eval_loss": 0.2725262939929962, + "eval_runtime": 267.0455, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 1.243, + "step": 29000 + }, + { + "epoch": 1.5204402515723272, + "grad_norm": 1.1129300594329834, + "learning_rate": 1.1990303983228512e-05, + "loss": 0.1032, + "step": 29010 + }, + { + "epoch": 1.5209643605870022, + "grad_norm": 1.3954484462738037, + "learning_rate": 1.1977201257861636e-05, + "loss": 0.0645, + "step": 29020 + }, + { + "epoch": 1.5214884696016773, + "grad_norm": 2.2983057498931885, + "learning_rate": 1.196409853249476e-05, + "loss": 0.0875, + "step": 29030 + }, + { + "epoch": 1.5220125786163523, + "grad_norm": 1.7596241235733032, + "learning_rate": 1.1950995807127884e-05, + "loss": 0.0701, + "step": 29040 + }, + { + "epoch": 1.5225366876310273, + "grad_norm": 2.6219558715820312, + "learning_rate": 1.1937893081761006e-05, + "loss": 0.0652, + "step": 29050 + }, + { + "epoch": 1.5230607966457024, + "grad_norm": 1.7040820121765137, + "learning_rate": 1.1924790356394131e-05, + "loss": 0.099, + "step": 29060 + }, + { + "epoch": 1.5235849056603774, + "grad_norm": 0.6065346002578735, + "learning_rate": 1.1911687631027254e-05, + "loss": 0.0608, + "step": 29070 + }, + { + "epoch": 1.5241090146750524, + "grad_norm": 1.5256175994873047, + "learning_rate": 1.1898584905660378e-05, + "loss": 0.0973, + "step": 29080 + }, + { + "epoch": 1.5246331236897275, + "grad_norm": 1.1144390106201172, + "learning_rate": 1.1885482180293501e-05, + "loss": 0.084, + "step": 29090 + }, + { + "epoch": 1.5251572327044025, + "grad_norm": 1.670206069946289, + "learning_rate": 1.1872379454926626e-05, + "loss": 0.0827, + "step": 29100 + }, + { + "epoch": 1.5256813417190775, + "grad_norm": 3.055248975753784, + "learning_rate": 1.1859276729559748e-05, + "loss": 0.076, + "step": 29110 + }, + { + "epoch": 1.5262054507337526, + "grad_norm": 2.054408311843872, + "learning_rate": 1.1846174004192873e-05, + "loss": 0.0893, + "step": 29120 + }, + { + "epoch": 1.5267295597484276, + "grad_norm": 1.0369834899902344, + "learning_rate": 1.1833071278825997e-05, + "loss": 0.088, + "step": 29130 + }, + { + "epoch": 1.5272536687631026, + "grad_norm": 1.8977478742599487, + "learning_rate": 1.181996855345912e-05, + "loss": 0.0666, + "step": 29140 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 1.332196831703186, + "learning_rate": 1.1806865828092243e-05, + "loss": 0.0837, + "step": 29150 + }, + { + "epoch": 1.5283018867924527, + "grad_norm": 2.6005096435546875, + "learning_rate": 1.1793763102725368e-05, + "loss": 0.0783, + "step": 29160 + }, + { + "epoch": 1.5288259958071277, + "grad_norm": 2.069912910461426, + "learning_rate": 1.178066037735849e-05, + "loss": 0.0706, + "step": 29170 + }, + { + "epoch": 1.5293501048218028, + "grad_norm": 0.627423107624054, + "learning_rate": 1.1767557651991615e-05, + "loss": 0.0632, + "step": 29180 + }, + { + "epoch": 1.529874213836478, + "grad_norm": 2.5008325576782227, + "learning_rate": 1.1754454926624739e-05, + "loss": 0.0732, + "step": 29190 + }, + { + "epoch": 1.530398322851153, + "grad_norm": 1.6689064502716064, + "learning_rate": 1.1741352201257862e-05, + "loss": 0.091, + "step": 29200 + }, + { + "epoch": 1.530922431865828, + "grad_norm": 1.0986666679382324, + "learning_rate": 1.1728249475890985e-05, + "loss": 0.0808, + "step": 29210 + }, + { + "epoch": 1.5314465408805031, + "grad_norm": 1.1996750831604004, + "learning_rate": 1.171514675052411e-05, + "loss": 0.0848, + "step": 29220 + }, + { + "epoch": 1.5319706498951782, + "grad_norm": 1.5958057641983032, + "learning_rate": 1.1702044025157232e-05, + "loss": 0.0674, + "step": 29230 + }, + { + "epoch": 1.5324947589098532, + "grad_norm": 2.4333527088165283, + "learning_rate": 1.1688941299790357e-05, + "loss": 0.0807, + "step": 29240 + }, + { + "epoch": 1.5330188679245285, + "grad_norm": 1.3615031242370605, + "learning_rate": 1.167583857442348e-05, + "loss": 0.0838, + "step": 29250 + }, + { + "epoch": 1.5335429769392035, + "grad_norm": 1.0076712369918823, + "learning_rate": 1.1662735849056604e-05, + "loss": 0.0667, + "step": 29260 + }, + { + "epoch": 1.5340670859538785, + "grad_norm": 1.245064377784729, + "learning_rate": 1.1649633123689727e-05, + "loss": 0.0921, + "step": 29270 + }, + { + "epoch": 1.5345911949685536, + "grad_norm": 1.793381690979004, + "learning_rate": 1.1636530398322852e-05, + "loss": 0.0726, + "step": 29280 + }, + { + "epoch": 1.5351153039832286, + "grad_norm": 1.2510024309158325, + "learning_rate": 1.1623427672955974e-05, + "loss": 0.0705, + "step": 29290 + }, + { + "epoch": 1.5356394129979036, + "grad_norm": 1.0669950246810913, + "learning_rate": 1.1610324947589099e-05, + "loss": 0.0773, + "step": 29300 + }, + { + "epoch": 1.5361635220125787, + "grad_norm": 2.63558292388916, + "learning_rate": 1.1597222222222223e-05, + "loss": 0.084, + "step": 29310 + }, + { + "epoch": 1.5366876310272537, + "grad_norm": 1.210523009300232, + "learning_rate": 1.1584119496855346e-05, + "loss": 0.0802, + "step": 29320 + }, + { + "epoch": 1.5372117400419287, + "grad_norm": 1.076084017753601, + "learning_rate": 1.157101677148847e-05, + "loss": 0.0855, + "step": 29330 + }, + { + "epoch": 1.5377358490566038, + "grad_norm": 2.554025411605835, + "learning_rate": 1.1557914046121594e-05, + "loss": 0.088, + "step": 29340 + }, + { + "epoch": 1.5382599580712788, + "grad_norm": 1.6165305376052856, + "learning_rate": 1.1544811320754718e-05, + "loss": 0.0707, + "step": 29350 + }, + { + "epoch": 1.5387840670859538, + "grad_norm": 1.7850388288497925, + "learning_rate": 1.1531708595387841e-05, + "loss": 0.0816, + "step": 29360 + }, + { + "epoch": 1.5393081761006289, + "grad_norm": 1.9576303958892822, + "learning_rate": 1.1518605870020965e-05, + "loss": 0.0886, + "step": 29370 + }, + { + "epoch": 1.539832285115304, + "grad_norm": 1.1999833583831787, + "learning_rate": 1.1505503144654088e-05, + "loss": 0.0605, + "step": 29380 + }, + { + "epoch": 1.540356394129979, + "grad_norm": 1.0975391864776611, + "learning_rate": 1.1492400419287213e-05, + "loss": 0.0951, + "step": 29390 + }, + { + "epoch": 1.540880503144654, + "grad_norm": 1.9649990797042847, + "learning_rate": 1.1479297693920336e-05, + "loss": 0.078, + "step": 29400 + }, + { + "epoch": 1.541404612159329, + "grad_norm": 1.9952278137207031, + "learning_rate": 1.146619496855346e-05, + "loss": 0.0705, + "step": 29410 + }, + { + "epoch": 1.541928721174004, + "grad_norm": 1.1087632179260254, + "learning_rate": 1.1453092243186583e-05, + "loss": 0.0881, + "step": 29420 + }, + { + "epoch": 1.5424528301886793, + "grad_norm": 1.3009109497070312, + "learning_rate": 1.1439989517819708e-05, + "loss": 0.0798, + "step": 29430 + }, + { + "epoch": 1.5429769392033543, + "grad_norm": 0.8007161021232605, + "learning_rate": 1.142688679245283e-05, + "loss": 0.0661, + "step": 29440 + }, + { + "epoch": 1.5435010482180294, + "grad_norm": 0.6858960390090942, + "learning_rate": 1.1413784067085955e-05, + "loss": 0.0712, + "step": 29450 + }, + { + "epoch": 1.5440251572327044, + "grad_norm": 0.7989315390586853, + "learning_rate": 1.1400681341719078e-05, + "loss": 0.0719, + "step": 29460 + }, + { + "epoch": 1.5445492662473794, + "grad_norm": 1.4100242853164673, + "learning_rate": 1.1387578616352202e-05, + "loss": 0.0808, + "step": 29470 + }, + { + "epoch": 1.5450733752620545, + "grad_norm": 1.6161035299301147, + "learning_rate": 1.1374475890985325e-05, + "loss": 0.0812, + "step": 29480 + }, + { + "epoch": 1.5455974842767297, + "grad_norm": 0.731370210647583, + "learning_rate": 1.136137316561845e-05, + "loss": 0.0706, + "step": 29490 + }, + { + "epoch": 1.5461215932914047, + "grad_norm": 2.8555400371551514, + "learning_rate": 1.1348270440251572e-05, + "loss": 0.097, + "step": 29500 + }, + { + "epoch": 1.5466457023060798, + "grad_norm": 0.6062478423118591, + "learning_rate": 1.1335167714884697e-05, + "loss": 0.0681, + "step": 29510 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 1.8448848724365234, + "learning_rate": 1.132206498951782e-05, + "loss": 0.0881, + "step": 29520 + }, + { + "epoch": 1.5476939203354299, + "grad_norm": 1.489322304725647, + "learning_rate": 1.1308962264150944e-05, + "loss": 0.0973, + "step": 29530 + }, + { + "epoch": 1.5482180293501049, + "grad_norm": 1.3408927917480469, + "learning_rate": 1.1295859538784067e-05, + "loss": 0.0577, + "step": 29540 + }, + { + "epoch": 1.54874213836478, + "grad_norm": 1.461776852607727, + "learning_rate": 1.1282756813417192e-05, + "loss": 0.0777, + "step": 29550 + }, + { + "epoch": 1.549266247379455, + "grad_norm": 1.5345159769058228, + "learning_rate": 1.1269654088050314e-05, + "loss": 0.0886, + "step": 29560 + }, + { + "epoch": 1.54979035639413, + "grad_norm": 0.965441882610321, + "learning_rate": 1.1256551362683439e-05, + "loss": 0.1009, + "step": 29570 + }, + { + "epoch": 1.550314465408805, + "grad_norm": 1.252944827079773, + "learning_rate": 1.1243448637316562e-05, + "loss": 0.1045, + "step": 29580 + }, + { + "epoch": 1.55083857442348, + "grad_norm": 1.1413624286651611, + "learning_rate": 1.1230345911949686e-05, + "loss": 0.0585, + "step": 29590 + }, + { + "epoch": 1.551362683438155, + "grad_norm": 0.5949219465255737, + "learning_rate": 1.121724318658281e-05, + "loss": 0.0833, + "step": 29600 + }, + { + "epoch": 1.5518867924528301, + "grad_norm": 1.4133487939834595, + "learning_rate": 1.1204140461215934e-05, + "loss": 0.0636, + "step": 29610 + }, + { + "epoch": 1.5524109014675052, + "grad_norm": 1.1068850755691528, + "learning_rate": 1.1191037735849056e-05, + "loss": 0.0931, + "step": 29620 + }, + { + "epoch": 1.5529350104821802, + "grad_norm": 4.534146785736084, + "learning_rate": 1.1177935010482181e-05, + "loss": 0.0539, + "step": 29630 + }, + { + "epoch": 1.5534591194968552, + "grad_norm": 1.7356497049331665, + "learning_rate": 1.1164832285115304e-05, + "loss": 0.0625, + "step": 29640 + }, + { + "epoch": 1.5539832285115303, + "grad_norm": 2.3491108417510986, + "learning_rate": 1.1151729559748428e-05, + "loss": 0.0854, + "step": 29650 + }, + { + "epoch": 1.5545073375262053, + "grad_norm": 0.9400236010551453, + "learning_rate": 1.1138626834381551e-05, + "loss": 0.064, + "step": 29660 + }, + { + "epoch": 1.5550314465408805, + "grad_norm": 2.1760456562042236, + "learning_rate": 1.1125524109014676e-05, + "loss": 0.0894, + "step": 29670 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.121522903442383, + "learning_rate": 1.11124213836478e-05, + "loss": 0.0675, + "step": 29680 + }, + { + "epoch": 1.5560796645702306, + "grad_norm": 1.169234037399292, + "learning_rate": 1.1099318658280923e-05, + "loss": 0.1041, + "step": 29690 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 2.202223062515259, + "learning_rate": 1.1086215932914046e-05, + "loss": 0.101, + "step": 29700 + }, + { + "epoch": 1.5571278825995807, + "grad_norm": 1.6702232360839844, + "learning_rate": 1.107311320754717e-05, + "loss": 0.0904, + "step": 29710 + }, + { + "epoch": 1.5576519916142557, + "grad_norm": 1.469577670097351, + "learning_rate": 1.1060010482180295e-05, + "loss": 0.0834, + "step": 29720 + }, + { + "epoch": 1.558176100628931, + "grad_norm": 1.481614589691162, + "learning_rate": 1.1046907756813418e-05, + "loss": 0.0655, + "step": 29730 + }, + { + "epoch": 1.558700209643606, + "grad_norm": 1.0949363708496094, + "learning_rate": 1.1033805031446542e-05, + "loss": 0.0814, + "step": 29740 + }, + { + "epoch": 1.559224318658281, + "grad_norm": 1.97059965133667, + "learning_rate": 1.1020702306079665e-05, + "loss": 0.0542, + "step": 29750 + }, + { + "epoch": 1.559748427672956, + "grad_norm": 1.6910370588302612, + "learning_rate": 1.100759958071279e-05, + "loss": 0.0948, + "step": 29760 + }, + { + "epoch": 1.560272536687631, + "grad_norm": 1.5514713525772095, + "learning_rate": 1.0994496855345912e-05, + "loss": 0.0924, + "step": 29770 + }, + { + "epoch": 1.5607966457023061, + "grad_norm": 2.059285879135132, + "learning_rate": 1.0981394129979037e-05, + "loss": 0.0719, + "step": 29780 + }, + { + "epoch": 1.5613207547169812, + "grad_norm": 1.273655891418457, + "learning_rate": 1.096829140461216e-05, + "loss": 0.1041, + "step": 29790 + }, + { + "epoch": 1.5618448637316562, + "grad_norm": 1.5605663061141968, + "learning_rate": 1.0955188679245284e-05, + "loss": 0.07, + "step": 29800 + }, + { + "epoch": 1.5623689727463312, + "grad_norm": 0.9228988885879517, + "learning_rate": 1.0942085953878407e-05, + "loss": 0.0642, + "step": 29810 + }, + { + "epoch": 1.5628930817610063, + "grad_norm": 5.248297214508057, + "learning_rate": 1.0928983228511532e-05, + "loss": 0.0842, + "step": 29820 + }, + { + "epoch": 1.5634171907756813, + "grad_norm": 1.027686595916748, + "learning_rate": 1.0915880503144654e-05, + "loss": 0.0771, + "step": 29830 + }, + { + "epoch": 1.5639412997903563, + "grad_norm": 1.1501569747924805, + "learning_rate": 1.0902777777777779e-05, + "loss": 0.0803, + "step": 29840 + }, + { + "epoch": 1.5644654088050314, + "grad_norm": 1.298979640007019, + "learning_rate": 1.0889675052410902e-05, + "loss": 0.0667, + "step": 29850 + }, + { + "epoch": 1.5649895178197064, + "grad_norm": 1.156478762626648, + "learning_rate": 1.0876572327044026e-05, + "loss": 0.088, + "step": 29860 + }, + { + "epoch": 1.5655136268343814, + "grad_norm": 1.7162197828292847, + "learning_rate": 1.0863469601677149e-05, + "loss": 0.0829, + "step": 29870 + }, + { + "epoch": 1.5660377358490565, + "grad_norm": 1.008570909500122, + "learning_rate": 1.0850366876310274e-05, + "loss": 0.0697, + "step": 29880 + }, + { + "epoch": 1.5665618448637315, + "grad_norm": 1.2989729642868042, + "learning_rate": 1.0837264150943396e-05, + "loss": 0.0591, + "step": 29890 + }, + { + "epoch": 1.5670859538784065, + "grad_norm": 1.0060633420944214, + "learning_rate": 1.0824161425576521e-05, + "loss": 0.08, + "step": 29900 + }, + { + "epoch": 1.5676100628930818, + "grad_norm": 1.4241963624954224, + "learning_rate": 1.0811058700209644e-05, + "loss": 0.0745, + "step": 29910 + }, + { + "epoch": 1.5681341719077568, + "grad_norm": 2.18135404586792, + "learning_rate": 1.0797955974842768e-05, + "loss": 0.0789, + "step": 29920 + }, + { + "epoch": 1.5686582809224319, + "grad_norm": 1.7818392515182495, + "learning_rate": 1.0784853249475891e-05, + "loss": 0.0737, + "step": 29930 + }, + { + "epoch": 1.569182389937107, + "grad_norm": 1.8234585523605347, + "learning_rate": 1.0771750524109016e-05, + "loss": 0.0653, + "step": 29940 + }, + { + "epoch": 1.569706498951782, + "grad_norm": 1.3113237619400024, + "learning_rate": 1.0758647798742138e-05, + "loss": 0.0976, + "step": 29950 + }, + { + "epoch": 1.570230607966457, + "grad_norm": 1.1958644390106201, + "learning_rate": 1.0745545073375263e-05, + "loss": 0.0658, + "step": 29960 + }, + { + "epoch": 1.5707547169811322, + "grad_norm": 2.510767936706543, + "learning_rate": 1.0732442348008386e-05, + "loss": 0.0733, + "step": 29970 + }, + { + "epoch": 1.5712788259958073, + "grad_norm": 3.316685199737549, + "learning_rate": 1.071933962264151e-05, + "loss": 0.0893, + "step": 29980 + }, + { + "epoch": 1.5718029350104823, + "grad_norm": 1.476149082183838, + "learning_rate": 1.0706236897274633e-05, + "loss": 0.0912, + "step": 29990 + }, + { + "epoch": 1.5723270440251573, + "grad_norm": 1.2044668197631836, + "learning_rate": 1.0693134171907758e-05, + "loss": 0.0814, + "step": 30000 + }, + { + "epoch": 1.5723270440251573, + "eval_loss": 0.26660144329071045, + "eval_runtime": 267.6774, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 1.24, + "step": 30000 + }, + { + "epoch": 1.5728511530398324, + "grad_norm": 1.601665735244751, + "learning_rate": 1.0680031446540882e-05, + "loss": 0.0705, + "step": 30010 + }, + { + "epoch": 1.5733752620545074, + "grad_norm": 1.4815196990966797, + "learning_rate": 1.0666928721174005e-05, + "loss": 0.0904, + "step": 30020 + }, + { + "epoch": 1.5738993710691824, + "grad_norm": 1.331384539604187, + "learning_rate": 1.0653825995807128e-05, + "loss": 0.0812, + "step": 30030 + }, + { + "epoch": 1.5744234800838575, + "grad_norm": 2.9445841312408447, + "learning_rate": 1.0640723270440252e-05, + "loss": 0.0734, + "step": 30040 + }, + { + "epoch": 1.5749475890985325, + "grad_norm": 3.5984508991241455, + "learning_rate": 1.0627620545073377e-05, + "loss": 0.0739, + "step": 30050 + }, + { + "epoch": 1.5754716981132075, + "grad_norm": 2.2922754287719727, + "learning_rate": 1.06145178197065e-05, + "loss": 0.0612, + "step": 30060 + }, + { + "epoch": 1.5759958071278826, + "grad_norm": 1.01918625831604, + "learning_rate": 1.0601415094339624e-05, + "loss": 0.0866, + "step": 30070 + }, + { + "epoch": 1.5765199161425576, + "grad_norm": 1.2766294479370117, + "learning_rate": 1.0588312368972747e-05, + "loss": 0.0795, + "step": 30080 + }, + { + "epoch": 1.5770440251572326, + "grad_norm": 1.4242372512817383, + "learning_rate": 1.0575209643605872e-05, + "loss": 0.0847, + "step": 30090 + }, + { + "epoch": 1.5775681341719077, + "grad_norm": 1.5070456266403198, + "learning_rate": 1.0562106918238994e-05, + "loss": 0.0705, + "step": 30100 + }, + { + "epoch": 1.5780922431865827, + "grad_norm": 0.5492226481437683, + "learning_rate": 1.0549004192872119e-05, + "loss": 0.0839, + "step": 30110 + }, + { + "epoch": 1.5786163522012577, + "grad_norm": 2.577894687652588, + "learning_rate": 1.0535901467505242e-05, + "loss": 0.1087, + "step": 30120 + }, + { + "epoch": 1.5791404612159328, + "grad_norm": 1.4646040201187134, + "learning_rate": 1.0522798742138366e-05, + "loss": 0.0812, + "step": 30130 + }, + { + "epoch": 1.5796645702306078, + "grad_norm": 1.1385252475738525, + "learning_rate": 1.0509696016771489e-05, + "loss": 0.071, + "step": 30140 + }, + { + "epoch": 1.580188679245283, + "grad_norm": 0.8355076909065247, + "learning_rate": 1.0496593291404614e-05, + "loss": 0.0766, + "step": 30150 + }, + { + "epoch": 1.580712788259958, + "grad_norm": 1.4133363962173462, + "learning_rate": 1.0483490566037736e-05, + "loss": 0.062, + "step": 30160 + }, + { + "epoch": 1.5812368972746331, + "grad_norm": 3.577280282974243, + "learning_rate": 1.047038784067086e-05, + "loss": 0.0829, + "step": 30170 + }, + { + "epoch": 1.5817610062893082, + "grad_norm": 1.477343201637268, + "learning_rate": 1.0457285115303984e-05, + "loss": 0.0647, + "step": 30180 + }, + { + "epoch": 1.5822851153039832, + "grad_norm": 1.4361555576324463, + "learning_rate": 1.0444182389937108e-05, + "loss": 0.0882, + "step": 30190 + }, + { + "epoch": 1.5828092243186582, + "grad_norm": 2.002866506576538, + "learning_rate": 1.0431079664570231e-05, + "loss": 0.0856, + "step": 30200 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 1.5531564950942993, + "learning_rate": 1.0417976939203356e-05, + "loss": 0.0849, + "step": 30210 + }, + { + "epoch": 1.5838574423480085, + "grad_norm": 1.7375233173370361, + "learning_rate": 1.0404874213836478e-05, + "loss": 0.0721, + "step": 30220 + }, + { + "epoch": 1.5843815513626835, + "grad_norm": 1.705243706703186, + "learning_rate": 1.0391771488469603e-05, + "loss": 0.0718, + "step": 30230 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.8574244976043701, + "learning_rate": 1.0378668763102724e-05, + "loss": 0.0639, + "step": 30240 + }, + { + "epoch": 1.5854297693920336, + "grad_norm": 1.3724231719970703, + "learning_rate": 1.036556603773585e-05, + "loss": 0.0738, + "step": 30250 + }, + { + "epoch": 1.5859538784067087, + "grad_norm": 4.6184983253479, + "learning_rate": 1.0352463312368973e-05, + "loss": 0.0857, + "step": 30260 + }, + { + "epoch": 1.5864779874213837, + "grad_norm": 6.087825298309326, + "learning_rate": 1.0339360587002096e-05, + "loss": 0.0729, + "step": 30270 + }, + { + "epoch": 1.5870020964360587, + "grad_norm": 1.5708401203155518, + "learning_rate": 1.032625786163522e-05, + "loss": 0.0693, + "step": 30280 + }, + { + "epoch": 1.5875262054507338, + "grad_norm": 1.4279801845550537, + "learning_rate": 1.0313155136268345e-05, + "loss": 0.0889, + "step": 30290 + }, + { + "epoch": 1.5880503144654088, + "grad_norm": 1.2791606187820435, + "learning_rate": 1.0300052410901468e-05, + "loss": 0.0741, + "step": 30300 + }, + { + "epoch": 1.5885744234800838, + "grad_norm": 1.5464717149734497, + "learning_rate": 1.0286949685534592e-05, + "loss": 0.0785, + "step": 30310 + }, + { + "epoch": 1.5890985324947589, + "grad_norm": 4.76186990737915, + "learning_rate": 1.0273846960167715e-05, + "loss": 0.0658, + "step": 30320 + }, + { + "epoch": 1.5896226415094339, + "grad_norm": 1.987302541732788, + "learning_rate": 1.0260744234800838e-05, + "loss": 0.0644, + "step": 30330 + }, + { + "epoch": 1.590146750524109, + "grad_norm": 2.006908416748047, + "learning_rate": 1.0247641509433962e-05, + "loss": 0.055, + "step": 30340 + }, + { + "epoch": 1.590670859538784, + "grad_norm": 1.641992449760437, + "learning_rate": 1.0234538784067087e-05, + "loss": 0.087, + "step": 30350 + }, + { + "epoch": 1.591194968553459, + "grad_norm": 1.6064209938049316, + "learning_rate": 1.022143605870021e-05, + "loss": 0.1012, + "step": 30360 + }, + { + "epoch": 1.591719077568134, + "grad_norm": 1.9769614934921265, + "learning_rate": 1.0208333333333334e-05, + "loss": 0.0708, + "step": 30370 + }, + { + "epoch": 1.5922431865828093, + "grad_norm": 1.4612163305282593, + "learning_rate": 1.0195230607966457e-05, + "loss": 0.0763, + "step": 30380 + }, + { + "epoch": 1.5927672955974843, + "grad_norm": 0.8201736807823181, + "learning_rate": 1.018212788259958e-05, + "loss": 0.0629, + "step": 30390 + }, + { + "epoch": 1.5932914046121593, + "grad_norm": 1.8059312105178833, + "learning_rate": 1.0169025157232705e-05, + "loss": 0.0862, + "step": 30400 + }, + { + "epoch": 1.5938155136268344, + "grad_norm": 2.156587600708008, + "learning_rate": 1.0155922431865829e-05, + "loss": 0.0975, + "step": 30410 + }, + { + "epoch": 1.5943396226415094, + "grad_norm": 1.717282772064209, + "learning_rate": 1.0142819706498952e-05, + "loss": 0.0653, + "step": 30420 + }, + { + "epoch": 1.5948637316561844, + "grad_norm": 0.8761667013168335, + "learning_rate": 1.0129716981132076e-05, + "loss": 0.0925, + "step": 30430 + }, + { + "epoch": 1.5953878406708597, + "grad_norm": 1.7277803421020508, + "learning_rate": 1.01166142557652e-05, + "loss": 0.0752, + "step": 30440 + }, + { + "epoch": 1.5959119496855347, + "grad_norm": 1.5075651407241821, + "learning_rate": 1.0103511530398322e-05, + "loss": 0.0645, + "step": 30450 + }, + { + "epoch": 1.5964360587002098, + "grad_norm": 2.044546604156494, + "learning_rate": 1.0090408805031447e-05, + "loss": 0.0839, + "step": 30460 + }, + { + "epoch": 1.5969601677148848, + "grad_norm": 0.9201863408088684, + "learning_rate": 1.007730607966457e-05, + "loss": 0.0731, + "step": 30470 + }, + { + "epoch": 1.5974842767295598, + "grad_norm": 1.1442701816558838, + "learning_rate": 1.0064203354297694e-05, + "loss": 0.0771, + "step": 30480 + }, + { + "epoch": 1.5980083857442349, + "grad_norm": 1.5111795663833618, + "learning_rate": 1.0051100628930818e-05, + "loss": 0.0885, + "step": 30490 + }, + { + "epoch": 1.59853249475891, + "grad_norm": 1.7600494623184204, + "learning_rate": 1.0037997903563943e-05, + "loss": 0.0768, + "step": 30500 + }, + { + "epoch": 1.599056603773585, + "grad_norm": 1.3260642290115356, + "learning_rate": 1.0024895178197064e-05, + "loss": 0.0724, + "step": 30510 + }, + { + "epoch": 1.59958071278826, + "grad_norm": 1.7742761373519897, + "learning_rate": 1.001179245283019e-05, + "loss": 0.0708, + "step": 30520 + }, + { + "epoch": 1.600104821802935, + "grad_norm": 1.3558989763259888, + "learning_rate": 9.998689727463313e-06, + "loss": 0.0828, + "step": 30530 + }, + { + "epoch": 1.60062893081761, + "grad_norm": 2.178793430328369, + "learning_rate": 9.985587002096436e-06, + "loss": 0.0779, + "step": 30540 + }, + { + "epoch": 1.601153039832285, + "grad_norm": 2.13956618309021, + "learning_rate": 9.97248427672956e-06, + "loss": 0.0604, + "step": 30550 + }, + { + "epoch": 1.60167714884696, + "grad_norm": 1.9969738721847534, + "learning_rate": 9.959381551362685e-06, + "loss": 0.0816, + "step": 30560 + }, + { + "epoch": 1.6022012578616351, + "grad_norm": 1.623021125793457, + "learning_rate": 9.946278825995806e-06, + "loss": 0.105, + "step": 30570 + }, + { + "epoch": 1.6027253668763102, + "grad_norm": 1.9130661487579346, + "learning_rate": 9.933176100628931e-06, + "loss": 0.0778, + "step": 30580 + }, + { + "epoch": 1.6032494758909852, + "grad_norm": 1.505086898803711, + "learning_rate": 9.920073375262055e-06, + "loss": 0.07, + "step": 30590 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 2.169313907623291, + "learning_rate": 9.906970649895178e-06, + "loss": 0.0929, + "step": 30600 + }, + { + "epoch": 1.6042976939203353, + "grad_norm": 1.09574294090271, + "learning_rate": 9.893867924528302e-06, + "loss": 0.0498, + "step": 30610 + }, + { + "epoch": 1.6048218029350105, + "grad_norm": 1.1906548738479614, + "learning_rate": 9.880765199161427e-06, + "loss": 0.0773, + "step": 30620 + }, + { + "epoch": 1.6053459119496856, + "grad_norm": 2.843764543533325, + "learning_rate": 9.867662473794548e-06, + "loss": 0.0815, + "step": 30630 + }, + { + "epoch": 1.6058700209643606, + "grad_norm": 1.0723826885223389, + "learning_rate": 9.854559748427673e-06, + "loss": 0.0642, + "step": 30640 + }, + { + "epoch": 1.6063941299790356, + "grad_norm": 0.8873287439346313, + "learning_rate": 9.841457023060797e-06, + "loss": 0.0595, + "step": 30650 + }, + { + "epoch": 1.6069182389937107, + "grad_norm": 1.246100664138794, + "learning_rate": 9.82835429769392e-06, + "loss": 0.0818, + "step": 30660 + }, + { + "epoch": 1.6074423480083857, + "grad_norm": 1.3966413736343384, + "learning_rate": 9.815251572327044e-06, + "loss": 0.0646, + "step": 30670 + }, + { + "epoch": 1.607966457023061, + "grad_norm": 2.654226541519165, + "learning_rate": 9.802148846960169e-06, + "loss": 0.0825, + "step": 30680 + }, + { + "epoch": 1.608490566037736, + "grad_norm": 0.9575130939483643, + "learning_rate": 9.789046121593292e-06, + "loss": 0.0607, + "step": 30690 + }, + { + "epoch": 1.609014675052411, + "grad_norm": 1.2333629131317139, + "learning_rate": 9.775943396226415e-06, + "loss": 0.0654, + "step": 30700 + }, + { + "epoch": 1.609538784067086, + "grad_norm": 1.891493558883667, + "learning_rate": 9.762840670859539e-06, + "loss": 0.0772, + "step": 30710 + }, + { + "epoch": 1.610062893081761, + "grad_norm": 1.4982374906539917, + "learning_rate": 9.749737945492662e-06, + "loss": 0.0706, + "step": 30720 + }, + { + "epoch": 1.6105870020964361, + "grad_norm": 1.1844661235809326, + "learning_rate": 9.736635220125787e-06, + "loss": 0.0801, + "step": 30730 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.8028875589370728, + "learning_rate": 9.72353249475891e-06, + "loss": 0.0731, + "step": 30740 + }, + { + "epoch": 1.6116352201257862, + "grad_norm": 1.5980677604675293, + "learning_rate": 9.710429769392034e-06, + "loss": 0.0789, + "step": 30750 + }, + { + "epoch": 1.6121593291404612, + "grad_norm": 4.673587322235107, + "learning_rate": 9.697327044025157e-06, + "loss": 0.0885, + "step": 30760 + }, + { + "epoch": 1.6126834381551363, + "grad_norm": 1.1823970079421997, + "learning_rate": 9.684224318658283e-06, + "loss": 0.0787, + "step": 30770 + }, + { + "epoch": 1.6132075471698113, + "grad_norm": 1.0304325819015503, + "learning_rate": 9.671121593291404e-06, + "loss": 0.0987, + "step": 30780 + }, + { + "epoch": 1.6137316561844863, + "grad_norm": 1.4936891794204712, + "learning_rate": 9.65801886792453e-06, + "loss": 0.0712, + "step": 30790 + }, + { + "epoch": 1.6142557651991614, + "grad_norm": 0.8515013456344604, + "learning_rate": 9.644916142557653e-06, + "loss": 0.0764, + "step": 30800 + }, + { + "epoch": 1.6147798742138364, + "grad_norm": 1.0494325160980225, + "learning_rate": 9.631813417190776e-06, + "loss": 0.095, + "step": 30810 + }, + { + "epoch": 1.6153039832285114, + "grad_norm": 2.069692611694336, + "learning_rate": 9.6187106918239e-06, + "loss": 0.0568, + "step": 30820 + }, + { + "epoch": 1.6158280922431865, + "grad_norm": 1.360653281211853, + "learning_rate": 9.605607966457025e-06, + "loss": 0.0832, + "step": 30830 + }, + { + "epoch": 1.6163522012578615, + "grad_norm": 2.063434362411499, + "learning_rate": 9.592505241090146e-06, + "loss": 0.0716, + "step": 30840 + }, + { + "epoch": 1.6168763102725365, + "grad_norm": 1.8808234930038452, + "learning_rate": 9.579402515723271e-06, + "loss": 0.0713, + "step": 30850 + }, + { + "epoch": 1.6174004192872118, + "grad_norm": 2.8570516109466553, + "learning_rate": 9.566299790356395e-06, + "loss": 0.0763, + "step": 30860 + }, + { + "epoch": 1.6179245283018868, + "grad_norm": 1.3575539588928223, + "learning_rate": 9.553197064989518e-06, + "loss": 0.0777, + "step": 30870 + }, + { + "epoch": 1.6184486373165619, + "grad_norm": 1.710089087486267, + "learning_rate": 9.540094339622641e-06, + "loss": 0.0806, + "step": 30880 + }, + { + "epoch": 1.618972746331237, + "grad_norm": 2.1114864349365234, + "learning_rate": 9.526991614255767e-06, + "loss": 0.0818, + "step": 30890 + }, + { + "epoch": 1.619496855345912, + "grad_norm": 2.2566704750061035, + "learning_rate": 9.513888888888888e-06, + "loss": 0.0857, + "step": 30900 + }, + { + "epoch": 1.620020964360587, + "grad_norm": 2.0487608909606934, + "learning_rate": 9.500786163522013e-06, + "loss": 0.0779, + "step": 30910 + }, + { + "epoch": 1.6205450733752622, + "grad_norm": 1.2093197107315063, + "learning_rate": 9.487683438155137e-06, + "loss": 0.0942, + "step": 30920 + }, + { + "epoch": 1.6210691823899372, + "grad_norm": 0.9803887605667114, + "learning_rate": 9.47458071278826e-06, + "loss": 0.0695, + "step": 30930 + }, + { + "epoch": 1.6215932914046123, + "grad_norm": 1.6742619276046753, + "learning_rate": 9.461477987421383e-06, + "loss": 0.0709, + "step": 30940 + }, + { + "epoch": 1.6221174004192873, + "grad_norm": 1.9753679037094116, + "learning_rate": 9.448375262054509e-06, + "loss": 0.0818, + "step": 30950 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 1.4718005657196045, + "learning_rate": 9.43527253668763e-06, + "loss": 0.0865, + "step": 30960 + }, + { + "epoch": 1.6231656184486374, + "grad_norm": 1.4376145601272583, + "learning_rate": 9.422169811320755e-06, + "loss": 0.0652, + "step": 30970 + }, + { + "epoch": 1.6236897274633124, + "grad_norm": 1.3017988204956055, + "learning_rate": 9.409067085953879e-06, + "loss": 0.0706, + "step": 30980 + }, + { + "epoch": 1.6242138364779874, + "grad_norm": 2.7291808128356934, + "learning_rate": 9.395964360587002e-06, + "loss": 0.0871, + "step": 30990 + }, + { + "epoch": 1.6247379454926625, + "grad_norm": 1.0201388597488403, + "learning_rate": 9.382861635220125e-06, + "loss": 0.0584, + "step": 31000 + }, + { + "epoch": 1.6247379454926625, + "eval_loss": 0.26702582836151123, + "eval_runtime": 267.8655, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 1.239, + "step": 31000 + }, + { + "epoch": 1.6252620545073375, + "grad_norm": 3.665487051010132, + "learning_rate": 9.36975890985325e-06, + "loss": 0.0844, + "step": 31010 + }, + { + "epoch": 1.6257861635220126, + "grad_norm": 1.0894783735275269, + "learning_rate": 9.356656184486374e-06, + "loss": 0.0604, + "step": 31020 + }, + { + "epoch": 1.6263102725366876, + "grad_norm": 1.6628109216690063, + "learning_rate": 9.343553459119497e-06, + "loss": 0.0645, + "step": 31030 + }, + { + "epoch": 1.6268343815513626, + "grad_norm": 0.9611374735832214, + "learning_rate": 9.33045073375262e-06, + "loss": 0.0919, + "step": 31040 + }, + { + "epoch": 1.6273584905660377, + "grad_norm": 1.490185022354126, + "learning_rate": 9.317348008385744e-06, + "loss": 0.1212, + "step": 31050 + }, + { + "epoch": 1.6278825995807127, + "grad_norm": 2.0159690380096436, + "learning_rate": 9.30424528301887e-06, + "loss": 0.0724, + "step": 31060 + }, + { + "epoch": 1.6284067085953877, + "grad_norm": 1.2287893295288086, + "learning_rate": 9.291142557651993e-06, + "loss": 0.0487, + "step": 31070 + }, + { + "epoch": 1.6289308176100628, + "grad_norm": 1.0067164897918701, + "learning_rate": 9.278039832285116e-06, + "loss": 0.0842, + "step": 31080 + }, + { + "epoch": 1.6294549266247378, + "grad_norm": 2.069939374923706, + "learning_rate": 9.26493710691824e-06, + "loss": 0.0636, + "step": 31090 + }, + { + "epoch": 1.629979035639413, + "grad_norm": 1.2028172016143799, + "learning_rate": 9.251834381551364e-06, + "loss": 0.1012, + "step": 31100 + }, + { + "epoch": 1.630503144654088, + "grad_norm": 3.1692330837249756, + "learning_rate": 9.238731656184486e-06, + "loss": 0.0742, + "step": 31110 + }, + { + "epoch": 1.631027253668763, + "grad_norm": 1.7992483377456665, + "learning_rate": 9.225628930817611e-06, + "loss": 0.0736, + "step": 31120 + }, + { + "epoch": 1.6315513626834381, + "grad_norm": 1.6034457683563232, + "learning_rate": 9.212526205450735e-06, + "loss": 0.0703, + "step": 31130 + }, + { + "epoch": 1.6320754716981132, + "grad_norm": 1.4430203437805176, + "learning_rate": 9.199423480083858e-06, + "loss": 0.1016, + "step": 31140 + }, + { + "epoch": 1.6325995807127882, + "grad_norm": 1.9012161493301392, + "learning_rate": 9.186320754716981e-06, + "loss": 0.0933, + "step": 31150 + }, + { + "epoch": 1.6331236897274635, + "grad_norm": 1.7923085689544678, + "learning_rate": 9.173218029350106e-06, + "loss": 0.0862, + "step": 31160 + }, + { + "epoch": 1.6336477987421385, + "grad_norm": 0.35635894536972046, + "learning_rate": 9.160115303983228e-06, + "loss": 0.0741, + "step": 31170 + }, + { + "epoch": 1.6341719077568135, + "grad_norm": 1.7451974153518677, + "learning_rate": 9.147012578616353e-06, + "loss": 0.0802, + "step": 31180 + }, + { + "epoch": 1.6346960167714886, + "grad_norm": 1.976898431777954, + "learning_rate": 9.133909853249477e-06, + "loss": 0.0759, + "step": 31190 + }, + { + "epoch": 1.6352201257861636, + "grad_norm": 2.0606613159179688, + "learning_rate": 9.1208071278826e-06, + "loss": 0.0785, + "step": 31200 + }, + { + "epoch": 1.6357442348008386, + "grad_norm": 1.5760445594787598, + "learning_rate": 9.107704402515723e-06, + "loss": 0.0753, + "step": 31210 + }, + { + "epoch": 1.6362683438155137, + "grad_norm": 1.2342818975448608, + "learning_rate": 9.094601677148848e-06, + "loss": 0.0619, + "step": 31220 + }, + { + "epoch": 1.6367924528301887, + "grad_norm": 0.7712125778198242, + "learning_rate": 9.08149895178197e-06, + "loss": 0.0603, + "step": 31230 + }, + { + "epoch": 1.6373165618448637, + "grad_norm": 0.5956855416297913, + "learning_rate": 9.068396226415095e-06, + "loss": 0.0714, + "step": 31240 + }, + { + "epoch": 1.6378406708595388, + "grad_norm": 2.2633252143859863, + "learning_rate": 9.055293501048219e-06, + "loss": 0.0811, + "step": 31250 + }, + { + "epoch": 1.6383647798742138, + "grad_norm": 1.7489700317382812, + "learning_rate": 9.042190775681342e-06, + "loss": 0.0803, + "step": 31260 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 1.7314835786819458, + "learning_rate": 9.029088050314465e-06, + "loss": 0.0852, + "step": 31270 + }, + { + "epoch": 1.6394129979035639, + "grad_norm": 1.1030824184417725, + "learning_rate": 9.01598532494759e-06, + "loss": 0.0644, + "step": 31280 + }, + { + "epoch": 1.639937106918239, + "grad_norm": 0.5983395576477051, + "learning_rate": 9.002882599580712e-06, + "loss": 0.0758, + "step": 31290 + }, + { + "epoch": 1.640461215932914, + "grad_norm": 0.8161409497261047, + "learning_rate": 8.989779874213837e-06, + "loss": 0.0666, + "step": 31300 + }, + { + "epoch": 1.640985324947589, + "grad_norm": 2.6947455406188965, + "learning_rate": 8.97667714884696e-06, + "loss": 0.0755, + "step": 31310 + }, + { + "epoch": 1.641509433962264, + "grad_norm": 1.2531763315200806, + "learning_rate": 8.963574423480084e-06, + "loss": 0.0996, + "step": 31320 + }, + { + "epoch": 1.642033542976939, + "grad_norm": 1.8375675678253174, + "learning_rate": 8.950471698113207e-06, + "loss": 0.067, + "step": 31330 + }, + { + "epoch": 1.6425576519916143, + "grad_norm": 2.1205642223358154, + "learning_rate": 8.937368972746332e-06, + "loss": 0.0675, + "step": 31340 + }, + { + "epoch": 1.6430817610062893, + "grad_norm": 1.071904182434082, + "learning_rate": 8.924266247379456e-06, + "loss": 0.0656, + "step": 31350 + }, + { + "epoch": 1.6436058700209644, + "grad_norm": 1.9515632390975952, + "learning_rate": 8.91116352201258e-06, + "loss": 0.0793, + "step": 31360 + }, + { + "epoch": 1.6441299790356394, + "grad_norm": 1.9191174507141113, + "learning_rate": 8.898060796645703e-06, + "loss": 0.0645, + "step": 31370 + }, + { + "epoch": 1.6446540880503144, + "grad_norm": 1.7642149925231934, + "learning_rate": 8.884958071278826e-06, + "loss": 0.0743, + "step": 31380 + }, + { + "epoch": 1.6451781970649895, + "grad_norm": 3.854020595550537, + "learning_rate": 8.87185534591195e-06, + "loss": 0.0908, + "step": 31390 + }, + { + "epoch": 1.6457023060796647, + "grad_norm": 1.7950454950332642, + "learning_rate": 8.858752620545074e-06, + "loss": 0.0827, + "step": 31400 + }, + { + "epoch": 1.6462264150943398, + "grad_norm": 39.41985321044922, + "learning_rate": 8.845649895178198e-06, + "loss": 0.0732, + "step": 31410 + }, + { + "epoch": 1.6467505241090148, + "grad_norm": 1.0624687671661377, + "learning_rate": 8.832547169811321e-06, + "loss": 0.073, + "step": 31420 + }, + { + "epoch": 1.6472746331236898, + "grad_norm": 1.975717544555664, + "learning_rate": 8.819444444444445e-06, + "loss": 0.1077, + "step": 31430 + }, + { + "epoch": 1.6477987421383649, + "grad_norm": 1.54432213306427, + "learning_rate": 8.806341719077568e-06, + "loss": 0.0686, + "step": 31440 + }, + { + "epoch": 1.64832285115304, + "grad_norm": 1.4304097890853882, + "learning_rate": 8.793238993710693e-06, + "loss": 0.0734, + "step": 31450 + }, + { + "epoch": 1.648846960167715, + "grad_norm": 1.6963316202163696, + "learning_rate": 8.780136268343816e-06, + "loss": 0.0625, + "step": 31460 + }, + { + "epoch": 1.64937106918239, + "grad_norm": 2.143986463546753, + "learning_rate": 8.76703354297694e-06, + "loss": 0.1018, + "step": 31470 + }, + { + "epoch": 1.649895178197065, + "grad_norm": 2.237755537033081, + "learning_rate": 8.753930817610063e-06, + "loss": 0.0835, + "step": 31480 + }, + { + "epoch": 1.65041928721174, + "grad_norm": 1.3473063707351685, + "learning_rate": 8.740828092243188e-06, + "loss": 0.0719, + "step": 31490 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 2.768249273300171, + "learning_rate": 8.72772536687631e-06, + "loss": 0.0754, + "step": 31500 + }, + { + "epoch": 1.65146750524109, + "grad_norm": 2.2245090007781982, + "learning_rate": 8.714622641509435e-06, + "loss": 0.0825, + "step": 31510 + }, + { + "epoch": 1.6519916142557651, + "grad_norm": 0.9257903695106506, + "learning_rate": 8.701519916142558e-06, + "loss": 0.1143, + "step": 31520 + }, + { + "epoch": 1.6525157232704402, + "grad_norm": 2.393784999847412, + "learning_rate": 8.688417190775682e-06, + "loss": 0.0977, + "step": 31530 + }, + { + "epoch": 1.6530398322851152, + "grad_norm": 0.9498158693313599, + "learning_rate": 8.675314465408805e-06, + "loss": 0.0735, + "step": 31540 + }, + { + "epoch": 1.6535639412997902, + "grad_norm": 1.3141714334487915, + "learning_rate": 8.66221174004193e-06, + "loss": 0.0538, + "step": 31550 + }, + { + "epoch": 1.6540880503144653, + "grad_norm": 1.4560273885726929, + "learning_rate": 8.649109014675052e-06, + "loss": 0.0747, + "step": 31560 + }, + { + "epoch": 1.6546121593291403, + "grad_norm": 2.20974063873291, + "learning_rate": 8.636006289308177e-06, + "loss": 0.0816, + "step": 31570 + }, + { + "epoch": 1.6551362683438156, + "grad_norm": 1.07097327709198, + "learning_rate": 8.6229035639413e-06, + "loss": 0.085, + "step": 31580 + }, + { + "epoch": 1.6556603773584906, + "grad_norm": 2.0411245822906494, + "learning_rate": 8.609800838574424e-06, + "loss": 0.1071, + "step": 31590 + }, + { + "epoch": 1.6561844863731656, + "grad_norm": 2.4444754123687744, + "learning_rate": 8.596698113207547e-06, + "loss": 0.1079, + "step": 31600 + }, + { + "epoch": 1.6567085953878407, + "grad_norm": 1.7422720193862915, + "learning_rate": 8.583595387840672e-06, + "loss": 0.0527, + "step": 31610 + }, + { + "epoch": 1.6572327044025157, + "grad_norm": 1.8651151657104492, + "learning_rate": 8.570492662473794e-06, + "loss": 0.0854, + "step": 31620 + }, + { + "epoch": 1.6577568134171907, + "grad_norm": 1.7437045574188232, + "learning_rate": 8.557389937106919e-06, + "loss": 0.0758, + "step": 31630 + }, + { + "epoch": 1.658280922431866, + "grad_norm": 1.2671029567718506, + "learning_rate": 8.544287211740042e-06, + "loss": 0.0868, + "step": 31640 + }, + { + "epoch": 1.658805031446541, + "grad_norm": 1.625141978263855, + "learning_rate": 8.531184486373166e-06, + "loss": 0.0598, + "step": 31650 + }, + { + "epoch": 1.659329140461216, + "grad_norm": 1.3473106622695923, + "learning_rate": 8.51808176100629e-06, + "loss": 0.0527, + "step": 31660 + }, + { + "epoch": 1.659853249475891, + "grad_norm": 1.9228523969650269, + "learning_rate": 8.504979035639414e-06, + "loss": 0.0719, + "step": 31670 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 2.128453016281128, + "learning_rate": 8.491876310272536e-06, + "loss": 0.1052, + "step": 31680 + }, + { + "epoch": 1.6609014675052411, + "grad_norm": 1.360034465789795, + "learning_rate": 8.478773584905661e-06, + "loss": 0.0727, + "step": 31690 + }, + { + "epoch": 1.6614255765199162, + "grad_norm": 1.688598394393921, + "learning_rate": 8.465670859538784e-06, + "loss": 0.0994, + "step": 31700 + }, + { + "epoch": 1.6619496855345912, + "grad_norm": 1.1820000410079956, + "learning_rate": 8.452568134171908e-06, + "loss": 0.073, + "step": 31710 + }, + { + "epoch": 1.6624737945492662, + "grad_norm": 1.195401906967163, + "learning_rate": 8.439465408805031e-06, + "loss": 0.0593, + "step": 31720 + }, + { + "epoch": 1.6629979035639413, + "grad_norm": 1.9621461629867554, + "learning_rate": 8.426362683438156e-06, + "loss": 0.0728, + "step": 31730 + }, + { + "epoch": 1.6635220125786163, + "grad_norm": 2.7265822887420654, + "learning_rate": 8.41325995807128e-06, + "loss": 0.0825, + "step": 31740 + }, + { + "epoch": 1.6640461215932913, + "grad_norm": 0.510811984539032, + "learning_rate": 8.400157232704403e-06, + "loss": 0.0595, + "step": 31750 + }, + { + "epoch": 1.6645702306079664, + "grad_norm": 1.9401181936264038, + "learning_rate": 8.387054507337526e-06, + "loss": 0.095, + "step": 31760 + }, + { + "epoch": 1.6650943396226414, + "grad_norm": 0.9660150408744812, + "learning_rate": 8.37395178197065e-06, + "loss": 0.0613, + "step": 31770 + }, + { + "epoch": 1.6656184486373165, + "grad_norm": 1.6908096075057983, + "learning_rate": 8.360849056603775e-06, + "loss": 0.076, + "step": 31780 + }, + { + "epoch": 1.6661425576519915, + "grad_norm": 2.0633528232574463, + "learning_rate": 8.347746331236898e-06, + "loss": 0.0697, + "step": 31790 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.6293288469314575, + "learning_rate": 8.334643605870022e-06, + "loss": 0.0751, + "step": 31800 + }, + { + "epoch": 1.6671907756813418, + "grad_norm": 2.2690975666046143, + "learning_rate": 8.321540880503145e-06, + "loss": 0.0903, + "step": 31810 + }, + { + "epoch": 1.6677148846960168, + "grad_norm": 1.2514418363571167, + "learning_rate": 8.308438155136269e-06, + "loss": 0.0881, + "step": 31820 + }, + { + "epoch": 1.6682389937106918, + "grad_norm": 1.7849570512771606, + "learning_rate": 8.295335429769392e-06, + "loss": 0.0651, + "step": 31830 + }, + { + "epoch": 1.6687631027253669, + "grad_norm": 1.8932183980941772, + "learning_rate": 8.282232704402517e-06, + "loss": 0.0693, + "step": 31840 + }, + { + "epoch": 1.669287211740042, + "grad_norm": 1.2900819778442383, + "learning_rate": 8.269129979035639e-06, + "loss": 0.0743, + "step": 31850 + }, + { + "epoch": 1.669811320754717, + "grad_norm": 2.7592389583587646, + "learning_rate": 8.256027253668764e-06, + "loss": 0.0761, + "step": 31860 + }, + { + "epoch": 1.6703354297693922, + "grad_norm": 2.449463367462158, + "learning_rate": 8.242924528301887e-06, + "loss": 0.0778, + "step": 31870 + }, + { + "epoch": 1.6708595387840672, + "grad_norm": 1.5721780061721802, + "learning_rate": 8.22982180293501e-06, + "loss": 0.0703, + "step": 31880 + }, + { + "epoch": 1.6713836477987423, + "grad_norm": 1.907634973526001, + "learning_rate": 8.216719077568134e-06, + "loss": 0.0777, + "step": 31890 + }, + { + "epoch": 1.6719077568134173, + "grad_norm": 1.0545610189437866, + "learning_rate": 8.203616352201259e-06, + "loss": 0.0707, + "step": 31900 + }, + { + "epoch": 1.6724318658280923, + "grad_norm": 2.518768072128296, + "learning_rate": 8.19051362683438e-06, + "loss": 0.0736, + "step": 31910 + }, + { + "epoch": 1.6729559748427674, + "grad_norm": 1.4451135396957397, + "learning_rate": 8.177410901467506e-06, + "loss": 0.0736, + "step": 31920 + }, + { + "epoch": 1.6734800838574424, + "grad_norm": 1.5687718391418457, + "learning_rate": 8.164308176100629e-06, + "loss": 0.0707, + "step": 31930 + }, + { + "epoch": 1.6740041928721174, + "grad_norm": 1.0529969930648804, + "learning_rate": 8.151205450733753e-06, + "loss": 0.0878, + "step": 31940 + }, + { + "epoch": 1.6745283018867925, + "grad_norm": 1.8931552171707153, + "learning_rate": 8.138102725366876e-06, + "loss": 0.0829, + "step": 31950 + }, + { + "epoch": 1.6750524109014675, + "grad_norm": 1.5445560216903687, + "learning_rate": 8.125000000000001e-06, + "loss": 0.0655, + "step": 31960 + }, + { + "epoch": 1.6755765199161425, + "grad_norm": 1.209445834159851, + "learning_rate": 8.111897274633123e-06, + "loss": 0.0687, + "step": 31970 + }, + { + "epoch": 1.6761006289308176, + "grad_norm": 1.4542816877365112, + "learning_rate": 8.098794549266248e-06, + "loss": 0.0859, + "step": 31980 + }, + { + "epoch": 1.6766247379454926, + "grad_norm": 1.4635159969329834, + "learning_rate": 8.085691823899371e-06, + "loss": 0.0772, + "step": 31990 + }, + { + "epoch": 1.6771488469601676, + "grad_norm": 1.0781625509262085, + "learning_rate": 8.072589098532495e-06, + "loss": 0.0818, + "step": 32000 + }, + { + "epoch": 1.6771488469601676, + "eval_loss": 0.2654711604118347, + "eval_runtime": 266.8065, + "eval_samples_per_second": 7.462, + "eval_steps_per_second": 1.244, + "step": 32000 + }, + { + "epoch": 1.6776729559748427, + "grad_norm": 1.2641340494155884, + "learning_rate": 8.059486373165618e-06, + "loss": 0.0691, + "step": 32010 + }, + { + "epoch": 1.6781970649895177, + "grad_norm": 0.6858422756195068, + "learning_rate": 8.046383647798743e-06, + "loss": 0.0607, + "step": 32020 + }, + { + "epoch": 1.6787211740041927, + "grad_norm": 1.2624872922897339, + "learning_rate": 8.033280922431866e-06, + "loss": 0.0675, + "step": 32030 + }, + { + "epoch": 1.6792452830188678, + "grad_norm": 1.3456783294677734, + "learning_rate": 8.02017819706499e-06, + "loss": 0.0799, + "step": 32040 + }, + { + "epoch": 1.679769392033543, + "grad_norm": 1.8370672464370728, + "learning_rate": 8.007075471698113e-06, + "loss": 0.0857, + "step": 32050 + }, + { + "epoch": 1.680293501048218, + "grad_norm": 0.7102406024932861, + "learning_rate": 7.993972746331237e-06, + "loss": 0.0552, + "step": 32060 + }, + { + "epoch": 1.680817610062893, + "grad_norm": 1.2570568323135376, + "learning_rate": 7.980870020964362e-06, + "loss": 0.0611, + "step": 32070 + }, + { + "epoch": 1.6813417190775681, + "grad_norm": 1.3005815744400024, + "learning_rate": 7.967767295597485e-06, + "loss": 0.0898, + "step": 32080 + }, + { + "epoch": 1.6818658280922432, + "grad_norm": 1.8173960447311401, + "learning_rate": 7.954664570230608e-06, + "loss": 0.0866, + "step": 32090 + }, + { + "epoch": 1.6823899371069182, + "grad_norm": 0.9965292811393738, + "learning_rate": 7.941561844863732e-06, + "loss": 0.0599, + "step": 32100 + }, + { + "epoch": 1.6829140461215935, + "grad_norm": 2.850111246109009, + "learning_rate": 7.928459119496857e-06, + "loss": 0.0999, + "step": 32110 + }, + { + "epoch": 1.6834381551362685, + "grad_norm": 1.4791312217712402, + "learning_rate": 7.915356394129979e-06, + "loss": 0.0638, + "step": 32120 + }, + { + "epoch": 1.6839622641509435, + "grad_norm": 1.24795401096344, + "learning_rate": 7.902253668763104e-06, + "loss": 0.0735, + "step": 32130 + }, + { + "epoch": 1.6844863731656186, + "grad_norm": 1.7736852169036865, + "learning_rate": 7.889150943396227e-06, + "loss": 0.0833, + "step": 32140 + }, + { + "epoch": 1.6850104821802936, + "grad_norm": 2.36114501953125, + "learning_rate": 7.87604821802935e-06, + "loss": 0.0716, + "step": 32150 + }, + { + "epoch": 1.6855345911949686, + "grad_norm": 2.078085422515869, + "learning_rate": 7.862945492662474e-06, + "loss": 0.0707, + "step": 32160 + }, + { + "epoch": 1.6860587002096437, + "grad_norm": 2.08408784866333, + "learning_rate": 7.849842767295599e-06, + "loss": 0.0796, + "step": 32170 + }, + { + "epoch": 1.6865828092243187, + "grad_norm": 1.5705268383026123, + "learning_rate": 7.83674004192872e-06, + "loss": 0.0968, + "step": 32180 + }, + { + "epoch": 1.6871069182389937, + "grad_norm": 2.4469356536865234, + "learning_rate": 7.823637316561846e-06, + "loss": 0.0784, + "step": 32190 + }, + { + "epoch": 1.6876310272536688, + "grad_norm": 2.0996434688568115, + "learning_rate": 7.810534591194969e-06, + "loss": 0.0739, + "step": 32200 + }, + { + "epoch": 1.6881551362683438, + "grad_norm": 1.6528130769729614, + "learning_rate": 7.797431865828092e-06, + "loss": 0.0769, + "step": 32210 + }, + { + "epoch": 1.6886792452830188, + "grad_norm": 1.4783482551574707, + "learning_rate": 7.784329140461216e-06, + "loss": 0.075, + "step": 32220 + }, + { + "epoch": 1.6892033542976939, + "grad_norm": 1.4927986860275269, + "learning_rate": 7.771226415094341e-06, + "loss": 0.0874, + "step": 32230 + }, + { + "epoch": 1.689727463312369, + "grad_norm": 1.3704826831817627, + "learning_rate": 7.758123689727463e-06, + "loss": 0.0762, + "step": 32240 + }, + { + "epoch": 1.690251572327044, + "grad_norm": 1.3907643556594849, + "learning_rate": 7.745020964360588e-06, + "loss": 0.0639, + "step": 32250 + }, + { + "epoch": 1.690775681341719, + "grad_norm": 3.422758102416992, + "learning_rate": 7.731918238993711e-06, + "loss": 0.0658, + "step": 32260 + }, + { + "epoch": 1.691299790356394, + "grad_norm": 1.4427984952926636, + "learning_rate": 7.718815513626834e-06, + "loss": 0.1001, + "step": 32270 + }, + { + "epoch": 1.691823899371069, + "grad_norm": 2.1378979682922363, + "learning_rate": 7.705712788259958e-06, + "loss": 0.0795, + "step": 32280 + }, + { + "epoch": 1.6923480083857443, + "grad_norm": 1.7514784336090088, + "learning_rate": 7.692610062893083e-06, + "loss": 0.064, + "step": 32290 + }, + { + "epoch": 1.6928721174004193, + "grad_norm": 1.7054566144943237, + "learning_rate": 7.679507337526205e-06, + "loss": 0.0541, + "step": 32300 + }, + { + "epoch": 1.6933962264150944, + "grad_norm": 0.8755590319633484, + "learning_rate": 7.66640461215933e-06, + "loss": 0.06, + "step": 32310 + }, + { + "epoch": 1.6939203354297694, + "grad_norm": 1.724295735359192, + "learning_rate": 7.653301886792453e-06, + "loss": 0.0478, + "step": 32320 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 1.4551647901535034, + "learning_rate": 7.640199161425576e-06, + "loss": 0.0612, + "step": 32330 + }, + { + "epoch": 1.6949685534591195, + "grad_norm": 1.4933216571807861, + "learning_rate": 7.627096436058701e-06, + "loss": 0.0804, + "step": 32340 + }, + { + "epoch": 1.6954926624737947, + "grad_norm": 2.513120651245117, + "learning_rate": 7.613993710691825e-06, + "loss": 0.0713, + "step": 32350 + }, + { + "epoch": 1.6960167714884697, + "grad_norm": 1.3239364624023438, + "learning_rate": 7.600890985324947e-06, + "loss": 0.0798, + "step": 32360 + }, + { + "epoch": 1.6965408805031448, + "grad_norm": 2.5568461418151855, + "learning_rate": 7.587788259958072e-06, + "loss": 0.0703, + "step": 32370 + }, + { + "epoch": 1.6970649895178198, + "grad_norm": 1.7434065341949463, + "learning_rate": 7.574685534591196e-06, + "loss": 0.0681, + "step": 32380 + }, + { + "epoch": 1.6975890985324948, + "grad_norm": 3.6494479179382324, + "learning_rate": 7.561582809224318e-06, + "loss": 0.0823, + "step": 32390 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 1.6936825513839722, + "learning_rate": 7.548480083857443e-06, + "loss": 0.0895, + "step": 32400 + }, + { + "epoch": 1.698637316561845, + "grad_norm": 1.0573102235794067, + "learning_rate": 7.535377358490567e-06, + "loss": 0.073, + "step": 32410 + }, + { + "epoch": 1.69916142557652, + "grad_norm": 0.9793390035629272, + "learning_rate": 7.522274633123689e-06, + "loss": 0.0593, + "step": 32420 + }, + { + "epoch": 1.699685534591195, + "grad_norm": 1.1624884605407715, + "learning_rate": 7.509171907756814e-06, + "loss": 0.0583, + "step": 32430 + }, + { + "epoch": 1.70020964360587, + "grad_norm": 1.6360301971435547, + "learning_rate": 7.496069182389938e-06, + "loss": 0.0801, + "step": 32440 + }, + { + "epoch": 1.700733752620545, + "grad_norm": 2.4007883071899414, + "learning_rate": 7.48296645702306e-06, + "loss": 0.0979, + "step": 32450 + }, + { + "epoch": 1.70125786163522, + "grad_norm": 1.2194461822509766, + "learning_rate": 7.469863731656185e-06, + "loss": 0.0702, + "step": 32460 + }, + { + "epoch": 1.7017819706498951, + "grad_norm": 0.7657409906387329, + "learning_rate": 7.456761006289309e-06, + "loss": 0.0647, + "step": 32470 + }, + { + "epoch": 1.7023060796645701, + "grad_norm": 3.282167434692383, + "learning_rate": 7.4436582809224314e-06, + "loss": 0.095, + "step": 32480 + }, + { + "epoch": 1.7028301886792452, + "grad_norm": 1.2034971714019775, + "learning_rate": 7.430555555555556e-06, + "loss": 0.0901, + "step": 32490 + }, + { + "epoch": 1.7033542976939202, + "grad_norm": 2.2986252307891846, + "learning_rate": 7.41745283018868e-06, + "loss": 0.0612, + "step": 32500 + }, + { + "epoch": 1.7038784067085953, + "grad_norm": 0.8976192474365234, + "learning_rate": 7.4043501048218024e-06, + "loss": 0.0573, + "step": 32510 + }, + { + "epoch": 1.7044025157232703, + "grad_norm": 1.3209949731826782, + "learning_rate": 7.391247379454927e-06, + "loss": 0.069, + "step": 32520 + }, + { + "epoch": 1.7049266247379455, + "grad_norm": 3.4305360317230225, + "learning_rate": 7.378144654088051e-06, + "loss": 0.0876, + "step": 32530 + }, + { + "epoch": 1.7054507337526206, + "grad_norm": 2.0367486476898193, + "learning_rate": 7.365041928721174e-06, + "loss": 0.1007, + "step": 32540 + }, + { + "epoch": 1.7059748427672956, + "grad_norm": 1.563010334968567, + "learning_rate": 7.351939203354298e-06, + "loss": 0.0704, + "step": 32550 + }, + { + "epoch": 1.7064989517819706, + "grad_norm": 1.1888530254364014, + "learning_rate": 7.338836477987422e-06, + "loss": 0.0697, + "step": 32560 + }, + { + "epoch": 1.7070230607966457, + "grad_norm": 2.30206561088562, + "learning_rate": 7.325733752620545e-06, + "loss": 0.0774, + "step": 32570 + }, + { + "epoch": 1.7075471698113207, + "grad_norm": 1.4187016487121582, + "learning_rate": 7.3126310272536695e-06, + "loss": 0.0823, + "step": 32580 + }, + { + "epoch": 1.708071278825996, + "grad_norm": 1.6031312942504883, + "learning_rate": 7.299528301886793e-06, + "loss": 0.0951, + "step": 32590 + }, + { + "epoch": 1.708595387840671, + "grad_norm": 1.295620083808899, + "learning_rate": 7.286425576519916e-06, + "loss": 0.0878, + "step": 32600 + }, + { + "epoch": 1.709119496855346, + "grad_norm": 1.4431999921798706, + "learning_rate": 7.2733228511530405e-06, + "loss": 0.0847, + "step": 32610 + }, + { + "epoch": 1.709643605870021, + "grad_norm": 1.8405075073242188, + "learning_rate": 7.260220125786165e-06, + "loss": 0.0621, + "step": 32620 + }, + { + "epoch": 1.710167714884696, + "grad_norm": 0.9919503927230835, + "learning_rate": 7.247117400419287e-06, + "loss": 0.064, + "step": 32630 + }, + { + "epoch": 1.7106918238993711, + "grad_norm": 1.1123491525650024, + "learning_rate": 7.2340146750524115e-06, + "loss": 0.0621, + "step": 32640 + }, + { + "epoch": 1.7112159329140462, + "grad_norm": 1.684038758277893, + "learning_rate": 7.220911949685536e-06, + "loss": 0.0957, + "step": 32650 + }, + { + "epoch": 1.7117400419287212, + "grad_norm": 1.1205902099609375, + "learning_rate": 7.207809224318658e-06, + "loss": 0.094, + "step": 32660 + }, + { + "epoch": 1.7122641509433962, + "grad_norm": 2.6448683738708496, + "learning_rate": 7.1947064989517825e-06, + "loss": 0.0735, + "step": 32670 + }, + { + "epoch": 1.7127882599580713, + "grad_norm": 1.614770531654358, + "learning_rate": 7.181603773584907e-06, + "loss": 0.0694, + "step": 32680 + }, + { + "epoch": 1.7133123689727463, + "grad_norm": 1.2186577320098877, + "learning_rate": 7.168501048218029e-06, + "loss": 0.0793, + "step": 32690 + }, + { + "epoch": 1.7138364779874213, + "grad_norm": 2.874343156814575, + "learning_rate": 7.1553983228511535e-06, + "loss": 0.0806, + "step": 32700 + }, + { + "epoch": 1.7143605870020964, + "grad_norm": 1.3548649549484253, + "learning_rate": 7.142295597484278e-06, + "loss": 0.075, + "step": 32710 + }, + { + "epoch": 1.7148846960167714, + "grad_norm": 0.8531060218811035, + "learning_rate": 7.1291928721174e-06, + "loss": 0.0579, + "step": 32720 + }, + { + "epoch": 1.7154088050314464, + "grad_norm": 1.160235047340393, + "learning_rate": 7.1160901467505245e-06, + "loss": 0.0667, + "step": 32730 + }, + { + "epoch": 1.7159329140461215, + "grad_norm": 1.3317363262176514, + "learning_rate": 7.102987421383649e-06, + "loss": 0.0869, + "step": 32740 + }, + { + "epoch": 1.7164570230607965, + "grad_norm": 1.2752031087875366, + "learning_rate": 7.089884696016771e-06, + "loss": 0.081, + "step": 32750 + }, + { + "epoch": 1.7169811320754715, + "grad_norm": 1.738019585609436, + "learning_rate": 7.0767819706498955e-06, + "loss": 0.0777, + "step": 32760 + }, + { + "epoch": 1.7175052410901468, + "grad_norm": 1.576297402381897, + "learning_rate": 7.06367924528302e-06, + "loss": 0.0893, + "step": 32770 + }, + { + "epoch": 1.7180293501048218, + "grad_norm": 1.2655184268951416, + "learning_rate": 7.050576519916142e-06, + "loss": 0.0678, + "step": 32780 + }, + { + "epoch": 1.7185534591194969, + "grad_norm": 3.7932636737823486, + "learning_rate": 7.0374737945492665e-06, + "loss": 0.0686, + "step": 32790 + }, + { + "epoch": 1.719077568134172, + "grad_norm": 1.3198883533477783, + "learning_rate": 7.024371069182391e-06, + "loss": 0.076, + "step": 32800 + }, + { + "epoch": 1.719601677148847, + "grad_norm": 1.3575078248977661, + "learning_rate": 7.011268343815513e-06, + "loss": 0.0739, + "step": 32810 + }, + { + "epoch": 1.720125786163522, + "grad_norm": 0.8939098715782166, + "learning_rate": 6.9981656184486375e-06, + "loss": 0.0742, + "step": 32820 + }, + { + "epoch": 1.7206498951781972, + "grad_norm": 2.349097728729248, + "learning_rate": 6.985062893081762e-06, + "loss": 0.0636, + "step": 32830 + }, + { + "epoch": 1.7211740041928723, + "grad_norm": 1.1938402652740479, + "learning_rate": 6.971960167714884e-06, + "loss": 0.0707, + "step": 32840 + }, + { + "epoch": 1.7216981132075473, + "grad_norm": 1.7119457721710205, + "learning_rate": 6.9588574423480085e-06, + "loss": 0.0935, + "step": 32850 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 1.0587613582611084, + "learning_rate": 6.945754716981133e-06, + "loss": 0.0601, + "step": 32860 + }, + { + "epoch": 1.7227463312368974, + "grad_norm": 1.8793436288833618, + "learning_rate": 6.932651991614255e-06, + "loss": 0.0823, + "step": 32870 + }, + { + "epoch": 1.7232704402515724, + "grad_norm": 2.296818733215332, + "learning_rate": 6.9195492662473795e-06, + "loss": 0.0805, + "step": 32880 + }, + { + "epoch": 1.7237945492662474, + "grad_norm": 1.4315125942230225, + "learning_rate": 6.906446540880504e-06, + "loss": 0.0796, + "step": 32890 + }, + { + "epoch": 1.7243186582809225, + "grad_norm": 1.7421220541000366, + "learning_rate": 6.893343815513627e-06, + "loss": 0.1007, + "step": 32900 + }, + { + "epoch": 1.7248427672955975, + "grad_norm": 1.3139567375183105, + "learning_rate": 6.8802410901467506e-06, + "loss": 0.0681, + "step": 32910 + }, + { + "epoch": 1.7253668763102725, + "grad_norm": 0.7437646389007568, + "learning_rate": 6.867138364779875e-06, + "loss": 0.0792, + "step": 32920 + }, + { + "epoch": 1.7258909853249476, + "grad_norm": 1.4558409452438354, + "learning_rate": 6.854035639412998e-06, + "loss": 0.0786, + "step": 32930 + }, + { + "epoch": 1.7264150943396226, + "grad_norm": 0.9515263438224792, + "learning_rate": 6.840932914046122e-06, + "loss": 0.0832, + "step": 32940 + }, + { + "epoch": 1.7269392033542976, + "grad_norm": 1.8956114053726196, + "learning_rate": 6.827830188679246e-06, + "loss": 0.062, + "step": 32950 + }, + { + "epoch": 1.7274633123689727, + "grad_norm": 0.8939383029937744, + "learning_rate": 6.814727463312369e-06, + "loss": 0.0942, + "step": 32960 + }, + { + "epoch": 1.7279874213836477, + "grad_norm": 1.30724036693573, + "learning_rate": 6.801624737945493e-06, + "loss": 0.0728, + "step": 32970 + }, + { + "epoch": 1.7285115303983227, + "grad_norm": 1.8635292053222656, + "learning_rate": 6.788522012578618e-06, + "loss": 0.0693, + "step": 32980 + }, + { + "epoch": 1.7290356394129978, + "grad_norm": 1.9945436716079712, + "learning_rate": 6.77541928721174e-06, + "loss": 0.0585, + "step": 32990 + }, + { + "epoch": 1.7295597484276728, + "grad_norm": 1.2557333707809448, + "learning_rate": 6.762316561844864e-06, + "loss": 0.0793, + "step": 33000 + }, + { + "epoch": 1.7295597484276728, + "eval_loss": 0.2656969130039215, + "eval_runtime": 268.277, + "eval_samples_per_second": 7.421, + "eval_steps_per_second": 1.238, + "step": 33000 + }, + { + "epoch": 1.730083857442348, + "grad_norm": 0.8602995872497559, + "learning_rate": 6.749213836477989e-06, + "loss": 0.0698, + "step": 33010 + }, + { + "epoch": 1.730607966457023, + "grad_norm": 1.8230172395706177, + "learning_rate": 6.736111111111111e-06, + "loss": 0.065, + "step": 33020 + }, + { + "epoch": 1.7311320754716981, + "grad_norm": 1.202260136604309, + "learning_rate": 6.723008385744235e-06, + "loss": 0.082, + "step": 33030 + }, + { + "epoch": 1.7316561844863732, + "grad_norm": 1.9522544145584106, + "learning_rate": 6.70990566037736e-06, + "loss": 0.0736, + "step": 33040 + }, + { + "epoch": 1.7321802935010482, + "grad_norm": 1.8593506813049316, + "learning_rate": 6.696802935010482e-06, + "loss": 0.0876, + "step": 33050 + }, + { + "epoch": 1.7327044025157232, + "grad_norm": 1.674597144126892, + "learning_rate": 6.683700209643606e-06, + "loss": 0.0905, + "step": 33060 + }, + { + "epoch": 1.7332285115303985, + "grad_norm": 0.780911922454834, + "learning_rate": 6.670597484276731e-06, + "loss": 0.0694, + "step": 33070 + }, + { + "epoch": 1.7337526205450735, + "grad_norm": 1.7897731065750122, + "learning_rate": 6.657494758909853e-06, + "loss": 0.0852, + "step": 33080 + }, + { + "epoch": 1.7342767295597485, + "grad_norm": 1.1359392404556274, + "learning_rate": 6.644392033542977e-06, + "loss": 0.0464, + "step": 33090 + }, + { + "epoch": 1.7348008385744236, + "grad_norm": 1.3630025386810303, + "learning_rate": 6.631289308176102e-06, + "loss": 0.0614, + "step": 33100 + }, + { + "epoch": 1.7353249475890986, + "grad_norm": 1.1544018983840942, + "learning_rate": 6.618186582809224e-06, + "loss": 0.0774, + "step": 33110 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 2.56064510345459, + "learning_rate": 6.605083857442348e-06, + "loss": 0.0788, + "step": 33120 + }, + { + "epoch": 1.7363731656184487, + "grad_norm": 1.8875850439071655, + "learning_rate": 6.591981132075473e-06, + "loss": 0.0934, + "step": 33130 + }, + { + "epoch": 1.7368972746331237, + "grad_norm": 1.464888334274292, + "learning_rate": 6.578878406708595e-06, + "loss": 0.0748, + "step": 33140 + }, + { + "epoch": 1.7374213836477987, + "grad_norm": 1.6824110746383667, + "learning_rate": 6.5657756813417194e-06, + "loss": 0.0636, + "step": 33150 + }, + { + "epoch": 1.7379454926624738, + "grad_norm": 1.7789379358291626, + "learning_rate": 6.552672955974844e-06, + "loss": 0.0628, + "step": 33160 + }, + { + "epoch": 1.7384696016771488, + "grad_norm": 1.5505372285842896, + "learning_rate": 6.539570230607966e-06, + "loss": 0.0688, + "step": 33170 + }, + { + "epoch": 1.7389937106918238, + "grad_norm": 1.3862566947937012, + "learning_rate": 6.5264675052410904e-06, + "loss": 0.0884, + "step": 33180 + }, + { + "epoch": 1.7395178197064989, + "grad_norm": 2.0535497665405273, + "learning_rate": 6.513364779874215e-06, + "loss": 0.0731, + "step": 33190 + }, + { + "epoch": 1.740041928721174, + "grad_norm": 2.2445552349090576, + "learning_rate": 6.500262054507337e-06, + "loss": 0.0587, + "step": 33200 + }, + { + "epoch": 1.740566037735849, + "grad_norm": 2.1130218505859375, + "learning_rate": 6.4871593291404614e-06, + "loss": 0.0546, + "step": 33210 + }, + { + "epoch": 1.741090146750524, + "grad_norm": 7.03181266784668, + "learning_rate": 6.474056603773586e-06, + "loss": 0.0894, + "step": 33220 + }, + { + "epoch": 1.741614255765199, + "grad_norm": 1.9390767812728882, + "learning_rate": 6.460953878406708e-06, + "loss": 0.068, + "step": 33230 + }, + { + "epoch": 1.742138364779874, + "grad_norm": 1.0925933122634888, + "learning_rate": 6.4478511530398324e-06, + "loss": 0.0773, + "step": 33240 + }, + { + "epoch": 1.7426624737945493, + "grad_norm": 2.1912240982055664, + "learning_rate": 6.434748427672957e-06, + "loss": 0.095, + "step": 33250 + }, + { + "epoch": 1.7431865828092243, + "grad_norm": 1.3166011571884155, + "learning_rate": 6.42164570230608e-06, + "loss": 0.0768, + "step": 33260 + }, + { + "epoch": 1.7437106918238994, + "grad_norm": 0.6529295444488525, + "learning_rate": 6.4085429769392034e-06, + "loss": 0.0857, + "step": 33270 + }, + { + "epoch": 1.7442348008385744, + "grad_norm": 1.3718055486679077, + "learning_rate": 6.395440251572328e-06, + "loss": 0.0739, + "step": 33280 + }, + { + "epoch": 1.7447589098532494, + "grad_norm": 1.3556514978408813, + "learning_rate": 6.382337526205451e-06, + "loss": 0.051, + "step": 33290 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 0.8113279938697815, + "learning_rate": 6.369234800838575e-06, + "loss": 0.0675, + "step": 33300 + }, + { + "epoch": 1.7458071278825997, + "grad_norm": 1.9193611145019531, + "learning_rate": 6.356132075471699e-06, + "loss": 0.1029, + "step": 33310 + }, + { + "epoch": 1.7463312368972748, + "grad_norm": 1.2385278940200806, + "learning_rate": 6.343029350104822e-06, + "loss": 0.0942, + "step": 33320 + }, + { + "epoch": 1.7468553459119498, + "grad_norm": 1.3271894454956055, + "learning_rate": 6.329926624737946e-06, + "loss": 0.0754, + "step": 33330 + }, + { + "epoch": 1.7473794549266248, + "grad_norm": 2.492251396179199, + "learning_rate": 6.3168238993710705e-06, + "loss": 0.0789, + "step": 33340 + }, + { + "epoch": 1.7479035639412999, + "grad_norm": 1.195293664932251, + "learning_rate": 6.303721174004193e-06, + "loss": 0.0594, + "step": 33350 + }, + { + "epoch": 1.748427672955975, + "grad_norm": 1.112662672996521, + "learning_rate": 6.290618448637317e-06, + "loss": 0.0632, + "step": 33360 + }, + { + "epoch": 1.74895178197065, + "grad_norm": 0.9955492615699768, + "learning_rate": 6.2775157232704415e-06, + "loss": 0.0686, + "step": 33370 + }, + { + "epoch": 1.749475890985325, + "grad_norm": 1.3373956680297852, + "learning_rate": 6.264412997903564e-06, + "loss": 0.0725, + "step": 33380 + }, + { + "epoch": 1.75, + "grad_norm": 1.872594952583313, + "learning_rate": 6.251310272536688e-06, + "loss": 0.0652, + "step": 33390 + }, + { + "epoch": 1.750524109014675, + "grad_norm": 1.3409441709518433, + "learning_rate": 6.238207547169812e-06, + "loss": 0.0834, + "step": 33400 + }, + { + "epoch": 1.75104821802935, + "grad_norm": 0.4260615110397339, + "learning_rate": 6.225104821802935e-06, + "loss": 0.0771, + "step": 33410 + }, + { + "epoch": 1.751572327044025, + "grad_norm": 2.4701662063598633, + "learning_rate": 6.212002096436059e-06, + "loss": 0.0763, + "step": 33420 + }, + { + "epoch": 1.7520964360587001, + "grad_norm": 0.7983285188674927, + "learning_rate": 6.198899371069183e-06, + "loss": 0.057, + "step": 33430 + }, + { + "epoch": 1.7526205450733752, + "grad_norm": 1.0928597450256348, + "learning_rate": 6.185796645702306e-06, + "loss": 0.0618, + "step": 33440 + }, + { + "epoch": 1.7531446540880502, + "grad_norm": 1.12595534324646, + "learning_rate": 6.17269392033543e-06, + "loss": 0.066, + "step": 33450 + }, + { + "epoch": 1.7536687631027252, + "grad_norm": 1.363580346107483, + "learning_rate": 6.159591194968554e-06, + "loss": 0.0926, + "step": 33460 + }, + { + "epoch": 1.7541928721174003, + "grad_norm": 2.26957106590271, + "learning_rate": 6.146488469601677e-06, + "loss": 0.0755, + "step": 33470 + }, + { + "epoch": 1.7547169811320755, + "grad_norm": 1.3690286874771118, + "learning_rate": 6.133385744234801e-06, + "loss": 0.102, + "step": 33480 + }, + { + "epoch": 1.7552410901467506, + "grad_norm": 1.2949638366699219, + "learning_rate": 6.120283018867925e-06, + "loss": 0.0681, + "step": 33490 + }, + { + "epoch": 1.7557651991614256, + "grad_norm": 2.4410061836242676, + "learning_rate": 6.107180293501048e-06, + "loss": 0.0868, + "step": 33500 + }, + { + "epoch": 1.7562893081761006, + "grad_norm": 1.083730936050415, + "learning_rate": 6.094077568134172e-06, + "loss": 0.0666, + "step": 33510 + }, + { + "epoch": 1.7568134171907757, + "grad_norm": 1.5479590892791748, + "learning_rate": 6.080974842767296e-06, + "loss": 0.055, + "step": 33520 + }, + { + "epoch": 1.7573375262054507, + "grad_norm": 1.6351350545883179, + "learning_rate": 6.067872117400419e-06, + "loss": 0.086, + "step": 33530 + }, + { + "epoch": 1.757861635220126, + "grad_norm": 0.9401642680168152, + "learning_rate": 6.054769392033543e-06, + "loss": 0.0827, + "step": 33540 + }, + { + "epoch": 1.758385744234801, + "grad_norm": 3.313169240951538, + "learning_rate": 6.041666666666667e-06, + "loss": 0.0769, + "step": 33550 + }, + { + "epoch": 1.758909853249476, + "grad_norm": 0.568518340587616, + "learning_rate": 6.02856394129979e-06, + "loss": 0.082, + "step": 33560 + }, + { + "epoch": 1.759433962264151, + "grad_norm": 1.2666873931884766, + "learning_rate": 6.015461215932914e-06, + "loss": 0.0736, + "step": 33570 + }, + { + "epoch": 1.759958071278826, + "grad_norm": 2.380655527114868, + "learning_rate": 6.002358490566038e-06, + "loss": 0.0792, + "step": 33580 + }, + { + "epoch": 1.7604821802935011, + "grad_norm": 2.7970130443573, + "learning_rate": 5.989255765199162e-06, + "loss": 0.0904, + "step": 33590 + }, + { + "epoch": 1.7610062893081762, + "grad_norm": 1.3964394330978394, + "learning_rate": 5.976153039832285e-06, + "loss": 0.0889, + "step": 33600 + }, + { + "epoch": 1.7615303983228512, + "grad_norm": 0.5676695704460144, + "learning_rate": 5.9630503144654096e-06, + "loss": 0.0706, + "step": 33610 + }, + { + "epoch": 1.7620545073375262, + "grad_norm": 1.2144248485565186, + "learning_rate": 5.949947589098533e-06, + "loss": 0.081, + "step": 33620 + }, + { + "epoch": 1.7625786163522013, + "grad_norm": 1.4676384925842285, + "learning_rate": 5.936844863731657e-06, + "loss": 0.0731, + "step": 33630 + }, + { + "epoch": 1.7631027253668763, + "grad_norm": 1.430888295173645, + "learning_rate": 5.9237421383647806e-06, + "loss": 0.0725, + "step": 33640 + }, + { + "epoch": 1.7636268343815513, + "grad_norm": 2.2190182209014893, + "learning_rate": 5.910639412997904e-06, + "loss": 0.0925, + "step": 33650 + }, + { + "epoch": 1.7641509433962264, + "grad_norm": 1.8190736770629883, + "learning_rate": 5.897536687631028e-06, + "loss": 0.0853, + "step": 33660 + }, + { + "epoch": 1.7646750524109014, + "grad_norm": 1.6736314296722412, + "learning_rate": 5.8844339622641516e-06, + "loss": 0.067, + "step": 33670 + }, + { + "epoch": 1.7651991614255764, + "grad_norm": 2.305516004562378, + "learning_rate": 5.871331236897275e-06, + "loss": 0.0833, + "step": 33680 + }, + { + "epoch": 1.7657232704402515, + "grad_norm": 1.2158180475234985, + "learning_rate": 5.858228511530399e-06, + "loss": 0.0962, + "step": 33690 + }, + { + "epoch": 1.7662473794549265, + "grad_norm": 0.7084839344024658, + "learning_rate": 5.8451257861635226e-06, + "loss": 0.083, + "step": 33700 + }, + { + "epoch": 1.7667714884696015, + "grad_norm": 1.0288736820220947, + "learning_rate": 5.832023060796646e-06, + "loss": 0.0964, + "step": 33710 + }, + { + "epoch": 1.7672955974842768, + "grad_norm": 1.8042439222335815, + "learning_rate": 5.81892033542977e-06, + "loss": 0.068, + "step": 33720 + }, + { + "epoch": 1.7678197064989518, + "grad_norm": 1.5784512758255005, + "learning_rate": 5.8058176100628936e-06, + "loss": 0.0823, + "step": 33730 + }, + { + "epoch": 1.7683438155136268, + "grad_norm": 2.227677822113037, + "learning_rate": 5.792714884696017e-06, + "loss": 0.0743, + "step": 33740 + }, + { + "epoch": 1.7688679245283019, + "grad_norm": 0.7300745844841003, + "learning_rate": 5.779612159329141e-06, + "loss": 0.0675, + "step": 33750 + }, + { + "epoch": 1.769392033542977, + "grad_norm": 2.08028244972229, + "learning_rate": 5.7665094339622646e-06, + "loss": 0.0863, + "step": 33760 + }, + { + "epoch": 1.769916142557652, + "grad_norm": 1.086308240890503, + "learning_rate": 5.753406708595388e-06, + "loss": 0.0839, + "step": 33770 + }, + { + "epoch": 1.7704402515723272, + "grad_norm": 0.8018303513526917, + "learning_rate": 5.740303983228512e-06, + "loss": 0.0626, + "step": 33780 + }, + { + "epoch": 1.7709643605870022, + "grad_norm": 2.3211405277252197, + "learning_rate": 5.7272012578616356e-06, + "loss": 0.0845, + "step": 33790 + }, + { + "epoch": 1.7714884696016773, + "grad_norm": 1.7425010204315186, + "learning_rate": 5.714098532494759e-06, + "loss": 0.0585, + "step": 33800 + }, + { + "epoch": 1.7720125786163523, + "grad_norm": 0.9054259657859802, + "learning_rate": 5.700995807127882e-06, + "loss": 0.0753, + "step": 33810 + }, + { + "epoch": 1.7725366876310273, + "grad_norm": 2.504011392593384, + "learning_rate": 5.687893081761007e-06, + "loss": 0.0862, + "step": 33820 + }, + { + "epoch": 1.7730607966457024, + "grad_norm": 1.1530929803848267, + "learning_rate": 5.67479035639413e-06, + "loss": 0.0875, + "step": 33830 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 1.3565245866775513, + "learning_rate": 5.661687631027253e-06, + "loss": 0.0625, + "step": 33840 + }, + { + "epoch": 1.7741090146750524, + "grad_norm": 0.9649665355682373, + "learning_rate": 5.648584905660378e-06, + "loss": 0.0637, + "step": 33850 + }, + { + "epoch": 1.7746331236897275, + "grad_norm": 2.0053367614746094, + "learning_rate": 5.635482180293501e-06, + "loss": 0.0853, + "step": 33860 + }, + { + "epoch": 1.7751572327044025, + "grad_norm": 0.9277843832969666, + "learning_rate": 5.622379454926624e-06, + "loss": 0.0709, + "step": 33870 + }, + { + "epoch": 1.7756813417190775, + "grad_norm": 1.4471749067306519, + "learning_rate": 5.609276729559749e-06, + "loss": 0.0639, + "step": 33880 + }, + { + "epoch": 1.7762054507337526, + "grad_norm": 1.230859398841858, + "learning_rate": 5.596174004192872e-06, + "loss": 0.0806, + "step": 33890 + }, + { + "epoch": 1.7767295597484276, + "grad_norm": 1.3443126678466797, + "learning_rate": 5.583071278825995e-06, + "loss": 0.0909, + "step": 33900 + }, + { + "epoch": 1.7772536687631026, + "grad_norm": 1.3505141735076904, + "learning_rate": 5.56996855345912e-06, + "loss": 0.0836, + "step": 33910 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.4082603454589844, + "learning_rate": 5.556865828092243e-06, + "loss": 0.1045, + "step": 33920 + }, + { + "epoch": 1.7783018867924527, + "grad_norm": 0.7405912280082703, + "learning_rate": 5.543763102725367e-06, + "loss": 0.0627, + "step": 33930 + }, + { + "epoch": 1.7788259958071277, + "grad_norm": 1.2767046689987183, + "learning_rate": 5.530660377358491e-06, + "loss": 0.1103, + "step": 33940 + }, + { + "epoch": 1.7793501048218028, + "grad_norm": 0.9530086517333984, + "learning_rate": 5.517557651991615e-06, + "loss": 0.0908, + "step": 33950 + }, + { + "epoch": 1.779874213836478, + "grad_norm": 1.1291282176971436, + "learning_rate": 5.504454926624738e-06, + "loss": 0.0711, + "step": 33960 + }, + { + "epoch": 1.780398322851153, + "grad_norm": 1.126333475112915, + "learning_rate": 5.4913522012578624e-06, + "loss": 0.0604, + "step": 33970 + }, + { + "epoch": 1.780922431865828, + "grad_norm": 1.3223693370819092, + "learning_rate": 5.478249475890986e-06, + "loss": 0.0724, + "step": 33980 + }, + { + "epoch": 1.7814465408805031, + "grad_norm": 1.7418841123580933, + "learning_rate": 5.465146750524109e-06, + "loss": 0.0801, + "step": 33990 + }, + { + "epoch": 1.7819706498951782, + "grad_norm": 0.8617030382156372, + "learning_rate": 5.4520440251572334e-06, + "loss": 0.07, + "step": 34000 + }, + { + "epoch": 1.7819706498951782, + "eval_loss": 0.26762381196022034, + "eval_runtime": 267.2636, + "eval_samples_per_second": 7.45, + "eval_steps_per_second": 1.242, + "step": 34000 + }, + { + "epoch": 1.7824947589098532, + "grad_norm": 1.1304025650024414, + "learning_rate": 5.438941299790357e-06, + "loss": 0.0816, + "step": 34010 + }, + { + "epoch": 1.7830188679245285, + "grad_norm": 1.0686378479003906, + "learning_rate": 5.42583857442348e-06, + "loss": 0.0719, + "step": 34020 + }, + { + "epoch": 1.7835429769392035, + "grad_norm": 1.211351752281189, + "learning_rate": 5.4127358490566045e-06, + "loss": 0.0626, + "step": 34030 + }, + { + "epoch": 1.7840670859538785, + "grad_norm": 1.4797831773757935, + "learning_rate": 5.399633123689728e-06, + "loss": 0.091, + "step": 34040 + }, + { + "epoch": 1.7845911949685536, + "grad_norm": 1.6055877208709717, + "learning_rate": 5.386530398322851e-06, + "loss": 0.0821, + "step": 34050 + }, + { + "epoch": 1.7851153039832286, + "grad_norm": 1.0258854627609253, + "learning_rate": 5.3734276729559755e-06, + "loss": 0.0681, + "step": 34060 + }, + { + "epoch": 1.7856394129979036, + "grad_norm": 2.157062530517578, + "learning_rate": 5.360324947589099e-06, + "loss": 0.0851, + "step": 34070 + }, + { + "epoch": 1.7861635220125787, + "grad_norm": 1.3092892169952393, + "learning_rate": 5.347222222222222e-06, + "loss": 0.0657, + "step": 34080 + }, + { + "epoch": 1.7866876310272537, + "grad_norm": 1.9904484748840332, + "learning_rate": 5.3341194968553465e-06, + "loss": 0.0895, + "step": 34090 + }, + { + "epoch": 1.7872117400419287, + "grad_norm": 1.3187782764434814, + "learning_rate": 5.32101677148847e-06, + "loss": 0.0639, + "step": 34100 + }, + { + "epoch": 1.7877358490566038, + "grad_norm": 2.1087183952331543, + "learning_rate": 5.307914046121593e-06, + "loss": 0.0806, + "step": 34110 + }, + { + "epoch": 1.7882599580712788, + "grad_norm": 1.045880913734436, + "learning_rate": 5.2948113207547175e-06, + "loss": 0.0628, + "step": 34120 + }, + { + "epoch": 1.7887840670859538, + "grad_norm": 1.3817152976989746, + "learning_rate": 5.281708595387841e-06, + "loss": 0.0764, + "step": 34130 + }, + { + "epoch": 1.7893081761006289, + "grad_norm": 3.6602184772491455, + "learning_rate": 5.268605870020964e-06, + "loss": 0.08, + "step": 34140 + }, + { + "epoch": 1.789832285115304, + "grad_norm": 1.980810284614563, + "learning_rate": 5.2555031446540885e-06, + "loss": 0.0749, + "step": 34150 + }, + { + "epoch": 1.790356394129979, + "grad_norm": 1.0341511964797974, + "learning_rate": 5.242400419287212e-06, + "loss": 0.0796, + "step": 34160 + }, + { + "epoch": 1.790880503144654, + "grad_norm": 1.0620462894439697, + "learning_rate": 5.229297693920335e-06, + "loss": 0.0776, + "step": 34170 + }, + { + "epoch": 1.791404612159329, + "grad_norm": 0.9034409523010254, + "learning_rate": 5.2161949685534595e-06, + "loss": 0.0947, + "step": 34180 + }, + { + "epoch": 1.791928721174004, + "grad_norm": 1.7526142597198486, + "learning_rate": 5.203092243186583e-06, + "loss": 0.1004, + "step": 34190 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 1.0870200395584106, + "learning_rate": 5.189989517819706e-06, + "loss": 0.0603, + "step": 34200 + }, + { + "epoch": 1.7929769392033543, + "grad_norm": 1.2750418186187744, + "learning_rate": 5.1768867924528305e-06, + "loss": 0.0767, + "step": 34210 + }, + { + "epoch": 1.7935010482180294, + "grad_norm": 1.42095947265625, + "learning_rate": 5.163784067085954e-06, + "loss": 0.0771, + "step": 34220 + }, + { + "epoch": 1.7940251572327044, + "grad_norm": 1.6726841926574707, + "learning_rate": 5.150681341719077e-06, + "loss": 0.068, + "step": 34230 + }, + { + "epoch": 1.7945492662473794, + "grad_norm": 1.2818461656570435, + "learning_rate": 5.1375786163522015e-06, + "loss": 0.0809, + "step": 34240 + }, + { + "epoch": 1.7950733752620545, + "grad_norm": 1.0451738834381104, + "learning_rate": 5.124475890985325e-06, + "loss": 0.0714, + "step": 34250 + }, + { + "epoch": 1.7955974842767297, + "grad_norm": 1.672234296798706, + "learning_rate": 5.111373165618449e-06, + "loss": 0.0773, + "step": 34260 + }, + { + "epoch": 1.7961215932914047, + "grad_norm": 3.063178300857544, + "learning_rate": 5.0982704402515725e-06, + "loss": 0.0802, + "step": 34270 + }, + { + "epoch": 1.7966457023060798, + "grad_norm": 1.3467538356781006, + "learning_rate": 5.085167714884697e-06, + "loss": 0.0771, + "step": 34280 + }, + { + "epoch": 1.7971698113207548, + "grad_norm": 1.362091302871704, + "learning_rate": 5.07206498951782e-06, + "loss": 0.0961, + "step": 34290 + }, + { + "epoch": 1.7976939203354299, + "grad_norm": 0.9203101396560669, + "learning_rate": 5.0589622641509435e-06, + "loss": 0.0744, + "step": 34300 + }, + { + "epoch": 1.7982180293501049, + "grad_norm": 2.0424604415893555, + "learning_rate": 5.045859538784068e-06, + "loss": 0.0767, + "step": 34310 + }, + { + "epoch": 1.79874213836478, + "grad_norm": 1.1854127645492554, + "learning_rate": 5.032756813417191e-06, + "loss": 0.0632, + "step": 34320 + }, + { + "epoch": 1.799266247379455, + "grad_norm": 0.7411090731620789, + "learning_rate": 5.019654088050315e-06, + "loss": 0.0641, + "step": 34330 + }, + { + "epoch": 1.79979035639413, + "grad_norm": 2.001063823699951, + "learning_rate": 5.006551362683439e-06, + "loss": 0.0785, + "step": 34340 + }, + { + "epoch": 1.800314465408805, + "grad_norm": 1.6688907146453857, + "learning_rate": 4.993448637316562e-06, + "loss": 0.0846, + "step": 34350 + }, + { + "epoch": 1.80083857442348, + "grad_norm": 1.111572265625, + "learning_rate": 4.980345911949686e-06, + "loss": 0.0814, + "step": 34360 + }, + { + "epoch": 1.801362683438155, + "grad_norm": 0.7180928587913513, + "learning_rate": 4.96724318658281e-06, + "loss": 0.0586, + "step": 34370 + }, + { + "epoch": 1.8018867924528301, + "grad_norm": 0.7584537863731384, + "learning_rate": 4.954140461215933e-06, + "loss": 0.0548, + "step": 34380 + }, + { + "epoch": 1.8024109014675052, + "grad_norm": 1.2297812700271606, + "learning_rate": 4.941037735849057e-06, + "loss": 0.0519, + "step": 34390 + }, + { + "epoch": 1.8029350104821802, + "grad_norm": 1.7759953737258911, + "learning_rate": 4.927935010482181e-06, + "loss": 0.0741, + "step": 34400 + }, + { + "epoch": 1.8034591194968552, + "grad_norm": 1.6569321155548096, + "learning_rate": 4.914832285115304e-06, + "loss": 0.0868, + "step": 34410 + }, + { + "epoch": 1.8039832285115303, + "grad_norm": 2.512075424194336, + "learning_rate": 4.901729559748428e-06, + "loss": 0.0547, + "step": 34420 + }, + { + "epoch": 1.8045073375262053, + "grad_norm": 0.7744259834289551, + "learning_rate": 4.888626834381552e-06, + "loss": 0.0586, + "step": 34430 + }, + { + "epoch": 1.8050314465408805, + "grad_norm": 1.7868129014968872, + "learning_rate": 4.875524109014675e-06, + "loss": 0.0717, + "step": 34440 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 1.1555416584014893, + "learning_rate": 4.862421383647799e-06, + "loss": 0.0676, + "step": 34450 + }, + { + "epoch": 1.8060796645702306, + "grad_norm": 2.247636079788208, + "learning_rate": 4.849318658280923e-06, + "loss": 0.0798, + "step": 34460 + }, + { + "epoch": 1.8066037735849056, + "grad_norm": 1.6758849620819092, + "learning_rate": 4.836215932914046e-06, + "loss": 0.0498, + "step": 34470 + }, + { + "epoch": 1.8071278825995807, + "grad_norm": 0.8932026624679565, + "learning_rate": 4.82311320754717e-06, + "loss": 0.0837, + "step": 34480 + }, + { + "epoch": 1.8076519916142557, + "grad_norm": 1.1528186798095703, + "learning_rate": 4.810010482180294e-06, + "loss": 0.1109, + "step": 34490 + }, + { + "epoch": 1.808176100628931, + "grad_norm": 1.6238465309143066, + "learning_rate": 4.796907756813417e-06, + "loss": 0.065, + "step": 34500 + }, + { + "epoch": 1.808700209643606, + "grad_norm": 0.6329945921897888, + "learning_rate": 4.783805031446541e-06, + "loss": 0.0633, + "step": 34510 + }, + { + "epoch": 1.809224318658281, + "grad_norm": 1.354620099067688, + "learning_rate": 4.770702306079665e-06, + "loss": 0.0651, + "step": 34520 + }, + { + "epoch": 1.809748427672956, + "grad_norm": 2.1168293952941895, + "learning_rate": 4.757599580712788e-06, + "loss": 0.0778, + "step": 34530 + }, + { + "epoch": 1.810272536687631, + "grad_norm": 0.7592911720275879, + "learning_rate": 4.744496855345912e-06, + "loss": 0.0913, + "step": 34540 + }, + { + "epoch": 1.8107966457023061, + "grad_norm": 1.3963457345962524, + "learning_rate": 4.731394129979036e-06, + "loss": 0.1037, + "step": 34550 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 1.0729615688323975, + "learning_rate": 4.718291404612159e-06, + "loss": 0.0816, + "step": 34560 + }, + { + "epoch": 1.8118448637316562, + "grad_norm": 2.7409613132476807, + "learning_rate": 4.705188679245283e-06, + "loss": 0.0704, + "step": 34570 + }, + { + "epoch": 1.8123689727463312, + "grad_norm": 1.1228454113006592, + "learning_rate": 4.692085953878407e-06, + "loss": 0.0866, + "step": 34580 + }, + { + "epoch": 1.8128930817610063, + "grad_norm": 0.7433912754058838, + "learning_rate": 4.67898322851153e-06, + "loss": 0.0466, + "step": 34590 + }, + { + "epoch": 1.8134171907756813, + "grad_norm": 1.5128268003463745, + "learning_rate": 4.665880503144654e-06, + "loss": 0.0657, + "step": 34600 + }, + { + "epoch": 1.8139412997903563, + "grad_norm": 0.7765136361122131, + "learning_rate": 4.652777777777778e-06, + "loss": 0.1025, + "step": 34610 + }, + { + "epoch": 1.8144654088050314, + "grad_norm": 0.8656555414199829, + "learning_rate": 4.639675052410902e-06, + "loss": 0.082, + "step": 34620 + }, + { + "epoch": 1.8149895178197064, + "grad_norm": 0.8403626084327698, + "learning_rate": 4.626572327044025e-06, + "loss": 0.0711, + "step": 34630 + }, + { + "epoch": 1.8155136268343814, + "grad_norm": 1.6570936441421509, + "learning_rate": 4.61346960167715e-06, + "loss": 0.059, + "step": 34640 + }, + { + "epoch": 1.8160377358490565, + "grad_norm": 1.7300814390182495, + "learning_rate": 4.600366876310273e-06, + "loss": 0.0747, + "step": 34650 + }, + { + "epoch": 1.8165618448637315, + "grad_norm": 1.816053032875061, + "learning_rate": 4.587264150943396e-06, + "loss": 0.0811, + "step": 34660 + }, + { + "epoch": 1.8170859538784065, + "grad_norm": 1.2429102659225464, + "learning_rate": 4.574161425576521e-06, + "loss": 0.096, + "step": 34670 + }, + { + "epoch": 1.8176100628930818, + "grad_norm": 1.878833293914795, + "learning_rate": 4.561058700209644e-06, + "loss": 0.0667, + "step": 34680 + }, + { + "epoch": 1.8181341719077568, + "grad_norm": 1.124155879020691, + "learning_rate": 4.547955974842767e-06, + "loss": 0.073, + "step": 34690 + }, + { + "epoch": 1.8186582809224319, + "grad_norm": 0.4943144917488098, + "learning_rate": 4.534853249475892e-06, + "loss": 0.0642, + "step": 34700 + }, + { + "epoch": 1.819182389937107, + "grad_norm": 2.3582496643066406, + "learning_rate": 4.521750524109015e-06, + "loss": 0.0892, + "step": 34710 + }, + { + "epoch": 1.819706498951782, + "grad_norm": 1.2020031213760376, + "learning_rate": 4.508647798742138e-06, + "loss": 0.0636, + "step": 34720 + }, + { + "epoch": 1.820230607966457, + "grad_norm": 1.8882160186767578, + "learning_rate": 4.495545073375263e-06, + "loss": 0.0623, + "step": 34730 + }, + { + "epoch": 1.8207547169811322, + "grad_norm": 1.6447032690048218, + "learning_rate": 4.482442348008386e-06, + "loss": 0.094, + "step": 34740 + }, + { + "epoch": 1.8212788259958073, + "grad_norm": 3.1155807971954346, + "learning_rate": 4.469339622641509e-06, + "loss": 0.116, + "step": 34750 + }, + { + "epoch": 1.8218029350104823, + "grad_norm": 1.1563801765441895, + "learning_rate": 4.456236897274634e-06, + "loss": 0.0752, + "step": 34760 + }, + { + "epoch": 1.8223270440251573, + "grad_norm": 0.9092895984649658, + "learning_rate": 4.443134171907757e-06, + "loss": 0.0844, + "step": 34770 + }, + { + "epoch": 1.8228511530398324, + "grad_norm": 0.7180648446083069, + "learning_rate": 4.43003144654088e-06, + "loss": 0.0711, + "step": 34780 + }, + { + "epoch": 1.8233752620545074, + "grad_norm": 0.9739916920661926, + "learning_rate": 4.416928721174005e-06, + "loss": 0.0666, + "step": 34790 + }, + { + "epoch": 1.8238993710691824, + "grad_norm": 1.4060016870498657, + "learning_rate": 4.403825995807128e-06, + "loss": 0.0791, + "step": 34800 + }, + { + "epoch": 1.8244234800838575, + "grad_norm": 1.233502984046936, + "learning_rate": 4.390723270440251e-06, + "loss": 0.0613, + "step": 34810 + }, + { + "epoch": 1.8249475890985325, + "grad_norm": 2.0533065795898438, + "learning_rate": 4.377620545073376e-06, + "loss": 0.0712, + "step": 34820 + }, + { + "epoch": 1.8254716981132075, + "grad_norm": 2.9354019165039062, + "learning_rate": 4.364517819706499e-06, + "loss": 0.0782, + "step": 34830 + }, + { + "epoch": 1.8259958071278826, + "grad_norm": 1.3234208822250366, + "learning_rate": 4.351415094339622e-06, + "loss": 0.0762, + "step": 34840 + }, + { + "epoch": 1.8265199161425576, + "grad_norm": 1.6237906217575073, + "learning_rate": 4.338312368972747e-06, + "loss": 0.0761, + "step": 34850 + }, + { + "epoch": 1.8270440251572326, + "grad_norm": 1.6208617687225342, + "learning_rate": 4.32520964360587e-06, + "loss": 0.0788, + "step": 34860 + }, + { + "epoch": 1.8275681341719077, + "grad_norm": 1.607041597366333, + "learning_rate": 4.312106918238993e-06, + "loss": 0.072, + "step": 34870 + }, + { + "epoch": 1.8280922431865827, + "grad_norm": 2.776214599609375, + "learning_rate": 4.299004192872118e-06, + "loss": 0.0779, + "step": 34880 + }, + { + "epoch": 1.8286163522012577, + "grad_norm": 1.4961811304092407, + "learning_rate": 4.285901467505241e-06, + "loss": 0.1001, + "step": 34890 + }, + { + "epoch": 1.8291404612159328, + "grad_norm": 1.2335947751998901, + "learning_rate": 4.272798742138364e-06, + "loss": 0.0771, + "step": 34900 + }, + { + "epoch": 1.8296645702306078, + "grad_norm": 1.5569696426391602, + "learning_rate": 4.259696016771489e-06, + "loss": 0.078, + "step": 34910 + }, + { + "epoch": 1.830188679245283, + "grad_norm": 2.003178596496582, + "learning_rate": 4.246593291404612e-06, + "loss": 0.0807, + "step": 34920 + }, + { + "epoch": 1.830712788259958, + "grad_norm": 1.8837343454360962, + "learning_rate": 4.233490566037735e-06, + "loss": 0.0776, + "step": 34930 + }, + { + "epoch": 1.8312368972746331, + "grad_norm": 1.3460261821746826, + "learning_rate": 4.22038784067086e-06, + "loss": 0.0827, + "step": 34940 + }, + { + "epoch": 1.8317610062893082, + "grad_norm": 1.528942584991455, + "learning_rate": 4.207285115303983e-06, + "loss": 0.0697, + "step": 34950 + }, + { + "epoch": 1.8322851153039832, + "grad_norm": 1.4546736478805542, + "learning_rate": 4.194182389937107e-06, + "loss": 0.0843, + "step": 34960 + }, + { + "epoch": 1.8328092243186582, + "grad_norm": 1.3445278406143188, + "learning_rate": 4.181079664570231e-06, + "loss": 0.0685, + "step": 34970 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 1.9412113428115845, + "learning_rate": 4.167976939203355e-06, + "loss": 0.0894, + "step": 34980 + }, + { + "epoch": 1.8338574423480085, + "grad_norm": 2.7959461212158203, + "learning_rate": 4.154874213836478e-06, + "loss": 0.0892, + "step": 34990 + }, + { + "epoch": 1.8343815513626835, + "grad_norm": 0.8164100646972656, + "learning_rate": 4.1417714884696025e-06, + "loss": 0.0567, + "step": 35000 + }, + { + "epoch": 1.8343815513626835, + "eval_loss": 0.26678207516670227, + "eval_runtime": 266.7648, + "eval_samples_per_second": 7.464, + "eval_steps_per_second": 1.245, + "step": 35000 + }, + { + "epoch": 1.8349056603773586, + "grad_norm": 2.1443021297454834, + "learning_rate": 4.128668763102726e-06, + "loss": 0.0722, + "step": 35010 + }, + { + "epoch": 1.8354297693920336, + "grad_norm": 2.0307836532592773, + "learning_rate": 4.115566037735849e-06, + "loss": 0.0967, + "step": 35020 + }, + { + "epoch": 1.8359538784067087, + "grad_norm": 1.3431941270828247, + "learning_rate": 4.1024633123689735e-06, + "loss": 0.0752, + "step": 35030 + }, + { + "epoch": 1.8364779874213837, + "grad_norm": 1.4859864711761475, + "learning_rate": 4.089360587002097e-06, + "loss": 0.0913, + "step": 35040 + }, + { + "epoch": 1.8370020964360587, + "grad_norm": 0.9566323161125183, + "learning_rate": 4.07625786163522e-06, + "loss": 0.07, + "step": 35050 + }, + { + "epoch": 1.8375262054507338, + "grad_norm": 1.1632527112960815, + "learning_rate": 4.0631551362683445e-06, + "loss": 0.0664, + "step": 35060 + }, + { + "epoch": 1.8380503144654088, + "grad_norm": 1.5765260457992554, + "learning_rate": 4.050052410901468e-06, + "loss": 0.0641, + "step": 35070 + }, + { + "epoch": 1.8385744234800838, + "grad_norm": 1.8369210958480835, + "learning_rate": 4.036949685534591e-06, + "loss": 0.061, + "step": 35080 + }, + { + "epoch": 1.8390985324947589, + "grad_norm": 1.4045135974884033, + "learning_rate": 4.0238469601677155e-06, + "loss": 0.0725, + "step": 35090 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 1.1947506666183472, + "learning_rate": 4.010744234800839e-06, + "loss": 0.0726, + "step": 35100 + }, + { + "epoch": 1.840146750524109, + "grad_norm": 1.765573501586914, + "learning_rate": 3.997641509433962e-06, + "loss": 0.0864, + "step": 35110 + }, + { + "epoch": 1.840670859538784, + "grad_norm": 1.8163455724716187, + "learning_rate": 3.9845387840670865e-06, + "loss": 0.0513, + "step": 35120 + }, + { + "epoch": 1.841194968553459, + "grad_norm": 1.3820146322250366, + "learning_rate": 3.97143605870021e-06, + "loss": 0.0593, + "step": 35130 + }, + { + "epoch": 1.841719077568134, + "grad_norm": 1.7238701581954956, + "learning_rate": 3.958333333333333e-06, + "loss": 0.0754, + "step": 35140 + }, + { + "epoch": 1.8422431865828093, + "grad_norm": 2.048316240310669, + "learning_rate": 3.9452306079664575e-06, + "loss": 0.0596, + "step": 35150 + }, + { + "epoch": 1.8427672955974843, + "grad_norm": 1.101599097251892, + "learning_rate": 3.932127882599581e-06, + "loss": 0.0716, + "step": 35160 + }, + { + "epoch": 1.8432914046121593, + "grad_norm": 1.935444712638855, + "learning_rate": 3.919025157232704e-06, + "loss": 0.0899, + "step": 35170 + }, + { + "epoch": 1.8438155136268344, + "grad_norm": 0.8457840085029602, + "learning_rate": 3.9059224318658285e-06, + "loss": 0.0795, + "step": 35180 + }, + { + "epoch": 1.8443396226415094, + "grad_norm": 1.2010316848754883, + "learning_rate": 3.892819706498952e-06, + "loss": 0.0894, + "step": 35190 + }, + { + "epoch": 1.8448637316561844, + "grad_norm": 1.988616943359375, + "learning_rate": 3.879716981132075e-06, + "loss": 0.0726, + "step": 35200 + }, + { + "epoch": 1.8453878406708597, + "grad_norm": 1.7617828845977783, + "learning_rate": 3.8666142557651995e-06, + "loss": 0.104, + "step": 35210 + }, + { + "epoch": 1.8459119496855347, + "grad_norm": 1.2007869482040405, + "learning_rate": 3.853511530398323e-06, + "loss": 0.0668, + "step": 35220 + }, + { + "epoch": 1.8464360587002098, + "grad_norm": 1.926151156425476, + "learning_rate": 3.840408805031446e-06, + "loss": 0.079, + "step": 35230 + }, + { + "epoch": 1.8469601677148848, + "grad_norm": 1.7576385736465454, + "learning_rate": 3.8273060796645705e-06, + "loss": 0.0704, + "step": 35240 + }, + { + "epoch": 1.8474842767295598, + "grad_norm": 1.9205641746520996, + "learning_rate": 3.8142033542976943e-06, + "loss": 0.0807, + "step": 35250 + }, + { + "epoch": 1.8480083857442349, + "grad_norm": 1.8392775058746338, + "learning_rate": 3.8011006289308177e-06, + "loss": 0.0826, + "step": 35260 + }, + { + "epoch": 1.84853249475891, + "grad_norm": 1.0592628717422485, + "learning_rate": 3.787997903563942e-06, + "loss": 0.0955, + "step": 35270 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 1.6646348237991333, + "learning_rate": 3.7748951781970653e-06, + "loss": 0.0465, + "step": 35280 + }, + { + "epoch": 1.84958071278826, + "grad_norm": 1.4347747564315796, + "learning_rate": 3.7617924528301887e-06, + "loss": 0.0815, + "step": 35290 + }, + { + "epoch": 1.850104821802935, + "grad_norm": 2.0240726470947266, + "learning_rate": 3.748689727463313e-06, + "loss": 0.0875, + "step": 35300 + }, + { + "epoch": 1.85062893081761, + "grad_norm": 2.382476568222046, + "learning_rate": 3.7355870020964363e-06, + "loss": 0.0775, + "step": 35310 + }, + { + "epoch": 1.851153039832285, + "grad_norm": 0.916605532169342, + "learning_rate": 3.7224842767295597e-06, + "loss": 0.0718, + "step": 35320 + }, + { + "epoch": 1.85167714884696, + "grad_norm": 1.1663122177124023, + "learning_rate": 3.709381551362684e-06, + "loss": 0.0927, + "step": 35330 + }, + { + "epoch": 1.8522012578616351, + "grad_norm": 1.4649229049682617, + "learning_rate": 3.6962788259958073e-06, + "loss": 0.0861, + "step": 35340 + }, + { + "epoch": 1.8527253668763102, + "grad_norm": 1.1901835203170776, + "learning_rate": 3.6831761006289307e-06, + "loss": 0.0751, + "step": 35350 + }, + { + "epoch": 1.8532494758909852, + "grad_norm": 2.1648807525634766, + "learning_rate": 3.670073375262055e-06, + "loss": 0.0753, + "step": 35360 + }, + { + "epoch": 1.8537735849056602, + "grad_norm": 2.470012664794922, + "learning_rate": 3.6569706498951783e-06, + "loss": 0.074, + "step": 35370 + }, + { + "epoch": 1.8542976939203353, + "grad_norm": 1.3808513879776, + "learning_rate": 3.643867924528302e-06, + "loss": 0.0745, + "step": 35380 + }, + { + "epoch": 1.8548218029350105, + "grad_norm": 1.5830243825912476, + "learning_rate": 3.6307651991614255e-06, + "loss": 0.0838, + "step": 35390 + }, + { + "epoch": 1.8553459119496856, + "grad_norm": 1.371849775314331, + "learning_rate": 3.6176624737945498e-06, + "loss": 0.0821, + "step": 35400 + }, + { + "epoch": 1.8558700209643606, + "grad_norm": 1.2144567966461182, + "learning_rate": 3.604559748427673e-06, + "loss": 0.0717, + "step": 35410 + }, + { + "epoch": 1.8563941299790356, + "grad_norm": 1.3262728452682495, + "learning_rate": 3.5914570230607965e-06, + "loss": 0.0682, + "step": 35420 + }, + { + "epoch": 1.8569182389937107, + "grad_norm": 1.0475879907608032, + "learning_rate": 3.5783542976939208e-06, + "loss": 0.0571, + "step": 35430 + }, + { + "epoch": 1.8574423480083857, + "grad_norm": 1.328346848487854, + "learning_rate": 3.565251572327044e-06, + "loss": 0.071, + "step": 35440 + }, + { + "epoch": 1.857966457023061, + "grad_norm": 1.3281270265579224, + "learning_rate": 3.5521488469601675e-06, + "loss": 0.076, + "step": 35450 + }, + { + "epoch": 1.858490566037736, + "grad_norm": 1.2660882472991943, + "learning_rate": 3.5390461215932918e-06, + "loss": 0.0712, + "step": 35460 + }, + { + "epoch": 1.859014675052411, + "grad_norm": 1.620882272720337, + "learning_rate": 3.525943396226415e-06, + "loss": 0.0778, + "step": 35470 + }, + { + "epoch": 1.859538784067086, + "grad_norm": 0.6216922402381897, + "learning_rate": 3.5128406708595385e-06, + "loss": 0.0676, + "step": 35480 + }, + { + "epoch": 1.860062893081761, + "grad_norm": 1.293033242225647, + "learning_rate": 3.4997379454926628e-06, + "loss": 0.0877, + "step": 35490 + }, + { + "epoch": 1.8605870020964361, + "grad_norm": 2.2693710327148438, + "learning_rate": 3.486635220125786e-06, + "loss": 0.0581, + "step": 35500 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.49999281764030457, + "learning_rate": 3.4735324947589095e-06, + "loss": 0.0728, + "step": 35510 + }, + { + "epoch": 1.8616352201257862, + "grad_norm": 1.4117684364318848, + "learning_rate": 3.4604297693920338e-06, + "loss": 0.0569, + "step": 35520 + }, + { + "epoch": 1.8621593291404612, + "grad_norm": 1.4921469688415527, + "learning_rate": 3.447327044025157e-06, + "loss": 0.0561, + "step": 35530 + }, + { + "epoch": 1.8626834381551363, + "grad_norm": 2.2984938621520996, + "learning_rate": 3.434224318658281e-06, + "loss": 0.0652, + "step": 35540 + }, + { + "epoch": 1.8632075471698113, + "grad_norm": 1.0348659753799438, + "learning_rate": 3.4211215932914048e-06, + "loss": 0.0534, + "step": 35550 + }, + { + "epoch": 1.8637316561844863, + "grad_norm": 2.2559142112731934, + "learning_rate": 3.4080188679245286e-06, + "loss": 0.0671, + "step": 35560 + }, + { + "epoch": 1.8642557651991614, + "grad_norm": 1.69614839553833, + "learning_rate": 3.394916142557652e-06, + "loss": 0.083, + "step": 35570 + }, + { + "epoch": 1.8647798742138364, + "grad_norm": 3.1688826084136963, + "learning_rate": 3.381813417190776e-06, + "loss": 0.0827, + "step": 35580 + }, + { + "epoch": 1.8653039832285114, + "grad_norm": 0.9188790321350098, + "learning_rate": 3.3687106918238996e-06, + "loss": 0.0764, + "step": 35590 + }, + { + "epoch": 1.8658280922431865, + "grad_norm": 1.3099772930145264, + "learning_rate": 3.355607966457023e-06, + "loss": 0.0641, + "step": 35600 + }, + { + "epoch": 1.8663522012578615, + "grad_norm": 1.3244524002075195, + "learning_rate": 3.342505241090147e-06, + "loss": 0.0829, + "step": 35610 + }, + { + "epoch": 1.8668763102725365, + "grad_norm": 1.8879936933517456, + "learning_rate": 3.3294025157232706e-06, + "loss": 0.0788, + "step": 35620 + }, + { + "epoch": 1.8674004192872118, + "grad_norm": 2.605762481689453, + "learning_rate": 3.316299790356394e-06, + "loss": 0.1098, + "step": 35630 + }, + { + "epoch": 1.8679245283018868, + "grad_norm": 1.6959515810012817, + "learning_rate": 3.303197064989518e-06, + "loss": 0.0705, + "step": 35640 + }, + { + "epoch": 1.8684486373165619, + "grad_norm": 1.204658031463623, + "learning_rate": 3.2900943396226416e-06, + "loss": 0.0673, + "step": 35650 + }, + { + "epoch": 1.868972746331237, + "grad_norm": 1.229408860206604, + "learning_rate": 3.276991614255765e-06, + "loss": 0.0807, + "step": 35660 + }, + { + "epoch": 1.869496855345912, + "grad_norm": 1.9810676574707031, + "learning_rate": 3.2638888888888892e-06, + "loss": 0.0809, + "step": 35670 + }, + { + "epoch": 1.870020964360587, + "grad_norm": 1.1754933595657349, + "learning_rate": 3.2507861635220126e-06, + "loss": 0.0774, + "step": 35680 + }, + { + "epoch": 1.8705450733752622, + "grad_norm": 0.6706782579421997, + "learning_rate": 3.237683438155136e-06, + "loss": 0.0621, + "step": 35690 + }, + { + "epoch": 1.8710691823899372, + "grad_norm": 2.3637070655822754, + "learning_rate": 3.2245807127882602e-06, + "loss": 0.0837, + "step": 35700 + }, + { + "epoch": 1.8715932914046123, + "grad_norm": 1.4699527025222778, + "learning_rate": 3.2114779874213836e-06, + "loss": 0.0628, + "step": 35710 + }, + { + "epoch": 1.8721174004192873, + "grad_norm": 1.398445963859558, + "learning_rate": 3.1983752620545074e-06, + "loss": 0.071, + "step": 35720 + }, + { + "epoch": 1.8726415094339623, + "grad_norm": 1.279449462890625, + "learning_rate": 3.1852725366876312e-06, + "loss": 0.0708, + "step": 35730 + }, + { + "epoch": 1.8731656184486374, + "grad_norm": 1.7630802392959595, + "learning_rate": 3.172169811320755e-06, + "loss": 0.1008, + "step": 35740 + }, + { + "epoch": 1.8736897274633124, + "grad_norm": 4.057922840118408, + "learning_rate": 3.1590670859538784e-06, + "loss": 0.101, + "step": 35750 + }, + { + "epoch": 1.8742138364779874, + "grad_norm": 2.732259750366211, + "learning_rate": 3.1459643605870026e-06, + "loss": 0.0978, + "step": 35760 + }, + { + "epoch": 1.8747379454926625, + "grad_norm": 1.7538576126098633, + "learning_rate": 3.132861635220126e-06, + "loss": 0.1018, + "step": 35770 + }, + { + "epoch": 1.8752620545073375, + "grad_norm": 2.2278151512145996, + "learning_rate": 3.11975890985325e-06, + "loss": 0.0765, + "step": 35780 + }, + { + "epoch": 1.8757861635220126, + "grad_norm": 2.4491899013519287, + "learning_rate": 3.1066561844863732e-06, + "loss": 0.0764, + "step": 35790 + }, + { + "epoch": 1.8763102725366876, + "grad_norm": 3.7896568775177, + "learning_rate": 3.093553459119497e-06, + "loss": 0.0558, + "step": 35800 + }, + { + "epoch": 1.8768343815513626, + "grad_norm": 2.0129201412200928, + "learning_rate": 3.080450733752621e-06, + "loss": 0.0875, + "step": 35810 + }, + { + "epoch": 1.8773584905660377, + "grad_norm": 0.8520331978797913, + "learning_rate": 3.0673480083857442e-06, + "loss": 0.0805, + "step": 35820 + }, + { + "epoch": 1.8778825995807127, + "grad_norm": 1.552331566810608, + "learning_rate": 3.054245283018868e-06, + "loss": 0.0633, + "step": 35830 + }, + { + "epoch": 1.8784067085953877, + "grad_norm": 1.4037585258483887, + "learning_rate": 3.041142557651992e-06, + "loss": 0.0824, + "step": 35840 + }, + { + "epoch": 1.8789308176100628, + "grad_norm": 1.717598795890808, + "learning_rate": 3.0280398322851152e-06, + "loss": 0.0905, + "step": 35850 + }, + { + "epoch": 1.8794549266247378, + "grad_norm": 1.6586337089538574, + "learning_rate": 3.014937106918239e-06, + "loss": 0.0653, + "step": 35860 + }, + { + "epoch": 1.879979035639413, + "grad_norm": 1.139183759689331, + "learning_rate": 3.001834381551363e-06, + "loss": 0.0795, + "step": 35870 + }, + { + "epoch": 1.880503144654088, + "grad_norm": 1.106190800666809, + "learning_rate": 2.9887316561844862e-06, + "loss": 0.0605, + "step": 35880 + }, + { + "epoch": 1.881027253668763, + "grad_norm": 0.8567935228347778, + "learning_rate": 2.97562893081761e-06, + "loss": 0.071, + "step": 35890 + }, + { + "epoch": 1.8815513626834381, + "grad_norm": 1.2470753192901611, + "learning_rate": 2.962526205450734e-06, + "loss": 0.099, + "step": 35900 + }, + { + "epoch": 1.8820754716981132, + "grad_norm": 1.275889277458191, + "learning_rate": 2.9494234800838577e-06, + "loss": 0.0688, + "step": 35910 + }, + { + "epoch": 1.8825995807127882, + "grad_norm": 1.2817095518112183, + "learning_rate": 2.9363207547169815e-06, + "loss": 0.069, + "step": 35920 + }, + { + "epoch": 1.8831236897274635, + "grad_norm": 1.6084591150283813, + "learning_rate": 2.9232180293501053e-06, + "loss": 0.0632, + "step": 35930 + }, + { + "epoch": 1.8836477987421385, + "grad_norm": 2.0670547485351562, + "learning_rate": 2.9101153039832287e-06, + "loss": 0.0693, + "step": 35940 + }, + { + "epoch": 1.8841719077568135, + "grad_norm": 1.5240498781204224, + "learning_rate": 2.8970125786163525e-06, + "loss": 0.0782, + "step": 35950 + }, + { + "epoch": 1.8846960167714886, + "grad_norm": 1.8212049007415771, + "learning_rate": 2.8839098532494763e-06, + "loss": 0.0494, + "step": 35960 + }, + { + "epoch": 1.8852201257861636, + "grad_norm": 1.2207962274551392, + "learning_rate": 2.8708071278825997e-06, + "loss": 0.0916, + "step": 35970 + }, + { + "epoch": 1.8857442348008386, + "grad_norm": 1.3317270278930664, + "learning_rate": 2.8577044025157235e-06, + "loss": 0.0724, + "step": 35980 + }, + { + "epoch": 1.8862683438155137, + "grad_norm": 0.759463906288147, + "learning_rate": 2.844601677148847e-06, + "loss": 0.0547, + "step": 35990 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 1.6556220054626465, + "learning_rate": 2.8314989517819707e-06, + "loss": 0.067, + "step": 36000 + }, + { + "epoch": 1.8867924528301887, + "eval_loss": 0.2678377032279968, + "eval_runtime": 268.3326, + "eval_samples_per_second": 7.42, + "eval_steps_per_second": 1.237, + "step": 36000 + }, + { + "epoch": 1.8873165618448637, + "grad_norm": 2.3549275398254395, + "learning_rate": 2.8183962264150945e-06, + "loss": 0.0642, + "step": 36010 + }, + { + "epoch": 1.8878406708595388, + "grad_norm": 2.4621994495391846, + "learning_rate": 2.805293501048218e-06, + "loss": 0.1005, + "step": 36020 + }, + { + "epoch": 1.8883647798742138, + "grad_norm": 2.111743211746216, + "learning_rate": 2.7921907756813417e-06, + "loss": 0.0768, + "step": 36030 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 1.8729252815246582, + "learning_rate": 2.7790880503144655e-06, + "loss": 0.0833, + "step": 36040 + }, + { + "epoch": 1.8894129979035639, + "grad_norm": 1.3784617185592651, + "learning_rate": 2.765985324947589e-06, + "loss": 0.0807, + "step": 36050 + }, + { + "epoch": 1.889937106918239, + "grad_norm": 1.35466730594635, + "learning_rate": 2.7528825995807127e-06, + "loss": 0.078, + "step": 36060 + }, + { + "epoch": 1.890461215932914, + "grad_norm": 0.8412442207336426, + "learning_rate": 2.7397798742138365e-06, + "loss": 0.0695, + "step": 36070 + }, + { + "epoch": 1.890985324947589, + "grad_norm": 1.1234647035598755, + "learning_rate": 2.7266771488469603e-06, + "loss": 0.0729, + "step": 36080 + }, + { + "epoch": 1.891509433962264, + "grad_norm": 2.3850042819976807, + "learning_rate": 2.713574423480084e-06, + "loss": 0.0747, + "step": 36090 + }, + { + "epoch": 1.892033542976939, + "grad_norm": 1.2947684526443481, + "learning_rate": 2.700471698113208e-06, + "loss": 0.0747, + "step": 36100 + }, + { + "epoch": 1.8925576519916143, + "grad_norm": 1.541504144668579, + "learning_rate": 2.6873689727463313e-06, + "loss": 0.0769, + "step": 36110 + }, + { + "epoch": 1.8930817610062893, + "grad_norm": 1.178537130355835, + "learning_rate": 2.674266247379455e-06, + "loss": 0.0664, + "step": 36120 + }, + { + "epoch": 1.8936058700209644, + "grad_norm": 1.822480320930481, + "learning_rate": 2.661163522012579e-06, + "loss": 0.0994, + "step": 36130 + }, + { + "epoch": 1.8941299790356394, + "grad_norm": 1.35443115234375, + "learning_rate": 2.6480607966457023e-06, + "loss": 0.0581, + "step": 36140 + }, + { + "epoch": 1.8946540880503144, + "grad_norm": 1.701316237449646, + "learning_rate": 2.634958071278826e-06, + "loss": 0.0724, + "step": 36150 + }, + { + "epoch": 1.8951781970649895, + "grad_norm": 1.102696418762207, + "learning_rate": 2.62185534591195e-06, + "loss": 0.0811, + "step": 36160 + }, + { + "epoch": 1.8957023060796647, + "grad_norm": 1.3935344219207764, + "learning_rate": 2.6087526205450733e-06, + "loss": 0.1073, + "step": 36170 + }, + { + "epoch": 1.8962264150943398, + "grad_norm": 1.8276008367538452, + "learning_rate": 2.595649895178197e-06, + "loss": 0.0756, + "step": 36180 + }, + { + "epoch": 1.8967505241090148, + "grad_norm": 1.2710710763931274, + "learning_rate": 2.582547169811321e-06, + "loss": 0.0697, + "step": 36190 + }, + { + "epoch": 1.8972746331236898, + "grad_norm": 1.8811373710632324, + "learning_rate": 2.5694444444444443e-06, + "loss": 0.0652, + "step": 36200 + }, + { + "epoch": 1.8977987421383649, + "grad_norm": 3.026477575302124, + "learning_rate": 2.556341719077568e-06, + "loss": 0.0877, + "step": 36210 + }, + { + "epoch": 1.89832285115304, + "grad_norm": 1.3149288892745972, + "learning_rate": 2.543238993710692e-06, + "loss": 0.0545, + "step": 36220 + }, + { + "epoch": 1.898846960167715, + "grad_norm": 1.6216425895690918, + "learning_rate": 2.5301362683438157e-06, + "loss": 0.0902, + "step": 36230 + }, + { + "epoch": 1.89937106918239, + "grad_norm": 0.9983451962471008, + "learning_rate": 2.5170335429769395e-06, + "loss": 0.0807, + "step": 36240 + }, + { + "epoch": 1.899895178197065, + "grad_norm": 1.5166202783584595, + "learning_rate": 2.5039308176100634e-06, + "loss": 0.0674, + "step": 36250 + }, + { + "epoch": 1.90041928721174, + "grad_norm": 0.7801737189292908, + "learning_rate": 2.4908280922431867e-06, + "loss": 0.0646, + "step": 36260 + }, + { + "epoch": 1.900943396226415, + "grad_norm": 2.308824062347412, + "learning_rate": 2.4777253668763106e-06, + "loss": 0.0764, + "step": 36270 + }, + { + "epoch": 1.90146750524109, + "grad_norm": 2.3885467052459717, + "learning_rate": 2.4646226415094344e-06, + "loss": 0.0765, + "step": 36280 + }, + { + "epoch": 1.9019916142557651, + "grad_norm": 1.0932997465133667, + "learning_rate": 2.4515199161425577e-06, + "loss": 0.0733, + "step": 36290 + }, + { + "epoch": 1.9025157232704402, + "grad_norm": 2.306108236312866, + "learning_rate": 2.4384171907756816e-06, + "loss": 0.0751, + "step": 36300 + }, + { + "epoch": 1.9030398322851152, + "grad_norm": 1.6138864755630493, + "learning_rate": 2.4253144654088054e-06, + "loss": 0.0709, + "step": 36310 + }, + { + "epoch": 1.9035639412997902, + "grad_norm": 1.8217240571975708, + "learning_rate": 2.4122117400419288e-06, + "loss": 0.0731, + "step": 36320 + }, + { + "epoch": 1.9040880503144653, + "grad_norm": 1.2665067911148071, + "learning_rate": 2.3991090146750526e-06, + "loss": 0.0814, + "step": 36330 + }, + { + "epoch": 1.9046121593291403, + "grad_norm": 1.3618237972259521, + "learning_rate": 2.3860062893081764e-06, + "loss": 0.0744, + "step": 36340 + }, + { + "epoch": 1.9051362683438156, + "grad_norm": 1.7993749380111694, + "learning_rate": 2.3729035639412998e-06, + "loss": 0.0607, + "step": 36350 + }, + { + "epoch": 1.9056603773584906, + "grad_norm": 0.970848560333252, + "learning_rate": 2.3598008385744236e-06, + "loss": 0.0651, + "step": 36360 + }, + { + "epoch": 1.9061844863731656, + "grad_norm": 1.9030567407608032, + "learning_rate": 2.3466981132075474e-06, + "loss": 0.0804, + "step": 36370 + }, + { + "epoch": 1.9067085953878407, + "grad_norm": 1.25066077709198, + "learning_rate": 2.3335953878406708e-06, + "loss": 0.0843, + "step": 36380 + }, + { + "epoch": 1.9072327044025157, + "grad_norm": 1.5621609687805176, + "learning_rate": 2.3204926624737946e-06, + "loss": 0.0665, + "step": 36390 + }, + { + "epoch": 1.9077568134171907, + "grad_norm": 1.3874887228012085, + "learning_rate": 2.3073899371069184e-06, + "loss": 0.0767, + "step": 36400 + }, + { + "epoch": 1.908280922431866, + "grad_norm": 2.122204542160034, + "learning_rate": 2.294287211740042e-06, + "loss": 0.0669, + "step": 36410 + }, + { + "epoch": 1.908805031446541, + "grad_norm": 2.125373601913452, + "learning_rate": 2.281184486373166e-06, + "loss": 0.0722, + "step": 36420 + }, + { + "epoch": 1.909329140461216, + "grad_norm": 1.2056728601455688, + "learning_rate": 2.2680817610062894e-06, + "loss": 0.0659, + "step": 36430 + }, + { + "epoch": 1.909853249475891, + "grad_norm": 2.644113063812256, + "learning_rate": 2.254979035639413e-06, + "loss": 0.1034, + "step": 36440 + }, + { + "epoch": 1.9103773584905661, + "grad_norm": 2.2098119258880615, + "learning_rate": 2.241876310272537e-06, + "loss": 0.0757, + "step": 36450 + }, + { + "epoch": 1.9109014675052411, + "grad_norm": 2.2728121280670166, + "learning_rate": 2.2287735849056604e-06, + "loss": 0.1028, + "step": 36460 + }, + { + "epoch": 1.9114255765199162, + "grad_norm": 1.3554298877716064, + "learning_rate": 2.215670859538784e-06, + "loss": 0.0924, + "step": 36470 + }, + { + "epoch": 1.9119496855345912, + "grad_norm": 0.8044809699058533, + "learning_rate": 2.202568134171908e-06, + "loss": 0.0501, + "step": 36480 + }, + { + "epoch": 1.9124737945492662, + "grad_norm": 1.018548846244812, + "learning_rate": 2.1894654088050314e-06, + "loss": 0.0684, + "step": 36490 + }, + { + "epoch": 1.9129979035639413, + "grad_norm": 1.5749702453613281, + "learning_rate": 2.176362683438155e-06, + "loss": 0.0729, + "step": 36500 + }, + { + "epoch": 1.9135220125786163, + "grad_norm": 1.1555321216583252, + "learning_rate": 2.163259958071279e-06, + "loss": 0.0863, + "step": 36510 + }, + { + "epoch": 1.9140461215932913, + "grad_norm": 1.5626978874206543, + "learning_rate": 2.1501572327044024e-06, + "loss": 0.0809, + "step": 36520 + }, + { + "epoch": 1.9145702306079664, + "grad_norm": 1.4471033811569214, + "learning_rate": 2.137054507337526e-06, + "loss": 0.0663, + "step": 36530 + }, + { + "epoch": 1.9150943396226414, + "grad_norm": 1.3958340883255005, + "learning_rate": 2.12395178197065e-06, + "loss": 0.061, + "step": 36540 + }, + { + "epoch": 1.9156184486373165, + "grad_norm": 2.147571086883545, + "learning_rate": 2.1108490566037734e-06, + "loss": 0.0585, + "step": 36550 + }, + { + "epoch": 1.9161425576519915, + "grad_norm": 1.4060626029968262, + "learning_rate": 2.097746331236897e-06, + "loss": 0.0629, + "step": 36560 + }, + { + "epoch": 1.9166666666666665, + "grad_norm": 2.2379798889160156, + "learning_rate": 2.084643605870021e-06, + "loss": 0.0847, + "step": 36570 + }, + { + "epoch": 1.9171907756813418, + "grad_norm": 1.4189167022705078, + "learning_rate": 2.071540880503145e-06, + "loss": 0.0864, + "step": 36580 + }, + { + "epoch": 1.9177148846960168, + "grad_norm": 1.625536561012268, + "learning_rate": 2.0584381551362686e-06, + "loss": 0.0726, + "step": 36590 + }, + { + "epoch": 1.9182389937106918, + "grad_norm": 1.1209425926208496, + "learning_rate": 2.0453354297693924e-06, + "loss": 0.0614, + "step": 36600 + }, + { + "epoch": 1.9187631027253669, + "grad_norm": 0.8219720125198364, + "learning_rate": 2.032232704402516e-06, + "loss": 0.0652, + "step": 36610 + }, + { + "epoch": 1.919287211740042, + "grad_norm": 1.4849026203155518, + "learning_rate": 2.0191299790356396e-06, + "loss": 0.0791, + "step": 36620 + }, + { + "epoch": 1.919811320754717, + "grad_norm": 0.8608081340789795, + "learning_rate": 2.0060272536687634e-06, + "loss": 0.103, + "step": 36630 + }, + { + "epoch": 1.9203354297693922, + "grad_norm": 2.357111930847168, + "learning_rate": 1.992924528301887e-06, + "loss": 0.0639, + "step": 36640 + }, + { + "epoch": 1.9208595387840672, + "grad_norm": 1.304469347000122, + "learning_rate": 1.9798218029350106e-06, + "loss": 0.0917, + "step": 36650 + }, + { + "epoch": 1.9213836477987423, + "grad_norm": 1.540612816810608, + "learning_rate": 1.9667190775681344e-06, + "loss": 0.0776, + "step": 36660 + }, + { + "epoch": 1.9219077568134173, + "grad_norm": 1.7036010026931763, + "learning_rate": 1.953616352201258e-06, + "loss": 0.0675, + "step": 36670 + }, + { + "epoch": 1.9224318658280923, + "grad_norm": 0.8985329866409302, + "learning_rate": 1.9405136268343816e-06, + "loss": 0.0686, + "step": 36680 + }, + { + "epoch": 1.9229559748427674, + "grad_norm": 1.7493515014648438, + "learning_rate": 1.9274109014675054e-06, + "loss": 0.063, + "step": 36690 + }, + { + "epoch": 1.9234800838574424, + "grad_norm": 1.0596920251846313, + "learning_rate": 1.914308176100629e-06, + "loss": 0.0765, + "step": 36700 + }, + { + "epoch": 1.9240041928721174, + "grad_norm": 1.8956947326660156, + "learning_rate": 1.9012054507337529e-06, + "loss": 0.0779, + "step": 36710 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 1.4802206754684448, + "learning_rate": 1.8881027253668767e-06, + "loss": 0.068, + "step": 36720 + }, + { + "epoch": 1.9250524109014675, + "grad_norm": 1.4648950099945068, + "learning_rate": 1.875e-06, + "loss": 0.0762, + "step": 36730 + }, + { + "epoch": 1.9255765199161425, + "grad_norm": 2.2809669971466064, + "learning_rate": 1.8618972746331239e-06, + "loss": 0.0818, + "step": 36740 + }, + { + "epoch": 1.9261006289308176, + "grad_norm": 0.834746241569519, + "learning_rate": 1.8487945492662477e-06, + "loss": 0.0509, + "step": 36750 + }, + { + "epoch": 1.9266247379454926, + "grad_norm": 1.7022600173950195, + "learning_rate": 1.835691823899371e-06, + "loss": 0.0865, + "step": 36760 + }, + { + "epoch": 1.9271488469601676, + "grad_norm": 1.0883166790008545, + "learning_rate": 1.8225890985324949e-06, + "loss": 0.0809, + "step": 36770 + }, + { + "epoch": 1.9276729559748427, + "grad_norm": 1.4769940376281738, + "learning_rate": 1.8094863731656185e-06, + "loss": 0.0755, + "step": 36780 + }, + { + "epoch": 1.9281970649895177, + "grad_norm": 0.7859780192375183, + "learning_rate": 1.7963836477987423e-06, + "loss": 0.0564, + "step": 36790 + }, + { + "epoch": 1.9287211740041927, + "grad_norm": 0.8886245489120483, + "learning_rate": 1.783280922431866e-06, + "loss": 0.0916, + "step": 36800 + }, + { + "epoch": 1.9292452830188678, + "grad_norm": 1.6430957317352295, + "learning_rate": 1.7701781970649895e-06, + "loss": 0.0777, + "step": 36810 + }, + { + "epoch": 1.929769392033543, + "grad_norm": 2.1593551635742188, + "learning_rate": 1.7570754716981133e-06, + "loss": 0.0844, + "step": 36820 + }, + { + "epoch": 1.930293501048218, + "grad_norm": 0.8387053608894348, + "learning_rate": 1.743972746331237e-06, + "loss": 0.0669, + "step": 36830 + }, + { + "epoch": 1.930817610062893, + "grad_norm": 1.2902204990386963, + "learning_rate": 1.7308700209643605e-06, + "loss": 0.0739, + "step": 36840 + }, + { + "epoch": 1.9313417190775681, + "grad_norm": 1.5502692461013794, + "learning_rate": 1.7177672955974843e-06, + "loss": 0.1027, + "step": 36850 + }, + { + "epoch": 1.9318658280922432, + "grad_norm": 1.5457940101623535, + "learning_rate": 1.704664570230608e-06, + "loss": 0.0823, + "step": 36860 + }, + { + "epoch": 1.9323899371069182, + "grad_norm": 3.4082517623901367, + "learning_rate": 1.6915618448637317e-06, + "loss": 0.0633, + "step": 36870 + }, + { + "epoch": 1.9329140461215935, + "grad_norm": 2.142735242843628, + "learning_rate": 1.6784591194968555e-06, + "loss": 0.0952, + "step": 36880 + }, + { + "epoch": 1.9334381551362685, + "grad_norm": 0.6079370975494385, + "learning_rate": 1.6653563941299793e-06, + "loss": 0.0664, + "step": 36890 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 1.949703574180603, + "learning_rate": 1.6522536687631027e-06, + "loss": 0.0689, + "step": 36900 + }, + { + "epoch": 1.9344863731656186, + "grad_norm": 2.3975839614868164, + "learning_rate": 1.6391509433962265e-06, + "loss": 0.0702, + "step": 36910 + }, + { + "epoch": 1.9350104821802936, + "grad_norm": 1.2329648733139038, + "learning_rate": 1.6260482180293503e-06, + "loss": 0.0612, + "step": 36920 + }, + { + "epoch": 1.9355345911949686, + "grad_norm": 0.8647230863571167, + "learning_rate": 1.6129454926624737e-06, + "loss": 0.0533, + "step": 36930 + }, + { + "epoch": 1.9360587002096437, + "grad_norm": 1.8352750539779663, + "learning_rate": 1.5998427672955975e-06, + "loss": 0.09, + "step": 36940 + }, + { + "epoch": 1.9365828092243187, + "grad_norm": 1.6736629009246826, + "learning_rate": 1.5867400419287213e-06, + "loss": 0.0873, + "step": 36950 + }, + { + "epoch": 1.9371069182389937, + "grad_norm": 3.0347659587860107, + "learning_rate": 1.573637316561845e-06, + "loss": 0.0882, + "step": 36960 + }, + { + "epoch": 1.9376310272536688, + "grad_norm": 1.4697365760803223, + "learning_rate": 1.5605345911949687e-06, + "loss": 0.0721, + "step": 36970 + }, + { + "epoch": 1.9381551362683438, + "grad_norm": 0.8763979077339172, + "learning_rate": 1.5474318658280923e-06, + "loss": 0.0745, + "step": 36980 + }, + { + "epoch": 1.9386792452830188, + "grad_norm": 1.6568057537078857, + "learning_rate": 1.5343291404612161e-06, + "loss": 0.0761, + "step": 36990 + }, + { + "epoch": 1.9392033542976939, + "grad_norm": 2.1250786781311035, + "learning_rate": 1.5212264150943397e-06, + "loss": 0.0767, + "step": 37000 + }, + { + "epoch": 1.9392033542976939, + "eval_loss": 0.266403466463089, + "eval_runtime": 267.5874, + "eval_samples_per_second": 7.441, + "eval_steps_per_second": 1.241, + "step": 37000 + }, + { + "epoch": 1.939727463312369, + "grad_norm": 1.6555750370025635, + "learning_rate": 1.5081236897274633e-06, + "loss": 0.0722, + "step": 37010 + }, + { + "epoch": 1.940251572327044, + "grad_norm": 1.4497612714767456, + "learning_rate": 1.4950209643605871e-06, + "loss": 0.076, + "step": 37020 + }, + { + "epoch": 1.940775681341719, + "grad_norm": 2.2555832862854004, + "learning_rate": 1.4819182389937107e-06, + "loss": 0.076, + "step": 37030 + }, + { + "epoch": 1.941299790356394, + "grad_norm": 1.407079815864563, + "learning_rate": 1.4688155136268343e-06, + "loss": 0.094, + "step": 37040 + }, + { + "epoch": 1.941823899371069, + "grad_norm": 1.192366361618042, + "learning_rate": 1.4557127882599581e-06, + "loss": 0.0895, + "step": 37050 + }, + { + "epoch": 1.9423480083857443, + "grad_norm": 1.538325548171997, + "learning_rate": 1.442610062893082e-06, + "loss": 0.0702, + "step": 37060 + }, + { + "epoch": 1.9428721174004193, + "grad_norm": 1.4339028596878052, + "learning_rate": 1.4295073375262055e-06, + "loss": 0.0673, + "step": 37070 + }, + { + "epoch": 1.9433962264150944, + "grad_norm": 1.572577953338623, + "learning_rate": 1.4164046121593291e-06, + "loss": 0.0884, + "step": 37080 + }, + { + "epoch": 1.9439203354297694, + "grad_norm": 1.6123378276824951, + "learning_rate": 1.403301886792453e-06, + "loss": 0.0561, + "step": 37090 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 2.3728435039520264, + "learning_rate": 1.3901991614255765e-06, + "loss": 0.0495, + "step": 37100 + }, + { + "epoch": 1.9449685534591195, + "grad_norm": 0.9430110454559326, + "learning_rate": 1.3770964360587001e-06, + "loss": 0.0809, + "step": 37110 + }, + { + "epoch": 1.9454926624737947, + "grad_norm": 1.7856323719024658, + "learning_rate": 1.363993710691824e-06, + "loss": 0.099, + "step": 37120 + }, + { + "epoch": 1.9460167714884697, + "grad_norm": 3.6460683345794678, + "learning_rate": 1.3508909853249477e-06, + "loss": 0.0939, + "step": 37130 + }, + { + "epoch": 1.9465408805031448, + "grad_norm": 1.259628176689148, + "learning_rate": 1.3377882599580713e-06, + "loss": 0.0723, + "step": 37140 + }, + { + "epoch": 1.9470649895178198, + "grad_norm": 2.077188014984131, + "learning_rate": 1.3246855345911952e-06, + "loss": 0.0742, + "step": 37150 + }, + { + "epoch": 1.9475890985324948, + "grad_norm": 0.9879984855651855, + "learning_rate": 1.3115828092243188e-06, + "loss": 0.0766, + "step": 37160 + }, + { + "epoch": 1.9481132075471699, + "grad_norm": 8.38661003112793, + "learning_rate": 1.2984800838574423e-06, + "loss": 0.0837, + "step": 37170 + }, + { + "epoch": 1.948637316561845, + "grad_norm": 1.6310886144638062, + "learning_rate": 1.2853773584905662e-06, + "loss": 0.0631, + "step": 37180 + }, + { + "epoch": 1.94916142557652, + "grad_norm": 1.455169916152954, + "learning_rate": 1.2722746331236898e-06, + "loss": 0.0777, + "step": 37190 + }, + { + "epoch": 1.949685534591195, + "grad_norm": 1.819744348526001, + "learning_rate": 1.2591719077568134e-06, + "loss": 0.0678, + "step": 37200 + }, + { + "epoch": 1.95020964360587, + "grad_norm": 1.5065569877624512, + "learning_rate": 1.2460691823899372e-06, + "loss": 0.0725, + "step": 37210 + }, + { + "epoch": 1.950733752620545, + "grad_norm": 2.0217177867889404, + "learning_rate": 1.232966457023061e-06, + "loss": 0.0674, + "step": 37220 + }, + { + "epoch": 1.95125786163522, + "grad_norm": 0.7243431210517883, + "learning_rate": 1.2198637316561846e-06, + "loss": 0.0547, + "step": 37230 + }, + { + "epoch": 1.9517819706498951, + "grad_norm": 1.3811225891113281, + "learning_rate": 1.2067610062893084e-06, + "loss": 0.0559, + "step": 37240 + }, + { + "epoch": 1.9523060796645701, + "grad_norm": 1.1925405263900757, + "learning_rate": 1.193658280922432e-06, + "loss": 0.0649, + "step": 37250 + }, + { + "epoch": 1.9528301886792452, + "grad_norm": 1.6529408693313599, + "learning_rate": 1.1805555555555556e-06, + "loss": 0.09, + "step": 37260 + }, + { + "epoch": 1.9533542976939202, + "grad_norm": 3.579072952270508, + "learning_rate": 1.1674528301886792e-06, + "loss": 0.0808, + "step": 37270 + }, + { + "epoch": 1.9538784067085953, + "grad_norm": 1.4058364629745483, + "learning_rate": 1.154350104821803e-06, + "loss": 0.072, + "step": 37280 + }, + { + "epoch": 1.9544025157232703, + "grad_norm": 1.150667667388916, + "learning_rate": 1.1412473794549266e-06, + "loss": 0.0564, + "step": 37290 + }, + { + "epoch": 1.9549266247379455, + "grad_norm": 1.4494599103927612, + "learning_rate": 1.1281446540880504e-06, + "loss": 0.0755, + "step": 37300 + }, + { + "epoch": 1.9554507337526206, + "grad_norm": 1.234850525856018, + "learning_rate": 1.1150419287211742e-06, + "loss": 0.0717, + "step": 37310 + }, + { + "epoch": 1.9559748427672956, + "grad_norm": 0.9447284936904907, + "learning_rate": 1.1019392033542978e-06, + "loss": 0.07, + "step": 37320 + }, + { + "epoch": 1.9564989517819706, + "grad_norm": 2.047245740890503, + "learning_rate": 1.0888364779874214e-06, + "loss": 0.1194, + "step": 37330 + }, + { + "epoch": 1.9570230607966457, + "grad_norm": 1.7440499067306519, + "learning_rate": 1.0757337526205452e-06, + "loss": 0.0658, + "step": 37340 + }, + { + "epoch": 1.9575471698113207, + "grad_norm": 1.5569164752960205, + "learning_rate": 1.0626310272536688e-06, + "loss": 0.0636, + "step": 37350 + }, + { + "epoch": 1.958071278825996, + "grad_norm": 1.9636361598968506, + "learning_rate": 1.0495283018867924e-06, + "loss": 0.0676, + "step": 37360 + }, + { + "epoch": 1.958595387840671, + "grad_norm": 1.25540030002594, + "learning_rate": 1.0364255765199162e-06, + "loss": 0.0699, + "step": 37370 + }, + { + "epoch": 1.959119496855346, + "grad_norm": 1.2568477392196655, + "learning_rate": 1.0233228511530398e-06, + "loss": 0.0866, + "step": 37380 + }, + { + "epoch": 1.959643605870021, + "grad_norm": 1.4221638441085815, + "learning_rate": 1.0102201257861636e-06, + "loss": 0.0492, + "step": 37390 + }, + { + "epoch": 1.960167714884696, + "grad_norm": 0.9008753299713135, + "learning_rate": 9.971174004192874e-07, + "loss": 0.0714, + "step": 37400 + }, + { + "epoch": 1.9606918238993711, + "grad_norm": 1.7107698917388916, + "learning_rate": 9.84014675052411e-07, + "loss": 0.0567, + "step": 37410 + }, + { + "epoch": 1.9612159329140462, + "grad_norm": 1.0695396661758423, + "learning_rate": 9.709119496855346e-07, + "loss": 0.078, + "step": 37420 + }, + { + "epoch": 1.9617400419287212, + "grad_norm": 2.351242780685425, + "learning_rate": 9.578092243186584e-07, + "loss": 0.0725, + "step": 37430 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 1.5893714427947998, + "learning_rate": 9.44706498951782e-07, + "loss": 0.0753, + "step": 37440 + }, + { + "epoch": 1.9627882599580713, + "grad_norm": 0.7627315521240234, + "learning_rate": 9.316037735849057e-07, + "loss": 0.0902, + "step": 37450 + }, + { + "epoch": 1.9633123689727463, + "grad_norm": 1.9830408096313477, + "learning_rate": 9.185010482180295e-07, + "loss": 0.0628, + "step": 37460 + }, + { + "epoch": 1.9638364779874213, + "grad_norm": 1.4400569200515747, + "learning_rate": 9.053983228511531e-07, + "loss": 0.0667, + "step": 37470 + }, + { + "epoch": 1.9643605870020964, + "grad_norm": 1.388247013092041, + "learning_rate": 8.922955974842767e-07, + "loss": 0.074, + "step": 37480 + }, + { + "epoch": 1.9648846960167714, + "grad_norm": 1.9638195037841797, + "learning_rate": 8.791928721174004e-07, + "loss": 0.0686, + "step": 37490 + }, + { + "epoch": 1.9654088050314464, + "grad_norm": 1.6383897066116333, + "learning_rate": 8.660901467505242e-07, + "loss": 0.0732, + "step": 37500 + }, + { + "epoch": 1.9659329140461215, + "grad_norm": 0.9515641927719116, + "learning_rate": 8.529874213836478e-07, + "loss": 0.0531, + "step": 37510 + }, + { + "epoch": 1.9664570230607965, + "grad_norm": 1.4345340728759766, + "learning_rate": 8.398846960167714e-07, + "loss": 0.0805, + "step": 37520 + }, + { + "epoch": 1.9669811320754715, + "grad_norm": 1.228147029876709, + "learning_rate": 8.267819706498952e-07, + "loss": 0.082, + "step": 37530 + }, + { + "epoch": 1.9675052410901468, + "grad_norm": 1.4178779125213623, + "learning_rate": 8.136792452830189e-07, + "loss": 0.0908, + "step": 37540 + }, + { + "epoch": 1.9680293501048218, + "grad_norm": 2.3367421627044678, + "learning_rate": 8.005765199161425e-07, + "loss": 0.0769, + "step": 37550 + }, + { + "epoch": 1.9685534591194969, + "grad_norm": 2.3791344165802, + "learning_rate": 7.874737945492663e-07, + "loss": 0.0594, + "step": 37560 + }, + { + "epoch": 1.969077568134172, + "grad_norm": 1.5467840433120728, + "learning_rate": 7.743710691823899e-07, + "loss": 0.0895, + "step": 37570 + }, + { + "epoch": 1.969601677148847, + "grad_norm": 1.9167823791503906, + "learning_rate": 7.612683438155136e-07, + "loss": 0.0867, + "step": 37580 + }, + { + "epoch": 1.970125786163522, + "grad_norm": 1.9124726057052612, + "learning_rate": 7.481656184486373e-07, + "loss": 0.0531, + "step": 37590 + }, + { + "epoch": 1.9706498951781972, + "grad_norm": 2.3825502395629883, + "learning_rate": 7.35062893081761e-07, + "loss": 0.0647, + "step": 37600 + }, + { + "epoch": 1.9711740041928723, + "grad_norm": 1.4810495376586914, + "learning_rate": 7.219601677148848e-07, + "loss": 0.0654, + "step": 37610 + }, + { + "epoch": 1.9716981132075473, + "grad_norm": 1.6239418983459473, + "learning_rate": 7.088574423480085e-07, + "loss": 0.0627, + "step": 37620 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 2.5030767917633057, + "learning_rate": 6.957547169811322e-07, + "loss": 0.0503, + "step": 37630 + }, + { + "epoch": 1.9727463312368974, + "grad_norm": 0.7092583775520325, + "learning_rate": 6.826519916142558e-07, + "loss": 0.0614, + "step": 37640 + }, + { + "epoch": 1.9732704402515724, + "grad_norm": 1.160536527633667, + "learning_rate": 6.695492662473795e-07, + "loss": 0.0859, + "step": 37650 + }, + { + "epoch": 1.9737945492662474, + "grad_norm": 1.278551697731018, + "learning_rate": 6.564465408805032e-07, + "loss": 0.0718, + "step": 37660 + }, + { + "epoch": 1.9743186582809225, + "grad_norm": 1.3502774238586426, + "learning_rate": 6.433438155136269e-07, + "loss": 0.0755, + "step": 37670 + }, + { + "epoch": 1.9748427672955975, + "grad_norm": 2.126868724822998, + "learning_rate": 6.302410901467506e-07, + "loss": 0.0829, + "step": 37680 + }, + { + "epoch": 1.9753668763102725, + "grad_norm": 2.7387828826904297, + "learning_rate": 6.171383647798743e-07, + "loss": 0.0774, + "step": 37690 + }, + { + "epoch": 1.9758909853249476, + "grad_norm": 2.5403170585632324, + "learning_rate": 6.040356394129979e-07, + "loss": 0.0854, + "step": 37700 + }, + { + "epoch": 1.9764150943396226, + "grad_norm": 1.5051779747009277, + "learning_rate": 5.909329140461217e-07, + "loss": 0.0823, + "step": 37710 + }, + { + "epoch": 1.9769392033542976, + "grad_norm": 1.1935973167419434, + "learning_rate": 5.778301886792453e-07, + "loss": 0.0647, + "step": 37720 + }, + { + "epoch": 1.9774633123689727, + "grad_norm": 1.5811327695846558, + "learning_rate": 5.64727463312369e-07, + "loss": 0.1096, + "step": 37730 + }, + { + "epoch": 1.9779874213836477, + "grad_norm": 5.325149059295654, + "learning_rate": 5.516247379454927e-07, + "loss": 0.082, + "step": 37740 + }, + { + "epoch": 1.9785115303983227, + "grad_norm": 0.6052076816558838, + "learning_rate": 5.385220125786164e-07, + "loss": 0.0502, + "step": 37750 + }, + { + "epoch": 1.9790356394129978, + "grad_norm": 1.2876312732696533, + "learning_rate": 5.254192872117401e-07, + "loss": 0.0864, + "step": 37760 + }, + { + "epoch": 1.9795597484276728, + "grad_norm": 1.1572834253311157, + "learning_rate": 5.123165618448638e-07, + "loss": 0.0712, + "step": 37770 + }, + { + "epoch": 1.980083857442348, + "grad_norm": 9.827251434326172, + "learning_rate": 4.992138364779874e-07, + "loss": 0.0594, + "step": 37780 + }, + { + "epoch": 1.980607966457023, + "grad_norm": 0.8759055137634277, + "learning_rate": 4.861111111111111e-07, + "loss": 0.0716, + "step": 37790 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 1.3479708433151245, + "learning_rate": 4.7300838574423485e-07, + "loss": 0.0778, + "step": 37800 + }, + { + "epoch": 1.9816561844863732, + "grad_norm": 3.9129786491394043, + "learning_rate": 4.599056603773585e-07, + "loss": 0.076, + "step": 37810 + }, + { + "epoch": 1.9821802935010482, + "grad_norm": 1.9781731367111206, + "learning_rate": 4.468029350104822e-07, + "loss": 0.0679, + "step": 37820 + }, + { + "epoch": 1.9827044025157232, + "grad_norm": 17.147493362426758, + "learning_rate": 4.3370020964360585e-07, + "loss": 0.0607, + "step": 37830 + }, + { + "epoch": 1.9832285115303985, + "grad_norm": 1.202863335609436, + "learning_rate": 4.2059748427672955e-07, + "loss": 0.0794, + "step": 37840 + }, + { + "epoch": 1.9837526205450735, + "grad_norm": 1.6506856679916382, + "learning_rate": 4.074947589098533e-07, + "loss": 0.0636, + "step": 37850 + }, + { + "epoch": 1.9842767295597485, + "grad_norm": 1.0308916568756104, + "learning_rate": 3.943920335429769e-07, + "loss": 0.0728, + "step": 37860 + }, + { + "epoch": 1.9848008385744236, + "grad_norm": 1.316379427909851, + "learning_rate": 3.8128930817610066e-07, + "loss": 0.0817, + "step": 37870 + }, + { + "epoch": 1.9853249475890986, + "grad_norm": 1.3974709510803223, + "learning_rate": 3.681865828092243e-07, + "loss": 0.0527, + "step": 37880 + }, + { + "epoch": 1.9858490566037736, + "grad_norm": 1.2634168863296509, + "learning_rate": 3.55083857442348e-07, + "loss": 0.0741, + "step": 37890 + }, + { + "epoch": 1.9863731656184487, + "grad_norm": 1.9009695053100586, + "learning_rate": 3.419811320754717e-07, + "loss": 0.0895, + "step": 37900 + }, + { + "epoch": 1.9868972746331237, + "grad_norm": 1.6653246879577637, + "learning_rate": 3.288784067085954e-07, + "loss": 0.0794, + "step": 37910 + }, + { + "epoch": 1.9874213836477987, + "grad_norm": 1.7431325912475586, + "learning_rate": 3.1577568134171907e-07, + "loss": 0.0872, + "step": 37920 + }, + { + "epoch": 1.9879454926624738, + "grad_norm": 0.8424640893936157, + "learning_rate": 3.026729559748428e-07, + "loss": 0.0714, + "step": 37930 + }, + { + "epoch": 1.9884696016771488, + "grad_norm": 2.0467424392700195, + "learning_rate": 2.895702306079665e-07, + "loss": 0.0872, + "step": 37940 + }, + { + "epoch": 1.9889937106918238, + "grad_norm": 1.6486082077026367, + "learning_rate": 2.764675052410902e-07, + "loss": 0.0946, + "step": 37950 + }, + { + "epoch": 1.9895178197064989, + "grad_norm": 1.3245718479156494, + "learning_rate": 2.6336477987421383e-07, + "loss": 0.0832, + "step": 37960 + }, + { + "epoch": 1.990041928721174, + "grad_norm": 1.2091447114944458, + "learning_rate": 2.5026205450733754e-07, + "loss": 0.0847, + "step": 37970 + }, + { + "epoch": 1.990566037735849, + "grad_norm": 0.769639790058136, + "learning_rate": 2.3715932914046124e-07, + "loss": 0.0586, + "step": 37980 + }, + { + "epoch": 1.991090146750524, + "grad_norm": 0.9816944599151611, + "learning_rate": 2.2405660377358492e-07, + "loss": 0.066, + "step": 37990 + }, + { + "epoch": 1.991614255765199, + "grad_norm": 1.0189626216888428, + "learning_rate": 2.109538784067086e-07, + "loss": 0.0697, + "step": 38000 + }, + { + "epoch": 1.991614255765199, + "eval_loss": 0.2673446834087372, + "eval_runtime": 267.8679, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 1.239, + "step": 38000 + } + ], + "logging_steps": 10, + "max_steps": 38160, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.669752402333338e+19, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}